Repository: injetlee/Python
Branch: master
Commit: 94faec41b8a7
Files: 26
Total size: 37.9 KB

Directory structure:
gitextract_gicf6sdo/

├── .gitignore
├── CpuToInfluxdb.py
├── ModifyFilename.py
├── Python 黑魔法/
│   ├── Python 远程开机.py
│   └── README.MD
├── README.md
├── biyingSpider.py
├── countFile.py
├── countPm.py
├── douban_book.py
├── douban_movie.py
├── excelToDatabase.py
├── image_recognition_zhihu.py
├── lagouSpider.py
├── login_zhihu.py
├── qiubai_crawer.py
├── readExcel.py
├── wechat/
│   ├── README.MD
│   ├── connect.py
│   ├── face_id.py
│   ├── requirements.txt
│   └── utils.py
└── 爬虫集合/
    ├── README.MD
    ├── lagou.py
    ├── meizitu.py
    └── qiubai_crawer.py

================================================
FILE CONTENTS
================================================

================================================
FILE: .gitignore
================================================
/no_use
*.xlsx

================================================
FILE: CpuToInfluxdb.py
================================================
import psutil
import os
from influxdb import InfluxDBClient
import time,math,random


# Handle on the currently running process (not read by the loop below).
p1 = psutil.Process(os.getpid())

# Connect and issue the create-database call once, instead of on every
# two-second sample as before.
client = InfluxDBClient('localhost', 8086, 'root', 'root', 'xxyyxx')
client.create_database('xxyyxx', if_not_exists=False)

# Sample memory and CPU usage forever and write each sample as one point.
while True:
    a = psutil.virtual_memory().percent  # memory usage in percent

    b = psutil.cpu_percent(interval=1.0)  # CPU usage in percent; blocks for the 1 s interval

    json_body = [
        {
            "measurement": "cpu_load_short",
            "tags": {
                "host": "server01",
                "region": "us-west"
            },
            # No explicit "time" field is sent with the point.
            "fields": {
                "cpu": b,
                "mem": a
            }
        }
    ]
    client.write_points(json_body)
    time.sleep(2)

================================================
FILE: ModifyFilename.py
================================================
import os
# Truncate the name of every entry that sits one level below the current
# working directory to its first MAX_NAME_LENGTH characters.
MAX_NAME_LENGTH = 50

base_dir = os.getcwd()
for entry in os.listdir(base_dir):
    folder = os.path.join(base_dir, entry)
    if not os.path.isdir(folder):
        continue
    for old_name in os.listdir(folder):
        new_name = old_name[:MAX_NAME_LENGTH]
        if new_name == old_name:
            # Already short enough; nothing to rename.
            continue
        target = os.path.join(folder, new_name)
        if os.path.exists(target):
            # Two names share the same first characters: renaming would
            # overwrite (or fail on) the existing entry, so leave it alone.
            print('skipped %s: %s already exists' % (old_name, new_name))
            continue
        os.rename(os.path.join(folder, old_name), target)


================================================
FILE: Python 黑魔法/Python 远程开机.py
================================================
def wake_up(request, mac='DC-4A-3E-78-3E-0A'):
    """Broadcast a wake-up packet for ``mac`` and return an empty response.

    The payload is six 0xFF bytes followed by the MAC address repeated 20
    times; it is sent three times, one second apart, to UDP port 7 of the
    hard-coded broadcast address.

    Args:
        request: Not used by the function body.
        mac: MAC address written as 'XX-XX-XX-XX-XX-XX' (17 characters).

    Returns:
        ``HttpResponse()``, whether or not sending succeeded.

    Raises:
        ValueError: If ``mac`` is not 17 characters long or contains
            non-hexadecimal digits.
    """
    # NOTE(review): HttpResponse is not imported anywhere in this file;
    # presumably django.http.HttpResponse - confirm and import it.
    # The stdlib modules are imported here because the file has no
    # module-level imports at all.
    import socket
    import time

    MAC = mac
    BROADCAST = "192.168.0.255"
    if len(MAC) != 17:
        raise ValueError("MAC address should be set as form 'XX-XX-XX-XX-XX-XX'")
    mac_address = MAC.replace("-", '')
    data = ''.join(['FFFFFFFFFFFF', mac_address * 20])  # raw payload as hex text

    # Convert the hex text into the bytes that go on the wire.
    send_data = bytes.fromhex(data)
    print(send_data)

    # Broadcast three times, one second apart, to reduce the chance of loss.
    try:
        with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as sock:
            sock.setsockopt(socket.SOL_SOCKET, socket.SO_BROADCAST, 1)
            for attempt in range(3):
                if attempt:
                    time.sleep(1)
                sock.sendto(send_data, (BROADCAST, 7))
        print("Done")
    except OSError as e:
        # Best effort: report the network error but still answer the caller.
        print(e)
    return HttpResponse()

================================================
FILE: Python 黑魔法/README.MD
================================================
# 代码详细说明请看文章

[Python 远程开机](https://mp.weixin.qq.com/s/RSod4XWxyzL32eNcrXLjUQ)


================================================
FILE: README.md
================================================

# 欢迎关注我的微信公众号【智能制造社区】

## 左手代码，右手制造，分享智能制造相关技术和业务，包括 Python, C#, 数据库，工业大数据、物联网技术及MES/ERP/SAP等系统。

## 可以通过微信公众号加我好友

![二维码](qrcode.jpg)

# 内容列表

## [Python微信公众号开发](https://github.com/injetlee/Python/tree/master/wechat)

- ### Python 微信公众号开发—小白篇(一)

- ### Python 公众号开发—颜值检测

## [Python 爬虫入门合集](https://github.com/injetlee/Python/tree/master/%E7%88%AC%E8%99%AB%E9%9B%86%E5%90%88)

- ### Python 爬虫入门(一)——爬取糗事百科

- ### Python 爬虫入门(二)——爬取妹子图

- ### Python 爬虫——Python 岗位分析报告

- ### Python 爬虫利器——Selenium介绍

- ### Python 爬虫—— 抖音 App 视频抓包爬取

## [Python 黑魔法](https://github.com/injetlee/Python/tree/master/Python%20%E9%BB%91%E9%AD%94%E6%B3%95)

- ### Python 远程开机

## SQL 数据库

- [1 小时 SQL 极速入门（一）](https://mp.weixin.qq.com/s/Lx4B349OlD49ihJPnB6YiA)
- [1 小时 SQL 极速入门（二）](https://mp.weixin.qq.com/s/D-CEtGYomne5kV_Ji4lodA)
- [1 小时 SQL 极速入门（三）](https://mp.weixin.qq.com/s/7aJqrhCNcvnt2gO3p5P50Q)
- [SQL 高级查询——（层次化查询，递归）](https://mp.weixin.qq.com/s/R9Yldd-5AK4ObRA9Lfbz-Q)
- [GROUP BY高级查询,ROLLUP，CUBE，GROUPING详解](https://mp.weixin.qq.com/s/_OK6dtHGhp7ukC2pe1ginQ)
- [SQL 行转列，列转行](https://mp.weixin.qq.com/s/xOFIg42FQhNpyg94ajhtqQ)

## 其他

- 1.[获取当前CPU状态，存储到Influxdb](https://github.com/injetlee/demo/blob/master/CpuToInfluxdb.py)

- 2.[模拟登录知乎](https://github.com/injetlee/demo/blob/master/login_zhihu.py)

- 3.[对目录下所有文件计数](https://github.com/injetlee/demo/blob/master/countFile.py)

- 4.[爬取豆瓣电影top250](https://github.com/injetlee/demo/blob/master/douban_movie.py)

- 5.[Excel文件读入数据库](https://github.com/injetlee/demo/blob/master/excelToDatabase.py)

- 6.[爬取拉勾网职位信息](https://github.com/injetlee/demo/blob/master/lagouSpider.py)

- 7.[批量修改文件名](https://github.com/injetlee/demo/blob/master/ModifyFilename.py)

- 8.[读写excel](https://github.com/injetlee/demo/blob/master/readExcel.py)

- 9.[下载必应首页图片,只下载当天的，一张。](https://github.com/injetlee/Python/blob/master/biyingSpider.py)


================================================
FILE: biyingSpider.py
================================================
import requests
import re
import time
# Download the first picture referenced on the Bing home page and save it
# in the current directory as <today's date>.jpg.
local = time.strftime("%Y.%m.%d")
url = 'http://cn.bing.com/'
con = requests.get(url)
content = con.text
# The dot before "jpg" is escaped so it only matches a literal ".jpg".
reg = r"(az/hprichbg/rb/.*?\.jpg)"
matches = re.findall(reg, content, re.S)
if not matches:
    # Fail with a readable message instead of an IndexError.
    raise SystemExit('no picture path found in the page returned by %s' % url)
a = matches[0]
print(a)
picUrl = url + a
read = requests.get(picUrl)
with open('%s.jpg' % local, 'wb') as f:
    f.write(read.content)


================================================
FILE: countFile.py
================================================
import os
result = []


def get_all(cwd):
    """Collect the base name of every non-directory entry below ``cwd``.

    Sub-directories are visited recursively. Each file name is appended to
    the module-level ``result`` list and the running length of that list is
    printed after every append.
    """
    for entry in os.listdir(cwd):
        full_path = os.path.join(cwd, entry)
        if os.path.isdir(full_path):
            # Still a directory: descend into it.
            get_all(full_path)
            continue
        result.append(os.path.basename(full_path))
        print(len(result))  # running count of files seen so far


if __name__ == "__main__":
    cur_path = os.getcwd()  # start from the current directory
    get_all(cur_path)

================================================
FILE: countPm.py
================================================
# -*- coding:utf-8 -*-
def count_pm(*args):
    """Convert each argument into a code via generate_iso_code.

    Every argument is first transformed with ``round(value * 2 - 8, 2)``;
    the absolute value of that number is then passed to
    ``generate_iso_code``. The list of codes is printed and returned.
    """
    scaled = [round(value * 2 - 8, 2) for value in args]
    result = [generate_iso_code(abs(concentration)) for concentration in scaled]
    print(result)
    return result
    	
def generate_iso_code(x):
    """Return the 1-based index of the concentration interval containing ``x``.

    The intervals are the consecutive pairs of ``pm_value`` and are open on
    the left and closed on the right, so ``0.01 < x <= 0.02`` gives 1 and
    ``40 < x <= 80`` gives 13.

    Args:
        x: Concentration to classify.

    Returns:
        int: Code between 1 and 13.

    Raises:
        ValueError: If ``x`` lies outside ``(0.01, 80]``. The previous
            implementation ran off the end of the table with an IndexError.
    """
    # Interval boundaries of the concentration scale.
    pm_value = [0.01, 0.02, 0.04, 0.08, 0.16, 0.32, 0.64, 1.3, 2.5, 5, 10, 20, 40, 80]
    intervals = zip(pm_value, pm_value[1:])
    for iso_code, (lower, upper) in enumerate(intervals, start=1):
        if lower < x <= upper:
            return iso_code
    raise ValueError(
        'concentration %r is outside the supported range (%s, %s]'
        % (x, pm_value[0], pm_value[-1])
    )
			
if __name__ == '__main__':
    # Three sample readings, each a triple of raw values.
    for sample in ((7.95, 5.85, 3.98), (7.918, 5.949, 5.456), (6.916, 3.956, 3.956)):
        count_pm(*sample)


================================================
FILE: douban_book.py
================================================
from bs4 import BeautifulSoup
import requests
from openpyxl import Workbook
# Output workbook shared by main(): the active sheet is renamed and the
# workbook is later saved under excel_name.
excel_name = "书籍.xlsx"
wb = Workbook()
ws1 = wb.active
ws1.title='书籍'


def get_html(url):
    """Fetch ``url`` with a desktop Firefox User-Agent and return the raw body."""
    response = requests.get(
        url,
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0',
        },
    )
    return response.content


def get_con(html):
    """Extract the book titles and the next-page link from one listing page.

    Returns:
        tuple: ``(names, next_url)`` where ``names`` is the list of titles
        found in the ``div.article`` tables and ``next_url`` is the ``href``
        of the link inside ``span.next``, or ``None`` when that span holds
        no link.
    """
    soup = BeautifulSoup(html, 'html.parser')
    article = soup.find('div', attrs={'class': 'article'})
    paginator = soup.find('div', attrs={'class': 'paginator'})
    next_link = paginator.find('span', attrs={'class': 'next'}).find('a')

    def title_of(table):
        # A title may be split over several strings; the first two are joined.
        anchor = table.find('div', attrs={'class': 'pl2'}).find('a')
        parts = list(anchor.stripped_strings)
        return parts[0] + parts[1] if len(parts) > 1 else parts[0]

    names = [title_of(table) for table in article.find_all('table')]
    return names, (next_link.get('href') if next_link else None)


def main():
    """Crawl every listing page and write the titles to column A of the sheet.

    Pages are followed through the next-page URL returned by ``get_con``
    until it is ``None``. Title number *n* is written to cell ``A<n>``; each
    title and its cell are printed. The workbook is saved at the end.
    """
    url = 'https://book.douban.com/top250'
    name_list = []
    while url:
        name, url = get_con(get_html(url))
        name_list.extend(name)
    # enumerate() gives every title its own row; looking the row up with
    # list.index() sent duplicate titles to the same cell and was quadratic.
    for row, title in enumerate(name_list, start=1):
        location = 'A%s' % row
        print(title)
        print(location)
        ws1[location] = title
    wb.save(filename=excel_name)


if __name__ == '__main__':
    main()


================================================
FILE: douban_movie.py
================================================
#!/usr/bin/env python
# encoding=utf-8
import requests
import re
import codecs
from bs4 import BeautifulSoup
from openpyxl import Workbook
# Output workbook shared by main(): the active sheet is renamed and the
# workbook is later saved under dest_filename.
wb = Workbook()
dest_filename = '电影.xlsx'
ws1 = wb.active
ws1.title = "电影top250"

# First page of the listing; get_li() also prefixes next-page links with it.
DOWNLOAD_URL = 'http://movie.douban.com/top250/'


def download_page(url):
    """Fetch ``url`` with a desktop Chrome User-Agent and return the raw body."""
    response = requests.get(
        url,
        headers={
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36',
        },
    )
    return response.content


def get_li(doc):
    """Parse one listing page into parallel lists plus the next-page URL.

    Returns:
        tuple: ``(name, star_con, score, info_list, next_url)``. The four
        lists hold, per ``li`` of ``ol.grid_view``: the title, the text node
        matching '评价', the rating text and the short quote ('无' when the
        entry has none). ``next_url`` is ``DOWNLOAD_URL`` plus the ``href``
        of the link inside ``span.next``, or ``None`` without such a link.
    """
    soup = BeautifulSoup(doc, 'html.parser')
    grid = soup.find('ol', class_='grid_view')
    name, star_con, score, info_list = [], [], [], []

    for item in grid.find_all('li'):
        header = item.find('div', attrs={'class': 'hd'})
        title = header.find('span', attrs={'class': 'title'}).get_text()
        rating = item.find('span', attrs={'class': 'rating_num'}).get_text()
        rating_box = item.find('div', attrs={'class': 'star'})
        votes = rating_box.find(text=re.compile('评价'))
        quote = item.find('span', attrs={'class': 'inq'})

        # Entries without a short quote get a placeholder.
        info_list.append(quote.get_text() if quote else '无')
        score.append(rating)
        name.append(title)
        star_con.append(votes)

    next_link = soup.find('span', attrs={'class': 'next'}).find('a')
    next_url = DOWNLOAD_URL + next_link['href'] if next_link else None
    return name, star_con, score, info_list, next_url


def main():
    """Crawl every listing page and write one spreadsheet row per movie.

    Pages are followed through the URL returned by ``get_li`` until it is
    ``None``. Row *n* receives title, vote text, rating and quote in columns
    A to D. The workbook is saved at the end.
    """
    url = DOWNLOAD_URL
    name = []
    star_con = []
    score = []
    info = []
    while url:
        doc = download_page(url)
        movie, star, level_num, info_list, url = get_li(doc)
        name.extend(movie)
        star_con.extend(star)
        score.extend(level_num)
        info.extend(info_list)
    # enumerate() numbers the rows directly; name.index() mapped movies with
    # identical titles to the same row and was quadratic.
    rows = zip(name, star_con, score, info)
    for row, (title, votes, rating, quote) in enumerate(rows, start=1):
        ws1['A%s' % row] = title
        ws1['B%s' % row] = votes
        ws1['C%s' % row] = rating
        ws1['D%s' % row] = quote
    wb.save(filename=dest_filename)


if __name__ == '__main__':
    main()


================================================
FILE: excelToDatabase.py
================================================
from openpyxl import load_workbook
import pymysql
# Copy the first and fifth column of every row of every sheet of hpu.xlsx
# into the MySQL table lyexcel.info (created on demand).
config = {
    'host': '127.0.0.1',
    'port': 3306,
    'user': 'root',
    'password': 'root',
    'charset': 'utf8mb4',
}
conn = pymysql.connect(**config)
conn.autocommit(1)
cursor = conn.cursor()
try:
    name = 'lyexcel'
    # Database and table names are fixed constants defined right here, so
    # formatting them into the statement is safe.
    cursor.execute('create database if not exists %s' % name)
    conn.select_db(name)
    table_name = 'info'
    cursor.execute('create table if not exists %s(id MEDIUMINT NOT NULL AUTO_INCREMENT,name varchar(30),tel varchar(30),primary key (id))' % table_name)

    wb2 = load_workbook('hpu.xlsx')
    ws = wb2.get_sheet_names()
    # Iterating the workbook yields its sheets; iterating a sheet yields
    # its rows as sequences of cells.
    for sheet in wb2:
        print("1")
        for row in sheet:
            value1 = (row[0].value, row[4].value)
            # Cell values go through driver parameters, never string formatting.
            cursor.execute('insert into {} (name,tel) values(%s,%s)'.format(table_name), value1)
finally:
    # Release the cursor and the connection even when an insert fails.
    cursor.close()
    conn.close()

print("overing...")


================================================
FILE: image_recognition_zhihu.py
================================================
# -*- coding:UTF-8 -*-

import  requests , time ,random
import  hmac ,json ,base64
from bs4 import BeautifulSoup
from hashlib import sha1
import TencentYoutuyun
from PIL import Image
import uuid


def recognition_captcha(data):
    """Run OCR on a base64-encoded captcha image.

    The decoded bytes are written to ``captcha_<uuid>.gif``, converted to
    ``captcha_<uuid>.png`` and the PNG path is passed to the YouTu
    ``generalocr`` call. Both files are left on disk.

    Args:
        data: Base64 text of the image, or ``None``.

    Returns:
        Whatever ``youtu.generalocr`` returns, or ``None`` when ``data`` is
        ``None``.
    """
    if data is None:
        return

    file_id = str(uuid.uuid1())
    filename = 'captcha_' + file_id + '.gif'
    filename_png = 'captcha_' + file_id + '.png'

    with open(filename, 'wb') as fb:
        fb.write(base64.b64decode(data.encode('utf-8')))

    # Placeholder credentials: register with the YouTu service to obtain real ones.
    appid = 'appid'
    secret_id = 'secret_id'
    secret_key = 'secret_key'
    userid = 'userid'
    end_point = TencentYoutuyun.conf.API_YOUTU_END_POINT

    youtu = TencentYoutuyun.YouTu(appid, secret_id, secret_key, userid, end_point)

    # The captcha is saved as .gif and converted to PNG before OCR; the
    # context manager closes the image even if the conversion fails.
    with Image.open(filename) as im:
        im.save(filename_png, "png")

    # data_type 0 means a local path, 1 means a URL.
    result = youtu.generalocr(filename_png, data_type=0, seq='')

    return result


def get_captcha(sessiona, headers):
    """Poll until the site demands a captcha, then fetch its image data.

    Args:
        sessiona: Session object providing ``get`` and ``put``.
        headers: Headers sent with every request.

    Returns:
        The ``img_base64`` value of the captcha response, or ``None`` when
        fetching or parsing that response fails.
    """
    need_cap = False

    while need_cap is not True:
        try:
            # The sign-in page sets the _xsrf cookie.
            sessiona.get('https://www.zhihu.com/signin', headers=headers)
            # The captcha endpoint sets the capsion_ticket cookie.
            resp2 = sessiona.get('https://www.zhihu.com/api/v3/oauth/captcha?lang=cn', headers=headers)
            # {"show_captcha": false} means no captcha is required yet.
            need_cap = json.loads(resp2.text)["show_captcha"]
        except Exception:
            # Best-effort polling: fall through to the pause below so a
            # failing request is retried after a delay, not in a hot loop.
            pass
        time.sleep(0.5 + random.randint(1, 9) / 10)

    try:
        # The image itself is requested with PUT.
        resp3 = sessiona.put('https://www.zhihu.com/api/v3/oauth/captcha?lang=cn', headers=headers)
        img_data = json.loads(resp3.text)["img_base64"]
    except Exception:
        return

    return img_data

def create_point(point_data, confidence):
    """Build the click-point payload for the low-confidence characters.

    Every word of ``point_data['items'][0]['words']`` whose ``confidence``
    is below ``confidence`` contributes the fixed coordinate of its 1-based
    position; positions beyond the seventh are ignored.

    Returns:
        The JSON text (also printed) holding ``img_size`` and
        ``input_points`` when one or two points were collected, otherwise
        an empty list.
    """
    # Seven slots, 25 px apart on x, all on the same y.
    slot_centres = {slot: [20.5 + 25 * (slot - 1), 25.1875] for slot in range(1, 8)}

    words = point_data['items'][0]['words']
    input_points = [
        slot_centres[position]
        for position, word in enumerate(words, start=1)
        if word['confidence'] < confidence and position in slot_centres
    ]

    # Only one or two suspicious characters are worth submitting.
    if not 1 <= len(input_points) <= 2:
        return []

    result = json.dumps({'img_size': [200, 44], 'input_points': input_points})
    print(result)
    return result

def bolting(k_low, k_hi, k3_confidence):
    """Keep solving captchas until one is accepted; return the seconds taken.

    Relies on the module-level ``sessiona`` and ``headers`` objects.

    Args:
        k_low: Lowest accepted value of the OCR result's ``angle``.
        k_hi: Highest accepted value of the OCR result's ``angle``.
        k3_confidence: Confidence threshold passed to ``create_point``.

    Returns:
        float: Elapsed wall-clock seconds until the server reported success.
    """
    start = time.time()

    is_success = False
    while is_success is not True:

        points_len = 1
        angle = -20
        img_ko = []

        # Fetch captchas until the OCR result has a 21-character item string
        # and an angle inside [k_low, k_hi].
        while points_len != 21 or angle < k_low or angle > k_hi:
            img_data = get_captcha(sessiona, headers)
            img_ko = recognition_captcha(img_data)

            try:
                points_len = len(img_ko['items'][0]['itemstring'])
                angle = img_ko['angle']
            except (KeyError, IndexError, TypeError):
                # Missing or malformed OCR result: reset and fetch another one.
                points_len = 1
                angle = -20

        input_text = create_point(img_ko, k3_confidence)
        if isinstance(input_text, list):
            # create_point returns a list when it declines to answer.
            continue

        data = {
            "input_text": input_text
        }

        # Submitting too fast is rejected with
        # {"code":120005,"name":"ERR_VERIFY_CAPTCHA_TOO_QUICK"}, so wait first.
        time.sleep(4 + random.randint(1, 9) / 10)
        try:
            resp5 = sessiona.post('https://www.zhihu.com/api/v3/oauth/captcha?lang=cn', data, headers=headers)
        except Exception:
            continue

        print("angle: " + str(angle))
        print(BeautifulSoup(resp5.content, 'html.parser'))  # {"success":true} on acceptance
        print('-' * 50)
        try:
            is_success = json.loads(resp5.text)["success"]
        except (KeyError, ValueError, TypeError):
            # No "success" key, a body that is not JSON, or JSON that is not
            # an object: treat as a failed attempt and try again.
            continue

    end = time.time()

    return end - start


# Script entry point: times 10 x 10 successful captcha runs and plots the
# durations as a heat map. sessiona and headers are module-level names that
# bolting() reads.
if __name__ == "__main__":
    
    sessiona = requests.Session()
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0','authorization':'oauth c3cef7c66a1843f8b3a9e6a1e3160e20'}

    k3_confidence = 0.71
    
    '''
    # 可视化数据会被保存在云端供浏览
    # https://plot.ly/~weldon2010/4
    # 纯属学习，并未看出"角度"范围扩大对图像识别的影响，大部分时候60s内能搞定，说明优图还是很强悍的，识别速度也非常快
    '''
    runtime_list_x = []
    runtime_list_y = []
    nn = range(1,11) # could be run multi-threaded with far more iterations if desired
    
    # 100 successful attempts, collected as a 2-D grid for the heat map
    for y in nn :
        for x in  nn :
            runtime_list_x.append( bolting(-3,3,k3_confidence) )
            print( "y: " + str(runtime_list_y) )
            print( "x: " + str(runtime_list_x) )
        # store a copy of the finished row, then start a new one
        runtime_list_y.append(runtime_list_x.copy())
        runtime_list_x = []

    print ("-"*30)    
    print( runtime_list_y )
    print ("-"*30)

    # Data visualisation; requires: pip install plotly
    import plotly
    import plotly.graph_objs as go
    plotly.tools.set_credentials_file(username='username', api_key='username') # placeholder account; register on the plotly site
    trace = go.Heatmap(z = runtime_list_y , x = [n for n in nn ] ,y =[n for n in nn ])
    data=[trace]
    plotly.plotly.plot(data, filename='weldon-time2-heatmap')    
   
    # Observation after trying: there are almost always 1-2 inverted characters, which can be used to speed things up:
    # widen the angle range and submit only when 1-2 inverted characters are detected, otherwise keep looking.

### chcp 65001 (changes the cmd code page on Windows)
### python  c:\python34\image_recognition_zhihu.py


================================================
FILE: lagouSpider.py
================================================
import requests
from openpyxl import Workbook

def get_json(url, page, lang_name):
    """POST one search page and return the selected fields of every position.

    Returns:
        list: One five-item list per result, in the order companyShortName,
        companyName, salary, city, education.
    """
    payload = {'first': 'true', 'pn': page, 'kd': lang_name}
    body = requests.post(url, payload).json()
    fields = ('companyShortName', 'companyName', 'salary', 'city', 'education')
    positions = body['content']['positionResult']['result']
    return [[position[field] for field in fields] for position in positions]


def main():
    """Ask for a job title, fetch result pages 1-30 and save them to Excel.

    The sheet is titled with the entered job title and the workbook is
    saved as 职位信息.xlsx.
    """
    lang_name = input('职位名：')
    url = 'http://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'
    info_result = []
    for page in range(1, 31):
        info_result += get_json(url, page, lang_name)

    wb = Workbook()
    ws1 = wb.active
    ws1.title = lang_name
    for row in info_result:
        ws1.append(row)
    wb.save('职位信息.xlsx')


if __name__ == '__main__':
    main()


================================================
FILE: login_zhihu.py
================================================
# -*- coding:UTF-8 -*-

import  requests , time
import  hmac ,json
from bs4 import BeautifulSoup
from hashlib import sha1


def get_captcha(data, need_cap):
    """Save the captcha image and ask the user to type it in.

    Returns ``None`` without touching the disk when ``need_cap`` is
    ``False``; otherwise writes ``data`` to captcha.gif and returns the
    text entered at the prompt.
    """
    if need_cap is not False:
        with open('captcha.gif', 'wb') as fb:
            fb.write(data)
        return input('captcha:')
    return None
    
def get_signature(grantType, clientId, source, timestamp):
    """Return the hex HMAC-SHA1 of the four fields, concatenated in order.

    The key is the fixed byte string embedded below.
    """
    key = b'd1b964811afb40118a12068ff74a12f4'
    message = ''.join((grantType, clientId, source, timestamp)).encode()
    return hmac.new(key, message, sha1).hexdigest()


def login(username, password, oncaptcha, sessiona, headers):
    """Submit the sign-in form and return the raw response body.

    Args:
        username: Account name sent in the form.
        password: Password sent in the form.
        oncaptcha: Callable invoked as ``oncaptcha(image_bytes, need_cap)``;
            its return value is sent as the ``captcha`` field.
        sessiona: Session object used for every request.
        headers: Headers sent with every request.

    Returns:
        The ``content`` of the sign-in response. The form data and the
        parsed response are printed along the way.
    """
    # The sign-in page sets the _xsrf cookie.
    sessiona.get('https://www.zhihu.com/signin', headers=headers)
    # The captcha endpoint sets the capsion_ticket cookie and reports
    # whether a captcha is needed ({"show_captcha": false} means no).
    captcha_probe = sessiona.get('https://www.zhihu.com/api/v3/oauth/captcha?lang=cn', headers=headers)
    need_cap = json.loads(captcha_probe.text)["show_captcha"]

    grantType = 'password'
    clientId = 'c3cef7c66a1843f8b3a9e6a1e3160e20'
    source = 'com.zhihu.web'
    # Millisecond timestamp with the fraction cut off; it is the only
    # signature input that changes between calls.
    timestamp = str((time.time() * 1000)).split('.')[0]

    captcha_url = 'https://www.zhihu.com/captcha.gif?r=%d&type=login' % (time.time() * 1000)
    captcha_content = sessiona.get(captcha_url, headers=headers).content

    signature = get_signature(grantType, clientId, source, timestamp)
    captcha = oncaptcha(captcha_content, need_cap)
    data = {
        "client_id": clientId,
        "grant_type": grantType,
        "timestamp": timestamp,
        "source": source,
        "signature": signature,
        "username": username,
        "password": password,
        "lang": "cn",
        "captcha": captcha,
        "ref_source": "other_",
        "utm_source": ""
    }

    print("**2**: " + str(data))
    print("-" * 50)
    resp = sessiona.post('https://www.zhihu.com/api/v3/oauth/sign_in', data, headers=headers).content
    print(BeautifulSoup(resp, 'html.parser'))

    print("-" * 50)
    return resp

# Script entry point: sign in with placeholder credentials, then fetch and
# print the inbox page using the same session.
if __name__ == "__main__":
    sessiona = requests.Session()
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0','authorization':'oauth c3cef7c66a1843f8b3a9e6a1e3160e20'}

    login('12345678@qq.com','12345678',get_captcha,sessiona,headers) # replace with your own user name and password
    resp = sessiona.get('https://www.zhihu.com/inbox',headers=headers)  # once signed in, the private messages page can be read
    print(BeautifulSoup(resp.content ,'html.parser'))
    
    
### chcp 65001 (changes the cmd code page on Windows)
### python  c:\python34\login_zhihu.py
### Something very puzzling happened here; it looked as if the code had not taken effect


================================================
FILE: qiubai_crawer.py
================================================
import requests
from bs4 import BeautifulSoup


def download_page(url):
    """Fetch ``url`` with a desktop Firefox User-Agent and return the decoded text."""
    response = requests.get(
        url,
        headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0"},
    )
    return response.text


def get_content(html, page):
    """Parse one listing page and append every post to the output file.

    For each ``div.article`` under ``#content-left`` the author, text, vote
    count, comment count and (when present) the author's gender and age are
    formatted into one record and handed to ``save_txt``.
    """
    output = """第{}页 作者：{} 性别：{} 年龄：{} 点赞：{} 评论：{}\n{}\n------------\n"""
    soup = BeautifulSoup(html, 'html.parser')
    articles = soup.find(id='content-left').find_all('div', class_="article")

    for article in articles:
        author = article.find('h2').string
        content = article.find('div', class_='content').find('span').get_text()
        stats = article.find('div', class_='stats')
        vote = stats.find('span', class_='stats-vote').find('i', class_='number').string
        comment = stats.find('span', class_='stats-comments').find('i', class_='number').string

        # Posts without the gender/age badge keep both fields empty.
        gender = ''
        age = ''
        author_info = article.find('div', class_='articleGender')
        if author_info is not None:
            class_list = author_info['class']
            if "womenIcon" in class_list:
                gender = '女'
            elif "manIcon" in class_list:
                gender = '男'
            age = author_info.string

        save_txt(output.format(page, author, gender, age, vote, comment, content))


def save_txt(*args):
    """Append every argument, in order, to qiubai.txt (UTF-8)."""
    if not args:
        # Nothing to write: do not even create the file.
        return
    with open('qiubai.txt', 'a', encoding='utf-8') as f:
        f.writelines(args)


def main():
    """Download text pages 1 to 13 and store their posts."""
    # The page count is hard-coded to the 13 pages shown at the bottom of
    # the site; reading it from the page with Beautiful Soup would be better.
    for page in range(1, 14):
        html = download_page('https://qiushibaike.com/text/page/{}'.format(page))
        get_content(html, page)


if __name__ == '__main__':
    main()


================================================
FILE: readExcel.py
================================================
from openpyxl import Workbook
from openpyxl.compat import range
from openpyxl.cell import get_column_letter
# Build a three-sheet demo workbook and save it as empty_book2.xlsx.
wb = Workbook()
dest_filename = 'empty_book2.xlsx'

# Sheet 1: 39 rows, each holding the numbers 0..59.
ws1 = wb.active
ws1.title = "range names"
for _ in range(39):
    ws1.append(range(60))

# Sheet 2: a single value in F5.
ws2 = wb.create_sheet(title="Pi")
ws2['F5'] = 3.14

# Sheet 3: rows 10-19, columns 27-53, each cell holding its column letter.
ws3 = wb.create_sheet(title="Data")
for row in range(10, 20):
    for col in range(27, 54):
        ws3.cell(column=col, row=row, value="{}".format(get_column_letter(col)))

wb.save(filename=dest_filename)


================================================
FILE: wechat/README.MD
================================================
# 详细使用请看文章

[Python微信公众号开发—小白篇(一)](https://mp.weixin.qq.com/s/iMPUC0yxI-zuf4AjtyAu6g)

[Python公众号开发—颜值检测](https://mp.weixin.qq.com/s/I0DxhIHkeqhc2LeQ2ICHeA)

================================================
FILE: wechat/connect.py
================================================
# -*-coding:utf-8 -*-
import falcon
from falcon import uri
from wechatpy.utils import check_signature
from wechatpy.exceptions import InvalidSignatureException
from wechatpy import parse_message
from wechatpy.replies import TextReply, ImageReply

from utils import img_download, img_upload
from face_id import access_api


class Connect(object):
    """Falcon resource handling the WeChat callback endpoint."""

    def on_get(self, req, resp):
        """Answer the verification request by echoing ``echostr``.

        The query parameters are read through Falcon instead of splitting
        the raw query string by hand, so values are URL-decoded and a
        missing parameter yields a 400 response instead of a crash. When
        the signature check fails the response is 200 with no body.
        """
        signature = req.get_param('signature', required=True)
        timestamp = req.get_param('timestamp', required=True)
        nonce = req.get_param('nonce', required=True)
        echostr = req.get_param('echostr', required=True)

        resp.status = falcon.HTTP_200
        try:
            check_signature(token='lengxiao', signature=signature, timestamp=timestamp, nonce=nonce)
        except InvalidSignatureException:
            # Invalid signature: answer 200 without echoing anything.
            return
        resp.body = echostr

    def on_post(self, req, resp):
        """Reply to an incoming message.

        Text messages are echoed back. Image messages are passed to the
        face-detection helpers and answered with the annotated image, or
        with an explanatory text on failure. Other message types get no
        reply body.
        """
        msg = parse_message(req.stream.read())
        if msg.type == 'text':
            reply = TextReply(content=msg.content, message=msg)
        elif msg.type == 'image':
            reply = self._reply_to_image(msg)
        else:
            return
        resp.body = reply.render()
        resp.status = falcon.HTTP_200

    def _reply_to_image(self, msg):
        """Download the image, run face detection and build the reply."""
        name = img_download(msg.image, msg.source)
        print(name)
        # img_download returns the sentinel 'large' instead of a file name
        # for files of 1 MiB or more; that must not be used as a path.
        if name != 'large' and access_api('images/' + name) == 'success':
            media_id = img_upload('image', 'faces/' + name)
            return ImageReply(media_id=media_id, message=msg)
        return TextReply(content='人脸检测失败，请上传1M以下人脸清晰的照片', message=msg)

# Application object with the Connect resource mounted at /connect.
app = falcon.API()
connect = Connect()
app.add_route('/connect', connect)


================================================
FILE: wechat/face_id.py
================================================
# -*-coding:utf-8 -*-
import time
import random
import base64
import hashlib
import requests
from urllib.parse import urlencode
import cv2
import numpy as np
from PIL import Image, ImageDraw, ImageFont
import os


# 一.计算接口鉴权，构造请求参数

def random_str():
    """Return 15 random lowercase ASCII letters (used as nonce_str)."""
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    return ''.join(alphabet[random.randint(0, 25)] for _ in range(15))


def image(name):
    """Return the base64-encoded content of the file ``name`` as bytes."""
    with open(name, 'rb') as f:
        return base64.b64encode(f.read())


def get_params(img):
    """Build the request parameters for the API call, including ``sign``.

    The signature is the upper-case MD5 hex digest of the URL-encoded
    parameters sorted by key, with the ``app_key`` pair appended last.

    Returns:
        dict: The parameters plus the computed ``sign`` entry.
    """
    params = {
        'app_id': '1106860829',
        'time_stamp': str(int(time.time())),
        'nonce_str': random_str(),
        'image': img,
        'mode': '0'
    }

    signing_pairs = sorted(params.items(), key=lambda pair: pair[0])
    signing_pairs.append(('app_key', 'P8Gt8nxi6k8vLKbS'))  # the key always goes last
    digest = hashlib.md5(urlencode(signing_pairs).encode())
    params['sign'] = digest.hexdigest().upper()
    return params

# 二.请求接口URL


# Text colour used for every label drawn on the picture.
_LABEL_COLOR = (76, 176, 80)

# Exclusive lower bound of each expression bucket, highest first.
_EXPRESSION_LABELS = (
    (90, '一笑倾城'),
    (80, '心花怒放'),
    (70, '兴高采烈'),
    (60, '眉开眼笑'),
    (50, '喜上眉梢'),
    (40, '喜气洋洋'),
    (30, '笑逐颜开'),
    (20, '似笑非笑'),
    (10, '半嗔半喜'),
)


def _gender_label(score):
    """Map the gender score (low = female, high = male) to a label."""
    if score >= 70:
        return '男'
    if score >= 50:
        return "娘"
    if score < 30:
        return '女'
    return '女汉子'


def _expression_label(score):
    """Map the smile score to a label; anything up to 10 gets the lowest one."""
    for lower, label in _EXPRESSION_LABELS:
        if score > lower:
            return label
    return '黯然伤神'


def access_api(img):
    """Send an image to the face-detection API and save an annotated copy.

    For every face in the response the gender, age, expression, beauty and
    glasses labels are drawn next to (or inside) the face rectangle. The
    annotated picture is written to ``faces/<basename of img>``.

    Args:
        img: Path of the image file to analyse.

    Returns:
        str: ``'success'`` when the API reports ``ret == 0``, otherwise
        ``'fail'`` (also when the image file cannot be read).
    """
    frame = cv2.imread(img)
    if frame is None:
        # Unreadable or missing file: report failure instead of crashing
        # inside imencode.
        return 'fail'
    nparry_encode = cv2.imencode('.jpg', frame)[1]
    data_encode = np.array(nparry_encode)
    img_encode = base64.b64encode(data_encode)  # picture as base64
    url = 'https://api.ai.qq.com/fcgi-bin/face/face_detectface'
    res = requests.post(url, get_params(img_encode)).json()
    if res['ret'] != 0:  # 0 means the request succeeded
        return 'fail'

    # PIL is used for drawing because the labels contain Chinese text.
    pil_img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    draw = ImageDraw.Draw(pil_img)
    faces = res['data']['face_list']
    for obj in faces:
        img_width = res['data']['image_width']
        x = obj['x']  # top-left corner of the face rectangle
        y = obj['y']
        w = obj['width']
        h = obj['height']
        lines = (
            '性别 :' + _gender_label(obj['gender']),
            '年龄 :' + str(obj['age']),
            '表情 :' + _expression_label(obj['expression']),
            '魅力 :' + str(obj['beauty']),
            '眼镜 :' + ('有' if obj['glass'] == 1 else '无'),
        )
        if len(faces) > 1 or img_width - x - w < 170:
            # Several faces, or too little room to the right: write inside
            # the rectangle with a font scaled to the face width.
            font = ImageFont.truetype('yahei.ttf', w // 8, encoding='utf-8')
            text_x = x + 10
        else:
            font = ImageFont.truetype('yahei.ttf', 20, encoding='utf-8')
            text_x = x + w + 10
        delt = h // 5  # vertical distance between two label lines
        for row, text in enumerate(lines):
            draw.text((text_x, y + 10 + delt * row), text, _LABEL_COLOR, font=font)
        draw.rectangle((x, y, x + w, y + h), outline="#4CB050")

    # Written once after all faces are drawn, so the file exists even when
    # the face list is empty.
    cv2img = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2BGR)
    cv2.imwrite('faces/{}'.format(os.path.basename(img)), cv2img)
    return 'success'

================================================
FILE: wechat/requirements.txt
================================================
certifi==2018.4.16
chardet==3.0.4
falcon==1.4.1
idna==2.6
numpy==1.14.5
opencv-python==3.4.1.15
optionaldict==0.1.1
Pillow==5.1.0
pycrypto==2.6.1
python-dateutil==2.7.3
python-mimeparse==1.6.0
requests==2.18.4
six==1.11.0
urllib3==1.22
waitress==1.1.0
wechatpy==1.7.0
xmltodict==0.11.0


================================================
FILE: wechat/utils.py
================================================
# -*-coding:utf-8 -*-
import requests
import json
import threading
import time
import os

# Current WeChat access token; overwritten by get_access_token() and read by img_upload().
token = ''
# Credentials passed to get_access_token() at the bottom of this module.
# NOTE(review): these look like placeholders to be replaced with the real AppID / AppSecret.
app_id = '开发者ID(AppID)'
secret = '开发者密码(AppSecret)'


def img_download(url, name):
    '''Download ``url`` into the ``images/`` folder and report the saved file.

    The file is stored as ``images/<name>-<timestamp>.jpg``.

    Returns the string ``'large'`` when the saved file is 1 MiB or bigger,
    otherwise the base name of the saved file.
    '''
    response = requests.get(url)
    stamp = time.strftime("%Y_%m_%d%H_%M_%S", time.localtime())
    path = 'images/{}-{}.jpg'.format(name, stamp)
    with open(path, 'wb') as out:
        out.write(response.content)
    too_big = os.path.getsize(path) >= 1048576  # 1 MiB threshold
    return 'large' if too_big else os.path.basename(path)


def get_access_token(appid, secret):
    '''Fetch a WeChat access_token and schedule a refresh every 100 minutes.

    The token is stored in the module-level ``token`` variable. The pending
    refresh timer is stored in the module-level ``timer`` variable.

    Args:
        appid: WeChat developer AppID.
        secret: WeChat developer AppSecret.

    Raises:
        KeyError: if the response JSON contains no ``access_token`` field.
    '''
    global token
    global timer

    url = 'https://api.weixin.qq.com/cgi-bin/token?grant_type=client_credential&appid={}&secret={}'.format(appid, secret)
    r = requests.get(url)
    parse_json = json.loads(r.text)
    token = parse_json['access_token']
    # The timer must re-invoke this function with the same credentials;
    # without ``args`` the scheduled call fails with a TypeError and the
    # token is never refreshed.
    timer = threading.Timer(6000, get_access_token, args=(appid, secret))
    timer.start()


def img_upload(mediaType, name):
    '''Upload a local file as temporary WeChat media and return its media_id.

    Args:
        mediaType: value sent as the ``type`` query parameter of the upload URL.
        name: path of the local file to upload.

    Returns:
        The ``media_id`` field of the JSON response.

    Raises:
        KeyError: if the response JSON contains no ``media_id`` field.
    '''
    url = "https://api.weixin.qq.com/cgi-bin/media/upload?access_token=%s&type=%s" % (token, mediaType)
    # Use a context manager so the file handle is closed after the upload
    # instead of leaking on every call.
    with open(name, 'rb') as media:
        r = requests.post(url, files={'media': media})
    parse_json = json.loads(r.text)
    return parse_json['media_id']

# Fetch the initial access token when this module is imported (performs an HTTP request).
get_access_token(app_id, secret)

================================================
FILE: 爬虫集合/README.MD
================================================
# 代码详细说明请看文章

[Python 爬虫入门(一)——爬取糗事百科](https://mp.weixin.qq.com/s/ApnEy6NWS2f-DqIIrhHzGw)

[Python 爬虫入门(二)——爬取妹子图](https://mp.weixin.qq.com/s/4TZHgoE_yqeDha17f3Tbew)

[Python 爬虫——Python 岗位分析报告](https://mp.weixin.qq.com/s/8wAHBPnQMbcrP9La7WZiJA)

[Python 爬虫利器——Selenium介绍](https://mp.weixin.qq.com/s/YJGjZkUejEos_yJ1ukp5kw)

[Python 爬虫——抖音App视频抓包](https://mp.weixin.qq.com/s/a8Tky_u1u0A4vbssnAK2_g)

================================================
FILE: 爬虫集合/lagou.py
================================================
import random
import time

import requests
from openpyxl import Workbook
import pymysql.cursors


def get_conn():
    '''Open and return a pymysql connection to the local ``python`` database.'''
    settings = {
        'host': 'localhost',
        'user': 'root',
        'password': 'root',
        'db': 'python',
        'charset': 'utf8mb4',
        'cursorclass': pymysql.cursors.DictCursor,
    }
    return pymysql.connect(**settings)


def insert(conn, info):
    '''Insert one job record into the ``python`` table and commit.

    ``info`` supplies the seven column values in the order of the column
    list in the statement below.
    '''
    statement = (
        "INSERT INTO `python` "
        "(`shortname`, `fullname`, `industryfield`, `companySize`, `salary`, `city`, `education`) "
        "VALUES (%s, %s, %s, %s, %s, %s, %s)"
    )
    with conn.cursor() as cur:
        cur.execute(statement, info)
    conn.commit()


def get_json(url, page, lang_name):
    '''Return the job rows found on one result page.

    POSTs the search form to ``url`` and reads the job entries from
    ``content.positionResult.result`` of the JSON response. Each returned
    row is a list of seven values in the order of ``wanted_keys`` below;
    a missing key is replaced with '无'.
    '''
    headers = {
        'Host': 'www.lagou.com',
        'Connection': 'keep-alive',
        'Content-Length': '23',
        'Origin': 'https://www.lagou.com',
        'X-Anit-Forge-Code': '0',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'X-Requested-With': 'XMLHttpRequest',
        'X-Anit-Forge-Token': 'None',
        'Referer': 'https://www.lagou.com/jobs/list_python?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWords=&suginput=',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7'
    }
    form = {'first': 'false', 'pn': page, 'kd': lang_name}
    payload = requests.post(url, form, headers=headers).json()
    jobs = payload['content']['positionResult']['result']
    wanted_keys = (
        'companyShortName',
        'companyFullName',
        'industryField',
        'companySize',
        'salary',
        'city',
        'education',
    )
    return [[job.get(key, '无') for key in wanted_keys] for job in jobs]


def main():
    '''Crawl 30 result pages for each of five cities and store every row.

    Each row is inserted into the database and appended to an Excel sheet.
    The connection is closed and the workbook saved even when the crawl
    stops early because of an error, so rows already collected are kept.
    '''
    lang_name = 'python'
    wb = Workbook()  # open the Excel workbook
    ws1 = wb.active  # a single sheet is shared by all cities
    ws1.title = lang_name
    conn = get_conn()  # open the database connection
    try:
        for city in ['北京', '上海', '广州', '深圳', '杭州']:   # five cities
            url = 'https://www.lagou.com/jobs/positionAjax.json?city={}&needAddtionalResult=false'.format(city)
            for page in range(1, 31):   # 30 pages per city
                info = get_json(url, page, lang_name)
                # Report the page that was just fetched, not the next one.
                print(city, 'page', page)
                # Random pause between requests.
                time.sleep(random.randint(10, 20))
                for row in info:
                    insert(conn, tuple(row))  # write the row to the database
                    ws1.append(row)
    finally:
        conn.close()  # close the database connection
        wb.save('{}职位信息.xlsx'.format(lang_name))


if __name__ == '__main__':
    main()

================================================
FILE: 爬虫集合/meizitu.py
================================================
import requests
import os
import time
import threading
from bs4 import BeautifulSoup


def download_page(url):
    '''
    Fetch ``url`` with a browser User-Agent and return its text decoded as gb2312.
    '''
    user_agent = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0"}
    response = requests.get(url, headers=user_agent)
    response.encoding = 'gb2312'
    return response.text


def get_pic_list(html):
    '''
    Find every ``li.wp-item`` entry in a listing page and pass the link and
    title of its ``h3.tit`` anchor to get_pic.
    '''
    listing = BeautifulSoup(html, 'html.parser')
    for entry in listing.find_all('li', class_='wp-item'):
        anchor = entry.find('h3', class_='tit').find('a')
        get_pic(anchor.get('href'), anchor.get_text())


def get_pic(link, text):
    '''
    Download every image inside the ``div#picture`` element of the page at
    ``link`` and save it under ``pic/<text>/``.
    '''
    page = BeautifulSoup(download_page(link), 'html.parser')
    images = page.find('div', id="picture").find_all('img')  # all images on the page
    user_agent = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0"}
    create_dir('pic/{}'.format(text))
    for img in images:
        src = img.get('src')  # URL of the image itself
        response = requests.get(src, headers=user_agent)
        # The file is named after the last path segment of the image URL.
        target = 'pic/{}/{}'.format(text, src.split('/')[-1])
        with open(target, 'wb') as out:
            out.write(response.content)
            time.sleep(1)   # pause between downloads to go easy on the site


def create_dir(name):
    '''
    Create directory ``name``, including missing parents, if it does not exist.

    This function is called from several worker threads. ``exist_ok=True``
    avoids the check-then-create race, where another thread creates the
    directory between the existence check and ``makedirs``.

    Raises:
        FileExistsError: if ``name`` exists but is not a directory.
    '''
    os.makedirs(name, exist_ok=True)


def execute(url):
    '''
    Download one listing page and process every picture set found on it.
    '''
    get_pic_list(download_page(url))


def main():
    '''
    Crawl listing pages 1 to 71 with at most five worker threads at a time.

    Waits for the remaining workers before returning. They are daemon
    threads, so they would otherwise be killed in the middle of a download
    when the interpreter exits.
    '''
    create_dir('pic')
    queue = list(range(1, 72))   # page numbers used to build the listing URLs
    threads = []
    while queue:
        # Rebuild the list rather than removing from it while iterating,
        # which skips elements.
        threads = [t for t in threads if t.is_alive()]
        while len(threads) < 5 and queue:   # at most 5 workers at once
            cur_page = queue.pop(0)
            url = 'http://meizitu.com/a/more_{}.html'.format(cur_page)
            thread = threading.Thread(target=execute, args=(url,))
            thread.daemon = True
            thread.start()
            # Log the worker's name; current_thread() here is always the main thread.
            print('{}正在下载{}页'.format(thread.name, cur_page))
            threads.append(thread)
        time.sleep(0.1)   # avoid a busy loop while every worker slot is taken
    for thread in threads:
        thread.join()


if __name__ == '__main__':
    main()


================================================
FILE: 爬虫集合/qiubai_crawer.py
================================================
import requests
from bs4 import BeautifulSoup


def download_page(url):
    '''Fetch ``url`` with a browser User-Agent and return the response text.'''
    user_agent = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0"}
    return requests.get(url, headers=user_agent).text


def get_content(html, page):
    '''Parse one listing page and append every article to the output file.

    For each ``div.article`` under ``#content-left`` this collects the
    author, text, vote count, comment count and, when the gender element
    is present, gender and age. The formatted record is passed to save_txt.
    '''
    template = """第{}页 作者：{} 性别：{} 年龄：{} 点赞：{} 评论：{}\n{}\n------------\n"""
    container = BeautifulSoup(html, 'html.parser').find(id='content-left')
    for article in container.find_all('div', class_="article"):
        author = article.find('h2').string  # author name
        body = article.find('div', class_='content').find('span').get_text()  # article text
        stats = article.find('div', class_='stats')
        votes = stats.find('span', class_='stats-vote').find('i', class_='number').string
        comments = stats.find('span', class_='stats-comments').find('i', class_='number').string

        # Gender and age stay empty when the gender element is absent.
        gender, age = '', ''
        badge = article.find('div', class_='articleGender')
        if badge is not None:
            css_classes = badge['class']
            if "womenIcon" in css_classes:
                gender = '女'
            elif "manIcon" in css_classes:
                gender = '男'
            age = badge.string   # the element's text holds the age

        save_txt(template.format(page, author, gender, age, votes, comments, body))


def save_txt(*args):
    '''Append every string in ``args``, in order, to ``qiubai.txt`` (UTF-8).

    The file is opened once per call instead of once per argument. When no
    arguments are given the file is not touched.
    '''
    if not args:
        return
    with open('qiubai.txt', 'a', encoding='utf-8') as f:
        for piece in args:
            f.write(piece)


def main():
    '''Download listing pages 1 to 13 and save their articles.'''
    # The listing shows 13 pages in its footer, so the URLs are built from
    # a fixed range. Reading the page count from the footer with
    # Beautiful Soup would be more robust.
    for page_no in range(1, 14):
        page_url = 'https://qiushibaike.com/text/page/{}'.format(page_no)
        get_content(download_page(page_url), page_no)


if __name__ == '__main__':
    main()