Repository: injetlee/Python Branch: master Commit: 94faec41b8a7 Files: 26 Total size: 37.9 KB Directory structure: gitextract_gicf6sdo/ ├── .gitignore ├── CpuToInfluxdb.py ├── ModifyFilename.py ├── Python 黑魔法/ │ ├── Python 远程开机.py │ └── README.MD ├── README.md ├── biyingSpider.py ├── countFile.py ├── countPm.py ├── douban_book.py ├── douban_movie.py ├── excelToDatabase.py ├── image_recognition_zhihu.py ├── lagouSpider.py ├── login_zhihu.py ├── qiubai_crawer.py ├── readExcel.py ├── wechat/ │ ├── README.MD │ ├── connect.py │ ├── face_id.py │ ├── requirements.txt │ └── utils.py └── 爬虫集合/ ├── README.MD ├── lagou.py ├── meizitu.py └── qiubai_crawer.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ /no_use *.xlsx ================================================ FILE: CpuToInfluxdb.py ================================================ import psutil import os from influxdb import InfluxDBClient import time,math,random #获取当前运行的pid p1=psutil.Process(os.getpid()) from influxdb import InfluxDBClient import time,math,random while True: a = psutil.virtual_memory().percent #内存占用率 b = psutil.cpu_percent(interval=1.0) #cpu占用率 json_body = [ { "measurement": "cpu_load_short", "tags": { "host": "server01", "region": "us-west" }, #"time": "2009-11-10T23:00:00Z", "fields": { "cpu": b, "mem": a } } ] client = InfluxDBClient('localhost', 8086, 'root', 'root', 'xxyyxx') client.create_database('xxyyxx',if_not_exists=False) client.write_points(json_body) #result = client.query('select value from cpu_load_short;') #print("Result: {0}".format(result)) time.sleep(2) ================================================ FILE: ModifyFilename.py ================================================ import os dir = os.getcwd() subdir = os.listdir(dir) for i in subdir: path = os.path.join(dir, i) if os.path.isdir(path): end_dir = os.listdir(path) for i in range(len(end_dir)): newname = end_dir[i][0:50] os.rename(os.path.join(path, end_dir[ i]), os.path.join(path, newname)) ================================================ FILE: Python 黑魔法/Python 远程开机.py ================================================ def wake_up(request, mac='DC-4A-3E-78-3E-0A'): MAC = mac BROADCAST = "192.168.0.255" if len(MAC) != 17: raise ValueError("MAC address should be set as form 'XX-XX-XX-XX-XX-XX'") mac_address = MAC.replace("-", '') data = ''.join(['FFFFFFFFFFFF', mac_address * 20]) # 构造原始数据格式 send_data = b'' # 把原始数据转换为16进制字节数组, for i in range(0, len(data), 2): send_data = b''.join([send_data, struct.pack('B', int(data[i: i + 2], 16))]) print(send_data) # 通过socket广播出去,为避免失败,间隔广播三次 try: sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) sock.setsockopt(socket.SOL_SOCKET, socket.SO_BROADCAST, 1) sock.sendto(send_data, (BROADCAST, 7)) time.sleep(1) sock.sendto(send_data, (BROADCAST, 7)) time.sleep(1) sock.sendto(send_data, (BROADCAST, 7)) return HttpResponse() print("Done") except Exception as e: return HttpResponse() print(e) ================================================ FILE: Python 黑魔法/README.MD ================================================ # 代码详细说明请看文章 [Python 远程关机](https://mp.weixin.qq.com/s/RSod4XWxyzL32eNcrXLjUQ) ================================================ FILE: README.md ================================================ # 欢迎关注我的微信公众号【智能制造社区】 ## 左手代码,右手制造,分享智能制造相关技术和业务,包括 Python, C#, 数据库,工业大数据、物联网技术及MES/ERP/SAP等系统。 ## 可以通过微信公众号加我好友 ![二维码](qrcode.jpg) # 内容列表 ## [Python微信公众号开发](https://github.com/injetlee/Python/tree/master/wechat) - ### Python 微信公众号开发—小白篇(一) - ### Python 公众号开发—颜值检测 ## [Python 爬虫入门合集](https://github.com/injetlee/Python/tree/master/%E7%88%AC%E8%99%AB%E9%9B%86%E5%90%88) - ### Python 爬虫入门(一)——爬取糗事百科 - ### Python 爬虫入门(二)——爬取妹子图 - ### Python 爬虫——Python 岗位分析报告 - ### Python 爬虫利器——Selenium介绍 - ### Python 爬虫—— 抖音 App 视频抓包爬取 ## [Python 黑魔法](https://github.com/injetlee/Python/tree/master/Python%20%E9%BB%91%E9%AD%94%E6%B3%95) - ### Python 远程关机 ## SQL 数据库 - [1 小时 SQL 极速入门(一)](https://mp.weixin.qq.com/s/Lx4B349OlD49ihJPnB6YiA) - [1 小时 SQL 极速入门(二)](https://mp.weixin.qq.com/s/D-CEtGYomne5kV_Ji4lodA) - [1 小时 SQL 极速入门(三)](https://mp.weixin.qq.com/s/7aJqrhCNcvnt2gO3p5P50Q) - [SQL 高级查询——(层次化查询,递归)](https://mp.weixin.qq.com/s/R9Yldd-5AK4ObRA9Lfbz-Q) - [GROUP BY高级查询,ROLLUP,CUBE,GROUPPING详解](https://mp.weixin.qq.com/s/_OK6dtHGhp7ukC2pe1ginQ) - [SQL 行转列,列转行](https://mp.weixin.qq.com/s/xOFIg42FQhNpyg94ajhtqQ) ## 其他 - 1.[获取当前CPU状态,存储到Influxdb](https://github.com/injetlee/demo/blob/master/CpuToInfluxdb.py) - 2.[模拟登录知乎](https://github.com/injetlee/demo/blob/master/login_zhihu.py) - 3.[对目录下所有文件计数](https://github.com/injetlee/demo/blob/master/countFile.py) - 4.[爬取豆瓣电影top250](https://github.com/injetlee/demo/blob/master/douban_movie.py) - 5.[Excel文件读入数据库](https://github.com/injetlee/demo/blob/master/excelToDatabase.py) - 6.[爬取拉勾网职位信息](https://github.com/injetlee/demo/blob/master/lagouSpider.py) - 7.[批量修改文件名](https://github.com/injetlee/demo/blob/master/ModifyFilename.py) - 8.[读写excel](https://github.com/injetlee/demo/blob/master/readExcel.py) - 9.[下载必应首页图片,只下载当天的,一张。](https://github.com/injetlee/Python/blob/master/biyingSpider.py) ================================================ FILE: biyingSpider.py ================================================ import requests import re import time local = time.strftime("%Y.%m.%d") url = 'http://cn.bing.com/' con = requests.get(url) content = con.text reg = r"(az/hprichbg/rb/.*?.jpg)" a = re.findall(reg, content, re.S)[0] print(a) picUrl = url + a read = requests.get(picUrl) f = open('%s.jpg' % local, 'wb') f.write(read.content) f.close() ================================================ FILE: countFile.py ================================================ import os result = [] def get_all(cwd): get_dir = os.listdir(cwd) #遍历当前目录,获取文件列表 for i in get_dir: sub_dir = os.path.join(cwd,i) # 把第一步获取的文件加入路径 if os.path.isdir(sub_dir): #如果当前仍然是文件夹,递归调用 get_all(sub_dir) else: ax = os.path.basename(sub_dir) #如果当前路径不是文件夹,则把文件名放入列表 result.append(ax) print(len(result)) #对列表计数 if __name__ == "__main__": cur_path = os.getcwd() #当前目录 get_all(cur_path) ================================================ FILE: countPm.py ================================================ # -*- coding:utf-8 -*- def count_pm(*args): alist = list([round(i*2-8,2) for i in args]) #计算三种颗粒浓度 result = [] for pm in alist: pm_abs = abs(pm) result.append(generate_iso_code(pm_abs)) print (result) return result def generate_iso_code(x): pm_value = [0.01,0.02,0.04,0.08,0.16,0.32,0.64,1.3,2.5,5,10,20,40,80] #颗粒浓度 iso = list(range(1,25)) #iso级别,共24级 for i in range(len(pm_value)): #for循环得到某个浓度范围的iso4006级别 if pm_value[i] < x <= pm_value[i+1]: iso_code = iso[i] break return iso_code if __name__ == '__main__': count_pm(7.95,5.85,3.98) count_pm(7.918,5.949,5.456) count_pm(6.916,3.956,3.956) ================================================ FILE: douban_book.py ================================================ from bs4 import BeautifulSoup import requests from openpyxl import Workbook excel_name = "书籍.xlsx" wb = Workbook() ws1 = wb.active ws1.title='书籍' def get_html(url): header = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0'} html = requests.get(url, headers=header).content return html def get_con(html): soup = BeautifulSoup(html,'html.parser') book_list = soup.find('div', attrs={'class': 'article'}) page = soup.find('div', attrs={'class': 'paginator'}) next_page = page.find('span', attrs={'class': 'next'}).find('a') name = [] for i in book_list.find_all('table'): book_name = i.find('div', attrs={'class': 'pl2'}) m = list(book_name.find('a').stripped_strings) if len(m)>1: x = m[0]+m[1] else: x = m[0] #print(x) name.append(x) if next_page: return name, next_page.get('href') else: return name, None def main(): url = 'https://book.douban.com/top250' name_list=[] while url: html = get_html(url) name, url = get_con(html) name_list = name_list + name for i in name_list: location = 'A%s'%(name_list.index(i)+1) print(i) print(location) ws1[location]=i wb.save(filename=excel_name) if __name__ == '__main__': main() ================================================ FILE: douban_movie.py ================================================ #!/usr/bin/env python # encoding=utf-8 import requests import re import codecs from bs4 import BeautifulSoup from openpyxl import Workbook wb = Workbook() dest_filename = '电影.xlsx' ws1 = wb.active ws1.title = "电影top250" DOWNLOAD_URL = 'http://movie.douban.com/top250/' def download_page(url): """获取url地址页面内容""" headers = { 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36' } data = requests.get(url, headers=headers).content return data def get_li(doc): soup = BeautifulSoup(doc, 'html.parser') ol = soup.find('ol', class_='grid_view') name = [] # 名字 star_con = [] # 评价人数 score = [] # 评分 info_list = [] # 短评 for i in ol.find_all('li'): detail = i.find('div', attrs={'class': 'hd'}) movie_name = detail.find( 'span', attrs={'class': 'title'}).get_text() # 电影名字 level_star = i.find( 'span', attrs={'class': 'rating_num'}).get_text() # 评分 star = i.find('div', attrs={'class': 'star'}) star_num = star.find(text=re.compile('评价')) # 评价 info = i.find('span', attrs={'class': 'inq'}) # 短评 if info: # 判断是否有短评 info_list.append(info.get_text()) else: info_list.append('无') score.append(level_star) name.append(movie_name) star_con.append(star_num) page = soup.find('span', attrs={'class': 'next'}).find('a') # 获取下一页 if page: return name, star_con, score, info_list, DOWNLOAD_URL + page['href'] return name, star_con, score, info_list, None def main(): url = DOWNLOAD_URL name = [] star_con = [] score = [] info = [] while url: doc = download_page(url) movie, star, level_num, info_list, url = get_li(doc) name = name + movie star_con = star_con + star score = score + level_num info = info + info_list for (i, m, o, p) in zip(name, star_con, score, info): col_A = 'A%s' % (name.index(i) + 1) col_B = 'B%s' % (name.index(i) + 1) col_C = 'C%s' % (name.index(i) + 1) col_D = 'D%s' % (name.index(i) + 1) ws1[col_A] = i ws1[col_B] = m ws1[col_C] = o ws1[col_D] = p wb.save(filename=dest_filename) if __name__ == '__main__': main() ================================================ FILE: excelToDatabase.py ================================================ from openpyxl import load_workbook import pymysql config = { 'host': '127.0.0.1', 'port':3306, 'user': 'root', 'password': 'root', 'charset': 'utf8mb4', #'cursorclass': pymysql.cursors.DictCursor } conn = pymysql.connect(**config) conn.autocommit(1) cursor = conn.cursor() name = 'lyexcel' cursor.execute('create database if not exists %s' %name) conn.select_db(name) table_name = 'info' cursor.execute('create table if not exists %s(id MEDIUMINT NOT NULL AUTO_INCREMENT,name varchar(30),tel varchar(30),primary key (id))'%table_name) wb2 = load_workbook('hpu.xlsx') ws=wb2.get_sheet_names() for row in wb2: print("1") for cell in row: value1=(cell[0].value,cell[4].value) cursor.execute('insert into info (name,tel) values(%s,%s)',value1) print("overing...") # for row in A: # print(row) #print (wb2.get_sheet_names()) ================================================ FILE: image_recognition_zhihu.py ================================================ # -*- coding:UTF-8 -*- import requests , time ,random import hmac ,json ,base64 from bs4 import BeautifulSoup from hashlib import sha1 import TencentYoutuyun from PIL import Image import uuid def recognition_captcha(data): ''' 识别验证码 ''' file_id = str(uuid.uuid1()) filename = 'captcha_'+ file_id +'.gif' filename_png = 'captcha_'+ file_id +'.png' if(data is None): return data = base64.b64decode(data.encode('utf-8')) with open( filename ,'wb') as fb: fb.write( data ) appid = 'appid' # 接入优图服务,注册账号获取 secret_id = 'secret_id' secret_key = 'secret_key' userid= 'userid' end_point = TencentYoutuyun.conf.API_YOUTU_END_POINT youtu = TencentYoutuyun.YouTu(appid, secret_id, secret_key, userid, end_point) # 初始化 # 拿到的是gif格式,而优图只支持 JPG PNG BMP 其中之一,这时我们需要 pip install Pillow 来转换格式 im = Image.open( filename) im.save( filename_png ,"png") im.close() result = youtu.generalocr( filename_png , data_type = 0 , seq = '') # 0代表本地路径,1代表url return result def get_captcha(sessiona,headers): ''' 获取验证码 ''' need_cap = False while( need_cap is not True): try: sessiona.get('https://www.zhihu.com/signin',headers=headers) # 拿cookie:_xsrf resp2 = sessiona.get('https://www.zhihu.com/api/v3/oauth/captcha?lang=cn',headers=headers) # 拿cookie:capsion_ticket need_cap = json.loads(resp2.text)["show_captcha"] # {"show_captcha":false} 表示不用验证码 time.sleep( 0.5 + random.randint(1,9)/10 ) except Exception: continue try: resp3 = sessiona.put('https://www.zhihu.com/api/v3/oauth/captcha?lang=cn',headers=headers) # 拿到验证码数据,注意是put img_data = json.loads(resp3.text)["img_base64"] except Exception: return return img_data def create_point( point_data, confidence ): ''' 获得点阵 ''' # 实际操作下,套路不深,x间隔25,y相同,共7个点 ,先模拟意思一下 points = {1:[ 20.5,25.1875],2:[ 45.5,25.1875],3:[ 70.5,25.1875],4:[ 95.5,25.1875],5:[120.5,25.1875],6:[145.5,25.1875],7:[170.5,25.1875]} wi = 0 input_points = [] for word in ( point_data['items'][0]['words'] ): wi = wi+1 if( word['confidence'] < confidence ): try: input_points.append(points[wi]) # 倒置的中文,优图识别不出来,置信度会低于0.5 except KeyError: continue if( len(input_points) > 2 or len(input_points) == 0 ): return [] # 7个字中只有2个倒置中文的成功率高 result = {} result['img_size']=[200,44] result['input_points']=input_points result = json.dumps(result) print(result) return result def bolting(k_low,k_hi,k3_confidence): ''' 筛选把握大的进行验证 ''' start = time.time() is_success = False while(is_success is not True): points_len = 1 angle = -20 img_ko = [] while(points_len != 21 or angle < k_low or angle > k_hi ): img_data = get_captcha(sessiona,headers) img_ko = recognition_captcha(img_data) ## json.dumps 序列化时对中文默认使用的ascii编码.想输出真正的中文需要指定ensure_ascii=False # img_ko_json = json.dumps(img_ko , indent =2 ,ensure_ascii=False ) # img_ko_json = img_ko_json.encode('raw_unicode_escape') ## 因为python3的原因,也因为优图自身的原因,此处要特殊处理 # with open( "json.txt" ,'wb') as fb: # fb.write( img_ko_json ) try: points_len = len(img_ko['items'][0]['itemstring']) angle = img_ko['angle'] except Exception: points_len = 1 angle = -20 continue # print(img_ko_json.decode('utf8')) ## stdout用的是utf8,需转码才能正常显示 # print('-'*50) input_text = create_point( img_ko ,k3_confidence ) if(type(input_text) == type([])): continue data = { "input_text":input_text } # 提交过快会被拒绝,{"code":120005,"name":"ERR_VERIFY_CAPTCHA_TOO_QUICK"} ,假装思考5秒钟 time.sleep( 4 + random.randint(1,9)/10 ) try: resp5 = sessiona.post('https://www.zhihu.com/api/v3/oauth/captcha?lang=cn',data,headers=headers) except Exception: continue print("angle: "+ str(angle) ) print(BeautifulSoup(resp5.content ,'html.parser')) # 如果验证成功,会回应{"success":true},开心 print('-'*50) try: is_success = json.loads(resp5.text)["success"] except KeyError: continue end = time.time() return end-start if __name__ == "__main__": sessiona = requests.Session() headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0','authorization':'oauth c3cef7c66a1843f8b3a9e6a1e3160e20'} k3_confidence = 0.71 ''' # 可视化数据会被保存在云端供浏览 # https://plot.ly/~weldon2010/4 # 纯属学习,并未看出"角度"范围扩大对图像识别的影响,大部分时候60s内能搞定,说明优图还是很强悍的,识别速度也非常快 ''' runtime_list_x = [] runtime_list_y = [] nn = range(1,11) # 愿意的话搞多线程,1百万次更有意思 # 成功尝试100次,形成2维数据以热力图的方式展示 for y in nn : for x in nn : runtime_list_x.append( bolting(-3,3,k3_confidence) ) print( "y: " + str(runtime_list_y) ) print( "x: " + str(runtime_list_x) ) runtime_list_y.append(runtime_list_x.copy()) runtime_list_x = [] print ("-"*30) print( runtime_list_y ) print ("-"*30) # pip install plotly 数据可视化 import plotly import plotly.graph_objs as go plotly.tools.set_credentials_file(username='username', api_key='username') # 设置账号,去官网注册 trace = go.Heatmap(z = runtime_list_y , x = [n for n in nn ] ,y =[n for n in nn ]) data=[trace] plotly.plotly.plot(data, filename='weldon-time2-heatmap') # 尝试后发现一个特点,基本都是1~2个倒置中文,这样我们可以借此提速 # 角度范围放大,仅当识别出倒置中文为1~2个时才提交验证否则放弃继续寻找 ### chcp 65001 (win下改变cmd字符集) ### python c:\python34\image_recognition_zhihu.py ================================================ FILE: lagouSpider.py ================================================ import requests from openpyxl import Workbook def get_json(url, page, lang_name): data = {'first': 'true', 'pn': page, 'kd': lang_name} json = requests.post(url, data).json() list_con = json['content']['positionResult']['result'] info_list = [] for i in list_con: info = [] info.append(i['companyShortName']) info.append(i['companyName']) info.append(i['salary']) info.append(i['city']) info.append(i['education']) info_list.append(info) return info_list def main(): lang_name = input('职位名:') page = 1 url = 'http://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false' info_result = [] while page < 31: info = get_json(url, page, lang_name) info_result = info_result + info page += 1 wb = Workbook() ws1 = wb.active ws1.title = lang_name for row in info_result: ws1.append(row) wb.save('职位信息.xlsx') if __name__ == '__main__': main() ================================================ FILE: login_zhihu.py ================================================ # -*- coding:UTF-8 -*- import requests , time import hmac ,json from bs4 import BeautifulSoup from hashlib import sha1 def get_captcha(data,need_cap): ''' 处理验证码 ''' if need_cap is False: return with open('captcha.gif','wb') as fb: fb.write(data) return input('captcha:') def get_signature(grantType,clientId,source,timestamp): ''' 处理签名 ''' hm = hmac.new(b'd1b964811afb40118a12068ff74a12f4',None,sha1) hm.update(str.encode(grantType)) hm.update(str.encode(clientId)) hm.update(str.encode(source)) hm.update(str.encode(timestamp)) return str(hm.hexdigest()) def login(username,password,oncaptcha,sessiona,headers): ''' 处理登录 ''' resp1 = sessiona.get('https://www.zhihu.com/signin',headers=headers) # 拿cookie:_xsrf resp2 = sessiona.get('https://www.zhihu.com/api/v3/oauth/captcha?lang=cn',headers=headers) # 拿cookie:capsion_ticket need_cap = json.loads(resp2.text)["show_captcha"] # {"show_captcha":false} 表示不用验证码 grantType = 'password' clientId = 'c3cef7c66a1843f8b3a9e6a1e3160e20' source ='com.zhihu.web' timestamp = str((time.time()*1000)).split('.')[0] # 签名只按这个时间戳变化 captcha_content = sessiona.get('https://www.zhihu.com/captcha.gif?r=%d&type=login'%(time.time()*1000),headers=headers).content data = { "client_id":clientId, "grant_type":grantType, "timestamp":timestamp, "source":source, "signature": get_signature(grantType,clientId,source,timestamp), # 获取签名 "username":username, "password":password, "lang":"cn", "captcha":oncaptcha(captcha_content,need_cap), # 获取图片验证码 "ref_source":"other_", "utm_source":"" } print("**2**: "+str(data)) print("-"*50) resp = sessiona.post('https://www.zhihu.com/api/v3/oauth/sign_in',data,headers=headers).content print(BeautifulSoup(resp,'html.parser')) print("-"*50) return resp if __name__ == "__main__": sessiona = requests.Session() headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0','authorization':'oauth c3cef7c66a1843f8b3a9e6a1e3160e20'} login('12345678@qq.com','12345678',get_captcha,sessiona,headers) # 用户名密码换自己的就好了 resp = sessiona.get('https://www.zhihu.com/inbox',headers=headers) # 登录进去了,可以看私信了 print(BeautifulSoup(resp.content ,'html.parser')) ### chcp 65001 (win下改变cmd字符集) ### python c:\python34\login_zhihu.py ### 有非常无语的事情发生,还以为代码没生效 ================================================ FILE: qiubai_crawer.py ================================================ import requests from bs4 import BeautifulSoup def download_page(url): headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0"} r = requests.get(url, headers=headers) return r.text def get_content(html, page): output = """第{}页 作者:{} 性别:{} 年龄:{} 点赞:{} 评论:{}\n{}\n------------\n""" soup = BeautifulSoup(html, 'html.parser') con = soup.find(id='content-left') con_list = con.find_all('div', class_="article") for i in con_list: author = i.find('h2').string # 获取作者名字 content = i.find('div', class_='content').find('span').get_text() # 获取内容 stats = i.find('div', class_='stats') vote = stats.find('span', class_='stats-vote').find('i', class_='number').string comment = stats.find('span', class_='stats-comments').find('i', class_='number').string author_info = i.find('div', class_='articleGender') # 获取作者 年龄,性别 if author_info is not None: # 非匿名用户 class_list = author_info['class'] if "womenIcon" in class_list: gender = '女' elif "manIcon" in class_list: gender = '男' else: gender = '' age = author_info.string # 获取年龄 else: # 匿名用户 gender = '' age = '' save_txt(output.format(page, author, gender, age, vote, comment, content)) def save_txt(*args): for i in args: with open('qiubai.txt', 'a', encoding='utf-8') as f: f.write(i) def main(): # 我们点击下面链接,在页面下方可以看到共有13页,可以构造如下 url, # 当然我们最好是用 Beautiful Soup找到页面底部有多少页。 for i in range(1, 14): url = 'https://qiushibaike.com/text/page/{}'.format(i) html = download_page(url) get_content(html, i) if __name__ == '__main__': main() ================================================ FILE: readExcel.py ================================================ from openpyxl import Workbook from openpyxl.compat import range from openpyxl.cell import get_column_letter wb = Workbook() dest_filename = 'empty_book2.xlsx' ws1 = wb.active # 第一个表 ws1.title = "range names" # 第一个表命名 # 遍历第一个表的1到40行,赋值一个600内的随机数 for row in range(1, 40): ws1.append(range(60)) ws2 = wb.create_sheet(title="Pi") ws2['F5'] = 3.14 ws3 = wb.create_sheet(title="Data") for row in range(10, 20): for col in range(27, 54): _ = ws3.cell(column=col, row=row, value="%s" % get_column_letter(col)) wb.save(filename=dest_filename) ================================================ FILE: wechat/README.MD ================================================ # 详细使用请看文章 [Python微信公众号开发—小白篇(一)](https://mp.weixin.qq.com/s/iMPUC0yxI-zuf4AjtyAu6g) [Python公众号开发—颜值检测](https://mp.weixin.qq.com/s/I0DxhIHkeqhc2LeQ2ICHeA) ================================================ FILE: wechat/connect.py ================================================ # -*-coding:utf-8 -*- import falcon from falcon import uri from wechatpy.utils import check_signature from wechatpy.exceptions import InvalidSignatureException from wechatpy import parse_message from wechatpy.replies import TextReply, ImageReply from utils import img_download, img_upload from face_id import access_api class Connect(object): def on_get(self, req, resp): query_string = req.query_string query_list = query_string.split('&') b = {} for i in query_list: b[i.split('=')[0]] = i.split('=')[1] try: check_signature(token='lengxiao', signature=b['signature'], timestamp=b['timestamp'], nonce=b['nonce']) resp.body = (b['echostr']) except InvalidSignatureException: pass resp.status = falcon.HTTP_200 def on_post(self, req, resp): xml = req.stream.read() msg = parse_message(xml) if msg.type == 'text': reply = TextReply(content=msg.content, message=msg) xml = reply.render() resp.body = (xml) resp.status = falcon.HTTP_200 elif msg.type == 'image': name = img_download(msg.image, msg.source) print(name) r = access_api('images/' + name) if r == 'success': media_id = img_upload('image', 'faces/' + name) reply = ImageReply(media_id=media_id, message=msg) else: reply = TextReply(content='人脸检测失败,请上传1M以下人脸清晰的照片', message=msg) xml = reply.render() resp.body = (xml) resp.status = falcon.HTTP_200 app = falcon.API() connect = Connect() app.add_route('/connect', connect) ================================================ FILE: wechat/face_id.py ================================================ # -*-coding:utf-8 -*- import time import random import base64 import hashlib import requests from urllib.parse import urlencode import cv2 import numpy as np from PIL import Image, ImageDraw, ImageFont import os # 一.计算接口鉴权,构造请求参数 def random_str(): '''得到随机字符串nonce_str''' str = 'abcdefghijklmnopqrstuvwxyz' r = '' for i in range(15): index = random.randint(0,25) r += str[index] return r def image(name): with open(name, 'rb') as f: content = f.read() return base64.b64encode(content) def get_params(img): '''组织接口请求的参数形式,并且计算sign接口鉴权信息, 最终返回接口请求所需要的参数字典''' params = { 'app_id': '1106860829', 'time_stamp': str(int(time.time())), 'nonce_str': random_str(), 'image': img, 'mode': '0' } sort_dict = sorted(params.items(), key=lambda item: item[0], reverse=False) # 排序 sort_dict.append(('app_key', 'P8Gt8nxi6k8vLKbS')) # 添加app_key rawtext = urlencode(sort_dict).encode() # URL编码 sha = hashlib.md5() sha.update(rawtext) md5text = sha.hexdigest().upper() # 计算出sign,接口鉴权 params['sign'] = md5text # 添加到请求参数列表中 return params # 二.请求接口URL def access_api(img): frame = cv2.imread(img) nparry_encode = cv2.imencode('.jpg', frame)[1] data_encode = np.array(nparry_encode) img_encode = base64.b64encode(data_encode) # 图片转为base64编码格式 url = 'https://api.ai.qq.com/fcgi-bin/face/face_detectface' res = requests.post(url, get_params(img_encode)).json() # 请求URL,得到json信息 # 把信息显示到图片上 if res['ret'] == 0: # 0代表请求成功 pil_img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)) # 把opencv格式转换为PIL格式,方便写汉字 draw = ImageDraw.Draw(pil_img) for obj in res['data']['face_list']: img_width = res['data']['image_width'] # 图像宽度 img_height = res['data']['image_height'] # 图像高度 # print(obj) x = obj['x'] # 人脸框左上角x坐标 y = obj['y'] # 人脸框左上角y坐标 w = obj['width'] # 人脸框宽度 h = obj['height'] # 人脸框高度 # 根据返回的值,自定义一下显示的文字内容 if obj['glass'] == 1: # 眼镜 glass = '有' else: glass = '无' if obj['gender'] >= 70: # 性别值从0-100表示从女性到男性 gender = '男' elif 50 <= obj['gender'] < 70: gender = "娘" elif obj['gender'] < 30: gender = '女' else: gender = '女汉子' if 90 < obj['expression'] <= 100: # 表情从0-100,表示笑的程度 expression = '一笑倾城' elif 80 < obj['expression'] <= 90: expression = '心花怒放' elif 70 < obj['expression'] <= 80: expression = '兴高采烈' elif 60 < obj['expression'] <= 70: expression = '眉开眼笑' elif 50 < obj['expression'] <= 60: expression = '喜上眉梢' elif 40 < obj['expression'] <= 50: expression = '喜气洋洋' elif 30 < obj['expression'] <= 40: expression = '笑逐颜开' elif 20 < obj['expression'] <= 30: expression = '似笑非笑' elif 10 < obj['expression'] <= 20: expression = '半嗔半喜' elif 0 <= obj['expression'] <= 10: expression = '黯然伤神' delt = h // 5 # 确定文字垂直距离 # 写入图片 if len(res['data']['face_list']) > 1: # 检测到多个人脸,就把信息写入人脸框内 font = ImageFont.truetype('yahei.ttf', w // 8, encoding='utf-8') # 提前把字体文件下载好 draw.text((x + 10, y + 10), '性别 :' + gender, (76, 176, 80), font=font) draw.text((x + 10, y + 10 + delt * 1), '年龄 :' + str(obj['age']), (76, 176, 80), font=font) draw.text((x + 10, y + 10 + delt * 2), '表情 :' + expression, (76, 176, 80), font=font) draw.text((x + 10, y + 10 + delt * 3), '魅力 :' + str(obj['beauty']), (76, 176, 80), font=font) draw.text((x + 10, y + 10 + delt * 4), '眼镜 :' + glass, (76, 176, 80), font=font) elif img_width - x - w < 170: # 避免图片太窄,导致文字显示不完全 font = ImageFont.truetype('yahei.ttf', w // 8, encoding='utf-8') draw.text((x + 10, y + 10), '性别 :' + gender, (76, 176, 80), font=font) draw.text((x + 10, y + 10 + delt * 1), '年龄 :' + str(obj['age']), (76, 176, 80), font=font) draw.text((x + 10, y + 10 + delt * 2), '表情 :' + expression, (76, 176, 80), font=font) draw.text((x + 10, y + 10 + delt * 3), '魅力 :' + str(obj['beauty']), (76, 176, 80), font=font) draw.text((x + 10, y + 10 + delt * 4), '眼镜 :' + glass, (76, 176, 80), font=font) else: font = ImageFont.truetype('yahei.ttf', 20, encoding='utf-8') draw.text((x + w + 10, y + 10), '性别 :' + gender, (76, 176, 80), font=font) draw.text((x + w + 10, y + 10 + delt * 1), '年龄 :' + str(obj['age']), (76, 176, 80), font=font) draw.text((x + w + 10, y + 10 + delt * 2), '表情 :' + expression, (76, 176, 80), font=font) draw.text((x + w + 10, y + 10 + delt * 3), '魅力 :' + str(obj['beauty']), (76, 176, 80), font=font) draw.text((x + w + 10, y + 10 + delt * 4), '眼镜 :' + glass, (76, 176, 80), font=font) draw.rectangle((x, y, x + w, y + h), outline="#4CB050") # 画出人脸方框 cv2img = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2BGR) # 把 pil 格式转换为 cv cv2.imwrite('faces/{}'.format(os.path.basename(img)), cv2img) # 保存图片到 face 文件夹下 return 'success' else: return 'fail' ================================================ FILE: wechat/requirements.txt ================================================ certifi==2018.4.16 chardet==3.0.4 falcon==1.4.1 idna==2.6 numpy==1.14.5 opencv-python==3.4.1.15 optionaldict==0.1.1 Pillow==5.1.0 pycrypto==2.6.1 python-dateutil==2.7.3 python-mimeparse==1.6.0 requests==2.18.4 six==1.11.0 urllib3==1.22 waitress==1.1.0 wechatpy==1.7.0 xmltodict==0.11.0 ================================================ FILE: wechat/utils.py ================================================ # -*-coding:utf-8 -*- import requests import json import threading import time import os token = '' app_id = '开发者ID(AppID)' secret = '开发者密码(AppSecret)' def img_download(url, name): r = requests.get(url) with open('images/{}-{}.jpg'.format(name, time.strftime("%Y_%m_%d%H_%M_%S", time.localtime())), 'wb') as fd: fd.write(r.content) if os.path.getsize(fd.name) >= 1048576: return 'large' # print('namename', os.path.basename(fd.name)) return os.path.basename(fd.name) def get_access_token(appid, secret): '''获取access_token,100分钟刷新一次''' url = 'https://api.weixin.qq.com/cgi-bin/token?grant_type=client_credential&appid={}&secret={}'.format(appid, secret) r = requests.get(url) parse_json = json.loads(r.text) global token token = parse_json['access_token'] global timer timer = threading.Timer(6000, get_access_token) timer.start() def img_upload(mediaType, name): global token url = "https://api.weixin.qq.com/cgi-bin/media/upload?access_token=%s&type=%s" % (token, mediaType) files = {'media': open('{}'.format(name), 'rb')} r = requests.post(url, files=files) parse_json = json.loads(r.text) return parse_json['media_id'] get_access_token(app_id, secret) ================================================ FILE: 爬虫集合/README.MD ================================================ # 代码详细说明请看文章 [Python 爬虫入门(一)——爬取糗事百科](https://mp.weixin.qq.com/s/ApnEy6NWS2f-DqIIrhHzGw) [Python 爬虫入门(二)——爬取妹子图](https://mp.weixin.qq.com/s/4TZHgoE_yqeDha17f3Tbew) [Python 爬虫——Python 岗位分析报告](https://mp.weixin.qq.com/s/8wAHBPnQMbcrP9La7WZiJA) [Python 爬虫利器——Selenium介绍](https://mp.weixin.qq.com/s/YJGjZkUejEos_yJ1ukp5kw) [Python 爬虫——抖音App视频抓包](https://mp.weixin.qq.com/s/a8Tky_u1u0A4vbssnAK2_g) ================================================ FILE: 爬虫集合/lagou.py ================================================ import random import time import requests from openpyxl import Workbook import pymysql.cursors def get_conn(): '''建立数据库连接''' conn = pymysql.connect(host='localhost', user='root', password='root', db='python', charset='utf8mb4', cursorclass=pymysql.cursors.DictCursor) return conn def insert(conn, info): '''数据写入数据库''' with conn.cursor() as cursor: sql = "INSERT INTO `python` (`shortname`, `fullname`, `industryfield`, `companySize`, `salary`, `city`, `education`) VALUES (%s, %s, %s, %s, %s, %s, %s)" cursor.execute(sql, info) conn.commit() def get_json(url, page, lang_name): '''返回当前页面的信息列表''' headers = { 'Host': 'www.lagou.com', 'Connection': 'keep-alive', 'Content-Length': '23', 'Origin': 'https://www.lagou.com', 'X-Anit-Forge-Code': '0', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0', 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', 'Accept': 'application/json, text/javascript, */*; q=0.01', 'X-Requested-With': 'XMLHttpRequest', 'X-Anit-Forge-Token': 'None', 'Referer': 'https://www.lagou.com/jobs/list_python?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWords=&suginput=', 'Accept-Encoding': 'gzip, deflate, br', 'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7' } data = {'first': 'false', 'pn': page, 'kd': lang_name} json = requests.post(url, data, headers=headers).json() list_con = json['content']['positionResult']['result'] info_list = [] for i in list_con: info = [] info.append(i.get('companyShortName', '无')) info.append(i.get('companyFullName', '无')) info.append(i.get('industryField', '无')) info.append(i.get('companySize', '无')) info.append(i.get('salary', '无')) info.append(i.get('city', '无')) info.append(i.get('education', '无')) info_list.append(info) return info_list def main(): lang_name = 'python' wb = Workbook() # 打开 excel 工作簿 conn = get_conn() # 建立数据库连接 不存数据库 注释此行 for i in ['北京', '上海', '广州', '深圳', '杭州']: # 五个城市 page = 1 ws1 = wb.active ws1.title = lang_name url = 'https://www.lagou.com/jobs/positionAjax.json?city={}&needAddtionalResult=false'.format(i) while page < 31: # 每个城市30页信息 info = get_json(url, page, lang_name) page += 1 print(i, 'page', page) time.sleep(random.randint(10, 20)) for row in info: insert(conn, tuple(row)) # 插入数据库,若不想存入 注释此行 ws1.append(row) conn.close() # 关闭数据库连接,不存数据库 注释此行 wb.save('{}职位信息.xlsx'.format(lang_name)) if __name__ == '__main__': main() ================================================ FILE: 爬虫集合/meizitu.py ================================================ import requests import os import time import threading from bs4 import BeautifulSoup def download_page(url): ''' 用于下载页面 ''' headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0"} r = requests.get(url, headers=headers) r.encoding = 'gb2312' return r.text def get_pic_list(html): ''' 获取每个页面的套图列表,之后循环调用get_pic函数获取图片 ''' soup = BeautifulSoup(html, 'html.parser') pic_list = soup.find_all('li', class_='wp-item') for i in pic_list: a_tag = i.find('h3', class_='tit').find('a') link = a_tag.get('href') text = a_tag.get_text() get_pic(link, text) def get_pic(link, text): ''' 获取当前页面的图片,并保存 ''' html = download_page(link) # 下载界面 soup = BeautifulSoup(html, 'html.parser') pic_list = soup.find('div', id="picture").find_all('img') # 找到界面所有图片 headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0"} create_dir('pic/{}'.format(text)) for i in pic_list: pic_link = i.get('src') # 拿到图片的具体 url r = requests.get(pic_link, headers=headers) # 下载图片,之后保存到文件 with open('pic/{}/{}'.format(text, pic_link.split('/')[-1]), 'wb') as f: f.write(r.content) time.sleep(1) # 休息一下,不要给网站太大压力,避免被封 def create_dir(name): if not os.path.exists(name): os.makedirs(name) def execute(url): page_html = download_page(url) get_pic_list(page_html) def main(): create_dir('pic') queue = [i for i in range(1, 72)] # 构造 url 链接 页码。 threads = [] while len(queue) > 0: for thread in threads: if not thread.is_alive(): threads.remove(thread) while len(threads) < 5 and len(queue) > 0: # 最大线程数设置为 5 cur_page = queue.pop(0) url = 'http://meizitu.com/a/more_{}.html'.format(cur_page) thread = threading.Thread(target=execute, args=(url,)) thread.setDaemon(True) thread.start() print('{}正在下载{}页'.format(threading.current_thread().name, cur_page)) threads.append(thread) if __name__ == '__main__': main() ================================================ FILE: 爬虫集合/qiubai_crawer.py ================================================ import requests from bs4 import BeautifulSoup def download_page(url): headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0"} r = requests.get(url, headers=headers) return r.text def get_content(html, page): output = """第{}页 作者:{} 性别:{} 年龄:{} 点赞:{} 评论:{}\n{}\n------------\n""" soup = BeautifulSoup(html, 'html.parser') con = soup.find(id='content-left') con_list = con.find_all('div', class_="article") for i in con_list: author = i.find('h2').string # 获取作者名字 content = i.find('div', class_='content').find('span').get_text() # 获取内容 stats = i.find('div', class_='stats') vote = stats.find('span', class_='stats-vote').find('i', class_='number').string comment = stats.find('span', class_='stats-comments').find('i', class_='number').string author_info = i.find('div', class_='articleGender') # 获取作者 年龄,性别 if author_info is not None: # 非匿名用户 class_list = author_info['class'] if "womenIcon" in class_list: gender = '女' elif "manIcon" in class_list: gender = '男' else: gender = '' age = author_info.string # 获取年龄 else: # 匿名用户 gender = '' age = '' save_txt(output.format(page, author, gender, age, vote, comment, content)) def save_txt(*args): for i in args: with open('qiubai.txt', 'a', encoding='utf-8') as f: f.write(i) def main(): # 我们点击下面链接,在页面下方可以看到共有13页,可以构造如下 url, # 当然我们最好是用 Beautiful Soup找到页面底部有多少页。 for i in range(1, 14): url = 'https://qiushibaike.com/text/page/{}'.format(i) html = download_page(url) get_content(html, i) if __name__ == '__main__': main()