Repository: lixi5338619/magical_spider Branch: main Commit: 7a129f7207c3 Files: 29 Total size: 94.3 KB Directory structure: gitextract_qe1fffch/ ├── README.md ├── browserapi.py ├── config/ │ └── system_info.py ├── db.py ├── demo/ │ ├── runflow.py │ ├── 单任务GET-demo.py │ ├── 单任务POST-demo.py │ ├── 多任务demo.py │ ├── 抖音-步骤拆解demo.py │ └── 药监局.py ├── engine.py ├── middlerware.py ├── models.py ├── server.py ├── settings.py ├── static/ │ ├── css/ │ │ └── index.css │ └── docs/ │ ├── program.txt │ └── 部署.txt ├── templates/ │ └── index.html └── undetected_chromedriver/ ├── __init__.py ├── _compat.py ├── cdp.py ├── devtool.py ├── dprocess.py ├── options.py ├── patcher.py ├── reactor.py ├── v2.py └── webelement.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: README.md ================================================ # magical_spider 神奇的蜘蛛🕷,一个几乎适用于所有web端站点的采集方案。 ### 诞生背景 纯属瞎扯:2022年全球变暖,各行业内卷严重,爬虫届更是入门抖音起步瑞数,为了减缓人才流失,推出magical_spider。 真实原因:一时兴起,吾辈当自强,重铸selenium荣光! 博客地址: [lxspider](http://www.lxspider.com) 爬虫逆向工具站:[lxtools](http://www.cnlans.com/lx/tools) ### 项目简介 - 非常规derver.pageSource。 - 通过Flask远程调用chromederver实现xmlHttpRequest。 - 通过sqlit记录任务状态。 - 通过undetected_selenium+stealth.min.js绕过一些校验。 - 目前适用于瑞数、加速乐等cookie加密,以及头条系的请求过程加密。 ### 项目声明 - 项目仅供学习参考。 - 如有风控校验需自行解决,滑块可参考middlerware.py。 - 方案适用于应急场景或数据量要求不高时,若时间充裕建议通过逆向处理。推荐阅读:[《爬虫逆向进阶实战》](https://github.com/lixi5338619/lxBook) ### 部署 [linux部署文档](static/docs/部署.txt) --- ## 使用说明 1、配置settings.py,启动 flask 服务 2、运行方法参考demo文件内容,主要借助runflow.py。 3、测试代码 GET请求 ```python from demo.runflow import magical_start,magical_request,magical_close project_name = 'cnipa' base_url = 'https://www.cnipa.gov.cn' session_id,process_url = magical_start(project_name,base_url) print(len(magical_request(session_id, process_url,'https://www.cnipa.gov.cn/col/col57/index.html'))) magical_close(session_id,process_url,project_name) ``` POST请求 ```python from demo.runflow import magical_start,magical_request,magical_close import json project_name = 'chinadrugtrials' base_url = 'http://www.chinadrugtrials.org.cn' session_id,process_url = magical_start(project_name,base_url) data = {"id": "","ckm_index": "","sort": "desc","sort2": "","rule": "CTR","secondLevel": "0","currentpage": "2","keywords": "","reg_no": "","indication": "","case_no": "","drugs_name": "","drugs_type": "","appliers": "","communities": "","researchers": "","agencies": "","state": ""} formdata = json.dumps(data) print(magical_request(session_id=session_id, process_url=process_url, request_url='http://www.chinadrugtrials.org.cn/clinicaltrials.searchlist.dhtml', request_type='post',formdata=formdata )) magical_close(session_id,process_url,project_name) ``` 4、index页可以查看和管理当前运行中的任务,也能查看系统内存和磁盘使用情况。 5、demo文件夹中有任务流程汇总runflow.py,以及抖音、药监局案例,单任务和多任务示例。 ![Alt](./static/image/index.png) ================================================ FILE: browserapi.py ================================================ # -*- coding: utf-8 -*- import undetected_chromedriver as webdriver from undetected_chromedriver.options import ChromeOptions from settings import * from selenium.webdriver import ActionChains from middlerware import Slide import time import platform class Browser(): """Browser Env : undetected_chromedriver + stealth.js headless_enable: 无头模式 images_enable: 图像开关 incognito_enable: 无痕模式 logging_enable: 开启日志 stealth_enable: stealth 伪装模式 proxy: 启用代理, 格式:http://127.0.0.1:8888 """ def __init__(self): options = ChromeOptions() options.add_argument("--lang=en-us") if headless_enable: options.add_argument("--headless") if plugin_enable: options.add_argument('--disable-images') options.add_argument('--disable-plugins') options.add_argument('disable-audio') options.add_argument('disable-translate') if proxy: options.add_argument('--proxy-server=' + proxy) if logging_enable: options.add_argument('log-level=3') if incognito_enable: options.add_argument("--incognito") if detach_enable: options.add_experimental_option("detach",True) if platform.system().lower()=='linux': options.add_argument("--headless") options.add_argument('--no-sandbox') options.add_argument('--disable-gpu') options.add_argument('--disable-dev-shm-usage') self.browser = webdriver.Chrome(driver_executable_path=driverpath, options=options) if stealth_enable: self.stealth_enable() def stealth_enable(self): with open(stealth_path,'r',encoding='utf-8') as file: stealth_min_js = file.read() self.browser.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", { "source": stealth_min_js }) def start_request(self,url): self.browser.get(url) return self.browser def close(self): self.browser.close() self.browser.quit() class BrowserApi(): def __init__(self,browser): self.browser = browser def browser_ps(self,url): self.browser.get(url) return self.browser.page_source def browser_get(self, url): doc = self.browser.execute_script(''' function queryData(url) { var p = new Promise(function(resolve,reject) { var e={ "url":"%s", "method":"GET" }; var h = new XMLHttpRequest; h.open(e.method, e.url, true); h.setRequestHeader("salute-by","lx"); h.onreadystatechange =function() { if(h.readyState === 4 && h.status === 200) { resolve(h.responseText); } else {} }; h.send(null); }); return p; } var p1 = queryData('lx'); res = Promise.all([p1]).then(function(result){ return result }) return res ''' % (url)) return doc[0] def browser_post(self, url, formdata=""): doc = self.browser.execute_script(''' function queryData(url) { var p = new Promise(function(resolve,reject) { var e={"url":"%s", "method":"POST", "data" : '%s' }; var h = new XMLHttpRequest; h.open(e.method, e.url, true); h.setRequestHeader("accept","application/json, text/plain, */*"); h.setRequestHeader("content-type","application/json;charset=UTF-8"); h.setRequestHeader("salute-by","lx"); h.onreadystatechange =function() { if(h.readyState != 4) return; if(h.readyState === 4 && h.status ===200) { resolve(h.responseText); } else { } }; h.send(e.data); }); return p; } var p1 = queryData('lx'); res = Promise.all([p1]).then(function(result){ return result }) return res; ''' % (url, formdata)) return doc[0] def check_slide(self,bg_xpath,gap_xpath,slider_xpath,domain=None): """params: bg_xpath : 带缺口的背景图片的 xpath gap_xpath: 缺口滑块图片的 xpath slider_xpath: 待拖动滑块的 xpath domain: 图片doamin,非 http开头需补全链接 """ while 1: try: bg = self.browser.find_element_by_id(bg_xpath).get_attribute('src') gap = self.browser.find_element_by_xpath(gap_xpath).get_attribute('src') if not bg.startswith('http'):bg = domain+bg if not gap.startswith('http'):gap = domain+gap slide_app = Slide(gap=gap, bg=bg) distance = slide_app.discern() except: break try: slider = self.browser.find_element_by_xpath(slider_xpath) ActionChains(self.browser).click_and_hold(slider).perform() _tracks = slide_app.get_tracks(distance) new_1 = _tracks[-1] - (sum(_tracks) - distance) _tracks.pop() _tracks.append(new_1) for mouse_x in _tracks: ActionChains(self.browser).move_by_offset(mouse_x, 0).perform() ActionChains(self.browser).release().perform() time.sleep(1) except: break ================================================ FILE: config/system_info.py ================================================ # -*- coding: utf-8 -*- import psutil class SystemInfoUtil(object): @classmethod def get_format_byte(cls, value): """字节""" kb, b = divmod(value, 1024) mb, kb = divmod(kb, 1024) gb, mb = divmod(mb, 1024) if gb > 0: return f'{round(gb + mb * 0.001)}GB' elif mb > 0: return f'{round(mb + kb * 0.001)}MB' elif kb > 0: return f'{round(kb + b * 0.001)}KB' else: return f'{round(b)}B' @classmethod def get_virtual_memory(cls): """ 内存使用情况 total: 总内存 available: 可用内存 percent: 内存使用率 used: 已使用的内存 :return: """ virtual_memory = psutil.virtual_memory() return { 'total': virtual_memory.total, 'total_format': cls.get_format_byte(virtual_memory.total), 'available': virtual_memory.available, 'available_format': cls.get_format_byte(virtual_memory.available), 'percent': round(virtual_memory.percent), 'used': virtual_memory.used, 'used_format': cls.get_format_byte(virtual_memory.used), } @classmethod def get_disk_usage(cls): """磁盘使用情况""" disk_usage = psutil.disk_usage('/') return { 'total': disk_usage.total, 'total_format': cls.get_format_byte(disk_usage.total), 'used': disk_usage.used, 'used_format': cls.get_format_byte(disk_usage.used), 'free': disk_usage.free, 'free_format': cls.get_format_byte(disk_usage.free), 'percent': round(disk_usage.percent), } ================================================ FILE: db.py ================================================ import os.path import sqlite3 from models import * from settings import magicalpath def create_connection(): db = sqlite3.connect(magicalpath) return db def select_process(): db = create_connection() con = db.cursor() con.execute("select * from process") res = con.fetchall() con.close() db.close() return res def select_process_name(processName:str)->Process: db = create_connection() con = db.cursor() con.execute("select * from process where processName=?",(processName,)) res = con.fetchone() con.close() db.close() return res def select_process_id(processId:str)->Process: db = create_connection() con = db.cursor() con.execute("select * from process where processId=?",(processId,)) res = con.fetchone() con.close() db.close() return res def insert_process(process:Process)->Process: db = create_connection() cursor = db.cursor() try: cursor.execute("insert into process(processId, processName, processUrl, createTime) values ('%s','%s','%s','%s')" % (process.processId, process.processName, process.processUrl,datetime.datetime.now())) db.commit() cursor.close() db.close() return process except Exception as e: print(e) db.rollback() cursor.close() db.close() def delete_process(process_name): db = create_connection() cursor = db.cursor() try: cursor.execute("delete FROM process where processName ='%s'" % (process_name)) db.commit() cursor.close() db.close() except Exception as e: db.rollback() cursor.close() db.close() def delete_process_id(processId): db = create_connection() cursor = db.cursor() try: cursor.execute("delete FROM process where processId ='%s'" % (processId)) db.commit() cursor.close() db.close() except Exception as e: db.rollback() cursor.close() db.close() if __name__ == '__main__': if not os.path.exists(magicalpath): con = sqlite3.connect(magicalpath) cursor = con.cursor() cursor.execute("CREATE TABLE IF NOT EXISTS `process`(`processId` VARCHAR(90),`processName` VARCHAR(90) UNIQUE,`processUrl` VARCHAR(256),`createTime` DATA,`baseUrl` VARCHAR(256));") con.commit() ================================================ FILE: demo/runflow.py ================================================ import requests sess = requests.session() host = 'http://127.0.0.1:5000' def magical_start(project_name,base_url = 'http://www.lxspider.com'): # 1、create browser and select session_id result = sess.post(f'{host}/create',data={'name':project_name,'url':base_url}).json() session_id,process_url = result['session_id'],result['process_url'] return session_id,process_url def magical_request(session_id,process_url,request_url,request_type='get',formdata=''): # 2、request browser_xhr data = {'session_id': session_id, 'process_url': process_url, 'request_url': request_url, 'request_type': request_type} if request_type.lower()=='post': data.update({'request_type':'post','formdata':formdata}) result = sess.post(f'{host}/xhr',data=data).json() return result['result'] def magical_close(session_id,process_url,process_name): # 4、close browser close_data = {'session_id':session_id,'process_url':process_url,'process_name':process_name} sess.post(f'{host}/close',data=close_data).json() ================================================ FILE: demo/单任务GET-demo.py ================================================ from demo.runflow import magical_start,magical_request,magical_close project_name = 'cnipa' base_url = 'https://www.cnipa.gov.cn' session_id,process_url = magical_start(project_name,base_url) for i in range(200): print(len(magical_request(session_id, process_url,'https://www.cnipa.gov.cn/col/col2486/index.html'))) magical_close(session_id,process_url,project_name) ================================================ FILE: demo/单任务POST-demo.py ================================================ from demo.runflow import magical_start,magical_request,magical_close import json # POST案例昨天忘记加了,感谢 [尘川] 的提醒 by:2022/08/10 project_name = 'chinadrugtrials' base_url = 'http://www.chinadrugtrials.org.cn' session_id,process_url = magical_start(project_name,base_url) data = {"id": "","ckm_index": "","sort": "desc","sort2": "","rule": "CTR","secondLevel": "0","currentpage": "2","keywords": "","reg_no": "","indication": "","case_no": "","drugs_name": "","drugs_type": "","appliers": "","communities": "","researchers": "","agencies": "","state": ""} formdata = json.dumps(data) for i in range(100): print(len(magical_request(session_id=session_id, process_url=process_url, request_url='http://www.chinadrugtrials.org.cn/clinicaltrials.searchlist.dhtml', request_type='post',formdata=formdata ))) magical_close(session_id,process_url,project_name) ================================================ FILE: demo/多任务demo.py ================================================ from demo.runflow import magical_start,magical_request,magical_close import time # 各任务间互不影响,可选择使用多线程或多进程,大家自由发挥 def r1(): project_name1 = '药监局新闻任务1' s1,p1 = magical_start(project_name1,'https://www.nmpa.gov.cn') request_list = [ 'https://www.nmpa.gov.cn/xxgk/ggtg/index.html', 'https://www.nmpa.gov.cn/xxgk/fgwj/index.html', 'https://www.nmpa.gov.cn/xxgk/fgwj/index.html' ] for request_url in request_list: print("r1:", len(magical_request(s1, p1, request_url))) time.sleep(5) magical_close(s1,p1,project_name1) def r2(): project_name2 = '药监局新闻任务2' s2,p2 = magical_start(project_name2,'https://www.nmpa.gov.cn') request_list = ['https://www.nmpa.gov.cn/zwgk/rshxx/index.html', 'https://www.nmpa.gov.cn/zwgk/xwfb/index.html', 'https://www.nmpa.gov.cn/zwgk/xwfb/index.html' ] for request_url in request_list: print("r2:", len(magical_request(s2, p2, request_url))) time.sleep(5) magical_close(s2,p2,project_name2) import threading thread1 = threading.Thread(target=r1) thread2 = threading.Thread(target=r2) thread1.start() thread2.start() ================================================ FILE: demo/抖音-步骤拆解demo.py ================================================ import requests # 步骤拆解,简化版查看 药监局.py project_name = '抖音任务2' # project_name不可重复,勿创建重复任务名 base_url = 'https://www.douyin.com' # 1、browser init and select browser_session result = requests.post('http://127.0.0.1:5000/create',data={'name':project_name,'url':base_url}).json() session_id = result['session_id'] process_url = result['process_url'] # 2、request browser_xhr # URL需要更换为你浏览器中的 request_list = [ 'https://www.douyin.com/aweme/v1/web/search/item/?device_platform=webapp&aid=6383&channel=channel_pc_web&search_channel=aweme_video_web&sort_type=0&publish_time=0&keyword=%E9%9E%A0%E5%A9%A7%E7%A5%8E&search_source=normal_search&query_correct_type=1&is_filter_search=0&from_group_id=&offset=0&count=10&version_code=170400&version_name=17.4.0&cookie_enabled=true&screen_width=1920&screen_height=1080&browser_language=zh-CN&browser_platform=Win32&browser_name=Chrome&browser_version=104.0.0.0&browser_online=true&engine_name=Blink&engine_version=104.0.0.0&os_name=Windows&os_version=10&cpu_core_num=20&device_memory=8&platform=PC&downlink=10&effective_type=4g&round_trip_time=50&webid=7122050458177701414', 'https://www.douyin.com/aweme/v1/web/search/item/?device_platform=webapp&aid=6383&channel=channel_pc_web&search_channel=aweme_video_web&sort_type=0&publish_time=0&keyword=lx&search_source=normal_search&query_correct_type=1&is_filter_search=0&from_group_id=&offset=0&count=10&version_code=170400&version_name=17.4.0&cookie_enabled=true&screen_width=1920&screen_height=1080&browser_language=zh-CN&browser_platform=Win32&browser_name=Chrome&browser_version=104.0.0.0&browser_online=true&engine_name=Blink&engine_version=104.0.0.0&os_name=Windows&os_version=10&cpu_core_num=20&device_memory=8&platform=PC&downlink=10&effective_type=4g&round_trip_time=50&webid=7122050458177701414', 'https://www.douyin.com/aweme/v1/web/search/item/?device_platform=webapp&aid=6383&channel=channel_pc_web&search_channel=aweme_video_web&sort_type=0&publish_time=0&keyword=pythonlx&search_source=normal_search&query_correct_type=1&is_filter_search=0&from_group_id=&offset=0&count=10&version_code=170400&version_name=17.4.0&cookie_enabled=true&screen_width=1920&screen_height=1080&browser_language=zh-CN&browser_platform=Win32&browser_name=Chrome&browser_version=104.0.0.0&browser_online=true&engine_name=Blink&engine_version=104.0.0.0&os_name=Windows&os_version=10&cpu_core_num=20&device_memory=8&platform=PC&downlink=10&effective_type=4g&round_trip_time=50&webid=7122050458177701414', 'https://www.douyin.com/aweme/v1/web/search/item/?device_platform=webapp&aid=6383&channel=channel_pc_web&search_channel=aweme_video_web&sort_type=0&publish_time=0&keyword=lx666&search_source=normal_search&query_correct_type=1&is_filter_search=0&from_group_id=&offset=0&count=10&version_code=170400&version_name=17.4.0&cookie_enabled=true&screen_width=1920&screen_height=1080&browser_language=zh-CN&browser_platform=Win32&browser_name=Chrome&browser_version=104.0.0.0&browser_online=true&engine_name=Blink&engine_version=104.0.0.0&os_name=Windows&os_version=10&cpu_core_num=20&device_memory=8&platform=PC&downlink=10&effective_type=4g&round_trip_time=50&webid=7122050458177701414', ] for request_url in request_list: data = {'session_id':session_id,'process_url':process_url, 'request_url':request_url,'request_type':'get'} result = requests.post('http://127.0.0.1:5000/xhr',data=data).json() print(len(result['result'])) # 3、close browser close_data = {'session_id':session_id,'process_url':process_url,'process_name':project_name} requests.post('http://127.0.0.1:5000/close',data=close_data).json() ================================================ FILE: demo/药监局.py ================================================ from demo.runflow import magical_start,magical_request,magical_close project_name = '药监局1' base_url = 'https://www.nmpa.gov.cn' request_list = [ 'https://www.nmpa.gov.cn/yaopin/ypjgdt/index.html', 'https://www.nmpa.gov.cn/yaopin/ypjgdt/20220705190551125.html' ] session_id,process_url = magical_start(project_name,base_url) for request_url in request_list: print(magical_request(session_id, process_url, request_url)) magical_close(session_id,process_url,project_name) ================================================ FILE: engine.py ================================================ # -*- coding: utf-8 -*- from browserapi import Browser,BrowserApi from db import * from models import Process from selenium import webdriver from selenium.webdriver.remote.webdriver import WebDriver def create_browser(url,name): bro = Browser() browser = bro.start_request(url) session_id = browser.session_id process_url = browser.command_executor._url insert_process(Process(session_id,name,process_url,url)) return browser def attachToSession(session_id,url): original_execute = WebDriver.execute def new_command_execute(self, command, params=None): if command == "newSession": return {'success': 0, 'value': None, 'sessionId': session_id} else: return original_execute(self, command, params) WebDriver.execute = new_command_execute driver = webdriver.Remote(command_executor=url, desired_capabilities={}) driver.session_id = session_id WebDriver.execute = original_execute return driver def carry_browser(session_id,process_url,request_url,request_type,formdata): try: browser = attachToSession(session_id,process_url) except: # 防止窗口崩溃 -> 增加的重建操作 print("防止窗口崩溃 -> 增加的重建操作") browser_info = select_process_id(session_id) base_url = browser_info[4] process_name = browser_info[1] delete_process(process_name) browser = create_browser(base_url,process_name) print("browser 重建成功") broapi = BrowserApi(browser) if request_type=='get': result = broapi.browser_get(request_url) else: result = broapi.browser_post(request_url,formdata) return result def close_browser(session_id,process_url,process_name): delete_process(process_name) browser = attachToSession(session_id,process_url) browser.close() browser.quit() def select_all_process(): return select_process() ================================================ FILE: middlerware.py ================================================ # -*- coding: utf-8 -*- import os,requests from urllib.parse import urlparse try: import cv2, numpy as np except: ... class Slide(object): def __init__(self, gap, bg, gap_size=None, bg_size=None, out=None): """ :param bg: 带缺口的图片链接或者url :param gap: 缺口图片链接或者url """ self.img_dir = os.path.join(os.getcwd(), 'img') if not os.path.exists(self.img_dir): os.makedirs(self.img_dir) bg_resize = bg_size if bg_size else (340, 212) gap_size = gap_size if gap_size else (68, 68) self.bg = self.check_is_img_path(bg, 'bg', resize=bg_resize) self.gap = self.check_is_img_path(gap, 'gap', resize=gap_size) self.out = out if out else os.path.join(self.img_dir, 'out.jpg') @staticmethod def check_is_img_path(img, img_type, resize): if img.startswith('http'): headers = { "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;" "q=0.8,application/signed-exchange;v=b3;q=0.9", "Accept-Encoding": "gzip, deflate, br", "Accept-Language": "zh-CN,zh;q=0.9,en-GB;q=0.8,en;q=0.7,ja;q=0.6", "Cache-Control": "max-age=0", "Connection": "keep-alive", "Host": urlparse(img).hostname, "Upgrade-Insecure-Requests": "1", "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) " "Chrome/91.0.4472.164 Safari/537.36", } img_res = requests.get(img, headers=headers) if img_res.status_code == 200: img_path = f'./img/{img_type}.jpg' image = np.asarray(bytearray(img_res.content), dtype="uint8") image = cv2.imdecode(image, cv2.IMREAD_COLOR) if resize: image = cv2.resize(image, dsize=resize) cv2.imwrite(img_path, image) return img_path else: raise Exception(f"保存{img_type}图片失败") else: return img @staticmethod def clear_white(img): """清除图片的空白区域,这里主要清除滑块的空白""" img = cv2.imread(img) rows, cols, channel = img.shape min_x = 255 min_y = 255 max_x = 0 max_y = 0 for x in range(1, rows): for y in range(1, cols): t = set(img[x, y]) if len(t) >= 2: if x <= min_x: min_x = x elif x >= max_x: max_x = x if y <= min_y: min_y = y elif y >= max_y: max_y = y img1 = img[min_x:max_x, min_y: max_y] return img1 def template_match(self, tpl, target): th, tw = tpl.shape[:2] result = cv2.matchTemplate(target, tpl, cv2.TM_CCOEFF_NORMED) # 寻找矩阵(一维数组当作向量,用Mat定义) 中最小值和最大值的位置 min_val, max_val, min_loc, max_loc = cv2.minMaxLoc(result) tl = max_loc br = (tl[0] + tw, tl[1] + th) # 绘制矩形边框,将匹配区域标注出来 # target:目标图像 # tl:矩形定点 # br:矩形的宽高 # (0,0,255):矩形边框颜色 # 1:矩形边框大小 cv2.rectangle(target, tl, br, (0, 0, 255), 2) cv2.imwrite(self.out, target) return tl[0] @staticmethod def image_edge_detection(img): edges = cv2.Canny(img, 100, 200) return edges def discern(self): img1 = self.clear_white(self.gap) img1 = cv2.cvtColor(img1, cv2.COLOR_RGB2GRAY) slide = self.image_edge_detection(img1) back = cv2.imread(self.bg, 0) back = self.image_edge_detection(back) slide_pic = cv2.cvtColor(slide, cv2.COLOR_GRAY2RGB) back_pic = cv2.cvtColor(back, cv2.COLOR_GRAY2RGB) x = self.template_match(slide_pic, back_pic) # 输出横坐标, 即 滑块在图片上的位置 return x @staticmethod def get_tracks(distance, rate=0.6, t=0.2, v=0): """ 将distance分割成小段的距离 :param distance: 总距离 :param rate: 加速减速的临界比例 :param a1: 加速度 :param a2: 减速度 :param t: 单位时间 :param t: 初始速度 :return: 小段的距离集合 """ tracks = [] # 加速减速的临界值 mid = rate * distance # 当前位移 s = 0 # 循环 while s < distance: # 初始速度 v0 = v if s < mid: a = 20 else: a = -3 # 计算当前t时间段走的距离 s0 = v0 * t + 0.5 * a * t * t # 计算当前速度 v = v0 + a * t # 四舍五入距离,因为像素没有小数 tracks.append(round(s0)) # 计算当前距离 s += s0 return tracks ================================================ FILE: models.py ================================================ import datetime class Process: def __init__(self, processId, processName,processUrl,baseUrl,createTime = datetime.datetime.now()) -> None: super().__init__() self.processId = processId self.processName = processName self.processUrl = processUrl self.createTime = createTime self.baseUrl = baseUrl ================================================ FILE: server.py ================================================ # -*- coding: utf-8 -*- from datetime import timedelta from flask import Flask,session from flask import render_template,request,redirect,url_for,jsonify import os from engine import * from config.system_info import SystemInfoUtil from settings import host,port app = Flask(__name__) app.config['SECRET_KEY'] = os.urandom(24) app.config['SEND_FILE_MAX_AGE_DEFAULT'] = timedelta(days=7) @app.route('/') def index_info(): process = select_all_process() if not process:process=[["","没有在运行的任务","",""]] disk_usage = SystemInfoUtil.get_disk_usage() virtual_memory = SystemInfoUtil.get_virtual_memory() return render_template('index.html',process=process,disk_usage=disk_usage,virtual_memory=virtual_memory) @app.route('/create',methods=['POST']) def browser_start(): url = request.form.get("url") name = request.form.get("name") try: create_browser(url,name) session_id, process_name, process_url, datetime,base_url = select_process_name(name) result = {'session_id': session_id, 'process_name': process_name, 'process_url': process_url, 'datetime': datetime} return jsonify(result) except: return jsonify({"result":0,"detail":"驱动配置错误或任务名已存在"}) @app.route('/xhr',methods=['POST']) def browser_xhr(): session_id = request.form.get("session_id") process_url = request.form.get("process_url") request_url = request.form.get("request_url") request_type = request.form.get("request_type") formdata = request.form.get("formdata") result = carry_browser(session_id,process_url,request_url,request_type,formdata) return jsonify({"result":result}) @app.route('/close',methods=['POST']) def browser_close(): session_id = request.form.get("session_id") process_url = request.form.get("process_url") process_name = request.form.get("process_name") try: close_browser(session_id,process_url,process_name) return jsonify({"result":1}) except: return jsonify({"result":0,"detail":"驱动窗口已自动关闭"}) @app.route('/delete/',methods=['GET']) def delete_process_name(process_name): try: process = select_process_name(process_name) close_browser(session_id=process[0],process_url=process[2],process_name=process_name) except: delete_process(process_name) print("delete except: line 70") return redirect('/') if __name__ == '__main__': app.run(host=host,port=port,use_reloader=False,debug=True) ================================================ FILE: settings.py ================================================ # MagicalSpider Settings # 隐藏界面 headless_enable = True # 高匿模式、可能影响创建时间 stealth_enable = True # 代理设置 proxy = None # 无痕访问 incognito_enable = False # 分离模式 detach_enable = False plugin_enable = False logging_enable = False driverpath = './config/chromedriver.exe' magicalpath = './config/magical.db' stealth_path = './config/stealth.min.js' host = '0.0.0.0' port = 5000 # 让 Selenium 在 Linux 中以有头模式运行 # xvfb-run python3 test.py -s -screen 0 1920x1080x16 ================================================ FILE: static/css/index.css ================================================ body{ background: url(/static/image/bg.png); background-size: 100% 100%; background-repeat:no-repeat; } a{ text-decoration:none; } p{ font-size: 18px; color: white; } img{ width: 207.99px; height: 207.99px; } table{ border-collapse: collapse; margin-left: 6%; text-align: center; } table td, table th { border: 1px solid #cad9ea; color: #666; height: 30px; } table thead th { background-color: #CCE8EB; width: 260px; } table tr:nth-child(odd) { background: #fff; } table tr:nth-child(even) { background: #F5FAFA; } .lx{ display: inline-block; margin-left: 5%; } .bt{ font-weight:bold; color:#5b91d6; } .blog{ border:solid; border-color:#5784d0; width: 10%; margin-left: 65.6%; position: fixed; top: 0; } ================================================ FILE: static/docs/program.txt ================================================ magical_spider,一个几乎适用于所有web端站点的采集方案。 ## 项目简介 1、主要使用谷歌驱动,但非常规derver.page_source。 2、通过 flask 远程调用 chromederver 实现 xmlHttpRequest 传输数据。 3、通过sqlit 记录和管理任务状态。 4、通过undetected_selenium+stealth.min.js绕过一些校验。 5、测试通过瑞数、加速乐等cookie加密,以及头条系的请求过程加密。 6、支持 linux 部署,支持多任务。 ## 项目原理 打造一个近乎真实的浏览器环境,去完成网站内部环境的请求加载,直接返回响应内容供本地调用。 ## 项目声明 1、整合了一些其他开源项目,仅供学习参考。 2、适用于应急场景或小量任务,方便便捷,若时间充裕建议通过逆向处理。 3、如有风控校验需自行解决,滑块可参考middlerware.py。 ## 备注 1、index页可以查看和管理当前运行中的任务,也能查看系统内存和磁盘使用情况。 2、demo文件夹中有任务流程汇总runflow.py,以及抖音、药监局案例,单任务和多任务示例。 3、运行前配置服务信息和驱动路径,启动flask服务后再执行任务。 ================================================ FILE: static/docs/部署.txt ================================================ Linux部署 1.安装chrome (自行选择安装位置) yum install https://dl.google.com/linux/direct/google-chrome-stable_current_x86_64.rpm 2.检查chrome的版本 google-chrome --version 3.安装对应版本的 chromedriver_linux64 比如我的chrome版本是104.0.5112.79 wget https://npm.taobao.org/mirrors/chromedriver/104.0.5112.79/chromedriver_linux64.zip 4.解压 unzip chromedriver_linux64 5.授权 chmod 777 chromedriver 6.修改项目代码settings.py中的chromedriver路径 7.安装python依赖后启动flask项目 - Python依赖 :flask、sqlite3、selenium、websockets、opencv-python、numpy - flask启动方式:python3 sever.py 8.开启服务器端口访问权限 9.运行项目测试 ================================================ FILE: templates/index.html ================================================ MagicalSpider





【MagicalSpider】 神奇的蜘蛛🕷,一个几乎适用于所有web端站点的采集方案。(比如瑞数、加速乐、头条系、五秒盾等)


诞生背景: 2022年全球变暖,各行业内卷严重,爬虫届更是入门抖音+瑞数,导致发生 [从入门到放弃] 。 所以吾辈当自强,重铸selenium荣光 (本段纯属瞎扯) 。


magical_spider:项目说明部署文档


运行中的任务:

{% for p in process %} {% if p[0] %} {% else %} {% endif %} {% endfor %}
任务名 任务ID 任务地址 创建时间 任务管理
{{ p[1] }} {{ p[0] }} {{ p[2] }} {{ p[3][:19] }}删除任务

公众号《Pythonlx》

内存使用情况

总内存: {{ virtual_memory.total_format }}

可用内存: {{ virtual_memory.available_format }}

已使用内存: {{ virtual_memory.used_format }}

内存使用率: {{ virtual_memory.percent }}%



磁盘使用情况

总内存: {{ disk_usage.total_format }}

可用内存: {{ disk_usage.free_format }}

已使用内存: {{ disk_usage.used_format }}

内存使用率: {{ disk_usage.percent }}%



{#
#} {#

微信赞助

#} {#

#} {# #} {#

#} {#
#}
================================================ FILE: undetected_chromedriver/__init__.py ================================================ #!/usr/bin/env python3 #from __future__ import annotations import subprocess """ 888 888 d8b 888 888 Y8P 888 888 .d8888b 88888b. 888d888 .d88b. 88888b.d88b. .d88b. .d88888 888d888 888 888 888 .d88b. 888d888 d88P" 888 "88b 888P" d88""88b 888 "888 "88b d8P Y8b d88" 888 888P" 888 888 888 d8P Y8b 888P" 888 888 888 888 888 888 888 888 888 88888888 888 888 888 888 Y88 88P 88888888 888 Y88b. 888 888 888 Y88..88P 888 888 888 Y8b. Y88b 888 888 888 Y8bd8P Y8b. 888 "Y8888P 888 888 888 "Y88P" 888 888 888 "Y8888 "Y88888 888 888 Y88P "Y8888 888 88888888 by UltrafunkAmsterdam (https://github.com/ultrafunkamsterdam) """ """ user_data_dir、language、webdriver、Webelement、no-first-run、window-size、log-level、start-maximized、no-sandbox """ __version__ = "3.1.5r4" import json import logging import os import re import shutil import sys import tempfile import time import inspect import threading import selenium.webdriver.chrome.service import selenium.webdriver.chrome.webdriver import selenium.webdriver.common.service import selenium.webdriver.remote.webdriver from .cdp import CDP from .options import ChromeOptions from .patcher import IS_POSIX from .patcher import Patcher from .reactor import Reactor from .dprocess import start_detached __all__ = ( "Chrome", "ChromeOptions", "Patcher", "Reactor", "CDP", "find_chrome_executable", ) logger = logging.getLogger("uc") logger.setLevel(logging.getLogger().getEffectiveLevel()) class Chrome(selenium.webdriver.chrome.webdriver.WebDriver): """ Controls the ChromeDriver and allows you to drive the browser. The webdriver file will be downloaded by this module automatically, you do not need to specify this. however, you may if you wish. Attributes ---------- Methods ------- reconnect() this can be useful in case of heavy detection methods -stops the chromedriver service which runs in the background -starts the chromedriver service which runs in the background -recreate session start_session(capabilities=None, browser_profile=None) differentiates from the regular method in that it does not require a capabilities argument. The capabilities are automatically recreated from the options at creation time. -------------------------------------------------------------------------- NOTE: Chrome has everything included to work out of the box. it does not `need` customizations. any customizations MAY lead to trigger bot migitation systems. -------------------------------------------------------------------------- """ _instances = set() session_id = None debug = False def __init__( self, options=None, user_data_dir=None, driver_executable_path=None, browser_executable_path=None, port=0, enable_cdp_events=False, service_args=None, desired_capabilities=None, advanced_elements=False, service_log_path=None, keep_alive=True, log_level=0, headless=False, version_main=None, patcher_force_close=False, suppress_welcome=True, use_subprocess=False, debug=False, **kw ): """ Creates a new instance of the chrome driver. Starts the service and then creates new instance of chrome driver. Parameters ---------- options: ChromeOptions, optional, default: None - automatic useful defaults this takes an instance of ChromeOptions, mainly to customize browser behavior. anything other dan the default, for example extensions or startup options are not supported in case of failure, and can probably lowers your undetectability. user_data_dir: str , optional, default: None (creates temp profile) if user_data_dir is a path to a valid chrome profile directory, use it, and turn off automatic removal mechanism at exit. driver_executable_path: str, optional, default: None(=downloads and patches new binary) browser_executable_path: str, optional, default: None - use find_chrome_executable Path to the browser executable. If not specified, make sure the executable's folder is in $PATH port: int, optional, default: 0 port you would like the service to run, if left as 0, a free port will be found. enable_cdp_events: bool, default: False :: currently for chrome only this enables the handling of wire messages when enabled, you can subscribe to CDP events by using: driver.add_cdp_listener("Network.dataReceived", yourcallback) # yourcallback is an callable which accepts exactly 1 dict as parameter service_args: list of str, optional, default: None arguments to pass to the driver service desired_capabilities: dict, optional, default: None - auto from config Dictionary object with non-browser specific capabilities only, such as "item" or "loggingPref". advanced_elements: bool, optional, default: False makes it easier to recognize elements like you know them from html/browser inspection, especially when working in an interactive environment default webelement repr: advanced webelement repr )> note: when retrieving large amounts of elements ( example: find_elements_by_tag("*") ) and print them, it does take a little more time. service_log_path: str, optional, default: None path to log information from the driver. keep_alive: bool, optional, default: True Whether to configure ChromeRemoteConnection to use HTTP keep-alive. log_level: int, optional, default: adapts to python global log level headless: bool, optional, default: False can also be specified in the options instance. Specify whether you want to use the browser in headless mode. warning: this lowers undetectability and not fully supported. version_main: int, optional, default: None (=auto) if you, for god knows whatever reason, use an older version of Chrome. You can specify it's full rounded version number here. Example: 87 for all versions of 87 patcher_force_close: bool, optional, default: False instructs the patcher to do whatever it can to access the chromedriver binary if the file is locked, it will force shutdown all instances. setting it is not recommended, unless you know the implications and think you might need it. suppress_welcome: bool, optional , default: True a "welcome" alert might show up on *nix-like systems asking whether you want to set chrome as your default browser, and if you want to send even more data to google. now, in case you are nag-fetishist, or a diagnostics data feeder to google, you can set this to False. Note: if you don't handle the nag screen in time, the browser loses it's connection and throws an Exception. use_subprocess: bool, optional , default: False, False (the default) makes sure Chrome will get it's own process (so no subprocess of chromedriver.exe or python This fixes a LOT of issues, like multithreaded run, but mst importantly. shutting corectly after program exits or using .quit() unfortunately, there is always an edge case in which one would like to write an single script with the only contents being: --start script-- import undetected_chromedriver as uc d = uc.Chrome() d.get('https://somesite/') ---end script -- and will be greeted with an error, since the program exists before chrome has a change to launch. in that case you can set this to `True`. The browser will start via subprocess, and will keep running most of times. ! setting it to True comes with NO support when being detected. ! """ self.debug = debug patcher = Patcher( executable_path=driver_executable_path, force=patcher_force_close, version_main=version_main, ) patcher.auto() self.patcher = patcher if not options: options = ChromeOptions() try: if hasattr(options, "_session") and options._session is not None: # prevent reuse of options, # as it just appends arguments, not replace them # you'll get conflicts starting chrome raise RuntimeError("you cannot reuse the ChromeOptions object") except AttributeError: pass options._session = self debug_port = selenium.webdriver.common.service.utils.free_port() debug_host = "127.0.0.1" if not options.debugger_address: options.debugger_address = "%s:%d" % (debug_host, debug_port) if enable_cdp_events: options.set_capability( "goog:loggingPrefs", {"performance": "ALL", "browser": "ALL"} ) options.add_argument("--remote-debugging-host=%s" % debug_host) options.add_argument("--remote-debugging-port=%s" % debug_port) if user_data_dir: options.add_argument('--user-data-dir=%s' % user_data_dir) language, keep_user_data_dir = None, bool(user_data_dir) # see if a custom user profile is specified in options for arg in options.arguments: if "lang" in arg: m = re.search("(?:--)?lang(?:[ =])?(.*)", arg) try: language = m[1] except IndexError: logger.debug("will set the language to en-US,en;q=0.9") language = "en-US,en;q=0.9" if "user-data-dir" in arg: m = re.search("(?:--)?user-data-dir(?:[ =])?(.*)", arg) try: user_data_dir = m[1] logger.debug( "user-data-dir found in user argument %s => %s" % (arg, m[1]) ) keep_user_data_dir = True except IndexError: logger.debug( "no user data dir could be extracted from supplied argument %s " % arg ) if not user_data_dir: # backward compatiblity # check if an old uc.ChromeOptions is used, and extract the user data dir if hasattr(options, "user_data_dir") and getattr( options, "user_data_dir", None ): import warnings warnings.warn( "using ChromeOptions.user_data_dir might stop working in future versions." "use uc.Chrome(user_data_dir='/xyz/some/data') in case you need existing profile folder" ) options.add_argument("--user-data-dir=%s" % options.user_data_dir) keep_user_data_dir = True logger.debug( "user_data_dir property found in options object: %s" % user_data_dir ) else: user_data_dir = os.path.normpath(tempfile.mkdtemp()) keep_user_data_dir = False arg = "--user-data-dir=%s" % user_data_dir options.add_argument(arg) logger.debug( "created a temporary folder in which the user-data (profile) will be stored during this\n" "session, and added it to chrome startup arguments: %s" % arg ) if not language: try: import locale language = locale.getdefaultlocale()[0].replace("_", "-") except Exception: pass if not language: language = "en-US" options.add_argument("--lang=%s" % language) if not options.binary_location: options.binary_location = ( browser_executable_path or find_chrome_executable() ) self._delay = 3 self.user_data_dir = user_data_dir self.keep_user_data_dir = keep_user_data_dir if suppress_welcome: options.arguments.extend(["--no-default-browser-check", "--no-first-run"]) if headless or options.headless: options.headless = True options.add_argument("--window-size=1920,1080") options.add_argument("--start-maximized") options.add_argument("--no-sandbox") # fixes "could not connect to chrome" error when running # on linux using privileged user like root (which i don't recommend) options.add_argument( "--log-level=%d" % log_level or divmod(logging.getLogger().getEffectiveLevel(), 10)[0] ) if hasattr(options, 'handle_prefs'): options.handle_prefs(user_data_dir) # fix exit_type flag to prevent tab-restore nag try: with open( os.path.join(user_data_dir, "Default/Preferences"), encoding="latin1", mode="r+", ) as fs: config = json.load(fs) if config["profile"]["exit_type"] is not None: # fixing the restore-tabs-nag config["profile"]["exit_type"] = None fs.seek(0, 0) json.dump(config, fs) logger.debug("fixed exit_type flag") except Exception as e: logger.debug("did not find a bad exit_type flag ") self.options = options if not desired_capabilities: desired_capabilities = options.to_capabilities() if not use_subprocess: self.browser_pid = start_detached( options.binary_location, *options.arguments ) else: browser = subprocess.Popen( [options.binary_location, *options.arguments], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=IS_POSIX, ) self.browser_pid = browser.pid super(Chrome, self).__init__( executable_path=patcher.executable_path, port=port, options=options, service_args=service_args, desired_capabilities=desired_capabilities, service_log_path=service_log_path, keep_alive=keep_alive, ) self.reactor = None if enable_cdp_events: if logging.getLogger().getEffectiveLevel() == logging.DEBUG: logging.getLogger( "selenium.webdriver.remote.remote_connection" ).setLevel(20) reactor = Reactor(self) reactor.start() self.reactor = reactor if advanced_elements: from .webelement import WebElement self._web_element_cls = WebElement if options.headless: self._configure_headless() def __getattribute__(self, item): if not super().__getattribute__("debug"): return super().__getattribute__(item) else: import inspect original = super().__getattribute__(item) if inspect.ismethod(original) and not inspect.isclass(original): def newfunc(*args, **kwargs): logger.debug( "calling %s with args %s and kwargs %s\n" % (original.__qualname__, args, kwargs) ) return original(*args, **kwargs) return newfunc return original def _configure_headless(self): orig_get = self.get logger.info("setting properties for headless") def get_wrapped(*args, **kwargs): if self.execute_script("return navigator.webdriver"): logger.info("patch navigator.webdriver") self.execute_cdp_cmd( "Page.addScriptToEvaluateOnNewDocument", { "source": """ Object.defineProperty(window, 'navigator', { value: new Proxy(navigator, { has: (target, key) => (key === 'webdriver' ? false : key in target), get: (target, key) => key === 'webdriver' ? false : typeof target[key] === 'function' ? target[key].bind(target) : target[key] }) }); """ }, ) logger.info("patch user-agent string") self.execute_cdp_cmd( "Network.setUserAgentOverride", { "userAgent": self.execute_script( "return navigator.userAgent" ).replace("Headless", "") }, ) self.execute_cdp_cmd( "Page.addScriptToEvaluateOnNewDocument", { "source": """ Object.defineProperty(navigator, 'maxTouchPoints', { get: () => 1 })""" }, ) return orig_get(*args, **kwargs) self.get = get_wrapped def __dir__(self): return object.__dir__(self) def _get_cdc_props(self): return self.execute_script( """ let objectToInspect = window, result = []; while(objectToInspect !== null) { result = result.concat(Object.getOwnPropertyNames(objectToInspect)); objectToInspect = Object.getPrototypeOf(objectToInspect); } return result.filter(i => i.match(/.+_.+_(Array|Promise|Symbol)/ig)) """ ) def _hook_remove_cdc_props(self): # 它可以让当前标签页打开的所有网页,在网页内容加载之前执行一段 JavaScript 代码 self.execute_cdp_cmd( "Page.addScriptToEvaluateOnNewDocument", { "source": """ let objectToInspect = window, result = []; while(objectToInspect !== null) { result = result.concat(Object.getOwnPropertyNames(objectToInspect)); objectToInspect = Object.getPrototypeOf(objectToInspect); } result.forEach(p => p.match(/.+_.+_(Array|Promise|Symbol)/ig) &&delete window[p]&&console.log('removed',p)) """ }, ) def get(self, url): if self._get_cdc_props(): self._hook_remove_cdc_props() return super().get(url) def add_cdp_listener(self, event_name, callback): if ( self.reactor and self.reactor is not None and isinstance(self.reactor, Reactor) ): self.reactor.add_event_handler(event_name, callback) return self.reactor.handlers return False def clear_cdp_listeners(self): if self.reactor and isinstance(self.reactor, Reactor): self.reactor.handlers.clear() def tab_new(self, url: str): """ this opens a url in a new tab. apparently, that passes all tests directly! Parameters ---------- url Returns ------- """ if not hasattr(self, "cdp"): from .cdp import CDP cdp = CDP(self.options) cdp.tab_new(url) def reconnect(self, timeout=0.1): try: self.service.stop() except Exception as e: logger.debug(e) time.sleep(timeout) try: self.service.start() except Exception as e: logger.debug(e) try: self.start_session() except Exception as e: logger.debug(e) def start_session(self, capabilities=None, browser_profile=None): if not capabilities: capabilities = self.options.to_capabilities() super(selenium.webdriver.chrome.webdriver.WebDriver, self).start_session( capabilities, browser_profile ) # super(Chrome, self).start_session(capabilities, browser_profile) def quit(self): logger.debug("closing webdriver") if hasattr(self, "service") and getattr(self.service, "process", None): self.service.process.kill() try: if self.reactor and isinstance(self.reactor, Reactor): logger.debug("shutting down reactor") self.reactor.event.set() except Exception: # noqa pass try: logger.debug("killing browser") os.kill(self.browser_pid, 15) except TimeoutError as e: logger.debug(e, exc_info=True) except Exception: # noqa pass if ( hasattr(self, "keep_user_data_dir") and hasattr(self, "user_data_dir") and not self.keep_user_data_dir ): for _ in range(5): try: shutil.rmtree(self.user_data_dir, ignore_errors=False) except FileNotFoundError: pass except (RuntimeError, OSError, PermissionError) as e: logger.debug( "When removing the temp profile, a %s occured: %s\nretrying..." % (e.__class__.__name__, e) ) else: logger.debug("successfully removed %s" % self.user_data_dir) break time.sleep(0.1) # dereference patcher, so patcher can start cleaning up as well. # this must come last, otherwise it will throw 'in use' errors self.patcher = None def __del__(self): try: super().quit() # self.service.process.kill() except: # noqa pass self.quit() def __enter__(self): return self def __exit__(self, exc_type, exc_val, exc_tb): self.service.stop() time.sleep(self._delay) self.service.start() self.start_session() def __hash__(self): return hash(self.options.debugger_address) def find_chrome_executable(): """ Finds the chrome, chrome beta, chrome canary, chromium executable Returns ------- executable_path : str the full file path to found executable """ candidates = set() if IS_POSIX: for item in os.environ.get("PATH").split(os.pathsep): for subitem in ( "google-chrome", "chromium", "chromium-browser", "chrome", "google-chrome-stable", ): candidates.add(os.sep.join((item, subitem))) if "darwin" in sys.platform: candidates.update( [ "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome", "/Applications/Chromium.app/Contents/MacOS/Chromium", ] ) else: for item in map( os.environ.get, ("PROGRAMFILES", "PROGRAMFILES(X86)", "LOCALAPPDATA") ): for subitem in ( "Google/Chrome/Application", "Google/Chrome Beta/Application", "Google/Chrome Canary/Application", ): candidates.add(os.sep.join((item, subitem, "chrome.exe"))) for candidate in candidates: if os.path.exists(candidate) and os.access(candidate, os.X_OK): return os.path.normpath(candidate) ================================================ FILE: undetected_chromedriver/_compat.py ================================================ #!/usr/bin/env python3 # this module is part of undetected_chromedriver """ 888 888 d8b 888 888 Y8P 888 888 .d8888b 88888b. 888d888 .d88b. 88888b.d88b. .d88b. .d88888 888d888 888 888 888 .d88b. 888d888 d88P" 888 "88b 888P" d88""88b 888 "888 "88b d8P Y8b d88" 888 888P" 888 888 888 d8P Y8b 888P" 888 888 888 888 888 888 888 888 888 88888888 888 888 888 888 Y88 88P 88888888 888 Y88b. 888 888 888 Y88..88P 888 888 888 Y8b. Y88b 888 888 888 Y8bd8P Y8b. 888 "Y8888P 888 888 888 "Y88P" 888 888 888 "Y8888 "Y88888 888 888 Y88P "Y8888 888 88888888 by UltrafunkAmsterdam (https://github.com/ultrafunkamsterdam) """ import io import logging import os import random import re import string import sys import zipfile from distutils.version import LooseVersion from urllib.request import urlopen, urlretrieve from selenium.webdriver import Chrome as _Chrome, ChromeOptions as _ChromeOptions TARGET_VERSION = 0 logger = logging.getLogger("uc") class Chrome: def __new__(cls, *args, emulate_touch=False, **kwargs): if not ChromeDriverManager.installed: ChromeDriverManager(*args, **kwargs).install() if not ChromeDriverManager.selenium_patched: ChromeDriverManager(*args, **kwargs).patch_selenium_webdriver() if not kwargs.get("executable_path"): kwargs["executable_path"] = "./{}".format( ChromeDriverManager(*args, **kwargs).executable_path ) if not kwargs.get("options"): kwargs["options"] = ChromeOptions() instance = object.__new__(_Chrome) instance.__init__(*args, **kwargs) instance._orig_get = instance.get def _get_wrapped(*args, **kwargs): if instance.execute_script("return navigator.webdriver"): instance.execute_cdp_cmd( "Page.addScriptToEvaluateOnNewDocument", { "source": """ Object.defineProperty(window, 'navigator', { value: new Proxy(navigator, { has: (target, key) => (key === 'webdriver' ? false : key in target), get: (target, key) => key === 'webdriver' ? undefined : typeof target[key] === 'function' ? target[key].bind(target) : target[key] }) }); """ }, ) return instance._orig_get(*args, **kwargs) instance.get = _get_wrapped instance.get = _get_wrapped instance.get = _get_wrapped original_user_agent_string = instance.execute_script( "return navigator.userAgent" ) instance.execute_cdp_cmd( "Network.setUserAgentOverride", { "userAgent": original_user_agent_string.replace("Headless", ""), }, ) if emulate_touch: instance.execute_cdp_cmd( "Page.addScriptToEvaluateOnNewDocument", { "source": """ Object.defineProperty(navigator, 'maxTouchPoints', { get: () => 1 })""" }, ) logger.info(f"starting undetected_chromedriver2.Chrome({args}, {kwargs})") return instance class ChromeOptions: def __new__(cls, *args, **kwargs): if not ChromeDriverManager.installed: ChromeDriverManager(*args, **kwargs).install() if not ChromeDriverManager.selenium_patched: ChromeDriverManager(*args, **kwargs).patch_selenium_webdriver() instance = object.__new__(_ChromeOptions) instance.__init__() instance.add_argument("start-maximized") instance.add_experimental_option("excludeSwitches", ["enable-automation"]) instance.add_argument("--disable-blink-features=AutomationControlled") return instance class ChromeDriverManager(object): installed = False selenium_patched = False target_version = None DL_BASE = "https://chromedriver.storage.googleapis.com/" def __init__(self, executable_path=None, target_version=None, *args, **kwargs): _platform = sys.platform if TARGET_VERSION: # use global if set self.target_version = TARGET_VERSION if target_version: # use explicitly passed target self.target_version = target_version # user override if not self.target_version: # none of the above (default) and just get current version self.target_version = self.get_release_version_number().version[ 0 ] # only major version int self._base = base_ = "chromedriver{}" exe_name = self._base if _platform in ("win32",): exe_name = base_.format(".exe") if _platform in ("linux",): _platform += "64" exe_name = exe_name.format("") if _platform in ("darwin",): _platform = "mac64" exe_name = exe_name.format("") self.platform = _platform self.executable_path = executable_path or exe_name self._exe_name = exe_name def patch_selenium_webdriver(self_): """ Patches selenium package Chrome, ChromeOptions classes for current session :return: """ import selenium.webdriver.chrome.service import selenium.webdriver selenium.webdriver.Chrome = Chrome selenium.webdriver.ChromeOptions = ChromeOptions logger.info("Selenium patched. Safe to import Chrome / ChromeOptions") self_.__class__.selenium_patched = True def install(self, patch_selenium=True): """ Initialize the patch This will: download chromedriver if not present patch the downloaded chromedriver patch selenium package if is True (default) :param patch_selenium: patch selenium webdriver classes for Chrome and ChromeDriver (for current python session) :return: """ if not os.path.exists(self.executable_path): self.fetch_chromedriver() if not self.__class__.installed: if self.patch_binary(): self.__class__.installed = True if patch_selenium: self.patch_selenium_webdriver() def get_release_version_number(self): """ Gets the latest major version available, or the latest major version of self.target_version if set explicitly. :return: version string """ path = ( "LATEST_RELEASE" if not self.target_version else f"LATEST_RELEASE_{self.target_version}" ) return LooseVersion(urlopen(self.__class__.DL_BASE + path).read().decode()) def fetch_chromedriver(self): """ Downloads ChromeDriver from source and unpacks the executable :return: on success, name of the unpacked executable """ base_ = self._base zip_name = base_.format(".zip") ver = self.get_release_version_number().vstring if os.path.exists(self.executable_path): return self.executable_path urlretrieve( f"{self.__class__.DL_BASE}{ver}/{base_.format(f'_{self.platform}')}.zip", filename=zip_name, ) with zipfile.ZipFile(zip_name) as zf: zf.extract(self._exe_name) os.remove(zip_name) if sys.platform != "win32": os.chmod(self._exe_name, 0o755) return self._exe_name @staticmethod def random_cdc(): cdc = random.choices(string.ascii_lowercase, k=26) cdc[-6:-4] = map(str.upper, cdc[-6:-4]) cdc[2] = cdc[0] cdc[3] = "_" return "".join(cdc).encode() def patch_binary(self): """ Patches the ChromeDriver binary :return: False on failure, binary name on success """ linect = 0 replacement = self.random_cdc() with io.open(self.executable_path, "r+b") as fh: for line in iter(lambda: fh.readline(), b""): if b"cdc_" in line: fh.seek(-len(line), 1) newline = re.sub(b"cdc_.{22}", replacement, line) fh.write(newline) linect += 1 return linect def install(executable_path=None, target_version=None, *args, **kwargs): ChromeDriverManager(executable_path, target_version, *args, **kwargs).install() ================================================ FILE: undetected_chromedriver/cdp.py ================================================ #!/usr/bin/env python3 # this module is part of undetected_chromedriver import json import logging from collections.abc import Mapping, Sequence import requests import websockets log = logging.getLogger(__name__) class CDPObject(dict): def __init__(self, *a, **k): super().__init__(*a, **k) self.__dict__ = self for k in self.__dict__: if isinstance(self.__dict__[k], dict): self.__dict__[k] = CDPObject(self.__dict__[k]) elif isinstance(self.__dict__[k], list): for i in range(len(self.__dict__[k])): if isinstance(self.__dict__[k][i], dict): self.__dict__[k][i] = CDPObject(self) def __repr__(self): tpl = f"{self.__class__.__name__}(\n\t{{}}\n\t)" return tpl.format("\n ".join(f"{k} = {v}" for k, v in self.items())) class PageElement(CDPObject): pass class CDP: log = logging.getLogger("CDP") endpoints = CDPObject( { "json": "/json", "protocol": "/json/protocol", "list": "/json/list", "new": "/json/new?{url}", "activate": "/json/activate/{id}", "close": "/json/close/{id}", } ) def __init__(self, options: "ChromeOptions"): # noqa self.server_addr = "http://{0}:{1}".format(*options.debugger_address.split(":")) self._reqid = 0 self._session = requests.Session() self._last_resp = None self._last_json = None resp = self.get(self.endpoints.json) # noqa self.sessionId = resp[0]["id"] self.wsurl = resp[0]["webSocketDebuggerUrl"] def tab_activate(self, id=None): if not id: active_tab = self.tab_list()[0] id = active_tab.id # noqa self.wsurl = active_tab.webSocketDebuggerUrl # noqa return self.post(self.endpoints["activate"].format(id=id)) def tab_list(self): retval = self.get(self.endpoints["list"]) return [PageElement(o) for o in retval] def tab_new(self, url): return self.post(self.endpoints["new"].format(url=url)) def tab_close_last_opened(self): sessions = self.tab_list() opentabs = [s for s in sessions if s["type"] == "page"] return self.post(self.endpoints["close"].format(id=opentabs[-1]["id"])) async def send(self, method: str, params: dict): self._reqid += 1 async with websockets.connect(self.wsurl) as ws: await ws.send( json.dumps({"method": method, "params": params, "id": self._reqid}) ) self._last_resp = await ws.recv() self._last_json = json.loads(self._last_resp) self.log.info(self._last_json) def get(self, uri): resp = self._session.get(self.server_addr + uri) try: self._last_resp = resp self._last_json = resp.json() except Exception: return else: return self._last_json def post(self, uri, data: dict = None): if not data: data = {} resp = self._session.post(self.server_addr + uri, json=data) try: self._last_resp = resp self._last_json = resp.json() except Exception: return self._last_resp @property def last_json(self): return self._last_json ================================================ FILE: undetected_chromedriver/devtool.py ================================================ import asyncio import logging import time import traceback from collections.abc import Mapping from collections.abc import Sequence from typing import Any from typing import Awaitable from typing import Callable from typing import List from typing import Optional from contextlib import ExitStack import threading from functools import wraps, partial class Structure(dict): """ This is a dict-like object structure, which you should subclass Only properties defined in the class context are used on initialization. See example """ _store = {} def __init__(self, *a, **kw): """ Instantiate a new instance. :param a: :param kw: """ super().__init__() # auxiliar dict d = dict(*a, **kw) for k, v in d.items(): if isinstance(v, Mapping): self[k] = self.__class__(v) elif isinstance(v, Sequence) and not isinstance(v, (str, bytes)): self[k] = [self.__class__(i) for i in v] else: self[k] = v super().__setattr__("__dict__", self) def __getattr__(self, item): return getattr(super(), item) def __getitem__(self, item): return super().__getitem__(item) def __setattr__(self, key, value): self.__setitem__(key, value) def __setitem__(self, key, value): super().__setitem__(key, value) def update(self, *a, **kw): super().update(*a, **kw) def __eq__(self, other): return frozenset(other.items()) == frozenset(self.items()) def __hash__(self): return hash(frozenset(self.items())) @classmethod def __init_subclass__(cls, **kwargs): cls._store = {} def _normalize_strings(self): for k, v in self.copy().items(): if isinstance(v, (str)): self[k] = v.strip() def timeout(seconds=3, on_timeout: Optional[Callable[[callable], Any]] = None): def wrapper(func): @wraps(func) def wrapped(*args, **kwargs): def function_reached_timeout(): if on_timeout: on_timeout(func) else: raise TimeoutError("function call timed out") t = threading.Timer(interval=seconds, function=function_reached_timeout) t.start() try: return func(*args, **kwargs) except: t.cancel() raise finally: t.cancel() return wrapped return wrapper def test(): import sys, os sys.path.insert(0, os.path.abspath(os.path.dirname(__file__))) import undetected_chromedriver as uc import threading def collector( driver: uc.Chrome, stop_event: threading.Event, on_event_coro: Optional[Callable[[List[str]], Awaitable[Any]]] = None, listen_events: Sequence = ("browser", "network", "performance"), ): def threaded(driver, stop_event, on_event_coro): async def _ensure_service_started(): while ( getattr(driver, "service", False) and getattr(driver.service, "process", False) and driver.service.process.poll() ): print("waiting for driver service to come back on") await asyncio.sleep(0.05) # await asyncio.sleep(driver._delay or .25) async def get_log_lines(typ): await _ensure_service_started() return driver.get_log(typ) async def looper(): while not stop_event.is_set(): log_lines = [] try: for _ in listen_events: try: log_lines += await get_log_lines(_) except: if logging.getLogger().getEffectiveLevel() <= 10: traceback.print_exc() continue if log_lines and on_event_coro: await on_event_coro(log_lines) except Exception as e: if logging.getLogger().getEffectiveLevel() <= 10: traceback.print_exc() loop = asyncio.new_event_loop() asyncio.set_event_loop(loop) loop.run_until_complete(looper()) t = threading.Thread(target=threaded, args=(driver, stop_event, on_event_coro)) t.start() async def on_event(data): print("on_event") print("data:", data) def func_called(fn): def wrapped(*args, **kwargs): print( "func called! %s (args: %s, kwargs: %s)" % (fn.__name__, args, kwargs) ) while driver.service.process and driver.service.process.poll() is not None: time.sleep(0.1) res = fn(*args, **kwargs) print("func completed! (result: %s)" % res) return res return wrapped logging.basicConfig(level=10) options = uc.ChromeOptions() options.set_capability( "goog:loggingPrefs", {"performance": "ALL", "browser": "ALL", "network": "ALL"} ) driver = uc.Chrome(version_main=96, options=options) # driver.command_executor._request = timeout(seconds=1)(driver.command_executor._request) driver.command_executor._request = func_called(driver.command_executor._request) collector_stop = threading.Event() collector(driver, collector_stop, on_event) driver.get("https://nowsecure.nl") time.sleep(10) driver.quit() ================================================ FILE: undetected_chromedriver/dprocess.py ================================================ import multiprocessing import os import platform import sys from subprocess import PIPE from subprocess import Popen import atexit import traceback import logging import signal CREATE_NEW_PROCESS_GROUP = 0x00000200 DETACHED_PROCESS = 0x00000008 REGISTERED = [] def start_detached(executable, *args): """ Starts a fully independent subprocess (with no parent) :param executable: executable :param args: arguments to the executable, eg: ['--param1_key=param1_val', '-vvv' ...] :return: pid of the grandchild process 启动独立的子进程 """ # create pipe reader, writer = multiprocessing.Pipe(False) # do not keep reference multiprocessing.Process( target=_start_detached, args=(executable, *args), kwargs={"writer": writer}, daemon=True, ).start() # receive pid from pipe pid = reader.recv() REGISTERED.append(pid) # close pipes writer.close() reader.close() return pid def _start_detached(executable, *args, writer: multiprocessing.Pipe = None): # configure launch kwargs = {} if platform.system() == "Windows": kwargs.update(creationflags=DETACHED_PROCESS | CREATE_NEW_PROCESS_GROUP) elif sys.version_info < (3, 2): # assume posix kwargs.update(preexec_fn=os.setsid) else: # Python 3.2+ and Unix kwargs.update(start_new_session=True) # run p = Popen([executable, *args], stdin=PIPE, stdout=PIPE, stderr=PIPE, **kwargs) # send pid to pipe writer.send(p.pid) sys.exit() def _cleanup(): for pid in REGISTERED: try: logging.getLogger(__name__).debug("cleaning up pid %d " % pid) os.kill(pid, signal.SIGTERM) except: # noqa pass atexit.register(_cleanup) ================================================ FILE: undetected_chromedriver/options.py ================================================ #!/usr/bin/env python3 # this module is part of undetected_chromedriver import json import os try: from selenium.webdriver.chromium.options import ChromiumOptions as _ChromiumOptions except: from selenium.webdriver.chrome.options import Options as _ChromiumOptions class ChromeOptions(_ChromiumOptions): _session = None _user_data_dir = None @property def user_data_dir(self): return self._user_data_dir @user_data_dir.setter def user_data_dir(self, path: str): """ Sets the browser profile folder to use, or creates a new profile at given . Parameters ---------- path: str the path to a chrome profile folder if it does not exist, a new profile will be created at given location 设置要使用的浏览器配置文件文件夹,或创建新的配置文件 """ apath = os.path.abspath(path) self._user_data_dir = os.path.normpath(apath) @staticmethod def _undot_key(key, value): """turn a (dotted key, value) into a proper nested dict""" if "." in key: key, rest = key.split(".", 1) value = ChromeOptions._undot_key(rest, value) return {key: value} def handle_prefs(self, user_data_dir): prefs = self.experimental_options.get("prefs") if prefs: user_data_dir = user_data_dir or self._user_data_dir default_path = os.path.join(user_data_dir, "Default") os.makedirs(default_path, exist_ok=True) # undot prefs dict keys undot_prefs = {} for key, value in prefs.items(): undot_prefs.update(self._undot_key(key, value)) prefs_file = os.path.join(default_path, "Preferences") if os.path.exists(prefs_file): with open(prefs_file, encoding="latin1", mode="r") as f: undot_prefs.update(json.load(f)) with open(prefs_file, encoding="latin1", mode="w") as f: json.dump(undot_prefs, f) # remove the experimental_options to avoid an error del self._experimental_options["prefs"] @classmethod def from_options(cls, options): o = cls() o.__dict__.update(options.__dict__) return o ================================================ FILE: undetected_chromedriver/patcher.py ================================================ #!/usr/bin/env python3 # this module is part of undetected_chromedriver import io import logging import os import random import re import string import sys import time import zipfile from distutils.version import LooseVersion from urllib.request import urlopen, urlretrieve import secrets logger = logging.getLogger(__name__) IS_POSIX = sys.platform.startswith(("darwin", "cygwin", "linux")) class Patcher(object): """ 获取webdriver最新版本 """ url_repo = "https://chromedriver.storage.googleapis.com" zip_name = "chromedriver_%s.zip" exe_name = "chromedriver%s" # 判断当前系统 platform = sys.platform if platform.endswith("win32"): zip_name %= "win32" exe_name %= ".exe" if platform.endswith("linux"): zip_name %= "linux64" exe_name %= "" if platform.endswith("darwin"): zip_name %= "mac64" exe_name %= "" if platform.endswith("win32"): d = "~/appdata/roaming/undetected_chromedriver" elif platform.startswith("linux"): d = "~/.local/share/undetected_chromedriver" elif platform.endswith("darwin"): d = "~/Library/Application Support/undetected_chromedriver" else: d = "~/.undetected_chromedriver" data_path = os.path.abspath(os.path.expanduser(d)) def __init__(self, executable_path=None, force=False, version_main: int = 0): """ Args: executable_path: None = automatic a full file path to the chromedriver executable force: False terminate processes which are holding lock version_main: 0 = auto specify main chrome version (rounded, ex: 82) """ self.force = force self.executable_path = None prefix = secrets.token_hex(8) if not os.path.exists(self.data_path): os.makedirs(self.data_path, exist_ok=True) if not executable_path: self.executable_path = os.path.join( self.data_path, "_".join([prefix, self.exe_name]) ) if not IS_POSIX: if executable_path: if not executable_path[-4:] == ".exe": executable_path += ".exe" self.zip_path = os.path.join(self.data_path, prefix) if not executable_path: self.executable_path = os.path.abspath( os.path.join(".", self.executable_path) ) self._custom_exe_path = False if executable_path: self._custom_exe_path = True self.executable_path = executable_path self.version_main = version_main self.version_full = None def auto(self, executable_path=None, force=False, version_main=None): """""" if executable_path: self.executable_path = executable_path self._custom_exe_path = True if self._custom_exe_path: ispatched = self.is_binary_patched(self.executable_path) if not ispatched: return self.patch_exe() else: return if version_main: self.version_main = version_main if force is True: self.force = force try: os.unlink(self.executable_path) except PermissionError: if self.force: self.force_kill_instances(self.executable_path) return self.auto(force=not self.force) try: if self.is_binary_patched(): # assumes already running AND patched return True except PermissionError: pass # return False except FileNotFoundError: pass release = self.fetch_release_number() self.version_main = release.version[0] self.version_full = release self.unzip_package(self.fetch_package()) return self.patch() def patch(self): self.patch_exe() return self.is_binary_patched() def fetch_release_number(self): """ Gets the latest major version available, or the latest major version of self.target_version if set explicitly. :return: version string :rtype: LooseVersion 获取可用的最新版 """ path = "/latest_release" if self.version_main: path += f"_{self.version_main}" path = path.upper() logger.debug("getting release number from %s" % path) return LooseVersion(urlopen(self.url_repo + path).read().decode()) def parse_exe_version(self): with io.open(self.executable_path, "rb") as f: for line in iter(lambda: f.readline(), b""): match = re.search(rb"platform_handle\x00content\x00([0-9.]*)", line) if match: return LooseVersion(match[1].decode()) def fetch_package(self): """ Downloads ChromeDriver from source :return: path to downloaded file """ u = "%s/%s/%s" % (self.url_repo, self.version_full.vstring, self.zip_name) logger.debug("downloading from %s" % u) # return urlretrieve(u, filename=self.data_path)[0] return urlretrieve(u)[0] def unzip_package(self, fp): """ Does what it says :return: path to unpacked executable 解压缩可执行文件 """ logger.debug("unzipping %s" % fp) try: os.unlink(self.zip_path) except (FileNotFoundError, OSError): pass os.makedirs(self.zip_path, mode=0o755, exist_ok=True) with zipfile.ZipFile(fp, mode="r") as zf: zf.extract(self.exe_name, self.zip_path) os.rename(os.path.join(self.zip_path, self.exe_name), self.executable_path) os.remove(fp) os.rmdir(self.zip_path) os.chmod(self.executable_path, 0o755) return self.executable_path @staticmethod def force_kill_instances(exe_name): """ kills running instances. :param: executable name to kill, may be a path as well :return: True on success else False 通过进程号kill driver """ exe_name = os.path.basename(exe_name) if IS_POSIX: r = os.system("kill -f -9 $(pidof %s)" % exe_name) else: r = os.system("taskkill /f /im %s" % exe_name) return not r @staticmethod def gen_random_cdc(): cdc = random.choices(string.ascii_lowercase, k=26) cdc[-6:-4] = map(str.upper, cdc[-6:-4]) cdc[2] = cdc[0] cdc[3] = "_" return "".join(cdc).encode() def is_binary_patched(self, executable_path=None): """simple check if executable is patched. :return: False if not patched, else True 检查可执行文件补丁 """ executable_path = executable_path or self.executable_path with io.open(executable_path, "rb") as fh: for line in iter(lambda: fh.readline(), b""): if b"cdc_" in line: return False else: return True def patch_exe(self): """ Patches the ChromeDriver binary :return: False on failure, binary name on success """ logger.info("patching driver executable %s" % self.executable_path) linect = 0 replacement = self.gen_random_cdc() with io.open(self.executable_path, "r+b") as fh: for line in iter(lambda: fh.readline(), b""): if b"cdc_" in line: fh.seek(-len(line), 1) newline = re.sub(b"cdc_.{22}", replacement, line) fh.write(newline) linect += 1 return linect def __repr__(self): return "{0:s}({1:s})".format( self.__class__.__name__, self.executable_path, ) def __del__(self): if self._custom_exe_path: # if the driver binary is specified by user # we assume it is important enough to not delete it return else: timeout = 3 # stop trying after this many seconds t = time.monotonic() while True: now = time.monotonic() if now - t > timeout: # we don't want to wait until the end of time logger.debug( "could not unlink %s in time (%d seconds)" % (self.executable_path, timeout) ) break try: os.unlink(self.executable_path) logger.debug("successfully unlinked %s" % self.executable_path) break except (OSError, RuntimeError, PermissionError): time.sleep(0.1) continue except FileNotFoundError: break ================================================ FILE: undetected_chromedriver/reactor.py ================================================ #!/usr/bin/env python3 # this module is part of undetected_chromedriver import asyncio import json import logging import threading logger = logging.getLogger(__name__) class Reactor(threading.Thread): """ 异步事件处理 """ def __init__(self, driver: "Chrome"): super().__init__() self.driver = driver self.loop = asyncio.new_event_loop() self.lock = threading.Lock() self.event = threading.Event() self.daemon = True self.handlers = {} def add_event_handler(self, method_name, callback: callable): """ Parameters ---------- event_name: str example "Network.responseReceived" callback: callable callable which accepts 1 parameter: the message object dictionary Returns ------- """ with self.lock: self.handlers[method_name.lower()] = callback @property def running(self): return not self.event.is_set() def run(self): try: asyncio.set_event_loop(self.loop) self.loop.run_until_complete(self.listen()) except Exception as e: logger.warning("Reactor.run() => %s", e) async def _wait_service_started(self): while True: with self.lock: if ( getattr(self.driver, "service", None) and getattr(self.driver.service, "process", None) and self.driver.service.process.poll() ): await asyncio.sleep(self.driver._delay or 0.25) else: break async def listen(self): while self.running: await self._wait_service_started() await asyncio.sleep(1) try: with self.lock: log_entries = self.driver.get_log("performance") for entry in log_entries: try: obj_serialized: str = entry.get("message") obj = json.loads(obj_serialized) message = obj.get("message") method = message.get("method") if "*" in self.handlers: await self.loop.run_in_executor( None, self.handlers["*"], message ) elif method.lower() in self.handlers: await self.loop.run_in_executor( None, self.handlers[method.lower()], message ) # print(type(message), message) except Exception as e: raise e from None except Exception as e: if "invalid session id" in str(e): pass else: logging.debug("exception ignored :", e) ================================================ FILE: undetected_chromedriver/v2.py ================================================ # for backward compatibility import sys sys.modules[__name__] = sys.modules[__package__] ================================================ FILE: undetected_chromedriver/webelement.py ================================================ import selenium.webdriver.remote.webelement class WebElement(selenium.webdriver.remote.webelement.WebElement): """ Custom WebElement class which makes it easier to view elements when working in an interactive environment. standard webelement repr: using this WebElement class: )> selenium.webdriver.remote.webelement.WebElement 自定义的WebElement类,WebElement类可以代表任何 Web 对象,是selenium中所有元素的父类,也就是webelement对象拥有的方法,其它元素对象都会有。 如 div、a标签。 """ @property def attrs(self): """ attr:div:
""" if not hasattr(self, "_attrs"): self._attrs = self._parent.execute_script( """ var items = {}; for (index = 0; index < arguments[0].attributes.length; ++index) { items[arguments[0].attributes[index].name] = arguments[0].attributes[index].value }; return items; """, self, ) return self._attrs def __repr__(self): strattrs = " ".join([f'{k}="{v}"' for k, v in self.attrs.items()]) if strattrs: strattrs = " " + strattrs return f"{self.__class__.__name__} <{self.tag_name}{strattrs}>"