Repository: lixi5338619/magical_spider
Branch: main
Commit: 7a129f7207c3
Files: 29
Total size: 94.3 KB
Directory structure:
gitextract_qe1fffch/
├── README.md
├── browserapi.py
├── config/
│ └── system_info.py
├── db.py
├── demo/
│ ├── runflow.py
│ ├── 单任务GET-demo.py
│ ├── 单任务POST-demo.py
│ ├── 多任务demo.py
│ ├── 抖音-步骤拆解demo.py
│ └── 药监局.py
├── engine.py
├── middlerware.py
├── models.py
├── server.py
├── settings.py
├── static/
│ ├── css/
│ │ └── index.css
│ └── docs/
│ ├── program.txt
│ └── 部署.txt
├── templates/
│ └── index.html
└── undetected_chromedriver/
├── __init__.py
├── _compat.py
├── cdp.py
├── devtool.py
├── dprocess.py
├── options.py
├── patcher.py
├── reactor.py
├── v2.py
└── webelement.py
================================================
FILE CONTENTS
================================================
================================================
FILE: README.md
================================================
# magical_spider
神奇的蜘蛛🕷,一个几乎适用于所有web端站点的采集方案。
### 诞生背景
纯属瞎扯:2022年全球变暖,各行业内卷严重,爬虫届更是入门抖音起步瑞数,为了减缓人才流失,推出magical_spider。
真实原因:一时兴起,吾辈当自强,重铸selenium荣光!
博客地址: [lxspider](http://www.lxspider.com) 爬虫逆向工具站:[lxtools](http://www.cnlans.com/lx/tools)
### 项目简介
- 非常规driver.page_source。
- 通过Flask远程调用chromedriver实现XMLHttpRequest。
- 通过sqlite记录任务状态。
- 通过undetected_selenium+stealth.min.js绕过一些校验。
- 目前适用于瑞数、加速乐等cookie加密,以及头条系的请求过程加密。
### 项目声明
- 项目仅供学习参考。
- 如有风控校验需自行解决,滑块可参考middlerware.py。
- 方案适用于应急场景或数据量要求不高时,若时间充裕建议通过逆向处理。推荐阅读:[《爬虫逆向进阶实战》](https://github.com/lixi5338619/lxBook)
### 部署
[linux部署文档](static/docs/部署.txt)
---
## 使用说明
1、配置settings.py,启动 flask 服务
2、运行方法参考demo文件内容,主要借助runflow.py。
3、测试代码
GET请求
```python
from demo.runflow import magical_start,magical_request,magical_close
project_name = 'cnipa'
base_url = 'https://www.cnipa.gov.cn'
session_id,process_url = magical_start(project_name,base_url)
print(len(magical_request(session_id, process_url,'https://www.cnipa.gov.cn/col/col57/index.html')))
magical_close(session_id,process_url,project_name)
```
POST请求
```python
from demo.runflow import magical_start,magical_request,magical_close
import json
project_name = 'chinadrugtrials'
base_url = 'http://www.chinadrugtrials.org.cn'
session_id,process_url = magical_start(project_name,base_url)
data = {"id": "","ckm_index": "","sort": "desc","sort2": "","rule": "CTR","secondLevel": "0","currentpage": "2","keywords": "","reg_no": "","indication": "","case_no": "","drugs_name": "","drugs_type": "","appliers": "","communities": "","researchers": "","agencies": "","state": ""}
formdata = json.dumps(data)
print(magical_request(session_id=session_id, process_url=process_url,
request_url='http://www.chinadrugtrials.org.cn/clinicaltrials.searchlist.dhtml',
request_type='post',formdata=formdata
))
magical_close(session_id,process_url,project_name)
```
4、index页可以查看和管理当前运行中的任务,也能查看系统内存和磁盘使用情况。
5、demo文件夹中有任务流程汇总runflow.py,以及抖音、药监局案例,单任务和多任务示例。

================================================
FILE: browserapi.py
================================================
# -*- coding: utf-8 -*-
import undetected_chromedriver as webdriver
from undetected_chromedriver.options import ChromeOptions
from settings import *
from selenium.webdriver import ActionChains
from middlerware import Slide
import time
import platform
class Browser():
    """Chrome session built with undetected_chromedriver and configured from ``settings``.

    Settings read by the constructor (all star-imported from ``settings``):

    headless_enable: add ``--headless``.
    plugin_enable: when true, add the ``--disable-images``, ``--disable-plugins``,
        ``disable-audio`` and ``disable-translate`` arguments.
    proxy: proxy server address, e.g. http://127.0.0.1:8888.
    logging_enable: when true, add ``log-level=3``.
    incognito_enable: add ``--incognito``.
    detach_enable: set the experimental ``detach`` option.
    stealth_enable: inject the stealth script into every new document.
    """

    def __init__(self):
        options = ChromeOptions()
        options.add_argument("--lang=en-us")
        on_linux = platform.system().lower() == 'linux'
        # Linux is always run headless; add the flag once even when
        # headless_enable is also set.
        if headless_enable or on_linux:
            options.add_argument("--headless")
        if plugin_enable:
            options.add_argument('--disable-images')
            options.add_argument('--disable-plugins')
            options.add_argument('disable-audio')
            options.add_argument('disable-translate')
        if proxy:
            options.add_argument('--proxy-server=' + proxy)
        if logging_enable:
            options.add_argument('log-level=3')
        if incognito_enable:
            options.add_argument("--incognito")
        if detach_enable:
            options.add_experimental_option("detach", True)
        if on_linux:
            options.add_argument('--no-sandbox')
            options.add_argument('--disable-gpu')
            options.add_argument('--disable-dev-shm-usage')
        self.browser = webdriver.Chrome(driver_executable_path=driverpath, options=options)
        # ``stealth_enable`` here is the module-level setting, not the method below.
        if stealth_enable:
            self.stealth_enable()

    def stealth_enable(self):
        """Register the script at ``stealth_path`` to be evaluated on every new document."""
        with open(stealth_path, 'r', encoding='utf-8') as file:
            stealth_min_js = file.read()
        self.browser.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
            "source": stealth_min_js
        })

    def start_request(self, url):
        """Navigate to ``url`` and return the underlying driver."""
        self.browser.get(url)
        return self.browser

    def close(self):
        """Close the current window and always shut the driver down.

        ``quit()`` runs even when ``close()`` raises (for example because the
        window is already gone), so the driver process is not left behind.
        """
        try:
            self.browser.close()
        finally:
            self.browser.quit()
class BrowserApi():
    """Issue requests from inside an already-open browser page.

    ``browser`` is a WebDriver-like object; this class only uses ``get``,
    ``page_source``, ``execute_script`` and the element lookup used by
    ``check_slide``.
    """

    # The URL (and POST body) are handed to the script through ``arguments``
    # instead of being spliced into the JavaScript source, so quotes or
    # backslashes in them can no longer break or alter the script.
    # NOTE(review): the promise is only resolved for HTTP 200; for any other
    # status it stays pending - presumably the call then ends on the driver's
    # script timeout. Confirm before relying on it.
    _GET_SCRIPT = '''
        function queryData(url) {
            var p = new Promise(function(resolve,reject) {
                var h = new XMLHttpRequest;
                h.open("GET", url, true);
                h.setRequestHeader("salute-by","lx");
                h.onreadystatechange = function() {
                    if(h.readyState === 4 && h.status === 200) {
                        resolve(h.responseText);
                    }
                };
                h.send(null);
            });
            return p;
        }
        var p1 = queryData(arguments[0]);
        var res = Promise.all([p1]).then(function(result){
            return result
        })
        return res
    '''

    _POST_SCRIPT = '''
        function queryData(url, data) {
            var p = new Promise(function(resolve,reject) {
                var h = new XMLHttpRequest;
                h.open("POST", url, true);
                h.setRequestHeader("accept","application/json, text/plain, */*");
                h.setRequestHeader("content-type","application/json;charset=UTF-8");
                h.setRequestHeader("salute-by","lx");
                h.onreadystatechange = function() {
                    if(h.readyState === 4 && h.status === 200) {
                        resolve(h.responseText);
                    }
                };
                h.send(data);
            });
            return p;
        }
        var p1 = queryData(arguments[0], arguments[1]);
        var res = Promise.all([p1]).then(function(result){
            return result
        })
        return res;
    '''

    def __init__(self, browser):
        self.browser = browser

    def browser_ps(self, url):
        """Navigate to ``url`` and return the page source."""
        self.browser.get(url)
        return self.browser.page_source

    def browser_get(self, url):
        """Send an XMLHttpRequest GET to ``url`` from the page and return the response text."""
        doc = self.browser.execute_script(self._GET_SCRIPT, url)
        return doc[0]

    def browser_post(self, url, formdata=""):
        """Send an XMLHttpRequest POST with ``formdata`` as the body and return the response text.

        The request is sent with a JSON content-type header; ``formdata`` is
        passed through to ``XMLHttpRequest.send`` unchanged.
        """
        doc = self.browser.execute_script(self._POST_SCRIPT, url, formdata)
        return doc[0]

    def check_slide(self, bg_xpath, gap_xpath, slider_xpath, domain=None):
        """Repeatedly solve a slider captcha until a step fails.

        bg_xpath: xpath of the background image that contains the gap
        gap_xpath: xpath of the gap (puzzle piece) image
        slider_xpath: xpath of the slider handle to drag
        domain: prefix prepended to an image ``src`` that does not start with ``http``

        Each pass reads both image URLs, computes the drag distance with
        ``Slide`` and performs the drag. The loop ends as soon as locating /
        recognising the images or dragging raises.
        """
        while 1:
            try:
                # All three locators are xpaths; the background used to be
                # looked up by id, which could not match an xpath expression.
                bg = self.browser.find_element_by_xpath(bg_xpath).get_attribute('src')
                gap = self.browser.find_element_by_xpath(gap_xpath).get_attribute('src')
                if not bg.startswith('http'):
                    bg = domain + bg
                if not gap.startswith('http'):
                    gap = domain + gap
                slide_app = Slide(gap=gap, bg=bg)
                distance = slide_app.discern()
            except Exception:
                break
            try:
                slider = self.browser.find_element_by_xpath(slider_xpath)
                ActionChains(self.browser).click_and_hold(slider).perform()
                _tracks = slide_app.get_tracks(distance)
                # Adjust the last step so the steps add up to exactly ``distance``.
                _tracks[-1] -= sum(_tracks) - distance
                for mouse_x in _tracks:
                    ActionChains(self.browser).move_by_offset(mouse_x, 0).perform()
                ActionChains(self.browser).release().perform()
                time.sleep(1)
            except Exception:
                break
================================================
FILE: config/system_info.py
================================================
# -*- coding: utf-8 -*-
import psutil
class SystemInfoUtil(object):
    """Helpers that report memory and disk usage through psutil."""

    @classmethod
    def get_format_byte(cls, value):
        """Format a byte count as a whole number of the largest non-zero unit (B/KB/MB/GB).

        Units are 1024-based. The remainder of the next smaller unit is scaled
        by 1/1024 before rounding, so e.g. 1 GiB + 500 MiB is reported as
        ``1GB`` (it was previously scaled by 0.001 and rounded up to ``2GB``).
        """
        kb, b = divmod(value, 1024)
        mb, kb = divmod(kb, 1024)
        gb, mb = divmod(mb, 1024)
        if gb > 0:
            return f'{round(gb + mb / 1024)}GB'
        elif mb > 0:
            return f'{round(mb + kb / 1024)}MB'
        elif kb > 0:
            return f'{round(kb + b / 1024)}KB'
        else:
            return f'{round(b)}B'

    @classmethod
    def get_virtual_memory(cls):
        """Return memory usage from ``psutil.virtual_memory()``.

        Keys: ``total``, ``available`` and ``used`` (raw values), each with a
        ``*_format`` string produced by ``get_format_byte``, plus the rounded
        ``percent``.
        """
        virtual_memory = psutil.virtual_memory()
        return {
            'total': virtual_memory.total,
            'total_format': cls.get_format_byte(virtual_memory.total),
            'available': virtual_memory.available,
            'available_format': cls.get_format_byte(virtual_memory.available),
            'percent': round(virtual_memory.percent),
            'used': virtual_memory.used,
            'used_format': cls.get_format_byte(virtual_memory.used),
        }

    @classmethod
    def get_disk_usage(cls):
        """Return disk usage of ``/`` from ``psutil.disk_usage``.

        Keys: ``total``, ``used`` and ``free`` (raw values), each with a
        ``*_format`` string, plus the rounded ``percent``.
        """
        disk_usage = psutil.disk_usage('/')
        return {
            'total': disk_usage.total,
            'total_format': cls.get_format_byte(disk_usage.total),
            'used': disk_usage.used,
            'used_format': cls.get_format_byte(disk_usage.used),
            'free': disk_usage.free,
            'free_format': cls.get_format_byte(disk_usage.free),
            'percent': round(disk_usage.percent),
        }
================================================
FILE: db.py
================================================
import os.path
import sqlite3
from models import *
from settings import magicalpath
def create_connection():
    """Open and return a new sqlite3 connection to the file at ``magicalpath``."""
    return sqlite3.connect(magicalpath)
def select_process():
    """Return every row of the ``process`` table as a list of tuples.

    The cursor and connection are closed even when the query raises.
    """
    db = create_connection()
    try:
        con = db.cursor()
        try:
            con.execute("select * from process")
            return con.fetchall()
        finally:
            con.close()
    finally:
        db.close()
def select_process_name(processName:str)->Process:
    """Return the first ``process`` row whose ``processName`` matches, or None.

    The value returned is the raw ``fetchone()`` row (a tuple), not a
    ``Process`` instance, despite the annotation. The cursor and connection
    are closed even when the query raises.
    """
    db = create_connection()
    try:
        con = db.cursor()
        try:
            con.execute("select * from process where processName=?", (processName,))
            return con.fetchone()
        finally:
            con.close()
    finally:
        db.close()
def select_process_id(processId:str)->Process:
    """Return the first ``process`` row whose ``processId`` matches, or None.

    The value returned is the raw ``fetchone()`` row (a tuple), not a
    ``Process`` instance, despite the annotation. The cursor and connection
    are closed even when the query raises.
    """
    db = create_connection()
    try:
        con = db.cursor()
        try:
            con.execute("select * from process where processId=?", (processId,))
            return con.fetchone()
        finally:
            con.close()
    finally:
        db.close()
def insert_process(process:Process)->Process:
    """Insert ``process`` into the ``process`` table.

    Returns ``process`` on success. On any error the error is printed, the
    transaction is rolled back and None is returned.

    Values are bound with ``?`` placeholders instead of being formatted into
    the SQL text, and ``baseUrl`` is now stored as well so the row carries the
    URL needed to rebuild the browser. ``createTime`` is the current time,
    stored in the same string form as before.
    """
    db = create_connection()
    try:
        cursor = db.cursor()
        try:
            cursor.execute(
                "insert into process(processId, processName, processUrl, createTime, baseUrl) "
                "values (?,?,?,?,?)",
                (process.processId, process.processName, process.processUrl,
                 str(datetime.datetime.now()), process.baseUrl),
            )
            db.commit()
            return process
        except Exception as e:
            print(e)
            db.rollback()
            return None
        finally:
            cursor.close()
    finally:
        db.close()
def delete_process(process_name):
    """Delete the ``process`` rows whose ``processName`` equals ``process_name``.

    Best effort: on error the error is printed and the transaction rolled
    back; nothing is raised. The name is bound with a ``?`` placeholder rather
    than formatted into the SQL text.
    """
    db = create_connection()
    try:
        cursor = db.cursor()
        try:
            cursor.execute("delete FROM process where processName = ?", (process_name,))
            db.commit()
        except Exception as e:
            print(e)
            db.rollback()
        finally:
            cursor.close()
    finally:
        db.close()
def delete_process_id(processId):
    """Delete the ``process`` rows whose ``processId`` equals ``processId``.

    Best effort: on error the error is printed and the transaction rolled
    back; nothing is raised. The id is bound with a ``?`` placeholder rather
    than formatted into the SQL text.
    """
    db = create_connection()
    try:
        cursor = db.cursor()
        try:
            cursor.execute("delete FROM process where processId = ?", (processId,))
            db.commit()
        except Exception as e:
            print(e)
            db.rollback()
        finally:
            cursor.close()
    finally:
        db.close()
if __name__ == '__main__':
    # Bootstrap: create the database file and the ``process`` table, but only
    # when the file does not exist yet. The connection is closed afterwards.
    if not os.path.exists(magicalpath):
        con = sqlite3.connect(magicalpath)
        try:
            cursor = con.cursor()
            cursor.execute("CREATE TABLE IF NOT EXISTS `process`(`processId` VARCHAR(90),`processName` VARCHAR(90) UNIQUE,`processUrl` VARCHAR(256),`createTime` DATA,`baseUrl` VARCHAR(256));")
            con.commit()
        finally:
            con.close()
================================================
FILE: demo/runflow.py
================================================
import requests
sess = requests.session()
host = 'http://127.0.0.1:5000'
def magical_start(project_name,base_url = 'http://www.lxspider.com'):
    """Step 1: ask the server to create a browser for ``project_name`` at ``base_url``.

    Returns the ``(session_id, process_url)`` pair taken from the JSON reply.
    """
    payload = {'name': project_name, 'url': base_url}
    created = sess.post(f'{host}/create', data=payload).json()
    return created['session_id'], created['process_url']
def magical_request(session_id,process_url,request_url,request_type='get',formdata=''):
    """Step 2: have the server-side browser request ``request_url`` and return the body.

    ``request_type`` is matched case-insensitively and always forwarded in
    lower case: the server compares it with the literal ``'get'``, so an
    upper-case ``'GET'`` used to be handled as a POST. ``formdata`` is only
    sent for POST requests.
    """
    method = request_type.lower()
    data = {'session_id': session_id, 'process_url': process_url,
            'request_url': request_url, 'request_type': method}
    if method == 'post':
        data['formdata'] = formdata
    result = sess.post(f'{host}/xhr', data=data).json()
    return result['result']
def magical_close(session_id,process_url,process_name):
    """Final step: ask the server to close the browser behind ``session_id``.

    The JSON reply is decoded and discarded.
    """
    close_data = {
        'session_id': session_id,
        'process_url': process_url,
        'process_name': process_name,
    }
    response = sess.post(f'{host}/close', data=close_data)
    response.json()
================================================
FILE: demo/单任务GET-demo.py
================================================
# Single-task GET demo: open one browser session, fetch the same page 200
# times through it, then close the session.
from demo.runflow import magical_start,magical_request,magical_close

project_name = 'cnipa'
base_url = 'https://www.cnipa.gov.cn'
session_id,process_url = magical_start(project_name,base_url)
try:
    for i in range(200):
        print(len(magical_request(session_id, process_url,'https://www.cnipa.gov.cn/col/col2486/index.html')))
finally:
    # Always release the server-side browser, even when a request fails.
    magical_close(session_id,process_url,project_name)
================================================
FILE: demo/单任务POST-demo.py
================================================
# Single-task POST demo: open one browser session, send the same JSON search
# request 100 times through it, then close the session.
from demo.runflow import magical_start,magical_request,magical_close
import json

# The POST example was missing at first; thanks to [尘川] for the reminder (2022/08/10).
project_name = 'chinadrugtrials'
base_url = 'http://www.chinadrugtrials.org.cn'
session_id,process_url = magical_start(project_name,base_url)
data = {"id": "","ckm_index": "","sort": "desc","sort2": "","rule": "CTR","secondLevel": "0","currentpage": "2","keywords": "","reg_no": "","indication": "","case_no": "","drugs_name": "","drugs_type": "","appliers": "","communities": "","researchers": "","agencies": "","state": ""}
formdata = json.dumps(data)
try:
    for i in range(100):
        print(len(magical_request(session_id=session_id, process_url=process_url,
                                  request_url='http://www.chinadrugtrials.org.cn/clinicaltrials.searchlist.dhtml',
                                  request_type='post',formdata=formdata
                                  )))
finally:
    # Always release the server-side browser, even when a request fails.
    magical_close(session_id,process_url,project_name)
================================================
FILE: demo/多任务demo.py
================================================
from demo.runflow import magical_start,magical_request,magical_close
import time
# 各任务间互不影响,可选择使用多线程或多进程,大家自由发挥
def r1():
    """Task 1: open a session, fetch three pages five seconds apart, then close it.

    NOTE(review): the sleep is assumed to belong to the loop body; the
    indentation of this file was lost - confirm.
    """
    project_name1 = '药监局新闻任务1'
    s1,p1 = magical_start(project_name1,'https://www.nmpa.gov.cn')
    request_list = [
        'https://www.nmpa.gov.cn/xxgk/ggtg/index.html',
        'https://www.nmpa.gov.cn/xxgk/fgwj/index.html',
        'https://www.nmpa.gov.cn/xxgk/fgwj/index.html'
    ]
    try:
        for request_url in request_list:
            print("r1:", len(magical_request(s1, p1, request_url)))
            time.sleep(5)
    finally:
        # Always release the server-side browser, even when a request fails.
        magical_close(s1,p1,project_name1)
def r2():
    """Task 2: open a session, fetch three pages five seconds apart, then close it.

    NOTE(review): the sleep is assumed to belong to the loop body; the
    indentation of this file was lost - confirm.
    """
    project_name2 = '药监局新闻任务2'
    s2,p2 = magical_start(project_name2,'https://www.nmpa.gov.cn')
    request_list = ['https://www.nmpa.gov.cn/zwgk/rshxx/index.html',
                    'https://www.nmpa.gov.cn/zwgk/xwfb/index.html',
                    'https://www.nmpa.gov.cn/zwgk/xwfb/index.html'
                    ]
    try:
        for request_url in request_list:
            print("r2:", len(magical_request(s2, p2, request_url)))
            time.sleep(5)
    finally:
        # Always release the server-side browser, even when a request fails.
        magical_close(s2,p2,project_name2)
import threading

# Run both tasks concurrently, one thread per task; both thread objects are
# created before either is started.
thread1, thread2 = (threading.Thread(target=task) for task in (r1, r2))
for worker in (thread1, thread2):
    worker.start()
================================================
FILE: demo/抖音-步骤拆解demo.py
================================================
import requests
# 步骤拆解,简化版查看 药监局.py
# project_name must be unique; do not create two tasks with the same name.
project_name = '抖音任务2'
base_url = 'https://www.douyin.com'

# 1. Initialise the browser and read back its session details.
create_payload = {'name':project_name,'url':base_url}
result = requests.post('http://127.0.0.1:5000/create',data=create_payload).json()
session_id, process_url = result['session_id'], result['process_url']
# 2、request browser_xhr
# URL需要更换为你浏览器中的
request_list = [
'https://www.douyin.com/aweme/v1/web/search/item/?device_platform=webapp&aid=6383&channel=channel_pc_web&search_channel=aweme_video_web&sort_type=0&publish_time=0&keyword=%E9%9E%A0%E5%A9%A7%E7%A5%8E&search_source=normal_search&query_correct_type=1&is_filter_search=0&from_group_id=&offset=0&count=10&version_code=170400&version_name=17.4.0&cookie_enabled=true&screen_width=1920&screen_height=1080&browser_language=zh-CN&browser_platform=Win32&browser_name=Chrome&browser_version=104.0.0.0&browser_online=true&engine_name=Blink&engine_version=104.0.0.0&os_name=Windows&os_version=10&cpu_core_num=20&device_memory=8&platform=PC&downlink=10&effective_type=4g&round_trip_time=50&webid=7122050458177701414',
'https://www.douyin.com/aweme/v1/web/search/item/?device_platform=webapp&aid=6383&channel=channel_pc_web&search_channel=aweme_video_web&sort_type=0&publish_time=0&keyword=lx&search_source=normal_search&query_correct_type=1&is_filter_search=0&from_group_id=&offset=0&count=10&version_code=170400&version_name=17.4.0&cookie_enabled=true&screen_width=1920&screen_height=1080&browser_language=zh-CN&browser_platform=Win32&browser_name=Chrome&browser_version=104.0.0.0&browser_online=true&engine_name=Blink&engine_version=104.0.0.0&os_name=Windows&os_version=10&cpu_core_num=20&device_memory=8&platform=PC&downlink=10&effective_type=4g&round_trip_time=50&webid=7122050458177701414',
'https://www.douyin.com/aweme/v1/web/search/item/?device_platform=webapp&aid=6383&channel=channel_pc_web&search_channel=aweme_video_web&sort_type=0&publish_time=0&keyword=pythonlx&search_source=normal_search&query_correct_type=1&is_filter_search=0&from_group_id=&offset=0&count=10&version_code=170400&version_name=17.4.0&cookie_enabled=true&screen_width=1920&screen_height=1080&browser_language=zh-CN&browser_platform=Win32&browser_name=Chrome&browser_version=104.0.0.0&browser_online=true&engine_name=Blink&engine_version=104.0.0.0&os_name=Windows&os_version=10&cpu_core_num=20&device_memory=8&platform=PC&downlink=10&effective_type=4g&round_trip_time=50&webid=7122050458177701414',
'https://www.douyin.com/aweme/v1/web/search/item/?device_platform=webapp&aid=6383&channel=channel_pc_web&search_channel=aweme_video_web&sort_type=0&publish_time=0&keyword=lx666&search_source=normal_search&query_correct_type=1&is_filter_search=0&from_group_id=&offset=0&count=10&version_code=170400&version_name=17.4.0&cookie_enabled=true&screen_width=1920&screen_height=1080&browser_language=zh-CN&browser_platform=Win32&browser_name=Chrome&browser_version=104.0.0.0&browser_online=true&engine_name=Blink&engine_version=104.0.0.0&os_name=Windows&os_version=10&cpu_core_num=20&device_memory=8&platform=PC&downlink=10&effective_type=4g&round_trip_time=50&webid=7122050458177701414',
]
try:
    # Fetch every URL through the server-side browser and print the body length.
    for request_url in request_list:
        data = {'session_id':session_id,'process_url':process_url,
                'request_url':request_url,'request_type':'get'}
        result = requests.post('http://127.0.0.1:5000/xhr',data=data).json()
        print(len(result['result']))
finally:
    # 3. Close the browser - always, even when one of the requests above fails.
    close_data = {'session_id':session_id,'process_url':process_url,'process_name':project_name}
    requests.post('http://127.0.0.1:5000/close',data=close_data).json()
================================================
FILE: demo/药监局.py
================================================
# NMPA demo: open one browser session, print the body of each listed page,
# then close the session.
from demo.runflow import magical_start,magical_request,magical_close

project_name = '药监局1'
base_url = 'https://www.nmpa.gov.cn'
request_list = [
    'https://www.nmpa.gov.cn/yaopin/ypjgdt/index.html',
    'https://www.nmpa.gov.cn/yaopin/ypjgdt/20220705190551125.html'
]
session_id,process_url = magical_start(project_name,base_url)
try:
    for request_url in request_list:
        print(magical_request(session_id, process_url, request_url))
finally:
    # Always release the server-side browser, even when a request fails.
    magical_close(session_id,process_url,project_name)
================================================
FILE: engine.py
================================================
# -*- coding: utf-8 -*-
from browserapi import Browser,BrowserApi
from db import *
from models import Process
from selenium import webdriver
from selenium.webdriver.remote.webdriver import WebDriver
def create_browser(url,name):
    """Start a browser at ``url`` and register it in the process table under ``name``.

    Returns the driver. If the row cannot be inserted (``insert_process``
    returns None, for example because ``name`` is already taken), the freshly
    started browser is closed and RuntimeError is raised, so no unregistered
    browser is left running.
    """
    bro = Browser()
    browser = bro.start_request(url)
    session_id = browser.session_id
    process_url = browser.command_executor._url
    if insert_process(Process(session_id, name, process_url, url)) is None:
        bro.close()
        raise RuntimeError(f"could not register process {name!r}")
    return browser
def attachToSession(session_id,url):
    """Return a Remote driver bound to the existing session ``session_id`` at ``url``.

    ``WebDriver.execute`` is replaced while the Remote driver is constructed
    so that the ``newSession`` command returns a canned reply carrying
    ``session_id`` instead of being executed. The original method is restored
    in a ``finally`` block, so a failing constructor no longer leaves the
    class patched.

    NOTE(review): the patch is applied to the class itself, so concurrent
    calls presumably interfere with each other - confirm.
    """
    original_execute = WebDriver.execute

    def new_command_execute(self, command, params=None):
        if command == "newSession":
            return {'success': 0, 'value': None, 'sessionId': session_id}
        return original_execute(self, command, params)

    WebDriver.execute = new_command_execute
    try:
        driver = webdriver.Remote(command_executor=url, desired_capabilities={})
        driver.session_id = session_id
    finally:
        WebDriver.execute = original_execute
    return driver
def carry_browser(session_id,process_url,request_url,request_type,formdata):
    """Run one in-page request on the browser behind ``session_id`` and return the body.

    If attaching to the session fails, the browser is rebuilt from its stored
    row (same name and base URL); RuntimeError is raised when no row exists
    for ``session_id``.

    ``request_type`` equal to ``'get'`` in any letter case sends a GET; any
    other value (including None) sends a POST with ``formdata``.
    """
    try:
        browser = attachToSession(session_id,process_url)
    except Exception:
        # Guard against a crashed window: rebuild the browser from its record.
        print("防止窗口崩溃 -> 增加的重建操作")
        browser_info = select_process_id(session_id)
        if browser_info is None:
            raise RuntimeError(f"no process recorded for session {session_id!r}")
        base_url = browser_info[4]
        process_name = browser_info[1]
        delete_process(process_name)
        browser = create_browser(base_url,process_name)
        print("browser 重建成功")
    broapi = BrowserApi(browser)
    if isinstance(request_type, str) and request_type.lower() == 'get':
        return broapi.browser_get(request_url)
    return broapi.browser_post(request_url,formdata)
def close_browser(session_id,process_url,process_name):
    """Remove the process row, then close and quit the browser behind ``session_id``.

    ``quit()`` runs even when ``close()`` raises, so the driver is always
    asked to shut down; the exception from ``close()`` still propagates.
    """
    delete_process(process_name)
    browser = attachToSession(session_id,process_url)
    try:
        browser.close()
    finally:
        browser.quit()
def select_all_process():
    """Return whatever ``select_process()`` returns (all stored process rows)."""
    rows = select_process()
    return rows
================================================
FILE: middlerware.py
================================================
# -*- coding: utf-8 -*-
import os,requests
from urllib.parse import urlparse
try:
import cv2, numpy as np
except:
...
class Slide(object):
    """Slider-captcha helper: find the gap position and build a drag track."""

    def __init__(self, gap, bg, gap_size=None, bg_size=None, out=None):
        """
        :param gap: file path or URL of the gap (puzzle piece) image
        :param bg: file path or URL of the background image that contains the gap
        :param gap_size: size a downloaded gap image is resized to, default (68, 68)
        :param bg_size: size a downloaded background image is resized to, default (340, 212)
        :param out: path the annotated match image is written to,
            default ``<cwd>/img/out.jpg``
        """
        self.img_dir = os.path.join(os.getcwd(), 'img')
        if not os.path.exists(self.img_dir):
            os.makedirs(self.img_dir)
        bg_resize = bg_size if bg_size else (340, 212)
        gap_size = gap_size if gap_size else (68, 68)
        self.bg = self.check_is_img_path(bg, 'bg', resize=bg_resize)
        self.gap = self.check_is_img_path(gap, 'gap', resize=gap_size)
        self.out = out if out else os.path.join(self.img_dir, 'out.jpg')

    @staticmethod
    def check_is_img_path(img, img_type, resize):
        """Return a local path for ``img``.

        A value starting with ``http`` is downloaded, decoded, optionally
        resized to ``resize`` and written to ``./img/<img_type>.jpg``; any
        other value is returned unchanged. Raises Exception when the download
        does not answer with status 200.
        """
        if img.startswith('http'):
            headers = {
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;"
                          "q=0.8,application/signed-exchange;v=b3;q=0.9",
                "Accept-Encoding": "gzip, deflate, br",
                "Accept-Language": "zh-CN,zh;q=0.9,en-GB;q=0.8,en;q=0.7,ja;q=0.6",
                "Cache-Control": "max-age=0",
                "Connection": "keep-alive",
                "Host": urlparse(img).hostname,
                "Upgrade-Insecure-Requests": "1",
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                              "Chrome/91.0.4472.164 Safari/537.36",
            }
            img_res = requests.get(img, headers=headers)
            if img_res.status_code == 200:
                img_path = f'./img/{img_type}.jpg'
                image = np.asarray(bytearray(img_res.content), dtype="uint8")
                image = cv2.imdecode(image, cv2.IMREAD_COLOR)
                if resize:
                    image = cv2.resize(image, dsize=resize)
                cv2.imwrite(img_path, image)
                return img_path
            else:
                raise Exception(f"保存{img_type}图片失败")
        else:
            return img

    @staticmethod
    def clear_white(img):
        """Crop the image at path ``img`` to the bounding box of its coloured pixels.

        A pixel counts as coloured when its channel values are not all equal.
        Row 0 and column 0 are not scanned, and the slice excludes the last
        coloured row and column, as before.

        The minimum and maximum are now tracked independently; the former
        ``if/elif`` skipped the maximum update whenever a pixel set a new
        minimum, so the right/bottom bound could be too small. The minimum
        also starts from the image size instead of a fixed 255.
        """
        img = cv2.imread(img)
        rows, cols = img.shape[:2]
        min_x, min_y = rows, cols
        max_x = max_y = 0
        for x in range(1, rows):
            for y in range(1, cols):
                if len(set(img[x, y])) >= 2:
                    min_x = min(min_x, x)
                    max_x = max(max_x, x)
                    min_y = min(min_y, y)
                    max_y = max(max_y, y)
        return img[min_x:max_x, min_y:max_y]

    def template_match(self, tpl, target):
        """Locate ``tpl`` inside ``target`` and return the x coordinate of the best match.

        The matched rectangle is drawn on ``target`` and the result is written
        to ``self.out``.
        """
        th, tw = tpl.shape[:2]
        result = cv2.matchTemplate(target, tpl, cv2.TM_CCOEFF_NORMED)
        # Position of the highest score is the top-left corner of the match.
        min_val, max_val, min_loc, max_loc = cv2.minMaxLoc(result)
        tl = max_loc
        br = (tl[0] + tw, tl[1] + th)
        # Mark the matched area: corners tl/br, colour (0, 0, 255), thickness 2.
        cv2.rectangle(target, tl, br, (0, 0, 255), 2)
        cv2.imwrite(self.out, target)
        return tl[0]

    @staticmethod
    def image_edge_detection(img):
        """Return the Canny edge map of ``img`` with thresholds 100 and 200."""
        edges = cv2.Canny(img, 100, 200)
        return edges

    def discern(self):
        """Return the x position of the gap in the background image.

        Both images are reduced to edge maps before template matching.
        """
        img1 = self.clear_white(self.gap)
        img1 = cv2.cvtColor(img1, cv2.COLOR_RGB2GRAY)
        slide = self.image_edge_detection(img1)
        back = cv2.imread(self.bg, 0)
        back = self.image_edge_detection(back)
        slide_pic = cv2.cvtColor(slide, cv2.COLOR_GRAY2RGB)
        back_pic = cv2.cvtColor(back, cv2.COLOR_GRAY2RGB)
        # x coordinate of the slider position on the background
        x = self.template_match(slide_pic, back_pic)
        return x

    @staticmethod
    def get_tracks(distance, rate=0.6, t=0.2, v=0):
        """Split ``distance`` into a list of small integer steps.

        The motion accelerates (a = 20) until ``rate * distance`` has been
        covered and decelerates (a = -3) afterwards.

        :param distance: total distance to cover
        :param rate: fraction of ``distance`` at which acceleration turns into deceleration
        :param t: duration of one step
        :param v: initial velocity
        :return: list of per-step distances, each rounded to an integer; the
            list is empty when ``distance`` is not positive
        """
        tracks = []
        # Distance at which the motion switches to deceleration.
        mid = rate * distance
        # Distance covered so far.
        s = 0
        while s < distance:
            v0 = v
            a = 20 if s < mid else -3
            # Distance covered during this step.
            s0 = v0 * t + 0.5 * a * t * t
            # Velocity at the end of this step.
            v = v0 + a * t
            # Pixels are whole numbers, so round each step.
            tracks.append(round(s0))
            s += s0
        return tracks
================================================
FILE: models.py
================================================
import datetime
class Process:
    """Record describing one running browser task."""

    def __init__(self, processId, processName,processUrl,baseUrl,createTime = None) -> None:
        """
        :param processId: identifier of the browser session
        :param processName: name of the task
        :param processUrl: URL of the driver's command executor
        :param baseUrl: URL the browser was started on
        :param createTime: creation time; defaults to the moment the instance
            is created (the default used to be evaluated once, when the module
            was imported, so every instance shared the same timestamp)
        """
        super().__init__()
        self.processId = processId
        self.processName = processName
        self.processUrl = processUrl
        self.createTime = datetime.datetime.now() if createTime is None else createTime
        self.baseUrl = baseUrl
================================================
FILE: server.py
================================================
# -*- coding: utf-8 -*-
from datetime import timedelta
from flask import Flask,session
from flask import render_template,request,redirect,url_for,jsonify
import os
from engine import *
from config.system_info import SystemInfoUtil
from settings import host,port
# Flask application: a random secret key is generated on every start, and
# SEND_FILE_MAX_AGE_DEFAULT is set to seven days.
app = Flask(__name__)
app.config.update(
    SECRET_KEY=os.urandom(24),
    SEND_FILE_MAX_AGE_DEFAULT=timedelta(days=7),
)
@app.route('/')
def index_info():
    """Render the index page with the task list and disk / memory usage.

    When no task is stored, a single placeholder row is shown instead.
    """
    rows = select_all_process() or [["","没有在运行的任务","",""]]
    return render_template(
        'index.html',
        process=rows,
        disk_usage=SystemInfoUtil.get_disk_usage(),
        virtual_memory=SystemInfoUtil.get_virtual_memory(),
    )
@app.route('/create',methods=['POST'])
def browser_start():
    """Create a browser for form fields ``url`` / ``name`` and return its stored row as JSON.

    On any error the error is printed and ``{"result": 0, "detail": ...}`` is
    returned. Only ``Exception`` is caught, so KeyboardInterrupt and
    SystemExit are no longer swallowed.
    """
    url = request.form.get("url")
    name = request.form.get("name")
    try:
        create_browser(url,name)
        session_id, process_name, process_url, create_time, base_url = select_process_name(name)
        result = {'session_id': session_id, 'process_name': process_name,
                  'process_url': process_url, 'datetime': create_time}
        return jsonify(result)
    except Exception as e:
        print(e)
        return jsonify({"result":0,"detail":"驱动配置错误或任务名已存在"})
@app.route('/xhr',methods=['POST'])
def browser_xhr():
    """Run one in-browser request described by the form fields and return its body as JSON."""
    form = request.form
    session_id = form.get("session_id")
    process_url = form.get("process_url")
    request_url = form.get("request_url")
    request_type = form.get("request_type")
    formdata = form.get("formdata")
    body = carry_browser(session_id, process_url, request_url, request_type, formdata)
    return jsonify({"result": body})
@app.route('/close',methods=['POST'])
def browser_close():
    """Close the browser named by the form fields.

    Returns ``{"result": 1}`` on success. On any error the error is printed
    and ``{"result": 0, "detail": ...}`` is returned. Only ``Exception`` is
    caught, so KeyboardInterrupt and SystemExit are no longer swallowed.
    """
    session_id = request.form.get("session_id")
    process_url = request.form.get("process_url")
    process_name = request.form.get("process_name")
    try:
        close_browser(session_id,process_url,process_name)
        return jsonify({"result":1})
    except Exception as e:
        print(e)
        return jsonify({"result":0,"detail":"驱动窗口已自动关闭"})
@app.route('/delete/<process_name>',methods=['GET'])
def delete_process_name(process_name):
    """Close the task named ``process_name`` and redirect to the index page.

    If looking the task up or closing its browser fails, the stored row is
    still deleted and the error is printed (the former message pointed at a
    line number that no longer matched the file).
    """
    try:
        process = select_process_name(process_name)
        close_browser(session_id=process[0],process_url=process[2],process_name=process_name)
    except Exception as e:
        delete_process(process_name)
        print(f"delete except: {e}")
    return redirect('/')
if __name__ == '__main__':
    # Debug mode is off: the server binds to settings.host (0.0.0.0 in the
    # shipped settings), and debug mode would expose the interactive debugger
    # to every client that can reach the port.
    app.run(host=host,port=port,use_reloader=False,debug=False)
================================================
FILE: settings.py
================================================
# MagicalSpider Settings
# Hide the browser window (adds --headless)
headless_enable = True
# Stealth mode (injects the stealth script); may slow down browser creation
stealth_enable = True
# Proxy server, e.g. http://127.0.0.1:8888; None disables it
proxy = None
# Incognito browsing
incognito_enable = False
# Detach mode (sets the experimental "detach" option)
detach_enable = False
# When True, browserapi adds the --disable-images / --disable-plugins /
# disable-audio / disable-translate arguments
plugin_enable = False
# When True, browserapi adds the log-level=3 argument
logging_enable = False
# Path of the chromedriver executable
driverpath = './config/chromedriver.exe'
# Path of the sqlite database file
magicalpath = './config/magical.db'
# Path of the stealth script injected when stealth_enable is True
stealth_path = './config/stealth.min.js'
# Address and port the Flask server listens on
host = '0.0.0.0'
port = 5000
# Run Selenium on Linux in headed mode:
# xvfb-run python3 test.py -s -screen 0 1920x1080x16
================================================
FILE: static/css/index.css
================================================
body{
background: url(/static/image/bg.png);
background-size: 100% 100%;
background-repeat:no-repeat;
}
a{
text-decoration:none;
}
p{
font-size: 18px;
color: white;
}
img{
width: 207.99px;
height: 207.99px;
}
table{
border-collapse: collapse;
margin-left: 6%;
text-align: center;
}
table td, table th
{
border: 1px solid #cad9ea;
color: #666;
height: 30px;
}
table thead th
{
background-color: #CCE8EB;
width: 260px;
}
table tr:nth-child(odd)
{
background: #fff;
}
table tr:nth-child(even)
{
background: #F5FAFA;
}
.lx{
display: inline-block;
margin-left: 5%;
}
.bt{
font-weight:bold;
color:#5b91d6;
}
.blog{
border:solid;
border-color:#5784d0;
width: 10%;
margin-left: 65.6%;
position: fixed;
top: 0;
}
================================================
FILE: static/docs/program.txt
================================================
magical_spider,一个几乎适用于所有web端站点的采集方案。
## 项目简介
1、主要使用谷歌驱动,但非常规driver.page_source。
2、通过 flask 远程调用 chromedriver 实现 XMLHttpRequest 传输数据。
3、通过sqlite 记录和管理任务状态。
4、通过undetected_selenium+stealth.min.js绕过一些校验。
5、测试通过瑞数、加速乐等cookie加密,以及头条系的请求过程加密。
6、支持 linux 部署,支持多任务。
## 项目原理
打造一个近乎真实的浏览器环境,去完成网站内部环境的请求加载,直接返回响应内容供本地调用。
## 项目声明
1、整合了一些其他开源项目,仅供学习参考。
2、适用于应急场景或小量任务,方便便捷,若时间充裕建议通过逆向处理。
3、如有风控校验需自行解决,滑块可参考middlerware.py。
## 备注
1、index页可以查看和管理当前运行中的任务,也能查看系统内存和磁盘使用情况。
2、demo文件夹中有任务流程汇总runflow.py,以及抖音、药监局案例,单任务和多任务示例。
3、运行前配置服务信息和驱动路径,启动flask服务后再执行任务。
================================================
FILE: static/docs/部署.txt
================================================
Linux部署
1.安装chrome (自行选择安装位置)
yum install https://dl.google.com/linux/direct/google-chrome-stable_current_x86_64.rpm
2.检查chrome的版本
google-chrome --version
3.安装对应版本的 chromedriver_linux64
比如我的chrome版本是104.0.5112.79
wget https://npm.taobao.org/mirrors/chromedriver/104.0.5112.79/chromedriver_linux64.zip
4.解压
unzip chromedriver_linux64
5.授权
chmod 777 chromedriver
6.修改项目代码settings.py中的chromedriver路径
7.安装python依赖后启动flask项目
- Python依赖 :flask、sqlite3、selenium、websockets、opencv-python、numpy
- flask启动方式:python3 server.py
8.开启服务器端口访问权限
9.运行项目测试
================================================
FILE: templates/index.html
================================================
<!DOCTYPE html>
<html lang="en" >
<head>
<meta charset="UTF-8">
<title>MagicalSpider</title>
</head>
<link rel="stylesheet" href="/static/css/index.css">
<body>
<div>
<br>
<br>
<br>
<br>
<br>
<p>
【MagicalSpider】 神奇的蜘蛛🕷,一个几乎适用于所有web端站点的采集方案。(比如瑞数、加速乐、头条系、五秒盾等)
</p>
<br>
<p>
诞生背景:
2022年全球变暖,各行业内卷严重,爬虫届更是入门抖音+瑞数,导致发生 [从入门到放弃] 。
所以吾辈当自强,重铸selenium荣光 (本段纯属瞎扯) 。
</p>
<br>
<p>
magical_spider:<a style="color: aqua" target="_blank" href="/static/docs/program.txt">项目说明</a>
、<a style="color: aqua" target="_blank" href="/static/docs/部署.txt">部署文档</a>
</p>
<br>
</div>
<p>运行中的任务:</p>
<div>
<table>
<thead>
<tr>
<th>任务名</th>
<th>任务ID</th>
<th>任务地址</th>
<th>创建时间</th>
<th>任务管理</th>
</tr>
{% for p in process %}
<tr>
<td>{{ p[1] }}</td>
<td>{{ p[0] }}</td>
<td>{{ p[2] }}</td>
<td>{{ p[3][:19] }}</td>
{% if p[0] %}
<td><a href="/delete/{{ p[1] }}">删除任务</a></td>
{% else %}
<td></td>
{% endif %}
</tr>
{% endfor %}
</thead>
</table>
</div>
<br>
<div style="text-align: center;margin-top: 10%">
<div class="blog">
<a href="http://www.lxspider.com" target="_blank">
<p style="color: aqua;">个人博客:lxspider</p>
</a>
</div>
<div class="lx" style="margin-left: 0%">
<p class="bt">公众号《Pythonlx》</p>
<p>
<img src="http://www.lxspider.com/wp-content/uploads/2022/07/qrcode_for_gh_b237fabfe467_258.jpg" alt="">
</p>
</div>
<div class="lx">
<p class="bt">内存使用情况</p>
<p>总内存: {{ virtual_memory.total_format }}</p>
<p>可用内存: {{ virtual_memory.available_format }}</p>
<p>已使用内存: {{ virtual_memory.used_format }}</p>
<p>内存使用率: {{ virtual_memory.percent }}%</p>
<br>
<br>
</div>
<div class="lx">
<p class="bt">磁盘使用情况</p>
<p>磁盘总容量: {{ disk_usage.total_format }}</p>
<p>可用空间: {{ disk_usage.free_format }}</p>
<p>已使用空间: {{ disk_usage.used_format }}</p>
<p>磁盘使用率: {{ disk_usage.percent }}%</p>
<br>
<br>
</div>
{# <div class="lx" style="margin-left: 5%">#}
{# <p class="bt">微信赞助</p>#}
{# <p>#}
{# <img src="http://www.lxspider.com/wp-content/uploads/2022/07/%E6%97%A0%E6%A0%87%E9%A2%98.png" alt="">#}
{# </p>#}
{# </div>#}
</div>
</body>
</html>
================================================
FILE: undetected_chromedriver/__init__.py
================================================
#!/usr/bin/env python3
#from __future__ import annotations
import subprocess
"""
888 888 d8b
888 888 Y8P
888 888
.d8888b 88888b. 888d888 .d88b. 88888b.d88b. .d88b. .d88888 888d888 888 888 888 .d88b. 888d888
d88P" 888 "88b 888P" d88""88b 888 "888 "88b d8P Y8b d88" 888 888P" 888 888 888 d8P Y8b 888P"
888 888 888 888 888 888 888 888 888 88888888 888 888 888 888 Y88 88P 88888888 888
Y88b. 888 888 888 Y88..88P 888 888 888 Y8b. Y88b 888 888 888 Y8bd8P Y8b. 888
"Y8888P 888 888 888 "Y88P" 888 888 888 "Y8888 "Y88888 888 888 Y88P "Y8888 888 88888888
by UltrafunkAmsterdam (https://github.com/ultrafunkamsterdam)
"""
"""
user_data_dir、language、webdriver、Webelement、no-first-run、window-size、log-level、start-maximized、no-sandbox
"""
__version__ = "3.1.5r4"
import json
import logging
import os
import re
import shutil
import sys
import tempfile
import time
import inspect
import threading
import selenium.webdriver.chrome.service
import selenium.webdriver.chrome.webdriver
import selenium.webdriver.common.service
import selenium.webdriver.remote.webdriver
from .cdp import CDP
from .options import ChromeOptions
from .patcher import IS_POSIX
from .patcher import Patcher
from .reactor import Reactor
from .dprocess import start_detached
# Names exported by ``from undetected_chromedriver import *``.
__all__ = (
    "Chrome",
    "ChromeOptions",
    "Patcher",
    "Reactor",
    "CDP",
    "find_chrome_executable",
)
# Package logger ("uc"); its level is copied from the root logger's effective
# level once, at import time.
logger = logging.getLogger("uc")
logger.setLevel(logging.getLogger().getEffectiveLevel())
class Chrome(selenium.webdriver.chrome.webdriver.WebDriver):
    """
    Controls the ChromeDriver and allows you to drive the browser.

    The webdriver file will be downloaded by this module automatically,
    you do not need to specify this. However, you may if you wish.

    Attributes
    ----------

    Methods
    -------
    reconnect()
        this can be useful in case of heavy detection methods
        - stops the chromedriver service which runs in the background
        - starts the chromedriver service which runs in the background
        - recreates the session

    start_session(capabilities=None, browser_profile=None)
        differs from the regular method in that it does not
        require a capabilities argument. The capabilities are automatically
        recreated from the options at creation time.

    --------------------------------------------------------------------------
    NOTE:
        Chrome has everything included to work out of the box.
        It does not `need` customizations.
        Any customizations MAY trigger bot mitigation systems.
    --------------------------------------------------------------------------
    """

    # Class-level set; nothing in the visible part of this module adds to it.
    _instances = set()
    # Class-level default for the WebDriver session id.
    session_id = None
    # Class-level default; __init__ overwrites it per instance and
    # __getattribute__ reads it to decide whether to log method calls.
    debug = False
def __init__(
    self,
    options=None,
    user_data_dir=None,
    driver_executable_path=None,
    browser_executable_path=None,
    port=0,
    enable_cdp_events=False,
    service_args=None,
    desired_capabilities=None,
    advanced_elements=False,
    service_log_path=None,
    keep_alive=True,
    log_level=0,
    headless=False,
    version_main=None,
    patcher_force_close=False,
    suppress_welcome=True,
    use_subprocess=False,
    debug=False,
    **kw
):
    """
    Create a new instance of the chrome driver.

    Patches the chromedriver binary, launches the browser itself (detached
    or as a subprocess) with a remote-debugging port, and then attaches the
    selenium WebDriver to that already-running browser.

    Parameters
    ----------
    options: ChromeOptions, optional, default: None - automatic useful defaults
        an instance of ChromeOptions, mainly to customize browser behavior.
        Anything other than the default, for example extensions or startup
        options, is not supported in case of failure and probably lowers
        undetectability. An options object cannot be reused for a second
        driver (RuntimeError).

    user_data_dir: str, optional, default: None (creates temp profile)
        if given, it is used as the chrome profile directory and the
        automatic removal at exit is turned off.

    driver_executable_path: str, optional, default: None (=downloads and patches new binary)

    browser_executable_path: str, optional, default: None - use find_chrome_executable
        path to the browser executable.

    port: int, optional, default: 0
        port for the driver service; 0 lets a free port be chosen.

    enable_cdp_events: bool, default: False
        enables the performance/browser logs and starts a Reactor so that
        you can subscribe to CDP events with
            driver.add_cdp_listener("Network.dataReceived", yourcallback)
        where yourcallback is a callable accepting exactly 1 dict.

    service_args: list of str, optional, default: None
        arguments to pass to the driver service.

    desired_capabilities: dict, optional, default: None - derived from options

    advanced_elements: bool, optional, default: False
        use the package's WebElement class, whose repr shows the element's
        html instead of session/element ids.

    service_log_path: str, optional, default: None
        path to log information from the driver.

    keep_alive: bool, optional, default: True
        whether to configure the remote connection to use HTTP keep-alive.

    log_level: int, optional, default: 0 (=adapt to python's root log level)
        value for chrome's ``--log-level``. When 0, the root logger's
        effective level divided by 10 is used.

    headless: bool, optional, default: False
        can also be specified in the options instance.
        warning: this lowers undetectability and is not fully supported.

    version_main: int, optional, default: None (=auto)
        major version of Chrome to target, e.g. 87 for all versions of 87.

    patcher_force_close: bool, optional, default: False
        forwarded to the Patcher as ``force``.

    suppress_welcome: bool, optional, default: True
        adds --no-default-browser-check and --no-first-run.

    use_subprocess: bool, optional, default: False
        False starts Chrome through start_detached (its own process);
        True starts it with subprocess.Popen as a child of this process.

    debug: bool, optional, default: False
        when True, every bound-method access on the driver is logged.

    **kw
        accepted for call compatibility; not used.

    Raises
    ------
    RuntimeError
        if ``options`` was already used to create another driver.
    """
    self.debug = debug
    patcher = Patcher(
        executable_path=driver_executable_path,
        force=patcher_force_close,
        version_main=version_main,
    )
    patcher.auto()
    self.patcher = patcher

    if not options:
        options = ChromeOptions()

    try:
        if hasattr(options, "_session") and options._session is not None:
            # prevent reuse of options,
            # as it just appends arguments, not replace them
            # you'll get conflicts starting chrome
            raise RuntimeError("you cannot reuse the ChromeOptions object")
    except AttributeError:
        pass

    options._session = self

    debug_port = selenium.webdriver.common.service.utils.free_port()
    debug_host = "127.0.0.1"

    if not options.debugger_address:
        options.debugger_address = "%s:%d" % (debug_host, debug_port)

    if enable_cdp_events:
        options.set_capability(
            "goog:loggingPrefs", {"performance": "ALL", "browser": "ALL"}
        )

    options.add_argument("--remote-debugging-host=%s" % debug_host)
    options.add_argument("--remote-debugging-port=%s" % debug_port)

    if user_data_dir:
        options.add_argument("--user-data-dir=%s" % user_data_dir)

    language, keep_user_data_dir = None, bool(user_data_dir)

    # see if a language or a custom user profile is specified in options
    for arg in options.arguments:
        if "lang" in arg:
            m = re.search("(?:--)?lang(?:[ =])?(.*)", arg)
            try:
                language = m[1]
            except IndexError:
                logger.debug("will set the language to en-US,en;q=0.9")
                language = "en-US,en;q=0.9"

        if "user-data-dir" in arg:
            m = re.search("(?:--)?user-data-dir(?:[ =])?(.*)", arg)
            try:
                user_data_dir = m[1]
                logger.debug(
                    "user-data-dir found in user argument %s => %s" % (arg, m[1])
                )
                keep_user_data_dir = True
            except IndexError:
                logger.debug(
                    "no user data dir could be extracted from supplied argument %s "
                    % arg
                )

    if not user_data_dir:
        # backward compatibility
        # check if an old uc.ChromeOptions is used, and extract the user data dir
        if hasattr(options, "user_data_dir") and getattr(
            options, "user_data_dir", None
        ):
            import warnings

            warnings.warn(
                "using ChromeOptions.user_data_dir might stop working in future versions."
                "use uc.Chrome(user_data_dir='/xyz/some/data') in case you need existing profile folder"
            )
            options.add_argument("--user-data-dir=%s" % options.user_data_dir)
            keep_user_data_dir = True
            # log the value that was actually found (the local is still empty here)
            logger.debug(
                "user_data_dir property found in options object: %s"
                % options.user_data_dir
            )
        else:
            user_data_dir = os.path.normpath(tempfile.mkdtemp())
            keep_user_data_dir = False
            arg = "--user-data-dir=%s" % user_data_dir
            options.add_argument(arg)
            logger.debug(
                "created a temporary folder in which the user-data (profile) will be stored during this\n"
                "session, and added it to chrome startup arguments: %s" % arg
            )

    if not language:
        try:
            import locale

            language = locale.getdefaultlocale()[0].replace("_", "-")
        except Exception:
            pass
        if not language:
            language = "en-US"

    options.add_argument("--lang=%s" % language)

    if not options.binary_location:
        options.binary_location = (
            browser_executable_path or find_chrome_executable()
        )

    self._delay = 3
    self.user_data_dir = user_data_dir
    self.keep_user_data_dir = keep_user_data_dir

    if suppress_welcome:
        options.arguments.extend(["--no-default-browser-check", "--no-first-run"])

    if headless or options.headless:
        options.headless = True
        options.add_argument("--window-size=1920,1080")
        options.add_argument("--start-maximized")
        options.add_argument("--no-sandbox")
        # fixes "could not connect to chrome" error when running
        # on linux using privileged user like root (which i don't recommend)

    # `%` binds tighter than `or`, so the fallback has to be selected
    # *before* formatting; otherwise the formatted string is always truthy
    # and the python log level is never consulted.
    effective_log_level = (
        log_level or divmod(logging.getLogger().getEffectiveLevel(), 10)[0]
    )
    options.add_argument("--log-level=%d" % effective_log_level)

    if hasattr(options, "handle_prefs"):
        options.handle_prefs(user_data_dir)

    # fix exit_type flag to prevent tab-restore nag
    try:
        with open(
            os.path.join(user_data_dir, "Default/Preferences"),
            encoding="latin1",
            mode="r+",
        ) as fs:
            config = json.load(fs)
            if config["profile"]["exit_type"] is not None:
                # fixing the restore-tabs-nag
                config["profile"]["exit_type"] = None
            fs.seek(0, 0)
            json.dump(config, fs)
            # the rewritten JSON may be shorter than the old content;
            # drop the leftover tail so the file stays valid JSON
            fs.truncate()
            logger.debug("fixed exit_type flag")
    except Exception:
        # best effort: a missing/unreadable Preferences file is not an error
        logger.debug("did not find a bad exit_type flag ")

    self.options = options

    if not desired_capabilities:
        desired_capabilities = options.to_capabilities()

    if not use_subprocess:
        self.browser_pid = start_detached(
            options.binary_location, *options.arguments
        )
    else:
        browser = subprocess.Popen(
            [options.binary_location, *options.arguments],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            close_fds=IS_POSIX,
        )
        self.browser_pid = browser.pid

    super(Chrome, self).__init__(
        executable_path=patcher.executable_path,
        port=port,
        options=options,
        service_args=service_args,
        desired_capabilities=desired_capabilities,
        service_log_path=service_log_path,
        keep_alive=keep_alive,
    )

    self.reactor = None

    if enable_cdp_events:
        if logging.getLogger().getEffectiveLevel() == logging.DEBUG:
            # raise selenium's remote-connection logger to INFO (20)
            logging.getLogger(
                "selenium.webdriver.remote.remote_connection"
            ).setLevel(20)
        reactor = Reactor(self)
        reactor.start()
        self.reactor = reactor

    if advanced_elements:
        from .webelement import WebElement

        self._web_element_cls = WebElement

    if options.headless:
        self._configure_headless()
def __getattribute__(self, item):
    """
    Attribute lookup hook used for call tracing.

    With ``debug`` off this is a plain lookup. With ``debug`` on, bound
    methods are returned wrapped in a function that logs the call (qualified
    name, args, kwargs) at debug level before delegating.
    """
    lookup = super().__getattribute__
    if not lookup("debug"):
        return lookup(item)

    import inspect

    original = lookup(item)
    # only bound methods get the logging wrapper; everything else passes through
    if not inspect.ismethod(original) or inspect.isclass(original):
        return original

    def newfunc(*args, **kwargs):
        logger.debug(
            "calling %s with args %s and kwargs %s\n"
            % (original.__qualname__, args, kwargs)
        )
        return original(*args, **kwargs)

    return newfunc
def _configure_headless(self):
    """
    Replace ``self.get`` with a wrapper that applies headless patches.

    Before each navigation the wrapper checks ``navigator.webdriver`` in the
    current page; when it is truthy it registers scripts (via CDP) that hide
    ``navigator.webdriver``, strips "Headless" from the user agent and
    reports ``maxTouchPoints`` as 1. It then calls the original ``get``.
    """
    # keep a reference to the un-wrapped get so the wrapper can delegate to it
    orig_get = self.get
    logger.info("setting properties for headless")

    def get_wrapped(*args, **kwargs):
        if self.execute_script("return navigator.webdriver"):
            logger.info("patch navigator.webdriver")
            # wrap window.navigator in a Proxy that reports webdriver as false
            self.execute_cdp_cmd(
                "Page.addScriptToEvaluateOnNewDocument",
                {
                    "source": """
Object.defineProperty(window, 'navigator', {
value: new Proxy(navigator, {
has: (target, key) => (key === 'webdriver' ? false : key in target),
get: (target, key) =>
key === 'webdriver' ?
false :
typeof target[key] === 'function' ?
target[key].bind(target) :
target[key]
})
});
"""
                },
            )
            logger.info("patch user-agent string")
            # reuse the browser's own user agent, minus the "Headless" marker
            self.execute_cdp_cmd(
                "Network.setUserAgentOverride",
                {
                    "userAgent": self.execute_script(
                        "return navigator.userAgent"
                    ).replace("Headless", "")
                },
            )
            self.execute_cdp_cmd(
                "Page.addScriptToEvaluateOnNewDocument",
                {
                    "source": """
Object.defineProperty(navigator, 'maxTouchPoints', {
get: () => 1
})"""
                },
            )
        return orig_get(*args, **kwargs)

    # instance attribute shadows the class-level get() from now on
    self.get = get_wrapped
def __dir__(self):
    """Return the attribute names as computed by ``object.__dir__``."""
    attribute_names = object.__dir__(self)
    return attribute_names
def _get_cdc_props(self):
    """
    Return the names of properties on ``window`` (and its prototype chain)
    matching ``.+_.+_(Array|Promise|Symbol)``, as found by a script executed
    in the current page.
    """
    probe_script = """
let objectToInspect = window,
result = [];
while(objectToInspect !== null)
{ result = result.concat(Object.getOwnPropertyNames(objectToInspect));
objectToInspect = Object.getPrototypeOf(objectToInspect); }
return result.filter(i => i.match(/.+_.+_(Array|Promise|Symbol)/ig))
"""
    return self.execute_script(probe_script)
def _hook_remove_cdc_props(self):
    """
    Register a script that deletes ``window`` properties matching
    ``.+_.+_(Array|Promise|Symbol)`` and logs each removal to the console.
    """
    removal_script = """
let objectToInspect = window,
result = [];
while(objectToInspect !== null)
{ result = result.concat(Object.getOwnPropertyNames(objectToInspect));
objectToInspect = Object.getPrototypeOf(objectToInspect); }
result.forEach(p => p.match(/.+_.+_(Array|Promise|Symbol)/ig)
&&delete window[p]&&console.log('removed',p))
"""
    # This CDP command makes every page opened in the current tab run the
    # given JavaScript before the page's own content is loaded.
    self.execute_cdp_cmd(
        "Page.addScriptToEvaluateOnNewDocument",
        {"source": removal_script},
    )
def get(self, url):
    """
    Navigate to ``url``.

    If the current page exposes any of the properties reported by
    ``_get_cdc_props``, the removal hook is registered first.
    """
    leaked_props = self._get_cdc_props()
    if leaked_props:
        self._hook_remove_cdc_props()
    return super().get(url)
def add_cdp_listener(self, event_name, callback):
    """
    Subscribe ``callback`` to the CDP event ``event_name``.

    Returns the reactor's handler collection, or ``False`` when no Reactor
    is attached to this driver.
    """
    reactor = self.reactor
    if not reactor or not isinstance(reactor, Reactor):
        return False
    reactor.add_event_handler(event_name, callback)
    return reactor.handlers
def clear_cdp_listeners(self):
    """Remove every registered CDP event handler; no-op without a Reactor."""
    reactor = self.reactor
    if not reactor or not isinstance(reactor, Reactor):
        return
    reactor.handlers.clear()
def tab_new(self, url: str):
    """
    Open ``url`` in a new tab through the browser's debugging HTTP endpoint.

    A ``CDP`` client is created from ``self.options`` on first use and kept
    on the instance as ``self.cdp`` so later calls reuse it. Previously the
    client was only built when ``self.cdp`` was missing and never stored, so
    an existing ``self.cdp`` was never used.

    Parameters
    ----------
    url : str
        address to open in the new tab

    Returns
    -------
    None
    """
    if not hasattr(self, "cdp"):
        self.cdp = CDP(self.options)
    self.cdp.tab_new(url)
def reconnect(self, timeout=0.1):
    """
    Stop the driver service, wait ``timeout`` seconds, start it again and
    open a new session.

    Each step is best effort: an exception is logged at debug level and the
    next step is still attempted.
    """

    def _best_effort(step):
        # run one step, logging (not raising) whatever goes wrong
        try:
            step()
        except Exception as e:
            logger.debug(e)

    _best_effort(lambda: self.service.stop())
    time.sleep(timeout)
    _best_effort(lambda: self.service.start())
    _best_effort(lambda: self.start_session())
def start_session(self, capabilities=None, browser_profile=None):
    """
    Start a new session; ``capabilities`` default to the ones derived from
    ``self.options``.

    The call is dispatched to the implementation that follows selenium's
    chrome WebDriver in the MRO (i.e. that class's own override is skipped).
    """
    effective_caps = capabilities if capabilities else self.options.to_capabilities()
    chrome_webdriver_cls = selenium.webdriver.chrome.webdriver.WebDriver
    super(chrome_webdriver_cls, self).start_session(effective_caps, browser_profile)
    # super(Chrome, self).start_session(capabilities, browser_profile)
def quit(self):
    """
    Shut everything down: kill the driver service process, signal the
    reactor to stop, send signal 15 to the browser pid and, for temporary
    profiles, remove the user data directory (up to 5 attempts).
    """
    logger.debug("closing webdriver")
    driver_process = getattr(getattr(self, "service", None), "process", None)
    if driver_process:
        driver_process.kill()

    try:
        reactor = self.reactor
        if reactor and isinstance(reactor, Reactor):
            logger.debug("shutting down reactor")
            reactor.event.set()
    except Exception:  # noqa
        pass

    try:
        logger.debug("killing browser")
        os.kill(self.browser_pid, 15)
    except TimeoutError as e:
        logger.debug(e, exc_info=True)
    except Exception:  # noqa
        pass

    # only profiles this driver created itself are removed
    remove_profile = (
        hasattr(self, "keep_user_data_dir")
        and hasattr(self, "user_data_dir")
        and not self.keep_user_data_dir
    )
    if remove_profile:
        attempts_left = 5
        while attempts_left:
            attempts_left -= 1
            try:
                shutil.rmtree(self.user_data_dir, ignore_errors=False)
            except FileNotFoundError:
                # fall through to the pause and try again
                pass
            except (RuntimeError, OSError, PermissionError) as e:
                logger.debug(
                    "When removing the temp profile, a %s occured: %s\nretrying..."
                    % (e.__class__.__name__, e)
                )
            else:
                logger.debug("successfully removed %s" % self.user_data_dir)
                break
            time.sleep(0.1)

    # dereference patcher, so patcher can start cleaning up as well.
    # this must come last, otherwise it will throw 'in use' errors
    self.patcher = None
def __del__(self):
    """
    Finalizer: attempt the parent class's quit (ignoring any failure), then
    run this class's own quit.
    """
    try:
        super().quit()
        # self.service.process.kill()
    except:  # noqa
        # the parent's quit is best effort here
        pass
    self.quit()
def __enter__(self):
    """Context-manager entry; hands back the driver itself."""
    driver = self
    return driver
def __exit__(self, exc_type, exc_val, exc_tb):
    """
    Context-manager exit: stop the driver service, wait ``self._delay``
    seconds, start the service again and open a new session.

    Returns None, so an exception raised inside the ``with`` body propagates.
    """
    driver_service = self.service
    driver_service.stop()
    time.sleep(self._delay)
    driver_service.start()
    self.start_session()
def __hash__(self):
    """Hash the driver by the debugger address stored in its options."""
    address = self.options.debugger_address
    return hash(address)
def find_chrome_executable():
    """
    Find a chrome, chrome beta, chrome canary or chromium executable.

    On POSIX every ``PATH`` entry is probed for the known binary names (in
    ``PATH`` order), followed on macOS by the two standard application
    bundle locations. Elsewhere the Chrome install folders below
    PROGRAMFILES, PROGRAMFILES(X86) and LOCALAPPDATA are probed; environment
    variables that are not set are skipped.

    Returns
    -------
    executable_path : str or None
        the normalized path of the first candidate that exists and is
        executable, or None when nothing was found
    """
    # a list (not a set) so the probing order - and therefore the result -
    # is deterministic
    candidates = []
    if IS_POSIX:
        binary_names = (
            "google-chrome",
            "chromium",
            "chromium-browser",
            "chrome",
            "google-chrome-stable",
        )
        # PATH may be unset; treat that as "no entries"
        for folder in os.environ.get("PATH", "").split(os.pathsep):
            if not folder:
                continue
            for binary_name in binary_names:
                candidates.append(os.sep.join((folder, binary_name)))
        if "darwin" in sys.platform:
            candidates.extend(
                [
                    "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
                    "/Applications/Chromium.app/Contents/MacOS/Chromium",
                ]
            )
    else:
        install_folders = (
            "Google/Chrome/Application",
            "Google/Chrome Beta/Application",
            "Google/Chrome Canary/Application",
        )
        for base_dir in map(
            os.environ.get, ("PROGRAMFILES", "PROGRAMFILES(X86)", "LOCALAPPDATA")
        ):
            if base_dir is None:
                # variable not defined on this system; joining None would raise
                continue
            for install_folder in install_folders:
                candidates.append(
                    os.sep.join((base_dir, install_folder, "chrome.exe"))
                )
    for candidate in candidates:
        if os.path.exists(candidate) and os.access(candidate, os.X_OK):
            return os.path.normpath(candidate)
    return None
================================================
FILE: undetected_chromedriver/_compat.py
================================================
#!/usr/bin/env python3
# this module is part of undetected_chromedriver
"""
888 888 d8b
888 888 Y8P
888 888
.d8888b 88888b. 888d888 .d88b. 88888b.d88b. .d88b. .d88888 888d888 888 888 888 .d88b. 888d888
d88P" 888 "88b 888P" d88""88b 888 "888 "88b d8P Y8b d88" 888 888P" 888 888 888 d8P Y8b 888P"
888 888 888 888 888 888 888 888 888 88888888 888 888 888 888 Y88 88P 88888888 888
Y88b. 888 888 888 Y88..88P 888 888 888 Y8b. Y88b 888 888 888 Y8bd8P Y8b. 888
"Y8888P 888 888 888 "Y88P" 888 888 888 "Y8888 "Y88888 888 888 Y88P "Y8888 888 88888888
by UltrafunkAmsterdam (https://github.com/ultrafunkamsterdam)
"""
import io
import logging
import os
import random
import re
import string
import sys
import zipfile
from distutils.version import LooseVersion
from urllib.request import urlopen, urlretrieve
from selenium.webdriver import Chrome as _Chrome, ChromeOptions as _ChromeOptions
# Module-wide chromedriver major version; a falsy value means "not set"
# (ChromeDriverManager.__init__ then falls back to its other sources).
TARGET_VERSION = 0
# Logger named "uc".
logger = logging.getLogger("uc")
class Chrome:
    """
    Factory for a selenium Chrome driver with anti-detection tweaks.

    Instantiating this class returns a ``selenium.webdriver.Chrome`` object
    (not an instance of this class) whose ``get`` is wrapped to hide
    ``navigator.webdriver`` and whose user agent has "Headless" removed.
    """

    def __new__(cls, *args, emulate_touch=False, **kwargs):
        """
        Build and return the patched selenium driver.

        :param emulate_touch: when True, pages see ``navigator.maxTouchPoints``
            as 1
        :param args: forwarded to ChromeDriverManager and to the selenium driver
        :param kwargs: forwarded likewise; ``executable_path`` and ``options``
            are filled in when missing
        :return: the configured selenium Chrome instance
        """
        if not ChromeDriverManager.installed:
            ChromeDriverManager(*args, **kwargs).install()
        if not ChromeDriverManager.selenium_patched:
            ChromeDriverManager(*args, **kwargs).patch_selenium_webdriver()
        if not kwargs.get("executable_path"):
            kwargs["executable_path"] = "./{}".format(
                ChromeDriverManager(*args, **kwargs).executable_path
            )
        if not kwargs.get("options"):
            kwargs["options"] = ChromeOptions()

        # create the real selenium driver directly, bypassing this class
        instance = object.__new__(_Chrome)
        instance.__init__(*args, **kwargs)

        instance._orig_get = instance.get

        def _get_wrapped(*args, **kwargs):
            # hide navigator.webdriver on upcoming documents when the
            # current page still exposes it
            if instance.execute_script("return navigator.webdriver"):
                instance.execute_cdp_cmd(
                    "Page.addScriptToEvaluateOnNewDocument",
                    {
                        "source": """
Object.defineProperty(window, 'navigator', {
value: new Proxy(navigator, {
has: (target, key) => (key === 'webdriver' ? false : key in target),
get: (target, key) =>
key === 'webdriver'
? undefined
: typeof target[key] === 'function'
? target[key].bind(target)
: target[key]
})
});
"""
                    },
                )
            return instance._orig_get(*args, **kwargs)

        # assigned once (the original repeated this statement three times)
        instance.get = _get_wrapped

        original_user_agent_string = instance.execute_script(
            "return navigator.userAgent"
        )
        instance.execute_cdp_cmd(
            "Network.setUserAgentOverride",
            {
                "userAgent": original_user_agent_string.replace("Headless", ""),
            },
        )
        if emulate_touch:
            instance.execute_cdp_cmd(
                "Page.addScriptToEvaluateOnNewDocument",
                {
                    "source": """
Object.defineProperty(navigator, 'maxTouchPoints', {
get: () => 1
})"""
                },
            )
        logger.info(f"starting undetected_chromedriver2.Chrome({args}, {kwargs})")
        return instance
class ChromeOptions:
    """
    Factory returning a pre-configured ``selenium.webdriver.ChromeOptions``
    (not an instance of this class).
    """

    def __new__(cls, *args, **kwargs):
        """
        Ensure the driver is installed and selenium is patched, then return
        selenium options with the automation markers switched off.
        """
        manager_cls = ChromeDriverManager
        if not manager_cls.installed:
            manager_cls(*args, **kwargs).install()
        if not manager_cls.selenium_patched:
            manager_cls(*args, **kwargs).patch_selenium_webdriver()

        # build the real selenium options object, bypassing this class
        selenium_options = object.__new__(_ChromeOptions)
        selenium_options.__init__()
        selenium_options.add_argument("start-maximized")
        selenium_options.add_experimental_option(
            "excludeSwitches", ["enable-automation"]
        )
        selenium_options.add_argument("--disable-blink-features=AutomationControlled")
        return selenium_options
class ChromeDriverManager(object):
    """
    Downloads the chromedriver binary, patches its ``cdc_`` marker and
    patches selenium's Chrome / ChromeOptions names for the current session.
    """

    # True once a binary has been patched in this python session
    installed = False
    # True once selenium.webdriver.Chrome / ChromeOptions were replaced
    selenium_patched = False
    # major chromedriver version; resolved in __init__
    target_version = None
    DL_BASE = "https://chromedriver.storage.googleapis.com/"

    def __init__(self, executable_path=None, target_version=None, *args, **kwargs):
        """
        :param executable_path: path of the chromedriver binary; defaults to
            the platform's executable name in the working directory
        :param target_version: major version to use; overrides the module
            level TARGET_VERSION. When neither is set the latest release is
            looked up online.
        :param args: ignored
        :param kwargs: ignored
        """
        _platform = sys.platform

        if TARGET_VERSION:
            # use global if set
            self.target_version = TARGET_VERSION

        if target_version:
            # use explicitly passed target
            self.target_version = target_version  # user override

        if not self.target_version:
            # none of the above (default) and just get current version
            self.target_version = self.get_release_version_number().version[
                0
            ]  # only major version int

        self._base = base_ = "chromedriver{}"

        # Only windows carries an extension. Formatting unconditionally keeps
        # the name sane on platforms that are not listed below (previously
        # the raw "chromedriver{}" template leaked through there).
        exe_name = base_.format(".exe" if _platform == "win32" else "")

        # platform suffix as used in the download archive names
        if _platform in ("linux",):
            _platform += "64"
        elif _platform in ("darwin",):
            _platform = "mac64"

        self.platform = _platform
        self.executable_path = executable_path or exe_name
        self._exe_name = exe_name

    def patch_selenium_webdriver(self_):
        """
        Patches selenium package Chrome, ChromeOptions classes for current session

        :return:
        """
        import selenium.webdriver.chrome.service
        import selenium.webdriver

        selenium.webdriver.Chrome = Chrome
        selenium.webdriver.ChromeOptions = ChromeOptions
        logger.info("Selenium patched. Safe to import Chrome / ChromeOptions")
        self_.__class__.selenium_patched = True

    def install(self, patch_selenium=True):
        """
        Initialize the patch

        This will:
         download chromedriver if not present
         patch the downloaded chromedriver
         patch selenium package if <patch_selenium> is True (default)

        :param patch_selenium: patch selenium webdriver classes for Chrome and ChromeDriver (for current python session)
        :return:
        """
        if not os.path.exists(self.executable_path):
            self.fetch_chromedriver()
        if not self.__class__.installed:
            if self.patch_binary():
                self.__class__.installed = True

        if patch_selenium:
            self.patch_selenium_webdriver()

    def get_release_version_number(self):
        """
        Gets the latest major version available, or the latest major version of self.target_version if set explicitly.

        :return: LooseVersion parsed from the download server's response
        """
        path = (
            "LATEST_RELEASE"
            if not self.target_version
            else f"LATEST_RELEASE_{self.target_version}"
        )
        # context manager so the HTTP response is always closed
        with urlopen(self.__class__.DL_BASE + path) as response:
            return LooseVersion(response.read().decode())

    def fetch_chromedriver(self):
        """
        Downloads ChromeDriver from source and unpacks the executable

        :return: the existing executable path when already present, otherwise
            the name of the unpacked executable
        """
        # check the local file first: no need to hit the network (which may
        # fail) when there is nothing to download
        if os.path.exists(self.executable_path):
            return self.executable_path

        base_ = self._base
        zip_name = base_.format(".zip")
        ver = self.get_release_version_number().vstring
        urlretrieve(
            f"{self.__class__.DL_BASE}{ver}/{base_.format(f'_{self.platform}')}.zip",
            filename=zip_name,
        )
        with zipfile.ZipFile(zip_name) as zf:
            zf.extract(self._exe_name)
        os.remove(zip_name)
        if sys.platform != "win32":
            os.chmod(self._exe_name, 0o755)
        return self._exe_name

    @staticmethod
    def random_cdc():
        """
        Build a random 26-byte replacement for the ``cdc_`` marker.

        :return: bytes of 26 ascii letters where index 3 is ``_``, index 2
            repeats index 0 and indexes 20-21 are upper case
        """
        cdc = random.choices(string.ascii_lowercase, k=26)
        cdc[-6:-4] = map(str.upper, cdc[-6:-4])
        cdc[2] = cdc[0]
        cdc[3] = "_"
        return "".join(cdc).encode()

    def patch_binary(self):
        """
        Patches the ChromeDriver binary

        :return: number of lines that were rewritten (0 when none matched)
        """
        linect = 0
        replacement = self.random_cdc()
        with io.open(self.executable_path, "r+b") as fh:
            for line in iter(fh.readline, b""):
                if b"cdc_" in line:
                    # step back over the line just read and overwrite it in
                    # place; the replacement has the same length as the match
                    fh.seek(-len(line), 1)
                    newline = re.sub(b"cdc_.{22}", replacement, line)
                    fh.write(newline)
                    linect += 1
            return linect
def install(executable_path=None, target_version=None, *args, **kwargs):
    """
    Convenience wrapper: create a ChromeDriverManager with the given
    arguments and run its ``install()``.
    """
    manager = ChromeDriverManager(executable_path, target_version, *args, **kwargs)
    manager.install()
================================================
FILE: undetected_chromedriver/cdp.py
================================================
#!/usr/bin/env python3
# this module is part of undetected_chromedriver
import json
import logging
from collections.abc import Mapping, Sequence
import requests
import websockets
# Module-level logger named after this module.
log = logging.getLogger(__name__)
class CDPObject(dict):
    """
    A dict whose keys are also readable as attributes.

    Nested dicts, and dicts found directly inside list values, are converted
    to CDPObject as well.
    """

    def __init__(self, *a, **k):
        """Accepts the same arguments as ``dict``."""
        super().__init__(*a, **k)
        # share storage between item access and attribute access
        self.__dict__ = self
        # snapshot the items: values are replaced while walking
        for key, value in list(self.items()):
            if isinstance(value, dict):
                self[key] = CDPObject(value)
            elif isinstance(value, list):
                for index, item in enumerate(value):
                    if isinstance(item, dict):
                        # wrap the list item itself (wrapping `self` here
                        # recursed without end)
                        value[index] = CDPObject(item)

    def __repr__(self):
        tpl = f"{self.__class__.__name__}(\n\t{{}}\n\t)"
        return tpl.format("\n ".join(f"{k} = {v}" for k, v in self.items()))
class PageElement(CDPObject):
    """A CDPObject describing one entry of the debugger's target list."""
class CDP:
    """
    Small client for the browser's remote-debugging HTTP endpoints (tab
    listing / creation / activation / closing) plus a one-shot websocket
    ``send`` for protocol commands.
    """

    log = logging.getLogger("CDP")

    # URI templates, relative to the debugger address
    endpoints = CDPObject(
        {
            "json": "/json",
            "protocol": "/json/protocol",
            "list": "/json/list",
            "new": "/json/new?{url}",
            "activate": "/json/activate/{id}",
            "close": "/json/close/{id}",
        }
    )

    def __init__(self, options: "ChromeOptions"):  # noqa
        """
        :param options: options object whose ``debugger_address`` is
            "host:port" of the running browser
        """
        self.server_addr = "http://{0}:{1}".format(*options.debugger_address.split(":"))

        self._reqid = 0
        self._session = requests.Session()
        self._last_resp = None
        self._last_json = None

        # the first listed target provides the session id and websocket url
        resp = self.get(self.endpoints.json)  # noqa
        self.sessionId = resp[0]["id"]
        self.wsurl = resp[0]["webSocketDebuggerUrl"]

    def tab_activate(self, id=None):
        """
        Activate the tab with the given id; without an id the first listed
        tab is used and its websocket url becomes the current one.
        """
        if not id:
            active_tab = self.tab_list()[0]
            id = active_tab.id  # noqa
            self.wsurl = active_tab.webSocketDebuggerUrl  # noqa
        return self.post(self.endpoints["activate"].format(id=id))

    def tab_list(self):
        """Return the listed targets as PageElement objects."""
        retval = self.get(self.endpoints["list"])
        return [PageElement(o) for o in retval]

    def tab_new(self, url):
        """Open ``url`` in a new tab."""
        return self.post(self.endpoints["new"].format(url=url))

    def tab_close_last_opened(self):
        """Close the last entry of type "page" in the target list."""
        sessions = self.tab_list()
        opentabs = [s for s in sessions if s["type"] == "page"]
        return self.post(self.endpoints["close"].format(id=opentabs[-1]["id"]))

    async def send(self, method: str, params: dict):
        """
        Send one protocol command over a fresh websocket connection and
        store the reply in ``last_json``.
        """
        self._reqid += 1
        async with websockets.connect(self.wsurl) as ws:
            await ws.send(
                json.dumps({"method": method, "params": params, "id": self._reqid})
            )
            self._last_resp = await ws.recv()
            self._last_json = json.loads(self._last_resp)
            self.log.info(self._last_json)

    def get(self, uri):
        """
        GET ``uri`` from the debugger.

        :return: the decoded JSON body, or None when the body is not JSON
        """
        resp = self._session.get(self.server_addr + uri)
        try:
            self._last_resp = resp
            self._last_json = resp.json()
        except Exception:
            return
        else:
            return self._last_json

    def post(self, uri, data: dict = None):
        """
        POST ``data`` (as JSON) to ``uri``.

        :return: the decoded JSON body; when the body is not JSON, the raw
            response object. (Previously a successful call returned None.)
        """
        if not data:
            data = {}
        resp = self._session.post(self.server_addr + uri, json=data)
        try:
            self._last_resp = resp
            self._last_json = resp.json()
        except Exception:
            return self._last_resp
        else:
            return self._last_json

    @property
    def last_json(self):
        """The most recently decoded JSON reply."""
        return self._last_json
================================================
FILE: undetected_chromedriver/devtool.py
================================================
import asyncio
import logging
import time
import traceback
from collections.abc import Mapping
from collections.abc import Sequence
from typing import Any
from typing import Awaitable
from typing import Callable
from typing import List
from typing import Optional
from contextlib import ExitStack
import threading
from functools import wraps, partial
class Structure(dict):
    """
    A dict-like object structure whose keys are also attributes.

    Mappings found as values (or as items of sequence values) are converted
    to the same class recursively; all other values are stored unchanged.
    """

    _store = {}

    def __init__(self, *a, **kw):
        """
        Instantiate a new instance.

        :param a: positional arguments as accepted by ``dict``
        :param kw: keyword arguments as accepted by ``dict``
        """
        super().__init__()

        # auxiliary dict
        d = dict(*a, **kw)
        for k, v in d.items():
            if isinstance(v, Mapping):
                self[k] = self.__class__(v)
            elif isinstance(v, Sequence) and not isinstance(v, (str, bytes)):
                # only mapping items are wrapped; scalars inside a sequence
                # are kept as they are (wrapping them used to raise)
                self[k] = [
                    self.__class__(i) if isinstance(i, Mapping) else i for i in v
                ]
            else:
                self[k] = v
        # share storage between item access and attribute access
        super().__setattr__("__dict__", self)

    def __getattr__(self, item):
        return getattr(super(), item)

    def __getitem__(self, item):
        return super().__getitem__(item)

    def __setattr__(self, key, value):
        self.__setitem__(key, value)

    def __setitem__(self, key, value):
        super().__setitem__(key, value)

    def update(self, *a, **kw):
        super().update(*a, **kw)

    def __eq__(self, other):
        # non-mappings are simply "not equal" instead of raising
        if not isinstance(other, Mapping):
            return NotImplemented
        # item-wise comparison; unlike a frozenset of items this also works
        # when values are unhashable (e.g. lists)
        return dict(other.items()) == dict(self.items())

    def __hash__(self):
        # raises TypeError when a value is unhashable
        return hash(frozenset(self.items()))

    @classmethod
    def __init_subclass__(cls, **kwargs):
        # every subclass gets its own store
        cls._store = {}

    def _normalize_strings(self):
        """Strip surrounding whitespace from every top-level string value."""
        for k, v in self.copy().items():
            if isinstance(v, (str)):
                self[k] = v.strip()
def timeout(seconds=3, on_timeout: Optional[Callable[[callable], Any]] = None):
    """
    Decorator factory that arms a ``threading.Timer`` around each call.

    When the timer fires before the call has returned, ``on_timeout(func)``
    is invoked from the timer's thread; without ``on_timeout`` a TimeoutError
    is raised in that thread. The timer is cancelled as soon as the wrapped
    call returns or raises.

    :param seconds: delay before the timer fires
    :param on_timeout: optional callback receiving the decorated function
    """

    def wrapper(func):
        def _expired():
            # runs on the timer's thread
            if not on_timeout:
                raise TimeoutError("function call timed out")
            on_timeout(func)

        @wraps(func)
        def wrapped(*args, **kwargs):
            watchdog = threading.Timer(interval=seconds, function=_expired)
            watchdog.start()
            try:
                return func(*args, **kwargs)
            finally:
                # covers both the normal and the exceptional exit
                watchdog.cancel()

        return wrapped

    return wrapper
def test():
    """
    Manual demo: start a driver, log every remote command and collect the
    driver's logs on a background thread while visiting a page.

    Needs a local Chrome and network access; not an automated unit test.
    """
    import sys, os

    # make the package's own folder importable before importing it by name
    sys.path.insert(0, os.path.abspath(os.path.dirname(__file__)))
    import undetected_chromedriver as uc
    import threading

    def collector(
        driver: uc.Chrome,
        stop_event: threading.Event,
        on_event_coro: Optional[Callable[[List[str]], Awaitable[Any]]] = None,
        listen_events: Sequence = ("browser", "network", "performance"),
    ):
        """
        Start a thread that repeatedly fetches the given log types from the
        driver and passes non-empty batches to ``on_event_coro`` until
        ``stop_event`` is set.
        """

        def threaded(driver, stop_event, on_event_coro):
            async def _ensure_service_started():
                # spin while the service process reports a truthy poll() result
                while (
                    getattr(driver, "service", False)
                    and getattr(driver.service, "process", False)
                    and driver.service.process.poll()
                ):
                    print("waiting for driver service to come back on")
                    await asyncio.sleep(0.05)
                    # await asyncio.sleep(driver._delay or .25)

            async def get_log_lines(typ):
                await _ensure_service_started()
                return driver.get_log(typ)

            async def looper():
                while not stop_event.is_set():
                    log_lines = []
                    try:
                        for _ in listen_events:
                            try:
                                log_lines += await get_log_lines(_)
                            except:
                                # a failing log type is skipped; the traceback
                                # is shown only at DEBUG (10) or lower
                                if logging.getLogger().getEffectiveLevel() <= 10:
                                    traceback.print_exc()
                                continue
                        if log_lines and on_event_coro:
                            await on_event_coro(log_lines)
                    except Exception as e:
                        if logging.getLogger().getEffectiveLevel() <= 10:
                            traceback.print_exc()

            # the thread gets its own event loop and runs the collector on it
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
            loop.run_until_complete(looper())

        t = threading.Thread(target=threaded, args=(driver, stop_event, on_event_coro))
        t.start()

    async def on_event(data):
        print("on_event")
        print("data:", data)

    def func_called(fn):
        """Wrap ``fn`` so that each call and its result are printed."""

        def wrapped(*args, **kwargs):
            print(
                "func called! %s (args: %s, kwargs: %s)" % (fn.__name__, args, kwargs)
            )
            # hold the call back while the service process has exited
            while driver.service.process and driver.service.process.poll() is not None:
                time.sleep(0.1)
            res = fn(*args, **kwargs)
            print("func completed! (result: %s)" % res)
            return res

        return wrapped

    logging.basicConfig(level=10)
    options = uc.ChromeOptions()
    options.set_capability(
        "goog:loggingPrefs", {"performance": "ALL", "browser": "ALL", "network": "ALL"}
    )
    driver = uc.Chrome(version_main=96, options=options)
    # driver.command_executor._request = timeout(seconds=1)(driver.command_executor._request)
    driver.command_executor._request = func_called(driver.command_executor._request)
    collector_stop = threading.Event()
    collector(driver, collector_stop, on_event)
    driver.get("https://nowsecure.nl")
    time.sleep(10)
    # NOTE(review): collector_stop is never set, so the collector thread is
    # still looping after quit() - confirm whether that is intended.
    driver.quit()
================================================
FILE: undetected_chromedriver/dprocess.py
================================================
import multiprocessing
import os
import platform
import sys
from subprocess import PIPE
from subprocess import Popen
import atexit
import traceback
import logging
import signal
# Windows process-creation flags; _start_detached() ORs them together and
# passes the result as ``creationflags`` when platform.system() is "Windows".
CREATE_NEW_PROCESS_GROUP = 0x00000200
DETACHED_PROCESS = 0x00000008
# PIDs returned by start_detached(); _cleanup() sends SIGTERM to each of them.
REGISTERED = []
def start_detached(executable, *args):
    """
    Start a fully independent subprocess (one with no parent).

    A short-lived helper process is spawned which launches ``executable``
    and reports the new pid back through a one-way pipe.

    :param executable: executable
    :param args: arguments to the executable, eg: ['--param1_key=param1_val', '-vvv' ...]
    :return: pid of the grandchild process
    :raises EOFError: if the helper process exits without reporting a pid
    """
    # one-way pipe: the helper writes the pid, we read it
    reader, writer = multiprocessing.Pipe(False)

    # do not keep a reference to the helper process
    multiprocessing.Process(
        target=_start_detached,
        args=(executable, *args),
        kwargs={"writer": writer},
        daemon=True,
    ).start()

    # The helper holds its own copy of the write end now. Dropping ours means
    # recv() can observe end-of-file if the helper dies before sending,
    # instead of blocking forever on a pipe we keep open ourselves.
    writer.close()
    try:
        pid = reader.recv()
    finally:
        # always release the read end, even when recv() fails
        reader.close()

    REGISTERED.append(pid)
    return pid
def _start_detached(executable, *args, writer: multiprocessing.Pipe = None):
    """
    Helper-process entry point: launch ``executable`` detached, report its pid.

    The pid of the launched process is sent through ``writer`` and the helper
    then terminates itself via ``sys.exit()``.
    """
    # pick the platform-specific way of detaching the new process
    if platform.system() == "Windows":
        launch_options = {
            "creationflags": DETACHED_PROCESS | CREATE_NEW_PROCESS_GROUP
        }
    elif sys.version_info < (3, 2):
        # assume posix
        launch_options = {"preexec_fn": os.setsid}
    else:
        # Python 3.2+ and Unix
        launch_options = {"start_new_session": True}

    command = [executable]
    command.extend(args)
    child = Popen(command, stdin=PIPE, stdout=PIPE, stderr=PIPE, **launch_options)

    # hand the pid back to the process that called start_detached()
    writer.send(child.pid)
    sys.exit()
def _cleanup():
    """
    Send SIGTERM to every pid recorded in REGISTERED.

    Runs as an atexit hook, so it is strictly best-effort: a failure for one
    pid (already exited, not permitted, ...) is logged at debug level and
    must not prevent the remaining pids from being signalled.
    """
    log = logging.getLogger(__name__)
    for pid in REGISTERED:
        try:
            log.debug("cleaning up pid %d ", pid)
            os.kill(pid, signal.SIGTERM)
        except Exception:  # noqa
            # Deliberately broad, but no longer a bare ``except`` so that
            # KeyboardInterrupt / SystemExit are not swallowed at shutdown.
            log.debug("could not terminate pid %s", pid, exc_info=True)


atexit.register(_cleanup)
================================================
FILE: undetected_chromedriver/options.py
================================================
#!/usr/bin/env python3
# this module is part of undetected_chromedriver
import json
import os
try:
from selenium.webdriver.chromium.options import ChromiumOptions as _ChromiumOptions
except:
from selenium.webdriver.chrome.options import Options as _ChromiumOptions
class ChromeOptions(_ChromiumOptions):
    """
    Chromium options extended with a profile directory (``user_data_dir``)
    and support for writing "prefs" into the profile's Preferences file.
    """

    _session = None
    _user_data_dir = None

    @property
    def user_data_dir(self):
        """The normalized absolute profile folder, or None when unset."""
        return self._user_data_dir

    @user_data_dir.setter
    def user_data_dir(self, path: str):
        """
        Sets the browser profile folder to use, or creates a new profile
        at given <path>.

        Parameters
        ----------
        path: str
            the path to a chrome profile folder
            if it does not exist, a new profile will be created at given location
        """
        apath = os.path.abspath(path)
        self._user_data_dir = os.path.normpath(apath)

    @staticmethod
    def _undot_key(key, value):
        """turn a (dotted key, value) into a proper nested dict"""
        if "." in key:
            key, rest = key.split(".", 1)
            value = ChromeOptions._undot_key(rest, value)
        return {key: value}

    @staticmethod
    def _merge_nested(a, b):
        """
        Recursively merge dict ``b`` into dict ``a`` and return ``a``.

        Nested dicts present on both sides are merged key by key; any other
        value in ``a`` is overwritten by the value from ``b``.
        """
        for key, value in b.items():
            if isinstance(a.get(key), dict) and isinstance(value, dict):
                ChromeOptions._merge_nested(a[key], value)
            else:
                a[key] = value
        return a

    def handle_prefs(self, user_data_dir):
        """
        Write the "prefs" experimental option into <profile>/Default/Preferences.

        Dotted preference keys are expanded into nested dicts and merged into
        whatever the Preferences file already contains; the requested prefs
        take precedence. Afterwards "prefs" is removed from the experimental
        options. Does nothing when no prefs were set.

        Parameters
        ----------
        user_data_dir: str or None
            profile folder to write to; falls back to ``self._user_data_dir``
        """
        prefs = self.experimental_options.get("prefs")
        if not prefs:
            return

        user_data_dir = user_data_dir or self._user_data_dir
        default_path = os.path.join(user_data_dir, "Default")
        os.makedirs(default_path, exist_ok=True)

        # Undot prefs dict keys. A deep merge is required here: a shallow
        # dict.update() would let "a.b" and "a.c" overwrite one another
        # because both expand to the same top-level key "a".
        undot_prefs = {}
        for key, value in prefs.items():
            self._merge_nested(undot_prefs, self._undot_key(key, value))

        prefs_file = os.path.join(default_path, "Preferences")
        if os.path.exists(prefs_file):
            with open(prefs_file, encoding="latin1", mode="r") as f:
                # start from the stored preferences and layer the requested
                # ones on top, so explicit prefs are not silently discarded
                undot_prefs = self._merge_nested(json.load(f), undot_prefs)

        with open(prefs_file, encoding="latin1", mode="w") as f:
            json.dump(undot_prefs, f)

        # remove the experimental_options to avoid an error
        del self._experimental_options["prefs"]

    @classmethod
    def from_options(cls, options):
        """Create a new instance carrying a copy of ``options``' attributes."""
        o = cls()
        o.__dict__.update(options.__dict__)
        return o
================================================
FILE: undetected_chromedriver/patcher.py
================================================
#!/usr/bin/env python3
# this module is part of undetected_chromedriver
import io
import logging
import os
import random
import re
import string
import sys
import time
import zipfile
from distutils.version import LooseVersion
from urllib.request import urlopen, urlretrieve
import secrets
logger = logging.getLogger(__name__)
IS_POSIX = sys.platform.startswith(("darwin", "cygwin", "linux"))
class Patcher(object):
    """
    Downloads a chromedriver release and patches the binary so that the
    ``cdc_`` marker strings are replaced by random ones.
    """

    url_repo = "https://chromedriver.storage.googleapis.com"
    zip_name = "chromedriver_%s.zip"
    exe_name = "chromedriver%s"

    # pick archive / executable names for the current platform
    platform = sys.platform
    if platform.endswith("win32"):
        zip_name %= "win32"
        exe_name %= ".exe"
    if platform.endswith("linux"):
        zip_name %= "linux64"
        exe_name %= ""
    if platform.endswith("darwin"):
        zip_name %= "mac64"
        exe_name %= ""

    # per-platform folder in which downloaded drivers are stored
    if platform.endswith("win32"):
        d = "~/appdata/roaming/undetected_chromedriver"
    elif platform.startswith("linux"):
        d = "~/.local/share/undetected_chromedriver"
    elif platform.endswith("darwin"):
        d = "~/Library/Application Support/undetected_chromedriver"
    else:
        d = "~/.undetected_chromedriver"
    data_path = os.path.abspath(os.path.expanduser(d))

    def __init__(self, executable_path=None, force=False, version_main: int = 0):
        """
        Args:
            executable_path: None = automatic
                             a full file path to the chromedriver executable
            force: False
                    terminate processes which are holding lock
            version_main: 0 = auto
                specify main chrome version (rounded, ex: 82)
        """
        self.force = force
        self.executable_path = None
        # random prefix so that each instance works on its own files
        prefix = secrets.token_hex(8)

        if not os.path.exists(self.data_path):
            os.makedirs(self.data_path, exist_ok=True)

        if not executable_path:
            self.executable_path = os.path.join(
                self.data_path, "_".join([prefix, self.exe_name])
            )

        if not IS_POSIX:
            if executable_path:
                if not executable_path[-4:] == ".exe":
                    executable_path += ".exe"

        self.zip_path = os.path.join(self.data_path, prefix)

        if not executable_path:
            self.executable_path = os.path.abspath(
                os.path.join(".", self.executable_path)
            )

        self._custom_exe_path = False

        if executable_path:
            self._custom_exe_path = True
            self.executable_path = executable_path
        self.version_main = version_main
        self.version_full = None

    def auto(self, executable_path=None, force=False, version_main=None):
        """
        Make sure a patched driver binary exists at ``self.executable_path``.

        A user supplied binary is only patched in place (when needed);
        otherwise the matching release is downloaded, unpacked and patched.
        """
        if executable_path:
            self.executable_path = executable_path
            self._custom_exe_path = True

        if self._custom_exe_path:
            ispatched = self.is_binary_patched(self.executable_path)
            if not ispatched:
                return self.patch_exe()
            else:
                return

        if version_main:
            self.version_main = version_main
        if force is True:
            self.force = force

        try:
            os.unlink(self.executable_path)
        except PermissionError:
            if self.force:
                self.force_kill_instances(self.executable_path)
                return self.auto(force=not self.force)
            try:
                if self.is_binary_patched():
                    # assumes already running AND patched
                    return True
            except PermissionError:
                pass
            # return False
        except FileNotFoundError:
            pass

        release = self.fetch_release_number()
        self.version_main = release.version[0]
        self.version_full = release
        self.unzip_package(self.fetch_package())
        return self.patch()

    def patch(self):
        """Patch the binary and report whether it is patched afterwards."""
        self.patch_exe()
        return self.is_binary_patched()

    def fetch_release_number(self):
        """
        Gets the latest major version available, or the latest major version of self.target_version if set explicitly.
        :return: version string
        :rtype: LooseVersion
        """
        path = "/latest_release"
        if self.version_main:
            path += f"_{self.version_main}"
        path = path.upper()
        logger.debug("getting release number from %s" % path)
        # close the HTTP response deterministically instead of leaking it
        with urlopen(self.url_repo + path) as response:
            return LooseVersion(response.read().decode())

    def parse_exe_version(self):
        """Return the version embedded in the binary, or None if not found."""
        with io.open(self.executable_path, "rb") as f:
            for line in iter(lambda: f.readline(), b""):
                match = re.search(rb"platform_handle\x00content\x00([0-9.]*)", line)
                if match:
                    return LooseVersion(match[1].decode())

    def fetch_package(self):
        """
        Downloads ChromeDriver from source
        :return: path to downloaded file
        """
        u = "%s/%s/%s" % (self.url_repo, self.version_full.vstring, self.zip_name)
        logger.debug("downloading from %s" % u)
        # return urlretrieve(u, filename=self.data_path)[0]
        return urlretrieve(u)[0]

    def unzip_package(self, fp):
        """
        Extract the driver executable from the archive at ``fp`` and move it
        to ``self.executable_path``; the archive itself is removed.
        :return: path to unpacked executable
        """
        logger.debug("unzipping %s" % fp)

        try:
            os.unlink(self.zip_path)
        except (FileNotFoundError, OSError):
            pass

        os.makedirs(self.zip_path, mode=0o755, exist_ok=True)
        with zipfile.ZipFile(fp, mode="r") as zf:
            zf.extract(self.exe_name, self.zip_path)
        os.rename(os.path.join(self.zip_path, self.exe_name), self.executable_path)
        os.remove(fp)
        os.rmdir(self.zip_path)
        os.chmod(self.executable_path, 0o755)
        return self.executable_path

    @staticmethod
    def force_kill_instances(exe_name):
        """
        kills running instances.
        :param: executable name to kill, may be a path as well
        :return: True on success else False
        """
        exe_name = os.path.basename(exe_name)
        if IS_POSIX:
            # NOTE(review): ``kill -f`` and ``pidof`` are not available in
            # every POSIX environment - confirm this works on the targets
            # that matter before relying on ``force=True``.
            r = os.system("kill -f -9 $(pidof %s)" % exe_name)
        else:
            r = os.system("taskkill /f /im %s" % exe_name)
        return not r

    @staticmethod
    def gen_random_cdc():
        """Return a random 26-byte replacement for a ``cdc_...`` marker."""
        cdc = random.choices(string.ascii_lowercase, k=26)
        cdc[-6:-4] = map(str.upper, cdc[-6:-4])
        cdc[2] = cdc[0]
        cdc[3] = "_"
        return "".join(cdc).encode()

    def is_binary_patched(self, executable_path=None):
        """simple check if executable is patched.

        :return: False if not patched, else True
        """
        executable_path = executable_path or self.executable_path
        with io.open(executable_path, "rb") as fh:
            for line in iter(lambda: fh.readline(), b""):
                if b"cdc_" in line:
                    return False
            else:
                # loop finished without finding a marker
                return True

    def patch_exe(self):
        """
        Patches the ChromeDriver binary

        :return: number of lines in which a ``cdc_`` marker was replaced
        """
        logger.info("patching driver executable %s" % self.executable_path)

        linect = 0
        replacement = self.gen_random_cdc()
        with io.open(self.executable_path, "r+b") as fh:
            for line in iter(lambda: fh.readline(), b""):
                if b"cdc_" in line:
                    # step back over the line just read and overwrite it in
                    # place; the replacement has the same length as a match
                    fh.seek(-len(line), 1)
                    newline = re.sub(b"cdc_.{22}", replacement, line)
                    fh.write(newline)
                    linect += 1
            return linect

    def __repr__(self):
        return "{0:s}({1:s})".format(
            self.__class__.__name__,
            self.executable_path,
        )

    def __del__(self):
        """Remove the auto-downloaded binary (never a user supplied one)."""
        # getattr(): __init__ may have failed before these were assigned
        if getattr(self, "_custom_exe_path", False):
            # if the driver binary is specified by user
            # we assume it is important enough to not delete it
            return
        path = getattr(self, "executable_path", None)
        if not path:
            return

        timeout = 3  # stop trying after this many seconds
        t = time.monotonic()
        while True:
            now = time.monotonic()
            if now - t > timeout:
                # we don't want to wait until the end of time
                logger.debug(
                    "could not unlink %s in time (%d seconds)" % (path, timeout)
                )
                break
            try:
                os.unlink(path)
                logger.debug("successfully unlinked %s" % path)
                break
            except FileNotFoundError:
                # Must be tested before OSError (its base class); otherwise a
                # file that is already gone is retried until the timeout.
                break
            except (OSError, RuntimeError):
                # most likely still locked by a running driver; retry shortly
                time.sleep(0.1)
                continue
================================================
FILE: undetected_chromedriver/reactor.py
================================================
#!/usr/bin/env python3
# this module is part of undetected_chromedriver
import asyncio
import json
import logging
import threading
logger = logging.getLogger(__name__)
class Reactor(threading.Thread):
    """
    Asynchronous event handling: a daemon thread that polls the driver's
    "performance" log and dispatches each entry to the registered handlers.
    """

    def __init__(self, driver: "Chrome"):
        super().__init__()
        self.driver = driver
        self.loop = asyncio.new_event_loop()
        self.lock = threading.Lock()
        # setting this event makes ``running`` False and ends listen()
        self.event = threading.Event()
        self.daemon = True
        self.handlers = {}

    def add_event_handler(self, method_name, callback: callable):
        """
        Register ``callback`` for an event; registering "*" receives all events.

        Parameters
        ----------
        method_name: str
            example "Network.responseReceived" (stored lower-cased)

        callback: callable
            callable which accepts 1 parameter: the message object dictionary

        Returns
        -------

        """
        with self.lock:
            self.handlers[method_name.lower()] = callback

    @property
    def running(self):
        """True until ``self.event`` has been set."""
        return not self.event.is_set()

    def run(self):
        """Thread body: run listen() on this thread's own event loop."""
        try:
            asyncio.set_event_loop(self.loop)
            self.loop.run_until_complete(self.listen())
        except Exception as e:
            logger.warning("Reactor.run() => %s", e)

    async def _wait_service_started(self):
        """Sleep in small steps for as long as the service's poll() is truthy."""
        while True:
            with self.lock:
                if (
                    getattr(self.driver, "service", None)
                    and getattr(self.driver.service, "process", None)
                    and self.driver.service.process.poll()
                ):
                    await asyncio.sleep(self.driver._delay or 0.25)
                else:
                    break

    async def listen(self):
        """Poll the performance log once per second and dispatch its entries."""
        while self.running:
            await self._wait_service_started()
            await asyncio.sleep(1)

            try:
                with self.lock:
                    log_entries = self.driver.get_log("performance")

                    for entry in log_entries:
                        obj_serialized: str = entry.get("message")
                        obj = json.loads(obj_serialized)
                        message = obj.get("message")
                        method = message.get("method")
                        # a "*" handler takes precedence over a specific one
                        if "*" in self.handlers:
                            await self.loop.run_in_executor(
                                None, self.handlers["*"], message
                            )
                        elif method.lower() in self.handlers:
                            await self.loop.run_in_executor(
                                None, self.handlers[method.lower()], message
                            )
            except Exception as e:
                if "invalid session id" in str(e):
                    # the session is gone; nothing worth reporting
                    pass
                else:
                    # the message needs a placeholder for ``e``; without one
                    # the logging call itself fails while formatting
                    logger.debug("exception ignored : %s", e)
================================================
FILE: undetected_chromedriver/v2.py
================================================
# for backward compatibility
# The assignment below re-registers this module's name in sys.modules so that
# it refers to the package's module object instead of this file.
import sys
sys.modules[__name__] = sys.modules[__package__]
================================================
FILE: undetected_chromedriver/webelement.py
================================================
import selenium.webdriver.remote.webelement
class WebElement(selenium.webdriver.remote.webelement.WebElement):
    """
    Custom WebElement class which makes it easier to view elements when
    working in an interactive environment.

    standard webelement repr:
    <selenium.webdriver.remote.webelement.WebElement (session="85ff0f671512fa535630e71ee951b1f2", element="6357cb55-92c3-4c0f-9416-b174f9c1b8c4")>

    using this WebElement class:
    <WebElement(<a class="mobile-show-inline-block mc-update-infos init-ok" href="#" id="main-cat-switcher-mobile">)>

    It subclasses selenium.webdriver.remote.webelement.WebElement, the
    selenium type used for page elements such as ``div`` or ``a`` tags.
    """

    @property
    def attrs(self):
        """
        Mapping of the element's attribute names to their values,
        e.g. ``<div id="1"> </div>`` gives ``{"id": "1"}``.

        The mapping is read from the browser on first access and then cached
        on the instance.
        """
        if not hasattr(self, "_attrs"):
            # ``var index`` keeps the loop counter local to the injected
            # script instead of creating an implicit global on the page.
            self._attrs = self._parent.execute_script(
                """
                var items = {};
                for (var index = 0; index < arguments[0].attributes.length; ++index)
                {
                 items[arguments[0].attributes[index].name] = arguments[0].attributes[index].value
                };
                return items;
                """,
                self,
            )
        return self._attrs

    def __repr__(self):
        strattrs = " ".join([f'{k}="{v}"' for k, v in self.attrs.items()])
        if strattrs:
            strattrs = " " + strattrs
        return f"{self.__class__.__name__} <{self.tag_name}{strattrs}>"
gitextract_qe1fffch/
├── README.md
├── browserapi.py
├── config/
│ └── system_info.py
├── db.py
├── demo/
│ ├── runflow.py
│ ├── 单任务GET-demo.py
│ ├── 单任务POST-demo.py
│ ├── 多任务demo.py
│ ├── 抖音-步骤拆解demo.py
│ └── 药监局.py
├── engine.py
├── middlerware.py
├── models.py
├── server.py
├── settings.py
├── static/
│ ├── css/
│ │ └── index.css
│ └── docs/
│ ├── program.txt
│ └── 部署.txt
├── templates/
│ └── index.html
└── undetected_chromedriver/
├── __init__.py
├── _compat.py
├── cdp.py
├── devtool.py
├── dprocess.py
├── options.py
├── patcher.py
├── reactor.py
├── v2.py
└── webelement.py
SYMBOL INDEX (139 symbols across 18 files)
FILE: browserapi.py
class Browser (line 11) | class Browser():
method __init__ (line 20) | def __init__(self):
method stealth_enable (line 55) | def stealth_enable(self):
method start_request (line 63) | def start_request(self,url):
method close (line 68) | def close(self):
class BrowserApi (line 73) | class BrowserApi():
method __init__ (line 74) | def __init__(self,browser):
method browser_ps (line 77) | def browser_ps(self,url):
method browser_get (line 82) | def browser_get(self, url):
method browser_post (line 111) | def browser_post(self, url, formdata=""):
method check_slide (line 144) | def check_slide(self,bg_xpath,gap_xpath,slider_xpath,domain=None):
FILE: config/system_info.py
class SystemInfoUtil (line 3) | class SystemInfoUtil(object):
method get_format_byte (line 5) | def get_format_byte(cls, value):
method get_virtual_memory (line 21) | def get_virtual_memory(cls):
method get_disk_usage (line 43) | def get_disk_usage(cls):
FILE: db.py
function create_connection (line 7) | def create_connection():
function select_process (line 12) | def select_process():
function select_process_name (line 22) | def select_process_name(processName:str)->Process:
function select_process_id (line 31) | def select_process_id(processId:str)->Process:
function insert_process (line 40) | def insert_process(process:Process)->Process:
function delete_process (line 56) | def delete_process(process_name):
function delete_process_id (line 69) | def delete_process_id(processId):
FILE: demo/runflow.py
function magical_start (line 5) | def magical_start(project_name,base_url = 'http://www.lxspider.com'):
function magical_request (line 12) | def magical_request(session_id,process_url,request_url,request_type='get...
function magical_close (line 24) | def magical_close(session_id,process_url,process_name):
FILE: demo/多任务demo.py
function r1 (line 6) | def r1():
function r2 (line 19) | def r2():
FILE: engine.py
function create_browser (line 9) | def create_browser(url,name):
function attachToSession (line 17) | def attachToSession(session_id,url):
function carry_browser (line 31) | def carry_browser(session_id,process_url,request_url,request_type,formda...
function close_browser (line 52) | def close_browser(session_id,process_url,process_name):
function select_all_process (line 60) | def select_all_process():
FILE: middlerware.py
class Slide (line 9) | class Slide(object):
method __init__ (line 10) | def __init__(self, gap, bg, gap_size=None, bg_size=None, out=None):
method check_is_img_path (line 27) | def check_is_img_path(img, img_type, resize):
method clear_white (line 57) | def clear_white(img):
method template_match (line 82) | def template_match(self, tpl, target):
method image_edge_detection (line 101) | def image_edge_detection(img):
method discern (line 106) | def discern(self):
method get_tracks (line 122) | def get_tracks(distance, rate=0.6, t=0.2, v=0):
FILE: models.py
class Process (line 3) | class Process:
method __init__ (line 4) | def __init__(self, processId, processName,processUrl,baseUrl,createTim...
FILE: server.py
function index_info (line 17) | def index_info():
function browser_start (line 26) | def browser_start():
function browser_xhr (line 40) | def browser_xhr():
function browser_close (line 51) | def browser_close():
function delete_process_name (line 63) | def delete_process_name(process_name):
FILE: undetected_chromedriver/__init__.py
class Chrome (line 65) | class Chrome(selenium.webdriver.chrome.webdriver.WebDriver):
method __init__ (line 106) | def __init__(
method __getattribute__ (line 433) | def __getattribute__(self, item):
method _configure_headless (line 453) | def _configure_headless(self):
method __dir__ (line 503) | def __dir__(self):
method _get_cdc_props (line 506) | def _get_cdc_props(self):
method _hook_remove_cdc_props (line 518) | def _hook_remove_cdc_props(self):
method get (line 535) | def get(self, url):
method add_cdp_listener (line 540) | def add_cdp_listener(self, event_name, callback):
method clear_cdp_listeners (line 550) | def clear_cdp_listeners(self):
method tab_new (line 554) | def tab_new(self, url: str):
method reconnect (line 573) | def reconnect(self, timeout=0.1):
method start_session (line 589) | def start_session(self, capabilities=None, browser_profile=None):
method quit (line 597) | def quit(self):
method __del__ (line 641) | def __del__(self):
method __enter__ (line 649) | def __enter__(self):
method __exit__ (line 652) | def __exit__(self, exc_type, exc_val, exc_tb):
method __hash__ (line 658) | def __hash__(self):
function find_chrome_executable (line 662) | def find_chrome_executable():
FILE: undetected_chromedriver/_compat.py
class Chrome (line 37) | class Chrome:
method __new__ (line 38) | def __new__(cls, *args, emulate_touch=False, **kwargs):
class ChromeOptions (line 107) | class ChromeOptions:
method __new__ (line 108) | def __new__(cls, *args, **kwargs):
class ChromeDriverManager (line 122) | class ChromeDriverManager(object):
method __init__ (line 129) | def __init__(self, executable_path=None, target_version=None, *args, *...
method patch_selenium_webdriver (line 162) | def patch_selenium_webdriver(self_):
method install (line 176) | def install(self, patch_selenium=True):
method get_release_version_number (line 197) | def get_release_version_number(self):
method fetch_chromedriver (line 210) | def fetch_chromedriver(self):
method random_cdc (line 233) | def random_cdc():
method patch_binary (line 240) | def patch_binary(self):
function install (line 258) | def install(executable_path=None, target_version=None, *args, **kwargs):
FILE: undetected_chromedriver/cdp.py
class CDPObject (line 13) | class CDPObject(dict):
method __init__ (line 14) | def __init__(self, *a, **k):
method __repr__ (line 25) | def __repr__(self):
class PageElement (line 30) | class PageElement(CDPObject):
class CDP (line 34) | class CDP:
method __init__ (line 48) | def __init__(self, options: "ChromeOptions"): # noqa
method tab_activate (line 60) | def tab_activate(self, id=None):
method tab_list (line 67) | def tab_list(self):
method tab_new (line 71) | def tab_new(self, url):
method tab_close_last_opened (line 74) | def tab_close_last_opened(self):
method send (line 79) | async def send(self, method: str, params: dict):
method get (line 89) | def get(self, uri):
method post (line 99) | def post(self, uri, data: dict = None):
method last_json (line 110) | def last_json(self):
FILE: undetected_chromedriver/devtool.py
class Structure (line 17) | class Structure(dict):
method __init__ (line 27) | def __init__(self, *a, **kw):
method __getattr__ (line 48) | def __getattr__(self, item):
method __getitem__ (line 51) | def __getitem__(self, item):
method __setattr__ (line 54) | def __setattr__(self, key, value):
method __setitem__ (line 57) | def __setitem__(self, key, value):
method update (line 60) | def update(self, *a, **kw):
method __eq__ (line 63) | def __eq__(self, other):
method __hash__ (line 66) | def __hash__(self):
method __init_subclass__ (line 70) | def __init_subclass__(cls, **kwargs):
method _normalize_strings (line 73) | def _normalize_strings(self):
function timeout (line 79) | def timeout(seconds=3, on_timeout: Optional[Callable[[callable], Any]] =...
function test (line 104) | def test():
FILE: undetected_chromedriver/dprocess.py
function start_detached (line 18) | def start_detached(executable, *args):
function _start_detached (line 47) | def _start_detached(executable, *args, writer: multiprocessing.Pipe = No...
function _cleanup (line 67) | def _cleanup():
FILE: undetected_chromedriver/options.py
class ChromeOptions (line 12) | class ChromeOptions(_ChromiumOptions):
method user_data_dir (line 17) | def user_data_dir(self):
method user_data_dir (line 21) | def user_data_dir(self, path: str):
method _undot_key (line 37) | def _undot_key(key, value):
method handle_prefs (line 44) | def handle_prefs(self, user_data_dir):
method from_options (line 68) | def from_options(cls, options):
FILE: undetected_chromedriver/patcher.py
class Patcher (line 23) | class Patcher(object):
method __init__ (line 52) | def __init__(self, executable_path=None, force=False, version_main: in...
method auto (line 96) | def auto(self, executable_path=None, force=False, version_main=None):
method patch (line 136) | def patch(self):
method fetch_release_number (line 140) | def fetch_release_number(self):
method parse_exe_version (line 154) | def parse_exe_version(self):
method fetch_package (line 161) | def fetch_package(self):
method unzip_package (line 172) | def unzip_package(self, fp):
method force_kill_instances (line 195) | def force_kill_instances(exe_name):
method gen_random_cdc (line 211) | def gen_random_cdc():
method is_binary_patched (line 218) | def is_binary_patched(self, executable_path=None):
method patch_exe (line 232) | def patch_exe(self):
method __repr__ (line 251) | def __repr__(self):
method __del__ (line 257) | def __del__(self):
FILE: undetected_chromedriver/reactor.py
class Reactor (line 12) | class Reactor(threading.Thread):
method __init__ (line 16) | def __init__(self, driver: "Chrome"):
method add_event_handler (line 27) | def add_event_handler(self, method_name, callback: callable):
method running (line 46) | def running(self):
method run (line 49) | def run(self):
method _wait_service_started (line 56) | async def _wait_service_started(self):
method listen (line 68) | async def listen(self):
FILE: undetected_chromedriver/webelement.py
class WebElement (line 4) | class WebElement(selenium.webdriver.remote.webelement.WebElement):
method attrs (line 23) | def attrs(self):
method __repr__ (line 41) | def __repr__(self):
Condensed preview — 29 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (106K chars).
[
{
"path": "README.md",
"chars": 2069,
"preview": "# magical_spider\n神奇的蜘蛛🕷,一个几乎适用于所有web端站点的采集方案。\n\n\n### 诞生背景\n纯属瞎扯:2022年全球变暖,各行业内卷严重,爬虫届更是入门抖音起步瑞数,为了减缓人才流失,推出magical_spider。"
},
{
"path": "browserapi.py",
"chars": 6167,
"preview": "# -*- coding: utf-8 -*-\nimport undetected_chromedriver as webdriver\nfrom undetected_chromedriver.options import ChromeOp"
},
{
"path": "config/system_info.py",
"chars": 1687,
"preview": "# -*- coding: utf-8 -*-\nimport psutil\nclass SystemInfoUtil(object):\n @classmethod\n def get_format_byte(cls, value)"
},
{
"path": "db.py",
"chars": 2380,
"preview": "import os.path\nimport sqlite3\nfrom models import *\nfrom settings import magicalpath\n\n\ndef create_connection():\n db = "
},
{
"path": "demo/runflow.py",
"chars": 1057,
"preview": "import requests\nsess = requests.session()\nhost = 'http://127.0.0.1:5000'\n\ndef magical_start(project_name,base_url = 'htt"
},
{
"path": "demo/单任务GET-demo.py",
"chars": 376,
"preview": "from demo.runflow import magical_start,magical_request,magical_close\n\nproject_name = 'cnipa'\nbase_url = 'https://www.cni"
},
{
"path": "demo/单任务POST-demo.py",
"chars": 933,
"preview": "from demo.runflow import magical_start,magical_request,magical_close\nimport json\n\n# POST案例昨天忘记加了,感谢 [尘川] 的提醒 by:2022/08"
},
{
"path": "demo/多任务demo.py",
"chars": 1201,
"preview": "from demo.runflow import magical_start,magical_request,magical_close\nimport time\n\n# 各任务间互不影响,可选择使用多线程或多进程,大家自由发挥\n\ndef r1"
},
{
"path": "demo/抖音-步骤拆解demo.py",
"chars": 3614,
"preview": "import requests\n\n# 步骤拆解,简化版查看 药监局.py\n\nproject_name = '抖音任务2' # project_name不可重复,勿创建重复任务名\nbase_url = 'https://www.dou"
},
{
"path": "demo/药监局.py",
"chars": 484,
"preview": "from demo.runflow import magical_start,magical_request,magical_close\n\nproject_name = '药监局1'\nbase_url = 'https://www.nmpa"
},
{
"path": "engine.py",
"chars": 1897,
"preview": "# -*- coding: utf-8 -*-\nfrom browserapi import Browser,BrowserApi\nfrom db import *\nfrom models import Process\nfrom selen"
},
{
"path": "middlerware.py",
"chars": 4854,
"preview": "# -*- coding: utf-8 -*-\nimport os,requests\nfrom urllib.parse import urlparse\ntry:\n import cv2, numpy as np\nexcept:\n "
},
{
"path": "models.py",
"chars": 350,
"preview": "import datetime\n\nclass Process:\n def __init__(self, processId, processName,processUrl,baseUrl,createTime = datetime.d"
},
{
"path": "server.py",
"chars": 2503,
"preview": "# -*- coding: utf-8 -*-\nfrom datetime import timedelta\nfrom flask import Flask,session\nfrom flask import render_template"
},
{
"path": "settings.py",
"chars": 463,
"preview": "# MagicalSpider Settings\n\n# 隐藏界面\nheadless_enable = True\n\n# 高匿模式、可能影响创建时间\nstealth_enable = True\n\n# 代理设置\nproxy = None\n\n# 无"
},
{
"path": "static/css/index.css",
"chars": 820,
"preview": "body{\n background: url(/static/image/bg.png);\n background-size: 100% 100%;\n background-repeat:no-repeat;\n}\na{\n "
},
{
"path": "static/docs/program.txt",
"chars": 559,
"preview": "magical_spider,一个几乎适用于所有web端站点的采集方案。\n\n## 项目简介\n\n1、主要使用谷歌驱动,但非常规derver.page_source。\n\n2、通过 flask 远程调用 chromederver 实现 xmlHt"
},
{
"path": "static/docs/部署.txt",
"chars": 546,
"preview": "Linux部署\n\n1.安装chrome (自行选择安装位置)\nyum install https://dl.google.com/linux/direct/google-chrome-stable_current_x86_64.rpm\n\n2"
},
{
"path": "templates/index.html",
"chars": 3019,
"preview": "<!DOCTYPE html>\n<html lang=\"en\" >\n<head>\n <meta charset=\"UTF-8\">\n <title>MagicalSpider</title>\n</head>\n<link rel=\""
},
{
"path": "undetected_chromedriver/__init__.py",
"chars": 25243,
"preview": "#!/usr/bin/env python3\n#from __future__ import annotations\n\nimport subprocess\n\n\"\"\"\n\n 888 "
},
{
"path": "undetected_chromedriver/_compat.py",
"chars": 9340,
"preview": "#!/usr/bin/env python3\n# this module is part of undetected_chromedriver\n\n\n\"\"\"\n\n 888 "
},
{
"path": "undetected_chromedriver/cdp.py",
"chars": 3430,
"preview": "#!/usr/bin/env python3\n# this module is part of undetected_chromedriver\n\nimport json\nimport logging\nfrom collections.abc"
},
{
"path": "undetected_chromedriver/devtool.py",
"chars": 5789,
"preview": "import asyncio\nimport logging\nimport time\nimport traceback\nfrom collections.abc import Mapping\nfrom collections.abc impo"
},
{
"path": "undetected_chromedriver/dprocess.py",
"chars": 1793,
"preview": "import multiprocessing\nimport os\nimport platform\nimport sys\nfrom subprocess import PIPE\nfrom subprocess import Popen\nimp"
},
{
"path": "undetected_chromedriver/options.py",
"chars": 2287,
"preview": "#!/usr/bin/env python3\n# this module is part of undetected_chromedriver\n\n\nimport json\nimport os\ntry:\n from selenium.w"
},
{
"path": "undetected_chromedriver/patcher.py",
"chars": 9017,
"preview": "#!/usr/bin/env python3\n# this module is part of undetected_chromedriver\n\nimport io\nimport logging\nimport os\nimport rando"
},
{
"path": "undetected_chromedriver/reactor.py",
"chars": 2995,
"preview": "#!/usr/bin/env python3\n# this module is part of undetected_chromedriver\n\nimport asyncio\nimport json\nimport logging\nimpor"
},
{
"path": "undetected_chromedriver/v2.py",
"chars": 90,
"preview": "# for backward compatibility\nimport sys\n\nsys.modules[__name__] = sys.modules[__package__]\n"
},
{
"path": "undetected_chromedriver/webelement.py",
"chars": 1559,
"preview": "import selenium.webdriver.remote.webelement\n\n\nclass WebElement(selenium.webdriver.remote.webelement.WebElement):\n \"\"\""
}
]
About this extraction
This page contains the full source code of the lixi5338619/magical_spider GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 29 files (94.3 KB), approximately 23.4k tokens, and a symbol index with 139 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.