Repository: NikolaiT/stealthy-scraping-tools Branch: main Commit: 8f155462a671 Files: 19 Total size: 46.3 KB Directory structure: gitextract_az1f2l_7/ ├── .gitignore ├── Dockerfile ├── README.md ├── behavior/ │ ├── behavior.py │ ├── human_replay.py │ └── sst_utils.py ├── cdp/ │ ├── coords.js │ ├── eval_js.js │ ├── goto.js │ └── page_source.js ├── crawl.py ├── ddc.py ├── example.py ├── immobilienscout24.py ├── local_forward_proxy_server/ │ └── proxy_server.js ├── lufthansa-de.py ├── requirements.txt ├── start.sh └── test.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ cdp/node_modules/* target.py local_forward_proxy_server/node_modules/ local_forward_proxy_server/node_modules/* trainline.py immo_env.env immo_env.py apartments.json deploy.sh exclude.txt immo_env.py *.pyc __pycache__/* # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. 
*.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover *.py,cover .hypothesis/ .pytest_cache/ cover/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder .pybuilder/ target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv # For a library or package, you might want to ignore these files since the code is # intended to run in multiple environments; otherwise, check them in: # .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. # However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. #Pipfile.lock # poetry # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. # This is especially recommended for binary packages to ensure reproducibility, and is more # commonly ignored for libraries. # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control #poetry.lock # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow __pypackages__/ # Celery stuff celerybeat-schedule celerybeat.pid # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # Pyre type checker .pyre/ # pytype static type analyzer .pytype/ # Cython debug symbols cython_debug/ # PyCharm # JetBrains specific template is maintainted in a separate JetBrains.gitignore that can # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ ================================================ FILE: Dockerfile ================================================ FROM ubuntu:20.04 # Set correct timezone ENV TZ=Europe/Berlin RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone RUN apt-get update && apt-get install python3 tesseract-ocr python3-pip curl unzip -yf # Install Chrome RUN apt-get update -y RUN apt-get install -y dbus-x11 RUN curl https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb -o /chrome.deb RUN dpkg -i /chrome.deb || apt-get install -yf RUN rm /chrome.deb RUN apt-get install -y poppler-utils RUN apt-get clean RUN DEBIAN_FRONTEND=noninteractive apt install -y python3-xlib xvfb xserver-xephyr python3-tk python3-dev # https://github.com/puppeteer/puppeteer/issues/5429 RUN DEBIAN_FRONTEND=noninteractive apt-get -y install wget libcairo2-dev \ libjpeg-dev libpango1.0-dev libgif-dev build-essential g++ libgl1-mesa-dev libxi-dev \ libx11-dev pulseaudio udev RUN apt update && apt install -y postgresql-server-dev-12 RUN curl --silent --location https://deb.nodesource.com/setup_14.x | bash - &&\ apt-get -y -qq install nodejs # Move this into 
requirements.txt at some time RUN pip3 install pyautogui python-xlib PyVirtualDisplay RUN apt-get install -y fonts-roboto fonts-ubuntu ttf-bitstream-vera fonts-crosextra-caladea fonts-cantarell fonts-open-sans ttf-wqy-zenhei # install debs error if combine together RUN apt install -y --no-install-recommends --allow-unauthenticated x11vnc fluxbox xxd \ && apt autoclean -y \ && apt autoremove -y \ && rm -rf /var/lib/apt/lists/* RUN apt-get update -y && apt install -y iptables sudo COPY . . # https://dev.to/emmanuelnk/using-sudo-without-password-prompt-as-non-root-docker-user-52bg # Create new user `docker` and disable # password and gecos for later # --gecos explained well here: # https://askubuntu.com/a/1195288/635348 RUN adduser --force-badname --disabled-password --gecos '' browserUser # Add a user to run the browser as non-root RUN mkdir -p /home/browserUser/Downloads \ && chown -R browserUser:browserUser /home/browserUser RUN adduser browserUser sudo # Ensure sudo group users are not # asked for a password when using # sudo command by ammending sudoers file RUN echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> \ /etc/sudoers RUN chmod 755 start.sh # Run everything after as non-privileged user. USER browserUser # Application specific environment variables # disp = Display(visible=True, size=(1920, 1080), backend="xvfb", use_xauth=True); disp.start() # set's DISPLAY=:1 ENV DISPLAY=:1 # By default, only screen 0 exists and has the dimensions 1280x1024x8 ENV XVFB_WHD=1920x1080x24 # x11vnc password ENV X11VNC_PASSWORD=test # This variable tells our source code that its invoked within a Docker container ENV DOCKER=1 ENTRYPOINT [ "./start.sh" ] ================================================ FILE: README.md ================================================ # Stealthy Scraping Tools Do not use puppeteer and playwright for scraping. Or any other browser automation framework for that matter. 
[Why?](https://incolumitas.com/2021/05/20/avoid-puppeteer-and-playwright-for-scraping/) We only use the [CDP](https://developer.chrome.com/docs/devtools/) to obtain the page source and to get the absolute coordinates for an arbitrary CSS selector. That is all that is needed for efficient scraping. 1. To obtain the page source of the browser's current page. Implemented in [page_source.js](https://github.com/NikolaiT/stealthy-scraping-tools/blob/main/cdp/page_source.js) 2. To get the absolute coordinates for an arbitrary CSS selector. Implemented in [coords.js](https://github.com/NikolaiT/stealthy-scraping-tools/blob/main/cdp/coords.js) Mouse movements and typing are handled by `pyautogui` or other means, but not with JavaScript or with the CDP! Reason: Browser-based mouse and keyboard emulation is very easily detectable! ## Theory 1. Analyzing key strokes: [TypeNet: Deep Learning Keystroke Biometrics](https://arxiv.org/abs/2101.05570) 2. Research how to mimic human mouse movements: [BeCAPTCHA-Mouse: Synthetic Mouse Trajectories and Improved Bot Detection](https://arxiv.org/abs/2005.00890) ## Full Example The bot challenge that can be found here [bot.incolumitas.com/#botChallenge](https://bot.incolumitas.com/#botChallenge) will be solved in the following quick tutorial. The example code can be found in `example.py`. I am using an Ubuntu 18.04 system with `Python3` (with `pipenv`) and a recent `Node` version. The browser `google-chrome` must be installed. Clone the repo: ``` git clone https://github.com/NikolaiT/stealthy-scraping-tools cd stealthy-scraping-tools ``` Activate an environment with: ```bash pew new -p python3 sst pew workon sst ``` Then install `pyautogui`: ```bash pip install pyautogui ``` Install node modules: ``` npm install chrome-remote-interface ``` And then run the bot with: ```bash python example.py ``` ## Docker The Dockerfile is based on `Ubuntu 20.04`. The Dockerfile uses `xvfb` from the python module `PyVirtualDisplay`.
def someWhereRandomClose(x, y, max_dist=120):
    """
    Find a random position close to (x, y) with maximal dist @max_dist

    Each axis is offset independently by a random whole number of pixels
    taken from [1, max_dist - 1], with a random sign.

    x, y: the coordinate to stay close to; int or float.
    max_dist: exclusive upper bound of the per-axis offset in pixels.

    Returns a tuple (x, y) that lies inside the screen reported by
    pyautogui.size(). Falls back to the unmodified (x, y) when max_dist
    leaves no room for an offset or when none of 16 random candidates is
    on screen.
    """
    # random.randrange(1, max_dist) has no candidates below 2
    if max_dist < 2:
        return (x, y)

    shape = pyautogui.size()
    for _ in range(16):
        randX = random.randrange(1, max_dist)
        randY = random.randrange(1, max_dist)

        if random.random() > 0.5:
            randX *= -1

        if random.random() > 0.5:
            randY *= -1

        newX, newY = x + randX, y + randY
        # Compare numerically instead of using `in range(...)`: a range only
        # contains whole numbers, so a fractional coordinate was never
        # accepted and the jitter silently collapsed to (x, y).
        if 0 <= newX < shape.width and 0 <= newY < shape.height:
            return (newX, newY)

    return (x, y)
def humanTyping(text, speed=(0.01, 0.025), double_hit=False):
    """
    Mostly the keydown/keyup pairs are in order, but
    sometimes we want two keydown's at the same time.

    text: the text to be written in a human fashion. An empty string
      types nothing.

    speed: the gap between key presses in seconds. Random
      number between (low, high). Pass a falsy value (e.g. None) to type
      without any pause.

    double_hit: when True, roughly 30% of the key presses are merged with
      the following character into an overlapping doubleHit().
    """
    i = 0
    # Strictly less than: the former `i <= len(text)` entered the loop for
    # an empty string and failed on text[0].
    while i < len(text):
        if speed:
            time.sleep(random.uniform(*speed))

        if double_hit is True and random.random() < .3 and i + 1 < len(text):
            doubleHit(text[i], text[i + 1])
            i += 2
        else:
            pyautogui.keyDown(text[i])
            pyautogui.keyUp(text[i])
            i += 1
def getCoords(selector, randomize_within_bcr=True, highlight_bb=True):
    """
    Look up the coordinates of the first element matching a CSS selector
    by running cdp/coords.js against the browser.

    - selector: The CSS selector to get the coords for
    - randomize_within_bcr: add a random offset of up to a quarter of the
      reported width / height to x / y
    - highlight_bb: visually highlight the bounding box for debugging purposes

    Returns a tuple (x, y), or None when coords.js did not print a usable
    JSON object (for example because nothing matched the selector).
    Raises subprocess.CalledProcessError when the node process itself fails.
    """
    script_path = getScriptPath('coords.js')
    # Pass an argument list instead of a shell string: a selector that
    # contains a single quote would otherwise terminate the quoted shell
    # argument and break (or inject into) the command line.
    output = subprocess.check_output(['node', script_path, selector])

    try:
        parsed = json.loads(output.decode())
        x, y = parsed['x'], parsed['y']

        if randomize_within_bcr:
            x += random.randint(0, math.floor(parsed['width'] / 4))
            y += random.randint(0, math.floor(parsed['height'] / 4))
    except (ValueError, KeyError, TypeError) as e:
        # ValueError: empty / invalid JSON, KeyError: missing field,
        # TypeError: JSON was not an object with numeric fields
        print('getCoords() failed with Error: {}'.format(e))
        return None

    if highlight_bb:
        # Just add a red thick border around the CSS selector.
        # json.dumps() yields a quoted and escaped string literal, so quotes
        # inside the selector cannot break the JavaScript snippet.
        cmd = 'var el = document.querySelector(' + json.dumps(selector) + \
            '); if (el) { el.style.border = "2px solid #ff0000"; }'
        try:
            evalJS(cmd)
        except (subprocess.CalledProcessError, OSError) as e:
            # the highlight is a debugging aid only, the coordinates are
            # still valid without it
            print('getCoords() could not highlight {}: {}'.format(selector, e))

    return x, y
def closeBrowser():
    """
    Kill the browser with `killall -9`.

    The process name handed to killall is 'Google Chrome' on macOS and
    'google-chrome' on every other platform.
    """
    print('closing browser')
    process_name = 'Google Chrome' if sys.platform == 'darwin' else 'google-chrome'
    os.system("killall -9 '" + process_name + "'")
// Bookkeeping shared by getFrameExecId(), expectLoadFrame() and
// getCoordsIframe(). It is declared at module scope because the two helpers
// are module-level functions: variables declared inside getCoordsIframe()
// are not visible to them.
var frameIdToContextId = {};   // frame id -> execution context id
var frameNameToFrameId = {};   // frame name -> frame id
// set these to wait for a frame to be loaded
var frameWaitName = null;
var frameWaitPromiseResolve = null;

/**
 * Returns the execution context id recorded for the frame named `frame`.
 * Throws an Error when the frame name or its execution context is unknown.
 */
function getFrameExecId(frame) {
  var frameId = frameNameToFrameId[frame];
  if (!frameId)
    throw Error(`Frame ${frame} is unknown`);
  var execId = frameIdToContextId[frameId];
  if (!execId)
    throw Error(`Frame ${frame} (${frameId}) has no executionContextId`);
  return execId;
}

/**
 * Resolves once the frame named `name` is known and has an execution
 * context. Rejects with the string "timed out waiting for frame load"
 * after `timeout` milliseconds.
 */
function expectLoadFrame(name, timeout) {
  return new Promise((resolve, reject) => {
    let timedOut = false;
    const tm = setTimeout(() => {
      timedOut = true;
      // forget the pending wait so that a late frameNavigated event is ignored
      if (frameWaitName === name) {
        frameWaitName = null;
        frameWaitPromiseResolve = null;
      }
      reject("timed out waiting for frame load");
    }, timeout);

    // A frame that is already known does not have to navigate again.
    // Otherwise let the Page.frameNavigated() handler resolve this promise.
    const navigated = frameNameToFrameId[name]
      ? Promise.resolve()
      : new Promise((fwpResolve) => {
          frameWaitName = name;
          frameWaitPromiseResolve = fwpResolve;
        });

    navigated.then(() => {
      // For the frame to be fully valid for queries, it also needs the corresponding
      // executionContextCreated() signal. This might happen before or after
      // frameNavigated(), so wait in case it happens afterwards.
      function pollExecId() {
        if (timedOut) {
          // stop polling, otherwise the timer keeps the process alive forever
          return;
        }
        if (frameIdToContextId[frameNameToFrameId[name]]) {
          clearTimeout(tm);
          resolve();
        } else {
          setTimeout(pollExecId, 100);
        }
      }
      pollExecId();
    });
  });
}

/**
 * Prints (as JSON on stdout) and returns {x, y, width, height} for the first
 * element matching `css_selector` inside the frame named `iframe`.
 * Returns null when no client rect could be obtained and undefined when an
 * error was caught (the error is logged to stderr).
 */
async function getCoordsIframe(css_selector, iframe) {
  let client;
  try {
    // connect to endpoint
    client = await CDP();
    // extract domains
    const { Page, Runtime, DOM } = client;

    // start every lookup with empty bookkeeping
    frameIdToContextId = {};
    frameNameToFrameId = {};
    frameWaitName = null;
    frameWaitPromiseResolve = null;

    // Register the handlers before enabling the domains, so that the
    // execution contexts reported while Runtime gets enabled are not missed.

    // map frame names to frame IDs; root frame has no name, no need to track that
    Page.frameNavigated(info => {
      if (info.frame.name)
        frameNameToFrameId[info.frame.name] = info.frame.id;

      // were we waiting for this frame to be loaded?
      if (frameWaitPromiseResolve && frameWaitName === info.frame.name) {
        frameWaitPromiseResolve();
        frameWaitPromiseResolve = null;
      }
    });

    // track execution contexts so that we can map between context and frame IDs
    Runtime.executionContextCreated(info => {
      const auxData = info.context.auxData;
      if (auxData && auxData.frameId)
        frameIdToContextId[auxData.frameId] = info.context.id;
    });

    Runtime.executionContextDestroyed(info => {
      for (let frameId in frameIdToContextId) {
        if (frameIdToContextId[frameId] == info.executionContextId) {
          delete frameIdToContextId[frameId];
          break;
        }
      }
    });

    // enable events then start!
    await Promise.all([Page.enable(), Runtime.enable(), DOM.enable()]);

    // Frames that finished loading before we connected never emit another
    // frameNavigated event, so take their names from the current frame tree.
    const { frameTree } = await Page.getFrameTree();
    const pending = [frameTree];
    while (pending.length > 0) {
      const node = pending.pop();
      if (node.frame.name)
        frameNameToFrameId[node.frame.name] = node.frame.id;
      for (const child of node.childFrames || [])
        pending.push(child);
    }

    // JSON.stringify() produces a quoted, escaped string literal, so quotes
    // inside the selector cannot break the evaluated expression.
    let clientRectCmd = `var targetCoordEl = document.querySelector(${JSON.stringify(css_selector)}); if (targetCoordEl) { JSON.stringify(targetCoordEl.getClientRects()); }`;

    await expectLoadFrame(iframe, 2000);
    let result = await Runtime.evaluate({
      expression: clientRectCmd,
      contextId: getFrameExecId(iframe),
    });
    // diagnostics go to stderr, stdout carries only the JSON result
    console.error(result);

    // get offset screen positioning
    const screenPos = await Runtime.evaluate({
      expression: "JSON.stringify({offsetY: window.screen.height - window.innerHeight, offsetX: window.screen.width - window.innerWidth})"
    });

    let offset = JSON.parse(screenPos.result.value);
    let clientRect = null;

    try {
      clientRect = JSON.parse(result.result.value)["0"];
    } catch(err) {
      return null;
    }

    // element exists but has no client rects
    if (!clientRect) {
      return null;
    }

    // NOTE(review): the rect is evaluated inside the frame's context while
    // the offset comes from the top-level window; the position of the
    // <iframe> element itself does not appear to be added - confirm
    // before relying on these coordinates.
    let retVal = {
      x: offset.offsetX + clientRect.x,
      y: offset.offsetY + clientRect.y,
      width: clientRect.width,
      height: clientRect.height,
    };
    console.log(JSON.stringify(retVal));
    return retVal;
  } catch (err) {
    console.error(err);
  } finally {
    if (client) {
      await client.close();
    }
  }
}
/**
 * Navigates the browser's current page to `url` over the Chrome DevTools
 * Protocol.
 *
 * Resolves to 'ok' once Page.navigate() has returned. When an error is
 * caught it is logged with console.error() and the function resolves to
 * undefined. The CDP connection is closed in every case.
 */
async function pageNav(url) {
  let connection = null;
  try {
    // connect to endpoint
    connection = await CDP();
    // enable the Page, Runtime and DOM domains before navigating
    const domains = [connection.Page, connection.Runtime, connection.DOM];
    await Promise.all(domains.map((domain) => domain.enable()));
    // send the browser to the requested address
    await connection.Page.navigate({ url });
    return 'ok';
  } catch (err) {
    console.error(err);
  } finally {
    if (connection) await connection.close();
  }
}
def main():
    """
    Start the browser on www.hetzner.com, move the mouse onto the page once
    and print the size of the page source.

    The browser is closed again even when one of the steps raises.
    """
    print('Trying to start browser')
    # No trailing newline after the URL: startBrowser() joins the arguments
    # into one shell command line, and a newline would end that command
    # before the output redirection and the `&` that follow the arguments.
    startBrowser(['www.hetzner.com'])

    try:
        # do a bit of random moving around
        # to fool bot systems
        coords = getCoords('body')
        print('Clicking on coordinates ' + str(coords))
        # getCoords() returns None when the lookup failed; there is
        # nothing to move to in that case
        if coords is not None:
            humanMove(*coords)
        time.sleep(random.uniform(0.5, 1.0))

        # finally get the page source
        text = getPageSource()
        print('Got {} bytes of HTML data'.format(len(text)))
    finally:
        # close the browser
        closeBrowser()
def getKey():
    """
    Read the current page source via `/usr/bin/node page_source` and extract
    the key from it.

    Returns the string 'done' when the page source contains 'Not found',
    otherwise the first run of 32 characters out of [0-9a-z].

    Raises ValueError when the page source contains neither, and
    subprocess.CalledProcessError when the node process fails.
    """
    # fixed argument list, no shell needed
    ps = subprocess.check_output(['/usr/bin/node', 'page_source']).decode('utf8').strip()

    if 'Not found' in ps:
        return 'done'

    key = re.search(r'[0-9a-z]{32}', ps)
    if key is None:
        # re.search() returns None without a match; fail with a message that
        # names the problem instead of an AttributeError on None
        raise ValueError('No 32 character key found in page source')
    return key.group(0)
def _requireCoords(selector):
    """Return the coordinates for selector or raise if the lookup failed."""
    coords = getCoords(selector)
    if coords is None:
        raise RuntimeError('Cannot find element for selector: {}'.format(selector))
    return coords


def main():
    """
    Solve the bot challenge on bot.incolumitas.com: open the challenge, fill
    in user name and e-mail, tick the terms, pick the cat, submit and print
    the size of the resulting page source.

    Raises RuntimeError when one of the form elements cannot be located.
    """
    print('Trying to start browser')
    # No trailing newline after the URL: startBrowser() joins the arguments
    # into one shell command line, and a newline would end that command
    # before the output redirection and the `&` that follow the arguments.
    startBrowser(['bot.incolumitas.com'])

    # click link to get to the challenge
    print('Trying to click challenge link')
    coords = _requireCoords('li:nth-of-type(3) a')
    print('Clicking on coordinates ' + str(coords))
    humanMove(*coords)
    time.sleep(random.uniform(0.5, 1.0))

    # enter username
    username = _requireCoords('input[name="userName"]')
    humanMove(*username, clicks=2)
    time.sleep(random.uniform(0.25, 1.25))
    humanTyping('IamNotABotISwear\n', speed=(0.005, 0.008))

    time.sleep(random.uniform(0.5, 1.0))

    # enter email
    email = _requireCoords('input[name="eMail"]')
    humanMove(*email, clicks=3)
    time.sleep(random.uniform(0.25, 1.25))
    humanTyping('bot@spambot.com\n', speed=(0.005, 0.008))

    time.sleep(random.uniform(0.5, 1.0))

    # agree to the terms
    terms = _requireCoords('input[name="terms"]')
    humanMove(*terms)

    # select cats
    cat = _requireCoords('#bigCat')
    humanMove(*cat)

    # submit
    submit = _requireCoords('#submit')
    humanMove(*submit)

    # press the final enter
    time.sleep(random.uniform(2.5, 3.4))
    humanTyping('\n', speed=(0.005, 0.008))

    # finally get the page source
    text = getPageSource()
    print('Got {} bytes of page soure'.format(len(text)))
def moveRandomly(steps=2):
    """
    Move the mouse to `steps` random screen positions without clicking.

    The x coordinate is drawn from [0, min(1920, screen width) - 50) and the
    y coordinate from [0, screen height - 50). After every move the function
    sleeps between 0.25 and 1.0 seconds.
    """
    screen_w, screen_h = getDim()
    max_x = min(1920, screen_w) - 50
    max_y = screen_h - 50

    # this is where the bot check is happening
    # move the mouse a bit
    for _ in range(steps):
        target_x = random.randrange(0, max_x)
        target_y = random.randrange(0, max_y)
        humanMove(target_x, target_y, clicks=0, steps=2)
        time.sleep(random.uniform(0.25, 1.0))
time.sleep(random.uniform(1.5, 1.5)) no_pets = getCoords('[for="contactForm-hasPets.no"]') humanMove(*no_pets, clicks=1) submit = getCoords('button.button-primary.padding-horizontal-m') humanMove(*submit, clicks=1) else: submit = getCoords('button[data-qa="sendButtonBasic"]') humanMove(*submit, clicks=1) time.sleep(random.uniform(3.9, 5.9)) return True def is_detected(): detected = json.loads(evalJS("JSON.stringify(document.body.textContent.includes('Warum haben wir deine Anfrage blockiert?'));")) == True other = json.loads(evalJS("JSON.stringify(document.body.textContent.includes('Sicherheitsabfrage'));")) == True if detected or other: print('Got detected as a bot. Aborting.') sys.exit(0) return True else: return False def main(): if os.getenv('DOCKER') == '1': startFluxbox() startVNC() # startBrowser(args=['--incognito']) startBrowser(args=[]) if os.getenv('DOCKER') == '1': # close the annoying chrome error message bar # it skews with coordinates # x:1903 y:114 screen:0 window:195035139 # x:1889 y:113 screen:0 window:195035139 humanMove(1893, 103) humanMove(1889, 103) time.sleep(random.uniform(2.5, 3.5)) try: goto('https://www.immobilienscout24.de') moveRandomly() # are there cookies to accept? 
# cookie consent is in an iframe with id '#gdpr-consent-notice' # coords = getCoords('button#save', '#gdpr-consent-notice') coords = 1099, 859 print(f'Accept Cookies by clicking at {coords}') humanMove(*coords) time.sleep(random.uniform(3.5, 4.5)) # login with username and password profile_button = getCoords('#link_loginAccountLink') humanMove(*profile_button, clicks=0) time.sleep(random.uniform(0.5, 2)) login_button = getCoords("#is24-dropdown > div.MyscoutDropdownV2_LoginContainer__3X0hy.topnavigation__sso-login__link-list--logged-out > a") # if login button not visible, we are logged in probably if login_button: humanMove(*login_button, clicks=1) time.sleep(random.uniform(2.5, 3)) user_input = getCoords('#username') if not user_input: raise Exception('Cannot find username input field by id #username') time.sleep(random.uniform(1.5, 2)) humanMove(*user_input, clicks=1) time.sleep(random.uniform(0.25, 1.25)) typeNormal(immo_env.EMAIL) time.sleep(random.uniform(0.25, 1.25)) humanMove(*getCoords('#submit'), clicks=1) time.sleep(random.uniform(2.25, 3.25)) humanMove(*getCoords('#password'), clicks=1) time.sleep(random.uniform(0.25, 1.25)) typeNormal(immo_env.PASSWORD) time.sleep(random.uniform(1.25, 2.25)) humanMove(*getCoords('#loginOrRegistration'), clicks=1) time.sleep(random.uniform(2.25, 3.55)) goto(SEARCH_URL) humanScroll(8, (5, 20), -1) # finally parse the listings parse_listings = """var res = []; document.querySelectorAll(".result-list__listing").forEach((el) => { let title = el.querySelector(".result-list-entry__brand-title"); let details = el.querySelector(".result-list-entry__criteria"); if (title) { let obj = { contacted: false, title: title.textContent, url: el.querySelector("a.result-list-entry__brand-title-container").getAttribute("href"), }; if (details) { obj.location = el.querySelector(".result-list-entry__map-link.link-text-secondary.font-normal.font-ellipsis").textContent; obj.price = 
details.querySelector("dl.grid-item:nth-child(1)").textContent; obj.area = details.querySelector("dl.grid-item:nth-child(2)").textContent; obj.rooms = details.querySelector("dl.grid-item:nth-child(3)").textContent; } res.push(obj); } }); JSON.stringify(res);""" output = evalJS(parse_listings) listings = json.loads(output) # pprint.pprint(listings) filtered_listings = {} for el in listings: if el.get('url'): key = el.get('url') location = el.get('location', '').lower().strip() if immo_env.FILTER_LISTINGS: for pref in immo_env.PREFERRED_LOCATIONS: if pref.lower().strip() in location: filtered_listings[key] = el else: filtered_listings[key] = el # remove listings we already contacted for key in apartments: if key in filtered_listings: if apartments[key].get('contacted', False): print('already contacted listing ' + key) del filtered_listings[key] pprint.pprint(filtered_listings) print('contacting {} listings'.format(len(filtered_listings))) for key in filtered_listings: try: contacted = contact(filtered_listings[key]) except Exception as e: print('Failed to contact {}. Blocked? Error: {}'.format(key, str(e))) is_detected() filtered_listings[key]['contacted'] = contacted time.sleep(random.uniform(0.5, 1.25)) # update? for k, v in filtered_listings.items(): apartments[k] = v with open('apartments.json', 'w') as f: json.dump(apartments, f) print('Updated database') closeBrowser() except Exception as e: print('Error: {}'.format(e)) is_detected() if __name__ == '__main__': main() ================================================ FILE: local_forward_proxy_server/proxy_server.js ================================================ const ProxyChain = require('proxy-chain'); async function startProxyServer(proxy) { return new Promise(function(resolve, reject) { const server = new ProxyChain.Server({ // Port where the server will listen. By default 8947. 
def _pause(low, high):
    """Sleep for a random duration between ``low`` and ``high`` seconds."""
    time.sleep(random.uniform(low, high))


def _click_element(selector, label, **move_kwargs):
    """Look up ``selector``, print its coordinates after ``label`` and move there.

    Extra keyword arguments are forwarded unchanged to ``humanMove``.
    """
    position = getCoords(selector)
    print(label + str(position))
    humanMove(*position, **move_kwargs)


def startFluxbox():
    """Run the fluxbox window manager in the background, then wait 3 seconds."""
    os.system('fluxbox &')
    time.sleep(3)


def startVNC():
    """Run an x11vnc server in the background for remote debugging.

    Uses the ``DISPLAY`` and ``X11VNC_PASSWORD`` environment variables.
    """
    display = os.environ['DISPLAY']
    password = os.environ['X11VNC_PASSWORD']
    vnc_cmd = 'x11vnc -display {}.0 -forever -passwd {} &'.format(display, password)
    print(vnc_cmd)
    os.system(vnc_cmd)


def main():
    """Run 150 flight searches on the Lufthansa homepage.

    Each round loads the homepage, types a random destination, clicks the
    return-date input, picks one of two fixed date tiles, submits the form
    and reports whether the results calendar showed up.
    """
    in_docker = os.getenv('DOCKER') == '1'
    if in_docker:
        startFluxbox()
        startVNC()

    startBrowser(args=[])

    if in_docker:
        # close the annoying chrome error message bar
        # it skews with coordinates
        # x:1903 y:114 screen:0 window:195035139
        # x:1889 y:113 screen:0 window:195035139
        humanMove(1893, 103)
        humanMove(1889, 103)

    _pause(2.5, 3.5)

    destinations = ['Berlin', 'Paris', 'Tel Aviv', 'Stockholm', 'Bogota', 'Bangkok', 'New York']
    date_tiles = [
        '[aria-label^="Choose Samstag, 25 Dezember 2021"]',
        '[aria-label^="Choose Sonntag, 26 Dezember 2021"]',
    ]

    for attempt in range(150):
        print(f'[{attempt}] Searching for flights...')
        _pause(0.5, 1.0)
        goto('https://www.lufthansa.com/de/de/homepage')
        _pause(4, 6)

        # the cookie banner is only handled in the very first round
        if attempt == 0:
            try:
                cookie_accept = getCoords('#cm-acceptAll')
                if cookie_accept:
                    humanMove(*cookie_accept, clicks=1)
                    _pause(0.25, 1.25)
            except Exception:
                print('No cookies to accept, #cm-acceptAll not found')

        # type a destination into the "Nach" input and confirm the first suggestion
        try:
            _click_element('input[placeholder="Nach"]', 'Enter Departure ', clicks=2)
            _pause(0.25, 1.25)
            typeNormal(random.choice(destinations))
            _pause(1.5, 2.5)
            press('down')
            _pause(0.5, 1.0)
            press('enter')
            _pause(0.5, 1.0)
        except Exception:
            print(f'[{attempt}] Could not enter flight destination. Blocked?')
            continue

        # click the return-date input
        try:
            _click_element('input[placeholder="Rückflugdatum"]', 'backdate ', clicks=1)
            _pause(4.55, 5.55)
        except Exception:
            print(f'[{attempt}] Could not click on return value. Leaving untouched.')

        # pick one of the two fixed date tiles
        try:
            _click_element(random.choice(date_tiles), 'datetile ', clicks=1)
            _pause(2.25, 3.25)
        except Exception:
            print(f'[{attempt}] Could not select return date. Keeping default value.')

        # submit
        try:
            _click_element('[type="submit"]', 'Submit ')
        except Exception:
            print(f'[{attempt}] Could not submit search. Blocked?')
            continue

        # wait for quite some time
        _pause(10, 14)
        humanScroll(2, (5, 20), -1)

        try:
            if getCoords('#page .calendarTab'):
                print(f'[{attempt}] Flight Results loaded!')
        except Exception:
            print(f'[{attempt}] Could not find calendar for flights. Page load to slow?')


if __name__ == '__main__':
    main()
$(xxd -l 16 -p /dev/urandom) # # To view a listing of the .Xauthority file, enter the following # xauth list echo "Blocking all UDP traffic except DNS"; id # https://serverfault.com/questions/222606/how-can-i-reject-all-incoming-udp-packets-except-for-dns-lookups/716035 # how can I reject all traffic I didn't initiate with Linux netfilter? sudo iptables --version sudo iptables -A DOCKER-USER -m state --state ESTABLISHED,RELATED -j ACCEPT sudo iptables -A DOCKER-USER -p udp --dport 53 -j ACCEPT -m comment --comment "we serve DNS" sudo iptables -A DOCKER-USER -p tcp --dport 53 -j ACCEPT -m comment --comment "DNS uses TCP too sometimes" sudo iptables -A DOCKER-USER -j DROP echo "Starting browser"; # Avoid chrome in docker crashing: https://github.com/stephen-fox/chrome-docker/issues/8 # Option 1: Run chrome with --disable-dev-shm-usage # Option 2: Set /dev/shm size to a reasonable amount docker run -it --shm-size=1g replacing 1g with whatever amount you want. # google-chrome --remote-debugging-port=9222 --no-sandbox --disable-notifications --start-maximized --no-first-run --no-default-browser-check --incognito & # chrome=$! sleep 5 # https://abhishekvaid13.medium.com/pyautogui-headless-docker-mode-without-display-in-python-480480599fc4 echo "Running bot"; python3 -u immobilienscout24.py & python=$! # echo "Starting x11vnc"; # x11vnc -display $DISPLAY.0 -forever -passwd ${X11VNC_PASSWORD:-password} & # vnc_server=$! 
"""Manual check: run the listing-extraction snippet through evalJS and
pretty-print the parsed result."""
import json
import pprint

# sst_utils lives in the behavior/ package; the other scripts in the
# repository root import it the same way.
from behavior.sst_utils import *

# JavaScript that collects title, url and (when present) price, area and
# rooms of every ".result-list__listing" element and returns them as JSON.
parse_listings = """var res = [];
document.querySelectorAll(".result-list__listing").forEach((el) => {
  let title = el.querySelector(".result-list-entry__brand-title");
  let details = el.querySelector(".result-list-entry__criteria");
  if (title) {
    let obj = {
      title: title.textContent,
      url: el.querySelector("a.result-list-entry__brand-title-container").getAttribute("href"),
    };
    if (details) {
      obj.price = details.querySelector("dl.grid-item:nth-child(1)").textContent;
      obj.area = details.querySelector("dl.grid-item:nth-child(2)").textContent;
      obj.rooms = details.querySelector("dl.grid-item:nth-child(3)").textContent;
    }
    res.push(obj);
  }
});
JSON.stringify(res);"""

listings = evalJS(parse_listings)
pprint.pprint(json.loads(listings))