[
  {
    "path": ".gitignore",
    "content": "cdp/node_modules/*\ntarget.py\nlocal_forward_proxy_server/node_modules/\nlocal_forward_proxy_server/node_modules/*\n\ntrainline.py\nimmo_env.env\nimmo_env.py\napartments.json\n\ndeploy.sh\nexclude.txt\n\nimmo_env.py\n\n*.pyc\n__pycache__/*\n\n\n# Byte-compiled / optimized / DLL files\n__pycache__/\n*.py[cod]\n*$py.class\n\n# C extensions\n*.so\n\n# Distribution / packaging\n.Python\nbuild/\ndevelop-eggs/\ndist/\ndownloads/\neggs/\n.eggs/\nlib/\nlib64/\nparts/\nsdist/\nvar/\nwheels/\nshare/python-wheels/\n*.egg-info/\n.installed.cfg\n*.egg\nMANIFEST\n\n# PyInstaller\n#  Usually these files are written by a python script from a template\n#  before PyInstaller builds the exe, so as to inject date/other infos into it.\n*.manifest\n*.spec\n\n# Installer logs\npip-log.txt\npip-delete-this-directory.txt\n\n# Unit test / coverage reports\nhtmlcov/\n.tox/\n.nox/\n.coverage\n.coverage.*\n.cache\nnosetests.xml\ncoverage.xml\n*.cover\n*.py,cover\n.hypothesis/\n.pytest_cache/\ncover/\n\n# Translations\n*.mo\n*.pot\n\n# Django stuff:\n*.log\nlocal_settings.py\ndb.sqlite3\ndb.sqlite3-journal\n\n# Flask stuff:\ninstance/\n.webassets-cache\n\n# Scrapy stuff:\n.scrapy\n\n# Sphinx documentation\ndocs/_build/\n\n# PyBuilder\n.pybuilder/\ntarget/\n\n# Jupyter Notebook\n.ipynb_checkpoints\n\n# IPython\nprofile_default/\nipython_config.py\n\n# pyenv\n#   For a library or package, you might want to ignore these files since the code is\n#   intended to run in multiple environments; otherwise, check them in:\n# .python-version\n\n# pipenv\n#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.\n#   However, in case of collaboration, if having platform-specific dependencies or dependencies\n#   having no cross-platform support, pipenv may install dependencies that don't work, or not\n#   install all needed dependencies.\n#Pipfile.lock\n\n# poetry\n#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in 
version control.\n#   This is especially recommended for binary packages to ensure reproducibility, and is more\n#   commonly ignored for libraries.\n#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control\n#poetry.lock\n\n# PEP 582; used by e.g. github.com/David-OConnor/pyflow\n__pypackages__/\n\n# Celery stuff\ncelerybeat-schedule\ncelerybeat.pid\n\n# SageMath parsed files\n*.sage.py\n\n# Environments\n.env\n.venv\nenv/\nvenv/\nENV/\nenv.bak/\nvenv.bak/\n\n# Spyder project settings\n.spyderproject\n.spyproject\n\n# Rope project settings\n.ropeproject\n\n# mkdocs documentation\n/site\n\n# mypy\n.mypy_cache/\n.dmypy.json\ndmypy.json\n\n# Pyre type checker\n.pyre/\n\n# pytype static type analyzer\n.pytype/\n\n# Cython debug symbols\ncython_debug/\n\n# PyCharm\n#  JetBrains specific template is maintainted in a separate JetBrains.gitignore that can\n#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore\n#  and can be added to the global gitignore or merged into this file.  For a more nuclear\n#  option (not recommended) you can uncomment the following to ignore the entire idea folder.\n#.idea/"
  },
  {
    "path": "Dockerfile",
    "content": "FROM ubuntu:20.04\n\n# Set correct timezone\nENV TZ=Europe/Berlin\nRUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone\n\nRUN apt-get update && apt-get install python3 tesseract-ocr python3-pip curl unzip -yf\n\n# Install Chrome\nRUN apt-get update -y\nRUN apt-get install -y dbus-x11\nRUN curl https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb -o /chrome.deb\nRUN dpkg -i /chrome.deb || apt-get install -yf\nRUN rm /chrome.deb\n\nRUN apt-get install -y poppler-utils\nRUN apt-get clean\nRUN DEBIAN_FRONTEND=noninteractive apt install -y python3-xlib xvfb xserver-xephyr python3-tk python3-dev\n\n# https://github.com/puppeteer/puppeteer/issues/5429\nRUN DEBIAN_FRONTEND=noninteractive apt-get -y install wget libcairo2-dev \\\n   libjpeg-dev libpango1.0-dev libgif-dev build-essential g++ libgl1-mesa-dev libxi-dev \\\n   libx11-dev pulseaudio udev\n\nRUN apt update && apt install -y postgresql-server-dev-12\n\nRUN curl --silent --location https://deb.nodesource.com/setup_14.x | bash - &&\\\n  apt-get -y -qq install nodejs\n\n# Move this into requirements.txt at some time\nRUN pip3 install pyautogui python-xlib PyVirtualDisplay\n\nRUN apt-get install -y fonts-roboto fonts-ubuntu ttf-bitstream-vera fonts-crosextra-caladea fonts-cantarell fonts-open-sans ttf-wqy-zenhei\n\n# install debs error if combine together\nRUN apt install -y --no-install-recommends --allow-unauthenticated x11vnc fluxbox xxd \\\n    && apt autoclean -y \\\n    && apt autoremove -y \\\n    && rm -rf /var/lib/apt/lists/*\n\n\nRUN apt-get update -y && apt install -y iptables sudo\n\nCOPY . 
.\n\n# https://dev.to/emmanuelnk/using-sudo-without-password-prompt-as-non-root-docker-user-52bg\n# Create new user `docker` and disable \n# password and gecos for later\n# --gecos explained well here:\n# https://askubuntu.com/a/1195288/635348\nRUN adduser --force-badname --disabled-password --gecos '' browserUser\n\n# Add a user to run the browser as non-root\nRUN mkdir -p /home/browserUser/Downloads \\\n  && chown -R browserUser:browserUser /home/browserUser\n\nRUN adduser browserUser sudo\n\n# Ensure sudo group users are not \n# asked for a password when using \n# sudo command by ammending sudoers file\nRUN echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> \\\n/etc/sudoers\n\nRUN chmod 755 start.sh\n\n# Run everything after as non-privileged user.\nUSER browserUser\n\n# Application specific environment variables\n# disp = Display(visible=True, size=(1920, 1080), backend=\"xvfb\", use_xauth=True); disp.start()\n# set's DISPLAY=:1\nENV DISPLAY=:1\n# By default, only screen 0 exists and has the dimensions 1280x1024x8\nENV XVFB_WHD=1920x1080x24\n# x11vnc password\nENV X11VNC_PASSWORD=test\n# This variable tells our source code that its invoked within a Docker container\nENV DOCKER=1\n\nENTRYPOINT [ \"./start.sh\" ]\n"
  },
  {
    "path": "README.md",
    "content": "# Stealthy Scraping Tools\n\nDo not use puppeteer and playwright for scraping. Or any other browser automation framework for that matter. [Why?](https://incolumitas.com/2021/05/20/avoid-puppeteer-and-playwright-for-scraping/)\n\nWe only use the [CDP](https://developer.chrome.com/docs/devtools/) to obtain the page source and to get the absolute coordinates for an arbitrary CSS selector. That is all that is needed for efficient scraping.\n\n1. To obtain the page source of the browser's current page. Implemented in [page_source.js](https://github.com/NikolaiT/stealthy-scraping-tools/blob/main/page_source.js)\n2. To get the absolute coordinates for an arbitrary CSS selector. Implemented in [coords.js](https://github.com/NikolaiT/stealthy-scraping-tools/blob/main/coords.js)\n\nMouse movements and typing are handled by `pyautogui` or other means, but not with JavaScript or with the CDP! Reason: Browser-based mouse and keyboard emulation is very easily detectable!\n\n## Theory\n\n1. Analyzing key strokes: [TypeNet: Deep Learning Keystroke Biometrics](https://arxiv.org/abs/2101.05570)\n2. 
Research how to mimic human mouse movements: [BeCAPTCHA-Mouse: Synthetic Mouse Trajectories and Improved Bot Detection](https://arxiv.org/abs/2005.00890)\n\n## Full Example\n\nThe bot challenge that can be found here [bot.incolumitas.com/#botChallenge](https://bot.incolumitas.com/#botChallenge) will be solved in the following quick tutorial.\n\nThe example code can be found in `example.py`.\n\nI am using an Ubuntu 18.04 system with `Python3` (with `pipenv`) and a recent `Node` version.\n\nThe browser `google-chrome` must be installed.\n\nClone the repo:\n\n```\ngit clone https://github.com/NikolaiT/stealthy-scraping-tools\ncd stealthy-scraping-tools\n```\n\nActivate an environment with:\n\n```bash\npew new -p python3 sst\n\npew workon sst\n```\n\nThen install `pyautogui`:\n\n```bash\npip install pyautogui\n```\n\nInstall node modules:\n\n```\nnpm install chrome-remote-interface\n```\n\nAnd then run the bot with:\n\n```python\npython example.py\n```\n\n## Docker\n\nThe Dockerfile is based on `Ubuntu 20.04`.\n\nThe Dockerfile uses `xvfb` from the python module `PyVirtualDisplay`.\n\nI use `pyautogui` for mouse and keyboard automation.\n\nI use `fluxbox` as a tiny window manager and `x11vnc` (Virtual Network Computing server program) as a means to inspect the docker image and see what is going on.\n\nBuild the Dockerfile:\n\n```\ndocker build -t sst:0.0.1 .\n```\n\nHint: Avoid chrome in docker crashing: <https://github.com/stephen-fox/chrome-docker/issues/8>\n\n```\n1. Option 1: Run chrome with --disable-dev-shm-usage\n2. 
Option 2: Set /dev/shm size to a reasonable amount `docker run -it --shm-size=1g` replacing 1g with whatever amount you want.\n```\n\nThe docker option `--shm-size=2g` is really important:\n\n```\ndocker run --cap-add=NET_ADMIN --network=\"host\" --shm-size=2g sst:0.0.1\n```\n\n## TODO\n\n+ Look at Kernel/OS level mouse/keyboard control commands (Ditch `pyautogui`)\n+ Use the math from [ghost-cursor](https://github.com/Xetera/ghost-cursor)\n+ Create a set of typing recordings and use it to derive rules for bot writing\n"
  },
  {
    "path": "behavior/behavior.py",
    "content": "import random\nimport time\nimport os\n\nif os.getenv('DOCKER') == '1':\n  from pyvirtualdisplay.display import Display\n  import os\n  import time\n\n  disp = Display(visible=True, size=(1920, 1080), backend=\"xvfb\", use_xauth=True)\n  disp.start()\n\n  print('Started display!')\n  print('DISPLAY={}'.format(os.environ['DISPLAY']))\n\n  import Xlib.display\n  import pyautogui\n  pyautogui._pyautogui_x11._display = Xlib.display.Display(os.environ['DISPLAY'])\n  pyautogui.FAILSAFE = True\nelse:\n  import pyautogui\n  # When fail-safe mode is True, moving the mouse to the upper-left\n  # will raise a pyautogui.FailSafeException that can abort your program:\n  pyautogui.FAILSAFE = True\n\n\ndef tinySleep():\n  time.sleep(random.uniform(0.075, 0.329))\n\n\ndef getDim():\n  # current screen resolution width and height\n  return pyautogui.size()\n\n\ndef someWhereRandomClose(x, y, max_dist=120):\n  \"\"\"\n  Find a random position close to (x, y)\n  with maximal dist @max_dist\n  \"\"\"\n  shape = pyautogui.size()\n  cnt = 0\n\n  while True:\n    randX = random.randrange(1, max_dist)\n    randY = random.randrange(1, max_dist)\n\n    if random.random() > 0.5:\n      randX *= -1\n\n    if random.random() > 0.5:\n      randY *= -1\n\n    if x + randX in range(0, shape.width) and y + randY in range(0, shape.height):\n      return (x + randX, y + randY)\n\n    cnt += 1\n\n    if cnt > 15:\n      return (x, y)\n\n\ndef humanMove(x, y, clicks=1, steps=1):\n  \"\"\"\n  Moves like a human to the coordinate (x, y) and\n  clicks on the coordinate.\n\n  Randomizes move time and the move type.\n\n  Visits one intermediate coordiante close to the target before\n  fine correcting and clicking on the target coordinates.\n  \"\"\"\n  width, height = getDim()\n\n  if steps > 1: # kek\n    far_x, far_y = someWhereRandomClose(x, y, min(width, 600))\n    pyautogui.moveTo(far_x, far_y, random.uniform(0.35, .55), pyautogui.easeOutQuad)\n    tinySleep()\n\n  if steps > 0:\n    
closer_x, closer_y = someWhereRandomClose(x, y, min(width, 400))\n    pyautogui.moveTo(closer_x, closer_y, random.uniform(0.25, .40), pyautogui.easeOutQuad)\n\n  # move to an intermediate target close to the destination\n  # start fast, end slow\n  close_x, close_y = someWhereRandomClose(x, y, 50)\n  pyautogui.moveTo(close_x, close_y, random.uniform(.25, .45), pyautogui.easeOutQuad)\n\n  # click on the main target\n  pyautogui.moveTo(x, y, random.uniform(.22, .35))\n  tinySleep()\n  pyautogui.click(clicks=clicks)\n\n\ndef humanScroll(steps, clicks=(5, 20), direction=1):\n  for i in range(steps):\n    ran_click = random.uniform(*clicks)\n    pyautogui.scroll(direction * ran_click)\n    time.sleep(random.uniform(0.5, 1.329))\n\n\ndef tinySleep():\n  time.sleep(random.uniform(0.005, 0.009))\n\n\ndef doubleHit(key1, key2):\n  \"\"\"\n  Sometimes press two keys down at the same time and randomize the\n  order of the corresponding key up events to resemble\n  human typign closer.\n  \"\"\"\n  pyautogui.keyDown(key1)\n  tinySleep()\n  pyautogui.keyDown(key2)\n  tinySleep()\n  if random.random() > 0.5:\n    pyautogui.keyUp(key1)\n    tinySleep()\n    pyautogui.keyUp(key2)\n  else:\n    pyautogui.keyUp(key2)\n    tinySleep()\n    pyautogui.keyUp(key1)\n\n\ndef humanTyping(text, speed=(0.01, 0.025), double_hit=False):\n  \"\"\"\n  Mostly the keydown/keyup pairs are in order, but\n  sometimes we want two keydown's at the same time.\n\n  text: the text to be written in a human fashion.\n\n  speed: the gap between key presses in seconds. 
Random number between\n    (low, high)\n  \"\"\"\n  i = 0\n  while i <= len(text):\n    if speed:\n      time.sleep(random.uniform(*speed))\n\n    if double_hit is True and random.random() < .3 and i+1 < len(text):\n      doubleHit(text[i], text[i+1])\n      i += 2\n    else:\n      pyautogui.keyDown(text[i])\n      # tinySleep()\n      pyautogui.keyUp(text[i])\n      i += 1\n\n    if i >= len(text):\n      break\n\n\ndef clickNormal(clicks=1):\n  pyautogui.click(clicks=clicks, interval=0.25)\n\n\ndef typeNormal(text):\n  pyautogui.write(text, interval=random.uniform(0.15, 0.25))\n\n\ndef fastwrite(text):\n  pyautogui.write(text, interval=random.uniform(0.045, 0.075))\n\n\ndef press(char):\n  pyautogui.press('char', presses=1)\n\n\ndef typeWrite(l):\n  pyautogui.typewrite(l, interval=0.22)\n\n\ndef press(key):\n  pyautogui.press(key)\n"
  },
  {
    "path": "behavior/human_replay.py",
    "content": "\"\"\"\nReal human recorded behavior is replayed.\n\nMouse movements exclusively.\n\"\"\"\n"
  },
  {
    "path": "behavior/sst_utils.py",
    "content": "import time\nimport os\nimport sys\nimport random\nimport math\nimport json\nimport subprocess\nimport subprocess\nfrom pathlib import Path\n\n\ndef goto(url):\n    script_path = getScriptPath('goto.js')\n    cmd = f\"node {script_path} '{url}'\"\n    ps = subprocess.check_output(cmd, shell=True)\n    return ps\n\n\ndef getScriptPath(name):\n    return os.path.join(\n        Path(__file__).parent.parent,\n        'cdp/' + name\n    )\n\n\ndef getPageSource():\n    cmd = 'node ' + getScriptPath('page_source.js')\n    ps = subprocess.check_output(cmd, shell=True)\n    return ps\n\n\ndef evalJS(command):\n    with open('/tmp/evalCommand.txt', 'w') as f:\n        f.write(command)\n\n    script_path = getScriptPath('eval_js.js')\n    cmd = f\"node {script_path}\"\n    ps = subprocess.check_output(cmd, shell=True)\n    return ps\n\n\ndef getCoords(selector, randomize_within_bcr=True, highlight_bb=True):\n    \"\"\"\n    - selector: The CSS selector to get the coords for\n    - randomize_within_bcr: select a random coordinate within the bounding box\n    hight\n    - highlight_bb: visually highlight the bounding box for debugging purposes\n    \"\"\"\n    script_path = getScriptPath('coords.js')\n    cmd = f\"node {script_path} '{selector}'\"\n    coords = subprocess.check_output(cmd, shell=True)\n    coords = coords.decode()\n\n    x, y = 0, 0\n\n    try:\n        parsed = json.loads(coords)\n        x, y, width, height = parsed['x'], parsed['y'], parsed['width'], parsed['height']\n\n        if randomize_within_bcr:\n            # print(x, y, parsed['width'], parsed['height'])\n            x += random.randint(0, math.floor(parsed['width'] / 4))\n            y += random.randint(0, math.floor(parsed['height'] / 4))\n\n        if highlight_bb:\n            # Just add a red thick border around the CSS selector\n            cmd = \"\"\"var el = document.querySelector('\"\"\" + selector + \\\n                \"\"\"'); if (el) { el.style.border = \"2px solid 
#ff0000\"; }\"\"\"\n            evalJS(cmd)\n\n    except Exception as e:\n        print('getCoords() failed with Error: {}'.format(e))\n        return None\n\n    return x, y\n\n\ndef startBrowser(args=[], startInTempDir=False, chromeProfile='--profile-directory=\"Default\"'):\n    tempDirStr = ''\n    if startInTempDir:\n        tempDirStr = f'--user-data-dir=/tmp'\n\n    arg_str = ' '.join(args)\n    if sys.platform == 'darwin':\n        chromePath = '/Applications/Google\\ Chrome.app/Contents/MacOS/Google\\ Chrome'\n        # On MacOS Monterey, we need to start Google Chrome\n        # in fullscreen mode to get the correct coordinates.\n        startCmd = f'{chromePath} --remote-debugging-port=9222 --start-maximized {tempDirStr} {chromeProfile} --disable-notifications --start-fullscreen {arg_str} 1>out.log 2>err.log &'\n    else:\n        startCmd = f'google-chrome --remote-debugging-port=9222 --start-maximized --disable-notifications {arg_str} 1>out.log 2>err.log &'\n\n    if os.getenv('DOCKER') == '1':\n        startCmd = 'google-chrome --remote-debugging-port=9222 --no-sandbox --disable-notifications --start-maximized --no-first-run --no-default-browser-check 1>out.log 2>err.log &'\n\n    print(startCmd)\n    subprocess.Popen([startCmd], shell=True)\n    time.sleep(random.uniform(3, 4))\n\n\ndef closeBrowser():\n    print('closing browser')\n    if sys.platform == 'darwin':\n        os.system(\"killall -9 'Google Chrome'\")\n    else:\n        os.system(\"killall -9 'google-chrome'\")\n"
  },
  {
    "path": "cdp/coords.js",
    "content": "// coords.js\n// https://chromedevtools.github.io/devtools-protocol/\n\nconst CDP = require('chrome-remote-interface');\n\nconst random = (min, max) => Math.floor(Math.random() * (max - min)) + min;\n\n// given a selector or node_id returns x and y *relative* coordinates\n// coordinates are relative to the viewport\n// The x relative coordinate is the same as the absolute coordiante, as the browser is maximed\n// The y coordinate is less, because the browser has the address bar / header\nasync function getCoordsAlt(css_selector) {\n  let client;\n  try {\n    // connect to endpoint\n    client = await CDP();\n    // extract domains\n    const { Page, Runtime, DOM } = client;\n    // enable events then start!\n    await Promise.all([Page.enable(), Runtime.enable(), DOM.enable()]);\n\n    const {root: {nodeId: documentNodeId}} = await DOM.getDocument();\n\n    console.log(documentNodeId)\n\n    const result = await DOM.querySelector({\n        selector: css_selector,\n        nodeId: documentNodeId,\n    });\n\n    console.log(result)\n\n    const retval = await DOM.getBoxModel({\"nodeId\": result.nodeId});\n\n    var box_model = retval.model;\n    console.log(box_model)\n\n    content_w = Math.abs(box_model[\"content\"][2] - box_model[\"content\"][0])\n    center_x = box_model[\"content\"][0] + random(content_w / 4.0, 3 * content_w / 4.0)\n\n    content_h = Math.abs(box_model[\"content\"][5] - box_model[\"content\"][1])\n    center_y = box_model[\"content\"][1] + random(content_h / 4.0, 3 * content_h / 4.0)\n\n    // given a selector or node_id returns x and y *relative* coordinates\n    // coordinates are relative to the viewport\n\n    // The x relative coordinate is the same as the absolute coordiante, as the browser is maximed\n    // The y coordinate is less, because the browser has the address bar / header\n\n    const coords = {\"x\": center_x, \"y\": center_y, \"node_id\": result.nodeId, \"root_node\": documentNodeId};\n    
console.log(JSON.stringify(coords))\n    return coords;\n  } catch (err) {\n    console.error(err);\n  } finally {\n    if (client) {\n      await client.close();\n    }\n  }\n}\n\nfunction getFrameExecId(frame) {\n  var frameId = frameNameToFrameId[frame];\n  if (!frameId)\n      throw Error(`Frame ${frame} is unknown`);\n  var execId = frameIdToContextId[frameId];\n  if (!execId)\n      throw Error(`Frame ${frame} (${frameId}) has no executionContextId`);\n  return execId;\n}\n\nfunction expectLoadFrame(name, timeout) {\n  return new Promise((resolve, reject) => {\n      let tm = setTimeout( () => reject(\"timed out waiting for frame load\"), timeout );\n\n      // we can only have one Page.frameNavigated() handler, so let our handler above resolve this promise\n      frameWaitName = name;\n      new Promise((fwpResolve, fwpReject) => { frameWaitPromiseResolve = fwpResolve })\n          .then(() => {\n              // For the frame to be fully valid for queries, it also needs the corresponding\n              // executionContextCreated() signal. 
This might happen before or after frameNavigated(), so wait in case\n              // it happens afterwards.\n             function pollExecId() {\n                  if (frameIdToContextId[frameNameToFrameId[name]]) {\n                      clearTimeout(tm);\n                      resolve();\n                  } else {\n                      setTimeout(pollExecId, 100);\n                  }\n              }\n              pollExecId();\n          });\n  });\n}\n\n\nasync function getCoordsIframe(css_selector, iframe) {\n  let client;\n  try {\n    // connect to endpoint\n    client = await CDP();\n    // extract domains\n    const { Page, Runtime, DOM } = client;\n    // enable events then start!\n    await Promise.all([Page.enable(), Runtime.enable(), DOM.enable()]);\n\n    var frameIdToContextId = {};\n    var frameNameToFrameId = {};\n    // set these to wait for a frame to be loaded\n    var frameWaitName = null;\n    var frameWaitPromiseResolve = null;\n\n    // map frame names to frame IDs; root frame has no name, no need to track that\n    await Page.frameNavigated(info => {\n      if (info.frame.name)\n          frameNameToFrameId[info.frame.name] = info.frame.id;\n\n      // were we waiting for this frame to be loaded?\n      if (frameWaitPromiseResolve && frameWaitName === info.frame.name) {\n          frameWaitPromiseResolve();\n          frameWaitPromiseResolve = null;\n      }\n    });\n\n    // track execution contexts so that we can map between context and frame IDs\n    await Runtime.executionContextCreated(info => {\n      frameIdToContextId[info.context.auxData.frameId] = info.context.id;\n    });\n\n    await Runtime.executionContextDestroyed(info => {\n      for (let frameId in frameIdToContextId) {\n        if (frameIdToContextId[frameId] == info.executionContextId) {\n            delete frameIdToContextId[frameId];\n            break;\n        }\n      }\n    });\n\n    let result = null;\n    let clientRectCmd = `var targetCoordEl = 
document.querySelector('${css_selector}'); if (targetCoordEl) { JSON.stringify(targetCoordEl.getClientRects()); }`;\n\n    await expectLoadFrame(iframe, 2000).then(async (res) => {\n      let frameId = getFrameExecId(iframe);\n      result = await Runtime.evaluate({\n        expression: clientRectCmd,\n        contextId: frameId,\n      });\n      console.log(result)\n    });\n\n    // get offset screen positioning\n    const screenPos = await Runtime.evaluate({\n      expression: \"JSON.stringify({offsetY: window.screen.height - window.innerHeight, offsetX: window.screen.width - window.innerWidth})\"\n    });\n\n    let offset = JSON.parse(screenPos.result.value);\n    let clientRect = null;\n\n    try {\n      clientRect = JSON.parse(result.result.value)[\"0\"];\n    } catch(err) {\n      return null;\n    }\n\n    let retVal =  {\n      x: offset.offsetX + clientRect.x,\n      y: offset.offsetY + clientRect.y,\n      width: clientRect.width,\n      height: clientRect.height,\n    };\n    console.log(JSON.stringify(retVal));\n    return retVal;\n  } catch (err) {\n    console.error(err);\n  } finally {\n    if (client) {\n      await client.close();\n    }\n  }\n}\n\nasync function getCoords(css_selector) {\n  let client;\n  try {\n    // connect to endpoint\n    client = await CDP();\n    // extract domains\n    const { Page, Runtime, DOM } = client;\n    // enable events then start!\n    await Promise.all([Runtime.enable()]);\n\n    let result = null;\n    let clientRectCmd = `var targetCoordEl = document.querySelector('${css_selector}'); if (targetCoordEl) { JSON.stringify(targetCoordEl.getClientRects()); }`;\n\n    result = await Runtime.evaluate({\n      expression: clientRectCmd,\n    });\n\n    // get offset screen positioning\n    const screenPos = await Runtime.evaluate({\n      expression: \"JSON.stringify({offsetY: window.screen.height - window.innerHeight, offsetX: window.screen.width - window.innerWidth})\"\n    });\n\n    let offset = 
JSON.parse(screenPos.result.value);\n    let clientRect = null;\n\n    try {\n      clientRect = JSON.parse(result.result.value)[\"0\"];\n    } catch(err) {\n      return null;\n    }\n\n    let retVal =  {\n      x: offset.offsetX + clientRect.x,\n      y: offset.offsetY + clientRect.y,\n      width: clientRect.width,\n      height: clientRect.height,\n    };\n    console.log(JSON.stringify(retVal));\n    return retVal;\n  } catch (err) {\n    console.error(err);\n  } finally {\n    if (client) {\n      await client.close();\n    }\n  }\n}\n\nconst argLength = process.argv.length;\n\nif (argLength === 3) {\n  getCoords(process.argv[2]);\n} else if (argLength === 4) {\n  getCoordsIframe(process.argv[2], process.argv[3]);\n}\n"
  },
  {
    "path": "cdp/eval_js.js",
    "content": "// eval_js.js\n// The caller has to write the command to /tmp/evalCommand.txt\nconst CDP = require('chrome-remote-interface');\nconst fs = require('fs');\n\nasync function evalCommand(command) {\n  let client;\n  try {\n    // connect to endpoint\n    client = await CDP();\n    // extract domains\n    const { Page, Runtime, DOM } = client;\n    // enable events then start!\n    await Promise.all([Page.enable(), Runtime.enable(), DOM.enable()]);\n\n    const evalRes = await Runtime.evaluate({expression: command});\n    console.log(evalRes.result.value);\n\n  } catch (err) {\n      console.error(err);\n  } finally {\n    if (client) {\n      await client.close();\n    }\n  }\n}\n\nconst argLength = process.argv.length;\n\nif (argLength === 2) {\n  evalCommand(fs.readFileSync('/tmp/evalCommand.txt').toString());\n}"
  },
  {
    "path": "cdp/goto.js",
    "content": "// goto.js\nconst CDP = require('chrome-remote-interface');\n\nasync function pageNav(url) {\n  let client;\n  try {\n    // connect to endpoint\n    client = await CDP();\n    // extract domains\n    const { Page, Runtime, DOM } = client;\n    // enable events then start!\n    await Promise.all([Page.enable(), Runtime.enable(), DOM.enable()]);\n\n    // navigate the current page to the given url\n    await Page.navigate({url: url});\n    return 'ok';\n  } catch (err) {\n      console.error(err);\n  } finally {\n    if (client) {\n      await client.close();\n    }\n  }\n}\n\nconst argLength = process.argv.length;\n\nif (argLength === 3) {\n  pageNav(process.argv[2]);\n}"
  },
  {
    "path": "cdp/page_source.js",
    "content": "// page_source.js\nconst CDP = require('chrome-remote-interface');\n\nasync function getPageSource() {\n  let client;\n  try {\n    // connect to endpoint\n    client = await CDP();\n    // extract domains\n    const { Page, Runtime, DOM } = client;\n    // enable events then start!\n    await Promise.all([Page.enable(), Runtime.enable(), DOM.enable()]);\n\n    // get the page source\n    const rootNode = await DOM.getDocument({ depth: -1 });\n    const pageSource = await DOM.getOuterHTML({\n      nodeId: rootNode.root.nodeId\n    });\n    return pageSource.outerHTML;\n  } catch (err) {\n      console.error(err);\n  } finally {\n    if (client) {\n      await client.close();\n    }\n  }\n}\n\ngetPageSource().then((pageSource) => {\n  console.log(pageSource);\n})"
  },
  {
    "path": "crawl.py",
    "content": "import time\nimport random\nfrom behavior.behavior import humanMove\nfrom behavior.sst_utils import *\n\n\"\"\"\nVery simple HTML crawl of a website.\n\"\"\"\n\n\ndef main():\n    print('Trying to start browser')\n    startBrowser(['www.hetzner.com\\n'])\n\n    # do a bit of random moving around\n    # to fool bot systems\n    coords = getCoords('body')\n    print('Clicking on coordinates ' + str(coords))\n    humanMove(*coords)\n    time.sleep(random.uniform(0.5, 1.0))\n\n    # finally get the page source\n    text = getPageSource()\n    print('Got {} bytes of HTML data'.format(len(text)))\n\n    # close the browser\n    closeBrowser()\n\n\nif __name__ == '__main__':\n    main()\n"
  },
  {
    "path": "ddc.py",
    "content": "import time\nimport os\nimport random\nimport subprocess\nimport json\nimport re\nfrom behavior.behavior import humanMove, humanTyping\nfrom target import target\nimport pyautogui\n\n\"\"\"\nImportant: \n\n1. Update the coordinates of the browser url address bar. Use the command `xdotool getmouselocation` to detect coordinates on your screen. \n2. Do not change the zoom level for the page in the browser! This will mess with coordinates! Default level must be 100% zoom level.\n3. I assume that the binary name of Google Chrome is `google-chrome`. Change the code if your binary name is different.\n4. Make sure the browser window is started in your leftmost screen!\n   I have a dual screen setup and sometimes I need to manually move my browser window to the correct screen ;)\n\"\"\"\n\n# collect keys\nkeys = []\n\ndef getCoords(n):\n  cmd = f'node coords.js \"p:nth-of-type({n}) > a\"'\n  # print(cmd)\n  coords = subprocess.check_output(cmd, shell=True).decode('utf8').strip()\n  # print(coords)\n  return json.loads(coords)\n\n\ndef getKey():\n  cmd = f'/usr/bin/node page_source'\n  ps =  subprocess.check_output(cmd, shell=True).decode('utf8').strip()\n  if 'Not found' in ps:\n    return 'done'\n  else:\n    key = re.search(r'[0-9a-z]{32}', ps)\n    return key.group(0)\n\n\ndef visitPage():\n  # @UPDATE COORDINATES HERE\n  humanMove(168, 79) # click on the address bar to enter URL\n  pyautogui.typewrite(target)\n  # the following is not necessary, because JavaScript cannot record \n  # keydown/keyup events in the address bar\n  # humanTyping(target, speed=None, doubleHit=False)\n  time.sleep(random.uniform(1.95, 2.95))\n\n\ndef main():\n  \"\"\"\n  Get pixel coords with: `xdotool getmouselocation`\n  \"\"\"\n  os.system('google-chrome --remote-debugging-port=9222 --start-maximized --disable-notifications &')\n  time.sleep(4)\n\n  try:\n    while True:\n      time.sleep(random.uniform(.95, 1.25))\n      visitPage()\n      parsed = 
getCoords(random.randrange(1, 11))\n      keys.append(getKey())\n      \n      for i in range(11):\n        x = parsed['x'] + random.randrange(0, int(parsed['width']))\n        y = parsed['y'] + random.randrange(0, int(parsed['height']))\n        # print(f'x={x}, y={y}')\n        humanMove(x, y)\n        time.sleep(random.uniform(1.15, 1.74))\n        key = getKey()\n        if key == 'done':\n          break\n        else:\n          keys.append(key)\n        parsed = getCoords(random.randrange(1, 11))\n        print(f'Got {len(set(keys))} unique keys')\n  except (Exception, KeyboardInterrupt) as e:\n    print(f'Error: {e}')\n    print(keys)\n\n\nif __name__ == '__main__':\n  main()"
  },
  {
    "path": "example.py",
    "content": "import time\nimport random\nfrom behavior.behavior import humanMove, humanTyping\nfrom behavior.sst_utils import *\n\n\"\"\"\nYou might have to adjust some coordinates. \n\nI used a dual screen setup and I started the browser on the\nleft screen.\n\nYou can obtain the coordinates of your current mouse pointer with \nthe bash command on Linux `xdotool getmouselocation`\n\"\"\"\n\n\ndef main():\n    print('Trying to start browser')\n    startBrowser(['bot.incolumitas.com\\n'])\n\n    # click link to get to the challenge\n    print('Trying to click challenge link')\n    coords = getCoords('li:nth-of-type(3) a')\n    print('Clicking on coordinates ' + str(coords))\n    humanMove(*coords)\n    time.sleep(random.uniform(0.5, 1.0))\n\n    # enter username\n    username = getCoords('input[name=\"userName\"]')\n    humanMove(*username, clicks=2)\n    time.sleep(random.uniform(0.25, 1.25))\n    humanTyping('IamNotABotISwear\\n', speed=(0.005, 0.008))\n\n    time.sleep(random.uniform(0.5, 1.0))\n\n    # enter email\n    email = getCoords('input[name=\"eMail\"]')\n    humanMove(*email, clicks=3)\n    time.sleep(random.uniform(0.25, 1.25))\n    humanTyping('bot@spambot.com\\n', speed=(0.005, 0.008))\n\n    time.sleep(random.uniform(0.5, 1.0))\n\n    # agree to the terms\n    terms = getCoords('input[name=\"terms\"]')\n    humanMove(*terms)\n\n    # select cats\n    cat = getCoords('#bigCat')\n    humanMove(*cat)\n\n    # submit\n    submit = getCoords('#submit')\n    humanMove(*submit)\n\n    # press the final enter\n    time.sleep(random.uniform(2.5, 3.4))\n    humanTyping('\\n', speed=(0.005, 0.008))\n\n    # finally get the page source\n    text = getPageSource()\n    print('Got {} bytes of page soure'.format(len(text)))\n\n\nif __name__ == '__main__':\n    main()\n"
  },
  {
    "path": "immobilienscout24.py",
    "content": "import time\nimport random\nimport json\nimport sys\nimport pprint\nfrom behavior.sst_utils import *\nfrom behavior.behavior import humanMove, humanScroll, typeNormal, getDim\nimport immo_env\n\n\"\"\"\nthis is an example how to scrape www.immobilienscout24.de with stealthy-scraping-tools\n\nwww.immobilienscout24.de is protected by advanced bot protection: Imperva\n\nadvanced?\n\nLet's see ;)\n\"\"\"\n\nif not os.path.exists('apartments.json'):\n  with open('apartments.json', 'w') as f:\n    json.dump(dict(), f)\n\napartments = json.load(open('apartments.json', 'r'))\n\nSEARCH_URL = immo_env.SEARCH_URL\n\n\ndef startFluxbox():\n  # start fluxbox\n  os.system('fluxbox &')\n  time.sleep(3)\n\n\ndef startVNC():\n  # and a vnc server for debugging remotely\n  vnc_cmd = 'x11vnc -display {}.0 -forever -passwd {} &'.format(\n    os.environ['DISPLAY'],\n    os.environ['X11VNC_PASSWORD'],\n  )\n  print(vnc_cmd)\n  os.system(vnc_cmd)\n\n\ndef moveRandomly(steps=2):\n  width, height = getDim()\n  width = min(1920, width)\n  # this is where the bot check is happening\n  # move the mouse a bit\n  for i in range(steps):\n    humanMove(*(random.randrange(0, width-50), random.randrange(0, height-50)),\n     clicks=0, steps=2)\n    time.sleep(random.uniform(0.25, 1.0))\n\n\ndef contact(listing):\n  \"\"\"\n  contact the listing. 
This is where I get mostly blocked.\n  \"\"\"\n  goto('https://www.immobilienscout24.de' + listing.get('url'))\n  moveRandomly(steps=4)\n\n  already_contacted = getCoords('.is24-icon-heart-Favorite-glyph') is not None\n  if already_contacted:\n    print('Listing {} already contacted'.format(listing.get('url')))\n    return True\n\n  # contact\n  contact_button = getCoords(\"a span.palm-hide.email-button-desk-text.font-standard\")\n  humanMove(*contact_button, clicks=1)\n  time.sleep(random.uniform(4, 5.5))\n\n  # check if message already entered\n  already_entered = json.loads(evalJS('document.getElementById(\"contactForm-Message\").value.includes(\"und Langfristiges\")')) is True\n  if not already_entered:\n    evalJS('document.getElementById(\"contactForm-Message\").value = `{}`'.format(''))\n    # input message\n    input_el = getCoords(\"#contactForm-Message\")\n    humanMove(*input_el, clicks=3)\n    # typeNormal('Guten Tag, ')\n    # time.sleep(random.uniform(0.5, 1.1))\n    evalJS('document.getElementById(\"contactForm-Message\").value = `{}`'.format(immo_env.MESSAGE))\n    time.sleep(random.uniform(0.5, 1.1))\n\n  time.sleep(random.uniform(0.5, 1.1))\n\n  no_pets = getCoords('[for=\"contactForm-hasPets.no\"]')\n  if no_pets:\n    humanScroll(4, (5, 20), -1)\n    time.sleep(random.uniform(1.5, 1.5))\n    no_pets = getCoords('[for=\"contactForm-hasPets.no\"]')\n    humanMove(*no_pets, clicks=1)\n    submit = getCoords('button.button-primary.padding-horizontal-m')\n    humanMove(*submit, clicks=1)\n  else:\n    submit = getCoords('button[data-qa=\"sendButtonBasic\"]')\n    humanMove(*submit, clicks=1)\n\n  time.sleep(random.uniform(3.9, 5.9))\n  return True\n\n\ndef is_detected():\n  detected = json.loads(evalJS(\"JSON.stringify(document.body.textContent.includes('Warum haben wir deine Anfrage blockiert?'));\")) == True\n  other = json.loads(evalJS(\"JSON.stringify(document.body.textContent.includes('Sicherheitsabfrage'));\")) == True\n  if detected or 
other:\n    print('Got detected as a bot. Aborting.')\n    sys.exit(0)\n    return True\n  else:\n    return False\n\n\ndef main():\n  if os.getenv('DOCKER') == '1':\n    startFluxbox()\n    startVNC()\n\n  # startBrowser(args=['--incognito'])\n  startBrowser(args=[])\n\n  if os.getenv('DOCKER') == '1':\n    # close the annoying chrome error message bar\n    # it skews with coordinates\n    # x:1903 y:114 screen:0 window:195035139\n    # x:1889 y:113 screen:0 window:195035139\n    humanMove(1893, 103)\n    humanMove(1889, 103)\n    time.sleep(random.uniform(2.5, 3.5))\n\n  try:\n    goto('https://www.immobilienscout24.de')\n    moveRandomly()\n\n    # are there cookies to accept?\n    # cookie consent is in an iframe with id '#gdpr-consent-notice'\n    # coords = getCoords('button#save', '#gdpr-consent-notice')\n    coords = 1099, 859\n    print(f'Accept Cookies by clicking at {coords}')\n    humanMove(*coords)\n    time.sleep(random.uniform(3.5, 4.5))\n\n    # login with username and password\n    profile_button = getCoords('#link_loginAccountLink')\n    humanMove(*profile_button, clicks=0)\n    time.sleep(random.uniform(0.5, 2))\n\n    login_button = getCoords(\"#is24-dropdown > div.MyscoutDropdownV2_LoginContainer__3X0hy.topnavigation__sso-login__link-list--logged-out > a\")\n    # if login button not visible, we are logged in probably\n    if login_button:\n      humanMove(*login_button, clicks=1)\n\n      time.sleep(random.uniform(2.5, 3))\n\n      user_input = getCoords('#username')\n      if not user_input:\n        raise Exception('Cannot find username input field by id #username')\n\n      time.sleep(random.uniform(1.5, 2))\n\n      humanMove(*user_input, clicks=1)\n      time.sleep(random.uniform(0.25, 1.25))\n      typeNormal(immo_env.EMAIL)\n      time.sleep(random.uniform(0.25, 1.25))\n\n      humanMove(*getCoords('#submit'), clicks=1)\n      time.sleep(random.uniform(2.25, 3.25))\n\n      humanMove(*getCoords('#password'), clicks=1)\n      
time.sleep(random.uniform(0.25, 1.25))\n      typeNormal(immo_env.PASSWORD)\n      time.sleep(random.uniform(1.25, 2.25))\n\n      humanMove(*getCoords('#loginOrRegistration'), clicks=1)\n      time.sleep(random.uniform(2.25, 3.55))\n\n    goto(SEARCH_URL)\n\n    humanScroll(8, (5, 20), -1)\n\n    # finally parse the listings\n    parse_listings = \"\"\"var res = [];\n  document.querySelectorAll(\".result-list__listing\").forEach((el) => {\n  let title = el.querySelector(\".result-list-entry__brand-title\");\n  let details = el.querySelector(\".result-list-entry__criteria\");\n\n  if (title) {\n    let obj = {\n      contacted: false,\n      title: title.textContent,\n      url: el.querySelector(\"a.result-list-entry__brand-title-container\").getAttribute(\"href\"),\n    };\n    if (details) {\n      obj.location = el.querySelector(\".result-list-entry__map-link.link-text-secondary.font-normal.font-ellipsis\").textContent;\n      obj.price = details.querySelector(\"dl.grid-item:nth-child(1)\").textContent;\n      obj.area = details.querySelector(\"dl.grid-item:nth-child(2)\").textContent;\n      obj.rooms = details.querySelector(\"dl.grid-item:nth-child(3)\").textContent;\n    }\n    res.push(obj);\n  }\n  });\n  JSON.stringify(res);\"\"\"\n\n    output = evalJS(parse_listings)\n    listings = json.loads(output)\n    # pprint.pprint(listings)\n    filtered_listings = {}\n\n    for el in listings:\n      if el.get('url'):\n        key = el.get('url')\n        location = el.get('location', '').lower().strip()\n        if immo_env.FILTER_LISTINGS:\n          for pref in immo_env.PREFERRED_LOCATIONS:\n            if pref.lower().strip() in location:\n              filtered_listings[key] = el\n        else:\n          filtered_listings[key] = el\n\n    # remove listings we already contacted\n    for key in apartments:\n      if key in filtered_listings:\n        if apartments[key].get('contacted', False):\n          print('already contacted listing ' + key)\n          
del filtered_listings[key]\n\n    pprint.pprint(filtered_listings)\n\n    print('contacting {} listings'.format(len(filtered_listings)))\n    for key in filtered_listings:\n      try:\n        contacted = contact(filtered_listings[key])\n      except Exception as e:\n        print('Failed to contact {}. Blocked? Error: {}'.format(key, str(e)))\n        is_detected()\n\n      filtered_listings[key]['contacted'] = contacted\n      time.sleep(random.uniform(0.5, 1.25))\n\n    # update?\n    for k, v in filtered_listings.items():\n      apartments[k] = v\n\n    with open('apartments.json', 'w') as f:\n      json.dump(apartments, f)\n      print('Updated database')\n\n    closeBrowser()\n  except Exception as e:\n    print('Error: {}'.format(e))\n    is_detected()\n\n\nif __name__ == '__main__':\n  main()\n"
  },
  {
    "path": "local_forward_proxy_server/proxy_server.js",
    "content": "const ProxyChain = require('proxy-chain');\n\nasync function startProxyServer(proxy) {\n  return new Promise(function(resolve, reject) {\n    const server = new ProxyChain.Server({\n      // Port where the server will listen. By default 8947.\n      port: 8947,\n      // Enables verbose logging\n      verbose: false,\n      prepareRequestFunction: function (params) {\n        var {request, username, password, hostname, port, isHttp, connectionId} = params;\n        console.log('isHttp: ' + isHttp);\n        console.log('port: ' + port);\n        console.log('hostname: ' + hostname);\n        console.log('headers: ' + JSON.stringify(request.headers));\n        return {\n          requestAuthentication: false,\n          // http://username:password@proxy.example.com:3128\n          upstreamProxyUrl: proxy,\n        };\n      },\n    });\n\n    // Emitted when HTTP connection is closed\n    server.on('connectionClosed', (params) => {\n      var {connectionId, stats} = params;\n      console.log(`Connection ${connectionId} closed`);\n    });\n\n    // Emitted when HTTP request fails\n    server.on('requestFailed', (params) => {\n      var {request, error} = params;\n      console.error(`Request ${request.url} failed`);\n      console.error(error);\n    });\n\n    server.listen(() => {\n      console.log(`ProxyServer listening on port ${server.port}`);\n      resolve(server);\n    });\n  });\n}\n\n// Start local forwarding server with: node proxy_server.js http://username:password@proxy.example.com:3128\n// Use the local forwarding proxy server with google-chrome:\n// google-chrome --proxy-server=\"localhost:8947\"\n\nif (process.argv.length === 3) {\n  (async () => {\n    await startProxyServer(process.argv[2]);\n  })();\n}\n"
  },
  {
    "path": "lufthansa-de.py",
    "content": "import time\nimport random\nfrom behavior.sst_utils import *\nfrom behavior.behavior import humanMove, humanScroll, press, typeNormal, clickNormal, typeWrite\n\n\"\"\"\nthis is an example how to scrape www.lufthansa.de with stealthy-scraping-tools\n\"\"\"\n\ndef startFluxbox():\n  # start fluxbox\n  os.system('fluxbox &')\n  time.sleep(3)\n\n\ndef startVNC():\n  # and a vnc server for debugging remotely\n  vnc_cmd = 'x11vnc -display {}.0 -forever -passwd {} &'.format(\n    os.environ['DISPLAY'],\n    os.environ['X11VNC_PASSWORD'],\n  )\n  print(vnc_cmd)\n  os.system(vnc_cmd)\n\n\ndef main():\n  if os.getenv('DOCKER') == '1':\n    startFluxbox()\n    startVNC()\n\n  startBrowser(args=[])\n\n  if os.getenv('DOCKER') == '1':\n    # close the annoying chrome error message bar\n    # it skews with coordinates\n    # x:1903 y:114 screen:0 window:195035139\n    # x:1889 y:113 screen:0 window:195035139\n    humanMove(1893, 103)\n    humanMove(1889, 103)\n    time.sleep(random.uniform(2.5, 3.5))\n\n  for i in range(150):\n    print(f'[{i}] Searching for flights...')\n    time.sleep(random.uniform(0.5, 1.0))\n\n    goto('https://www.lufthansa.com/de/de/homepage')\n    time.sleep(random.uniform(4, 6))\n\n    # accept cookies?\n    if i == 0:\n      try:\n        cookie_accept = getCoords('#cm-acceptAll')\n        if cookie_accept:\n          humanMove(*cookie_accept, clicks=1)\n          time.sleep(random.uniform(0.25, 1.25))\n      except Exception as e:\n        print('No cookies to accept, #cm-acceptAll not found')\n\n    # enter where to go\n    try:\n      input_loc = getCoords('input[placeholder=\"Nach\"]')\n      print('Enter Departure ' + str(input_loc))\n      humanMove(*input_loc, clicks=2)\n      time.sleep(random.uniform(0.25, 1.25))\n      typeNormal(random.choice(['Berlin', 'Paris', 'Tel Aviv', 'Stockholm', 'Bogota', 'Bangkok', 'New York']))\n      time.sleep(random.uniform(1.5, 2.5))\n      press('down')\n      time.sleep(random.uniform(0.5, 
1.0))\n      press('enter')\n      time.sleep(random.uniform(0.5, 1.0))\n    except Exception as e:\n      print(f'[{i}] Could not enter flight destination. Blocked?')\n      continue\n\n    # input return date\n    try:\n      backdate = getCoords('input[placeholder=\"Rückflugdatum\"]')\n      print('backdate ' + str(backdate))\n      humanMove(*backdate, clicks=1)\n      time.sleep(random.uniform(4.55, 5.55))\n    except Exception as e:\n      print(f'[{i}] Could not click on return value. Leaving untouched.')\n\n    # enter departure date\n    try:\n      datetile = getCoords(random.choice(['[aria-label^=\"Choose Samstag, 25 Dezember 2021\"]', '[aria-label^=\"Choose Sonntag, 26 Dezember 2021\"]']))\n      print('datetile ' + str(datetile))\n      humanMove(*datetile, clicks=1)\n      time.sleep(random.uniform(2.25, 3.25))\n    except Exception as e:\n      print(f'[{i}] Could not select return date. Keeping default value.')\n\n    # submit\n    try:\n      submit = getCoords('[type=\"submit\"]')\n      print('Submit ' + str(submit))\n      humanMove(*submit)\n    except Exception as e:\n      print(f'[{i}] Could not submit search. Blocked?')\n      continue\n\n    # wait for quite some time\n    time.sleep(random.uniform(10, 14))\n    humanScroll(2, (5, 20), -1)\n\n    try:\n      calendar = getCoords('#page .calendarTab')\n      if calendar:\n        print(f'[{i}] Flight Results loaded!')\n    except Exception as e:\n      print(f'[{i}] Could not find calendar for flights. Page load to slow?')\n\n\n\nif __name__ == '__main__':\n  main()"
  },
  {
    "path": "requirements.txt",
    "content": "MouseInfo==0.1.3\nPillow==8.4.0\npkg_resources==0.0.0\nPyAutoGUI==0.9.53\nPyGetWindow==0.0.9\nPyMsgBox==1.0.9\npyperclip==1.8.2\nPyRect==0.1.4\nPyScreeze==0.1.28\npython3-xlib==0.15\npytweening==1.0.4\n"
  },
  {
    "path": "start.sh",
    "content": "#!/bin/bash\nset -e\n\n# When docker restarts, this file is still there,\n# so we need to kill it just in case\n[ -f /tmp/.X99-lock ] && rm -f /tmp/.X99-lock\n\n_kill_procs() {\n  kill -TERM $python\n  kill -TERM $xvfb\n  kill -TERM $chrome\n}\n\n# Relay quit commands to processes\ntrap _kill_procs SIGTERM SIGINT\n\n\n# https://github.com/browserless/chrome/blob/307fa139b4c65f314a083891e1dbdb2dddeafcb7/start.sh\n# Alternatively:\n# xvfb-run -e /dev/stdout --server-num=99 --server-args=\"-ac -screen 0 $XVFB_WHD -nolisten tcp -nolisten unix\" python3 -u immobilienscout24.py\n# echo \"Starting X virtual framebuffer\";\n# python3 -u behavior/start_disp.py &\n# Xvfb $DISPLAY -ac -screen 0 $XVFB_WHD -nolisten tcp -nolisten unix &\n# xvfb=$!\n\n# sleep 3\n\n# GENERATE .Xauthority file\n# xauth with complain unless ~/.Xauthority exists\n# touch $HOME/.Xauthority\n# # only this one key is needed for X11 over SSH \n# xauth generate $DISPLAY . trusted\n# # generate our own key, xauth requires 128 bit hex encoding\n# xauth add $DISPLAY . 
$(xxd -l 16 -p /dev/urandom)\n# # To view a listing of the .Xauthority file, enter the following \n# xauth list\n\n\necho \"Blocking all UDP traffic except DNS\";\nid\n\n# https://serverfault.com/questions/222606/how-can-i-reject-all-incoming-udp-packets-except-for-dns-lookups/716035\n# how can I reject all traffic I didn't initiate with Linux netfilter?\nsudo iptables --version\nsudo iptables -A DOCKER-USER -m state --state ESTABLISHED,RELATED -j ACCEPT\n\nsudo iptables -A DOCKER-USER -p udp --dport 53 -j ACCEPT -m comment --comment \"we serve DNS\"\nsudo iptables -A DOCKER-USER -p tcp --dport 53 -j ACCEPT -m comment --comment \"DNS uses TCP too sometimes\"\n\nsudo iptables -A DOCKER-USER -j DROP\n\n\necho \"Starting browser\";\n# Avoid chrome in docker crashing: https://github.com/stephen-fox/chrome-docker/issues/8\n# Option 1: Run chrome with --disable-dev-shm-usage\n# Option 2: Set /dev/shm size to a reasonable amount docker run -it --shm-size=1g replacing 1g with whatever amount you want.\n# google-chrome --remote-debugging-port=9222 --no-sandbox --disable-notifications --start-maximized --no-first-run --no-default-browser-check --incognito &\n# chrome=$!\n\nsleep 5\n\n# https://abhishekvaid13.medium.com/pyautogui-headless-docker-mode-without-display-in-python-480480599fc4\necho \"Running bot\";\npython3 -u immobilienscout24.py &\npython=$!\n\n# echo \"Starting x11vnc\";\n# x11vnc -display $DISPLAY.0 -forever -passwd ${X11VNC_PASSWORD:-password} &\n# vnc_server=$!\n\nwait $python\necho \"bot terminated\";\nwait $xvfb\nwait $chrome\nwait $vnc_server\n"
  },
  {
    "path": "test.py",
    "content": "from sst_utils import *\nimport pprint \nimport json \n\nparse_listings = \"\"\"var res = [];\ndocument.querySelectorAll(\".result-list__listing\").forEach((el) => {\nlet title = el.querySelector(\".result-list-entry__brand-title\");\nlet details = el.querySelector(\".result-list-entry__criteria\");\n\nif (title) {\n  let obj = {\n    title: title.textContent,\n    url: el.querySelector(\"a.result-list-entry__brand-title-container\").getAttribute(\"href\"),\n  };\n  if (details) {\n    obj.price = details.querySelector(\"dl.grid-item:nth-child(1)\").textContent;\n    obj.area = details.querySelector(\"dl.grid-item:nth-child(2)\").textContent;\n    obj.rooms = details.querySelector(\"dl.grid-item:nth-child(3)\").textContent;\n  }\n  res.push(obj);\n}\n});\nJSON.stringify(res);\"\"\"\n\nlistings = evalJS(parse_listings)\npprint.pprint(json.loads(listings))"
  }
]