Repository: RuthGnz/SpyScrap Branch: master Commit: 02ddd87436df Files: 57 Total size: 150.7 KB Directory structure: gitextract_i6xpbykc/ ├── .github/ │ ├── ISSUE_TEMPLATE/ │ │ ├── bug_report.md │ │ └── feature_request.md │ └── workflows/ │ └── codeql-analysis.yml ├── .gitignore ├── CHANGELOG.md ├── Dockerfile ├── LICENSE ├── README.md ├── src/ │ ├── data/ │ │ └── .gitignore │ ├── main.py │ ├── osint_sources/ │ │ ├── __init__.py │ │ ├── boe.py │ │ ├── facebook.py │ │ ├── google.py │ │ ├── instagram.py │ │ ├── model.py │ │ ├── recognition.py │ │ ├── scraper.py │ │ ├── tinder.py │ │ ├── twitter.py │ │ └── yandex.py │ └── requirements.txt └── web/ ├── Docker-compose.yaml ├── README.md ├── back/ │ ├── .gitignore │ ├── Dockerfile │ └── osint-back/ │ ├── api.py │ ├── back_model.py │ ├── back_requirements.txt │ ├── controller.py │ ├── server.py │ └── uploads/ │ └── .gitignore ├── data/ │ └── .gitignore ├── front/ │ ├── .gitignore │ ├── Dockerfile │ └── osint-front/ │ ├── .gitignore │ ├── README.md │ ├── babel.config.js │ ├── package.json │ ├── public/ │ │ └── index.html │ ├── src/ │ │ ├── App.vue │ │ ├── main.js │ │ ├── plugins/ │ │ │ └── vuetify.js │ │ ├── router.js │ │ └── views/ │ │ ├── About.vue │ │ ├── Boe.vue │ │ ├── Facebook.vue │ │ ├── Google.vue │ │ ├── Home.vue │ │ ├── Instagram.vue │ │ ├── Score.vue │ │ ├── Tinder.vue │ │ ├── Twitter.vue │ │ └── Yandex.vue │ └── vue.config.js └── reverse-proxy/ ├── Dockerfile └── nginx.conf ================================================ FILE CONTENTS ================================================ ================================================ FILE: .github/ISSUE_TEMPLATE/bug_report.md ================================================ --- name: Bug report about: Create a report to help us improve title: '' labels: '' assignees: '' --- **Describe the bug** A clear and concise description of what the bug is. **To Reproduce** Steps to reproduce the behavior: 1. Go to '...' 2. Click on '....' 3. Scroll down to '....' 4. See error **Expected behavior** A clear and concise description of what you expected to happen. **Screenshots** If applicable, add screenshots to help explain your problem. **Desktop (please complete the following information):** - OS: [e.g. iOS] - Browser [e.g. chrome, safari] - Version [e.g. 22] **Smartphone (please complete the following information):** - Device: [e.g. iPhone6] - OS: [e.g. iOS8.1] - Browser [e.g. stock browser, safari] - Version [e.g. 22] **Additional context** Add any other context about the problem here. ================================================ FILE: .github/ISSUE_TEMPLATE/feature_request.md ================================================ --- name: Feature request about: Suggest an idea for this project title: '' labels: '' assignees: '' --- **Is your feature request related to a problem? Please describe.** A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] **Describe the solution you'd like** A clear and concise description of what you want to happen. **Describe alternatives you've considered** A clear and concise description of any alternative solutions or features you've considered. **Additional context** Add any other context or screenshots about the feature request here. ================================================ FILE: .github/workflows/codeql-analysis.yml ================================================ # For most projects, this workflow file will not need changing; you simply need # to commit it to your repository. 
# # You may wish to alter this file to override the set of languages analyzed, # or to provide custom queries or build logic. # # ******** NOTE ******** # We have attempted to detect the languages in your repository. Please check # the `language` matrix defined below to confirm you have the correct set of # supported CodeQL languages. # name: "CodeQL" on: push: branches: [ master ] pull_request: # The branches below must be a subset of the branches above branches: [ master ] schedule: - cron: '16 22 * * 2' jobs: analyze: name: Analyze runs-on: ubuntu-latest strategy: fail-fast: false matrix: language: [ 'javascript', 'python' ] # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python' ] # Learn more: # https://docs.github.com/en/free-pro-team@latest/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#changing-the-languages-that-are-analyzed steps: - name: Checkout repository uses: actions/checkout@v2 # Initializes the CodeQL tools for scanning. - name: Initialize CodeQL uses: github/codeql-action/init@v1 with: languages: ${{ matrix.language }} # If you wish to specify custom queries, you can do so here or in a config file. # By default, queries listed here will override any specified in a config file. # Prefix the list here with "+" to use these queries and those in the config file. # queries: ./path/to/local/query, your-org/your-repo/queries@main # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). # If this step fails, then you should remove it and run the build manually (see below) - name: Autobuild uses: github/codeql-action/autobuild@v1 # ℹ️ Command-line programs to run using the OS shell. # 📚 https://git.io/JvXDl # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines # and modify them (or add more) to build your code if your project # uses a compiled language #- run: | # make bootstrap # make release - name: Perform CodeQL Analysis uses: github/codeql-action/analyze@v1 ================================================ FILE: .gitignore ================================================ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ pip-wheel-metadata/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. 
*.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover .hypothesis/ .pytest_cache/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv .python-version # celery beat schedule file celerybeat-schedule # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ rooted # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # Pyre type checker .pyre/ *.db tinder.json database.db chromedriver_linux64/ /openface/ chromedriver /darta/ /boe/* /instagram/* /twitter/* /facebook/* /google/* /data/* .DS_Store ================================================ FILE: CHANGELOG.md ================================================ # Changelog All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/) and this project adheres to [Semantic Versioning](http://semver.org/). ## [1.1.0] - 2020-12-01 Fixed - Fix problems in scraping module ## [1.0.0] - 2019-09-14 Added - Adding web interface. - CLI ================================================ FILE: Dockerfile ================================================ FROM ubuntu:18.04 RUN apt-get update \ && apt-get install -y python3-pip python3-dev \ && cd /usr/local/bin \ && ln -s /usr/bin/python3 python \ && pip3 install --upgrade pip RUN apt-get install -y --fix-missing \ build-essential \ cmake \ gfortran \ git \ wget \ curl \ pkg-config \ python3-dev \ python3-numpy \ software-properties-common \ zip \ unzip \ && apt-get clean && rm -rf /tmp/* /var/tmp/* RUN cd ~ && \ mkdir -p dlib && \ git clone -b 'v19.9' --single-branch https://github.com/davisking/dlib.git dlib/ && \ cd dlib/ && \ python3 setup.py install --yes USE_AVX_INSTRUCTIONS RUN mkdir spyscrap WORKDIR spyscrap # Install Google Chrome RUN wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb RUN dpkg -i google-chrome-stable_current_amd64.deb; apt-get -fy install # Install Chrome WebDriver RUN CHROMEDRIVER_VERSION=`curl -sS chromedriver.storage.googleapis.com/LATEST_RELEASE` && \ mkdir -p /opt/chromedriver-$CHROMEDRIVER_VERSION && \ curl -sS -o /tmp/chromedriver_linux64.zip http://chromedriver.storage.googleapis.com/$CHROMEDRIVER_VERSION/chromedriver_linux64.zip && \ unzip -qq /tmp/chromedriver_linux64.zip -d /opt/chromedriver-$CHROMEDRIVER_VERSION && \ rm /tmp/chromedriver_linux64.zip && \ chmod +x /opt/chromedriver-$CHROMEDRIVER_VERSION/chromedriver && \ ln -fs /opt/chromedriver-$CHROMEDRIVER_VERSION/chromedriver /usr/local/bin/chromedriver COPY ./src/requirements.txt . RUN pip3 install -r ./requirements.txt RUN python -m spacy download es_core_news_sm COPY ./src/ . RUN cp /usr/local/bin/chromedriver . 
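# Note: the scrapers open Chrome through the hard-coded relative path './chromedriver'
# (see src/osint_sources/*.py), so a copy of the driver must sit next to main.py inside the image.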
ENV PYTHONIOENCODING=utf-8

ENTRYPOINT ["python3","main.py"]


================================================
FILE: LICENSE
================================================
MIT License

Copyright (c) 2020 Ruth Gonzalez - Miguel Hernández

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


================================================
FILE: README.md
================================================
![Last update](https://img.shields.io/badge/last%20update-11%20FEB%202021-green.svg?style=flat-square) ![OSINT](https://img.shields.io/badge/OSINT-brightgreen.svg?style=flat-square) ![SOCMINT](https://img.shields.io/badge/SOCMINT-brightgreen.svg?style=flat-square) ![TOOL](https://img.shields.io/badge/TOOL-brightgreen.svg?style=flat-square) ![CYBERSECURITY](https://img.shields.io/badge/CYBERSECURITY-brightgreen.svg?style=flat-square) ![SPYSCRAP](https://img.shields.io/badge/SPYSCRAP-brightgreen.svg?style=flat-square)

# SpyScrap

![alt text](./SpyScrap.png)

This is an [OSINT](https://en.wikipedia.org/wiki/Open-source_intelligence) tool. Its main purpose is to collect information about a target from sources such as Google, Tinder, Twitter and more. It combines facial recognition methods to filter the results, and uses natural language processing to extract relevant entities from the websites where the target appears. The tool calculates a final score that indicates how much public exposure a user has on the Internet.

It has two modules that can work independently: a CLI and a web interface. Both are built with Docker and are easy to deploy.

If you like the tool, give us a star! :star:

# CLI

CLI module for web scraping:

* Tinder
* Instagram
* Yandex
* Google
* Facebook
* BOE
* Twitter

## Prerequisites

Docker and docker-compose

### Installation

```bash
docker build -t spyscrap .
docker run -ti -v /PATH/TO/SpyScrap/src/data:/spyscrap/data spyscrap [options]
```

You must put the image you want to use for facial recognition under the Docker shared volume, as in the following example:

```
docker run -ti -v /Users/ruthgnz/Documents/osint/SpyScrap/src/data:/spyscrap/data sp -t twitter -n "ruth gonzalez novillo" -i ./data/descarga.jpeg
```

## Usage

```bash
docker run -ti -v /PATH/TO/SpyScrap/src/data:/spyscrap/data spyscrap [options]
```

Get Tinder users and store the data in a sqlite3 database. The Tinder token must be captured from Local Storage when logging into the Tinder web app.

```bash
docker run -ti -v /PATH/TO/SpyScrap/src/data:/spyscrap/data spyscrap -t tinder -k TOKEN
```

Search in Google.
Add -i to download images and do facial recognition. Add -p to search only in a specific site, e.g. LinkedIn.

```bash
docker run -ti -v /PATH/TO/SpyScrap/src/data:/spyscrap/data spyscrap --tag google -n ""
docker run -ti -v /PATH/TO/SpyScrap/src/data:/spyscrap/data spyscrap --tag google -n "" -i
docker run -ti -v /PATH/TO/SpyScrap/src/data:/spyscrap/data spyscrap --tag google -n "" -i -p ""
```

Search Twitter profiles.

```bash
docker run -ti -v /PATH/TO/SpyScrap/src/data:/spyscrap/data spyscrap -t twitter -n "" -s
```

Search Facebook profiles. Add -i to download images and do facial recognition.

```bash
docker run -ti -v /PATH/TO/SpyScrap/src/data:/spyscrap/data spyscrap -t facebook -n ""
docker run -ti -v /PATH/TO/SpyScrap/src/data:/spyscrap/data spyscrap --tag facebook -n "" -i
```

Search Instagram profiles. Add -i to download the Instagram profile image and do facial recognition.

```bash
docker run -ti -v /PATH/TO/SpyScrap/src/data:/spyscrap/data spyscrap -t instagram -n ""
docker run -ti -v /PATH/TO/SpyScrap/src/data:/spyscrap/data spyscrap -t instagram -n "" -i
```

Search DNIs, names and surnames in the BOE.

```bash
docker run -ti -v /PATH/TO/SpyScrap/src/data:/spyscrap/data spyscrap -t boe -n "" -s
docker run -ti -v /PATH/TO/SpyScrap/src/data:/spyscrap/data spyscrap -t boe -n "" -s -e -d -f
```

OTHER EXAMPLES:

```
docker run -ti -v /PATH/TO/SpyScrap/src/data:/spyscrap/data spyscrap [options]
docker run -ti -v /PATH/TO/SpyScrap/src/data:/spyscrap/data spyscrap -t tinder -k TOKEN
docker run -ti -v /PATH/TO/SpyScrap/src/data:/spyscrap/data spyscrap --tag google -n ""
docker run -ti -v /PATH/TO/SpyScrap/src/data:/spyscrap/data spyscrap --tag google -n "" -i
docker run -ti -v /PATH/TO/SpyScrap/src/data:/spyscrap/data spyscrap --tag google -n "" -i -p ""
docker run -ti -v /PATH/TO/SpyScrap/src/data:/spyscrap/data spyscrap -t twitter -n "" -s
docker run -ti -v /PATH/TO/SpyScrap/src/data:/spyscrap/data spyscrap -t facebook -n ""
docker run -ti -v /PATH/TO/SpyScrap/src/data:/spyscrap/data spyscrap --tag facebook -n "" -i
docker run -ti -v /PATH/TO/SpyScrap/src/data:/spyscrap/data spyscrap -t instagram -n ""
docker run -ti -v /PATH/TO/SpyScrap/src/data:/spyscrap/data spyscrap -t instagram -n "" -i
docker run -ti -v /PATH/TO/SpyScrap/src/data:/spyscrap/data spyscrap -t boe -n "" -s
docker run -ti -v /PATH/TO/SpyScrap/src/data:/spyscrap/data spyscrap -t boe -n "" -s -e -d -f
docker run -ti -v /PATH/TO/SpyScrap/src/data:/spyscrap/data spyscrap main.py -t yandex -k -i
docker run -ti -v /PATH/TO/SpyScrap/src/data:/spyscrap/data spyscrap main.py -t yandex -i
```

All the results are stored in the Docker shared volume you configure on your local machine when running the container. The first part of the mapping is the path to your local folder and you can change it; the second part must stay exactly as in the example (/spyscrap/data).

```bash
-v /PATH/TO/SpyScrap/src/data:/spyscrap/data
```

# Web Interface

This is a wrapper for the CLI.

![alt text](./GUI.png)

## Prerequisites

Docker and docker-compose

### Installation

```bash
cd web
docker-compose up
```

Once the images are built, open the browser:

```
http://localhost
```

For searching in Tinder you must put the database.db file created with the CLI in the volume inside the folder (a sketch for inspecting this database appears near the end of this README):

```
SpyScrap/web/data
```

You will also find in this folder the results of all the searches you make through the web interface.

## DISCLAIMER ⚠️

This tool is for educational purposes only. Please only use this tool on systems you have permission to access! Ethical use only.
Any actions and/or activities related to the tools we have created are solely your responsibility. Misuse of these tools can result in criminal charges being brought against the persons in question. We will not be held responsible in the event that criminal charges are brought against any individual who misuses these tools to break the law.

## Authors ✒️

* **Ruth González** - [@RuthGnz](https://twitter.com/RuthGnz)
* **Miguel Hernández** - [@MiguelHzBz](https://twitter.com/MiguelHzBz)

## Thanks 🎁

* BBVA Next Technologies SecLab Team
Feel free to collaborate!!
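The Tinder module stores its captures in the sqlite3 file `data/database.db` through the peewee models in `src/osint_sources/model.py`. A minimal sketch for inspecting that database, assuming it is run from `src/` (so the relative `./data/database.db` path and the `osint_sources` package resolve) with the packages from requirements.txt installed:

```python
# Minimal sketch: list users captured by the Tinder module.
# Assumes the working directory is src/ and data/database.db exists.
from osint_sources.model import User

for user in User.select():
    print(user.name, user.location, user.birth)
    for photo in user.photos:  # backref declared on Photos.user in model.py
        print("  photo:", photo.photo)
```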
> NOTE: Facial recognition is slow. The tool does not use threading, so speed depends on your machine. Be patient when the dataset is huge and you are using images to filter the results, especially in the Tinder module.

---
⌨️ with ❤️ by [@RuthGnz](https://twitter.com/RuthGnz) & [@MiguelHzBz](https://twitter.com/MiguelHzBz)


================================================
FILE: src/data/.gitignore
================================================
# Ignore everything in this directory
*
# Except this file
!.gitignore


================================================
FILE: src/main.py
================================================
import argparse
import os
import re
import sys

from osint_sources.model import create_tables
from osint_sources.scraper import *


def banner():
    print(r"""
    ------------------------------------------
    |                SpyScrap                |
    |                                        |
    |   Authors: Ruth Gonzalez (@RuthGnz)    |
    |          Miguel Hernandez (@MiguelHzBz)|
    |   Version: 1.0                         |
    |                                        |
    |                                        |
    ------------------------------------------
    """)


def getArguments(args):
    parser = argparse.ArgumentParser(description='EI - This tool scrapes social media to gather information about a target')
    parser.add_argument('-t', '--tag', dest='tag', help='Scraper to run; options: tinder, twitter, google, facebook, instagram, boe, yandex or all')
    parser.add_argument('-k', '--token', dest='token', help='If you choose the tinder/yandex option, provide a valid token')
    parser.add_argument('-n', '--name', dest='name', help='Name of the person you want to search for')
    parser.add_argument('-p', '--place', dest='place', help='Site you want to restrict the search to')
    parser.add_argument('-i', '--image', dest='image', help='Image you want to search with')
    parser.add_argument('-s', '--size', dest='size', help='Limit for searches')
    parser.add_argument('-e', '--explicit', dest='explicit', help='Default True. If true it searches for the exact text; if false it can search each word separately')
    parser.add_argument('-d', '--initdate', dest='initdate', help='Format is dd/mm/aaaa')
    parser.add_argument('-f', '--finaldate', dest='finaldate', help='Format is dd/mm/aaaa')
    parser.add_argument('-v', '--verbose', action='store_true', help='Increase output verbosity')
    args = parser.parse_args()
    if not args.tag:
        print("--------------")
        print("Error in input arguments:")
        print("Need one tag of input, -t/--tag twitter/facebook/instagram/google/tinder/boe/yandex/all")
        print("--------------")
        parser.print_help()
        sys.exit(-1)
    if args.tag.lower() == "tinder":
        if not args.token:
            print("--------------")
            print("Tinder token must be provided")
            print("--------------")
            parser.print_help()
            sys.exit(-1)
        else:
            print("Starting Tinder scraper...")
            tinder(args.token)
    if args.tag.lower() == "google":
        if not args.name:
            print("--------------")
            print("Name option must be provided")
            print("--------------")
            parser.print_help()
            sys.exit(-1)
        else:
            if not args.place:
                args.place = ''
            print("Starting Google scraper...")
            google(args.name, args.place, args.image, args.size, args.verbose)
    if args.tag.lower() == "twitter":
        if not args.name:
            print("--------------")
            print("Name option must be provided")
            print("--------------")
            parser.print_help()
            sys.exit(-1)
        if not args.size:
            print("Size default: 2")
            args.size = 2
        print("Starting Twitter scraper...")
        twitter_scrapper(args.name, args.size, args.image, args.verbose)
    if args.tag.lower() == "facebook":
        if not args.name:
            print("--------------")
            print("Name must be provided")
            print("--------------")
            parser.print_help()
            sys.exit(-1)
        if not args.size:
            print("Size default: 2")
            args.size = 2
        print("Starting Facebook scraper...")
        facebook_scrapper(args.name, args.image, args.size, args.verbose)
    if args.tag.lower() == "instagram":
        if not args.name:
            print("--------------")
            print("Name option must be provided")
            print("--------------")
            parser.print_help()
            sys.exit(-1)
        else:
            print("Starting Instagram scraper...")
            instagram_scrapper(args.name, args.image, args.verbose)
    if args.tag.lower() == "boe":
        if not args.name:
            print("--------------")
            print("Name option must be provided")
            print("--------------")
            parser.print_help()
            sys.exit(-1)
        else:
            if not args.size:
                args.size = 1
            print("Starting BOE scraper...")
            boe_scrapper(args.name, args.initdate, args.finaldate, args.size, args.explicit, args.verbose)
    if args.tag.lower() == "yandex":
        if not args.image:
            print("--------------")
            print("Image option must be provided")
            print("--------------")
            parser.print_help()
            sys.exit(-1)
        if not re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+] |[!*\(\), ]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', args.image) and not args.token:
            print("--------------")
            print("If you upload your own photo, an imgur client-id must be provided with the token option " + args.image)
            print("--------------")
            parser.print_help()
            sys.exit(-1)
        else:
            print("Starting Yandex scraper...")
            yandex_scrapper(args.image, args.token, args.verbose)
    if args.tag.lower() == "all":
        print("--------------")
        print("TBD")
        print("--------------")
        sys.exit(-1)
    return args


def main(argv):
    banner()
    if not os.path.isdir("data"):
        os.mkdir("data")
    args = getArguments(argv)
    print("--------------------")
    print("Thanks for using the SpyScrap tool")


if __name__ == '__main__':
    create_tables()
    main(sys.argv)


================================================
FILE: src/osint_sources/__init__.py
================================================

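The scraper entry points in src/osint_sources/scraper.py can also be driven directly from Python instead of through the Docker entrypoint. A minimal sketch, assuming it runs from `src/` with requirements.txt installed and, for the Selenium-based modules, a `chromedriver` binary in the working directory (the names searched for are placeholders):

```python
# Minimal sketch: call SpyScrap's scrapers programmatically.
import os

from osint_sources.model import create_tables
from osint_sources.scraper import boe_scrapper, instagram_scrapper

os.makedirs("data", exist_ok=True)  # main.py normally creates this folder
create_tables()                     # sqlite tables used by the Tinder module

# Instagram: name to search, optional known image for facial recognition, verbose flag
instagram_scrapper("jane doe", None, True)

# BOE: text, init/final dates (dd/mm/aaaa or None), pages, exact-match flag, verbose flag
boe_scrapper("jane doe", None, None, 1, True, True)
```

================================================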
FILE: src/osint_sources/boe.py ================================================ #!/usr/bin/python # coding: utf-8 # encoding=utf8 import sys import datetime from selenium.webdriver.common.keys import Keys import time from selenium import webdriver import os from parsel import Selector import urllib.parse from selenium.common.exceptions import NoSuchElementException import json from selenium.webdriver.chrome.options import Options import shutil import requests from io import BytesIO import xml.etree.ElementTree as ET def boe (text_to_search,initDate,outDate,pages,exact,verbose): if exact: text_to_search='"'+text_to_search+'"' pages=int(pages) chrome_options = Options() jsonData=[] chrome_options.add_argument("--headless") chrome_options.add_argument('--no-sandbox') chrome_options.add_argument('--disable-dev-shm-usage') if initDate!=None and outDate!=None: url = 'https://www.boe.es/buscar/boe.php?campo%5B0%5D=ORI&dato%5B0%5D%5B1%5D=1&dato%5B0%5D%5B2%5D=2&dato%5B0%5D%5B3%5D=3&dato%5B0%5D%5B4%5D=4&dato%5B0%5D%5B5%5D=5&dato%5B0%5D%5BT%5D=T&operador%5B0%5D=and&campo%5B1%5D=TIT&dato%5B1%5D=&operador%5B1%5D=and&campo%5B2%5D=DEM&dato%5B2%5D=&operador%5B2%5D=and&campo%5B3%5D=DOC&dato%5B3%5D='+text_to_search+'&operador%5B3%5D=and&campo%5B4%5D=NBO&dato%5B4%5D=&operador%5B4%5D=and&campo%5B5%5D=NOF&dato%5B5%5D=&operador%5B5%5D=and&operador%5B6%5D=and&campo%5B6%5D=FPU&dato%5B6%5D%5B0%5D='+initDate+'&dato%5B6%5D%5B1%5D='+outDate+'&page_hits=50&sort_field%5B0%5D=fpu&sort_order%5B0%5D=desc&sort_field%5B1%5D=ori&sort_order%5B1%5D=asc&sort_field%5B2%5D=ref&sort_order%5B2%5D=asc&accion=Buscar' else: url ='https://www.boe.es/buscar/boe.php?campo%5B0%5D=ORI&dato%5B0%5D%5B1%5D=1&dato%5B0%5D%5B2%5D=2&dato%5B0%5D%5B3%5D=3&dato%5B0%5D%5B4%5D=4&dato%5B0%5D%5B5%5D=5&dato%5B0%5D%5BT%5D=T&operador%5B0%5D=and&campo%5B1%5D=TIT&dato%5B1%5D=&operador%5B1%5D=and&campo%5B2%5D=DEM&dato%5B2%5D=&operador%5B2%5D=and&campo%5B3%5D=DOC&dato%5B3%5D='+text_to_search+'&operador%5B3%5D=and&campo%5B4%5D=NBO&dato%5B4%5D=&operador%5B4%5D=and&campo%5B5%5D=NOF&dato%5B5%5D=&operador%5B5%5D=and&operador%5B6%5D=and&campo%5B6%5D=FPU&dato%5B6%5D%5B0%5D=&dato%5B6%5D%5B1%5D=&page_hits=50&sort_field%5B0%5D=fpu&sort_order%5B0%5D=desc&sort_field%5B1%5D=ori&sort_order%5B1%5D=asc&sort_field%5B2%5D=ref&sort_order%5B2%5D=asc&accion=Buscar' chrome_path = './chromedriver' driver = webdriver.Chrome(chrome_path,chrome_options=chrome_options) driver.get(url) driver.implicitly_wait(20) links=[] for page in range(pages): elements=driver.find_elements_by_tag_name('li') for link in elements: li=link.get_attribute('class') if li=='resultado-busqueda': date=link.find_elements_by_tag_name('h4')[0].get_attribute('innerHTML').split(' ')[3] a=link.find_elements_by_tag_name('a') for i in a: cl=i.get_attribute('class') if cl=='resultado-busqueda-link-defecto': href=i.get_attribute('href') href=href.split('=')[1] newUrl='https://www.boe.es/diario_boe/xml.php?id='+href links.append(newUrl) nextPage=driver.find_elements_by_class_name('pagSig') if len(nextPage) == 0: break else: nextPage=nextPage[0] nextPageLink= nextPage.find_element_by_xpath('..').get_attribute('href') driver.get(nextPageLink) driver.quit() boe_data=[] for url in links: boe={} boe['url']=url remoteFile = urllib.request.urlopen(url).read() memoryFile = BytesIO(remoteFile) tree = ET.parse(memoryFile) root = tree.getroot() text=root.find('texto') tables=text.findall('table') results=[] for table in tables: is_important=False headings=[] content=[] thead=table.find('thead') if thead!=None: tr=thead.find('tr') 
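# Collect the table headings; a table is flagged as important when a heading mentions nombre, apellido, DNI, D.N.I or NIF.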
th=tr.findall('th') if len(th)>0: for t in th: if t!=None: data=t.text if data!=None: headings.append(data) if 'nombre' in data.lower() or 'apellido' in data.lower() or 'dni' in data.lower() or 'd.n.i' in data.lower() or 'nif' in data.lower(): is_important=True else: td=tr.findall('td') for t in td: p=t.find('p') if p!=None: data=p.text headings.append(data) if 'nombre' in data.lower() or 'apellido' in data.lower() or 'dni' in data.lower() or 'd.n.i' in data.lower() or 'nif' in data.lower(): is_important=True else: tr=table.findall('tr') for i,tri in enumerate(tr): td=tri.findall('td') for tdi in td: p=tdi.findall('p') heading="" for pi in p: if pi.get('class')!=None: if 'cabeza_tabla' in pi.get('class') : heading=heading+pi.text if 'nombre' in heading.lower() or 'apellido' in heading.lower() or 'dni' in heading.lower() or 'd.n.i' in heading.lower() or 'nif' in heading.lower(): is_important=True else: if pi.text != None: data=pi.text if 'nombre' in data.lower() or 'apellido' in data.lower() or 'dni' in data.lower() or 'd.n.i' in data.lower() or 'nif' in data.lower(): heading=heading+pi.text is_important=True if i==2: break if 'ANEXO' not in heading: headings.append(heading) if is_important: tbody=table.find('tbody') if tbody!=None: tr=tbody.findall('tr') else: tr=table.findall('tr') for t in tr: td=t.findall('td') dataTable={} for i,tdi in enumerate(td): info='' if tdi.text.strip()==None or tdi.text.strip()=="": p=tdi.find('p') if p!=None: info=p.text else: info=tdi.text if i>len(headings)-1: dataTable[i]=info else: dataTable[headings[i]]=info content.append(dataTable) results.append({'table':content,'headings':headings}) boe['datatables']=results texto=[] if len(results)==0: p=text.findall('p') is_important_line=len(p) for i,pi in enumerate(p): if pi.text != None: data=pi.text if 'nombre' in data.lower() or 'apellido' in data.lower() or 'dni' in data.lower() or 'd.n.i' in data.lower() or 'nif' in data.lower(): is_important_line=i if is_important_line<=i: texto.append(pi.text) #try to search the text boe['texto']=texto boe_data.append(boe) now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") if not os.path.isdir("data/boe"): os.mkdir( "data/boe" ); path=os.path.join('data/boe',str(now) + '_boe_data.json') with open(path, 'w+') as outfile: json.dump(boe_data, outfile) if verbose: for i in boe_data: print(i["url"]) print("--------------------") if i["datatables"]: print(i["datatables"]) if i["texto"]: print(i["texto"]) print("Results BOE in: " + str(path)) response={'results':str(path)} return response ================================================ FILE: src/osint_sources/facebook.py ================================================ #!/usr/bin/python # coding: utf-8 # encoding=utf8 import sys import datetime from selenium.webdriver.common.keys import Keys import time from selenium import webdriver import os from parsel import Selector import urllib.parse from selenium.common.exceptions import NoSuchElementException import json from selenium.webdriver.chrome.options import Options import shutil import requests from osint_sources.recognition import * def facebook (name_to_search,knownimage,size,verbose): chrome_options = Options() jsonData=[] chrome_options.add_argument("--headless") chrome_options.add_argument("--no-sandbox") chrome_options.add_argument("--disable-dev-shm-usage") chrome_path = './chromedriver' driver = webdriver.Chrome(chrome_path,chrome_options=chrome_options) driver.get("https://es-la.facebook.com/public/"+name_to_search) 
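# Query Facebook's public people directory by name; the result list is scrolled and paginated below.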
print("https://es-la.facebook.com/public/"+name_to_search) driver.implicitly_wait(20) isMoreButton=True for i in range(1,int(size)): isEnd=driver.find_elements_by_id('browse_end_of_results_footer') print(isEnd) if len(isEnd)>0: isMoreButton=False driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") time.sleep(2) links=[] results=driver.find_elements_by_id('BrowseResultsContainer') if len(results)>0: results=results[0] info=results.find_elements_by_tag_name('a') else: info=[] for user in info: user_class=user.get_attribute('class') if user_class=='_32mo': links.append(user.get_attribute('href')) user={'name':user.get_attribute('title'),'profile':user.get_attribute('href')} jsonData.append(user) isMoreButton=True i=0 id_value="fbBrowseScrollingPagerContainer" while isMoreButton: more=driver.find_elements_by_id(id_value+str(i)) i=i+1 if len(more)==0: isMoreButton=False else: div = more[0] info=div.find_elements_by_tag_name('a') for user in info: user_class=user.get_attribute('class') if user_class=='_32mo': links.append(user.get_attribute('href')) user={'name':user.get_attribute('title'),'profile':user.get_attribute('href')} jsonData.append(user) now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") if not os.path.isdir("data/facebook"): os.mkdir( "data/facebook"); os.mkdir("data/facebook/"+str(now)+"_images") path= os.path.join('data/facebook',str(now)+'_facebook_data.json') with open(path, 'w+') as outfile: json.dump(jsonData, outfile) if verbose: print(jsonData) print("Results Facebook in: " + str(path)) j=0 response={'results':str(path)} if knownimage: for ind,l in enumerate(links): user=jsonData[ind] driver.get(l) try: div=driver.find_elements_by_class_name('profilePicThumb')[0] img=div.find_elements_by_tag_name('img')[0] url=img.get_attribute('src') name=os.path.join('data/facebook/'+str(now)+'_images',str(j)+"-"+name_to_search+".jpg") j=j+1 urllib.request.urlretrieve(url, name) user['image']=name except: pass with open(path, 'w+') as outfile: json.dump(jsonData, outfile) driver.quit() print("Start compare images") face_identification(knownimage,'./data/facebook/'+str(now)+'_images/') response['images']='./data/facebook/'+str(now)+'_images/' response['recognized']='./data/facebook/'+str(now)+'_images/recognized/' return response ================================================ FILE: src/osint_sources/google.py ================================================ from selenium.webdriver.common.keys import Keys import time import urllib.request from urllib.parse import unquote from selenium import webdriver import os import json import datetime import face_recognition from os import listdir,remove from os.path import isfile, join from selenium.webdriver.chrome.options import Options from osint_sources.recognition import * import spacy import requests as req import re def containsAny(str, set): """ Check whether sequence str contains ANY of the items in set. 
""" return 1 in [c in str for c in set] def google(toSearch,placeToSearch,knownImage,number,verbose): chrome_options = Options() chrome_options.add_argument("--headless") chrome_options.add_argument("--no-sandbox") chrome_options.add_argument("--disable-dev-shm-usage") chrome_path = './chromedriver' driver = webdriver.Chrome(chrome_path,chrome_options=chrome_options) if placeToSearch != None and len(placeToSearch)>0: driver.get("https://www.google.com/search?q=site:"+placeToSearch+"+AND+%22"+toSearch+"%22&source=lnms&tbm=isch&sa=X&ved=0ahUKEwiz2eSN_9vgAhUJoRQKHU8YCuwQ_AUIDigB&biw=1181&bih=902") else: driver.get("https://www.google.com/search?q="+toSearch+"&source=lnms&tbm=isch&sa=X&ved=0ahUKEwiz2eSN_9vgAhUJoRQKHU8YCuwQ_AUIDigB&biw=1181&bih=902") driver.implicitly_wait(50) if number == None: number=len(search) isMoreButton=True while isMoreButton: last_height = driver.execute_script("return document.body.scrollHeight") while True: driver.execute_script("window.scrollTo(0, document.body.scrollHeight-1000);") # Wait to load the page. driver.implicitly_wait(30) # seconds new_height = driver.execute_script("return document.body.scrollHeight") if new_height == last_height: break last_height = new_height # sleep for 30s driver.implicitly_wait(30) # seconds inputs=driver.find_elements_by_class_name('mye4qd') input_elem=None for inp in inputs: more=inp.get_attribute("type") if more=='button': input_elem=inp break if input_elem==None: isMoreButton=False else: print('More Elements') try: print('Click') input_elem.click() driver.implicitly_wait(30) except Exception as e: print('break',e) driver.implicitly_wait(30) break out = [] jsonfile={} t ="" my_set = {"{", "}", "&", "#", "_", "=",":","(",")", "+","."} nlp = spacy.load("es_core_news_sm") search=driver.find_elements_by_xpath("//img[contains(@class,'Q4LuWd')]") j=1 notRepeatPhotos = [] notRepeatFromUrl = [] number = int(number) totalPages = number*20 now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") if not os.path.isdir("data/google"): os.mkdir( "data/google"); os.mkdir("data/google/"+str(now)+"_images") for i in range(0,len(search)): img=search[i] try: img.click() time.sleep(2) actual_images = driver.find_elements_by_css_selector('img.n3VNCb') for actual_image in actual_images: if actual_image.get_attribute('src') and 'https' in actual_image.get_attribute('src'): url_image = actual_image.get_attribute('src') text_image = actual_image.get_attribute('alt') n = actual_image.find_element_by_xpath('..') url_content = n.get_attribute("href") jsonfile["photos"]=url_image jsonfile["from_url"]=url_content jsonfile["info"] = text_image if not url_content.endswith(".pdf"): try: resp = req.get(url_content) content = resp.text stripped = re.sub('<[^<]+?>', '', content) stripped_filter = re.sub('\n', '', stripped) stripped_filter2 = re.sub('\t', '', stripped_filter) doc = nlp(stripped_filter2) locs = [] for e in doc.ents: if e.label_ == "LOC": if not containsAny(e.text,my_set) and not e.text in locs: locs.append(e.text) jsonfile["LOC_LIST"] = locs except: pass else: jsonfile["LOC_LIST"] = [] if placeToSearch != None: name=os.path.join('data/google/'+str(now)+'_images',str(i)+"-"+placeToSearch+"-"+toSearch+".jpg") else: name=os.path.join('data/google/'+str(now)+'_images',str(i)+"-"+toSearch+".jpg") try: urllib.request.urlretrieve(url_image, name) jsonfile['storedImage']=name except: src=actual_image.get_attribute('src') if src != None: urllib.request.urlretrieve(src, name) jsonfile['storedImage']=name out.append(jsonfile) jsonfile={} if len(out)>= 
int(totalPages): break except Exception as er: print(er) if len(out)>= int(totalPages): break path= os.path.join('data/google',str(now)+'_google_data.json') response={'results':str(path)} with open(path, 'w+') as outfile: json.dump(out, outfile) print("Results Google in: " + str(path)) if verbose: print(out) if knownImage: face_identification(knownImage,'data/google/'+str(now)+'_images/') response['images']='./data/google/'+str(now)+'_images/' response['recognized']='./data/google/'+str(now)+'_images/recognized/' return response ================================================ FILE: src/osint_sources/instagram.py ================================================ #!/usr/bin/python # coding: utf-8 # encoding=utf8 import sys import datetime import time import os import requests import json import urllib.request from osint_sources.recognition import * def instagram (name_to_search,knownimage,verbose): resp = requests.get(url='https://www.instagram.com/web/search/topsearch/?context=blended&query='+name_to_search) now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") if not os.path.isdir("data/instagram"): os.mkdir("data/instagram"); path=os.path.join('data/instagram',str(now)+'_instagram_data.json') users=resp.json()['users'] jsonData=[] j=0 for u in users: if verbose: print('Username: '+u['user']['username']) print('Full Name: '+u['user']['full_name']) print('Profile: https://www.instagram.com/'+u['user']['username']) print('Is Private: '+str(u['user']['is_private'])) print('Is Verified: '+str(u['user']['is_verified'])) print() if knownimage: if not os.path.isdir("data/instagram/"+str(now)+"_images"): os.mkdir("data/instagram/"+str(now)+"_images"); image_name=os.path.join('data/instagram/'+str(now)+'_images',str(j)+"-"+'instagram.jpg') try: urllib.request.urlretrieve(u['user']['profile_pic_url'], image_name) except Exception as e: print(e) user={'username':u['user']['username'],'full_name':u['user']['full_name'],'profile':'https://www.instagram.com/'+u['user']['username'],'is_private':u['user']['is_private'],'is_verified':u['user']['is_verified'],'image':image_name} j=j+1 else: user={'username':u['user']['username'],'full_name':u['user']['full_name'],'profile':'https://www.instagram.com/'+u['user']['username'],'is_private':u['user']['is_private'],'is_verified':u['user']['is_verified']} jsonData.append(user) with open(path, 'w+') as outfile: json.dump(jsonData, outfile) print("Results Instagram in: " + str(path)) response={'results':str(path)} if len(users)>0: if knownimage: print("Start compare images.") face_identification(knownimage,"data/instagram/"+str(now)+"_images/") response['images']='./data/instagram/'+str(now)+'_images/' response['recognized']='./data/instagram/'+str(now)+'_images/recognized/' if verbose: for r, d, f in os.walk('./data/instagram/'+str(now)+'_images/recognized/'): for file in f: index_json = int(file.replace("-instagram.jpg","")) print("-----------") print(" MATCH") print("-----------") print('Username: '+jsonData[index_json]['username']) print('Full Name: '+jsonData[index_json]['full_name']) print('Profile: https://www.instagram.com/'+jsonData[index_json]['username']) print('Is Private: '+str(jsonData[index_json]['is_private'])) print('Is Verified: '+str(jsonData[index_json]['is_verified'])) print() return response ================================================ FILE: src/osint_sources/model.py ================================================ from peewee import * import datetime DATABASE = './data/database.db' database = SqliteDatabase(DATABASE) # model 
definitions -- the standard "pattern" is to define a base model class # that specifies which database to use. then, any subclasses will automatically # use the correct storage. for more information, see: # http://charlesleifer.com/docs/peewee/peewee/models.html#model-api-smells-like-django class BaseModel(Model): class Meta: database = database class User(BaseModel): name = CharField() uid = CharField(unique=True) bio = CharField() birth = DateTimeField() gender=IntegerField() s_number=IntegerField() location = CharField() def getUsers(): select=User.select() return (select) def insertUser(user_elem): try: result = (User .create(name=user_elem['name'],uid=user_elem['uid'], bio=user_elem['bio'], birth=user_elem['birth'],gender=user_elem['gender'],s_number=user_elem['s_number'],location=user_elem['location'])) Photos.insertPhotos(result,user_elem['photos']) InstagramPhotos.insertPhotosIg(result,user_elem['instagram']) Schools.insertSchools(result,user_elem['schools']) Jobs.insertJobs(result,user_elem['jobs']) except IntegrityError: print('Couldnt insert user it might be duplicated') def getIds(): result=User.select() response=[] for u in result.iterator(): response.append(u.uid) return(response) class Photos(BaseModel): user=ForeignKeyField(User, backref='photos') photo=CharField() def insertPhotos(user,photos_elems): for p in photos_elems: res=Photos.insert(user=user, photo=p).execute() class InstagramPhotos(BaseModel): user=ForeignKeyField(User, backref='inst_photos') photo=CharField() def insertPhotosIg(user,photos_elems): for p in photos_elems: InstagramPhotos.insert(user=user, photo=p).execute() class Schools(BaseModel): user=ForeignKeyField(User, backref='schools') school=CharField() def insertSchools(user,schools): for s in schools: res=Schools.insert(user=user, school=s).execute() class Jobs(BaseModel): user=ForeignKeyField(User, backref='jobs') job=CharField() def insertJobs(user,jobs): for j in jobs: res=Jobs.insert(user=user, job=j).execute() class GoogleUrls(): folder = CharField() photo = CharField() url = CharField() def create_tables(): with database: database.create_tables([User,Photos,InstagramPhotos,Schools,Jobs]) ================================================ FILE: src/osint_sources/recognition.py ================================================ import face_recognition from os import listdir,remove from os.path import isfile, join import os import cv2 import numpy as np def face_identification(known_image,folder): #print('****** Image recognition *******') onlyfiles = [join(folder, f) for f in listdir(folder) if isfile(join(folder, f))] try: known_image_recon = face_recognition.load_image_file(known_image) known_encoding = face_recognition.face_encodings(known_image_recon)[0] except: print('Not valid known image') return os.mkdir( folder+'recognized/'); for image in onlyfiles: if 'data.json' not in image: unknown_image = face_recognition.load_image_file(image) if len(face_recognition.face_encodings(unknown_image))>0: unknown_encoding = face_recognition.face_encodings(unknown_image)[0] results = face_recognition.compare_faces([known_encoding], unknown_encoding) if results[0]==True: img_name=image.split('/') img_name=img_name[len(img_name)-1] new_image=folder+'/recognized/'+img_name os.rename(image,new_image) else: pass else: pass #print('Not face found') else: pass #print() ================================================ FILE: src/osint_sources/scraper.py ================================================ import csv from osint_sources.tinder import * from 
osint_sources.model import * from osint_sources.google import * from osint_sources.twitter import * from osint_sources.facebook import * from osint_sources.instagram import * from osint_sources.boe import * from osint_sources.yandex import * def tinder(token): #start_tinder_scrap scan=Tinder() authtk=scan.get_auth_token(token) print(authtk) unique_list_ids=[] #load existent ids from database unique_list_ids=User.getIds() print(len(unique_list_ids)) while True: response=scan.getUserInfo() if 'error' in response: if response['msg']=='limit rate': break elif response['msg']=='no data': print('no data') break else: if response == "Error": break ids=response['ids'] data=response['data'] differents= list(set(ids) - set(unique_list_ids)) unique_list_ids.extend(differents) for d in differents: userInfo = [usr['user_info'] for usr in data if usr['id']==d] for user in userInfo: User.insertUser(user) scan.diskike_users(userInfo) print(len(unique_list_ids)) def yandex_scrapper(img,token,verbose): yandex(img,token,verbose) def linkedin(): pass def google_scrapper(toSearch,place,knownImage,number,verbose): google(toSearch,place,knownImage,number,verbose) def twitter_scrapper(name,size,knownimage,verbose): twitter(name,size,knownimage,verbose) def facebook_scrapper(name,knownImage,size,verbose): facebook(name,knownImage,size,verbose) def instagram_scrapper(name,knownImage,verbose): instagram(name,knownImage,verbose) def boe_scrapper(toSearch,initDate,finalDate,size,explicit,verbose): if explicit==None: explicit=True boe(toSearch,initDate,finalDate,size,explicit,verbose) ================================================ FILE: src/osint_sources/tinder.py ================================================ # coding=utf-8 import json import config import requests import numpy as np headers = { 'app_version': '6.9.4', 'platform': 'ios', "content-type": "application/json", "User-agent": "Tinder/7.5.3 (iPhone; iOS 10.3.2; Scale/2.00)", "Accept": "application/json" } config.host="https://api.gotinder.com" class Tinder: def __init__(self): self.gender = "" self.location = "location" def get_auth_token(self,tinderToken): try: headers.update({"X-Auth-Token": tinderToken}) #print(tinder_auth_token) response = self.get_self() print("You have been successfully authorized!") profile = self.get_self() self.gender = profile['gender'] self.location = profile['pos_info']['city']['name'] return tinderToken except Exception as e: print(e) return {"error": "Something went wrong. Sorry, but we could not authorize you."} def authverif(self): res = self.get_auth_token(config.fb_access_token, config.fb_user_id) if "error" in res: return False return True def get_recommendations(self): ''' Returns a list of users that you can swipe on ''' try: r = requests.get('https://api.gotinder.com/user/recs', headers=headers) return r.json() except requests.exceptions.RequestException as e: print("Something went wrong with getting recomendations:", e) def get_updates(self,last_activity_date=""): ''' Returns all updates since the given activity date. The last activity date is defaulted at the beginning of time. 
Format for last_activity_date: "2017-07-09T10:28:13.392Z" ''' try: url = config.host + '/updates' r = requests.post(url, headers=headers, data=json.dumps({"last_activity_date": last_activity_date})) return r.json() except requests.exceptions.RequestException as e: print("Something went wrong with getting updates:", e) def get_self(self): ''' Returns your own profile data ''' try: url = config.host + '/profile' r = requests.get(url, headers=headers) return r.json() except requests.exceptions.RequestException as e: print("Something went wrong. Could not get your data:", e) def change_preferences(self,**kwargs): ''' ex: change_preferences(age_filter_min=30, gender=0) kwargs: a dictionary - whose keys become separate keyword arguments and the values become values of these arguments age_filter_min: 18..46 age_filter_max: 22..55 age_filter_min <= age_filter_max - 4 gender: 0 == seeking males, 1 == seeking females distance_filter: 1..100 discoverable: true | false {"photo_optimizer_enabled":false} ''' try: url = config.host + '/profile' r = requests.post(url, headers=headers, data=json.dumps(kwargs)) return r.json() except requests.exceptions.RequestException as e: print("Something went wrong. Could not change your preferences:", e) def get_meta(self): ''' Returns meta data on yourself. Including the following keys: ['globals', 'client_resources', 'versions', 'purchases', 'status', 'groups', 'products', 'rating', 'tutorials', 'travel', 'notifications', 'user'] ''' try: url = config.host + '/meta' r = requests.get(url, headers=headers) return r.json() except requests.exceptions.RequestException as e: print("Something went wrong. Could not get your metadata:", e) def get_meta_v2(self): ''' Returns meta data on yourself from V2 API. Including the following keys: ['account', 'client_resources', 'plus_screen', 'boost', 'fast_match', 'top_picks', 'paywall', 'merchandising', 'places', 'typing_indicator', 'profile', 'recs'] ''' try: url = config.host + '/v2/meta' r = requests.get(url, headers=headers) return r.json() except requests.exceptions.RequestException as e: print("Something went wrong. Could not get your metadata:", e) def update_location(self,lat, lon): ''' Updates your location to the given float inputs Note: Requires a passport / Tinder Plus ''' try: url = config.host + '/passport/user/travel' r = requests.post(url, headers=headers, data=json.dumps({"lat": lat, "lon": lon})) return r.json() except requests.exceptions.RequestException as e: print("Something went wrong. Could not update your location:", e) def reset_real_location(self): try: url = config.host + '/passport/user/reset' r = requests.post(url, headers=headers) return r.json() except requests.exceptions.RequestException as e: print("Something went wrong. Could not update your location:", e) def get_recs_v2(self): ''' This works more consistently then the normal get_recommendations becuase it seeems to check new location ''' try: url = config.host + '/v2/recs/core?locale=en-US' r = requests.get(url, headers=headers) return r.json() except Exception as e: print('excepted') def set_webprofileusername(self,username): ''' Sets the username for the webprofile: https://www.gotinder.com/@YOURUSERNAME ''' try: url = config.host + '/profile/username' r = requests.put(url, headers=headers, data=json.dumps({"username": username})) return r.json() except requests.exceptions.RequestException as e: print("Something went wrong. 
Could not set webprofile username:", e) def reset_webprofileusername(self,username): ''' Resets the username for the webprofile ''' try: url = config.host + '/profile/username' r = requests.delete(url, headers=headers) return r.json() except requests.exceptions.RequestException as e: print("Something went wrong. Could not delete webprofile username:", e) def get_person(self,id): ''' Gets a user's profile via their id ''' try: url = config.host + '/user/%s' % id r = requests.get(url, headers=headers) return r.json() except requests.exceptions.RequestException as e: print("Something went wrong. Could not get that person:", e) def send_msg(self,match_id, msg): try: url = config.host + '/user/matches/%s' % match_id r = requests.post(url, headers=headers, data=json.dumps({"message": msg})) return r.json() except requests.exceptions.RequestException as e: print("Something went wrong. Could not send your message:", e) def unmatch(self,match_id): try: url = config.host + '/user/matches/%s' % match_id r = requests.delete(url, headers=headers) return r.json() except requests.exceptions.RequestException as e: print("Something went wrong. Could not unmatch person:", e) def superlike(self,person_id): try: url = config.host + '/like/%s/super' % person_id r = requests.post(url, headers=headers) return r.json() except requests.exceptions.RequestException as e: print("Something went wrong. Could not superlike:", e) def like(self,person_id): try: url = config.host + '/like/%s' % person_id r = requests.get(url, headers=headers) print(r) return r.json() except requests.exceptions.RequestException as e: print("Something went wrong. Could not like:", e) def dislike(self,person_id,s_number): try: url = config.host + '/pass/%s' % person_id+'?s_number='+str(s_number) headers2=headers headers2.pop('content-type', None) r = requests.get(url, headers=headers2, timeout=0.7).json() return r except requests.exceptions.RequestException as e: print("Something went wrong. Could not dislike:", e) def report(self,person_id, cause, explanation=''): ''' There are three options for cause: 0 : Other and requires an explanation 1 : Feels like spam and no explanation 4 : Inappropriate Photos and no explanation ''' try: url = config.host + '/report/%s' % person_id r = requests.post(url, headers=headers, data={ "cause": cause, "text": explanation}) return r.json() except requests.exceptions.RequestException as e: print("Something went wrong. Could not report:", e) def match_info(self,match_id): try: url = config.host + '/matches/%s' % match_id r = requests.get(url, headers=headers) return r.json() except requests.exceptions.RequestException as e: print("Something went wrong. Could not get your match info:", e) def all_matches(self): try: url = config.host + '/v2/matches' r = requests.get(url, headers=headers) return r.json() except requests.exceptions.RequestException as e: print("Something went wrong. Could not get your match info:", e) def fast_match_info(self): try: url = config.host + '/v2/fast-match/preview' r = requests.get(url, headers=headers) count = r.headers['fast-match-count'] # image is in the response but its in hex.. return count except requests.exceptions.RequestException as e: print("Something went wrong. Could not get your fast-match count:", e) def trending_gifs(self,limit=3): try: url = config.host + '/giphy/trending?limit=%s' % limit r = requests.get(url, headers=headers) return r.json() except requests.exceptions.RequestException as e: print("Something went wrong. 
Could not get the trending gifs:", e) def gif_query(self,query, limit=3): try: url = config.host + '/giphy/search?limit=%s&query=%s' % (limit, query) r = requests.get(url, headers=headers) return r.json() except requests.exceptions.RequestException as e: print("Something went wrong. Could not get your gifs:", e) # def see_friends(self): # try: # url = config.host + '/group/friends' # r = requests.get(url, headers=headers) # return r.json()['results'] # except requests.exceptions.RequestException as e: # print("Something went wrong. Could not get your Facebook friends:", e) def getUserInfo(self): recs=self.get_recs_v2() if recs == None: return "Error" if 'data' in recs: data=[] ids=[] try: results=recs['data']['results'] except: return {'error':True,'msg':'no data'} for r in results: user_info={} user_info['name']=r['user']['name'] user_info['uid']=r['user']['_id'] ids.append(r['user']['_id']) try: user_info['bio']=r['user']['bio'] except: user_info['bio']='' user_info['birth']=r['user']['birth_date'] photos=[] for p in r['user']['photos']: photos.append(p['url']) user_info['photos']=photos try: user_info['jobs']=r['user']['jobs'] except: user_info['jobs']='' try: user_info['schools']=r['user']['schools'] except: user_info['schools']='' user_info['gender']=r['user']['gender'] user_info['s_number']=r['s_number'] try: inst=r['instagram'] photos=[] for p in inst['photos']: photos.append(p['image']) user_info['instagram']=photos except: user_info['instagram']=[] user_info['location']=self.location data.append({'user_info':user_info,'id':r['user']['_id']}) return {'data':data,'ids':ids} elif 'error' in recs: if recs['error']['code']==42901: print('Limit Rate') return {'error':True,'msg':'limit rate'} def diskike_users(self,user_info): for uinf in user_info: resp=self.dislike(uinf['uid'],uinf['s_number']) ================================================ FILE: src/osint_sources/twitter.py ================================================ #!/usr/bin/python # coding=utf-8 # encoding=utf8 import sys import datetime from selenium.webdriver.common.keys import Keys import time from selenium import webdriver import os from parsel import Selector import urllib.parse from selenium.common.exceptions import NoSuchElementException import json from selenium.webdriver.chrome.options import Options from difflib import SequenceMatcher from osint_sources.recognition import * def twitter (name_to_search,page_number,knownimage,verbose): placeToSearch='twitter.com' chrome_options = Options() chrome_options.add_argument("--headless") chrome_options.add_argument("--no-sandbox") chrome_options.add_argument("--disable-dev-shm-usage") chrome_path = './chromedriver' driver = webdriver.Chrome(chrome_path,chrome_options=chrome_options) people_list=[] for i in range(int(page_number)): driver.get("https://www.google.com/search?q=site:"+placeToSearch+"+AND+"+name_to_search + "&start=" + str(10 * i)) search=driver.find_elements_by_tag_name('a') time.sleep(10) for s in search: href=s.get_attribute('href') if href != None: if "https://twitter.com/" in href: if "/status/" not in href and "/media" not in href and "/hashtag/" not in href and "webcache.googleusercontent.com" not in href and "google.com" not in href: people_list.append(href) elif "/hashtag/" not in href and "webcache.googleusercontent.com" not in href and "google.com" not in href: if "/status/" in href: people_list.append(href.split("/status/")[0]) elif "/media" not in s.text: people_list.append(href.split("/media")[0]) people_list=set(people_list) now = 
datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") if not os.path.isdir("data/twitter"): os.mkdir("data/twitter"); path=os.path.join('data/twitter',str(now)+'_twitter_data.json') jsonData=[] userLink = set() for p in people_list: if verbose: print("*******************************************************************************************************") print(p) driver.get(p) driver.implicitly_wait(50) time.sleep(2) sel = Selector(text=driver.page_source) name = sel.xpath('//*[@id="react-root"]/div/div/div[2]/main/div/div/div/div/div/div[2]/div/div/div[1]/div/div[2]/div/div/div[1]/div/span[1]/span/text()').extract_first() link = p description = sel.xpath('//*[@id="react-root"]/div/div/div[2]/main/div/div/div/div/div/div[2]/div/div/div[1]/div/div[3]/div/div/span[1]/text()').extract_first() location = sel.xpath('//*[@id="react-root"]/div/div/div[2]/main/div/div/div/div/div/div[2]/div/div/div[1]/div/div[4]/div/span[1]/span/span/text()').extract_first() member_since = sel.xpath('//*[@id="react-root"]/div/div/div[2]/main/div/div/div/div/div/div[2]/div/div/div[1]/div/div[4]/div/span[2]/svg/text()').extract_first() born=sel.xpath('//*[@id="react-root"]/div/div/div[2]/main/div/div/div/div/div/div[2]/div/div/div[1]/div/div[4]/div/span[2]/svg/text()').extract_first() webpage=sel.xpath('//*[@id="react-root"]/div/div/div[2]/main/div/div/div/div/div/div[2]/div/div/div[1]/div/div[4]/div/a/text()').extract_first() image_url=sel.xpath('//*[@id="react-root"]/div/div/div[2]/main/div/div/div/div/div/div[2]/div/div/div[1]/div/div[1]/a/div[1]/div[2]/div/img/@src').extract_first() if name==None: name="" if str(link) not in userLink: userLink.add(link) nameParts = name_to_search.split(' ') isMatcher = False for n in nameParts: if SequenceMatcher(None,n, name).ratio()>0.4 or SequenceMatcher(None,n,str(link)).ratio()>0.4 or n in str(description).lower(): isMatcher=True if SequenceMatcher(None,name_to_search, name).ratio()>0.4 or SequenceMatcher(None,name_to_search,str(link)).ratio()>0.4 or name_to_search in str(description).lower(): isMatcher=True if isMatcher: userData = {} if verbose: print("Name: "+str(name)) print("Link: "+str(link)) print("Description: "+str(description)) print("Location: "+ str(location)) print("Member since: "+str(member_since)) print("Born: "+str(born)) print("Web: "+str(webpage)) print ("Profile image url: "+str(image_url)) print('\n') print('\n') if knownimage: if not os.path.isdir("data/twitter/"+str(now)+"_images"): os.mkdir("data/twitter/"+str(now)+"_images"); image=os.path.join("data/twitter/"+str(now)+"_images/"+str(link.split('.com/')[1])+".jpg") try: urllib.request.urlretrieve(image_url, image) userData={'storedImage':image,'name':str(name),'link':str(link),'description':str(description),'location':str(location),'member_since':str(member_since),'born':str(born),'web':str(webpage),'image':str(image_url)} jsonData.append(userData) except: pass else: userData={'name':str(name),'link':str(link),'description':str(description),'location':str(location),'member_since':str(member_since),'born':str(born),'web':str(webpage),'image':str(image_url)} jsonData.append(userData) with open(path, 'w+') as outfile: json.dump(jsonData, outfile) print("Results Twitter in: " + str(path)) response={'results':str(path)} if len(people_list)>0: if knownimage: print("Compare similarity images.") face_identification(knownimage,'./data/twitter/'+str(now)+'_images/') response['images']='./data/twitter/'+str(now)+'_images/' response['recognized']='./data/twitter/'+str(now)+'_images/recognized/' driver.quit() 
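# driver.quit() above closes the headless Chrome session so chromedriver processes do not leak between runs.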
================================================
FILE: src/osint_sources/yandex.py
================================================
#!/usr/bin/python
# coding: utf-8
# encoding=utf8
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from urllib.parse import unquote
from os.path import isfile, join
from os import listdir, remove
import time
import json
import os
import requests
import re
import datetime
import base64
import urllib.parse
import urllib.request
from urllib.request import Request, urlopen
import random
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from osint_sources.recognition import *
import sys


def isCaptcha(driver):
    headers = driver.find_elements_by_tag_name('h1')
    for h in headers:
        if h.get_attribute('innerHTML') == "oops…":
            print('captcha detected')
            return True
    return False


def checkProxy(proxy):
    proxies = {
        "http": proxy,
        "https": proxy,
    }
    try:
        resp = requests.get("https://google.com", proxies=proxies, timeout=20)
        print("Google responds " + str(resp.status_code) + " in " + str(resp.elapsed.total_seconds()) + " seconds.")
        if (resp.status_code != 200) or resp.elapsed.total_seconds() > 10:
            return 0
        else:
            return 1
    except:
        print("Error while checking the proxy: " + str(proxy))
        return 0


def crawlProxy():
    # Retrieve latest proxies
    ua = UserAgent()
    proxies = []
    proxies_req = Request('https://www.sslproxies.org/')
    proxies_req.add_header('User-Agent', ua.random)
    proxies_doc = urlopen(proxies_req).read().decode('utf8')
    soup = BeautifulSoup(proxies_doc, 'html.parser')
    proxies_table = soup.find(id='proxylisttable')
    # Save proxies in the array
    for row in proxies_table.tbody.find_all('tr'):
        proxies.append({
            'ip': row.find_all('td')[0].string,
            'port': row.find_all('td')[1].string})
    # Pick random proxies until one passes the health check
    invalidProxy = True
    proxy = None
    while invalidProxy:
        random_index = random.randint(0, len(proxies) - 1)
        random_proxy = proxies[random_index]
        proxy = str(random_proxy['ip']) + ":" + str(random_proxy['port'])
        chk = checkProxy(proxy)
        if chk:
            invalidProxy = False
    return proxy


def searchImages(driver, now, verbose):
    os.mkdir("data/yandex/" + str(now) + "_images")
    search = driver.find_elements_by_class_name('other-sites__item')
    j = 0
    print('Retrieving images')
    out = []
    for s in search:
        try:
            a = s.find_elements_by_tag_name('a')
            for i, al in enumerate(a):
                aclass = al.get_attribute('class')
                if aclass == 'other-sites__preview-link':
                    link = al.get_attribute('href')
                    if link != None and link != "":
                        name = os.path.join('data/yandex/' + str(now) + '_images', str(j) + "-yandex.jpg")
                        j = j + 1
                        title = s.find_elements_by_class_name('other-sites__snippet-title')[0]
                        atittle = title.find_elements_by_tag_name('a')[0]
                        title = atittle.get_attribute('innerText')
                        url = atittle.get_attribute('href')
                        domain = s.find_elements_by_class_name('other-sites__snippet-site')[0]
                        domain = domain.find_elements_by_tag_name('a')[0].get_attribute('innerText')
                        info = {}
                        info["originUrl"] = link
                        info["title"] = title
                        info["url"] = url
                        info["domain"] = domain
                        if verbose:
                            print("-----------------")
                            print(info)
                        out.append(info)
                        try:
                            urllib.request.urlretrieve(link, name)
                        except:
                            print("Failed when downloading photo " + str(j))
        except Exception as e:
            print(e)
    return out


def deletedImage(hashimage, token):
    headers = {'Authorization': 'Client-ID ' + token}
    req = requests.delete(url="https://api.imgur.com/3/image/" + hashimage, headers=headers)
    if req.status_code == requests.codes.ok:
        return True
    else:
        return False
os.mkdir("data/yandex"); image_url = image image_delete = "" results={} url = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+] |[!*\(\), ]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', image) if not url: try: f = open(image, "rb") except FileNotFoundError as e: print ("Image not found: " + image) return [] #sys.exit(-1) image_data = f.read() b64_image = base64.standard_b64encode(image_data) client_id = token headers = {'Authorization': 'Client-ID ' + token} data = {'image': b64_image, 'title': 'test'} try: request = requests.post(url="https://api.imgur.com/3/upload.json", data=data,headers=headers) if request.status_code == requests.codes.ok: image_url = request.json()['data']['link'] image_delete = request.json()['data']['deletehash'] print ("Image upload to imgur: " + image_url) except Exception as e: print(e) sys.exit(-1) proxy=crawlProxy() if proxy is not None: print (proxy) chrome_options = webdriver.ChromeOptions() chrome_options.add_argument("--headless") chrome_options.add_argument('--proxy-server=%s' % proxy) chrome_options.add_argument("--no-sandbox") chrome_options.add_argument("--disable-dev-shm-usage") chrome_path = './chromedriver' driver = webdriver.Chrome(chrome_path,options=chrome_options) url_final = "https://yandex.ru/images/search?rpt=imageview&url="+image_url+"&rpt=imagelike" driver.get(url_final) driver.implicitly_wait(50) time.sleep(3) captcha=isCaptcha(driver) if captcha == True: driver.close() else: now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") images=searchImages(driver,now,verbose) if not images: print('No images.') driver.close() path=os.path.join('data/yandex',str(now)+'_yandex_data.json') with open(path, 'w+') as outfile: json.dump(images, outfile) if not token == None: if deletedImage(image_delete,token): print ("Image deleted") else: print("Problem when deleted image from imgur") print("Results Yandex in: " + str(path)) results['results']=str(path) else: print('Yandex is blocked') return results ================================================ FILE: src/requirements.txt ================================================ selenium == 3.141.0 bs4 == 0.0.1 requests == 2.25.0 peewee == 3.8.0 robobrowser == 0.5.3 config == 0.4.0 face-recognition==1.2.3 opencv-python==4.4.0.46 spacy==2.1.6 urllib3==1.26.2 fake-useragent==0.1.11 parsel==1.6.0 ================================================ FILE: web/Docker-compose.yaml ================================================ version: "3.8" services: front: build: ./front networks: - osint back: build: "./back" volumes: - ./data:/SpyScrap/src/data networks: - osint reverse-proxy: container_name: reverse-proxy build: reverse-proxy ports: - "80:80" networks: - osint depends_on: - back - front networks: osint: ================================================ FILE: web/README.md ================================================ # WEB Wraper For SpyScrap Requirements docker-compose & docker ### Installation docker-compose up ================================================ FILE: web/back/.gitignore ================================================ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ pip-wheel-metadata/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. 
================================================
FILE: web/back/.gitignore
================================================
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

tinder.json
database.db
chromedriver_linux64/
/openface/
/openface
/openface/*
chromedriver
/osint_sources/*
/osint_sources
main.py
SpyScrap.png
setup.sh
/back/README.md
/build
/build/*



================================================
FILE: web/back/Dockerfile
================================================
FROM ubuntu:18.04

RUN apt-get update \
    && apt-get install -y python3-pip python3-dev \
    && cd /usr/local/bin \
    && ln -s /usr/bin/python3 python \
    && pip3 install --upgrade pip

RUN apt-get install -y --fix-missing \
    build-essential \
    cmake \
    gfortran \
    git \
    wget \
    curl \
    pkg-config \
    python3-dev \
    python3-numpy \
    software-properties-common \
    zip \
    unzip \
    && apt-get clean && rm -rf /tmp/* /var/tmp/*

RUN cd ~ && \
    mkdir -p dlib && \
    git clone -b 'v19.9' --single-branch https://github.com/davisking/dlib.git dlib/ && \
    cd dlib/ && \
    python3 setup.py install --yes USE_AVX_INSTRUCTIONS

# Install Google Chrome
RUN wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
RUN dpkg -i google-chrome-stable_current_amd64.deb; apt-get -fy install

# Install Chrome WebDriver
RUN CHROMEDRIVER_VERSION=`curl -sS chromedriver.storage.googleapis.com/LATEST_RELEASE` && \
    mkdir -p /opt/chromedriver-$CHROMEDRIVER_VERSION && \
    curl -sS -o /tmp/chromedriver_linux64.zip http://chromedriver.storage.googleapis.com/$CHROMEDRIVER_VERSION/chromedriver_linux64.zip && \
    unzip -qq /tmp/chromedriver_linux64.zip -d /opt/chromedriver-$CHROMEDRIVER_VERSION && \
    rm /tmp/chromedriver_linux64.zip && \
    chmod +x /opt/chromedriver-$CHROMEDRIVER_VERSION/chromedriver && \
    ln -fs /opt/chromedriver-$CHROMEDRIVER_VERSION/chromedriver /usr/local/bin/chromedriver

RUN git clone https://github.com/RuthGnz/SpyScrap.git
WORKDIR SpyScrap/src
RUN pip3 install -r ./requirements.txt
RUN rm requirements.txt
RUN python -m spacy download es_core_news_sm
RUN cp /usr/local/bin/chromedriver .
COPY ./osint-back/ .
RUN pip3 install -r ./back_requirements.txt
ENV PYTHONIOENCODING=utf-8
ENTRYPOINT ["python3","server.py"]



================================================
FILE: web/back/osint-back/api.py
================================================
from flask import Flask, jsonify, request, send_from_directory
import logging
from controller import *
from flask_cors import CORS

app = Flask(__name__, static_url_path="/data", static_folder="./data")
app.config['UPLOAD_FOLDER'] = './uploads'
app.config['ALLOWED_EXTENSIONS'] = ['png', 'jpeg', 'jpg']
CORS(app)

URL_BASE = '/osint/api/v1'


@app.route("/osint/api/v1")
def ping():
    return "It works OSINT!"
@app.route(URL_BASE+"/tinder",methods=['POST']) def tinder(): name = request.form.get('name') company = request.form.get('company') files = request.files users = [] if len(files)>0 and not name and not company: users=compareImages(files,None,app) if company and not name and len(files)==0: users = getUsersByCompany(company) elif name and not company and len(files)==0: users = getUsersByName(name) elif company and name and len(files)==0: users = getUsersByCompanyAndName(company,name) elif company and name and len(files)>0: users=getUsersNameCompanyPhoto(company,name,files,app) elif company and not name and len(files)>0: users=getUsersByPhotoAndCompany(company,files,app) elif not company and name and len(files)>0: users=getUsersByNameAndPhoto(name,files,app) return jsonify({'msg':users}) @app.route(URL_BASE+"/google",methods=['POST']) def google(): data=[] name = request.form.get('name') download = request.form.get('download') number = request.form.get('number') if name == None: return jsonify("Name must me provided") place = request.form.get('place') files = request.files files = request.files if len(files)==0 and download=='true': files=None data=google_controller(name,place,number,files,app) return jsonify({'msg':data['data']}) @app.route(URL_BASE+"/instagram",methods=['POST']) def instagram(): data=[] name = request.form.get('name') download = request.form.get('download') if name == None: return jsonify("Name must me provided") files = request.files if len(files)==0 and download=='true': files=None data=instagram_controller(name,files,app) return jsonify({'msg':data['data']}) ##todo @app.route(URL_BASE+"/twitter",methods=['POST']) def twitter(): data=[] name = request.form.get('name') number = request.form.get('number') download = request.form.get('download') if name == None: return jsonify("Name must me provided") if number==None: number=1 files = request.files if len(files)==0 and download=='true': files=None data=twitter_controller(name,files,number,app) return jsonify({'msg':data['data']}) @app.route(URL_BASE+"/facebook",methods=['POST']) def facebook(): data=[] name = request.form.get('name') number = request.form.get('number') download = request.form.get('download') if name == None: return jsonify("Name must me provided") if number==None: number=1 files = request.files if len(files)==0 and download=='true': files=None data=facebook_controller(name,files,number,app) return jsonify({'msg':data['data']}) @app.route(URL_BASE+"/boe",methods=['POST']) def boe(): data=[] text = request.form.get('text') is_explicit = request.form.get('explicit') pages = request.form.get('pages') if text == None: return jsonify("Text must me provided") initDate=None outDate=None data=boe_controller(text,is_explicit,initDate,outDate,pages) return jsonify({'msg':data['data']}) @app.route(URL_BASE+"/yandex",methods=['POST']) def yandex(): data=[] url = request.form.get('url') token = request.form.get('token') files = request.files data=yandex_controller(url,files,token,app) return jsonify({'msg':data['data']}) @app.route(URL_BASE+"/data///") def download_file(folder,dateFolder,image): return send_from_directory('./data/'+folder+'/'+dateFolder,image) @app.route(URL_BASE+"/scoring",methods=['POST']) def scoring(): #TODO name = request.form.get('name') imgurl = request.form.get('imgurl') number = request.form.get('number') gnumber = request.form.get('gnumber') files = request.files if len(files)==0: return jsonify({'msg':'Image must be sent'}),400 data=scoring_controller(name,imgurl,number,gnumber,files,app) 
================================================
FILE: web/back/osint-back/back_model.py
================================================
from peewee import *
import datetime
from playhouse.shortcuts import model_to_dict, dict_to_model
import json
import os  # used below for os.path
import sys
from flask import jsonify

sys.path.insert(1, './CLI/')
from osint_sources.scraper import *

scriptDirectory = os.path.dirname(os.path.realpath(__file__))
print(scriptDirectory)


class User_Back(User):
    def getIds():
        result = User.select()
        response = []
        for u in result.iterator():
            response.append(u.uid)
        return response

    def getById(arrayIds):
        userIds = User.select().where(User.id << arrayIds)
        result = []
        for u in userIds.iterator():
            try:
                job = Jobs.get(Jobs.user_id == u.id).job
                json_acceptable_string = job.replace("'", "\"")
                job = json.loads(json_acceptable_string)
                user = {'name': u.name, 'location': u.location, 'birth': u.birth, 'job': job}
            except:
                print("Unexpected error:", sys.exc_info()[0])
                user = {'name': u.name, 'location': u.location, 'birth': u.birth, 'job': ''}
            photos = []
            ph = Photos.select().where(Photos.user == u)
            for i in ph.iterator():
                photos.append(i.photo)
            ig = InstagramPhotos.select().where(InstagramPhotos.user == u)
            for i in ig.iterator():
                photos.append(i.photo)
            userData = {'user': user, 'photos': photos}
            result.append(userData)
        return result

    def getByName(name):
        userIds = User.select().where(User.name.contains(name))
        result = []
        for u in userIds.iterator():
            try:
                job = Jobs.get(Jobs.user_id == u.id).job
                json_acceptable_string = job.replace("'", "\"")
                job = json.loads(json_acceptable_string)
                user = {'name': u.name, 'location': u.location, 'birth': u.birth, 'job': job}
            except:
                # print("Unexpected error:", sys.exc_info()[0])
                user = {'name': u.name, 'location': u.location, 'birth': u.birth, 'job': ''}
            photos = []
            ph = Photos.select().where(Photos.user == u)
            for i in ph.iterator():
                photos.append(i.photo)
            ig = InstagramPhotos.select().where(InstagramPhotos.user == u)
            for i in ig.iterator():
                photos.append(i.photo)
            userData = {'user': user, 'photos': photos}
            result.append(userData)
        return result

    def getUserIdsByName(name):
        userIds = User.select().where(User.name.contains(name))
        result = []
        for u in userIds:
            result.append(u)
        return result

    def getByJob(company):
        userIds = Jobs.select().where(Jobs.job.contains(company))
        result = []
        for u in userIds.iterator():
            user_obj = User.get(User.id == u.user)
            json_acceptable_string = u.job.replace("'", "\"")
            job = json.loads(json_acceptable_string)
            user = {'name': user_obj.name, 'location': user_obj.location, 'birth': user_obj.birth, 'job': job}
            photos = []
            ph = Photos.select().where(Photos.user == u.user)
            for i in ph.iterator():
                photos.append(i.photo)
            ig = InstagramPhotos.select().where(InstagramPhotos.user == u)
            for i in ig.iterator():
                photos.append(i.photo)
            userData = {'user': user, 'photos': photos}
            result.append(userData)
        return result

    def getUserIdsByCompany(company):
        userIds = Jobs.select().where(Jobs.job.contains(company))
        result = []
        for u in userIds:
            result.append(u.user)
        return result
    def getByCompanyAndName(company, name):
        userIds = User.select().join(Jobs, on=(Jobs.user == User.id)).where(Jobs.job.contains(company) & User.name.contains(name))
        result = []
        for u in userIds.iterator():
            job_obj = Jobs.get(Jobs.user == u)
            json_acceptable_string = job_obj.job.replace("'", "\"")
            try:
                job = json.loads(json_acceptable_string)
            except:
                job = ''
            user = {'name': u.name, 'location': u.location, 'birth': u.birth, 'job': job}
            photos = []
            ph = Photos.select().where(Photos.user == u)
            for i in ph.iterator():
                photos.append(i.photo)
            ig = InstagramPhotos.select().where(InstagramPhotos.user == u)
            for i in ig.iterator():
                photos.append(i.photo)
            userData = {'user': user, 'photos': photos}
            result.append(userData)
        return result

    def getUsersIdsByCompanyAndName(company, name):
        userIds = User.select().join(Jobs, on=(Jobs.user == User.id)).where(Jobs.job.contains(company) & User.name.contains(name))
        result = []
        for u in userIds:
            result.append(u)
        return result


class Photos_back(Photos):
    def getPhotos():
        result = []
        photos = Photos.select().join(User, on=(User.id == Photos.user))
        for p in photos:
            user = {'id': p.user.id, 'name': p.user.name, 'location': p.user.location, 'birth': p.user.birth}
            userData = {'user': user, 'photo': p.photo}
            result.append(userData)
        return result

    def getPhotosByUsers(users):
        result = []
        photos = Photos.select().join(User, on=(User.id == Photos.user)).where(User.id.in_(users))
        for p in photos:
            user = {'id': p.user.id, 'name': p.user.name, 'location': p.user.location, 'birth': p.user.birth}
            userData = {'user': user, 'photo': p.photo}
            result.append(userData)
        return result



================================================
FILE: web/back/osint-back/back_requirements.txt
================================================
Flask==1.0.2
requests==2.25.0
gunicorn==19.8.1
pyjwt==1.7.1
Flask-Cors==3.0.8
peewee==3.8.0
gevent==1.4.0
urllib3==1.24.3
numpy==1.15.0



================================================
FILE: web/back/osint-back/controller.py
================================================
from back_model import *
from werkzeug.utils import secure_filename
import os
from os import listdir, remove
from PIL import Image
import requests
from io import BytesIO
from selenium.webdriver.common.keys import Keys
import time
from selenium import webdriver
import json
# from api import app
import sys
from osint_sources.scraper import *
from osint_sources.recognition import *
import datetime
import urllib.request
import shutil
import re
import logging
def allowed_file(filename, app):
    return '.' in filename and \
        filename.rsplit('.', 1)[1].lower() in app.config['ALLOWED_EXTENSIONS']


def getValidImagePath(knownFiles, app):
    # Save the first uploaded image that actually contains a face
    location0 = None
    for f in knownFiles:
        file = knownFiles[f]
        if file and allowed_file(file.filename, app):
            filename = secure_filename(file.filename)
            location0 = os.path.join(app.config['UPLOAD_FOLDER'], filename)
            file.save(location0)
            known_image = face_recognition.load_image_file(location0)
            face_locations = face_recognition.face_locations(known_image)
            if len(face_locations) > 0:
                break
    return location0


def compareImages(knownFiles, users, app):
    now = datetime.datetime.now()
    if users == None:
        photos = Photos_back.getPhotos()
    else:
        photos = Photos_back.getPhotosByUsers(users)
    for f in knownFiles:
        file = knownFiles[f]
        if file and allowed_file(file.filename, app):
            filename = secure_filename(file.filename)
            location0 = os.path.join(app.config['UPLOAD_FOLDER'], filename)
            file.save(location0)
            known_image = face_recognition.load_image_file(location0)
            face_locations = face_recognition.face_locations(known_image)
            if len(face_locations) > 0:
                break
    folder = app.config['UPLOAD_FOLDER'] + '/' + str(now) + '/'
    # save photos in folder
    os.mkdir(folder)
    for i, p in enumerate(photos):
        try:
            location = os.path.join(folder, str(p['user']['id']) + '-' + str(i) + '.jpg')
            urllib.request.urlretrieve(p['photo'], location)
        except:
            pass
    face_identification(location0, folder)
    # get userIds from folder
    # remove photos
    userIds = []
    # r=root, d=directories, f=files
    for r, d, f in os.walk(folder + 'recognized'):
        for file in f:
            if '.jpg' in file:
                file = file.split('-')[0]
                userIds.append(int(file))
    users = User_Back.getById(userIds)
    remove(location0)
    shutil.rmtree(folder)
    return users


def getUsersByPhotoAndCompany(company, knownFiles, app):
    users = User_Back.getUserIdsByCompany(company)
    result = compareImages(knownFiles, users, app)
    return result


def getUsersByNameAndPhoto(name, knownFiles, app):
    result = []
    users = User_Back.getUserIdsByName(name)
    result = compareImages(knownFiles, users, app)
    return result


def getUsersNameCompanyPhoto(company, name, knownFiles, app):
    users = User_Back.getUsersIdsByCompanyAndName(company, name)
    result = compareImages(knownFiles, users, app)
    return result


def getUsersByCompany(company):
    ids = User_Back.getByJob(company)
    return ids


def getUsersByCompanyAndName(company, name):
    users = User_Back.getByCompanyAndName(company, name)
    return users


def getUsersByName(name):
    users = User_Back.getByName(name)
    return users


def getUsersByCompanyAndNameAndImage(company, name, knownFiles):
    pass


def google_controller(toSearch, placeToSearch, number, knownFiles, app):
    if number == "" or number == "undefined":
        number = None
    if placeToSearch == "" or placeToSearch == "undefined":
        placeToSearch = None
    if type(knownFiles) == str:
        knownImage = knownFiles
    else:
        if knownFiles == None:
            knownImage = 'file'
        elif len(knownFiles) > 0:
            knownImage = getValidImagePath(knownFiles, app)
        else:
            knownImage = None
    paths = google(toSearch, placeToSearch, knownImage, number, False)
    jsonPath = paths['results']
    response = {}
    with open(jsonPath) as json_file:
        data = json.load(json_file)
    if "recognized" in paths and knownImage != 'file':
        files = []
        for r, d, f in os.walk(paths['recognized']):
            for file in f:
                files.append(file)
        newData = []
        for a in data:
            try:
                img = a['storedImage'].split('/')
                img = img[len(img) - 1]
                if img in files:
                    path = a['storedImage'].split(img)[0]
                    path = path + "recognized/" + img
                    a['storedImage'] = path
                    newData.append(a)
            except:
                pass
        data = newData
    response['data'] = data
    return response
def instagram_controller(toSearch, knownFiles, app):
    if type(knownFiles) == str:
        knownImage = knownFiles
    else:
        if knownFiles == None:
            knownImage = 'file'
        elif len(knownFiles) > 0:
            knownImage = getValidImagePath(knownFiles, app)
        else:
            knownImage = None
    paths = instagram(toSearch, knownImage, False)
    jsonPath = paths['results']
    response = {}
    with open(jsonPath) as json_file:
        data = json.load(json_file)
    if "recognized" in paths and knownImage != 'file':
        files = []
        for r, d, f in os.walk(paths['recognized']):
            for file in f:
                files.append(file)
        newData = []
        for a in data:
            try:
                img = a['image'].split('/')
                img = img[len(img) - 1]
                if img in files:
                    path = a['image'].split(img)[0]
                    path = path + "recognized/" + img
                    a['image'] = path
                    newData.append(a)
            except:
                pass
        data = newData
    response['data'] = data
    return response


def twitter_controller(name_to_search, knownFiles, page_number, app):
    if type(knownFiles) == str:
        knownImage = knownFiles
    else:
        if knownFiles == None:
            knownImage = 'file'
        elif len(knownFiles) > 0:
            knownImage = getValidImagePath(knownFiles, app)
        else:
            knownImage = None
    paths = twitter(name_to_search, page_number, knownImage, False)
    jsonPath = paths['results']
    response = {}
    with open(jsonPath) as json_file:
        data = json.load(json_file)
    if "recognized" in paths and knownImage != 'file':
        files = []
        for r, d, f in os.walk(paths['recognized']):
            print(f)
            for file in f:
                files.append(file)
        newData = []
        for a in data:
            try:
                img = a['storedImage'].split('/')
                img = img[len(img) - 1]
                if img in files:
                    path = a['storedImage'].split(img)[0]
                    path = path + "recognized/" + img
                    a['storedImage'] = path
                    newData.append(a)
            except:
                pass
        data = newData
    response['data'] = data
    return response


def facebook_controller(name_to_search, knownFiles, page_number, app):
    if type(knownFiles) == str:
        knownImage = knownFiles
    else:
        if knownFiles == None:
            knownImage = 'file'
        elif len(knownFiles) > 0:
            knownImage = getValidImagePath(knownFiles, app)
        else:
            knownImage = None
    paths = facebook(name_to_search, knownImage, page_number, False)
    jsonPath = paths['results']
    response = {}
    with open(jsonPath) as json_file:
        data = json.load(json_file)
    if "recognized" in paths and knownImage != 'file':
        files = []
        for r, d, f in os.walk(paths['recognized']):
            for file in f:
                files.append(file)
        newData = []
        for a in data:
            try:
                img = a['image'].split('/')
                img = img[len(img) - 1]
                if img in files:
                    path = a['image'].split(img)[0]
                    path = path + "recognized/" + img
                    a['image'] = path
                    newData.append(a)
            except:
                pass
        data = newData
    response['data'] = data
    return response


def yandex_controller(url, knownFiles, token, app):
    logging.warning('URL ' + url)
    if url != None:
        image = url
    elif type(knownFiles) == str:
        knownImage = knownFiles
    else:
        if knownFiles == None:
            knownImage = 'file'
        elif len(knownFiles) > 0:
            knownImage = getValidImagePath(knownFiles, app)
        else:
            knownImage = None
        image = ""
        if token == None and knownImage == None:
            return "error"
        else:
            image = knownImage
    print(image)
    paths = yandex(image, token, False)
    response = {}
    jsonPath = paths['results']
    with open(jsonPath) as json_file:
        data = json.load(json_file)
    response['data'] = data
    return response


def boe_controller(text, is_explicit, initDate, outDate, pages):
    paths = boe(text, initDate, outDate, pages, is_explicit, False)
    response = {}
    jsonPath = paths['results']
    with open(jsonPath) as json_file:
        data = json.load(json_file)
    response['data'] = data
    return response
google_controller(name,"undefined",gnumber,imagePath,app) response['google']=gl['data'] except: print('google error') response['google']=[] try: tw = twitter_controller(name,imagePath,number,app) response['twitter']=tw['data'] except: print('twitter error') response['twitter']=[] try: yn = yandex_controller(url,None,None,app) response['yandex']=yn['data'] except: print('yandex error') response['yandex']=[] score=compute_score(response) response['score']=score return response def compute_score(data): #TODO #TOTAL 100 #Twitter 10 #Instagram 10 #Facebook 10 #Yandex 10 -> #Google 60 score=0 if len(data['twitter'])>0: score=score+10 if len(data['facebook'])>0: score=score+10 if len(data['instagram'])>0: score=score+10 if data['yandex'] != False: if len(data['yandex'])>0: yn=data['yandex'] score=score+1 bonus=0 if len(yn)>10: bonus=bonus+1 if len(yn)>20: bonus=bonus+2 if len(yn)>30: bonus=bonus+3; if len(yn)>40: bonus=bonus+3; score=score+bonus if len(data['google'])>0: gn=data['google'] score=score+10 bonus=0 if len(gn)>2: bonus=bonus+5 if len(gn)>5: bonus=bonus+5 if len(gn)>10: bonus=bonus+5; if len(gn)>15: bonus=bonus+5; if len(gn)>25: bonus=bonus+5; if len(gn)>35: bonus=bonus+5; if len(gn)>45: bonus=bonus+5; if len(gn)>55: bonus=bonus+5; if len(gn)>65: bonus=bonus+5; if len(gn)>75: bonus=bonus+5; score=score+bonus return score ================================================ FILE: web/back/osint-back/server.py ================================================ from gevent.pywsgi import WSGIServer from api import app # if yourapplication imports from views, # the sort would happen once; here. app.debug = True http_server = WSGIServer(('', 5000), app) http_server.serve_forever() ================================================ FILE: web/back/osint-back/uploads/.gitignore ================================================ # Ignore everything in this directory * # Except this file !.gitignore ================================================ FILE: web/data/.gitignore ================================================ # Ignore everything in this directory * # Except this file !.gitignore ================================================ FILE: web/front/.gitignore ================================================ .DS_Store Thumbs.db db.json *.log node_modules/ .deploy*/ src/_drafts package-lock.json requirements.txt ================================================ FILE: web/front/Dockerfile ================================================ FROM node:15.2.1-alpine3.10 # instalar un simple servidor http para servir nuestro contenido estático RUN yarn global add http-server # hacer la carpeta 'app' el directorio de trabajo actual WORKDIR /app # copiar 'package.json' y 'package-lock.json' (si están disponibles) COPY ./osint-front/package*.json ./ # instalar dependencias del proyecto RUN yarn install # copiar los archivos y carpetas del proyecto al directorio de trabajo actual (es decir, la carpeta 'app') COPY ./osint-front/ . # construir aplicación para producción minificada RUN yarn run build EXPOSE 8080 CMD [ "http-server", "dist" ] ================================================ FILE: web/front/osint-front/.gitignore ================================================ .DS_Store node_modules /dist # local env files .env.local .env.*.local # Log files npm-debug.log* yarn-debug.log* yarn-error.log* # Editor directories and files .idea .vscode *.suo *.ntvs* *.njsproj *.sln *.sw? 
================================================
FILE: web/back/osint-back/server.py
================================================
from gevent.pywsgi import WSGIServer
from api import app

# if your application imports from views, the import would happen once, here
app.debug = True
http_server = WSGIServer(('', 5000), app)
http_server.serve_forever()



================================================
FILE: web/back/osint-back/uploads/.gitignore
================================================
# Ignore everything in this directory
*
# Except this file
!.gitignore



================================================
FILE: web/data/.gitignore
================================================
# Ignore everything in this directory
*
# Except this file
!.gitignore



================================================
FILE: web/front/.gitignore
================================================
.DS_Store
Thumbs.db
db.json
*.log
node_modules/
.deploy*/
src/_drafts
package-lock.json
requirements.txt



================================================
FILE: web/front/Dockerfile
================================================
FROM node:15.2.1-alpine3.10

# install a simple http server to serve our static content
RUN yarn global add http-server

# make the 'app' folder the current working directory
WORKDIR /app

# copy 'package.json' and 'package-lock.json' (if available)
COPY ./osint-front/package*.json ./

# install project dependencies
RUN yarn install

# copy the project's files and folders to the current working directory (i.e. the 'app' folder)
COPY ./osint-front/ .

# build the app for production with minification
RUN yarn run build

EXPOSE 8080
CMD [ "http-server", "dist" ]



================================================
FILE: web/front/osint-front/.gitignore
================================================
.DS_Store
node_modules
/dist

# local env files
.env.local
.env.*.local

# Log files
npm-debug.log*
yarn-debug.log*
yarn-error.log*

# Editor directories and files
.idea
.vscode
*.suo
*.ntvs*
*.njsproj
*.sln
*.sw?



================================================
FILE: web/front/osint-front/README.md
================================================
# front

## Project setup
```
yarn install
```

### Compiles and hot-reloads for development
```
yarn serve
```

### Compiles and minifies for production
```
yarn build
```

### Lints and fixes files
```
yarn lint
```

### Customize configuration
See [Configuration Reference](https://cli.vuejs.org/config/).



================================================
FILE: web/front/osint-front/babel.config.js
================================================
module.exports = {
  presets: [
    '@vue/cli-plugin-babel/preset'
  ]
}



================================================
FILE: web/front/osint-front/package.json
================================================
{
  "name": "front",
  "version": "0.1.0",
  "private": true,
  "scripts": {
    "serve": "vue-cli-service serve",
    "build": "vue-cli-service build",
    "lint": "vue-cli-service lint"
  },
  "dependencies": {
    "core-js": "^3.6.5",
    "vue": "^2.6.11",
    "vue-resource": "^1.5.1",
    "vue-router": "^3.4.9",
    "vue-toast-notification": "^0.6.0",
    "vuetify": "^2.2.11",
    "vuetify-numeric": "^0.1.7"
  },
  "devDependencies": {
    "@vue/cli-plugin-babel": "~4.5.0",
    "@vue/cli-plugin-eslint": "~4.5.0",
    "@vue/cli-service": "~4.5.0",
    "babel-eslint": "^10.1.0",
    "eslint": "^6.7.2",
    "eslint-plugin-vue": "^6.2.2",
    "sass": "^1.19.0",
    "sass-loader": "^8.0.0",
    "vue-cli-plugin-vuetify": "~2.0.7",
    "vue-template-compiler": "^2.6.11",
    "vuetify-loader": "^1.3.0"
  },
  "eslintConfig": {
    "root": true,
    "env": {
      "node": true
    },
    "extends": [
      "plugin:vue/essential",
      "eslint:recommended"
    ],
    "parserOptions": {
      "parser": "babel-eslint"
    },
    "rules": {}
  },
  "browserslist": [
    "> 1%",
    "last 2 versions",
    "not dead"
  ]
}



================================================
FILE: web/front/osint-front/public/index.html
================================================
SpyScrap
================================================
FILE: web/front/osint-front/src/App.vue
================================================



================================================
FILE: web/front/osint-front/src/main.js
================================================
import Vue from 'vue'
import App from './App.vue'
import router from "./router";
import vuetify from './plugins/vuetify';
import VueResource from "vue-resource";

Vue.config.productionTip = false

// register vue-resource before the root instance is created
Vue.use(VueResource);

new Vue({
  router,
  vuetify,
  render: h => h(App)
}).$mount('#app')



================================================
FILE: web/front/osint-front/src/plugins/vuetify.js
================================================
import Vue from 'vue';
import Vuetify from 'vuetify/lib';

Vue.use(Vuetify);

export default new Vuetify({
});



================================================
FILE: web/front/osint-front/src/router.js
================================================
import Vue from "vue";
import Router from "vue-router";
import Home from "./views/Home.vue";

Vue.use(Router)

export default new Router({
  mode: "hash",
  base: process.env.BASE_URL,
  routes: [
    {
      path: "/",
      name: "home",
      component: Home
    },
    {
      path: "/tinder",
      name: "tinder",
      component: () => import(/* webpackChunkName: "about" */ "./views/Tinder.vue")
    },
    {
      path: "/google",
      name: "google",
      component: () => import(/* webpackChunkName: "about" */ "./views/Google.vue")
    },
    {
      path: "/yandex",
      name: "yandex",
      component: () => import(/* webpackChunkName: "about" */ "./views/Yandex.vue")
    },
    {
      path: "/boe",
      name: "boe",
      component: () => import(/* webpackChunkName: "about" */ "./views/Boe.vue")
    },
    {
      path: "/instagram",
      name: "instagram",
      component: () => import(/* webpackChunkName: "about" */ "./views/Instagram.vue")
    },
    {
      path: "/facebook",
      name: "facebook",
      component: () => import(/* webpackChunkName: "about" */ "./views/Facebook.vue")
    },
    {
      path: "/twitter",
      name: "twitter",
      component: () => import(/* webpackChunkName: "about" */ "./views/Twitter.vue")
    },
    {
      path: "/score",
      name: "score",
      component: () => import(/* webpackChunkName: "about" */ "./views/Score.vue")
    },
    {
      path: "/about",
      name: "about",
      // route level code-splitting
      // this generates a separate chunk (about.[hash].js) for this route
      // which is lazy-loaded when the route is visited.
      component: () => import(/* webpackChunkName: "about" */ "./views/About.vue")
    }
  ]
});
================================================
FILE: web/front/osint-front/src/views/About.vue
================================================



================================================
FILE: web/front/osint-front/src/views/Boe.vue
================================================



================================================
FILE: web/front/osint-front/src/views/Facebook.vue
================================================



================================================
FILE: web/front/osint-front/src/views/Google.vue
================================================



================================================
FILE: web/front/osint-front/src/views/Home.vue
================================================



================================================
FILE: web/front/osint-front/src/views/Instagram.vue
================================================



================================================
FILE: web/front/osint-front/src/views/Score.vue
================================================



================================================
FILE: web/front/osint-front/src/views/Tinder.vue
================================================



================================================
FILE: web/front/osint-front/src/views/Twitter.vue
================================================



================================================
FILE: web/front/osint-front/src/views/Yandex.vue
================================================



================================================
FILE: web/front/osint-front/vue.config.js
================================================
module.exports = {
  "transpileDependencies": [
    "vuetify"
  ]
}



================================================
FILE: web/reverse-proxy/Dockerfile
================================================
FROM nginx:1.19-alpine
COPY nginx.conf /etc/nginx/nginx.conf
EXPOSE 80
================================================
FILE: web/reverse-proxy/nginx.conf
================================================
worker_processes 4;

events {
    worker_connections 1024;
}

http {
    # Basic Settings
    sendfile on;
    tcp_nopush on;
    tcp_nodelay on;
    keepalive_timeout 65;
    types_hash_max_size 2048;

    proxy_connect_timeout 300000;
    proxy_send_timeout 300000;
    proxy_read_timeout 300000;
    send_timeout 300000;

    upstream backend {
        server back:5000;
    }

    upstream frontend {
        server front:8080;
    }

    server {
        listen 80;
        server_name localhost;

        if ($http_x_forwarded_proto = 'http') {
            return 301 https://$server_name$request_uri;
        }

        access_log /var/log/nginx/client.access.log;
        error_log /var/log/nginx/client.error.log;

        gzip on;
        gzip_http_version 1.1;
        gzip_min_length 1100;
        gzip_vary on;
        gzip_proxied expired no-cache no-store private auth;
        gzip_types text/plain text/css application/json application/x-javascript text/xml application/xml application/xml+rss text/javascript application/javascript text/x-js;
        gzip_comp_level 9;

        location / {
            proxy_pass http://frontend;
            proxy_next_upstream error timeout invalid_header http_500 http_502 http_503 http_504;
            proxy_redirect off;
            proxy_buffering off;
            proxy_set_header Host $host;
            proxy_set_header X-Real-IP $remote_addr;
            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        }

        location /osint {
            proxy_pass http://backend;
            proxy_next_upstream error timeout invalid_header http_500 http_502 http_503 http_504;
            proxy_redirect off;
            proxy_buffering off;
            proxy_set_header Host $host;
            proxy_set_header X-Real-IP $remote_addr;
            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
            add_header 'Access-Control-Allow-Credentials' 'true';
            add_header 'Access-Control-Allow-Origin' 'api';
        }

        location /data {
            proxy_pass http://backend;
            proxy_next_upstream error timeout invalid_header http_500 http_502 http_503 http_504;
            proxy_redirect off;
            proxy_buffering off;
            proxy_set_header Host $host;
            proxy_set_header X-Real-IP $remote_addr;
            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
            add_header 'Access-Control-Allow-Credentials' 'true';
            add_header 'Access-Control-Allow-Origin' 'api';
        }
    }
}
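Finally, a smoke-test sketch for the assembled stack. Assuming the compose stack is up and nginx is published on localhost:80, the backend's ping route should answer through the `/osint` location above:

```python
import requests

r = requests.get("http://localhost/osint/api/v1")
print(r.status_code, r.text)  # expected: 200 It works OSINT!
```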