.
================================================
FILE: README.md
================================================
MetaFinder
Search for documents in a domain through Search Engines. The objective is to extract metadata.
## Installation:
```
> pip3 install metafinder
```
Upgrades are also available using:
```
> pip3 install metafinder --upgrade
```
## Usage
MetaFinder can be used in 2 ways:
### CLI
```
metafinder -d domain.com -l 20 -o folder [-t 10] -go -bi -ba
```
Parameters:
* d: Specifies the target domain.
* l: Specify the maximum number of results to be searched in the search engines.
* o: Specify the path to save the report.
* t: Optional. Used to configure the threads (4 by default).
* v: Show Metafinder version.
* Search Engines to select (Google by default):
* go: Optional. Search in Google.
* bi: Optional. Search in Bing.
* ba: Optional. Search in Baidu. (Experimental)
### In Code
```
import metafinder.extractor as metadata_extractor
documents_limit = 5
domain = "target_domain"
result = metadata_extractor.extract_metadata_from_google_search(domain, documents_limit)
# result = metadata_extractor.extract_metadata_from_bing_search(domain, documents_limit)
# result = metadata_extractor.extract_metadata_from_baidu_search(domain, documents_limit)
authors = result.get_authors()
software = result.get_software()
for k,v in result.get_metadata().items():
print(f"{k}:")
print(f"|_ URL: {v['url']}")
for metadata,value in v['metadata'].items():
print(f"|__ {metadata}: {value}")
document_name = "test.pdf"
try:
metadata_file = metadata_extractor.extract_metadata_from_document(document_name)
for k,v in metadata_file.items():
print(f"{k}: {v}")
except FileNotFoundError:
print("File not found")
```
## Example

# Author
This project has been developed by:
* **Josué Encinar García** -- [@JosueEncinar](https://twitter.com/JosueEncinar)
# Contributors
* **Félix Brezo Fernández** -- [@febrezo](https://twitter.com/febrezo)
# Disclaimer!
The software is designed to leave no trace in the documents we upload to a domain. The author is not responsible for any illegitimate use.
================================================
FILE: metafinder/__init__.py
================================================
__version__ = "1.2"
================================================
FILE: metafinder/cli.py
================================================
import argparse
from os import sep, listdir, remove
import os.path
import sys
from pathlib import Path
from metafinder.utils.banner import show_banner
from metafinder.core import processing
from metafinder import __version__
def main(argv=None):
    """The entry point for the script.

    Parses the CLI arguments, prepares the output directory and launches
    the metadata-gathering pipeline. On Ctrl+C, downloaded and temporary
    files are cleaned up best-effort.

    Args:
        argv (list): The list of parameters passed.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--domain', help="Domain to search", required=True)
    parser.add_argument('-o', '--output', help="Folder where the results will be stored", required=True, default="results")
    parser.add_argument('-l', '--limit', help="Limit of documents to search in the search engines (max 250)", type=int, required=True)
    parser.add_argument('-t', '--threads', help="Number of threads for downloading documents", type=int, default=4)
    parser.add_argument('-go', '--google', help="Search in Google", action='store_true', default=False)
    parser.add_argument('-bi', '--bing', help="Search in Bing", action='store_true', default=False)
    parser.add_argument('-ba', '--baidu', help="Search in Baidu", action='store_true', default=False)
    parser.add_argument('-v', '--version', help="Show Metafinder version", action='version', version=__version__)
    args = parser.parse_args()
    show_banner()
    directory = Path(args.output) / args.domain
    directory.mkdir(parents=True, exist_ok=True)
    search_engines = {
        "google": args.google,
        "bing": args.bing,
        "baidu": args.baidu
    }
    # Default to Google when no engine flag was given.
    if not any(search_engines.values()):
        search_engines["google"] = True
    limit = min(args.limit, 250)  # hard cap of 250 results
    try:
        processing(args.domain, limit, str(directory), args.threads, search_engines)
    except KeyboardInterrupt:
        print("[-] MetaFinder has been interrupted. Deleting files.")
        try:
            for f in listdir(directory):
                remove(os.path.join(directory, f))
        except OSError:
            pass
        try:
            # BUG FIX: .tmp files live in the current working directory, so
            # they must be removed from there; the original joined them onto
            # the output directory path and never deleted them.
            for f in listdir("."):
                if f.endswith(".tmp"):
                    remove(f)
        except OSError:
            pass
        print("Bye.")


if __name__ == '__main__':
    main(sys.argv[1:])
================================================
FILE: metafinder/core.py
================================================
from metafinder.utils.finder import google
from metafinder.utils.finder import bing
from metafinder.utils.finder import baidu
from metafinder.utils.file.download import download_file
from metafinder.utils.file.parser import file_parser_list, file_parser
from metafinder.utils.color_print import print_error, print_ok
from metafinder.utils.result import Result
from metafinder.utils.var_data import *
def _get_links(target, limit, directory, threads, search_engines):
    """Collect document links from every enabled search engine.

    Args:
        target: Domain to search.
        limit: Maximum number of links per engine.
        directory: Output folder (unused here, kept for pipeline symmetry).
        threads: Thread count (unused here, kept for pipeline symmetry).
        search_engines: Mapping engine name -> bool (enabled).

    Returns:
        list: Dicts with CONST_SEARCH_ENGINES / CONST_URL keys; a URL found
        by several engines appears once with all engines recorded.
    """
    engine_methods = {
        "google": google.search,
        "bing": bing.search,
        "baidu": baidu.search,
    }
    links = []
    for engine, method in engine_methods.items():
        if not search_engines.get(engine, False):
            continue
        print(f"Searching in {engine}")
        try:
            for candidate in method(target, limit):
                for entry in links:
                    if candidate == entry[CONST_URL]:
                        # Duplicate URL: just record this engine as a source.
                        entry[CONST_SEARCH_ENGINES].append(engine)
                        break
                else:
                    links.append({
                        CONST_SEARCH_ENGINES: [engine],
                        CONST_URL: candidate,
                    })
            print_ok("Done", end="\n")
        except KeyboardInterrupt:
            print(f"{engine} interrupted\n")
        except Exception as ex:
            print_error(f"{engine} error {ex}", end="\n")
    return links
def processing(target, limit, directory, threads, search_engines):
    """Run the whole pipeline: search, download, analyze and report metadata.

    Args:
        target: Domain to search documents for.
        limit: Maximum number of documents per engine.
        directory: Folder where reports are written.
        threads: Worker threads used to download documents.
        search_engines: Mapping engine name -> bool (enabled).
    """
    links = _get_links(target, limit, directory, threads, search_engines)
    total_links = len(links)
    links_msg = f"Total files to be analyzed: {total_links}"
    print(links_msg)
    print("-" * len(links_msg))
    if total_links == 0:
        print("There is nothing to analyze. Closing...")
        return
    try:
        metadata_result = Result(download_file(links, directory, threads))
        authors = metadata_result.get_authors()
        software = metadata_result.get_software()
        metadata_files = metadata_result.get_metadata()
        print("\nAnalyzing metadata...")
        if not metadata_files:
            print_error("No metadata found...")
            return
        authors_msg = f"Authors found: {len(authors)}"
        print(f"\n{authors_msg}")
        print("-" * len(authors_msg))
        for a in authors:
            print(a)
        software_msg = f"Software data found: {len(software)}"
        print(f"\n{software_msg}")
        print("-" * len(software_msg))
        for s in software:
            print(s)
        metadata_filename = "metadata_result.txt"
        authors_filename = "authors.txt"
        software_filename = "software.txt"
        file_parser_list(directory, authors_filename, authors)
        file_parser_list(directory, software_filename, software)
        print("")
        print_ok(f"Authors data have been saved in file {directory}/{authors_filename}")
        print_ok(f"Software data have been saved in file {directory}/{software_filename}")
        file_parser(directory, metadata_filename, metadata_files)
        print_ok(f"All metadata results have been saved in file {directory}/{metadata_filename}")
    except KeyboardInterrupt:
        print("CTRL^C")
    except Exception as ex:
        # BUG FIX: the original bare `except: pass` silently hid real
        # failures; report them while keeping the CLI from crashing.
        print_error(f"Unexpected error: {ex}")
================================================
FILE: metafinder/extractor.py
================================================
from metafinder.utils.library import extract_metadata_from_google_search, extract_metadata_from_bing_search, extract_metadata_from_baidu_search
from metafinder.utils.library import extract_metadata_from_document
from metafinder.utils.exception import GoogleCaptcha, BaiduDetection, GoogleCookiePolicies
================================================
FILE: metafinder/utils/__init__.py
================================================
================================================
FILE: metafinder/utils/agent.py
================================================
# Pool of User-Agent headers; callers pick one by integer index, e.g.
# user_agent.get(randint(0, len(user_agent) - 1)), to vary requests.
# NOTE(review): an int-keyed dict could be a plain list, but callers rely on
# the mapping interface (.get), so it is left as-is.
user_agent = {
    0: {'User-agent': 'Mozilla/5.0 (Linux; Android 10; SM-A205U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.86 Mobile Safari/537.36'},
    1: {'User-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'},
    2: {'User-agent': 'Opera/9.80 (Linux armv7l) Presto/2.12.407 Version/12.51 , D50u-D1-UHD/V1.5.16-UHD (Vizio, D50u-D1, Wireless)'},
    3: {'User-agent': 'BrightSign/7.1.95 (XT1143) Mozilla/5.0 (Unknown; Linux arm) AppleWebKit/537.36 (KHTML, like Gecko) QtWebEngine/5.6.0 Chrome/45.0.2454.101 Safari/537.36'},
    4: {'User-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.24 Safari/537.36'},
    5: {'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36'},
    6: {'User-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36'}
}
================================================
FILE: metafinder/utils/banner.py
================================================
from metafinder import __version__
banner = """
_____ __ ___________ .__ .___
/ \ ____ _/ |_ _____ \_ _____/ |__| ____ __| _/ ____ _______
/ \ / \ _/ __ \ \ __\ \__ \ | __) | | / \ / __ | _/ __ \ \_ __ \
/ Y \ \ ___/ | | / __ \_ | \ | | | | \ / /_/ | \ ___/ | | \/
\____|__ / \___ > |__| (____ / \___ / |__| |___| / \____ | \___ > |__|
\/ \/ \/ \/ \/ \/ \/
"""
author = "@JosueEncinar"
description = "Search for documents in a domain through Search Engines. The objective is to extract metadata"
usage_example = "metafinder -d domain.com -l 50 -o /tmp -go -bi"
def show_banner():
    """Print the ASCII banner followed by author, description, version and usage."""
    for line in (
        banner,
        f"|_ Author: {author}",
        f"|_ Description: {description}",
        f"|_ Version: {__version__}",
        f"|_ Usage: {usage_example}",
        "",
    ):
        print(line)
================================================
FILE: metafinder/utils/color_print.py
================================================
from prompt_toolkit import print_formatted_text, HTML
def print_error(msg, start="", end=""):
    """Print an error line with a [!] prefix via prompt_toolkit.

    BUG FIX: the message is now interpolated with HTML.format(), which
    escapes '<', '>' and '&'; the old f-string made prompt_toolkit parse the
    message as markup and crash on exception text or URLs containing those
    characters.
    """
    print_formatted_text(HTML("{}[!] {}{}").format(start, msg, end))
def print_ok(msg, start="", end=""):
    """Print a success line with a [+] prefix via prompt_toolkit.

    BUG FIX: the message is now interpolated with HTML.format(), which
    escapes '<', '>' and '&'; the old f-string made prompt_toolkit parse the
    message as markup and crash on URLs containing those characters.
    """
    print_formatted_text(HTML("{}[+] {}{}").format(start, msg, end))
================================================
FILE: metafinder/utils/exception.py
================================================
class GoogleCaptcha(Exception):
    """Raised when Google answers with its captcha / unusual-traffic page."""

    def __init__(self, *args):
        # The first positional argument, if any, overrides the default text.
        self.data = args[0] if args else "Google Captcha detected"

    def __str__(self):
        return f"GoogleCaptcha, {self.data}"
class GoogleCookiePolicies(Exception):
    """Raised when Google redirects to its cookie-consent page."""

    def __init__(self, *args):
        # The first positional argument, if any, overrides the default text.
        self.data = args[0] if args else "Google Cookie Policy detected"

    def __str__(self):
        return f"GoogleCookiePolicies, {self.data}"
class BaiduDetection(Exception):
    """Raised when Baidu's anti-robot protection keeps blocking requests."""

    def __init__(self, *args):
        # The first positional argument, if any, overrides the default text.
        self.data = args[0] if args else "Baidu Robot detected"

    def __str__(self):
        return f"BaiduDetection, {self.data}"
================================================
FILE: metafinder/utils/file/__init__.py
================================================
================================================
FILE: metafinder/utils/file/download.py
================================================
from os import sep, listdir, remove
import os.path
from concurrent.futures import ThreadPoolExecutor, as_completed
from metafinder.utils.file.metadata import extract_metadata
import time
from os import sep
import requests
from random import randint
from metafinder.utils.agent import user_agent
from metafinder.utils.var_data import *
# Disable warning by SSL certificate
import urllib3
urllib3.disable_warnings()
from metafinder.utils.color_print import print_error, print_ok
def download_document(element, directory, display):
    """Download a single document and extract its metadata.

    Args:
        element: Dict with CONST_URL and CONST_SEARCH_ENGINES keys.
        directory: Destination folder for the downloaded file.
        display: Whether to print per-file progress/errors.

    Returns:
        dict: name/url/metadata/status_code/search_engines keys, or {} when
        the request failed.
    """
    metadata = {}
    url = element[CONST_URL]
    search_engines = element[CONST_SEARCH_ENGINES]
    try:
        response = requests.get(url, headers=user_agent.get(randint(0, len(user_agent) - 1)),
                                timeout=10, verify=False)
        url = response.url
        # BUG FIX: URLs are always '/'-separated; splitting on os.sep broke
        # the file-name extraction on Windows (where sep == '\\').
        name = url.split('/')[-1].split('?')[0]
        file_name = directory + sep + name
        s_code = response.status_code
        data = {}
        if s_code == 200:
            with open(file_name, "wb") as f:
                f.write(response.content)
            data = extract_metadata(file_name)
            if display:
                print_ok(f"Downloaded file {url}")
        elif display:
            print_error(f"(Status code: {s_code}) File {url}")
        metadata = {
            CONST_NAME: name,
            CONST_URL: url,
            CONST_METADATA: data,
            CONST_STATUS_CODE: s_code,
            CONST_SEARCH_ENGINES: search_engines}
    except Exception as ex:
        if display:
            # BUG FIX: "donwloading" typo in the user-facing error message.
            print_error(f"Error downloading {url} >> {ex}")
    return metadata
def download_file(urls_metadata, directory, threads, display=True):
    """Download every document concurrently and collect their metadata.

    Args:
        urls_metadata: List of {url, search_engines} dicts.
        directory: Folder where files are written (and then cleaned up).
        threads: Number of download workers.
        display: Whether to print per-file progress.

    Returns:
        dict: file name -> {url, metadata, status_code, search_engines}.
    """
    metadata_files = {}
    with ThreadPoolExecutor(max_workers=threads) as executor:
        future_download = {executor.submit(download_document, url, directory, display): url
                           for url in urls_metadata}
        for future in as_completed(future_download):
            try:
                data = future.result()
                if data:
                    name = data.pop(CONST_NAME)
                    metadata_files[name] = data
            except Exception as ex:
                if display:
                    print_error(f"Error: {ex}")
    # Downloads are only needed for metadata extraction: remove them so no
    # trace is left. Cleanup stays best-effort, but the bare except is
    # narrowed so programming errors are no longer swallowed.
    try:
        for f in listdir(directory):
            remove(os.path.join(directory, f))
    except OSError:
        pass
    return metadata_files
================================================
FILE: metafinder/utils/file/metadata.py
================================================
from docx import Document
import pikepdf
from openpyxl import load_workbook
from pptx import Presentation
from datetime import datetime
def _get_properties(prop):
metadata = {}
metadata["Author"] = prop.author
metadata["Comments"] = prop.comments
metadata["Created"] = str(prop.created)
metadata["Identifier"] = prop.identifier
metadata["Keywords"] = prop.keywords
metadata["Modified"] = str(prop.modified)
metadata["Subject"] = prop.subject
metadata["Title"] = prop.title
def extract_doc(document):
    """Extract the core metadata properties from a .doc/.docx file."""
    return _get_properties(Document(document).core_properties)
def extract_pdf(document):
    """Extract the document-info metadata from a PDF file.

    PDF date strings look like "D:YYYYMMDDHHmmSS" optionally followed by a
    timezone suffix such as "+01'00'", "-05'00'" or "Z"; when possible they
    are converted to a readable datetime string, otherwise the raw value is
    kept.
    """
    metadata = {}
    with pikepdf.open(document) as pdf:
        info = pdf.docinfo
        for meta in info:
            data = str(info.get(meta, ""))
            if data and data.startswith("D:"):
                try:
                    # BUG FIX: take the 14 digits after "D:" directly; the
                    # original only stripped '+' timezone suffixes, so dates
                    # with '-' or 'Z' suffixes were silently left raw.
                    d = data[2:16]
                    data = str(datetime.strptime(d, "%Y%m%d%H%M%S"))
                except Exception:
                    pass  # malformed/short date: keep the raw string
            metadata[meta[1:]] = data  # [1:] drops the leading '/' of the key
    return metadata
def extract_xls(document):
    """Extract workbook metadata from an .xls/.xlsx file.

    NOTE(review): openpyxl's DocumentProperties exposes the author under
    ``creator``; if ``properties.author`` does not exist, the AttributeError
    is swallowed by extract_metadata() and {} is returned — confirm.
    """
    workbook = load_workbook(document)
    return _get_properties(workbook.properties)
def extract_ppt(document):
    """Extract the core metadata properties from a .ppt/.pptx file."""
    return _get_properties(Presentation(document).core_properties)
def remove_indirect_object(metadata):
    """Drop empty values and unresolved pikepdf IndirectObject entries.

    Args:
        metadata: Raw metadata dict.

    Returns:
        dict: Same mapping without empty-string values or values whose text
        contains "IndirectObject(".
    """
    return {
        key: value
        for key, value in metadata.items()
        if "IndirectObject(" not in str(value) and str(value) != ""
    }
def extract_metadata(document):
    """Dispatch to the right extractor based on the file extension.

    Any extraction failure yields an empty result; the final dict is always
    cleaned of empty values and pikepdf indirect objects.
    """
    dispatch = (
        (("pdf",), extract_pdf),
        (("doc", "docx"), extract_doc),
        (("xls", "xlsx"), extract_xls),
        (("ppt", "pptx"), extract_ppt),
    )
    metadata = {}
    try:
        for suffixes, extractor in dispatch:
            if document.endswith(suffixes):
                metadata = extractor(document)
                break
    except:
        metadata = {}
    return remove_indirect_object(metadata)
================================================
FILE: metafinder/utils/file/parser.py
================================================
from os import sep
from metafinder.utils.var_data import *
def file_parser_list(directory, name, data):
    """Write each element of *data* on its own line in directory/name."""
    path = directory + sep + name
    with open(path, "w") as output:
        output.writelines(f"{item}\n" for item in data)
def file_parser(directory, name, metadata):
    """Write the full per-document metadata report to directory/name.

    Args:
        directory: Output folder.
        name: Report file name.
        metadata: Mapping document name -> {url, status_code,
            search_engines, metadata} dict as produced by download_file().
    """
    file_name = directory + sep + name
    with open(file_name, "w") as output:
        for key, value in metadata.items():
            output.write("\n")
            output.write(key + "\n")
            output.write("-" * len(key) + "\n")
            output.write(f"URL: {value[CONST_URL]}\n")
            output.write(f"Status code: {value[CONST_STATUS_CODE]}\n")
            # BUG FIX: the original did `engines = se + ", "` inside the
            # loop, overwriting instead of accumulating, so only the last
            # search engine was ever reported.
            engines = ", ".join(value[CONST_SEARCH_ENGINES])
            output.write(f"Search engines: {engines}\n")
            if value[CONST_METADATA]:
                for field, field_value in value[CONST_METADATA].items():
                    if field_value is not None:
                        output.write(f'|_ {field}: {field_value}\n')
            else:
                output.write(f'|_ No metadata found\n')
================================================
FILE: metafinder/utils/finder/__init__.py
================================================
================================================
FILE: metafinder/utils/finder/baidu.py
================================================
import requests
from bs4 import BeautifulSoup
from time import sleep
from random import randint
from metafinder.utils.exception import BaiduDetection
from metafinder.utils.agent import user_agent
from concurrent.futures import ThreadPoolExecutor, as_completed
import urllib3
urllib3.disable_warnings()
def _get_link(baidu_link):
    """Resolve a Baidu redirect URL to the real target.

    Baidu result links are redirects; the real document URL is taken from
    the Location header without following it. Returns None on any failure.
    """
    try:
        response = requests.get(baidu_link, timeout=5, allow_redirects=False)
        return response.headers.get("Location", None)
    except:
        return None
def search(target, total):
    """Search Baidu for PDF documents hosted on *target* (experimental).

    Args:
        target: Domain to search.
        total: Maximum number of links to return.

    Raises:
        BaiduDetection: After five consecutive anti-robot timeout pages.

    Returns:
        list: Resolved (de-redirected) document URLs.
    """
    num_results = 50 if total >= 50 else total
    documents = []
    base_url = "https://www.baidu.com/s?ie=utf-8"
    total_loop = int(total / num_results)
    if (total % num_results) != 0:
        total_loop += 1
    count = 1
    old_useragent = -1
    total_timeout = 0
    while (count <= total_loop) and (len(documents) < total):
        # Pick a user agent different from the one used on the last request.
        while True:
            next_useragent = randint(0, len(user_agent) - 1)
            if next_useragent != old_useragent:
                break
        old_useragent = next_useragent
        new_url = base_url + f"&pn={count*num_results}&wd=(site:{target}+|+site:*.{target})+filetype:pdf&rn={num_results}"
        try:
            # BUG FIX: the original called user_agent.get(count, next_useragent),
            # which looked up the *page counter* and fell back to a bare int
            # once count exceeded the pool size, sending invalid headers.
            new_agent = user_agent.get(next_useragent)
            response = requests.get(new_url, headers=new_agent, timeout=5, verify=False)
            text = response.text
            if "timeout-button" in text:
                # Anti-robot page: back off and retry the same page.
                total_timeout += 1
                if total_timeout == 5:
                    raise BaiduDetection
                sleep(2)
                continue
            soup = BeautifulSoup(text, "html.parser")
            all_h3 = soup.findAll("h3", {"class": "t"})
            for h3 in all_h3:
                href = h3.a.get("href", None)
                if href and href not in documents:
                    documents.append(href)
                    if len(documents) >= total:
                        break
        except Exception as ex:
            raise ex  # re-raise so the caller reports the engine failure
        count += 1
    # Resolve the Baidu redirect links to the real document URLs in parallel.
    return_documents = []
    with ThreadPoolExecutor(max_workers=6) as executor:
        future_links = {executor.submit(_get_link, url): url for url in documents}
        for future in as_completed(future_links):
            try:
                resolved = future.result()
                if resolved and resolved not in return_documents:
                    return_documents.append(resolved)
            except:
                pass
    return return_documents
================================================
FILE: metafinder/utils/finder/bing.py
================================================
from urllib.parse import urlencode, urlunparse
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
from random import randint
from metafinder.utils.agent import user_agent
def search(target, total):
    """Search Bing for Office/PDF documents hosted on *target*.

    Args:
        target: Domain to search.
        total: Maximum number of document links to return.

    Returns:
        list: Document URLs; on any network/parse error, whatever was
        collected so far is returned (best effort).
    """
    page_size = 25
    documents = []
    base_url = f"https://www.bing.com/search?q=site:{target}+(filetype:pdf+OR+filetype:doc+OR%20filetype:docx+OR+filetype:xls+OR+filetype:xlsx+OR+filetype:ppt+OR+filetype:pptx)&count={page_size}"
    wanted_suffixes = ("pdf", "doc", "docx", "ppt", "pptx", "xls", "xlsx")
    try:
        pages = int(total / page_size)
        if (total % page_size) != 0:
            pages += 1
        page = 0
        while page < pages and len(documents) < total:
            first = page * page_size + 1
            request = Request(
                base_url + f"&first={first}&FORM=PERE",
                headers={"User-Agent": user_agent.get(randint(0, len(user_agent) - 1))["User-agent"]},
            )
            soup = BeautifulSoup(urlopen(request).read(), "html.parser")
            for link in soup.find_all("a"):
                href = link.get("href", None)
                if (href and target in href
                        and href.endswith(wanted_suffixes)
                        and href not in documents):
                    documents.append(href)
                    if len(documents) >= total:
                        break
            page += 1
    except Exception:
        pass  # best effort: return what has been collected so far
    return documents
================================================
FILE: metafinder/utils/finder/google.py
================================================
import requests
from bs4 import BeautifulSoup
from random import randint
from metafinder.utils.exception import GoogleCaptcha, GoogleCookiePolicies
from metafinder.utils.agent import user_agent
import urllib3
urllib3.disable_warnings()
def search(target, total):
    """Search Google for Office/PDF documents hosted on *target*.

    Args:
        target: Domain to search.
        total: Maximum number of document links to return.

    Raises:
        GoogleCookiePolicies: If Google redirects to its cookie-consent page.
        GoogleCaptcha: If Google answers with its unusual-traffic captcha.

    Returns:
        list: Document URLs.
    """
    documents = []
    start = 0
    num = 50 if total > 50 else total
    iterations = int(total / 50)
    if (total % 50) != 0:
        iterations += 1
    ## Check https://github.com/n4xh4ck5/RastLeak - thanks Nacho
    url_base = f"https://www.google.com/search?q=(ext:pdf OR ext:doc OR ext:docx OR ext:xls OR ext:xlsx OR ext:ppt OR ext:pptx)+(site:*.{target} OR site:{target})&num={num}"
    cookies = {"CONSENT": "YES+srp.gws"}
    while (start < iterations) and (len(documents) < total):
        try:
            url = url_base + f"&start={start}"
            response = requests.get(url,
                                    headers={'User-agent': 'APIs-Google (+https://developers.google.com/webmasters/APIs-Google.html)'},
                                    timeout=5,
                                    verify=False,
                                    allow_redirects=False,
                                    cookies=cookies)
            text = response.text
            # BUG FIX: the redirect marker was misspelled "htps://", so the
            # cookie-consent redirect was never detected on that branch.
            if response.status_code == 302 and ("https://www.google.com/webhp" in text or "https://consent.google.com" in text):
                raise GoogleCookiePolicies()
            if "detected unusual traffic" in text:
                raise GoogleCaptcha()
            soup = BeautifulSoup(text, "html.parser")
            all_links = soup.find_all("a")
            follow = False
            for link in all_links:
                href = link.get("href", None)
                if href and target in href and "google" not in href:
                    try:
                        # Google wraps results as /url?q=<real>&sa=U&...
                        href = "http" + href.split("=http")[1]
                        href = href.split("&sa=U&")[0]
                        follow = True
                        if href not in documents:
                            documents.append(href)
                            if len(documents) >= total:
                                break
                    except:
                        continue
            if not follow:
                # No result links on this page: stop paginating.
                break
        except Exception as ex:
            raise ex  # re-raise so the caller reports the engine failure
        start += 1
    return documents
================================================
FILE: metafinder/utils/library.py
================================================
from pathlib import Path
import tempfile
from os.path import isfile
from metafinder.utils.finder import google
from metafinder.utils.finder import bing
from metafinder.utils.finder import baidu
from metafinder.utils.file.download import download_file
from metafinder.utils.file.metadata import extract_metadata
from metafinder.utils.result import Result
from metafinder.utils.var_data import *
def _generate_list(links, search_engine):
    """Wrap raw URLs into the dict shape download_file() expects."""
    return [
        {CONST_URL: link, CONST_SEARCH_ENGINES: [search_engine]}
        for link in links
    ]
def extract_metadata_from_google_search(domain, limit=50, threads=4):
    """Search metadata in files through Google.

    Args:
        domain: Target domain.
        limit: Maximum number of files to search.
        threads: Threads for downloading documents.

    Raises:
        GoogleCaptcha: If Google displays the captcha.
        GoogleCookiePolicies: If Google redirects to its consent page.
        Exception: If there is an error.

    Returns:
        Result: Collected metadata, or None when nothing was found.
    """
    links = google.search(domain, limit)
    metadata_files = None
    if links:
        # BUG FIX: the temporary download folder is now a context manager,
        # so it is cleaned up even when a download raises (the original
        # leaked it on exceptions).
        with tempfile.TemporaryDirectory() as directory:
            metadata_files = download_file(_generate_list(links, "Google"), directory, threads, False)
    return Result(metadata_files) if metadata_files else None
def extract_metadata_from_bing_search(domain, limit=50, threads=4):
    """Search metadata in files through Bing.

    Args:
        domain: Target domain.
        limit: Maximum number of files to search.
        threads: Threads for downloading documents.

    Raises:
        Exception: If there is an error.

    Returns:
        Result: Collected metadata, or None when nothing was found.
    """
    links = bing.search(domain, limit)
    metadata_files = None
    if links:
        # BUG FIX: the temporary download folder is now a context manager,
        # so it is cleaned up even when a download raises (the original
        # leaked it on exceptions).
        with tempfile.TemporaryDirectory() as directory:
            metadata_files = download_file(_generate_list(links, "Bing"), directory, threads, False)
    return Result(metadata_files) if metadata_files else None
def extract_metadata_from_baidu_search(domain, limit=50, threads=2):
    """Search metadata in PDF files through Baidu (slow method).

    Args:
        domain: Target domain.
        limit: Maximum number of files to search.
        threads: Threads for downloading documents.

    Raises:
        BaiduDetection: If Baidu's anti-robot protection blocks the search.
        Exception: If there is an error.

    Returns:
        Result: Collected metadata, or None when nothing was found.
    """
    links = baidu.search(domain, limit)
    metadata_files = None
    if links:
        # BUG FIX: the temporary download folder is now a context manager,
        # so it is cleaned up even when a download raises (the original
        # leaked it on exceptions). The docstring also wrongly blamed
        # "Google" for the BaiduDetection case.
        with tempfile.TemporaryDirectory() as directory:
            metadata_files = download_file(_generate_list(links, "Baidu"), directory, threads, False)
    return Result(metadata_files) if metadata_files else None
def extract_metadata_from_document(document):
    """Search metadata in a document.

    Args:
        document: Path of the document to be analyzed.

    Raises:
        FileNotFoundError: If the file doesn't exist.

    Returns:
        dict: The extracted metadata.
    """
    if not isfile(document):
        raise FileNotFoundError()
    return extract_metadata(document)
================================================
FILE: metafinder/utils/result.py
================================================
class Result:
    """Container for metadata collected from downloaded documents.

    Functions:
        get_metadata: Return all metadata collected.
        get_authors: Returns all authors found in a list.
        get_software: Returns all software found in a list.
    """

    def __init__(self, metadata):
        self._metadata = metadata
        self._authors, self._software = self._configure_data(metadata)

    def get_metadata(self):
        return self._metadata

    def get_authors(self):
        return self._authors

    def get_software(self):
        return self._software

    def _configure_data(self, metadata):
        """Collect unique Author and Producer/Creator values from every entry."""
        authors = []
        software = []
        for entry in metadata.values():
            fields = entry["metadata"]
            for field, field_value in fields.items():
                if field == "Author" and field_value not in authors:
                    authors.append(field_value)
                elif field in ("Producer", "Creator") and field_value not in software:
                    software.append(field_value)
        return authors, software
================================================
FILE: metafinder/utils/var_data.py
================================================
# Define constants: keys shared by the link/metadata dicts passed between
# the finder, download and parser modules.
CONST_SEARCH_ENGINES = "search_engines"  # list of engines that found the URL
CONST_URL = "url"  # document URL (final URL after redirects)
CONST_NAME = "name"  # downloaded file name, used as the report key
CONST_METADATA = "metadata"  # dict of extracted metadata fields
CONST_STATUS_CODE = "status_code"  # HTTP status of the download
================================================
FILE: requirements.txt
================================================
requests
pikepdf
beautifulsoup4
openpyxl
python-docx
python-pptx
prompt-toolkit
urllib3
================================================
FILE: setup.py
================================================
from setuptools import setup, find_packages
from os import path
import metafinder
long_description = """
**MetaFinder - Metadata search through Search Engines**
=======================================================
Installation:
-------------
> pip3 install metafinder
Upgrades are also available using:
> pip3 install metafinder --upgrade
Usage
-----
MetaFinder can be used in 2 ways:
CLI
---
metafinder -d domain.com -l 20 -o folder [-t 10] -go -bi -ba
Parameters:
- d: Specifies the target domain.
- l: Specify the maximum number of results to be searched.
- o: Specify the path to save the report.
- t: Optional. Used to configure the threads (4 by default).
- v: Show Metafinder version.
- go: Optional. Search in Google. (Default)
- bi: Optional. Search in Bing.
- ba: Optional. Search in Baidu. (Experimental)
In Code
-------
import metafinder.extractor as metadata_extractor
documents_limit = 5
domain = "target_domain"
result = metadata_extractor.extract_metadata_from_google_search(domain, documents_limit)
# result = metadata_extractor.extract_metadata_from_bing_search(domain, documents_limit)
# result = metadata_extractor.extract_metadata_from_baidu_search(domain, documents_limit)
authors = result.get_authors()
software = result.get_software()
for k,v in result.get_metadata().items():
print(f"{k}:")
print(f"|_ URL: {v['url']}")
for metadata,value in v['metadata'].items():
print(f"|__ {metadata}: {value}")
document_name = "test.pdf"
try:
metadata_file = metadata_extractor.extract_metadata_from_document(document_name)
for k,v in metadata_file.items():
print(f"{k}: {v}")
except FileNotFoundError:
print("File not found")
Author
======
This project has been developed by:
- **Josué Encinar García** -- https://twitter.com/JosueEncinar
Contributors
============
- **Félix Brezo Fernández** -- https://twitter.com/febrezo
Disclaimer!
===========
The software is designed to leave no trace in the documents we upload to a domain. The author is not responsible for any
illegitimate use.
"""
# Packaging metadata; the version is read from metafinder/__init__.py so it
# is defined in exactly one place.
setup(
    name='metafinder',
    version=metafinder.__version__,
    author='Josue Encinar (@JosueEncinar)',
    description='MetaFinder - Metadata search through Search Engines',
    include_package_data=True,
    license='GNU GPLv3+',
    packages=find_packages(),
    long_description=long_description,
    long_description_content_type='text/markdown',
    url="https://github.com/Josue87/MetaFinder",
    # Installs the `metafinder` console command, wired to cli.main().
    entry_points={
        'console_scripts': [
            'metafinder=metafinder.cli:main',
        ],
    },
    install_requires=[
        "requests>=2.25.1",
        "pikepdf>=2.5.2",
        "beautifulsoup4>=4.9.3",
        "openpyxl>=3.0.5",
        "python-docx>=0.8.6",
        "python-pptx>=0.6.18",
        "prompt-toolkit>=3.0.5",
        "urllib3>=1.26.4"
    ]
)