Repository: DistriNet/tranco-list Branch: master Commit: 27e2502acb27 Files: 10 Total size: 36.5 KB Directory structure: gitextract_1v7f8h45/ ├── README.md ├── combined_lists.py ├── generate_daily_list.py ├── generate_domain_parts.py ├── global_config.py ├── job_handler.py ├── job_server.py ├── notify_email.py ├── requirements.txt └── shared.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: README.md ================================================ # Tranco: A Research-Oriented Top Sites Ranking Hardened Against Manipulation *By Victor Le Pochat, Tom Van Goethem, Samaneh Tajalizadehkhoob, Maciej Korczyński and Wouter Joosen* This repository contains the source code driving the generation of the Tranco ranking provided at [https://tranco-list.eu/](https://tranco-list.eu/). This new top websites ranking was proposed in our paper [Tranco: A Research-Oriented Top Sites Ranking Hardened Against Manipulation](https://tranco-list.eu/assets/tranco-ndss19.pdf). * `combined_lists.py` contains the core code for generating new lists based on a configuration passed to `combined_lists.generate_combined_list`. * `shared.py` and `global_config.py` contain several configuration variables; `shared.DEFAULT_TRANCO_CONFIG` gives the configuration of the default (daily updated) Tranco list. * `generate_daily_list.py` runs daily to generate the default Tranco list. * `job_handler.py` contains either the code for submitting jobs to an `rq` queue for processing, or code to relay requests for list generation to a remote host. * `job_server.py` accepts request for list generation on a remote host. * `notify_email.py` contains code to notify users when their list has been generated. * `generate_domain_parts.py` preprocesses rankings to extract the different components of domains. ================================================ FILE: combined_lists.py ================================================ # Imports import csv import datetime import glob import shutil import time import traceback import zipfile from itertools import islice import os import tempfile # Imports of configuration variables from global_config import * # Constants GLOBAL_MAX_RANK = 1000000 LIST_FILENAME_FORMAT = "{}.csv" from shared import ZIP_FILENAME_FORMAT # When using AWS services, set up retrieval and storage of lists for S3 if USE_S3: import boto3 s3_resource = boto3.resource('s3', region_name="us-east-1") toplists_archive_bucket = s3_resource.Bucket(name=TOPLISTS_ARCHIVE_S3_BUCKET) from smart_open import smart_open # List ID generation from hashids import Hashids hsh = Hashids(salt="tsr", min_length=4, alphabet="BCDFGHJKLMNPQRSTVWXYZ23456789") # Mongo connection for storing configuration of generated lists from pymongo import MongoClient client = MongoClient(MONGO_URL) db = client["tranco"] def count_dict(dct, entry, value=1): """ Helper function for updating dictionaries """ if not entry in dct: dct[entry] = 0 dct[entry] += value def date_list(start_date, end_date): """ Generate list of dates between start and end date """ start_date_dt = datetime.datetime.strptime(start_date, "%Y-%m-%d") end_date_dt = datetime.datetime.strptime(end_date, "%Y-%m-%d") return [(start_date_dt + datetime.timedelta(days=x)) for x in range((end_date_dt - start_date_dt).days + 1)] def _db_id_to_list_id(db_id): """ List number to hash """ if db_id: return hsh.encode(db_id) else: return None def _list_id_to_db_id(list_id): """ Hash to list number """ try: return hsh.decode(list_id)[0] except: return None def config_to_list_id(config, insert=True, skip_failed=False): """ List configuration to list hash (either insert new configuration into database, or retrieve ID for existing list with that configuration) :param config: list configuration :param insert: whether to create a new list ID if the given configuration does not exist yet :param skip_failed: skip failed lists :return: """ if skip_failed: query = {**config, "failed": {"$ne": True}} else: query = config out = db["lists"].find_one(query) if out: db_id = int(out["_id"]) else: if insert: db_id = get_next_db_key() insert_config_in_db(config, db_id) else: return None return _db_id_to_list_id(db_id) def list_id_to_config(list_id): """ Retrieve configuration of existing list based on hash """ db_id = _list_id_to_db_id(list_id) if db_id: return {**db["lists"].find_one({"_id": int(db_id)}), "list_id": list_id} def list_available(list_id): """ Check if list is available for download """ db_id = _list_id_to_db_id(list_id) if not db_id: return False doc = db["lists"].find_one({"_id": int(db_id)}) return doc is not None and doc.get("finished", False) and not doc.get("failed", True) def get_next_db_key(): """ Get next key from list configuration database (for a new list) """ counter_increase = db["counter"].find_one_and_update({"_id": "lists"}, {'$inc': {'count': 1}}) return int(counter_increase["count"]) def insert_config_in_db(config, db_id): """ Insert a new configuration into the database, with the given key """ db["lists"].insert_one({**config, "_id": db_id, "finished": False, "creationDate": datetime.datetime.now().strftime("%Y-%m-%d"), "creationTime": datetime.datetime.now().isoformat()}) def get_generated_list_fp(list_id): """ Get file location of existing list (file-based archive) """ return os.path.join(NETAPP_STORAGE_PATH, "generated_lists/{}".format(LIST_FILENAME_FORMAT.format(list_id))) def get_generated_zip_fp(list_id): """ Get file location of existing zip (file-based archive) """ return os.path.join(NETAPP_STORAGE_PATH, "generated_lists_zip/{}".format(ZIP_FILENAME_FORMAT.format(list_id))) def get_generated_list_s3(list_id): """ Get file location of existing list (AWS S3) """ return "s3://{}/{}".format(TOPLISTS_GENERATED_LIST_S3_BUCKET, LIST_FILENAME_FORMAT.format(list_id)) def get_generated_zip_s3(list_id): """ Get file location of existing zip (AWS S3) """ return "s3://{}/{}".format(TOPLISTS_DAILY_LIST_S3_BUCKET, ZIP_FILENAME_FORMAT.format(list_id)) def get_list_fp_for_day(provider, date, parts=False): """ Get file location for source list (of one of the providers) """ date = date.strftime("%Y%m%d") if parts: fp = next(glob.iglob(os.path.join(NETAPP_STORAGE_PATH, "archive/{}/parts/{}_{}_parts.csv".format(provider, provider, date)))) else: fp = next(glob.iglob(os.path.join(NETAPP_STORAGE_PATH, "archive/{}/{}_{}.csv".format(provider, provider, date)))) return fp def get_s3_key_for_day(provider, date, parts=False): """ Get S3 key for source list (of one of the providers) """ date = date.strftime("%Y%m%d") if parts: fp = "{}/parts/{}_{}_parts.csv".format(provider, provider, date) else: fp = "{}/{}_{}.csv".format(provider, provider, date) return fp def get_s3_url_for_day(provider, date, parts=False): """ Get S3 url for source list (of one of the providers) """ key = get_s3_key_for_day(provider, date, parts) return "s3://{}/{}".format(TOPLISTS_ARCHIVE_S3_BUCKET, key) def get_s3_url_for_fp(fp): """ Get S3 url for source list (of one of the providers) """ return "s3://{}/{}".format(TOPLISTS_ARCHIVE_S3_BUCKET, fp) def generate_prefix_items_file(fp, list_prefix): """ Create list of source list items (up to requested list length) """ with open(fp, encoding='utf8') as f: if list_prefix: return [r.split(",") for r in islice(f.read().splitlines(), list_prefix)] else: return [r.split(",") for r in f.read().splitlines()] def generate_prefix_items_s3(fp, list_prefix): """ Create list of source list items (up to requested list length) """ with smart_open(get_s3_url_for_fp(fp)) as f: if list_prefix: result = [r.decode("utf-8").split(",") for r in islice(f.read().splitlines(), list_prefix)] else: result = [r.decode("utf-8").split(",") for r in f.read().splitlines()] return result def rescale_rank(rank, max_rank_of_input, min_rank_of_output, max_rank_of_output): """ Rescale a given rank to the min/max range provided This makes sure that shorter lists are not given a higher importance. """ return min_rank_of_output + (rank - 1)*((max_rank_of_output-min_rank_of_output)/(max_rank_of_input - 1)) def borda_count_fp(fps, list_prefix): """ Generate aggregate scores for domains based on Borda count """ borda_scores = {} for fp in fps: if USE_S3: items = generate_prefix_items_s3(fp, list_prefix) else: items = generate_prefix_items_file(fp, list_prefix) max_rank_of_input = len(items) max_rank_of_output = min(GLOBAL_MAX_RANK, list_prefix if list_prefix else GLOBAL_MAX_RANK) for rank, elem in items: count_dict(borda_scores, elem, max_rank_of_output + 1 - rescale_rank(int(rank), max_rank_of_input, 1, max_rank_of_output)) # necessary to rescale shorter lists (i.e. Quantcast) return borda_scores def dowdall_count_fp(fps, list_prefix): """ Generate aggregate scores for domains based on Dowdall count """ dowdall_scores = {} for fp in fps: if USE_S3: items = generate_prefix_items_s3(fp, list_prefix) else: items = generate_prefix_items_file(fp, list_prefix) max_rank_of_input = len(items) max_rank_of_output = min(GLOBAL_MAX_RANK, list_prefix if list_prefix else GLOBAL_MAX_RANK) for rank, elem in items: count_dict(dowdall_scores, elem, 1 / rescale_rank(int(rank), max_rank_of_input, 1, max_rank_of_output)) # necessary to rescale shorter lists (i.e. Quantcast) return dowdall_scores def filtered_parts_list_file(fp, list_prefix, f_pld=None, f_tlds=None, f_organization=None, f_subdomains=None, maintain_rank=True): """ Get list of domains that conform to the set filters """ with open(fp) as f: if list_prefix: parts_input = islice(f, list_prefix) else: parts_input = f output = [] organizations_seen = set() new_rank = 1 max_rank = 0 for line in parts_input: max_rank += 1 rank, fqdn, pld, sld, subd, ps, tld, is_pld = line.rstrip().split(",") if f_tlds and (tld not in f_tlds): continue if f_subdomains and (subd not in f_subdomains): continue if f_organization: if sld in organizations_seen: continue else: organizations_seen.add(sld) if f_pld: if is_pld != "True": continue if maintain_rank: output.append((rank, fqdn)) else: output.append((new_rank, fqdn)) new_rank += 1 return (output, max_rank) def filtered_parts_list_s3(fp, list_prefix, f_pld=None, f_tlds=None, f_organization=None, f_subdomains=None, maintain_rank=True): """ Get list of domains that conform to the set filters """ with smart_open(get_s3_url_for_fp(fp)) as f: if list_prefix: parts_input = islice(f, list_prefix) else: parts_input = f output = [] organizations_seen = set() new_rank = 1 max_rank = 0 for line in parts_input: max_rank += 1 rank, fqdn, pld, sld, subd, ps, tld, is_pld = line.decode("utf-8").rstrip().split(",") if f_tlds and (tld not in f_tlds): continue if f_subdomains and (subd not in f_subdomains): continue if f_organization: if sld in organizations_seen: continue else: organizations_seen.add(sld) if f_pld: if is_pld != "True": continue if maintain_rank: output.append((rank, fqdn)) else: output.append((new_rank, fqdn)) new_rank += 1 return (output, max_rank) def get_filtered_parts_lists(fps, input_prefix, config, maintain_rank=True): """ Get domains in given source lists that conform to the filters in the configuration """ for fp in fps: if USE_S3: yield filtered_parts_list_s3(fp, input_prefix, config.get("filterPLD", None) == "on", config.get('filterTLDValue').split(",") if config.get("filterTLDValue", None) else None, config.get("filterOrganization", None) == "on", config.get('filterSubdomainValue').split(",") if config.get( "filterSubdomainValue", None) else None, maintain_rank=maintain_rank ) else: yield filtered_parts_list_file(fp, input_prefix, config.get("filterPLD", None) == "on", config.get('filterTLDValue').split(",") if config.get("filterTLDValue", None) else None, config.get("filterOrganization", None) == "on", config.get('filterSubdomainValue').split(",") if config.get( "filterSubdomainValue", None) else None, maintain_rank=maintain_rank ) def borda_count_list(fps, input_prefix, config, maintain_rank=True): """ Generate aggregate scores for list of filtered domains based on Borda count """ borda_scores = {} for (filtered_lst, max_rank) in get_filtered_parts_lists(fps, input_prefix, config): if maintain_rank: max_rank_of_input = max_rank else: max_rank_of_input = len(filtered_lst) max_rank_of_output = min(GLOBAL_MAX_RANK, input_prefix if input_prefix else GLOBAL_MAX_RANK) for rank, elem in filtered_lst: count_dict(borda_scores, elem, max_rank_of_output + 1 - rescale_rank(int(rank), max_rank_of_input, 1, max_rank_of_output)) # necessary to rescale shorter lists return borda_scores def dowdall_count_list(fps, input_prefix, config, maintain_rank=True): """ Generate aggregate scores for list of filtered domains based on Dowdall count """ dowdall_scores = {} for (filtered_lst, max_rank) in get_filtered_parts_lists(fps, input_prefix, config): if maintain_rank: max_rank_of_input = max_rank else: max_rank_of_input = len(filtered_lst) max_rank_of_output = min(GLOBAL_MAX_RANK, input_prefix if input_prefix else GLOBAL_MAX_RANK) for rank, elem in filtered_lst: count_dict(dowdall_scores, elem, 1 / rescale_rank(int(rank), max_rank_of_input, 1, max_rank_of_output)) # necessary to rescale shorter lists return dowdall_scores def sort_counts(scores): """ Sort domains based on aggregate scores """ return sorted(scores.keys(), key=lambda elem: (-scores[elem], elem)) def filter_list_1(lst, filter_set, list_size=None): """ Filter list of domains on given set of domains """ if list_size: result = [] for e in lst: if e in filter_set: result.append(e) if len(result) >= list_size: break return result else: return [e for e in lst if e in filter_set] def filter_list_multiple(lst, filter_sets): """ Filter list of domains on given sets of domains """ return [e for e in lst if all(e in filter_set for filter_set in filter_sets)] def count_presence_in_fps(fps, prefix): """ Counts of occurrences in given files with domains """ presence = {} for fp in fps: lst = generate_prefix_items_s3(fp, prefix) for i in lst: count_dict(presence, i, 1) def count_presence_in_sets(sets,): """ Counts of occurrences in given sets """ presence = {} for st in sets: for i in st: count_dict(presence, i, 1) return presence def items_in_any_list(fps, prefix): """ Find domains that appear in any of the given lists """ return set.union(*map(set, [[i[1] for i in generate_prefix_items_s3(fp, prefix)] for fp in fps])) def generate_filter_minimum_presence(fps, prefix, minimum): """ An item should appear on all the lists """ presence = count_presence_in_fps(fps, prefix) return {k for k, v in presence.items() if v >= minimum} def generate_filter_minimum_presence_any(groups_of_fps, prefix, minimum): """ An item should appear in `minimum` groups, where an item may appear in any list in that group """ items_per_group = [items_in_any_list(group, prefix) for group in groups_of_fps] presence = count_presence_in_sets(items_per_group,) return {k for k, v in presence.items() if v >= minimum} def truncate_list(lst, list_size=None): """ Return only prefix of given list """ return lst[:list_size] if list_size else lst def write_sorted_counts(sorted_items, scores, fp): """ Write domains and aggregate scores to file """ with open(fp, 'w', encoding='utf8') as f: csvw = csv.writer(f) for idx, entry in enumerate(sorted_items): csvw.writerow([idx + 1, entry, scores[entry]]) def write_list_to_file(lst, list_id): """ Write ranks and domains to file """ with open(get_generated_list_fp(list_id), 'w', encoding='utf8') as f: csvw = csv.writer(f) for idx, entry in enumerate(lst): csvw.writerow([idx + 1, entry]) def write_zip_to_file(lst, list_id): """ Write list of (top 1M) domains to zip file """ with tempfile.SpooledTemporaryFile(mode='w+b') as z: with tempfile.NamedTemporaryFile(mode='w+') as t: csvw = csv.writer(t) for idx, entry in enumerate(lst): csvw.writerow([idx + 1, entry]) t.seek(0) with zipfile.ZipFile(z, 'w') as a: a.write(t.name, arcname="top-1m.csv") z.seek(0) with open(get_generated_zip_fp(list_id), 'wb') as f: f.write(z.read()) def write_list_to_s3(lst, list_id): """ Write ranks and domains to file """ with smart_open(get_generated_list_s3(list_id), 'w', encoding='utf8') as f: csvw = csv.writer(f) for idx, entry in enumerate(lst): csvw.writerow([idx + 1, entry]) def write_zip_to_s3(lst, list_id): """ Write list of (top 1M) domains to zip file """ with tempfile.SpooledTemporaryFile(mode='w+b') as z: with tempfile.NamedTemporaryFile(mode='w+') as t: csvw = csv.writer(t) for idx, entry in enumerate(lst): csvw.writerow([idx + 1, entry]) t.seek(0) with zipfile.ZipFile(z, 'w') as a: a.write(t.name, arcname="top-1m.csv") z.seek(0) with smart_open(get_generated_zip_s3(list_id), 'wb') as f: f.write(z.read()) def copy_daily_list_s3(list_id): """ Copy the daily list on S3 to the fixed URL """ zip_key = ZIP_FILENAME_FORMAT.format(list_id) source = {'Bucket': TOPLISTS_DAILY_LIST_S3_BUCKET, 'Key': zip_key} target_bucket = s3_resource.Bucket(TOPLISTS_DAILY_LIST_S3_BUCKET) target_bucket.copy(source, 'top-1m.csv.zip') def copy_daily_list_file(list_id): """ Copy the daily list on file-based archive to the fixed URL """ zip_file = get_generated_zip_fp(list_id) target_file = os.path.join(NETAPP_STORAGE_PATH, "generated_lists_zip/{}".format("top-1m.csv.zip")) shutil.copy2(zip_file, target_file) def generate_combined_list(config, list_id, test=False): """ Generate combined list by calculating aggregate scores on (potentially filtered) source lists of ranked domains """ db_id = _list_id_to_db_id(list_id) try: ### INPUT ### # If a filter on parts is selected, the preprocessed parts files should be used. parts_filter = config.get("filterPLD", False) or (config.get("filterTLD", "false") != "false") or config.get("filterOrganization", False) or config.get('filterSubdomain', False) dates = date_list(config.get("startDate"), config.get("endDate")) # Get source files to process fps = [] fps_on_date = {date: [] for date in dates} fps_on_provider = {provider: [] for provider in config['providers']} for provider in config['providers']: for date in dates: if USE_S3: list_fp = get_s3_key_for_day(provider, date, parts_filter) else: list_fp = get_list_fp_for_day(provider, date, parts_filter) fps.append(list_fp) fps_on_date[date].append(list_fp) fps_on_provider[provider].append(list_fp) # Get requested list prefix if "listPrefix" in config and config['listPrefix']: if config['listPrefix'] == "full": input_prefix = None elif config['listPrefix'] == "custom": input_prefix = int(config['listPrefixCustomValue']) else: input_prefix = int(config['listPrefix']) else: input_prefix = None # Generate (sorted) aggregate counts (on parts files if necessary) if parts_filter: if config['combinationMethod'] == 'borda': scores = borda_count_list(fps, input_prefix, config) elif config['combinationMethod'] == 'dowdall': scores = dowdall_count_list(fps, input_prefix, config) else: raise Exception("Unknown combination method") else: if config['combinationMethod'] == 'borda': scores = borda_count_fp(fps, input_prefix) elif config['combinationMethod'] == 'dowdall': scores = dowdall_count_fp(fps, input_prefix) else: raise Exception("Unknown combination method") sorted_domains = sort_counts(scores) domains = sorted_domains ### FILTERS ### filters_to_apply = [] if "inclusionDays" in config and config["inclusionDays"]: presence_filter = generate_filter_minimum_presence_any([fps_on_date[date] for date in dates], input_prefix, int(config["inclusionDaysValue"])) filters_to_apply.append(presence_filter) if "inclusionLists" in config and config["inclusionLists"]: presence_filter = generate_filter_minimum_presence_any([fps_on_provider[provider] for provider in config['providers']], input_prefix, int(config["inclusionListsValue"])) filters_to_apply.append(presence_filter) domains = filter_list_multiple(domains, filters_to_apply) ### OUTPUT ### if test: return domains else: # Write list to file if USE_S3: write_list_to_s3(domains, list_id) else: write_list_to_file(domains, list_id) # If the list is the daily default list, also generate a zip of the top 1M and copy to permanent URL try: if "isDailyList" in config and config["isDailyList"] is True: if USE_S3: write_zip_to_s3(domains[:1000000], list_id) copy_daily_list_s3(list_id) else: write_zip_to_file(domains[:1000000], list_id) copy_daily_list_file(list_id) except: print("Zip creation failed") traceback.print_exc() # Update generation success in database db["lists"].update_one({"_id": db_id}, {"$set": {"finished": True, "failed": False, "list_id": list_id}}) time.sleep(1) # Report success return True except: traceback.print_exc() # Update generation failure in database db["lists"].update_one({"_id": db_id}, {"$set": {"finished": True, "failed": True}}) # Report failure return False ================================================ FILE: generate_daily_list.py ================================================ import datetime import sys from redis import Redis from rq import Queue import combined_lists from shared import DATE_FORMAT_WITH_HYPHEN, DEFAULT_TRANCO_CONFIG def get_date_interval_bounds(start_date, end_date, nb_days, nb_days_from): if start_date: start_date_dt = datetime.datetime.strptime(start_date, DATE_FORMAT_WITH_HYPHEN) return ( start_date, (start_date_dt + datetime.timedelta(days=int(nb_days) - 1)).strftime(DATE_FORMAT_WITH_HYPHEN)) elif end_date: end_date_dt = datetime.datetime.strptime(end_date, DATE_FORMAT_WITH_HYPHEN) return ((end_date_dt - datetime.timedelta(days=int(nb_days) - 1)).strftime(DATE_FORMAT_WITH_HYPHEN), end_date) def generate_todays_lists(day): print("Generating lists for {}...".format(day)) config = DEFAULT_TRANCO_CONFIG.copy() if day == "yesterday": date = (datetime.datetime.utcnow() - datetime.timedelta(days=1)).strftime(DATE_FORMAT_WITH_HYPHEN) elif day == "today": date = datetime.datetime.utcnow().strftime(DATE_FORMAT_WITH_HYPHEN) else: raise ValueError config["startDate"], config["endDate"] = get_date_interval_bounds(None, date, 30, "end") config["isDailyList"] = True print("Generating list...") list_id = combined_lists.config_to_list_id(config) print("Generating list ID {}...".format(list_id)) if not combined_lists.list_available(list_id): conn = Redis('localhost', 6379) generate_queue = Queue('generate', connection=conn, default_timeout="1h") if list_id not in generate_queue.job_ids: generate_queue.enqueue(combined_lists.generate_combined_list, args=(config, list_id), job_id=str(list_id), timeout="1h") print("Submitted job for list ID {}".format(list_id)) if __name__ == '__main__': day = "yesterday" if len(sys.argv) > 1: day = sys.argv[1] generate_todays_lists(day) ================================================ FILE: generate_domain_parts.py ================================================ import csv import sys import tldextract def generate_parts_list(input_fp, output_fp): print(input_fp) print(output_fp) with open(output_fp, 'w', encoding='UTF-8') as output_file: output = csv.writer(output_file) with open(input_fp, encoding='UTF-8') as input_file: for l in input_file: rank, fqdn = l.rstrip('\n').split(",") ext = tldextract.extract(fqdn) pld = ext.registered_domain is_pld = pld == fqdn ps = ext.suffix tld = fqdn[fqdn.rfind(".") + 1:] sld = ext.domain subd = ext.subdomain output.writerow([rank, fqdn, pld, sld, subd, ps, tld, is_pld]) if __name__ == '__main__': input_fp = sys.argv[1] output_fp = "/".join(input_fp.split("/")[:-1]) + "/parts/" + input_fp.split("/")[-1][:-4] + "_parts.csv" generate_parts_list(input_fp, output_fp) ================================================ FILE: global_config.py ================================================ NETAPP_STORAGE_PATH = None # File-based archive MAILGUN_API_KEY = None # API key for sending email notifications TOPLISTS_ARCHIVE_S3_BUCKET = None # S3 bucket with archived rankings TOPLISTS_GENERATED_LIST_S3_BUCKET = None # S3 bucket with generated lists TOPLISTS_DAILY_LIST_S3_BUCKET = None # S3 bucket with daily default lists MONGO_URL = None # Mongo instance for storing configurations of lists USE_S3 = None # Boolean indicating whether to use AWS services GENERATION_REMOTE = None # Boolean indicating whether list generation is handled remotely GENERATION_REMOTE_ENDPOINT = None # Endpoint accepting list generation jobs JOB_SERVER_PORT = None # Port of server accepting list generation jobs ================================================ FILE: job_handler.py ================================================ import functools from redis import Redis from rq import Queue from rq.registry import StartedJobRegistry import combined_lists import notify_email class JobHandler: """ Manage list generation run on this machine. """ def __init__(self, asyncio_loop): self.loop = asyncio_loop self.setup_job_queues() def setup_job_queues(self): """ Setup rq queues for submitting list generation and email notification jobs. """ self.conn = Redis('localhost', 6379) self.generate_queue = Queue('generate', connection=self.conn, default_timeout="1h") self.email_queue = Queue('notify_email', connection=self.conn) async def submit_generate_job(self, config, list_id): """ Submit a new job for generating a list (with the given config) """ if list_id not in await self.loop.run_in_executor(None, self.current_jobs): await self.loop.run_in_executor(None, functools.partial(self.generate_queue.enqueue, combined_lists.generate_combined_list, args=(config, list_id), job_id=str(list_id), timeout="1h")) return True else: return False async def submit_email_job(self, email_address, list_id, list_size): """ Submit a new job for sending an email once a list has been generated """ generate_job = await self.loop.run_in_executor(None, self.generate_queue.fetch_job, list_id) await self.loop.run_in_executor(None, functools.partial(self.email_queue.enqueue, notify_email.send_notification_mailgun_api, email_address, list_id, list_size, depends_on=generate_job)) return True def current_jobs(self): """ Track currently active and queued jobs """ registry = StartedJobRegistry(queue=self.generate_queue) jobs = registry.get_job_ids() + self.current_jobs() return jobs def jobs_ahead_of_job(self, list_id): """ Count number of jobs ahead of current job """ jobs = self.current_jobs() if list_id in jobs: return jobs.index(list_id) else: return 0 async def get_job_status(self, list_id): """ Get current status of a job """ job_success = await self.loop.run_in_executor(None, self.get_job_success, list_id) jobs_ahead = await self.loop.run_in_executor(None, self.jobs_ahead_of_job, list_id) return {"completed": job_success is not None, "jobs_ahead": jobs_ahead, "success": job_success} def get_job_success(self, list_id): """ Get current rq status of a job """ return self.generate_queue.fetch_job(list_id).result class JobHandlerRemote: """ Manage relaying jobs to a remote machine that generates lists. """ def __init__(self, asyncio_loop, endpoint=None, session=None): """ :param asyncio_loop: :param endpoint: remote location that generates lists :param session: client session for aiohttp """ if not endpoint or not session: raise ValueError self.endpoint = endpoint self.session = session async def submit_generate_job(self, config, list_id): """ Submit a new job for generating a list (with the given config) """ async with self.session.post("{}/submit_generate".format(self.endpoint), json={"config": config, "list_id": list_id}) as response: jsn = await response.json() return jsn["success"] async def submit_email_job(self, email_address, list_id, list_size): """ Submit a new job for sending an email once a list has been generated """ async with self.session.post("{}/submit_email".format(self.endpoint), json={"email_address": email_address, "list_id": list_id, "list_size": list_size}) as response: jsn = await response.json() return jsn["success"] async def get_job_status(self, list_id): """ Get current status of a job """ async with self.session.get("{}/job_status".format(self.endpoint), params={"list_id": list_id}) as response: jsn = await response.json() return jsn async def retrieve_list(self, list_id, slice_size): """ Retrieve the contents of a remotely generated list """ async with self.session.get("{}/retrieve_list".format(self.endpoint), json={"list_id": list_id, "slice_size": slice_size}) as response: while True: chunk = await response.content.read(1024) if not chunk: break yield chunk ================================================ FILE: job_server.py ================================================ import asyncio import aitertools from aiohttp import web import combined_lists import job_handler from global_config import JOB_SERVER_PORT class JobServer: """ Job server for accepting requests for generating a custom Tranco list (hosted on remote machine) """ def __init__(self, loop): self.web_app = None self.server = None self.runner = None self.routes = web.RouteTableDef() self.loop = loop self.job_handler: job_handler.JobHandler = None async def submit_generate_job(self, request): """ Submit a new job for generating a list (with the given config) """ post_data = await request.json() print("Generating ", post_data) result = await self.job_handler.submit_generate_job(post_data["config"], post_data["list_id"]) return web.json_response({"success": result}) async def submit_email_job(self, request): """ Submit a new job for sending an email once a list has been generated """ post_data = await request.json() result = await self.job_handler.submit_email_job(post_data["email_address"], post_data["list_id"], post_data["list_size"]) return web.json_response({"success": result}) async def get_job_status(self, request): """ Get current status of a job """ list_id = request.query['list_id'] print("Getting status for ", list_id) return web.json_response(await self.job_handler.get_job_status(list_id)) async def retrieve_list(self, request): """ Retrieve the contents of a remotely generated list """ post_data = await request.json() list_id = post_data["list_id"] slice_size = post_data["slice_size"] file_path = await self.loop.run_in_executor(None, combined_lists.get_generated_list_fp, list_id) async def generator(): with open(file_path) as csvf: async for line in aitertools.islice(csvf, slice_size): yield line.encode("utf-8") return web.Response(body=generator(), content_type="text/csv", charset="utf-8", ) async def initialize_routes(self): self.web_app.add_routes([ web.post('/submit_generate', self.submit_generate_job), web.post('/submit_email', self.submit_email_job), web.get('/job_status', self.get_job_status), web.get('/retrieve_list', self.retrieve_list) ]) async def run(self): self.job_handler = job_handler.JobHandler(self.loop) self.web_app = web.Application() await self.initialize_routes() self.runner = web.AppRunner(self.web_app) await self.runner.setup() self.server = web.TCPSite(self.runner, '0.0.0.0', JOB_SERVER_PORT) await self.server.start() if __name__ == '__main__': loop = asyncio.get_event_loop() server = JobServer(loop) loop.run_until_complete(server.run()) loop.run_forever() ================================================ FILE: notify_email.py ================================================ import smtplib from email.message import EmailMessage import email.utils import requests from rq import Queue, Connection, get_current_connection from global_config import MAILGUN_API_KEY def send_notification_mailgun_api(email_address, list_id, list_size): with Connection(get_current_connection()): q = Queue('generate') job = q.fetch_job(list_id) success = job.result if success: subject = 'The Tranco list: generation succeeded' body = "Hello,\n\nWe have successfully generated your requested Tranco list with ID {}. You may retrieve it at https://tranco-list.eu/list/{}/{}\n\nTranco\nhttps://tranco-list.eu/".format(list_id, list_id, list_size) else: subject = 'The Tranco list: generation failed' body = "Hello,\n\nUnfortunately, we were currently unable to generate your requested Tranco list with ID {}. Please try again later.\n\nTranco\nhttps://tranco-list.eu/".format(list_id) r = requests.post( "https://api.eu.mailgun.net/v3/mg.tranco-list.eu/messages", auth=("api", MAILGUN_API_KEY), data={"from": "Tranco ", "to": [email_address], "subject": subject, "text": body}) return int(r.status_code) == 200 ================================================ FILE: requirements.txt ================================================ boto3 smart_open hashids pymongo redis rq aiohttp aitertools ================================================ FILE: shared.py ================================================ DATE_FORMAT_WITH_HYPHEN = "%Y-%m-%d" DEFAULT_TRANCO_CONFIG = {"nbDays": "30", "nbDaysFrom": "end", "combinationMethod": "dowdall", # TODO make choice based on assessment on stability etc. "listPrefix": 'full', "includeDomains": 'all', # TODO make choice "filterPLD": "on", "providers": ["alexa", "umbrella", "majestic", "quantcast"] } ZIP_FILENAME_FORMAT = "tranco_{}-1m.csv.zip"