[
  {
    "path": ".gdbinit",
    "content": "\nset history save on\n\n"
  },
  {
    "path": ".gitignore",
    "content": "\ndeps/*\ntmp/*\nsrc/*.o\ntests/*.o\nbuild/*\ndocumentation/*.aux\ndocumentation/*.log\ndocumentation/statues_swe.pdf\n.DS_Store\nconfig/config.h\nresponse.txt\ncc_parser.zip\ncc_parser\ncc_indexer.zip\ncc_indexer\ncc_api.zip\ncc_api\ncc_full_text.zip\ncc_full_text\nrun_tests\nCMakeCache.txt\nCMakeFiles\nCMakeScripts\nMakefile\ncmake_install.cmake\nwarc.paths\n.vscode\n.gdb_history\n*~\n*.swp\n*.swo\n\n"
  },
  {
    "path": "CMakeLists.txt",
    "content": "\nset(CMAKE_BUILD_TYPE Release)\n#set(CMAKE_BUILD_TYPE Debug)\n\ncmake_minimum_required(VERSION 3.5)\nset(CMAKE_C_COMPILER /usr/bin/gcc-10)\nset(CMAKE_CXX_COMPILER /usr/bin/g++-10)\nset(CMAKE_CXX_STANDARD 20)\nset(CMAKE_CXX_FLAGS_RELEASE \"-O3\")\nset(CMAKE_CXX_FLAGS_DEBUG \"-g\")\nset(THREADS_PREFER_PTHREAD_FLAG ON)\nproject(alexandria LANGUAGES CXX)\n\nadd_definitions(-Wfatal-errors)\n\nlist(APPEND CMAKE_MODULE_PATH \"${CMAKE_CURRENT_LIST_DIR}/cmake\")\n\nadd_subdirectory(\"deps/abseil-cpp\")\n\nfind_package(roaring REQUIRED)\nfind_package(Threads REQUIRED)\nFIND_PACKAGE(CURL REQUIRED)\nfind_package(Boost REQUIRED COMPONENTS system iostreams filesystem unit_test_framework)\nfind_package(ZLIB)\nfind_package(fcgi)\n\ninclude_directories(src/)\ninclude_directories(deps/)\ninclude_directories(tests/)\n\nset(SRC_CLASSES\n\n\t\"src/url_link/link.cpp\"\n\t\"src/api/result_with_snippet.cpp\"\n\t\"src/api/api_response.cpp\"\n\t\n\t\"src/file/file.cpp\"\n\t\"src/file/archive.cpp\"\n\t\"src/file/tsv_file.cpp\"\n\t\"src/file/gz_tsv_file.cpp\"\n\t\"src/file/tsv_file_remote.cpp\"\n\t\"src/file/tsv_row.cpp\"\n\n\t\"src/transfer/transfer.cpp\"\n\n\t\"src/hash_table2/hash_table.cpp\"\n\t\"src/hash_table2/hash_table_shard.cpp\"\n\t\"src/hash_table2/hash_table_shard_builder.cpp\"\n\t\"src/hash_table2/builder.cpp\"\n\n\t\"src/hash_table_helper/hash_table_helper.cpp\"\n\n\t\"src/parser/parser.cpp\"\n\t\"src/parser/entities.cpp\"\n\t\"src/parser/html_link.cpp\"\n\t\"src/parser/html_parser.cpp\"\n\t\"src/parser/unicode.cpp\"\n\t\"src/parser/cc_parser.cpp\"\n\n\t\"src/downloader/warc_downloader.cpp\"\n\t\"src/downloader/merge_downloader.cpp\"\n\n\t\"src/URL.cpp\"\n\n\t\"src/warc/warc.cpp\"\n\n\t\"src/profiler/profiler.cpp\"\n\n\t\"src/logger/logger.cpp\"\n\n\t\"src/utils/thread_pool.cpp\"\n\n\t\"src/memory/memory.cpp\"\n\t\"src/memory/debugger.cpp\"\n\n\t\"src/config.cpp\"\n\n\t\"src/algorithm/algorithm.cpp\"\n\t\"src/algorithm/intersection.cpp\"\n\t\"src/algorithm/sort.c
pp\"\n\t\"src/algorithm/hash.cpp\"\n\t\"src/algorithm/hyper_log_log.cpp\"\n\t\"src/algorithm/bloom_filter.cpp\"\n\n\t\"src/tools/splitter.cpp\"\n\t\"src/tools/find_links.cpp\"\n\t\"src/tools/counter.cpp\"\n\t\"src/tools/calculate_harmonic.cpp\"\n\t\"src/tools/generate_url_lists.cpp\"\n\n\t\"src/cluster/document.cpp\"\n\t\"src/scraper/scraper.cpp\"\n\t\"src/scraper/scraper_store.cpp\"\n\n\t\"src/indexer/index_manager.cpp\"\n\t\"src/indexer/console.cpp\"\n\t\"src/indexer/merger.cpp\"\n\t\"src/indexer/score_builder.cpp\"\n\t\"src/indexer/index_reader.cpp\"\n\t\"src/indexer/index_utils.cpp\"\n\n\t\"src/server/search_server.cpp\"\n\t\"src/server/url_server.cpp\"\n\n\t\"src/http/server.cpp\"\n\t\"src/http/request.cpp\"\n\n\t\"src/domain_stats/domain_stats.cpp\"\n\t\"src/debug.cpp\"\n\n\t\"deps/robots.cc\"\n)\n\nset(SRC_COMMON\n\t\"src/common/dictionary.cpp\"\n\t\"src/common/system.cpp\"\n\t\"src/common/datetime.cpp\"\n\t\"src/common/dictionary_row.cpp\"\n\t\"src/text/stopwords.cpp\"\n\t\"src/text/text.cpp\"\n)\n\nset(SRC_TESTS\n\t\"tests/test_hyper_log_log.cpp\"\n\t\"tests/test_memory.cpp\"\n\t\"tests/test_algorithm.cpp\"\n\t\"tests/test_bloom_filter.cpp\"\n\t\"tests/test_cc_parser.cpp\"\n\t\"tests/test_configuration.cpp\"\n\t\"tests/test_counted_index_builder.cpp\"\n\t\"tests/test_datetime.h\"\n\t\"tests/test_file.cpp\"\n\t\"tests/test_hash.cpp\"\n\t\"tests/test_hash_table.cpp\"\n\t\"tests/test_html_parser.cpp\"\n\t\"tests/test_hyper_ball.cpp\"\n\t\"tests/test_index_builder.cpp\"\n\t\"tests/test_index_iteration.cpp\"\n\t\"tests/test_index_reader.cpp\"\n\t\"tests/test_logger.cpp\"\n\t\"tests/test_n_gram.cpp\"\n\t\"tests/test_robot_parser.cpp\"\n\t\"tests/test_scraper.cpp\"\n\t\"tests/test_sharded_index_builder.cpp\"\n\t\"tests/test_sort.cpp\"\n\t\"tests/test_sum_sorted.cpp\"\n\t\"tests/test_text.cpp\"\n\t\"tests/test_thread_pool.cpp\"\n\t\"tests/test_top_k.cpp\"\n\t\"tests/test_unicode.cpp\"\n\t\"tests/test_url.cpp\"\n\t\"tests/test_url_record.cpp\"\n\n\t# This overloads 
the new/delete operators to keep track of memory, slows things down a lot.\n\t\"src/memory/overload.cpp\"\n)\n\nadd_executable(run_tests\n\t\"tests/main.cpp\"\n\t${SRC_CLASSES}\n\t${SRC_COMMON}\n\t${SRC_TESTS}\n)\nadd_executable(server\n\t\"src/server.cpp\"\n\t${SRC_CLASSES}\n\t${SRC_COMMON}\n)\nadd_executable(scraper\n\t\"src/scraper.cpp\"\n\t${SRC_CLASSES}\n\t${SRC_COMMON}\n)\nadd_executable(indexer\n\t\"src/indexer.cpp\"\n\t${SRC_CLASSES}\n\t${SRC_COMMON}\n)\nadd_executable(alexandria\n\t\"src/alexandria.cpp\"\n\t${SRC_CLASSES}\n\t${SRC_COMMON}\n)\n\ntarget_compile_definitions(run_tests PUBLIC IS_TEST)\ntarget_compile_definitions(run_tests PUBLIC FT_NUM_SHARDS=16)\ntarget_compile_definitions(run_tests PUBLIC HT_NUM_SHARDS=16)\ntarget_compile_definitions(run_tests PUBLIC FILE_SERVER=\"http://127.0.0.1\")\ntarget_compile_definitions(run_tests PUBLIC COMPILE_WITH_LINK_INDEX)\n\ntarget_compile_options(run_tests PUBLIC -Wall -Werror)\ntarget_compile_options(server PUBLIC -Wall -Werror)\ntarget_compile_options(scraper PUBLIC -Wall -Werror)\ntarget_compile_options(indexer PUBLIC -Wall -Werror)\ntarget_compile_options(alexandria PUBLIC -Wall -Werror)\n\ntarget_link_libraries(run_tests PUBLIC\n\t${FCGI_LIBRARY}\n\t${FCGI_LIBRARYCPP}\n\t${CURL_LIBRARIES}\n\t${Boost_LIBRARIES} ZLIB::ZLIB Threads::Threads absl::strings absl::numeric roaring::roaring)\ntarget_link_libraries(server PUBLIC\n\t${FCGI_LIBRARY}\n\t${FCGI_LIBRARYCPP}\n\t${CURL_LIBRARIES}\n\t${Boost_LIBRARIES} ZLIB::ZLIB Threads::Threads absl::strings absl::numeric roaring::roaring)\ntarget_link_libraries(scraper PUBLIC\n\t${FCGI_LIBRARY}\n\t${FCGI_LIBRARYCPP}\n\t${CURL_LIBRARIES}\n\t${Boost_LIBRARIES} ZLIB::ZLIB Threads::Threads absl::strings absl::numeric roaring::roaring)\ntarget_link_libraries(indexer PUBLIC\n\t${FCGI_LIBRARY}\n\t${FCGI_LIBRARYCPP}\n\t${CURL_LIBRARIES}\n\t${Boost_LIBRARIES} ZLIB::ZLIB Threads::Threads absl::strings absl::numeric roaring::roaring)\ntarget_link_libraries(alexandria 
PUBLIC\n\t${FCGI_LIBRARY}\n\t${FCGI_LIBRARYCPP}\n\t${CURL_LIBRARIES}\n\t${Boost_LIBRARIES} ZLIB::ZLIB Threads::Threads absl::strings absl::numeric roaring::roaring)\n"
  },
  {
    "path": "Dockerfile",
    "content": "# syntax=docker/dockerfile:1\nFROM ubuntu:latest\nARG DEBIAN_FRONTEND=noninteractive\nRUN apt-get update && apt-get install -y zip make cmake gcc gcc-10 g++ g++-10 libcurl4-openssl-dev libssl-dev libcrypto++-dev libboost-iostreams-dev libboost-filesystem-dev libboost-system-dev libboost-test-dev libfcgi-dev spawn-fcgi nginx vim wget git curl\n"
  },
  {
    "path": "LICENSE",
    "content": "MIT License\n\nAlexandria.org\n\nCopyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n"
  },
  {
    "path": "README.md",
    "content": "# Alexandria.org\n\n1. [Coding Rules](/documentation/coding_rules.md)\n2. [Full text indexes](/documentation/full_text_indexes.md)\n3. [Hash table](/documentation/hash_table.md)\n\n## Build instructions with docker\n1. Checkout repo\nWINDOWS USERS: You need to run 'git config --global core.autocrlf false' before checking out the repository\n```\ngit clone git@github.com:alexandria-org/alexandria.git\n```\n2. Build docker image\n```\ndocker build . -t alexandria\n```\n3. Run container\n```\ndocker container run --name alexandria -v ${PWD}:/alexandria -it -d alexandria\n```\n4. Attach to container.\n```\ndocker exec -it alexandria /bin/bash\n```\n5. Navigate to directory\n```\ncd /alexandria\n```\n6. Initialize docker\n```\nscripts/init-docker.sh\n```\n7. Configure with cmake\n```\nmkdir build; cd build; cmake ..\n```\n8. Build all\n```\nmake -j4\n```\n9. Run test suite\n```\n./run_tests\n```\n\n## How to build manually (not recommended)\n1. Configure the system (Tested on Ubuntu 20.04)\n```\n# Will alter your system and install dependencies with apt.\n./scripts/install-deps.sh\n\n# Will download and build zlib, aws-lambda-cpp and aws-sdk-cpp will only alter the local directory.\n./scripts/build-deps.sh\n```\n\n2. Build with cmake\n```\nmkdir build\ncd build\n\ncmake .. -DCMAKE_BUILD_TYPE=Debug\nor\ncmake .. -DCMAKE_BUILD_TYPE=Release\n\nmake -j24\n```\n\n3. Download test data to local server.\nTo run the test suite you need to install nginx and pre-download all the data: [Configure local nginx test data server](/documentation/configure_local_nginx.md)\n\n4. Create output directories. Note, this will create a bunch of directories in the /mnt so make sure you don't have anything there.\n```\n./scripts/prepare-output-dirs.sh\n```\n\n5. 
Run the test suite\n```\ncd build\nmake run_tests -j24\n./run_tests\n```\n\n## Notes\nOn nodes with spinning disks we should turn off energy saving:\n```\nhdparm -B 255 /dev/sda\n```\n\n## Debugging notes\n### Debugging scraper with gdb:\nBy default, gdb captures SIGPIPE of a process and pauses it. However, some programs ignore SIGPIPE. So, the default behaviour of gdb is not desired when debugging those programs. To avoid gdb stopping on SIGPIPE, use the following command in gdb:\n```handle SIGPIPE nostop noprint pass```\n"
  },
  {
    "path": "cmake/Findfcgi.cmake",
    "content": "# CMake module to search for FastCGI headers\n#\n# If it's found it sets FCGI_FOUND to TRUE\n# and following variables are set:\n#    FCGI_INCLUDE_DIR\n#    FCGI_LIBRARY\nFIND_PATH(FCGI_INCLUDE_DIR\n  fcgio.h\n  PATHS\n  /usr/include\n  /usr/local/include\n  /usr/include/fastcgi\n  \"$ENV{LIB_DIR}/include\"\n  $ENV{INCLUDE}\n  )\n\nFIND_LIBRARY(FCGI_LIBRARY NAMES fcgi libfcgi PATHS \n  /usr/local/lib \n  /usr/lib \n  \"$ENV{LIB_DIR}/lib\"\n  \"$ENV{LIB}\"\n  )\nFIND_LIBRARY(FCGI_LIBRARYCPP NAMES libfcgi++.so PATHS \n  /usr/local/lib \n  /usr/lib \n  \"$ENV{LIB_DIR}/lib\"\n  \"$ENV{LIB}\"\n  )\n\nIF (FCGI_INCLUDE_DIR AND FCGI_LIBRARY)\n   SET(FCGI_FOUND TRUE)\nENDIF (FCGI_INCLUDE_DIR AND FCGI_LIBRARY)\n\nIF (FCGI_FOUND)\n   IF (NOT FCGI_FIND_QUIETLY)\n      MESSAGE(STATUS \"Found FCGI: ${FCGI_LIBRARY}\")\n      MESSAGE(STATUS \"Found FCGI: ${FCGI_LIBRARYCPP}\")\n   ENDIF (NOT FCGI_FIND_QUIETLY)\nELSE (FCGI_FOUND)\n   IF (FCGI_FIND_REQUIRED)\n      MESSAGE(FATAL_ERROR \"Could not find FCGI\")\n   ENDIF (FCGI_FIND_REQUIRED)\nENDIF (FCGI_FOUND)\n"
  },
  {
    "path": "config.conf",
    "content": "\n# Cluster config\nnodes_in_cluster = 3\nnode_id = 0\n\n# Indexer config\nbatches[] = ALEXANDRIA-MANUAL-01\nbatches[] = CC-MAIN-2021-25\nbatches[] = CC-MAIN-2021-31\n\nlink_batches[] = CC-MAIN-2021-31\nlink_batches[] = CC-MAIN-2021-25\nlink_batches[] = CC-MAIN-2021-21\nlink_batches[] = CC-MAIN-2021-17\nlink_batches[] = CC-MAIN-2021-10\nlink_batches[] = CC-MAIN-2021-04\nlink_batches[] = CC-MAIN-2020-50\nlink_batches[] = CC-MAIN-2020-45\n\n# Server config\nworker_count = 8\nquery_max_words = 10 # Maximum number of words used in query.\nquery_max_len = 200\ndeduplicate_domain_count = 5\npre_result_limit = 200000\nresult_limit = 1000\n\n# Full text config\nft_max_sections = 4\nft_max_results_per_section = 2000000\n\n\n"
  },
  {
    "path": "documentation/alexandria.md",
    "content": "Usage: ./alexandria [OPTIONS]...\n\n## Options\n\n**--downloader [commoncrawl-batch] [limit] [offset]**\n\nDownloads files from the given commoncrawl batch. Limit and offset arguments are used for downloading a subset of the files. Example\n```\n./alexandria --downloader CC-MAIN-2022-27 2500 0\n```\nWill download the first 2500 files from CC-MAIN-2022-27 and upload them to the 'upload' host. See config documentation.\n\n**--downloader-merge**\n\nMerges downloaded files. This should run on the upload host to merge the different downloaded batches into our hash table.\n\n**--hash-table-url [URL]**\n\nSearches the local hash table called 'all_urls' for the given URL.\n\n**--hash-table-url-hash [URL-hash]**\n\nSearches the local hash table called 'all_urls' for the given URL-hash.\n\n**--hash-table-count**\n\nCounts all items in local hash table called 'all_urls'.\n\n**--hash-table-find-all [HOST]**\n\nSearches the local hash table called 'all_urls' for urls from specified host. This takes several days for large hash table.\n\n**--hash-table-count [HOST]**\n\nEstimated count of host from hash table by only counting one shard and multiply by number of shards.\n\n**--hash-table-optimize-shard [SHARD]**\n\nOptimizes shard for local hash table called 'all_urls'.\n\n**--internal-harmonic**\n\nRun the whole internal links harmonic calculator. Should run on 'upload' host.\n"
  },
  {
    "path": "documentation/api_response_format.md",
    "content": "# Api Response Format\n\nThis is a description of the endpoints available on a node.\n\n### Perform search\n```\ncurl http://node0002.alexandria.org/?q=the%20beatles\n{\n  \"status\":\t\"success\",\n  \"time_ms\":\t35.876,\n  \"total_found\":\t245436,\n  \"total_url_links_found\":\t4092,\n  \"total_domain_links_found\":\t4092,\n  \"links_handled\":\t674,\n  \"link_domain_matches\":\t18059,\n  \"link_url_matches\":\t589,\n  \"results\":\t[{\n    \"url\":\t\"https://www.example.com/\",\n    \"title\":\t\"Example dot com\",\n    \"snippet\":\t\"Lorem ipsum dolor esit\",\n    \"score\":\t182.51408386230469,\n    \"domain_hash\":\t\"2892282071861106665\",\n    \"url_hash\":\t\"2892281418178079567\"\n  }]\n}\n\nThe url flag d can be used to control deduplication:\ncurl http://node0002.alexandria.org/?q=the%20beatles&d=a\ncurl http://node0002.alexandria.org/?q=the%20beatles&d=d\n\nd=a // No deduplication, show all results\nd=d // Deduplication\nDefault value is d=d\n```\n\n### Perform url lookup\n```\ncurl http://node0002.alexandria.org/?u=https://www.example.org/\n{\n  \"status\":\t\"success\",\n  \"time_ms\":\t35.876,\n  \"response\":\t\"[DATA]\"\n}\n```\n\n### Fetch information about search result\n```\ncurl http://node0002.alexandria.org/?s=example%20query\n{\n  \"status\":\t\"success\",\n  \"time_ms\":\t13.984,\n  \"index\":\t{\n    \"total\":\t980770801,\n    \"words\":\t{\n      \"example\":\t0.0080152416772448342,\n      \"query\":\t0.0017581304401006531\n    }\n  },\n  \"link_index\":\t{\n    \"total\":\t472012858,\n    \"words\":\t{\n      \"example\":\t0.000581251114985516,\n      \"query\":\t6.3595725182554242e-05\n    }\n  }\n}\n```\n\n### Fetch status of the node.\n```\ncurl http://node0002.alexandria.org/status\n{\n  \"status\":\t\"success\",\n  \"time_ms\":\t13.984,\n  \"total_disk_space\": 89374934876,\n  \"avail_disk_space\": 83975235,\n  \"avail_disk_percent\": 0.0832,\n  \"index\":\t{\n    \"items\":\t980770801,\n    
\"full_text_disk_used\": 973295875,\n    \"full_text_disk_percent\": 0.5423,\n    \"hash_table_disk_used\": 839265,\n    \"hash_table_disk_percent\": 0.05423\n  },\n  \"link_index\":\t{\n    \"items\":\t980770801,\n    \"full_text_disk_used\": 973295875,\n    \"full_text_disk_percent\": 0.2423,\n    \"hash_table_disk_used\": 839265,\n    \"hash_table_disk_percent\": 0.0423\n  }\n}\n```\n\n### Combined api response (api.alexandria.org)\n```\ncurl https://api.alexandria.org/?q=the%20beatles&p=1\n{\n  \"status\":\t\"success\",\n  \"time_ms\":\t35.876,\n  \"total_found\":\t245436,\n  \"total_url_links_found\":\t4092,\n  \"total_domain_links_found\":\t4092,\n  \"links_handled\":\t674,\n  \"link_domain_matches\":\t18059,\n  \"link_url_matches\":\t589,\n  \"page_max\": 10,\n  \"results\":\t[{\n    \"url\":\t\"https://www.example.com/\",\n    \"display_url\": \"https://www.example.com/\",\n    \"title\":\t\"Example dot com\",\n    \"snippet\":\t\"Lorem ipsum dolor esit\",\n    \"score\":\t182.51408386230469,\n    \"domain_hash\":\t\"2892282071861106665\",\n    \"url_hash\":\t\"2892281418178079567\",\n    \"exact_match\": 1,\n    \"phrase_match\": 1,\n    \"year\": 3300,\n    \"is_old\": 0,\n    \"is_subdomain\": 0,\n    \"domain\": \"www.example.com\"\n  },\n  ...\n  ]\n}\n```\n"
  },
  {
    "path": "documentation/caching.md",
    "content": "## Caching\n\nOur nodes should try to use as much RAM as possible to store index data for common tokens in RAM. I think the best way would be to hold a list of the most commonly queried tokens.\n\nWe can use /proc/meminfo to retrieve information about available memory on the server.\n"
  },
  {
    "path": "documentation/coding_rules.md",
    "content": "\n## Coding rules\n1. Indent with tabs.\n2. Use auto for variable declarations when possible.\n3. Never put \"using namespace std\" in any file.\n4. Prefix class member variables with m_, this way you know you are using a member or local variable.\n5. All namespaces, classes, functions and variables should be lower_case.\n6. All files within a sub-directory must declare everything within a namespace with the same name as the directory. For example src/file/tsv_file.h must declare everything within the namespace file::\n7. Prefer smart pointers over regular pointers.\n8. Prefer if statements over switch statements.\n\n## Indentation examples\n\nIndent with tabs!\n\n### pointers\n```c++\n// * and & are glued to the variable\nint *ptr = new int[100];\nint *ptr2 = &addr;\n```\n\n### operators\n```c++\n// Spaces between binary operators\nint a = 1 + 2;\nint b = multiple * (add1 + add2);\na += b;\n\n// Unary operators are glued to variable\nint a = 1;\na++;\nint b = -a;\n```\n\n### functions\n```c++\n// Spaces after comma\nint add(int a, int b) {\n    return a + b;\n}\n\n// Spaces after comma here too\nadd(123, 333);\n```\n\n### classes\n```c++\ntemplate<typename data_record>\nclass index_builder {\n    public:\n        index_builder(const std::string &db_name, size_t id);\n        int public_func();\n\n    private:\n        int m_member;\n        int m_counter;\n\n        int private_func();\n};\n```\n\n\n### if\n```c++\n// Space between \"if\" and \"(\"\n// Space between \")\" and \"{\"\nif (something) {\n    do_something();\n} else if (something_else) {\n    do_something_else();\n} else {\n    do_else();\n}\n```\n\n### loops\n```c++\n// Prefer range based loops.\nfor (const auto &iter : m_map) {\n\n}\n// But if you need a standard loop indent it like this.\nfor (int i = 0; i < 100; i++) {\n\n} \n```\n\n### memory allocation\n```c++\n// Avoid new/delete, use smart pointers everywhere.\n// If you just need a regular pointer to memory do 
this:\nstd::unique_ptr<char[]> allocator;\ntry {\n    allocator = std::make_unique<char[]>(1000);\n} catch (std::bad_alloc &error) {\n    // Handle allocation error.\n}\n\nchar *ptr = allocator.get();\n\n// Use ptr as regular pointer to 1000 chars.\n// ptr will be deleted automatically when allocator goes out of scope.\n```\n\n\n\n"
  },
  {
    "path": "documentation/configure_local_nginx.md",
    "content": "# Configure local nginx server.\n\n1. Install nginx\n```\napt-get install nginx\n```\n\n2. Add configuration to /etc/nginx/sites-available/default (If you are running other sites locally you should probably do something else here)\n```\nserver {\n\tlisten 80 default_server;\n\tlisten [::]:80 default_server;\n\n\troot /var/www/html/node0003.alexandria.org;\n\n\tindex index.html index.htm index.nginx-debian.html;\n\n\tserver_name _;\n\n\tlocation / {\n\t\ttry_files $uri $uri/ =404;\n\t\tautoindex on;\n\t}\n}\n```\n\n3. Download test data to /var/www/html\n```\n./scripts/download-test-data.sh /var/www/html\n```\n\n"
  },
  {
    "path": "documentation/full_text_indexes.md",
    "content": "# The alexandria full text index\n\nA full text index in its simplest form is a hash map from an integer word id ```key``` to a list of documents.\n\nThere are two kinds of data structures called ```index``` and ```counted_index```. Both data structures act on a given template type\n```data_record```.\nThe two data structures share the same data layout except for the last part where ```index``` stores roaring bitmaps while `counted_index` stores the records.\n\n## Data layout\n\nThe index starts with a hash table. The hash table stores the position for the page containing `key` at index `key % hash_table_size`.\n\n```\nhash table        : uint64_t[hash_table_size] (8 x hash_table_size bytes)\nnum_records       : uint64_t (8 bytes)\nlist of records   : data_record[num_records] (sizeof(data_record) * num_records bytes)\nconsecutive pages : page[varying] (undetermined size)\n```\n\nA single page consists of a list of keys. Each key then has a corresponding position among the bitmaps and a length of the bitmap. The bitmaps (of varying length) are then stored consecutively.\n```\nnum_keys             : uint64_t (8 bytes)\nlist of keys         : uint64_t[num_keys] (8 x num_keys bytes)\nlist of positions    : uint64_t[num_keys] (8 x num_keys bytes)\nlist of lengths      : uint64_t[num_keys] (8 x num_keys bytes)\nconsecutive bitmaps  : bitmap[num_keys] (undetermined size)\n```\n\n\n\n"
  },
  {
    "path": "documentation/ideas.md",
    "content": "# Similar words\nTo handle similar words (saluhall, saluhallen) we should create a hashtable with similar words and as an additional index create \"saluhall+\" by combining our existing indexes of saluhall, saluhallen, saluhallarna etc. into one additional index.\n\n# Autocomplete\nWe should base our autocomplete on the most common words in titles of documents before and after each word. For example \"Uppsala\" could suggest \"Uppsala kommun\", \"Uppsala universitet\" and \"Destination Uppsala\" based on the search results.\n"
  },
  {
    "path": "documentation/index_file_format.md",
    "content": "# Index file format\n\n```\n8 bytes number of keys (n)\n8 * n bytes keys\n8 * n bytes positions\n8 * n bytes lengths (len(k) number of records for key k)\n8 * n bytes total found results\n[Data Records]\n```\n\n```\nData records are structured like this:\nlen(k) * (8 bytes unsigned long URL id, 4 bytes single precision float score)\n```\n"
  },
  {
    "path": "documentation/indexer.md",
    "content": "### NAME\n\nindexer - manually index data or analyze things\n\n### SYNOPSIS\n\nindexer [OPTION]\n\n### DESCRIPTION\n```\n\t--split source_batch target_prefix\n\t\tsplits the urls in the local source batch and outputs them into {target_prefix}-[0-23]/files.\n\t\tfor example --split CC-MAIN-2021-04 /mnt/crawl-data/NODE\n\t--split-count\n\t--split-count-domains\n\t--split-count-links\n\t--split-make-scraper-urls\n\n\t--tools-download-batch\n\t--tools-upload-urls-with-links\n\t--tools-find-links\n\n\t--calculate-harmonic-hosts\n\t--calculate-harmonic-links\n\t--calculate-harmonic\n\n\t--host-hash\n\t--host-hash-mod\n\n\t--console\n\t\trun the interactive console for making debug searches.\n\n\t--index-domans BATCH LIMIT OFFSET\n\t\trun the indexer for our domain index adding the urls+data from BATCH\n\t--index-links BATCH LIMIT OFFSET\n\t\trun the link indexer adding url_ and domain_ links from BATCH\n\t--index-words BATCH LIMIT OFFSET\n\t\trun the word indexer adding word data from BATCH\n\t--index-urls BATCH LIMIT OFFSET\n\t\trun the url indexer on batch generating one index per domain\n\t--index-snippets BATCH LIMIT OFFSET\n\t\trun the snippet indexer\n\n\t--truncate-domains\n\t--truncate-links\n\t--truncate-words\n\t--truncate-urls\n\t--truncate-snippets\n\n\t--info\n\t\tprint info about indexes\n```\n"
  },
  {
    "path": "documentation/installing_nodes.md",
    "content": "If problem with raid information on drive unmount all partitions and do this:\n```\nwipefs -a /dev/nvme1n1\n```\nthen reset and install node again.\n\nTo setup node with two drives run:\n```\nsource <(curl -s https://raw.githubusercontent.com/alexandria-org/alexandria/main/scripts/bootstrap_node_2drives.sh)\n```\n"
  },
  {
    "path": "documentation/performance_journal.md",
    "content": "## Performance journal\n\n### File system testing\nExt2 (noatime,nodiratime,barrier=0)\n```\n$ dd if=/dev/zero of=/tmp/test1.img bs=10G count=1 oflag=dsync\n0+1 records in\n0+1 records out\n2147479552 bytes (2.1 GB, 2.0 GiB) copied, 4.76649 s, 451 MB/s\n\n$ echo 3 > /proc/sys/vm/drop_caches\n\n$ time dd if=/tmp/test1.img of=/dev/null bs=8k\n262143+1 records in\n262143+1 records out\n2147479552 bytes (2.1 GB, 2.0 GiB) copied, 1.43043 s, 1.5 GB/s\n\nreal\t0m1.435s\nuser\t0m0.013s\nsys\t0m0.763s\n```\nExt2 (relatime)\n```\n$ dd if=/dev/zero of=/tmp/test1.img bs=10G count=1 oflag=dsync\n0+1 records in\n0+1 records out\n2147479552 bytes (2.1 GB, 2.0 GiB) copied, 5.02563 s, 427 MB/s\n\n$ echo 3 > /proc/sys/vm/drop_caches\n\n$ time dd if=/tmp/test1.img of=/dev/null bs=8k\n262143+1 records in\n262143+1 records out\n2147479552 bytes (2.1 GB, 2.0 GiB) copied, 1.48533 s, 1.4 GB/s\n\nreal\t0m1.490s\nuser\t0m0.046s\nsys\t0m0.604s\n```\n\nExt4 (noatime,nodiratime,barrier=0):\n```\n$ dd if=/dev/zero of=/tmp/test1.img bs=10G count=1 oflag=dsync\n0+1 records in\n0+1 records out\n2147479552 bytes (2.1 GB, 2.0 GiB) copied, 2.26469 s, 948 MB/s\n\n$ echo 3 > /proc/sys/vm/drop_caches\n\n$ time dd if=/tmp/test1.img of=/dev/null bs=8k\n262143+1 records in\n262143+1 records out\n2147479552 bytes (2.1 GB, 2.0 GiB) copied, 0.821499 s, 2.6 GB/s\n\nreal\t0m0.824s\nuser\t0m0.004s\nsys\t0m0.648s\n```\n\nExt4 (relatime):\n```\n$ dd if=/dev/zero of=/tmp/test1.img bs=10G count=1 oflag=dsync\n0+1 records in\n0+1 records out\n2147479552 bytes (2.1 GB, 2.0 GiB) copied, 2.15461 s, 997 MB/s\n\n$ echo 3 > /proc/sys/vm/drop_caches\n\n$ time dd if=/tmp/test1.img of=/dev/null bs=8k\n262143+1 records in\n262143+1 records out\n2147479552 bytes (2.1 GB, 2.0 GiB) copied, 0.822013 s, 2.6 GB/s\n\nreal\t0m0.825s\nuser\t0m0.029s\nsys\t0m0.568s\n```\n\nConclusion. 
Run ext4\n\n### Software load testing\n2021-10-06, AX61-NVME with two discs\n```\nServer Software:        nginx/1.18.0\nServer Hostname:        node0002.alexandria.org\nServer Port:            80\n\nConcurrency Level:      5\nTime taken for tests:   294.451 seconds\nComplete requests:      2000\nFailed requests:        0\nWrite errors:           0\nTotal transferred:      294262066 bytes\nHTML transferred:       293986342 bytes\nRequests per second:    6.79 [#/sec] (mean)\nTime per request:       736.127 [ms] (mean)\nTime per request:       147.225 [ms] (mean, across all concurrent requests)\nTransfer rate:          975.94 [Kbytes/sec] received\n\nConnection Times (ms)\n              min  mean[+/-sd] median   max\nConnect:       12   19  10.1     16     152\nProcessing:    16  717 461.5    652    2896\nWaiting:        0  662 431.7    587    2770\nTotal:         31  736 460.4    671    2911\n\nPercentage of the requests served within a certain time (ms)\n  50%    671\n  66%    879\n  75%   1009\n  80%   1108\n  90%   1344\n  95%   1595\n  98%   1864\n  99%   2062\n 100%   2911 (longest request)\n```\n\n2021-10-10, AX61-NVME with two discs\n```\nServer Software:        nginx/1.18.0\nServer Hostname:        node0002.alexandria.org\nServer Port:            80\n\nConcurrency Level:      5\nTime taken for tests:   328.051 seconds\nComplete requests:      2000\nFailed requests:        0\nWrite errors:           0\nTotal transferred:      255881934 bytes\nHTML transferred:       255605934 bytes\nRequests per second:    6.10 [#/sec] (mean)\nTime per request:       820.128 [ms] (mean)\nTime per request:       164.026 [ms] (mean, across all concurrent requests)\nTransfer rate:          761.73 [Kbytes/sec] received\n\nConnection Times (ms)\n              min  mean[+/-sd] median   max\nConnect:       12   52  95.6     25    1560\nProcessing:    16  767 558.9    689    3961\nWaiting:       15  638 427.9    594    2631\nTotal:         32  819 558.5    742    4113\n\nPercentage of 
the requests served within a certain time (ms)\n  50%    742\n  66%    982\n  75%   1159\n  80%   1260\n  90%   1560\n  95%   1831\n  98%   2186\n  99%   2470\n 100%   4113 (longest request)\n```\n\n2021-10-10, AX41-NVMe with four discs\n```\nServer Software:        nginx/1.18.0\nServer Hostname:        65.21.238.146\nServer Port:            80\n\nConcurrency Level:      5\nTime taken for tests:   278.694 seconds\nComplete requests:      2000\nFailed requests:        0\nWrite errors:           0\nTotal transferred:      232745432 bytes\nHTML transferred:       232469432 bytes\nRequests per second:    7.18 [#/sec] (mean)\nTime per request:       696.735 [ms] (mean)\nTime per request:       139.347 [ms] (mean, across all concurrent requests)\nTransfer rate:          815.56 [Kbytes/sec] received\n\nConnection Times (ms)\n              min  mean[+/-sd] median   max\nConnect:       12   69  98.4     35    1107\nProcessing:    14  627 698.4    454    9790\nWaiting:       14  435 346.5    368    4045\nTotal:         29  696 719.1    522   10159\n\nPercentage of the requests served within a certain time (ms)\n  50%    522\n  66%    755\n  75%    927\n  80%   1050\n  90%   1382\n  95%   1781\n  98%   2415\n  99%   3439\n 100%  10159 (longest request)\n```\n\n2021-10-10, AX41-NVMe with four discs\n```\nServer Software:        nginx/1.18.0\nServer Hostname:        65.21.238.146\nServer Port:            80\n\nConcurrency Level:      5\nTime taken for tests:   252.503 seconds\nComplete requests:      2000\nFailed requests:        0\nWrite errors:           0\nTotal transferred:      230349918 bytes\nHTML transferred:       230073780 bytes\nRequests per second:    7.92 [#/sec] (mean)\nTime per request:       631.258 [ms] (mean)\nTime per request:       126.252 [ms] (mean, across all concurrent requests)\nTransfer rate:          890.88 [Kbytes/sec] received\n\nConnection Times (ms)\n              min  mean[+/-sd] median   max\nConnect:       12   54  78.2     27    
1068\nProcessing:    15  576 519.3    436    3659\nWaiting:       15  421 325.7    354    2421\nTotal:         30  631 527.6    491    3728\n\nPercentage of the requests served within a certain time (ms)\n  50%    491\n  66%    707\n  75%    861\n  80%    988\n  90%   1355\n  95%   1736\n  98%   2100\n  99%   2419\n 100%   3728 (longest request)\n```\n\n2021-10-10, AX61-NVME with two discs, 4 partitions\n```\nServer Software:        nginx/1.18.0\nServer Hostname:        65.21.125.158\nServer Port:            80\n\nConcurrency Level:      5\nTime taken for tests:   263.283 seconds\nComplete requests:      2000\nFailed requests:        0\nWrite errors:           0\nTotal transferred:      282821583 bytes\nHTML transferred:       282545445 bytes\nRequests per second:    7.60 [#/sec] (mean)\nTime per request:       658.209 [ms] (mean)\nTime per request:       131.642 [ms] (mean, across all concurrent requests)\nTransfer rate:          1049.03 [Kbytes/sec] received\n\nConnection Times (ms)\n              min  mean[+/-sd] median   max\nConnect:       13   28  32.9     26     630\nProcessing:    17  629 434.1    563    3051\nWaiting:       15  587 412.8    517    2949\nTotal:         36  657 435.8    593    3090\n\nPercentage of the requests served within a certain time (ms)\n  50%    593\n  66%    774\n  75%    914\n  80%   1003\n  90%   1260\n  95%   1480\n  98%   1708\n  99%   1959\n 100%   3090 (longest request)\n```\n\n2021-10-10, AX61-NVME with two discs, 4 partitions\n```\nServer Software:        nginx/1.18.0\nServer Hostname:        65.21.125.158\nServer Port:            80\n\nConcurrency Level:      5\nTime taken for tests:   249.241 seconds\nComplete requests:      2000\nFailed requests:        0\nWrite errors:           0\nTotal transferred:      267058842 bytes\nHTML transferred:       266782842 bytes\nRequests per second:    8.02 [#/sec] (mean)\nTime per request:       623.101 [ms] (mean)\nTime per request:       124.620 [ms] (mean, across all concurrent 
requests)\nTransfer rate:          1046.38 [Kbytes/sec] received\n\nConnection Times (ms)\n              min  mean[+/-sd] median   max\nConnect:       13   27  19.3     25     734\nProcessing:    15  596 469.4    506    3785\nWaiting:        0  554 449.3    467    3660\nTotal:         32  622 470.7    531    3805\n\nPercentage of the requests served within a certain time (ms)\n  50%    531\n  66%    735\n  75%    878\n  80%    974\n  90%   1234\n  95%   1495\n  98%   1809\n  99%   2104\n 100%   3805 (longest request)\n```\n\n2021-10-12, AX61-NVME with four discs and 8 partitions\n```\nServer Software:        nginx/1.18.0\nServer Hostname:        135.181.182.4\nServer Port:            80\n\nConcurrency Level:      5\nTime taken for tests:   264.412 seconds\nComplete requests:      2000\nFailed requests:        0\nWrite errors:           0\nTotal transferred:      274309399 bytes\nHTML transferred:       274033261 bytes\nRequests per second:    7.56 [#/sec] (mean)\nTime per request:       661.029 [ms] (mean)\nTime per request:       132.206 [ms] (mean, across all concurrent requests)\nTransfer rate:          1013.12 [Kbytes/sec] received\n\nConnection Times (ms)\n              min  mean[+/-sd] median   max\nConnect:       13   27  16.1     25     348\nProcessing:    14  633 449.6    565    2996\nWaiting:        0  590 425.7    520    2545\nTotal:         34  661 450.3    594    3014\n\nPercentage of the requests served within a certain time (ms)\n  50%    594\n  66%    772\n  75%    905\n  80%   1000\n  90%   1271\n  95%   1510\n  98%   1834\n  99%   1997\n 100%   3014 (longest request)\n```\n\n2021-10-12, AX61-NVME with four discs and 8 partitions\n```\nServer Software:        nginx/1.18.0\nServer Hostname:        135.181.182.4\nServer Port:            80\n\nConcurrency Level:      5\nTime taken for tests:   233.408 seconds\nComplete requests:      2000\nFailed requests:        0\nWrite errors:           0\nTotal transferred:      272488725 bytes\nHTML transferred:  
     272213277 bytes\nRequests per second:    8.57 [#/sec] (mean)\nTime per request:       583.519 [ms] (mean)\nTime per request:       116.704 [ms] (mean, across all concurrent requests)\nTransfer rate:          1140.07 [Kbytes/sec] received\n\nConnection Times (ms)\n              min  mean[+/-sd] median   max\nConnect:       12   25  10.1     24     187\nProcessing:    15  558 402.0    487    2727\nWaiting:        0  512 377.0    440    2051\nTotal:         33  583 402.8    512    2757\n\nPercentage of the requests served within a certain time (ms)\n  50%    512\n  66%    695\n  75%    806\n  80%    882\n  90%   1114\n  95%   1373\n  98%   1621\n  99%   1779\n 100%   2757 (longest request)\n```\n"
  },
  {
    "path": "documentation/search_result_ranking.md",
    "content": "\n# Search Result Ranking\n\nThis document describes how search results are indexed and ranked.\n\n## Input\nInput to our indexer is a sequence of deduplicated urls with the following data.\n```\n{\n    url: \"https://www.example.com/\",\n    title: \"Example Page\",\n    meta_description: \"\",\n    h1: \"Example Domain\",\n    text: \"This domain is for use in illustrative examples in documents. You may use this domain in literature without prior coordination or asking for permission. More information...\"\n}\n```\n\n## 1. Domain level\nEach url is added with the url hash as key. The tokens are not deduplicated throughout the domain.\n\n```\ndomain_score:\nidf * sum(tf_ + )\n```\n\n```\ndomain_score = expm1(5 * link.m_score) + 0.1;\nurl_score = expm1(10 * link.m_score) + 0.1;\n```\n"
  },
  {
    "path": "documentation/statues_swe.tex",
    "content": "\n\\documentclass[12pt, a4paper]{article}\n\\usepackage[T1]{fontenc}\n\\usepackage[utf8]{inputenc}\n\\usepackage[swedish]{babel}\n\n\\title{Stadgar för Föreningen Alexandria.org}\n\\date{Januari 2022}\n\n\\begin{document}\n\n\\maketitle\n\n\\paragraph{§ 1 Föreningens firma}\n\\paragraph{}\nFöreningens firma är Föreningen Alexandria.org och föreningens firmatecknare är ordförande eller annan person utsedd till firmatecknare av styrelsen.\n\n\\paragraph{§ 2 Föreningens ändamål}\n\\paragraph{}\nFöreningen har som ändamål att göra kunskap mer tillgängligt. Föreningen ska uppfylla sitt ändamål genom att utveckla och tillhandahålla en sökmotor som är gratis och utan annonser. Källkoden till sökmotorn ska publiceras som öppen källkod.\n\n\\paragraph{§ 3 Föreningens säte}\n\\paragraph{}\nFöreningen har sitt säte i Uppsala.\n\n\\paragraph{§ 4 Medlemsskap}\n\\paragraph{}\nFöreningens medlemmar är aktiva i föreningens verksamhet. Nya medlemmar måste godkännas av styrelsen.\n\n\\paragraph{§ 5 Medlemsavgifter}\n\\paragraph{}\nMedlem ska betala den medlemsavgift som årligen fastställs av årsmötet.\n\n\\pagebreak\n\n\n\\paragraph{§ 6 Styrelsen}\n\\paragraph{}\nStyrelsen består av en ordförande, en kassör, en suppleant och eventuellt ytterligare ledarmöter enligt årsmötets beslut.\n\n\\paragraph{§ 7 Styrelsens uppdrag}\n\\paragraph{}\nStyrelsen företräder föreningen, bevakar dess intressen och handhar dess angelägenheter. Styrelsen beslutar å föreningens vägnar såvida inte annat\nföreskrivs i dessa stadgar. Styrelsen ska verkställa av årsmötet fattade beslut, handha föreningens ekonomiska angelägenheter och föra räkenskaper,\nsamt avge årsredovisning till årsstämman för det senaste räkenskapsåret. Styrelsen sammanträder när ordföranden finner det erforderligt eller om\nminst två styrelseledamöter begär detta.\n\n\\paragraph{}\nStyrelsen är beslutsför då minst hälften av ledmöterna, avrundat uppåt är närvarande. Styrelsebeslut fattas med enkel majoritet. 
Vid lika röstetal gäller den mening\nordföranden biträder.\n\n\\paragraph{§ 8 Räkenskaper}\n\\paragraph{}\nRäkenskapsår ska vara kalenderår.\n\n\\paragraph{§ 9 Revisor}\n\\paragraph{}\nStyrelsens förvaltning ska årligen granskas av en på årsmötet utsedd revisor. Revisorn ska senast den 1 mars avge sin revisionsberättelse. Revisorn får ej vara medlem i styrelsen.\n\n\\paragraph{§ 10 Årsmöte}\n\\paragraph{}\nOrdinarie årsmöte, vilket är föreningens högsta beslutande organ, hålls årligen före den 30 juni på tid och plats som styrelsen bestämmer. Kallelse sker via epost minst 2 veckor före utsatt möte. Motioner som har inkommit senast 7 dagar före årsmötet ska anses ha kommit i tid. Motioner skickas via epost.\n\n\\paragraph{}\nVid ordinarie årsmöte ska följande ärenden behandlas:\n\\begin{enumerate}\n\\item Val av ordförande och sekreterare för mötet.\n\\item Fastställande av röstlängd för mötet.\n\\item Fastställande av dagordning.\n\\item Styrelsens verksamhetsberättelse för det senaste verksamhetsåret.\n\\item Styrelsens förvaltningsberättelse (balans- och resultaträkning) för det senaste verksamhets-/räkenskapsåret.\n\\item Revisionsberättelsen för verksamhets-/räkenskapsåret.\n\\item Fråga om ansvarsfrihet för styrelsen för den tid revisionen avser.\n\\item Fastställande av medlemsavgifter.\n\\item Fastställande av ev. verksamhetsplan och behandling av budget för det kommande verksamhets-/räkenskapsåret.\n\\item Val av ordförande i föreningen för en tid av 1 år.\n\\item Val av kassör, övriga styrelseledamöter samt suppleanter för en tid av 1 år\n\\item Val av revisorer.\n\\item Behandling av styrelsens förslag och i rätt tid inkomna motioner.\n\\item Övriga frågor. \n\\end{enumerate}\n\n\\paragraph{§ 11 Extra årsmöte}\n\\paragraph{}\nExtra årsmöte hålls när styrelsen eller revisorerna finner att det är nödvändigt. Kallelse sker via epost minst 2 veckor före utsatt möte.\n\n\\paragraph{§ 12 Rösträtt}\n\\paragraph{}\nVid årsmöte har varje medlem en röst. 
Rösträtten är personlig och kan inte utövas genom ombud.\n\n\\paragraph{§ 13 Beslut, omröstning och beslutsmässighet}\n\\paragraph{}\nBeslut fattas med bifallsrop (acklamation) eller om så begärs, efter omröstning (votering).\n\n\\paragraph{}\nOmröstning sker öppet, utom vid val där sluten omröstning ska äga rum om någon begär detta. Beslut fattas, såvida dessa stadgar ej föreskriver\nannat, med enkel majoritet. Vid lika röstetal skall den mening som ordförande biträder vinna bifall.\n\n\\paragraph{}\nMötet är beslutsmässigt med det antal röstberättigade medlemmar som är närvarande på mötet.\n\n\\paragraph{§ 14 Regler för ändring av stadgarna}\n\\paragraph{}\nFör ändring av dessa stadgar krävs beslut av två på varandra följande ordinarie årsmöten. Förslag till ändring av stadgarna får ges såväl av medlem som styrelsen.\n\n\\paragraph{§ 15 Utträde}\n\\paragraph{}\nMedlem som önskar utträda ur föreningen ska skriftligen anmäla detta till styrelsen och anses därmed omedelbart ha lämnat föreningen.\n\n\\paragraph{§ 16 Uteslutning}\n\\paragraph{}\nMedlem får uteslutas från föreningen om den har försummat att betala beslutade avgifter, motarbetat föreningens\nverksamhet eller ändamål, eller skadat föreningens intressen. Beslut om uteslutning fattas av styrelsen.\n\n\\end{document}\n"
  },
  {
    "path": "scripts/bootstrap_node_2drives.sh",
    "content": "#!/bin/bash\n\napt-get update\napt-get -y install vim parted zip unzip nginx\n\n_mkpart() { \n\tdisc=$1\n\tmountpoint1=$2\n\tmountpoint2=$3\n\tmountpoint3=$4\n\tmountpoint4=$5\n\tparted -s $disc mklabel gpt\n\tparted -s -a optimal $disc mkpart primary ext4 0% 25%\n\tparted -s -a optimal $disc mkpart primary ext4 25% 50%\n\tparted -s -a optimal $disc mkpart primary ext4 50% 75%\n\tparted -s -a optimal $disc mkpart primary ext4 75% 100%\n\n\tsleep 1\n\n\tmkfs.ext4 -F ${disc}p1\n\tmkfs.ext4 -F ${disc}p2\n\tmkfs.ext4 -F ${disc}p3\n\tmkfs.ext4 -F ${disc}p4\n\n\tmkdir $mountpoint1\n\tmkdir $mountpoint2\n\tmkdir $mountpoint3\n\tmkdir $mountpoint4\n\n\tmount ${disc}p1 $mountpoint1\n\tmount ${disc}p2 $mountpoint2\n\tmount ${disc}p3 $mountpoint3\n\tmount ${disc}p4 $mountpoint4\n\n\techo \"\" >> /etc/fstab\n\techo \"${disc}p1 $mountpoint1 ext4 noatime,nodiratime,barrier=0 0 0\" >> /etc/fstab\n\techo \"${disc}p2 $mountpoint2 ext4 noatime,nodiratime,barrier=0 0 0\" >> /etc/fstab\n\techo \"${disc}p3 $mountpoint3 ext4 noatime,nodiratime,barrier=0 0 0\" >> /etc/fstab\n\techo \"${disc}p4 $mountpoint4 ext4 noatime,nodiratime,barrier=0 0 0\" >> /etc/fstab\n}\n\nmkdir /mnt/0\nmkdir /mnt/1\nmkdir /mnt/2\nmkdir /mnt/3\n\n_mkpart /dev/nvme1n1 /mnt/4 /mnt/5 /mnt/6 /mnt/7\n\nfor shard in $(seq 0 7); do\n\tmkdir \"/mnt/$shard/input\";\n\tmkdir \"/mnt/$shard/output\";\n\tmkdir \"/mnt/$shard/upload\";\n\tmkdir \"/mnt/$shard/hash_table\";\n\tmkdir \"/mnt/$shard/full_text\";\n\tmkdir \"/mnt/$shard/tmp\";\ndone\n\necho \"server {\n    listen 80;\n    server_name localhost;\n\n    location / {\n        fastcgi_pass   127.0.0.1:8000;\n        fastcgi_param  GATEWAY_INTERFACE  CGI/1.1;\n        fastcgi_param  SERVER_SOFTWARE    nginx;\n        fastcgi_param  QUERY_STRING       \\$query_string;\n        fastcgi_param  REQUEST_METHOD     \\$request_method;\n        fastcgi_param  CONTENT_TYPE       \\$content_type;\n        fastcgi_param  CONTENT_LENGTH     \\$content_length;\n        
fastcgi_param  SCRIPT_FILENAME    \\$document_root\\$fastcgi_script_name;\n        fastcgi_param  SCRIPT_NAME        \\$fastcgi_script_name;\n        fastcgi_param  REQUEST_URI        \\$request_uri;\n        fastcgi_param  DOCUMENT_URI       \\$document_uri;\n        fastcgi_param  DOCUMENT_ROOT      \\$document_root;\n        fastcgi_param  SERVER_PROTOCOL    \\$server_protocol;\n        fastcgi_param  REMOTE_ADDR        \\$remote_addr;\n        fastcgi_param  REMOTE_PORT        \\$remote_port;\n        fastcgi_param  SERVER_ADDR        \\$server_addr;\n        fastcgi_param  SERVER_PORT        \\$server_port;\n        fastcgi_param  SERVER_NAME        \\$server_name;\n    }\n}\" > /etc/nginx/sites-enabled/default\n/etc/init.d/nginx restart\n\nadduser --system --shell /sbin/nologin --gecos \"User for running alexandria service\" --disabled-password --home /alexandria alexandria\n\ntouch /var/log/alexandria.log\nchown alexandria:syslog /var/log/alexandria.log\n\necho \"[Unit]\nDescription=Alexandria Server\n\n[Service]\nUser=alexandria\nWorkingDirectory=/alexandria\nExecStart=/alexandria/server\nNice=-20\nRestart=always\n\n[Install]\nWantedBy=multi-user.target\" > /etc/systemd/system/alexandria.service\n\necho \"# Cluster config\nnodes_in_cluster = 4\nnode_id = 0\n\n# Indexer config\nbatches[] = NODE-0\nbatches[] = NODE-1\nbatches[] = NODE-2\nbatches[] = NODE-3\nbatches[] = NODE-4\nbatches[] = NODE-5\n\nlink_batches[] = LINK-0\nlink_batches[] = LINK-1\nlink_batches[] = LINK-2\nlink_batches[] = LINK-3\nlink_batches[] = LINK-4\nlink_batches[] = LINK-5\n\n# Server config\nworker_count = 8\nquery_max_words = 10 # Maximum number of words used in query.\nquery_max_len = 200\ndeduplicate_domain_count = 5\npre_result_limit = 200000\nresult_limit = 1000\n\n# Full text config\nft_max_sections = 8\nft_max_results_per_section = 2000000\nft_section_depth = 4\" > /etc/alexandria.conf\n\nmkdir /alexandria\ncd /alexandria\nwget 
https://github.com/alexandria-org/alexandria/releases/download/v1.0/alexandria.zip\nunzip alexandria.zip\nchown -R alexandria /mnt/*\n"
  },
  {
    "path": "scripts/build-deps.sh",
    "content": "#!/bin/bash\n\ncd `dirname $0`\ncd ..\n\nbase_path=`pwd`\n\ncd $base_path\ncd deps\n\ncd zlib-1.2.12\n./configure\nmake -j4\nmake install\n\ncd $base_path\ncd deps\n\nexport CC=/usr/bin/gcc\nexport CXX=/usr/bin/g++\n\ncd CRoaring\nmkdir build\ncd build\ncmake ..\nmake\nmake install\n\n"
  },
  {
    "path": "scripts/clean.sh",
    "content": "#!/bin/bash\n\ncd `dirname $0`\ncd ..\n\nread -p \"Do you want to delete your local alexandria data? [Y/n] \" -n 1 -r\necho\nif [[ $REPLY =~ ^[Y]$ ]]\nthen\n\tfor shard in $(seq 0 7); do\n\t\trm -r /mnt/$shard/*\n\t\tmkdir /mnt/$shard\n\t\tmkdir \"/mnt/$shard/input\";\n\t\tmkdir \"/mnt/$shard/output\";\n\t\tmkdir \"/mnt/$shard/upload\";\n\t\tmkdir \"/mnt/$shard/hash_table\";\n\t\tmkdir \"/mnt/$shard/full_text\";\n\t\tmkdir \"/mnt/$shard/tmp\";\n\tdone\n\nelse\n\techo \"Ignoring\"\nfi\n\n"
  },
  {
    "path": "scripts/download-deps.sh",
    "content": "#!/bin/bash\n\ncd `dirname $0`\ncd ..\n\nexport CC=/usr/bin/gcc-10\nexport CXX=/usr/bin/g++-10\n\nbase_path=`pwd`\ncd $base_path\n\nmkdir -p deps\ncd deps\n\ncurl -L https://github.com/nlohmann/json/releases/latest/download/json.hpp > json.hpp\n\ncurl https://zlib.net/fossils/zlib-1.2.12.tar.gz > zlib-1.2.12.tar.gz\ngunzip -f zlib-1.2.12.tar.gz\ntar -xvf zlib-1.2.12.tar\n\ngit clone https://github.com/abseil/abseil-cpp.git\ngit clone https://github.com/RoaringBitmap/CRoaring.git\nwget https://raw.githubusercontent.com/google/robotstxt/master/robots.cc\nwget https://raw.githubusercontent.com/google/robotstxt/master/robots.h\n\n"
  },
  {
    "path": "scripts/download-test-data.sh",
    "content": "#!/bin/bash\n\ncd `dirname $0`\n\nif [ $# -eq 0 ]; then\n\techo \"Provide destination path as first argument\"\n\texit 1\nfi\n\nfor shard in $(seq 0 7); do\n\tmkdir \"/mnt/$shard\";\n\tmkdir \"/mnt/$shard/input\";\n\tmkdir \"/mnt/$shard/output\";\n\tmkdir \"/mnt/$shard/upload\";\n\tmkdir \"/mnt/$shard/hash_table\";\n\tmkdir \"/mnt/$shard/full_text\";\n\tmkdir \"/mnt/$shard/tmp\";\ndone\n\nDEST=$1\n\ncd $DEST || { echo \"target directory does not exist\"; exit 127; }\n\nrm -r node0003.alexandria.org\nwget -r -l1 --no-parent http://node0003.alexandria.org/crawl-data/ALEXANDRIA-TEST-01/ --http-user=alexandria --http-password=wmXN6U4u\nwget -r -l1 --no-parent http://node0003.alexandria.org/crawl-data/ALEXANDRIA-TEST-02/ --http-user=alexandria --http-password=wmXN6U4u\nwget -r -l1 --no-parent http://node0003.alexandria.org/crawl-data/ALEXANDRIA-TEST-03/ --http-user=alexandria --http-password=wmXN6U4u\nwget -r -l1 --no-parent http://node0003.alexandria.org/crawl-data/ALEXANDRIA-TEST-04/ --http-user=alexandria --http-password=wmXN6U4u\nwget -r -l1 --no-parent http://node0003.alexandria.org/crawl-data/ALEXANDRIA-TEST-05/ --http-user=alexandria --http-password=wmXN6U4u\nwget -r -l1 --no-parent http://node0003.alexandria.org/crawl-data/ALEXANDRIA-TEST-06/ --http-user=alexandria --http-password=wmXN6U4u\nwget -r -l1 --no-parent http://node0003.alexandria.org/crawl-data/ALEXANDRIA-TEST-07/ --http-user=alexandria --http-password=wmXN6U4u\nwget -r -l1 --no-parent http://node0003.alexandria.org/crawl-data/ALEXANDRIA-TEST-08/ --http-user=alexandria --http-password=wmXN6U4u\nwget -r -l1 --no-parent http://node0003.alexandria.org/crawl-data/ALEXANDRIA-TEST-09/ --http-user=alexandria --http-password=wmXN6U4u\nwget -r -l1 --no-parent http://node0003.alexandria.org/crawl-data/ALEXANDRIA-TEST-10/ --http-user=alexandria --http-password=wmXN6U4u\nwget -r -l1 --no-parent http://node0003.alexandria.org/crawl-data/ALEXANDRIA-MANUAL-01/warc.paths.gz --http-user=alexandria 
--http-password=wmXN6U4u\nwget -r -l1 --no-parent http://node0003.alexandria.org/crawl-data/ALEXANDRIA-MANUAL-01/files/top_domains.txt.gz --http-user=alexandria --http-password=wmXN6U4u\nwget -r -l1 --no-parent http://node0003.alexandria.org/crawl-data/ALEXANDRIA-MANUAL-01/files/50_top_domains.txt.gz --http-user=alexandria --http-password=wmXN6U4u\nwget -r -l1 --no-parent http://node0003.alexandria.org/dev_files/ --http-user=alexandria --http-password=wmXN6U4u\nwget -r -l1 --no-parent http://node0003.alexandria.org/example.txt --http-user=alexandria --http-password=wmXN6U4u\nwget -r -l1 --no-parent http://node0003.alexandria.org/example.txt.gz --http-user=alexandria --http-password=wmXN6U4u\nwget -r -l1 --no-parent http://node0003.alexandria.org/test-data/ --http-user=alexandria --http-password=wmXN6U4u\n\nmkdir node0003.alexandria.org/nodes\nmkdir node0003.alexandria.org/nodes/test0001\nmkdir node0003.alexandria.org/upload-tmp\n\nchown -R www-data:www-data node0003.alexandria.org\n\n"
  },
  {
    "path": "scripts/find_missing_files_in_batch.sh",
    "content": "#!/bin/bash\n\ncd `dirname $0`\ncd ..\n\nbatch=$1\n\nfiles=`curl https://data.commoncrawl.org/crawl-data/$batch/warc.paths.gz | gunzip`\n\nmissing_files_path=\"/mnt/crawl-data/$batch/missing.paths\"\n\ntruncate -s 0 $missing_files_path\n\nfor raw_file in $files; do\n\tfile=\"/mnt/${raw_file/.warc.gz/.gz}\"\n\tif [[ -f \"$file\" ]]; then\n\t\tfilesize=$(stat -c%s \"$file\")\n\t\tif [[ $filesize -lt 1000 ]]; then\n\t\t\techo \"The file '$file' exists and is small.\"\n\t\t\techo $raw_file >> $missing_files_path\n\t\tfi\n\telse\n\t\techo \"The file '$file' does not exist.\"\n\t\techo $raw_file >> $missing_files_path\n\tfi\ndone\n\ngzip $missing_files_path\n\n"
  },
  {
    "path": "scripts/init-docker.sh",
    "content": "#!/bin/bash\n\ncd `dirname $0`\n\n# The local docker development environment runs the data server on the local machine.\n# This script sets that up and downloads the test data.\n\necho \"Copying nginx config\";\n\necho \"server {\n\tlisten 80 default_server;\n\tlisten [::]:80 default_server;\n\n\troot /var/www/html/node0003.alexandria.org;\n\tindex index.html;\n\tserver_name _;\n\n\tlocation / {\n\t\t\tautoindex on;\n    \t\tclient_body_temp_path /var/www/html/node0003.alexandria.org/upload-tmp;\n    \t\tdav_methods PUT;\n    \t\tcreate_full_put_path  on;\n    \t\tdav_access group:rw  all:r;\n    \t\tclient_max_body_size 10000m;\n\t}\n\tlocation /store {\n\t\tfastcgi_pass   127.0.0.1:8001;\n\t\tfastcgi_param  GATEWAY_INTERFACE  CGI/1.1;\n\t\tfastcgi_param  SERVER_SOFTWARE    nginx;\n\t\tfastcgi_param  QUERY_STRING       \\$query_string;\n\t\tfastcgi_param  REQUEST_METHOD     \\$request_method;\n\t\tfastcgi_param  CONTENT_TYPE       \\$content_type;\n\t\tfastcgi_param  CONTENT_LENGTH     \\$content_length;\n\t\tfastcgi_param  SCRIPT_FILENAME    \\$document_root\\$fastcgi_script_name;\n\t\tfastcgi_param  SCRIPT_NAME        \\$fastcgi_script_name;\n\t\tfastcgi_param  REQUEST_URI        \\$request_uri;\n\t\tfastcgi_param  DOCUMENT_URI       \\$document_uri;\n\t\tfastcgi_param  DOCUMENT_ROOT      \\$document_root;\n\t\tfastcgi_param  SERVER_PROTOCOL    \\$server_protocol;\n\t\tfastcgi_param  REMOTE_ADDR        \\$remote_addr;\n\t\tfastcgi_param  REMOTE_PORT        \\$remote_port;\n\t\tfastcgi_param  SERVER_ADDR        \\$server_addr;\n\t\tfastcgi_param  SERVER_PORT        \\$server_port;\n\t\tfastcgi_param  SERVER_NAME        \\$server_name;\n\t}\n}\n\" > /etc/nginx/sites-enabled/default\n\necho \"Downloading test data\";\n./download-test-data.sh /var/www/html\n\nmkdir /var/www/html/node0003.alexandria.org/nodes\nmkdir /var/www/html/node0003.alexandria.org/nodes/test0001\nmkdir /var/www/html/node0003.alexandria.org/upload-tmp\n\nchown -R 
www-data:www-data /var/www/html/node0003.alexandria.org\n\n/etc/init.d/nginx restart\n\n./download-deps.sh\n./build-deps.sh\n"
  },
  {
    "path": "scripts/install-deps.sh",
    "content": "#!/bin/bash\n\napt-get install -y zip make cmake gcc-10 g++-10 gcc g++ libcurl4-openssl-dev libssl-dev libcrypto++-dev libboost-iostreams-dev libboost-filesystem-dev libboost-system-dev libboost-test-dev libfcgi-dev spawn-fcgi nginx\n"
  },
  {
    "path": "scripts/packager.sh",
    "content": "#!/bin/bash\n#  Copyright 2018-present Amazon.com, Inc. or its affiliates. All Rights Reserved.\n#\n#  Licensed under the Apache License, Version 2.0 (the \"License\").\n#  You may not use this file except in compliance with the License.\n#  A copy of the License is located at\n#\n#   http://aws.amazon.com/apache2.0\n#\n#  or in the \"license\" file accompanying this file. This file is distributed\n#  on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either\n#  express or implied. See the License for the specific language governing\n#  permissions and limitations under the License.\n\n# Modified by Josef Cullhed 2021\n\nset -euo pipefail\n\nprint_help() {\n    echo -e \"Usage: packager [OPTIONS] <binary name>\\n\"\n    echo -e \"OPTIONS\\n\"\n    echo -e \"\\t-d,--default-libc\\t Use the target host libc libraries. This will not package the C library files.\\n\"\n}\n\nif [ $# -lt 1 ]; then\n    echo -e \"Error: missing arguments\\n\"\n    print_help\n    exit 1\nfi\n\nPOSITIONAL=()\nINCLUDE_LIBC=true\nwhile [[ $# -gt 0 ]]\ndo\n    key=\"$1\"\n    case $key in\n        -d|--default-libc)\n            INCLUDE_LIBC=false\n            shift # past argument\n            ;;\n        *)    # unknown option\n            POSITIONAL+=(\"$1\") # save it in an array for later\n            shift # past argument\n            ;;\n    esac\ndone\nset -- \"${POSITIONAL[@]}\" # restore positional parameters\n\nPKG_BIN_PATH=$1\narchitecture=$(arch)\n\nif [ ! -d \"$PKG_BIN_PATH\" ]; then\n    echo \"$PKG_BIN_PATH\" - No such directory.;\n    exit 1;\nfi\n\nif ! type zip > /dev/null 2>&1; then\n    echo \"zip utility is not found. 
Please install it and re-run this script\"\n    exit 1\nfi\nfunction package_libc_via_pacman {\n    if grep --extended-regexp \"Arch Linux|Manjaro Linux\" < /etc/os-release > /dev/null 2>&1; then\n        if type pacman > /dev/null 2>&1; then\n            pacman --query --list --quiet glibc | sed -E '/\\.so$|\\.so\\.[0-9]+$/!d'\n        fi\n    fi\n}\n\nfunction package_libc_via_dpkg() {\n    if type dpkg-query > /dev/null 2>&1; then\n        if [[ $(dpkg-query --listfiles libc6 | wc -l) -gt 0 ]]; then\n            dpkg-query --listfiles libc6 | sed -E '/\\.so$|\\.so\\.[0-9]+$/!d'\n        fi\n    fi\n}\n\nfunction package_libc_via_rpm() {\n    if type rpm > /dev/null 2>&1; then\n       if [[ $(rpm --query --list glibc.$architecture | wc -l) -gt 1 ]]; then\n           rpm --query --list glibc.$architecture | sed -E '/\\.so$|\\.so\\.[0-9]+$/!d'\n       fi\n    fi\n}\n\n# hasElement expects an element and an array parameter\n# it's equivalent to array.contains(element)\n# e.g. hasElement \"needle\" ${haystack[@]}\nfunction hasElement() {\n    local el key=$1\n    shift\n    for el in \"$@\"\n    do\n        [[ \"$el\" == \"$key\" ]] && return 0\n    done\n    return 1\n}\n\nPKG_BIN_FILENAME=alexandria\nPKG_DIR=tmp\nPKG_LD=\"\"\n\nlist=$(ldd \"$PKG_BIN_PATH/server\" | awk '{print $(NF-1)}')\nlibc_libs=()\nlibc_libs+=($(package_libc_via_dpkg))\nlibc_libs+=($(package_libc_via_rpm))\nlibc_libs+=($(package_libc_via_pacman))\n\nmkdir -p \"$PKG_DIR/bin\" \"$PKG_DIR/lib\"\n\nfor i in $list\ndo\n    if [[ ! 
-f $i ]]; then # ignore linux-vdso.so.1\n        continue\n    fi\n\n    # Do not copy libc files which are directly linked unless it's the dynamic loader\n    if hasElement \"$i\" \"${libc_libs[@]}\"; then\n        filename=$(basename \"$i\")\n        if [[ -z \"${filename##ld-*}\" ]]; then\n            PKG_LD=$filename # Use this file as the loader\n            cp \"$i\" \"$PKG_DIR/lib\"\n        fi\n        continue\n    fi\n\n    cp \"$i\" $PKG_DIR/lib\ndone\n\nif [[ $INCLUDE_LIBC == true ]]; then\n    for i in \"${libc_libs[@]}\"\n    do\n        filename=$(basename \"$i\")\n        if [[ -z \"${filename##ld-*}\" ]]; then\n            # if the loader is empty, then the binary is probably linked to a symlink of the loader. The symlink will\n            # not show up when quering the package manager for libc files. So, in this case, we want to copy the loader\n            if [[ -z \"$PKG_LD\" ]]; then \n                PKG_LD=$filename\n                cp \"$i\" \"$PKG_DIR/lib\" # we want to follow the symlink (default behavior)\n            fi\n            continue # We don't want the dynamic loader's symlink because its target is an absolute path (/lib/ld-*).\n        fi\n        cp --no-dereference \"$i\" \"$PKG_DIR/lib\"\n    done\nfi\n\nif [[ -z \"$PKG_LD\" ]]; then\n    echo \"Failed to identify, locate or package the loader. 
Please file an issue on Github!\" 1>&2\n    exit 1\nfi\n\nbootstrap_script_server=$(cat <<EOF\n#!/bin/bash\nset -euo pipefail\nulimit -n 104857\nALEXANDRIA_LIVE=1 ALEXANDRIA_CONFIG=/etc/alexandria.conf nice -n -20 ./lib/$PKG_LD --library-path ./lib ./bin/server\nEOF\n)\n\nbootstrap_script_scraper=$(cat <<EOF\n#!/bin/bash\nset -euo pipefail\nulimit -n 104857\nALEXANDRIA_LIVE=1 ALEXANDRIA_CONFIG=/etc/alexandria.conf nice -n -20 ./lib/$PKG_LD --library-path ./lib ./bin/scraper\nEOF\n)\n\nbootstrap_script_indexer=$(cat <<EOF\n#!/bin/bash\nset -euo pipefail\nulimit -n 104857\nALEXANDRIA_LIVE=1 ALEXANDRIA_CONFIG=/etc/alexandria.conf ./lib/$PKG_LD --library-path ./lib ./bin/indexer \\$@\nEOF\n)\n\nbootstrap_script_alexandria=$(cat <<EOF\n#!/bin/bash\nset -euo pipefail\nulimit -n 104857\nALEXANDRIA_LIVE=1 ALEXANDRIA_CONFIG=/etc/alexandria.conf ./lib/$PKG_LD --library-path ./lib ./bin/alexandria \\$@\nEOF\n)\n\ncp \"$PKG_BIN_PATH/server\" \"$PKG_DIR/bin\"\ncp \"$PKG_BIN_PATH/scraper\" \"$PKG_DIR/bin\"\ncp \"$PKG_BIN_PATH/indexer\" \"$PKG_DIR/bin\"\ncp \"$PKG_BIN_PATH/alexandria\" \"$PKG_DIR/bin\"\ncp \"$PKG_BIN_PATH/../scripts/bootstrap_node_2drives.sh\" \"$PKG_DIR/\"\ncp \"$PKG_BIN_PATH/../scripts/truncate.sh\" \"$PKG_DIR/\"\ncp \"$PKG_BIN_PATH/../scripts/update.sh\" \"$PKG_DIR/\"\nchmod +x \"$PKG_DIR/bootstrap_node_2drives.sh\"\nchmod +x \"$PKG_DIR/truncate.sh\"\nchmod +x \"$PKG_DIR/update.sh\"\necho -e \"$bootstrap_script_server\" > \"$PKG_DIR/server\"\necho -e \"$bootstrap_script_scraper\" > \"$PKG_DIR/scraper\"\necho -e \"$bootstrap_script_indexer\" > \"$PKG_DIR/indexer\"\necho -e \"$bootstrap_script_alexandria\" > \"$PKG_DIR/alexandria\"\nchmod +x \"$PKG_DIR/server\"\nchmod +x \"$PKG_DIR/scraper\"\nchmod +x \"$PKG_DIR/indexer\"\nchmod +x \"$PKG_DIR/alexandria\"\n# some shenanigans to create the right layout in the zip file without extraneous directories\npushd \"$PKG_DIR\" > /dev/null\nzip --symlinks --recurse-paths \"$PKG_BIN_FILENAME\".zip -- *\nORIGIN_DIR=$(dirs -l 
+1)\nmv \"$PKG_BIN_FILENAME\".zip \"$ORIGIN_DIR\"\npopd > /dev/null\nrm -r \"$PKG_DIR\"\necho Created \"$ORIGIN_DIR/$PKG_BIN_FILENAME\".zip\n\n"
  },
  {
    "path": "scripts/prepare-output-dirs.sh",
    "content": "#!/bin/bash\n\ncd `dirname $0`\ncd ..\n\nfor shard_id in $(seq 0 7); do\n\tshard=\"/mnt/$shard_id\"\n\trm -r $shard\n\tmkdir $shard\n\tmkdir \"$shard/input\";\n\tmkdir \"$shard/output\";\n\tmkdir \"$shard/upload\";\n\tmkdir \"$shard/hash_table\";\n\tmkdir \"$shard/full_text\";\n\tmkdir \"$shard/tmp\";\ndone\n"
  },
  {
    "path": "scripts/truncate.sh",
    "content": "#!/bin/bash\n\ncd `dirname $0`\ncd ..\n\nfor shard in $(seq 0 7); do\n\trm -r /mnt/$shard/*\n\tmkdir \"/mnt/$shard/input\";\n\tmkdir \"/mnt/$shard/output\";\n\tmkdir \"/mnt/$shard/upload\";\n\tmkdir \"/mnt/$shard/hash_table\";\n\tmkdir \"/mnt/$shard/full_text\";\n\tmkdir \"/mnt/$shard/tmp\";\ndone\n\nchown -R alexandria /mnt/*\n\n"
  },
  {
    "path": "scripts/update.sh",
    "content": "#!/bin/bash\n\ncd `dirname $0`\n\nwget https://github.com/alexandria-org/alexandria/releases/latest/download/alexandria.zip -O alexandria.zip\nunzip -o alexandria.zip\n\n"
  },
  {
    "path": "src/URL.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include \"URL.h\"\n#include \"algorithm/hash.h\"\n#include \"parser/parser.h\"\n#include <curl/curl.h>\n#include \"text/text.h\"\n#include \"warc/tlds.h\"\n\nusing namespace std;\n\nURL::URL() {\n\tm_status = ::parser::OK;\n}\n\nURL::URL(const URL &url) :\n\tm_url_string(url.m_url_string),\n\tm_host(url.m_host),\n\tm_host_reverse(url.m_host_reverse),\n\tm_scheme(url.m_scheme),\n\tm_path(url.m_path),\n\tm_query(url.m_query),\n\tm_status(url.m_status),\n\tm_has_www(url.m_has_www)\n{\n}\n\nURL::URL(const string &url) :\n\tm_url_string(url)\n{\n\tm_status = parse();\n}\n\nURL::URL(const string &host, const string &path) :\n\tm_url_string(\"http://\" + host + path), m_host(host), m_path(path)\n{\n\tm_host_reverse = URL::host_reverse(m_host);\n\tm_status = 
::parser::OK;\n}\n\nURL::~URL() {\n\n}\n\nvoid URL::set_url_string(const string &url) {\n\tm_url_string = url;\n\tm_status = parse();\n}\n\nstring URL::str() const {\n\treturn m_url_string;\n}\n\nstring URL::key() const {\n\t/*\n\t * We should probably change this to:\n\t * return m_host + path_with_query();\n\t * but we need to do it later..\n\t */\n\treturn m_host + m_path + m_query;\n}\n\nstring URL::hash_input() const {\n\treturn m_host + path_with_query();\n}\n\nuint64_t URL::hash() const {\n\treturn ::algorithm::hash(hash_input());\n}\n\nuint64_t URL::host_hash() const {\n\treturn ::algorithm::hash(m_host);\n}\n\nuint64_t URL::link_hash(const URL &target_url, const string &link_text) const {\n\treturn ::algorithm::hash(host() + target_url.str());\n}\n\nuint64_t URL::domain_link_hash(const URL &target_url, const string &link_text) const {\n\treturn ::algorithm::hash(host() + target_url.host());\n}\n\nbool URL::canonically_different(const URL &url) const {\n\treturn key() != url.key();\n}\n\nbool URL::has_https() const {\n\treturn m_scheme == \"https\";\n}\n\nbool URL::has_www() const {\n\treturn m_has_www;\n}\n\nstring URL::host() const {\n\treturn m_host;\n}\n\nstring URL::host_top_domain() const {\n\tvector<string> parts;\n\tstd::string_view host(m_host);\n\n\tsize_t pos1 = host.find_last_of(\".\");\n\tif (host.substr(pos1 + 1) == \"uk\") {\n\t\tpos1 = host.find_last_of(\".\", pos1 - 1);\n\t\tif (host.substr(pos1 + 1) != \"co.uk\") {\n\t\t\treturn m_host;\n\t\t}\n\t} else if (host.substr(pos1 + 1) == \"au\") {\n\t\tpos1 = host.find_last_of(\".\", pos1 - 1);\n\t}\n\tsize_t pos2 = host.find_last_of(\".\", pos1 - 1);\n\tif (pos2 == string::npos) {\n\t\treturn m_host;\n\t}\n\treturn m_host.substr(pos2 + 1);\n}\n\nstring URL::scheme() const {\n\treturn m_scheme;\n}\n\nstring URL::host_reverse() const {\n\treturn m_host_reverse;\n}\n\nstring URL::path() const {\n\treturn m_path;\n}\n\nstring URL::path_with_query() const {\n\tif (m_query.size() > 0) {\n\t\treturn 
m_path + \"?\" + m_query;\n\t} else {\n\t\treturn m_path;\n\t}\n}\n\nmap<string, string> URL::query() const {\n\tmap<string, string> ret;\n\tvector<string> parts;\n\tboost::split(parts, m_query, boost::is_any_of(\"&\"));\n\tfor (const string &part : parts) {\n\t\tvector<string> pair;\n\t\tboost::split(pair, part, boost::is_any_of(\"=\"));\n\t\tif (pair.size() > 1) {\n\t\t\tret[pair[0]] = parser::urldecode(pair[1]);\n\t\t}\n\t}\n\n\treturn ret;\n}\n\nfloat URL::harmonic() const {\n\n\treturn 0.0f;\n}\n\nstring URL::host_reverse(const string &host) {\n\tvector<string> parts;\n\tboost::split(parts, host, boost::is_any_of(\".\"));\n\treverse(parts.begin(), parts.end());\n\treturn boost::algorithm::join(parts, \".\");\n}\n\nstring URL::host_reverse_top_domain(const string &host) {\n\t/*\n\t * This algorithm is OK since we only run on these tlds:\n\t * {\"se\", \"com\", \"nu\", \"net\", \"org\", \"gov\", \"edu\", \"info\"}\n\t * */\n\tvector<string> parts;\n\tboost::split(parts, host, boost::is_any_of(\".\"));\n\tif (parts.size() > 2) {\n\t\tparts = {parts[parts.size() - 2], parts[parts.size() - 1]};\n\t}\n\treverse(parts.begin(), parts.end());\n\treturn boost::algorithm::join(parts, \".\");\n}\n\nstring URL::domain_without_tld() const {\n\tvector<string> parts;\n\tboost::split(parts, m_host, boost::is_any_of(\".\"));\n\tif (parts.size() > 1) {\n\t\treturn parts[parts.size() - 2];\n\t}\n\treturn \"\";\n}\n\nuint32_t URL::size() const {\n\treturn str().size();\n}\n\nvoid URL::set_scheme(const string &scheme) {\n\tm_scheme = scheme;\n\trebuild_url_str();\n}\n\nvoid URL::set_www(bool has_www) {\n\tm_has_www = has_www;\n\trebuild_url_str();\n}\n\nURL &URL::operator=(const URL &other) {\n\tm_url_string = other.m_url_string;\n\tm_host = other.m_host;\n\tm_host_reverse = other.m_host_reverse;\n\tm_scheme = other.m_scheme;\n\tm_path = other.m_path;\n\tm_query = other.m_query;\n\tm_status = other.m_status;\n\tm_has_www = other.m_has_www;\n\n\treturn *this;\n}\n\nistream &operator 
>>(istream &ss, URL &url) {\n\tss >> (url.m_url_string);\n\turl.m_status = url.parse();\n\n\treturn ss;\n}\n\nostream &operator <<(ostream& os, const URL& url) {\n\tos << url.m_url_string;\n\treturn os;\n}\n\nint URL::parse() {\n\tCURLU *h = curl_url();\n\tif (!h) return ::parser::ERROR;\n\n\tCURLUcode uc = curl_url_set(h, CURLUPART_URL, m_url_string.c_str(), 0);\n\tif (uc) {\n\t\tcurl_url_cleanup(h);\n\t\treturn ::parser::ERROR;\n\t}\n\n\tchar *chost;\n\tuc = curl_url_get(h, CURLUPART_HOST, &chost, 0);\n\tif (!uc) {\n\t\tm_host = chost;\n\t\tremove_www(m_host);\n\t\tcurl_free(chost);\n\t}\n\n\tchar *scheme;\n\tuc = curl_url_get(h, CURLUPART_SCHEME, &scheme, 0);\n\tif (!uc) {\n\t\tm_scheme = scheme;\n\t\tcurl_free(scheme);\n\t}\n\n\tchar *cpath;\n\tuc = curl_url_get(h, CURLUPART_PATH, &cpath, 0);\n\tif (!uc) {\n\t\tm_path = cpath;\n\t\tcurl_free(cpath);\n\t}\n\n\tchar *cquery;\n\tuc = curl_url_get(h, CURLUPART_QUERY, &cquery, 0);\n\tif (!uc) {\n\t\tm_query = cquery;\n\t\tcurl_free(cquery);\n\t}\n\n\tcurl_url_cleanup(h);\n\n\tm_host_reverse = URL::host_reverse(m_host);\n\n\treturn ::parser::OK;\n}\n\nvoid URL::rebuild_url_str() {\n\tm_url_string = m_scheme + \"://\" + (m_has_www ? \"www.\" : \"\") + m_host + path_with_query();\n}\n\ninline void URL::remove_www(string &path) {\n\tsize_t pos = path.find(\"www.\");\n\tif (pos == 0) {\n\t\tm_has_www = true;\n\t\tpath.erase(0, 4);\n\t} else {\n\t\tm_has_www = false;\n\t}\n\ttext::trim(path);\n}\n"
  },
  {
    "path": "src/URL.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include \"config.h\"\n\n#include <iostream>\n#include <functional>\n#include <map>\n#include <boost/algorithm/string/join.hpp>\n\nclass URL {\n\npublic:\n\tURL();\n\tURL(const URL &url);\n\texplicit URL(const std::string &url);\n\texplicit URL(const std::string &host, const std::string &path);\n\t~URL();\n\n\tstatic std::string host_reverse(const std::string &host);\n\tstatic std::string host_reverse_top_domain(const std::string &host);\n\n\tvoid set_url_string(const std::string &url);\n\tstd::string str() const;\n\tstd::string key() const;\n\n\tstd::string hash_input() const;\n\tuint64_t hash() const;\n\tuint64_t host_hash() const;\n\tuint64_t link_hash(const URL &target_url, const std::string &link_text) const;\n\tuint64_t 
domain_link_hash(const URL &target_url, const std::string &link_text) const;\n\tbool canonically_different(const URL &url) const;\n\tbool has_https() const;\n\tbool has_www() const;\n\n\tstd::string host() const;\n\tstd::string host_top_domain() const;\n\tstd::string scheme() const;\n\tstd::string path() const;\n\tstd::string path_with_query() const;\n\tstd::map<std::string, std::string> query() const;\n\tstd::string host_reverse() const;\n\tstd::string domain_without_tld() const;\n\tuint32_t size() const;\n\n\tvoid set_scheme(const std::string &scheme);\n\tvoid set_www(bool has_www);\n\n\tfloat harmonic() const;\n\n\tsize_t index_on_node() const {\n\t\treturn host_hash() % config::nodes_in_cluster;\n\t}\n\n\tURL &operator=(const URL &other);\n\tfriend std::istream &operator >>(std::istream &ss, URL &url);\n\tfriend std::ostream &operator <<(std::ostream& os, const URL& url);\n\nprivate:\n\n\tstd::string m_url_string;\n\tstd::string m_host;\n\tstd::string m_host_reverse;\n\tstd::string m_scheme;\n\tstd::string m_path;\n\tstd::string m_query;\n\tint m_status;\n\tbool m_has_www;\n\n\tint parse();\n\tvoid rebuild_url_str();\n\tinline void remove_www(std::string &path);\n\n\n};\n"
  },
  {
    "path": "src/alexandria.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include <iostream>\n#include <sstream>\n#include <numeric>\n#include \"logger/logger.h\"\n#include \"downloader/warc_downloader.h\"\n#include \"downloader/merge_downloader.h\"\n#include \"URL.h\"\n#include \"hash_table2/hash_table.h\"\n#include \"hash_table2/hash_table_shard_builder.h\"\n#include \"indexer/index.h\"\n#include \"indexer/index_builder.h\"\n#include \"indexer/value_record.h\"\n#include \"algorithm/hyper_ball.h\"\n#include \"utils/thread_pool.hpp\"\n#include \"file/file.h\"\n#include \"http/server.h\"\n#include \"parser/parser.h\"\n#include <boost/algorithm/string.hpp>\n\nusing namespace std;\n\nvoid help() {\n\tstd::string content = file::cat(\"../documentation/alexandria.md\");\n\tstd::cout << content << std::endl;\n}\n\nint main(int 
argc, const char **argv) {\n\n\tlogger::start_logger_thread();\n\tlogger::verbose(true);\n\n\tif (getenv(\"ALEXANDRIA_CONFIG\") != NULL) {\n\t\tconfig::read_config(getenv(\"ALEXANDRIA_CONFIG\"));\n\t} else {\n\t\tconfig::read_config(\"/etc/alexandria.conf\");\n\t}\n\n\tif (argc < 2) {\n\t\thelp();\n\t\treturn 0;\n\t}\n\n\tconst string arg(argc > 1 ? argv[1] : \"\");\n\n\tif (arg == \"--hash-table-url\" && argc > 2) {\n\t\tURL url(argv[2]);\n\t\thash_table2::hash_table ht(\"all_urls\", 1019, 1000000, \"/slow_data\");\n\n\t\tsize_t ver = 0;\n\t\tstd::string data = ht.find(url.hash(), ver);\n\t\tstd::cout << ver << std::endl;\n\t\tstd::cout << data << std::endl;\n\t} else if (arg == \"--hash-table-url-hash\" && argc > 2) {\n\t\tuint64_t url_hash = std::stoull(argv[2]);\n\t\thash_table2::hash_table ht(\"all_urls\", 1019, 1000000, \"/slow_data\");\n\n\t\tsize_t ver = 0;\n\t\tstd::string data = ht.find(url_hash, ver);\n\t\tstd::cout << ver << std::endl;\n\t\tstd::cout << data << std::endl;\n\t} else if (arg == \"--hash-table-count\") {\n\n\t\thash_table2::hash_table ht(\"all_urls\", 1019, 1000000, \"/slow_data\");\n\n\t\tstd::cout << ht.size() << std::endl;\n\n\t} else if (arg == \"--hash-table-find-all\" && argc > 2) {\n\n\t\thash_table2::hash_table ht(\"all_urls\", 1019, 1000000, \"/slow_data\");\n\n\t\t// Put given hosts in array with hashes to search for.\n\t\tstd::vector<uint64_t> search_for;\n\t\tfor (int i = 2; i < argc; i++) {\n\t\t\tsearch_for.push_back(URL(string(\"https://\") + argv[i]).host_hash());\n\t\t}\n\n\t\tht.for_each([&search_for](uint64_t key, std::string value) {\n\n\t\t\tURL url(value.substr(0, value.find(\"\\t\")));\n\n\t\t\tconst auto my_host_hash = url.host_hash();\n\t\t\tfor (const auto &host_hash : search_for) {\n\t\t\t\tif (host_hash == my_host_hash) {\n\t\t\t\t\tstd::cout << key << \"\\t\" << url.str() << std::endl;\n\t\t\t\t\tbreak;\n\t\t\t\t}\n\t\t\t}\n\n\t\t});\n\n\t} else if (arg == \"--hash-table-count\" && argc > 2) 
{\n\n\t\tstd::string data = file::cat(\"domains.txt\");\n\t\tstd::vector<std::string> lines;\n\t\tboost::split(lines, data, boost::is_any_of(\"\\n\"));\n\t\tstd::map<std::string, uint64_t> domains;\n\t\tstd::map<uint64_t, size_t> domain_counts;\n\t\tstd::vector<std::string> domain_list;\n\t\tfor (const auto &line : lines) {\n\t\t\tif (line == \"\") continue;\n\t\t\tconst std::string reversed = URL::host_reverse(line);\n\t\t\tstd::cout << reversed << std::endl;\n\t\t\tconst uint64_t domain_hash = URL(string(\"https://\") + reversed).host_hash();\n\t\t\tdomains[reversed] = domain_hash;\n\t\t\tdomain_counts[domain_hash] = 0;\n\t\t\tdomain_list.push_back(reversed);\n\t\t}\n\n\t\thash_table2::hash_table ht(\"all_urls\", 1019, 1000000, \"/slow_data\");\n\n\t\tuint64_t thelazy_host_hash = URL(string(\"https://\") + argv[2]).host_hash();\n\n\t\tht.for_each([thelazy_host_hash, &domain_counts](uint64_t key, std::string value) {\n\n\t\t\tURL url(value.substr(0, value.find(\"\\t\")));\n\n\t\t\tconst auto my_host_hash = url.host_hash();\n\t\t\tfor (auto &iter : domain_counts) {\n\t\t\t\tif (iter.first == my_host_hash) {\n\t\t\t\t\tdomain_counts[iter.first]++;\n\t\t\t\t\tbreak;\n\t\t\t\t}\n\t\t\t}\n\n\t\t\t/*if (url.host_hash() == thelazy_host_hash) {\n\t\t\t\tstd::cout << key << \" => \" << url.str() << std::endl;\n\t\t\t}*/\n\n\t\t});\n\n\t\tfor (auto &domain : domain_list) {\n\t\t\tstd::cout << domain << \"\\t\" << domain_counts[domains[domain]] << std::endl;\n\t\t}\n\n\t} else if (arg == \"--hash-table-optimize-shard\" && argc > 2) {\n\t\tsize_t shard_id = std::stoull(argv[2]);\n\t\thash_table2::hash_table_shard_builder ht_shard(\"all_urls\", shard_id, 1000000, \"/slow_data\");\n\n\t\tht_shard.optimize();\n\n\t} else if (arg == \"--internal-harmonic\") {\n\t\tprofiler::instance prof_total(\"total\");\n\t\t/*\n\n\t\tstd::vector<std::string> all_files;\n\t\tfile::read_directory(\"/mnt/0/full_text/internal_links\", [&all_files](const std::string &filename) 
{\n\t\t\tall_files.push_back(filename);\n\t\t});\n\n\t\tsize_t done_with = 0;\n\t\tprofiler::instance prof(\"total\");\n\t\tfor (const auto &filename : all_files) {\n\n\t\t\t// Read the file.\n\t\t\tstd::ifstream infile(\"/mnt/0/full_text/internal_links/\" + filename, std::ios::binary);\n\t\t\tstd::string infile_data(std::istreambuf_iterator<char>(infile), {});\n\t\t\tinfile.close();\n\t\t\tstd::istringstream reader(infile_data);\n\t\t\tindexer::index<indexer::value_record> idx(&reader, 1000);\n\n\t\t\t// Create vertices vector\n\t\t\tstd::vector<uint64_t> vertices;\n\t\t\tstd::map<uint64_t, uint64_t> vertex_map;\n\n\t\t\tsize_t record_id = 0;\n\t\t\tfor (const auto &record : idx.records()) {\n\t\t\t\tvertices.push_back(record.m_value);\n\t\t\t\tvertex_map[record.m_value] = record_id;\n\t\t\t\trecord_id++;\n\t\t\t}\n\n\t\t\tstd::vector<roaring::Roaring> edge_map(vertices.size());\n\n\t\t\t// Populate edge map\n\t\t\tidx.for_each([&edge_map, &vertex_map, &vertices, &record_id](uint64_t key, roaring::Roaring &bitmap) {\n\t\t\t\t\tif (vertex_map.count(key) == 0) {\n\t\t\t\t\t\tvertices.push_back(key);\n\t\t\t\t\t\tedge_map.push_back(roaring::Roaring());\n\t\t\t\t\t\tvertex_map[key] = record_id;\n\t\t\t\t\t\trecord_id++;\n\t\t\t\t\t}\n\t\t\t\t\tedge_map[vertex_map[key]] = std::move(bitmap);\n\t\t\t});\n\n\n\t\t\t// Calculate harmonic centrality on graph.\n\t\t\tif (vertices.size() > 500) {\n\t\t\t\tauto harmonic = algorithm::hyper_ball(vertices.size(), edge_map.data());\n\t\t\t}\n\n\t\t\t// Sort the results a bit.\n\t\t\tstd::vector<size_t> sorted(harmonic.size());\n\t\t\tstd::iota(sorted.begin(), sorted.end(), 0);\n\t\t\tstd::sort(sorted.begin(), sorted.end(), [&harmonic] (const auto &a, const auto &b) {\n\t\t\t\treturn harmonic[a] > harmonic[b];\n\t\t\t});\n\n\t\t\tdone_with++;\n\t\t\tfloat percent = ((float)done_with / all_files.size()) * 100.0f;\n\t\t\tfloat elapsed_milliseconds = prof.get();\n\t\t\tsize_t items_left = all_files.size() - done_with;\n\t\t\tfloat 
milliseconds_per_file = elapsed_milliseconds/done_with;\n\t\t\tfloat milliseconds_left = milliseconds_per_file * items_left;\n\t\t\tfloat hours_left = milliseconds_left / (1000.0f * 3600.0f);\n\t\t\tstd::cout << \"done with \" << done_with << \" out of \" << all_files.size() << \" (\" <<\n\t\t\t\tpercent << \"% done) time left: \" << hours_left << \" hours\"<< std::endl;\n\t\t}\n\n\t\treturn 0;*/\n\n\t\t// load the file\n\t\tstd::string content = file::cat(\"multiple_domains.tsv\");\n\t\tstd::vector<std::string> lines;\n\t\tboost::split(lines, content, boost::is_any_of(\"\\n\"));\n\t\tstd::vector<std::vector<std::string>> csv_data;\n\t\tfor (auto line : lines) {\n\t\t\tstd::vector<std::string> cols;\n\t\t\tboost::split(cols, line, boost::is_any_of(\"\\t\"));\n\t\t\tif (cols.size() > 1) {\n\t\t\t\tif (URL(cols[1]).host_hash() == URL(\"http://abc13.com\").host_hash()) {\n\t\t\t\t\tcsv_data.push_back(cols);\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\n\t\tprofiler::instance prof_load(\"load\");\n\t\t//std::ifstream infile(\"/mnt/5/full_text/internal_links/3492248666075096845.data\", std::ios::binary);\n\t\tstd::ifstream infile(\"/mnt/6/full_text/internal_links/12854855988816217414.data\", std::ios::binary);\n\t\tstd::string infile_data(std::istreambuf_iterator<char>(infile), {});\n\t\tinfile.close();\n\t\tstd::istringstream reader(infile_data);\n\t\tindexer::index<indexer::value_record> idx(&reader, 1000);\n\t\tprof_load.stop();\n\n\t\tprofiler::instance prof(\"make vertices\");\n\n\t\tstd::vector<uint64_t> vertices;\n\t\tstd::map<uint64_t, uint64_t> vertex_map;\n\n\t\tsize_t record_id = 0;\n\t\tfor (const auto &record : idx.records()) {\n\t\t\tvertices.push_back(record.m_value);\n\t\t\tvertex_map[record.m_value] = record_id;\n\t\t\trecord_id++;\n\t\t}\n\n\t\tstd::vector<roaring::Roaring> edge_map(vertices.size());\n\n\t\tidx.for_each([&edge_map, &vertex_map, &vertices, &record_id](uint64_t key, roaring::Roaring &bitmap) {\n\t\t\t\tif (vertex_map.count(key) == 0) 
{\n\t\t\t\t\tvertices.push_back(key);\n\t\t\t\t\tedge_map.push_back(roaring::Roaring());\n\t\t\t\t\tvertex_map[key] = record_id;\n\t\t\t\t\trecord_id++;\n\t\t\t\t}\n\t\t\t\tedge_map[vertex_map[key]] = std::move(bitmap);\n\t\t});\n\n\t\tprof.stop();\n\t\tprofiler::instance prof2(\"run hyper_ball\");\n\n\t\tauto harmonic = algorithm::hyper_ball(vertices.size(), edge_map.data());\n\n\t\tprof2.stop();\n\n\t\tprof_total.stop();\n\n\t\tstd::vector<size_t> sorted(harmonic.size());\n\t\tstd::iota(sorted.begin(), sorted.end(), 0);\n\t\tstd::sort(sorted.begin(), sorted.end(), [&harmonic] (const auto &a, const auto &b) {\n\t\t\treturn harmonic[a] > harmonic[b];\n\t\t});\n\t\tstd::map<uint64_t, double> harmonic_by_url;\n\t\tfor (size_t i = 0; i < harmonic.size(); i++) {\n\t\t\tharmonic_by_url[vertices[sorted[i]]] = harmonic[sorted[i]] / vertices.size();\n\t\t}\n\n\t\tfor (auto row : csv_data) {\n\t\t\tuint64_t url_hash = stoull(row[0]);\n\t\t\tdouble harmonic = harmonic_by_url[url_hash];\n\t\t\tstd::cout << row[0] << \"\\t\" << row[1] << \"\\t\" << harmonic << std::endl;\n\t\t}\n\n\t\t/*\n\t\tprofiler::instance prof_load(\"load\");\n\t\t//std::ifstream infile(\"/mnt/5/full_text/internal_links/3492263685688109621.data\", std::ios::binary);\n\t\t//std::ifstream infile(\"/mnt/5/full_text/internal_links/3492528524383210893.data\", std::ios::binary);\n\t\t//std::ifstream infile(\"/mnt/0/full_text/internal_links/7131549202223940368.data\", std::ios::binary);\n\t\tstd::ifstream infile(\"/mnt/0/full_text/internal_links/10401139885298228528.data\", std::ios::binary);\n\t\tstd::string infile_data(std::istreambuf_iterator<char>(infile), {});\n\t\tinfile.close();\n\t\tstd::istringstream reader(infile_data);\n\t\tindexer::index<indexer::value_record> idx(&reader, 1000);\n\t\tprof_load.stop();\n\n\t\tprofiler::instance prof(\"make vertices\");\n\n\t\tstd::vector<uint64_t> vertices;\n\t\tstd::map<uint64_t, uint64_t> vertex_map;\n\n\t\tsize_t record_id = 0;\n\t\tfor (const auto &record : 
idx.records()) {\n\t\t\tvertices.push_back(record.m_value);\n\t\t\tvertex_map[record.m_value] = record_id;\n\t\t\trecord_id++;\n\t\t}\n\n\t\tstd::vector<roaring::Roaring> edge_map(vertices.size());\n\n\t\tidx.for_each([&edge_map, &vertex_map, &vertices, &record_id](uint64_t key, roaring::Roaring &bitmap) {\n\t\t\t\tif (vertex_map.count(key) == 0) {\n\t\t\t\t\tvertices.push_back(key);\n\t\t\t\t\tedge_map.push_back(roaring::Roaring());\n\t\t\t\t\tvertex_map[key] = record_id;\n\t\t\t\t\trecord_id++;\n\t\t\t\t}\n\t\t\t\tedge_map[vertex_map[key]] = std::move(bitmap);\n\t\t});\n\n\t\tprof.stop();\n\t\tprofiler::instance prof2(\"run hyper_ball\");\n\n\t\tauto harmonic = algorithm::hyper_ball(vertices.size(), edge_map.data());\n\n\t\tprof2.stop();\n\n\t\tprof_total.stop();\n\n\t\tstd::vector<size_t> sorted(harmonic.size());\n\t\tstd::iota(sorted.begin(), sorted.end(), 0);\n\t\tstd::sort(sorted.begin(), sorted.end(), [&harmonic] (const auto &a, const auto &b) {\n\t\t\treturn harmonic[a] > harmonic[b];\n\t\t});\n\n\t\t//for (size_t i = 0; i < harmonic.size(); i++) {\n\t\t\t//std::cout << \"vertex: \" << vertices[sorted[i]] << \" has harmonic: \" << harmonic[sorted[i]] << std::endl;\n\t\t//}\n\t\t*/\n\t} else if (arg == \"--url-server\") {\n\t\t// Spin up a simple url server.\n\n\t\thash_table2::hash_table ht(\"all_urls\", 1019, 1000000, \"/slow_data\");\n\n\t\thttp::server url_server([&ht](auto request) {\n\t\t\thttp::response res;\n\n\t\t\tURL url = request.url();\n\t\t\tauto query = url.query();\n\t\t\tURL find_url(parser::urldecode(query[\"url\"]));\n\n\t\t\tsize_t ver;\n\t\t\tconst auto find_str = ht.find(find_url.hash(), ver);\n\n\t\t\tif (find_str == \"\") {\n\t\t\t\tres.code(404);\n\t\t\t\tres.body(\"Not found 404\");\n\t\t\t} else {\n\t\t\t\tres.code(200);\n\t\t\t\tres.body(find_str);\n\t\t\t}\n\n\t\t\treturn res;\n\t\t});\n\t} else {\n\t\thelp();\n\t}\n\n\tlogger::join_logger_thread();\n\n\treturn 0;\n}\n"
  },
  {
    "path": "src/algorithm/algorithm.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include \"algorithm.h\"\n#include \"profiler/profiler.h\"\n#include <iostream>\n#include <set>\n#include <numeric>\n#include <map>\n#include <math.h>\n#include <cassert>\n#include <future>\n#include <cstring>\n\nnamespace algorithm {\n\n\t/*\n\t\tReturns partitions with indices that are smaller than the values in the dims vector.\n\t\tFor example:\n\t\tdims = {2,2} gives {0,0}, {1,0}, {0,1}, {1,1}\n\t\tdims = {2,3} gives {0,0}, {1,0}, {0,1}, {1,1}, {0,2}, {1,2}\n\t*/\n\tstd::vector<std::vector<int>> incremental_partitions(const std::vector<int> &dims, size_t limit) {\n\t\tstd::vector<std::vector<int>> res;\n\t\tstd::set<std::vector<int>> uniq;\n\t\tstd::vector<int> initial(dims.size(), 
0);\n\t\tres.push_back(initial);\n\t\tuniq.insert(initial);\n\n\t\tfor (size_t j = 0; j < res.size(); j++) {\n\t\t\tstd::vector<int> vec = res[j];\n\t\t\tfor (size_t i = 0; i < vec.size(); i++) {\n\t\t\t\tif (vec[i] < dims[i]-1) {\n\t\t\t\t\tstd::vector<int> copy(vec);\n\t\t\t\t\tcopy[i]++;\n\n\t\t\t\t\tres.push_back(copy);\n\t\t\t\t\tuniq.insert(copy);\n\t\t\t\t\tif (uniq.size() >= limit) break;\n\t\t\t\t}\n\t\t\t}\n\t\t\tif (uniq.size() >= limit) break;\n\t\t}\n\n\t\tstd::vector<std::vector<int>> ret(uniq.begin(), uniq.end());\n\t\tsort(ret.begin(), ret.end(), [](const std::vector<int> &a, const std::vector<int> &b) {\n\t\t\tint sum1 = accumulate(a.begin(), a.end(), 0);\n\t\t\tint sum2 = accumulate(b.begin(), b.end(), 0);\n\t\t\tif (sum1 == sum2) {\n\t\t\t\tint max1 = *max_element(a.begin(), a.end());\n\t\t\t\tint max2 = *max_element(b.begin(), b.end());\n\t\t\t\tif (max1 == max2) {\n\t\t\t\t\treturn b < a;\n\t\t\t\t}\n\t\t\t\treturn max1 < max2;\n\t\t\t}\n\t\t\treturn sum1 < sum2;\n\t\t});\n\t\treturn ret;\n\t}\n\n\t/*\n\t\tCalculates the harmonic centrality for vertices and edges. 
The returning vector has the harmonic centrality for vertex i at position i.\n\t\tThe depth parameter is the maximum level to traverse in the neighbour tree.\n\t\tThe edges set contains pairs of edges (from vertex, to vertex)\n\t*/\n\n\t/*\n\t * This is the inner outer loop for calculating harmonic centrality.\n\t * */\n\tstd::vector<double> harmonic_centrality_subvector(size_t vlen, const std::vector<uint32_t> *edge_map,\n\t\t\tsize_t depth, size_t start, size_t len) {\n\n\t\tchar *all = new char[vlen];\n\t\tuint32_t *level1 = new uint32_t[vlen];\n\t\tuint32_t *level2 = new uint32_t[vlen];\n\n\t\tuint32_t *levels[2] = {level1, level2};\n\t\tsize_t level_len[2] = {0, 0};\n\n\t\tstd::vector<double> harmonics;\n\n\t\tprofiler::instance prof(\"Timetaker\");\n\t\tfor (size_t i = start; i < start + len; i++) {\n\t\t\tconst uint32_t vertex = i;\n\n\t\t\tlevel_len[0] = 0;\n\t\t\tlevel_len[1] = 0;\n\t\t\tmemset(all, 0, vlen);\n\n\t\t\tlevels[0][0] = vertex;\n\t\t\tlevel_len[0]++;\n\t\t\tall[vertex] = 1;\n\n\t\t\tdouble harmonic = 0.0;\n\t\t\t/*\n\t\t\t\tIf we can assume the average number of incoming edges per vertex to be constant these loops should be O(1) in n.\n\t\t\t\tExample, if we have n = 10 000 000 vertices and 10 inbound edges on each vertex these loops should be\n\t\t\t\t(first loop is depth) X (worst case second loop is 10^depth) X (inner loop is 10)\n\t\t\t\tdepth * 10^depth * 10\n\t\t\t\tindependent of n\n\t\t\t*/\n\t\t\tsize_t last_level = 0;\n\t\t\tsize_t cur_level = 1;\n\t\t\tfor (size_t level = 1; level <= depth; level++) {\n\t\t\t\t//for (const uint32_t &v : level[level - 1]) {\n\t\t\t\tfor (size_t j = 0; j < level_len[last_level]; j++) {\n\t\t\t\t\tconst uint32_t v = levels[last_level][j];\n\t\t\t\t\tfor (const uint32_t &edge : edge_map[v]) {\n\t\t\t\t\t\tif (!all[edge]) {\n\t\t\t\t\t\t\tlevels[cur_level][level_len[cur_level]++] = edge;\n\t\t\t\t\t\t\tall[edge] = 1;\n\t\t\t\t\t\t}\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t\tif (level_len[cur_level] == 0) 
break;\n\t\t\t\tharmonic += (double)level_len[cur_level] / level;\n\t\t\t\t// Swap levels\n\t\t\t\tlevel_len[last_level] = 0;\n\t\t\t\tsize_t tmp = last_level;\n\t\t\t\tlast_level = cur_level;\n\t\t\t\tcur_level = tmp;\n\t\t\t}\n\n\t\t\tharmonics.push_back(harmonic);\n\t\t}\n\n\t\tdelete [] level2;\n\t\tdelete [] level1;\n\t\tdelete [] all;\n\n\t\treturn harmonics;\n\t}\n\n\tstd::vector<double> harmonic_centrality(size_t vlen, const std::set<std::pair<uint32_t, uint32_t>> &edges, size_t depth) {\n\t\tstd::vector<double> harmonics;\n\n\t\tstd::vector<uint32_t> *edge_map = new std::vector<uint32_t>[vlen];\n\t\tfor (const auto &edge : edges) {\n\t\t\t/*\n\t\t\tsecond -> first mapping because we want to traverse the edges in the opposite direction of the edge. Incoming edges should increase\n\t\t\tharmonic centrality of vertex.\n\t\t\t*/\n\t\t\tedge_map[edge.second].push_back(edge.first);\n\t\t}\n\n\t\tstd::vector<double> ret = harmonic_centrality(vlen, edge_map, depth);\n\n\t\tdelete [] edge_map;\n\n\t\treturn ret;\n\t}\n\n\tstd::vector<double> harmonic_centrality(size_t vlen, const std::vector<uint32_t> *edge_map, size_t depth) {\n\t\treturn harmonic_centrality_subvector(vlen, edge_map, depth, 0, vlen);\n\t}\n\n\tstd::vector<double> harmonic_centrality_threaded(size_t vlen, const std::set<std::pair<uint32_t, uint32_t>> &edges, size_t depth,\n\t\t\tsize_t num_threads) {\n\n\t\tstd::vector<uint32_t> *edge_map = new std::vector<uint32_t>[vlen];\n\t\tfor (const auto &edge : edges) {\n\t\t\t/*\n\t\t\tsecond -> first mapping because we want to traverse the edges in the opposite direction of the edge. 
Incoming edges should increase\n\t\t\tharmonic centrality of vertex.\n\t\t\t*/\n\t\t\tedge_map[edge.second].push_back(edge.first);\n\t\t}\n\n\t\tstd::vector<double> ret = harmonic_centrality_threaded(vlen, edge_map, depth, num_threads);\n\n\t\tdelete [] edge_map;\n\n\t\treturn ret;\n\t}\n\n\tstd::vector<double> harmonic_centrality_threaded(size_t vlen, const std::vector<uint32_t> *edge_map, size_t depth, size_t num_threads) {\n\n\t\tassert(vlen >= num_threads);\n\n\t\tstd::vector<std::future<std::vector<double>>> threads;\n\n\t\t// Split the vertices into several vectors.\n\t\tconst size_t max_len = ceil((double)vlen / num_threads);\n\t\tfor (size_t i = 0; i < vlen; i += max_len) {\n\t\t\tconst size_t len = std::min(max_len, vlen - i);\n\t\t\tthreads.emplace_back(std::async(std::launch::async, harmonic_centrality_subvector, vlen, edge_map, depth, i, len));\n\t\t}\n\n\t\tstd::vector<double> harmonic;\n\t\tfor (auto &thread : threads) {\n\t\t\tstd::vector<double> part = thread.get();\n\t\t\tharmonic.insert(harmonic.end(), part.begin(), part.end());\n\t\t}\n\n\t\treturn harmonic;\n\t}\n\n\tstd::vector<uint32_t> *set_to_edge_map(size_t n, const std::set<std::pair<uint32_t, uint32_t>> &edges) {\n\t\tstd::vector<uint32_t> *edge_map = new std::vector<uint32_t>[n];\n\t\tfor (const auto &edge : edges) {\n\t\t\t/*\n\t\t\tsecond -> first mapping because we want to traverse the edges in the opposite direction of the edge. Incoming edges should increase\n\t\t\tharmonic centrality of vertex.\n\t\t\t*/\n\t\t\tedge_map[edge.second].push_back(edge.first);\n\t\t}\n\n\t\treturn edge_map;\n\t}\n\n}\n"
  },
  {
    "path": "src/algorithm/algorithm.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include <vector>\n#include <set>\n#include <unordered_map>\n#include <cstdint>\n\nnamespace algorithm {\n\n\ttemplate<class T>\n\tvoid vector_chunk(const std::vector<T> &vec, size_t chunk_size, std::vector<std::vector<T>> &dest) {\n\t\tstd::vector<T> chunk;\n\t\tfor (T item : vec) {\n\t\t\tchunk.push_back(item);\n\t\t\tif (chunk.size() == chunk_size) {\n\t\t\t\tdest.push_back(chunk);\n\t\t\t\tchunk.clear();\n\t\t\t}\n\t\t}\n\t\tif (chunk.size()) {\n\t\t\tdest.push_back(chunk);\n\t\t}\n\t}\n\n\tstd::vector<std::vector<int>> incremental_partitions(const std::vector<int> &dims, size_t limit);\n\n\t/*\n\t\tCalculates the harmonic centrality for vertices and edges. 
The returning vector has the harmonic centrality for vertex i at position i.\n\t\tThe depth parameter is the maximum level to traverse in the neighbour tree.\n\t\tThe edges set contains pairs of edges (from vertex, to vertex)\n\t*/\n\tstd::vector<double> harmonic_centrality(size_t vlen, const std::set<std::pair<uint32_t, uint32_t>> &edges, size_t depth);\n\tstd::vector<double> harmonic_centrality(size_t vlen, const std::vector<uint32_t> *edge_map, size_t depth);\n\tstd::vector<double> harmonic_centrality_threaded(size_t vlen, const std::set<std::pair<uint32_t, uint32_t>> &edges, size_t depth,\n\t\t\tsize_t num_threads);\n\tstd::vector<double> harmonic_centrality_threaded(size_t vlen, const std::vector<uint32_t> *edge_map,\n\t\t\tsize_t depth, size_t num_threads);\n\n\tstd::vector<uint32_t> *set_to_edge_map(size_t n, const std::set<std::pair<uint32_t, uint32_t>> &edges);\n}\n"
  },
  {
    "path": "src/algorithm/bloom_filter.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include \"bloom_filter.h\"\n#include \"algorithm/hash.h\"\n#include <cmath>\n#include <cstring>\n#include <fstream>\n\nnamespace algorithm {\n\n\tbloom_filter::bloom_filter()\n\t{\n\t\tm_bitmap = std::make_unique<uint64_t[]>(m_dim);\n\t\tfor (size_t i = 0; i < m_dim; i++) {\n\t\t\tm_bitmap[i] = 0x0ull;\n\t\t}\n\t}\n\n\t// Dim should be a prime number..\n\tbloom_filter::bloom_filter(size_t dim)\n\t: m_dim(dim), m_bitlen(dim * 64)\n\t{\n\t\tm_bitmap = std::make_unique<uint64_t[]>(m_dim);\n\t\tfor (size_t i = 0; i < m_dim; i++) {\n\t\t\tm_bitmap[i] = 0x0ull;\n\t\t}\n\t}\n\n\tvoid bloom_filter::insert(const std::string &item) {\n\t\tfor (size_t i = 0; i < m_seeds.size(); i++) {\n\t\t\tconst uint64_t hash = algorithm::hash_with_seed(item, 
m_seeds[i]);\n\t\t\tset_bit(hash);\n\t\t}\n\t}\n\n\tvoid bloom_filter::insert(uint64_t item) {\n\t\tinsert(std::to_string(item));\n\t}\n\n\tvoid bloom_filter::insert_many(std::vector<uint64_t> &items) {\n\n\t\tstd::vector<size_t> hashes;\n\t\tfor (const auto &item : items) {\n\t\t\tconst auto str_item = std::to_string(item);\n\t\t\tfor (size_t i = 0; i < m_seeds.size(); i++) {\n\t\t\t\tconst uint64_t hash = algorithm::hash_with_seed(str_item, m_seeds[i]);\n\t\t\t\thashes.push_back(hash);\n\t\t\t}\n\t\t}\n\n\t\tstd::lock_guard guard(m_mutex);\n\t\tfor (const auto &hash : hashes) {\n\t\t\tset_bit(hash);\n\t\t}\n\t}\n\n\tconst char * bloom_filter::data() const {\n\t\treturn (char *)m_bitmap.get();\n\t}\n\n\tbool bloom_filter::exists(const std::string &item) const {\n\t\tfor (size_t i = 0; i < m_seeds.size(); i++) {\n\t\t\tconst uint64_t hash = algorithm::hash_with_seed(item, m_seeds[i]);\n\t\t\tif (!get_bit(hash)) return false;\n\t\t}\n\t\treturn true;\n\t}\n\n\tbool bloom_filter::exists(uint64_t data) const {\n\t\treturn exists(std::to_string(data));\n\t}\n\n\tvoid bloom_filter::read(char *data, size_t len) {\n\t\tmemcpy((char *)m_bitmap.get(), data, len);\n\t}\n\n\tvoid bloom_filter::merge(const bloom_filter &other) {\n\t\tfor (size_t i = 0; i < m_dim; i++) {\n\t\t\tm_bitmap[i] |= other.m_bitmap[i];\n\t\t}\n\t}\n\n\tdouble bloom_filter::saturation() {\n\t\treturn 1.0;\n\t}\n\n\tvoid bloom_filter::read_file(const std::string &file_name) {\n\t\tstd::ifstream infile(file_name, std::ios::binary);\n\t\tinfile.read((char *)m_bitmap.get(), size());\n\t}\n\n\tvoid bloom_filter::write_file(const std::string &file_name) const {\n\t\tstd::ofstream outfile(file_name, std::ios::binary | std::ios::trunc);\n\t\toutfile.write((char *)m_bitmap.get(), size());\n\t}\n\n\tvoid bloom_filter::set_bit(size_t bit) {\n\t\tconst size_t x = bit % m_bitlen;\n\t\tconst size_t pos = static_cast<size_t>(x / 64);\n\t\tconst size_t bit_in_pos = x % 64;\n\t\tm_bitmap[pos] = m_bitmap[pos] | (0x1ull 
<< bit_in_pos);\n\t}\n\n\tbool bloom_filter::get_bit(size_t bit) const {\n\t\tconst size_t x = bit % m_bitlen;\n\t\tconst size_t pos = static_cast<size_t>(x / 64);\n\t\tconst size_t bit_in_pos = x % 64;\n\t\treturn (m_bitmap[pos] & (0x1ull << bit_in_pos)) >> bit_in_pos;\n\t}\n\n}\n"
  },
  {
    "path": "src/algorithm/bloom_filter.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include <iostream>\n#include <memory>\n#include <mutex>\n#include \"roaring/roaring64map.hh\"\n\nnamespace algorithm {\n\n\tclass bloom_filter {\n\t\tpublic:\n\t\t\tbloom_filter();\n\t\t\tbloom_filter(size_t dim);\n\n\t\t\tvoid insert(const std::string &item);\n\t\t\tvoid insert(uint64_t item);\n\t\t\tvoid insert_many(std::vector<uint64_t> &items);\n\t\t\tbool exists(const std::string &item) const;\n\t\t\tbool exists(uint64_t data) const;\n\t\t\tsize_t size() const { return m_dim * sizeof(uint64_t); }\n\t\t\tconst char *data() const;\n\t\t\tvoid read(char *data, size_t len);\n\t\t\tvoid merge(const bloom_filter &other);\n\t\t\tdouble saturation();\n\n\t\t\tvoid read_file(const std::string &file_name);\n\t\t\tvoid write_file(const 
std::string &file_name) const;\n\n\t\tprivate:\n\n\t\t\tstd::unique_ptr<uint64_t[]> m_bitmap;\n\n\t\t\t#ifdef IS_TEST\n\t\t\tsize_t m_dim = 2695797;\n\t\t\t#else\n\t\t\tsize_t m_dim = 4043696581;\n\t\t\t#endif\n\n\t\t\tsize_t m_bitlen = m_dim * 64;\n\n\t\t\t// some random prime numbers\n\t\t\tstd::array<uint64_t, 10> m_seeds = {3339675911, 2695798769, 2695831867, 2695857877, 2695879891, 2147483647, 2695922687, 2695935521,\n\t\t\t\t\t3339689791, 3339703163};\n\n\t\t\tstd::mutex m_mutex;\n\n\t\t\tvoid set_bit(size_t bit);\n\t\t\tbool get_bit(size_t bit) const;\n\n\t};\n\n}\n"
  },
  {
    "path": "src/algorithm/hash.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include <cstdint>\n\n#include \"hash.h\"\n\nnamespace algorithm {\n\n\t/*\n\t * Murmur hash by Austin Appleby\n\t * Taken from here https://sites.google.com/site/murmurhash/\n\t * */\n\tsize_t murmur_hash(const char *key, size_t len, size_t seed) {\n\t\tconst uint64_t m = 0xc6a4a7935bd1e995ull;\n\t\tconst int r = 47;\n\n\t\tuint64_t h = seed ^ (len * m);\n\n\t\tconst uint64_t * data = (const uint64_t *)key;\n\t\tconst uint64_t * end = data + (len/8);\n\n\t\twhile(data != end) {\n\t\t\tuint64_t k = *data++;\n\n\t\t\tk *= m; \n\t\t\tk ^= k >> r; \n\t\t\tk *= m; \n\t\t\t\n\t\t\th ^= k;\n\t\t\th *= m; \n\t\t}\n\n\t\tconst unsigned char * data2 = (const unsigned char*)data;\n\n\t\tswitch(len & 7) {\n\t\t\tcase 7: h ^= uint64_t(data2[6]) << 48;\n\t\t\tcase 
6: h ^= uint64_t(data2[5]) << 40;\n\t\t\tcase 5: h ^= uint64_t(data2[4]) << 32;\n\t\t\tcase 4: h ^= uint64_t(data2[3]) << 24;\n\t\t\tcase 3: h ^= uint64_t(data2[2]) << 16;\n\t\t\tcase 2: h ^= uint64_t(data2[1]) << 8;\n\t\t\tcase 1: h ^= uint64_t(data2[0]);\n\t\t\t\th *= m;\n\t\t};\n \n\t\th ^= h >> r;\n\t\th *= m;\n\t\th ^= h >> r;\n\n\t\treturn h;\n\t}\n\n\tsize_t hash(const std::string &str) {\n\t\tstatic const size_t seed = 0xc70f6907ul;\n\t\treturn murmur_hash(str.c_str(), str.size(), seed);\n\t}\n\n\tsize_t hash_with_seed(const std::string &str, size_t seed) {\n\t\treturn murmur_hash(str.c_str(), str.size(), seed);\n\t}\n\n\n}\n"
  },
  {
    "path": "src/algorithm/hash.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include <string>\n\nnamespace algorithm {\n\n\tsize_t hash(const std::string &str);\n\tsize_t hash_with_seed(const std::string &str, size_t seed);\n\n}\n"
  },
  {
    "path": "src/algorithm/hyper_ball.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include <vector>\n#include <cstdint>\n#include \"hyper_log_log.h\"\n#include \"profiler/profiler.h\"\n#include \"logger/logger.h\"\n#include <future>\n\nnamespace algorithm {\n\n\ttemplate <typename edge_map_type>\n\tbool hyper_ball_worker(double t, size_t v_begin, size_t v_end, const edge_map_type &edge_map,\n\t\t\tstd::vector<hyper_log_log> &c, std::vector<hyper_log_log> &a, std::vector<double> &harmonic) {\n\n\t\tbool counter_changed = false;\n\t\tfor (uint32_t v = v_begin; v < v_end; v++) {\n\t\t\ta[v] = c[v];\n\t\t\tfor (const uint32_t &w : edge_map[v]) {\n\t\t\t\ta[v] += c[w];\n\t\t\t}\n\n\t\t\t// a[v] is t + 1 and c[v] is at t\n\t\t\tconst size_t counter_diff = a[v].count() - c[v].count();\n\t\t\tif (counter_diff) 
{\n\t\t\t\tcounter_changed = true;\n\t\t\t\tharmonic[v] += (1.0 / (t + 1.0)) * counter_diff;\n\t\t\t}\n\t\t}\n\t\tfor (uint32_t v = v_begin; v < v_end; v++) {\n\t\t\tc[v] = a[v];\n\t\t}\n\t\treturn counter_changed;\n\t}\n\n\t/*\n\t * n is the number of vertices in graph.\n\t * edge_map is pointing to a static array of size n.\n\t * each item in edge_map is a vector of variable size.\n\t * each vector edge_map[m] contains values between 0 and n-1 indicating edge between m and edge_map[m].\n\t * NOTE direction of edge in edge map has to be EDGE_FROM -> EDGE_TO.\n\t * so for vertex m, n = edge_map[m] indicates directed edge from n to m\n\t * */\n\ttemplate <typename edge_map_type>\n\tstd::vector<double> hyper_ball(uint32_t n, const edge_map_type &edge_map) {\n\n\t\tif (n == 0) return {};\n\n\t\tconst size_t num_threads = std::min(32, (int)n);\n\t\tconst size_t items_per_thread = n / num_threads;\n\t\tstd::vector<hyper_log_log> c(n, hyper_log_log(10));\n\t\tstd::vector<hyper_log_log> a(n, hyper_log_log(10));\n\t\tstd::vector<double> harmonic(n, 0.0);\n\n\t\tfor (uint32_t v = 0; v < n; v++) {\n\t\t\tc[v].insert(v);\n\t\t}\n\n\t\tdouble t = 0.0;\n\t\twhile (true) {\n\t\t\tstd::vector<std::future<bool>> threads;\n\t\t\tfor (size_t i = 0; i < num_threads; i++) {\n\t\t\t\tconst size_t v_begin = i * items_per_thread;\n\t\t\t\tconst size_t v_end = (i == num_threads - 1) ? n : (i + 1) * items_per_thread;\n\t\t\t\tauto fut = std::async(hyper_ball_worker<edge_map_type>, t, v_begin, v_end, std::cref(edge_map), std::ref(c), std::ref(a), std::ref(harmonic));\n\t\t\t\tthreads.emplace_back(std::move(fut));\n\t\t\t}\n\n\t\t\tbool should_continue = false;\n\t\t\tfor (auto &fut : threads) {\n\t\t\t\tshould_continue = fut.get() || should_continue;\n\t\t\t}\n\n\t\t\tt += 1.0;\n\t\t\tif (!should_continue) break;\n\t\t}\n\n\t\treturn harmonic;\n\t}\n\n}\n"
  },
  {
    "path": "src/algorithm/hyper_log_log.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include <numeric>\n#include \"hyper_log_log.h\"\n#include \"algorithm/hash.h\"\n\nnamespace algorithm {\n\n\thyper_log_log::hyper_log_log(size_t b)\n\t: m_b(b), m_len(1ull << m_b), m_alpha(0.7213/(1.0 + 1.079/m_len)) {\n\t\tm_M.resize(m_len);\n\t\tstd::fill(m_M.begin(), m_M.end(), 0);\n\t}\n\n\thyper_log_log::hyper_log_log(const char *registers, size_t b)\n\t: m_b(b), m_len(1ull << m_b), m_alpha(0.7213/(1.0 + 1.079/m_len)) {\n\t\tm_M.resize(m_len);\n\t\tmemcpy(m_M.data(), registers, m_len);\n\t}\n\n\thyper_log_log::hyper_log_log(const hyper_log_log &other)\n\t: m_b(other.m_b), m_len(other.m_len), m_alpha(other.m_alpha) {\n\t\tm_M.resize(m_len);\n\t\tstd::copy(other.m_M.cbegin(), other.m_M.cend(), 
m_M.begin());\n\t}\n\n\thyper_log_log::hyper_log_log(hyper_log_log &&other)\n\t: m_b(other.m_b), m_len(other.m_len), m_alpha(other.m_alpha) {\n\t\tm_M.swap(other.m_M);\n\t}\n\n\thyper_log_log::~hyper_log_log() {\n\t}\n\n\tvoid hyper_log_log::insert(size_t v) {\n\t\tsize_t x = algorithm::hash(std::to_string(v));\n\t\tsize_t j = x >> (64-m_b);\n\t\tm_M[j] = std::max(m_M[j], leading_zeros_plus_one(x << m_b));\n\t}\n\n\tsize_t hyper_log_log::count() const {\n\t\tdouble Z = 0.0;\n\t\tfor (size_t j = 0; j < m_len; j++) {\n\t\t\tZ += 1.0 / (1ull << m_M[j]);\n\t\t}\n\t\tdouble E = m_alpha * m_len * m_len / Z;\n\n\t\t// Only small range correction implemented since we use 64 bit hash.\n\t\tif (E <= (5.0/2.0) * m_len) {\n\t\t\tsize_t V = num_zero_registers();\n\t\t\tif (V != 0) {\n\t\t\t\tE = m_len * log((double)m_len / V);\n\t\t\t}\n\t\t}\n\n\t\treturn (size_t)E;\n\t}\n\n\tvoid hyper_log_log::reset() {\n\t\tstd::fill(m_M.begin(), m_M.end(), 0);\n\t}\n\n\tchar hyper_log_log::leading_zeros_plus_one(size_t x) const {\n\t\tsize_t num_zeros = 1;\n\t\tfor (size_t i = 0; i < 64; i++) {\n\t\t\tif ((x >> (64 - 1 - i)) & 0x1ull) return num_zeros;\n\t\t\tnum_zeros++;\n\t\t}\n\t\treturn num_zeros;\n\t}\n\n\tsize_t hyper_log_log::num_zero_registers() const {\n\t\treturn std::transform_reduce(m_M.begin(), m_M.end(), 0,\n\t\t\t[](int a, int b) { return a + b; },\n\t\t\t[](char a) { return a == 0 ? 
1 : 0; });\n\t}\n\n\tdouble hyper_log_log::error_bound() const {\n\t\tdouble stdd = 1.04 / sqrt((double)m_len);\n\t\treturn stdd * 3; // Gives 99% confidence\n\t}\n\n\thyper_log_log hyper_log_log::operator +(const hyper_log_log &hl) const {\n\t\thyper_log_log res(*this);\n\t\tstd::transform(std::begin(m_M), std::end(m_M), std::begin(hl.m_M), std::begin(res.m_M), [] (char a, char b) { return std::max(a, b); });\n\n\t\treturn res;\n\t}\n\n\thyper_log_log &hyper_log_log::operator +=(const hyper_log_log &hl) {\n\t\tstd::transform(std::begin(m_M), std::end(m_M), std::begin(hl.m_M), std::begin(m_M), [] (char a, char b) { return std::max(a, b); });\n\t\treturn *this;\n\t}\n\n\thyper_log_log &hyper_log_log::operator =(const hyper_log_log &other) {\n\t\tstd::copy(other.m_M.cbegin(), other.m_M.cend(), m_M.begin());\n\t\treturn *this;\n\t}\n\n}\n"
  },
  {
    "path": "src/algorithm/hyper_log_log.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include <cmath>\n#include <cstring>\n#include <algorithm>\n#include <iostream>\n#include <vector>\n\nnamespace algorithm {\n\n\t/*\n\t * Implementation of the hyper log log algorithm as described by Flajolet1 et al.\n\t * http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf\n\t *\n\t * Using 64 bit hash instead of 32bit.\n\t * */\n\n\tclass hyper_log_log {\n\n\t\tpublic:\n\n\t\t\t/*\n\t\t\t * initializes with given b parameter. 
size of data structure will be 2^b bytes.\n\t\t\t * */\n\t\t\thyper_log_log(size_t b = 15);\n\t\t\thyper_log_log(const char *registers, size_t b = 15);\n\t\t\thyper_log_log(const hyper_log_log &other);\n\t\t\thyper_log_log(hyper_log_log &&other);\n\t\t\t~hyper_log_log();\n\n\t\t\tvoid insert(size_t v);\n\t\t\tsize_t count() const;\n\t\t\tdouble error_bound() const;\n\t\t\tvoid reset();\n\n\t\t\tconst char *data() const { return m_M.data(); };\n\t\t\tchar *data() { return m_M.data(); };\n\t\t\tint b() const { return m_b; }\n\t\t\tsize_t data_size() const { return m_len; };\n\n\t\t\thyper_log_log operator +(const hyper_log_log &hl) const;\n\t\t\thyper_log_log &operator +=(const hyper_log_log &hl);\n\t\t\thyper_log_log &operator =(const hyper_log_log &other);\n\n\t\t\tchar leading_zeros_plus_one(size_t x) const;\n\n\t\tprivate:\n\t\t\t\n\t\t\tstd::vector<char> m_M; // Points to registers.\n\t\t\tconst int m_b;\n\t\t\tconst size_t m_len;\n\t\t\tconst double m_alpha;\n\n\t\t\tsize_t num_zero_registers() const;\n\n\t};\n\n}\n"
  },
  {
    "path": "src/algorithm/intersection.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include <functional>\n\n#include \"intersection.h\"\n\nnamespace algorithm {\n\n\troaring::Roaring intersection(const std::vector<roaring::Roaring> &input) {\n\n\t\tif (input.size() == 0) return roaring::Roaring();\n\n\t\troaring::Roaring intersection = input[0];\n\n\t\tfor (size_t i = 1; i < input.size(); i++) {\n\t\t\tintersection &= input[i];\n\t\t}\n\n\t\treturn intersection;\n\t}\n\n}\n"
  },
  {
    "path": "src/algorithm/intersection.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n#pragma once\n\n#include <vector>\n#include <memory>\n#include \"roaring/roaring.hh\"\n\nnamespace algorithm {\n\n\troaring::Roaring intersection(const std::vector<roaring::Roaring> &input);\n\n\ttemplate<typename item>\n\tstd::vector<item> intersection(const std::vector<std::vector<item>> &input,\n\t\tstd::function<void(item &a, const item &b)> sum_fun) {\n\n\t\tif (input.size() == 0) return {};\n\n\t\tsize_t shortest_vector_position = 0;\n\t\tsize_t shortest_len = SIZE_MAX;\n\t\tsize_t iter_index = 0;\n\t\tfor (const std::vector<item> &vec : input) {\n\t\t\tif (shortest_len > vec.size()) {\n\t\t\t\tshortest_len = vec.size();\n\t\t\t\tshortest_vector_position = iter_index;\n\t\t\t}\n\t\t\titer_index++;\n\t\t}\n\n\t\tstd::vector<size_t> 
positions(input.size(), 0);\n\t\tstd::vector<item> intersection;\n\n\t\twhile (positions[shortest_vector_position] < shortest_len) {\n\n\t\t\tbool all_equal = true;\n\t\t\titem value = input[shortest_vector_position][positions[shortest_vector_position]];\n\n\t\t\tsize_t iter_index = 0;\n\t\t\tfor (const std::vector<item> &vec : input) {\n\t\t\t\tconst size_t len = vec.size();\n\n\t\t\t\tsize_t *pos = &(positions[iter_index]);\n\t\t\t\twhile (*pos < len && vec[*pos] < value) {\n\t\t\t\t\t(*pos)++;\n\t\t\t\t}\n\t\t\t\tif (((*pos < len) && (value < vec[*pos])) || *pos >= len) {\n\t\t\t\t\tall_equal = false;\n\t\t\t\t\tbreak;\n\t\t\t\t} else {\n\t\t\t\t\tif (iter_index != shortest_vector_position) {\n\t\t\t\t\t\tsum_fun(value, vec[*pos]);\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t\titer_index++;\n\t\t\t}\n\t\t\tif (all_equal) {\n\t\t\t\tintersection.push_back(value);\n\t\t\t}\n\n\t\t\tpositions[shortest_vector_position]++;\n\t\t}\n\n\t\treturn intersection;\n\t}\n\n\ttemplate<typename item>\n\tstd::vector<item> intersection(const std::vector<std::unique_ptr<item[]>> &input, const std::vector<size_t> lengths) {\n\n\t\tif (input.size() == 0) return {};\n\n\t\tsize_t shortest_vector_position = 0;\n\t\tsize_t shortest_len = SIZE_MAX;\n\t\tsize_t iter_index = 0;\n\t\tfor (size_t len : lengths) {\n\t\t\tif (shortest_len > len) {\n\t\t\t\tshortest_len = len;\n\t\t\t\tshortest_vector_position = iter_index;\n\t\t\t}\n\t\t\titer_index++;\n\t\t}\n\n\t\tstd::vector<size_t> positions(input.size(), 0);\n\t\tstd::vector<item> intersection;\n\n\t\twhile (positions[shortest_vector_position] < shortest_len) {\n\n\t\t\tbool all_equal = true;\n\t\t\titem value = input[shortest_vector_position][positions[shortest_vector_position]];\n\n\t\t\tsize_t iter_index = 0;\n\t\t\tfor (const std::unique_ptr<item[]> &ptr : input) {\n\t\t\t\tconst size_t len = lengths[iter_index];\n\n\t\t\t\tsize_t *pos = &(positions[iter_index]);\n\t\t\t\twhile (*pos < len && ptr[*pos] < value) 
{\n\t\t\t\t\t(*pos)++;\n\t\t\t\t}\n\t\t\t\tif (((*pos < len) && (value < ptr[*pos])) || *pos >= len) {\n\t\t\t\t\tall_equal = false;\n\t\t\t\t\tbreak;\n\t\t\t\t} else {\n\t\t\t\t\tif (iter_index != shortest_vector_position) {\n\t\t\t\t\t\t//sum_fun(value, ptr[*pos]);\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t\titer_index++;\n\t\t\t}\n\t\t\tif (all_equal) {\n\t\t\t\tintersection.push_back(value);\n\t\t\t}\n\n\t\t\tpositions[shortest_vector_position]++;\n\t\t}\n\n\t\treturn intersection;\n\t}\n\n\ttemplate<typename item>\n\tstd::vector<item> intersection(const std::vector<std::vector<item>> &input) {\n\t\treturn intersection<item>(input, [](item &a, const item &b) {});\n\t}\n\n}\n"
  },
  {
    "path": "src/algorithm/sort.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include \"sort.h\"\n\nnamespace algorithm {\n\n}\n\n"
  },
  {
    "path": "src/algorithm/sort.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include <vector>\n#include <span>\n\nnamespace algorithm {\n\n\tnamespace sort {\n\n\t\ttemplate<typename data_record, typename F>\n\t\tvoid merge_arrays(const std::vector<data_record> &arr1, const std::vector<data_record> &arr2, F compare, std::vector<data_record> &arr3) {\n\n\t\t\tsize_t i = 0, j = 0;\n\n\t\t\twhile (i < arr1.size() && j < arr2.size()) {\n\t\t\t\tif (compare(arr1[i], arr2[j])) {\n\t\t\t\t\tarr3.push_back(arr1[i++]);\n\t\t\t\t} else {\n\t\t\t\t\tarr3.push_back(arr2[j++]);\n\t\t\t\t}\n\t\t\t}\n\n\t\t\twhile (i < arr1.size()) arr3.push_back(arr1[i++]);\n\t\t\twhile (j < arr2.size()) arr3.push_back(arr2[j++]);\n\t\t}\n\n\t\ttemplate<typename data_record, typename F>\n\t\tvoid merge_arrays(const std::span<data_record> 
*arr1, const std::span<data_record> *arr2, F compare, std::vector<data_record> &arr3) {\n\n\t\t\tsize_t i = 0, j = 0;\n\n\t\t\twhile (i < arr1->size() && j < arr2->size()) {\n\t\t\t\tif (compare((*arr1)[i], (*arr2)[j])) {\n\t\t\t\t\tarr3.push_back((*arr1)[i++]);\n\t\t\t\t} else {\n\t\t\t\t\tarr3.push_back((*arr2)[j++]);\n\t\t\t\t}\n\t\t\t}\n\n\t\t\twhile (i < arr1->size()) arr3.push_back((*arr1)[i++]);\n\t\t\twhile (j < arr2->size()) arr3.push_back((*arr2)[j++]);\n\t\t}\n\n\t\ttemplate<typename data_record>\n\t\tvoid merge_arrays(const std::vector<data_record> &arr1, const std::vector<data_record> &arr2, std::vector<data_record> &arr3) {\n\t\t\tmerge_arrays(arr1, arr2, [](const data_record &a, const data_record &b) {\n\t\t\t\treturn a < b;\n\t\t\t}, arr3);\n\t\t}\n\n\t\ttemplate<typename data_record>\n\t\tvoid merge_arrays(const std::vector<std::vector<data_record>> &arrays, std::vector<data_record> &res) {\n\t\t\tmerge_arrays(arrays, [](const data_record &a, const data_record &b) {\n\t\t\t\treturn a < b;\n\t\t\t}, res);\n\t\t}\n\n\t\ttemplate<typename data_record, typename F>\n\t\tvoid merge_array_range(const std::vector<std::vector<data_record>> &arrays, size_t i, size_t j, F compare, std::vector<data_record> &res) {\n\t\t\tif (i == j) {\n\t\t\t\tfor (const data_record &rec : arrays[i]) {\n\t\t\t\t\tres.push_back(rec);\n\t\t\t\t}\n\t\t\t} else if (j - i == 1) {\n\t\t\t\tmerge_arrays(arrays[i], arrays[j], compare, res);\n\t\t\t} else {\n\t\t\t\tstd::vector<data_record> out1;\n\t\t\t\tstd::vector<data_record> out2;\n\n\t\t\t\tmerge_array_range(arrays, i, (i + j)/2, compare, out1);\n\t\t\t\tmerge_array_range(arrays, (i + j)/2 + 1, j, compare, out2);\n\n\t\t\t\tmerge_arrays(out1, out2, compare, res);\n\t\t\t}\n\t\t}\n\n\t\ttemplate<typename data_record, typename F>\n\t\tvoid merge_arrays(const std::vector<std::vector<data_record>> &arrays, F compare, std::vector<data_record> &res) {\n\t\t\tif (arrays.size() == 0) return;\n\t\t\tmerge_array_range(arrays, 0, 
arrays.size() - 1, compare, res);\n\t\t}\n\n\t\ttemplate<typename data_record, typename F>\n\t\tvoid merge_array_range(const std::vector<std::span<data_record> *> &arrays, size_t i, size_t j, F compare, std::vector<data_record> &res) {\n\t\t\tif (i == j) {\n\t\t\t\tfor (const data_record &rec : *(arrays[i])) {\n\t\t\t\t\tres.push_back(rec);\n\t\t\t\t}\n\t\t\t} else if (j - i == 1) {\n\t\t\t\tmerge_arrays(arrays[i], arrays[j], compare, res);\n\t\t\t} else {\n\t\t\t\tstd::vector<data_record> out1;\n\t\t\t\tstd::vector<data_record> out2;\n\n\t\t\t\tmerge_array_range(arrays, i, (i + j)/2, compare, out1);\n\t\t\t\tmerge_array_range(arrays, (i + j)/2 + 1, j, compare, out2);\n\n\t\t\t\tmerge_arrays(out1, out2, compare, res);\n\t\t\t}\n\t\t}\n\n\t\ttemplate<typename data_record, typename F>\n\t\tvoid merge_arrays(const std::vector<std::span<data_record> *> &arrays, F compare, std::vector<data_record> &res) {\n\t\t\tif (arrays.size() == 0) return;\n\t\t\tmerge_array_range(arrays, 0, arrays.size() - 1, compare, res);\n\t\t}\n\t\n\t}\n\n}\n"
  },
  {
    "path": "src/algorithm/sum_sorted.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include <vector>\n#include <functional>\n\nnamespace algorithm {\n\n\ttemplate<class dtype>\n\tstd::vector<dtype> sum_sorted(const std::vector<std::vector<dtype>> &input,\n\t\t\tstd::function<void(dtype &a, const dtype &b)> plus_eq) {\n\n\t\tconst size_t n = input.size();\n\t\tif (n == 0) return {};\n\n\t\tstd::vector<dtype> ret;\n\t\tstd::vector<size_t> pos(n, 0);\n\t\t\n\t\twhile (true) {\n\t\t\tint start_vec = -1;\n\t\t\tfor (size_t i = 0; i < n; i++) {\n\t\t\t\tif (pos[i] < input[i].size() ) {\n\t\t\t\t\tstart_vec = i;\n\t\t\t\t\tbreak;\n\t\t\t\t}\n\t\t\t}\n\t\t\tif (start_vec == -1) break;\n\n\t\t\tdtype smallest = input[start_vec][pos[start_vec]];\n\n\t\t\tfor (size_t i = 0; i < n; i++) {\n\t\t\t\tif (pos[i] < input[i].size() && 
input[i][pos[i]] < smallest) {\n\t\t\t\t\tsmallest = input[i][pos[i]];\n\t\t\t\t\tstart_vec = i;\n\t\t\t\t}\n\t\t\t}\n\n\t\t\tconst dtype el = input[start_vec][pos[start_vec]];\n\t\t\tdtype sum = el;\n\t\t\tpos[start_vec]++;\n\t\t\tfor (size_t i = start_vec + 1; i < n; i++) {\n\t\t\t\twhile (pos[i] < input[i].size() && input[i][pos[i]] < el) {\n\t\t\t\t\tpos[i]++;\n\t\t\t\t}\n\t\t\t\tif (pos[i] < input[i].size() && input[i][pos[i]] == el) {\n\t\t\t\t\tplus_eq(sum, input[i][pos[i]]);\n\t\t\t\t\tpos[i]++;\n\t\t\t\t}\n\t\t\t}\n\t\t\tret.push_back(sum);\n\t\t}\n\t\treturn ret;\n\t}\n\n}\n"
  },
  {
    "path": "src/algorithm/top_k.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include <vector>\n#include <functional>\n\nnamespace algorithm {\n\n\t/*\n\t * Returns top k elements in unsorted const vector in linear time using a 2k memory buffer.\n\t * */\n\n\ttemplate<class dtype>\n\tstd::vector<dtype> top_k(const std::vector<dtype> &input, size_t k,\n\t\tstd::function<bool(const dtype &, const dtype &)> ordered) {\n\t\t\n\t\tif (input.size() <= k) return input;\n\t\tif (input.size() <= 2 * k) {\n\t\t\tstd::vector<dtype> buf(input.begin(), input.end());\n\t\t\tstd::nth_element(buf.begin(), buf.begin() + buf.size() / 2, buf.end(), ordered);\n\t\t\treturn std::vector<dtype>(buf.begin() + buf.size() / 2, buf.end());\n\t\t}\n\n\t\tstd::vector<dtype> buf(input.begin(), input.begin() + (2 * k));\n\n\t\tsize_t idx = 2 
* k;\n\t\twhile (idx < input.size()) {\n\t\t\tstd::nth_element(buf.begin(), buf.begin() + k, buf.end(), ordered);\n\t\t\tfor (size_t i = 0, j = idx; i < k && j < input.size(); i++, j++) {\n\t\t\t\t// Only insert objects that are out of order compared to pivot buf[k]\n\t\t\t\tif (!ordered(input[j], buf[k])) {\n\t\t\t\t\tbuf[i] = input[idx + i];\n\t\t\t\t}\n\t\t\t}\n\t\t\tidx += k;\n\t\t}\n\t\t// Run final partition.\n\t\tstd::nth_element(buf.begin(), buf.begin() + buf.size() / 2, buf.end(), ordered);\n\n\t\treturn std::vector<dtype>(buf.begin() + k, buf.end());\n\t}\n\n\t/*\n\t * top_k but with default less than operator.\n\t * */\n\ttemplate<class dtype>\n\tstd::vector<dtype> top_k(const std::vector<dtype> &input, size_t k) {\n\t\treturn top_k<dtype>(input, k, [](const dtype &a, const dtype &b) { return a < b; });\n\t}\n\n}\n"
  },
  {
    "path": "src/api/api_response.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include \"api_response.h\"\n#include \"indexer/return_record.h\"\n#include \"full_text/search_metric.h\"\n#include \"parser/unicode.h\"\n#include \"json.hpp\"\n\nnamespace api {\n\n\tapi_response::api_response(const std::vector<indexer::return_record> &results, const struct full_text::search_metric &metric, double profile) {\n\n\t\tusing json = nlohmann::ordered_json;\n\n\t\tjson message;\n\n\t\tjson result_array;\n\t\tfor (const auto &result : results) {\n\t\t\tjson json_result;\n\n\t\t\ttry {\n\t\t\t\tjson_result[\"url\"] = result.m_url.str();\n\t\t\t\tjson_result[\"title\"] = parser::unicode::encode(result.m_title);\n\t\t\t\tjson_result[\"snippet\"] = parser::unicode::encode(result.m_snippet);\n\t\t\t\tjson_result[\"score\"] = 
result.m_score;\n\t\t\t\tjson_result[\"domain_hash\"] = std::to_string(result.m_domain_hash);\n\t\t\t\tjson_result[\"url_hash\"] = std::to_string(result.m_url.hash());\n\n\t\t\t\tresult_array.push_back(json_result);\n\t\t\t} catch (nlohmann::detail::type_error &error) {\n\t\t\t\t// skip this result.\n\t\t\t\t// in future log this and fix what is wrong.\n\t\t\t}\n\t\t}\n\n\t\tmessage[\"status\"] = \"success\";\n\t\tmessage[\"time_ms\"] = profile;\n\t\tmessage[\"total_found\"] = metric.m_total_found;\n\t\tmessage[\"total_url_links_found\"] = metric.m_total_url_links_found;\n\t\tmessage[\"total_domain_links_found\"] = metric.m_total_domain_links_found;\n\t\tmessage[\"links_handled\"] = metric.m_links_handled;\n\t\tmessage[\"link_domain_matches\"] = metric.m_link_domain_matches;\n\t\tmessage[\"link_url_matches\"] = metric.m_link_url_matches;\n\t\tmessage[\"results\"] = result_array;\n\n\t\t//m_response = message.dump();\n\t\tm_response = message.dump(4);\n\t}\n\n\tapi_response::~api_response() {\n\n\t}\n\n\tstd::ostream &operator<<(std::ostream &os, const api_response &api_response) {\n\t\tos << api_response.m_response;\n\t\treturn os;\n\t}\n\n}\n"
  },
  {
    "path": "src/api/api_response.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include <iostream>\n#include <vector>\n\nnamespace full_text {\n\tstruct search_metric;\n}\n\nnamespace indexer {\n\tclass return_record;\n}\n\nnamespace api {\n\n\tclass api_response {\n\n\t\tpublic:\n\t\t\tapi_response(const std::vector<indexer::return_record> &results, const struct full_text::search_metric &metric, double profile);\n\t\t\t~api_response();\n\n\t\t\tfriend std::ostream &operator<<(std::ostream &os, const api_response &api_response);\n\n\t\tprivate:\n\n\t\t\tstd::string m_response;\n\n\t};\n\n}\n"
  },
  {
    "path": "src/api/result_with_snippet.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include \"result_with_snippet.h\"\n#include \"text/text.h\"\n\nnamespace api {\n\n\tresult_with_snippet::result_with_snippet(const std::string &tsv_data, const indexer::return_record &res)\n\t: m_score(res.m_score), m_domain_hash(res.m_domain_hash) {\n\t\tsize_t pos_start = 0;\n\t\tsize_t pos_end = 0;\n\t\tsize_t col_num = 0;\n\t\twhile (pos_end != std::string::npos) {\n\t\t\tpos_end = tsv_data.find('\\t', pos_start);\n\t\t\tconst size_t len = pos_end - pos_start;\n\t\t\tif (col_num == 0) {\n\t\t\t\tm_url = URL(tsv_data.substr(pos_start, len));\n\t\t\t}\n\t\t\tif (col_num == 1) {\n\t\t\t\tm_title = tsv_data.substr(pos_start, len);\n\t\t\t}\n\t\t\tif (col_num == 3) {\n\t\t\t\tm_meta = tsv_data.substr(pos_start, len);\n\t\t\t}\n\t\t\tif (col_num == 4) 
{\n\t\t\t\tm_snippet = make_snippet(tsv_data.substr(pos_start, len));\n\t\t\t\tif (m_snippet.size() == 0) {\n\t\t\t\t\tm_snippet = make_snippet(m_meta);\n\t\t\t\t}\n\t\t\t}\n\n\t\t\tpos_start = pos_end + 1;\n\t\t\tcol_num++;\n\t\t}\n\t}\n\n\tresult_with_snippet::~result_with_snippet() {\n\n\t}\n\n\tstd::string result_with_snippet::make_snippet(const std::string &text) const {\n\t\tstd::string response = text.substr(0, 140);\n\t\ttext::trim(response);\n\t\tif (response.size() >= 140) response += \"...\";\n\t\treturn response;\n\t}\n\n}\n\n"
  },
  {
    "path": "src/api/result_with_snippet.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include <iostream>\n#include \"URL.h\"\n#include \"indexer/return_record.h\"\n\nnamespace api {\n\n\tclass result_with_snippet {\n\n\tpublic:\n\t\tresult_with_snippet(const std::string &tsv_data, const indexer::return_record &res);\n\t\t~result_with_snippet();\n\n\t\tconst URL &url() const { return m_url; };\n\t\tconst std::string &title() const { return m_title; };\n\t\tconst std::string &snippet() const { return m_snippet; };\n\t\tconst float &score() const { return m_score; };\n\t\tconst uint64_t &domain_hash() const { return m_domain_hash; };\n\n\tprivate:\n\n\t\tURL m_url;\n\t\tstd::string m_title;\n\t\tstd::string m_meta;\n\t\tstd::string m_snippet;\n\t\tfloat m_score;\n\t\tuint64_t m_domain_hash;\n\n\t\tstd::string 
make_snippet(const std::string &text) const;\n\n\t};\n\n}\n"
  },
  {
    "path": "src/cluster/cluster.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n\n"
  },
  {
    "path": "src/cluster/document.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include \"document.h\"\n#include \"algorithm/hash.h\"\n#include \"text/text.h\"\n#include \"URL.h\"\n\nnamespace cluster {\n\n\tdocument::document() \n\t: m_name(\"unnamed document\"){\n\t}\n\n\tdocument::document(const std::string &name)\n\t: m_name(name) {\n\n\t}\n\n\tdocument::~document() {\n\n\t}\n\n\tvoid document::read_text(const std::string &text) {\n\t\tconst std::vector<std::string> words = text::get_words(text, 0);\n\n\t\tfor (const auto &word : words) {\n\t\t\tm_counts[algorithm::hash(word)]++;\n\t\t}\n\t}\n\n\tvoid read_text_to_corpus(corpus &corp, const std::string &text) {\n\t\tconst std::vector<std::string> words = text::get_words(text, 0);\n\n\t\tfor (const auto &word : words) {\n\t\t\tsize_t key = 
algorithm::hash(word);\n\t\t\tcorp.counts[key]++;\n\t\t\tif (corp.words.count(key) == 0) {\n\t\t\t\tcorp.words[key] = word;\n\t\t\t}\n\t\t}\n\t}\n\n\tvoid read_corpus(corpus &corp, documents &documents, std::stringstream &tsv) {\n\t\tstd::string line;\n\t\twhile (getline(tsv, line)) {\n\t\t\tconst size_t pos = line.find('\\t');\n\t\t\tif (pos == std::string::npos) continue;\n\n\t\t\tURL url(line.substr(0, pos));\n\t\t\tconst std::string doc_text = line.substr(pos);\n\n\t\t\tconst size_t key = url.host_hash();\n\n\t\t\tif (!documents.count(key)) {\n\t\t\t\tdocuments.emplace(key, url.host());\n\t\t\t}\n\t\t\tdocuments[key].read_text(doc_text);\n\t\t\tif (key == algorithm::hash(\"annicaviklund.se\")) {\n\t\t\t\tstd::cout << doc_text << std::endl;\n\t\t\t}\n\t\t\tread_text_to_corpus(corp, doc_text);\n\t\t}\n\t}\n\n\tvoid print_document(corpus &corp, const document &document) {\n\t\tstd::vector<std::pair<size_t, size_t>> keys;\n\t\tfor (const auto &iter : document.m_counts) {\n\t\t\tkeys.emplace_back(iter.first, iter.second);\n\t\t}\n\n\t\tsort(keys.begin(), keys.end(), [](const auto &a, const auto &b) {\n\t\t\treturn a.second > b.second;\n\t\t});\n\n\t\tsize_t len = keys.size();\n\t\tfor (size_t i = 0; i < std::min(100ul, len); i++) {\n\t\t\tstd::cout << corp.words[keys[i].first] << \" = \" << keys[i].second << std::endl;\n\t\t}\n\t}\n}\n\n"
  },
  {
    "path": "src/cluster/document.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include <iostream>\n#include <unordered_map>\n#include <cstdint>\n\nnamespace cluster {\n\n\ttypedef struct corpus_s {\n\t\tstd::unordered_map<size_t, std::string> words;\n\t\tstd::unordered_map<size_t, size_t> counts;\n\t} corpus;\n\n\tclass document {\n\t\tpublic:\n\t\t\tdocument();\n\t\t\tdocument(const std::string &name);\n\t\t\t~document();\n\t\t\tstd::string name() const { return m_name; };\n\t\t\tsize_t size() const { return m_counts.size(); };\n\n\t\t\tvoid read_text(const std::string &text);\n\t\t\tfriend void print_document(corpus &corp, const document &document);\n\n\t\tprivate:\n\n\t\t\tstd::string m_name;\n\t\t\tstd::unordered_map<size_t, size_t> m_counts;\n\n\t};\n\n\ttypedef document topic;\n\ttypedef 
std::unordered_map<size_t, document> documents;\n\n\tvoid read_corpus(corpus &corp, documents &documents, std::stringstream &tsv);\n\tvoid print_document(corpus &corp, const document &document);\n}\n"
  },
  {
    "path": "src/common/ThreadPool.h",
    "content": "/*\nCopyright (c) 2012 Jakob Progsch, Václav Zeman\n\nThis software is provided 'as-is', without any express or implied\nwarranty. In no event will the authors be held liable for any damages\narising from the use of this software.\n\nPermission is granted to anyone to use this software for any purpose,\nincluding commercial applications, and to alter it and redistribute it\nfreely, subject to the following restrictions:\n\n   1. The origin of this software must not be misrepresented; you must not\n   claim that you wrote the original software. If you use this software\n   in a product, an acknowledgment in the product documentation would be\n   appreciated but is not required.\n\n   2. Altered source versions must be plainly marked as such, and must not be\n   misrepresented as being the original software.\n\n   3. This notice may not be removed or altered from any source\n   distribution.\n*/\n\n#ifndef THREAD_POOL_H\n#define THREAD_POOL_H\n\n#include <vector>\n#include <queue>\n#include <memory>\n#include <thread>\n#include <mutex>\n#include <condition_variable>\n#include <future>\n#include <functional>\n#include <stdexcept>\n\nclass ThreadPool {\npublic:\n    explicit ThreadPool(size_t);\n    template<class F, class... Args>\n    auto enqueue(F&& f, Args&&... 
args) \n        -> std::future<typename std::result_of<F(Args...)>::type>;\n    ~ThreadPool();\nprivate:\n    // need to keep track of threads so we can join them\n    std::vector< std::thread > workers;\n    // the task queue\n    std::queue< std::function<void()> > tasks;\n    \n    // synchronization\n    std::mutex queue_mutex;\n    std::condition_variable condition;\n    bool stop;\n};\n \n// the constructor just launches some amount of workers\ninline ThreadPool::ThreadPool(size_t threads)\n    :   stop(false)\n{\n    for(size_t i = 0;i<threads;++i)\n        workers.emplace_back(\n            [this]\n            {\n                for(;;)\n                {\n                    std::function<void()> task;\n\n                    {\n                        std::unique_lock<std::mutex> lock(this->queue_mutex);\n                        this->condition.wait(lock,\n                            [this]{ return this->stop || !this->tasks.empty(); });\n                        if(this->stop && this->tasks.empty())\n                            return;\n                        task = std::move(this->tasks.front());\n                        this->tasks.pop();\n                    }\n\n                    task();\n                }\n            }\n        );\n}\n\n// add new work item to the pool\ntemplate<class F, class... Args>\nauto ThreadPool::enqueue(F&& f, Args&&... 
args) \n    -> std::future<typename std::result_of<F(Args...)>::type>\n{\n    using return_type = typename std::result_of<F(Args...)>::type;\n\n    auto task = std::make_shared< std::packaged_task<return_type()> >(\n            std::bind(std::forward<F>(f), std::forward<Args>(args)...)\n        );\n        \n    std::future<return_type> res = task->get_future();\n    {\n        std::unique_lock<std::mutex> lock(queue_mutex);\n\n        // don't allow enqueueing after stopping the pool\n        if(stop)\n            throw std::runtime_error(\"enqueue on stopped ThreadPool\");\n\n        tasks.emplace([task](){ (*task)(); });\n    }\n    condition.notify_one();\n    return res;\n}\n\n// the destructor joins all threads\ninline ThreadPool::~ThreadPool()\n{\n    {\n        std::unique_lock<std::mutex> lock(queue_mutex);\n        stop = true;\n    }\n    condition.notify_all();\n    for(std::thread &worker: workers)\n        worker.join();\n}\n\n#endif\n"
  },
  {
    "path": "src/common/datetime.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n#include \"datetime.h\"\n#include <ctime>\n\nnamespace common {\n\n\tsize_t cur_date() {\n\t\ttime_t tt = time(NULL);\n\t\tstruct tm tm = *localtime(&tt);\n\t\tsize_t year_since_00 = tm.tm_year - 100;\n\t\tsize_t year = 2000 + year_since_00;\n\t\treturn (year * 100 * 100) + ((tm.tm_mon + 1) * 100) + tm.tm_mday;\n\t}\n\n\tsize_t cur_time() {\n\t\ttime_t tt = time(NULL);\n\t\tstruct tm tm = *localtime(&tt);\n\t\treturn (tm.tm_hour * 100 * 100) + (tm.tm_min * 100) + tm.tm_sec;\n\t}\n\n\tsize_t cur_datetime() {\n\t\tsize_t date = cur_date();\n\t\treturn (date * 100 * 100 * 100) + cur_time();\n\t}\n\n\tconst std::string iso8601_datetime() {\n\t\ttime_t now;\n\t\ttime(&now);\n\t\tchar buf[21];\n\t\tstrftime(buf, sizeof(buf), \"%FT%TZ\", 
gmtime(&now));\n\t\treturn std::string(buf);\n\t}\n\n}\n"
  },
  {
    "path": "src/common/datetime.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n#pragma once\n\n#include <iostream>\n\nnamespace common {\n\tsize_t cur_date();\n\tsize_t cur_time();\n\tsize_t cur_datetime();\n\tconst std::string iso8601_datetime();\n}\n"
  },
  {
    "path": "src/common/dictionary.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include \"dictionary.h\"\n#include \"logger/logger.h\"\n#include \"file/tsv_file.h\"\n#include \"dictionary_row.h\"\n#include \"algorithm/hash.h\"\n\nusing namespace std;\n\nnamespace common {\n\n\tdictionary::dictionary() {\n\n\t}\n\n\tdictionary::dictionary(file::tsv_file &tsv_file) {\n\t\tload_tsv(tsv_file);\n\t}\n\n\tdictionary::~dictionary() {\n\n\t}\n\n\tvoid dictionary::load_tsv(file::tsv_file &tsv_file) {\n\t\twhile (!tsv_file.eof()) {\n\t\t\tauto line = tsv_file.get_line();\n\t\t\tstd::stringstream ss(line);\n\t\t\tstd::string col;\n\t\t\tgetline(ss, col, '\\t');\n\n\t\t\tif (col.size()) {\n\t\t\t\tsize_t key = ::algorithm::hash(col);\n\n\t\t\t\tif (m_rows.find(key) != m_rows.end()) {\n\t\t\t\t\thandle_collision(key, 
col);\n\t\t\t\t}\n\n\t\t\t\tm_rows[key] = dictionary_row(ss);\n\t\t\t}\n\t\t}\n\t}\n\n\tunordered_map<size_t, dictionary_row>::const_iterator dictionary::find(const std::string &key) const {\n\t\treturn m_rows.find(::algorithm::hash(key));\n\t}\n\n\tunordered_map<size_t, dictionary_row>::const_iterator dictionary::find(size_t hash) const {\n\t\treturn m_rows.find(hash);\n\t}\n\n\tunordered_map<size_t, dictionary_row>::const_iterator dictionary::begin() const {\n\t\treturn m_rows.begin();\n\t}\n\n\tunordered_map<size_t, dictionary_row>::const_iterator dictionary::end() const {\n\t\treturn m_rows.end();\n\t}\n\n\tbool dictionary::has_key(const std::string &key) const {\n\t\treturn find(key) != end();\n\t}\n\n\tvoid dictionary::handle_collision(size_t key, const std::string &col) {\n\t\tLOG_ERROR(\"Collision: \" + std::to_string(key) + \" \" + col);\n\t}\n}\n"
  },
  {
    "path": "src/common/dictionary.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include <map>\n#include <unordered_map>\n#include \"dictionary_row.h\"\n\nnamespace file {\n\tclass tsv_file;\n}\n\nnamespace common {\n\n\tclass dictionary {\n\n\t\tpublic:\n\n\t\t\tdictionary();\n\t\t\texplicit dictionary(file::tsv_file &tsv_file);\n\t\t\t~dictionary();\n\n\t\t\tvoid load_tsv(file::tsv_file &tsv_file);\n\n\t\t\tstd::unordered_map<size_t, dictionary_row>::const_iterator find(const std::string &key) const;\n\t\t\tstd::unordered_map<size_t, dictionary_row>::const_iterator find(size_t hash) const;\n\n\t\t\tstd::unordered_map<size_t, dictionary_row>::const_iterator begin() const;\n\t\t\tstd::unordered_map<size_t, dictionary_row>::const_iterator end() const;\n\n\t\t\tbool has_key(const std::string &key) 
const;\n\t\t\tsize_t size() const { return m_rows.size(); }\n\n\t\tprivate:\n\n\t\t\tstd::unordered_map<size_t, dictionary_row> m_rows;\n\n\t\t\tvoid handle_collision(size_t key, const std::string &col);\n\n\t};\n}\n"
  },
  {
    "path": "src/common/dictionary_row.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include \"dictionary_row.h\"\n\nnamespace common {\n\n\tdictionary_row::dictionary_row() {\n\t}\n\n\tdictionary_row::dictionary_row(const dictionary_row &row) {\n\t\tm_columns = row.m_columns;\n\t}\n\n\tdictionary_row::dictionary_row(const std::string &row) {\n\t\tstd::stringstream stream(row);\n\t\tread_stream(stream);\n\t}\n\n\tdictionary_row::dictionary_row(std::stringstream &stream) {\n\t\tread_stream(stream);\n\t}\n\n\tdictionary_row::~dictionary_row() {\n\n\t}\n\n\tint dictionary_row::get_int(int column) const {\n\t\treturn (int)m_columns[column];\n\t}\n\n\tfloat dictionary_row::get_float(int column) const {\n\t\treturn (float)m_columns[column];\n\t}\n\n\tdouble dictionary_row::get_double(int column) const {\n\t\treturn 
m_columns[column];\n\t}\n\n\tvoid dictionary_row::read_stream(std::stringstream &stream) {\n\t\tstd::string col;\n\t\tint i = 0;\n\t\twhile (std::getline(stream, col, '\\t')) {\n\t\t\ttry {\n\t\t\t\tm_columns.push_back(stod(col));\n\t\t\t} catch(const std::invalid_argument &error) {\n\n\t\t\t} catch(const std::out_of_range &error) {\n\t\t\t}\n\t\t\ti++;\n\t\t}\n\t}\n\n}\n"
  },
  {
    "path": "src/common/dictionary_row.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include <iostream>\n#include <sstream>\n#include <vector>\n\n#define CC_ROW_LEN 5\n\nnamespace common {\n\n\tclass dictionary_row {\n\n\t\tpublic:\n\n\t\t\tdictionary_row();\n\t\t\tdictionary_row(const dictionary_row &row);\n\t\t\texplicit dictionary_row(const std::string &row);\n\t\t\texplicit dictionary_row(std::stringstream &stream);\n\t\t\t~dictionary_row();\n\n\t\t\tint get_int(int column) const;\n\t\t\tfloat get_float(int column) const;\n\t\t\tdouble get_double(int column) const;\n\n\t\tprivate:\n\t\t\tstd::vector<double> m_columns;\n\n\t\t\tvoid read_stream(std::stringstream &stream);\n\n\t};\n\n}\n"
  },
  {
    "path": "src/common/simple_thread_pool.hpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include <iostream>\n#include <thread>\n#include <future>\n#include <queue>\n\nnamespace common {\n\n\tclass simple_thread_pool {\n\n\t\tpublic:\n\n\t\t\texplicit simple_thread_pool(size_t);\n\t\t\t~simple_thread_pool();\n\n\t\t\tvoid enqueue(std::function<void()> &&fun);\n\n\t\tprivate:\n\n\t\t\tvoid handle_work();\n\n\t\t\tstd::vector<std::thread> m_workers;\n\t\t\tstd::queue<std::function<void()>> m_queue;\n\n\t\t\tstd::mutex m_queue_lock;\n\t\t\tstd::condition_variable m_condition;\n\t\t\tbool m_stop = false;\n\n\t};\n\t\n}\n"
  },
  {
    "path": "src/common/system.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include \"system.h\"\n#include <thread>\n#include <boost/uuid/uuid.hpp>\n#include <boost/uuid/uuid_generators.hpp>\n#include <boost/uuid/uuid_io.hpp>\n\nnamespace common {\n\n\tbool is_dev() {\n\t\tif (getenv(\"ALEXANDRIA_LIVE\") != NULL && std::stoi(getenv(\"ALEXANDRIA_LIVE\")) > 0) {\n\t\t\treturn false;\n\t\t}\n\t\treturn true;\n\t}\n\n\tstd::string domain_index_filename() {\n\t\tif (is_dev()) {\n\t\t\treturn \"/dev_files/domain_info.tsv\";\n\t\t}\n\t\treturn \"/files/domain_info.tsv\";\n\t}\n\n\tstd::string dictionary_filename() {\n\t\tif (is_dev()) {\n\t\t\treturn \"/dev_files/dictionary.tsv\";\n\t\t}\n\t\treturn \"/files/dictionary.tsv\";\n\t}\n\n\tstd::string uuid() {\n\t\t// Create a random UUID\n\t\tboost::uuids::uuid uuid = 
boost::uuids::random_generator()();\n\t\t// Convert UUID to string and return\n\t\treturn boost::uuids::to_string(uuid);\n\t}\n\n}\n"
  },
  {
    "path": "src/common/system.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include <iostream>\n\nnamespace common {\n\n\tbool is_dev();\n\tstd::string domain_index_filename();\n\tstd::string dictionary_filename();\n\tstd::string uuid();\n\n}\n"
  },
  {
    "path": "src/config.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include \"config.h\"\n#include \"text/text.h\"\n#include \"logger/logger.h\"\n#include \"file/file.h\"\n\nusing namespace std;\n\nnamespace config {\n\n\tconfig::config() {\n\t\tcreate_data_directories(m_data_path);\n\t}\n\n\tconst config s_instance = config();\n\n\tconst std::string &data_path() {\n\t\treturn s_instance.data_path();\n\t}\n\n\tvoid create_data_directories(const std::string &data_path) {\n\t\tif (file::directory_exists(data_path)) {\n\t\t\tfor (size_t shard_id = 0; shard_id < 8; shard_id++) {\n\t\t\t\tconst std::string base = data_path + \"/\" + to_string(shard_id);\n\t\t\t\tfile::create_directory(base);\n\t\t\t\tfile::create_directory(base + \"/input\");\n\t\t\t\tfile::create_directory(base + 
\"/output\");\n\t\t\t\tfile::create_directory(base + \"/upload\");\n\t\t\t\tfile::create_directory(base + \"/hash_table\");\n\t\t\t\tfile::create_directory(base + \"/full_text\");\n\t\t\t\tfile::create_directory(base + \"/tmp\");\n\t\t\t}\n\t\t}\n\t}\n\n\tstring node = \"test0001\";\n\tstring master = \"localhost\";\n\tstring upload = \"localhost\";\n\tstring data_node;\n\t//string url_store_host = \"http://localhost\";\n\tstring url_store_host = \"http://node0009.alexandria.org\";\n\tstring url_store_path = \"/alexandria/urlstore\";\n\tstring url_store_cache_path = \"/mnt/4/urlstore_cache\";\n\n\tsize_t nodes_in_cluster = 1;\n\tsize_t node_id = 0;\n\n\tbool index_snippets = true;\n\tbool index_text = true;\n\n\tvector<string> batches;\n\tvector<string> link_batches;\n\tsize_t worker_count = 8;\n\tsize_t query_max_words = 10;\n\tsize_t query_max_len = 200;\n\tsize_t deduplicate_domain_count = 5;\n\tsize_t pre_result_limit = 200000;\n\tsize_t result_limit = 1000;\n\tstring file_upload_user = \"\";\n\tstring file_upload_password = \"\";\n\tsize_t n_grams = 1;\n\tsize_t shard_hash_table_size = 100000;\n\tsize_t html_parser_long_text_len = 1000;\n\tsize_t ft_shard_builder_buffer_len = 240000;\n\n\tsize_t ft_num_shards = 2048;\n\tsize_t ft_max_sections = 8;\n\tsize_t ft_max_results_per_section = 100000;\n\tsize_t ft_section_depth = 8;\n\tsize_t ft_max_cache_gb = 30;\n\tsize_t ft_num_threads_indexing = 24;\n\tsize_t ft_num_threads_merging = 24;\n\tsize_t ft_num_threads_appending = 8;\n\n\tdouble ft_cached_bytes_per_shard() {\n\t\treturn (ft_max_cache_gb * 1000ul*1000ul*1000ul) / (ft_num_shards * ft_num_threads_indexing);\n\t}\n\n\tvoid read_config(const string &config_file) {\n\n\t\tbatches.clear();\n\t\tlink_batches.clear();\n\n\t\tifstream in(config_file);\n\n\t\tif (!in.is_open()) {\n\t\t\tLOG_ERROR(\"Could not read config file: \" + config_file);\n\t\t\treturn;\n\t\t}\n\n\t\tstring line;\n\t\twhile (getline(in, line)) {\n\t\t\tsize_t comment_pos = 
line.find(\"#\");\n\t\t\tif (comment_pos != string::npos) {\n\t\t\t\tline = line.substr(0, comment_pos);\n\t\t\t}\n\t\t\tif (text::trim(line) == \"\") {\n\t\t\t\tcontinue;\n\t\t\t}\n\t\t\tvector<string> parts;\n\t\t\tboost::split(parts, line, boost::is_any_of(\"=\"));\n\n\t\t\tfor (string &part : parts) {\n\t\t\t\tpart = text::trim(part);\n\t\t\t}\n\n\t\t\tif (parts[0] == \"node\") {\n\t\t\t\tnode = parts[1];\n\t\t\t} else if (parts[0] == \"master\") {\n\t\t\t\tmaster = parts[1];\n\t\t\t\tupload = parts[1];\n\t\t\t} else if (parts[0] == \"upload\") {\n\t\t\t\tupload = parts[1];\n\t\t\t} else if (parts[0] == \"data_node\") {\n\t\t\t\tdata_node = parts[1];\n\t\t\t} else if (parts[0] == \"url_store_host\") {\n\t\t\t\turl_store_host = parts[1];\n\t\t\t} else if (parts[0] == \"url_store_path\") {\n\t\t\t\turl_store_path = parts[1];\n\t\t\t} else if (parts[0] == \"nodes_in_cluster\") {\n\t\t\t\tnodes_in_cluster = stoi(parts[1]);\n\t\t\t} else if (parts[0] == \"node_id\") {\n\t\t\t\tnode_id = stoi(parts[1]);\n\t\t\t} else if (parts[0] == \"batches[]\") {\n\t\t\t\tbatches.push_back(parts[1]);\n\t\t\t} else if (parts[0] == \"link_batches[]\") {\n\t\t\t\tlink_batches.push_back(parts[1]);\n\t\t\t} else if (parts[0] == \"worker_count\") {\n\t\t\t\tworker_count = stoi(parts[1]);\n\t\t\t} else if (parts[0] == \"query_max_words\") {\n\t\t\t\tquery_max_words = stoi(parts[1]);\n\t\t\t} else if (parts[0] == \"query_max_len\") {\n\t\t\t\tquery_max_len = stoi(parts[1]);\n\t\t\t} else if (parts[0] == \"deduplicate_domain_count\") {\n\t\t\t\tdeduplicate_domain_count = stoi(parts[1]);\n\t\t\t} else if (parts[0] == \"pre_result_limit\") {\n\t\t\t\tpre_result_limit = stoi(parts[1]);\n\t\t\t} else if (parts[0] == \"result_limit\") {\n\t\t\t\tresult_limit = stoi(parts[1]);\n\t\t\t} else if (parts[0] == \"ft_num_shards\") {\n\t\t\t\tft_num_shards = stoi(parts[1]);\n\t\t\t} else if (parts[0] == \"ft_max_sections\") {\n\t\t\t\tft_max_sections = stoi(parts[1]);\n\t\t\t} else if (parts[0] == 
\"ft_max_results_per_section\") {\n\t\t\t\tft_max_results_per_section = stoi(parts[1]);\n\t\t\t} else if (parts[0] == \"ft_section_depth\") {\n\t\t\t\tft_section_depth = stoi(parts[1]);\n\t\t\t} else if (parts[0] == \"ft_max_cache_gb\") {\n\t\t\t\tft_max_cache_gb = stoi(parts[1]);\n\t\t\t} else if (parts[0] == \"ft_num_threads_indexing\") {\n\t\t\t\tft_num_threads_indexing = stoi(parts[1]);\n\t\t\t} else if (parts[0] == \"ft_num_threads_merging\") {\n\t\t\t\tft_num_threads_merging = stoi(parts[1]);\n\t\t\t} else if (parts[0] == \"ft_num_threads_appending\") {\n\t\t\t\tft_num_threads_appending = stoi(parts[1]);\n\t\t\t} else if (parts[0] == \"file_upload_user\") {\n\t\t\t\tfile_upload_user = parts[1];\n\t\t\t} else if (parts[0] == \"file_upload_password\") {\n\t\t\t\tfile_upload_password = parts[1];\n\t\t\t} else if (parts[0] == \"n_grams\") {\n\t\t\t\tn_grams = stoull(parts[1]);\n\t\t\t} else if (parts[0] == \"index_snippets\") {\n\t\t\t\tindex_snippets = static_cast<bool>(stoull(parts[1]));\n\t\t\t} else if (parts[0] == \"index_text\") {\n\t\t\t\tindex_text = static_cast<bool>(stoull(parts[1]));\n\t\t\t} else if (parts[0] == \"shard_hash_table_size\") {\n\t\t\t\tshard_hash_table_size = stoull(parts[1]);\n\t\t\t} else if (parts[0] == \"html_parser_long_text_len\") {\n\t\t\t\thtml_parser_long_text_len = stoull(parts[1]);\n\t\t\t} else if (parts[0] == \"data_path\") {\n\t\t\t\ts_instance.data_path(parts[1]);\n\t\t\t}\n\t\t}\n\t}\n\n}\n"
  },
  {
    "path": "src/config.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include <iostream>\n#include <fstream>\n#include <vector>\n\nnamespace config {\n\n\tvoid create_data_directories(const std::string &data_path);\n\n\tclass config {\n\t\tpublic:\n\n\t\t\tconfig();\n\n\t\t\tconst std::string &data_path() const { return m_data_path; }\n\n\t\t\tvoid data_path(const std::string &str) const { m_data_path = str; create_data_directories(m_data_path); }\n\n\t\tprivate:\n\n\t\t\tmutable std::string m_data_path = \"/mnt\";\n\n\t};\n\n\tconst std::string &data_path();\n\n\textern std::string node;\n\textern std::string master;\n\textern std::string upload;\n\textern std::string data_node;\n\textern std::string url_store_host;\n\textern std::string url_store_path;\n\textern std::string 
url_store_cache_path;\n\n\tconst size_t url_store_shards = 24;\n\n\textern size_t nodes_in_cluster;\n\textern size_t node_id;\n\n\textern bool index_snippets;\n\textern bool index_text;\n\n\textern std::vector<std::string> batches;\n\textern std::vector<std::string> link_batches;\n\n\textern size_t worker_count;\n\textern size_t query_max_words;\n\textern size_t query_max_len;\n\textern size_t deduplicate_domain_count;\n\textern size_t pre_result_limit;\n\textern size_t result_limit;\n\textern std::string file_upload_user;\n\textern std::string file_upload_password;\n\textern size_t n_grams;\n\textern size_t shard_hash_table_size;\n\textern size_t html_parser_long_text_len;\n\textern size_t ft_shard_builder_buffer_len;\n\n\t/*\n\t\tConstants only configurable at compilation time.\n\t*/\n\n\t// Full text indexer config\n\textern size_t ft_num_shards;\n\textern size_t ft_max_sections;\n\textern size_t ft_max_results_per_section;\n\textern size_t ft_section_depth;\n\textern size_t ft_max_cache_gb;\n\textern size_t ft_num_threads_indexing;\n\textern size_t ft_num_threads_merging;\n\textern size_t ft_num_threads_appending;\n\tdouble ft_cached_bytes_per_shard();\n\n\t// Link indexer config\n\tinline const unsigned long long li_max_cache_gb = 4;\n\tinline const unsigned long long li_num_threads_indexing = 48;\n\tinline const unsigned long long li_num_threads_merging = 16;\n\tinline const double li_cached_bytes_per_shard  = (li_max_cache_gb * 1000ul*1000ul*1000ul) / (ft_num_shards * li_num_threads_indexing);\n\tinline const unsigned long long li_indexer_max_cache_size = 500;\n\n\t// Hash table indexer config\n\tinline const unsigned long long ht_num_shards = 1031;\n\tinline const unsigned long long ht_num_buckets = 8;\n\tinline const unsigned long long ht_key_size = 8;\n\n\t// Server config\n\n\t// Other constants.\n\tinline const unsigned long long num_async_file_transfers = 48;\n\tinline const std::string test_data_path = 
\"/var/www/html/node0003.alexandria.org/test-data/\";\n\n\t// Commoncrawl parser.\n\tinline const std::string cc_target_output = \"alexandria-cc-output\";\n\tinline const bool cc_run_on_lambda = false;\n\n\tinline const std::string log_file_path = \"/var/log/alexandria.log\";\n\n\tvoid read_config(const std::string &config_file);\n\n}\n\n\n"
  },
  {
    "path": "src/debug.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include \"debug.h\"\n\nvoid print_elem(std::map<size_t, size_t> &m, size_t elem) {\n\tstd::cout << m[elem] << std::endl;\n}\n"
  },
  {
    "path": "src/debug.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include <iostream>\n#include <map>\n\nvoid print_elem(std::map<size_t, size_t> &m, size_t elem);\n"
  },
  {
    "path": "src/domain_stats/domain_stats.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include \"domain_stats.h\"\n#include <iostream>\n#include \"common/dictionary.h\"\n#include \"file/tsv_file_remote.h\"\n#include \"logger/logger.h\"\n#include \"common/system.h\"\n\nnamespace domain_stats {\n\n\tcommon::dictionary domain_data;\n\n\tvoid download_domain_stats() {\n\t\tLOG_INFO(\"download domain_info.tsv\");\n\t\tfile::tsv_file_remote domain_info_tsv(common::domain_index_filename());\n\t\tLOG_INFO(\"parsing.....\");\n\t\tdomain_data.load_tsv(domain_info_tsv);\n\t}\n\n\tfloat harmonic_centrality(const URL &url) {\n\t\treturn harmonic_centrality(url.host());\n\t}\n\n\tfloat harmonic_centrality(const std::string &host) {\n\n\t\tconst auto iter = domain_data.find(host);\n\n\t\tfloat harmonic = 0.0f;\n\t\tif (iter != domain_data.end()) 
{\n\t\t\tconst common::dictionary_row row = iter->second;\n\t\t\tharmonic = row.get_float(0);\n\t\t}\n\n\t\treturn harmonic;\n\t}\n\n}\n"
  },
  {
    "path": "src/domain_stats/domain_stats.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include <iostream>\n#include \"URL.h\"\n\nnamespace domain_stats {\n\tvoid download_domain_stats();\n\tfloat harmonic_centrality(const URL &url);\n\tfloat harmonic_centrality(const std::string &domain);\n}\n"
  },
  {
    "path": "src/downloader/merge_downloader.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include <iostream>\n#include <sstream>\n#include \"file/file.h\"\n#include \"file/archive.h\"\n#include \"hash_table2/builder.h\"\n#include \"utils/thread_pool.hpp\"\n#include \"indexer/index.h\"\n#include \"indexer/index_builder.h\"\n#include \"indexer/index_reader.h\"\n#include \"indexer/value_record.h\"\n\nnamespace downloader {\n\n\tbool internal_links_complete(const std::string &path) {\n\n\t\tfor (size_t i = 0; i < 8; i++) {\n\t\t\tif (!file::file_exists(path + \"/internal_links_\" + std::to_string(i))) {\n\t\t\t\treturn false;\n\t\t\t}\n\t\t}\n\n\t\treturn true;\n\t}\n\n\tbool hash_table_complete(const std::string &path) {\n\t\tconst size_t num_shards = 1019;\n\t\tfor (size_t i = 0; i < num_shards; i++) {\n\t\t\tif (!file::file_exists(path + 
\"/\" + std::to_string(i) + \".pos\")) {\n\t\t\t\treturn false;\n\t\t\t}\n\t\t}\n\t\tfor (size_t i = 0; i < num_shards; i++) {\n\t\t\tif (!file::file_exists(path + \"/\" + std::to_string(i) + \".data\")) {\n\t\t\t\treturn false;\n\t\t\t}\n\t\t}\n\n\t\treturn true;\n\t}\n\n\tvoid merge_internal_links(const std::string &path, const std::string &batch_name) {\n\t\treturn;\n\t\t/*\n\t\tconst std::string target_path = \"/slow_data/internal_links/\" + batch_name;\n\t\tfile::create_directory(target_path);\n\t\tfor (size_t i = 0; i < 8; i++) {\n\t\t\tfile::copy_file(path + \"/internal_links_\" + std::to_string(i), target_path + \"/internal_links_\" + std::to_string(i));\n\t\t}\n\t\t*/\n\t\tutils::thread_pool pool(8);\n\t\tfor (size_t i = 0; i < 8; i++) {\n\t\t\tpool.enqueue([i, path]() {\n\t\t\t\tfile::archive tar(path + \"/internal_links_\" + std::to_string(i));\n\t\t\t\tutils::thread_pool pool(4, 10);\n\t\t\t\ttar.untar([&pool](const std::string &filename, const std::string &data) {\n\n\t\t\t\t\tpool.enqueue([filename, data]() {\n\t\t\t\t\t\tuint64_t host_hash = std::stoull(filename.substr(0, filename.size() - 5));\n\n\t\t\t\t\t\tstd::istringstream ram_reader(data);\n\n\t\t\t\t\t\tindexer::index_builder<indexer::value_record> idx1(\"internal_links\", host_hash, 1000);\n\t\t\t\t\t\tindexer::index<indexer::value_record> idx2(&ram_reader, 1000);\n\n\t\t\t\t\t\ttry {\n\t\t\t\t\t\t\tidx1.merge_with(idx2);\n\t\t\t\t\t\t} catch (const std::runtime_error &err) {\n\t\t\t\t\t\t\t// The file is corrupt. Lets delete it and report.\n\t\t\t\t\t\t\tstd::cout << \"internal_links: \" << host_hash << \" is corrupt\" << std::endl;\n\t\t\t\t\t\t\tidx1.truncate();\n\t\t\t\t\t\t} catch (const std::bad_alloc &err) {\n\t\t\t\t\t\t\t// The file is corrupt. 
Lets delete it and report.\n\t\t\t\t\t\t\tstd::cout << \"internal_links: \" << host_hash << \" is corrupt\" << std::endl;\n\t\t\t\t\t\t\tidx1.truncate();\n\t\t\t\t\t\t}\n\t\t\t\t\t});\n\t\t\t\t});\n\t\t\t\tpool.run_all();\n\t\t\t});\n\t\t}\n\t\tpool.run_all();\n\t\tstd::cout << \"finished with the merge\" << std::endl;\n\t}\n\n\tvoid merge_hash_table(const std::string &path) {\n\t\tutils::thread_pool pool(32);\n\t\thash_table2::builder ht(\"all_urls\", 1019, 1000000, \"/slow_data\");\n\t\tfor (size_t i = 0; i < 1019; i++) {\n\t\t\tpool.enqueue([&ht, i, path]() {\n\t\t\t\tht.get_shard(i)->merge_with(path + \"/\" + std::to_string(i) + \".pos\", path + \"/\" + std::to_string(i) + \".data\");\n\t\t\t});\n\t\t}\n\t\tpool.run_all();\n\t}\n\n\tvoid merge_downloader() {\n\n\t\tindexer::index_builder<indexer::value_record>::create_directories(\"internal_links\");\n\n\t\tfile::read_directory(config::data_path() + \"/downloader\", [](const std::string &node_id) {\n\t\t\tconst std::string dir = config::data_path() + \"/downloader/\" + node_id;\n\t\t\tfile::read_directory(dir, [dir](const std::string &file) {\n\t\t\t\ttry {\n\t\t\t\t\tsize_t ts = std::stoull(file);\n\t\t\t\t\tconst std::string batch = dir + \"/\" + std::to_string(ts);\n\t\t\t\t\tif (internal_links_complete(batch) && hash_table_complete(batch + \"/ht\")) {\n\t\t\t\t\t\tstd::cout << \"merging directory: \" << batch << std::endl;\n\t\t\t\t\t\tprofiler::instance prof1(\"merge_internal_links\");\n\t\t\t\t\t\tmerge_internal_links(batch, std::to_string(ts));\n\t\t\t\t\t\tprof1.stop();\n\t\t\t\t\t\tprofiler::instance prof2(\"merge_hash_table\");\n\t\t\t\t\t\tmerge_hash_table(batch + \"/ht\");\n\t\t\t\t\t\tprof2.stop();\n\t\t\t\t\t\tfile::delete_directory(batch);\n\t\t\t\t\t\texit(0);\n\t\t\t\t\t}\n\t\t\t\t} catch (...) {\n\t\t\t\t}\n\t\t\t});\n\t\t});\n\t}\n}\n\n"
  },
  {
    "path": "src/downloader/merge_downloader.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include <iostream>\n\nnamespace downloader {\n\tvoid merge_downloader();\n}\n\n"
  },
  {
    "path": "src/downloader/warc_downloader.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include <iomanip>\n\n#include \"config.h\"\n#include \"common/datetime.h\"\n#include \"warc/warc.h\"\n#include \"utils/thread_pool.hpp\"\n#include \"utils/id_allocator.h\"\n#include \"file/archive.h\"\n#include \"logger/logger.h\"\n#include \"text/text.h\"\n#include \"transfer/transfer.h\"\n#include <iostream>\n#include \"hash_table2/builder.h\"\n#include \"algorithm/algorithm.h\"\n#include \"indexer/index_utils.h\"\n#include \"indexer/index_builder.h\"\n#include \"indexer/value_record.h\"\n#include \"indexer/merger.h\"\n\nnamespace downloader {\n\n\tvoid run_downloader(const std::string &warc_path) {\n\n\t\twarc::parser pp;\n\t\tfor (int retry = 0; retry < 3; retry++) {\n\t\t\ttry {\n\t\t\t\twarc::multipart_download(\"http://data.commoncrawl.org/\" + 
warc_path, [&pp](const std::string &chunk) {\n\t\t\t\t\tstd::stringstream ss(chunk);\n\t\t\t\t\tpp.parse_stream(ss);\n\t\t\t\t});\n\t\t\t\tbreak;\n\t\t\t} catch (const std::runtime_error &err) {\n\t\t\t\tstd::cout << \"GOT ERROR: \" << err.what() << std::endl;\n\t\t\t\tstd::cout << \"Retrying... try \" << retry << std::endl;\n\t\t\t\tstd::this_thread::sleep_for(std::chrono::seconds(5));\n\t\t\t}\n\t\t}\n\n\t\tLOG_INFO(\"uploading: \" + warc_path);\n\t\tint error;\n\t\terror = transfer::upload_gz_file(warc::get_result_path(warc_path), pp.result());\n\t\terror = transfer::upload_gz_file(warc::get_link_result_path(warc_path), pp.link_result());\n\n\t\tif (error) {\n\t\t\tLOG_INFO(\"error uploading: \" + warc_path);\n\t\t}\n\n\t}\n\n\tstd::vector<std::string> download_warc_paths() {\n\t\tint error;\n\t\tauto content = transfer::file_to_string(\"nodes/\" + config::node + \"/warc.paths\", error);\n\t\tif (error == transfer::ERROR) return {};\n\n\t\tcontent = text::trim(content);\n\n\t\tstd::vector<std::string> raw_warc_paths;\n\t\tboost::algorithm::split(raw_warc_paths, content, boost::is_any_of(\"\\n\"));\n\n\t\tstd::vector<std::string> warc_paths;\n\t\tfor (const auto &warc_path : raw_warc_paths) {\n\t\t\tif (text::trim(warc_path).size()) {\n\t\t\t\twarc_paths.push_back(text::trim(warc_path));\n\t\t\t}\n\t\t}\n\n\t\treturn warc_paths;\n\t}\n\n\tbool upload_warc_paths(const std::vector<std::string> &warc_paths) {\n\t\tauto content = boost::algorithm::join(warc_paths, \"\\n\");\n\t\tint error = transfer::upload_file(\"nodes/\" + config::node + \"/warc.paths\", content);\n\t\treturn error == transfer::OK;\n\t}\n\n\tvoid start_downloaders(const std::vector<std::string> &warc_paths) {\n\n\t\tconst size_t num_threads = 12;\n\n\t\tstd::vector<std::vector<std::string>> chunks;\n\t\talgorithm::vector_chunk<std::string>(warc_paths, std::ceil(warc_paths.size() / num_threads) + 1, chunks);\n\n\t\tutils::thread_pool pool(num_threads);\n\n\t\tfor (const auto &chunk : chunks) 
{\n\t\t\tpool.enqueue([chunk] {\n\t\t\t\tsize_t count = 0;\n\t\t\t\tfor (const auto &warc_path : chunk) {\n\t\t\t\t\trun_downloader(warc_path);\n\t\t\t\t\tcount++;\n\t\t\t\t\tstd::cout << \"done with \" << warc_path << \" done with \" << count << \"/\" << chunk.size() << std::endl;\n\t\t\t\t}\n\t\t\t});\n\t\t}\n\n\t\tpool.run_all();\n\t}\n\n\tvoid upload_all() {\n\n\t\t/*auto upload_id = std::to_string(common::cur_datetime());\n\n\t\t// Upload internal links.\n\t\tfor (size_t i = 0; i < 8; i++) {\n\n\t\t\t// Optimize all internal links.\n\t\t\tutils::thread_pool pool(32);\n\t\t\tfile::read_directory(config::data_path() + \"/\" + std::to_string(i) + \"/full_text/internal_links\", [&pool](const std::string &filename) {\n\t\t\t\tuint64_t host_hash = std::stoull(filename.substr(0, filename.size() - 5));\n\t\t\t\tindexer::index_builder<indexer::value_record> idx(\"internal_links\", host_hash, 1000);\n\t\t\t\tidx.optimize();\n\t\t\t});\n\t\t\tpool.run_all();\n\n\t\t\tconst auto filename = \"internal_links_\" + std::to_string(i);\n\t\t\tfile::archive tar(filename);\n\t\t\ttar.read_dir(config::data_path() + \"/\" + std::to_string(i) + \"/full_text/internal_links\");\n\n\t\t\ttransfer::upload_file_from_disk(\"downloader/\" + config::node + \"/\" + upload_id + \"/\" + filename, filename);\n\n\t\t\tfile::delete_file(filename);\n\t\t}\n\n\t\thash_table2::hash_table ht(\"crawl_index\", 1019);\n\t\tht.for_each_shard([upload_id](auto shard) {\n\n\t\t\tconst auto pos_filename = shard->filename_pos();\n\t\t\tconst auto data_filename = shard->filename_data();\n\t\t\tconst auto target_filename = std::to_string(shard->shard_id());\n\n\t\t\ttransfer::upload_file_from_disk(\"downloader/\" + config::node + \"/\" + upload_id + \"/ht/\" + target_filename + \".pos\", pos_filename);\n\t\t\ttransfer::upload_file_from_disk(\"downloader/\" + config::node + \"/\" + upload_id + \"/ht/\" + target_filename + \".data\", data_filename);\n\t\t});\n\t\t*/\n\n\t}\n\n\tvoid warc_downloader_with_url(const 
std::string &batch, const std::string &warc_paths_url) {\n\t\n\t\tstd::vector<std::string> warc_paths;\n\n\t\tint error;\n\t\tauto content = transfer::gz_file_to_string(warc_paths_url, error);\n\n\t\tstd::stringstream ss(content);\n\n\t\tstd::string line;\n\t\tsize_t line_num = 0;\n\t\twhile (std::getline(ss, line)) {\n\t\t\tif (line_num % config::nodes_in_cluster == config::node_id) {\n\t\t\t\twarc_paths.emplace_back(std::move(line));\n\t\t\t}\n\n\t\t\tline_num++;\n\t\t}\n\n\t\tstart_downloaders(warc_paths);\n\t}\n\n\tvoid warc_downloader(const std::string &batch) {\n\t\twarc_downloader_with_url(batch, \"https://data.commoncrawl.org/crawl-data/\" + batch + \"/warc.paths.gz\");\n\t}\n\n\tvoid warc_downloader_missing(const std::string &batch) {\n\t\twarc_downloader_with_url(batch, \"crawl-data/\" + batch + \"/missing.paths.gz\");\n\t}\n}\n\n"
  },
  {
    "path": "src/downloader/warc_downloader.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include <iostream>\n#include <vector>\n\nnamespace downloader {\n\n\tstd::vector<std::string> download_warc_paths();\n\tbool upload_warc_paths(const std::vector<std::string> &warc_paths);\n\n\tvoid warc_downloader(const std::string &batch);\n\tvoid warc_downloader_missing(const std::string &batch);\n}\n"
  },
  {
    "path": "src/file/archive.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include \"archive.h\"\n#include \"file.h\"\n#include \"algorithm/algorithm.h\"\n#include \"utils/thread_pool.hpp\"\n#include <cmath>\n#include <boost/filesystem.hpp>\n#include <boost/range/iterator_range.hpp>\n#include <boost/iostreams/filtering_stream.hpp>\n#include <boost/iostreams/filter/gzip.hpp>\n#include <sstream>\n\nnamespace file {\n\n\tarchive::archive(const std::string &filename)\n\t: m_filename(filename) {\n\n\t}\n\n\tarchive::~archive() {\n\t\n\t}\n\n\tvoid archive::read_dir(const std::string &dirname) {\n\n\t\t// Truncate target file.\n\t\tstd::ofstream outfile(m_filename, std::ios::binary | std::ios::trunc);\n\t\toutfile.close();\n\n\t\tboost::filesystem::path path(dirname);\n\n\t\tstd::vector<boost::filesystem::path> paths;\n\n\t\tif 
(is_directory(path)) {\n\t\t\tboost::filesystem::directory_iterator iter(path);\n\t\t\tfor (auto &file : boost::make_iterator_range(iter, {})) {\n\t\t\t\tpaths.push_back(file.path());\n\t\t\t}\n\t\t}\n\n\t\tstd::vector<std::vector<boost::filesystem::path>> chunks;\n\t\talgorithm::vector_chunk(paths, std::ceil(paths.size() / m_num_threads) + 1, chunks);\n\n\t\tutils::thread_pool pool(m_num_threads);\n\n\t\tsize_t worker_id = 0;\n\t\tfor (const auto &chunk : chunks) {\n\n\t\t\t// Remove worker file.\n\t\t\t::file::delete_file(m_filename + \".\" + std::to_string(worker_id));\n\n\t\t\tpool.enqueue([this, chunk, worker_id]() {\n\t\t\t\tfor (const auto &path : chunk) {\n\t\t\t\t\tadd_file(path.generic_string(), path.filename().generic_string(), worker_id);\n\t\t\t\t}\n\t\t\t});\n\t\t\tworker_id++;\n\t\t}\n\n\t\tpool.run_all();\n\n\t\t// Merge workers.\n\t\tfor (size_t worker_id = 0; worker_id < m_num_threads; worker_id++) {\n\n\t\t\tstd::filebuf infile, outfile;\n\t\t\t\n\t\t\toutfile.open(m_filename, std::ios::out | std::ios::binary | std::ios::app);\n\t\t\tinfile.open(m_filename + \".\" + std::to_string(worker_id), std::ios::in | std::ios::binary);\n\n\t\t\tstd::copy(std::istreambuf_iterator<char>(&infile), {}, std::ostreambuf_iterator<char>(&outfile));\n\n\t\t\t// Remove worker file.\n\t\t\t::file::delete_file(m_filename + \".\" + std::to_string(worker_id));\n\t\t}\n\t}\n\n\tvoid archive::untar(const std::string &dest_dir) {\n\t\tstd::ifstream infile(m_filename, std::ios::binary);\n\n\t\ttar_header header;\n\n\t\twhile (!infile.eof()) {\n\t\t\tinfile.read((char *)&header, sizeof(tar_header));\n\n\t\t\tif (infile.eof()) break;\n\n\t\t\t// This is an unnecessary copy.\n\t\t\tchar *buffer = new char[header.m_len];\n\t\t\tinfile.read(buffer, header.m_len);\n\n\t\t\tstd::string buffer_string(buffer, header.m_len);\n\t\t\tstd::stringstream buffer_stream(buffer_string);\n\n\t\t\tdelete[] buffer;\n\n\t\t\tboost::iostreams::filtering_istream 
decompress_stream;\n\t\t\tdecompress_stream.push(boost::iostreams::gzip_decompressor());\n\t\t\tdecompress_stream.push(buffer_stream);\n\n\t\t\tstd::string decompressed_data(std::istreambuf_iterator<char>(decompress_stream), {});\n\n\t\t\tstd::ofstream outfile(dest_dir + \"/\" + header.m_filename, std::ios::binary);\n\t\t\toutfile.write(decompressed_data.c_str(), decompressed_data.size());\n\t\t}\n\t\t\n\t}\n\n\tvoid archive::untar(std::function<void(const std::string &, const std::string &)> cb) {\n\t\tstd::ifstream infile(m_filename, std::ios::binary);\n\n\t\ttar_header header;\n\n\t\twhile (!infile.eof()) {\n\t\t\tinfile.read((char *)&header, sizeof(tar_header));\n\n\t\t\tif (infile.eof()) break;\n\n\t\t\t// This is an unnecessary copy.\n\t\t\tchar *buffer = new char[header.m_len];\n\t\t\tinfile.read(buffer, header.m_len);\n\n\t\t\tstd::string buffer_string(buffer, header.m_len);\n\t\t\tstd::stringstream buffer_stream(buffer_string);\n\n\t\t\tdelete[] buffer;\n\n\t\t\tboost::iostreams::filtering_istream decompress_stream;\n\t\t\tdecompress_stream.push(boost::iostreams::gzip_decompressor());\n\t\t\tdecompress_stream.push(buffer_stream);\n\n\t\t\tstd::string decompressed_data(std::istreambuf_iterator<char>(decompress_stream), {});\n\n\t\t\tcb(header.m_filename, decompressed_data);\n\t\t}\n\t\t\n\t}\n\n\tvoid archive::add_file(const std::string &path, const std::string &filename, size_t worker_id) {\n\n\t\tstd::ofstream outfile(m_filename + \".\" + std::to_string(worker_id), std::ios::binary | std::ios::app);\n\n\t\tstd::string data = ::file::cat(path);\n\n\t\tstd::stringstream ss(data);\n\t\tboost::iostreams::filtering_istream compress_stream;\n\t\tcompress_stream.push(boost::iostreams::gzip_compressor());\n\t\tcompress_stream.push(ss);\n\n\t\tstd::string compressed_data(std::istreambuf_iterator<char>(compress_stream), {});\n\n\t\ttar_header header;\n\t\theader.m_len = compressed_data.size();\n\t\tfilename.copy(header.m_filename, filename.size(), 
0);\n\t\theader.m_filename[filename.size()] = 0;\n\n\t\toutfile.write((char *)&header, sizeof(tar_header));\n\t\toutfile.write((char *)compressed_data.c_str(), compressed_data.size());\n\t}\n\n\n}\n"
  },
  {
    "path": "src/file/archive.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include <iostream>\n#include <functional>\n\nnamespace file {\n\n\tclass archive {\n\n\t\tpublic:\n\t\t\texplicit archive(const std::string &filename);\n\t\t\t~archive();\n\n\t\t\tvoid read_dir(const std::string &dirname);\n\t\t\tvoid untar(const std::string &dest_dir);\n\t\t\tvoid untar(std::function<void(const std::string &, const std::string &)> cb);\n\n\t\tprivate:\n\t\t\tconst size_t m_num_threads = 32;\n\t\t\tstd::string m_filename;\n\n\t\t\tstruct tar_header {\n\t\t\t\tsize_t m_len;\n\t\t\t\tchar m_filename[256];\n\t\t\t};\n\n\t\t\tvoid add_file(const std::string &path, const std::string &filename, size_t worker_id);\n\n\t};\n\n}\n"
  },
  {
    "path": "src/file/file.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include \"config.h\"\n#include \"file.h\"\n#include <boost/filesystem.hpp>\n#include <boost/range/iterator_range.hpp>\n\nnamespace file {\n\n\tstd::string read_test_file(const std::string &file_name) {\n\n\t\tstd::ifstream file(config::test_data_path + file_name);\n\t\tif (file.is_open()) {\n\t\t\tstd::string ret;\n\t\t\tfile.seekg(0, std::ios::end);\n\t\t\tret.resize(file.tellg());\n\t\t\tfile.seekg(0, std::ios::beg);\n\t\t\tfile.read(&ret[0], ret.size());\n\t\t\tfile.close();\n\t\t\treturn ret;\n\t\t}\n\t\treturn \"\";\n\t}\n\n\tvoid rename(const std::string &old_path, const std::string &new_path) {\n\t\tboost::filesystem::rename(old_path, new_path);\n\t}\n\n\tvoid copy_file(const std::string &source, const std::string &dest) {\n\t\tstd::ifstream 
infile(source, std::ios::binary);\n\t\tstd::ofstream outfile(dest, std::ios::binary | std::ios::trunc);\n\n\t\toutfile << infile.rdbuf();\n\t}\n\n\tvoid delete_file(const std::string &file) {\n\t\tboost::filesystem::remove(file);\n\t}\n\n\tvoid create_directory(const std::string &path) {\n\t\tboost::filesystem::create_directories(path);\n\t}\n\n\tvoid delete_directory(const std::string &path) {\n\t\tboost::filesystem::remove_all(path);\n\t}\n\n\tstd::string cat(const std::string &filename) {\n\t\tstd::ifstream infile(filename);\n\t\tstd::istreambuf_iterator<char> iter(infile), end; \n\t\tstd::string ret(iter, end);\n\t\treturn ret;\n\t}\n\n\tvoid read_directory(const std::string &dirname, std::function<void(const std::string &)> cb) {\n\n\t\tboost::filesystem::path path(dirname);\n\n\t\tif (is_directory(path)) {\n\t\t\tboost::filesystem::directory_iterator iter(path);\n\t\t\tfor (auto &file : boost::make_iterator_range(iter, {})) {\n\t\t\t\tcb(file.path().filename().generic_string());\n\t\t\t}\n\t\t}\n\t}\n\n\tbool directory_exists(const std::string &filename) {\n\t\treturn boost::filesystem::is_directory(filename) && boost::filesystem::exists(filename);\n\t}\n\n\tbool file_exists(const std::string &filename) {\n\t\tstd::ifstream infile(filename);\n\t\treturn infile.good();\n\t}\n\n}\n"
  },
  {
    "path": "src/file/file.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include <iostream>\n#include <fstream>\n#include <stdio.h>\n#include <functional>\n\nnamespace file {\n\n\tstd::string read_test_file(const std::string &file_name);\n\n\tvoid rename(const std::string &old_path, const std::string &new_path);\n\n\tvoid copy_file(const std::string &source, const std::string &dest);\n\tvoid delete_file(const std::string &filename);\n\n\tvoid create_directory(const std::string &path);\n\tvoid delete_directory(const std::string &path);\n\n\t/*\n\t * Returns the whole content of the file.\n\t * */\n\tstd::string cat(const std::string &filename);\n\n\tvoid read_directory(const std::string &path, std::function<void(const std::string &)> cb);\n\n\tbool directory_exists(const std::string &filename);\n\tbool 
file_exists(const std::string &filename);\n\n}\n"
  },
  {
    "path": "src/file/gz_tsv_file.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include \"gz_tsv_file.h\"\n#include <exception>\n#include <boost/iostreams/filtering_stream.hpp>\n#include <boost/iostreams/filter/gzip.hpp>\n#include <boost/algorithm/string.hpp>\n\nnamespace file {\n\n\tgz_tsv_file::gz_tsv_file() {\n\n\t}\n\n\tgz_tsv_file::gz_tsv_file(const std::string &file_name) {\n\t\tm_file_name = file_name;\n\n\t\tstd::ifstream infile(m_file_name);\n\n\t\tif (infile.is_open()) {\n\t\t\tboost::iostreams::filtering_istream decompress_stream;\n\t\t\tdecompress_stream.push(boost::iostreams::gzip_decompressor());\n\t\t\tdecompress_stream.push(infile);\n\n\t\t\tm_data = std::string(std::istreambuf_iterator<char>(decompress_stream), {});\n\t\t}\n\t}\n\n\tgz_tsv_file::~gz_tsv_file() {\n\t}\n\n\tsize_t 
gz_tsv_file::read_column_into(size_t column, std::vector<std::string> &container) {\n\t\tstd::stringstream ss(m_data);\n\n\t\tstd::string line;\n\t\tsize_t rows_read = 0;\n\t\twhile (getline(ss, line)) {\n\t\t\tstd::vector<std::string> cols;\n\t\t\tboost::algorithm::split(cols, line, boost::is_any_of(\"\\t\"));\n\t\t\tif (cols.size() > column) {\n\t\t\t\tcontainer.push_back(cols[column]);\n\t\t\t} else {\n\t\t\t\tcontainer.push_back(\"\");\n\t\t\t}\n\t\t\trows_read++;\n\t\t}\n\n\t\treturn rows_read;\n\t}\n\n}\n"
  },
  {
    "path": "src/file/gz_tsv_file.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include <iostream>\n#include <sstream>\n#include <fstream>\n#include <set>\n#include <vector>\n#include <map>\n#include <string.h>\n\nnamespace file {\n\n\tclass gz_tsv_file {\n\n\tpublic:\n\n\t\tgz_tsv_file();\n\t\texplicit gz_tsv_file(const std::string &file_name);\n\t\t~gz_tsv_file();\n\n\t\tsize_t read_column_into(size_t column, std::vector<std::string> &container);\n\n\tprotected:\n\n\t\tstd::string m_file_name;\n\t\tstd::string m_data;\n\n\t};\n}\n"
  },
  {
    "path": "src/file/tsv_file.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include \"tsv_file.h\"\n#include <exception>\n\nnamespace file {\n\n\ttsv_file::tsv_file() {\n\n\t}\n\n\ttsv_file::tsv_file(const std::string &file_name) {\n\t\tset_file_name(file_name);\n\t}\n\n\ttsv_file::~tsv_file() {\n\t\tm_file.close();\n\t}\n\n\tstd::string tsv_file::find(const std::string &key) {\n\t\tsize_t pos = binary_find_position(m_file_size, 0, key);\n\t\tif (pos == std::string::npos) {\n\t\t\treturn \"\";\n\t\t}\n\n\t\tm_file.seekg(pos, m_file.beg);\n\t\t\n\n\t\tstd::string line;\n\t\tgetline(m_file, line);\n\n\t\treturn line;\n\t}\n\n\tsize_t tsv_file::find_first_position(const std::string &key) {\n\t\tm_file.clear();\n\t\tm_file.seekg(0, m_file.beg);\n\t\tconst size_t pos = binary_find_position(m_file_size, 0, key);\n\t\tif (pos == 
std::string::npos) return std::string::npos;\n\t\t// pos is the position of one item. but we need the first one.\n\t\tsize_t jump = 1000;\n\t\twhile (pos > jump) {\n\t\t\tm_file.seekg(pos - jump, m_file.beg);\n\t\t\t// read next line.\n\t\t\tstd::string line;\n\t\t\tgetline(m_file, line);\n\t\t\tgetline(m_file, line);\n\t\t\tauto jump_key = line.substr(0, line.find(\"\\t\"));\n\t\t\tif (jump_key < key) {\n\t\t\t\t// We jamp too far.\n\t\t\t\tbreak;\n\t\t\t}\n\t\t\tjump = jump << 1;\n\t\t}\n\t\tif (pos < jump) jump = pos;\n\n\t\t// The first occurance is between pos - jump and pos - (jump/2)\n\t\t// Linear search.\n\t\tm_file.seekg(pos - jump, m_file.beg);\n\t\tstd::string line;\n\t\tif (pos > jump) {\n\t\t\tgetline(m_file, line);\n\t\t}\n\t\twhile (getline(m_file, line)) {\n\t\t\tauto jump_key = line.substr(0, line.find(\"\\t\"));\n\t\t\tif (jump_key == key) {\n\t\t\t\treturn (size_t)m_file.tellg() - (line.size() + 1u);\n\t\t\t}\n\t\t}\n\t\treturn std::string::npos;\n\t}\n\n\tsize_t tsv_file::find_last_position(const std::string &key) {\n\t\tm_file.clear();\n\t\tm_file.seekg(0, m_file.beg);\n\t\tconst size_t pos = binary_find_position(m_file_size, 0, key);\n\t\tif (pos == std::string::npos) return std::string::npos;\n\t\t// pos is the position of one item. 
but we need the last one.\n\t\tsize_t jump = 1000;\n\t\twhile (pos + jump < m_file_size) {\n\t\t\tm_file.seekg(pos + jump, m_file.beg);\n\t\t\t// read next line.\n\t\t\tstd::string line;\n\t\t\tgetline(m_file, line);\n\t\t\tgetline(m_file, line);\n\t\t\tauto jump_key = line.substr(0, line.find(\"\\t\"));\n\t\t\tif (jump_key > key) {\n\t\t\t\t// We jamp too far.\n\t\t\t\tbreak;\n\t\t\t}\n\t\t\tjump = jump << 1;\n\t\t}\n\t\tjump = jump >> 1;\n\t\tif (pos + jump > m_file_size) {\n\t\t\tjump = 0;\n\t\t}\n\n\t\t// The first occurance is between pos - jump and pos - (jump/2)\n\t\t// Linear search.\n\t\tm_file.seekg(pos + jump, m_file.beg);\n\t\tsize_t ret_pos = pos + jump;\n\t\tstd::string line;\n\t\tgetline(m_file, line);\n\t\tsize_t last_line_length = line.size() + 1u;\n\t\tret_pos += line.size() + 1u;\n\t\twhile (getline(m_file, line)) {\n\t\t\tauto jump_key = line.substr(0, line.find(\"\\t\"));\n\t\t\tif (jump_key > key) {\n\t\t\t\treturn ret_pos - last_line_length;\n\t\t\t}\n\t\t\tret_pos += line.size() + 1u;\n\t\t\tlast_line_length = line.size() + 1u;\n\t\t}\n\t\treturn ret_pos - last_line_length;\n\t}\n\n\tsize_t tsv_file::find_next_position(const std::string &key) {\n\t\tm_file.clear();\n\t\tm_file.seekg(0, m_file.beg);\n\t\tconst size_t pos = binary_find_position_any(m_file_size, 0, key);\n\n\t\t// pos is the position of one item. 
but we need the last one.\n\t\tsize_t jump = 1000;\n\t\twhile (pos + jump < m_file_size) {\n\t\t\tm_file.seekg(pos + jump, m_file.beg);\n\t\t\t// read next line.\n\t\t\tstd::string line;\n\t\t\tgetline(m_file, line);\n\t\t\tgetline(m_file, line);\n\t\t\tauto jump_key = line.substr(0, line.find(\"\\t\"));\n\t\t\tif (jump_key > key) {\n\t\t\t\t// We jamp too far.\n\t\t\t\tbreak;\n\t\t\t}\n\t\t\tjump = jump << 1;\n\t\t}\n\t\tjump = jump >> 1;\n\t\tif (pos + jump > m_file_size) {\n\t\t\tjump = 0;\n\t\t}\n\n\t\t// The first occurance is between pos - jump and pos - (jump/2)\n\t\t// Linear search.\n\t\tm_file.seekg(pos + jump, m_file.beg);\n\t\tsize_t ret_pos = pos + jump;\n\t\tstd::string line;\n\t\tgetline(m_file, line);\n\t\tret_pos += line.size() + 1u;\n\t\twhile (getline(m_file, line)) {\n\t\t\tauto jump_key = line.substr(0, line.find(\"\\t\"));\n\t\t\tif (jump_key > key) {\n\t\t\t\treturn ret_pos;\n\t\t\t}\n\t\t\tret_pos += line.size() + 1u;\n\t\t}\n\t\treturn m_file_size;\n\t}\n\n\tstd::map<std::string, std::string> tsv_file::find_all(const std::set<std::string> &keys) {\n\t\tm_file.clear();\n\t\tm_file.seekg(0, m_file.beg);\n\t\tsize_t pos = 0;\n\t\tstd::map<std::string, std::string> result;\n\t\tstd::string line;\n\t\tfor (const auto &key : keys) {\n\t\t\tpos = binary_find_position(m_file_size, pos, key);\n\t\t\tif (pos != std::string::npos) {\n\t\t\t\tm_file.seekg(pos, m_file.beg);\n\t\t\t\tgetline(m_file, line);\n\t\t\t\tresult[key] = line;\n\t\t\t} else {\n\t\t\t\t// Key not found, ignore.\n\t\t\t}\n\t\t}\n\n\t\treturn result;\n\t}\n\n\tsize_t tsv_file::read_column_into(int column, std::set<std::string> &container) {\n\t\t(void)column;\n\t\tm_file.clear();\n\t\tm_file.seekg(0, m_file.beg);\n\n\t\tif (!m_file.is_open()) {\n\t\t\tthrow std::runtime_error(\"File is not open any more: \" + m_file_name);\n\t\t}\n\n\t\tstd::string line;\n\t\tsize_t rows_read = 0;\n\t\twhile (getline(m_file, line)) {\n\t\t\tstd::stringstream ss(line);\n\t\t\tstd::string 
col;\n\t\t\tss >> col;\n\t\t\tcontainer.insert(col);\n\t\t\trows_read++;\n\t\t}\n\n\t\treturn rows_read;\n\t}\n\n\tsize_t tsv_file::read_column_into(int column, std::set<std::string> &container, size_t limit) {\n\t\t(void)limit;\n\t\tm_file.clear();\n\t\tm_file.seekg(0, m_file.beg);\n\n\t\tif (!m_file.is_open()) {\n\t\t\tthrow std::runtime_error(\"File is not open any more: \" + m_file_name);\n\t\t}\n\n\t\tstd::string line;\n\t\tsize_t rows_read = 0;\n\t\twhile (getline(m_file, line)) {\n\t\t\tstd::stringstream ss(line);\n\t\t\tstd::string col;\n\t\t\tss >> col;\n\t\t\tcontainer.insert(col);\n\t\t\trows_read++;\n\t\t\tif (rows_read >= limit) break;\n\t\t}\n\n\t\treturn rows_read;\n\t}\n\n\tsize_t tsv_file::read_column_into(int column, std::set<std::string> &container, size_t limit, size_t offset) {\n\t\tm_file.clear();\n\t\tm_file.seekg(0, m_file.beg);\n\n\t\tif (!m_file.is_open()) {\n\t\t\tthrow std::runtime_error(\"File is not open any more: \" + m_file_name);\n\t\t}\n\n\t\tstd::string line;\n\t\tsize_t rows_read = 0;\n\t\twhile (getline(m_file, line)) {\n\t\t\tstd::stringstream ss(line);\n\t\t\tstd::string col;\n\t\t\tss >> col;\n\t\t\tif (rows_read >= offset) {\n\t\t\t\tcontainer.insert(col);\n\t\t\t\trows_read++;\n\t\t\t\tif ((rows_read - offset) >= limit) break;\n\t\t\t} else {\n\t\t\t\trows_read++;\n\t\t\t}\n\t\t}\n\n\t\treturn rows_read;\n\t}\n\n\tsize_t tsv_file::size() const {\n\t\treturn m_file_size;\n\t}\n\n\tbool tsv_file::eof() const {\n\t\treturn m_file.eof();\n\t}\n\n\tbool tsv_file::is_open() const {\n\t\treturn m_file.is_open();\n\t}\n\n\tstd::string tsv_file::get_line() {\n\t\tstd::string line;\n\t\tgetline(m_file, line);\n\t\treturn line;\n\t}\n\n\tsize_t tsv_file::read_column_into(int column, std::vector<std::string> &container) {\n\t\tm_file.clear();\n\t\tm_file.seekg(0, m_file.beg);\n\n\t\tstd::string line;\n\t\tsize_t rows_read = 0;\n\t\twhile (getline(m_file, line)) {\n\t\t\tstd::stringstream ss(line);\n\t\t\tstd::string col;\n\t\t\tss >> 
col;\n\t\t\tcontainer.push_back(col);\n\t\t\trows_read++;\n\t\t}\n\n\t\treturn rows_read;\n\t}\n\n\tsize_t tsv_file::read_column_into(int column, std::vector<std::string> &container, size_t limit) {\n\t\tm_file.clear();\n\t\tm_file.seekg(0, m_file.beg);\n\n\t\tstd::string line;\n\t\tsize_t rows_read = 0;\n\t\twhile (getline(m_file, line)) {\n\t\t\tstd::stringstream ss(line);\n\t\t\tstd::string col;\n\t\t\tss >> col;\n\t\t\tcontainer.push_back(col);\n\t\t\trows_read++;\n\t\t\tif (rows_read >= limit) break;\n\t\t}\n\n\t\treturn rows_read;\n\t}\n\n\tsize_t tsv_file::read_column_into(int column, std::vector<std::string> &container, size_t limit, size_t offset) {\n\t\tm_file.clear();\n\t\tm_file.seekg(0, m_file.beg);\n\n\t\tstd::string line;\n\t\tsize_t rows_read = 0;\n\t\twhile (getline(m_file, line)) {\n\t\t\tstd::stringstream ss(line);\n\t\t\tstd::string col;\n\t\t\tss >> col;\n\t\t\tif (rows_read >= offset) {\n\t\t\t\tcontainer.push_back(col);\n\t\t\t\trows_read++;\n\t\t\t\tif ((rows_read - offset) >= limit) break;\n\t\t\t} else {\n\t\t\t\trows_read++;\n\t\t\t}\n\t\t}\n\n\t\treturn rows_read;\n\t}\n\n\tsize_t tsv_file::binary_find_position(size_t file_size, size_t offset, const std::string &key) {\n\n\t\tstd::string line;\n\n\t\tif (file_size - offset < 750) {\n\t\t\t// Make linear search.\n\t\t\tm_file.seekg(offset, m_file.beg);\n\t\t\tsize_t bytes_read = 0;\n\t\t\twhile (getline(m_file, line) && bytes_read <= file_size - offset) {\n\t\t\t\tbytes_read += (line.size() + 1u);\n\t\t\t\tif (line.starts_with(key + \"\\t\")) {\n\t\t\t\t\treturn (size_t)m_file.tellg() - (line.size() + 1u);\n\t\t\t\t}\n\t\t\t}\n\n\t\t\treturn std::string::npos;\n\t\t}\n\n\t\tsize_t pivot_len_1 = (file_size - offset) / 2;\n\t\tsize_t pivot = offset + pivot_len_1;\n\n\t\t// Get key at pivot.\n\t\tm_file.seekg(pivot, m_file.beg);\n\n\t\tgetline(m_file, line);\n\t\tgetline(m_file, line);\n\t\tauto pivot_key = line.substr(0, line.find(\"\\t\"));\n\n\t\tif (key < pivot_key) {\n\t\t\treturn 
binary_find_position(offset + pivot_len_1, offset, key);\n\t\t} else if (key > pivot_key) {\n\t\t\treturn binary_find_position(file_size, pivot, key);\n\t\t}\n\n\t\treturn (size_t)m_file.tellg() - (line.size() + 1u);\n\t}\n\n\tsize_t tsv_file::binary_find_position_any(size_t file_size, size_t offset, const std::string &key) {\n\n\t\tstd::string line;\n\n\t\tif (file_size - offset < 750) {\n\t\t\t// Make linear search.\n\t\t\tm_file.seekg(offset, m_file.beg);\n\t\t\tsize_t bytes_read = 0;\n\t\t\twhile (getline(m_file, line) && bytes_read <= file_size - offset) {\n\t\t\t\tbytes_read += (line.size() + 1u);\n\t\t\t\tconst auto this_key = line.substr(0, line.find(\"\\t\"));\n\t\t\t\tif (this_key >= key) {\n\t\t\t\t\treturn (size_t)m_file.tellg() - (line.size() + 1u);\n\t\t\t\t}\n\t\t\t}\n\n\t\t\treturn m_file_size;\n\t\t}\n\n\t\tsize_t pivot_len_1 = (file_size - offset) / 2;\n\t\tsize_t pivot = offset + pivot_len_1;\n\n\t\t// Get key at pivot.\n\t\tm_file.seekg(pivot, m_file.beg);\n\n\t\tgetline(m_file, line);\n\t\tgetline(m_file, line);\n\t\tauto pivot_key = line.substr(0, line.find(\"\\t\"));\n\n\t\tif (key < pivot_key) {\n\t\t\treturn binary_find_position_any(offset + pivot_len_1, offset, key);\n\t\t} else if (key > pivot_key) {\n\t\t\treturn binary_find_position_any(file_size, pivot, key);\n\t\t}\n\n\t\treturn (size_t)m_file.tellg() - (line.size() + 1u);\n\t}\n\n\tvoid tsv_file::set_file_name(const std::string &file_name) {\n\n\t\tm_file_name = file_name;\n\t\tm_original_file_name = file_name;\n\n\t\tm_file.open(m_file_name);\n\n\t\tif (!m_file.is_open()) {\n\t\t\tthrow std::runtime_error(\"Could not open file: \" + m_file_name + \" error: \" + strerror(errno));\n\t\t}\n\n\t\tm_file.seekg(0, m_file.end);\n\t\tm_file_size = m_file.tellg();\n\t\tm_file.seekg(0, m_file.beg);\n\t}\n\n}\n"
  },
  {
    "path": "src/file/tsv_file.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include <iostream>\n#include <sstream>\n#include <fstream>\n#include <set>\n#include <vector>\n#include <map>\n#include <string.h>\n\nnamespace file {\n\n\tclass tsv_file {\n\n\tpublic:\n\n\t\ttsv_file();\n\t\texplicit tsv_file(const std::string &file_name);\n\t\t~tsv_file();\n\n\t\t// Returns the line with the first column equals key. 
Returns std::string::npos if not present in file.\n\t\tstd::string find(const std::string &key);\n\n\t\t/*\n\t\t\tReturns the position of the FIRST line in the file with first column equals key.\n\t\t\tReturns std::string::npos if not present in file.\n\t\t*/\n\t\tsize_t find_first_position(const std::string &key);\n\t\t\n\t\t/*\n\t\t\tReturns the position of the LAST line in the file with first column equals key.\n\t\t\tReturns std::string::npos if not present in file.\n\t\t*/\n\t\tsize_t find_last_position(const std::string &key);\n\n\t\t/*\n\t\t\tReturns the position of the line AFTER the line in the file with first column equals key.\n\t\t\tIf the key does not exist it returns the position to the line where this key would be inserted. If the\n\t\t\tkey should be inserted to the end it returns m_file_size\n\t\t*/\n\t\tsize_t find_next_position(const std::string &key);\n\n\t\tstd::map<std::string, std::string> find_all(const std::set<std::string> &keys);\n\n\t\tsize_t read_column_into(int column, std::set<std::string> &container);\n\t\tsize_t read_column_into(int column, std::set<std::string> &container, size_t limit);\n\t\tsize_t read_column_into(int column, std::set<std::string> &container, size_t limit, size_t offset);\n\t\tsize_t read_column_into(int column, std::vector<std::string> &container);\n\t\tsize_t read_column_into(int column, std::vector<std::string> &container, size_t limit);\n\t\tsize_t read_column_into(int column, std::vector<std::string> &container, size_t limit, size_t offset);\n\n\t\tsize_t size() const;\n\t\tbool eof() const;\n\t\tbool is_open() const;\n\t\tstd::string get_line();\n\n\tprotected:\n\n\t\tstd::string m_file_name;\n\t\tstd::string m_original_file_name;\n\t\tstd::ifstream m_file;\n\t\tsize_t m_file_size;\n\t\tbool m_is_gzipped = false;\n\t\t\n\t\t/*\n\t\t\tDifference is that _any returns the position where this key WOULD be if it was inserted even if it is not\n\t\t\tpresent.\n\t\t*/\n\t\tsize_t binary_find_position(size_t 
file_size, size_t offset, const std::string &key);\n\t\tsize_t binary_find_position_any(size_t file_size, size_t offset, const std::string &key);\n\n\t\tvoid set_file_name(const std::string &file_name);\n\n\t};\n}\n"
  },
  {
    "path": "src/file/tsv_file_remote.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include \"tsv_file_remote.h\"\n#include \"logger/logger.h\"\n#include \"transfer/transfer.h\"\n\n#include <boost/filesystem.hpp>\n#include <boost/iostreams/filtering_stream.hpp>\n#include <boost/iostreams/copy.hpp>\n#include <boost/iostreams/filter/gzip.hpp>\n\n//using namespace boost::iostreams;\n\nnamespace file {\n\n\ttsv_file_remote::tsv_file_remote(const std::string &file_name) {\n\t\t// Check if the file exists.\n\n\t\tm_file_name = file_name;\n\n\t\tstd::ifstream infile(get_path());\n\n\t\tif (download_file() == transfer::OK) {\n\t\t\tset_file_name(get_path());\n\t\t} else {\n\t\t\tinfile.close();\n\t\t}\n\t}\n\n\ttsv_file_remote::~tsv_file_remote() {\n\t\t\n\t}\n\n\tstd::string tsv_file_remote::get_path() const {\n\t\treturn 
config::data_path() + \"/0/\" + m_file_name;\n\t}\n\n\tint tsv_file_remote::download_file() {\n\n\t\tif (m_file_name.find(\".gz\") == m_file_name.size() - 3) {\n\t\t\tm_is_gzipped = true;\n\t\t} else {\n\t\t\tm_is_gzipped = false;\n\t\t}\n\n\t\tLOG_INFO(\"Downloading file with key: \" + m_file_name);\n\n\t\tcreate_directory();\n\t\tstd::ofstream outfile(get_path(), std::ios::trunc);\n\n\t\tint error = transfer::ERROR;\n\t\tif (outfile.good()) {\n\t\t\tif (m_is_gzipped) {\n\t\t\t\ttransfer::gz_file_to_stream(m_file_name, outfile, error);\n\t\t\t} else {\n\t\t\t\ttransfer::file_to_stream(m_file_name, outfile, error);\n\t\t\t}\n\n\t\t\tif (error == transfer::ERROR) {\n\t\t\t\tLOG_INFO(\"Download failed...\");\n\t\t\t}\n\t\t}\n\n\t\tLOG_INFO(\"Done downloading file with key: \" + m_file_name);\n\n\t\treturn error;\n\t}\n\n\tvoid tsv_file_remote::create_directory() {\n\t\tboost::filesystem::path path(get_path());\n\t\tboost::filesystem::create_directories(path.parent_path());\n\t}\n\n}\n"
  },
  {
    "path": "src/file/tsv_file_remote.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include \"tsv_file.h\"\n\nnamespace file {\n\n\tclass tsv_file_remote : public tsv_file {\n\n\tpublic:\n\n\t\texplicit tsv_file_remote(const std::string &file_name);\n\t\t~tsv_file_remote();\n\n\t\tstd::string get_path() const;\n\n\tprivate:\n\n\t\tint download_file();\n\t\tvoid create_directory();\n\n\t};\n\n}\n"
  },
  {
    "path": "src/file/tsv_row.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include \"tsv_row.h\"\n\nnamespace file {\n\n\ttsv_row::tsv_row(const std::string &line) {\n\t\tsize_t pos_start = 0;\n\t\tsize_t pos_end = 0;\n\t\twhile (pos_end != std::string::npos) {\n\t\t\tpos_end = line.find(pos_start, '\\t');\n\t\t\tm_cols.emplace_back(line.substr(pos_start, pos_end));\n\t\t\tpos_start = pos_end + 1;\n\t\t}\n\t}\n\n\ttsv_row::~tsv_row() {\n\n\t}\n\n}\n"
  },
  {
    "path": "src/file/tsv_row.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include <iostream>\n#include <vector>\n\nnamespace file {\n\n\tclass tsv_row {\n\n\tpublic:\n\t\texplicit tsv_row(const std::string &line);\n\t\t~tsv_row();\n\n\tprivate:\n\t\tstd::vector<std::string> m_cols;\n\n\t};\n\n}\n"
  },
  {
    "path": "src/full_text/domain_link_record.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\nnamespace full_text {\n\n\tstruct domain_link_record {\n\n\t\tuint64_t m_value;\n\t\tfloat m_score;\n\t\tuint64_t m_source_domain;\n\t\tuint64_t m_target_domain;\n\n\t};\n}\n"
  },
  {
    "path": "src/full_text/link_record.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\nnamespace full_text {\n\n\tstruct link_record {\n\n\t\tuint64_t m_value;\n\t\tfloat m_score;\n\t\tuint64_t m_source_domain;\n\t\tuint64_t m_target_hash;\n\n\t};\n}\n"
  },
  {
    "path": "src/full_text/record.h",
    "content": "\n#pragma once\n\nnamespace full_text {\n\n\tstruct record {\n\n\t\tuint64_t m_value;\n\t\tfloat m_score;\n\t\tuint64_t m_domain_hash;\n\n\t};\n}\n"
  },
  {
    "path": "src/full_text/result_set.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include \"config.h\"\n#include <fcntl.h>\n#include <unistd.h>\n#include <iostream>\n#include <span>\n#include <cassert>\n\nnamespace full_text {\n\n\ttemplate<typename data_record>\n\tclass result_set {\n\n\tpublic:\n\n\t\tresult_set(size_t size);\n\t\t~result_set();\n\n\t\tsize_t size() const { return m_size; }\n\t\tsize_t max_size() const { return m_max_size; }\n\n\t\tconst data_record *data_pointer() const { return m_data_pointer; }\n\t\tconst data_record *section_pointer(size_t section) const { return &m_data_pointer[section * config::ft_max_results_per_section]; }\n\t\tdata_record *data_pointer() { return m_data_pointer; }\n\t\tdata_record *section_pointer(size_t section) { return &m_data_pointer[section * 
config::ft_max_results_per_section]; }\n\t\tstd::span<data_record> *span_pointer() { return &m_span; }\n\n\t\tsize_t total_num_results() const { return m_total_num_results ; };\n\t\tvoid set_total_num_results(size_t total_num_results);\n\n\t\tvoid resize(size_t n) {\n\t\t\tm_span = std::span<data_record>(m_data_pointer, n);\n\t\t\tm_size = n;\n\t\t}\n\n\t\tvoid prepare_sections(const std::string &filename, size_t offset, size_t len);\n\t\tvoid read_to_section(size_t section);\n\t\tbool has_next_section();\n\t\tsize_t num_sections();\n\t\tvoid close_sections();\n\t\tvoid copy_vector(const std::vector<data_record> &vec);\n\n\tprivate:\n\n\t\tresult_set(const result_set &res) = delete;\n\n\t\tstd::span<data_record> m_span;\n\t\tdata_record *m_data_pointer;\n\n\t\tsize_t m_size; // The length in first section.\n\t\tconst size_t m_max_size; // The maximum number of elements the result set can hold.\n\t\tsize_t m_total_size; // The lengths of all elements in all sections.\n\t\tsize_t m_total_num_results; // The total indexed length, only used to display total number of results.\n\t\tsize_t m_section_len;\n\t\tsize_t m_records_read;\n\t\tint m_file_descriptor;\n\t\tbool m_error = false;\n\n\t};\n\n\ttemplate<typename data_record>\n\tresult_set<data_record>::result_set(size_t size)\n\t: m_size(size), m_max_size(size), m_total_num_results(0)\n\t{\n\t\tm_file_descriptor = -1;\n\t\tm_data_pointer = new data_record[size];\n\t\tm_span = std::span<data_record>(m_data_pointer, size);\n\t}\n\n\ttemplate<typename data_record>\n\tresult_set<data_record>::~result_set() {\n\t\tdelete []m_data_pointer;\n\t}\n\n\ttemplate<typename data_record>\n\tvoid result_set<data_record>::set_total_num_results(size_t total_num_results) {\n\t\tm_total_num_results = total_num_results;\n\t}\n\n\ttemplate<typename data_record>\n\tvoid result_set<data_record>::prepare_sections(const std::string &filename, size_t offset, size_t len) {\n\n\t\tassert(m_file_descriptor < 0);\n\n\t\tm_size = len / 
sizeof(data_record);\n\t\tm_total_size = m_size;\n\t\tif (m_size > config::ft_max_results_per_section) m_size = config::ft_max_results_per_section;\n\n\t\tm_file_descriptor = open(filename.c_str(), O_RDONLY);\n\t\tposix_fadvise(m_file_descriptor, offset, m_total_size * sizeof(data_record), POSIX_FADV_SEQUENTIAL);\n\t\tlseek(m_file_descriptor, offset, SEEK_SET);\n\t\tm_records_read = 0;\n\t\tresize(m_size);\n\t}\n\n\t/*\n\t\tReads data up to and includint the section. So if the argument section equals zero the first section is read.\n\t*/\n\ttemplate<typename data_record>\n\tvoid result_set<data_record>::read_to_section(size_t section) {\n\t\tsize_t read_start = m_records_read;\n\t\tsize_t read_end = (section + 1) * config::ft_max_results_per_section;\n\t\tif (read_end > m_total_size) read_end = m_total_size;\n\n\t\tif (read_start > read_end) return;\n\n\t\tsize_t records_to_read = read_end - read_start;\n\n\t\tint bytes_read = ::read(m_file_descriptor, (void *)&m_data_pointer[m_records_read], (size_t)records_to_read * sizeof(data_record));\n\t\tif (bytes_read < 0) {\n\t\t\tm_error = true;\n\t\t} else {\n\t\t\tm_error = false;\n\t\t}\n\t\tm_records_read += records_to_read;\n\t}\n\n\ttemplate<typename data_record>\n\tbool result_set<data_record>::has_next_section() {\n\t\tif (m_file_descriptor < 0) return false;\n\t\treturn m_total_size > m_records_read;\n\t}\n\n\ttemplate<typename data_record>\n\tsize_t result_set<data_record>::num_sections() {\n\t\t// Ceiling integer division of m_total_size/config::ft_max_results_per_section;\n\t\treturn (m_total_size + config::ft_max_results_per_section - 1) / config::ft_max_results_per_section;\n\t}\n\n\ttemplate<typename data_record>\n\tvoid result_set<data_record>::close_sections() {\n\t\tif (m_file_descriptor >= 0) {\n\t\t\tclose(m_file_descriptor);\n\t\t\tm_file_descriptor = -1;\n\t\t}\n\t}\n\n\ttemplate<typename data_record>\n\tvoid result_set<data_record>::copy_vector(const std::vector<data_record> &vec) 
{\n\t\tmemcpy(&m_data_pointer[0], vec.data(), vec.size() * sizeof(data_record));\n\t\tresize(vec.size());\n\t}\n}\n\n"
  },
  {
    "path": "src/full_text/search_metric.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\nnamespace full_text {\n\n\tclass search_metric {\n\n\t\tpublic:\n\t\tsize_t m_total_found = 0;\n\t\tsize_t m_total_url_links_found = 0;\n\t\tsize_t m_total_domain_links_found = 0;\n\t\tsize_t m_links_handled = 0;\n\t\tsize_t m_link_domain_matches = 0;\n\t\tsize_t m_link_url_matches = 0;\n\n\t};\n\n}\n"
  },
  {
    "path": "src/hash_table2/builder.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include \"builder.h\"\n#include \"utils/thread_pool.hpp\"\n\nnamespace hash_table2 {\n\n\tbuilder::builder(const std::string &db_name, size_t num_shards, size_t hash_table_size,\n\t\t\tconst std::string &data_path)\n\t: m_db_name(db_name) {\n\t\tfor (size_t i = 0; i < num_shards; i++) {\n\t\t\tm_shards.push_back(new hash_table_shard_builder(db_name, i, hash_table_size, data_path));\n\t\t}\n\t}\n\n\tbuilder::~builder() {\n\t\tfor (hash_table_shard_builder *shard : m_shards) {\n\t\t\tdelete shard;\n\t\t}\n\t}\n\n\tvoid builder::add(uint64_t key, const std::string &value, size_t version) {\n\t\tm_shards[key % m_shards.size()]->add(key, value, version);\n\t}\n\n\tvoid builder::remove(uint64_t key) {\n\t\tm_shards[key % 
m_shards.size()]->remove(key);\n\t}\n\n\tvoid builder::merge() {\n\t\tutils::thread_pool pool(32);\n\t\tfor (hash_table_shard_builder *shard : m_shards) {\n\t\t\tpool.enqueue([shard]() -> void {\n\t\t\t\tshard->append();\n\t\t\t\tshard->merge();\n\t\t\t});\n\t\t}\n\n\t\tpool.run_all();\n\t}\n\n\tvoid builder::optimize() {\n\t\tutils::thread_pool pool(32);\n\t\tfor (hash_table_shard_builder *shard : m_shards) {\n\t\t\tpool.enqueue([shard]() -> void {\n\t\t\t\tshard->optimize();\n\t\t\t});\n\t\t}\n\n\t\tpool.run_all();\n\t}\n\n\tvoid builder::truncate() {\n\t\tfor (hash_table_shard_builder *shard : m_shards) {\n\t\t\tshard->truncate();\n\t\t}\n\t}\n\n\tvoid builder::merge_with(const builder &other) {\n\t\tfor (size_t i = 0; i < m_shards.size(); i++) {\n\t\t\tm_shards[i]->merge_with(*(other.m_shards[i]));\n\t\t}\n\t}\n}\n"
  },
  {
    "path": "src/hash_table2/builder.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include <iostream>\n#include \"hash_table_shard_builder.h\"\n#include \"config.h\"\n\nnamespace hash_table2 {\n\n\tclass builder {\n\n\tpublic:\n\n\t\texplicit builder(const std::string &db_name, size_t num_shards = config::ht_num_shards,\n\t\t\tsize_t hash_table_size = 1000000,\n\t\t\tconst std::string &data_path = config::data_path() + \"/{shard_id_mod_8}/hash_table\");\n\t\t~builder();\n\n\t\tvoid add(uint64_t key, const std::string &value, size_t version = 0);\n\t\tvoid remove(uint64_t key);\n\n\t\tvoid merge();\n\t\tvoid optimize();\n\t\tvoid truncate();\n\n\t\tvoid merge_with(const builder &other);\n\n\t\thash_table_shard_builder *get_shard(size_t shard_id) { return m_shards[shard_id]; 
};\n\n\tprivate:\n\n\t\tstd::vector<hash_table_shard_builder *> m_shards;\n\t\tconst std::string m_db_name;\n\n\t};\n}\n"
  },
  {
    "path": "src/hash_table2/hash_table.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include \"config.h\"\n#include \"hash_table.h\"\n#include \"hash_table_shard_builder.h\"\n#include \"logger/logger.h\"\n\nnamespace hash_table2 {\n\n\thash_table::hash_table(const std::string &db_name, size_t num_shards, size_t hash_table_size, const std::string &data_path)\n\t: m_db_name(db_name)\n\t{\n\t\tfor (size_t shard_id = 0; shard_id < num_shards; shard_id++) {\n\t\t\tauto shard = new hash_table_shard(m_db_name, shard_id, hash_table_size, data_path);\n\t\t\tm_shards.push_back(shard);\n\t\t}\n\t}\n\n\thash_table::~hash_table() {\n\t\tfor (hash_table_shard *shard : m_shards) {\n\t\t\tdelete shard;\n\t\t}\n\t}\n\n\tvoid hash_table::add(uint64_t key, const std::string &value) {\n\n\t\tconst size_t shard_id = key % 
m_shards.size();\n\t\thash_table_shard_builder builder(m_db_name, shard_id);\n\n\t\tbuilder.add(key, value);\n\t}\n\n\tvoid hash_table::truncate() {\n\t\tfor (size_t shard_id = 0; shard_id < m_shards.size(); shard_id++) {\n\t\t\thash_table_shard_builder builder(m_db_name, shard_id);\n\t\t\tbuilder.truncate();\n\t\t}\n\t}\n\n\tbool hash_table::has(uint64_t key) {\n\t\treturn m_shards[key % m_shards.size()]->has(key);\n\t}\n\n\tstd::string hash_table::find(uint64_t key) {\n\t\tsize_t ver = 0;\n\t\treturn find(key, ver);\n\t}\n\n\tstd::string hash_table::find(uint64_t key, size_t &ver) {\n\t\treturn m_shards[key % m_shards.size()]->find(key, ver);\n\t}\n\n\tsize_t hash_table::size() const {\n\t\tsize_t num_items = 0;\n\t\tfor (const auto &shard : m_shards) {\n\t\t\tnum_items += shard->size();\n\t\t} \n\t\treturn num_items;\n\t}\n\n\tvoid hash_table::for_each(std::function<void(uint64_t, const std::string &)> callback) const {\n\t\tfor (const auto &shard : m_shards) {\n\t\t\tshard->for_each(callback);\n\t\t}\n\t}\n\n\tvoid hash_table::for_each_key(std::function<void(uint64_t)> callback) const {\n\t\tfor (const auto &shard : m_shards) {\n\t\t\tshard->for_each_key(callback);\n\t\t}\n\t}\n\n\tvoid hash_table::for_each_shard(std::function<void(const hash_table_shard *shard)> callback) const {\n\t\tfor (const auto &shard : m_shards) {\n\t\t\tcallback(shard);\n\t\t}\n\t}\n\n}\n"
  },
  {
    "path": "src/hash_table2/hash_table.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include <iostream>\n#include <thread>\n#include <vector>\n#include <map>\n\n#include \"config.h\"\n#include \"hash_table_shard.h\"\n\nnamespace hash_table2 {\n\n\tclass hash_table_shard;\n\n\tclass hash_table {\n\n\tpublic:\n\n\t\texplicit hash_table(const std::string &db_name, size_t num_shards = config::ht_num_shards,\n\t\t\t\tsize_t hash_table_size = 1000000,\n\t\t\t\tconst std::string &data_path = config::data_path() + \"/{shard_id_mod_8}/hash_table\");\n\t\t~hash_table();\n\n\t\tvoid add(uint64_t key, const std::string &value);\n\t\tvoid truncate();\n\t\tbool has(uint64_t key);\n\t\tstd::string find(uint64_t key);\n\t\tstd::string find(uint64_t key, size_t &ver);\n\t\tsize_t size() const;\n\t\tvoid 
for_each(std::function<void(uint64_t, const std::string &)> callback) const;\n\t\tvoid for_each_key(std::function<void(uint64_t)> callback) const;\n\t\tvoid for_each_shard(std::function<void(const hash_table_shard *shard)> callback) const;\n\n\tprivate:\n\n\t\tstd::vector<hash_table_shard *> m_shards;\n\t\tconst std::string m_db_name;\n\n\t};\n\n}\n"
  },
  {
    "path": "src/hash_table2/hash_table_shard.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include <iostream>\n#include <sstream>\n#include <numeric>\n#include \"config.h\"\n#include \"hash_table_shard.h\"\n#include \"logger/logger.h\"\n\n#include <boost/iostreams/filtering_stream.hpp>\n#include <boost/iostreams/filter/gzip.hpp>\n\nnamespace hash_table2 {\n\n\thash_table_shard::hash_table_shard(const std::string &db_name, size_t shard_id, size_t hash_table_size,\n\t\t\tconst std::string &data_path)\n\t: hash_table_shard_base(db_name, shard_id, hash_table_size, data_path)\n\t{\n\t}\n\n\thash_table_shard::~hash_table_shard() {\n\n\t}\n\n\tbool hash_table_shard::has(uint64_t key) const {\n\n\t\tstd::ifstream reader(filename_pos(), std::ios::binary);\n\n\t\tconst size_t hash_pos = key % this->m_hash_table_size;\n\t\treader.seekg(hash_pos * 
sizeof(size_t));\n\n\t\t// Read page pos.\n\t\tsize_t page_pos = SIZE_MAX;\n\t\treader.read((char *)&page_pos, sizeof(size_t));\n\n\t\tif (page_pos == SIZE_MAX) return false;\n\n\t\t// Read page.\n\t\tsize_t page_len;\n\t\treader.seekg(this->hash_table_byte_size() + page_pos, std::ios::beg);\n\t\treader.read((char *)&page_len, sizeof(size_t));\n\n\t\tstd::vector<std::array<uint64_t, 3>> page(page_len);\n\t\treader.read((char *)page.data(), page_len * sizeof(std::array<uint64_t, 3>));\n\n\t\t// Find key among pages.\n\t\tfor (const auto &page_item : page) {\n\t\t\tif (page_item[0] == key) {\n\t\t\t\treturn true;\n\t\t\t}\n\t\t}\n\n\t\treturn false;\n\t}\n\n\tstd::string hash_table_shard::find(uint64_t key) const {\n\t\tsize_t ver;\n\t\treturn find(key, ver);\n\t}\n\n\tstd::string hash_table_shard::find(uint64_t key, size_t &ver) const {\n\n\t\tstd::ifstream reader(filename_pos(), std::ios::binary);\n\n\t\tconst size_t hash_pos = key % this->m_hash_table_size;\n\t\treader.seekg(hash_pos * sizeof(size_t));\n\n\t\t// Read page pos.\n\t\tsize_t page_pos = SIZE_MAX;\n\t\treader.read((char *)&page_pos, sizeof(size_t));\n\n\t\tif (page_pos == SIZE_MAX) return \"\";\n\n\t\t// Read page.\n\t\tsize_t page_len;\n\t\treader.seekg(this->hash_table_byte_size() + page_pos, std::ios::beg);\n\t\treader.read((char *)&page_len, sizeof(size_t));\n\n\t\tstd::vector<std::array<uint64_t, 3>> page(page_len);\n\t\treader.read((char *)page.data(), page_len * sizeof(std::array<uint64_t, 3>));\n\n\t\t// Find key among pages.\n\t\tsize_t pos = SIZE_MAX;\n\t\tfor (const auto &page_item : page) {\n\t\t\tif (page_item[0] == key) {\n\t\t\t\tpos = page_item[1];\n\t\t\t\tver = page_item[2];\n\t\t\t}\n\t\t}\n\n\t\tif (pos == SIZE_MAX) return \"\";\n\n\t\treturn data_at_position(pos);\n\t}\n\n\tvoid hash_table_shard::for_each(std::function<void(uint64_t, std::string)> callback) const {\n\t\tstd::ifstream infile(filename_data(), std::ios::binary);\n\t\tinfile.seekg(0, std::ios::beg);\n\n\t\twhile 
(!infile.eof()) {\n\t\t\tsize_t key;\n\t\t\tif (!infile.read((char *)&key, sizeof(size_t))) break;\n\t\t\t\n\t\t\tsize_t data_len;\n\t\t\tif (!infile.read((char *)&data_len, sizeof(size_t))) break;\n\n\t\t\tif (key == 0ull) {\n\t\t\t\t// Skip.\n\t\t\t\tinfile.seekg(data_len, std::ios::cur);\n\t\t\t\tcontinue;\n\t\t\t}\n\n\t\t\tstd::unique_ptr<char[]> buffer_allocator;\n\t\t\ttry {\n\t\t\t\tbuffer_allocator = std::make_unique<char[]>(data_len);\n\t\t\t} catch (std::bad_alloc &exception) {\n\t\t\t\tstd::cout << \"bad_alloc detected: \" << exception.what() << \" file: \" << __FILE__ << \" line: \" << __LINE__ << std::endl;\n\t\t\t\tstd::cout << \"tried to allocate: \" << data_len << \" bytes\" << std::endl;\n\t\t\t\tbreak;\n\t\t\t}\n\t\t\tchar *buffer = buffer_allocator.get();\n\n\t\t\tinfile.read(buffer, data_len);\n\t\t\tstd::stringstream ss(std::string(buffer, data_len));\n\n\t\t\tboost::iostreams::filtering_istream decompress_stream;\n\t\t\tdecompress_stream.push(boost::iostreams::gzip_decompressor());\n\t\t\tdecompress_stream.push(ss);\n\n\t\t\tstd::stringstream decompressed;\n\t\t\tdecompressed << decompress_stream.rdbuf();\n\n\t\t\tconst std::string value = decompressed.str();\n\n\t\t\tcallback(key, std::move(value));\n\t\t}\n\t}\n\n\tvoid hash_table_shard::for_each_key(std::function<void(uint64_t)> callback) const {\n\t\tstd::ifstream infile(filename_data(), std::ios::binary);\n\t\tinfile.seekg(0, std::ios::beg);\n\n\t\twhile (!infile.eof()) {\n\t\t\tsize_t key;\n\t\t\tif (!infile.read((char *)&key, sizeof(size_t))) break;\n\t\t\t\n\t\t\tsize_t data_len;\n\t\t\tif (!infile.read((char *)&data_len, sizeof(size_t))) break;\n\n\t\t\tinfile.seekg(data_len, std::ios::cur);\n\n\t\t\tcallback(key);\n\t\t}\n\t}\n\n\tsize_t hash_table_shard::shard_id() const {\n\t\treturn m_shard_id;\n\t}\n\n\tsize_t hash_table_shard::size() const {\n\t\tauto pages = this->read_pages();\n\t\treturn std::transform_reduce(pages.cbegin(), pages.cend(), 0, [](auto a, auto b) { return a + b; 
}, [](const auto &p) { return p.size(); });\n\t}\n\n\tsize_t hash_table_shard::file_size() const {\n\t\tstd::ifstream infile(filename_data(), std::ios::binary);\n\t\tinfile.seekg(0, std::ios::end);\n\t\treturn infile.tellg();\n\t}\n\n\tstd::string hash_table_shard::data_at_position(size_t pos) const {\n\n\t\tstd::ifstream infile(filename_data(), std::ios::binary);\n\t\tinfile.seekg(pos, std::ios::beg);\n\n\t\t// Read key\n\t\tuint64_t read_key;\n\t\tinfile.read((char *)&read_key, sizeof(uint64_t));\n\n\t\t// Read data length.\n\t\tsize_t data_len;\n\t\tinfile.read((char *)&data_len, sizeof(size_t));\n\n\t\tstd::unique_ptr<char[]> buffer_allocator;\n\t\ttry {\n\t\t\tbuffer_allocator = std::make_unique<char[]>(data_len);\n\t\t} catch (std::bad_alloc &exception) {\n\t\t\tstd::cout << \"bad_alloc detected: \" << exception.what() << \" file: \" << __FILE__ << \" line: \" << __LINE__ << std::endl;\n\t\t\tstd::cout << \"tried to allocate: \" << data_len << \" bytes\" << std::endl;\n\t\t\treturn \"\";\n\t\t}\n\t\tchar *buffer = buffer_allocator.get();\n\n\t\tinfile.read(buffer, data_len);\n\t\tstd::stringstream ss(std::string(buffer, data_len));\n\n\t\tboost::iostreams::filtering_istream decompress_stream;\n\t\tdecompress_stream.push(boost::iostreams::gzip_decompressor());\n\t\tdecompress_stream.push(ss);\n\n\t\tstd::stringstream decompressed;\n\t\tdecompressed << decompress_stream.rdbuf();\n\n\t\treturn decompressed.str();\n\t}\n\n}\n"
  },
  {
    "path": "src/hash_table2/hash_table_shard.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include <iostream>\n#include <map>\n#include <vector>\n#include <functional>\n\n#include \"config.h\"\n#include \"hash_table_shard_base.h\"\n\nnamespace hash_table2 {\n\n\tclass hash_table_shard : public hash_table_shard_base {\n\n\t\tpublic:\n\n\t\t\thash_table_shard(const std::string &db_name, size_t shard_id, size_t hash_table_size = 1000000,\n\t\t\t\t\tconst std::string &data_path = config::data_path() + \"/{shard_id_mod_8}/hash_table\");\n\t\t\t~hash_table_shard();\n\n\t\t\t/*\n\t\t\t * Checks if the key exists in the hash table.\n\t\t\t * */\n\t\t\tbool has(uint64_t key) const;\n\n\t\t\t/*\n\t\t\t * Finds a value for the given key. 
Returns empty string if key is not present.\n\t\t\t * */\n\t\t\tstd::string find(uint64_t key) const;\n\n\t\t\t/*\n\t\t\t * Finds a value for the given key. Returns empty string if key is not present. Also sets version in 'ver'\n\t\t\t * */\n\t\t\tstd::string find(uint64_t key, size_t &ver) const;\n\n\t\t\t/*\n\t\t\t * Loop over all elements in hash table shard and call the given function. \n\t\t\t * */\n\t\t\tvoid for_each(std::function<void(uint64_t, std::string)>) const;\n\t\t\tvoid for_each_key(std::function<void(uint64_t)>) const;\n\n\t\t\t/*\n\t\t\t * Returns the id of the shard.\n\t\t\t * */\n\t\t\tsize_t shard_id() const;\n\n\t\t\t/*\n\t\t\t * Returns the number of elements in the shard.\n\t\t\t * */\n\t\t\tsize_t size() const;\n\n\t\t\t/*\n\t\t\t * Returns the size of the data file in bytes.\n\t\t\t * */\n\t\t\tsize_t file_size() const;\n\n\t\tprivate:\n\n\t\t\tstd::string data_at_position(size_t pos) const;\n\n\t};\n\n}\n"
  },
  {
    "path": "src/hash_table2/hash_table_shard_base.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include <iostream>\n#include <memory>\n#include <array>\n#include <vector>\n\nnamespace hash_table2 {\n\n\tclass hash_table_shard_base {\n\n\t\tpublic:\n\n\t\t\thash_table_shard_base(const std::string &db_name, size_t shard_id, size_t hash_table_size = 1000000,\n\t\t\t\t\tconst std::string &data_path = config::data_path() + \"/{shard_id_mod_8}/hash_table\")\n\t\t\t: m_db_name(db_name), m_shard_id(shard_id), m_hash_table_size(hash_table_size), m_data_path(data_path) {}\n\n\t\t\tstd::string file_base_data() const {\n\t\t\t\tconst size_t disk_shard = m_shard_id % 8;\n\t\t\t\tstd::string data_path = m_data_path;\n\t\t\t\tif (data_path.find(\"{shard_id_mod_8}\") != std::string::npos) 
{\n\t\t\t\t\tdata_path.replace(data_path.find(\"{shard_id_mod_8}\"), 16, std::to_string(disk_shard));\n\t\t\t\t}\n\t\t\t\treturn data_path + \"/ht_\" + m_db_name + \"_\" + std::to_string(m_shard_id);\n\t\t\t}\n\n\t\t\tstd::string file_base() const {\n\t\t\t\tconst size_t disk_shard = m_shard_id % 8;\n\t\t\t\tstd::string data_path = config::data_path() + \"/{shard_id_mod_8}/hash_table\";\n\t\t\t\tif (data_path.find(\"{shard_id_mod_8}\") != std::string::npos) {\n\t\t\t\t\tdata_path.replace(data_path.find(\"{shard_id_mod_8}\"), 16, std::to_string(disk_shard));\n\t\t\t\t}\n\t\t\t\treturn data_path + \"/ht_\" + m_db_name + \"_\" + std::to_string(m_shard_id);\n\t\t\t}\n\n\t\t\tstd::string filename_data() const {\n\t\t\t\treturn file_base_data() + \".data\";\n\t\t\t}\n\n\t\t\tstd::string filename_pos() const {\n\t\t\t\treturn file_base() + \".pos\";\n\t\t\t}\n\n\t\t\tstd::string filename_data_tmp() const {\n\t\t\t\treturn file_base() + \".data.tmp\";\n\t\t\t}\n\n\t\tprotected:\n\n\t\t\tconst std::string m_db_name;\n\t\t\tsize_t m_shard_id;\n\t\t\tsize_t m_hash_table_size;\n\t\t\tconst std::string m_data_path;\n\n\t\t\tsize_t hash_table_byte_size() const { return m_hash_table_size * sizeof(size_t); }\n\n\t\t\tstd::vector<std::vector<std::array<uint64_t, 3>>> read_pages() const {\n\t\t\t\tstd::ifstream infile(filename_pos(), std::ios::binary);\n\t\t\t\treturn read_pages(infile);\n\t\t\t}\n\n\t\t\tstd::vector<std::vector<std::array<uint64_t, 3>>> read_pages(std::ifstream &infile) const {\n\t\t\t\t\n\t\t\t\tconst size_t max_records = 10000;\n\t\t\t\tconst size_t record_len = sizeof(std::array<uint64_t, 3>);\n\t\t\t\tconst size_t buffer_len = record_len * max_records;\n\n\t\t\t\tauto buffer_allocator = std::make_unique<char[]>(buffer_len);\n\t\t\t\tchar *buffer = buffer_allocator.get();\n\n\t\t\t\tstd::vector<std::vector<std::array<uint64_t, 3>>> ret(this->m_hash_table_size);\n\n\t\t\t\tif (infile.is_open()) 
{\n\t\t\t\t\tinfile.seekg(this->hash_table_byte_size());\n\n\t\t\t\t\tdo {\n\t\t\t\t\t\tsize_t num_keys;\n\t\t\t\t\t\tinfile.read((char *)&num_keys, sizeof(size_t));\n\n\t\t\t\t\t\tif (infile.eof()) break;\n\n\t\t\t\t\t\tif (num_keys > max_records) {\n\t\t\t\t\t\t\tbreak;\n\t\t\t\t\t\t}\n\n\t\t\t\t\t\tconst size_t len = record_len * num_keys;\n\t\t\t\t\t\tinfile.read(buffer, len);\n\n\t\t\t\t\t\tfor (size_t i = 0; i < len; i += record_len) {\n\t\t\t\t\t\t\tconst uint64_t key = *((uint64_t *)&buffer[i]);\n\t\t\t\t\t\t\tconst size_t page_id = key % this->m_hash_table_size;\n\t\t\t\t\t\t\tconst size_t pos = *((size_t *)&buffer[i + sizeof(uint64_t)]);\n\t\t\t\t\t\t\tconst size_t version = *((size_t *)&buffer[i + sizeof(uint64_t) + sizeof(size_t)]);\n\t\t\t\t\t\t\tret[page_id].emplace_back(std::array{key, (uint64_t)pos, (uint64_t)version});\n\t\t\t\t\t\t}\n\n\t\t\t\t\t} while (!infile.eof());\n\t\t\t\t}\n\t\t\t\n\t\t\t\treturn ret;\n\t\t\t}\n\n\t};\n\n}\n"
  },
  {
    "path": "src/hash_table2/hash_table_shard_builder.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include <sstream>\n#include \"config.h\"\n#include \"hash_table_shard_builder.h\"\n#include \"logger/logger.h\"\n#include \"file/file.h\"\n#include \"indexer/merger.h\"\n\n#include <boost/iostreams/filtering_stream.hpp>\n#include <boost/iostreams/filter/gzip.hpp>\n\nnamespace hash_table2 {\n\n\thash_table_shard_builder::hash_table_shard_builder(const std::string &db_name, size_t shard_id, size_t hash_table_size,\n\t\tconst std::string &data_path)\n\t: hash_table_shard_base(db_name, shard_id, hash_table_size, data_path)\n\t{\n\t\tindexer::merger::register_appender((size_t)this, [this]() {append();}, [this]() { return cache_size(); });\n\t\tindexer::merger::register_merger((size_t)this, [this]() 
{merge();});\n\t}\n\n\thash_table_shard_builder::~hash_table_shard_builder() {\n\t\tindexer::merger::deregister_merger((size_t)this);\n\t}\n\n\tvoid hash_table_shard_builder::add(uint64_t key, const std::string &value, size_t version) {\n\t\tindexer::merger::lock();\n\n\t\tstd::lock_guard guard(m_lock);\n\n\t\tauto ver_iter = m_version.find(key);\n\t\tif (version > 0 && ver_iter != m_version.end() && ver_iter->second > version) {\n\t\t\t// do nothing\n\t\t} else {\n\t\t\tm_data_size += value.capacity();\n\t\t\tm_cache[key] = value;\n\t\t\tm_version[key] = version;\n\t\t}\n\t}\n\n\tvoid hash_table_shard_builder::remove(uint64_t key) {\n\t\tm_remove_keys.push_back(key);\n\t}\n\n\tsize_t hash_table_shard_builder::cache_size() const {\n\t\t// This is an OK approximation since m_data_size will be much larger than the keys.\n\t\treturn m_cache.size() * sizeof(uint64_t) * 2 + m_data_size;\n\t}\n\n\tvoid hash_table_shard_builder::append() {\n\n\t\tstd::lock_guard guard(m_lock);\n\n\t\tofstream outfile(this->filename_data_tmp(), ios::binary | ios::app);\n\n\t\tfor (const auto &iter : m_cache) {\n\t\t\tconst size_t version = m_version[iter.first];\n\t\t\toutfile.write((char *)&iter.first, sizeof(uint64_t));\n\t\t\toutfile.write((char *)&version, sizeof(size_t));\n\n\t\t\t// Compress data\n\t\t\tstd::stringstream ss(iter.second);\n\n\t\t\tboost::iostreams::filtering_istream compress_stream;\n\t\t\tcompress_stream.push(boost::iostreams::gzip_compressor());\n\t\t\tcompress_stream.push(ss);\n\n\t\t\tstd::stringstream compressed;\n\t\t\tcompressed << compress_stream.rdbuf();\n\n\t\t\tstd::string compressed_string(compressed.str());\n\n\t\t\tconst size_t data_len = compressed_string.size();\n\t\t\toutfile.write((char *)&data_len, sizeof(size_t));\n\n\t\t\toutfile.write(compressed_string.c_str(), data_len);\n\t\t}\n\n\t\t// Free RAM caches and set m_data_size to zero.\n\t\tm_cache = std::map<uint64_t, std::string>{};\n\t\tm_version = std::map<uint64_t, size_t>{};\n\t\tm_data_size = 
0;\n\t}\n\n\tvoid hash_table_shard_builder::merge() {\n\n\t\tauto pages = this->read_pages();\n\n\t\tconst size_t buffer_len = 1024*1024*20;\n\t\t\n\t\tstd::unique_ptr<char[]> buffer_allocator;\n\t\ttry {\n\t\t\tbuffer_allocator = std::make_unique<char[]>(buffer_len);\n\t\t} catch (std::bad_alloc &exception) {\n\t\t\tstd::cout << \"bad_alloc detected: \" << exception.what() << \" file: \" << __FILE__ << \" line: \" << __LINE__ << std::endl;\n\t\t\tstd::cout << \"tried to allocate: \" << buffer_len << \" bytes\" << std::endl;\n\t\t\treturn;\n\t\t}\n\t\tchar *buffer = buffer_allocator.get();\n\n\t\t// Read append cache and add to pages + data file.\n\t\tstd::ifstream infile(this->filename_data_tmp(), std::ios::binary);\n\t\tstd::ofstream outfile(this->filename_data(), std::ios::binary | std::ios::app);\n\n\t\tsize_t last_pos = outfile.tellp();\n\n\t\twhile (!infile.eof()) {\n\t\t\tuint64_t key;\n\t\t\tif (!infile.read((char *)&key, sizeof(uint64_t))) break;\n\n\t\t\tsize_t version;\n\t\t\tif (!infile.read((char *)&version, sizeof(size_t))) break;\n\n\t\t\tsize_t data_len;\n\t\t\tif (!infile.read((char *)&data_len, sizeof(size_t))) break;\n\n\t\t\tif (data_len > buffer_len) {\n\t\t\t\tLOG_INFO(\"data_len \" + std::to_string(data_len) + \"is larger than buffer_len \" + std::to_string(buffer_len) + \" in file \" + filename_data());\n\t\t\t\tinfile.seekg(data_len, ios::cur);\n\t\t\t\tcontinue;\n\t\t\t} else {\n\t\t\t\tif (!infile.read(buffer, data_len)) break;\n\t\t\t}\n\n\t\t\tconst size_t page_id = key % this->m_hash_table_size;\n\t\t\tconst std::array elem{key, last_pos, version};\n\n\t\t\tauto insert_at = std::upper_bound(pages[page_id].begin(), pages[page_id].end(), elem, [](const auto &a, const auto &b) {\n\t\t\t\treturn a[0] < b[0];\n\t\t\t});\n\n\t\t\t// insert_at points to the element after \"elem\"\n\n\t\t\tbool add_data = false;\n\t\t\tif (pages[page_id].size() == 0) {\n\t\t\t\tpages[page_id].push_back(elem);\n\t\t\t\tadd_data = true;\n\t\t\t} else 
{\n\n\t\t\t\tconst auto elem_at = *(insert_at - 1);\n\t\t\t\tif (elem_at[0] == elem[0]) {\n\t\t\t\t\t// If version is bigger on the new element. Replace element.\n\t\t\t\t\tif (elem_at[2] <= elem[2]) {\n\t\t\t\t\t\t*(insert_at - 1) = elem;\n\t\t\t\t\t\tadd_data = true;\n\t\t\t\t\t}\n\t\t\t\t} else {\n\t\t\t\t\tpages[page_id].insert(insert_at, elem);\n\t\t\t\t\tadd_data = true;\n\t\t\t\t}\n\t\t\t}\n\n\t\t\tif (add_data) {\n\t\t\t\toutfile.write((char *)&key, sizeof(uint64_t));\n\t\t\t\toutfile.write((char *)&data_len, sizeof(size_t));\n\t\t\t\toutfile.write(buffer, data_len);\n\n\t\t\t\tlast_pos += data_len + sizeof(uint64_t) + sizeof(size_t);\n\t\t\t}\n\t\t}\n\n\t\t// Delete cache file.\n\t\tfile::delete_file(this->filename_data_tmp());\n\n\t\t// Remove keys that are in m_remove_keys.\n\t\tremove_keys_from_pages(pages);\n\t\tm_remove_keys = std::vector<uint64_t>{};\n\n\t\twrite_pages(pages);\n\t}\n\n\tvoid hash_table_shard_builder::optimize() {\n\t\tauto pages = this->read_pages();\n\n\t\tstd::ifstream infile(this->filename_data(), std::ios::binary);\n\t\tstd::ofstream outfile(this->filename_data_tmp(), std::ios::binary | std::ios::trunc);\n\n\t\tread_optimized_to(pages, infile, outfile);\n\n\t\toutfile.close();\n\n\t\tfile::delete_file(filename_data());\n\t\tfile::delete_file(filename_pos());\n\n\t\tmerge();\n\t}\n\n\tvoid hash_table_shard_builder::truncate() {\n\t\tstd::lock_guard guard(m_lock);\n\t\tofstream outfile(this->filename_data(), ios::binary | ios::trunc);\n\t\tofstream outfile_pos(this->filename_pos(), ios::binary | ios::trunc);\n\n\t\tfile::delete_file(this->filename_data_tmp());\n\t}\n\n\tvoid hash_table_shard_builder::merge_with(const hash_table_shard_builder &other) {\n\t\tmerge_with(other.filename_pos(), other.filename_data());\n\t}\n\n\tvoid hash_table_shard_builder::merge_with(const std::string &pos_file, const std::string &data_file) {\n\n\t\tstd::ifstream other_posfile(pos_file, std::ios::binary);\n\n\t\tauto pages1 = 
this->read_pages();\n\t\tauto pages2 = this->read_pages(other_posfile);\n\n\t\t// Remove the pages in pages1 that have higher version number in pages2 and vise versa.\n\t\tfor (size_t p = 0; p < pages1.size(); p++) {\n\t\t\tsize_t i = 0, j = 0;\n\t\t\twhile (i < pages1[p].size() && j < pages2[p].size()) {\n\t\t\t\tif (pages1[p][i][0] == pages2[p][j][0]) {\n\t\t\t\t\tif (pages1[p][i][2] < pages2[p][j][2]) {\n\t\t\t\t\t\t// delete pages1[p][i]\n\t\t\t\t\t\tpages1[p][i][1] = SIZE_MAX;\n\t\t\t\t\t} else {\n\t\t\t\t\t\t// delete pages2[p][j]\n\t\t\t\t\t\tpages2[p][j][1] = SIZE_MAX;\n\t\t\t\t\t}\n\t\t\t\t\ti++;\n\t\t\t\t\tj++;\n\t\t\t\t} else if (pages1[p][i][0] < pages2[p][j][0]) {\n\t\t\t\t\ti++;\n\t\t\t\t} else {\n\t\t\t\t\tj++;\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\n\t\tstd::ofstream outfile(this->filename_data_tmp(), std::ios::binary | std::ios::trunc);\n\n\t\tstd::ifstream data_file_2(data_file, std::ios::binary);\n\t\t\n\t\tread_optimized_to(pages2, data_file_2, outfile);\n\n\t\toutfile.close();\n\n\t\tmerge();\n\t}\n\n\tvoid hash_table_shard_builder::read_optimized_to(const std::vector<std::vector<std::array<uint64_t, 3>>> &pages, std::ifstream &infile,\n\t\tstd::ofstream &outfile) const {\n\t\t\n\t\tinfile.seekg(0, std::ios::beg);\n\n\t\twhile (!infile.eof()) {\n\t\t\tconst size_t my_pos = infile.tellg();\n\n\t\t\tsize_t key;\n\t\t\tif (!infile.read((char *)&key, sizeof(size_t))) break;\n\t\t\t\n\t\t\tsize_t data_len;\n\t\t\tif (!infile.read((char *)&data_len, sizeof(size_t))) break;\n\n\t\t\tconst size_t page_id = key % this->m_hash_table_size;\n\n\t\t\tstd::array elem{key, (uint64_t)0, (uint64_t)0};\n\n\t\t\tauto iter = std::upper_bound(pages[page_id].cbegin(), pages[page_id].cend(), elem, [](const auto &a, const auto &b) {\n\t\t\t\treturn a[0] < b[0];\n\t\t\t});\n\n\t\t\tif (pages[page_id].size() == 0) {\n\t\t\t\t// Skip. 
Did not find key.\n\t\t\t\tinfile.seekg(data_len, std::ios::cur);\n\t\t\t\tcontinue;\n\t\t\t}\n\n\t\t\telem = *(iter - 1);\n\n\t\t\tif (elem[0] == key && elem[1] == my_pos) {\n\n\t\t\t\tstd::unique_ptr<char[]> buffer_allocator;\n\t\t\t\ttry {\n\t\t\t\t\tbuffer_allocator = std::make_unique<char[]>(data_len);\n\t\t\t\t} catch (std::bad_alloc &exception) {\n\t\t\t\t\tstd::cout << \"bad_alloc detected: \" << exception.what() << \" file: \" << __FILE__ << \" line: \" << __LINE__ << std::endl;\n\t\t\t\t\tstd::cout << \"tried to allocate: \" << data_len << \" bytes\" << std::endl;\n\t\t\t\t\tbreak;\n\t\t\t\t}\n\t\t\t\tchar *buffer = buffer_allocator.get();\n\n\t\t\t\tinfile.read(buffer, data_len);\n\n\t\t\t\t// Keep this data.\n\t\t\t\tconst size_t version = elem[2];\n\t\t\t\toutfile.write((char *)&key, sizeof(uint64_t));\n\t\t\t\toutfile.write((char *)&version, sizeof(size_t));\n\t\t\t\toutfile.write((char *)&data_len, sizeof(size_t));\n\t\t\t\toutfile.write(buffer, data_len);\n\t\t\t} else {\n\t\t\t\t// Ignore data.\n\t\t\t\tinfile.seekg(data_len, std::ios::cur);\n\t\t\t}\n\t\t}\n\t}\n\n\tvoid hash_table_shard_builder::write_pages(const std::vector<std::vector<std::array<uint64_t, 3>>> &pages) {\n\n\t\tstd::ofstream key_writer(this->filename_pos(), std::ios::binary | std::ios::trunc);\n\n\t\tconst size_t page_item_size = sizeof(std::array<uint64_t, 3>);\n\t\tconst size_t empty_key = SIZE_MAX;\n\n\t\tsize_t last_pos = 0;\n\t\tfor (size_t page_id = 0; page_id < pages.size(); page_id++) {\n\t\t\tconst size_t page_len = pages[page_id].size();\n\t\t\tif (page_len) {\n\t\t\t\tkey_writer.write((char *)&last_pos, sizeof(size_t));\n\t\t\t\tlast_pos += pages[page_id].size() * page_item_size + sizeof(size_t);\n\t\t\t} else {\n\t\t\t\tkey_writer.write((char *)&empty_key, sizeof(size_t));\n\t\t\t}\n\t\t}\n\n\t\t// Write pages.\n\t\tfor (size_t page_id = 0; page_id < pages.size(); page_id++) {\n\t\t\tconst size_t page_len = pages[page_id].size();\n\t\t\tif (page_len) 
{\n\t\t\t\tkey_writer.write((char *)&page_len, sizeof(size_t));\n\t\t\t\tfor (const auto &page_item : pages[page_id]) {\n\t\t\t\t\tkey_writer.write((char *)&page_item, page_item_size);\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t}\n\n\tvoid hash_table_shard_builder::remove_keys_from_pages(std::vector<std::vector<std::array<uint64_t, 3>>> &pages) {\n\t\tfor (auto key : m_remove_keys) {\n\n\t\t\tconst size_t page_id = key % this->m_hash_table_size;\n\n\t\t\tstd::array elem{key, (uint64_t)0, (uint64_t)0};\n\n\t\t\tauto iter = std::upper_bound(pages[page_id].cbegin(), pages[page_id].cend(), elem, [](const auto &a, const auto &b) {\n\t\t\t\treturn a[0] < b[0];\n\t\t\t});\n\n\t\t\titer--;\n\n\t\t\tif ((*iter)[0] == key) {\n\t\t\t\t// remove the key from the page.\n\t\t\t\tpages[page_id].erase(iter);\n\t\t\t}\n\t\t}\n\t}\n\n}\n"
  },
  {
    "path": "src/hash_table2/hash_table_shard_builder.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include <iostream>\n#include <map>\n#include <mutex>\n\n#include \"hash_table.h\"\n#include \"hash_table_shard_base.h\"\n\nnamespace hash_table2 {\n\n\t/*\n\t * Implementation of a hash table shard.\n\t *\n\t * usage:\n\t * hash_table_shard shard(\"test_db\", 0);\n\t * shard.add(12345, \"test data\", 3);\n\t * shard.add(12345, \"new test data\", 4);\n\t *\n\t * shard.append();\n\t * shard.merge();\n\t *\n\t * */\n\n\tclass hash_table_shard_builder : public hash_table_shard_base {\n\n\t\tpublic:\n\n\t\t\thash_table_shard_builder(const std::string &db_name, size_t shard_id, size_t hash_table_size = 1000000,\n\t\t\t\t\tconst std::string &data_path = config::data_path() + 
\"/{shard_id_mod_8}/hash_table\");\n\t\t\t~hash_table_shard_builder();\n\n\t\t\t/*\n\t\t\t * Add key/value pair to hash table.\n\t\t\t * */\n\t\t\tvoid add(uint64_t key, const std::string &value, size_t version = 0);\n\n\t\t\t/*\n\t\t\t * Remove key from hash table.\n\t\t\t * */\n\t\t\tvoid remove(uint64_t key);\n\n\t\t\t/*\n\t\t\t * Return approximation of amount of memory in cache.\n\t\t\t * */\n\t\t\tsize_t cache_size() const;\n\n\t\t\t/*\n\t\t\t * Write memory cache to disc cache.\n\t\t\t * */\n\t\t\tvoid append();\n\n\t\t\t/*\n\t\t\t * Write disc cache to persistent hash table.\n\t\t\t * */\n\t\t\tvoid merge();\n\n\t\t\t/*\n\t\t\t * Optimize persistent hash table to remove data for unused versions.\n\t\t\t * */\n\t\t\tvoid optimize();\n\n\t\t\t/*\n\t\t\t * Delete all data in shard.\n\t\t\t * */\n\t\t\tvoid truncate();\n\n\t\t\t/*\n\t\t\t * Merge with another shard. Handles key collisions by keeping the one with highest version.\n\t\t\t * */\n\t\t\tvoid merge_with(const hash_table_shard_builder &other);\n\n\t\t\t/*\n\t\t\t * Merge with another pos and data file.\n\t\t\t * */\n\t\t\tvoid merge_with(const std::string &pos_file, const std::string &data_file);\n\n\t\tprivate:\n\n\t\t\tstd::map<uint64_t, std::string> m_cache;\n\t\t\tstd::map<uint64_t, size_t> m_version;\n\t\t\tstd::vector<uint64_t> m_remove_keys;\n\n\t\t\tstd::map<uint64_t, size_t> m_sort_pos;\n\t\t\tstd::mutex m_lock;\n\t\t\tsize_t m_data_size = 0;\n\n\t\t\tvoid read_optimized_to(const std::vector<std::vector<std::array<uint64_t, 3>>> &pages, std::ifstream &infile, std::ofstream &outfile) const;\n\t\t\tvoid write_pages(const std::vector<std::vector<std::array<uint64_t, 3>>> &pages);\n\t\t\tvoid remove_keys_from_pages(std::vector<std::vector<std::array<uint64_t, 3>>> &pages);\n\n\t};\n\n}\n"
  },
  {
    "path": "src/hash_table_helper/hash_table_helper.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include \"config.h\"\n#include \"hash_table_helper.h\"\n#include \"logger/logger.h\"\n\nnamespace hash_table_helper {\n\n\tvoid truncate(const std::string &hash_table_name) {\n\t\tstd::vector<hash_table2::hash_table_shard_builder *> shards = create_shard_builders(hash_table_name);\n\n\t\tfor (auto shard : shards) {\n\t\t\tshard->truncate();\n\t\t}\n\n\t\tdelete_shard_builders(shards);\n\t}\n\n\tstd::vector<hash_table2::hash_table_shard_builder *> create_shard_builders(const std::string &hash_table_name) {\n\t\tstd::vector<hash_table2::hash_table_shard_builder *> shards;\n\t\tfor (size_t shard_id = 0; shard_id < config::ht_num_shards; shard_id++) {\n\t\t\tshards.push_back(new hash_table2::hash_table_shard_builder(hash_table_name, 
shard_id));\n\t\t}\n\n\t\treturn shards;\n\t}\n\n\tvoid delete_shard_builders(std::vector<hash_table2::hash_table_shard_builder *> &shards) {\n\t\tfor (auto shard : shards) {\n\t\t\tdelete shard;\n\t\t}\n\n\t\tshards.clear();\n\t}\n\n}\n"
  },
  {
    "path": "src/hash_table_helper/hash_table_helper.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include <iostream>\n#include \"hash_table2/hash_table.h\"\n#include \"hash_table2/hash_table_shard_builder.h\"\n\nnamespace hash_table_helper {\n\n\tvoid truncate(const std::string &hash_table_name);\n\tstd::vector<hash_table2::hash_table_shard_builder *> create_shard_builders(const std::string &hash_table_name);\n\tvoid delete_shard_builders(std::vector<hash_table2::hash_table_shard_builder *> &shards);\n\n}\n"
  },
  {
    "path": "src/http/request.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include \"request.h\"\n\n#include <utility>\n\nnamespace http {\n\n\t// The string parameters are taken by value; move them into the members\n\t// instead of copying a second time.\n\trequest::request(const URL &url, std::string request_method, std::string request_body)\n\t: m_url(url), m_request_method(std::move(request_method)), m_request_body(std::move(request_body)) {\n\t\n\t}\n\n}\n"
  },
  {
    "path": "src/http/request.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include <iostream>\n#include \"URL.h\"\n\nnamespace http {\n\n\t// HTTP request (URL, method, body) as received by the server and passed to the handler.\n\tclass request {\n\t\tpublic:\n\t\t\trequest(const URL &url, std::string request_method = \"POST\", std::string request_body = \"\");\n\n\t\t\tconst URL& url() const { return m_url; }\n\t\t\tconst std::string &request_method() const { return m_request_method; }\n\t\t\tconst std::string &request_body() const { return m_request_body; }\n\n\t\tprivate:\n\t\t\tURL m_url;\n\t\t\tstd::string m_request_method;\n\t\t\tstd::string m_request_body;\n\n\t};\n\n}\n"
  },
  {
    "path": "src/http/response.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include <iostream>\n\nnamespace http {\n\n\tclass response {\n\t\tpublic:\n\n\t\t\tvoid code(size_t code) { m_code = code; }\n\t\t\tsize_t code() const { return m_code; }\n\n\t\t\tvoid body(const std::string &body) { m_body = body; }\n\t\t\tconst std::string &body() const { return m_body; }\n\n\t\t\tvoid content_type(const std::string &content_type) { m_content_type = content_type; }\n\t\t\tconst std::string &content_type() const { return m_content_type; }\n\n\t\tprivate:\n\t\t\tsize_t m_code = 200;\n\t\t\tstd::string m_body = \"\";\n\t\t\tstd::string m_content_type = \"text/html\";\n\n\t};\n\n}\n"
  },
  {
    "path": "src/http/server.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include \"server.h\"\n#include \"fcgio.h\"\n#include \"logger/logger.h\"\n#include \"URL.h\"\n\n#include <thread>\n#include <vector>\n\nnamespace http {\n\n\tserver::server(std::function<http::response(const http::request &)> handler) {\n\t\tm_handler = handler;\n\n\t\tstart();\n\t}\n\n\tvoid server::run_worker(int socket_id) {\n\n\t\tconst size_t max_post_len = 1024*1024*1024;\n\t\tconst size_t buffer_len = 1024*1024;\n\t\tstd::unique_ptr<char[]> buffer_allocator = std::make_unique<char[]>(buffer_len);\n\t\tchar *buffer = buffer_allocator.get();\n\n\t\tFCGX_Request request;\n\n\t\tFCGX_InitRequest(&request, socket_id, 0);\n\n\t\tLOG_INFO(\"Server has started...\");\n\n\t\twhile (true) {\n\n\t\t\tm_lock.lock();\n\t\t\tint accept_response = FCGX_Accept_r(&request);\n\t\t\tm_lock.unlock();\n\n\t\t\tif (accept_response < 0) {\n\t\t\t\tbreak;\n\t\t\t}\n\n\t\t\tconst char *uri_ptr = FCGX_GetParam(\"REQUEST_URI\", request.envp);\n\t\t\tconst char *req_ptr = FCGX_GetParam(\"REQUEST_METHOD\", request.envp);\n\t\t\tif ((uri_ptr == nullptr) || (req_ptr == nullptr)) {\n\t\t\t\tFCGX_Finish_r(&request);\n\t\t\t\tcontinue;\n\t\t\t}\n\t\t\tstd::string uri(uri_ptr);\n\t\t\tstd::string request_method(req_ptr);\n\n\t\t\tLOG_INFO(\"Serving request: \" + uri);\n\n\t\t\tURL url(\"http://alexandria.org\" + uri);\n\n\t\t\tstd::string post_data;\n\t\t\tif (request_method == \"POST\") {\n\t\t\t\twhile (true) {\n\n\t\t\t\t\t// FCGX_GetStr returns an int and is negative on stream error;\n\t\t\t\t\t// storing it in a size_t would wrap to SIZE_MAX, bypass the\n\t\t\t\t\t// max_post_len check via unsigned wrap-around and crash append().\n\t\t\t\t\tconst int read_bytes = FCGX_GetStr(buffer, buffer_len, request.in);\n\t\t\t\t\tif (read_bytes <= 0) break;\n\n\t\t\t\t\tif (post_data.size() + (size_t)read_bytes > max_post_len) {\n\t\t\t\t\t\tLOG_ERROR(\"Posted data larger then \" + std::to_string(max_post_len) + \", ignoring request\");\n\t\t\t\t\t\tbreak;\n\t\t\t\t\t}\n\t\t\t\t\tpost_data.append(buffer, read_bytes);\n\t\t\t\t}\n\t\t\t}\n\n\t\t\t::http::request http_request(url, request_method, post_data);\n\n\t\t\t::http::response http_response = m_handler(http_request);\n\n\t\t\tconst std::string data_out = http_response.body();\n\n\t\t\t// Output response\n\t\t\tconst std::string content_type = std::string(\"Content-type: \") + http_response.content_type() + \"\\r\\n\";\n\t\t\tconst std::string status = std::string(\"Status: \") + std::to_string(http_response.code()) + \"\\r\\n\";\n\t\t\tconst std::string end_req = \"\\r\\n\";\n\n\t\t\t// Write headers with FCGX_PutStr, not FCGX_FPrintF: these strings are\n\t\t\t// built from handler-supplied data, and a '%' in them would otherwise\n\t\t\t// be interpreted as a printf format specifier (undefined behavior).\n\t\t\tFCGX_PutStr(status.c_str(), status.size(), request.out);\n\t\t\tFCGX_PutStr(content_type.c_str(), content_type.size(), request.out);\n\t\t\tFCGX_PutStr(end_req.c_str(), end_req.size(), request.out);\n\t\t\tFCGX_PutStr(data_out.c_str(), data_out.size(), request.out);\n\n\t\t\tFCGX_Finish_r(&request);\n\t\t}\n\n\t\tFCGX_Free(&request, true);\n\t}\n\n\tvoid server::start() {\n\t\tFCGX_Init();\n\n\t\tint socket_id = FCGX_OpenSocket(\"127.0.0.1:8000\", 20);\n\t\tif (socket_id < 0) {\n\t\t\tLOG_INFO(\"Could not open socket, exiting\");\n\t\t\treturn;\n\t\t}\n\n\t\tstd::vector<std::thread> threads;\n\n\t\tfor (size_t i = 0; i < m_workers; i++) {\n\t\t\tthreads.emplace_back(std::move(std::thread([this](int socket_id){ run_worker(socket_id); }, socket_id)));\n\t\t}\n\n\t\tfor (auto &thread : threads) {\n\t\t\tthread.join();\n\t\t}\n\n\t\tclose(socket_id);\n\t}\n\n}\n"
  },
  {
    "path": "src/http/server.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include <iostream>\n#include <mutex>\n#include <functional>\n#include \"request.h\"\n#include \"response.h\"\n\nnamespace http {\n\n\tclass server {\n\t\tpublic:\n\t\t\tserver(std::function<::http::response(const ::http::request &)> handler);\n\n\t\tprivate:\n\t\t\tstd::function<::http::response(const ::http::request &)> m_handler;\n\t\t\tsize_t m_port = 8080;\n\t\t\tsize_t m_workers = 8;\n\t\t\tstd::mutex m_lock;\n\n\t\t\tvoid run_worker(int socket_id);\n\t\t\tvoid start();\n\t};\n\n}\n"
  },
  {
    "path": "src/indexer/basic_index.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include \"index_reader.h\"\n#include \"index_base.h\"\n#include <vector>\n\nnamespace indexer {\n\n\ttemplate<typename data_record>\n\tclass basic_index : public index_base<data_record> {\n\n\tpublic:\n\n\t\texplicit basic_index(const std::string &file_name);\n\t\texplicit basic_index(const std::string &db_name, size_t id);\n\t\texplicit basic_index(const std::string &db_name, size_t id, size_t hash_table_size);\n\t\texplicit basic_index(std::istream *reader, size_t hash_table_size);\n\t\t~basic_index();\n\n\t\tstd::vector<data_record> find(uint64_t key) const;\n\t\tstd::vector<data_record> find(uint64_t key, size_t limit) const;\n\n\t\tstd::unique_ptr<data_record[]> find_ptr(uint64_t key, size_t &num_records) 
const;\n\t\tstd::unique_ptr<data_record[]> find_ptr(uint64_t key, size_t limit, size_t &num_records) const;\n\t\tsize_t find_count(uint64_t key) const;\n\n\t\t/*\n\t\t * Iterates the keys of the index and calls the callback with key and vector of records for that key.\n\t\t * */\n\t\tvoid for_each(std::function<void(uint64_t key, std::vector<data_record> &recs)> on_each_key) const;\n\t\tvoid for_each_key(std::function<void(uint64_t key)> on_each_key) const;\n\n\tprivate:\n\n\t\tmutable std::istream *m_reader;\n\t\tstd::unique_ptr<std::ifstream> m_default_reader;\n\t\t\n\t\tstd::string m_file_name;\n\t\tstd::string m_db_name;\n\t\tsize_t m_id;\n\t\tsize_t m_unique_count = 0;\n\n\t\tsize_t read_key_pos(uint64_t key) const;\n\t\tvoid read_meta();\n\t\tstd::string mountpoint() const;\n\t\tstd::string filename() const;\n\t\tstd::string meta_filename() const;\n\t\t\n\t};\n\n\ttemplate<typename data_record>\n\tbasic_index<data_record>::basic_index(const std::string &file_name)\n\t: index_base<data_record>(), m_file_name(file_name) {\n\t\tm_default_reader = std::make_unique<std::ifstream>(filename(), std::ios::binary);\n\t\tm_reader = m_default_reader.get();\n\t}\n\n\ttemplate<typename data_record>\n\tbasic_index<data_record>::basic_index(const std::string &db_name, size_t id)\n\t: index_base<data_record>(), m_db_name(db_name), m_id(id) {\n\t\tm_default_reader = std::make_unique<std::ifstream>(filename(), std::ios::binary);\n\t\tm_reader = m_default_reader.get();\n\t}\n\n\ttemplate<typename data_record>\n\tbasic_index<data_record>::basic_index(const std::string &db_name, size_t id, size_t hash_table_size)\n\t: index_base<data_record>(hash_table_size), m_db_name(db_name), m_id(id) {\n\t\tm_default_reader = std::make_unique<std::ifstream>(filename(), std::ios::binary);\n\t\tm_reader = m_default_reader.get();\n\t}\n\n\ttemplate<typename data_record>\n\tbasic_index<data_record>::basic_index(std::istream *reader, size_t hash_table_size)\n\t: 
index_base<data_record>(hash_table_size) {\n\t\tm_reader = reader;\n\t}\n\n\ttemplate<typename data_record>\n\tbasic_index<data_record>::~basic_index() {\n\t}\n\n\ttemplate<typename data_record>\n\tstd::vector<data_record> basic_index<data_record>::find(uint64_t key) const {\n\t\treturn find(key, 0);\n\t}\n\n\ttemplate<typename data_record>\n\tstd::vector<data_record> basic_index<data_record>::find(uint64_t key, size_t limit) const {\n\n\t\tstd::lock_guard lock(this->m_lock);\n\n\t\tsize_t num_records;\n\t\tunique_ptr<data_record[]> ptr = find_ptr(key, limit, num_records);\n\n\t\tstd::vector<data_record> ret;\n\t\tfor (size_t i = 0; i < num_records; i++) {\n\t\t\tret.push_back(ptr[i]);\n\t\t}\n\n\t\treturn ret;\n\t\t\n\t}\n\n\ttemplate<typename data_record>\n\tstd::unique_ptr<data_record[]> basic_index<data_record>::find_ptr(uint64_t key, size_t &num_records) const {\n\t\treturn find_ptr(key, 0, num_records);\n\t}\n\n\ttemplate<typename data_record>\n\tstd::unique_ptr<data_record[]> basic_index<data_record>::find_ptr(uint64_t key, size_t limit, size_t &num_records) const {\n\n\t\tstd::lock_guard lock(this->m_lock);\n\n\t\tnum_records = 0;\n\n\t\tsize_t key_pos = read_key_pos(key);\n\n\t\tif (key_pos == SIZE_MAX) {\n\t\t\treturn {};\n\t\t}\n\n\t\t// Read page.\n\t\tm_reader->seekg(key_pos);\n\t\tsize_t num_keys;\n\t\tm_reader->read((char *)&num_keys, sizeof(size_t));\n\n\t\tstd::unique_ptr<uint64_t[]> keys_allocator = std::make_unique<uint64_t[]>(num_keys);\n\t\tuint64_t *keys = keys_allocator.get();\n\t\tm_reader->read((char *)keys, num_keys * sizeof(uint64_t));\n\n\t\tsize_t key_data_pos = SIZE_MAX;\n\t\tfor (size_t i = 0; i < num_keys; i++) {\n\t\t\tif (keys[i] == key) {\n\t\t\t\tkey_data_pos = i;\n\t\t\t}\n\t\t}\n\n\t\tif (key_data_pos == SIZE_MAX) {\n\t\t\treturn {};\n\t\t}\n\n\t\tchar buffer[64];\n\n\t\t// Read position and length.\n\t\tm_reader->seekg(key_pos + 8 + num_keys * 8 + key_data_pos * 8);\n\t\tm_reader->read(buffer, 8);\n\t\tsize_t pos = *((size_t 
*)(&buffer[0]));\n\n\t\tm_reader->seekg(key_pos + 8 + (num_keys * 8)*2 + key_data_pos * 8);\n\t\tm_reader->read(buffer, 8);\n\t\tsize_t len = *((size_t *)(&buffer[0]));\n\n\t\tm_reader->seekg(key_pos + 8 + (num_keys * 8)*3 + pos);\n\n\t\tnum_records = len / sizeof(data_record);\n\n\t\tif (limit && num_records > limit) {\n\t\t\tnum_records = limit;\n\t\t\tlen = num_records * sizeof(data_record);\n\t\t}\n\n\t\tstd::unique_ptr<data_record[]> ret = std::make_unique<data_record[]>(num_records);\n\n\t\tm_reader->read((char *)ret.get(), len);\n\n\t\treturn ret;\n\t}\n\n\ttemplate<typename data_record>\n\tsize_t basic_index<data_record>::find_count(uint64_t key) const {\n\n\t\tstd::lock_guard lock(this->m_lock);\n\n\t\tsize_t key_pos = read_key_pos(key);\n\n\t\tif (key_pos == SIZE_MAX) {\n\t\t\treturn 0;\n\t\t}\n\n\t\t// Read page.\n\t\tm_reader->seekg(key_pos);\n\t\tsize_t num_keys;\n\t\tm_reader->read((char *)&num_keys, sizeof(size_t));\n\n\t\tstd::unique_ptr<uint64_t[]> keys_allocator = std::make_unique<uint64_t[]>(num_keys);\n\t\tuint64_t *keys = keys_allocator.get();\n\t\tm_reader->read((char *)keys, num_keys * sizeof(uint64_t));\n\n\t\tsize_t key_data_pos = SIZE_MAX;\n\t\tfor (size_t i = 0; i < num_keys; i++) {\n\t\t\tif (keys[i] == key) {\n\t\t\t\tkey_data_pos = i;\n\t\t\t}\n\t\t}\n\n\t\tif (key_data_pos == SIZE_MAX) {\n\t\t\treturn 0;\n\t\t}\n\n\t\tchar buffer[64];\n\n\t\t// Read length only.\n\t\tm_reader->seekg(key_pos + 8 + (num_keys * 8)*2 + key_data_pos * 8);\n\t\tm_reader->read(buffer, 8);\n\t\tsize_t len = *((size_t *)(&buffer[0]));\n\n\t\treturn len / sizeof(data_record);\n\t}\n\n\t/*\n\t * Iterates the keys of the index and calls the callback with key and vector of records for that key.\n\t * */\n\ttemplate<typename data_record>\n\tvoid basic_index<data_record>::for_each(std::function<void(uint64_t key, std::vector<data_record> &recs)> on_each_key) const {\n\n\t\tstd::ifstream reader(filename(), 
std::ios::binary);\n\t\treader.seekg(this->hash_table_byte_size(), std::ios::beg);\n\n\t\tstd::map<uint64_t, std::vector<data_record>> page;\n\t\twhile (this->read_page_into(reader, page)) {\n\t\t\tfor (auto &iter : page) {\n\t\t\t\ton_each_key(iter.first, iter.second);\n\t\t\t}\n\t\t\tpage.clear();\n\t\t}\n\t\t\n\t}\n\n\t/*\n\t * Reads the exact position of the key, returns SIZE_MAX if the key was not found.\n\t * */\n\ttemplate<typename data_record>\n\tsize_t basic_index<data_record>::read_key_pos(uint64_t key) const {\n\n\t\tif (this->m_hash_table_size == 0) return 0;\n\n\t\tconst size_t hash_pos = key % this->m_hash_table_size;\n\n\t\tif (!m_reader->seekg(hash_pos * sizeof(size_t))) return SIZE_MAX;\n\n\t\tsize_t pos;\n\t\tm_reader->read((char *)&pos, sizeof(size_t));\n\n\t\treturn pos;\n\t}\n\n\t/*\n\t * Reads the count of unique recprds from the count file and puts it in the m_unique_count member.\n\t * */\n\ttemplate<typename data_record>\n\tvoid basic_index<data_record>::read_meta() {\n\t\tstruct meta {\n\t\t\tsize_t unique_count;\n\t\t};\n\n\t\tmeta m;\n\n\t\tstd::ifstream meta_reader(meta_filename(), std::ios::binary);\n\n\t\tif (meta_reader.is_open()) {\n\t\t\tmeta_reader.read((char *)(&m), sizeof(meta));\n\t\t}\n\n\t\tm_unique_count = m.unique_count;\n\t}\n\n\ttemplate<typename data_record>\n\tstd::string basic_index<data_record>::mountpoint() const {\n\t\treturn std::to_string(m_id % 8);\n\t}\n\n\ttemplate<typename data_record>\n\tstd::string basic_index<data_record>::filename() const {\n\t\tif (m_file_name != \"\") return m_file_name + \".data\";\n\t\treturn config::data_path() + \"/\" + mountpoint() + \"/full_text/\" + m_db_name + \"/\" + std::to_string(m_id) +\n\t\t\t\".data\";\n\t}\n\n\ttemplate<typename data_record>\n\tstd::string basic_index<data_record>::meta_filename() const {\n\t\tif (m_file_name != \"\") return m_file_name + \".meta\";\n\t\treturn config::data_path() + \"/\" + mountpoint() + \"/full_text/\" + m_db_name + \"/\" + 
std::to_string(m_id) +\n\t\t\t\".meta\";\n\t}\n\n}\n"
  },
  {
    "path": "src/indexer/basic_index_builder.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include <iostream>\n#include <vector>\n#include <map>\n#include <set>\n#include <unordered_set>\n#include <cstring>\n#include <cassert>\n#include <boost/filesystem.hpp>\n#include \"merger.h\"\n#include \"score_builder.h\"\n#include \"algorithm/hyper_log_log.h\"\n#include \"config.h\"\n#include \"profiler/profiler.h\"\n#include \"logger/logger.h\"\n#include \"memory/debugger.h\"\n#include \"file/file.h\"\n#include \"index_base.h\"\n\nnamespace indexer {\n\n\ttemplate<typename data_record>\n\tclass basic_index_builder : public index_base<data_record>{\n\tprivate:\n\t\t// Non copyable\n\t\tbasic_index_builder(const basic_index_builder &);\n\t\tbasic_index_builder& operator=(const basic_index_builder 
&);\n\tpublic:\n\n\t\tbasic_index_builder(const std::string &file_name);\n\t\tbasic_index_builder(const std::string &db_name, size_t id);\n\t\tbasic_index_builder(const std::string &db_name, size_t id, size_t hash_table_size);\n\t\tbasic_index_builder(const std::string &db_name, size_t id, size_t hash_table_size, size_t max_results);\n\t\t~basic_index_builder();\n\n\t\tvoid add(uint64_t key, const data_record &record);\n\t\tsize_t cache_size() const;\n\t\t\n\t\tvoid append();\n\t\tvoid merge();\n\t\tvoid transform(const std::function<data_record(const data_record &, size_t)> &transform);\n\t\tvoid sort_by(const std::function<bool(const data_record &a, const data_record &b)> sort_by);\n\n\t\tvoid truncate();\n\t\tvoid truncate_cache_files();\n\t\tvoid create_directories();\n\n\tprivate:\n\n\t\tstd::string m_file_name;\n\t\tstd::string m_db_name;\n\t\tconst size_t m_id;\n\n\t\tconst size_t m_max_results;\n\n\t\tconst size_t m_buffer_len = config::ft_shard_builder_buffer_len;\n\t\tchar *m_buffer;\n\t\tstd::mutex m_lock;\n\n\t\t// Caches\n\t\tstd::vector<uint64_t> m_key_cache;\n\t\tstd::vector<data_record> m_record_cache;\n\n\t\tstd::map<uint64_t, vector<data_record>> m_cache;\n\n\t\tvoid read_append_cache();\n\t\tvoid read_data_to_cache();\n\t\tvoid sort_cache();\n\t\tvoid sort_record_list(uint64_t key, std::vector<data_record> &records);\n\t\tvoid reset_cache_variables();\n\t\tvoid save_file();\n\t\tvoid write_key(std::ofstream &key_writer, uint64_t key, size_t page_pos);\n\t\tsize_t write_page(std::ofstream &writer, const std::vector<uint64_t> &keys);\n\t\tvoid reset_key_map(std::ofstream &key_writer);\n\n\t\tstd::string mountpoint() const;\n\t\tstd::string cache_filename() const;\n\t\tstd::string key_cache_filename() const;\n\t\tstd::string target_filename() const;\n\t\tstd::string meta_filename() const;\n\n\t};\n\n\ttemplate<typename data_record>\n\tbasic_index_builder<data_record>::basic_index_builder(const std::string &file_name)\n\t: index_base<data_record>(), 
m_file_name(file_name), m_id(0),\n\t\tm_max_results(config::ft_max_results_per_section)\n\t{\n\t\tmerger::register_merger((size_t)this, [this]() {merge();});\n\t\tmerger::register_appender((size_t)this, [this]() {append();}, [this]() { return cache_size(); });\n\t}\n\n\ttemplate<typename data_record>\n\tbasic_index_builder<data_record>::basic_index_builder(const std::string &db_name, size_t id)\n\t: index_base<data_record>(), m_db_name(db_name), m_id(id), m_max_results(config::ft_max_results_per_section) {\n\t\tmerger::register_merger((size_t)this, [this]() {merge();});\n\t\tmerger::register_appender((size_t)this, [this]() {append();}, [this]() { return cache_size(); });\n\t}\n\n\ttemplate<typename data_record>\n\tbasic_index_builder<data_record>::basic_index_builder(const std::string &db_name, size_t id, size_t hash_table_size)\n\t: index_base<data_record>(hash_table_size), m_db_name(db_name), m_id(id), m_max_results(config::ft_max_results_per_section) {\n\t\tmerger::register_merger((size_t)this, [this]() {append();});\n\t\tmerger::register_appender((size_t)this, [this]() {append();}, [this]() { return cache_size(); });\n\t}\n\n\ttemplate<typename data_record>\n\tbasic_index_builder<data_record>::basic_index_builder(const std::string &db_name, size_t id, size_t hash_table_size, size_t max_results)\n\t: index_base<data_record>(hash_table_size), m_db_name(db_name), m_id(id), m_max_results(max_results) {\n\t\tmerger::register_merger((size_t)this, [this]() {append();});\n\t\tmerger::register_appender((size_t)this, [this]() {append();}, [this]() { return cache_size(); });\n\t}\n\n\ttemplate<typename data_record>\n\tbasic_index_builder<data_record>::~basic_index_builder() {\n\t\tmerger::deregister_merger((size_t)this);\n\t}\n\n\ttemplate<typename data_record>\n\tvoid basic_index_builder<data_record>::add(uint64_t key, const data_record &record) {\n\n\t\tindexer::merger::lock();\n\n\t\tm_lock.lock();\n\n\t\t// Amortized 
constant\n\t\tm_key_cache.push_back(key);\n\t\tm_record_cache.push_back(record);\n\n\t\tassert(m_record_cache.size() == m_key_cache.size());\n\n\t\tm_lock.unlock();\n\n\t}\n\n\t/*\n\t * Returns the allocated size of the cache (m_key_cache and m_record_cache).\n\t * */\n\ttemplate<typename data_record>\n\tsize_t basic_index_builder<data_record>::cache_size() const {\n\t\treturn m_key_cache.capacity() * sizeof(uint64_t) + m_record_cache.capacity() * sizeof(data_record);\n\t}\n\n\ttemplate<typename data_record>\n\tvoid basic_index_builder<data_record>::append() {\n\n\t\tassert(m_record_cache.size() == m_key_cache.size());\n\n\t\tstd::ofstream record_writer(cache_filename(), std::ios::binary | std::ios::app);\n\t\tif (!record_writer.is_open()) {\n\t\t\tthrow LOG_ERROR_EXCEPTION(\"Could not open full text shard (\" + cache_filename() + \"). Error: \" +\n\t\t\t\tstd::string(strerror(errno)));\n\t\t}\n\n\t\tstd::ofstream key_writer(key_cache_filename(), std::ios::binary | std::ios::app);\n\t\tif (!key_writer.is_open()) {\n\t\t\tthrow LOG_ERROR_EXCEPTION(\"Could not open full text shard (\" + key_cache_filename() + \"). Error: \" +\n\t\t\t\tstd::string(strerror(errno)));\n\t\t}\n\n\t\trecord_writer.write((const char *)m_record_cache.data(), m_record_cache.size() * sizeof(data_record));\n\t\tkey_writer.write((const char *)m_key_cache.data(), m_key_cache.size() * sizeof(uint64_t));\n\n\t\tm_record_cache.clear();\n\t\tm_key_cache.clear();\n\t\tm_record_cache.shrink_to_fit();\n\t\tm_key_cache.shrink_to_fit();\n\t}\n\n\ttemplate<typename data_record>\n\tvoid basic_index_builder<data_record>::merge() {\n\n\t\t{\n\t\t\tread_append_cache();\n\t\t\tsort_cache();\n\t\t\tsave_file();\n\t\t\ttruncate_cache_files();\n\t\t}\n\n\t}\n\n\t/*\n\t\tTransforms all the bitmaps in the index. 
Basically generating new bitmaps with the transform applied.\n\t*/\n\ttemplate<typename data_record>\n\tvoid basic_index_builder<data_record>::transform(const std::function<data_record(const data_record &, size_t)> &transform) {\n\n\t\tread_data_to_cache();\n\n\t\t// Apply transforms.\n\t\tfor (auto &iter : m_cache) {\n\t\t\tfor (size_t i = 0; i < iter.second.size(); i++) {\n\t\t\t\titer.second[i] = transform(iter.second[i], iter.second.size());\n\t\t\t}\n\t\t}\n\n\t\tsave_file();\n\t\ttruncate_cache_files();\n\t}\n\n\ttemplate<typename data_record>\n\tvoid basic_index_builder<data_record>::sort_by(const std::function<bool(const data_record &a, const data_record &b)> comp) {\n\t\tread_data_to_cache();\n\n\t\tfor (auto &iter : m_cache) {\n\t\t\tsort(iter.second.begin(), iter.second.end(), comp);\n\t\t}\n\n\t\tsave_file();\n\t\ttruncate_cache_files();\n\t}\n\n\t/*\n\t\tDeletes ALL data from this shard.\n\t*/\n\ttemplate<typename data_record>\n\tvoid basic_index_builder<data_record>::truncate() {\n\t\tcreate_directories();\n\t\ttruncate_cache_files();\n\n\t\tstd::ofstream target_writer(target_filename(), std::ios::trunc);\n\t\ttarget_writer.close();\n\t}\n\n\t/*\n\t\tDeletes all data from caches.\n\t*/\n\ttemplate<typename data_record>\n\tvoid basic_index_builder<data_record>::truncate_cache_files() {\n\n\t\treset_cache_variables();\n\n\t\tfile::delete_file(cache_filename());\n\t\tfile::delete_file(key_cache_filename());\n\t}\n\n\ttemplate<typename data_record>\n\tvoid basic_index_builder<data_record>::create_directories() {\n\t\tfor (size_t i = 0; i < 8; i++) {\n\t\t\tboost::filesystem::create_directories(config::data_path() + \"/\" + std::to_string(i) + \"/full_text/\" +\n\t\t\t\tm_db_name);\n\t\t}\n\t}\n\n\ttemplate<typename data_record>\n\tvoid basic_index_builder<data_record>::read_append_cache() {\n\n\t\t// Read the current file.\n\t\tread_data_to_cache();\n\n\t\t//profiler::instance prof(\"index_builder::read_append_cache\");\n\n\t\t// Read the cache into 
memory.\n\t\tstd::ifstream reader(cache_filename(), std::ios::binary);\n\t\tif (!reader.is_open()) {\n\t\t\tthrow LOG_ERROR_EXCEPTION(\"Could not open full text shard (\" + cache_filename() + \"). Error: \" + std::string(strerror(errno)));\n\t\t}\n\n\t\tstd::ifstream key_reader(key_cache_filename(), std::ios::binary);\n\t\tif (!key_reader.is_open()) {\n\t\t\tthrow LOG_ERROR_EXCEPTION(\"Could not open full text shard (\" + key_cache_filename() + \"). Error: \" + std::string(strerror(errno)));\n\t\t}\n\n\t\tconst size_t buffer_len = 10000;\n\n\t\tstd::unique_ptr<data_record[]> buffer_allocator;\n\t\ttry {\n\t\t\tbuffer_allocator = std::make_unique<data_record[]>(buffer_len);\n\t\t} catch (std::bad_alloc &exception) {\n\t\t\tstd::cout << \"bad_alloc detected: \" << exception.what() << \" file: \" << __FILE__ << \" line: \" << __LINE__ << std::endl;\n\t\t\tstd::cout << \"tried to allocate: \" << buffer_len * sizeof(data_record) << \" bytes\" << std::endl;\n\t\t\treturn;\n\t\t}\n\n\t\tstd::unique_ptr<uint64_t[]> key_buffer_allocator;\n\t\ttry {\n\t\t\tkey_buffer_allocator = std::make_unique<uint64_t[]>(buffer_len);\n\t\t} catch (std::bad_alloc &exception) {\n\t\t\tstd::cout << \"bad_alloc detected: \" << exception.what() << \" file: \" << __FILE__ << \" line: \" << __LINE__ << std::endl;\n\t\t\tstd::cout << \"tried to allocate: \" << buffer_len * sizeof(uint64_t) << \" bytes\" << std::endl;\n\t\t\treturn;\n\t\t}\n\n\t\tdata_record *buffer = buffer_allocator.get();\n\t\tuint64_t *key_buffer = key_buffer_allocator.get();\n\n\t\treader.seekg(0, std::ios::beg);\n\n\t\tunordered_map<uint64_t, uint32_t> internal_id_map; \n\t\tunordered_map<uint64_t, vector<uint32_t>> bitmap_data;\n\n\t\twhile (!reader.eof()) {\n\n\t\t\treader.read((char *)buffer, buffer_len * sizeof(data_record));\n\t\t\tkey_reader.read((char *)key_buffer, buffer_len * sizeof(uint64_t));\n\n\t\t\tconst size_t read_bytes = reader.gcount();\n\t\t\tconst size_t num_records = read_bytes / 
sizeof(data_record);\n\n\t\t\tfor (size_t i = 0; i < num_records; i++) {\n\t\t\t\tm_cache[key_buffer[i]].push_back(buffer[i]);\n\t\t\t}\n\t\t}\n\t}\n\n\t/*\n\t * Reads the file into RAM.\n\t * */\n\ttemplate<typename data_record>\n\tvoid basic_index_builder<data_record>::read_data_to_cache() {\n\n\t\treset_cache_variables();\n\n\t\tstd::ifstream reader(target_filename(), std::ios::binary);\n\t\tif (!reader.is_open()) return;\n\n\t\treader.seekg(0, std::ios::end);\n\t\tconst size_t file_size = reader.tellg();\n\t\tif (file_size <= this->hash_table_byte_size()) return;\n\t\treader.seekg(this->hash_table_byte_size(), std::ios::beg);\n\n\t\twhile (this->read_page_into(reader, m_cache)) {\n\t\t}\n\t}\n\n\ttemplate<typename data_record>\n\tvoid basic_index_builder<data_record>::sort_cache() {\n\t\tfor (auto &iter : m_cache) {\n\t\t\tsort_record_list(iter.first, iter.second);\n\t\t}\n\t}\n\n\ttemplate<typename data_record>\n\tvoid basic_index_builder<data_record>::sort_record_list(uint64_t key, std::vector<data_record> &records) {\n\n\t\t// Sort records.\n\t\tstd::sort(records.begin(), records.end());\n\n\t\t// Sum equal elements.\n\t\tfor (size_t i = 0, j = 1; i < records.size() && j < records.size(); j++) {\n\t\t\tif (records[i] != records[j]) {\n\t\t\t\ti = j;\n\t\t\t} else {\n\t\t\t\trecords[i] += records[j];\n\t\t\t}\n\t\t}\n\n\t\t// Delete consecutive equal elements. 
Only keeping the first unique.\n\t\tauto last = std::unique(records.begin(), records.end());\n\t\trecords.erase(last, records.end());\n\n\n\t\tif (records.size() > m_max_results) {\n\t\t\t// Sort before truncation\n\t\t\tstd::sort(records.begin(), records.end(), typename data_record::truncate_order());\n\t\t\trecords.resize(config::ft_max_results_per_section);\n\n\t\t\t// Future fix here is to add hyper log log counting for words with too many urls.\n\t\t}\n\n\t\tstd::sort(records.begin(), records.end());\n\t}\n\n\ttemplate<typename data_record>\n\tvoid basic_index_builder<data_record>::reset_cache_variables() {\n\t\tm_cache = std::map<uint64_t, vector<data_record>>{};\n\t}\n\n\ttemplate<typename data_record>\n\tvoid basic_index_builder<data_record>::save_file() {\n\n\t\t//profiler::instance prof(\"index_builder::save_file\");\n\n\t\tstd::ofstream writer(target_filename(), std::ios::binary | std::ios::trunc);\n\t\tif (!writer.is_open()) {\n\t\t\tthrow LOG_ERROR_EXCEPTION(\"Could not open full text shard. 
Error: \" + std::string(strerror(errno)));\n\t\t}\n\n\t\treset_key_map(writer);\n\n\t\tstd::map<uint64_t, std::vector<uint64_t>> pages;\n\t\tfor (auto &iter : m_cache) {\n\t\t\tif (this->m_hash_table_size) {\n\t\t\t\tpages[iter.first % this->m_hash_table_size].push_back(iter.first);\n\t\t\t} else {\n\t\t\t\tpages[0].push_back(iter.first);\n\t\t\t}\n\t\t}\n\n\t\tfor (const auto &iter : pages) {\n\t\t\tsize_t page_pos = write_page(writer, iter.second);\n\t\t\twrite_key(writer, iter.first, page_pos);\n\t\t\twriter.flush();\n\t\t}\n\t}\n\n\ttemplate<typename data_record>\n\tvoid basic_index_builder<data_record>::write_key(std::ofstream &key_writer, uint64_t key, size_t page_pos) {\n\t\tif (this->m_hash_table_size > 0) {\n\t\t\tassert(key < this->m_hash_table_size);\n\t\t\tkey_writer.seekp(key * sizeof(uint64_t));\n\t\t\tkey_writer.write((char *)&page_pos, sizeof(size_t));\n\t\t}\n\t}\n\n\t/*\n\t * Writes the page with keys, appending it to the file stream writer.\n\t * */\n\ttemplate<typename data_record>\n\tsize_t basic_index_builder<data_record>::write_page(std::ofstream &writer, const std::vector<uint64_t> &keys) {\n\n\t\twriter.seekp(0, ios::end);\n\n\t\tconst size_t page_pos = writer.tellp();\n\n\t\tsize_t num_keys = keys.size();\n\n\t\twriter.write((char *)&num_keys, 8);\n\t\twriter.write((char *)keys.data(), keys.size() * 8);\n\n\t\tstd::vector<size_t> v_pos;\n\t\tstd::vector<size_t> v_len;\n\n\t\tsize_t pos = 0;\n\t\tfor (uint64_t key : keys) {\n\n\t\t\t// Store position and length\n\t\t\tconst size_t len = m_cache[key].size() * sizeof(data_record);\n\t\t\t\n\t\t\tv_pos.push_back(pos);\n\t\t\tv_len.push_back(len);\n\n\t\t\tpos += len;\n\t\t}\n\t\t\n\t\twriter.write((char *)v_pos.data(), keys.size() * 8);\n\t\twriter.write((char *)v_len.data(), keys.size() * 8);\n\n\t\t// Write data.\n\t\tsize_t i = 0;\n\t\tfor (uint64_t key : keys) {\n\t\t\tconst size_t len = v_len[i];\n\t\t\twriter.write((char *)m_cache[key].data(), len);\n\t\t\ti++;\n\t\t}\n\n\t\treturn 
page_pos;\n\t}\n\n\ttemplate<typename data_record>\n\tvoid basic_index_builder<data_record>::reset_key_map(std::ofstream &key_writer) {\n\t\tkey_writer.seekp(0);\n\t\tuint64_t data = SIZE_MAX;\n\t\tfor (size_t i = 0; i < this->m_hash_table_size; i++) {\n\t\t\tkey_writer.write((char *)&data, sizeof(uint64_t));\n\t\t}\n\t}\n\n\ttemplate<typename data_record>\n\tstd::string basic_index_builder<data_record>::mountpoint() const {\n\t\treturn std::to_string(m_id % 8);\n\t}\n\n\ttemplate<typename data_record>\n\tstd::string basic_index_builder<data_record>::cache_filename() const {\n\t\tif (m_file_name != \"\") return m_file_name + \".cache\";\n\t\treturn config::data_path() + \"/\" + mountpoint() + \"/full_text/\" + m_db_name + \"/\" + std::to_string(m_id) +\n\t\t\t\".cache\";\n\t}\n\n\ttemplate<typename data_record>\n\tstd::string basic_index_builder<data_record>::key_cache_filename() const {\n\t\tif (m_file_name != \"\") return m_file_name + \".cache.keys\";\n\t\treturn config::data_path() + \"/\" + mountpoint() + \"/full_text/\" + m_db_name + \"/\" + std::to_string(m_id) +\n\t\t\t\".cache.keys\";\n\t}\n\n\ttemplate<typename data_record>\n\tstd::string basic_index_builder<data_record>::target_filename() const {\n\t\tif (m_file_name != \"\") return m_file_name + \".data\";\n\t\treturn config::data_path() + \"/\" + mountpoint() + \"/full_text/\" + m_db_name + \"/\" + std::to_string(m_id) +\n\t\t\t\".data\";\n\t}\n\n}\n"
  },
  {
    "path": "src/indexer/console.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include \"console.h\"\n#include <vector>\n#include <iomanip>\n#include \"text/text.h\"\n#include \"indexer/index_manager.h\"\n#include \"indexer/sharded.h\"\n#include \"indexer/basic_index.h\"\n#include \"indexer/counted_record.h\"\n#include \"URL.h\"\n#include \"transfer/transfer.h\"\n#include \"domain_stats/domain_stats.h\"\n#include \"merger.h\"\n#include \"file/tsv_file_remote.h\"\n#include \"algorithm/bloom_filter.h\"\n#include \"parser/parser.h\"\n#include \"http/server.h\"\n#include \"json.hpp\"\n\nnamespace indexer {\n\n\tvoid cmd_index(index_manager &idx_manager, const std::vector<std::string> &args) {\n\t\tif (args.size() < 2) return;\n\n\t\tmerger::start_merge_thread();\n\n\t\tconst auto batch = args[1];\n\t\tsize_t limit = 0;\n\t\tif 
(args.size() > 2) limit = stoull(args[2]);\n\n\t\tfile::tsv_file_remote warc_paths_file(std::string(\"crawl-data/\") + batch + \"/warc.paths.gz\");\n\t\tstd::vector<std::string> warc_paths;\n\t\twarc_paths_file.read_column_into(0, warc_paths);\n\n\t\tif (limit && warc_paths.size() > limit) warc_paths.resize(limit);\n\n\t\tfor (auto &path : warc_paths) {\n\t\t\tconst size_t pos = path.find(\".warc.gz\");\n\t\t\tif (pos != std::string::npos) {\n\t\t\t\tpath.replace(pos, 8, \".gz\");\n\t\t\t}\n\t\t}\n\t\tauto local_files = transfer::download_gz_files_to_disk(warc_paths);\n\t\tcout << \"starting indexer\" << endl;\n\t\tidx_manager.add_index_files_threaded(local_files, 24);\n\t\tcout << \"done with indexer\" << endl;\n\t\ttransfer::delete_downloaded_files(local_files);\n\n\t\tmerger::stop_merge_thread();\n\t}\n\n\tvoid cmd_search(index_manager &idx_manager, hash_table2::hash_table &ht, hash_table2::hash_table &url_ht, const std::string &query) {\n\n\t\tprofiler::instance prof(\"domain search\");\n\t\tstd::vector<indexer::return_record> res = idx_manager.find(query);\n\t\tprof.stop();\n\n\t\tcout << \"took \" << prof.get() << \"ms\" << endl;\n\n\t\tcout << setw(50) << \"domain\";\n\t\tcout << setw(20) << \"score\";\n\t\tcout << endl;\n\n\t\tstd::vector<uint64_t> domain_hashes;\n\n\t\tfor (indexer::return_record &rec : res) {\n\t\t\tconst auto host = ht.find(rec.m_value);\n\t\t\tdomain_hashes.push_back(rec.m_value);\n\n\t\t\tcout << setw(50) << host;\n\t\t\tcout << setw(20) << rec.m_score;\n\t\t\tcout << endl;\n\t\t}\n\n\t\tprofiler::instance prof2(\"url searches\");\n\n\t\tcout << \"sending \" << domain_hashes.size() << \" domain hashes\" << endl;\n\n\t\thttp::response http_res = transfer::post(\"http://65.108.132.103/?q=\" + parser::urlencode(query), std::string((char *)domain_hashes.data(), domain_hashes.size() * sizeof(uint64_t)));\n\n\t\tconst auto url_res = http_res.body();\n\n\t\tstd::stringstream ss(url_res);\n\n\t\tstd::map<uint64_t, std::vector<url_record>> 
results;\n\t\twhile (!ss.eof()) {\n\t\t\tuint64_t incoming_domain_hash;\n\t\t\tss.read((char *)&incoming_domain_hash, sizeof(uint64_t));\n\t\t\tif (ss.eof()) break;\n\t\t\tsize_t num_records;\n\t\t\tss.read((char *)&num_records, sizeof(size_t));\n\t\t\tfor (size_t i = 0; i < num_records; i++) {\n\t\t\t\tuint64_t value;\n\t\t\t\tfloat score;\n\t\t\t\tss.read((char *)&value, sizeof(uint64_t));\n\t\t\t\tss.read((char *)&score, sizeof(float));\n\t\t\t\tresults[incoming_domain_hash].push_back(url_record(value, score));\n\t\t\t}\n\t\t}\n\n\t\tfor (auto domain_hash : domain_hashes) {\n\t\t\tfor (const auto &url_record : results[domain_hash]) {\n\t\t\t\tconst auto &line = url_ht.find(url_record.m_value);\n\t\t\t\tstd::vector<std::string> cols;\n\n\t\t\t\tboost::algorithm::split(cols, line, boost::is_any_of(\"\\t\"));\n\t\t\t\tconst auto url = cols[0];\n\t\t\t\tconst auto title = cols[1];\n\t\t\t\tconst auto snippet = cols[4];\n\n\t\t\t\tstd::cout << url << std::endl;\n\t\t\t}\n\t\t}\n\n\t\tcout << \"took \" << prof2.get() << \"ms\" << endl;\n\n\t\tcout << \"got \" << results.size() << \" responses\" << endl;\n\n\t}\n\n\tvoid cmd_word(index_manager &idx_manager, hash_table2::hash_table &ht, const std::string &query) {\n\n\t\tindexer::sharded_builder<indexer::basic_index_builder, indexer::counted_record> word_index_builder(\"word_index\", 256);\n\t\tindexer::sharded<indexer::basic_index, indexer::counted_record> word_index(\"word_index\", 256);\n\n\t\tconst uint64_t word_hash = ::algorithm::hash(query);\n\t\tstd::vector<indexer::counted_record> res = word_index.find(word_hash, 100000);\n\n\t\tsize_t pos = 0;\n\t\tfor (auto &rec : res) {\n\t\t\tconst auto host = ht.find(rec.m_value);\n\t\t\tcout << host << \": \" << rec.m_count << \" score: \" << rec.m_score << \" pos: \" << pos << \" m_value: \" << rec.m_value << \" doc_size: \" << word_index_builder.document_size(rec.m_value) << endl;\n\t\t\tpos++;\n\t\t}\n\n\t}\n\n\tvoid cmd_domain_info(index_manager &idx_manager, 
hash_table2::hash_table &ht, const std::string &domain, size_t limit, size_t offset) {\n\n\t\tindexer::sharded<indexer::basic_index, indexer::counted_record> idx(\"title_word_counter\", 997);\n\n\t\tconst uint64_t domain_hash = ::algorithm::hash(domain);\n\t\tstd::vector<indexer::counted_record> res = idx.find(domain_hash);\n\n\t\tsort(res.begin(), res.end(), indexer::counted_record::truncate_order());\n\n\t\tsize_t pos = 0;\n\t\tfor (auto &rec : res) {\n\t\t\tconst auto word = ht.find(rec.m_value);\n\t\t\tcout << word << \": \" << rec.m_count << endl;\n\t\t\tif (pos >= limit) break;\n\t\t\tpos++;\n\t\t}\n\n\t}\n\n\tvoid cmd_word(index_manager &idx_manager, hash_table2::hash_table &ht, const std::string &query, const std::string &domain) {\n\n\t\tindexer::sharded_builder<indexer::basic_index_builder, indexer::counted_record> word_index_builder(\"word_index\", 256);\n\t\tindexer::sharded<indexer::basic_index, indexer::counted_record> word_index(\"word_index\", 256);\n\n\t\tconst uint64_t word_hash = ::algorithm::hash(query);\n\t\tstd::vector<indexer::counted_record> res = word_index.find(word_hash);\n\n\t\tsize_t pos = 0;\n\t\tfor (auto &rec : res) {\n\t\t\tconst auto host = ht.find(rec.m_value);\n\t\t\tif (host == domain) {\n\t\t\t\tcout << host << \": \" << rec.m_count << \" score: \" << rec.m_score << \" pos: \" << pos << \" m_value: \" << rec.m_value << \" doc_size: \" << word_index_builder.document_size(rec.m_value) << endl;\n\t\t\t}\n\t\t\tpos++;\n\t\t}\n\n\t}\n\n\tvoid cmd_word_num(index_manager &idx_manager, hash_table2::hash_table &ht, const std::string &query) {\n\n\t\tindexer::sharded<indexer::basic_index, indexer::counted_record> word_index(\"word_index\", 256);\n\n\t\tconst uint64_t word_hash = ::algorithm::hash(query);\n\t\tstd::vector<indexer::counted_record> res = word_index.find(word_hash);\n\n\t\tcout << \"num_records: \" << res.size() << endl;\n\n\t}\n\n\tvoid cmd_harmonic(const std::vector<std::string> &args) {\n\t\tif (args.size() < 2) 
return;\n\t\tfloat harmonic = domain_stats::harmonic_centrality(URL(args[1]));\n\t\tcout << \"url: \" << args[1] << \" has harmonic centrality \" << harmonic << endl;\n\t}\n\n\tstd::vector<std::string> input_to_args(const std::string &input) {\n\t\tconst auto word_boundary = \" \\t,|!\";\n\n\t\tstd::vector<std::string> raw_words, words;\n\t\tboost::split(raw_words, input, boost::is_any_of(word_boundary));\n\n\t\tfor (auto &word : raw_words) {\n\t\t\tif (word.size()) {\n\t\t\t\twords.push_back(word);\n\t\t\t}\n\t\t}\n\n\t\treturn words;\n\t}\n\n\tvoid console() {\n\t}\n\n\tvoid index_link_batch(const std::string &batch) {\n\n\t\t::algorithm::bloom_filter urls_to_index(625000027);\n\t\turls_to_index.read_file(config::data_path() + \"/0/url_filter.bloom\");\n\n\t\tsize_t limit = 1000;\n\t\tsize_t offset = 0;\n\t\twhile (true) {\n\t\t\tindexer::index_manager idx_manager;\n\n\t\t\tmerger::start_merge_thread();\n\n\t\t\tfile::tsv_file_remote warc_paths_file(std::string(\"crawl-data/\") + batch + \"/warc.paths\");\n\t\t\tstd::vector<std::string> warc_paths;\n\t\t\twarc_paths_file.read_column_into(0, warc_paths, limit, offset);\n\n\t\t\tif (warc_paths.size() == 0) {\n\t\t\t\tmerger::stop_merge_thread();\n\t\t\t\tbreak;\n\t\t\t}\n\n\t\t\tauto local_files = transfer::download_gz_files_to_disk(warc_paths);\n\t\t\tcout << \"starting indexer\" << endl;\n\t\t\tidx_manager.add_link_files_threaded(local_files, 32, urls_to_index);\n\t\t\tcout << \"done with indexer\" << endl;\n\t\t\ttransfer::delete_downloaded_files(local_files);\n\n\t\t\tmerger::stop_merge_thread();\n\n\t\t\toffset += limit;\n\t\t}\n\t}\n\n\tvoid index_links() {\n\n\t\tdomain_stats::download_domain_stats();\n\t\tLOG_INFO(\"Done download_domain_stats\");\n\t\t\n\t\tfor (const std::string &batch : config::link_batches) {\n\t\t\tindex_link_batch(batch);\n\t\t}\n\t}\n\n\tvoid index_url_batch(const std::string &batch) {\n\n\t\tsize_t limit = 1000;\n\t\tsize_t offset = 0;\n\t\twhile (true) 
{\n\t\t\tindexer::index_manager idx_manager;\n\n\t\t\tmerger::start_merge_thread();\n\n\t\t\tfile::tsv_file_remote warc_paths_file(std::string(\"crawl-data/\") + batch + \"/warc.paths\");\n\t\t\tstd::vector<std::string> warc_paths;\n\t\t\twarc_paths_file.read_column_into(0, warc_paths, limit, offset);\n\n\t\t\tif (warc_paths.size() == 0) {\n\t\t\t\tmerger::stop_merge_thread();\n\t\t\t\tbreak;\n\t\t\t}\n\n\t\t\tcout << \"downloading \" << warc_paths.size() << \" to disc\" << endl;\n\t\t\tauto local_files = transfer::download_gz_files_to_disk(warc_paths);\n\t\t\tcout << \"starting indexer\" << endl;\n\t\t\tidx_manager.add_index_files_threaded(local_files, 32);\n\t\t\tcout << \"done with indexer\" << endl;\n\t\t\ttransfer::delete_downloaded_files(local_files);\n\n\t\t\tmerger::stop_merge_thread();\n\n\t\t\toffset += limit;\n\t\t}\n\t\tprofiler::print_report();\n\t}\n\n\tvoid index_urls() {\n\n\t\tdomain_stats::download_domain_stats();\n\t\tLOG_INFO(\"Done download_domain_stats\");\n\t\t\n\t\tfor (const std::string &batch : config::batches) {\n\t\t\tindex_url_batch(batch);\n\t\t}\n\t}\n\n\tvoid truncate_links() {\n\t\t{\n\t\t\tindexer::index_manager idx_manager;\n\t\t\tidx_manager.truncate_links();\n\t\t}\n\t}\n\n\tvoid domain_info_server() {\n\n\t\tdomain_stats::download_domain_stats();\n\t\tLOG_INFO(\"Done download_domain_stats\");\n\n\t\tindexer::index_manager idx_manager;\n\t\thash_table2::hash_table ht(\"word_hash_table\");\n\n\t\tindexer::sharded<indexer::basic_index, counted_record> fp_title_counter(\"first_page_title_word_counter\", 101);\n\t\tindexer::sharded<indexer::basic_index, indexer::counted_record> title_counter(\"title_word_counter\", 997);\n\t\tindexer::sharded<indexer::basic_index, indexer::counted_record> link_counter(\"link_word_counter\", 4001);\n\n\t\tcout << \"starting server...\" << endl;\n\n\t\t::http::server srv([&ht, &fp_title_counter, &title_counter, &link_counter](const http::request &req) {\n\t\t\thttp::response res;\n\n\t\t\tURL url = 
req.url();\n\n\t\t\tauto query = url.query();\n\n\t\t\tsize_t limit = 1000;\n\t\t\tif (query.count(\"limit\")) limit = std::stoi(query[\"limit\"]);\n\n\t\t\tsize_t offset = 0;\n\t\t\tif (query.count(\"offset\")) offset = std::stoi(query[\"offset\"]);\n\n\t\t\tif (url.path() == \"/favicon.ico\") {\n\t\t\t\tres.code(404);\n\t\t\t\tres.body(\"404\");\n\t\t\t\treturn res;\n\t\t\t}\n\n\t\t\tstd::stringstream body;\n\n\t\t\tauto domain = url.path();\n\t\t\tdomain.erase(0, 1);\n\n\t\t\tbody << \"<html><head><meta http-equiv='Content-type' content='text/html; charset=utf-8'></head><body>\";\n\n\t\t\tbody << \"<h1>\" << domain << \"</h1>\" << endl;\n\t\t\tbody << \"<h3>harmonic: \" << domain_stats::harmonic_centrality(domain) << \"</h3>\" << endl;\n\t\t\tbody << \"<h3>hash: \" << ::algorithm::hash(domain) << \"</h3>\" << endl;\n\n\t\t\tbody << \"<pre>\";\n\n\t\t\tconst uint64_t domain_hash = ::algorithm::hash(domain);\n\t\t\tauto fp_results = fp_title_counter.find(domain_hash);\n\t\t\tauto results = title_counter.find(domain_hash);\n\t\t\tauto link_results = link_counter.find(domain_hash);\n\n\t\t\tsort(fp_results.begin(), fp_results.end(), indexer::counted_record::truncate_order());\n\t\t\tsort(results.begin(), results.end(), indexer::counted_record::truncate_order());\n\t\t\tsort(link_results.begin(), link_results.end(), indexer::counted_record::truncate_order());\n\n\t\t\tbody << \"Limit: \" + std::to_string(limit) << endl;\n\t\t\tbody << \"Offset: \" + std::to_string(offset) << endl << endl;\n\t\t\tconst size_t original_offset = offset;\n\t\t\tbody << \"</pre>\";\n\t\t\tbody << \"<div class=lefter>\";\n\t\t\tbody << \"<pre class=green>\";\n\t\t\tfor (auto &rec : fp_results) {\n\t\t\t\tconst auto word = ht.find(rec.m_value);\n\t\t\t\tbody << word << \": \" << rec.m_count << endl;\n\t\t\t}\n\t\t\tbody << \"</pre>\";\n\t\t\tbody << \"<pre class=green>\";\n\t\t\tdouble threshold = results.size() ? 
results[0].m_count : 0.0;\n\t\t\tsize_t offset_start = 0;\n\t\t\tfor (auto &rec : results) {\n\t\t\t\tif (rec.m_count >= threshold * 0.8) {\n\t\t\t\t\tconst auto word = ht.find(rec.m_value);\n\t\t\t\t\tbody << word << \": \" << rec.m_count << endl;\n\t\t\t\t\toffset_start++;\n\t\t\t\t} else {\n\t\t\t\t\tbreak;\n\t\t\t\t}\n\t\t\t}\n\t\t\tif (offset < offset_start) offset = offset_start;\n\t\t\tbody << \"</pre>\";\n\n\t\t\tbody << \"<pre>\";\n\n\t\t\tsize_t pos = 0;\n\t\t\tfor (auto &rec : results) {\n\t\t\t\tif (pos >= offset) {\n\t\t\t\t\tconst auto word = ht.find(rec.m_value);\n\t\t\t\t\tbody << word << \": \" << rec.m_count << endl;\n\t\t\t\t}\n\t\t\t\tif (pos >= limit + offset) break;\n\t\t\t\tpos++;\n\t\t\t}\n\n\t\t\tbody << \"</pre></div><pre class=righter>\";\n\n\t\t\tpos = 0;\n\t\t\tfor (auto &rec : link_results) {\n\t\t\t\tif (pos >= original_offset) {\n\t\t\t\t\tconst auto word = ht.find(rec.m_value);\n\t\t\t\t\tbody << word << \": \" << rec.m_count << endl;\n\t\t\t\t}\n\t\t\t\tif (pos >= limit + original_offset) break;\n\t\t\t\tpos++;\n\t\t\t}\n\n\t\t\tbody << \"</pre><style>.lefter {width: 50%; float: left; }\";\n\n\t\t\tres.code(200);\n\n\t\t\tres.body(body.str());\n\n\t\t\treturn res;\n\t\t});\n\t}\n\n\tvoid make_domain_index() {\n\n\t\t/*sharded_index<domain_record> idx(\"domain_info\", 997);\n\n\t\tsize_t count = 0;\n\t\tidx.for_each([&count](uint64_t key, roaring::Roaring &recs) {\n\t\t\tcount++;\n\t\t});\n\n\t\tcout << \"num_words: \" << count << endl;\n\n\t\treturn;*/\n\n\t\tdomain_stats::download_domain_stats();\n\t\tLOG_INFO(\"Done download_domain_stats\");\n\n\t\tindexer::sharded<indexer::basic_index, counted_record> fp_title_counter(\"first_page_title_word_counter\", 101);\n\t\tindexer::sharded<indexer::basic_index, indexer::counted_record> title_counter(\"title_word_counter\", 997);\n\t\tindexer::sharded<indexer::basic_index, indexer::counted_record> link_counter(\"link_word_counter\", 
4001);\n\n\t\tmerger::start_merge_thread();\n\n\t\tsharded_index_builder<domain_record> idx(\"domain_info\", 997);\n\t\tidx.truncate();\n\n\t\tfp_title_counter.for_each([&idx](uint64_t domain_hash, std::vector<counted_record> &records) {\n\t\t\tfor (const auto &record : records) {\n\t\t\t\tidx.add(record.m_value, domain_record(domain_hash, 0.0f));\n\t\t\t}\n\t\t});\n\n\t\tmerger::stop_merge_thread_only_append();\n\t\tidx.merge();\n\t\tmerger::start_merge_thread();\n\n\t\ttitle_counter.for_each([&idx](uint64_t domain_hash, std::vector<counted_record> &records) {\n\n\t\t\t// Sort by score.\n\t\t\tsort(records.begin(), records.end(), counted_record::truncate_order());\n\t\t\tfloat threshold = records.size() > 0 ? records[0].m_count * 0.8f : 0.0f;\n\t\t\tfor (const auto &record : records) {\n\t\t\t\tif (record.m_count < threshold) break;\n\t\t\t\tidx.add(record.m_value, domain_record(domain_hash, 0.0f));\n\t\t\t}\n\t\t});\n\n\t\tmerger::stop_merge_thread_only_append();\n\t\tidx.merge();\n\t\tmerger::start_merge_thread();\n\n\t\tlink_counter.for_each([&idx](uint64_t domain_hash, std::vector<counted_record> &records) {\n\n\t\t\t// Sort by score.\n\t\t\tsort(records.begin(), records.end(), counted_record::truncate_order());\n\t\t\tfor (size_t i = 0; i < records.size() && i < 100; i++) {\n\t\t\t\tidx.add(records[i].m_value, domain_record(domain_hash, 0.0f));\n\t\t\t}\n\t\t});\n\n\t\tmerger::stop_merge_thread_only_append();\n\t\tidx.merge();\n\t\tidx.optimize();\n\t}\n\n\tvoid make_domain_index_scores() {\n\n\t\tdomain_stats::download_domain_stats();\n\t\tLOG_INFO(\"Done download_domain_stats\");\n\n\t\thash_table2::hash_table ht(\"index_manager\");\n\n\t\tsharded_index_builder<domain_record> idx(\"domain_info\", 997);\n\n\t\tidx.for_each_record([&ht](domain_record &rec) {\n\t\t\tURL u;\n\t\t\tconst auto domain = ht.find(rec.m_value);\n\n\t\t\tfloat harmonic = domain_stats::harmonic_centrality(domain);\n\n\t\t\trec.m_score = harmonic;\n\t\t});\n\t\t\n\t}\n\n\tvoid 
make_url_bloom_filter() {\n\n\t\thash_table2::hash_table ht(\"index_manager\");\n\n\t\t::algorithm::bloom_filter urls_to_index(625000027);\n\n\t\tht.for_each_key([&urls_to_index](uint64_t key) {\n\t\t\turls_to_index.insert(key);\n\t\t});\n\n\t\turls_to_index.write_file(config::data_path() + \"/0/url_filter.bloom\");\n\n\t}\n\n\tvoid count_words_that_hit_max() {\n\n\t\tsharded<basic_index, url_record> url_index(\"url_index\", 4001);\n\n\t\tsize_t counter = 0;\n\t\turl_index.for_each([&](uint64_t key, auto &records) {\n\t\t\tif (records.size() >= config::ft_max_results_per_section) {\n\t\t\t\tcounter++;\n\t\t\t\tstd::cout << counter << std::endl;\n\t\t\t}\n\t\t});\n\n\t}\n\n\tsize_t count_urls() {\n\t\tindexer::index_manager idx_manager;\n\t\treturn idx_manager.url_count();\n\t}\n\n}\n"
  },
  {
    "path": "src/indexer/console.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include <iostream>\n\nnamespace indexer {\n\n\tvoid console();\n\tvoid index_links();\n\tvoid index_urls();\n\tvoid truncate_links();\n\tvoid domain_info_server();\n\tvoid search_server();\n\tvoid make_domain_index();\n\tvoid make_domain_index_scores();\n\tvoid make_url_bloom_filter();\n\tvoid optimize_urls();\n\tvoid count_words_that_hit_max();\n\tsize_t count_urls();\n\n}\n"
  },
  {
    "path": "src/indexer/counted_record.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include <iostream>\n\nnamespace indexer {\n\t/*\n\tThis is a record type for counting things.\n\t*/\n\t#pragma pack(4)\n\tclass counted_record {\n\n\t\tpublic:\n\t\tuint64_t m_value;\n\t\tuint64_t m_count;\n\t\tfloat m_score;\n\n\t\tcounted_record() : m_value(0), m_count(1), m_score(0.0f) {};\n\t\tcounted_record(uint64_t value) : m_value(value), m_count(1), m_score(0.0f) {};\n\t\tcounted_record(uint64_t value, float score) : m_value(value), m_count(1), m_score(score) {};\n\t\tcounted_record(uint64_t value, float score, size_t count) : m_value(value), m_count(count), m_score(score) {};\n\n\t\tbool operator==(const counted_record &b) const {\n\t\t\treturn m_value == b.m_value;\n\t\t}\n\n\t\tbool operator<(const counted_record &b) const 
{\n\t\t\treturn m_value < b.m_value;\n\t\t}\n\n\t\tcounted_record &operator+=(const counted_record &b) {\n\t\t\tm_count += b.m_count;\n\t\t\treturn *this;\n\t\t}\n\n\t\t/*\n\t\t * Will be applied to records before truncating. Top records will be kept.\n\t\t * */\n\t\tstruct truncate_order {\n\t\t\tinline bool operator() (const counted_record &a, const counted_record &b) {\n\t\t\t\treturn a.m_count > b.m_count;\n\t\t\t}\n\t\t};\n\n\t\t/*\n\t\t * Will be applied before storing on disk. This is the order the records will be returned in.\n\t\t * */\n\t\tstruct storage_order {\n\t\t\tinline bool operator() (const counted_record &a, const counted_record &b) {\n\t\t\t\treturn a.m_value < b.m_value;\n\t\t\t}\n\t\t};\n\n\t\tbool storage_equal(const counted_record &a) const {\n\t\t\treturn m_value == a.m_value;\n\t\t}\n\n\t};\n\t#pragma pack()\n}\n"
  },
  {
    "path": "src/indexer/domain_link_record.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include <iostream>\n\nnamespace indexer {\n\n\t#pragma pack(4)\n\tclass domain_link_record {\n\t\tpublic:\n\t\tuint64_t m_value;\n\t\tfloat m_score;\n\t\tuint64_t m_source_domain;\n\t\tuint64_t m_target_domain;\n\n\t\tdomain_link_record() : m_value(0), m_score(0.0f) {};\n\t\tdomain_link_record(uint64_t value) : m_value(value), m_score(0.0f) {};\n\t\tdomain_link_record(uint64_t value, float score) : m_value(value), m_score(score) {};\n\t\tdomain_link_record(uint64_t value, float score, uint64_t target_domain)\n\t\t\t\t: m_value(value), m_score(score), m_target_domain(target_domain) {};\n\n\t\tbool operator==(const domain_link_record &b) const {\n\t\t\treturn m_value == b.m_value;\n\t\t}\n\n\t\tbool operator<(const domain_link_record &b) 
const {\n\t\t\treturn m_value < b.m_value;\n\t\t}\n\n\t\tdomain_link_record &operator+=(const domain_link_record &b) {\n\t\t\treturn *this;\n\t\t}\n\n\t\t/*\n\t\t * Will be applied to records before truncating. Top records will be kept.\n\t\t * */\n\t\tstruct truncate_order {\n\t\t\tinline bool operator() (const domain_link_record &a, const domain_link_record &b) {\n\t\t\t\treturn a.m_score > b.m_score;\n\t\t\t}\n\t\t};\n\n\t\t/*\n\t\t * Will be applied before storing on disk. This is the order the records will be returned in.\n\t\t * */\n\t\tstruct storage_order {\n\t\t\tinline bool operator() (const domain_link_record &a, const domain_link_record &b) {\n\t\t\t\treturn a.m_target_domain < b.m_target_domain;\n\t\t\t}\n\t\t};\n\n\t\tbool storage_equal(const domain_link_record &a) const {\n\t\t\treturn m_target_domain == a.m_target_domain;\n\t\t}\n\n\t};\n\t#pragma pack()\n}\n"
  },
  {
    "path": "src/indexer/domain_record.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include \"generic_record.h\"\n\nnamespace indexer {\n\tclass domain_record: public generic_record {\n\n\t\tpublic:\n\t\tdomain_record() : generic_record() {};\n\t\tdomain_record(uint64_t value) : generic_record(value) {};\n\t\tdomain_record(uint64_t value, float score) : generic_record(value, score) {};\n\n\t};\n}\n"
  },
  {
    "path": "src/indexer/generic_record.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include <iostream>\n\nnamespace indexer {\n\t/*\n\tThis is the base class for the record stored on disk. 
Needs to be small!\n\t*/\n\t#pragma pack(4)\n\tclass generic_record {\n\n\t\tpublic:\n\t\tuint64_t m_value;\n\t\tfloat m_score;\n\n\t\texplicit generic_record() : m_value(0), m_score(0.0f) {};\n\t\texplicit generic_record(uint64_t value) : m_value(value), m_score(0.0f) {};\n\t\texplicit generic_record(uint64_t value, float score) : m_value(value), m_score(score) {};\n\n\t\tbool operator==(const generic_record &b) const {\n\t\t\treturn m_value == b.m_value;\n\t\t}\n\n\t\tbool operator<(const generic_record &b) const {\n\t\t\treturn m_value < b.m_value;\n\t\t}\n\n\t\tstruct storage_order {\n\t\t\tinline bool operator() (const generic_record &a, const generic_record &b) {\n\t\t\t\treturn a.m_value < b.m_value;\n\t\t\t}\n\t\t};\n\n\t\t/*\n\t\t * Will be applied to records before truncating. Top records will be kept.\n\t\t * */\n\t\tstruct truncate_order {\n\t\t\tinline bool operator() (const generic_record &a, const generic_record &b) {\n\t\t\t\treturn a.m_score > b.m_score;\n\t\t\t}\n\t\t};\n\n\t\tstruct score_order {\n\t\t\tinline bool operator() (const generic_record &a, const generic_record &b) {\n\t\t\t\treturn a.m_score > b.m_score;\n\t\t\t}\n\t\t};\n\n\t\tbool storage_equal(const generic_record &a) const {\n\t\t\treturn m_value == a.m_value;\n\t\t}\n\n\t\tgeneric_record operator+(const generic_record &b) const {\n\t\t\t// can be overloaded to perform summation over scores but default behaviour is to not add scores.\n\t\t\tgeneric_record sum;\n\t\t\tsum.m_value = m_value;\n\t\t\tsum.m_score = m_score /* + b.m_score */;\n\t\t\treturn sum;\n\t\t}\n\n\t\tgeneric_record &operator+=(const generic_record &b) {\n\t\t\t// can be overloaded to perform summation over scores but default behaviour is to not add scores.\n\t\t\t// m_score += b.m_score;\n\t\t\treturn *this;\n\t\t}\n\n\t};\n\t#pragma pack()\n}\n"
  },
  {
    "path": "src/indexer/index.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include <set>\n#include <cmath>\n#include <mutex>\n#include \"index_base.h\"\n#include \"roaring/roaring.hh\"\n#include \"algorithm/intersection.h\"\n#include \"algorithm/top_k.h\"\n\nnamespace indexer {\n\n\ttemplate<typename data_record>\n\tclass index : public index_base<data_record> {\n\n\tpublic:\n\n\t\texplicit index(const std::string &file_name);\n\t\texplicit index(const std::string &db_name, size_t id);\n\t\texplicit index(const std::string &db_name, size_t id, size_t hash_table_size);\n\t\texplicit index(std::istream *reader, size_t hash_table_size);\n\t\t~index();\n\n\t\tstd::vector<data_record> find(uint64_t key) const;\n\t\troaring::Roaring find_bitmap(uint64_t key) const;\n\n\t\t/*\n\t\t * Find intersection of multiple 
keys\n\t\t * Returns vector with records in storage order.\n\t\t * */\n\t\tstd::vector<data_record> find_intersection(const std::vector<uint64_t> &keys) const;\n\n\t\t/*\n\t\t * Find intersection of multiple keys applying lambda function score_mod to the scores before.\n\t\t * Returns n records with highest score.\n\t\t * score_mod is applied in storage_order of data_record.\n\t\t * */\n\t\tstd::vector<data_record> find_top(size_t &total_num_results, const std::vector<uint64_t> &keys, size_t n,\n\t\t\t\tstd::function<float(const data_record &)> score_mod = {}) const;\n\n\t\t/*\n\t\t * Overload without total_num_results.\n\t\t * */\n\t\tstd::vector<data_record> find_top(const std::vector<uint64_t> &keys, size_t n,\n\t\t\t\tstd::function<float(const data_record &)> score_mod = {}) const;\n\n\n\t\t\n\t\t/*\n\t\t * Returns inverse document frequency (idf) for the last search.\n\t\t * */\n\t\tfloat get_idf(size_t documents_with_term) const;\n\t\tsize_t get_document_count() const { return m_unique_count; }\n\n\t\tvoid print_stats();\n\n\t\tstd::set<uint64_t> get_keys(size_t with_more_than_records) const;\n\t\tconst std::vector<data_record> &records() const { return m_records; }\n\n\t\tvoid for_each(std::function<void(uint64_t key, roaring::Roaring &bitmap)> on_each_key) const;\n\n\tprivate:\n\n\t\tmutable std::istream *m_reader;\n\t\tstd::unique_ptr<std::ifstream> m_default_reader;\n\n\t\tstd::string m_file_name;\n\t\tstd::string m_db_name;\n\t\tsize_t m_id;\n\t\tsize_t m_unique_count = 0;\n\n\t\tstd::vector<data_record> m_records;\n\t\tmutable std::vector<float> m_scores;\n\n\t\tsize_t read_key_pos(uint64_t key) const;\n\t\tvoid read_meta();\n\t\tstd::string mountpoint() const;\n\t\tstd::string filename() const;\n\t\tstd::string meta_filename() const;\n\t\tvoid read_records();\n\t\t\n\t};\n\n\ttemplate<typename data_record>\n\tindex<data_record>::index(const std::string &file_name)\n\t: index_base<data_record>(), m_file_name(file_name) {\n\t\tm_default_reader = 
std::make_unique<std::ifstream>(filename(), std::ios::binary);\n\t\tm_reader = m_default_reader.get();\n\t\tread_records();\n\t}\n\n\ttemplate<typename data_record>\n\tindex<data_record>::index(const std::string &db_name, size_t id)\n\t: index_base<data_record>(), m_db_name(db_name), m_id(id) {\n\t\tm_default_reader = std::make_unique<std::ifstream>(filename(), std::ios::binary);\n\t\tm_reader = m_default_reader.get();\n\t\tread_records();\n\t}\n\n\ttemplate<typename data_record>\n\tindex<data_record>::index(const std::string &db_name, size_t id, size_t hash_table_size)\n\t: index_base<data_record>(hash_table_size), m_db_name(db_name), m_id(id) {\n\t\tm_default_reader = std::make_unique<std::ifstream>(filename(), std::ios::binary);\n\t\tm_reader = m_default_reader.get();\n\t\tread_records();\n\t}\n\n\ttemplate<typename data_record>\n\tindex<data_record>::index(std::istream *reader, size_t hash_table_size)\n\t: index_base<data_record>(hash_table_size) {\n\t\tm_reader = reader;\n\t\tread_records();\n\t}\n\n\ttemplate<typename data_record>\n\tindex<data_record>::~index() {\n\t}\n\n\ttemplate<typename data_record>\n\tstd::vector<data_record> index<data_record>::find(uint64_t key) const {\n\n\t\tstd::lock_guard lock(this->m_lock);\n\n\t\troaring::Roaring rr = find_bitmap(key);\n\n\t\tstd::function<data_record(uint32_t)> id_to_rec = [this](uint32_t id) {\n\t\t\tdata_record rec;\n\t\t\tm_reader->seekg((this->m_hash_table_size + 1) * sizeof(uint64_t) + id * sizeof(data_record), std::ios::beg);\n\t\t\tm_reader->read((char *)&rec, sizeof(data_record));\n\t\t\treturn rec;\n\t\t};\n\n\t\tstd::vector<data_record> ret;\n\t\tfor (uint32_t internal_id : rr) {\n\t\t\tret.emplace_back(id_to_rec(internal_id));\n\t\t}\n\n\t\treturn ret;\n\t}\n\n\ttemplate<typename data_record>\n\troaring::Roaring index<data_record>::find_bitmap(uint64_t key) const {\n\t\tsize_t key_pos = read_key_pos(key);\n\n\t\tstd::lock_guard lock(this->m_lock);\n\n\t\tif (key_pos == SIZE_MAX) {\n\t\t\treturn 
roaring::Roaring();\n\t\t}\n\n\t\t// Read page.\n\t\tm_reader->seekg(key_pos, std::ios::beg);\n\t\tsize_t num_keys;\n\t\tm_reader->read((char *)&num_keys, sizeof(size_t));\n\n\t\tstd::unique_ptr<uint64_t[]> keys_allocator = std::make_unique<uint64_t[]>(num_keys);\n\t\tuint64_t *keys = keys_allocator.get();\n\t\tm_reader->read((char *)keys, num_keys * sizeof(uint64_t));\n\n\t\tsize_t key_data_pos = SIZE_MAX;\n\t\tfor (size_t i = 0; i < num_keys; i++) {\n\t\t\tif (keys[i] == key) {\n\t\t\t\tkey_data_pos = i;\n\t\t\t}\n\t\t}\n\n\t\tif (key_data_pos == SIZE_MAX) {\n\t\t\treturn roaring::Roaring();\n\t\t}\n\n\t\tchar buffer[64];\n\n\t\t// Read position and length.\n\t\tm_reader->seekg(key_pos + 8 + num_keys * 8 + key_data_pos * 8, std::ios::beg);\n\t\tm_reader->read(buffer, 8);\n\t\tsize_t pos = *((size_t *)(&buffer[0]));\n\n\t\tm_reader->seekg(key_pos + 8 + (num_keys * 8)*2 + key_data_pos * 8, std::ios::beg);\n\t\tm_reader->read(buffer, 8);\n\t\tsize_t len = *((size_t *)(&buffer[0]));\n\n\t\tm_reader->seekg(key_pos + 8 + (num_keys * 8)*3 + pos, std::ios::beg);\n\n\t\tstd::unique_ptr<char[]> data_allocator = std::make_unique<char[]>(len);\n\t\tchar *data = data_allocator.get();\n\n\t\tm_reader->read(data, len);\n\n\t\treturn roaring::Roaring::readSafe(data, len);\n\t}\n\n\ttemplate<typename data_record>\n\tstd::vector<data_record> index<data_record>::find_intersection(const std::vector<uint64_t> &keys) const {\n\n\t\tstd::lock_guard lock(this->m_lock);\n\n\t\tstd::vector<roaring::Roaring> bitmaps;\n\t\tfor (auto key : keys) {\n\t\t\tbitmaps.emplace_back(std::move(find_bitmap(key)));\n\t\t}\n\n\t\tauto intersection = ::algorithm::intersection(bitmaps);\n\t\tstd::vector<data_record> res;\n\t\tfor (auto internal_id : intersection) {\n\t\t\tres.emplace_back(m_records[internal_id]);\n\t\t}\n\n\t\treturn res;\n\t}\n\n\ttemplate<typename data_record>\n\tstd::vector<data_record> index<data_record>::find_top(size_t &total_num_results, const std::vector<uint64_t> &keys, size_t 
num,\n\t\t\tstd::function<float(const data_record &)> score_mod) const {\n\n\t\tstd::lock_guard lock(this->m_lock);\n\n\t\tstd::vector<roaring::Roaring> bitmaps;\n\t\tfor (auto key : keys) {\n\t\t\tbitmaps.emplace_back(std::move(find_bitmap(key)));\n\t\t}\n\n\t\tif (keys.size() == 0) {\n\t\t\t// Return all records...\n\t\t\troaring::Roaring all_ids;\n\t\t\tall_ids.addRange(0, m_records.size());\n\t\t\tbitmaps.push_back(all_ids);\n\t\t}\n\n\t\tauto intersection = ::algorithm::intersection(bitmaps);\n\n\t\ttotal_num_results = intersection.cardinality();\n\n\t\t// Apply score modifications.\n\t\tstd::vector<uint32_t> ids;\n\t\tif (score_mod) {\n\t\t\tfor (auto internal_id : intersection) {\n\t\t\t\tids.push_back(internal_id);\n\t\t\t\tm_scores[internal_id] = m_records[internal_id].m_score + score_mod(m_records[internal_id]);\n\t\t\t}\n\t\t} else {\n\t\t\tfor (auto internal_id : intersection) {\n\t\t\t\tids.push_back(internal_id);\n\t\t\t\tm_scores[internal_id] = m_records[internal_id].m_score;\n\t\t\t}\n\t\t}\n\n\t\tauto ordered = [this](const uint32_t &a, const uint32_t &b) {\n\t\t\treturn m_scores[a] < m_scores[b];\n\t\t};\n\n\t\tstd::vector<uint32_t> top_ids = ::algorithm::top_k<uint32_t>(ids, num, ordered);\n\n\t\tstd::vector<data_record> ret;\n\t\tfor (uint32_t internal_id : top_ids) {\n\t\t\tret.push_back(m_records[internal_id]);\n\t\t\tret.back().m_score = m_scores[internal_id];\n\t\t}\n\n\t\tstd::sort(ret.begin(), ret.end(), typename data_record::truncate_order());\n\n\t\treturn ret;\n\t}\n\n\ttemplate<typename data_record>\n\tstd::vector<data_record> index<data_record>::find_top(const std::vector<uint64_t> &keys, size_t num,\n\t\t\tstd::function<float(const data_record &)> score_mod) const {\n\t\tsize_t total_num_results;\n\t\treturn find_top(total_num_results, keys, num, score_mod);\n\t}\n\n\ttemplate<typename data_record>\n\tfloat index<data_record>::get_idf(size_t documents_with_term) const {\n\t\tif (documents_with_term) {\n\t\t\tconst size_t 
documents_in_corpus = m_unique_count;\n\t\t\tfloat idf = std::log((float)documents_in_corpus / documents_with_term);\n\t\t\treturn idf;\n\t\t}\n\n\t\treturn 0.0f;\n\t}\n\n\t/*\n\t * Reads the exact position of the key, returns SIZE_MAX if the key was not found.\n\t * */\n\ttemplate<typename data_record>\n\tsize_t index<data_record>::read_key_pos(uint64_t key) const {\n\n\t\tif (this->m_hash_table_size == 0) return 0;\n\n\t\tconst size_t hash_pos = key % this->m_hash_table_size;\n\n\t\tm_reader->seekg(hash_pos * sizeof(size_t), std::ios::beg);\n\n\t\tsize_t pos;\n\t\tm_reader->read((char *)&pos, sizeof(size_t));\n\n\t\treturn pos;\n\t}\n\n\t/*\n\t * Reads the count of unique recprds from the count file and puts it in the m_unique_count member.\n\t * */\n\ttemplate<typename data_record>\n\tvoid index<data_record>::read_meta() {\n\t\tstruct meta {\n\t\t\tsize_t unique_count;\n\t\t};\n\n\t\tmeta m;\n\n\t\tstd::ifstream meta_reader(meta_filename(), std::ios::binary);\n\n\t\tif (meta_reader.is_open()) {\n\t\t\tmeta_reader.read((char *)(&m), sizeof(meta));\n\t\t}\n\n\t\tm_unique_count = m.unique_count;\n\t}\n\n\ttemplate<typename data_record>\n\tstd::string index<data_record>::mountpoint() const {\n\t\treturn std::to_string(m_id % 8);\n\t}\n\n\ttemplate<typename data_record>\n\tstd::string index<data_record>::filename() const {\n\t\tif (m_file_name != \"\") return m_file_name + \".data\";\n\t\treturn config::data_path() + \"/\" + mountpoint() + \"/full_text/\" + m_db_name + \"/\" + std::to_string(m_id) +\n\t\t\t\".data\";\n\t}\n\n\ttemplate<typename data_record>\n\tstd::string index<data_record>::meta_filename() const {\n\t\tif (m_file_name != \"\") return m_file_name + \".meta\";\n\t\treturn config::data_path() + \"/\" + mountpoint() + \"/full_text/\" + m_db_name + \"/\" + std::to_string(m_id) +\n\t\t\t\".meta\";\n\t}\n\n\ttemplate<typename data_record>\n\tvoid index<data_record>::print_stats() {\n\n\t\tsize_t total_num_keys = 0;\n\t\tsize_t total_num_larger_100 = 
0;\n\t\tsize_t total_num_larger_10 = 0;\n\t\tsize_t total_num_records = 0;\n\t\tsize_t total_roaring_size = 0;\n\t\tsize_t total_record_size = 0;\n\t\tsize_t total_file_size = 0;\n\t\tsize_t total_cardinality = 0;\n\t\tsize_t total_page_header_size = 0;\n\n\t\tm_reader->seekg(this->hash_table_byte_size(), std::ios::beg);\n\t\tm_reader->read((char *)&total_num_records, sizeof(size_t));\n\n\t\ttotal_record_size = total_num_records * sizeof(data_record);\n\n\t\tfor (size_t page = 0; page < this->m_hash_table_size; page++) {\n\t\t\tsize_t key_pos = read_key_pos(page);\n\n\t\t\tif (key_pos == SIZE_MAX) {\n\t\t\t\tcontinue;\n\t\t\t}\n\n\t\t\t// Read page.\n\t\t\tm_reader->seekg(key_pos, std::ios::beg);\n\t\t\tsize_t num_keys;\n\t\t\tm_reader->read((char *)&num_keys, sizeof(size_t));\n\n\t\t\ttotal_num_keys += num_keys;\n\n\t\t\tstd::unique_ptr<uint64_t[]> keys_allocator = std::make_unique<uint64_t[]>(num_keys);\n\t\t\tuint64_t *keys = keys_allocator.get();\n\t\t\tm_reader->read((char *)keys, num_keys * sizeof(uint64_t));\n\t\t\ttotal_page_header_size += num_keys * sizeof(uint64_t) * 3;\n\n\t\t\tfor (size_t i = 0; i < num_keys; i++) {\n\t\t\t\tsize_t key_data_pos = i;\n\t\t\t\t\n\t\t\t\tchar buffer[64];\n\n\t\t\t\t// Read position and length.\n\t\t\t\tm_reader->seekg(key_pos + 8 + num_keys * 8 + key_data_pos * 8, std::ios::beg);\n\t\t\t\tm_reader->read(buffer, 8);\n\t\t\t\tsize_t pos = *((size_t *)(&buffer[0]));\n\n\t\t\t\tm_reader->seekg(key_pos + 8 + (num_keys * 8)*2 + key_data_pos * 8, std::ios::beg);\n\t\t\t\tm_reader->read(buffer, 8);\n\t\t\t\tsize_t len = *((size_t *)(&buffer[0]));\n\n\t\t\t\tm_reader->seekg(key_pos + 8 + (num_keys * 8)*3 + pos, std::ios::beg);\n\n\t\t\t\tstd::unique_ptr<char[]> data_allocator = std::make_unique<char[]>(len);\n\t\t\t\tchar *data = data_allocator.get();\n\n\t\t\t\tm_reader->read(data, len);\n\n\t\t\t\troaring::Roaring rr = roaring::Roaring::readSafe(data, len);\n\n\t\t\t\tconst size_t card = rr.cardinality();\n\t\t\t\tif (card > 100) 
{\n\t\t\t\t\ttotal_num_larger_100++;\n\t\t\t\t}\n\t\t\t\tif (card > 10) {\n\t\t\t\t\ttotal_num_larger_10++;\n\t\t\t\t}\n\t\t\t\ttotal_cardinality += card;\n\t\t\t\ttotal_roaring_size += len;\n\t\t\t}\n\t\t}\n\n\t\tstd::cout << \"total_num_keys: \" << total_num_keys << std::endl;\n\t\tstd::cout << \"total_num_larger_10: \" << total_num_larger_10 << std::endl;\n\t\tstd::cout << \"total_num_larger_100: \" << total_num_larger_100 << std::endl;\n\t\tstd::cout << \"total_num_records: \" << total_num_records << std::endl;\n\t\tstd::cout << \"record size: \" << total_record_size << \" (\" << 100*((float)total_record_size / total_file_size) << \"%)\" << std::endl;\n\t\tstd::cout << \"page header size: \" << total_page_header_size << \" (\" << 100*((float)total_page_header_size / total_file_size) << \"%)\" << std::endl;\n\t\tstd::cout << \"roaring size: \" << total_roaring_size << \" (\" << 100*((float)total_roaring_size / total_file_size) << \"%)\" << std::endl;\n\t\tstd::cout << \"mean length for key: \" << total_roaring_size / total_num_keys << std::endl;\n\t\tstd::cout << \"mean cardinality for key: \" << total_cardinality / total_num_keys << std::endl;\n\t}\n\n\ttemplate<typename data_record>\n\tstd::set<uint64_t> index<data_record>::get_keys(size_t with_more_than_records) const {\n\n\t\tstd::set<uint64_t> all_keys;\n\n\t\tfor (size_t page = 0; page < this->m_hash_table_size; page++) {\n\t\t\tsize_t key_pos = read_key_pos(page);\n\n\t\t\tif (key_pos == SIZE_MAX) {\n\t\t\t\tcontinue;\n\t\t\t}\n\n\t\t\t// Read page.\n\t\t\tm_reader->seekg(key_pos, std::ios::beg);\n\t\t\tsize_t num_keys;\n\t\t\tm_reader->read((char *)&num_keys, sizeof(size_t));\n\n\t\t\tstd::unique_ptr<uint64_t[]> keys_allocator = std::make_unique<uint64_t[]>(num_keys);\n\t\t\tuint64_t *keys = keys_allocator.get();\n\t\t\tm_reader->read((char *)keys, num_keys * sizeof(uint64_t));\n\n\t\t\tfor (size_t i = 0; i < num_keys; i++) {\n\t\t\t\tsize_t key_data_pos = i;\n\t\t\t\t\n\t\t\t\tchar 
buffer[64];\n\n\t\t\t\t// Read position and length.\n\t\t\t\tm_reader->seekg(key_pos + 8 + num_keys * 8 + key_data_pos * 8, std::ios::beg);\n\t\t\t\tm_reader->read(buffer, 8);\n\t\t\t\tsize_t pos = *((size_t *)(&buffer[0]));\n\n\t\t\t\tm_reader->seekg(key_pos + 8 + (num_keys * 8)*2 + key_data_pos * 8, std::ios::beg);\n\t\t\t\tm_reader->read(buffer, 8);\n\t\t\t\tsize_t len = *((size_t *)(&buffer[0]));\n\n\t\t\t\tm_reader->seekg(key_pos + 8 + (num_keys * 8)*3 + pos, std::ios::beg);\n\n\t\t\t\tstd::unique_ptr<char[]> data_allocator = std::make_unique<char[]>(len);\n\t\t\t\tchar *data = data_allocator.get();\n\n\t\t\t\tm_reader->read(data, len);\n\n\t\t\t\troaring::Roaring rr = roaring::Roaring::readSafe(data, len);\n\n\t\t\t\tconst size_t card = rr.cardinality();\n\t\t\t\tif (card > with_more_than_records) {\n\t\t\t\t\tall_keys.insert(keys[i]);\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\n\t\treturn all_keys;\n\t}\n\n\ttemplate<typename data_record>\n\tvoid index<data_record>::for_each(std::function<void(uint64_t key, roaring::Roaring &bitmap)> on_each_key) const {\n\n\t\tm_reader->seekg(this->hash_table_byte_size(), std::ios::beg);\n\n\t\tsize_t num_records = 0;\n\t\tm_reader->read((char *)&num_records, sizeof(size_t));\n\t\tm_reader->seekg(num_records * sizeof(data_record), std::ios::cur);\n\n\t\tstd::map<uint64_t, roaring::Roaring> page;\n\t\twhile (this->read_bitmap_page_into(*m_reader, page)) {\n\t\t\tfor (auto &iter : page) {\n\t\t\t\ton_each_key(iter.first, iter.second);\n\t\t\t}\n\t\t\tpage.clear();\n\t\t}\n\t}\n\n\ttemplate<typename data_record>\n\tvoid index<data_record>::read_records() {\n\t\tsize_t num_records = 0;\n\t\tm_reader->seekg(this->hash_table_byte_size());\n\t\tm_reader->read((char *)&num_records, sizeof(uint64_t));\n\t\tm_records.resize(num_records);\n\t\tm_reader->read((char *)m_records.data(), num_records * sizeof(data_record));\n\t\tm_scores.resize(num_records);\n\t\tstd::fill(m_scores.begin(), m_scores.end(), 0.0f);\n\t}\n\n}\n"
  },
  {
    "path": "src/indexer/index_base.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include <vector>\n#include <memory>\n#include \"config.h\"\n#include \"logger/logger.h\"\n#include \"roaring/roaring.hh\"\n\nnamespace indexer {\n\n\ttemplate<typename data_record>\n\tclass index_base {\n\n\t\tpublic:\n\n\t\tindex_base();\n\t\texplicit index_base(size_t hash_table_size);\n\n\t\tvoid set_hash_table_size(size_t size) { m_hash_table_size = size; }\n\n\t\tprotected:\n\n\t\t\tsize_t m_hash_table_size;\n\t\t\tmutable std::recursive_mutex m_lock;\n\n\t\t\tbool read_page_into(std::istream &reader, std::map<uint64_t, std::vector<data_record>> &into) const;\n\t\t\tbool read_bitmap_page_into(std::istream &reader, std::map<uint64_t, roaring::Roaring> &into) const;\n\t\t\tsize_t hash_table_byte_size() const { return 
m_hash_table_size * sizeof(size_t); }\n\t};\n\n\ttemplate<typename data_record>\n\tindex_base<data_record>::index_base()\n\t: m_hash_table_size(config::shard_hash_table_size)\n\t{}\n\n\ttemplate<typename data_record>\n\tindex_base<data_record>::index_base(size_t hash_table_size)\n\t: m_hash_table_size(hash_table_size)\n\t{}\n\n\ttemplate<typename data_record>\n\tbool index_base<data_record>::read_page_into(std::istream &reader, std::map<uint64_t, std::vector<data_record>> &into) const {\n\n\t\tuint64_t num_keys;\n\t\treader.read((char *)&num_keys, sizeof(uint64_t));\n\t\tif (reader.eof()) return false;\n\n\t\tstd::unique_ptr<char[]> vector_buffer_allocator;\n\t\ttry {\n\t\t\tvector_buffer_allocator = std::make_unique<char[]>(num_keys * sizeof(uint64_t));\n\t\t} catch (std::bad_alloc &exception) {\n\t\t\tstd::cout << \"bad_alloc detected: \" << exception.what() << \" file: \" << __FILE__ << \" line: \" << __LINE__ << std::endl;\n\t\t\tstd::cout << \"tried to allocate: \" << num_keys << \" keys\" << std::endl;\n\t\t\treturn false;\n\t\t}\n\n\t\tchar *vector_buffer = vector_buffer_allocator.get();\n\n\t\t// Read the keys.\n\t\treader.read(vector_buffer, num_keys * sizeof(uint64_t));\n\t\tstd::vector<uint64_t> keys;\n\t\tfor (size_t i = 0; i < num_keys; i++) {\n\t\t\tkeys.push_back(*((uint64_t *)(&vector_buffer[i*8])));\n\t\t}\n\n\t\t// Read the positions.\n\t\treader.read(vector_buffer, num_keys * 8);\n\t\tstd::vector<size_t> positions;\n\t\tfor (size_t i = 0; i < num_keys; i++) {\n\t\t\tpositions.push_back(*((size_t *)(&vector_buffer[i*8])));\n\t\t}\n\n\t\t// Read the lengths.\n\t\treader.read(vector_buffer, num_keys * 8);\n\t\tstd::vector<size_t> lens;\n\t\tsize_t max_len = 0;\n\t\tsize_t data_size = 0;\n\t\tfor (size_t i = 0; i < num_keys; i++) {\n\t\t\tsize_t len = *((size_t *)(&vector_buffer[i*8]));\n\t\t\tif (len > max_len) max_len = len;\n\t\t\tlens.push_back(len);\n\t\t\tdata_size += len;\n\t\t}\n\n\t\tif (data_size == 0) return 
true;\n\n\t\tstd::unique_ptr<char[]> buffer_allocator;\n\t\ttry {\n\t\t\tbuffer_allocator = std::make_unique<char[]>(max_len);\n\t\t} catch (std::bad_alloc &exception) {\n\t\t\tstd::cout << \"bad_alloc detected: \" << exception.what() << \" file: \" << __FILE__ << \" line: \" << __LINE__ << std::endl;\n\t\t\tstd::cout << \"tried to allocate: \" << max_len << \" bytes\" << std::endl;\n\t\t\treturn false;\n\t\t}\n\t\tchar *buffer = buffer_allocator.get();\n\n\t\t// Read the records.\n\t\tfor (size_t i = 0; i < num_keys; i++) {\n\t\t\tconst size_t len = lens[i];\n\t\t\treader.read(buffer, len);\n\t\t\tconst size_t read_len = reader.gcount();\n\t\t\tif (read_len != len) {\n\t\t\t\tLOG_INFO(\"Data stopped before end. Ignoring shard\");\n\t\t\t\treturn false;\n\t\t\t}\n\n\t\t\tconst data_record *records = (data_record *)buffer;\n\t\t\tconst size_t num_records = len / sizeof(data_record);\n\n\t\t\tfor (size_t j = 0; j < num_records; j++) {\n\t\t\t\tinto[keys[i]].push_back(records[j]);\n\t\t\t}\n\t\t}\n\n\t\treturn true;\n\t}\n\n\ttemplate<typename data_record>\n\tbool index_base<data_record>::read_bitmap_page_into(std::istream &reader, std::map<uint64_t, roaring::Roaring> &into) const {\n\n\t\tuint64_t num_keys;\n\t\treader.read((char *)&num_keys, sizeof(uint64_t));\n\t\tif (reader.eof()) return false;\n\n\t\tstd::unique_ptr<char[]> vector_buffer_allocator;\n\t\ttry {\n\t\t\tvector_buffer_allocator = std::make_unique<char[]>(num_keys * sizeof(uint64_t));\n\t\t} catch (std::bad_alloc &exception) {\n\t\t\tstd::cout << \"bad_alloc detected: \" << exception.what() << \" file: \" << __FILE__ << \" line: \" << __LINE__ << std::endl;\n\t\t\tstd::cout << \"tried to allocate: \" << num_keys << \" keys\" << std::endl;\n\t\t\treturn false;\n\t\t}\n\n\t\tchar *vector_buffer = vector_buffer_allocator.get();\n\n\t\t// Read the keys.\n\t\treader.read(vector_buffer, num_keys * sizeof(uint64_t));\n\t\tstd::vector<uint64_t> keys;\n\t\tfor (size_t i = 0; i < num_keys; i++) 
{\n\t\t\tkeys.push_back(*((uint64_t *)(&vector_buffer[i*8])));\n\t\t}\n\n\t\t// Read the positions.\n\t\treader.read(vector_buffer, num_keys * 8);\n\t\tstd::vector<size_t> positions;\n\t\tfor (size_t i = 0; i < num_keys; i++) {\n\t\t\tpositions.push_back(*((size_t *)(&vector_buffer[i*8])));\n\t\t}\n\n\t\t// Read the lengths.\n\t\treader.read(vector_buffer, num_keys * 8);\n\t\tstd::vector<size_t> lens;\n\t\tsize_t max_len = 0;\n\t\tsize_t data_size = 0;\n\t\tfor (size_t i = 0; i < num_keys; i++) {\n\t\t\tsize_t len = *((size_t *)(&vector_buffer[i*8]));\n\t\t\tif (len > max_len) max_len = len;\n\t\t\tlens.push_back(len);\n\t\t\tdata_size += len;\n\t\t}\n\n\t\tif (data_size == 0) return true;\n\n\t\tstd::unique_ptr<char[]> buffer_allocator;\n\t\ttry {\n\t\t\tbuffer_allocator = std::make_unique<char[]>(max_len);\n\t\t} catch (std::bad_alloc &exception) {\n\t\t\tstd::cout << \"bad_alloc detected: \" << exception.what() << \" file: \" << __FILE__ << \" line: \" << __LINE__ << std::endl;\n\t\t\tstd::cout << \"tried to allocate: \" << max_len << \" bytes\" << std::endl;\n\t\t\tthrow exception;\n\t\t}\n\t\tchar *buffer = buffer_allocator.get();\n\n\t\t// Read the bitmap data.\n\t\tfor (size_t i = 0; i < num_keys; i++) {\n\t\t\tconst size_t len = lens[i];\n\t\t\treader.read(buffer, len);\n\t\t\tconst size_t read_len = reader.gcount();\n\t\t\tif (read_len != len) {\n\t\t\t\tLOG_INFO(\"Data stopped before end. Ignoring shard \");\n\t\t\t\tthrow std::runtime_error(\"Data stopped before end. File is corrupt.\");\n\t\t\t}\n\n\t\t\tinto[keys[i]] = roaring::Roaring::readSafe(buffer, len);\n\t\t}\n\n\t\treturn true;\n\t}\n\n}\n"
  },
  {
    "path": "src/indexer/index_builder.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include <iostream>\n#include <vector>\n#include <map>\n#include <set>\n#include <unordered_set>\n#include <cstring>\n#include <cassert>\n#include <numeric>\n#include <boost/filesystem.hpp>\n#include <sstream>\n#include \"merger.h\"\n#include \"score_builder.h\"\n#include \"index_utils.h\"\n#include \"index_base.h\"\n#include \"index.h\"\n#include \"algorithm/hyper_log_log.h\"\n#include \"config.h\"\n#include \"profiler/profiler.h\"\n#include \"logger/logger.h\"\n#include \"file/file.h\"\n#include \"memory/debugger.h\"\n#include \"roaring/roaring.hh\"\n#include \"URL.h\"\n\nnamespace indexer {\n\n\t/*\n\t\t<hash-table-data> uint8_t[hash_table_size]\n\t\t<num-records> uint64_t\n\t\t<records> data_record[num-records] sequence of records, 
the position of the record is the internal_id\n\t\t<page-data> page[num_pages]\n\n\t\tpage format:\n\t\t<num_keys> uint64_t\n\t\t<key-data> uint64_t[num_keys] sorted by key for binary search\n\t\t<pos-data> uint64_t[num_keys] position of record data start\n\t\t<len-data> uint64_t[num_keys] length of record data\n\t\t<record-data> <bitmap>[num_keys] bitmap is a roaring bitmap (CRoaring)\n\t*/\n\n\tenum class algorithm { bm25 = 101, tf_idf = 102};\n\n\ttemplate<typename data_record>\n\tclass index_builder : public index_base<data_record> {\n\tprivate:\n\t\t// Non copyable\n\t\tindex_builder(const index_builder &);\n\t\tindex_builder& operator=(const index_builder &);\n\tpublic:\n\n\t\texplicit index_builder(const std::string &file_name);\n\t\texplicit index_builder(size_t hash_table_size, const std::string &file_name);\n\t\texplicit index_builder(const std::string &db_name, size_t id);\n\t\texplicit index_builder(const std::string &db_name, size_t id, size_t hash_table_size);\n\t\texplicit index_builder(const std::string &db_name, size_t id, size_t hash_table_size, size_t max_results);\n\t\texplicit index_builder(const std::string &db_name, size_t id, std::function<uint32_t(const data_record &)> &rec_to_id);\n\t\t~index_builder();\n\n\t\tvoid add(uint64_t key, const data_record &record);\n\t\tsize_t cache_size() const;\n\t\tvoid transform(const std::function<uint32_t(uint32_t)> &transform);\n\t\t\n\t\tvoid append();\n\t\tvoid merge();\n\t\tvoid merge(std::unordered_map<uint64_t, uint32_t> &internal_id_map);\n\t\tvoid merge_with(const index<data_record> &other);\n\t\tvoid optimize();\n\n\t\tvoid truncate();\n\t\tvoid truncate_cache_files();\n\t\tvoid create_directories();\n\n\t\t/*void calculate_scores(algorithm algo, const score_builder &score);\n\n\t\tvoid calculate_scores_for_token(algorithm algo, const score_builder &score, uint64_t token,\n\t\t\tstd::vector<data_record> &records);\n\t\tfloat calculate_score_for_record(algorithm algo, const score_builder &score, 
uint64_t token,\n\t\t\tconst data_record &record);*/\n\n\t\tsize_t get_max_id();\n\n\t\tstatic void create_directories(const std::string &db_name);\n\n\tprivate:\n\n\t\tstd::string m_file_name;\n\t\tstd::string m_db_name;\n\t\tconst size_t m_id;\n\n\t\tconst size_t m_max_results;\n\n\t\tstd::mutex m_lock;\n\n\t\t// Caches\n\t\tstd::vector<uint64_t> m_key_cache;\n\t\tstd::vector<data_record> m_record_cache;\n\t\t\n\n\t\tstd::vector<data_record> m_records;\n\t\tstd::map<uint64_t, uint32_t> m_record_id_map;\n\t\tstd::map<uint64_t, roaring::Roaring> m_bitmaps;\n\n\t\tstd::function<uint32_t(const data_record &)> m_record_id_to_internal_id = [this](const data_record &record) {\n\t\t\tif (m_record_id_map.count(record.m_value) == 0) {\n\t\t\t\tm_record_id_map[record.m_value] = m_records.size();\n\t\t\t\tm_records.push_back(record);\n\t\t\t}\n\t\t\treturn m_record_id_map[record.m_value];\n\t\t};\n\n\t\tvoid read_append_cache();\n\t\tvoid read_append_cache(std::unordered_map<uint64_t, uint32_t> &internal_id_map);\n\t\tvoid read_data_to_cache();\n\t\tbool read_page(std::ifstream &reader);\n\t\tvoid reset_cache_variables();\n\t\tvoid save_file();\n\t\tvoid write_key(std::ostream &key_writer, uint64_t key, size_t page_pos);\n\t\tsize_t write_page(std::ostream &writer, const std::vector<uint64_t> &keys);\n\t\tvoid reset_key_map(std::ostream &key_writer);\n\t\tstd::vector<data_record> read_records() const;\n\t\tvoid write_records(std::ostream &writer);\n\t\tuint32_t default_record_to_internal_id(const data_record &record);\n\n\t\tstd::string mountpoint() const;\n\t\tstd::string cache_filename() const;\n\t\tstd::string key_cache_filename() const;\n\t\tstd::string target_filename() const;\n\t\tstd::string meta_filename() const;\n\n\t\tbool needs_optimization() const;\n\t\tvoid sort_records();\n\t\tvoid sort_records_and_bitmaps(std::vector<data_record> &records, std::map<uint64_t, roaring::Roaring> &bitmaps);\n\n\t};\n\n\ttemplate<typename 
data_record>\n\tindex_builder<data_record>::index_builder(const std::string &file_name)\n\t: index_base<data_record>(), m_file_name(file_name), m_id(0),\n\t\tm_max_results(config::ft_max_results_per_section)\n\t{\n\t\tmerger::register_merger((size_t)this, [this]() {merge();});\n\t\tmerger::register_appender((size_t)this, [this]() {append();}, [this]() { return cache_size(); });\n\t}\n\n\ttemplate<typename data_record>\n\tindex_builder<data_record>::index_builder(size_t hash_table_size, const std::string &file_name)\n\t: index_base<data_record>(hash_table_size), m_file_name(file_name), m_id(0),\n\t\tm_max_results(config::ft_max_results_per_section)\n\t{\n\t\tmerger::register_merger((size_t)this, [this]() {merge();});\n\t\tmerger::register_appender((size_t)this, [this]() {append();}, [this]() { return cache_size(); });\n\t}\n\n\ttemplate<typename data_record>\n\tindex_builder<data_record>::index_builder(const std::string &db_name, size_t id)\n\t: index_base<data_record>(), m_db_name(db_name), m_id(id), m_max_results(config::ft_max_results_per_section) {\n\t\tmerger::register_merger((size_t)this, [this]() {merge();});\n\t\tmerger::register_appender((size_t)this, [this]() {append();}, [this]() { return cache_size(); });\n\t}\n\n\ttemplate<typename data_record>\n\tindex_builder<data_record>::index_builder(const std::string &db_name, size_t id, size_t hash_table_size)\n\t: index_base<data_record>(hash_table_size), m_db_name(db_name), m_id(id), m_max_results(config::ft_max_results_per_section) {\n\t\tmerger::register_merger((size_t)this, [this]() {merge();});\n\t\tmerger::register_appender((size_t)this, [this]() {append();}, [this]() { return cache_size(); });\n\t}\n\n\ttemplate<typename data_record>\n\tindex_builder<data_record>::index_builder(const std::string &db_name, size_t id, size_t hash_table_size, size_t max_results)\n\t: index_base<data_record>(hash_table_size), m_db_name(db_name), m_id(id), m_max_results(max_results) {\n\t\tmerger::register_merger((size_t)this, 
[this]() {merge();});\n\t\tmerger::register_appender((size_t)this, [this]() {append();}, [this]() { return cache_size(); });\n\t}\n\n\ttemplate<typename data_record>\n\tindex_builder<data_record>::index_builder(const std::string &db_name, size_t id,\n\t\tstd::function<uint32_t(const data_record &)> &rec_to_id)\n\t: index_base<data_record>(), m_db_name(db_name), m_id(id), m_max_results(config::ft_max_results_per_section) {\n\t\tm_record_id_to_internal_id = rec_to_id;\n\t\tmerger::register_merger((size_t)this, [this]() {merge();});\n\t\tmerger::register_appender((size_t)this, [this]() {append();}, [this]() { return cache_size(); });\n\t}\n\n\ttemplate<typename data_record>\n\tindex_builder<data_record>::~index_builder() {\n\t\tmerger::deregister_merger((size_t)this);\n\t}\n\n\ttemplate<typename data_record>\n\tvoid index_builder<data_record>::add(uint64_t key, const data_record &record) {\n\t\tindexer::merger::lock();\n\n\t\tstd::lock_guard guard(m_lock);\n\n\t\t// Amortized constant\n\t\tm_key_cache.push_back(key);\n\t\tm_record_cache.push_back(record);\n\n\t}\n\n\t/*\n\t * Returns the allocated size of the cache (m_key_cache and m_record_cache).\n\t * */\n\ttemplate<typename data_record>\n\tsize_t index_builder<data_record>::cache_size() const {\n\t\treturn m_key_cache.capacity() * sizeof(uint64_t) + m_record_cache.capacity() * sizeof(data_record);\n\t}\n\n\t/*\n\t\tTransforms all the bitmaps in the index. 
Basically generating new bitmaps with the transform applied.\n\t*/\n\ttemplate<typename data_record>\n\tvoid index_builder<data_record>::transform(const std::function<uint32_t(uint32_t)> &transform) {\n\t\tread_data_to_cache();\n\n\t\t// Apply transforms.\n\t\tfor (auto &iter : m_bitmaps) {\n\n\t\t\t::roaring::Roaring rr;\n\t\t\tfor (uint32_t v : iter.second) {\n\t\t\t\tconst uint32_t v_trans = transform(v);\n\t\t\t\trr.add(v_trans);\n\t\t\t}\n\t\t\tm_bitmaps[iter.first] = rr;\n\t\t}\n\n\t\tsave_file();\n\t\ttruncate_cache_files();\n\t}\n\n\ttemplate<typename data_record>\n\tvoid index_builder<data_record>::append() {\n\n\t\tassert(m_record_cache.size() == m_key_cache.size());\n\n\t\tstd::ofstream record_writer(cache_filename(), std::ios::binary | std::ios::app);\n\t\tif (!record_writer.is_open()) {\n\t\t\tthrow LOG_ERROR_EXCEPTION(\"Could not open full text shard (\" + cache_filename() + \"). Error: \" +\n\t\t\t\tstd::string(strerror(errno)));\n\t\t}\n\n\t\tstd::ofstream key_writer(key_cache_filename(), std::ios::binary | std::ios::app);\n\t\tif (!key_writer.is_open()) {\n\t\t\tthrow LOG_ERROR_EXCEPTION(\"Could not open full text shard (\" + key_cache_filename() + \"). 
Error: \" +\n\t\t\t\tstd::string(strerror(errno)));\n\t\t}\n\n\t\trecord_writer.write((const char *)m_record_cache.data(), m_record_cache.size() * sizeof(data_record));\n\t\tkey_writer.write((const char *)m_key_cache.data(), m_key_cache.size() * sizeof(uint64_t));\n\n\t\tm_record_cache.clear();\n\t\tm_key_cache.clear();\n\t\tm_record_cache.shrink_to_fit();\n\t\tm_key_cache.shrink_to_fit();\n\t}\n\n\ttemplate<typename data_record>\n\tvoid index_builder<data_record>::merge() {\n\t\tstd::unordered_map<uint64_t, uint32_t> internal_id_map;\n\t\tmerge(internal_id_map);\n\t}\n\n\ttemplate<typename data_record>\n\tvoid index_builder<data_record>::merge(std::unordered_map<uint64_t, uint32_t> &internal_id_map) {\n\n\t\t{\n\t\t\tread_append_cache(internal_id_map);\n\t\t\tsave_file();\n\t\t\ttruncate_cache_files();\n\t\t}\n\n\t}\n\n\ttemplate<typename data_record>\n\tvoid index_builder<data_record>::merge_with(const index<data_record> &other) {\n\t\t/*\n\t\t * The only algorithm I can come up with is to append the records from 'other' that are not present in 'this'.\n\t\t * And also create a map from ids in 'other' to ids in the new record array.\n\t\t * Then transform the bitmaps in other before merging them.\n\t\t * */\n\n\t\tconst auto &other_records = other.records();\n\n\t\ttypename data_record::storage_order ordered;\n\n\t\tif (!std::is_sorted(other_records.cbegin(), other_records.cend(), ordered))\n\t\t\tthrow std::runtime_error(\"index_builder::merge_with needs optimized input\");\n\n\t\tread_data_to_cache();\n\n\t\tif (!std::is_sorted(m_records.cbegin(), m_records.cend(), ordered))\n\t\t\tthrow std::runtime_error(\"index_builder::merge_with needs to run on optimized index\");\n\n\t\tstd::map<uint32_t, uint32_t> id_map;\n\t\tstd::vector<data_record> new_records;\n\n\t\tsize_t i = 0, j = 0;\n\t\twhile (i < m_records.size() && j < other_records.size()) {\n\t\t\tif (ordered(m_records[i], other_records[j])) {\n\t\t\t\ti++;\n\t\t\t} else if 
(m_records[i].storage_equal(other_records[j])) {\n\t\t\t\tid_map[j] = i;\n\t\t\t\ti++;\n\t\t\t\tj++;\n\t\t\t} else {\n\t\t\t\tid_map[j] = m_records.size() + new_records.size();\n\t\t\t\tnew_records.push_back(other_records[j]);\n\t\t\t\tj++;\n\t\t\t}\n\t\t}\n\t\twhile (j < other_records.size()) {\n\t\t\tid_map[j] = m_records.size() + new_records.size();\n\t\t\tnew_records.push_back(other_records[j]);\n\t\t\tj++;\n\t\t}\n\n\t\tm_records.insert(m_records.end(), new_records.cbegin(), new_records.cend());\n\n\t\tother.for_each([this, &id_map](uint64_t key, roaring::Roaring &bitmap) {\n\t\t\troaring::Roaring new_bitmap;\n\t\t\tfor (auto idx : bitmap) {\n\t\t\t\tnew_bitmap.add(id_map[idx]);\n\t\t\t}\n\t\t\t// Union the bitmaps.\n\t\t\tm_bitmaps[key] |= new_bitmap;\n\t\t});\n\n\t\tsort_records_and_bitmaps(m_records, m_bitmaps);\n\n\t\tsave_file();\n\t\ttruncate_cache_files();\n\n\t}\n\n\ttemplate<typename data_record>\n\tvoid index_builder<data_record>::optimize() {\n\t\tif (needs_optimization()) {\n\t\t\tsort_records();\n\t\t}\n\t}\n\n\t/*\n\t\tDeletes ALL data from this shard.\n\t*/\n\ttemplate<typename data_record>\n\tvoid index_builder<data_record>::truncate() {\n\t\tcreate_directories();\n\t\ttruncate_cache_files();\n\n\t\tstd::ofstream target_writer(target_filename(), std::ios::trunc);\n\t\ttarget_writer.close();\n\t}\n\n\t/*\n\t\tDeletes all data from caches.\n\t*/\n\ttemplate<typename data_record>\n\tvoid index_builder<data_record>::truncate_cache_files() {\n\n\t\treset_cache_variables();\n\n\t\tfile::delete_file(cache_filename());\n\t\tfile::delete_file(key_cache_filename());\n\n\t}\n\n\ttemplate<typename data_record>\n\tvoid index_builder<data_record>::create_directories() {\n\t\tcreate_db_directories(m_db_name);\n\t}\n\n\ttemplate<typename data_record>\n\tsize_t index_builder<data_record>::get_max_id() {\n\n\t\tread_data_to_cache();\n\n\t\tuint32_t max_internal_id = 0;\n\t\tfor (const auto &iter : m_bitmaps) {\n\t\t\tuint32_t internal_id = 
iter.second.maximum();\n\t\t\tif (internal_id > max_internal_id) {\n\t\t\t\tmax_internal_id = internal_id;\n\t\t\t}\n\t\t}\n\n\t\treturn (size_t)max_internal_id;\n\t}\n\n\ttemplate<typename data_record>\n\tvoid index_builder<data_record>::create_directories(const std::string &db_name) {\n\t\tfor (size_t i = 0; i < 8; i++) {\n\t\t\tfile::create_directory(config::data_path() + \"/\" + std::to_string(i) + \"/full_text/\" + db_name);\n\t\t}\n\t}\n\n\ttemplate<typename data_record>\n\tvoid index_builder<data_record>::read_append_cache() {\n\t\tstd::unordered_map<uint64_t, uint32_t> internal_id_map;\n\t\tread_append_cache(internal_id_map);\n\t}\n\n\ttemplate<typename data_record>\n\tvoid index_builder<data_record>::read_append_cache(std::unordered_map<uint64_t, uint32_t> &internal_id_map) {\n\n\t\t// Read the current file.\n\t\tread_data_to_cache();\n\n\t\t//profiler::instance prof(\"index_builder::read_append_cache\");\n\n\t\t// Read the cache into memory.\n\t\tstd::ifstream reader(cache_filename(), std::ios::binary);\n\t\tif (!reader.is_open()) {\n\t\t\tthrow LOG_ERROR_EXCEPTION(\"Could not open full text shard (\" + cache_filename() + \"). Error: \" + std::string(strerror(errno)));\n\t\t}\n\n\t\tstd::ifstream key_reader(key_cache_filename(), std::ios::binary);\n\t\tif (!key_reader.is_open()) {\n\t\t\tthrow LOG_ERROR_EXCEPTION(\"Could not open full text shard (\" + key_cache_filename() + \"). 
Error: \" + std::string(strerror(errno)));\n\t\t}\n\n\t\tconst size_t buffer_len = 10000;\n\n\t\tstd::unique_ptr<data_record[]> buffer_allocator;\n\t\ttry {\n\t\t\tbuffer_allocator = std::make_unique<data_record[]>(buffer_len);\n\t\t} catch (std::bad_alloc &exception) {\n\t\t\tstd::cout << \"bad_alloc detected: \" << exception.what() << \" file: \" << __FILE__ << \" line: \" << __LINE__ << std::endl;\n\t\t\tstd::cout << \"tried to allocate: \" << buffer_len * sizeof(data_record) << \" bytes\" << std::endl;\n\t\t\treturn;\n\t\t}\n\n\t\tstd::unique_ptr<uint64_t[]> key_buffer_allocator;\n\t\ttry {\n\t\t\tkey_buffer_allocator = std::make_unique<uint64_t[]>(buffer_len);\n\t\t} catch (std::bad_alloc &exception) {\n\t\t\tstd::cout << \"bad_alloc detected: \" << exception.what() << \" file: \" << __FILE__ << \" line: \" << __LINE__ << std::endl;\n\t\t\tstd::cout << \"tried to allocate: \" << buffer_len * sizeof(uint64_t) << \" bytes\" << std::endl;\n\t\t\treturn;\n\t\t}\n\n\t\tdata_record *buffer = buffer_allocator.get();\n\t\tuint64_t *key_buffer = key_buffer_allocator.get();\n\n\t\treader.seekg(0, std::ios::beg);\n\n\t\tstd::unordered_map<uint64_t, vector<uint32_t>> bitmap_data;\n\n\t\twhile (!reader.eof()) {\n\n\t\t\treader.read((char *)buffer, buffer_len * sizeof(data_record));\n\t\t\tkey_reader.read((char *)key_buffer, buffer_len * sizeof(uint64_t));\n\n\t\t\tconst size_t read_bytes = reader.gcount();\n\t\t\tconst size_t num_records = read_bytes / sizeof(data_record);\n\n\t\t\tfor (size_t i = 0; i < num_records; i++) {\n\t\t\t\tconst auto map_iter = internal_id_map.find(buffer[i].m_value);\n\t\t\t\tif (map_iter == internal_id_map.end()) {\n\t\t\t\t\tconst uint32_t internal_id = m_record_id_to_internal_id(buffer[i]);\n\t\t\t\t\tinternal_id_map[buffer[i].m_value] = internal_id;\n\t\t\t\t\tbitmap_data[key_buffer[i]].push_back(internal_id);\n\t\t\t\t} else {\n\t\t\t\t\tbitmap_data[key_buffer[i]].push_back(map_iter->second);\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\n\t\t// Insert the 
bitmap data.\n\t\tfor (const auto &iter : bitmap_data) {\n\t\t\tm_bitmaps[iter.first].addMany(iter.second.size(), iter.second.data());\n\t\t}\n\t}\n\n\t/*\n\t * Reads the file into RAM.\n\t * */\n\ttemplate<typename data_record>\n\tvoid index_builder<data_record>::read_data_to_cache() {\n\n\t\t//profiler::instance prof(\"index_builder::read_data_to_cache\");\n\n\t\treset_cache_variables();\n\n\t\tstd::ifstream reader(target_filename(), std::ios::binary);\n\t\tif (!reader.is_open()) return;\n\n\t\treader.seekg(0, std::ios::end);\n\t\tconst size_t file_size = reader.tellg();\n\t\tif (file_size <= this->hash_table_byte_size()) return;\n\t\treader.seekg(this->hash_table_byte_size(), std::ios::beg);\n\n\t\tsize_t num_records;\n\t\treader.read((char *)&num_records, sizeof(size_t));\n\n\t\t// Read records.\n\t\tconst size_t record_buffer_len = 10000;\n\t\tstd::unique_ptr<data_record[]> record_buffer_allocator = std::make_unique<data_record[]>(record_buffer_len);\n\t\tdata_record *record_buffer = record_buffer_allocator.get();\n\n\t\tsize_t records_read = 0;\n\t\twhile (records_read < num_records) {\n\t\t\tsize_t records_left = num_records - records_read;\n\t\t\tsize_t records_to_read = min(records_left, record_buffer_len);\n\t\t\treader.read((char *)record_buffer, sizeof(data_record) * records_to_read);\n\n\t\t\tfor (size_t i = 0; i < records_to_read; i++) {\n\t\t\t\tm_record_id_map[record_buffer[i].m_value] = m_records.size();\n\t\t\t\tm_records.push_back(record_buffer[i]);\n\t\t\t}\n\n\t\t\trecords_read += records_to_read;\n\t\t}\n\n\t\twhile (this->read_bitmap_page_into(reader, m_bitmaps)) {\n\t\t}\n\t}\n\n\ttemplate<typename data_record>\n\tvoid index_builder<data_record>::reset_cache_variables() {\n\t\tm_records = std::vector<data_record>{};\n\t\tm_record_id_map = std::map<uint64_t, uint32_t>{};\n\t\tm_bitmaps = std::map<uint64_t, roaring::Roaring>{};\n\t}\n\n\ttemplate<typename data_record>\n\tvoid index_builder<data_record>::save_file() 
{\n\n\t\t//profiler::instance prof(\"index_builder::save_file\");\n\n\t\tstd::ostringstream writer;\n\n\t\treset_key_map(writer);\n\t\twrite_records(writer);\n\n\t\tstd::map<uint64_t, std::vector<uint64_t>> pages;\n\t\tfor (auto &iter : m_bitmaps) {\n\t\t\tif (this->m_hash_table_size) {\n\t\t\t\tpages[iter.first % this->m_hash_table_size].push_back(iter.first);\n\t\t\t} else {\n\t\t\t\tpages[0].push_back(iter.first);\n\t\t\t}\n\t\t}\n\n\t\tfor (const auto &iter : pages) {\n\t\t\tsize_t page_pos = write_page(writer, iter.second);\n\t\t\twrite_key(writer, iter.first, page_pos);\n\t\t\twriter.flush();\n\t\t}\n\n\t\tstd::ofstream file_writer(target_filename(), std::ios::binary | std::ios::trunc);\n\t\tif (!file_writer.is_open()) {\n\t\t\tthrow LOG_ERROR_EXCEPTION(\"Could not open full text shard. Error: \" + std::string(strerror(errno)));\n\t\t}\n\n\t\tfile_writer.write(writer.str().c_str(), writer.str().size());\n\t}\n\n\ttemplate<typename data_record>\n\tvoid index_builder<data_record>::write_key(std::ostream &key_writer, uint64_t key, size_t page_pos) {\n\t\tif (this->m_hash_table_size > 0) {\n\t\t\tassert(key < this->m_hash_table_size);\n\t\t\tkey_writer.seekp(key * sizeof(uint64_t));\n\t\t\tkey_writer.write((char *)&page_pos, sizeof(size_t));\n\t\t}\n\t}\n\n\t/*\n\t * Writes the page with keys, appending it to the file stream writer.\n\t * */\n\ttemplate<typename data_record>\n\tsize_t index_builder<data_record>::write_page(std::ostream &writer, const std::vector<uint64_t> &keys) {\n\n\t\twriter.seekp(0, ios::end);\n\n\t\tconst size_t page_pos = writer.tellp();\n\n\t\tsize_t num_keys = keys.size();\n\n\t\twriter.write((char *)&num_keys, 8);\n\t\twriter.write((char *)keys.data(), keys.size() * 8);\n\n\t\tstd::vector<size_t> v_pos;\n\t\tstd::vector<size_t> v_len;\n\n\t\tsize_t max_len = 0;\n\t\tsize_t pos = 0;\n\t\tfor (uint64_t key : keys) {\n\n\t\t\tm_bitmaps[key].runOptimize();\n\t\t\tm_bitmaps[key].shrinkToFit();\n\n\t\t\t// Store position and 
length\n\t\t\tconst size_t len = m_bitmaps[key].getSizeInBytes();\n\n\t\t\tif (len > max_len) max_len = len;\n\t\t\t\n\t\t\tv_pos.push_back(pos);\n\t\t\tv_len.push_back(len);\n\n\t\t\tpos += len;\n\t\t}\n\t\t\n\t\twriter.write((char *)v_pos.data(), keys.size() * 8);\n\t\twriter.write((char *)v_len.data(), keys.size() * 8);\n\n\t\tstd::unique_ptr<char[]> buffer_allocator = make_unique<char[]>(max_len);\n\t\tchar *buffer = buffer_allocator.get();\n\n\t\t// Write data.\n\t\tfor (uint64_t key : keys) {\n\t\t\tconst size_t len = m_bitmaps[key].getSizeInBytes();\n\t\t\tm_bitmaps[key].write(buffer);\n\t\t\twriter.write(buffer, len);\n\t\t}\n\n\t\treturn page_pos;\n\t}\n\n\ttemplate<typename data_record>\n\tvoid index_builder<data_record>::reset_key_map(std::ostream &key_writer) {\n\t\tkey_writer.seekp(0);\n\t\tuint64_t data = SIZE_MAX;\n\t\tfor (size_t i = 0; i < this->m_hash_table_size; i++) {\n\t\t\tkey_writer.write((char *)&data, sizeof(uint64_t));\n\t\t}\n\t}\n\n\ttemplate<typename data_record>\n\tstd::vector<data_record> index_builder<data_record>::read_records() const {\n\t\tifstream reader(target_filename(), std::ios::in);\n\t\treader.seekg(this->hash_table_byte_size(), std::ios::beg);\n\n\t\tconst size_t num_records = m_records.size();\n\t\treader.read((char *)&num_records, sizeof(uint64_t));\n\n\t\tstd::vector<data_record> records(num_records);\n\t\treader.read((char *)records.data(), num_records * sizeof(data_record));\n\n\t\treturn records;\n\t}\n\n\ttemplate<typename data_record>\n\tvoid index_builder<data_record>::write_records(std::ostream &writer) {\n\t\tconst size_t num_records = m_records.size();\n\t\twriter.write((char *)&num_records, sizeof(uint64_t));\n\t\twriter.write((char *)m_records.data(), num_records * sizeof(data_record));\n\t}\n\n\ttemplate<typename data_record>\n\tuint32_t index_builder<data_record>::default_record_to_internal_id(const data_record &record) {\n\t\tif (m_record_id_map.count(record.m_value) == 0) 
{\n\t\t\tm_record_id_map[record.m_value] = m_records.size();\n\t\t\tm_records.push_back(record);\n\t\t}\n\t\treturn m_record_id_map[record.m_value];\n\t}\n\n\ttemplate<typename data_record>\n\tstd::string index_builder<data_record>::mountpoint() const {\n\t\treturn std::to_string(m_id % 8);\n\t}\n\n\ttemplate<typename data_record>\n\tstd::string index_builder<data_record>::cache_filename() const {\n\t\tif (m_file_name != \"\") return m_file_name + \".cache\";\n\t\treturn config::data_path() + \"/\" + mountpoint() + \"/full_text/\" + m_db_name + \"/\" + std::to_string(m_id) +\n\t\t\t\".cache\";\n\t}\n\n\ttemplate<typename data_record>\n\tstd::string index_builder<data_record>::key_cache_filename() const {\n\t\tif (m_file_name != \"\") return m_file_name + \".cache.keys\";\n\t\treturn config::data_path() + \"/\" + mountpoint() + \"/full_text/\" + m_db_name + \"/\" + std::to_string(m_id) +\n\t\t\t\".cache.keys\";\n\t}\n\n\ttemplate<typename data_record>\n\tstd::string index_builder<data_record>::target_filename() const {\n\t\tif (m_file_name != \"\") return m_file_name + \".data\";\n\t\treturn config::data_path() + \"/\" + mountpoint() + \"/full_text/\" + m_db_name + \"/\" + std::to_string(m_id) +\n\t\t\t\".data\";\n\t}\n\n\ttemplate<typename data_record>\n\tbool index_builder<data_record>::needs_optimization() const {\n\n\t\tauto records = read_records();\n\n\t\t// Just check if the records are sorted by storage order.\n\t\tif (records.size() <= 1) return false;\n\t\t\n\t\treturn !std::is_sorted(records.cbegin(), records.cend(), typename data_record::storage_order());\n\t}\n\n\ttemplate<typename data_record>\n\tvoid index_builder<data_record>::sort_records() {\n\n\t\tread_data_to_cache();\n\n\t\tsort_records_and_bitmaps(m_records, m_bitmaps);\n\n\t\tsave_file();\n\t\ttruncate_cache_files();\n\t}\n\n\ttemplate<typename data_record>\n\tvoid index_builder<data_record>::sort_records_and_bitmaps(std::vector<data_record> &records,\n\t\t\tstd::map<uint64_t, 
roaring::Roaring> &bitmaps) {\n\n\t\tstd::vector<uint32_t> permutation(records.size());\n\t\tstd::iota(permutation.begin(), permutation.end(), 0);\n\n\t\ttypename data_record::storage_order ordered;\n\n\t\tstd::sort(permutation.begin(), permutation.end(), [&records, &ordered](const size_t &a, const size_t &b) {\n\t\t\treturn ordered(records[a], records[b]);\n\t\t});\n\t\t// permutation now points from new position -> old position of record.\n\n\t\tstd::vector<uint32_t> inverse(permutation.size());\n\t\tfor (uint32_t i = 0; i < permutation.size(); i++) {\n\t\t\tinverse[permutation[i]] = i;\n\t\t}\n\t\t// inverse now points from old position -> new position of record.\n\n\t\t// Reorder the records.\n\t\tsort(records.begin(), records.end(), ordered);\n\n\t\t// Apply transforms.\n\t\tfor (auto &iter : bitmaps) {\n\n\t\t\t::roaring::Roaring rr;\n\t\t\tfor (uint32_t v : iter.second) {\n\t\t\t\tconst uint32_t v_trans = inverse[v];\n\t\t\t\trr.add(v_trans);\n\t\t\t}\n\t\t\tbitmaps[iter.first] = rr;\n\t\t}\n\t}\n\n}\n"
  },
  {
    "path": "src/indexer/index_manager.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include \"index_manager.h\"\n#include \"merger.h\"\n#include \"domain_stats/domain_stats.h\"\n#include \"url_link/link.h\"\n#include \"algorithm/algorithm.h\"\n#include \"algorithm/sort.h\"\n#include \"utils/thread_pool.hpp\"\n\nusing namespace std;\n\nnamespace indexer {\n\n\tindex_manager::index_manager() {\n\n\t\tm_url_index_builder = std::make_unique<sharded_builder<basic_index_builder, url_record>>(\"url_index\", 4001);\n\t\tm_url_index = std::make_unique<sharded<basic_index, url_record>>(\"url_index\", 4001);\n\n\t\tm_link_index_builder = std::make_unique<sharded_builder<basic_index_builder, link_record>>(\"link_index\", 4001);\n\t\tm_link_index = std::make_unique<sharded<basic_index, link_record>>(\"link_index\", 
4001);\n\n\t\tm_domain_link_index_builder = std::make_unique<sharded_builder<basic_index_builder, domain_link_record>>(\"domain_link_index\", 4001);\n\t\tm_domain_link_index = std::make_unique<sharded<basic_index, domain_link_record>>(\"domain_link_index\", 4001);\n\n\t\tm_hash_table_builder = std::make_unique<hash_table2::builder>(\"index_manager\");\n\t\tm_hash_table = std::make_unique<hash_table2::hash_table>(\"index_manager\");\n\n\t}\n\n\tindex_manager::~index_manager() {\n\t}\n\n\tvoid index_manager::add_index_file(const string &local_path) {\n\n\t\tconst vector<size_t> cols = {1, 2, 3, 4};\n\t\tconst vector<float> scores = {10.0, 3.0, 2.0, 1};\n\n\t\tifstream infile(local_path, ios::in);\n\t\tstring line;\n\n\t\t// word_map holds a word hash (token) => score\n\t\tstd::map<uint64_t, float> word_map;\n\n\t\tsize_t num_added = 0;\n\t\twhile (getline(infile, line)) {\n\t\t\tvector<string> col_values;\n\t\t\tboost::algorithm::split(col_values, line, boost::is_any_of(\"\\t\"));\n\n\n\t\t\tURL url(col_values[0]);\n\n\t\t\tconst uint64_t url_hash = url.hash();\n\t\t\tconst uint64_t domain_hash = url.host_hash();\n\t\t\tconst float harmonic = domain_stats::harmonic_centrality(url);\n\n\t\t\t// add to hash table\n\t\t\tm_hash_table_builder->add(url_hash, line);\n\n\t\t\turl_record record(url_hash, 0.0f, domain_hash);\n\t\t\trecord.url_length(url.path_with_query().size());\n\n\t\t\tconst std::string site_colon = \"site:\" + url.host() + \" site:www.\" + url.host() + \" \" + url.host() + \" \" + url.domain_without_tld();\n\t\t\tconst auto site_colon_tokens = text::get_unique_full_text_tokens(site_colon);\n\t\t\tfor (auto token : site_colon_tokens) {\n\t\t\t\tword_map[token] += harmonic * 20;\n\t\t\t}\n\n\t\t\tsize_t col_idx = 0;\n\t\t\tfor (size_t col : cols) {\n\t\t\t\tconst auto tokens = text::get_unique_expanded_full_text_tokens(col_values[col]);\n\t\t\t\tfor (auto token : tokens) {\n\t\t\t\t\tword_map[token] += scores[col_idx] * 
harmonic;\n\t\t\t\t}\n\t\t\t}\n\n\t\t\tfor (const auto &iter : word_map) {\n\t\t\t\trecord.m_score = iter.second;\n\t\t\t\tm_url_index_builder->add(iter.first, record);\n\t\t\t\tnum_added++;\n\t\t\t}\n\n\t\t\tword_map.clear();\n\t\t}\n\t\tstd::cout << \"num added: \" << num_added << std::endl;\n\t}\n\n\tvoid index_manager::add_index_files_threaded(const vector<string> &local_paths, size_t num_threads) {\n\n\t\tnum_threads = 1;\n\t\tutils::thread_pool pool(num_threads);\n\n\t\tfor (const string &local_path : local_paths) {\n\t\t\tpool.enqueue([this, local_path]() -> void {\n\t\t\t\tadd_index_file(local_path);\n\t\t\t});\n\t\t}\n\n\t\tpool.run_all();\n\n\t\tm_hash_table_builder->merge();\n\n\t}\n\n\tvoid index_manager::add_link_file(const string &local_path, const ::algorithm::bloom_filter &urls_to_index) {\n\n\t\tprofiler::instance prof(\"add \" + local_path);\n\t\tifstream infile(local_path, ios::in);\n\t\tstring line;\n\t\tsize_t added = 0;\n\t\tsize_t parsed = 0;\n\t\tstd::vector<std::string> col_values;\n\t\twhile (getline(infile, line)) {\n\n\t\t\tcol_values.clear();\n\t\t\tboost::algorithm::split(col_values, line, boost::is_any_of(\"\\t\"));\n\n\t\t\tURL target_url(col_values[2], col_values[3]);\n\n\t\t\tparsed++;\n\n\t\t\tURL source_url(col_values[0], col_values[1]);\n\n\t\t\tfloat target_harmonic = domain_stats::harmonic_centrality(target_url);\n\t\t\tfloat source_harmonic = domain_stats::harmonic_centrality(source_url);\n\n\t\t\tconst std::string link_text = col_values[4].substr(0, 1000);\n\n\t\t\tconst url_link::link link(source_url, target_url, source_harmonic, target_harmonic);\n\n\t\t\tconst uint64_t domain_link_hash = source_url.domain_link_hash(target_url, link_text);\n\t\t\tconst uint64_t link_hash = source_url.link_hash(target_url, link_text);\n\t\t\tconst bool bloom_has_url = urls_to_index.exists(target_url.hash());\n\n\t\t\tstd::vector<uint64_t> tokens = text::get_unique_expanded_full_text_tokens(link_text);\n\n\t\t\tif (bloom_has_url) 
{\n\n\t\t\t\tconst bool has_url = m_hash_table->has(target_url.hash());\n\n\t\t\t\tif (has_url) {\n\t\t\t\t\t// Add the url link.\n\t\t\t\t\tlink_record link_rec(link_hash, source_harmonic);\n\t\t\t\t\tlink_rec.m_source_domain = source_url.hash();\n\t\t\t\t\tlink_rec.m_target_hash = target_url.hash();\n\n\t\t\t\t\tfor (auto token : tokens) {\n\t\t\t\t\t\tm_link_index_builder->add(token, link_rec);\n\t\t\t\t\t}\n\t\t\t\t\tadded++;\n\t\t\t\t}\n\t\t\t}\n\n\t\t\tdomain_link_record rec(domain_link_hash, source_harmonic);\n\t\t\trec.m_source_domain = source_url.host_hash();\n\t\t\trec.m_target_domain = target_url.host_hash();\n\n\t\t\tfor (auto token : tokens) {\n\t\t\t\tm_domain_link_index_builder->add(token, rec);\n\t\t\t}\n\n\t\t\ttokens.clear();\n\t\t}\n\n\t\tcout << \"Done with \" << local_path << \" added \" << added << \" total \" << parsed << \" took: \" << prof.get() << \"ms\" << endl;\n\t}\n\n\tvoid index_manager::add_link_files_threaded(const vector<string> &local_paths, size_t num_threads, const ::algorithm::bloom_filter &urls_to_index) {\n\n\t\tutils::thread_pool pool(num_threads);\n\n\t\tfor (auto &local_path : local_paths) {\n\t\t\tpool.enqueue([this, local_path, &urls_to_index]() -> void {\n\t\t\t\tadd_link_file(local_path, urls_to_index);\n\t\t\t});\n\t\t}\n\n\t\tpool.run_all();\n\t}\n\n\tvoid index_manager::add_url_file(const string &local_path) {\n\n\t\t\n\t}\n\n\tvoid index_manager::add_url_files_threaded(const vector<string> &local_paths, size_t num_threads) {\n\n\t\tutils::thread_pool pool(num_threads);\n\n\t\tfor (auto &local_path : local_paths) {\n\t\t\tpool.enqueue([this, local_path]() -> void {\n\t\t\t\tadd_url_file(local_path);\n\t\t\t});\n\t\t}\n\n\t\tpool.run_all();\n\t}\n\n\tvoid index_manager::merge() 
{\n\n\t\tm_url_index_builder->append();\n\t\tm_url_index_builder->merge();\n\n\t\tm_link_index_builder->append();\n\t\tm_link_index_builder->merge();\n\n\t\tm_domain_link_index_builder->append();\n\t\tm_domain_link_index_builder->merge();\n\n\t}\n\n\tvoid index_manager::optimize() {\n\t}\n\n\tvoid index_manager::truncate() {\n\t\tm_url_index_builder->truncate();\n\t\ttruncate_links();\n\t}\n\n\tvoid index_manager::truncate_links() {\n\t\tm_link_index_builder->truncate();\n\t\tm_domain_link_index_builder->truncate();\n\t}\n\n\tstd::vector<return_record> index_manager::find(const string &query, full_text::search_metric &metric) {\n\n\t\tauto words = text::get_full_text_words(query, config::query_max_words);\n\t\tif (words.size() == 0) return {};\n\n\t\tauto tokens = text::get_full_text_tokens(query, config::query_max_words);\n\n\t\tauto links = m_link_index->find_intersection(tokens, 500000);\n\n\t\tmetric.m_total_url_links_found = links.size();\n\t\tmetric.m_links_handled = links.size();\n\n\t\tstd::sort(links.begin(), links.end(), [](const auto &a, const auto &b) {\n\t\t\treturn a.m_target_hash < b.m_target_hash;\n\t\t});\n\n\t\tauto domain_links = m_domain_link_index->find_intersection(tokens, 100000);\n\n\t\tmetric.m_total_domain_links_found = domain_links.size();\n\n\t\tauto results = m_url_index->find_intersection(tokens);\n\n\t\tmetric.m_total_found = results.size();\n\n\t\tsize_t applied_domain_links = apply_domain_link_scores(domain_links, results);\n\t\tsize_t applied_url_links = apply_link_scores(links, results);\n\n\t\tmetric.m_link_url_matches = applied_url_links;\n\t\tmetric.m_link_domain_matches = applied_domain_links;\n\n\t\tconst auto sort_by = [](const auto &a, const auto &b) {\n\t\t\tif (a.m_score == b.m_score) return a.m_value < b.m_value;\n\t\t\treturn a.m_score > b.m_score;\n\t\t};\n\n\t\tif (results.size() > config::pre_result_limit) {\n\t\t\tnth_element(results.begin(), results.begin() + (config::pre_result_limit - 1), results.end(), 
sort_by);\n\t\t\tstd::sort(results.begin(), results.begin() + config::pre_result_limit, sort_by);\n\t\t\tresults.resize(config::pre_result_limit);\n\t\t}\n\n\t\tconst auto deduplicated = deduplicate_search_results(results, config::result_limit);\n\t\tconst auto return_records = decorate_search_result(deduplicated);\n\n\t\treturn return_records;\n\t}\n\n\tstd::vector<url_record> index_manager::deduplicate_search_results(const std::vector<url_record> &results, size_t limit) {\n\n\t\tstd::vector<url_record> deduped;\n\t\tstd::vector<url_record> non_deduped;\n\n\t\tstd::map<uint64_t, size_t> d_count;\n\t\tfor (const auto &result : results) {\n\t\t\tif (d_count[result.m_domain_hash] < config::deduplicate_domain_count) {\n\t\t\t\tdeduped.push_back(result);\n\t\t\t} else {\n\t\t\t\tnon_deduped.push_back(result);\n\t\t\t}\n\t\t\td_count[result.m_domain_hash]++;\n\t\t}\n\t\tif (deduped.size() < limit) {\n\t\t\tconst size_t num_missing = limit - deduped.size();\n\t\t\tif (non_deduped.size() > num_missing) {\n\t\t\t\tnon_deduped.resize(num_missing);\n\t\t\t}\n\t\t\tstd::vector<url_record> ret;\n\t\t\t::algorithm::sort::merge_arrays(deduped, non_deduped, [] (const auto &a, const auto &b) {\n\t\t\t\treturn a.m_score > b.m_score;\n\t\t\t}, ret);\n\t\t\treturn ret;\n\t\t}\n\n\t\tdeduped.resize(limit);\n\n\t\treturn deduped;\n\t}\n\n\tstd::vector<return_record> index_manager::decorate_search_result(const std::vector<url_record> &results) {\n\t\tstd::vector<return_record> return_records;\n\n\t\tfor (const auto &res : results) {\n\t\t\tconst auto tsv_data = m_hash_table->find(res.m_value);\n\t\t\treturn_record ret(res.m_value, res.m_score, tsv_data);\n\t\t\tret.m_domain_hash = res.m_domain_hash;\n\t\t\treturn_records.push_back(std::move(ret));\n\t\t}\n\n\t\treturn return_records;\n\t}\n\n\tsize_t index_manager::apply_domain_link_scores(const vector<domain_link_record> &links, std::vector<url_record> &results) {\n\t\tif (links.size() == 0) return 0;\n\t\tsize_t applied_links = 
0;\n\n\t\t{\n\t\t\tunordered_map<uint64_t, float> domain_scores;\n\t\t\tunordered_map<uint64_t, int> domain_counts;\n\t\t\tmap<pair<uint64_t, uint64_t>, uint64_t> domain_unique;\n\t\t\t{\n\t\t\t\tfor (const auto &link : links) {\n\t\t\t\t\tif (domain_unique.count(std::make_pair(link.m_source_domain, link.m_target_domain)) == 0) {\n\t\t\t\t\t\tconst float domain_score = expm1(25.0f*link.m_score) / 50.0f;\n\t\t\t\t\t\tdomain_scores[link.m_target_domain] += domain_score;\n\t\t\t\t\t\tdomain_counts[link.m_target_domain]++;\n\t\t\t\t\t\tdomain_unique[std::make_pair(link.m_source_domain, link.m_target_domain)] = link.m_source_domain;\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t}\n\n\t\t\tfor (auto &result : results) {\n\t\t\t\tconst float domain_score = domain_scores[result.m_domain_hash];\n\t\t\t\tresult.m_score += domain_score;\n\t\t\t\tapplied_links += domain_counts[result.m_domain_hash];\n\t\t\t}\n\t\t}\n\n\t\treturn applied_links;\n\t}\n\n\tsize_t index_manager::apply_link_scores(const vector<link_record> &links, std::vector<url_record> &results) {\n\n\t\tif (links.size() == 0) return 0;\n\n\t\tsize_t applied_links = 0;\n\n\t\tsize_t i = 0, j = 0;\n\t\tstd::map<std::pair<uint64_t, uint64_t>, uint64_t> domain_unique;\n\t\twhile (i < links.size() && j < results.size()) {\n\t\t\tconst uint64_t hash1 = links[i].m_target_hash;\n\t\t\tconst uint64_t hash2 = results[j].m_value;\n\n\t\t\tif (hash1 < hash2) {\n\t\t\t\ti++;\n\t\t\t} else if (hash1 == hash2) {\n\t\t\t\tif (domain_unique.count(std::make_pair(links[i].m_source_domain, links[i].m_target_hash)) == 0) {\n\t\t\t\t\tconst float url_score = expm1(25.0f*links[i].m_score) / 50.0f;\n\t\t\t\t\tresults[j].m_score += url_score;\n\t\t\t\t\tapplied_links++;\n\t\t\t\t\tdomain_unique[std::make_pair(links[i].m_source_domain, links[i].m_target_hash)] = links[i].m_source_domain;\n\t\t\t\t}\n\n\t\t\t\ti++;\n\t\t\t} else {\n\t\t\t\tj++;\n\t\t\t}\n\t\t}\n\t\treturn applied_links;\n\t}\n\n\tstd::vector<return_record> index_manager::find(const 
string &query) {\n\t\tfull_text::search_metric metric;\n\t\treturn find(query, metric);\n\t}\n\n}\n"
  },
  {
    "path": "src/indexer/index_manager.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include <memory>\n#include \"full_text/search_metric.h\"\n#include \"index_builder.h\"\n#include \"index.h\"\n#include \"sharded_index_builder.h\"\n#include \"sharded_index.h\"\n#include \"hash_table2/builder.h\"\n#include \"sharded_builder.h\"\n#include \"sharded.h\"\n#include \"basic_index_builder.h\"\n#include \"basic_index.h\"\n#include \"counted_record.h\"\n#include \"url_record.h\"\n#include \"link_record.h\"\n#include \"domain_link_record.h\"\n#include \"domain_record.h\"\n#include \"return_record.h\"\n#include \"algorithm/bloom_filter.h\"\n\nnamespace indexer {\n\n\tclass index_manager {\n\n\tpublic:\n\n\t\tindex_manager();\n\t\t~index_manager();\n\n\t\tvoid add_index_file(const std::string &local_path);\n\t\tvoid 
add_index_files_threaded(const vector<string> &local_paths, size_t num_threads);\n\t\tvoid add_link_file(const std::string &local_path, const ::algorithm::bloom_filter &urls_to_index);\n\t\tvoid add_link_files_threaded(const std::vector<std::string> &local_paths, size_t num_threads, const ::algorithm::bloom_filter &urls_to_index);\n\t\tvoid add_url_file(const std::string &local_path);\n\t\tvoid add_url_files_threaded(const std::vector<std::string> &local_paths, size_t num_threads);\n\n\t\tvoid merge();\n\t\tvoid optimize();\n\t\tvoid truncate();\n\t\tvoid truncate_links();\n\n\t\tsize_t url_count() const {\n\t\t\treturn m_hash_table->size();\n\t\t}\n\n\t\tstd::vector<return_record> find(const std::string &query, full_text::search_metric &metric);\n\t\tstd::vector<return_record> find(const std::string &query);\n\n\tprivate:\n\n\t\tstd::unique_ptr<sharded_builder<basic_index_builder, url_record>> m_url_index_builder;\n\t\tstd::unique_ptr<sharded<basic_index, url_record>> m_url_index;\n\n\t\tstd::unique_ptr<sharded_builder<basic_index_builder, link_record>> m_link_index_builder;\n\t\tstd::unique_ptr<sharded<basic_index, link_record>> m_link_index;\n\n\t\tstd::unique_ptr<sharded_builder<basic_index_builder, domain_link_record>> m_domain_link_index_builder;\n\t\tstd::unique_ptr<sharded<basic_index, domain_link_record>> m_domain_link_index;\n\n\t\tstd::unique_ptr<hash_table2::builder> m_hash_table_builder;\n\t\tstd::unique_ptr<hash_table2::hash_table> m_hash_table;\n\n\t\tsize_t apply_domain_link_scores(const vector<domain_link_record> &links, std::vector<url_record> &results);\n\t\tsize_t apply_link_scores(const vector<link_record> &links, std::vector<url_record> &results);\n\t\tstd::vector<return_record> decorate_search_result(const std::vector<url_record> &results);\n\t\tstd::vector<url_record> deduplicate_search_results(const std::vector<url_record> &results, size_t limit);\n\n\t};\n\n}\n"
  },
  {
    "path": "src/indexer/index_reader.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include \"index_reader.h\"\n#include <string.h>\n\nusing namespace std;\n\nnamespace indexer {\n\n\tindex_reader_file::index_reader_file(const std::string &filename) {\n\t\tm_reader = make_unique<ifstream>();\n\t\tm_reader->open(filename, ios::binary);\n\t}\n\n\tindex_reader_file::index_reader_file(index_reader_file &&other) {\n\t\tm_reader = move(other.m_reader);\n\t}\n\n\tbool index_reader_file::seek(size_t position) {\n\t\tif (!m_reader->is_open()) return false;\n\t\tm_reader->seekg(position, ios::beg);\n\t\treturn true;\n\t}\n\n\tvoid index_reader_file::read(char *buffer, size_t length) {\n\t\tm_reader->read(buffer, length);\n\t}\n\n\tsize_t index_reader_file::size() {\n\t\tif (!m_reader->is_open()) return 0;\n\t\tm_reader->seekg(0, 
ios::end);\n\t\treturn m_reader->tellg();\n\t}\n\n\tindex_reader_ram::index_reader_ram(const std::string &str)\n\t: m_buffer(str.c_str()), m_len(str.size()) {\n\t}\n\n\tindex_reader_ram::index_reader_ram(const char *buffer, size_t length)\n\t: m_buffer(buffer), m_len(length) {\n\t}\n\n\tindex_reader_ram::index_reader_ram(index_reader_ram &&other)\n\t: m_buffer(other.m_buffer), m_len(other.m_len) {\n\n\t\tother.m_buffer = nullptr;\n\t\tother.m_len = 0;\n\t}\n\n\n\tbool index_reader_ram::seek(size_t position) {\n\t\tif (position < m_len) {\n\t\t\tm_pos = position;\n\t\t\treturn true;\n\t\t}\n\t\treturn false;\n\t}\n\n\tvoid index_reader_ram::read(char *buffer, size_t length) {\n\t\tif (m_pos + length <= m_len) {\n\t\t\tmemcpy(buffer, &m_buffer[m_pos], length);\n\t\t\tm_pos += length;\n\t\t}\n\t}\n\n}\n"
  },
  {
    "path": "src/indexer/index_reader.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include <iostream>\n#include <fstream>\n#include <memory>\n\nnamespace indexer {\n\n\t/*\n\t\tThis class provides an abstraction of data reading used by the index class.\n\t\tWe provide an interface and two classes:\n\t\tindex_reader_file\n\t\tand\n\t\tindex_reader_ram\n\t\tto provide data directly from the file or from a preloaded sequence of bytes.\n\t*/\n\n\tclass index_reader {\n\n\t\tpublic:\n\n\t\t\tvirtual bool seek(size_t position) = 0;\n\t\t\tvirtual void read(char *buffer, size_t length) = 0;\n\t\t\tvirtual size_t size() = 0;\n\t\t\n\t};\n\n\tclass index_reader_file : public index_reader {\n\n\t\tprivate:\n\t\t\tindex_reader_file(const index_reader_file &);\n\t\t\tindex_reader_file &operator=(const index_reader_file 
&);\n\n\t\tpublic:\n\n\t\t\tindex_reader_file(const std::string &filename);\n\t\t\tindex_reader_file(index_reader_file &&other);\n\n\t\t\tbool seek(size_t position);\n\t\t\tvoid read(char *buffer, size_t length);\n\t\t\tsize_t size();\n\t\t\n\t\tprivate:\n\t\t\n\t\t\tstd::unique_ptr<std::ifstream> m_reader;\n\n\t};\n\n\tclass index_reader_ram : public index_reader {\n\n\t\tprivate:\n\t\t\tindex_reader_ram(const index_reader_file &);\n\t\t\tindex_reader_ram &operator=(const index_reader_file &);\n\n\t\tpublic:\n\n\t\t\texplicit index_reader_ram(const std::string &str);\n\t\t\tindex_reader_ram(const char *buffer, size_t length);\n\t\t\tindex_reader_ram(index_reader_ram &&other);\n\n\t\t\tbool seek(size_t position);\n\t\t\tvoid read(char *buffer, size_t length);\n\t\t\tsize_t size() {return m_len; };\n\t\t\n\t\tprivate:\n\t\t\n\t\t\tconst char *m_buffer;\n\t\t\tsize_t m_len;\n\t\t\tsize_t m_pos = 0;\n\n\t};\n\n\n}\n"
  },
  {
    "path": "src/indexer/index_utils.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include \"index_utils.h\"\n#include <boost/filesystem.hpp>\n#include \"config.h\"\n\nnamespace indexer {\n\n\tvoid create_db_directories(const std::string &db_name) {\n\t\tfor (size_t i = 0; i < 8; i++) {\n\t\t\tboost::filesystem::create_directories(config::data_path() + \"/\" + std::to_string(i) + \"/full_text/\" + db_name);\n\t\t}\n\t}\n\n\tvoid delete_db_directories(const std::string &db_name) {\n\t\tfor (size_t i = 0; i < 8; i++) {\n\t\t\tboost::filesystem::remove_all(config::data_path() + \"/\" + std::to_string(i) + \"/full_text/\" + db_name);\n\t\t}\n\t}\n\n}\n"
  },
  {
    "path": "src/indexer/index_utils.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include <iostream>\n\nnamespace indexer {\n\n\tvoid create_db_directories(const std::string &db_name);\n\tvoid delete_db_directories(const std::string &db_name);\n\n}\n"
  },
  {
    "path": "src/indexer/link_record.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include <iostream>\n\nnamespace indexer {\n\n\t#pragma pack(4)\n\tclass link_record {\n\t\tpublic:\n\t\tuint64_t m_value;\n\t\tfloat m_score;\n\t\tuint64_t m_source_domain;\n\t\tuint64_t m_target_hash;\n\n\t\tlink_record() : m_value(0), m_score(0.0f) {};\n\t\tlink_record(uint64_t value) : m_value(value), m_score(0.0f) {};\n\t\tlink_record(uint64_t value, float score) : m_value(value), m_score(score) {};\n\n\t\tbool operator==(const link_record &b) const {\n\t\t\treturn m_value == b.m_value;\n\t\t}\n\n\t\tbool operator<(const link_record &b) const {\n\t\t\treturn m_value < b.m_value;\n\t\t}\n\n\t\tlink_record &operator+=(const link_record &b) {\n\t\t\treturn *this;\n\t\t}\n\n\t\t/*\n\t\t * Will be applied to records before truncating. 
Top records will be kept.\n\t\t * */\n\t\tstruct truncate_order {\n\t\t\tinline bool operator() (const link_record &a, const link_record &b) {\n\t\t\t\treturn a.m_score > b.m_score;\n\t\t\t}\n\t\t};\n\n\t\t/*\n\t\t * Will be applied before storing on disk. This is the order the records will be returned in.\n\t\t * */\n\t\tstruct storage_order {\n\t\t\tinline bool operator() (const link_record &a, const link_record &b) {\n\t\t\t\treturn a.m_target_hash < b.m_target_hash;\n\t\t\t}\n\t\t};\n\n\t\tbool storage_equal(const link_record &a) const {\n\t\t\treturn m_target_hash == a.m_target_hash;\n\t\t}\n\n\t};\n\t#pragma pack()\n}\n"
  },
  {
    "path": "src/indexer/merger.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include \"merger.h\"\n#include \"memory/memory.h\"\n#include \"memory/debugger.h\"\n#include \"utils/thread_pool.hpp\"\n#include <map>\n#include <chrono>\n#include <thread>\n\nusing namespace std;\n\nnamespace indexer {\n\n\tnamespace merger {\n\n\t\tdouble mem_limit = 0.4;\n\n\t\tbool is_merging = false;\n\t\tmap<size_t, std::function<void()>> mergers;\n\t\tmap<size_t, std::function<void()>> appenders;\n\t\tmap<size_t, std::function<size_t()>> sizes;\n\t\tmutex merger_lock;\n\n\t\tvoid set_mem_limit(double mem_limit) {\n\t\t\t::indexer::merger::mem_limit = mem_limit;\n\t\t}\n\n\t\tvoid wait_for_merges() {\n\t\t\twhile (is_merging) {\n\t\t\t\tstd::this_thread::sleep_for(100ms);\n\t\t\t}\n\t\t}\n\n\t\tvoid lock() {\n\t\t\tif (is_merging) 
{\n\t\t\t\twait_for_merges();\n\t\t\t}\n\t\t}\n\n\t\tvoid register_appender(size_t id, std::function<void()> append, std::function<size_t()> size) {\n\t\t\tstd::lock_guard lock(merger_lock);\n\n\t\t\tappenders[id] = append;\n\t\t\tsizes[id] = size;\n\t\t}\n\n\t\tvoid register_merger(size_t id, std::function<void()> merge) {\n\t\t\tstd::lock_guard lock(merger_lock);\n\n\t\t\tmergers[id] = merge;\n\t\t}\n\n\t\tvoid deregister_merger(size_t id) {\n\t\t\tstd::lock_guard lock(merger_lock);\n\n\t\t\tappenders.erase(id);\n\t\t\tmergers.erase(id);\n\t\t\tsizes.erase(id);\n\t\t}\n\n\t\tbool merge_thread_is_running = true;\n\t\tthread merge_thread_obj;\n\n\t\tvoid append_all() {\n\t\t\tis_merging = true;\n\t\t\tthis_thread::sleep_for(1000ms);\n\n\t\t\tsize_t available_memory = memory::get_total_memory();\n\n\t\t\tstd::cout << \"APPENDING ALL: \" << appenders.size() << \" mergers allocated memory: \" << memory::allocated_memory() << \" limit is: \" <<\n\t\t\t\t(available_memory * mem_limit) << std::endl;\n\t\t\t\n\t\t\tutils::thread_pool pool(32);\n\n\t\t\tmerger_lock.lock();\n\t\t\tfor (auto &iter : appenders) {\n\t\t\t\tpool.enqueue([iter]() {\n\t\t\t\t\ttry {\n\t\t\t\t\t\titer.second();\n\t\t\t\t\t} catch (...) {\n\n\t\t\t\t\t}\n\t\t\t\t});\n\t\t\t}\n\n\t\t\tpool.run_all();\n\n\t\t\tcout << \"done... allocated memory: \" << memory::allocated_memory() << endl;\n\n\t\t\tmerger_lock.unlock();\n\t\t\tis_merging = false;\n\t\t}\n\n\t\tvoid merge_all() {\n\t\t\tis_merging = true;\n\t\t\tthis_thread::sleep_for(1000ms);\n\n\t\t\tsize_t available_memory = memory::get_total_memory();\n\n\t\t\tstd::cout << \"MERGING ALL: \" << mergers.size() << \" mergers allocated memory: \" << memory::allocated_memory() << \" limit is: \" <<\n\t\t\t\t(available_memory * mem_limit) << std::endl;\n\t\t\t\n\t\t\tutils::thread_pool pool(32);\n\n\t\t\tfor (auto &iter : mergers) {\n\t\t\t\tpool.enqueue([iter]() {\n\t\t\t\t\ttry {\n\t\t\t\t\t      iter.second();\n\t\t\t\t\t} catch (...) 
{\n\t\t\t\t\t      \n\t\t\t\t\t}\n\t\t\t\t});\n\t\t\t}\n\n\t\t\tpool.run_all();\n\n\t\t\tcout << \"done... allocated memory: \" << memory::allocated_memory() << endl;\n\n\t\t\tis_merging = false;\n\t\t}\n\n\t\tsize_t total_sizes() {\n\t\t\tstd::lock_guard lock(merger_lock);\n\t\t\tsize_t sum = 0;\n\t\t\tfor (const auto &iter : sizes) {\n\t\t\t\tsum += iter.second();\n\t\t\t}\n\t\t\treturn sum;\n\t\t}\n\n\t\tvoid merge_thread() {\n\t\t\tmemory::update();\n\t\t\tsize_t available_memory = memory::get_total_memory();\n\t\t\twhile (merge_thread_is_running) {\n\t\t\t\tif (total_sizes() > available_memory * mem_limit) {\n\t\t\t\t\tappend_all();\n\t\t\t\t}\n\t\t\t\tthis_thread::sleep_for(200ms);\n\t\t\t}\n\t\t}\n\n\t\tvoid start_merge_thread() {\n\t\t\tmerge_thread_is_running = true;\n\t\t\tif (merge_thread_obj.joinable()) {\n\t\t\t\tthrow std::runtime_error(\"Trying to start already started merge thread. Not allowed.\");\n\t\t\t}\n\t\t\tmerge_thread_obj = std::move(thread(merge_thread));\n\t\t}\n\n\t\tvoid stop_merge_thread() {\n\t\t\tmerge_thread_is_running = false;\n\t\t\tmerge_thread_obj.join();\n\t\t\tappend_all();\n\t\t\tmerge_all();\n\t\t}\n\n\t\tvoid stop_merge_thread_only_append() {\n\t\t\tmerge_thread_is_running = false;\n\t\t\tmerge_thread_obj.join();\n\t\t\tappend_all();\n\t\t}\n\n\t\tvoid terminate_merge_thread() {\n\t\t\tmerge_thread_is_running = false;\n\t\t\tmerge_thread_obj.join();\n\t\t}\n\n\t\tvoid force_append() {\n\t\t\tappend_all();\n\t\t}\n\t}\n\n}\n"
  },
  {
    "path": "src/indexer/merger.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include <iostream>\n#include <functional>\n\nusing namespace std;\n\nnamespace indexer {\n\n\tnamespace merger {\n\t\tvoid set_mem_limit(double mem_limit);\n\t\tvoid lock();\n\t\tvoid register_merger(size_t id, std::function<void()> merge);\n\t\tvoid register_appender(size_t id, std::function<void()> append, std::function<size_t()> size);\n\t\tvoid deregister_merger(size_t id);\n\n\t\tvoid start_merge_thread();\n\t\tvoid stop_merge_thread();\n\t\tvoid stop_merge_thread_only_append();\n\t\tvoid terminate_merge_thread();\n\t\tvoid force_append();\n\t};\n\n}\n"
  },
  {
    "path": "src/indexer/regular_index_builder.h",
    "content": ""
  },
  {
    "path": "src/indexer/return_record.h",
    "content": "\n#pragma once\n\n#include \"URL.h\"\n#include \"generic_record.h\"\n#include \"text/text.h\"\n\nnamespace indexer {\n\n\t/*\n\tThis is the returned record from the index_manager. It contains more data than the stored record.\n\t*/\n\tclass return_record : public generic_record {\n\n\t\tpublic:\n\t\tuint64_t m_url_hash;\n\t\tuint64_t m_domain_hash;\n\t\tsize_t m_num_url_links = 0;\n\t\tsize_t m_num_domain_links = 0;\n\t\tURL m_url;\n\t\tstd::string m_title;\n\t\tstd::string m_snippet;\n\t\tstd::string m_meta;\n\n\t\treturn_record() : generic_record() {};\n\t\treturn_record(uint64_t value) : generic_record(value) {};\n\t\treturn_record(uint64_t value, float score) : generic_record(value, score) {};\n\t\treturn_record(uint64_t value, float score, const std::string &tsv_data) : generic_record(value, score) {\n\n\t\t\tsize_t pos_start = 0;\n\t\t\tsize_t pos_end = 0;\n\t\t\tsize_t col_num = 0;\n\t\t\twhile (pos_end != std::string::npos) {\n\t\t\t\tpos_end = tsv_data.find('\\t', pos_start);\n\t\t\t\tconst size_t len = pos_end - pos_start;\n\t\t\t\tif (col_num == 0) {\n\t\t\t\t\tm_url = URL(tsv_data.substr(pos_start, len));\n\t\t\t\t}\n\t\t\t\tif (col_num == 1) {\n\t\t\t\t\tm_title = tsv_data.substr(pos_start, len);\n\t\t\t\t}\n\t\t\t\tif (col_num == 3) {\n\t\t\t\t\tm_meta = tsv_data.substr(pos_start, len);\n\t\t\t\t}\n\t\t\t\tif (col_num == 4) {\n\t\t\t\t\tm_snippet = make_snippet(tsv_data.substr(pos_start, len));\n\t\t\t\t\tif (m_snippet.size() == 0) {\n\t\t\t\t\t\tm_snippet = make_snippet(m_meta);\n\t\t\t\t\t}\n\t\t\t\t}\n\n\t\t\t\tpos_start = pos_end + 1;\n\t\t\t\tcol_num++;\n\t\t\t}\n\n\t\t};\n\n\t\tprivate:\n\t\tstd::string make_snippet(const std::string &text) const {\n\t\t\tauto response = text.substr(0, 140);\n\t\t\ttext::trim(response);\n\t\t\tif (response.size() >= 140) response += \"...\";\n\t\t\treturn response;\n\t\t}\n\n\t};\n}\n"
  },
  {
    "path": "src/indexer/score_builder.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include \"score_builder.h\"\n#include <iostream>\n#include <map>\n\nnamespace indexer {\n\n\tscore_builder::score_builder(size_t num_documents, const std::map<uint64_t, size_t> *document_sizes)\n\t: m_num_documents(num_documents), m_document_sizes(document_sizes)\n\t{\n\t\tcalculate_avg_document_size();\n\t}\n\t\t\n\tfloat score_builder::score() const {\n\t\treturn 0.0f;\n\t}\n\n\tsize_t score_builder::document_size(uint64_t doc_id) const {\n\t\tif (m_document_sizes->count(doc_id)) {\n\t\t\treturn m_document_sizes->at(doc_id);\n\t\t}\n\t\treturn 0;\n\t}\n\n\tvoid score_builder::calculate_avg_document_size() {\n\t\tm_avg_document_size = 0.0f;\n\t\tif (m_document_sizes->size()) {\n\t\t\tsize_t sum = 0;\n\t\t\tfor (const auto &iter : *m_document_sizes) 
{\n\t\t\t\tsum += iter.second;\n\t\t\t}\n\t\t\tm_avg_document_size = (float)sum / m_document_sizes->size();\n\t\t}\n\t}\n\n}\n"
  },
  {
    "path": "src/indexer/score_builder.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include <cstdint>\n#include <iostream>\n#include <map>\n\nnamespace indexer {\n\n\tclass score_builder {\n\n\tpublic:\n\n\t\tscore_builder(size_t num_documents, const std::map<uint64_t, size_t> *document_sizes);\n\t\t\n\t\tfloat score() const;\n\t\tsize_t document_count() const { return m_num_documents; }\n\t\tsize_t document_size(uint64_t doc_id) const;\n\t\tfloat avg_document_size() const { return m_avg_document_size; };\n\n\tprivate:\n\n\t\tsize_t m_num_documents;\n\t\tfloat m_avg_document_size;\n\t\tconst std::map<uint64_t, size_t> *m_document_sizes;\n\n\t\tvoid calculate_avg_document_size();\n\n\t};\n\n}\n"
  },
  {
    "path": "src/indexer/sharded.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include <iostream>\n#include <vector>\n#include <memory>\n#include \"config.h\"\n#include \"algorithm/sum_sorted.h\"\n#include \"algorithm/intersection.h\"\n#include \"utils/thread_pool.hpp\"\n\nnamespace indexer {\n\n\ttemplate<template<typename> typename index_type, typename data_record>\n\tclass sharded {\n\n\tpublic:\n\n\t\tsharded(const std::string &db_name, size_t num_shards);\n\t\tsharded(const std::string &db_name, size_t num_shards, size_t hash_table_size);\n\t\t~sharded();\n\n\t\t/* \n\t\t * Find single key\n\t\t * Returns vector with records in storage_order.\n\t\t * */\n\t\tstd::vector<data_record> find(uint64_t key) const;\n\t\tstd::vector<data_record> find(uint64_t key, size_t limit) const;\n\n\t\t/*\n\t\t * Find 
intersection of multiple keys\n\t\t * Returns vector with records in storage order.\n\t\t * */\n\t\tstd::vector<data_record> find_intersection(const std::vector<uint64_t> &keys, size_t limit = 0) const;\n\n\t\t/*\n\t\t * Find each key in keys and records with same m_value each key only returns top 'limit' number of results.\n\t\t * Returns vector with summed records.\n\t\t * */\n\t\tstd::vector<data_record> find_sum(const std::vector<uint64_t> &keys, size_t limit) const;\n\n\t\t/*\n\t\t * Iterates the keys of the index and calls the callback with key and vector of records for that key.\n\t\t * */\n\t\tvoid for_each(std::function<void(uint64_t key, std::vector<data_record> &recs)> on_each_key) const;\n\t\tvoid for_each(std::function<void(uint64_t key, std::vector<data_record> &recs)> on_each_key, size_t num_threads) const;\n\n\tprivate:\n\n\t\tstd::string m_db_name;\n\t\tsize_t m_num_shards;\n\t\tsize_t m_hash_table_size;\n\n\t\tvoid read_meta();\n\t\tstd::string filename() const;\n\n\t};\n\n\ttemplate<template<typename> typename index_type, typename data_record>\n\tsharded<index_type, data_record>::sharded(const std::string &db_name, size_t num_shards)\n\t: m_db_name(db_name), m_num_shards(num_shards), m_hash_table_size(config::shard_hash_table_size)\n\t{\n\t\tread_meta();\n\t}\n\n\ttemplate<template<typename> typename index_type, typename data_record>\n\tsharded<index_type, data_record>::sharded(const std::string &db_name, size_t num_shards, size_t hash_table_size)\n\t: m_db_name(db_name), m_num_shards(num_shards), m_hash_table_size(hash_table_size)\n\t{\n\t\tread_meta();\n\t}\n\n\ttemplate<template<typename> typename index_type, typename data_record>\n\tsharded<index_type, data_record>::~sharded() {\n\t}\n\n\ttemplate<template<typename> typename index_type, typename data_record>\n\tstd::vector<data_record> sharded<index_type, data_record>::find(uint64_t key) const {\n\n\t\tconst size_t shard_id = key % m_num_shards;\n\t\tindex_type<data_record> idx(m_db_name, 
shard_id, m_hash_table_size);\n\n\t\treturn idx.find(key);\n\t}\n\n\ttemplate<template<typename> typename index_type, typename data_record>\n\tstd::vector<data_record> sharded<index_type, data_record>::find(uint64_t key, size_t limit) const {\n\n\t\tconst size_t shard_id = key % m_num_shards;\n\t\tindex_type<data_record> idx(m_db_name, shard_id, m_hash_table_size);\n\n\t\treturn idx.find(key, limit);\n\t}\n\n\ttemplate<template<typename> typename index_type, typename data_record>\n\tstd::vector<data_record> sharded<index_type, data_record>::find_intersection(const std::vector<uint64_t> &keys, size_t limit) const {\n\n\t\tstd::vector<std::unique_ptr<data_record[]>> results;\n\t\tstd::vector<size_t> num_results;\n\t\tfor (uint64_t key : keys) {\n\n\t\t\tconst size_t shard_id = key % m_num_shards;\n\t\t\tindex_type<data_record> idx(m_db_name, shard_id, m_hash_table_size);\n\t\t\t\n\t\t\tsize_t num_records;\n\t\t\tstd::unique_ptr<data_record[]> res = idx.find_ptr(key, num_records);\n\t\t\tresults.emplace_back(std::move(res));\n\t\t\tnum_results.push_back(num_records);\n\t\t}\n\n\t\tstd::vector<data_record> ret = ::algorithm::intersection(results, num_results);\n\n\t\tif (limit && ret.size() > limit) {\n\t\t\tstd::nth_element(ret.begin(), ret.begin () + (limit - 1), ret.end(), [](const auto &a, const auto &b) {\n\t\t\t\treturn a.m_score > b.m_score;\n\t\t\t});\n\t\t\tret.resize(limit);\n\t\t}\n\n\t\treturn ret;\n\t}\n\n\ttemplate<template<typename> typename index_type, typename data_record>\n\tstd::vector<data_record> sharded<index_type, data_record>::find_sum(const std::vector<uint64_t> &keys,\n\t\t\tsize_t limit) const {\n\n\t\tstd::vector<std::vector<data_record>> results;\n\t\tfor (uint64_t key : keys) {\n\t\t\tconst size_t shard_id = key % m_num_shards;\n\t\t\tindex_type<data_record> idx(m_db_name, shard_id, m_hash_table_size);\n\n\t\t\tstd::vector<data_record> res = idx.find(key, limit);\n\n\t\t\tsort(res.begin(), 
res.end());\n\n\t\t\tresults.emplace_back(std::move(res));\n\t\t}\n\n\t\t// Sum equal elements.\n\t\treturn ::algorithm::sum_sorted<data_record>(results, [](data_record &a, const data_record &b) {\n\t\t\ta.m_score += b.m_score;\n\t\t});\n\n\t}\n\n\ttemplate<template<typename> typename index_type, typename data_record>\n\tvoid sharded<index_type, data_record>::for_each(std::function<void(uint64_t key, std::vector<data_record> &recs)> on_each_key) const {\n\t\tfor_each(on_each_key, 32);\n\t}\n\n\ttemplate<template<typename> typename index_type, typename data_record>\n\tvoid sharded<index_type, data_record>::for_each(std::function<void(uint64_t key, std::vector<data_record> &recs)> on_each_key, size_t num_threads) const {\n\t\tutils::thread_pool pool(num_threads);\n\n\t\tfor (size_t shard_id = 0; shard_id < m_num_shards; shard_id++) {\n\t\t\tpool.enqueue([this, shard_id, &on_each_key]() {\n\t\t\t\tindex_type<data_record> idx(m_db_name, shard_id, m_hash_table_size);\n\t\t\t\tidx.for_each(on_each_key);\n\t\t\t});\n\t\t}\n\n\t\tpool.run_all();\n\t}\n\n\ttemplate<template<typename> typename index_type, typename data_record>\n\tvoid sharded<index_type, data_record>::read_meta() {\n\t\tstd::ifstream meta_file(filename(), std::ios::binary);\n\n\t\tif (meta_file.is_open()) {\n\n\t\t}\n\t}\n\n\ttemplate<template<typename> typename index_type, typename data_record>\n\tstd::string sharded<index_type, data_record>::filename() const {\n\t\t// This file will contain meta data on the index. For example the hyper log log document counter.\n\t\treturn config::data_path() + \"/0/full_text/\" + m_db_name + \".meta\";\n\t}\n\n}\n"
  },
  {
    "path": "src/indexer/sharded_builder.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include <iostream>\n#include <map>\n#include <fstream>\n#include \"algorithm/hyper_log_log.h\"\n#include \"utils/thread_pool.hpp\"\n#include \"debug.h\"\n#include \"config.h\"\n\nnamespace indexer {\n\n\ttemplate<template<typename> typename index_type, typename data_record>\n\tclass sharded_builder {\n\tprivate:\n\t\t// Non copyable\n\t\tsharded_builder(const sharded_builder &);\n\t\tsharded_builder& operator=(const sharded_builder &);\n\n\tpublic:\n\n\t\tsharded_builder(const std::string &db_name, size_t num_shards);\n\t\t~sharded_builder();\n\n\t\tvoid add(uint64_t key, const data_record &record);\n\t\t\n\t\tvoid append();\n\t\tvoid merge();\n\n\t\tvoid truncate();\n\t\tvoid truncate_cache_files();\n\t\tvoid 
create_directories();\n\n\t\tsize_t document_count() const { return m_document_counter.count(); }\n\t\tsize_t document_size(uint64_t value) { return m_document_sizes[value]; }\n\n\t\tvoid calculate_scores();\n\t\tvoid sort_by_scores();\n\n\tprivate:\n\n\t\tstd::mutex m_lock;\n\t\tstd::string m_db_name;\n\t\tstd::vector<std::shared_ptr<index_type<data_record>>> m_shards;\n\n\t\t::algorithm::hyper_log_log m_document_counter;\n\t\tstd::map<uint64_t, size_t> m_document_sizes;\n\t\tfloat m_avg_document_size = 0.0f;\n\t\tsize_t m_num_added_keys = 0;\n\n\t\tvoid read_meta();\n\t\tvoid write_meta();\n\t\tstd::string filename() const;\n\n\t};\n\n\ttemplate<template<typename> typename index_type, typename data_record>\n\tsharded_builder<index_type, data_record>::sharded_builder(const std::string &db_name, size_t num_shards) {\n\n\t\tm_db_name = db_name;\n\t\tfor (size_t shard_id = 0; shard_id < num_shards; shard_id++) {\n\t\t\tm_shards.push_back(std::make_shared<index_type<data_record>>(db_name, shard_id));\n\t\t}\n\t\tcreate_directories();\n\t\tread_meta();\n\t}\n\n\ttemplate<template<typename> typename index_type, typename data_record>\n\tsharded_builder<index_type, data_record>::~sharded_builder() {\n\t\twrite_meta();\n\t}\n\n\ttemplate<template<typename> typename index_type, typename data_record>\n\tvoid sharded_builder<index_type, data_record>::add(uint64_t key, const data_record &record) {\n\t\tm_shards[key % m_shards.size()]->add(key, record);\n\n\t\tm_document_counter.insert(record.m_value);\n\n\t\t/*m_num_added_keys++;\n\t\tm_document_sizes[record.m_value]++;*/ // Raw non unique document size.\n\t}\n\n\ttemplate<template<typename> typename index_type, typename data_record>\n\tvoid sharded_builder<index_type, data_record>::append() {\n\t\tfor (auto &shard : m_shards) {\n\t\t\tshard->append();\n\t\t}\n\t}\n\n\ttemplate<template<typename> typename index_type, typename data_record>\n\tvoid sharded_builder<index_type, data_record>::merge() {\n\t\tutils::thread_pool 
pool(32);\n\t\tfor (size_t i = 0; i < m_shards.size(); i++) {\n\t\t\tpool.enqueue([this, i]() {\n\t\t\t\ttry {\n\t\t\t\t\tm_shards[i]->merge();\n\t\t\t\t} catch (...) {\n\t\t\t\t}\n\t\t\t});\n\t\t}\n\n\t\tpool.run_all();\n\t}\n\n\ttemplate<template<typename> typename index_type, typename data_record>\n\tvoid sharded_builder<index_type, data_record>::truncate() {\n\t\tfor (auto &shard : m_shards) {\n\t\t\tshard->truncate();\n\t\t}\n\t\tstd::ofstream meta_file(filename(), std::ios::trunc);\n\n\t\tm_document_counter.reset();\n\t\tm_document_sizes.clear();\n\t\tm_num_added_keys = 0;\n\t}\n\n\ttemplate<template<typename> typename index_type, typename data_record>\n\tvoid sharded_builder<index_type, data_record>::truncate_cache_files() {\n\t\tfor (auto &shard : m_shards) {\n\t\t\tshard->truncate_cache_files();\n\t\t}\n\t}\n\n\ttemplate<template<typename> typename index_type, typename data_record>\n\tvoid sharded_builder<index_type, data_record>::create_directories() {\n\t\tfor (auto &shard : m_shards) {\n\t\t\tshard->create_directories();\n\t\t}\n\t}\n\n\ttemplate<template<typename> typename index_type, typename data_record>\n\tvoid sharded_builder<index_type, data_record>::calculate_scores() {\n\n\t\tconst size_t total_records = m_document_counter.count();\n\t\tdouble average_document_size = 0.0f;\n\t\tfor (const auto &iter : m_document_sizes) {\n\t\t\taverage_document_size += iter.second;\n\t\t}\n\t\taverage_document_size /= m_document_sizes.size();\n\n\t\tconst auto tf_idf = [this, total_records](const data_record &rec, size_t num_records) {\n\t\t\tdata_record ret = rec;\n\t\t\tfloat tf = (float)rec.m_count/m_document_sizes[rec.m_value];\n\t\t\tfloat idf = (float)total_records/num_records;\n\t\t\tret.m_score = tf*log(idf);\n\t\t\treturn ret;\n\t\t};\n\n\t\tconst auto bm25 = [this, total_records, average_document_size](const data_record &rec, size_t num_records) {\n\n\t\t\tif (m_document_sizes[rec.m_value] < 1000) {\n\t\t\t\tdata_record ret = rec;\n\t\t\t\tret.m_score 
= 0.0f;\n\t\t\t\treturn ret;\n\t\t\t}\n\n\t\t\t// https://en.wikipedia.org/wiki/Okapi_BM25\n\t\t\tconst double N = total_records; \n\t\t\tconst double n_q = num_records;\n\t\t\tconst double idf = log((N - n_q + 0.5)/(n_q + 0.5) + 1.0);\n\n\t\t\tconst double count_d = rec.m_count;\n\t\t\tconst double doc_size_d = m_document_sizes[rec.m_value];\n\n\t\t\tconst double f_q = count_d/doc_size_d;\n\t\t\tconst double k1 = 1.2;\n\t\t\tconst double b = 0.75;\n\t\t\tconst double d_card = m_document_sizes[rec.m_value];\n\n\t\t\tconst double score = idf * (f_q * (k1 + 1.0)) / (f_q + k1 * (1.0 - b + b * (d_card / average_document_size)));\n\n\t\t\tdata_record ret = rec;\n\t\t\tret.m_score = score;\n\t\t\treturn ret;\n\t\t};\n\n\t\t(void)tf_idf;\n\n\t\tconst auto algo = bm25;\n\n\t\tutils::thread_pool pool(32);\n\t\tfor (size_t i = 0; i < m_shards.size(); i++) {\n\t\t\tpool.enqueue([this, i, algo](){\n\t\t\t\tm_shards[i]->transform(algo);\n\t\t\t});\n\t\t}\n\t\tpool.run_all();\n\t}\n\n\ttemplate<template<typename> typename index_type, typename data_record>\n\tvoid sharded_builder<index_type, data_record>::sort_by_scores() {\n\n\t\tutils::thread_pool pool(32);\n\t\tfor (size_t i = 0; i < m_shards.size(); i++) {\n\t\t\tpool.enqueue([this, i](){\n\t\t\t\tm_shards[i]->sort_by([](const data_record &a, const data_record &b) {\n\t\t\t\t\treturn a.m_score > b.m_score;\n\t\t\t\t});\n\t\t\t});\n\t\t}\n\t\tpool.run_all();\n\t}\n\n\ttemplate<template<typename> typename index_type, typename data_record>\n\tvoid sharded_builder<index_type, data_record>::read_meta() {\n\t\tstd::ifstream meta_file(filename(), std::ios::binary);\n\n\t\tif (meta_file.is_open()) {\n\n\t\t\tmeta_file.read((char *)&m_num_added_keys, sizeof(size_t));\n\n\t\t\tchar *data = m_document_counter.data();\n\t\t\tmeta_file.read(data, m_document_counter.data_size());\n\n\t\t\tsize_t num_docs = 0;\n\t\t\tmeta_file.read((char *)(&num_docs), sizeof(size_t));\n\t\t\tfor (size_t i = 0; i < num_docs; i++) {\n\t\t\t\tuint64_t doc_id 
= 0;\n\t\t\t\tsize_t count = 0;\n\t\t\t\tmeta_file.read((char *)(&doc_id), sizeof(uint64_t));\n\t\t\t\tmeta_file.read((char *)(&count), sizeof(size_t));\n\t\t\t\tm_document_sizes[doc_id] = count;\n\t\t\t}\n\t\t}\n\t}\n\n\ttemplate<template<typename> typename index_type, typename data_record>\n\tvoid sharded_builder<index_type, data_record>::write_meta() {\n\t\tstd::ofstream meta_file(filename(), std::ios::binary | std::ios::trunc);\n\n\t\tif (meta_file.is_open()) {\n\n\t\t\tmeta_file.write((char *)&m_num_added_keys, sizeof(size_t));\n\n\t\t\tchar *data = m_document_counter.data();\n\t\t\tmeta_file.write(data, m_document_counter.data_size());\n\n\t\t\t// Write document sizes.\n\t\t\tconst size_t num_docs = m_document_sizes.size();\n\t\t\tmeta_file.write((char *)(&num_docs), sizeof(size_t));\n\t\t\tfor (const auto &iter : m_document_sizes) {\n\t\t\t\tmeta_file.write((char *)(&iter.first), sizeof(uint64_t));\n\t\t\t\tmeta_file.write((char *)(&iter.second), sizeof(size_t));\n\t\t\t}\n\t\t}\n\t}\n\n\ttemplate<template<typename> typename index_type, typename data_record>\n\tstd::string sharded_builder<index_type, data_record>::filename() const {\n\t\t// This file will contain meta data on the index. For example the hyper log log document counter.\n\t\treturn config::data_path() + \"/0/full_text/\" + m_db_name + \".meta\";\n\t}\n\n}\n"
  },
  {
    "path": "src/indexer/sharded_index.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include \"index.h\"\n#include \"algorithm/intersection.h\"\n#include \"algorithm/top_k.h\"\n#include \"utils/thread_pool.hpp\"\n#include \"config.h\"\n#include <mutex>\n\nnamespace indexer {\n\n\ttemplate<typename data_record>\n\tclass sharded_index {\n\n\tpublic:\n\n\t\tsharded_index(const std::string &db_name, size_t num_shards);\n\t\tsharded_index(const std::string &db_name, size_t num_shards, size_t hash_table_size);\n\t\t~sharded_index();\n\n\t\t/* \n\t\t * Find single key\n\t\t * Returns vector with records in storage_order.\n\t\t * */\n\t\tstd::vector<data_record> find(uint64_t key) const;\n\n\t\t/* \n\t\t * Find single key\n\t\t * Returns bitmap of internal ids.\n\t\t * */\n\t\troaring::Roaring find_bitmap(uint64_t key) 
const;\n\n\t\t/*\n\t\t * Find intersection of multiple keys\n\t\t * Returns vector with records in storage order.\n\t\t * */\n\t\tstd::vector<data_record> find_intersection(const std::vector<uint64_t> &keys) const;\n\n\t\t/*\n\t\t * Find intersection of multiple keys applying lambda function score_mod to the scores before.\n\t\t * Returns n records with highest score.\n\t\t * score_mod is applied in storage_order of data_record.\n\t\t * */\n\t\tstd::vector<data_record> find_top(size_t &total_num_results, const std::vector<uint64_t> &keys, size_t n, \n\t\t\t\tstd::function<float(const data_record &)> score_mod = [](const data_record &) { return 0.0f; }) const;\n\n\t\t/*\n\t\t * Overload without total_num_results\n\t\t * */\n\t\tstd::vector<data_record> find_top(const std::vector<uint64_t> &keys, size_t n, \n\t\t\t\tstd::function<float(const data_record &)> score_mod = [](const data_record &) { return 0.0f; }) const;\n\n\t\t/*\n\t\t * Find intersection of multiple keys and run group by, the groups will be determined by the\n\t\t * data_record::storage_equal predicate and 'score_formula' will be applied to m_score before summing.\n\t\t * Returns vector with grouped records.\n\t\t * */\n\t\tstd::vector<data_record> find_group_by(const std::vector<uint64_t> &keys,\n\t\t\t\tstd::function<float(float)> score_formula, std::vector<size_t> &counts) const;\n\n\t\t/*\n\t\t * Calculates a set of keys that has more than the given number of records.\n\t\t * Returns a std::set<uint64_t> with the keys.\n\t\t * This function is slow. 
Needs to open each shard to retrieve the keys.\n\t\t * */\n\t\tstd::set<uint64_t> get_keys(size_t with_more_than_records) const;\n\n\t\t/*\n\t\t * Iterates the keys of the index and calls the callback with key and the bitmap for that key.\n\t\t * */\n\t\tvoid for_each(std::function<void(uint64_t key, roaring::Roaring &bitmap)> on_each_key) const;\n\n\t\t/*\n\t\t * Returns the total number of records.\n\t\t * */\n\t\tsize_t num_records() const { return m_records.size(); }\n\n\t\t/*\n\t\t * Copies all the records from the bitmap into the vector append_to\n\t\t * */\n\t\tvoid get_records_for_bitmap(const roaring::Roaring &bitmap, std::vector<data_record> &append_to) const;\n\n\tprivate:\n\n\t\tstd::string m_db_name;\n\t\tsize_t m_num_shards;\n\t\tsize_t m_hash_table_size;\n\n\t\tstd::vector<data_record> m_records;\n\t\tmutable std::vector<float> m_scores;\n\t\tstd::map<uint64_t, uint32_t> m_record_id_map;\n\n\t\tvoid read_meta();\n\t\tstd::string filename() const;\n\n\t};\n\n\ttemplate<typename data_record>\n\tsharded_index<data_record>::sharded_index(const std::string &db_name, size_t num_shards)\n\t: m_db_name(db_name), m_num_shards(num_shards), m_hash_table_size(config::shard_hash_table_size)\n\t{\n\t\tread_meta();\n\t}\n\n\ttemplate<typename data_record>\n\tsharded_index<data_record>::sharded_index(const std::string &db_name, size_t num_shards, size_t hash_table_size)\n\t: m_db_name(db_name), m_num_shards(num_shards), m_hash_table_size(hash_table_size)\n\t{\n\t\tread_meta();\n\t}\n\n\ttemplate<typename data_record>\n\tsharded_index<data_record>::~sharded_index() {\n\t}\n\n\ttemplate<typename data_record>\n\tstd::vector<data_record> sharded_index<data_record>::find(uint64_t key) const {\n\n\t\tconst size_t shard_id = key % m_num_shards;\n\t\tindex<data_record> idx(m_db_name, shard_id, m_hash_table_size);\n\n\t\troaring::Roaring rr = idx.find_bitmap(key);\n\n\t\tstd::function<data_record(uint32_t id)> id_to_rec = [this](uint32_t id) {\n\t\t\treturn 
m_records[id];\n\t\t};\n\n\t\tstd::vector<data_record> ret;\n\t\tfor (uint32_t internal_id : rr) {\n\t\t\tret.emplace_back(id_to_rec(internal_id));\n\t\t}\n\n\t\treturn ret;\n\t}\n\n\ttemplate<typename data_record>\n\troaring::Roaring sharded_index<data_record>::find_bitmap(uint64_t key) const {\n\n\t\tconst size_t shard_id = key % m_num_shards;\n\t\tindex<data_record> idx(m_db_name, shard_id, m_hash_table_size);\n\n\t\treturn idx.find_bitmap(key);\n\t}\n\n\ttemplate<typename data_record>\n\tstd::vector<data_record> sharded_index<data_record>::find_intersection(const std::vector<uint64_t> &keys) const {\n\n\t\tstd::vector<roaring::Roaring> results;\n\t\tfor (uint64_t key : keys) {\n\n\t\t\tconst size_t shard_id = key % m_num_shards;\n\t\t\tindex<data_record> idx(m_db_name, shard_id, m_hash_table_size);\n\t\t\t\n\t\t\troaring::Roaring res = idx.find_bitmap(key);\n\t\t\tresults.emplace_back(std::move(res));\n\t\t}\n\n\t\troaring::Roaring rr = ::algorithm::intersection(results);\n\n\t\tstd::function<data_record(uint32_t id)> id_to_rec = [this](uint32_t id) {\n\t\t\treturn m_records[id];\n\t\t};\n\n\t\tstd::vector<data_record> ret;\n\t\tfor (uint32_t internal_id : rr) {\n\t\t\tret.emplace_back(id_to_rec(internal_id));\n\t\t}\n\n\t\treturn ret;\n\t}\n\n\ttemplate<typename data_record>\n\tstd::vector<data_record> sharded_index<data_record>::find_top(size_t &total_num_results, const std::vector<uint64_t> &keys, size_t n,\n\t\t\tstd::function<float(const data_record &)> score_mod) const {\n\n\t\tstd::fill(m_scores.begin(), m_scores.end(), 0.0f);\n\n\t\tstd::vector<roaring::Roaring> results;\n\t\tfor (uint64_t key : keys) {\n\n\t\t\tconst size_t shard_id = key % m_num_shards;\n\t\t\tindex<data_record> idx(m_db_name, shard_id, m_hash_table_size);\n\t\t\t\n\t\t\troaring::Roaring res = idx.find_bitmap(key);\n\t\t\tresults.emplace_back(std::move(res));\n\t\t}\n\n\t\troaring::Roaring rr = ::algorithm::intersection(results);\n\n\t\ttotal_num_results = rr.cardinality();\n\n\t\t// 
Apply score modifications.\n\t\tstd::vector<uint32_t> ids;\n\t\tfor (uint32_t internal_id : rr) {\n\t\t\tids.push_back(internal_id);\n\t\t\tm_scores[internal_id] = m_records[internal_id].m_score * score_mod(m_records[internal_id].m_value);\n\t\t}\n\n\t\tauto ordered = [this](const uint32_t &a, const uint32_t &b) {\n\t\t\treturn m_scores[a] < m_scores[b];\n\t\t};\n\n\t\tstd::vector<uint32_t> top_ids = ::algorithm::top_k<uint32_t>(ids, n, ordered);\n\n\t\tstd::vector<data_record> ret;\n\t\tfor (uint32_t internal_id : top_ids) {\n\t\t\tret.push_back(m_records[internal_id]);\n\t\t\tret.back().m_score = m_scores[internal_id];\n\t\t}\n\n\t\tsort(ret.begin(), ret.end(), typename data_record::score_order());\n\n\t\treturn ret;\n\t}\n\n\ttemplate<typename data_record>\n\tstd::vector<data_record> sharded_index<data_record>::find_top(const std::vector<uint64_t> &keys, size_t n,\n\t\t\tstd::function<float(const data_record &)> score_mod) const {\n\n\t\tsize_t total_num_results = 0;\n\t\treturn find_top(total_num_results, keys, n, score_mod);\n\t}\n\n\ttemplate<typename data_record>\n\tstd::vector<data_record> sharded_index<data_record>::find_group_by(const std::vector<uint64_t> &keys,\n\t\t\tstd::function<float(float)> score_formula, std::vector<size_t> &counts) const {\n\n\t\tstd::vector<roaring::Roaring> results;\n\t\tfor (uint64_t key : keys) {\n\n\t\t\tconst size_t shard_id = key % m_num_shards;\n\t\t\tindex<data_record> idx(m_db_name, shard_id, m_hash_table_size);\n\t\t\t\n\t\t\troaring::Roaring res = idx.find_bitmap(key);\n\t\t\tresults.emplace_back(std::move(res));\n\t\t}\n\n\t\troaring::Roaring rr = ::algorithm::intersection(results);\n\n\t\t// Group by.\n\t\tstd::vector<data_record> ret;\n\t\tfor (uint32_t internal_id : rr) {\n\t\t\tif (internal_id >= m_records.size()) {\n\t\t\t\tstd::cout << \"internal_id: \" << internal_id << \" >= \" << m_records.size() << std::endl;\n\t\t\t\tcontinue;\n\t\t\t}\n\t\t\tif (ret.size() && 
ret.back().storage_equal(m_records[internal_id])) {\n\t\t\t\tret.back().m_score += score_formula(m_records[internal_id].m_score);\n\t\t\t\tcounts.back()++;\n\t\t\t} else {\n\t\t\t\tret.emplace_back(m_records[internal_id]);\n\t\t\t\tret.back().m_score = score_formula(ret.back().m_score);\n\t\t\t\tcounts.push_back(1);\n\t\t\t}\n\t\t}\n\n\t\treturn ret;\n\t}\n\n\ttemplate<typename data_record>\n\tstd::set<uint64_t> sharded_index<data_record>::get_keys(size_t with_more_than_records) const {\n\n\t\tutils::thread_pool pool(32);\n\t\tstd::mutex lock;\n\t\tstd::set<uint64_t> all_keys;\n\t\tfor (size_t shard_id = 0; shard_id < m_num_shards; shard_id++) {\n\n\t\t\tpool.enqueue([this, shard_id, with_more_than_records, &all_keys, &lock]() {\n\t\t\t\tindex<data_record> idx(m_db_name, shard_id, m_hash_table_size);\n\t\t\t\tstd::set<uint64_t> keys_for_shard = idx.get_keys(with_more_than_records);\n\n\t\t\t\tlock.lock();\n\t\t\t\tall_keys.insert(keys_for_shard.begin(), keys_for_shard.end());\n\t\t\t\tlock.unlock();\n\t\t\t});\n\t\t}\n\n\t\tpool.run_all();\n\n\t\treturn all_keys;\n\n\t}\n\n\ttemplate<typename data_record>\n\tvoid sharded_index<data_record>::for_each(std::function<void(uint64_t key, roaring::Roaring &bitmap)> on_each_key) const {\n\t\n\t\tutils::thread_pool pool(32);\n\t\tfor (size_t shard_id = 0; shard_id < m_num_shards; shard_id++) {\n\n\t\t\tpool.enqueue([this, shard_id, &on_each_key]() {\n\t\t\t\tindex<data_record> idx(m_db_name, shard_id, m_hash_table_size);\n\t\t\t\tidx.for_each(on_each_key);\n\t\t\t});\n\t\t}\n\n\t\tpool.run_all();\n\t}\n\n\t/*\n\t * Copies all the records from the bitmap into the vector iterator \"append_to\"\n\t * */\n\ttemplate<typename data_record>\n\tvoid sharded_index<data_record>::get_records_for_bitmap(const roaring::Roaring &bitmap, std::vector<data_record> &append_to) const {\n\t\tfor (uint32_t internal_id : bitmap) {\n\t\t\tappend_to.emplace_back(m_records[internal_id]);\n\t\t}\n\t}\n\n\ttemplate<typename data_record>\n\tvoid 
sharded_index<data_record>::read_meta() {\n\t\tstd::ifstream meta_file(filename(), std::ios::binary);\n\n\t\tif (meta_file.is_open()) {\n\n\t\t\t// Read records.\n\t\t\tsize_t num_records;\n\t\t\tmeta_file.read((char *)(&num_records), sizeof(size_t));\n\t\t\tif (meta_file.eof()) return;\n\t\t\tfor (size_t i = 0; i < num_records; i++) {\n\t\t\t\tdata_record rec;\n\t\t\t\tmeta_file.read((char *)(&rec), sizeof(data_record));\n\n\t\t\t\tm_record_id_map[rec.m_value] = m_records.size();\n\t\t\t\tm_records.push_back(rec);\n\t\t\t\tm_scores.push_back(0.0f);\n\t\t\t}\n\t\t}\n\t}\n\n\ttemplate<typename data_record>\n\tstd::string sharded_index<data_record>::filename() const {\n\t\t// This file will contain meta data on the index. For example the hyper log log document counter.\n\t\treturn config::data_path() + \"/0/full_text/\" + m_db_name + \".meta\";\n\t}\n\n}\n"
  },
  {
    "path": "src/indexer/sharded_index_builder.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include \"index_builder.h\"\n#include \"algorithm/hyper_log_log.h\"\n#include \"utils/thread_pool.hpp\"\n#include \"utils/thread_pool_arg.h\"\n\n#include <numeric>\n\nnamespace indexer {\n\n\ttemplate<typename data_record>\n\tclass sharded_index_builder {\n\tprivate:\n\t\t// Non copyable\n\t\tsharded_index_builder(const sharded_index_builder &);\n\t\tsharded_index_builder& operator=(const sharded_index_builder &);\n\n\tpublic:\n\n\t\tsharded_index_builder(const std::string &db_name, size_t num_shards);\n\t\t~sharded_index_builder();\n\n\t\tvoid add(uint64_t key, const data_record &record);\n\t\t\n\t\tvoid append();\n\t\tvoid merge();\n\t\tvoid merge_one(size_t id);\n\t\tvoid optimize();\n\n\t\t/*\n\t\t\tThis function calculate scores. 
Should run after a merge.\n\t\t*/\n\t\tvoid calculate_scores(algorithm algo);\n\n\t\tsize_t num_documents() const { return m_document_counter.count(); }\n\t\tsize_t document_size(uint64_t document_id) { return m_document_sizes[document_id]; }\n\n\t\tvoid truncate();\n\t\tvoid truncate_cache_files();\n\t\tvoid create_directories();\n\n\t\tvoid check();\n\n\t\t/*\n\t\t * Loops over the records and applies transform.\n\t\t * */\n\t\tvoid for_each_record(std::function<void(data_record &)> transform);\n\n\tprivate:\n\n\t\tstd::mutex m_lock;\n\t\tstd::string m_db_name;\n\t\tstd::vector<std::shared_ptr<index_builder<data_record>>> m_shards;\n\t\t::algorithm::hyper_log_log m_document_counter;\n\t\tstd::map<uint64_t, size_t> m_document_sizes;\n\t\tfloat m_avg_document_size = 0.0f;\n\n\t\tstd::vector<data_record> m_records;\n\t\tstd::map<uint64_t, uint32_t> m_record_id_map;\n\n\t\tvoid read_meta();\n\t\tvoid write_meta();\n\t\tstd::string filename() const;\n\t\tbool needs_optimization() const;\n\t\tvoid sort_records();\n\n\t};\n\n\ttemplate<typename data_record>\n\tsharded_index_builder<data_record>::sharded_index_builder(const std::string &db_name, size_t num_shards) {\n\n\t\tstd::function<uint32_t(const data_record &)> rec_to_id = [this](const data_record &record) {\n\t\t\tstd::lock_guard guard(m_lock);\n\t\t\tif (m_record_id_map.count(record.m_value) == 0) {\n\t\t\t\tm_record_id_map[record.m_value] = m_records.size();\n\t\t\t\tm_records.push_back(record);\n\t\t\t}\n\t\t\treturn m_record_id_map[record.m_value];\n\t\t};\n\n\t\tm_db_name = db_name;\n\t\tfor (size_t shard_id = 0; shard_id < num_shards; shard_id++) {\n\t\t\tm_shards.push_back(std::make_shared<index_builder<data_record>>(db_name, shard_id, rec_to_id));\n\t\t}\n\t\tcreate_directories();\n\t\tread_meta();\n\t}\n\n\ttemplate<typename data_record>\n\tsharded_index_builder<data_record>::~sharded_index_builder() {\n\t\twrite_meta();\n\t}\n\n\ttemplate<typename data_record>\n\tvoid 
sharded_index_builder<data_record>::add(uint64_t key, const data_record &record) {\n\t\tm_shards[key % m_shards.size()]->add(key, record);\n\n\t\t/*m_document_counter.insert(record.m_value);\n\t\tm_document_sizes[record.m_value]++; // Raw non unique document size.\n\t\t*/\n\t}\n\n\ttemplate<typename data_record>\n\tvoid sharded_index_builder<data_record>::append() {\n\t\tfor (auto &shard : m_shards) {\n\t\t\tshard->append();\n\t\t}\n\t}\n\n\ttemplate<typename data_record>\n\tvoid sharded_index_builder<data_record>::merge() {\n\n\t\tutils::thread_pool_arg<std::unordered_map<uint64_t, uint32_t>> pool(32);\n\n\t\tfor (size_t i = 0; i < m_shards.size(); i++) {\n\t\t\tpool.enqueue([this, i](std::unordered_map<uint64_t, uint32_t> &internal_id_map) {\n\t\t\t\ttry {\n\t\t\t\t\tm_shards[i]->merge(internal_id_map);\n\t\t\t\t} catch (...) {\n\t\t\t\t}\n\t\t\t});\n\t\t}\n\t\tpool.run_all();\n\t}\n\n\ttemplate<typename data_record>\n\tvoid sharded_index_builder<data_record>::merge_one(size_t id) {\n\t\tm_shards[id]->merge();\n\t}\n\n\ttemplate<typename data_record>\n\tvoid sharded_index_builder<data_record>::optimize() {\n\t\tif (needs_optimization()) {\n\t\t\tsort_records();\n\t\t}\n\t}\n\n\ttemplate<typename data_record>\n\tvoid sharded_index_builder<data_record>::calculate_scores(algorithm algo) {\n\n\t\t(void)algo;\n\n\t\t/*const size_t num_docs = num_documents();\n\t\tscore_builder score(num_docs, &m_document_sizes);\n\t\t\n\t\tfor (auto &shard : m_shards) {\n\t\t\tshard->calculate_scores(algo, score);\n\t\t}*/\n\t}\n\n\ttemplate<typename data_record>\n\tvoid sharded_index_builder<data_record>::truncate() {\n\t\tfor (auto &shard : m_shards) {\n\t\t\tshard->truncate();\n\t\t}\n\t\tstd::ofstream meta_file(filename(), std::ios::trunc);\n\t\tm_records = std::vector<data_record>{};\n\t\tm_record_id_map = std::map<uint64_t, uint32_t>{};\n\t}\n\n\ttemplate<typename data_record>\n\tvoid sharded_index_builder<data_record>::truncate_cache_files() {\n\t\tfor (auto &shard : m_shards) 
{\n\t\t\tshard->truncate_cache_files();\n\t\t}\n\t}\n\n\ttemplate<typename data_record>\n\tvoid sharded_index_builder<data_record>::create_directories() {\n\t\tfor (auto &shard : m_shards) {\n\t\t\tshard->create_directories();\n\t\t}\n\t}\n\n\ttemplate<typename data_record>\n\tvoid sharded_index_builder<data_record>::read_meta() {\n\t\tstd::ifstream meta_file(filename(), std::ios::binary);\n\n\t\tif (meta_file.is_open()) {\n\n\t\t\t// Read records.\n\t\t\tsize_t num_records;\n\t\t\tmeta_file.read((char *)(&num_records), sizeof(size_t));\n\t\t\tif (meta_file.eof()) return;\n\t\t\tfor (size_t i = 0; i < num_records; i++) {\n\t\t\t\tdata_record rec;\n\t\t\t\tmeta_file.read((char *)(&rec), sizeof(data_record));\n\n\t\t\t\tm_record_id_map[rec.m_value] = m_records.size();\n\t\t\t\tm_records.push_back(rec);\n\t\t\t}\n\n\t\t}\n\t}\n\n\ttemplate<typename data_record>\n\tvoid sharded_index_builder<data_record>::write_meta() {\n\t\tstd::ofstream meta_file(filename(), std::ios::binary | std::ios::trunc);\n\n\t\tif (meta_file.is_open()) {\n\n\t\t\t// Write records.\n\t\t\tconst size_t num_records = m_records.size();\n\t\t\tmeta_file.write((char *)(&num_records), sizeof(size_t));\n\t\t\tfor (const data_record &record : m_records) {\n\t\t\t\tmeta_file.write((char *)(&record), sizeof(data_record));\n\t\t\t}\n\t\t}\n\t}\n\n\ttemplate<typename data_record>\n\tstd::string sharded_index_builder<data_record>::filename() const {\n\t\t// This file will contain meta data on the index. 
For example the hyper log log document counter.\n\t\treturn config::data_path() + \"/0/full_text/\" + m_db_name + \".meta\";\n\t}\n\n\ttemplate<typename data_record>\n\tbool sharded_index_builder<data_record>::needs_optimization() const {\n\t\t// Just check if the records are sorted by storage order.\n\t\tif (m_records.size() <= 1) return false;\n\t\t\n\t\ttypename data_record::storage_order ordered;\n\t\tfor (size_t i = 0; i < m_records.size() - 1; i++) {\n\t\t\tif (!ordered(m_records[i], m_records[i + 1])) {\n\t\t\t\treturn true;\n\t\t\t}\n\t\t}\n\t\treturn false;\n\t}\n\n\ttemplate<typename data_record>\n\tvoid sharded_index_builder<data_record>::sort_records() {\n\t\tstd::vector<uint32_t> permutation(m_records.size());\n\t\tstd::iota(permutation.begin(), permutation.end(), 0);\n\n\t\ttypename data_record::storage_order ordered;\n\n\t\tstd::sort(permutation.begin(), permutation.end(), [this, &ordered](const size_t &a, const size_t &b) {\n\t\t\treturn ordered(m_records[a], m_records[b]);\n\t\t});\n\t\t// permutation now points from new position -> old position of record.\n\n\t\tstd::vector<uint32_t> inverse(permutation.size());\n\t\tfor (uint32_t i = 0; i < permutation.size(); i++) {\n\t\t\tinverse[permutation[i]] = i;\n\t\t}\n\n\t\t// inverse now points from old position -> new position of record.\n\n\t\tutils::thread_pool pool(32);\n\t\tfor (size_t i = 0; i < m_shards.size(); i++) {\n\t\t\tpool.enqueue([this, i, &inverse]() {\n\t\t\t\tm_shards[i]->transform([&inverse](uint32_t v) {\n\t\t\t\t\treturn inverse[v];\n\t\t\t\t});\n\t\t\t});\n\t\t}\n\t\tpool.run_all();\n\n\t\t// Reorder the records. 
Will be saved in meta file upon destruction.\n\t\tsort(m_records.begin(), m_records.end(), ordered);\n\t}\n\n\ttemplate<typename data_record>\n\tvoid sharded_index_builder<data_record>::check() {\n\t\tconst size_t num_records = m_records.size();\n\n\t\tstd::cout << \"num_records: \" << num_records << std::endl;\n\n\t\tsize_t total_max = 0;\n\t\tfor (auto shard : m_shards) {\n\t\t\tsize_t max_id = shard->get_max_id();\n\t\t\tif (max_id >= num_records) {\n\t\t\t\tstd::cout << \"found max id: \" << max_id << \" but only has \" << num_records << \" records\" << std::endl;\n\t\t\t}\n\t\t\tif (max_id > total_max) total_max = max_id;\n\t\t}\n\t\tstd::cout << \"done, max_id was: \" << total_max << std::endl;\n\t}\n\n\t/*\n\t * Loops over the records and applies transform.\n\t * */\n\ttemplate<typename data_record>\n\tvoid sharded_index_builder<data_record>::for_each_record(std::function<void(data_record &)> transform) {\n\t\tfor (auto &rec : m_records) {\n\t\t\ttransform(rec);\n\t\t}\n\t}\n\n}\n"
  },
  {
    "path": "src/indexer/url_record.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include \"generic_record.h\"\n\nnamespace indexer {\n\t#pragma pack(4)\n\tclass url_record : public generic_record {\n\n\t\tpublic:\n\t\tuint64_t m_domain_hash;\n\t\tuint32_t m_meta;\n\n\t\turl_record() : generic_record(), m_domain_hash(0), m_meta(0) {};\n\t\turl_record(uint64_t value) : generic_record(value), m_domain_hash(0), m_meta(0) {};\n\t\turl_record(uint64_t value, float score) : generic_record(value, score), m_domain_hash(0), m_meta(0) {};\n\t\turl_record(uint64_t value, float score, uint64_t domain_hash) : generic_record(value, score), m_domain_hash(domain_hash), m_meta(0) {};\n\n\t\tvoid url_length(uint16_t len) { m_meta = len | (m_meta << 16); };\n\t\tuint16_t url_length(void) const { return m_meta & 0xFFFF; };\n\n\t};\n}\n"
  },
  {
    "path": "src/indexer/value_record.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include <iostream>\n\nnamespace indexer {\n\n\t#pragma pack(4)\n\tclass value_record {\n\t\tpublic:\n\t\tuint64_t m_value;\n\n\t\tvalue_record() : m_value(0) {};\n\t\tvalue_record(uint64_t value) : m_value(value) {};\n\t\tvalue_record(uint64_t value, float score) : m_value(value) {};\n\n\t\tbool operator==(const value_record &b) const {\n\t\t\treturn m_value == b.m_value;\n\t\t}\n\n\t\tbool operator<(const value_record &b) const {\n\t\t\treturn m_value < b.m_value;\n\t\t}\n\n\t\tvalue_record &operator+=(const value_record &b) {\n\t\t\treturn *this;\n\t\t}\n\n\t\t/*\n\t\t * Will be applied to records before truncating. 
Top records will be kept.\n\t\t * */\n\t\tstruct truncate_order {\n\t\t\tinline bool operator() (const value_record &a, const value_record &b) {\n\t\t\t\treturn a.m_value > b.m_value;\n\t\t\t}\n\t\t};\n\n\t\t/*\n\t\t * Will be applied before storing on disk. This is the order the records will be returned in.\n\t\t * */\n\t\tstruct storage_order {\n\t\t\tinline bool operator() (const value_record &a, const value_record &b) {\n\t\t\t\treturn a.m_value < b.m_value;\n\t\t\t}\n\t\t};\n\n\t\tbool storage_equal(const value_record &a) const {\n\t\t\treturn m_value == a.m_value;\n\t\t}\n\n\t};\n}\n"
  },
  {
    "path": "src/indexer.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include <iostream>\n#include \"config.h\"\n#include \"logger/logger.h\"\n#include \"downloader/warc_downloader.h\"\n#include \"tools/splitter.h\"\n#include \"tools/counter.h\"\n#include \"tools/calculate_harmonic.h\"\n#include \"tools/generate_url_lists.h\"\n#include \"tools/find_links.h\"\n#include \"indexer/index_manager.h\"\n#include \"URL.h\"\n#include \"indexer/console.h\"\n#include <iostream>\n#include <set>\n#include \"indexer/sharded_index.h\"\n#include \"transfer/transfer.h\"\n\nvoid help() {\n\tstd::cout << \"Usage: ./tools [OPTION]...\" << std::endl;\n\tstd::cout << \"--split run splitter\" << std::endl;\n\tstd::cout << \"--harmonic-hosts create file /tmp/hosts.txt with hosts for harmonic centrality\" << std::endl;\n\tstd::cout << 
\"--harmonic-links create file /tmp/edges.txt for edges for harmonic centrality\" << std::endl;\n\tstd::cout << \"--harmonic calculates harmonic centrality\" << std::endl;\n}\n\nint main(int argc, const char **argv) {\n\n\t//logger::start_logger_thread();\n\t//logger::verbose(true);\n\n\tif (getenv(\"ALEXANDRIA_CONFIG\") != NULL) {\n\t\tconfig::read_config(getenv(\"ALEXANDRIA_CONFIG\"));\n\t} else {\n\t\tconfig::read_config(\"/etc/alexandria.conf\");\n\t}\n\n\tif (argc < 2) {\n\t\thelp();\n\t\treturn 0;\n\t}\n\n\tconst std::string arg(argc > 1 ? argv[1] : \"\");\n\n\tif (arg == \"--downloader\" && argc > 2) {\n\t\tdownloader::warc_downloader(argv[2]);\n\t} else if (arg == \"--downloader-missing\" && argc > 2) {\n\t\tdownloader::warc_downloader_missing(string(argv[2]));\n\t} else if (arg == \"--split\") {\n\t\ttools::run_splitter();\n\t} else if (arg == \"--count-overflow-words\") {\n\t\tindexer::count_words_that_hit_max();\n\t} else if (arg == \"--count\") {\n\t\tstd::cout << \"count: \" << indexer::count_urls() << std::endl;\n\t} else if (arg == \"--count-domains\" && argc > 2) {\n\t\ttools::run_counter_per_domain(argv[2]);\n\t} else if (arg == \"--make-urls\" && argc > 2) {\n\t\ttools::generate_url_lists(argv[2]);\n\t} else if (arg == \"--split-make-direct-links\") {\n\n\t\t/*\n\t\t * Make direct links by using the url bloom filter.\n\t\t * */\n\t\ttools::run_split_direct_links();\n\t} else if (arg == \"--split-build-url-bloom\") {\n\n\t\t/*\n\t\t * Make a bloom filter from all urls in the source batches.\n\t\t * */\n\t\ttools::run_split_build_url_bloom();\n\t} else if (arg == \"--split-build-direct-link-bloom\") {\n\n\t\t/*\n\t\t * Make a bloom filter from all direct links in the source batches.\n\t\t * */\n\t\ttools::run_split_build_direct_link_bloom();\n\t} else if (arg == \"--split-with-links\") {\n\n\t\t/*\n\t\t * split with links takes all the URL batches and splits them into smaller NODE-{node id} folders\n\t\t * with links means it only takes URLs with direct links 
in them. this is a major\n\t\t * optimization and makes our target index much much smaller.\n\t\t *\n\t\t * */\n\t\ttools::run_split_urls_with_direct_links();\n\t} else if (arg == \"--split-links\") {\n\n\t\t/*\n\t\t * split links should run after --split-with-links because it takes all the link batches and splits\n\t\t * them into LINK-{node id} folders but it ONLY takes links with target domain that is present in the\n\t\t * URL files stored in the NODE- folders.\n\t\t *\n\t\t * */\n\t\ttools::run_split_links_with_relevant_domains();\n\t} else if (arg == \"--search\" && argc > 2) {\n\n\t\t/*\n\t\t * search the index for the query given in argv[2] and print each matching\n\t\t * url together with its score.\n\t\t *\n\t\t * */\n\t\tindexer::index_manager idx_manager;\n\t\tauto response = idx_manager.find(argv[2]);\n\n\t\tfor (const auto &rec : response) {\n\t\t\tstd::cout << rec.m_url << \" score \" << rec.m_score << std::endl;\n\t\t}\n\t} else if (arg == \"--harmonic-hosts\") {\n\t\ttools::calculate_harmonic_hosts();\n\t} else if (arg == \"--harmonic-links\") {\n\t\ttools::calculate_harmonic_links();\n\t} else if (arg == \"--harmonic\") {\n\t\ttools::calculate_harmonic();\n\t} else if (arg == \"--host-hash\" && argc > 2) {\n\t\tURL url(argv[2]);\n\t\tcout << url.host_hash() << endl;\n\t} else if (arg == \"--url-hash\" && argc > 2) {\n\t\tURL url(argv[2]);\n\t\tcout << url.hash() << endl;\n\t} else if (arg == \"--host-hash-mod\" && argc > 3) {\n\t\tURL url(argv[2]);\n\t\tcout << url.host_hash() % stoull(argv[3]) << endl;\n\t} else if (arg == \"--find-links\") {\n\t\ttools::find_links();\n\t} else if (arg == \"--console\") {\n\t\tindexer::console();\n\t} else if (arg == \"--index-links\") {\n\t\tindexer::index_links();\n\t} else if (arg == \"--index-urls\") {\n\t\tindexer::index_urls();\n\t} else if (arg == \"--make-domain-index\") {\n\t\tindexer::make_domain_index();\n\t} 
else if (arg == \"--make-domain-index-scores\") {\n\t\tindexer::make_domain_index_scores();\n\t} else if (arg == \"--truncate-links\") {\n\t\tindexer::truncate_links();\n\t} else if (arg == \"--make-url-bloom\") {\n\t\tindexer::make_url_bloom_filter();\n\t} else {\n\t\thelp();\n\t}\n\n\tlogger::join_logger_thread();\n\n\treturn 0;\n}\n"
  },
  {
    "path": "src/logger/logger.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include \"logger.h\"\n#include <thread>\n#include <queue>\n\nusing namespace std;\n\nnamespace logger {\n\n\tthread m_logger_thread;\n\tmutex *m_lock = nullptr;\n\tqueue<string> *m_queue = nullptr;\n\tofstream m_file;\n\tchrono::seconds m_reopen_interval = std::chrono::seconds(300);\n\tchrono::system_clock::time_point m_last_reopen;\n\tbool m_verbose = false;\n\tbool m_run_logger = true;\n\tbool m_logger_started = false;\n\n\tvoid verbose(bool verbose) {\n\t\tm_verbose = verbose;\n\t}\n\n\tvoid initialize() {\n\t\tm_lock = new mutex;\n\t\tm_queue = new queue<string>;\n\t\tm_logger_started = true;\n\t}\n\n\tvoid de_initialize() {\n\t\tdelete m_lock;\n\t\tdelete m_queue;\n\t\tm_lock = nullptr;\n\t\tm_queue = nullptr;\n\t\tm_logger_started = 
false;\n\t}\n\n\tvoid reopen() {\n\t\tauto now = chrono::system_clock::now();\n\t\tm_lock->lock();\n\t\tif (now - m_last_reopen > m_reopen_interval) {\n\t\t\tm_last_reopen = now;\n\t\t\ttry {\n\t\t\t\tm_file.close();\n\t\t\t} catch (...) {\n\n\t\t\t}\n\t\t\ttry {\n\t\t\t\tm_file.open(config::log_file_path, ofstream::out | ofstream::app);\n\t\t\t\tm_last_reopen = chrono::system_clock::now();\n\t\t\t} catch (exception &error) {\n\t\t\t\ttry {\n\t\t\t\t\tm_file.close();\n\t\t\t\t} catch (...) {\n\t\t\t\t\t\n\t\t\t\t}\n\t\t\t\tthrow error;\n\t\t\t}\n\t\t}\n\t\tm_lock->unlock();\n\t}\n\n\tstring timestamp() {\n\t\tchrono::system_clock::time_point tp = std::chrono::system_clock::now();\n\t\ttime_t tt = std::chrono::system_clock::to_time_t(tp);\n\t\ttm gmt{}; gmtime_r(&tt, &gmt);\n\t\tstring buffer(100, 'x');\n\t\tsprintf(&buffer.front(), \"%04d-%02d-%02d %02d:%02d:%02d\", gmt.tm_year + 1900, (short)gmt.tm_mon + 1,\n\t\t\t(short)gmt.tm_mday, (short)gmt.tm_hour, (short)gmt.tm_min, (short)gmt.tm_sec);\n\t\tbuffer.resize(19);\n\t\treturn buffer;\n\t}\n\n\tstring format(const string &type, const string &file, int line, const string &message, const string &meta) {\n\t\tstring output;\n\t\toutput.append(timestamp());\n\t\toutput.append(\" [\" + type + \"]\");\n\t\toutput.append(\" \" + file + \":\" + to_string(line));\n\t\toutput.append(\" \" + message);\n\t\toutput.append(\" \" + meta);\n\t\treturn output;\n\t}\n\n\tvoid log_message(const string &type, const string &file, int line, const string &message, const string &meta) {\n\t\tlog_string(format(type, file, line, message, meta));\n\t}\n\n\tvoid log_string(const string &message) {\n\t\tif (!m_logger_started || m_lock == nullptr || m_queue == nullptr) return; // logger thread not started.\n\t\tm_lock->lock();\n\t\tif (m_verbose) cout << message << endl;\n\t\tm_queue->push(message);\n\t\tm_lock->unlock();\n\t}\n\n\tvoid log(const string &type, const string &file, int line, const string &message) {\n\t\tlog_message(type, file, 
line, message, \"\");\n\t}\n\n\tvoid write_message_to_logfile(const string &message) {\n\t\tm_file << message << endl;\n\t}\n\n\tvoid logger_thread() {\n\t\tinitialize();\n\t\treopen();\n\t\twhile (true) {\n\t\t\twhile (m_queue->empty() && m_run_logger) {\n\t\t\t\tstd::this_thread::sleep_for(std::chrono::milliseconds(50));\n\t\t\t}\n\n\t\t\tif (m_queue->empty()) break;\n\n\t\t\tm_lock->lock();\n\t\t\tstring message = m_queue->front();\n\t\t\tm_queue->pop();\n\t\t\tm_lock->unlock();\n\n\t\t\twrite_message_to_logfile(message);\n\t\t}\n\n\t\tde_initialize();\n\t}\n\n\tvoid start_logger_thread() {\n\t\tif (!m_logger_started) {\n\t\t\tm_run_logger = true;\n\t\t\tm_logger_thread = thread(logger_thread);\n\t\t}\n\n\t\t// Wait for logger thread to start.\n\t\tfor (size_t i = 0; i < 20 && !m_logger_started; i++) {\n\t\t\tthis_thread::sleep_for(1ms);\n\t\t}\n\t}\n\n\tvoid join_logger_thread() {\n\t\tif (m_logger_started) {\n\t\t\tm_run_logger = false;\n\t\t\tm_logger_thread.join();\n\t\t\tm_verbose = false;\n\t\t}\n\t}\n\n\tvoid sync() {\n\t\tstd::this_thread::sleep_for(std::chrono::milliseconds(100));\n\t}\n\n\tlogged_exception::logged_exception(const string &message, const string &file, int line)\n\t: m_message(message), m_file(file), m_line(line)\n\t{\n\t\tm_formatted_message = format(\"EXCEPTION\", m_file, m_line, m_message, \"\");\n\t}\n}\n\n"
  },
  {
    "path": "src/logger/logger.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include \"config.h\"\n#include <mutex>\n#include <fstream>\n#include <iostream>\n\n#define LOG_INFO(msg) (logger::log(\"info\", __FILE__, __LINE__, msg))\n#define LOG_ERROR(msg) (logger::log(\"error\", __FILE__, __LINE__, msg))\n\n#define LOG_ERROR_EXCEPTION(msg) (logger::logged_exception(msg, std::string(__FILE__), __LINE__))\n\nnamespace logger {\n\n\tvoid verbose(bool verbose);\n\tvoid reopen();\n\tstd::string timestamp();\n\tvoid log_message(const std::string &type, const std::string &file, int line, const std::string &message, const std::string &meta);\n\tvoid log_string(const std::string &message);\n\n\t// Should be called like this: logger::log(\"error\", __FILE__, __LINE__, error.what());\n\tvoid log(const std::string &type, 
const std::string &file, int line, const std::string &message);\n\tvoid log(const std::string &type, const std::string &file, int line, const std::string &message, const std::string &meta);\n\n\tvoid start_logger_thread();\n\tvoid join_logger_thread();\n\tvoid sync();\n\n\tclass logged_exception : public std::exception {\n\n\t\tpublic:\n\t\t\tlogged_exception(const std::string &message, const std::string &file, int line);\n\n\t\t\tconst char *what() const throw () {\n\t\t\t\treturn m_formatted_message.c_str();\n\t\t\t}\n\n\t\tprivate:\n\n\t\t\tstd::string m_message;\n\t\t\tstd::string m_file;\n\t\t\tint m_line;\n\t\t\tstd::string m_formatted_message;\n\n\t};\n\n}\n\n"
  },
  {
    "path": "src/memory/debugger.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include \"debugger.h\"\n#include \"memory.h\"\n#include \"logger/logger.h\"\n#include <iostream>\n#include <cstdlib>\n#include <array>\n\nusing namespace std;\n\n/*\n\tThis memory manager exists so that we can know exactly how many bytes we currently have allocated. Since the OS\n\tis running a virtual memory system we can only know exactly how many bytes we have allocated right now if we\n\tkeep a counter ourselves.\n\n\tTo do this we overload the global new, new[], delete and delete[] operators.\n\n\tThe problem is knowing how much memory is freed, we only have the pointer and not the length. 
To fix this issue\n\twe allocate sizeof(size_t) bytes more when allocating memory, then we store the length there and return a pointer\n\tto the address at offset \"sizeof(size_t)\" from the allocated pointer.\n\n\tThis seems like absolute madness at first but I don't have any other solution.\n*/\n\n#include \"sys/types.h\"\n#include \"sys/sysinfo.h\"\n\nnamespace memory {\n\n\tatomic_size_t mem_counter;\n\tsize_t ptr_counter;\n\tsize_t total_memory_on_host;\n\n\tvoid incr_mem_counter(size_t n) {\n\t\tmem_counter += n;\n\t}\n\n\tvoid decr_mem_counter(size_t n) {\n\t\tmem_counter -= n;\n\t}\n\n\tsize_t allocated_memory() {\n\t\treturn mem_counter;\n\t}\n\n\tsize_t num_allocated() {\n\t\treturn ptr_counter;\n\t}\n\n\tsize_t record_usage_base = 0;\n\tsize_t record_usage_peak = 0;\n\tsize_t global_usage_peak = 0;\n\n\tvoid reset_usage() {\n\t\trecord_usage_base = allocated_memory();\n\t\trecord_usage_peak = record_usage_base;\n\t}\n\n\tvoid record_usage() {\n\t\tif (record_usage_peak < allocated_memory()) {\n\t\t\trecord_usage_peak = allocated_memory();\n\t\t}\n\t\tif (global_usage_peak < get_usage()) {\n\t\t\tglobal_usage_peak = get_usage();\n\t\t}\n\t}\n\n\tsize_t get_usage() {\n\t\treturn record_usage_peak - record_usage_base;\n\t}\n\n\tsize_t get_usage_peak() {\n\t\treturn global_usage_peak;\n\t}\n\n}\n\n"
  },
  {
    "path": "src/memory/debugger.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include <iostream>\n#include <atomic>\n\nnamespace memory {\n\n\tvoid incr_mem_counter(size_t n);\n\tvoid decr_mem_counter(size_t n);\n\tsize_t allocated_memory(); // Returns number of allocated bytes.\n\tsize_t num_allocated(); // Returns number of allocated pointers.\n\n\tvoid reset_usage();\n\tvoid record_usage();\n\tsize_t get_usage();\n\tsize_t get_usage_peak();\n\n}\n"
  },
  {
    "path": "src/memory/memory.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include \"memory.h\"\n#include <unistd.h>\n#include <iostream>\n#include <fstream>\n\nnamespace memory {\n\n\tsize_t available_memory = 0;\n\tsize_t used_memory = 0;\n\tsize_t total_memory = 0;\n\n\tsize_t get_available_memory() {\n\t\treturn available_memory;\n\t}\n\n\tsize_t get_used_memory() {\n\t\treturn used_memory;\n\t}\n\n\tsize_t get_total_memory() {\n\t\treturn total_memory;\n\t}\n\n\t/*\n\t * inspired by https://stackoverflow.com/questions/349889/how-do-you-determine-the-amount-of-linux-system-ram-in-c\n\t * */\n\tvoid update() {\n\n\t\t{\n\t\t\tstd::string token;\n\t\t\tstd::ifstream infile(\"/proc/meminfo\", std::ios::in);\n\t\t\tif (infile.is_open()) {\n\t\t\t\twhile (infile >> token) {\n\t\t\t\t\tif (token == \"MemAvailable:\") 
{\n\t\t\t\t\t\tsize_t mem;\n\t\t\t\t\t\tif (infile >> mem) {\n\t\t\t\t\t\t\tavailable_memory = mem * 1000;\n\t\t\t\t\t\t} else {\n\t\t\t\t\t\t\tavailable_memory = 0;\n\t\t\t\t\t\t}\n\t\t\t\t\t}\n\t\t\t\t\tif (token == \"MemTotal:\") {\n\t\t\t\t\t\tsize_t mem;\n\t\t\t\t\t\tif (infile >> mem) {\n\t\t\t\t\t\t\ttotal_memory = mem * 1000;\n\t\t\t\t\t\t} else {\n\t\t\t\t\t\t\ttotal_memory = 0;\n\t\t\t\t\t\t}\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\n\t\t{\n\t\t\tconst size_t pid = getpid();\n\t\t\tstd::string token;\n\t\t\tstd::ifstream infile(\"/proc/\" + std::to_string(pid) + \"/stat\", std::ios::in);\n\t\t\tif (infile.is_open()) {\n\n\t\t\t\tsize_t counter = 1;\n\t\t\t\twhile (infile >> token) {\n\t\t\t\t\tif (counter == 23) {\n\t\t\t\t\t\tused_memory = std::stoull(token);\n\t\t\t\t\t\tbreak;\n\t\t\t\t\t}\n\t\t\t\t\tcounter++;\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t}\n\n}\n"
  },
  {
    "path": "src/memory/memory.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include <iostream>\n\nnamespace memory {\n\n\t/*\n\t\tReturns the number of bytes of available memory on the system. So this is how much our virtual memory can expand.\n\t*/\n\tsize_t get_available_memory();\n\n\t/*\n\t\tReturns the size in bytes of our virtual memory (vsize)\n\t\tsame as vsize in /proc/[pid]/stat here: https://man7.org/linux/man-pages/man5/proc.5.html\n\t*/\n\tsize_t get_used_memory();\n\n\t/*\n\t\tReturns the total number of bytes in the system RAM.\n\t*/\n\tsize_t get_total_memory();\n\n\tvoid update();\n\n}\n"
  },
  {
    "path": "src/memory/overload.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include \"debugger.h\"\n#include <new>\n\nusing namespace std;\n\n/*\n\tOverload the global new, new[], delete and delete[] operators.\n*/\n// https://en.cppreference.com/w/cpp/memory/new/operator_new\n\nvoid *operator new(size_t n) {\n\n\tvoid *m = malloc(n + sizeof(size_t));\n\n\tif (m) {\n\t\tmemory::incr_mem_counter(n);\n\n\t\tstatic_cast<size_t *>(m)[0] = n;\n\t\treturn &(static_cast<size_t *>(m)[1]);\n\t}\n\n\tthrow bad_alloc();\n}\n\nvoid *operator new[](size_t n) {\n\n\tvoid *m = malloc(n + sizeof(size_t));\n\n\tif (m) {\n\t\tmemory::incr_mem_counter(n);\n\n\t\tstatic_cast<size_t *>(m)[0] = n;\n\t\treturn &(static_cast<size_t *>(m)[1]);\n\t}\n\n\tthrow bad_alloc();\n}\n\nvoid operator delete(void *p) noexcept {\n\n\tvoid *realp = 
&(static_cast<size_t *>(p)[-1]);\n\tconst size_t n = static_cast<size_t *>(p)[-1];\n\n\tmemory::decr_mem_counter(n);\n\n\tfree(realp);\n}\n\nvoid operator delete[](void *p) noexcept {\n\n\tvoid *realp = &(static_cast<size_t *>(p)[-1]);\n\tconst size_t n = static_cast<size_t *>(p)[-1];\n\n\tmemory::decr_mem_counter(n);\n\n\tfree(realp);\n}\n\n"
  },
  {
    "path": "src/parser/cc_parser.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n// main.cpp\n#include \"config.h\"\n#include \"warc/warc.h\"\n#include \"common/ThreadPool.h\"\n#include \"logger/logger.h\"\n#include \"text/text.h\"\n#include \"transfer/transfer.h\"\n#include <iostream>\n\nusing namespace std;\n\nnamespace parser {\n\n\tvoid run_downloader(const string &warc_path) {\n\n\t\twarc::parser pp;\n\t\twarc::multipart_download(\"http://commoncrawl.s3.amazonaws.com/\" + warc_path, [&pp](const string &chunk) {\n\t\t\tstringstream ss(chunk);\n\t\t\tpp.parse_stream(ss);\n\t\t});\n\n\t\tLOG_INFO(\"uploading: \" + warc_path);\n\t\tint error;\n\t\terror = transfer::upload_gz_file(warc::get_result_path(warc_path), pp.result());\n\t\terror = transfer::upload_gz_file(warc::get_link_result_path(warc_path), pp.link_result());\n\n\t\tif 
(error) {\n\t\t\tLOG_INFO(\"error uploading: \" + warc_path);\n\t\t}\n\t}\n\n\tvoid start_downloaders(const vector<string> &warc_paths) {\n\t\tconst size_t num_threads = 48;\n\t\tThreadPool pool(num_threads);\n\t\tvector<future<void>> results;\n\n\t\tfor (const string &warc_path : warc_paths) {\n\t\t\tresults.emplace_back(pool.enqueue([warc_path, num_threads] {\n\t\t\t\tsleep(rand() % (num_threads * 2));\n\t\t\t\trun_downloader(warc_path);\n\t\t\t}));\n\t\t}\n\n\t\tfor(auto &&result: results) {\n\t\t\tresult.get();\n\t\t}\n\t}\n\n\tvector<string> download_warc_paths() {\n\t\tint error;\n\t\tstring content = transfer::file_to_string(\"nodes/\" + config::node + \"/warc.paths\", error);\n\t\tif (error == transfer::ERROR) return {};\n\n\t\tcontent = text::trim(content);\n\n\t\tvector<string> raw_warc_paths;\n\t\tboost::algorithm::split(raw_warc_paths, content, boost::is_any_of(\"\\n\"));\n\n\t\tvector<string> warc_paths;\n\t\tfor (const string &warc_path : raw_warc_paths) {\n\t\t\tif (text::trim(warc_path).size()) {\n\t\t\t\twarc_paths.push_back(text::trim(warc_path));\n\t\t\t}\n\t\t}\n\n\t\treturn warc_paths;\n\t}\n\n\tbool upload_warc_paths(const vector<string> &warc_paths) {\n\t\tstring content = boost::algorithm::join(warc_paths, \"\\n\");\n\t\tint error = transfer::upload_file(\"nodes/\" + config::node + \"/warc.paths\", content);\n\t\treturn error == transfer::OK;\n\t}\n\n\tvoid warc_downloader() {\n\n\t\tconst size_t timeout = 300;\n\t\tconst size_t limit = 500;\n\n\t\t// main loop\n\t\twhile (true) {\n\n\t\t\t// Check if there are any urls to digest every 'timeout' minutes.\n\t\t\tvector<string> warc_paths = download_warc_paths();\n\n\t\t\tif (warc_paths.size() > 0) {\n\t\t\t\t// Digest 'limit' number of warc paths.\n\t\t\t\tvector<string> warc_paths_to_download;\n\t\t\t\twhile (warc_paths_to_download.size() < limit && warc_paths.size() > 0) 
{\n\t\t\t\t\twarc_paths_to_download.push_back(warc_paths.back());\n\t\t\t\t\twarc_paths.pop_back();\n\t\t\t\t}\n\n\t\t\t\tif (upload_warc_paths(warc_paths)) {\n\t\t\t\t\tstart_downloaders(warc_paths_to_download);\n\t\t\t\t} else {\n\t\t\t\t\tLOG_INFO(\"Fatal, could not upload warc paths, will not download\");\n\t\t\t\t}\n\t\t\t}\n\n\t\t\tsleep(timeout);\n\t\t}\n\t}\n}\n\n"
  },
  {
    "path": "src/parser/cc_parser.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include <iostream>\n#include <vector>\n\nnamespace parser {\n\n\tstd::vector<std::string> download_warc_paths();\n\tbool upload_warc_paths(const std::vector<std::string> &warc_paths);\n\n\tvoid warc_downloader();\n}\n"
  },
  {
    "path": "src/parser/entities.cpp",
    "content": "/*\tCopyright 2012, 2016 Christoph Gärtner\n\tDistributed under the Boost Software License, Version 1.0\n*/\n\n#include \"entities.h\"\n\n#include <errno.h>\n#include <stdbool.h>\n#include <stdlib.h>\n#include <string.h>\n\nusing namespace std;\n\n#define UNICODE_MAX 0x10FFFFul\n\nstatic const char *const NAMED_ENTITIES[][2] = {\n\t{ \"AElig;\", \"Æ\" },\n\t{ \"Aacute;\", \"Á\" },\n\t{ \"Acirc;\", \"Â\" },\n\t{ \"Agrave;\", \"À\" },\n\t{ \"Alpha;\", \"Α\" },\n\t{ \"Aring;\", \"Å\" },\n\t{ \"Atilde;\", \"Ã\" },\n\t{ \"Auml;\", \"Ä\" },\n\t{ \"Beta;\", \"Β\" },\n\t{ \"Ccedil;\", \"Ç\" },\n\t{ \"Chi;\", \"Χ\" },\n\t{ \"Dagger;\", \"‡\" },\n\t{ \"Delta;\", \"Δ\" },\n\t{ \"ETH;\", \"Ð\" },\n\t{ \"Eacute;\", \"É\" },\n\t{ \"Ecirc;\", \"Ê\" },\n\t{ \"Egrave;\", \"È\" },\n\t{ \"Epsilon;\", \"Ε\" },\n\t{ \"Eta;\", \"Η\" },\n\t{ \"Euml;\", \"Ë\" },\n\t{ \"Gamma;\", \"Γ\" },\n\t{ \"Iacute;\", \"Í\" },\n\t{ \"Icirc;\", \"Î\" },\n\t{ \"Igrave;\", \"Ì\" },\n\t{ \"Iota;\", \"Ι\" },\n\t{ \"Iuml;\", \"Ï\" },\n\t{ \"Kappa;\", \"Κ\" },\n\t{ \"Lambda;\", \"Λ\" },\n\t{ \"Mu;\", \"Μ\" },\n\t{ \"Ntilde;\", \"Ñ\" },\n\t{ \"Nu;\", \"Ν\" },\n\t{ \"OElig;\", \"Œ\" },\n\t{ \"Oacute;\", \"Ó\" },\n\t{ \"Ocirc;\", \"Ô\" },\n\t{ \"Ograve;\", \"Ò\" },\n\t{ \"Omega;\", \"Ω\" },\n\t{ \"Omicron;\", \"Ο\" },\n\t{ \"Oslash;\", \"Ø\" },\n\t{ \"Otilde;\", \"Õ\" },\n\t{ \"Ouml;\", \"Ö\" },\n\t{ \"Phi;\", \"Φ\" },\n\t{ \"Pi;\", \"Π\" },\n\t{ \"Prime;\", \"″\" },\n\t{ \"Psi;\", \"Ψ\" },\n\t{ \"Rho;\", \"Ρ\" },\n\t{ \"Scaron;\", \"Š\" },\n\t{ \"Sigma;\", \"Σ\" },\n\t{ \"THORN;\", \"Þ\" },\n\t{ \"Tau;\", \"Τ\" },\n\t{ \"Theta;\", \"Θ\" },\n\t{ \"Uacute;\", \"Ú\" },\n\t{ \"Ucirc;\", \"Û\" },\n\t{ \"Ugrave;\", \"Ù\" },\n\t{ \"Upsilon;\", \"Υ\" },\n\t{ \"Uuml;\", \"Ü\" },\n\t{ \"Xi;\", \"Ξ\" },\n\t{ \"Yacute;\", \"Ý\" },\n\t{ \"Yuml;\", \"Ÿ\" },\n\t{ \"Zeta;\", \"Ζ\" },\n\t{ \"aacute;\", \"á\" },\n\t{ \"acirc;\", \"â\" },\n\t{ \"acute;\", \"´\" },\n\t{ \"aelig;\", \"æ\" },\n\t{ \"agrave;\", \"à\" 
},\n\t{ \"alefsym;\", \"ℵ\" },\n\t{ \"alpha;\", \"α\" },\n\t{ \"amp;\", \"&\" },\n\t{ \"and;\", \"∧\" },\n\t{ \"ang;\", \"∠\" },\n\t{ \"apos;\", \"'\" },\n\t{ \"aring;\", \"å\" },\n\t{ \"asymp;\", \"≈\" },\n\t{ \"atilde;\", \"ã\" },\n\t{ \"auml;\", \"ä\" },\n\t{ \"bdquo;\", \"„\" },\n\t{ \"beta;\", \"β\" },\n\t{ \"brvbar;\", \"¦\" },\n\t{ \"bull;\", \"•\" },\n\t{ \"cap;\", \"∩\" },\n\t{ \"ccedil;\", \"ç\" },\n\t{ \"cedil;\", \"¸\" },\n\t{ \"cent;\", \"¢\" },\n\t{ \"chi;\", \"χ\" },\n\t{ \"circ;\", \"ˆ\" },\n\t{ \"clubs;\", \"♣\" },\n\t{ \"cong;\", \"≅\" },\n\t{ \"copy;\", \"©\" },\n\t{ \"crarr;\", \"↵\" },\n\t{ \"cup;\", \"∪\" },\n\t{ \"curren;\", \"¤\" },\n\t{ \"dArr;\", \"⇓\" },\n\t{ \"dagger;\", \"†\" },\n\t{ \"darr;\", \"↓\" },\n\t{ \"deg;\", \"°\" },\n\t{ \"delta;\", \"δ\" },\n\t{ \"diams;\", \"♦\" },\n\t{ \"divide;\", \"÷\" },\n\t{ \"eacute;\", \"é\" },\n\t{ \"ecirc;\", \"ê\" },\n\t{ \"egrave;\", \"è\" },\n\t{ \"empty;\", \"∅\" },\n\t{ \"emsp;\", \"\\xE2\\x80\\x83\" },\n\t{ \"ensp;\", \"\\xE2\\x80\\x82\" },\n\t{ \"epsilon;\", \"ε\" },\n\t{ \"equiv;\", \"≡\" },\n\t{ \"eta;\", \"η\" },\n\t{ \"eth;\", \"ð\" },\n\t{ \"euml;\", \"ë\" },\n\t{ \"euro;\", \"€\" },\n\t{ \"exist;\", \"∃\" },\n\t{ \"fnof;\", \"ƒ\" },\n\t{ \"forall;\", \"∀\" },\n\t{ \"frac12;\", \"½\" },\n\t{ \"frac14;\", \"¼\" },\n\t{ \"frac34;\", \"¾\" },\n\t{ \"frasl;\", \"⁄\" },\n\t{ \"gamma;\", \"γ\" },\n\t{ \"ge;\", \"≥\" },\n\t{ \"gt;\", \">\" },\n\t{ \"hArr;\", \"⇔\" },\n\t{ \"harr;\", \"↔\" },\n\t{ \"hearts;\", \"♥\" },\n\t{ \"hellip;\", \"…\" },\n\t{ \"iacute;\", \"í\" },\n\t{ \"icirc;\", \"î\" },\n\t{ \"iexcl;\", \"¡\" },\n\t{ \"igrave;\", \"ì\" },\n\t{ \"image;\", \"ℑ\" },\n\t{ \"infin;\", \"∞\" },\n\t{ \"int;\", \"∫\" },\n\t{ \"iota;\", \"ι\" },\n\t{ \"iquest;\", \"¿\" },\n\t{ \"isin;\", \"∈\" },\n\t{ \"iuml;\", \"ï\" },\n\t{ \"kappa;\", \"κ\" },\n\t{ \"lArr;\", \"⇐\" },\n\t{ \"lambda;\", \"λ\" },\n\t{ \"lang;\", \"〈\" },\n\t{ \"laquo;\", \"«\" },\n\t{ \"larr;\", \"←\" },\n\t{ \"lceil;\", 
\"⌈\" },\n\t{ \"ldquo;\", \"“\" },\n\t{ \"le;\", \"≤\" },\n\t{ \"lfloor;\", \"⌊\" },\n\t{ \"lowast;\", \"∗\" },\n\t{ \"loz;\", \"◊\" },\n\t{ \"lrm;\", \"\\xE2\\x80\\x8E\" },\n\t{ \"lsaquo;\", \"‹\" },\n\t{ \"lsquo;\", \"‘\" },\n\t{ \"lt;\", \"<\" },\n\t{ \"macr;\", \"¯\" },\n\t{ \"mdash;\", \"—\" },\n\t{ \"micro;\", \"µ\" },\n\t{ \"middot;\", \"·\" },\n\t{ \"minus;\", \"−\" },\n\t{ \"mu;\", \"μ\" },\n\t{ \"nabla;\", \"∇\" },\n\t{ \"nbsp;\", \" \" },\n\t{ \"ndash;\", \"–\" },\n\t{ \"ne;\", \"≠\" },\n\t{ \"ni;\", \"∋\" },\n\t{ \"not;\", \"¬\" },\n\t{ \"notin;\", \"∉\" },\n\t{ \"nsub;\", \"⊄\" },\n\t{ \"ntilde;\", \"ñ\" },\n\t{ \"nu;\", \"ν\" },\n\t{ \"oacute;\", \"ó\" },\n\t{ \"ocirc;\", \"ô\" },\n\t{ \"oelig;\", \"œ\" },\n\t{ \"ograve;\", \"ò\" },\n\t{ \"oline;\", \"‾\" },\n\t{ \"omega;\", \"ω\" },\n\t{ \"omicron;\", \"ο\" },\n\t{ \"oplus;\", \"⊕\" },\n\t{ \"or;\", \"∨\" },\n\t{ \"ordf;\", \"ª\" },\n\t{ \"ordm;\", \"º\" },\n\t{ \"oslash;\", \"ø\" },\n\t{ \"otilde;\", \"õ\" },\n\t{ \"otimes;\", \"⊗\" },\n\t{ \"ouml;\", \"ö\" },\n\t{ \"para;\", \"¶\" },\n\t{ \"part;\", \"∂\" },\n\t{ \"permil;\", \"‰\" },\n\t{ \"perp;\", \"⊥\" },\n\t{ \"phi;\", \"φ\" },\n\t{ \"pi;\", \"π\" },\n\t{ \"piv;\", \"ϖ\" },\n\t{ \"plusmn;\", \"±\" },\n\t{ \"pound;\", \"£\" },\n\t{ \"prime;\", \"′\" },\n\t{ \"prod;\", \"∏\" },\n\t{ \"prop;\", \"∝\" },\n\t{ \"psi;\", \"ψ\" },\n\t{ \"quot;\", \"\\\"\" },\n\t{ \"rArr;\", \"⇒\" },\n\t{ \"radic;\", \"√\" },\n\t{ \"rang;\", \"〉\" },\n\t{ \"raquo;\", \"»\" },\n\t{ \"rarr;\", \"→\" },\n\t{ \"rceil;\", \"⌉\" },\n\t{ \"rdquo;\", \"”\" },\n\t{ \"real;\", \"ℜ\" },\n\t{ \"reg;\", \"®\" },\n\t{ \"rfloor;\", \"⌋\" },\n\t{ \"rho;\", \"ρ\" },\n\t{ \"rlm;\", \"\\xE2\\x80\\x8F\" },\n\t{ \"rsaquo;\", \"›\" },\n\t{ \"rsquo;\", \"’\" },\n\t{ \"sbquo;\", \"‚\" },\n\t{ \"scaron;\", \"š\" },\n\t{ \"sdot;\", \"⋅\" },\n\t{ \"sect;\", \"§\" },\n\t{ \"shy;\", \"\\xC2\\xAD\" },\n\t{ \"sigma;\", \"σ\" },\n\t{ \"sigmaf;\", \"ς\" },\n\t{ \"sim;\", \"∼\" },\n\t{ \"spades;\", 
\"♠\" },\n\t{ \"sub;\", \"⊂\" },\n\t{ \"sube;\", \"⊆\" },\n\t{ \"sum;\", \"∑\" },\n\t{ \"sup1;\", \"¹\" },\n\t{ \"sup2;\", \"²\" },\n\t{ \"sup3;\", \"³\" },\n\t{ \"sup;\", \"⊃\" },\n\t{ \"supe;\", \"⊇\" },\n\t{ \"szlig;\", \"ß\" },\n\t{ \"tau;\", \"τ\" },\n\t{ \"there4;\", \"∴\" },\n\t{ \"theta;\", \"θ\" },\n\t{ \"thetasym;\", \"ϑ\" },\n\t{ \"thinsp;\", \"\\xE2\\x80\\x89\" },\n\t{ \"thorn;\", \"þ\" },\n\t{ \"tilde;\", \"˜\" },\n\t{ \"times;\", \"×\" },\n\t{ \"trade;\", \"™\" },\n\t{ \"uArr;\", \"⇑\" },\n\t{ \"uacute;\", \"ú\" },\n\t{ \"uarr;\", \"↑\" },\n\t{ \"ucirc;\", \"û\" },\n\t{ \"ugrave;\", \"ù\" },\n\t{ \"uml;\", \"¨\" },\n\t{ \"upsih;\", \"ϒ\" },\n\t{ \"upsilon;\", \"υ\" },\n\t{ \"uuml;\", \"ü\" },\n\t{ \"weierp;\", \"℘\" },\n\t{ \"xi;\", \"ξ\" },\n\t{ \"yacute;\", \"ý\" },\n\t{ \"yen;\", \"¥\" },\n\t{ \"yuml;\", \"ÿ\" },\n\t{ \"zeta;\", \"ζ\" },\n\t{ \"zwj;\", \"\\xE2\\x80\\x8D\" },\n\t{ \"zwnj;\", \"\\xE2\\x80\\x8C\" }\n};\n\nstatic int cmp(const void *key, const void *value)\n{\n\treturn strncmp((const char *)key, *(const char *const *)value,\n\t\tstrlen(*(const char *const *)value));\n}\n\nstatic const char *get_named_entity(const char *name)\n{\n\tconst char *const *entity = (const char *const *)bsearch(name,\n\t\tNAMED_ENTITIES, sizeof NAMED_ENTITIES / sizeof *NAMED_ENTITIES,\n\t\tsizeof *NAMED_ENTITIES, cmp);\n\n\treturn entity ? 
entity[1] : NULL;\n}\n\nstatic size_t putc_utf8(unsigned long cp, char *buffer)\n{\n\tunsigned char *bytes = (unsigned char *)buffer;\n\n\tif(cp <= 0x007Ful)\n\t{\n\t\tbytes[0] = (unsigned char)cp;\n\t\treturn 1;\n\t}\n\n\tif(cp <= 0x07FFul)\n\t{\n\t\tbytes[1] = (unsigned char)((2 << 6) | (cp & 0x3F));\n\t\tbytes[0] = (unsigned char)((6 << 5) | (cp >> 6));\n\t\treturn 2;\n\t}\n\n\tif(cp <= 0xFFFFul)\n\t{\n\t\tbytes[2] = (unsigned char)(( 2 << 6) | ( cp       & 0x3F));\n\t\tbytes[1] = (unsigned char)(( 2 << 6) | ((cp >> 6) & 0x3F));\n\t\tbytes[0] = (unsigned char)((14 << 4) |  (cp >> 12));\n\t\treturn 3;\n\t}\n\n\tif(cp <= 0x10FFFFul)\n\t{\n\t\tbytes[3] = (unsigned char)(( 2 << 6) | ( cp        & 0x3F));\n\t\tbytes[2] = (unsigned char)(( 2 << 6) | ((cp >>  6) & 0x3F));\n\t\tbytes[1] = (unsigned char)(( 2 << 6) | ((cp >> 12) & 0x3F));\n\t\tbytes[0] = (unsigned char)((30 << 3) |  (cp >> 18));\n\t\treturn 4;\n\t}\n\n\treturn 0;\n}\n\nstatic bool parse_entity(\n\tconst char *current, char **to, const char **from)\n{\n\tconst char *end = strchr(current, ';');\n\tif(!end) return 0;\n\n\tif(current[1] == '#')\n\t{\n\t\tchar *tail = NULL;\n\t\tint errno_save = errno;\n\t\tbool hex = current[2] == 'x' || current[2] == 'X';\n\n\t\terrno = 0;\n\t\tunsigned long cp = strtoul(\n\t\t\tcurrent + (hex ? 3 : 2), &tail, hex ? 
16 : 10);\n\n\t\tbool fail = errno || tail != end || cp > UNICODE_MAX;\n\t\terrno = errno_save;\n\t\tif(fail) return 0;\n\n\t\t*to += putc_utf8(cp, *to);\n\t\t*from = end + 1;\n\n\t\treturn 1;\n\t}\n\telse\n\t{\n\t\tconst char *entity = get_named_entity(&current[1]);\n\t\tif(!entity) return 0;\n\n\t\tsize_t len = strlen(entity);\n\t\tmemcpy(*to, entity, len);\n\n\t\t*to += len;\n\t\t*from = end + 1;\n\n\t\treturn 1;\n\t}\n}\n\nsize_t decode_html_entities_utf8(char *dest, const char *src)\n{\n\tif(!src) src = dest;\n\n\tchar *to = dest;\n\tconst char *from = src;\n\n\tfor(const char *current; (current = strchr(from, '&'));)\n\t{\n\t\tmemmove(to, from, (size_t)(current - from));\n\t\tto += current - from;\n\n\t\tif(parse_entity(current, &to, &from))\n\t\t\tcontinue;\n\n\t\tfrom = current;\n\t\t*to++ = *from++;\n\t}\n\n\tsize_t remaining = strlen(from);\n\n\tmemmove(to, from, remaining);\n\tto += remaining;\n\t*to = 0;\n\n\treturn (size_t)(to - dest);\n}\n"
  },
  {
    "path": "src/parser/entities.h",
    "content": "/*\tCopyright 2012 Christoph Gärtner\n\tDistributed under the Boost Software License, Version 1.0\n*/\n\n#ifndef DECODE_HTML_ENTITIES_UTF8_\n#define DECODE_HTML_ENTITIES_UTF8_\n\n#include <stddef.h>\n\nextern size_t decode_html_entities_utf8(char *dest, const char *src);\n/*\tTakes input from <src> and decodes into <dest>, which should be a buffer\n\tlarge enough to hold <strlen(src) + 1> characters.\n\n\tIf <src> is <NULL>, input will be taken from <dest>, decoding\n\tthe entities in-place.\n\n\tThe function returns the length of the decoded string.\n*/\n\n#endif\n"
  },
  {
    "path": "src/parser/html_link.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include \"html_link.h\"\n\nusing namespace std;\n\nnamespace parser {\n\n\thtml_link::html_link(const string &host, const string &path, const string &target_host, const string &target_path, bool nofollow,\n\t\tconst string &text) :\n\t\tm_host(host),\n\t\tm_path(path),\n\t\tm_target_host(target_host),\n\t\tm_target_path(target_path),\n\t\tm_nofollow(nofollow),\n\t\tm_text(text)\n\t{\n\t\t\n\t}\n\n\thtml_link::html_link(const string &host, const string &path, const string &target_host, const string &target_path, bool nofollow) :\n\t\tm_host(host),\n\t\tm_path(path),\n\t\tm_target_host(target_host),\n\t\tm_target_path(target_path),\n\t\tm_nofollow(nofollow)\n\t{\n\t\t\n\t}\n\n\thtml_link::~html_link() {}\n\n}\n"
  },
  {
    "path": "src/parser/html_link.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include <string>\n#include \"URL.h\"\n\nnamespace parser {\n\n\tclass html_link {\n\n\t\tpublic:\n\t\t\thtml_link(const std::string &host, const std::string &path, const std::string &target_host, const std::string &target_path, bool nofollow,\n\t\t\t\tconst std::string &text);\n\t\t\thtml_link(const std::string &host, const std::string &path, const std::string &target_host, const std::string &target_path, bool nofollow);\n\t\t\t~html_link();\n\n\t\t\tURL source_url() const { return URL(m_host, m_path); };\n\t\t\tURL target_url() const { return URL(m_target_host, m_target_path); };\n\t\t\tstd::string host() const { return m_host; };\n\t\t\tstd::string path() const { return m_path; };\n\t\t\tstd::string target_host() const { return 
m_target_host; };\n\t\t\tstd::string target_path() const { return m_target_path; };\n\t\t\tbool nofollow() const { return m_nofollow; };\n\t\t\tstd::string text() const {return m_text; };\n\n\t\tprivate:\n\t\t\tstd::string m_host;\n\t\t\tstd::string m_path;\n\t\t\tstd::string m_target_host;\n\t\t\tstd::string m_target_path;\n\t\t\tbool m_nofollow;\n\t\t\tstd::string m_text;\n\n\t};\n\n}\n"
  },
  {
    "path": "src/parser/html_parser.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include \"entities.h\"\n#include \"html_parser.h\"\n#include \"parser.h\"\n#include \"config.h\"\n#include \"text/text.h\"\n#include <curl/curl.h>\n\nusing namespace std;\n\nnamespace parser {\n\n\tconst vector<string> non_content_tags{\"script\", \"noscript\", \"style\", \"embed\", \"label\", \"form\", \"input\",\n\t\t\"iframe\", \"head\", \"meta\", \"link\", \"object\", \"aside\", \"channel\", \"img\"};\n\n\thtml_parser::html_parser()\n\t: m_long_text_len(100000)\n\t{\n\t\tm_long_str_buf = std::make_unique<char[]>(m_long_text_len);\n\t\tm_clean_buff = std::make_unique<char[]>(m_long_text_len);\n\t\tm_encoding_buffer = std::make_unique<unsigned char []>(m_long_text_len);\n\t}\n\n\thtml_parser::html_parser(size_t long_text_len)\n\t: 
m_long_text_len(long_text_len)\n\t{\n\t\tm_long_str_buf = std::make_unique<char[]>(m_long_text_len);\n\t\tm_clean_buff = std::make_unique<char[]>(m_long_text_len);\n\t\tm_encoding_buffer = std::make_unique<unsigned char []>(m_long_text_len);\n\t}\n\n\thtml_parser::~html_parser() {\n\t}\n\n\tvoid html_parser::parse(const string &html) {\n\t\tparse(html, \"\");\n\t}\n\n\tvoid html_parser::parse(const string &html, const string &url) {\n\n\t\tm_should_insert = false;\n\t\tm_should_insert = false;\n\n\t\tparse_url(url, m_host, m_path, \"\");\n\n\t\tm_title.clear();\n\t\tm_h1.clear();\n\t\tm_meta.clear();\n\t\tm_text.clear();\n\t\tm_invisible_pos.clear();\n\t\tm_links.clear();\n\t\tm_internal_links.clear();\n\n\t\tparse_encoding(html);\n\t\tif (m_encoding == ENC_UNKNOWN) {\n\t\t\tm_should_insert = false;\n\t\t\treturn;\n\t\t}\n\n\t\tfind_scripts(html);\n\t\tfind_styles(html);\n\t\tsort_invisible();\n\t\tfind_links(html, url);\n\n\t\tm_title = get_tag_content(html, \"<title\", \"</title>\");\n\t\tm_h1 = get_tag_content(html, \"<h1\", \"</h1>\");\n\t\tm_meta = get_meta_tag(html);\n\t\tm_text = get_text_content(html);\n\n\t\tif (m_encoding == ENC_ISO_8859_1) {\n\t\t\tiso_to_utf8(m_title);\n\t\t\tiso_to_utf8(m_h1);\n\t\t\tiso_to_utf8(m_meta);\n\t\t\tiso_to_utf8(m_text);\n\t\t}\n\n\t\tclean_text(m_title);\n\t\tif (m_title.size() == 0 || is_exotic_language(m_title) || m_title.size() > HTML_PARSER_MAX_TITLE_LEN) return;\n\t\tm_should_insert = true;\n\n\t\tclean_text(m_h1);\n\t\tclean_text(m_meta);\n\t\tclean_text(m_text);\n\n\t\tif (m_h1.size() > HTML_PARSER_MAX_H1_LEN) {\n\t\t\tm_should_insert = false;\n\t\t\treturn;\n\t\t}\n\t}\n\n\tvoid html_parser::find_scripts(const string &html) {\n\t\tsize_t pos = 0;\n\t\tpair<size_t, size_t> tag(0, 0);\n\t\twhile (pos != string::npos) {\n\t\t\ttag = find_tag(html, \"<script\", \"</script>\", tag.second);\n\t\t\tif (tag.second == string::npos) {\n\t\t\t\tbreak;\n\t\t\t}\n\t\t\tm_invisible_pos.push_back(tag);\n\t\t}\n\t}\n\n\tvoid 
html_parser::find_styles(const string &html) {\n\t\tsize_t pos = 0;\n\t\tpair<size_t, size_t> tag(0, 0);\n\t\twhile (pos != string::npos) {\n\t\t\ttag = find_tag(html, \"<style\", \"</style>\", tag.second);\n\t\t\tif (tag.second == string::npos) {\n\t\t\t\tbreak;\n\t\t\t}\n\t\t\tm_invisible_pos.push_back(tag);\n\t\t}\n\t}\n\n\tvoid html_parser::find_links(const string &html, const string &base_url) {\n\t\tsize_t pos = 0;\n\t\tpair<size_t, size_t> tag(0, 0);\n\t\twhile (pos != string::npos) {\n\t\t\ttag = find_tag(html, \"<a \", \"</a>\", tag.second);\n\t\t\tif (tag.second == string::npos) {\n\t\t\t\tbreak;\n\t\t\t}\n\n\t\t\tparse_link(html.substr(tag.first, tag.second - tag.first), base_url);\n\t\t}\n\t}\n\n\tint html_parser::parse_link(const string &link, const string &base_url) {\n\t\tconst string href_key = \"href=\\\"\";\n\t\tconst size_t key_len = href_key.size();\n\t\tconst size_t href_start = link.find(href_key);\n\t\tif (href_start == string::npos) return ::parser::ERROR;\n\t\tconst size_t href_end = link.find(\"\\\"\", href_start + key_len);\n\t\tif (href_end == string::npos) return ::parser::ERROR;\n\t\tstring href = link.substr(href_start + key_len, href_end - href_start - key_len);\n\n\t\tconst string rel_key = \"rel=\\\"\";\n\t\tconst size_t rel_key_len = rel_key.size();\n\t\tconst size_t rel_start = link.find(rel_key);\n\t\tbool nofollow = false;\n\t\tif (rel_start != string::npos) {\n\t\t\t// \"rel=\" present in string\n\t\t\tconst size_t rel_end = link.find(\"\\\"\", rel_start + key_len);\n\t\t\tconst string rel = link.substr(rel_start + rel_key_len, rel_end - rel_start - rel_key_len);\n\t\t\tif (rel.find(\"nofollow\") != string::npos) nofollow = true;\n\t\t}\n\n\t\tstring host;\n\t\tstring path;\n\t\tif (parse_url(href, host, path, base_url) != ::parser::OK) return ::parser::ERROR;\n\n\t\tif (host == m_host) {\n\t\t\t// Ignore internal links for now.\n\t\t\tif (!nofollow) {\n\t\t\t\tm_internal_links.emplace_back(std::make_pair(URL(m_host, 
m_path).hash(), URL(host, path).hash()));\n\t\t\t}\n\t\t\treturn ::parser::OK;\n\t\t}\n\n\t\tconst size_t content_start = link.find(\">\", href_end) + 1;\n\t\tif (content_start == string::npos) return ::parser::ERROR;\n\t\tconst size_t content_end = link.find(\"</a>\", content_start);\n\t\tstring content = link.substr(content_start, content_end - content_start);\n\n\t\tif (m_encoding == ENC_ISO_8859_1) {\n\t\t\tiso_to_utf8(content);\n\t\t}\n\t\tclean_text(content);\n\n\t\tif (content == \"\") return ::parser::ERROR;\n\n\t\tm_links.push_back(html_link(m_host, m_path, host, path, nofollow, content));\n\n\t\treturn ::parser::OK;\n\t}\n\n\tint html_parser::parse_url(const string &url, string &host, string &path, const string &base_url) {\n\t\tCURLU *h = curl_url();\n\t\tif (!h) return ::parser::ERROR;\n\n\t\tif (base_url.size()) {\n\t\t\tcurl_url_set(h, CURLUPART_URL, base_url.c_str(), 0);\n\t\t}\n\n\t\tCURLUcode uc = curl_url_set(h, CURLUPART_URL, url.c_str(), 0);\n\t\tif (uc) {\n\t\t\tcurl_url_cleanup(h);\n\t\t\treturn ::parser::ERROR;\n\t\t}\n\n\t\tchar *chost;\n\t\tuc = curl_url_get(h, CURLUPART_HOST, &chost, 0);\n\t\tif (!uc) {\n\t\t\thost = chost;\n\t\t\tremove_www(host);\n\t\t\tcurl_free(chost);\n\t\t}\n\n\t\tchar *cpath;\n\t\tuc = curl_url_get(h, CURLUPART_PATH, &cpath, 0);\n\t\tif (!uc) {\n\t\t\tif (strnlen(cpath, m_long_text_len) < m_long_text_len) {\n\t\t\t\tdecode_html_entities_utf8(m_clean_buff.get(), cpath);\n\t\t\t\tpath = m_clean_buff.get();\n\t\t\t} else {\n\t\t\t\tpath = cpath;\n\t\t\t}\n\t\t\tcurl_free(cpath);\n\t\t}\n\n\t\tchar *cquery;\n\t\tuc = curl_url_get(h, CURLUPART_QUERY, &cquery, 0);\n\t\tif (!uc) {\n\t\t\tif (strnlen(cquery, m_long_text_len) < m_long_text_len) {\n\t\t\t\tdecode_html_entities_utf8(m_clean_buff.get(), cquery);\n\t\t\t\tpath += \"?\" + string(m_clean_buff.get());\n\t\t\t} else {\n\t\t\t\tpath += \"?\" + string(cquery);\n\t\t\t}\n\t\t\tcurl_free(cquery);\n\t\t}\n\n\t\tcurl_url_cleanup(h);\n\n\t\treturn 
::parser::OK;\n\t}\n\n\tvoid html_parser::remove_www(string &path) {\n\t\tsize_t pos = path.find(\"www.\");\n\t\tif (pos == 0) path.erase(0, 4);\n\t\ttext::trim_inplace(path);\n\t}\n\n\tvoid html_parser::parse_encoding(const string &html) {\n\t\tm_encoding = ENC_UTF_8;\n\t\tconst size_t pos_start = html.find(\"charset=\");\n\t\tif (pos_start == string::npos || pos_start > 1024) return;\n\n\t\tstring encoding = html.substr(pos_start, 40);\n\t\tencoding = text::lower_case(encoding);\n\n\t\tconst size_t utf8_start = encoding.find(\"utf-8\");\n\t\tconst size_t iso88591_start = encoding.find(\"iso-8859-1\");\n\t\tif (utf8_start != string::npos) m_encoding = ENC_UTF_8;\n\t\telse if (iso88591_start != string::npos) m_encoding = ENC_ISO_8859_1;\n\t\telse m_encoding = ENC_UNKNOWN;\n\t}\n\n\tvoid html_parser::iso_to_utf8(string &str) {\n\t\tstring str_out;\n\t\tfor (std::string::iterator it = str.begin(); it != str.end(); ++it)\n\t\t{\n\t\t\tuint8_t ch = *it;\n\t\t\tif (ch < 0x80) {\n\t\t\t\tstr_out.push_back(ch);\n\t\t\t}\n\t\t\telse {\n\t\t\t\tstr_out.push_back(0xc0 | ch >> 6);\n\t\t\t\tstr_out.push_back(0x80 | (ch & 0x3f));\n\t\t\t}\n\t\t}\n\t\tstr = str_out;\n\t}\n\n\tstring html_parser::title() const {\n\t\treturn m_title;\n\t} \n\n\tstring html_parser::meta() const {\n\t\treturn m_meta;\n\t}\n\n\tstring html_parser::h1() const {\n\t\treturn m_h1;\n\t}\n\n\tstring html_parser::text() const {\n\t\treturn m_text;\n\t}\n\n\tvector<html_link> html_parser::links() const {\n\t\treturn m_links;\n\t}\n\n\tvector<std::pair<uint64_t, uint64_t>> html_parser::internal_links() const {\n\t\treturn m_internal_links;\n\t}\n\n\tbool html_parser::should_insert() const {\n\t\treturn m_should_insert;\n\t}\n\n\tstring html_parser::url_tld(const string &url) {\n\n\t\tstring response;\n\t\tstring host;\n\t\tvector<string> parts;\n\t\tCURLU *h = curl_url();\n\t\tif (!h) return \"\";\n\n\t\tCURLUcode uc = curl_url_set(h, CURLUPART_URL, url.c_str(), 0);\n\t\tif (uc) 
{\n\t\t\tcurl_url_cleanup(h);\n\t\t\treturn \"\";\n\t\t}\n\n\t\tchar *chost;\n\t\tuc = curl_url_get(h, CURLUPART_HOST, &chost, 0);\n\t\tif (!uc) {\n\t\t\thost = chost;\n\t\t\tboost::split(parts, host, boost::is_any_of(\".\"));\n\t\t\tcurl_free(chost);\n\n\t\t\tif (parts.size()) {\n\t\t\t\tresponse = parts.back();\n\t\t\t}\n\t\t}\n\n\t\tcurl_url_cleanup(h);\n\n\t\treturn response;\n\t}\n\n\tinline pair<size_t, size_t> html_parser::find_tag(const string &html, const string &tag_start, const string &tag_end,\n\t\tsize_t pos) {\n\t\tsize_t pos_start = html.find(tag_start, pos);\n\t\tif (pos_start == string::npos) return pair<size_t, size_t>(string::npos, string::npos);\n\n\t\tconst size_t pos_end = html.find(tag_end, pos_start);\n\t\tif (pos_end == string::npos) return pair<size_t, size_t>(string::npos, string::npos);\n\t\treturn pair<size_t, size_t>(pos_start, pos_end + tag_end.size());\n\t}\n\n\tstring html_parser::get_tag_content(const string &html, const string &tag_start, const string &tag_end) {\n\t\tsize_t pos_start = html.find(tag_start);\n\t\tif (pos_start == string::npos || is_invisible(pos_start)) return \"\";\n\t\tpos_start = html.find(\">\", pos_start);\n\n\t\tconst size_t pos_end = html.find(tag_end, pos_start);\n\t\tconst size_t len = pos_end - pos_start;\n\t\tif (pos_end == string::npos) return \"\";\n\t\treturn (string)html.substr(pos_start + 1, len - 1);\n\t}\n\n\tstring html_parser::get_meta_tag(const string &html) {\n\t\tsize_t pos_start = 0;\n\t\twhile ((pos_start = html.find(\"<meta\", pos_start + 1)) != string::npos)  {\n\t\t\tconst size_t pos_end = html.find(\">\", pos_start);\n\t\t\tconst size_t pos_description = html.find(\"description\\\"\", pos_start);\n\t\t\tif (pos_description < pos_end) {\n\t\t\t\tconst size_t pos_end_tag = html.find(\">\", pos_description);\n\t\t\t\tconst size_t pos_start_tag = html.rfind(\"<\", pos_description);\n\n\t\t\t\tconst string s = \"content=\";\n\t\t\t\tconst size_t content_start = html.find(s, 
pos_start_tag);\n\t\t\t\tif (content_start != string::npos && content_start <= pos_end_tag) {\n\t\t\t\t\treturn (string)html.substr(content_start + s.size(), pos_end_tag - content_start - s.size() - 1);\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\treturn \"\";\n\t}\n\n\tvoid html_parser::clean_text(string &str) {\n\t\tstrip_tags(str);\n\t\tif (str.size() >= m_long_text_len) return;\n\t\tdecode_html_entities_utf8(m_clean_buff.get(), str.c_str());\n\t\tstr = m_clean_buff.get();\n\t\tstrip_whitespace(str);\n\t\ttext::trim_both_inplace(str);\n\t}\n\n\tvoid html_parser::strip_tags(string &html) {\n\t\tconst int len = html.size();\n\t\tbool copy = true;\n\t\tbool last_was_space = false;\n\t\tint i = 0, j = 0;\n\t\tconst char *html_s = html.c_str();\n\t\tfor (; i < len; i++) {\n\t\t\tif (html_s[i] == '<') copy = false;\n\t\t\tif (isspace(html_s[i])) {\n\t\t\t\thtml[j] = ' ';\n\t\t\t\tif (copy && !last_was_space) j++;\n\t\t\t\tlast_was_space = true;\n\t\t\t} else {\n\t\t\t\thtml[j] = html_s[i];\n\t\t\t\tif (copy) j++;\n\t\t\t\tlast_was_space = false;\n\t\t\t}\n\t\t\tif (html_s[i] == '>') copy = true;\n\t\t}\n\t\thtml.resize(j);\n\t}\n\n\tvoid html_parser::strip_whitespace(string &html) {\n\t\tconst int len = html.size();\n\t\tbool last_was_space = false;\n\t\tint i = 0, j = 0;\n\t\tconst char *html_s = html.c_str();\n\t\tfor (; i < len; i++) {\n\t\t\tif (isspace(html_s[i])) {\n\t\t\t\thtml[j] = ' ';\n\t\t\t\tif (!last_was_space) j++;\n\t\t\t\tlast_was_space = true;\n\t\t\t} else {\n\t\t\t\thtml[j] = html_s[i];\n\t\t\t\tj++;\n\t\t\t\tlast_was_space = false;\n\t\t\t}\n\t\t}\n\t\thtml.resize(j);\n\t}\n\n\t/*\n\t * This function returns the text content of the html by first trying to fetch content after the first <h1>...</h1> tag. 
If no h1 tag is present\n\t * it tries to fetch content from the start of the <body>\n\t * */\n\tstring html_parser::get_text_content(const string &html) {\n\t\tsize_t pos_start = html.find(\"</h1>\");\n\n\t\t// Start from body if no h1 is present\n\t\tif (pos_start == string::npos || is_invisible(pos_start)) {\n\t\t\tpos_start = html.find(\"<body\");\n\t\t}\n\t\tif (pos_start == string::npos || is_invisible(pos_start)) {\n\t\t\treturn \"\";\n\t\t}\n\n\t\tconst size_t len = html.size();\n\t\tbool copy = true;\n\t\tbool ignore = false;\n\t\tbool last_was_space = false;\n\t\tsize_t i = pos_start, j = 0;\n\n\t\tauto interval = m_invisible_pos.begin();\n\t\tconst auto invisible_end = m_invisible_pos.end();\n\t\twhile (interval != m_invisible_pos.end() && interval->first < pos_start) {\n\t\t\tinterval++;\n\t\t}\n\n\t\tconst char *html_s = html.c_str();\n\n\t\tfor (; i < len && j < m_long_text_len; i++) {\n\t\t\tif (html_s[i] == '<') {\n\t\t\t\tif (interval != invisible_end && interval->first == i) {\n\t\t\t\t\t// Skip the whole invisible tag.\n\t\t\t\t\ti = interval->second - 1;\n\t\t\t\t\tinterval++;\n\t\t\t\t\tcontinue;\n\t\t\t\t}\n\t\t\t\t// Insert a space, because we don't want to concatenate words.\n\t\t\t\tm_long_str_buf[j] = ' ';\n\t\t\t\tif (copy && !last_was_space) j++;\n\t\t\t\tlast_was_space = true;\n\n\t\t\t\tcopy = false;\n\t\t\t}\n\t\t\tif (isspace(html_s[i])) {\n\t\t\t\tif (j < m_long_text_len) m_long_str_buf[j] = ' ';\n\t\t\t\tif (copy && !last_was_space) j++;\n\t\t\t\tlast_was_space = true;\n\t\t\t} else {\n\t\t\t\tif (j < m_long_text_len) m_long_str_buf[j] = html_s[i];\n\t\t\t\tif (copy) j++;\n\t\t\t\tlast_was_space = false;\n\t\t\t}\n\t\t\tif (!ignore && html_s[i] == '>') copy = true;\n\t\t}\n\n\t\tstring text(m_long_str_buf.get(), j);\n\n\t\treturn text;\n\t}\n\n\tbool html_parser::is_exotic_language_debug(const string &str) const {\n\t\tconst size_t len = str.size();\n\t\tconst char *cstr = str.c_str();\n\t\tint num_exotic = 0;\n\t\tint num_normal = 
0;\n\t\tint num_seminormal = 0;\n\t\tfor (size_t i = 0; i < len;) {\n\t\t\tint multibyte_len = 1;\n\t\t\tint cumsum = 0;\n\t\t\tfor (size_t j = i + 1; (j < len) && IS_MULTIBYTE_CODEPOINT(cstr[j]); j++, multibyte_len++) {\n\t\t\t\tcumsum += (unsigned char)cstr[j];\n\t\t\t}\n\n\t\t\tif (multibyte_len > 2) {\n\t\t\t\tnum_exotic++;\n\t\t\t} else if (multibyte_len == 2){\n\t\t\t\tnum_seminormal++;\n\t\t\t} else {\n\t\t\t\tnum_normal++;\n\t\t\t}\n\n\t\t\ti += multibyte_len;\n\t\t}\n\n\t\tint total = (num_seminormal + num_exotic + num_normal);\n\n\t\tcout << str << \" exotic: \" << num_exotic << \" seminormal: \" << num_seminormal << \" normal: \" << num_normal << endl;\n\n\t\tif (num_exotic > 5) return true;\n\t\tif (total <= 3) return false;\n\t\tif ((float)(num_seminormal + num_exotic) / ((float)total) > 0.5) return true;\n\n\t\treturn false;\n\t}\n\n\tbool html_parser::is_exotic_language(const string &str) const {\n\t\tconst size_t len = str.size();\n\t\tconst char *cstr = str.c_str();\n\t\tint num_exotic = 0;\n\t\tint num_normal = 0;\n\t\tint num_seminormal = 0;\n\t\tfor (size_t i = 0; i < len;) {\n\t\t\tint multibyte_len = 1;\n\t\t\tint cumsum = 0;\n\t\t\tfor (size_t j = i + 1; (j < len) && IS_MULTIBYTE_CODEPOINT(cstr[j]); j++, multibyte_len++) {\n\t\t\t\tcumsum += (unsigned char)cstr[j];\n\t\t\t}\n\n\t\t\tif (multibyte_len > 2) {\n\t\t\t\tnum_exotic++;\n\t\t\t} else if (multibyte_len == 2){\n\t\t\t\tnum_seminormal++;\n\t\t\t} else {\n\t\t\t\tnum_normal++;\n\t\t\t}\n\n\t\t\ti += multibyte_len;\n\t\t}\n\n\t\tint total = (num_seminormal + num_exotic + num_normal);\n\n\t\tif (num_exotic > 5) return true;\n\t\tif (total <= 3) return false;\n\t\tif ((float)(num_seminormal + num_exotic) / ((float)total) > 0.5) return true;\n\n\t\treturn false;\n\t}\n\n\tvoid html_parser::sort_invisible() {\n\t\tsort(m_invisible_pos.begin(), m_invisible_pos.end(), [](const pair<int, int>& lhs, const pair<int, int>& rhs) {\n\t\t\treturn lhs.first < rhs.first;\n\t\t});\n\t}\n\n\tinline bool 
html_parser::is_invisible(size_t pos) {\n\t\tfor (const auto &interval : m_invisible_pos) {\n\t\t\tif (interval.first <= pos && pos < interval.second) return true;\n\t\t}\n\t\treturn false;\n\t}\n\n}\n"
  },
  {
    "path": "src/parser/html_parser.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include <string>\n#include <vector>\n#include <map>\n#include <iostream>\n#include <algorithm>\n#include <cctype>\n#include <string.h>\n#include <memory>\n#include <boost/algorithm/string.hpp>\n\n#include \"html_link.h\"\n#include \"parser/unicode.h\"\n\n#define HTML_PARSER_MAX_H1_LEN 400\n#define HTML_PARSER_MAX_TITLE_LEN 400\n\n#define ENC_UTF_8 1\n#define ENC_ISO_8859_1 2\n#define ENC_UNKNOWN -1\n\nnamespace parser {\n\n\tclass html_parser {\n\n\tpublic:\n\n\t\thtml_parser();\n\t\thtml_parser(size_t long_text_len);\n\t\t~html_parser();\n\n\t\tvoid parse(const std::string &html, const std::string &url);\n\t\tvoid parse(const std::string &html);\n\n\t\tstd::string title() const;\n\t\tstd::string meta() const;\n\t\tstd::string h1() 
const;\n\t\tstd::string text() const;\n\t\tstd::vector<html_link> links() const;\n\t\tstd::vector<std::pair<uint64_t, uint64_t>> internal_links() const;\n\t\tbool should_insert() const;\n\n\t\t// Return top level domain\n\t\tstd::string url_tld(const std::string &url);\n\t\tbool is_exotic_language_debug(const std::string &str) const;\n\t\tbool is_exotic_language(const std::string &str) const;\n\n\tprivate:\n\n\t\tstd::vector<html_link> m_links;\n\t\tstd::vector<std::pair<uint64_t, uint64_t>> m_internal_links;\n\t\tstd::vector<std::pair<size_t, size_t>> m_invisible_pos;\n\n\t\tconst size_t m_long_text_len = 1000;\n\t\tstd::unique_ptr<char[]> m_long_str_buf;\n\t\tstd::unique_ptr<char[]> m_clean_buff;\n\t\tstd::unique_ptr<unsigned char[]> m_encoding_buffer;\n\t\tbool m_should_insert;\n\t\tint m_encoding = ENC_UNKNOWN;\n\n\t\tstd::string m_title;\n\t\tstd::string m_h1;\n\t\tstd::string m_meta;\n\t\tstd::string m_text;\n\n\t\tstd::string m_host;\n\t\tstd::string m_path;\n\n\t\tvoid find_scripts(const std::string &html);\n\t\tvoid find_styles(const std::string &html);\n\t\tvoid find_links(const std::string &html, const std::string &base_url);\n\n\t\tint parse_link(const std::string &link, const std::string &base_url);\n\t\tint parse_url(const std::string &url, std::string &host, std::string &path, const std::string &base_url);\n\t\tinline void remove_www(std::string &path);\n\t\tvoid parse_encoding(const std::string &html);\n\t\tvoid iso_to_utf8(std::string &text);\n\n\t\tinline std::pair<size_t, size_t> find_tag(const std::string &html, const std::string &tag_start, const std::string &tag_end,\n\t\t\tsize_t pos);\n\t\tstd::string get_tag_content(const std::string &html, const std::string &tag_start, const std::string &tag_end);\n\t\tstd::string get_meta_tag(const std::string &html);\n\t\tvoid clean_text(std::string &str);\n\t\tvoid strip_whitespace(std::string &html);\n\t\tvoid strip_tags(std::string &html);\n\t\tstd::string get_text_content(const std::string 
&html);\n\t\tvoid sort_invisible();\n\t\tinline bool is_invisible(size_t pos);\n\n\t};\n\n}\n"
  },
  {
    "path": "src/parser/parser.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include \"parser.h\"\n#include <curl/curl.h>\n\nusing namespace std;\n\nnamespace parser {\n\n\tbool is_percent_encoding(const char *cstr) {\n\t\tconst char first = tolower(cstr[1]);\n\t\tconst char second = tolower(cstr[2]);\n\t\tconst bool first_valid = (first >= '0' && first <= '9') || (first >= 'a' && first <= 'f');\n\t\tconst bool second_valid = (second >= '0' && second <= '9') || (second >= 'a' && second <= 'f');\n\t\treturn cstr[0] == '%' && first_valid && second_valid;\n\t}\n\n\tstring urldecode(const string &str) {\n\t\tconst size_t len = str.size();\n\t\tconst char *cstr = str.c_str();\n\t\tchar *ret = new char[len + 1];\n\t\tsize_t j = 0;\n\t\tfor (size_t i = 0; i < len; i++) {\n\t\t\tif (i < len - 2 && is_percent_encoding(&cstr[i])) 
{\n\t\t\t\tret[j++] = (char)stoi(string(&cstr[i + 1], 2), NULL, 16);\n\t\t\t\ti += 2;\n\t\t\t} else if (i < len - 1 && cstr[i] == '%' && cstr[i + 1] == '%') {\n\t\t\t\tret[j++] = '%';\n\t\t\t\ti++;\n\t\t\t} else {\n\t\t\t\tret[j++] = cstr[i];\n\t\t\t}\n\t\t}\n\t\tret[j] = '\\0';\n\n\t\tstring ret_str(ret);\n\n\t\tdelete[] ret;\n\n\t\treturn ret_str;\n\t}\n\n\tstring urlencode(const string &str) {\n\t\tCURL *curl = curl_easy_init();\n\t\tif (curl) {\n\t\t\tchar *output = curl_easy_escape(curl, str.c_str(), str.size());\n\t\t\tif (output) {\n\t\t\t\tstring ret(output);\n\t\t\t\tcurl_free(output);\n\t\t\t\tcurl_easy_cleanup(curl);\n\t\t\t\treturn ret;\n\t\t\t}\n\t\t\tcurl_easy_cleanup(curl);\n\t\t}\n\n\t\treturn str;\n\t}\n\n\tstring get_http_header(const string &record, const string &key) {\n\t\tconst size_t pos = record.find(key);\n\t\tconst size_t pos_end = record.find(\"\\n\", pos);\n\t\tif (pos == string::npos) {\n\t\t\treturn \"\";\n\t\t}\n\n\t\tif (pos_end == string::npos) {\n\t\t\treturn record.substr(pos + key.size());\n\t\t}\n\n\t\treturn record.substr(pos + key.size(), pos_end - pos - key.size() - 1);\n\t}\n}\n"
  },
  {
    "path": "src/parser/parser.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include <iostream>\n\nnamespace parser {\n\n\tconst int OK = 0;\n\tconst int ERROR = 1;\n\n\tstd::string urldecode(const std::string &str);\n\tstd::string urlencode(const std::string &str);\n\tstd::string get_http_header(const std::string &record, const std::string &key);\n}\n"
  },
  {
    "path": "src/parser/unicode.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include \"unicode.h\"\n\nusing namespace std;\n\nnamespace parser {\n\n\tstd::string unicode::encode(const std::string &str) {\n\n\t\tconst char *cstr = str.c_str();\n\t\tsize_t len = str.size();\n\n\t\tchar *target = new char[str.size()];\n\n\t\tsize_t last_unicode = len;\n\t\tsize_t utf8_len = 0;\n\t\tfor (size_t i = 0; i < len; i++) {\n\t\t\tbool copy = true;\n\t\t\tif (utf8_len == 0) {\n\t\t\t\tif (IS_UTF8_START_1(cstr[i])) {\n\t\t\t\t\tutf8_len = 1;\n\t\t\t\t\tlast_unicode = i;\n\t\t\t\t} else if (IS_UTF8_START_2(cstr[i])) {\n\t\t\t\t\tutf8_len = 2;\n\t\t\t\t\tlast_unicode = i;\n\t\t\t\t} else if (IS_UTF8_START_3(cstr[i])) {\n\t\t\t\t\tutf8_len = 3;\n\t\t\t\t\tlast_unicode = i;\n\t\t\t\t} else if (IS_UNKNOWN_UTF8_START(cstr[i])) {\n\t\t\t\t\tcopy 
= false;\n\t\t\t\t} else if ('\\x00' <= cstr[i] && cstr[i] <= '\\x1f') {\n\t\t\t\t\tcopy = false;\n\t\t\t\t}\n\t\t\t} else if (IS_MULTIBYTE_CODEPOINT(cstr[i])) {\n\t\t\t\tutf8_len--;\n\t\t\t} else {\n\t\t\t\t// This unicode character has been terminated too soon.\n\t\t\t\tcopy = false;\n\t\t\t\tfor (size_t j = last_unicode; j <= i; j++) {\n\t\t\t\t\ttarget[j] = '?';\n\t\t\t\t}\n\t\t\t\tutf8_len = 0;\n\t\t\t}\n\t\t\tif (copy) {\n\t\t\t\ttarget[i] = cstr[i];\n\t\t\t} else {\n\t\t\t\ttarget[i] = '?';\n\t\t\t}\n\t\t}\n\n\t\tstd::string ret(target, len);\n\t\tdelete []target;\n\t\tif (utf8_len) {\n\t\t\treturn ret.substr(0, last_unicode);\n\t\t} else {\n\t\t\treturn ret;\n\t\t}\n\t}\n\n\tbool unicode::is_valid(const std::string &str) {\n\t\t\n\t\tconst char *cstr = str.c_str();\n\t\tsize_t len = str.size();\n\n\t\tsize_t utf8_len = 0;\n\t\tfor (size_t i = 0; i < len; i++) {\n\t\t\tif (utf8_len == 0) {\n\t\t\t\tif (IS_UTF8_START_1(cstr[i])) {\n\t\t\t\t\tutf8_len = 1;\n\t\t\t\t} else if (IS_UTF8_START_2(cstr[i])) {\n\t\t\t\t\tutf8_len = 2;\n\t\t\t\t} else if (IS_UTF8_START_3(cstr[i])) {\n\t\t\t\t\tutf8_len = 3;\n\t\t\t\t} else if (IS_UNKNOWN_UTF8_START(cstr[i])) {\n\t\t\t\t\treturn false;\n\t\t\t\t}\n\t\t\t} else if (IS_MULTIBYTE_CODEPOINT(cstr[i])) {\n\t\t\t\tutf8_len--;\n\t\t\t} else {\n\t\t\t\t// This unicode character has been terminated too soon.\n\t\t\t\treturn false;\n\t\t\t}\n\t\t}\n\n\t\tif (utf8_len) {\n\t\t\treturn false;\n\t\t}\n\n\t\treturn true;\n\t}\n\n}\n"
  },
  {
    "path": "src/parser/unicode.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include <iostream>\n\n#define IS_MULTIBYTE_CODEPOINT(ch) (((unsigned char)ch >> 7) && !(((unsigned char)ch >> 6) & 0x1))\n#define IS_UTF8_START_1(ch) (((unsigned char)ch >> 5) == 0b00000110 && ((unsigned char)ch & 0b00011111) >= 0b00000010)\n#define IS_UTF8_START_2(ch) (((unsigned char)ch >> 4) == 0b00001110)\n#define IS_UTF8_START_3(ch) (((unsigned char)ch >> 3) == 0b00011110)\n#define IS_UNKNOWN_UTF8_START(ch) (ch >> 7)\n\nnamespace parser {\n\n\tclass unicode {\n\n\t\tpublic:\n\t\t\t\n\t\t\tstatic std::string encode(const std::string &str);\n\t\t\tstatic bool is_valid(const std::string &str);\n\n\t};\n\n}\n"
  },
  {
    "path": "src/profiler/profiler.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include \"profiler.h\"\n#include \"logger/logger.h\"\n#include <vector>\n#include <map>\n\nusing namespace std;\n\nnamespace profiler {\n\n\tmap<string, double> profiles_per_name;\n\n\tstd::chrono::_V2::system_clock::time_point start_time = std::chrono::high_resolution_clock::now();\n\n\tinstance::instance(const string &name) :\n\t\tm_name(name)\n\t{\n\t\tm_start_time = std::chrono::high_resolution_clock::now();\n\t}\n\n\tinstance::instance() :\n\t\tm_name(\"unnamed profile\")\n\t{\n\t\tm_start_time = std::chrono::high_resolution_clock::now();\n\t}\n\n\tinstance::~instance() {\n\t\tif (!m_has_stopped) {\n\t\t\tstop();\n\t\t}\n\t}\n\n\tvoid instance::enable() {\n\t\tm_enabled = true;\n\t}\n\n\tdouble instance::get() const {\n\t\tauto timer_elapsed = 
chrono::high_resolution_clock::now() - m_start_time;\n\t\tauto microseconds = chrono::duration_cast<std::chrono::microseconds>(timer_elapsed).count();\n\n\t\treturn (double)microseconds/1000;\n\t}\n\n\tdouble instance::get_micro() const {\n\t\tif (!m_enabled) return 0;\n\t\tauto timer_elapsed = chrono::high_resolution_clock::now() - m_start_time;\n\t\tauto microseconds = chrono::duration_cast<std::chrono::microseconds>(timer_elapsed).count();\n\n\t\treturn (double)microseconds;\n\t}\n\n\tvoid instance::stop() {\n\t\tm_has_stopped = true;\n\t\tprofiles_per_name[m_name] += get();\n\t\tif (!m_enabled) return;\n\t\tLOG_INFO(\"profiler [\" + m_name + \"] took \" + to_string(get()) + \"ms\");\n\t}\n\n\tvoid instance::print() {\n\t\tif (!m_enabled) return;\n\t\tcout << \"profiler [\" + m_name + \"] took \" + to_string(get()) + \"ms\" << endl;\n\t}\n\n\tvoid print_memory_status() {\n\t\tifstream infile(\"/proc/\" + to_string(getpid()) + \"/status\");\n\t\tif (infile.is_open()) {\n\t\t\tstring line;\n\t\t\twhile (getline(infile, line)) {\n\t\t\t\tLOG_INFO(line);\n\t\t\t}\n\t\t}\n\t}\n\n\tvoid tick(const string &name, const string &section) {\n\t\t(void)name;\n\t\t(void)section;\n\t}\n\tvoid report_reset();\n\tvoid report_print();\n\n\tdouble now_micro() {\n\t\tauto timer_elapsed = chrono::high_resolution_clock::now() - start_time;\n\t\tauto microseconds = chrono::duration_cast<std::chrono::microseconds>(timer_elapsed).count();\n\n\t\treturn (double)microseconds;\n\t}\n\n\tsize_t timestamp() {\n\t\tconst auto p1 = std::chrono::system_clock::now();\n\t\treturn std::chrono::duration_cast<std::chrono::seconds>(p1.time_since_epoch()).count();\n\t}\n\n\tvoid print_report() {\n\n\t\tdouble total_ms = 0.0;\n\t\tfor (const auto &iter : profiles_per_name) {\n\t\t\ttotal_ms += iter.second;\n\t\t}\n\n\t\tfor (const auto &iter : profiles_per_name) {\n\t\t\tcout << iter.first << \": \" << iter.second << \"ms (\" << 100.0 * (iter.second / total_ms) << \"%)\" << endl;\n\t\t}\n\t}\n\n}\n\n"
  },
  {
    "path": "src/profiler/profiler.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include <iostream>\n#include <chrono>\n#include <fstream>\n#include <unistd.h>\n\nnamespace profiler {\n\n\tclass instance {\n\n\tpublic:\n\n\t\texplicit instance(const std::string &name);\n\t\tinstance();\n\t\t~instance();\n\n\t\tvoid enable();\n\t\tdouble get() const;\n\t\tdouble get_micro() const;\n\t\tvoid stop();\n\t\tvoid print();\n\n\tprivate:\n\t\tstd::string m_name;\n\t\tbool m_enabled = true;\n\t\tbool m_has_stopped = false;\n\t\tstd::chrono::_V2::system_clock::time_point m_start_time;\n\t};\n\n\tvoid print_memory_status();\n\n\tvoid tick(const std::string &name, const std::string &section);\n\tvoid report_reset();\n\tvoid report_print();\n\tdouble now_micro();\n\tsize_t timestamp();\n\tvoid print_report();\n\n}\n"
  },
  {
    "path": "src/scraper/scraper.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include \"scraper.h\"\n#include \"parser/html_parser.h\"\n#include \"common/datetime.h\"\n#include \"text/text.h\"\n#include \"logger/logger.h\"\n#include <memory>\n\nusing namespace std;\n\nnamespace scraper {\n\n\tstring user_agent_token() {\n\t\treturn \"AlexandriaOrgBot\";\n\t}\n\n\tstring user_agent() {\n\t\tstring ua_version = \"1.0\";\n\t\tstring ua = \"Mozilla/5.0 (Linux) (compatible; \"+user_agent_token()+\"/\"+ua_version+\"; +https://www.alexandria.org/bot.html)\";\n\t\treturn ua;\n\t}\n\n\tscraper_stats::scraper_stats() {\n\t}\n\n\tscraper_stats::~scraper_stats() {\n\t\tm_running = false;\n\t\tif (m_thread.joinable()) m_thread.join();\n\t}\n\n\tvoid scraper_stats::gather_statistics(const map<string, unique_ptr<scraper>> &scrapers, size_t 
urls_in_queue) {\n\t\tstart_count(urls_in_queue);\n\t\tfor (const auto &iter : scrapers) {\n\t\t\tif (iter.second->finished()) {\n\t\t\t\tcount_finished(*(iter.second));\n\t\t\t} else {\n\t\t\t\tcount_unfinished(*(iter.second));\n\t\t\t}\n\t\t}\n\t\tend_count();\n\t}\n\n\tvoid scraper_stats::start_thread(size_t timeout) {\n\t\tm_timeout = timeout;\n\t\tm_thread = std::move(thread([this]() {\n\t\t\tthis->run();\n\t\t}));\n\t}\n\n\tvoid scraper_stats::start_count(size_t urls_in_queue) {\n\t\tm_lock.lock();\n\t\tm_unfinished_scrapers = 0;\n\t\tm_unfinished_scraped_urls = 0;\n\t\tm_unfinished_scraped_urls_non200 = 0;\n\t\tm_unfinished_scraped_errors = 0;\n\t\tm_urls_in_queue = urls_in_queue;\n\t\tm_urls_assigned = 0;\n\t}\n\n\tvoid scraper_stats::end_count() {\n\t\tm_lock.unlock();\n\t}\n\n\tvoid scraper_stats::count_finished(const scraper &scraper) {\n\t\tm_scraped_urls += scraper.num_scraped();\n\t\tm_scraped_urls_non200 += scraper.num_scraped_non200();\n\t\tm_scraped_errors += scraper.num_errors();\n\t\tm_finished_scrapers += 1;\n\t\tm_num_blocked += scraper.blocked() ? 
1 : 0;\n\t}\n\n\tvoid scraper_stats::count_unfinished(const scraper &scraper) {\n\t\tm_unfinished_scraped_urls += scraper.num_scraped();\n\t\tm_unfinished_scraped_urls_non200 += scraper.num_scraped_non200();\n\t\tm_unfinished_scraped_errors += scraper.num_errors();\n\t\tm_unfinished_scrapers += 1;\n\t\tm_urls_assigned += scraper.size();\n\t}\n\n\tvoid scraper_stats::run() {\n\t\tsize_t time_start = profiler::timestamp();\n\t\twhile (m_running) {\n\t\t\tstd::this_thread::sleep_for(std::chrono::seconds(m_timeout));\n\t\t\tlog_report(profiler::timestamp() - time_start);\n\t\t}\n\t}\n\n\tvoid scraper_stats::log_report(size_t dt) {\n\t\tm_lock.lock();\n\t\tstd::stringstream ss;\n\t\tss.precision(2);\n\t\tss << endl;\n\t\tss << \"Scraper stats:\" << endl;\n\t\tss << m_urls_in_queue << \" urls in queue (not assigned to any scraper)\" << endl;\n\t\tss << m_urls_assigned << \" urls assigned to running scrapers\" << endl;\n\t\tss << (m_scraped_urls + m_unfinished_scraped_urls) << \" urls done (200 response)\" << endl;\n\t\tss << (m_scraped_urls_non200 + m_unfinished_scraped_urls_non200) << \" urls (non 200 response)\" << endl;\n\t\tss << (m_scraped_errors + m_unfinished_scraped_errors) << \" urls (errors)\" << endl;\n\t\tss << fixed << (double)(m_scraped_urls + m_unfinished_scraped_urls)/dt << \"/s\" << endl;\n\t\tss << m_finished_scrapers << \" finished scrapers\" << endl;\n\t\tss << m_unfinished_scrapers << \" unfinished scrapers\" << endl;\n\t\tss << m_num_blocked << \" blocked scrapers\" << endl;\n\t\tm_lock.unlock();\n\t\tLOG_INFO(ss.str());\n\t}\n\n\tscraper::scraper(const string &domain, scraper_store *store) :\n\t\tm_domain(domain), m_store(store)\n\t{\n\t\t//m_domain_data.m_domain = domain;\n\t\tm_curl = curl_easy_init();\n\t}\n\n\tscraper::~scraper() {\n\t\tif (m_thread.joinable()) m_thread.join();\n\t\tupload_domain_info();\n\t\tcurl_easy_cleanup(m_curl);\n\t}\n\n\tvoid scraper::push_url(const URL &url) {\n\t\tm_queue.push(url);\n\t}\n\n\tvoid scraper::run() 
{\n\n\t\tdownload_domain_data();\n\t\tdownload_robots();\n\n\t\twhile (m_queue.size()) {\n\t\t\tURL url = filter_url(m_queue.front());\n\t\t\tm_queue.pop();\n\t\t\tif (robots_allow_url(url)) {\n\t\t\t\tif (m_timeout) {\n\t\t\t\t\tthis_thread::sleep_for(std::chrono::seconds(m_timeout/2 + (rand() % m_timeout)));\n\t\t\t\t}\n\t\t\t\thandle_url(url);\n\t\t\t}\n\t\t\tif (m_consecutive_error_count > 20) break;\n\t\t}\n\n\t\tm_finished = true;\n\t}\n\n\tvoid scraper::handle_url(const URL &url) {\n\t\tcout << url.str() << endl;\n\t\tm_buffer.resize(0);\n\t\tcurl_easy_setopt(m_curl, CURLOPT_USERAGENT, user_agent().c_str());\n\t\tcurl_easy_setopt(m_curl, CURLOPT_FOLLOWLOCATION, 1l);\n\t\tcurl_easy_setopt(m_curl, CURLOPT_MAXREDIRS, 5l);\n\t\tcurl_easy_setopt(m_curl, CURLOPT_WRITEFUNCTION, curl_string_reader);\n\t\tcurl_easy_setopt(m_curl, CURLOPT_WRITEDATA, this);\n\t\tcurl_easy_setopt(m_curl, CURLOPT_URL, url.str().c_str());\n\t\tcurl_easy_setopt(m_curl, CURLOPT_TIMEOUT, 30);\n\t\tcurl_easy_setopt(m_curl, CURLOPT_ERRORBUFFER, m_curl_error_buffer);\n\n\t\tCURLcode res = curl_easy_perform(m_curl);\n\n\t\tif (res == CURLE_OK) {\n\t\t\tm_consecutive_error_count = 0;\n\t\t\tlong response_code;\n\t\t\tchar *new_url_str = nullptr;\n\t\t\tcurl_easy_getinfo(m_curl, CURLINFO_RESPONSE_CODE, &response_code);\n\t\t\tcurl_easy_getinfo(m_curl, CURLINFO_EFFECTIVE_URL, &new_url_str);\n\n\t\t\t// Fetch IP address.\n\t\t\tchar *ip_cstr;\n\t\t\tstring ip;\n\t\t\tif (!curl_easy_getinfo(m_curl, CURLINFO_PRIMARY_IP, &ip_cstr) && ip_cstr != nullptr) ip = string(ip_cstr);\n\n\t\t\tif (new_url_str != nullptr) {\n\t\t\t\tstring new_u_str(new_url_str);\n\t\t\t\tURL new_url(new_u_str);\n\t\t\t\tupdate_url(new_url, response_code, common::cur_datetime(), URL());\n\t\t\t\tif (url.canonically_different(new_url)) {\n\t\t\t\t\tupdate_url(url, 301, common::cur_datetime(), new_url); // A bit of cheating here, it is not certain the original url had a 301 response code.\n\t\t\t\t}\n\t\t\t\tif (response_code == 200) 
{\n\t\t\t\t\thandle_200_response(m_buffer, response_code, ip, new_url);\n\t\t\t\t} else {\n\t\t\t\t\thandle_non_200_response(m_buffer, response_code, ip, new_url);\n\t\t\t\t}\n\t\t\t} else {\n\t\t\t\tupdate_url(url, response_code, common::cur_datetime(), URL());\n\t\t\t\tif (response_code == 200) {\n\t\t\t\t\thandle_200_response(m_buffer, response_code, ip, url);\n\t\t\t\t} else {\n\t\t\t\t\thandle_non_200_response(m_buffer, response_code, ip, url);\n\t\t\t\t}\n\t\t\t}\n\t\t} else {\n\t\t\t/*\n\t\t\t * Handle everything here: https://curl.se/libcurl/c/libcurl-errors.html\n\t\t\t * */\n\t\t\tvector<CURLcode> domain_errors = {\n\t\t\t\tCURLE_COULDNT_RESOLVE_HOST,\n\t\t\t\tCURLE_COULDNT_CONNECT,\n\t\t\t};\n\n\t\t\thandle_curl_error(url, res, string(m_curl_error_buffer));\n\n\t\t\tif (res == CURLE_COULDNT_RESOLVE_HOST || res == CURLE_COULDNT_CONNECT) {\n\t\t\t\tupdate_url(url, 10000 + res, common::cur_datetime(), URL());\n\t\t\t\tmark_all_urls_with_error(10000 + res);\n\t\t\t} else {\n\t\t\t\tupdate_url(url, 10000 + res, common::cur_datetime(), URL());\n\t\t\t}\n\t\t}\n\n\t\tm_buffer.resize(0);\n\t\tm_buffer.shrink_to_fit();\n\t}\n\n\tvoid scraper::mark_all_urls_with_error(size_t error_code) {\n\t\twhile (m_queue.size()) {\n\t\t\tURL url = filter_url(m_queue.front());\n\t\t\tm_queue.pop();\n\t\t\tupdate_url(url, error_code, common::cur_datetime(), URL());\n\t\t}\n\t}\n\n\tvoid scraper::update_url(const URL &url, size_t http_code, size_t last_visited, const URL &redirect) {\n\t\t// Store information about URL.\n\t}\n\n\tvoid scraper::handle_curl_error(const URL &url, size_t curl_error, const std::string &error_msg) {\n\t\tm_num_errors++;\n\t\tm_consecutive_error_count++;\n\t\tm_store->add_curl_error(url.str() + \"\\t\" + to_string(curl_error) + \"\\t\" + error_msg + \"\\n\");\n\t\tm_store->upload_curl_errors();\n\t}\n\n\tvoid scraper::handle_200_response(const string &data, size_t response_code, const string &ip, const URL &url) 
{\n\t\t(void)response_code;\n\t\tm_num_200++;\n\t\tparser::html_parser html_parser(100000);\n\t\thtml_parser.parse(data, url.str());\n\n\t\tm_num_total++;\n\t\tif (url.has_www()) m_num_www++; \n\t\tif (url.has_https()) m_num_https++; \n\t\tif (m_num_total == 3) upload_domain_info();\n\n\t\tconst string date = common::iso8601_datetime();\n\n\t\tif (html_parser.should_insert()) {\n\t\t\tconst string line = (url.str()\n\t\t\t\t+ '\\t' + html_parser.title()\n\t\t\t\t+ '\\t' + html_parser.h1()\n\t\t\t\t+ '\\t' + html_parser.meta()\n\t\t\t\t+ '\\t' + html_parser.text()\n\t\t\t\t+ '\\t' + date\n\t\t\t\t+ '\\t' + ip\n\t\t\t\t+ '\\n');\n\t\t\tm_store->add_scraper_data(line);\n\t\t\tstring links;\n\t\t\tfor (const auto &link : html_parser.links()) {\n\t\t\t\tlinks += (link.host()\n\t\t\t\t\t+ '\\t' + link.path()\n\t\t\t\t\t+ '\\t' + link.target_host()\n\t\t\t\t\t+ '\\t' + link.target_path()\n\t\t\t\t\t+ '\\t' + link.text()\n\t\t\t\t\t+ '\\t' + (link.nofollow() ? \"1\" : \"0\")\n\t\t\t\t\t+ '\\n');\n\t\t\t}\n\t\t\tm_store->add_link_data(links);\n\t\t\tm_store->upload_results();\n\t\t}\n\t}\n\n\tvoid scraper::handle_non_200_response(const string &data, size_t response_code, const string &ip, const URL &url) {\n\n\t\tm_num_non200++;\n\n\t\tcheck_for_captcha_block(data, response_code);\n\n\t\tparser::html_parser html_parser;\n\t\thtml_parser.parse(data, url.str());\n\n\t\tconst string date = common::iso8601_datetime();\n\n\t\tif (html_parser.should_insert()) {\n\t\t\tconst string line = (url.str()\n\t\t\t\t+ '\\t' + html_parser.title()\n\t\t\t\t+ '\\t' + html_parser.h1()\n\t\t\t\t+ '\\t' + html_parser.meta()\n\t\t\t\t+ '\\t' + html_parser.text()\n\t\t\t\t+ '\\t' + date\n\t\t\t\t+ '\\t' + ip\n\t\t\t\t+ '\\n');\n\t\t\tm_store->add_non_200_scraper_data(line);\n\t\t\tm_store->upload_non_200_results();\n\t\t}\n\t}\n\n\tvoid scraper::check_for_captcha_block(const std::string &data, size_t response_code) {\n\t\tif (response_code != 200 && (data.find(\"Captcha\") != string::npos || 
data.find(\"captcha\") != string::npos)) {\n\t\t\tm_blocked = true;\n\t\t\tmark_all_urls_with_error(10000 + 999);\n\t\t}\n\t}\n\n\tvoid scraper::download_domain_data() {\n\t\t\n\t}\n\n\tvoid scraper::download_robots() {\n\t\tconst URL robots_path = filter_url(URL(\"http://\" + m_domain + \"/robots.txt\"));\n\t\tm_robots_content = simple_get(robots_path);\n\n\t\tscraper::upload_robots_txt(m_robots_content);\n\t}\n\n\tbool scraper::robots_allow_url(const URL &url) const {\n\t\tgooglebot::RobotsMatcher matcher;\n\t\tbool allowed = matcher.OneAgentAllowedByRobots(m_robots_content, user_agent_token(), url.str());\n\t\treturn allowed;\n\t}\n\n\tstring scraper::simple_get(const URL &url) {\n\t\tcurl_easy_setopt(m_curl, CURLOPT_USERAGENT, user_agent().c_str());\n\t\tcurl_easy_setopt(m_curl, CURLOPT_FOLLOWLOCATION, 1l);\n\t\tcurl_easy_setopt(m_curl, CURLOPT_MAXREDIRS, 5l);\n\t\tcurl_easy_setopt(m_curl, CURLOPT_WRITEFUNCTION, curl_string_reader);\n\t\tcurl_easy_setopt(m_curl, CURLOPT_WRITEDATA, this);\n\t\tcurl_easy_setopt(m_curl, CURLOPT_URL, url.str().c_str());\n\t\tcurl_easy_setopt(m_curl, CURLOPT_TIMEOUT, 30);\n\t\tcurl_easy_setopt(m_curl, CURLOPT_ERRORBUFFER, m_curl_error_buffer);\n\n\t\tm_buffer.resize(0);\n\t\tCURLcode res = curl_easy_perform(m_curl);\n\t\tif (res == CURLE_OK) {\n\t\t\tlong response_code;\n\t\t\tchar *new_url_str = nullptr;\n\t\t\tcurl_easy_getinfo(m_curl, CURLINFO_RESPONSE_CODE, &response_code);\n\t\t\tcurl_easy_getinfo(m_curl, CURLINFO_EFFECTIVE_URL, &new_url_str);\n\n\t\t\tcheck_for_captcha_block(m_buffer, response_code);\n\t\t} else {\n\t\t\t/*\n\t\t\t * Handle everything here: https://curl.se/libcurl/c/libcurl-errors.html\n\t\t\t * */\n\t\t\tvector<CURLcode> domain_errors = {\n\t\t\t\tCURLE_COULDNT_RESOLVE_HOST,\n\t\t\t\tCURLE_COULDNT_CONNECT,\n\t\t\t};\n\n\t\t\thandle_curl_error(url, res, string(m_curl_error_buffer));\n\n\t\t\tif (res == CURLE_COULDNT_RESOLVE_HOST || res == CURLE_COULDNT_CONNECT) {\n\t\t\t\tmark_all_urls_with_error(10000 + 
res);\n\t\t\t} else {\n\t\t\t}\n\t\t}\n\n\t\treturn m_buffer;\n\t}\n\n\tvoid scraper::upload_domain_info() {\n\t\tif (m_num_total > 0) {\n\t\t\t// TODO.. Upload data about domain.\n\t\t}\n\t}\n\n\tvoid scraper::upload_robots_txt(const string &robots_content) {\n\t\t// TODO.. Upload data about robots.txt\n\t}\n\n\tURL scraper::filter_url(const URL &url) {\n\t\tURL ret(url);\n\t\t//if (m_domain_data.m_has_https && !url.has_https()) ret.set_scheme(\"https\");\n\t\t//if (m_domain_data.m_has_www && !url.has_www()) ret.set_www(true);\n\n\t\treturn ret;\n\t}\n\n\tvoid scraper::start_thread() {\n\t\tm_started = true;\n\t\tm_thread = std::move(thread([this](){\n\t\t\tthis->run();\n\t\t}));\n\t}\n\n\tsize_t curl_string_reader(char *ptr, size_t size, size_t nmemb, void *userdata) {\n\t\tconst size_t byte_size = size * nmemb;\n\t\tscraper *s = static_cast<scraper *>(userdata);\n\t\tif (s->m_buffer_len < s->m_buffer.size() + byte_size) return 0;\n\t\ts->m_buffer.append(ptr, byte_size);\n\t\treturn byte_size;\n\t}\n\n\tsize_t read_max_scrapers() {\n\t\tifstream infile(\"/tmp/num_scrapers\");\n\t\tif (!infile.is_open()) return 0;\n\t\tsize_t max_scrapers;\n\t\tinfile >> max_scrapers;\n\t\treturn max_scrapers;\n\t}\n\n\tbool reset_scraper_urls() {\n\t\tstring content = \"\";\n\t\tint error = transfer::upload_file(\"nodes/\" + config::node + \"/scraper.urls\", content);\n\t\treturn error == transfer::OK;\n\t}\n\n\tvector<string> download_scraper_urls() {\n\t\tint error;\n\t\tstring content = transfer::file_to_string(\"nodes/\" + config::node + \"/scraper.urls\", error);\n\t\tif (error == transfer::ERROR) return {};\n\n\t\treset_scraper_urls();\n\n\t\tvector<string> raw_urls;\n\t\tboost::algorithm::split(raw_urls, content, boost::is_any_of(\"\\n\"));\n\n\t\tvector<string> urls;\n\t\tfor (const string &url : raw_urls) {\n\t\t\tif (text::trim(url).size()) {\n\t\t\t\turls.push_back(url);\n\t\t\t}\n\t\t}\n\n\t\treturn urls;\n\t}\n\n\tvoid run_scraper_on_urls(const vector<string> 
&input_urls) {\n\t\tsize_t max_scrapers = 1000;\n\t\tscraper_store store;\n\t\tscraper_stats stats;\n\t\tmap<string, unique_ptr<scraper>> scrapers;\n\n\t\tstats.start_thread(60); // Report statistics every minute.\n\n\t\tvector<string> urls = input_urls;\n\t\twhile (urls.size() || scrapers.size()) {\n\n\t\t\tLOG_INFO(\"Starting scrapers with: \" + to_string(urls.size()) + \" urls\");\n\n\t\t\tsize_t new_max_scrapers = read_max_scrapers();\n\t\t\tif (new_max_scrapers) {\n\t\t\t\tmax_scrapers = new_max_scrapers;\n\t\t\t}\n\n\t\t\tvector<string> unhandled_urls;\n\n\t\t\tfor (const string &url_str : urls) {\n\t\t\t\tURL url(url_str);\n\n\t\t\t\tif (scrapers.count(url.host()) == 0) {\n\t\t\t\t\tif (scrapers.size() >= max_scrapers) {\n\t\t\t\t\t\tunhandled_urls.push_back(url_str);\n\t\t\t\t\t} else {\n\t\t\t\t\t\tscrapers[url.host()] = make_unique<scraper>(url.host(), &store);\n\t\t\t\t\t\tscrapers[url.host()]->push_url(url);\n\t\t\t\t\t}\n\t\t\t\t} else {\n\t\t\t\t\tscrapers[url.host()]->push_url(url);\n\t\t\t\t}\n\t\t\t}\n\t\t\t// Start scrapers.\n\t\t\tfor (auto &iter : scrapers) {\n\t\t\t\tif (!iter.second->started()) {\n\t\t\t\t\titer.second->start_thread();\n\t\t\t\t}\n\t\t\t}\n\t\t\t\n\t\t\t// Wait for some scrapers to finish before we assign new scrapers again.\n\t\t\twhile (scrapers.size() > max_scrapers * 0.8) {\n\t\t\t\tstats.gather_statistics(scrapers, urls.size());\n\t\t\t\tfor (auto iter = scrapers.begin(); iter != scrapers.end(); ) {\n\t\t\t\t\tif (iter->second->finished()) {\n\t\t\t\t\t\titer = scrapers.erase(iter);\n\t\t\t\t\t} else {\n\t\t\t\t\t\titer++;\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t\tthis_thread::sleep_for(1000ms);\n\t\t\t}\n\t\t\tstats.gather_statistics(scrapers, urls.size());\n\t\t\turls = unhandled_urls;\n\n\t\t\t// Check for new urls and append them.\n\t\t\tvector<string> new_urls = download_scraper_urls();\n\t\t\turls.insert(urls.end(), new_urls.begin(), new_urls.end());\n\n\t\t\tif (urls.size() == 0) {\n\t\t\t\t// We don't have any new urls. 
Just sleep a bit before checking again.\n\t\t\t\tstd::this_thread::sleep_for(std::chrono::seconds(60));\n\t\t\t}\n\t\t}\n\t\t\n\t}\n\n\tvoid url_downloader() {\n\n\t\tconst size_t timeout = 300;\n\t\t//const size_t limit = 500;\n\n\t\t// main loop\n\t\twhile (true) {\n\n\t\t\t// Check if there are any urls to digest every 'timeout' seconds.\n\t\t\tvector<string> urls = download_scraper_urls();\n\n\t\t\tif (urls.size() > 0) {\n\t\t\t\trun_scraper_on_urls(urls);\n\t\t\t}\n\n\t\t\tsleep(timeout);\n\t\t}\n\t}\n}\n"
  },
  {
    "path": "src/scraper/scraper.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include <iostream>\n#include <queue>\n#include <curl/curl.h>\n#include <thread>\n#include <boost/filesystem.hpp>\n#include \"transfer/transfer.h\"\n#include \"robots.h\"\n#include \"scraper_store.h\"\n#include \"URL.h\"\n#include \"profiler/profiler.h\"\n\nnamespace scraper {\n\n\tstd::string user_agent_token();\n\tstd::string user_agent();\n\n\t/*\n\t * The scraper!\n\t * */\n\tclass scraper {\n\t\tpublic:\n\n\t\t\tscraper(const std::string &domain, scraper_store *store);\n\t\t\t~scraper();\n\n\t\t\tvoid set_timeout(size_t timeout_in_seconds) { m_timeout = timeout_in_seconds; }\n\t\t\tvoid push_url(const URL &url);\n\t\t\tvoid run();\n\t\t\tvoid start_thread();\n\t\t\tbool finished() const { return m_finished; };\n\t\t\tbool started() { return 
m_started; }\n\t\t\tstd::string domain() { return m_domain; }\n\t\t\tsize_t num_scraped() const { return m_num_200; }\n\t\t\tsize_t num_scraped_non200() const { return m_num_non200; }\n\t\t\tsize_t num_errors() const { return m_num_errors; }\n\t\t\tsize_t size() const { return m_queue.size(); }\n\t\t\tbool blocked() const { return m_blocked; }\n\n\t\tprivate:\n\t\t\tstd::thread m_thread;\n\t\t\tbool m_started = false;\n\t\t\tbool m_finished = false;\n\t\t\tstd::string m_domain;\n\t\t\tstd::string m_buffer;\n\t\t\tchar m_curl_error_buffer[CURL_ERROR_SIZE];\n\t\t\tsize_t m_buffer_len = 1024*1024*10;\n\t\t\tsize_t m_num_200 = 0;\n\t\t\tsize_t m_num_non200 = 0;\n\t\t\tsize_t m_num_errors = 0;\n\t\t\tbool m_blocked = false;\n\t\t\tCURL *m_curl;\n\t\t\tscraper_store *m_store;\n\t\t\tstd::queue<URL> m_queue;\n\t\t\tgooglebot::RobotsMatcher m_robots;\n\t\t\tstd::string m_robots_content;\n\t\t\tsize_t m_num_total = 0;\n\t\t\tsize_t m_num_www = 0;\n\t\t\tsize_t m_num_https = 0;\n\t\t\tsize_t m_consecutive_error_count = 0;\n\t\t\tsize_t m_timeout = 30;\n\n\t\t\tvoid handle_curl_error(const URL &url, size_t curl_error, const std::string &error_msg);\n\t\t\tvoid handle_url(const URL &url);\n\t\t\tvoid mark_all_urls_with_error(size_t error_code);\n\t\t\tvoid update_url(const URL &url, size_t http_code, size_t last_visited, const URL &redirect);\n\t\t\tvoid handle_200_response(const std::string &data, size_t response_code, const std::string &ip, const URL &url);\n\t\t\tvoid handle_non_200_response(const std::string &data, size_t response_code, const std::string &ip, const URL &url);\n\t\t\tvoid check_for_captcha_block(const std::string &data, size_t response_code);\n\t\t\tvoid download_domain_data();\n\t\t\tvoid download_robots();\n\t\t\tbool robots_allow_url(const URL &url) const;\n\t\t\tstd::string simple_get(const URL &url);\n\t\t\tvoid upload_domain_info();\n\t\t\tvoid upload_robots_txt(const std::string &robots_content);\n\t\t\tURL filter_url(const URL 
&url);\n\n\t\tpublic:\n\n\t\t\tfriend size_t curl_string_reader(char *ptr, size_t size, size_t nmemb, void *userdata);\n\t};\n\n\tclass scraper_stats {\n\t\tpublic:\n\t\t\tscraper_stats();\n\t\t\t~scraper_stats();\n\t\t\tvoid gather_statistics(const std::map<std::string, std::unique_ptr<scraper>> &scrapers, size_t urls_in_queue);\n\t\t\tvoid start_thread(size_t timeout);\n\t\t\tvoid start_count(size_t urls_in_queue);\n\t\t\tvoid end_count();\n\t\t\tvoid count_finished(const scraper &scraper);\n\t\t\tvoid count_unfinished(const scraper &scraper);\n\n\t\tprivate:\n\t\t\tstd::thread m_thread;\n\t\t\tsize_t m_timeout = 300;\n\t\t\tsize_t m_num_blocked = 0;\n\t\t\tsize_t m_finished_scrapers = 0;\n\t\t\tsize_t m_unfinished_scrapers = 0;\n\t\t\tsize_t m_scraped_urls = 0;\n\t\t\tsize_t m_unfinished_scraped_urls = 0;\n\t\t\tsize_t m_scraped_urls_non200 = 0;\n\t\t\tsize_t m_unfinished_scraped_urls_non200 = 0;\n\t\t\tsize_t m_scraped_errors = 0;\n\t\t\tsize_t m_unfinished_scraped_errors = 0;\n\t\t\tsize_t m_urls_in_queue = 0;\n\t\t\tsize_t m_urls_assigned = 0;\n\t\t\tbool m_running = true;\n\t\t\tstd::mutex m_lock;\n\n\t\t\tvoid run();\n\t\t\tvoid log_report(size_t dt);\n\t};\n\n\tsize_t curl_string_reader(char *ptr, size_t size, size_t nmemb, void *userdata);\n\n\tsize_t read_max_scrapers();\n\tvoid url_downloader();\n\tvoid run_scraper_on_urls(const std::vector<std::string> &input_urls);\n\n}\n"
  },
  {
    "path": "src/scraper/scraper_store.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include \"scraper_store.h\"\n#include \"common/system.h\"\n#include \"common/datetime.h\"\n#include \"warc/warc.h\"\n#include \"transfer/transfer.h\"\n#include \"logger/logger.h\"\n\nusing namespace std;\n\nnamespace scraper {\n\n\tscraper_store::scraper_store() {\n\n\t}\n\n\tscraper_store::scraper_store(bool do_upload)\n\t: m_do_upload(do_upload)\n\t{\n\t}\n\t\n\tscraper_store::~scraper_store() {\n\t\tm_upload_limit = 0;\n\t\tupload_results();\n\t\tupload_non_200_results();\n\t}\n\n\tvoid scraper_store::add_scraper_data(const std::string &line) {\n\t\tm_lock.lock();\n\t\tm_results.push_back(line);\n\t\tm_lock.unlock();\n\t}\n\n\tvoid scraper_store::add_non_200_scraper_data(const std::string &line) 
{\n\t\tm_lock.lock();\n\t\tm_non_200_results.push_back(line);\n\t\tm_lock.unlock();\n\t}\n\n\tvoid scraper_store::add_link_data(const std::string &links) {\n\t\tm_lock.lock();\n\t\tm_link_results.push_back(links);\n\t\tm_lock.unlock();\n\t}\n\n\tvoid scraper_store::add_curl_error(const string &line) {\n\t\tm_lock.lock();\n\t\tm_curl_errors.push_back(line);\n\t\tm_lock.unlock();\n\t}\n\n\tvoid scraper_store::upload_url_datas() {\n\t\tif (!m_do_upload) return;\n\t\tm_lock.lock();\n\t\t// todo upload data\n\t\tm_lock.unlock();\n\t}\n\n\tvoid scraper_store::upload_domain_datas() {\n\t\tif (!m_do_upload) return;\n\t\tm_lock.lock();\n\t\t// todo upload data\n\t\tm_lock.unlock();\n\t}\n\n\tvoid scraper_store::upload_robots_datas() {\n\t\tif (!m_do_upload) return;\n\t\tm_lock.lock();\n\t\t// todo upload data\n\t\tm_lock.unlock();\n\t}\n\n\tvoid scraper_store::upload_results() {\n\t\tif (!m_do_upload) return;\n\t\tm_lock.lock();\n\t\tif (m_results.size() >= m_upload_limit) {\n\t\t\tconst string all_results = boost::algorithm::join(m_results, \"\");\n\t\t\tconst string all_link_results = boost::algorithm::join(m_link_results, \"\");\n\n\t\t\tm_results.resize(0);\n\t\t\tm_link_results.resize(0);\n\n\t\t\tm_lock.unlock();\n\n\t\t\tinternal_upload_results(all_results, all_link_results);\n\n\t\t\treturn;\n\t\t}\n\t\tm_lock.unlock();\n\t}\n\n\tvoid scraper_store::upload_non_200_results() {\n\t\tif (!m_do_upload) return;\n\t\tm_lock.lock();\n\t\tif (m_non_200_results.size() >= m_non_200_upload_limit) {\n\t\t\tconst string all_results = boost::algorithm::join(m_non_200_results, \"\");\n\n\t\t\tm_non_200_results.resize(0);\n\n\t\t\tm_lock.unlock();\n\n\t\t\tinternal_upload_non_200_results(all_results);\n\n\t\t\treturn;\n\t\t}\n\t\tm_lock.unlock();\n\t}\n\n\tvoid scraper_store::upload_curl_errors() {\n\t\tif (!m_do_upload) return;\n\t\tm_lock.lock();\n\t\tif (m_curl_errors.size() >= m_curl_errors_upload_limit) {\n\t\t\tconst string all_results = boost::algorithm::join(m_curl_errors, 
\"\");\n\n\t\t\tm_curl_errors.resize(0);\n\n\t\t\tm_lock.unlock();\n\n\t\t\tinternal_upload_curl_errors(all_results);\n\n\t\t\treturn;\n\t\t}\n\t\tm_lock.unlock();\n\t}\n\n\tstd::string scraper_store::tail() const {\n\t\tif (m_results.size() == 0) return \"\";\n\t\treturn m_results.back();\n\t}\n\n\tvoid scraper_store::try_upload_until_complete(const string &path, const string &data) {\n\n\t\tsize_t retry_num = 1;\n\t\twhile (transfer::upload_gz_file(path, data) == transfer::ERROR) {\n\t\t\tLOG_INFO(\"Error uploading file \" + path + \" retry no \" + to_string(retry_num++));\n\t\t\tstd::this_thread::sleep_for(std::chrono::seconds(30));\n\t\t}\n\t}\n\n\tvoid scraper_store::internal_upload_results(const string &all_results, const string &all_link_results) {\n\t\tconst string warc_path = \"crawl-data/ALEXANDRIA-SCRAPER-01/files/\" + common::uuid() + \"-\" + to_string(common::cur_datetime()) + \"-\" +\n\t\t\tto_string(m_file_index++) + \".warc.gz\";\n\t\ttry_upload_until_complete(warc::get_result_path(warc_path), all_results);\n\t\ttry_upload_until_complete(warc::get_link_result_path(warc_path), all_link_results);\n\t}\n\n\tvoid scraper_store::internal_upload_non_200_results(const string &all_results) {\n\t\tconst string warc_path = \"crawl-data/ALEXANDRIA-SCRAPER-01/non-200-responses/\" + common::uuid() + \"-\" + to_string(common::cur_datetime()) +\n\t\t\t\"-\" + to_string(m_file_index++) + \".warc.gz\";\n\t\ttry_upload_until_complete(warc::get_result_path(warc_path), all_results);\n\t}\n\n\tvoid scraper_store::internal_upload_curl_errors(const string &all_results) {\n\t\tconst string warc_path = \"crawl-data/ALEXANDRIA-SCRAPER-01/curl-errors/\" + common::uuid() + \"-\" + to_string(common::cur_datetime()) +\n\t\t\t\"-\" + to_string(m_file_index++) + \".warc.gz\";\n\t\ttry_upload_until_complete(warc::get_result_path(warc_path), all_results);\n\t}\n\n}\n"
  },
  {
    "path": "src/scraper/scraper_store.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include <iostream>\n#include <vector>\n#include <mutex>\n#include <thread>\n\nnamespace scraper {\n\n\t/*\n\t * Responsible for storing scraper data on a file and upload it to our fileserver when the file reaches a number of urls.\n\t * */\n\tclass scraper_store {\n\t\tpublic:\n\t\t\tscraper_store();\n\t\t\tscraper_store(bool do_upload);\n\t\t\t~scraper_store();\n\n\t\t\tvoid add_scraper_data(const std::string &line);\n\t\t\tvoid add_non_200_scraper_data(const std::string &line);\n\t\t\tvoid add_link_data(const std::string &links);\n\t\t\tvoid add_curl_error(const std::string &line);\n\t\t\tvoid upload_url_datas();\n\t\t\tvoid upload_domain_datas();\n\t\t\tvoid upload_robots_datas();\n\t\t\tvoid upload_results();\n\t\t\tvoid 
upload_non_200_results();\n\t\t\tvoid upload_curl_errors();\n\t\t\tstd::string tail() const;\n\n\t\t\tstd::vector<std::string> get_results() const { return m_results; }\n\n\t\tprivate:\n\t\t\tstd::mutex m_lock;\n\t\t\tstd::vector<std::string> m_results;\n\t\t\tstd::vector<std::string> m_non_200_results;\n\t\t\tstd::vector<std::string> m_link_results;\n\t\t\tstd::vector<std::string> m_curl_errors;\n\t\t\tsize_t m_file_index = 0;\n\t\t\tsize_t m_upload_limit = 50000;\n\t\t\tsize_t m_non_200_upload_limit = 10000;\n\t\t\tsize_t m_curl_errors_upload_limit = 10000;\n\t\t\tbool m_do_upload = true;\n\n\t\t\tvoid try_upload_until_complete(const std::string &path, const std::string &data);\n\t\t\tvoid internal_upload_results(const std::string &all_results, const std::string &all_link_results);\n\t\t\tvoid internal_upload_non_200_results(const std::string &all_results);\n\t\t\tvoid internal_upload_curl_errors(const std::string &all_results);\n\n\t};\n\n}\n"
  },
  {
    "path": "src/scraper.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include <iostream>\n#include <signal.h>\n#include <set>\n#include <boost/algorithm/string.hpp>\n#include \"fcgio.h\"\n#include \"config.h\"\n#include \"logger/logger.h\"\n#include \"scraper/scraper.h\"\n\nusing namespace std;\n\nvoid custom_scraper() {\n\n\tset<string> files = 
{\n\t\t\"1081037252118226853.gz\",\n\t\t\"10929784512354426297.gz\",\n\t\t\"11734959054377540990.gz\",\n\t\t\"1231587059077024966.gz\",\n\t\t\"12502184239462757041.gz\",\n\t\t\"12938836205580400636.gz\",\n\t\t\"13296278169331508461.gz\",\n\t\t\"14413462586171452382.gz\",\n\t\t\"15525439295995440529.gz\",\n\t\t\"16672519014390713150.gz\",\n\t\t\"18394430357962364895.gz\",\n\t\t\"10327881400750748691.gz\",\n\t\t\"10670281930934377105.gz\",\n\t\t\"10803309592637608156.gz\",\n\t\t\"1081037252118226853.gz\", \n\t\t\"10834835858785818363.gz\",\n\t\t\"10929784512354426297.gz\",\n\t\t\"11126428663436160103.gz\",\n\t\t\"11147566439172409894.gz\",\n\t\t\"11190665490273023949.gz\",\n\t\t\"11494937404220367031.gz\",\n\t\t\"11734959054377540990.gz\",\n\t\t\"11828921816388240862.gz\",\n\t\t\"12060772154545358825.gz\",\n\t\t\"12162727308599252185.gz\",\n\t\t\"1231587059077024966.gz\", \n\t\t\"12422730800151531594.gz\",\n\t\t\"12502184239462757041.gz\",\n\t\t\"12607232937660003080.gz\",\n\t\t\"12718743898666138934.gz\",\n\t\t\"12938836205580400636.gz\",\n\t\t\"13296278169331508461.gz\",\n\t\t\"13298202493829067141.gz\",\n\t\t\"13361744378846796689.gz\",\n\t\t\"13490885160851937523.gz\",\n\t\t\"13574739826384812082.gz\",\n\t\t\"13587802784601809709.gz\",\n\t\t\"13631835647153009173.gz\",\n\t\t\"1367770908792956967.gz\", \n\t\t\"14046839555269968094.gz\",\n\t\t\"14413462586171452382.gz\",\n\t\t\"14541904792326560616.gz\",\n\t\t\"1482373106349460952.gz\", 
\n\t\t\"14837337010216722341.gz\",\n\t\t\"15086873759162732674.gz\",\n\t\t\"15141235398943116798.gz\",\n\t\t\"15184607826907101421.gz\",\n\t\t\"15202491165257081552.gz\",\n\t\t\"15282359210281111669.gz\",\n\t\t\"15389582257311135463.gz\",\n\t\t\"15391345478373482283.gz\",\n\t\t\"15525439295995440529.gz\",\n\t\t\"15534406110118601925.gz\",\n\t\t\"15538335442391548855.gz\",\n\t\t\"15612477389751002303.gz\",\n\t\t\"15624474507591924007.gz\",\n\t\t\"15676254393982196237.gz\",\n\t\t\"15984927866124019398.gz\",\n\t\t\"16082148041043793761.gz\",\n\t\t\"16126091541072713257.gz\",\n\t\t\"16255682052513253306.gz\",\n\t\t\"16337701239641827376.gz\",\n\t\t\"16383716280375787103.gz\",\n\t\t\"16529912269361020733.gz\",\n\t\t\"16534544105461457700.gz\",\n\t\t\"16639969140692056885.gz\",\n\t\t\"16672519014390713150.gz\",\n\t\t\"16744732358440828846.gz\",\n\t\t\"16836166158893839160.gz\",\n\t\t\"17068835535637839797.gz\",\n\t\t\"1729061688188470388.gz\", \n\t\t\"17360561405055540730.gz\",\n\t\t\"1746843565446970019.gz\", \n\t\t\"17640709097762418065.gz\",\n\t\t\"18131842535353305093.gz\",\n\t\t\"18187211227753083566.gz\",\n\t\t\"18394430357962364895.gz\",\n\t\t\"1934117982241616211.gz\", \n\t\t\"2211216046817783595.gz\", \n\t\t\"2239809113491403275.gz\", \n\t\t\"2327635888646701575.gz\", \n\t\t\"2478041411438244752.gz\", \n\t\t\"2551177065288807556.gz\", \n\t\t\"2601237824066336189.gz\", \n\t\t\"2646934360799240353.gz\", \n\t\t\"2868212837076456812.gz\", \n\t\t\"2926810779085983621.gz\", \n\t\t\"3091319073926623211.gz\", \n\t\t\"338937183383628192.gz\",  \n\t\t\"3604690558929123764.gz\", \n\t\t\"3606044194188728481.gz\", \n\t\t\"3852426225324652244.gz\", \n\t\t\"3972328001646307399.gz\", \n\t\t\"4007769859008228127.gz\", \n\t\t\"4072548759689568430.gz\", \n\t\t\"4193623627004305293.gz\", \n\t\t\"4226856446620685890.gz\", \n\t\t\"4312881270332666532.gz\", \n\t\t\"4473520710685818343.gz\", \n\t\t\"4720198542499220909.gz\", \n\t\t\"4734886902380514989.gz\", 
\n\t\t\"4800764859071121577.gz\", \n\t\t\"4837392932044495189.gz\", \n\t\t\"493001789945179170.gz\",  \n\t\t\"5263808122620003539.gz\", \n\t\t\"5284265763220135234.gz\", \n\t\t\"5322267948444699594.gz\", \n\t\t\"5339170779334172446.gz\", \n\t\t\"5496827761574196815.gz\", \n\t\t\"5683557192991319856.gz\", \n\t\t\"5772366474889297285.gz\", \n\t\t\"5790856524309526271.gz\", \n\t\t\"5853082621493931535.gz\", \n\t\t\"5936310530969939988.gz\", \n\t\t\"5958586233415593683.gz\", \n\t\t\"5969382542874041237.gz\", \n\t\t\"5969882935831645732.gz\", \n\t\t\"6133590028181400561.gz\", \n\t\t\"6168304203247739410.gz\", \n\t\t\"619121932569169133.gz\",  \n\t\t\"6233832895907042056.gz\", \n\t\t\"6371233587304885182.gz\", \n\t\t\"6665598992901336677.gz\", \n\t\t\"6747719063536596803.gz\", \n\t\t\"6783121411632321193.gz\", \n\t\t\"6878954272251422334.gz\", \n\t\t\"6944679014837000907.gz\", \n\t\t\"7204366432079867323.gz\", \n\t\t\"7261759399318904627.gz\", \n\t\t\"7279922463899918193.gz\", \n\t\t\"7372161099870305017.gz\", \n\t\t\"7483704574748382827.gz\", \n\t\t\"7500975006697782336.gz\", \n\t\t\"7577940383110528297.gz\", \n\t\t\"7660839115654270407.gz\", \n\t\t\"7690859939878490358.gz\", \n\t\t\"7794216653216203685.gz\", \n\t\t\"7969521158007747392.gz\", \n\t\t\"7972503305086309118.gz\", \n\t\t\"7977087069524267698.gz\", \n\t\t\"801925665986995127.gz\",  \n\t\t\"8357461134896215565.gz\", \n\t\t\"8473327975000475483.gz\", \n\t\t\"8558287370764624669.gz\", \n\t\t\"88637784417391575.gz\",   \n\t\t\"9219910288440466216.gz\", \n\t\t\"9257832192261807811.gz\", \n\t\t\"9300442310473380111.gz\", \n\t\t\"9529889625719263624.gz\", \n\t\t\"9668036200275969373.gz\", \n\t\t\"990293958999783642.gz\"\n\t};\n\n\tboost::filesystem::create_directories(\"output\");\n\n\tfor (string file : files) {\n\n\t\tifstream infile(\"output/\" + file);\n\t\tif (infile.is_open()) continue;\n\n\t\tstringstream ss;\n\t\tint error;\n\t\ttransfer::gz_file_to_stream(\"crawl-data/ALEXANDRIA-TEST-SIZES/files/\" + file, 
ss, error);\n\n\t\tif (error == transfer::OK) {\n\t\t\tstring line;\n\n\t\t\tscraper::scraper_store store(false);\n\t\t\tmap<string, unique_ptr<scraper::scraper>> scrapers;\n\t\t\twhile (getline(ss, line)) {\n\t\t\t\tvector<string> cols;\n\t\t\t\tboost::algorithm::split(cols, line, boost::is_any_of(\"\\t\"));\n\n\t\t\t\tURL url(cols[0]);\n\n\t\t\t\tif (scrapers.count(url.host()) == 0) {\n\t\t\t\t\tscrapers[url.host()] = make_unique<scraper::scraper>(url.host(), &store);\n\t\t\t\t\tscrapers[url.host()]->set_timeout(0);\n\t\t\t\t}\n\n\t\t\t\tscrapers[url.host()]->push_url(url);\n\t\t\t}\n\n\t\t\tfor (auto &_scraper : scrapers) {\n\t\t\t\t_scraper.second->run();\n\t\t\t}\n\n\t\t\tconst string filename = \"output/\" + file;\n\t\t\tofstream outfile(filename, ios::trunc | ios::binary);\n\n\t\t\tboost::iostreams::filtering_ostream compress_stream;\n\t\t\tcompress_stream.push(boost::iostreams::gzip_compressor());\n\t\t\tcompress_stream.push(outfile);\n\n\t\t\tfor (const string row : store.get_results()) {\n\t\t\t\tcompress_stream << row;\n\t\t\t}\n\t\t}\n\t\treturn;\n\t}\n/*\n\tscraper::scraper_store store(false);\n\tscraper::scraper _scraper(\"heroes.thelazy.net\", &store);\n\t_scraper.set_timeout(0);\n\t_scraper.push_url(URL(\"https://heroes.thelazy.net//index.php/Main_Page\"));\n\t_scraper.push_url(URL(\"https://heroes.thelazy.net//index.php/Dungeon\"));\n\t_scraper.run();\n\n\tfor (const string row : store.get_results()) {\n\t\tcout << row << endl;\n\t}*/\n}\n\nint main(int argc, const char **argv) {\n\n\tstruct sigaction act{SIG_IGN};\n\tsigaction(SIGPIPE, &act, NULL);\n\n\tlogger::start_logger_thread();\n\n\tif (getenv(\"ALEXANDRIA_CONFIG\") != NULL) {\n\t\tconfig::read_config(getenv(\"ALEXANDRIA_CONFIG\"));\n\t} else {\n\t\tconfig::read_config(\"/etc/alexandria.conf\");\n\t}\n\n\tcustom_scraper();\n\n\tlogger::join_logger_thread();\n\n\treturn 0;\n}\n\n"
  },
  {
    "path": "src/search_engine/search_allocation.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include \"full_text/result_set.h\"\n#include \"full_text/record.h\"\n#include \"full_text/link_record.h\"\n#include \"full_text/domain_link_record.h\"\n#include \"config.h\"\n#include <map>\n#include <vector>\n\nnamespace search_engine {\n\n\t/*\n\t\tThe idea with this namespace is to handle all the memory allocation needed for serving a request to the search engine.\n\t*/\n\n\ttemplate <typename data_record>\n\tstruct storage {\n\t\t/*\n\t\t\tresult_sets holds pre-allocated object of class full_text::result_set.\n\t\t\tresult_sets[0 ... 
config::query_max_words]\n\t\t*/\n\t\tstd::vector<std::unique_ptr<full_text::result_set<data_record>>> m_result_sets;\n\n\t\t// To hold the intersection of the result sets.\n\t\tstd::unique_ptr<full_text::result_set<data_record>> m_intersected_result;\n\t};\n\n\tclass allocation {\n\n\t\tpublic:\n\n\t\t\tallocation() {\n\t\t\t\tm_storage = create_storage();\n\t\t\t\tm_link_storage = std::make_unique();\n\t\t\t\tm_domain_link_storage = std::make_unique();\n\t\t\t}\n\n\t\tprivate:\n\t\t\tstd::unique_ptr<storage<full_text::record>> m_storage;\n\t\t\tstd::unique_ptr<storage<full_text::link_record>> m_link_storage;\n\t\t\tstd::unique_ptr<storage<full_text::domain_link_record>> m_domain_link_storage;\n\t};\n\n\ttemplate <typename data_record>\n\tstd::unique_ptr<storage<data_record>> *create_storage() {\n\t\tauto storage = new Storage<data_record>;\n\n\t\t// Allocate result_sets.\n\t\tfor (size_t j = 0; j < config::query_max_words; j++) {\n\t\t\tauto result_set = std::make_unique<full_text::result_set<data_record>>(config::ft_max_results_per_section * config::ft_max_sections);\n\t\t\tstorage->result_sets.push_back(std::move(result_set));\n\t\t}\n\t\tstorage->intersected_result = std::make_unique<full_text::result_set<data_record>>(config::ft_max_results_per_section * config::ft_max_sections);\n\n\t\treturn storage;\n\t}\n\n\tallocation *create_allocation();\n\tvoid delete_allocation(allocation *allocation);\n\n}\n"
  },
  {
    "path": "src/search_engine/search_engine.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include \"search_engine.h\"\n#include <cmath>\n\nusing namespace std;\n\nnamespace search_engine {\n\n\tvoid reset_search_metric(struct full_text::search_metric &metric) {\n\t\tmetric.m_total_found = 0;\n\t\tmetric.m_total_url_links_found = 0;\n\t\tmetric.m_total_domain_links_found = 0;\n\t\tmetric.m_links_handled = 0;\n\t\tmetric.m_link_domain_matches = 0;\n\t\tmetric.m_link_url_matches = 0;\n\t}\n\n\tstd::vector<full_text::record> search_deduplicate(storage<full_text::record> *storage,\n\t\tconst full_text::index<full_text::record> &index, const vector<full_text::link_record> &links,\n\t\tconst vector<full_text::domain_link_record> &domain_links, const string &query, size_t limit, struct full_text::search_metric &metric) 
{\n\n\t\tvector<full_text::record> complete_result = search_wrapper(storage, index, links, domain_links, query, config::pre_result_limit, metric);\n\n\t\tvector<full_text::record> deduped_result = deduplicate_result_vector<full_text::record>(complete_result, limit);\n\n\t\treturn deduped_result;\n\t}\n\n}\n"
  },
  {
    "path": "src/search_engine/search_engine.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include <iostream>\n#include <vector>\n#include <cmath>\n#include \"full_text/index.h\"\n#include \"full_text/record.h\"\n#include \"full_text/link_record.h\"\n#include \"full_text/domain_link_record.h\"\n#include \"full_text/shard.h\"\n#include \"full_text/search_metric.h\"\n#include \"logger/logger.h\"\n#include \"profiler/profiler.h\"\n#include \"parser/parser.h\"\n#include \"transfer/transfer.h\"\n#include \"algorithm/hash.h\"\n#include \"algorithm/sort.h\"\n#include \"algorithm/algorithm.h\"\n#include \"search_allocation.h\"\n#include <cassert>\n\nnamespace search_engine {\n\n\tusing std::string;\n\tusing std::vector;\n\tusing std::future;\n\tusing std::thread;\n\tusing std::span;\n\tusing std::pair;\n\tusing std::map;\n\tusing 
std::unordered_map;\n\n\t/*\n\t\tPublic interface\n\t*/\n\n\t/*\n\t\tOur main search routine, no deduplication just raw search.\n\t*/\n\ttemplate<typename data_record>\n\tvector<data_record> search(storage<data_record> *storage, const full_text::index<data_record> &index,\n\t\tconst vector<full_text::link_record> &links, const vector<full_text::domain_link_record> &domain_links, const string &query, size_t limit,\n\t\tstruct full_text::search_metric &metric);\n\n\t/*\n\t\tOnly for FullTextRecords since deduplication requires domain hashes.\n\t*/\n\tvector<full_text::record> search_deduplicate(storage<full_text::record> *storage,\n\t\tconst full_text::index<full_text::record> &index, const vector<full_text::link_record> &links,\n\t\tconst vector<full_text::domain_link_record> &domain_links, const string &query, size_t limit, struct full_text::search_metric &metric);\n\n\t/*\n\t\tSearch for the exact phrase. Will treat the whole phrase as an n_gram so will only give results when num words in query are less\n\t\tor equal to config::n_gram.\n\t*/\n\ttemplate<typename data_record>\n\tvector<data_record> search_exact(storage<data_record> *storage, const full_text::index<data_record> &index,\n\t\tconst string &query, size_t limit, struct full_text::search_metric &metric);\n\n\ttemplate<typename data_record>\n\tvector<data_record> search_ids(storage<data_record> *storage, const full_text::index<data_record> &index,\n\t\tconst string &query, size_t limit);\n\n\ttemplate<typename data_record>\n\tfull_text::result_set<data_record> *search_remote(const std::string &query, storage<data_record> *storage);\n\n\n\ttemplate<typename data_record>\n\tclass comparator_class {\n\tpublic:\n\t\t// Comparator function\n\t\tbool operator()(data_record &a, data_record &b)\n\t\t{\n\t\t\tif (a.m_score == b.m_score) return a.m_value < b.m_value;\n\t\t\treturn a.m_score > b.m_score;\n\t\t}\n\t};\n\n\tvoid reset_search_metric(struct full_text::search_metric &metric);\n\n\ttemplate<typename 
data_record>\n\tvoid set_total_found(const vector<full_text::result_set<data_record> *> result_vector, struct full_text::search_metric &metric, double result_quote) {\n\n\t\tsize_t largest_total = 0;\n\t\tfor (full_text::result_set<data_record> *result : result_vector) {\n\t\t\tif (result->total_num_results() > largest_total) {\n\t\t\t\tlargest_total = result->total_num_results();\n\t\t\t}\n\t\t}\n\n\t\tmetric.m_total_found = (size_t)(largest_total * result_quote);\n\t}\n\n\ttemplate<typename data_record>\n\tsize_t largest_result(const vector<full_text::result_set<data_record> *> &result_vector) {\n\n\t\tsize_t largest_size = 0;\n\t\tfor (full_text::result_set<data_record> *result : result_vector) {\n\t\t\tif (result->size() > largest_size) {\n\t\t\t\tlargest_size = result->size();\n\t\t\t}\n\t\t}\n\n\t\treturn largest_size;\n\t}\n\n\t/*\n\t\tAdd scores for the given links to the result set. The links are assumed to be ordered by link.m_target_hash ascending.\n\t*/\n\ttemplate<typename data_record>\n\tsize_t apply_link_scores(const vector<full_text::link_record> &links, full_text::result_set<data_record> *results) {\n\n\t\tif (typeid(data_record) != typeid(full_text::record)) return 0;\n\t\tif (links.size() == 0) return 0;\n\n\t\tsize_t applied_links = 0;\n\n\t\tsize_t i = 0;\n\t\tsize_t j = 0;\n\t\tmap<pair<uint64_t, uint64_t>, uint64_t> domain_unique;\n\t\tfull_text::record *data = (full_text::record *)results->data_pointer();\n\t\twhile (i < links.size() && j < results->size()) {\n\n\t\t\tconst uint64_t hash1 = links[i].m_target_hash;\n\t\t\tconst uint64_t hash2 = data[j].m_value;\n\n\t\t\tif (hash1 < hash2) {\n\t\t\t\ti++;\n\t\t\t} else if (hash1 == hash2) {\n\n\t\t\t\tif (domain_unique.count(std::make_pair(links[i].m_source_domain, links[i].m_target_hash)) == 0) {\n\t\t\t\t\tconst float url_score = expm1(25.0f*links[i].m_score) / 50.0f;\n\t\t\t\t\tdata[j].m_score += 
url_score;\n\t\t\t\t\tapplied_links++;\n\t\t\t\t\tdomain_unique[std::make_pair(links[i].m_source_domain, links[i].m_target_hash)] = links[i].m_source_domain;\n\t\t\t\t}\n\n\t\t\t\ti++;\n\t\t\t} else {\n\t\t\t\tj++;\n\t\t\t}\n\t\t}\n\n\t\treturn applied_links;\n\t}\n\n\ttemplate<typename data_record>\n\tsize_t apply_domain_link_scores(const vector<full_text::domain_link_record> &links, full_text::result_set<data_record> *results) {\n\n\t\tif (typeid(data_record) != typeid(full_text::record)) return 0;\n\t\tif (links.size() == 0) return 0;\n\n\t\tsize_t applied_links = 0;\n\t\t{\n\t\t\tstd::unordered_map<uint64_t, float> domain_scores;\n\t\t\tstd::unordered_map<uint64_t, int> domain_counts;\n\t\t\tstd::map<std::pair<uint64_t, uint64_t>, uint64_t> domain_unique;\n\t\t\t{\n\t\t\t\tfor (const full_text::domain_link_record &link : links) {\n\n\t\t\t\t\tif (domain_unique.count(std::make_pair(link.m_source_domain, link.m_target_domain)) == 0) {\n\n\t\t\t\t\t\tconst float domain_score = expm1(25.0f*link.m_score) / 50.0f;\n\t\t\t\t\t\tdomain_scores[link.m_target_domain] += domain_score;\n\t\t\t\t\t\tdomain_counts[link.m_target_domain]++;\n\t\t\t\t\t\tdomain_unique[std::make_pair(link.m_source_domain, link.m_target_domain)] = link.m_source_domain;\n\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t}\n\n\t\t\t// Loop over the results and add the calculated domain scores.\n\t\t\tfull_text::record *data = (full_text::record *)results->data_pointer();\n\t\t\tfor (size_t i = 0; i < results->size(); i++) {\n\t\t\t\tconst float domain_score = domain_scores[data[i].m_domain_hash];\n\t\t\t\tdata[i].m_score += domain_score;\n\t\t\t\tapplied_links += domain_counts[data[i].m_domain_hash];\n\t\t\t}\n\t\t}\n\n\t\treturn applied_links;\n\t}\n\n\ttemplate<typename data_record>\n\tsize_t lower_bound(const data_record *data, size_t pos, size_t len, uint64_t value) {\n\t\twhile (pos < len) {\n\t\t\tsize_t m = (pos + len) >> 1;\n\t\t\tif (data[m].m_value < value) {\n\t\t\t\tpos = m + 1;\n\t\t\t} else 
{\n\t\t\t\tlen = m;\n\t\t\t}\n\t\t}\n\n\t\treturn pos;\n\t}\n\n\ttemplate<typename data_record>\n\tvoid value_intersection(const vector<full_text::result_set<data_record> *> &result_sets, vector<int> sections, vector<data_record> &dest) {\n\n\t\tif (result_sets.size() == 0) {\n\t\t\treturn;\n\t\t}\n\n\t\tsize_t shortest_vector_position = 0;\n\t\tsize_t shortest_len = SIZE_MAX;\n\t\t{\n\t\t\tsize_t iter_index = 0;\n\t\t\tfor (full_text::result_set<data_record> *result_set : result_sets) {\n\t\t\t\tif (shortest_len > result_set->size()) {\n\t\t\t\t\tshortest_len = result_set->size();\n\t\t\t\t\tshortest_vector_position = iter_index;\n\t\t\t\t}\n\t\t\t\titer_index++;\n\t\t\t}\n\t\t}\n\n\t\tvector<size_t> positions(result_sets.size(), 0);\n\n\t\tconst data_record *shortest_data = result_sets[shortest_vector_position]->section_pointer(sections[shortest_vector_position]);\n\n\t\twhile (positions[shortest_vector_position] < shortest_len) {\n\n\t\t\tbool all_equal = true;\n\t\t\tuint64_t value = shortest_data[positions[shortest_vector_position]].m_value;\n\n\t\t\tfloat score_sum = 0.0f;\n\t\t\tsize_t iter_index = 0;\n\t\t\tfor (full_text::result_set<data_record> *result_set : result_sets) {\n\t\t\t\tconst data_record *data_arr = result_set->section_pointer(sections[iter_index]);\n\t\t\t\tconst size_t len = result_set->size();\n\n\t\t\t\tsize_t *pos = &(positions[iter_index]);\n\t\t\t\t\n\t\t\t\t// this is a linear search.\n\t\t\t\twhile (*pos < len && value > data_arr[*pos].m_value) {\n\t\t\t\t\t(*pos)++;\n\t\t\t\t}\n\n\t\t\t\tif (*pos < len && value == data_arr[*pos].m_value) {\n\t\t\t\t\tconst float score = data_arr[*pos].m_score;\n\t\t\t\t\tscore_sum += score;\n\t\t\t\t}\n\t\t\t\tif ((*pos < len && value < data_arr[*pos].m_value) || *pos >= len) {\n\t\t\t\t\tall_equal = false;\n\t\t\t\t\tbreak;\n\t\t\t\t}\n\t\t\t\titer_index++;\n\t\t\t}\n\t\t\tif (all_equal) {\n\t\t\t\tdest.push_back(shortest_data[positions[shortest_vector_position]]);\n\t\t\t\tdest.back().m_score = 
score_sum / result_sets.size();\n\t\t\t}\n\n\t\t\tpositions[shortest_vector_position]++;\n\t\t}\n\t}\n\n\ttemplate<typename data_record>\n\tvoid calculate_intersection(const vector<full_text::result_set<data_record> *> &result_sets, full_text::result_set<data_record> *dest) {\n\n\t\tfor (full_text::result_set<data_record> *result : result_sets) {\n\t\t\tif (result->size() == 0) return;\n\t\t}\n\n\t\tvector<full_text::result_set<data_record> *> sorted_result_sets(result_sets);\n\n\t\tsort(sorted_result_sets.begin(), sorted_result_sets.end(), [](const full_text::result_set<data_record> *a, const full_text::result_set<data_record> *b) {\n\t\t\treturn a->total_num_results() < b->total_num_results();\n\t\t});\n\n\t\tvector<int> lengths;\n\t\tfor (full_text::result_set<data_record> *result : sorted_result_sets) {\n\t\t\tlengths.push_back(result->num_sections());\n\t\t}\n\n\t\tvector<vector<int>> partitions = Algorithm::incremental_partitions(lengths, config::ft_section_depth);\n\n\t\t// First just try the top sections.\n\t\t{\n\t\t\tvector<data_record> result;\n\t\t\tvalue_intersection(sorted_result_sets, partitions[0], result);\n\t\t\tif (result.size() >= config::result_limit) {\n\t\t\t\tdest->copy_vector(result);\n\t\t\t\treturn;\n\t\t\t}\n\t\t}\n\n\t\tvector<int> maximum(sorted_result_sets.size(), 0);\n\t\tfor (const vector<int> &vec : partitions) {\n\t\t\tfor (size_t i = 0; i < vec.size(); i++) {\n\t\t\t\tif (vec[i] > maximum[i]) maximum[i] = vec[i];\n\t\t\t}\n\t\t}\n\t\tfor (size_t i = 0; i < maximum.size(); i++) {\n\t\t\tsorted_result_sets[i]->read_to_section(maximum[i]);\n\t\t}\n\n\t\tsize_t idx = 0;\n\t\tconst size_t num_threads = 8;\n\n\t\tThreadPool pool(num_threads);\n\t\tvector<vector<data_record>> results(partitions.size());\n\t\tstd::vector<std::future<vector<data_record>>> thread_results;\n\t\tfor (const vector<int> &partition : partitions) {\n\t\t\tthread_results.emplace_back(pool.enqueue([sorted_result_sets, partition]() {\n\t\t\t\tvector<data_record> 
result;\n\t\t\t\tvalue_intersection(sorted_result_sets, partition, result);\n\t\t\t\treturn result;\n\t\t\t}));\n\t\t\tidx++;\n\t\t}\n\t\tidx = 0;\n\t\tfor (auto && result: thread_results) {\n\t\t\tresults[idx] = result.get();\n\t\t\tidx++;\n\t\t}\n\t\t// merge\n\t\tvector<data_record> merged_vec;\n\t\tSort::merge_arrays(results, [](const data_record &a, const data_record &b) {\n\t\t\treturn a.m_value < b.m_value;\n\t\t}, merged_vec);\n\n\t\t// copy.\n\t\tdest->copy_vector(merged_vec);\n\t}\n\n\ttemplate<typename data_record>\n\tvoid sort_by_score(vector<data_record> &results) {\n\t\tsort(results.begin(), results.end(), [](const data_record &a, const data_record &b) {\n\t\t\treturn a.m_score > b.m_score;\n\t\t});\n\t}\n\n\t/*\n\t\tputs the top n elements in the first n slots of results. Then sorts those top n elements by value.\n\n\t\tthis function assumes that the input results are sorted by value! so it does nothing for n < results.size()\n\t*/\n\ttemplate<typename data_record>\n\tvoid get_unsorted_results_with_top_scores(full_text::result_set<data_record> *result, size_t n) {\n\n\t\tif (result->size() > n) {\n\t\t\tspan<data_record> *arr = result->span_pointer();\n\t\t\tnth_element(arr->begin(), arr->begin() + (n - 1), arr->end(), SearchEngine::comparator_class<data_record>{});\n\n\t\t\tsort(arr->begin(), arr->begin() + n, [](const data_record &a, const data_record &b) {\n\t\t\t\treturn a.m_value < b.m_value;\n\t\t\t});\n\n\t\t\tresult->resize(n);\n\t\t}\n\t}\n\n\ttemplate<typename data_record>\n\tbool result_has_many_domains(const full_text::result_set<data_record> *results) {\n\n\t\tif (results->size() == 0) return false;\n\n\t\tconst data_record *data = results->data_pointer();\n\t\tconst uint64_t first_domain_hash = data[0].m_domain_hash;\n\t\tfor (size_t i = 0; i < results->size(); i++) {\n\t\t\tif (data[i].m_domain_hash != first_domain_hash) {\n\t\t\t\treturn true;\n\t\t\t}\n\t\t}\n\n\t\treturn false;\n\t}\n\n\ttemplate<typename data_record>\n\tvoid 
deduplicate_domains(full_text::result_set<data_record> *results, size_t results_per_domain, size_t limit) {\n\n\t\tvector<data_record> deduplicate;\n\t\tunordered_map<uint64_t, size_t> domain_counts;\n\t\tdata_record *records = results->data_pointer();\n\t\tsize_t j = 0;\n\t\tfor (size_t i = 0; i < results->size() && j < limit; i++) {\n\t\t\trecords[j] = records[i];\n\t\t\tif (domain_counts[records[i].m_domain_hash] < results_per_domain) {\n\t\t\t\tj++;\n\t\t\t\tdomain_counts[records[i].m_domain_hash]++;\n\t\t\t}\n\t\t}\n\t\tresults->resize(j);\n\t}\n\n\ttemplate<typename data_record>\n\tvector<data_record> deduplicate_result_vector(const vector<data_record> &results, size_t limit) {\n\n\t\tvector<data_record> deduped;\n\t\tvector<data_record> non_deduped;\n\n\t\tmap<uint64_t, size_t> d_count;\n\t\tfor (const data_record &result : results) {\n\t\t\tif (d_count[result.m_domain_hash] < config::deduplicate_domain_count) {\n\t\t\t\tdeduped.push_back(result);\n\t\t\t} else {\n\t\t\t\tnon_deduped.push_back(result);\n\t\t\t}\n\t\t\td_count[result.m_domain_hash]++;\n\t\t}\n\t\tif (deduped.size() < limit) {\n\t\t\tconst size_t num_missing = limit - deduped.size();\n\t\t\tif (non_deduped.size() > num_missing) {\n\t\t\t\tnon_deduped.resize(num_missing);\n\t\t\t}\n\t\t\tvector<data_record> ret;\n\t\t\tSort::merge_arrays(deduped, non_deduped, [] (const data_record &a, const data_record &b) {\n\t\t\t\treturn a.m_score > b.m_score;\n\t\t\t}, ret);\n\t\t\treturn ret;\n\t\t}\n\n\t\tdeduped.resize(limit);\n\n\t\treturn deduped;\n\t}\n\n\ttemplate<typename data_record>\n\tvector<full_text::result_set<data_record> *> search_shards(vector<full_text::result_set<data_record> *> &result_sets,\n\t\tconst vector<FullTextShard<data_record> *> &shards, const vector<string> &words) {\n\n\t\tassert(words.size() <= config::query_max_words);\n\t\tassert(words.size() <= result_sets.size());\n\n\t\tvector<full_text::result_set<data_record> *> result_vector;\n\t\tvector<string> 
searched_words;\n\t\tsize_t word_id = 0;\n\t\tfor (const string &word : words) {\n\n\t\t\t// One word should only be searched once.\n\t\t\tif (find(searched_words.begin(), searched_words.end(), word) != searched_words.end()) continue;\n\t\t\t\n\t\t\tsearched_words.push_back(word);\n\n\t\t\tuint64_t word_hash = Hash::str(word);\n\n\t\t\tshards[word_hash % config::ft_num_shards]->find(word_hash, result_sets[word_id]);\n\n\t\t\tresult_vector.push_back(result_sets[word_id]);\n\t\t\tword_id++;\n\t\t}\n\n\t\treturn result_vector;\n\t}\n\n\ttemplate<typename data_record>\n\tvector<full_text::result_set<data_record> *> search_shards_exact(vector<full_text::result_set<data_record> *> &result_sets,\n\t\tconst vector<FullTextShard<data_record> *> &shards, const vector<string> &words) {\n\n\t\tassert(words.size() <= config::query_max_words);\n\t\tassert(words.size() <= result_sets.size());\n\n\t\tvector<full_text::result_set<data_record> *> result_vector;\n\n\t\tuint64_t n_gram_hash = Hash::str(boost::join(words, \" \"));\n\n\t\tshards[n_gram_hash % config::ft_num_shards]->find(n_gram_hash, result_sets[0]);\n\n\t\tresult_vector.push_back(result_sets[0]);\n\n\t\treturn result_vector;\n\t}\n\n\ttemplate <typename data_record>\n\tfull_text::result_set<data_record> *make_search(storage<data_record> *storage,\n\t\t\tconst vector<FullTextShard<data_record> *> &shards, const vector<full_text::link_record> &links,\n\t\t\tconst vector<full_text::domain_link_record> &domain_links, const string &query, size_t limit, struct full_text::search_metric &metric) {\n\n\t\treset_search_metric(metric);\n\n\t\tvector<string> words = Text::get_full_text_words(query, config::query_max_words);\n\t\tif (words.size() == 0) return new full_text::result_set<data_record>(0);\n\n\t\tvector<full_text::result_set<data_record> *> result_vector = search_shards<data_record>(storage->result_sets, shards, words);\n\n\t\tfull_text::result_set<data_record> *flat_result;\n\t\tif (result_vector.size() > 1) 
{\n\n\t\t\t// We need to calculate the intersection of the given results.\n\t\t\tflat_result = storage->intersected_result;\n\t\t\tflat_result->resize(0);\n\t\t\tcalculate_intersection<data_record>(result_vector, flat_result);\n\n\t\t\tset_total_found<data_record>(result_vector, metric, (double)flat_result->size() / largest_result(result_vector));\n\t\t} else {\n\t\t\tflat_result = result_vector[0];\n\t\t\tset_total_found<data_record>(result_vector, metric, 1.0);\n\t\t}\n\n\t\t// Close file pointers.\n\t\tfor (full_text::result_set<data_record> *result_set : result_vector) {\n\t\t\tresult_set->close_sections();\n\t\t}\n\n\t\tmetric.m_link_domain_matches = apply_domain_link_scores(domain_links, flat_result);\n\t\tmetric.m_link_url_matches = apply_link_scores(links, flat_result);\n\n\t\tget_unsorted_results_with_top_scores<data_record>(flat_result, limit);\n\n\t\treturn flat_result;\n\t}\n\n\ttemplate <typename data_record>\n\tfull_text::result_set<data_record> *make_search_exact(storage<data_record> *storage,\n\t\t\tconst vector<FullTextShard<data_record> *> &shards, const string &query, size_t limit, struct full_text::search_metric &metric) {\n\n\t\treset_search_metric(metric);\n\n\t\tvector<string> words = Text::get_full_text_words(query, config::query_max_words);\n\t\tif (words.size() == 0) return new full_text::result_set<data_record>(0);\n\n\t\tvector<full_text::result_set<data_record> *> result_vector = search_shards_exact<data_record>(storage->result_sets, shards, words);\n\n\t\tfull_text::result_set<data_record> *flat_result;\n\t\tif (result_vector.size() > 1) {\n\n\t\t\t// We need to calculate the intersection of the given results.\n\t\t\tflat_result = storage->intersected_result;\n\t\t\tflat_result->resize(0);\n\t\t\tcalculate_intersection<data_record>(result_vector, flat_result);\n\n\t\t\tset_total_found<data_record>(result_vector, metric, (double)flat_result->size() / largest_result(result_vector));\n\t\t} else {\n\t\t\tflat_result = 
result_vector[0];\n\t\t\tset_total_found<data_record>(result_vector, metric, 1.0);\n\t\t}\n\n\t\t// Close file pointers.\n\t\tfor (full_text::result_set<data_record> *result_set : result_vector) {\n\t\t\tresult_set->close_sections();\n\t\t}\n\n\t\tget_unsorted_results_with_top_scores<data_record>(flat_result, limit);\n\n\t\treturn flat_result;\n\t}\n\n\ttemplate<typename data_record>\n\tvector<data_record> search_wrapper(storage<data_record> *storage, const full_text::index<data_record> &index,\n\t\tconst vector<full_text::link_record> &links, const vector<full_text::domain_link_record> &domain_links, const string &query, size_t limit,\n\t\tstruct full_text::search_metric &metric) {\n\n\n\t\tfull_text::result_set<data_record> *result = make_search<data_record>(storage, index.shards(), links, domain_links, query, limit, metric);\n\n\t\tvector<data_record> complete_result(result->span_pointer()->begin(), result->span_pointer()->end());\n\n\t\t// Sort.\n\t\tsort_by_score<data_record>(complete_result);\n\n\t\treturn complete_result;\n\t}\n\n\ttemplate<typename data_record>\n\tvector<data_record> search_wrapper_exact(storage<data_record> *storage, const full_text::index<data_record> &index,\n\t\tconst string &query, size_t limit, struct full_text::search_metric &metric) {\n\n\t\tfull_text::result_set<data_record> *result = make_search_exact<data_record>(storage, index.shards(), query, limit, metric);\n\n\t\tvector<data_record> complete_result(result->span_pointer()->begin(), result->span_pointer()->end());\n\n\t\t// Sort.\n\t\tsort_by_score<data_record>(complete_result);\n\n\t\treturn complete_result;\n\t}\n\n\ttemplate<typename data_record>\n\tvector<data_record> search(storage<data_record> *storage, const full_text::index<data_record> &index,\n\t\tconst vector<full_text::link_record> &links, const vector<full_text::domain_link_record> &domain_links, const string &query, size_t limit,\n\t\tstruct full_text::search_metric &metric) {\n\n\t\tvector<data_record> 
complete_result = search_wrapper(storage, index, links, domain_links, query, limit, metric);\n\t\t\n\t\tif (complete_result.size() > limit) {\n\t\t\tcomplete_result.resize(limit);\n\t\t}\n\n\t\treturn complete_result;\n\t}\n\n\ttemplate<typename data_record>\n\tvector<data_record> search_exact(storage<data_record> *storage, const full_text::index<data_record> &index,\n\t\tconst string &query, size_t limit, struct full_text::search_metric &metric) {\n\n\t\tvector<data_record> complete_result = search_wrapper_exact(storage, index, query, limit, metric);\n\t\t\n\t\tif (complete_result.size() > limit) {\n\t\t\tcomplete_result.resize(limit);\n\t\t}\n\n\t\treturn complete_result;\n\t}\n\n\ttemplate<typename data_record>\n\tvector<data_record> search_ids(storage<data_record> *storage, const full_text::index<data_record> &index,\n\t\tconst string &query, size_t limit) {\n\n\t\tvector<string> words = text::get_expanded_full_text_words(query);\n\n\t\tuint64_t key = algorithm::hash(boost::algorithm::join(words, \" \"));\n\n\t\tindex.shards()[key % config::ft_num_shards]->find(key, storage->result_sets[0]);\n\n\t\tvector<data_record> ret(storage->result_sets[0]->span_pointer()->begin(), storage->result_sets[0]->span_pointer()->end());\n\n\t\tstorage->result_sets[0]->close_sections();\n\n\t\treturn ret;\n\t}\n\n\ttemplate<typename data_record>\n\tfull_text::result_set<data_record> *search_remote(const std::string &query, storage<data_record> *storage) {\n\t\tstorage->result_sets[0]->resize(0);\n\n\t\tstring buffer;\n\t\tint error;\n\t\ttransfer::url_to_string(config::data_node + \"/?i=\" + parser::urlencode(query), buffer, error);\n\t\tif (error == transfer::OK) {\n\t\t\tconst size_t num_records = buffer.size() / sizeof(data_record);\n\t\t\tdata_record *data_ptr = storage->result_sets[0]->data_pointer();\n\t\t\tmemcpy(data_ptr, buffer.c_str(), buffer.size());\n\t\t\tstorage->result_sets[0]->resize(num_records);\n\t\t}\n\t\treturn storage->result_sets[0];\n\t}\n\n}\n"
  },
  {
    "path": "src/server/search_server.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include \"search_server.h\"\n\n#include <iostream>\n#include \"http/server.h\"\n#include \"indexer/index_manager.h\"\n#include \"indexer/url_record.h\"\n#include \"hash_table2/hash_table.h\"\n#include \"transfer/transfer.h\"\n#include \"parser/parser.h\"\n#include \"parser/unicode.h\"\n#include \"api/result_with_snippet.h\"\n#include \"api/api_response.h\"\n#include \"full_text/search_metric.h\"\n\nnamespace server {\n\n\tvoid search_server() {\n\n\t\tindexer::index_manager idx_manager;\n\n\t\tcout << \"starting server...\" << endl;\n\n\t\t::http::server srv([&idx_manager](const http::request &req) {\n\t\t\thttp::response res;\n\n\t\t\tres.content_type(\"application/json\");\n\n\t\t\tURL url = req.url();\n\n\t\t\tauto query = 
url.query();\n\n\t\t\tsize_t limit = 1000;\n\t\t\tif (query.count(\"limit\")) limit = std::stoi(query[\"limit\"]);\n\n\t\t\t(void)limit;\n\n\t\t\tif (url.path() == \"/favicon.ico\") {\n\t\t\t\tres.code(404);\n\t\t\t\tres.body(\"404\");\n\t\t\t\treturn res;\n\t\t\t}\n\n\t\t\tstringstream body;\n\n\t\t\t// implement the same search server logic we have on alexandria.org now.\n\t\t\tLOG_INFO(\"Serving request: \" + url.path());\n\n\t\t\tbool deduplicate = true;\n\t\t\tif (query.find(\"d\") != query.end()) {\n\t\t\t\tif (query[\"d\"] == \"a\") {\n\t\t\t\t\tdeduplicate = false;\n\t\t\t\t}\n\t\t\t}\n\n\t\t\tif (query.find(\"q\") != query.end() && deduplicate) {\n\n\t\t\t\tfull_text::search_metric metric;\n\n\t\t\t\tprofiler::instance profiler;\n\n\t\t\t\tauto results = idx_manager.find(query[\"q\"], metric);\n\n\t\t\t\tapi::api_response api_res(results, metric, profiler.get());\n\t\t\t\tbody << api_res;\n\t\t\t}\n\n\t\t\tres.code(200);\n\n\t\t\tres.body(body.str());\n\n\t\t\treturn res;\n\t\t});\n\t}\n}\n"
  },
  {
    "path": "src/server/search_server.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\nnamespace server {\n\tvoid search_server();\n}\n"
  },
  {
    "path": "src/server/url_server.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include \"url_server.h\"\n\n#include <iostream>\n#include \"http/server.h\"\n#include \"indexer/index_manager.h\"\n#include \"indexer/url_record.h\"\n\nnamespace server {\n\tvoid url_server() {\n\n\t\tcout << \"starting server...\" << endl;\n\n\t\t::http::server srv([](const http::request &req) {\n\t\t\thttp::response res;\n\n\t\t\tURL url = req.url();\n\n\t\t\tauto query = url.query();\n\n\t\t\tstringstream body;\n\n\t\t\tif (req.request_method() == \"POST\") {\n\t\t\t\tconst string req_body = req.request_body();\n\n\t\t\t\tconst size_t num_hashes = req_body.size() / sizeof(uint64_t);\n\t\t\t\tstd::vector<uint64_t> domain_hashes(num_hashes);\n\t\t\t\tmemcpy((char *)domain_hashes.data(), req_body.c_str(), num_hashes * 
sizeof(uint64_t));\n\n\t\t\t\tauto tokens = text::get_tokens(query[\"q\"]);\n\n\t\t\t\tsize_t len = std::stoull(query[\"len\"]);\n\n\t\t\t\tstd::map<uint64_t, std::vector<indexer::url_record>> results;\n\n\t\t\t\tutils::thread_pool pool(32);\n\t\t\t\tstd::mutex result_lock;\n\t\t\t\tcout << \"received \" << domain_hashes.size() << \" hashes\" << endl;\n\t\t\t\tsize_t all_total_num_results = 0;\n\t\t\t\tfor (auto dom_hash : domain_hashes) {\n\t\t\t\t\tpool.enqueue([dom_hash, tokens, &query, &result_lock, &results, &all_total_num_results, len]() {\n\t\t\t\t\t\tstd::vector<indexer::url_record> res;\n\n\t\t\t\t\t\tvector<indexer::link_record> links;\n\t\t\t\t\t\t{\n\t\t\t\t\t\t\t// read links\n\t\t\t\t\t\t\tconst string file = config::data_path() + \"/\" + to_string(dom_hash % 8) +\n\t\t\t\t\t\t\t\t\"/full_text/url_links/\" + to_string(dom_hash) + \".data\";\n\t\t\t\t\t\t\tindexer::index_reader_file reader(file);\n\n\t\t\t\t\t\t\tif (reader.size()) {\n\t\t\t\t\t\t\t\tif (reader.size() > 10 * 1024* 1024) {\n\t\t\t\t\t\t\t\t\tindexer::index<indexer::link_record> idx(\"url_links\", dom_hash, 1000);\n\t\t\t\t\t\t\t\t\tlinks = idx.find_top(tokens, 1000);\n\t\t\t\t\t\t\t\t} else {\n\t\t\t\t\t\t\t\t\tconst size_t size = reader.size();\n\t\t\t\t\t\t\t\t\tstd::unique_ptr<char[]> buffer = std::make_unique<char[]>(size);\n\t\t\t\t\t\t\t\t\treader.seek(0);\n\t\t\t\t\t\t\t\t\treader.read(buffer.get(), size);\n\t\t\t\t\t\t\t\t\tstd::istringstream ram_reader(string(buffer.get(), size));\n\t\t\t\t\t\t\t\t\tindexer::index<indexer::link_record> idx(&ram_reader, 1000);\n\t\t\t\t\t\t\t\t\tlinks = idx.find_top(tokens, 1000);\n\t\t\t\t\t\t\t\t}\n\t\t\t\t\t\t\t}\n\n\t\t\t\t\t\t\tstd::sort(links.begin(), links.end(), indexer::link_record::storage_order());\n\n\t\t\t\t\t\t\tauto link_formula = [](float score) {\n\t\t\t\t\t\t\t\treturn expm1(20.0f * score) / 10.0f;\n\t\t\t\t\t\t\t};\n\n\t\t\t\t\t\t\tstd::vector<indexer::link_record> grouped;\n\t\t\t\t\t\t\tfor (auto rec : links) 
{\n\t\t\t\t\t\t\t\tif (grouped.size() && grouped.back().storage_equal(rec)) {\n\t\t\t\t\t\t\t\t\tgrouped.back().m_score += link_formula(rec.m_score);\n\t\t\t\t\t\t\t\t} else {\n\t\t\t\t\t\t\t\t\tgrouped.emplace_back(rec);\n\t\t\t\t\t\t\t\t\tgrouped.back().m_score = link_formula(rec.m_score);\n\t\t\t\t\t\t\t\t}\n\t\t\t\t\t\t\t}\n\n\t\t\t\t\t\t\tlinks = grouped;\n\t\t\t\t\t\t}\n\n\t\t\t\t\t\tconst string file = config::data_path() + \"/\" + to_string(dom_hash % 8) + \"/full_text/url/\" +\n\t\t\t\t\t\t\tto_string(dom_hash) + \".data\";\n\t\t\t\t\t\tindexer::index_reader_file reader(file);\n\n\t\t\t\t\t\tsize_t mod_incr = 0;\n\t\t\t\t\t\tauto score_mod = [&mod_incr, &links](const indexer::url_record &record) {\n\t\t\t\t\t\t\twhile (mod_incr < links.size() && links[mod_incr].m_target_hash < record.m_value) {\n\t\t\t\t\t\t\t\tmod_incr++;\n\t\t\t\t\t\t\t}\n\t\t\t\t\t\t\tfloat link_score = 0.0f;\n\t\t\t\t\t\t\tif (mod_incr < links.size() && links[mod_incr].m_target_hash == record.m_value) {\n\t\t\t\t\t\t\t\tlink_score += links[mod_incr].m_score;\n\t\t\t\t\t\t\t}\n\t\t\t\t\t\t\treturn record.m_score + ((1000.0f - record.url_length()) / 500.0f) + link_score;\n\t\t\t\t\t\t};\n\n\t\t\t\t\t\tsize_t total_num_results = 0;\n\n\t\t\t\t\t\tif (reader.size()) {\n\t\t\t\t\t\t\tif (reader.size() > 10 * 1024* 1024) {\n\t\t\t\t\t\t\t\tindexer::index<indexer::url_record> idx(\"url\", dom_hash, 1000);\n\t\t\t\t\t\t\t\tres = idx.find_top(total_num_results, tokens, len, score_mod);\n\t\t\t\t\t\t\t} else {\n\t\t\t\t\t\t\t\tconst size_t size = reader.size();\n\t\t\t\t\t\t\t\tstd::unique_ptr<char[]> buffer = std::make_unique<char[]>(size);\n\t\t\t\t\t\t\t\treader.seek(0);\n\t\t\t\t\t\t\t\treader.read(buffer.get(), size);\n\t\t\t\t\t\t\t\tstd::istringstream ram_reader(std::string(buffer.get(), size));\n\t\t\t\t\t\t\t\tindexer::index<indexer::url_record> idx(&ram_reader, 1000);\n\t\t\t\t\t\t\t\tres = idx.find_top(total_num_results, tokens, len, 
score_mod);\n\t\t\t\t\t\t\t}\n\t\t\t\t\t\t}\n\n\t\t\t\t\t\tstd::lock_guard lock(result_lock);\n\t\t\t\t\t\tall_total_num_results += total_num_results;\n\t\t\t\t\t\tresults[dom_hash] = res;\n\t\t\t\t\t});\n\t\t\t\t}\n\n\t\t\t\tpool.run_all();\n\n\t\t\t\t// Output result.\n\t\t\t\tbody.write((char *)&all_total_num_results, sizeof(size_t));\n\t\t\t\tfor (auto domain_hash : domain_hashes) {\n\t\t\t\t\tbody.write((char *)&domain_hash, sizeof(uint64_t));\n\t\t\t\t\tsize_t num_records = results[domain_hash].size();\n\t\t\t\t\tbody.write((char *)&num_records, sizeof(size_t));\n\n\t\t\t\t\tfor (const auto &record : results[domain_hash]) {\n\t\t\t\t\t\tbody.write((char *)&(record.m_value), sizeof(uint64_t));\n\t\t\t\t\t\tbody.write((char *)&(record.m_score), sizeof(float));\n\t\t\t\t\t}\n\t\t\t\t}\n\n\t\t\t\tres.content_type(\"application/octet-stream\");\n\t\t\t}\n\n\t\t\tres.code(200);\n\n\t\t\tconst string res_str = body.str();\n\t\t\tcout << \"outputting: \" << res_str.size() << endl;\n\t\t\tres.body(res_str);\n\n\t\t\treturn res;\n\t\t});\n\t}\n}\n"
  },
  {
    "path": "src/server/url_server.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\nnamespace server {\n\tvoid url_server();\n}\n"
  },
  {
    "path": "src/server.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include <random>\n\n#include <iostream>\n#include <signal.h>\n#include \"fcgio.h\"\n#include \"config.h\"\n#include \"logger/logger.h\"\n#include \"profiler/profiler.h\"\n#include \"indexer/console.h\"\n#include \"json.hpp\"\n#include \"server/search_server.h\"\n#include \"server/url_server.h\"\n\n#include <fstream>\n\nusing namespace std;\n\nint main(int argc, const char **argv) {\n\n\tstruct sigaction act{SIG_IGN};\n\tsigaction(SIGPIPE, &act, NULL);\n\n\tlogger::start_logger_thread();\n\n\tif (getenv(\"ALEXANDRIA_CONFIG\") != NULL) {\n\t\tconfig::read_config(getenv(\"ALEXANDRIA_CONFIG\"));\n\t} else {\n\t\tconfig::read_config(\"/etc/alexandria.conf\");\n\t}\n\n\tconst string arg(argc > 1 ? 
argv[1] : \"\");\n\n\tserver::search_server();\n\n\tlogger::join_logger_thread();\n\n\treturn 0;\n}\n\n"
  },
  {
    "path": "src/stats/stats.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include \"config.h\"\n#include \"text/text.h\"\n#include \"full_text/full_text_index.h\"\n#include \"full_text/full_text_shard.h\"\n\nnamespace stats {\n\n\tstd::hash<std::string> hasher;\n\n\ttemplate<typename data_record>\n\tstd::map<std::string, double> word_stats(const full_text::full_text_index<data_record> &index, const std::string &query, size_t index_size);\n\n\ttemplate<typename data_record>\n\tstd::map<std::string, double> get_word_counts(const std::vector<full_text::full_text_shard<data_record> *> &shards, const std::string &query) {\n\n\t\tstd::vector<std::string> words = text::get_full_text_words(query);\n\t\tif (words.size() == 0) return {};\n\n\t\tstd::map<std::string, double> result;\n\t\tstd::vector<std::string> 
searched_words;\n\t\tfor (const std::string &word : words) {\n\n\t\t\t// One word should only be searched once.\n\t\t\tif (find(searched_words.begin(), searched_words.end(), word) != searched_words.end()) continue;\n\t\t\tsearched_words.push_back(word);\n\n\t\t\tuint64_t word_hash = hasher(word);\n\t\t\tresult[word] = shards[word_hash % config::ft_num_shards]->total_num_results(word_hash);\n\t\t}\n\n\t\treturn result;\n\t}\n\n\ttemplate<typename data_record>\n\tstd::map<std::string, double> word_stats(const full_text::full_text_index<data_record> &index, const std::string &query, size_t index_size) {\n\n\t\tstd::map<std::string, double> complete_result = get_word_counts<data_record>(index.shards(), query);\n\n\t\tfor (const auto &iter : complete_result) {\n\t\t\tcomplete_result[iter.first] /= index_size;\n\t\t}\n\n\t\treturn complete_result;\n\t}\n\n}\n"
  },
  {
    "path": "src/text/stopwords.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include \"stopwords.h\"\n\nusing namespace std;\n\nbool stopwords::is_stop_word(const string &word) {\n\treturn (s_english.find(word) != s_english.end()) || (s_swedish.find(word) != s_swedish.end());\n}\n\nset<string> 
stopwords::s_english{\n\t\"the\",\n\t\"of\",\n\t\"and\",\n\t\"in\",\n\t\"to\",\n\t\"a\",\n\t\"is\",\n\t\"as\",\n\t\"for\",\n\t\"was\",\n\t\"by\",\n\t\"that\",\n\t\"with\",\n\t\"on\",\n\t\"from\",\n\t\"are\",\n\t\"an\",\n\t\"or\",\n\t\"it\",\n\t\"at\",\n\t\"his\",\n\t\"be\",\n\t\"which\",\n\t\"this\",\n\t\"he\",\n\t\"were\",\n\t\"not\",\n\t\"also\",\n\t\"has\",\n\t\"have\",\n\t\"its\",\n\t\"their\",\n\t\"but\",\n\t\"first\",\n\t\"had\",\n\t\"one\",\n\t\"other\",\n\t\"new\",\n\t\"they\",\n\t\"such\",\n\t\"been\",\n\t\"can\",\n\t\"after\",\n\t\"more\",\n\t\"who\",\n\t\"two\",\n\t\"all\",\n\t\"some\",\n\t\"most\",\n\t\"may\",\n\t\"into\",\n\t\"when\",\n\t\"between\",\n\t\"than\",\n\t\"there\",\n\t\"these\",\n\t\"during\",\n\t\"only\",\n\t\"many\",\n\t\"time\",\n\t\"would\",\n\t\"states\",\n\t\"no\",\n\t\"over\",\n\t\"about\",\n\t\"while\",\n\t\"use\",\n\t\"both\",\n\t\"if\",\n\t\"where\",\n\t\"then\",\n\t\"i\",\n\t\"through\",\n\t\"since\",\n\t\"being\",\n\t\"made\",\n\t\"became\",\n\t\"part\",\n\t\"her\",\n\t\"de\",\n\t\"three\",\n\t\"any\",\n\t\"up\",\n\t\"each\",\n\t\"them\",\n\t\"often\",\n\t\"will\",\n\t\"him\",\n\t\"so\",\n\t\"out\",\n\t\"same\",\n\t\"because\",\n\t\"well\",\n\t\"several\",\n\t\"form\",\n\t\"name\",\n\t\"could\",\n\t\"although\",\n\t\"set\",\n\t\"different\",\n\t\"1\",\n\t\"2\",\n\t\"3\",\n\t\"4\",\n\t\"5\",\n\t\"6\",\n\t\"7\",\n\t\"8\",\n\t\"9\",\n\t\"0\"\n};\n\nset<string> 
stopwords::s_swedish{\n\t\"och\",\n\t\"i\",\n\t\"av\",\n\t\"som\",\n\t\"en\",\n\t\"att\",\n\t\"till\",\n\t\"den\",\n\t\"med\",\n\t\"på\",\n\t\"är\",\n\t\"för\",\n\t\"det\",\n\t\"de\",\n\t\"ett\",\n\t\"var\",\n\t\"från\",\n\t\"har\",\n\t\"om\",\n\t\"vid\",\n\t\"inte\",\n\t\"även\",\n\t\"eller\",\n\t\"sig\",\n\t\"men\",\n\t\"efter\",\n\t\"man\",\n\t\"kan\",\n\t\"sin\",\n\t\"där\",\n\t\"andra\",\n\t\"hade\",\n\t\"blev\",\n\t\"då\",\n\t\"första\",\n\t\"finns\",\n\t\"mot\",\n\t\"sedan\",\n\t\"så\",\n\t\"genom\",\n\t\"över\",\n\t\"detta\",\n\t\"också\",\n\t\"bland\",\n\t\"mellan\",\n\t\"två\",\n\t\"när\",\n\t\"fick\",\n\t\"samt\",\n\t\"skulle\",\n\t\"annat\",\n\t\"dock\",\n\t\"denna\",\n\t\"inom\",\n\t\"olika\",\n\t\"vilket\",\n\t\"ut\",\n\t\"flera\",\n\t\"se\",\n\t\"vara\",\n\t\"upp\",\n\t\"ha\",\n\t\"senare\",\n\t\"många\",\n\t\"kom\",\n\t\"än\",\n\t\"dessa\",\n\t\"alla\",\n\t\"samma\",\n\t\"del\",\n\t\"stora\",\n\t\"sitt\",\n\t\"sina\",\n\t\"mycket\",\n\t\"tre\",\n\t\"mer\",\n\t\"utan\",\n\t\"nya\",\n\t\"ofta\",\n\t\"enligt\",\n\t\"blir\",\n\t\"några\",\n\t\"kunde\",\n\t\"hela\",\n\t\"gjorde\",\n\t\"varit\",\n\t\"här\",\n\t\"ska\",\n\t\"eftersom\",\n\t\"få\",\n\t\"fanns\",\n\t\"bara\",\n\t\"något\",\n\t\"kommer\",\n\t\"både\",\n\t\"kallas\",\n\t\"vissa\",\n\t\"får\",\n\t\"cirka\",\n\t\"ur\",\n\t\"endast\",\n\t\"tog\",\n\t\"dem\",\n\t\"medan\",\n\t\"redan\",\n\t\"fyra\",\n\t\"någon\",\n\t\"nu\",\n\t\"går\",\n\t\"innan\",\n\t\"bli\",\n\t\"allt\",\n\t\"därefter\",\n\t\"därför\",\n\t\"hur\",\n\t\"varje\",\n\t\"per\",\n\t\"åt\",\n\t\"antal\",\n\t\"delen\",\n\t\"vilken\",\n\t\"vad\",\n\t\"helt\",\n\t\"sätt\",\n\t\"vill\",\n\t\"åren\",\n\t\"gör\",\n\t\"kallade\",\n\t\"främst\",\n\t\"båda\",\n\t\"själv\",\n\t\"1\",\n\t\"2\",\n\t\"3\",\n\t\"4\",\n\t\"5\",\n\t\"6\",\n\t\"7\",\n\t\"8\",\n\t\"9\",\n\t\"0\"\n};\n"
  },
  {
    "path": "src/text/stopwords.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include <iostream>\n#include <set>\n\nclass stopwords {\n\npublic:\n\n\tstatic bool is_stop_word(const std::string &word);\n\nprivate:\n\n\tstatic std::set<std::string> s_english;\n\tstatic std::set<std::string> s_swedish;\n\n};\n"
  },
  {
    "path": "src/text/text.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include \"text.h\"\n\nnamespace text {\n\n\tbool is_clean_char(const char *ch, size_t multibyte_len) {\n\t\tif (multibyte_len == 1) {\n\t\t\treturn (ch[0] >= 'a' && ch[0] <= 'z') || (ch[0] >= '0' && ch[0] <= '9');\n\t\t} else if (multibyte_len == 2) {\n\t\t\treturn (strncmp(ch, \"å\", 2) == 0) || (strncmp(ch, \"ä\", 2) == 0) || (strncmp(ch, \"ö\", 2) == 0);\n\t\t}\n\t\treturn false;\n\t}\n\n\tbool is_clean_word(const std::string &s) {\n\t\tconst char *str = s.c_str();\n\t\tsize_t len = s.size();\n\t\tfor (size_t i = 0; i < len; ) {\n\t\t\tsize_t multibyte_len = 1;\n\t\t\tfor (size_t j = i + 1; IS_MULTIBYTE_CODEPOINT(str[j]) && (j < len); j++, multibyte_len++) {\n\t\t\t}\n\n\t\t\tif (!is_clean_char(&str[i], multibyte_len)) {\n\t\t\t\treturn 
false;\n\t\t\t}\n\n\t\t\ti += multibyte_len;\n\t\t}\n\n\t\treturn true;\n\t}\n\n\tstd::string clean_word(const std::string &s) {\n\t\tstd::string result;\n\t\tconst char *str = s.c_str();\n\t\tsize_t len = s.size();\n\t\tfor (size_t i = 0; i < len; ) {\n\t\t\tsize_t multibyte_len = 1;\n\t\t\tfor (size_t j = i + 1; IS_MULTIBYTE_CODEPOINT(str[j]) && (j < len); j++, multibyte_len++) {\n\t\t\t}\n\n\t\t\tif (is_clean_char(&str[i], multibyte_len)) {\n\t\t\t\tresult.append(&str[i], multibyte_len);\n\t\t\t}\n\n\t\t\ti += multibyte_len;\n\t\t}\n\n\t\treturn result;\n\t}\n\n\t/*\n\t\tReturns a vector of words lower case, punctuation trimmed and less or equal than CC_MAX_WORD_LEN length.\n\t*/\n\tstd::vector<std::string> get_words(const std::string &str, size_t limit) {\n\n\t\tconst std::string word_boundary = \" \\t,|!\";\n\n\t\tstd::string str_lc = lower_case(str);\n\n\t\tstd::vector<std::string> raw_words, words;\n\t\tboost::split(raw_words, str_lc, boost::is_any_of(word_boundary));\n\n\t\tfor (std::string &word : raw_words) {\n\t\t\ttrim_both_inplace(word);\n\t\t\tif (is_clean_word(word) && word.size() <= CC_MAX_WORD_LEN &&\n\t\t\t\t\tword.size() > 0) {\n\t\t\t\twords.push_back(word);\n\t\t\t}\n\t\t\tif (limit && words.size() == limit) break;\n\t\t}\n\n\t\treturn words;\n\t}\n\n\tstd::vector<std::string> get_words(const std::string &str) {\n\n\t\treturn get_words(str, 0);\n\t}\n\n\t/*\n\t\tReturns a vector of words lower case, punctuation trimmed and less or equal than CC_MAX_WORD_LEN length.\n\t*/\n\tstd::vector<std::string> get_full_text_words(const std::string &str, size_t limit) {\n\n\t\tconst std::string word_boundary = \" \\t,|!\";\n\n\t\tstd::string str_lc = lower_case(str);\n\n\t\tstd::vector<std::string> raw_words, words;\n\t\tboost::split(raw_words, str_lc, boost::is_any_of(word_boundary));\n\n\t\tfor (std::string &word : raw_words) {\n\t\t\tif (parser::unicode::is_valid(word)) {\n\t\t\t\ttrim_both_inplace(word);\n\t\t\t\tif (word.size() <= CC_MAX_WORD_LEN && 
word.size() > 0) {\n\t\t\t\t\twords.push_back(word);\n\t\t\t\t}\n\t\t\t\tif (limit && words.size() == limit) break;\n\t\t\t}\n\t\t\t\n\t\t}\n\n\t\treturn words;\n\t}\n\n\tstd::vector<std::string> get_full_text_words(const std::string &str) {\n\n\t\treturn get_full_text_words(str, 0);\n\t}\n\n\tstd::vector<uint64_t> get_full_text_tokens(const std::string &str, size_t limit) {\n\n\t\tconst auto words = get_full_text_words(str, limit);\n\t\tstd::vector<uint64_t> ret(words.size());\n\n\t\tstd::transform(words.cbegin(), words.cend(), ret.begin(), [](const std::string &word) {\n\t\t\treturn algorithm::hash(word);\n\t\t});\n\n\t\treturn ret;\n\n\t}\n\n\tstd::vector<uint64_t> get_full_text_tokens(const std::string &str) {\n\n\t\treturn get_full_text_tokens(str, 0);\n\n\t}\n\n\tstd::vector<uint64_t> get_unique_full_text_tokens(const std::string &str, size_t limit) {\n\n\t\tauto vec = get_full_text_tokens(str, 0);\n\t\tstd::set<uint64_t> s;\n\t\tconst unsigned size = vec.size();\n\t\tfor (unsigned i = 0; i < size; ++i) s.insert(vec[i]);\n\n\t\tvec.assign(s.begin(), s.end());\n\n\t\treturn vec;\n\t}\n\n\tstd::vector<uint64_t> get_unique_full_text_tokens(const std::string &str) {\n\n\t\treturn get_unique_full_text_tokens(str, 0);\n\n\t}\n\n\t/*\n\t\tThis should be the fast way of getting tokens out of a string. 
It should just read the whole string and\n\t\tstore tokens using the str2token hash function.\n\t*/\n\tstd::vector<uint64_t> get_tokens(const std::string &str, std::function<uint64_t(std::string)> str2token) {\n\t\tconst char *word_boundary = \" \\t,|!\";\n\t\tstd::string cur_token;\n\t\tstd::vector<uint64_t> tokens;\n\t\tfor (const char &ch : str) {\n\t\t\t// If is word boundary.\n\t\t\tif (strchr(word_boundary, ch)) {\n\t\t\t\tif (cur_token.size() && parser::unicode::is_valid(cur_token)) {\n\t\t\t\t\ttrim_punct_inplace(cur_token);\n\t\t\t\t\ttokens.push_back(str2token(cur_token));\n\t\t\t\t}\n\t\t\t\tcur_token.clear();\n\t\t\t} else {\n\t\t\t\t// This if statement trims the token.\n\t\t\t\tif (!isspace(ch)) {\n\t\t\t\t\tcur_token.insert(cur_token.end(), tolower(ch));\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\n\t\t// Remember the last token.\n\t\tif (cur_token.size() && parser::unicode::is_valid(cur_token)) {\n\t\t\ttrim_punct_inplace(cur_token);\n\t\t\ttokens.push_back(str2token(cur_token));\n\t\t}\n\n\t\treturn tokens;\n\t}\n\n\tstd::vector<uint64_t> get_tokens(const std::string &str) {\n\t\treturn get_tokens(str, algorithm::hash);\n\t}\n\n\tstd::vector<std::string> get_snippets(const std::string &str) {\n\t\tconst size_t snippet_len = 300;\n\t\tconst char *word_boundary = \" \\t,|!\";\n\t\tstd::string cur_snippet;\n\t\tstd::string cur_token;\n\t\tstd::vector<std::string> snippets;\n\t\tfor (const char &ch : str) {\n\t\t\t// If is word boundary.\n\t\t\tif (strchr(word_boundary, ch)) {\n\t\t\t\tif (cur_token.size() && parser::unicode::is_valid(cur_token)) {\n\t\t\t\t\tif (cur_snippet.size() + cur_token.size() <= snippet_len) {\n\t\t\t\t\t\tcur_snippet.insert(cur_snippet.end(), cur_token.begin(), cur_token.end());\n\t\t\t\t\t\tcur_snippet.insert(cur_snippet.end(), ' ');\n\t\t\t\t\t} else {\n\t\t\t\t\t\ttrim_inplace(cur_snippet);\n\t\t\t\t\t\tsnippets.push_back(cur_snippet);\n\t\t\t\t\t\tcur_snippet.clear();\n\t\t\t\t\t\tcur_snippet.insert(cur_snippet.end(), cur_token.begin(), 
cur_token.end());\n\t\t\t\t\t\tcur_snippet.insert(cur_snippet.end(), ' ');\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t\tcur_token.clear();\n\t\t\t} else {\n\t\t\t\t// This if statement trims the token.\n\t\t\t\tcur_token.insert(cur_token.end(), ch);\n\t\t\t}\n\t\t}\n\n\t\tif (cur_token.size() && parser::unicode::is_valid(cur_token)) {\n\t\t\tcur_snippet.insert(cur_snippet.end(), cur_token.begin(), cur_token.end());\n\t\t}\n\n\t\ttrim_inplace(cur_snippet);\n\t\tsnippets.push_back(cur_snippet);\n\n\t\treturn snippets;\n\t}\n\n\t/*\n\t\tReturns a vector of words lower case, punctuation trimmed and less or equal than CC_MAX_WORD_LEN length.\n\t\tThese functions also expand on blend chars.\n\t*/\n\tstd::vector<std::string> get_expanded_full_text_words(const std::string &str, size_t limit) {\n\n\t\tconst std::string word_boundary = \" \\t,|!\";\n\t\tconst std::string blend_chars = \".-:\";\n\n\t\tstd::string str_lc = lower_case(str);\n\n\t\tstd::vector<std::string> raw_words, words, blended;\n\t\tboost::split(raw_words, str_lc, boost::is_any_of(word_boundary));\n\n\t\tfor (std::string &word : raw_words) {\n\t\t\tif (parser::unicode::is_valid(word)) {\n\t\t\t\ttrim_both_inplace(word);\n\t\t\t\tif (word.size() <= CC_MAX_WORD_LEN && word.size() > 0) {\n\t\t\t\t\twords.push_back(word);\n\n\t\t\t\t\tif (limit && words.size() == limit) break;\n\n\t\t\t\t\tboost::split(blended, word, boost::is_any_of(blend_chars));\n\t\t\t\t\tif (blended.size() > 1) {\n\t\t\t\t\t\tfor (std::string &blended_word : blended) {\n\t\t\t\t\t\t\ttrim_both_inplace(blended_word);\n\t\t\t\t\t\t\twords.push_back(blended_word);\n\t\t\t\t\t\t\tif (limit && words.size() == limit) break;\n\t\t\t\t\t\t}\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t}\n\t\t\t\n\t\t}\n\n\t\treturn words;\n\t}\n\n\tstd::vector<std::string> get_expanded_full_text_words(const std::string &str) {\n\n\t\treturn get_expanded_full_text_words(str, 0);\n\t}\n\n\t/*\n\t * Exactly the same algorithm as above but returns tokens.\n\t * */\n\tstd::vector<uint64_t> 
get_expanded_full_text_tokens(const std::string &str, size_t limit) {\n\n\t\tconst auto words = get_expanded_full_text_words(str, limit);\n\t\tstd::vector<uint64_t> ret(words.size());\n\n\t\tstd::transform(words.cbegin(), words.cend(), ret.begin(), [](const std::string &word) {\n\t\t\treturn algorithm::hash(word);\n\t\t});\n\n\t\treturn ret;\n\t}\n\n\tstd::vector<uint64_t> get_expanded_full_text_tokens(const std::string &str) {\n\n\t\treturn get_expanded_full_text_tokens(str, 0);\n\n\t}\n\n\tstd::vector<uint64_t> get_unique_expanded_full_text_tokens(const std::string &str, size_t limit) {\n\n\t\tauto vec = get_expanded_full_text_tokens(str, 0);\n\t\tstd::set<uint64_t> s;\n\t\tconst unsigned size = vec.size();\n\t\tfor (unsigned i = 0; i < size; ++i) s.insert(vec[i]);\n\n\t\tvec.assign(s.begin(), s.end());\n\n\t\treturn vec;\n\t}\n\n\tstd::vector<uint64_t> get_unique_expanded_full_text_tokens(const std::string &str) {\n\n\t\treturn get_unique_expanded_full_text_tokens(str, 0);\n\n\t}\n\n\t/*\n\t\tReturns a vector of words lower case, punctuation trimmed and less or equal than CC_MAX_WORD_LEN length.\n\t*/\n\tstd::vector<std::string> get_words_without_stopwords(const std::string &str, size_t limit) {\n\n\t\tconst std::string word_boundary = \" \\t,|!,\";\n\n\t\tstd::string str_lc = lower_case(str);\n\n\t\tstd::vector<std::string> raw_words, words;\n\t\tboost::split(raw_words, str_lc, boost::is_any_of(word_boundary));\n\n\t\tfor (std::string &word : raw_words) {\n\t\t\ttrim_both_inplace(word);\n\t\t\tif (is_clean_word(word) && !stopwords::is_stop_word(word) && word.size() <= CC_MAX_WORD_LEN &&\n\t\t\t\t\tword.size() > 0) {\n\t\t\t\twords.push_back(word);\n\t\t\t}\n\t\t\tif (limit && words.size() == limit) break;\n\t\t}\n\n\t\treturn words;\n\t}\n\n\tstd::vector<std::string> get_words_without_stopwords(const std::string &str) {\n\n\t\treturn get_words_without_stopwords(str, 0);\n\t}\n\n\tvoid words_to_ngram_hash(const std::vector<std::string> &words, size_t n_grams, 
const std::function<void(uint64_t)> &ins) {\n\t\t\n\t\tconst size_t word_iter_max = words.size();\n\n\t\tfor (size_t i = 0; i < word_iter_max; i++) {\n\t\t\tfor (size_t j = 0; j < n_grams && (j + i) < word_iter_max; j++) {\n\t\t\t\tstd::string n_gram = words[i];\n\t\t\t\tfor (size_t k = i + 1; k <= i + j; k++) {\n\t\t\t\t\tn_gram += \" \" + words[k];\n\t\t\t\t}\n\t\t\t\tins(algorithm::hash(n_gram));\n\t\t\t}\n\t\t}\n\t}\n\n\tvoid words_to_ngram_hash(const std::vector<std::string> &words, size_t n_grams, const std::function<void(uint64_t, const std::string &)> &ins) {\n\t\t\n\t\tconst size_t word_iter_max = words.size();\n\n\t\tfor (size_t i = 0; i < word_iter_max; i++) {\n\t\t\tfor (size_t j = 0; j < n_grams && (j + i) < word_iter_max; j++) {\n\t\t\t\tstd::string n_gram = words[i];\n\t\t\t\tfor (size_t k = i + 1; k <= i + j; k++) {\n\t\t\t\t\tn_gram += \" \" + words[k];\n\t\t\t\t}\n\t\t\t\tins(algorithm::hash(n_gram), n_gram);\n\t\t\t}\n\t\t}\n\t}\n\n\tvoid words_to_ngram_hash(const std::vector<std::string> &words, size_t n_grams, const std::function<void(uint64_t, const std::string &, size_t)> &ins) {\n\t\t\n\t\tconst size_t word_iter_max = words.size();\n\n\t\tfor (size_t i = 0; i < word_iter_max; i++) {\n\t\t\tfor (size_t j = 0; j < n_grams && (j + i) < word_iter_max; j++) {\n\t\t\t\tstd::string n_gram = words[i];\n\t\t\t\tfor (size_t k = i + 1; k <= i + j; k++) {\n\t\t\t\t\tn_gram += \" \" + words[k];\n\t\t\t\t}\n\t\t\t\tins(algorithm::hash(n_gram), n_gram, j + 1);\n\t\t\t}\n\t\t}\n\t}\n\n\tstd::map<std::string, size_t> get_word_counts(const std::string &text) {\n\t\tstd::vector<std::string> words = get_full_text_words(text);\n\t\tstd::map<std::string, size_t> counts;\n\t\tfor (const std::string &word : words) {\n\t\t\tcounts[word]++;\n\t\t}\n\n\t\treturn counts;\n\t}\n\n\tstd::map<std::string, float> get_word_frequency(const std::string &text) {\n\t\tstd::vector<std::string> words = get_full_text_words(text);\n\t\tstd::map<std::string, size_t> 
counts;\n\t\tsize_t total = 0;\n\t\tfor (const std::string &word : words) {\n\t\t\tcounts[word]++;\n\t\t\ttotal++;\n\t\t}\n\n\t\tstd::map<std::string, float> freq;\n\t\tfor (const auto &iter : counts) {\n\t\t\tfreq[iter.first] = (float)iter.second / total;\n\t\t}\n\n\t\treturn freq;\n\t}\n\n}\n"
  },
  {
    "path": "src/text/text.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#define CC_MAX_WORD_LEN 100\n\n#include <vector>\n#include <map>\n#include <iostream>\n#include <algorithm>\n#include <boost/algorithm/string.hpp>\n#include <sstream>\n#include \"stopwords.h\"\n#include \"parser/unicode.h\"\n#include \"algorithm/hash.h\"\n\nnamespace text {\n\n\t/*\n\t * excludes + from punctuation trim since we want to be able to search for c++\n\t */\n\tinline bool my_ispunct(int ch) {\n\t\tif (ch == '+') return false;\n\t\tif (ch == '#') return false;\n\t\treturn ispunct(ch);\n\t}\n\n\t/*\n\t * trim whitespace from beginning (in place)\n\t * */\n\tinline void ltrim_inplace(std::string &s) {\n\t\ts.erase(s.begin(), find_if(s.begin(), s.end(), [](int ch) {\n\t\t\treturn !isspace(ch);\n\t\t}));\n\t}\n\n\t/*\n\t * trim 
whitespace from end (in place)\n\t * */\n\tinline void rtrim_inplace(std::string &s) {\n\t\ts.erase(find_if(s.rbegin(), s.rend(), [](int ch) {\n\t\t\treturn !isspace(ch);\n\t\t}).base(), s.end());\n\t}\n\n\t/*\n\t * trim whitespace from both beginning and end (in place)\n\t * */\n\tinline void trim_inplace(std::string &s) {\n\t\tltrim_inplace(s);\n\t\trtrim_inplace(s);\n\t}\n\n\t/*\n\t * trim whitespace from both beginning and end (return result)\n\t * */\n\tinline std::string trim(const std::string &s) {\n\t\tstd::string copy = s;\n\t\tltrim_inplace(copy);\n\t\trtrim_inplace(copy);\n\t\treturn copy;\n\t}\n\n\t/*\n\t * trim punctuation from beginning (in place)\n\t * */\n\tinline void ltrim_punct_inplace(std::string &s) {\n\t\ts.erase(s.begin(), find_if(s.begin(), s.end(), [](int ch) {\n\t\t\treturn !my_ispunct(ch);\n\t\t}));\n\t}\n\n\t/*\n\t * trim punctuation from end (in place)\n\t * */\n\tinline void rtrim_punct_inplace(std::string &s) {\n\t\ts.erase(find_if(s.rbegin(), s.rend(), [](int ch) {\n\t\t\treturn !my_ispunct(ch);\n\t\t}).base(), s.end());\n\t}\n\n\t/*\n\t * trim punctuation from both beginning and end (in place)\n\t * */\n\tinline void trim_punct_inplace(std::string &s) {\n\t\tltrim_punct_inplace(s);\n\t\trtrim_punct_inplace(s);\n\t}\n\n\t/*\n\t * trim punctuation from both beginning and end (return result)\n\t * */\n\tinline std::string trim_punct(const std::string &s) {\n\t\tstd::string copy = s;\n\t\tltrim_punct_inplace(copy);\n\t\trtrim_punct_inplace(copy);\n\t\treturn copy;\n\t}\n\n\t/*\n\t * trim both whitespace and punctuation from beginning (in place)\n\t * */\n\tinline void ltrim_both_inplace(std::string &s) {\n\t\ts.erase(s.begin(), find_if(s.begin(), s.end(), [](int ch) {\n\t\t\treturn !isspace(ch) && !my_ispunct(ch);\n\t\t}));\n\t}\n\n\t/*\n\t * trim both whitespace and punctuation from end (in place)\n\t * */\n\tinline void rtrim_both_inplace(std::string &s) {\n\t\ts.erase(find_if(s.rbegin(), s.rend(), [](int ch) {\n\t\t\treturn 
!isspace(ch) && !my_ispunct(ch);\n\t\t}).base(), s.end());\n\t}\n\n\t/*\n\t * trim both whitespace and punctuation from both beginning and end (in place)\n\t * */\n\tinline void trim_both_inplace(std::string &s) {\n\t\tltrim_both_inplace(s);\n\t\trtrim_both_inplace(s);\n\t}\n\n\t/*\n\t * trim both whitespace and punctuation from both beginning and end (return result)\n\t * */\n\tinline std::string trim_both(const std::string &s) {\n\t\tstd::string copy = s;\n\t\tltrim_both_inplace(copy);\n\t\trtrim_both_inplace(copy);\n\t\treturn copy;\n\t}\n\n\tinline std::string lower_case(const std::string &str) {\n\t\tstd::string ret = str;\n\t\ttransform(ret.begin(), ret.end(), ret.begin(), [](unsigned char c){ return tolower(c); });\n\t\treturn ret;\n\t}\n\n\tbool is_clean_char(const char *ch, size_t multibyte_len);\n\tbool is_clean_word(const std::string &s);\n\tstd::string clean_word(const std::string &s);\n\n\t/*\n\t\tReturns a vector of words lower case, punctuation trimmed and less or equal than CC_MAX_WORD_LEN length.\n\t*/\n\tstd::vector<std::string> get_words(const std::string &str, size_t limit);\n\tstd::vector<std::string> get_words(const std::string &str);\n\n\t/*\n\t\tReturns a vector of words lower case, punctuation trimmed and less or equal than CC_MAX_WORD_LEN length.\n\t*/\n\tstd::vector<std::string> get_full_text_words(const std::string &str, size_t limit);\n\tstd::vector<std::string> get_full_text_words(const std::string &str);\n\n\tstd::vector<uint64_t> get_full_text_tokens(const std::string &str, size_t limit);\n\tstd::vector<uint64_t> get_full_text_tokens(const std::string &str);\n\n\tstd::vector<uint64_t> get_unique_full_text_tokens(const std::string &str, size_t limit);\n\tstd::vector<uint64_t> get_unique_full_text_tokens(const std::string &str);\n\n\tstd::vector<uint64_t> get_tokens(const std::string &str, std::function<uint64_t(std::string)> str2token);\n\tstd::vector<uint64_t> get_tokens(const std::string &str);\n\n\tstd::vector<std::string> 
get_snippets(const std::string &str);\n\n\t/*\n\t\tReturns a vector of words lower case, punctuation trimmed and less or equal than CC_MAX_WORD_LEN length.\n\t\tThese functions also expand on blend chars.\n\t*/\n\tstd::vector<std::string> get_expanded_full_text_words(const std::string &str, size_t limit);\n\tstd::vector<std::string> get_expanded_full_text_words(const std::string &str);\n\n\tstd::vector<uint64_t> get_expanded_full_text_tokens(const std::string &str, size_t limit);\n\tstd::vector<uint64_t> get_expanded_full_text_tokens(const std::string &str);\n\n\tstd::vector<uint64_t> get_unique_expanded_full_text_tokens(const std::string &str, size_t limit);\n\tstd::vector<uint64_t> get_unique_expanded_full_text_tokens(const std::string &str);\n\n\t/*\n\t\tReturns a vector of words lower case, punctuation trimmed and less or equal than CC_MAX_WORD_LEN length.\n\t*/\n\tstd::vector<std::string> get_words_without_stopwords(const std::string &str, size_t limit);\n\tstd::vector<std::string> get_words_without_stopwords(const std::string &str);\n\n\tvoid words_to_ngram_hash(const std::vector<std::string> &words, size_t n_grams, const std::function<void(uint64_t)> &ins);\n\tvoid words_to_ngram_hash(const std::vector<std::string> &words, size_t n_grams, const std::function<void(uint64_t, const std::string &)> &ins);\n\tvoid words_to_ngram_hash(const std::vector<std::string> &words, size_t n_grams, const std::function<void(uint64_t, const std::string &, size_t)> &ins);\n\n\tstd::map<std::string, size_t> get_word_counts(const std::string &text);\n\tstd::map<std::string, float> get_word_frequency(const std::string &text);\n\n}\n"
  },
  {
    "path": "src/tools/calculate_harmonic.cpp",
    "content": "\n#include \"calculate_harmonic.h\"\n#include \"splitter.h\"\n\n#include \"config.h\"\n#include \"url_link/link.h\"\n#include \"URL.h\"\n#include \"common/ThreadPool.h\"\n#include \"algorithm/algorithm.h\"\n#include \"algorithm/hyper_ball.h\"\n#include <iostream>\n#include <vector>\n#include <mutex>\n#include <boost/iostreams/filtering_stream.hpp>\n#include <boost/iostreams/filter/gzip.hpp>\n#include <boost/filesystem.hpp>\n#include <boost/algorithm/string.hpp>\n#include <unordered_map>\n#include <unordered_set>\n#include <iomanip>\n\nnamespace tools {\n\n\tstd::unordered_map<uint64_t, std::string> run_uniq_host(const std::vector<std::string> files) {\n\n\t\tstd::unordered_map<uint64_t, std::string> hosts;\n\n\t\tfor (const std::string &warc_path : files) {\n\n\t\t\tstd::ifstream infile(warc_path);\n\t\t\tboost::iostreams::filtering_istream decompress_stream;\n\t\t\tdecompress_stream.push(boost::iostreams::gzip_decompressor());\n\t\t\tdecompress_stream.push(infile);\n\n\t\t\tstd::string line;\n\t\t\twhile (getline(decompress_stream, line)) {\n\t\t\t\tconst URL url(line.substr(0, line.find(\"\\t\")));\n\t\t\t\tuint64_t host_hash = url.host_hash();\n\t\t\t\tif (hosts.count(host_hash) == 0) {\n\t\t\t\t\thosts[host_hash] = url.host();\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\n\t\treturn hosts;\n\t}\n\n\tstruct pair_hash {\n\t\tinline size_t operator() (const std::pair<uint32_t, uint32_t> &p) const {\n\t\t\treturn (uint64_t)p.first << 32 | (uint64_t)p.second;\n\t\t}\n\t};\n\n\tstd::unordered_set<std::pair<uint32_t, uint32_t>, pair_hash> run_uniq_link(const std::vector<std::string> files, const std::unordered_map<uint64_t, uint32_t> &hosts) {\n\n\t\tstd::unordered_set<std::pair<uint32_t, uint32_t>, pair_hash> edges;\n\n\t\tfor (const std::string &warc_path : files) {\n\n\t\t\tstd::ifstream infile(warc_path);\n\t\t\tboost::iostreams::filtering_istream 
decompress_stream;\n\t\t\tdecompress_stream.push(boost::iostreams::gzip_decompressor());\n\t\t\tdecompress_stream.push(infile);\n\n\t\t\tstd::string line;\n\t\t\twhile (getline(decompress_stream, line)) {\n\t\t\t\tconst url_link::link link(line);\n\n\t\t\t\tconst uint64_t source_hash = link.source_url().host_hash();\n\t\t\t\tconst uint64_t target_hash = link.target_url().host_hash();\n\n\t\t\t\tconst size_t source_count = hosts.count(source_hash);\n\t\t\t\tconst size_t target_count = hosts.count(target_hash);\n\t\t\t\tif (source_count && target_count) {\n\t\t\t\t\t// Link between two hosts in the host map.\n\t\t\t\t\tedges.insert(std::make_pair(hosts.at(source_hash), hosts.at(target_hash)));\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\n\t\treturn edges;\n\t}\n\n\tvoid calculate_harmonic_hosts() {\n\n\t\tauto files = generate_list_with_target_url_files();\n\n\t\tstd::vector<std::vector<std::string>> chunks;\n\t\talgorithm::vector_chunk<std::string>(files, files.size() / s_num_threads, chunks);\n\n\t\tThreadPool pool(s_num_threads);\n\t\tstd::vector<std::future<std::unordered_map<uint64_t, std::string>>> results;\n\n\t\tfor (const std::vector<std::string> &chunk : chunks) {\n\n\t\t\tresults.emplace_back(pool.enqueue([chunk] {\n\t\t\t\treturn run_uniq_host(chunk);\n\t\t\t}));\n\t\t}\n\n\t\tstd::unordered_map<uint64_t, std::string> hosts;\n\t\tsize_t idx = 0;\n\t\tstd::cout.precision(2);\n\t\tfor (auto &result : results) {\n\t\t\tconst std::unordered_map<uint64_t, std::string> result_map = result.get();\n\t\t\tfor (const auto &iter : result_map) {\n\t\t\t\thosts[iter.first] = iter.second;\n\t\t\t}\n\t\t\tconst double percent = (100.0*(double)idx/results.size());\n\t\t\tstd::cout << \"hosts contains \" << hosts.size() << \" elements \" << percent << \"% done\" << std::endl;\n\t\t\tidx++;\n\t\t}\n\n\t\tidx = 0;\n\t\tstd::ofstream outfile(config::data_path() + \"/hosts.txt\", std::ios::trunc);\n\t\tfor (const auto &iter : hosts) {\n\t\t\toutfile << idx << '\\t' << iter.first << '\\t' << 
iter.second << '\\n';\n\t\t\tidx++;\n\t\t}\n\t\toutfile.close();\n\t}\n\n\tstd::unordered_map<uint64_t, uint32_t> read_hosts_file() {\n\n\t\t// Load the hosts\n\t\tstd::ifstream infile(config::data_path() + \"/hosts.txt\");\n\n\t\tstd::unordered_map<uint64_t, uint32_t> ret;\n\n\t\tstd::string line;\n\t\twhile (getline(infile, line)) {\n\t\t\tstd::vector<std::string> parts;\n\t\t\tboost::algorithm::split(parts, line, boost::is_any_of(\"\\t\"));\n\n\t\t\tuint32_t id = std::stoi(parts[0]);\n\t\t\tuint64_t hash = std::stoull(parts[1]);\n\t\t\tret[hash] = id;\n\t\t}\n\n\t\treturn ret;\n\t}\n\n\tstd::vector<uint32_t> read_hosts_file_vec() {\n\n\t\t// Load the hosts\n\t\tstd::ifstream infile(config::data_path() + \"/hosts.txt\");\n\n\t\tstd::vector<uint32_t> ret;\n\n\t\tstd::string line;\n\t\twhile (getline(infile, line)) {\n\t\t\tstd::vector<std::string> parts;\n\t\t\tboost::algorithm::split(parts, line, boost::is_any_of(\"\\t\"));\n\n\t\t\tuint32_t id = std::stoi(parts[0]);\n\t\t\tret.push_back(id);\n\t\t}\n\n\t\treturn ret;\n\t}\n\n\tstd::map<uint32_t, std::string> read_hosts_file_with_names() {\n\n\t\t// Load the hosts\n\t\tstd::ifstream infile(config::data_path() + \"/hosts.txt\");\n\n\t\tstd::map<uint32_t, std::string> ret;\n\n\t\tstd::string line;\n\t\twhile (getline(infile, line)) {\n\t\t\tstd::vector<std::string> parts;\n\t\t\tboost::algorithm::split(parts, line, boost::is_any_of(\"\\t\"));\n\n\t\t\tuint32_t id = std::stoi(parts[0]);\n\t\t\tret[id] = parts[2];\n\t\t}\n\n\t\treturn ret;\n\t}\n\n\tstd::unique_ptr<std::vector<uint32_t>[]> read_edge_file(size_t vlen) {\n\n\t\t// Load the hosts\n\t\tstd::ifstream infile(config::data_path() + \"/edges.txt\");\n\n\t\tauto edge_map = std::make_unique<std::vector<uint32_t>[]>(vlen);\n\n\t\tstd::string line;\n\t\twhile (getline(infile, line)) {\n\t\t\tstd::vector<std::string> parts;\n\t\t\tboost::algorithm::split(parts, line, boost::is_any_of(\"\\t\"));\n\n\t\t\tuint32_t from = std::stoi(parts[0]); // I think we are 
counting from 0 now but from 1 when we created the edge file.\n\t\t\tuint32_t to = std::stoi(parts[1]);\n\t\t\tedge_map[to].push_back(from);\n\t\t}\n\n\t\treturn edge_map;\n\t}\n\n\tvoid calculate_harmonic_links() {\n\n\t\tstd::unordered_map<uint64_t, uint32_t> hosts = read_hosts_file();\n\n\t\tstd::cout << \"loaded \" << hosts.size() << \" hosts\" << std::endl;\n\n\t\tauto files = generate_list_with_target_link_files();\n\n\t\tstd::vector<std::vector<std::string>> chunks;\n\t\talgorithm::vector_chunk<std::string>(files, files.size() / (s_num_threads * 500), chunks);\n\n\t\tThreadPool pool(s_num_threads);\n\t\tstd::vector<std::future<std::unordered_set<std::pair<uint32_t, uint32_t>, pair_hash>>> results;\n\n\t\tfor (const std::vector<std::string> &chunk : chunks) {\n\t\t\tresults.emplace_back(pool.enqueue([chunk, &hosts] {\n\t\t\t\treturn run_uniq_link(chunk, hosts);\n\t\t\t}));\n\t\t}\n\n\t\tstd::unordered_set<std::pair<uint32_t, uint32_t>, pair_hash> edges;\n\t\tsize_t idx = 0;\n\t\tstd::cout.precision(2);\n\t\tfor (auto &result : results) {\n\t\t\tconst std::unordered_set<std::pair<uint32_t, uint32_t>, pair_hash> result_set = result.get();\n\t\t\tsize_t idasd = 0;\n\t\t\tfor (const std::pair<uint32_t, uint32_t> &edge : result_set) {\n\t\t\t\tedges.insert(edge);\n\t\t\t\tidasd++;\n\t\t\t}\n\t\t\tconst double percent = (100.0*(double)idx/results.size());\n\t\t\tstd::cout << \"edges contains \" << edges.size() << \" elements \" << percent << \"% done\" << std::endl;\n\t\t\tidx++;\n\t\t}\n\n\t\tstd::ofstream outfile(config::data_path() + \"/edges.txt\", std::ios::trunc);\n\t\tfor (const std::pair<uint32_t, uint32_t>& edge : edges) {\n\t\t\toutfile << edge.first << '\\t' << edge.second << '\\n';\n\t\t}\n\t\toutfile.close();\n\t}\n\n\tvoid calculate_harmonic() {\n\n\t\tstd::vector<uint32_t> hosts = read_hosts_file_vec();\n\t\tauto edge_map = read_edge_file(hosts.size());\n\n\t\tconst size_t num_hosts = hosts.size();\n\n\t\tstd::cout << \"loaded \" << hosts.size() << 
\" hosts\" << std::endl;\n\n\t\tstd::cout << \"running harmonic centrality algorithm on \" << s_num_threads << \" threads\" << std::endl;\n\n\t\t//vector<double> harmonic = algorithm::harmonic_centrality_threaded(hosts.size(), edge_map, 3, num_threads);\n\n\t\tstd::vector<double> harmonic = algorithm::hyper_ball(hosts.size(), edge_map);\n\n\t\tedge_map.reset(nullptr);\n\n\t\tstd::map<uint32_t, std::string> host_names = read_hosts_file_with_names();\n\n\t\t// Save harmonic centrality.\n\t\tstd::ofstream outfile(config::data_path() + \"/harmonic.txt\", std::ios::trunc);\n\t\tfor (size_t i = 0; i < hosts.size(); i++) {\n\t\t\tconst double harmonic_float = harmonic[i] / num_hosts;\n\t\t\toutfile << std::setprecision(15) << host_names.at(hosts[i]) << '\\t' << harmonic_float << '\\n';\n\t\t}\n\n\t}\n\n}\n\n"
  },
  {
    "path": "src/tools/calculate_harmonic.h",
    "content": "\n#pragma once\n\nnamespace tools {\n\n\tvoid calculate_harmonic_hosts();\n\tvoid calculate_harmonic_links();\n\tvoid calculate_harmonic();\n\n}\n"
  },
  {
    "path": "src/tools/counter.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include \"counter.h\"\n\n#include <iostream>\n#include <future>\n#include <vector>\n#include <boost/iostreams/filtering_stream.hpp>\n#include <boost/iostreams/filter/gzip.hpp>\n#include <boost/filesystem.hpp>\n#include \"config.h\"\n#include \"URL.h\"\n#include \"url_link/link.h\"\n#include \"transfer/transfer.h\"\n#include \"algorithm/hyper_log_log.h\"\n#include \"algorithm/algorithm.h\"\n#include \"file/tsv_file_remote.h\"\n#include \"common/system.h\"\n\nnamespace tools {\n\n\tstd::map<std::string, size_t> count_urls_per_domain(const std::vector<std::string> &warc_paths) {\n\n\t\tconst std::set<std::string> domains = 
{\n\t\t\t\"theinstantpottable.com\",\n\t\t\t\"thehighlineboutique.com\",\n\t\t\t\"harveyspet.com\",\n\t\t\t\"finertech.com\",\n\t\t\t\"canadiantiresucks.net\",\n\t\t\t\"thecounter.org\",\n\t\t\t\"learningworksforkids.com\",\n\t\t\t\"doodlecraftblog.com\",\n\t\t\t\"heroes.thelazy.net\",\n\t\t\t\"stedmansonline.com\",\n\t\t\t\"restaurantbusinessonline.com\",\n\t\t\t\"gotohomerepair.com\",\n\t\t\t\"aboutbail.com\",\n\t\t\t\"spacefuture.com\",\n\t\t\t\"personaltelco.net\",\n\t\t\t\"helis.com\"\n\t\t};\n\t\tstd::vector<std::string> saved_rows;\n\n\t\tstd::map<std::string, size_t> counts;\n\n\t\tsize_t idx = 0;\n\t\tfor (const std::string &warc_path : warc_paths) {\n\t\t\tstd::ifstream infile(warc_path);\n\t\t\tboost::iostreams::filtering_istream decompress_stream;\n\t\t\tdecompress_stream.push(boost::iostreams::gzip_decompressor());\n\t\t\tdecompress_stream.push(infile);\n\n\t\t\tstd::string line;\n\t\t\twhile (getline(decompress_stream, line)) {\n\t\t\t\tconst URL url(line.substr(0, line.find(\"\\t\")));\n\t\t\t\tif (domains.find(url.host()) != domains.end()) {\n\t\t\t\t\tsaved_rows.push_back(line);\n\t\t\t\t}\n\t\t\t\tcounts[url.host()]++;\n\t\t\t}\n\n\t\t\tif (idx % 100 == 0) {\n\t\t\t\tstd::cout << warc_path << \" done \" << idx << \"/\" << warc_paths.size() << std::endl;\n\t\t\t} \n\n\t\t\tidx++;\n\t\t}\n\n\t\t// Save rows.\n\t\tif (saved_rows.size() > 0) {\n\t\t\tboost::filesystem::create_directories(config::data_path() + \"/crawl-data/ALEXANDRIA-TEST-SIZES/files/\");\n\t\t\tstd::ofstream outfile(config::data_path() + \"/crawl-data/ALEXANDRIA-TEST-SIZES/files/\" + common::uuid() + \".gz\");\n\t\t\tboost::iostreams::filtering_ostream compress_stream;\n\t\t\tcompress_stream.push(boost::iostreams::gzip_compressor());\n\t\t\tcompress_stream.push(outfile);\n\t\t\tfor (const std::string& row : saved_rows) {\n\t\t\t\tcompress_stream << row << \"\\n\";\n\t\t\t}\n\t\t}\n\n\t\treturn counts;\n\t}\n\n\tvoid run_counter_per_domain(const std::string &batch) {\n\n\t\tconst 
size_t num_threads = 12;\n\n\t\tstd::vector<std::string> files;\n\t\tstd::vector<std::string> link_files;\n\n\t\tconst std::string file_name = config::data_path() + \"/crawl-data/\" + batch + \"/warc.paths.gz\";\n\n\t\tstd::ifstream infile(file_name);\n\n\t\tboost::iostreams::filtering_istream decompress_stream;\n\t\tdecompress_stream.push(boost::iostreams::gzip_decompressor());\n\t\tdecompress_stream.push(infile);\n\n\t\tstd::string line;\n\t\twhile (getline(decompress_stream, line)) {\n\t\t\tstd::string warc_path = config::data_path() + \"/\" + line;\n\t\t\tconst size_t pos = warc_path.find(\".warc.gz\");\n\t\t\tif (pos != std::string::npos) {\n\t\t\t\twarc_path.replace(pos, 8, \".gz\");\n\t\t\t}\n\n\t\t\tfiles.push_back(warc_path);\n\t\t}\n\n\t\tstd::vector<std::vector<std::string>> thread_input;\n\t\talgorithm::vector_chunk(files, ceil((double)files.size() / num_threads), thread_input);\n\n\t\t/*\n\t\tRun url counters\n\t\t*/\n\t\tstd::vector<std::future<std::map<std::string, size_t>>> futures;\n\t\tfor (size_t i = 0; i < num_threads && i < thread_input.size(); i++) {\n\t\t\tfutures.emplace_back(std::async(std::launch::async, count_urls_per_domain, thread_input[i]));\n\t\t}\n\n\t\tstd::map<std::string, size_t> all_counts;\n\t\tfor (auto &future : futures) {\n\t\t\tstd::map<std::string, size_t> result = future.get();\n\t\t\tfor (const auto &iter : result) {\n\t\t\t\tall_counts[iter.first] += iter.second;\n\t\t\t}\n\t\t}\n\n\t\tfutures.clear();\n\n\t\tfor (const auto &iter : all_counts) {\n\t\t\tstd::cout << iter.first << \"\\t\" << iter.second << std::endl;\n\t\t}\n\t}\n\n\talgorithm::hyper_log_log *count_urls(const std::vector<std::string> &warc_paths) {\n\n\t\talgorithm::hyper_log_log *counter = new algorithm::hyper_log_log();\n\n\t\tsize_t idx = 0;\n\t\tfor (const std::string &warc_path : warc_paths) {\n\t\t\tstd::ifstream infile(warc_path);\n\t\t\tboost::iostreams::filtering_istream 
decompress_stream;\n\t\t\tdecompress_stream.push(boost::iostreams::gzip_decompressor());\n\t\t\tdecompress_stream.push(infile);\n\n\t\t\tstd::string line;\n\t\t\twhile (getline(decompress_stream, line)) {\n\t\t\t\tconst URL url(line.substr(0, line.find(\"\\t\")));\n\t\t\t\tcounter->insert(url.hash());\n\t\t\t}\n\n\t\t\tif (idx % 100 == 0) {\n\t\t\t\tstd::cout << warc_path << \" done \" << idx << \"/\" << warc_paths.size() << std::endl;\n\t\t\t} \n\n\t\t\tidx++;\n\t\t}\n\n\t\treturn counter;\n\t}\n\n\talgorithm::hyper_log_log *count_links(const std::vector<std::string> &warc_paths) {\n\n\t\talgorithm::hyper_log_log *counter = new algorithm::hyper_log_log();\n\n\t\tsize_t idx = 0;\n\t\tfor (const std::string &warc_path : warc_paths) {\n\t\t\tstd::ifstream infile(warc_path);\n\t\t\tboost::iostreams::filtering_istream decompress_stream;\n\t\t\tdecompress_stream.push(boost::iostreams::gzip_decompressor());\n\t\t\tdecompress_stream.push(infile);\n\n\t\t\tstd::string line;\n\t\t\twhile (getline(decompress_stream, line)) {\n\t\t\t\tconst url_link::link link(line);\n\t\t\t\tcounter->insert(link.target_url().hash());\n\t\t\t}\n\n\t\t\tif (idx % 100 == 0) {\n\t\t\t\tstd::cout << warc_path << \" done \" << idx << \"/\" << warc_paths.size() << std::endl;\n\t\t\t} \n\n\t\t\tidx++;\n\t\t}\n\n\t\treturn counter;\n\t}\n\n\tvoid run_counter() {\n\n\t\tconst size_t num_threads = 12;\n\n\t\tstd::vector<std::string> files;\n\t\tstd::vector<std::string> link_files;\n\n\t\tfor (const std::string &batch : config::batches) {\n\n\t\t\tconst std::string file_name = config::data_path() + \"/crawl-data/\" + batch + \"/warc.paths.gz\";\n\n\t\t\tstd::ifstream infile(file_name);\n\n\t\t\tboost::iostreams::filtering_istream decompress_stream;\n\t\t\tdecompress_stream.push(boost::iostreams::gzip_decompressor());\n\t\t\tdecompress_stream.push(infile);\n\n\t\t\tstd::string line;\n\t\t\twhile (getline(decompress_stream, line)) {\n\t\t\t\tstd::string warc_path = config::data_path() + \"/\" + 
line;\n\t\t\t\tconst size_t pos = warc_path.find(\".warc.gz\");\n\t\t\t\tif (pos != std::string::npos) {\n\t\t\t\t\twarc_path.replace(pos, 8, \".gz\");\n\t\t\t\t}\n\n\t\t\t\tfiles.push_back(warc_path);\n\t\t\t}\n\t\t}\n\n\t\tfor (const std::string &batch : config::link_batches) {\n\n\t\t\tconst std::string file_name = config::data_path() + \"/crawl-data/\" + batch + \"/warc.paths.gz\";\n\n\t\t\tstd::ifstream infile(file_name);\n\n\t\t\tboost::iostreams::filtering_istream decompress_stream;\n\t\t\tdecompress_stream.push(boost::iostreams::gzip_decompressor());\n\t\t\tdecompress_stream.push(infile);\n\n\t\t\tstd::string line;\n\t\t\twhile (getline(decompress_stream, line)) {\n\t\t\t\tstd::string warc_path = config::data_path() + \"/\" + line;\n\t\t\t\tconst size_t pos = warc_path.find(\".warc.gz\");\n\n\t\t\t\tif (pos != std::string::npos) {\n\t\t\t\t\twarc_path.replace(pos, 8, \".links.gz\");\n\t\t\t\t}\n\n\t\t\t\tlink_files.push_back(warc_path);\n\t\t\t}\n\t\t}\n\n\t\tstd::vector<std::vector<std::string>> thread_input;\n\t\talgorithm::vector_chunk(files, ceil((double)files.size() / num_threads), thread_input);\n\n\t\tstd::vector<std::vector<std::string>> link_thread_input;\n\t\talgorithm::vector_chunk(link_files, ceil((double)link_files.size() / num_threads), link_thread_input);\n\n\t\tstd::mutex write_file_mutex;\n\n\t\t/*\n\t\tRun url counters\n\t\t*/\n\t\tstd::vector<std::future<algorithm::hyper_log_log *>> futures;\n\t\tfor (size_t i = 0; i < num_threads && i < thread_input.size(); i++) {\n\t\t\tfutures.emplace_back(std::async(std::launch::async, count_urls, thread_input[i]));\n\t\t}\n\n\t\talgorithm::hyper_log_log url_counter;\n\t\tfor (auto &future : futures) {\n\t\t\talgorithm::hyper_log_log *result = future.get();\n\t\t\turl_counter += *(result);\n\t\t\tdelete result;\n\t\t}\n\n\t\tfutures.clear();\n\n\t\t/*\n\t\tRun link counters\n\t\t*/\n\t\tfor (size_t i = 0; i < num_threads && i < link_thread_input.size(); i++) 
{\n\t\t\tfutures.emplace_back(std::async(std::launch::async, count_links, link_thread_input[i]));\n\t\t}\n\n\t\talgorithm::hyper_log_log link_counter;\n\t\tfor (auto &future : futures) {\n\t\t\talgorithm::hyper_log_log *result = future.get();\n\t\t\tlink_counter += *(result);\n\t\t\tdelete result;\n\t\t}\n\n\t\tstd::cout << \"Uniq urls: \" << url_counter.count() << std::endl;\n\t\tstd::cout << \"Uniq links: \" << link_counter.count() << std::endl;\n\t}\n\n\tstd::vector<std::string> download_link_batch(const std::string &batch, size_t limit, size_t offset) {\n\t\t\n\t\tfile::tsv_file_remote warc_paths_file(std::string(\"crawl-data/\") + batch + \"/warc.paths.gz\");\n\t\tstd::vector<std::string> warc_paths;\n\t\twarc_paths_file.read_column_into(0, warc_paths);\n\n\t\tstd::vector<std::string> files_to_download;\n\t\tfor (size_t i = offset; i < warc_paths.size() && i < (offset + limit); i++) {\n\t\t\tstd::string warc_path = warc_paths[i];\n\t\t\tconst size_t pos = warc_path.find(\".warc.gz\");\n\t\t\tif (pos != std::string::npos) {\n\t\t\t\twarc_path.replace(pos, 8, \".links.gz\");\n\t\t\t}\n\t\t\tfiles_to_download.push_back(warc_path);\n\t\t}\n\n\t\treturn transfer::download_gz_files_to_disk(files_to_download);\n\t}\n\n}\n\n"
  },
  {
    "path": "src/tools/counter.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include <iostream>\n\nnamespace tools {\n\n\tvoid run_counter_per_domain(const std::string &batch);\n\tvoid run_counter();\n\tvoid count_all_links();\n\n}\n\n"
  },
  {
    "path": "src/tools/find_links.cpp",
    "content": "\n#include \"find_links.h\"\n#include \"file/gz_tsv_file.h\"\n#include \"URL.h\"\n#include \"algorithm/algorithm.h\"\n#include <boost/algorithm/string.hpp>\n#include <iostream>\n#include <vector>\n#include <set>\n#include <boost/iostreams/filtering_stream.hpp>\n#include <boost/iostreams/filter/gzip.hpp>\n#include <boost/algorithm/string.hpp>\n#include <math.h>\n#include \"utils/thread_pool.hpp\"\n#include \"algorithm/hash.h\"\n#include \"common/system.h\"\n#include \"config.h\"\n\nnamespace tools {\n\n\tvoid find_links_for_hosts_chunk(const std::set<size_t> &host_hashes, const std::vector<std::string> &files) {\n\n\t\tsize_t links_written = 0;\n\t\tconst size_t links_per_file = 1000000;\n\n\t\tstd::ofstream outfile;\n\n\t\toutfile.open(config::data_path() + \"/crawl-data/SMALL-LINK-MIX/files/\" + common::uuid() + \"_\" + std::to_string(links_written) + \"-\" +\n\t\t\tstd::to_string(links_written + links_per_file) + \".gz\", std::ios::binary);\n\n\t\tboost::iostreams::filtering_ostream compress_stream;\n\t\tcompress_stream.push(boost::iostreams::gzip_compressor());\n\t\tcompress_stream.push(outfile);\n\n\t\tfor (auto file : files) {\n\t\t\tstd::ifstream infile(config::data_path() + \"/\" + file);\n\n\t\t\tboost::iostreams::filtering_istream decompress_stream;\n\t\t\tdecompress_stream.push(boost::iostreams::gzip_decompressor());\n\t\t\tdecompress_stream.push(infile);\n\n\t\t\tstd::string line;\n\t\t\twhile (getline(decompress_stream, line)) {\n\t\t\t\tstd::vector<std::string> col_values;\n\t\t\t\tboost::algorithm::split(col_values, line, boost::is_any_of(\"\\t\"));\n\n\t\t\t\tconst size_t host_hash = algorithm::hash(col_values[2]);\n\n\t\t\t\tif (host_hashes.find(host_hash) != host_hashes.end()) {\n\n\t\t\t\t\t// Write link to current file.\n\n\t\t\t\t\tcompress_stream << line << \"\\n\";\n\t\t\t\t\tlinks_written++;\n\t\t\t\t\tif ((links_written % links_per_file) == 0) {\n\t\t\t\t\t\tstd::cout << \"writing file\" << 
std::endl;\n\t\t\t\t\t\tcompress_stream.strict_sync();\n\t\t\t\t\t\tcompress_stream.pop();\n\t\t\t\t\t\toutfile.close();\n\t\t\t\t\t\toutfile.open(config::data_path() + \"/crawl-data/SMALL-LINK-MIX/files/\" + common::uuid() +\n\t\t\t\t\t\t\t\"_\" + std::to_string(links_written) + \"-\" + std::to_string(links_written + links_per_file) + \".gz\",\n\t\t\t\t\t\t\tstd::ios::binary);\n\t\t\t\t\t\tcompress_stream.push(outfile);\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t}\n\n\tvoid find_links_for_hosts(const std::set<size_t> &host_hashes) {\n\t\tconst std::string batch = \"LINK-MIX\";\n\t\tconst size_t num_threads = 12;\n\t\tsize_t limit = 4000;\n\n\t\tfile::gz_tsv_file batch_file(config::data_path() + \"/crawl-data/\" + batch + \"/warc.paths.gz\");\n\n\t\tstd::vector<std::string> rows;\n\t\tbatch_file.read_column_into(0, rows);\n\n\t\tif (rows.size() > limit) rows.resize(limit);\n\n\t\tstd::vector<std::vector<std::string>> chunks;\n\t\talgorithm::vector_chunk<std::string>(rows, ceil(rows.size() / num_threads) + 1, chunks);\n\n\t\tutils::thread_pool threads(num_threads);\n\n\t\tfor (auto chunk : chunks) {\n\t\t\tthreads.enqueue([&host_hashes, chunk]() {\n\t\t\t\tfind_links_for_hosts_chunk(host_hashes, chunk);\n\t\t\t});\n\t\t}\n\n\t\tthreads.run_all();\n\t}\n\n\tvoid find_links() {\n\t\tconst auto batch = \"SMALL-MIX\";\n\t\tsize_t limit = 20;\n\n\t\tfile::gz_tsv_file batch_file(config::data_path() + \"/crawl-data/\"+batch+\"/warc.paths.gz\");\n\n\t\tstd::vector<std::string> rows;\n\t\tbatch_file.read_column_into(0, rows);\n\n\t\tif (rows.size() > limit) rows.resize(limit);\n\n\t\t// Load all the host hashes into a set\n\t\tstd::set<size_t> host_hashes;\n\n\t\tfor (auto row : rows) {\n\t\t\tstd::ifstream infile(config::data_path() + \"/\" + row);\n\n\t\t\tboost::iostreams::filtering_istream decompress_stream;\n\t\t\tdecompress_stream.push(boost::iostreams::gzip_decompressor());\n\t\t\tdecompress_stream.push(infile);\n\n\t\t\tstd::string line;\n\t\t\twhile 
(getline(decompress_stream, line)) {\n\t\t\t\tstd::vector<std::string> col_values;\n\t\t\t\tboost::algorithm::split(col_values, line, boost::is_any_of(\"\\t\"));\n\n\t\t\t\tURL url(col_values[0]);\n\n\t\t\t\thost_hashes.insert(url.host_hash());\n\t\t\t}\n\t\t}\n\n\t\tstd::cout << \"found \" << host_hashes.size() << \" hosts\" << std::endl;\n\n\t\tfind_links_for_hosts(host_hashes);\n\t}\n\n\n}\n"
  },
  {
    "path": "src/tools/find_links.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include <iostream>\n\nnamespace tools {\n\n\tvoid find_links();\n\n}\n\n"
  },
  {
    "path": "src/tools/generate_url_lists.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include <vector>\n#include <iostream>\n#include <fstream>\n#include \"generate_url_lists.h\"\n\n#include <boost/filesystem.hpp>\n#include <boost/iostreams/filtering_stream.hpp>\n#include <boost/iostreams/filter/gzip.hpp>\n#include <boost/algorithm/string.hpp>\n\nusing namespace std;\nusing namespace boost::filesystem;\n\nnamespace tools {\n\n\tvector<string> read_urls_with_many_links(const std::string &file_path) {\n\n\t\tstd::ifstream infile(file_path);\n\t\tif (!infile.is_open()) return {};\n\n\t\tvector<string> ret_urls;\n\n\t\tboost::iostreams::filtering_istream decompress_stream;\n\t\tdecompress_stream.push(boost::iostreams::gzip_decompressor());\n\t\tdecompress_stream.push(infile);\n\n\t\tstring line;\n\t\twhile (getline(decompress_stream, 
line)) {\n\t\t\tvector<string> cols;\n\t\t\tboost::algorithm::split(cols, line, boost::is_any_of(\"\\t\"));\n\t\t\tif (stoull(cols[1]) > 1) {\n\t\t\t\tret_urls.push_back(cols[0]);\n\t\t\t}\n\t\t}\n\n\t\treturn ret_urls;\n\t}\n\n\tvector<string> read_urls(const std::string &path) {\n\t\t// Only read the first 10 files.\n\t\tvector<string> urls;\n\t\tfor (size_t i = 1; i <= 10; i++) {\n\t\t\tstring file_path = path + \"/top_\" + to_string(i) + \".gz\";\n\t\t\tif (is_regular_file(file_path)) {\n\t\t\t\tvector<string> new_urls = read_urls_with_many_links(file_path);\n\t\t\t\tif (new_urls.size() == 0) break;\n\t\t\t\turls.insert(urls.end(), new_urls.begin(), new_urls.end());\n\t\t\t}\n\t\t}\n\n\t\treturn urls;\n\t}\n\n\tvoid generate_url_lists(const std::string &batch_path) {\n\t\tpath pth(batch_path);\n\t\tdirectory_iterator end_iter;\n\n\t\tvector<string> urls;\n\n\t\tfor (directory_iterator iter(pth); iter != end_iter; iter++) {\n\t\t\tif (is_directory(iter->path())) {\n\t\t\t\tstring current_file = iter->path().string();\n\t\t\t\tvector<string> new_urls = read_urls(current_file);\n\t\t\t\turls.insert(urls.end(), new_urls.begin(), new_urls.end());\n\t\t\t}\n\t\t}\n\n\t\tfor (const string &url : urls) {\n\t\t\tcout << url << endl;\n\t\t}\n\n\t}\n\n}\n"
  },
  {
    "path": "src/tools/generate_url_lists.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include \"config.h\"\n\nnamespace tools {\n\n\tvoid generate_url_lists(const std::string &batch_path);\n\n}\n"
  },
  {
    "path": "src/tools/splitter.cpp",
    "content": "\n#include \"splitter.h\"\n#include \"config.h\"\n#include \"roaring/roaring64map.hh\"\n#include \"algorithm/bloom_filter.h\"\n#include <iostream>\n#include <vector>\n#include <unordered_set>\n#include <fstream>\n#include <cmath>\n#include <thread>\n#include <future>\n#include <boost/iostreams/filtering_stream.hpp>\n#include <boost/iostreams/filter/gzip.hpp>\n#include <boost/filesystem.hpp>\n#include \"url_link/link.h\"\n#include \"algorithm/algorithm.h\"\n#include \"URL.h\"\n#include \"common/system.h\"\n\nnamespace tools {\n\n\tstd::vector<std::string> target_url_batches() {\n\t\tstd::vector<std::string> batches;\n\t\tfor (size_t node_id = 0; node_id < config::nodes_in_cluster; node_id++) {\n\t\t\tbatches.push_back(\"NODE-\" + std::to_string(node_id) + s_suffix);\n\t\t}\n\n\t\treturn batches;\n\t}\n\n\tstd::vector<std::string> target_link_batches() {\n\t\tstd::vector<std::string> batches;\n\t\tfor (size_t node_id = 0; node_id < config::nodes_in_cluster; node_id++) {\n\t\t\tbatches.push_back(\"LINK-\" + std::to_string(node_id) + s_suffix);\n\t\t}\n\n\t\treturn batches;\n\t}\n\n\tstd::vector<std::string> generate_list_with_files(const std::vector<std::string> &batches, const std::string &suffix = \".gz\", const std::string &warc_paths_suffix = \".gz\") {\n\n\t\tstd::vector<std::string> file_names;\n\t\tfor (const auto &batch : batches) {\n\n\t\t\tconst std::string file_name = config::data_path() + \"/crawl-data/\" + batch + \"/warc.paths\" + warc_paths_suffix;\n\n\t\t\tstd::ifstream infile(file_name);\n\n\t\t\tif (warc_paths_suffix == \".gz\") {\n\n\t\t\t\tboost::iostreams::filtering_istream decompress_stream;\n\t\t\t\tdecompress_stream.push(boost::iostreams::gzip_decompressor());\n\t\t\t\tdecompress_stream.push(infile);\n\n\t\t\t\tstd::string line;\n\t\t\t\twhile (getline(decompress_stream, line)) {\n\t\t\t\t\tstd::string warc_path = config::data_path() + \"/\" + line;\n\t\t\t\t\tconst size_t pos = warc_path.find(\".warc.gz\");\n\n\t\t\t\t\tif (pos 
!= std::string::npos) {\n\t\t\t\t\t\twarc_path.replace(pos, 8, suffix);\n\t\t\t\t\t}\n\n\t\t\t\t\tfile_names.push_back(warc_path);\n\t\t\t\t}\n\t\t\t} else {\n\t\t\t\tstd::string line;\n\t\t\t\twhile (getline(infile, line)) {\n\t\t\t\t\tstd::string warc_path = config::data_path() + \"/\" + line;\n\t\t\t\t\tconst size_t pos = warc_path.find(\".warc.gz\");\n\n\t\t\t\t\tif (pos != std::string::npos) {\n\t\t\t\t\t\twarc_path.replace(pos, 8, suffix);\n\t\t\t\t\t}\n\n\t\t\t\t\tfile_names.push_back(warc_path);\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\n\t\treturn file_names;\n\t}\n\n\tstd::vector<std::string> generate_list_with_url_files() {\n\n\t\t// create a list with .gz files that contains urls\n\t\treturn generate_list_with_files(config::batches, \".gz\");\n\n\t}\n\n\tstd::vector<std::string> generate_list_with_link_files() {\n\n\t\t// create a list with .gz files that contains links\n\t\treturn generate_list_with_files(config::link_batches, \".links.gz\");\n\n\t}\n\n\tstd::vector<std::string> generate_list_with_direct_link_files() {\n\n\t\t// create a list with .gz files that contains links\n\t\treturn generate_list_with_files(config::link_batches, \".direct.links.gz\");\n\n\t}\n\n\tstd::vector<std::string> generate_list_with_target_url_files() {\n\n\t\t// create a list with .gz files that contains urls\n\t\treturn generate_list_with_files(target_url_batches(), \"\", \"\");\n\n\t}\n\n\tstd::vector<std::string> generate_list_with_target_link_files() {\n\n\t\t// create a list with .gz files that contains links\n\t\treturn generate_list_with_files(target_link_batches(), \"\", \"\");\n\n\t}\n\n\t// File structure is [data_path]/crawl-data/NODE-[node_id]/files/uuid-file_index.gz\n\tstd::string write_cache(size_t file_index, std::vector<std::string> &lines, size_t node_id) {\n\n\t\tauto uuid = common::uuid();\n\n\t\tconst std::string filename = \"crawl-data/NODE-\" + std::to_string(node_id) + s_suffix + \"/files/\" + uuid + \"-\" + std::to_string(file_index) + 
\".gz\";\n\t\tstd::ofstream outfile(config::data_path() + \"/\" + filename, std::ios::trunc | std::ios::binary);\n\n\t\tboost::iostreams::filtering_ostream compress_stream;\n\t\tcompress_stream.push(boost::iostreams::gzip_compressor());\n\t\tcompress_stream.push(outfile);\n\n\t\tfor (const std::string &line : lines) {\n\t\t\tcompress_stream << line << \"\\n\";\n\t\t}\n\t\tlines.clear();\n\t\treturn filename;\n\t}\n\n\t// File structure is [DATA_PATH]/crawl-data/NODE-[node_id]/files/uuid-file_index.gz\n\tstd::string write_link_cache(size_t file_index, std::vector<std::string> &lines, size_t node_id) {\n\n\t\tauto uuid = common::uuid();\n\n\t\tconst std::string filename = \"crawl-data/LINK-\" + std::to_string(node_id) + s_suffix + \"/files/\" + uuid + \"-\" + std::to_string(file_index) + \".gz\";\n\t\tstd::ofstream outfile(config::data_path() + \"/\" + filename, std::ios::trunc | std::ios::binary);\n\n\t\tboost::iostreams::filtering_ostream compress_stream;\n\t\tcompress_stream.push(boost::iostreams::gzip_compressor());\n\t\tcompress_stream.push(outfile);\n\n\t\tfor (const std::string &line : lines) {\n\t\t\tcompress_stream << line << \"\\n\";\n\t\t}\n\t\tlines.clear();\n\t\treturn filename;\n\t}\n\n\tvoid splitter(const std::vector<std::string> &warc_paths, std::mutex &write_file_mutex) {\n\n\t\tconst size_t max_cache_size = 10000;\n\t\tsize_t file_index = 1;\n\n\t\tusing vec2d_str = std::vector<std::vector<std::string>>;\n\n\t\tvec2d_str file_names(config::nodes_in_cluster);\n\t\tvec2d_str cache(config::nodes_in_cluster);\n\t\tfor (const std::string &warc_path : warc_paths) {\n\t\t\tstd::ifstream infile(warc_path);\n\t\t\tboost::iostreams::filtering_istream decompress_stream;\n\t\t\tdecompress_stream.push(boost::iostreams::gzip_decompressor());\n\t\t\tdecompress_stream.push(infile);\n\n\t\t\tstd::string line;\n\t\t\twhile (getline(decompress_stream, line)) {\n\t\t\t\tconst URL url(line.substr(0, line.find(\"\\t\")));\n\t\t\t\tconst size_t node_id = 
url.index_on_node();\n\t\t\t\tcache[node_id].push_back(line);\n\t\t\t}\n\n\t\t\tfor (size_t node_id = 0; node_id < config::nodes_in_cluster; node_id++) {\n\t\t\t\tif (cache[node_id].size() > max_cache_size) {\n\t\t\t\t\tfile_names[node_id].push_back(write_cache(file_index++, cache[node_id], node_id));\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\tfor (size_t node_id = 0; node_id < config::nodes_in_cluster; node_id++) {\n\t\t\tfile_names[node_id].push_back(write_cache(file_index++, cache[node_id], node_id));\n\t\t}\n\n\t\twrite_file_mutex.lock();\n\t\tfor (size_t node_id = 0; node_id < config::nodes_in_cluster; node_id++) {\n\t\t\tconst std::string filename = config::data_path() + \"/crawl-data/NODE-\" + std::to_string(node_id) + s_suffix + \"/warc.paths\";\n\t\t\tstd::ofstream outfile(filename, std::ios::app);\n\t\t\tfor (const std::string &file : file_names[node_id]) {\n\t\t\t\toutfile << file << \"\\n\";\n\t\t\t}\n\t\t}\n\t\twrite_file_mutex.unlock();\n\t}\n\n\tvoid link_splitter(const std::vector<std::string> &warc_paths, std::mutex &write_file_mutex) {\n\n\t\tconst size_t max_cache_size = 1000000;\n\t\tsize_t file_index = 1;\n\t\t\n\t\tusing vec2d_str = std::vector<std::vector<std::string>>;\n\n\t\tvec2d_str file_names(config::nodes_in_cluster);\n\t\tvec2d_str cache(config::nodes_in_cluster);\n\t\tsize_t done = 0;\n\t\tfor (const std::string &warc_path : warc_paths) {\n\n\t\t\tstd::cout << \"done \" << done << \"/\" << warc_paths.size() << std::endl;\n\t\t\tdone++;\n\n\t\t\tstd::ifstream infile(warc_path);\n\t\t\tboost::iostreams::filtering_istream decompress_stream;\n\t\t\tdecompress_stream.push(boost::iostreams::gzip_decompressor());\n\t\t\tdecompress_stream.push(infile);\n\n\t\t\tstd::string line;\n\t\t\twhile (getline(decompress_stream, line)) {\n\t\t\t\tconst url_link::link link(line);\n\t\t\t\tconst size_t node_id = link.index_on_node();\n\t\t\t\tcache[node_id].push_back(line);\n\t\t\t}\n\n\t\t\tfor (size_t node_id = 0; node_id < config::nodes_in_cluster; node_id++) 
{\n\t\t\t\tif (cache[node_id].size() > max_cache_size) {\n\t\t\t\t\tfile_names[node_id].push_back(write_link_cache(file_index++, cache[node_id], node_id));\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\tfor (size_t node_id = 0; node_id < config::nodes_in_cluster; node_id++) {\n\t\t\tfile_names[node_id].push_back(write_link_cache(file_index++, cache[node_id], node_id));\n\t\t}\n\n\t\twrite_file_mutex.lock();\n\t\tfor (size_t node_id = 0; node_id < config::nodes_in_cluster; node_id++) {\n\t\t\tconst auto filename = config::data_path() + \"/crawl-data/LINK-\" + std::to_string(node_id) + s_suffix + \"/warc.paths\";\n\t\t\tstd::ofstream outfile(filename, std::ios::app);\n\t\t\tfor (const std::string &file : file_names[node_id]) {\n\t\t\t\toutfile << file << \"\\n\";\n\t\t\t}\n\t\t}\n\t\twrite_file_mutex.unlock();\n\t}\n\n\tvoid link_splitter_with_hosts(const std::unordered_set<size_t> &hosts, const std::vector<std::string> &warc_paths, std::mutex &write_file_mutex) {\n\n\t\tconst size_t max_cache_size = 1000000;\n\t\tsize_t file_index = 1;\n\t\t\n\t\tusing vec2d_str = std::vector<std::vector<std::string>>;\n\n\t\tvec2d_str file_names(config::nodes_in_cluster);\n\t\tvec2d_str cache(config::nodes_in_cluster);\n\t\tsize_t done = 0;\n\t\tfor (const std::string &warc_path : warc_paths) {\n\n\t\t\tstd::cout << \"done \" << done << \"/\" << warc_paths.size() << std::endl;\n\t\t\tdone++;\n\n\t\t\tstd::ifstream infile(warc_path);\n\t\t\tboost::iostreams::filtering_istream decompress_stream;\n\t\t\tdecompress_stream.push(boost::iostreams::gzip_decompressor());\n\t\t\tdecompress_stream.push(infile);\n\n\t\t\tstd::string line;\n\t\t\twhile (getline(decompress_stream, line)) {\n\t\t\t\tconst url_link::link link(line);\n\t\t\t\tconst auto target_host = link.target_host_hash();\n\t\t\t\tif (hosts.count(target_host)) {\n\t\t\t\t\tconst size_t node_id = link.index_on_node();\n\t\t\t\t\tcache[node_id].push_back(line);\n\t\t\t\t}\n\t\t\t}\n\n\t\t\tfor (size_t node_id = 0; node_id < 
config::nodes_in_cluster; node_id++) {\n\t\t\t\tif (cache[node_id].size() > max_cache_size) {\n\t\t\t\t\tfile_names[node_id].push_back(write_link_cache(file_index++, cache[node_id], node_id));\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\tfor (size_t node_id = 0; node_id < config::nodes_in_cluster; node_id++) {\n\t\t\tfile_names[node_id].push_back(write_link_cache(file_index++, cache[node_id], node_id));\n\t\t}\n\n\t\twrite_file_mutex.lock();\n\t\tfor (size_t node_id = 0; node_id < config::nodes_in_cluster; node_id++) {\n\t\t\tconst auto filename = config::data_path() + \"/crawl-data/LINK-\" + std::to_string(node_id) + s_suffix + \"/warc.paths\";\n\t\t\tstd::ofstream outfile(filename, std::ios::app);\n\t\t\tfor (const std::string &file : file_names[node_id]) {\n\t\t\t\toutfile << file << \"\\n\";\n\t\t\t}\n\t\t}\n\t\twrite_file_mutex.unlock();\n\t}\n\n\tvoid splitter_with_urls(const std::unordered_set<size_t> &urls, const std::vector<std::string> &warc_paths, std::mutex &write_file_mutex) {\n\n\t\tconst size_t max_cache_size = 150000;\n\t\tsize_t file_index = 1;\n\n\t\tstd::vector<std::vector<std::string>> file_names(config::nodes_in_cluster);\n\t\tstd::vector<std::vector<std::string>> cache(config::nodes_in_cluster);\n\t\tsize_t idx = 0;\n\t\tfor (const std::string &warc_path : warc_paths) {\n\t\t\tstd::cout << warc_path << std::endl;\n\t\t\tstd::ifstream infile(warc_path);\n\t\t\tboost::iostreams::filtering_istream decompress_stream;\n\t\t\tdecompress_stream.push(boost::iostreams::gzip_decompressor());\n\t\t\tdecompress_stream.push(infile);\n\n\t\t\tstd::string line;\n\t\t\twhile (getline(decompress_stream, line)) {\n\t\t\t\tconst URL url(line.substr(0, line.find(\"\\t\")));\n\t\t\t\tif (urls.count(url.hash())) {\n\t\t\t\t\tconst size_t node_id = url.index_on_node();\n\t\t\t\t\tcache[node_id].push_back(line);\n\t\t\t\t}\n\t\t\t}\n\n\t\t\tfor (size_t node_id = 0; node_id < config::nodes_in_cluster; node_id++) {\n\t\t\t\tif (cache[node_id].size() > max_cache_size) 
{\n\t\t\t\t\tfile_names[node_id].push_back(write_cache(file_index++, cache[node_id], node_id));\n\t\t\t\t}\n\t\t\t}\n\n\t\t\tif (idx % 100 == 0) {\n\t\t\t\tstd::cout << warc_path << \" done \" << idx << \"/\" << warc_paths.size() << std::endl;\n\t\t\t} \n\t\t\tidx++;\n\t\t}\n\t\tfor (size_t node_id = 0; node_id < config::nodes_in_cluster; node_id++) {\n\t\t\tfile_names[node_id].push_back(write_cache(file_index++, cache[node_id], node_id));\n\t\t}\n\n\t\twrite_file_mutex.lock();\n\t\tfor (size_t node_id = 0; node_id < config::nodes_in_cluster; node_id++) {\n\t\t\tconst std::string filename = config::data_path() + \"/crawl-data/NODE-\" + std::to_string(node_id) + s_suffix + \"/warc.paths\";\n\t\t\tstd::ofstream outfile(filename, std::ios::app);\n\t\t\tfor (const std::string &file : file_names[node_id]) {\n\t\t\t\toutfile << file << \"\\n\";\n\t\t\t}\n\t\t}\n\t\twrite_file_mutex.unlock();\n\t}\n\n\tvoid splitter_with_roaring(const ::roaring::Roaring64Map &urls, const std::vector<std::string> &warc_paths, std::mutex &write_file_mutex) {\n\n\t\tconst size_t max_cache_size = 150000;\n\t\tsize_t file_index = 1;\n\n\t\tstd::vector<std::vector<std::string>> file_names(config::nodes_in_cluster);\n\t\tstd::vector<std::vector<std::string>> cache(config::nodes_in_cluster);\n\t\tsize_t idx = 0;\n\t\tfor (const std::string &warc_path : warc_paths) {\n\t\t\tstd::cout << warc_path << std::endl;\n\t\t\tstd::ifstream infile(warc_path);\n\t\t\tboost::iostreams::filtering_istream decompress_stream;\n\t\t\tdecompress_stream.push(boost::iostreams::gzip_decompressor());\n\t\t\tdecompress_stream.push(infile);\n\n\t\t\tstd::string line;\n\t\t\twhile (getline(decompress_stream, line)) {\n\t\t\t\tconst URL url(line.substr(0, line.find(\"\\t\")));\n\t\t\t\tif (urls.contains(url.hash() >> 20)) {\n\t\t\t\t\tconst size_t node_id = url.index_on_node();\n\t\t\t\t\tcache[node_id].push_back(line);\n\t\t\t\t}\n\t\t\t}\n\n\t\t\tfor (size_t node_id = 0; node_id < config::nodes_in_cluster; node_id++) 
{\n\t\t\t\tif (cache[node_id].size() > max_cache_size) {\n\t\t\t\t\tfile_names[node_id].push_back(write_cache(file_index++, cache[node_id], node_id));\n\t\t\t\t}\n\t\t\t}\n\n\t\t\tif (idx % 100 == 0) {\n\t\t\t\tstd::cout << warc_path << \" done \" << idx << \"/\" << warc_paths.size() << std::endl;\n\t\t\t} \n\t\t\tidx++;\n\t\t}\n\t\tfor (size_t node_id = 0; node_id < config::nodes_in_cluster; node_id++) {\n\t\t\tfile_names[node_id].push_back(write_cache(file_index++, cache[node_id], node_id));\n\t\t}\n\n\t\twrite_file_mutex.lock();\n\t\tfor (size_t node_id = 0; node_id < config::nodes_in_cluster; node_id++) {\n\t\t\tconst std::string filename = config::data_path() + \"/crawl-data/NODE-\" + std::to_string(node_id) + s_suffix + \"/warc.paths\";\n\t\t\tstd::ofstream outfile(filename, std::ios::app);\n\t\t\tfor (const std::string &file : file_names[node_id]) {\n\t\t\t\toutfile << file << \"\\n\";\n\t\t\t}\n\t\t}\n\t\twrite_file_mutex.unlock();\n\t}\n\n\tvoid splitter_with_bloom(const ::algorithm::bloom_filter &bloom, const std::vector<std::string> &warc_paths, std::mutex &write_file_mutex) {\n\n\t\tconst size_t max_cache_size = 10000;\n\t\tsize_t file_index = 1;\n\n\t\tstd::vector<std::vector<std::string>> file_names(config::nodes_in_cluster);\n\t\tstd::vector<std::vector<std::string>> cache(config::nodes_in_cluster);\n\t\tsize_t idx = 0;\n\t\tfor (const std::string &warc_path : warc_paths) {\n\t\t\tstd::ifstream infile(warc_path);\n\t\t\tboost::iostreams::filtering_istream decompress_stream;\n\t\t\tdecompress_stream.push(boost::iostreams::gzip_decompressor());\n\t\t\tdecompress_stream.push(infile);\n\n\t\t\tstd::string line;\n\t\t\twhile (getline(decompress_stream, line)) {\n\t\t\t\tconst URL url(line.substr(0, line.find(\"\\t\")));\n\t\t\t\tif (bloom.exists(url.hash())) {\n\t\t\t\t\tconst size_t node_id = url.index_on_node();\n\t\t\t\t\tcache[node_id].push_back(line);\n\t\t\t\t}\n\t\t\t}\n\n\t\t\tfor (size_t node_id = 0; node_id < config::nodes_in_cluster; node_id++) 
{\n\t\t\t\tif (cache[node_id].size() > max_cache_size) {\n\t\t\t\t\tfile_names[node_id].push_back(write_cache(file_index++, cache[node_id], node_id));\n\t\t\t\t}\n\t\t\t}\n\n\t\t\tif (idx % 100 == 0) {\n\t\t\t\tstd::cout << warc_path << \" done \" << idx << \"/\" << warc_paths.size() << std::endl;\n\t\t\t} \n\t\t\tidx++;\n\t\t}\n\t\tfor (size_t node_id = 0; node_id < config::nodes_in_cluster; node_id++) {\n\t\t\tfile_names[node_id].push_back(write_cache(file_index++, cache[node_id], node_id));\n\t\t}\n\n\t\twrite_file_mutex.lock();\n\t\tfor (size_t node_id = 0; node_id < config::nodes_in_cluster; node_id++) {\n\t\t\tconst std::string filename = config::data_path() + \"/crawl-data/NODE-\" + std::to_string(node_id) + s_suffix + \"/warc.paths\";\n\t\t\tstd::ofstream outfile(filename, std::ios::app);\n\t\t\tfor (const std::string &file : file_names[node_id]) {\n\t\t\t\toutfile << file << \"\\n\";\n\t\t\t}\n\t\t}\n\t\twrite_file_mutex.unlock();\n\t}\n\n\tstd::unordered_set<size_t> build_link_set(const std::vector<std::string> &warc_paths, size_t hash_min, size_t hash_max) {\n\n\t\tstd::unordered_set<size_t> result;\n\t\tfor (const std::string &warc_path : warc_paths) {\n\t\t\tstd::ifstream infile(warc_path);\n\t\t\tboost::iostreams::filtering_istream decompress_stream;\n\t\t\tdecompress_stream.push(boost::iostreams::gzip_decompressor());\n\t\t\tdecompress_stream.push(infile);\n\n\t\t\tstd::string line;\n\t\t\twhile (getline(decompress_stream, line)) {\n\t\t\t\tconst url_link::link link(line);\n\t\t\t\tconst size_t hash = link.target_url().hash();\n\t\t\t\tif (hash >= hash_min && hash <= hash_max) {\n\t\t\t\t\tresult.insert(hash);\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\n\t\treturn result;\n\t}\n\n\t/*\n\t * Input is a vector with paths to url files. 
Returns an unordered set with all the host hashes.\n\t * */\n\tstd::unordered_set<size_t> build_url_host_set(const std::vector<std::string> &warc_paths) {\n\n\t\tstd::unordered_set<size_t> hosts;\n\t\tfor (const std::string &warc_path : warc_paths) {\n\t\t\tstd::ifstream infile(warc_path);\n\t\t\tboost::iostreams::filtering_istream decompress_stream;\n\t\t\tdecompress_stream.push(boost::iostreams::gzip_decompressor());\n\t\t\tdecompress_stream.push(infile);\n\n\t\t\tstd::string line;\n\t\t\twhile (getline(decompress_stream, line)) {\n\t\t\t\tconst URL url(line.substr(0, line.find(\"\\t\")));\n\t\t\t\thosts.insert(url.host_hash());\n\t\t\t}\n\t\t}\n\n\t\treturn hosts;\n\t}\n\n\tstd::unordered_set<size_t> build_url_set(const std::vector<std::string> &warc_paths) {\n\n\t\tstd::unordered_set<size_t> url_hashes;\n\t\tfor (const std::string &warc_path : warc_paths) {\n\t\t\tstd::ifstream infile(warc_path);\n\t\t\tboost::iostreams::filtering_istream decompress_stream;\n\t\t\tdecompress_stream.push(boost::iostreams::gzip_decompressor());\n\t\t\tdecompress_stream.push(infile);\n\n\t\t\tstd::string line;\n\t\t\twhile (getline(decompress_stream, line)) {\n\t\t\t\tconst URL url(line.substr(0, line.find(\"\\t\")));\n\t\t\t\turl_hashes.insert(url.hash());\n\t\t\t}\n\t\t}\n\n\t\treturn url_hashes;\n\t}\n\n\tvoid create_warc_directories() {\n\t\t// Create directories.\n\t\tfor (const auto &batch : target_url_batches()) {\n\t\t\tboost::filesystem::create_directories(config::data_path() + \"/crawl-data/\" + batch);\n\t\t\tboost::filesystem::create_directories(config::data_path() + \"/crawl-data/\" + batch + \"/files\");\n\t\t}\n\t\tfor (const auto &batch : target_link_batches()) {\n\t\t\tboost::filesystem::create_directories(config::data_path() + \"/crawl-data/\" + batch);\n\t\t\tboost::filesystem::create_directories(config::data_path() + \"/crawl-data/\" + batch + \"/files\");\n\t\t}\n\t}\n\n\tvoid run_splitter() 
{\n\n\t\ttools::create_warc_directories();\n\n\t\tstd::vector<std::thread> threads;\n\t\tauto files = generate_list_with_url_files();\n\t\tauto link_files = generate_list_with_link_files();\n\n\t\tstd::vector<std::vector<std::string>> thread_input;\n\t\talgorithm::vector_chunk(files, ceil((double)files.size() / s_num_threads), thread_input);\n\n\t\tstd::vector<std::vector<std::string>> link_thread_input;\n\t\talgorithm::vector_chunk(link_files, ceil((double)link_files.size() / s_num_threads), link_thread_input);\n\n\t\tstd::mutex write_file_mutex;\n\n\t\t/*\n\t\tRun splitter threads\n\t\t*/\n\t\tfor (size_t i = 0; i < thread_input.size(); i++) {\n\t\t\tthreads.emplace_back(std::thread(splitter, thread_input[i], ref(write_file_mutex)));\n\t\t}\n\n\t\tfor (std::thread &one_thread : threads) {\n\t\t\tone_thread.join();\n\t\t}\n\n\t\tthreads.clear();\n\n\t\t/*\n\t\tRun link_splitter threads\n\t\tfor (size_t i = 0; i < link_thread_input.size(); i++) {\n\t\t\tthreads.emplace_back(thread(link_splitter, link_thread_input[i], ref(write_file_mutex)));\n\t\t}\n\n\t\tfor (thread &one_thread : threads) {\n\t\t\tone_thread.join();\n\t\t}\n\t\t*/\n\t}\n\n\tvoid run_url_splitter_on_urls_in_set(const std::unordered_set<size_t> &urls) {\n\n\t\ttools::create_warc_directories();\n\n\t\tstd::vector<std::thread> threads;\n\t\tauto files = generate_list_with_url_files();\n\n\t\tstd::vector<std::vector<std::string>> thread_input;\n\t\talgorithm::vector_chunk(files, ceil((double)files.size() / s_num_threads), thread_input);\n\n\t\tstd::mutex write_file_mutex;\n\n\t\t/*\n\t\tRun splitter threads\n\t\t*/\n\t\tfor (size_t i = 0; i < thread_input.size(); i++) {\n\t\t\tthreads.emplace_back(std::thread(splitter_with_urls, std::cref(urls), std::cref(thread_input[i]), ref(write_file_mutex)));\n\t\t}\n\n\t\tfor (std::thread &one_thread : threads) {\n\t\t\tone_thread.join();\n\t\t}\n\n\t}\n\n\tvoid run_url_splitter_on_urls_in_roaring(const ::roaring::Roaring64Map &urls) 
{\n\n\t\ttools::create_warc_directories();\n\n\t\tstd::vector<std::thread> threads;\n\t\tauto files = generate_list_with_url_files();\n\n\t\tstd::vector<std::vector<std::string>> thread_input;\n\t\talgorithm::vector_chunk(files, ceil((double)files.size() / s_num_threads), thread_input);\n\n\t\tstd::mutex write_file_mutex;\n\n\t\t/*\n\t\tRun splitter threads\n\t\t*/\n\t\tfor (size_t i = 0; i < thread_input.size(); i++) {\n\t\t\tthreads.emplace_back(std::thread(splitter_with_roaring, std::cref(urls), std::cref(thread_input[i]), ref(write_file_mutex)));\n\t\t}\n\n\t\tfor (std::thread &one_thread : threads) {\n\t\t\tone_thread.join();\n\t\t}\n\n\t}\n\n\tvoid run_url_splitter_on_urls_in_bloom_filter(const ::algorithm::bloom_filter &bloom) {\n\n\t\ttools::create_warc_directories();\n\n\t\tstd::vector<std::thread> threads;\n\t\tauto files = generate_list_with_url_files();\n\n\t\tstd::vector<std::vector<std::string>> thread_input;\n\t\talgorithm::vector_chunk(files, ceil((double)files.size() / s_num_threads), thread_input);\n\n\t\tstd::mutex write_file_mutex;\n\n\t\t/*\n\t\tRun splitter threads\n\t\t*/\n\t\tfor (size_t i = 0; i < thread_input.size(); i++) {\n\t\t\tthreads.emplace_back(std::thread(splitter_with_bloom, std::cref(bloom), std::cref(thread_input[i]), ref(write_file_mutex)));\n\t\t}\n\n\t\tfor (std::thread &one_thread : threads) {\n\t\t\tone_thread.join();\n\t\t}\n\n\t}\n\n\tvoid run_link_splitter_on_links_with_target_host_in_set(const std::unordered_set<size_t> &hosts) {\n\n\t\ttools::create_warc_directories();\n\n\t\tstd::vector<std::thread> threads;\n\t\tauto files = generate_list_with_link_files();\n\n\t\tstd::vector<std::vector<std::string>> thread_input;\n\t\talgorithm::vector_chunk(files, ceil((double)files.size() / s_num_threads), thread_input);\n\n\t\tstd::mutex write_file_mutex;\n\n\t\t/*\n\t\tRun splitter threads\n\t\t*/\n\t\tfor (size_t i = 0; i < thread_input.size(); i++) {\n\t\t\tthreads.emplace_back(std::thread(link_splitter_with_hosts, 
std::cref(hosts), std::cref(thread_input[i]), ref(write_file_mutex)));\n\t\t}\n\n\t\tfor (std::thread &one_thread : threads) {\n\t\t\tone_thread.join();\n\t\t}\n\n\t}\n\n\tstd::unordered_set<size_t> generate_set_of_urls() {\n\n\t\tauto url_files = generate_list_with_url_files();\n\n\t\t// create an unordered set that contains host hashes of all the urls.\n\t\tstd::cout << \"building url hashes map\" << std::endl;\n\t\tstd::unordered_set<size_t> url_hashes;\n\n\t\tstd::vector<std::vector<std::string>> thread_input;\n\t\talgorithm::vector_chunk(url_files, ceil((double)url_files.size() / s_num_threads), thread_input);\n\n\t\tstd::vector<std::future<std::unordered_set<size_t>>> futures;\n\n\t\tfor (size_t i = 0; i < thread_input.size(); i++) {\n\t\t\tfutures.emplace_back(std::async(std::launch::async, build_url_set, thread_input[i]));\n\t\t}\n\n\t\tfor (auto &fut : futures) {\n\t\t\tauto result = fut.get();\n\t\t\turl_hashes.insert(result.begin(), result.end());\n\t\t}\n\n\t\treturn url_hashes;\n\t}\n\n\tvoid run_split_links_with_relevant_domains() {\n\n\t\tauto url_files = generate_list_with_target_url_files();\n\n\t\t// create an unordered set that contains host hashes of all the urls.\n\t\tstd::cout << \"building host hashes map\" << std::endl;\n\t\tstd::unordered_set<size_t> host_hashes;\n\n\t\tstd::vector<std::vector<std::string>> thread_input;\n\t\talgorithm::vector_chunk(url_files, ceil((double)url_files.size() / s_num_threads), thread_input);\n\n\t\tstd::vector<std::future<std::unordered_set<size_t>>> futures;\n\n\t\tfor (size_t i = 0; i < thread_input.size(); i++) {\n\t\t\tfutures.emplace_back(std::async(std::launch::async, build_url_host_set, thread_input[i]));\n\t\t}\n\n\t\tfor (auto &fut : futures) {\n\t\t\tauto result = fut.get();\n\t\t\thost_hashes.insert(result.begin(), result.end());\n\t\t}\n\n\t\tstd::cout << \"done. 
the map size is \" << host_hashes.size() << std::endl;\n\n\t\trun_link_splitter_on_links_with_target_host_in_set(host_hashes);\n\t}\n\n\tvoid split_make_bloom(::algorithm::bloom_filter &bloom, const std::vector<std::string> &warc_paths) {\n\n\t\tstd::vector<uint64_t> cache;\n\n\t\tsize_t idx = 0;\n\t\tfor (const std::string &warc_path : warc_paths) {\n\t\t\tstd::ifstream infile(warc_path);\n\t\t\tboost::iostreams::filtering_istream decompress_stream;\n\t\t\tdecompress_stream.push(boost::iostreams::gzip_decompressor());\n\t\t\tdecompress_stream.push(infile);\n\n\t\t\tstd::string line;\n\t\t\twhile (getline(decompress_stream, line)) {\n\t\t\t\tconst URL url(line.substr(0, line.find(\"\\t\")));\n\t\t\t\tcache.push_back(url.hash());\n\t\t\t}\n\n\t\t\tbloom.insert_many(cache);\n\t\t\tcache.clear();\n\n\t\t\tif (idx % 100 == 0) {\n\t\t\t\tstd::cout << warc_path << \" done \" << idx << \"/\" << warc_paths.size() << std::endl;\n\t\t\t} \n\t\t\tidx++;\n\t\t}\n\n\t}\n\n\tvoid run_split_build_url_bloom() {\n\n\t\tstd::vector<std::thread> threads;\n\t\tauto files = generate_list_with_url_files();\n\n\t\tstd::vector<std::vector<std::string>> thread_input;\n\t\talgorithm::vector_chunk(files, ceil((double)files.size() / s_num_threads), thread_input);\n\n\t\t::algorithm::bloom_filter bloom;\n\n\t\t/*\n\t\tRun splitter threads\n\t\t*/\n\t\tfor (size_t i = 0; i < thread_input.size(); i++) {\n\t\t\tthreads.emplace_back(std::thread(split_make_bloom, std::ref(bloom), std::cref(thread_input[i])));\n\t\t}\n\n\t\tfor (std::thread &one_thread : threads) {\n\t\t\tone_thread.join();\n\t\t}\n\n\t\tbloom.write_file(config::data_path() + \"/0/url_filter_main.bloom\");\n\t}\n\n\tvoid split_make_direct_links(const ::algorithm::bloom_filter &bloom, const std::vector<std::string> &warc_paths) {\n\n\t\tsize_t done = 0;\n\t\tfor (const std::string &warc_path : warc_paths) {\n\n\t\t\tstd::cout << \"done \" << done << \"/\" << warc_paths.size() << std::endl;\n\t\t\tdone++;\n\n\t\t\tauto 
target_warc_path = warc_path;\n\t\t\tconst size_t pos = target_warc_path.find(\".links.gz\");\n\n\t\t\tif (pos != std::string::npos) {\n\t\t\t\ttarget_warc_path.replace(pos, 9, \".direct.links.gz\");\n\t\t\t} else {\n\t\t\t\tstd::cout << \"ERROR: \" << warc_path << std::endl;\n\t\t\t\treturn;\n\t\t\t}\n\n\t\t\tstd::ofstream outfile(target_warc_path, std::ios::trunc | std::ios::binary);\n\n\t\t\tboost::iostreams::filtering_ostream compress_stream;\n\t\t\tcompress_stream.push(boost::iostreams::gzip_compressor());\n\t\t\tcompress_stream.push(outfile);\n\n\t\t\tstd::ifstream infile(warc_path);\n\t\t\tboost::iostreams::filtering_istream decompress_stream;\n\t\t\tdecompress_stream.push(boost::iostreams::gzip_decompressor());\n\t\t\tdecompress_stream.push(infile);\n\n\t\t\tstd::string line;\n\t\t\twhile (getline(decompress_stream, line)) {\n\t\t\t\tconst url_link::link link(line);\n\n\t\t\t\tif (bloom.exists(link.target_url().hash())) {\n\t\t\t\t\tcompress_stream << line << \"\\n\";\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t}\n\n\tvoid run_split_direct_links() {\n\n\t\t::algorithm::bloom_filter bloom;\n\t\tbloom.read_file(config::data_path() + \"/0/url_filter_main.bloom\");\n\n\t\tstd::vector<std::thread> threads;\n\t\tauto files = generate_list_with_link_files();\n\n\t\tstd::vector<std::vector<std::string>> thread_input;\n\t\talgorithm::vector_chunk(files, ceil((double)files.size() / s_num_threads), thread_input);\n\n\t\t/*\n\t\tRun splitter threads\n\t\t*/\n\t\tfor (size_t i = 0; i < thread_input.size(); i++) {\n\t\t\tthreads.emplace_back(std::thread(split_make_direct_links, std::cref(bloom), std::cref(thread_input[i])));\n\t\t}\n\n\t\tfor (std::thread &one_thread : threads) {\n\t\t\tone_thread.join();\n\t\t}\n\t}\n\n\tvoid split_make_link_bloom(::algorithm::bloom_filter &bloom, const std::vector<std::string> &warc_paths) {\n\n\t\tstd::vector<uint64_t> cache;\n\n\t\tsize_t idx = 0;\n\t\tfor (const std::string &warc_path : warc_paths) {\n\t\t\tstd::ifstream 
infile(warc_path);\n\t\t\tboost::iostreams::filtering_istream decompress_stream;\n\t\t\tdecompress_stream.push(boost::iostreams::gzip_decompressor());\n\t\t\tdecompress_stream.push(infile);\n\n\t\t\tstd::string line;\n\t\t\twhile (getline(decompress_stream, line)) {\n\t\t\t\tconst url_link::link link(line);\n\t\t\t\tconst size_t hash = link.target_url().hash();\n\t\t\t\tcache.push_back(hash);\n\t\t\t}\n\n\t\t\tbloom.insert_many(cache);\n\t\t\tcache.clear();\n\n\t\t\tif (idx % 100 == 0) {\n\t\t\t\tstd::cout << warc_path << \" done \" << idx << \"/\" << warc_paths.size() << std::endl;\n\t\t\t} \n\t\t\tidx++;\n\t\t}\n\n\t}\n\n\tvoid run_split_build_direct_link_bloom() {\n\n\t\tstd::vector<std::thread> threads;\n\t\tauto files = generate_list_with_direct_link_files();\n\n\t\tstd::vector<std::vector<std::string>> thread_input;\n\t\talgorithm::vector_chunk(files, ceil((double)files.size() / s_num_threads), thread_input);\n\n\t\t::algorithm::bloom_filter bloom;\n\n\t\t/*\n\t\tRun splitter threads\n\t\t*/\n\t\tfor (size_t i = 0; i < thread_input.size(); i++) {\n\t\t\tthreads.emplace_back(std::thread(split_make_link_bloom, std::ref(bloom), std::cref(thread_input[i])));\n\t\t}\n\n\t\tfor (std::thread &one_thread : threads) {\n\t\t\tone_thread.join();\n\t\t}\n\n\t\tbloom.write_file(config::data_path() + \"/0/direct_link_filter_main.bloom\");\n\t}\n\n\tvoid run_split_urls_with_direct_links() {\n\t\t\n\t\t::algorithm::bloom_filter bloom;\n\t\tbloom.read_file(config::data_path() + \"/0/direct_link_filter_main.bloom\");\n\n\t\trun_url_splitter_on_urls_in_bloom_filter(bloom);\n\t}\n\n\n}\n"
  },
  {
    "path": "src/tools/splitter.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include <iostream>\n#include <vector>\n\nnamespace tools {\n\n\tconst std::string s_suffix = \"-small\";\n\tconst size_t s_num_threads = 12;\n\n\tstd::vector<std::string> target_url_batches();\n\tstd::vector<std::string> target_link_batches();\n\n\tstd::vector<std::string> generate_list_with_url_files();\n\tstd::vector<std::string> generate_list_with_link_files();\n\tstd::vector<std::string> generate_list_with_target_url_files();\n\tstd::vector<std::string> generate_list_with_target_link_files();\n\n\tvoid run_splitter();\n\tvoid run_split_urls_with_direct_links();\n\tvoid run_split_links_with_relevant_domains();\n\tvoid run_split_build_url_bloom();\n\tvoid run_split_direct_links();\n\tvoid run_split_build_direct_link_bloom();\n\n}\n\n"
  },
  {
    "path": "src/transfer/transfer.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include \"config.h\"\n#include \"transfer.h\"\n#include <fstream>\n#include \"common/ThreadPool.h\"\n#include \"logger/logger.h\"\n#include \"profiler/profiler.h\"\n#include \"file/file.h\"\n#include \"text/text.h\"\n#include \"parser/parser.h\"\n#include \"algorithm/hash.h\"\n\nusing namespace std;\n\nnamespace transfer {\n\n\tsize_t curl_stringstream_writer(void *ptr, size_t size, size_t nmemb, stringstream *ss) {\n\t\tsize_t byte_size = size * nmemb;\n\t\tss->write((char *)ptr, byte_size);\n\t\treturn byte_size;\n\t}\n\n\tsize_t curl_ostream_writer(void *ptr, size_t size, size_t nmemb, ostream *os) {\n\t\tsize_t byte_size = size * nmemb;\n\t\tos->write((char *)ptr, byte_size);\n\t\treturn byte_size;\n\t}\n\n\tsize_t curl_string_writer(void *ptr, 
size_t size, size_t nmemb, string *str) {\n\t\tsize_t byte_size = size * nmemb;\n\t\tstr->append((char *)ptr, byte_size);\n\t\treturn byte_size;\n\t}\n\n\tstruct curl_string_read_struct {\n\t\tconst char *buffer;\n\t\tsize_t buffer_len;\n\t\tsize_t offset;\n\t};\n\n\tsize_t curl_string_reader(char *ptr, size_t size, size_t nmemb, void *userdata) {\n\t\tstruct curl_string_read_struct *arg = (struct curl_string_read_struct *)userdata;\n\n\t\tif (arg->offset >= arg->buffer_len) {\n\t\t\treturn 0ull;\n\t\t}\n\n\t\tsize_t max_read = size * nmemb;\n\t\tsize_t read_bytes = arg->buffer_len - arg->offset;\n\t\tif (read_bytes > max_read) read_bytes = max_read;\n\n\t\tmemcpy(ptr, &arg->buffer[arg->offset], read_bytes);\n\n\t\targ->offset += read_bytes;\n\n\t\treturn read_bytes;\n\t}\n\n\tsize_t curl_file_reader(char *ptr, size_t size, size_t nmemb, void *userdata) {\n\t\tstd::ifstream *infile = (std::ifstream *)userdata;\n\n\t\tif (infile->eof()) {\n\t\t\treturn 0ull;\n\t\t}\n\n\t\tsize_t max_read = size * nmemb;\n\n\t\tinfile->read(ptr, max_read);\n\n\t\treturn infile->gcount();\n\t}\n\n\tvoid set_internal_auth(CURL *curl) {\n\t\tcurl_easy_setopt(curl, CURLOPT_USERNAME, username.c_str());\n\t\tcurl_easy_setopt(curl, CURLOPT_PASSWORD, password.c_str());\n\t}\n\n\tstring make_url(const string &url) {\n\n\t\tif (url.find(\"http://\") == 0 || url.find(\"https://\") == 0) {\n\t\t\treturn url;\n\t\t}\n\n\t\tif (url.size() && url[0] != '/') {\n\t\t\treturn \"http://\" + config::master + \"/\" + url;\n\t\t}\n\t\treturn \"http://\" + config::master + url;\n\t}\n\n\tstring file_to_string(const string &file_path, int &error) {\n\t\tCURL *curl = curl_easy_init();\n\t\terror = ERROR;\n\t\tif (curl) {\n\t\t\tCURLcode res;\n\t\t\tLOG_INFO(\"Downloading url: \" + make_url(file_path));\n\t\t\tcurl_easy_setopt(curl, CURLOPT_URL, make_url(file_path).c_str());\n\n\t\t\tset_internal_auth(curl);\n\n\t\t\tstringstream response;\n\t\t\tcurl_easy_setopt(curl, CURLOPT_WRITEDATA, 
&response);\n\t\t\tcurl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, curl_stringstream_writer);\n\n\t\t\tres = curl_easy_perform(curl);\n\n\t\t\tif (res == CURLE_OK) {\n\t\t\t\tlong response_code;\n\t\t\t\tcurl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &response_code);\n\t\t\t\tif (response_code == 200) {\n\t\t\t\t\terror = OK;\n\t\t\t\t}\n\t\t\t}\n\n\t\t\tcurl_easy_cleanup(curl);\n\n\t\t\treturn response.str();\n\t\t}\n\n\t\treturn \"\";\n\t}\n\n\tstring gz_file_to_string(const string &file_path, int &error) {\n\t\tCURL *curl = curl_easy_init();\n\t\terror = ERROR;\n\t\tif (curl) {\n\t\t\tCURLcode res;\n\t\t\tLOG_INFO(\"Downloading url: \" + make_url(file_path));\n\t\t\tcurl_easy_setopt(curl, CURLOPT_URL, make_url(file_path).c_str());\n\n\t\t\tset_internal_auth(curl);\n\n\t\t\tstringstream response;\n\t\t\tcurl_easy_setopt(curl, CURLOPT_WRITEDATA, &response);\n\t\t\tcurl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, curl_stringstream_writer);\n\n\t\t\tres = curl_easy_perform(curl);\n\n\t\t\tstring response_str;\n\t\t\ttry {\n\t\t\t\tboost::iostreams::filtering_istream decompress_stream;\n\t\t\t\tdecompress_stream.push(boost::iostreams::gzip_decompressor());\n\t\t\t\tdecompress_stream.push(response);\n\n\t\t\t\tresponse_str = string(istreambuf_iterator<char>(decompress_stream), {});\n\t\t\t} catch (...) 
{\n\t\t\t\tcurl_easy_cleanup(curl);\n\t\t\t\terror = ERROR;\n\t\t\t\treturn \"\";\n\t\t\t}\n\n\n\t\t\tif (res == CURLE_OK) {\n\t\t\t\tlong response_code;\n\t\t\t\tcurl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &response_code);\n\t\t\t\tif (response_code == 200) {\n\t\t\t\t\terror = OK;\n\t\t\t\t}\n\t\t\t}\n\n\t\t\tcurl_easy_cleanup(curl);\n\n\t\t\treturn response_str;\n\t\t}\n\n\t\treturn \"\";\n\t}\n\n\tvoid file_to_stream(const string &file_path, ostream &output_stream, int &error) {\n\t\tCURL *curl = curl_easy_init();\n\t\terror = ERROR;\n\t\tif (curl) {\n\t\t\tCURLcode res;\n\t\t\tLOG_INFO(\"Downloading url: \" + make_url(file_path));\n\t\t\tcurl_easy_setopt(curl, CURLOPT_URL, make_url(file_path).c_str());\n\n\t\t\tset_internal_auth(curl);\n\n\t\t\tcurl_easy_setopt(curl, CURLOPT_WRITEDATA, &output_stream);\n\t\t\tcurl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, curl_ostream_writer);\n\n\t\t\tres = curl_easy_perform(curl);\n\n\t\t\tif (res == CURLE_OK) {\n\t\t\t\tlong response_code;\n\t\t\t\tcurl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &response_code);\n\t\t\t\tif (response_code == 200) {\n\t\t\t\t\terror = OK;\n\t\t\t\t}\n\t\t\t}\n\n\t\t\tcurl_easy_cleanup(curl);\n\n\t\t}\n\t}\n\n\tvoid gz_file_to_stream(const string &file_path, ostream &output_stream, int &error) {\n\t\tCURL *curl = curl_easy_init();\n\t\terror = ERROR;\n\t\tif (curl) {\n\t\t\tCURLcode res;\n\t\t\tLOG_INFO(\"Downloading url: \" + make_url(file_path));\n\t\t\tcurl_easy_setopt(curl, CURLOPT_URL, make_url(file_path).c_str());\n\n\t\t\tset_internal_auth(curl);\n\n\t\t\tstringstream response;\n\t\t\tcurl_easy_setopt(curl, CURLOPT_WRITEDATA, &response);\n\t\t\tcurl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, curl_stringstream_writer);\n\n\t\t\tres = curl_easy_perform(curl);\n\n\t\t\tif (res == CURLE_OK) {\n\t\t\t\tlong response_code;\n\t\t\t\tcurl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &response_code);\n\t\t\t\tif (response_code == 200) {\n\t\t\t\t\terror = OK;\n\t\t\t\t}\n\t\t\t}\n\n\t\t\ttry 
{\n\t\t\t\tboost::iostreams::filtering_istream decompress_stream;\n\t\t\t\tdecompress_stream.push(boost::iostreams::gzip_decompressor());\n\t\t\t\tdecompress_stream.push(response);\n\n\t\t\t\toutput_stream << decompress_stream.rdbuf();\n\t\t\t} catch(...) {\n\t\t\t\terror = ERROR;\n\t\t\t}\n\n\t\t\tcurl_easy_cleanup(curl);\n\t\t}\n\t}\n\n\tvoid url_to_string(const string &url, string &buffer, int &error) {\n\t\tCURL *curl = curl_easy_init();\n\t\terror = ERROR;\n\t\tconst size_t original_buffer_size = buffer.size();\n\t\tif (curl) {\n\t\t\tCURLcode res;\n\t\t\tLOG_INFO(\"Downloading url: \" + url);\n\t\t\tcurl_easy_setopt(curl, CURLOPT_URL, url.c_str());\n\t\t\tcurl_easy_setopt(curl, CURLOPT_LOW_SPEED_LIMIT, 5000);\n\t\t\tcurl_easy_setopt(curl, CURLOPT_LOW_SPEED_TIME, 5);\n\n\t\t\tcurl_easy_setopt(curl, CURLOPT_WRITEDATA, &buffer);\n\t\t\tcurl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, curl_string_writer);\n\n\t\t\tres = curl_easy_perform(curl);\n\n\t\t\tif (res == CURLE_OK) {\n\t\t\t\tlong response_code;\n\t\t\t\tcurl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &response_code);\n\t\t\t\tif (response_code >= 200 && response_code < 300) {\n\t\t\t\t\terror = OK;\n\t\t\t\t}\n\t\t\t} else {\n\t\t\t\t// If an error occurred we set the size of the buffer to the original size, removing any appended data.\n\t\t\t\tbuffer.resize(original_buffer_size);\n\t\t\t}\n\n\t\t\tcurl_easy_cleanup(curl);\n\t\t}\n\t}\n\n\tstring run_gz_download_thread(const string &file_path) {\n\t\tsize_t hsh = algorithm::hash(file_path);\n\t\tconst string target_filename = config::data_path() + \"/\" + to_string(hsh % 8) + \"/tmp/tmp_\" + to_string(hsh);\n\t\tofstream target_file(target_filename, ios::binary | ios::trunc);\n\t\tint error;\n\t\tgz_file_to_stream(file_path, target_file, error);\n\t\tif (error != OK) {\n\t\t\treturn \"\";\n\t\t}\n\t\treturn target_filename;\n\t}\n\n\tvector<string> download_gz_files_to_disk(const vector<string> &files_to_download) {\n\t\t\n\t\tThreadPool 
pool(config::num_async_file_transfers);\n\t\tstd::vector<std::future<string>> results;\n\n\t\tfor (const string &file : files_to_download) {\n\t\t\tresults.emplace_back(\n\t\t\t\tpool.enqueue([file] {\n\t\t\t\t\treturn run_gz_download_thread(file);\n\t\t\t\t})\n\t\t\t);\n\t\t}\n\n\t\tvector<string> local_filenames;\n\t\tfor(auto && result: results) {\n\t\t\tconst string filename = result.get();\n\t\t\tif (filename != \"\") {\n\t\t\t\tlocal_filenames.push_back(filename);\n\t\t\t}\n\t\t}\n\n\t\treturn local_filenames;\n\t}\n\n\tvoid delete_downloaded_files(const vector<string> &files) {\n\t\tLOG_INFO(\"Deleting \" + to_string(files.size()) + \" downloaded files\");\n\t\tfor (const string &file : files) {\n\t\t\tfile::delete_file(file);\n\t\t}\n\t}\n\n\tsize_t head_content_length(const string &url, int &error) {\n\t\tCURL *curl = curl_easy_init();\n\t\terror = ERROR;\n\t\tif (curl) {\n\t\t\tCURLcode res;\n\t\t\tLOG_INFO(\"Making head request to:\" + url);\n\t\t\tcurl_easy_setopt(curl, CURLOPT_URL, url.c_str());\n\n\t\t\tstringstream response;\n\t\t\tcurl_easy_setopt(curl, CURLOPT_NOBODY, 1);\n\t\t\tcurl_easy_setopt(curl, CURLOPT_HEADER, 1);\n\t\t\tcurl_easy_setopt(curl, CURLOPT_WRITEDATA, &response);\n\t\t\tcurl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, curl_stringstream_writer);\n\n\t\t\tres = curl_easy_perform(curl);\n\n\t\t\tstring response_str;\n\t\t\ttry {\n\t\t\t\tresponse_str = string(istreambuf_iterator<char>(response), {});\n\t\t\t} catch (...) 
{\n\t\t\t\tcurl_easy_cleanup(curl);\n\t\t\t\terror = ERROR;\n\t\t\t\treturn 0;\n\t\t\t}\n\n\t\t\tif (res == CURLE_OK) {\n\t\t\t\tlong response_code;\n\t\t\t\tcurl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &response_code);\n\t\t\t\tif (response_code == 200) {\n\t\t\t\t\terror = OK;\n\t\t\t\t} else {\n\t\t\t\t\tcurl_easy_cleanup(curl);\n\t\t\t\t\treturn 0;\n\t\t\t\t}\n\t\t\t}\n\n\t\t\tcurl_easy_cleanup(curl);\n\n\t\t\tconst string content_len_str = parser::get_http_header(text::lower_case(response_str), \"content-length: \");\n\t\t\tsize_t content_len;\n\t\t\ttry {\n\t\t\t\tcontent_len = stoull(content_len_str);\n\t\t\t} catch (...) {\n\t\t\t\terror = ERROR;\n\t\t\t\treturn 0;\n\t\t\t}\n\n\t\t\treturn content_len;\n\t\t}\n\n\t\treturn 0;\n\t}\n\n\tint upload_file(const string &path, const string &data) {\n\t\tCURL *curl = curl_easy_init();\n\t\tif (curl) {\n\t\t\tCURLcode res;\n\t\t\tconst string url = \"http://\" + config::upload + \"/\" + path;\n\t\t\tLOG_INFO(\"Uploading file to:\" + url);\n\t\t\tcurl_easy_setopt(curl, CURLOPT_URL, url.c_str());\n\t\t\tcurl_easy_setopt(curl, CURLOPT_LOW_SPEED_TIME, 30L);\n\t\t\tcurl_easy_setopt(curl, CURLOPT_LOW_SPEED_LIMIT, 30L);\n\n\t\t\tstruct curl_string_read_struct arg;\n\t\t\targ.buffer = data.c_str();\n\t\t\targ.buffer_len = data.size();\n\t\t\targ.offset = 0;\n\n\t\t\tcurl_easy_setopt(curl, CURLOPT_UPLOAD, 1l);\n\t\t\tcurl_easy_setopt(curl, CURLOPT_USERNAME, config::file_upload_user.c_str());\n\t\t\tcurl_easy_setopt(curl, CURLOPT_PASSWORD, config::file_upload_password.c_str());\n\t\t\tcurl_easy_setopt(curl, CURLOPT_READFUNCTION, curl_string_reader);\n\t\t\tcurl_easy_setopt(curl, CURLOPT_READDATA, &arg);\n\n\t\t\tres = curl_easy_perform(curl);\n\n\t\t\tcurl_easy_cleanup(curl);\n\n\t\t\tif (res == CURLE_OK) {\n\t\t\t\treturn OK;\n\t\t\t}\n\t\t\treturn ERROR;\n\t\t}\n\n\t\treturn ERROR;\n\t}\n\n\tint upload_gz_file(const string &path, const string &data) {\n\t\tCURL *curl = curl_easy_init();\n\t\tif (curl) {\n\t\t\tCURLcode 
res;\n\t\t\tconst string url = \"http://\" + config::upload + \"/\" + path;\n\t\t\tLOG_INFO(\"Uploading file to:\" + url);\n\t\t\tcurl_easy_setopt(curl, CURLOPT_URL, url.c_str());\n\t\t\tcurl_easy_setopt(curl, CURLOPT_LOW_SPEED_TIME, 30L);\n\t\t\tcurl_easy_setopt(curl, CURLOPT_LOW_SPEED_LIMIT, 30L);\n\n\t\t\tstringstream ss(data);\n\t\t\tboost::iostreams::filtering_istream compress_stream;\n\t\t\tcompress_stream.push(boost::iostreams::gzip_compressor());\n\t\t\tcompress_stream.push(ss);\n\n\t\t\tstring compressed_data = string(istreambuf_iterator<char>(compress_stream), {});\n\n\t\t\tstruct curl_string_read_struct arg;\n\t\t\targ.buffer = compressed_data.c_str();\n\t\t\targ.buffer_len = compressed_data.size();\n\t\t\targ.offset = 0;\n\n\t\t\tcurl_easy_setopt(curl, CURLOPT_UPLOAD, 1l);\n\t\t\tcurl_easy_setopt(curl, CURLOPT_USERNAME, config::file_upload_user.c_str());\n\t\t\tcurl_easy_setopt(curl, CURLOPT_PASSWORD, config::file_upload_password.c_str());\n\t\t\tcurl_easy_setopt(curl, CURLOPT_READFUNCTION, curl_string_reader);\n\t\t\tcurl_easy_setopt(curl, CURLOPT_READDATA, &arg);\n\n\t\t\tres = curl_easy_perform(curl);\n\n\t\t\tcurl_easy_cleanup(curl);\n\n\t\t\tif (res == CURLE_OK) {\n\t\t\t\treturn OK;\n\t\t\t}\n\t\t\treturn ERROR;\n\t\t}\n\n\t\treturn ERROR;\n\t}\n\n\tint upload_file_from_disk(const string &dest_path, const string &filename) {\n\t\tCURL *curl = curl_easy_init();\n\t\tif (curl) {\n\t\t\tCURLcode res;\n\t\t\tconst string url = \"http://\" + config::upload + \"/\" + dest_path;\n\t\t\tLOG_INFO(\"Uploading file to:\" + url);\n\t\t\tcurl_easy_setopt(curl, CURLOPT_URL, url.c_str());\n\t\t\tcurl_easy_setopt(curl, CURLOPT_LOW_SPEED_TIME, 30L);\n\t\t\tcurl_easy_setopt(curl, CURLOPT_LOW_SPEED_LIMIT, 30L);\n\n\t\t\tstd::ifstream infile(filename, std::ios::in | std::ios::binary);\n\n\t\t\tcurl_easy_setopt(curl, CURLOPT_UPLOAD, 1l);\n\t\t\tcurl_easy_setopt(curl, CURLOPT_USERNAME, config::file_upload_user.c_str());\n\t\t\tcurl_easy_setopt(curl, CURLOPT_PASSWORD, 
config::file_upload_password.c_str());\n\t\t\tcurl_easy_setopt(curl, CURLOPT_READFUNCTION, curl_file_reader);\n\t\t\tcurl_easy_setopt(curl, CURLOPT_READDATA, &infile);\n\n\t\t\tres = curl_easy_perform(curl);\n\n\t\t\tcurl_easy_cleanup(curl);\n\n\t\t\tif (res == CURLE_OK) {\n\t\t\t\treturn OK;\n\t\t\t}\n\t\t\treturn ERROR;\n\t\t}\n\n\t\treturn ERROR;\n\t}\n\n\t/*\n\t * Perform simple GET request and return response.\n\t * */\n\thttp::response get(const string &url) {\n\t\treturn get(url, vector<string>{});\n\t}\n\n\thttp::response get(const string &url, const vector<string> &headers) {\n\t\tCURL *curl = curl_easy_init();\n\t\tstruct curl_slist *header_list = NULL;\n\t\thttp::response response;\n\t\tif (curl) {\n\n\t\t\tfor (const string &header : headers) {\n\t\t\t\theader_list = curl_slist_append(header_list, header.c_str());\n\t\t\t}\n\n\t\t\tcurl_easy_setopt(curl, CURLOPT_URL, url.c_str());\n\n\t\t\tcurl_easy_setopt(curl, CURLOPT_USERNAME, config::file_upload_user.c_str());\n\t\t\tcurl_easy_setopt(curl, CURLOPT_PASSWORD, config::file_upload_password.c_str());\n\t\t\tcurl_easy_setopt(curl, CURLOPT_HTTPHEADER, header_list);\n\n\t\t\tstringstream response_stream;\n\t\t\tcurl_easy_setopt(curl, CURLOPT_WRITEDATA, &response_stream);\n\t\t\tcurl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, curl_stringstream_writer);\n\n\t\t\tcurl_easy_perform(curl);\n\n\t\t\tcurl_slist_free_all(header_list);\n\n\t\t\tsize_t code = 0;\n\t\t\tcurl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &code);\n\n\t\t\tresponse.code(code);\n\t\t\tresponse.body(response_stream.str());\n\n\t\t\tcurl_easy_cleanup(curl);\n\t\t}\n\n\t\treturn response;\n\t}\n\n\t/*\n\t * Perform simple POST request and return response.\n\t * */\n\thttp::response post(const string &url, const string &data) {\n\t\treturn post(url, data, {});\n\t}\n\n\thttp::response post(const string &url, const string &data, const vector<string> &headers) {\n\t\tCURL *curl = curl_easy_init();\n\t\tstruct curl_slist *header_list = 
NULL;\n\t\thttp::response response;\n\t\tif (curl) {\n\n\t\t\tfor (const string &header : headers) {\n\t\t\t\theader_list = curl_slist_append(header_list, header.c_str());\n\t\t\t}\n\n\t\t\tcurl_easy_setopt(curl, CURLOPT_URL, url.c_str());\n\n\t\t\tstruct curl_string_read_struct arg;\n\t\t\targ.buffer = data.c_str();\n\t\t\targ.buffer_len = data.size();\n\t\t\targ.offset = 0;\n\n\t\t\tcurl_easy_setopt(curl, CURLOPT_POST, 1l);\n\t\t\tcurl_easy_setopt(curl, CURLOPT_USERNAME, config::file_upload_user.c_str());\n\t\t\tcurl_easy_setopt(curl, CURLOPT_PASSWORD, config::file_upload_password.c_str());\n\t\t\tcurl_easy_setopt(curl, CURLOPT_HTTPHEADER, header_list);\n\t\t\tcurl_easy_setopt(curl, CURLOPT_READFUNCTION, curl_string_reader);\n\t\t\tcurl_easy_setopt(curl, CURLOPT_READDATA, &arg);\n\n\t\t\tstringstream response_stream;\n\t\t\tcurl_easy_setopt(curl, CURLOPT_WRITEDATA, &response_stream);\n\t\t\tcurl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, curl_stringstream_writer);\n\n\t\t\tCURLcode curl_result = curl_easy_perform(curl);\n\n\t\t\tif (curl_result == CURLE_OK) {\n\t\t\t\tsize_t code = 0;\n\t\t\t\tcurl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &code);\n\t\t\t\tresponse.code(code);\n\t\t\t\tresponse.body(response_stream.str());\n\t\t\t} else {\n\t\t\t\tresponse.code(0);\n\t\t\t\tresponse.body(\"\");\n\t\t\t}\n\n\t\t\tcurl_easy_cleanup(curl);\n\t\t}\n\n\t\treturn response;\n\t}\n\n\t/*\n\t * Perform simple PUT request and return response.\n\t * */\n\thttp::response put(const string &url, const string &data) {\n\t\tCURL *curl = curl_easy_init();\n\t\thttp::response response;\n\t\tif (curl) {\n\t\t\tcurl_easy_setopt(curl, CURLOPT_URL, url.c_str());\n\t\t\tcurl_easy_setopt(curl, CURLOPT_LOW_SPEED_TIME, 30L);\n\t\t\tcurl_easy_setopt(curl, CURLOPT_LOW_SPEED_LIMIT, 30L);\n\n\t\t\tstruct curl_string_read_struct arg;\n\t\t\targ.buffer = data.c_str();\n\t\t\targ.buffer_len = data.size();\n\t\t\targ.offset = 0;\n\n\t\t\tcurl_easy_setopt(curl, CURLOPT_UPLOAD, 
1l);\n\t\t\tcurl_easy_setopt(curl, CURLOPT_USERNAME, config::file_upload_user.c_str());\n\t\t\tcurl_easy_setopt(curl, CURLOPT_PASSWORD, config::file_upload_password.c_str());\n\t\t\tcurl_easy_setopt(curl, CURLOPT_READFUNCTION, curl_string_reader);\n\t\t\tcurl_easy_setopt(curl, CURLOPT_READDATA, &arg);\n\n\t\t\tstringstream response_stream;\n\t\t\tcurl_easy_setopt(curl, CURLOPT_WRITEDATA, &response_stream);\n\t\t\tcurl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, curl_stringstream_writer);\n\n\t\t\tcurl_easy_perform(curl);\n\n\t\t\tsize_t code = 0;\n\t\t\tcurl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &code);\n\n\t\t\tresponse.code(code);\n\t\t\tresponse.body(response_stream.str());\n\n\t\t\tcurl_easy_cleanup(curl);\n\t\t}\n\n\t\treturn response;\n\t}\n}\n"
  },
  {
    "path": "src/transfer/transfer.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include <curl/curl.h>\n#include <iostream>\n#include <sstream>\n\n#include \"http/response.h\"\n\n#include <boost/iostreams/filtering_stream.hpp>\n#include <boost/iostreams/filter/gzip.hpp>\n\nnamespace transfer {\n\n\tconst std::string username = \"alexandria\";\n\tconst std::string password = \"wmXN6U4u\";\n\n\tconst int OK = 0;\n\tconst int ERROR = 1;\n\n\tsize_t curl_stringstream_writer(void *ptr, size_t size, size_t nmemb, std::stringstream *ss);\n\tsize_t curl_ostream_writer(void *ptr, size_t size, size_t nmemb, std::ostream *os);\n\n\tstd::string file_to_string(const std::string &file_path, int &error);\n\tstd::string gz_file_to_string(const std::string &file_path, int &error);\n\n\tvoid file_to_stream(const std::string 
&file_path, std::ostream &output_stream, int &error);\n\tvoid gz_file_to_stream(const std::string &file_path, std::ostream &output_stream, int &error);\n\n\tvoid url_to_string(const std::string &url, std::string &buffer, int &error);\n\n\tstd::vector<std::string> download_gz_files_to_disk(const std::vector<std::string> &files_to_download);\n\tvoid delete_downloaded_files(const std::vector<std::string> &files);\n\n\t// Make a http HEAD request and return the content length. Return 0 on failure and sets the error parameter to transfer::ERROR\n\tsize_t head_content_length(const std::string &url, int &error);\n\n\tint upload_file(const std::string &path, const std::string &data);\n\tint upload_gz_file(const std::string &path, const std::string &data);\n\tint upload_file_from_disk(const std::string &dest_path, const std::string &filename);\n\n\t/*\n\t * Perform simple GET request and return response.\n\t * */\n\thttp::response get(const std::string &url);\n\thttp::response get(const std::string &url, const std::vector<std::string> &headers);\n\n\t/*\n\t * Perform simple POST request and return response.\n\t * */\n\thttp::response post(const std::string &url, const std::string &data);\n\thttp::response post(const std::string &url, const std::string &data, const std::vector<std::string> &headers);\n\n\t/*\n\t * Perform simple PUT request and return response.\n\t * */\n\thttp::response put(const std::string &url, const std::string &data);\n\n}\n"
  },
  {
    "path": "src/url_link/link.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include \"link.h\"\n#include <boost/algorithm/string.hpp>\n\nusing namespace std;\n\nnamespace url_link {\n\n\tlink::link() {\n\n\t}\n\n\tlink::link(const string &standard_link_data) {\n\t\t\tvector<string> col_values;\n\t\t\tboost::algorithm::split(col_values, standard_link_data, boost::is_any_of(\"\\t\"));\n\n\t\t\tm_source_url = URL(col_values[0], col_values[1]);\n\t\t\tm_target_url = URL(col_values[2], col_values[3]);\n\t\t\tm_link_text = col_values[4].substr(0, 1000);\n\n\t\t\tm_target_host_hash = m_target_url.host_hash();\n\t\t\tm_source_harmonic = 0;\n\t\t\tm_target_harmonic = 0;\n\t}\n\n\tlink::link(const URL &source_url, const URL &target_url, float source_harmonic, float 
target_harmonic)\n\t:\n\t\tm_source_url(source_url),\n\t\tm_target_url(target_url),\n\t\tm_target_host_hash(target_url.host_hash()),\n\t\tm_source_harmonic(source_harmonic),\n\t\tm_target_harmonic(target_harmonic)\n\t{\n\t}\n\n\tlink::~link() {\n\n\t}\n\n\tfloat link::url_score() const {\n\t\treturn max(m_source_harmonic - m_target_harmonic, m_source_harmonic / 100.0f);\n\t}\n\n\tfloat link::domain_score() const {\n\t\treturn max(m_source_harmonic - m_target_harmonic, m_source_harmonic / 100.0f)/100.0;\n\t}\n\n}\n\n"
  },
  {
    "path": "src/url_link/link.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include \"URL.h\"\n#include \"config.h\"\n\nnamespace url_link {\n\n\tclass link {\n\n\tpublic:\n\t\tlink();\n\t\texplicit link(const std::string &standard_link_data);\n\t\tlink(const URL &source_url, const URL &target_url, float source_harmonic, float target_harmonic);\n\t\t~link();\n\n\t\tfloat url_score() const;\n\t\tfloat domain_score() const;\n\n\t\tconst URL &source_url() const { return m_source_url; }\n\t\tconst URL &target_url() const { return m_target_url; }\n\t\tconst uint64_t &target_host_hash() const { return m_target_host_hash; }\n\t\tconst float &source_harmonic() const { return m_source_harmonic; }\n\t\tconst float &target_harmonic() const { return m_target_harmonic; }\n\n\t\tsize_t index_on_node() const {\n\t\t\treturn 
target_url().host_hash() % config::nodes_in_cluster;\n\t\t}\n\n\tprivate:\n\t\tURL m_source_url;\n\t\tURL m_target_url;\n\t\tuint64_t m_target_host_hash;\n\t\tfloat m_source_harmonic;\n\t\tfloat m_target_harmonic;\n\t\tstd::string m_link_text;\n\t};\n}\n"
  },
  {
    "path": "src/utils/id_allocator.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include <mutex>\n#include <memory>\n\nnamespace utils {\n\n\t/*\n\t * Very simple helper for allocating one shared object per id by multiple threads. Each thread should keep its own cache of the pointers since\n\t * the get function locks execution.\n\t *\n\t *\n\t * - thread A\n\t *   std::unordered_map<uint64_t, data *> local_cache;\n\t *   for (...) {\n\t *\t\tif (!local_cache.count(id)) {\n\t *\t\t\tlocal_cache[id] = alloc.get(id, ...); // alloc is shared instance of id_allocator\n\t *\t\t}\n\t *\n\t *\t\tlocal_cache[id] can be used now.\n\t *   }\n\t * */\n\n\ttemplate<typename alloc_type>\n\tclass id_allocator {\n\n\t\tpublic:\n\n\t\t\t/*\n\t\t\t * Allocates a pointer to an \"alloc_type\" object associated with id. 
The rest of the arguments are passed to the constructor of\n\t\t\t * alloc_type.\n\t\t\t * */\n\t\t\ttemplate<class... type_args>\n\t\t\talloc_type *get(uint64_t id, type_args&&... args) {\n\n\t\t\t\tstd::lock_guard guard(m_lock);\n\n\t\t\t\tif (m_map.count(id) == 0) {\n\t\t\t\t\tm_map[id] = std::make_unique<alloc_type>(std::forward<type_args>(args)...);\n\t\t\t\t}\n\n\t\t\t\treturn m_map[id].get();\n\t\t\t}\n\n\t\tprivate:\n\n\t\t\tstd::mutex m_lock;\n\t\t\tstd::unordered_map<uint64_t, std::unique_ptr<alloc_type>> m_map;\n\n\t};\n\t\n}\n"
  },
  {
    "path": "src/utils/thread_pool.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include \"thread_pool.hpp\"\n#include <chrono>\n#include <iostream>\n#include <thread>\n#include <future>\n#include <queue>\n\nusing namespace std::chrono_literals;\n\nnamespace utils {\n\n\tthread_pool::thread_pool(size_t num_threads, size_t max_queue_len)\n\t: m_max_queue_len(max_queue_len) {\n\t\tfor (size_t i = 0; i < num_threads; i++) {\n\t\t\tm_workers.emplace_back([this]() {\n\t\t\t\tthis->handle_work();\n\t\t\t});\n\t\t}\n\t}\n\tthread_pool::~thread_pool() {\n\t\trun_all();\n\t}\n\n\tvoid thread_pool::enqueue(std::function<void()> &&fun) {\n\n\t\tif (m_stop) {\n\t\t\tthrow std::runtime_error(\"enqueue on stopped thread_pool not allowed\");\n\t\t}\n\n\t\tif (m_max_queue_len > 0) {\n\t\t\twhile (true) 
{\n\t\t\t\t\n\t\t\t\t{\n\t\t\t\t\tstd::lock_guard lock(m_queue_lock);\n\t\t\t\t\tif (m_queue.size() < m_max_queue_len) {\n\t\t\t\t\t\tm_queue.emplace(std::move(fun));\n\t\t\t\t\t\tbreak;\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t\tstd::this_thread::sleep_for(100ms);\n\t\t\t}\n\t\t} else {\n\t\t\tm_queue_lock.lock();\n\t\t\tm_queue.emplace(std::move(fun));\n\t\t\tm_queue_lock.unlock();\n\t\t}\n\n\t\tm_condition.notify_one();\n\t}\n\n\tvoid thread_pool::run_all() {\n\t\tif (m_stop) return; // Already stopped..\n\t\tm_queue_lock.lock();\n\t\tm_stop = true;\n\t\tm_queue_lock.unlock();\n\t\tm_condition.notify_all();\n\n\t\tfor (std::thread &thread : m_workers) {\n\t\t\tif (thread.joinable()) {\n\t\t\t\tthread.join();\n\t\t\t}\n\t\t}\n\t}\n\n\tvoid thread_pool::handle_work() {\n\t\twhile (true) {\n\n\t\t\tstd::function<void()> task;\n\n\t\t\t{\n\t\t\t\tstd::unique_lock<std::mutex> lock(m_queue_lock);\n\t\t\t\tm_condition.wait(lock, [this] {\n\t\t\t\t\treturn m_stop || !m_queue.empty();\n\t\t\t\t});\n\t\t\t\tif (m_stop && m_queue.empty()) return;\n\t\t\t\ttask = std::move(m_queue.front());\n\t\t\t\tm_queue.pop();\n\t\t\t}\n\n\t\t\ttask();\n\t\t}\n\t}\n\t\n}\n"
  },
  {
    "path": "src/utils/thread_pool.hpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include <iostream>\n#include <thread>\n#include <future>\n#include <queue>\n\nnamespace utils {\n\n\tclass thread_pool {\n\n\t\tpublic:\n\n\t\t\texplicit thread_pool(size_t num_workers, size_t max_queue_len = 0);\n\t\t\t~thread_pool();\n\n\t\t\tvoid enqueue(std::function<void()> &&fun);\n\t\t\tvoid run_all();\n\n\t\tprivate:\n\n\t\t\tvoid handle_work();\n\n\t\t\tstd::vector<std::thread> m_workers;\n\t\t\tstd::queue<std::function<void()>> m_queue;\n\n\t\t\tstd::mutex m_queue_lock;\n\t\t\tstd::condition_variable m_condition;\n\t\t\tbool m_stop = false;\n\t\t\tsize_t m_max_queue_len;\n\n\t};\n\t\n}\n"
  },
  {
    "path": "src/utils/thread_pool_arg.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#pragma once\n\n#include <iostream>\n#include <thread>\n#include <future>\n#include <queue>\n\nnamespace utils {\n\n\ttemplate <typename arg>\n\tclass thread_pool_arg {\n\n\t\tpublic:\n\n\t\t\texplicit thread_pool_arg(size_t);\n\t\t\t~thread_pool_arg();\n\n\t\t\tvoid enqueue(std::function<void(arg &)> &&fun);\n\t\t\tvoid run_all();\n\n\t\tprivate:\n\n\t\t\tvoid handle_work();\n\n\t\t\tstd::vector<std::thread> m_workers;\n\t\t\tstd::queue<std::function<void(arg &)>> m_queue;\n\n\t\t\tstd::mutex m_queue_lock;\n\t\t\tstd::condition_variable m_condition;\n\t\t\tbool m_stop = false;\n\n\t};\n\n\ttemplate<typename arg>\n\tthread_pool_arg<arg>::thread_pool_arg(size_t num_threads) {\n\t\tfor (size_t i = 0; i < num_threads; i++) 
{\n\t\t\tm_workers.emplace_back([this]() {\n\t\t\t\tthis->handle_work();\n\t\t\t});\n\t\t}\n\t}\n\n\ttemplate<typename arg>\n\tthread_pool_arg<arg>::~thread_pool_arg() {\n\t\trun_all();\n\t}\n\n\ttemplate<typename arg>\n\tvoid thread_pool_arg<arg>::enqueue(std::function<void(arg &)> &&fun) {\n\n\t\tif (m_stop) {\n\t\t\tthrow std::runtime_error(\"enqueue on stopped thread_pool_arg not allowed\");\n\t\t}\n\n\t\tm_queue_lock.lock();\n\t\tm_queue.emplace(std::move(fun));\n\t\tm_queue_lock.unlock();\n\n\t\tm_condition.notify_one();\n\t}\n\n\ttemplate<typename arg>\n\tvoid thread_pool_arg<arg>::run_all() {\n\t\tif (m_stop) return; // Already stopped..\n\t\tm_queue_lock.lock();\n\t\tm_stop = true;\n\t\tm_queue_lock.unlock();\n\t\tm_condition.notify_all();\n\n\t\tfor (std::thread &thread : m_workers) {\n\t\t\tif (thread.joinable()) {\n\t\t\t\tthread.join();\n\t\t\t}\n\t\t}\n\t}\n\n\ttemplate<typename arg>\n\tvoid thread_pool_arg<arg>::handle_work() {\n\n\t\targ a;\n\t\twhile (true) {\n\n\t\t\tstd::function<void(arg &)> task;\n\n\t\t\t{\n\t\t\t\tstd::unique_lock<std::mutex> lock(m_queue_lock);\n\t\t\t\tm_condition.wait(lock, [this] {\n\t\t\t\t\treturn m_stop || !m_queue.empty();\n\t\t\t\t});\n\t\t\t\tif (m_stop && m_queue.empty()) return;\n\t\t\t\ttask = std::move(m_queue.front());\n\t\t\t\tm_queue.pop();\n\t\t\t}\n\n\t\t\ttask(a);\n\t\t}\n\t}\n}\n"
  },
  {
    "path": "src/warc/tlds.h",
    "content": "\n#pragma once\n\n#include <iostream>\n#include <unordered_set>\n\nnamespace warc {\n\n\tconst std::unordered_set<std::string> double_tlds({\n\t\t\"co.uk\"\n\t});\n\n\tconst std::unordered_set<std::string> tlds({\n\t\t\"se\",\n\t\t\"com\",\n\t\t\"org\",\n\t\t\"net\",\n\t\t\"int\",\n\t\t\"edu\",\n\t\t\"gov\",\n\t\t\"mil\",\n\t\t\"ad\",\n\t\t\"as\",\n\t\t\"az\",\n\t\t\"bz\",\n\t\t\"cc\",\n\t\t\"cd\",\n\t\t\"co\",\n\t\t\"dj\",\n\t\t\"fm\",\n\t\t\"gg\",\n\t\t\"io\",\n\t\t\"la\",\n\t\t\"me\",\n\t\t\"ms\",\n\t\t\"nu\",\n\t\t\"sc\",\n\t\t\"tf\",\n\t\t\"tv\",\n\t\t\"ws\",\n\t\t\"ai\",\n\t\t\"as\",\n\t\t\"au\",\n\t\t\"bm\",\n\t\t\"bs\",\n\t\t\"gi\",\n\t\t\"gu\",\n\t\t\"uk\",\n\t\t\"us\",\n\t\t\"sh\",\n\t\t\"ca\",\n\t\t\"to\",\n\t\t\"ac\",\n\t\t\"academy\",\n\t\t\"accountant\",\n\t\t\"accountants\",\n\t\t\"active\",\n\t\t\"actor\",\n\t\t\"ads\",\n\t\t\"adult\",\n\t\t\"aero\",\n\t\t\"africa\",\n\t\t\"agency\",\n\t\t\"airforce\",\n\t\t\"amazon\",\n\t\t\"analytics\",\n\t\t\"apartments\",\n\t\t\"app\",\n\t\t\"apple\",\n\t\t\"archi\",\n\t\t\"army\",\n\t\t\"art\",\n\t\t\"associates\",\n\t\t\"attorney\",\n\t\t\"auction\",\n\t\t\"audible\",\n\t\t\"audio\",\n\t\t\"author\",\n\t\t\"auto\",\n\t\t\"autos\",\n\t\t\"aws\",\n\t\t\"baby\",\n\t\t\"band\",\n\t\t\"bank\",\n\t\t\"bar\",\n\t\t\"barefoot\",\n\t\t\"bargains\",\n\t\t\"baseball\",\n\t\t\"basketball\",\n\t\t\"beauty\",\n\t\t\"beer\",\n\t\t\"best\",\n\t\t\"bestbuy\",\n\t\t\"bet\",\n\t\t\"bible\",\n\t\t\"bid\",\n\t\t\"bike\",\n\t\t\"bingo\",\n\t\t\"bio\",\n\t\t\"biz\",\n\t\t\"black\",\n\t\t\"blackfriday\",\n\t\t\"blockbuster\",\n\t\t\"blog\",\n\t\t\"blue\",\n\t\t\"boo\",\n\t\t\"book\",\n\t\t\"boots\",\n\t\t\"boston\",\n\t\t\"bot\",\n\t\t\"boutique\",\n\t\t\"box\",\n\t\t\"broadway\",\n\t\t\"broker\",\n\t\t\"build\",\n\t\t\"builders\",\n\t\t\"business\",\n\t\t\"buy\",\n\t\t\"buzz\",\n\t\t\"cab\",\n\t\t\"cafe\",\n\t\t\"call\",\n\t\t\"cam\",\n\t\t\"camera\",\n\t\t\"camp\",\n\t\t\"cancerresearch\",\n\t\t\"capital\",\n\t\t\"ca
r\",\n\t\t\"cards\",\n\t\t\"care\",\n\t\t\"career\",\n\t\t\"careers\",\n\t\t\"cars\",\n\t\t\"case\",\n\t\t\"cash\",\n\t\t\"casino\",\n\t\t\"catering\",\n\t\t\"catholic\",\n\t\t\"center\",\n\t\t\"cern\",\n\t\t\"ceo\",\n\t\t\"cfd\",\n\t\t\"channel\",\n\t\t\"chat\",\n\t\t\"charity\",\n\t\t\"cheap\",\n\t\t\"christmas\",\n\t\t\"church\",\n\t\t\"circle\",\n\t\t\"city\",\n\t\t\"claims\",\n\t\t\"cleaning\",\n\t\t\"click\",\n\t\t\"clinic\",\n\t\t\"clothing\",\n\t\t\"cloud\",\n\t\t\"club\",\n\t\t\"coach\",\n\t\t\"codes\",\n\t\t\"coffee\",\n\t\t\"college\",\n\t\t\"community\",\n\t\t\"company\",\n\t\t\"compare\",\n\t\t\"computer\",\n\t\t\"condos\",\n\t\t\"construction\",\n\t\t\"consulting\",\n\t\t\"contact\",\n\t\t\"contractors\",\n\t\t\"cooking\",\n\t\t\"cool\",\n\t\t\"coop\",\n\t\t\"country\",\n\t\t\"coupon\",\n\t\t\"coupons\",\n\t\t\"courses\",\n\t\t\"cpa\",\n\t\t\"credit\",\n\t\t\"creditcard\",\n\t\t\"cruise\",\n\t\t\"cricket\",\n\t\t\"cruises\",\n\t\t\"cyou\",\n\t\t\"dad\",\n\t\t\"dance\",\n\t\t\"data\",\n\t\t\"date\",\n\t\t\"dating\",\n\t\t\"day\",\n\t\t\"deal\",\n\t\t\"deals\",\n\t\t\"degree\",\n\t\t\"delivery\",\n\t\t\"democrat\",\n\t\t\"dental\",\n\t\t\"dentist\",\n\t\t\"design\",\n\t\t\"dev\",\n\t\t\"diamonds\",\n\t\t\"diet\",\n\t\t\"digital\",\n\t\t\"direct\",\n\t\t\"directory\",\n\t\t\"discount\",\n\t\t\"diy\",\n\t\t\"docs\",\n\t\t\"doctor\",\n\t\t\"dog\",\n\t\t\"domains\",\n\t\t\"dot\",\n\t\t\"download\",\n\t\t\"drive\",\n\t\t\"duck\",\n\t\t\"earth\",\n\t\t\"eat\",\n\t\t\"eco\",\n\t\t\"education\",\n\t\t\"email\",\n\t\t\"energy\",\n\t\t\"engineer\",\n\t\t\"engineering\",\n\t\t\"edeka\",\n\t\t\"entertainment\",\n\t\t\"enterprises\",\n\t\t\"equipment\",\n\t\t\"esq\",\n\t\t\"estate\",\n\t\t\"events\",\n\t\t\"exchange\",\n\t\t\"expert\",\n\t\t\"exposed\",\n\t\t\"express\",\n\t\t\"fail\",\n\t\t\"faith\",\n\t\t\"family\",\n\t\t\"fan\",\n\t\t\"fans\",\n\t\t\"farm\",\n\t\t\"fashion\",\n\t\t\"fast\",\n\t\t\"feedback\",\n\t\t\"fiat\",\n\t\t\"film\",\n\t\t\"final\",\n\t\t\"fi
nance\",\n\t\t\"financial\",\n\t\t\"fire\",\n\t\t\"fish\",\n\t\t\"fishing\",\n\t\t\"fit\",\n\t\t\"fitness\",\n\t\t\"flights\",\n\t\t\"florist\",\n\t\t\"flowers\",\n\t\t\"fly\",\n\t\t\"foo\",\n\t\t\"food\",\n\t\t\"foodnetwork\",\n\t\t\"football\",\n\t\t\"forsale\",\n\t\t\"forum\",\n\t\t\"foundation\",\n\t\t\"free\",\n\t\t\"frontdoor\",\n\t\t\"fun\",\n\t\t\"fund\",\n\t\t\"furniture\",\n\t\t\"fyi\",\n\t\t\"gallery\",\n\t\t\"game\",\n\t\t\"games\",\n\t\t\"garden\",\n\t\t\"gay\",\n\t\t\"gdn\",\n\t\t\"gift\",\n\t\t\"gifts\",\n\t\t\"gives\",\n\t\t\"glass\",\n\t\t\"gle\",\n\t\t\"global\",\n\t\t\"gold\",\n\t\t\"golf\",\n\t\t\"google\",\n\t\t\"gop\",\n\t\t\"graphics\",\n\t\t\"green\",\n\t\t\"gripe\",\n\t\t\"grocery\",\n\t\t\"group\",\n\t\t\"guide\",\n\t\t\"guitars\",\n\t\t\"guru\",\n\t\t\"hair\",\n\t\t\"hangout\",\n\t\t\"health\",\n\t\t\"healthcare\",\n\t\t\"help\",\n\t\t\"here\",\n\t\t\"hiphop\",\n\t\t\"hiv\",\n\t\t\"hockey\",\n\t\t\"holdings\",\n\t\t\"holiday\",\n\t\t\"homegoods\",\n\t\t\"homes\",\n\t\t\"homesense\",\n\t\t\"horse\",\n\t\t\"hospital\",\n\t\t\"host\",\n\t\t\"hosting\",\n\t\t\"hot\",\n\t\t\"hotels\",\n\t\t\"house\",\n\t\t\"how\",\n\t\t\"ice\",\n\t\t\"icu\",\n\t\t\"inc\",\n\t\t\"industries\",\n\t\t\"info\",\n\t\t\"ing\",\n\t\t\"ink\",\n\t\t\"institute\",\n\t\t\"insurance\",\n\t\t\"insure\",\n\t\t\"international\",\n\t\t\"investments\",\n\t\t\"irish\",\n\t\t\"jewelry\",\n\t\t\"jobs\",\n\t\t\"joy\",\n\t\t\"kim\",\n\t\t\"kitchen\",\n\t\t\"kosher\",\n\t\t\"land\",\n\t\t\"lat\",\n\t\t\"law\",\n\t\t\"lawyer\",\n\t\t\"lease\",\n\t\t\"leclerc\",\n\t\t\"legal\",\n\t\t\"lgbt\",\n\t\t\"life\",\n\t\t\"lifeinsurance\",\n\t\t\"lighting\",\n\t\t\"like\",\n\t\t\"limited\",\n\t\t\"limo\",\n\t\t\"link\",\n\t\t\"live\",\n\t\t\"living\",\n\t\t\"loan\",\n\t\t\"loans\",\n\t\t\"locker\",\n\t\t\"lol\",\n\t\t\"lotto\",\n\t\t\"love\",\n\t\t\"ltd\",\n\t\t\"luxury\",\n\t\t\"makeup\",\n\t\t\"management\",\n\t\t\"map\",\n\t\t\"market\",\n\t\t\"marketing\",\n\t\t\"markets\",\n\t\t\"mba\"
,\n\t\t\"med\",\n\t\t\"media\",\n\t\t\"meet\",\n\t\t\"meme\",\n\t\t\"memorial\",\n\t\t\"men\",\n\t\t\"menu\",\n\t\t\"mint\",\n\t\t\"mobi\",\n\t\t\"mobile\",\n\t\t\"mobily\",\n\t\t\"moe\",\n\t\t\"mom\",\n\t\t\"money\",\n\t\t\"monster\",\n\t\t\"mortgage\",\n\t\t\"motorcycles\",\n\t\t\"mov\",\n\t\t\"movie\",\n\t\t\"museum\",\n\t\t\"music\",\n\t\t\"name\",\n\t\t\"navy\",\n\t\t\"network\",\n\t\t\"new\",\n\t\t\"news\",\n\t\t\"ngo\",\n\t\t\"ninja\",\n\t\t\"now\",\n\t\t\"ntt\",\n\t\t\"observer\",\n\t\t\"off\",\n\t\t\"org\",\n\t\t\"one\",\n\t\t\"ong\",\n\t\t\"onl\",\n\t\t\"online\",\n\t\t\"ooo\",\n\t\t\"open\",\n\t\t\"organic\",\n\t\t\"origins\",\n\t\t\"page\",\n\t\t\"partners\",\n\t\t\"parts\",\n\t\t\"party\",\n\t\t\"pay\",\n\t\t\"pet\",\n\t\t\"pharmacy\",\n\t\t\"phone\",\n\t\t\"photo\",\n\t\t\"photography\",\n\t\t\"photos\",\n\t\t\"physio\",\n\t\t\"pics\",\n\t\t\"pictures\",\n\t\t\"pid\",\n\t\t\"pin\",\n\t\t\"pink\",\n\t\t\"pizza\",\n\t\t\"place\",\n\t\t\"plumbing\",\n\t\t\"plus\",\n\t\t\"poker\",\n\t\t\"porn\",\n\t\t\"post\",\n\t\t\"press\",\n\t\t\"prime\",\n\t\t\"pro\",\n\t\t\"productions\",\n\t\t\"prof\",\n\t\t\"promo\",\n\t\t\"properties\",\n\t\t\"property\",\n\t\t\"protection\",\n\t\t\"pub\",\n\t\t\"qpon\",\n\t\t\"racing\",\n\t\t\"radio\",\n\t\t\"read\",\n\t\t\"realestate\",\n\t\t\"realtor\",\n\t\t\"realty\",\n\t\t\"recipes\",\n\t\t\"red\",\n\t\t\"rehab\",\n\t\t\"reit\",\n\t\t\"rent\",\n\t\t\"rentals\",\n\t\t\"repair\",\n\t\t\"report\",\n\t\t\"republican\",\n\t\t\"rest\",\n\t\t\"restaurant\",\n\t\t\"review\",\n\t\t\"reviews\",\n\t\t\"rich\",\n\t\t\"rip\",\n\t\t\"rocks\",\n\t\t\"rodeo\",\n\t\t\"room\",\n\t\t\"rugby\",\n\t\t\"run\",\n\t\t\"safe\",\n\t\t\"sale\",\n\t\t\"salon\",\n\t\t\"save\",\n\t\t\"sbi\",\n\t\t\"scholarships\",\n\t\t\"school\",\n\t\t\"science\",\n\t\t\"search\",\n\t\t\"secure\",\n\t\t\"security\",\n\t\t\"select\",\n\t\t\"services\",\n\t\t\"sex\",\n\t\t\"sexy\",\n\t\t\"shoes\",\n\t\t\"shop\",\n\t\t\"shopping\",\n\t\t\"show\",\n\t\t\"showtime\",\n\t\t\"s
ilk\",\n\t\t\"singles\",\n\t\t\"site\",\n\t\t\"ski\",\n\t\t\"skin\",\n\t\t\"sky\",\n\t\t\"sling\",\n\t\t\"smile\",\n\t\t\"sncf\",\n\t\t\"soccer\",\n\t\t\"social\",\n\t\t\"software\",\n\t\t\"solar\",\n\t\t\"solutions\",\n\t\t\"song\",\n\t\t\"space\",\n\t\t\"spreadbetting\",\n\t\t\"spot\",\n\t\t\"sport\",\n\t\t\"storage\",\n\t\t\"store\",\n\t\t\"stream\",\n\t\t\"studio\",\n\t\t\"study\",\n\t\t\"style\",\n\t\t\"sucks\",\n\t\t\"supplies\",\n\t\t\"supply\",\n\t\t\"support\",\n\t\t\"surf\",\n\t\t\"surgery\",\n\t\t\"systems\",\n\t\t\"talk\",\n\t\t\"tattoo\",\n\t\t\"tax\",\n\t\t\"taxi\",\n\t\t\"team\",\n\t\t\"tech\",\n\t\t\"technology\",\n\t\t\"tel\",\n\t\t\"tennis\",\n\t\t\"theater\",\n\t\t\"theatre\",\n\t\t\"tickets\",\n\t\t\"tips\",\n\t\t\"tires\",\n\t\t\"today\",\n\t\t\"tools\",\n\t\t\"top\",\n\t\t\"tours\",\n\t\t\"town\",\n\t\t\"toys\",\n\t\t\"trade\",\n\t\t\"trading\",\n\t\t\"training\",\n\t\t\"travel\",\n\t\t\"travelersinsurance\",\n\t\t\"trust\",\n\t\t\"tube\",\n\t\t\"tunes\",\n\t\t\"uconnect\",\n\t\t\"university\",\n\t\t\"uno\",\n\t\t\"vacations\",\n\t\t\"ventures\",\n\t\t\"vet\",\n\t\t\"video\",\n\t\t\"villas\",\n\t\t\"vin\",\n\t\t\"vip\",\n\t\t\"vision\",\n\t\t\"vodka\",\n\t\t\"volvo\",\n\t\t\"vote\",\n\t\t\"voting\",\n\t\t\"voyage\",\n\t\t\"wang\",\n\t\t\"watch\",\n\t\t\"watches\",\n\t\t\"weather\",\n\t\t\"webcam\",\n\t\t\"website\",\n\t\t\"wed\",\n\t\t\"wedding\",\n\t\t\"whoswho\",\n\t\t\"wiki\",\n\t\t\"win\",\n\t\t\"wine\",\n\t\t\"winners\",\n\t\t\"work\",\n\t\t\"works\",\n\t\t\"world\",\n\t\t\"wow\",\n\t\t\"wtf\",\n\t\t\"xxx\",\n\t\t\"xyz\",\n\t\t\"yachts\",\n\t\t\"yoga\",\n\t\t\"you\",\n\t\t\"youtube\",\n\t\t\"zero\",\n\t\t\"zip\",\n\t\t\"zone\"\n\t});\n}\n"
  },
  {
    "path": "src/warc/warc.cpp",
    "content": "\n#include \"warc.h\"\n#include \"tlds.h\"\n#include \"text/text.h\"\n#include \"logger/logger.h\"\n#include \"transfer/transfer.h\"\n\nusing namespace std;\n\nnamespace warc {\n\n\tparser::parser() {\n\t\tm_z_buffer_in = new char[WARC_PARSER_ZLIB_IN];\n\t\tm_z_buffer_out = new char[WARC_PARSER_ZLIB_OUT];\n\t}\n\n\tparser::~parser() {\n\t\tdelete [] m_z_buffer_in;\n\t\tdelete [] m_z_buffer_out;\n\t}\n\n\tbool parser::parse_stream(istream &stream) {\n\t\treturn parse_stream(stream, [this](const std::string &url, const ::parser::html_parser &html, const std::string &ip, const std::string &date) {\n\t\t\thandle_html(url, html, ip, date);\n\t\t});\n\t}\n\n\tbool parser::parse_stream(std::istream &stream, std::function<void(const std::string &url, const ::parser::html_parser &html, const std::string &ip,\n\t\t\t\tconst std::string &date)> callback) {\n\t\tm_callback = callback;\n\t\tsize_t total_bytes_read = 0;\n\t\twhile (stream.good()) {\n\t\t\tstream.read(m_z_buffer_in, WARC_PARSER_ZLIB_IN);\n\n\t\t\tauto bytes_read = stream.gcount();\n\t\t\ttotal_bytes_read += bytes_read;\n\n\t\t\tif (bytes_read > 0) {\n\t\t\t\tif (unzip_chunk(bytes_read) < 0) {\n\t\t\t\t\tcout << \"Stopped because fatal error\" << endl;\n\t\t\t\t\tbreak;\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\n\t\treturn true;\n\t}\n\n\tvoid parser::handle_html(const std::string &url, const ::parser::html_parser &html, const std::string &ip, const std::string &date) {\n\n\t\tm_result += (url\n\t\t\t\t+ '\\t' + html.title()\n\t\t\t\t+ '\\t' + html.h1()\n\t\t\t\t+ '\\t' + html.meta()\n\t\t\t\t+ '\\t' + html.text()\n\t\t\t\t+ '\\t' + date\n\t\t\t\t+ '\\t' + ip\n\t\t\t\t+ '\\n');\n\t\tfor (const auto &link : html.links()) {\n\t\t\tm_links += (link.host()\n\t\t\t\t+ '\\t' + link.path()\n\t\t\t\t+ '\\t' + link.target_host()\n\t\t\t\t+ '\\t' + link.target_path()\n\t\t\t\t+ '\\t' + link.text()\n\t\t\t\t+ '\\t' + (link.nofollow() ? 
\"1\" : \"0\")\n\t\t\t\t+ '\\n');\n\t\t}\n\n\t\t// internal links are too messy for us now.\n\t\t/*for (const auto &link : html.internal_links()) {\n\t\t\t// link is a std::pair<uint64_t, uint64_t>\n\t\t\tm_internal_links.append((char *)&link.first, sizeof(uint64_t));\n\t\t\tm_internal_links.append((char *)&link.second, sizeof(uint64_t));\n\t\t}*/\n\n\t}\n\n\tint parser::unzip_record(char *data, int size) {\n\n\t\t/*\n\t\t\tdata is:\n\t\t\t#|------------------|-----|------------------------|--|----#-------|\n\t\t\t |doc_a______________doc_b_doc_c_____|\n\t\t\t\t\t\t\t\t WARC_PARSER_ZLIB_IN\n\t\t\t |_________________________________________________________|\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t   size\n\t\t*/\n\n\t\tint data_size = size;\n\t\tint consumed = 0, consumed_total = 0;\n\t\tint avail_in_before_inflate;\n\t\tint ret = Z_OK;\n\t\tunsigned have;\n\n\t\tif (!m_continue_inflate) {\n\t\t\tm_zstream.zalloc = Z_NULL;\n\t\t\tm_zstream.zfree = Z_NULL;\n\t\t\tm_zstream.opaque = Z_NULL;\n\n\t\t\tm_zstream.avail_in = 0;\n\t\t\tm_zstream.next_in = Z_NULL;\n\n\t\t\tint err = inflateInit2(&m_zstream, 16);\n\t\t\tif (err != Z_OK) {\n\t\t\t\tcout << \"zlib error\" << endl;\n\t\t\t}\n\t\t} else {\n\t\t\t// just continue on the last one.\n\t\t}\n\n\t\t/* decompress until deflate stream ends or end of file */\n\t\tdo {\n\n\t\t\tm_zstream.next_in = (unsigned char *)(data + consumed_total);\n\n\t\t\tm_zstream.avail_in = min(WARC_PARSER_ZLIB_IN, data_size);\n\n\t\t\tif (m_zstream.avail_in == 0)\n\t\t\t\tbreak;\n\n\t\t\t/* run inflate() on input until output buffer not full */\n\t\t\tdo {\n\n\t\t\t\tm_zstream.avail_out = WARC_PARSER_ZLIB_OUT;\n\t\t\t\tm_zstream.next_out = (unsigned char *)m_z_buffer_out;\n\n\t\t\t\tavail_in_before_inflate = m_zstream.avail_in;\n\n\t\t\t\tret = inflate(&m_zstream, Z_NO_FLUSH);\n\n\t\t\t\t// consumed is the number of bytes read from input in this inflate\n\t\t\t\tconsumed = (avail_in_before_inflate - m_zstream.avail_in);\n\t\t\t\tdata_size -= 
consumed;\n\t\t\t\tconsumed_total += consumed;\n\t\t\t\tassert(ret != Z_STREAM_ERROR);  /* state not clobbered */\n\t\t\t\tswitch (ret) {\n\t\t\t\tcase Z_BUF_ERROR:\n\t\t\t\t\t//cout << \"Z_BUF_ERROR\" << endl;\n\t\t\t\t\t// Not fatal, just keep going.\n\t\t\t\t\tbreak;\n\t\t\t\tcase Z_NEED_DICT:\n\t\t\t\t\tret = Z_DATA_ERROR;\t /* and fall through */\n\t\t\t\t\tcout << \"Z_NEED_DICT\" << endl;\n\t\t\t\t\t(void)inflateEnd(&m_zstream);\n\t\t\t\t\treturn -1;\n\t\t\t\tcase Z_DATA_ERROR:\n\t\t\t\tcase Z_MEM_ERROR:\n\t\t\t\t\tcout << \"Z_MEM_ERROR\" << endl;\n\t\t\t\t\t(void)inflateEnd(&m_zstream);\n\t\t\t\t\treturn -1;\n\t\t\t\t}\n\n\t\t\t\thave = WARC_PARSER_ZLIB_OUT - m_zstream.avail_out;\n\t\t\t\thandle_record_chunk((char *)m_z_buffer_out, have);\n\n\t\t\t} while (m_zstream.avail_out == 0);\n\n\t\t\tif (data_size <= 0) {\n\t\t\t\tbreak;\n\t\t\t}\n\n\t\t\t/* done when inflate() says it's done */\n\t\t} while (ret != Z_STREAM_END);\n\n\t\t//cout << \"ret: \" << ret << endl;\n\t\t//cout << \"Ending with code: \" << ret << endl;\n\t\tif (ret == Z_OK || ret == Z_BUF_ERROR) {\n\t\t\tm_continue_inflate = true;\n\t\t} else {\n\t\t\tm_continue_inflate = false;\n\t\t\t(void)inflateEnd(&m_zstream);\n\t\t}\n\n\t\t/* clean up and return */\n\t\treturn consumed_total;\n\t}\n\n\tint parser::unzip_chunk(int bytes_in) {\n\n\t\tint consumed = 0;\n\t\tint consumed_total = 0;\n\n\t\tchar *ptr = m_z_buffer_in;\n\t\tint len = bytes_in;\n\n\t\twhile (len > 0) {\n\t\t\tconsumed = unzip_record(ptr, len);\n\t\t\t//cout << \"consumed: \" << consumed << \" len: \" << len << endl;\n\t\t\tif (consumed == 0) {\n\t\t\t\tcout << \"Nothing consumed, done...\" << endl;\n\t\t\t\tbreak;\n\t\t\t}\n\t\t\tif (consumed < 0) {\n\t\t\t\tcout << \"Encountered fatal error\" << endl;\n\t\t\t\treturn -1;\n\t\t\t}\n\t\t\tptr += consumed;\n\t\t\tlen -= consumed;\n\t\t\tconsumed_total += consumed;\n\t\t}\n\n\t\treturn 0;\n\t}\n\n\t/*\n\t * Handles unzipped data. 
The data pointer is either pointing to a new warc record or it is the continuation of a previous warc record.\n\t * */\n\tvoid parser::handle_record_chunk(char *data, int len) {\n\n\t\tm_handled += len;\n\t\tm_num_handled++;\n\n\t\tif (len > 8 && strncmp(data, \"WARC/1.0\", 8) == 0) {\n\t\t\t// data is the start of a warc record\n\t\t\tstring record(data, len);\n\t\t\tm_current_record.assign(data, len);\n\t\t} else {\n\t\t\tm_current_record.append(data, len);\n\t\t}\n\n\t\tif (m_current_record.find(\"\\r\\n\\r\\n\") != string::npos) {\n\n\t\t\tconst string warc_header = get_warc_header(m_current_record);\n\t\t\tconst string content_len_str = ::parser::get_http_header(warc_header, \"Content-Length: \");\n\n\t\t\tsize_t content_len = stoull(content_len_str);\n\t\t\tsize_t received_content = m_current_record.size() - (warc_header.size() + 8);\n\n\t\t\tif (content_len == received_content) {\n\t\t\t\tconst string type = ::parser::get_http_header(warc_header, \"WARC-Type: \");\n\n\t\t\t\tif (type == \"response\") {\n\t\t\t\t\tparse_record(warc_header, m_current_record);\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\n\t}\n\n\tvoid parser::parse_record(const string &warc_header, const string &warc_record) {\n\n\t\tconst string url = ::parser::get_http_header(warc_header, \"WARC-Target-URI: \");\n\t\tconst string tld = m_html_parser.url_tld(url);\n\n\t\tif (tlds.count(tld) == 0) return;\n\n\t\tconst string ip = ::parser::get_http_header(warc_header, \"WARC-IP-Address: \");\n\t\tconst string date = ::parser::get_http_header(warc_header, \"WARC-Date: \");\n\n\t\tconst size_t warc_response_start = warc_record.find(\"\\r\\n\\r\\n\");\n\t\tconst size_t response_body_start = warc_record.find(\"\\r\\n\\r\\n\", warc_response_start + 4);\n\n\t\tstring http_header = warc_record.substr(warc_response_start + 4, response_body_start - warc_response_start - 4);\n\t\ttext::lower_case(http_header);\n\n\t\t//const size_t http_code = http_response_code(http_header);\n\t\t//const string location = 
::parser::get_http_header(warc_header, \"location: \");\n\n\t\tstring html = warc_record.substr(response_body_start + 4);\n\t\tm_html_parser.parse(html, url);\n\n\t\tif (m_html_parser.should_insert()) {\n\t\t\tm_callback(url, m_html_parser, ip, date);\n\t\t}\n\t}\n\n\tstring parser::get_warc_header(const string &record) {\n\t\tconst size_t pos = record.find(\"\\r\\n\\r\\n\");\n\t\treturn record.substr(0, pos);\n\t}\n\n\tsize_t parser::http_response_code(const string &http_header) {\n\t\tconst size_t return_on_invalid = 500;\n\t\tconst size_t code_start = http_header.find(' ');\n\t\tconst size_t code_end = http_header.find(' ', code_start);\n\t\tif (code_start == string::npos || code_end == string::npos) return return_on_invalid;\n\n\t\tsize_t response_code = stoull(http_header.substr(code_start + 1, 3));\n\n\t\tif (response_code < 100 || response_code >= 600) return return_on_invalid;\n\n\t\treturn response_code;\n\t}\n\n\tvoid multipart_download(const string &url, const std::function<void(const string &chunk)> &callback) {\n\n\t\tint error;\n\t\tsize_t content_len = transfer::head_content_length(url, error);\n\n\t\tif (error == transfer::ERROR) {\n\t\t\tthrow std::runtime_error(\"Could not make HEAD request to: \" + url);\n\t\t}\n\n\t\tconst size_t max_parts = 50;\n\t\tconst size_t max_retries = 3;\n\n\t\tsize_t part = 1;\n\t\tsize_t read_bytes = 0;\n\t\twhile (read_bytes < content_len && part < max_parts) {\n\t\t\tsize_t retry = 0;\n\t\t\twhile (retry < max_retries) {\n\t\t\t\tstring buffer;\n\t\t\t\ttransfer::url_to_string(url + \"?partNumber=\" + to_string(part), buffer, error);\n\t\t\t\tif (error == transfer::OK) {\n\t\t\t\t\tread_bytes += buffer.size();\n\t\t\t\t\tcallback(buffer);\n\t\t\t\t\tbreak;\n\t\t\t\t} else {\n\t\t\t\t\tthrow std::runtime_error(\"Got error response\");\n\t\t\t\t}\n\t\t\t\tretry++;\n\t\t\t}\n\t\t\tif (retry == max_retries) {\n\t\t\t\tbreak;\n\t\t\t}\n\t\t\tpart++;\n\t\t}\n\t}\n\n\tstring get_result_path(const string &warc_path) 
{\n\t\tstring path = warc_path;\n\t\tpath.replace(path.find(\".warc.gz\"), 8, string(\".gz\"));\n\t\treturn path;\n\t}\n\n\tstring get_link_result_path(const string &warc_path) {\n\t\tstring path = warc_path;\n\t\tpath.replace(path.find(\".warc.gz\"), 8, string(\".links.gz\"));\n\t\treturn path;\n\t}\n\n\tstring get_internal_link_result_path(const string &warc_path) {\n\t\tstring path = warc_path;\n\t\tpath.replace(path.find(\".warc.gz\"), 8, string(\".internal.gz\"));\n\t\treturn path;\n\t}\n\n}\n\n"
  },
  {
    "path": "src/warc/warc.h",
    "content": "\n#pragma once\n\n#include <iostream>\n#include \"parser/html_parser.h\"\n#include \"parser/parser.h\"\n#include \"zlib.h\"\n\n#define WARC_PARSER_ZLIB_IN 1024*1024*16\n#define WARC_PARSER_ZLIB_OUT 1024*1024*16\n\nnamespace warc {\n\n\tusing std::string;\n\n\tclass parser {\n\n\t\tpublic:\n\n\t\t\tparser();\n\t\t\t~parser();\n\n\t\t\tbool parse_stream(std::istream &stream);\n\t\t\tbool parse_stream(std::istream &stream, std::function<void(const std::string &url, const ::parser::html_parser &html, const std::string &ip,\n\t\t\t\t\t\tconst std::string &date)>);\n\t\t\tconst string &result() const { return m_result; };\n\t\t\tconst string &link_result() const { return m_links; };\n\t\t\tconst string &internal_link_result() const { return m_internal_links; };\n\t\t\tvoid handle_html(const std::string &url, const ::parser::html_parser &html, const std::string &ip, const std::string &date);\n\n\t\tprivate:\n\n\t\t\tint m_cur_offset = 0;\n\t\t\tbool m_continue_inflate = false;\n\t\t\tstd::string m_result;\n\t\t\tstd::string m_links;\n\t\t\tstd::string m_internal_links;\n\t\t\t::parser::html_parser m_html_parser;\n\t\t\tstd::function<void(const std::string &url, const ::parser::html_parser &html, const std::string &ip, const std::string &date)>\n\t\t\t\tm_callback;\n\n\t\t\tchar *m_z_buffer_in;\n\t\t\tchar *m_z_buffer_out;\n\n\t\t\tz_stream m_zstream; /* decompression stream */\n\n\t\t\tsize_t m_handled = 0;\n\t\t\tsize_t m_num_handled = 0;\n\t\t\tstring m_current_record;\n\n\t\t\tint unzip_record(char *data, int size);\n\t\t\tint unzip_chunk(int bytes_in);\n\n\t\t\tvoid handle_record_chunk(char *data, int len);\n\t\t\tvoid parse_record(const std::string &warc_header, const std::string &warc_record);\n\t\t\tstd::string get_warc_header(const std::string &record);\n\t\t\tsize_t http_response_code(const string &http_header);\n\n\t};\n\n\tvoid multipart_download(const string &url, const std::function<void(const string &chunk)> &callback);\n\n\tstring 
get_result_path(const string &warc_path);\n\tstring get_link_result_path(const string &warc_path);\n\tstring get_internal_link_result_path(const string &warc_path);\n}\n"
  },
  {
    "path": "tests/main.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#define BOOST_TEST_MODULE \"Unit tests for alexandria.org\"\n\n#define BOOST_TEST_NO_MAIN\n#define BOOST_TEST_DYN_LINK\n#include <boost/test/unit_test.hpp>\n#include <boost/test/tools/floating_point_comparison.hpp>\n\n#include \"config.h\"\n#include \"logger/logger.h\"\n\n#include <iostream>\n#include <stdlib.h>\n#include <fstream>\n#include <streambuf>\n#include <math.h>\n#include <vector>\n#include <set>\n#include <map>\n\nusing std::string;\nusing std::vector;\nusing std::ifstream;\nusing std::stringstream;\nusing std::set;\nusing std::map;\nusing std::pair;\n\nvoid run_before() {\n\tconfig::read_config(\"../tests/test_config.conf\");\n\tlogger::start_logger_thread();\n}\n\nvoid run_after() {\n\tlogger::join_logger_thread();\n}\n\nint 
BOOST_TEST_CALL_DECL\nmain(int argc, char* argv[]) {\n\n\trun_before();\n\n    int ret = ::boost::unit_test::unit_test_main(&init_unit_test, argc, argv);\n\n\trun_after();\n\n\treturn ret;\n}\n\n"
  },
  {
    "path": "tests/test_algorithm.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include <boost/test/unit_test.hpp>\n#include \"algorithm/algorithm.h\"\n#include \"algorithm/intersection.h\"\n#include \"algorithm/hyper_ball.h\"\n\nusing namespace std;\n\nBOOST_AUTO_TEST_SUITE(test_algorithm)\n\nBOOST_AUTO_TEST_CASE(intersection_test) {\n\n\t{\n\t\tconst vector<int> result = algorithm::intersection<int>({\n\t\t\t{1, 2, 3},\n\t\t\t{2, 3},\n\t\t\t{2, 3, 4}\n\t\t});\n\n\t\tBOOST_CHECK_EQUAL(2, result.size());\n\t\tBOOST_CHECK_EQUAL(2, result[0]);\n\t\tBOOST_CHECK_EQUAL(3, result[1]);\n\t}\n\n\t{\n\t\tconst vector<int> result = algorithm::intersection<int>({\n\t\t\t{1, 2, 3, 5},\n\t\t\t{2, 3, 5, 7},\n\t\t\t{2, 3, 4, 5}\n\t\t});\n\n\t\tBOOST_CHECK_EQUAL(3, result.size());\n\t\tBOOST_CHECK_EQUAL(2, result[0]);\n\t\tBOOST_CHECK_EQUAL(3, 
result[1]);\n\t\tBOOST_CHECK_EQUAL(5, result[2]);\n\t}\n\n\t{\n\t\tconst vector<int> result = algorithm::intersection<int>({});\n\n\t\tBOOST_CHECK_EQUAL(0, result.size());\n\t}\n\n\t{\n\t\tconst vector<int> result = algorithm::intersection<int>({\n\t\t\t{1, 2, 3, 5, 6, 7, 8},\n\t\t\t{9, 10},\n\t\t\t{1, 2, 3, 4, 5}\n\t\t});\n\n\t\tBOOST_CHECK_EQUAL(0, result.size());\n\t}\n\n\t{\n\n\t\tclass T {\n\t\t\tpublic:\n\t\t\tsize_t m_v;\n\t\t\tfloat m_s;\n\n\t\t\tT(size_t v, float s) : m_v(v), m_s(s) {}\n\n\t\t\tbool operator<(const T &other) const {\n\t\t\t\treturn m_v < other.m_v;\n\t\t\t}\n\n\t\t\tbool operator==(const T &other) const {\n\t\t\t\treturn m_v == other.m_v;\n\t\t\t}\n\n\t\t};\n\t\tconst vector<T> result = algorithm::intersection<T>({\n\t\t\t{T(1, 1.0f), T(2, 1.0f), T(3, 1.0f), T(4, 1.0f)},\n\t\t\t{T(3, 2.0f), T(4, 2.0f), T(5, 2.0f)},\n\t\t\t{T(4, 3.0f), T(5, 3.0f), T(6, 3.0f), T(7, 3.0f), T(8, 3.0f)}\n\t\t}, [](T &a, const T &b) {\n\t\t\treturn a.m_s += b.m_s;\n\t\t});\n\n\t\tBOOST_CHECK_EQUAL(1, result.size());\n\t\tBOOST_CHECK_EQUAL(result[0].m_v, 4);\n\t\tBOOST_CHECK_EQUAL(result[0].m_s, 6.0f);\n\t}\n}\n\nBOOST_AUTO_TEST_CASE(incremental_partitions) {\n\n\t{\n\t\tvector<vector<int>> res = algorithm::incremental_partitions({5}, 64);\n\t\tBOOST_CHECK_EQUAL(res.size(), 5);\n\t}\n\t{\n\t\tvector<vector<int>> res = algorithm::incremental_partitions({6}, 64);\n\t\tBOOST_CHECK_EQUAL(res.size(), 6);\n\t}\n\t{\n\t\tvector<vector<int>> res = algorithm::incremental_partitions({3}, 64);\n\t\tBOOST_CHECK_EQUAL(res.size(), 3);\n\t\tBOOST_CHECK(res[0] == vector<int>{0});\n\t\tBOOST_CHECK(res[1] == vector<int>{1});\n\t\tBOOST_CHECK(res[2] == vector<int>{2});\n\t}\n\n\t{\n\t\tvector<vector<int>> res = algorithm::incremental_partitions({2, 2}, 64);\n\t\tBOOST_CHECK_EQUAL(res.size(), 4);\n\t\tBOOST_CHECK((res[0] == vector<int>{0, 0}));\n\t\tBOOST_CHECK((res[1] == vector<int>{1, 0}));\n\t\tBOOST_CHECK((res[2] == vector<int>{0, 1}));\n\t\tBOOST_CHECK((res[3] == vector<int>{1, 
1}));\n\t}\n\t{\n\t\tvector<vector<int>> res = algorithm::incremental_partitions({3, 3}, 64);\n\t\tBOOST_CHECK_EQUAL(res.size(), 9);\n\t\tBOOST_CHECK((res[0] == vector<int>{0, 0}));\n\t\tBOOST_CHECK((res[1] == vector<int>{1, 0}));\n\t\tBOOST_CHECK((res[2] == vector<int>{0, 1}));\n\t\tBOOST_CHECK((res[3] == vector<int>{1, 1}));\n\t\tBOOST_CHECK((res[4] == vector<int>{2, 0}));\n\t\tBOOST_CHECK((res[5] == vector<int>{0, 2}));\n\t\tBOOST_CHECK((res[6] == vector<int>{2, 1}));\n\t\tBOOST_CHECK((res[7] == vector<int>{1, 2}));\n\t\tBOOST_CHECK((res[8] == vector<int>{2, 2}));\n\t}\n\t{\n\t\tvector<vector<int>> res = algorithm::incremental_partitions({3, 3}, 5);\n\t\tBOOST_CHECK_EQUAL(res.size(), 5);\n\t\tBOOST_CHECK((res[0] == vector<int>{0, 0}));\n\t\tBOOST_CHECK((res[1] == vector<int>{1, 0}));\n\t\tBOOST_CHECK((res[2] == vector<int>{0, 1}));\n\t\tBOOST_CHECK((res[3] == vector<int>{1, 1}));\n\t\tBOOST_CHECK((res[4] == vector<int>{2, 0}));\n\t}\n\t{\n\t\tvector<vector<int>> res = algorithm::incremental_partitions({3, 3, 3}, 64);\n\t\tBOOST_CHECK_EQUAL(res.size(), 27);\n\t\tBOOST_CHECK((res[0] == vector<int>{0, 0, 0}));\n\t\tBOOST_CHECK((res[1] == vector<int>{1, 0, 0}));\n\t\tBOOST_CHECK((res[2] == vector<int>{0, 1, 0}));\n\t\tBOOST_CHECK((res[3] == vector<int>{0, 0, 1}));\n\t\tBOOST_CHECK((res[4] == vector<int>{1, 1, 0}));\n\t\tBOOST_CHECK((res[5] == vector<int>{1, 0, 1}));\n\t\tBOOST_CHECK((res[6] == vector<int>{0, 1, 1}));\n\t\tBOOST_CHECK((res[7] == vector<int>{2, 0, 0}));\n\t\tBOOST_CHECK((res[8] == vector<int>{0, 2, 0}));\n\t\tBOOST_CHECK((res[9] == vector<int>{0, 0, 2}));\n\t\tBOOST_CHECK((res[10] == vector<int>{1, 1, 1}));\n\t\tBOOST_CHECK((res[11] == vector<int>{2, 1, 0}));\n\t\tBOOST_CHECK((res[12] == vector<int>{2, 0, 1}));\n\t\tBOOST_CHECK((res[13] == vector<int>{1, 2, 0}));\n\t\tBOOST_CHECK((res[14] == vector<int>{1, 0, 2}));\n\t\tBOOST_CHECK((res[15] == vector<int>{0, 2, 1}));\n\t}\n\t{\n\t\tvector<vector<int>> res = algorithm::incremental_partitions({2, 3}, 
64);\n\t\tBOOST_CHECK_EQUAL(res.size(), 6);\n\t\tBOOST_CHECK((res[0] == vector<int>{0, 0}));\n\t\tBOOST_CHECK((res[1] == vector<int>{1, 0}));\n\t\tBOOST_CHECK((res[2] == vector<int>{0, 1}));\n\t\tBOOST_CHECK((res[3] == vector<int>{1, 1}));\n\t\tBOOST_CHECK((res[4] == vector<int>{0, 2}));\n\t\tBOOST_CHECK((res[5] == vector<int>{1, 2}));\n\t}\n\n}\n\nBOOST_AUTO_TEST_CASE(harmonic_centrality) {\n\t{\n\t\tset<pair<uint32_t, uint32_t>> e = {std::make_pair(0, 1), std::make_pair(1, 2), std::make_pair(2, 0)};\n\t\tvector<double> h = algorithm::harmonic_centrality(3, e, 6);\n\t\tBOOST_CHECK(h.size() == 3);\n\t\tBOOST_CHECK((h == vector<double>{1.5, 1.5, 1.5}));\n\t}\n\n\t{\n\t\tset<pair<uint32_t, uint32_t>> e = {\n\t\t\tstd::make_pair(0, 1),\n\t\t\tstd::make_pair(1, 2),\n\t\t\tstd::make_pair(2, 0),\n\t\t\tstd::make_pair(2, 3),\n\t\t\tstd::make_pair(3, 4),\n\t\t\tstd::make_pair(3, 5),\n\t\t\tstd::make_pair(4, 2),\n\t\t\tstd::make_pair(5, 4),\n\t\t};\n\t\tvector<double> h = algorithm::harmonic_centrality(7, e, 6);\n\t\tBOOST_CHECK(h.size() == 7);\n\t\tBOOST_CHECK_CLOSE(h[0], 8.0/3.0, 0.000001);\n\t\tBOOST_CHECK_CLOSE(h[1], 7.0/3.0, 0.000001);\n\t\tBOOST_CHECK_CLOSE(h[2], 7.0/2.0, 0.000001);\n\t\tBOOST_CHECK_EQUAL(h[6], 0.0);\n\t}\n\n\t{\n\t\tset<pair<uint32_t, uint32_t>> e = {\n\t\t\tstd::make_pair(0, 1),\n\t\t\tstd::make_pair(1, 2),\n\t\t\tstd::make_pair(2, 1),\n\t\t\tstd::make_pair(3, 1),\n\t\t\tstd::make_pair(4, 1),\n\t\t\tstd::make_pair(5, 1),\n\t\t\tstd::make_pair(6, 1),\n\t\t\tstd::make_pair(7, 1),\n\t\t};\n\t\tvector<double> h = algorithm::harmonic_centrality(8, e, 6);\n\t\tBOOST_CHECK(h.size() == 8);\n\t\tBOOST_CHECK_CLOSE(h[1], 7, 0.000001);\n\t}\n}\n\nBOOST_AUTO_TEST_CASE(harmonic_centrality_threaded) {\n\t{\n\t\tset<pair<uint32_t, uint32_t>> e = {std::make_pair(0, 1), std::make_pair(1, 2), std::make_pair(2, 0)};\n\t\tvector<double> h = algorithm::harmonic_centrality_threaded(3, e, 6, 3);\n\t\tBOOST_CHECK(h.size() == 3);\n\t\tBOOST_CHECK((h == vector<double>{1.5, 
1.5, 1.5}));\n\t}\n\n\t{\n\t\tset<pair<uint32_t, uint32_t>> e = {\n\t\t\tstd::make_pair(0, 1),\n\t\t\tstd::make_pair(1, 2),\n\t\t\tstd::make_pair(2, 0),\n\t\t\tstd::make_pair(2, 3),\n\t\t\tstd::make_pair(3, 4),\n\t\t\tstd::make_pair(3, 5),\n\t\t\tstd::make_pair(4, 2),\n\t\t\tstd::make_pair(5, 4),\n\t\t};\n\t\tvector<double> h = algorithm::harmonic_centrality_threaded(7, e, 6, 2);\n\t\tBOOST_CHECK(h.size() == 7);\n\t\tBOOST_CHECK_CLOSE(h[0], 8.0/3.0, 0.000001);\n\t\tBOOST_CHECK_CLOSE(h[1], 7.0/3.0, 0.000001);\n\t\tBOOST_CHECK_CLOSE(h[2], 7.0/2.0, 0.000001);\n\t\tBOOST_CHECK_EQUAL(h[6], 0.0);\n\t}\n\n\t{\n\t\tset<pair<uint32_t, uint32_t>> e = {\n\t\t\tstd::make_pair(0, 1),\n\t\t\tstd::make_pair(1, 2),\n\t\t\tstd::make_pair(2, 1),\n\t\t\tstd::make_pair(3, 1),\n\t\t\tstd::make_pair(4, 1),\n\t\t\tstd::make_pair(5, 1),\n\t\t\tstd::make_pair(6, 1),\n\t\t\tstd::make_pair(7, 1),\n\t\t};\n\t\tvector<double> h = algorithm::harmonic_centrality_threaded(8, e, 6, 1);\n\t\tBOOST_CHECK(h.size() == 8);\n\t\tBOOST_CHECK_CLOSE(h[1], 7, 0.000001);\n\t}\n}\n\nBOOST_AUTO_TEST_SUITE_END()\n"
  },
  {
    "path": "tests/test_bloom_filter.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include <boost/test/unit_test.hpp>\n#include <fstream>\n#include \"algorithm/bloom_filter.h\"\n#include \"algorithm/hash.h\"\n\nusing namespace std;\n\nBOOST_AUTO_TEST_SUITE(test_bloom_filter)\n\nBOOST_AUTO_TEST_CASE(test_bloom_filter) {\n\talgorithm::bloom_filter bf;\n\n\tbf.insert(\"test\");\n\tBOOST_CHECK(bf.exists(\"test\"));\n\tBOOST_CHECK(!bf.exists(\"test2\"));\n\n\tbf.insert(\"test2\");\n\tBOOST_CHECK(bf.exists(\"test2\"));\n}\n\nBOOST_AUTO_TEST_CASE(test_bloom_filter_merge) {\n\n\talgorithm::bloom_filter bf1;\n\tbf1.insert(\"test1\");\n\tbf1.insert(\"test2\");\n\n\talgorithm::bloom_filter 
bf2;\n\tbf2.insert(\"test3\");\n\tbf2.insert(\"test4\");\n\n\tbf1.merge(bf2);\n\n\tBOOST_CHECK(bf1.exists(\"test1\"));\n\tBOOST_CHECK(bf1.exists(\"test2\"));\n\tBOOST_CHECK(bf1.exists(\"test3\"));\n\tBOOST_CHECK(bf1.exists(\"test4\"));\n\n\tBOOST_CHECK(!bf1.exists(\"test0\"));\n\tBOOST_CHECK(!bf1.exists(\"test5\"));\n\tBOOST_CHECK(!bf1.exists(\"random\"));\n\tBOOST_CHECK(!bf1.exists(\"random2\"));\n}\n\nBOOST_AUTO_TEST_CASE(test_bloom_filter_save) {\n\t{\n\t\talgorithm::bloom_filter bf;\n\t\tbf.insert(\"test1\");\n\t\tbf.insert(\"test2\");\n\t\tbf.write_file(\"/tmp/bloom\");\n\t}\n\n\t{\n\t\talgorithm::bloom_filter bf;\n\t\tbf.read_file(\"/tmp/bloom\");\n\n\t\tBOOST_CHECK(bf.exists(\"test1\"));\n\t\tBOOST_CHECK(bf.exists(\"test2\"));\n\t\tBOOST_CHECK(!bf.exists(\"test3\"));\n\t}\n}\n\nBOOST_AUTO_TEST_SUITE_END()\n"
  },
  {
    "path": "tests/test_cc_parser.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include <boost/test/unit_test.hpp>\n#include \"config.h\"\n#include \"warc/warc.h\"\n#include \"URL.h\"\n#include \"parser/cc_parser.h\"\n\nusing namespace std;\n\nBOOST_AUTO_TEST_SUITE(cc_parser)\n\nBOOST_AUTO_TEST_CASE(download_warc_paths) {\n\t{\n\t\tvector<string> paths = parser::download_warc_paths();\n\t\tBOOST_CHECK_EQUAL(paths.size(), 0);\n\n\t\tpaths.push_back(\"test_path/testing1\");\n\t\tpaths.push_back(\"test_path/testing2\");\n\n\t\tBOOST_CHECK(parser::upload_warc_paths(paths));\n\t}\n\t{\n\t\tvector<string> paths = parser::download_warc_paths();\n\t\tBOOST_CHECK_EQUAL(paths.size(), 2);\n\t\tBOOST_CHECK_EQUAL(paths[0], \"test_path/testing1\");\n\t\tBOOST_CHECK_EQUAL(paths[1], 
\"test_path/testing2\");\n\t}\n\tBOOST_CHECK(parser::upload_warc_paths({}));\n}\n\nBOOST_AUTO_TEST_CASE(download_warc) {\n\t// This amazon bucket is gone\n\t/*\n\tstring buffer;\n\twarc::multipart_download(\"http://alexandria-test-data.s3.amazonaws.com/multipart_test\", [&buffer](const string &data) {\n\t\tbuffer.append(data);\n\t});\n\n\tBOOST_CHECK_EQUAL(buffer.size(), 15728640);\n\tBOOST_CHECK_EQUAL(algorithm::hash(buffer), 1803966798292769636ull);\n\t*/\n}\n\nBOOST_AUTO_TEST_CASE(parse_cc_batch) {\n\tifstream infile(config::test_data_path + \"bokus_test.warc.gz\", std::ios::binary);\n\n\twarc::parser pp;\n\tpp.parse_stream(infile);\n\n\t{\n\t\tstringstream ss(pp.result());\n\t\tstring line;\n\t\tbool found_url = false;\n\t\twhile (getline(ss, line)) {\n\t\t\tvector<string> cols;\n\t\t\tboost::algorithm::split(cols, line, boost::is_any_of(\"\\t\"));\n\n\t\t\tif (cols[0] == \"https://www.bokus.com/recension/670934\") {\n\t\t\t\tBOOST_CHECK(cols[1].substr(0, 26) == \"Mycket intressant läsning\");\n\t\t\t\tBOOST_CHECK(cols[2].substr(0, 25) == \"Recension av Lena Klippvi\");\n\t\t\t\tBOOST_CHECK(cols[3].substr(0, 25) == \"Mycket intressant läsnin\");\n\t\t\t\tBOOST_CHECK(cols[4].substr(0, 120) == \"Recenserad produkt Los Angeles's Original Farmers Market Häftad (Trade Paper) Mycket intressant läsning om hur Farmers\");\n\t\t\t\tBOOST_CHECK(cols[5] == \"2021-07-31T20:08:45Z\");\n\t\t\t\tBOOST_CHECK(cols[6] == \"213.187.205.190\");\n\t\t\t\tfound_url = true;\n\t\t\t}\n\t\t}\n\t\tBOOST_CHECK(found_url);\n\t}\n\n\t{\n\t\tstringstream ss(pp.link_result());\n\t\tstring line;\n\t\tint links_found = 0;\n\t\twhile (getline(ss, line)) {\n\t\t\tvector<string> cols;\n\t\t\tboost::algorithm::split(cols, line, boost::is_any_of(\"\\t\"));\n\n\t\t\tif (links_found == 0) {\n\t\t\t\tBOOST_CHECK(cols[0] == \"bokus.com\");\n\t\t\t\tBOOST_CHECK(cols[1] == \"/recension/670934\");\n\t\t\t\tBOOST_CHECK(cols[2] == \"help.bokus.com\");\n\t\t\t\tBOOST_CHECK(cols[3] == 
\"/\");\n\t\t\t\tBOOST_CHECK(cols[4] == \"Vanliga frågor & svar\");\n\t\t\t}\n\t\t\tlinks_found++;\n\t\t}\n\t\tBOOST_CHECK_EQUAL(links_found, 8);\n\t}\n\n\t/*{\n\t\tconst char *internal_links = pp.internal_link_result().c_str();\n\t\t{\n\t\t\tconst uint64_t hash1 = *((uint64_t *)&internal_links[0]);\n\t\t\tconst uint64_t hash2 = *((uint64_t *)&internal_links[8]);\n\t\t\tBOOST_CHECK_EQUAL(hash1, URL(\"https://www.bokus.com/recension/670934\").hash());\n\t\t\tBOOST_CHECK_EQUAL(hash2, URL(\"https://www.bokus.com/cgi-bin/logout_user_info.cgi\").hash());\n\t\t}\n\t\t{\n\t\t\tconst uint64_t hash1 = *((uint64_t *)&internal_links[16]);\n\t\t\tconst uint64_t hash2 = *((uint64_t *)&internal_links[24]);\n\t\t\tBOOST_CHECK_EQUAL(hash1, URL(\"https://www.bokus.com/recension/670934\").hash());\n\t\t\tBOOST_CHECK_EQUAL(hash2, URL(\"https://www.bokus.com/cgi-bin/log_in_real.cgi\").hash());\n\t\t}\n\t}*/\n}\n\nBOOST_AUTO_TEST_CASE(parse_cc_batch_multistream) {\n\n\tstring response;\n\t{\n\t\twarc::parser pp;\n\t\tifstream infile(config::test_data_path + \"warc_test.gz\", std::ios::binary);\n\t\tpp.parse_stream(infile);\n\n\t\tresponse = pp.result();\n\t}\n\n\tvector<string> files = {\n\t\tconfig::test_data_path + \"warc_test.gz.aa\",\n\t\tconfig::test_data_path + \"warc_test.gz.ab\",\n\t\tconfig::test_data_path + \"warc_test.gz.ac\",\n\t\tconfig::test_data_path + \"warc_test.gz.ad\",\n\t\tconfig::test_data_path + \"warc_test.gz.ae\",\n\t\tconfig::test_data_path + \"warc_test.gz.af\",\n\t\tconfig::test_data_path + \"warc_test.gz.ag\",\n\t\tconfig::test_data_path + \"warc_test.gz.ah\",\n\t\tconfig::test_data_path + \"warc_test.gz.ai\",\n\t\tconfig::test_data_path + \"warc_test.gz.aj\"\n\t};\n\n\twarc::parser pp;\n\n\tfor (const string &filename : files) {\n\t\tifstream infile(filename, std::ios::binary);\n\t\tpp.parse_stream(infile);\n\t}\n\n\tBOOST_CHECK_EQUAL(pp.result().size(), response.size());\n}\n\nBOOST_AUTO_TEST_CASE(parse_cc_batch_301) {\n\n}\n\nBOOST_AUTO_TEST_SUITE_END()\n"
  },
  {
    "path": "tests/test_config.conf",
    "content": "\n# Cluster config\nnodes_in_cluster = 3\nnode_id = 0\nurl_store_host = \"http://localhost\";\n\ndata_path = .\n\nindex_snippets = 1\n\n# Indexer config\nbatches[] = ALEXANDRIA-MANUAL-01\nbatches[] = CC-MAIN-2021-25\nbatches[] = CC-MAIN-2021-31\n\nlink_batches[] = CC-MAIN-2021-31\nlink_batches[] = CC-MAIN-2021-25\nlink_batches[] = CC-MAIN-2021-21\nlink_batches[] = CC-MAIN-2021-17\nlink_batches[] = CC-MAIN-2021-10\nlink_batches[] = CC-MAIN-2021-04\nlink_batches[] = CC-MAIN-2020-50\nlink_batches[] = CC-MAIN-2020-45\n\n# Server config\nworker_count = 8\nquery_max_words = 10 # Maximum number of words used in query.\nquery_max_len = 200\ndeduplicate_domain_count = 5\npre_result_limit = 200000\nresult_limit = 1000\n\n# Full text config\nft_max_sections = 4\nft_max_results_per_section = 2000000\n\nn_grams = 1\nshard_hash_table_size = 100000\n\n"
  },
  {
    "path": "tests/test_config2.conf",
    "content": "\n# Cluster config\nnodes_in_cluster = 8;\nnode_id = 1;\n\nindex_snippets = 0\n\n# Indexer config\nbatches[] = ALEXANDRIA-MANUAL-02\nbatches[] = CC-MAIN-2021-20\nbatches[] = CC-MAIN-2021-30\n\nlink_batches[] = CC-MAIN-2021-30\nlink_batches[] = CC-MAIN-2021-20\nlink_batches[] = CC-MAIN-2021-20\nlink_batches[] = CC-MAIN-2021-10\nlink_batches[] = CC-MAIN-2021-11\nlink_batches[] = CC-MAIN-2021-00\nlink_batches[] = CC-MAIN-2020-51\nlink_batches[] = CC-MAIN-2020-40\n\n# Server config\nworker_count = 9\nquery_max_words = 100 # Maximum number of words used in query.\nquery_max_len = 0\ndeduplicate_domain_count = 5000\npre_result_limit = 2\nresult_limit = 10\n\n# Full text config\nft_max_sections = 2\nft_max_results_per_section = 20\n\nn_grams = 5\nshard_hash_table_size = 100000\n\n"
  },
  {
    "path": "tests/test_configuration.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include <boost/test/unit_test.hpp>\n#include \"config.h\"\n\nusing namespace std;\n\nBOOST_AUTO_TEST_SUITE(test_config)\n\nBOOST_AUTO_TEST_CASE(read_config) {\n\tconfig::read_config(\"../tests/test_config.conf\");\n\tBOOST_CHECK_EQUAL(config::nodes_in_cluster, 3);\n\tBOOST_CHECK_EQUAL(config::node_id, 0);\n\n\tvector<string> batches{\"ALEXANDRIA-MANUAL-01\", \"CC-MAIN-2021-25\", \"CC-MAIN-2021-31\"};\n\tBOOST_CHECK(config::batches == batches);\n\n\tvector<string> link_batches{\n\t\t\"CC-MAIN-2021-31\",\n        \"CC-MAIN-2021-25\",\n        \"CC-MAIN-2021-21\",\n        \"CC-MAIN-2021-17\",\n        \"CC-MAIN-2021-10\",\n        \"CC-MAIN-2021-04\",\n        \"CC-MAIN-2020-50\",\n        \"CC-MAIN-2020-45\"\n\t};\n\tBOOST_CHECK(config::link_batches == 
link_batches);\n\tBOOST_CHECK_EQUAL(config::worker_count, 8);\n\tBOOST_CHECK_EQUAL(config::query_max_words, 10);\n\tBOOST_CHECK_EQUAL(config::query_max_len, 200);\n\tBOOST_CHECK_EQUAL(config::deduplicate_domain_count, 5);\n\tBOOST_CHECK_EQUAL(config::pre_result_limit, 200000);\n\tBOOST_CHECK_EQUAL(config::result_limit, 1000);\n\tBOOST_CHECK_EQUAL(config::ft_max_sections, 4);\n\tBOOST_CHECK_EQUAL(config::ft_max_results_per_section, 2000000);\n\n\tconfig::read_config(\"../tests/test_config2.conf\");\n\tBOOST_CHECK_EQUAL(config::nodes_in_cluster, 8);\n\tBOOST_CHECK_EQUAL(config::node_id, 1);\n\n\tvector<string> batches2{\"ALEXANDRIA-MANUAL-02\", \"CC-MAIN-2021-20\", \"CC-MAIN-2021-30\"};\n\tBOOST_CHECK(config::batches == batches2);\n\n\tvector<string> link_batches2{\n\t\t\"CC-MAIN-2021-30\",\n        \"CC-MAIN-2021-20\",\n        \"CC-MAIN-2021-20\",\n        \"CC-MAIN-2021-10\",\n        \"CC-MAIN-2021-11\",\n        \"CC-MAIN-2021-00\",\n        \"CC-MAIN-2020-51\",\n        \"CC-MAIN-2020-40\"\n\t};\n\tBOOST_CHECK(config::link_batches == link_batches2);\n\tBOOST_CHECK_EQUAL(config::worker_count, 9);\n\tBOOST_CHECK_EQUAL(config::query_max_words, 100);\n\tBOOST_CHECK_EQUAL(config::query_max_len, 0);\n\tBOOST_CHECK_EQUAL(config::deduplicate_domain_count, 5000);\n\tBOOST_CHECK_EQUAL(config::pre_result_limit, 2);\n\tBOOST_CHECK_EQUAL(config::result_limit, 10);\n\tBOOST_CHECK_EQUAL(config::ft_max_sections, 2);\n\tBOOST_CHECK_EQUAL(config::ft_max_results_per_section, 20);\n\n\tBOOST_CHECK_EQUAL(config::n_grams, 5);\n\tBOOST_CHECK_EQUAL(config::index_snippets, false);\n\n\tconfig::read_config(\"../tests/test_config.conf\");\n}\n\nBOOST_AUTO_TEST_SUITE_END()\n"
  },
  {
    "path": "tests/test_counted_index_builder.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include <boost/test/unit_test.hpp>\n#include \"indexer/basic_index_builder.h\"\n#include \"indexer/basic_index.h\"\n#include \"indexer/counted_record.h\"\n#include \"indexer/sharded_builder.h\"\n#include \"indexer/sharded.h\"\n\nusing namespace indexer;\n\nBOOST_AUTO_TEST_SUITE(test_basic_index_builder)\n\nBOOST_AUTO_TEST_CASE(test_case_1) {\n\n\t{\n\t\tbasic_index_builder<counted_record> idx(\"test_index\", 0);\n\n\t\tidx.truncate();\n\n\t\tidx.add(101, counted_record(1000, 1.0f));\n\n\t\tidx.append();\n\t\tidx.merge();\n\t}\n\n\t{\n\t\tbasic_index<counted_record> idx(\"test_index\", 0);\n\n\t\tstd::vector<counted_record> res = idx.find(101);\n\t\tBOOST_REQUIRE(res.size() == 1);\n\t\tBOOST_CHECK(res[0].m_value == 
1000);\n\t\tBOOST_CHECK(res[0].m_count == 1);\n\t}\n\n}\n\nBOOST_AUTO_TEST_CASE(test_case_2) {\n\n\t{\n\t\tbasic_index_builder<counted_record> idx(\"test_index\", 0);\n\n\t\tidx.truncate();\n\n\t\tidx.add(101, counted_record(1000));\n\t\tidx.add(101, counted_record(1000));\n\n\t\tidx.append();\n\t\tidx.merge();\n\t}\n\n\t{\n\t\tbasic_index<counted_record> idx(\"test_index\", 0);\n\n\t\tstd::vector<counted_record> res = idx.find(101);\n\t\tBOOST_REQUIRE(res.size() == 1);\n\t\tBOOST_CHECK(res[0].m_value == 1000);\n\t\tBOOST_CHECK(res[0].m_count == 2);\n\t}\n\n}\n\nBOOST_AUTO_TEST_CASE(test_case_3) {\n\n\t{\n\t\tbasic_index_builder<counted_record> idx(\"test_index\", 0);\n\n\t\tidx.truncate();\n\n\t\tidx.add(101, counted_record(1000));\n\t\tidx.add(101, counted_record(1001));\n\t\tidx.add(101, counted_record(1000));\n\n\t\tidx.append();\n\t\tidx.merge();\n\t}\n\n\t{\n\t\tbasic_index<counted_record> idx(\"test_index\", 0);\n\n\t\tstd::vector<counted_record> res = idx.find(101);\n\t\tBOOST_REQUIRE(res.size() == 2);\n\t\tBOOST_CHECK(res[0].m_value == 1000);\n\t\tBOOST_CHECK(res[0].m_count == 2);\n\t\tBOOST_CHECK(res[1].m_value == 1001);\n\t\tBOOST_CHECK(res[1].m_count == 1);\n\t}\n\n}\n\nBOOST_AUTO_TEST_CASE(test_case_4) {\n\n\t{\n\t\tsharded_builder<basic_index_builder, counted_record> idx(\"test_index\", 10);\n\n\t\tidx.truncate();\n\n\t\tidx.add(101, indexer::counted_record(1000));\n\t\tidx.add(101, indexer::counted_record(1001));\n\t\tidx.add(101, indexer::counted_record(1000));\n\t\tidx.add(102, indexer::counted_record(1002));\n\n\t\tidx.append();\n\t\tidx.merge();\n\n\t\tBOOST_CHECK(idx.document_count() == 3);\n\t}\n\n\t{\n\t\tsharded<basic_index, counted_record> idx(\"test_index\", 10);\n\n\t\tstd::vector<counted_record> res = idx.find(101);\n\t\tBOOST_REQUIRE(res.size() == 2);\n\t\tBOOST_CHECK(res[0].m_value == 1000);\n\t\tBOOST_CHECK(res[0].m_count == 2);\n\t\tBOOST_CHECK(res[1].m_value == 1001);\n\t\tBOOST_CHECK(res[1].m_count == 
1);\n\t}\n\n}\n\nBOOST_AUTO_TEST_SUITE_END()\n"
  },
  {
    "path": "tests/test_datetime.h",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include \"common/datetime.h\"\n\nBOOST_AUTO_TEST_SUITE(test_datetime)\n\nBOOST_AUTO_TEST_CASE(cur_date) {\n\t/*std::cout << System::cur_date() << std::endl;\n\tstd::cout << System::cur_datetime() << std::endl;\n\tstd::cout << System::iso8601_datetime() << std::endl;*/\n}\n\nBOOST_AUTO_TEST_SUITE_END()\n"
  },
  {
    "path": "tests/test_file.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include <boost/test/unit_test.hpp>\n#include \"transfer/transfer.h\"\n#include \"text/text.h\"\n#include \"file/file.h\"\n#include \"file/tsv_file_remote.h\"\n#include \"file/tsv_file.h\"\n#include \"file/archive.h\"\n#include \"algorithm/hash.h\"\n#include \"config.h\"\n\nusing namespace std;\n\nBOOST_AUTO_TEST_SUITE(test_file)\n\nBOOST_AUTO_TEST_CASE(transfer_test) {\n\tint error;\n\t{\n\t\tstring result = transfer::file_to_string(\"/test-data/example.txt\", error);\n\t\tBOOST_CHECK(error == transfer::OK);\n\t\tBOOST_CHECK(text::trim(result) == \"An example file\");\n\t}\n\n\t{\n\t\tstring result = transfer::gz_file_to_string(\"/test-data/example.txt.gz\", error);\n\t\tBOOST_CHECK(error == transfer::OK);\n\t\tBOOST_CHECK(text::trim(result) == \"An 
example file\");\n\t}\n\n\t{\n\t\tstring result = transfer::file_to_string(\"test-data/example.txt\", error);\n\t\tBOOST_CHECK(error == transfer::OK);\n\t\tBOOST_CHECK(text::trim(result) == \"An example file\");\n\t}\n\n\t{\n\t\tstring result = transfer::gz_file_to_string(\"test-data/example.txt.gz\", error);\n\t\tBOOST_CHECK(error == transfer::OK);\n\t\tBOOST_CHECK(text::trim(result) == \"An example file\");\n\t}\n\n\t{\n\t\tstringstream ss;\n\t\ttransfer::file_to_stream(\"/test-data/example.txt\", ss, error);\n\t\tstring result = ss.str();\n\t\tBOOST_CHECK(error == transfer::OK);\n\t\tBOOST_CHECK(text::trim(result) == \"An example file\");\n\t}\n\n\t{\n\t\tstringstream ss;\n\t\ttransfer::gz_file_to_stream(\"/test-data/example.txt.gz\", ss, error);\n\t\tstring result = ss.str();\n\t\tBOOST_CHECK(error == transfer::OK);\n\t\tBOOST_CHECK(text::trim(result) == \"An example file\");\n\t}\n}\n\nBOOST_AUTO_TEST_CASE(handle_errors) {\n\tint error;\n\t{\n\t\tstring result = transfer::file_to_string(\"/non-existing.txt\", error);\n\t\tBOOST_CHECK(error == transfer::ERROR);\n\t}\n\n\t{\n\t\tstring result = transfer::gz_file_to_string(\"/non-existing.txt.gz\", error);\n\t\tBOOST_CHECK(error == transfer::ERROR);\n\t}\n\n\t{\n\t\tstringstream ss;\n\t\ttransfer::file_to_stream(\"/non-existing.txt\", ss, error);\n\t\tBOOST_CHECK(error == transfer::ERROR);\n\t}\n\n\t{\n\t\tstringstream ss;\n\t\ttransfer::gz_file_to_stream(\"/non-existing.txt.gz\", ss, error);\n\t\tBOOST_CHECK(error == transfer::ERROR);\n\t}\n\n\t{\n\t\tvector<string> downloaded = transfer::download_gz_files_to_disk({\"/non-existing.txt.gz\"});\n\t\tBOOST_CHECK(downloaded.size() == 0);\n\t}\n}\n\nBOOST_AUTO_TEST_CASE(tsv_file_exists) {\n\tfile::tsv_file_remote manual_paths_file(\"crawl-data/ALEXANDRIA-MANUAL-01/warc.paths.gz\");\n\tvector<string> warc_paths;\n\tmanual_paths_file.read_column_into(0, warc_paths);\n\n\tBOOST_CHECK(manual_paths_file.is_open());\n\tBOOST_CHECK(warc_paths.size() > 
0);\n\tBOOST_CHECK(warc_paths[0] == \"crawl-data/ALEXANDRIA-MANUAL-01/files/top_domains.txt.gz\");\n}\n\nBOOST_AUTO_TEST_CASE(tsv_file_dont_exists) {\n\tfile::tsv_file_remote manual_paths_file(\"non-existing-file.gz\");\n\tBOOST_CHECK(!manual_paths_file.is_open());\n}\n\nBOOST_AUTO_TEST_CASE(local_tsv_files) {\n\n\tfile::tsv_file my_file(config::test_data_path + \"tsvtest.tsv\");\n\n\tBOOST_CHECK_EQUAL(my_file.find_first_position(\"aaa\"), 0);\n\tBOOST_CHECK_EQUAL(my_file.find_first_position(\"aab\"), 126);\n\tBOOST_CHECK_EQUAL(my_file.find_first_position(\"european\"), string::npos);\n\n\tBOOST_CHECK_EQUAL(my_file.find_last_position(\"aaa\"), 112);\n\tBOOST_CHECK_EQUAL(my_file.find_last_position(\"aab\"), 126);\n\tBOOST_CHECK_EQUAL(my_file.find_last_position(\"european\"), string::npos);\n\n\tfile::tsv_file my_file2(config::test_data_path + \"tsvtest2.tsv\");\n\n\tBOOST_CHECK_EQUAL(my_file2.find_first_position(\"aaa\"), 0);\n\tBOOST_CHECK(my_file2.find_first_position(\"aab\") > 0);\n\tBOOST_CHECK_EQUAL(my_file2.find_first_position(\"european\"), string::npos);\n\n\tBOOST_CHECK(my_file2.find_last_position(\"aaa\") > 0 && my_file2.find_last_position(\"aaa\") < my_file2.size());\n\tBOOST_CHECK(my_file2.find_last_position(\"aab\") > 0 && my_file2.find_last_position(\"aab\") < my_file2.size());\n\tBOOST_CHECK(my_file2.find_last_position(\"aac\") > 0 && my_file2.find_last_position(\"aac\") == my_file2.size() - 115);\n\tBOOST_CHECK(my_file2.find_last_position(\"european\") == string::npos);\n\n\tBOOST_CHECK_EQUAL(my_file2.find_next_position(\"aaa\"), my_file2.find_first_position(\"aab\"));\n\tBOOST_CHECK_EQUAL(my_file2.find_next_position(\"aab\"), my_file2.find_first_position(\"aac\"));\n\tBOOST_CHECK_EQUAL(my_file2.find_next_position(\"aabb\"), my_file2.find_first_position(\"aac\"));\n\tBOOST_CHECK_EQUAL(my_file2.find_next_position(\"aac\"), my_file2.size());\n}\n\nBOOST_AUTO_TEST_CASE(head_content_len) {\n\n\t{\n\t\tint error;\n\t\tsize_t content_len = 
transfer::head_content_length(\"http://127.0.0.1/test-data/automobileszone.com\", error);\n\t\tBOOST_CHECK_EQUAL(error, transfer::OK);\n\t\tBOOST_CHECK_EQUAL(content_len, 8084);\n\t}\n\n\t{\n\t\tint error;\n\t\tsize_t content_len = transfer::head_content_length(\"http://127.0.0.1/test-data/automobileszone.com-not-here\", error);\n\t\tBOOST_CHECK_EQUAL(error, transfer::ERROR);\n\t\tBOOST_CHECK_EQUAL(content_len, 0);\n\t}\n\n}\n\nBOOST_AUTO_TEST_CASE(test_upload) {\n\t// This amazon bucket is gone.\n\t/*{\n\t\tint error;\n\t\tstring buffer;\n\t\ttransfer::url_to_string(\"http://alexandria-test-data.s3.amazonaws.com/multipart_test\", buffer, error);\n\t\tBOOST_CHECK_EQUAL(error, transfer::OK);\n\n\t\terror = transfer::upload_file(\"multipart_test\", buffer);\n\t\tBOOST_CHECK_EQUAL(error, transfer::OK);\n\t}*/\n}\n\nBOOST_AUTO_TEST_CASE(test_upload_gz) {\n\t// This amazon bucket is gone.\n\t/*{\n\t\tint error;\n\t\tstring buffer;\n\t\ttransfer::url_to_string(\"http://alexandria-test-data.s3.amazonaws.com/multipart_test\", buffer, error);\n\t\tBOOST_CHECK_EQUAL(error, transfer::OK);\n\n\t\terror = transfer::upload_gz_file(\"multipart_test.gz\", buffer);\n\t\tBOOST_CHECK_EQUAL(error, transfer::OK);\n\n\t\t// Download it again as gz file and see if we get the same result.\n\t\t\n\t\tconst string result_back = transfer::gz_file_to_string(\"multipart_test.gz\", error);\n\t\tBOOST_CHECK_EQUAL(error, transfer::OK);\n\n\t\tBOOST_CHECK_EQUAL(result_back.size(), buffer.size());\n\t\tBOOST_CHECK_EQUAL(algorithm::hash(result_back), algorithm::hash(buffer));\n\t}*/\n}\n\n/*\n * Test the tsv_file::read_column_into function that is used a lot.\n * */\nBOOST_AUTO_TEST_CASE(test_tsv_file) {\n\n\t{\n\t\tfile::tsv_file tsv(config::test_data_path + \"tsvtest3.tsv\");\n\t\tvector<string> vec;\n\t\ttsv.read_column_into(0, vec, 2, 3);\n\n\t\tBOOST_CHECK(vec.size() == 2);\n\t\tBOOST_CHECK(vec[0] == \"line4\");\n\t\tBOOST_CHECK(vec[1] == \"line5\");\n\t}\n\n\t{\n\t\tfile::tsv_file 
tsv(config::test_data_path + \"tsvtest3.tsv\");\n\t\tset<string> data;\n\t\ttsv.read_column_into(0, data, 2, 3);\n\n\t\tBOOST_CHECK(data.size() == 2);\n\t\tBOOST_CHECK(data.count(\"line4\") == 1);\n\t\tBOOST_CHECK(data.count(\"line5\") == 1);\n\t}\n\n\t{\n\t\tfile::tsv_file tsv(config::test_data_path + \"tsvtest3.tsv\");\n\t\tvector<string> vec;\n\t\ttsv.read_column_into(0, vec, 100, 3);\n\n\t\tBOOST_CHECK(vec.size() == 3);\n\t\tBOOST_CHECK(vec[0] == \"line4\");\n\t\tBOOST_CHECK(vec[1] == \"line5\");\n\t\tBOOST_CHECK(vec[2] == \"line6\");\n\t}\n\n\t{\n\t\tfile::tsv_file tsv(config::test_data_path + \"tsvtest3.tsv\");\n\t\tset<string> data;\n\t\ttsv.read_column_into(0, data, 100, 3);\n\n\t\tBOOST_CHECK(data.size() == 3);\n\t\tBOOST_CHECK(data.count(\"line4\") == 1);\n\t\tBOOST_CHECK(data.count(\"line5\") == 1);\n\t\tBOOST_CHECK(data.count(\"line6\") == 1);\n\t}\n\n\t{\n\t\tfile::tsv_file tsv(config::test_data_path + \"tsvtest3.tsv\");\n\t\tvector<string> vec;\n\t\ttsv.read_column_into(0, vec, 3, 0);\n\n\t\tBOOST_CHECK(vec.size() == 3);\n\t\tBOOST_CHECK(vec[0] == \"line1\");\n\t\tBOOST_CHECK(vec[1] == \"line2\");\n\t\tBOOST_CHECK(vec[2] == \"line3\");\n\t}\n\n\t{\n\t\tfile::tsv_file tsv(config::test_data_path + \"tsvtest3.tsv\");\n\t\tset<string> data;\n\t\ttsv.read_column_into(0, data, 3, 0);\n\n\t\tBOOST_CHECK(data.size() == 3);\n\t\tBOOST_CHECK(data.count(\"line1\") == 1);\n\t\tBOOST_CHECK(data.count(\"line2\") == 1);\n\t\tBOOST_CHECK(data.count(\"line3\") == 1);\n\t}\n}\n\n/*\n * Test the file::archive simple tarball\n * */\nBOOST_AUTO_TEST_CASE(test_archive) {\n\n\t{\n\t\tfile::archive tar(\"test_dir.tar\");\n\n\t\tfile::create_directory(\"test_dir1\");\n\n\t\tstd::ofstream file1(\"test_dir1/file1.txt\");\n\t\tfile1 << \"hello world 1\";\n\t\tfile1.close();\n\n\t\tstd::ofstream file2(\"test_dir1/file2.txt\");\n\t\tfile2 << \"hello world 2\";\n\t\tfile2.close();\n\n\t\tstd::ofstream file3(\"test_dir1/file3.txt\");\n\t\tfile3 << \"hello world 
3\";\n\t\tfile3.close();\n\n\t\ttar.read_dir(\"test_dir1\");\n\t}\n\n\t{\n\t\tfile::archive tar(\"test_dir.tar\");\n\n\t\tfile::create_directory(\"test_dir2\");\n\t\ttar.untar(\"test_dir2\");\n\n\t\tBOOST_CHECK_EQUAL(file::cat(\"test_dir2/file1.txt\"), \"hello world 1\");\n\t\tBOOST_CHECK_EQUAL(file::cat(\"test_dir2/file2.txt\"), \"hello world 2\");\n\t\tBOOST_CHECK_EQUAL(file::cat(\"test_dir2/file3.txt\"), \"hello world 3\");\n\n\t}\n\tfile::delete_directory(\"test_dir1\");\n\tfile::delete_directory(\"test_dir2\");\n\tfile::delete_file(\"test_dir.tar\");\n}\n\nBOOST_AUTO_TEST_CASE(test_archive2) {\n\n\t{\n\t\tfile::archive tar(\"test_dir.tar\");\n\n\t\tfile::create_directory(\"test_dir1\");\n\n\t\t// Create 500 files.\n\t\tfor (size_t i = 1; i <= 500; i++) {\n\t\t\tstd::ofstream file1(\"test_dir1/file\" + std::to_string(i) + \".txt\");\n\t\t\tfor (size_t j = 0; j < i; j++) {\n\t\t\t\tfile1 << \"hello world \" << j << std::endl;\n\t\t\t}\n\t\t}\n\n\t\ttar.read_dir(\"test_dir1\");\n\t}\n\n\t{\n\t\tfile::archive tar(\"test_dir.tar\");\n\n\t\tfile::create_directory(\"test_dir2\");\n\t\ttar.untar(\"test_dir2\");\n\n\t\t// Check 500 files.\n\t\tfor (size_t i = 1; i <= 500; i++) {\n\t\t\tstd::ifstream file1(\"test_dir2/file\" + std::to_string(i) + \".txt\");\n\t\t\tstd::string line;\n\t\t\tsize_t j = 0;\n\t\t\twhile (std::getline(file1, line)) {\n\t\t\t\tBOOST_CHECK_EQUAL(line, \"hello world \" + std::to_string(j));\n\t\t\t\tj++;\n\t\t\t}\n\t\t\tBOOST_CHECK_EQUAL(j, i);\n\t\t}\n\n\t}\n\tfile::delete_directory(\"test_dir1\");\n\tfile::delete_directory(\"test_dir2\");\n\tfile::delete_file(\"test_dir.tar\");\n}\n\nBOOST_AUTO_TEST_CASE(test_rename_file) {\n\tfile::create_directory(\"/tmp/alexandria_test_98237593257\");\n\tfile::create_directory(\"/tmp/alexandria_test_98237593257/testdir\");\n\tfile::rename(\"/tmp/alexandria_test_98237593257/testdir\", 
\"/tmp/alexandria_test_98237593257/testdir2\");\n\tBOOST_CHECK(file::file_exists(\"/tmp/alexandria_test_98237593257/testdir2\"));\n\tBOOST_CHECK(!file::file_exists(\"/tmp/alexandria_test_98237593257/testdir\"));\n\tfile::delete_directory(\"/tmp/alexandria_test_98237593257\");\n\tBOOST_CHECK(!file::file_exists(\"/tmp/alexandria_test_98237593257/testdir\"));\n\tBOOST_CHECK(!file::file_exists(\"/tmp/alexandria_test_98237593257/testdir2\"));\n\tBOOST_CHECK(!file::file_exists(\"/tmp/alexandria_test_98237593257\"));\n\n\tfile::create_directory(\"/tmp/alexandria_test_98237593257/testdir\");\n\tBOOST_CHECK(file::file_exists(\"/tmp/alexandria_test_98237593257\"));\n\tfile::delete_directory(\"/tmp/alexandria_test_98237593257\");\n\tBOOST_CHECK(!file::file_exists(\"/tmp/alexandria_test_98237593257\"));\n}\n\nBOOST_AUTO_TEST_SUITE_END()\n"
  },
  {
    "path": "tests/test_hash.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include <boost/test/unit_test.hpp>\n#include \"algorithm/hash.h\"\n\nBOOST_AUTO_TEST_SUITE(hash)\n\nBOOST_AUTO_TEST_CASE(str) {\n\n\tBOOST_CHECK_EQUAL(algorithm::hash(\"testing\"), 4540905123118180926ull);\n\tBOOST_CHECK_EQUAL(algorithm::hash(\"\"), 6142509188972423790ull);\n\tBOOST_CHECK_EQUAL(algorithm::hash(\"abcdefghijklmnopqrstuvxyz\"), 17219978627035894604ull);\n\tBOOST_CHECK_EQUAL(algorithm::hash(\"123\"), 10089081994332581363ull);\n\tBOOST_CHECK_EQUAL(algorithm::hash(\"1234\"), 15651099383784684535ull);\n\n}\n\nBOOST_AUTO_TEST_SUITE_END()\n"
  },
  {
    "path": "tests/test_hash_table.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include <boost/test/unit_test.hpp>\n\n#include \"hash_table2/hash_table.h\"\n#include \"hash_table2/builder.h\"\n#include \"hash_table_helper/hash_table_helper.h\"\n#include \"indexer/merger.h\"\n\n#include <set>\n\nBOOST_AUTO_TEST_SUITE(test_hash_table)\n\nBOOST_AUTO_TEST_CASE(test_file_paths) {\n\n\t{\n\t\thash_table2::hash_table_shard_builder ht_builder(\"test_index\", 8);\n\t\tBOOST_CHECK_EQUAL(ht_builder.file_base_data(), \"./0/hash_table/ht_test_index_8\");\n\t\tBOOST_CHECK_EQUAL(ht_builder.filename_data(), \"./0/hash_table/ht_test_index_8.data\");\n\t}\n\t{\n\t\thash_table2::hash_table_shard_builder ht_builder(\"test_index\", 8, 1000, \"/data_path\");\n\t\tBOOST_CHECK_EQUAL(ht_builder.file_base_data(), 
\"/data_path/ht_test_index_8\");\n\t\tBOOST_CHECK_EQUAL(ht_builder.filename_data(), \"/data_path/ht_test_index_8.data\");\n\t\tBOOST_CHECK_EQUAL(ht_builder.filename_pos(), \"./0/hash_table/ht_test_index_8.pos\");\n\t}\n\n}\n\nBOOST_AUTO_TEST_CASE(single_shard_add) {\n\n\thash_table_helper::truncate(\"test_index\");\n\n\t{\n\t\thash_table2::hash_table_shard_builder idx(\"test_index\", 0);\n\n\t\tidx.truncate();\n\n\t\tidx.add(123, \"hello world\");\n\t\tidx.append();\n\t\tidx.merge();\n\t}\n\n\t{\n\t\thash_table2::hash_table_shard idx(\"test_index\", 0);\n\n\t\tBOOST_CHECK(idx.has(123));\n\t\tBOOST_CHECK(!idx.has(1234));\n\t\tBOOST_CHECK_EQUAL(idx.find(123), \"hello world\");\n\t}\n\n}\n\nBOOST_AUTO_TEST_CASE(single_shard_add_versioned) {\n\n\t{\n\t\thash_table2::hash_table_shard_builder idx(\"test_index\", 0);\n\n\t\tidx.truncate();\n\n\t\tidx.add(123, \"hello world\", 5);\n\t\tidx.append();\n\t\tidx.merge();\n\t\tidx.add(123, \"new value\", 6);\n\t\tidx.append();\n\t\tidx.merge();\n\t\tidx.add(123, \"old value\", 4);\n\t\tidx.append();\n\t\tidx.merge();\n\n\t\tidx.add(123, \"old value 2\", 3);\n\t\tidx.add(123, \"newest value\", 7);\n\t\tidx.append();\n\t\tidx.merge();\n\t}\n\n\t{\n\t\thash_table2::hash_table_shard idx(\"test_index\", 0);\n\n\t\tBOOST_CHECK_EQUAL(idx.find(123), \"newest value\");\n\t}\n\n}\n\nBOOST_AUTO_TEST_CASE(single_shard_add_versioned2) {\n\n\t{\n\t\thash_table2::hash_table_shard_builder idx(\"test_index\", 0);\n\n\t\tidx.truncate();\n\n\t\tidx.add(101, \"an old value\", 1000);\n\t\tidx.append();\n\t\tidx.merge();\n\t\tidx.optimize();\n\t\tidx.add(101, \"another old value\", 1000);\n\t\tidx.append();\n\t\tidx.merge();\n\t\tidx.optimize();\n\t\tidx.add(101, \"a new value\", 1001);\n\t\tidx.append();\n\t\tidx.merge();\n\t\tidx.optimize();\n\t\tidx.add(101, \"an older value\", 999);\n\t\tidx.append();\n\t\tidx.merge();\n\t\tidx.optimize();\n\t}\n\n\t{\n\t\thash_table2::hash_table_shard idx(\"test_index\", 
0);\n\n\t\tBOOST_CHECK_EQUAL(idx.find(101), \"a new value\");\n\t}\n\n}\n\nBOOST_AUTO_TEST_CASE(add_to_hash_table) {\n\n\thash_table_helper::truncate(\"test_index\");\n\n\t{\n\t\thash_table2::builder idx(\"test_index\", 43);\n\n\t\tidx.truncate();\n\n\t\t// Add 1000 elements.\n\t\tfor (size_t i = 0; i < 1000; i++) {\n\t\t\tidx.add(i, \"Random test data with id: \" + std::to_string(i));\n\t\t}\n\n\t\tidx.merge();\n\t}\n\n\t{\n\t\thash_table2::hash_table hash_table(\"test_index\", 43);\n\n\t\tfor (size_t i = 0; i < 1000; i++) {\n\t\t\tBOOST_CHECK_EQUAL(hash_table.find(i), \"Random test data with id: \" + std::to_string(i));\n\t\t}\n\t}\n\n\t{\n\t\thash_table2::builder idx(\"test_index\", 43);\n\n\t\tidx.truncate();\n\n\t\t// Add 1000 elements.\n\t\tfor (size_t i = 1000; i < 2000; i++) {\n\t\t\tidx.add(i, \"Random test data with id: \" + std::to_string(i));\n\t\t}\n\n\t\tidx.merge();\n\t}\n\n\t{\n\t\thash_table2::hash_table hash_table(\"test_index\", 43);\n\n\t\tfor (size_t i = 1000; i < 2000; i++) {\n\t\t\tBOOST_CHECK_EQUAL(hash_table.find(i), \"Random test data with id: \" + std::to_string(i));\n\t\t}\n\t}\n\n}\n\nBOOST_AUTO_TEST_CASE(add_to_hash_table_reverse) {\n\n\thash_table_helper::truncate(\"test_index\");\n\n\t{\n\t\thash_table2::builder idx(\"test_index\", 17);\n\n\t\tidx.truncate();\n\n\t\t// Add 1000 elements.\n\t\tfor (size_t i = 100000; i < 200000; i++) {\n\t\t\tidx.add(i, \"Random test data with id: \" + std::to_string(i));\n\t\t}\n\n\t\tidx.merge();\n\t}\n\n\t{\n\t\thash_table2::hash_table hash_table(\"test_index\", 17);\n\n\t\tBOOST_CHECK_EQUAL(hash_table.size(), 100000);\n\t}\n\n\t{\n\t\t// Add more elements.\n\t\thash_table2::builder idx(\"test_index\", 17);\n\n\t\t// Add 1000 elements.\n\t\tfor (size_t i = 0; i < 100000; i++) {\n\t\t\tidx.add(i, \"Random test data with id: \" + std::to_string(i));\n\t\t}\n\n\t\tidx.merge();\n\t}\n\n\t{\n\t\thash_table2::hash_table hash_table(\"test_index\", 17);\n\n\t\tBOOST_CHECK_EQUAL(hash_table.size(), 
200000);\n\t}\n\n}\n\nBOOST_AUTO_TEST_CASE(optimize) {\n\n\thash_table_helper::truncate(\"test_index\");\n\n\tsize_t shard_size = 0;\n\tsize_t shard_file_size = 0;\n\n\t{\n\t\thash_table2::hash_table_shard_builder builder(\"test_index\", 0);\n\n\t\tbuilder.add(1, \"data element 1 v1\");\n\t\tbuilder.add(2, \"data element 2 v1\");\n\t\tbuilder.add(3, \"data element 3 v1\");\n\n\t\tbuilder.append();\n\t\tbuilder.merge();\n\n\t\thash_table2::hash_table_shard shard(\"test_index\", 0);\n\t\tshard_size = shard.size();\n\t\tshard_file_size = shard.file_size();\n\t}\n\n\t{\n\t\t// Add some more elements with identical keys.\n\t\thash_table2::hash_table_shard_builder builder(\"test_index\", 0);\n\n\t\tbuilder.add(1, \"data element 1 v2\");\n\t\tbuilder.add(2, \"data element 2 v2\");\n\t\tbuilder.add(3, \"data element 3 v2\");\n\n\t\tbuilder.append();\n\t\tbuilder.merge();\n\n\t\tbuilder.optimize();\n\n\t\thash_table2::hash_table_shard shard(\"test_index\", 0);\n\n\t\tBOOST_CHECK_EQUAL(shard.size(), shard_size);\n\t\tBOOST_CHECK_EQUAL(shard.file_size(), shard_file_size);\n\n\t\tBOOST_CHECK_EQUAL(shard.find(1), \"data element 1 v2\");\n\t\tBOOST_CHECK_EQUAL(shard.find(2), \"data element 2 v2\");\n\t\tBOOST_CHECK_EQUAL(shard.find(3), \"data element 3 v2\");\n\t}\n}\n\nBOOST_AUTO_TEST_CASE(optimize_empty) {\n\n\thash_table_helper::truncate(\"main_index\");\n\n\thash_table2::hash_table_shard_builder idx(\"main_index\", 0);\n\tidx.optimize();\n\n}\n\nBOOST_AUTO_TEST_CASE(conditional) {\n\n\thash_table_helper::truncate(\"main_index\");\n\n\t{\n\n\t\thash_table2::builder ht(\"main_index\", 10);\n\n\t\tht.truncate();\n\n\t\tht.add(101, \"an old value\", 1000);\n\t\tht.add(101, \"another old value\", 1000);\n\t\tht.add(101, \"a new value\", 1001);\n\t\tht.add(101, \"an older value\", 999);\n\n\t\tht.merge();\n\t}\n\n\t{\n\t\thash_table2::hash_table ht(\"main_index\", 10);\n\n\t\tstd::string value = ht.find(101);\n\n\t\tBOOST_CHECK_EQUAL(value, \"a new 
value\");\n\t}\n\n}\n\nBOOST_AUTO_TEST_CASE(conditional2) {\n\n\thash_table_helper::truncate(\"main_index\");\n\n\t{\n\n\t\thash_table2::builder ht(\"main_index\", 10);\n\n\t\tht.truncate();\n\n\t\t// Merge between each. Should still get the same value.\n\n\t\tht.add(101, \"an old value\", 1000);\n\t\tht.merge();\n\t\tht.add(101, \"another old value\", 1000);\n\t\tht.merge();\n\t\tht.add(101, \"a new value\", 1001);\n\t\tht.merge();\n\t\tht.add(101, \"an older value\", 999);\n\t\tht.merge();\n\t}\n\n\t{\n\t\thash_table2::hash_table ht(\"main_index\", 10);\n\n\t\tstd::string value = ht.find(101);\n\n\t\tBOOST_CHECK_EQUAL(value, \"a new value\");\n\t}\n\n}\n\nBOOST_AUTO_TEST_CASE(more_tests) {\n\n\thash_table_helper::truncate(\"main_index\");\n\n\t{\n\t\thash_table2::builder ht(\"main_index\", 10);\n\n\t\tht.truncate();\n\n\t\tht.add(101, \"first value\", 1000);\n\t\tht.add(101, \"second value\", 1001);\n\t\tht.add(101, \"third value\", 1002);\n\n\t\tht.add(102, \"first value\", 1000);\n\t\tht.add(102, \"second value\", 1001);\n\t\tht.add(102, \"third value\", 1002);\n\n\t\tht.add(103, \"first value\", 1);\n\t\tht.add(103, \"second value\", 100000);\n\t\tht.add(103, \"third value\", 99999999999);\n\n\t\tht.add(50, \"third value\");\n\n\t\tht.merge();\n\t}\n\n\t{\n\t\thash_table2::hash_table ht(\"main_index\", 10);\n\n\t\tBOOST_CHECK_EQUAL(ht.find(101), \"third value\");\n\t\tBOOST_CHECK_EQUAL(ht.find(102), \"third value\");\n\t\tBOOST_CHECK_EQUAL(ht.find(103), \"third value\");\n\t\tBOOST_CHECK_EQUAL(ht.find(50), \"third value\");\n\t}\n\n}\n\nBOOST_AUTO_TEST_CASE(for_each) {\n\n\thash_table_helper::truncate(\"main_index\");\n\n\t{\n\t\thash_table2::builder ht(\"main_index\", 10);\n\n\t\tht.truncate();\n\n\t\tht.add(101, \"first value\", 1000);\n\t\tht.merge();\n\t\tht.add(101, \"second value\", 1001);\n\t\tht.merge();\n\t\tht.add(101, \"third value\", 1002);\n\n\t\tht.add(102, \"first value\", 1000);\n\t\tht.merge();\n\t\tht.add(102, \"second value\", 
1001);\n\t\tht.merge();\n\t\tht.add(102, \"third value\", 1002);\n\n\t\tht.add(103, \"third value\", 99999999999);\n\t\tht.add(103, \"first value\", 1);\n\t\tht.merge();\n\t\tht.add(103, \"second value\", 100000);\n\t\tht.merge();\n\n\t\tht.add(50, \"third value\");\n\n\t\tht.merge();\n\t\tht.optimize();\n\t}\n\n\t{\n\t\thash_table2::hash_table ht(\"main_index\", 10);\n\n\t\tBOOST_CHECK_EQUAL(ht.find(101), \"third value\");\n\t\tBOOST_CHECK_EQUAL(ht.find(102), \"third value\");\n\t\tBOOST_CHECK_EQUAL(ht.find(103), \"third value\");\n\t\tBOOST_CHECK_EQUAL(ht.find(50), \"third value\");\n\n\t\tstd::set<uint64_t> keys;\n\t\tstd::set<std::string> values;\n\t\tht.for_each([&keys, &values](uint64_t key, const std::string &val) {\n\t\t\tkeys.insert(key);\n\t\t\tvalues.insert(val);\n\t\t});\n\n\t\tBOOST_CHECK_EQUAL(keys.size(), 4);\n\t\tBOOST_CHECK_EQUAL(values.size(), 1);\n\n\t\tfor (const auto &val : values) {\n\t\t\tBOOST_CHECK_EQUAL(val, \"third value\");\n\t\t}\n\t}\n\n}\n\nBOOST_AUTO_TEST_CASE(larger_test) {\n\n\t{\n\t\tindexer::merger::start_merge_thread();\n\n\t\thash_table2::builder ht(\"main_index\", 10);\n\n\t\tht.truncate();\n\n\t\tfor (size_t key = 1000; key < 10000; key++) {\n\t\t\tht.add(key, std::string(key, 'x'));\n\t\t}\n\n\t\tfor (size_t key = 1000; key < 10000; key++) {\n\t\t\tht.add(key, std::string(key, 'y'), 1);\n\t\t}\n\n\t\tindexer::merger::stop_merge_thread();\n\t}\n\n\t{\n\t\tindexer::merger::start_merge_thread();\n\n\t\thash_table2::builder ht(\"main_index\", 10);\n\n\t\tfor (size_t key = 1000; key < 10000; key++) {\n\t\t\tht.add(key, std::string(key, 'z'), 2);\n\t\t}\n\n\t\tindexer::merger::stop_merge_thread();\n\t}\n\n\t{\n\t\tindexer::merger::start_merge_thread();\n\n\t\thash_table2::builder ht(\"main_index\", 10);\n\n\t\tfor (size_t key = 1000; key < 10000; key++) {\n\t\t\tht.add(key, std::string(key, 'a'), 2);\n\t\t}\n\n\t\tindexer::merger::stop_merge_thread();\n\t}\n\n\t{\n\t\thash_table2::builder ht(\"main_index\", 
10);\n\t\tht.optimize();\n\t}\n\n\t{\n\t\thash_table2::hash_table ht(\"main_index\", 10);\n\n\t\tfor (size_t key = 1000; key < 10000; key++) {\n\t\t\tBOOST_REQUIRE_EQUAL(ht.find(key), std::string(key, 'a'));\n\t\t}\n\n\t\tstd::map<uint64_t, std::vector<std::string>> vals;\n\t\tht.for_each([&vals](uint64_t key, const std::string &val) {\n\t\t\tvals[key].push_back(val);\n\t\t});\n\n\t\tfor (const auto &iter : vals) {\n\t\t\tBOOST_REQUIRE_EQUAL(iter.second.size(), 1);\n\t\t\tBOOST_REQUIRE_EQUAL(iter.second[0], std::string(iter.first, 'a'));\n\t\t}\n\t}\n\n}\n\nBOOST_AUTO_TEST_CASE(merge_with) {\n\n\t{\n\t\thash_table2::builder ht(\"main_index\", 11);\n\n\t\tht.truncate();\n\n\t\tht.add(123, \"a1\", 10);\n\t\tht.add(1230, \"a2\", 10);\n\t\tht.add(1231, \"a3\", 10);\n\t\tht.add(1231, \"a3_n2\", 11);\n\n\t\tht.add(3828540, \"a4\", 10);\n\t\tht.add(2234645, \"a5\", 10);\n\t\tht.add(8424878, \"a6\", 10);\n\t\tht.add(4174861, \"a7\", 10);\n\t\tht.add(7013344, \"a8\", 10);\n\n\t\tht.merge();\n\t}\n\n\t{\n\t\thash_table2::builder ht(\"main_index2\", 11);\n\n\t\tht.truncate();\n\n\t\tht.add(123, \"b1\", 11);\n\t\tht.add(1230, \"b2\", 12);\n\t\tht.add(1231, \"b3\", 9);\n\t\tht.add(1231, \"b3\", 8);\n\n\t\tht.add(8321508, \"b4\", 10);\n\t\tht.add(7309646, \"b5\", 10);\n\t\tht.add(2809224, \"b6\", 10);\n\t\tht.add(6543485, \"b7\", 10);\n\t\tht.add(6078858, \"b8\", 10);\n\n\t\tht.merge();\n\t}\n\n\t{\n\t\thash_table2::builder ht1(\"main_index\", 11);\n\t\thash_table2::builder ht2(\"main_index2\", 11);\n\n\t\tht1.merge_with(ht2);\n\t}\n\n\t{\n\t\thash_table2::hash_table ht(\"main_index\", 11);\n\n\t\tBOOST_CHECK_EQUAL(ht.find(123), \"b1\");\n\t\tBOOST_CHECK_EQUAL(ht.find(1230), \"b2\");\n\t\tBOOST_CHECK_EQUAL(ht.find(1231), \"a3_n2\");\n\t\tBOOST_CHECK_EQUAL(ht.find(6543485), \"b7\");\n\t\tBOOST_CHECK_EQUAL(ht.find(2234645), \"a5\");\n\t}\n\n}\n\nBOOST_AUTO_TEST_CASE(merge_with_files) {\n\n\t{\n\t\thash_table2::builder ht(\"main_index\", 1);\n\n\t\tht.truncate();\n\n\t\tht.add(123, 
\"a1\", 10);\n\t\tht.add(1230, \"a2\", 10);\n\t\tht.add(1231, \"a3\", 10);\n\t\tht.add(1231, \"a3_n2\", 11);\n\n\t\tht.add(3828540, \"a4\", 10);\n\t\tht.add(2234645, \"a5\", 10);\n\t\tht.add(8424878, \"a6\", 10);\n\t\tht.add(4174861, \"a7\", 10);\n\t\tht.add(7013344, \"a8\", 10);\n\n\t\tht.merge();\n\t}\n\n\t{\n\t\thash_table2::builder ht(\"main_index2\", 1);\n\n\t\tht.truncate();\n\n\t\tht.add(123, \"b1\", 11);\n\t\tht.add(1230, \"b2\", 12);\n\t\tht.add(1231, \"b3\", 9);\n\t\tht.add(1231, \"b3\", 8);\n\n\t\tht.add(8321508, \"b4\", 10);\n\t\tht.add(7309646, \"b5\", 10);\n\t\tht.add(2809224, \"b6\", 10);\n\t\tht.add(6543485, \"b7\", 10);\n\t\tht.add(6078858, \"b8\", 10);\n\n\t\tht.merge();\n\t}\n\n\t{\n\t\thash_table2::builder ht(\"main_index2\", 1);\n\n\t\tht.get_shard(0)->merge_with(\"./0/hash_table/ht_main_index_0.pos\", \"./0/hash_table/ht_main_index_0.data\");\n\t}\n\t{\n\t\thash_table2::hash_table ht(\"main_index2\", 1);\n\n\t\tBOOST_CHECK_EQUAL(ht.find(123), \"b1\");\n\t\tBOOST_CHECK_EQUAL(ht.find(1230), \"b2\");\n\t\tBOOST_CHECK_EQUAL(ht.find(1231), \"a3_n2\");\n\t\tBOOST_CHECK_EQUAL(ht.find(6543485), \"b7\");\n\t\tBOOST_CHECK_EQUAL(ht.find(2234645), \"a5\");\n\t}\n\n}\n\nBOOST_AUTO_TEST_CASE(remove_record) {\n\n\t{\n\t\thash_table2::builder ht(\"main_index\", 1);\n\n\t\tht.truncate();\n\n\t\tht.add(10000, \"data1\", 10);\n\t\tht.add(10001, \"data2\", 10);\n\t\tht.add(10002, \"data3\", 10);\n\n\t\tht.merge();\n\t}\n\n\t{\n\t\thash_table2::hash_table ht(\"main_index\", 1);\n\n\t\tBOOST_CHECK_EQUAL(ht.find(10000), \"data1\");\n\t\tBOOST_CHECK_EQUAL(ht.find(10001), \"data2\");\n\t\tBOOST_CHECK_EQUAL(ht.find(10002), \"data3\");\n\t}\n\n\t{\n\t\thash_table2::builder ht(\"main_index\", 1);\n\n\t\tht.remove(10001);\n\n\t\tht.merge();\n\t}\n\n\t{\n\t\thash_table2::hash_table ht(\"main_index\", 1);\n\n\t\tBOOST_CHECK_EQUAL(ht.find(10000), \"data1\");\n\t\tBOOST_CHECK_EQUAL(ht.find(10001), \"\");\n\t\tBOOST_CHECK_EQUAL(ht.find(10002), 
\"data3\");\n\t}\n\n}\n\nBOOST_AUTO_TEST_CASE(remove_record2) {\n\n\t{\n\t\thash_table2::builder ht(\"main_index\", 1);\n\n\t\tht.truncate();\n\n\t\tht.add(10000, \"data1\", 10);\n\t\tht.add(10001, \"data2\", 10);\n\t\tht.add(10002, \"data3\", 10);\n\n\t\tht.merge();\n\t}\n\n\t{\n\t\thash_table2::builder ht(\"main_index2\", 1);\n\n\t\tht.truncate();\n\n\t\tht.add(10000, \"data1\", 10);\n\t\tht.add(10002, \"data3\", 10);\n\n\t\tht.merge();\n\t}\n\n\t{\n\t\thash_table2::hash_table ht(\"main_index\", 1);\n\n\t\tBOOST_CHECK_EQUAL(ht.find(10000), \"data1\");\n\t\tBOOST_CHECK_EQUAL(ht.find(10001), \"data2\");\n\t\tBOOST_CHECK_EQUAL(ht.find(10002), \"data3\");\n\t}\n\n\t{\n\t\thash_table2::hash_table ht(\"main_index2\", 1);\n\n\t\tBOOST_CHECK_EQUAL(ht.find(10000), \"data1\");\n\t\tBOOST_CHECK_EQUAL(ht.find(10002), \"data3\");\n\t}\n\n\t{\n\t\thash_table2::builder ht(\"main_index\", 1);\n\n\t\tht.remove(10001);\n\n\t\tht.merge();\n\t}\n\n\t{\n\t\thash_table2::hash_table ht1(\"main_index\", 1);\n\t\thash_table2::hash_table ht2(\"main_index\", 1);\n\n\t\tsize_t total_size1 = 0;\n\t\tht1.for_each_shard([&total_size1](auto shard) {\n\t\t\ttotal_size1 += shard->file_size();\n\t\t});\n\n\t\tsize_t total_size2 = 0;\n\t\tht2.for_each_shard([&total_size2](auto shard) {\n\t\t\ttotal_size2 += shard->file_size();\n\t\t});\n\n\t\tBOOST_CHECK_EQUAL(total_size1, total_size2);\n\t}\n\n}\n\nBOOST_AUTO_TEST_CASE(for_each_key) {\n\n\t{\n\t\thash_table2::builder ht(\"main_index\", 1);\n\n\t\tht.truncate();\n\n\t\tht.add(100, \"data1\");\n\t\tht.add(101, \"other data\");\n\t\tht.add(102, \"data3\");\n\n\t\tht.merge();\n\t}\n\n\t{\n\t\thash_table2::hash_table ht(\"main_index\", 1);\n\n\t\tint num = 0;\n\t\tht.for_each_key([&num](uint64_t key) {\n\t\t\tBOOST_CHECK(key == 100 || key == 101 || key == 102);\n\t\t\tnum++;\n\t\t});\n\n\t\tBOOST_CHECK_EQUAL(num, 3);\n\t}\n\n}\n\nBOOST_AUTO_TEST_SUITE_END()\n"
  },
  {
    "path": "tests/test_html_parser.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include <boost/test/unit_test.hpp>\n#include \"parser/html_parser.h\"\n#include \"text/text.h\"\n#include \"file/file.h\"\n\nusing namespace std;\n\nBOOST_AUTO_TEST_SUITE(html_parser)\n\nBOOST_AUTO_TEST_CASE(html_parse1) {\n\tparser::html_parser parser;\n\n\tparser.parse(\"<title>test1</title>\");\n\tBOOST_CHECK_EQUAL(parser.title(), \"test1\");\n\n\tparser.parse(\"<title>test1</title><h1>test2</h1>\");\n\tBOOST_CHECK_EQUAL(parser.h1(), \"test2\");\n\n\tparser.parse(\"he oisjdf osdjfo idjsofi djsof<h1></h1>\");\n\tBOOST_CHECK_EQUAL(parser.title(), \"\");\n\tBOOST_CHECK_EQUAL(parser.h1(), \"\");\n\n\tparser.parse(\"<html><title>test1</title><meta name=\\\"description\\\" content=\\\"Recensioner av Vår vid sommen och andra 
böcker.\\\"></html>\");\n\tBOOST_CHECK_EQUAL(parser.meta(), \"Recensioner av Vår vid sommen och andra böcker\");\n\n\tparser.parse(file::read_test_file(\"test1.html\"));\n\tBOOST_CHECK_EQUAL(parser.meta(), \"Pris: 199 kr. Inbunden, 2021. Finns i lager. Köp Sammetsdiktaturen : motstånd och medlöpare i dagens Ryssland av Anna-Lena Laurén på Bokus.com. Boken har 3 st läsarrecensioner\");\n\n\tparser.parse(\"<title>test1</title><h1><span>Hej Hopp</span></h1>\");\n\tBOOST_CHECK_EQUAL(parser.h1(), \"Hej Hopp\");\n\n\tparser.parse(\"<html><title>test1</title><h1>test2</h1> lite text efter</html>\");\n\tBOOST_CHECK_EQUAL(parser.text(), \"lite text efter\");\n}\n\nBOOST_AUTO_TEST_CASE(html_parse2) {\n\tparser::html_parser parser;\n\n\tparser.parse(file::read_test_file(\"test5.html\"));\n\tBOOST_CHECK_EQUAL(parser.text().substr(0, 50),\n\t\tstring(\"Nya lån 2021 Nya lån 2020 Nya lån 2019 Nya lån 2018 Nya lån 2017 Nya lån 2016 Uppdaterad 2021-10-01.\").substr(0, 50));\n\n\tparser.parse(file::read_test_file(\"test6.html\"));\n}\n\nBOOST_AUTO_TEST_CASE(html_parse3) {\n\tparser::html_parser parser;\n\n\tparser.parse(file::read_test_file(\"test7.html\"));\n\tBOOST_CHECK_EQUAL(parser.text().substr(0, 20), \"Add to wishlist Adde\");\n\n}\n\nBOOST_AUTO_TEST_CASE(html_parse4) {\n\tparser::html_parser parser;\n\n\tparser.parse(file::read_test_file(\"test8.html\"));\n\tBOOST_CHECK_EQUAL(parser.text().substr(0, 107), \"Hacker News new | past | comments | ask | show | jobs | submit login 1. 
Apple Broke Up with Me ( merecivili\");\n\n}\n\nBOOST_AUTO_TEST_CASE(html_parse5) {\n\tparser::html_parser parser;\n\n\tparser.parse(file::read_test_file(\"test10.html\"));\n\n\tBOOST_CHECK_EQUAL(parser.meta(), \"\");\n\tBOOST_CHECK_EQUAL(parser.title(), \"Association for Progressive Communications | Internet for social justice and sustainable development\");\n\tBOOST_CHECK_EQUAL(parser.h1(), \"\");\n\n}\n\nBOOST_AUTO_TEST_CASE(html_parse6) {\n\tparser::html_parser parser;\n\n\tparser.parse(file::read_test_file(\"test11.html\"));\n\n\tBOOST_CHECK_EQUAL(parser.meta(), \"Svenska Dagbladet står för seriös och faktabaserad kvalitetsjournalistik som utmanar, ifrågasätter och inspirerar\");\n\tBOOST_CHECK_EQUAL(parser.title(), \"SvD | Sveriges kvalitetssajt för nyheter\");\n\n}\n\nBOOST_AUTO_TEST_CASE(html_parse7) {\n\tparser::html_parser parser;\n\n\tparser.parse(file::read_test_file(\"test12.html\"));\n\n\tBOOST_CHECK_EQUAL(parser.meta(), \"The systematic thinking in our industry is that settings are the result of design failure. As designers, our goal is to create product experiences that don’t require any adjustment by the user. So offering customization options is often seen as a failure to make firm product decisions. I think there is a misunderstanding about what settings really are\");\n\tBOOST_CHECK_EQUAL(parser.title(), \"Settings are not a design failure\");\n\n}\n\nBOOST_AUTO_TEST_CASE(html_parse_links) {\n\n\tstring html;\n\tvector<parser::html_link> links;\n\n\tstring test2_html = file::read_test_file(\"test2.html\");\n\n\tparser::html_parser parser;\n\tparser.parse(test2_html);\n\tBOOST_CHECK_EQUAL(parser.title(), \"Resebyrån Främmande Världar - L. D. Lapinski - inbunden (9789178937943) | Adlibris Bokhandel\");\n\tBOOST_CHECK_EQUAL(parser.meta(), \"inbunden, 2021. Köp boken Resebyrån Främmande Världar av L. D. Lapinski (ISBN 9789178937943) hos Adlibris. Fraktfritt över 229 kr Alltid bra priser och snabb leverans. 
| Adlibris\");\n\tBOOST_CHECK_EQUAL(parser.h1(), \"Resebyrån Främmande Världar - inbunden, Svenska, 2021\");\n\n\tBOOST_CHECK_EQUAL(parser.text(), \"\");\n\tBOOST_CHECK(parser.should_insert());\n\n\tstring test4_html = file::read_test_file(\"test4.html\");\n\tparser.parse(test4_html);\n\tBOOST_CHECK_EQUAL(parser.title(), \"Corona – samlad information för privatpersoner | Skatteverket\");\n\tBOOST_CHECK_EQUAL(parser.h1(), \"Corona – information för privatpersoner\");\n\tBOOST_CHECK_EQUAL(parser.meta(), \"Här har vi samlat information för privatpersoner som påverkas av corona på olika sätt\");\n\tBOOST_CHECK(parser.should_insert());\n\n\tstring stackoverflow_html = file::read_test_file(\"stackoverflow.html\");\n\tparser.parse(stackoverflow_html);\n\tBOOST_CHECK_EQUAL(parser.title(), \"node.js - How to use Async and Await with AWS SDK Javascript - Stack Overflow\");\n\tBOOST_CHECK_EQUAL(parser.h1(), \"How to use Async and Await with AWS SDK Javascript\");\n\tBOOST_CHECK_EQUAL(parser.meta(), \"I am working with the AWS SDK using the KMS libary. I would like to use async and await instead of callbacks. import AWS, { KMS } from \\\"aws-sdk\\\"; this.kms = new AWS.KMS(); const key = await this\");\n\tBOOST_CHECK(parser.should_insert());\n\n\thtml = file::read_test_file(\"hallakonsument.html\");\n\tparser.parse(html, \"https://www.hallakonsument.se/konsumentratt-kopsatt/innan-du-tar-ett-lan/\");\n\tBOOST_CHECK_EQUAL(parser.title(), \"Innan du tar ett lån | Hallå konsument – Konsumentverket\");\n\tBOOST_CHECK_EQUAL(parser.h1(), \"Innan du tar ett lån\");\n\tBOOST_CHECK_EQUAL(parser.meta(), \"Om du har ett behov av att låna pengar är det viktigt att läsa på om vilken typ av lån som passar dig. 
Prata med flera banker, jämför villkoren och kostnaderna för olika lån\");\n\tBOOST_CHECK(parser.should_insert());\n\n\tlinks = parser.links();\n\tbool found_link = false;\n\tfor (const auto &link : links) {\n\t\tif (link.target_host() == \"konsumenternas.se\" &&\n\t\t\tlink.target_path() == \"/lan--betalningar/lan/sa-fungerar-ett-lan/forhandsinformation/\" &&\n\t\t\tlink.text() == \"Läs mer om förhandsinformation på webbplatsen konsumenternas.se\") {\n\t\t\tfound_link = true;\n\t\t}\n\t}\n\n\tBOOST_CHECK(found_link);\n\n\thtml = file::read_test_file(\"konsumenternas.html\");\n\tparser.parse(html, \"https://www.konsumenternas.se/lan--betalningar/lan/\");\n\tBOOST_CHECK_EQUAL(parser.title(), \"Lån\");\n\tBOOST_CHECK_EQUAL(parser.h1(), \"Lån\");\n\tBOOST_CHECK_EQUAL(parser.meta(), \"Att låna pengar kan vara ett sätt att finansiera något som du behöver eller gärna vill köpa, men inte har råd att betala direkt. Men ett lån kostar pengar i form av avgifter och räntor\");\n\tBOOST_CHECK(parser.should_insert());\n\n\tlinks = parser.links();\n\tfound_link = false;\n\tfor (const auto &link : links) {\n\t\tif (link.target_host() == \"konsumenternas.us17.list-manage.com\" &&\n\t\t\tlink.target_path() == \"/subscribe?u=a63ab96c95e9b06c9a857d5f9&id=132436ec8d\" &&\n\t\t\tlink.text() == \"Nyhetsbrev\") {\n\t\t\tfound_link = true;\n\t\t}\n\t}\n\tBOOST_CHECK(found_link);\n\n\thtml = file::read_test_file(\"sbab.html\");\n\tparser.parse(html, \"https://www.sbab.se/1/privat/lana/privatlan/privatlan_-_sa_funkar_det.html#/berakna_manadskostnad\");\n\tBOOST_CHECK_EQUAL(parser.title(), \"Privatlån - låna pengar till bra ränta - SBAB\");\n\tBOOST_CHECK_EQUAL(parser.h1(), \"Privatlån – låna pengar till bra ränta\");\n\tBOOST_CHECK_EQUAL(parser.meta(), \"Ansök om ett privatlån mellan 30 000 och 500 000 kronor. Låna pengar utan säkerhet. 
Ansök och få besked direkt\");\n\tBOOST_CHECK(parser.should_insert());\n\n\tlinks = parser.links();\n\tfound_link = false;\n\tfor (const auto &link : links) {\n\t\tif (link.target_host() == \"sbab.kundo.se\" &&\n\t\t\tlink.target_path() == \"/org/sbab/\" &&\n\t\t\tlink.text() == \"Kundforum\") {\n\t\t\tfound_link = true;\n\t\t}\n\t}\n\tBOOST_CHECK(found_link);\n\n\thtml = file::read_test_file(\"kronofogden.html\");\n\tparser.parse(html, \"https://www.kronofogden.se/82374.html\");\n\tBOOST_CHECK_EQUAL(parser.title(), \"Fem tips om ekonomin förändras | Kronofogden\");\n\tBOOST_CHECK_EQUAL(parser.h1(), \"Fem tips om ekonomin förändras\");\n\tBOOST_CHECK_EQUAL(parser.meta(), \"\");\n\tBOOST_CHECK(parser.should_insert());\n\n\tlinks = parser.links();\n\tfound_link = false;\n\tfor (const auto &link : links) {\n\t\tif (link.target_host() == \"hallakonsument.se\" &&\n\t\t\tlink.target_path() == \"/\" &&\n\t\t\tlink.text() == \"Välkommen till Hallå konsument\") {\n\t\t\tfound_link = true;\n\t\t}\n\t}\n\tBOOST_CHECK(found_link);\n\n\thtml = file::read_test_file(\"uppsala.html\");\n\tparser.parse(html, \"https://www.uppsala.se/stod-och-omsorg/privatekonomi-och-ekonomiskt-stod/boka-tid-for-budget--och-skuldradgivning/\");\n\tBOOST_CHECK_EQUAL(parser.title(), \"Budget- och skuldrådgivning hos Konsument Uppsala - Uppsala kommun\");\n\tBOOST_CHECK_EQUAL(parser.h1(), \"Budget- och skuldrådgivning hos Konsument Uppsala\");\n\tBOOST_CHECK_EQUAL(parser.meta(), \"Om du vill göra din egen hushållsbudget, vill ha ekonomisk rådgivning eller har skulder och inte får pengarna att räcka till kan du vända dig till Konsument Uppsala. 
\");\n\tBOOST_CHECK(parser.should_insert());\n\n\tlinks = parser.links();\n\tfound_link = false;\n\tfor (const auto &link : links) {\n\t\tif (link.target_host() == \"outlook.office365.com\" &&\n\t\t\tlink.target_path() == \"/owa/calendar/Budgetochskuldrdgivning@uppsalakommun1.onmicrosoft.com/bookings/\" &&\n\t\t\tlink.text() == \"Boka tid online\") {\n\t\t\tfound_link = true;\n\t\t}\n\t}\n\tBOOST_CHECK(found_link);\n\n\thtml = file::read_test_file(\"chessgames.com\");\n\tparser.parse(html, \"http://store.chessgames.com/chess-books/chess-notation-type/an---algebraic/author/s/alexander-cherniaev-anatoly-karpov-joe-gallagher-joel-r.-steed-miguel-a.-sanchez-richard-obrien/hardware-requirements/windows.html\");\n\tBOOST_CHECK_EQUAL(parser.title(), \"Chess Books : Windows, AN - Algebraic, Alexander Cherniaev, Anatoly Karpov, Joe Gallagher, Joel R. Steed, Miguel A. Sanchez and Richard O'Brien\");\n\tBOOST_CHECK_EQUAL(parser.h1(), \"Chess Books\");\n\tBOOST_CHECK_EQUAL(parser.meta(), \"Shop for Chess Books at US Chess Federation Sales. We offer the widest selection of Chess Books at the lowest prices with same-day shipping.Windows, AN - Algebraic, Alexander Cherniaev, Anatoly Karpov, Joe Gallagher, Joel R. Steed, Miguel A. 
Sanchez and Richard O'Brien\");\n\n\tBOOST_CHECK_EQUAL(parser.links().size(), 0);\n\tBOOST_CHECK(parser.should_insert());\n\n\thtml = file::read_test_file(\"acomesf.org\");\n\tparser.parse(html, \"http://acomesf.org/download/42104960-3er-congreso-acomesf/\");\n\tBOOST_CHECK_EQUAL(parser.title(), \"42104960 3er Congreso ACOMESF | Asociación Colombiana de Médicos Especialistas en Salud Familiar (ACOMESF\");\n\tBOOST_CHECK_EQUAL(parser.h1(), \"42104960 3er Congreso ACOMESF\");\n\tBOOST_CHECK_EQUAL(parser.meta(), \"\");\n\tBOOST_CHECK(parser.should_insert());\n\n\thtml = file::read_test_file(\"automobileszone.com\");\n\tparser.parse(html, \"http://automobileszone.com/wp-login.php?redirect_to=http%3A%2F%2Fautomobileszone.com%2Fbest-bronco-build-off-our-editors-weigh-in-on-their-ideal-suvs%2F\");\n\tBOOST_CHECK_EQUAL(parser.text(), \"Username or Email Address Password Remember Me Lost your password? ← Back to Automobiles Zone Log in with WordPress.com\");\n\tBOOST_CHECK(parser.should_insert());\n\n\thtml = file::read_test_file(\"vcareprojectmanagement.com\");\n\tparser.parse(html, \"https://vcareprojectmanagement.com/products/project-manager-project-management-certification-pmi-atp-authorised-training-provider-pmp-capm-2021-online-training-course-class\");\n\tBOOST_CHECK_EQUAL(parser.h1(), \"\");\n\tBOOST_CHECK_EQUAL(parser.text(), \"\");\n}\n\nBOOST_AUTO_TEST_CASE(html_parser_encodings) {\n\n\tparser::html_parser parser;\n\tBOOST_CHECK(!parser.is_exotic_language(\"hej jag heter josef cullhed\"));\n\tBOOST_CHECK(!parser.is_exotic_language(\"åäö\"));\n\tBOOST_CHECK(!parser.is_exotic_language(\"Đảng,Đoàn thể - tnxp.hochiminhcity.gov.vn\"));\n\tBOOST_CHECK(!parser.is_exotic_language(\"Maktspelet i Volvo : en skildring inifr&aring;n - Hans Nyman - Kartonnage (9789189323056) | Bokus\"));\n\n\tBOOST_CHECK(parser.is_exotic_language(\"В КФУ проходят съемки короткометражного фильма в рамках проекта «Кино за 7 дней» | 
ВидеоПрокат+\"));\n\tBOOST_CHECK(parser.is_exotic_language(\"2015-09-09から1日間の記事一覧 - Nani-Sore　何それ？\"));\n\tBOOST_CHECK(parser.is_exotic_language(\"Ремонт Принтеров Hp в Спб Адреса | Ремонт принтеров\"));\n}\n\nBOOST_AUTO_TEST_CASE(html_parser_long_text) {\n\n\tparser::html_parser parser(100000);\n\tstring html = file::read_test_file(\"zlib_manual.html\");\n\n\tparser.parse(html, \"https://zlib.net/manual.html\");\n\n\tstring text = parser.text();\n\tBOOST_CHECK_EQUAL(text.substr(text.size() - 14), \"# endif #endif\");\n\n\tvector<string> words = text::get_expanded_full_text_words(text);\n\n\tbool has_word = false;\n\tfor (const string &word : words) {\n\t\tif (word == \"inflateinit2\") has_word = true;\n\t}\n\n\tBOOST_CHECK(has_word);\n}\n\n/*\n\ttest these links: <a href=\"http://skatteverket.se/\">Skatteverket</A>\n\there: http://nomell.se/2009/03/24/prisa-gud-har-kommer-skatteaterbaringen/\n*/\n\nBOOST_AUTO_TEST_SUITE_END()\n"
  },
  {
    "path": "tests/test_hyper_ball.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include <boost/test/unit_test.hpp>\n#include \"algorithm/hyper_ball.h\"\n#include \"algorithm/algorithm.h\"\n#include <set>\n#include <vector>\n\nusing namespace std;\n\nBOOST_AUTO_TEST_SUITE(hyper_ball)\n\nBOOST_AUTO_TEST_CASE(harmonic_centrality_hyper_ball) {\n\n\t{\n\t\tset<pair<uint32_t, uint32_t>> e = {\n\t\t\tstd::make_pair(0, 1),\n\t\t\tstd::make_pair(1, 2),\n\t\t\tstd::make_pair(2, 0),\n\t\t\tstd::make_pair(2, 3),\n\t\t\tstd::make_pair(3, 4),\n\t\t\tstd::make_pair(3, 5),\n\t\t\tstd::make_pair(4, 2),\n\t\t\tstd::make_pair(5, 4),\n\t\t};\n\t\tconst size_t n = 1000;\n\t\tvector<uint32_t> *edge_map = algorithm::set_to_edge_map(n, e);\n\t\tvector<double> h = algorithm::hyper_ball(n, edge_map);\n\t\tdelete [] edge_map;\n\t\tBOOST_CHECK(h.size() == 
n);\n\t\tBOOST_CHECK_CLOSE(h[0], 8.0/3.0, 0.000001);\n\t\tBOOST_CHECK_CLOSE(h[1], 7.0/3.0, 0.000001);\n\t\tBOOST_CHECK_CLOSE(h[2], 7.0/2.0, 0.000001);\n\t\tBOOST_CHECK_EQUAL(h[6], 0.0);\n\t}\n\n}\n\nBOOST_AUTO_TEST_CASE(harmonic_centrality_hyper_ball2) {\n\n\t{\n\t\tset<pair<uint32_t, uint32_t>> e = {\n\t\t\tstd::make_pair(0, 1),\n\t\t\tstd::make_pair(1, 5),\n\t\t\tstd::make_pair(2, 5),\n\t\t\tstd::make_pair(3, 2),\n\t\t\tstd::make_pair(6, 2),\n\t\t\tstd::make_pair(7, 3),\n\t\t\tstd::make_pair(10, 7),\n\t\t\tstd::make_pair(7, 9),\n\t\t\tstd::make_pair(9, 3),\n\t\t\tstd::make_pair(9, 6),\n\t\t\tstd::make_pair(8, 9),\n\t\t\tstd::make_pair(4, 8),\n\t\t};\n\t\tconst size_t n = 1000;\n\t\tvector<uint32_t> *edge_map = algorithm::set_to_edge_map(n, e);\n\t\tvector<double> h = algorithm::hyper_ball(n, edge_map);\n\t\tdelete [] edge_map;\n\t\tBOOST_CHECK(h.size() == n);\n\t\tBOOST_CHECK_CLOSE(h[5], 4.86666666667, 0.000001);\n\t\tBOOST_CHECK_CLOSE(h[8], 1.0, 0.000001);\n\t\tBOOST_CHECK_CLOSE(h[2], 3.91666666667, 0.000001);\n\t}\n\n}\n\nBOOST_AUTO_TEST_CASE(harmonic_centrality_hyper_ball3) {\n\n\t{\n\t\tset<pair<uint32_t, uint32_t>> e = {\n\t\t\tstd::make_pair(0, 11),\n\t\t\tstd::make_pair(1, 0),\n\t\t\tstd::make_pair(2, 1),\n\t\t\tstd::make_pair(3, 2),\n\t\t\tstd::make_pair(3, 8),\n\t\t\tstd::make_pair(4, 7),\n\t\t\tstd::make_pair(5, 7),\n\t\t\tstd::make_pair(6, 7),\n\t\t\tstd::make_pair(7, 8),\n\t\t\tstd::make_pair(10, 12),\n\t\t\tstd::make_pair(11, 1),\n\t\t\tstd::make_pair(11, 10),\n\t\t\tstd::make_pair(12, 25),\n\t\t\tstd::make_pair(13, 9),\n\t\t\tstd::make_pair(13, 14),\n\t\t\tstd::make_pair(14, 9),\n\t\t\tstd::make_pair(14, 8),\n\t\t\tstd::make_pair(14, 15),\n\t\t\tstd::make_pair(15, 7),\n\t\t\tstd::make_pair(19, 15),\n\t\t\tstd::make_pair(20, 21),\n\t\t\tstd::make_pair(21, 16),\n\t\t\tstd::make_pair(21, 17),\n\t\t\tstd::make_pair(21, 18),\n\t\t\tstd::make_pair(21, 22),\n\t\t\tstd::make_pair(22, 23),\n\t\t\tstd::make_pair(23, 19),\n\t\t\tstd::make_pair(24, 
20),\n\t\t\tstd::make_pair(24, 21),\n\t\t\tstd::make_pair(24, 25),\n\t\t\tstd::make_pair(25, 24),\n\t\t};\n\t\tconst size_t n = 1000;\n\t\tvector<uint32_t> *edge_map = algorithm::set_to_edge_map(n, e);\n\t\tvector<double> h = algorithm::hyper_ball(n, edge_map);\n\t\tdelete [] edge_map;\n\t\tBOOST_CHECK(h.size() == n);\n\t\tBOOST_CHECK_CLOSE(h[0], 2.33333333333, 0.000001);\n\t\tBOOST_CHECK_CLOSE(h[7], 7.25156232656, 0.000001);\n\t}\n\n}\n\nBOOST_AUTO_TEST_SUITE_END()\n\n"
  },
  {
    "path": "tests/test_hyper_log_log.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include <boost/test/unit_test.hpp>\n#include \"algorithm/hyper_log_log.h\"\n#include <cstdlib>\n#include <vector>\n\nusing namespace std;\n\nBOOST_AUTO_TEST_SUITE(hyper_log_log)\n\nBOOST_AUTO_TEST_CASE(hyper_simple) {\n\t{\n\t\talgorithm::hyper_log_log hl;\n\n\t\tBOOST_CHECK(hl.leading_zeros_plus_one(0x0ull) == 65);\n\t\tBOOST_CHECK(hl.leading_zeros_plus_one(0x1ull) == 64);\n\t\tBOOST_CHECK(hl.leading_zeros_plus_one(0xFFFFFFFFull) == 33);\n\t\tBOOST_CHECK(hl.leading_zeros_plus_one(0xFFFFFFFFull) == 33);\n\t}\n}\n\nBOOST_AUTO_TEST_CASE(hyper_inserts) {\n\n\t{\n\t\talgorithm::hyper_log_log 
hl;\n\t\thl.insert(0);\n\t\thl.insert(1);\n\t\thl.insert(2);\n\t\thl.insert(3);\n\t\thl.insert(4);\n\t\thl.insert(5);\n\t\thl.insert(6);\n\n\t\talgorithm::hyper_log_log hl2;\n\t\thl2.insert(0);\n\t\thl2.insert(1);\n\t\thl2.insert(2);\n\t\thl2.insert(3);\n\t\thl2.insert(4);\n\t\thl2.insert(5);\n\t\thl2.insert(7);\n\n\t\talgorithm::hyper_log_log hl3 = hl + hl2;\n\t}\n\n\tvector<size_t> intervals = {400000, 500000, 1000000, 10000000};\n\n\tfor (size_t interval : intervals) {\n\t\talgorithm::hyper_log_log hl;\n\t\tfor (size_t i = 0; i < interval; i++) {\n\t\t\thl.insert(i);\n\t\t}\n\t\tBOOST_CHECK(std::abs((int)hl.count() - (int)interval) < interval * hl.error_bound());\n\t}\n\n}\n\nBOOST_AUTO_TEST_CASE(hyper_union) {\n\talgorithm::hyper_log_log hl1;\n\talgorithm::hyper_log_log hl2;\n\n\tfor (size_t i = 0; i < 250000; i++) {\n\t\thl1.insert(i);\n\t}\n\tfor (size_t i = 250000; i < 500000; i++) {\n\t\thl2.insert(i);\n\t}\n\n\talgorithm::hyper_log_log hl3 = hl1 + hl2;\n\tBOOST_CHECK(std::abs((int)hl3.count() - 500000) < 500000 * hl3.error_bound());\n}\n\nBOOST_AUTO_TEST_CASE(hyper_log_log_data_copy) {\n\talgorithm::hyper_log_log hl1;\n\n\tfor (size_t i = 0; i < 250000; i++) {\n\t\thl1.insert(i);\n\t}\n\n\talgorithm::hyper_log_log hl2(hl1.data(), hl1.b());\n\n\tBOOST_CHECK(std::abs((int)hl2.count() - 250000) < 250000 * hl1.error_bound());\n\n\tstd::vector<size_t> sizes = {25000, 50000, 75000, 100000, 200000, 300000, 400000};\n\n\tsrand(100);\n\tfor (size_t size : sizes) {\n\t\talgorithm::hyper_log_log hll;\n\t\tfor (size_t i = 0; i < size; i++) {\n\t\t\tsize_t rnd = (((size_t)rand()) << 32) | ((size_t)rand());\n\t\t\thll.insert(rnd);\n\t\t}\n\t\tBOOST_CHECK(std::abs((int)hll.count() - (int)size) < size * hl1.error_bound());\n\t}\n}\n\nBOOST_AUTO_TEST_CASE(hyper_log_log_test2) {\n\talgorithm::hyper_log_log hl1(10);\n\n\tconst int sz = 100000;\n\n\tfor (size_t i = 0; i < sz; i++) {\n\t\thl1.insert(rand());\n\t}\n\n\tBOOST_CHECK(std::abs((int)hl1.count() - sz) < sz * 
hl1.error_bound());\n}\n\nBOOST_AUTO_TEST_CASE(hyper_log_log_move) {\n\talgorithm::hyper_log_log hl1(10);\n\n\tconst int sz = 100000;\n\n\tfor (size_t i = 0; i < sz; i++) {\n\t\thl1.insert(rand());\n\t}\n\n\tauto hl2 = std::move(hl1);\n\n\tBOOST_CHECK(std::abs((int)hl2.count() - sz) < sz * hl1.error_bound());\n}\n\nBOOST_AUTO_TEST_SUITE_END()\n"
  },
  {
    "path": "tests/test_index_builder.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include <boost/test/unit_test.hpp>\n#include \"file/file.h\"\n#include \"indexer/index_builder.h\"\n#include \"indexer/index.h\"\n#include \"indexer/generic_record.h\"\n#include \"indexer/value_record.h\"\n\nBOOST_AUTO_TEST_SUITE(test_index_builder)\n\nBOOST_AUTO_TEST_CASE(test_merge_with) {\n\n\tfile::delete_directory(\"./0/full_text/test_index\");\n\tfile::create_directory(\"./0/full_text/test_index\");\n\n\t{\n\t\tindexer::index_builder<indexer::value_record> idx(\"test_index\", 0, 1000);\n\n\t\tidx.add(123, indexer::value_record(1000));\n\t\tidx.add(123, indexer::value_record(1001));\n\t\tidx.add(124, indexer::value_record(1000));\n\n\t\tidx.append();\n\t\tidx.merge();\n\t}\n\t{\n\t\tindexer::index<indexer::value_record> idx(\"test_index\", 0, 
1000);\n\n\t\tauto res1 = idx.find(123);\n\t\tauto res2 = idx.find(124);\n\n\t\tBOOST_REQUIRE_EQUAL(res1.size(), 2);\n\t\tBOOST_REQUIRE_EQUAL(res2.size(), 1);\n\n\t\tBOOST_CHECK_EQUAL(res1[0].m_value, 1000);\n\t\tBOOST_CHECK_EQUAL(res1[1].m_value, 1001);\n\t\tBOOST_CHECK_EQUAL(res2[0].m_value, 1000);\n\t}\n\t{\n\t\tindexer::index_builder<indexer::value_record> idx(\"test_index\", 8, 1000);\n\n\t\tidx.add(123, indexer::value_record(1002));\n\t\tidx.add(123, indexer::value_record(1003));\n\t\tidx.add(124, indexer::value_record(1010));\n\t\tidx.add(125, indexer::value_record(1011));\n\n\t\tidx.append();\n\t\tidx.merge();\n\t}\n\n\t{\n\t\tindexer::index_builder<indexer::value_record> idx1(\"test_index\", 0, 1000);\n\t\tindexer::index<indexer::value_record> idx2(\"test_index\", 8, 1000);\n\n\t\tidx1.merge_with(idx2);\n\t}\n\n\t{\n\t\tindexer::index<indexer::value_record> idx(\"test_index\", 0, 1000);\n\n\t\tauto res1 = idx.find(123);\n\t\tauto res2 = idx.find(124);\n\t\tauto res3 = idx.find(125);\n\n\t\tBOOST_REQUIRE_EQUAL(res1.size(), 4);\n\t\tBOOST_REQUIRE_EQUAL(res2.size(), 2);\n\t\tBOOST_REQUIRE_EQUAL(res3.size(), 1);\n\n\t\tBOOST_CHECK_EQUAL(res1[0].m_value, 1000);\n\t\tBOOST_CHECK_EQUAL(res1[1].m_value, 1001);\n\t\tBOOST_CHECK_EQUAL(res1[2].m_value, 1002);\n\t\tBOOST_CHECK_EQUAL(res1[3].m_value, 1003);\n\t\tBOOST_CHECK_EQUAL(res2[0].m_value, 1000);\n\t\tBOOST_CHECK_EQUAL(res2[1].m_value, 1010);\n\t\tBOOST_CHECK_EQUAL(res3[0].m_value, 1011);\n\t}\n}\n\nBOOST_AUTO_TEST_CASE(test_merge_with2) {\n\n\tfile::delete_directory(\"./0/full_text/test_index\");\n\tfile::create_directory(\"./0/full_text/test_index\");\n\n\t{\n\t\tindexer::index_builder<indexer::value_record> idx(\"test_index\", 0, 1000);\n\n\t\tidx.add(123, indexer::value_record(1000));\n\t\tidx.add(123, indexer::value_record(1001));\n\t\tidx.add(124, indexer::value_record(1000));\n\n\t\tidx.append();\n\t\tidx.merge();\n\t}\n\t{\n\t\tindexer::index<indexer::value_record> idx(\"test_index\", 0, 
1000);\n\n\t\tauto res1 = idx.find(123);\n\t\tauto res2 = idx.find(124);\n\n\t\tBOOST_REQUIRE_EQUAL(res1.size(), 2);\n\t\tBOOST_REQUIRE_EQUAL(res2.size(), 1);\n\n\t\tBOOST_CHECK_EQUAL(res1[0].m_value, 1000);\n\t\tBOOST_CHECK_EQUAL(res1[1].m_value, 1001);\n\t\tBOOST_CHECK_EQUAL(res2[0].m_value, 1000);\n\t}\n\t{\n\t\tindexer::index_builder<indexer::value_record> idx(\"test_index\", 8, 1000);\n\n\t\tidx.add(123, indexer::value_record(1002));\n\t\tidx.add(123, indexer::value_record(1003));\n\t\tidx.add(124, indexer::value_record(1010));\n\t\tidx.add(125, indexer::value_record(1011));\n\n\t\tidx.append();\n\t\tidx.merge();\n\t}\n\n\t{\n\t\tindexer::index_builder<indexer::value_record> idx1(\"test_index\", 0, 1000);\n\t\tindexer::index<indexer::value_record> idx2(\"test_index\", 8, 1000);\n\n\t\tidx1.merge_with(idx2);\n\t}\n\n\t{\n\t\tindexer::index<indexer::value_record> idx(\"test_index\", 0, 1000);\n\n\t\tauto res1 = idx.find(123);\n\t\tauto res2 = idx.find(124);\n\t\tauto res3 = idx.find(125);\n\n\t\tBOOST_REQUIRE_EQUAL(res1.size(), 4);\n\t\tBOOST_REQUIRE_EQUAL(res2.size(), 2);\n\t\tBOOST_REQUIRE_EQUAL(res3.size(), 1);\n\n\t\tBOOST_CHECK_EQUAL(res1[0].m_value, 1000);\n\t\tBOOST_CHECK_EQUAL(res1[1].m_value, 1001);\n\t\tBOOST_CHECK_EQUAL(res1[2].m_value, 1002);\n\t\tBOOST_CHECK_EQUAL(res1[3].m_value, 1003);\n\t\tBOOST_CHECK_EQUAL(res2[0].m_value, 1000);\n\t\tBOOST_CHECK_EQUAL(res2[1].m_value, 1010);\n\t\tBOOST_CHECK_EQUAL(res3[0].m_value, 1011);\n\t}\n}\n\nBOOST_AUTO_TEST_SUITE_END()\n"
  },
  {
    "path": "tests/test_index_iteration.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include <boost/test/unit_test.hpp>\n#include <mutex>\n#include \"indexer/sharded_builder.h\"\n#include \"indexer/sharded.h\"\n#include \"indexer/basic_index_builder.h\"\n#include \"indexer/basic_index.h\"\n#include \"indexer/counted_record.h\"\n\nusing namespace std;\n\nBOOST_AUTO_TEST_SUITE(test_index_iteration)\n\nBOOST_AUTO_TEST_CASE(test_index_iteration) {\n\n\t{\n\t\tindexer::sharded_builder<indexer::basic_index_builder, indexer::counted_record> idx(\"test_index\", 10);\n\t\tidx.truncate();\n\n\t\tidx.add(100, indexer::counted_record(1000));\n\t\tidx.add(101, indexer::counted_record(1001));\n\t\tidx.add(101, indexer::counted_record(1002));\n\t\tidx.add(102, 
indexer::counted_record(1003));\n\n\t\tidx.append();\n\t\tidx.merge();\n\t}\n\n\tindexer::sharded<indexer::basic_index, indexer::counted_record> idx(\"test_index\", 10);\n\n\tstd::vector<uint64_t> found_keys;\n\tstd::vector<uint64_t> found_values;\n\tstd::mutex lock;\n\tidx.for_each([&lock, &found_keys, &found_values](uint64_t key, const std::vector<indexer::counted_record> &recs) {\n\n\t\tstd::lock_guard grd(lock);\n\n\t\tfound_keys.push_back(key);\n\t\tfor (auto &rec : recs) {\n\t\t\tfound_values.push_back(rec.m_value);\n\t\t}\n\n\t});\n\n\tstd::sort(found_keys.begin(), found_keys.end());\n\tstd::sort(found_values.begin(), found_values.end());\n\n\tBOOST_CHECK(found_keys[0] == 100);\n\tBOOST_CHECK(found_keys[1] == 101);\n\tBOOST_CHECK(found_keys[2] == 102);\n\tBOOST_CHECK(found_keys.size() == 3);\n\n\tBOOST_CHECK(found_values[0] == 1000);\n\tBOOST_CHECK(found_values[1] == 1001);\n\tBOOST_CHECK(found_values[2] == 1002);\n\tBOOST_CHECK(found_values[3] == 1003);\n\tBOOST_CHECK(found_values.size() == 4);\n\n}\n\nBOOST_AUTO_TEST_CASE(test_index_iteration2) {\n\n\t{\n\t\tindexer::sharded_builder<indexer::basic_index_builder, indexer::counted_record> idx(\"test_index\", 10);\n\t\tidx.truncate();\n\n\t\tfor (size_t i = 1; i <= 10000; i++) {\n\t\t\tidx.add(i % 10, indexer::counted_record(i));\n\t\t\tidx.add(i % 100, indexer::counted_record(i));\n\t\t\tidx.add(i % 7, indexer::counted_record(i));\n\t\t\tidx.add(i % 13, indexer::counted_record(i));\n\t\t}\n\n\t\tidx.append();\n\t\tidx.merge();\n\t}\n\n\tindexer::sharded<indexer::basic_index, indexer::counted_record> idx(\"test_index\", 10);\n\n\tstd::map<uint64_t, std::vector<size_t>> records;\n\tstd::mutex lock;\n\tidx.for_each([&lock, &records](uint64_t key, const std::vector<indexer::counted_record> &recs) {\n\n\t\tstd::lock_guard grd(lock);\n\n\t\tfor (auto &rec : recs) {\n\t\t\trecords[key].push_back(rec.m_value);\n\t\t}\n\n\t});\n\n\tfor (size_t i = 1; i <= 10000; i++) {\n\t\tBOOST_CHECK(std::find(records[i % 
10].begin(), records[i % 10].end(), i) != records[i % 10].end());\n\t\tBOOST_CHECK(std::find(records[i % 100].begin(), records[i % 100].end(), i) != records[i % 100].end());\n\t\tBOOST_CHECK(std::find(records[i % 7].begin(), records[i % 7].end(), i) != records[i % 7].end());\n\t\tBOOST_CHECK(std::find(records[i % 13].begin(), records[i % 13].end(), i) != records[i % 13].end());\n\t}\n\n}\n\nBOOST_AUTO_TEST_SUITE_END()\n"
  },
  {
    "path": "tests/test_index_reader.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include <boost/test/unit_test.hpp>\n#include \"indexer/index_builder.h\"\n#include \"indexer/index.h\"\n#include \"indexer/generic_record.h\"\n#include <boost/iostreams/filtering_stream.hpp>\n#include <boost/iostreams/filter/gzip.hpp>\n#include \"URL.h\"\n#include \"text/text.h\"\n#include \"profiler/profiler.h\"\n#include \"roaring/roaring.hh\"\n\nBOOST_AUTO_TEST_SUITE(test_index_reader)\n\nBOOST_AUTO_TEST_CASE(test_index_reader1) {\n\n\t{\n\t\tindexer::index_builder<indexer::generic_record> idx(\"test_db\", 0, 1000);\n\n\t\tidx.truncate();\n\n\t\tidx.add(100, indexer::generic_record(1000));\n\t\tidx.add(100, indexer::generic_record(1001));\n\t\tidx.add(100, 
indexer::generic_record(1002));\n\n\t\tidx.append();\n\t\tidx.merge();\n\n\t}\n\n\t{\n\t\tifstream reader(\"./0/full_text/test_db/0.data\", ios::binary);\n\t\treader.seekg(0, ios::end);\n\t\tsize_t file_size = reader.tellg();\n\t\treader.seekg(0, ios::beg);\n\t\tchar *buffer = new char[file_size];\n\t\treader.read(buffer, file_size);\n\n\t\tstd::string file_data(buffer, file_size);\n\n\t\tstd::istringstream ram_reader(file_data);\n\n\t\tindexer::index<indexer::generic_record> idx(&ram_reader, 1000);\n\n\t\tvector<indexer::generic_record> res = idx.find(100);\n\n\t\tBOOST_REQUIRE(res.size() == 3);\n\t\tBOOST_CHECK(res[0].m_value == 1000);\n\t\tBOOST_CHECK(res[1].m_value == 1001);\n\t\tBOOST_CHECK(res[2].m_value == 1002);\n\n\t\tdelete[] buffer;\n\t}\n\n}\n\nBOOST_AUTO_TEST_CASE(test_index_reader_2) {\n\n\t/*\n\t{\n\t\tindexer::index_builder<indexer::url_record> idx(\"restaurantbusinessonline.com\");\n\t\tidx.set_hash_table_size(1000);\n\n\t\tidx.truncate();\n\n\t\tconst vector<size_t> cols = {1, 2, 3, 4};\n\n\t\tvector<string> files;\n\n\t\tboost::filesystem::path p (\"./output\");\n\t\tboost::filesystem::directory_iterator end_itr;\n\n\t\tfor (boost::filesystem::directory_iterator itr(p); itr != end_itr; ++itr) {\n\t\t\t// If it's not a directory, list it. 
If you want to list directories too, just remove this check.\n\t\t\tif (boost::filesystem::is_regular_file(itr->path())) {\n\t\t\t\t// assign current file name to current_file and echo it out to the console.\n\t\t\t\tstring current_file = itr->path().string();\n\t\t\t\tfiles.push_back(current_file);\n\t\t\t}\n\t\t}\n\n\t\tsize_t num_added = 0;\n\t\tsize_t num_bytes_added = 0;\n\n\t\tfor (const string &local_path : files) {\n\n\t\t\tifstream infile(local_path, ios::in);\n\t\t\tboost::iostreams::filtering_istream decompress_stream;\n\t\t\tdecompress_stream.push(boost::iostreams::gzip_decompressor());\n\t\t\tdecompress_stream.push(infile);\n\n\t\t\tstring line;\n\t\t\twhile (getline(decompress_stream, line)) {\n\t\t\t\tvector<string> col_values;\n\t\t\t\tboost::algorithm::split(col_values, line, boost::is_any_of(\"\\t\"));\n\n\t\t\t\tURL url(col_values[0]);\n\n\t\t\t\tif (url.host() != \"doodlecraftblog.com\") continue;\n\n\t\t\t\tnum_added++;\n\n\t\t\t\tuint64_t url_hash = url.hash();\n\n\t\t\t\tfor (size_t col : cols) {\n\t\t\t\t\tvector<string> words = text::get_full_text_words(col_values[col]);\n\t\t\t\t\tfor (const string &word : words) {\n\t\t\t\t\t\tnum_bytes_added += word.size();\n\t\t\t\t\t\tidx.add(::algorithm::hash(word), ::indexer::url_record(url_hash));\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\n\t\tnum_added++;\n\n\t\tcout << \"ADDED \" << num_added << \" URLS\" << endl;\n\t\tcout << num_bytes_added << \" bytes\" << endl;\n\n\t\tidx.append();\n\t\tidx.merge();\n\n\t}\n\n\t{\n\t\tlogger::verbose(true);\n\t\tprofiler::instance prof(\"load index file to ram\");\n\t\tifstream reader(\"restaurantbusinessonline.com.data\", ios::binary);\n\t\treader.seekg(0, ios::end);\n\t\tsize_t file_size = reader.tellg();\n\t\treader.seekg(0, ios::beg);\n\t\tchar *buffer = new char[file_size];\n\t\treader.read(buffer, file_size);\n\t\tprof.stop();\n\n\t\tindexer::index_reader_ram ram_reader(buffer, file_size);\n\n\t\tindexer::index<indexer::generic_record> 
idx((indexer::index_reader *)&ram_reader, 1000);\n\n\t\tcout << \"file_size: \" << file_size << endl;\n\t\tidx.print_stats();\n\n\t\tvector<indexer::generic_record> res = idx.find(::algorithm::hash(\"helicopter\"));\n\n\t\tBOOST_REQUIRE(res.size() > 0);\n\n\t\tdelete buffer;\n\t}*/\n\n}\n\nBOOST_AUTO_TEST_SUITE_END()\n"
  },
  {
    "path": "tests/test_logger.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include <boost/test/unit_test.hpp>\n#include \"logger/logger.h\"\n#include \"config.h\"\n\nusing namespace std;\n\nBOOST_AUTO_TEST_SUITE(test_logger)\n\nBOOST_AUTO_TEST_CASE(test_logger1) {\n\n\tlogger::log_string(\"test1\");\n\tlogger::log_string(\"test2\");\n\n\tlogger::sync();\n\n\tifstream logfile(config::log_file_path);\n\tlogfile.seekg(-12, std::ios::end);\n\tstring line1, line2;\n\tgetline(logfile, line1);\n\tgetline(logfile, line2);\n\n\tBOOST_CHECK_EQUAL(line1, \"test1\");\n\tBOOST_CHECK_EQUAL(line2, \"test2\");\n}\n\nBOOST_AUTO_TEST_SUITE_END()\n"
  },
  {
    "path": "tests/test_memory.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include <boost/test/unit_test.hpp>\n#include \"memory/memory.h\"\n#include \"memory/debugger.h\"\n#include \"indexer/index_builder.h\"\n#include \"indexer/basic_index_builder.h\"\n#include \"indexer/domain_link_record.h\"\n\nBOOST_AUTO_TEST_SUITE(test_memory)\n\nBOOST_AUTO_TEST_CASE(test_memory) {\n\tmemory::update();\n\n\tBOOST_CHECK(memory::get_available_memory() > 0);\n\tBOOST_CHECK(memory::get_total_memory() > 0);\n\n\tconst size_t used1 = memory::allocated_memory();\n\n\tconst size_t memlen = 1000000;\n\tchar *some_mem = new char[memlen];\n\tfor (size_t i = 0; i < memlen; i++) {\n\t\tsome_mem[i] = 1;\n\t}\n\tmemory::update();\n\n\tconst size_t used2 = memory::allocated_memory();\n\tdelete[] some_mem;\n\tconst size_t used3 = 
memory::allocated_memory();\n\n\tstd::cout << \"used1: \" << used1 << std::endl;\n\tstd::cout << \"used2: \" << used2 << std::endl;\n\tstd::cout << \"used3: \" << used3 << std::endl;\n\tBOOST_CHECK(used1 + 1000000 == used2);\n\tBOOST_CHECK(used1 == used3);\n}\n\n/*\n * Test memory consumption during merge, should end with same amount.\n * */\nBOOST_AUTO_TEST_CASE(test_indexer_memory) {\n\tmemory::update();\n\n\tindexer::create_db_directories(\"domain_link_index\");\n\n\tBOOST_CHECK(memory::get_available_memory() > 0);\n\tBOOST_CHECK(memory::get_total_memory() > 0);\n\n\tsize_t memuse1, memuse2, memuse3, memuse4;\n\tmemuse1 = memory::allocated_memory();\n\n\t{\n\t\tindexer::basic_index_builder<indexer::domain_link_record> idx(\"domain_link_index\", 97ull);\n\n\t\tmemuse2 = memory::allocated_memory();\n\t\tidx.append();\n\t\tidx.merge();\n\t\tmemuse3 = memory::allocated_memory();\n\t}\n\n\tmemuse4 = memory::allocated_memory();\n\n\tBOOST_CHECK(memuse1 == memuse4);\n\tBOOST_CHECK(memuse2 == memuse3);\n\n}\n\nBOOST_AUTO_TEST_SUITE_END()\n"
  },
  {
    "path": "tests/test_n_gram.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include <boost/test/unit_test.hpp>\n#include \"text/text.h\"\n#include \"algorithm/hash.h\"\n\nusing namespace std;\n\nBOOST_AUTO_TEST_SUITE(n_gram)\n\nBOOST_AUTO_TEST_CASE(words_to_ngram) {\n\tvector<uint64_t> ngrams;\n\ttext::words_to_ngram_hash({\"the\", \"quick\", \"brown\", \"fox\", \"jumps\", \"over\", \"the\", \"lazy\", \"dog\"}, 3, [&ngrams](const uint64_t hash) {\n\t\tngrams.push_back(hash);\n\t});\n\n\tBOOST_CHECK_EQUAL(ngrams[0], algorithm::hash(\"the\"));\n\tBOOST_CHECK_EQUAL(ngrams[1], algorithm::hash(\"the quick\"));\n\tBOOST_CHECK_EQUAL(ngrams[2], algorithm::hash(\"the quick brown\"));\n\n\tBOOST_CHECK_EQUAL(ngrams[3], algorithm::hash(\"quick\"));\n\tBOOST_CHECK_EQUAL(ngrams[4], algorithm::hash(\"quick 
brown\"));\n\tBOOST_CHECK_EQUAL(ngrams[5], algorithm::hash(\"quick brown fox\"));\n\n\tBOOST_CHECK_EQUAL(ngrams[6], algorithm::hash(\"brown\"));\n\tBOOST_CHECK_EQUAL(ngrams[7], algorithm::hash(\"brown fox\"));\n\tBOOST_CHECK_EQUAL(ngrams[8], algorithm::hash(\"brown fox jumps\"));\n\n\tBOOST_CHECK_EQUAL(ngrams[18], algorithm::hash(\"the\"));\n\tBOOST_CHECK_EQUAL(ngrams[19], algorithm::hash(\"the lazy\"));\n\tBOOST_CHECK_EQUAL(ngrams[20], algorithm::hash(\"the lazy dog\"));\n\n\tBOOST_CHECK_EQUAL(ngrams[21], algorithm::hash(\"lazy\"));\n\tBOOST_CHECK_EQUAL(ngrams[22], algorithm::hash(\"lazy dog\"));\n\tBOOST_CHECK_EQUAL(ngrams[23], algorithm::hash(\"dog\"));\n\n\tBOOST_CHECK_EQUAL(ngrams.size(), 24);\n\n}\n\nBOOST_AUTO_TEST_CASE(n_gram2) {\n\n\tvector<uint64_t> ngrams;\n\ttext::words_to_ngram_hash({\"i\", \"liberoklubben\", \"här\"}, 3, [&ngrams](const uint64_t hash, const std::string &word) {\n\t\tngrams.push_back(hash);\n\t});\n\n\tBOOST_CHECK_EQUAL(ngrams[0], algorithm::hash(\"i\"));\n\tBOOST_CHECK_EQUAL(ngrams[1], algorithm::hash(\"i liberoklubben\"));\n\tBOOST_CHECK_EQUAL(ngrams[2], algorithm::hash(\"i liberoklubben här\"));\n\n\tBOOST_CHECK_EQUAL(ngrams[3], algorithm::hash(\"liberoklubben\"));\n\tBOOST_CHECK_EQUAL(ngrams[4], algorithm::hash(\"liberoklubben här\"));\n\tBOOST_CHECK_EQUAL(ngrams[5], algorithm::hash(\"här\"));\n\n\tBOOST_CHECK_EQUAL(ngrams.size(), 6);\n}\n\nBOOST_AUTO_TEST_SUITE_END()\n"
  },
  {
    "path": "tests/test_robot_parser.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include <boost/test/unit_test.hpp>\n#include \"robots.h\"\n\nusing namespace std;\n\nBOOST_AUTO_TEST_SUITE(robot_parser)\n\nBOOST_AUTO_TEST_CASE(parse) {\n\tstd::string robots_content = \"Sitemap: https://www.omnible.se/sitemap.xml\\n\"\n\t\t\"User-agent: AlexandriaBot\\n\"\n\t\t\"Disallow: *\\n\"\n\t\t\"User-agent: *   # all agents\\n\"\n\t\t\"Disallow: /*crawl=no*\\n\"\n\t\t\"Disallow: /basket/add*\\n\"\n\t;\n\tstd::string user_agent = \"AlexandriaBot\";\n\tgooglebot::RobotsMatcher matcher;\n\tstd::string url = \"/visit\";\n\tbool allowed = matcher.OneAgentAllowedByRobots(robots_content, user_agent, url);\n\tBOOST_CHECK(!allowed);\n}\n\nBOOST_AUTO_TEST_CASE(parse2) {\n\tstd::string robots_content = string(\"Sitemap: 
https://www.omnible.se/sitemap.xml\\n\"\n\t\t\"User-agent: *\\n\"\n\t\t\"Disallow: /visit\\n\"\n\t\t\"User-agent: AlexandriaBot\\n\"\n\t\t\"Disallow: /10126597891759986715\\n\");\n\n\tstd::string user_agent = \"AlexandriaBot\";\n\tgooglebot::RobotsMatcher matcher;\n\t{\n\t\tstd::string url = \"https://www.omnible.se/10126597891759986715\";\n\t\tbool allowed = matcher.OneAgentAllowedByRobots(robots_content, user_agent, url);\n\t\tBOOST_CHECK(!allowed);\n\t}\n\t{\n\t\tstd::string url = \"https://www.omnible.se/1012659789175998671\";\n\t\tbool allowed = matcher.OneAgentAllowedByRobots(robots_content, user_agent, url);\n\t\tBOOST_CHECK(allowed);\n\t}\n\n}\n\n\n\nBOOST_AUTO_TEST_SUITE_END()\n"
  },
  {
    "path": "tests/test_scraper.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include <boost/test/unit_test.hpp>\n#include <boost/algorithm/string.hpp>\n#include \"scraper/scraper.h\"\n#include <queue>\n#include <vector>\n\nusing namespace std;\n\nBOOST_AUTO_TEST_SUITE(test_scraper)\n\nBOOST_AUTO_TEST_CASE(test_scraper) {\n\n\tscraper::scraper_store store;\n\n\tscraper::scraper scraper(\"omnible.se\", 
&store);\n\tscraper.set_timeout(0);\n\tscraper.push_url(URL(\"http://omnible.se/\"));\n\tscraper.push_url(URL(\"http://omnible.se/10126597891759986715\"));\n\tscraper.push_url(URL(\"http://omnible.se/10123997891267016458\"));\n\tscraper.push_url(URL(\"http://omnible.se/gtin/9789180230865\"));\n\tscraper.push_url(URL(\"http://omnible.se/10123697814011564169\"));\n\tscraper.push_url(URL(\"https://www.omnible.se/notfound\"));\n\tscraper.push_url(URL(\"https://www.omnible.se/gtin/9789177714958\"));\n\n\tscraper.run();\n\n\tstring last = store.tail();\n\tvector<string> cols;\n\tboost::algorithm::split(cols, last, boost::is_any_of(\"\\t\"));\n\tBOOST_CHECK_EQUAL(cols[0], \"https://www.omnible.se/10123697814011564169\");\n\tBOOST_CHECK_EQUAL(cols[1], \"Den sista gåvan av Abdulrazak Gurnah - recensioner & prisjämförelse - Omnible\");\n}\n\nBOOST_AUTO_TEST_CASE(scraper_multithreaded) {\n\n\treturn;\n\n\tvector<string> urls = {\n\t\t/*\"http://omnible.se/\",\n\t\t\"http://omnible.se/10126597891759986715\",\n\t\t\"http://omnible.se/10123997891267016458\",\n\t\t\"https://spelagratis.nu/\",\n\t\t\"https://spelagratis.nu/super_mario_world.html\",\n\t\t\"http://omnible.se/gtin/9789180230865\",\n\t\t\"http://omnible.se/10123697814011564169\",\n\t\t\"https://spelagratis.nu/dirt_bike.html\"*/\n\t\t\"http://optout.aboutads.info/\",\n\t\t\"http://tabernus.com/\",\n\t\t\"http://tabernus.com/test\",\n\t\t\"http://apnews.excite.com/article/20071031/D8SKBRKO0.html\",\n\t\t\"http://thebetter.wiki/en/Jeb_Magruder\",\n\t\t\"https://www.thebetter.wiki/en/testing\"\n\t};\n\n\tscraper::run_scraper_on_urls(urls);\n}\n\nBOOST_AUTO_TEST_SUITE_END()\n"
  },
  {
    "path": "tests/test_sharded_index_builder.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include <boost/test/unit_test.hpp>\n#include \"indexer/index_manager.h\"\n#include \"indexer/sharded_index_builder.h\"\n#include \"indexer/sharded_index.h\"\n#include \"indexer/merger.h\"\n#include \"text/text.h\"\n#include \"algorithm/hash.h\"\n#include \"transfer/transfer.h\"\n\nBOOST_AUTO_TEST_SUITE(test_sharded_index_builder)\n\nBOOST_AUTO_TEST_CASE(test_sharded_index_builder) {\n\n\t{\n\t\tindexer::sharded_index_builder<indexer::generic_record> idx(\"test_index\", 10);\n\n\t\tidx.truncate();\n\n\t\tidx.add(101, indexer::generic_record(1000, 1.0f));\n\t\tidx.add(102, indexer::generic_record(1001, 1.0f));\n\n\t\tidx.append();\n\t\tidx.merge();\n\t}\n\n\t{\n\t\tindexer::sharded_index<indexer::generic_record> idx(\"test_index\", 
10);\n\t\tvector<indexer::generic_record> res = idx.find(101);\n\n\t\tBOOST_REQUIRE(res.size() == 1);\n\t\tBOOST_CHECK(res[0].m_value == 1000);\n\t}\n\n}\n\nBOOST_AUTO_TEST_CASE(test_group_by) {\n\n\tusing indexer::domain_link_record;\n\n\t{\n\t\tindexer::sharded_index_builder<domain_link_record> idx(\"test_index\", 1);\n\n\t\tidx.truncate();\n\n\t\tidx.add(101, domain_link_record(1000, 1.0f, 200));\n\t\tidx.add(101, domain_link_record(1004, 1.0f, 300));\n\t\tidx.add(101, domain_link_record(1001, 1.0f, 200));\n\t\tidx.add(101, domain_link_record(1003, 1.0f, 300));\n\t\tidx.add(101, domain_link_record(1002, 1.0f, 200));\n\n\t\tidx.add(102, domain_link_record(1000, 1.0f, 200));\n\t\tidx.add(102, domain_link_record(1001, 1.0f, 200));\n\t\tidx.add(102, domain_link_record(1005, 1.0f, 300));\n\t\tidx.add(102, domain_link_record(1002, 1.0f, 200));\n\n\t\tidx.add(103, domain_link_record(1000, 1.0f, 200));\n\t\tidx.add(103, domain_link_record(1001, 1.0f, 200));\n\t\tidx.add(103, domain_link_record(1004, 1.0f, 300));\n\t\tidx.add(103, domain_link_record(1002, 1.0f, 200));\n\n\t\tidx.append();\n\t\tidx.merge();\n\t\tidx.optimize();\n\t}\n\n\t{\n\t\tindexer::sharded_index<domain_link_record> idx(\"test_index\", 1);\n\n\t\tauto identity = [](float score) {\n\t\t\treturn score;\n\t\t};\n\t\tstd::vector<size_t> counts;\n\t\tvector<domain_link_record> res = idx.find_group_by({101, 102}, identity, counts);\n\n\t\tBOOST_REQUIRE(res.size() == 1);\n\t\tBOOST_CHECK(res[0].m_score == 3.0f);\n\t\tBOOST_CHECK(counts[0] == 3);\n\t}\n\n\t{\n\t\tindexer::sharded_index<domain_link_record> idx(\"test_index\", 1);\n\t\tauto times_two = [](float score) {\n\t\t\treturn 2.0f * score;\n\t\t};\n\t\tstd::vector<size_t> counts;\n\t\tvector<domain_link_record> res = idx.find_group_by({101, 103}, times_two, counts);\n\n\t\tBOOST_REQUIRE(res.size() == 2);\n\n\t\tsort(res.begin(), res.end(), domain_link_record::storage_order());\n\t\tBOOST_CHECK(res[0].m_score == 2.0f * 
(3.0f));\n\t\tBOOST_CHECK(res[1].m_score == 2.0f * (1.0f));\n\t\tBOOST_CHECK(counts[0] == 3);\n\t\tBOOST_CHECK(counts[1] == 1);\n\t}\n\n}\n\nBOOST_AUTO_TEST_CASE(test_score_mod) {\n\n\tusing indexer::domain_record;\n\n\t{\n\t\tindexer::sharded_index_builder<domain_record> idx(\"test_index\", 1);\n\n\t\tidx.truncate();\n\n\t\tidx.add(101, domain_record(1000, 1.0f));\n\t\tidx.add(101, domain_record(1004, 1.0f));\n\t\tidx.add(101, domain_record(1001, 1.0f));\n\t\tidx.add(101, domain_record(1003, 1.0f));\n\t\tidx.add(101, domain_record(1002, 1.0f));\n\n\t\tidx.add(102, domain_record(1000, 1.0f));\n\t\tidx.add(102, domain_record(1001, 1.0f));\n\t\tidx.add(102, domain_record(1005, 1.0f));\n\t\tidx.add(102, domain_record(1002, 1.0f));\n\n\t\tidx.append();\n\t\tidx.merge();\n\t\tidx.optimize();\n\t}\n\n\t{\n\t\t/*\n\t\t * intersected records will be in this order:\n\t\t * 1000\n\t\t * 1001\n\t\t * 1002\n\t\t *\n\t\t * so score modification will take place in that order.\n\t\t *\n\t\t * */\n\t\tindexer::sharded_index<domain_record> idx(\"test_index\", 1);\n\t\tuint64_t sum_id = 0;\n\t\tvector<domain_record> res = idx.find_top({101, 102}, 2,\n\t\t\t\t[&sum_id](const domain_record &val) -> float {\n\t\t\t\t\treturn (float)(sum_id++);\n\t\t\t\t});\n\n\t\tBOOST_REQUIRE(res.size() == 2);\n\t\tBOOST_CHECK(res[0].m_score == 2.0f);\n\t\tBOOST_CHECK(res[0].m_value == 1002);\n\t\tBOOST_CHECK(res[1].m_score == 1.0f);\n\t\tBOOST_CHECK(res[1].m_value == 1001);\n\t}\n\n}\n\nBOOST_AUTO_TEST_SUITE_END()\n"
  },
  {
    "path": "tests/test_sort.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include <boost/test/unit_test.hpp>\n#include \"algorithm/sort.h\"\n#include <vector>\n\nusing namespace std;\n\nBOOST_AUTO_TEST_SUITE(test_sort)\n\nstruct test_data_struct1 {\n\tint data1;\n\tint data2;\n};\n\nBOOST_AUTO_TEST_CASE(merge_arrays) {\n\n\t{\n\t\tvector<int> arr1 = {1, 2, 3};\n\t\tvector<int> arr2 = {4, 5, 6};\n\t\tvector<int> arr3;\n\t\tvector<int> arr4{1, 2, 3, 4, 5, 6};\n\n\t\talgorithm::sort::merge_arrays(arr1, arr2, arr3);\n\n\t\tBOOST_CHECK(arr3 == arr4);\n\t}\n\n\t{\n\t\tvector<int> arr1 = {1, 2, 3};\n\t\tvector<int> arr2 = {3, 4, 5, 6};\n\t\tvector<int> arr3;\n\t\tvector<int> arr4{1, 2, 3, 3, 4, 5, 6};\n\n\t\talgorithm::sort::merge_arrays(arr1, arr2, arr3);\n\n\t\tBOOST_CHECK(arr3 == arr4);\n\t}\n\n\t{\n\t\tvector<int> arr1 = 
{};\n\t\tvector<int> arr2 = {3, 4, 5, 6};\n\t\tvector<int> arr3;\n\t\tvector<int> arr4{3, 4, 5, 6};\n\n\t\talgorithm::sort::merge_arrays(arr1, arr2, arr3);\n\n\t\tBOOST_CHECK(arr3 == arr4);\n\t}\n\n}\n\nBOOST_AUTO_TEST_CASE(merge_arrays_of_struct) {\n\n\t{\n\t\tvector<struct test_data_struct1> arr1{test_data_struct1{.data1 = 1, .data2 = 2}};\n\t\tvector<struct test_data_struct1> arr2{test_data_struct1{.data1 = 2, .data2 = 3}};\n\t\tvector<struct test_data_struct1> arr3;\n\t\tvector<struct test_data_struct1> arr4{test_data_struct1{.data1 = 1, .data2 = 2}, test_data_struct1{.data1 = 2, .data2 = 3}};\n\n\t\talgorithm::sort::merge_arrays(arr1, arr2, [](const struct test_data_struct1 &a, const struct test_data_struct1 &b) {\n\t\t\treturn a.data1 < b.data1;\n\t\t}, arr3);\n\n\t\tBOOST_CHECK(arr3[0].data1 == arr4[0].data1 && arr3[0].data2 == arr4[0].data2);\n\t\tBOOST_CHECK(arr3[1].data1 == arr4[1].data1 && arr3[1].data2 == arr4[1].data2);\n\t}\n\n\t{\n\t\tvector<struct test_data_struct1> arr1{test_data_struct1{.data1 = 1, .data2 = 2}, test_data_struct1{.data1 = 3, .data2 = 4}};\n\t\tvector<struct test_data_struct1> arr2{test_data_struct1{.data1 = 2, .data2 = 3}};\n\t\tvector<struct test_data_struct1> arr3;\n\t\tvector<struct test_data_struct1> arr4{test_data_struct1{.data1 = 1, .data2 = 2}, test_data_struct1{.data1 = 2, .data2 = 3},\n\t\t\ttest_data_struct1{.data1 = 3, .data2 = 4}};\n\n\t\talgorithm::sort::merge_arrays(arr1, arr2, [](const struct test_data_struct1 &a, const struct test_data_struct1 &b) {\n\t\t\treturn a.data1 < b.data1;\n\t\t}, arr3);\n\n\t\tBOOST_CHECK(arr3[0].data1 == arr4[0].data1 && arr3[0].data2 == arr4[0].data2);\n\t\tBOOST_CHECK(arr3[1].data1 == arr4[1].data1 && arr3[1].data2 == arr4[1].data2);\n\t\tBOOST_CHECK(arr3[2].data1 == arr4[2].data1 && arr3[2].data2 == arr4[2].data2);\n\t}\n\n}\n\nBOOST_AUTO_TEST_CASE(merge_many_arrays) {\n\n\t{\n\t\tvector<int> arr1 = {1, 2, 3};\n\t\tvector<int> arr2 = {4, 5, 6};\n\t\tvector<int> arr3 = {7, 8, 
9};\n\t\tvector<int> res;\n\t\tvector<vector<int>> inp{arr1, arr2, arr3};\n\t\tvector<int> corr{1, 2, 3, 4, 5, 6, 7, 8, 9};\n\n\t\talgorithm::sort::merge_arrays(inp, res);\n\n\t\tBOOST_CHECK(res == corr);\n\t}\n\n\t{\n\t\tvector<int> arr1 = {1, 3, 6};\n\t\tvector<int> arr2 = {2, 4, 9};\n\t\tvector<int> arr3 = {1, 5, 7, 8};\n\t\tvector<int> res;\n\t\tvector<vector<int>> inp{arr1, arr2, arr3};\n\t\tvector<int> corr{1, 1, 2, 3, 4, 5, 6, 7, 8, 9};\n\n\t\talgorithm::sort::merge_arrays(inp, res);\n\n\t\tBOOST_CHECK(res == corr);\n\t}\n}\n\nBOOST_AUTO_TEST_CASE(merge_many_arrays_of_struct) {\n\n\t{\n\t\tvector<struct test_data_struct1> arr1{\n\t\t\ttest_data_struct1{.data1 = 1, .data2 = 11},\n\t\t\ttest_data_struct1{.data1 = 2, .data2 = 12},\n\t\t\ttest_data_struct1{.data1 = 3, .data2 = 13}\n\t\t};\n\t\tvector<struct test_data_struct1> arr2 = {\n\t\t\ttest_data_struct1{.data1 = 4, .data2 = 14},\n\t\t\ttest_data_struct1{.data1 = 5, .data2 = 15},\n\t\t\ttest_data_struct1{.data1 = 6, .data2 = 16}\n\t\t};\n\t\tvector<struct test_data_struct1> arr3 = {\n\t\t\ttest_data_struct1{.data1 = 7, .data2 = 17},\n\t\t\ttest_data_struct1{.data1 = 8, .data2 = 18},\n\t\t\ttest_data_struct1{.data1 = 9, .data2 = 19}\n\t\t};\n\t\tvector<struct test_data_struct1> res;\n\t\tvector<vector<struct test_data_struct1>> inp{arr1, arr2, arr3};\n\t\tvector<struct test_data_struct1> corr{\n\t\t\ttest_data_struct1{.data1 = 1, .data2 = 11},\n\t\t\ttest_data_struct1{.data1 = 2, .data2 = 12},\n\t\t\ttest_data_struct1{.data1 = 3, .data2 = 13},\n\t\t\ttest_data_struct1{.data1 = 4, .data2 = 14},\n\t\t\ttest_data_struct1{.data1 = 5, .data2 = 15},\n\t\t\ttest_data_struct1{.data1 = 6, .data2 = 16},\n\t\t\ttest_data_struct1{.data1 = 7, .data2 = 17},\n\t\t\ttest_data_struct1{.data1 = 8, .data2 = 18},\n\t\t\ttest_data_struct1{.data1 = 9, .data2 = 19}\n\t\t};\n\n\t\talgorithm::sort::merge_arrays(inp, [](const struct test_data_struct1 &a, const struct test_data_struct1 &b) {\n\t\t\treturn a.data1 < b.data1;\n\t\t}, 
res);\n\n\t\tBOOST_CHECK(corr.size() == res.size());\n\t\tfor (size_t i = 0; i < corr.size(); i++) {\n\t\t\tBOOST_CHECK(res[i].data1 == corr[i].data1 && res[i].data2 == corr[i].data2);\n\t\t}\n\t}\n\n}\n\nBOOST_AUTO_TEST_SUITE_END()\n"
  },
  {
    "path": "tests/test_sum_sorted.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include <boost/test/unit_test.hpp>\n#include <vector>\n#include \"algorithm/sum_sorted.h\"\n#include \"indexer/counted_record.h\"\n\nusing namespace std;\n\nBOOST_AUTO_TEST_SUITE(test_sum_sorted, * boost::unit_test::tolerance(0.00001))\n\nBOOST_AUTO_TEST_CASE(test_sum_sorted1) {\n\n\tvector<vector<int>> sorted = {\n\t\t{1, 2, 3},\n\t\t{2, 3},\n\t\t{3}\n\t};\n\tvector<int> res = ::algorithm::sum_sorted<int>(sorted, [](int &a, const int &b) {\n\t\ta += b;\n\t});\n\n\tBOOST_REQUIRE(res.size() == 3);\n\tBOOST_CHECK(res[0] == 1);\n\tBOOST_CHECK(res[1] == 4);\n\tBOOST_CHECK(res[2] == 9);\n}\n\nBOOST_AUTO_TEST_CASE(test_sum_sorted2) {\n\n\tvector<vector<int>> sorted = {\n\t\t{3},\n\t\t{2, 3},\n\t\t{1, 2, 3},\n\t};\n\tvector<int> res = 
::algorithm::sum_sorted<int>(sorted, [](int &a, const int &b) {\n\t\ta += b;\n\t});\n\n\tBOOST_REQUIRE(res.size() == 3);\n\tBOOST_CHECK(res[0] == 1);\n\tBOOST_CHECK(res[1] == 4);\n\tBOOST_CHECK(res[2] == 9);\n}\n\nBOOST_AUTO_TEST_CASE(test_sum_sorted3) {\n\n\tvector<vector<indexer::counted_record>> sorted = {\n\t\t{indexer::counted_record(3, 0.1)},\n\t\t{indexer::counted_record(2, 0.1), indexer::counted_record(3, 0.1)},\n\t\t{indexer::counted_record(1, 0.1), indexer::counted_record(2, 0.1), indexer::counted_record(3, 0.1)},\n\t};\n\tvector<indexer::counted_record> res = ::algorithm::sum_sorted<indexer::counted_record>(sorted,\n\t\t\t[](indexer::counted_record &a, const indexer::counted_record &b) {\n\t\ta.m_score += b.m_score;\n\t});\n\n\tBOOST_REQUIRE(res.size() == 3);\n\tBOOST_CHECK_EQUAL(res[0].m_score, 0.1f);\n\tBOOST_CHECK_EQUAL(res[1].m_score, 0.2f);\n\tBOOST_CHECK_EQUAL(res[2].m_score, 0.3f);\n}\n\nBOOST_AUTO_TEST_CASE(test_sum_sorted4) {\n\n\tvector<vector<indexer::counted_record>> sorted = {\n\t\t{indexer::counted_record(1, 0.1), indexer::counted_record(2, 0.2), indexer::counted_record(3, 0.3)},\n\t\t{indexer::counted_record(10, 0.4), indexer::counted_record(25, 0.5), indexer::counted_record(30, 0.6)},\n\t\t{indexer::counted_record(1, 0.7), indexer::counted_record(25, 0.8), indexer::counted_record(40, 0.9)},\n\t};\n\tvector<indexer::counted_record> res = ::algorithm::sum_sorted<indexer::counted_record>(sorted,\n\t\t\t[](indexer::counted_record &a, const indexer::counted_record &b) {\n\t\ta.m_score += b.m_score;\n\t});\n\n\tBOOST_REQUIRE(res.size() == 7);\n\tBOOST_CHECK_EQUAL(res[0].m_score, 0.8f);\n\tBOOST_CHECK_EQUAL(res[1].m_score, 0.2f);\n\tBOOST_CHECK_EQUAL(res[2].m_score, 0.3f);\n\tBOOST_CHECK_EQUAL(res[3].m_score, 0.4f);\n\tBOOST_CHECK_EQUAL(res[4].m_score, 1.3f);\n\tBOOST_CHECK_EQUAL(res[5].m_score, 0.6f);\n\tBOOST_CHECK_EQUAL(res[6].m_score, 0.9f);\n\n\tBOOST_CHECK_EQUAL(res[0].m_value, 1);\n\tBOOST_CHECK_EQUAL(res[1].m_value, 
2);\n\tBOOST_CHECK_EQUAL(res[2].m_value, 3);\n\tBOOST_CHECK_EQUAL(res[3].m_value, 10);\n\tBOOST_CHECK_EQUAL(res[4].m_value, 25);\n\tBOOST_CHECK_EQUAL(res[5].m_value, 30);\n\tBOOST_CHECK_EQUAL(res[6].m_value, 40);\n}\n\nBOOST_AUTO_TEST_CASE(test_sum_sorted5) {\n\n\tvector<vector<int>> sorted = {\n\t\t{1, 2, 3},\n\t\t{}\n\t};\n\tvector<int> res = ::algorithm::sum_sorted<int>(sorted, [](int &a, const int &b) {\n\t\ta += b;\n\t});\n\n\tBOOST_REQUIRE(res.size() == 3);\n\tBOOST_CHECK(res[0] == 1);\n\tBOOST_CHECK(res[1] == 2);\n\tBOOST_CHECK(res[2] == 3);\n}\n\nBOOST_AUTO_TEST_SUITE_END()\n"
  },
  {
    "path": "tests/test_text.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include <boost/test/unit_test.hpp>\n#include \"text/text.h\"\n\nusing namespace std;\n\nBOOST_AUTO_TEST_SUITE(test_text)\n\nBOOST_AUTO_TEST_CASE(get_full_text_words) {\n\t{\n\t\tvector<string> words = text::get_full_text_words(\"C++ map. 
is the, best thing\");\n\t\tBOOST_CHECK_EQUAL(words[0], \"c++\");\n\t\tBOOST_CHECK_EQUAL(words[1], \"map\");\n\t\tBOOST_CHECK_EQUAL(words[2], \"is\");\n\t\tBOOST_CHECK_EQUAL(words[3], \"the\");\n\t\tBOOST_CHECK_EQUAL(words[4], \"best\");\n\t\tBOOST_CHECK_EQUAL(words[5], \"thing\");\n\t}\n\n\t{\n\t\tvector<string> words = text::get_full_text_words(\"C# is also good.\");\n\t\tBOOST_CHECK_EQUAL(words[0], \"c#\");\n\t\tBOOST_CHECK_EQUAL(words[1], \"is\");\n\t\tBOOST_CHECK_EQUAL(words[2], \"also\");\n\t\tBOOST_CHECK_EQUAL(words[3], \"good\");\n\t}\n}\n\nBOOST_AUTO_TEST_CASE(get_tokens) {\n\tvector<uint64_t> tokens = text::get_tokens(\"My name is Josef Cullhed\");\n\n\tvector<uint64_t> targets = {\n\t\talgorithm::hash(\"my\"),\n\t\talgorithm::hash(\"name\"),\n\t\talgorithm::hash(\"is\"),\n\t\talgorithm::hash(\"josef\"),\n\t\talgorithm::hash(\"cullhed\"),\n\t};\n\n\tBOOST_CHECK(tokens == targets);\n}\n\nBOOST_AUTO_TEST_CASE(get_tokens2) {\n\tvector<uint64_t> tokens = text::get_tokens(\"Test. Ing! the    test   +function+\");\n\n\tvector<uint64_t> targets = {\n\t\talgorithm::hash(\"test\"),\n\t\talgorithm::hash(\"ing\"),\n\t\talgorithm::hash(\"the\"),\n\t\talgorithm::hash(\"test\"),\n\t\talgorithm::hash(\"+function+\"),\n\t};\n\n\tBOOST_CHECK(tokens == targets);\n}\n\nBOOST_AUTO_TEST_CASE(get_tokens3) {\n\tvector<uint64_t> tokens = text::get_expanded_full_text_tokens(\"Test. Ing! 
the    test   +func-tion+\");\n\n\tvector<uint64_t> targets = {\n\t\talgorithm::hash(\"test\"),\n\t\talgorithm::hash(\"ing\"),\n\t\talgorithm::hash(\"the\"),\n\t\talgorithm::hash(\"test\"),\n\t\talgorithm::hash(\"+func-tion+\"),\n\t\talgorithm::hash(\"+func\"),\n\t\talgorithm::hash(\"tion+\"),\n\t};\n\n\tBOOST_CHECK(tokens == targets);\n}\n\nBOOST_AUTO_TEST_CASE(get_snippets) {\n\t{\n\t\tvector<string> snippets = text::get_snippets(\"A small text that should fit in one snippet\");\n\n\t\tBOOST_REQUIRE(snippets.size() == 1);\n\t\tBOOST_CHECK(snippets[0] == \"A small text that should fit in one snippet\");\n\t}\n\t{\n\t\tvector<string> snippets = text::get_snippets(\" The zlib compression library provides in-memory compression and decompression functions, including integrity checks of the uncompressed data. This version of the library supports only one compression method (deflation) but other algorithms will be added later and will have the same stream interface.  Compression can be done in a single step if the buffers are large enough (for example if an input file is mmap'ed), or can be done by repeated calls of the compression function. In the latter case, the application must provide more input and/or consume the output (providing more output space) before each call. \");\n\n\t\tBOOST_REQUIRE(snippets.size() == 3);\n\t}\n}\n\nBOOST_AUTO_TEST_CASE(get_words_without_stopwords) {\n\tvector<string> words = text::get_words_without_stopwords(\"Hej asd!asd jag, heter! !josef. 
cullhed \t\\\n\t\tjfoidjfoai823hr9hfhwe9f8hshgohewogiqhoih\");\n\n\tBOOST_CHECK_EQUAL(words.size(), 8);\n\tBOOST_CHECK_EQUAL(words[0], \"hej\");\n\tBOOST_CHECK_EQUAL(words[1], \"asd\");\n\tBOOST_CHECK_EQUAL(words[2], \"asd\");\n\tBOOST_CHECK_EQUAL(words[3], \"jag\");\n\tBOOST_CHECK_EQUAL(words[4], \"heter\");\n\tBOOST_CHECK_EQUAL(words[5], \"josef\");\n\tBOOST_CHECK_EQUAL(words[6], \"cullhed\");\n\tBOOST_CHECK_EQUAL(words[7], \"jfoidjfoai823hr9hfhwe9f8hshgohewogiqhoih\");\n}\n\nBOOST_AUTO_TEST_CASE(clean_word) {\n\n\tBOOST_CHECK_EQUAL(text::clean_word(\"hej\"), \"hej\");\n\tBOOST_CHECK_EQUAL(text::clean_word(\"åäö\"), \"åäö\");\n\tBOOST_CHECK_EQUAL(text::clean_word(\"123\"), \"123\");\n\tBOOST_CHECK_EQUAL(text::clean_word(\"$Üç\"), \"\");\n\tBOOST_CHECK_EQUAL(text::clean_word(\"hejç\"), \"hej\");\n\tBOOST_CHECK_EQUAL(text::clean_word(\"açd\"), \"ad\");\n\n\tBOOST_CHECK(text::is_clean_word(\"hej\"));\n\tBOOST_CHECK(text::is_clean_word(\"åäö\"));\n\tBOOST_CHECK(text::is_clean_word(\"123\"));\n\tBOOST_CHECK(!text::is_clean_word(\"$Üç\"));\n\tBOOST_CHECK(!text::is_clean_word(\"hejç\"));\n\tBOOST_CHECK(!text::is_clean_word(\"açd\"));\n\n\tBOOST_CHECK_EQUAL(text::get_words_without_stopwords(\"hej\")[0], \"hej\");\n\tBOOST_CHECK_EQUAL(text::get_words_without_stopwords(\"åäö\")[0], \"åäö\");\n\tBOOST_CHECK_EQUAL(text::get_words_without_stopwords(\"123\")[0], \"123\");\n\tBOOST_CHECK_EQUAL(text::get_words_without_stopwords(\"$Üç\").size(), 0);\n\tBOOST_CHECK_EQUAL(text::get_words_without_stopwords(\"hejç\").size(), 0);\n\tBOOST_CHECK_EQUAL(text::get_words_without_stopwords(\"açd\").size(), 0);\n\n\tBOOST_CHECK(text::get_words_without_stopwords(\"hej josef\") == vector<string>({\"hej\", \"josef\"}));\n\tBOOST_CHECK(text::get_words_without_stopwords(\"hej, josef!\") == vector<string>({\"hej\", \"josef\"}));\n\tBOOST_CHECK(text::get_words_without_stopwords(\"hej jÜsef cullhed du är bäst\") ==\n\t\tvector<string>({\"hej\", \"cullhed\", \"du\", 
\"bäst\"}));\n\n\tBOOST_CHECK(text::get_words_without_stopwords(\"Låna! (Pengar till bilar)\") ==\n\t\tvector<string>({\"låna\", \"pengar\", \"bilar\"}));\n\tBOOST_CHECK(text::get_words_without_stopwords(\"Dallas Swarner | Character | zKillboard\", 3) ==\n\t\tvector<string>({\"dallas\", \"swarner\", \"character\"}));\n\tBOOST_CHECK(text::get_words_without_stopwords(\"Tapis Fleur des Champs Moutarde | Zen Dos\", 3) ==\n\t\tvector<string>({\"tapis\", \"fleur\", \"des\"}));\n\tBOOST_CHECK(text::get_words_without_stopwords(\"Gina Osorno & The Dreamers\", 3) ==\n\t\tvector<string>({\"gina\", \"osorno\", \"dreamers\"}));\n\n\tBOOST_CHECK(text::get_words_without_stopwords(\"IMG_2190 | Zhenyu (Tony) Tian\") ==\n\t\tvector<string>({\"zhenyu\", \"tony\", \"tian\"}));\n\tBOOST_CHECK(text::get_words_without_stopwords(\"Tills alla dör - Diamant Salihu - Bok (9789189061842) | Bokus\", 3)\n\t\t== vector<string>({\"tills\", \"dör\", \"diamant\"}));\n\n\tBOOST_CHECK(text::get_words_without_stopwords(\"Messages postés par Prechan • Forum • Zeste de Savoir\", 3) ==\n\t\tvector<string>({\"messages\", \"par\", \"prechan\"}));\n\tBOOST_CHECK(text::get_words_without_stopwords(\"Science SARU – 紙本分格\") == vector<string>({\"science\", \"saru\"}));\n\tBOOST_CHECK(text::get_words_without_stopwords(\"Realiteti i trishtë shqiptar përmes fotove të gazetarit gjerman që komunizmi nuk i lejoi \\\n\t\ttë bëheshin publike | Gazeta Malesia\", 3) == vector<string>({\"realiteti\", \"shqiptar\", \"fotove\"}));\n\tBOOST_CHECK(text::get_words_without_stopwords(\"York County, VA\") == vector<string>({\"york\", \"county\", \"va\"}));\n\tBOOST_CHECK(text::get_words_without_stopwords(\"HTML Sitemap 14 - zfreeti.com\", 3) ==\n\t\tvector<string>({\"html\", \"sitemap\", \"14\"}));\n\tBOOST_CHECK(text::get_words_without_stopwords(\"HTML Sitemap 14 - zfreeti.com\") ==\n\t\tvector<string>({\"html\", \"sitemap\", \"14\"}));\n\tBOOST_CHECK(text::get_words_without_stopwords(\"Archives.com zfreeti.com best. stream. 
in .the world\") ==\n\t\tvector<string>({\"best\", \"stream\", \"world\"}));\n\n}\n\nBOOST_AUTO_TEST_CASE(word_freq, * boost::unit_test::tolerance(0.00001)) {\n\tauto freq = text::get_word_frequency(\"hello my name is josef and it is good\");\n\tBOOST_TEST(freq[\"hello\"] == 1.0/9.0);\n\tBOOST_TEST(freq[\"is\"] == 2.0/9.0);\n}\n\nBOOST_AUTO_TEST_SUITE_END()\n"
  },
  {
    "path": "tests/test_thread_pool.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include <boost/test/unit_test.hpp>\n#include \"utils/thread_pool.hpp\"\n#include \"profiler/profiler.h\"\n\nusing namespace std;\n\nBOOST_AUTO_TEST_SUITE(test_thread_pool)\n\nBOOST_AUTO_TEST_CASE(thread_pool) {\n\tutils::thread_pool pool(10);\n\n\tvector<int> vec(10);\n\tfor (int &i : vec) {\n\t\tpool.enqueue([&i]() {\n\t\t\ti++;\n\t\t});\n\t}\n\n\tpool.run_all();\n\n\tfor (int i : vec) {\n\t\tBOOST_CHECK(i == 1);\n\t}\n\t\n}\n\nBOOST_AUTO_TEST_CASE(thread_pool2) {\n\tutils::thread_pool pool(12);\n\n\tvector<int> vec(24);\n\tfor (int &i : vec) {\n\t\tpool.enqueue([&i]() {\n\t\t\t\tstd::this_thread::sleep_for(200ms);\n\t\t\ti = 1;\n\t\t});\n\t}\n\n\tdouble now = profiler::now_micro();\n\n\tpool.run_all();\n\n\tdouble dt = profiler::now_micro() - 
now;\n\n\tBOOST_CHECK(dt < (200*2 + 10)*1000);\n\n\tfor (int i : vec) {\n\t\tBOOST_CHECK(i == 1);\n\t}\n\t\n}\n\n/*\n * Test limit of queue length. The idea here is that if you pass a second parameter to the pool\n * you get a maximum queue length. Then if the workers are all working and the queue is full\n * the next call to enqueue will wait for the queue to become smaller.\n * \n * This is useful if you want X workers to work but you don't want to fill up the queue because of.. limited memory.\n * */\nBOOST_AUTO_TEST_CASE(thread_pool3) {\n\tutils::thread_pool pool(4, 1);\n\n\tvector<int> vec(4);\n\tint idx = 1;\n\tfor (int &i : vec) {\n\t\tpool.enqueue([&i, idx]() {\n\t\t\tstd::this_thread::sleep_for(200ms);\n\t\t\ti = idx;\n\t\t});\n\t\t// Allow some time for the work to be picked from the queue.\n\t\tstd::this_thread::sleep_for(10ms);\n\t\tidx++;\n\t}\n\t// Now the 4 workers are working.\n\n\t// Enqueue one more.\n\tdouble now1 = profiler::now_micro();\n\tpool.enqueue([]() {\n\t\tstd::this_thread::sleep_for(200ms);\n\t});\n\tdouble now2 = profiler::now_micro();\n\n\t// Should be quick.\n\tBOOST_CHECK(now2 - now1 < 10 * 1000); // < 10 milliseconds.\n\n\t// Now the next enqueue should wait around 200ms\n\tdouble now3 = profiler::now_micro();\n\tpool.enqueue([]() {\n\t\tstd::this_thread::sleep_for(200ms);\n\t});\n\tdouble now4 = profiler::now_micro();\n\n\tBOOST_CHECK(now4 - now3 > 180 * 1000);\n\n\tstd::this_thread::sleep_for(300ms);\n\n\t// All threads should be done now\n\tidx = 1;\n\tfor (int i : vec) {\n\t\tBOOST_CHECK(i == idx);\n\t\tidx++;\n\t}\n\n\tpool.run_all();\n}\n\nBOOST_AUTO_TEST_SUITE_END()\n"
  },
  {
    "path": "tests/test_top_k.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include <boost/test/unit_test.hpp>\n#include <algorithm/top_k.h>\n\nBOOST_AUTO_TEST_SUITE(test_top_k)\n\nBOOST_AUTO_TEST_CASE(test_1) {\n\tconst std::vector<int> res = ::algorithm::top_k<int>({1,2,3,4,5,6}, 2);\n\tbool is_correct = (res == std::vector<int>{5,6} || res == std::vector<int>{6,5});\n\tBOOST_CHECK(is_correct);\n}\n\nBOOST_AUTO_TEST_CASE(test_2) {\n\tconst std::vector<int> res = ::algorithm::top_k<int>({1,2,3,4,5,6,7}, 2);\n\tbool is_correct = (res == std::vector<int>{6,7} || res == std::vector<int>{7,6});\n\tBOOST_CHECK(is_correct);\n}\n\nBOOST_AUTO_TEST_CASE(test_3) {\n\tconst std::vector<int> res = ::algorithm::top_k<int>({}, 2);\n\tbool is_correct = (res == 
std::vector<int>{});\n\tBOOST_CHECK(is_correct);\n}\n\nBOOST_AUTO_TEST_CASE(test_4) {\n\tconst std::vector<int> res = ::algorithm::top_k<int>({2,3,1}, 2);\n\tbool is_correct = (res == std::vector<int>{2,3} || res == std::vector<int>{3,2});\n\tBOOST_CHECK(is_correct);\n}\n\nBOOST_AUTO_TEST_CASE(test_5) {\n\tconst std::vector<int> res = ::algorithm::top_k<int>({7,5,3,4,4,8,4,1,1,3,4}, 3);\n\n\tbool is_correct = true;\n\tfor (int i : res) {\n\t\tif (i < 5) is_correct = false;\n\t}\n\tBOOST_CHECK(is_correct);\n}\n\nBOOST_AUTO_TEST_CASE(test_6) {\n\tconst std::vector<int> res = ::algorithm::top_k<int>({7,5,3,4,4,8,4,1,1,3,4}, 6);\n\n\tbool is_correct = true;\n\tfor (int i : res) {\n\t\tif (i < 4) is_correct = false;\n\t}\n\tBOOST_CHECK(is_correct);\n}\n\nBOOST_AUTO_TEST_CASE(test_7) {\n\tconst std::vector<int> res = ::algorithm::top_k<int>({1,3,0,1,4,3,9,2,0,3}, 1);\n\n\tbool is_correct = res == std::vector<int>{9};\n\tBOOST_CHECK(is_correct);\n}\n\nBOOST_AUTO_TEST_CASE(test_8) {\n\tconst std::vector<int> res = ::algorithm::top_k<int>({1,3,0,1,4,3,9,2,0,3}, 3, [](const int &a, const int &b) {\n\t\treturn a > b;\n\t});\n\n\tbool is_correct = true;\n\tfor (int i : res) {\n\t\tif (i > 1) is_correct = false;\n\t}\n\tBOOST_CHECK(is_correct);\n}\n\nBOOST_AUTO_TEST_SUITE_END()\n"
  },
  {
    "path": "tests/test_unicode.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include <boost/test/unit_test.hpp>\n#include \"parser/unicode.h\"\n\nBOOST_AUTO_TEST_SUITE(unicode)\n\nBOOST_AUTO_TEST_CASE(unicode) {\n\tBOOST_CHECK_EQUAL(parser::unicode::encode(\"hej jag heter josef\"), \"hej jag heter josef\");\n\tBOOST_CHECK_EQUAL(parser::unicode::encode(\"hej jag heter josef och jag tillåter utf8 åäö chars$€\"),\n\t\t\"hej jag heter josef och jag tillåter utf8 åäö chars$€\");\n\tBOOST_CHECK_EQUAL(parser::unicode::encode(\"是美国民主党政治家，于19世纪下半叶担\"), \"是美国民主党政治家，于19世纪下半叶担\");\n\n\tBOOST_CHECK(parser::unicode::is_valid(parser::unicode::encode(\"L�gg i varukorg Om produkten Specifikation Anv�ndning Våra bönor är \\\n\t\trika på protein, mineraler och fibrer. Smaken är söt och konsistensen le\")));\n}\n\nBOOST_AUTO_TEST_SUITE_END()\n"
  },
  {
    "path": "tests/test_url.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include <boost/test/unit_test.hpp>\n#include \"URL.h\"\n\nusing namespace std;\n\nBOOST_AUTO_TEST_SUITE(test_url)\n\nBOOST_AUTO_TEST_CASE(basic) {\n\tBOOST_CHECK_EQUAL(URL(\"https://www.facebook.com/test.html?key=value\").str(), \"https://www.facebook.com/test.html?key=value\");\n\n\t{\n\t\tURL url(\"https://www.facebook.com/test.html?key=value\");\n\t\turl.set_scheme(\"http\");\n\t\turl.set_www(false);\n\t\t\n\t\tBOOST_CHECK_EQUAL(url.str(), \"http://facebook.com/test.html?key=value\");\n\n\t\turl.set_scheme(\"https\");\n\t\turl.set_www(true);\n\n\t\tBOOST_CHECK_EQUAL(url.str(), \"https://www.facebook.com/test.html?key=value\");\n\t}\n}\n\nBOOST_AUTO_TEST_CASE(url_parsing) {\n\n\t{\n\t\tURL 
url(\"https://www.facebook.com/test.html?key=value\");\n\t\tBOOST_CHECK_EQUAL(url.str(), \"https://www.facebook.com/test.html?key=value\");\n\t\tBOOST_CHECK_EQUAL(url.domain_without_tld(), \"facebook\");\n\t\tBOOST_CHECK_EQUAL(url.host(), \"facebook.com\");\n\t\tBOOST_CHECK_EQUAL(url.host_reverse(), \"com.facebook\");\n\t\tBOOST_CHECK_EQUAL(url.scheme(), \"https\");\n\t\tBOOST_CHECK_EQUAL(url.path(), \"/test.html\");\n\t\tBOOST_CHECK_EQUAL(url.path_with_query(), \"/test.html?key=value\");\n\t\tBOOST_CHECK_EQUAL(url.size(), strlen(\"https://www.facebook.com/test.html?key=value\"));\n\t\tBOOST_CHECK_EQUAL(url.has_https(), true);\n\t\tBOOST_CHECK_EQUAL(url.has_www(), true);\n\n\t\tauto query = url.query();\n\t\tBOOST_CHECK_EQUAL(query.size(), 1);\n\t\tBOOST_CHECK_EQUAL(query[\"key\"], \"value\");\n\t}\n\t{\n\t\tURL url(\"http://example.com/\");\n\t\tBOOST_CHECK_EQUAL(url.has_https(), false);\n\t\tBOOST_CHECK_EQUAL(url.has_www(), false);\n\t}\n\n\t{\n\t\tURL url(\"http://example.com/\");\n\t\tBOOST_CHECK_EQUAL(url.path(), \"/\");\n\t}\n\t{\n\t\tURL url(\"http://example.com\");\n\t\tBOOST_CHECK_EQUAL(url.path(), \"/\");\n\t}\n}\n\nBOOST_AUTO_TEST_CASE(url_parsing2) {\n\n\tURL url(\"https://github.com/joscul/alexandria/blob/main/tests/File.h\");\n\tBOOST_CHECK_EQUAL(url.domain_without_tld(), \"github\");\n\tBOOST_CHECK_EQUAL(url.host(), \"github.com\");\n\tBOOST_CHECK_EQUAL(url.scheme(), \"https\");\n\tBOOST_CHECK_EQUAL(url.path(), \"/joscul/alexandria/blob/main/tests/File.h\");\n\tBOOST_CHECK_EQUAL(url.path_with_query(), \"/joscul/alexandria/blob/main/tests/File.h\");\n\n\tauto query = url.query();\n\tBOOST_CHECK_EQUAL(query.size(), 0);\n}\n\nBOOST_AUTO_TEST_CASE(hash) {\n\n\tURL url(\"https://github.com/joscul/alexandria/blob/main/tests/File.h\");\n\n\tsize_t hash1 = URL(\"https://github.com/joscul/alexandria/blob/main/tests/File.h\").hash();\n\tsize_t hash2 = URL(\"https://github.com/joscul/alexandria/blob/main/tests/File.h?query=param\").hash();\n\tsize_t hash3 = 
URL(\"https://github.com/joscul/alexandria/blob/main/tests/File.h?hej=hopp\").hash();\n\tsize_t hash4 = URL(\"https://www.github.com/joscul/alexandria/blob/main/tests/File.h?hej=hopp\").hash();\n\tsize_t hash5 = URL(\"http://github.com/joscul/alexandria/blob/main/tests/File.h?hej=hopp\").hash();\n\n\tBOOST_CHECK(hash1 != hash2);\n\tBOOST_CHECK(hash2 != hash3);\n\tBOOST_CHECK(hash3 == hash4);\n\tBOOST_CHECK(hash4 == hash5);\n}\n\nBOOST_AUTO_TEST_CASE(unescape) {\n\n\t{\n\t\tURL url(\"https://github.com/?q=test%20test\");\n\t\tmap<string, string> query = url.query();\n\n\t\tBOOST_CHECK_EQUAL(query[\"q\"], \"test test\");\n\t}\n\t{\n\t\tURL url(\"https://github.com/?q=test%2020\");\n\t\tmap<string, string> query = url.query();\n\n\t\tBOOST_CHECK_EQUAL(query[\"q\"], \"test 20\");\n\t}\n\t{\n\t\tURL url(\"https://github.com/search?q=targumical&cp=0&hl=en-US&pq=%targumical%&sourceid=chrome&ie=UTF-8\");\n\t\tmap<string, string> query = url.query();\n\n\t\tBOOST_CHECK_EQUAL(query[\"pq\"], \"%targumical%\");\n\t}\n\n\t{\n\t\tURL url(\"https://github.com/search?q=stress%%c3%C3%a5%C3%A4%c3%b6%0G\");\n\t\tmap<string, string> query = url.query();\n\n\t\tBOOST_CHECK_EQUAL(query[\"q\"], \"stress%c3åäö%0G\");\n\t}\n\n\t{\n\t\t// Test double encoding.\n\t\tURL url(\"https://github.com/search?q=%25C3%25A5%25C3%25A4%25C3%25B6\");\n\t\tmap<string, string> query = url.query();\n\n\t\tBOOST_CHECK_EQUAL(query[\"q\"], \"%C3%A5%C3%A4%C3%B6\");\n\t}\n\n\t{\n\t\t// Test double encoding.\n\t\tURL url(\"https://github.com/search?q=%josef%0\");\n\t\tmap<string, string> query = url.query();\n\n\t\tBOOST_CHECK_EQUAL(query[\"q\"], \"%josef%0\");\n\t}\n\n}\n\nBOOST_AUTO_TEST_CASE(host_top_domain) {\n\n\t{\n\t\tURL url(\"https://test.uk\");\n\t\tBOOST_CHECK_EQUAL(url.host_top_domain(), \"test.uk\");\n\t}\n\t{\n\t\tURL url(\"https://testing.com.au\");\n\t\tBOOST_CHECK_EQUAL(url.host_top_domain(), \"testing.com.au\");\n\t}\n\t{\n\t\tURL 
url(\"https://subdomain.testing.com.au\");\n\t\tBOOST_CHECK_EQUAL(url.host_top_domain(), \"testing.com.au\");\n\t}\n\t{\n\t\tURL url(\"https://github.com/\");\n\t\tBOOST_CHECK_EQUAL(url.host_top_domain(), \"github.com\");\n\t}\n\t{\n\t\tURL url(\"https://test.github.com/\");\n\t\tBOOST_CHECK_EQUAL(url.host_top_domain(), \"github.com\");\n\t}\n\n\t{\n\t\tURL url(\"https://bbc.co.uk/\");\n\t\tBOOST_CHECK_EQUAL(url.host_top_domain(), \"bbc.co.uk\");\n\t}\n\n\t{\n\t\tURL url(\"https://testing.bbc.co.uk/\");\n\t\tBOOST_CHECK_EQUAL(url.host_top_domain(), \"bbc.co.uk\");\n\t}\n\n\t{\n\t\tURL url(\".\");\n\t\tBOOST_CHECK_EQUAL(url.host_top_domain(), \"\");\n\t}\n\n\t{\n\t\tURL url(\"\");\n\t\tBOOST_CHECK_EQUAL(url.host_top_domain(), \"\");\n\t}\n\n}\n\nBOOST_AUTO_TEST_SUITE_END()\n"
  },
  {
    "path": "tests/test_url_record.cpp",
    "content": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n * \n * The above copyright notice and this permission notice shall be included in all\n * copies or substantial portions of the Software.\n * \n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n * SOFTWARE.\n */\n\n#include <boost/test/unit_test.hpp>\n#include \"indexer/url_record.h\"\n\nusing namespace std;\n\nBOOST_AUTO_TEST_SUITE(test_url_record)\n\nBOOST_AUTO_TEST_CASE(basic) {\n\n\tindexer::url_record record(123ull);\n\n\trecord.url_length(442);\n\tBOOST_CHECK_EQUAL(record.url_length(), 442);\n\n\trecord.url_length(4);\n\tBOOST_CHECK_EQUAL(record.url_length(), 4);\n\n\trecord.url_length(0);\n\tBOOST_CHECK_EQUAL(record.url_length(), 0);\n\n\n}\n\nBOOST_AUTO_TEST_SUITE_END()\n"
  }
]