Repository: alexandria-org/alexandria
Branch: main
Commit: 129e162e8068
Files: 234
Total size: 891.5 KB

Directory structure:
gitextract_46gecs8w/

├── .gdbinit
├── .gitignore
├── CMakeLists.txt
├── Dockerfile
├── LICENSE
├── README.md
├── cmake/
│   └── Findfcgi.cmake
├── config.conf
├── documentation/
│   ├── alexandria.md
│   ├── api_response_format.md
│   ├── caching.md
│   ├── coding_rules.md
│   ├── configure_local_nginx.md
│   ├── full_text_indexes.md
│   ├── ideas.md
│   ├── index_file_format.md
│   ├── indexer.md
│   ├── installing_nodes.md
│   ├── performance_journal.md
│   ├── search_result_ranking.md
│   └── statues_swe.tex
├── scripts/
│   ├── bootstrap_node_2drives.sh
│   ├── build-deps.sh
│   ├── clean.sh
│   ├── download-deps.sh
│   ├── download-test-data.sh
│   ├── find_missing_files_in_batch.sh
│   ├── init-docker.sh
│   ├── install-deps.sh
│   ├── packager.sh
│   ├── prepare-output-dirs.sh
│   ├── truncate.sh
│   └── update.sh
├── src/
│   ├── URL.cpp
│   ├── URL.h
│   ├── alexandria.cpp
│   ├── algorithm/
│   │   ├── algorithm.cpp
│   │   ├── algorithm.h
│   │   ├── bloom_filter.cpp
│   │   ├── bloom_filter.h
│   │   ├── hash.cpp
│   │   ├── hash.h
│   │   ├── hyper_ball.h
│   │   ├── hyper_log_log.cpp
│   │   ├── hyper_log_log.h
│   │   ├── intersection.cpp
│   │   ├── intersection.h
│   │   ├── sort.cpp
│   │   ├── sort.h
│   │   ├── sum_sorted.h
│   │   └── top_k.h
│   ├── api/
│   │   ├── api_response.cpp
│   │   ├── api_response.h
│   │   ├── result_with_snippet.cpp
│   │   └── result_with_snippet.h
│   ├── cluster/
│   │   ├── cluster.h
│   │   ├── document.cpp
│   │   └── document.h
│   ├── common/
│   │   ├── ThreadPool.h
│   │   ├── datetime.cpp
│   │   ├── datetime.h
│   │   ├── dictionary.cpp
│   │   ├── dictionary.h
│   │   ├── dictionary_row.cpp
│   │   ├── dictionary_row.h
│   │   ├── simple_thread_pool.hpp
│   │   ├── system.cpp
│   │   └── system.h
│   ├── config.cpp
│   ├── config.h
│   ├── debug.cpp
│   ├── debug.h
│   ├── domain_stats/
│   │   ├── domain_stats.cpp
│   │   └── domain_stats.h
│   ├── downloader/
│   │   ├── merge_downloader.cpp
│   │   ├── merge_downloader.h
│   │   ├── warc_downloader.cpp
│   │   └── warc_downloader.h
│   ├── file/
│   │   ├── archive.cpp
│   │   ├── archive.h
│   │   ├── file.cpp
│   │   ├── file.h
│   │   ├── gz_tsv_file.cpp
│   │   ├── gz_tsv_file.h
│   │   ├── tsv_file.cpp
│   │   ├── tsv_file.h
│   │   ├── tsv_file_remote.cpp
│   │   ├── tsv_file_remote.h
│   │   ├── tsv_row.cpp
│   │   └── tsv_row.h
│   ├── full_text/
│   │   ├── domain_link_record.h
│   │   ├── link_record.h
│   │   ├── record.h
│   │   ├── result_set.h
│   │   └── search_metric.h
│   ├── hash_table2/
│   │   ├── builder.cpp
│   │   ├── builder.h
│   │   ├── hash_table.cpp
│   │   ├── hash_table.h
│   │   ├── hash_table_shard.cpp
│   │   ├── hash_table_shard.h
│   │   ├── hash_table_shard_base.h
│   │   ├── hash_table_shard_builder.cpp
│   │   └── hash_table_shard_builder.h
│   ├── hash_table_helper/
│   │   ├── hash_table_helper.cpp
│   │   └── hash_table_helper.h
│   ├── http/
│   │   ├── request.cpp
│   │   ├── request.h
│   │   ├── response.h
│   │   ├── server.cpp
│   │   └── server.h
│   ├── indexer/
│   │   ├── basic_index.h
│   │   ├── basic_index_builder.h
│   │   ├── console.cpp
│   │   ├── console.h
│   │   ├── counted_record.h
│   │   ├── domain_link_record.h
│   │   ├── domain_record.h
│   │   ├── generic_record.h
│   │   ├── index.h
│   │   ├── index_base.h
│   │   ├── index_builder.h
│   │   ├── index_manager.cpp
│   │   ├── index_manager.h
│   │   ├── index_reader.cpp
│   │   ├── index_reader.h
│   │   ├── index_utils.cpp
│   │   ├── index_utils.h
│   │   ├── link_record.h
│   │   ├── merger.cpp
│   │   ├── merger.h
│   │   ├── regular_index_builder.h
│   │   ├── return_record.h
│   │   ├── score_builder.cpp
│   │   ├── score_builder.h
│   │   ├── sharded.h
│   │   ├── sharded_builder.h
│   │   ├── sharded_index.h
│   │   ├── sharded_index_builder.h
│   │   ├── url_record.h
│   │   └── value_record.h
│   ├── indexer.cpp
│   ├── logger/
│   │   ├── logger.cpp
│   │   └── logger.h
│   ├── memory/
│   │   ├── debugger.cpp
│   │   ├── debugger.h
│   │   ├── memory.cpp
│   │   ├── memory.h
│   │   └── overload.cpp
│   ├── parser/
│   │   ├── cc_parser.cpp
│   │   ├── cc_parser.h
│   │   ├── entities.cpp
│   │   ├── entities.h
│   │   ├── html_link.cpp
│   │   ├── html_link.h
│   │   ├── html_parser.cpp
│   │   ├── html_parser.h
│   │   ├── parser.cpp
│   │   ├── parser.h
│   │   ├── unicode.cpp
│   │   └── unicode.h
│   ├── profiler/
│   │   ├── profiler.cpp
│   │   └── profiler.h
│   ├── scraper/
│   │   ├── scraper.cpp
│   │   ├── scraper.h
│   │   ├── scraper_store.cpp
│   │   └── scraper_store.h
│   ├── scraper.cpp
│   ├── search_engine/
│   │   ├── search_allocation.h
│   │   ├── search_engine.cpp
│   │   └── search_engine.h
│   ├── server/
│   │   ├── search_server.cpp
│   │   ├── search_server.h
│   │   ├── url_server.cpp
│   │   └── url_server.h
│   ├── server.cpp
│   ├── stats/
│   │   └── stats.h
│   ├── text/
│   │   ├── stopwords.cpp
│   │   ├── stopwords.h
│   │   ├── text.cpp
│   │   └── text.h
│   ├── tools/
│   │   ├── calculate_harmonic.cpp
│   │   ├── calculate_harmonic.h
│   │   ├── counter.cpp
│   │   ├── counter.h
│   │   ├── find_links.cpp
│   │   ├── find_links.h
│   │   ├── generate_url_lists.cpp
│   │   ├── generate_url_lists.h
│   │   ├── splitter.cpp
│   │   └── splitter.h
│   ├── transfer/
│   │   ├── transfer.cpp
│   │   └── transfer.h
│   ├── url_link/
│   │   ├── link.cpp
│   │   └── link.h
│   ├── utils/
│   │   ├── id_allocator.h
│   │   ├── thread_pool.cpp
│   │   ├── thread_pool.hpp
│   │   └── thread_pool_arg.h
│   └── warc/
│       ├── tlds.h
│       ├── warc.cpp
│       └── warc.h
└── tests/
    ├── main.cpp
    ├── test_algorithm.cpp
    ├── test_bloom_filter.cpp
    ├── test_cc_parser.cpp
    ├── test_config.conf
    ├── test_config2.conf
    ├── test_configuration.cpp
    ├── test_counted_index_builder.cpp
    ├── test_datetime.h
    ├── test_file.cpp
    ├── test_hash.cpp
    ├── test_hash_table.cpp
    ├── test_html_parser.cpp
    ├── test_hyper_ball.cpp
    ├── test_hyper_log_log.cpp
    ├── test_index_builder.cpp
    ├── test_index_iteration.cpp
    ├── test_index_reader.cpp
    ├── test_logger.cpp
    ├── test_memory.cpp
    ├── test_n_gram.cpp
    ├── test_robot_parser.cpp
    ├── test_scraper.cpp
    ├── test_sharded_index_builder.cpp
    ├── test_sort.cpp
    ├── test_sum_sorted.cpp
    ├── test_text.cpp
    ├── test_thread_pool.cpp
    ├── test_top_k.cpp
    ├── test_unicode.cpp
    ├── test_url.cpp
    └── test_url_record.cpp

================================================
FILE CONTENTS
================================================

================================================
FILE: .gdbinit
================================================

set history save on


================================================
FILE: .gitignore
================================================

deps/*
tmp/*
src/*.o
tests/*.o
build/*
documentation/*.aux
documentation/*.log
documentation/statues_swe.pdf
.DS_Store
config/config.h
response.txt
cc_parser.zip
cc_parser
cc_indexer.zip
cc_indexer
cc_api.zip
cc_api
cc_full_text.zip
cc_full_text
run_tests
CMakeCache.txt
CMakeFiles
CMakeScripts
Makefile
cmake_install.cmake
warc.paths
.vscode
.gdb_history
*~
*.swp
*.swo


================================================
FILE: CMakeLists.txt
================================================

cmake_minimum_required(VERSION 3.5)

# Default to an optimized build, but let `cmake -DCMAKE_BUILD_TYPE=Debug ..`
# (as described in the README) take effect instead of being silently overridden.
if(NOT CMAKE_BUILD_TYPE)
	set(CMAKE_BUILD_TYPE Release)
endif()

# The compilers are selected before project() so that language detection uses them.
set(CMAKE_C_COMPILER /usr/bin/gcc-10)
set(CMAKE_CXX_COMPILER /usr/bin/g++-10)
set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_FLAGS_RELEASE "-O3")
set(CMAKE_CXX_FLAGS_DEBUG "-g")
set(THREADS_PREFER_PTHREAD_FLAG ON)
project(alexandria LANGUAGES CXX)

add_definitions(-Wfatal-errors)

list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake")

add_subdirectory("deps/abseil-cpp")

find_package(roaring REQUIRED)
find_package(Threads REQUIRED)
find_package(CURL REQUIRED)
find_package(Boost REQUIRED COMPONENTS system iostreams filesystem unit_test_framework)
# ZLIB::ZLIB is linked into every executable, so a missing zlib must fail here
# with a clear message rather than later as an unknown imported target.
find_package(ZLIB REQUIRED)
find_package(fcgi)

include_directories(src/)
include_directories(deps/)
include_directories(tests/)

set(SRC_CLASSES

	"src/url_link/link.cpp"
	"src/api/result_with_snippet.cpp"
	"src/api/api_response.cpp"
	
	"src/file/file.cpp"
	"src/file/archive.cpp"
	"src/file/tsv_file.cpp"
	"src/file/gz_tsv_file.cpp"
	"src/file/tsv_file_remote.cpp"
	"src/file/tsv_row.cpp"

	"src/transfer/transfer.cpp"

	"src/hash_table2/hash_table.cpp"
	"src/hash_table2/hash_table_shard.cpp"
	"src/hash_table2/hash_table_shard_builder.cpp"
	"src/hash_table2/builder.cpp"

	"src/hash_table_helper/hash_table_helper.cpp"

	"src/parser/parser.cpp"
	"src/parser/entities.cpp"
	"src/parser/html_link.cpp"
	"src/parser/html_parser.cpp"
	"src/parser/unicode.cpp"
	"src/parser/cc_parser.cpp"

	"src/downloader/warc_downloader.cpp"
	"src/downloader/merge_downloader.cpp"

	"src/URL.cpp"

	"src/warc/warc.cpp"

	"src/profiler/profiler.cpp"

	"src/logger/logger.cpp"

	"src/utils/thread_pool.cpp"

	"src/memory/memory.cpp"
	"src/memory/debugger.cpp"

	"src/config.cpp"

	"src/algorithm/algorithm.cpp"
	"src/algorithm/intersection.cpp"
	"src/algorithm/sort.cpp"
	"src/algorithm/hash.cpp"
	"src/algorithm/hyper_log_log.cpp"
	"src/algorithm/bloom_filter.cpp"

	"src/tools/splitter.cpp"
	"src/tools/find_links.cpp"
	"src/tools/counter.cpp"
	"src/tools/calculate_harmonic.cpp"
	"src/tools/generate_url_lists.cpp"

	"src/cluster/document.cpp"
	"src/scraper/scraper.cpp"
	"src/scraper/scraper_store.cpp"

	"src/indexer/index_manager.cpp"
	"src/indexer/console.cpp"
	"src/indexer/merger.cpp"
	"src/indexer/score_builder.cpp"
	"src/indexer/index_reader.cpp"
	"src/indexer/index_utils.cpp"

	"src/server/search_server.cpp"
	"src/server/url_server.cpp"

	"src/http/server.cpp"
	"src/http/request.cpp"

	"src/domain_stats/domain_stats.cpp"
	"src/debug.cpp"

	"deps/robots.cc"
)

set(SRC_COMMON
	"src/common/dictionary.cpp"
	"src/common/system.cpp"
	"src/common/datetime.cpp"
	"src/common/dictionary_row.cpp"
	"src/text/stopwords.cpp"
	"src/text/text.cpp"
)

set(SRC_TESTS
	"tests/test_hyper_log_log.cpp"
	"tests/test_memory.cpp"
	"tests/test_algorithm.cpp"
	"tests/test_bloom_filter.cpp"
	"tests/test_cc_parser.cpp"
	"tests/test_configuration.cpp"
	"tests/test_counted_index_builder.cpp"
	"tests/test_datetime.h"
	"tests/test_file.cpp"
	"tests/test_hash.cpp"
	"tests/test_hash_table.cpp"
	"tests/test_html_parser.cpp"
	"tests/test_hyper_ball.cpp"
	"tests/test_index_builder.cpp"
	"tests/test_index_iteration.cpp"
	"tests/test_index_reader.cpp"
	"tests/test_logger.cpp"
	"tests/test_n_gram.cpp"
	"tests/test_robot_parser.cpp"
	"tests/test_scraper.cpp"
	"tests/test_sharded_index_builder.cpp"
	"tests/test_sort.cpp"
	"tests/test_sum_sorted.cpp"
	"tests/test_text.cpp"
	"tests/test_thread_pool.cpp"
	"tests/test_top_k.cpp"
	"tests/test_unicode.cpp"
	"tests/test_url.cpp"
	"tests/test_url_record.cpp"

	# This overloads the new/delete operators to keep track of memory, slows things down a lot.
	"src/memory/overload.cpp"
)

add_executable(run_tests
	"tests/main.cpp"
	${SRC_CLASSES}
	${SRC_COMMON}
	${SRC_TESTS}
)
add_executable(server
	"src/server.cpp"
	${SRC_CLASSES}
	${SRC_COMMON}
)
add_executable(scraper
	"src/scraper.cpp"
	${SRC_CLASSES}
	${SRC_COMMON}
)
add_executable(indexer
	"src/indexer.cpp"
	${SRC_CLASSES}
	${SRC_COMMON}
)
add_executable(alexandria
	"src/alexandria.cpp"
	${SRC_CLASSES}
	${SRC_COMMON}
)

target_compile_definitions(run_tests PUBLIC IS_TEST)
target_compile_definitions(run_tests PUBLIC FT_NUM_SHARDS=16)
target_compile_definitions(run_tests PUBLIC HT_NUM_SHARDS=16)
target_compile_definitions(run_tests PUBLIC FILE_SERVER="http://127.0.0.1")
target_compile_definitions(run_tests PUBLIC COMPILE_WITH_LINK_INDEX)

# Every executable shares the same warning policy and the same set of link libraries.
foreach(alexandria_target run_tests server scraper indexer alexandria)
	target_compile_options(${alexandria_target} PUBLIC -Wall -Werror)
	target_link_libraries(${alexandria_target} PUBLIC
		${FCGI_LIBRARY}
		${FCGI_LIBRARYCPP}
		${CURL_LIBRARIES}
		${Boost_LIBRARIES} ZLIB::ZLIB Threads::Threads absl::strings absl::numeric roaring::roaring)
endforeach()


================================================
FILE: Dockerfile
================================================
# syntax=docker/dockerfile:1
FROM ubuntu:latest
ARG DEBIAN_FRONTEND=noninteractive
RUN apt-get update && apt-get install -y zip make cmake gcc gcc-10 g++ g++-10 libcurl4-openssl-dev libssl-dev libcrypto++-dev libboost-iostreams-dev libboost-filesystem-dev libboost-system-dev libboost-test-dev libfcgi-dev spawn-fcgi nginx vim wget git curl


================================================
FILE: LICENSE
================================================
MIT License

Alexandria.org

Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


================================================
FILE: README.md
================================================
# Alexandria.org

1. [Coding Rules](/documentation/coding_rules.md)
2. [Full text indexes](/documentation/full_text_indexes.md)
3. [Hash table](/documentation/hash_table.md)

## Build instructions with docker
1. Checkout repo
WINDOWS USERS: You need to run 'git config --global core.autocrlf false' before checking out the repository
```
git clone git@github.com:alexandria-org/alexandria.git
```
2. Build docker image
```
docker build . -t alexandria
```
3. Run container
```
docker container run --name alexandria -v ${PWD}:/alexandria -it -d alexandria
```
4. Attach to container.
```
docker exec -it alexandria /bin/bash
```
5. Navigate to directory
```
cd /alexandria
```
6. Initialize docker
```
scripts/init-docker.sh
```
7. Configure with cmake
```
mkdir build; cd build; cmake ..
```
8. Build all
```
make -j4
```
9. Run test suite
```
./run_tests
```

## How to build manually (not recommended)
1. Configure the system (Tested on Ubuntu 20.04)
```
# Will alter your system and install dependencies with apt.
./scripts/install-deps.sh

# Will download and build zlib, aws-lambda-cpp and aws-sdk-cpp. This only alters the local directory.
./scripts/build-deps.sh
```

2. Build with cmake
```
mkdir build
cd build

cmake .. -DCMAKE_BUILD_TYPE=Debug
or
cmake .. -DCMAKE_BUILD_TYPE=Release

make -j24
```

3. Download test data to local server.
To run the test suite you need to install nginx and pre-download all the data: [Configure local nginx test data server](/documentation/configure_local_nginx.md)

4. Create output directories. Note: this will create a number of directories in /mnt, so make sure you don't have anything there.
```
./scripts/prepare-output-dirs.sh
```

5. Run the test suite
```
cd build
make run_tests -j24
./run_tests
```

## Notes
On nodes with spinning disks we should turn off energy saving:
```
hdparm -B 255 /dev/sda
```

## Debugging notes
### Debugging scraper with gdb:
By default, gdb catches SIGPIPE and pauses the process. However, some programs ignore SIGPIPE, so gdb's default behaviour is not desired when debugging them. To stop gdb from pausing on SIGPIPE, use the following command in gdb:
```handle SIGPIPE nostop noprint pass```


================================================
FILE: cmake/Findfcgi.cmake
================================================
# CMake module to search for FastCGI headers and libraries
#
# If the headers and the C library are found, FCGI_FOUND is set to TRUE.
# The following variables are set:
#    FCGI_INCLUDE_DIR
#    FCGI_LIBRARY
#    FCGI_LIBRARYCPP
FIND_PATH(FCGI_INCLUDE_DIR
  fcgio.h
  PATHS
  /usr/include
  /usr/local/include
  /usr/include/fastcgi
  "$ENV{LIB_DIR}/include"
  $ENV{INCLUDE}
  )

FIND_LIBRARY(FCGI_LIBRARY NAMES fcgi libfcgi PATHS 
  /usr/local/lib 
  /usr/lib 
  "$ENV{LIB_DIR}/lib"
  "$ENV{LIB}"
  )
FIND_LIBRARY(FCGI_LIBRARYCPP NAMES libfcgi++.so PATHS 
  /usr/local/lib 
  /usr/lib 
  "$ENV{LIB_DIR}/lib"
  "$ENV{LIB}"
  )

IF (FCGI_INCLUDE_DIR AND FCGI_LIBRARY)
   SET(FCGI_FOUND TRUE)
ENDIF (FCGI_INCLUDE_DIR AND FCGI_LIBRARY)

IF (FCGI_FOUND)
   IF (NOT FCGI_FIND_QUIETLY)
      MESSAGE(STATUS "Found FCGI: ${FCGI_LIBRARY}")
      MESSAGE(STATUS "Found FCGI: ${FCGI_LIBRARYCPP}")
   ENDIF (NOT FCGI_FIND_QUIETLY)
ELSE (FCGI_FOUND)
   IF (FCGI_FIND_REQUIRED)
      MESSAGE(FATAL_ERROR "Could not find FCGI")
   ENDIF (FCGI_FIND_REQUIRED)
ENDIF (FCGI_FOUND)


================================================
FILE: config.conf
================================================

# Cluster config
nodes_in_cluster = 3
node_id = 0

# Indexer config
batches[] = ALEXANDRIA-MANUAL-01
batches[] = CC-MAIN-2021-25
batches[] = CC-MAIN-2021-31

link_batches[] = CC-MAIN-2021-31
link_batches[] = CC-MAIN-2021-25
link_batches[] = CC-MAIN-2021-21
link_batches[] = CC-MAIN-2021-17
link_batches[] = CC-MAIN-2021-10
link_batches[] = CC-MAIN-2021-04
link_batches[] = CC-MAIN-2020-50
link_batches[] = CC-MAIN-2020-45

# Server config
worker_count = 8
query_max_words = 10 # Maximum number of words used in query.
query_max_len = 200
deduplicate_domain_count = 5
pre_result_limit = 200000
result_limit = 1000

# Full text config
ft_max_sections = 4
ft_max_results_per_section = 2000000


================================================
FILE: documentation/alexandria.md
================================================
Usage: ./alexandria [OPTIONS]...

## Options

**--downloader [commoncrawl-batch] [limit] [offset]**

Downloads files from the given commoncrawl batch. Limit and offset arguments are used for downloading a subset of the files. Example
```
./alexandria --downloader CC-MAIN-2022-27 2500 0
```
Will download the first 2500 files from CC-MAIN-2022-27 and upload them to the 'upload' host. See config documentation.

**--downloader-merge**

Merges downloaded files. This should run on the upload host to merge the different downloaded batches into our hash table.

**--hash-table-url [URL]**

Searches the local hash table called 'all_urls' for the given URL.

**--hash-table-url-hash [URL-hash]**

Searches the local hash table called 'all_urls' for the given URL-hash.

**--hash-table-count**

Counts all items in local hash table called 'all_urls'.

**--hash-table-find-all [HOST]**

Searches the local hash table called 'all_urls' for URLs from the specified host. This takes several days for a large hash table.

**--hash-table-count [HOST]**

Estimates the count for the given host by counting only one shard and multiplying by the number of shards.

**--hash-table-optimize-shard [SHARD]**

Optimizes shard for local hash table called 'all_urls'.

**--internal-harmonic**

Run the whole internal links harmonic calculator. Should run on 'upload' host.


================================================
FILE: documentation/api_response_format.md
================================================
# Api Response Format

This is a description of the endpoints available on a node.

### Perform search
```
curl http://node0002.alexandria.org/?q=the%20beatles
{
  "status":	"success",
  "time_ms":	35.876,
  "total_found":	245436,
  "total_url_links_found":	4092,
  "total_domain_links_found":	4092,
  "links_handled":	674,
  "link_domain_matches":	18059,
  "link_url_matches":	589,
  "results":	[{
    "url":	"https://www.example.com/",
    "title":	"Example dot com",
    "snippet":	"Lorem ipsum dolor esit",
    "score":	182.51408386230469,
    "domain_hash":	"2892282071861106665",
    "url_hash":	"2892281418178079567"
  }]
}

The url flag d can be used to control deduplication:
curl http://node0002.alexandria.org/?q=the%20beatles&d=a
curl http://node0002.alexandria.org/?q=the%20beatles&d=d

d=a // No deduplication, show all results
d=d // Deduplication
Default value is d=d
```

### Perform url lookup
```
curl http://node0002.alexandria.org/?u=https://www.example.org/
{
  "status":	"success",
  "time_ms":	35.876,
  "response":	"[DATA]"
}
```

### Fetch information about search result
```
curl http://node0002.alexandria.org/?s=example%20query
{
  "status":	"success",
  "time_ms":	13.984,
  "index":	{
    "total":	980770801,
    "words":	{
      "example":	0.0080152416772448342,
      "query":	0.0017581304401006531
    }
  },
  "link_index":	{
    "total":	472012858,
    "words":	{
      "example":	0.000581251114985516,
      "query":	6.3595725182554242e-05
    }
  }
}
```

### Fetch status of the node.
```
curl http://node0002.alexandria.org/status
{
  "status":	"success",
  "time_ms":	13.984,
  "total_disk_space": 89374934876,
  "avail_disk_space": 83975235,
  "avail_disk_percent": 0.0832,
  "index":	{
    "items":	980770801,
    "full_text_disk_used": 973295875,
    "full_text_disk_percent": 0.5423,
    "hash_table_disk_used": 839265,
    "hash_table_disk_percent": 0.05423
  },
  "link_index":	{
    "items":	980770801,
    "full_text_disk_used": 973295875,
    "full_text_disk_percent": 0.2423,
    "hash_table_disk_used": 839265,
    "hash_table_disk_percent": 0.0423
  }
}
```

### Combined api response (api.alexandria.org)
```
curl https://api.alexandria.org/?q=the%20beatles&p=1
{
  "status":	"success",
  "time_ms":	35.876,
  "total_found":	245436,
  "total_url_links_found":	4092,
  "total_domain_links_found":	4092,
  "links_handled":	674,
  "link_domain_matches":	18059,
  "link_url_matches":	589,
  "page_max": 10,
  "results":	[{
    "url":	"https://www.example.com/",
    "display_url": "https://www.example.com/",
    "title":	"Example dot com",
    "snippet":	"Lorem ipsum dolor esit",
    "score":	182.51408386230469,
    "domain_hash":	"2892282071861106665",
    "url_hash":	"2892281418178079567",
    "exact_match": 1,
    "phrase_match": 1,
    "year": 3300,
    "is_old": 0,
    "is_subdomain": 0,
    "domain": "www.example.com"
  },
  ...
  ]
}
```


================================================
FILE: documentation/caching.md
================================================
## Caching

Our nodes should try to use as much RAM as possible to cache index data for common tokens. I think the best way would be to hold a list of the most commonly queried tokens.

We can use /proc/meminfo to retrieve information about available memory on the server.


================================================
FILE: documentation/coding_rules.md
================================================

## Coding rules
1. Indent with tabs.
2. Use auto for variable declarations when possible.
3. Never put "using namespace std" in any file.
4. Prefix class member variables with m_, so you can tell whether you are using a member or a local variable.
5. All namespaces, classes, functions and variables should be lower_case.
6. All files within a sub-directory must declare everything within a namespace with the same name as the directory. For example src/file/tsv_file.h must declare everything within the namespace file::
7. Prefer smart pointers over regular pointers.
8. Prefer if statements over switch statements.

## Indentation examples

Indent with tabs!

### pointers
```c++
// * and & are glued to the variable
int *ptr = new int[100];
int *ptr2 = &addr;
```

### operators
```c++
// Spaces between binary operators
int a = 1 + 2;
int b = multiple * (add1 + add2);
a += b;

// Unary operators are glued to variable
int a = 1;
a++;
int b = -a;
```

### functions
```c++
// Spaces after comma
int add(int a, int b) {
    return a + b;
}

// Spaces after comma here too
add(123, 333);
```

### classes
```c++
template<typename data_record>
class index_builder {
    public:
        index_builder(const std::string &db_name, size_t id);
        int public_func();

    private:
        int m_member;
        int m_counter;

        int private_func();
};
```


### if
```c++
// Space between "if" and "("
// Space between ")" and "{"
if (something) {
    do_something();
} else if (something_else) {
    do_something_else();
} else {
    do_else();
}
```

### loops
```c++
// Prefer range based loops.
for (const auto &iter : m_map) {

}
// But if you need a standard loop indent it like this.
for (int i = 0; i < 100; i++) {

} 
```

### memory allocation
```c++
// Avoid new/delete, use smart pointers everywhere.
// If you just need a regular pointer to memory do this:
std::unique_ptr<char[]> allocator;
try {
    allocator = std::make_unique<char[]>(1000);
} catch (std::bad_alloc &error) {
    // Handle allocation error.
}

char *ptr = allocator.get();

// Use ptr as regular pointer to 1000 chars.
// ptr will be deleted automatically when allocator goes out of scope.
```


================================================
FILE: documentation/configure_local_nginx.md
================================================
# Configure local nginx server.

1. Install nginx
```
apt-get install nginx
```

2. Add configuration to /etc/nginx/sites-available/default (If you are running other sites locally you should probably do something else here)
```
server {
	listen 80 default_server;
	listen [::]:80 default_server;

	root /var/www/html/node0003.alexandria.org;

	index index.html index.htm index.nginx-debian.html;

	server_name _;

	location / {
		try_files $uri $uri/ =404;
		autoindex on;
	}
}
```

3. Download test data to /var/www/html
```
./scripts/download-test-data.sh /var/www/html
```


================================================
FILE: documentation/full_text_indexes.md
================================================
# The alexandria full text index

A full text index in its simplest form is a hash map from an integer word id ```key``` to a list of documents.

There are two kinds of data structures called ```index``` and ```counted_index```. Both data structures act on a given template type
```data_record```.
The two data structures share the same data layout except for the last part, where ```index``` stores roaring bitmaps while `counted_index` stores the records.

## Data layout

The index starts with a hash table. The hash table stores the position for the page containing `key` at index `key % hash_table_size`.

```
hash table        : uint64_t[hash_table_size] (8 x hash_table_size bytes)
num_records       : uint64_t (8 bytes)
list of records   : data_record[num_records] (sizeof(data_record) * num_records bytes)
consecutive pages : page[varying] (undetermined size)
```

A single page consists of a list of keys. Each key then has a corresponding position among the bitmaps and a length of the bitmap. The bitmaps (of varying length) are then stored consecutively.
```
num_keys             : uint64_t (8 bytes)
list of keys         : uint64_t[num_keys] (8 x num_keys bytes)
list of positions    : uint64_t[num_keys] (8 x num_keys bytes)
list of lengths      : uint64_t[num_keys] (8 x num_keys bytes)
consecutive bitmaps  : bitmap[num_keys] (undetermined size)
```


================================================
FILE: documentation/ideas.md
================================================
# Similar words
To handle similar words (saluhall, saluhallen) we should create a hashtable with similar words and as an additional index create "saluhall+" by combining our existing indexes of saluhall, saluhallen, saluhallarna etc. into one additional index.

# Autocomplete
We should base our autocomplete on the most common words in titles of documents before and after each word. For example "Uppsala" could suggest "Uppsala kommun", "Uppsala universitet" and "Destination Uppsala" based on the search results.


================================================
FILE: documentation/index_file_format.md
================================================
# Index file format

```
8 bytes number of keys (n)
8 * n bytes keys
8 * n bytes positions
8 * n bytes lengths (len(k) number of records for key k)
8 * n bytes total found results
[Data Records]
```

Data records are structured like this:
```
len(k) * (8 bytes unsigned long URL id, 4 bytes single precision float score)
```


================================================
FILE: documentation/indexer.md
================================================
### NAME

indexer - manually index data or analyze things

### SYNOPSIS

indexer [OPTION]

### DESCRIPTION
```
	--split source_batch target_prefix
		splits the urls in the local source batch and outputs them into {target_prefix}-[0-23]/files.
		for example --split CC-MAIN-2021-04 /mnt/crawl-data/NODE
	--split-count
	--split-count-domains
	--split-count-links
	--split-make-scraper-urls

	--tools-download-batch
	--tools-upload-urls-with-links
	--tools-find-links

	--calculate-harmonic-hosts
	--calculate-harmonic-links
	--calculate-harmonic

	--host-hash
	--host-hash-mod

	--console
		run the interactive console for making debug searches.

	--index-domans BATCH LIMIT OFFSET
		run the indexer for our domain index adding the urls+data from BATCH
	--index-links BATCH LIMIT OFFSET
		run the link indexer adding url_ and domain_ links from BATCH
	--index-words BATCH LIMIT OFFSET
		run the word indexer adding word data from BATCH
	--index-urls BATCH LIMIT OFFSET
		run the url indexer on batch generating one index per domain
	--index-snippets BATCH LIMIT OFFSET
		run the snippet indexer

	--truncate-domains
	--truncate-links
	--truncate-words
	--truncate-urls
	--truncate-snippets

	--info
		print info about indexes
```


================================================
FILE: documentation/installing_nodes.md
================================================
If there is a problem with RAID information on a drive, unmount all partitions and run:
```
wipefs -a /dev/nvme1n1
```
then reset and install node again.

To setup node with two drives run:
```
source <(curl -s https://raw.githubusercontent.com/alexandria-org/alexandria/main/scripts/bootstrap_node_2drives.sh)
```


================================================
FILE: documentation/performance_journal.md
================================================
## Performance journal

### File system testing
Ext2 (noatime,nodiratime,barrier=0)
```
$ dd if=/dev/zero of=/tmp/test1.img bs=10G count=1 oflag=dsync
0+1 records in
0+1 records out
2147479552 bytes (2.1 GB, 2.0 GiB) copied, 4.76649 s, 451 MB/s

$ echo 3 > /proc/sys/vm/drop_caches

$ time dd if=/tmp/test1.img of=/dev/null bs=8k
262143+1 records in
262143+1 records out
2147479552 bytes (2.1 GB, 2.0 GiB) copied, 1.43043 s, 1.5 GB/s

real	0m1.435s
user	0m0.013s
sys	0m0.763s
```
Ext2 (relatime)
```
$ dd if=/dev/zero of=/tmp/test1.img bs=10G count=1 oflag=dsync
0+1 records in
0+1 records out
2147479552 bytes (2.1 GB, 2.0 GiB) copied, 5.02563 s, 427 MB/s

$ echo 3 > /proc/sys/vm/drop_caches

$ time dd if=/tmp/test1.img of=/dev/null bs=8k
262143+1 records in
262143+1 records out
2147479552 bytes (2.1 GB, 2.0 GiB) copied, 1.48533 s, 1.4 GB/s

real	0m1.490s
user	0m0.046s
sys	0m0.604s
```

Ext4 (noatime,nodiratime,barrier=0):
```
$ dd if=/dev/zero of=/tmp/test1.img bs=10G count=1 oflag=dsync
0+1 records in
0+1 records out
2147479552 bytes (2.1 GB, 2.0 GiB) copied, 2.26469 s, 948 MB/s

$ echo 3 > /proc/sys/vm/drop_caches

$ time dd if=/tmp/test1.img of=/dev/null bs=8k
262143+1 records in
262143+1 records out
2147479552 bytes (2.1 GB, 2.0 GiB) copied, 0.821499 s, 2.6 GB/s

real	0m0.824s
user	0m0.004s
sys	0m0.648s
```

Ext4 (relatime):
```
$ dd if=/dev/zero of=/tmp/test1.img bs=10G count=1 oflag=dsync
0+1 records in
0+1 records out
2147479552 bytes (2.1 GB, 2.0 GiB) copied, 2.15461 s, 997 MB/s

$ echo 3 > /proc/sys/vm/drop_caches

$ time dd if=/tmp/test1.img of=/dev/null bs=8k
262143+1 records in
262143+1 records out
2147479552 bytes (2.1 GB, 2.0 GiB) copied, 0.822013 s, 2.6 GB/s

real	0m0.825s
user	0m0.029s
sys	0m0.568s
```

Conclusion. Run ext4

### Software load testing
2021-10-06, AX61-NVME with two discs
```
Server Software:        nginx/1.18.0
Server Hostname:        node0002.alexandria.org
Server Port:            80

Concurrency Level:      5
Time taken for tests:   294.451 seconds
Complete requests:      2000
Failed requests:        0
Write errors:           0
Total transferred:      294262066 bytes
HTML transferred:       293986342 bytes
Requests per second:    6.79 [#/sec] (mean)
Time per request:       736.127 [ms] (mean)
Time per request:       147.225 [ms] (mean, across all concurrent requests)
Transfer rate:          975.94 [Kbytes/sec] received

Connection Times (ms)
              min  mean[+/-sd] median   max
Connect:       12   19  10.1     16     152
Processing:    16  717 461.5    652    2896
Waiting:        0  662 431.7    587    2770
Total:         31  736 460.4    671    2911

Percentage of the requests served within a certain time (ms)
  50%    671
  66%    879
  75%   1009
  80%   1108
  90%   1344
  95%   1595
  98%   1864
  99%   2062
 100%   2911 (longest request)
```

2021-10-10, AX61-NVME with two discs
```
Server Software:        nginx/1.18.0
Server Hostname:        node0002.alexandria.org
Server Port:            80

Concurrency Level:      5
Time taken for tests:   328.051 seconds
Complete requests:      2000
Failed requests:        0
Write errors:           0
Total transferred:      255881934 bytes
HTML transferred:       255605934 bytes
Requests per second:    6.10 [#/sec] (mean)
Time per request:       820.128 [ms] (mean)
Time per request:       164.026 [ms] (mean, across all concurrent requests)
Transfer rate:          761.73 [Kbytes/sec] received

Connection Times (ms)
              min  mean[+/-sd] median   max
Connect:       12   52  95.6     25    1560
Processing:    16  767 558.9    689    3961
Waiting:       15  638 427.9    594    2631
Total:         32  819 558.5    742    4113

Percentage of the requests served within a certain time (ms)
  50%    742
  66%    982
  75%   1159
  80%   1260
  90%   1560
  95%   1831
  98%   2186
  99%   2470
 100%   4113 (longest request)
```

2021-10-10, AX41-NVMe with four discs
```
Server Software:        nginx/1.18.0
Server Hostname:        65.21.238.146
Server Port:            80

Concurrency Level:      5
Time taken for tests:   278.694 seconds
Complete requests:      2000
Failed requests:        0
Write errors:           0
Total transferred:      232745432 bytes
HTML transferred:       232469432 bytes
Requests per second:    7.18 [#/sec] (mean)
Time per request:       696.735 [ms] (mean)
Time per request:       139.347 [ms] (mean, across all concurrent requests)
Transfer rate:          815.56 [Kbytes/sec] received

Connection Times (ms)
              min  mean[+/-sd] median   max
Connect:       12   69  98.4     35    1107
Processing:    14  627 698.4    454    9790
Waiting:       14  435 346.5    368    4045
Total:         29  696 719.1    522   10159

Percentage of the requests served within a certain time (ms)
  50%    522
  66%    755
  75%    927
  80%   1050
  90%   1382
  95%   1781
  98%   2415
  99%   3439
 100%  10159 (longest request)
```

2021-10-10, AX41-NVMe with four discs
```
Server Software:        nginx/1.18.0
Server Hostname:        65.21.238.146
Server Port:            80

Concurrency Level:      5
Time taken for tests:   252.503 seconds
Complete requests:      2000
Failed requests:        0
Write errors:           0
Total transferred:      230349918 bytes
HTML transferred:       230073780 bytes
Requests per second:    7.92 [#/sec] (mean)
Time per request:       631.258 [ms] (mean)
Time per request:       126.252 [ms] (mean, across all concurrent requests)
Transfer rate:          890.88 [Kbytes/sec] received

Connection Times (ms)
              min  mean[+/-sd] median   max
Connect:       12   54  78.2     27    1068
Processing:    15  576 519.3    436    3659
Waiting:       15  421 325.7    354    2421
Total:         30  631 527.6    491    3728

Percentage of the requests served within a certain time (ms)
  50%    491
  66%    707
  75%    861
  80%    988
  90%   1355
  95%   1736
  98%   2100
  99%   2419
 100%   3728 (longest request)
```

2021-10-10, AX61-NVME with two discs, 4 partitions
```
Server Software:        nginx/1.18.0
Server Hostname:        65.21.125.158
Server Port:            80

Concurrency Level:      5
Time taken for tests:   263.283 seconds
Complete requests:      2000
Failed requests:        0
Write errors:           0
Total transferred:      282821583 bytes
HTML transferred:       282545445 bytes
Requests per second:    7.60 [#/sec] (mean)
Time per request:       658.209 [ms] (mean)
Time per request:       131.642 [ms] (mean, across all concurrent requests)
Transfer rate:          1049.03 [Kbytes/sec] received

Connection Times (ms)
              min  mean[+/-sd] median   max
Connect:       13   28  32.9     26     630
Processing:    17  629 434.1    563    3051
Waiting:       15  587 412.8    517    2949
Total:         36  657 435.8    593    3090

Percentage of the requests served within a certain time (ms)
  50%    593
  66%    774
  75%    914
  80%   1003
  90%   1260
  95%   1480
  98%   1708
  99%   1959
 100%   3090 (longest request)
```

2021-10-10, AX61-NVME with two discs, 4 partitions
```
Server Software:        nginx/1.18.0
Server Hostname:        65.21.125.158
Server Port:            80

Concurrency Level:      5
Time taken for tests:   249.241 seconds
Complete requests:      2000
Failed requests:        0
Write errors:           0
Total transferred:      267058842 bytes
HTML transferred:       266782842 bytes
Requests per second:    8.02 [#/sec] (mean)
Time per request:       623.101 [ms] (mean)
Time per request:       124.620 [ms] (mean, across all concurrent requests)
Transfer rate:          1046.38 [Kbytes/sec] received

Connection Times (ms)
              min  mean[+/-sd] median   max
Connect:       13   27  19.3     25     734
Processing:    15  596 469.4    506    3785
Waiting:        0  554 449.3    467    3660
Total:         32  622 470.7    531    3805

Percentage of the requests served within a certain time (ms)
  50%    531
  66%    735
  75%    878
  80%    974
  90%   1234
  95%   1495
  98%   1809
  99%   2104
 100%   3805 (longest request)
```

2021-10-12, AX61-NVME with four discs and 8 partitions
```
Server Software:        nginx/1.18.0
Server Hostname:        135.181.182.4
Server Port:            80

Concurrency Level:      5
Time taken for tests:   264.412 seconds
Complete requests:      2000
Failed requests:        0
Write errors:           0
Total transferred:      274309399 bytes
HTML transferred:       274033261 bytes
Requests per second:    7.56 [#/sec] (mean)
Time per request:       661.029 [ms] (mean)
Time per request:       132.206 [ms] (mean, across all concurrent requests)
Transfer rate:          1013.12 [Kbytes/sec] received

Connection Times (ms)
              min  mean[+/-sd] median   max
Connect:       13   27  16.1     25     348
Processing:    14  633 449.6    565    2996
Waiting:        0  590 425.7    520    2545
Total:         34  661 450.3    594    3014

Percentage of the requests served within a certain time (ms)
  50%    594
  66%    772
  75%    905
  80%   1000
  90%   1271
  95%   1510
  98%   1834
  99%   1997
 100%   3014 (longest request)
```

2021-10-12, AX61-NVME with four discs and 8 partitions
```
Server Software:        nginx/1.18.0
Server Hostname:        135.181.182.4
Server Port:            80

Concurrency Level:      5
Time taken for tests:   233.408 seconds
Complete requests:      2000
Failed requests:        0
Write errors:           0
Total transferred:      272488725 bytes
HTML transferred:       272213277 bytes
Requests per second:    8.57 [#/sec] (mean)
Time per request:       583.519 [ms] (mean)
Time per request:       116.704 [ms] (mean, across all concurrent requests)
Transfer rate:          1140.07 [Kbytes/sec] received

Connection Times (ms)
              min  mean[+/-sd] median   max
Connect:       12   25  10.1     24     187
Processing:    15  558 402.0    487    2727
Waiting:        0  512 377.0    440    2051
Total:         33  583 402.8    512    2757

Percentage of the requests served within a certain time (ms)
  50%    512
  66%    695
  75%    806
  80%    882
  90%   1114
  95%   1373
  98%   1621
  99%   1779
 100%   2757 (longest request)
```


================================================
FILE: documentation/search_result_ranking.md
================================================

# Search Result Ranking

This document describes how search results are indexed and ranked.

## Input
Input to our indexer is a sequence of deduplicated urls with the following data.
```
{
    url: "https://www.example.com/",
    title: "Example Page",
    meta_description: "",
    h1: "Example Domain",
    text: "This domain is for use in illustrative examples in documents. You may use this domain in literature without prior coordination or asking for permission. More information..."
}
```

## 1. Domain level
Each url is added with the url hash as key. The tokens are not deduplicated throughout the domain.

```
domain_score:
idf * sum(tf_ + )
```

```
domain_score = expm1(5 * link.m_score) + 0.1;
url_score = expm1(10 * link.m_score) + 0.1;
```


================================================
FILE: documentation/statues_swe.tex
================================================

\documentclass[12pt, a4paper]{article}
\usepackage[T1]{fontenc}
\usepackage[utf8]{inputenc}
\usepackage[swedish]{babel}

\title{Stadgar för Föreningen Alexandria.org}
\date{Januari 2022}

\begin{document}

\maketitle

\paragraph{§ 1 Föreningens firma}
\paragraph{}
Föreningens firma är Föreningen Alexandria.org och föreningens firmatecknare är ordförande eller annan person utsedd till firmatecknare av styrelsen.

\paragraph{§ 2 Föreningens ändamål}
\paragraph{}
Föreningen har som ändamål att göra kunskap mer tillgängligt. Föreningen ska uppfylla sitt ändamål genom att utveckla och tillhandahålla en sökmotor som är gratis och utan annonser. Källkoden till sökmotorn ska publiceras som öppen källkod.

\paragraph{§ 3 Föreningens säte}
\paragraph{}
Föreningen har sitt säte i Uppsala.

\paragraph{§ 4 Medlemskap}
\paragraph{}
Föreningens medlemmar är aktiva i föreningens verksamhet. Nya medlemmar måste godkännas av styrelsen.

\paragraph{§ 5 Medlemsavgifter}
\paragraph{}
Medlem ska betala den medlemsavgift som årligen fastställs av årsmötet.

\pagebreak


\paragraph{§ 6 Styrelsen}
\paragraph{}
Styrelsen består av en ordförande, en kassör, en suppleant och eventuellt ytterligare ledamöter enligt årsmötets beslut.

\paragraph{§ 7 Styrelsens uppdrag}
\paragraph{}
Styrelsen företräder föreningen, bevakar dess intressen och handhar dess angelägenheter. Styrelsen beslutar å föreningens vägnar såvida inte annat
föreskrivs i dessa stadgar. Styrelsen ska verkställa av årsmötet fattade beslut, handha föreningens ekonomiska angelägenheter och föra räkenskaper,
samt avge årsredovisning till årsstämman för det senaste räkenskapsåret. Styrelsen sammanträder när ordföranden finner det erforderligt eller om
minst två styrelseledamöter begär detta.

\paragraph{}
Styrelsen är beslutsför då minst hälften av ledamöterna, avrundat uppåt, är närvarande. Styrelsebeslut fattas med enkel majoritet. Vid lika röstetal gäller den mening
ordföranden biträder.

\paragraph{§ 8 Räkenskaper}
\paragraph{}
Räkenskapsår ska vara kalenderår.

\paragraph{§ 9 Revisor}
\paragraph{}
Styrelsens förvaltning ska årligen granskas av en på årsmötet utsedd revisor. Revisorn ska senast den 1 mars avge sin revisionsberättelse. Revisorn får ej vara medlem i styrelsen.

\paragraph{§ 10 Årsmöte}
\paragraph{}
Ordinarie årsmöte, vilket är föreningens högsta beslutande organ, hålls årligen före den 30 juni på tid och plats som styrelsen bestämmer. Kallelse sker via epost minst 2 veckor före utsatt möte. Motioner som har inkommit senast 7 dagar före årsmötet ska anses ha kommit i tid. Motioner skickas via epost.

\paragraph{}
Vid ordinarie årsmöte ska följande ärenden behandlas:
\begin{enumerate}
\item Val av ordförande och sekreterare för mötet.
\item Fastställande av röstlängd för mötet.
\item Fastställande av dagordning.
\item Styrelsens verksamhetsberättelse för det senaste verksamhetsåret.
\item Styrelsens förvaltningsberättelse (balans- och resultaträkning) för det senaste verksamhets-/räkenskapsåret.
\item Revisionsberättelsen för verksamhets-/räkenskapsåret.
\item Fråga om ansvarsfrihet för styrelsen för den tid revisionen avser.
\item Fastställande av medlemsavgifter.
\item Fastställande av ev. verksamhetsplan och behandling av budget för det kommande verksamhets-/räkenskapsåret.
\item Val av ordförande i föreningen för en tid av 1 år.
\item Val av kassör, övriga styrelseledamöter samt suppleanter för en tid av 1 år
\item Val av revisorer.
\item Behandling av styrelsens förslag och i rätt tid inkomna motioner.
\item Övriga frågor. 
\end{enumerate}

\paragraph{§ 11 Extra årsmöte}
\paragraph{}
Extra årsmöte hålls när styrelsen eller revisorerna finner att det är nödvändigt. Kallelse sker via epost minst 2 veckor före utsatt möte.

\paragraph{§ 12 Rösträtt}
\paragraph{}
Vid årsmöte har varje medlem en röst. Rösträtten är personlig och kan inte utövas genom ombud.

\paragraph{§ 13 Beslut, omröstning och beslutsmässighet}
\paragraph{}
Beslut fattas med bifallsrop (acklamation) eller om så begärs, efter omröstning (votering).

\paragraph{}
Omröstning sker öppet, utom vid val där sluten omröstning ska äga rum om någon begär detta. Beslut fattas, såvida dessa stadgar ej föreskriver
annat, med enkel majoritet. Vid lika röstetal skall den mening som ordförande biträder vinna bifall.

\paragraph{}
Mötet är beslutsmässigt med det antal röstberättigade medlemmar som är närvarande på mötet.

\paragraph{§ 14 Regler för ändring av stadgarna}
\paragraph{}
För ändring av dessa stadgar krävs beslut av två på varandra följande ordinarie årsmöten. Förslag till ändring av stadgarna får ges såväl av medlem som styrelsen.

\paragraph{§ 15 Utträde}
\paragraph{}
Medlem som önskar utträda ur föreningen ska skriftligen anmäla detta till styrelsen och anses därmed omedelbart ha lämnat föreningen.

\paragraph{§ 16 Uteslutning}
\paragraph{}
Medlem får uteslutas från föreningen om den har försummat att betala beslutade avgifter, motarbetat föreningens
verksamhet eller ändamål, eller skadat föreningens intressen. Beslut om uteslutning fattas av styrelsen.

\end{document}


================================================
FILE: scripts/bootstrap_node_2drives.sh
================================================
#!/bin/bash

# Refresh the package index and install the tools needed on the node;
# parted, nginx and unzip are all used further down in this script.
apt-get update
apt-get -y install vim parted zip unzip nginx

# Wipe a drive, split it into four equally sized ext4 partitions, mount them
# and append matching entries to /etc/fstab.
#
# Arguments:
#   $1        block device to partition (its partition table is replaced!)
#   $2 .. $5  mount points for partition 1 .. 4
#
# Partition devices are addressed as "${disc}pN", i.e. NVMe style names such
# as /dev/nvme1n1p1.
_mkpart() {
	# The bootstrap script may be sourced into an interactive shell, so keep
	# all helper variables local instead of leaking them to the caller.
	local disc=$1
	local -a mountpoints=("$2" "$3" "$4" "$5")
	local -a bounds=(0% 25% 50% 75% 100%)
	local i

	parted -s "$disc" mklabel gpt
	for i in 0 1 2 3; do
		parted -s -a optimal "$disc" mkpart primary ext4 "${bounds[i]}" "${bounds[i + 1]}"
	done

	# NOTE(review): presumably gives the kernel time to create the partition
	# device nodes before they are formatted - confirm.
	sleep 1

	for i in 1 2 3 4; do
		mkfs.ext4 -F "${disc}p${i}"
	done

	# -p: do not fail when a mount point already exists (e.g. on a re-run).
	mkdir -p "${mountpoints[@]}"

	for i in 1 2 3 4; do
		mount "${disc}p${i}" "${mountpoints[i - 1]}"
	done

	# Persist the mounts across reboots.
	echo "" >> /etc/fstab
	for i in 1 2 3 4; do
		echo "${disc}p${i} ${mountpoints[i - 1]} ext4 noatime,nodiratime,barrier=0 0 0" >> /etc/fstab
	done
}

# Shards 0-3 are plain directories below /mnt.
# -p keeps a re-run from failing on directories that already exist.
mkdir -p /mnt/0 /mnt/1 /mnt/2 /mnt/3

# Shards 4-7 each get their own partition on /dev/nvme1n1.
_mkpart /dev/nvme1n1 /mnt/4 /mnt/5 /mnt/6 /mnt/7

# Create the same directory layout inside every shard.
for shard in $(seq 0 7); do
	mkdir -p "/mnt/$shard"/{input,output,upload,hash_table,full_text,tmp}
done

# Replace the default nginx site with a FastCGI pass-through to 127.0.0.1:8000.
# The heredoc delimiter is unquoted, so every "\$" below is written to the
# file as a literal "$" (an nginx variable), not expanded by the shell.
cat > /etc/nginx/sites-enabled/default <<EOF
server {
    listen 80;
    server_name localhost;

    location / {
        fastcgi_pass   127.0.0.1:8000;
        fastcgi_param  GATEWAY_INTERFACE  CGI/1.1;
        fastcgi_param  SERVER_SOFTWARE    nginx;
        fastcgi_param  QUERY_STRING       \$query_string;
        fastcgi_param  REQUEST_METHOD     \$request_method;
        fastcgi_param  CONTENT_TYPE       \$content_type;
        fastcgi_param  CONTENT_LENGTH     \$content_length;
        fastcgi_param  SCRIPT_FILENAME    \$document_root\$fastcgi_script_name;
        fastcgi_param  SCRIPT_NAME        \$fastcgi_script_name;
        fastcgi_param  REQUEST_URI        \$request_uri;
        fastcgi_param  DOCUMENT_URI       \$document_uri;
        fastcgi_param  DOCUMENT_ROOT      \$document_root;
        fastcgi_param  SERVER_PROTOCOL    \$server_protocol;
        fastcgi_param  REMOTE_ADDR        \$remote_addr;
        fastcgi_param  REMOTE_PORT        \$remote_port;
        fastcgi_param  SERVER_ADDR        \$server_addr;
        fastcgi_param  SERVER_PORT        \$server_port;
        fastcgi_param  SERVER_NAME        \$server_name;
    }
}
EOF
/etc/init.d/nginx restart

# Create a password-less system account (home /alexandria, shell /sbin/nologin);
# the systemd unit written below runs the server as this user.
adduser --system --shell /sbin/nologin --gecos "User for running alexandria service" --disabled-password --home /alexandria alexandria

# Pre-create the log file and hand it to the service user.
touch /var/log/alexandria.log
chown alexandria:syslog /var/log/alexandria.log

# Install the systemd unit that runs /alexandria/server as the alexandria user.
# The quoted delimiter makes the heredoc body fully literal.
cat > /etc/systemd/system/alexandria.service <<'EOF'
[Unit]
Description=Alexandria Server

[Service]
User=alexandria
WorkingDirectory=/alexandria
ExecStart=/alexandria/server
Nice=-20
Restart=always

[Install]
WantedBy=multi-user.target
EOF

# Write the node configuration to /etc/alexandria.conf.
# The quoted delimiter makes the heredoc body fully literal.
cat > /etc/alexandria.conf <<'EOF'
# Cluster config
nodes_in_cluster = 4
node_id = 0

# Indexer config
batches[] = NODE-0
batches[] = NODE-1
batches[] = NODE-2
batches[] = NODE-3
batches[] = NODE-4
batches[] = NODE-5

link_batches[] = LINK-0
link_batches[] = LINK-1
link_batches[] = LINK-2
link_batches[] = LINK-3
link_batches[] = LINK-4
link_batches[] = LINK-5

# Server config
worker_count = 8
query_max_words = 10 # Maximum number of words used in query.
query_max_len = 200
deduplicate_domain_count = 5
pre_result_limit = 200000
result_limit = 1000

# Full text config
ft_max_sections = 8
ft_max_results_per_section = 2000000
ft_section_depth = 4
EOF

# Fetch and unpack the v1.0 release into /alexandria.
# -p: do not fail if the directory already exists.
mkdir -p /alexandria
# Only download and unpack when we really are inside /alexandria; otherwise the
# archive would be extracted into whatever directory the caller happened to be in.
if cd /alexandria; then
	wget https://github.com/alexandria-org/alexandria/releases/download/v1.0/alexandria.zip \
		&& unzip alexandria.zip
else
	echo "Could not enter /alexandria, skipping download of alexandria.zip" >&2
fi
# Hand all shard directories to the service user.
chown -R alexandria /mnt/*


================================================
FILE: scripts/build-deps.sh
================================================
#!/bin/bash

# Build and install the third party sources found in deps/ (zlib and CRoaring).
# Every cd is checked so that a missing directory can never cause
# configure/make/make install to run in the wrong place.

cd "$(dirname "$0")" || exit 1
cd .. || exit 1

base_path=$(pwd)

# zlib
cd "$base_path/deps/zlib-1.2.12" || exit 1
./configure
make -j4
make install

# Exported after the zlib build, so the compiler choice only affects what follows.
export CC=/usr/bin/gcc
export CXX=/usr/bin/g++

# CRoaring, built out of source with CMake.
cd "$base_path/deps/CRoaring" || exit 1
# -p: a build directory left over from an earlier run is reused.
mkdir -p build
cd build || exit 1
cmake ..
make
make install


================================================
FILE: scripts/clean.sh
================================================
#!/bin/bash

cd "$(dirname "$0")" || exit 1
cd .. || exit 1

# Only a single upper case "Y" confirms; any other key leaves the data untouched.
read -p "Do you want to delete your local alexandria data? [Y/n] " -n 1 -r
echo
if [[ $REPLY =~ ^[Y]$ ]]
then
	for shard in $(seq 0 7); do
		# ${shard:?} aborts instead of expanding to "/mnt//*" should shard ever be empty.
		# -f: an already empty shard is not an error.
		rm -rf "/mnt/${shard:?}"/*
		# Recreate the shard layout. Only the contents were removed above, so
		# /mnt/$shard itself still exists and -p is required.
		mkdir -p "/mnt/$shard"/{input,output,upload,hash_table,full_text,tmp}
	done

else
	echo "Ignoring"
fi


================================================
FILE: scripts/download-deps.sh
================================================
#!/bin/bash

# Download the third party sources into deps/ below the repository root.

cd "$(dirname "$0")" || exit 1
cd .. || exit 1

export CC=/usr/bin/gcc-10
export CXX=/usr/bin/g++-10

base_path=$(pwd)

mkdir -p "$base_path/deps"
# Never download into the wrong directory.
cd "$base_path/deps" || exit 1

# -f: fail on HTTP errors instead of saving the error page as if it were the download.
# -L: follow redirects (the GitHub "latest" URL redirects to the actual release).
curl -fL -o json.hpp https://github.com/nlohmann/json/releases/latest/download/json.hpp

curl -fL -o zlib-1.2.12.tar.gz https://zlib.net/fossils/zlib-1.2.12.tar.gz
gunzip -f zlib-1.2.12.tar.gz
tar -xvf zlib-1.2.12.tar

git clone https://github.com/abseil/abseil-cpp.git
git clone https://github.com/RoaringBitmap/CRoaring.git
wget https://raw.githubusercontent.com/google/robotstxt/master/robots.cc
wget https://raw.githubusercontent.com/google/robotstxt/master/robots.h


================================================
FILE: scripts/download-test-data.sh
================================================
#!/bin/bash

# Mirror the alexandria test data from node0003.alexandria.org into the
# directory given as first argument and create the local shard layout in /mnt.

cd "$(dirname "$0")" || exit 1

if [ $# -eq 0 ]; then
	echo "Provide destination path as first argument"
	exit 1
fi

# -p makes the script safe to re-run when the shard directories already exist.
for shard in $(seq 0 7); do
	mkdir -p "/mnt/$shard"/{input,output,upload,hash_table,full_text,tmp}
done

DEST=$1

cd "$DEST" || { echo "target directory does not exist"; exit 127; }

# NOTE(review): the HTTP basic auth credentials are hard coded in this script
# and therefore visible to everyone with access to the repository.
http_user=alexandria
http_password=wmXN6U4u

base_url=http://node0003.alexandria.org

# Everything that is mirrored, relative to $base_url.
remote_paths=(
	crawl-data/ALEXANDRIA-TEST-01/
	crawl-data/ALEXANDRIA-TEST-02/
	crawl-data/ALEXANDRIA-TEST-03/
	crawl-data/ALEXANDRIA-TEST-04/
	crawl-data/ALEXANDRIA-TEST-05/
	crawl-data/ALEXANDRIA-TEST-06/
	crawl-data/ALEXANDRIA-TEST-07/
	crawl-data/ALEXANDRIA-TEST-08/
	crawl-data/ALEXANDRIA-TEST-09/
	crawl-data/ALEXANDRIA-TEST-10/
	crawl-data/ALEXANDRIA-MANUAL-01/warc.paths.gz
	crawl-data/ALEXANDRIA-MANUAL-01/files/top_domains.txt.gz
	crawl-data/ALEXANDRIA-MANUAL-01/files/50_top_domains.txt.gz
	dev_files/
	example.txt
	example.txt.gz
	test-data/
)

# Start from a clean mirror; -f so that a missing directory (first run) is not an error.
rm -rf node0003.alexandria.org

for remote_path in "${remote_paths[@]}"; do
	wget -r -l1 --no-parent --http-user="$http_user" --http-password="$http_password" "$base_url/$remote_path"
done

# Extra directories inside the mirror that are not part of the download.
mkdir -p node0003.alexandria.org/nodes/test0001
mkdir -p node0003.alexandria.org/upload-tmp

chown -R www-data:www-data node0003.alexandria.org


================================================
FILE: scripts/find_missing_files_in_batch.sh
================================================
#!/bin/bash

# Usage: find_missing_files_in_batch.sh BATCH
#
# Downloads the warc.paths.gz listing of BATCH from data.commoncrawl.org and
# writes every listed file that is missing locally, or smaller than 1000 bytes,
# to /mnt/crawl-data/BATCH/missing.paths.gz.

cd "$(dirname "$0")" || exit 1
cd .. || exit 1

batch=$1

if [[ -z "$batch" ]]; then
	echo "Usage: $0 <batch>" >&2
	exit 1
fi

# curl -f plus pipefail: a failed download aborts the script instead of being
# treated as an empty listing (which would report "nothing is missing").
set -o pipefail
files=$(curl -f "https://data.commoncrawl.org/crawl-data/$batch/warc.paths.gz" | gunzip) || {
	echo "Could not download warc.paths.gz for batch '$batch'" >&2
	exit 1
}

missing_files_path="/mnt/crawl-data/$batch/missing.paths"

# Start with an empty result file.
truncate -s 0 "$missing_files_path"

for raw_file in $files; do
	# Local files live below /mnt and end in .gz instead of .warc.gz.
	file="/mnt/${raw_file/.warc.gz/.gz}"
	if [[ -f "$file" ]]; then
		filesize=$(stat -c%s "$file")
		if [[ $filesize -lt 1000 ]]; then
			echo "The file '$file' exists and is small."
			echo "$raw_file" >> "$missing_files_path"
		fi
	else
		echo "The file '$file' does not exist."
		echo "$raw_file" >> "$missing_files_path"
	fi
done

# -f: replace the missing.paths.gz of a previous run instead of failing on it.
gzip -f "$missing_files_path"


================================================
FILE: scripts/init-docker.sh
================================================
#!/bin/bash

cd `dirname $0`

# The local docker development environment runs the data server on the local machine.
# This script sets that up and downloads the test data.

echo "Copying nginx config";

# Default nginx site for the local data server: serves
# /var/www/html/node0003.alexandria.org with directory listing and PUT uploads,
# and forwards /store to FastCGI on 127.0.0.1:8001.
# The heredoc delimiter is unquoted, so every "\$" is written as a literal "$".
# The empty line before the delimiter is intentional: the file ends with a blank line.
cat > /etc/nginx/sites-enabled/default <<EOF
server {
	listen 80 default_server;
	listen [::]:80 default_server;

	root /var/www/html/node0003.alexandria.org;
	index index.html;
	server_name _;

	location / {
			autoindex on;
    		client_body_temp_path /var/www/html/node0003.alexandria.org/upload-tmp;
    		dav_methods PUT;
    		create_full_put_path  on;
    		dav_access group:rw  all:r;
    		client_max_body_size 10000m;
	}
	location /store {
		fastcgi_pass   127.0.0.1:8001;
		fastcgi_param  GATEWAY_INTERFACE  CGI/1.1;
		fastcgi_param  SERVER_SOFTWARE    nginx;
		fastcgi_param  QUERY_STRING       \$query_string;
		fastcgi_param  REQUEST_METHOD     \$request_method;
		fastcgi_param  CONTENT_TYPE       \$content_type;
		fastcgi_param  CONTENT_LENGTH     \$content_length;
		fastcgi_param  SCRIPT_FILENAME    \$document_root\$fastcgi_script_name;
		fastcgi_param  SCRIPT_NAME        \$fastcgi_script_name;
		fastcgi_param  REQUEST_URI        \$request_uri;
		fastcgi_param  DOCUMENT_URI       \$document_uri;
		fastcgi_param  DOCUMENT_ROOT      \$document_root;
		fastcgi_param  SERVER_PROTOCOL    \$server_protocol;
		fastcgi_param  REMOTE_ADDR        \$remote_addr;
		fastcgi_param  REMOTE_PORT        \$remote_port;
		fastcgi_param  SERVER_ADDR        \$server_addr;
		fastcgi_param  SERVER_PORT        \$server_port;
		fastcgi_param  SERVER_NAME        \$server_name;
	}
}

EOF

echo "Downloading test data";
./download-test-data.sh /var/www/html

# download-test-data.sh creates these directories as well; -p keeps this step
# from failing when they are already there.
mkdir -p /var/www/html/node0003.alexandria.org/nodes/test0001
mkdir -p /var/www/html/node0003.alexandria.org/upload-tmp

# nginx must be able to write uploads into the data directory.
chown -R www-data:www-data /var/www/html/node0003.alexandria.org

/etc/init.d/nginx restart

# Fetch and build the third party dependencies.
./download-deps.sh
./build-deps.sh


================================================
FILE: scripts/install-deps.sh
================================================
#!/bin/bash

# Build tools, compilers and development libraries, one group per line.
apt-get install -y \
	zip make cmake gcc-10 g++-10 gcc g++ \
	libcurl4-openssl-dev libssl-dev libcrypto++-dev \
	libboost-iostreams-dev libboost-filesystem-dev libboost-system-dev libboost-test-dev \
	libfcgi-dev spawn-fcgi nginx


================================================
FILE: scripts/packager.sh
================================================
#!/bin/bash
#  Copyright 2018-present Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
#  Licensed under the Apache License, Version 2.0 (the "License").
#  You may not use this file except in compliance with the License.
#  A copy of the License is located at
#
#   http://aws.amazon.com/apache2.0
#
#  or in the "license" file accompanying this file. This file is distributed
#  on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
#  express or implied. See the License for the specific language governing
#  permissions and limitations under the License.

# Modified by Josef Cullhed 2021

set -euo pipefail

# Print the usage text to stdout. Every entry is followed by an empty line.
print_help() {
    printf '%s\n\n' \
        "Usage: packager [OPTIONS] <binary name>" \
        "OPTIONS"
    printf '\t%s\t %s\n\n' \
        "-d,--default-libc" \
        "Use the target host libc libraries. This will not package the C library files."
}

if [ $# -lt 1 ]; then
    echo -e "Error: missing arguments\n"
    print_help
    exit 1
fi

# Split the command line into known flags and positional arguments.
POSITIONAL=()
INCLUDE_LIBC=true
while [[ $# -gt 0 ]]
do
    key="$1"
    case $key in
        -d|--default-libc)
            INCLUDE_LIBC=false
            shift # past argument
            ;;
        *)    # unknown option
            POSITIONAL+=("$1") # save it in an array for later
            shift # past argument
            ;;
    esac
done

# Only flags were given (for example just "-d"). Without this check the script
# dies further down with an "unbound variable" error because of "set -u".
if [ ${#POSITIONAL[@]} -lt 1 ]; then
    echo -e "Error: missing <binary name> argument\n"
    print_help
    exit 1
fi
set -- "${POSITIONAL[@]}" # restore positional parameters

# First positional argument: directory that contains the binary to package.
PKG_BIN_PATH=$1
architecture=$(arch)

if [ ! -d "$PKG_BIN_PATH" ]; then
    echo "$PKG_BIN_PATH" - No such directory.;
    exit 1;
fi

if ! type zip > /dev/null 2>&1; then
    echo "zip utility is not found. Please install it and re-run this script"
    exit 1
fi
# Lists the shared objects (*.so and *.so.N) owned by the glibc package on Arch/Manjaro.
# Prints nothing when /etc/os-release does not name one of those distributions or when pacman
# is not available.
function package_libc_via_pacman {
    grep --extended-regexp "Arch Linux|Manjaro Linux" < /etc/os-release > /dev/null 2>&1 || return 0
    type pacman > /dev/null 2>&1 || return 0

    pacman --query --list --quiet glibc | sed -n -E '/\.so$|\.so\.[0-9]+$/p'
}

# Lists the shared objects (*.so and *.so.N) owned by the libc6 package.
# Prints nothing when dpkg-query is not available or reports no files for libc6.
function package_libc_via_dpkg() {
    type dpkg-query > /dev/null 2>&1 || return 0
    [[ $(dpkg-query --listfiles libc6 | wc -l) -gt 0 ]] || return 0

    dpkg-query --listfiles libc6 | sed -n -E '/\.so$|\.so\.[0-9]+$/p'
}

# Lists the shared objects (*.so and *.so.N) owned by the glibc rpm of the current architecture.
# Prints nothing when rpm is not available or when the query yields at most one line.
function package_libc_via_rpm() {
    type rpm > /dev/null 2>&1 || return 0
    [[ $(rpm --query --list "glibc.$architecture" | wc -l) -gt 1 ]] || return 0

    rpm --query --list "glibc.$architecture" | sed -n -E '/\.so$|\.so\.[0-9]+$/p'
}

# hasElement <needle> [<item>...]
# Succeeds (status 0) when <needle> is exactly equal to one of the following arguments and
# fails (status 1) otherwise, i.e. it behaves like array.contains(needle).
# e.g. hasElement "needle" "${haystack[@]}"
function hasElement() {
    local needle=$1 candidate
    shift
    for candidate; do
        if [[ "$candidate" == "$needle" ]]; then
            return 0
        fi
    done
    return 1
}

# Base name of the resulting zip archive.
PKG_BIN_FILENAME=alexandria
# Staging directory (relative to the current directory); it is zipped and removed at the end.
PKG_DIR=tmp
# File name of the dynamic loader that ends up in $PKG_DIR/lib; filled in below.
PKG_LD=""

# Shared library paths as reported by ldd: the second to last field of every output line.
# NOTE(review): only the "server" binary is inspected although scraper, indexer and alexandria
# are packaged as well - this presumably relies on all of them linking the same libraries.
list=$(ldd "$PKG_BIN_PATH/server" | awk '{print $(NF-1)}')
# All libc shared objects known to whichever package manager is present on this host.
libc_libs=()
libc_libs+=($(package_libc_via_dpkg))
libc_libs+=($(package_libc_via_rpm))
libc_libs+=($(package_libc_via_pacman))

mkdir -p "$PKG_DIR/bin" "$PKG_DIR/lib"

# Copy every non-libc dependency of the binary into the package.
for i in $list
do
    if [[ ! -f $i ]]; then # ignore linux-vdso.so.1
        continue
    fi

    # Do not copy libc files which are directly linked unless it's the dynamic loader
    if hasElement "$i" "${libc_libs[@]}"; then
        filename=$(basename "$i")
        if [[ -z "${filename##ld-*}" ]]; then
            PKG_LD=$filename # Use this file as the loader
            cp "$i" "$PKG_DIR/lib"
        fi
        continue
    fi

    cp "$i" $PKG_DIR/lib
done

# Unless -d/--default-libc was given, ship the complete C library as well.
if [[ $INCLUDE_LIBC == true ]]; then
    for i in "${libc_libs[@]}"
    do
        filename=$(basename "$i")
        if [[ -z "${filename##ld-*}" ]]; then
            # if the loader is empty, then the binary is probably linked to a symlink of the loader. The symlink will
            # not show up when querying the package manager for libc files. So, in this case, we want to copy the loader
            if [[ -z "$PKG_LD" ]]; then 
                PKG_LD=$filename
                cp "$i" "$PKG_DIR/lib" # we want to follow the symlink (default behavior)
            fi
            continue # We don't want the dynamic loader's symlink because its target is an absolute path (/lib/ld-*).
        fi
        cp --no-dereference "$i" "$PKG_DIR/lib"
    done
fi

# The generated wrapper scripts start the binaries through the packaged loader, so it is mandatory.
if [[ -z "$PKG_LD" ]]; then
    echo "Failed to identify, locate or package the loader. Please file an issue on Github!" 1>&2
    exit 1
fi

# Wrapper scripts that are written next to bin/ and lib/ in the package. Each one starts its
# binary through the packaged dynamic loader ($PKG_LD is expanded now, at packaging time) with
# ./lib as the library path.
#
# The indexer and alexandria wrappers forward their command line. The arguments are forwarded as
# "$@" (quoted): an unquoted $@ would split arguments containing whitespace and expand globs.

bootstrap_script_server=$(cat <<EOF
#!/bin/bash
set -euo pipefail
ulimit -n 104857
ALEXANDRIA_LIVE=1 ALEXANDRIA_CONFIG=/etc/alexandria.conf nice -n -20 ./lib/$PKG_LD --library-path ./lib ./bin/server
EOF
)

bootstrap_script_scraper=$(cat <<EOF
#!/bin/bash
set -euo pipefail
ulimit -n 104857
ALEXANDRIA_LIVE=1 ALEXANDRIA_CONFIG=/etc/alexandria.conf nice -n -20 ./lib/$PKG_LD --library-path ./lib ./bin/scraper
EOF
)

bootstrap_script_indexer=$(cat <<EOF
#!/bin/bash
set -euo pipefail
ulimit -n 104857
ALEXANDRIA_LIVE=1 ALEXANDRIA_CONFIG=/etc/alexandria.conf ./lib/$PKG_LD --library-path ./lib ./bin/indexer "\$@"
EOF
)

bootstrap_script_alexandria=$(cat <<EOF
#!/bin/bash
set -euo pipefail
ulimit -n 104857
ALEXANDRIA_LIVE=1 ALEXANDRIA_CONFIG=/etc/alexandria.conf ./lib/$PKG_LD --library-path ./lib ./bin/alexandria "\$@"
EOF
)

# Copy the binaries into the package.
for binary in server scraper indexer alexandria; do
    cp "$PKG_BIN_PATH/$binary" "$PKG_DIR/bin"
done

# Ship the node maintenance scripts at the top level of the package and make them executable.
for helper in bootstrap_node_2drives.sh truncate.sh update.sh; do
    cp "$PKG_BIN_PATH/../scripts/$helper" "$PKG_DIR/"
done
for helper in bootstrap_node_2drives.sh truncate.sh update.sh; do
    chmod +x "$PKG_DIR/$helper"
done

# Write one executable wrapper per binary from the matching bootstrap_script_<name> variable.
for binary in server scraper indexer alexandria; do
    wrapper_var="bootstrap_script_$binary"
    echo -e "${!wrapper_var}" > "$PKG_DIR/$binary"
done
for binary in server scraper indexer alexandria; do
    chmod +x "$PKG_DIR/$binary"
done

# Zip from inside the staging directory so that the archive has no extra leading directory,
# then move the archive back to the directory we came from.
archive="$PKG_BIN_FILENAME.zip"
pushd "$PKG_DIR" > /dev/null
zip --symlinks --recurse-paths "$archive" -- *
ORIGIN_DIR=$(dirs -l +1)
mv "$archive" "$ORIGIN_DIR"
popd > /dev/null
rm -r "$PKG_DIR"
echo "Created $ORIGIN_DIR/$archive"


================================================
FILE: scripts/prepare-output-dirs.sh
================================================
#!/bin/bash

# Recreates the directory layout of the eight shards /mnt/0 .. /mnt/7 from scratch.
# WARNING: everything below each shard directory is deleted.

# Quoted so that a script path containing whitespace still works.
cd "$(dirname "$0")"
cd ..

for shard_id in $(seq 0 7); do
	shard="/mnt/$shard_id"
	# -f: a shard that does not exist yet is not an error.
	rm -rf -- "$shard"
	# -p: the shard directory may survive the removal (e.g. when it cannot be deleted).
	mkdir -p "$shard"
	for sub in input output upload hash_table full_text tmp; do
		mkdir -p "$shard/$sub"
	done
done


================================================
FILE: scripts/truncate.sh
================================================
#!/bin/bash

# Empties the eight shards /mnt/0 .. /mnt/7 and recreates their sub directories.
# WARNING: every (non-hidden) entry below each shard directory is deleted.

# Quoted so that a script path containing whitespace still works.
cd "$(dirname "$0")"
cd ..

for shard in $(seq 0 7); do
	# -f: an already empty shard (unmatched glob) is not an error.
	rm -rf -- "/mnt/$shard"/*
	for sub in input output upload hash_table full_text tmp; do
		mkdir "/mnt/$shard/$sub"
	done
done

# Hand everything below /mnt to the alexandria user.
chown -R alexandria /mnt/*


================================================
FILE: scripts/update.sh
================================================
#!/bin/bash

# Downloads the latest release archive next to this script and unpacks it in place,
# overwriting existing files.

# Stop at the first failure: without this a failed `cd` would unpack the release into whatever
# the current directory happens to be, and a failed download would still be handed to unzip.
set -euo pipefail

cd "$(dirname "$0")"

wget https://github.com/alexandria-org/alexandria/releases/latest/download/alexandria.zip -O alexandria.zip
unzip -o alexandria.zip


================================================
FILE: src/URL.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "URL.h"
#include "algorithm/hash.h"
#include "parser/parser.h"
#include <curl/curl.h>
#include "text/text.h"
#include "warc/tlds.h"

using namespace std;

/*
 * Creates an empty URL with status OK.
 * m_has_www is set explicitly: it used to be left uninitialized, so has_www() and copying the
 * object read an indeterminate value.
 */
URL::URL() :
	m_status(::parser::OK), m_has_www(false)
{
}

/*
 * Copy constructor, copies every member of url.
 */
URL::URL(const URL &url) :
	m_url_string(url.m_url_string),
	m_host(url.m_host),
	m_host_reverse(url.m_host_reverse),
	m_scheme(url.m_scheme),
	m_path(url.m_path),
	m_query(url.m_query),
	m_status(url.m_status),
	m_has_www(url.m_has_www)
{
}

/*
 * Creates a URL from its string form and parses it. The parse result is stored in m_status.
 * m_has_www is initialized up front because parse() only assigns it when a host could be
 * extracted; for an unparsable string it used to stay uninitialized.
 */
URL::URL(const string &url) :
	m_url_string(url), m_status(::parser::OK), m_has_www(false)
{
	m_status = parse();
}

/*
 * Creates a URL from a host and a path without running the parser. The url string becomes
 * "http://" + host + path and the host is stored exactly as given (a leading "www." is not
 * stripped), so m_has_www is initialized to false instead of being left indeterminate.
 * NOTE(review): m_scheme stays empty although the url string starts with "http://" - confirm
 * whether scheme() should report "http" for URLs built this way.
 */
URL::URL(const string &host, const string &path) :
	m_url_string("http://" + host + path), m_host(host), m_path(path),
	m_status(::parser::OK), m_has_www(false)
{
	m_host_reverse = URL::host_reverse(m_host);
}

/*
 * Nothing to release, all members clean up after themselves.
 */
URL::~URL() {

}

/*
 * Replaces the url string and parses it again. The parse result is stored in m_status.
 */
void URL::set_url_string(const string &url) {
	m_url_string = url;
	m_status = parse();
}

/*
 * Returns the complete url string.
 */
string URL::str() const {
	return m_url_string;
}

/*
 * Returns host, path and query concatenated without any separator. Note that, unlike
 * hash_input(), no "?" is inserted between path and query.
 */
string URL::key() const {
	/*
	 * We should probably change this to:
	 * return m_host + path_with_query();
	 * but we need to do it later..
	 */
	return m_host + m_path + m_query;
}

/*
 * Returns the string that hash() is calculated from: the host followed by path_with_query().
 */
string URL::hash_input() const {
	return m_host + path_with_query();
}

/*
 * Returns the hash of hash_input().
 */
uint64_t URL::hash() const {
	return ::algorithm::hash(hash_input());
}

/*
 * Returns the hash of the host.
 */
uint64_t URL::host_hash() const {
	return ::algorithm::hash(m_host);
}

/*
 * Returns the hash of this url's host concatenated with the complete url string of target_url.
 * The link_text parameter is currently not part of the hash.
 */
uint64_t URL::link_hash(const URL &target_url, const string &link_text) const {
	return ::algorithm::hash(host() + target_url.str());
}

/*
 * Returns the hash of this url's host concatenated with the host of target_url.
 * The link_text parameter is currently not part of the hash.
 */
uint64_t URL::domain_link_hash(const URL &target_url, const string &link_text) const {
	return ::algorithm::hash(host() + target_url.host());
}

/*
 * Returns true if the key() of this url differs from the key() of the given url.
 */
bool URL::canonically_different(const URL &url) const {
	return key() != url.key();
}

/*
 * Returns true if the scheme is exactly "https".
 */
bool URL::has_https() const {
	return m_scheme == "https";
}

/*
 * Returns the www flag, set by remove_www() during parsing or by set_www().
 */
bool URL::has_www() const {
	return m_has_www;
}

/*
 * Returns the host. For parsed urls a leading "www." has been removed.
 */
string URL::host() const {
	return m_host;
}

/*
 * Returns the registrable part of the host, i.e. the host with all sub domains removed:
 *   "blog.example.com"   -> "example.com"
 *   "shop.example.co.uk" -> "example.co.uk"
 *   "example.com.au"     -> "example.com.au"
 * Rules, as implemented:
 * - hosts ending with ".uk" are only shortened when they end with ".co.uk", every other
 *   ".uk" host is returned unchanged.
 * - for hosts ending with ".au" the last two labels are treated as the suffix.
 * - everything else keeps the last two labels.
 * Hosts that have too few labels for the rule are returned unchanged. The previous version
 * computed "npos - 1" in that situation, which made e.g. "example.au" return "au" and "co.uk"
 * return "uk".
 */
string URL::host_top_domain() const {
	const std::string_view host(m_host);

	// Position of the closest '.' strictly before pos, npos if there is none.
	const auto dot_before = [&host](size_t pos) -> size_t {
		if (pos == 0 || pos == string::npos) return string::npos;
		return host.find_last_of('.', pos - 1);
	};

	size_t pos1 = host.find_last_of('.');
	if (pos1 == string::npos) {
		return m_host;
	}

	const std::string_view tld = host.substr(pos1 + 1);
	if (tld == "uk") {
		pos1 = dot_before(pos1);
		if (pos1 == string::npos || host.substr(pos1 + 1) != "co.uk") {
			return m_host;
		}
	} else if (tld == "au") {
		pos1 = dot_before(pos1);
		if (pos1 == string::npos) {
			return m_host;
		}
	}

	const size_t pos2 = dot_before(pos1);
	if (pos2 == string::npos) {
		return m_host;
	}
	return m_host.substr(pos2 + 1);
}

/*
 * Returns the scheme, for example "https".
 */
string URL::scheme() const {
	return m_scheme;
}

/*
 * Returns the host with its dot separated labels in reverse order, as calculated by the
 * static host_reverse(host).
 */
string URL::host_reverse() const {
	return m_host_reverse;
}

/*
 * Returns the path without the query string.
 */
string URL::path() const {
	return m_path;
}

/*
 * Returns the path followed by "?" and the query string. If the query is empty only the path
 * is returned.
 */
string URL::path_with_query() const {
	if (m_query.empty()) return m_path;

	return m_path + "?" + m_query;
}

/*
 * Returns the query string as a key => value map.
 * The query is split on "&" and every field on "=". Fields without "=" are skipped. The value
 * (the token after the first "=") is url decoded, the key is stored as is.
 */
map<string, string> URL::query() const {
	map<string, string> params;

	vector<string> fields;
	boost::split(fields, m_query, boost::is_any_of("&"));

	for (const string &field : fields) {
		vector<string> key_value;
		boost::split(key_value, field, boost::is_any_of("="));
		if (key_value.size() < 2) continue;

		params[key_value[0]] = parser::urldecode(key_value[1]);
	}

	return params;
}

/*
 * Placeholder, always returns 0.
 */
float URL::harmonic() const {

	return 0.0f;
}

/*
 * Returns host with its dot separated labels in reverse order, "www.example.com" becomes
 * "com.example.www".
 */
string URL::host_reverse(const string &host) {
	vector<string> labels;
	boost::split(labels, host, boost::is_any_of("."));

	const vector<string> reversed(labels.rbegin(), labels.rend());
	return boost::algorithm::join(reversed, ".");
}

/*
 * Returns the last two labels of host in reverse order, "www.example.com" becomes
 * "com.example". Hosts with one or two labels are reversed as they are.
 */
string URL::host_reverse_top_domain(const string &host) {
	/*
	 * This algorithm is OK since we only run on these tlds:
	 * {"se", "com", "nu", "net", "org", "gov", "edu", "info"}
	 * */
	vector<string> labels;
	boost::split(labels, host, boost::is_any_of("."));

	// Drop everything except the last two labels.
	if (labels.size() > 2) {
		labels.erase(labels.begin(), labels.end() - 2);
	}

	std::reverse(labels.begin(), labels.end());
	return boost::algorithm::join(labels, ".");
}

/*
 * Returns the second to last label of the host, "example" for "www.example.com".
 * Returns an empty string if the host consists of a single label.
 */
string URL::domain_without_tld() const {
	vector<string> labels;
	boost::split(labels, m_host, boost::is_any_of("."));

	const size_t count = labels.size();
	if (count < 2) return "";

	return labels[count - 2];
}

/*
 * Returns the length of the url string, converted to uint32_t.
 */
uint32_t URL::size() const {
	return str().size();
}

/*
 * Changes the scheme and rebuilds the url string from its parts.
 */
void URL::set_scheme(const string &scheme) {
	m_scheme = scheme;
	rebuild_url_str();
}

/*
 * Changes the www flag and rebuilds the url string from its parts.
 */
void URL::set_www(bool has_www) {
	m_has_www = has_www;
	rebuild_url_str();
}

/*
 * Copy assignment, copies every member of other and returns *this.
 */
URL &URL::operator=(const URL &other) {
	m_url_string = other.m_url_string;
	m_host = other.m_host;
	m_host_reverse = other.m_host_reverse;
	m_scheme = other.m_scheme;
	m_path = other.m_path;
	m_query = other.m_query;
	m_status = other.m_status;
	m_has_www = other.m_has_www;

	return *this;
}

/*
 * Reads one whitespace delimited token from the stream into the url string and parses it.
 * The parse result is stored in url.m_status.
 */
istream &operator >>(istream &ss, URL &url) {
	ss >> (url.m_url_string);
	url.m_status = url.parse();

	return ss;
}

/*
 * Writes the complete url string to the stream.
 */
ostream &operator <<(ostream& os, const URL& url) {
	return os << url.m_url_string;
}

int URL::parse() {
	CURLU *h = curl_url();
	if (!h) return ::parser::ERROR;

	CURLUcode uc = curl_url_set(h, CURLUPART_URL, m_url_string.c_str(), 0);
	if (uc) {
		curl_url_cleanup(h);
		return ::parser::ERROR;
	}

	char *chost;
	uc = curl_url_get(h, CURLUPART_HOST, &chost, 0);
	if (!uc) {
		m_host = chost;
		remove_www(m_host);
		curl_free(chost);
	}

	char *scheme;
	uc = curl_url_get(h, CURLUPART_SCHEME, &scheme, 0);
	if (!uc) {
		m_scheme = scheme;
		curl_free(scheme);
	}

	char *cpath;
	uc = curl_url_get(h, CURLUPART_PATH, &cpath, 0);
	if (!uc) {
		m_path = cpath;
		curl_free(cpath);
	}

	char *cquery;
	uc = curl_url_get(h, CURLUPART_QUERY, &cquery, 0);
	if (!uc) {
		m_query = cquery;
		curl_free(cquery);
	}

	curl_url_cleanup(h);

	m_host_reverse = URL::host_reverse(m_host);

	return ::parser::OK;
}

/*
 * Rebuilds the url string as scheme + "://" + optional "www." + host + path_with_query().
 */
void URL::rebuild_url_str() {
	m_url_string = m_scheme + "://" + (m_has_www ? "www." : "") + m_host + path_with_query();
}

/*
 * Removes a leading "www." from the given host string (the parameter is named path but
 * receives the host), records in m_has_www whether it was present and trims the result.
 */
inline void URL::remove_www(string &path) {
	m_has_www = (path.compare(0, 4, "www.") == 0);
	if (m_has_www) {
		path.erase(0, 4);
	}
	text::trim(path);
}


================================================
FILE: src/URL.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include "config.h"

#include <iostream>
#include <functional>
#include <map>
#include <boost/algorithm/string/join.hpp>

/*
 * A parsed url.
 *
 * Holds the complete url string together with its parts (host, scheme, path and query).
 * The host is stored without a leading "www."; whether it was present is kept in a flag.
 */
class URL {

public:
	// Empty url.
	URL();
	URL(const URL &url);
	// Parses the given url string.
	explicit URL(const std::string &url);
	// Builds the url "http://" + host + path without running the parser.
	explicit URL(const std::string &host, const std::string &path);
	~URL();

	// Returns host with its dot separated labels in reverse order.
	static std::string host_reverse(const std::string &host);
	// Returns the last two labels of host in reverse order.
	static std::string host_reverse_top_domain(const std::string &host);

	// Replaces the url string and parses it again.
	void set_url_string(const std::string &url);
	// The complete url string.
	std::string str() const;
	// host + path + query, without separators.
	std::string key() const;

	// host + path_with_query(), the input of hash().
	std::string hash_input() const;
	uint64_t hash() const;
	uint64_t host_hash() const;
	// Hash of a link from this url's host to target_url. link_text is not part of the hash.
	uint64_t link_hash(const URL &target_url, const std::string &link_text) const;
	// Hash of a link from this url's host to the host of target_url. link_text is not part of the hash.
	uint64_t domain_link_hash(const URL &target_url, const std::string &link_text) const;
	// True if key() differs between this url and the given one.
	bool canonically_different(const URL &url) const;
	bool has_https() const;
	bool has_www() const;

	std::string host() const;
	// The host without sub domains.
	std::string host_top_domain() const;
	std::string scheme() const;
	std::string path() const;
	// The path followed by "?" and the query, or just the path if there is no query.
	std::string path_with_query() const;
	// The query string as key => value map.
	std::map<std::string, std::string> query() const;
	std::string host_reverse() const;
	// The second to last label of the host.
	std::string domain_without_tld() const;
	// Length of the url string.
	uint32_t size() const;

	// Both setters rebuild the url string from its parts.
	void set_scheme(const std::string &scheme);
	void set_www(bool has_www);

	float harmonic() const;

	// The host hash modulo the number of nodes in the cluster.
	size_t index_on_node() const {
		return host_hash() % config::nodes_in_cluster;
	}

	URL &operator=(const URL &other);
	friend std::istream &operator >>(std::istream &ss, URL &url);
	friend std::ostream &operator <<(std::ostream& os, const URL& url);

private:

	std::string m_url_string;
	std::string m_host;
	std::string m_host_reverse;
	std::string m_scheme;
	std::string m_path;
	std::string m_query;
	// Result of the last parse, ::parser::OK or ::parser::ERROR.
	int m_status;
	// Defaults to false so that no constructor can leave the flag indeterminate. It is only
	// assigned while parsing a url that has a host, or by set_www().
	bool m_has_www = false;

	int parse();
	void rebuild_url_str();
	inline void remove_www(std::string &path);


};


================================================
FILE: src/alexandria.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <iostream>
#include <sstream>
#include <numeric>
#include "logger/logger.h"
#include "downloader/warc_downloader.h"
#include "downloader/merge_downloader.h"
#include "URL.h"
#include "hash_table2/hash_table.h"
#include "hash_table2/hash_table_shard_builder.h"
#include "indexer/index.h"
#include "indexer/index_builder.h"
#include "indexer/value_record.h"
#include "algorithm/hyper_ball.h"
#include "utils/thread_pool.hpp"
#include "file/file.h"
#include "http/server.h"
#include "parser/parser.h"
#include <boost/algorithm/string.hpp>

using namespace std;

/*
 * Prints the command line documentation, read from ../documentation/alexandria.md relative to
 * the current working directory, to stdout.
 */
void help() {
	std::cout << file::cat("../documentation/alexandria.md") << std::endl;
}

/*
 * Entry point of the alexandria command line tool.
 *
 * Starts the logger thread, reads the config file (path taken from the environment variable
 * ALEXANDRIA_CONFIG, /etc/alexandria.conf otherwise) and runs the command given as first
 * argument:
 *
 *   --hash-table-url <url>             print version and data stored for the url
 *   --hash-table-url-hash <hash>       print version and data stored for the url hash
 *   --hash-table-count                 print the number of items in the hash table
 *   --hash-table-count <host>          print the number of urls per domain listed in domains.txt
 *   --hash-table-find-all <host>...    print all urls stored for the given hosts
 *   --hash-table-optimize-shard <id>   optimize one shard of the hash table
 *   --internal-harmonic                experimental harmonic centrality calculation
 *   --url-server                       start a http server that looks up urls
 *
 * Without arguments, or with anything else, the help text is printed. Always returns 0.
 */
int main(int argc, const char **argv) {

	logger::start_logger_thread();
	logger::verbose(true);

	const char *config_path = getenv("ALEXANDRIA_CONFIG");
	config::read_config(config_path != NULL ? config_path : "/etc/alexandria.conf");

	/*
	 * Without arguments arg is empty, matches none of the commands and ends up in the final
	 * else branch that prints the help. That way the logger thread is joined on this path too,
	 * exactly like for an unknown command.
	 */
	const string arg(argc > 1 ? argv[1] : "");

	if (arg == "--hash-table-url" && argc > 2) {
		URL url(argv[2]);
		hash_table2::hash_table ht("all_urls", 1019, 1000000, "/slow_data");

		size_t ver = 0;
		std::string data = ht.find(url.hash(), ver);
		std::cout << ver << std::endl;
		std::cout << data << std::endl;
	} else if (arg == "--hash-table-url-hash" && argc > 2) {
		uint64_t url_hash = std::stoull(argv[2]);
		hash_table2::hash_table ht("all_urls", 1019, 1000000, "/slow_data");

		size_t ver = 0;
		std::string data = ht.find(url_hash, ver);
		std::cout << ver << std::endl;
		std::cout << data << std::endl;
	} else if (arg == "--hash-table-count" && argc == 2) {

		/*
		 * Only matches the command without further arguments. This branch used to match
		 * regardless of argc, which made the per domain variant below unreachable.
		 */
		hash_table2::hash_table ht("all_urls", 1019, 1000000, "/slow_data");

		std::cout << ht.size() << std::endl;

	} else if (arg == "--hash-table-find-all" && argc > 2) {

		hash_table2::hash_table ht("all_urls", 1019, 1000000, "/slow_data");

		// Put given hosts in array with hashes to search for.
		std::vector<uint64_t> search_for;
		for (int i = 2; i < argc; i++) {
			search_for.push_back(URL(string("https://") + argv[i]).host_hash());
		}

		ht.for_each([&search_for](uint64_t key, std::string value) {

			// The url is the first tab separated column of the stored value.
			URL url(value.substr(0, value.find("\t")));

			const auto my_host_hash = url.host_hash();
			for (const auto &host_hash : search_for) {
				if (host_hash == my_host_hash) {
					std::cout << key << "\t" << url.str() << std::endl;
					break;
				}
			}

		});

	} else if (arg == "--hash-table-count" && argc > 2) {

		// Count the urls in the hash table for every domain listed in domains.txt.
		std::string data = file::cat("domains.txt");
		std::vector<std::string> lines;
		boost::split(lines, data, boost::is_any_of("\n"));
		std::map<std::string, uint64_t> domains;
		std::map<uint64_t, size_t> domain_counts;
		std::vector<std::string> domain_list;
		for (const auto &line : lines) {
			if (line == "") continue;
			const std::string reversed = URL::host_reverse(line);
			std::cout << reversed << std::endl;
			const uint64_t domain_hash = URL(string("https://") + reversed).host_hash();
			domains[reversed] = domain_hash;
			domain_counts[domain_hash] = 0;
			domain_list.push_back(reversed);
		}

		hash_table2::hash_table ht("all_urls", 1019, 1000000, "/slow_data");

		// Only used by the debug output that is commented out below.
		uint64_t thelazy_host_hash = URL(string("https://") + argv[2]).host_hash();

		ht.for_each([thelazy_host_hash, &domain_counts](uint64_t key, std::string value) {

			URL url(value.substr(0, value.find("\t")));

			// Only hosts that were registered above are counted.
			const auto count_iter = domain_counts.find(url.host_hash());
			if (count_iter != domain_counts.end()) {
				count_iter->second++;
			}

			/*if (url.host_hash() == thelazy_host_hash) {
				std::cout << key << " => " << url.str() << std::endl;
			}*/

		});

		for (auto &domain : domain_list) {
			std::cout << domain << "\t" << domain_counts[domains[domain]] << std::endl;
		}

	} else if (arg == "--hash-table-optimize-shard" && argc > 2) {
		size_t shard_id = std::stoull(argv[2]);
		hash_table2::hash_table_shard_builder ht_shard("all_urls", shard_id, 1000000, "/slow_data");

		ht_shard.optimize();

	} else if (arg == "--internal-harmonic") {
		profiler::instance prof_total("total");
		/*

		std::vector<std::string> all_files;
		file::read_directory("/mnt/0/full_text/internal_links", [&all_files](const std::string &filename) {
			all_files.push_back(filename);
		});

		size_t done_with = 0;
		profiler::instance prof("total");
		for (const auto &filename : all_files) {

			// Read the file.
			std::ifstream infile("/mnt/0/full_text/internal_links/" + filename, std::ios::binary);
			std::string infile_data(std::istreambuf_iterator<char>(infile), {});
			infile.close();
			std::istringstream reader(infile_data);
			indexer::index<indexer::value_record> idx(&reader, 1000);

			// Create vertices vector
			std::vector<uint64_t> vertices;
			std::map<uint64_t, uint64_t> vertex_map;

			size_t record_id = 0;
			for (const auto &record : idx.records()) {
				vertices.push_back(record.m_value);
				vertex_map[record.m_value] = record_id;
				record_id++;
			}

			std::vector<roaring::Roaring> edge_map(vertices.size());

			// Populate edge map
			idx.for_each([&edge_map, &vertex_map, &vertices, &record_id](uint64_t key, roaring::Roaring &bitmap) {
					if (vertex_map.count(key) == 0) {
						vertices.push_back(key);
						edge_map.push_back(roaring::Roaring());
						vertex_map[key] = record_id;
						record_id++;
					}
					edge_map[vertex_map[key]] = std::move(bitmap);
			});


			// Calculate harmonic centrality on graph.
			if (vertices.size() > 500) {
				auto harmonic = algorithm::hyper_ball(vertices.size(), edge_map.data());
			}

			// Sort the results a bit.
			std::vector<size_t> sorted(harmonic.size());
			std::iota(sorted.begin(), sorted.end(), 0);
			std::sort(sorted.begin(), sorted.end(), [&harmonic] (const auto &a, const auto &b) {
				return harmonic[a] > harmonic[b];
			});

			done_with++;
			float percent = ((float)done_with / all_files.size()) * 100.0f;
			float elapsed_milliseconds = prof.get();
			size_t items_left = all_files.size() - done_with;
			float milliseconds_per_file = elapsed_milliseconds/done_with;
			float milliseconds_left = milliseconds_per_file * items_left;
			float hours_left = milliseconds_left / (1000.0f * 3600.0f);
			std::cout << "done with " << done_with << " out of " << all_files.size() << " (" <<
				percent << "% done) time left: " << hours_left << " hours"<< std::endl;
		}

		return 0;*/

		// load the file
		std::string content = file::cat("multiple_domains.tsv");
		std::vector<std::string> lines;
		boost::split(lines, content, boost::is_any_of("\n"));

		// Keep the rows whose url (second column) is on the host we are looking at. The host
		// hash to compare with is the same for every row, so it is calculated once.
		const uint64_t wanted_host_hash = URL("http://abc13.com").host_hash();
		std::vector<std::vector<std::string>> csv_data;
		for (const auto &line : lines) {
			std::vector<std::string> cols;
			boost::split(cols, line, boost::is_any_of("\t"));
			if (cols.size() > 1) {
				if (URL(cols[1]).host_hash() == wanted_host_hash) {
					csv_data.push_back(cols);
				}
			}
		}

		profiler::instance prof_load("load");
		//std::ifstream infile("/mnt/5/full_text/internal_links/3492248666075096845.data", std::ios::binary);
		std::ifstream infile("/mnt/6/full_text/internal_links/12854855988816217414.data", std::ios::binary);
		std::string infile_data(std::istreambuf_iterator<char>(infile), {});
		infile.close();
		std::istringstream reader(infile_data);
		indexer::index<indexer::value_record> idx(&reader, 1000);
		prof_load.stop();

		profiler::instance prof("make vertices");

		// vertices[i] is the value of vertex i, vertex_map is the inverse lookup.
		std::vector<uint64_t> vertices;
		std::map<uint64_t, uint64_t> vertex_map;

		size_t record_id = 0;
		for (const auto &record : idx.records()) {
			vertices.push_back(record.m_value);
			vertex_map[record.m_value] = record_id;
			record_id++;
		}

		std::vector<roaring::Roaring> edge_map(vertices.size());

		// Keys that are not among the records get a new vertex appended.
		idx.for_each([&edge_map, &vertex_map, &vertices, &record_id](uint64_t key, roaring::Roaring &bitmap) {
				if (vertex_map.count(key) == 0) {
					vertices.push_back(key);
					edge_map.push_back(roaring::Roaring());
					vertex_map[key] = record_id;
					record_id++;
				}
				edge_map[vertex_map[key]] = std::move(bitmap);
		});

		prof.stop();
		profiler::instance prof2("run hyper_ball");

		auto harmonic = algorithm::hyper_ball(vertices.size(), edge_map.data());

		prof2.stop();

		prof_total.stop();

		std::vector<size_t> sorted(harmonic.size());
		std::iota(sorted.begin(), sorted.end(), 0);
		std::sort(sorted.begin(), sorted.end(), [&harmonic] (const auto &a, const auto &b) {
			return harmonic[a] > harmonic[b];
		});
		// Harmonic centrality divided by the number of vertices, keyed by vertex value.
		std::map<uint64_t, double> harmonic_by_url;
		for (size_t i = 0; i < harmonic.size(); i++) {
			harmonic_by_url[vertices[sorted[i]]] = harmonic[sorted[i]] / vertices.size();
		}

		for (const auto &row : csv_data) {
			uint64_t url_hash = stoull(row[0]);
			// Urls that are not in the graph are printed with 0.
			double url_harmonic = harmonic_by_url[url_hash];
			std::cout << row[0] << "\t" << row[1] << "\t" << url_harmonic << std::endl;
		}

		/*
		profiler::instance prof_load("load");
		//std::ifstream infile("/mnt/5/full_text/internal_links/3492263685688109621.data", std::ios::binary);
		//std::ifstream infile("/mnt/5/full_text/internal_links/3492528524383210893.data", std::ios::binary);
		//std::ifstream infile("/mnt/0/full_text/internal_links/7131549202223940368.data", std::ios::binary);
		std::ifstream infile("/mnt/0/full_text/internal_links/10401139885298228528.data", std::ios::binary);
		std::string infile_data(std::istreambuf_iterator<char>(infile), {});
		infile.close();
		std::istringstream reader(infile_data);
		indexer::index<indexer::value_record> idx(&reader, 1000);
		prof_load.stop();

		profiler::instance prof("make vertices");

		std::vector<uint64_t> vertices;
		std::map<uint64_t, uint64_t> vertex_map;

		size_t record_id = 0;
		for (const auto &record : idx.records()) {
			vertices.push_back(record.m_value);
			vertex_map[record.m_value] = record_id;
			record_id++;
		}

		std::vector<roaring::Roaring> edge_map(vertices.size());

		idx.for_each([&edge_map, &vertex_map, &vertices, &record_id](uint64_t key, roaring::Roaring &bitmap) {
				if (vertex_map.count(key) == 0) {
					vertices.push_back(key);
					edge_map.push_back(roaring::Roaring());
					vertex_map[key] = record_id;
					record_id++;
				}
				edge_map[vertex_map[key]] = std::move(bitmap);
		});

		prof.stop();
		profiler::instance prof2("run hyper_ball");

		auto harmonic = algorithm::hyper_ball(vertices.size(), edge_map.data());

		prof2.stop();

		prof_total.stop();

		std::vector<size_t> sorted(harmonic.size());
		std::iota(sorted.begin(), sorted.end(), 0);
		std::sort(sorted.begin(), sorted.end(), [&harmonic] (const auto &a, const auto &b) {
			return harmonic[a] > harmonic[b];
		});

		//for (size_t i = 0; i < harmonic.size(); i++) {
			//std::cout << "vertex: " << vertices[sorted[i]] << " has harmonic: " << harmonic[sorted[i]] << std::endl;
		//}
		*/
	} else if (arg == "--url-server") {
		// Spin up a simple url server.

		hash_table2::hash_table ht("all_urls", 1019, 1000000, "/slow_data");

		http::server url_server([&ht](auto request) {
			http::response res;

			// The url to look up is passed in the query parameter "url".
			URL url = request.url();
			auto query = url.query();
			URL find_url(parser::urldecode(query["url"]));

			size_t ver = 0;
			const auto find_str = ht.find(find_url.hash(), ver);

			if (find_str == "") {
				res.code(404);
				res.body("Not found 404");
			} else {
				res.code(200);
				res.body(find_str);
			}

			return res;
		});
	} else {
		help();
	}

	logger::join_logger_thread();

	return 0;
}


================================================
FILE: src/algorithm/algorithm.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "algorithm.h"
#include "profiler/profiler.h"
#include <iostream>
#include <set>
#include <numeric>
#include <map>
#include <math.h>
#include <cassert>
#include <future>
#include <cstring>

namespace algorithm {

	/*
		Returns partitions with indices that are smaller than the values in the dims vector.
		For example:
		dims = {2,2} gives {0,0}, {1,0}, {0,1}, {1,1}
		dims = {2,3} gives {0,0}, {1,0}, {0,1}, {1,1}, {0,2}, {1,2}

		The partitions are discovered breadth first starting from the all-zero vector. Discovery stops as soon as
		`limit` unique partitions have been collected. The limit is only checked after a new partition has been
		added, so the all-zero vector plus one neighbour can be returned even when limit <= 1.

		The result is ordered by (sum of indices, largest index, reverse lexicographic order).
	*/
	std::vector<std::vector<int>> incremental_partitions(const std::vector<int> &dims, size_t limit) {
		std::vector<std::vector<int>> queue;
		std::set<std::vector<int>> seen;

		const std::vector<int> origin(dims.size(), 0);
		queue.push_back(origin);
		seen.insert(origin);

		for (size_t j = 0; j < queue.size(); j++) {
			// Work on a copy, the queue may reallocate while we append to it below.
			const std::vector<int> current = queue[j];
			for (size_t i = 0; i < current.size(); i++) {
				if (current[i] < dims[i] - 1) {
					std::vector<int> next(current);
					next[i]++;

					/*
						Only enqueue partitions we have not seen before. The same partition is reachable through
						many different paths and re-queueing it every time makes the queue grow with the number of
						paths instead of the number of partitions.
					*/
					if (seen.insert(next).second) {
						queue.push_back(next);
						if (seen.size() >= limit) break;
					}
				}
			}
			if (seen.size() >= limit) break;
		}

		std::vector<std::vector<int>> ret(seen.begin(), seen.end());
		std::sort(ret.begin(), ret.end(), [](const std::vector<int> &a, const std::vector<int> &b) {
			const int sum1 = std::accumulate(a.begin(), a.end(), 0);
			const int sum2 = std::accumulate(b.begin(), b.end(), 0);
			if (sum1 != sum2) return sum1 < sum2;

			const int max1 = *std::max_element(a.begin(), a.end());
			const int max2 = *std::max_element(b.begin(), b.end());
			if (max1 != max2) return max1 < max2;

			// Same sum and same maximum: lexicographically larger vector first.
			return b < a;
		});
		return ret;
	}

	/*
		Calculates the harmonic centrality for vertices and edges. The returning vector has the harmonic centrality for vertex i at position i.
		The depth parameter is the maximum level to traverse in the neighbour tree.
		The edges set contains pairs of edges (from vertex, to vertex)
	*/

	/*
	 * This is the inner loop for calculating harmonic centrality. It handles the vertices start, start + 1, ...,
	 * start + len - 1 and returns one value per handled vertex, in that order.
	 *
	 * vlen is the total number of vertices, edge_map must point to vlen vectors and every id stored in them must be
	 * smaller than vlen. For each vertex a breadth first search over edge_map is run for at most `depth` levels and
	 * every vertex first reached on level k adds 1/k to the result.
	 * */
	std::vector<double> harmonic_centrality_subvector(size_t vlen, const std::vector<uint32_t> *edge_map,
			size_t depth, size_t start, size_t len) {

		// The scratch buffers are owned by vectors so they are released even if an allocation below throws.
		std::vector<char> visited(vlen, 0);
		std::vector<uint32_t> frontier_a(vlen);
		std::vector<uint32_t> frontier_b(vlen);

		uint32_t *levels[2] = {frontier_a.data(), frontier_b.data()};
		size_t level_len[2] = {0, 0};

		std::vector<double> harmonics;
		harmonics.reserve(len);

		profiler::instance prof("Timetaker");
		for (size_t i = start; i < start + len; i++) {
			const uint32_t vertex = i;

			level_len[0] = 0;
			level_len[1] = 0;
			memset(visited.data(), 0, vlen);

			levels[0][0] = vertex;
			level_len[0]++;
			visited[vertex] = 1;

			double harmonic = 0.0;
			/*
				If we can assume the average number of incoming edges per vertex to be constant these loops should be O(1) in n.
				Example, if we have n = 10 000 000 vertices and 10 inbound edges on each vertex these loops should be
				(first loop is depth) X (worst case second loop is 10^depth) X (inner loop is 10)
				depth * 10^depth * 10
				independent of n
			*/
			size_t last_level = 0;
			size_t cur_level = 1;
			for (size_t level = 1; level <= depth; level++) {
				for (size_t j = 0; j < level_len[last_level]; j++) {
					const uint32_t v = levels[last_level][j];
					for (const uint32_t &edge : edge_map[v]) {
						if (!visited[edge]) {
							levels[cur_level][level_len[cur_level]++] = edge;
							visited[edge] = 1;
						}
					}
				}
				// Nothing new was reached on this level, deeper levels will be empty too.
				if (level_len[cur_level] == 0) break;
				harmonic += (double)level_len[cur_level] / level;
				// Swap levels, the frontier we just built is expanded in the next round.
				level_len[last_level] = 0;
				const size_t tmp = last_level;
				last_level = cur_level;
				cur_level = tmp;
			}

			harmonics.push_back(harmonic);
		}

		return harmonics;
	}

	/*
		Calculates the harmonic centrality from a set of edges (from vertex, to vertex). Every vertex id in edges must
		be smaller than vlen. The returned vector has the harmonic centrality for vertex i at position i.
	*/
	std::vector<double> harmonic_centrality(size_t vlen, const std::set<std::pair<uint32_t, uint32_t>> &edges, size_t depth) {
		// Owned by a vector so the edge map is released even if building it or the calculation throws.
		std::vector<std::vector<uint32_t>> edge_map(vlen);
		for (const auto &edge : edges) {
			/*
			second -> first mapping because we want to traverse the edges in the opposite direction of the edge. Incoming edges should increase
			harmonic centrality of vertex.
			*/
			edge_map[edge.second].push_back(edge.first);
		}

		return harmonic_centrality(vlen, edge_map.data(), depth);
	}

	/*
		Single threaded harmonic centrality on a prepared edge map: handles all vlen vertices in one call to
		harmonic_centrality_subvector.
	*/
	std::vector<double> harmonic_centrality(size_t vlen, const std::vector<uint32_t> *edge_map, size_t depth) {
		const size_t first_vertex = 0;
		const size_t vertex_count = vlen;
		return harmonic_centrality_subvector(vlen, edge_map, depth, first_vertex, vertex_count);
	}

	/*
		Threaded variant of harmonic_centrality for a set of edges (from vertex, to vertex). Every vertex id in edges
		must be smaller than vlen. The returned vector has the harmonic centrality for vertex i at position i.
	*/
	std::vector<double> harmonic_centrality_threaded(size_t vlen, const std::set<std::pair<uint32_t, uint32_t>> &edges, size_t depth,
			size_t num_threads) {

		// Owned by a vector so the edge map is released even if building it or the calculation throws.
		std::vector<std::vector<uint32_t>> edge_map(vlen);
		for (const auto &edge : edges) {
			/*
			second -> first mapping because we want to traverse the edges in the opposite direction of the edge. Incoming edges should increase
			harmonic centrality of vertex.
			*/
			edge_map[edge.second].push_back(edge.first);
		}

		return harmonic_centrality_threaded(vlen, edge_map.data(), depth, num_threads);
	}

	/*
		Threaded harmonic centrality on a prepared edge map. The vertices are split into consecutive ranges of equal
		size (the last one may be shorter), each range is handled by harmonic_centrality_subvector in its own
		std::async task and the partial results are concatenated in vertex order.

		num_threads is clamped to [1, vlen], so zero threads or more threads than vertices are both fine.
	*/
	std::vector<double> harmonic_centrality_threaded(size_t vlen, const std::vector<uint32_t> *edge_map, size_t depth, size_t num_threads) {

		if (vlen == 0) return {};

		const size_t workers = std::max<size_t>(1, std::min(num_threads, vlen));

		std::vector<std::future<std::vector<double>>> threads;
		threads.reserve(workers);

		// Split the vertices into several ranges. Integer ceil division, max_len is at least 1 here.
		const size_t max_len = (vlen + workers - 1) / workers;
		for (size_t i = 0; i < vlen; i += max_len) {
			const size_t len = std::min(max_len, vlen - i);
			threads.emplace_back(std::async(std::launch::async, harmonic_centrality_subvector, vlen, edge_map, depth, i, len));
		}

		std::vector<double> harmonic;
		harmonic.reserve(vlen);
		for (auto &thread : threads) {
			const std::vector<double> part = thread.get();
			harmonic.insert(harmonic.end(), part.begin(), part.end());
		}

		return harmonic;
	}

	/*
		Builds an edge map with n entries from a set of edges (from vertex, to vertex). Entry v holds the vertices that
		have an edge pointing to v. The array is allocated with new[] and must be released by the caller with delete [].
		Every vertex id in edges must be smaller than n.
	*/
	std::vector<uint32_t> *set_to_edge_map(size_t n, const std::set<std::pair<uint32_t, uint32_t>> &edges) {
		std::vector<uint32_t> *incoming = new std::vector<uint32_t>[n];
		for (auto it = edges.begin(); it != edges.end(); ++it) {
			const uint32_t from = it->first;
			const uint32_t to = it->second;
			// Stored as to -> from, the edges are traversed against their direction so that incoming edges
			// increase the harmonic centrality of a vertex.
			incoming[to].push_back(from);
		}

		return incoming;
	}

}


================================================
FILE: src/algorithm/algorithm.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include <vector>
#include <set>
#include <unordered_map>
#include <cstdint>

namespace algorithm {

	/*
		Splits vec into consecutive chunks of chunk_size items and appends them to dest. The last chunk holds the
		remaining items and may be shorter. With chunk_size == 0 all items end up in one single chunk. Nothing is
		appended for an empty vec.
	*/
	template<class T>
	void vector_chunk(const std::vector<T> &vec, size_t chunk_size, std::vector<std::vector<T>> &dest) {
		const size_t total = vec.size();
		const size_t step = (chunk_size == 0) ? total : chunk_size;

		size_t offset = 0;
		while (offset < total) {
			const size_t remaining = total - offset;
			const size_t count = (remaining < step) ? remaining : step;
			// Build the chunk in place from the source range, each item is copied exactly once.
			dest.emplace_back(vec.begin() + offset, vec.begin() + offset + count);
			offset += count;
		}
	}

	/*
		Returns index vectors whose components are smaller than the corresponding values in dims. At most roughly
		`limit` vectors are collected, the result is ordered by the sum of the indices.
	*/
	std::vector<std::vector<int>> incremental_partitions(const std::vector<int> &dims, size_t limit);

	/*
		Calculates the harmonic centrality for vertices and edges. The returning vector has the harmonic centrality for vertex i at position i.
		The depth parameter is the maximum level to traverse in the neighbour tree.
		The edges set contains pairs of edges (from vertex, to vertex)

		vlen is the number of vertices. The overloads taking edge_map expect a pointer to vlen vectors where
		edge_map[v] holds the vertices with an edge pointing to v (see set_to_edge_map).
		The _threaded variants split the vertices over num_threads asynchronous tasks.
	*/
	std::vector<double> harmonic_centrality(size_t vlen, const std::set<std::pair<uint32_t, uint32_t>> &edges, size_t depth);
	std::vector<double> harmonic_centrality(size_t vlen, const std::vector<uint32_t> *edge_map, size_t depth);
	std::vector<double> harmonic_centrality_threaded(size_t vlen, const std::set<std::pair<uint32_t, uint32_t>> &edges, size_t depth,
			size_t num_threads);
	std::vector<double> harmonic_centrality_threaded(size_t vlen, const std::vector<uint32_t> *edge_map,
			size_t depth, size_t num_threads);

	/*
		Builds the edge map used above from a set of edges (from vertex, to vertex). The returned array has n entries,
		is allocated with new[] and has to be released by the caller with delete [].
	*/
	std::vector<uint32_t> *set_to_edge_map(size_t n, const std::set<std::pair<uint32_t, uint32_t>> &edges);
}


================================================
FILE: src/algorithm/bloom_filter.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "bloom_filter.h"
#include "algorithm/hash.h"
#include <cmath>
#include <cstring>
#include <fstream>

namespace algorithm {

	/*
		Creates an empty filter with the default dimension.
	*/
	bloom_filter::bloom_filter()
	{
		// make_unique<T[]>(n) value-initializes the array, every word already starts out as zero. A second
		// zeroing pass over the whole bitmap is not needed.
		// The allocation has to stay in the body: m_bitmap is declared before m_dim and would be initialized first.
		m_bitmap = std::make_unique<uint64_t[]>(m_dim);
	}

	/*
		Creates an empty filter with dim 64 bit words (dim * 64 bits).
		Dim should be a prime number..
	*/
	bloom_filter::bloom_filter(size_t dim)
	: m_dim(dim), m_bitlen(dim * 64)
	{
		// make_unique<T[]>(n) value-initializes the array, every word already starts out as zero. A second
		// zeroing pass over the whole bitmap is not needed.
		m_bitmap = std::make_unique<uint64_t[]>(m_dim);
	}

	void bloom_filter::insert(const std::string &item) {
		for (size_t i = 0; i < m_seeds.size(); i++) {
			const uint64_t hash = algorithm::hash_with_seed(item, m_seeds[i]);
			set_bit(hash);
		}
	}

	void bloom_filter::insert(uint64_t item) {
		insert(std::to_string(item));
	}

	/*
		Inserts many numbers at once. All hashes are calculated before m_mutex is taken, the lock is only held while
		the bits are set.
	*/
	void bloom_filter::insert_many(std::vector<uint64_t> &items) {

		std::vector<size_t> hashes;
		// The final size is known up front, avoid growing the vector step by step.
		hashes.reserve(items.size() * m_seeds.size());
		for (const auto &item : items) {
			const auto str_item = std::to_string(item);
			for (const uint64_t seed : m_seeds) {
				hashes.push_back(algorithm::hash_with_seed(str_item, seed));
			}
		}

		std::lock_guard guard(m_mutex);
		for (const auto &hash : hashes) {
			set_bit(hash);
		}
	}

	/*
		Read only byte view of the bitmap, size() bytes long.
	*/
	const char * bloom_filter::data() const {
		const uint64_t *words = m_bitmap.get();
		return reinterpret_cast<const char *>(words);
	}

	bool bloom_filter::exists(const std::string &item) const {
		for (size_t i = 0; i < m_seeds.size(); i++) {
			const uint64_t hash = algorithm::hash_with_seed(item, m_seeds[i]);
			if (!get_bit(hash)) return false;
		}
		return true;
	}

	bool bloom_filter::exists(uint64_t data) const {
		return exists(std::to_string(data));
	}

	void bloom_filter::read(char *data, size_t len) {
		memcpy((char *)m_bitmap.get(), data, len);
	}

	/*
		ORs the bitmap of other into this filter. Reads m_dim words from other, so other must have at least the same
		dimension as this filter.
	*/
	void bloom_filter::merge(const bloom_filter &other) {
		uint64_t *mine = m_bitmap.get();
		const uint64_t *theirs = other.m_bitmap.get();
		for (size_t word = 0; word != m_dim; ++word) {
			mine[word] = mine[word] | theirs[word];
		}
	}

	/*
		Placeholder, no bits are counted. Always returns 1.0.
	*/
	double bloom_filter::saturation() {
		const double fixed_value = 1.0;
		return fixed_value;
	}

	/*
		Reads up to size() bytes from file_name into the bitmap. The stream state is not checked.
	*/
	void bloom_filter::read_file(const std::string &file_name) {
		std::ifstream infile(file_name, std::ios::binary);
		char *raw = reinterpret_cast<char *>(m_bitmap.get());
		infile.read(raw, size());
	}

	/*
		Writes the size() bytes of the bitmap to file_name, an existing file is truncated. The stream state is not checked.
	*/
	void bloom_filter::write_file(const std::string &file_name) const {
		std::ofstream outfile(file_name, std::ios::binary | std::ios::trunc);
		const char *raw = reinterpret_cast<const char *>(m_bitmap.get());
		outfile.write(raw, size());
	}

	/*
		Sets bit number (bit mod m_bitlen) in the bitmap.
	*/
	void bloom_filter::set_bit(size_t bit) {
		const size_t wrapped = bit % m_bitlen;
		const uint64_t mask = 0x1ull << (wrapped % 64);
		m_bitmap[wrapped / 64] |= mask;
	}

	/*
		Returns true if bit number (bit mod m_bitlen) is set in the bitmap.
	*/
	bool bloom_filter::get_bit(size_t bit) const {
		const size_t wrapped = bit % m_bitlen;
		const uint64_t word = m_bitmap[wrapped / 64];
		return ((word >> (wrapped % 64)) & 0x1ull) != 0;
	}

}


================================================
FILE: src/algorithm/bloom_filter.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include <iostream>
#include <memory>
#include <mutex>
#include "roaring/roaring64map.hh"

namespace algorithm {

	/*
	 * Bloom filter backed by an array of m_dim 64 bit words. Each item sets one bit per seed in m_seeds, the bit
	 * position is the seeded hash of the item modulo m_bitlen.
	 * */
	class bloom_filter {
		public:
			// Creates an empty filter with the default dimension.
			bloom_filter();
			// Creates an empty filter with dim 64 bit words.
			bloom_filter(size_t dim);

			void insert(const std::string &item);
			// Numbers are inserted and looked up by their decimal string representation.
			void insert(uint64_t item);
			// Inserts many numbers, the only operation that takes m_mutex.
			void insert_many(std::vector<uint64_t> &items);
			bool exists(const std::string &item) const;
			bool exists(uint64_t data) const;
			// Size of the bitmap in bytes.
			size_t size() const { return m_dim * sizeof(uint64_t); }
			// Raw bytes of the bitmap, size() bytes long.
			const char *data() const;
			// Copies len bytes from data into the bitmap.
			void read(char *data, size_t len);
			// ORs the bitmap of other into this filter.
			void merge(const bloom_filter &other);
			// Currently a stub that always returns 1.0.
			double saturation();

			void read_file(const std::string &file_name);
			void write_file(const std::string &file_name) const;

		private:

			// NOTE: declared before m_dim, so it is initialized first. The constructors allocate it in their body.
			std::unique_ptr<uint64_t[]> m_bitmap;

			// Number of 64 bit words in the bitmap. The production value needs roughly 32 GB (4043696581 * 8 bytes).
			#ifdef IS_TEST
			size_t m_dim = 2695797;
			#else
			size_t m_dim = 4043696581;
			#endif

			// Number of bits in the bitmap.
			size_t m_bitlen = m_dim * 64;

			// some random prime numbers
			// NOTE(review): 2695879891 is listed twice, so two of the ten hash functions are identical. Changing a
			// seed changes the bit positions and would invalidate filters that were already written to disk.
			std::array<uint64_t, 10> m_seeds = {3339675911, 2695798769, 2695831867, 2695857877, 2695879891, 2695879891, 2695922687, 2695935521,
					3339689791, 3339703163};

			// Taken by insert_many while it sets bits.
			std::mutex m_mutex;

			void set_bit(size_t bit);
			bool get_bit(size_t bit) const;

	};

}


================================================
FILE: src/algorithm/hash.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <cstdint>
#include <cstring>

#include "hash.h"

namespace algorithm {

	/*
	 * Murmur hash by Austin Appleby
	 * Taken from here https://sites.google.com/site/murmurhash/
	 *
	 * Hashes len bytes starting at key with the given seed. The input is consumed in blocks of 8 bytes in native
	 * byte order, the remaining 0-7 bytes are mixed in at the end.
	 * */
	size_t murmur_hash(const char *key, size_t len, size_t seed) {
		const uint64_t m = 0xc6a4a7935bd1e995ull;
		const int r = 47;

		uint64_t h = seed ^ (len * m);

		const size_t num_blocks = len / 8;
		for (size_t block = 0; block < num_blocks; block++) {
			// memcpy instead of reading through a uint64_t pointer: key has no alignment guarantee and a char
			// buffer must not be accessed as uint64_t. The loaded value is the same.
			uint64_t k;
			std::memcpy(&k, key + block * 8, sizeof(k));

			k *= m;
			k ^= k >> r;
			k *= m;

			h ^= k;
			h *= m;
		}

		const unsigned char *tail = reinterpret_cast<const unsigned char *>(key + num_blocks * 8);

		// Each case mixes in one byte and falls through to the cases below it.
		switch (len & 7) {
			case 7: h ^= uint64_t(tail[6]) << 48; [[fallthrough]];
			case 6: h ^= uint64_t(tail[5]) << 40; [[fallthrough]];
			case 5: h ^= uint64_t(tail[4]) << 32; [[fallthrough]];
			case 4: h ^= uint64_t(tail[3]) << 24; [[fallthrough]];
			case 3: h ^= uint64_t(tail[2]) << 16; [[fallthrough]];
			case 2: h ^= uint64_t(tail[1]) << 8; [[fallthrough]];
			case 1: h ^= uint64_t(tail[0]);
				h *= m;
		}

		h ^= h >> r;
		h *= m;
		h ^= h >> r;

		return h;
	}

	/*
	 * Murmur hash of str with a fixed seed.
	 * */
	size_t hash(const std::string &str) {
		constexpr size_t fixed_seed = 0xc70f6907ul;
		const char *bytes = str.c_str();
		const size_t length = str.size();
		return murmur_hash(bytes, length, fixed_seed);
	}

	/*
	 * Murmur hash of str with a seed chosen by the caller.
	 * */
	size_t hash_with_seed(const std::string &str, size_t seed) {
		const char *bytes = str.c_str();
		const size_t length = str.size();
		return murmur_hash(bytes, length, seed);
	}


}


================================================
FILE: src/algorithm/hash.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include <string>

namespace algorithm {

	/*
	 * Murmur hash of str with a fixed seed.
	 * */
	size_t hash(const std::string &str);

	/*
	 * Murmur hash of str with the given seed.
	 * */
	size_t hash_with_seed(const std::string &str, size_t seed);

}


================================================
FILE: src/algorithm/hyper_ball.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include <vector>
#include <cstdint>
#include "hyper_log_log.h"
#include "profiler/profiler.h"
#include "logger/logger.h"
#include <future>

namespace algorithm {

	template <typename edge_map_type>
	bool hyper_ball_worker(double t, size_t v_begin, size_t v_end, const edge_map_type &edge_map,
			std::vector<hyper_log_log> &c, std::vector<hyper_log_log> &a, std::vector<double> &harmonic) {

		bool counter_changed = false;
		for (uint32_t v = v_begin; v < v_end; v++) {
			a[v] = c[v];
			for (const uint32_t &w : edge_map[v]) {
				a[v] += c[w];
			}

			// a[v] is t + 1 and c[v] is at t
			const size_t counter_diff = a[v].count() - c[v].count();
			if (counter_diff) {
				counter_changed = true;
				harmonic[v] += (1.0 / (t + 1.0)) * counter_diff;
			}
		}
		for (uint32_t v = v_begin; v < v_end; v++) {
			c[v] = a[v];
		}
		return counter_changed;
	}

	/*
	 * n is the number of vertices in graph.
	 * edge_map is pointing to a static array of size n.
	 * each item in edge_map is a vector of variable size.
	 * each vector edge_map[m] contains values between 0 and n-1 indicating edge between m and edge_map[m].
	 * NOTE direction of edge in edge map has to be EDGE_FROM -> EDGE_TO.
	 * so for vertex m, n = edge_map[m] indicates directed edge from n to m
	 * */
	template <typename edge_map_type>
	std::vector<double> hyper_ball(uint32_t n, const edge_map_type &edge_map) {

		if (n == 0) return {};

		const size_t num_threads = std::min(32, (int)n);
		const size_t items_per_thread = n / num_threads;
		std::vector<hyper_log_log> c(n, hyper_log_log(10));
		std::vector<hyper_log_log> a(n, hyper_log_log(10));
		std::vector<double> harmonic(n, 0.0);

		for (uint32_t v = 0; v < n; v++) {
			c[v].insert(v);
		}

		double t = 0.0;
		while (true) {
			std::vector<std::future<bool>> threads;
			for (size_t i = 0; i < num_threads; i++) {
				const size_t v_begin = i * items_per_thread;
				const size_t v_end = (i == num_threads - 1) ? n : (i + 1) * items_per_thread;
				auto fut = std::async(hyper_ball_worker<edge_map_type>, t, v_begin, v_end, std::cref(edge_map), std::ref(c), std::ref(a), std::ref(harmonic));
				threads.emplace_back(std::move(fut));
			}

			bool should_continue = false;
			for (auto &fut : threads) {
				should_continue = fut.get() || should_continue;
			}

			t += 1.0;
			if (!should_continue) break;
		}

		return harmonic;
	}

}


================================================
FILE: src/algorithm/hyper_log_log.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <numeric>
#include "hyper_log_log.h"
#include "algorithm/hash.h"

namespace algorithm {

	/*
	 * Creates an empty counter with 2^b registers, all set to zero.
	 * */
	hyper_log_log::hyper_log_log(size_t b)
	: m_b(b), m_len(1ull << m_b), m_alpha(0.7213/(1.0 + 1.079/m_len)) {
		m_M.assign(m_len, 0);
	}

	/*
	 * Creates a counter with 2^b registers copied from registers. registers must point to at least 2^b bytes.
	 * */
	hyper_log_log::hyper_log_log(const char *registers, size_t b)
	: m_b(b), m_len(1ull << m_b), m_alpha(0.7213/(1.0 + 1.079/m_len)) {
		m_M.assign(registers, registers + m_len);
	}

	/*
	 * Copy constructor, takes over the parameters of other and copies its registers.
	 * */
	hyper_log_log::hyper_log_log(const hyper_log_log &other)
	: m_b(other.m_b), m_len(other.m_len), m_alpha(other.m_alpha) {
		m_M.assign(m_len, 0);
		std::copy(std::begin(other.m_M), std::end(other.m_M), std::begin(m_M));
	}

	/*
	 * Move constructor, takes the register vector from other. other is left with an empty register vector.
	 * */
	hyper_log_log::hyper_log_log(hyper_log_log &&other)
	: m_b(other.m_b), m_len(other.m_len), m_alpha(other.m_alpha) {
		other.m_M.swap(m_M);
	}

	// Nothing to release by hand, the registers are owned by a std::vector.
	hyper_log_log::~hyper_log_log() = default;

	/*
	 * Inserts v. The upper m_b bits of the hash select the register, the register keeps the largest value of
	 * leading_zeros_plus_one seen for the remaining bits.
	 * */
	void hyper_log_log::insert(size_t v) {
		const size_t hashed = algorithm::hash(std::to_string(v));
		const size_t bucket = hashed >> (64-m_b);
		const char rank = leading_zeros_plus_one(hashed << m_b);
		if (m_M[bucket] < rank) {
			m_M[bucket] = rank;
		}
	}

	/*
	 * Returns the estimated number of distinct inserted values.
	 * */
	size_t hyper_log_log::count() const {
		double Z = 0.0;
		for (size_t j = 0; j < m_len; j++) {
			// 2^-register. ldexp instead of 1.0 / (1ull << register): leading_zeros_plus_one can return 65 and
			// registers loaded from external data can hold anything, shifting by 64 or more is undefined.
			Z += std::ldexp(1.0, -static_cast<int>(m_M[j]));
		}
		double E = m_alpha * m_len * m_len / Z;

		// Only small range correction implemented since we use 64 bit hash.
		if (E <= (5.0/2.0) * m_len) {
			const size_t V = num_zero_registers();
			if (V != 0) {
				E = m_len * std::log((double)m_len / V);
			}
		}

		return (size_t)E;
	}

	/*
	 * Sets all registers back to zero.
	 * */
	void hyper_log_log::reset() {
		for (char &reg : m_M) {
			reg = 0;
		}
	}

	/*
	 * Returns the position (1 based, counted from the most significant bit) of the first set bit in x.
	 * Returns 65 if no bit is set.
	 * */
	char hyper_log_log::leading_zeros_plus_one(size_t x) const {
		size_t rank = 1;
		size_t probe = 0x1ull << 63;
		while (probe != 0 && (x & probe) == 0) {
			rank++;
			probe >>= 1;
		}
		return rank;
	}

	/*
	 * Number of registers that are still zero.
	 * */
	size_t hyper_log_log::num_zero_registers() const {
		int zeros = 0;
		for (const char reg : m_M) {
			if (reg == 0) {
				zeros++;
			}
		}
		return zeros;
	}

	/*
	 * Relative error bound, three times the standard error 1.04 / sqrt(number of registers).
	 * */
	double hyper_log_log::error_bound() const {
		const double registers = (double)m_len;
		const double stdd = 1.04 / sqrt(registers);
		return stdd * 3; // Gives 99% confidence
	}

	/*
	 * Returns the union of this counter and hl, every register of the result is the maximum of the two inputs.
	 * The result has the same number of registers as this counter. Both counters should have been created with
	 * the same b, registers that do not exist in hl are taken from this counter unchanged.
	 * */
	hyper_log_log hyper_log_log::operator +(const hyper_log_log &hl) const {
		// Start from a copy of this counter so the result gets the same b. A default constructed result always
		// has b = 15, which gives a wrong estimate for smaller counters and is written out of bounds for larger ones.
		hyper_log_log res(*this);
		const size_t shared = std::min(res.m_M.size(), hl.m_M.size());
		std::transform(res.m_M.begin(), res.m_M.begin() + shared, hl.m_M.begin(), res.m_M.begin(),
			[] (char a, char b) { return std::max(a, b); });

		return res;
	}

	/*
	 * Merges hl into this counter, every register becomes the maximum of the two inputs.
	 * Both counters should have been created with the same b. Only the registers that exist in both counters are
	 * merged so a smaller hl is never read out of bounds.
	 * */
	hyper_log_log &hyper_log_log::operator +=(const hyper_log_log &hl) {
		const size_t shared = std::min(m_M.size(), hl.m_M.size());
		std::transform(m_M.begin(), m_M.begin() + shared, hl.m_M.begin(), m_M.begin(),
			[] (char a, char b) { return std::max(a, b); });
		return *this;
	}

	/*
	 * Copies the registers of other into this counter. b is constant and stays as it is, both counters should have
	 * been created with the same b. Only the registers that exist in both counters are copied so a larger other is
	 * never written past the end of our registers.
	 * */
	hyper_log_log &hyper_log_log::operator =(const hyper_log_log &other) {
		// std::copy must not be given a destination inside its own source range.
		if (this == &other) return *this;

		const size_t shared = std::min(m_M.size(), other.m_M.size());
		std::copy(other.m_M.cbegin(), other.m_M.cbegin() + shared, m_M.begin());
		return *this;
	}

}


================================================
FILE: src/algorithm/hyper_log_log.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include <cmath>
#include <cstring>
#include <algorithm>
#include <iostream>
#include <vector>

namespace algorithm {

	/*
	 * Implementation of the hyper log log algorithm as described by Flajolet et al.
	 * http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf
	 *
	 * Using 64 bit hash instead of 32bit.
	 * */

	class hyper_log_log {

		public:

			/*
			 * initializes with given b parameter. size of data structure will be 2^b bytes.
			 * */
			hyper_log_log(size_t b = 15);
			// Initializes the 2^b registers from the given buffer.
			hyper_log_log(const char *registers, size_t b = 15);
			hyper_log_log(const hyper_log_log &other);
			hyper_log_log(hyper_log_log &&other);
			~hyper_log_log();

			// Adds v to the counter.
			void insert(size_t v);
			// Estimated number of distinct inserted values.
			size_t count() const;
			// Relative error bound of count(), three standard errors.
			double error_bound() const;
			// Sets all registers to zero.
			void reset();

			// Raw access to the registers, data_size() bytes.
			const char *data() const { return m_M.data(); };
			char *data() { return m_M.data(); };
			int b() const { return m_b; }
			size_t data_size() const { return m_len; };

			// Union of two counters, register wise maximum. Both sides should use the same b.
			hyper_log_log operator +(const hyper_log_log &hl) const;
			hyper_log_log &operator +=(const hyper_log_log &hl);
			// Copies the registers only, b is constant and is not changed by assignment.
			hyper_log_log &operator =(const hyper_log_log &other);

			// 1 based position of the first set bit in x counted from the most significant bit.
			char leading_zeros_plus_one(size_t x) const;

		private:
			
			std::vector<char> m_M; // The registers, one byte each.
			// The constructors initialize m_len from m_b and m_alpha from m_len, keep this declaration order.
			const int m_b;
			const size_t m_len; // Number of registers, 2^m_b.
			const double m_alpha; // Bias correction constant used by count().

			size_t num_zero_registers() const;

	};

}


================================================
FILE: src/algorithm/intersection.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <functional>

#include "intersection.h"

namespace algorithm {

	/*
	 * Returns the intersection of all bitmaps in input. An empty input gives an empty bitmap.
	 * */
	roaring::Roaring intersection(const std::vector<roaring::Roaring> &input) {

		if (input.empty()) {
			return roaring::Roaring();
		}

		auto it = input.begin();
		roaring::Roaring common = *it;

		for (++it; it != input.end(); ++it) {
			common &= *it;
		}

		return common;
	}

}


================================================
FILE: src/algorithm/intersection.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#pragma once

#include <vector>
#include <memory>
#include "roaring/roaring.hh"

namespace algorithm {

	roaring::Roaring intersection(const std::vector<roaring::Roaring> &input);

	/*
	 * Intersects several vectors that are sorted ascending by operator<.
	 *
	 * The shortest vector (the first one on ties) drives the iteration. For each of its
	 * elements every other vector is advanced until it reaches an element that is not
	 * smaller. If all vectors hold an equivalent element, the element taken from the
	 * shortest vector is emitted.
	 *
	 * sum_fun(a, b) is called once for every matching element b found in a vector other
	 * than the shortest one, with a being the working copy of the candidate. It is also
	 * called for vectors visited before a mismatch is detected; in that case the working
	 * copy is simply discarded.
	 */
	template<typename item>
	std::vector<item> intersection(const std::vector<std::vector<item>> &input,
		std::function<void(item &a, const item &b)> sum_fun) {

		if (input.empty()) return {};

		// Locate the first shortest vector.
		size_t pivot = 0;
		size_t pivot_len = SIZE_MAX;
		for (size_t idx = 0; idx < input.size(); idx++) {
			if (input[idx].size() < pivot_len) {
				pivot_len = input[idx].size();
				pivot = idx;
			}
		}

		// One read cursor per vector, cursors only ever move forward.
		std::vector<size_t> cursors(input.size(), 0);
		std::vector<item> result;

		for (; cursors[pivot] < pivot_len; cursors[pivot]++) {

			item candidate = input[pivot][cursors[pivot]];
			bool in_all = true;

			for (size_t idx = 0; in_all && idx < input.size(); idx++) {
				const std::vector<item> &list = input[idx];
				size_t &cursor = cursors[idx];

				// Skip everything that is strictly smaller than the candidate.
				while (cursor < list.size() && list[cursor] < candidate) {
					cursor++;
				}

				if (cursor >= list.size() || candidate < list[cursor]) {
					// Vector exhausted or it jumped past the candidate: no match.
					in_all = false;
				} else if (idx != pivot) {
					sum_fun(candidate, list[cursor]);
				}
			}

			if (in_all) {
				result.push_back(candidate);
			}
		}

		return result;
	}

	/*
	 * Intersects several raw arrays that are sorted ascending by operator<.
	 *
	 * input[i] is an array holding lengths[i] elements. The caller must pass exactly one
	 * length per array; this is not checked here. The shortest array (the first one on
	 * ties) drives the iteration and the returned elements are copies taken from it.
	 *
	 * lengths is taken by const reference so that the vector is not copied on every call.
	 */
	template<typename item>
	std::vector<item> intersection(const std::vector<std::unique_ptr<item[]>> &input, const std::vector<size_t> &lengths) {

		if (input.size() == 0) return {};

		// Locate the first shortest array.
		size_t shortest_vector_position = 0;
		size_t shortest_len = SIZE_MAX;
		for (size_t idx = 0; idx < lengths.size(); idx++) {
			if (shortest_len > lengths[idx]) {
				shortest_len = lengths[idx];
				shortest_vector_position = idx;
			}
		}

		// One read cursor per array, cursors only ever move forward.
		std::vector<size_t> positions(input.size(), 0);
		std::vector<item> intersection;

		while (positions[shortest_vector_position] < shortest_len) {

			bool all_equal = true;
			const item &value = input[shortest_vector_position][positions[shortest_vector_position]];

			for (size_t idx = 0; idx < input.size(); idx++) {
				const std::unique_ptr<item[]> &ptr = input[idx];
				const size_t len = lengths[idx];
				size_t &pos = positions[idx];

				// Skip everything that is strictly smaller than the candidate.
				while (pos < len && ptr[pos] < value) {
					pos++;
				}
				if (pos >= len || value < ptr[pos]) {
					// Array exhausted or it jumped past the candidate: no match.
					all_equal = false;
					break;
				}
			}
			if (all_equal) {
				intersection.push_back(value);
			}

			positions[shortest_vector_position]++;
		}

		return intersection;
	}

	/*
	 * Plain intersection of sorted vectors: forwards to the summing overload with a
	 * combiner that leaves the emitted element untouched.
	 */
	template<typename item>
	std::vector<item> intersection(const std::vector<std::vector<item>> &input) {
		const std::function<void(item &, const item &)> keep_first = [](item &, const item &) {};
		return intersection<item>(input, keep_first);
	}

}


================================================
FILE: src/algorithm/sort.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "sort.h"

namespace algorithm {

}


================================================
FILE: src/algorithm/sort.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include <vector>
#include <span>

namespace algorithm {

	namespace sort {

		/*
		 * Merges two arrays that are sorted with respect to compare and appends the
		 * merged sequence to arr3 (existing content of arr3 is kept).
		 *
		 * An element of arr1 is taken only when compare(arr1 element, arr2 element) is
		 * true, so on ties the element from arr2 comes first.
		 */
		template<typename data_record, typename F>
		void merge_arrays(const std::vector<data_record> &arr1, const std::vector<data_record> &arr2, F compare, std::vector<data_record> &arr3) {

			auto left = arr1.begin();
			auto right = arr2.begin();

			while (left != arr1.end() && right != arr2.end()) {
				if (compare(*left, *right)) {
					arr3.push_back(*left);
					++left;
				} else {
					arr3.push_back(*right);
					++right;
				}
			}

			// At most one of the inputs still has elements left, append them as they are.
			for (; left != arr1.end(); ++left) arr3.push_back(*left);
			for (; right != arr2.end(); ++right) arr3.push_back(*right);
		}

		template<typename data_record, typename F>
		void merge_arrays(const std::span<data_record> *arr1, const std::span<data_record> *arr2, F compare, std::vector<data_record> &arr3) {

			size_t i = 0, j = 0;

			while (i < arr1->size() && j < arr2->size()) {
				if (compare((*arr1)[i], (*arr2)[j])) {
					arr3.push_back((*arr1)[i++]);
				} else {
					arr3.push_back((*arr2)[j++]);
				}
			}

			while (i < arr1->size()) arr3.push_back((*arr1)[i++]);
			while (j < arr2->size()) arr3.push_back((*arr2)[j++]);
		}

		/*
		 * Merges two vectors sorted by operator< and appends the result to arr3.
		 */
		template<typename data_record>
		void merge_arrays(const std::vector<data_record> &arr1, const std::vector<data_record> &arr2, std::vector<data_record> &arr3) {
			const auto less_than = [](const data_record &lhs, const data_record &rhs) {
				return lhs < rhs;
			};
			merge_arrays(arr1, arr2, less_than, arr3);
		}

		/*
		 * Merges any number of vectors sorted by operator< and appends the result to res.
		 * The comparator overload used here is declared further down in this header, the
		 * call is resolved when the template is instantiated.
		 */
		template<typename data_record>
		void merge_arrays(const std::vector<std::vector<data_record>> &arrays, std::vector<data_record> &res) {
			const auto less_than = [](const data_record &lhs, const data_record &rhs) {
				return lhs < rhs;
			};
			merge_arrays(arrays, less_than, res);
		}

		/*
		 * Merges the sorted arrays with indices i..j (both inclusive, i <= j) and appends
		 * the result to res.
		 *
		 * One array is copied, two arrays are merged directly and larger ranges are split
		 * in the middle, merged recursively into temporaries and then merged together.
		 */
		template<typename data_record, typename F>
		void merge_array_range(const std::vector<std::vector<data_record>> &arrays, size_t i, size_t j, F compare, std::vector<data_record> &res) {
			if (i == j) {
				res.insert(res.end(), arrays[i].begin(), arrays[i].end());
				return;
			}

			if (j - i == 1) {
				merge_arrays(arrays[i], arrays[j], compare, res);
				return;
			}

			const size_t mid = (i + j) / 2;

			std::vector<data_record> lower_half;
			std::vector<data_record> upper_half;

			merge_array_range(arrays, i, mid, compare, lower_half);
			merge_array_range(arrays, mid + 1, j, compare, upper_half);

			merge_arrays(lower_half, upper_half, compare, res);
		}

		/*
		 * Merges any number of arrays sorted with respect to compare and appends the
		 * result to res. Does nothing when there are no arrays.
		 */
		template<typename data_record, typename F>
		void merge_arrays(const std::vector<std::vector<data_record>> &arrays, F compare, std::vector<data_record> &res) {
			if (!arrays.empty()) {
				const size_t last = arrays.size() - 1;
				merge_array_range(arrays, 0, last, compare, res);
			}
		}

		/*
		 * Span version of merge_array_range: merges the sorted spans with indices i..j
		 * (both inclusive, i <= j) and appends the result to res.
		 *
		 * One span is copied, two spans are merged directly and larger ranges are split
		 * in the middle, merged recursively into temporaries and then merged together.
		 */
		template<typename data_record, typename F>
		void merge_array_range(const std::vector<std::span<data_record> *> &arrays, size_t i, size_t j, F compare, std::vector<data_record> &res) {
			if (i == j) {
				res.insert(res.end(), arrays[i]->begin(), arrays[i]->end());
				return;
			}

			if (j - i == 1) {
				merge_arrays(arrays[i], arrays[j], compare, res);
				return;
			}

			const size_t mid = (i + j) / 2;

			std::vector<data_record> lower_half;
			std::vector<data_record> upper_half;

			merge_array_range(arrays, i, mid, compare, lower_half);
			merge_array_range(arrays, mid + 1, j, compare, upper_half);

			merge_arrays(lower_half, upper_half, compare, res);
		}

		/*
		 * Merges any number of spans sorted with respect to compare and appends the
		 * result to res. Does nothing when there are no spans.
		 */
		template<typename data_record, typename F>
		void merge_arrays(const std::vector<std::span<data_record> *> &arrays, F compare, std::vector<data_record> &res) {
			if (!arrays.empty()) {
				const size_t last = arrays.size() - 1;
				merge_array_range(arrays, 0, last, compare, res);
			}
		}
	
	}

}


================================================
FILE: src/algorithm/sum_sorted.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include <vector>
#include <functional>

namespace algorithm {

	/*
	 * Merges several vectors sorted ascending by operator< into one sorted vector.
	 *
	 * In every round the smallest head element is picked (from the first vector on
	 * ties). Heads of the vectors after it that compare equal (operator==) are folded
	 * into a copy of that element with plus_eq(sum, other) and consumed as well.
	 * Elements that are repeated inside one vector are emitted in separate rounds.
	 */
	template<class dtype>
	std::vector<dtype> sum_sorted(const std::vector<std::vector<dtype>> &input,
			std::function<void(dtype &a, const dtype &b)> plus_eq) {

		const size_t num_lists = input.size();
		if (num_lists == 0) return {};

		std::vector<dtype> merged;
		std::vector<size_t> cursor(num_lists, 0);

		for (;;) {
			// Find the first vector that still has unread elements.
			size_t lead = 0;
			while (lead < num_lists && cursor[lead] >= input[lead].size()) {
				lead++;
			}
			if (lead == num_lists) break;

			// Move lead to the first vector holding the smallest head element.
			for (size_t i = 0; i < num_lists; i++) {
				if (cursor[i] < input[i].size() && input[i][cursor[i]] < input[lead][cursor[lead]]) {
					lead = i;
				}
			}

			const dtype &key = input[lead][cursor[lead]];
			dtype total = key;
			cursor[lead]++;

			// Vectors before lead have strictly larger heads, only later ones can match.
			for (size_t i = lead + 1; i < num_lists; i++) {
				const std::vector<dtype> &list = input[i];
				size_t &at = cursor[i];

				while (at < list.size() && list[at] < key) {
					at++;
				}
				if (at < list.size() && list[at] == key) {
					plus_eq(total, list[at]);
					at++;
				}
			}

			merged.push_back(total);
		}

		return merged;
	}

}


================================================
FILE: src/algorithm/top_k.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include <vector>
#include <functional>

namespace algorithm {

	/*
	 * Returns the k largest elements (with respect to ordered) of an unsorted const vector
	 * in linear time using a 2k memory buffer. The order of the returned elements is
	 * unspecified.
	 *
	 * ordered(a, b) must return true when a sorts before b (strict weak ordering).
	 *
	 * - k == 0 returns an empty vector.
	 * - If input holds k elements or less, a copy of input is returned.
	 * */

	template<class dtype>
	std::vector<dtype> top_k(const std::vector<dtype> &input, size_t k,
		std::function<bool(const dtype &, const dtype &)> ordered) {

		// Nothing to select. Without this guard the loop below would never advance.
		if (k == 0) return {};

		if (input.size() <= k) return input;

		if (input.size() <= 2 * k) {
			// Small input: one partition that leaves exactly the k largest at the end.
			std::vector<dtype> buf(input.begin(), input.end());
			const auto split = buf.begin() + (buf.size() - k);
			std::nth_element(buf.begin(), split, buf.end(), ordered);
			return std::vector<dtype>(split, buf.end());
		}

		std::vector<dtype> buf(input.begin(), input.begin() + (2 * k));

		size_t idx = 2 * k;
		while (idx < input.size()) {
			// After partitioning, buf[k..2k) holds the k largest seen so far and buf[k] is
			// the smallest of them. The lower half buf[0..k) is free to be overwritten.
			std::nth_element(buf.begin(), buf.begin() + k, buf.end(), ordered);
			for (size_t i = 0, j = idx; i < k && j < input.size(); i++, j++) {
				// Only insert objects that are out of order compared to pivot buf[k]
				if (!ordered(input[j], buf[k])) {
					buf[i] = input[j];
				}
			}
			idx += k;
		}
		// Run final partition.
		std::nth_element(buf.begin(), buf.begin() + k, buf.end(), ordered);

		return std::vector<dtype>(buf.begin() + k, buf.end());
	}

	/*
	 * Convenience overload of top_k that orders the elements with operator<.
	 * */
	template<class dtype>
	std::vector<dtype> top_k(const std::vector<dtype> &input, size_t k) {
		const std::function<bool(const dtype &, const dtype &)> less_than =
			[](const dtype &lhs, const dtype &rhs) { return lhs < rhs; };
		return top_k<dtype>(input, k, less_than);
	}

}


================================================
FILE: src/api/api_response.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "api_response.h"
#include "indexer/return_record.h"
#include "full_text/search_metric.h"
#include "parser/unicode.h"
#include "json.hpp"

namespace api {

	/*
	 * Builds the JSON response for a search: one object per result followed by the
	 * status, the timing and the counters taken from metric. The pretty printed document
	 * (4 spaces indentation) is stored in m_response.
	 */
	api_response::api_response(const std::vector<indexer::return_record> &results, const struct full_text::search_metric &metric, double profile) {

		using json = nlohmann::ordered_json;

		json serialized_results;
		for (const indexer::return_record &record : results) {
			try {
				json entry;
				entry["url"] = record.m_url.str();
				entry["title"] = parser::unicode::encode(record.m_title);
				entry["snippet"] = parser::unicode::encode(record.m_snippet);
				entry["score"] = record.m_score;
				// Hashes are sent as decimal strings.
				entry["domain_hash"] = std::to_string(record.m_domain_hash);
				entry["url_hash"] = std::to_string(record.m_url.hash());

				serialized_results.push_back(entry);
			} catch (const nlohmann::detail::type_error &) {
				// skip this result.
				// in future log this and fix what is wrong.
			}
		}

		// ordered_json keeps the keys in insertion order, so the order below is the
		// order of the keys in the response.
		json document;
		document["status"] = "success";
		document["time_ms"] = profile;
		document["total_found"] = metric.m_total_found;
		document["total_url_links_found"] = metric.m_total_url_links_found;
		document["total_domain_links_found"] = metric.m_total_domain_links_found;
		document["links_handled"] = metric.m_links_handled;
		document["link_domain_matches"] = metric.m_link_domain_matches;
		document["link_url_matches"] = metric.m_link_url_matches;
		document["results"] = serialized_results;

		const int indent_width = 4;
		m_response = document.dump(indent_width);
	}

	// Nothing to release explicitly, the members clean up after themselves.
	api_response::~api_response() = default;

	// Writes the serialized JSON response to the stream and returns the stream.
	std::ostream &operator<<(std::ostream &os, const api_response &api_response) {
		return os << api_response.m_response;
	}

}


================================================
FILE: src/api/api_response.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include <iostream>
#include <vector>

namespace full_text {
	struct search_metric;
}

namespace indexer {
	class return_record;
}

namespace api {

	/*
	 * JSON response for a search request. The response is serialized once in the
	 * constructor and can afterwards be written to any std::ostream.
	 */
	class api_response {

		public:
			/*
			 * results: the records to list under "results".
			 * metric:  counters that are copied into the response.
			 * profile: stored under "time_ms".
			 */
			api_response(const std::vector<indexer::return_record> &results, const struct full_text::search_metric &metric, double profile);
			~api_response();

			// Writes the serialized response to os.
			friend std::ostream &operator<<(std::ostream &os, const api_response &api_response);

		private:

			// The complete serialized JSON document.
			std::string m_response;

	};

}


================================================
FILE: src/api/result_with_snippet.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "result_with_snippet.h"
#include "text/text.h"

namespace api {

	/*
	 * Parses one tab separated line into a result. Score and domain hash are taken
	 * from res.
	 *
	 * Columns used: 0 = url, 1 = title, 3 = meta, 4 = text that the snippet is made of.
	 * When the snippet made from column 4 is empty the snippet is made from meta instead.
	 * All other columns are ignored.
	 */
	result_with_snippet::result_with_snippet(const std::string &tsv_data, const indexer::return_record &res)
	: m_score(res.m_score), m_domain_hash(res.m_domain_hash) {
		size_t field_begin = 0;

		for (size_t column = 0; ; column++) {
			const size_t tab_pos = tsv_data.find('\t', field_begin);
			// For the last column tab_pos is npos and substr() reads to the end of the line.
			const size_t field_len = tab_pos - field_begin;

			switch (column) {
				case 0:
					m_url = URL(tsv_data.substr(field_begin, field_len));
					break;
				case 1:
					m_title = tsv_data.substr(field_begin, field_len);
					break;
				case 3:
					m_meta = tsv_data.substr(field_begin, field_len);
					break;
				case 4:
					m_snippet = make_snippet(tsv_data.substr(field_begin, field_len));
					if (m_snippet.empty()) {
						m_snippet = make_snippet(m_meta);
					}
					break;
				default:
					break;
			}

			if (tab_pos == std::string::npos) break;
			field_begin = tab_pos + 1;
		}
	}

	// Nothing to release explicitly, the members clean up after themselves.
	result_with_snippet::~result_with_snippet() = default;

	/*
	 * Makes a snippet out of text: at most the first 140 bytes, trimmed, with "..."
	 * appended when text had to be cut.
	 *
	 * - The ellipsis depends on whether text was actually cut, not on the length after
	 *   trimming. Otherwise it would be missing whenever the cut ends in whitespace and
	 *   would show up for texts that are exactly 140 bytes long.
	 * - The cut is moved back by up to 3 bytes so that it does not end up inside a
	 *   multi byte UTF-8 sequence.
	 * - A snippet that is empty after trimming stays empty (no ellipsis), so callers
	 *   can still detect it with size() == 0.
	 */
	std::string result_with_snippet::make_snippet(const std::string &text) const {
		const size_t max_bytes = 140;
		const bool truncated = text.size() > max_bytes;

		size_t cut = truncated ? max_bytes : text.size();
		if (truncated) {
			// text[cut] is the first byte that is dropped. If it is a UTF-8 continuation
			// byte (10xxxxxx) the sequence it belongs to started before the cut.
			for (size_t step = 0; step < 3 && cut > 0; step++) {
				if ((static_cast<unsigned char>(text[cut]) & 0xC0) != 0x80) break;
				cut--;
			}
		}

		std::string response = text.substr(0, cut);
		text::trim(response);
		if (truncated && !response.empty()) response += "...";
		return response;
	}

}


================================================
FILE: src/api/result_with_snippet.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include <iostream>
#include "URL.h"
#include "indexer/return_record.h"

namespace api {

	/*
	 * A search result parsed from one tab separated line together with the score and
	 * the domain hash of the matching return_record.
	 */
	class result_with_snippet {

	public:
		/*
		 * tsv_data: tab separated line, the columns 0 (url), 1 (title), 3 (meta) and 4
		 *           (text for the snippet) are read.
		 * res:      supplies m_score and m_domain_hash.
		 */
		result_with_snippet(const std::string &tsv_data, const indexer::return_record &res);
		~result_with_snippet();

		// Read only accessors. The references are valid as long as this object lives.
		const URL &url() const { return m_url; };
		const std::string &title() const { return m_title; };
		const std::string &snippet() const { return m_snippet; };
		const float &score() const { return m_score; };
		const uint64_t &domain_hash() const { return m_domain_hash; };

	private:

		URL m_url;
		std::string m_title;
		// Column 3 of the line, used for the snippet when column 4 gives an empty one.
		std::string m_meta;
		std::string m_snippet;
		float m_score;
		uint64_t m_domain_hash;

		// Shortens and trims text into a snippet.
		std::string make_snippet(const std::string &text) const;

	};

}


================================================
FILE: src/cluster/cluster.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once


================================================
FILE: src/cluster/document.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "document.h"
#include "algorithm/hash.h"
#include "text/text.h"
#include "URL.h"

namespace cluster {

	// Creates a document with the placeholder name "unnamed document".
	document::document()
	: document("unnamed document") {
	}

	document::document(const std::string &name)
	: m_name(name) {

	}

	// Nothing to release explicitly, the members clean up after themselves.
	document::~document() = default;

	// Splits text into words and increases the counter of every word hash by one.
	void document::read_text(const std::string &text) {
		const std::vector<std::string> tokens = text::get_words(text, 0);

		for (auto token = tokens.begin(); token != tokens.end(); ++token) {
			const auto word_hash = algorithm::hash(*token);
			m_counts[word_hash] += 1;
		}
	}

	/*
	 * Splits text into words and adds them to the corpus: the counter of every word
	 * hash is increased and the first word seen for a hash is remembered in corp.words.
	 */
	void read_text_to_corpus(corpus &corp, const std::string &text) {
		const std::vector<std::string> tokens = text::get_words(text, 0);

		for (auto token = tokens.begin(); token != tokens.end(); ++token) {
			const size_t word_hash = algorithm::hash(*token);
			corp.counts[word_hash] += 1;

			// Keep the word that was stored first for this hash.
			if (corp.words.find(word_hash) == corp.words.end()) {
				corp.words[word_hash] = *token;
			}
		}
	}

	/*
	 * Reads tab separated lines "url<TAB>text" from tsv.
	 *
	 * The text of every line is added to the document of the url's host (documents are
	 * keyed by host hash and created on first use with the host as name) and to the
	 * corpus. Lines without a tab are skipped.
	 */
	void read_corpus(corpus &corp, documents &documents, std::stringstream &tsv) {
		std::string line;
		while (std::getline(tsv, line)) {
			const size_t pos = line.find('\t');
			if (pos == std::string::npos) continue;

			URL url(line.substr(0, pos));
			// NOTE(review): the text keeps the leading tab, exactly as before.
			const std::string doc_text = line.substr(pos);

			const size_t key = url.host_hash();

			// Look the document up once and create it when the host is new.
			auto found = documents.find(key);
			if (found == documents.end()) {
				found = documents.emplace(key, url.host()).first;
			}
			found->second.read_text(doc_text);

			read_text_to_corpus(corp, doc_text);
		}
	}

	/*
	 * Prints up to 100 of the most frequent words of document to stdout, one
	 * "word = count" line per word, most frequent first. The order among words with
	 * the same count is unspecified.
	 *
	 * The words are looked up in corp.words with operator[], so a hash that is missing
	 * from the corpus is printed as an empty word and gets inserted into corp.words.
	 */
	void print_document(corpus &corp, const document &document) {
		std::vector<std::pair<size_t, size_t>> keys;
		keys.reserve(document.m_counts.size());
		for (const auto &iter : document.m_counts) {
			keys.emplace_back(iter.first, iter.second);
		}

		std::sort(keys.begin(), keys.end(), [](const auto &a, const auto &b) {
			return a.second > b.second;
		});

		// std::min<size_t> instead of the literal 100ul: size_t is not unsigned long on
		// every platform and std::min needs both arguments to have the same type.
		const size_t limit = std::min<size_t>(100, keys.size());
		for (size_t i = 0; i < limit; i++) {
			std::cout << corp.words[keys[i].first] << " = " << keys[i].second << std::endl;
		}
	}
}


================================================
FILE: src/cluster/document.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include <iostream>
#include <unordered_map>
#include <cstdint>

namespace cluster {

	/*
	 * Word statistics shared by all documents, both maps are keyed by word hash.
	 */
	struct corpus_s {
		// Word hash -> word.
		std::unordered_map<size_t, std::string> words;
		// Word hash -> number of occurrences.
		std::unordered_map<size_t, size_t> counts;
	};

	typedef corpus_s corpus;

	/*
	 * A named bag of words: counts how often every word hash occurs in the texts that
	 * were read into the document.
	 */
	class document {
		public:
			// Creates a document named "unnamed document".
			document();
			document(const std::string &name);
			~document();
			std::string name() const { return m_name; };
			// Number of distinct word hashes, not the total number of words.
			size_t size() const { return m_counts.size(); };

			// Adds the words of text to the counters.
			void read_text(const std::string &text);
			// Friend because it reads m_counts directly.
			friend void print_document(corpus &corp, const document &document);

		private:

			std::string m_name;
			// Word hash -> number of occurrences in this document.
			std::unordered_map<size_t, size_t> m_counts;

	};

	// A topic is represented by the same type as a document.
	typedef document topic;
	// Collection of documents, read_corpus keys it by host hash.
	typedef std::unordered_map<size_t, document> documents;

	// Reads "url<TAB>text" lines from tsv into documents and into the corpus.
	void read_corpus(corpus &corp, documents &documents, std::stringstream &tsv);
	// Prints the most frequent words of document to stdout, words are looked up in corp.
	void print_document(corpus &corp, const document &document);
}


================================================
FILE: src/common/ThreadPool.h
================================================
/*
Copyright (c) 2012 Jakob Progsch, Václav Zeman

This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages
arising from the use of this software.

Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:

   1. The origin of this software must not be misrepresented; you must not
   claim that you wrote the original software. If you use this software
   in a product, an acknowledgment in the product documentation would be
   appreciated but is not required.

   2. Altered source versions must be plainly marked as such, and must not be
   misrepresented as being the original software.

   3. This notice may not be removed or altered from any source
   distribution.
*/

#ifndef THREAD_POOL_H
#define THREAD_POOL_H

#include <vector>
#include <queue>
#include <memory>
#include <thread>
#include <mutex>
#include <condition_variable>
#include <future>
#include <functional>
#include <stdexcept>

// Pool of worker threads that execute queued tasks.
class ThreadPool {
public:
    // Launches the given number of worker threads.
    explicit ThreadPool(size_t);
    // Queues f(args...) and returns a future for its result.
    // Throws std::runtime_error if the pool has already been stopped.
    template<class F, class... Args>
    auto enqueue(F&& f, Args&&... args) 
        -> std::future<typename std::result_of<F(Args...)>::type>;
    // Sets the stop flag, wakes all workers and joins them.
    ~ThreadPool();
private:
    // need to keep track of threads so we can join them
    std::vector< std::thread > workers;
    // the task queue
    std::queue< std::function<void()> > tasks;
    
    // synchronization
    // queue_mutex guards tasks and stop; condition is signalled when either changes.
    std::mutex queue_mutex;
    std::condition_variable condition;
    bool stop;
};
 
// the constructor just launches some amount of workers
// Each worker repeats: wait for a task or the stop flag, take one task from
// the queue while holding queue_mutex, then run it after the lock is released.
inline ThreadPool::ThreadPool(size_t threads)
    :   stop(false)
{
    for(size_t i = 0;i<threads;++i)
        workers.emplace_back(
            [this]
            {
                for(;;)
                {
                    std::function<void()> task;

                    {
                        // The lock is scoped to this block so the task itself runs unlocked.
                        std::unique_lock<std::mutex> lock(this->queue_mutex);
                        // Sleep until the pool is stopping or there is work to do.
                        this->condition.wait(lock,
                            [this]{ return this->stop || !this->tasks.empty(); });
                        // Exit only when stopping AND the queue is empty, so tasks
                        // that were queued before the stop still get executed.
                        if(this->stop && this->tasks.empty())
                            return;
                        task = std::move(this->tasks.front());
                        this->tasks.pop();
                    }

                    task();
                }
            }
        );
}

// add new work item to the pool
// Binds f to args, wraps the call in a packaged_task and pushes it on the
// queue. Returns the future of that packaged_task.
// Throws std::runtime_error when called after the pool has been stopped.
template<class F, class... Args>
auto ThreadPool::enqueue(F&& f, Args&&... args) 
    -> std::future<typename std::result_of<F(Args...)>::type>
{
    using return_type = typename std::result_of<F(Args...)>::type;

    // Held through a shared_ptr so the lambda stored in the std::function
    // queue below can capture it by copy.
    auto task = std::make_shared< std::packaged_task<return_type()> >(
            std::bind(std::forward<F>(f), std::forward<Args>(args)...)
        );
        
    std::future<return_type> res = task->get_future();
    {
        std::unique_lock<std::mutex> lock(queue_mutex);

        // don't allow enqueueing after stopping the pool
        if(stop)
            throw std::runtime_error("enqueue on stopped ThreadPool");

        tasks.emplace([task](){ (*task)(); });
    }
    // Notify after the lock has been released; one new task needs one worker.
    condition.notify_one();
    return res;
}

// the destructor joins all threads
// Workers only exit once the queue is empty (see the worker loop), so this
// blocks until every task that was already queued has run.
inline ThreadPool::~ThreadPool()
{
    {
        // stop is written under the mutex because workers read it under the same mutex.
        std::unique_lock<std::mutex> lock(queue_mutex);
        stop = true;
    }
    // Wake every waiting worker so each one can observe the stop flag.
    condition.notify_all();
    for(std::thread &worker: workers)
        worker.join();
}

#endif


================================================
FILE: src/common/datetime.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "datetime.h"
#include <ctime>

namespace common {

	/*
		Converts a timestamp to broken-down local time.
		Uses the re-entrant localtime_r instead of localtime, which returns a pointer to shared static storage
		and is therefore unsafe when several threads ask for the time concurrently.
	*/
	static struct tm local_tm_from(time_t timestamp) {
		struct tm parts = {};
		localtime_r(&timestamp, &parts);
		return parts;
	}

	/*
		Encodes the date part of a broken-down time as the decimal number YYYYMMDD.
	*/
	static size_t encode_date(const struct tm &parts) {
		// tm_year counts years since 1900.
		const size_t year = 1900 + parts.tm_year;
		return (year * 100 * 100) + ((parts.tm_mon + 1) * 100) + parts.tm_mday;
	}

	/*
		Encodes the time part of a broken-down time as the decimal number HHMMSS.
	*/
	static size_t encode_time(const struct tm &parts) {
		return (parts.tm_hour * 100 * 100) + (parts.tm_min * 100) + parts.tm_sec;
	}

	/*
		Returns the current local date as the decimal number YYYYMMDD.
	*/
	size_t cur_date() {
		return encode_date(local_tm_from(time(nullptr)));
	}

	/*
		Returns the current local time of day as the decimal number HHMMSS.
	*/
	size_t cur_time() {
		return encode_time(local_tm_from(time(nullptr)));
	}

	/*
		Returns the current local date and time as the decimal number YYYYMMDDHHMMSS.
		The clock is sampled once so that the date and the time always belong to the same instant, two separate
		samples could straddle midnight and combine yesterday's date with today's time.
	*/
	size_t cur_datetime() {
		const struct tm now = local_tm_from(time(nullptr));
		return (encode_date(now) * 100 * 100 * 100) + encode_time(now);
	}

	/*
		Returns the current UTC time formatted as YYYY-MM-DDTHH:MM:SSZ.
		Returns an empty string if the formatted time does not fit the buffer.
	*/
	const std::string iso8601_datetime() {
		const time_t now = time(nullptr);
		struct tm utc = {};
		gmtime_r(&now, &utc);

		// "YYYY-MM-DDTHH:MM:SSZ" is 20 characters plus the terminating NUL.
		char buf[21];
		if (strftime(buf, sizeof(buf), "%FT%TZ", &utc) == 0) {
			// The buffer content is unspecified when strftime reports failure, never read it.
			return std::string();
		}
		return std::string(buf);
	}

}


================================================
FILE: src/common/datetime.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#pragma once

#include <iostream>

namespace common {
	// Current local date encoded as the decimal number YYYYMMDD.
	size_t cur_date();
	// Current local time of day encoded as the decimal number HHMMSS.
	size_t cur_time();
	// Current local date and time encoded as the decimal number YYYYMMDDHHMMSS.
	size_t cur_datetime();
	// Current UTC time formatted with "%FT%TZ", i.e. YYYY-MM-DDTHH:MM:SSZ.
	const std::string iso8601_datetime();
}


================================================
FILE: src/common/dictionary.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "dictionary.h"
#include "logger/logger.h"
#include "file/tsv_file.h"
#include "dictionary_row.h"
#include "algorithm/hash.h"

using namespace std;

namespace common {

	dictionary::dictionary() {

	}

	dictionary::dictionary(file::tsv_file &tsv_file) {
		load_tsv(tsv_file);
	}

	dictionary::~dictionary() {

	}

	void dictionary::load_tsv(file::tsv_file &tsv_file) {
		while (!tsv_file.eof()) {
			auto line = tsv_file.get_line();
			std::stringstream ss(line);
			std::string col;
			getline(ss, col, '\t');

			if (col.size()) {
				size_t key = ::algorithm::hash(col);

				if (m_rows.find(key) != m_rows.end()) {
					handle_collision(key, col);
				}

				m_rows[key] = dictionary_row(ss);
			}
		}
	}

	unordered_map<size_t, dictionary_row>::const_iterator dictionary::find(const std::string &key) const {
		return m_rows.find(::algorithm::hash(key));
	}

	unordered_map<size_t, dictionary_row>::const_iterator dictionary::find(size_t hash) const {
		return m_rows.find(hash);
	}

	unordered_map<size_t, dictionary_row>::const_iterator dictionary::begin() const {
		return m_rows.begin();
	}

	unordered_map<size_t, dictionary_row>::const_iterator dictionary::end() const {
		return m_rows.end();
	}

	bool dictionary::has_key(const std::string &key) const {
		return find(key) != end();
	}

	void dictionary::handle_collision(size_t key, const std::string &col) {
		LOG_ERROR("Collision: " + std::to_string(key) + " " + col);
	}
}


================================================
FILE: src/common/dictionary.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include <map>
#include <unordered_map>
#include "dictionary_row.h"

// Forward declaration, the full definition is only needed by dictionary.cpp.
namespace file {
	class tsv_file;
}

namespace common {

	/*
		Lookup table of dictionary_row objects loaded from a tsv file.
		Rows are keyed by the hash of the first column of each line, the remaining columns make up the row.
	*/
	class dictionary {

		public:

			dictionary();
			// Constructs the dictionary and immediately loads tsv_file into it.
			explicit dictionary(file::tsv_file &tsv_file);
			~dictionary();

			// Adds every line of tsv_file that has a non-empty first column. Existing keys are overwritten.
			void load_tsv(file::tsv_file &tsv_file);

			// Both return end() when no row is stored under the key / hash.
			std::unordered_map<size_t, dictionary_row>::const_iterator find(const std::string &key) const;
			std::unordered_map<size_t, dictionary_row>::const_iterator find(size_t hash) const;

			std::unordered_map<size_t, dictionary_row>::const_iterator begin() const;
			std::unordered_map<size_t, dictionary_row>::const_iterator end() const;

			// True if a row is stored under the hash of key.
			bool has_key(const std::string &key) const;
			// Number of stored rows.
			size_t size() const { return m_rows.size(); }

		private:

			// Rows keyed by the hash of their first tsv column.
			std::unordered_map<size_t, dictionary_row> m_rows;

			// Called by load_tsv when a key is already present, logs the collision.
			void handle_collision(size_t key, const std::string &col);

	};
}


================================================
FILE: src/common/dictionary_row.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "dictionary_row.h"

namespace common {

	/*
		Creates a row without columns.
	*/
	dictionary_row::dictionary_row() {
	}

	/*
		Copies the columns of another row.
	*/
	dictionary_row::dictionary_row(const dictionary_row &row)
	: m_columns(row.m_columns) {
	}

	/*
		Parses a tab separated string of numbers into columns.
	*/
	dictionary_row::dictionary_row(const std::string &row) {
		std::stringstream stream(row);
		read_stream(stream);
	}

	/*
		Parses the remaining tab separated content of stream into columns.
	*/
	dictionary_row::dictionary_row(std::stringstream &stream) {
		read_stream(stream);
	}

	dictionary_row::~dictionary_row() {

	}

	/*
		The getters do not check bounds. column must be a valid index into the parsed columns.
	*/
	int dictionary_row::get_int(int column) const {
		return static_cast<int>(m_columns[column]);
	}

	float dictionary_row::get_float(int column) const {
		return static_cast<float>(m_columns[column]);
	}

	double dictionary_row::get_double(int column) const {
		return m_columns[column];
	}

	/*
		Appends one double per tab separated field of stream.
		Fields that cannot be parsed as a double are skipped, so the indexes of the following columns shift down
		by one for each skipped field.
	*/
	void dictionary_row::read_stream(std::stringstream &stream) {
		std::string col;
		while (std::getline(stream, col, '\t')) {
			try {
				m_columns.push_back(std::stod(col));
			} catch (const std::invalid_argument &) {
				// Not a number, the field is skipped.
			} catch (const std::out_of_range &) {
				// Does not fit in a double, the field is skipped.
			}
		}
	}

}


================================================
FILE: src/common/dictionary_row.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include <iostream>
#include <sstream>
#include <vector>

// NOTE(review): not referenced anywhere in this header or in dictionary_row.cpp,
// presumably the expected number of columns - confirm before relying on it.
#define CC_ROW_LEN 5

namespace common {

	/*
		One row of numeric columns parsed from tab separated text.
		Every column is stored as a double and converted by the typed getters.
	*/
	class dictionary_row {

		public:

			dictionary_row();
			dictionary_row(const dictionary_row &row);
			// Parses a complete tab separated string.
			explicit dictionary_row(const std::string &row);
			// Parses whatever is left to read in stream.
			explicit dictionary_row(std::stringstream &stream);
			~dictionary_row();

			// The getters index m_columns directly without bounds checking.
			int get_int(int column) const;
			float get_float(int column) const;
			double get_double(int column) const;

		private:
			// Parsed column values in the order they were read.
			std::vector<double> m_columns;

			// Appends the tab separated fields of stream to m_columns.
			void read_stream(std::stringstream &stream);

	};

}


================================================
FILE: src/common/simple_thread_pool.hpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include <iostream>
#include <thread>
#include <future>
#include <queue>

namespace common {

	/*
		Thread pool interface with a queue of std::function<void()> jobs.
		Only the declaration is present in this header, the member functions are defined elsewhere.
	*/
	class simple_thread_pool {

		public:

			// The argument is presumably the number of worker threads - TODO confirm in the implementation.
			explicit simple_thread_pool(size_t);
			~simple_thread_pool();

			// Hands a job to the pool. The job is taken by rvalue reference.
			void enqueue(std::function<void()> &&fun);

		private:

			// Presumably the loop executed by each worker thread - TODO confirm in the implementation.
			void handle_work();

			std::vector<std::thread> m_workers;
			std::queue<std::function<void()>> m_queue;

			// Synchronization primitives and stop flag; their exact use is defined by the implementation.
			std::mutex m_queue_lock;
			std::condition_variable m_condition;
			bool m_stop = false;

	};
	
}


================================================
FILE: src/common/system.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "system.h"
#include <thread>
#include <boost/uuid/uuid.hpp>
#include <boost/uuid/uuid_generators.hpp>
#include <boost/uuid/uuid_io.hpp>

namespace common {

	/*
		Returns false only when the environment variable ALEXANDRIA_LIVE holds an integer greater than zero.
		An unset, empty, non-numeric, zero or negative value means development mode and returns true.
	*/
	bool is_dev() {
		const char *live = getenv("ALEXANDRIA_LIVE");
		if (live == NULL) {
			return true;
		}
		// strtol yields 0 for text that is not a number instead of throwing like std::stoi does, so a
		// misconfigured environment falls back to development mode instead of terminating the process.
		return strtol(live, NULL, 10) <= 0;
	}

	/*
		Path of the domain info tsv file, the development variant is returned when is_dev() is true.
	*/
	std::string domain_index_filename() {
		return is_dev() ? "/dev_files/domain_info.tsv" : "/files/domain_info.tsv";
	}

	/*
		Path of the dictionary tsv file, the development variant is returned when is_dev() is true.
	*/
	std::string dictionary_filename() {
		return is_dev() ? "/dev_files/dictionary.tsv" : "/files/dictionary.tsv";
	}

	std::string uuid() {
		// Create a random UUID
		boost::uuids::uuid uuid = boost::uuids::random_generator()();
		// Convert UUID to string and return
		return boost::uuids::to_string(uuid);
	}

}


================================================
FILE: src/common/system.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include <iostream>

namespace common {

	bool is_dev();
	std::string domain_index_filename();
	std::string dictionary_filename();
	std::string uuid();

}


================================================
FILE: src/config.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "config.h"
#include "text/text.h"
#include "logger/logger.h"
#include "file/file.h"

using namespace std;

namespace config {

	// Creates the shard directories below the default data path as soon as the object is constructed.
	config::config() {
		create_data_directories(m_data_path);
	}

	// Instance backing the free function data_path(). It is const, the data path can still be
	// changed through it because the setter is const and m_data_path is mutable (see config.h).
	const config s_instance = config();

	// Returns the data path currently stored in s_instance.
	const std::string &data_path() {
		return s_instance.data_path();
	}

	/*
		Creates the directories 0..7 below data_path, each with the sub directories used by the indexer.
		Nothing is created when data_path itself does not exist.
	*/
	void create_data_directories(const std::string &data_path) {
		if (!file::directory_exists(data_path)) {
			return;
		}

		const char *const sub_directories[] = {"/input", "/output", "/upload", "/hash_table", "/full_text", "/tmp"};

		for (size_t shard_id = 0; shard_id < 8; shard_id++) {
			const std::string shard_dir = data_path + "/" + std::to_string(shard_id);
			file::create_directory(shard_dir);
			for (const char *sub_directory : sub_directories) {
				file::create_directory(shard_dir + sub_directory);
			}
		}
	}

	// Default values of the settings declared extern in config.h.
	// read_config overrides those that have a matching key in the config file.
	string node = "test0001";
	string master = "localhost";
	// read_config also sets upload whenever it reads a "master" key.
	string upload = "localhost";
	string data_node;
	//string url_store_host = "http://localhost";
	string url_store_host = "http://node0009.alexandria.org";
	string url_store_path = "/alexandria/urlstore";
	// No key in read_config changes this value.
	string url_store_cache_path = "/mnt/4/urlstore_cache";

	size_t nodes_in_cluster = 1;
	size_t node_id = 0;

	bool index_snippets = true;
	bool index_text = true;

	// Both vectors are cleared at the start of every read_config call.
	vector<string> batches;
	vector<string> link_batches;
	size_t worker_count = 8;
	size_t query_max_words = 10;
	size_t query_max_len = 200;
	size_t deduplicate_domain_count = 5;
	size_t pre_result_limit = 200000;
	size_t result_limit = 1000;
	string file_upload_user = "";
	string file_upload_password = "";
	size_t n_grams = 1;
	size_t shard_hash_table_size = 100000;
	size_t html_parser_long_text_len = 1000;
	// No key in read_config changes this value.
	size_t ft_shard_builder_buffer_len = 240000;

	// Full text indexer settings, all of them can be overridden by read_config.
	size_t ft_num_shards = 2048;
	size_t ft_max_sections = 8;
	size_t ft_max_results_per_section = 100000;
	size_t ft_section_depth = 8;
	size_t ft_max_cache_gb = 30;
	size_t ft_num_threads_indexing = 24;
	size_t ft_num_threads_merging = 24;
	size_t ft_num_threads_appending = 8;

	/*
		Cache budget of ft_max_cache_gb (counted as 10^9 bytes per GB) divided over ft_num_shards times
		ft_num_threads_indexing. The division is done on integers before the result is converted to double.
	*/
	double ft_cached_bytes_per_shard() {
		const size_t total_cache_bytes = ft_max_cache_gb * 1000ul*1000ul*1000ul;
		const size_t num_consumers = ft_num_shards * ft_num_threads_indexing;
		return total_cache_bytes / num_consumers;
	}

	void read_config(const string &config_file) {

		batches.clear();
		link_batches.clear();

		ifstream in(config_file);

		if (!in.is_open()) {
			LOG_ERROR("Could not read config file: " + config_file);
			return;
		}

		string line;
		while (getline(in, line)) {
			size_t comment_pos = line.find("#");
			if (comment_pos != string::npos) {
				line = line.substr(0, comment_pos);
			}
			if (text::trim(line) == "") {
				continue;
			}
			vector<string> parts;
			boost::split(parts, line, boost::is_any_of("="));

			for (string &part : parts) {
				part = text::trim(part);
			}

			if (parts[0] == "node") {
				node = parts[1];
			} else if (parts[0] == "master") {
				master = parts[1];
				upload = parts[1];
			} else if (parts[0] == "upload") {
				upload = parts[1];
			} else if (parts[0] == "data_node") {
				data_node = parts[1];
			} else if (parts[0] == "url_store_host") {
				url_store_host = parts[1];
			} else if (parts[0] == "url_store_path") {
				url_store_path = parts[1];
			} else if (parts[0] == "nodes_in_cluster") {
				nodes_in_cluster = stoi(parts[1]);
			} else if (parts[0] == "node_id") {
				node_id = stoi(parts[1]);
			} else if (parts[0] == "batches[]") {
				batches.push_back(parts[1]);
			} else if (parts[0] == "link_batches[]") {
				link_batches.push_back(parts[1]);
			} else if (parts[0] == "worker_count") {
				worker_count = stoi(parts[1]);
			} else if (parts[0] == "query_max_words") {
				query_max_words = stoi(parts[1]);
			} else if (parts[0] == "query_max_len") {
				query_max_len = stoi(parts[1]);
			} else if (parts[0] == "deduplicate_domain_count") {
				deduplicate_domain_count = stoi(parts[1]);
			} else if (parts[0] == "pre_result_limit") {
				pre_result_limit = stoi(parts[1]);
			} else if (parts[0] == "result_limit") {
				result_limit = stoi(parts[1]);
			} else if (parts[0] == "ft_num_shards") {
				ft_num_shards = stoi(parts[1]);
			} else if (parts[0] == "ft_max_sections") {
				ft_max_sections = stoi(parts[1]);
			} else if (parts[0] == "ft_max_results_per_section") {
				ft_max_results_per_section = stoi(parts[1]);
			} else if (parts[0] == "ft_section_depth") {
				ft_section_depth = stoi(parts[1]);
			} else if (parts[0] == "ft_max_cache_gb") {
				ft_max_cache_gb = stoi(parts[1]);
			} else if (parts[0] == "ft_num_threads_indexing") {
				ft_num_threads_indexing = stoi(parts[1]);
			} else if (parts[0] == "ft_num_threads_merging") {
				ft_num_threads_merging = stoi(parts[1]);
			} else if (parts[0] == "ft_num_threads_appending") {
				ft_num_threads_appending = stoi(parts[1]);
			} else if (parts[0] == "file_upload_user") {
				file_upload_user = parts[1];
			} else if (parts[0] == "file_upload_password") {
				file_upload_password = parts[1];
			} else if (parts[0] == "n_grams") {
				n_grams = stoull(parts[1]);
			} else if (parts[0] == "index_snippets") {
				index_snippets = static_cast<bool>(stoull(parts[1]));
			} else if (parts[0] == "index_text") {
				index_text = static_cast<bool>(stoull(parts[1]));
			} else if (parts[0] == "shard_hash_table_size") {
				shard_hash_table_size = stoull(parts[1]);
			} else if (parts[0] == "html_parser_long_text_len") {
				html_parser_long_text_len = stoull(parts[1]);
			} else if (parts[0] == "data_path") {
				s_instance.data_path(parts[1]);
			}
		}
	}

}


================================================
FILE: src/config.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include <iostream>
#include <fstream>
#include <vector>

namespace config {

	// Creates the shard directory tree below data_path, does nothing if data_path does not exist.
	void create_data_directories(const std::string &data_path);

	/*
		Holds the data path. The setter is const and the member mutable, so the path can be changed through
		a const instance.
	*/
	class config {
		public:

			config();

			const std::string &data_path() const { return m_data_path; }

			// Stores the new path and creates the shard directories below it.
			void data_path(const std::string &str) const { m_data_path = str; create_data_directories(m_data_path); }

		private:

			mutable std::string m_data_path = "/mnt";

	};

	// Returns the data path of the instance defined in config.cpp.
	const std::string &data_path();

	// Runtime settings, defined with their defaults in config.cpp.
	extern std::string node;
	extern std::string master;
	extern std::string upload;
	extern std::string data_node;
	extern std::string url_store_host;
	extern std::string url_store_path;
	extern std::string url_store_cache_path;

	const size_t url_store_shards = 24;

	extern size_t nodes_in_cluster;
	extern size_t node_id;

	extern bool index_snippets;
	extern bool index_text;

	extern std::vector<std::string> batches;
	extern std::vector<std::string> link_batches;

	extern size_t worker_count;
	extern size_t query_max_words;
	extern size_t query_max_len;
	extern size_t deduplicate_domain_count;
	extern size_t pre_result_limit;
	extern size_t result_limit;
	extern std::string file_upload_user;
	extern std::string file_upload_password;
	extern size_t n_grams;
	extern size_t shard_hash_table_size;
	extern size_t html_parser_long_text_len;
	extern size_t ft_shard_builder_buffer_len;

	/*
		The full text indexer settings below are runtime values that read_config can override.
		The inline const values further down are fixed at compilation time.
	*/

	// Full text indexer config
	extern size_t ft_num_shards;
	extern size_t ft_max_sections;
	extern size_t ft_max_results_per_section;
	extern size_t ft_section_depth;
	extern size_t ft_max_cache_gb;
	extern size_t ft_num_threads_indexing;
	extern size_t ft_num_threads_merging;
	extern size_t ft_num_threads_appending;
	double ft_cached_bytes_per_shard();

	// Link indexer config
	inline const unsigned long long li_max_cache_gb = 4;
	inline const unsigned long long li_num_threads_indexing = 48;
	inline const unsigned long long li_num_threads_merging = 16;
	// NOTE(review): initialized once from ft_num_shards, later changes made by read_config are not reflected here.
	inline const double li_cached_bytes_per_shard  = (li_max_cache_gb * 1000ul*1000ul*1000ul) / (ft_num_shards * li_num_threads_indexing);
	inline const unsigned long long li_indexer_max_cache_size = 500;

	// Hash table indexer config
	inline const unsigned long long ht_num_shards = 1031;
	inline const unsigned long long ht_num_buckets = 8;
	inline const unsigned long long ht_key_size = 8;

	// Server config

	// Other constants.
	inline const unsigned long long num_async_file_transfers = 48;
	inline const std::string test_data_path = "/var/www/html/node0003.alexandria.org/test-data/";

	// Commoncrawl parser.
	inline const std::string cc_target_output = "alexandria-cc-output";
	inline const bool cc_run_on_lambda = false;

	inline const std::string log_file_path = "/var/log/alexandria.log";

	// Reads "key = value" lines from config_file and overrides the settings above.
	void read_config(const std::string &config_file);

}


================================================
FILE: src/debug.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "debug.h"

/*
 * Debug helper: prints the value stored under key elem in m to stdout, followed by a newline.
 * Prints 0 when the key is not present. The map is never modified.
 * */
void print_elem(std::map<size_t, size_t> &m, size_t elem) {
	// operator[] would default-insert a missing key; a print helper must leave the map untouched.
	const auto iter = m.find(elem);
	std::cout << (iter != m.end() ? iter->second : size_t{0}) << std::endl;
}


================================================
FILE: src/debug.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include <iostream>
#include <map>

// Debug helper: prints the value stored under key elem in m to stdout, followed by a newline.
void print_elem(std::map<size_t, size_t> &m, size_t elem);


================================================
FILE: src/domain_stats/domain_stats.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "domain_stats.h"
#include <iostream>
#include "common/dictionary.h"
#include "file/tsv_file_remote.h"
#include "logger/logger.h"
#include "common/system.h"

namespace domain_stats {

	// Host keyed rows, filled by download_domain_stats() and read by harmonic_centrality().
	common::dictionary domain_data;

	/*
	 * Opens the tsv file named by common::domain_index_filename() through file::tsv_file_remote and loads it
	 * into domain_data.
	 * */
	void download_domain_stats() {
		LOG_INFO("download domain_info.tsv");
		file::tsv_file_remote domain_info_tsv(common::domain_index_filename());
		LOG_INFO("parsing.....");
		domain_data.load_tsv(domain_info_tsv);
	}

	/*
	 * Convenience overload, looks up the host part of the url.
	 * */
	float harmonic_centrality(const URL &url) {
		return harmonic_centrality(url.host());
	}

	/*
	 * Returns get_float(0) of the row stored for host in domain_data, or 0.0f when the host has no row.
	 * */
	float harmonic_centrality(const std::string &host) {

		const auto iter = domain_data.find(host);
		if (iter == domain_data.end()) {
			return 0.0f;
		}

		// Bind by reference, copying the whole row for every lookup is not needed to read one field.
		const common::dictionary_row &row = iter->second;
		return row.get_float(0);
	}

}


================================================
FILE: src/domain_stats/domain_stats.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include <iostream>
#include "URL.h"

namespace domain_stats {
	// Loads the domain statistics tsv into memory. Must run before harmonic_centrality() can return non zero values.
	void download_domain_stats();
	// Harmonic centrality for url.host(), forwards to the string overload.
	float harmonic_centrality(const URL &url);
	// Harmonic centrality stored for the given host name, 0.0f when the name has no entry in the loaded data.
	float harmonic_centrality(const std::string &domain);
}


================================================
FILE: src/downloader/merge_downloader.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <iostream>
#include <sstream>
#include "file/file.h"
#include "file/archive.h"
#include "hash_table2/builder.h"
#include "utils/thread_pool.hpp"
#include "indexer/index.h"
#include "indexer/index_builder.h"
#include "indexer/index_reader.h"
#include "indexer/value_record.h"

namespace downloader {

	/*
	 * Returns true only when all eight files internal_links_0 ... internal_links_7 exist directly under path.
	 * */
	bool internal_links_complete(const std::string &path) {

		const std::string prefix = path + "/internal_links_";

		// Walk forward until the first missing part (or until all eight were seen).
		size_t part = 0;
		while (part < 8 && file::file_exists(prefix + std::to_string(part))) {
			part++;
		}

		return part == 8;
	}

	/*
	 * Returns true only when <path>/<shard>.pos and <path>/<shard>.data exist for every shard id in [0, 1019).
	 * All .pos files are checked before any .data file.
	 * */
	bool hash_table_complete(const std::string &path) {

		const size_t shard_count = 1019;

		// True when <path>/<shard><extension> exists for every shard id.
		const auto every_shard_has = [&](const std::string &extension) {
			for (size_t shard = 0; shard < shard_count; shard++) {
				if (!file::file_exists(path + "/" + std::to_string(shard) + extension)) {
					return false;
				}
			}
			return true;
		};

		return every_shard_has(".pos") && every_shard_has(".data");
	}

	/*
	 * Merges the internal link archives of one downloaded batch into the local "internal_links" indexes.
	 *
	 * NOTE(review): the very first statement is an unconditional return, so the function is currently a no-op
	 * and everything below it is unreachable. The unreachable part is kept as is; it is presumably disabled on
	 * purpose - confirm before removing the return.
	 *
	 * path is the batch directory holding the archives internal_links_0 ... internal_links_7.
	 * batch_name is only referenced by the commented out copy step.
	 * */
	void merge_internal_links(const std::string &path, const std::string &batch_name) {
		return;
		/*
		const std::string target_path = "/slow_data/internal_links/" + batch_name;
		file::create_directory(target_path);
		for (size_t i = 0; i < 8; i++) {
			file::copy_file(path + "/internal_links_" + std::to_string(i), target_path + "/internal_links_" + std::to_string(i));
		}
		*/
		// One outer task per archive.
		utils::thread_pool pool(8);
		for (size_t i = 0; i < 8; i++) {
			pool.enqueue([i, path]() {
				file::archive tar(path + "/internal_links_" + std::to_string(i));
				// Inner pool (shadows the outer one) that merges the entries of this archive.
				utils::thread_pool pool(4, 10);
				tar.untar([&pool](const std::string &filename, const std::string &data) {

					pool.enqueue([filename, data]() {
						// The entry name without its last five characters is parsed as the host hash
						// (presumably a ".data" extension is stripped - TODO confirm).
						uint64_t host_hash = std::stoull(filename.substr(0, filename.size() - 5));

						std::istringstream ram_reader(data);

						// idx1 is the local index for the host, idx2 reads the downloaded index from memory.
						indexer::index_builder<indexer::value_record> idx1("internal_links", host_hash, 1000);
						indexer::index<indexer::value_record> idx2(&ram_reader, 1000);

						try {
							idx1.merge_with(idx2);
						} catch (const std::runtime_error &err) {
							// The file is corrupt. Let's truncate the local index and report.
							std::cout << "internal_links: " << host_hash << " is corrupt" << std::endl;
							idx1.truncate();
						} catch (const std::bad_alloc &err) {
							// Allocation failed during the merge. Handled exactly like a corrupt file.
							std::cout << "internal_links: " << host_hash << " is corrupt" << std::endl;
							idx1.truncate();
						}
					});
				});
				pool.run_all();
			});
		}
		pool.run_all();
		std::cout << "finished with the merge" << std::endl;
	}

	/*
	 * Merges the shard files <path>/<shard>.pos and <path>/<shard>.data of a downloaded hash table into the
	 * matching shards of the "all_urls" hash table under /slow_data. One task per shard, 32 pool threads.
	 * */
	void merge_hash_table(const std::string &path) {
		const size_t shard_count = 1019;

		utils::thread_pool pool(32);
		hash_table2::builder ht("all_urls", shard_count, 1000000, "/slow_data");

		for (size_t shard = 0; shard < shard_count; shard++) {
			pool.enqueue([&ht, shard, path]() {
				const std::string shard_base = path + "/" + std::to_string(shard);
				ht.get_shard(shard)->merge_with(shard_base + ".pos", shard_base + ".data");
			});
		}

		// ht is captured by reference, so wait for every task before it goes out of scope.
		pool.run_all();
	}

	/*
	 * Scans <data_path>/downloader/<node_id>/<timestamp> for downloaded batches. The first batch that has all
	 * internal link archives and all hash table shard files is merged, its directory is deleted and the process
	 * exits with status 0, so at most one batch is merged per run.
	 *
	 * Directory entries whose name does not parse as a number are skipped. A failure while merging a batch is
	 * reported on stderr and the scan continues with the next entry; the failed batch directory is left in place.
	 * */
	void merge_downloader() {

		indexer::index_builder<indexer::value_record>::create_directories("internal_links");

		file::read_directory(config::data_path() + "/downloader", [](const std::string &node_id) {
			const std::string dir = config::data_path() + "/downloader/" + node_id;
			file::read_directory(dir, [dir](const std::string &file) {

				size_t ts = 0;
				try {
					ts = std::stoull(file);
				} catch (const std::logic_error &) {
					// std::stoull throws std::invalid_argument or std::out_of_range (both are logic errors)
					// for names that are not timestamps. Those entries are simply not batches.
					return;
				}

				const std::string batch = dir + "/" + std::to_string(ts);

				try {
					if (internal_links_complete(batch) && hash_table_complete(batch + "/ht")) {
						std::cout << "merging directory: " << batch << std::endl;
						profiler::instance prof1("merge_internal_links");
						merge_internal_links(batch, std::to_string(ts));
						prof1.stop();
						profiler::instance prof2("merge_hash_table");
						merge_hash_table(batch + "/ht");
						prof2.stop();
						file::delete_directory(batch);
						exit(0);
					}
				} catch (const std::exception &err) {
					// Keep the best effort behaviour (continue scanning) but do not hide the failure.
					std::cerr << "merge_downloader: failed to merge " << batch << ": " << err.what() << std::endl;
				} catch (...) {
					std::cerr << "merge_downloader: failed to merge " << batch << ": unknown error" << std::endl;
				}
			});
		});
	}
}


================================================
FILE: src/downloader/merge_downloader.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include <iostream>

namespace downloader {
	// Scans the batch directories below <data_path>/downloader/<node_id>/ and merges a complete batch into the
	// local internal link indexes and the hash table. Defined in merge_downloader.cpp.
	void merge_downloader();
}


================================================
FILE: src/downloader/warc_downloader.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <iomanip>

#include "config.h"
#include "common/datetime.h"
#include "warc/warc.h"
#include "utils/thread_pool.hpp"
#include "utils/id_allocator.h"
#include "file/archive.h"
#include "logger/logger.h"
#include "text/text.h"
#include "transfer/transfer.h"
#include <iostream>
#include "hash_table2/builder.h"
#include "algorithm/algorithm.h"
#include "indexer/index_utils.h"
#include "indexer/index_builder.h"
#include "indexer/value_record.h"
#include "indexer/merger.h"

namespace downloader {

	/*
	 * Downloads one warc file from data.commoncrawl.org, feeds every downloaded chunk to a warc::parser and
	 * uploads the parser result and link result as gz files.
	 *
	 * The download is attempted up to three times, with a five second pause after a std::runtime_error.
	 * NOTE(review): every attempt feeds the same parser object and the upload runs even when all three attempts
	 * failed - confirm that partial or repeated parser input is acceptable.
	 * */
	void run_downloader(const std::string &warc_path) {

		warc::parser pp;
		for (int retry = 0; retry < 3; retry++) {
			try {
				warc::multipart_download("http://data.commoncrawl.org/" + warc_path, [&pp](const std::string &chunk) {
					std::stringstream ss(chunk);
					pp.parse_stream(ss);
				});
				break;
			} catch (const std::runtime_error &err) {
				std::cout << "GOT ERROR: " << err.what() << std::endl;
				std::cout << "Retrying... try " << retry << std::endl;
				std::this_thread::sleep_for(std::chrono::seconds(5));
			}
		}

		LOG_INFO("uploading: " + warc_path);

		// Keep one status per upload. A single shared variable would let the second upload overwrite a failure
		// of the first one.
		const int result_error = transfer::upload_gz_file(warc::get_result_path(warc_path), pp.result());
		const int link_error = transfer::upload_gz_file(warc::get_link_result_path(warc_path), pp.link_result());

		if (result_error || link_error) {
			LOG_INFO("error uploading: " + warc_path);
		}

	}

	std::vector<std::string> download_warc_paths() {
		int error;
		auto content = transfer::file_to_string("nodes/" + config::node + "/warc.paths", error);
		if (error == transfer::ERROR) return {};

		content = text::trim(content);

		std::vector<std::string> raw_warc_paths;
		boost::algorithm::split(raw_warc_paths, content, boost::is_any_of("\n"));

		std::vector<std::string> warc_paths;
		for (const auto &warc_path : raw_warc_paths) {
			if (text::trim(warc_path).size()) {
				warc_paths.push_back(text::trim(warc_path));
			}
		}

		return warc_paths;
	}

	/*
	 * Joins warc_paths with newlines and uploads the result to nodes/<config::node>/warc.paths.
	 * Returns true when the upload reports transfer::OK.
	 * */
	bool upload_warc_paths(const std::vector<std::string> &warc_paths) {
		auto joined = boost::algorithm::join(warc_paths, "\n");
		const int status = transfer::upload_file("nodes/" + config::node + "/warc.paths", joined);
		if (status == transfer::OK) {
			return true;
		}
		return false;
	}

	/*
	 * Splits warc_paths into at most num_threads chunks of (almost) equal size and runs run_downloader for every
	 * path, one pool task per chunk. Returns when all chunks are done.
	 * */
	void start_downloaders(const std::vector<std::string> &warc_paths) {

		const size_t num_threads = 12;

		// Real ceiling division. Applying std::ceil to the result of an integer division does nothing, so the
		// former "+ 1" made chunks too large whenever the size was a multiple of num_threads (12 paths gave
		// 6 chunks of 2 instead of 12 chunks of 1). An empty input keeps chunk size 1 so the size is never zero.
		const size_t chunk_size = warc_paths.empty() ? 1 : (warc_paths.size() + num_threads - 1) / num_threads;

		std::vector<std::vector<std::string>> chunks;
		algorithm::vector_chunk<std::string>(warc_paths, chunk_size, chunks);

		utils::thread_pool pool(num_threads);

		for (const auto &chunk : chunks) {
			// The chunk is copied into the task so it stays valid for the lifetime of the task.
			pool.enqueue([chunk] {
				size_t count = 0;
				for (const auto &warc_path : chunk) {
					run_downloader(warc_path);
					count++;
					std::cout << "done with " << warc_path << " done with " << count << "/" << chunk.size() << std::endl;
				}
			});
		}

		pool.run_all();
	}

	/*
	 * Currently a no-op: the whole body is commented out.
	 * The disabled code optimized the local internal link indexes, packed them into archives and uploaded the
	 * archives together with the "crawl_index" hash table shard files below downloader/<node>/<upload_id>/.
	 * */
	void upload_all() {

		/*auto upload_id = std::to_string(common::cur_datetime());

		// Upload internal links.
		for (size_t i = 0; i < 8; i++) {

			// Optimize all internal links.
			utils::thread_pool pool(32);
			file::read_directory(config::data_path() + "/" + std::to_string(i) + "/full_text/internal_links", [&pool](const std::string &filename) {
				uint64_t host_hash = std::stoull(filename.substr(0, filename.size() - 5));
				indexer::index_builder<indexer::value_record> idx("internal_links", host_hash, 1000);
				idx.optimize();
			});
			pool.run_all();

			const auto filename = "internal_links_" + std::to_string(i);
			file::archive tar(filename);
			tar.read_dir(config::data_path() + "/" + std::to_string(i) + "/full_text/internal_links");

			transfer::upload_file_from_disk("downloader/" + config::node + "/" + upload_id + "/" + filename, filename);

			file::delete_file(filename);
		}

		hash_table2::hash_table ht("crawl_index", 1019);
		ht.for_each_shard([upload_id](auto shard) {

			const auto pos_filename = shard->filename_pos();
			const auto data_filename = shard->filename_data();
			const auto target_filename = std::to_string(shard->shard_id());

			transfer::upload_file_from_disk("downloader/" + config::node + "/" + upload_id + "/ht/" + target_filename + ".pos", pos_filename);
			transfer::upload_file_from_disk("downloader/" + config::node + "/" + upload_id + "/ht/" + target_filename + ".data", data_filename);
		});
		*/

	}

	void warc_downloader_with_url(const std::string &batch, const std::string &warc_paths_url) {
	
		std::vector<std::string> warc_paths;

		int error;
		auto content = transfer::gz_file_to_string(warc_paths_url, error);

		std::stringstream ss(content);

		std::string line;
		size_t line_num = 0;
		while (std::getline(ss, line)) {
			if (line_num % config::nodes_in_cluster == config::node_id) {
				warc_paths.emplace_back(std::move(line));
			}

			line_num++;
		}

		start_downloaders(warc_paths);
	}

	/*
	 * Downloads this node's share of the warc files listed in the batch's warc.paths.gz on data.commoncrawl.org.
	 * */
	void warc_downloader(const std::string &batch) {
		const std::string paths_url = "https://data.commoncrawl.org/crawl-data/" + batch + "/warc.paths.gz";
		warc_downloader_with_url(batch, paths_url);
	}

	/*
	 * Same as warc_downloader but reads the path list from crawl-data/<batch>/missing.paths.gz.
	 * */
	void warc_downloader_missing(const std::string &batch) {
		const std::string paths_url = "crawl-data/" + batch + "/missing.paths.gz";
		warc_downloader_with_url(batch, paths_url);
	}
}


================================================
FILE: src/downloader/warc_downloader.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include <iostream>
#include <vector>

namespace downloader {

	// Returns the non empty, trimmed lines of nodes/<node>/warc.paths, or an empty vector on transfer error.
	std::vector<std::string> download_warc_paths();
	// Uploads warc_paths, one per line, to nodes/<node>/warc.paths. Returns true on success.
	bool upload_warc_paths(const std::vector<std::string> &warc_paths);

	// Downloads and parses this node's share of the warc files listed in the batch's warc.paths.gz.
	void warc_downloader(const std::string &batch);
	// Same as warc_downloader but takes the list from crawl-data/<batch>/missing.paths.gz.
	void warc_downloader_missing(const std::string &batch);
}


================================================
FILE: src/file/archive.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "archive.h"
#include "file.h"
#include "algorithm/algorithm.h"
#include "utils/thread_pool.hpp"
#include <cmath>
#include <sstream>
#include <stdexcept>
#include <string>
#include <boost/filesystem.hpp>
#include <boost/range/iterator_range.hpp>
#include <boost/iostreams/filtering_stream.hpp>
#include <boost/iostreams/filter/gzip.hpp>

namespace file {

	archive::archive(const std::string &filename)
	: m_filename(filename) {

	}

	// Nothing to release, the members clean up after themselves.
	archive::~archive() = default;

	void archive::read_dir(const std::string &dirname) {

		// Truncate target file.
		std::ofstream outfile(m_filename, std::ios::binary | std::ios::trunc);
		outfile.close();

		boost::filesystem::path path(dirname);

		std::vector<boost::filesystem::path> paths;

		if (is_directory(path)) {
			boost::filesystem::directory_iterator iter(path);
			for (auto &file : boost::make_iterator_range(iter, {})) {
				paths.push_back(file.path());
			}
		}

		std::vector<std::vector<boost::filesystem::path>> chunks;
		algorithm::vector_chunk(paths, std::ceil(paths.size() / m_num_threads) + 1, chunks);

		utils::thread_pool pool(m_num_threads);

		size_t worker_id = 0;
		for (const auto &chunk : chunks) {

			// Remove worker file.
			::file::delete_file(m_filename + "." + std::to_string(worker_id));

			pool.enqueue([this, chunk, worker_id]() {
				for (const auto &path : chunk) {
					add_file(path.generic_string(), path.filename().generic_string(), worker_id);
				}
			});
			worker_id++;
		}

		pool.run_all();

		// Merge workers.
		for (size_t worker_id = 0; worker_id < m_num_threads; worker_id++) {

			std::filebuf infile, outfile;
			
			outfile.open(m_filename, std::ios::out | std::ios::binary | std::ios::app);
			infile.open(m_filename + "." + std::to_string(worker_id), std::ios::in | std::ios::binary);

			std::copy(std::istreambuf_iterator<char>(&infile), {}, std::ostreambuf_iterator<char>(&outfile));

			// Remove worker file.
			::file::delete_file(m_filename + "." + std::to_string(worker_id));
		}
	}

	void archive::untar(const std::string &dest_dir) {
		std::ifstream infile(m_filename, std::ios::binary);

		tar_header header;

		while (!infile.eof()) {
			infile.read((char *)&header, sizeof(tar_header));

			if (infile.eof()) break;

			// This is an unnessecary copy.
			char *buffer = new char[header.m_len];
			infile.read(buffer, header.m_len);

			std::string buffer_string(buffer, header.m_len);
			std::stringstream buffer_stream(buffer_string);

			delete[] buffer;

			boost::iostreams::filtering_istream decompress_stream;
			decompress_stream.push(boost::iostreams::gzip_decompressor());
			decompress_stream.push(buffer_stream);

			std::string decompressed_data(std::istreambuf_iterator<char>(decompress_stream), {});

			std::ofstream outfile(dest_dir + "/" + header.m_filename, std::ios::binary);
			outfile.write(decompressed_data.c_str(), decompressed_data.size());
		}
		
	}

	/*
	 * Reads the archive entry by entry and calls cb(filename, decompressed_data) for each of them, in the order
	 * they are stored in the file.
	 *
	 * Reading stops at the end of the file or when no complete header can be read any more (this includes an
	 * archive file that cannot be opened). Throws std::runtime_error when an entry is shorter than its header
	 * announces. Exceptions thrown by the gzip decompressor or by cb are passed on to the caller.
	 * */
	void archive::untar(std::function<void(const std::string &, const std::string &)> cb) {
		std::ifstream infile(m_filename, std::ios::binary);

		tar_header header;

		// Loop on the result of the read itself. Testing eof() never ends when the file could not be opened,
		// because a failed read on a closed stream sets failbit but not eofbit.
		while (infile.read(reinterpret_cast<char *>(&header), sizeof(tar_header))) {

			// Read the compressed payload straight into a string. This removes the manually managed buffer
			// (which leaked when the string construction threw) and one copy of the data.
			std::string compressed_data(header.m_len, '\0');
			infile.read(&compressed_data[0], compressed_data.size());

			if (static_cast<size_t>(infile.gcount()) != compressed_data.size()) {
				throw std::runtime_error("truncated archive: " + m_filename);
			}

			// The name comes from disk, make sure it is terminated before it is used as a C string.
			header.m_filename[sizeof(header.m_filename) - 1] = '\0';

			std::stringstream buffer_stream(compressed_data);

			boost::iostreams::filtering_istream decompress_stream;
			decompress_stream.push(boost::iostreams::gzip_decompressor());
			decompress_stream.push(buffer_stream);

			std::string decompressed_data(std::istreambuf_iterator<char>(decompress_stream), {});

			cb(header.m_filename, decompressed_data);
		}

	}

	/*
	 * Appends one entry to the worker file <archive>.<worker_id>: a tar_header (compressed length and name)
	 * followed by the gzip compressed content of the file at path.
	 *
	 * filename is the name stored in the header. Throws std::runtime_error when it does not fit into the header
	 * together with its terminating zero; nothing is written in that case.
	 * */
	void archive::add_file(const std::string &path, const std::string &filename, size_t worker_id) {

		// Zero the header so the unused part of the name buffer is not written to disk as uninitialized memory.
		tar_header header{};

		// Copying a longer name would write past the end of the fixed size name buffer.
		if (filename.size() >= sizeof(header.m_filename)) {
			throw std::runtime_error("archive: filename too long: " + filename);
		}

		std::ofstream outfile(m_filename + "." + std::to_string(worker_id), std::ios::binary | std::ios::app);

		std::string data = ::file::cat(path);

		std::stringstream ss(data);
		boost::iostreams::filtering_istream compress_stream;
		compress_stream.push(boost::iostreams::gzip_compressor());
		compress_stream.push(ss);

		std::string compressed_data(std::istreambuf_iterator<char>(compress_stream), {});

		header.m_len = compressed_data.size();
		// The buffer is already zero filled, so the copied name is terminated.
		filename.copy(header.m_filename, filename.size(), 0);

		outfile.write(reinterpret_cast<const char *>(&header), sizeof(tar_header));
		outfile.write(compressed_data.c_str(), compressed_data.size());
	}


}


================================================
FILE: src/file/archive.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include <iostream>
#include <functional>

namespace file {

	/*
	 * Packs the files of a directory into a single archive file and unpacks them again.
	 *
	 * The archive is a sequence of entries. Every entry is a tar_header written as the raw bytes of the struct,
	 * followed by m_len bytes of gzip compressed file content. Despite the name this is not the tar file format.
	 * */
	class archive {

		public:
			// filename is the archive file that read_dir writes and untar reads.
			explicit archive(const std::string &filename);
			~archive();

			// Replaces the content of the archive with the entries found directly inside dirname.
			void read_dir(const std::string &dirname);
			// Writes every entry of the archive as a file into dest_dir.
			void untar(const std::string &dest_dir);
			// Calls cb(filename, decompressed content) for every entry of the archive.
			void untar(std::function<void(const std::string &, const std::string &)> cb);

		private:
			// Number of pool threads and upper bound for the number of worker files used by read_dir.
			const size_t m_num_threads = 32;
			std::string m_filename;

			// On disk entry header. It is written and read as raw memory, so changing the members changes
			// the file format.
			struct tar_header {
				// Length in bytes of the compressed content that follows the header.
				size_t m_len;
				// Zero terminated name of the entry.
				char m_filename[256];
			};

			// Appends the file at path as an entry named filename to the worker file of worker_id.
			void add_file(const std::string &path, const std::string &filename, size_t worker_id);

	};

}


================================================
FILE: src/file/file.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "config.h"
#include "file.h"
#include <boost/filesystem.hpp>
#include <boost/range/iterator_range.hpp>

namespace file {

	/*
	 * Returns the content of config::test_data_path + file_name, or an empty string when the file cannot be
	 * opened.
	 * */
	std::string read_test_file(const std::string &file_name) {

		std::ifstream file(config::test_data_path + file_name);
		if (!file.is_open()) {
			return "";
		}

		// The position at the end of the file is used as the size of the result.
		file.seekg(0, std::ios::end);
		std::string content;
		content.resize(file.tellg());

		file.seekg(0, std::ios::beg);
		file.read(&content[0], content.size());
		file.close();

		return content;
	}

	void rename(const std::string &old_path, const std::string &new_path) {
		boost::filesystem::rename(old_path, new_path);
	}

	/*
	 * Copies the bytes of source into dest. dest is created if needed and any previous content is replaced.
	 * */
	void copy_file(const std::string &source, const std::string &dest) {
		std::ifstream reader;
		reader.open(source, std::ios::in | std::ios::binary);

		std::ofstream writer;
		writer.open(dest, std::ios::out | std::ios::binary | std::ios::trunc);

		// Streams the whole read buffer into the target.
		writer << reader.rdbuf();
	}

	void delete_file(const std::string &file) {
		boost::filesystem::remove(file);
	}

	void create_directory(const std::string &path) {
		boost::filesystem::create_directories(path);
	}

	void delete_directory(const std::string &path) {
		boost::filesystem::remove_all(path);
	}

	// Returns everything that can be read from filename. The result is empty when the file cannot be opened.
	std::string cat(const std::string &filename) {
		std::ifstream infile(filename);
		std::string content;
		content.assign(std::istreambuf_iterator<char>(infile), std::istreambuf_iterator<char>());
		return content;
	}

	void read_directory(const std::string &dirname, std::function<void(const std::string &)> cb) {

		boost::filesystem::path path(dirname);

		if (is_directory(path)) {
			boost::filesystem::directory_iterator iter(path);
			for (auto &file : boost::make_iterator_range(iter, {})) {
				cb(file.path().filename().generic_string());
			}
		}
	}

	// Returns true when filename exists and is a directory.
	bool directory_exists(const std::string &filename) {
		if (!boost::filesystem::is_directory(filename)) {
			return false;
		}
		return boost::filesystem::exists(filename);
	}

	// Returns true when filename can be opened for reading.
	bool file_exists(const std::string &filename) {
		return std::ifstream(filename).good();
	}

}


================================================
FILE: src/file/file.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include <iostream>
#include <fstream>
#include <stdio.h>
#include <functional>

/*
 * Small helpers for working with files and directories.
 * NOTE(review): only read_directory, directory_exists and file_exists have their definitions next to this header
 * in the reviewed excerpt; the remaining declarations are left undocumented until their definitions are checked.
 * */
namespace file {

	std::string read_test_file(const std::string &file_name);

	void rename(const std::string &old_path, const std::string &new_path);

	void copy_file(const std::string &source, const std::string &dest);
	void delete_file(const std::string &filename);

	void create_directory(const std::string &path);
	void delete_directory(const std::string &path);

	/*
	 * Returns the whole content of the file.
	 * */
	std::string cat(const std::string &filename);

	/*
	 * Calls cb once for every entry directly inside the directory path, passing the entry's file name without the
	 * directory part. Does nothing if path is not a directory.
	 * */
	void read_directory(const std::string &path, std::function<void(const std::string &)> cb);

	/*
	 * Returns true if filename is an existing directory.
	 * */
	bool directory_exists(const std::string &filename);

	/*
	 * Returns true if filename could be opened for reading with std::ifstream.
	 * */
	bool file_exists(const std::string &filename);

}


================================================
FILE: src/file/gz_tsv_file.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "gz_tsv_file.h"
#include <exception>
#include <boost/iostreams/filtering_stream.hpp>
#include <boost/iostreams/filter/gzip.hpp>
#include <boost/algorithm/string.hpp>

namespace file {

	// Creates a reader with no file name and no data.
	gz_tsv_file::gz_tsv_file() = default;

	/*
	 * Reads the gzip compressed file file_name and keeps the whole decompressed content in memory.
	 * If the file cannot be opened the data stays empty, so read_column_into() will not produce any rows.
	 * */
	gz_tsv_file::gz_tsv_file(const std::string &file_name) {
		m_file_name = file_name;

		// Compressed data is binary. Open in binary mode so that no platform specific newline translation can
		// corrupt the gzip stream.
		std::ifstream infile(m_file_name, std::ios::in | std::ios::binary);

		if (infile.is_open()) {
			boost::iostreams::filtering_istream decompress_stream;
			decompress_stream.push(boost::iostreams::gzip_decompressor());
			decompress_stream.push(infile);

			m_data = std::string(std::istreambuf_iterator<char>(decompress_stream), {});
		}
	}

	// Nothing to release explicitly, the members clean up after themselves.
	gz_tsv_file::~gz_tsv_file() = default;

	/*
	 * Appends the tab separated field with index column (0 based) of every line of the decompressed data to
	 * container. Lines that have too few fields contribute an empty string.
	 * Returns the number of lines read.
	 * */
	size_t gz_tsv_file::read_column_into(size_t column, std::vector<std::string> &container) {
		std::istringstream input(m_data);

		size_t line_count = 0;
		for (std::string row; std::getline(input, row); ++line_count) {
			std::vector<std::string> fields;
			boost::algorithm::split(fields, row, boost::is_any_of("\t"));
			container.push_back(column < fields.size() ? fields[column] : std::string(""));
		}

		return line_count;
	}

}


================================================
FILE: src/file/gz_tsv_file.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include <iostream>
#include <sstream>
#include <fstream>
#include <set>
#include <vector>
#include <map>
#include <string.h>

namespace file {

	/*
		Reads a gzip compressed, tab separated file. The whole file is decompressed into memory by the constructor
		and the columns are extracted from that in-memory copy.
	*/
	class gz_tsv_file {

	public:

		// Creates a reader without any data.
		gz_tsv_file();

		// Decompresses file_name into memory. If the file cannot be opened the reader stays empty.
		explicit gz_tsv_file(const std::string &file_name);
		~gz_tsv_file();

		/*
			Appends the field with index column (0 based, fields are separated by tabs) of every line to container.
			A line with too few fields contributes an empty string. Returns the number of lines read.
		*/
		size_t read_column_into(size_t column, std::vector<std::string> &container);

	protected:

		// The file name given to the constructor.
		std::string m_file_name;

		// The decompressed content of the file.
		std::string m_data;

	};
}


================================================
FILE: src/file/tsv_file.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "tsv_file.h"
#include <exception>

namespace file {

	/*
	 * Creates a reader that has no file attached yet.
	 * m_file_size is set to 0 here because it has no default in the class. Without this, size() and the find
	 * methods would read an indeterminate value whenever set_file_name() is never called.
	 * */
	tsv_file::tsv_file()
	: m_file_size(0)
	{

	}

	// Opens file_name through set_file_name(), which throws std::runtime_error if the file cannot be opened.
	tsv_file::tsv_file(const std::string &file_name)
	{
		this->set_file_name(file_name);
	}

	// Closes the underlying file stream.
	tsv_file::~tsv_file()
	{
		this->m_file.close();
	}

	/*
	 * Returns one complete line whose first column equals key, or an empty string if the key is not present.
	 * The lookup is a binary search, so the file has to be sorted by its first column.
	 * */
	std::string tsv_file::find(const std::string &key) {
		// An earlier read may have run into the end of the file and left error flags on the stream. While they
		// are set every seek and read fails silently, so reset the stream first like the other lookups do.
		m_file.clear();
		m_file.seekg(0, m_file.beg);

		const size_t pos = binary_find_position(m_file_size, 0, key);
		if (pos == std::string::npos) {
			return "";
		}

		m_file.clear();
		m_file.seekg(pos, m_file.beg);

		std::string line;
		getline(m_file, line);

		return line;
	}

	/*
	 * Returns the byte offset of the FIRST line whose first column equals key, or std::string::npos if the key is
	 * not present. binary_find_position() yields some matching line; from there the search steps backwards in
	 * doubling jumps until a line with a smaller key is seen, and then scans forward line by line.
	 * NOTE(review): the returned offset is derived from tellg(). If the matching line is the last line and has no
	 * trailing newline the stream may already be at end of file at that point - confirm the result in that case.
	 * */
	size_t tsv_file::find_first_position(const std::string &key) {
		m_file.clear();
		m_file.seekg(0, m_file.beg);
		const size_t pos = binary_find_position(m_file_size, 0, key);
		if (pos == std::string::npos) return std::string::npos;
		// pos is the position of one matching line, but we need the first one.
		size_t jump = 1000;
		while (pos > jump) {
			m_file.seekg(pos - jump, m_file.beg);
			// We most likely landed in the middle of a line: the first getline discards the rest of that line, the
			// second one reads the next complete line.
			std::string line;
			getline(m_file, line);
			getline(m_file, line);
			auto jump_key = line.substr(0, line.find("\t"));
			if (jump_key < key) {
				// We jumped too far, this line is before the first occurrence.
				break;
			}
			jump = jump << 1;
		}
		// Never seek to a position before the beginning of the file.
		if (pos < jump) jump = pos;

		// The first occurrence is somewhere between pos - jump and pos.
		// Linear search.
		m_file.seekg(pos - jump, m_file.beg);
		std::string line;
		if (pos > jump) {
			// We did not start at the beginning of the file, so skip the line we landed in.
			getline(m_file, line);
		}
		while (getline(m_file, line)) {
			auto jump_key = line.substr(0, line.find("\t"));
			if (jump_key == key) {
				// tellg() is just past the line that was read, step back over the line and its newline.
				return (size_t)m_file.tellg() - (line.size() + 1u);
			}
		}
		return std::string::npos;
	}

	/*
	 * Returns the byte offset of the LAST line whose first column equals key, or std::string::npos if the key is
	 * not present. The file has to be sorted by its first column.
	 *
	 * binary_find_position() yields some matching line. From there the search gallops forward in doubling jumps
	 * and then scans line by line. The linear scan only starts from a jump distance that a probe has actually
	 * confirmed to still be inside the run of matching lines. Starting from an unconfirmed distance could skip
	 * the real last line or return an offset in the middle of a line.
	 * */
	size_t tsv_file::find_last_position(const std::string &key) {
		m_file.clear();
		m_file.seekg(0, m_file.beg);
		const size_t pos = binary_find_position(m_file_size, 0, key);
		if (pos == std::string::npos) return std::string::npos;

		// pos is the position of one matching line, but we need the last one.
		// verified_jump is the largest probed distance for which the first complete line after pos + jump does
		// not have a greater key. It stays 0 until a probe has confirmed a distance.
		size_t verified_jump = 0;
		for (size_t jump = 1000; pos + jump < m_file_size; jump = jump << 1) {
			m_file.clear();
			m_file.seekg(pos + jump, m_file.beg);
			std::string line;
			// We most likely landed in the middle of a line, discard the rest of it.
			getline(m_file, line);
			if (!getline(m_file, line)) {
				// There is no complete line after the probe position.
				break;
			}
			if (line.substr(0, line.find("\t")) > key) {
				// We jumped too far.
				break;
			}
			verified_jump = jump;
		}

		// The last occurrence starts at or after pos + verified_jump.
		// Linear search.
		m_file.clear();
		m_file.seekg(pos + verified_jump, m_file.beg);
		size_t line_start = pos + verified_jump;
		std::string line;
		if (verified_jump > 0) {
			// Skip ahead to the next line start. The probe confirmed that the following line still matches, so
			// nothing is lost by skipping.
			getline(m_file, line);
			line_start += line.size() + 1u;
		}

		size_t last_match = pos;
		while (getline(m_file, line)) {
			if (line.substr(0, line.find("\t")) > key) {
				break;
			}
			last_match = line_start;
			line_start += line.size() + 1u;
		}
		return last_match;
	}

	/*
	 * Returns the byte offset of the first line whose first column is greater than key. That is the line AFTER the
	 * last line with this key or, if the key does not exist, the line before which it would be inserted.
	 * Returns m_file_size if there is no such line. The file has to be sorted by its first column.
	 *
	 * The search gallops forward from the position reported by binary_find_position_any() in doubling jumps and
	 * then scans line by line. The linear scan only starts from a jump distance that a probe has actually
	 * confirmed. Starting from an unconfirmed distance could skip the line we are looking for.
	 * */
	size_t tsv_file::find_next_position(const std::string &key) {
		m_file.clear();
		m_file.seekg(0, m_file.beg);
		size_t pos = binary_find_position_any(m_file_size, 0, key);

		// Never do arithmetic on std::string::npos. Searching from the beginning of the file is slower but always
		// gives the correct answer.
		if (pos == std::string::npos) pos = 0;
		if (pos >= m_file_size) return m_file_size;

		// verified_jump is the largest probed distance for which the first complete line after pos + jump does
		// not have a greater key. It stays 0 until a probe has confirmed a distance.
		size_t verified_jump = 0;
		for (size_t jump = 1000; pos + jump < m_file_size; jump = jump << 1) {
			m_file.clear();
			m_file.seekg(pos + jump, m_file.beg);
			std::string line;
			// We most likely landed in the middle of a line, discard the rest of it.
			getline(m_file, line);
			if (!getline(m_file, line)) {
				// There is no complete line after the probe position.
				break;
			}
			if (line.substr(0, line.find("\t")) > key) {
				// We jumped too far.
				break;
			}
			verified_jump = jump;
		}

		// The line we are looking for starts after pos + verified_jump.
		// Linear search.
		m_file.clear();
		m_file.seekg(pos + verified_jump, m_file.beg);
		size_t line_start = pos + verified_jump;
		std::string line;
		if (verified_jump > 0) {
			// Skip ahead to the next line start. The probe confirmed that the following line is not greater than
			// key, so the skipped line cannot be the answer.
			getline(m_file, line);
			line_start += line.size() + 1u;
		}

		while (getline(m_file, line)) {
			if (line.substr(0, line.find("\t")) > key) {
				return line_start;
			}
			line_start += line.size() + 1u;
		}
		return m_file_size;
	}

	/*
	 * Looks up every key in keys and returns a map from each key that was found to its complete line.
	 * Keys that are not present in the file are left out of the result.
	 *
	 * The keys are visited in sorted order (std::set), so every search can start at the position of the previous
	 * hit. That position is kept in its own variable: a failed lookup returns std::string::npos, which must never
	 * be used as the offset of the next search.
	 * */
	std::map<std::string, std::string> tsv_file::find_all(const std::set<std::string> &keys) {
		m_file.clear();
		m_file.seekg(0, m_file.beg);

		// Position of the last key that was found, all later keys are at or after it.
		size_t search_offset = 0;
		std::map<std::string, std::string> result;
		std::string line;
		for (const auto &key : keys) {
			// A failed search can read up to the end of the file and leave error flags on the stream.
			m_file.clear();
			const size_t pos = binary_find_position(m_file_size, search_offset, key);
			if (pos == std::string::npos) {
				// Key not found, ignore.
				continue;
			}

			m_file.clear();
			m_file.seekg(pos, m_file.beg);
			getline(m_file, line);
			result[key] = line;
			search_offset = pos;
		}

		return result;
	}

	/*
	 * Reads the file from the beginning and inserts the first whitespace separated token of every line into
	 * container. The column argument is currently ignored.
	 * Returns the number of lines read. Throws std::runtime_error if the file is not open.
	 * */
	size_t tsv_file::read_column_into(int column, std::set<std::string> &container) {
		(void)column;
		m_file.clear();
		m_file.seekg(0, m_file.beg);

		if (!m_file.is_open()) {
			throw std::runtime_error("File is not open any more: " + m_file_name);
		}

		size_t rows_read = 0;
		for (std::string row; std::getline(m_file, row); ++rows_read) {
			std::istringstream tokenizer(row);
			std::string first_token;
			tokenizer >> first_token;
			container.insert(first_token);
		}

		return rows_read;
	}

	/*
	 * Reads the file from the beginning and inserts the first whitespace separated token of every line into
	 * container, stopping once limit lines have been read. The column argument is currently ignored.
	 * The limit is checked after a line has been read, so one line is read even if limit is 0.
	 * Returns the number of lines read. Throws std::runtime_error if the file is not open.
	 * */
	size_t tsv_file::read_column_into(int column, std::set<std::string> &container, size_t limit) {
		// column is the parameter that is not used here, limit is.
		(void)column;
		m_file.clear();
		m_file.seekg(0, m_file.beg);

		if (!m_file.is_open()) {
			throw std::runtime_error("File is not open any more: " + m_file_name);
		}

		std::string line;
		size_t rows_read = 0;
		while (getline(m_file, line)) {
			std::stringstream ss(line);
			std::string col;
			ss >> col;
			container.insert(col);
			rows_read++;
			if (rows_read >= limit) break;
		}

		return rows_read;
	}

	/*
	 * Reads the file from the beginning, skips the first offset lines and then inserts the first whitespace
	 * separated token of each following line into container until limit lines have been inserted.
	 * The column argument is currently ignored.
	 * Returns the number of lines read including the skipped ones. Throws std::runtime_error if the file is not
	 * open.
	 * */
	size_t tsv_file::read_column_into(int column, std::set<std::string> &container, size_t limit, size_t offset) {
		m_file.clear();
		m_file.seekg(0, m_file.beg);

		if (!m_file.is_open()) {
			throw std::runtime_error("File is not open any more: " + m_file_name);
		}

		size_t rows_read = 0;
		std::string row;
		while (std::getline(m_file, row)) {
			std::istringstream tokenizer(row);
			std::string first_token;
			tokenizer >> first_token;

			const bool past_offset = rows_read >= offset;
			if (past_offset) {
				container.insert(first_token);
			}
			++rows_read;
			if (past_offset && (rows_read - offset) >= limit) break;
		}

		return rows_read;
	}

	size_t tsv_file::size() const {
		return m_file_size;
	}

	bool tsv_file::eof() const {
		return m_file.eof();
	}

	bool tsv_file::is_open() const {
		return m_file.is_open();
	}

	// Reads and returns the next line from the current position of the stream.
	std::string tsv_file::get_line() {
		std::string next_line;
		std::getline(m_file, next_line);
		return next_line;
	}

	/*
	 * Reads the file from the beginning and appends the first whitespace separated token of every line to
	 * container. The column argument is currently ignored.
	 * Returns the number of lines read.
	 * */
	size_t tsv_file::read_column_into(int column, std::vector<std::string> &container) {
		m_file.clear();
		m_file.seekg(0, m_file.beg);

		size_t rows_read = 0;
		for (std::string row; std::getline(m_file, row); ++rows_read) {
			std::istringstream tokenizer(row);
			std::string first_token;
			tokenizer >> first_token;
			container.push_back(first_token);
		}

		return rows_read;
	}

	/*
	 * Reads the file from the beginning and appends the first whitespace separated token of every line to
	 * container, stopping once limit lines have been read. The column argument is currently ignored.
	 * The limit is checked after a line has been read, so one line is read even if limit is 0.
	 * Returns the number of lines read.
	 * */
	size_t tsv_file::read_column_into(int column, std::vector<std::string> &container, size_t limit) {
		m_file.clear();
		m_file.seekg(0, m_file.beg);

		size_t rows_read = 0;
		std::string row;
		while (std::getline(m_file, row)) {
			std::istringstream tokenizer(row);
			std::string first_token;
			tokenizer >> first_token;
			container.push_back(first_token);
			if (++rows_read >= limit) break;
		}

		return rows_read;
	}

	/*
	 * Reads the file from the beginning, skips the first offset lines and then appends the first whitespace
	 * separated token of each following line to container until limit lines have been appended.
	 * The column argument is currently ignored.
	 * Returns the number of lines read including the skipped ones.
	 * */
	size_t tsv_file::read_column_into(int column, std::vector<std::string> &container, size_t limit, size_t offset) {
		m_file.clear();
		m_file.seekg(0, m_file.beg);

		size_t rows_read = 0;
		std::string row;
		while (std::getline(m_file, row)) {
			std::istringstream tokenizer(row);
			std::string first_token;
			tokenizer >> first_token;

			const bool past_offset = rows_read >= offset;
			if (past_offset) {
				container.push_back(first_token);
			}
			++rows_read;
			if (past_offset && (rows_read - offset) >= limit) break;
		}

		return rows_read;
	}

	/*
	 * Searches the byte range [offset, file_size) for a line whose first column equals key and returns the byte
	 * offset of that line, or std::string::npos if no such line is found. The comparisons against the pivot key
	 * only make sense if the file is sorted by its first column.
	 * Ranges smaller than 750 bytes are scanned line by line, larger ranges are halved recursively.
	 * NOTE(review): the stream error flags are not reset in here, callers are expected to hand over a usable
	 * stream. The returned offsets are derived from tellg() - confirm the result when the matching line is the
	 * last line of a file without a trailing newline.
	 * */
	size_t tsv_file::binary_find_position(size_t file_size, size_t offset, const std::string &key) {

		std::string line;

		if (file_size - offset < 750) {
			// Make linear search.
			m_file.seekg(offset, m_file.beg);
			size_t bytes_read = 0;
			// bytes_read is compared before the current line is added to it, so a line that starts exactly at
			// the end of the range is still examined.
			while (getline(m_file, line) && bytes_read <= file_size - offset) {
				bytes_read += (line.size() + 1u);
				if (line.starts_with(key + "\t")) {
					// tellg() is just past the line that was read, step back over the line and its newline.
					return (size_t)m_file.tellg() - (line.size() + 1u);
				}
			}

			return std::string::npos;
		}

		size_t pivot_len_1 = (file_size - offset) / 2;
		size_t pivot = offset + pivot_len_1;

		// Get key at pivot.
		m_file.seekg(pivot, m_file.beg);

		// The pivot most likely points into the middle of a line: the first getline discards the rest of that
		// line, the second one reads the complete line that is used as pivot.
		getline(m_file, line);
		getline(m_file, line);
		auto pivot_key = line.substr(0, line.find("\t"));

		if (key < pivot_key) {
			// Continue in the lower half.
			return binary_find_position(offset + pivot_len_1, offset, key);
		} else if (key > pivot_key) {
			// Continue in the upper half.
			return binary_find_position(file_size, pivot, key);
		}

		// The pivot line has the key we are looking for.
		return (size_t)m_file.tellg() - (line.size() + 1u);
	}

	/*
	 * Like binary_find_position() but never fails. If a line with first column equal to key exists the byte
	 * offset of one such line is returned. Otherwise the offset of the first line with a greater key is returned,
	 * which is where the key would be inserted. Returns m_file_size if every line has a smaller key.
	 * The file has to be sorted by its first column.
	 *
	 * The recursion has to stay in this function. Handing over to binary_find_position() would turn the search
	 * into an exact match search that reports std::string::npos for missing keys.
	 * */
	size_t tsv_file::binary_find_position_any(size_t file_size, size_t offset, const std::string &key) {

		std::string line;

		if (file_size - offset < 750) {
			// Make linear search.
			m_file.clear();
			size_t line_start = offset;
			if (offset > 0) {
				// offset may point into the middle of a line. Start one byte earlier and read up to the next line
				// break. That ends exactly at offset if offset is a line start, and at the start of the following
				// line otherwise.
				m_file.seekg(offset - 1, m_file.beg);
				getline(m_file, line);
				line_start = (offset - 1) + line.size() + 1u;
			} else {
				m_file.seekg(0, m_file.beg);
			}

			// The scan is deliberately not limited to file_size. When the key is not inside the range the answer
			// is the first line after it, and the scan stops there because that line has a greater key.
			while (getline(m_file, line)) {
				const auto this_key = line.substr(0, line.find("\t"));
				if (this_key >= key) {
					return line_start;
				}
				line_start += line.size() + 1u;
			}

			return m_file_size;
		}

		const size_t pivot_len_1 = (file_size - offset) / 2;
		const size_t pivot = offset + pivot_len_1;

		// Get key at pivot.
		m_file.clear();
		m_file.seekg(pivot, m_file.beg);

		// The pivot most likely points into the middle of a line, discard the rest of that line.
		getline(m_file, line);
		const size_t pivot_line_start = pivot + line.size() + 1u;
		if (!getline(m_file, line)) {
			// There is no complete line after the pivot, so the answer can only be in the lower half.
			return binary_find_position_any(pivot, offset, key);
		}
		const auto pivot_key = line.substr(0, line.find("\t"));

		if (key < pivot_key) {
			return binary_find_position_any(pivot, offset, key);
		} else if (key > pivot_key) {
			return binary_find_position_any(file_size, pivot, key);
		}

		// The pivot line has the key we are looking for.
		return pivot_line_start;
	}

	/*
	 * Remembers file_name, opens the file and stores its size in bytes.
	 * Throws std::runtime_error if the file cannot be opened.
	 * */
	void tsv_file::set_file_name(const std::string &file_name) {

		m_original_file_name = file_name;
		m_file_name = file_name;

		m_file.open(m_file_name);

		if (!m_file.is_open()) {
			// Fetch the reason first, before anything else can touch errno.
			const char *reason = strerror(errno);
			std::string message("Could not open file: ");
			message += m_file_name;
			message += " error: ";
			message += reason;
			throw std::runtime_error(message);
		}

		// The size of the file is the read position at its end. Rewind afterwards so reading starts at the top.
		m_file.seekg(0, m_file.end);
		m_file_size = m_file.tellg();
		m_file.seekg(0, m_file.beg);
	}

}


================================================
FILE: src/file/tsv_file.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include <iostream>
#include <sstream>
#include <fstream>
#include <set>
#include <vector>
#include <map>
#include <string.h>

namespace file {

	/*
		Reader for tab separated files on disk. The find methods use binary search on the first column of each
		line, so they require the file to be sorted by that column (plain std::string comparison).
		NOTE(review): the destructor is not virtual although tsv_file_remote derives from this class. Do not delete
		a derived object through a tsv_file pointer.
	*/
	class tsv_file {

	public:

		// Creates a reader without a file attached.
		tsv_file();

		// Opens file_name. Throws std::runtime_error if the file cannot be opened.
		explicit tsv_file(const std::string &file_name);
		~tsv_file();

		// Returns the line with the first column equals key. Returns an empty string if not present in file.
		std::string find(const std::string &key);

		/*
			Returns the position of the FIRST line in the file with first column equals key.
			Returns std::string::npos if not present in file.
		*/
		size_t find_first_position(const std::string &key);

		/*
			Returns the position of the LAST line in the file with first column equals key.
			Returns std::string::npos if not present in file.
		*/
		size_t find_last_position(const std::string &key);

		/*
			Returns the position of the line AFTER the line in the file with first column equals key.
			If the key does not exist it returns the position to the line where this key would be inserted. If the
			key should be inserted to the end it returns m_file_size
		*/
		size_t find_next_position(const std::string &key);

		/*
			Looks up all keys and returns a map from every key that was found to its complete line. Keys that are
			not present in the file are left out.
		*/
		std::map<std::string, std::string> find_all(const std::set<std::string> &keys);

		/*
			Read the file from the beginning and add one value per line to container. All overloads return the
			number of lines read; with an offset that count includes the skipped lines.
			NOTE: the current implementations ignore the column argument and always take the first whitespace
			separated token of each line.
		*/
		size_t read_column_into(int column, std::set<std::string> &container);
		size_t read_column_into(int column, std::set<std::string> &container, size_t limit);
		size_t read_column_into(int column, std::set<std::string> &container, size_t limit, size_t offset);
		size_t read_column_into(int column, std::vector<std::string> &container);
		size_t read_column_into(int column, std::vector<std::string> &container, size_t limit);
		size_t read_column_into(int column, std::vector<std::string> &container, size_t limit, size_t offset);

		// Size of the file in bytes, measured when the file was opened.
		size_t size() const;

		// True if the stream has its end of file flag set.
		bool eof() const;

		// True if a file is currently open.
		bool is_open() const;

		// Reads the next line from the current position of the stream.
		std::string get_line();

	protected:

		// Both names are set to the name passed to set_file_name().
		std::string m_file_name;
		std::string m_original_file_name;
		std::ifstream m_file;

		// Only assigned by set_file_name(), there is no default value.
		size_t m_file_size;

		// Not used by tsv_file itself, tsv_file_remote sets it from the file name suffix.
		bool m_is_gzipped = false;

		/*
			Difference is that _any returns the position where this key WOULD be if it was inserted even if it is not
			present.
		*/
		size_t binary_find_position(size_t file_size, size_t offset, const std::string &key);
		size_t binary_find_position_any(size_t file_size, size_t offset, const std::string &key);

		// Stores the name, opens the file and measures its size. Throws std::runtime_error if it cannot be opened.
		void set_file_name(const std::string &file_name);

	};
}


================================================
FILE: src/file/tsv_file_remote.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "tsv_file_remote.h"
#include "logger/logger.h"
#include "transfer/transfer.h"

#include <boost/filesystem.hpp>
#include <boost/iostreams/filtering_stream.hpp>
#include <boost/iostreams/copy.hpp>
#include <boost/iostreams/filter/gzip.hpp>

//using namespace boost::iostreams;

namespace file {

	/*
	 * Downloads the file with key file_name to its local path and opens the local copy.
	 * If the download fails no file is opened. Callers can detect that with is_open().
	 * */
	tsv_file_remote::tsv_file_remote(const std::string &file_name) {

		m_file_name = file_name;

		if (download_file() == transfer::OK) {
			set_file_name(get_path());
			// set_file_name() replaced m_file_name with the local path. get_path() is derived from m_file_name, so
			// put the key back, otherwise get_path() would return a path with the prefix applied twice.
			m_file_name = file_name;
		} else {
			// Nothing was opened. Make sure size() reports a defined value.
			m_file_size = 0;
		}
	}

	// Nothing to release here, the base class closes the file.
	tsv_file_remote::~tsv_file_remote() = default;

	// Returns the local path of the file: the configured data path, followed by "/0/" and the file name.
	std::string tsv_file_remote::get_path() const {
		std::string local_path(config::data_path());
		local_path += "/0/";
		local_path += m_file_name;
		return local_path;
	}

	/*
	 * Downloads the file with key m_file_name into the local path returned by get_path(). Files whose name ends
	 * with ".gz" are fetched with transfer::gz_file_to_stream, all others with transfer::file_to_stream.
	 * Returns the error code reported by the transfer, or transfer::ERROR if the local file could not be opened
	 * for writing.
	 * */
	int tsv_file_remote::download_file() {

		// Only a real ".gz" suffix counts. Comparing the tail of the name is safe for names shorter than the
		// suffix and for names that contain ".gz" somewhere before the end as well.
		const std::string gz_suffix = ".gz";
		m_is_gzipped = m_file_name.size() >= gz_suffix.size() &&
			m_file_name.compare(m_file_name.size() - gz_suffix.size(), gz_suffix.size(), gz_suffix) == 0;

		LOG_INFO("Downloading file with key: " + m_file_name);

		create_directory();
		std::ofstream outfile(get_path(), std::ios::trunc);

		int error = transfer::ERROR;
		if (outfile.good()) {
			if (m_is_gzipped) {
				transfer::gz_file_to_stream(m_file_name, outfile, error);
			} else {
				transfer::file_to_stream(m_file_name, outfile, error);
			}

			if (error == transfer::ERROR) {
				LOG_INFO("Download failed...");
			}
		}

		LOG_INFO("Done downloading file with key: " + m_file_name);

		return error;
	}

	void tsv_file_remote::create_directory() {
		boost::filesystem::path path(get_path());
		boost::filesystem::create_directories(path.parent_path());
	}

}


================================================
FILE: src/file/tsv_file_remote.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include "tsv_file.h"

namespace file {

	/*
		A tsv_file whose content is downloaded first. The constructor fetches the file with the given key into a
		local path and opens the local copy with the tsv_file machinery.
	*/
	class tsv_file_remote : public tsv_file {

	public:

		// Downloads file_name and opens the local copy. If the download fails no file is opened, check is_open().
		explicit tsv_file_remote(const std::string &file_name);
		~tsv_file_remote();

		// Returns the local path: config::data_path() followed by "/0/" and m_file_name.
		std::string get_path() const;

	private:

		// Downloads the file to get_path() and returns the error code of the transfer.
		int download_file();

		// Creates the parent directory of get_path().
		void create_directory();

	};

}


================================================
FILE: src/file/tsv_row.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "tsv_row.h"

namespace file {

	/*
	 * Splits line at every tab character and stores the pieces as columns. Empty columns are kept, so a line
	 * without any tab gives exactly one column and an empty line gives one empty column.
	 * */
	tsv_row::tsv_row(const std::string &line) {
		size_t pos_start = 0;
		while (true) {
			// std::string::find takes the character first and the start position second.
			const size_t pos_end = line.find('\t', pos_start);
			if (pos_end == std::string::npos) {
				// Last column, everything up to the end of the line.
				m_cols.emplace_back(line.substr(pos_start));
				break;
			}
			// The second argument of substr is a length, not an end position.
			m_cols.emplace_back(line.substr(pos_start, pos_end - pos_start));
			pos_start = pos_end + 1;
		}
	}

	// Nothing to release explicitly, the column vector cleans up after itself.
	tsv_row::~tsv_row() = default;

}


================================================
FILE: src/file/tsv_row.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include <iostream>
#include <vector>

namespace file {

	/*
		One line of a tab separated file, split into its columns.
		NOTE(review): the columns are private and this header declares no accessor for them.
	*/
	class tsv_row {

	public:
		// Builds the row from one line of text.
		explicit tsv_row(const std::string &line);
		~tsv_row();

	private:
		// The columns of the line.
		std::vector<std::string> m_cols;

	};

}


================================================
FILE: src/full_text/domain_link_record.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

namespace full_text {

	/*
		Plain data record for a link between two domains.
		NOTE(review): the meaning of the fields is only suggested by their names, confirm with the code that
		writes these records. uint64_t is used without including <cstdint> in this header.
	*/
	struct domain_link_record {

		uint64_t m_value;
		float m_score;
		uint64_t m_source_domain; // presumably identifies the linking domain - TODO confirm
		uint64_t m_target_domain; // presumably identifies the linked domain - TODO confirm

	};
}


================================================
FILE: src/full_text/link_record.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

namespace full_text {

	/*
		Plain data record for a link.
		NOTE(review): the meaning of the fields is only suggested by their names, confirm with the code that
		writes these records. uint64_t is used without including <cstdint> in this header.
	*/
	struct link_record {

		uint64_t m_value;
		float m_score;
		uint64_t m_source_domain; // presumably identifies the linking domain - TODO confirm
		uint64_t m_target_hash; // presumably a hash of the link target - TODO confirm

	};
}


================================================
FILE: src/full_text/record.h
================================================

#pragma once

namespace full_text {

	/*
		Plain data record of the full text index.
		NOTE(review): the meaning of the fields is only suggested by their names, confirm with the code that
		writes these records. uint64_t is used without including <cstdint> in this header.
	*/
	struct record {

		uint64_t m_value;
		float m_score;
		uint64_t m_domain_hash; // presumably a hash of the domain the record belongs to - TODO confirm

	};
}


================================================
FILE: src/full_text/result_set.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include "config.h"
#include <fcntl.h>
#include <unistd.h>
#include <iostream>
#include <span>
#include <cassert>

namespace full_text {

	/*
		Fixed capacity buffer of data_record elements.

		The buffer is filled either from a vector (copy_vector) or section by section from a file
		(prepare_sections, read_to_section, close_sections). A section holds at most
		config::ft_max_results_per_section records and section N starts at index
		N * config::ft_max_results_per_section of the buffer.
	*/
	template<typename data_record>
	class result_set {

	public:

		// Allocates room for 'size' records. size() and max_size() both start out as 'size'.
		result_set(size_t size);
		~result_set();

		// Number of records currently exposed through the span.
		size_t size() const { return m_size; }
		// Capacity of the underlying buffer, in records.
		size_t max_size() const { return m_max_size; }

		// Raw access to the buffer and to the first record of a given section.
		const data_record *data_pointer() const { return m_data_pointer; }
		const data_record *section_pointer(size_t section) const { return &m_data_pointer[section * config::ft_max_results_per_section]; }
		data_record *data_pointer() { return m_data_pointer; }
		data_record *section_pointer(size_t section) { return &m_data_pointer[section * config::ft_max_results_per_section]; }
		// Pointer to the span covering the first size() records. The span is rebuilt by resize().
		std::span<data_record> *span_pointer() { return &m_span; }

		size_t total_num_results() const { return m_total_num_results; }
		void set_total_num_results(size_t total_num_results);

		// Changes the number of exposed records. Does not reallocate, so n must not exceed max_size().
		void resize(size_t n) {
			m_span = std::span<data_record>(m_data_pointer, n);
			m_size = n;
		}

		void prepare_sections(const std::string &filename, size_t offset, size_t len);
		void read_to_section(size_t section);
		bool has_next_section();
		size_t num_sections();
		void close_sections();
		void copy_vector(const std::vector<data_record> &vec);

	private:

		// The buffer is owned through a raw pointer, so copying is disabled.
		result_set(const result_set &res) = delete;
		result_set &operator=(const result_set &res) = delete;

		std::span<data_record> m_span;
		data_record *m_data_pointer = nullptr;

		// Every member has a defined start value so that has_next_section() and num_sections() are well
		// defined even when prepare_sections() has never been called.
		size_t m_size = 0; // The length in first section.
		const size_t m_max_size; // The maximum number of elements the result set can hold.
		size_t m_total_size = 0; // The lengths of all elements in all sections.
		size_t m_total_num_results = 0; // The total indexed length, only used to display total number of results.
		size_t m_section_len = 0;
		size_t m_records_read = 0;
		int m_file_descriptor = -1; // -1 while no section file is open.
		bool m_error = false;

	};

	/*
		Allocates a buffer of 'size' records and exposes all of it through the span.
		No section file is attached until prepare_sections() is called.
	*/
	template<typename data_record>
	result_set<data_record>::result_set(size_t size)
	: m_size(size), m_max_size(size), m_total_num_results(0)
	{
		data_record *storage = new data_record[size];

		m_data_pointer = storage;
		m_span = std::span<data_record>(storage, size);
		m_file_descriptor = -1;
	}

	/*
		Releases the record buffer and closes the section file if close_sections() was never called,
		so that a forgotten close does not leak a file descriptor.
	*/
	template<typename data_record>
	result_set<data_record>::~result_set() {
		if (m_file_descriptor >= 0) {
			close(m_file_descriptor);
			m_file_descriptor = -1;
		}
		delete []m_data_pointer;
	}

	// Stores the value reported by total_num_results(). It does not affect size() or the buffer.
	template<typename data_record>
	void result_set<data_record>::set_total_num_results(size_t total_num_results) {
		this->m_total_num_results = total_num_results;
	}

	/*
		Opens 'filename' and positions it at byte 'offset' so that read_to_section() can read 'len' bytes
		of records from it. size() is set to the number of records in the first section.

		The number of records is capped at max_size() since the buffer cannot hold more than that.
		If the file cannot be opened or positioned the result set is left empty with no file attached,
		so has_next_section() returns false and no uninitialized records are exposed.
	*/
	template<typename data_record>
	void result_set<data_record>::prepare_sections(const std::string &filename, size_t offset, size_t len) {

		assert(m_file_descriptor < 0);

		m_records_read = 0;
		m_total_size = len / sizeof(data_record);
		if (m_total_size > m_max_size) m_total_size = m_max_size;

		m_file_descriptor = open(filename.c_str(), O_RDONLY);
		if (m_file_descriptor < 0) {
			m_error = true;
			m_total_size = 0;
			resize(0);
			return;
		}

		// Purely advisory, a failure here does not affect correctness.
		posix_fadvise(m_file_descriptor, offset, m_total_size * sizeof(data_record), POSIX_FADV_SEQUENTIAL);

		if (lseek(m_file_descriptor, offset, SEEK_SET) < 0) {
			close(m_file_descriptor);
			m_file_descriptor = -1;
			m_error = true;
			m_total_size = 0;
			resize(0);
			return;
		}

		size_t first_section_len = m_total_size;
		if (first_section_len > config::ft_max_results_per_section) {
			first_section_len = config::ft_max_results_per_section;
		}
		resize(first_section_len);
	}

	/*
		Reads data up to and including the section. So if the argument section equals zero the first section is read.

		Records that were already read by an earlier call are not read again. read() may return fewer bytes
		than requested, so the call is repeated until the whole range has been read, the file ends or an
		error occurs. m_error is set when the range could not be read completely.
	*/
	template<typename data_record>
	void result_set<data_record>::read_to_section(size_t section) {
		const size_t read_start = m_records_read;
		size_t read_end = (section + 1) * config::ft_max_results_per_section;
		if (read_end > m_total_size) read_end = m_total_size;

		if (read_start > read_end) return;

		const size_t records_to_read = read_end - read_start;
		const size_t bytes_wanted = records_to_read * sizeof(data_record);

		char *destination = reinterpret_cast<char *>(&m_data_pointer[read_start]);
		size_t bytes_done = 0;

		m_error = false;
		while (bytes_done < bytes_wanted) {
			// ssize_t, not int: a single read may legitimately return more than INT_MAX bytes.
			const ssize_t bytes_read = ::read(m_file_descriptor, destination + bytes_done, bytes_wanted - bytes_done);
			if (bytes_read <= 0) {
				// Negative is an error, zero is end of file before the expected number of bytes.
				m_error = true;
				break;
			}
			bytes_done += static_cast<size_t>(bytes_read);
		}

		// The position is advanced even after a failed read so that callers looping on
		// has_next_section() always terminate.
		m_records_read += records_to_read;
	}

	// True while a section file is open and not all of its records have been read.
	template<typename data_record>
	bool result_set<data_record>::has_next_section() {
		const bool file_is_open = m_file_descriptor >= 0;
		return file_is_open && m_records_read < m_total_size;
	}

	// Number of sections needed to hold m_total_size records, i.e. m_total_size divided by
	// config::ft_max_results_per_section rounded up.
	template<typename data_record>
	size_t result_set<data_record>::num_sections() {
		const size_t per_section = config::ft_max_results_per_section;
		const size_t rounded_up = m_total_size + per_section - 1;
		return rounded_up / per_section;
	}

	// Closes the section file if one is open. Calling it again is a no-op.
	template<typename data_record>
	void result_set<data_record>::close_sections() {
		if (m_file_descriptor < 0) return;

		close(m_file_descriptor);
		m_file_descriptor = -1;
	}

	/*
		Copies the records of 'vec' to the start of the buffer and sets size() to the number of copied records.
		At most max_size() records are copied since the buffer cannot hold more, records beyond that are dropped.
	*/
	template<typename data_record>
	void result_set<data_record>::copy_vector(const std::vector<data_record> &vec) {
		size_t num_records = vec.size();
		if (num_records > m_max_size) num_records = m_max_size;

		// data() of an empty vector may be null, which must not be handed to memcpy.
		if (num_records > 0) {
			memcpy(&m_data_pointer[0], vec.data(), num_records * sizeof(data_record));
		}
		resize(num_records);
	}
}


================================================
FILE: src/full_text/search_metric.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

namespace full_text {

	// Collection of counters describing one search. All counters start at zero and are public, the code
	// that updates them is outside this file.
	class search_metric {

		public:
		size_t m_total_found = 0; // presumably the total number of matching results - TODO confirm
		size_t m_total_url_links_found = 0; // presumably the number of url links found - TODO confirm
		size_t m_total_domain_links_found = 0; // presumably the number of domain links found - TODO confirm
		size_t m_links_handled = 0; // presumably the number of links that were processed - TODO confirm
		size_t m_link_domain_matches = 0; // presumably links that matched a result by domain - TODO confirm
		size_t m_link_url_matches = 0; // presumably links that matched a result by url - TODO confirm

	};

}


================================================
FILE: src/hash_table2/builder.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "builder.h"
#include "utils/thread_pool.hpp"

namespace hash_table2 {

	// Creates one hash_table_shard_builder per shard, with shard ids 0 .. num_shards - 1.
	// The shard builders are owned by this object and deleted in the destructor.
	builder::builder(const std::string &db_name, size_t num_shards, size_t hash_table_size,
			const std::string &data_path)
	: m_db_name(db_name) {
		size_t shard_id = 0;
		while (shard_id < num_shards) {
			hash_table_shard_builder *shard = new hash_table_shard_builder(db_name, shard_id, hash_table_size, data_path);
			m_shards.push_back(shard);
			++shard_id;
		}
	}

	// Deletes every shard builder created by the constructor.
	builder::~builder() {
		for (size_t i = 0; i < m_shards.size(); i++) {
			delete m_shards[i];
		}
	}

	void builder::add(uint64_t key, const std::string &value, size_t version) {
		m_shards[key % m_shards.size()]->add(key, value, version);
	}

	void builder::remove(uint64_t key) {
		m_shards[key % m_shards.size()]->remove(key);
	}

	void builder::merge() {
		utils::thread_pool pool(32);
		for (hash_table_shard_builder *shard : m_shards) {
			pool.enqueue([shard]() -> void {
				shard->append();
				shard->merge();
			});
		}

		pool.run_all();
	}

	void builder::optimize() {
		utils::thread_pool pool(32);
		for (hash_table_shard_builder *shard : m_shards) {
			pool.enqueue([shard]() -> void {
				shard->optimize();
			});
		}

		pool.run_all();
	}

	void builder::truncate() {
		for (hash_table_shard_builder *shard : m_shards) {
			shard->truncate();
		}
	}

	void builder::merge_with(const builder &other) {
		for (size_t i = 0; i < m_shards.size(); i++) {
			m_shards[i]->merge_with(*(other.m_shards[i]));
		}
	}
}


================================================
FILE: src/hash_table2/builder.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include <iostream>
#include "hash_table_shard_builder.h"
#include "config.h"

namespace hash_table2 {

	/*
	 * Writes to a sharded hash table. Every key is routed to the shard builder at index
	 * key % number of shards. The shard builders are owned by this object.
	 */
	class builder {

	public:

		explicit builder(const std::string &db_name, size_t num_shards = config::ht_num_shards,
			size_t hash_table_size = 1000000,
			const std::string &data_path = config::data_path() + "/{shard_id_mod_8}/hash_table");
		~builder();

		// The shard builders are held as raw owning pointers and deleted in the destructor, so a copy
		// would delete them a second time. Copying is therefore disabled.
		builder(const builder &) = delete;
		builder &operator=(const builder &) = delete;

		// Adds or removes a key in the shard the key belongs to.
		void add(uint64_t key, const std::string &value, size_t version = 0);
		void remove(uint64_t key);

		// Run the operation with the same name on every shard.
		void merge();
		void optimize();
		void truncate();

		// Merges every shard of 'other' into the shard with the same index in this builder.
		void merge_with(const builder &other);

		// Returns the shard builder at index shard_id. The index is not range checked and the returned
		// pointer stays owned by this builder.
		hash_table_shard_builder *get_shard(size_t shard_id) { return m_shards[shard_id]; }

	private:

		std::vector<hash_table_shard_builder *> m_shards;
		const std::string m_db_name;

	};
}


================================================
FILE: src/hash_table2/hash_table.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "config.h"
#include "hash_table.h"
#include "hash_table_shard_builder.h"
#include "logger/logger.h"

namespace hash_table2 {

	// Creates one hash_table_shard per shard, with shard ids 0 .. num_shards - 1.
	// The shards are owned by this object and deleted in the destructor.
	hash_table::hash_table(const std::string &db_name, size_t num_shards, size_t hash_table_size, const std::string &data_path)
	: m_db_name(db_name)
	{
		size_t shard_id = 0;
		while (shard_id < num_shards) {
			m_shards.push_back(new hash_table_shard(m_db_name, shard_id, hash_table_size, data_path));
			shard_id++;
		}
	}

	// Deletes every shard created by the constructor.
	hash_table::~hash_table() {
		for (size_t i = 0; i < m_shards.size(); i++) {
			delete m_shards[i];
		}
	}

	void hash_table::add(uint64_t key, const std::string &value) {

		const size_t shard_id = key % m_shards.size();
		hash_table_shard_builder builder(m_db_name, shard_id);

		builder.add(key, value);
	}

	void hash_table::truncate() {
		for (size_t shard_id = 0; shard_id < m_shards.size(); shard_id++) {
			hash_table_shard_builder builder(m_db_name, shard_id);
			builder.truncate();
		}
	}

	bool hash_table::has(uint64_t key) {
		return m_shards[key % m_shards.size()]->has(key);
	}

	// Convenience overload of find(key, ver) for callers that do not need the version.
	std::string hash_table::find(uint64_t key) {
		size_t ignored_version = 0;
		return find(key, ignored_version);
	}

	std::string hash_table::find(uint64_t key, size_t &ver) {
		return m_shards[key % m_shards.size()]->find(key, ver);
	}

	size_t hash_table::size() const {
		size_t num_items = 0;
		for (const auto &shard : m_shards) {
			num_items += shard->size();
		} 
		return num_items;
	}

	void hash_table::for_each(std::function<void(uint64_t, const std::string &)> callback) const {
		for (const auto &shard : m_shards) {
			shard->for_each(callback);
		}
	}

	void hash_table::for_each_key(std::function<void(uint64_t)> callback) const {
		for (const auto &shard : m_shards) {
			shard->for_each_key(callback);
		}
	}

	void hash_table::for_each_shard(std::function<void(const hash_table_shard *shard)> callback) const {
		for (const auto &shard : m_shards) {
			callback(shard);
		}
	}

}


================================================
FILE: src/hash_table2/hash_table.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include <iostream>
#include <thread>
#include <vector>
#include <map>

#include "config.h"
#include "hash_table_shard.h"

namespace hash_table2 {

	class hash_table_shard;

	/*
	 * Read access to a sharded hash table. Every key is routed to the shard at index
	 * key % number of shards. The shards are owned by this object.
	 */
	class hash_table {

	public:

		explicit hash_table(const std::string &db_name, size_t num_shards = config::ht_num_shards,
				size_t hash_table_size = 1000000,
				const std::string &data_path = config::data_path() + "/{shard_id_mod_8}/hash_table");
		~hash_table();

		// The shards are held as raw owning pointers and deleted in the destructor, so a copy would
		// delete them a second time. Copying is therefore disabled.
		hash_table(const hash_table &) = delete;
		hash_table &operator=(const hash_table &) = delete;

		// add and truncate go through temporary hash_table_shard_builder objects.
		void add(uint64_t key, const std::string &value);
		void truncate();

		// Lookups are answered by the shard the key belongs to. The second find overload passes 'ver'
		// on to the shard.
		bool has(uint64_t key);
		std::string find(uint64_t key);
		std::string find(uint64_t key, size_t &ver);

		// Sum of the sizes of all shards.
		size_t size() const;

		// Iterate over all shards in shard order.
		void for_each(std::function<void(uint64_t, const std::string &)> callback) const;
		void for_each_key(std::function<void(uint64_t)> callback) const;
		void for_each_shard(std::function<void(const hash_table_shard *shard)> callback) const;

	private:

		std::vector<hash_table_shard *> m_shards;
		const std::string m_db_name;

	};

}


================================================
FILE: src/hash_table2/hash_table_shard.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <iostream>
#include <sstream>
#include <numeric>
#include "config.h"
#include "hash_table_shard.h"
#include "logger/logger.h"

#include <boost/iostreams/filtering_stream.hpp>
#include <boost/iostreams/filter/gzip.hpp>

namespace hash_table2 {

	// Only forwards its arguments to hash_table_shard_base, no file is opened here.
	hash_table_shard::hash_table_shard(const std::string &db_name, size_t shard_id, size_t hash_table_size,
			const std::string &data_path)
	: hash_table_shard_base(db_name, shard_id, hash_table_size, data_path) {}

	// Nothing to release, every method opens and closes the files it needs.
	hash_table_shard::~hash_table_shard() {}

	/*
	 * Checks if the key exists in the shard.
	 *
	 * The .pos file starts with m_hash_table_size slots holding the byte position of a page (SIZE_MAX
	 * for an empty slot). A page is a record count followed by {key, position, version} records.
	 * Returns false when the file is missing or too short to hold the data it points at, instead of
	 * sizing the page from an uninitialized length.
	 */
	bool hash_table_shard::has(uint64_t key) const {

		std::ifstream reader(filename_pos(), std::ios::binary);
		if (!reader.is_open()) return false;

		const size_t hash_pos = key % this->m_hash_table_size;
		reader.seekg(hash_pos * sizeof(size_t));

		// Read page pos.
		size_t page_pos = SIZE_MAX;
		if (!reader.read((char *)&page_pos, sizeof(size_t))) return false;

		if (page_pos == SIZE_MAX) return false;

		// Read page.
		size_t page_len = 0;
		reader.seekg(this->hash_table_byte_size() + page_pos, std::ios::beg);
		if (!reader.read((char *)&page_len, sizeof(size_t))) return false;

		std::vector<std::array<uint64_t, 3>> page(page_len);
		if (!reader.read((char *)page.data(), page_len * sizeof(std::array<uint64_t, 3>))) return false;

		// Find key among pages.
		for (const auto &page_item : page) {
			if (page_item[0] == key) {
				return true;
			}
		}

		return false;
	}

	std::string hash_table_shard::find(uint64_t key) const {
		size_t ver;
		return find(key, ver);
	}

	/*
	 * Finds the value for the given key. Returns an empty string if the key is not present.
	 *
	 * 'ver' is set to the stored version when the key is found and left untouched otherwise. If the
	 * page holds several records for the key the last one is used.
	 * Returns an empty string when the .pos file is missing or too short to hold the data it points
	 * at, instead of sizing the page from an uninitialized length.
	 */
	std::string hash_table_shard::find(uint64_t key, size_t &ver) const {

		std::ifstream reader(filename_pos(), std::ios::binary);
		if (!reader.is_open()) return "";

		const size_t hash_pos = key % this->m_hash_table_size;
		reader.seekg(hash_pos * sizeof(size_t));

		// Read page pos.
		size_t page_pos = SIZE_MAX;
		if (!reader.read((char *)&page_pos, sizeof(size_t))) return "";

		if (page_pos == SIZE_MAX) return "";

		// Read page.
		size_t page_len = 0;
		reader.seekg(this->hash_table_byte_size() + page_pos, std::ios::beg);
		if (!reader.read((char *)&page_len, sizeof(size_t))) return "";

		std::vector<std::array<uint64_t, 3>> page(page_len);
		if (!reader.read((char *)page.data(), page_len * sizeof(std::array<uint64_t, 3>))) return "";

		// Find key among pages.
		size_t pos = SIZE_MAX;
		for (const auto &page_item : page) {
			if (page_item[0] == key) {
				pos = page_item[1];
				ver = page_item[2];
			}
		}

		if (pos == SIZE_MAX) return "";

		return data_at_position(pos);
	}

	/*
	 * Reads the .data file from the start and calls the callback with the key and the decompressed
	 * value of every record. A record is a key, a data length and that many bytes of gzip data.
	 * Records with key 0 are skipped.
	 *
	 * Iteration stops at the end of the file, at a record that cannot be read completely or when the
	 * buffer for a record cannot be allocated.
	 */
	void hash_table_shard::for_each(std::function<void(uint64_t, std::string)> callback) const {
		std::ifstream infile(filename_data(), std::ios::binary);
		infile.seekg(0, std::ios::beg);

		while (!infile.eof()) {
			size_t key;
			if (!infile.read((char *)&key, sizeof(size_t))) break;

			size_t data_len;
			if (!infile.read((char *)&data_len, sizeof(size_t))) break;

			if (key == 0ull) {
				// Skip.
				infile.seekg(data_len, std::ios::cur);
				continue;
			}

			std::unique_ptr<char[]> buffer_allocator;
			try {
				buffer_allocator = std::make_unique<char[]>(data_len);
			} catch (std::bad_alloc &exception) {
				std::cout << "bad_alloc detected: " << exception.what() << " file: " << __FILE__ << " line: " << __LINE__ << std::endl;
				std::cout << "tried to allocate: " << data_len << " bytes" << std::endl;
				break;
			}
			char *buffer = buffer_allocator.get();

			// A truncated record would otherwise be decompressed from a partly filled buffer.
			if (!infile.read(buffer, data_len)) break;
			std::stringstream ss(std::string(buffer, data_len));

			boost::iostreams::filtering_istream decompress_stream;
			decompress_stream.push(boost::iostreams::gzip_decompressor());
			decompress_stream.push(ss);

			std::stringstream decompressed;
			decompressed << decompress_stream.rdbuf();

			// Passing the temporary lets the string be moved into the callback argument.
			callback(key, decompressed.str());
		}
	}

	/*
	 * Reads the .data file from the start and calls the callback with the key of every record. The
	 * value bytes are skipped without being read.
	 * NOTE(review): unlike for_each, records with key 0 are passed to the callback - confirm that
	 * this is intended.
	 */
	void hash_table_shard::for_each_key(std::function<void(uint64_t)> callback) const {
		std::ifstream infile(filename_data(), std::ios::binary);
		infile.seekg(0, std::ios::beg);

		for (;;) {
			if (infile.eof()) break;

			size_t key;
			size_t data_len;

			const bool got_key = static_cast<bool>(infile.read((char *)&key, sizeof(size_t)));
			if (!got_key) break;

			const bool got_len = static_cast<bool>(infile.read((char *)&data_len, sizeof(size_t)));
			if (!got_len) break;

			// Jump over the value.
			infile.seekg(data_len, std::ios::cur);

			callback(key);
		}
	}

	size_t hash_table_shard::shard_id() const {
		return m_shard_id;
	}

	/*
	 * Returns the number of records in the shard, counted by reading all pages of the .pos file.
	 * The sum is accumulated as size_t so that it is not narrowed to int on the way.
	 */
	size_t hash_table_shard::size() const {
		const auto pages = this->read_pages();

		size_t num_items = 0;
		for (const auto &page : pages) {
			num_items += page.size();
		}
		return num_items;
	}

	/*
	 * Returns the size of the .data file in bytes, or 0 when the file cannot be opened or its size
	 * cannot be determined. tellg() reports failure as -1, which must not be converted to size_t.
	 */
	size_t hash_table_shard::file_size() const {
		std::ifstream infile(filename_data(), std::ios::binary);
		if (!infile.is_open()) return 0;

		infile.seekg(0, std::ios::end);

		const std::streamoff end_offset = infile.tellg();
		if (end_offset < 0) return 0;

		return static_cast<size_t>(end_offset);
	}

	/*
	 * Reads the record that starts at byte 'pos' of the .data file and returns its decompressed value.
	 * A record is a key, a data length and that many bytes of gzip data.
	 *
	 * Returns an empty string when the file cannot be opened, when the record cannot be read
	 * completely or when the buffer for the record cannot be allocated. Without the read checks the
	 * buffer would be sized from an uninitialized length.
	 */
	std::string hash_table_shard::data_at_position(size_t pos) const {

		std::ifstream infile(filename_data(), std::ios::binary);
		if (!infile.is_open()) return "";

		infile.seekg(pos, std::ios::beg);

		// Read key
		uint64_t read_key = 0;
		if (!infile.read((char *)&read_key, sizeof(uint64_t))) return "";

		// Read data length.
		size_t data_len = 0;
		if (!infile.read((char *)&data_len, sizeof(size_t))) return "";

		std::unique_ptr<char[]> buffer_allocator;
		try {
			buffer_allocator = std::make_unique<char[]>(data_len);
		} catch (std::bad_alloc &exception) {
			std::cout << "bad_alloc detected: " << exception.what() << " file: " << __FILE__ << " line: " << __LINE__ << std::endl;
			std::cout << "tried to allocate: " << data_len << " bytes" << std::endl;
			return "";
		}
		char *buffer = buffer_allocator.get();

		if (!infile.read(buffer, data_len)) return "";
		std::stringstream ss(std::string(buffer, data_len));

		boost::iostreams::filtering_istream decompress_stream;
		decompress_stream.push(boost::iostreams::gzip_decompressor());
		decompress_stream.push(ss);

		std::stringstream decompressed;
		decompressed << decompress_stream.rdbuf();

		return decompressed.str();
	}

}


================================================
FILE: src/hash_table2/hash_table_shard.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include <iostream>
#include <map>
#include <vector>
#include <functional>

#include "config.h"
#include "hash_table_shard_base.h"

namespace hash_table2 {

	/*
	 * Read-only view of one shard of a hash table. File names and the page reader come from
	 * hash_table_shard_base. Every method opens the file it needs on its own.
	 * */
	class hash_table_shard : public hash_table_shard_base {

		public:

			/*
			 * Forwards all arguments to hash_table_shard_base.
			 * */
			hash_table_shard(const std::string &db_name, size_t shard_id, size_t hash_table_size = 1000000,
					const std::string &data_path = config::data_path() + "/{shard_id_mod_8}/hash_table");
			~hash_table_shard();

			/*
			 * Checks if the key exists in the shard by looking it up in the .pos file.
			 * */
			bool has(uint64_t key) const;

			/*
			 * Finds a value for the given key. Returns empty string if key is not present.
			 * */
			std::string find(uint64_t key) const;

			/*
			 * Finds a value for the given key. Returns empty string if key is not present. Also sets the
			 * version in 'ver' when the key is found, 'ver' is not written otherwise.
			 * */
			std::string find(uint64_t key, size_t &ver) const;

			/*
			 * Loop over all records in the .data file of the shard and call the given function.
			 * for_each passes the key and the decompressed value, for_each_key only the key.
			 * */
			void for_each(std::function<void(uint64_t, std::string)>) const;
			void for_each_key(std::function<void(uint64_t)>) const;

			/*
			 * Returns the id of the shard.
			 * */
			size_t shard_id() const;

			/*
			 * Returns the number of elements in the shard, counted from the pages in the .pos file.
			 * */
			size_t size() const;

			/*
			 * Returns the size of the data file in bytes.
			 * */
			size_t file_size() const;

		private:

			/*
			 * Reads the record starting at byte 'pos' of the .data file and returns its decompressed value.
			 * */
			std::string data_at_position(size_t pos) const;

	};

}


================================================
FILE: src/hash_table2/hash_table_shard_base.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include <iostream>
#include <memory>
#include <array>
#include <vector>

namespace hash_table2 {

	/*
	 * State and file helpers shared by the classes that read and write one shard of a hash table.
	 *
	 * File names are built as <directory>/ht_<db_name>_<shard_id> plus an extension, where the first
	 * "{shard_id_mod_8}" in the directory is replaced by shard_id % 8.
	 * NOTE(review): only the .data file uses the data path given to the constructor, the .pos and
	 * .data.tmp files always use the default directory below config::data_path() - confirm that this
	 * is intended.
	 * */
	class hash_table_shard_base {

		public:

			hash_table_shard_base(const std::string &db_name, size_t shard_id, size_t hash_table_size = 1000000,
					const std::string &data_path = config::data_path() + "/{shard_id_mod_8}/hash_table")
			: m_db_name(db_name), m_shard_id(shard_id), m_hash_table_size(hash_table_size), m_data_path(data_path) {}

			/*
			 * Base name (without extension) below the data path given to the constructor.
			 * */
			std::string file_base_data() const {
				return shard_file_base(m_data_path);
			}

			/*
			 * Base name (without extension) below the default directory.
			 * */
			std::string file_base() const {
				return shard_file_base(config::data_path() + "/{shard_id_mod_8}/hash_table");
			}

			std::string filename_data() const {
				return file_base_data() + ".data";
			}

			std::string filename_pos() const {
				return file_base() + ".pos";
			}

			std::string filename_data_tmp() const {
				return file_base() + ".data.tmp";
			}

		protected:

			const std::string m_db_name;
			size_t m_shard_id;
			size_t m_hash_table_size;
			const std::string m_data_path;

			/*
			 * Size in bytes of the slot table at the start of the .pos file.
			 * */
			size_t hash_table_byte_size() const { return m_hash_table_size * sizeof(size_t); }

			/*
			 * Reads all pages from the .pos file of this shard, see read_pages(std::ifstream &).
			 * */
			std::vector<std::vector<std::array<uint64_t, 3>>> read_pages() const {
				std::ifstream infile(filename_pos(), std::ios::binary);
				return read_pages(infile);
			}

			/*
			 * Reads all pages that follow the slot table in 'infile'. A page is a record count followed by
			 * that many {key, position, version} records. The result has m_hash_table_size entries and
			 * every record is stored at index key % m_hash_table_size.
			 *
			 * Reading stops at the end of the file, at a page that cannot be read completely or at a page
			 * with more than 10000 records. Records of an incomplete page are dropped. The result is empty
			 * slots only when the stream is not open.
			 * */
			std::vector<std::vector<std::array<uint64_t, 3>>> read_pages(std::ifstream &infile) const {

				using page_record = std::array<uint64_t, 3>;

				const size_t max_records = 10000;

				std::vector<std::vector<page_record>> ret(this->m_hash_table_size);

				// Without slots there is nowhere to put a record, and key % 0 is undefined.
				if (!infile.is_open() || this->m_hash_table_size == 0) return ret;

				// Records are read straight into typed storage, the same way has() and find() read a page.
				std::vector<page_record> buffer(max_records);

				infile.seekg(this->hash_table_byte_size());

				while (true) {
					size_t num_keys = 0;
					if (!infile.read((char *)&num_keys, sizeof(size_t))) break;

					if (num_keys > max_records) break;

					if (!infile.read((char *)buffer.data(), num_keys * sizeof(page_record))) break;

					for (size_t i = 0; i < num_keys; i++) {
						const page_record &rec = buffer[i];
						ret[rec[0] % this->m_hash_table_size].push_back(rec);
					}
				}

				return ret;
			}

		private:

			/*
			 * Replaces the first "{shard_id_mod_8}" in data_path with shard_id % 8 and appends
			 * "/ht_<db_name>_<shard_id>".
			 * */
			std::string shard_file_base(std::string data_path) const {
				const std::string placeholder = "{shard_id_mod_8}";
				const size_t placeholder_pos = data_path.find(placeholder);
				if (placeholder_pos != std::string::npos) {
					data_path.replace(placeholder_pos, placeholder.size(), std::to_string(m_shard_id % 8));
				}
				return data_path + "/ht_" + m_db_name + "_" + std::to_string(m_shard_id);
			}

	};

}


================================================
FILE: src/hash_table2/hash_table_shard_builder.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <sstream>
#include "config.h"
#include "hash_table_shard_builder.h"
#include "logger/logger.h"
#include "file/file.h"
#include "indexer/merger.h"

#include <boost/iostreams/filtering_stream.hpp>
#include <boost/iostreams/filter/gzip.hpp>

namespace hash_table2 {

	/*
	 * Construct a builder for a single shard and register it with indexer::merger.
	 *
	 * The address of this object is used as the registration id and every callback captures `this`,
	 * so the object has to stay at the same address for as long as it is registered.
	 * */
	hash_table_shard_builder::hash_table_shard_builder(const std::string &db_name, size_t shard_id, size_t hash_table_size,
		const std::string &data_path)
	: hash_table_shard_base(db_name, shard_id, hash_table_size, data_path)
	{
		const size_t merger_id = (size_t)this;

		auto on_append = [this]() { append(); };
		auto on_cache_size = [this]() { return cache_size(); };
		auto on_merge = [this]() { merge(); };

		indexer::merger::register_appender(merger_id, on_append, on_cache_size);
		indexer::merger::register_merger(merger_id, on_merge);
	}

	/*
	 * Deregister this builder (identified by its address) from indexer::merger.
	 * NOTE(review): only deregister_merger is called here; presumably it also drops the appender
	 * registered in the constructor - confirm against indexer/merger.
	 * */
	hash_table_shard_builder::~hash_table_shard_builder() {
		const size_t merger_id = (size_t)this;
		indexer::merger::deregister_merger(merger_id);
	}

	/*
	 * Put a key/value pair in the in-memory cache.
	 *
	 * A write is dropped when it carries a non-zero version and the cache already holds a strictly
	 * newer version of the same key. Writes with version 0 always overwrite.
	 * */
	void hash_table_shard_builder::add(uint64_t key, const std::string &value, size_t version) {
		indexer::merger::lock();

		std::lock_guard guard(m_lock);

		const auto cached_version = m_version.find(key);
		const bool is_stale = version > 0 && cached_version != m_version.end() && cached_version->second > version;
		if (is_stale) {
			return;
		}

		// m_data_size is only an approximation of the cached bytes, it grows on overwrites too.
		m_data_size += value.capacity();
		m_cache[key] = value;
		m_version[key] = version;
	}

	/*
	 * Queue a key for removal. The key is dropped from the pages by the next merge().
	 * */
	void hash_table_shard_builder::remove(uint64_t key) {
		m_remove_keys.emplace_back(key);
	}

	/*
	 * Approximate number of bytes held by the in-memory cache.
	 * */
	size_t hash_table_shard_builder::cache_size() const {
		// Two 64 bit words per key is a good enough estimate since the values dominate the total.
		const size_t per_key_overhead = sizeof(uint64_t) * 2;
		return m_cache.size() * per_key_overhead + m_data_size;
	}

	void hash_table_shard_builder::append() {

		std::lock_guard guard(m_lock);

		ofstream outfile(this->filename_data_tmp(), ios::binary | ios::app);

		for (const auto &iter : m_cache) {
			const size_t version = m_version[iter.first];
			outfile.write((char *)&iter.first, sizeof(uint64_t));
			outfile.write((char *)&version, sizeof(size_t));

			// Compress data
			std::stringstream ss(iter.second);

			boost::iostreams::filtering_istream compress_stream;
			compress_stream.push(boost::iostreams::gzip_compressor());
			compress_stream.push(ss);

			std::stringstream compressed;
			compressed << compress_stream.rdbuf();

			std::string compressed_string(compressed.str());

			const size_t data_len = compressed_string.size();
			outfile.write((char *)&data_len, sizeof(size_t));

			outfile.write(compressed_string.c_str(), data_len);
		}

		// Free RAM caches and set m_data_size to zero.
		m_cache = std::map<uint64_t, std::string>{};
		m_version = std::map<uint64_t, size_t>{};
		m_data_size = 0;
	}

	/*
	 * Merge the temporary data file (written by append()) into the persistent shard.
	 *
	 * Every record in the temporary file is looked up in its page. New keys are inserted so the page stays
	 * sorted by key, existing keys are replaced when the incoming version is greater than or equal to the
	 * stored one, and older versions are dropped. Data for accepted records is appended to the data file.
	 * Afterwards the temporary file is deleted, queued removals are applied and the pages are rewritten.
	 *
	 * Records larger than the 20 MiB read buffer are logged and skipped. If the buffer cannot be allocated
	 * the function returns without touching any file.
	 * */
	void hash_table_shard_builder::merge() {

		auto pages = this->read_pages();

		const size_t buffer_len = 1024*1024*20;

		std::unique_ptr<char[]> buffer_allocator;
		try {
			buffer_allocator = std::make_unique<char[]>(buffer_len);
		} catch (std::bad_alloc &exception) {
			std::cout << "bad_alloc detected: " << exception.what() << " file: " << __FILE__ << " line: " << __LINE__ << std::endl;
			std::cout << "tried to allocate: " << buffer_len << " bytes" << std::endl;
			return;
		}
		char *buffer = buffer_allocator.get();

		// Read append cache and add to pages + data file.
		std::ifstream infile(this->filename_data_tmp(), std::ios::binary);
		std::ofstream outfile(this->filename_data(), std::ios::binary | std::ios::app);

		// Append mode only promises a seek to the end before each write, so position the stream explicitly
		// before asking where the next record is going to start.
		outfile.seekp(0, std::ios::end);
		size_t last_pos = outfile.tellp();

		while (!infile.eof()) {
			uint64_t key;
			if (!infile.read((char *)&key, sizeof(uint64_t))) break;

			size_t version;
			if (!infile.read((char *)&version, sizeof(size_t))) break;

			size_t data_len;
			if (!infile.read((char *)&data_len, sizeof(size_t))) break;

			if (data_len > buffer_len) {
				LOG_INFO("data_len " + std::to_string(data_len) + " is larger than buffer_len " + std::to_string(buffer_len) + " in file " + filename_data());
				infile.seekg(data_len, std::ios::cur);
				continue;
			}
			if (!infile.read(buffer, data_len)) break;

			auto &page = pages[key % this->m_hash_table_size];
			const std::array<uint64_t, 3> elem{key, last_pos, version};

			// insert_at points to the first element with a key greater than "key".
			auto insert_at = std::upper_bound(page.begin(), page.end(), elem, [](const auto &a, const auto &b) {
				return a[0] < b[0];
			});

			// An element with the same key can only sit directly before insert_at. When insert_at is the
			// first element of the page (or the page is empty) there is no such element and it must not be
			// dereferenced.
			const bool key_exists = insert_at != page.begin() && (*(insert_at - 1))[0] == key;

			bool add_data = false;
			if (key_exists) {
				auto &stored = *(insert_at - 1);
				// Replace the stored element unless it has a strictly bigger version.
				if (stored[2] <= version) {
					stored = elem;
					add_data = true;
				}
			} else {
				page.insert(insert_at, elem);
				add_data = true;
			}

			if (add_data) {
				outfile.write((char *)&key, sizeof(uint64_t));
				outfile.write((char *)&data_len, sizeof(size_t));
				outfile.write(buffer, data_len);

				last_pos += data_len + sizeof(uint64_t) + sizeof(size_t);
			}
		}

		// Delete cache file.
		file::delete_file(this->filename_data_tmp());

		// Remove keys that are in m_remove_keys.
		remove_keys_from_pages(pages);
		m_remove_keys = std::vector<uint64_t>{};

		write_pages(pages);
	}

	/*
	 * Rebuild the shard so the data file only holds records that the pages still point at.
	 *
	 * The live records are copied to the temporary data file, the data and pos files are deleted and
	 * merge() then recreates both of them from the temporary file.
	 * */
	void hash_table_shard_builder::optimize() {
		const auto live_pages = this->read_pages();

		std::ifstream data_reader(this->filename_data(), std::ios::binary);
		std::ofstream tmp_writer(this->filename_data_tmp(), std::ios::binary | std::ios::trunc);

		read_optimized_to(live_pages, data_reader, tmp_writer);

		// Flush the temporary file before merge() reads it back.
		tmp_writer.close();

		file::delete_file(filename_data());
		file::delete_file(filename_pos());

		merge();
	}

	/*
	 * Empty the data file and the pos file and delete the temporary data file.
	 * */
	void hash_table_shard_builder::truncate() {
		std::lock_guard guard(m_lock);

		// Opening with trunc is what empties the files, nothing is written to them.
		{
			std::ofstream data_file(this->filename_data(), std::ios::binary | std::ios::trunc);
		}
		{
			std::ofstream pos_file(this->filename_pos(), std::ios::binary | std::ios::trunc);
		}

		file::delete_file(this->filename_data_tmp());
	}

	void hash_table_shard_builder::merge_with(const hash_table_shard_builder &other) {
		merge_with(other.filename_pos(), other.filename_data());
	}

	/*
	 * Merge another pos file + data file into this shard.
	 *
	 * Keys present on both sides are resolved by version. The losing entry gets its position set to SIZE_MAX
	 * so read_optimized_to() will not match it. On equal versions the entry of this shard wins. The surviving
	 * records of the other data file are written to the temporary data file and merge() does the rest.
	 * */
	void hash_table_shard_builder::merge_with(const std::string &pos_file, const std::string &data_file) {

		std::ifstream other_posfile(pos_file, std::ios::binary);

		auto own_pages = this->read_pages();
		auto other_pages = this->read_pages(other_posfile);

		// Walk both sorted pages in lockstep and mark the older one of every shared key as deleted.
		for (size_t p = 0; p < own_pages.size(); p++) {
			for (size_t i = 0, j = 0; i < own_pages[p].size() && j < other_pages[p].size(); ) {
				auto &own_item = own_pages[p][i];
				auto &other_item = other_pages[p][j];

				if (own_item[0] < other_item[0]) {
					i++;
					continue;
				}
				if (own_item[0] != other_item[0]) {
					j++;
					continue;
				}

				// Same key on both sides.
				if (own_item[2] < other_item[2]) {
					own_item[1] = SIZE_MAX;
				} else {
					other_item[1] = SIZE_MAX;
				}
				i++;
				j++;
			}
		}

		std::ofstream tmp_writer(this->filename_data_tmp(), std::ios::binary | std::ios::trunc);

		std::ifstream other_data(data_file, std::ios::binary);

		read_optimized_to(other_pages, other_data, tmp_writer);

		tmp_writer.close();

		merge();
	}

	/*
	 * Copy the live records of a data file to outfile in the format of the temporary data file.
	 *
	 * infile is scanned from the start. A record is live when the given pages hold an element with the same
	 * key whose position equals the offset of the record in infile. Live records are written as
	 * key (uint64_t), version (size_t, taken from the page element), data length (size_t), data.
	 * All other records are skipped.
	 *
	 * Scanning stops at end of file, on a truncated record, or when the buffer for a record cannot be allocated.
	 * */
	void hash_table_shard_builder::read_optimized_to(const std::vector<std::vector<std::array<uint64_t, 3>>> &pages, std::ifstream &infile,
		std::ofstream &outfile) const {

		infile.seekg(0, std::ios::beg);

		while (!infile.eof()) {
			const size_t my_pos = infile.tellg();

			// Keys are stored as uint64_t in the data file.
			uint64_t key;
			if (!infile.read((char *)&key, sizeof(uint64_t))) break;

			size_t data_len;
			if (!infile.read((char *)&data_len, sizeof(size_t))) break;

			const auto &page = pages[key % this->m_hash_table_size];

			// iter points to the first element with a key greater than "key".
			const auto iter = std::upper_bound(page.cbegin(), page.cend(), key, [](uint64_t wanted, const auto &item) {
				return wanted < item[0];
			});

			// The only possible match is the element right before iter. If iter is the first element (or the
			// page is empty) the key is not in the page and there is nothing before iter to look at.
			const bool is_live = iter != page.cbegin() && (*(iter - 1))[0] == key && (*(iter - 1))[1] == my_pos;

			if (!is_live) {
				// Ignore data.
				infile.seekg(data_len, std::ios::cur);
				continue;
			}

			std::unique_ptr<char[]> buffer_allocator;
			try {
				buffer_allocator = std::make_unique<char[]>(data_len);
			} catch (std::bad_alloc &exception) {
				std::cout << "bad_alloc detected: " << exception.what() << " file: " << __FILE__ << " line: " << __LINE__ << std::endl;
				std::cout << "tried to allocate: " << data_len << " bytes" << std::endl;
				break;
			}
			char *buffer = buffer_allocator.get();

			// A short read means the file is truncated, do not write a partially filled buffer.
			if (!infile.read(buffer, data_len)) break;

			// Keep this data.
			const size_t version = (*(iter - 1))[2];
			outfile.write((char *)&key, sizeof(uint64_t));
			outfile.write((char *)&version, sizeof(size_t));
			outfile.write((char *)&data_len, sizeof(size_t));
			outfile.write(buffer, data_len);
		}
	}

	/*
	 * Rewrite the pos file from the given pages.
	 *
	 * The file starts with one size_t per page: the offset of the page within the page section, or
	 * SIZE_MAX for an empty page. After that every non-empty page is written as its length (size_t)
	 * followed by its elements.
	 * */
	void hash_table_shard_builder::write_pages(const std::vector<std::vector<std::array<uint64_t, 3>>> &pages) {

		std::ofstream key_writer(this->filename_pos(), std::ios::binary | std::ios::trunc);

		const size_t item_size = sizeof(std::array<uint64_t, 3>);
		const size_t no_page = SIZE_MAX;

		// Write the offset table.
		size_t next_offset = 0;
		for (const auto &page : pages) {
			if (page.empty()) {
				key_writer.write((char *)&no_page, sizeof(size_t));
				continue;
			}
			key_writer.write((char *)&next_offset, sizeof(size_t));
			next_offset += page.size() * item_size + sizeof(size_t);
		}

		// Write pages.
		for (const auto &page : pages) {
			if (page.empty()) continue;

			const size_t num_items = page.size();
			key_writer.write((char *)&num_items, sizeof(size_t));
			for (const auto &item : page) {
				key_writer.write((char *)&item, item_size);
			}
		}
	}

	/*
	 * Erase every key queued in m_remove_keys from the given pages.
	 * Keys that are not present in their page are ignored. m_remove_keys itself is left untouched.
	 * */
	void hash_table_shard_builder::remove_keys_from_pages(std::vector<std::vector<std::array<uint64_t, 3>>> &pages) {
		for (const uint64_t key : m_remove_keys) {

			auto &page = pages[key % this->m_hash_table_size];

			// after points to the first element with a key greater than "key".
			const auto after = std::upper_bound(page.cbegin(), page.cend(), key, [](uint64_t wanted, const auto &item) {
				return wanted < item[0];
			});

			// Nothing precedes the first element: the page is empty or all its keys are bigger, so the
			// key is not here and stepping back would leave the page.
			if (after == page.cbegin()) continue;

			const auto candidate = after - 1;
			if ((*candidate)[0] == key) {
				// remove the key from the page.
				page.erase(candidate);
			}
		}
	}

}


================================================
FILE: src/hash_table2/hash_table_shard_builder.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include <iostream>
#include <map>
#include <mutex>

#include "hash_table.h"
#include "hash_table_shard_base.h"

namespace hash_table2 {

	/*
	 * Builder (write side) of a single hash table shard.
	 *
	 * Data flows in three steps: add() stores key/value pairs in a memory cache, append() writes the
	 * memory cache to a temporary data file and merge() merges that file into the persistent data
	 * and pos files of the shard.
	 *
	 * usage:
	 * hash_table_shard_builder shard("test_db", 0);
	 * shard.add(12345, "test data", 3);
	 * shard.add(12345, "new test data", 4);
	 *
	 * shard.append();
	 * shard.merge();
	 *
	 * */

	class hash_table_shard_builder : public hash_table_shard_base {

		public:

			/*
			 * The constructor registers append(), cache_size() and merge() as callbacks with indexer::merger.
			 * The destructor deregisters the object again.
			 * */
			hash_table_shard_builder(const std::string &db_name, size_t shard_id, size_t hash_table_size = 1000000,
					const std::string &data_path = config::data_path() + "/{shard_id_mod_8}/hash_table");
			~hash_table_shard_builder();

			/*
			 * Add key/value pair to the memory cache.
			 * If version is non-zero and the cache already holds a higher version for the key the call is ignored.
			 * */
			void add(uint64_t key, const std::string &value, size_t version = 0);

			/*
			 * Remove key from hash table. The key is queued and removed from the pages by the next merge().
			 * */
			void remove(uint64_t key);

			/*
			 * Return approximation of amount of memory in cache.
			 * */
			size_t cache_size() const;

			/*
			 * Write memory cache to disc cache (the temporary data file) and empty the memory cache.
			 * */
			void append();

			/*
			 * Write disc cache to persistent hash table.
			 * */
			void merge();

			/*
			 * Optimize persistent hash table to remove data for unused versions.
			 * */
			void optimize();

			/*
			 * Delete all data in shard.
			 * */
			void truncate();

			/*
			 * Merge with another shard. Handles key collisions by keeping the one with highest version.
			 * On equal versions the element of this shard is kept.
			 * */
			void merge_with(const hash_table_shard_builder &other);

			/*
			 * Merge with another pos and datafile.
			 * */
			void merge_with(const std::string &pos_file, const std::string &data_file);

		private:

			// Memory cache: value and version per key, filled by add() and emptied by append().
			std::map<uint64_t, std::string> m_cache;
			std::map<uint64_t, size_t> m_version;
			// Keys queued by remove(), applied and cleared by merge().
			std::vector<uint64_t> m_remove_keys;

			// NOTE(review): m_sort_pos is not referenced by hash_table_shard_builder.cpp - looks unused, confirm.
			std::map<uint64_t, size_t> m_sort_pos;
			// Taken by add(), append() and truncate().
			std::mutex m_lock;
			// Running sum of value.capacity() for the values passed to add() since the last append().
			size_t m_data_size = 0;

			// Copies the records of infile that are referenced by pages to outfile (temporary data file format).
			void read_optimized_to(const std::vector<std::vector<std::array<uint64_t, 3>>> &pages, std::ifstream &infile, std::ofstream &outfile) const;
			// Rewrites the pos file from pages.
			void write_pages(const std::vector<std::vector<std::array<uint64_t, 3>>> &pages);
			// Erases the keys in m_remove_keys from pages.
			void remove_keys_from_pages(std::vector<std::vector<std::array<uint64_t, 3>>> &pages);

	};

}


================================================
FILE: src/hash_table_helper/hash_table_helper.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "config.h"
#include "hash_table_helper.h"
#include "logger/logger.h"

namespace hash_table_helper {

	void truncate(const std::string &hash_table_name) {
		std::vector<hash_table2::hash_table_shard_builder *> shards = create_shard_builders(hash_table_name);

		for (auto shard : shards) {
			shard->truncate();
		}

		delete_shard_builders(shards);
	}

	std::vector<hash_table2::hash_table_shard_builder *> create_shard_builders(const std::string &hash_table_name) {
		std::vector<hash_table2::hash_table_shard_builder *> shards;
		for (size_t shard_id = 0; shard_id < config::ht_num_shards; shard_id++) {
			shards.push_back(new hash_table2::hash_table_shard_builder(hash_table_name, shard_id));
		}

		return shards;
	}

	/*
	 * Delete all builders in the vector and leave the vector empty.
	 * */
	void delete_shard_builders(std::vector<hash_table2::hash_table_shard_builder *> &shards) {
		for (auto iter = shards.begin(); iter != shards.end(); ++iter) {
			delete *iter;
		}

		shards.clear();
	}

}


================================================
FILE: src/hash_table_helper/hash_table_helper.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include <iostream>
#include "hash_table2/hash_table.h"
#include "hash_table2/hash_table_shard_builder.h"

/*
 * Helpers for working with all shards of a hash table at once.
 * */
namespace hash_table_helper {

	/*
	 * Truncate every shard of the hash table with the given name.
	 * */
	void truncate(const std::string &hash_table_name);

	/*
	 * Allocate one shard builder per shard (config::ht_num_shards in total).
	 * The returned pointers are owned by the caller and should be released with delete_shard_builders().
	 * */
	std::vector<hash_table2::hash_table_shard_builder *> create_shard_builders(const std::string &hash_table_name);

	/*
	 * Delete every builder in shards and clear the vector.
	 * */
	void delete_shard_builders(std::vector<hash_table2::hash_table_shard_builder *> &shards);

}


================================================
FILE: src/http/request.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "request.h"

namespace http {

	/*
	 * Store the url, method and body of a request.
	 * */
	request::request(const URL &url, std::string request_method, std::string request_body)
	:
		m_url(url),
		m_request_method(request_method),
		m_request_body(request_body)
	{
	}

}


================================================
FILE: src/http/request.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include <iostream>
#include "URL.h"

namespace http {

	/*
	 * Read only description of a request: the url, the request method and the request body.
	 * */
	class request {
		public:
			request(const URL &url, std::string request_method = "POST", std::string request_body = "");

			const URL& url() const {
				return m_url;
			}

			const std::string &request_method() const {
				return m_request_method;
			}

			const std::string &request_body() const {
				return m_request_body;
			}

		private:
			// NOTE(review): m_code has no accessor in this class - looks unused, confirm.
			size_t m_code = 200;
			URL m_url;
			std::string m_request_method;
			std::string m_request_body;

	};

}


================================================
FILE: src/http/response.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include <iostream>

namespace http {

	/*
	 * Value object describing a response: status code, body and content type.
	 * A default constructed response has code 200, an empty body and content type "text/html".
	 * */
	class response {
		public:

			// Status code.
			size_t code() const {
				return m_code;
			}
			void code(size_t code) {
				this->m_code = code;
			}

			// Response body.
			const std::string &body() const {
				return m_body;
			}
			void body(const std::string &body) {
				this->m_body = body;
			}

			// Content type, written to the Content-type header by the server.
			const std::string &content_type() const {
				return m_content_type;
			}
			void content_type(const std::string &content_type) {
				this->m_content_type = content_type;
			}

		private:
			size_t m_code = 200;
			std::string m_body;
			std::string m_content_type = "text/html";

	};

}


================================================
FILE: src/http/server.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "server.h"
#include "fcgio.h"
#include "logger/logger.h"
#include "URL.h"

#include <thread>
#include <vector>

namespace http {

	/*
	 * Store the request handler and start serving.
	 * start() joins the worker threads, so the constructor does not return while the workers are running.
	 * */
	server::server(std::function<http::response(const http::request &)> handler)
	: m_handler(handler) {
		start();
	}

	/*
	 * Worker loop: accept FastCGI requests on socket_id, pass them to m_handler and write the response.
	 *
	 * Accepting is serialized between the workers with m_lock. Requests without REQUEST_URI or
	 * REQUEST_METHOD are finished without a response. For POST requests the body is read in chunks;
	 * a body larger than max_post_len is answered with status 413 and the handler is not called.
	 * The loop ends when FCGX_Accept_r fails.
	 * */
	void server::run_worker(int socket_id) {

		const size_t max_post_len = 1024*1024*1024;
		const size_t buffer_len = 1024*1024;
		std::unique_ptr<char[]> buffer_allocator = std::make_unique<char[]>(buffer_len);
		char *buffer = buffer_allocator.get();

		FCGX_Request request;

		FCGX_InitRequest(&request, socket_id, 0);

		LOG_INFO("Server has started...");

		while (true) {

			int accept_response;
			{
				// Scoped lock so the mutex is released even if accepting throws.
				std::lock_guard<std::mutex> guard(m_lock);
				accept_response = FCGX_Accept_r(&request);
			}

			if (accept_response < 0) {
				break;
			}

			const char *uri_ptr = FCGX_GetParam("REQUEST_URI", request.envp);
			const char *req_ptr = FCGX_GetParam("REQUEST_METHOD", request.envp);
			if ((uri_ptr == nullptr) || (req_ptr == nullptr)) {
				FCGX_Finish_r(&request);
				continue;
			}
			const std::string uri(uri_ptr);
			const std::string request_method(req_ptr);

			LOG_INFO("Serving request: " + uri);

			URL url("http://alexandria.org" + uri);

			std::string post_data;
			bool post_too_large = false;
			if (request_method == "POST") {
				while (true) {

					const int read_bytes = FCGX_GetStr(buffer, static_cast<int>(buffer_len), request.in);
					if (read_bytes <= 0) break;

					if (post_data.size() + static_cast<size_t>(read_bytes) > max_post_len) {
						LOG_ERROR("Posted data larger then " + std::to_string(max_post_len) + ", ignoring request");
						post_too_large = true;
						break;
					}
					post_data.append(buffer, static_cast<size_t>(read_bytes));
				}
			}

			if (post_too_large) {
				// Really ignore the request instead of handing a truncated body to the handler.
				const std::string rejected = "Status: 413\r\n\r\n";
				FCGX_PutStr(rejected.data(), static_cast<int>(rejected.size()), request.out);
				FCGX_Finish_r(&request);
				continue;
			}

			::http::request http_request(url, request_method, post_data);

			::http::response http_response = m_handler(http_request);

			const std::string &data_out = http_response.body();

			// Output response
			const std::string status = std::string("Status: ") + std::to_string(http_response.code()) + "\r\n";
			const std::string content_type = std::string("Content-type: ") + http_response.content_type() + "\r\n";
			const std::string end_req = "\r\n";
			const std::string headers = status + content_type + end_req;

			// The headers contain handler supplied text, so they are written as raw bytes and never used
			// as a printf style format string.
			FCGX_PutStr(headers.data(), static_cast<int>(headers.size()), request.out);
			FCGX_PutStr(data_out.data(), static_cast<int>(data_out.size()), request.out);

			FCGX_Finish_r(&request);
		}

		FCGX_Free(&request, true);
	}

	void server::start() {
		FCGX_Init();

		int socket_id = FCGX_OpenSocket("127.0.0.1:8000", 20);
		if (socket_id < 0) {
			LOG_INFO("Could not open socket, exiting");
			return;
		}

		std::vector<std::thread> threads;

		for (size_t i = 0; i < m_workers; i++) {
			threads.emplace_back(std::move(std::thread([this](int socket_id){ run_worker(socket_id); }, socket_id)));
		}

		for (auto &thread : threads) {
			thread.join();
		}

		close(socket_id);
	}

}


================================================
FILE: src/http/server.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include <iostream>
#include <mutex>
#include <functional>
#include "request.h"
#include "response.h"

namespace http {

	/*
	 * FastCGI server that passes every request to a handler function and writes back its response.
	 * */
	class server {
		public:
			/*
			 * Stores the handler and starts the server. The constructor calls start(), which joins the
			 * worker threads, so it only returns once the workers have exited or the socket could not be opened.
			 * */
			server(std::function<::http::response(const ::http::request &)> handler);

		private:
			// Called by the workers for every accepted request.
			std::function<::http::response(const ::http::request &)> m_handler;
			// NOTE(review): m_port is not read by server.cpp, which opens "127.0.0.1:8000" - confirm intended port.
			size_t m_port = 8080;
			// Number of worker threads created by start().
			size_t m_workers = 8;
			// Held by a worker while it accepts a request.
			std::mutex m_lock;

			// Request loop executed by each worker thread.
			void run_worker(int socket_id);
			// Opens the socket, runs the workers and waits for them.
			void start();
	};

}


================================================
FILE: src/indexer/basic_index.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include "index_reader.h"
#include "index_base.h"
#include <vector>

namespace indexer {

	/*
	 * Read-only accessor for one index file holding records of type data_record grouped by 64 bit key.
	 *
	 * The index is either addressed by an explicit file name (reads "<file_name>.data"), by database name and
	 * shard id (file below config::data_path()), or it reads from a caller supplied stream.
	 * Lookups (find, find_ptr, find_count) take this->m_lock while they use the shared reader.
	 * */
	template<typename data_record>
	class basic_index : public index_base<data_record> {

	public:

		explicit basic_index(const std::string &file_name);
		explicit basic_index(const std::string &db_name, size_t id);
		explicit basic_index(const std::string &db_name, size_t id, size_t hash_table_size);
		// The stream is borrowed, the caller keeps ownership and must keep it alive.
		explicit basic_index(std::istream *reader, size_t hash_table_size);
		~basic_index();

		// Returns copies of the records stored for key. A limit of 0 means "no limit".
		std::vector<data_record> find(uint64_t key) const;
		std::vector<data_record> find(uint64_t key, size_t limit) const;

		// Same as find() but returns the raw array, the number of valid records is stored in num_records.
		std::unique_ptr<data_record[]> find_ptr(uint64_t key, size_t &num_records) const;
		std::unique_ptr<data_record[]> find_ptr(uint64_t key, size_t limit, size_t &num_records) const;
		// Number of records stored for key, without reading the records themselves.
		size_t find_count(uint64_t key) const;

		/*
		 * Iterates the keys of the index and calls the callback with key and vector of records for that key.
		 * */
		void for_each(std::function<void(uint64_t key, std::vector<data_record> &recs)> on_each_key) const;
		void for_each_key(std::function<void(uint64_t key)> on_each_key) const;

	private:

		// Stream used for lookups. Points either at m_default_reader or at a caller owned stream.
		mutable std::istream *m_reader = nullptr;
		// Owned file stream, only set by the constructors that open the file themselves.
		std::unique_ptr<std::ifstream> m_default_reader;

		std::string m_file_name;
		std::string m_db_name;
		// Initialised to 0 because not every constructor sets it while mountpoint()/filename() read it
		// whenever m_file_name is empty.
		size_t m_id = 0;
		size_t m_unique_count = 0;

		size_t read_key_pos(uint64_t key) const;
		void read_meta();
		std::string mountpoint() const;
		std::string filename() const;
		std::string meta_filename() const;

	};

	/*
	 * Opens the index stored in "<file_name>.data".
	 * */
	template<typename data_record>
	basic_index<data_record>::basic_index(const std::string &file_name)
	: index_base<data_record>(), m_file_name(file_name) {
		// filename() depends on m_file_name, so the stream is created in the body, after the members are set.
		const std::string data_path = filename();
		m_default_reader = std::make_unique<std::ifstream>(data_path, std::ios::binary);
		m_reader = m_default_reader.get();
	}

	/*
	 * Opens shard number id of the database db_name. The path is derived by filename().
	 * */
	template<typename data_record>
	basic_index<data_record>::basic_index(const std::string &db_name, size_t id)
	: index_base<data_record>(), m_db_name(db_name), m_id(id) {
		// filename() depends on m_db_name and m_id, so the stream is created after the members are set.
		const std::string data_path = filename();
		m_default_reader = std::make_unique<std::ifstream>(data_path, std::ios::binary);
		m_reader = m_default_reader.get();
	}

	/*
	 * Opens shard number id of the database db_name and passes hash_table_size on to index_base.
	 * */
	template<typename data_record>
	basic_index<data_record>::basic_index(const std::string &db_name, size_t id, size_t hash_table_size)
	: index_base<data_record>(hash_table_size), m_db_name(db_name), m_id(id) {
		// filename() depends on m_db_name and m_id, so the stream is created after the members are set.
		const std::string data_path = filename();
		m_default_reader = std::make_unique<std::ifstream>(data_path, std::ios::binary);
		m_reader = m_default_reader.get();
	}

	/*
	 * Uses a caller supplied stream for lookups. The stream is not owned by the index.
	 * */
	template<typename data_record>
	basic_index<data_record>::basic_index(std::istream *reader, size_t hash_table_size)
	: index_base<data_record>(hash_table_size), m_reader(reader) {
	}

	// Nothing to release by hand, m_default_reader closes the file it owns.
	template<typename data_record>
	basic_index<data_record>::~basic_index() = default;

	/*
	 * Returns all records stored for key.
	 * */
	template<typename data_record>
	std::vector<data_record> basic_index<data_record>::find(uint64_t key) const {
		// A limit of 0 disables the limit in the two argument overload.
		const size_t no_limit = 0;
		return find(key, no_limit);
	}

	/*
	 * Returns copies of the records stored for key, at most limit of them (0 means no limit).
	 * Returns an empty vector if the key is not present.
	 * */
	template<typename data_record>
	std::vector<data_record> basic_index<data_record>::find(uint64_t key, size_t limit) const {

		// No lock is taken here: find_ptr() holds this->m_lock for the whole read, and locking the same
		// mutex a second time from this thread is undefined behaviour for a non-recursive mutex.
		size_t num_records = 0;
		const std::unique_ptr<data_record[]> records = find_ptr(key, limit, num_records);

		// num_records is 0 whenever find_ptr() returns an empty pointer, so the range below is empty then.
		return std::vector<data_record>(records.get(), records.get() + num_records);
	}

	/*
	 * Returns all records stored for key as a raw array and stores their count in num_records.
	 * */
	template<typename data_record>
	std::unique_ptr<data_record[]> basic_index<data_record>::find_ptr(uint64_t key, size_t &num_records) const {
		// A limit of 0 disables the limit in the three argument overload.
		const size_t no_limit = 0;
		return find_ptr(key, no_limit, num_records);
	}

	/*
	 * Reads the records stored for key into a newly allocated array.
	 *
	 * key:         key to look up.
	 * limit:       maximum number of records to read, 0 means no limit.
	 * num_records: set to the number of valid records in the returned array (0 if nothing was found).
	 *
	 * Returns an empty pointer if the key is not present or the page could not be read.
	 *
	 * Page layout as read here (all fields are 8 bytes wide):
	 *   num_keys | keys[num_keys] | positions[num_keys] | lengths[num_keys] | record data
	 * where positions are byte offsets relative to the start of the record data.
	 * */
	template<typename data_record>
	std::unique_ptr<data_record[]> basic_index<data_record>::find_ptr(uint64_t key, size_t limit, size_t &num_records) const {

		std::lock_guard lock(this->m_lock);

		num_records = 0;

		const size_t key_pos = read_key_pos(key);

		if (key_pos == SIZE_MAX) {
			return {};
		}

		// Read page header. A failed read must not be used, num_keys would size the allocation below.
		m_reader->seekg(key_pos);
		size_t num_keys = 0;
		if (!m_reader->read(reinterpret_cast<char *>(&num_keys), sizeof(size_t))) {
			return {};
		}

		const std::unique_ptr<uint64_t[]> keys = std::make_unique<uint64_t[]>(num_keys);
		if (!m_reader->read(reinterpret_cast<char *>(keys.get()), num_keys * sizeof(uint64_t))) {
			return {};
		}

		// Keys within a page are unique, so the first match is the only one.
		size_t key_data_pos = SIZE_MAX;
		for (size_t i = 0; i < num_keys; i++) {
			if (keys[i] == key) {
				key_data_pos = i;
				break;
			}
		}

		if (key_data_pos == SIZE_MAX) {
			return {};
		}

		const size_t field_size = 8;
		const size_t column_size = num_keys * field_size;
		const size_t columns_start = key_pos + field_size;

		// Read position and length directly into integers instead of punning a char buffer.
		uint64_t pos = 0;
		m_reader->seekg(columns_start + column_size + key_data_pos * field_size);
		m_reader->read(reinterpret_cast<char *>(&pos), sizeof(pos));

		uint64_t len = 0;
		m_reader->seekg(columns_start + column_size * 2 + key_data_pos * field_size);
		m_reader->read(reinterpret_cast<char *>(&len), sizeof(len));

		if (!*m_reader) {
			return {};
		}

		m_reader->seekg(columns_start + column_size * 3 + pos);

		size_t record_count = len / sizeof(data_record);

		if (limit && record_count > limit) {
			record_count = limit;
		}

		std::unique_ptr<data_record[]> ret = std::make_unique<data_record[]>(record_count);

		// Read whole records only, so a length that is not a multiple of the record size can not overflow ret.
		m_reader->read(reinterpret_cast<char *>(ret.get()), record_count * sizeof(data_record));

		if (!*m_reader) {
			// Short read, only report the records that were actually read.
			record_count = static_cast<size_t>(m_reader->gcount()) / sizeof(data_record);
		}

		num_records = record_count;

		return ret;
	}

	/*
	 * Returns the number of records stored for key without reading the records.
	 * Returns 0 if the key is not present or the page could not be read.
	 * */
	template<typename data_record>
	size_t basic_index<data_record>::find_count(uint64_t key) const {

		std::lock_guard lock(this->m_lock);

		const size_t key_pos = read_key_pos(key);

		if (key_pos == SIZE_MAX) {
			return 0;
		}

		// Read page header. A failed read must not be used, num_keys would size the allocation below.
		m_reader->seekg(key_pos);
		size_t num_keys = 0;
		if (!m_reader->read(reinterpret_cast<char *>(&num_keys), sizeof(size_t))) {
			return 0;
		}

		const std::unique_ptr<uint64_t[]> keys = std::make_unique<uint64_t[]>(num_keys);
		if (!m_reader->read(reinterpret_cast<char *>(keys.get()), num_keys * sizeof(uint64_t))) {
			return 0;
		}

		// Keys within a page are unique, so the first match is the only one.
		size_t key_data_pos = SIZE_MAX;
		for (size_t i = 0; i < num_keys; i++) {
			if (keys[i] == key) {
				key_data_pos = i;
				break;
			}
		}

		if (key_data_pos == SIZE_MAX) {
			return 0;
		}

		const size_t field_size = 8;
		const size_t column_size = num_keys * field_size;

		// Read length only. The lengths column follows the keys and the positions column.
		uint64_t len = 0;
		m_reader->seekg(key_pos + field_size + column_size * 2 + key_data_pos * field_size);
		if (!m_reader->read(reinterpret_cast<char *>(&len), sizeof(len))) {
			return 0;
		}

		return len / sizeof(data_record);
	}

	/*
	 * Walks the index page by page and invokes on_each_key once per key, passing the key and the vector
	 * with its records. The file given by filename() is opened with a separate stream for this.
	 * */
	template<typename data_record>
	void basic_index<data_record>::for_each(std::function<void(uint64_t key, std::vector<data_record> &recs)> on_each_key) const {

		std::ifstream reader(filename(), std::ios::binary);

		// The pages start right after the hash table.
		reader.seekg(this->hash_table_byte_size(), std::ios::beg);

		std::map<uint64_t, std::vector<data_record>> page;
		for (; this->read_page_into(reader, page); page.clear()) {
			for (auto &[page_key, records] : page) {
				on_each_key(page_key, records);
			}
		}

	}

	/*
	 * Reads the position of the page that holds key from the hash table at the start of the file.
	 * Returns 0 if the index has no hash table (the single page then starts at the beginning of the file).
	 * Returns SIZE_MAX if the key was not found or the hash table entry could not be read.
	 * */
	template<typename data_record>
	size_t basic_index<data_record>::read_key_pos(uint64_t key) const {

		if (this->m_hash_table_size == 0) return 0;

		const size_t hash_pos = key % this->m_hash_table_size;

		if (!m_reader->seekg(hash_pos * sizeof(size_t))) return SIZE_MAX;

		// Treat a failed read as "not found" instead of returning an uninitialised position.
		size_t pos = SIZE_MAX;
		if (!m_reader->read(reinterpret_cast<char *>(&pos), sizeof(size_t))) return SIZE_MAX;

		return pos;
	}

	/*
	 * Reads the count of unique records from the meta file and puts it in the m_unique_count member.
	 * The count is 0 if the meta file is missing or too short.
	 * */
	template<typename data_record>
	void basic_index<data_record>::read_meta() {
		struct meta {
			size_t unique_count;
		};

		// Value initialised so that a missing file does not leave garbage in m_unique_count.
		meta m{};

		std::ifstream meta_reader(meta_filename(), std::ios::binary);

		if (meta_reader.is_open()) {
			if (!meta_reader.read(reinterpret_cast<char *>(&m), sizeof(meta))) {
				// A short read may have filled the struct partially.
				m = meta{};
			}
		}

		m_unique_count = m.unique_count;
	}

	/*
	 * Name of the mount directory for this shard: the shard id modulo 8, as a decimal string.
	 * */
	template<typename data_record>
	std::string basic_index<data_record>::mountpoint() const {
		const size_t mount_number = m_id % 8;
		return std::to_string(mount_number);
	}

	/*
	 * Path of the data file. An explicit file name takes precedence, otherwise the path is built from
	 * the data path, mount point, database name and shard id.
	 * */
	template<typename data_record>
	std::string basic_index<data_record>::filename() const {
		if (!m_file_name.empty()) {
			return m_file_name + ".data";
		}

		const std::string shard_dir = config::data_path() + "/" + mountpoint() + "/full_text/" + m_db_name;
		return shard_dir + "/" + std::to_string(m_id) + ".data";
	}

	/*
	 * Path of the meta file, built the same way as filename() but with the ".meta" extension.
	 * */
	template<typename data_record>
	std::string basic_index<data_record>::meta_filename() const {
		if (!m_file_name.empty()) {
			return m_file_name + ".meta";
		}

		const std::string shard_dir = config::data_path() + "/" + mountpoint() + "/full_text/" + m_db_name;
		return shard_dir + "/" + std::to_string(m_id) + ".meta";
	}

}


================================================
FILE: src/indexer/basic_index_builder.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include <iostream>
#include <vector>
#include <map>
#include <set>
#include <unordered_set>
#include <cstring>
#include <cassert>
#include <boost/filesystem.hpp>
#include "merger.h"
#include "score_builder.h"
#include "algorithm/hyper_log_log.h"
#include "config.h"
#include "profiler/profiler.h"
#include "logger/logger.h"
#include "memory/debugger.h"
#include "file/file.h"
#include "index_base.h"

namespace indexer {

	/*
	 * Builds the index file that basic_index reads.
	 *
	 * Records are first collected in memory (add), then appended to cache files on disk (append) and finally
	 * merged with the existing data file into a new data file (merge). Every constructor registers the
	 * builder with the merger module, the destructor deregisters it.
	 * */
	template<typename data_record>
	class basic_index_builder : public index_base<data_record>{
	private:
		// Non copyable
		basic_index_builder(const basic_index_builder &) = delete;
		basic_index_builder& operator=(const basic_index_builder &) = delete;
	public:

		basic_index_builder(const std::string &file_name);
		basic_index_builder(const std::string &db_name, size_t id);
		basic_index_builder(const std::string &db_name, size_t id, size_t hash_table_size);
		basic_index_builder(const std::string &db_name, size_t id, size_t hash_table_size, size_t max_results);
		~basic_index_builder();

		// Queues one record for key in the in-memory cache.
		void add(uint64_t key, const data_record &record);
		// Allocated size in bytes of the in-memory cache.
		size_t cache_size() const;

		// Writes the in-memory cache to the cache files and frees it.
		void append();
		// Merges the cache files with the data file and rewrites the data file.
		void merge();
		// Replaces every record with transform(record, number of records for its key).
		void transform(const std::function<data_record(const data_record &, size_t)> &transform);
		// Sorts the records of every key with the given comparator.
		void sort_by(const std::function<bool(const data_record &a, const data_record &b)> sort_by);

		void truncate();
		void truncate_cache_files();
		void create_directories();

	private:

		std::string m_file_name;
		std::string m_db_name;
		const size_t m_id;

		// Maximum number of records kept per key after a merge.
		const size_t m_max_results;

		const size_t m_buffer_len = config::ft_shard_builder_buffer_len;
		// Not used by any member function in this header, initialised so it never holds an indeterminate value.
		char *m_buffer = nullptr;
		std::mutex m_lock;

		// Caches
		std::vector<uint64_t> m_key_cache;
		std::vector<data_record> m_record_cache;

		// Complete content of the index while it is being merged, transformed or sorted.
		std::map<uint64_t, std::vector<data_record>> m_cache;

		void read_append_cache();
		void read_data_to_cache();
		void sort_cache();
		void sort_record_list(uint64_t key, std::vector<data_record> &records);
		void reset_cache_variables();
		void save_file();
		void write_key(std::ofstream &key_writer, uint64_t key, size_t page_pos);
		size_t write_page(std::ofstream &writer, const std::vector<uint64_t> &keys);
		void reset_key_map(std::ofstream &key_writer);

		std::string mountpoint() const;
		std::string cache_filename() const;
		std::string key_cache_filename() const;
		std::string target_filename() const;
		std::string meta_filename() const;

	};

	/*
	 * Builder for an index addressed by an explicit file name. Registers merge() and append() with the
	 * merger module, using the address of this object as id.
	 * */
	template<typename data_record>
	basic_index_builder<data_record>::basic_index_builder(const std::string &file_name)
	: index_base<data_record>(), m_file_name(file_name), m_id(0),
		m_max_results(config::ft_max_results_per_section)
	{
		const size_t self_id = (size_t)this;

		merger::register_merger(self_id, [this]() { merge(); });
		merger::register_appender(self_id, [this]() { append(); }, [this]() { return cache_size(); });
	}

	/*
	 * Builder for shard number id of the database db_name. Registers merge() and append() with the
	 * merger module, using the address of this object as id.
	 * */
	template<typename data_record>
	basic_index_builder<data_record>::basic_index_builder(const std::string &db_name, size_t id)
	: index_base<data_record>(), m_db_name(db_name), m_id(id), m_max_results(config::ft_max_results_per_section) {
		const size_t self_id = (size_t)this;

		merger::register_merger(self_id, [this]() { merge(); });
		merger::register_appender(self_id, [this]() { append(); }, [this]() { return cache_size(); });
	}

	/*
	 * Builder for shard number id of the database db_name with an explicit hash table size.
	 * Registers the builder with the merger module, using the address of this object as id.
	 * */
	template<typename data_record>
	basic_index_builder<data_record>::basic_index_builder(const std::string &db_name, size_t id, size_t hash_table_size)
	: index_base<data_record>(hash_table_size), m_db_name(db_name), m_id(id), m_max_results(config::ft_max_results_per_section) {
		const size_t self_id = (size_t)this;

		// NOTE(review): the merger callback is append() here, while the constructors without a hash table
		// size register merge(). Confirm whether that is intended.
		merger::register_merger(self_id, [this]() { append(); });
		merger::register_appender(self_id, [this]() { append(); }, [this]() { return cache_size(); });
	}

	/*
	 * Builder for shard number id of the database db_name with an explicit hash table size and an explicit
	 * maximum number of records per key. Registers the builder with the merger module, using the address
	 * of this object as id.
	 * */
	template<typename data_record>
	basic_index_builder<data_record>::basic_index_builder(const std::string &db_name, size_t id, size_t hash_table_size, size_t max_results)
	: index_base<data_record>(hash_table_size), m_db_name(db_name), m_id(id), m_max_results(max_results) {
		const size_t self_id = (size_t)this;

		// NOTE(review): the merger callback is append() here, while the constructors without a hash table
		// size register merge(). Confirm whether that is intended.
		merger::register_merger(self_id, [this]() { append(); });
		merger::register_appender(self_id, [this]() { append(); }, [this]() { return cache_size(); });
	}

	/*
	 * Removes this builder from the merger module, using the same id the constructors registered with.
	 * */
	template<typename data_record>
	basic_index_builder<data_record>::~basic_index_builder() {
		// NOTE(review): only deregister_merger is called, verify that it also drops the appender callbacks
		// which capture this.
		const size_t self_id = (size_t)this;
		merger::deregister_merger(self_id);
	}

	/*
	 * Queues one record for key in the in-memory cache. The record reaches disk on the next append().
	 * */
	template<typename data_record>
	void basic_index_builder<data_record>::add(uint64_t key, const data_record &record) {

		// NOTE(review): merger::lock() is declared in merger.h, presumably it blocks while the merger is
		// working - confirm there.
		indexer::merger::lock();

		// Scoped lock, so m_lock is released even if push_back throws (for example std::bad_alloc).
		std::lock_guard<std::mutex> guard(m_lock);

		// Amortized constant
		m_key_cache.push_back(key);
		m_record_cache.push_back(record);

		assert(m_record_cache.size() == m_key_cache.size());

	}

	/*
	 * Number of bytes currently allocated by the in-memory cache, that is the capacity (not the size) of
	 * m_key_cache and m_record_cache.
	 * */
	template<typename data_record>
	size_t basic_index_builder<data_record>::cache_size() const {
		const size_t key_bytes = m_key_cache.capacity() * sizeof(uint64_t);
		const size_t record_bytes = m_record_cache.capacity() * sizeof(data_record);

		return key_bytes + record_bytes;
	}

	/*
	 * Appends the in-memory cache to the two cache files (records and keys) and releases the memory.
	 * Throws if one of the cache files can not be opened.
	 * */
	template<typename data_record>
	void basic_index_builder<data_record>::append() {

		assert(m_record_cache.size() == m_key_cache.size());

		const std::string record_path = cache_filename();
		std::ofstream record_writer(record_path, std::ios::binary | std::ios::app);
		if (!record_writer.is_open()) {
			throw LOG_ERROR_EXCEPTION("Could not open full text shard (" + record_path + "). Error: " +
				std::string(strerror(errno)));
		}

		const std::string key_path = key_cache_filename();
		std::ofstream key_writer(key_path, std::ios::binary | std::ios::app);
		if (!key_writer.is_open()) {
			throw LOG_ERROR_EXCEPTION("Could not open full text shard (" + key_path + "). Error: " +
				std::string(strerror(errno)));
		}

		// Records and keys are written in the same order, entry i of one file belongs to entry i of the other.
		const size_t record_bytes = m_record_cache.size() * sizeof(data_record);
		const size_t key_bytes = m_key_cache.size() * sizeof(uint64_t);
		record_writer.write((const char *)m_record_cache.data(), record_bytes);
		key_writer.write((const char *)m_key_cache.data(), key_bytes);

		// Drop the content and give the memory back.
		m_record_cache.clear();
		m_key_cache.clear();
		m_record_cache.shrink_to_fit();
		m_key_cache.shrink_to_fit();
	}

	/*
	 * Merges the cache files into the data file: loads data file and cache files, sorts and deduplicates
	 * the records per key, rewrites the data file and finally removes the cache files.
	 * */
	template<typename data_record>
	void basic_index_builder<data_record>::merge() {
		// 1. Load existing data plus everything that was appended since the last merge.
		read_append_cache();
		// 2. Normalise the record list of every key.
		sort_cache();
		// 3. Write the new data file.
		save_file();
		// 4. The cache files are merged now, remove them.
		truncate_cache_files();
	}

	/*
	 * Rewrites the data file with every record replaced by the result of the transform callback.
	 * The callback receives the record and the number of records stored for the same key.
	 * The cache files are removed afterwards.
	 * */
	template<typename data_record>
	void basic_index_builder<data_record>::transform(const std::function<data_record(const data_record &, size_t)> &transform) {

		read_data_to_cache();

		// Apply transforms.
		for (auto &entry : m_cache) {
			std::vector<data_record> &records = entry.second;
			for (size_t idx = 0; idx < records.size(); idx++) {
				records[idx] = transform(records[idx], records.size());
			}
		}

		save_file();
		truncate_cache_files();
	}

	/*
	 * Rewrites the data file with the records of every key sorted by the given comparator.
	 * The cache files are removed afterwards.
	 * */
	template<typename data_record>
	void basic_index_builder<data_record>::sort_by(const std::function<bool(const data_record &a, const data_record &b)> comp) {
		read_data_to_cache();

		for (auto &entry : m_cache) {
			std::vector<data_record> &records = entry.second;
			std::sort(records.begin(), records.end(), comp);
		}

		save_file();
		truncate_cache_files();
	}

	/*
	 * Deletes ALL data from this shard: creates the directories, removes the cache files and empties the
	 * data file.
	 * */
	template<typename data_record>
	void basic_index_builder<data_record>::truncate() {
		create_directories();
		truncate_cache_files();

		{
			// Opening with std::ios::trunc empties the file, the stream is closed at the end of the scope.
			std::ofstream target_writer(target_filename(), std::ios::trunc);
		}
	}

	/*
	 * Clears the in-memory map m_cache and deletes both cache files (records and keys).
	 * */
	template<typename data_record>
	void basic_index_builder<data_record>::truncate_cache_files() {

		reset_cache_variables();

		const std::string record_cache_path = cache_filename();
		const std::string key_cache_path = key_cache_filename();

		file::delete_file(record_cache_path);
		file::delete_file(key_cache_path);
	}

	/*
	 * Creates the "<data path>/<n>/full_text/<db name>" directory for the mount points n = 0..7.
	 * */
	template<typename data_record>
	void basic_index_builder<data_record>::create_directories() {
		const std::string db_dir = "/full_text/" + m_db_name;

		for (size_t mount = 0; mount < 8; mount++) {
			boost::filesystem::create_directories(config::data_path() + "/" + std::to_string(mount) + db_dir);
		}
	}

	/*
	 * Loads the data file and then the two cache files (records and keys) into m_cache.
	 * Entry i of the key cache file is the key of entry i of the record cache file.
	 * Throws if one of the cache files can not be opened.
	 * */
	template<typename data_record>
	void basic_index_builder<data_record>::read_append_cache() {

		// Read the current file.
		read_data_to_cache();

		//profiler::instance prof("index_builder::read_append_cache");

		// Read the cache into memory.
		std::ifstream reader(cache_filename(), std::ios::binary);
		if (!reader.is_open()) {
			throw LOG_ERROR_EXCEPTION("Could not open full text shard (" + cache_filename() + "). Error: " + std::string(strerror(errno)));
		}

		std::ifstream key_reader(key_cache_filename(), std::ios::binary);
		if (!key_reader.is_open()) {
			throw LOG_ERROR_EXCEPTION("Could not open full text shard (" + key_cache_filename() + "). Error: " + std::string(strerror(errno)));
		}

		const size_t buffer_len = 10000;

		// NOTE(review): on bad_alloc the function returns with only the data file loaded. merge() then goes
		// on and removes the cache files, consider propagating the error instead.
		std::unique_ptr<data_record[]> buffer;
		try {
			buffer = std::make_unique<data_record[]>(buffer_len);
		} catch (const std::bad_alloc &exception) {
			std::cout << "bad_alloc detected: " << exception.what() << " file: " << __FILE__ << " line: " << __LINE__ << std::endl;
			std::cout << "tried to allocate: " << buffer_len * sizeof(data_record) << " bytes" << std::endl;
			return;
		}

		std::unique_ptr<uint64_t[]> key_buffer;
		try {
			key_buffer = std::make_unique<uint64_t[]>(buffer_len);
		} catch (const std::bad_alloc &exception) {
			std::cout << "bad_alloc detected: " << exception.what() << " file: " << __FILE__ << " line: " << __LINE__ << std::endl;
			std::cout << "tried to allocate: " << buffer_len * sizeof(uint64_t) << " bytes" << std::endl;
			return;
		}

		for (;;) {

			reader.read(reinterpret_cast<char *>(buffer.get()), buffer_len * sizeof(data_record));
			key_reader.read(reinterpret_cast<char *>(key_buffer.get()), buffer_len * sizeof(uint64_t));

			const size_t num_records = static_cast<size_t>(reader.gcount()) / sizeof(data_record);
			const size_t num_keys = static_cast<size_t>(key_reader.gcount()) / sizeof(uint64_t);

			// Only use entries that were read from both files, so a short key file can not pair records
			// with stale keys from a previous chunk.
			const size_t num_pairs = num_records < num_keys ? num_records : num_keys;

			for (size_t i = 0; i < num_pairs; i++) {
				m_cache[key_buffer[i]].push_back(buffer[i]);
			}

			// Stop on end of file and on any other stream error. Testing eof() alone would loop forever
			// if a stream fails without reaching the end.
			if (!reader || !key_reader) break;
		}
	}

	/*
	 * Replaces m_cache with the content of the data file. m_cache stays empty if the file does not exist
	 * or holds nothing beyond the hash table.
	 * */
	template<typename data_record>
	void basic_index_builder<data_record>::read_data_to_cache() {

		reset_cache_variables();

		std::ifstream reader(target_filename(), std::ios::binary);
		if (!reader.is_open()) {
			return;
		}

		// Determine the file size, there are no pages if the file ends with the hash table.
		reader.seekg(0, std::ios::end);
		const size_t file_size = reader.tellg();
		if (file_size <= this->hash_table_byte_size()) {
			return;
		}

		// The pages start right after the hash table.
		reader.seekg(this->hash_table_byte_size(), std::ios::beg);

		for (;;) {
			if (!this->read_page_into(reader, m_cache)) break;
		}
	}

	/*
	 * Runs sort_record_list on the record list of every key in m_cache.
	 * */
	template<typename data_record>
	void basic_index_builder<data_record>::sort_cache() {
		for (auto &[key, records] : m_cache) {
			sort_record_list(key, records);
		}
	}

	/*
	 * Normalises the record list of one key:
	 *  - sorts the records,
	 *  - merges equal records with operator+= and keeps one record per group,
	 *  - keeps at most m_max_results records, chosen by data_record::truncate_order,
	 *  - leaves the list sorted by operator<.
	 *
	 * key is not used at the moment.
	 * */
	template<typename data_record>
	void basic_index_builder<data_record>::sort_record_list(uint64_t key, std::vector<data_record> &records) {

		// Sort records.
		std::sort(records.begin(), records.end());

		// Sum equal elements. i is the first element of the current run of equal records.
		for (size_t i = 0, j = 1; j < records.size(); j++) {
			if (records[i] != records[j]) {
				i = j;
			} else {
				records[i] += records[j];
			}
		}

		// Delete consecutive equal elements. Only keeping the first unique.
		auto last = std::unique(records.begin(), records.end());
		records.erase(last, records.end());

		if (records.size() > m_max_results) {
			// Sort before truncation
			std::sort(records.begin(), records.end(), typename data_record::truncate_order());

			// Truncate to the limit that was checked above. Resizing to the global config value instead
			// would ignore a per-instance limit and could even grow the list with default records.
			records.resize(m_max_results);

			// Future fix here is to add hyper log log counting for words with too many urls.
		}

		std::sort(records.begin(), records.end());
	}

	/*
	 * Empties the in-memory map m_cache.
	 * */
	template<typename data_record>
	void basic_index_builder<data_record>::reset_cache_variables() {
		m_cache.clear();
	}

	/*
	 * Writes m_cache to the data file, replacing its previous content.
	 * The keys are grouped into pages by hash table slot (key modulo hash table size, or one single page
	 * if there is no hash table). Each page is written and its position is stored in the hash table.
	 * Throws if the data file can not be opened.
	 * */
	template<typename data_record>
	void basic_index_builder<data_record>::save_file() {

		//profiler::instance prof("index_builder::save_file");

		std::ofstream writer(target_filename(), std::ios::binary | std::ios::trunc);
		if (!writer.is_open()) {
			throw LOG_ERROR_EXCEPTION("Could not open full text shard. Error: " + std::string(strerror(errno)));
		}

		reset_key_map(writer);

		// Group the keys by the slot they hash to.
		const size_t table_size = this->m_hash_table_size;
		std::map<uint64_t, std::vector<uint64_t>> pages;
		for (const auto &entry : m_cache) {
			const uint64_t key = entry.first;
			const uint64_t slot = table_size ? key % table_size : 0;
			pages[slot].push_back(key);
		}

		for (const auto &[slot, page_keys] : pages) {
			const size_t page_pos = write_page(writer, page_keys);
			write_key(writer, slot, page_pos);
			writer.flush();
		}
	}

	/*
	 * Stores page_pos in slot number key of the hash table at the start of the file.
	 * Does nothing if the index has no hash table. Leaves the write position inside the hash table.
	 * */
	template<typename data_record>
	void basic_index_builder<data_record>::write_key(std::ofstream &key_writer, uint64_t key, size_t page_pos) {
		if (this->m_hash_table_size == 0) {
			return;
		}

		assert(key < this->m_hash_table_size);

		const uint64_t slot_offset = key * sizeof(uint64_t);
		key_writer.seekp(slot_offset);
		key_writer.write((char *)&page_pos, sizeof(size_t));
	}

	/*
	 * Writes one page holding the given keys at the end of the file and returns the position of the page.
	 *
	 * Page layout (all header fields are 8 bytes wide):
	 *   num_keys | keys[num_keys] | positions[num_keys] | lengths[num_keys] | record data
	 * positions are byte offsets of the records of a key relative to the start of the record data,
	 * lengths are in bytes. The records are taken from m_cache.
	 * */
	template<typename data_record>
	size_t basic_index_builder<data_record>::write_page(std::ofstream &writer, const std::vector<uint64_t> &keys) {

		writer.seekp(0, std::ios::end);

		const size_t page_pos = writer.tellp();

		// The header fields are stored as uint64_t so exactly 8 bytes per entry are written from storage
		// that really is 8 bytes wide.
		const uint64_t num_keys = keys.size();

		writer.write(reinterpret_cast<const char *>(&num_keys), sizeof(uint64_t));
		writer.write(reinterpret_cast<const char *>(keys.data()), keys.size() * sizeof(uint64_t));

		std::vector<uint64_t> v_pos;
		std::vector<uint64_t> v_len;
		v_pos.reserve(keys.size());
		v_len.reserve(keys.size());

		uint64_t pos = 0;
		for (uint64_t key : keys) {

			// Store position and length
			const uint64_t len = m_cache[key].size() * sizeof(data_record);

			v_pos.push_back(pos);
			v_len.push_back(len);

			pos += len;
		}

		writer.write(reinterpret_cast<const char *>(v_pos.data()), v_pos.size() * sizeof(uint64_t));
		writer.write(reinterpret_cast<const char *>(v_len.data()), v_len.size() * sizeof(uint64_t));

		// Write data.
		for (size_t i = 0; i < keys.size(); i++) {
			writer.write(reinterpret_cast<const char *>(m_cache[keys[i]].data()), v_len[i]);
		}

		return page_pos;
	}

	/*
	 * Writes the initial hash table at the start of the file: one 8 byte entry per slot, each set to
	 * SIZE_MAX.
	 * */
	template<typename data_record>
	void basic_index_builder<data_record>::reset_key_map(std::ofstream &key_writer) {
		const uint64_t empty_slot = SIZE_MAX;
		const size_t num_slots = this->m_hash_table_size;

		key_writer.seekp(0);
		for (size_t slot = 0; slot != num_slots; ++slot) {
			key_writer.write((const char *)&empty_slot, sizeof(uint64_t));
		}
	}

	/*
	 * Name of the mount directory for this shard: the shard id modulo 8, as a decimal string.
	 * */
	template<typename data_record>
	std::string basic_index_builder<data_record>::mountpoint() const {
		const size_t mount_number = m_id % 8;
		return std::to_string(mount_number);
	}

	/*
	 * Path of the record cache file. An explicit file name takes precedence, otherwise the path is built
	 * from the data path, mount point, database name and shard id.
	 * */
	template<typename data_record>
	std::string basic_index_builder<data_record>::cache_filename() const {
		if (!m_file_name.empty()) {
			return m_file_name + ".cache";
		}

		const std::string shard_dir = config::data_path() + "/" + mountpoint() + "/full_text/" + m_db_name;
		return shard_dir + "/" + std::to_string(m_id) + ".cache";
	}

	/*
	 * Path of the key cache file, built the same way as cache_filename() but with the ".cache.keys"
	 * extension.
	 * */
	template<typename data_record>
	std::string basic_index_builder<data_record>::key_cache_filename() const {
		if (!m_file_name.empty()) {
			return m_file_name + ".cache.keys";
		}

		const std::string shard_dir = config::data_path() + "/" + mountpoint() + "/full_text/" + m_db_name;
		return shard_dir + "/" + std::to_string(m_id) + ".cache.keys";
	}

	/*
	 * Path of the data file, built the same way as cache_filename() but with the ".data" extension.
	 * */
	template<typename data_record>
	std::string basic_index_builder<data_record>::target_filename() const {
		if (!m_file_name.empty()) {
			return m_file_name + ".data";
		}

		const std::string shard_dir = config::data_path() + "/" + mountpoint() + "/full_text/" + m_db_name;
		return shard_dir + "/" + std::to_string(m_id) + ".data";
	}

}


================================================
FILE: src/indexer/console.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "console.h"
#include <vector>
#include <iomanip>
#include "text/text.h"
#include "indexer/index_manager.h"
#include "indexer/sharded.h"
#include "indexer/basic_index.h"
#include "indexer/counted_record.h"
#include "URL.h"
#include "transfer/transfer.h"
#include "domain_stats/domain_stats.h"
#include "merger.h"
#include "file/tsv_file_remote.h"
#include "algorithm/bloom_filter.h"
#include "parser/parser.h"
#include "http/server.h"
#include "json.hpp"

namespace indexer {

	/*
	 * Console command that indexes one batch.
	 *
	 * args[1] is the batch name, the optional args[2] is the maximum number of paths to process (0 or
	 * missing means all of them). Returns immediately when no batch name is given.
	 *
	 * The paths are read from crawl-data/<batch>/warc.paths.gz, downloaded to disk, handed to the
	 * index manager and deleted again. The merge thread runs for the duration of the command.
	 * */
	void cmd_index(index_manager &idx_manager, const std::vector<std::string> &args) {
		if (args.size() < 2) return;

		merger::start_merge_thread();

		const auto batch = args[1];
		const size_t limit = args.size() > 2 ? stoull(args[2]) : 0;

		file::tsv_file_remote warc_paths_file("crawl-data/" + batch + "/warc.paths.gz");
		std::vector<std::string> warc_paths;
		warc_paths_file.read_column_into(0, warc_paths);

		if (limit != 0 && limit < warc_paths.size()) {
			warc_paths.resize(limit);
		}

		// Rewrite the first ".warc.gz" in every path to ".gz" before downloading.
		const std::string warc_suffix = ".warc.gz";
		for (auto &path : warc_paths) {
			const size_t pos = path.find(warc_suffix);
			if (pos == std::string::npos) continue;
			path.replace(pos, warc_suffix.size(), ".gz");
		}

		auto local_files = transfer::download_gz_files_to_disk(warc_paths);
		cout << "starting indexer" << endl;
		idx_manager.add_index_files_threaded(local_files, 24);
		cout << "done with indexer" << endl;
		transfer::delete_downloaded_files(local_files);

		merger::stop_merge_thread();
	}

	/*
	 * Console command that runs a two step search and prints the result.
	 *
	 * 1. The query is looked up in idx_manager; every returned record is printed as a "domain / score"
	 *    row, the host name being resolved through ht.
	 * 2. The hashes of the returned records are posted as raw uint64_t values to the url search
	 *    endpoint. The binary response is a sequence of
	 *        [uint64_t domain_hash][size_t num_records][num_records x (uint64_t value, float score)]
	 *    groups. For every domain, in the order of step 1, the url (first tab separated column of the
	 *    url_ht entry) of each returned record is printed.
	 *
	 * A truncated response is parsed up to the last complete record instead of reading uninitialized
	 * values, and only domains that were actually present in the response are counted in the final
	 * "got N responses" line.
	 * */
	void cmd_search(index_manager &idx_manager, hash_table2::hash_table &ht, hash_table2::hash_table &url_ht, const std::string &query) {

		profiler::instance prof("domain search");
		std::vector<indexer::return_record> res = idx_manager.find(query);
		prof.stop();

		cout << "took " << prof.get() << "ms" << endl;

		cout << setw(50) << "domain";
		cout << setw(20) << "score";
		cout << endl;

		std::vector<uint64_t> domain_hashes;
		domain_hashes.reserve(res.size());

		for (indexer::return_record &rec : res) {
			const auto host = ht.find(rec.m_value);
			domain_hashes.push_back(rec.m_value);

			cout << setw(50) << host;
			cout << setw(20) << rec.m_score;
			cout << endl;
		}

		profiler::instance prof2("url searches");

		cout << "sending " << domain_hashes.size() << " domain hashes" << endl;

		// The hashes are sent as one contiguous binary blob.
		const std::string post_data(reinterpret_cast<const char *>(domain_hashes.data()),
				domain_hashes.size() * sizeof(uint64_t));
		http::response http_res = transfer::post("http://65.108.132.103/?q=" + parser::urlencode(query), post_data);

		const auto url_res = http_res.body();

		std::stringstream ss(url_res);

		std::map<uint64_t, std::vector<url_record>> results;
		while (true) {
			uint64_t incoming_domain_hash = 0;
			size_t num_records = 0;

			// Stop at the end of the stream or as soon as a group header is incomplete.
			if (!ss.read(reinterpret_cast<char *>(&incoming_domain_hash), sizeof(uint64_t))) break;
			if (!ss.read(reinterpret_cast<char *>(&num_records), sizeof(size_t))) break;

			for (size_t i = 0; i < num_records; i++) {
				uint64_t value = 0;
				float score = 0.0f;
				// A failed read leaves the stream in a failed state, which also ends the outer loop.
				if (!ss.read(reinterpret_cast<char *>(&value), sizeof(uint64_t))) break;
				if (!ss.read(reinterpret_cast<char *>(&score), sizeof(float))) break;
				results[incoming_domain_hash].push_back(url_record(value, score));
			}
		}

		for (const auto domain_hash : domain_hashes) {
			// find() instead of operator[] so that domains without a response do not create map entries.
			const auto found = results.find(domain_hash);
			if (found == results.end()) continue;

			for (const auto &record : found->second) {
				const auto line = url_ht.find(record.m_value);
				std::vector<std::string> cols;

				boost::algorithm::split(cols, line, boost::is_any_of("\t"));
				if (cols.empty()) continue;

				// Only the url column is printed, the remaining columns are not needed here.
				std::cout << cols[0] << std::endl;
			}
		}

		cout << "took " << prof2.get() << "ms" << endl;

		cout << "got " << results.size() << " responses" << endl;

	}

	/*
	 * Console command that prints the records stored for a word in the sharded "word_index".
	 *
	 * At most 100000 records are requested. For each record the host (resolved through ht), count,
	 * score, position in the result, raw value and the document size reported by the index builder
	 * are printed on one line.
	 * */
	void cmd_word(index_manager &idx_manager, hash_table2::hash_table &ht, const std::string &query) {

		indexer::sharded_builder<indexer::basic_index_builder, indexer::counted_record> word_index_builder("word_index", 256);
		indexer::sharded<indexer::basic_index, indexer::counted_record> word_index("word_index", 256);

		const uint64_t word_hash = ::algorithm::hash(query);
		std::vector<indexer::counted_record> records = word_index.find(word_hash, 100000);

		for (size_t pos = 0; pos < records.size(); pos++) {
			const auto &rec = records[pos];
			const auto host = ht.find(rec.m_value);
			cout << host << ": " << rec.m_count
				<< " score: " << rec.m_score
				<< " pos: " << pos
				<< " m_value: " << rec.m_value
				<< " doc_size: " << word_index_builder.document_size(rec.m_value)
				<< endl;
		}

	}

	/*
	 * Console command that prints the words counted for a domain in the "title_word_counter" index.
	 *
	 * The records are sorted by descending count and the window [offset, offset + limit) is printed
	 * as "word: count" lines, the word being resolved through ht. The window is clamped to the number
	 * of records, so an offset past the end prints nothing and at most limit lines are printed.
	 * */
	void cmd_domain_info(index_manager &idx_manager, hash_table2::hash_table &ht, const std::string &domain, size_t limit, size_t offset) {

		indexer::sharded<indexer::basic_index, indexer::counted_record> idx("title_word_counter", 997);

		const uint64_t domain_hash = ::algorithm::hash(domain);
		std::vector<indexer::counted_record> res = idx.find(domain_hash);

		std::sort(res.begin(), res.end(), indexer::counted_record::truncate_order());

		// Compute the window without ever evaluating offset + limit, which could overflow.
		const size_t first = offset < res.size() ? offset : res.size();
		const size_t available = res.size() - first;
		const size_t last = first + (limit < available ? limit : available);

		for (size_t pos = first; pos < last; pos++) {
			const auto word = ht.find(res[pos].m_value);
			cout << word << ": " << res[pos].m_count << endl;
		}

	}

	/*
	 * Console command that prints the "word_index" records of a word for one specific domain.
	 *
	 * All records of the word are fetched; only those whose host (resolved through ht) equals domain
	 * are printed. The printed position is the position within the complete, unfiltered result.
	 * */
	void cmd_word(index_manager &idx_manager, hash_table2::hash_table &ht, const std::string &query, const std::string &domain) {

		indexer::sharded_builder<indexer::basic_index_builder, indexer::counted_record> word_index_builder("word_index", 256);
		indexer::sharded<indexer::basic_index, indexer::counted_record> word_index("word_index", 256);

		const uint64_t word_hash = ::algorithm::hash(query);
		std::vector<indexer::counted_record> records = word_index.find(word_hash);

		for (size_t pos = 0; pos < records.size(); pos++) {
			const auto &rec = records[pos];
			const auto host = ht.find(rec.m_value);
			if (!(host == domain)) continue;

			cout << host << ": " << rec.m_count
				<< " score: " << rec.m_score
				<< " pos: " << pos
				<< " m_value: " << rec.m_value
				<< " doc_size: " << word_index_builder.document_size(rec.m_value)
				<< endl;
		}

	}

	/*
	 * Console command that prints how many records the sharded "word_index" returns for a word.
	 * */
	void cmd_word_num(index_manager &idx_manager, hash_table2::hash_table &ht, const std::string &query) {

		indexer::sharded<indexer::basic_index, indexer::counted_record> word_index("word_index", 256);

		const uint64_t word_hash = ::algorithm::hash(query);
		const std::vector<indexer::counted_record> matches = word_index.find(word_hash);
		const size_t num_records = matches.size();

		cout << "num_records: " << num_records << endl;

	}

	/*
	 * Console command that prints the harmonic centrality of the url given in args[1].
	 * Does nothing when the url argument is missing.
	 * */
	void cmd_harmonic(const std::vector<std::string> &args) {
		if (args.size() >= 2) {
			const std::string &url = args[1];
			const float harmonic = domain_stats::harmonic_centrality(URL(url));
			cout << "url: " << url << " has harmonic centrality " << harmonic << endl;
		}
	}

	/*
	 * Splits a line of console input into its arguments.
	 *
	 * Space, tab, ',', '|' and '!' are treated as separators. Consecutive separators as well as leading
	 * and trailing ones never produce empty arguments, so an empty or separator-only input gives an
	 * empty vector.
	 *
	 * The input is scanned once and each argument is constructed in place, no intermediate vector of
	 * (possibly empty) tokens is built.
	 * */
	std::vector<std::string> input_to_args(const std::string &input) {
		const char *const word_boundary = " \t,|!";

		std::vector<std::string> words;

		size_t begin = input.find_first_not_of(word_boundary);
		while (begin != std::string::npos) {
			const size_t end = input.find_first_of(word_boundary, begin);
			const size_t length = (end == std::string::npos ? input.size() : end) - begin;
			words.emplace_back(input, begin, length);

			// With end == npos this returns npos as well, which terminates the loop.
			begin = input.find_first_not_of(word_boundary, end);
		}

		return words;
	}

	/*
	 * Placeholder: the body is empty, so calling this function currently does nothing.
	 * */
	void console() {
	}

	/*
	 * Indexes the links of one batch.
	 *
	 * The bloom filter stored in <data_path>/0/url_filter.bloom is loaded once and passed to the index
	 * manager together with the downloaded files. The paths in crawl-data/<batch>/warc.paths are
	 * processed in chunks of 1000; every chunk gets its own index manager and its own merge thread
	 * start/stop. The function returns when a chunk comes back empty.
	 * */
	void index_link_batch(const std::string &batch) {

		::algorithm::bloom_filter urls_to_index(625000027);
		urls_to_index.read_file(config::data_path() + "/0/url_filter.bloom");

		size_t limit = 1000;
		for (size_t offset = 0; ; offset += limit) {
			indexer::index_manager idx_manager;

			merger::start_merge_thread();

			file::tsv_file_remote warc_paths_file("crawl-data/" + batch + "/warc.paths");
			std::vector<std::string> warc_paths;
			warc_paths_file.read_column_into(0, warc_paths, limit, offset);

			// No more paths, the merge thread started above still has to be stopped.
			if (warc_paths.empty()) {
				merger::stop_merge_thread();
				break;
			}

			auto local_files = transfer::download_gz_files_to_disk(warc_paths);
			cout << "starting indexer" << endl;
			idx_manager.add_link_files_threaded(local_files, 32, urls_to_index);
			cout << "done with indexer" << endl;
			transfer::delete_downloaded_files(local_files);

			merger::stop_merge_thread();
		}
	}

	/*
	 * Downloads the domain stats and then indexes the links of every batch in config::link_batches,
	 * one batch after the other.
	 * */
	void index_links() {

		domain_stats::download_domain_stats();
		LOG_INFO("Done download_domain_stats");

		for (const auto &batch : config::link_batches) index_link_batch(batch);
	}

	/*
	 * Indexes the urls of one batch.
	 *
	 * The paths in crawl-data/<batch>/warc.paths are processed in chunks of 1000; every chunk gets its
	 * own index manager and its own merge thread start/stop. When a chunk comes back empty the loop
	 * ends and the profiler report is printed.
	 * */
	void index_url_batch(const std::string &batch) {

		size_t limit = 1000;
		for (size_t offset = 0; ; offset += limit) {
			indexer::index_manager idx_manager;

			merger::start_merge_thread();

			file::tsv_file_remote warc_paths_file("crawl-data/" + batch + "/warc.paths");
			std::vector<std::string> warc_paths;
			warc_paths_file.read_column_into(0, warc_paths, limit, offset);

			// No more paths, the merge thread started above still has to be stopped.
			if (warc_paths.empty()) {
				merger::stop_merge_thread();
				break;
			}

			cout << "downloading " << warc_paths.size() << " to disc" << endl;
			auto local_files = transfer::download_gz_files_to_disk(warc_paths);
			cout << "starting indexer" << endl;
			idx_manager.add_index_files_threaded(local_files, 32);
			cout << "done with indexer" << endl;
			transfer::delete_downloaded_files(local_files);

			merger::stop_merge_thread();
		}
		profiler::print_report();
	}

	/*
	 * Downloads the domain stats and then indexes the urls of every batch in config::batches,
	 * one batch after the other.
	 * */
	void index_urls() {

		domain_stats::download_domain_stats();
		LOG_INFO("Done download_domain_stats");

		for (const auto &batch : config::batches) index_url_batch(batch);
	}

	/*
	 * Creates an index manager and asks it to truncate its links. The manager is destroyed again
	 * before the function returns.
	 * */
	void truncate_links() {
		indexer::index_manager idx_manager;
		idx_manager.truncate_links();
	}

	/*
	 * Parses a request parameter as a non-negative count.
	 *
	 * Returns fallback when the text is not a number, does not fit in a long long or is negative. This
	 * keeps a malformed request from throwing out of the request handler and keeps negative values
	 * from wrapping around to a huge size_t.
	 * */
	static size_t parse_size_param(const std::string &text, size_t fallback) {
		try {
			const long long parsed = std::stoll(text);
			return parsed < 0 ? fallback : static_cast<size_t>(parsed);
		} catch (const std::exception &) {
			return fallback;
		}
	}

	/*
	 * Returns text with the characters &, <, >, " and ' replaced by their HTML entities so that it
	 * can be embedded in a HTML page as plain text.
	 * */
	static std::string escape_html_text(const std::string &text) {
		std::string escaped;
		escaped.reserve(text.size());
		for (const char ch : text) {
			switch (ch) {
				case '&': escaped += "&amp;"; break;
				case '<': escaped += "&lt;"; break;
				case '>': escaped += "&gt;"; break;
				case '"': escaped += "&quot;"; break;
				case '\'': escaped += "&#39;"; break;
				default: escaped += ch;
			}
		}
		return escaped;
	}

	/*
	 * Starts a http server that shows the words counted for a domain.
	 *
	 * The domain is taken from the request path (without the leading slash); the optional query
	 * parameters "limit" (default 1000) and "offset" (default 0) select the window of words shown.
	 * Invalid or negative values fall back to the defaults. Requests for /favicon.ico get a 404.
	 *
	 * The page contains
	 *  - the domain, its harmonic centrality and its hash,
	 *  - all words of the "first_page_title_word_counter" index,
	 *  - the leading words of the "title_word_counter" index whose count is at least 80% of the
	 *    highest count, followed by a window of the remaining words,
	 *  - a window of the words of the "link_word_counter" index.
	 * All lists are sorted by descending count and every window holds at most limit words.
	 *
	 * The domain and the words come from the request and from indexed content, so they are HTML
	 * escaped before they are written to the page.
	 * */
	void domain_info_server() {

		domain_stats::download_domain_stats();
		LOG_INFO("Done download_domain_stats");

		indexer::index_manager idx_manager;
		hash_table2::hash_table ht("word_hash_table");

		indexer::sharded<indexer::basic_index, counted_record> fp_title_counter("first_page_title_word_counter", 101);
		indexer::sharded<indexer::basic_index, indexer::counted_record> title_counter("title_word_counter", 997);
		indexer::sharded<indexer::basic_index, indexer::counted_record> link_counter("link_word_counter", 4001);

		cout << "starting server..." << endl;

		::http::server srv([&ht, &fp_title_counter, &title_counter, &link_counter](const http::request &req) {
			http::response res;

			URL url = req.url();

			auto query = url.query();

			size_t limit = 1000;
			if (query.count("limit")) limit = parse_size_param(query["limit"], limit);

			size_t offset = 0;
			if (query.count("offset")) offset = parse_size_param(query["offset"], offset);

			if (url.path() == "/favicon.ico") {
				res.code(404);
				res.body("404");
				return res;
			}

			std::stringstream body;

			auto domain = url.path();
			domain.erase(0, 1);

			const uint64_t domain_hash = ::algorithm::hash(domain);

			// Writes "word: count" lines for records[first, first + count), clamped to the available
			// records. Counting down avoids evaluating first + count, which could overflow.
			const auto append_rows = [&ht, &body](const auto &records, size_t first, size_t count) {
				for (size_t i = first; i < records.size() && count > 0; i++, count--) {
					body << escape_html_text(ht.find(records[i].m_value)) << ": " << records[i].m_count << endl;
				}
			};

			body << "<html><head><meta http-equiv='Content-type' content='text/html; charset=utf-8'></head><body>";

			body << "<h1>" << escape_html_text(domain) << "</h1>" << endl;
			body << "<h3>harmonic: " << domain_stats::harmonic_centrality(domain) << "</h3>" << endl;
			body << "<h3>hash: " << domain_hash << "</h3>" << endl;

			body << "<pre>";

			auto fp_results = fp_title_counter.find(domain_hash);
			auto results = title_counter.find(domain_hash);
			auto link_results = link_counter.find(domain_hash);

			std::sort(fp_results.begin(), fp_results.end(), indexer::counted_record::truncate_order());
			std::sort(results.begin(), results.end(), indexer::counted_record::truncate_order());
			std::sort(link_results.begin(), link_results.end(), indexer::counted_record::truncate_order());

			body << "Limit: " + std::to_string(limit) << endl;
			body << "Offset: " + std::to_string(offset) << endl << endl;
			const size_t original_offset = offset;
			body << "</pre>";
			body << "<div class=lefter>";

			// First page title words are always shown completely.
			body << "<pre class=green>";
			append_rows(fp_results, 0, fp_results.size());
			body << "</pre>";

			// Leading title words: everything with a count of at least 80% of the highest count.
			body << "<pre class=green>";
			const double threshold = results.size() ? results[0].m_count : 0.0;
			size_t offset_start = 0;
			while (offset_start < results.size() && results[offset_start].m_count >= threshold * 0.8) {
				offset_start++;
			}
			append_rows(results, 0, offset_start);
			body << "</pre>";

			// The window of remaining title words never overlaps the leading words printed above.
			if (offset < offset_start) offset = offset_start;

			body << "<pre>";
			append_rows(results, offset, limit);
			body << "</pre></div><pre class=righter>";

			// The link words use the offset exactly as requested.
			append_rows(link_results, original_offset, limit);

			body << "</pre><style>.lefter {width: 50%; float: left; }</style></body></html>";

			res.code(200);

			res.body(body.str());

			return res;
		});
	}

	void make_domain_index() {

		/*sharded_index<domain_record> idx("domain_info", 997);

		size_t count = 0;
		idx.for_each([&count](uint64_t key, roaring::Roaring &recs) {
			count++;
		});

		cout << "num_words: " << count << endl;

		return;*/

		domain_stats::download_domain_stats();
		LOG_INFO("Done download_domain_stats");

		indexer::sharded<indexer::basic_index, counted_record> fp_title_counter("first_page_title_word_counter", 101);
		indexer::sharded<indexer::basic_index, indexer::counted_record> title_counter("title_word_counter", 997);
		indexer::sharded<indexer::basic_index, indexer::counted_record> link_counter("link_word_counter", 4001);

		merger::start_merge_thread();

		sharded_index_builder<domain_record> idx("domain_info", 997);
		idx.truncate();

		fp_title_counter.for_each([&idx](uint64_t domain_hash, std::vector<counted_record> &records) {
			for (const auto &record : records) {
				idx.add(record.m_value, domain_record(domain_hash, 0.0f));
			}
		});

		merger::stop_merge_thread_only_append();
		idx.merge();
		merger::start_merge_thread();

		title_counter.for_each([&idx](uint64_t domain_hash, std::vector<counted_record> &records) {

			// Sort by score.
			sort(records.begin(), records.end(), counted_record::truncate_order());
			float threshold = records.size() > 0 ? records[0].m_count * 0.8f : 0.0f;
			for (const auto &record : records) {
				if (record.m_count < threshold) break;
				idx.add(record.m_value, domain_record(domain_hash, 0.0f));
			}
		});

		merger::stop_merge_thread_only_append();
		idx.merge();
		merger::start_merge_thread();

		link_counter.for_each([&idx](uint64_t domain_hash, std::vector<counted_record> &records) {

			// Sort by score.
			sort(records.begin(), records.end(), counted_record::truncate_order());
			for (size_t i = 0; i < records.size() && i < 100; i++) {
				idx.add(records[i].m_value, domain_record(domain_hash, 0.0f));
			}
		});

		merger::stop_merge_thread_only_append();
		idx.merge();
		idx.optimize();
	}

	/*
	 * Sets the score of every record in the sharded "domain_info" index to the harmonic centrality
	 * of its domain.
	 *
	 * The domain is looked up by the record's m_value in the "index_manager" hash table. The domain
	 * stats are downloaded first.
	 * */
	void make_domain_index_scores() {

		domain_stats::download_domain_stats();
		LOG_INFO("Done download_domain_stats");

		hash_table2::hash_table ht("index_manager");

		sharded_index_builder<domain_record> idx("domain_info", 997);

		// The callback runs once per record, so it only does the lookup and the assignment.
		idx.for_each_record([&ht](domain_record &rec) {
			const auto domain = ht.find(rec.m_value);
			rec.m_score = domain_stats::harmonic_centrality(domain);
		});

	}

	/*
	 * Builds a bloom filter from all keys of the "index_manager" hash table and writes it to
	 * <data_path>/0/url_filter.bloom.
	 * */
	void make_url_bloom_filter() {

		hash_table2::hash_table ht("index_manager");

		::algorithm::bloom_filter urls_to_index(625000027);

		const auto insert_key = [&urls_to_index](uint64_t key) {
			urls_to_index.insert(key);
		};
		ht.for_each_key(insert_key);

		urls_to_index.write_file(config::data_path() + "/0/url_filter.bloom");

	}

	/*
	 * Walks over all keys of the sharded "url_index" and counts those that hold at least
	 * config::ft_max_results_per_section records. The running count is printed every time such a key
	 * is found.
	 * */
	void count_words_that_hit_max() {

		sharded<basic_index, url_record> url_index("url_index", 4001);

		size_t counter = 0;
		url_index.for_each([&counter](uint64_t key, auto &records) {
			if (records.size() < config::ft_max_results_per_section) return;
			std::cout << ++counter << std::endl;
		});

	}

	/*
	 * Returns the url count reported by a freshly created index manager.
	 * */
	size_t count_urls() {
		indexer::index_manager idx_manager;
		const size_t total = idx_manager.url_count();
		return total;
	}

}


================================================
FILE: src/indexer/console.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include <iostream>

namespace indexer {

	// Defined in console.cpp with an empty body, calling it currently does nothing.
	void console();
	// Downloads the domain stats and indexes the links of every batch in config::link_batches.
	void index_links();
	// Downloads the domain stats and indexes the urls of every batch in config::batches.
	void index_urls();
	// Creates an index manager and calls truncate_links() on it.
	void truncate_links();
	// Starts a http server that renders the word counters of the domain given in the request path.
	void domain_info_server();
	// NOTE(review): no definition of this function is present in console.cpp.
	void search_server();
	// Rebuilds the sharded "domain_info" index from the three word counter indexes.
	void make_domain_index();
	// Sets the score of every "domain_info" record to the harmonic centrality of its domain.
	void make_domain_index_scores();
	// Writes a bloom filter of all keys of the "index_manager" hash table to url_filter.bloom.
	void make_url_bloom_filter();
	// NOTE(review): no definition of this function is present in console.cpp.
	void optimize_urls();
	// Prints a running count of "url_index" keys holding at least config::ft_max_results_per_section records.
	void count_words_that_hit_max();
	// Returns the url count reported by a freshly created index manager.
	size_t count_urls();

}


================================================
FILE: src/indexer/counted_record.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include <iostream>

namespace indexer {
	/*
	 * Record type for counting things: a value together with the number of times it was seen and a
	 * score. Packed with 4 byte alignment.
	 *
	 * Two records are considered the same record when their values are equal; adding a record to
	 * another one sums the counts and leaves value and score of the left hand side untouched.
	 * */
	#pragma pack(4)
	class counted_record {

		public:
		uint64_t m_value;
		uint64_t m_count;
		float m_score;

		// Every constructor starts with a count of 1 unless a count is given explicitly.
		counted_record() : counted_record(0, 0.0f, 1) {}
		counted_record(uint64_t value) : counted_record(value, 0.0f, 1) {}
		counted_record(uint64_t value, float score) : counted_record(value, score, 1) {}
		counted_record(uint64_t value, float score, size_t count)
				: m_value(value), m_count(count), m_score(score) {}

		bool operator==(const counted_record &b) const {
			return storage_equal(b);
		}

		bool operator<(const counted_record &b) const {
			return b.m_value > m_value;
		}

		counted_record &operator+=(const counted_record &b) {
			m_count = m_count + b.m_count;
			return *this;
		}

		/*
		 * Order applied before truncating, records that sort first are kept: highest count first.
		 * */
		struct truncate_order {
			bool operator() (const counted_record &lhs, const counted_record &rhs) {
				return rhs.m_count < lhs.m_count;
			}
		};

		/*
		 * Order applied before storing on disk, which is also the order records are returned in:
		 * ascending value.
		 * */
		struct storage_order {
			bool operator() (const counted_record &lhs, const counted_record &rhs) {
				return rhs.m_value > lhs.m_value;
			}
		};

		bool storage_equal(const counted_record &a) const {
			return a.m_value == m_value;
		}

	};
	#pragma pack()
}


================================================
FILE: src/indexer/domain_link_record.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include <iostream>

namespace indexer {

	/*
	 * Record holding a value, a score and a source/target domain pair. Packed with 4 byte alignment.
	 *
	 * operator== and operator< compare the value, while the storage order and storage equality are
	 * based on the target domain. Every constructor initializes all four members; domains that are
	 * not passed in are set to 0 so that storage_order and storage_equal never read indeterminate
	 * values.
	 * */
	#pragma pack(4)
	class domain_link_record {
		public:
		uint64_t m_value;
		float m_score;
		uint64_t m_source_domain;
		uint64_t m_target_domain;

		domain_link_record() : domain_link_record(0, 0.0f, 0) {};
		domain_link_record(uint64_t value) : domain_link_record(value, 0.0f, 0) {};
		domain_link_record(uint64_t value, float score) : domain_link_record(value, score, 0) {};
		domain_link_record(uint64_t value, float score, uint64_t target_domain)
				: m_value(value), m_score(score), m_source_domain(0), m_target_domain(target_domain) {};

		bool operator==(const domain_link_record &b) const {
			return m_value == b.m_value;
		}

		bool operator<(const domain_link_record &b) const {
			return m_value < b.m_value;
		}

		// Adding records is a no-op for this record type, the left hand side is returned unchanged.
		domain_link_record &operator+=(const domain_link_record &b) {
			return *this;
		}

		/*
		 * Will be applied to records before truncating. Top records will be kept.
		 * */
		struct truncate_order {
			inline bool operator() (const domain_link_record &a, const domain_link_record &b) {
				return a.m_score > b.m_score;
			}
		};

		/*
		 * Will be applied before storing on disk. This is the order the records will be returned in.
		 * */
		struct storage_order {
			inline bool operator() (const domain_link_record &a, const domain_link_record &b) {
				return a.m_target_domain < b.m_target_domain;
			}
		};

		bool storage_equal(const domain_link_record &a) const {
			return m_target_domain == a.m_target_domain;
		}

	};
	#pragma pack()
}


================================================
FILE: src/indexer/domain_record.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include "generic_record.h"

namespace indexer {
	/*
	 * Record type for domains. It adds nothing to generic_record: every constructor forwards its
	 * arguments to the matching generic_record constructor.
	 * */
	class domain_record : public generic_record {

	public:
		domain_record()
				: generic_record{} {}

		domain_record(uint64_t value)
				: generic_record{value} {}

		domain_record(uint64_t value, float score)
				: generic_record{value, score} {}

	};
}


================================================
FILE: src/indexer/generic_record.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include <iostream>

namespace indexer {
	/*
	 * Base class for the records stored on disk: a value and a score, packed with 4 byte alignment.
	 * Needs to be small!
	 *
	 * Records are identified and stored by value; the score only matters for truncation and for
	 * ordering by score.
	 * */
	#pragma pack(4)
	class generic_record {

		public:
		uint64_t m_value;
		float m_score;

		explicit generic_record() : generic_record(0, 0.0f) {}
		explicit generic_record(uint64_t value) : generic_record(value, 0.0f) {}
		explicit generic_record(uint64_t value, float score) : m_value(value), m_score(score) {}

		bool operator==(const generic_record &b) const {
			return storage_equal(b);
		}

		bool operator<(const generic_record &b) const {
			return b.m_value > m_value;
		}

		/*
		 * Order the records are stored in: ascending value.
		 * */
		struct storage_order {
			bool operator() (const generic_record &lhs, const generic_record &rhs) {
				return rhs.m_value > lhs.m_value;
			}
		};

		/*
		 * Order applied before truncating, records that sort first are kept: highest score first.
		 * */
		struct truncate_order {
			bool operator() (const generic_record &lhs, const generic_record &rhs) {
				return rhs.m_score < lhs.m_score;
			}
		};

		/*
		 * Highest score first.
		 * */
		struct score_order {
			bool operator() (const generic_record &lhs, const generic_record &rhs) {
				return rhs.m_score < lhs.m_score;
			}
		};

		bool storage_equal(const generic_record &a) const {
			return a.m_value == m_value;
		}

		/*
		 * Can be overloaded to sum the scores. The default keeps value and score of the left hand
		 * side and ignores b.
		 * */
		generic_record operator+(const generic_record &b) const {
			return generic_record(m_value, m_score);
		}

		/*
		 * Can be overloaded to sum the scores. The default leaves the record unchanged.
		 * */
		generic_record &operator+=(const generic_record &b) {
			return *this;
		}

	};
	#pragma pack()
}


================================================
FILE: src/indexer/index.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include <set>
#include <cmath>
#include <mutex>
#include "index_base.h"
#include "roaring/roaring.hh"
#include "algorithm/intersection.h"
#include "algorithm/top_k.h"

namespace indexer {

	/*
	 * Read side of a single index file. The file maps 64 bit keys to roaring bitmaps of internal ids, where an
	 * internal id is the position of a record in the record section of the file.
	 * All records are loaded into memory on construction, bitmaps are read from the stream on demand.
	 * */
	template<typename data_record>
	class index : public index_base<data_record> {

	public:

		/*
		 * Opens "<file_name>.data".
		 * */
		explicit index(const std::string &file_name);

		/*
		 * Opens the data file for shard id of database db_name below config::data_path().
		 * */
		explicit index(const std::string &db_name, size_t id);
		explicit index(const std::string &db_name, size_t id, size_t hash_table_size);

		/*
		 * Reads from a caller supplied stream. The stream is not owned and must outlive the index.
		 * */
		explicit index(std::istream *reader, size_t hash_table_size);
		~index();

		/*
		 * Returns the records stored for key, read from the stream in internal id order.
		 * */
		std::vector<data_record> find(uint64_t key) const;

		/*
		 * Returns the bitmap of internal ids stored for key, empty if the key has no entry.
		 * */
		roaring::Roaring find_bitmap(uint64_t key) const;

		/*
		 * Find intersection of multiple keys
		 * Returns vector with records in storage order.
		 * */
		std::vector<data_record> find_intersection(const std::vector<uint64_t> &keys) const;

		/*
		 * Find intersection of multiple keys and add the result of the lambda function score_mod to each score.
		 * Returns n records with highest score. total_num_results is set to the size of the whole intersection.
		 * score_mod is applied in storage_order of data_record.
		 * */
		std::vector<data_record> find_top(size_t &total_num_results, const std::vector<uint64_t> &keys, size_t n,
				std::function<float(const data_record &)> score_mod = {}) const;

		/*
		 * Overload without total_num_results.
		 * */
		std::vector<data_record> find_top(const std::vector<uint64_t> &keys, size_t n,
				std::function<float(const data_record &)> score_mod = {}) const;


		/*
		 * Returns the inverse document frequency log(document_count / documents_with_term),
		 * or 0 when documents_with_term is 0.
		 * */
		float get_idf(size_t documents_with_term) const;
		size_t get_document_count() const { return m_unique_count; }

		/*
		 * Prints size statistics of the index file to stdout.
		 * */
		void print_stats();

		/*
		 * Returns all keys whose bitmap holds more than with_more_than_records ids.
		 * */
		std::set<uint64_t> get_keys(size_t with_more_than_records) const;
		const std::vector<data_record> &records() const { return m_records; }

		/*
		 * Calls on_each_key for every key in the file together with its bitmap.
		 * */
		void for_each(std::function<void(uint64_t key, roaring::Roaring &bitmap)> on_each_key) const;

	private:

		// Stream every lookup reads from. Either m_default_reader.get() or a stream supplied by the caller.
		mutable std::istream *m_reader = nullptr;
		std::unique_ptr<std::ifstream> m_default_reader;

		std::string m_file_name;
		std::string m_db_name;
		// Only the (db_name, id) constructors set the id. It defaults to zero so that mountpoint() and
		// meta_filename() never read an indeterminate value.
		size_t m_id = 0;
		size_t m_unique_count = 0;

		std::vector<data_record> m_records;
		// Scratch scores indexed by internal id, written by find_top().
		mutable std::vector<float> m_scores;

		size_t read_key_pos(uint64_t key) const;
		void read_meta();
		std::string mountpoint() const;
		std::string filename() const;
		std::string meta_filename() const;
		void read_records();
		
	};

	/*
	 * Opens the data file that belongs to file_name and loads its records.
	 * */
	template<typename data_record>
	index<data_record>::index(const std::string &file_name)
	: index_base<data_record>(), m_file_name(file_name) {
		// The stream is created in the body: filename() reads m_file_name, which is declared after
		// m_default_reader and is therefore not yet initialized while m_default_reader is constructed.
		auto stream = std::make_unique<std::ifstream>(filename(), std::ios::binary);
		m_reader = stream.get();
		m_default_reader = std::move(stream);
		read_records();
	}

	/*
	 * Opens the data file of shard id in database db_name, using the default hash table size of index_base.
	 * */
	template<typename data_record>
	index<data_record>::index(const std::string &db_name, size_t id)
	: index_base<data_record>(), m_db_name(db_name), m_id(id) {
		// filename() depends on m_db_name and m_id, so the stream can only be opened once they are set.
		auto stream = std::make_unique<std::ifstream>(filename(), std::ios::binary);
		m_reader = stream.get();
		m_default_reader = std::move(stream);
		read_records();
	}

	/*
	 * Opens the data file of shard id in database db_name with an explicit hash table size.
	 * */
	template<typename data_record>
	index<data_record>::index(const std::string &db_name, size_t id, size_t hash_table_size)
	: index_base<data_record>(hash_table_size), m_db_name(db_name), m_id(id) {
		// filename() depends on m_db_name and m_id, so the stream can only be opened once they are set.
		auto stream = std::make_unique<std::ifstream>(filename(), std::ios::binary);
		m_reader = stream.get();
		m_default_reader = std::move(stream);
		read_records();
	}

	/*
	 * Reads the index from a stream supplied by the caller. The stream is only borrowed, m_default_reader
	 * stays empty.
	 * */
	template<typename data_record>
	index<data_record>::index(std::istream *reader, size_t hash_table_size)
	: index_base<data_record>(hash_table_size), m_reader(reader) {
		read_records();
	}

	/*
	 * Nothing to release by hand, the members clean up after themselves.
	 * */
	template<typename data_record>
	index<data_record>::~index() = default;

	/*
	 * Returns the records stored for key. The ids come from the bitmap of the key and every record is read
	 * from the stream, in the order the bitmap yields the ids.
	 * */
	template<typename data_record>
	std::vector<data_record> index<data_record>::find(uint64_t key) const {

		std::lock_guard lock(this->m_lock);

		roaring::Roaring matches = find_bitmap(key);

		// The record section starts after the hash table and the 8 byte record count.
		const size_t records_offset = (this->m_hash_table_size + 1) * sizeof(uint64_t);

		std::vector<data_record> ret;
		for (uint32_t internal_id : matches) {
			data_record rec;
			m_reader->seekg(records_offset + internal_id * sizeof(data_record), std::ios::beg);
			m_reader->read((char *)&rec, sizeof(data_record));
			ret.push_back(rec);
		}

		return ret;
	}

	/*
	 * Returns the bitmap of internal ids stored for key.
	 * An empty bitmap is returned when the hash slot has no page, when the page does not contain the key or when
	 * the page header cannot be read from the stream.
	 * */
	template<typename data_record>
	roaring::Roaring index<data_record>::find_bitmap(uint64_t key) const {

		// The lock must be held before read_key_pos() is called since it seeks and reads on the shared m_reader.
		std::lock_guard lock(this->m_lock);

		// seekg() does nothing while failbit is set. Drop any error state that an earlier read left behind
		// (for example a scan that ran into the end of the file) so this lookup starts from a usable stream.
		m_reader->clear();

		const size_t key_pos = read_key_pos(key);
		if (key_pos == SIZE_MAX) {
			return roaring::Roaring();
		}

		// Read page.
		m_reader->seekg(key_pos, std::ios::beg);
		size_t num_keys = 0;
		m_reader->read((char *)&num_keys, sizeof(size_t));
		if (m_reader->fail()) {
			// Without a key count the rest of the page cannot be interpreted.
			return roaring::Roaring();
		}

		std::unique_ptr<uint64_t[]> keys = std::make_unique<uint64_t[]>(num_keys);
		m_reader->read((char *)keys.get(), num_keys * sizeof(uint64_t));

		// Index of the key inside the page. The last match wins if a key should occur more than once.
		size_t key_data_pos = SIZE_MAX;
		for (size_t i = 0; i < num_keys; i++) {
			if (keys[i] == key) {
				key_data_pos = i;
			}
		}

		if (key_data_pos == SIZE_MAX) {
			return roaring::Roaring();
		}

		// Read position and length. Both are 8 byte values stored in two arrays that follow the key array.
		uint64_t pos = 0;
		uint64_t len = 0;

		m_reader->seekg(key_pos + 8 + num_keys * 8 + key_data_pos * 8, std::ios::beg);
		m_reader->read((char *)&pos, sizeof(pos));

		m_reader->seekg(key_pos + 8 + (num_keys * 8)*2 + key_data_pos * 8, std::ios::beg);
		m_reader->read((char *)&len, sizeof(len));

		if (m_reader->fail()) {
			// Position or length is unknown, treat the key as not found instead of reading at a random offset.
			return roaring::Roaring();
		}

		// pos is relative to the start of the bitmap data, which follows the three header arrays.
		m_reader->seekg(key_pos + 8 + (num_keys * 8)*3 + pos, std::ios::beg);

		std::unique_ptr<char[]> data = std::make_unique<char[]>(len);
		m_reader->read(data.get(), len);

		return roaring::Roaring::readSafe(data.get(), len);
	}

	/*
	 * Returns the in-memory records whose internal id is present in the bitmap of every key,
	 * in the order the intersection yields the ids.
	 * */
	template<typename data_record>
	std::vector<data_record> index<data_record>::find_intersection(const std::vector<uint64_t> &keys) const {

		std::lock_guard lock(this->m_lock);

		// One bitmap per key, in the order of keys.
		std::vector<roaring::Roaring> bitmaps;
		for (const uint64_t key : keys) {
			bitmaps.emplace_back(find_bitmap(key));
		}

		auto matching_ids = ::algorithm::intersection(bitmaps);

		std::vector<data_record> res;
		for (auto internal_id : matching_ids) {
			res.push_back(m_records[internal_id]);
		}

		return res;
	}

	/*
	 * Intersects the bitmaps of all keys, scores every matching record and returns the num best ones sorted
	 * with data_record::truncate_order. An empty key list matches every record.
	 * total_num_results receives the size of the whole intersection.
	 * The score of a record is its stored m_score plus score_mod(record) when score_mod is set.
	 * */
	template<typename data_record>
	std::vector<data_record> index<data_record>::find_top(size_t &total_num_results, const std::vector<uint64_t> &keys, size_t num,
			std::function<float(const data_record &)> score_mod) const {

		std::lock_guard lock(this->m_lock);

		std::vector<roaring::Roaring> bitmaps;
		if (keys.empty()) {
			// Return all records...
			roaring::Roaring all_ids;
			all_ids.addRange(0, m_records.size());
			bitmaps.push_back(all_ids);
		} else {
			for (const uint64_t key : keys) {
				bitmaps.emplace_back(find_bitmap(key));
			}
		}

		auto intersection = ::algorithm::intersection(bitmaps);

		total_num_results = intersection.cardinality();

		// Collect the matching ids and store their (possibly modified) scores in the scratch vector m_scores.
		const bool has_score_mod = static_cast<bool>(score_mod);
		std::vector<uint32_t> ids;
		for (auto internal_id : intersection) {
			ids.push_back(internal_id);
			if (has_score_mod) {
				m_scores[internal_id] = m_records[internal_id].m_score + score_mod(m_records[internal_id]);
			} else {
				m_scores[internal_id] = m_records[internal_id].m_score;
			}
		}

		// Ids are compared by the score that was just computed for them.
		auto by_score = [this](const uint32_t &a, const uint32_t &b) {
			return m_scores[a] < m_scores[b];
		};

		std::vector<uint32_t> top_ids = ::algorithm::top_k<uint32_t>(ids, num, by_score);

		std::vector<data_record> ret;
		for (uint32_t internal_id : top_ids) {
			data_record rec = m_records[internal_id];
			rec.m_score = m_scores[internal_id];
			ret.push_back(rec);
		}

		std::sort(ret.begin(), ret.end(), typename data_record::truncate_order());

		return ret;
	}

	/*
	 * Same as the overload above for callers that do not need the total number of results.
	 * */
	template<typename data_record>
	std::vector<data_record> index<data_record>::find_top(const std::vector<uint64_t> &keys, size_t num,
			std::function<float(const data_record &)> score_mod) const {
		// Receives the total from the full overload and is dropped afterwards.
		size_t discarded_total = 0;
		return find_top(discarded_total, keys, num, score_mod);
	}

	/*
	 * Returns log(m_unique_count / documents_with_term), or 0 when documents_with_term is 0.
	 * */
	template<typename data_record>
	float index<data_record>::get_idf(size_t documents_with_term) const {
		// Guard against dividing by zero for a term that occurs in no document.
		if (documents_with_term == 0) {
			return 0.0f;
		}

		const float corpus_size = (float)m_unique_count;
		return std::log(corpus_size / documents_with_term);
	}

	/*
	 * Reads the file position of the page for key from the hash table at the start of the file.
	 * Returns SIZE_MAX, which callers treat as "no page for this key", if the hash table slot holds SIZE_MAX or
	 * if the slot cannot be read. Returns 0 when the hash table size is 0.
	 * */
	template<typename data_record>
	size_t index<data_record>::read_key_pos(uint64_t key) const {

		if (this->m_hash_table_size == 0) return 0;

		const size_t hash_pos = key % this->m_hash_table_size;

		m_reader->seekg(hash_pos * sizeof(size_t), std::ios::beg);

		size_t pos = SIZE_MAX;
		m_reader->read((char *)&pos, sizeof(size_t));

		if (m_reader->fail()) {
			// A failed or short read leaves pos without a meaningful value. Report "not found" instead of
			// handing an arbitrary file offset to the caller.
			return SIZE_MAX;
		}

		return pos;
	}

	/*
	 * Reads the count of unique records from the meta file and puts it in the m_unique_count member.
	 * m_unique_count is set to 0 if the meta file cannot be opened or is too short.
	 * */
	template<typename data_record>
	void index<data_record>::read_meta() {
		// Layout of the meta file.
		struct meta {
			size_t unique_count;
		};

		// Zero initialized so that a missing file yields a count of 0 and not an indeterminate value.
		meta m{};

		std::ifstream meta_reader(meta_filename(), std::ios::binary);

		if (meta_reader.is_open()) {
			meta_reader.read((char *)(&m), sizeof(meta));
			if (meta_reader.fail()) {
				// A short read may have filled m partially, do not trust it.
				m = meta{};
			}
		}

		m_unique_count = m.unique_count;
	}

	/*
	 * Returns the name of the mount directory for this shard, which is the id modulo 8 as a decimal string.
	 * */
	template<typename data_record>
	std::string index<data_record>::mountpoint() const {
		const size_t mount_number = m_id % 8;
		return std::to_string(mount_number);
	}

	/*
	 * Returns the path of the data file. An explicit file name takes precedence, otherwise the path is built
	 * from the data path, the mountpoint, the database name and the shard id.
	 * */
	template<typename data_record>
	std::string index<data_record>::filename() const {
		if (!m_file_name.empty()) {
			return m_file_name + ".data";
		}

		const std::string db_directory = config::data_path() + "/" + mountpoint() + "/full_text/" + m_db_name;
		return db_directory + "/" + std::to_string(m_id) + ".data";
	}

	/*
	 * Returns the path of the meta file. It is built like the path of the data file but ends in ".meta".
	 * */
	template<typename data_record>
	std::string index<data_record>::meta_filename() const {
		if (!m_file_name.empty()) {
			return m_file_name + ".meta";
		}

		const std::string db_directory = config::data_path() + "/" + mountpoint() + "/full_text/" + m_db_name;
		return db_directory + "/" + std::to_string(m_id) + ".meta";
	}

	/*
	 * Walks every page of the file and prints key counts, section sizes and mean bitmap sizes to stdout.
	 * Section sizes are also printed as a percentage of the file size. Percentages and means are printed as 0
	 * when the file size or the number of keys is 0.
	 * NOTE(review): m_lock is not taken here, concurrent lookups on the same index would interfere.
	 * */
	template<typename data_record>
	void index<data_record>::print_stats() {

		size_t total_num_keys = 0;
		size_t total_num_larger_100 = 0;
		size_t total_num_larger_10 = 0;
		size_t total_num_records = 0;
		size_t total_roaring_size = 0;
		size_t total_cardinality = 0;
		size_t total_page_header_size = 0;

		// seekg() is ignored while failbit is set, so start from a clean stream state.
		m_reader->clear();
		m_reader->seekg(this->hash_table_byte_size(), std::ios::beg);
		m_reader->read((char *)&total_num_records, sizeof(size_t));

		const size_t total_record_size = total_num_records * sizeof(data_record);

		for (size_t page = 0; page < this->m_hash_table_size; page++) {
			// A failed read on an earlier page must not block the seeks for this one.
			m_reader->clear();

			const size_t key_pos = read_key_pos(page);

			if (key_pos == SIZE_MAX) {
				continue;
			}

			// Read page.
			m_reader->seekg(key_pos, std::ios::beg);
			size_t num_keys = 0;
			m_reader->read((char *)&num_keys, sizeof(size_t));
			if (m_reader->fail()) {
				// Page header is unreadable, skip the page.
				continue;
			}

			total_num_keys += num_keys;
			total_page_header_size += num_keys * sizeof(uint64_t) * 3;

			// The key values are not needed for the statistics, only the position and length arrays after them.
			const size_t pos_array = key_pos + 8 + num_keys * 8;
			const size_t len_array = key_pos + 8 + (num_keys * 8)*2;
			const size_t data_start = key_pos + 8 + (num_keys * 8)*3;

			for (size_t i = 0; i < num_keys; i++) {

				// Read position and length.
				uint64_t pos = 0;
				uint64_t len = 0;

				m_reader->seekg(pos_array + i * 8, std::ios::beg);
				m_reader->read((char *)&pos, sizeof(pos));

				m_reader->seekg(len_array + i * 8, std::ios::beg);
				m_reader->read((char *)&len, sizeof(len));

				if (m_reader->fail()) {
					// The rest of this page cannot be located.
					break;
				}

				m_reader->seekg(data_start + pos, std::ios::beg);

				std::unique_ptr<char[]> data = std::make_unique<char[]>(len);
				m_reader->read(data.get(), len);

				roaring::Roaring rr = roaring::Roaring::readSafe(data.get(), len);

				const size_t card = rr.cardinality();
				if (card > 100) {
					total_num_larger_100++;
				}
				if (card > 10) {
					total_num_larger_10++;
				}
				total_cardinality += card;
				total_roaring_size += len;
			}
		}

		// The file size is the denominator of the percentages below.
		m_reader->clear();
		m_reader->seekg(0, std::ios::end);
		const std::streamoff end_pos = m_reader->tellg();
		const size_t total_file_size = end_pos > 0 ? static_cast<size_t>(end_pos) : 0;
		m_reader->clear();

		const auto percent_of_file = [total_file_size](size_t bytes) {
			return total_file_size ? 100*((float)bytes / total_file_size) : 0.0f;
		};
		const auto mean_per_key = [total_num_keys](size_t total) {
			return total_num_keys ? total / total_num_keys : (size_t)0;
		};

		std::cout << "total_num_keys: " << total_num_keys << std::endl;
		std::cout << "total_num_larger_10: " << total_num_larger_10 << std::endl;
		std::cout << "total_num_larger_100: " << total_num_larger_100 << std::endl;
		std::cout << "total_num_records: " << total_num_records << std::endl;
		std::cout << "record size: " << total_record_size << " (" << percent_of_file(total_record_size) << "%)" << std::endl;
		std::cout << "page header size: " << total_page_header_size << " (" << percent_of_file(total_page_header_size) << "%)" << std::endl;
		std::cout << "roaring size: " << total_roaring_size << " (" << percent_of_file(total_roaring_size) << "%)" << std::endl;
		std::cout << "mean length for key: " << mean_per_key(total_roaring_size) << std::endl;
		std::cout << "mean cardinality for key: " << mean_per_key(total_cardinality) << std::endl;
	}

	/*
	 * Returns every key whose bitmap contains more than with_more_than_records ids.
	 * Pages (or the rest of a page) whose header cannot be read are skipped.
	 * NOTE(review): m_lock is not taken here, concurrent lookups on the same index would interfere.
	 * */
	template<typename data_record>
	std::set<uint64_t> index<data_record>::get_keys(size_t with_more_than_records) const {

		std::set<uint64_t> all_keys;

		for (size_t page = 0; page < this->m_hash_table_size; page++) {
			// seekg() is ignored while failbit is set. Clear the state so that a failed read on an earlier
			// page (or before this call) does not block the seeks for this page.
			m_reader->clear();

			const size_t key_pos = read_key_pos(page);

			if (key_pos == SIZE_MAX) {
				continue;
			}

			// Read page.
			m_reader->seekg(key_pos, std::ios::beg);
			size_t num_keys = 0;
			m_reader->read((char *)&num_keys, sizeof(size_t));
			if (m_reader->fail()) {
				// Page header is unreadable, skip the page.
				continue;
			}

			std::unique_ptr<uint64_t[]> keys = std::make_unique<uint64_t[]>(num_keys);
			m_reader->read((char *)keys.get(), num_keys * sizeof(uint64_t));
			if (m_reader->fail()) {
				continue;
			}

			// The position and length arrays follow the key array, the bitmap data follows all three arrays.
			const size_t pos_array = key_pos + 8 + num_keys * 8;
			const size_t len_array = key_pos + 8 + (num_keys * 8)*2;
			const size_t data_start = key_pos + 8 + (num_keys * 8)*3;

			for (size_t i = 0; i < num_keys; i++) {

				// Read position and length.
				uint64_t pos = 0;
				uint64_t len = 0;

				m_reader->seekg(pos_array + i * 8, std::ios::beg);
				m_reader->read((char *)&pos, sizeof(pos));

				m_reader->seekg(len_array + i * 8, std::ios::beg);
				m_reader->read((char *)&len, sizeof(len));

				if (m_reader->fail()) {
					// The rest of this page cannot be located.
					break;
				}

				m_reader->seekg(data_start + pos, std::ios::beg);

				std::unique_ptr<char[]> data = std::make_unique<char[]>(len);
				m_reader->read(data.get(), len);

				roaring::Roaring rr = roaring::Roaring::readSafe(data.get(), len);

				const size_t card = rr.cardinality();
				if (card > with_more_than_records) {
					all_keys.insert(keys[i]);
				}
			}
		}

		return all_keys;
	}

	/*
	 * Calls on_each_key(key, bitmap) for every key of every page, reading the pages sequentially from the
	 * position after the record section until no further page can be read.
	 * Exceptions thrown by read_bitmap_page_into() or by the callback are passed on to the caller.
	 * NOTE(review): m_lock is not taken here, a lookup made on the same index while the scan runs (including
	 * one made from the callback) moves the stream position the scan relies on.
	 * */
	template<typename data_record>
	void index<data_record>::for_each(std::function<void(uint64_t key, roaring::Roaring &bitmap)> on_each_key) const {

		// seekg() is ignored while failbit is set, so start from a clean stream state.
		m_reader->clear();
		m_reader->seekg(this->hash_table_byte_size(), std::ios::beg);

		// Skip the record section, the pages follow it.
		size_t num_records = 0;
		m_reader->read((char *)&num_records, sizeof(size_t));
		m_reader->seekg(num_records * sizeof(data_record), std::ios::cur);

		std::map<uint64_t, roaring::Roaring> page;
		while (this->read_bitmap_page_into(*m_reader, page)) {
			for (auto &[key, bitmap] : page) {
				on_each_key(key, bitmap);
			}
			page.clear();
		}

		// The loop ends by reading past the last page, which sets eofbit and failbit on the shared stream.
		// Reset the state so that later lookups can seek again.
		m_reader->clear();
	}

	/*
	 * Loads the record section, which follows the hash table, into m_records and sizes the scratch vector
	 * m_scores to one zeroed entry per record.
	 * */
	template<typename data_record>
	void index<data_record>::read_records() {
		m_reader->seekg(this->hash_table_byte_size());

		// The record count is stored as 8 bytes directly in front of the records.
		size_t record_count = 0;
		m_reader->read((char *)&record_count, sizeof(uint64_t));

		m_records.resize(record_count);
		m_reader->read((char *)m_records.data(), record_count * sizeof(data_record));

		m_scores.assign(record_count, 0.0f);
	}

}


================================================
FILE: src/indexer/index_base.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include <vector>
#include <memory>
#include "config.h"
#include "logger/logger.h"
#include "roaring/roaring.hh"

namespace indexer {

	/*
	 * Common base of the index classes. Holds the hash table size, a recursive mutex for derived classes and
	 * the functions that read a single page from a stream.
	 * */
	template<typename data_record>
	class index_base {

	public:

		/*
		 * Uses config::shard_hash_table_size as hash table size.
		 * */
		index_base();
		explicit index_base(size_t hash_table_size);

		void set_hash_table_size(size_t size) { m_hash_table_size = size; }

	protected:

		// Number of slots in the hash table at the start of the file.
		size_t m_hash_table_size;
		mutable std::recursive_mutex m_lock;

		/*
		 * Read the next page from reader into the map. Both return false when no further page could be read.
		 * */
		bool read_page_into(std::istream &reader, std::map<uint64_t, std::vector<data_record>> &into) const;
		bool read_bitmap_page_into(std::istream &reader, std::map<uint64_t, roaring::Roaring> &into) const;

		// Size of the hash table in bytes, every slot has the size of a size_t.
		size_t hash_table_byte_size() const { return sizeof(size_t) * m_hash_table_size; }
	};

	/*
	 * Default construction uses the hash table size from the configuration.
	 * */
	template<typename data_record>
	index_base<data_record>::index_base()
	: index_base(config::shard_hash_table_size) {
	}

	template<typename data_record>
	index_base<data_record>::index_base(size_t hash_table_size)
	: m_hash_table_size(hash_table_size)
	{}

	/*
	 * Reads one page from the current position of reader and appends the records of every key to into[key].
	 * A page is the key count followed by the key, position and length arrays and the record data.
	 * Returns true if a page was read, also when it holds no data.
	 * Returns false if the stream has no further page, if the page is truncated or if a buffer for it cannot
	 * be allocated.
	 * */
	template<typename data_record>
	bool index_base<data_record>::read_page_into(std::istream &reader, std::map<uint64_t, std::vector<data_record>> &into) const {

		uint64_t num_keys = 0;
		reader.read((char *)&num_keys, sizeof(uint64_t));
		// fail() covers end of file as well as a stream that was already in an error state. Checking only
		// eof() would continue with a key count that was never read.
		if (reader.fail()) return false;

		// The arrays are allocated as uint64_t[] so that a key count whose byte size overflows is reported as
		// an allocation failure and so that the values can be read in place.
		std::unique_ptr<uint64_t[]> keys;
		std::unique_ptr<uint64_t[]> lens;
		try {
			keys = std::make_unique<uint64_t[]>(num_keys);
			lens = std::make_unique<uint64_t[]>(num_keys);
		} catch (const std::bad_alloc &exception) {
			std::cout << "bad_alloc detected: " << exception.what() << " file: " << __FILE__ << " line: " << __LINE__ << std::endl;
			std::cout << "tried to allocate: " << num_keys << " keys" << std::endl;
			return false;
		}

		// Read the keys.
		reader.read((char *)keys.get(), num_keys * sizeof(uint64_t));

		// Read the positions. They are not needed since the data is read sequentially, lens only serves as
		// scratch space to move the stream past them.
		reader.read((char *)lens.get(), num_keys * 8);

		// Read the lengths.
		reader.read((char *)lens.get(), num_keys * 8);

		if (reader.fail()) {
			LOG_INFO("Data stopped before end. Ignoring shard");
			return false;
		}

		size_t max_len = 0;
		size_t data_size = 0;
		for (size_t i = 0; i < num_keys; i++) {
			const size_t len = lens[i];
			if (len > max_len) max_len = len;
			data_size += len;
		}

		if (data_size == 0) return true;

		// One buffer of the largest length is reused for the data of every key.
		std::unique_ptr<char[]> buffer;
		try {
			buffer = std::make_unique<char[]>(max_len);
		} catch (const std::bad_alloc &exception) {
			std::cout << "bad_alloc detected: " << exception.what() << " file: " << __FILE__ << " line: " << __LINE__ << std::endl;
			std::cout << "tried to allocate: " << max_len << " bytes" << std::endl;
			return false;
		}

		// Read the records.
		for (size_t i = 0; i < num_keys; i++) {
			const size_t len = lens[i];
			reader.read(buffer.get(), len);
			const size_t read_len = reader.gcount();
			if (read_len != len) {
				LOG_INFO("Data stopped before end. Ignoring shard");
				return false;
			}

			const data_record *records = (data_record *)buffer.get();
			const size_t num_records = len / sizeof(data_record);

			// Keys without records must not create an empty entry in the map.
			if (num_records == 0) continue;

			std::vector<data_record> &target = into[keys[i]];
			for (size_t j = 0; j < num_records; j++) {
				target.push_back(records[j]);
			}
		}

		return true;
	}

	/*
	 * Reads one page from the current position of reader and stores the bitmap of every key in into[key].
	 * A page is the key count followed by the key, position and length arrays and the bitmap data.
	 * Returns true if a page was read, also when it holds no data.
	 * Returns false if the stream has no further page or if the header arrays cannot be allocated.
	 * Throws std::runtime_error if the page is truncated and rethrows std::bad_alloc if the data buffer cannot
	 * be allocated.
	 * */
	template<typename data_record>
	bool index_base<data_record>::read_bitmap_page_into(std::istream &reader, std::map<uint64_t, roaring::Roaring> &into) const {

		uint64_t num_keys = 0;
		reader.read((char *)&num_keys, sizeof(uint64_t));
		// fail() covers end of file as well as a stream that was already in an error state. Checking only
		// eof() would continue with a key count that was never read.
		if (reader.fail()) return false;

		// The arrays are allocated as uint64_t[] so that a key count whose byte size overflows is reported as
		// an allocation failure and so that the values can be read in place.
		std::unique_ptr<uint64_t[]> keys;
		std::unique_ptr<uint64_t[]> lens;
		try {
			keys = std::make_unique<uint64_t[]>(num_keys);
			lens = std::make_unique<uint64_t[]>(num_keys);
		} catch (const std::bad_alloc &exception) {
			std::cout << "bad_alloc detected: " << exception.what() << " file: " << __FILE__ << " line: " << __LINE__ << std::endl;
			std::cout << "tried to allocate: " << num_keys << " keys" << std::endl;
			return false;
		}

		// Read the keys.
		reader.read((char *)keys.get(), num_keys * sizeof(uint64_t));

		// Read the positions. They are not needed since the data is read sequentially, lens only serves as
		// scratch space to move the stream past them.
		reader.read((char *)lens.get(), num_keys * 8);

		// Read the lengths.
		reader.read((char *)lens.get(), num_keys * 8);

		if (reader.fail()) {
			LOG_INFO("Data stopped before end. Ignoring shard ");
			throw std::runtime_error("Data stopped before end. File is corrupt.");
		}

		size_t max_len = 0;
		size_t data_size = 0;
		for (size_t i = 0; i < num_keys; i++) {
			const size_t len = lens[i];
			if (len > max_len) max_len = len;
			data_size += len;
		}

		if (data_size == 0) return true;

		// One buffer of the largest length is reused for the data of every key.
		std::unique_ptr<char[]> buffer;
		try {
			buffer = std::make_unique<char[]>(max_len);
		} catch (const std::bad_alloc &exception) {
			std::cout << "bad_alloc detected: " << exception.what() << " file: " << __FILE__ << " line: " << __LINE__ << std::endl;
			std::cout << "tried to allocate: " << max_len << " bytes" << std::endl;
			// Rethrow the original exception object instead of a sliced copy.
			throw;
		}

		// Read the bitmap data.
		for (size_t i = 0; i < num_keys; i++) {
			const size_t len = lens[i];
			reader.read(buffer.get(), len);
			const size_t read_len = reader.gcount();
			if (read_len != len) {
				LOG_INFO("Data stopped before end. Ignoring shard ");
				throw std::runtime_error("Data stopped before end. File is corrupt.");
			}

			into[keys[i]] = roaring::Roaring::readSafe(buffer.get(), len);
		}

		return true;
	}

}


================================================
FILE: src/indexer/index_builder.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include <iostream>
#include <vector>
#include <map>
#include <set>
#include <unordered_set>
#include <cstring>
#include <cassert>
#include <numeric>
#include <boost/filesystem.hpp>
#include <sstream>
#include "merger.h"
#include "score_builder.h"
#include "index_utils.h"
#include "index_base.h"
#include "index.h"
#include "algorithm/hyper_log_log.h"
#include "config.h"
#include "profiler/profiler.h"
#include "logger/logger.h"
#include "file/file.h"
#include "memory/debugger.h"
#include "roaring/roaring.hh"
#include "URL.h"

namespace indexer {

	/*
		On-disk layout of a shard file:

		<hash-table-data> uint64_t[hash_table_size] position of the page for each hash table slot, SIZE_MAX if the slot has no page
		<num-records> uint64_t
		<records> data_record[num-records] sequence of records, the position of the record is the internal_id
		<page-data> page[num_pages]

		page format:
		<num_keys> uint64_t
		<key-data> uint64_t[num_keys] sorted by key for binary search
		<pos-data> uint64_t[num_keys] position of record data start, relative to the start of <record-data>
		<len-data> uint64_t[num_keys] length of record data in bytes
		<record-data> <bitmap>[num_keys] bitmap is a roaring bitmap (CRoaring)
	*/

	// Identifiers for scoring algorithms. Within this header they are only referenced by the
	// commented-out calculate_scores API of index_builder.
	enum class algorithm {
		bm25 = 101,
		tf_idf = 102
	};

	/*
	 * Builds and maintains a single index shard on disk.
	 *
	 * Records passed to add() are buffered in memory, flushed to append-only cache files by append() and
	 * folded into the target file by merge(). Every constructor registers the instance with the global merger
	 * (keyed by the address of the object) and the destructor deregisters it.
	 * */
	template<typename data_record>
	class index_builder : public index_base<data_record> {
	public:

		// Non copyable. The merger holds callbacks bound to the address of this object, so a copy would not be
		// registered and would share the same files on disk.
		index_builder(const index_builder &) = delete;
		index_builder& operator=(const index_builder &) = delete;

		// Builders addressed by an explicit file name prefix (<file_name>.data, .cache, .cache.keys).
		explicit index_builder(const std::string &file_name);
		explicit index_builder(size_t hash_table_size, const std::string &file_name);

		// Builders addressed by database name and shard id; files live under config::data_path().
		explicit index_builder(const std::string &db_name, size_t id);
		explicit index_builder(const std::string &db_name, size_t id, size_t hash_table_size);
		explicit index_builder(const std::string &db_name, size_t id, size_t hash_table_size, size_t max_results);
		explicit index_builder(const std::string &db_name, size_t id, std::function<uint32_t(const data_record &)> &rec_to_id);
		~index_builder();

		// Buffers one (key, record) pair in memory.
		void add(uint64_t key, const data_record &record);
		// Allocated size in bytes of the in-memory buffers filled by add().
		size_t cache_size() const;
		// Rewrites every bitmap in the shard, replacing each stored id with transform(id).
		void transform(const std::function<uint32_t(uint32_t)> &transform);
		
		// Flushes the in-memory buffers to the cache files.
		void append();
		// Merges the cache files into the target file and removes the cache files.
		void merge();
		void merge(std::unordered_map<uint64_t, uint32_t> &internal_id_map);
		// Merges the records and bitmaps of another (sorted) index into this shard.
		void merge_with(const index<data_record> &other);
		// Sorts the stored records into storage order if they are not already sorted.
		void optimize();

		// Deletes ALL data from this shard.
		void truncate();
		// Deletes the cache files and resets the in-memory copy of the shard.
		void truncate_cache_files();
		void create_directories();

		/*void calculate_scores(algorithm algo, const score_builder &score);

		void calculate_scores_for_token(algorithm algo, const score_builder &score, uint64_t token,
			std::vector<data_record> &records);
		float calculate_score_for_record(algorithm algo, const score_builder &score, uint64_t token,
			const data_record &record);*/

		// Largest internal id stored in any bitmap of the shard (0 if there are none).
		size_t get_max_id();

		static void create_directories(const std::string &db_name);

	private:

		// Set by the file name constructors; when non-empty it takes precedence over m_db_name/m_id in the
		// *_filename() helpers.
		std::string m_file_name;
		std::string m_db_name;
		const size_t m_id;

		const size_t m_max_results;

		// Guards m_key_cache and m_record_cache in add().
		std::mutex m_lock;

		// Caches
		std::vector<uint64_t> m_key_cache;
		std::vector<data_record> m_record_cache;
		

		// In-memory copy of the shard: records (index == internal id), value -> internal id and key -> bitmap.
		std::vector<data_record> m_records;
		std::map<uint64_t, uint32_t> m_record_id_map;
		std::map<uint64_t, roaring::Roaring> m_bitmaps;

		// Maps a record to its internal id. The default assigns the next free position in m_records to values
		// not seen before; one constructor replaces it with a caller supplied function.
		std::function<uint32_t(const data_record &)> m_record_id_to_internal_id = [this](const data_record &record) {
			if (m_record_id_map.count(record.m_value) == 0) {
				m_record_id_map[record.m_value] = m_records.size();
				m_records.push_back(record);
			}
			return m_record_id_map[record.m_value];
		};

		void read_append_cache();
		void read_append_cache(std::unordered_map<uint64_t, uint32_t> &internal_id_map);
		void read_data_to_cache();
		// NOTE(review): read_page has no definition in this header.
		bool read_page(std::ifstream &reader);
		void reset_cache_variables();
		void save_file();
		void write_key(std::ostream &key_writer, uint64_t key, size_t page_pos);
		size_t write_page(std::ostream &writer, const std::vector<uint64_t> &keys);
		void reset_key_map(std::ostream &key_writer);
		std::vector<data_record> read_records() const;
		void write_records(std::ostream &writer);
		uint32_t default_record_to_internal_id(const data_record &record);

		std::string mountpoint() const;
		std::string cache_filename() const;
		std::string key_cache_filename() const;
		std::string target_filename() const;
		// NOTE(review): meta_filename has no definition in this header.
		std::string meta_filename() const;

		bool needs_optimization() const;
		void sort_records();
		void sort_records_and_bitmaps(std::vector<data_record> &records, std::map<uint64_t, roaring::Roaring> &bitmaps);

	};

	/*
	 * Creates a builder whose files are derived from file_name (see the *_filename() helpers). The base class is
	 * default constructed, the shard id is 0 and the result limit comes from config.
	 * The instance registers its merge and append callbacks with the merger, keyed by its own address.
	 * */
	template<typename data_record>
	index_builder<data_record>::index_builder(const std::string &file_name)
	: index_base<data_record>(),
		m_file_name(file_name),
		m_id(0),
		m_max_results(config::ft_max_results_per_section)
	{
		const size_t merger_id = reinterpret_cast<size_t>(this);

		merger::register_merger(merger_id, [this]() { this->merge(); });
		merger::register_appender(merger_id,
			[this]() { this->append(); },
			[this]() { return this->cache_size(); });
	}

	/*
	 * Same as the file name constructor, but passes hash_table_size on to the base class.
	 * */
	template<typename data_record>
	index_builder<data_record>::index_builder(size_t hash_table_size, const std::string &file_name)
	: index_base<data_record>(hash_table_size),
		m_file_name(file_name),
		m_id(0),
		m_max_results(config::ft_max_results_per_section)
	{
		const size_t merger_id = reinterpret_cast<size_t>(this);

		merger::register_merger(merger_id, [this]() { this->merge(); });
		merger::register_appender(merger_id,
			[this]() { this->append(); },
			[this]() { return this->cache_size(); });
	}

	/*
	 * Creates a builder for shard `id` of database `db_name`. The base class is default constructed and the
	 * result limit comes from config. Registers the instance with the merger, keyed by its own address.
	 * */
	template<typename data_record>
	index_builder<data_record>::index_builder(const std::string &db_name, size_t id)
	: index_base<data_record>(),
		m_db_name(db_name),
		m_id(id),
		m_max_results(config::ft_max_results_per_section)
	{
		const size_t merger_id = reinterpret_cast<size_t>(this);

		merger::register_merger(merger_id, [this]() { this->merge(); });
		merger::register_appender(merger_id,
			[this]() { this->append(); },
			[this]() { return this->cache_size(); });
	}

	/*
	 * Creates a builder for shard `id` of database `db_name`, passing hash_table_size on to the base class.
	 * */
	template<typename data_record>
	index_builder<data_record>::index_builder(const std::string &db_name, size_t id, size_t hash_table_size)
	: index_base<data_record>(hash_table_size),
		m_db_name(db_name),
		m_id(id),
		m_max_results(config::ft_max_results_per_section)
	{
		const size_t merger_id = reinterpret_cast<size_t>(this);

		merger::register_merger(merger_id, [this]() { this->merge(); });
		merger::register_appender(merger_id,
			[this]() { this->append(); },
			[this]() { return this->cache_size(); });
	}

	/*
	 * Creates a builder for shard `id` of database `db_name` with an explicit hash table size and an explicit
	 * result limit (stored in m_max_results) instead of the config default.
	 * */
	template<typename data_record>
	index_builder<data_record>::index_builder(const std::string &db_name, size_t id, size_t hash_table_size, size_t max_results)
	: index_base<data_record>(hash_table_size),
		m_db_name(db_name),
		m_id(id),
		m_max_results(max_results)
	{
		const size_t merger_id = reinterpret_cast<size_t>(this);

		merger::register_merger(merger_id, [this]() { this->merge(); });
		merger::register_appender(merger_id,
			[this]() { this->append(); },
			[this]() { return this->cache_size(); });
	}

	/*
	 * Creates a builder for shard `id` of database `db_name` that uses rec_to_id, instead of the default
	 * mapping, to translate records into internal ids while merging the cache.
	 * */
	template<typename data_record>
	index_builder<data_record>::index_builder(const std::string &db_name, size_t id,
		std::function<uint32_t(const data_record &)> &rec_to_id)
	: index_base<data_record>(),
		m_db_name(db_name),
		m_id(id),
		m_max_results(config::ft_max_results_per_section),
		m_record_id_to_internal_id(rec_to_id)
	{
		const size_t merger_id = reinterpret_cast<size_t>(this);

		merger::register_merger(merger_id, [this]() { this->merge(); });
		merger::register_appender(merger_id,
			[this]() { this->append(); },
			[this]() { return this->cache_size(); });
	}

	/*
	 * Removes this instance from the merger, using the same address based id the constructors registered with.
	 * NOTE(review): only deregister_merger is called; presumably it also drops the appender callbacks registered
	 * in the constructors - confirm in merger.
	 * */
	template<typename data_record>
	index_builder<data_record>::~index_builder() {
		const size_t merger_id = reinterpret_cast<size_t>(this);
		merger::deregister_merger(merger_id);
	}

	/*
	 * Buffers one (key, record) pair in the in-memory caches. Nothing is written to disk until append() runs.
	 * */
	template<typename data_record>
	void index_builder<data_record>::add(uint64_t key, const data_record &record) {
		// Coordinates with the global merger before touching the caches (semantics defined by merger::lock).
		indexer::merger::lock();

		const std::lock_guard<std::mutex> guard(m_lock);

		// Both vectors grow in lock step: entry i of the key cache belongs to entry i of the record cache.
		m_key_cache.emplace_back(key);
		m_record_cache.emplace_back(record);
	}

	/*
	 * Number of bytes currently allocated (capacity, not size) by m_key_cache and m_record_cache.
	 * */
	template<typename data_record>
	size_t index_builder<data_record>::cache_size() const {
		const size_t key_bytes = m_key_cache.capacity() * sizeof(uint64_t);
		const size_t record_bytes = m_record_cache.capacity() * sizeof(data_record);
		return key_bytes + record_bytes;
	}

	/*
	 * Loads the shard, replaces every bitmap with a new bitmap holding transform(v) for each stored value v,
	 * writes the shard back and removes the cache files.
	 * */
	template<typename data_record>
	void index_builder<data_record>::transform(const std::function<uint32_t(uint32_t)> &transform) {
		read_data_to_cache();

		for (auto &entry : m_bitmaps) {
			::roaring::Roaring mapped;
			for (const uint32_t internal_id : entry.second) {
				mapped.add(transform(internal_id));
			}
			// The old bitmap is fully consumed before it is replaced.
			entry.second = std::move(mapped);
		}

		save_file();
		truncate_cache_files();
	}

	/*
	 * Appends the in-memory caches to the cache files on disk and releases the cache memory.
	 *
	 * The files are opened (and therefore created) even when the caches are empty; read_append_cache() fails
	 * if they cannot be opened.
	 *
	 * Throws (LOG_ERROR_EXCEPTION) if a cache file cannot be opened or written. On a write failure the
	 * in-memory caches are left untouched so that no buffered data is silently dropped.
	 * */
	template<typename data_record>
	void index_builder<data_record>::append() {

		assert(m_record_cache.size() == m_key_cache.size());

		std::ofstream record_writer(cache_filename(), std::ios::binary | std::ios::app);
		if (!record_writer.is_open()) {
			throw LOG_ERROR_EXCEPTION("Could not open full text shard (" + cache_filename() + "). Error: " +
				std::string(strerror(errno)));
		}

		std::ofstream key_writer(key_cache_filename(), std::ios::binary | std::ios::app);
		if (!key_writer.is_open()) {
			throw LOG_ERROR_EXCEPTION("Could not open full text shard (" + key_cache_filename() + "). Error: " +
				std::string(strerror(errno)));
		}

		record_writer.write(reinterpret_cast<const char *>(m_record_cache.data()),
			m_record_cache.size() * sizeof(data_record));
		key_writer.write(reinterpret_cast<const char *>(m_key_cache.data()), m_key_cache.size() * sizeof(uint64_t));

		// Flush before checking so that buffered write errors (e.g. disk full) are observed here.
		record_writer.flush();
		key_writer.flush();
		if (record_writer.fail() || key_writer.fail()) {
			throw LOG_ERROR_EXCEPTION("Could not write to full text shard cache (" + cache_filename() + "). Error: " +
				std::string(strerror(errno)));
		}

		// clear() keeps the capacity, shrink_to_fit() asks the vectors to give the memory back.
		m_record_cache.clear();
		m_key_cache.clear();
		m_record_cache.shrink_to_fit();
		m_key_cache.shrink_to_fit();
	}

	/*
	 * Merges the cache files into the shard using a temporary value -> internal id map.
	 * */
	template<typename data_record>
	void index_builder<data_record>::merge() {
		std::unordered_map<uint64_t, uint32_t> scratch_id_map;
		merge(scratch_id_map);
	}

	/*
	 * Merges the cache files into the shard: loads the shard and the cache, writes the combined shard back and
	 * removes the cache files. internal_id_map (record value -> internal id) is consulted and extended while
	 * the cache is read.
	 * */
	template<typename data_record>
	void index_builder<data_record>::merge(std::unordered_map<uint64_t, uint32_t> &internal_id_map) {
		read_append_cache(internal_id_map);
		save_file();
		truncate_cache_files();
	}

	/*
	 * Merges another index into this shard.
	 *
	 * Records of 'other' that are not present in this shard are appended to the record array, and a map from
	 * the ids used by 'other' to ids in the combined array is built. The bitmaps of 'other' are translated with
	 * that map and unioned into the bitmaps of this shard. Finally records and bitmaps are sorted into storage
	 * order, the shard is written back and the cache files are removed.
	 *
	 * Throws std::runtime_error if the records of either index are not sorted by storage order.
	 * */
	template<typename data_record>
	void index_builder<data_record>::merge_with(const index<data_record> &other) {

		const auto &incoming = other.records();

		typename data_record::storage_order storage_less;

		if (!std::is_sorted(incoming.cbegin(), incoming.cend(), storage_less)) {
			throw std::runtime_error("index_builder::merge_with needs optimized input");
		}

		read_data_to_cache();

		if (!std::is_sorted(m_records.cbegin(), m_records.cend(), storage_less)) {
			throw std::runtime_error("index_builder::merge_with needs to run on optimized index");
		}

		// Id in 'other' -> id in the combined record array.
		std::map<uint32_t, uint32_t> remap;
		// Records of 'other' that have no counterpart in this shard, in the order they are encountered.
		std::vector<data_record> appended;

		const auto append_incoming = [this, &remap, &appended, &incoming](size_t idx) {
			remap[idx] = m_records.size() + appended.size();
			appended.push_back(incoming[idx]);
		};

		// Two pointer walk over both sorted record arrays.
		size_t mine = 0;
		size_t theirs = 0;
		while (mine < m_records.size() && theirs < incoming.size()) {
			if (storage_less(m_records[mine], incoming[theirs])) {
				++mine;
			} else if (m_records[mine].storage_equal(incoming[theirs])) {
				remap[theirs] = mine;
				++mine;
				++theirs;
			} else {
				append_incoming(theirs);
				++theirs;
			}
		}
		for (; theirs < incoming.size(); ++theirs) {
			append_incoming(theirs);
		}

		m_records.insert(m_records.end(), appended.cbegin(), appended.cend());

		other.for_each([this, &remap](uint64_t key, roaring::Roaring &bitmap) {
			roaring::Roaring translated;
			for (auto idx : bitmap) {
				translated.add(remap[idx]);
			}
			// Union the bitmaps.
			m_bitmaps[key] |= translated;
		});

		sort_records_and_bitmaps(m_records, m_bitmaps);

		save_file();
		truncate_cache_files();
	}

	/*
	 * Sorts the records of the shard into storage order, unless they are already sorted.
	 * */
	template<typename data_record>
	void index_builder<data_record>::optimize() {
		if (!needs_optimization()) return;

		sort_records();
	}

	/*
		Deletes ALL data from this shard: the cache files are removed and the target file is emptied.
	*/
	template<typename data_record>
	void index_builder<data_record>::truncate() {
		create_directories();
		truncate_cache_files();

		// Opening with std::ios::trunc empties (or creates) the target file, the stream is closed when it
		// leaves the scope.
		{
			std::ofstream target_writer(target_filename(), std::ios::trunc);
		}
	}

	/*
		Resets the in-memory copy of the shard and deletes both cache files (records and keys).
	*/
	template<typename data_record>
	void index_builder<data_record>::truncate_cache_files() {

		reset_cache_variables();

		const std::string record_cache_path = cache_filename();
		file::delete_file(record_cache_path);

		const std::string key_cache_path = key_cache_filename();
		file::delete_file(key_cache_path);
	}

	/*
	 * Creates the directories for the database of this builder by passing m_db_name to create_db_directories().
	 * NOTE(review): create_db_directories is not defined in this header (presumably it comes from
	 * index_utils.h); whether it creates the same directories as the static overload should be confirmed.
	 * */
	template<typename data_record>
	void index_builder<data_record>::create_directories() {
		create_db_directories(m_db_name);
	}

	/*
	 * Loads the shard from disk and returns the largest value stored in any of its bitmaps, or 0 when there
	 * are no bitmaps. Note that this replaces the in-memory copy of the shard.
	 * */
	template<typename data_record>
	size_t index_builder<data_record>::get_max_id() {

		read_data_to_cache();

		uint32_t highest = 0;
		for (const auto &entry : m_bitmaps) {
			const uint32_t candidate = entry.second.maximum();
			if (highest < candidate) {
				highest = candidate;
			}
		}

		return static_cast<size_t>(highest);
	}

	/*
	 * Creates <data_path>/<n>/full_text/<db_name> for n = 0..7, i.e. one directory per value mountpoint() can
	 * return.
	 * */
	template<typename data_record>
	void index_builder<data_record>::create_directories(const std::string &db_name) {
		const std::string db_suffix = "/full_text/" + db_name;
		for (size_t mount = 0; mount < 8; ++mount) {
			file::create_directory(config::data_path() + "/" + std::to_string(mount) + db_suffix);
		}
	}

	/*
	 * Loads the shard and the cache files into memory using a temporary value -> internal id map.
	 * */
	template<typename data_record>
	void index_builder<data_record>::read_append_cache() {
		std::unordered_map<uint64_t, uint32_t> scratch_id_map;
		read_append_cache(scratch_id_map);
	}

	/*
	 * Loads the shard from disk and then folds the contents of the cache files into the in-memory bitmaps.
	 *
	 * The record cache and the key cache are read in lock step: entry i of the key cache is the key of entry i
	 * of the record cache. Each record is translated to an internal id, either through internal_id_map (record
	 * value -> internal id) or, for values not in the map, through m_record_id_to_internal_id, in which case the
	 * map is extended.
	 *
	 * Only complete (key, record) pairs are consumed: if one of the files is shorter than the other, or a read
	 * fails, reading stops at the last complete pair.
	 *
	 * Throws (LOG_ERROR_EXCEPTION) if a cache file cannot be opened.
	 * */
	template<typename data_record>
	void index_builder<data_record>::read_append_cache(std::unordered_map<uint64_t, uint32_t> &internal_id_map) {

		// Read the current file.
		read_data_to_cache();

		//profiler::instance prof("index_builder::read_append_cache");

		// Read the cache into memory.
		std::ifstream reader(cache_filename(), std::ios::binary);
		if (!reader.is_open()) {
			throw LOG_ERROR_EXCEPTION("Could not open full text shard (" + cache_filename() + "). Error: " + std::string(strerror(errno)));
		}

		std::ifstream key_reader(key_cache_filename(), std::ios::binary);
		if (!key_reader.is_open()) {
			throw LOG_ERROR_EXCEPTION("Could not open full text shard (" + key_cache_filename() + "). Error: " + std::string(strerror(errno)));
		}

		const size_t buffer_len = 10000;

		// NOTE(review): an allocation failure is only reported and then ignored. Callers such as merge() go on
		// to save the shard and delete the cache files, so the cached data would be lost in that case.
		std::unique_ptr<data_record[]> buffer_allocator;
		try {
			buffer_allocator = std::make_unique<data_record[]>(buffer_len);
		} catch (std::bad_alloc &exception) {
			std::cout << "bad_alloc detected: " << exception.what() << " file: " << __FILE__ << " line: " << __LINE__ << std::endl;
			std::cout << "tried to allocate: " << buffer_len * sizeof(data_record) << " bytes" << std::endl;
			return;
		}

		std::unique_ptr<uint64_t[]> key_buffer_allocator;
		try {
			key_buffer_allocator = std::make_unique<uint64_t[]>(buffer_len);
		} catch (std::bad_alloc &exception) {
			std::cout << "bad_alloc detected: " << exception.what() << " file: " << __FILE__ << " line: " << __LINE__ << std::endl;
			std::cout << "tried to allocate: " << buffer_len * sizeof(uint64_t) << " bytes" << std::endl;
			return;
		}

		data_record *buffer = buffer_allocator.get();
		uint64_t *key_buffer = key_buffer_allocator.get();

		reader.seekg(0, std::ios::beg);

		std::unordered_map<uint64_t, std::vector<uint32_t>> bitmap_data;

		// Testing good() instead of !eof() also ends the loop when a stream fails without reaching the end.
		while (reader.good() && key_reader.good()) {

			reader.read((char *)buffer, buffer_len * sizeof(data_record));
			key_reader.read((char *)key_buffer, buffer_len * sizeof(uint64_t));

			// Only use entries that were read from both files in this round, the rest of the buffers is stale.
			const size_t records_read = static_cast<size_t>(reader.gcount()) / sizeof(data_record);
			const size_t keys_read = static_cast<size_t>(key_reader.gcount()) / sizeof(uint64_t);
			const size_t num_records = std::min(records_read, keys_read);

			for (size_t i = 0; i < num_records; i++) {
				const auto map_iter = internal_id_map.find(buffer[i].m_value);
				if (map_iter == internal_id_map.end()) {
					const uint32_t internal_id = m_record_id_to_internal_id(buffer[i]);
					internal_id_map[buffer[i].m_value] = internal_id;
					bitmap_data[key_buffer[i]].push_back(internal_id);
				} else {
					bitmap_data[key_buffer[i]].push_back(map_iter->second);
				}
			}
		}

		// Insert the bitmap data.
		for (const auto &iter : bitmap_data) {
			m_bitmaps[iter.first].addMany(iter.second.size(), iter.second.data());
		}
	}

	/*
	 * Reads the target file into RAM (m_records, m_record_id_map and m_bitmaps), replacing whatever was there.
	 *
	 * A missing file, or a file that holds nothing beyond the hash table, leaves the in-memory state empty.
	 * Throws std::runtime_error if the record section is truncated or its record count cannot fit in the file.
	 * */
	template<typename data_record>
	void index_builder<data_record>::read_data_to_cache() {

		//profiler::instance prof("index_builder::read_data_to_cache");

		reset_cache_variables();

		std::ifstream reader(target_filename(), std::ios::binary);
		if (!reader.is_open()) return;

		reader.seekg(0, std::ios::end);
		const size_t file_size = reader.tellg();
		if (file_size <= this->hash_table_byte_size()) return;
		reader.seekg(this->hash_table_byte_size(), std::ios::beg);

		// The count is stored as a uint64_t by write_records().
		uint64_t num_records = 0;
		reader.read((char *)&num_records, sizeof(uint64_t));
		if (!reader) {
			throw std::runtime_error("Data stopped before end. File is corrupt.");
		}

		// The read above succeeded, so the file holds at least the hash table plus the record count.
		const size_t bytes_after_count = file_size - this->hash_table_byte_size() - sizeof(uint64_t);
		if (num_records > bytes_after_count / sizeof(data_record)) {
			throw std::runtime_error("Data stopped before end. File is corrupt.");
		}

		// Read records.
		const size_t record_buffer_len = 10000;
		std::unique_ptr<data_record[]> record_buffer_allocator = std::make_unique<data_record[]>(record_buffer_len);
		data_record *record_buffer = record_buffer_allocator.get();

		m_records.reserve(num_records);

		size_t records_read = 0;
		while (records_read < num_records) {
			const size_t records_left = num_records - records_read;
			const size_t records_to_read = std::min(records_left, record_buffer_len);
			reader.read((char *)record_buffer, sizeof(data_record) * records_to_read);

			// A short read would otherwise leave stale or uninitialized records in the buffer.
			if (static_cast<size_t>(reader.gcount()) != sizeof(data_record) * records_to_read) {
				throw std::runtime_error("Data stopped before end. File is corrupt.");
			}

			for (size_t i = 0; i < records_to_read; i++) {
				// The position of a record in m_records is its internal id.
				m_record_id_map[record_buffer[i].m_value] = m_records.size();
				m_records.push_back(record_buffer[i]);
			}

			records_read += records_to_read;
		}

		// The rest of the file is a sequence of bitmap pages.
		while (this->read_bitmap_page_into(reader, m_bitmaps)) {
		}
	}

	/*
	 * Drops the in-memory copy of the shard (records, value -> id map and bitmaps). Each container is swapped
	 * with a fresh empty one so that its memory is released as well.
	 * */
	template<typename data_record>
	void index_builder<data_record>::reset_cache_variables() {
		std::vector<data_record>().swap(m_records);
		std::map<uint64_t, uint32_t>().swap(m_record_id_map);
		std::map<uint64_t, roaring::Roaring>().swap(m_bitmaps);
	}

	/*
	 * Serializes the in-memory shard (hash table, records, bitmap pages) and writes it to the target file,
	 * replacing the previous contents.
	 *
	 * The shard is assembled in a string stream first because the hash table at the start of the file is
	 * patched with the position of each page after the page has been written.
	 *
	 * Throws (LOG_ERROR_EXCEPTION) if the target file cannot be opened or written. Callers delete the cache
	 * files right after saving, so a failed write must not go unnoticed.
	 * */
	template<typename data_record>
	void index_builder<data_record>::save_file() {

		//profiler::instance prof("index_builder::save_file");

		std::ostringstream writer;

		reset_key_map(writer);
		write_records(writer);

		// Group the keys by page. Without a hash table all keys go into a single page.
		std::map<uint64_t, std::vector<uint64_t>> pages;
		for (const auto &iter : m_bitmaps) {
			const uint64_t page_key = this->m_hash_table_size ? iter.first % this->m_hash_table_size : 0;
			pages[page_key].push_back(iter.first);
		}

		for (const auto &iter : pages) {
			const size_t page_pos = write_page(writer, iter.second);
			write_key(writer, iter.first, page_pos);
		}

		std::ofstream file_writer(target_filename(), std::ios::binary | std::ios::trunc);
		if (!file_writer.is_open()) {
			throw LOG_ERROR_EXCEPTION("Could not open full text shard. Error: " + std::string(strerror(errno)));
		}

		// str() returns a copy of the whole buffer, so take it once.
		const std::string data = writer.str();
		file_writer.write(data.data(), data.size());
		file_writer.flush();
		if (file_writer.fail()) {
			throw LOG_ERROR_EXCEPTION("Could not write full text shard. Error: " + std::string(strerror(errno)));
		}
	}

	/*
	 * Stores page_pos in slot `key` of the hash table at the start of the stream. Does nothing when the index
	 * has no hash table. The write position of the stream is left right after the slot.
	 * */
	template<typename data_record>
	void index_builder<data_record>::write_key(std::ostream &key_writer, uint64_t key, size_t page_pos) {
		if (!(this->m_hash_table_size > 0)) return;

		assert(key < this->m_hash_table_size);

		const size_t slot_offset = key * sizeof(uint64_t);
		key_writer.seekp(slot_offset);
		key_writer.write(reinterpret_cast<const char *>(&page_pos), sizeof(size_t));
	}

	/*
	 * Appends one page to the end of the stream and returns the position where the page starts.
	 *
	 * The page consists of the number of keys, the keys, the position and length of each bitmap (positions are
	 * relative to the start of the bitmap data) and finally the serialized bitmaps in key order. Every bitmap is
	 * run-optimized and shrunk before it is measured and written.
	 * */
	template<typename data_record>
	size_t index_builder<data_record>::write_page(std::ostream &writer, const std::vector<uint64_t> &keys) {

		writer.seekp(0, std::ios::end);

		const size_t page_pos = writer.tellp();

		const size_t num_keys = keys.size();

		writer.write(reinterpret_cast<const char *>(&num_keys), 8);
		writer.write(reinterpret_cast<const char *>(keys.data()), num_keys * 8);

		std::vector<size_t> positions;
		std::vector<size_t> lengths;

		size_t largest = 0;
		size_t offset = 0;
		for (const uint64_t key : keys) {
			roaring::Roaring &bitmap = m_bitmaps[key];

			bitmap.runOptimize();
			bitmap.shrinkToFit();

			// Store position and length
			const size_t len = bitmap.getSizeInBytes();
			if (largest < len) largest = len;

			positions.push_back(offset);
			lengths.push_back(len);

			offset += len;
		}

		writer.write(reinterpret_cast<const char *>(positions.data()), num_keys * 8);
		writer.write(reinterpret_cast<const char *>(lengths.data()), num_keys * 8);

		// One scratch buffer, large enough for the biggest bitmap of the page.
		const std::unique_ptr<char[]> scratch = std::make_unique<char[]>(largest);

		// Write data.
		for (const uint64_t key : keys) {
			roaring::Roaring &bitmap = m_bitmaps[key];
			const size_t len = bitmap.getSizeInBytes();
			bitmap.write(scratch.get());
			writer.write(scratch.get(), len);
		}

		return page_pos;
	}

	/*
	 * Writes an empty hash table at the start of the stream: m_hash_table_size slots, each holding SIZE_MAX.
	 * */
	template<typename data_record>
	void index_builder<data_record>::reset_key_map(std::ostream &key_writer) {
		key_writer.seekp(0);

		const uint64_t empty_slot = SIZE_MAX;
		for (size_t slot = 0; slot < this->m_hash_table_size; ++slot) {
			key_writer.write(reinterpret_cast<const char *>(&empty_slot), sizeof(uint64_t));
		}
	}

	/*
	 * Reads the record section of the target file and returns the records. The in-memory state of the builder
	 * is not touched.
	 *
	 * Returns an empty vector when the file is missing or holds nothing beyond the hash table.
	 * Throws std::runtime_error if the record section is truncated or its record count cannot fit in the file.
	 * */
	template<typename data_record>
	std::vector<data_record> index_builder<data_record>::read_records() const {
		std::ifstream reader(target_filename(), std::ios::in | std::ios::binary);
		if (!reader.is_open()) return {};

		reader.seekg(0, std::ios::end);
		const size_t file_size = reader.tellg();
		if (file_size <= this->hash_table_byte_size()) return {};
		reader.seekg(this->hash_table_byte_size(), std::ios::beg);

		// Must be a non-const object: the count is read straight into it.
		uint64_t num_records = 0;
		reader.read(reinterpret_cast<char *>(&num_records), sizeof(uint64_t));
		if (!reader) {
			throw std::runtime_error("Data stopped before end. File is corrupt.");
		}

		// Reject counts that cannot fit in the file before allocating memory for them.
		const size_t bytes_after_count = file_size - this->hash_table_byte_size() - sizeof(uint64_t);
		if (num_records > bytes_after_count / sizeof(data_record)) {
			throw std::runtime_error("Data stopped before end. File is corrupt.");
		}

		std::vector<data_record> records(num_records);
		reader.read(reinterpret_cast<char *>(records.data()), num_records * sizeof(data_record));
		if (static_cast<size_t>(reader.gcount()) != num_records * sizeof(data_record)) {
			throw std::runtime_error("Data stopped before end. File is corrupt.");
		}

		return records;
	}

	/*
	 * Writes the record section at the current position of the stream: the number of records as a 64 bit
	 * integer followed by the raw bytes of m_records.
	 * */
	template<typename data_record>
	void index_builder<data_record>::write_records(std::ostream &writer) {
		const size_t record_count = m_records.size();
		const size_t payload_bytes = record_count * sizeof(data_record);

		writer.write(reinterpret_cast<const char *>(&record_count), sizeof(uint64_t));
		writer.write(reinterpret_cast<const char *>(m_records.data()), payload_bytes);
	}

	/*
	 * Returns the internal id of the record. A record whose m_value has not been seen before is appended to
	 * m_records and gets its position there as id.
	 * */
	template<typename data_record>
	uint32_t index_builder<data_record>::default_record_to_internal_id(const data_record &record) {
		const auto known = m_record_id_map.find(record.m_value);
		if (known != m_record_id_map.end()) {
			return known->second;
		}

		const uint32_t internal_id = static_cast<uint32_t>(m_records.size());
		m_record_id_map[record.m_value] = internal_id;
		m_records.push_back(record);

		return internal_id;
	}

	/*
	 * Name of the mount directory holding this shard: the shard id modulo 8, as a string.
	 * */
	template<typename data_record>
	std::string index_builder<data_record>::mountpoint() const {
		const size_t mount = m_id % 8;
		return std::to_string(mount);
	}

	/*
	 * Path of the record cache file: <file_name>.cache when an explicit file name was given, otherwise
	 * <data_path>/<mountpoint>/full_text/<db_name>/<id>.cache.
	 * */
	template<typename data_record>
	std::string index_builder<data_record>::cache_filename() const {
		if (!m_file_name.empty()) {
			return m_file_name + ".cache";
		}

		const std::string shard_dir = config::data_path() + "/" + mountpoint() + "/full_text/" + m_db_name + "/";
		return shard_dir + std::to_string(m_id) + ".cache";
	}

	/*
	 * Path of the key cache file: <file_name>.cache.keys when an explicit file name was given, otherwise
	 * <data_path>/<mountpoint>/full_text/<db_name>/<id>.cache.keys.
	 * */
	template<typename data_record>
	std::string index_builder<data_record>::key_cache_filename() const {
		if (!m_file_name.empty()) {
			return m_file_name + ".cache.keys";
		}

		const std::string shard_dir = config::data_path() + "/" + mountpoint() + "/full_text/" + m_db_name + "/";
		return shard_dir + std::to_string(m_id) + ".cache.keys";
	}

	/*
	 * Path of the shard file itself: <file_name>.data when an explicit file name was given, otherwise
	 * <data_path>/<mountpoint>/full_text/<db_name>/<id>.data.
	 * */
	template<typename data_record>
	std::string index_builder<data_record>::target_filename() const {
		if (!m_file_name.empty()) {
			return m_file_name + ".data";
		}

		const std::string shard_dir = config::data_path() + "/" + mountpoint() + "/full_text/" + m_db_name + "/";
		return shard_dir + std::to_string(m_id) + ".data";
	}

	/*
	 * Returns true if the records stored in the target file are not sorted by storage order. Zero or one
	 * record is always considered sorted.
	 * */
	template<typename data_record>
	bool index_builder<data_record>::needs_optimization() const {

		const auto records = read_records();

		if (records.size() <= 1) return false;

		typename data_record::storage_order ordered{};
		const bool already_sorted = std::is_sorted(records.cbegin(), records.cend(), ordered);

		return !already_sorted;
	}

	/*
	 * Loads the shard from disk, sorts the records into storage order (remapping the ids in the bitmaps
	 * accordingly) and writes the shard back.
	 * The final truncate_cache_files() call deletes the cache files and resets the in-memory copy of the shard.
	 * */
	template<typename data_record>
	void index_builder<data_record>::sort_records() {

		read_data_to_cache();

		sort_records_and_bitmaps(m_records, m_bitmaps);

		save_file();
		truncate_cache_files();
	}

	/*
	 * Sorts `records` by storage order and rewrites every bitmap so that the ids it holds (positions in
	 * `records`) keep pointing at the same records after the reordering.
	 *
	 * The records are reordered by applying the computed permutation instead of sorting them a second time.
	 * std::sort is not stable, so two independent sorts are not guaranteed to place records that compare
	 * equal under storage_order in the same order, which would make the id remapping point at the wrong record.
	 *
	 * Throws std::out_of_range if a bitmap holds an id that is not a valid position in `records`.
	 * */
	template<typename data_record>
	void index_builder<data_record>::sort_records_and_bitmaps(std::vector<data_record> &records,
			std::map<uint64_t, roaring::Roaring> &bitmaps) {

		std::vector<uint32_t> permutation(records.size());
		std::iota(permutation.begin(), permutation.end(), 0);

		typename data_record::storage_order ordered;

		std::sort(permutation.begin(), permutation.end(), [&records, &ordered](uint32_t a, uint32_t b) {
			return ordered(records[a], records[b]);
		});
		// permutation now points from new position -> old position of record.

		std::vector<uint32_t> inverse(permutation.size());
		std::vector<data_record> sorted_records;
		sorted_records.reserve(records.size());
		for (uint32_t new_pos = 0; new_pos < permutation.size(); new_pos++) {
			const uint32_t old_pos = permutation[new_pos];
			inverse[old_pos] = new_pos;
			sorted_records.push_back(records[old_pos]);
		}
		// inverse now points from old position -> new position of record.

		// Reorder the records, consistent with the permutation by construction.
		records = std::move(sorted_records);

		// Apply transforms.
		for (auto &iter : bitmaps) {

			::roaring::Roaring rr;
			for (uint32_t v : iter.second) {
				rr.add(inverse.at(v));
			}
			iter.second = std::move(rr);
		}
	}

}


================================================
FILE: src/indexer/index_manager.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "index_manager.h"
#include "merger.h"
#include "domain_stats/domain_stats.h"
#include "url_link/link.h"
#include "algorithm/algorithm.h"
#include "algorithm/sort.h"
#include "utils/thread_pool.hpp"

using namespace std;

namespace indexer {

	/*
	 * Creates the builder and the reader for each of the three sharded indexes ("url_index", "link_index" and
	 * "domain_link_index"), all constructed with the value 4001, as well as the builder and the reader for the
	 * hash table named "index_manager".
	 * */
	index_manager::index_manager() {

		m_url_index_builder = std::make_unique<sharded_builder<basic_index_builder, url_record>>("url_index", 4001);
		m_url_index = std::make_unique<sharded<basic_index, url_record>>("url_index", 4001);

		m_link_index_builder = std::make_unique<sharded_builder<basic_index_builder, link_record>>("link_index", 4001);
		m_link_index = std::make_unique<sharded<basic_index, link_record>>("link_index", 4001);

		m_domain_link_index_builder = std::make_unique<sharded_builder<basic_index_builder, domain_link_record>>("domain_link_index", 4001);
		m_domain_link_index = std::make_unique<sharded<basic_index, domain_link_record>>("domain_link_index", 4001);

		// Maps url hashes to the raw line they were indexed from (see add_index_file and decorate_search_result).
		m_hash_table_builder = std::make_unique<hash_table2::builder>("index_manager");
		m_hash_table = std::make_unique<hash_table2::hash_table>("index_manager");

	}

	// Nothing to release by hand, the members clean up after themselves.
	index_manager::~index_manager() = default;

	void index_manager::add_index_file(const string &local_path) {

		const vector<size_t> cols = {1, 2, 3, 4};
		const vector<float> scores = {10.0, 3.0, 2.0, 1};

		ifstream infile(local_path, ios::in);
		string line;

		// word_map holds a word hash (token) => score
		std::map<uint64_t, float> word_map;

		size_t num_added = 0;
		while (getline(infile, line)) {
			vector<string> col_values;
			boost::algorithm::split(col_values, line, boost::is_any_of("\t"));


			URL url(col_values[0]);

			const uint64_t url_hash = url.hash();
			const uint64_t domain_hash = url.host_hash();
			const float harmonic = domain_stats::harmonic_centrality(url);

			// add to hash table
			m_hash_table_builder->add(url_hash, line);

			url_record record(url_hash, 0.0f, domain_hash);
			record.url_length(url.path_with_query().size());

			const std::string site_colon = "site:" + url.host() + " site:www." + url.host() + " " + url.host() + " " + url.domain_without_tld();
			const auto site_colon_tokens = text::get_unique_full_text_tokens(site_colon);
			for (auto token : site_colon_tokens) {
				word_map[token] += harmonic * 20;
			}

			size_t col_idx = 0;
			for (size_t col : cols) {
				const auto tokens = text::get_unique_expanded_full_text_tokens(col_values[col]);
				for (auto token : tokens) {
					word_map[token] += scores[col_idx] * harmonic;
				}
			}

			for (const auto &iter : word_map) {
				record.m_score = iter.second;
				m_url_index_builder->add(iter.first, record);
				num_added++;
			}

			word_map.clear();
		}
		std::cout << "num added: " << num_added << std::endl;
	}

	void index_manager::add_index_files_threaded(const vector<string> &local_paths, size_t num_threads) {

		num_threads = 1;
		utils::thread_pool pool(num_threads);

		for (const string &local_path : local_paths) {
			pool.enqueue([this, local_path]() -> void {
				add_index_file(local_path);
			});
		}

		pool.run_all();

		m_hash_table_builder->merge();

	}

	/*
	 * Indexes a tab separated file with one link per line.
	 *
	 * Columns 0 and 1 form the source url, columns 2 and 3 the target url and column 4 is the link text (only
	 * the first 1000 characters are used). For every token of the link text a domain_link_record is added to
	 * the domain link index. A link_record is added to the link index as well when the target url passes the
	 * bloom filter and exists in the hash table.
	 *
	 * Lines with fewer than five columns are skipped and not counted as parsed.
	 * */
	void index_manager::add_link_file(const string &local_path, const ::algorithm::bloom_filter &urls_to_index) {

		// Columns 0-4 are accessed below.
		const size_t min_columns = 5;

		profiler::instance prof("add " + local_path);
		ifstream infile(local_path, ios::in);
		string line;
		size_t added = 0;
		size_t parsed = 0;
		std::vector<std::string> col_values;
		while (getline(infile, line)) {

			col_values.clear();
			boost::algorithm::split(col_values, line, boost::is_any_of("\t"));

			// Malformed line, indexing the columns below would read out of bounds.
			if (col_values.size() < min_columns) continue;

			URL target_url(col_values[2], col_values[3]);

			parsed++;

			URL source_url(col_values[0], col_values[1]);

			float target_harmonic = domain_stats::harmonic_centrality(target_url);
			float source_harmonic = domain_stats::harmonic_centrality(source_url);

			const std::string link_text = col_values[4].substr(0, 1000);

			// NOTE(review): this object is never used afterwards; it is kept in case its construction matters.
			const url_link::link link(source_url, target_url, source_harmonic, target_harmonic);

			const uint64_t domain_link_hash = source_url.domain_link_hash(target_url, link_text);
			const uint64_t link_hash = source_url.link_hash(target_url, link_text);
			const bool bloom_has_url = urls_to_index.exists(target_url.hash());

			std::vector<uint64_t> tokens = text::get_unique_expanded_full_text_tokens(link_text);

			if (bloom_has_url) {

				// The bloom filter can give false positives, the hash table has the final say.
				const bool has_url = m_hash_table->has(target_url.hash());

				if (has_url) {
					// Add the url link.
					link_record link_rec(link_hash, source_harmonic);
					// NOTE(review): the domain link record below uses host_hash() for its source domain while
					// this uses the hash of the full url - confirm that this is intended.
					link_rec.m_source_domain = source_url.hash();
					link_rec.m_target_hash = target_url.hash();

					for (auto token : tokens) {
						m_link_index_builder->add(token, link_rec);
					}
					added++;
				}
			}

			domain_link_record rec(domain_link_hash, source_harmonic);
			rec.m_source_domain = source_url.host_hash();
			rec.m_target_domain = target_url.host_hash();

			for (auto token : tokens) {
				m_domain_link_index_builder->add(token, rec);
			}
		}

		cout << "Done with " << local_path << " added " << added << " total " << parsed << " took: " << prof.get() << "ms" << endl;
	}

	void index_manager::add_link_files_threaded(const vector<string> &local_paths, size_t num_threads, const ::algorithm::bloom_filter &urls_to_index) {

		utils::thread_pool pool(num_threads);

		for (auto &local_path : local_paths) {
			pool.enqueue([this, local_path, &urls_to_index]() -> void {
				add_link_file(local_path, urls_to_index);
			});
		}

		pool.run_all();
	}

	// Currently does nothing: the body is empty and local_path is ignored.
	void index_manager::add_url_file(const string &local_path) {

		
	}

	void index_manager::add_url_files_threaded(const vector<string> &local_paths, size_t num_threads) {

		utils::thread_pool pool(num_threads);

		for (auto &local_path : local_paths) {
			pool.enqueue([this, local_path]() -> void {
				add_url_file(local_path);
			});
		}

		pool.run_all();
	}

	/*
	 * Flushes and merges the three index builders, one after the other: url index, link index and domain link
	 * index. Each builder is appended before it is merged.
	 * */
	void index_manager::merge() {

		const auto append_and_merge = [](auto &builder) {
			builder->append();
			builder->merge();
		};

		append_and_merge(m_url_index_builder);
		append_and_merge(m_link_index_builder);
		append_and_merge(m_domain_link_index_builder);
	}

	// Currently does nothing: the body is empty.
	void index_manager::optimize() {
	}

	// Truncates the url index builder and, through truncate_links(), both link index builders.
	// The hash table is not touched here.
	void index_manager::truncate() {
		m_url_index_builder->truncate();
		truncate_links();
	}

	// Truncates the link index builder and the domain link index builder, leaving the url index untouched.
	void index_manager::truncate_links() {
		m_link_index_builder->truncate();
		m_domain_link_index_builder->truncate();
	}

	/*
	 * Runs a search for the query and returns the decorated results.
	 *
	 * The query tokens are intersected against the link index, the domain link index and the url index. The
	 * scores from matching links are applied to the url results, the results are cut down to the
	 * config::pre_result_limit best ones, deduplicated per domain and finally decorated with the stored data
	 * of each url. Counters describing the search are written to metric.
	 *
	 * Returns an empty vector when the query contains no words.
	 * */
	std::vector<return_record> index_manager::find(const string &query, full_text::search_metric &metric) {

		const auto words = text::get_full_text_words(query, config::query_max_words);
		if (words.empty()) return {};

		auto tokens = text::get_full_text_tokens(query, config::query_max_words);

		// Url links, ordered by the hash of the url they point to before they are applied.
		auto links = m_link_index->find_intersection(tokens, 500000);

		metric.m_total_url_links_found = links.size();
		metric.m_links_handled = links.size();

		std::sort(links.begin(), links.end(), [](const auto &lhs, const auto &rhs) {
			return lhs.m_target_hash < rhs.m_target_hash;
		});

		auto domain_links = m_domain_link_index->find_intersection(tokens, 100000);
		metric.m_total_domain_links_found = domain_links.size();

		auto results = m_url_index->find_intersection(tokens);
		metric.m_total_found = results.size();

		// Domain links are applied before url links.
		metric.m_link_domain_matches = apply_domain_link_scores(domain_links, results);
		metric.m_link_url_matches = apply_link_scores(links, results);

		// Highest score first, ties are broken by the smaller value.
		const auto best_first = [](const auto &lhs, const auto &rhs) {
			if (lhs.m_score != rhs.m_score) return lhs.m_score > rhs.m_score;
			return lhs.m_value < rhs.m_value;
		};

		// NOTE(review): results are only sorted here when there are more than pre_result_limit of them;
		// smaller result sets keep the order returned by find_intersection.
		if (results.size() > config::pre_result_limit) {
			const auto cut = results.begin() + config::pre_result_limit;
			std::nth_element(results.begin(), results.begin() + (config::pre_result_limit - 1), results.end(), best_first);
			std::sort(results.begin(), cut, best_first);
			results.resize(config::pre_result_limit);
		}

		const auto deduplicated = deduplicate_search_results(results, config::result_limit);

		return decorate_search_result(deduplicated);
	}

	std::vector<url_record> index_manager::deduplicate_search_results(const std::vector<url_record> &results, size_t limit) {

		std::vector<url_record> deduped;
		std::vector<url_record> non_deduped;

		std::map<uint64_t, size_t> d_count;
		for (const auto &result : results) {
			if (d_count[result.m_domain_hash] < config::deduplicate_domain_count) {
				deduped.push_back(result);
			} else {
				non_deduped.push_back(result);
			}
			d_count[result.m_domain_hash]++;
		}
		if (deduped.size() < limit) {
			const size_t num_missing = limit - deduped.size();
			if (non_deduped.size() > num_missing) {
				non_deduped.resize(num_missing);
			}
			std::vector<url_record> ret;
			::algorithm::sort::merge_arrays(deduped, non_deduped, [] (const auto &a, const auto &b) {
				return a.m_score > b.m_score;
			}, ret);
			return ret;
		}

		deduped.resize(limit);

		return deduped;
	}

	std::vector<return_record> index_manager::decorate_search_result(const std::vector<url_record> &results) {
		std::vector<return_record> return_records;

		for (const auto &res : results) {
			const auto tsv_data = m_hash_table->find(res.m_value);
			return_record ret(res.m_value, res.m_score, tsv_data);
			ret.m_domain_hash = res.m_domain_hash;
			return_records.push_back(std::move(ret));
		}

		return return_records;
	}

	size_t index_manager::apply_domain_link_scores(const vector<domain_link_record> &links, std::vector<url_record> &results) {
		if (links.size() == 0) return 0;
		size_t applied_links = 0;

		{
			unordered_map<uint64_t, float> domain_scores;
			unordered_map<uint64_t, int> domain_counts;
			map<pair<uint64_t, uint64_t>, uint64_t> domain_unique;
			{
				for (const auto &link : links) {
					if (domain_unique.count(std::make_pair(link.m_source_domain, link.m_target_domain)) == 0) {
						const float domain_score = expm1(25.0f*link.m_score) / 50.0f;
						domain_scores[link.m_target_domain] += domain_score;
						domain_counts[link.m_target_domain]++;
						domain_unique[std::make_pair(link.m_source_domain, link.m_target_domain)] = link.m_source_domain;
					}
				}
			}

			for (auto &result : results) {
				const float domain_score = domain_scores[result.m_domain_hash];
				result.m_score += domain_score;
				applied_links += domain_counts[result.m_domain_hash];
			}
		}

		return applied_links;
	}

	size_t index_manager::apply_link_scores(const vector<link_record> &links, std::vector<url_record> &results) {

		if (links.size() == 0) return 0;

		size_t applied_links = 0;

		size_t i = 0, j = 0;
		std::map<std::pair<uint64_t, uint64_t>, uint64_t> domain_unique;
		while (i < links.size() && j < results.size()) {
			const uint64_t hash1 = links[i].m_target_hash;
			const uint64_t hash2 = results[j].m_value;

			if (hash1 < hash2) {
				i++;
			} else if (hash1 == hash2) {
				if (domain_unique.count(std::make_pair(links[i].m_source_domain, links[i].m_target_hash)) == 0) {
					const float url_score = expm1(25.0f*links[i].m_score) / 50.0f;
					results[j].m_score += url_score;
					applied_links++;
					domain_unique[std::make_pair(links[i].m_source_domain, links[i].m_target_hash)] = links[i].m_source_domain;
				}

				i++;
			} else {
				j++;
			}
		}
		return applied_links;
	}

	std::vector<return_record> index_manager::find(const string &query) {
		full_text::search_metric metric;
		return find(query, metric);
	}

}


================================================
FILE: src/indexer/index_manager.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include <memory>
#include "full_text/search_metric.h"
#include "index_builder.h"
#include "index.h"
#include "sharded_index_builder.h"
#include "sharded_index.h"
#include "hash_table2/builder.h"
#include "sharded_builder.h"
#include "sharded.h"
#include "basic_index_builder.h"
#include "basic_index.h"
#include "counted_record.h"
#include "url_record.h"
#include "link_record.h"
#include "domain_link_record.h"
#include "domain_record.h"
#include "return_record.h"
#include "algorithm/bloom_filter.h"

namespace indexer {

	class index_manager {

	public:

		index_manager();
		~index_manager();

		void add_index_file(const std::string &local_path);
		void add_index_files_threaded(const vector<string> &local_paths, size_t num_threads);
		void add_link_file(const std::string &local_path, const ::algorithm::bloom_filter &urls_to_index);
		void add_link_files_threaded(const std::vector<std::string> &local_paths, size_t num_threads, const ::algorithm::bloom_filter &urls_to_index);
		void add_url_file(const std::string &local_path);
		void add_url_files_threaded(const std::vector<std::string> &local_paths, size_t num_threads);

		void merge();
		void optimize();
		void truncate();
		void truncate_links();

		size_t url_count() const {
			return m_hash_table->size();
		}

		std::vector<return_record> find(const std::string &query, full_text::search_metric &metric);
		std::vector<return_record> find(const std::string &query);

	private:

		std::unique_ptr<sharded_builder<basic_index_builder, url_record>> m_url_index_builder;
		std::unique_ptr<sharded<basic_index, url_record>> m_url_index;

		std::unique_ptr<sharded_builder<basic_index_builder, link_record>> m_link_index_builder;
		std::unique_ptr<sharded<basic_index, link_record>> m_link_index;

		std::unique_ptr<sharded_builder<basic_index_builder, domain_link_record>> m_domain_link_index_builder;
		std::unique_ptr<sharded<basic_index, domain_link_record>> m_domain_link_index;

		std::unique_ptr<hash_table2::builder> m_hash_table_builder;
		std::unique_ptr<hash_table2::hash_table> m_hash_table;

		size_t apply_domain_link_scores(const vector<domain_link_record> &links, std::vector<url_record> &results);
		size_t apply_link_scores(const vector<link_record> &links, std::vector<url_record> &results);
		std::vector<return_record> decorate_search_result(const std::vector<url_record> &results);
		std::vector<url_record> deduplicate_search_results(const std::vector<url_record> &results, size_t limit);

	};

}


================================================
FILE: src/indexer/index_reader.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "index_reader.h"
#include <string.h>

using namespace std;

namespace indexer {

	/*
	 * Opens filename for binary reading. A failed open is not reported here;
	 * seek() and size() check is_open() before touching the stream.
	 * */
	index_reader_file::index_reader_file(const std::string &filename)
	: m_reader(make_unique<ifstream>(filename, ios::binary)) {
	}

	/*
	 * Takes over the stream of other. other is left without a stream.
	 * */
	index_reader_file::index_reader_file(index_reader_file &&other)
	: m_reader(std::move(other.m_reader)) {
	}

	/*
	 * Moves the read position to 'position' bytes from the start of the file.
	 * Returns false if there is no open file or the stream rejected the seek.
	 * */
	bool index_reader_file::seek(size_t position) {
		// m_reader is empty after this object has been moved from.
		if (!m_reader || !m_reader->is_open()) return false;

		// A previous read that ran past the end leaves failbit set, and seekg() does nothing
		// while the stream is in a failed state. Reset the state so the reader stays usable.
		m_reader->clear();
		m_reader->seekg(position, ios::beg);

		return !m_reader->fail();
	}

	/*
	 * Reads length bytes from the current position into buffer.
	 * No status is returned; buffer must have room for length bytes.
	 * */
	void index_reader_file::read(char *buffer, size_t length) {
		m_reader->read(buffer, length);
	}

	/*
	 * Returns the size of the file in bytes, or 0 if there is no open file or the size
	 * could not be determined. The read position is restored before returning.
	 * */
	size_t index_reader_file::size() {
		if (!m_reader || !m_reader->is_open()) return 0;

		// Remember where we were so that asking for the size does not move the read position.
		const auto previous_pos = m_reader->tellg();

		m_reader->seekg(0, ios::end);
		const std::streamoff end_offset = m_reader->tellg();

		if (std::streamoff(previous_pos) >= 0) {
			m_reader->seekg(previous_pos);
		}

		// tellg() reports failure as -1, which must not be turned into a huge size_t.
		if (end_offset < 0) return 0;

		return static_cast<size_t>(end_offset);
	}

	/*
	 * Reads from the bytes of str. Only a pointer to the string's buffer is stored, nothing is
	 * copied, so str has to stay alive and unmodified for as long as this reader is used.
	 * */
	index_reader_ram::index_reader_ram(const std::string &str)
	: m_buffer(str.data()), m_len(str.length()) {
	}

	/*
	 * Reads from the length bytes starting at buffer. The pointer is stored as is,
	 * the bytes are not copied.
	 * */
	index_reader_ram::index_reader_ram(const char *buffer, size_t length)
	: m_buffer(buffer), m_len(length) {
	}

	/*
	 * Takes over buffer, length and read position from other.
	 * other is left as an empty reader (null buffer, zero length, position 0).
	 * */
	index_reader_ram::index_reader_ram(index_reader_ram &&other)
	: m_buffer(other.m_buffer), m_len(other.m_len), m_pos(other.m_pos) {

		// m_pos is carried over as well, so a moved reader continues where the old one stopped.
		other.m_buffer = nullptr;
		other.m_len = 0;
		other.m_pos = 0;
	}


	/*
	 * Moves the read position to 'position'. Positions at or beyond the end of the buffer are
	 * rejected: false is returned and the current position is kept.
	 * */
	bool index_reader_ram::seek(size_t position) {
		if (position >= m_len) return false;

		m_pos = position;
		return true;
	}

	/*
	 * Copies length bytes from the current position into buffer and advances the position.
	 * If fewer than length bytes remain nothing is copied and the position is unchanged.
	 * */
	void index_reader_ram::read(char *buffer, size_t length) {
		// Compare against the remaining byte count instead of computing m_pos + length,
		// which wraps around for very large lengths and would let memcpy run past the buffer.
		if (m_pos > m_len) return;
		const size_t remaining = m_len - m_pos;
		if (length > remaining) return;

		memcpy(buffer, &m_buffer[m_pos], length);
		m_pos += length;
	}

}


================================================
FILE: src/indexer/index_reader.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include <iostream>
#include <fstream>
#include <memory>

namespace indexer {

	/*
		This class provides an abstraction of data reading used by the index class.
		We provide an interface and two classes:
		index_reader_file
		and
		index_reader_ram
		to provide data directly from the file or from a preloaded sequence of bytes.
	*/

	/*
	 * Interface for the byte source an index reads from.
	 * */
	class index_reader {

		public:

			// Readers are used through this interface, so destruction through a base pointer
			// has to reach the derived destructor.
			virtual ~index_reader() = default;

			// Moves the read position. Returns false if the position could not be set.
			virtual bool seek(size_t position) = 0;

			// Reads length bytes from the current position into buffer.
			virtual void read(char *buffer, size_t length) = 0;

			// Total number of bytes available from this reader.
			virtual size_t size() = 0;

	};

	class index_reader_file : public index_reader {

		private:
			index_reader_file(const index_reader_file &);
			index_reader_file &operator=(const index_reader_file &);

		public:

			index_reader_file(const std::string &filename);
			index_reader_file(index_reader_file &&other);

			bool seek(size_t position);
			void read(char *buffer, size_t length);
			size_t size();
		
		private:
		
			std::unique_ptr<std::ifstream> m_reader;

	};

	class index_reader_ram : public index_reader {

		private:
			index_reader_ram(const index_reader_file &);
			index_reader_ram &operator=(const index_reader_file &);

		public:

			explicit index_reader_ram(const std::string &str);
			index_reader_ram(const char *buffer, size_t length);
			index_reader_ram(index_reader_ram &&other);

			bool seek(size_t position);
			void read(char *buffer, size_t length);
			size_t size() {return m_len; };
		
		private:
		
			const char *m_buffer;
			size_t m_len;
			size_t m_pos = 0;

	};


}


================================================
FILE: src/indexer/index_utils.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "index_utils.h"
#include <boost/filesystem.hpp>
#include "config.h"

namespace indexer {

	/*
	 * Creates <data_path>/<n>/full_text/<db_name> for n = 0..7, including missing parent directories.
	 * */
	void create_db_directories(const std::string &db_name) {
		for (size_t mount = 0; mount != 8; ++mount) {
			const auto directory = config::data_path() + "/" + std::to_string(mount) + "/full_text/" + db_name;
			boost::filesystem::create_directories(directory);
		}
	}

	/*
	 * Removes <data_path>/<n>/full_text/<db_name> and everything below it for n = 0..7.
	 * */
	void delete_db_directories(const std::string &db_name) {
		for (size_t mount = 0; mount != 8; ++mount) {
			const auto directory = config::data_path() + "/" + std::to_string(mount) + "/full_text/" + db_name;
			boost::filesystem::remove_all(directory);
		}
	}

}


================================================
FILE: src/indexer/index_utils.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include <iostream>

namespace indexer {

	// Creates the directory <data_path>/<n>/full_text/<db_name> for n = 0..7.
	void create_db_directories(const std::string &db_name);
	// Recursively removes the directory <data_path>/<n>/full_text/<db_name> for n = 0..7.
	void delete_db_directories(const std::string &db_name);

}


================================================
FILE: src/indexer/link_record.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include <iostream>

namespace indexer {

	#pragma pack(4)
	/*
	 * Record stored in the link index.
	 *
	 * Equality and ordering (operator== and operator<) use m_value. The order on disk and
	 * storage_equal() use m_target_hash, and truncation keeps the records with the highest m_score.
	 * */
	class link_record {
		public:
		uint64_t m_value;
		float m_score;
		// Default to zero so that records built through any constructor never carry
		// indeterminate bytes in these fields.
		uint64_t m_source_domain = 0;
		uint64_t m_target_hash = 0;

		link_record() : m_value(0), m_score(0.0f) {}
		link_record(uint64_t value) : m_value(value), m_score(0.0f) {}
		link_record(uint64_t value, float score) : m_value(value), m_score(score) {}

		bool operator==(const link_record &b) const {
			return m_value == b.m_value;
		}

		bool operator<(const link_record &b) const {
			return m_value < b.m_value;
		}

		/*
		 * Combining two link records leaves the left hand side unchanged.
		 * */
		link_record &operator+=(const link_record &) {
			return *this;
		}

		/*
		 * Will be applied to records before truncating. Top records will be kept.
		 * */
		struct truncate_order {
			inline bool operator() (const link_record &a, const link_record &b) const {
				return a.m_score > b.m_score;
			}
		};

		/*
		 * Will be applied before storing on disk. This is the order the records will be returned in.
		 * */
		struct storage_order {
			inline bool operator() (const link_record &a, const link_record &b) const {
				return a.m_target_hash < b.m_target_hash;
			}
		};

		/*
		 * True when both records point at the same target hash.
		 * */
		bool storage_equal(const link_record &a) const {
			return m_target_hash == a.m_target_hash;
		}

	};
	#pragma pack()
}


================================================
FILE: src/indexer/merger.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "merger.h"
#include <atomic>
#include <chrono>
#include <exception>
#include <map>
#include <mutex>
#include <stdexcept>
#include <thread>
#include "memory/memory.h"
#include "memory/debugger.h"
#include "utils/thread_pool.hpp"

using namespace std;

namespace indexer {

	namespace merger {

		// Fraction of total memory the registered builders may hold before an append is triggered.
		double mem_limit = 0.4;

		// Raised while append_all()/merge_all() is running. It is written by the thread running
		// the pass and polled by other threads in wait_for_merges(), hence atomic.
		std::atomic<bool> is_merging{false};

		// Callbacks registered per id. All three maps are guarded by merger_lock.
		map<size_t, std::function<void()>> mergers;
		map<size_t, std::function<void()>> appenders;
		map<size_t, std::function<size_t()>> sizes;
		mutex merger_lock;

		/*
		 * Sets the fraction of total memory that merge_thread() compares total_sizes() against.
		 * The parameter shadows the namespace variable, hence the fully qualified name.
		 * */
		void set_mem_limit(double mem_limit) {
			::indexer::merger::mem_limit = mem_limit;
		}

		/*
		 * Blocks the caller, polling every 100ms, until is_merging is false.
		 * */
		void wait_for_merges() {
			for (;;) {
				if (!is_merging) break;
				std::this_thread::sleep_for(100ms);
			}
		}

		/*
		 * Returns once no append/merge pass is in progress. No mutex is taken;
		 * this only waits for is_merging to drop.
		 * */
		void lock() {
			// wait_for_merges() returns immediately when nothing is merging.
			wait_for_merges();
		}

		/*
		 * Registers (or replaces) the append and size callbacks for id.
		 * */
		void register_appender(size_t id, std::function<void()> append, std::function<size_t()> size) {
			std::lock_guard guard(merger_lock);

			appenders[id] = std::move(append);
			sizes[id] = std::move(size);
		}

		/*
		 * Registers (or replaces) the merge callback for id.
		 * */
		void register_merger(size_t id, std::function<void()> merge) {
			std::lock_guard guard(merger_lock);

			mergers[id] = std::move(merge);
		}

		/*
		 * Removes every callback (append, merge and size) registered for id.
		 * Ids that are not registered are ignored.
		 * */
		void deregister_merger(size_t id) {
			std::lock_guard guard(merger_lock);

			sizes.erase(id);
			mergers.erase(id);
			appenders.erase(id);
		}

		// Loop condition of merge_thread(). Cleared from the thread that stops the merge thread
		// while the merge thread itself is reading it, hence atomic.
		std::atomic<bool> merge_thread_is_running{true};
		thread merge_thread_obj;

		void append_all() {
			is_merging = true;
			this_thread::sleep_for(1000ms);

			size_t available_memory = memory::get_total_memory();

			std::cout << "APPENDING ALL: " << appenders.size() << " mergers allocated memory: " << memory::allocated_memory() << " limit is: " <<
				(available_memory * mem_limit) << std::endl;
			
			utils::thread_pool pool(32);

			merger_lock.lock();
			for (auto &iter : appenders) {
				pool.enqueue([iter]() {
					try {
						iter.second();
					} catch (...) {

					}
				});
			}

			pool.run_all();

			cout << "done... allocated memory: " << memory::allocated_memory() << endl;

			merger_lock.unlock();
			is_merging = false;
		}

		void merge_all() {
			is_merging = true;
			this_thread::sleep_for(1000ms);

			size_t available_memory = memory::get_total_memory();

			std::cout << "MERGING ALL: " << mergers.size() << " mergers allocated memory: " << memory::allocated_memory() << " limit is: " <<
				(available_memory * mem_limit) << std::endl;
			
			utils::thread_pool pool(32);

			for (auto &iter : mergers) {
				pool.enqueue([iter]() {
					try {
					      iter.second();
					} catch (...) {
					      
					}
				});
			}

			pool.run_all();

			cout << "done... allocated memory: " << memory::allocated_memory() << endl;

			is_merging = false;
		}

		/*
		 * Sum of what every registered size callback reports. Holds merger_lock while summing.
		 * */
		size_t total_sizes() {
			std::lock_guard guard(merger_lock);

			size_t total = 0;
			for (const auto &entry : sizes) {
				const auto &size_of = entry.second;
				total += size_of();
			}
			return total;
		}

		/*
		 * Body of the background thread. Every 200ms it compares total_sizes() with
		 * total memory * mem_limit and runs append_all() when the limit is exceeded.
		 * Total memory is read once, when the thread starts.
		 * */
		void merge_thread() {
			memory::update();
			const size_t total_memory = memory::get_total_memory();

			for (; merge_thread_is_running; this_thread::sleep_for(200ms)) {
				if (total_sizes() > total_memory * mem_limit) {
					append_all();
				}
			}
		}

		/*
		 * Starts the background thread running merge_thread().
		 * Throws std::runtime_error if a started thread has not been joined yet.
		 * */
		void start_merge_thread() {
			merge_thread_is_running = true;

			const bool already_started = merge_thread_obj.joinable();
			if (already_started) {
				throw std::runtime_error("Trying to start already started merge thread. Not allowed.");
			}

			merge_thread_obj = thread(merge_thread);
		}

		/*
		 * Stops the background thread and then runs all appenders followed by all mergers.
		 * Safe to call when the thread was never started or has already been stopped.
		 * */
		void stop_merge_thread() {
			merge_thread_is_running = false;
			// join() on a thread that is not joinable throws std::system_error.
			if (merge_thread_obj.joinable()) {
				merge_thread_obj.join();
			}
			append_all();
			merge_all();
		}

		/*
		 * Stops the background thread and then runs all appenders. Mergers are not run.
		 * Safe to call when the thread was never started or has already been stopped.
		 * */
		void stop_merge_thread_only_append() {
			merge_thread_is_running = false;
			// join() on a thread that is not joinable throws std::system_error.
			if (merge_thread_obj.joinable()) {
				merge_thread_obj.join();
			}
			append_all();
		}

		/*
		 * Stops the background thread without running appenders or mergers.
		 * Safe to call when the thread was never started or has already been stopped.
		 * */
		void terminate_merge_thread() {
			merge_thread_is_running = false;
			// join() on a thread that is not joinable throws std::system_error.
			if (merge_thread_obj.joinable()) {
				merge_thread_obj.join();
			}
		}

		/*
		 * Runs all registered appenders right away, regardless of the memory limit.
		 * */
		void force_append() {
			append_all();
		}
	}

}


================================================
FILE: src/indexer/merger.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include <iostream>
#include <functional>

using namespace std;

namespace indexer {

	/*
	 * Process wide registry of append/merge callbacks plus a background thread that triggers
	 * appends when the registered builders use too much memory.
	 * */
	namespace merger {
		// Sets the fraction of total memory the registered sizes may reach before an append is triggered.
		void set_mem_limit(double mem_limit);
		// Blocks while an append/merge pass is in progress. Does not take a mutex.
		void lock();
		// Registers (or replaces) the merge callback for id.
		void register_merger(size_t id, std::function<void()> merge);
		// Registers (or replaces) the append and size callbacks for id.
		void register_appender(size_t id, std::function<void()> append, std::function<size_t()> size);
		// Removes the append, merge and size callbacks for id.
		void deregister_merger(size_t id);

		// Starts the background thread. Throws std::runtime_error if it is already started.
		void start_merge_thread();
		// Stops and joins the background thread, then runs all appenders followed by all mergers.
		void stop_merge_thread();
		// Stops and joins the background thread, then runs all appenders only.
		void stop_merge_thread_only_append();
		// Stops and joins the background thread without appending or merging.
		void terminate_merge_thread();
		// Runs all appenders immediately.
		void force_append();
	};

}


================================================
FILE: src/indexer/regular_index_builder.h
================================================


================================================
FILE: src/indexer/return_record.h
================================================

#pragma once

#include "URL.h"
#include "generic_record.h"
#include "text/text.h"

namespace indexer {

	/*
	This is the returned record from the index_manager. It contains more data than the stored record.
	*/
	/*
	 * Record handed back to callers of the index_manager. On top of the generic_record it carries
	 * the url, title, snippet and meta text parsed from the tab separated data stored for the url.
	 * */
	class return_record : public generic_record {

		public:
		// Zero initialized: none of the constructors below assigns these two hashes.
		uint64_t m_url_hash = 0;
		uint64_t m_domain_hash = 0;
		size_t m_num_url_links = 0;
		size_t m_num_domain_links = 0;
		URL m_url;
		std::string m_title;
		std::string m_snippet;
		std::string m_meta;

		return_record() : generic_record() {}
		return_record(uint64_t value) : generic_record(value) {}
		return_record(uint64_t value, float score) : generic_record(value, score) {}

		/*
		 * Builds the record from tab separated data.
		 * Column 0 is the url, column 1 the title, column 3 the meta text and column 4 the text
		 * the snippet is cut from. Other columns are ignored. If column 4 gives an empty snippet
		 * the snippet is cut from the meta text instead.
		 * */
		return_record(uint64_t value, float score, const std::string &tsv_data) : generic_record(value, score) {

			size_t pos_start = 0;
			size_t pos_end = 0;
			size_t col_num = 0;
			while (pos_end != std::string::npos) {
				pos_end = tsv_data.find('\t', pos_start);
				// For the last column pos_end is npos; substr() then clamps to the end of the string.
				const size_t len = pos_end - pos_start;
				if (col_num == 0) {
					m_url = URL(tsv_data.substr(pos_start, len));
				}
				if (col_num == 1) {
					m_title = tsv_data.substr(pos_start, len);
				}
				if (col_num == 3) {
					m_meta = tsv_data.substr(pos_start, len);
				}
				if (col_num == 4) {
					m_snippet = make_snippet(tsv_data.substr(pos_start, len));
					if (m_snippet.size() == 0) {
						m_snippet = make_snippet(m_meta);
					}
				}

				pos_start = pos_end + 1;
				col_num++;
			}

		}

		private:
		/*
		 * Returns the first 140 bytes of text, trimmed. "..." is appended when the trimmed
		 * result is still at least 140 bytes long.
		 * NOTE(review): the cut is byte based, so it can split a multi byte character if the
		 * text is UTF-8 - confirm whether that matters for consumers of the snippet.
		 * */
		std::string make_snippet(const std::string &text) const {
			auto response = text.substr(0, 140);
			text::trim(response);
			if (response.size() >= 140) response += "...";
			return response;
		}

	};
}


================================================
FILE: src/indexer/score_builder.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "score_builder.h"
#include <iostream>
#include <map>

namespace indexer {

	/*
	 * Stores the document count and a pointer to the per document sizes, then computes the
	 * average document size. document_sizes is not copied and is dereferenced without a null
	 * check, so it must point to a map that outlives this object.
	 * */
	score_builder::score_builder(size_t num_documents, const std::map<uint64_t, size_t> *document_sizes)
	: m_num_documents(num_documents), m_document_sizes(document_sizes)
	{
		calculate_avg_document_size();
	}
		
	/*
	 * Always returns 0.0f.
	 * */
	float score_builder::score() const {
		return 0.0f;
	}

	/*
	 * Size registered for doc_id, or 0 if the document is not in the size map.
	 * */
	size_t score_builder::document_size(uint64_t doc_id) const {
		const auto found = m_document_sizes->find(doc_id);
		if (found == m_document_sizes->end()) return 0;
		return found->second;
	}

	/*
	 * Sets m_avg_document_size to the mean of all sizes in the size map, or to 0 for an empty map.
	 * */
	void score_builder::calculate_avg_document_size() {
		if (m_document_sizes->empty()) {
			m_avg_document_size = 0.0f;
			return;
		}

		size_t total = 0;
		for (const auto &entry : *m_document_sizes) {
			total += entry.second;
		}

		m_avg_document_size = static_cast<float>(total) / m_document_sizes->size();
	}

}


================================================
FILE: src/indexer/score_builder.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include <cstdint>
#include <iostream>
#include <map>

namespace indexer {

	/*
	 * Holds the document statistics (count, per document size, average size) used when scoring.
	 * */
	class score_builder {

	public:

		/* document_sizes maps document id to size. Only the pointer is stored. */
		score_builder(size_t num_documents, const std::map<uint64_t, size_t> *document_sizes);

		float score() const;

		/* Number of documents given to the constructor. */
		size_t document_count() const {
			return m_num_documents;
		}

		size_t document_size(uint64_t doc_id) const;

		/* Average computed from the size map when the object was constructed. */
		float avg_document_size() const {
			return m_avg_document_size;
		}

	private:

		size_t m_num_documents;
		float m_avg_document_size;
		const std::map<uint64_t, size_t> *m_document_sizes;

		void calculate_avg_document_size();

	};

}


================================================
FILE: src/indexer/sharded.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include <iostream>
#include <vector>
#include <memory>
#include "config.h"
#include "algorithm/sum_sorted.h"
#include "algorithm/intersection.h"
#include "utils/thread_pool.hpp"

namespace indexer {

	/*
	 * Read access to an index called 'db_name' that is split into 'num_shards' shards.
	 * Every lookup constructs an index_type<data_record> for shard number (key % num_shards).
	 * */
	template<template<typename> typename index_type, typename data_record>
	class sharded {

	public:

		/*
		 * The two argument constructor takes the hash table size from config::shard_hash_table_size.
		 * */
		sharded(const std::string &db_name, size_t num_shards);
		sharded(const std::string &db_name, size_t num_shards, size_t hash_table_size);
		~sharded();

		/*
		 * Looks up a single key in the shard it belongs to and returns what the shard's find() returns.
		 * The second overload forwards 'limit' to the shard.
		 * */
		std::vector<data_record> find(uint64_t key) const;
		std::vector<data_record> find(uint64_t key, size_t limit) const;

		/*
		 * Intersects the records of all 'keys'.
		 * With limit == 0 the whole intersection is returned. Otherwise, if the intersection holds more
		 * than 'limit' records, only the 'limit' records with the highest m_score are kept and their
		 * relative order is unspecified.
		 * */
		std::vector<data_record> find_intersection(const std::vector<uint64_t> &keys, size_t limit = 0) const;

		/*
		 * Fetches at most 'limit' records for each key, sorts every per-key result and combines them
		 * with ::algorithm::sum_sorted, adding up the m_score of records that are combined.
		 * Returns the vector of summed records.
		 * */
		std::vector<data_record> find_sum(const std::vector<uint64_t> &keys, size_t limit) const;

		/*
		 * Runs the shard's for_each with 'on_each_key' for every shard, using a thread pool.
		 * The overload without 'num_threads' uses 32 threads.
		 * */
		void for_each(std::function<void(uint64_t key, std::vector<data_record> &recs)> on_each_key) const;
		void for_each(std::function<void(uint64_t key, std::vector<data_record> &recs)> on_each_key, size_t num_threads) const;

	private:

		void read_meta();
		std::string filename() const;

		std::string m_db_name;
		size_t m_num_shards;
		size_t m_hash_table_size;

	};

	/*
	 * Same as the three argument constructor with hash_table_size = config::shard_hash_table_size.
	 * */
	template<template<typename> typename index_type, typename data_record>
	sharded<index_type, data_record>::sharded(const std::string &db_name, size_t num_shards)
	: sharded(db_name, num_shards, config::shard_hash_table_size)
	{
	}

	template<template<typename> typename index_type, typename data_record>
	sharded<index_type, data_record>::sharded(const std::string &db_name, size_t num_shards, size_t hash_table_size)
	: m_db_name(db_name), m_num_shards(num_shards), m_hash_table_size(hash_table_size)
	{
		read_meta();
	}

	/*
	 * Nothing to release explicitly.
	 * */
	template<template<typename> typename index_type, typename data_record>
	sharded<index_type, data_record>::~sharded() = default;

	/*
	 * Returns the records the owning shard (key % m_num_shards) has stored for 'key'.
	 * */
	template<template<typename> typename index_type, typename data_record>
	std::vector<data_record> sharded<index_type, data_record>::find(uint64_t key) const {

		const size_t owning_shard = key % m_num_shards;

		index_type<data_record> shard(m_db_name, owning_shard, m_hash_table_size);
		return shard.find(key);
	}

	/*
	 * Same as find(key) but forwards 'limit' to the shard's find().
	 * */
	template<template<typename> typename index_type, typename data_record>
	std::vector<data_record> sharded<index_type, data_record>::find(uint64_t key, size_t limit) const {

		const size_t owning_shard = key % m_num_shards;

		index_type<data_record> shard(m_db_name, owning_shard, m_hash_table_size);
		return shard.find(key, limit);
	}

	/*
	 * Fetches the records of every key from its shard and intersects them with ::algorithm::intersection.
	 *
	 * keys: keys whose record lists are intersected.
	 * limit: 0 returns the whole intersection. Otherwise, if the intersection has more than 'limit'
	 *        records, only the 'limit' records with the highest m_score are kept (unspecified order).
	 * */
	template<template<typename> typename index_type, typename data_record>
	std::vector<data_record> sharded<index_type, data_record>::find_intersection(const std::vector<uint64_t> &keys, size_t limit) const {

		std::vector<std::unique_ptr<data_record[]>> results;
		std::vector<size_t> num_results;
		results.reserve(keys.size());
		num_results.reserve(keys.size());

		for (uint64_t key : keys) {

			const size_t shard_id = key % m_num_shards;
			index_type<data_record> idx(m_db_name, shard_id, m_hash_table_size);

			// Initialised so the stored count is well defined even if find_ptr leaves it untouched.
			size_t num_records = 0;
			std::unique_ptr<data_record[]> res = idx.find_ptr(key, num_records);
			results.emplace_back(std::move(res));
			num_results.push_back(num_records);
		}

		std::vector<data_record> ret = ::algorithm::intersection(results, num_results);

		if (limit && ret.size() > limit) {
			// Partition so that the first 'limit' elements are the ones with the highest score.
			std::nth_element(ret.begin(), ret.begin() + (limit - 1), ret.end(), [](const auto &a, const auto &b) {
				return a.m_score > b.m_score;
			});
			ret.resize(limit);
		}

		return ret;
	}

	/*
	 * Fetches at most 'limit' records for every key, sorts each per-key result with operator< and
	 * combines the sorted lists with ::algorithm::sum_sorted, adding m_score of combined records.
	 *
	 * keys: keys to look up.
	 * limit: forwarded to the shard's find() for every key.
	 * Returns the vector produced by ::algorithm::sum_sorted.
	 * */
	template<template<typename> typename index_type, typename data_record>
	std::vector<data_record> sharded<index_type, data_record>::find_sum(const std::vector<uint64_t> &keys,
			size_t limit) const {

		std::vector<std::vector<data_record>> results;
		results.reserve(keys.size());

		for (uint64_t key : keys) {
			const size_t shard_id = key % m_num_shards;
			index_type<data_record> idx(m_db_name, shard_id, m_hash_table_size);

			std::vector<data_record> res = idx.find(key, limit);

			// Qualified call: an unqualified sort() is only found through argument dependent lookup.
			std::sort(res.begin(), res.end());

			results.emplace_back(std::move(res));
		}

		// Sum equal elements.
		return ::algorithm::sum_sorted<data_record>(results, [](data_record &a, const data_record &b) {
			a.m_score += b.m_score;
		});

	}

	/*
	 * Convenience overload: iterates all shards with a pool of 32 threads.
	 * */
	template<template<typename> typename index_type, typename data_record>
	void sharded<index_type, data_record>::for_each(std::function<void(uint64_t key, std::vector<data_record> &recs)> on_each_key) const {
		const size_t default_num_threads = 32;
		for_each(on_each_key, default_num_threads);
	}

	/*
	 * Enqueues one task per shard on a utils::thread_pool with 'num_threads' threads. Each task opens
	 * its shard and calls the shard's for_each with 'on_each_key'.
	 * The callback is shared by reference between the tasks, so it is presumably invoked from
	 * several threads at once - verify against utils::thread_pool.
	 * */
	template<template<typename> typename index_type, typename data_record>
	void sharded<index_type, data_record>::for_each(std::function<void(uint64_t key, std::vector<data_record> &recs)> on_each_key, size_t num_threads) const {
		utils::thread_pool workers(num_threads);

		size_t shard_id = 0;
		while (shard_id < m_num_shards) {
			workers.enqueue([this, shard_id, &on_each_key]() {
				index_type<data_record> shard(m_db_name, shard_id, m_hash_table_size);
				shard.for_each(on_each_key);
			});
			shard_id++;
		}

		workers.run_all();
	}

	/*
	 * Opens the meta file. Nothing is read from it yet.
	 * */
	template<template<typename> typename index_type, typename data_record>
	void sharded<index_type, data_record>::read_meta() {
		std::ifstream meta_file(filename(), std::ios::binary);

		if (!meta_file.is_open()) {
			return;
		}

		// No meta data is consumed by this class at the moment.
	}

	/*
	 * Returns the path of the meta file: <data_path>/0/full_text/<db_name>.meta
	 * The file is meant to hold meta data on the index, for example the hyper log log document counter.
	 * */
	template<template<typename> typename index_type, typename data_record>
	std::string sharded<index_type, data_record>::filename() const {
		std::string path = config::data_path();
		path += "/0/full_text/";
		path += m_db_name;
		path += ".meta";
		return path;
	}

}


================================================
FILE: src/indexer/sharded_builder.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include <iostream>
#include <map>
#include <fstream>
#include "algorithm/hyper_log_log.h"
#include "utils/thread_pool.hpp"
#include "debug.h"
#include "config.h"

namespace indexer {

	/*
	 * Builds an index that is split into shards. Records are routed to shard (key % number of shards).
	 * The constructor reads the meta file and the destructor writes it back.
	 * */
	template<template<typename> typename index_type, typename data_record>
	class sharded_builder {

	public:

		sharded_builder(const std::string &db_name, size_t num_shards);
		~sharded_builder();

		// Non copyable
		sharded_builder(const sharded_builder &) = delete;
		sharded_builder& operator=(const sharded_builder &) = delete;

		/*
		 * Adds the record to the shard owning 'key' and inserts record.m_value into the document counter.
		 * */
		void add(uint64_t key, const data_record &record);

		/*
		 * append() calls append() on every shard in turn, merge() calls merge() on every shard
		 * through a thread pool.
		 * */
		void append();
		void merge();

		/*
		 * truncate() truncates every shard and the meta file and resets the in-memory counters.
		 * truncate_cache_files() and create_directories() forward to every shard.
		 * */
		void truncate();
		void truncate_cache_files();
		void create_directories();

		/* Estimated number of distinct documents, taken from the hyper log log counter. */
		size_t document_count() const { return m_document_counter.count(); }

		/*
		 * Returns the stored size of the document, or 0 when the document is unknown.
		 * Uses find() instead of operator[] so that a lookup never inserts an empty entry, which
		 * would otherwise end up in the average document size and in the written meta file.
		 * */
		size_t document_size(uint64_t value) const {
			const auto iter = m_document_sizes.find(value);
			return iter == m_document_sizes.end() ? 0 : iter->second;
		}

		/*
		 * calculate_scores() rewrites m_score of the records in all shards, sort_by_scores() sorts
		 * the records of all shards by descending m_score.
		 * */
		void calculate_scores();
		void sort_by_scores();

	private:

		std::mutex m_lock;
		std::string m_db_name;
		std::vector<std::shared_ptr<index_type<data_record>>> m_shards;

		::algorithm::hyper_log_log m_document_counter;
		std::map<uint64_t, size_t> m_document_sizes;
		float m_avg_document_size = 0.0f;
		size_t m_num_added_keys = 0;

		void read_meta();
		void write_meta();
		std::string filename() const;

	};

	/*
	 * Creates 'num_shards' shard builders for 'db_name', lets them create their directories and
	 * then loads the meta file.
	 * */
	template<template<typename> typename index_type, typename data_record>
	sharded_builder<index_type, data_record>::sharded_builder(const std::string &db_name, size_t num_shards)
	: m_db_name(db_name)
	{
		size_t shard_id = 0;
		while (shard_id < num_shards) {
			m_shards.push_back(std::make_shared<index_type<data_record>>(db_name, shard_id));
			shard_id++;
		}

		create_directories();
		read_meta();
	}

	/*
	 * Writes the meta file when the builder goes away.
	 * */
	template<template<typename> typename index_type, typename data_record>
	sharded_builder<index_type, data_record>::~sharded_builder() {
		this->write_meta();
	}

	/*
	 * Routes the record to shard (key % number of shards) and registers record.m_value in the
	 * document counter.
	 * */
	template<template<typename> typename index_type, typename data_record>
	void sharded_builder<index_type, data_record>::add(uint64_t key, const data_record &record) {
		const size_t owning_shard = key % m_shards.size();
		m_shards[owning_shard]->add(key, record);

		m_document_counter.insert(record.m_value);

		// The raw (non unique) document size bookkeeping is disabled here: neither m_num_added_keys
		// nor m_document_sizes[record.m_value] is incremented.
	}

	/*
	 * Calls append() on every shard, one after the other.
	 * */
	template<template<typename> typename index_type, typename data_record>
	void sharded_builder<index_type, data_record>::append() {
		for (size_t i = 0; i < m_shards.size(); i++) {
			m_shards[i]->append();
		}
	}

	/*
	 * Merges every shard on a pool of 32 threads.
	 * A failing shard does not stop the other shards, but the failure is reported on std::cerr
	 * instead of being dropped silently.
	 * */
	template<template<typename> typename index_type, typename data_record>
	void sharded_builder<index_type, data_record>::merge() {
		utils::thread_pool pool(32);
		for (size_t i = 0; i < m_shards.size(); i++) {
			pool.enqueue([this, i]() {
				try {
					m_shards[i]->merge();
				} catch (const std::exception &error) {
					std::cerr << "merge failed for shard " << i << " of " << m_db_name << ": " << error.what() << std::endl;
				} catch (...) {
					std::cerr << "merge failed for shard " << i << " of " << m_db_name << ": unknown error" << std::endl;
				}
			});
		}

		pool.run_all();
	}

	/*
	 * Truncates every shard, empties the meta file and resets the in-memory meta data.
	 * */
	template<template<typename> typename index_type, typename data_record>
	void sharded_builder<index_type, data_record>::truncate() {
		for (size_t i = 0; i < m_shards.size(); i++) {
			m_shards[i]->truncate();
		}

		// Opening with std::ios::trunc is all that is needed to empty the meta file.
		std::ofstream meta_file(filename(), std::ios::trunc);

		m_num_added_keys = 0;
		m_document_sizes.clear();
		m_document_counter.reset();
	}

	/*
	 * Calls truncate_cache_files() on every shard.
	 * */
	template<template<typename> typename index_type, typename data_record>
	void sharded_builder<index_type, data_record>::truncate_cache_files() {
		for (size_t i = 0; i < m_shards.size(); i++) {
			m_shards[i]->truncate_cache_files();
		}
	}

	/*
	 * Calls create_directories() on every shard.
	 * */
	template<template<typename> typename index_type, typename data_record>
	void sharded_builder<index_type, data_record>::create_directories() {
		for (size_t i = 0; i < m_shards.size(); i++) {
			m_shards[i]->create_directories();
		}
	}

	/*
	 * Recalculates m_score of every record in every shard with BM25
	 * (https://en.wikipedia.org/wiki/Okapi_BM25), one thread pool task per shard.
	 *
	 * Records of documents with a stored size below 1000 (or without a stored size) get score 0.
	 * A tf-idf scorer is kept next to BM25 but is not used.
	 * */
	template<template<typename> typename index_type, typename data_record>
	void sharded_builder<index_type, data_record>::calculate_scores() {

		const size_t total_records = m_document_counter.count();

		// Average size over all known documents. Left at 0 for an empty map instead of dividing 0 by 0.
		double average_document_size = 0.0;
		if (!m_document_sizes.empty()) {
			for (const auto &iter : m_document_sizes) {
				average_document_size += iter.second;
			}
			average_document_size /= m_document_sizes.size();
		}

		// Read-only size lookup that yields 0 for unknown documents. std::map::operator[] inserts
		// missing keys, and the scorers are handed to tasks on a 32 thread pool, so the shared map
		// must not be modified from inside them.
		const auto size_of_document = [this](uint64_t doc_id) -> size_t {
			const auto found = m_document_sizes.find(doc_id);
			return found == m_document_sizes.end() ? 0 : found->second;
		};

		const auto tf_idf = [size_of_document, total_records](const data_record &rec, size_t num_records) {
			data_record ret = rec;
			float tf = (float)rec.m_count/size_of_document(rec.m_value);
			float idf = (float)total_records/num_records;
			ret.m_score = tf*log(idf);
			return ret;
		};

		const auto bm25 = [size_of_document, total_records, average_document_size](const data_record &rec, size_t num_records) {

			const size_t doc_size = size_of_document(rec.m_value);

			data_record ret = rec;
			if (doc_size < 1000) {
				ret.m_score = 0.0f;
				return ret;
			}

			// https://en.wikipedia.org/wiki/Okapi_BM25
			const double N = total_records;
			const double n_q = num_records;
			const double idf = log((N - n_q + 0.5)/(n_q + 0.5) + 1.0);

			const double count_d = rec.m_count;
			const double d_card = doc_size;

			const double f_q = count_d/d_card;
			const double k1 = 1.2;
			const double b = 0.75;

			const double score = idf * (f_q * (k1 + 1.0)) / (f_q + k1 * (1.0 - b + b * (d_card / average_document_size)));

			ret.m_score = score;
			return ret;
		};

		// tf_idf is kept as an alternative scorer but is not selected.
		(void)tf_idf;

		const auto algo = bm25;

		utils::thread_pool pool(32);
		for (size_t i = 0; i < m_shards.size(); i++) {
			pool.enqueue([this, i, algo](){
				m_shards[i]->transform(algo);
			});
		}
		pool.run_all();
	}

	/*
	 * Sorts the records of every shard by descending m_score, one pool task per shard (32 threads).
	 * */
	template<template<typename> typename index_type, typename data_record>
	void sharded_builder<index_type, data_record>::sort_by_scores() {

		// Records with the higher score come first.
		const auto higher_score_first = [](const data_record &a, const data_record &b) {
			return a.m_score > b.m_score;
		};

		utils::thread_pool workers(32);
		for (size_t shard_id = 0; shard_id < m_shards.size(); shard_id++) {
			workers.enqueue([this, shard_id, higher_score_first](){
				m_shards[shard_id]->sort_by(higher_score_first);
			});
		}
		workers.run_all();
	}

	/*
	 * Loads the meta data written by write_meta(), if the meta file exists:
	 * [number of added keys][raw document counter bytes][number of documents][(doc id, size) ...]
	 *
	 * Reading stops at the first failed read, so an empty or truncated meta file (truncate() leaves
	 * an empty one) does not add bogus document sizes.
	 * */
	template<template<typename> typename index_type, typename data_record>
	void sharded_builder<index_type, data_record>::read_meta() {
		std::ifstream meta_file(filename(), std::ios::binary);

		if (!meta_file.is_open()) {
			return;
		}

		size_t num_added_keys = 0;
		if (!meta_file.read((char *)(&num_added_keys), sizeof(size_t))) {
			return;
		}
		m_num_added_keys = num_added_keys;

		char *data = m_document_counter.data();
		if (!meta_file.read(data, m_document_counter.data_size())) {
			return;
		}

		size_t num_docs = 0;
		if (!meta_file.read((char *)(&num_docs), sizeof(size_t))) {
			return;
		}

		for (size_t i = 0; i < num_docs; i++) {
			uint64_t doc_id = 0;
			size_t count = 0;
			meta_file.read((char *)(&doc_id), sizeof(uint64_t));
			meta_file.read((char *)(&count), sizeof(size_t));
			if (!meta_file) {
				// The file ended before the announced number of entries was read.
				break;
			}
			m_document_sizes[doc_id] = count;
		}
	}

	/*
	 * Rewrites the meta file from scratch. Layout:
	 * [number of added keys][raw document counter bytes][number of documents][(doc id, size) ...]
	 * Nothing is written when the file cannot be opened.
	 * */
	template<template<typename> typename index_type, typename data_record>
	void sharded_builder<index_type, data_record>::write_meta() {
		std::ofstream meta_file(filename(), std::ios::binary | std::ios::trunc);

		if (!meta_file.is_open()) {
			return;
		}

		meta_file.write(reinterpret_cast<const char *>(&m_num_added_keys), sizeof(size_t));

		meta_file.write(m_document_counter.data(), m_document_counter.data_size());

		// Document sizes, prefixed with the number of entries.
		const size_t num_entries = m_document_sizes.size();
		meta_file.write(reinterpret_cast<const char *>(&num_entries), sizeof(size_t));
		for (const auto &entry : m_document_sizes) {
			meta_file.write(reinterpret_cast<const char *>(&entry.first), sizeof(uint64_t));
			meta_file.write(reinterpret_cast<const char *>(&entry.second), sizeof(size_t));
		}
	}

	/*
	 * Returns the path of the meta file: <data_path>/0/full_text/<db_name>.meta
	 * The file holds meta data on the index, for example the hyper log log document counter.
	 * */
	template<template<typename> typename index_type, typename data_record>
	std::string sharded_builder<index_type, data_record>::filename() const {
		std::string path = config::data_path();
		path += "/0/full_text/";
		path += m_db_name;
		path += ".meta";
		return path;
	}

}


================================================
FILE: src/indexer/sharded_index.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include "index.h"
#include "algorithm/intersection.h"
#include "algorithm/top_k.h"
#include "utils/thread_pool.hpp"
#include "config.h"
#include <mutex>

namespace indexer {

	/*
	 * Read access to a sharded index whose shards store bitmaps of internal ids. The records
	 * themselves are loaded from the meta file into m_records and an internal id is the position of
	 * a record in that vector.
	 * */
	template<typename data_record>
	class sharded_index {

	public:

		/*
		 * The two argument constructor takes the hash table size from config::shard_hash_table_size.
		 * */
		sharded_index(const std::string &db_name, size_t num_shards);
		sharded_index(const std::string &db_name, size_t num_shards, size_t hash_table_size);
		~sharded_index();

		/*
		 * Looks up a single key.
		 * Returns the records for the internal ids in the key's bitmap, in bitmap iteration order.
		 * */
		std::vector<data_record> find(uint64_t key) const;

		/*
		 * Looks up a single key.
		 * Returns the bitmap of internal ids stored for it.
		 * */
		roaring::Roaring find_bitmap(uint64_t key) const;

		/*
		 * Intersects the bitmaps of all keys.
		 * Returns the records for the resulting internal ids, in bitmap iteration order.
		 * */
		std::vector<data_record> find_intersection(const std::vector<uint64_t> &keys) const;

		/*
		 * Intersects the bitmaps of all keys and scores every hit as m_score * score_mod(...), then
		 * selects n hits with ::algorithm::top_k and returns them sorted with data_record::score_order.
		 * total_num_results receives the cardinality of the intersection.
		 * Note that the default score_mod returns 0.0f, which makes every resulting score 0.
		 * */
		std::vector<data_record> find_top(size_t &total_num_results, const std::vector<uint64_t> &keys, size_t n, 
				std::function<float(const data_record &)> score_mod = [](const data_record &) { return 0.0f; }) const;

		/*
		 * Same as above for callers that do not need total_num_results.
		 * */
		std::vector<data_record> find_top(const std::vector<uint64_t> &keys, size_t n, 
				std::function<float(const data_record &)> score_mod = [](const data_record &) { return 0.0f; }) const;

		/*
		 * Intersects the bitmaps of all keys and groups consecutive hits for which
		 * data_record::storage_equal holds. 'score_formula' is applied to every m_score before it is
		 * added to the score of its group. 'counts' receives one entry per group with the group size.
		 * Returns one record per group.
		 * */
		std::vector<data_record> find_group_by(const std::vector<uint64_t> &keys,
				std::function<float(float)> score_formula, std::vector<size_t> &counts) const;

		/*
		 * Collects the keys reported by get_keys(with_more_than_records) of every shard.
		 * Returns them as a std::set<uint64_t>.
		 * This function is slow since it has to open every shard.
		 * */
		std::set<uint64_t> get_keys(size_t with_more_than_records) const;

		/*
		 * Runs the shard's for_each with 'on_each_key' for every shard, using a pool of 32 threads.
		 * */
		void for_each(std::function<void(uint64_t key, roaring::Roaring &bitmap)> on_each_key) const;

		/*
		 * Returns the number of records loaded from the meta file.
		 * */
		size_t num_records() const { return m_records.size(); }

		/*
		 * Appends the record of every internal id in 'bitmap' to 'append_to'.
		 * */
		void get_records_for_bitmap(const roaring::Roaring &bitmap, std::vector<data_record> &append_to) const;

	private:

		void read_meta();
		std::string filename() const;

		std::string m_db_name;
		size_t m_num_shards;
		size_t m_hash_table_size;

		std::vector<data_record> m_records;
		mutable std::vector<float> m_scores;
		std::map<uint64_t, uint32_t> m_record_id_map;

	};

	/*
	 * Same as the three argument constructor with hash_table_size = config::shard_hash_table_size.
	 * */
	template<typename data_record>
	sharded_index<data_record>::sharded_index(const std::string &db_name, size_t num_shards)
	: sharded_index(db_name, num_shards, config::shard_hash_table_size)
	{
	}

	template<typename data_record>
	sharded_index<data_record>::sharded_index(const std::string &db_name, size_t num_shards, size_t hash_table_size)
	: m_db_name(db_name), m_num_shards(num_shards), m_hash_table_size(hash_table_size)
	{
		read_meta();
	}

	/*
	 * Nothing to release explicitly.
	 * */
	template<typename data_record>
	sharded_index<data_record>::~sharded_index() = default;

	/*
	 * Fetches the bitmap of 'key' from its shard (key % m_num_shards) and returns a copy of the
	 * record behind every internal id in it.
	 * */
	template<typename data_record>
	std::vector<data_record> sharded_index<data_record>::find(uint64_t key) const {

		const size_t owning_shard = key % m_num_shards;
		index<data_record> shard(m_db_name, owning_shard, m_hash_table_size);

		const roaring::Roaring bitmap = shard.find_bitmap(key);

		std::vector<data_record> records;
		for (const uint32_t internal_id : bitmap) {
			// An internal id is the position of the record in m_records.
			records.push_back(m_records[internal_id]);
		}

		return records;
	}

	/*
	 * Returns the bitmap the owning shard (key % m_num_shards) has stored for 'key'.
	 * */
	template<typename data_record>
	roaring::Roaring sharded_index<data_record>::find_bitmap(uint64_t key) const {

		const size_t owning_shard = key % m_num_shards;

		index<data_record> shard(m_db_name, owning_shard, m_hash_table_size);
		return shard.find_bitmap(key);
	}

	/*
	 * Fetches the bitmap of every key, intersects the bitmaps with ::algorithm::intersection and
	 * returns a copy of the record behind every internal id in the intersection.
	 * */
	template<typename data_record>
	std::vector<data_record> sharded_index<data_record>::find_intersection(const std::vector<uint64_t> &keys) const {

		std::vector<roaring::Roaring> bitmaps;
		for (const uint64_t key : keys) {

			const size_t owning_shard = key % m_num_shards;
			index<data_record> shard(m_db_name, owning_shard, m_hash_table_size);

			bitmaps.emplace_back(shard.find_bitmap(key));
		}

		const roaring::Roaring common_ids = ::algorithm::intersection(bitmaps);

		std::vector<data_record> records;
		for (const uint32_t internal_id : common_ids) {
			// An internal id is the position of the record in m_records.
			records.push_back(m_records[internal_id]);
		}

		return records;
	}

	/*
	 * Intersects the bitmaps of all keys, scores every hit and returns the hits selected by
	 * ::algorithm::top_k, sorted with data_record::score_order.
	 *
	 * total_num_results: receives the cardinality of the intersection.
	 * keys: keys whose bitmaps are intersected.
	 * n: forwarded to ::algorithm::top_k as the number of hits to select.
	 * score_mod: the score of a hit is m_score * score_mod(m_value of the hit).
	 *
	 * The scores are kept in the mutable member m_scores, which is cleared and rewritten on every call.
	 * */
	template<typename data_record>
	std::vector<data_record> sharded_index<data_record>::find_top(size_t &total_num_results, const std::vector<uint64_t> &keys, size_t n,
			std::function<float(const data_record &)> score_mod) const {

		std::fill(m_scores.begin(), m_scores.end(), 0.0f);

		std::vector<roaring::Roaring> results;
		results.reserve(keys.size());
		for (uint64_t key : keys) {

			const size_t shard_id = key % m_num_shards;
			index<data_record> idx(m_db_name, shard_id, m_hash_table_size);
			
			roaring::Roaring res = idx.find_bitmap(key);
			results.emplace_back(std::move(res));
		}

		roaring::Roaring rr = ::algorithm::intersection(results);

		total_num_results = rr.cardinality();

		// Apply score modifications.
		std::vector<uint32_t> ids;
		for (uint32_t internal_id : rr) {
			// Skip ids without a loaded record, as find_group_by does. Without this check the
			// assignment below would write outside of m_scores (read_meta keeps it as long as m_records).
			if (internal_id >= m_records.size()) {
				continue;
			}
			ids.push_back(internal_id);
			m_scores[internal_id] = m_records[internal_id].m_score * score_mod(m_records[internal_id].m_value);
		}

		auto ordered = [this](const uint32_t &a, const uint32_t &b) {
			return m_scores[a] < m_scores[b];
		};

		std::vector<uint32_t> top_ids = ::algorithm::top_k<uint32_t>(ids, n, ordered);

		std::vector<data_record> ret;
		ret.reserve(top_ids.size());
		for (uint32_t internal_id : top_ids) {
			ret.push_back(m_records[internal_id]);
			ret.back().m_score = m_scores[internal_id];
		}

		std::sort(ret.begin(), ret.end(), typename data_record::score_order());

		return ret;
	}

	/*
	 * Forwards to the find_top overload with total_num_results and discards the total.
	 * */
	template<typename data_record>
	std::vector<data_record> sharded_index<data_record>::find_top(const std::vector<uint64_t> &keys, size_t n,
			std::function<float(const data_record &)> score_mod) const {

		size_t discarded_total = 0;

		return find_top(discarded_total, keys, n, score_mod);
	}

	/*
	 * Intersects the bitmaps of all keys and folds consecutive hits that are storage_equal into one
	 * group. The score of a group is the sum of score_formula(m_score) over its hits, and 'counts'
	 * gets one appended entry per group holding the number of hits in it.
	 * Internal ids without a loaded record are reported on stdout and skipped.
	 * */
	template<typename data_record>
	std::vector<data_record> sharded_index<data_record>::find_group_by(const std::vector<uint64_t> &keys,
			std::function<float(float)> score_formula, std::vector<size_t> &counts) const {

		std::vector<roaring::Roaring> bitmaps;
		for (const uint64_t key : keys) {

			const size_t owning_shard = key % m_num_shards;
			index<data_record> shard(m_db_name, owning_shard, m_hash_table_size);

			bitmaps.emplace_back(shard.find_bitmap(key));
		}

		roaring::Roaring common_ids = ::algorithm::intersection(bitmaps);

		// Group by.
		std::vector<data_record> groups;
		for (uint32_t internal_id : common_ids) {
			if (internal_id >= m_records.size()) {
				std::cout << "internal_id: " << internal_id << " >= " << m_records.size() << std::endl;
				continue;
			}

			const bool continues_group = groups.size() && groups.back().storage_equal(m_records[internal_id]);
			if (!continues_group) {
				// First hit of a new group.
				groups.emplace_back(m_records[internal_id]);
				groups.back().m_score = score_formula(groups.back().m_score);
				counts.push_back(1);
				continue;
			}

			groups.back().m_score += score_formula(m_records[internal_id].m_score);
			counts.back()++;
		}

		return groups;
	}

	/*
	 * Collects the keys returned by get_keys(with_more_than_records) of every shard into one set.
	 * The shards are queried through a pool of 32 threads and the shared result set is protected
	 * by a mutex.
	 * */
	template<typename data_record>
	std::set<uint64_t> sharded_index<data_record>::get_keys(size_t with_more_than_records) const {

		utils::thread_pool pool(32);
		std::mutex lock;
		std::set<uint64_t> all_keys;
		for (size_t shard_id = 0; shard_id < m_num_shards; shard_id++) {

			pool.enqueue([this, shard_id, with_more_than_records, &all_keys, &lock]() {
				index<data_record> idx(m_db_name, shard_id, m_hash_table_size);
				std::set<uint64_t> keys_for_shard = idx.get_keys(with_more_than_records);

				// The guard releases the mutex even if insert() throws.
				std::lock_guard<std::mutex> guard(lock);
				all_keys.insert(keys_for_shard.begin(), keys_for_shard.end());
			});
		}

		pool.run_all();

		return all_keys;

	}

	/*
	 * Enqueues one task per shard on a pool of 32 threads. Each task opens its shard and calls the
	 * shard's for_each with 'on_each_key'.
	 * The callback is shared by reference between the tasks, so it is presumably invoked from
	 * several threads at once - verify against utils::thread_pool.
	 * */
	template<typename data_record>
	void sharded_index<data_record>::for_each(std::function<void(uint64_t key, roaring::Roaring &bitmap)> on_each_key) const {

		utils::thread_pool workers(32);

		size_t shard_id = 0;
		while (shard_id < m_num_shards) {
			workers.enqueue([this, shard_id, &on_each_key]() {
				index<data_record> shard(m_db_name, shard_id, m_hash_table_size);
				shard.for_each(on_each_key);
			});
			shard_id++;
		}

		workers.run_all();
	}

	/*
	 * Appends a copy of the record behind every internal id in 'bitmap' to the vector 'append_to'.
	 * */
	template<typename data_record>
	void sharded_index<data_record>::get_records_for_bitmap(const roaring::Roaring &bitmap, std::vector<data_record> &append_to) const {
		for (auto id_iter = bitmap.begin(); id_iter != bitmap.end(); ++id_iter) {
			const uint32_t internal_id = *id_iter;
			append_to.push_back(m_records[internal_id]);
		}
	}

	/*
	 * Loads the records from the meta file, if it exists: [number of records][raw data_record ...]
	 * Every loaded record gets the next internal id (its position in m_records) and a score slot.
	 *
	 * The record count starts at 0 and every read is checked, so an empty or truncated meta file
	 * yields only the records that were read completely.
	 * */
	template<typename data_record>
	void sharded_index<data_record>::read_meta() {
		std::ifstream meta_file(filename(), std::ios::binary);

		if (!meta_file.is_open()) {
			return;
		}

		// Read records.
		size_t num_records = 0;
		if (!meta_file.read((char *)(&num_records), sizeof(size_t))) {
			return;
		}

		for (size_t i = 0; i < num_records; i++) {
			data_record rec;
			if (!meta_file.read((char *)(&rec), sizeof(data_record))) {
				// The file ended before the announced number of records was read.
				break;
			}

			m_record_id_map[rec.m_value] = m_records.size();
			m_records.push_back(rec);
			m_scores.push_back(0.0f);
		}
	}

	/*
	 * Returns the path of the meta file: <data_path>/0/full_text/<db_name>.meta
	 * read_meta() loads the records of the index from this file.
	 * */
	template<typename data_record>
	std::string sharded_index<data_record>::filename() const {
		std::string path = config::data_path();
		path += "/0/full_text/";
		path += m_db_name;
		path += ".meta";
		return path;
	}

}


================================================
FILE: src/indexer/sharded_index_builder.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include "index_builder.h"
#include "algorithm/hyper_log_log.h"
#include "utils/thread_pool.hpp"
#include "utils/thread_pool_arg.h"

#include <iostream>
#include <numeric>

namespace indexer {

	/*
	 * Builds a sharded index whose shards refer to records through internal ids. The records
	 * themselves are kept in m_records, where the internal id of a record is its position, and
	 * are read from and written to the meta file by the constructor and destructor.
	 * */
	template<typename data_record>
	class sharded_index_builder {

	public:

		sharded_index_builder(const std::string &db_name, size_t num_shards);
		~sharded_index_builder();

		// Non copyable
		sharded_index_builder(const sharded_index_builder &) = delete;
		sharded_index_builder& operator=(const sharded_index_builder &) = delete;

		/*
		 * Adds the record to the shard owning 'key' (key % number of shards).
		 * */
		void add(uint64_t key, const data_record &record);
		
		/*
		 * append() calls append() on every shard in turn, merge() merges every shard through a
		 * thread pool, merge_one() merges the single shard with the given index and optimize()
		 * sorts the records when they are not in storage order.
		 * */
		void append();
		void merge();
		void merge_one(size_t id);
		void optimize();

		/*
			This function calculate scores. Should run after a merge.
		*/
		void calculate_scores(algorithm algo);

		/* Estimated number of distinct documents, taken from the hyper log log counter. */
		size_t num_documents() const { return m_document_counter.count(); }

		/*
		 * Returns the stored size of the document, or 0 when the document is unknown.
		 * Uses find() instead of operator[] so that a lookup never inserts an empty entry.
		 * */
		size_t document_size(uint64_t document_id) const {
			const auto iter = m_document_sizes.find(document_id);
			return iter == m_document_sizes.end() ? 0 : iter->second;
		}

		/*
		 * truncate() truncates every shard and the meta file and drops the in-memory records.
		 * truncate_cache_files() and create_directories() forward to every shard.
		 * */
		void truncate();
		void truncate_cache_files();
		void create_directories();

		void check();

		/*
		 * Loops over the records and applies transform.
		 * */
		void for_each_record(std::function<void(data_record &)> transform);

	private:

		std::mutex m_lock;
		std::string m_db_name;
		std::vector<std::shared_ptr<index_builder<data_record>>> m_shards;
		::algorithm::hyper_log_log m_document_counter;
		std::map<uint64_t, size_t> m_document_sizes;
		float m_avg_document_size = 0.0f;

		std::vector<data_record> m_records;
		std::map<uint64_t, uint32_t> m_record_id_map;

		void read_meta();
		void write_meta();
		std::string filename() const;
		bool needs_optimization() const;
		void sort_records();

	};

	/*
	 * Creates 'num_shards' index builders for 'db_name', lets them create their directories and
	 * then loads the records from the meta file.
	 * Every shard receives the same callback for translating a record into its internal id.
	 * */
	template<typename data_record>
	sharded_index_builder<data_record>::sharded_index_builder(const std::string &db_name, size_t num_shards)
	: m_db_name(db_name)
	{
		// Returns the internal id of the record, registering the record under the next free id the
		// first time its m_value is seen. m_lock serialises access to the record containers.
		std::function<uint32_t(const data_record &)> rec_to_id = [this](const data_record &record) {
			std::lock_guard guard(m_lock);
			auto known = m_record_id_map.find(record.m_value);
			if (known == m_record_id_map.end()) {
				known = m_record_id_map.emplace(record.m_value, static_cast<uint32_t>(m_records.size())).first;
				m_records.push_back(record);
			}
			return known->second;
		};

		size_t shard_id = 0;
		while (shard_id < num_shards) {
			m_shards.push_back(std::make_shared<index_builder<data_record>>(db_name, shard_id, rec_to_id));
			shard_id++;
		}

		create_directories();
		read_meta();
	}

	/*
	 * Writes the records to the meta file when the builder goes away.
	 * */
	template<typename data_record>
	sharded_index_builder<data_record>::~sharded_index_builder() {
		this->write_meta();
	}

	/*
	 * Routes the record to shard (key % number of shards).
	 * */
	template<typename data_record>
	void sharded_index_builder<data_record>::add(uint64_t key, const data_record &record) {
		const size_t owning_shard = key % m_shards.size();
		m_shards[owning_shard]->add(key, record);

		// Document counting is disabled here: record.m_value is neither inserted into
		// m_document_counter nor counted in m_document_sizes (the raw non unique document size).
	}

	/*
	 * Calls append() on every shard, one after the other.
	 * */
	template<typename data_record>
	void sharded_index_builder<data_record>::append() {
		for (size_t i = 0; i < m_shards.size(); i++) {
			m_shards[i]->append();
		}
	}

	/*
	 * Merges every shard on a utils::thread_pool_arg with 32 threads. The pool passes an
	 * unordered_map to every task, which is handed on to the shard's merge().
	 * A failing shard does not stop the other shards, but the failure is reported on std::cerr
	 * instead of being dropped silently.
	 * */
	template<typename data_record>
	void sharded_index_builder<data_record>::merge() {

		utils::thread_pool_arg<std::unordered_map<uint64_t, uint32_t>> pool(32);

		for (size_t i = 0; i < m_shards.size(); i++) {
			pool.enqueue([this, i](std::unordered_map<uint64_t, uint32_t> &internal_id_map) {
				try {
					m_shards[i]->merge(internal_id_map);
				} catch (const std::exception &error) {
					std::cerr << "merge failed for shard " << i << " of " << m_db_name << ": " << error.what() << std::endl;
				} catch (...) {
					std::cerr << "merge failed for shard " << i << " of " << m_db_name << ": unknown error" << std::endl;
				}
			});
		}
		pool.run_all();
	}

	/*
	 * Merges only the shard at position 'id' in m_shards. The position is not range checked.
	 * */
	template<typename data_record>
	void sharded_index_builder<data_record>::merge_one(size_t id) {
		auto &shard = m_shards[id];
		shard->merge();
	}

	/*
	 * Sorts the records, but only when needs_optimization() reports that they are out of order.
	 * */
	template<typename data_record>
	void sharded_index_builder<data_record>::optimize() {
		if (!needs_optimization()) {
			return;
		}
		sort_records();
	}

	/*
	 * Currently does nothing.
	 * */
	template<typename data_record>
	void sharded_index_builder<data_record>::calculate_scores(algorithm algo) {

		// The score calculation is switched off, so the argument is unused. The disabled code built
		// a score_builder from num_documents() and m_document_sizes and passed it together with
		// 'algo' to calculate_scores() of every shard.
		static_cast<void>(algo);
	}

	/*
	 * Truncates every shard, empties the meta file and drops the in-memory records and id map.
	 * */
	template<typename data_record>
	void sharded_index_builder<data_record>::truncate() {
		for (size_t i = 0; i < m_shards.size(); i++) {
			m_shards[i]->truncate();
		}

		// Opening with std::ios::trunc is all that is needed to empty the meta file.
		std::ofstream meta_file(filename(), std::ios::trunc);

		// Swapping with empty temporaries replaces both containers with fresh ones.
		std::vector<data_record>().swap(m_records);
		std::map<uint64_t, uint32_t>().swap(m_record_id_map);
	}

	/*
	 * Calls truncate_cache_files() on every shard.
	 * */
	template<typename data_record>
	void sharded_index_builder<data_record>::truncate_cache_files() {
		for (size_t i = 0; i < m_shards.size(); i++) {
			m_shards[i]->truncate_cache_files();
		}
	}

	/*
	 * Calls create_directories() on every shard.
	 * */
	template<typename data_record>
	void sharded_index_builder<data_record>::create_directories() {
		for (size_t i = 0; i < m_shards.size(); i++) {
			m_shards[i]->create_directories();
		}
	}

	/*
	 * Loads the records written by write_meta(), if the meta file exists:
	 * [number of records][raw data_record ...]
	 * Every loaded record gets the next internal id (its position in m_records).
	 *
	 * The record count starts at 0 and every read is checked, so an empty or truncated meta file
	 * (truncate() leaves an empty one) yields only the records that were read completely.
	 * */
	template<typename data_record>
	void sharded_index_builder<data_record>::read_meta() {
		std::ifstream meta_file(filename(), std::ios::binary);

		if (!meta_file.is_open()) {
			return;
		}

		// Read records.
		size_t num_records = 0;
		if (!meta_file.read((char *)(&num_records), sizeof(size_t))) {
			return;
		}

		for (size_t i = 0; i < num_records; i++) {
			data_record rec;
			if (!meta_file.read((char *)(&rec), sizeof(data_record))) {
				// The file ended before the announced number of records was read.
				break;
			}

			m_record_id_map[rec.m_value] = m_records.size();
			m_records.push_back(rec);
		}
	}

	/*
	 * Rewrites the meta file from scratch: a size_t record count followed by the raw bytes of every
	 * record in m_records. Does nothing if the file cannot be opened.
	 * */
	template<typename data_record>
	void sharded_index_builder<data_record>::write_meta() {
		std::ofstream meta_file(filename(), std::ios::binary | std::ios::trunc);

		if (!meta_file.is_open()) {
			return;
		}

		// Header with the number of records that follow.
		const size_t record_count = m_records.size();
		meta_file.write(reinterpret_cast<const char *>(&record_count), sizeof(size_t));

		for (size_t i = 0; i < record_count; i++) {
			meta_file.write(reinterpret_cast<const char *>(&m_records[i]), sizeof(data_record));
		}
	}

	/*
	 * Path of the meta file for this index: <data path>/0/full_text/<db name>.meta
	 * read_meta() and write_meta() store the record list in this file.
	 * */
	template<typename data_record>
	std::string sharded_index_builder<data_record>::filename() const {
		std::string path = config::data_path();
		path += "/0/full_text/";
		path += m_db_name;
		path += ".meta";
		return path;
	}

	/*
	 * Returns true if m_records is not sorted by data_record::storage_order.
	 *
	 * A pair of neighbours is only out of order when the later record orders strictly before the
	 * earlier one. Neighbours that compare equal are fine, sorting could never separate them, so
	 * they must not trigger a re-sort on every call.
	 * */
	template<typename data_record>
	bool sharded_index_builder<data_record>::needs_optimization() const {
		typename data_record::storage_order ordered;

		// With zero or one record the loop body never runs.
		for (size_t i = 1; i < m_records.size(); i++) {
			if (ordered(m_records[i], m_records[i - 1])) {
				return true;
			}
		}
		return false;
	}

	/*
	 * Sorts m_records by data_record::storage_order and rewrites the record positions stored in
	 * the shards so they keep pointing at the same records.
	 *
	 * Steps:
	 *  1. Sort a permutation of positions (new position -> old position).
	 *  2. Invert it (old position -> new position) and let every shard transform its stored
	 *     positions through the inverse, on a pool created with the argument 32.
	 *  3. Move the records into their new positions using the very same permutation. Sorting
	 *     m_records a second time is not guaranteed to place records that compare equal in the
	 *     order the shards were just remapped to.
	 *  4. Refresh m_record_id_map if it is in use, since it maps m_value to a position in m_records.
	 * */
	template<typename data_record>
	void sharded_index_builder<data_record>::sort_records() {
		std::vector<uint32_t> permutation(m_records.size());
		std::iota(permutation.begin(), permutation.end(), 0);

		typename data_record::storage_order ordered;

		std::sort(permutation.begin(), permutation.end(), [this, &ordered](uint32_t a, uint32_t b) {
			return ordered(m_records[a], m_records[b]);
		});
		// permutation now points from new position -> old position of record.

		std::vector<uint32_t> inverse(permutation.size());
		for (uint32_t i = 0; i < permutation.size(); i++) {
			inverse[permutation[i]] = i;
		}

		// inverse now points from old position -> new position of record.

		utils::thread_pool pool(32);
		for (size_t i = 0; i < m_shards.size(); i++) {
			pool.enqueue([this, i, &inverse]() {
				m_shards[i]->transform([&inverse](uint32_t v) {
					return inverse[v];
				});
			});
		}
		pool.run_all();

		// Reorder the records with the same permutation the shards were remapped with.
		// Will be saved in meta file upon destruction.
		std::vector<data_record> reordered;
		reordered.reserve(m_records.size());
		for (const uint32_t old_position : permutation) {
			reordered.push_back(m_records[old_position]);
		}
		m_records.swap(reordered);

		// Keep the value -> position map in sync. It is left alone when it is not populated.
		if (!m_record_id_map.empty()) {
			m_record_id_map.clear();
			for (uint32_t i = 0; i < m_records.size(); i++) {
				m_record_id_map[m_records[i].m_value] = i;
			}
		}
	}

	/*
	 * Consistency check printed to stdout: reports every shard whose get_max_id() is not a valid
	 * position in m_records, and finally the largest id seen over all shards.
	 * */
	template<typename data_record>
	void sharded_index_builder<data_record>::check() {
		const size_t record_count = m_records.size();

		std::cout << "num_records: " << record_count << std::endl;

		size_t largest_id = 0;
		for (size_t shard_id = 0; shard_id < m_shards.size(); shard_id++) {
			const size_t shard_max = m_shards[shard_id]->get_max_id();
			if (shard_max >= record_count) {
				std::cout << "found max id: " << shard_max << " but only has " << record_count << " records" << std::endl;
			}
			largest_id = (shard_max > largest_id) ? shard_max : largest_id;
		}
		std::cout << "done, max_id was: " << largest_id << std::endl;
	}

	/*
	 * Calls transform once for every record in m_records, in order. The record is passed by
	 * reference so the callback may modify it in place.
	 * */
	template<typename data_record>
	void sharded_index_builder<data_record>::for_each_record(std::function<void(data_record &)> transform) {
		for (size_t i = 0; i < m_records.size(); i++) {
			transform(m_records[i]);
		}
	}

}


================================================
FILE: src/indexer/url_record.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include "generic_record.h"

namespace indexer {
	#pragma pack(4)
	/*
	 * A generic_record extended with the hash of the domain the url belongs to and a 32 bit meta
	 * field. The lower 16 bits of m_meta hold the url length.
	 *
	 * Every constructor zero-initializes m_meta: url_length(len) reads the previous value, and
	 * sharded_index_builder::write_meta() writes records to disk as raw bytes.
	 * */
	class url_record : public generic_record {

		public:
		uint64_t m_domain_hash;
		uint32_t m_meta;

		url_record() : generic_record(), m_domain_hash(0), m_meta(0) {}
		url_record(uint64_t value) : generic_record(value), m_domain_hash(0), m_meta(0) {}
		url_record(uint64_t value, float score) : generic_record(value, score), m_domain_hash(0), m_meta(0) {}
		url_record(uint64_t value, float score, uint64_t domain_hash)
			: generic_record(value, score), m_domain_hash(domain_hash), m_meta(0) {}

		// Stores len in the lower 16 bits of m_meta.
		// NOTE(review): the previous m_meta is shifted up by 16 bits rather than having its upper
		// half preserved - confirm this is intended if the upper bits ever carry data.
		void url_length(uint16_t len) { m_meta = len | (m_meta << 16); }

		// Returns the lower 16 bits of m_meta.
		uint16_t url_length(void) const { return m_meta & 0xFFFF; }

	};
}


================================================
FILE: src/indexer/value_record.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include <iostream>

namespace indexer {

	#pragma pack(4)
	/*
	 * Smallest record type: a single 64 bit value and no score.
	 * The score taking constructor and operator+= exist to match the interface of the other
	 * record types and ignore their extra input.
	 * */
	class value_record {
		public:
		uint64_t m_value;

		value_record() : m_value(0) {}
		value_record(uint64_t value) : m_value(value) {}
		// The score is accepted for interface compatibility and discarded.
		value_record(uint64_t value, float /*score*/) : m_value(value) {}

		bool operator==(const value_record &b) const {
			return m_value == b.m_value;
		}

		bool operator<(const value_record &b) const {
			return m_value < b.m_value;
		}

		// There is nothing to accumulate for a value_record, the left hand side is returned unchanged.
		value_record &operator+=(const value_record & /*b*/) {
			return *this;
		}

		/*
		 * Will be applied to records before truncating. Top records will be kept.
		 * Orders by descending m_value.
		 * */
		struct truncate_order {
			inline bool operator() (const value_record &a, const value_record &b) const {
				return a.m_value > b.m_value;
			}
		};

		/*
		 * Will be applied before storing on disk. This is the order the records will be returned in.
		 * Orders by ascending m_value.
		 * */
		struct storage_order {
			inline bool operator() (const value_record &a, const value_record &b) const {
				return a.m_value < b.m_value;
			}
		};

		// True if both records have the same m_value.
		bool storage_equal(const value_record &a) const {
			return m_value == a.m_value;
		}

	};
}


================================================
FILE: src/indexer.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <iostream>
#include "config.h"
#include "logger/logger.h"
#include "downloader/warc_downloader.h"
#include "tools/splitter.h"
#include "tools/counter.h"
#include "tools/calculate_harmonic.h"
#include "tools/generate_url_lists.h"
#include "tools/find_links.h"
#include "indexer/index_manager.h"
#include "URL.h"
#include "indexer/console.h"
#include <iostream>
#include <set>
#include "indexer/sharded_index.h"
#include "transfer/transfer.h"

/*
 * Prints the usage text to stdout, one line per option.
 */
void help() {
	static const char *const usage_lines[] = {
		"Usage: ./tools [OPTION]...",
		"--split run splitter",
		"--harmonic-hosts create file /tmp/hosts.txt with hosts for harmonic centrality",
		"--harmonic-links create file /tmp/edges.txt for edges for harmonic centrality",
		"--harmonic calculates harmonic centrality",
	};

	for (const char *usage_line : usage_lines) {
		std::cout << usage_line << std::endl;
	}
}

int main(int argc, const char **argv) {

	//logger::start_logger_thread();
	//logger::verbose(true);

	if (getenv("ALEXANDRIA_CONFIG") != NULL) {
		config::read_config(getenv("ALEXANDRIA_CONFIG"));
	} else {
		config::read_config("/etc/alexandria.conf");
	}

	if (argc < 2) {
		help();
		return 0;
	}

	const std::string arg(argc > 1 ? argv[1] : "");

	if (arg == "--downloader" && argc > 2) {
		downloader::warc_downloader(argv[2]);
	} else if (arg == "--downloader-missing" && argc > 2) {
		downloader::warc_downloader_missing(string(argv[2]));
	} else if (arg == "--split") {
		tools::run_splitter();
	} else if (arg == "--count-overflow-words") {
		indexer::count_words_that_hit_max();
	} else if (arg == "--count") {
		std::cout << "count: " << indexer::count_urls() << std::endl;
	} else if (arg == "--count-domains") {
		tools::run_counter_per_domain(argv[2]);
	} else if (arg == "--make-urls" && argc > 2) {
		tools::generate_url_lists(argv[2]);
	} else if (arg == "--split-make-direct-links") {

		/*
		 * Make direct links by using the url bloom filter.
		 * */
		tools::run_split_direct_links();
	} else if (arg == "--split-build-url-bloom") {

		/*
		 * Make a bloom filter from all urls in the source batches.
		 * */
		tools::run_split_build_url_bloom();
	} else if (arg == "--split-build-direct-link-bloom") {

		/*
		 * Make a bloom filter from all direct links in the source batches.
		 * */
		tools::run_split_build_direct_link_bloom();
	} else if (arg == "--split-with-links") {

		/*
		 * split with links takes all the URL batches and splits them into smaller NODE-{node id} folders
		 * with links means it only takes URLs with direct links in them. this is a major
		 * optimization and makes our target index much much smaller.
		 *
		 * */
		tools::run_split_urls_with_direct_links();
	} else if (arg == "--split-links") {

		/*
		 * split links should run after --split-with-links because it takes all the link batches and splits
		 * them into LINK-{node id} folders but it ONLY takes links with target domain that is present in the
		 * URL files stored in the NODE- folders.
		 *
		 * */
		tools::run_split_links_with_relevant_domains();
	} else if (arg == "--search") {

		/*
		 * split links should run after --split-with-links because it takes all the link batches and splits
		 * them into LINK-{node id} folders but it ONLY takes links with target domain that is present in the
		 * URL files stored in the NODE- folders.
		 *
		 * */
		indexer::index_manager idx_manager;
		auto response = idx_manager.find(argv[2]);

		for (const auto &rec : response) {
			std::cout << rec.m_url << " score " << rec.m_score << std::endl;
		}
	} else if (arg == "--harmonic-hosts") {
		tools::calculate_harmonic_hosts();
	} else if (arg == "--harmonic-links") {
		tools::calculate_harmonic_links();
	} else if (arg == "--harmonic") {
		tools::calculate_harmonic();
	} else if (arg == "--host-hash") {
		URL url(argv[2]);
		cout << url.host_hash() << endl;
	} else if (arg == "--url-hash") {
		URL url(argv[2]);
		cout << url.hash() << endl;
	} else if (arg == "--host-hash-mod") {
		URL url(argv[2]);
		cout << url.host_hash() % stoull(argv[3]) << endl;
	} else if (arg == "--find-links") {
		tools::find_links();
	} else if (arg == "--console") {
		indexer::console();
	} else if (arg == "--index-links") {
		indexer::index_links();
	} else if (arg == "--index-urls") {
		indexer::index_urls();
	} else if (arg == "--make-domain-index") {
		indexer::make_domain_index();
	} else if (arg == "--make-domain-index-scores") {
		indexer::make_domain_index_scores();
	} else if (arg == "--truncate-links") {
		indexer::truncate_links();
	} else if (arg == "--make-url-bloom") {
		indexer::make_url_bloom_filter();
	} else {
		help();
	}

	logger::join_logger_thread();

	return 0;
}


================================================
FILE: src/logger/logger.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "logger.h"
#include <atomic>
#include <chrono>
#include <cstdio>
#include <ctime>
#include <queue>
#include <string>
#include <thread>

using namespace std;

namespace logger {

	// Thread running logger_thread(); assigned by start_logger_thread().
	std::thread m_logger_thread;
	// Guards m_queue (and the reopening of m_file). Allocated by initialize(), freed by de_initialize().
	std::mutex *m_lock = nullptr;
	// Messages waiting to be written to the log file.
	std::queue<std::string> *m_queue = nullptr;
	std::ofstream m_file;
	// Minimum time between two reopens of the log file, see reopen().
	std::chrono::seconds m_reopen_interval = std::chrono::seconds(300);
	std::chrono::system_clock::time_point m_last_reopen;
	// The three flags below are written by one thread and read by another, hence atomic.
	std::atomic<bool> m_verbose{false};
	std::atomic<bool> m_run_logger{true};
	std::atomic<bool> m_logger_started{false};

	void verbose(bool verbose) {
		m_verbose = verbose;
	}

	void initialize() {
		m_lock = new mutex;
		m_queue = new queue<string>;
		m_logger_started = true;
	}

	/*
	 * Frees the lock and the message queue, resets both pointers and marks the logger as stopped.
	 * */
	void de_initialize() {
		delete m_lock;
		m_lock = nullptr;

		delete m_queue;
		m_queue = nullptr;

		m_logger_started = false;
	}

	void reopen() {
		auto now = chrono::system_clock::now();
		m_lock->lock();
		if (now - m_last_reopen > m_reopen_interval) {
			m_last_reopen = now;
			try {
				m_file.close();
			} catch (...) {

			}
			try {
				m_file.open(config::log_file_path, ofstream::out | ofstream::app);
				m_last_reopen = chrono::system_clock::now();
			} catch (exception &error) {
				try {
					m_file.close();
				} catch (...) {
					
				}
				throw error;
			}
		}
		m_lock->unlock();
	}

	/*
	 * Returns the current time in GMT formatted as "YYYY-MM-DD HH:MM:SS" (always 19 characters).
	 * */
	std::string timestamp() {
		const time_t now = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now());
		tm gmt{};
		gmtime_r(&now, &gmt);

		// snprintf never writes past the buffer, whatever the fields of gmt contain.
		char buffer[64];
		snprintf(buffer, sizeof(buffer), "%04d-%02d-%02d %02d:%02d:%02d", gmt.tm_year + 1900, gmt.tm_mon + 1,
			gmt.tm_mday, gmt.tm_hour, gmt.tm_min, gmt.tm_sec);

		std::string formatted(buffer);
		formatted.resize(19);
		return formatted;
	}

	/*
	 * Builds one log line: "<timestamp> [<type>] <file>:<line> <message> <meta>".
	 * The separator before meta is always added, also when meta is empty.
	 * */
	string format(const string &type, const string &file, int line, const string &message, const string &meta) {
		const string location = file + ":" + to_string(line);
		return timestamp() + " [" + type + "]" + " " + location + " " + message + " " + meta;
	}

	void log_message(const string &type, const string &file, int line, const string &message, const string &meta) {
		log_string(format(type, file, line, message, meta));
	}

	void log_string(const string &message) {
		if (!m_logger_started || m_lock == nullptr || m_queue == nullptr) return; // logger thread not started.
		m_lock->lock();
		if (m_verbose) cout << message << endl;
		m_queue->push(message);
		m_lock->unlock();
	}

	void log(const string &type, const string &file, int line, const string &message) {
		log_message(type, file, line, message, "");
	}

	/*
	 * Writes one line to the log file and flushes it.
	 * */
	void write_message_to_logfile(const string &message) {
		m_file << message;
		m_file << endl;
	}

	void logger_thread() {
		initialize();
		reopen();
		while (true) {
			while (m_queue->empty() && m_run_logger) {
				std::this_thread::sleep_for(std::chrono::milliseconds(50));
			}

			if (m_queue->empty()) break;

			m_lock->lock();
			string message = m_queue->front();
			m_queue->pop();
			m_lock->unlock();

			write_message_to_logfile(message);
		}

		de_initialize();
	}

	void start_logger_thread() {
		if (!m_logger_started) {
			m_run_logger = true;
			m_logger_thread = thread(logger_thread);
		}

		// Wait for logger thread to start.
		for (size_t i = 0; i < 20 && !m_logger_started; i++) {
			this_thread::sleep_for(1ms);
		}
	}

	/*
	 * Asks the logger thread to stop and waits for it to finish writing the queued messages.
	 * Safe to call when the thread was never started or has already been joined.
	 *
	 * joinable() is used rather than m_logger_started: the thread object may already be running
	 * although the thread has not yet marked the logger as started, and a thread object that is
	 * still joinable when it is destroyed terminates the program.
	 * */
	void join_logger_thread() {
		if (m_logger_thread.joinable()) {
			m_run_logger = false;
			m_logger_thread.join();
			m_verbose = false;
		}
	}

	/*
	 * Sleeps for 100 ms. It does not check whether the queue has actually been written.
	 * */
	void sync() {
		const std::chrono::milliseconds pause(100);
		std::this_thread::sleep_for(pause);
	}

	/*
	 * Stores message, file and line and prepares the text returned by what(): a log line of
	 * type "EXCEPTION" built with format().
	 * */
	logged_exception::logged_exception(const string &message, const string &file, int line)
	: m_message(message), m_file(file), m_line(line),
	  m_formatted_message(format("EXCEPTION", file, line, message, ""))
	{
	}
}


================================================
FILE: src/logger/logger.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include "config.h"
#include <mutex>
#include <fstream>
#include <iostream>

// Log msg with type "info" / "error", tagged with the file and line of the call site.
#define LOG_INFO(msg) (logger::log("info", __FILE__, __LINE__, msg))
#define LOG_ERROR(msg) (logger::log("error", __FILE__, __LINE__, msg))

// Evaluates to a logger::logged_exception carrying msg and the call site. The macro only
// constructs the object, the caller decides whether to throw it.
#define LOG_ERROR_EXCEPTION(msg) (logger::logged_exception(msg, std::string(__FILE__), __LINE__))

namespace logger {

	// Enables or disables echoing of every logged message to stdout.
	void verbose(bool verbose);
	// Reopens the log file if the reopen interval has passed.
	void reopen();
	// Current GMT time formatted as "YYYY-MM-DD HH:MM:SS".
	std::string timestamp();
	// Formats a log line from the parts and queues it.
	void log_message(const std::string &type, const std::string &file, int line, const std::string &message, const std::string &meta);
	// Queues an already formatted line. Dropped if the logger thread is not running.
	void log_string(const std::string &message);

	// Should be called like this: logger::log("error", __FILE__, __LINE__, error.what());
	void log(const std::string &type, const std::string &file, int line, const std::string &message);
	// NOTE(review): logger.cpp defines only the four argument overload; log_message() takes the
	// same parameters as this one - confirm this declaration is still needed.
	void log(const std::string &type, const std::string &file, int line, const std::string &message, const std::string &meta);

	void start_logger_thread();
	void join_logger_thread();
	// Sleeps briefly to give the logger thread time to write queued messages.
	void sync();

	/*
	 * Exception whose what() is a complete log line (timestamp, type "EXCEPTION", file:line and
	 * message). Usually created through the LOG_ERROR_EXCEPTION macro.
	 * */
	class logged_exception : public std::exception {

		public:
			logged_exception(const std::string &message, const std::string &file, int line);

			// The returned pointer is valid for as long as this exception object lives.
			const char *what() const noexcept override {
				return m_formatted_message.c_str();
			}

		private:

			std::string m_message;
			std::string m_file;
			int m_line;
			std::string m_formatted_message;

	};

}


================================================
FILE: src/memory/debugger.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "debugger.h"
#include "memory.h"
#include "logger/logger.h"
#include <iostream>
#include <cstdlib>
#include <array>

using namespace std;

/*
	This memory manager exists so that we can know exactly how many bytes we currently have allocated. Since the OS
	is running a virtual memory system we can only know exactly how many bytes we have allocated right now if we
	keep a counter ourselves.

	To do this we overload the global new, new[], delete and delete[] operators.

	The problem is knowing how much memory is freed, we only have the pointer and not the length. To fix this issue
	we allocate sizeof(size_t) bytes more when allocating memory, then we store the length there and return a pointer
	to the address at offset "sizeof(size_t)" from the allocated pointer.

	This seems like absolute madness at first but I don't have any other solution.
*/

#include "sys/types.h"
#include "sys/sysinfo.h"

namespace memory {

	// Number of bytes currently handed out. Adjusted through incr_mem_counter / decr_mem_counter.
	// Constant-initialized, so it is usable before any dynamic initialization has run.
	std::atomic_size_t mem_counter{0};
	// NOTE(review): nothing in this file modifies ptr_counter, so num_allocated() returns 0 here.
	size_t ptr_counter = 0;
	size_t total_memory_on_host = 0;

	// Adds n bytes to the allocation counter.
	void incr_mem_counter(size_t n) {
		mem_counter += n;
	}

	// Subtracts n bytes from the allocation counter.
	void decr_mem_counter(size_t n) {
		mem_counter -= n;
	}

	// Returns the number of bytes currently counted as allocated.
	size_t allocated_memory() {
		return mem_counter;
	}

	// Returns the value of the pointer counter.
	size_t num_allocated() {
		return ptr_counter;
	}

	// Allocation counter at the last reset_usage().
	size_t record_usage_base = 0;
	// Highest allocation counter seen by record_usage() since the last reset_usage().
	size_t record_usage_peak = 0;
	// Highest get_usage() seen by record_usage() over all measurements.
	size_t global_usage_peak = 0;

	// Starts a new measurement at the current allocation counter.
	void reset_usage() {
		record_usage_base = allocated_memory();
		record_usage_peak = record_usage_base;
	}

	// Returns the peak number of bytes allocated on top of the base of the current measurement.
	size_t get_usage() {
		return record_usage_peak - record_usage_base;
	}

	// Samples the allocation counter and raises the peaks if it is a new maximum.
	void record_usage() {
		// Read the counter once: other threads change it concurrently, and a second read could
		// return a lower value than the one that was just compared against the peak.
		const size_t current = allocated_memory();
		if (record_usage_peak < current) {
			record_usage_peak = current;
		}

		const size_t usage = get_usage();
		if (global_usage_peak < usage) {
			global_usage_peak = usage;
		}
	}

	// Returns the highest usage recorded over all measurements.
	size_t get_usage_peak() {
		return global_usage_peak;
	}

}


================================================
FILE: src/memory/debugger.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include <iostream>
#include <atomic>

namespace memory {

	// Add / subtract n bytes to / from the counter of allocated bytes.
	// Called by the global operator new / delete overloads in overload.cpp.
	void incr_mem_counter(size_t n);
	void decr_mem_counter(size_t n);
	size_t allocated_memory(); // Returns number of allocated bytes.
	// NOTE(review): nothing visible in debugger.cpp or overload.cpp updates the pointer counter.
	size_t num_allocated(); // Returns number of allocated pointers.

	// Starts a new usage measurement at the current number of allocated bytes.
	void reset_usage();
	// Samples the number of allocated bytes and updates the recorded peaks.
	void record_usage();
	// Peak number of bytes allocated above the base since the last reset_usage().
	size_t get_usage();
	// Highest get_usage() seen by record_usage() so far.
	size_t get_usage_peak();

}


================================================
FILE: src/memory/memory.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "memory.h"
#include <unistd.h>
#include <iostream>
#include <fstream>

namespace memory {

	// Snapshot of the values below, all in bytes. They only change when update() is called.
	size_t available_memory = 0;
	size_t used_memory = 0;
	size_t total_memory = 0;

	// Value of MemAvailable at the last update(), in bytes.
	size_t get_available_memory() {
		return available_memory;
	}

	// Virtual memory size (vsize) of this process at the last update(), in bytes.
	size_t get_used_memory() {
		return used_memory;
	}

	// Value of MemTotal at the last update(), in bytes.
	size_t get_total_memory() {
		return total_memory;
	}

	/*
	 * Refreshes the snapshot from /proc/meminfo and /proc/<pid>/stat. Values whose file cannot
	 * be opened keep their previous value.
	 *
	 * inspired by https://stackoverflow.com/questions/349889/how-do-you-determine-the-amount-of-linux-system-ram-in-c
	 * */
	void update() {

		{
			// The sizes in /proc/meminfo are labelled "kB" but are counted in units of 1024 bytes.
			const size_t bytes_per_unit = 1024;

			std::string token;
			std::ifstream infile("/proc/meminfo", std::ios::in);
			if (infile.is_open()) {
				while (infile >> token) {
					if (token == "MemAvailable:") {
						size_t mem;
						if (infile >> mem) {
							available_memory = mem * bytes_per_unit;
						} else {
							available_memory = 0;
						}
					}
					if (token == "MemTotal:") {
						size_t mem;
						if (infile >> mem) {
							total_memory = mem * bytes_per_unit;
						} else {
							total_memory = 0;
						}
					}
				}
			}
		}

		{
			const size_t pid = getpid();
			std::ifstream infile("/proc/" + std::to_string(pid) + "/stat", std::ios::in);
			std::string line;
			if (infile.is_open() && std::getline(infile, line)) {

				// Field 2 is the command name in parentheses and may itself contain spaces, so
				// fields are counted from the last ')' instead of from the start of the line.
				size_t pos = line.rfind(')');
				if (pos != std::string::npos) {
					pos++;

					// The first token after ')' is field 3, vsize is field 23.
					for (size_t field = 3; field <= 23; field++) {
						const size_t start = line.find_first_not_of(' ', pos);
						if (start == std::string::npos) break;

						size_t end = line.find(' ', start);
						if (end == std::string::npos) end = line.size();

						if (field == 23) {
							used_memory = std::stoull(line.substr(start, end - start));
						}
						pos = end;
					}
				}
			}
		}
	}

}


================================================
FILE: src/memory/memory.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include <iostream>

namespace memory {

	/*
		Returns the number of bytes of available memory on the system. So this is how much our virtual memory can expand.
		This is the value read by the last call to update(), 0 before the first call.
	*/
	size_t get_available_memory();

	/*
		Returns the size in bytes of our virtual memory (vsize)
		same as vsize in /proc/[pid]/stat here: https://man7.org/linux/man-pages/man5/proc.5.html
		This is the value read by the last call to update(), 0 before the first call.
	*/
	size_t get_used_memory();

	/*
		Returns the total number of bytes in the system RAM.
		This is the value read by the last call to update(), 0 before the first call.
	*/
	size_t get_total_memory();

	/*
		Re-reads /proc/meminfo and /proc/[pid]/stat and refreshes the values returned by the getters above.
	*/
	void update();

}


================================================
FILE: src/memory/overload.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "debugger.h"
#include <new>

using namespace std;

/*
	Overload the global new, new[], delete and delete[] operators.
*/
// https://en.cppreference.com/w/cpp/memory/new/operator_new

/*
	Replacement for the global operator new that keeps track of the number of allocated bytes.

	Every allocation is prefixed with a size_t header holding the requested size, so that the matching
	operator delete can hand the same number to memory::decr_mem_counter. The returned pointer points
	just past that header.

	Throws bad_alloc when the request cannot be satisfied.
*/
void *operator new(size_t n) {

	// n + sizeof(size_t) must not wrap around, otherwise malloc would be asked for a tiny block while the
	// caller believes it owns n bytes.
	if (n > static_cast<size_t>(-1) - sizeof(size_t)) {
		throw bad_alloc();
	}

	void *m = malloc(n + sizeof(size_t));

	if (m) {
		memory::incr_mem_counter(n);

		// Store the size in the header and return the address right after it.
		static_cast<size_t *>(m)[0] = n;
		return &(static_cast<size_t *>(m)[1]);
	}

	throw bad_alloc();
}

/*
	Array form of the tracking operator new. Uses the same layout as the scalar form: a size_t header with the
	requested size followed by the memory handed to the caller.

	Throws bad_alloc when the request cannot be satisfied.
*/
void *operator new[](size_t n) {

	// n + sizeof(size_t) must not wrap around, otherwise malloc would be asked for a tiny block while the
	// caller believes it owns n bytes.
	if (n > static_cast<size_t>(-1) - sizeof(size_t)) {
		throw bad_alloc();
	}

	void *m = malloc(n + sizeof(size_t));

	if (m) {
		memory::incr_mem_counter(n);

		// Store the size in the header and return the address right after it.
		static_cast<size_t *>(m)[0] = n;
		return &(static_cast<size_t *>(m)[1]);
	}

	throw bad_alloc();
}

/*
	Replacement for the global operator delete. Reads the size header stored in front of the block by the
	tracking operator new, subtracts that size from the memory counter and frees the underlying malloc block.
*/
void operator delete(void *p) noexcept {

	// Deleting a null pointer must be a no-op; there is no size header to read in that case.
	if (p == nullptr) return;

	size_t *header = static_cast<size_t *>(p) - 1;
	const size_t n = *header;

	memory::decr_mem_counter(n);

	free(header);
}

/*
	Array form of the tracking operator delete. Reads the size header stored in front of the block by the
	tracking operator new[], subtracts that size from the memory counter and frees the underlying malloc block.
*/
void operator delete[](void *p) noexcept {

	// Deleting a null pointer must be a no-op; there is no size header to read in that case.
	if (p == nullptr) return;

	size_t *header = static_cast<size_t *>(p) - 1;
	const size_t n = *header;

	memory::decr_mem_counter(n);

	free(header);
}


================================================
FILE: src/parser/cc_parser.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

// cc_parser.cpp
#include "config.h"
#include "warc/warc.h"
#include "common/ThreadPool.h"
#include "logger/logger.h"
#include "text/text.h"
#include "transfer/transfer.h"
#include <iostream>

using namespace std;

namespace parser {

	/*
		Downloads the warc file at warc_path from the Common Crawl bucket, feeds every downloaded chunk to a
		warc::parser and uploads both the parsed result and the link result as gz files.
		A failure of either upload is logged.
	*/
	void run_downloader(const string &warc_path) {

		warc::parser pp;
		warc::multipart_download("http://commoncrawl.s3.amazonaws.com/" + warc_path, [&pp](const string &chunk) {
			stringstream ss(chunk);
			pp.parse_stream(ss);
		});

		LOG_INFO("uploading: " + warc_path);

		// Keep the two status codes apart so a failed first upload is not hidden by a successful second one.
		const int result_error = transfer::upload_gz_file(warc::get_result_path(warc_path), pp.result());
		const int link_error = transfer::upload_gz_file(warc::get_link_result_path(warc_path), pp.link_result());

		if (result_error || link_error) {
			LOG_INFO("error uploading: " + warc_path);
		}
	}

	void start_downloaders(const vector<string> &warc_paths) {
		const size_t num_threads = 48;
		ThreadPool pool(num_threads);
		vector<future<void>> results;

		for (const string &warc_path : warc_paths) {
			results.emplace_back(pool.enqueue([warc_path, num_threads] {
				sleep(rand() % (num_threads * 2));
				run_downloader(warc_path);
			}));
		}

		for(auto &&result: results) {
			result.get();
		}
	}

	/*
		Fetches "nodes/<config::node>/warc.paths" and returns its lines, trimmed and with empty lines removed.
		Returns an empty vector when the transfer reports an error.
	*/
	vector<string> download_warc_paths() {
		// Initialized so the check below never reads an indeterminate value.
		int error = transfer::OK;
		string content = transfer::file_to_string("nodes/" + config::node + "/warc.paths", error);
		if (error == transfer::ERROR) return {};

		content = text::trim(content);

		vector<string> raw_warc_paths;
		boost::algorithm::split(raw_warc_paths, content, boost::is_any_of("\n"));

		vector<string> warc_paths;
		for (const string &raw_path : raw_warc_paths) {
			// Trim once and reuse the result for both the emptiness check and the stored value.
			string trimmed = text::trim(raw_path);
			if (!trimmed.empty()) {
				warc_paths.push_back(std::move(trimmed));
			}
		}

		return warc_paths;
	}

	bool upload_warc_paths(const vector<string> &warc_paths) {
		string content = boost::algorithm::join(warc_paths, "\n");
		int error = transfer::upload_file("nodes/" + config::node + "/warc.paths", content);
		return error == transfer::OK;
	}

	void warc_downloader() {

		const size_t timeout = 300;
		const size_t limit = 500;

		// main loop
		while (true) {

			// Check if there are any urls to digest every 'timeout' minutes.
			vector<string> warc_paths = download_warc_paths();

			if (warc_paths.size() > 0) {
				// Digest 'limit' number of warc paths.
				vector<string> warc_paths_to_download;
				while (warc_paths_to_download.size() < limit && warc_paths.size() > 0) {
					warc_paths_to_download.push_back(warc_paths.back());
					warc_paths.pop_back();
				}

				if (upload_warc_paths(warc_paths)) {
					start_downloaders(warc_paths_to_download);
				} else {
					LOG_INFO("Fatal, could not upload warc paths, will not download");
				}
			}

			sleep(timeout);
		}
	}
}


================================================
FILE: src/parser/cc_parser.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include <iostream>
#include <vector>

namespace parser {

	/*
		Fetches this node's list of warc paths ("nodes/<config::node>/warc.paths") and returns its non-empty,
		trimmed lines. Returns an empty vector when the file could not be fetched.
	*/
	std::vector<std::string> download_warc_paths();

	/*
		Uploads the given paths, joined by newlines, to "nodes/<config::node>/warc.paths".
		Returns true when the upload reported success.
	*/
	bool upload_warc_paths(const std::vector<std::string> &warc_paths);

	/*
		Polling loop that repeatedly fetches the list of warc paths and downloads them in batches.
		The loop has no exit condition, so this function does not return.
	*/
	void warc_downloader();
}


================================================
FILE: src/parser/entities.cpp
================================================
/*	Copyright 2012, 2016 Christoph Gärtner
	Distributed under the Boost Software License, Version 1.0
*/

#include "entities.h"

#include <errno.h>
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>

using namespace std;

#define UNICODE_MAX 0x10FFFFul

/*	Lookup table of named HTML entities: { name including the trailing ';', replacement text }.

	The table is searched with bsearch() and a strncmp() based comparator, so the names
	MUST stay sorted in byte order (uppercase letters sort before lowercase, digits before ';').
	Keep that order when adding entries.
*/
static const char *const NAMED_ENTITIES[][2] = {
	{ "AElig;", "Æ" },
	{ "Aacute;", "Á" },
	{ "Acirc;", "Â" },
	{ "Agrave;", "À" },
	{ "Alpha;", "Α" },
	{ "Aring;", "Å" },
	{ "Atilde;", "Ã" },
	{ "Auml;", "Ä" },
	{ "Beta;", "Β" },
	{ "Ccedil;", "Ç" },
	{ "Chi;", "Χ" },
	{ "Dagger;", "‡" },
	{ "Delta;", "Δ" },
	{ "ETH;", "Ð" },
	{ "Eacute;", "É" },
	{ "Ecirc;", "Ê" },
	{ "Egrave;", "È" },
	{ "Epsilon;", "Ε" },
	{ "Eta;", "Η" },
	{ "Euml;", "Ë" },
	{ "Gamma;", "Γ" },
	{ "Iacute;", "Í" },
	{ "Icirc;", "Î" },
	{ "Igrave;", "Ì" },
	{ "Iota;", "Ι" },
	{ "Iuml;", "Ï" },
	{ "Kappa;", "Κ" },
	{ "Lambda;", "Λ" },
	{ "Mu;", "Μ" },
	{ "Ntilde;", "Ñ" },
	{ "Nu;", "Ν" },
	{ "OElig;", "Œ" },
	{ "Oacute;", "Ó" },
	{ "Ocirc;", "Ô" },
	{ "Ograve;", "Ò" },
	{ "Omega;", "Ω" },
	{ "Omicron;", "Ο" },
	{ "Oslash;", "Ø" },
	{ "Otilde;", "Õ" },
	{ "Ouml;", "Ö" },
	{ "Phi;", "Φ" },
	{ "Pi;", "Π" },
	{ "Prime;", "″" },
	{ "Psi;", "Ψ" },
	{ "Rho;", "Ρ" },
	{ "Scaron;", "Š" },
	{ "Sigma;", "Σ" },
	{ "THORN;", "Þ" },
	{ "Tau;", "Τ" },
	{ "Theta;", "Θ" },
	{ "Uacute;", "Ú" },
	{ "Ucirc;", "Û" },
	{ "Ugrave;", "Ù" },
	{ "Upsilon;", "Υ" },
	{ "Uuml;", "Ü" },
	{ "Xi;", "Ξ" },
	{ "Yacute;", "Ý" },
	{ "Yuml;", "Ÿ" },
	{ "Zeta;", "Ζ" },
	{ "aacute;", "á" },
	{ "acirc;", "â" },
	{ "acute;", "´" },
	{ "aelig;", "æ" },
	{ "agrave;", "à" },
	{ "alefsym;", "ℵ" },
	{ "alpha;", "α" },
	{ "amp;", "&" },
	{ "and;", "∧" },
	{ "ang;", "∠" },
	{ "apos;", "'" },
	{ "aring;", "å" },
	{ "asymp;", "≈" },
	{ "atilde;", "ã" },
	{ "auml;", "ä" },
	{ "bdquo;", "„" },
	{ "beta;", "β" },
	{ "brvbar;", "¦" },
	{ "bull;", "•" },
	{ "cap;", "∩" },
	{ "ccedil;", "ç" },
	{ "cedil;", "¸" },
	{ "cent;", "¢" },
	{ "chi;", "χ" },
	{ "circ;", "ˆ" },
	{ "clubs;", "♣" },
	{ "cong;", "≅" },
	{ "copy;", "©" },
	{ "crarr;", "↵" },
	{ "cup;", "∪" },
	{ "curren;", "¤" },
	{ "dArr;", "⇓" },
	{ "dagger;", "†" },
	{ "darr;", "↓" },
	{ "deg;", "°" },
	{ "delta;", "δ" },
	{ "diams;", "♦" },
	{ "divide;", "÷" },
	{ "eacute;", "é" },
	{ "ecirc;", "ê" },
	{ "egrave;", "è" },
	{ "empty;", "∅" },
	{ "emsp;", "\xE2\x80\x83" },
	{ "ensp;", "\xE2\x80\x82" },
	{ "epsilon;", "ε" },
	{ "equiv;", "≡" },
	{ "eta;", "η" },
	{ "eth;", "ð" },
	{ "euml;", "ë" },
	{ "euro;", "€" },
	{ "exist;", "∃" },
	{ "fnof;", "ƒ" },
	{ "forall;", "∀" },
	{ "frac12;", "½" },
	{ "frac14;", "¼" },
	{ "frac34;", "¾" },
	{ "frasl;", "⁄" },
	{ "gamma;", "γ" },
	{ "ge;", "≥" },
	{ "gt;", ">" },
	{ "hArr;", "⇔" },
	{ "harr;", "↔" },
	{ "hearts;", "♥" },
	{ "hellip;", "…" },
	{ "iacute;", "í" },
	{ "icirc;", "î" },
	{ "iexcl;", "¡" },
	{ "igrave;", "ì" },
	{ "image;", "ℑ" },
	{ "infin;", "∞" },
	{ "int;", "∫" },
	{ "iota;", "ι" },
	{ "iquest;", "¿" },
	{ "isin;", "∈" },
	{ "iuml;", "ï" },
	{ "kappa;", "κ" },
	{ "lArr;", "⇐" },
	{ "lambda;", "λ" },
	{ "lang;", "〈" },
	{ "laquo;", "«" },
	{ "larr;", "←" },
	{ "lceil;", "⌈" },
	{ "ldquo;", "“" },
	{ "le;", "≤" },
	{ "lfloor;", "⌊" },
	{ "lowast;", "∗" },
	{ "loz;", "◊" },
	{ "lrm;", "\xE2\x80\x8E" },
	{ "lsaquo;", "‹" },
	{ "lsquo;", "‘" },
	{ "lt;", "<" },
	{ "macr;", "¯" },
	{ "mdash;", "—" },
	{ "micro;", "µ" },
	{ "middot;", "·" },
	{ "minus;", "−" },
	{ "mu;", "μ" },
	{ "nabla;", "∇" },
	{ "nbsp;", " " },
	{ "ndash;", "–" },
	{ "ne;", "≠" },
	{ "ni;", "∋" },
	{ "not;", "¬" },
	{ "notin;", "∉" },
	{ "nsub;", "⊄" },
	{ "ntilde;", "ñ" },
	{ "nu;", "ν" },
	{ "oacute;", "ó" },
	{ "ocirc;", "ô" },
	{ "oelig;", "œ" },
	{ "ograve;", "ò" },
	{ "oline;", "‾" },
	{ "omega;", "ω" },
	{ "omicron;", "ο" },
	{ "oplus;", "⊕" },
	{ "or;", "∨" },
	{ "ordf;", "ª" },
	{ "ordm;", "º" },
	{ "oslash;", "ø" },
	{ "otilde;", "õ" },
	{ "otimes;", "⊗" },
	{ "ouml;", "ö" },
	{ "para;", "¶" },
	{ "part;", "∂" },
	{ "permil;", "‰" },
	{ "perp;", "⊥" },
	{ "phi;", "φ" },
	{ "pi;", "π" },
	{ "piv;", "ϖ" },
	{ "plusmn;", "±" },
	{ "pound;", "£" },
	{ "prime;", "′" },
	{ "prod;", "∏" },
	{ "prop;", "∝" },
	{ "psi;", "ψ" },
	{ "quot;", "\"" },
	{ "rArr;", "⇒" },
	{ "radic;", "√" },
	{ "rang;", "〉" },
	{ "raquo;", "»" },
	{ "rarr;", "→" },
	{ "rceil;", "⌉" },
	{ "rdquo;", "”" },
	{ "real;", "ℜ" },
	{ "reg;", "®" },
	{ "rfloor;", "⌋" },
	{ "rho;", "ρ" },
	{ "rlm;", "\xE2\x80\x8F" },
	{ "rsaquo;", "›" },
	{ "rsquo;", "’" },
	{ "sbquo;", "‚" },
	{ "scaron;", "š" },
	{ "sdot;", "⋅" },
	{ "sect;", "§" },
	{ "shy;", "\xC2\xAD" },
	{ "sigma;", "σ" },
	{ "sigmaf;", "ς" },
	{ "sim;", "∼" },
	{ "spades;", "♠" },
	{ "sub;", "⊂" },
	{ "sube;", "⊆" },
	{ "sum;", "∑" },
	{ "sup1;", "¹" },
	{ "sup2;", "²" },
	{ "sup3;", "³" },
	{ "sup;", "⊃" },
	{ "supe;", "⊇" },
	{ "szlig;", "ß" },
	{ "tau;", "τ" },
	{ "there4;", "∴" },
	{ "theta;", "θ" },
	{ "thetasym;", "ϑ" },
	{ "thinsp;", "\xE2\x80\x89" },
	{ "thorn;", "þ" },
	{ "tilde;", "˜" },
	{ "times;", "×" },
	{ "trade;", "™" },
	{ "uArr;", "⇑" },
	{ "uacute;", "ú" },
	{ "uarr;", "↑" },
	{ "ucirc;", "û" },
	{ "ugrave;", "ù" },
	{ "uml;", "¨" },
	{ "upsih;", "ϒ" },
	{ "upsilon;", "υ" },
	{ "uuml;", "ü" },
	{ "weierp;", "℘" },
	{ "xi;", "ξ" },
	{ "yacute;", "ý" },
	{ "yen;", "¥" },
	{ "yuml;", "ÿ" },
	{ "zeta;", "ζ" },
	{ "zwj;", "\xE2\x80\x8D" },
	{ "zwnj;", "\xE2\x80\x8C" }
};

/*	bsearch() comparator: <key> is the text to look up, <value> points at a
	table row whose first string is the entity name. Only the first
	strlen(name) characters of <key> are compared, so the key may continue
	after the entity name.
*/
static int cmp(const void *key, const void *value)
{
	const char *lookup = static_cast<const char *>(key);
	const char *entity_name = *static_cast<const char *const *>(value);

	return strncmp(lookup, entity_name, strlen(entity_name));
}

/*	Looks up the entity whose name (including ';') is a prefix of <name>.
	Returns the replacement text, or NULL when no entity matches.
*/
static const char *get_named_entity(const char *name)
{
	const size_t row_count = sizeof NAMED_ENTITIES / sizeof *NAMED_ENTITIES;
	const size_t row_size = sizeof *NAMED_ENTITIES;

	const void *match = bsearch(name, NAMED_ENTITIES, row_count, row_size, cmp);
	if(!match) return NULL;

	// A row is { name, replacement }.
	return static_cast<const char *const *>(match)[1];
}

/*	Writes the UTF-8 encoding of code point <cp> to <buffer> (no terminator)
	and returns the number of bytes written: 1 to 4, or 0 when <cp> is above
	0x10FFFF, in which case nothing is written.
*/
static size_t putc_utf8(unsigned long cp, char *buffer)
{
	unsigned char *out = reinterpret_cast<unsigned char *>(buffer);

	if(cp < 0x80ul)
	{
		// 0xxxxxxx
		out[0] = static_cast<unsigned char>(cp);
		return 1;
	}

	if(cp < 0x800ul)
	{
		// 110xxxxx 10xxxxxx
		out[0] = static_cast<unsigned char>(0xC0 | (cp >> 6));
		out[1] = static_cast<unsigned char>(0x80 | (cp & 0x3F));
		return 2;
	}

	if(cp < 0x10000ul)
	{
		// 1110xxxx 10xxxxxx 10xxxxxx
		out[0] = static_cast<unsigned char>(0xE0 | (cp >> 12));
		out[1] = static_cast<unsigned char>(0x80 | ((cp >> 6) & 0x3F));
		out[2] = static_cast<unsigned char>(0x80 | (cp & 0x3F));
		return 3;
	}

	if(cp < 0x110000ul)
	{
		// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
		out[0] = static_cast<unsigned char>(0xF0 | (cp >> 18));
		out[1] = static_cast<unsigned char>(0x80 | ((cp >> 12) & 0x3F));
		out[2] = static_cast<unsigned char>(0x80 | ((cp >> 6) & 0x3F));
		out[3] = static_cast<unsigned char>(0x80 | (cp & 0x3F));
		return 4;
	}

	return 0;
}

/*	Tries to decode the entity starting at <current> (which points at '&').

	On success the decoded bytes are written to <*to>, <*to> is advanced past
	them, <*from> is set to the character after the terminating ';' and true
	is returned. On failure nothing is written and false is returned.

	Numeric references without digits ("&#;", "&#x;") and references to code
	point 0 are rejected: decoding them would put a NUL byte in the output
	and cut the resulting C string short.
*/
static bool parse_entity(
	const char *current, char **to, const char **from)
{
	const char *end = strchr(current, ';');
	if(!end) return false;

	if(current[1] == '#')
	{
		const bool hex = current[2] == 'x' || current[2] == 'X';
		const char *digits = current + (hex ? 3 : 2);

		// strtoul reports range errors through errno; preserve the caller's value.
		char *tail = NULL;
		const int errno_save = errno;
		errno = 0;
		const unsigned long cp = strtoul(digits, &tail, hex ? 16 : 10);

		// tail == digits means strtoul consumed nothing, i.e. there was no number at all.
		const bool fail = errno || tail != end || tail == digits
			|| cp == 0 || cp > UNICODE_MAX;
		errno = errno_save;
		if(fail) return false;

		*to += putc_utf8(cp, *to);
		*from = end + 1;

		return true;
	}
	else
	{
		const char *entity = get_named_entity(&current[1]);
		if(!entity) return false;

		const size_t len = strlen(entity);
		memcpy(*to, entity, len);

		*to += len;
		*from = end + 1;

		return true;
	}
}

/*	Copies <src> to <dest> while replacing HTML entities with their UTF-8
	text. When <src> is NULL the string in <dest> is decoded in place.
	An '&' that does not start a recognised entity is copied as is.
	The output is NUL-terminated; the return value is its length.
*/
size_t decode_html_entities_utf8(char *dest, const char *src)
{
	const char *from = src ? src : dest;
	char *to = dest;

	const char *amp = strchr(from, '&');
	while(amp)
	{
		// Copy the literal text in front of the '&'.
		const size_t literal_len = (size_t)(amp - from);
		memmove(to, from, literal_len);
		to += literal_len;

		if(!parse_entity(amp, &to, &from))
		{
			// Not an entity: keep the '&' and continue right after it.
			*to++ = *amp;
			from = amp + 1;
		}

		amp = strchr(from, '&');
	}

	// Copy whatever follows the last '&' and terminate the output.
	const size_t tail_len = strlen(from);
	memmove(to, from, tail_len);
	to += tail_len;
	*to = 0;

	return (size_t)(to - dest);
}


================================================
FILE: src/parser/entities.h
================================================
/*	Copyright 2012 Christoph Gärtner
	Distributed under the Boost Software License, Version 1.0
*/

#ifndef DECODE_HTML_ENTITIES_UTF8_
#define DECODE_HTML_ENTITIES_UTF8_

#include <stddef.h>

extern size_t decode_html_entities_utf8(char *dest, const char *src);
/*	Takes input from <src> and decodes into <dest>, which should be a buffer
	large enough to hold <strlen(src) + 1> characters.

	<src> must be a NUL-terminated string; the decoded string written to
	<dest> is NUL-terminated as well.

	If <src> is <NULL>, input will be taken from <dest>, decoding
	the entities in-place.

	An '&' that does not start a recognised entity is copied unchanged.

	The function returns the length of the decoded string, not counting the
	terminating NUL.
*/

#endif


================================================
FILE: src/parser/html_link.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "html_link.h"

using namespace std;

namespace parser {

	html_link::html_link(const string &host, const string &path, const string &target_host, const string &target_path, bool nofollow,
		const string &text) :
		m_host(host),
		m_path(path),
		m_target_host(target_host),
		m_target_path(target_path),
		m_nofollow(nofollow),
		m_text(text)
	{
		
	}

	/*
		Creates a link without link text. Same as the six argument constructor with an empty text.
	*/
	html_link::html_link(const string &host, const string &path, const string &target_host, const string &target_path, bool nofollow)
	: html_link(host, path, target_host, target_path, nofollow, "")
	{
	}

	// Intentionally empty, there is nothing to release explicitly.
	html_link::~html_link() {}

}


================================================
FILE: src/parser/html_link.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include <string>
#include "URL.h"

namespace parser {

	/*
		A hyperlink found on a page. Stores host and path of the page the link was found on, host and path of
		the link target, the nofollow flag and the link text.
	*/
	class html_link {

		public:
			// Link with link text.
			html_link(const std::string &host, const std::string &path, const std::string &target_host, const std::string &target_path, bool nofollow,
				const std::string &text);
			// Link without link text.
			html_link(const std::string &host, const std::string &path, const std::string &target_host, const std::string &target_path, bool nofollow);
			~html_link();

			// URL of the page containing the link.
			URL source_url() const {
				return URL(m_host, m_path);
			}

			// URL the link points to.
			URL target_url() const {
				return URL(m_target_host, m_target_path);
			}

			std::string host() const {
				return m_host;
			}

			std::string path() const {
				return m_path;
			}

			std::string target_host() const {
				return m_target_host;
			}

			std::string target_path() const {
				return m_target_path;
			}

			bool nofollow() const {
				return m_nofollow;
			}

			std::string text() const {
				return m_text;
			}

		private:
			std::string m_host;
			std::string m_path;
			std::string m_target_host;
			std::string m_target_path;
			bool m_nofollow;
			std::string m_text;

	};

}


================================================
FILE: src/parser/html_parser.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "entities.h"
#include "html_parser.h"
#include "parser.h"
#include "config.h"
#include "text/text.h"
#include <curl/curl.h>

using namespace std;

namespace parser {

	// Names of tags regarded as not carrying page content.
	// NOTE(review): no use of this list is visible in this part of the file; confirm where it is consumed.
	const vector<string> non_content_tags{"script", "noscript", "style", "embed", "label", "form", "input",
		"iframe", "head", "meta", "link", "object", "aside", "channel", "img"};

	/*
		Creates a parser whose scratch buffers hold 100000 bytes each.
	*/
	html_parser::html_parser()
	: html_parser(100000)
	{
	}

	/*
		Creates a parser whose three scratch buffers hold long_text_len bytes each.
	*/
	html_parser::html_parser(size_t long_text_len)
	: m_long_text_len(long_text_len)
	{
		// All scratch buffers share the same capacity.
		const size_t capacity = m_long_text_len;

		m_long_str_buf = std::make_unique<char[]>(capacity);
		m_clean_buff = std::make_unique<char[]>(capacity);
		m_encoding_buffer = std::make_unique<unsigned char []>(capacity);
	}

	// Intentionally empty. The scratch buffers are created with std::make_unique in the constructors and need no
	// explicit cleanup here.
	html_parser::~html_parser() {
	}

	// Parses html without a page url; forwards to parse(html, url) with an empty url.
	void html_parser::parse(const string &html) {
		parse(html, "");
	}

	/*
		Parses a html document that was fetched from url and fills title, h1, meta, text, links and internal links.

		should_insert() is true afterwards only when the encoding is known, the cleaned title is non-empty, not
		longer than HTML_PARSER_MAX_TITLE_LEN and not in an exotic language, and the cleaned h1 is not longer than
		HTML_PARSER_MAX_H1_LEN.
	*/
	void html_parser::parse(const string &html, const string &url) {

		m_should_insert = false;

		// parse_url leaves its outputs untouched when the url can't be parsed (for example the empty url passed
		// by parse(html)). Reset them first so host and path of a previously parsed page are not kept.
		m_host.clear();
		m_path.clear();
		parse_url(url, m_host, m_path, "");

		m_title.clear();
		m_h1.clear();
		m_meta.clear();
		m_text.clear();
		m_invisible_pos.clear();
		m_links.clear();
		m_internal_links.clear();

		parse_encoding(html);
		if (m_encoding == ENC_UNKNOWN) {
			m_should_insert = false;
			return;
		}

		// Locate script and style blocks first, the extraction below skips them.
		find_scripts(html);
		find_styles(html);
		sort_invisible();
		find_links(html, url);

		m_title = get_tag_content(html, "<title", "</title>");
		m_h1 = get_tag_content(html, "<h1", "</h1>");
		m_meta = get_meta_tag(html);
		m_text = get_text_content(html);

		if (m_encoding == ENC_ISO_8859_1) {
			iso_to_utf8(m_title);
			iso_to_utf8(m_h1);
			iso_to_utf8(m_meta);
			iso_to_utf8(m_text);
		}

		// The title decides whether the page is usable at all; the other fields are only cleaned when it is.
		clean_text(m_title);
		if (m_title.size() == 0 || is_exotic_language(m_title) || m_title.size() > HTML_PARSER_MAX_TITLE_LEN) return;
		m_should_insert = true;

		clean_text(m_h1);
		clean_text(m_meta);
		clean_text(m_text);

		if (m_h1.size() > HTML_PARSER_MAX_H1_LEN) {
			m_should_insert = false;
			return;
		}
	}

	/*
		Records the [start, end) position of every <script ...>...</script> block in m_invisible_pos.
		The search stops at the first position where no complete block is found.
	*/
	void html_parser::find_scripts(const string &html) {
		pair<size_t, size_t> tag(0, 0);
		while (true) {
			// Continue searching right after the end of the previous block.
			tag = find_tag(html, "<script", "</script>", tag.second);
			if (tag.second == string::npos) {
				break;
			}
			m_invisible_pos.push_back(tag);
		}
	}

	/*
		Records the [start, end) position of every <style ...>...</style> block in m_invisible_pos.
		The search stops at the first position where no complete block is found.
	*/
	void html_parser::find_styles(const string &html) {
		pair<size_t, size_t> tag(0, 0);
		while (true) {
			// Continue searching right after the end of the previous block.
			tag = find_tag(html, "<style", "</style>", tag.second);
			if (tag.second == string::npos) {
				break;
			}
			m_invisible_pos.push_back(tag);
		}
	}

	/*
		Finds every "<a " ... "</a>" element in html and hands it to parse_link together with base_url.
		The search stops at the first position where no complete element is found.
	*/
	void html_parser::find_links(const string &html, const string &base_url) {
		pair<size_t, size_t> tag(0, 0);
		while (true) {
			// Continue searching right after the end of the previous element.
			tag = find_tag(html, "<a ", "</a>", tag.second);
			if (tag.second == string::npos) {
				break;
			}

			parse_link(html.substr(tag.first, tag.second - tag.first), base_url);
		}
	}

	/*
		Parses one anchor element (the text from "<a " up to and including "</a>").

		Links to the host of the current page are stored as a pair of url hashes in m_internal_links unless they
		are nofollow. Links to other hosts are stored in m_links together with their cleaned link text.

		Returns ::parser::OK when the link was handled and ::parser::ERROR when the element has no quoted href,
		the href can't be parsed, or an external link has no text.
	*/
	int html_parser::parse_link(const string &link, const string &base_url) {
		const string href_key = "href=\"";
		const size_t key_len = href_key.size();
		const size_t href_start = link.find(href_key);
		if (href_start == string::npos) return ::parser::ERROR;
		const size_t href_end = link.find("\"", href_start + key_len);
		if (href_end == string::npos) return ::parser::ERROR;
		const string href = link.substr(href_start + key_len, href_end - href_start - key_len);

		const string rel_key = "rel=\"";
		const size_t rel_key_len = rel_key.size();
		const size_t rel_start = link.find(rel_key);
		bool nofollow = false;
		if (rel_start != string::npos) {
			// "rel=" present in string. The value starts right after the rel key, so the closing quote has to be
			// searched from there (using the length of the href key would skip an empty value).
			const size_t rel_value_start = rel_start + rel_key_len;
			const size_t rel_end = link.find("\"", rel_value_start);
			const size_t rel_len = (rel_end == string::npos) ? string::npos : rel_end - rel_value_start;
			const string rel = link.substr(rel_value_start, rel_len);
			if (rel.find("nofollow") != string::npos) nofollow = true;
		}

		string host;
		string path;
		if (parse_url(href, host, path, base_url) != ::parser::OK) return ::parser::ERROR;

		if (host == m_host) {
			// Internal links are only recorded as (source hash, target hash), without link text.
			if (!nofollow) {
				m_internal_links.emplace_back(std::make_pair(URL(m_host, m_path).hash(), URL(host, path).hash()));
			}
			return ::parser::OK;
		}

		// The link text starts after the '>' closing the opening tag. Check the search result before adding one,
		// npos + 1 would wrap around to 0.
		const size_t tag_close = link.find(">", href_end);
		if (tag_close == string::npos) return ::parser::ERROR;
		const size_t content_start = tag_close + 1;
		const size_t content_end = link.find("</a>", content_start);
		string content = link.substr(content_start, content_end - content_start);

		if (m_encoding == ENC_ISO_8859_1) {
			iso_to_utf8(content);
		}
		clean_text(content);

		if (content == "") return ::parser::ERROR;

		m_links.push_back(html_link(m_host, m_path, host, path, nofollow, content));

		return ::parser::OK;
	}

	int html_parser::parse_url(const string &url, string &host, string &path, const string &base_url) {
		CURLU *h = curl_url();
		if (!h) return ::parser::ERROR;

		if (base_url.size()) {
			curl_url_set(h, CURLUPART_URL, base_url.c_str(), 0);
		}

		CURLUcode uc = curl_url_set(h, CURLUPART_URL, url.c_str(), 0);
		if (uc) {
			curl_url_cleanup(h);
			return ::parser::ERROR;
		}

		char *chost;
		uc = curl_url_get(h, CURLUPART_HOST, &chost, 0);
		if (!uc) {
			host = chost;
			remove_www(host);
			curl_free(chost);
		}

		char *cpath;
		uc = curl_url_get(h, CURLUPART_PATH, &cpath, 0);
		if (!uc) {
			if (strnlen(cpath, m_long_text_len) < m_long_text_len) {
				decode_html_entities_utf8(m_clean_buff.get(), cpath);
				path = m_clean_buff.get();
			} else {
				path = cpath;
			}
			curl_free(cpath);
		}

		char *cquery;
		uc = curl_url_get(h, CURLUPART_QUERY, &cquery, 0);
		if (!uc) {
			if (strnlen(cquery, m_long_text_len) < m_long_text_len) {
				decode_html_entities_utf8(m_clean_buff.get(), cquery);
				path += "?" + string(m_clean_buff.get());
			} else {
				path += "?" + string(cquery);
			}
			curl_free(cquery);
		}

		curl_url_cleanup(h);

		return ::parser::OK;
	}

	/*
		Removes a leading "www." from the given string and then trims it with text::trim_inplace.
	*/
	void html_parser::remove_www(string &path) {
		const string prefix = "www.";
		const bool has_prefix = path.compare(0, prefix.size(), prefix) == 0;
		if (has_prefix) {
			path.erase(0, 4);
		}
		text::trim_inplace(path);
	}

	void html_parser::parse_encoding(const string &html) {
		m_encoding = ENC_UTF_8;
		const size_t pos_start = html.find("charset=");
		if (pos_start == string::npos || pos_start > 1024) return;

		string encoding = html.substr(pos_start, 40);
		encoding = text::lower_case(encoding);

		const size_t utf8_start = encoding.find("utf-8");
		const size_t iso88591_start = encoding.find("iso-8859-1");
		if (utf8_start != string::npos) m_encoding = ENC_UTF_8;
		else if (iso88591_start != string::npos) m_encoding = ENC_ISO_8859_1;
		else m_encoding = ENC_UNKNOWN;
	}

	/*
		Converts str in place, treating every byte as one code point: bytes below 0x80 are kept and every other
		byte is replaced by its two byte UTF-8 sequence.
	*/
	void html_parser::iso_to_utf8(string &str) {
		string converted;
		for (const char raw : str) {
			const uint8_t byte = static_cast<uint8_t>(raw);
			if (byte >= 0x80) {
				// 110000xx 10xxxxxx
				converted.push_back(0xc0 | (byte >> 6));
				converted.push_back(0x80 | (byte & 0x3f));
			} else {
				converted.push_back(byte);
			}
		}
		str = converted;
	}

	// Returns a copy of the title extracted by the last call to parse.
	string html_parser::title() const {
		return m_title;
	} 

	// Returns a copy of the meta description extracted by the last call to parse.
	string html_parser::meta() const {
		return m_meta;
	}

	// Returns a copy of the content of the first h1 extracted by the last call to parse.
	string html_parser::h1() const {
		return m_h1;
	}

	// Returns a copy of the text content extracted by the last call to parse.
	string html_parser::text() const {
		return m_text;
	}

	// Returns a copy of the links to other hosts collected by the last call to parse.
	vector<html_link> html_parser::links() const {
		return m_links;
	}

	// Returns a copy of the (source url hash, target url hash) pairs of the links to the page's own host.
	vector<std::pair<uint64_t, uint64_t>> html_parser::internal_links() const {
		return m_internal_links;
	}

	// Returns the flag set by parse that tells whether the page passed its checks.
	bool html_parser::should_insert() const {
		return m_should_insert;
	}

	/*
		Returns the last dot separated part of the host of url, or an empty string when url can't be parsed or
		has no host.
	*/
	string html_parser::url_tld(const string &url) {

		CURLU *handle = curl_url();
		if (handle == nullptr) return "";

		if (curl_url_set(handle, CURLUPART_URL, url.c_str(), 0) != CURLUE_OK) {
			curl_url_cleanup(handle);
			return "";
		}

		string tld;
		char *raw_host = nullptr;
		if (curl_url_get(handle, CURLUPART_HOST, &raw_host, 0) == CURLUE_OK) {
			string host = raw_host;
			curl_free(raw_host);

			vector<string> labels;
			boost::split(labels, host, boost::is_any_of("."));
			if (!labels.empty()) {
				tld = labels.back();
			}
		}

		curl_url_cleanup(handle);

		return tld;
	}

	/*
		Searches html from pos for tag_start followed by tag_end. Returns (position of tag_start, position just
		after tag_end), or (npos, npos) when either of them is not found.
	*/
	inline pair<size_t, size_t> html_parser::find_tag(const string &html, const string &tag_start, const string &tag_end,
		size_t pos) {
		const pair<size_t, size_t> not_found(string::npos, string::npos);

		const size_t open_pos = html.find(tag_start, pos);
		if (open_pos == string::npos) return not_found;

		const size_t close_pos = html.find(tag_end, open_pos);
		if (close_pos == string::npos) return not_found;

		return pair<size_t, size_t>(open_pos, close_pos + tag_end.size());
	}

	/*
		Returns the text between the '>' that closes the first tag_start and the following tag_end.
		Returns an empty string when tag_start is missing or invisible, or when tag_end is not found.
	*/
	string html_parser::get_tag_content(const string &html, const string &tag_start, const string &tag_end) {
		const size_t open_pos = html.find(tag_start);
		if (open_pos == string::npos) return "";
		if (is_invisible(open_pos)) return "";

		// End of the opening tag, the content starts right after it.
		const size_t open_close = html.find(">", open_pos);

		const size_t end_pos = html.find(tag_end, open_close);
		if (end_pos == string::npos) return "";

		return html.substr(open_close + 1, end_pos - open_close - 1);
	}

	/*
		Looks for the first <meta tag that contains description" before its closing '>' and returns the text
		following content= in that tag, without the character directly in front of the closing '>'.
		Returns an empty string when no such tag is found.
	*/
	string html_parser::get_meta_tag(const string &html) {
		const string content_key = "content=";
		size_t meta_pos = 0;

		for (;;) {
			meta_pos = html.find("<meta", meta_pos + 1);
			if (meta_pos == string::npos) break;

			const size_t tag_close = html.find(">", meta_pos);
			const size_t description_pos = html.find("description\"", meta_pos);
			if (description_pos >= tag_close) continue;

			const size_t close_after_description = html.find(">", description_pos);
			const size_t tag_open = html.rfind("<", description_pos);

			const size_t content_pos = html.find(content_key, tag_open);
			if (content_pos == string::npos || content_pos > close_after_description) continue;

			const size_t value_pos = content_pos + content_key.size();
			return html.substr(value_pos, close_after_description - value_pos - 1);
		}

		return "";
	}

	/*
		Cleans str in place: strips tags, decodes html entities, collapses whitespace and trims.
		Strings that don't fit in the decode buffer after tag stripping only get their tags stripped.
	*/
	void html_parser::clean_text(string &str) {
		strip_tags(str);

		if (str.size() < m_long_text_len) {
			char *scratch = m_clean_buff.get();
			decode_html_entities_utf8(scratch, str.c_str());
			str = scratch;

			strip_whitespace(str);
			text::trim_both_inplace(str);
		}
	}

	/*
		Removes everything between '<' and '>' (inclusive) from html in place. Whitespace characters outside of
		tags are replaced by a single space per run.
	*/
	void html_parser::strip_tags(string &html) {
		const size_t len = html.size();
		bool copy = true;
		bool last_was_space = false;
		size_t j = 0;
		for (size_t i = 0; i < len; i++) {
			// Read the input character first, the write position j never gets ahead of i.
			const char c = html[i];
			if (c == '<') copy = false;
			// isspace requires a value representable as unsigned char, a plain char may be negative.
			if (isspace(static_cast<unsigned char>(c))) {
				html[j] = ' ';
				if (copy && !last_was_space) j++;
				last_was_space = true;
			} else {
				html[j] = c;
				if (copy) j++;
				last_was_space = false;
			}
			if (c == '>') copy = true;
		}
		html.resize(j);
	}

	/*
		Replaces every run of whitespace characters in html by a single space, in place.
	*/
	void html_parser::strip_whitespace(string &html) {
		const size_t len = html.size();
		bool last_was_space = false;
		size_t j = 0;
		for (size_t i = 0; i < len; i++) {
			// Read the input character first, the write position j never gets ahead of i.
			const char c = html[i];
			// isspace requires a value representable as unsigned char, a plain char may be negative.
			if (isspace(static_cast<unsigned char>(c))) {
				html[j] = ' ';
				if (!last_was_space) j++;
				last_was_space = true;
			} else {
				html[j] = c;
				j++;
				last_was_space = false;
			}
		}
		html.resize(j);
	}

	/*
	 * This function returns the text content of the html by first trying to fetch content after the first <h1>...</h1> tag. If no h1 tag is present
	 * it tries to fetch content from the start of the <body>
	 * */
	string html_parser::get_text_content(const string &html) {
		size_t pos_start = html.find("</h1>");

		// Start from body if no h1 is present
		if (pos_start == string::npos || is_invisible(pos_start)) {
			pos_start = html.find("<body");
		}
		if (pos_start == string::npos || is_invisible(pos_start)) {
			return "";
		}

		const size_t len = html.size();
		bool copy = true;
		bool last_was_space = false;
		size_t j = 0;

		// The intervals are walked in step with the text, which relies on m_invisible_pos being ordered by start position.
		auto interval = m_invisible_pos.begin();
		const auto invisible_end = m_invisible_pos.end();

		// At most m_long_text_len characters are written to m_long_str_buf.
		for (size_t i = pos_start; i < len && j < m_long_text_len; i++) {
			const char ch = html[i];
			if (ch == '<') {
				// Drop intervals that start before the current position (before pos_start, or inside a region that was
				// already skipped), otherwise the iterator would get stuck and later invisible regions would be copied.
				while (interval != invisible_end && interval->first < i) {
					++interval;
				}
				if (interval != invisible_end && interval->first == i) {
					// Skip the whole invisible tag.
					i = interval->second - 1;
					++interval;
					continue;
				}
				// Insert a space, because we don't want to concatenate words.
				m_long_str_buf[j] = ' ';
				if (copy && !last_was_space) j++;
				last_was_space = true;

				copy = false;
			}
			// j may have reached the limit in the branch above, hence the extra bounds checks before writing.
			// isspace must not be called with a negative value, which plain char can be for non ascii bytes.
			if (isspace(static_cast<unsigned char>(ch))) {
				if (j < m_long_text_len) m_long_str_buf[j] = ' ';
				if (copy && !last_was_space) j++;
				last_was_space = true;
			} else {
				if (j < m_long_text_len) m_long_str_buf[j] = ch;
				if (copy) j++;
				last_was_space = false;
			}
			if (ch == '>') copy = true;
		}

		return string(m_long_str_buf.get(), j);
	}

	/*
	 * Same classification as is_exotic_language, but also prints the string and the three counters to stdout.
	 *
	 * Each character is classified by the number of bytes it occupies: the first byte plus the continuation bytes
	 * (IS_MULTIBYTE_CODEPOINT) that follow it. One byte is "normal", two bytes "seminormal", three or more "exotic".
	 * Returns true if there are more than 5 exotic characters, or if the string has more than 3 characters and more than
	 * half of them are not normal.
	 * */
	bool html_parser::is_exotic_language_debug(const string &str) const {
		const size_t len = str.size();
		const char *cstr = str.c_str();
		int num_exotic = 0;
		int num_normal = 0;
		int num_seminormal = 0;
		for (size_t i = 0; i < len;) {
			int multibyte_len = 1;
			for (size_t j = i + 1; (j < len) && IS_MULTIBYTE_CODEPOINT(cstr[j]); j++) {
				multibyte_len++;
			}

			if (multibyte_len > 2) {
				num_exotic++;
			} else if (multibyte_len == 2) {
				num_seminormal++;
			} else {
				num_normal++;
			}

			i += multibyte_len;
		}

		const int total = (num_seminormal + num_exotic + num_normal);

		cout << str << " exotic: " << num_exotic << " seminormal: " << num_seminormal << " normal: " << num_normal << endl;

		if (num_exotic > 5) return true;
		if (total <= 3) return false;
		if (static_cast<float>(num_seminormal + num_exotic) / static_cast<float>(total) > 0.5) return true;

		return false;
	}

	/*
	 * Heuristic that decides whether str is written with mostly non ascii characters.
	 *
	 * Each character is classified by the number of bytes it occupies: the first byte plus the continuation bytes
	 * (IS_MULTIBYTE_CODEPOINT) that follow it. One byte is "normal", two bytes "seminormal", three or more "exotic".
	 * Returns true if there are more than 5 exotic characters, or if the string has more than 3 characters and more than
	 * half of them are not normal.
	 * */
	bool html_parser::is_exotic_language(const string &str) const {
		const size_t len = str.size();
		const char *cstr = str.c_str();
		int num_exotic = 0;
		int num_normal = 0;
		int num_seminormal = 0;
		for (size_t i = 0; i < len;) {
			int multibyte_len = 1;
			for (size_t j = i + 1; (j < len) && IS_MULTIBYTE_CODEPOINT(cstr[j]); j++) {
				multibyte_len++;
			}

			if (multibyte_len > 2) {
				num_exotic++;
			} else if (multibyte_len == 2) {
				num_seminormal++;
			} else {
				num_normal++;
			}

			i += multibyte_len;
		}

		const int total = (num_seminormal + num_exotic + num_normal);

		if (num_exotic > 5) return true;
		if (total <= 3) return false;
		if (static_cast<float>(num_seminormal + num_exotic) / static_cast<float>(total) > 0.5) return true;

		return false;
	}

	/*
	 * Sorts m_invisible_pos by ascending start position.
	 * The comparator takes the exact element type (pair<size_t, size_t>). A pair<int, int> parameter would build a
	 * narrowed temporary for every comparison and order positions above INT_MAX incorrectly.
	 * */
	void html_parser::sort_invisible() {
		sort(m_invisible_pos.begin(), m_invisible_pos.end(), [](const pair<size_t, size_t> &lhs, const pair<size_t, size_t> &rhs) {
			return lhs.first < rhs.first;
		});
	}

	inline bool html_parser::is_invisible(size_t pos) {
		for (const auto &interval : m_invisible_pos) {
			if (interval.first <= pos && pos < interval.second) return true;
		}
		return false;
	}

}


================================================
FILE: src/parser/html_parser.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include <string>
#include <vector>
#include <map>
#include <iostream>
#include <algorithm>
#include <cctype>
#include <string.h>
#include <memory>
#include <boost/algorithm/string.hpp>

#include "html_link.h"
#include "parser/unicode.h"

// Length limits for the extracted <h1> and <title> texts.
// NOTE(review): presumably counted in bytes; the code applying these limits is outside this view.
#define HTML_PARSER_MAX_H1_LEN 400
#define HTML_PARSER_MAX_TITLE_LEN 400

// Identifiers for the document encoding. html_parser::m_encoding is initialised to ENC_UNKNOWN.
#define ENC_UTF_8 1
#define ENC_ISO_8859_1 2
#define ENC_UNKNOWN -1

namespace parser {

	/*
	 * Extracts title, h1, meta description, text content and links from an html document.
	 * The parser owns several scratch buffers, so one object is meant to be reused between documents.
	 * NOTE(review): nothing visible here synchronises access to the buffers, so sharing one object between threads looks unsafe. Confirm.
	 * */
	class html_parser {

	public:

		html_parser();
		// NOTE(review): presumably long_text_len overrides m_long_text_len; the constructor body is outside this view.
		html_parser(size_t long_text_len);
		~html_parser();

		// Parse a document. The results are available through the accessors below.
		void parse(const std::string &html, const std::string &url);
		void parse(const std::string &html);

		// Accessors for the result of the latest parse.
		std::string title() const;
		std::string meta() const;
		std::string h1() const;
		std::string text() const;
		std::vector<html_link> links() const;
		std::vector<std::pair<uint64_t, uint64_t>> internal_links() const;
		bool should_insert() const;

		// Return top level domain
		std::string url_tld(const std::string &url);
		// Like is_exotic_language but also prints the character counters to stdout.
		bool is_exotic_language_debug(const std::string &str) const;
		// Heuristic based on the share of multibyte characters in str.
		bool is_exotic_language(const std::string &str) const;

	private:

		std::vector<html_link> m_links;
		std::vector<std::pair<uint64_t, uint64_t>> m_internal_links;
		// Half open [start, end) byte intervals of the document that should not be treated as text.
		// get_text_content expects them ordered by start position (see sort_invisible).
		std::vector<std::pair<size_t, size_t>> m_invisible_pos;

		// Upper limit for the number of characters get_text_content writes to m_long_str_buf. clean_text also skips
		// entity decoding for strings of this size or longer.
		const size_t m_long_text_len = 1000;
		// Output buffer for get_text_content.
		std::unique_ptr<char[]> m_long_str_buf;
		// Output buffer for the html entity decoding in clean_text.
		std::unique_ptr<char[]> m_clean_buff;
		std::unique_ptr<unsigned char[]> m_encoding_buffer;
		bool m_should_insert;
		// One of the ENC_* constants.
		int m_encoding = ENC_UNKNOWN;

		std::string m_title;
		std::string m_h1;
		std::string m_meta;
		std::string m_text;

		std::string m_host;
		std::string m_path;

		void find_scripts(const std::string &html);
		void find_styles(const std::string &html);
		void find_links(const std::string &html, const std::string &base_url);

		int parse_link(const std::string &link, const std::string &base_url);
		int parse_url(const std::string &url, std::string &host, std::string &path, const std::string &base_url);
		inline void remove_www(std::string &path);
		void parse_encoding(const std::string &html);
		void iso_to_utf8(std::string &text);

		inline std::pair<size_t, size_t> find_tag(const std::string &html, const std::string &tag_start, const std::string &tag_end,
			size_t pos);
		std::string get_tag_content(const std::string &html, const std::string &tag_start, const std::string &tag_end);
		std::string get_meta_tag(const std::string &html);
		// Strips tags, decodes html entities, collapses whitespace and trims str in place.
		void clean_text(std::string &str);
		// Collapses runs of whitespace into single spaces, in place.
		void strip_whitespace(std::string &html);
		// Removes everything between '<' and '>' and collapses whitespace, in place.
		void strip_tags(std::string &html);
		// Returns the visible text after the first </h1>, or after <body if there is no usable h1.
		std::string get_text_content(const std::string &html);
		// Sorts m_invisible_pos by start position.
		void sort_invisible();
		// True if pos is inside one of the intervals in m_invisible_pos.
		inline bool is_invisible(size_t pos);

	};

}


================================================
FILE: src/parser/parser.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "parser.h"
#include <curl/curl.h>

using namespace std;

namespace parser {

	/*
	 * Returns true if the string at cstr starts with a percent encoded byte: '%' followed by two hexadecimal digits.
	 * cstr has to be NUL terminated. The checks run left to right and stop at the first mismatch, so nothing after the
	 * terminator is read.
	 * */
	bool is_percent_encoding(const char *cstr) {
		// Plain range checks instead of tolower(), which must not be called with a negative char value.
		const auto is_hex_digit = [](char c) {
			return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F');
		};
		return cstr[0] == '%' && is_hex_digit(cstr[1]) && is_hex_digit(cstr[2]);
	}

	/*
	 * Decodes percent encoding in str: "%XX" (two hex digits) becomes the byte with that value and "%%" becomes "%".
	 * Everything else is copied unchanged. The result ends at the first NUL byte it would contain.
	 * */
	string urldecode(const string &str) {
		const size_t len = str.size();
		const char *cstr = str.c_str();

		string decoded;
		decoded.reserve(len);
		for (size_t i = 0; i < len; i++) {
			// Written as i + 2 < len, because len - 2 wraps around for strings shorter than two characters.
			if (i + 2 < len && is_percent_encoding(&cstr[i])) {
				decoded.push_back(static_cast<char>(stoi(string(&cstr[i + 1], 2), nullptr, 16)));
				i += 2;
			} else if (i + 1 < len && cstr[i] == '%' && cstr[i + 1] == '%') {
				decoded.push_back('%');
				i++;
			} else {
				decoded.push_back(cstr[i]);
			}
		}

		// Keep the long standing behaviour of cutting the result at the first NUL byte (for example from "%00").
		const size_t nul_pos = decoded.find('\0');
		if (nul_pos != string::npos) decoded.resize(nul_pos);

		return decoded;
	}

	/*
	 * Percent encodes str with curl_easy_escape. Returns str unchanged if no curl handle could be created or if the
	 * escaping failed.
	 * */
	string urlencode(const string &str) {
		CURL *handle = curl_easy_init();
		if (handle == nullptr) return str;

		char *escaped = curl_easy_escape(handle, str.c_str(), str.size());
		if (escaped == nullptr) {
			curl_easy_cleanup(handle);
			return str;
		}

		string encoded(escaped);
		curl_free(escaped);
		curl_easy_cleanup(handle);
		return encoded;
	}

	/*
	 * Returns the text that follows the first occurrence of key in record, up to the end of that line.
	 * The line ending ("\r\n" or "\n") is not part of the result. Returns an empty string if key is not found, and the
	 * rest of record if there is no newline after key.
	 * */
	std::string get_http_header(const std::string &record, const std::string &key) {
		const size_t pos = record.find(key);
		if (pos == std::string::npos) {
			return "";
		}

		const size_t value_start = pos + key.size();
		size_t value_end = record.find('\n', value_start);
		if (value_end == std::string::npos) {
			return record.substr(value_start);
		}

		// Only drop a carriage return that is really there. Subtracting one unconditionally wraps around for empty
		// values and cuts a character from lines that end with a bare "\n".
		if (value_end > value_start && record[value_end - 1] == '\r') {
			value_end--;
		}

		return record.substr(value_start, value_end - value_start);
	}
}


================================================
FILE: src/parser/parser.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include <iostream>

namespace parser {

	// Status codes.
	// NOTE(review): presumably the return values of the int returning parse functions in html_parser; confirm.
	const int OK = 0;
	const int ERROR = 1;

	// Decodes "%XX" sequences (and "%%") in str.
	std::string urldecode(const std::string &str);
	// Percent encodes str with libcurl. Returns str unchanged if the encoding fails.
	std::string urlencode(const std::string &str);
	// Returns the rest of the line that follows the first occurrence of key in record, or "" if key is missing.
	std::string get_http_header(const std::string &record, const std::string &key);
}


================================================
FILE: src/parser/unicode.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "unicode.h"

using namespace std;

namespace parser {

	/*
	 * Returns a copy of str where bytes that do not fit the UTF-8 patterns described by the IS_UTF8_* macros are
	 * replaced with '?':
	 * - bytes 0x00-0x1f outside a multibyte sequence,
	 * - bytes with the high bit set that are not an accepted start byte,
	 * - multibyte sequences that are cut short, together with the byte that interrupted them.
	 * A sequence that is still incomplete at the end of the string is removed.
	 * */
	std::string unicode::encode(const std::string &str) {

		const size_t len = str.size();

		// Work on a copy owned by std::string and overwrite only the bytes that have to be masked.
		std::string target(str);

		size_t last_unicode = len; // Start position of the multibyte sequence that is currently open.
		size_t utf8_len = 0; // Number of continuation bytes still expected.
		for (size_t i = 0; i < len; i++) {
			const char ch = str[i];
			bool copy = true;
			if (utf8_len == 0) {
				if (IS_UTF8_START_1(ch)) {
					utf8_len = 1;
					last_unicode = i;
				} else if (IS_UTF8_START_2(ch)) {
					utf8_len = 2;
					last_unicode = i;
				} else if (IS_UTF8_START_3(ch)) {
					utf8_len = 3;
					last_unicode = i;
				} else if (IS_UNKNOWN_UTF8_START(ch)) {
					copy = false;
				} else if ('\x00' <= ch && ch <= '\x1f') {
					copy = false;
				}
			} else if (IS_MULTIBYTE_CODEPOINT(ch)) {
				utf8_len--;
			} else {
				// This unicode character has been terminated too soon.
				copy = false;
				for (size_t j = last_unicode; j < i; j++) {
					target[j] = '?';
				}
				utf8_len = 0;
			}
			if (!copy) {
				target[i] = '?';
			}
		}

		if (utf8_len) {
			target.resize(last_unicode);
		}
		return target;
	}

	/*
	 * Returns true if every byte of str fits the UTF-8 patterns described by the IS_UTF8_* macros: each start byte is
	 * followed by the number of continuation bytes it announces and no other byte has the high bit set.
	 * */
	bool unicode::is_valid(const std::string &str) {

		// Number of continuation bytes that still have to follow the latest start byte.
		size_t pending = 0;

		for (const char ch : str) {
			if (pending > 0) {
				// This unicode character has been terminated too soon.
				if (!IS_MULTIBYTE_CODEPOINT(ch)) return false;
				pending--;
				continue;
			}

			if (IS_UTF8_START_1(ch)) {
				pending = 1;
			} else if (IS_UTF8_START_2(ch)) {
				pending = 2;
			} else if (IS_UTF8_START_3(ch)) {
				pending = 3;
			} else if (IS_UNKNOWN_UTF8_START(ch)) {
				return false;
			}
		}

		// A sequence that is still open at the end of the string is invalid.
		return pending == 0;
	}

}


================================================
FILE: src/parser/unicode.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include <iostream>

// Byte classification for UTF-8. The argument is parenthesised and converted to unsigned char in every macro, so
// expressions and negative char values are handled correctly.
// Continuation byte: 10xxxxxx.
#define IS_MULTIBYTE_CODEPOINT(ch) ((((unsigned char)(ch)) >> 7) && !((((unsigned char)(ch)) >> 6) & 0x1))
// Start of a two byte sequence: 110xxxxx with a payload of at least 2 (0xC2-0xDF).
#define IS_UTF8_START_1(ch) ((((unsigned char)(ch)) >> 5) == 0b00000110 && (((unsigned char)(ch)) & 0b00011111) >= 0b00000010)
// Start of a three byte sequence: 1110xxxx.
#define IS_UTF8_START_2(ch) ((((unsigned char)(ch)) >> 4) == 0b00001110)
// Start of a four byte sequence: 11110xxx.
#define IS_UTF8_START_3(ch) ((((unsigned char)(ch)) >> 3) == 0b00011110)
// Any byte with the high bit set. Test the macros above first.
#define IS_UNKNOWN_UTF8_START(ch) (((unsigned char)(ch)) >> 7)

namespace parser {

	// Static helpers for validating and sanitising UTF-8 strings.
	class unicode {

		public:
			
			// Returns a copy of str where invalid bytes are replaced with '?' and an incomplete trailing sequence is removed.
			static std::string encode(const std::string &str);
			// Returns true if str only contains complete multibyte sequences and no stray high bit bytes.
			static bool is_valid(const std::string &str);

	};

}


================================================
FILE: src/profiler/profiler.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "profiler.h"
#include "logger/logger.h"
#include <vector>
#include <map>

using namespace std;

namespace profiler {

	// Total elapsed time in milliseconds per profile name. instance::stop() adds to it and print_report() reads it.
	std::map<std::string, double> profiles_per_name;

	// Reference point for now_micro(), taken when this object is initialised.
	// Spelled with the public clock type instead of the libstdc++ internal std::chrono::_V2 namespace.
	std::chrono::high_resolution_clock::time_point start_time = std::chrono::high_resolution_clock::now();

	// Creates a named profile. The clock starts when the object is constructed.
	instance::instance(const string &name) :
		m_name(name), m_start_time(std::chrono::high_resolution_clock::now())
	{
	}

	// Creates a profile with the name "unnamed profile". The clock starts when the object is constructed.
	instance::instance() :
		instance("unnamed profile")
	{
	}

	// Stops the profile on destruction, unless stop() was already called.
	instance::~instance() {
		if (m_has_stopped) return;
		stop();
	}

	void instance::enable() {
		m_enabled = true;
	}

	// Returns the time since construction in milliseconds, measured with microsecond resolution.
	double instance::get() const {
		const auto now = chrono::high_resolution_clock::now();
		const auto elapsed_us = chrono::duration_cast<std::chrono::microseconds>(now - m_start_time).count();

		return static_cast<double>(elapsed_us) / 1000;
	}

	// Returns the time since construction in microseconds, or 0 if the profile is not enabled.
	double instance::get_micro() const {
		if (m_enabled) {
			const auto now = chrono::high_resolution_clock::now();
			const auto elapsed_us = chrono::duration_cast<std::chrono::microseconds>(now - m_start_time).count();
			return static_cast<double>(elapsed_us);
		}
		return 0;
	}

	/*
	 * Marks the profile as stopped and adds the elapsed time to the total for its name. The time is logged if the
	 * profile is enabled.
	 * */
	void instance::stop() {
		m_has_stopped = true;
		// Read the clock once, so that the logged value is the same as the value added to the total.
		const double elapsed_ms = get();
		profiles_per_name[m_name] += elapsed_ms;
		if (!m_enabled) return;
		LOG_INFO("profiler [" + m_name + "] took " + to_string(elapsed_ms) + "ms");
	}

	// Prints the elapsed time to stdout if the profile is enabled. The profile keeps running.
	void instance::print() {
		if (m_enabled) {
			cout << "profiler [" + m_name + "] took " + to_string(get()) + "ms" << endl;
		}
	}

	void print_memory_status() {
		ifstream infile("/proc/" + to_string(getpid()) + "/status");
		if (infile.is_open()) {
			string line;
			while (getline(infile, line)) {
				LOG_INFO(line);
			}
		}
	}

	// Placeholder. Both arguments are ignored.
	void tick(const std::string &name, const std::string &section) {
		static_cast<void>(name);
		static_cast<void>(section);
	}
	// Declarations only. No definition of these two functions is visible in this file.
	// NOTE(review): calling them will fail at link time unless they are defined elsewhere; confirm.
	void report_reset();
	void report_print();

	// Returns the number of microseconds since start_time was initialised.
	double now_micro() {
		const auto since_start = chrono::high_resolution_clock::now() - start_time;
		return static_cast<double>(chrono::duration_cast<std::chrono::microseconds>(since_start).count());
	}

	// Returns the current system time as whole seconds since the epoch of std::chrono::system_clock.
	size_t timestamp() {
		using namespace std::chrono;
		const auto since_epoch = system_clock::now().time_since_epoch();
		return duration_cast<seconds>(since_epoch).count();
	}

	/*
	 * Prints the accumulated time for every profile name to stdout, together with its share of the total.
	 * */
	void print_report() {

		double total_ms = 0.0;
		for (const auto &iter : profiles_per_name) {
			total_ms += iter.second;
		}

		for (const auto &iter : profiles_per_name) {
			// Report 0% instead of dividing by zero when no time has been recorded.
			const double percent = total_ms > 0.0 ? 100.0 * (iter.second / total_ms) : 0.0;
			cout << iter.first << ": " << iter.second << "ms (" << percent << "%)" << endl;
		}
	}

}


================================================
FILE: src/profiler/profiler.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include <iostream>
#include <chrono>
#include <fstream>
#include <unistd.h>

namespace profiler {

	/*
	 * Stopwatch that starts when it is constructed. stop() adds the elapsed milliseconds to a per name total and logs
	 * them. The destructor calls stop() if it has not been called already.
	 * */
	class instance {

	public:

		explicit instance(const std::string &name);
		instance();
		~instance();

		// Sets m_enabled, which is already true by default.
		void enable();
		// Elapsed time in milliseconds.
		double get() const;
		// Elapsed time in microseconds, or 0 if the profile is not enabled.
		double get_micro() const;
		void stop();
		// Prints the elapsed time to stdout if the profile is enabled.
		void print();

	private:
		std::string m_name;
		bool m_enabled = true;
		bool m_has_stopped = false;
		// Same clock type that is used to take the time, instead of the libstdc++ internal std::chrono::_V2 namespace.
		std::chrono::high_resolution_clock::time_point m_start_time;
	};

	// Logs the content of /proc/<pid>/status for the current process.
	void print_memory_status();

	// Currently a no-op; both arguments are ignored.
	void tick(const std::string &name, const std::string &section);
	// NOTE(review): no definition of report_reset/report_print is visible in profiler.cpp; confirm they exist.
	void report_reset();
	void report_print();
	// Microseconds since the profiler's global start time was initialised.
	double now_micro();
	// Current system time in whole seconds since the epoch.
	size_t timestamp();
	// Prints the accumulated time per profile name to stdout.
	void print_report();

}


================================================
FILE: src/scraper/scraper.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "scraper.h"
#include "parser/html_parser.h"
#include "common/datetime.h"
#include "text/text.h"
#include "logger/logger.h"
#include <memory>

using namespace std;

namespace scraper {

	// Returns the product token of the bot. It is used in the user agent string and for matching robots.txt rules.
	std::string user_agent_token() {
		const std::string token = "AlexandriaOrgBot";
		return token;
	}

	// Returns the complete User-Agent header value sent by the scraper.
	string user_agent() {
		const string ua_version = "1.0";

		string ua = "Mozilla/5.0 (Linux) (compatible; ";
		ua += user_agent_token();
		ua += "/";
		ua += ua_version;
		ua += "; +https://www.alexandria.org/bot.html)";
		return ua;
	}

	// Nothing to set up here.
	scraper_stats::scraper_stats() = default;

	// Asks the reporting loop in run() to stop and waits for its thread to finish.
	scraper_stats::~scraper_stats() {
		m_running = false;
		if (m_thread.joinable()) {
			m_thread.join();
		}
	}

	/*
	 * Takes a snapshot of all scrapers. The counting runs between start_count() and end_count(), which lock and unlock
	 * m_lock.
	 * */
	void scraper_stats::gather_statistics(const map<string, unique_ptr<scraper>> &scrapers, size_t urls_in_queue) {
		start_count(urls_in_queue);
		for (auto it = scrapers.begin(); it != scrapers.end(); ++it) {
			auto &current = *(it->second);
			if (current.finished()) {
				count_finished(current);
			} else {
				count_unfinished(current);
			}
		}
		end_count();
	}

	// Starts the background thread that logs a report every timeout seconds.
	void scraper_stats::start_thread(size_t timeout) {
		m_timeout = timeout;
		m_thread = thread([this]() {
			this->run();
		});
	}

	/*
	 * Locks m_lock and resets the counters that describe scrapers that are still running. The lock is held until
	 * end_count() is called.
	 * */
	void scraper_stats::start_count(size_t urls_in_queue) {
		m_lock.lock();

		m_urls_in_queue = urls_in_queue;
		m_urls_assigned = 0;

		m_unfinished_scrapers = 0;
		m_unfinished_scraped_urls = 0;
		m_unfinished_scraped_urls_non200 = 0;
		m_unfinished_scraped_errors = 0;
	}

	void scraper_stats::end_count() {
		m_lock.unlock();
	}

	// Adds the counters of a finished scraper to the totals for finished scrapers.
	void scraper_stats::count_finished(const scraper &scraper) {
		m_finished_scrapers += 1;
		if (scraper.blocked()) {
			m_num_blocked += 1;
		}

		m_scraped_urls += scraper.num_scraped();
		m_scraped_urls_non200 += scraper.num_scraped_non200();
		m_scraped_errors += scraper.num_errors();
	}

	// Adds the counters of a running scraper to the totals for unfinished scrapers.
	void scraper_stats::count_unfinished(const scraper &scraper) {
		m_unfinished_scrapers += 1;
		m_urls_assigned += scraper.size();

		m_unfinished_scraped_urls += scraper.num_scraped();
		m_unfinished_scraped_urls_non200 += scraper.num_scraped_non200();
		m_unfinished_scraped_errors += scraper.num_errors();
	}

	// Reporting loop: sleeps for m_timeout seconds and then logs a report, until m_running is cleared.
	void scraper_stats::run() {
		const size_t started_at = profiler::timestamp();
		while (m_running) {
			std::this_thread::sleep_for(std::chrono::seconds(m_timeout));
			const size_t seconds_running = profiler::timestamp() - started_at;
			log_report(seconds_running);
		}
	}

	/*
	 * Logs a summary of the current counters. dt is the number of seconds the statistics thread has been running and is
	 * used to calculate the number of scraped urls per second.
	 * */
	void scraper_stats::log_report(size_t dt) {
		m_lock.lock();
		std::stringstream ss;
		ss.precision(2);

		const auto urls_done = (m_scraped_urls + m_unfinished_scraped_urls);
		// Report 0 urls per second instead of inf/nan when no whole second has passed yet.
		const double urls_per_second = dt > 0 ? (double)urls_done / dt : 0.0;

		ss << endl;
		ss << "Scraper stats:" << endl;
		ss << m_urls_in_queue << " urls in queue (not assigned to any scraper)" << endl;
		ss << m_urls_assigned << " urls assigned to running scrapers" << endl;
		ss << urls_done << " urls done (200 response)" << endl;
		ss << (m_scraped_urls_non200 + m_unfinished_scraped_urls_non200) << " urls (non 200 response)" << endl;
		ss << (m_scraped_errors + m_unfinished_scraped_errors) << " urls (errors)" << endl;
		ss << fixed << urls_per_second << "/s" << endl;
		ss << m_finished_scrapers << " finished scrapers" << endl;
		ss << m_unfinished_scrapers << " unfinished scrapers" << endl;
		ss << m_num_blocked << " blocked scrapers" << endl;
		m_lock.unlock();
		LOG_INFO(ss.str());
	}

	// Creates a scraper for one domain. The results are written to store and every scraper has its own curl handle.
	scraper::scraper(const string &domain, scraper_store *store) :
		m_domain(domain),
		m_store(store)
	{
		m_curl = curl_easy_init();
	}

	// Waits for the scraper thread, uploads the domain info and then releases the curl handle.
	scraper::~scraper() {
		if (m_thread.joinable()) {
			m_thread.join();
		}
		upload_domain_info();
		curl_easy_cleanup(m_curl);
	}

	void scraper::push_url(const URL &url) {
		m_queue.push(url);
	}

	/*
	 * Main loop of the scraper. Downloads robots.txt and then handles the queued urls one at a time until the queue is
	 * empty or more than 20 requests in a row have failed. Sets m_finished when done.
	 * */
	void scraper::run() {

		download_domain_data();
		download_robots();

		while (m_queue.size()) {
			URL url = filter_url(m_queue.front());
			m_queue.pop();

			if (robots_allow_url(url)) {
				if (m_timeout) {
					// Wait between m_timeout/2 and 3*m_timeout/2 seconds before the request.
					const auto delay = std::chrono::seconds(m_timeout/2 + (rand() % m_timeout));
					this_thread::sleep_for(delay);
				}
				handle_url(url);
			}

			const bool too_many_errors = m_consecutive_error_count > 20;
			if (too_many_errors) break;
		}

		m_finished = true;
	}

	void scraper::handle_url(const URL &url) {
		cout << url.str() << endl;
		m_buffer.resize(0);
		curl_easy_setopt(m_curl, CURLOPT_USERAGENT, user_agent().c_str());
		curl_easy_setopt(m_curl, CURLOPT_FOLLOWLOCATION, 1l);
		curl_easy_setopt(m_curl, CURLOPT_MAXREDIRS, 5l);
		curl_easy_setopt(m_curl, CURLOPT_WRITEFUNCTION, curl_string_reader);
		curl_easy_setopt(m_curl, CURLOPT_WRITEDATA, this);
		curl_easy_setopt(m_curl, CURLOPT_URL, url.str().c_str());
		curl_easy_setopt(m_curl, CURLOPT_TIMEOUT, 30);
		curl_easy_setopt(m_curl, CURLOPT_ERRORBUFFER, m_curl_error_buffer);

		CURLcode res = curl_easy_perform(m_curl);

		if (res == CURLE_OK) {
			m_consecutive_error_count = 0;
			long response_code;
			char *new_url_str = nullptr;
			curl_easy_getinfo(m_curl, CURLINFO_RESPONSE_CODE, &response_code);
			curl_easy_getinfo(m_curl, CURLINFO_EFFECTIVE_URL, &new_url_str);

			// Fetch IP address.
			char *ip_cstr;
			string ip;
			if (!curl_easy_getinfo(m_curl, CURLINFO_PRIMARY_IP, &ip_cstr) && ip_cstr != nullptr) ip = string(ip_cstr);

			if (new_url_str != nullptr) {
				string new_u_str(new_url_str);
				URL new_url(new_u_str);
				update_url(new_url, response_code, common::cur_datetime(), URL());
				if (url.canonically_different(new_url)) {
					update_url(url, 301, common::cur_datetime(), new_url); // A bit of cheeting heere, it is not sure the original url had a 301 response code.
				}
				if (response_code == 200) {
					handle_200_response(m_buffer, response_code, ip, new_url);
				} else {
					handle_non_200_response(m_buffer, response_code, ip, new_url);
				}
			} else {
				update_url(url, response_code, common::cur_datetime(), URL());
				if (response_code == 200) {
					handle_200_response(m_buffer, response_code, ip, url);
				} else {
					handle_non_200_response(m_buffer, response_code, ip, url);
				}
			}
		} else {
			/*
			 * Handle everything here: https://curl.se/libcurl/c/libcurl-errors.html
			 * */
			vector<CURLcode> domain_errors = {
				CURLE_COULDNT_RESOLVE_HOST,
				CURLE_COULDNT_CONNECT,
			};

			handle_curl_error(url, res, string(m_curl_error_buffer));

			if (res == CURLE_COULDNT_RESOLVE_HOST || res == CURLE_COULDNT_CONNECT) {
				update_url(url, 10000 + res, common::cur_datetime(), URL());
				mark_all_urls_with_error(10000 + res);
			} else {
				update_url(url, 10000 + res, common::cur_datetime(), URL());
			}
		}

		m_buffer.resize(0);
		m_buffer.shrink_to_fit();
	}

	void scraper::mark_all_urls_with_error(size_t error_code) {
		while (m_queue.size()) {
			URL url = filter_url(m_queue.front());
			m_queue.pop();
			update_url(url, error_code, common::cur_datetime(), URL());
		}
	}

	// Meant to record the result of a visit to url. Currently a stub: the body is empty and all arguments are ignored.
	void scraper::update_url(const URL &url, size_t http_code, size_t last_visited, const URL &redirect) {
		// Store information about URL.
	}

	/*
	 * Counts a failed request and stores a tab separated line (url, curl error code, error message) in the store,
	 * followed by an upload of the stored errors.
	 * */
	void scraper::handle_curl_error(const URL &url, size_t curl_error, const std::string &error_msg) {
		++m_num_errors;
		++m_consecutive_error_count;

		m_store->add_curl_error(url.str() + "\t" + to_string(curl_error) + "\t" + error_msg + "\n");
		m_store->upload_curl_errors();
	}

	void scraper::handle_200_response(const string &data, size_t response_code, const string &ip, const URL &url) {
		(void)response_code;
		m_num_200++;
		parser::html_parser html_parser(100000);
		html_parser.parse(data, url.str());

		m_num_total++;
		if (url.has_www()) m_num_www++; 
		if (url.has_https()) m_num_https++; 
		if (m_num_total == 3) upload_domain_info();

		const string date = common::iso8601_datetime();

		if (html_parser.should_insert()) {
			const string line = (url.str()
				+ '\t' + html_parser.title()
				+ '\t' + html_parser.h1()
				+ '\t' + html_parser.meta()
				+ '\t' + html_parser.text()
				+ '\t' + date
				+ '\t' + ip
				+ '\n');
			m_store->add_scraper_data(line);
			string links;
			for (const auto &link : html_parser.links()) {
				links += (link.host()
					+ '\t' + link.path()
					+ '\t' + link.target_host()
					+ '\t' + link.target_path()
					+ '\t' + link.text()
					+ '\t' + (link.nofollow() ? "1" : "0")
					+ '\n');
			}
			m_store->add_link_data(links);
			m_store->upload_results();
		}
	}

	void scraper::handle_non_200_response(const string &data, size_t response_code, const string &ip, const URL &url) {

		m_num_non200++;

		check_for_captcha_block(data, response_code);

		parser::html_parser html_parser;
		html_parser.parse(data, url.str());

		const string date = common::iso8601_datetime();

		if (html_parser.should_insert()) {
			const string line = (url.str()
				+ '\t' + html_parser.title()
				+ '\t' + html_parser.h1()
				+ '\t' + html_parser.meta()
				+ '\t' + html_parser.text()
				+ '\t' + date
				+ '\t' + ip
				+ '\n');
			m_store->add_non_200_scraper_data(line);
			m_store->upload_non_200_results();
		}
	}

	/*
	 * Flags the scraper as blocked when a non 200 response mentions "Captcha" or "captcha". All urls still in the queue are then
	 * marked with error code 10000 + 999.
	 * */
	void scraper::check_for_captcha_block(const std::string &data, size_t response_code) {
		if (response_code == 200) return;

		const bool mentions_captcha = data.find("Captcha") != string::npos || data.find("captcha") != string::npos;
		if (!mentions_captcha) return;

		m_blocked = true;
		mark_all_urls_with_error(10000 + 999);
	}

	/*
	 * Not implemented. The body is empty, so calling this function has no effect.
	 * */
	void scraper::download_domain_data() {
		
	}

	void scraper::download_robots() {
		const URL robots_path = filter_url(URL("http://" + m_domain + "/robots.txt"));
		m_robots_content = simple_get(robots_path);

		scraper::upload_robots_txt(m_robots_content);
	}

	bool scraper::robots_allow_url(const URL &url) const {
		googlebot::RobotsMatcher matcher;
		bool allowed = matcher.OneAgentAllowedByRobots(m_robots_content, user_agent_token(), url.str());
		return allowed;
	}

	/*
	 * Downloads the given url with the scrapers curl handle and returns whatever ended up in m_buffer. Redirects are followed
	 * (at most 5) and the transfer is given 30 seconds.
	 *
	 * On success the body is checked for a captcha block. On a curl error the error is recorded and, if the host could not be
	 * resolved or connected to, every queued url is marked with 10000 + <curl code>. The buffer is returned in both cases, so
	 * after a failed transfer the result may be empty or partial.
	 * */
	string scraper::simple_get(const URL &url) {
		// curl_easy_setopt is variadic, numeric options must be passed as long.
		curl_easy_setopt(m_curl, CURLOPT_USERAGENT, user_agent().c_str());
		curl_easy_setopt(m_curl, CURLOPT_FOLLOWLOCATION, 1L);
		curl_easy_setopt(m_curl, CURLOPT_MAXREDIRS, 5L);
		curl_easy_setopt(m_curl, CURLOPT_WRITEFUNCTION, curl_string_reader);
		curl_easy_setopt(m_curl, CURLOPT_WRITEDATA, this);
		curl_easy_setopt(m_curl, CURLOPT_URL, url.str().c_str());
		curl_easy_setopt(m_curl, CURLOPT_TIMEOUT, 30L);
		curl_easy_setopt(m_curl, CURLOPT_ERRORBUFFER, m_curl_error_buffer);

		// Make sure a stale message from an earlier transfer is never reported for this one.
		m_curl_error_buffer[0] = '\0';

		m_buffer.resize(0);
		const CURLcode res = curl_easy_perform(m_curl);
		if (res == CURLE_OK) {
			long response_code = 0;
			curl_easy_getinfo(m_curl, CURLINFO_RESPONSE_CODE, &response_code);

			check_for_captcha_block(m_buffer, response_code);
		} else {
			/*
			 * Handle everything here: https://curl.se/libcurl/c/libcurl-errors.html
			 * */
			handle_curl_error(url, res, string(m_curl_error_buffer));

			// These two errors concern the whole domain, so there is no point in trying the remaining urls.
			if (res == CURLE_COULDNT_RESOLVE_HOST || res == CURLE_COULDNT_CONNECT) {
				mark_all_urls_with_error(10000 + res);
			}
		}

		return m_buffer;
	}

	/*
	 * Placeholder for uploading the per domain counters. Nothing is uploaded yet, the branch below only contains a TODO.
	 * */
	void scraper::upload_domain_info() {
		if (m_num_total > 0) {
			// TODO.. Upload data about domain.
		}
	}

	/*
	 * Placeholder for uploading the robots.txt content of the domain. The body is empty, robots_content is currently ignored.
	 * */
	void scraper::upload_robots_txt(const string &robots_content) {
		// TODO.. Upload data about robots.txt
	}

	/*
	 * Hook for normalizing a url before it is used. The scheme and www rewrites are disabled for now, so the url is returned as
	 * an unmodified copy.
	 * */
	URL scraper::filter_url(const URL &url) {
		//if (m_domain_data.m_has_https && !url.has_https()) ret.set_scheme("https");
		//if (m_domain_data.m_has_www && !url.has_www()) ret.set_www(true);

		return URL(url);
	}

	void scraper::start_thread() {
		m_started = true;
		m_thread = std::move(thread([this](){
			this->run();
		}));
	}

	size_t curl_string_reader(char *ptr, size_t size, size_t nmemb, void *userdata) {
		const size_t byte_size = size * nmemb;
		scraper *s = static_cast<scraper *>(userdata);
		if (s->m_buffer_len < s->m_buffer.size() + byte_size) return 0;
		s->m_buffer.append(ptr, byte_size);
		return byte_size;
	}

	/*
	 * Reads the wanted maximum number of simultaneous scrapers from /tmp/num_scrapers.
	 *
	 * Returns 0 when the file is missing, empty or does not start with a number. The caller treats 0 as "keep the current
	 * limit".
	 * */
	size_t read_max_scrapers() {
		std::ifstream infile("/tmp/num_scrapers");
		if (!infile.is_open()) return 0;

		// Must be initialized: on an empty file the extraction fails before anything is written to the variable.
		size_t max_scrapers = 0;
		if (!(infile >> max_scrapers)) return 0;

		return max_scrapers;
	}

	bool reset_scraper_urls() {
		string content = "";
		int error = transfer::upload_file("nodes/" + config::node + "/scraper.urls", content);
		return error == transfer::OK;
	}

	/*
	 * Downloads the url list of this node (nodes/<node>/scraper.urls), empties the remote file and returns the non blank lines.
	 * An empty vector is returned if the download fails, the remote file is left untouched in that case.
	 * */
	vector<string> download_scraper_urls() {
		int status;
		const string remote_path = "nodes/" + config::node + "/scraper.urls";
		string content = transfer::file_to_string(remote_path, status);
		if (status == transfer::ERROR) return {};

		// We have the list now, clear it on the server so the same urls are not handed out again.
		reset_scraper_urls();

		vector<string> lines;
		boost::algorithm::split(lines, content, boost::is_any_of("\n"));

		vector<string> urls;
		for (const string &line : lines) {
			// Blank lines are dropped. Note that the line itself is kept as it is, only the check uses the trimmed version.
			if (text::trim(line).size() == 0) continue;
			urls.push_back(line);
		}

		return urls;
	}

	/*
	 * Distributes the given urls over one scraper per host and keeps going for as long as there are urls left or scrapers alive.
	 *
	 * Each round:
	 *  1. The scraper limit is refreshed from read_max_scrapers() (0 means keep the current limit, which starts at 1000).
	 *  2. Every url is pushed to the scraper of its host. A new scraper is created if the host has none and the limit allows it,
	 *     otherwise the url is saved for the next round.
	 *  3. Scrapers that have not been started are started.
	 *  4. While more than 80% of the limit is in use we poll once per second and erase scrapers that report finished().
	 *  5. New urls are downloaded and appended to the urls saved in step 2. If there is nothing to do we sleep for 60 seconds.
	 *
	 * NOTE(review): finished scrapers are only erased in step 4, i.e. when more than 80% of the limit is in use. With fewer
	 * scrapers the map is never emptied here, so it looks like the outer loop cannot terminate once a scraper has been created.
	 * Confirm that this is intended.
	 *
	 * NOTE(review): in step 2 a url can be pushed to a scraper that is already started or even finished. Whether such a url is
	 * still picked up depends on scraper::run(), which is not visible from here. Verify.
	 * */
	void run_scraper_on_urls(const vector<string> &input_urls) {
		size_t max_scrapers = 1000;
		scraper_store store;
		scraper_stats stats;
		// One scraper per host, keyed by url.host().
		map<string, unique_ptr<scraper>> scrapers;

		stats.start_thread(60); // Report statistics every minute.

		vector<string> urls = input_urls;
		while (urls.size() || scrapers.size()) {

			LOG_INFO("Starting scrapers with: " + to_string(urls.size()) + " urls");

			// The limit can be changed while we are running, 0 means that no new limit could be read.
			size_t new_max_scrapers = read_max_scrapers();
			if (new_max_scrapers) {
				max_scrapers = new_max_scrapers;
			}

			// Urls that could not be assigned to a scraper in this round.
			vector<string> unhandled_urls;

			for (const string &url_str : urls) {
				URL url(url_str);

				if (scrapers.count(url.host()) == 0) {
					if (scrapers.size() >= max_scrapers) {
						// No room for another scraper, try again in the next round.
						unhandled_urls.push_back(url_str);
					} else {
						scrapers[url.host()] = make_unique<scraper>(url.host(), &store);
						scrapers[url.host()]->push_url(url);
					}
				} else {
					scrapers[url.host()]->push_url(url);
				}
			}
			// Start scrapers.
			for (auto &iter : scrapers) {
				if (!iter.second->started()) {
					iter.second->start_thread();
				}
			}
			
			// Wait for some scrapers to finish before we assign new scrapers again.
			while (scrapers.size() > max_scrapers * 0.8) {
				// Statistics are gathered before the erase below, so finished scrapers are still in the map when counted.
				stats.gather_statistics(scrapers, urls.size());
				for (auto iter = scrapers.begin(); iter != scrapers.end(); ) {
					if (iter->second->finished()) {
						iter = scrapers.erase(iter);
					} else {
						iter++;
					}
				}
				this_thread::sleep_for(1000ms);
			}
			stats.gather_statistics(scrapers, urls.size());
			urls = unhandled_urls;

			// Check for new urls and append them.
			vector<string> new_urls = download_scraper_urls();
			urls.insert(urls.end(), new_urls.begin(), new_urls.end());

			if (urls.size() == 0) {
				// We don't have any new urls. Just sleep a bit before checking again.
				std::this_thread::sleep_for(std::chrono::seconds(60));
			}
		}
		
	}

	void url_downloader() {

		const size_t timeout = 300;
		//const size_t limit = 500;

		// main loop
		while (true) {

			// Check if there are any urls to digest every 'timeout' minutes.
			vector<string> urls = download_scraper_urls();

			if (urls.size() > 0) {
				run_scraper_on_urls(urls);
			}

			sleep(timeout);
		}
	}
}


================================================
FILE: src/scraper/scraper.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <iostream>
#include <queue>
#include <curl/curl.h>
#include <thread>
#include <boost/filesystem.hpp>
#include "transfer/transfer.h"
#include "robots.h"
#include "scraper_store.h"
#include "URL.h"
#include "profiler/profiler.h"

namespace scraper {

	std::string user_agent_token();
	std::string user_agent();

	/*
	 * The scraper!
	 *
	 * One scraper handles the urls of one domain. Urls are added with push_url() and processed by run(), either directly on the
	 * calling thread or on the scrapers own thread after start_thread(). Results are handed to the scraper_store given to the
	 * constructor, the store is not owned by the scraper.
	 *
	 * NOTE(review): the status flags and counters below are plain variables. They appear to be written by the scraper thread and
	 * read through the accessors from the thread that polls the scrapers. Confirm that this is acceptable or make them atomic.
	 * */
	class scraper {
		public:

			scraper(const std::string &domain, scraper_store *store);
			~scraper();

			void set_timeout(size_t timeout_in_seconds) { m_timeout = timeout_in_seconds; }
			void push_url(const URL &url);
			void run();
			void start_thread();

			// Read only accessors, all of them are const so they can be used through a const scraper &.
			bool finished() const { return m_finished; }
			bool started() const { return m_started; }
			std::string domain() const { return m_domain; }
			size_t num_scraped() const { return m_num_200; }
			size_t num_scraped_non200() const { return m_num_non200; }
			size_t num_errors() const { return m_num_errors; }
			size_t size() const { return m_queue.size(); }
			bool blocked() const { return m_blocked; }

		private:
			std::thread m_thread;
			bool m_started = false;
			bool m_finished = false;
			std::string m_domain;

			// Receives the body of the current download, filled by curl_string_reader.
			std::string m_buffer;

			// Zero initialized so it is always a valid (empty) C string, also before curl has written to it.
			char m_curl_error_buffer[CURL_ERROR_SIZE] = {};

			// Upper limit for m_buffer in bytes (10 MB), larger downloads are aborted by curl_string_reader.
			size_t m_buffer_len = 1024*1024*10;
			size_t m_num_200 = 0;
			size_t m_num_non200 = 0;
			size_t m_num_errors = 0;
			bool m_blocked = false;
			CURL *m_curl = nullptr;
			scraper_store *m_store = nullptr;
			std::queue<URL> m_queue;
			googlebot::RobotsMatcher m_robots;
			std::string m_robots_content;
			size_t m_num_total = 0;
			size_t m_num_www = 0;
			size_t m_num_https = 0;
			size_t m_consecutive_error_count = 0;
			size_t m_timeout = 30;

			void handle_curl_error(const URL &url, size_t curl_error, const std::string &error_msg);
			void handle_url(const URL &url);
			void mark_all_urls_with_error(size_t error_code);
			void update_url(const URL &url, size_t http_code, size_t last_visited, const URL &redirect);
			void handle_200_response(const std::string &data, size_t response_code, const std::string &ip, const URL &url);
			void handle_non_200_response(const std::string &data, size_t response_code, const std::string &ip, const URL &url);
			void check_for_captcha_block(const std::string &data, size_t response_code);
			void download_domain_data();
			void download_robots();
			bool robots_allow_url(const URL &url) const;
			std::string simple_get(const URL &url);
			void upload_domain_info();
			void upload_robots_txt(const std::string &robots_content);
			URL filter_url(const URL &url);

		public:

			// The curl write callback appends directly to m_buffer.
			friend size_t curl_string_reader(char *ptr, size_t size, size_t nmemb, void *userdata);
	};

	/*
	 * Collects statistics about a set of running scrapers and owns a thread (m_thread).
	 *
	 * NOTE(review): the implementation is not part of this header. Judging by the names and by the caller, which passes 60 to
	 * start_thread() with the comment "Report statistics every minute", the thread presumably logs a report every m_timeout
	 * seconds. Confirm against the implementation.
	 * */
	class scraper_stats {
		public:
			scraper_stats();
			~scraper_stats();

			// Called by the scheduling loop with the current map of scrapers (keyed by host) and the number of unassigned urls.
			void gather_statistics(const std::map<std::string, std::unique_ptr<scraper>> &scrapers, size_t urls_in_queue);

			// timeout is the report interval, presumably in seconds. TODO confirm.
			void start_thread(size_t timeout);
			void start_count(size_t urls_in_queue);
			void end_count();
			void count_finished(const scraper &scraper);
			void count_unfinished(const scraper &scraper);

		private:
			std::thread m_thread;
			size_t m_timeout = 300;
			size_t m_num_blocked = 0;
			size_t m_finished_scrapers = 0;
			size_t m_unfinished_scrapers = 0;
			size_t m_scraped_urls = 0;
			size_t m_unfinished_scraped_urls = 0;
			size_t m_scraped_urls_non200 = 0;
			size_t m_unfinished_scraped_urls_non200 = 0;
			size_t m_scraped_errors = 0;
			size_t m_unfinished_scraped_errors = 0;
			size_t m_urls_in_queue = 0;
			size_t m_urls_assigned = 0;
			bool m_running = true;
			std::mutex m_lock;

			void run();
			void log_report(size_t dt);
	};

	size_t curl_string_reader(char *ptr, size_t size, size_t nmemb, void *userdata);

	size_t read_max_scrapers();
	void url_downloader();
	void run_scraper_on_urls(const std::vector<std::string> &input_urls);

}


================================================
FILE: src/scraper/scraper_store.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "scraper_store.h"
#include "common/system.h"
#include "common/datetime.h"
#include "warc/warc.h"
#include "transfer/transfer.h"
#include "logger/logger.h"

using namespace std;

namespace scraper {

	// Default construction keeps the in-class defaults of the members.
	scraper_store::scraper_store() {}

	// With do_upload == false all upload_* functions return without doing anything, the data is only kept in memory.
	scraper_store::scraper_store(bool do_upload) : m_do_upload(do_upload) {}
	
	/*
	 * Uploads whatever is still buffered before the store goes away.
	 *
	 * Every upload function only uploads when its buffer has reached its own limit, so each limit has to be lowered to 0 to
	 * force the flush. Non 200 results and curl errors are only flushed when there is something pending, so that no empty
	 * files are uploaded for them.
	 * */
	scraper_store::~scraper_store() {
		m_upload_limit = 0;
		upload_results();

		if (!m_non_200_results.empty()) {
			m_non_200_upload_limit = 0;
			upload_non_200_results();
		}

		if (!m_curl_errors.empty()) {
			m_curl_errors_upload_limit = 0;
			upload_curl_errors();
		}
	}

	/*
	 * Appends one result row to the buffer of scraped pages. Safe to call from several scraper threads.
	 * */
	void scraper_store::add_scraper_data(const std::string &line) {
		// The guard releases the lock also if push_back throws.
		std::lock_guard<std::mutex> guard(m_lock);
		m_results.push_back(line);
	}

	/*
	 * Appends one result row to the buffer of pages that did not respond with 200. Safe to call from several scraper threads.
	 * */
	void scraper_store::add_non_200_scraper_data(const std::string &line) {
		// The guard releases the lock also if push_back throws.
		std::lock_guard<std::mutex> guard(m_lock);
		m_non_200_results.push_back(line);
	}

	/*
	 * Appends the link rows of one page (already joined into a single string) to the link buffer. Safe to call from several
	 * scraper threads.
	 * */
	void scraper_store::add_link_data(const std::string &links) {
		// The guard releases the lock also if push_back throws.
		std::lock_guard<std::mutex> guard(m_lock);
		m_link_results.push_back(links);
	}

	/*
	 * Appends one row to the buffer of curl errors. Safe to call from several scraper threads.
	 * */
	void scraper_store::add_curl_error(const string &line) {
		// The guard releases the lock also if push_back throws.
		std::lock_guard<std::mutex> guard(m_lock);
		m_curl_errors.push_back(line);
	}

	/*
	 * Placeholder, nothing is uploaded yet. Only takes and releases the lock when uploading is enabled.
	 * */
	void scraper_store::upload_url_datas() {
		if (m_do_upload) {
			std::lock_guard<std::mutex> guard(m_lock);
			// todo upload data
		}
	}

	/*
	 * Placeholder, nothing is uploaded yet. Only takes and releases the lock when uploading is enabled.
	 * */
	void scraper_store::upload_domain_datas() {
		if (m_do_upload) {
			std::lock_guard<std::mutex> guard(m_lock);
			// todo upload data
		}
	}

	/*
	 * Placeholder, nothing is uploaded yet. Only takes and releases the lock when uploading is enabled.
	 * */
	void scraper_store::upload_robots_datas() {
		if (m_do_upload) {
			std::lock_guard<std::mutex> guard(m_lock);
			// todo upload data
		}
	}

	void scraper_store::upload_results() {
		if (!m_do_upload) return;
		m_lock.lock();
		if (m_results.size() >= m_upload_limit) {
			const string all_results = boost::algorithm::join(m_results, "");
			const string all_link_results = boost::algorithm::join(m_link_results, "");

			m_results.resize(0);
			m_link_results.resize(0);

			m_lock.unlock();

			internal_upload_results(all_results, all_link_results);

			return;
		}
		m_lock.unlock();
	}

	void scraper_store::upload_non_200_results() {
		if (!m_do_upload) return;
		m_lock.lock();
		if (m_non_200_results.size() >= m_non_200_upload_limit) {
			const string all_results = boost::algorithm::join(m_non_200_results, "");

			m_non_200_results.resize(0);

			m_lock.unlock();

			internal_upload_non_200_results(all_results);

			return;
		}
		m_lock.unlock();
	}

	void scraper_store::upload_curl_errors() {
		if (!m_do_upload) return;
		m_lock.lock();
		if (m_curl_errors.size() >= m_curl_errors_upload_limit) {
			const string all_results = boost::algorithm::join(m_curl_errors, "");

			m_curl_errors.resize(0);

			m_lock.unlock();

			internal_upload_curl_errors(all_results);

			return;
		}
		m_lock.unlock();
	}

	/*
	 * Returns the most recently added result row, or an empty string if there are no buffered results.
	 * */
	std::string scraper_store::tail() const {
		return m_results.empty() ? std::string("") : m_results.back();
	}

	void scraper_store::try_upload_until_complete(const string &path, const string &data) {

		size_t retry_num = 1;
		while (transfer::upload_gz_file(path, data) == transfer::ERROR) {
			LOG_INFO("Error uploading file " + path + " retry no " + to_string(retry_num++));
			std::this_thread::sleep_for(std::chrono::seconds(30));
		}
	}

	void scraper_store::internal_upload_results(const string &all_results, const string &all_link_results) {
		const string warc_path = "crawl-data/ALEXANDRIA-SCRAPER-01/files/" + common::uuid() + "-" + to_string(common::cur_datetime()) + "-" +
			to_string(m_file_index++) + ".warc.gz";
		try_upload_until_complete(warc::get_result_path(warc_path), all_results);
		try_upload_until_complete(warc::get_link_result_path(warc_path), all_link_results);
	}

	void scraper_store::internal_upload_non_200_results(const string &all_results) {
		const string warc_path = "crawl-data/ALEXANDRIA-SCRAPER-01/non-200-responses/" + common::uuid() + "-" + to_string(common::cur_datetime()) +
			"-" + to_string(m_file_index++) + ".warc.gz";
		try_upload_until_complete(warc::get_result_path(warc_path), all_results);
	}

	void scraper_store::internal_upload_curl_errors(const string &all_results) {
		const string warc_path = "crawl-data/ALEXANDRIA-SCRAPER-01/curl-errors/" + common::uuid() + "-" + to_string(common::cur_datetime()) +
			"-" + to_string(m_file_index++) + ".warc.gz";
		try_upload_until_complete(warc::get_result_path(warc_path), all_results);
	}

}


================================================
FILE: src/scraper/scraper_store.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include <iostream>
#include <vector>
#include <mutex>
#include <thread>

namespace scraper {

	/*
	 * Responsible for buffering scraper data in memory and uploading it to our fileserver once a buffer has reached its configured number of rows.
	 * */
	class scraper_store {
		public:
			scraper_store();

			// do_upload == false disables every upload_* function.
			scraper_store(bool do_upload);
			~scraper_store();

			// The add_* functions append one row (or one pre-joined chunk of rows) to the matching buffer.
			void add_scraper_data(const std::string &line);
			void add_non_200_scraper_data(const std::string &line);
			void add_link_data(const std::string &links);
			void add_curl_error(const std::string &line);

			// Placeholders, their implementations do not upload anything yet.
			void upload_url_datas();
			void upload_domain_datas();
			void upload_robots_datas();

			// Each of these uploads and empties its buffer, but only when the buffer has reached its limit (see below).
			void upload_results();
			void upload_non_200_results();
			void upload_curl_errors();

			// Last added result row, or "" if there is none.
			std::string tail() const;

			// Returns a copy of the buffered result rows. Note that m_lock is not taken here.
			std::vector<std::string> get_results() const { return m_results; }

		private:
			// Guards the four buffers below.
			std::mutex m_lock;
			std::vector<std::string> m_results;
			std::vector<std::string> m_non_200_results;
			std::vector<std::string> m_link_results;
			std::vector<std::string> m_curl_errors;

			// Running number that is made part of every uploaded file name.
			size_t m_file_index = 0;

			// Number of buffered rows required before the matching upload function actually uploads.
			size_t m_upload_limit = 50000;
			size_t m_non_200_upload_limit = 10000;
			size_t m_curl_errors_upload_limit = 10000;
			bool m_do_upload = true;

			// Retries the upload until it succeeds.
			void try_upload_until_complete(const std::string &path, const std::string &data);
			void internal_upload_results(const std::string &all_results, const std::string &all_link_results);
			void internal_upload_non_200_results(const std::string &all_results);
			void internal_upload_curl_errors(const std::string &all_results);

	};

}


================================================
FILE: src/scraper.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <iostream>
#include <signal.h>
#include <set>
#include <boost/algorithm/string.hpp>
#include "fcgio.h"
#include "config.h"
#include "logger/logger.h"
#include "scraper/scraper.h"

using namespace std;

void custom_scraper() {

	set<string> files = {
		"1081037252118226853.gz",
		"10929784512354426297.gz",
		"11734959054377540990.gz",
		"1231587059077024966.gz",
		"12502184239462757041.gz",
		"12938836205580400636.gz",
		"13296278169331508461.gz",
		"14413462586171452382.gz",
		"15525439295995440529.gz",
		"16672519014390713150.gz",
		"18394430357962364895.gz",
		"10327881400750748691.gz",
		"10670281930934377105.gz",
		"10803309592637608156.gz",
		"1081037252118226853.gz", 
		"10834835858785818363.gz",
		"10929784512354426297.gz",
		"11126428663436160103.gz",
		"11147566439172409894.gz",
		"11190665490273023949.gz",
		"11494937404220367031.gz",
		"11734959054377540990.gz",
		"11828921816388240862.gz",
		"12060772154545358825.gz",
		"12162727308599252185.gz",
		"1231587059077024966.gz", 
		"12422730800151531594.gz",
		"12502184239462757041.gz",
		"12607232937660003080.gz",
		"12718743898666138934.gz",
		"12938836205580400636.gz",
		"13296278169331508461.gz",
		"13298202493829067141.gz",
		"13361744378846796689.gz",
		"13490885160851937523.gz",
		"13574739826384812082.gz",
		"13587802784601809709.gz",
		"13631835647153009173.gz",
		"1367770908792956967.gz", 
		"14046839555269968094.gz",
		"14413462586171452382.gz",
		"14541904792326560616.gz",
		"1482373106349460952.gz", 
		"14837337010216722341.gz",
		"15086873759162732674.gz",
		"15141235398943116798.gz",
		"15184607826907101421.gz",
		"15202491165257081552.gz",
		"15282359210281111669.gz",
		"15389582257311135463.gz",
		"15391345478373482283.gz",
		"15525439295995440529.gz",
		"15534406110118601925.gz",
		"15538335442391548855.gz",
		"15612477389751002303.gz",
		"15624474507591924007.gz",
		"15676254393982196237.gz",
		"15984927866124019398.gz",
		"16082148041043793761.gz",
		"16126091541072713257.gz",
		"16255682052513253306.gz",
		"16337701239641827376.gz",
		"16383716280375787103.gz",
		"16529912269361020733.gz",
		"16534544105461457700.gz",
		"16639969140692056885.gz",
		"16672519014390713150.gz",
		"16744732358440828846.gz",
		"16836166158893839160.gz",
		"17068835535637839797.gz",
		"1729061688188470388.gz", 
		"17360561405055540730.gz",
		"1746843565446970019.gz", 
		"17640709097762418065.gz",
		"18131842535353305093.gz",
		"18187211227753083566.gz",
		"18394430357962364895.gz",
		"1934117982241616211.gz", 
		"2211216046817783595.gz", 
		"2239809113491403275.gz", 
		"2327635888646701575.gz", 
		"2478041411438244752.gz", 
		"2551177065288807556.gz", 
		"2601237824066336189.gz", 
		"2646934360799240353.gz", 
		"2868212837076456812.gz", 
		"2926810779085983621.gz", 
		"3091319073926623211.gz", 
		"338937183383628192.gz",  
		"3604690558929123764.gz", 
		"3606044194188728481.gz", 
		"3852426225324652244.gz", 
		"3972328001646307399.gz", 
		"4007769859008228127.gz", 
		"4072548759689568430.gz", 
		"4193623627004305293.gz", 
		"4226856446620685890.gz", 
		"4312881270332666532.gz", 
		"4473520710685818343.gz", 
		"4720198542499220909.gz", 
		"4734886902380514989.gz", 
		"4800764859071121577.gz", 
		"4837392932044495189.gz", 
		"493001789945179170.gz",  
		"5263808122620003539.gz", 
		"5284265763220135234.gz", 
		"5322267948444699594.gz", 
		"5339170779334172446.gz", 
		"5496827761574196815.gz", 
		"5683557192991319856.gz", 
		"5772366474889297285.gz", 
		"5790856524309526271.gz", 
		"5853082621493931535.gz", 
		"5936310530969939988.gz", 
		"5958586233415593683.gz", 
		"5969382542874041237.gz", 
		"5969882935831645732.gz", 
		"6133590028181400561.gz", 
		"6168304203247739410.gz", 
		"619121932569169133.gz",  
		"6233832895907042056.gz", 
		"6371233587304885182.gz", 
		"6665598992901336677.gz", 
		"6747719063536596803.gz", 
		"6783121411632321193.gz", 
		"6878954272251422334.gz", 
		"6944679014837000907.gz", 
		"7204366432079867323.gz", 
		"7261759399318904627.gz", 
		"7279922463899918193.gz", 
		"7372161099870305017.gz", 
		"7483704574748382827.gz", 
		"7500975006697782336.gz", 
		"7577940383110528297.gz", 
		"7660839115654270407.gz", 
		"7690859939878490358.gz", 
		"7794216653216203685.gz", 
		"7969521158007747392.gz", 
		"7972503305086309118.gz", 
		"7977087069524267698.gz", 
		"801925665986995127.gz",  
		"8357461134896215565.gz", 
		"8473327975000475483.gz", 
		"8558287370764624669.gz", 
		"88637784417391575.gz",   
		"9219910288440466216.gz", 
		"9257832192261807811.gz", 
		"9300442310473380111.gz", 
		"9529889625719263624.gz", 
		"9668036200275969373.gz", 
		"990293958999783642.gz"
	};

	boost::filesystem::create_directories("output");

	for (string file : files) {

		ifstream infile("output/" + file);
		if (infile.is_open()) continue;

		stringstream ss;
		int error;
		transfer::gz_file_to_stream("crawl-data/ALEXANDRIA-TEST-SIZES/files/" + file, ss, error);

		if (error == transfer::OK) {
			string line;

			scraper::scraper_store store(false);
			map<string, unique_ptr<scraper::scraper>> scrapers;
			while (getline(ss, line)) {
				vector<string> cols;
				boost::algorithm::split(cols, line, boost::is_any_of("\t"));

				URL url(cols[0]);

				if (scrapers.count(url.host()) == 0) {
					scrapers[url.host()] = make_unique<scraper::scraper>(url.host(), &store);
					scrapers[url.host()]->set_timeout(0);
				}

				scrapers[url.host()]->push_url(url);
			}

			for (auto &_scraper : scrapers) {
				_scraper.second->run();
			}

			const string filename = "output/" + file;
			ofstream outfile(filename, ios::trunc | ios::binary);

			boost::iostreams::filtering_ostream compress_stream;
			compress_stream.push(boost::iostreams::gzip_compressor());
			compress_stream.push(outfile);

			for (const string row : store.get_results()) {
				compress_stream << row;
			}
		}
		return;
	}
/*
	scraper::scraper_store store(false);
	scraper::scraper _scraper("heroes.thelazy.net", &store);
	_scraper.set_timeout(0);
	_scraper.push_url(URL("https://heroes.thelazy.net//index.php/Main_Page"));
	_scraper.push_url(URL("https://heroes.thelazy.net//index.php/Dungeon"));
	_scraper.run();

	for (const string row : store.get_results()) {
		cout << row << endl;
	}*/
}

/*
 * Entry point of the scraper binary. Ignores SIGPIPE, starts the logger, reads the config from the path in the environment
 * variable ALEXANDRIA_CONFIG (or /etc/alexandria.conf when it is not set) and runs custom_scraper().
 */
int main(int argc, const char **argv) {

	struct sigaction ignore_action{SIG_IGN};
	sigaction(SIGPIPE, &ignore_action, NULL);

	logger::start_logger_thread();

	const char *config_path = getenv("ALEXANDRIA_CONFIG");
	config::read_config(config_path != NULL ? config_path : "/etc/alexandria.conf");

	custom_scraper();

	logger::join_logger_thread();

	return 0;
}


================================================
FILE: src/search_engine/search_allocation.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include "full_text/result_set.h"
#include "full_text/record.h"
#include "full_text/link_record.h"
#include "full_text/domain_link_record.h"
#include "config.h"
#include <map>
#include <vector>

namespace search_engine {

	/*
		The idea with this namespace is to handle all the memory allocation needed for serving a request to the search engine.
	*/

	/*
		Holds the result sets that are needed for serving one search request for records of type data_record.
	*/
	template <typename data_record>
	struct storage {
		/*
			m_result_sets holds pre-allocated objects of class full_text::result_set, create_storage allocates
			config::query_max_words of them.
		*/
		std::vector<std::unique_ptr<full_text::result_set<data_record>>> m_result_sets;

		// To hold the intersection of the result sets.
		std::unique_ptr<full_text::result_set<data_record>> m_intersected_result;
	};

	class allocation {

		public:

			allocation() {
				m_storage = create_storage();
				m_link_storage = std::make_unique();
				m_domain_link_storage = std::make_unique();
			}

		private:
			std::unique_ptr<storage<full_text::record>> m_storage;
			std::unique_ptr<storage<full_text::link_record>> m_link_storage;
			std::unique_ptr<storage<full_text::domain_link_record>> m_domain_link_storage;
	};

	template <typename data_record>
	std::unique_ptr<storage<data_record>> *create_storage() {
		auto storage = new Storage<data_record>;

		// Allocate result_sets.
		for (size_t j = 0; j < config::query_max_words; j++) {
			auto result_set = std::make_unique<full_text::result_set<data_record>>(config::ft_max_results_per_section * config::ft_max_sections);
			storage->result_sets.push_back(std::move(result_set));
		}
		storage->intersected_result = std::make_unique<full_text::result_set<data_record>>(config::ft_max_results_per_section * config::ft_max_sections);

		return storage;
	}

	allocation *create_allocation();
	void delete_allocation(allocation *allocation);

}


================================================
FILE: src/search_engine/search_engine.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "search_engine.h"
#include <cmath>

using namespace std;

namespace search_engine {

	/*
		Puts the metric back into its initial state by zeroing all of its counters, so the same object can be reused
		for the next query.
	*/
	void reset_search_metric(struct full_text::search_metric &metric) {
		// Counters describing what was found.
		metric.m_total_found = 0;
		metric.m_total_domain_links_found = 0;
		metric.m_total_url_links_found = 0;

		// Counters describing how links were applied.
		metric.m_links_handled = 0;
		metric.m_link_url_matches = 0;
		metric.m_link_domain_matches = 0;
	}

	/*
		Runs a normal search with config::pre_result_limit as limit and then deduplicates the hits per domain,
		returning at most `limit` records.
	*/
	std::vector<full_text::record> search_deduplicate(storage<full_text::record> *storage,
		const full_text::index<full_text::record> &index, const vector<full_text::link_record> &links,
		const vector<full_text::domain_link_record> &domain_links, const string &query, size_t limit, struct full_text::search_metric &metric) {

		// Fetch more candidates than asked for, the deduplication below needs some headroom.
		const vector<full_text::record> candidates = search_wrapper(storage, index, links, domain_links, query,
			config::pre_result_limit, metric);

		return deduplicate_result_vector<full_text::record>(candidates, limit);
	}

}


================================================
FILE: src/search_engine/search_engine.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include <iostream>
#include <vector>
#include <cmath>
#include "full_text/index.h"
#include "full_text/record.h"
#include "full_text/link_record.h"
#include "full_text/domain_link_record.h"
#include "full_text/shard.h"
#include "full_text/search_metric.h"
#include "logger/logger.h"
#include "profiler/profiler.h"
#include "parser/parser.h"
#include "transfer/transfer.h"
#include "algorithm/hash.h"
#include "algorithm/sort.h"
#include "algorithm/algorithm.h"
#include "search_allocation.h"
#include <cassert>

namespace search_engine {

	using std::string;
	using std::vector;
	using std::future;
	using std::thread;
	using std::span;
	using std::pair;
	using std::map;
	using std::unordered_map;

	/*
		Public interface
	*/

	/*
		Our main search routine, no deduplication, just raw search.

		Searches the index for the words of the query, applies the scores of the given url links and domain links and
		returns at most `limit` records ordered by descending score. The metric is reset and then filled with statistics
		about the search.
	*/
	template<typename data_record>
	vector<data_record> search(storage<data_record> *storage, const full_text::index<data_record> &index,
		const vector<full_text::link_record> &links, const vector<full_text::domain_link_record> &domain_links, const string &query, size_t limit,
		struct full_text::search_metric &metric);

	/*
		Same as search() but limits how many results each domain may contribute.

		Only available for full_text::record since deduplication requires domain hashes.
	*/
	vector<full_text::record> search_deduplicate(storage<full_text::record> *storage,
		const full_text::index<full_text::record> &index, const vector<full_text::link_record> &links,
		const vector<full_text::domain_link_record> &domain_links, const string &query, size_t limit, struct full_text::search_metric &metric);

	/*
		Searches for the exact phrase. The whole phrase is treated as a single n-gram, so results are only returned when
		the number of words in the query is less than or equal to config::n_gram.
	*/
	template<typename data_record>
	vector<data_record> search_exact(storage<data_record> *storage, const full_text::index<data_record> &index,
		const string &query, size_t limit, struct full_text::search_metric &metric);

	/*
		Looks up the expanded words of the query, joined by spaces, as one single key in the index and returns every
		record stored for that key.

		NOTE(review): the implementation in this header never reads `limit` - confirm whether that is intended.
	*/
	template<typename data_record>
	vector<data_record> search_ids(storage<data_record> *storage, const full_text::index<data_record> &index,
		const string &query, size_t limit);

	/*
		Fetches the records for the query over http from config::data_node and stores them in
		storage->result_sets[0], which is also the returned pointer. The result set is left empty when the transfer
		does not report transfer::OK.
	*/
	template<typename data_record>
	full_text::result_set<data_record> *search_remote(const std::string &query, storage<data_record> *storage);


	/*
		Orders records by descending m_score. Records with equal score are ordered by ascending m_value, which makes
		the order deterministic.

		The call operator is const and takes its arguments by const reference so the comparator also works on const
		records and when an algorithm holds the comparator itself as a const object.
	*/
	template<typename data_record>
	class comparator_class {
	public:
		// Returns true if a should be placed before b.
		bool operator()(const data_record &a, const data_record &b) const
		{
			if (a.m_score == b.m_score) return a.m_value < b.m_value;
			return a.m_score > b.m_score;
		}
	};

	// Sets every counter of the given metric to zero. Defined in search_engine.cpp.
	void reset_search_metric(struct full_text::search_metric &metric);

	/*
		Stores an estimate of the total number of matches in metric.m_total_found.

		The estimate is the largest total_num_results() among the given result sets multiplied by result_quote and
		truncated to size_t. With an empty result_vector the estimate is 0.

		The result sets are taken by const reference, the vector is only read and does not need to be copied.
	*/
	template<typename data_record>
	void set_total_found(const vector<full_text::result_set<data_record> *> &result_vector, struct full_text::search_metric &metric, double result_quote) {

		size_t largest_total = 0;
		for (full_text::result_set<data_record> *result : result_vector) {
			if (result->total_num_results() > largest_total) {
				largest_total = result->total_num_results();
			}
		}

		metric.m_total_found = (size_t)(largest_total * result_quote);
	}

	template<typename data_record>
	size_t largest_result(const vector<full_text::result_set<data_record> *> &result_vector) {

		size_t largest_size = 0;
		for (full_text::result_set<data_record> *result : result_vector) {
			if (result->size() > largest_size) {
				largest_size = result->size();
			}
		}

		return largest_size;
	}

	/*
		Add scores for the given links to the result set. The links are assumed to be ordered by link.m_target_hash ascending.
	*/
	template<typename data_record>
	size_t apply_link_scores(const vector<full_text::link_record> &links, full_text::result_set<data_record> *results) {

		if (typeid(data_record) != typeid(full_text::record)) return 0;
		if (links.size() == 0) return 0;

		size_t applied_links = 0;

		size_t i = 0;
		size_t j = 0;
		map<pair<uint64_t, uint64_t>, uint64_t> domain_unique;
		full_text::record *data = (full_text::record *)results->data_pointer();
		while (i < links.size() && j < results->size()) {

			const uint64_t hash1 = links[i].m_target_hash;
			const uint64_t hash2 = data[j].m_value;

			if (hash1 < hash2) {
				i++;
			} else if (hash1 == hash2) {

				if (domain_unique.count(std::make_pair(links[i].m_source_domain, links[i].m_target_hash)) == 0) {
					const float url_score = expm1(25.0f*links[i].m_score) / 50.0f;
					data[j].m_score += url_score;
					applied_links++;
					domain_unique[std::make_pair(links[i].m_source_domain, links[i].m_target_hash)] = links[i].m_source_domain;
				}

				i++;
			} else {
				j++;
			}
		}

		return applied_links;
	}

	template<typename data_record>
	size_t apply_domain_link_scores(const vector<full_text::domain_link_record> &links, full_text::result_set<data_record> *results) {

		if (typeid(data_record) != typeid(full_text::record)) return 0;
		if (links.size() == 0) return 0;

		size_t applied_links = 0;
		{
			std::unordered_map<uint64_t, float> domain_scores;
			std::unordered_map<uint64_t, int> domain_counts;
			std::map<std::pair<uint64_t, uint64_t>, uint64_t> domain_unique;
			{
				for (const full_text::domain_link_record &link : links) {

					if (domain_unique.count(std::make_pair(link.m_source_domain, link.m_target_domain)) == 0) {

						const float domain_score = expm1(25.0f*link.m_score) / 50.0f;
						domain_scores[link.m_target_domain] += domain_score;
						domain_counts[link.m_target_domain]++;
						domain_unique[std::make_pair(link.m_source_domain, link.m_target_domain)] = link.m_source_domain;

					}
				}
			}

			// Loop over the results and add the calculated domain scores.
			full_text::record *data = (full_text::record *)results->data_pointer();
			for (size_t i = 0; i < results->size(); i++) {
				const float domain_score = domain_scores[data[i].m_domain_hash];
				data[i].m_score += domain_score;
				applied_links += domain_counts[data[i].m_domain_hash];
			}
		}

		return applied_links;
	}

	/*
		Binary search in data[pos, len) for the first record whose m_value is not less than value.

		Returns the index of that record, or len if every record in the range is smaller. The range has to be ordered
		by m_value ascending for the result to be meaningful.
	*/
	template<typename data_record>
	size_t lower_bound(const data_record *data, size_t pos, size_t len, uint64_t value) {
		size_t lo = pos;
		size_t hi = len;
		for (;;) {
			if (lo >= hi) return lo;

			const size_t mid = (lo + hi) / 2;
			if (data[mid].m_value < value) {
				// Everything up to and including mid is too small.
				lo = mid + 1;
			} else {
				hi = mid;
			}
		}
	}

	/*
		Intersects the given result sets on m_value and appends every record that is present in all of them to dest.

		sections[i] selects which section of result_sets[i] is read (through section_pointer). The result set with the
		fewest records drives the iteration, the other ones are scanned forward linearly. This relies on the records of
		every section being ordered by m_value ascending. The score of an appended record is the mean of the scores of
		the matching records in all result sets.

		sections is taken by const reference, it is only read and does not need to be copied for every call.
	*/
	template<typename data_record>
	void value_intersection(const vector<full_text::result_set<data_record> *> &result_sets, const vector<int> &sections, vector<data_record> &dest) {

		if (result_sets.size() == 0) {
			return;
		}

		// Find the result set with the fewest records, the first one wins on ties.
		size_t shortest_vector_position = 0;
		size_t shortest_len = SIZE_MAX;
		for (size_t i = 0; i < result_sets.size(); i++) {
			if (shortest_len > result_sets[i]->size()) {
				shortest_len = result_sets[i]->size();
				shortest_vector_position = i;
			}
		}

		// Current read position in every result set.
		vector<size_t> positions(result_sets.size(), 0);

		const data_record *shortest_data = result_sets[shortest_vector_position]->section_pointer(sections[shortest_vector_position]);

		while (positions[shortest_vector_position] < shortest_len) {

			bool all_equal = true;
			const uint64_t value = shortest_data[positions[shortest_vector_position]].m_value;

			float score_sum = 0.0f;
			for (size_t i = 0; i < result_sets.size(); i++) {
				const data_record *data_arr = result_sets[i]->section_pointer(sections[i]);
				const size_t len = result_sets[i]->size();

				size_t &pos = positions[i];

				// this is a linear search.
				while (pos < len && value > data_arr[pos].m_value) {
					pos++;
				}

				if (pos < len && value == data_arr[pos].m_value) {
					score_sum += data_arr[pos].m_score;
				} else {
					// Either this set is exhausted or its next value is already larger, so value is not in it.
					all_equal = false;
					break;
				}
			}
			if (all_equal) {
				dest.push_back(shortest_data[positions[shortest_vector_position]]);
				dest.back().m_score = score_sum / result_sets.size();
			}

			positions[shortest_vector_position]++;
		}
	}

	/*
		Calculates the intersection of the given result sets and copies it into dest.

		If any of the result sets is empty the function returns directly and dest is left untouched. Otherwise the
		intersection of the first partition (the top sections) is tried first and used if it holds at least
		config::result_limit records. If not, every partition returned by Algorithm::incremental_partitions is
		intersected on a thread pool and the partial results are merged by m_value.
	*/
	template<typename data_record>
	void calculate_intersection(const vector<full_text::result_set<data_record> *> &result_sets, full_text::result_set<data_record> *dest) {

		// An empty result set means an empty intersection.
		for (full_text::result_set<data_record> *result : result_sets) {
			if (result->size() == 0) return;
		}

		// Work on a copy ordered by total_num_results() ascending, the callers vector is not modified.
		vector<full_text::result_set<data_record> *> sorted_result_sets(result_sets);

		sort(sorted_result_sets.begin(), sorted_result_sets.end(), [](const full_text::result_set<data_record> *a, const full_text::result_set<data_record> *b) {
			return a->total_num_results() < b->total_num_results();
		});

		// Number of sections per result set, in the same order as sorted_result_sets.
		vector<int> lengths;
		for (full_text::result_set<data_record> *result : sorted_result_sets) {
			lengths.push_back(result->num_sections());
		}

		// Each partition holds one section index per result set.
		// NOTE(review): partitions[0] is accessed below without checking that partitions is non empty - confirm that
		// incremental_partitions always returns at least one partition.
		vector<vector<int>> partitions = Algorithm::incremental_partitions(lengths, config::ft_section_depth);

		// First just try the top sections.
		{
			vector<data_record> result;
			value_intersection(sorted_result_sets, partitions[0], result);
			if (result.size() >= config::result_limit) {
				dest->copy_vector(result);
				return;
			}
		}

		// Find the highest section index used by any partition for every result set and read up to that section.
		vector<int> maximum(sorted_result_sets.size(), 0);
		for (const vector<int> &vec : partitions) {
			for (size_t i = 0; i < vec.size(); i++) {
				if (vec[i] > maximum[i]) maximum[i] = vec[i];
			}
		}
		for (size_t i = 0; i < maximum.size(); i++) {
			sorted_result_sets[i]->read_to_section(maximum[i]);
		}

		size_t idx = 0;
		const size_t num_threads = 8;

		// Intersect every partition as a separate task on the pool. The tasks capture their input by copy.
		ThreadPool pool(num_threads);
		vector<vector<data_record>> results(partitions.size());
		std::vector<std::future<vector<data_record>>> thread_results;
		for (const vector<int> &partition : partitions) {
			thread_results.emplace_back(pool.enqueue([sorted_result_sets, partition]() {
				vector<data_record> result;
				value_intersection(sorted_result_sets, partition, result);
				return result;
			}));
			// The value of idx is not read in this loop, it is reset below.
			idx++;
		}
		// Collect the results in the order the tasks were enqueued, get() blocks until the task is done.
		idx = 0;
		for (auto && result: thread_results) {
			results[idx] = result.get();
			idx++;
		}
		// merge
		vector<data_record> merged_vec;
		Sort::merge_arrays(results, [](const data_record &a, const data_record &b) {
			return a.m_value < b.m_value;
		}, merged_vec);

		// copy.
		dest->copy_vector(merged_vec);
	}

	/*
		Sorts the records in place so that the record with the highest m_score comes first. The relative order of
		records with equal score is unspecified.
	*/
	template<typename data_record>
	void sort_by_score(vector<data_record> &results) {
		const auto higher_score_first = [](const data_record &lhs, const data_record &rhs) {
			return rhs.m_score < lhs.m_score;
		};
		sort(results.begin(), results.end(), higher_score_first);
	}

	/*
		Keeps only the n best records of the result set.

		The n records that come first according to comparator_class (highest score, ties broken by lowest value) are
		moved to the first n slots, those n records are then sorted by m_value ascending and the result set is resized
		to n.

		Result sets that hold n or fewer records are left untouched. With n == 0 the result set is emptied.
	*/
	template<typename data_record>
	void get_unsorted_results_with_top_scores(full_text::result_set<data_record> *result, size_t n) {

		if (result->size() <= n) return;

		if (n == 0) {
			// Nothing is kept. Handled separately since begin() + (n - 1) below would wrap around for n == 0.
			result->resize(0);
			return;
		}

		span<data_record> *arr = result->span_pointer();

		// comparator_class is declared in this namespace (search_engine), no qualification needed.
		nth_element(arr->begin(), arr->begin() + (n - 1), arr->end(), comparator_class<data_record>{});

		sort(arr->begin(), arr->begin() + n, [](const data_record &a, const data_record &b) {
			return a.m_value < b.m_value;
		});

		result->resize(n);
	}

	/*
		Returns true if the records of the result set belong to more than one domain, i.e. if any record has an
		m_domain_hash different from the one of the first record. An empty result set gives false.
	*/
	template<typename data_record>
	bool result_has_many_domains(const full_text::result_set<data_record> *results) {

		const size_t num_results = results->size();
		if (num_results == 0) return false;

		const data_record *data = results->data_pointer();
		const uint64_t reference_hash = data[0].m_domain_hash;

		// Skip past every record that shares the domain of the first one.
		size_t i = 0;
		while (i < num_results && data[i].m_domain_hash == reference_hash) {
			i++;
		}

		// Stopping before the end means a record from another domain was found.
		return i < num_results;
	}

	/*
		Removes records in place so that no domain contributes more than results_per_domain records and at most `limit`
		records remain. The relative order of the kept records is preserved and the result set is resized to the number
		of kept records.
	*/
	template<typename data_record>
	void deduplicate_domains(full_text::result_set<data_record> *results, size_t results_per_domain, size_t limit) {

		unordered_map<uint64_t, size_t> domain_counts;
		data_record *records = results->data_pointer();

		// records[0, kept) holds the records that survived so far.
		size_t kept = 0;
		for (size_t i = 0; i < results->size() && kept < limit; i++) {
			// Tentatively place the record in the next free slot, it is only kept if kept is advanced below.
			records[kept] = records[i];

			size_t &count = domain_counts[records[i].m_domain_hash];
			if (count < results_per_domain) {
				kept++;
				count++;
			}
		}
		results->resize(kept);
	}

	/*
		Returns at most `limit` records where every domain preferably contributes no more than
		config::deduplicate_domain_count records.

		The first config::deduplicate_domain_count records of every domain are kept. If that leaves fewer than `limit`
		records, the gap is filled with the first of the remaining records and both groups are merged by descending
		score. Otherwise the kept records are simply cut off at `limit`.
	*/
	template<typename data_record>
	vector<data_record> deduplicate_result_vector(const vector<data_record> &results, size_t limit) {

		vector<data_record> kept;
		vector<data_record> overflow;

		// Number of records seen so far per domain.
		map<uint64_t, size_t> seen_per_domain;
		for (const data_record &result : results) {
			size_t &seen = seen_per_domain[result.m_domain_hash];
			if (seen < config::deduplicate_domain_count) {
				kept.push_back(result);
			} else {
				overflow.push_back(result);
			}
			seen++;
		}

		if (kept.size() >= limit) {
			kept.resize(limit);
			return kept;
		}

		// Not enough records after deduplication, top up with the records that were filtered out.
		const size_t num_missing = limit - kept.size();
		if (overflow.size() > num_missing) {
			overflow.resize(num_missing);
		}

		vector<data_record> merged;
		Sort::merge_arrays(kept, overflow, [] (const data_record &a, const data_record &b) {
			return a.m_score > b.m_score;
		}, merged);

		return merged;
	}

	/*
		Looks up every distinct word in the shard its hash belongs to and returns the result sets that were filled, one
		per distinct word in order of first appearance.

		The results are written into result_sets starting at index 0, so result_sets must hold at least words.size()
		entries.
	*/
	template<typename data_record>
	vector<full_text::result_set<data_record> *> search_shards(vector<full_text::result_set<data_record> *> &result_sets,
		const vector<FullTextShard<data_record> *> &shards, const vector<string> &words) {

		assert(words.size() <= config::query_max_words);
		assert(words.size() <= result_sets.size());

		vector<full_text::result_set<data_record> *> filled_sets;
		vector<string> seen_words;
		for (const string &word : words) {

			// One word should only be searched once.
			const bool already_searched = find(seen_words.begin(), seen_words.end(), word) != seen_words.end();
			if (already_searched) continue;

			seen_words.push_back(word);

			// The next unused result set has the same index as the number of sets filled so far.
			const size_t slot = filled_sets.size();
			const uint64_t word_hash = Hash::str(word);

			shards[word_hash % config::ft_num_shards]->find(word_hash, result_sets[slot]);

			filled_sets.push_back(result_sets[slot]);
		}

		return filled_sets;
	}

	/*
		Looks up the whole phrase as one key: the words are joined with a single space, hashed and searched for in the
		shard the hash belongs to. The result is written to result_sets[0], which is the only element of the returned
		vector.
	*/
	template<typename data_record>
	vector<full_text::result_set<data_record> *> search_shards_exact(vector<full_text::result_set<data_record> *> &result_sets,
		const vector<FullTextShard<data_record> *> &shards, const vector<string> &words) {

		assert(words.size() <= config::query_max_words);
		assert(words.size() <= result_sets.size());

		const uint64_t n_gram_hash = Hash::str(boost::join(words, " "));
		shards[n_gram_hash % config::ft_num_shards]->find(n_gram_hash, result_sets[0]);

		vector<full_text::result_set<data_record> *> filled_sets;
		filled_sets.push_back(result_sets[0]);
		return filled_sets;
	}

	/*
		Performs the actual search and returns a result set owned by `storage` (never delete it).

		Steps:
		 - reset the metric and split the query into at most config::query_max_words words,
		 - fetch one result set per distinct word and intersect them if there is more than one,
		 - estimate metric.m_total_found,
		 - apply domain link and url link scores,
		 - keep the `limit` best records, ordered by value.

		A query without any searchable words gives an empty result set. The returned result set is overwritten by the
		next search that uses the same storage.
	*/
	template <typename data_record>
	full_text::result_set<data_record> *make_search(storage<data_record> *storage,
			const vector<FullTextShard<data_record> *> &shards, const vector<full_text::link_record> &links,
			const vector<full_text::domain_link_record> &domain_links, const string &query, size_t limit, struct full_text::search_metric &metric) {

		reset_search_metric(metric);

		vector<string> words = Text::get_full_text_words(query, config::query_max_words);
		if (words.size() == 0) {
			// Hand out the emptied intersection buffer of the storage instead of a heap allocated result set, the
			// callers never free what they get back so a new allocation would leak on every empty query.
			full_text::result_set<data_record> *empty_result = storage->intersected_result;
			empty_result->resize(0);
			return empty_result;
		}

		vector<full_text::result_set<data_record> *> result_vector = search_shards<data_record>(storage->result_sets, shards, words);

		full_text::result_set<data_record> *flat_result;
		if (result_vector.size() > 1) {

			// We need to calculate the intersection of the given results.
			flat_result = storage->intersected_result;
			flat_result->resize(0);
			calculate_intersection<data_record>(result_vector, flat_result);

			// Share of the largest result set that survived the intersection. Guard against 0 / 0 when every result
			// set is empty, the resulting NaN must not be converted to size_t.
			const size_t largest = largest_result(result_vector);
			const double result_quote = largest > 0 ? (double)flat_result->size() / largest : 0.0;
			set_total_found<data_record>(result_vector, metric, result_quote);
		} else {
			flat_result = result_vector[0];
			set_total_found<data_record>(result_vector, metric, 1.0);
		}

		// Close file pointers.
		for (full_text::result_set<data_record> *result_set : result_vector) {
			result_set->close_sections();
		}

		metric.m_link_domain_matches = apply_domain_link_scores(domain_links, flat_result);
		metric.m_link_url_matches = apply_link_scores(links, flat_result);

		get_unsorted_results_with_top_scores<data_record>(flat_result, limit);

		return flat_result;
	}

	/*
		Performs an exact phrase search and returns a result set owned by `storage` (never delete it).

		Works like make_search() but looks up the whole phrase as one key through search_shards_exact() and does not
		apply any link scores. A query without any searchable words gives an empty result set. The returned result set
		is overwritten by the next search that uses the same storage.
	*/
	template <typename data_record>
	full_text::result_set<data_record> *make_search_exact(storage<data_record> *storage,
			const vector<FullTextShard<data_record> *> &shards, const string &query, size_t limit, struct full_text::search_metric &metric) {

		reset_search_metric(metric);

		vector<string> words = Text::get_full_text_words(query, config::query_max_words);
		if (words.size() == 0) {
			// Hand out the emptied intersection buffer of the storage instead of a heap allocated result set, the
			// callers never free what they get back so a new allocation would leak on every empty query.
			full_text::result_set<data_record> *empty_result = storage->intersected_result;
			empty_result->resize(0);
			return empty_result;
		}

		vector<full_text::result_set<data_record> *> result_vector = search_shards_exact<data_record>(storage->result_sets, shards, words);

		full_text::result_set<data_record> *flat_result;
		if (result_vector.size() > 1) {

			// We need to calculate the intersection of the given results.
			flat_result = storage->intersected_result;
			flat_result->resize(0);
			calculate_intersection<data_record>(result_vector, flat_result);

			// Guard against 0 / 0 when every result set is empty, the resulting NaN must not be converted to size_t.
			const size_t largest = largest_result(result_vector);
			const double result_quote = largest > 0 ? (double)flat_result->size() / largest : 0.0;
			set_total_found<data_record>(result_vector, metric, result_quote);
		} else {
			flat_result = result_vector[0];
			set_total_found<data_record>(result_vector, metric, 1.0);
		}

		// Close file pointers.
		for (full_text::result_set<data_record> *result_set : result_vector) {
			result_set->close_sections();
		}

		get_unsorted_results_with_top_scores<data_record>(flat_result, limit);

		return flat_result;
	}

	/*
		Runs make_search() on the shards of the index, copies the records out of the result set and returns them
		ordered by descending score.
	*/
	template<typename data_record>
	vector<data_record> search_wrapper(storage<data_record> *storage, const full_text::index<data_record> &index,
		const vector<full_text::link_record> &links, const vector<full_text::domain_link_record> &domain_links, const string &query, size_t limit,
		struct full_text::search_metric &metric) {

		full_text::result_set<data_record> *hits = make_search<data_record>(storage, index.shards(), links, domain_links,
			query, limit, metric);

		// The result set belongs to the storage, copy the records before handing them out.
		span<data_record> *hit_span = hits->span_pointer();
		vector<data_record> ordered(hit_span->begin(), hit_span->end());

		sort_by_score<data_record>(ordered);

		return ordered;
	}

	/*
		Runs make_search_exact() on the shards of the index, copies the records out of the result set and returns them
		ordered by descending score.
	*/
	template<typename data_record>
	vector<data_record> search_wrapper_exact(storage<data_record> *storage, const full_text::index<data_record> &index,
		const string &query, size_t limit, struct full_text::search_metric &metric) {

		full_text::result_set<data_record> *hits = make_search_exact<data_record>(storage, index.shards(), query, limit, metric);

		// The result set belongs to the storage, copy the records before handing them out.
		span<data_record> *hit_span = hits->span_pointer();
		vector<data_record> ordered(hit_span->begin(), hit_span->end());

		sort_by_score<data_record>(ordered);

		return ordered;
	}

	/*
		Raw search without deduplication. Returns the records found by search_wrapper(), cut off at `limit` records.
	*/
	template<typename data_record>
	vector<data_record> search(storage<data_record> *storage, const full_text::index<data_record> &index,
		const vector<full_text::link_record> &links, const vector<full_text::domain_link_record> &domain_links, const string &query, size_t limit,
		struct full_text::search_metric &metric) {

		vector<data_record> hits = search_wrapper(storage, index, links, domain_links, query, limit, metric);

		// Never hand out more than what was asked for.
		const bool too_many = limit < hits.size();
		if (too_many) hits.resize(limit);

		return hits;
	}

	/*
		Exact phrase search. Returns the records found by search_wrapper_exact(), cut off at `limit` records.
	*/
	template<typename data_record>
	vector<data_record> search_exact(storage<data_record> *storage, const full_text::index<data_record> &index,
		const string &query, size_t limit, struct full_text::search_metric &metric) {

		vector<data_record> hits = search_wrapper_exact(storage, index, query, limit, metric);

		// Never hand out more than what was asked for.
		const bool too_many = limit < hits.size();
		if (too_many) hits.resize(limit);

		return hits;
	}

	/*
		Looks up the expanded words of the query, joined with single spaces, as one key in the shard the key belongs
		to and returns a copy of all records found. storage->result_sets[0] is used as scratch space.

		Note that `limit` is not used, all records are returned.
	*/
	template<typename data_record>
	vector<data_record> search_ids(storage<data_record> *storage, const full_text::index<data_record> &index,
		const string &query, size_t limit) {

		const vector<string> words = text::get_expanded_full_text_words(query);
		const uint64_t key = algorithm::hash(boost::algorithm::join(words, " "));

		auto &scratch_set = storage->result_sets[0];
		index.shards()[key % config::ft_num_shards]->find(key, scratch_set);

		// Copy the records out before the sections are closed.
		auto *found = scratch_set->span_pointer();
		vector<data_record> ret(found->begin(), found->end());

		scratch_set->close_sections();

		return ret;
	}

	/*
		Fetches the records for the query from config::data_node over http and stores them in
		storage->result_sets[0], which is also what is returned. The response body is interpreted as a packed array of
		data_record.

		The result set is left empty when the transfer does not report transfer::OK. Only whole records are copied and
		never more than the number of records the result sets are constructed with in create_storage(), since the
		response comes from the network and must not be able to write past the buffer.
	*/
	template<typename data_record>
	full_text::result_set<data_record> *search_remote(const std::string &query, storage<data_record> *storage) {
		storage->result_sets[0]->resize(0);

		string buffer;
		int error;
		transfer::url_to_string(config::data_node + "/?i=" + parser::urlencode(query), buffer, error);
		if (error == transfer::OK) {
			// NOTE(review): assumes result_sets[0] has room for ft_max_results_per_section * ft_max_sections records,
			// which is what create_storage() passes to the result_set constructor - confirm for other storages.
			const size_t max_records = config::ft_max_results_per_section * config::ft_max_sections;

			size_t num_records = buffer.size() / sizeof(data_record);
			if (num_records > max_records) num_records = max_records;

			// Copy whole records only, trailing bytes of an incomplete record are ignored.
			data_record *data_ptr = storage->result_sets[0]->data_pointer();
			memcpy(data_ptr, buffer.c_str(), num_records * sizeof(data_record));
			storage->result_sets[0]->resize(num_records);
		}
		return storage->result_sets[0];
	}

}


================================================
FILE: src/server/search_server.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "search_server.h"

#include <iostream>
#include "http/server.h"
#include "indexer/index_manager.h"
#include "indexer/url_record.h"
#include "hash_table2/hash_table.h"
#include "transfer/transfer.h"
#include "parser/parser.h"
#include "parser/unicode.h"
#include "api/result_with_snippet.h"
#include "api/api_response.h"
#include "full_text/search_metric.h"

namespace server {

	/*
		Starts the http search server.

		Every request is answered with content type application/json. Requests for /favicon.ico get a 404. For all
		other requests the `q` parameter is searched for through the index manager and the result is written as an api
		response, unless deduplication is turned off with d=a in which case the body is left empty. The response code
		is 200 in both cases.
	*/
	void search_server() {

		indexer::index_manager idx_manager;

		cout << "starting server..." << endl;

		::http::server srv([&idx_manager](const http::request &req) {
			http::response res;

			res.content_type("application/json");

			URL url = req.url();

			auto query = url.query();

			size_t limit = 1000;
			if (query.count("limit")) {
				// The value comes straight from the request and std::stoi throws on anything that is not a number or
				// that does not fit an int. Only parse values that are known to be safe (1 to 9 digits) and keep the
				// default for everything else.
				const std::string raw_limit = query["limit"];
				bool is_number = !raw_limit.empty() && raw_limit.size() <= 9;
				for (const char c : raw_limit) {
					if (c < '0' || c > '9') is_number = false;
				}
				if (is_number) limit = std::stoi(raw_limit);
			}

			// The limit is parsed but not used yet.
			(void)limit;

			if (url.path() == "/favicon.ico") {
				res.code(404);
				res.body("404");
				return res;
			}

			stringstream body;

			// implement the same search server logic we have on alexandria.org now.
			LOG_INFO("Serving request: " + url.path());

			// Deduplication is on unless the request says d=a.
			bool deduplicate = true;
			if (query.find("d") != query.end()) {
				if (query["d"] == "a") {
					deduplicate = false;
				}
			}

			if (query.find("q") != query.end() && deduplicate) {

				full_text::search_metric metric;

				profiler::instance profiler;

				auto results = idx_manager.find(query["q"], metric);

				api::api_response api_res(results, metric, profiler.get());
				body << api_res;
			}

			res.code(200);

			res.body(body.str());

			return res;
		});
	}
}


================================================
FILE: src/server/search_server.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

namespace server {
	// Starts the http server that answers search queries with JSON. Defined in search_server.cpp.
	void search_server();
}


================================================
FILE: src/server/url_server.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "url_server.h"

#include <iostream>
#include "http/server.h"
#include "indexer/index_manager.h"
#include "indexer/url_record.h"

namespace server {
	/*
		Starts the http server that returns the top urls for a set of domains.

		A POST request carries a packed array of uint64_t domain hashes as body, the query in the `q` parameter and the
		number of wanted results per domain in the `len` parameter. For every domain the url links and the url index of
		that domain are searched on a thread pool, link scores are added to the url scores and the best records are
		returned in binary form:

			size_t   total number of results for all domains
			for every domain hash, in request order:
				uint64_t domain hash
				size_t   number of records
				for every record: uint64_t value, float score

		A POST request with a missing or non numeric `len` parameter is answered with 400. All other requests are
		answered with 200 and an empty body.
	*/
	void url_server() {

		cout << "starting server..." << endl;

		::http::server srv([](const http::request &req) {
			http::response res;

			URL url = req.url();

			auto query = url.query();

			stringstream body;

			if (req.request_method() == "POST") {
				const string req_body = req.request_body();

				// Trailing bytes that do not make up a whole hash are ignored.
				const size_t num_hashes = req_body.size() / sizeof(uint64_t);
				std::vector<uint64_t> domain_hashes(num_hashes);
				if (num_hashes > 0) {
					memcpy((char *)domain_hashes.data(), req_body.c_str(), num_hashes * sizeof(uint64_t));
				}

				auto tokens = text::get_tokens(query["q"]);

				// The parameter comes straight from the request and std::stoull throws on anything that is not a
				// number or that is out of range. 1 to 19 digits always fit in 64 bits, reject everything else.
				const std::string len_param = query["len"];
				bool len_is_number = !len_param.empty() && len_param.size() <= 19;
				for (const char c : len_param) {
					if (c < '0' || c > '9') len_is_number = false;
				}
				if (!len_is_number) {
					res.code(400);
					res.body("400");
					return res;
				}

				size_t len = std::stoull(len_param);

				std::map<uint64_t, std::vector<indexer::url_record>> results;

				utils::thread_pool pool(32);
				std::mutex result_lock;
				cout << "received " << domain_hashes.size() << " hashes" << endl;
				size_t all_total_num_results = 0;
				for (auto dom_hash : domain_hashes) {
					pool.enqueue([dom_hash, tokens, &result_lock, &results, &all_total_num_results, len]() {
						std::vector<indexer::url_record> res;

						vector<indexer::link_record> links;
						{
							// read links
							const string file = config::data_path() + "/" + to_string(dom_hash % 8) +
								"/full_text/url_links/" + to_string(dom_hash) + ".data";
							indexer::index_reader_file reader(file);

							if (reader.size()) {
								if (reader.size() > 10 * 1024* 1024) {
									// Large index, search it through the file based index.
									indexer::index<indexer::link_record> idx("url_links", dom_hash, 1000);
									links = idx.find_top(tokens, 1000);
								} else {
									// Small index, read the whole file into memory and search it there.
									const size_t size = reader.size();
									std::unique_ptr<char[]> buffer = std::make_unique<char[]>(size);
									reader.seek(0);
									reader.read(buffer.get(), size);
									std::istringstream ram_reader(string(buffer.get(), size));
									indexer::index<indexer::link_record> idx(&ram_reader, 1000);
									links = idx.find_top(tokens, 1000);
								}
							}

							std::sort(links.begin(), links.end(), indexer::link_record::storage_order());

							auto link_formula = [](float score) {
								return expm1(20.0f * score) / 10.0f;
							};

							// Collapse consecutive links that are storage_equal into one record holding the sum of
							// their transformed scores.
							std::vector<indexer::link_record> grouped;
							for (const auto &rec : links) {
								if (grouped.size() && grouped.back().storage_equal(rec)) {
									grouped.back().m_score += link_formula(rec.m_score);
								} else {
									grouped.emplace_back(rec);
									grouped.back().m_score = link_formula(rec.m_score);
								}
							}

							links = grouped;
						}

						const string file = config::data_path() + "/" + to_string(dom_hash % 8) + "/full_text/url/" +
							to_string(dom_hash) + ".data";
						indexer::index_reader_file reader(file);

						// Score modifier for url records: a bonus for short urls plus the score of the links that
						// point at the url. mod_incr only moves forward through links, so this relies on the records
						// being passed in ascending m_value order and the links being ordered by m_target_hash.
						size_t mod_incr = 0;
						auto score_mod = [&mod_incr, &links](const indexer::url_record &record) {
							while (mod_incr < links.size() && links[mod_incr].m_target_hash < record.m_value) {
								mod_incr++;
							}
							float link_score = 0.0f;
							if (mod_incr < links.size() && links[mod_incr].m_target_hash == record.m_value) {
								link_score += links[mod_incr].m_score;
							}
							return record.m_score + ((1000.0f - record.url_length()) / 500.0f) + link_score;
						};

						size_t total_num_results = 0;

						if (reader.size()) {
							if (reader.size() > 10 * 1024* 1024) {
								// Large index, search it through the file based index.
								indexer::index<indexer::url_record> idx("url", dom_hash, 1000);
								res = idx.find_top(total_num_results, tokens, len, score_mod);
							} else {
								// Small index, read the whole file into memory and search it there.
								const size_t size = reader.size();
								std::unique_ptr<char[]> buffer = std::make_unique<char[]>(size);
								reader.seek(0);
								reader.read(buffer.get(), size);
								std::istringstream ram_reader(std::string(buffer.get(), size));
								indexer::index<indexer::url_record> idx(&ram_reader, 1000);
								res = idx.find_top(total_num_results, tokens, len, score_mod);
							}
						}

						// The shared results are only touched while holding the lock.
						std::lock_guard lock(result_lock);
						all_total_num_results += total_num_results;
						results[dom_hash] = res;
					});
				}

				pool.run_all();

				// Output result.
				body.write((char *)&all_total_num_results, sizeof(size_t));
				for (auto domain_hash : domain_hashes) {
					body.write((char *)&domain_hash, sizeof(uint64_t));
					size_t num_records = results[domain_hash].size();
					body.write((char *)&num_records, sizeof(size_t));

					for (const auto &record : results[domain_hash]) {
						body.write((char *)&(record.m_value), sizeof(uint64_t));
						body.write((char *)&(record.m_score), sizeof(float));
					}
				}

				res.content_type("application/octet-stream");
			}

			res.code(200);

			const string res_str = body.str();
			cout << "outputting: " << res_str.size() << endl;
			res.body(res_str);

			return res;
		});
	}
}


================================================
FILE: src/server/url_server.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

namespace server {
	// Declaration only; the implementation is not part of this header.
	void url_server();
}


================================================
FILE: src/server.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <random>

#include <iostream>
#include <signal.h>
#include "fcgio.h"
#include "config.h"
#include "logger/logger.h"
#include "profiler/profiler.h"
#include "indexer/console.h"
#include "json.hpp"
#include "server/search_server.h"
#include "server/url_server.h"

#include <fstream>

using namespace std;

/*
 * Entry point of the server binary. Ignores SIGPIPE, starts the logger thread, reads the configuration from the
 * path in the ALEXANDRIA_CONFIG environment variable (falling back to /etc/alexandria.conf) and then runs
 * server::search_server(). Returns 0 once the logger thread has been joined.
 */
int main(int argc, const char **argv) {

	// The command line arguments are not used.
	(void)argc;
	(void)argv;

	// Set the handler by member name: POSIX does not specify the member order of struct sigaction, so positional
	// aggregate initialization is not portable.
	// NOTE(review): SIGPIPE is presumably ignored so that a client closing its connection cannot kill the process.
	struct sigaction act{};
	act.sa_handler = SIG_IGN;
	sigemptyset(&act.sa_mask);
	sigaction(SIGPIPE, &act, nullptr);

	logger::start_logger_thread();

	// Read the environment once and fall back to the system wide config file.
	const char *config_path = getenv("ALEXANDRIA_CONFIG");
	if (config_path != nullptr) {
		config::read_config(config_path);
	} else {
		config::read_config("/etc/alexandria.conf");
	}

	server::search_server();

	logger::join_logger_thread();

	return 0;
}


================================================
FILE: src/stats/stats.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include "config.h"
#include "text/text.h"
#include "full_text/full_text_index.h"
#include "full_text/full_text_shard.h"

namespace stats {

	/*
	 * Hash function used to turn a word into the key that selects its shard.
	 * It is const so that it gets internal linkage: a non-const, non-inline variable defined in a header produces
	 * one external definition per translation unit that includes the header.
	 */
	const std::hash<std::string> hasher{};

	template<typename data_record>
	std::map<std::string, double> word_stats(const full_text::full_text_index<data_record> &index, const std::string &query, size_t index_size);

	/*
	 * Returns, for every distinct word of query (as split by text::get_full_text_words), the value of
	 * total_num_results() reported by the shard selected with word_hash % config::ft_num_shards.
	 * An empty query gives an empty map.
	 */
	template<typename data_record>
	std::map<std::string, double> get_word_counts(const std::vector<full_text::full_text_shard<data_record> *> &shards, const std::string &query) {

		const std::vector<std::string> words = text::get_full_text_words(query);

		std::map<std::string, double> result;
		for (const std::string &word : words) {

			// One word should only be searched once. Every searched word is a key of result already.
			if (result.count(word) > 0) continue;

			const uint64_t word_hash = hasher(word);
			result[word] = shards[word_hash % config::ft_num_shards]->total_num_results(word_hash);
		}

		return result;
	}

	/*
	 * Returns the word counts of get_word_counts for the shards of index, each divided by index_size.
	 */
	template<typename data_record>
	std::map<std::string, double> word_stats(const full_text::full_text_index<data_record> &index, const std::string &query, size_t index_size) {

		std::map<std::string, double> complete_result = get_word_counts<data_record>(index.shards(), query);

		// Scale in place through the iterator instead of looking every key up again.
		for (auto &iter : complete_result) {
			iter.second /= index_size;
		}

		return complete_result;
	}

}


================================================
FILE: src/text/stopwords.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "stopwords.h"

using namespace std;

/*
 * Returns true when word is an exact member of the English or the Swedish stop word set.
 */
bool stopwords::is_stop_word(const string &word) {
	if (s_english.count(word) > 0) {
		return true;
	}
	return s_swedish.count(word) > 0;
}

/*
 * English stop word set consulted by stopwords::is_stop_word.
 * Besides words it also contains the single digits 0-9.
 */
set<string> stopwords::s_english{
	"the",
	"of",
	"and",
	"in",
	"to",
	"a",
	"is",
	"as",
	"for",
	"was",
	"by",
	"that",
	"with",
	"on",
	"from",
	"are",
	"an",
	"or",
	"it",
	"at",
	"his",
	"be",
	"which",
	"this",
	"he",
	"were",
	"not",
	"also",
	"has",
	"have",
	"its",
	"their",
	"but",
	"first",
	"had",
	"one",
	"other",
	"new",
	"they",
	"such",
	"been",
	"can",
	"after",
	"more",
	"who",
	"two",
	"all",
	"some",
	"most",
	"may",
	"into",
	"when",
	"between",
	"than",
	"there",
	"these",
	"during",
	"only",
	"many",
	"time",
	"would",
	"states",
	"no",
	"over",
	"about",
	"while",
	"use",
	"both",
	"if",
	"where",
	"then",
	"i",
	"through",
	"since",
	"being",
	"made",
	"became",
	"part",
	"her",
	"de",
	"three",
	"any",
	"up",
	"each",
	"them",
	"often",
	"will",
	"him",
	"so",
	"out",
	"same",
	"because",
	"well",
	"several",
	"form",
	"name",
	"could",
	"although",
	"set",
	"different",
	"1",
	"2",
	"3",
	"4",
	"5",
	"6",
	"7",
	"8",
	"9",
	"0"
};

/*
 * Swedish stop word set consulted by stopwords::is_stop_word.
 * Besides words it also contains the single digits 0-9.
 */
set<string> stopwords::s_swedish{
	"och",
	"i",
	"av",
	"som",
	"en",
	"att",
	"till",
	"den",
	"med",
	"på",
	"är",
	"för",
	"det",
	"de",
	"ett",
	"var",
	"från",
	"har",
	"om",
	"vid",
	"inte",
	"även",
	"eller",
	"sig",
	"men",
	"efter",
	"man",
	"kan",
	"sin",
	"där",
	"andra",
	"hade",
	"blev",
	"då",
	"första",
	"finns",
	"mot",
	"sedan",
	"så",
	"genom",
	"över",
	"detta",
	"också",
	"bland",
	"mellan",
	"två",
	"när",
	"fick",
	"samt",
	"skulle",
	"annat",
	"dock",
	"denna",
	"inom",
	"olika",
	"vilket",
	"ut",
	"flera",
	"se",
	"vara",
	"upp",
	"ha",
	"senare",
	"många",
	"kom",
	"än",
	"dessa",
	"alla",
	"samma",
	"del",
	"stora",
	"sitt",
	"sina",
	"mycket",
	"tre",
	"mer",
	"utan",
	"nya",
	"ofta",
	"enligt",
	"blir",
	"några",
	"kunde",
	"hela",
	"gjorde",
	"varit",
	"här",
	"ska",
	"eftersom",
	"få",
	"fanns",
	"bara",
	"något",
	"kommer",
	"både",
	"kallas",
	"vissa",
	"får",
	"cirka",
	"ur",
	"endast",
	"tog",
	"dem",
	"medan",
	"redan",
	"fyra",
	"någon",
	"nu",
	"går",
	"innan",
	"bli",
	"allt",
	"därefter",
	"därför",
	"hur",
	"varje",
	"per",
	"åt",
	"antal",
	"delen",
	"vilken",
	"vad",
	"helt",
	"sätt",
	"vill",
	"åren",
	"gör",
	"kallade",
	"främst",
	"båda",
	"själv",
	"1",
	"2",
	"3",
	"4",
	"5",
	"6",
	"7",
	"8",
	"9",
	"0"
};


================================================
FILE: src/text/stopwords.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include <iostream>
#include <set>

/*
 * Static stop word lookup. The class only has static members; the word sets are defined in stopwords.cpp.
 */
class stopwords {

public:

	// Returns true if word is an exact member of the English or the Swedish stop word set.
	static bool is_stop_word(const std::string &word);

private:

	// English and Swedish stop words.
	static std::set<std::string> s_english;
	static std::set<std::string> s_swedish;

};


================================================
FILE: src/text/text.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "text.h"

namespace text {

	/*
		Tells whether the character that starts at ch and is multibyte_len bytes long is a clean character:
		a single byte a-z or 0-9, or one of the two byte sequences å, ä and ö. Every other length is rejected.
	*/
	bool is_clean_char(const char *ch, size_t multibyte_len) {
		switch (multibyte_len) {
			case 1: {
				const char single = ch[0];
				const bool is_lower_letter = (single >= 'a' && single <= 'z');
				const bool is_digit = (single >= '0' && single <= '9');
				return is_lower_letter || is_digit;
			}
			case 2: {
				if (strncmp(ch, "å", 2) == 0) return true;
				if (strncmp(ch, "ä", 2) == 0) return true;
				return strncmp(ch, "ö", 2) == 0;
			}
			default:
				return false;
		}
	}

	/*
		Returns true when every character of s passes is_clean_char. A character is its first byte plus all
		directly following bytes that IS_MULTIBYTE_CODEPOINT flags. An empty string is clean.
	*/
	bool is_clean_word(const std::string &s) {
		const char *bytes = s.c_str();
		const size_t num_bytes = s.size();

		size_t pos = 0;
		while (pos < num_bytes) {
			// Measure the character that starts at pos.
			size_t char_len = 1;
			while (IS_MULTIBYTE_CODEPOINT(bytes[pos + char_len]) && (pos + char_len < num_bytes)) {
				char_len++;
			}

			if (!is_clean_char(&bytes[pos], char_len)) return false;

			pos += char_len;
		}

		return true;
	}

	/*
		Returns a copy of s that only keeps the characters accepted by is_clean_char. A character is its first
		byte plus all directly following bytes that IS_MULTIBYTE_CODEPOINT flags.
	*/
	std::string clean_word(const std::string &s) {
		const char *bytes = s.c_str();
		const size_t num_bytes = s.size();

		std::string cleaned;
		size_t pos = 0;
		while (pos < num_bytes) {
			// Measure the character that starts at pos.
			size_t char_len = 1;
			while (IS_MULTIBYTE_CODEPOINT(bytes[pos + char_len]) && (pos + char_len < num_bytes)) {
				char_len++;
			}

			const char *character = &bytes[pos];
			if (is_clean_char(character, char_len)) {
				cleaned.append(character, char_len);
			}

			pos += char_len;
		}

		return cleaned;
	}

	/*
		Splits the lower cased str on the characters " \t,|!" and trims whitespace and punctuation from both
		ends of every piece. Keeps the non-empty pieces that pass is_clean_word and are at most CC_MAX_WORD_LEN
		bytes long. A non-zero limit stops the scan as soon as that many words have been collected.
	*/
	std::vector<std::string> get_words(const std::string &str, size_t limit) {

		const std::string separators = " \t,|!";

		std::string lowered = lower_case(str);

		std::vector<std::string> pieces;
		boost::split(pieces, lowered, boost::is_any_of(separators));

		std::vector<std::string> accepted;
		for (std::string &piece : pieces) {
			trim_both_inplace(piece);

			const bool keep = is_clean_word(piece) && piece.size() <= CC_MAX_WORD_LEN && !piece.empty();
			if (keep) {
				accepted.push_back(piece);
			}

			if (limit && accepted.size() == limit) break;
		}

		return accepted;
	}

	/*
		Same as get_words(str, limit) without a limit.
	*/
	std::vector<std::string> get_words(const std::string &str) {
		const size_t no_limit = 0;
		return get_words(str, no_limit);
	}

	/*
		Splits the lower cased str on the characters " \t,|!". Pieces rejected by parser::unicode::is_valid are
		skipped, the others are trimmed of whitespace and punctuation at both ends and kept when they are
		non-empty and at most CC_MAX_WORD_LEN bytes long. A non-zero limit stops the scan as soon as that many
		words have been collected.
	*/
	std::vector<std::string> get_full_text_words(const std::string &str, size_t limit) {

		const std::string separators = " \t,|!";

		std::string lowered = lower_case(str);

		std::vector<std::string> pieces;
		boost::split(pieces, lowered, boost::is_any_of(separators));

		std::vector<std::string> accepted;
		for (std::string &piece : pieces) {
			// Rejected pieces skip the trimming as well as the limit check.
			if (!parser::unicode::is_valid(piece)) continue;

			trim_both_inplace(piece);
			if (!piece.empty() && piece.size() <= CC_MAX_WORD_LEN) {
				accepted.push_back(piece);
			}

			if (limit && accepted.size() == limit) break;
		}

		return accepted;
	}

	/*
		Same as get_full_text_words(str, limit) without a limit.
	*/
	std::vector<std::string> get_full_text_words(const std::string &str) {
		const size_t no_limit = 0;
		return get_full_text_words(str, no_limit);
	}

	/*
		Returns algorithm::hash of every word that get_full_text_words(str, limit) produces, in the same order.
	*/
	std::vector<uint64_t> get_full_text_tokens(const std::string &str, size_t limit) {

		const std::vector<std::string> words = get_full_text_words(str, limit);

		std::vector<uint64_t> tokens;
		tokens.reserve(words.size());
		for (const std::string &word : words) {
			tokens.push_back(algorithm::hash(word));
		}

		return tokens;
	}

	/*
		Same as get_full_text_tokens(str, limit) without a limit.
	*/
	std::vector<uint64_t> get_full_text_tokens(const std::string &str) {
		const size_t no_limit = 0;
		return get_full_text_tokens(str, no_limit);
	}

	/*
		Returns the distinct tokens of get_full_text_tokens(str, limit) in ascending order.
		The limit is forwarded, so a non-zero limit restricts how many words are tokenized before duplicates
		are removed. It used to be ignored. A limit of 0 means no limit.
	*/
	std::vector<uint64_t> get_unique_full_text_tokens(const std::string &str, size_t limit) {

		const std::vector<uint64_t> tokens = get_full_text_tokens(str, limit);

		// The set removes duplicates and orders the tokens.
		const std::set<uint64_t> unique_tokens(tokens.begin(), tokens.end());

		return std::vector<uint64_t>(unique_tokens.begin(), unique_tokens.end());
	}

	/*
		Same as get_unique_full_text_tokens(str, limit) without a limit.
	*/
	std::vector<uint64_t> get_unique_full_text_tokens(const std::string &str) {

		return get_unique_full_text_tokens(str, 0);

	}

	/*
		This should be the fast way of getting tokens out of a string. It reads the whole string once, cuts it
		at the characters " \t,|!" and stores str2token(token) for every token.

		While a token is collected whitespace is dropped and every byte is lower cased. A finished token is only
		stored when it is non-empty and accepted by parser::unicode::is_valid; punctuation is trimmed from both
		ends before it is handed to str2token. A NUL byte in str also ends the current token, because strchr
		matches the terminator of the boundary string.
	*/
	std::vector<uint64_t> get_tokens(const std::string &str, std::function<uint64_t(std::string)> str2token) {
		const char *word_boundary = " \t,|!";
		std::string cur_token;
		std::vector<uint64_t> tokens;

		// Stores the token collected so far (if it qualifies) and starts a new one.
		const auto flush_token = [&cur_token, &tokens, &str2token]() {
			if (cur_token.size() && parser::unicode::is_valid(cur_token)) {
				trim_punct_inplace(cur_token);
				tokens.push_back(str2token(cur_token));
			}
			cur_token.clear();
		};

		for (const char ch : str) {
			if (strchr(word_boundary, ch)) {
				flush_token();
				continue;
			}

			// isspace and tolower require a value representable as unsigned char. A plain char is negative for
			// the bytes of multibyte UTF-8 characters, which would be undefined behaviour.
			const unsigned char byte = static_cast<unsigned char>(ch);
			if (!isspace(byte)) {
				cur_token.push_back(static_cast<char>(tolower(byte)));
			}
		}

		// Remember the last token.
		flush_token();

		return tokens;
	}

	/*
		Same as get_tokens(str, str2token) with algorithm::hash as the token function.
	*/
	std::vector<uint64_t> get_tokens(const std::string &str) {
		return get_tokens(str, algorithm::hash);
	}

	/*
		Cuts str into snippets. The string is split into tokens at the characters " \t,|!"; tokens accepted by
		parser::unicode::is_valid are joined with single spaces. When adding a token would take the current
		snippet above 300 bytes the snippet is trimmed and stored and the token starts the next snippet. The
		last snippet is always stored, so the result has at least one element.
	*/
	std::vector<std::string> get_snippets(const std::string &str) {
		const size_t max_snippet_len = 300;
		const char *separators = " \t,|!";

		std::vector<std::string> snippets;
		std::string snippet;
		std::string token;

		for (const char &ch : str) {
			// Anything that is not a separator extends the current token.
			if (strchr(separators, ch) == nullptr) {
				token.push_back(ch);
				continue;
			}

			if (!token.empty() && parser::unicode::is_valid(token)) {
				// Close the current snippet first if the token does not fit any more.
				if (snippet.size() + token.size() > max_snippet_len) {
					trim_inplace(snippet);
					snippets.push_back(snippet);
					snippet.clear();
				}
				snippet.append(token);
				snippet.push_back(' ');
			}
			token.clear();
		}

		// The token after the last separator is appended without a length check.
		if (!token.empty() && parser::unicode::is_valid(token)) {
			snippet.append(token);
		}

		trim_inplace(snippet);
		snippets.push_back(snippet);

		return snippets;
	}

	/*
		Splits the lower cased str on the characters " \t,|!". Pieces rejected by parser::unicode::is_valid are
		skipped, the others are trimmed of whitespace and punctuation at both ends and kept when they are
		non-empty and at most CC_MAX_WORD_LEN bytes long.

		These functions also expand on blend chars: a kept word that contains '.', '-' or ':' is split on those
		chars and every (trimmed) part is appended after the word itself.

		A non-zero limit caps the size of the result. The limit is checked again after the blend loop, because
		breaking out of that loop alone let the outer loop continue and grow the result past the limit.
	*/
	std::vector<std::string> get_expanded_full_text_words(const std::string &str, size_t limit) {

		const std::string word_boundary = " \t,|!";
		const std::string blend_chars = ".-:";

		std::string str_lc = lower_case(str);

		std::vector<std::string> raw_words, words, blended;
		boost::split(raw_words, str_lc, boost::is_any_of(word_boundary));

		for (std::string &word : raw_words) {
			if (!parser::unicode::is_valid(word)) continue;

			trim_both_inplace(word);
			if (word.size() == 0 || word.size() > CC_MAX_WORD_LEN) continue;

			words.push_back(word);
			if (limit && words.size() >= limit) break;

			boost::split(blended, word, boost::is_any_of(blend_chars));
			if (blended.size() > 1) {
				for (std::string &blended_word : blended) {
					trim_both_inplace(blended_word);
					words.push_back(blended_word);
					if (limit && words.size() >= limit) break;
				}
			}

			// The break above only leaves the blend loop.
			if (limit && words.size() >= limit) break;
		}

		return words;
	}

	/*
		Same as get_expanded_full_text_words(str, limit) without a limit.
	*/
	std::vector<std::string> get_expanded_full_text_words(const std::string &str) {

		return get_expanded_full_text_words(str, 0);
	}

	/*
	 * Returns algorithm::hash of every word that get_expanded_full_text_words(str, limit) produces, in the
	 * same order.
	 * */
	std::vector<uint64_t> get_expanded_full_text_tokens(const std::string &str, size_t limit) {

		const std::vector<std::string> words = get_expanded_full_text_words(str, limit);

		std::vector<uint64_t> tokens;
		tokens.reserve(words.size());
		for (const std::string &word : words) {
			tokens.push_back(algorithm::hash(word));
		}

		return tokens;
	}

	/*
	 * Same as get_expanded_full_text_tokens(str, limit) without a limit.
	 * */
	std::vector<uint64_t> get_expanded_full_text_tokens(const std::string &str) {
		const size_t no_limit = 0;
		return get_expanded_full_text_tokens(str, no_limit);
	}

	/*
		Returns the distinct tokens of get_expanded_full_text_tokens(str, limit) in ascending order.
		The limit is forwarded, so a non-zero limit restricts how many words are tokenized before duplicates
		are removed. It used to be ignored. A limit of 0 means no limit.
	*/
	std::vector<uint64_t> get_unique_expanded_full_text_tokens(const std::string &str, size_t limit) {

		const std::vector<uint64_t> tokens = get_expanded_full_text_tokens(str, limit);

		// The set removes duplicates and orders the tokens.
		const std::set<uint64_t> unique_tokens(tokens.begin(), tokens.end());

		return std::vector<uint64_t>(unique_tokens.begin(), unique_tokens.end());
	}

	/*
		Same as get_unique_expanded_full_text_tokens(str, limit) without a limit.
	*/
	std::vector<uint64_t> get_unique_expanded_full_text_tokens(const std::string &str) {

		return get_unique_expanded_full_text_tokens(str, 0);

	}

	/*
		Splits the lower cased str on the boundary characters and trims whitespace and punctuation from both
		ends of every piece. Keeps the non-empty pieces that pass is_clean_word, are not stop words and are at
		most CC_MAX_WORD_LEN bytes long. A non-zero limit stops the scan as soon as that many words have been
		collected.
	*/
	std::vector<std::string> get_words_without_stopwords(const std::string &str, size_t limit) {

		const std::string separators = " \t,|!,";

		std::string lowered = lower_case(str);

		std::vector<std::string> pieces;
		boost::split(pieces, lowered, boost::is_any_of(separators));

		std::vector<std::string> accepted;
		for (std::string &piece : pieces) {
			trim_both_inplace(piece);

			const bool keep = is_clean_word(piece) && !stopwords::is_stop_word(piece) &&
				piece.size() <= CC_MAX_WORD_LEN && !piece.empty();
			if (keep) {
				accepted.push_back(piece);
			}

			if (limit && accepted.size() == limit) break;
		}

		return accepted;
	}

	/*
		Same as get_words_without_stopwords(str, limit) without a limit.
	*/
	std::vector<std::string> get_words_without_stopwords(const std::string &str) {
		const size_t no_limit = 0;
		return get_words_without_stopwords(str, no_limit);
	}

	/*
		Shared walker of the words_to_ngram_hash overloads. For every start position it calls
		emit(n_gram, num_words_in_n_gram) for the n-grams of 1 up to n_grams words that begin there, shortest
		first. The words of an n-gram are joined with a single space. The n-gram is extended word by word
		instead of being rebuilt from its first word every time.
	*/
	template<typename emit_function>
	static void for_each_ngram(const std::vector<std::string> &words, size_t n_grams, emit_function &&emit) {

		const size_t word_iter_max = words.size();

		for (size_t i = 0; i < word_iter_max; i++) {
			std::string n_gram;
			for (size_t j = 0; j < n_grams && (j + i) < word_iter_max; j++) {
				if (j > 0) n_gram += ' ';
				n_gram += words[i + j];
				emit(n_gram, j + 1);
			}
		}
	}

	/*
		Calls ins(hash) with algorithm::hash of every n-gram of 1 up to n_grams consecutive words.
	*/
	void words_to_ngram_hash(const std::vector<std::string> &words, size_t n_grams, const std::function<void(uint64_t)> &ins) {

		for_each_ngram(words, n_grams, [&ins](const std::string &n_gram, size_t) {
			ins(algorithm::hash(n_gram));
		});
	}

	/*
		Calls ins(hash, n_gram) for every n-gram of 1 up to n_grams consecutive words.
	*/
	void words_to_ngram_hash(const std::vector<std::string> &words, size_t n_grams, const std::function<void(uint64_t, const std::string &)> &ins) {

		for_each_ngram(words, n_grams, [&ins](const std::string &n_gram, size_t) {
			ins(algorithm::hash(n_gram), n_gram);
		});
	}

	/*
		Calls ins(hash, n_gram, num_words) for every n-gram of 1 up to n_grams consecutive words, where
		num_words is the number of words in the n-gram.
	*/
	void words_to_ngram_hash(const std::vector<std::string> &words, size_t n_grams, const std::function<void(uint64_t, const std::string &, size_t)> &ins) {

		for_each_ngram(words, n_grams, [&ins](const std::string &n_gram, size_t num_words) {
			ins(algorithm::hash(n_gram), n_gram, num_words);
		});
	}

	/*
		Counts how many times each word of get_full_text_words(text) occurs.
	*/
	std::map<std::string, size_t> get_word_counts(const std::string &text) {
		std::map<std::string, size_t> occurrences;
		const std::vector<std::string> words = get_full_text_words(text);
		for (size_t idx = 0; idx < words.size(); idx++) {
			occurrences[words[idx]] += 1;
		}

		return occurrences;
	}

	/*
		Returns, for each word of get_full_text_words(text), its number of occurrences divided by the total
		number of words.
	*/
	std::map<std::string, float> get_word_frequency(const std::string &text) {
		const std::vector<std::string> words = get_full_text_words(text);

		// Every word is counted exactly once, so the total is the number of words.
		const size_t total = words.size();

		std::map<std::string, size_t> occurrences;
		for (const std::string &word : words) {
			occurrences[word]++;
		}

		std::map<std::string, float> frequencies;
		for (auto it = occurrences.begin(); it != occurrences.end(); ++it) {
			frequencies[it->first] = (float)it->second / total;
		}

		return frequencies;
	}

}


================================================
FILE: src/text/text.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#define CC_MAX_WORD_LEN 100

#include <vector>
#include <map>
#include <iostream>
#include <algorithm>
#include <boost/algorithm/string.hpp>
#include <sstream>
#include "stopwords.h"
#include "parser/unicode.h"
#include "algorithm/hash.h"

namespace text {

	/*
	 * ispunct that does not treat '+' and '#' as punctuation, since we want to be able to search for terms
	 * such as c++.
	 * The value is converted to unsigned char before it reaches ispunct: a char is negative for the bytes of
	 * multibyte UTF-8 characters and passing a negative value to ispunct is undefined behaviour.
	 */
	inline bool my_ispunct(int ch) {
		if (ch == '+') return false;
		if (ch == '#') return false;
		return ispunct(static_cast<unsigned char>(ch)) != 0;
	}

	/*
	 * trim whitespace from beginning (in place)
	 * The lambda takes unsigned char because isspace must not be called with a negative value, which is what
	 * a plain char is for the bytes of multibyte UTF-8 characters.
	 * */
	inline void ltrim_inplace(std::string &s) {
		s.erase(s.begin(), find_if(s.begin(), s.end(), [](unsigned char ch) {
			return !isspace(ch);
		}));
	}

	/*
	 * trim whitespace from end (in place)
	 * The lambda takes unsigned char because isspace must not be called with a negative value, which is what
	 * a plain char is for the bytes of multibyte UTF-8 characters.
	 * */
	inline void rtrim_inplace(std::string &s) {
		s.erase(find_if(s.rbegin(), s.rend(), [](unsigned char ch) {
			return !isspace(ch);
		}).base(), s.end());
	}

	/*
	 * Strips whitespace from the end and from the beginning of s (in place).
	 * The two steps work on opposite ends of the string, so their order does not matter.
	 * */
	inline void trim_inplace(std::string &s) {
		rtrim_inplace(s);
		ltrim_inplace(s);
	}

	/*
	 * trim whitespace from both beginning and end (return result)
	 * */
	inline std::string trim(const std::string &s) {
		std::string copy = s;
		ltrim_inplace(copy);
		rtrim_inplace(copy);
		return copy;
	}

	/*
	 * trim punctuation (as defined by my_ispunct) from beginning (in place)
	 * The lambda takes unsigned char so that the bytes of multibyte UTF-8 characters are never handed on as
	 * negative values, which the character classification functions do not accept.
	 * */
	inline void ltrim_punct_inplace(std::string &s) {
		s.erase(s.begin(), find_if(s.begin(), s.end(), [](unsigned char ch) {
			return !my_ispunct(ch);
		}));
	}

	/*
	 * trim punctuation (as defined by my_ispunct) from end (in place)
	 * The lambda takes unsigned char so that the bytes of multibyte UTF-8 characters are never handed on as
	 * negative values, which the character classification functions do not accept.
	 * */
	inline void rtrim_punct_inplace(std::string &s) {
		s.erase(find_if(s.rbegin(), s.rend(), [](unsigned char ch) {
			return !my_ispunct(ch);
		}).base(), s.end());
	}

	/*
	 * Strips punctuation from the end and from the beginning of s (in place).
	 * The two steps work on opposite ends of the string, so their order does not matter.
	 * */
	inline void trim_punct_inplace(std::string &s) {
		rtrim_punct_inplace(s);
		ltrim_punct_inplace(s);
	}

	/*
	 * trim punctuation from both beginning and end (return result)
	 * */
	inline std::string trim_punct(const std::string &s) {
		std::string copy = s;
		ltrim_punct_inplace(copy);
		rtrim_punct_inplace(copy);
		return copy;
	}

	/*
	 * trim both whitespace and punctuation (as defined by my_ispunct) from beginning (in place)
	 * The lambda takes unsigned char because isspace must not be called with a negative value, which is what
	 * a plain char is for the bytes of multibyte UTF-8 characters.
	 * */
	inline void ltrim_both_inplace(std::string &s) {
		s.erase(s.begin(), find_if(s.begin(), s.end(), [](unsigned char ch) {
			return !isspace(ch) && !my_ispunct(ch);
		}));
	}

	/*
	 * trim both whitespace and punctuation (as defined by my_ispunct) from end (in place)
	 * The lambda takes unsigned char because isspace must not be called with a negative value, which is what
	 * a plain char is for the bytes of multibyte UTF-8 characters.
	 * */
	inline void rtrim_both_inplace(std::string &s) {
		s.erase(find_if(s.rbegin(), s.rend(), [](unsigned char ch) {
			return !isspace(ch) && !my_ispunct(ch);
		}).base(), s.end());
	}

	/*
	 * Strips whitespace and punctuation from the end and from the beginning of s (in place).
	 * The two steps work on opposite ends of the string, so their order does not matter.
	 * */
	inline void trim_both_inplace(std::string &s) {
		rtrim_both_inplace(s);
		ltrim_both_inplace(s);
	}

	/*
	 * trim both whitespace and punctuation from both beginning and end (return result)
	 * */
	inline std::string trim_both(const std::string &s) {
		std::string copy = s;
		ltrim_both_inplace(copy);
		rtrim_both_inplace(copy);
		return copy;
	}

	/*
	 * Returns a copy of str in which every byte has been passed through tolower.
	 * */
	inline std::string lower_case(const std::string &str) {
		std::string lowered;
		lowered.reserve(str.size());
		for (const unsigned char c : str) {
			lowered.push_back(static_cast<char>(tolower(c)));
		}
		return lowered;
	}

	bool is_clean_char(const char *ch, size_t multibyte_len);
	bool is_clean_word(const std::string &s);
	std::string clean_word(const std::string &s);

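	/*
		Returns the words of str in lower case, with whitespace and punctuation trimmed from both ends. Only
		non-empty words that pass is_clean_word and are at most CC_MAX_WORD_LEN bytes long are returned.
		A non-zero limit stops after that many words.
	*/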
	std::vector<std::string> get_words(const std::string &str, size_t limit);
	std::vector<std::string> get_words(const std::string &str);

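	/*
		Returns the words of str in lower case, with whitespace and punctuation trimmed from both ends. Only
		non-empty words that are accepted by parser::unicode::is_valid and are at most CC_MAX_WORD_LEN bytes
		long are returned. A non-zero limit stops after that many words.
	*/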
	std::vector<std::string> get_full_text_words(const std::string &str, size_t limit);
	std::vector<std::string> get_full_text_words(const std::string &str);

	std::vector<uint64_t> get_full_text_tokens(const std::string &str, size_t limit);
	std::vector<uint64_t> get_full_text_tokens(const std::string &str);

	std::vector<uint64_t> get_unique_full_text_tokens(const std::string &str, size_t limit);
	std::vector<uint64_t> get_unique_full_text_tokens(const std::string &str);

	std::vector<uint64_t> get_tokens(const std::string &str, std::function<uint64_t(std::string)> str2token);
	std::vector<uint64_t> get_tokens(const std::string &str);

	std::vector<std::string> get_snippets(const std::string &str);

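	/*
		Returns the words of str in lower case, with whitespace and punctuation trimmed from both ends. Only
		non-empty words that are accepted by parser::unicode::is_valid and are at most CC_MAX_WORD_LEN bytes
		long are returned.
		These functions also expand on blend chars: a word containing '.', '-' or ':' is followed by the parts
		it splits into on those chars.
	*/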
	std::vector<std::string> get_expanded_full_text_words(const std::string &str, size_t limit);
	std::vector<std::string> get_expanded_full_text_words(const std::string &str);

	std::vector<uint64_t> get_expanded_full_text_tokens(const std::string &str, size_t limit);
	std::vector<uint64_t> get_expanded_full_text_tokens(const std::string &str);

	std::vector<uint64_t> get_unique_expanded_full_text_tokens(const std::string &str, size_t limit);
	std::vector<uint64_t> get_unique_expanded_full_text_tokens(const std::string &str);

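	/*
		Returns the words of str in lower case, with whitespace and punctuation trimmed from both ends. Only
		non-empty words that pass is_clean_word, are not stop words and are at most CC_MAX_WORD_LEN bytes long
		are returned. A non-zero limit stops after that many words.
	*/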
	std::vector<std::string> get_words_without_stopwords(const std::string &str, size_t limit);
	std::vector<std::string> get_words_without_stopwords(const std::string &str);

	void words_to_ngram_hash(const std::vector<std::string> &words, size_t n_grams, const std::function<void(uint64_t)> &ins);
	void words_to_ngram_hash(const std::vector<std::string> &words, size_t n_grams, const std::function<void(uint64_t, const std::string &)> &ins);
	void words_to_ngram_hash(const std::vector<std::string> &words, size_t n_grams, const std::function<void(uint64_t, const std::string &, size_t)> &ins);

	std::map<std::string, size_t> get_word_counts(const std::string &text);
	std::map<std::string, float> get_word_frequency(const std::string &text);

}


================================================
FILE: src/tools/calculate_harmonic.cpp
================================================

#include "calculate_harmonic.h"
#include "splitter.h"

#include "config.h"
#include "url_link/link.h"
#include "URL.h"
#include "common/ThreadPool.h"
#include "algorithm/algorithm.h"
#include "algorithm/hyper_ball.h"
#include <iostream>
#include <vector>
#include <mutex>
#include <boost/iostreams/filtering_stream.hpp>
#include <boost/iostreams/filter/gzip.hpp>
#include <boost/filesystem.hpp>
#include <boost/algorithm/string.hpp>
#include <unordered_map>
#include <unordered_set>
#include <iomanip>

namespace tools {

	std::unordered_map<uint64_t, std::string> run_uniq_host(const std::vector<std::string> files) {

		std::unordered_map<uint64_t, std::string> hosts;

		for (const std::string &warc_path : files) {

			std::ifstream infile(warc_path);
			boost::iostreams::filtering_istream decompress_stream;
			decompress_stream.push(boost::iostreams::gzip_decompressor());
			decompress_stream.push(infile);

			std::string line;
			while (getline(decompress_stream, line)) {
				const URL url(line.substr(0, line.find("\t")));
				uint64_t host_hash = url.host_hash();
				if (hosts.count(host_hash) == 0) {
					hosts[host_hash] = url.host();
				}
			}
		}

		return hosts;
	}

	/*
		Hash functor for a pair of 32 bit ids. The first id ends up in the upper 32 bits of the hash and the second id in the
		lower 32 bits.
	*/
	struct pair_hash {
		inline size_t operator() (const std::pair<uint32_t, uint32_t> &p) const {
			const uint64_t upper = static_cast<uint64_t>(p.first) << 32;
			const uint64_t lower = static_cast<uint64_t>(p.second);
			return upper | lower;
		}
	};

	std::unordered_set<std::pair<uint32_t, uint32_t>, pair_hash> run_uniq_link(const std::vector<std::string> files, const std::unordered_map<uint64_t, uint32_t> &hosts) {

		std::unordered_set<std::pair<uint32_t, uint32_t>, pair_hash> edges;

		for (const std::string &warc_path : files) {

			std::ifstream infile(warc_path);
			boost::iostreams::filtering_istream decompress_stream;
			decompress_stream.push(boost::iostreams::gzip_decompressor());
			decompress_stream.push(infile);

			std::string line;
			while (getline(decompress_stream, line)) {
				const url_link::link link(line);

				const uint64_t source_hash = link.source_url().host_hash();
				const uint64_t target_hash = link.target_url().host_hash();

				const size_t source_count = hosts.count(source_hash);
				const size_t target_count = hosts.count(target_hash);
				if (source_count && target_count) {
					// Link between two hosts in the host map.
					edges.insert(std::make_pair(hosts.at(source_hash), hosts.at(target_hash)));
				}
			}
		}

		return edges;
	}

	/*
		Collects the unique hosts from the target url files on s_num_threads worker threads and writes them to
		<data_path>/hosts.txt, one "id <tab> host hash <tab> host name" row per host. Ids are assigned from 0 in the
		iteration order of the host map.
	*/
	void calculate_harmonic_hosts() {

		auto files = generate_list_with_target_url_files();

		// With fewer files than threads the integer division gives 0. Use at least one file per chunk instead of
		// relying on how vector_chunk treats a chunk size of zero.
		size_t chunk_size = files.size() / s_num_threads;
		if (chunk_size == 0) chunk_size = 1;

		std::vector<std::vector<std::string>> chunks;
		algorithm::vector_chunk<std::string>(files, chunk_size, chunks);

		ThreadPool pool(s_num_threads);
		std::vector<std::future<std::unordered_map<uint64_t, std::string>>> results;

		for (const std::vector<std::string> &chunk : chunks) {

			results.emplace_back(pool.enqueue([chunk] {
				return run_uniq_host(chunk);
			}));
		}

		// Progress is printed with two decimals. The previous formatting of std::cout is restored afterwards so that
		// the setting does not leak into later output.
		const std::ios_base::fmtflags saved_flags = std::cout.flags();
		const std::streamsize saved_precision = std::cout.precision(2);
		std::cout.setf(std::ios_base::fixed, std::ios_base::floatfield);

		std::unordered_map<uint64_t, std::string> hosts;
		size_t num_done = 0;
		for (auto &result : results) {
			const std::unordered_map<uint64_t, std::string> result_map = result.get();
			for (const auto &iter : result_map) {
				hosts[iter.first] = iter.second;
			}
			// Count the result that was just merged so that the last one reports 100%.
			num_done++;
			const double percent = (100.0*(double)num_done/results.size());
			std::cout << "hosts contains " << hosts.size() << " elements " << percent << "% done" << std::endl;
		}

		std::cout.flags(saved_flags);
		std::cout.precision(saved_precision);

		size_t idx = 0;
		std::ofstream outfile(config::data_path() + "/hosts.txt", std::ios::trunc);
		for (const auto &iter : hosts) {
			outfile << idx << '\t' << iter.first << '\t' << iter.second << '\n';
			idx++;
		}
		outfile.close();
	}

	/*
		Reads <data_path>/hosts.txt, where every row is "id <tab> host hash <tab> host name", and returns a map from host
		hash to id. Rows with fewer than two columns (for example blank lines) are skipped.
	*/
	std::unordered_map<uint64_t, uint32_t> read_hosts_file() {

		// Load the hosts
		std::ifstream infile(config::data_path() + "/hosts.txt");

		std::unordered_map<uint64_t, uint32_t> ret;

		std::string line;
		while (getline(infile, line)) {
			std::vector<std::string> parts;
			boost::algorithm::split(parts, line, boost::is_any_of("\t"));

			// Both the id and the hash column are needed.
			if (parts.size() < 2) continue;

			// stoul covers the whole uint32_t range, stoi would throw for ids above INT_MAX.
			const uint32_t id = static_cast<uint32_t>(std::stoul(parts[0]));
			const uint64_t hash = std::stoull(parts[1]);
			ret[hash] = id;
		}

		return ret;
	}

	/*
		Reads <data_path>/hosts.txt and returns the id column (the text before the first tab of every row) in file order.
		Blank lines are skipped.
	*/
	std::vector<uint32_t> read_hosts_file_vec() {

		// Load the hosts
		std::ifstream infile(config::data_path() + "/hosts.txt");

		std::vector<uint32_t> ret;

		std::string line;
		while (getline(infile, line)) {
			// A blank line has no id to parse and would make the conversion throw.
			if (line.empty()) continue;

			// Only the first column is used, so there is no need to split the whole row.
			const std::string id_column = line.substr(0, line.find('\t'));

			// stoul covers the whole uint32_t range, stoi would throw for ids above INT_MAX.
			ret.push_back(static_cast<uint32_t>(std::stoul(id_column)));
		}

		return ret;
	}

	/*
		Reads <data_path>/hosts.txt, where every row is "id <tab> host hash <tab> host name", and returns a map from id to
		host name. Rows with fewer than three columns (for example blank lines) are skipped.
	*/
	std::map<uint32_t, std::string> read_hosts_file_with_names() {

		// Load the hosts
		std::ifstream infile(config::data_path() + "/hosts.txt");

		std::map<uint32_t, std::string> ret;

		std::string line;
		while (getline(infile, line)) {
			std::vector<std::string> parts;
			boost::algorithm::split(parts, line, boost::is_any_of("\t"));

			// The host name is the third column.
			if (parts.size() < 3) continue;

			// stoul covers the whole uint32_t range, stoi would throw for ids above INT_MAX.
			const uint32_t id = static_cast<uint32_t>(std::stoul(parts[0]));
			ret[id] = parts[2];
		}

		return ret;
	}

	/*
		Reads <data_path>/edges.txt, where every row is "from <tab> to", into an array of 'vlen' vectors. The vector at
		index 'to' holds the 'from' ids of all edges that point to it.

		Rows with fewer than two columns, or with an id that is not below 'vlen', are skipped because they would index
		outside of the array.
	*/
	std::unique_ptr<std::vector<uint32_t>[]> read_edge_file(size_t vlen) {

		// Load the edges
		std::ifstream infile(config::data_path() + "/edges.txt");

		auto edge_map = std::make_unique<std::vector<uint32_t>[]>(vlen);

		std::string line;
		while (getline(infile, line)) {
			std::vector<std::string> parts;
			boost::algorithm::split(parts, line, boost::is_any_of("\t"));

			if (parts.size() < 2) continue;

			// The ids written by calculate_harmonic_hosts and calculate_harmonic_links count from 0.
			const unsigned long from = std::stoul(parts[0]);
			const unsigned long to = std::stoul(parts[1]);

			// Never write outside of the 'vlen' vectors allocated above.
			if (from >= vlen || to >= vlen) continue;

			edge_map[to].push_back(static_cast<uint32_t>(from));
		}

		return edge_map;
	}

	void calculate_harmonic_links() {

		std::unordered_map<uint64_t, uint32_t> hosts = read_hosts_file();

		std::cout << "loaded " << hosts.size() << " hosts" << std::endl;

		auto files = generate_list_with_target_link_files();

		std::vector<std::vector<std::string>> chunks;
		algorithm::vector_chunk<std::string>(files, files.size() / (s_num_threads * 500), chunks);

		ThreadPool pool(s_num_threads);
		std::vector<std::future<std::unordered_set<std::pair<uint32_t, uint32_t>, pair_hash>>> results;

		for (const std::vector<std::string> &chunk : chunks) {
			results.emplace_back(pool.enqueue([chunk, &hosts] {
				return run_uniq_link(chunk, hosts);
			}));
		}

		std::unordered_set<std::pair<uint32_t, uint32_t>, pair_hash> edges;
		size_t idx = 0;
		std::cout.precision(2);
		for (auto &result : results) {
			const std::unordered_set<std::pair<uint32_t, uint32_t>, pair_hash> result_set = result.get();
			size_t idasd = 0;
			for (const std::pair<uint32_t, uint32_t> &edge : result_set) {
				edges.insert(edge);
				idasd++;
			}
			const double percent = (100.0*(double)idx/results.size());
			std::cout << "edges contains " << edges.size() << " elements " << percent << "% done" << std::endl;
			idx++;
		}

		std::ofstream outfile(config::data_path() + "/edges.txt", std::ios::trunc);
		for (const std::pair<uint32_t, uint32_t>& edge : edges) {
			outfile << edge.first << '\t' << edge.second << '\n';
		}
		outfile.close();
	}

	/*
		Loads the host ids from hosts.txt and the edges from edges.txt, runs algorithm::hyper_ball on them and writes
		"host name <tab> value" rows to <data_path>/harmonic.txt, where value is the result for the host divided by the
		number of hosts.
	*/
	void calculate_harmonic() {

		std::vector<uint32_t> host_ids = read_hosts_file_vec();
		const size_t num_hosts = host_ids.size();

		auto edge_map = read_edge_file(num_hosts);

		std::cout << "loaded " << num_hosts << " hosts" << std::endl;
		std::cout << "running harmonic centrality algorithm on " << s_num_threads << " threads" << std::endl;

		//vector<double> harmonic = algorithm::harmonic_centrality_threaded(hosts.size(), edge_map, 3, num_threads);

		std::vector<double> centrality = algorithm::hyper_ball(num_hosts, edge_map);

		// The edges are not needed any more, release them before the host names are loaded.
		edge_map.reset();

		std::map<uint32_t, std::string> names_by_id = read_hosts_file_with_names();

		// Save harmonic centrality.
		std::ofstream outfile(config::data_path() + "/harmonic.txt", std::ios::trunc);
		outfile << std::setprecision(15);
		for (size_t pos = 0; pos < num_hosts; pos++) {
			const double normalized = centrality[pos] / num_hosts;
			outfile << names_by_id.at(host_ids[pos]) << '\t' << normalized << '\n';
		}

	}

}


================================================
FILE: src/tools/calculate_harmonic.h
================================================

#pragma once

namespace tools {

	/*
		Collects the unique hosts from the target url files and writes them to hosts.txt in the data path, one
		"id <tab> host hash <tab> host name" row per host.
	*/
	void calculate_harmonic_hosts();

	/*
		Reads hosts.txt, collects the unique links between those hosts from the target link files and writes them to
		edges.txt in the data path, one "source id <tab> target id" row per edge.
	*/
	void calculate_harmonic_links();

	/*
		Reads hosts.txt and edges.txt, runs algorithm::hyper_ball and writes "host name <tab> value" rows to harmonic.txt
		in the data path.
	*/
	void calculate_harmonic();

}


================================================
FILE: src/tools/counter.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "counter.h"

#include <iostream>
#include <future>
#include <vector>
#include <boost/iostreams/filtering_stream.hpp>
#include <boost/iostreams/filter/gzip.hpp>
#include <boost/filesystem.hpp>
#include "config.h"
#include "URL.h"
#include "url_link/link.h"
#include "transfer/transfer.h"
#include "algorithm/hyper_log_log.h"
#include "algorithm/algorithm.h"
#include "file/tsv_file_remote.h"
#include "common/system.h"

namespace tools {

	std::map<std::string, size_t> count_urls_per_domain(const std::vector<std::string> &warc_paths) {

		const std::set<std::string> domains = {
			"theinstantpottable.com",
			"thehighlineboutique.com",
			"harveyspet.com",
			"finertech.com",
			"canadiantiresucks.net",
			"thecounter.org",
			"learningworksforkids.com",
			"doodlecraftblog.com",
			"heroes.thelazy.net",
			"stedmansonline.com",
			"restaurantbusinessonline.com",
			"gotohomerepair.com",
			"aboutbail.com",
			"spacefuture.com",
			"personaltelco.net",
			"helis.com"
		};
		std::vector<std::string> saved_rows;

		std::map<std::string, size_t> counts;

		size_t idx = 0;
		for (const std::string &warc_path : warc_paths) {
			std::ifstream infile(warc_path);
			boost::iostreams::filtering_istream decompress_stream;
			decompress_stream.push(boost::iostreams::gzip_decompressor());
			decompress_stream.push(infile);

			std::string line;
			while (getline(decompress_stream, line)) {
				const URL url(line.substr(0, line.find("\t")));
				if (domains.find(url.host()) != domains.end()) {
					saved_rows.push_back(line);
				}
				counts[url.host()]++;
			}

			if (idx % 100 == 0) {
				std::cout << warc_path << " done " << idx << "/" << warc_paths.size() << std::endl;
			} 

			idx++;
		}

		// Save rows.
		if (saved_rows.size() > 0) {
			boost::filesystem::create_directories(config::data_path() + "/crawl-data/ALEXANDRIA-TEST-SIZES/files/");
			std::ofstream outfile(config::data_path() + "/crawl-data/ALEXANDRIA-TEST-SIZES/files/" + common::uuid() + ".gz");
			boost::iostreams::filtering_ostream compress_stream;
			compress_stream.push(boost::iostreams::gzip_compressor());
			compress_stream.push(outfile);
			for (const std::string& row : saved_rows) {
				compress_stream << row << "\n";
			}
		}

		return counts;
	}

	/*
		Counts the number of urls per host in the given batch and prints one "host <tab> count" row per host to stdout.

		The files are listed in crawl-data/<batch>/warc.paths.gz under the data path, with ".warc.gz" replaced by ".gz",
		and are split over at most 12 asynchronous tasks.
	*/
	void run_counter_per_domain(const std::string &batch) {

		const size_t num_threads = 12;

		std::vector<std::string> files;

		const std::string file_name = config::data_path() + "/crawl-data/" + batch + "/warc.paths.gz";

		std::ifstream infile(file_name);

		boost::iostreams::filtering_istream decompress_stream;
		decompress_stream.push(boost::iostreams::gzip_decompressor());
		decompress_stream.push(infile);

		std::string line;
		while (getline(decompress_stream, line)) {
			std::string warc_path = config::data_path() + "/" + line;
			const size_t pos = warc_path.find(".warc.gz");
			if (pos != std::string::npos) {
				warc_path.replace(pos, 8, ".gz");
			}

			files.push_back(warc_path);
		}

		// Integer ceiling division, gives at most num_threads chunks.
		const size_t files_per_thread = (files.size() + num_threads - 1) / num_threads;

		std::vector<std::vector<std::string>> thread_input;
		algorithm::vector_chunk(files, files_per_thread, thread_input);

		/*
		Run url counters
		*/
		std::vector<std::future<std::map<std::string, size_t>>> futures;
		for (size_t i = 0; i < num_threads && i < thread_input.size(); i++) {
			futures.emplace_back(std::async(std::launch::async, count_urls_per_domain, thread_input[i]));
		}

		// Sum the counts of all tasks.
		std::map<std::string, size_t> all_counts;
		for (auto &future : futures) {
			std::map<std::string, size_t> result = future.get();
			for (const auto &iter : result) {
				all_counts[iter.first] += iter.second;
			}
		}

		for (const auto &iter : all_counts) {
			std::cout << iter.first << "\t" << iter.second << std::endl;
		}
	}

	/*
		Inserts the hash of every url in the gzip compressed files in 'warc_paths' into a new hyper_log_log counter. The url
		of a row is the text before the first tab.

		The counter is allocated with new, the caller must delete it.
	*/
	algorithm::hyper_log_log *count_urls(const std::vector<std::string> &warc_paths) {

		algorithm::hyper_log_log *url_counter = new algorithm::hyper_log_log();

		for (size_t file_num = 0; file_num < warc_paths.size(); file_num++) {
			const std::string &warc_path = warc_paths[file_num];

			std::ifstream compressed(warc_path);
			boost::iostreams::filtering_istream reader;
			reader.push(boost::iostreams::gzip_decompressor());
			reader.push(compressed);

			for (std::string row; getline(reader, row); ) {
				const URL url(row.substr(0, row.find('\t')));
				url_counter->insert(url.hash());
			}

			// Report progress for every 100th file.
			if (file_num % 100 == 0) {
				std::cout << warc_path << " done " << file_num << "/" << warc_paths.size() << std::endl;
			}
		}

		return url_counter;
	}

	/*
		Inserts the hash of the target url of every link in the gzip compressed files in 'warc_paths' into a new
		hyper_log_log counter. Every row is parsed with url_link::link.

		The counter is allocated with new, the caller must delete it.
	*/
	algorithm::hyper_log_log *count_links(const std::vector<std::string> &warc_paths) {

		algorithm::hyper_log_log *link_counter = new algorithm::hyper_log_log();

		for (size_t file_num = 0; file_num < warc_paths.size(); file_num++) {
			const std::string &warc_path = warc_paths[file_num];

			std::ifstream compressed(warc_path);
			boost::iostreams::filtering_istream reader;
			reader.push(boost::iostreams::gzip_decompressor());
			reader.push(compressed);

			for (std::string row; getline(reader, row); ) {
				const url_link::link parsed(row);
				link_counter->insert(parsed.target_url().hash());
			}

			// Report progress for every 100th file.
			if (file_num % 100 == 0) {
				std::cout << warc_path << " done " << file_num << "/" << warc_paths.size() << std::endl;
			}
		}

		return link_counter;
	}

	void run_counter() {

		const size_t num_threads = 12;

		std::vector<std::string> files;
		std::vector<std::string> link_files;

		for (const std::string &batch : config::batches) {

			const std::string file_name = config::data_path() + "/crawl-data/" + batch + "/warc.paths.gz";

			std::ifstream infile(file_name);

			boost::iostreams::filtering_istream decompress_stream;
			decompress_stream.push(boost::iostreams::gzip_decompressor());
			decompress_stream.push(infile);

			std::string line;
			while (getline(decompress_stream, line)) {
				std::string warc_path = config::data_path() + "/" + line;
				const size_t pos = warc_path.find(".warc.gz");
				if (pos != std::string::npos) {
					warc_path.replace(pos, 8, ".gz");
				}

				files.push_back(warc_path);
			}
		}

		for (const std::string &batch : config::link_batches) {

			const std::string file_name = config::data_path() + "/crawl-data/" + batch + "/warc.paths.gz";

			std::ifstream infile(file_name);

			boost::iostreams::filtering_istream decompress_stream;
			decompress_stream.push(boost::iostreams::gzip_decompressor());
			decompress_stream.push(infile);

			std::string line;
			while (getline(decompress_stream, line)) {
				std::string warc_path = config::data_path() + "/" + line;
				const size_t pos = warc_path.find(".warc.gz");

				if (pos != std::string::npos) {
					warc_path.replace(pos, 8, ".links.gz");
				}

				link_files.push_back(warc_path);
			}
		}

		std::vector<std::vector<std::string>> thread_input;
		algorithm::vector_chunk(files, ceil((double)files.size() / num_threads), thread_input);

		std::vector<std::vector<std::string>> link_thread_input;
		algorithm::vector_chunk(link_files, ceil((double)link_files.size() / num_threads), link_thread_input);

		std::mutex write_file_mutex;

		/*
		Run url counters
		*/
		std::vector<std::future<algorithm::hyper_log_log *>> futures;
		for (size_t i = 0; i < num_threads && i < thread_input.size(); i++) {
			futures.emplace_back(std::async(std::launch::async, count_urls, thread_input[i]));
		}

		algorithm::hyper_log_log url_counter;
		for (auto &future : futures) {
			algorithm::hyper_log_log *result = future.get();
			url_counter += *(result);
			delete result;
		}

		futures.clear();

		/*
		Run link counters
		*/
		for (size_t i = 0; i < num_threads && i < link_thread_input.size(); i++) {
			futures.emplace_back(std::async(std::launch::async, count_links, link_thread_input[i]));
		}

		algorithm::hyper_log_log link_counter;
		for (auto &future : futures) {
			algorithm::hyper_log_log *result = future.get();
			link_counter += *(result);
			delete result;
		}

		std::cout << "Uniq urls: " << url_counter.count() << std::endl;
		std::cout << "Uniq links: " << link_counter.count() << std::endl;
	}

	/*
		Downloads at most 'limit' link files of 'batch', starting at index 'offset' in the remote
		crawl-data/<batch>/warc.paths.gz list. ".warc.gz" in the listed paths is replaced by ".links.gz".

		Returns the result of transfer::download_gz_files_to_disk for the selected files.
	*/
	std::vector<std::string> download_link_batch(const std::string &batch, size_t limit, size_t offset) {

		file::tsv_file_remote warc_paths_file(std::string("crawl-data/") + batch + "/warc.paths.gz");
		std::vector<std::string> warc_paths;
		warc_paths_file.read_column_into(0, warc_paths);

		// Clamp the range to the list. This avoids the overflow of offset + limit when limit is very large.
		const size_t total = warc_paths.size();
		const size_t first = offset < total ? offset : total;
		const size_t available = total - first;
		const size_t last = first + (limit < available ? limit : available);

		std::vector<std::string> files_to_download;
		files_to_download.reserve(last - first);
		for (size_t i = first; i < last; i++) {
			std::string warc_path = warc_paths[i];
			const size_t pos = warc_path.find(".warc.gz");
			if (pos != std::string::npos) {
				warc_path.replace(pos, 8, ".links.gz");
			}
			files_to_download.push_back(warc_path);
		}

		return transfer::download_gz_files_to_disk(files_to_download);
	}

}


================================================
FILE: src/tools/counter.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include <iostream>

namespace tools {

	/*
		Counts the urls per host in the files listed in crawl-data/<batch>/warc.paths.gz under the data path and prints one
		"host <tab> count" row per host to stdout.
	*/
	void run_counter_per_domain(const std::string &batch);

	/*
		Prints the number of unique urls in config::batches and the number of unique link target urls in
		config::link_batches, as counted by algorithm::hyper_log_log.
	*/
	void run_counter();

	// NOTE(review): counter.cpp does not define count_all_links, confirm that it is defined elsewhere or remove it.
	void count_all_links();

}


================================================
FILE: src/tools/find_links.cpp
================================================

#include "find_links.h"
#include "file/gz_tsv_file.h"
#include "URL.h"
#include "algorithm/algorithm.h"
#include <boost/algorithm/string.hpp>
#include <iostream>
#include <vector>
#include <set>
#include <boost/iostreams/filtering_stream.hpp>
#include <boost/iostreams/filter/gzip.hpp>
#include <boost/algorithm/string.hpp>
#include <math.h>
#include "utils/thread_pool.hpp"
#include "algorithm/hash.h"
#include "common/system.h"
#include "config.h"

namespace tools {

	/*
		Copies every link row from 'files' whose third tab separated column hashes (algorithm::hash) to a value in
		'host_hashes' into gzip files in crawl-data/SMALL-LINK-MIX/files/ under the data path. A new output file is started
		for every 'links_per_file' links written.

		'files' are paths relative to the data path. Rows with fewer than three columns are skipped.
	*/
	void find_links_for_hosts_chunk(const std::set<size_t> &host_hashes, const std::vector<std::string> &files) {

		size_t links_written = 0;
		const size_t links_per_file = 1000000;

		// Builds a unique output file name that contains the range of link numbers the file is meant to hold.
		const auto output_file_name = [&]() {
			return config::data_path() + "/crawl-data/SMALL-LINK-MIX/files/" + common::uuid() + "_" + std::to_string(links_written) + "-" +
				std::to_string(links_written + links_per_file) + ".gz";
		};

		std::ofstream outfile;

		outfile.open(output_file_name(), std::ios::binary);

		boost::iostreams::filtering_ostream compress_stream;
		compress_stream.push(boost::iostreams::gzip_compressor());
		compress_stream.push(outfile);

		for (const std::string &file : files) {
			std::ifstream infile(config::data_path() + "/" + file);

			boost::iostreams::filtering_istream decompress_stream;
			decompress_stream.push(boost::iostreams::gzip_decompressor());
			decompress_stream.push(infile);

			std::string line;
			while (getline(decompress_stream, line)) {
				std::vector<std::string> col_values;
				boost::algorithm::split(col_values, line, boost::is_any_of("\t"));

				// The third column is read below, skip rows that do not have it.
				if (col_values.size() < 3) continue;

				const size_t host_hash = algorithm::hash(col_values[2]);

				if (host_hashes.find(host_hash) != host_hashes.end()) {

					// Write link to current file.

					compress_stream << line << "\n";
					links_written++;
					if ((links_written % links_per_file) == 0) {
						// The current file is full, finish it and continue in a new file.
						std::cout << "writing file" << std::endl;
						compress_stream.strict_sync();
						compress_stream.pop();
						outfile.close();
						outfile.open(output_file_name(), std::ios::binary);
						compress_stream.push(outfile);
					}
				}
			}
		}
	}

	void find_links_for_hosts(const std::set<size_t> &host_hashes) {
		const std::string batch = "LINK-MIX";
		const size_t num_threads = 12;
		size_t limit = 4000;

		file::gz_tsv_file batch_file(config::data_path() + "/crawl-data/" + batch + "/warc.paths.gz");

		std::vector<std::string> rows;
		batch_file.read_column_into(0, rows);

		if (rows.size() > limit) rows.resize(limit);

		std::vector<std::vector<std::string>> chunks;
		algorithm::vector_chunk<std::string>(rows, ceil(rows.size() / num_threads) + 1, chunks);

		utils::thread_pool threads(num_threads);

		for (auto chunk : chunks) {
			threads.enqueue([&host_hashes, chunk]() {
				find_links_for_hosts_chunk(host_hashes, chunk);
			});
		}

		threads.run_all();
	}

	void find_links() {
		const auto batch = "SMALL-MIX";
		size_t limit = 20;

		file::gz_tsv_file batch_file(config::data_path() + "/crawl-data/"+batch+"/warc.paths.gz");

		std::vector<std::string> rows;
		batch_file.read_column_into(0, rows);

		if (rows.size() > limit) rows.resize(limit);

		// Load all the host hashes into a set
		std::set<size_t> host_hashes;

		for (auto row : rows) {
			std::ifstream infile(config::data_path() + "/" + row);

			boost::iostreams::filtering_istream decompress_stream;
			decompress_stream.push(boost::iostreams::gzip_decompressor());
			decompress_stream.push(infile);

			std::string line;
			while (getline(decompress_stream, line)) {
				std::vector<std::string> col_values;
				boost::algorithm::split(col_values, line, boost::is_any_of("\t"));

				URL url(col_values[0]);

				host_hashes.insert(url.host_hash());
			}
		}

		std::cout << "found " << host_hashes.size() << " hosts" << std::endl;

		find_links_for_hosts(host_hashes);
	}


}


================================================
FILE: src/tools/find_links.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include <iostream>

namespace tools {

	/*
		Collects the host hashes of the urls in the first files of the SMALL-MIX batch and copies the rows of the LINK-MIX
		batch whose third tab separated column hashes to one of them into gzip files in crawl-data/SMALL-LINK-MIX/files/
		under the data path.
	*/
	void find_links();

}


================================================
FILE: src/tools/generate_url_lists.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <vector>
#include <iostream>
#include <fstream>
#include "generate_url_lists.h"

#include <boost/filesystem.hpp>
#include <boost/iostreams/filtering_stream.hpp>
#include <boost/iostreams/filter/gzip.hpp>
#include <boost/algorithm/string.hpp>

using namespace std;
using namespace boost::filesystem;

namespace tools {

	/*
		Reads the gzip compressed tab separated file 'file_path' and returns the first column of every row whose second
		column, parsed as an unsigned integer, is greater than 1.

		Returns an empty vector if the file could not be opened. Rows without a second column are skipped.
	*/
	vector<string> read_urls_with_many_links(const std::string &file_path) {

		std::ifstream infile(file_path);
		if (!infile.is_open()) return {};

		vector<string> ret_urls;

		boost::iostreams::filtering_istream decompress_stream;
		decompress_stream.push(boost::iostreams::gzip_decompressor());
		decompress_stream.push(infile);

		string line;
		while (getline(decompress_stream, line)) {
			vector<string> cols;
			boost::algorithm::split(cols, line, boost::is_any_of("\t"));

			// The count is read from the second column, skip rows that do not have it.
			if (cols.size() < 2) continue;

			if (stoull(cols[1]) > 1) {
				ret_urls.push_back(cols[0]);
			}
		}

		return ret_urls;
	}

	/*
		Collects the urls returned by read_urls_with_many_links for the files top_1.gz to top_10.gz in the directory 'path'.
		Missing files are skipped. Reading stops at the first existing file that gives no urls.
	*/
	vector<string> read_urls(const std::string &path) {
		// Only read the first 10 files.
		const size_t max_files = 10;

		vector<string> collected;
		for (size_t file_num = 1; file_num <= max_files; file_num++) {
			const string file_path = path + "/top_" + to_string(file_num) + ".gz";
			if (!is_regular_file(file_path)) continue;

			const vector<string> file_urls = read_urls_with_many_links(file_path);
			if (file_urls.empty()) break;

			collected.insert(collected.end(), file_urls.begin(), file_urls.end());
		}

		return collected;
	}

	void generate_url_lists(const std::string &batch_path) {
		path pth(batch_path);
		directory_iterator end_iter;

		vector<string> urls;

		for (directory_iterator iter(pth); iter != end_iter; iter++) {
			if (is_directory(iter->path())) {
				string current_file = iter->path().string();
				vector<string> new_urls = read_urls(current_file);
				urls.insert(urls.end(), new_urls.begin(), new_urls.end());
			}
		}

		for (const string &url : urls) {
			cout << url << endl;
		}

	}

}


================================================
FILE: src/tools/generate_url_lists.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include <string>

#include "config.h"

namespace tools {

	/*
		Reads top_1.gz to top_10.gz in every directory directly inside 'batch_path' and prints the first column of every
		row whose second column is greater than 1 to stdout, one url per line.
	*/
	void generate_url_lists(const std::string &batch_path);

}


================================================
FILE: src/tools/splitter.cpp
================================================

#include "splitter.h"
#include "config.h"
#include "roaring/roaring64map.hh"
#include "algorithm/bloom_filter.h"
#include <iostream>
#include <vector>
#include <unordered_set>
#include <fstream>
#include <cmath>
#include <thread>
#include <future>
#include <boost/iostreams/filtering_stream.hpp>
#include <boost/iostreams/filter/gzip.hpp>
#include <boost/filesystem.hpp>
#include "url_link/link.h"
#include "algorithm/algorithm.h"
#include "URL.h"
#include "common/system.h"

namespace tools {

	/*
	 * Returns the names of the url target batches, one per node in the cluster.
	 * Each name has the form "NODE-[node_id][s_suffix]".
	 * */
	std::vector<std::string> target_url_batches() {
		std::vector<std::string> batch_names;

		size_t node_id = 0;
		while (node_id < config::nodes_in_cluster) {
			std::string name = "NODE-";
			name += std::to_string(node_id);
			name += s_suffix;
			batch_names.push_back(name);
			node_id++;
		}

		return batch_names;
	}

	/*
	 * Returns the names of the link target batches, one per node in the cluster.
	 * Each name has the form "LINK-[node_id][s_suffix]".
	 * */
	std::vector<std::string> target_link_batches() {
		std::vector<std::string> batch_names;

		size_t node_id = 0;
		while (node_id < config::nodes_in_cluster) {
			std::string name = "LINK-";
			name += std::to_string(node_id);
			name += s_suffix;
			batch_names.push_back(name);
			node_id++;
		}

		return batch_names;
	}

	/*
	 * Builds a list of data file paths for the given batches.
	 *
	 * For every batch the file [data_path]/crawl-data/[batch]/warc.paths[warc_paths_suffix] is read line by line,
	 * through a gzip decompressor when warc_paths_suffix is ".gz" and as plain text otherwise. Every line is
	 * prefixed with [data_path]/ and the first occurrence of ".warc.gz" in the resulting path (if there is one)
	 * is replaced with suffix.
	 * */
	std::vector<std::string> generate_list_with_files(const std::vector<std::string> &batches, const std::string &suffix = ".gz", const std::string &warc_paths_suffix = ".gz") {

		const std::string warc_extension = ".warc.gz";

		std::vector<std::string> file_names;

		// Line handling shared by the compressed and the plain branch below.
		const auto collect_paths = [&](std::istream &stream) {
			for (std::string line; getline(stream, line);) {
				std::string warc_path = config::data_path() + "/" + line;

				const size_t pos = warc_path.find(warc_extension);
				if (pos != std::string::npos) {
					warc_path.replace(pos, warc_extension.size(), suffix);
				}

				file_names.push_back(warc_path);
			}
		};

		for (const std::string &batch : batches) {

			const std::string file_name = config::data_path() + "/crawl-data/" + batch + "/warc.paths" + warc_paths_suffix;

			std::ifstream infile(file_name);

			if (warc_paths_suffix != ".gz") {
				collect_paths(infile);
				continue;
			}

			boost::iostreams::filtering_istream decompress_stream;
			decompress_stream.push(boost::iostreams::gzip_decompressor());
			decompress_stream.push(infile);

			collect_paths(decompress_stream);
		}

		return file_names;
	}

	/*
	 * Lists the url files of all batches in config::batches. The ".warc.gz" ending of each listed path is
	 * replaced with ".gz".
	 * */
	std::vector<std::string> generate_list_with_url_files() {
		const std::string url_file_suffix = ".gz";
		return generate_list_with_files(config::batches, url_file_suffix);
	}

	/*
	 * Lists the link files of all batches in config::link_batches. The ".warc.gz" ending of each listed path is
	 * replaced with ".links.gz".
	 * */
	std::vector<std::string> generate_list_with_link_files() {
		const std::string link_file_suffix = ".links.gz";
		return generate_list_with_files(config::link_batches, link_file_suffix);
	}

	/*
	 * Lists the direct link files of all batches in config::link_batches. The ".warc.gz" ending of each listed
	 * path is replaced with ".direct.links.gz".
	 * */
	std::vector<std::string> generate_list_with_direct_link_files() {
		const std::string direct_link_file_suffix = ".direct.links.gz";
		return generate_list_with_files(config::link_batches, direct_link_file_suffix);
	}

	/*
	 * Lists the files of the url target batches (see target_url_batches). The uncompressed warc.paths file of
	 * each batch is read and any ".warc.gz" found in a listed path is replaced with an empty string.
	 * */
	std::vector<std::string> generate_list_with_target_url_files() {
		const std::vector<std::string> batches = target_url_batches();
		const std::string empty_suffix;
		return generate_list_with_files(batches, empty_suffix, empty_suffix);
	}

	/*
	 * Lists the files of the link target batches (see target_link_batches). The uncompressed warc.paths file of
	 * each batch is read and any ".warc.gz" found in a listed path is replaced with an empty string.
	 * */
	std::vector<std::string> generate_list_with_target_link_files() {
		const std::vector<std::string> batches = target_link_batches();
		const std::string empty_suffix;
		return generate_list_with_files(batches, empty_suffix, empty_suffix);
	}

	// Writes lines, one per row, to a new gzip compressed file and empties the vector afterwards.
	// The file is created at [data_path]/crawl-data/NODE-[node_id][s_suffix]/files/[uuid]-[file_index].gz
	// and the returned path is that location relative to [data_path].
	std::string write_cache(size_t file_index, std::vector<std::string> &lines, size_t node_id) {

		auto unique_id = common::uuid();

		const std::string batch_dir = "crawl-data/NODE-" + std::to_string(node_id) + s_suffix;
		const std::string filename = batch_dir + "/files/" + unique_id + "-" + std::to_string(file_index) + ".gz";

		const std::string absolute_path = config::data_path() + "/" + filename;
		std::ofstream outfile(absolute_path, std::ios::trunc | std::ios::binary);

		// Declared after outfile so that the compressor is flushed and destroyed before the file is closed.
		boost::iostreams::filtering_ostream compress_stream;
		compress_stream.push(boost::iostreams::gzip_compressor());
		compress_stream.push(outfile);

		for (auto it = lines.cbegin(); it != lines.cend(); ++it) {
			compress_stream << *it << "\n";
		}

		lines.clear();

		return filename;
	}

	// Writes lines, one per row, to a new gzip compressed file and empties the vector afterwards.
	// The file is created at [data_path]/crawl-data/LINK-[node_id][s_suffix]/files/[uuid]-[file_index].gz
	// and the returned path is that location relative to [data_path].
	std::string write_link_cache(size_t file_index, std::vector<std::string> &lines, size_t node_id) {

		auto unique_id = common::uuid();

		const std::string batch_dir = "crawl-data/LINK-" + std::to_string(node_id) + s_suffix;
		const std::string filename = batch_dir + "/files/" + unique_id + "-" + std::to_string(file_index) + ".gz";

		const std::string absolute_path = config::data_path() + "/" + filename;
		std::ofstream outfile(absolute_path, std::ios::trunc | std::ios::binary);

		// Declared after outfile so that the compressor is flushed and destroyed before the file is closed.
		boost::iostreams::filtering_ostream compress_stream;
		compress_stream.push(boost::iostreams::gzip_compressor());
		compress_stream.push(outfile);

		for (auto it = lines.cbegin(); it != lines.cend(); ++it) {
			compress_stream << *it << "\n";
		}

		lines.clear();

		return filename;
	}

	/*
	 * Distributes the lines of the given url files over the nodes of the cluster.
	 *
	 * Every file in warc_paths is read through a gzip decompressor. The part of each line before the first tab
	 * is parsed as an URL and the whole line is added to the cache of the node given by url.index_on_node().
	 * After each input file, every cache holding more than max_cache_size lines is written with write_cache.
	 * When all files are read each cache is written once more (also if it is empty) and the names of the
	 * written files are appended to [data_path]/crawl-data/NODE-[node_id][s_suffix]/warc.paths.
	 *
	 * write_file_mutex is held while the warc.paths files are appended to.
	 * */
	void splitter(const std::vector<std::string> &warc_paths, std::mutex &write_file_mutex) {

		const size_t max_cache_size = 10000;
		size_t file_index = 1;

		using vec2d_str = std::vector<std::vector<std::string>>;

		vec2d_str file_names(config::nodes_in_cluster);
		vec2d_str cache(config::nodes_in_cluster);
		for (const std::string &warc_path : warc_paths) {
			std::ifstream infile(warc_path);
			boost::iostreams::filtering_istream decompress_stream;
			decompress_stream.push(boost::iostreams::gzip_decompressor());
			decompress_stream.push(infile);

			std::string line;
			while (getline(decompress_stream, line)) {
				const URL url(line.substr(0, line.find("\t")));
				const size_t node_id = url.index_on_node();
				cache[node_id].push_back(line);
			}

			// The caches are only checked between input files, so they can grow past the limit while reading one.
			for (size_t node_id = 0; node_id < config::nodes_in_cluster; node_id++) {
				if (cache[node_id].size() > max_cache_size) {
					file_names[node_id].push_back(write_cache(file_index++, cache[node_id], node_id));
				}
			}
		}
		for (size_t node_id = 0; node_id < config::nodes_in_cluster; node_id++) {
			file_names[node_id].push_back(write_cache(file_index++, cache[node_id], node_id));
		}

		// A guard instead of lock()/unlock() makes sure the mutex is released also when an exception is thrown.
		std::lock_guard<std::mutex> guard(write_file_mutex);
		for (size_t node_id = 0; node_id < config::nodes_in_cluster; node_id++) {
			const std::string filename = config::data_path() + "/crawl-data/NODE-" + std::to_string(node_id) + s_suffix + "/warc.paths";
			std::ofstream outfile(filename, std::ios::app);
			for (const std::string &file : file_names[node_id]) {
				outfile << file << "\n";
			}
		}
	}

	/*
	 * Distributes the lines of the given link files over the nodes of the cluster.
	 *
	 * Every file in warc_paths is read through a gzip decompressor. Each line is parsed as an url_link::link and
	 * added to the cache of the node given by link.index_on_node(). After each input file, every cache holding
	 * more than max_cache_size lines is written with write_link_cache. When all files are read each cache is
	 * written once more (also if it is empty) and the names of the written files are appended to
	 * [data_path]/crawl-data/LINK-[node_id][s_suffix]/warc.paths.
	 *
	 * write_file_mutex is held while the warc.paths files are appended to.
	 * */
	void link_splitter(const std::vector<std::string> &warc_paths, std::mutex &write_file_mutex) {

		const size_t max_cache_size = 1000000;
		size_t file_index = 1;

		using vec2d_str = std::vector<std::vector<std::string>>;

		vec2d_str file_names(config::nodes_in_cluster);
		vec2d_str cache(config::nodes_in_cluster);
		size_t done = 0;
		for (const std::string &warc_path : warc_paths) {

			// Progress is reported before the file is processed.
			std::cout << "done " << done << "/" << warc_paths.size() << std::endl;
			done++;

			std::ifstream infile(warc_path);
			boost::iostreams::filtering_istream decompress_stream;
			decompress_stream.push(boost::iostreams::gzip_decompressor());
			decompress_stream.push(infile);

			std::string line;
			while (getline(decompress_stream, line)) {
				const url_link::link link(line);
				const size_t node_id = link.index_on_node();
				cache[node_id].push_back(line);
			}

			// The caches are only checked between input files, so they can grow past the limit while reading one.
			for (size_t node_id = 0; node_id < config::nodes_in_cluster; node_id++) {
				if (cache[node_id].size() > max_cache_size) {
					file_names[node_id].push_back(write_link_cache(file_index++, cache[node_id], node_id));
				}
			}
		}
		for (size_t node_id = 0; node_id < config::nodes_in_cluster; node_id++) {
			file_names[node_id].push_back(write_link_cache(file_index++, cache[node_id], node_id));
		}

		// A guard instead of lock()/unlock() makes sure the mutex is released also when an exception is thrown.
		std::lock_guard<std::mutex> guard(write_file_mutex);
		for (size_t node_id = 0; node_id < config::nodes_in_cluster; node_id++) {
			const auto filename = config::data_path() + "/crawl-data/LINK-" + std::to_string(node_id) + s_suffix + "/warc.paths";
			std::ofstream outfile(filename, std::ios::app);
			for (const std::string &file : file_names[node_id]) {
				outfile << file << "\n";
			}
		}
	}

	/*
	 * Same as link_splitter but only keeps links whose target host hash is present in hosts.
	 *
	 * Every file in warc_paths is read through a gzip decompressor. Each line is parsed as an url_link::link and,
	 * if link.target_host_hash() is in hosts, added to the cache of the node given by link.index_on_node().
	 * After each input file, every cache holding more than max_cache_size lines is written with
	 * write_link_cache. When all files are read each cache is written once more (also if it is empty) and the
	 * names of the written files are appended to [data_path]/crawl-data/LINK-[node_id][s_suffix]/warc.paths.
	 *
	 * write_file_mutex is held while the warc.paths files are appended to.
	 * */
	void link_splitter_with_hosts(const std::unordered_set<size_t> &hosts, const std::vector<std::string> &warc_paths, std::mutex &write_file_mutex) {

		const size_t max_cache_size = 1000000;
		size_t file_index = 1;

		using vec2d_str = std::vector<std::vector<std::string>>;

		vec2d_str file_names(config::nodes_in_cluster);
		vec2d_str cache(config::nodes_in_cluster);
		size_t done = 0;
		for (const std::string &warc_path : warc_paths) {

			// Progress is reported before the file is processed.
			std::cout << "done " << done << "/" << warc_paths.size() << std::endl;
			done++;

			std::ifstream infile(warc_path);
			boost::iostreams::filtering_istream decompress_stream;
			decompress_stream.push(boost::iostreams::gzip_decompressor());
			decompress_stream.push(infile);

			std::string line;
			while (getline(decompress_stream, line)) {
				const url_link::link link(line);
				const auto target_host = link.target_host_hash();
				if (hosts.count(target_host)) {
					const size_t node_id = link.index_on_node();
					cache[node_id].push_back(line);
				}
			}

			// The caches are only checked between input files, so they can grow past the limit while reading one.
			for (size_t node_id = 0; node_id < config::nodes_in_cluster; node_id++) {
				if (cache[node_id].size() > max_cache_size) {
					file_names[node_id].push_back(write_link_cache(file_index++, cache[node_id], node_id));
				}
			}
		}
		for (size_t node_id = 0; node_id < config::nodes_in_cluster; node_id++) {
			file_names[node_id].push_back(write_link_cache(file_index++, cache[node_id], node_id));
		}

		// A guard instead of lock()/unlock() makes sure the mutex is released also when an exception is thrown.
		std::lock_guard<std::mutex> guard(write_file_mutex);
		for (size_t node_id = 0; node_id < config::nodes_in_cluster; node_id++) {
			const auto filename = config::data_path() + "/crawl-data/LINK-" + std::to_string(node_id) + s_suffix + "/warc.paths";
			std::ofstream outfile(filename, std::ios::app);
			for (const std::string &file : file_names[node_id]) {
				outfile << file << "\n";
			}
		}
	}

	/*
	 * Same as splitter but only keeps lines whose url hash is present in urls.
	 *
	 * Every file in warc_paths is read through a gzip decompressor. The part of each line before the first tab
	 * is parsed as an URL and, if url.hash() is in urls, the whole line is added to the cache of the node given
	 * by url.index_on_node(). After each input file, every cache holding more than max_cache_size lines is
	 * written with write_cache. When all files are read each cache is written once more (also if it is empty)
	 * and the names of the written files are appended to
	 * [data_path]/crawl-data/NODE-[node_id][s_suffix]/warc.paths.
	 *
	 * write_file_mutex is held while the warc.paths files are appended to.
	 * */
	void splitter_with_urls(const std::unordered_set<size_t> &urls, const std::vector<std::string> &warc_paths, std::mutex &write_file_mutex) {

		const size_t max_cache_size = 150000;
		size_t file_index = 1;

		std::vector<std::vector<std::string>> file_names(config::nodes_in_cluster);
		std::vector<std::vector<std::string>> cache(config::nodes_in_cluster);
		size_t idx = 0;
		for (const std::string &warc_path : warc_paths) {
			std::cout << warc_path << std::endl;
			std::ifstream infile(warc_path);
			boost::iostreams::filtering_istream decompress_stream;
			decompress_stream.push(boost::iostreams::gzip_decompressor());
			decompress_stream.push(infile);

			std::string line;
			while (getline(decompress_stream, line)) {
				const URL url(line.substr(0, line.find("\t")));
				if (urls.count(url.hash())) {
					const size_t node_id = url.index_on_node();
					cache[node_id].push_back(line);
				}
			}

			// The caches are only checked between input files, so they can grow past the limit while reading one.
			for (size_t node_id = 0; node_id < config::nodes_in_cluster; node_id++) {
				if (cache[node_id].size() > max_cache_size) {
					file_names[node_id].push_back(write_cache(file_index++, cache[node_id], node_id));
				}
			}

			// Progress is printed for every 100th file.
			if (idx % 100 == 0) {
				std::cout << warc_path << " done " << idx << "/" << warc_paths.size() << std::endl;
			}
			idx++;
		}
		for (size_t node_id = 0; node_id < config::nodes_in_cluster; node_id++) {
			file_names[node_id].push_back(write_cache(file_index++, cache[node_id], node_id));
		}

		// A guard instead of lock()/unlock() makes sure the mutex is released also when an exception is thrown.
		std::lock_guard<std::mutex> guard(write_file_mutex);
		for (size_t node_id = 0; node_id < config::nodes_in_cluster; node_id++) {
			const std::string filename = config::data_path() + "/crawl-data/NODE-" + std::to_string(node_id) + s_suffix + "/warc.paths";
			std::ofstream outfile(filename, std::ios::app);
			for (const std::string &file : file_names[node_id]) {
				outfile << file << "\n";
			}
		}
	}

	/*
	 * Same as splitter but only keeps lines whose url hash, shifted right by 20 bits, is contained in the
	 * roaring bitmap urls.
	 *
	 * Every file in warc_paths is read through a gzip decompressor. The part of each line before the first tab
	 * is parsed as an URL and, if urls.contains(url.hash() >> 20), the whole line is added to the cache of the
	 * node given by url.index_on_node(). After each input file, every cache holding more than max_cache_size
	 * lines is written with write_cache. When all files are read each cache is written once more (also if it
	 * is empty) and the names of the written files are appended to
	 * [data_path]/crawl-data/NODE-[node_id][s_suffix]/warc.paths.
	 *
	 * write_file_mutex is held while the warc.paths files are appended to.
	 * */
	void splitter_with_roaring(const ::roaring::Roaring64Map &urls, const std::vector<std::string> &warc_paths, std::mutex &write_file_mutex) {

		const size_t max_cache_size = 150000;
		size_t file_index = 1;

		std::vector<std::vector<std::string>> file_names(config::nodes_in_cluster);
		std::vector<std::vector<std::string>> cache(config::nodes_in_cluster);
		size_t idx = 0;
		for (const std::string &warc_path : warc_paths) {
			std::cout << warc_path << std::endl;
			std::ifstream infile(warc_path);
			boost::iostreams::filtering_istream decompress_stream;
			decompress_stream.push(boost::iostreams::gzip_decompressor());
			decompress_stream.push(infile);

			std::string line;
			while (getline(decompress_stream, line)) {
				const URL url(line.substr(0, line.find("\t")));
				if (urls.contains(url.hash() >> 20)) {
					const size_t node_id = url.index_on_node();
					cache[node_id].push_back(line);
				}
			}

			// The caches are only checked between input files, so they can grow past the limit while reading one.
			for (size_t node_id = 0; node_id < config::nodes_in_cluster; node_id++) {
				if (cache[node_id].size() > max_cache_size) {
					file_names[node_id].push_back(write_cache(file_index++, cache[node_id], node_id));
				}
			}

			// Progress is printed for every 100th file.
			if (idx % 100 == 0) {
				std::cout << warc_path << " done " << idx << "/" << warc_paths.size() << std::endl;
			}
			idx++;
		}
		for (size_t node_id = 0; node_id < config::nodes_in_cluster; node_id++) {
			file_names[node_id].push_back(write_cache(file_index++, cache[node_id], node_id));
		}

		// A guard instead of lock()/unlock() makes sure the mutex is released also when an exception is thrown.
		std::lock_guard<std::mutex> guard(write_file_mutex);
		for (size_t node_id = 0; node_id < config::nodes_in_cluster; node_id++) {
			const std::string filename = config::data_path() + "/crawl-data/NODE-" + std::to_string(node_id) + s_suffix + "/warc.paths";
			std::ofstream outfile(filename, std::ios::app);
			for (const std::string &file : file_names[node_id]) {
				outfile << file << "\n";
			}
		}
	}

	/*
	 * Same as splitter but only keeps lines whose url hash exists in the bloom filter.
	 *
	 * Every file in warc_paths is read through a gzip decompressor. The part of each line before the first tab
	 * is parsed as an URL and, if bloom.exists(url.hash()), the whole line is added to the cache of the node
	 * given by url.index_on_node(). After each input file, every cache holding more than max_cache_size lines is
	 * written with write_cache. When all files are read each cache is written once more (also if it is empty)
	 * and the names of the written files are appended to
	 * [data_path]/crawl-data/NODE-[node_id][s_suffix]/warc.paths.
	 *
	 * write_file_mutex is held while the warc.paths files are appended to.
	 * */
	void splitter_with_bloom(const ::algorithm::bloom_filter &bloom, const std::vector<std::string> &warc_paths, std::mutex &write_file_mutex) {

		const size_t max_cache_size = 10000;
		size_t file_index = 1;

		std::vector<std::vector<std::string>> file_names(config::nodes_in_cluster);
		std::vector<std::vector<std::string>> cache(config::nodes_in_cluster);
		size_t idx = 0;
		for (const std::string &warc_path : warc_paths) {
			std::ifstream infile(warc_path);
			boost::iostreams::filtering_istream decompress_stream;
			decompress_stream.push(boost::iostreams::gzip_decompressor());
			decompress_stream.push(infile);

			std::string line;
			while (getline(decompress_stream, line)) {
				const URL url(line.substr(0, line.find("\t")));
				if (bloom.exists(url.hash())) {
					const size_t node_id = url.index_on_node();
					cache[node_id].push_back(line);
				}
			}

			// The caches are only checked between input files, so they can grow past the limit while reading one.
			for (size_t node_id = 0; node_id < config::nodes_in_cluster; node_id++) {
				if (cache[node_id].size() > max_cache_size) {
					file_names[node_id].push_back(write_cache(file_index++, cache[node_id], node_id));
				}
			}

			// Progress is printed for every 100th file.
			if (idx % 100 == 0) {
				std::cout << warc_path << " done " << idx << "/" << warc_paths.size() << std::endl;
			}
			idx++;
		}
		for (size_t node_id = 0; node_id < config::nodes_in_cluster; node_id++) {
			file_names[node_id].push_back(write_cache(file_index++, cache[node_id], node_id));
		}

		// A guard instead of lock()/unlock() makes sure the mutex is released also when an exception is thrown.
		std::lock_guard<std::mutex> guard(write_file_mutex);
		for (size_t node_id = 0; node_id < config::nodes_in_cluster; node_id++) {
			const std::string filename = config::data_path() + "/crawl-data/NODE-" + std::to_string(node_id) + s_suffix + "/warc.paths";
			std::ofstream outfile(filename, std::ios::app);
			for (const std::string &file : file_names[node_id]) {
				outfile << file << "\n";
			}
		}
	}

	std::unordered_set<size_t> build_link_set(const std::vector<std::string> &warc_paths, size_t hash_min, size_t hash_max) {

		std::unordered_set<size_t> result;
		for (const std::string &warc_path : warc_paths) {
			std::ifstream infile(warc_path);
			boost::iostreams::filtering_istream decompress_stream;
			decompress_stream.push(boost::iostreams::gzip_decompressor());
			decompress_stream.push(infile);

			std::string line;
			while (getline(decompress_stream, line)) {
				const url_link::link link(line);
				const size_t hash = link.target_url().hash();
				if (hash >= hash_min && hash <= hash_max) {
					result.insert(hash);
				}
			}
		}

		return result;
	}

	/*
	 * Input is a vector with paths to url files. Returns an unordered set with all the host hashes.
	 * */
	std::unordered_set<size_t> build_url_host_set(const std::vector<std::string> &warc_paths) {

		std::unordered_set<size_t> hosts;
		for (const std::string &warc_path : warc_paths) {
			std::ifstream infile(warc_path);
			boost::iostreams::filtering_istream decompress_stream;
			decompress_stream.push(boost::iostreams::gzip_decompressor());
			decompress_stream.push(infile);

			std::string line;
			while (getline(decompress_stream, line)) {
				const URL url(line.substr(0, line.find("\t")));
				hosts.insert(url.host_hash());
			}
		}

		return hosts;
	}

	std::unordered_set<size_t> build_url_set(const std::vector<std::string> &warc_paths) {

		std::unordered_set<size_t> url_hashes;
		for (const std::string &warc_path : warc_paths) {
			std::ifstream infile(warc_path);
			boost::iostreams::filtering_istream decompress_stream;
			decompress_stream.push(boost::iostreams::gzip_decompressor());
			decompress_stream.push(infile);

			std::string line;
			while (getline(decompress_stream, line)) {
				const URL url(line.substr(0, line.find("\t")));
				url_hashes.insert(url.hash());
			}
		}

		return url_hashes;
	}

	/*
	 * Creates [data_path]/crawl-data/[batch] and [data_path]/crawl-data/[batch]/files for every url target batch
	 * and every link target batch.
	 * */
	void create_warc_directories() {

		const auto create_batch_dirs = [](const std::vector<std::string> &batches) {
			for (const std::string &batch : batches) {
				const std::string batch_dir = config::data_path() + "/crawl-data/" + batch;
				boost::filesystem::create_directories(batch_dir);
				boost::filesystem::create_directories(batch_dir + "/files");
			}
		};

		create_batch_dirs(target_url_batches());
		create_batch_dirs(target_link_batches());
	}

	/*
	 * Splits all url files of the configured batches into per-node target batches.
	 *
	 * The target directories are created, the url file list is divided into chunks of
	 * ceil(files / s_num_threads) files and one splitter thread is started per chunk. The function returns when
	 * all threads have finished.
	 *
	 * Splitting of link files is not done here. To add it, chunk generate_list_with_link_files() in the same
	 * way and run link_splitter on each chunk.
	 * */
	void run_splitter() {

		tools::create_warc_directories();

		auto files = generate_list_with_url_files();

		std::vector<std::vector<std::string>> thread_input;
		algorithm::vector_chunk(files, ceil((double)files.size() / s_num_threads), thread_input);

		std::mutex write_file_mutex;

		/*
		Run splitter threads. The chunks are handed over by reference, thread_input outlives the threads since
		they are all joined below.
		*/
		std::vector<std::thread> threads;
		for (const auto &chunk : thread_input) {
			threads.emplace_back(splitter, std::cref(chunk), std::ref(write_file_mutex));
		}

		for (std::thread &one_thread : threads) {
			one_thread.join();
		}
	}

	/*
	 * Runs splitter_with_urls on all url files of the configured batches. The file list is divided into chunks
	 * of ceil(files / s_num_threads) files and every chunk is handled by its own thread. Returns when all
	 * threads are done.
	 * */
	void run_url_splitter_on_urls_in_set(const std::unordered_set<size_t> &urls) {

		tools::create_warc_directories();

		auto url_files = generate_list_with_url_files();
		const double files_per_thread = std::ceil(static_cast<double>(url_files.size()) / s_num_threads);

		std::vector<std::vector<std::string>> chunks;
		algorithm::vector_chunk(url_files, files_per_thread, chunks);

		std::mutex write_file_mutex;

		// One worker per chunk, all sharing the url set and the mutex by reference.
		std::vector<std::thread> workers;
		for (const auto &chunk : chunks) {
			workers.emplace_back(splitter_with_urls, std::cref(urls), std::cref(chunk), std::ref(write_file_mutex));
		}

		for (auto &worker : workers) {
			worker.join();
		}
	}

	/*
	 * Runs splitter_with_roaring on all url files of the configured batches. The file list is divided into
	 * chunks of ceil(files / s_num_threads) files and every chunk is handled by its own thread. Returns when
	 * all threads are done.
	 * */
	void run_url_splitter_on_urls_in_roaring(const ::roaring::Roaring64Map &urls) {

		tools::create_warc_directories();

		auto url_files = generate_list_with_url_files();
		const double files_per_thread = std::ceil(static_cast<double>(url_files.size()) / s_num_threads);

		std::vector<std::vector<std::string>> chunks;
		algorithm::vector_chunk(url_files, files_per_thread, chunks);

		std::mutex write_file_mutex;

		// One worker per chunk, all sharing the bitmap and the mutex by reference.
		std::vector<std::thread> workers;
		for (const auto &chunk : chunks) {
			workers.emplace_back(splitter_with_roaring, std::cref(urls), std::cref(chunk), std::ref(write_file_mutex));
		}

		for (auto &worker : workers) {
			worker.join();
		}
	}

	/*
	 * Runs splitter_with_bloom on all url files of the configured batches. The file list is divided into chunks
	 * of ceil(files / s_num_threads) files and every chunk is handled by its own thread. Returns when all
	 * threads are done.
	 * */
	void run_url_splitter_on_urls_in_bloom_filter(const ::algorithm::bloom_filter &bloom) {

		tools::create_warc_directories();

		auto url_files = generate_list_with_url_files();
		const double files_per_thread = std::ceil(static_cast<double>(url_files.size()) / s_num_threads);

		std::vector<std::vector<std::string>> chunks;
		algorithm::vector_chunk(url_files, files_per_thread, chunks);

		std::mutex write_file_mutex;

		// One worker per chunk, all sharing the bloom filter and the mutex by reference.
		std::vector<std::thread> workers;
		for (const auto &chunk : chunks) {
			workers.emplace_back(splitter_with_bloom, std::cref(bloom), std::cref(chunk), std::ref(write_file_mutex));
		}

		for (auto &worker : workers) {
			worker.join();
		}
	}

	/*
	 * Runs link_splitter_with_hosts on all link files of the configured link batches. The file list is divided
	 * into chunks of ceil(files / s_num_threads) files and every chunk is handled by its own thread. Returns
	 * when all threads are done.
	 * */
	void run_link_splitter_on_links_with_target_host_in_set(const std::unordered_set<size_t> &hosts) {

		tools::create_warc_directories();

		auto link_files = generate_list_with_link_files();
		const double files_per_thread = std::ceil(static_cast<double>(link_files.size()) / s_num_threads);

		std::vector<std::vector<std::string>> chunks;
		algorithm::vector_chunk(link_files, files_per_thread, chunks);

		std::mutex write_file_mutex;

		// One worker per chunk, all sharing the host set and the mutex by reference.
		std::vector<std::thread> workers;
		for (const auto &chunk : chunks) {
			workers.emplace_back(link_splitter_with_hosts, std::cref(hosts), std::cref(chunk), std::ref(write_file_mutex));
		}

		for (auto &worker : workers) {
			worker.join();
		}
	}

	/*
	 * Returns an unordered set with the hashes of all urls in the url files of the configured batches.
	 * The files are divided into chunks of ceil(files / s_num_threads) files, build_url_set is run asynchronously
	 * on each chunk and the partial sets are merged.
	 * */
	std::unordered_set<size_t> generate_set_of_urls() {

		auto url_files = generate_list_with_url_files();

		std::cout << "building url hashes map" << std::endl;

		const double files_per_task = std::ceil(static_cast<double>(url_files.size()) / s_num_threads);

		std::vector<std::vector<std::string>> chunks;
		algorithm::vector_chunk(url_files, files_per_task, chunks);

		// Every task gets its own copy of its chunk.
		std::vector<std::future<std::unordered_set<size_t>>> pending;
		for (const auto &chunk : chunks) {
			pending.emplace_back(std::async(std::launch::async, build_url_set, chunk));
		}

		// Merge the partial sets in the order the tasks were started.
		std::unordered_set<size_t> url_hashes;
		for (auto &task : pending) {
			const auto partial = task.get();
			url_hashes.insert(partial.begin(), partial.end());
		}

		return url_hashes;
	}

	/*
	 * Splits the link files but keeps only links that point to a host present in the url target batches.
	 *
	 * First the host hashes of all urls in the url target files are collected (build_url_host_set is run
	 * asynchronously on chunks of ceil(files / s_num_threads) files), then
	 * run_link_splitter_on_links_with_target_host_in_set is run with that set.
	 * */
	void run_split_links_with_relevant_domains() {

		auto url_files = generate_list_with_target_url_files();

		std::cout << "building host hashes map" << std::endl;

		const double files_per_task = std::ceil(static_cast<double>(url_files.size()) / s_num_threads);

		std::vector<std::vector<std::string>> chunks;
		algorithm::vector_chunk(url_files, files_per_task, chunks);

		// Every task gets its own copy of its chunk.
		std::vector<std::future<std::unordered_set<size_t>>> pending;
		for (const auto &chunk : chunks) {
			pending.emplace_back(std::async(std::launch::async, build_url_host_set, chunk));
		}

		// Merge the partial sets in the order the tasks were started.
		std::unordered_set<size_t> host_hashes;
		for (auto &task : pending) {
			const auto partial = task.get();
			host_hashes.insert(partial.begin(), partial.end());
		}

		std::cout << "done. the map size is " << host_hashes.size() << std::endl;

		run_link_splitter_on_links_with_target_host_in_set(host_hashes);
	}

	/*
	 * Inserts the hash of every url in the given gzip compressed url files into bloom. The hashes of one file
	 * are collected first and then inserted with a single insert_many call. Progress is printed for every
	 * 100th file.
	 * */
	void split_make_bloom(::algorithm::bloom_filter &bloom, const std::vector<std::string> &warc_paths) {

		std::vector<uint64_t> hashes;

		const size_t num_files = warc_paths.size();
		for (size_t idx = 0; idx < num_files; idx++) {
			const std::string &warc_path = warc_paths[idx];

			std::ifstream infile(warc_path);
			boost::iostreams::filtering_istream gz_stream;
			gz_stream.push(boost::iostreams::gzip_decompressor());
			gz_stream.push(infile);

			for (std::string line; getline(gz_stream, line);) {
				// The url is the text before the first tab.
				const size_t tab_pos = line.find("\t");
				const URL url(line.substr(0, tab_pos));
				hashes.push_back(url.hash());
			}

			bloom.insert_many(hashes);
			hashes.clear();

			if (idx % 100 == 0) {
				std::cout << warc_path << " done " << idx << "/" << num_files << std::endl;
			}
		}

	}

	void run_split_build_url_bloom() {

		std::vector<std::thread> threads;
		auto files = generate_list_with_url_files();

		std::vector<std::vector<std::string>> thread_input;
		algorithm::vector_chunk(files, ceil((double)files.size() / s_num_threads), thread_input);

		::algorithm::bloom_filter bloom;

		/*
		Run splitter threads
		*/
		for (size_t i = 0; i < thread_input.size(); i++) {
			threads.emplace_back(std::thread(split_make_bloom, std::ref(bloom), std::cref(thread_input[i])));
		}

		for (std::thread &one_thread : threads) {
			one_thread.join();
		}

		bloom.write_file(config::data_path() + "/0/url_filter_main.bloom");
	}

	/*
	 * For each "*.links.gz" file in warc_paths a "*.direct.links.gz" file is written next to it, containing only
	 * the links whose target url hash exists in bloom.
	 *
	 * If a path does not contain ".links.gz" an error line is printed and the function returns without
	 * processing the remaining paths.
	 * */
	void split_make_direct_links(const ::algorithm::bloom_filter &bloom, const std::vector<std::string> &warc_paths) {

		const std::string source_extension = ".links.gz";

		for (size_t done = 0; done < warc_paths.size(); done++) {

			const std::string &warc_path = warc_paths[done];

			// Progress is reported before the file is processed.
			std::cout << "done " << done << "/" << warc_paths.size() << std::endl;

			const size_t pos = warc_path.find(source_extension);
			if (pos == std::string::npos) {
				std::cout << "ERROR: " << warc_path << std::endl;
				return;
			}

			std::string target_warc_path = warc_path;
			target_warc_path.replace(pos, source_extension.size(), ".direct.links.gz");

			// Output side. The compressor is declared after the file so that it is destroyed (and flushed) first.
			std::ofstream outfile(target_warc_path, std::ios::trunc | std::ios::binary);

			boost::iostreams::filtering_ostream compress_stream;
			compress_stream.push(boost::iostreams::gzip_compressor());
			compress_stream.push(outfile);

			// Input side.
			std::ifstream infile(warc_path);
			boost::iostreams::filtering_istream decompress_stream;
			decompress_stream.push(boost::iostreams::gzip_decompressor());
			decompress_stream.push(infile);

			for (std::string line; getline(decompress_stream, line);) {
				const url_link::link link(line);

				if (!bloom.exists(link.target_url().hash())) continue;

				compress_stream << line << "\n";
			}
		}
	}

	void run_split_direct_links() {

		::algorithm::bloom_filter bloom;
		bloom.read_file(config::data_path() + "/0/url_filter_main.bloom");

		std::vector<std::thread> threads;
		auto files = generate_list_with_link_files();

		std::vector<std::vector<std::string>> thread_input;
		algorithm::vector_chunk(files, ceil((double)files.size() / s_num_threads), thread_input);

		/*
		Run splitter threads
		*/
		for (size_t i = 0; i < thread_input.size(); i++) {
			threads.emplace_back(std::thread(split_make_direct_links, std::cref(bloom), std::cref(thread_input[i])));
		}

		for (std::thread &one_thread : threads) {
			one_thread.join();
		}
	}

	/*
	 * Inserts the target url hash of every link in the given gzip compressed link files into bloom. The hashes
	 * of one file are collected first and then inserted with a single insert_many call. Progress is printed for
	 * every 100th file.
	 * */
	void split_make_link_bloom(::algorithm::bloom_filter &bloom, const std::vector<std::string> &warc_paths) {

		std::vector<uint64_t> hashes;

		const size_t num_files = warc_paths.size();
		for (size_t idx = 0; idx < num_files; idx++) {
			const std::string &warc_path = warc_paths[idx];

			std::ifstream infile(warc_path);
			boost::iostreams::filtering_istream gz_stream;
			gz_stream.push(boost::iostreams::gzip_decompressor());
			gz_stream.push(infile);

			for (std::string line; getline(gz_stream, line);) {
				const url_link::link link(line);
				const size_t target_hash = link.target_url().hash();
				hashes.push_back(target_hash);
			}

			bloom.insert_many(hashes);
			hashes.clear();

			if (idx % 100 == 0) {
				std::cout << warc_path << " done " << idx << "/" << num_files << std::endl;
			}
		}

	}

	void run_split_build_direct_link_bloom() {

		std::vector<std::thread> threads;
		auto files = generate_list_with_direct_link_files();

		std::vector<std::vector<std::string>> thread_input;
		algorithm::vector_chunk(files, ceil((double)files.size() / s_num_threads), thread_input);

		::algorithm::bloom_filter bloom;

		/*
		Run splitter threads
		*/
		for (size_t i = 0; i < thread_input.size(); i++) {
			threads.emplace_back(std::thread(split_make_link_bloom, std::ref(bloom), std::cref(thread_input[i])));
		}

		for (std::thread &one_thread : threads) {
			one_thread.join();
		}

		bloom.write_file(config::data_path() + "/0/direct_link_filter_main.bloom");
	}

	void run_split_urls_with_direct_links() {
		
		::algorithm::bloom_filter bloom;
		bloom.read_file(config::data_path() + "/0/direct_link_filter_main.bloom");

		run_url_splitter_on_urls_in_bloom_filter(bloom);
	}


}


================================================
FILE: src/tools/splitter.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include <iostream>
#include <vector>

namespace tools {

	// Appended to the names of the target batches ("NODE-[node_id]" and "LINK-[node_id]").
	const std::string s_suffix = "-small";
	// Input file lists are divided into chunks of ceil(files / s_num_threads) files, one worker per chunk.
	const size_t s_num_threads = 12;

	// Names of the target batches, one per node in the cluster.
	std::vector<std::string> target_url_batches();
	std::vector<std::string> target_link_batches();

	// Lists of data files, read from the warc.paths files of the configured batches (url and link files)
	// or of the target batches (target url and target link files).
	std::vector<std::string> generate_list_with_url_files();
	std::vector<std::string> generate_list_with_link_files();
	std::vector<std::string> generate_list_with_target_url_files();
	std::vector<std::string> generate_list_with_target_link_files();

	// Entry points of the splitter tool. Each one runs to completion before it returns.
	void run_splitter();
	void run_split_urls_with_direct_links();
	void run_split_links_with_relevant_domains();
	void run_split_build_url_bloom();
	void run_split_direct_links();
	void run_split_build_direct_link_bloom();

}


================================================
FILE: src/transfer/transfer.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "config.h"
#include "transfer.h"
#include <fstream>
#include "common/ThreadPool.h"
#include "logger/logger.h"
#include "profiler/profiler.h"
#include "file/file.h"
#include "text/text.h"
#include "parser/parser.h"
#include "algorithm/hash.h"

using namespace std;

namespace transfer {

	/*
	 * libcurl write callback. Appends the received chunk (size * nmemb bytes) to the stringstream passed as user data and
	 * returns the number of bytes handled.
	 * */
	size_t curl_stringstream_writer(void *ptr, size_t size, size_t nmemb, stringstream *ss) {
		const size_t chunk_len = size * nmemb;
		const char *chunk = static_cast<const char *>(ptr);
		ss->write(chunk, chunk_len);
		return chunk_len;
	}

	/*
	 * libcurl write callback. Forwards the received chunk (size * nmemb bytes) to the output stream passed as user data and
	 * returns the number of bytes handled.
	 * */
	size_t curl_ostream_writer(void *ptr, size_t size, size_t nmemb, ostream *os) {
		const size_t chunk_len = size * nmemb;
		const char *chunk = static_cast<const char *>(ptr);
		os->write(chunk, chunk_len);
		return chunk_len;
	}

	/*
	 * libcurl write callback. Appends the received chunk (size * nmemb bytes) to the string passed as user data and returns
	 * the number of bytes handled.
	 * */
	size_t curl_string_writer(void *ptr, size_t size, size_t nmemb, string *str) {
		const size_t chunk_len = size * nmemb;
		const char *chunk = static_cast<const char *>(ptr);
		str->append(chunk, chunk_len);
		return chunk_len;
	}

	/*
	 * Cursor over an in-memory buffer that is handed to curl_string_reader as user data.
	 * */
	struct curl_string_read_struct {
		const char *buffer;	// Start of the data to send. Not owned by this struct.
		size_t buffer_len;	// Total number of bytes in buffer.
		size_t offset;		// Number of bytes already handed out.
	};

	/*
	 * libcurl read callback. Copies the next piece of the buffer described by userdata (a curl_string_read_struct) into ptr,
	 * at most size * nmemb bytes, and advances the offset. Returns the number of bytes copied, 0 when everything has been read.
	 * */
	size_t curl_string_reader(char *ptr, size_t size, size_t nmemb, void *userdata) {
		auto *source = static_cast<curl_string_read_struct *>(userdata);

		if (source->offset >= source->buffer_len) return 0ull;

		const size_t capacity = size * nmemb;
		const size_t remaining = source->buffer_len - source->offset;
		const size_t chunk_len = (remaining > capacity) ? capacity : remaining;

		memcpy(ptr, source->buffer + source->offset, chunk_len);
		source->offset += chunk_len;

		return chunk_len;
	}

	/*
	 * libcurl read callback. Reads up to size * nmemb bytes from the std::ifstream passed as user data into ptr and returns
	 * the number of bytes actually read. Returns 0 once the stream has reached end of file.
	 * */
	size_t curl_file_reader(char *ptr, size_t size, size_t nmemb, void *userdata) {
		auto *source = static_cast<std::ifstream *>(userdata);

		if (!source->eof()) {
			source->read(ptr, size * nmemb);
			return source->gcount();
		}

		return 0ull;
	}

	/*
	 * Configures the curl handle with the internal transfer::username / transfer::password credentials.
	 * */
	void set_internal_auth(CURL *curl) {
		const char *user = username.c_str();
		const char *pass = password.c_str();
		curl_easy_setopt(curl, CURLOPT_USERNAME, user);
		curl_easy_setopt(curl, CURLOPT_PASSWORD, pass);
	}

	/*
	 * Turns a path into an url on the master server. Strings that already start with http:// or https:// are returned
	 * untouched, everything else is prefixed with "http://" + config::master and exactly one separating slash is ensured
	 * when the path is non-empty.
	 * */
	string make_url(const string &url) {

		const bool is_absolute = url.compare(0, 7, "http://") == 0 || url.compare(0, 8, "https://") == 0;
		if (is_absolute) {
			return url;
		}

		const bool needs_slash = !url.empty() && url[0] != '/';
		if (needs_slash) {
			return "http://" + config::master + "/" + url;
		}
		return "http://" + config::master + url;
	}

	string file_to_string(const string &file_path, int &error) {
		CURL *curl = curl_easy_init();
		error = ERROR;
		if (curl) {
			CURLcode res;
			LOG_INFO("Downloading url: " + make_url(file_path));
			curl_easy_setopt(curl, CURLOPT_URL, make_url(file_path).c_str());

			set_internal_auth(curl);

			stringstream response;
			curl_easy_setopt(curl, CURLOPT_WRITEDATA, &response);
			curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, curl_stringstream_writer);

			res = curl_easy_perform(curl);

			if (res == CURLE_OK) {
				long response_code;
				curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &response_code);
				if (response_code == 200) {
					error = OK;
				}
			}

			curl_easy_cleanup(curl);

			return response.str();
		}

		return "";
	}

	string gz_file_to_string(const string &file_path, int &error) {
		CURL *curl = curl_easy_init();
		error = ERROR;
		if (curl) {
			CURLcode res;
			LOG_INFO("Downloading url: " + make_url(file_path));
			curl_easy_setopt(curl, CURLOPT_URL, make_url(file_path).c_str());

			set_internal_auth(curl);

			stringstream response;
			curl_easy_setopt(curl, CURLOPT_WRITEDATA, &response);
			curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, curl_stringstream_writer);

			res = curl_easy_perform(curl);

			string response_str;
			try {
				boost::iostreams::filtering_istream decompress_stream;
				decompress_stream.push(boost::iostreams::gzip_decompressor());
				decompress_stream.push(response);

				response_str = string(istreambuf_iterator<char>(decompress_stream), {});
			} catch (...) {
				curl_easy_cleanup(curl);
				error = ERROR;
				return "";
			}


			if (res == CURLE_OK) {
				long response_code;
				curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &response_code);
				if (response_code == 200) {
					error = OK;
				}
			}

			curl_easy_cleanup(curl);

			return response_str;
		}

		return "";
	}

	void file_to_stream(const string &file_path, ostream &output_stream, int &error) {
		CURL *curl = curl_easy_init();
		error = ERROR;
		if (curl) {
			CURLcode res;
			LOG_INFO("Downloading url: " + make_url(file_path));
			curl_easy_setopt(curl, CURLOPT_URL, make_url(file_path).c_str());

			set_internal_auth(curl);

			curl_easy_setopt(curl, CURLOPT_WRITEDATA, &output_stream);
			curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, curl_ostream_writer);

			res = curl_easy_perform(curl);

			if (res == CURLE_OK) {
				long response_code;
				curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &response_code);
				if (response_code == 200) {
					error = OK;
				}
			}

			curl_easy_cleanup(curl);

		}
	}

	void gz_file_to_stream(const string &file_path, ostream &output_stream, int &error) {
		CURL *curl = curl_easy_init();
		error = ERROR;
		if (curl) {
			CURLcode res;
			LOG_INFO("Downloading url: " + make_url(file_path));
			curl_easy_setopt(curl, CURLOPT_URL, make_url(file_path).c_str());

			set_internal_auth(curl);

			stringstream response;
			curl_easy_setopt(curl, CURLOPT_WRITEDATA, &response);
			curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, curl_stringstream_writer);

			res = curl_easy_perform(curl);

			if (res == CURLE_OK) {
				long response_code;
				curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &response_code);
				if (response_code == 200) {
					error = OK;
				}
			}

			try {
				boost::iostreams::filtering_istream decompress_stream;
				decompress_stream.push(boost::iostreams::gzip_decompressor());
				decompress_stream.push(response);

				output_stream << decompress_stream.rdbuf();
			} catch(...) {
				error = ERROR;
			}

			curl_easy_cleanup(curl);
		}
	}

	/*
	 * Downloads url (no authentication) and appends the response body to buffer.
	 *
	 * error is set to OK when the transfer succeeded with a 2xx http status, otherwise ERROR. When curl itself reports a
	 * failure, buffer is truncated back to its original size so no partial data is left behind. The transfer is aborted
	 * if it runs slower than 5000 bytes/second for 5 seconds.
	 * */
	void url_to_string(const string &url, string &buffer, int &error) {
		CURL *curl = curl_easy_init();
		error = ERROR;
		const size_t original_buffer_size = buffer.size();
		if (curl) {
			CURLcode res;
			LOG_INFO("Downloading url: " + url);
			curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
			// curl_easy_setopt is variadic, numeric options must be passed as long. A plain int is not promoted.
			curl_easy_setopt(curl, CURLOPT_LOW_SPEED_LIMIT, 5000L);
			curl_easy_setopt(curl, CURLOPT_LOW_SPEED_TIME, 5L);

			curl_easy_setopt(curl, CURLOPT_WRITEDATA, &buffer);
			curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, curl_string_writer);

			res = curl_easy_perform(curl);

			if (res == CURLE_OK) {
				long response_code = 0;
				curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &response_code);
				if (response_code >= 200 && response_code < 300) {
					error = OK;
				}
			} else {
				// If an error occurred we set the size of the buffer to the original size, removing any appended data.
				buffer.resize(original_buffer_size);
			}

			curl_easy_cleanup(curl);
		}
	}

	string run_gz_download_thread(const string &file_path) {
		size_t hsh = algorithm::hash(file_path);
		const string target_filename = config::data_path() + "/" + to_string(hsh % 8) + "/tmp/tmp_" + to_string(hsh);
		ofstream target_file(target_filename, ios::binary | ios::trunc);
		int error;
		gz_file_to_stream(file_path, target_file, error);
		if (error != OK) {
			return "";
		}
		return target_filename;
	}

	/*
	 * Downloads all the given gzip files in parallel, using config::num_async_file_transfers worker threads, and returns
	 * the local file names of the downloads that succeeded. Failed downloads are left out of the result.
	 * */
	vector<string> download_gz_files_to_disk(const vector<string> &files_to_download) {

		ThreadPool pool(config::num_async_file_transfers);
		std::vector<std::future<string>> pending;

		for (const string &remote_path : files_to_download) {
			pending.emplace_back(pool.enqueue([remote_path] {
				return run_gz_download_thread(remote_path);
			}));
		}

		// Collect in submission order, an empty name means the download failed.
		vector<string> downloaded;
		for (auto &job : pending) {
			const string local_path = job.get();
			if (!local_path.empty()) {
				downloaded.push_back(local_path);
			}
		}

		return downloaded;
	}

	void delete_downloaded_files(const vector<string> &files) {
		LOG_INFO("Deleting " + to_string(files.size()) + " downloaded files");
		for (const string &file : files) {
			file::delete_file(file);
		}
	}

	size_t head_content_length(const string &url, int &error) {
		CURL *curl = curl_easy_init();
		error = ERROR;
		if (curl) {
			CURLcode res;
			LOG_INFO("Making head request to:" + url);
			curl_easy_setopt(curl, CURLOPT_URL, url.c_str());

			stringstream response;
			curl_easy_setopt(curl, CURLOPT_NOBODY, 1);
			curl_easy_setopt(curl, CURLOPT_HEADER, 1);
			curl_easy_setopt(curl, CURLOPT_WRITEDATA, &response);
			curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, curl_stringstream_writer);

			res = curl_easy_perform(curl);

			string response_str;
			try {
				response_str = string(istreambuf_iterator<char>(response), {});
			} catch (...) {
				curl_easy_cleanup(curl);
				error = ERROR;
				return 0;
			}

			if (res == CURLE_OK) {
				long response_code;
				curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &response_code);
				if (response_code == 200) {
					error = OK;
				} else {
					curl_easy_cleanup(curl);
					return 0;
				}
			}

			curl_easy_cleanup(curl);

			const string content_len_str = parser::get_http_header(text::lower_case(response_str), "content-length: ");
			size_t content_len;
			try {
				content_len = stoull(content_len_str);
			} catch (...) {
				error = ERROR;
				return 0;
			}

			return content_len;
		}

		return 0;
	}

	int upload_file(const string &path, const string &data) {
		CURL *curl = curl_easy_init();
		if (curl) {
			CURLcode res;
			const string url = "http://" + config::upload + "/" + path;
			LOG_INFO("Uploading file to:" + url);
			curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
			curl_easy_setopt(curl, CURLOPT_LOW_SPEED_TIME, 30L);
			curl_easy_setopt(curl, CURLOPT_LOW_SPEED_LIMIT, 30L);

			struct curl_string_read_struct arg;
			arg.buffer = data.c_str();
			arg.buffer_len = data.size();
			arg.offset = 0;

			curl_easy_setopt(curl, CURLOPT_UPLOAD, 1l);
			curl_easy_setopt(curl, CURLOPT_USERNAME, config::file_upload_user.c_str());
			curl_easy_setopt(curl, CURLOPT_PASSWORD, config::file_upload_password.c_str());
			curl_easy_setopt(curl, CURLOPT_READFUNCTION, curl_string_reader);
			curl_easy_setopt(curl, CURLOPT_READDATA, &arg);

			res = curl_easy_perform(curl);

			curl_easy_cleanup(curl);

			if (res == CURLE_OK) {
				return OK;
			}
			return ERROR;
		}

		return ERROR;
	}

	int upload_gz_file(const string &path, const string &data) {
		CURL *curl = curl_easy_init();
		if (curl) {
			CURLcode res;
			const string url = "http://" + config::upload + "/" + path;
			LOG_INFO("Uploading file to:" + url);
			curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
			curl_easy_setopt(curl, CURLOPT_LOW_SPEED_TIME, 30L);
			curl_easy_setopt(curl, CURLOPT_LOW_SPEED_LIMIT, 30L);

			stringstream ss(data);
			boost::iostreams::filtering_istream compress_stream;
			compress_stream.push(boost::iostreams::gzip_compressor());
			compress_stream.push(ss);

			string compressed_data = string(istreambuf_iterator<char>(compress_stream), {});

			struct curl_string_read_struct arg;
			arg.buffer = compressed_data.c_str();
			arg.buffer_len = compressed_data.size();
			arg.offset = 0;

			curl_easy_setopt(curl, CURLOPT_UPLOAD, 1l);
			curl_easy_setopt(curl, CURLOPT_USERNAME, config::file_upload_user.c_str());
			curl_easy_setopt(curl, CURLOPT_PASSWORD, config::file_upload_password.c_str());
			curl_easy_setopt(curl, CURLOPT_READFUNCTION, curl_string_reader);
			curl_easy_setopt(curl, CURLOPT_READDATA, &arg);

			res = curl_easy_perform(curl);

			curl_easy_cleanup(curl);

			if (res == CURLE_OK) {
				return OK;
			}
			return ERROR;
		}

		return ERROR;
	}

	/*
	 * Streams the local file "filename" to "http://" + config::upload + "/" + dest_path using the file upload credentials.
	 *
	 * Returns ERROR if the local file cannot be opened (previously this silently uploaded an empty body), if no curl
	 * handle could be created or if curl reports a failed transfer. Returns OK otherwise. The http status of the response
	 * is not inspected.
	 * */
	int upload_file_from_disk(const string &dest_path, const string &filename) {
		CURL *curl = curl_easy_init();
		if (curl) {
			CURLcode res;
			const string url = "http://" + config::upload + "/" + dest_path;
			LOG_INFO("Uploading file to:" + url);
			curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
			curl_easy_setopt(curl, CURLOPT_LOW_SPEED_TIME, 30L);
			curl_easy_setopt(curl, CURLOPT_LOW_SPEED_LIMIT, 30L);

			std::ifstream infile(filename, std::ios::in | std::ios::binary);
			if (!infile.is_open()) {
				// Without this check the read callback returns 0 bytes right away and an empty file is uploaded.
				curl_easy_cleanup(curl);
				return ERROR;
			}

			curl_easy_setopt(curl, CURLOPT_UPLOAD, 1l);
			curl_easy_setopt(curl, CURLOPT_USERNAME, config::file_upload_user.c_str());
			curl_easy_setopt(curl, CURLOPT_PASSWORD, config::file_upload_password.c_str());
			curl_easy_setopt(curl, CURLOPT_READFUNCTION, curl_file_reader);
			curl_easy_setopt(curl, CURLOPT_READDATA, &infile);

			res = curl_easy_perform(curl);

			curl_easy_cleanup(curl);

			if (res == CURLE_OK) {
				return OK;
			}
			return ERROR;
		}

		return ERROR;
	}

	/*
	 * Perform simple GET request without any additional request headers and return response.
	 * */
	http::response get(const string &url) {
		const vector<string> no_headers;
		return get(url, no_headers);
	}

	/*
	 * Perform GET request with the given extra request headers (one "Name: value" string per header) and the file upload
	 * credentials. The returned response carries the http status code and the body that was received. If no curl handle
	 * could be created a default constructed response is returned.
	 * */
	http::response get(const string &url, const vector<string> &headers) {
		CURL *curl = curl_easy_init();
		struct curl_slist *header_list = NULL;
		http::response response;
		if (curl) {

			for (const string &header : headers) {
				header_list = curl_slist_append(header_list, header.c_str());
			}

			curl_easy_setopt(curl, CURLOPT_URL, url.c_str());

			curl_easy_setopt(curl, CURLOPT_USERNAME, config::file_upload_user.c_str());
			curl_easy_setopt(curl, CURLOPT_PASSWORD, config::file_upload_password.c_str());
			curl_easy_setopt(curl, CURLOPT_HTTPHEADER, header_list);

			stringstream response_stream;
			curl_easy_setopt(curl, CURLOPT_WRITEDATA, &response_stream);
			curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, curl_stringstream_writer);

			curl_easy_perform(curl);

			curl_slist_free_all(header_list);

			// CURLINFO_RESPONSE_CODE writes through a pointer to long, handing it a size_t * is a type mismatch.
			long code = 0;
			curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &code);

			response.code(static_cast<size_t>(code));
			response.body(response_stream.str());

			curl_easy_cleanup(curl);
		}

		return response;
	}

	/*
	 * Perform simple POST request without any additional request headers and return response.
	 * */
	http::response post(const string &url, const string &data) {
		const vector<string> no_headers;
		return post(url, data, no_headers);
	}

	/*
	 * Perform POST request with data as request body, the given extra request headers (one "Name: value" string per
	 * header) and the file upload credentials.
	 *
	 * On a successful transfer the response carries the http status code and the received body. If curl reports a failure
	 * the response has code 0 and an empty body. If no curl handle could be created a default constructed response is
	 * returned.
	 * */
	http::response post(const string &url, const string &data, const vector<string> &headers) {
		CURL *curl = curl_easy_init();
		struct curl_slist *header_list = NULL;
		http::response response;
		if (curl) {

			for (const string &header : headers) {
				header_list = curl_slist_append(header_list, header.c_str());
			}

			curl_easy_setopt(curl, CURLOPT_URL, url.c_str());

			struct curl_string_read_struct arg;
			arg.buffer = data.c_str();
			arg.buffer_len = data.size();
			arg.offset = 0;

			curl_easy_setopt(curl, CURLOPT_POST, 1l);
			curl_easy_setopt(curl, CURLOPT_USERNAME, config::file_upload_user.c_str());
			curl_easy_setopt(curl, CURLOPT_PASSWORD, config::file_upload_password.c_str());
			curl_easy_setopt(curl, CURLOPT_HTTPHEADER, header_list);
			curl_easy_setopt(curl, CURLOPT_READFUNCTION, curl_string_reader);
			curl_easy_setopt(curl, CURLOPT_READDATA, &arg);

			stringstream response_stream;
			curl_easy_setopt(curl, CURLOPT_WRITEDATA, &response_stream);
			curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, curl_stringstream_writer);

			CURLcode curl_result = curl_easy_perform(curl);

			// libcurl does not take ownership of the header list, it has to be freed once the transfer is done.
			curl_slist_free_all(header_list);

			if (curl_result == CURLE_OK) {
				// CURLINFO_RESPONSE_CODE writes through a pointer to long.
				long code = 0;
				curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &code);
				response.code(static_cast<size_t>(code));
				response.body(response_stream.str());
			} else {
				response.code(0);
				response.body("");
			}

			curl_easy_cleanup(curl);
		}

		return response;
	}

	/*
	 * Perform simple PUT request with data as request body, using the file upload credentials, and return response.
	 * The response carries the http status code and the body that was received. If no curl handle could be created a
	 * default constructed response is returned.
	 * */
	http::response put(const string &url, const string &data) {
		CURL *curl = curl_easy_init();
		http::response response;
		if (curl) {
			curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
			curl_easy_setopt(curl, CURLOPT_LOW_SPEED_TIME, 30L);
			curl_easy_setopt(curl, CURLOPT_LOW_SPEED_LIMIT, 30L);

			struct curl_string_read_struct arg;
			arg.buffer = data.c_str();
			arg.buffer_len = data.size();
			arg.offset = 0;

			curl_easy_setopt(curl, CURLOPT_UPLOAD, 1l);
			curl_easy_setopt(curl, CURLOPT_USERNAME, config::file_upload_user.c_str());
			curl_easy_setopt(curl, CURLOPT_PASSWORD, config::file_upload_password.c_str());
			curl_easy_setopt(curl, CURLOPT_READFUNCTION, curl_string_reader);
			curl_easy_setopt(curl, CURLOPT_READDATA, &arg);

			stringstream response_stream;
			curl_easy_setopt(curl, CURLOPT_WRITEDATA, &response_stream);
			curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, curl_stringstream_writer);

			curl_easy_perform(curl);

			// CURLINFO_RESPONSE_CODE writes through a pointer to long, handing it a size_t * is a type mismatch.
			long code = 0;
			curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &code);

			response.code(static_cast<size_t>(code));
			response.body(response_stream.str());

			curl_easy_cleanup(curl);
		}

		return response;
	}
}


================================================
FILE: src/transfer/transfer.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include <curl/curl.h>
#include <iostream>
#include <sstream>

#include "http/response.h"

#include <boost/iostreams/filtering_stream.hpp>
#include <boost/iostreams/filter/gzip.hpp>

/*
 * HTTP transfer helpers built on libcurl: downloads (plain and gzip), uploads and simple GET/POST/PUT requests.
 * */
namespace transfer {

	// Credentials sent by the file download functions (see set_internal_auth in transfer.cpp).
	// NOTE(review): these are hard coded in a header that is part of the repository, consider moving them to the config.
	const std::string username = "alexandria";
	const std::string password = "wmXN6U4u";

	// Status values written to the "error" out parameters and returned by the upload functions.
	const int OK = 0;
	const int ERROR = 1;

	// libcurl write callbacks, they append size * nmemb bytes from ptr to the given stream and return the byte count.
	size_t curl_stringstream_writer(void *ptr, size_t size, size_t nmemb, std::stringstream *ss);
	size_t curl_ostream_writer(void *ptr, size_t size, size_t nmemb, std::ostream *os);

	// Download file_path and return the (decompressed, for the gz variant) content. error is set to transfer::OK only
	// when the request succeeded with http status 200.
	std::string file_to_string(const std::string &file_path, int &error);
	std::string gz_file_to_string(const std::string &file_path, int &error);

	// Same as above but the content is written to output_stream instead of being returned.
	void file_to_stream(const std::string &file_path, std::ostream &output_stream, int &error);
	void gz_file_to_stream(const std::string &file_path, std::ostream &output_stream, int &error);

	// Download url and append the response body to buffer. error is set to transfer::OK for 2xx responses.
	void url_to_string(const std::string &url, std::string &buffer, int &error);

	// Download several gzip files in parallel to temporary files. Returns the local file names of the downloads that
	// succeeded, pass them to delete_downloaded_files when they are no longer needed.
	std::vector<std::string> download_gz_files_to_disk(const std::vector<std::string> &files_to_download);
	void delete_downloaded_files(const std::vector<std::string> &files);

	// Make a http HEAD request and return the content length. Return 0 on failure and sets the error parameter to transfer::ERROR
	size_t head_content_length(const std::string &url, int &error);

	// Upload data (or the content of a local file) to the upload server. Return transfer::OK or transfer::ERROR.
	int upload_file(const std::string &path, const std::string &data);
	int upload_gz_file(const std::string &path, const std::string &data);
	int upload_file_from_disk(const std::string &dest_path, const std::string &filename);

	/*
	 * Perform simple GET request and return response.
	 * */
	http::response get(const std::string &url);
	http::response get(const std::string &url, const std::vector<std::string> &headers);

	/*
	 * Perform simple POST request and return response.
	 * */
	http::response post(const std::string &url, const std::string &data);
	http::response post(const std::string &url, const std::string &data, const std::vector<std::string> &headers);

	/*
	 * Perform simple PUT request and return response.
	 * */
	http::response put(const std::string &url, const std::string &data);

}


================================================
FILE: src/url_link/link.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "link.h"
#include <boost/algorithm/string.hpp>

using namespace std;

namespace url_link {

	/*
	 * Creates an empty link. The host hash and both harmonic values are zero initialized so that url_score() and
	 * domain_score() never read indeterminate values on a default constructed link.
	 * */
	link::link()
	:
		m_target_host_hash(0),
		m_source_harmonic(0.0f),
		m_target_harmonic(0.0f)
	{
	}

	/*
	 * Creates a link from one tab separated row with the columns
	 * source host, source path, target host, target path, link text.
	 * The link text is truncated to 1000 characters and both harmonic values start at 0.
	 *
	 * A row with fewer than five columns leaves the urls and the link text default constructed instead of reading past
	 * the end of the column vector.
	 * */
	link::link(const string &standard_link_data) {
			vector<string> col_values;
			boost::algorithm::split(col_values, standard_link_data, boost::is_any_of("\t"));

			if (col_values.size() >= 5) {
				m_source_url = URL(col_values[0], col_values[1]);
				m_target_url = URL(col_values[2], col_values[3]);
				m_link_text = col_values[4].substr(0, 1000);
			}

			m_target_host_hash = m_target_url.host_hash();
			m_source_harmonic = 0;
			m_target_harmonic = 0;
	}

	/*
	 * Creates a link between two urls with known harmonic values. The target host hash is taken from target_url and the
	 * link text is left empty.
	 * */
	link::link(const URL &source_url, const URL &target_url, float source_harmonic, float target_harmonic)
	: m_source_url(source_url), m_target_url(target_url), m_target_host_hash(target_url.host_hash()),
	  m_source_harmonic(source_harmonic), m_target_harmonic(target_harmonic) {
	}

	// Nothing to release, all members clean up after themselves.
	link::~link() = default;

	/*
	 * Score of this link on url level: the difference between source and target harmonic, but never less than one
	 * hundredth of the source harmonic.
	 * */
	float link::url_score() const {
		const float harmonic_gain = m_source_harmonic - m_target_harmonic;
		const float floor_score = m_source_harmonic / 100.0f;
		return (harmonic_gain < floor_score) ? floor_score : harmonic_gain;
	}

	float link::domain_score() const {
		return max(m_source_harmonic - m_target_harmonic, m_source_harmonic / 100.0f)/100.0;
	}

}


================================================
FILE: src/url_link/link.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include "URL.h"
#include "config.h"

namespace url_link {

	/*
	 * A link from a source url to a target url, together with the link text and the harmonic values of both ends.
	 * */
	class link {

	public:
		link();
		// Parses one tab separated row: source host, source path, target host, target path, link text.
		explicit link(const std::string &standard_link_data);
		link(const URL &source_url, const URL &target_url, float source_harmonic, float target_harmonic);
		~link();

		// Scores derived from the source and target harmonic values, see link.cpp.
		float url_score() const;
		float domain_score() const;

		const URL &source_url() const { return m_source_url; }
		const URL &target_url() const { return m_target_url; }
		const uint64_t &target_host_hash() const { return m_target_host_hash; }
		const float &source_harmonic() const { return m_source_harmonic; }
		const float &target_harmonic() const { return m_target_harmonic; }

		// Target host hash modulo config::nodes_in_cluster.
		size_t index_on_node() const {
			return target_url().host_hash() % config::nodes_in_cluster;
		}

	private:
		URL m_source_url;
		URL m_target_url;
		uint64_t m_target_host_hash;	// Set from m_target_url.host_hash() by the constructors that take data.
		float m_source_harmonic;
		float m_target_harmonic;
		std::string m_link_text;	// Only set by the string constructor, at most 1000 characters.
	};
}


================================================
FILE: src/utils/id_allocator.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include <mutex>
#include <memory>

namespace utils {

	/*
	 * Minimal helper that hands out exactly one shared "alloc_type" object per id to any number of threads.
	 *
	 * Every call to get() takes a mutex, so a thread that needs the same object repeatedly should remember the returned
	 * pointer in a cache of its own:
	 *
	 * - thread A
	 *   std::unordered_map<uint64_t, data *> local_cache;
	 *   for (...) {
	 *		if (!local_cache.count(id)) {
	 *			local_cache[id] = alloc.get(id, ...); // alloc is shared instance of id_allocator
	 *		}
	 *
	 *		local_cache[id] can be used now.
	 *   }
	 *
	 * The objects are owned by the allocator and live as long as it does.
	 * */

	template<typename alloc_type>
	class id_allocator {

		public:

			/*
			 * Returns the object associated with id. The object is created on the first request for that id, with args
			 * forwarded to the constructor of alloc_type. On later requests for the same id args are ignored.
			 * */
			template<class... type_args>
			alloc_type *get(uint64_t id, type_args&&... args) {

				std::lock_guard guard(m_lock);

				auto entry = m_map.find(id);
				if (entry == m_map.end()) {
					entry = m_map.emplace(id, std::make_unique<alloc_type>(std::forward<type_args>(args)...)).first;
				}

				return entry->second.get();
			}

		private:

			std::mutex m_lock;
			std::unordered_map<uint64_t, std::unique_ptr<alloc_type>> m_map;

	};
	
}


================================================
FILE: src/utils/thread_pool.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "thread_pool.hpp"
#include <chrono>
#include <iostream>
#include <thread>
#include <future>
#include <queue>

using namespace std::chrono_literals;

namespace utils {

	/*
	 * Starts num_threads workers that run handle_work. max_queue_len limits the number of queued tasks, 0 means no limit.
	 * */
	thread_pool::thread_pool(size_t num_threads, size_t max_queue_len)
	: m_max_queue_len(max_queue_len) {
		for (size_t remaining = num_threads; remaining > 0; --remaining) {
			m_workers.emplace_back(&thread_pool::handle_work, this);
		}
	}
	// Finishes all queued work and joins the workers before the pool goes away.
	thread_pool::~thread_pool() {
		this->run_all();
	}

	/*
	 * Adds a task to the queue and wakes up one worker.
	 *
	 * If the pool was created with a maximum queue length this call blocks, polling every 100ms, until there is room in
	 * the queue. Throws std::runtime_error if the pool has already been stopped by run_all().
	 * */
	void thread_pool::enqueue(std::function<void()> &&fun) {

		if (m_stop) {
			throw std::runtime_error("enqueue on stopped thread_pool not allowed");
		}

		if (m_max_queue_len > 0) {
			while (true) {

				{
					std::lock_guard lock(m_queue_lock);
					if (m_queue.size() < m_max_queue_len) {
						m_queue.emplace(std::move(fun));
						break;
					}
				}
				// Queue is full, give the workers some time before trying again.
				std::this_thread::sleep_for(100ms);
			}
		} else {
			// Scoped lock instead of manual lock()/unlock() so the mutex is released even if emplace throws.
			std::lock_guard lock(m_queue_lock);
			m_queue.emplace(std::move(fun));
		}

		m_condition.notify_one();
	}

	void thread_pool::run_all() {
		if (m_stop) return; // Already stopped..
		m_queue_lock.lock();
		m_stop = true;
		m_queue_lock.unlock();
		m_condition.notify_all();

		for (std::thread &thread : m_workers) {
			if (thread.joinable()) {
				thread.join();
			}
		}
	}

	/*
	 * Worker loop. Waits for a task or for the stop flag, runs tasks outside of the lock and returns once the pool is
	 * stopped and the queue is empty.
	 * */
	void thread_pool::handle_work() {
		for (;;) {

			std::function<void()> job;

			{
				std::unique_lock<std::mutex> guard(m_queue_lock);
				while (!m_stop && m_queue.empty()) {
					m_condition.wait(guard);
				}
				// The wait only ends with an empty queue when the pool has been stopped.
				if (m_queue.empty()) return;
				job = std::move(m_queue.front());
				m_queue.pop();
			}

			job();
		}
	}
	
}


================================================
FILE: src/utils/thread_pool.hpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include <iostream>
#include <thread>
#include <future>
#include <queue>

namespace utils {

	/*
	 * Pool of worker threads that execute queued std::function<void()> tasks.
	 * */
	class thread_pool {

		public:

			// Starts num_workers threads. max_queue_len limits the number of waiting tasks, 0 means unlimited.
			explicit thread_pool(size_t num_workers, size_t max_queue_len = 0);
			// Calls run_all().
			~thread_pool();

			// Queues a task. Blocks while a bounded queue is full and throws std::runtime_error after run_all().
			void enqueue(std::function<void()> &&fun);
			// Stops the pool: lets the workers finish everything in the queue and joins them.
			void run_all();

		private:

			// Loop executed by every worker thread.
			void handle_work();

			std::vector<std::thread> m_workers;
			std::queue<std::function<void()>> m_queue;

			// m_queue_lock is held around the queue accesses and the write of m_stop in thread_pool.cpp.
			std::mutex m_queue_lock;
			std::condition_variable m_condition;
			bool m_stop = false;
			size_t m_max_queue_len;

	};
	
}


================================================
FILE: src/utils/thread_pool_arg.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include <iostream>
#include <thread>
#include <future>
#include <queue>

namespace utils {

	/*
	 * Fixed-size pool of worker threads. Every worker default-constructs its own `arg` instance once and passes
	 * that same instance by reference to each task it executes, so tasks can reuse per-thread state (buffers,
	 * connections, ...) without sharing it between threads.
	 *
	 * `arg` must be default constructible.
	 */
	template <typename arg>
	class thread_pool_arg {

		public:

			// Starts the given number of worker threads immediately.
			explicit thread_pool_arg(size_t);
			// Runs all remaining tasks and joins the workers (see run_all).
			~thread_pool_arg();

			// Queues a task. Throws std::runtime_error if the pool has already been stopped by run_all().
			void enqueue(std::function<void(arg &)> &&fun);
			// Stops accepting tasks, lets the workers drain the queue and joins them. Safe to call more than once.
			void run_all();

		private:

			// Worker loop: pops and runs tasks until the pool is stopped and the queue is empty.
			void handle_work();

			std::vector<std::thread> m_workers;
			std::queue<std::function<void(arg &)>> m_queue;

			// m_queue_lock guards both m_queue and m_stop.
			std::mutex m_queue_lock;
			std::condition_variable m_condition;
			bool m_stop = false;

	};

	template<typename arg>
	thread_pool_arg<arg>::thread_pool_arg(size_t num_threads) {
		for (size_t i = 0; i < num_threads; i++) {
			m_workers.emplace_back([this]() {
				this->handle_work();
			});
		}
	}

	template<typename arg>
	thread_pool_arg<arg>::~thread_pool_arg() {
		run_all();
	}

	template<typename arg>
	void thread_pool_arg<arg>::enqueue(std::function<void(arg &)> &&fun) {

		{
			// m_stop is written under the lock by run_all(), so it has to be read under the lock as well.
			// The scoped guard also releases the mutex if the check throws or emplace fails.
			std::lock_guard<std::mutex> guard(m_queue_lock);
			if (m_stop) {
				throw std::runtime_error("enqueue on stopped thread_pool_arg not allowed");
			}
			m_queue.emplace(std::move(fun));
		}

		// Notify after releasing the lock so the woken worker can take it right away.
		m_condition.notify_one();
	}

	template<typename arg>
	void thread_pool_arg<arg>::run_all() {
		{
			std::lock_guard<std::mutex> guard(m_queue_lock);
			if (m_stop) return; // Already stopped..
			m_stop = true;
		}
		m_condition.notify_all();

		for (std::thread &thread : m_workers) {
			if (thread.joinable()) {
				thread.join();
			}
		}
	}

	template<typename arg>
	void thread_pool_arg<arg>::handle_work() {

		// One instance per worker thread, shared by every task this worker runs.
		arg a;
		while (true) {

			std::function<void(arg &)> task;

			{
				std::unique_lock<std::mutex> lock(m_queue_lock);
				m_condition.wait(lock, [this] {
					return m_stop || !m_queue.empty();
				});
				// Only exit once the queue has been drained, so run_all() really runs everything.
				if (m_stop && m_queue.empty()) return;
				task = std::move(m_queue.front());
				m_queue.pop();
			}

			// Run the task outside the lock so other workers can keep popping.
			task(a);
		}
	}
}


================================================
FILE: src/warc/tlds.h
================================================

#pragma once

#include <iostream>
#include <unordered_set>

namespace warc {

	// Two-label suffixes that are treated as a single top level domain.
	const std::unordered_set<std::string> double_tlds = {"co.uk"};

	// Top level domains that are accepted by the warc parser; records whose url has any other tld are skipped.
	// Entries must be plain lower case labels without dots or annotations (the list previously contained the
	// copy/paste artifact "institute[50]", which made every .institute url unmatchable).
	const std::unordered_set<std::string> tlds({
		"se",
		"com",
		"org",
		"net",
		"int",
		"edu",
		"gov",
		"mil",
		"ad",
		"as",
		"az",
		"bz",
		"cc",
		"cd",
		"co",
		"dj",
		"fm",
		"gg",
		"io",
		"la",
		"me",
		"ms",
		"nu",
		"sc",
		"tf",
		"tv",
		"ws",
		"ai",
		"au",
		"bm",
		"bs",
		"gi",
		"gu",
		"uk",
		"us",
		"sh",
		"ca",
		"to",
		"ac",
		"academy",
		"accountant",
		"accountants",
		"active",
		"actor",
		"ads",
		"adult",
		"aero",
		"africa",
		"agency",
		"airforce",
		"amazon",
		"analytics",
		"apartments",
		"app",
		"apple",
		"archi",
		"army",
		"art",
		"associates",
		"attorney",
		"auction",
		"audible",
		"audio",
		"author",
		"auto",
		"autos",
		"aws",
		"baby",
		"band",
		"bank",
		"bar",
		"barefoot",
		"bargains",
		"baseball",
		"basketball",
		"beauty",
		"beer",
		"best",
		"bestbuy",
		"bet",
		"bible",
		"bid",
		"bike",
		"bingo",
		"bio",
		"biz",
		"black",
		"blackfriday",
		"blockbuster",
		"blog",
		"blue",
		"boo",
		"book",
		"boots",
		"boston",
		"bot",
		"boutique",
		"box",
		"broadway",
		"broker",
		"build",
		"builders",
		"business",
		"buy",
		"buzz",
		"cab",
		"cafe",
		"call",
		"cam",
		"camera",
		"camp",
		"cancerresearch",
		"capital",
		"car",
		"cards",
		"care",
		"career",
		"careers",
		"cars",
		"case",
		"cash",
		"casino",
		"catering",
		"catholic",
		"center",
		"cern",
		"ceo",
		"cfd",
		"channel",
		"chat",
		"charity",
		"cheap",
		"christmas",
		"church",
		"circle",
		"city",
		"claims",
		"cleaning",
		"click",
		"clinic",
		"clothing",
		"cloud",
		"club",
		"coach",
		"codes",
		"coffee",
		"college",
		"community",
		"company",
		"compare",
		"computer",
		"condos",
		"construction",
		"consulting",
		"contact",
		"contractors",
		"cooking",
		"cool",
		"coop",
		"country",
		"coupon",
		"coupons",
		"courses",
		"cpa",
		"credit",
		"creditcard",
		"cruise",
		"cricket",
		"cruises",
		"cyou",
		"dad",
		"dance",
		"data",
		"date",
		"dating",
		"day",
		"deal",
		"deals",
		"degree",
		"delivery",
		"democrat",
		"dental",
		"dentist",
		"design",
		"dev",
		"diamonds",
		"diet",
		"digital",
		"direct",
		"directory",
		"discount",
		"diy",
		"docs",
		"doctor",
		"dog",
		"domains",
		"dot",
		"download",
		"drive",
		"duck",
		"earth",
		"eat",
		"eco",
		"education",
		"email",
		"energy",
		"engineer",
		"engineering",
		"edeka",
		"entertainment",
		"enterprises",
		"equipment",
		"esq",
		"estate",
		"events",
		"exchange",
		"expert",
		"exposed",
		"express",
		"fail",
		"faith",
		"family",
		"fan",
		"fans",
		"farm",
		"fashion",
		"fast",
		"feedback",
		"fiat",
		"film",
		"final",
		"finance",
		"financial",
		"fire",
		"fish",
		"fishing",
		"fit",
		"fitness",
		"flights",
		"florist",
		"flowers",
		"fly",
		"foo",
		"food",
		"foodnetwork",
		"football",
		"forsale",
		"forum",
		"foundation",
		"free",
		"frontdoor",
		"fun",
		"fund",
		"furniture",
		"fyi",
		"gallery",
		"game",
		"games",
		"garden",
		"gay",
		"gdn",
		"gift",
		"gifts",
		"gives",
		"glass",
		"gle",
		"global",
		"gold",
		"golf",
		"google",
		"gop",
		"graphics",
		"green",
		"gripe",
		"grocery",
		"group",
		"guide",
		"guitars",
		"guru",
		"hair",
		"hangout",
		"health",
		"healthcare",
		"help",
		"here",
		"hiphop",
		"hiv",
		"hockey",
		"holdings",
		"holiday",
		"homegoods",
		"homes",
		"homesense",
		"horse",
		"hospital",
		"host",
		"hosting",
		"hot",
		"hotels",
		"house",
		"how",
		"ice",
		"icu",
		"inc",
		"industries",
		"info",
		"ing",
		"ink",
		"institute",
		"insurance",
		"insure",
		"international",
		"investments",
		"irish",
		"jewelry",
		"jobs",
		"joy",
		"kim",
		"kitchen",
		"kosher",
		"land",
		"lat",
		"law",
		"lawyer",
		"lease",
		"leclerc",
		"legal",
		"lgbt",
		"life",
		"lifeinsurance",
		"lighting",
		"like",
		"limited",
		"limo",
		"link",
		"live",
		"living",
		"loan",
		"loans",
		"locker",
		"lol",
		"lotto",
		"love",
		"ltd",
		"luxury",
		"makeup",
		"management",
		"map",
		"market",
		"marketing",
		"markets",
		"mba",
		"med",
		"media",
		"meet",
		"meme",
		"memorial",
		"men",
		"menu",
		"mint",
		"mobi",
		"mobile",
		"mobily",
		"moe",
		"mom",
		"money",
		"monster",
		"mortgage",
		"motorcycles",
		"mov",
		"movie",
		"museum",
		"music",
		"name",
		"navy",
		"network",
		"new",
		"news",
		"ngo",
		"ninja",
		"now",
		"ntt",
		"observer",
		"off",
		"one",
		"ong",
		"onl",
		"online",
		"ooo",
		"open",
		"organic",
		"origins",
		"page",
		"partners",
		"parts",
		"party",
		"pay",
		"pet",
		"pharmacy",
		"phone",
		"photo",
		"photography",
		"photos",
		"physio",
		"pics",
		"pictures",
		"pid",
		"pin",
		"pink",
		"pizza",
		"place",
		"plumbing",
		"plus",
		"poker",
		"porn",
		"post",
		"press",
		"prime",
		"pro",
		"productions",
		"prof",
		"promo",
		"properties",
		"property",
		"protection",
		"pub",
		"qpon",
		"racing",
		"radio",
		"read",
		"realestate",
		"realtor",
		"realty",
		"recipes",
		"red",
		"rehab",
		"reit",
		"rent",
		"rentals",
		"repair",
		"report",
		"republican",
		"rest",
		"restaurant",
		"review",
		"reviews",
		"rich",
		"rip",
		"rocks",
		"rodeo",
		"room",
		"rugby",
		"run",
		"safe",
		"sale",
		"salon",
		"save",
		"sbi",
		"scholarships",
		"school",
		"science",
		"search",
		"secure",
		"security",
		"select",
		"services",
		"sex",
		"sexy",
		"shoes",
		"shop",
		"shopping",
		"show",
		"showtime",
		"silk",
		"singles",
		"site",
		"ski",
		"skin",
		"sky",
		"sling",
		"smile",
		"sncf",
		"soccer",
		"social",
		"software",
		"solar",
		"solutions",
		"song",
		"space",
		"spreadbetting",
		"spot",
		"sport",
		"storage",
		"store",
		"stream",
		"studio",
		"study",
		"style",
		"sucks",
		"supplies",
		"supply",
		"support",
		"surf",
		"surgery",
		"systems",
		"talk",
		"tattoo",
		"tax",
		"taxi",
		"team",
		"tech",
		"technology",
		"tel",
		"tennis",
		"theater",
		"theatre",
		"tickets",
		"tips",
		"tires",
		"today",
		"tools",
		"top",
		"tours",
		"town",
		"toys",
		"trade",
		"trading",
		"training",
		"travel",
		"travelersinsurance",
		"trust",
		"tube",
		"tunes",
		"uconnect",
		"university",
		"uno",
		"vacations",
		"ventures",
		"vet",
		"video",
		"villas",
		"vin",
		"vip",
		"vision",
		"vodka",
		"volvo",
		"vote",
		"voting",
		"voyage",
		"wang",
		"watch",
		"watches",
		"weather",
		"webcam",
		"website",
		"wed",
		"wedding",
		"whoswho",
		"wiki",
		"win",
		"wine",
		"winners",
		"work",
		"works",
		"world",
		"wow",
		"wtf",
		"xxx",
		"xyz",
		"yachts",
		"yoga",
		"you",
		"youtube",
		"zero",
		"zip",
		"zone"
	});
}


================================================
FILE: src/warc/warc.cpp
================================================

#include "warc.h"
#include "tlds.h"
#include "text/text.h"
#include "logger/logger.h"
#include "transfer/transfer.h"

using namespace std;

namespace warc {

	/*
	 * Allocates the two scratch buffers used for reading compressed input and for holding inflated output.
	 * They are released again in the destructor.
	 * */
	parser::parser()
	: m_z_buffer_in(new char[WARC_PARSER_ZLIB_IN]), m_z_buffer_out(new char[WARC_PARSER_ZLIB_OUT]) {
	}

	/*
	 * Releases the scratch buffers that were allocated in the constructor.
	 * */
	parser::~parser() {
		delete [] m_z_buffer_out;
		delete [] m_z_buffer_in;
	}

	/*
	 * Parses a gzipped warc stream and collects every accepted page into the internal result strings
	 * (see handle_html). Forwards to the callback overload of parse_stream.
	 * */
	bool parser::parse_stream(istream &stream) {
		auto collect_into_results = [this](const std::string &url, const ::parser::html_parser &html, const std::string &ip,
				const std::string &date) {
			handle_html(url, html, ip, date);
		};
		return parse_stream(stream, collect_into_results);
	}

	/*
	 * Reads the stream in blocks of WARC_PARSER_ZLIB_IN bytes, inflates them and invokes callback for every html
	 * page that passes the filters in parse_record.
	 *
	 * Stops early (after printing a message) when unzip_chunk reports a fatal error.
	 * NOTE: the function currently returns true in every case, also after a fatal error.
	 * */
	bool parser::parse_stream(std::istream &stream, std::function<void(const std::string &url, const ::parser::html_parser &html, const std::string &ip,
				const std::string &date)> callback) {
		m_callback = callback;
		while (stream.good()) {
			stream.read(m_z_buffer_in, WARC_PARSER_ZLIB_IN);

			// The last read of a stream is usually shorter than the buffer, gcount() tells how much we got.
			const auto bytes_read = stream.gcount();
			if (bytes_read <= 0) continue;

			if (unzip_chunk(bytes_read) < 0) {
				cout << "Stopped because fatal error" << endl;
				break;
			}
		}

		return true;
	}

	/*
	 * Default page handler. Appends one tab separated, newline terminated row for the page to m_result
	 * (url, title, h1, meta, text, date, ip) and one row per outgoing link to m_links
	 * (host, path, target host, target path, link text, nofollow flag as "1"/"0").
	 * */
	void parser::handle_html(const std::string &url, const ::parser::html_parser &html, const std::string &ip, const std::string &date) {

		const char separator = '\t';

		// Build the row locally and append it in one go.
		std::string page_row(url);
		page_row += separator;
		page_row += html.title();
		page_row += separator;
		page_row += html.h1();
		page_row += separator;
		page_row += html.meta();
		page_row += separator;
		page_row += html.text();
		page_row += separator;
		page_row += date;
		page_row += separator;
		page_row += ip;
		page_row += '\n';
		m_result += page_row;

		for (const auto &link : html.links()) {
			std::string link_row(link.host());
			link_row += separator;
			link_row += link.path();
			link_row += separator;
			link_row += link.target_host();
			link_row += separator;
			link_row += link.target_path();
			link_row += separator;
			link_row += link.text();
			link_row += separator;
			link_row += (link.nofollow() ? "1" : "0");
			link_row += '\n';
			m_links += link_row;
		}

		// internal links are too messy for us now.
		/*for (const auto &link : html.internal_links()) {
			// link is a std::pair<uint64_t, uint64_t>
			m_internal_links.append((char *)&link.first, sizeof(uint64_t));
			m_internal_links.append((char *)&link.second, sizeof(uint64_t));
		}*/

	}

	int parser::unzip_record(char *data, int size) {

		/*
			data is:
			#|------------------|-----|------------------------|--|----#-------|
			 |doc_a______________doc_b_doc_c_____|
								 WARC_PARSER_ZLIB_IN
			 |_________________________________________________________|
																   size
		*/

		int data_size = size;
		int consumed = 0, consumed_total = 0;
		int avail_in_before_inflate;
		int ret = Z_OK;
		unsigned have;

		if (!m_continue_inflate) {
			m_zstream.zalloc = Z_NULL;
			m_zstream.zfree = Z_NULL;
			m_zstream.opaque = Z_NULL;

			m_zstream.avail_in = 0;
			m_zstream.next_in = Z_NULL;

			int err = inflateInit2(&m_zstream, 16);
			if (err != Z_OK) {
				cout << "zlib error" << endl;
			}
		} else {
			// just continue on the last one.
		}

		/* decompress until deflate stream ends or end of file */
		do {

			m_zstream.next_in = (unsigned char *)(data + consumed_total);

			m_zstream.avail_in = min(WARC_PARSER_ZLIB_IN, data_size);

			if (m_zstream.avail_in == 0)
				break;

			/* run inflate() on input until output buffer not full */
			do {

				m_zstream.avail_out = WARC_PARSER_ZLIB_OUT;
				m_zstream.next_out = (unsigned char *)m_z_buffer_out;

				avail_in_before_inflate = m_zstream.avail_in;

				ret = inflate(&m_zstream, Z_NO_FLUSH);

				// consumed is the number of bytes read from input in this inflate
				consumed = (avail_in_before_inflate - m_zstream.avail_in);
				data_size -= consumed;
				consumed_total += consumed;
				assert(ret != Z_STREAM_ERROR);  /* state not clobbered */
				switch (ret) {
				case Z_BUF_ERROR:
					//cout << "Z_BUF_ERROR" << endl;
					// Not fatal, just keep going.
					break;
				case Z_NEED_DICT:
					ret = Z_DATA_ERROR;	 /* and fall through */
					cout << "Z_MEM_ERROR" << endl;
					(void)inflateEnd(&m_zstream);
					return -1;
				case Z_DATA_ERROR:
				case Z_MEM_ERROR:
					cout << "Z_MEM_ERROR" << endl;
					(void)inflateEnd(&m_zstream);
					return -1;
				}

				have = WARC_PARSER_ZLIB_OUT - m_zstream.avail_out;
				handle_record_chunk((char *)m_z_buffer_out, have);

			} while (m_zstream.avail_out == 0);

			if (data_size <= 0) {
				break;
			}

			/* done when inflate() says it's done */
		} while (ret != Z_STREAM_END);

		//cout << "ret: " << ret << endl;
		//cout << "Ending with code: " << ret << endl;
		if (ret == Z_OK || ret == Z_BUF_ERROR) {
			m_continue_inflate = true;
		} else {
			m_continue_inflate = false;
			(void)inflateEnd(&m_zstream);
		}

		/* clean up and return */
		return consumed_total;
	}

	/*
	 * Inflates the first bytes_in bytes of m_z_buffer_in. The buffer can hold several gzip members, so
	 * unzip_record is called repeatedly until everything is consumed.
	 *
	 * Returns 0 on success (also when unzip_record stops consuming input) and -1 on a fatal error.
	 * */
	int parser::unzip_chunk(int bytes_in) {

		char *cursor = m_z_buffer_in;
		int remaining = bytes_in;

		while (remaining > 0) {
			const int consumed = unzip_record(cursor, remaining);
			if (consumed < 0) {
				cout << "Encountered fatal error" << endl;
				return -1;
			}
			if (consumed == 0) {
				// No progress is possible on the rest of the buffer, stop instead of looping forever.
				cout << "Nothing consumed, done..." << endl;
				break;
			}
			cursor += consumed;
			remaining -= consumed;
		}

		return 0;
	}

	/*
	 * Handles unzipped data. The data pointer is either pointing to a new warc record or it is the continuation of a previous warc record.
	 *
	 * The data is collected in m_current_record. As soon as the record holds exactly as many content bytes as its
	 * Content-Length header announces, and the record is of type "response", it is passed to parse_record.
	 * Records without a parsable Content-Length are ignored.
	 * */
	void parser::handle_record_chunk(char *data, int len) {

		m_handled += len;
		m_num_handled++;

		// An empty chunk cannot change the record. Evaluating the unchanged record again would pass an already
		// parsed record to parse_record a second time.
		if (len <= 0) return;

		if (len > 8 && strncmp(data, "WARC/1.0", 8) == 0) {
			// data is the start of a warc record
			m_current_record.assign(data, len);
		} else {
			m_current_record.append(data, len);
		}

		// The warc header is not complete yet, wait for more data.
		if (m_current_record.find("\r\n\r\n") == string::npos) return;

		const string warc_header = get_warc_header(m_current_record);
		const string content_len_str = ::parser::get_http_header(warc_header, "Content-Length: ");

		size_t content_len = 0;
		try {
			content_len = stoull(content_len_str);
		} catch (const std::exception &) {
			// stoull throws on a missing or non numeric value. The input is crawled data, so a malformed
			// record must not abort the whole parse.
			return;
		}

		// The record is: header + "\r\n\r\n" + content + "\r\n\r\n", hence the 8 extra bytes.
		const size_t received_content = m_current_record.size() - (warc_header.size() + 8);

		if (content_len != received_content) return;

		const string type = ::parser::get_http_header(warc_header, "WARC-Type: ");
		if (type == "response") {
			parse_record(warc_header, m_current_record);
		}

	}

	void parser::parse_record(const string &warc_header, const string &warc_record) {

		const string url = ::parser::get_http_header(warc_header, "WARC-Target-URI: ");
		const string tld = m_html_parser.url_tld(url);

		if (tlds.count(tld) == 0) return;

		const string ip = ::parser::get_http_header(warc_header, "WARC-IP-Address: ");
		const string date = ::parser::get_http_header(warc_header, "WARC-Date: ");

		const size_t warc_response_start = warc_record.find("\r\n\r\n");
		const size_t response_body_start = warc_record.find("\r\n\r\n", warc_response_start + 4);

		string http_header = warc_record.substr(warc_response_start + 4, response_body_start - warc_response_start - 4);
		text::lower_case(http_header);

		//const size_t http_code = http_response_code(http_header);
		//const string location = ::parser::get_http_header(warc_header, "location: ");

		string html = warc_record.substr(response_body_start + 4);
		m_html_parser.parse(html, url);

		if (m_html_parser.should_insert()) {
			m_callback(url, m_html_parser, ip, date);
		}
	}

	/*
	 * Returns everything before the first empty line ("\r\n\r\n") of the record.
	 * If there is no empty line the whole record is returned, since find gives npos in that case.
	 * */
	string parser::get_warc_header(const string &record) {
		return record.substr(0, record.find("\r\n\r\n"));
	}

	/*
	 * Extracts the status code from a http header, i.e. the three characters after the first space
	 * ("HTTP/1.1 200 OK" gives 200).
	 *
	 * Returns 500 if the header has no space, if the code is not numeric or if it is outside of [100, 600).
	 * */
	size_t parser::http_response_code(const string &http_header) {
		const size_t return_on_invalid = 500;
		const size_t code_start = http_header.find(' ');
		if (code_start == string::npos) return return_on_invalid;

		size_t response_code = 0;
		try {
			response_code = stoull(http_header.substr(code_start + 1, 3));
		} catch (const std::exception &) {
			// stoull throws if there are no digits after the space, treat that as an invalid code.
			return return_on_invalid;
		}

		if (response_code < 100 || response_code >= 600) return return_on_invalid;

		return response_code;
	}

	void multipart_download(const string &url, const std::function<void(const string &chunk)> &callback) {

		int error;
		size_t content_len = transfer::head_content_length(url, error);

		if (error == transfer::ERROR) {
			throw std::runtime_error("Could not make HEAD request to: " + url);
		}

		const size_t max_parts = 50;
		const size_t max_retries = 3;

		size_t part = 1;
		size_t read_bytes = 0;
		while (read_bytes < content_len && part < max_parts) {
			size_t retry = 0;
			while (retry < max_retries) {
				string buffer;
				transfer::url_to_string(url + "?partNumber=" + to_string(part), buffer, error);
				if (error == transfer::OK) {
					read_bytes += buffer.size();
					callback(buffer);
					break;
				} else {
					throw std::runtime_error("Got error response");
				}
				retry++;
			}
			if (retry == max_retries) {
				break;
			}
			part++;
		}
	}

	/*
	 * Maps a warc path to the path of the parsed result by replacing the first ".warc.gz" with ".gz".
	 * Throws std::out_of_range (from string::replace) if warc_path does not contain ".warc.gz".
	 * */
	string get_result_path(const string &warc_path) {
		const string warc_suffix = ".warc.gz";
		string result(warc_path);
		result.replace(result.find(warc_suffix), warc_suffix.size(), ".gz");
		return result;
	}

	/*
	 * Maps a warc path to the path of the link result by replacing the first ".warc.gz" with ".links.gz".
	 * Throws std::out_of_range (from string::replace) if warc_path does not contain ".warc.gz".
	 * */
	string get_link_result_path(const string &warc_path) {
		const string warc_suffix = ".warc.gz";
		string result(warc_path);
		result.replace(result.find(warc_suffix), warc_suffix.size(), ".links.gz");
		return result;
	}

	/*
	 * Maps a warc path to the path of the internal link result by replacing the first ".warc.gz" with ".internal.gz".
	 * Throws std::out_of_range (from string::replace) if warc_path does not contain ".warc.gz".
	 * */
	string get_internal_link_result_path(const string &warc_path) {
		const string warc_suffix = ".warc.gz";
		string result(warc_path);
		result.replace(result.find(warc_suffix), warc_suffix.size(), ".internal.gz");
		return result;
	}

}


================================================
FILE: src/warc/warc.h
================================================

#pragma once

#include <iostream>
#include "parser/html_parser.h"
#include "parser/parser.h"
#include "zlib.h"

// Sizes in bytes (16 MiB each) of the buffers for compressed input and inflated output.
// The values are parenthesized so that the macros expand correctly inside larger expressions (for example x / WARC_PARSER_ZLIB_IN).
#define WARC_PARSER_ZLIB_IN (1024*1024*16)
#define WARC_PARSER_ZLIB_OUT (1024*1024*16)

namespace warc {

	using std::string;

	/*
	 * Streaming parser for gzipped warc files. Feed it a stream with parse_stream; every accepted html page is
	 * either passed to a user supplied callback or collected into the tab separated result strings.
	 *
	 * The parser owns two heap allocated scratch buffers, therefore it can not be copied.
	 * */
	class parser {

		public:

			parser();
			~parser();

			// Copying would make two parsers delete the same buffers (and share one zlib stream).
			parser(const parser &) = delete;
			parser &operator=(const parser &) = delete;

			// Parses the stream and collects the pages with handle_html.
			bool parse_stream(std::istream &stream);
			// Parses the stream and calls the given function for every accepted page.
			bool parse_stream(std::istream &stream, std::function<void(const std::string &url, const ::parser::html_parser &html, const std::string &ip,
						const std::string &date)>);
			// Rows collected by handle_html: one row per page, one row per link.
			const string &result() const { return m_result; };
			const string &link_result() const { return m_links; };
			const string &internal_link_result() const { return m_internal_links; };
			// Appends the page and its links to the result strings.
			void handle_html(const std::string &url, const ::parser::html_parser &html, const std::string &ip, const std::string &date);

		private:

			int m_cur_offset = 0;
			// True while a gzip member is only partly inflated and has to be continued with the next input block.
			bool m_continue_inflate = false;
			std::string m_result;
			std::string m_links;
			std::string m_internal_links;
			::parser::html_parser m_html_parser;
			std::function<void(const std::string &url, const ::parser::html_parser &html, const std::string &ip, const std::string &date)>
				m_callback;

			// Scratch buffers of WARC_PARSER_ZLIB_IN / WARC_PARSER_ZLIB_OUT bytes, owned by the parser.
			char *m_z_buffer_in;
			char *m_z_buffer_out;

			z_stream m_zstream; /* decompression stream */

			size_t m_handled = 0;
			size_t m_num_handled = 0;
			// The warc record that is currently being assembled from inflated chunks.
			string m_current_record;

			int unzip_record(char *data, int size);
			int unzip_chunk(int bytes_in);

			void handle_record_chunk(char *data, int len);
			void parse_record(const std::string &warc_header, const std::string &warc_record);
			std::string get_warc_header(const std::string &record);
			size_t http_response_code(const string &http_header);

	};

	void multipart_download(const string &url, const std::function<void(const string &chunk)> &callback);

	string get_result_path(const string &warc_path);
	string get_link_result_path(const string &warc_path);
	string get_internal_link_result_path(const string &warc_path);
}


================================================
FILE: tests/main.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#define BOOST_TEST_MODULE "Unit tests for alexandria.org"

#define BOOST_TEST_NO_MAIN
#define BOOST_TEST_DYN_LINK
#include <boost/test/unit_test.hpp>
#include <boost/test/tools/floating_point_comparison.hpp>

#include "config.h"
#include "logger/logger.h"

#include <iostream>
#include <stdlib.h>
#include <fstream>
#include <streambuf>
#include <math.h>
#include <vector>
#include <set>
#include <map>

using std::string;
using std::vector;
using std::ifstream;
using std::stringstream;
using std::set;
using std::map;
using std::pair;

// Global setup that runs once before the Boost test runner: reads the test configuration and starts the logger thread.
void run_before() {
	// The path is relative, so the test binary has to be started from a directory where ../tests exists.
	config::read_config("../tests/test_config.conf");
	logger::start_logger_thread();
}

// Global teardown that runs once after all tests: joins the logger thread started in run_before.
void run_after() {
	logger::join_logger_thread();
}

// Custom entry point (BOOST_TEST_NO_MAIN is defined above): wraps the Boost test runner in the global
// setup and teardown and returns the exit code of the runner.
int BOOST_TEST_CALL_DECL
main(int argc, char* argv[]) {

	run_before();
	const int exit_code = ::boost::unit_test::unit_test_main(&init_unit_test, argc, argv);
	run_after();

	return exit_code;
}


================================================
FILE: tests/test_algorithm.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <boost/test/unit_test.hpp>
#include "algorithm/algorithm.h"
#include "algorithm/intersection.h"
#include "algorithm/hyper_ball.h"

using namespace std;

BOOST_AUTO_TEST_SUITE(test_algorithm)

// Tests algorithm::intersection on sorted int lists and on a custom type with a merge callback.
BOOST_AUTO_TEST_CASE(intersection_test) {

	// Only 2 and 3 are present in all three lists.
	{
		const vector<int> result = algorithm::intersection<int>({
			{1, 2, 3},
			{2, 3},
			{2, 3, 4}
		});

		BOOST_CHECK_EQUAL(2, result.size());
		BOOST_CHECK_EQUAL(2, result[0]);
		BOOST_CHECK_EQUAL(3, result[1]);
	}

	// 2, 3 and 5 are common, 4 and 7 are not.
	{
		const vector<int> result = algorithm::intersection<int>({
			{1, 2, 3, 5},
			{2, 3, 5, 7},
			{2, 3, 4, 5}
		});

		BOOST_CHECK_EQUAL(3, result.size());
		BOOST_CHECK_EQUAL(2, result[0]);
		BOOST_CHECK_EQUAL(3, result[1]);
		BOOST_CHECK_EQUAL(5, result[2]);
	}

	// No input lists at all gives an empty result.
	{
		const vector<int> result = algorithm::intersection<int>({});

		BOOST_CHECK_EQUAL(0, result.size());
	}

	// One list has nothing in common with the others, so the intersection is empty.
	{
		const vector<int> result = algorithm::intersection<int>({
			{1, 2, 3, 5, 6, 7, 8},
			{9, 10},
			{1, 2, 3, 4, 5}
		});

		BOOST_CHECK_EQUAL(0, result.size());
	}

	// Custom type: elements are matched on m_v only, the callback adds up m_s of the matching elements.
	{

		class T {
			public:
			size_t m_v;
			float m_s;

			T(size_t v, float s) : m_v(v), m_s(s) {}

			bool operator<(const T &other) const {
				return m_v < other.m_v;
			}

			bool operator==(const T &other) const {
				return m_v == other.m_v;
			}

		};
		const vector<T> result = algorithm::intersection<T>({
			{T(1, 1.0f), T(2, 1.0f), T(3, 1.0f), T(4, 1.0f)},
			{T(3, 2.0f), T(4, 2.0f), T(5, 2.0f)},
			{T(4, 3.0f), T(5, 3.0f), T(6, 3.0f), T(7, 3.0f), T(8, 3.0f)}
		}, [](T &a, const T &b) {
			return a.m_s += b.m_s;
		});

		// Only m_v == 4 is in all three lists, its score is 1 + 2 + 3.
		BOOST_CHECK_EQUAL(1, result.size());
		BOOST_CHECK_EQUAL(result[0].m_v, 4);
		BOOST_CHECK_EQUAL(result[0].m_s, 6.0f);
	}
}

// Tests algorithm::incremental_partitions. The first argument holds the size of each dimension, the second
// argument caps the number of returned index combinations (see the {3, 3} case with limit 5).
BOOST_AUTO_TEST_CASE(incremental_partitions) {

	// One dimension: one combination per index.
	{
		vector<vector<int>> res = algorithm::incremental_partitions({5}, 64);
		BOOST_CHECK_EQUAL(res.size(), 5);
	}
	{
		vector<vector<int>> res = algorithm::incremental_partitions({6}, 64);
		BOOST_CHECK_EQUAL(res.size(), 6);
	}
	{
		vector<vector<int>> res = algorithm::incremental_partitions({3}, 64);
		BOOST_CHECK_EQUAL(res.size(), 3);
		BOOST_CHECK(res[0] == vector<int>{0});
		BOOST_CHECK(res[1] == vector<int>{1});
		BOOST_CHECK(res[2] == vector<int>{2});
	}

	// Two dimensions: all combinations are returned, the expected order is pinned below.
	{
		vector<vector<int>> res = algorithm::incremental_partitions({2, 2}, 64);
		BOOST_CHECK_EQUAL(res.size(), 4);
		BOOST_CHECK((res[0] == vector<int>{0, 0}));
		BOOST_CHECK((res[1] == vector<int>{1, 0}));
		BOOST_CHECK((res[2] == vector<int>{0, 1}));
		BOOST_CHECK((res[3] == vector<int>{1, 1}));
	}
	{
		vector<vector<int>> res = algorithm::incremental_partitions({3, 3}, 64);
		BOOST_CHECK_EQUAL(res.size(), 9);
		BOOST_CHECK((res[0] == vector<int>{0, 0}));
		BOOST_CHECK((res[1] == vector<int>{1, 0}));
		BOOST_CHECK((res[2] == vector<int>{0, 1}));
		BOOST_CHECK((res[3] == vector<int>{1, 1}));
		BOOST_CHECK((res[4] == vector<int>{2, 0}));
		BOOST_CHECK((res[5] == vector<int>{0, 2}));
		BOOST_CHECK((res[6] == vector<int>{2, 1}));
		BOOST_CHECK((res[7] == vector<int>{1, 2}));
		BOOST_CHECK((res[8] == vector<int>{2, 2}));
	}
	// Same input with limit 5: only the first five combinations of the case above are returned.
	{
		vector<vector<int>> res = algorithm::incremental_partitions({3, 3}, 5);
		BOOST_CHECK_EQUAL(res.size(), 5);
		BOOST_CHECK((res[0] == vector<int>{0, 0}));
		BOOST_CHECK((res[1] == vector<int>{1, 0}));
		BOOST_CHECK((res[2] == vector<int>{0, 1}));
		BOOST_CHECK((res[3] == vector<int>{1, 1}));
		BOOST_CHECK((res[4] == vector<int>{2, 0}));
	}
	// Three dimensions: 27 combinations, only the order of the first 16 is checked.
	{
		vector<vector<int>> res = algorithm::incremental_partitions({3, 3, 3}, 64);
		BOOST_CHECK_EQUAL(res.size(), 27);
		BOOST_CHECK((res[0] == vector<int>{0, 0, 0}));
		BOOST_CHECK((res[1] == vector<int>{1, 0, 0}));
		BOOST_CHECK((res[2] == vector<int>{0, 1, 0}));
		BOOST_CHECK((res[3] == vector<int>{0, 0, 1}));
		BOOST_CHECK((res[4] == vector<int>{1, 1, 0}));
		BOOST_CHECK((res[5] == vector<int>{1, 0, 1}));
		BOOST_CHECK((res[6] == vector<int>{0, 1, 1}));
		BOOST_CHECK((res[7] == vector<int>{2, 0, 0}));
		BOOST_CHECK((res[8] == vector<int>{0, 2, 0}));
		BOOST_CHECK((res[9] == vector<int>{0, 0, 2}));
		BOOST_CHECK((res[10] == vector<int>{1, 1, 1}));
		BOOST_CHECK((res[11] == vector<int>{2, 1, 0}));
		BOOST_CHECK((res[12] == vector<int>{2, 0, 1}));
		BOOST_CHECK((res[13] == vector<int>{1, 2, 0}));
		BOOST_CHECK((res[14] == vector<int>{1, 0, 2}));
		BOOST_CHECK((res[15] == vector<int>{0, 2, 1}));
	}
	// Dimensions of different size.
	{
		vector<vector<int>> res = algorithm::incremental_partitions({2, 3}, 64);
		BOOST_CHECK_EQUAL(res.size(), 6);
		BOOST_CHECK((res[0] == vector<int>{0, 0}));
		BOOST_CHECK((res[1] == vector<int>{1, 0}));
		BOOST_CHECK((res[2] == vector<int>{0, 1}));
		BOOST_CHECK((res[3] == vector<int>{1, 1}));
		BOOST_CHECK((res[4] == vector<int>{0, 2}));
		BOOST_CHECK((res[5] == vector<int>{1, 2}));
	}

}

// Tests algorithm::harmonic_centrality on small directed graphs given as edge sets.
// The arguments are: number of vertices, edges, and a third parameter that is 6 in all cases
// (NOTE(review): presumably the search depth - confirm against algorithm.h).
BOOST_AUTO_TEST_CASE(harmonic_centrality) {
	// Directed cycle with three vertices: every vertex gets the same value 1.5.
	{
		set<pair<uint32_t, uint32_t>> e = {std::make_pair(0, 1), std::make_pair(1, 2), std::make_pair(2, 0)};
		vector<double> h = algorithm::harmonic_centrality(3, e, 6);
		BOOST_CHECK(h.size() == 3);
		BOOST_CHECK((h == vector<double>{1.5, 1.5, 1.5}));
	}

	// Larger graph where vertex 6 has no edges at all and must get 0.
	{
		set<pair<uint32_t, uint32_t>> e = {
			std::make_pair(0, 1),
			std::make_pair(1, 2),
			std::make_pair(2, 0),
			std::make_pair(2, 3),
			std::make_pair(3, 4),
			std::make_pair(3, 5),
			std::make_pair(4, 2),
			std::make_pair(5, 4),
		};
		vector<double> h = algorithm::harmonic_centrality(7, e, 6);
		BOOST_CHECK(h.size() == 7);
		BOOST_CHECK_CLOSE(h[0], 8.0/3.0, 0.000001);
		BOOST_CHECK_CLOSE(h[1], 7.0/3.0, 0.000001);
		BOOST_CHECK_CLOSE(h[2], 7.0/2.0, 0.000001);
		BOOST_CHECK_EQUAL(h[6], 0.0);
	}

	// Star graph: all seven other vertices have an edge to vertex 1, which gets the value 7.
	{
		set<pair<uint32_t, uint32_t>> e = {
			std::make_pair(0, 1),
			std::make_pair(1, 2),
			std::make_pair(2, 1),
			std::make_pair(3, 1),
			std::make_pair(4, 1),
			std::make_pair(5, 1),
			std::make_pair(6, 1),
			std::make_pair(7, 1),
		};
		vector<double> h = algorithm::harmonic_centrality(8, e, 6);
		BOOST_CHECK(h.size() == 8);
		BOOST_CHECK_CLOSE(h[1], 7, 0.000001);
	}
}

// Same graphs and expected values as the harmonic_centrality test case, but computed with
// algorithm::harmonic_centrality_threaded. The extra last argument differs per case (3, 2 and 1)
// (NOTE(review): presumably the number of threads - confirm against algorithm.h).
BOOST_AUTO_TEST_CASE(harmonic_centrality_threaded) {
	// Directed cycle with three vertices.
	{
		set<pair<uint32_t, uint32_t>> e = {std::make_pair(0, 1), std::make_pair(1, 2), std::make_pair(2, 0)};
		vector<double> h = algorithm::harmonic_centrality_threaded(3, e, 6, 3);
		BOOST_CHECK(h.size() == 3);
		BOOST_CHECK((h == vector<double>{1.5, 1.5, 1.5}));
	}

	// Larger graph where vertex 6 has no edges at all and must get 0.
	{
		set<pair<uint32_t, uint32_t>> e = {
			std::make_pair(0, 1),
			std::make_pair(1, 2),
			std::make_pair(2, 0),
			std::make_pair(2, 3),
			std::make_pair(3, 4),
			std::make_pair(3, 5),
			std::make_pair(4, 2),
			std::make_pair(5, 4),
		};
		vector<double> h = algorithm::harmonic_centrality_threaded(7, e, 6, 2);
		BOOST_CHECK(h.size() == 7);
		BOOST_CHECK_CLOSE(h[0], 8.0/3.0, 0.000001);
		BOOST_CHECK_CLOSE(h[1], 7.0/3.0, 0.000001);
		BOOST_CHECK_CLOSE(h[2], 7.0/2.0, 0.000001);
		BOOST_CHECK_EQUAL(h[6], 0.0);
	}

	// Star graph: all seven other vertices have an edge to vertex 1.
	{
		set<pair<uint32_t, uint32_t>> e = {
			std::make_pair(0, 1),
			std::make_pair(1, 2),
			std::make_pair(2, 1),
			std::make_pair(3, 1),
			std::make_pair(4, 1),
			std::make_pair(5, 1),
			std::make_pair(6, 1),
			std::make_pair(7, 1),
		};
		vector<double> h = algorithm::harmonic_centrality_threaded(8, e, 6, 1);
		BOOST_CHECK(h.size() == 8);
		BOOST_CHECK_CLOSE(h[1], 7, 0.000001);
	}
}

BOOST_AUTO_TEST_SUITE_END()


================================================
FILE: tests/test_bloom_filter.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <boost/test/unit_test.hpp>
#include <fstream>
#include "algorithm/bloom_filter.h"
#include "algorithm/hash.h"

using namespace std;

BOOST_AUTO_TEST_SUITE(test_bloom_filter)

BOOST_AUTO_TEST_CASE(test_bloom_filter) {
	// Membership must be reported only for keys that have been inserted.
	algorithm::bloom_filter filter;

	// After the first insert only "test" is expected to be present.
	filter.insert("test");
	BOOST_CHECK(filter.exists("test"));
	BOOST_CHECK(!filter.exists("test2"));

	// Inserting the second key makes it visible as well.
	filter.insert("test2");
	BOOST_CHECK(filter.exists("test2"));
}

BOOST_AUTO_TEST_CASE(test_bloom_filter_merge) {

	// First filter holds "test1" and "test2".
	algorithm::bloom_filter merged;
	merged.insert("test1");
	merged.insert("test2");

	// Second filter holds a disjoint pair of keys.
	algorithm::bloom_filter addition;
	addition.insert("test3");
	addition.insert("test4");

	// Fold the second filter into the first one.
	merged.merge(addition);

	// Keys from both inputs must be found after the merge.
	BOOST_CHECK(merged.exists("test1"));
	BOOST_CHECK(merged.exists("test2"));
	BOOST_CHECK(merged.exists("test3"));
	BOOST_CHECK(merged.exists("test4"));

	// Keys that were never inserted must still be reported as missing.
	BOOST_CHECK(!merged.exists("test0"));
	BOOST_CHECK(!merged.exists("test5"));
	BOOST_CHECK(!merged.exists("random"));
	BOOST_CHECK(!merged.exists("random2"));
}

BOOST_AUTO_TEST_CASE(test_bloom_filter_save) {
	// Writer scope: populate a filter and persist it to /tmp/bloom.
	{
		algorithm::bloom_filter writer;
		writer.insert("test1");
		writer.insert("test2");
		writer.write_file("/tmp/bloom");
	}

	// Reader scope: a fresh filter loaded from the same file must report
	// exactly the keys that were inserted before writing.
	{
		algorithm::bloom_filter reader;
		reader.read_file("/tmp/bloom");

		BOOST_CHECK(reader.exists("test1"));
		BOOST_CHECK(reader.exists("test2"));
		BOOST_CHECK(!reader.exists("test3"));
	}
}

BOOST_AUTO_TEST_SUITE_END()


================================================
FILE: tests/test_cc_parser.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <boost/test/unit_test.hpp>
#include "config.h"
#include "warc/warc.h"
#include "URL.h"
#include "parser/cc_parser.h"

using namespace std;

BOOST_AUTO_TEST_SUITE(cc_parser)

BOOST_AUTO_TEST_CASE(download_warc_paths) {
	// Step 1: the downloaded list is expected to start out empty; add two
	// entries locally and upload the result.
	{
		vector<string> initial_paths = parser::download_warc_paths();
		BOOST_CHECK_EQUAL(initial_paths.size(), 0);

		initial_paths.push_back("test_path/testing1");
		initial_paths.push_back("test_path/testing2");

		BOOST_CHECK(parser::upload_warc_paths(initial_paths));
	}

	// Step 2: a second download must return exactly the two uploaded entries,
	// in the order they were added.
	{
		vector<string> stored_paths = parser::download_warc_paths();
		BOOST_CHECK_EQUAL(stored_paths.size(), 2);
		BOOST_CHECK_EQUAL(stored_paths[0], "test_path/testing1");
		BOOST_CHECK_EQUAL(stored_paths[1], "test_path/testing2");
	}

	// Step 3: upload an empty list again (presumably so the next run starts
	// from the empty state that step 1 expects).
	BOOST_CHECK(parser::upload_warc_paths({}));
}

BOOST_AUTO_TEST_CASE(download_warc) {
	// This test case currently performs no checks: the only code it contained
	// is disabled below because the remote fixture it downloaded is no longer
	// available.
	// This amazon bucket is gone
	/*
	string buffer;
	warc::multipart_download("http://alexandria-test-data.s3.amazonaws.com/multipart_test", [&buffer](const string &data) {
		buffer.append(data);
	});

	BOOST_CHECK_EQUAL(buffer.size(), 15728640);
	BOOST_CHECK_EQUAL(algorithm::hash(buffer), 1803966798292769636ull);
	*/
}

BOOST_AUTO_TEST_CASE(parse_cc_batch) {
	// Parses the bokus_test.warc.gz fixture and verifies both the page rows
	// (pp.result()) and the link rows (pp.link_result()). Both outputs are read
	// line by line and split on tab characters.
	ifstream infile(config::test_data_path + "bokus_test.warc.gz", std::ios::binary);

	warc::parser pp;
	pp.parse_stream(infile);

	{
		stringstream ss(pp.result());
		string line;
		bool found_url = false;
		while (getline(ss, line)) {
			vector<string> cols;
			boost::algorithm::split(cols, line, boost::is_any_of("\t"));

			if (!cols.empty() && cols[0] == "https://www.bokus.com/recension/670934") {
				// Columns 1..6 are indexed below. Abort the test case with a
				// normal failure if the row is too short, rather than reading
				// past the end of the vector.
				BOOST_REQUIRE(cols.size() >= 7);

				// substr lengths are byte counts; the text is presumably UTF-8,
				// so "ä" occupies two bytes.
				BOOST_CHECK(cols[1].substr(0, 26) == "Mycket intressant läsning");
				BOOST_CHECK(cols[2].substr(0, 25) == "Recension av Lena Klippvi");
				BOOST_CHECK(cols[3].substr(0, 25) == "Mycket intressant läsnin");
				BOOST_CHECK(cols[4].substr(0, 120) == "Recenserad produkt Los Angeles's Original Farmers Market Häftad (Trade Paper) Mycket intressant läsning om hur Farmers");
				BOOST_CHECK(cols[5] == "2021-07-31T20:08:45Z");
				BOOST_CHECK(cols[6] == "213.187.205.190");
				found_url = true;
			}
		}
		// The expected URL must have appeared in at least one row.
		BOOST_CHECK(found_url);
	}

	{
		stringstream ss(pp.link_result());
		string line;
		int links_found = 0;
		while (getline(ss, line)) {
			vector<string> cols;
			boost::algorithm::split(cols, line, boost::is_any_of("\t"));

			// Only the first link row is inspected in detail.
			if (links_found == 0) {
				// Columns 0..4 are indexed below; fail cleanly on a short row.
				BOOST_REQUIRE(cols.size() >= 5);

				BOOST_CHECK(cols[0] == "bokus.com");
				BOOST_CHECK(cols[1] == "/recension/670934");
				BOOST_CHECK(cols[2] == "help.bokus.com");
				BOOST_CHECK(cols[3] == "/");
				BOOST_CHECK(cols[4] == "Vanliga frågor & svar");
			}
			links_found++;
		}
		BOOST_CHECK_EQUAL(links_found, 8);
	}

	// Disabled checks of the binary internal link output.
	/*{
		const char *internal_links = pp.internal_link_result().c_str();
		{
			const uint64_t hash1 = *((uint64_t *)&internal_links[0]);
			const uint64_t hash2 = *((uint64_t *)&internal_links[8]);
			BOOST_CHECK_EQUAL(hash1, URL("https://www.bokus.com/recension/670934").hash());
			BOOST_CHECK_EQUAL(hash2, URL("https://www.bokus.com/cgi-bin/logout_user_info.cgi").hash());
		}
		{
			const uint64_t hash1 = *((uint64_t *)&internal_links[16]);
			const uint64_t hash2 = *((uint64_t *)&internal_links[24]);
			BOOST_CHECK_EQUAL(hash1, URL("https://www.bokus.com/recension/670934").hash());
			BOOST_CHECK_EQUAL(hash2, URL("https://www.bokus.com/cgi-bin/log_in_real.cgi").hash());
		}
	}*/
}

BOOST_AUTO_TEST_CASE(parse_cc_batch_multistream) {

	// Reference output: parse the whole fixture in a single parse_stream call.
	string reference;
	{
		warc::parser single_pass;
		ifstream whole_file(config::test_data_path + "warc_test.gz", std::ios::binary);
		single_pass.parse_stream(whole_file);

		reference = single_pass.result();
	}

	// The same fixture split into ten pieces with suffixes .aa through .aj.
	vector<string> chunk_files;
	for (char suffix = 'a'; suffix <= 'j'; suffix++) {
		chunk_files.push_back(config::test_data_path + "warc_test.gz.a" + suffix);
	}

	// Feed the pieces, in order, to one parser instance.
	warc::parser chunked;

	for (const string &chunk_file : chunk_files) {
		ifstream chunk_stream(chunk_file, std::ios::binary);
		chunked.parse_stream(chunk_stream);
	}

	// Only the size of the two outputs is compared.
	BOOST_CHECK_EQUAL(chunked.result().size(), reference.size());
}

BOOST_AUTO_TEST_CASE(parse_cc_batch_301) {
	// Placeholder test case: the body is empty, so no assertions are made yet.

}

BOOST_AUTO_TEST_SUITE_END()


================================================
FILE: tests/test_config.conf
================================================

# Cluster config
nodes_in_cluster = 3
node_id = 0
url_store_host = "http://localhost";

data_path = .

index_snippets = 1

# Indexer config
batches[] = ALEXANDRIA-MANUAL-01
batches[] = CC-MAIN-2021-25
batches[] = CC-MAIN-2021-31

link_batches[] = CC-MAIN-2021-31
link_batches[] = CC-MAIN-2021-25
link_batches[] = CC-MAIN-2021-21
link_batches[] = CC-MAIN-2021-17
link_batches[] = CC-MAIN-2021-10
link_batches[] = CC-MAIN-2021-04
link_batches[] = CC-MAIN-2020-50
link_batches[] = CC-MAIN-2020-45

# Server config
worker_count = 8
query_max_words = 10 # Maximum number of words used in query.
query_max_len = 200
deduplicate_domain_count = 5
pre_result_limit = 200000
result_limit = 1000

# Full text config
ft_max_sections = 4
ft_max_results_per_section = 2000000

n_grams = 1
shard_hash_table_size = 100000


================================================
FILE: tests/test_config2.conf
================================================

# Cluster config
nodes_in_cluster = 8;
node_id = 1;

index_snippets = 0

# Indexer config
batches[] = ALEXANDRIA-MANUAL-02
batches[] = CC-MAIN-2021-20
batches[] = CC-MAIN-2021-30

link_batches[] = CC-MAIN-2021-30
link_batches[] = CC-MAIN-2021-20
link_batches[] = CC-MAIN-2021-20
link_batches[] = CC-MAIN-2021-10
link_batches[] = CC-MAIN-2021-11
link_batches[] = CC-MAIN-2021-00
link_batches[] = CC-MAIN-2020-51
link_batches[] = CC-MAIN-2020-40

# Server config
worker_count = 9
query_max_words = 100 # Maximum number of words used in query.
query_max_len = 0
deduplicate_domain_count = 5000
pre_result_limit = 2
result_limit = 10

# Full text config
ft_max_sections = 2
ft_max_results_per_section = 20

n_grams = 5
shard_hash_table_size = 100000


================================================
FILE: tests/test_configuration.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <boost/test/unit_test.hpp>
#include "config.h"

using namespace std;

BOOST_AUTO_TEST_SUITE(test_config)

BOOST_AUTO_TEST_CASE(read_config) {
	// --- First configuration file -------------------------------------------
	config::read_config("../tests/test_config.conf");

	// Cluster settings.
	BOOST_CHECK_EQUAL(config::nodes_in_cluster, 3);
	BOOST_CHECK_EQUAL(config::node_id, 0);

	// Indexer settings: both batch lists must match in content and order.
	const vector<string> expected_batches{
		"ALEXANDRIA-MANUAL-01",
		"CC-MAIN-2021-25",
		"CC-MAIN-2021-31"
	};
	BOOST_CHECK(config::batches == expected_batches);

	const vector<string> expected_link_batches{
		"CC-MAIN-2021-31",
		"CC-MAIN-2021-25",
		"CC-MAIN-2021-21",
		"CC-MAIN-2021-17",
		"CC-MAIN-2021-10",
		"CC-MAIN-2021-04",
		"CC-MAIN-2020-50",
		"CC-MAIN-2020-45"
	};
	BOOST_CHECK(config::link_batches == expected_link_batches);

	// Server and full text settings.
	BOOST_CHECK_EQUAL(config::worker_count, 8);
	BOOST_CHECK_EQUAL(config::query_max_words, 10);
	BOOST_CHECK_EQUAL(config::query_max_len, 200);
	BOOST_CHECK_EQUAL(config::deduplicate_domain_count, 5);
	BOOST_CHECK_EQUAL(config::pre_result_limit, 200000);
	BOOST_CHECK_EQUAL(config::result_limit, 1000);
	BOOST_CHECK_EQUAL(config::ft_max_sections, 4);
	BOOST_CHECK_EQUAL(config::ft_max_results_per_section, 2000000);

	// --- Second configuration file: every value must be replaced ------------
	config::read_config("../tests/test_config2.conf");

	BOOST_CHECK_EQUAL(config::nodes_in_cluster, 8);
	BOOST_CHECK_EQUAL(config::node_id, 1);

	const vector<string> expected_batches2{
		"ALEXANDRIA-MANUAL-02",
		"CC-MAIN-2021-20",
		"CC-MAIN-2021-30"
	};
	BOOST_CHECK(config::batches == expected_batches2);

	const vector<string> expected_link_batches2{
		"CC-MAIN-2021-30",
		"CC-MAIN-2021-20",
		"CC-MAIN-2021-20",
		"CC-MAIN-2021-10",
		"CC-MAIN-2021-11",
		"CC-MAIN-2021-00",
		"CC-MAIN-2020-51",
		"CC-MAIN-2020-40"
	};
	BOOST_CHECK(config::link_batches == expected_link_batches2);

	BOOST_CHECK_EQUAL(config::worker_count, 9);
	BOOST_CHECK_EQUAL(config::query_max_words, 100);
	BOOST_CHECK_EQUAL(config::query_max_len, 0);
	BOOST_CHECK_EQUAL(config::deduplicate_domain_count, 5000);
	BOOST_CHECK_EQUAL(config::pre_result_limit, 2);
	BOOST_CHECK_EQUAL(config::result_limit, 10);
	BOOST_CHECK_EQUAL(config::ft_max_sections, 2);
	BOOST_CHECK_EQUAL(config::ft_max_results_per_section, 20);

	BOOST_CHECK_EQUAL(config::n_grams, 5);
	BOOST_CHECK_EQUAL(config::index_snippets, false);

	// Load the first file again so the values from the second file do not
	// remain in the global config after this test case.
	config::read_config("../tests/test_config.conf");
}

BOOST_AUTO_TEST_SUITE_END()


================================================
FILE: tests/test_counted_index_builder.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <boost/test/unit_test.hpp>
#include "indexer/basic_index_builder.h"
#include "indexer/basic_index.h"
#include "indexer/counted_record.h"
#include "indexer/sharded_builder.h"
#include "indexer/sharded.h"

using namespace indexer;

BOOST_AUTO_TEST_SUITE(test_basic_index_builder)

BOOST_AUTO_TEST_CASE(test_case_1) {

	// Build phase: store a single record under key 101.
	{
		basic_index_builder<counted_record> builder("test_index", 0);

		builder.truncate();

		builder.add(101, counted_record(1000, 1.0f));

		builder.append();
		builder.merge();
	}

	// Read phase: the record comes back with a count of one.
	{
		basic_index<counted_record> reader("test_index", 0);

		std::vector<counted_record> found = reader.find(101);
		BOOST_REQUIRE(found.size() == 1);
		BOOST_CHECK(found[0].m_value == 1000);
		BOOST_CHECK(found[0].m_count == 1);
	}

}

BOOST_AUTO_TEST_CASE(test_case_2) {

	// Build phase: add the same value twice under key 101.
	{
		basic_index_builder<counted_record> builder("test_index", 0);

		builder.truncate();

		builder.add(101, counted_record(1000));
		builder.add(101, counted_record(1000));

		builder.append();
		builder.merge();
	}

	// Read phase: the duplicates collapse into one record with count two.
	{
		basic_index<counted_record> reader("test_index", 0);

		std::vector<counted_record> found = reader.find(101);
		BOOST_REQUIRE(found.size() == 1);
		BOOST_CHECK(found[0].m_value == 1000);
		BOOST_CHECK(found[0].m_count == 2);
	}

}

BOOST_AUTO_TEST_CASE(test_case_3) {

	// Build phase: value 1000 is added twice, value 1001 once, all under key 101.
	{
		basic_index_builder<counted_record> builder("test_index", 0);

		builder.truncate();

		builder.add(101, counted_record(1000));
		builder.add(101, counted_record(1001));
		builder.add(101, counted_record(1000));

		builder.append();
		builder.merge();
	}

	// Read phase: two distinct records, each carrying its own count.
	{
		basic_index<counted_record> reader("test_index", 0);

		std::vector<counted_record> found = reader.find(101);
		BOOST_REQUIRE(found.size() == 2);
		BOOST_CHECK(found[0].m_value == 1000);
		BOOST_CHECK(found[0].m_count == 2);
		BOOST_CHECK(found[1].m_value == 1001);
		BOOST_CHECK(found[1].m_count == 1);
	}

}

BOOST_AUTO_TEST_CASE(test_case_4) {

	// Build phase: same scenario as the unsharded tests, but through the
	// sharded builder, plus one extra record under key 102.
	{
		sharded_builder<basic_index_builder, counted_record> builder("test_index", 10);

		builder.truncate();

		builder.add(101, counted_record(1000));
		builder.add(101, counted_record(1001));
		builder.add(101, counted_record(1000));
		builder.add(102, counted_record(1002));

		builder.append();
		builder.merge();

		// Three distinct values were added in total (1000, 1001, 1002).
		BOOST_CHECK(builder.document_count() == 3);
	}

	// Read phase: key 101 yields two records with their counts.
	{
		sharded<basic_index, counted_record> reader("test_index", 10);

		std::vector<counted_record> found = reader.find(101);
		BOOST_REQUIRE(found.size() == 2);
		BOOST_CHECK(found[0].m_value == 1000);
		BOOST_CHECK(found[0].m_count == 2);
		BOOST_CHECK(found[1].m_value == 1001);
		BOOST_CHECK(found[1].m_count == 1);
	}

}

BOOST_AUTO_TEST_SUITE_END()


================================================
FILE: tests/test_datetime.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "common/datetime.h"

BOOST_AUTO_TEST_SUITE(test_datetime)

BOOST_AUTO_TEST_CASE(cur_date) {
	// No assertions are made here: the body only contains disabled debug
	// prints of the current date/time helpers.
	/*std::cout << System::cur_date() << std::endl;
	std::cout << System::cur_datetime() << std::endl;
	std::cout << System::iso8601_datetime() << std::endl;*/
}

BOOST_AUTO_TEST_SUITE_END()


================================================
FILE: tests/test_file.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <boost/test/unit_test.hpp>
#include "transfer/transfer.h"
#include "text/text.h"
#include "file/file.h"
#include "file/tsv_file_remote.h"
#include "file/tsv_file.h"
#include "file/archive.h"
#include "algorithm/hash.h"
#include "config.h"

using namespace std;

BOOST_AUTO_TEST_SUITE(test_file)

BOOST_AUTO_TEST_CASE(transfer_test) {
	// Every transfer helper below must succeed and deliver the same
	// (whitespace-trimmed) fixture content.
	const char *const expected = "An example file";
	int status;

	// Plain and gzipped fixture, path with a leading slash.
	{
		string body = transfer::file_to_string("/test-data/example.txt", status);
		BOOST_CHECK(status == transfer::OK);
		BOOST_CHECK(text::trim(body) == expected);
	}

	{
		string body = transfer::gz_file_to_string("/test-data/example.txt.gz", status);
		BOOST_CHECK(status == transfer::OK);
		BOOST_CHECK(text::trim(body) == expected);
	}

	// Same fixtures, path without the leading slash.
	{
		string body = transfer::file_to_string("test-data/example.txt", status);
		BOOST_CHECK(status == transfer::OK);
		BOOST_CHECK(text::trim(body) == expected);
	}

	{
		string body = transfer::gz_file_to_string("test-data/example.txt.gz", status);
		BOOST_CHECK(status == transfer::OK);
		BOOST_CHECK(text::trim(body) == expected);
	}

	// Stream based variants.
	{
		stringstream sink;
		transfer::file_to_stream("/test-data/example.txt", sink, status);
		string body = sink.str();
		BOOST_CHECK(status == transfer::OK);
		BOOST_CHECK(text::trim(body) == expected);
	}

	{
		stringstream sink;
		transfer::gz_file_to_stream("/test-data/example.txt.gz", sink, status);
		string body = sink.str();
		BOOST_CHECK(status == transfer::OK);
		BOOST_CHECK(text::trim(body) == expected);
	}
}

BOOST_AUTO_TEST_CASE(handle_errors) {
	// Each transfer helper must report transfer::ERROR when asked for the
	// "/non-existing" fixture paths used below.
	int status;

	// String based variants.
	{
		string body = transfer::file_to_string("/non-existing.txt", status);
		BOOST_CHECK(status == transfer::ERROR);
	}

	{
		string body = transfer::gz_file_to_string("/non-existing.txt.gz", status);
		BOOST_CHECK(status == transfer::ERROR);
	}

	// Stream based variants.
	{
		stringstream sink;
		transfer::file_to_stream("/non-existing.txt", sink, status);
		BOOST_CHECK(status == transfer::ERROR);
	}

	{
		stringstream sink;
		transfer::gz_file_to_stream("/non-existing.txt.gz", sink, status);
		BOOST_CHECK(status == transfer::ERROR);
	}

	// The bulk download must return no entries for the missing file.
	{
		vector<string> fetched = transfer::download_gz_files_to_disk({"/non-existing.txt.gz"});
		BOOST_CHECK(fetched.empty());
	}
}

BOOST_AUTO_TEST_CASE(tsv_file_exists) {
	// Opens a remote TSV file and reads its first column into a vector.
	file::tsv_file_remote manual_paths_file("crawl-data/ALEXANDRIA-MANUAL-01/warc.paths.gz");
	vector<string> warc_paths;
	manual_paths_file.read_column_into(0, warc_paths);

	BOOST_CHECK(manual_paths_file.is_open());

	// REQUIRE (not CHECK): the next assertion indexes element 0, which would be
	// an out-of-bounds read if the file could not be fetched and the vector
	// stayed empty. REQUIRE stops this test case with an ordinary failure.
	BOOST_REQUIRE(warc_paths.size() > 0);
	BOOST_CHECK(warc_paths[0] == "crawl-data/ALEXANDRIA-MANUAL-01/files/top_domains.txt.gz");
}

BOOST_AUTO_TEST_CASE(tsv_file_dont_exists) {
	// A remote TSV file constructed for this "non-existing" path must not
	// report itself as open.
	file::tsv_file_remote missing_file("non-existing-file.gz");
	BOOST_CHECK(!missing_file.is_open());
}

BOOST_AUTO_TEST_CASE(local_tsv_files) {

	// First fixture: exact byte positions are known.
	file::tsv_file small_tsv(config::test_data_path + "tsvtest.tsv");

	BOOST_CHECK_EQUAL(small_tsv.find_first_position("aaa"), 0);
	BOOST_CHECK_EQUAL(small_tsv.find_first_position("aab"), 126);
	BOOST_CHECK_EQUAL(small_tsv.find_first_position("european"), string::npos);

	BOOST_CHECK_EQUAL(small_tsv.find_last_position("aaa"), 112);
	BOOST_CHECK_EQUAL(small_tsv.find_last_position("aab"), 126);
	BOOST_CHECK_EQUAL(small_tsv.find_last_position("european"), string::npos);

	// Second fixture: most positions are only checked to lie inside the file.
	file::tsv_file large_tsv(config::test_data_path + "tsvtest2.tsv");

	BOOST_CHECK_EQUAL(large_tsv.find_first_position("aaa"), 0);
	BOOST_CHECK(large_tsv.find_first_position("aab") > 0);
	BOOST_CHECK_EQUAL(large_tsv.find_first_position("european"), string::npos);

	BOOST_CHECK(large_tsv.find_last_position("aaa") > 0 && large_tsv.find_last_position("aaa") < large_tsv.size());
	BOOST_CHECK(large_tsv.find_last_position("aab") > 0 && large_tsv.find_last_position("aab") < large_tsv.size());
	// The last "aac" row is expected to start 115 bytes before the end of the file.
	BOOST_CHECK(large_tsv.find_last_position("aac") > 0 && large_tsv.find_last_position("aac") == large_tsv.size() - 115);
	BOOST_CHECK(large_tsv.find_last_position("european") == string::npos);

	// find_next_position(key) must land on the first row of the following key,
	// also for a key ("aabb") that the checks place between "aab" and "aac",
	// and on the file size for the last key.
	BOOST_CHECK_EQUAL(large_tsv.find_next_position("aaa"), large_tsv.find_first_position("aab"));
	BOOST_CHECK_EQUAL(large_tsv.find_next_position("aab"), large_tsv.find_first_position("aac"));
	BOOST_CHECK_EQUAL(large_tsv.find_next_position("aabb"), large_tsv.find_first_position("aac"));
	BOOST_CHECK_EQUAL(large_tsv.find_next_position("aac"), large_tsv.size());
}

BOOST_AUTO_TEST_CASE(head_content_len) {

	// Existing resource: OK status and the expected length of 8084.
	{
		int status;
		size_t length = transfer::head_content_length("http://127.0.0.1/test-data/automobileszone.com", status);
		BOOST_CHECK_EQUAL(status, transfer::OK);
		BOOST_CHECK_EQUAL(length, 8084);
	}

	// Missing resource: ERROR status and a length of zero.
	{
		int status;
		size_t length = transfer::head_content_length("http://127.0.0.1/test-data/automobileszone.com-not-here", status);
		BOOST_CHECK_EQUAL(status, transfer::ERROR);
		BOOST_CHECK_EQUAL(length, 0);
	}

}

BOOST_AUTO_TEST_CASE(test_upload) {
	// This test case currently performs no checks: the disabled code below
	// downloaded a remote fixture and uploaded it again.
	// This amazon bucket is gone.
	/*{
		int error;
		string buffer;
		transfer::url_to_string("http://alexandria-test-data.s3.amazonaws.com/multipart_test", buffer, error);
		BOOST_CHECK_EQUAL(error, transfer::OK);

		error = transfer::upload_file("multipart_test", buffer);
		BOOST_CHECK_EQUAL(error, transfer::OK);
	}*/
}

BOOST_AUTO_TEST_CASE(test_upload_gz) {
	// This test case currently performs no checks: the disabled code below
	// uploaded a remote fixture as a gz file and compared the round-tripped
	// content by size and hash.
	// This amazon bucket is gone.
	/*{
		int error;
		string buffer;
		transfer::url_to_string("http://alexandria-test-data.s3.amazonaws.com/multipart_test", buffer, error);
		BOOST_CHECK_EQUAL(error, transfer::OK);

		error = transfer::upload_gz_file("multipart_test.gz", buffer);
		BOOST_CHECK_EQUAL(error, transfer::OK);

		// Download it again as gz file and see if we get the same result.
		
		const string result_back = transfer::gz_file_to_string("multipart_test.gz", error);
		BOOST_CHECK_EQUAL(error, transfer::OK);

		BOOST_CHECK_EQUAL(result_back.size(), buffer.size());
		BOOST_CHECK_EQUAL(algorithm::hash(result_back), algorithm::hash(buffer));
	}*/
}

/*
 * Test the tsv_file::read_column_into function that is used a lot.
 * */
BOOST_AUTO_TEST_CASE(test_tsv_file) {

	// Each scope reads column 0 of tsvtest3.tsv into a vector or a set.
	// Judging by the expected values, the third argument caps the number of
	// rows read and the fourth is the number of rows skipped first
	// (confirm against tsv_file::read_column_into).
	//
	// The vector scopes use BOOST_REQUIRE for the size so that a short result
	// fails the test case instead of indexing past the end of the vector.

	{
		file::tsv_file tsv(config::test_data_path + "tsvtest3.tsv");
		vector<string> vec;
		tsv.read_column_into(0, vec, 2, 3);

		BOOST_REQUIRE(vec.size() == 2);
		BOOST_CHECK(vec[0] == "line4");
		BOOST_CHECK(vec[1] == "line5");
	}

	{
		file::tsv_file tsv(config::test_data_path + "tsvtest3.tsv");
		set<string> data;
		tsv.read_column_into(0, data, 2, 3);

		BOOST_CHECK(data.size() == 2);
		BOOST_CHECK(data.count("line4") == 1);
		BOOST_CHECK(data.count("line5") == 1);
	}

	// A cap of 100 with three rows skipped still yields only three rows.
	{
		file::tsv_file tsv(config::test_data_path + "tsvtest3.tsv");
		vector<string> vec;
		tsv.read_column_into(0, vec, 100, 3);

		BOOST_REQUIRE(vec.size() == 3);
		BOOST_CHECK(vec[0] == "line4");
		BOOST_CHECK(vec[1] == "line5");
		BOOST_CHECK(vec[2] == "line6");
	}

	{
		file::tsv_file tsv(config::test_data_path + "tsvtest3.tsv");
		set<string> data;
		tsv.read_column_into(0, data, 100, 3);

		BOOST_CHECK(data.size() == 3);
		BOOST_CHECK(data.count("line4") == 1);
		BOOST_CHECK(data.count("line5") == 1);
		BOOST_CHECK(data.count("line6") == 1);
	}

	// Nothing skipped: the first three rows are returned.
	{
		file::tsv_file tsv(config::test_data_path + "tsvtest3.tsv");
		vector<string> vec;
		tsv.read_column_into(0, vec, 3, 0);

		BOOST_REQUIRE(vec.size() == 3);
		BOOST_CHECK(vec[0] == "line1");
		BOOST_CHECK(vec[1] == "line2");
		BOOST_CHECK(vec[2] == "line3");
	}

	{
		file::tsv_file tsv(config::test_data_path + "tsvtest3.tsv");
		set<string> data;
		tsv.read_column_into(0, data, 3, 0);

		BOOST_CHECK(data.size() == 3);
		BOOST_CHECK(data.count("line1") == 1);
		BOOST_CHECK(data.count("line2") == 1);
		BOOST_CHECK(data.count("line3") == 1);
	}
}

/*
 * Test the file::archive simple tarball
 * */
BOOST_AUTO_TEST_CASE(test_archive) {

	// Fixture table: source path, path after unpacking and file content,
	// index-aligned across the three arrays.
	const char *const source_paths[] = {"test_dir1/file1.txt", "test_dir1/file2.txt", "test_dir1/file3.txt"};
	const char *const unpacked_paths[] = {"test_dir2/file1.txt", "test_dir2/file2.txt", "test_dir2/file3.txt"};
	const char *const contents[] = {"hello world 1", "hello world 2", "hello world 3"};
	const size_t file_count = 3;

	// Pack: create the three files in test_dir1 and read the directory into
	// the archive.
	{
		file::archive packer("test_dir.tar");

		file::create_directory("test_dir1");

		for (size_t i = 0; i < file_count; i++) {
			std::ofstream out(source_paths[i]);
			out << contents[i];
			out.close();
		}

		packer.read_dir("test_dir1");
	}

	// Unpack into test_dir2 and compare every file with its original content.
	{
		file::archive unpacker("test_dir.tar");

		file::create_directory("test_dir2");
		unpacker.untar("test_dir2");

		for (size_t i = 0; i < file_count; i++) {
			BOOST_CHECK_EQUAL(file::cat(unpacked_paths[i]), contents[i]);
		}

	}

	// Remove everything the test created.
	file::delete_directory("test_dir1");
	file::delete_directory("test_dir2");
	file::delete_file("test_dir.tar");
}

BOOST_AUTO_TEST_CASE(test_archive2) {

	// Round-trips 500 files of growing size through file::archive:
	// file<i>.txt holds i lines, "hello world 0" .. "hello world <i-1>".

	{
		file::archive tar("test_dir.tar");

		file::create_directory("test_dir1");

		// Create 500 files.
		for (size_t i = 1; i <= 500; i++) {
			std::ofstream file1("test_dir1/file" + std::to_string(i) + ".txt");
			for (size_t j = 0; j < i; j++) {
				// '\n' rather than std::endl: the bytes written are the same,
				// but the stream is not flushed after every line. The data is
				// flushed when file1 is destroyed at the end of the iteration,
				// which is before read_dir runs.
				file1 << "hello world " << j << '\n';
			}
		}

		tar.read_dir("test_dir1");
	}

	{
		file::archive tar("test_dir.tar");

		file::create_directory("test_dir2");
		tar.untar("test_dir2");

		// Check 500 files.
		for (size_t i = 1; i <= 500; i++) {
			std::ifstream file1("test_dir2/file" + std::to_string(i) + ".txt");
			std::string line;
			size_t j = 0;
			while (std::getline(file1, line)) {
				BOOST_CHECK_EQUAL(line, "hello world " + std::to_string(j));
				j++;
			}
			// The line count must equal the file number. This also fails when
			// the file is missing, because then no line is read at all.
			BOOST_CHECK_EQUAL(j, i);
		}

	}

	// Remove everything the test created.
	file::delete_directory("test_dir1");
	file::delete_directory("test_dir2");
	file::delete_file("test_dir.tar");
}

BOOST_AUTO_TEST_CASE(test_rename_file) {
	// Scratch locations used by this test.
	const char *const scratch_root = "/tmp/alexandria_test_98237593257";
	const char *const original_dir = "/tmp/alexandria_test_98237593257/testdir";
	const char *const renamed_dir = "/tmp/alexandria_test_98237593257/testdir2";

	// Rename a sub directory: only the new name exists afterwards.
	file::create_directory(scratch_root);
	file::create_directory(original_dir);
	file::rename(original_dir, renamed_dir);
	BOOST_CHECK(file::file_exists(renamed_dir));
	BOOST_CHECK(!file::file_exists(original_dir));

	// Deleting the root removes it together with its content.
	file::delete_directory(scratch_root);
	BOOST_CHECK(!file::file_exists(original_dir));
	BOOST_CHECK(!file::file_exists(renamed_dir));
	BOOST_CHECK(!file::file_exists(scratch_root));

	// Creating the nested directory while the root is absent must also create
	// the root.
	file::create_directory(original_dir);
	BOOST_CHECK(file::file_exists(scratch_root));
	file::delete_directory(scratch_root);
	BOOST_CHECK(!file::file_exists(scratch_root));
}

BOOST_AUTO_TEST_SUITE_END()


================================================
FILE: tests/test_hash.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <boost/test/unit_test.hpp>
#include "algorithm/hash.h"

BOOST_AUTO_TEST_SUITE(hash)

BOOST_AUTO_TEST_CASE(str) {

	// Pinned hash values: input text and the exact 64 bit value expected for it.
	struct pinned_hash {
		const char *input;
		unsigned long long expected;
	};

	const pinned_hash cases[] = {
		{"testing", 4540905123118180926ull},
		{"", 6142509188972423790ull},
		{"abcdefghijklmnopqrstuvxyz", 17219978627035894604ull},
		{"123", 10089081994332581363ull},
		{"1234", 15651099383784684535ull},
	};

	for (const pinned_hash &item : cases) {
		BOOST_CHECK_EQUAL(algorithm::hash(item.input), item.expected);
	}

}

BOOST_AUTO_TEST_SUITE_END()


================================================
FILE: tests/test_hash_table.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <boost/test/unit_test.hpp>

#include "hash_table2/hash_table.h"
#include "hash_table2/builder.h"
#include "hash_table_helper/hash_table_helper.h"
#include "indexer/merger.h"

#include <set>

BOOST_AUTO_TEST_SUITE(test_hash_table)

BOOST_AUTO_TEST_CASE(test_file_paths) {

	// Builder created with only an index name and shard id 8: the data file names are
	// expected below ./0/hash_table.
	{
		hash_table2::hash_table_shard_builder default_builder("test_index", 8);

		BOOST_CHECK_EQUAL(default_builder.file_base_data(), "./0/hash_table/ht_test_index_8");
		BOOST_CHECK_EQUAL(default_builder.filename_data(), "./0/hash_table/ht_test_index_8.data");
	}

	// Builder created with two extra arguments (1000 and "/data_path"): the data file names
	// are expected below /data_path while the .pos file name is still expected below
	// ./0/hash_table.
	{
		hash_table2::hash_table_shard_builder custom_builder("test_index", 8, 1000, "/data_path");

		BOOST_CHECK_EQUAL(custom_builder.file_base_data(), "/data_path/ht_test_index_8");
		BOOST_CHECK_EQUAL(custom_builder.filename_data(), "/data_path/ht_test_index_8.data");
		BOOST_CHECK_EQUAL(custom_builder.filename_pos(), "./0/hash_table/ht_test_index_8.pos");
	}

}

BOOST_AUTO_TEST_CASE(single_shard_add) {

	hash_table_helper::truncate("test_index");

	// Write a single record into shard 0.
	{
		hash_table2::hash_table_shard_builder shard_writer("test_index", 0);

		shard_writer.truncate();

		shard_writer.add(123, "hello world");
		shard_writer.append();
		shard_writer.merge();
	}

	// Read the shard back: only the key that was added may be present.
	{
		hash_table2::hash_table_shard shard_reader("test_index", 0);

		BOOST_CHECK(shard_reader.has(123));
		BOOST_CHECK(!shard_reader.has(1234));
		BOOST_CHECK_EQUAL(shard_reader.find(123), "hello world");
	}

}

BOOST_AUTO_TEST_CASE(single_shard_add_versioned) {

	// The same key is written several times with a different third argument (5, 6, 4, then
	// 3 and 7 in one batch). The lookup below expects the value that was added with 7.
	{
		hash_table2::hash_table_shard_builder shard_writer("test_index", 0);

		// Persist everything added so far: append followed by merge.
		const auto flush = [&shard_writer]() {
			shard_writer.append();
			shard_writer.merge();
		};

		shard_writer.truncate();

		shard_writer.add(123, "hello world", 5);
		flush();
		shard_writer.add(123, "new value", 6);
		flush();
		shard_writer.add(123, "old value", 4);
		flush();

		// Two entries for the key inside the same batch.
		shard_writer.add(123, "old value 2", 3);
		shard_writer.add(123, "newest value", 7);
		flush();
	}

	{
		hash_table2::hash_table_shard shard_reader("test_index", 0);

		BOOST_CHECK_EQUAL(shard_reader.find(123), "newest value");
	}

}

BOOST_AUTO_TEST_CASE(single_shard_add_versioned2) {

	// Same idea as the versioned test above the shard level, but with an optimize() after
	// every merge. The value added with the largest third argument (1001) is expected to
	// be the one returned.
	{
		hash_table2::hash_table_shard_builder shard_writer("test_index", 0);

		// Persist and compact: append, merge, optimize.
		const auto commit = [&shard_writer]() {
			shard_writer.append();
			shard_writer.merge();
			shard_writer.optimize();
		};

		shard_writer.truncate();

		shard_writer.add(101, "an old value", 1000);
		commit();
		shard_writer.add(101, "another old value", 1000);
		commit();
		shard_writer.add(101, "a new value", 1001);
		commit();
		shard_writer.add(101, "an older value", 999);
		commit();
	}

	{
		hash_table2::hash_table_shard shard_reader("test_index", 0);

		BOOST_CHECK_EQUAL(shard_reader.find(101), "a new value");
	}

}

BOOST_AUTO_TEST_CASE(add_to_hash_table) {

	hash_table_helper::truncate("test_index");

	// Payload stored for (and expected back from) a given key.
	const auto payload_for = [](size_t id) {
		return "Random test data with id: " + std::to_string(id);
	};

	// First round: keys 0..999 written through a 43-shard builder.
	{
		hash_table2::builder writer("test_index", 43);

		writer.truncate();

		for (size_t key = 0; key < 1000; ++key) {
			writer.add(key, payload_for(key));
		}

		writer.merge();
	}

	{
		hash_table2::hash_table reader("test_index", 43);

		for (size_t key = 0; key < 1000; ++key) {
			BOOST_CHECK_EQUAL(reader.find(key), payload_for(key));
		}
	}

	// Second round: the table is truncated again and keys 1000..1999 are written.
	{
		hash_table2::builder writer("test_index", 43);

		writer.truncate();

		for (size_t key = 1000; key < 2000; ++key) {
			writer.add(key, payload_for(key));
		}

		writer.merge();
	}

	{
		hash_table2::hash_table reader("test_index", 43);

		for (size_t key = 1000; key < 2000; ++key) {
			BOOST_CHECK_EQUAL(reader.find(key), payload_for(key));
		}
	}

}

BOOST_AUTO_TEST_CASE(add_to_hash_table_reverse) {

	hash_table_helper::truncate("test_index");

	// Payload stored for a given key.
	const auto payload_for = [](size_t id) {
		return "Random test data with id: " + std::to_string(id);
	};

	// Start with the upper key range: 100000 elements with keys 100000..199999.
	{
		hash_table2::builder writer("test_index", 17);

		writer.truncate();

		for (size_t key = 100000; key < 200000; ++key) {
			writer.add(key, payload_for(key));
		}

		writer.merge();
	}

	{
		hash_table2::hash_table reader("test_index", 17);

		BOOST_CHECK_EQUAL(reader.size(), 100000);
	}

	// Then add the lower key range without truncating: another 100000 elements with
	// keys 0..99999.
	{
		hash_table2::builder writer("test_index", 17);

		for (size_t key = 0; key < 100000; ++key) {
			writer.add(key, payload_for(key));
		}

		writer.merge();
	}

	// Both ranges together must be present.
	{
		hash_table2::hash_table reader("test_index", 17);

		BOOST_CHECK_EQUAL(reader.size(), 200000);
	}

}

BOOST_AUTO_TEST_CASE(optimize) {

	hash_table_helper::truncate("test_index");

	// Element count and file size of the shard after the first (v1) write; used as the
	// reference for the state after overwriting and optimizing.
	size_t baseline_size = 0;
	size_t baseline_file_size = 0;

	{
		hash_table2::hash_table_shard_builder shard_writer("test_index", 0);

		shard_writer.add(1, "data element 1 v1");
		shard_writer.add(2, "data element 2 v1");
		shard_writer.add(3, "data element 3 v1");

		shard_writer.append();
		shard_writer.merge();

		hash_table2::hash_table_shard shard_reader("test_index", 0);
		baseline_size = shard_reader.size();
		baseline_file_size = shard_reader.file_size();
	}

	{
		// Write the same three keys again with v2 payloads (all payloads have equal length).
		hash_table2::hash_table_shard_builder shard_writer("test_index", 0);

		shard_writer.add(1, "data element 1 v2");
		shard_writer.add(2, "data element 2 v2");
		shard_writer.add(3, "data element 3 v2");

		shard_writer.append();
		shard_writer.merge();

		shard_writer.optimize();

		hash_table2::hash_table_shard shard_reader("test_index", 0);

		// After optimize() the shard must be exactly as large as after the first write...
		BOOST_CHECK_EQUAL(shard_reader.size(), baseline_size);
		BOOST_CHECK_EQUAL(shard_reader.file_size(), baseline_file_size);

		// ...and only the v2 payloads must be visible.
		BOOST_CHECK_EQUAL(shard_reader.find(1), "data element 1 v2");
		BOOST_CHECK_EQUAL(shard_reader.find(2), "data element 2 v2");
		BOOST_CHECK_EQUAL(shard_reader.find(3), "data element 3 v2");
	}
}

BOOST_AUTO_TEST_CASE(optimize_empty) {

	// No assertions: the test only requires that optimize() on a freshly truncated
	// shard runs to completion.
	hash_table_helper::truncate("main_index");

	hash_table2::hash_table_shard_builder empty_shard("main_index", 0);
	empty_shard.optimize();

}

BOOST_AUTO_TEST_CASE(conditional) {

	hash_table_helper::truncate("main_index");

	// Four writes to the same key inside a single merge; the third argument differs
	// (1000, 1000, 1001, 999). The entry added with 1001 is expected to win.
	{
		hash_table2::builder writer("main_index", 10);

		writer.truncate();

		writer.add(101, "an old value", 1000);
		writer.add(101, "another old value", 1000);
		writer.add(101, "a new value", 1001);
		writer.add(101, "an older value", 999);

		writer.merge();
	}

	{
		hash_table2::hash_table reader("main_index", 10);

		const std::string stored = reader.find(101);

		BOOST_CHECK_EQUAL(stored, "a new value");
	}

}

BOOST_AUTO_TEST_CASE(conditional2) {

	hash_table_helper::truncate("main_index");

	// Same writes as in the single-merge variant, but with a merge after every add.
	// The outcome must not depend on how the writes are batched.
	{
		hash_table2::builder writer("main_index", 10);

		writer.truncate();

		writer.add(101, "an old value", 1000);
		writer.merge();

		writer.add(101, "another old value", 1000);
		writer.merge();

		writer.add(101, "a new value", 1001);
		writer.merge();

		writer.add(101, "an older value", 999);
		writer.merge();
	}

	{
		hash_table2::hash_table reader("main_index", 10);

		const std::string stored = reader.find(101);

		BOOST_CHECK_EQUAL(stored, "a new value");
	}

}

BOOST_AUTO_TEST_CASE(more_tests) {

	hash_table_helper::truncate("main_index");

	// Several keys, each written three times with an increasing third argument, plus one
	// key (50) written once without it. Everything goes into one merge.
	{
		hash_table2::builder writer("main_index", 10);

		writer.truncate();

		writer.add(101, "first value", 1000);
		writer.add(101, "second value", 1001);
		writer.add(101, "third value", 1002);

		writer.add(102, "first value", 1000);
		writer.add(102, "second value", 1001);
		writer.add(102, "third value", 1002);

		// Widely spread values, including one that does not fit into 32 bits.
		writer.add(103, "first value", 1);
		writer.add(103, "second value", 100000);
		writer.add(103, "third value", 99999999999);

		writer.add(50, "third value");

		writer.merge();
	}

	// Every key must resolve to its "third value" entry.
	{
		hash_table2::hash_table reader("main_index", 10);

		BOOST_CHECK_EQUAL(reader.find(101), "third value");
		BOOST_CHECK_EQUAL(reader.find(102), "third value");
		BOOST_CHECK_EQUAL(reader.find(103), "third value");
		BOOST_CHECK_EQUAL(reader.find(50), "third value");
	}

}

BOOST_AUTO_TEST_CASE(for_each) {

	hash_table_helper::truncate("main_index");

	// Four keys are written with interleaved merges so that older entries for the same key
	// end up in earlier merges. A final optimize() follows the last merge.
	{
		hash_table2::builder writer("main_index", 10);

		writer.truncate();

		writer.add(101, "first value", 1000);
		writer.merge();
		writer.add(101, "second value", 1001);
		writer.merge();
		writer.add(101, "third value", 1002);

		writer.add(102, "first value", 1000);
		writer.merge();
		writer.add(102, "second value", 1001);
		writer.merge();
		writer.add(102, "third value", 1002);

		// For key 103 the entry with the largest third argument is added first.
		writer.add(103, "third value", 99999999999);
		writer.add(103, "first value", 1);
		writer.merge();
		writer.add(103, "second value", 100000);
		writer.merge();

		writer.add(50, "third value");

		writer.merge();
		writer.optimize();
	}

	{
		hash_table2::hash_table reader("main_index", 10);

		BOOST_CHECK_EQUAL(reader.find(101), "third value");
		BOOST_CHECK_EQUAL(reader.find(102), "third value");
		BOOST_CHECK_EQUAL(reader.find(103), "third value");
		BOOST_CHECK_EQUAL(reader.find(50), "third value");

		// Collect the distinct keys and values that for_each() visits.
		std::set<uint64_t> seen_keys;
		std::set<std::string> seen_values;
		reader.for_each([&seen_keys, &seen_values](uint64_t key, const std::string &val) {
			seen_keys.insert(key);
			seen_values.insert(val);
		});

		// Four distinct keys, all carrying the same single distinct value.
		BOOST_CHECK_EQUAL(seen_keys.size(), 4);
		BOOST_CHECK_EQUAL(seen_values.size(), 1);

		for (const auto &stored : seen_values) {
			BOOST_CHECK_EQUAL(stored, "third value");
		}
	}

}

BOOST_AUTO_TEST_CASE(larger_test) {

	// Round 1: with the merge thread running, write keys 1000..9999 twice. First without
	// a third argument ('x' payloads), then with 1 ('y' payloads). The payload of a key
	// is `key` repetitions of one character.
	{
		indexer::merger::start_merge_thread();

		hash_table2::builder writer("main_index", 10);

		writer.truncate();

		for (size_t id = 1000; id < 10000; ++id) {
			writer.add(id, std::string(id, 'x'));
		}

		for (size_t id = 1000; id < 10000; ++id) {
			writer.add(id, std::string(id, 'y'), 1);
		}

		indexer::merger::stop_merge_thread();
	}

	// Round 2: 'z' payloads added with 2.
	{
		indexer::merger::start_merge_thread();

		hash_table2::builder writer("main_index", 10);

		for (size_t id = 1000; id < 10000; ++id) {
			writer.add(id, std::string(id, 'z'), 2);
		}

		indexer::merger::stop_merge_thread();
	}

	// Round 3: 'a' payloads, again added with 2. The checks below expect these to
	// replace the 'z' payloads.
	{
		indexer::merger::start_merge_thread();

		hash_table2::builder writer("main_index", 10);

		for (size_t id = 1000; id < 10000; ++id) {
			writer.add(id, std::string(id, 'a'), 2);
		}

		indexer::merger::stop_merge_thread();
	}

	{
		hash_table2::builder writer("main_index", 10);
		writer.optimize();
	}

	{
		hash_table2::hash_table reader("main_index", 10);

		for (size_t id = 1000; id < 10000; ++id) {
			BOOST_REQUIRE_EQUAL(reader.find(id), std::string(id, 'a'));
		}

		// Gather every value for_each() reports per key.
		std::map<uint64_t, std::vector<std::string>> values_by_key;
		reader.for_each([&values_by_key](uint64_t key, const std::string &val) {
			values_by_key[key].push_back(val);
		});

		// Each key must be reported exactly once, with the 'a' payload.
		for (const auto &entry : values_by_key) {
			BOOST_REQUIRE_EQUAL(entry.second.size(), 1);
			BOOST_REQUIRE_EQUAL(entry.second[0], std::string(entry.first, 'a'));
		}
	}

}

BOOST_AUTO_TEST_CASE(merge_with) {

	// Table A ("main_index"): keys 123, 1230, 1231 overlap with table B, the rest do not.
	{
		hash_table2::builder table_a("main_index", 11);

		table_a.truncate();

		table_a.add(123, "a1", 10);
		table_a.add(1230, "a2", 10);
		table_a.add(1231, "a3", 10);
		table_a.add(1231, "a3_n2", 11);

		table_a.add(3828540, "a4", 10);
		table_a.add(2234645, "a5", 10);
		table_a.add(8424878, "a6", 10);
		table_a.add(4174861, "a7", 10);
		table_a.add(7013344, "a8", 10);

		table_a.merge();
	}

	// Table B ("main_index2"): for the overlapping keys the third argument is larger than
	// A's for 123 and 1230, and smaller than A's for 1231.
	{
		hash_table2::builder table_b("main_index2", 11);

		table_b.truncate();

		table_b.add(123, "b1", 11);
		table_b.add(1230, "b2", 12);
		table_b.add(1231, "b3", 9);
		table_b.add(1231, "b3", 8);

		table_b.add(8321508, "b4", 10);
		table_b.add(7309646, "b5", 10);
		table_b.add(2809224, "b6", 10);
		table_b.add(6543485, "b7", 10);
		table_b.add(6078858, "b8", 10);

		table_b.merge();
	}

	// Merge B into A.
	{
		hash_table2::builder target("main_index", 11);
		hash_table2::builder source("main_index2", 11);

		target.merge_with(source);
	}

	// Overlapping keys resolve to the entry with the larger third argument; keys present
	// in only one table are found in the merged table.
	{
		hash_table2::hash_table merged("main_index", 11);

		BOOST_CHECK_EQUAL(merged.find(123), "b1");
		BOOST_CHECK_EQUAL(merged.find(1230), "b2");
		BOOST_CHECK_EQUAL(merged.find(1231), "a3_n2");
		BOOST_CHECK_EQUAL(merged.find(6543485), "b7");
		BOOST_CHECK_EQUAL(merged.find(2234645), "a5");
	}

}

BOOST_AUTO_TEST_CASE(merge_with_files) {

	// Table A ("main_index"), single shard.
	{
		hash_table2::builder table_a("main_index", 1);

		table_a.truncate();

		table_a.add(123, "a1", 10);
		table_a.add(1230, "a2", 10);
		table_a.add(1231, "a3", 10);
		table_a.add(1231, "a3_n2", 11);

		table_a.add(3828540, "a4", 10);
		table_a.add(2234645, "a5", 10);
		table_a.add(8424878, "a6", 10);
		table_a.add(4174861, "a7", 10);
		table_a.add(7013344, "a8", 10);

		table_a.merge();
	}

	// Table B ("main_index2"), single shard.
	{
		hash_table2::builder table_b("main_index2", 1);

		table_b.truncate();

		table_b.add(123, "b1", 11);
		table_b.add(1230, "b2", 12);
		table_b.add(1231, "b3", 9);
		table_b.add(1231, "b3", 8);

		table_b.add(8321508, "b4", 10);
		table_b.add(7309646, "b5", 10);
		table_b.add(2809224, "b6", 10);
		table_b.add(6543485, "b7", 10);
		table_b.add(6078858, "b8", 10);

		table_b.merge();
	}

	// Merge A's shard-0 files into B's shard 0, addressing A by file name instead of
	// through a builder object.
	{
		hash_table2::builder target("main_index2", 1);

		target.get_shard(0)->merge_with("./0/hash_table/ht_main_index_0.pos", "./0/hash_table/ht_main_index_0.data");
	}

	// Same expectations as for the builder-to-builder merge, this time read from B.
	{
		hash_table2::hash_table merged("main_index2", 1);

		BOOST_CHECK_EQUAL(merged.find(123), "b1");
		BOOST_CHECK_EQUAL(merged.find(1230), "b2");
		BOOST_CHECK_EQUAL(merged.find(1231), "a3_n2");
		BOOST_CHECK_EQUAL(merged.find(6543485), "b7");
		BOOST_CHECK_EQUAL(merged.find(2234645), "a5");
	}

}

BOOST_AUTO_TEST_CASE(remove_record) {

	// Store three records.
	{
		hash_table2::builder writer("main_index", 1);

		writer.truncate();

		writer.add(10000, "data1", 10);
		writer.add(10001, "data2", 10);
		writer.add(10002, "data3", 10);

		writer.merge();
	}

	// All three are readable.
	{
		hash_table2::hash_table reader("main_index", 1);

		BOOST_CHECK_EQUAL(reader.find(10000), "data1");
		BOOST_CHECK_EQUAL(reader.find(10001), "data2");
		BOOST_CHECK_EQUAL(reader.find(10002), "data3");
	}

	// Remove the middle record and merge.
	{
		hash_table2::builder writer("main_index", 1);

		writer.remove(10001);

		writer.merge();
	}

	// The removed key now yields an empty string; its neighbours are untouched.
	{
		hash_table2::hash_table reader("main_index", 1);

		BOOST_CHECK_EQUAL(reader.find(10000), "data1");
		BOOST_CHECK_EQUAL(reader.find(10001), "");
		BOOST_CHECK_EQUAL(reader.find(10002), "data3");
	}

}

BOOST_AUTO_TEST_CASE(remove_record2) {

	// "main_index" holds three records.
	{
		hash_table2::builder ht("main_index", 1);

		ht.truncate();

		ht.add(10000, "data1", 10);
		ht.add(10001, "data2", 10);
		ht.add(10002, "data3", 10);

		ht.merge();
	}

	// "main_index2" is the reference table: the same records minus key 10001, which was
	// never added here.
	{
		hash_table2::builder ht("main_index2", 1);

		ht.truncate();

		ht.add(10000, "data1", 10);
		ht.add(10002, "data3", 10);

		ht.merge();
	}

	{
		hash_table2::hash_table ht("main_index", 1);

		BOOST_CHECK_EQUAL(ht.find(10000), "data1");
		BOOST_CHECK_EQUAL(ht.find(10001), "data2");
		BOOST_CHECK_EQUAL(ht.find(10002), "data3");
	}

	{
		hash_table2::hash_table ht("main_index2", 1);

		BOOST_CHECK_EQUAL(ht.find(10000), "data1");
		BOOST_CHECK_EQUAL(ht.find(10002), "data3");
	}

	// Remove key 10001 from "main_index" so that both tables hold the same records.
	{
		hash_table2::builder ht("main_index", 1);

		ht.remove(10001);

		ht.merge();
	}

	// After the removal "main_index" must occupy exactly as much space as the reference
	// table. The second table has to be "main_index2": opening "main_index" twice would
	// compare the table with itself and the check could never fail.
	{
		hash_table2::hash_table ht1("main_index", 1);
		hash_table2::hash_table ht2("main_index2", 1);

		size_t total_size1 = 0;
		ht1.for_each_shard([&total_size1](auto shard) {
			total_size1 += shard->file_size();
		});

		size_t total_size2 = 0;
		ht2.for_each_shard([&total_size2](auto shard) {
			total_size2 += shard->file_size();
		});

		BOOST_CHECK_EQUAL(total_size1, total_size2);
	}

}

BOOST_AUTO_TEST_CASE(for_each_key) {

	// Three records with the consecutive keys 100, 101 and 102.
	{
		hash_table2::builder writer("main_index", 1);

		writer.truncate();

		writer.add(100, "data1");
		writer.add(101, "other data");
		writer.add(102, "data3");

		writer.merge();
	}

	// for_each_key() must visit exactly three keys, each one of those stored above.
	{
		hash_table2::hash_table reader("main_index", 1);

		int visited = 0;
		reader.for_each_key([&visited](uint64_t key) {
			BOOST_CHECK(key >= 100 && key <= 102);
			++visited;
		});

		BOOST_CHECK_EQUAL(visited, 3);
	}

}

BOOST_AUTO_TEST_SUITE_END()


================================================
FILE: tests/test_html_parser.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <boost/test/unit_test.hpp>
#include "parser/html_parser.h"
#include "text/text.h"
#include "file/file.h"

using namespace std;

BOOST_AUTO_TEST_SUITE(html_parser)

BOOST_AUTO_TEST_CASE(html_parse1) {
	// One parser instance is reused for all documents below.
	parser::html_parser doc;

	// Title only.
	doc.parse("<title>test1</title>");
	BOOST_CHECK_EQUAL(doc.title(), "test1");

	// Title followed by a heading.
	doc.parse("<title>test1</title><h1>test2</h1>");
	BOOST_CHECK_EQUAL(doc.h1(), "test2");

	// No title and an empty heading: both accessors must come back empty, so nothing
	// from the previous document may be left over.
	doc.parse("he oisjdf osdjfo idjsofi djsof<h1></h1>");
	BOOST_CHECK_EQUAL(doc.title(), "");
	BOOST_CHECK_EQUAL(doc.h1(), "");

	// Meta description: the expected text lacks the trailing period of the attribute value.
	doc.parse("<html><title>test1</title><meta name=\"description\" content=\"Recensioner av Vår vid sommen och andra böcker.\"></html>");
	BOOST_CHECK_EQUAL(doc.meta(), "Recensioner av Vår vid sommen och andra böcker");

	// Meta description of a stored test page.
	doc.parse(file::read_test_file("test1.html"));
	BOOST_CHECK_EQUAL(doc.meta(), "Pris: 199 kr. Inbunden, 2021. Finns i lager. Köp Sammetsdiktaturen : motstånd och medlöpare i dagens Ryssland av Anna-Lena Laurén på Bokus.com. Boken har 3 st läsarrecensioner");

	// Heading text wrapped in a nested element.
	doc.parse("<title>test1</title><h1><span>Hej Hopp</span></h1>");
	BOOST_CHECK_EQUAL(doc.h1(), "Hej Hopp");

	// Body text: title and heading are not part of the expected text.
	doc.parse("<html><title>test1</title><h1>test2</h1> lite text efter</html>");
	BOOST_CHECK_EQUAL(doc.text(), "lite text efter");
}

BOOST_AUTO_TEST_CASE(html_parse2) {
	parser::html_parser doc;

	// Only the first 50 bytes of the extracted text are compared with the first 50 bytes
	// of the reference sentence.
	const string expected_start =
		string("Nya lån 2021 Nya lån 2020 Nya lån 2019 Nya lån 2018 Nya lån 2017 Nya lån 2016 Uppdaterad 2021-10-01.").substr(0, 50);

	doc.parse(file::read_test_file("test5.html"));
	BOOST_CHECK_EQUAL(doc.text().substr(0, 50), expected_start);

	// No assertion for this page: parsing it merely has to complete.
	doc.parse(file::read_test_file("test6.html"));
}

BOOST_AUTO_TEST_CASE(html_parse3) {
	parser::html_parser doc;

	doc.parse(file::read_test_file("test7.html"));

	// Compare the first 20 bytes of the extracted text only.
	const string text_start = doc.text().substr(0, 20);
	BOOST_CHECK_EQUAL(text_start, "Add to wishlist Adde");

}

BOOST_AUTO_TEST_CASE(html_parse4) {
	parser::html_parser doc;

	doc.parse(file::read_test_file("test8.html"));

	// Compare the first 107 bytes of the extracted text only.
	const string text_start = doc.text().substr(0, 107);
	BOOST_CHECK_EQUAL(text_start, "Hacker News new | past | comments | ask | show | jobs | submit login 1. Apple Broke Up with Me ( merecivili");

}

BOOST_AUTO_TEST_CASE(html_parse5) {
	parser::html_parser doc;

	doc.parse(file::read_test_file("test10.html"));

	// For this page a title is expected, while meta description and h1 are expected
	// to be empty.
	BOOST_CHECK_EQUAL(doc.meta(), "");
	BOOST_CHECK_EQUAL(doc.title(), "Association for Progressive Communications | Internet for social justice and sustainable development");
	BOOST_CHECK_EQUAL(doc.h1(), "");

}

BOOST_AUTO_TEST_CASE(html_parse6) {
	parser::html_parser doc;

	doc.parse(file::read_test_file("test11.html"));

	// Meta description and title of the stored page, both containing non-ASCII characters.
	BOOST_CHECK_EQUAL(doc.meta(), "Svenska Dagbladet står för seriös och faktabaserad kvalitetsjournalistik som utmanar, ifrågasätter och inspirerar");
	BOOST_CHECK_EQUAL(doc.title(), "SvD | Sveriges kvalitetssajt för nyheter");

}

BOOST_AUTO_TEST_CASE(html_parse7) {
	parser::html_parser doc;

	doc.parse(file::read_test_file("test12.html"));

	// A long, multi-sentence meta description (containing a typographic apostrophe) and
	// the page title.
	BOOST_CHECK_EQUAL(doc.meta(), "The systematic thinking in our industry is that settings are the result of design failure. As designers, our goal is to create product experiences that don’t require any adjustment by the user. So offering customization options is often seen as a failure to make firm product decisions. I think there is a misunderstanding about what settings really are");
	BOOST_CHECK_EQUAL(doc.title(), "Settings are not a design failure");

}

BOOST_AUTO_TEST_CASE(html_parse_links) {

	// A single parser instance is reused for every page below.
	parser::html_parser page;
	string html;

	// True when the most recently parsed page exposes a link with exactly this target host,
	// target path and link text.
	const auto has_link = [&page](const string &host, const string &path, const string &link_text) {
		const vector<parser::html_link> links = page.links();
		for (const auto &link : links) {
			if (link.target_host() == host && link.target_path() == path && link.text() == link_text) {
				return true;
			}
		}
		return false;
	};

	// --- Pages parsed without a URL -----------------------------------------------------

	page.parse(file::read_test_file("test2.html"));
	BOOST_CHECK_EQUAL(page.title(), "Resebyrån Främmande Världar - L. D. Lapinski - inbunden (9789178937943) | Adlibris Bokhandel");
	BOOST_CHECK_EQUAL(page.meta(), "inbunden, 2021. Köp boken Resebyrån Främmande Världar av L. D. Lapinski (ISBN 9789178937943) hos Adlibris. Fraktfritt över 229 kr Alltid bra priser och snabb leverans. | Adlibris");
	BOOST_CHECK_EQUAL(page.h1(), "Resebyrån Främmande Världar - inbunden, Svenska, 2021");

	BOOST_CHECK_EQUAL(page.text(), "");
	BOOST_CHECK(page.should_insert());

	page.parse(file::read_test_file("test4.html"));
	BOOST_CHECK_EQUAL(page.title(), "Corona – samlad information för privatpersoner | Skatteverket");
	BOOST_CHECK_EQUAL(page.h1(), "Corona – information för privatpersoner");
	BOOST_CHECK_EQUAL(page.meta(), "Här har vi samlat information för privatpersoner som påverkas av corona på olika sätt");
	BOOST_CHECK(page.should_insert());

	page.parse(file::read_test_file("stackoverflow.html"));
	BOOST_CHECK_EQUAL(page.title(), "node.js - How to use Async and Await with AWS SDK Javascript - Stack Overflow");
	BOOST_CHECK_EQUAL(page.h1(), "How to use Async and Await with AWS SDK Javascript");
	BOOST_CHECK_EQUAL(page.meta(), "I am working with the AWS SDK using the KMS libary. I would like to use async and await instead of callbacks. import AWS, { KMS } from \"aws-sdk\"; this.kms = new AWS.KMS(); const key = await this");
	BOOST_CHECK(page.should_insert());

	// --- Pages parsed together with their URL; one known link is looked up on each -------

	html = file::read_test_file("hallakonsument.html");
	page.parse(html, "https://www.hallakonsument.se/konsumentratt-kopsatt/innan-du-tar-ett-lan/");
	BOOST_CHECK_EQUAL(page.title(), "Innan du tar ett lån | Hallå konsument – Konsumentverket");
	BOOST_CHECK_EQUAL(page.h1(), "Innan du tar ett lån");
	BOOST_CHECK_EQUAL(page.meta(), "Om du har ett behov av att låna pengar är det viktigt att läsa på om vilken typ av lån som passar dig. Prata med flera banker, jämför villkoren och kostnaderna för olika lån");
	BOOST_CHECK(page.should_insert());

	BOOST_CHECK(has_link(
		"konsumenternas.se",
		"/lan--betalningar/lan/sa-fungerar-ett-lan/forhandsinformation/",
		"Läs mer om förhandsinformation på webbplatsen konsumenternas.se"));

	html = file::read_test_file("konsumenternas.html");
	page.parse(html, "https://www.konsumenternas.se/lan--betalningar/lan/");
	BOOST_CHECK_EQUAL(page.title(), "Lån");
	BOOST_CHECK_EQUAL(page.h1(), "Lån");
	BOOST_CHECK_EQUAL(page.meta(), "Att låna pengar kan vara ett sätt att finansiera något som du behöver eller gärna vill köpa, men inte har råd att betala direkt. Men ett lån kostar pengar i form av avgifter och räntor");
	BOOST_CHECK(page.should_insert());

	BOOST_CHECK(has_link(
		"konsumenternas.us17.list-manage.com",
		"/subscribe?u=a63ab96c95e9b06c9a857d5f9&id=132436ec8d",
		"Nyhetsbrev"));

	html = file::read_test_file("sbab.html");
	page.parse(html, "https://www.sbab.se/1/privat/lana/privatlan/privatlan_-_sa_funkar_det.html#/berakna_manadskostnad");
	BOOST_CHECK_EQUAL(page.title(), "Privatlån - låna pengar till bra ränta - SBAB");
	BOOST_CHECK_EQUAL(page.h1(), "Privatlån – låna pengar till bra ränta");
	BOOST_CHECK_EQUAL(page.meta(), "Ansök om ett privatlån mellan 30 000 och 500 000 kronor. Låna pengar utan säkerhet. Ansök och få besked direkt");
	BOOST_CHECK(page.should_insert());

	BOOST_CHECK(has_link(
		"sbab.kundo.se",
		"/org/sbab/",
		"Kundforum"));

	html = file::read_test_file("kronofogden.html");
	page.parse(html, "https://www.kronofogden.se/82374.html");
	BOOST_CHECK_EQUAL(page.title(), "Fem tips om ekonomin förändras | Kronofogden");
	BOOST_CHECK_EQUAL(page.h1(), "Fem tips om ekonomin förändras");
	BOOST_CHECK_EQUAL(page.meta(), "");
	BOOST_CHECK(page.should_insert());

	BOOST_CHECK(has_link(
		"hallakonsument.se",
		"/",
		"Välkommen till Hallå konsument"));

	html = file::read_test_file("uppsala.html");
	page.parse(html, "https://www.uppsala.se/stod-och-omsorg/privatekonomi-och-ekonomiskt-stod/boka-tid-for-budget--och-skuldradgivning/");
	BOOST_CHECK_EQUAL(page.title(), "Budget- och skuldrådgivning hos Konsument Uppsala - Uppsala kommun");
	BOOST_CHECK_EQUAL(page.h1(), "Budget- och skuldrådgivning hos Konsument Uppsala");
	BOOST_CHECK_EQUAL(page.meta(), "Om du vill göra din egen hushållsbudget, vill ha ekonomisk rådgivning eller har skulder och inte får pengarna att räcka till kan du vända dig till Konsument Uppsala. ");
	BOOST_CHECK(page.should_insert());

	BOOST_CHECK(has_link(
		"outlook.office365.com",
		"/owa/calendar/Budgetochskuldrdgivning@uppsalakommun1.onmicrosoft.com/bookings/",
		"Boka tid online"));

	// --- Remaining pages: no link lookup --------------------------------------------------

	html = file::read_test_file("chessgames.com");
	page.parse(html, "http://store.chessgames.com/chess-books/chess-notation-type/an---algebraic/author/s/alexander-cherniaev-anatoly-karpov-joe-gallagher-joel-r.-steed-miguel-a.-sanchez-richard-obrien/hardware-requirements/windows.html");
	BOOST_CHECK_EQUAL(page.title(), "Chess Books : Windows, AN - Algebraic, Alexander Cherniaev, Anatoly Karpov, Joe Gallagher, Joel R. Steed, Miguel A. Sanchez and Richard O'Brien");
	BOOST_CHECK_EQUAL(page.h1(), "Chess Books");
	BOOST_CHECK_EQUAL(page.meta(), "Shop for Chess Books at US Chess Federation Sales. We offer the widest selection of Chess Books at the lowest prices with same-day shipping.Windows, AN - Algebraic, Alexander Cherniaev, Anatoly Karpov, Joe Gallagher, Joel R. Steed, Miguel A. Sanchez and Richard O'Brien");

	// This page is expected to yield no links at all.
	BOOST_CHECK_EQUAL(page.links().size(), 0);
	BOOST_CHECK(page.should_insert());

	html = file::read_test_file("acomesf.org");
	page.parse(html, "http://acomesf.org/download/42104960-3er-congreso-acomesf/");
	BOOST_CHECK_EQUAL(page.title(), "42104960 3er Congreso ACOMESF | Asociación Colombiana de Médicos Especialistas en Salud Familiar (ACOMESF");
	BOOST_CHECK_EQUAL(page.h1(), "42104960 3er Congreso ACOMESF");
	BOOST_CHECK_EQUAL(page.meta(), "");
	BOOST_CHECK(page.should_insert());

	html = file::read_test_file("automobileszone.com");
	page.parse(html, "http://automobileszone.com/wp-login.php?redirect_to=http%3A%2F%2Fautomobileszone.com%2Fbest-bronco-build-off-our-editors-weigh-in-on-their-ideal-suvs%2F");
	BOOST_CHECK_EQUAL(page.text(), "Username or Email Address Password Remember Me Lost your password? ← Back to Automobiles Zone Log in with WordPress.com");
	BOOST_CHECK(page.should_insert());

	// For this page both the heading and the text are expected to be empty.
	html = file::read_test_file("vcareprojectmanagement.com");
	page.parse(html, "https://vcareprojectmanagement.com/products/project-manager-project-management-certification-pmi-atp-authorised-training-provider-pmp-capm-2021-online-training-course-class");
	BOOST_CHECK_EQUAL(page.h1(), "");
	BOOST_CHECK_EQUAL(page.text(), "");
}

BOOST_AUTO_TEST_CASE(html_parser_encodings) {

	parser::html_parser detector;

	// Must NOT be flagged: plain ASCII, Swedish letters, Vietnamese Latin-script text and
	// a title that contains an HTML entity.
	BOOST_CHECK(!detector.is_exotic_language("hej jag heter josef cullhed"));
	BOOST_CHECK(!detector.is_exotic_language("åäö"));
	BOOST_CHECK(!detector.is_exotic_language("Đảng,Đoàn thể - tnxp.hochiminhcity.gov.vn"));
	BOOST_CHECK(!detector.is_exotic_language("Maktspelet i Volvo : en skildring inifr&aring;n - Hans Nyman - Kartonnage (9789189323056) | Bokus"));

	// Must be flagged: titles that are mostly Cyrillic or Japanese.
	BOOST_CHECK(detector.is_exotic_language("В КФУ проходят съемки короткометражного фильма в рамках проекта «Кино за 7 дней» | ВидеоПрокат+"));
	BOOST_CHECK(detector.is_exotic_language("2015-09-09から1日間の記事一覧 - Nani-Sore　何それ？"));
	BOOST_CHECK(detector.is_exotic_language("Ремонт Принтеров Hp в Спб Адреса | Ремонт принтеров"));
}

BOOST_AUTO_TEST_CASE(html_parser_long_text) {

	// NOTE(review): the constructor argument is presumably the maximum text length the
	// parser keeps - confirm against parser/html_parser.h.
	parser::html_parser doc(100000);
	const string manual_html = file::read_test_file("zlib_manual.html");

	doc.parse(manual_html, "https://zlib.net/manual.html");

	// The extracted text must end with these 14 bytes.
	const string body = doc.text();
	BOOST_CHECK_EQUAL(body.substr(body.size() - 14), "# endif #endif");

	// The expanded word list must contain "inflateinit2".
	const vector<string> expanded_words = text::get_expanded_full_text_words(body);

	bool found_inflateinit2 = false;
	for (const string &candidate : expanded_words) {
		if (candidate == "inflateinit2") {
			found_inflateinit2 = true;
		}
	}

	BOOST_CHECK(found_inflateinit2);
}

/*
	TODO: add a test covering links such as: <a href="http://skatteverket.se/">Skatteverket</A>
	as found on this page: http://nomell.se/2009/03/24/prisa-gud-har-kommer-skatteaterbaringen/
*/

BOOST_AUTO_TEST_SUITE_END()


================================================
FILE: tests/test_hyper_ball.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <boost/test/unit_test.hpp>
#include "algorithm/hyper_ball.h"
#include "algorithm/algorithm.h"
#include <set>
#include <vector>

using namespace std;

BOOST_AUTO_TEST_SUITE(hyper_ball)

BOOST_AUTO_TEST_CASE(harmonic_centrality_hyper_ball) {

	{
		// Small directed graph on nodes 0..5; the node count passed to the algorithm is
		// 1000, so all remaining nodes have no edges.
		const set<pair<uint32_t, uint32_t>> edges = {
			{0, 1},
			{1, 2},
			{2, 0},
			{2, 3},
			{3, 4},
			{3, 5},
			{4, 2},
			{5, 4},
		};
		const size_t node_count = 1000;

		// The edge map is released with delete[] once the result has been computed.
		vector<uint32_t> *edge_map = algorithm::set_to_edge_map(node_count, edges);
		const vector<double> centrality = algorithm::hyper_ball(node_count, edge_map);
		delete [] edge_map;

		// One value per node.
		BOOST_CHECK(centrality.size() == node_count);

		BOOST_CHECK_CLOSE(centrality[0], 8.0/3.0, 0.000001);
		BOOST_CHECK_CLOSE(centrality[1], 7.0/3.0, 0.000001);
		BOOST_CHECK_CLOSE(centrality[2], 7.0/2.0, 0.000001);

		// Node 6 does not appear in any edge.
		BOOST_CHECK_EQUAL(centrality[6], 0.0);
	}

}

BOOST_AUTO_TEST_CASE(harmonic_centrality_hyper_ball2) {

	{
		// Directed graph on nodes 0..10; the node count passed to the algorithm is 1000.
		const set<pair<uint32_t, uint32_t>> edges = {
			{0, 1},
			{1, 5},
			{2, 5},
			{3, 2},
			{6, 2},
			{7, 3},
			{10, 7},
			{7, 9},
			{9, 3},
			{9, 6},
			{8, 9},
			{4, 8},
		};
		const size_t node_count = 1000;

		// The edge map is released with delete[] once the result has been computed.
		vector<uint32_t> *edge_map = algorithm::set_to_edge_map(node_count, edges);
		const vector<double> centrality = algorithm::hyper_ball(node_count, edge_map);
		delete [] edge_map;

		// One value per node.
		BOOST_CHECK(centrality.size() == node_count);

		// Reference values for three selected nodes.
		BOOST_CHECK_CLOSE(centrality[5], 4.86666666667, 0.000001);
		BOOST_CHECK_CLOSE(centrality[8], 1.0, 0.000001);
		BOOST_CHECK_CLOSE(centrality[2], 3.91666666667, 0.000001);
	}

}

BOOST_AUTO_TEST_CASE(harmonic_centrality_hyper_ball3) {

	{
		// Larger directed graph on nodes 0..25; the node count passed to the algorithm
		// is 1000.
		const set<pair<uint32_t, uint32_t>> edges = {
			{0, 11},
			{1, 0},
			{2, 1},
			{3, 2},
			{3, 8},
			{4, 7},
			{5, 7},
			{6, 7},
			{7, 8},
			{10, 12},
			{11, 1},
			{11, 10},
			{12, 25},
			{13, 9},
			{13, 14},
			{14, 9},
			{14, 8},
			{14, 15},
			{15, 7},
			{19, 15},
			{20, 21},
			{21, 16},
			{21, 17},
			{21, 18},
			{21, 22},
			{22, 23},
			{23, 19},
			{24, 20},
			{24, 21},
			{24, 25},
			{25, 24},
		};
		const size_t node_count = 1000;

		// The edge map is released with delete[] once the result has been computed.
		vector<uint32_t> *edge_map = algorithm::set_to_edge_map(node_count, edges);
		const vector<double> centrality = algorithm::hyper_ball(node_count, edge_map);
		delete [] edge_map;

		// One value per node.
		BOOST_CHECK(centrality.size() == node_count);

		// Reference values for two selected nodes.
		BOOST_CHECK_CLOSE(centrality[0], 2.33333333333, 0.000001);
		BOOST_CHECK_CLOSE(centrality[7], 7.25156232656, 0.000001);
	}

}

BOOST_AUTO_TEST_SUITE_END()


================================================
FILE: tests/test_hyper_log_log.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <boost/test/unit_test.hpp>
#include "algorithm/hyper_log_log.h"
#include <cstdlib>
#include <vector>

using namespace std;

BOOST_AUTO_TEST_SUITE(hyper_log_log)

/*
 * Checks leading_zeros_plus_one() against a handful of fixed 64 bit inputs.
 * */
BOOST_AUTO_TEST_CASE(hyper_simple) {

	algorithm::hyper_log_log sketch;

	// No bit set at all.
	BOOST_CHECK(65 == sketch.leading_zeros_plus_one(0x0ull));

	// Only the lowest bit set.
	BOOST_CHECK(64 == sketch.leading_zeros_plus_one(0x1ull));

	// Lower 32 bits set; asked twice, both calls must give the same answer.
	BOOST_CHECK(33 == sketch.leading_zeros_plus_one(0xFFFFFFFFull));
	BOOST_CHECK(33 == sketch.leading_zeros_plus_one(0xFFFFFFFFull));
}

/*
 * Inserts sequential integers and checks that count() stays within
 * interval * error_bound() of the true number of inserted values.
 * */
BOOST_AUTO_TEST_CASE(hyper_inserts) {

	{
		// Two small sketches that share 0..5 and differ in their last value.
		algorithm::hyper_log_log first;
		algorithm::hyper_log_log second;

		for (size_t shared = 0; shared <= 5; shared++) {
			first.insert(shared);
			second.insert(shared);
		}
		first.insert(6);
		second.insert(7);

		// Only exercises operator+, the merged sketch is not inspected.
		algorithm::hyper_log_log merged = first + second;
	}

	const vector<size_t> intervals = {400000, 500000, 1000000, 10000000};

	for (const size_t interval : intervals) {
		algorithm::hyper_log_log hl;

		size_t next = 0;
		while (next < interval) {
			hl.insert(next);
			next++;
		}

		const int deviation = std::abs((int)hl.count() - (int)interval);
		BOOST_CHECK(deviation < interval * hl.error_bound());
	}

}

/*
 * Fills two sketches with disjoint ranges and checks that their sum estimates
 * the size of the combined range.
 * */
BOOST_AUTO_TEST_CASE(hyper_union) {

	const size_t half = 250000;

	algorithm::hyper_log_log lower;
	algorithm::hyper_log_log upper;

	// lower receives [0, 250000), upper receives [250000, 500000).
	for (size_t offset = 0; offset < half; offset++) {
		lower.insert(offset);
		upper.insert(half + offset);
	}

	algorithm::hyper_log_log merged = lower + upper;

	const int deviation = std::abs((int)merged.count() - 500000);
	BOOST_CHECK(deviation < 500000 * merged.error_bound());
}

/*
 * Builds a sketch from the data()/b() of another one and checks its estimate,
 * then checks estimates for several batches of pseudo random 64 bit values.
 * */
BOOST_AUTO_TEST_CASE(hyper_log_log_data_copy) {

	algorithm::hyper_log_log source;

	size_t value = 0;
	while (value < 250000) {
		source.insert(value);
		value++;
	}

	// Construct a second sketch directly from the first one's data.
	algorithm::hyper_log_log copy(source.data(), source.b());

	const int copy_deviation = std::abs((int)copy.count() - 250000);
	BOOST_CHECK(copy_deviation < 250000 * source.error_bound());

	const std::vector<size_t> sizes = {25000, 50000, 75000, 100000, 200000, 300000, 400000};

	// Fixed seed keeps the sequence reproducible between runs.
	srand(100);
	for (const size_t size : sizes) {
		algorithm::hyper_log_log hll;

		for (size_t remaining = size; remaining > 0; remaining--) {
			// Combine two rand() results into one 64 bit value.
			size_t rnd = (((size_t)rand()) << 32) | ((size_t)rand());
			hll.insert(rnd);
		}

		const int deviation = std::abs((int)hll.count() - (int)size);
		BOOST_CHECK(deviation < size * source.error_bound());
	}
}

/*
 * Checks the estimate of a sketch constructed with the argument 10 after
 * inserting 100000 rand() values.
 * */
BOOST_AUTO_TEST_CASE(hyper_log_log_test2) {

	const int sz = 100000;

	algorithm::hyper_log_log sketch(10);

	for (int remaining = sz; remaining > 0; remaining--) {
		sketch.insert(rand());
	}

	const int deviation = std::abs((int)sketch.count() - sz);
	BOOST_CHECK(deviation < sz * sketch.error_bound());
}

/*
 * Checks that a move constructed sketch still produces a valid estimate.
 * */
BOOST_AUTO_TEST_CASE(hyper_log_log_move) {
	algorithm::hyper_log_log hl1(10);

	const int sz = 100000;

	for (int i = 0; i < sz; i++) {
		hl1.insert(rand());
	}

	// Read the bound before moving: hl1 is in a moved-from state afterwards
	// and must not be queried any more.
	const auto bound = hl1.error_bound();

	auto hl2 = std::move(hl1);

	BOOST_CHECK(std::abs((int)hl2.count() - sz) < sz * bound);
}

BOOST_AUTO_TEST_SUITE_END()


================================================
FILE: tests/test_index_builder.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <boost/test/unit_test.hpp>
#include "file/file.h"
#include "indexer/index_builder.h"
#include "indexer/index.h"
#include "indexer/generic_record.h"
#include "indexer/value_record.h"

BOOST_AUTO_TEST_SUITE(test_index_builder)

/*
 * Builds index 0 and index 8 of "test_index", merges index 8 into index 0 with
 * merge_with() and checks that index 0 then returns the records of both.
 * */
BOOST_AUTO_TEST_CASE(test_merge_with) {

	using record = indexer::value_record;
	using builder = indexer::index_builder<record>;
	using reader = indexer::index<record>;

	// Start from an empty index directory.
	file::delete_directory("./0/full_text/test_index");
	file::create_directory("./0/full_text/test_index");

	{
		// Populate index 0.
		builder target("test_index", 0, 1000);

		target.add(123, record(1000));
		target.add(123, record(1001));
		target.add(124, record(1000));

		target.append();
		target.merge();
	}
	{
		// Index 0 must contain exactly what was just added.
		reader target("test_index", 0, 1000);

		auto for_123 = target.find(123);
		auto for_124 = target.find(124);

		BOOST_REQUIRE_EQUAL(for_123.size(), 2);
		BOOST_REQUIRE_EQUAL(for_124.size(), 1);

		BOOST_CHECK_EQUAL(for_123[0].m_value, 1000);
		BOOST_CHECK_EQUAL(for_123[1].m_value, 1001);
		BOOST_CHECK_EQUAL(for_124[0].m_value, 1000);
	}
	{
		// Populate index 8 with additional records.
		builder extra("test_index", 8, 1000);

		extra.add(123, record(1002));
		extra.add(123, record(1003));
		extra.add(124, record(1010));
		extra.add(125, record(1011));

		extra.append();
		extra.merge();
	}

	{
		// Merge the content of index 8 into index 0.
		builder target("test_index", 0, 1000);
		reader extra("test_index", 8, 1000);

		target.merge_with(extra);
	}

	{
		// Index 0 must now return the records from both indexes.
		reader target("test_index", 0, 1000);

		auto for_123 = target.find(123);
		auto for_124 = target.find(124);
		auto for_125 = target.find(125);

		BOOST_REQUIRE_EQUAL(for_123.size(), 4);
		BOOST_REQUIRE_EQUAL(for_124.size(), 2);
		BOOST_REQUIRE_EQUAL(for_125.size(), 1);

		BOOST_CHECK_EQUAL(for_123[0].m_value, 1000);
		BOOST_CHECK_EQUAL(for_123[1].m_value, 1001);
		BOOST_CHECK_EQUAL(for_123[2].m_value, 1002);
		BOOST_CHECK_EQUAL(for_123[3].m_value, 1003);
		BOOST_CHECK_EQUAL(for_124[0].m_value, 1000);
		BOOST_CHECK_EQUAL(for_124[1].m_value, 1010);
		BOOST_CHECK_EQUAL(for_125[0].m_value, 1011);
	}
}

/*
 * Runs the same scenario as test_merge_with: two indexes are built, the
 * second is merged into the first and the combined result is verified.
 * */
BOOST_AUTO_TEST_CASE(test_merge_with2) {

	typedef indexer::value_record rec_t;
	typedef indexer::index_builder<rec_t> builder_t;
	typedef indexer::index<rec_t> index_t;

	// Wipe any data left behind by earlier test cases.
	file::delete_directory("./0/full_text/test_index");
	file::create_directory("./0/full_text/test_index");

	{
		builder_t base_builder("test_index", 0, 1000);

		base_builder.add(123, rec_t(1000));
		base_builder.add(123, rec_t(1001));
		base_builder.add(124, rec_t(1000));

		base_builder.append();
		base_builder.merge();
	}
	{
		index_t base_index("test_index", 0, 1000);

		auto hits_a = base_index.find(123);
		auto hits_b = base_index.find(124);

		BOOST_REQUIRE_EQUAL(hits_a.size(), 2);
		BOOST_REQUIRE_EQUAL(hits_b.size(), 1);

		BOOST_CHECK_EQUAL(hits_a[0].m_value, 1000);
		BOOST_CHECK_EQUAL(hits_a[1].m_value, 1001);
		BOOST_CHECK_EQUAL(hits_b[0].m_value, 1000);
	}
	{
		builder_t other_builder("test_index", 8, 1000);

		other_builder.add(123, rec_t(1002));
		other_builder.add(123, rec_t(1003));
		other_builder.add(124, rec_t(1010));
		other_builder.add(125, rec_t(1011));

		other_builder.append();
		other_builder.merge();
	}

	{
		// Fold index 8 into index 0.
		builder_t base_builder("test_index", 0, 1000);
		index_t other_index("test_index", 8, 1000);

		base_builder.merge_with(other_index);
	}

	{
		index_t base_index("test_index", 0, 1000);

		auto hits_a = base_index.find(123);
		auto hits_b = base_index.find(124);
		auto hits_c = base_index.find(125);

		BOOST_REQUIRE_EQUAL(hits_a.size(), 4);
		BOOST_REQUIRE_EQUAL(hits_b.size(), 2);
		BOOST_REQUIRE_EQUAL(hits_c.size(), 1);

		BOOST_CHECK_EQUAL(hits_a[0].m_value, 1000);
		BOOST_CHECK_EQUAL(hits_a[1].m_value, 1001);
		BOOST_CHECK_EQUAL(hits_a[2].m_value, 1002);
		BOOST_CHECK_EQUAL(hits_a[3].m_value, 1003);
		BOOST_CHECK_EQUAL(hits_b[0].m_value, 1000);
		BOOST_CHECK_EQUAL(hits_b[1].m_value, 1010);
		BOOST_CHECK_EQUAL(hits_c[0].m_value, 1011);
	}
}

BOOST_AUTO_TEST_SUITE_END()


================================================
FILE: tests/test_index_iteration.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <boost/test/unit_test.hpp>
#include <mutex>
#include "indexer/sharded_builder.h"
#include "indexer/sharded.h"
#include "indexer/basic_index_builder.h"
#include "indexer/basic_index.h"
#include "indexer/counted_record.h"

using namespace std;

BOOST_AUTO_TEST_SUITE(test_index_iteration)

/*
 * Adds four records under three keys and checks that for_each() hands every
 * key and every value to the callback.
 * */
BOOST_AUTO_TEST_CASE(test_index_iteration) {

	{
		indexer::sharded_builder<indexer::basic_index_builder, indexer::counted_record> idx("test_index", 10);
		idx.truncate();

		idx.add(100, indexer::counted_record(1000));
		idx.add(101, indexer::counted_record(1001));
		idx.add(101, indexer::counted_record(1002));
		idx.add(102, indexer::counted_record(1003));

		idx.append();
		idx.merge();
	}

	indexer::sharded<indexer::basic_index, indexer::counted_record> idx("test_index", 10);

	std::vector<uint64_t> found_keys;
	std::vector<uint64_t> found_values;
	std::mutex lock;
	idx.for_each([&lock, &found_keys, &found_values](uint64_t key, const std::vector<indexer::counted_record> &recs) {

		// Both vectors are shared between callback invocations, so guard them.
		std::lock_guard<std::mutex> grd(lock);

		found_keys.push_back(key);
		for (const auto &rec : recs) {
			found_values.push_back(rec.m_value);
		}

	});

	// The sizes have to be right before any element is read, otherwise the
	// indexed checks below would read out of bounds.
	BOOST_REQUIRE(found_keys.size() == 3);
	BOOST_REQUIRE(found_values.size() == 4);

	// Sort so the checks do not depend on the iteration order.
	std::sort(found_keys.begin(), found_keys.end());
	std::sort(found_values.begin(), found_values.end());

	BOOST_CHECK(found_keys[0] == 100);
	BOOST_CHECK(found_keys[1] == 101);
	BOOST_CHECK(found_keys[2] == 102);

	BOOST_CHECK(found_values[0] == 1000);
	BOOST_CHECK(found_values[1] == 1001);
	BOOST_CHECK(found_values[2] == 1002);
	BOOST_CHECK(found_values[3] == 1003);

}

/*
 * Adds the values 1..10000 under four different modulo keys each and checks
 * that for_each() reports every value under every key it was added to.
 * */
BOOST_AUTO_TEST_CASE(test_index_iteration2) {

	const size_t max_value = 10000;
	const size_t moduli[] = {10, 100, 7, 13};

	{
		indexer::sharded_builder<indexer::basic_index_builder, indexer::counted_record> builder("test_index", 10);
		builder.truncate();

		for (size_t value = 1; value <= max_value; value++) {
			for (const size_t modulus : moduli) {
				builder.add(value % modulus, indexer::counted_record(value));
			}
		}

		builder.append();
		builder.merge();
	}

	indexer::sharded<indexer::basic_index, indexer::counted_record> idx("test_index", 10);

	// Collect everything the index reports, grouped by key.
	std::map<uint64_t, std::vector<size_t>> records;
	std::mutex lock;
	idx.for_each([&lock, &records](uint64_t key, const std::vector<indexer::counted_record> &recs) {

		std::lock_guard<std::mutex> guard(lock);

		std::vector<size_t> &bucket = records[key];
		for (const auto &rec : recs) {
			bucket.push_back(rec.m_value);
		}

	});

	// True when value was reported under key.
	auto reported = [&records](uint64_t key, size_t value) {
		const std::vector<size_t> &bucket = records[key];
		return std::find(bucket.begin(), bucket.end(), value) != bucket.end();
	};

	for (size_t value = 1; value <= max_value; value++) {
		BOOST_CHECK(reported(value % 10, value));
		BOOST_CHECK(reported(value % 100, value));
		BOOST_CHECK(reported(value % 7, value));
		BOOST_CHECK(reported(value % 13, value));
	}

}

BOOST_AUTO_TEST_SUITE_END()


================================================
FILE: tests/test_index_reader.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <boost/test/unit_test.hpp>
#include "indexer/index_builder.h"
#include "indexer/index.h"
#include "indexer/generic_record.h"
#include <boost/iostreams/filtering_stream.hpp>
#include <boost/iostreams/filter/gzip.hpp>
#include "URL.h"
#include "text/text.h"
#include "profiler/profiler.h"
#include "roaring/roaring.hh"

BOOST_AUTO_TEST_SUITE(test_index_reader)

/*
 * Writes a small index to disk, loads the resulting data file into memory and
 * checks that an index reading from an in-memory stream finds the records.
 * */
BOOST_AUTO_TEST_CASE(test_index_reader1) {

	{
		indexer::index_builder<indexer::generic_record> idx("test_db", 0, 1000);

		idx.truncate();

		idx.add(100, indexer::generic_record(1000));
		idx.add(100, indexer::generic_record(1001));
		idx.add(100, indexer::generic_record(1002));

		idx.append();
		idx.merge();

	}

	{
		std::ifstream reader("./0/full_text/test_db/0.data", std::ios::binary);

		// Without this check a missing file would make tellg() return -1 and
		// the size below would wrap around to a huge value.
		BOOST_REQUIRE(reader.is_open());

		reader.seekg(0, std::ios::end);
		const size_t file_size = reader.tellg();
		reader.seekg(0, std::ios::beg);

		// Read straight into the string that backs the stream. The string owns
		// the memory, so nothing leaks when a BOOST_REQUIRE aborts the test.
		std::string file_data(file_size, '\0');
		reader.read(&file_data[0], file_size);

		std::istringstream ram_reader(file_data);

		indexer::index<indexer::generic_record> idx(&ram_reader, 1000);

		std::vector<indexer::generic_record> res = idx.find(100);

		BOOST_REQUIRE(res.size() == 3);
		BOOST_CHECK(res[0].m_value == 1000);
		BOOST_CHECK(res[1].m_value == 1001);
		BOOST_CHECK(res[2].m_value == 1002);
	}

}

/*
 * Placeholder test case. The whole body is commented out, so no checks are
 * executed when it runs.
 * */
BOOST_AUTO_TEST_CASE(test_index_reader_2) {

	// Disabled code: builds an index from gzipped files found in ./output and
	// then looks up a word in an index file that was loaded into memory.
	/*
	{
		indexer::index_builder<indexer::url_record> idx("restaurantbusinessonline.com");
		idx.set_hash_table_size(1000);

		idx.truncate();

		const vector<size_t> cols = {1, 2, 3, 4};

		vector<string> files;

		boost::filesystem::path p ("./output");
		boost::filesystem::directory_iterator end_itr;

		for (boost::filesystem::directory_iterator itr(p); itr != end_itr; ++itr) {
			// If it's not a directory, list it. If you want to list directories too, just remove this check.
			if (boost::filesystem::is_regular_file(itr->path())) {
				// assign current file name to current_file and echo it out to the console.
				string current_file = itr->path().string();
				files.push_back(current_file);
			}
		}

		size_t num_added = 0;
		size_t num_bytes_added = 0;

		for (const string &local_path : files) {

			ifstream infile(local_path, ios::in);
			boost::iostreams::filtering_istream decompress_stream;
			decompress_stream.push(boost::iostreams::gzip_decompressor());
			decompress_stream.push(infile);

			string line;
			while (getline(decompress_stream, line)) {
				vector<string> col_values;
				boost::algorithm::split(col_values, line, boost::is_any_of("\t"));

				URL url(col_values[0]);

				if (url.host() != "doodlecraftblog.com") continue;

				num_added++;

				uint64_t url_hash = url.hash();

				for (size_t col : cols) {
					vector<string> words = text::get_full_text_words(col_values[col]);
					for (const string &word : words) {
						num_bytes_added += word.size();
						idx.add(::algorithm::hash(word), ::indexer::url_record(url_hash));
					}
				}
			}
		}

		num_added++;

		cout << "ADDED " << num_added << " URLS" << endl;
		cout << num_bytes_added << " bytes" << endl;

		idx.append();
		idx.merge();

	}

	{
		logger::verbose(true);
		profiler::instance prof("load index file to ram");
		ifstream reader("restaurantbusinessonline.com.data", ios::binary);
		reader.seekg(0, ios::end);
		size_t file_size = reader.tellg();
		reader.seekg(0, ios::beg);
		char *buffer = new char[file_size];
		reader.read(buffer, file_size);
		prof.stop();

		indexer::index_reader_ram ram_reader(buffer, file_size);

		indexer::index<indexer::generic_record> idx((indexer::index_reader *)&ram_reader, 1000);

		cout << "file_size: " << file_size << endl;
		idx.print_stats();

		vector<indexer::generic_record> res = idx.find(::algorithm::hash("helicopter"));

		BOOST_REQUIRE(res.size() > 0);

		delete buffer;
	}*/

}

BOOST_AUTO_TEST_SUITE_END()


================================================
FILE: tests/test_logger.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <boost/test/unit_test.hpp>
#include "logger/logger.h"
#include "config.h"

using namespace std;

BOOST_AUTO_TEST_SUITE(test_logger)

/*
 * Logs two strings and checks that they are the last two lines of the log
 * file after a sync.
 * */
BOOST_AUTO_TEST_CASE(test_logger1) {

	logger::log_string("test1");
	logger::log_string("test2");

	logger::sync();

	// "test1\n" and "test2\n" together occupy the last 12 bytes of the file.
	const int tail_bytes = 12;

	ifstream logfile(config::log_file_path);
	logfile.seekg(-tail_bytes, std::ios::end);

	string second_last;
	getline(logfile, second_last);
	BOOST_CHECK_EQUAL(second_last, "test1");

	string last;
	getline(logfile, last);
	BOOST_CHECK_EQUAL(last, "test2");
}

BOOST_AUTO_TEST_SUITE_END()


================================================
FILE: tests/test_memory.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <boost/test/unit_test.hpp>
#include "memory/memory.h"
#include "memory/debugger.h"
#include "indexer/index_builder.h"
#include "indexer/basic_index_builder.h"
#include "indexer/domain_link_record.h"

BOOST_AUTO_TEST_SUITE(test_memory)

/*
 * Checks that allocated_memory() grows by the size of a new[] allocation and
 * returns to its previous value after delete[].
 * */
BOOST_AUTO_TEST_CASE(test_memory) {

	const size_t memlen = 1000000;

	memory::update();

	BOOST_CHECK(memory::get_available_memory() > 0);
	BOOST_CHECK(memory::get_total_memory() > 0);

	const size_t used_before = memory::allocated_memory();

	// Allocate and write to every byte of the block.
	char *block = new char[memlen];
	for (char *cursor = block; cursor != block + memlen; ++cursor) {
		*cursor = 1;
	}
	memory::update();

	const size_t used_during = memory::allocated_memory();
	delete[] block;
	const size_t used_after = memory::allocated_memory();

	std::cout << "used1: " << used_before << std::endl;
	std::cout << "used2: " << used_during << std::endl;
	std::cout << "used3: " << used_after << std::endl;

	BOOST_CHECK(used_before + memlen == used_during);
	BOOST_CHECK(used_before == used_after);
}

/*
 * Test memory consumption during merge, should end with the same amount.
 * */
BOOST_AUTO_TEST_CASE(test_indexer_memory) {
	memory::update();

	indexer::create_db_directories("domain_link_index");

	BOOST_CHECK(memory::get_available_memory() > 0);
	BOOST_CHECK(memory::get_total_memory() > 0);

	size_t after_construct = 0;
	size_t after_merge = 0;

	const size_t before_builder = memory::allocated_memory();

	{
		indexer::basic_index_builder<indexer::domain_link_record> builder("domain_link_index", 97ull);

		// Sample right after construction and again after append + merge.
		after_construct = memory::allocated_memory();
		builder.append();
		builder.merge();
		after_merge = memory::allocated_memory();
	}

	// The builder has been destroyed at this point.
	const size_t after_builder = memory::allocated_memory();

	BOOST_CHECK(before_builder == after_builder);
	BOOST_CHECK(after_construct == after_merge);

}

BOOST_AUTO_TEST_SUITE_END()


================================================
FILE: tests/test_n_gram.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <boost/test/unit_test.hpp>
#include "text/text.h"
#include "algorithm/hash.h"

using namespace std;

BOOST_AUTO_TEST_SUITE(n_gram)

/*
 * Checks the hashes and their order produced by words_to_ngram_hash() for a
 * nine word sentence with n-grams of up to three words.
 * */
BOOST_AUTO_TEST_CASE(words_to_ngram) {
	vector<uint64_t> ngrams;
	text::words_to_ngram_hash({"the", "quick", "brown", "fox", "jumps", "over", "the", "lazy", "dog"}, 3, [&ngrams](const uint64_t hash) {
		ngrams.push_back(hash);
	});

	// Must hold before any element is read, the checks below index up to 23.
	BOOST_REQUIRE_EQUAL(ngrams.size(), 24);

	BOOST_CHECK_EQUAL(ngrams[0], algorithm::hash("the"));
	BOOST_CHECK_EQUAL(ngrams[1], algorithm::hash("the quick"));
	BOOST_CHECK_EQUAL(ngrams[2], algorithm::hash("the quick brown"));

	BOOST_CHECK_EQUAL(ngrams[3], algorithm::hash("quick"));
	BOOST_CHECK_EQUAL(ngrams[4], algorithm::hash("quick brown"));
	BOOST_CHECK_EQUAL(ngrams[5], algorithm::hash("quick brown fox"));

	BOOST_CHECK_EQUAL(ngrams[6], algorithm::hash("brown"));
	BOOST_CHECK_EQUAL(ngrams[7], algorithm::hash("brown fox"));
	BOOST_CHECK_EQUAL(ngrams[8], algorithm::hash("brown fox jumps"));

	// Near the end of the sentence the n-grams get shorter.
	BOOST_CHECK_EQUAL(ngrams[18], algorithm::hash("the"));
	BOOST_CHECK_EQUAL(ngrams[19], algorithm::hash("the lazy"));
	BOOST_CHECK_EQUAL(ngrams[20], algorithm::hash("the lazy dog"));

	BOOST_CHECK_EQUAL(ngrams[21], algorithm::hash("lazy"));
	BOOST_CHECK_EQUAL(ngrams[22], algorithm::hash("lazy dog"));
	BOOST_CHECK_EQUAL(ngrams[23], algorithm::hash("dog"));

}

/*
 * Checks the hashes produced by words_to_ngram_hash() for a three word input
 * that contains a non-ascii word, using the callback that also gets the word.
 * */
BOOST_AUTO_TEST_CASE(n_gram2) {

	vector<uint64_t> ngrams;
	text::words_to_ngram_hash({"i", "liberoklubben", "här"}, 3, [&ngrams](const uint64_t hash, const std::string &word) {
		ngrams.push_back(hash);
	});

	// Must hold before any element is read, the checks below index up to 5.
	BOOST_REQUIRE_EQUAL(ngrams.size(), 6);

	BOOST_CHECK_EQUAL(ngrams[0], algorithm::hash("i"));
	BOOST_CHECK_EQUAL(ngrams[1], algorithm::hash("i liberoklubben"));
	BOOST_CHECK_EQUAL(ngrams[2], algorithm::hash("i liberoklubben här"));

	BOOST_CHECK_EQUAL(ngrams[3], algorithm::hash("liberoklubben"));
	BOOST_CHECK_EQUAL(ngrams[4], algorithm::hash("liberoklubben här"));
	BOOST_CHECK_EQUAL(ngrams[5], algorithm::hash("här"));
}

BOOST_AUTO_TEST_SUITE_END()


================================================
FILE: tests/test_robot_parser.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <boost/test/unit_test.hpp>
#include "robots.h"

using namespace std;

BOOST_AUTO_TEST_SUITE(robot_parser)

/*
 * A robots.txt with "Disallow: *" in the AlexandriaBot group must not allow
 * AlexandriaBot to fetch /visit.
 * */
BOOST_AUTO_TEST_CASE(parse) {

	const std::string user_agent = "AlexandriaBot";
	const std::string url = "/visit";

	const std::string robots_content =
		"Sitemap: https://www.omnible.se/sitemap.xml\n"
		"User-agent: AlexandriaBot\n"
		"Disallow: *\n"
		"User-agent: *   # all agents\n"
		"Disallow: /*crawl=no*\n"
		"Disallow: /basket/add*\n";

	googlebot::RobotsMatcher matcher;

	BOOST_CHECK(!matcher.OneAgentAllowedByRobots(robots_content, user_agent, url));
}

/*
 * The AlexandriaBot group disallows one specific path. That exact path must be
 * rejected while a path that is one character shorter must be allowed.
 * */
BOOST_AUTO_TEST_CASE(parse2) {

	const std::string user_agent = "AlexandriaBot";

	const std::string robots_content =
		"Sitemap: https://www.omnible.se/sitemap.xml\n"
		"User-agent: *\n"
		"Disallow: /visit\n"
		"User-agent: AlexandriaBot\n"
		"Disallow: /10126597891759986715\n";

	const std::string disallowed_url = "https://www.omnible.se/10126597891759986715";
	const std::string allowed_url = "https://www.omnible.se/1012659789175998671";

	// The same matcher instance is used for both lookups.
	googlebot::RobotsMatcher matcher;

	const bool first_allowed = matcher.OneAgentAllowedByRobots(robots_content, user_agent, disallowed_url);
	BOOST_CHECK(!first_allowed);

	const bool second_allowed = matcher.OneAgentAllowedByRobots(robots_content, user_agent, allowed_url);
	BOOST_CHECK(second_allowed);

}


BOOST_AUTO_TEST_SUITE_END()


================================================
FILE: tests/test_scraper.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <boost/test/unit_test.hpp>
#include <boost/algorithm/string.hpp>
#include "scraper/scraper.h"
#include <queue>
#include <vector>

using namespace std;

BOOST_AUTO_TEST_SUITE(test_scraper)

/*
 * Runs the scraper on a handful of omnible.se urls and checks the url and
 * title columns of the last row in the store.
 * */
BOOST_AUTO_TEST_CASE(test_scraper) {

	scraper::scraper_store store;

	scraper::scraper scraper("omnible.se", &store);
	scraper.set_timeout(0);
	scraper.push_url(URL("http://omnible.se/"));
	scraper.push_url(URL("http://omnible.se/10126597891759986715"));
	scraper.push_url(URL("http://omnible.se/10123997891267016458"));
	scraper.push_url(URL("http://omnible.se/gtin/9789180230865"));
	scraper.push_url(URL("http://omnible.se/10123697814011564169"));
	scraper.push_url(URL("https://www.omnible.se/notfound"));
	scraper.push_url(URL("https://www.omnible.se/gtin/9789177714958"));

	scraper.run();

	// The row is tab separated.
	const std::string last = store.tail();
	std::vector<std::string> cols;
	boost::algorithm::split(cols, last, boost::is_any_of("\t"));

	// Splitting a row without tabs gives a single column, so make sure both
	// columns exist before reading them.
	BOOST_REQUIRE(cols.size() >= 2);

	BOOST_CHECK_EQUAL(cols[0], "https://www.omnible.se/10123697814011564169");
	BOOST_CHECK_EQUAL(cols[1], "Den sista gåvan av Abdulrazak Gurnah - recensioner & prisjämförelse - Omnible");
}

/*
 * Would run the scraper on a list of urls. The unconditional return at the
 * top means none of the code after it is executed.
 * */
BOOST_AUTO_TEST_CASE(scraper_multithreaded) {

	// Everything below this line is unreachable.
	return;

	vector<string> targets = {
		"http://optout.aboutads.info/",
		"http://tabernus.com/",
		"http://tabernus.com/test",
		"http://apnews.excite.com/article/20071031/D8SKBRKO0.html",
		"http://thebetter.wiki/en/Jeb_Magruder",
		"https://www.thebetter.wiki/en/testing"
		/* Earlier url list, kept for reference:
		"http://omnible.se/",
		"http://omnible.se/10126597891759986715",
		"http://omnible.se/10123997891267016458",
		"https://spelagratis.nu/",
		"https://spelagratis.nu/super_mario_world.html",
		"http://omnible.se/gtin/9789180230865",
		"http://omnible.se/10123697814011564169",
		"https://spelagratis.nu/dirt_bike.html"*/
	};

	scraper::run_scraper_on_urls(targets);
}

BOOST_AUTO_TEST_SUITE_END()


================================================
FILE: tests/test_sharded_index_builder.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <boost/test/unit_test.hpp>
#include "indexer/index_manager.h"
#include "indexer/sharded_index_builder.h"
#include "indexer/sharded_index.h"
#include "indexer/merger.h"
#include "text/text.h"
#include "algorithm/hash.h"
#include "transfer/transfer.h"

BOOST_AUTO_TEST_SUITE(test_sharded_index_builder)

/*
 * Writes two generic records into a 10-shard index and reads one of them back by key.
 */
BOOST_AUTO_TEST_CASE(test_sharded_index_builder) {

	using indexer::generic_record;

	// Build phase. The builder lives in its own scope so it is destroyed before the index is read.
	{
		indexer::sharded_index_builder<generic_record> builder("test_index", 10);
		builder.truncate();

		builder.add(101, generic_record(1000, 1.0f));
		builder.add(102, generic_record(1001, 1.0f));

		builder.append();
		builder.merge();
	}

	// Read phase: key 101 must give back exactly the one record that was stored under it.
	{
		indexer::sharded_index<generic_record> index("test_index", 10);
		const std::vector<generic_record> found = index.find(101);

		BOOST_REQUIRE(found.size() == 1);
		BOOST_CHECK(found.front().m_value == 1000);
	}

}

/*
 * Builds a single-shard index of domain_link_records under three keys and checks find_group_by
 * for two different key pairs and two different score callbacks.
 *
 * NOTE(review): the third constructor argument (200 / 300) looks like the field the results are
 * grouped by - confirm against domain_link_record.
 */
BOOST_AUTO_TEST_CASE(test_group_by) {

	using indexer::domain_link_record;

	{
		indexer::sharded_index_builder<domain_link_record> idx("test_index", 1);

		idx.truncate();

		idx.add(101, domain_link_record(1000, 1.0f, 200));
		idx.add(101, domain_link_record(1004, 1.0f, 300));
		idx.add(101, domain_link_record(1001, 1.0f, 200));
		idx.add(101, domain_link_record(1003, 1.0f, 300));
		idx.add(101, domain_link_record(1002, 1.0f, 200));

		idx.add(102, domain_link_record(1000, 1.0f, 200));
		idx.add(102, domain_link_record(1001, 1.0f, 200));
		idx.add(102, domain_link_record(1005, 1.0f, 300));
		idx.add(102, domain_link_record(1002, 1.0f, 200));

		idx.add(103, domain_link_record(1000, 1.0f, 200));
		idx.add(103, domain_link_record(1001, 1.0f, 200));
		idx.add(103, domain_link_record(1004, 1.0f, 300));
		idx.add(103, domain_link_record(1002, 1.0f, 200));

		idx.append();
		idx.merge();
		idx.optimize();
	}

	{
		// Keys 101 and 102 share the values 1000, 1001 and 1002, all added with 200, so a single
		// group with a summed score of 3 is expected.
		indexer::sharded_index<domain_link_record> idx("test_index", 1);

		auto identity = [](float score) {
			return score;
		};
		std::vector<size_t> counts;
		vector<domain_link_record> res = idx.find_group_by({101, 102}, identity, counts);

		BOOST_REQUIRE(res.size() == 1);
		// Abort instead of reading past the end of counts if no count was reported.
		BOOST_REQUIRE(!counts.empty());
		BOOST_CHECK(res[0].m_score == 3.0f);
		BOOST_CHECK(counts[0] == 3);
	}

	{
		// Keys 101 and 103 share 1000, 1001, 1002 (added with 200) and 1004 (added with 300), so
		// two groups are expected, with the callback doubling the scores.
		indexer::sharded_index<domain_link_record> idx("test_index", 1);
		auto times_two = [](float score) {
			return 2.0f * score;
		};
		std::vector<size_t> counts;
		vector<domain_link_record> res = idx.find_group_by({101, 103}, times_two, counts);

		BOOST_REQUIRE(res.size() == 2);
		// Abort instead of reading past the end of counts if fewer than two counts were reported.
		BOOST_REQUIRE(counts.size() >= 2);

		sort(res.begin(), res.end(), domain_link_record::storage_order());
		BOOST_CHECK(res[0].m_score == 2.0f * (3.0f));
		BOOST_CHECK(res[1].m_score == 2.0f * (1.0f));
		BOOST_CHECK(counts[0] == 3);
		BOOST_CHECK(counts[1] == 1);
	}

}

/*
 * Checks that find_top orders its results by the score returned from the supplied callback.
 */
BOOST_AUTO_TEST_CASE(test_score_mod) {

	using indexer::domain_record;

	{
		indexer::sharded_index_builder<domain_record> builder("test_index", 1);
		builder.truncate();

		// Key 101: values 1000-1004, deliberately added out of order.
		for (int value : {1000, 1004, 1001, 1003, 1002}) {
			builder.add(101, domain_record(value, 1.0f));
		}

		// Key 102: shares 1000, 1001 and 1002 with key 101.
		for (int value : {1000, 1001, 1005, 1002}) {
			builder.add(102, domain_record(value, 1.0f));
		}

		builder.append();
		builder.merge();
		builder.optimize();
	}

	{
		/*
		 * The intersected records come in the order 1000, 1001, 1002 and the callback is applied
		 * in that order. It hands out 0, 1, 2 per call, so the two best records are
		 * 1002 (score 2) followed by 1001 (score 1).
		 */
		indexer::sharded_index<domain_record> index("test_index", 1);
		uint64_t calls = 0;
		std::vector<domain_record> top = index.find_top({101, 102}, 2,
				[&calls](const domain_record &) -> float {
					return static_cast<float>(calls++);
				});

		BOOST_REQUIRE(top.size() == 2);

		BOOST_CHECK(top[0].m_value == 1002);
		BOOST_CHECK(top[0].m_score == 2.0f);

		BOOST_CHECK(top[1].m_value == 1001);
		BOOST_CHECK(top[1].m_score == 1.0f);
	}

}

BOOST_AUTO_TEST_SUITE_END()


================================================
FILE: tests/test_sort.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <boost/test/unit_test.hpp>
#include "algorithm/sort.h"
#include <vector>

using namespace std;

BOOST_AUTO_TEST_SUITE(test_sort)

// Plain two-field record used by the struct merge tests below. The comparators in those tests
// order by data1 only; data2 is carried along and checked after the merge.
struct test_data_struct1 {
	int data1;
	int data2;
};

/*
 * Two-way merge of sorted int vectors: disjoint ranges, a shared boundary value and an empty input.
 */
BOOST_AUTO_TEST_CASE(merge_arrays) {

	// Disjoint ranges.
	{
		vector<int> left{1, 2, 3};
		vector<int> right{4, 5, 6};
		vector<int> merged;

		algorithm::sort::merge_arrays(left, right, merged);

		const vector<int> expected{1, 2, 3, 4, 5, 6};
		BOOST_CHECK(merged == expected);
	}

	// The value 3 is present in both inputs and must appear twice in the output.
	{
		vector<int> left{1, 2, 3};
		vector<int> right{3, 4, 5, 6};
		vector<int> merged;

		algorithm::sort::merge_arrays(left, right, merged);

		const vector<int> expected{1, 2, 3, 3, 4, 5, 6};
		BOOST_CHECK(merged == expected);
	}

	// Empty first input.
	{
		vector<int> left;
		vector<int> right{3, 4, 5, 6};
		vector<int> merged;

		algorithm::sort::merge_arrays(left, right, merged);

		const vector<int> expected{3, 4, 5, 6};
		BOOST_CHECK(merged == expected);
	}

}

/*
 * Two-way merge of struct vectors using a comparator on data1. The merged size is verified with
 * BOOST_REQUIRE before any element is read, so a short result fails the test instead of reading
 * out of bounds.
 */
BOOST_AUTO_TEST_CASE(merge_arrays_of_struct) {

	{
		vector<struct test_data_struct1> arr1{test_data_struct1{.data1 = 1, .data2 = 2}};
		vector<struct test_data_struct1> arr2{test_data_struct1{.data1 = 2, .data2 = 3}};
		vector<struct test_data_struct1> arr3;
		vector<struct test_data_struct1> arr4{test_data_struct1{.data1 = 1, .data2 = 2}, test_data_struct1{.data1 = 2, .data2 = 3}};

		algorithm::sort::merge_arrays(arr1, arr2, [](const struct test_data_struct1 &a, const struct test_data_struct1 &b) {
			return a.data1 < b.data1;
		}, arr3);

		BOOST_REQUIRE(arr3.size() == arr4.size());
		for (size_t i = 0; i < arr4.size(); i++) {
			BOOST_CHECK(arr3[i].data1 == arr4[i].data1 && arr3[i].data2 == arr4[i].data2);
		}
	}

	{
		// The single element of arr2 has to be placed between the two elements of arr1.
		vector<struct test_data_struct1> arr1{test_data_struct1{.data1 = 1, .data2 = 2}, test_data_struct1{.data1 = 3, .data2 = 4}};
		vector<struct test_data_struct1> arr2{test_data_struct1{.data1 = 2, .data2 = 3}};
		vector<struct test_data_struct1> arr3;
		vector<struct test_data_struct1> arr4{test_data_struct1{.data1 = 1, .data2 = 2}, test_data_struct1{.data1 = 2, .data2 = 3},
			test_data_struct1{.data1 = 3, .data2 = 4}};

		algorithm::sort::merge_arrays(arr1, arr2, [](const struct test_data_struct1 &a, const struct test_data_struct1 &b) {
			return a.data1 < b.data1;
		}, arr3);

		BOOST_REQUIRE(arr3.size() == arr4.size());
		for (size_t i = 0; i < arr4.size(); i++) {
			BOOST_CHECK(arr3[i].data1 == arr4[i].data1 && arr3[i].data2 == arr4[i].data2);
		}
	}

}

/*
 * N-way merge of sorted int vectors, first with disjoint ranges and then with interleaved values.
 */
BOOST_AUTO_TEST_CASE(merge_many_arrays) {

	// Disjoint ranges: the output is the plain concatenation.
	{
		vector<vector<int>> inputs{{1, 2, 3}, {4, 5, 6}, {7, 8, 9}};
		vector<int> merged;

		algorithm::sort::merge_arrays(inputs, merged);

		const vector<int> expected{1, 2, 3, 4, 5, 6, 7, 8, 9};
		BOOST_CHECK(merged == expected);
	}

	// Interleaved values of unequal length; the value 1 occurs in two of the inputs.
	{
		vector<vector<int>> inputs{{1, 3, 6}, {2, 4, 9}, {1, 5, 7, 8}};
		vector<int> merged;

		algorithm::sort::merge_arrays(inputs, merged);

		const vector<int> expected{1, 1, 2, 3, 4, 5, 6, 7, 8, 9};
		BOOST_CHECK(merged == expected);
	}
}

/*
 * N-way merge of struct vectors using a comparator on data1. The three inputs cover disjoint,
 * increasing data1 ranges, so the expected output is their concatenation.
 */
BOOST_AUTO_TEST_CASE(merge_many_arrays_of_struct) {

	{
		vector<struct test_data_struct1> arr1{
			test_data_struct1{.data1 = 1, .data2 = 11},
			test_data_struct1{.data1 = 2, .data2 = 12},
			test_data_struct1{.data1 = 3, .data2 = 13}
		};
		vector<struct test_data_struct1> arr2 = {
			test_data_struct1{.data1 = 4, .data2 = 14},
			test_data_struct1{.data1 = 5, .data2 = 15},
			test_data_struct1{.data1 = 6, .data2 = 16}
		};
		vector<struct test_data_struct1> arr3 = {
			test_data_struct1{.data1 = 7, .data2 = 17},
			test_data_struct1{.data1 = 8, .data2 = 18},
			test_data_struct1{.data1 = 9, .data2 = 19}
		};
		vector<struct test_data_struct1> res;
		vector<vector<struct test_data_struct1>> inp{arr1, arr2, arr3};
		vector<struct test_data_struct1> corr{
			test_data_struct1{.data1 = 1, .data2 = 11},
			test_data_struct1{.data1 = 2, .data2 = 12},
			test_data_struct1{.data1 = 3, .data2 = 13},
			test_data_struct1{.data1 = 4, .data2 = 14},
			test_data_struct1{.data1 = 5, .data2 = 15},
			test_data_struct1{.data1 = 6, .data2 = 16},
			test_data_struct1{.data1 = 7, .data2 = 17},
			test_data_struct1{.data1 = 8, .data2 = 18},
			test_data_struct1{.data1 = 9, .data2 = 19}
		};

		algorithm::sort::merge_arrays(inp, [](const struct test_data_struct1 &a, const struct test_data_struct1 &b) {
			return a.data1 < b.data1;
		}, res);

		// Must be fatal: the loop below indexes res up to corr.size(), so continuing after a size
		// mismatch could read past the end of res.
		BOOST_REQUIRE(corr.size() == res.size());
		for (size_t i = 0; i < corr.size(); i++) {
			BOOST_CHECK(res[i].data1 == corr[i].data1 && res[i].data2 == corr[i].data2);
		}
	}

}

BOOST_AUTO_TEST_SUITE_END()


================================================
FILE: tests/test_sum_sorted.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <boost/test/unit_test.hpp>
#include <vector>
#include "algorithm/sum_sorted.h"
#include "indexer/counted_record.h"

using namespace std;

BOOST_AUTO_TEST_SUITE(test_sum_sorted, * boost::unit_test::tolerance(0.00001))

/*
 * Each value v occurs v times across the inputs, so with an additive combiner the expected
 * result for v is v * v.
 */
BOOST_AUTO_TEST_CASE(test_sum_sorted1) {

	vector<vector<int>> inputs{{1, 2, 3}, {2, 3}, {3}};
	const vector<int> expected{1, 4, 9};

	vector<int> totals = ::algorithm::sum_sorted<int>(inputs, [](int &acc, const int &val) {
		acc += val;
	});

	BOOST_REQUIRE(totals.size() == expected.size());
	for (size_t i = 0; i < expected.size(); i++) {
		BOOST_CHECK(totals[i] == expected[i]);
	}
}

/*
 * Same data as test_sum_sorted1 with the input vectors listed shortest first; the result must not
 * depend on the order in which the inputs are given.
 */
BOOST_AUTO_TEST_CASE(test_sum_sorted2) {

	vector<vector<int>> inputs{{3}, {2, 3}, {1, 2, 3}};
	const vector<int> expected{1, 4, 9};

	vector<int> totals = ::algorithm::sum_sorted<int>(inputs, [](int &acc, const int &val) {
		acc += val;
	});

	BOOST_REQUIRE(totals.size() == expected.size());
	for (size_t i = 0; i < expected.size(); i++) {
		BOOST_CHECK(totals[i] == expected[i]);
	}
}

/*
 * Sums the scores of counted_records that share a value: value 1 occurs once, value 2 twice and
 * value 3 three times, each time with a score of 0.1.
 *
 * The scores are compared with BOOST_TEST on double operands so that the floating point tolerance
 * declared on this test suite is applied. BOOST_CHECK_EQUAL compares exactly and ignores it.
 */
BOOST_AUTO_TEST_CASE(test_sum_sorted3) {

	vector<vector<indexer::counted_record>> sorted = {
		{indexer::counted_record(3, 0.1)},
		{indexer::counted_record(2, 0.1), indexer::counted_record(3, 0.1)},
		{indexer::counted_record(1, 0.1), indexer::counted_record(2, 0.1), indexer::counted_record(3, 0.1)},
	};
	vector<indexer::counted_record> res = ::algorithm::sum_sorted<indexer::counted_record>(sorted,
			[](indexer::counted_record &a, const indexer::counted_record &b) {
		a.m_score += b.m_score;
	});

	BOOST_REQUIRE(res.size() == 3);
	BOOST_TEST(static_cast<double>(res[0].m_score) == 0.1);
	BOOST_TEST(static_cast<double>(res[1].m_score) == 0.2);
	BOOST_TEST(static_cast<double>(res[2].m_score) == 0.3);
}

/*
 * Sums scores over three inputs with partially overlapping values. Values 1 and 25 occur in two
 * inputs each (0.1 + 0.7 and 0.5 + 0.8); every other value occurs once and keeps its own score.
 *
 * The scores are compared with BOOST_TEST on double operands so that the floating point tolerance
 * declared on this test suite is applied. BOOST_CHECK_EQUAL compares exactly and ignores it.
 */
BOOST_AUTO_TEST_CASE(test_sum_sorted4) {

	vector<vector<indexer::counted_record>> sorted = {
		{indexer::counted_record(1, 0.1), indexer::counted_record(2, 0.2), indexer::counted_record(3, 0.3)},
		{indexer::counted_record(10, 0.4), indexer::counted_record(25, 0.5), indexer::counted_record(30, 0.6)},
		{indexer::counted_record(1, 0.7), indexer::counted_record(25, 0.8), indexer::counted_record(40, 0.9)},
	};
	vector<indexer::counted_record> res = ::algorithm::sum_sorted<indexer::counted_record>(sorted,
			[](indexer::counted_record &a, const indexer::counted_record &b) {
		a.m_score += b.m_score;
	});

	BOOST_REQUIRE(res.size() == 7);
	BOOST_TEST(static_cast<double>(res[0].m_score) == 0.8);
	BOOST_TEST(static_cast<double>(res[1].m_score) == 0.2);
	BOOST_TEST(static_cast<double>(res[2].m_score) == 0.3);
	BOOST_TEST(static_cast<double>(res[3].m_score) == 0.4);
	BOOST_TEST(static_cast<double>(res[4].m_score) == 1.3);
	BOOST_TEST(static_cast<double>(res[5].m_score) == 0.6);
	BOOST_TEST(static_cast<double>(res[6].m_score) == 0.9);

	// The values are integral, so exact comparison is correct here.
	BOOST_CHECK_EQUAL(res[0].m_value, 1);
	BOOST_CHECK_EQUAL(res[1].m_value, 2);
	BOOST_CHECK_EQUAL(res[2].m_value, 3);
	BOOST_CHECK_EQUAL(res[3].m_value, 10);
	BOOST_CHECK_EQUAL(res[4].m_value, 25);
	BOOST_CHECK_EQUAL(res[5].m_value, 30);
	BOOST_CHECK_EQUAL(res[6].m_value, 40);
}

/*
 * An empty input vector must not change the result: the non-empty input comes back unchanged.
 */
BOOST_AUTO_TEST_CASE(test_sum_sorted5) {

	vector<vector<int>> inputs{{1, 2, 3}, {}};
	const vector<int> expected{1, 2, 3};

	vector<int> totals = ::algorithm::sum_sorted<int>(inputs, [](int &acc, const int &val) {
		acc += val;
	});

	BOOST_REQUIRE(totals.size() == expected.size());
	for (size_t i = 0; i < expected.size(); i++) {
		BOOST_CHECK(totals[i] == expected[i]);
	}
}

BOOST_AUTO_TEST_SUITE_END()


================================================
FILE: tests/test_text.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <boost/test/unit_test.hpp>
#include "text/text.h"

using namespace std;

BOOST_AUTO_TEST_SUITE(test_text)

/*
 * Checks that get_full_text_words lower-cases its input, drops the '.' and ',' that follow a word
 * and keeps symbols such as "++" and "#" that are part of a word.
 *
 * Each section first requires that enough words came back, so a short result fails the test
 * instead of indexing past the end of the vector.
 */
BOOST_AUTO_TEST_CASE(get_full_text_words) {
	{
		vector<string> words = text::get_full_text_words("C++ map. is the, best thing");
		BOOST_REQUIRE(words.size() >= 6);
		BOOST_CHECK_EQUAL(words[0], "c++");
		BOOST_CHECK_EQUAL(words[1], "map");
		BOOST_CHECK_EQUAL(words[2], "is");
		BOOST_CHECK_EQUAL(words[3], "the");
		BOOST_CHECK_EQUAL(words[4], "best");
		BOOST_CHECK_EQUAL(words[5], "thing");
	}

	{
		vector<string> words = text::get_full_text_words("C# is also good.");
		BOOST_REQUIRE(words.size() >= 4);
		BOOST_CHECK_EQUAL(words[0], "c#");
		BOOST_CHECK_EQUAL(words[1], "is");
		BOOST_CHECK_EQUAL(words[2], "also");
		BOOST_CHECK_EQUAL(words[3], "good");
	}
}

/*
 * get_tokens must return one hash per word of the input, lower-cased and in input order.
 */
BOOST_AUTO_TEST_CASE(get_tokens) {
	vector<uint64_t> expected;
	for (const char *word : {"my", "name", "is", "josef", "cullhed"}) {
		expected.push_back(algorithm::hash(word));
	}

	const vector<uint64_t> tokens = text::get_tokens("My name is Josef Cullhed");

	BOOST_CHECK(tokens == expected);
}

/*
 * Punctuation after a word and repeated spaces are dropped by get_tokens, while the '+' signs
 * around "function" stay part of the token.
 */
BOOST_AUTO_TEST_CASE(get_tokens2) {
	vector<uint64_t> expected;
	for (const char *word : {"test", "ing", "the", "test", "+function+"}) {
		expected.push_back(algorithm::hash(word));
	}

	const vector<uint64_t> tokens = text::get_tokens("Test. Ing! the    test   +function+");

	BOOST_CHECK(tokens == expected);
}

/*
 * get_expanded_full_text_tokens keeps the hyphenated word as one token and additionally appends
 * the two parts on either side of the hyphen.
 */
BOOST_AUTO_TEST_CASE(get_tokens3) {
	vector<uint64_t> expected;
	for (const char *word : {"test", "ing", "the", "test", "+func-tion+", "+func", "tion+"}) {
		expected.push_back(algorithm::hash(word));
	}

	const vector<uint64_t> tokens = text::get_expanded_full_text_tokens("Test. Ing! the    test   +func-tion+");

	BOOST_CHECK(tokens == expected);
}

/*
 * get_snippets returns a short text as a single unchanged snippet and splits a long paragraph
 * into several snippets.
 */
BOOST_AUTO_TEST_CASE(get_snippets) {
	// Short text: exactly one snippet, identical to the input.
	{
		const char *short_text = "A small text that should fit in one snippet";
		const vector<string> snippets = text::get_snippets(short_text);

		BOOST_REQUIRE(snippets.size() == 1);
		BOOST_CHECK(snippets.front() == short_text);
	}
	// Long paragraph: three snippets are expected; only the count is checked.
	{
		const char *long_text = " The zlib compression library provides in-memory compression and decompression functions, including integrity checks of the uncompressed data. This version of the library supports only one compression method (deflation) but other algorithms will be added later and will have the same stream interface.  Compression can be done in a single step if the buffers are large enough (for example if an input file is mmap'ed), or can be done by repeated calls of the compression function. In the latter case, the application must provide more input and/or consume the output (providing more output space) before each call. ";
		const vector<string> snippets = text::get_snippets(long_text);

		BOOST_REQUIRE(snippets.size() == 3);
	}
}

/*
 * Splits a text that mixes punctuation, a tab and a line continuation into lower-cased words.
 *
 * The word count is a fatal requirement: the checks below index words[0..7], so continuing after
 * a size mismatch could read past the end of the vector.
 */
BOOST_AUTO_TEST_CASE(get_words_without_stopwords) {
	vector<string> words = text::get_words_without_stopwords("Hej asd!asd jag, heter! !josef. cullhed 	\
		jfoidjfoai823hr9hfhwe9f8hshgohewogiqhoih");

	BOOST_REQUIRE_EQUAL(words.size(), 8);
	BOOST_CHECK_EQUAL(words[0], "hej");
	BOOST_CHECK_EQUAL(words[1], "asd");
	BOOST_CHECK_EQUAL(words[2], "asd");
	BOOST_CHECK_EQUAL(words[3], "jag");
	BOOST_CHECK_EQUAL(words[4], "heter");
	BOOST_CHECK_EQUAL(words[5], "josef");
	BOOST_CHECK_EQUAL(words[6], "cullhed");
	BOOST_CHECK_EQUAL(words[7], "jfoidjfoai823hr9hfhwe9f8hshgohewogiqhoih");
}

/*
 * Exercises clean_word, is_clean_word and get_words_without_stopwords on words and page titles
 * that contain accepted and rejected characters.
 */
BOOST_AUTO_TEST_CASE(clean_word) {

	using word_list = vector<string>;

	// clean_word: input and the expected cleaned form.
	const struct {
		const char *raw;
		const char *cleaned;
	} clean_cases[] = {
		{"hej", "hej"},
		{"åäö", "åäö"},
		{"123", "123"},
		{"$Üç", ""},
		{"hejç", "hej"},
		{"açd", "ad"},
	};
	for (const auto &c : clean_cases) {
		BOOST_CHECK_EQUAL(text::clean_word(c.raw), c.cleaned);
	}

	// is_clean_word: accepted words first, then rejected ones.
	for (const char *word : {"hej", "åäö", "123"}) {
		BOOST_CHECK(text::is_clean_word(word));
	}
	for (const char *word : {"$Üç", "hejç", "açd"}) {
		BOOST_CHECK(!text::is_clean_word(word));
	}

	// get_words_without_stopwords keeps an accepted word as is and drops a rejected one entirely.
	for (const char *word : {"hej", "åäö", "123"}) {
		BOOST_CHECK_EQUAL(text::get_words_without_stopwords(word)[0], word);
	}
	for (const char *word : {"$Üç", "hejç", "açd"}) {
		BOOST_CHECK_EQUAL(text::get_words_without_stopwords(word).size(), 0);
	}

	// Short phrases.
	BOOST_CHECK(text::get_words_without_stopwords("hej josef") == word_list({"hej", "josef"}));
	BOOST_CHECK(text::get_words_without_stopwords("hej, josef!") == word_list({"hej", "josef"}));
	BOOST_CHECK(text::get_words_without_stopwords("hej jÜsef cullhed du är bäst") ==
		word_list({"hej", "cullhed", "du", "bäst"}));

	// Page titles; where a second argument is given, three words are expected back.
	BOOST_CHECK(text::get_words_without_stopwords("Låna! (Pengar till bilar)") ==
		word_list({"låna", "pengar", "bilar"}));
	BOOST_CHECK(text::get_words_without_stopwords("Dallas Swarner | Character | zKillboard", 3) ==
		word_list({"dallas", "swarner", "character"}));
	BOOST_CHECK(text::get_words_without_stopwords("Tapis Fleur des Champs Moutarde | Zen Dos", 3) ==
		word_list({"tapis", "fleur", "des"}));
	BOOST_CHECK(text::get_words_without_stopwords("Gina Osorno & The Dreamers", 3) ==
		word_list({"gina", "osorno", "dreamers"}));

	BOOST_CHECK(text::get_words_without_stopwords("IMG_2190 | Zhenyu (Tony) Tian") ==
		word_list({"zhenyu", "tony", "tian"}));
	BOOST_CHECK(text::get_words_without_stopwords("Tills alla dör - Diamant Salihu - Bok (9789189061842) | Bokus", 3)
		== word_list({"tills", "dör", "diamant"}));

	BOOST_CHECK(text::get_words_without_stopwords("Messages postés par Prechan • Forum • Zeste de Savoir", 3) ==
		word_list({"messages", "par", "prechan"}));
	BOOST_CHECK(text::get_words_without_stopwords("Science SARU – 紙本分格") == word_list({"science", "saru"}));
	BOOST_CHECK(text::get_words_without_stopwords("Realiteti i trishtë shqiptar përmes fotove të gazetarit gjerman që komunizmi nuk i lejoi \
		të bëheshin publike | Gazeta Malesia", 3) == word_list({"realiteti", "shqiptar", "fotove"}));
	BOOST_CHECK(text::get_words_without_stopwords("York County, VA") == word_list({"york", "county", "va"}));
	BOOST_CHECK(text::get_words_without_stopwords("HTML Sitemap 14 - zfreeti.com", 3) ==
		word_list({"html", "sitemap", "14"}));
	BOOST_CHECK(text::get_words_without_stopwords("HTML Sitemap 14 - zfreeti.com") ==
		word_list({"html", "sitemap", "14"}));
	BOOST_CHECK(text::get_words_without_stopwords("Archives.com zfreeti.com best. stream. in .the world") ==
		word_list({"best", "stream", "world"}));

}

/*
 * The sentence has nine words: "is" occurs twice, "hello" once. Frequencies are compared with the
 * tolerance declared on this test case.
 */
BOOST_AUTO_TEST_CASE(word_freq, * boost::unit_test::tolerance(0.00001)) {
	auto frequencies = text::get_word_frequency("hello my name is josef and it is good");

	const double one_ninth = 1.0/9.0;
	const double two_ninths = 2.0/9.0;

	BOOST_TEST(frequencies["hello"] == one_ninth);
	BOOST_TEST(frequencies["is"] == two_ninths);
}

BOOST_AUTO_TEST_SUITE_END()


================================================
FILE: tests/test_thread_pool.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <boost/test/unit_test.hpp>
#include "utils/thread_pool.hpp"
#include "profiler/profiler.h"

using namespace std;

BOOST_AUTO_TEST_SUITE(test_thread_pool)

/*
 * Enqueues one job per vector element on a 10-worker pool and checks that every job ran once.
 */
BOOST_AUTO_TEST_CASE(thread_pool) {
	utils::thread_pool workers(10);

	// Each job touches only its own slot, so the jobs do not share any data.
	vector<int> slots(10);
	for (int &slot : slots) {
		workers.enqueue([&slot]() {
			++slot;
		});
	}

	workers.run_all();

	// The slots start at zero, so each must now hold exactly one increment.
	for (const int slot : slots) {
		BOOST_CHECK(slot == 1);
	}

}

/*
 * Runs 24 jobs of 200ms each on 12 workers and checks that run_all finishes in about the time of
 * two rounds, i.e. that the jobs really run in parallel.
 */
BOOST_AUTO_TEST_CASE(thread_pool2) {
	utils::thread_pool workers(12);

	vector<int> slots(24);
	for (int &slot : slots) {
		workers.enqueue([&slot]() {
			std::this_thread::sleep_for(200ms);
			slot = 1;
		});
	}

	const double started = profiler::now_micro();
	workers.run_all();
	const double elapsed = profiler::now_micro() - started;

	// Two rounds of 200ms plus 10ms of slack, expressed in microseconds.
	BOOST_CHECK(elapsed < (200*2 + 10)*1000);

	for (const int slot : slots) {
		BOOST_CHECK(slot == 1);
	}

}

/*
 * Test the queue length limit. Passing a second parameter to the pool sets a maximum queue length.
 * When all workers are busy and the queue is full, the next call to enqueue waits until the queue
 * has room again.
 *
 * This is useful when you want X workers running but do not want the queue to grow without bound,
 * for example because of limited memory.
 */
BOOST_AUTO_TEST_CASE(thread_pool3) {
	// 4 workers, queue length limited to 1.
	utils::thread_pool workers(4, 1);

	vector<int> slots(4);
	int job_id = 1;
	for (int &slot : slots) {
		workers.enqueue([&slot, job_id]() {
			std::this_thread::sleep_for(200ms);
			slot = job_id;
		});
		// Give a worker time to take the job off the queue before the next one is added.
		std::this_thread::sleep_for(10ms);
		job_id++;
	}
	// At this point all 4 workers are busy.

	// A fifth job still fits in the queue, so this enqueue should return quickly.
	const double fast_begin = profiler::now_micro();
	workers.enqueue([]() {
		std::this_thread::sleep_for(200ms);
	});
	const double fast_end = profiler::now_micro();

	BOOST_CHECK(fast_end - fast_begin < 10 * 1000); // < 10 milliseconds.

	// The queue is full now, so this enqueue should wait around 200ms for a worker to free up.
	const double slow_begin = profiler::now_micro();
	workers.enqueue([]() {
		std::this_thread::sleep_for(200ms);
	});
	const double slow_end = profiler::now_micro();

	BOOST_CHECK(slow_end - slow_begin > 180 * 1000);

	std::this_thread::sleep_for(300ms);

	// The first 4 jobs should be done by now and each slot should hold its job id.
	int expected_id = 1;
	for (const int slot : slots) {
		BOOST_CHECK(slot == expected_id);
		expected_id++;
	}

	workers.run_all();
}

BOOST_AUTO_TEST_SUITE_END()


================================================
FILE: tests/test_top_k.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <boost/test/unit_test.hpp>
#include <algorithm/top_k.h>

BOOST_AUTO_TEST_SUITE(test_top_k)

// Top 2 of an ascending sequence of even length; the order of the two results is not fixed.
BOOST_AUTO_TEST_CASE(test_1) {
	const std::vector<int> res = ::algorithm::top_k<int>({1,2,3,4,5,6}, 2);

	const std::vector<int> ascending{5,6};
	const std::vector<int> descending{6,5};
	BOOST_CHECK(res == ascending || res == descending);
}

// Top 2 of an ascending sequence of odd length; the order of the two results is not fixed.
BOOST_AUTO_TEST_CASE(test_2) {
	const std::vector<int> res = ::algorithm::top_k<int>({1,2,3,4,5,6,7}, 2);

	const std::vector<int> ascending{6,7};
	const std::vector<int> descending{7,6};
	BOOST_CHECK(res == ascending || res == descending);
}

// Empty input gives an empty result, whatever k is.
BOOST_AUTO_TEST_CASE(test_3) {
	const std::vector<int> res = ::algorithm::top_k<int>({}, 2);

	BOOST_CHECK(res.empty());
}

// Top 2 of an unsorted three-element input; the order of the two results is not fixed.
BOOST_AUTO_TEST_CASE(test_4) {
	const std::vector<int> res = ::algorithm::top_k<int>({2,3,1}, 2);

	const std::vector<int> ascending{2,3};
	const std::vector<int> descending{3,2};
	BOOST_CHECK(res == ascending || res == descending);
}

// Top 3 of an input with duplicates: the three largest values are 8, 7 and 5, so every result
// must be at least 5.
BOOST_AUTO_TEST_CASE(test_5) {
	const std::vector<int> res = ::algorithm::top_k<int>({7,5,3,4,4,8,4,1,1,3,4}, 3);

	// Without this an empty or short result would pass the element check below vacuously.
	BOOST_CHECK(res.size() == 3);

	bool is_correct = true;
	for (int i : res) {
		if (i < 5) is_correct = false;
	}
	BOOST_CHECK(is_correct);
}

// Top 6 of an input with duplicates: the six largest values are 8, 7, 5 and three of the four 4s,
// so every result must be at least 4.
BOOST_AUTO_TEST_CASE(test_6) {
	const std::vector<int> res = ::algorithm::top_k<int>({7,5,3,4,4,8,4,1,1,3,4}, 6);

	// Without this an empty or short result would pass the element check below vacuously.
	BOOST_CHECK(res.size() == 6);

	bool is_correct = true;
	for (int i : res) {
		if (i < 4) is_correct = false;
	}
	BOOST_CHECK(is_correct);
}

// k = 1 must return only the maximum.
BOOST_AUTO_TEST_CASE(test_7) {
	const std::vector<int> res = ::algorithm::top_k<int>({1,3,0,1,4,3,9,2,0,3}, 1);

	const std::vector<int> expected{9};
	BOOST_CHECK(res == expected);
}

// Custom comparator (a > b): the three smallest values 0, 0 and 1 are expected, so no result may
// be greater than 1.
BOOST_AUTO_TEST_CASE(test_8) {
	const std::vector<int> res = ::algorithm::top_k<int>({1,3,0,1,4,3,9,2,0,3}, 3, [](const int &a, const int &b) {
		return a > b;
	});

	// Without this an empty or short result would pass the element check below vacuously.
	BOOST_CHECK(res.size() == 3);

	bool is_correct = true;
	for (int i : res) {
		if (i > 1) is_correct = false;
	}
	BOOST_CHECK(is_correct);
}

BOOST_AUTO_TEST_SUITE_END()


================================================
FILE: tests/test_unicode.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <boost/test/unit_test.hpp>
#include "parser/unicode.h"

BOOST_AUTO_TEST_SUITE(unicode)

/*
 * encode must leave ASCII, Latin and CJK text unchanged, and its output for a text containing
 * replacement characters must pass is_valid.
 */
BOOST_AUTO_TEST_CASE(unicode) {
	// Round trip: the encoded text must be equal to the input.
	const char *unchanged_texts[] = {
		"hej jag heter josef",
		"hej jag heter josef och jag tillåter utf8 åäö chars$€",
		"是美国民主党政治家，于19世纪下半叶担",
	};
	for (const char *sample : unchanged_texts) {
		BOOST_CHECK_EQUAL(parser::unicode::encode(sample), sample);
	}

	// The literal is kept exactly as is, including the line continuation inside the string.
	BOOST_CHECK(parser::unicode::is_valid(parser::unicode::encode("L�gg i varukorg Om produkten Specifikation Anv�ndning Våra bönor är \
		rika på protein, mineraler och fibrer. Smaken är söt och konsistensen le")));
}

BOOST_AUTO_TEST_SUITE_END()


================================================
FILE: tests/test_url.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <boost/test/unit_test.hpp>
#include "URL.h"

using namespace std;

BOOST_AUTO_TEST_SUITE(test_url)

// Round-trips a URL through str() and toggles its scheme and "www." prefix.
BOOST_AUTO_TEST_CASE(basic) {
	const char *const with_www = "https://www.facebook.com/test.html?key=value";
	const char *const without_www = "http://facebook.com/test.html?key=value";

	// A freshly parsed URL serializes back to its input.
	BOOST_CHECK_EQUAL(URL(with_www).str(), with_www);

	URL target(with_www);

	// Switch to http and drop the www prefix.
	target.set_scheme("http");
	target.set_www(false);
	BOOST_CHECK_EQUAL(target.str(), without_www);

	// Restoring both settings yields the original string again.
	target.set_scheme("https");
	target.set_www(true);
	BOOST_CHECK_EQUAL(target.str(), with_www);
}

// Verifies the individual components URL exposes after parsing.
BOOST_AUTO_TEST_CASE(url_parsing) {

	// Full URL with www prefix, a path and a single query parameter.
	{
		const char *const raw = "https://www.facebook.com/test.html?key=value";
		URL parsed(raw);

		BOOST_CHECK_EQUAL(parsed.str(), raw);
		BOOST_CHECK_EQUAL(parsed.domain_without_tld(), "facebook");
		// host() is expected without the "www." prefix.
		BOOST_CHECK_EQUAL(parsed.host(), "facebook.com");
		BOOST_CHECK_EQUAL(parsed.host_reverse(), "com.facebook");
		BOOST_CHECK_EQUAL(parsed.scheme(), "https");
		BOOST_CHECK_EQUAL(parsed.path(), "/test.html");
		BOOST_CHECK_EQUAL(parsed.path_with_query(), "/test.html?key=value");
		// size() is expected to equal the length of the full URL string.
		BOOST_CHECK_EQUAL(parsed.size(), strlen(raw));
		BOOST_CHECK_EQUAL(parsed.has_https(), true);
		BOOST_CHECK_EQUAL(parsed.has_www(), true);

		auto params = parsed.query();
		BOOST_CHECK_EQUAL(params.size(), 1);
		BOOST_CHECK_EQUAL(params["key"], "value");
	}

	// Plain http URL without www; a trailing slash is reported as the path "/".
	{
		URL bare("http://example.com/");
		BOOST_CHECK_EQUAL(bare.has_https(), false);
		BOOST_CHECK_EQUAL(bare.has_www(), false);
		BOOST_CHECK_EQUAL(bare.path(), "/");
	}

	// A URL with no path at all also reports "/".
	BOOST_CHECK_EQUAL(URL("http://example.com").path(), "/");
}

// Parses a URL that has a deep path and no query string.
BOOST_AUTO_TEST_CASE(url_parsing2) {

	const char *const file_path = "/joscul/alexandria/blob/main/tests/File.h";
	URL repo_file("https://github.com/joscul/alexandria/blob/main/tests/File.h");

	BOOST_CHECK_EQUAL(repo_file.domain_without_tld(), "github");
	BOOST_CHECK_EQUAL(repo_file.host(), "github.com");
	BOOST_CHECK_EQUAL(repo_file.scheme(), "https");

	// Without a query string, path() and path_with_query() are expected to match.
	BOOST_CHECK_EQUAL(repo_file.path(), file_path);
	BOOST_CHECK_EQUAL(repo_file.path_with_query(), file_path);

	// No query string means no parsed parameters.
	BOOST_CHECK_EQUAL(repo_file.query().size(), 0);
}

// Checks which URL components influence the value returned by URL::hash().
BOOST_AUTO_TEST_CASE(hash) {

	const size_t hash1 = URL("https://github.com/joscul/alexandria/blob/main/tests/File.h").hash();
	const size_t hash2 = URL("https://github.com/joscul/alexandria/blob/main/tests/File.h?query=param").hash();
	const size_t hash3 = URL("https://github.com/joscul/alexandria/blob/main/tests/File.h?hej=hopp").hash();
	const size_t hash4 = URL("https://www.github.com/joscul/alexandria/blob/main/tests/File.h?hej=hopp").hash();
	const size_t hash5 = URL("http://github.com/joscul/alexandria/blob/main/tests/File.h?hej=hopp").hash();

	// Adding or changing the query string must change the hash.
	// The _NE/_EQUAL macros print both hash values when a check fails.
	BOOST_CHECK_NE(hash1, hash2);
	BOOST_CHECK_NE(hash2, hash3);

	// A "www." prefix (hash4) or a different scheme (hash5) must not change it.
	BOOST_CHECK_EQUAL(hash3, hash4);
	BOOST_CHECK_EQUAL(hash4, hash5);
}

// Checks percent-decoding of the query values returned by URL::query().
BOOST_AUTO_TEST_CASE(unescape) {

	// Parses url_str and returns the decoded value stored under key.
	const auto query_value = [](const char *url_str, const char *key) {
		map<string, string> query = URL(url_str).query();
		return query[key];
	};

	// "%20" decodes to a space.
	BOOST_CHECK_EQUAL(query_value("https://github.com/?q=test%20test", "q"), "test test");

	// Only the two hex digits directly after '%' belong to the escape.
	BOOST_CHECK_EQUAL(query_value("https://github.com/?q=test%2020", "q"), "test 20");

	// A '%' that is not followed by two hex digits is kept verbatim, also at the end of a value.
	BOOST_CHECK_EQUAL(query_value("https://github.com/search?q=targumical&cp=0&hl=en-US&pq=%targumical%&sourceid=chrome&ie=UTF-8", "pq"),
		"%targumical%");

	// Mixed input: the leading "%%c3" is expected as "%c3", escapes with upper or lower case
	// hex digits decode to "åäö", and the invalid "%0G" is left as it is.
	BOOST_CHECK_EQUAL(query_value("https://github.com/search?q=stress%%c3%C3%a5%C3%A4%c3%b6%0G", "q"), "stress%c3åäö%0G");

	// Double encoding: "%25" decodes to '%' and the result is not decoded a second time.
	BOOST_CHECK_EQUAL(query_value("https://github.com/search?q=%25C3%25A5%25C3%25A4%25C3%25B6", "q"), "%C3%A5%C3%A4%C3%B6");

	// Malformed escapes ("%jo" and a truncated trailing "%0") pass through unchanged.
	BOOST_CHECK_EQUAL(query_value("https://github.com/search?q=%josef%0", "q"), "%josef%0");

}

// Checks URL::host_top_domain() for single- and two-label suffixes, with and
// without subdomains, and for degenerate input.
BOOST_AUTO_TEST_CASE(host_top_domain) {

	// Hosts that consist only of a name and its suffix are returned whole.
	BOOST_CHECK_EQUAL(URL("https://test.uk").host_top_domain(), "test.uk");
	BOOST_CHECK_EQUAL(URL("https://testing.com.au").host_top_domain(), "testing.com.au");

	// A subdomain in front of a two-label suffix is dropped.
	BOOST_CHECK_EQUAL(URL("https://subdomain.testing.com.au").host_top_domain(), "testing.com.au");

	// Single-label suffix, without and with a subdomain.
	BOOST_CHECK_EQUAL(URL("https://github.com/").host_top_domain(), "github.com");
	BOOST_CHECK_EQUAL(URL("https://test.github.com/").host_top_domain(), "github.com");

	// "co.uk" is treated as the suffix, without and with a subdomain.
	BOOST_CHECK_EQUAL(URL("https://bbc.co.uk/").host_top_domain(), "bbc.co.uk");
	BOOST_CHECK_EQUAL(URL("https://testing.bbc.co.uk/").host_top_domain(), "bbc.co.uk");

	// Degenerate input yields an empty result.
	BOOST_CHECK_EQUAL(URL(".").host_top_domain(), "");
	BOOST_CHECK_EQUAL(URL("").host_top_domain(), "");

}

BOOST_AUTO_TEST_SUITE_END()


================================================
FILE: tests/test_url_record.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <boost/test/unit_test.hpp>
#include "indexer/url_record.h"

using namespace std;

BOOST_AUTO_TEST_SUITE(test_url_record)

// Checks that url_length() reads back the value most recently stored on a url_record.
BOOST_AUTO_TEST_CASE(basic) {

	indexer::url_record rec(123ull);

	// Store each length in turn (ending with zero) and read it straight back.
	const int lengths[] = {442, 4, 0};
	for (const int length : lengths) {
		rec.url_length(length);
		BOOST_CHECK_EQUAL(rec.url_length(), length);
	}

}

BOOST_AUTO_TEST_SUITE_END()