Showing preview only (954K chars total). Download the full file or copy to clipboard to get everything.
Repository: alexandria-org/alexandria
Branch: main
Commit: 129e162e8068
Files: 234
Total size: 891.5 KB
Directory structure:
gitextract_46gecs8w/
├── .gdbinit
├── .gitignore
├── CMakeLists.txt
├── Dockerfile
├── LICENSE
├── README.md
├── cmake/
│ └── Findfcgi.cmake
├── config.conf
├── documentation/
│ ├── alexandria.md
│ ├── api_response_format.md
│ ├── caching.md
│ ├── coding_rules.md
│ ├── configure_local_nginx.md
│ ├── full_text_indexes.md
│ ├── ideas.md
│ ├── index_file_format.md
│ ├── indexer.md
│ ├── installing_nodes.md
│ ├── performance_journal.md
│ ├── search_result_ranking.md
│ └── statues_swe.tex
├── scripts/
│ ├── bootstrap_node_2drives.sh
│ ├── build-deps.sh
│ ├── clean.sh
│ ├── download-deps.sh
│ ├── download-test-data.sh
│ ├── find_missing_files_in_batch.sh
│ ├── init-docker.sh
│ ├── install-deps.sh
│ ├── packager.sh
│ ├── prepare-output-dirs.sh
│ ├── truncate.sh
│ └── update.sh
├── src/
│ ├── URL.cpp
│ ├── URL.h
│ ├── alexandria.cpp
│ ├── algorithm/
│ │ ├── algorithm.cpp
│ │ ├── algorithm.h
│ │ ├── bloom_filter.cpp
│ │ ├── bloom_filter.h
│ │ ├── hash.cpp
│ │ ├── hash.h
│ │ ├── hyper_ball.h
│ │ ├── hyper_log_log.cpp
│ │ ├── hyper_log_log.h
│ │ ├── intersection.cpp
│ │ ├── intersection.h
│ │ ├── sort.cpp
│ │ ├── sort.h
│ │ ├── sum_sorted.h
│ │ └── top_k.h
│ ├── api/
│ │ ├── api_response.cpp
│ │ ├── api_response.h
│ │ ├── result_with_snippet.cpp
│ │ └── result_with_snippet.h
│ ├── cluster/
│ │ ├── cluster.h
│ │ ├── document.cpp
│ │ └── document.h
│ ├── common/
│ │ ├── ThreadPool.h
│ │ ├── datetime.cpp
│ │ ├── datetime.h
│ │ ├── dictionary.cpp
│ │ ├── dictionary.h
│ │ ├── dictionary_row.cpp
│ │ ├── dictionary_row.h
│ │ ├── simple_thread_pool.hpp
│ │ ├── system.cpp
│ │ └── system.h
│ ├── config.cpp
│ ├── config.h
│ ├── debug.cpp
│ ├── debug.h
│ ├── domain_stats/
│ │ ├── domain_stats.cpp
│ │ └── domain_stats.h
│ ├── downloader/
│ │ ├── merge_downloader.cpp
│ │ ├── merge_downloader.h
│ │ ├── warc_downloader.cpp
│ │ └── warc_downloader.h
│ ├── file/
│ │ ├── archive.cpp
│ │ ├── archive.h
│ │ ├── file.cpp
│ │ ├── file.h
│ │ ├── gz_tsv_file.cpp
│ │ ├── gz_tsv_file.h
│ │ ├── tsv_file.cpp
│ │ ├── tsv_file.h
│ │ ├── tsv_file_remote.cpp
│ │ ├── tsv_file_remote.h
│ │ ├── tsv_row.cpp
│ │ └── tsv_row.h
│ ├── full_text/
│ │ ├── domain_link_record.h
│ │ ├── link_record.h
│ │ ├── record.h
│ │ ├── result_set.h
│ │ └── search_metric.h
│ ├── hash_table2/
│ │ ├── builder.cpp
│ │ ├── builder.h
│ │ ├── hash_table.cpp
│ │ ├── hash_table.h
│ │ ├── hash_table_shard.cpp
│ │ ├── hash_table_shard.h
│ │ ├── hash_table_shard_base.h
│ │ ├── hash_table_shard_builder.cpp
│ │ └── hash_table_shard_builder.h
│ ├── hash_table_helper/
│ │ ├── hash_table_helper.cpp
│ │ └── hash_table_helper.h
│ ├── http/
│ │ ├── request.cpp
│ │ ├── request.h
│ │ ├── response.h
│ │ ├── server.cpp
│ │ └── server.h
│ ├── indexer/
│ │ ├── basic_index.h
│ │ ├── basic_index_builder.h
│ │ ├── console.cpp
│ │ ├── console.h
│ │ ├── counted_record.h
│ │ ├── domain_link_record.h
│ │ ├── domain_record.h
│ │ ├── generic_record.h
│ │ ├── index.h
│ │ ├── index_base.h
│ │ ├── index_builder.h
│ │ ├── index_manager.cpp
│ │ ├── index_manager.h
│ │ ├── index_reader.cpp
│ │ ├── index_reader.h
│ │ ├── index_utils.cpp
│ │ ├── index_utils.h
│ │ ├── link_record.h
│ │ ├── merger.cpp
│ │ ├── merger.h
│ │ ├── regular_index_builder.h
│ │ ├── return_record.h
│ │ ├── score_builder.cpp
│ │ ├── score_builder.h
│ │ ├── sharded.h
│ │ ├── sharded_builder.h
│ │ ├── sharded_index.h
│ │ ├── sharded_index_builder.h
│ │ ├── url_record.h
│ │ └── value_record.h
│ ├── indexer.cpp
│ ├── logger/
│ │ ├── logger.cpp
│ │ └── logger.h
│ ├── memory/
│ │ ├── debugger.cpp
│ │ ├── debugger.h
│ │ ├── memory.cpp
│ │ ├── memory.h
│ │ └── overload.cpp
│ ├── parser/
│ │ ├── cc_parser.cpp
│ │ ├── cc_parser.h
│ │ ├── entities.cpp
│ │ ├── entities.h
│ │ ├── html_link.cpp
│ │ ├── html_link.h
│ │ ├── html_parser.cpp
│ │ ├── html_parser.h
│ │ ├── parser.cpp
│ │ ├── parser.h
│ │ ├── unicode.cpp
│ │ └── unicode.h
│ ├── profiler/
│ │ ├── profiler.cpp
│ │ └── profiler.h
│ ├── scraper/
│ │ ├── scraper.cpp
│ │ ├── scraper.h
│ │ ├── scraper_store.cpp
│ │ └── scraper_store.h
│ ├── scraper.cpp
│ ├── search_engine/
│ │ ├── search_allocation.h
│ │ ├── search_engine.cpp
│ │ └── search_engine.h
│ ├── server/
│ │ ├── search_server.cpp
│ │ ├── search_server.h
│ │ ├── url_server.cpp
│ │ └── url_server.h
│ ├── server.cpp
│ ├── stats/
│ │ └── stats.h
│ ├── text/
│ │ ├── stopwords.cpp
│ │ ├── stopwords.h
│ │ ├── text.cpp
│ │ └── text.h
│ ├── tools/
│ │ ├── calculate_harmonic.cpp
│ │ ├── calculate_harmonic.h
│ │ ├── counter.cpp
│ │ ├── counter.h
│ │ ├── find_links.cpp
│ │ ├── find_links.h
│ │ ├── generate_url_lists.cpp
│ │ ├── generate_url_lists.h
│ │ ├── splitter.cpp
│ │ └── splitter.h
│ ├── transfer/
│ │ ├── transfer.cpp
│ │ └── transfer.h
│ ├── url_link/
│ │ ├── link.cpp
│ │ └── link.h
│ ├── utils/
│ │ ├── id_allocator.h
│ │ ├── thread_pool.cpp
│ │ ├── thread_pool.hpp
│ │ └── thread_pool_arg.h
│ └── warc/
│ ├── tlds.h
│ ├── warc.cpp
│ └── warc.h
└── tests/
├── main.cpp
├── test_algorithm.cpp
├── test_bloom_filter.cpp
├── test_cc_parser.cpp
├── test_config.conf
├── test_config2.conf
├── test_configuration.cpp
├── test_counted_index_builder.cpp
├── test_datetime.h
├── test_file.cpp
├── test_hash.cpp
├── test_hash_table.cpp
├── test_html_parser.cpp
├── test_hyper_ball.cpp
├── test_hyper_log_log.cpp
├── test_index_builder.cpp
├── test_index_iteration.cpp
├── test_index_reader.cpp
├── test_logger.cpp
├── test_memory.cpp
├── test_n_gram.cpp
├── test_robot_parser.cpp
├── test_scraper.cpp
├── test_sharded_index_builder.cpp
├── test_sort.cpp
├── test_sum_sorted.cpp
├── test_text.cpp
├── test_thread_pool.cpp
├── test_top_k.cpp
├── test_unicode.cpp
├── test_url.cpp
└── test_url_record.cpp
================================================
FILE CONTENTS
================================================
================================================
FILE: .gdbinit
================================================
set history save on
================================================
FILE: .gitignore
================================================
deps/*
tmp/*
src/*.o
tests/*.o
build/*
documentation/*.aux
documentation/*.log
documentation/statues_swe.pdf
.DS_Store
config/config.h
response.txt
cc_parser.zip
cc_parser
cc_indexer.zip
cc_indexer
cc_api.zip
cc_api
cc_full_text.zip
cc_full_text
run_tests
CMakeCache.txt
CMakeFiles
CMakeScripts
Makefile
cmake_install.cmake
warc.paths
.vscode
.gdb_history
*~
*.swp
*.swo
================================================
FILE: CMakeLists.txt
================================================
cmake_minimum_required(VERSION 3.5)

# Default to an optimized build, but respect -DCMAKE_BUILD_TYPE=... given on
# the command line. The previous unconditional set() silently overrode it,
# which made the "cmake .. -DCMAKE_BUILD_TYPE=Debug" instructions in README.md
# a no-op.
if(NOT CMAKE_BUILD_TYPE)
	set(CMAKE_BUILD_TYPE Release)
endif()

# Compilers must be selected before project() for CMake to honor them.
set(CMAKE_C_COMPILER /usr/bin/gcc-10)
set(CMAKE_CXX_COMPILER /usr/bin/g++-10)

set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_FLAGS_RELEASE "-O3")
set(CMAKE_CXX_FLAGS_DEBUG "-g")
set(THREADS_PREFER_PTHREAD_FLAG ON)

project(alexandria LANGUAGES CXX)

add_definitions(-Wfatal-errors)

# Local find modules (cmake/Findfcgi.cmake).
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake")

add_subdirectory("deps/abseil-cpp")

find_package(roaring REQUIRED)
find_package(Threads REQUIRED)
find_package(CURL REQUIRED)
find_package(Boost REQUIRED COMPONENTS system iostreams filesystem unit_test_framework)
# Both of these are linked unconditionally below (ZLIB::ZLIB, FCGI_LIBRARY),
# so fail early with a clear message if they are missing.
find_package(ZLIB REQUIRED)
find_package(fcgi REQUIRED)

include_directories(src/)
include_directories(deps/)
include_directories(tests/)
# Application sources shared by every executable target (server, scraper,
# indexer, alexandria, run_tests). Only the entry-point .cpp differs per target.
set(SRC_CLASSES
"src/url_link/link.cpp"
"src/api/result_with_snippet.cpp"
"src/api/api_response.cpp"
"src/file/file.cpp"
"src/file/archive.cpp"
"src/file/tsv_file.cpp"
"src/file/gz_tsv_file.cpp"
"src/file/tsv_file_remote.cpp"
"src/file/tsv_row.cpp"
"src/transfer/transfer.cpp"
"src/hash_table2/hash_table.cpp"
"src/hash_table2/hash_table_shard.cpp"
"src/hash_table2/hash_table_shard_builder.cpp"
"src/hash_table2/builder.cpp"
"src/hash_table_helper/hash_table_helper.cpp"
"src/parser/parser.cpp"
"src/parser/entities.cpp"
"src/parser/html_link.cpp"
"src/parser/html_parser.cpp"
"src/parser/unicode.cpp"
"src/parser/cc_parser.cpp"
"src/downloader/warc_downloader.cpp"
"src/downloader/merge_downloader.cpp"
"src/URL.cpp"
"src/warc/warc.cpp"
"src/profiler/profiler.cpp"
"src/logger/logger.cpp"
"src/utils/thread_pool.cpp"
"src/memory/memory.cpp"
"src/memory/debugger.cpp"
"src/config.cpp"
"src/algorithm/algorithm.cpp"
"src/algorithm/intersection.cpp"
"src/algorithm/sort.cpp"
"src/algorithm/hash.cpp"
"src/algorithm/hyper_log_log.cpp"
"src/algorithm/bloom_filter.cpp"
"src/tools/splitter.cpp"
"src/tools/find_links.cpp"
"src/tools/counter.cpp"
"src/tools/calculate_harmonic.cpp"
"src/tools/generate_url_lists.cpp"
"src/cluster/document.cpp"
"src/scraper/scraper.cpp"
"src/scraper/scraper_store.cpp"
"src/indexer/index_manager.cpp"
"src/indexer/console.cpp"
"src/indexer/merger.cpp"
"src/indexer/score_builder.cpp"
"src/indexer/index_reader.cpp"
"src/indexer/index_utils.cpp"
"src/server/search_server.cpp"
"src/server/url_server.cpp"
"src/http/server.cpp"
"src/http/request.cpp"
"src/domain_stats/domain_stats.cpp"
"src/debug.cpp"
"deps/robots.cc"
)
# Low-level utility sources (src/common, src/text) also compiled into every target.
set(SRC_COMMON
"src/common/dictionary.cpp"
"src/common/system.cpp"
"src/common/datetime.cpp"
"src/common/dictionary_row.cpp"
"src/text/stopwords.cpp"
"src/text/text.cpp"
)
# Test-suite sources; compiled only into the run_tests executable below.
set(SRC_TESTS
"tests/test_hyper_log_log.cpp"
"tests/test_memory.cpp"
"tests/test_algorithm.cpp"
"tests/test_bloom_filter.cpp"
"tests/test_cc_parser.cpp"
"tests/test_configuration.cpp"
"tests/test_counted_index_builder.cpp"
# NOTE(review): this is a header, not a translation unit; harmless in a source
# list, but confirm it was not meant to be test_datetime.cpp.
"tests/test_datetime.h"
"tests/test_file.cpp"
"tests/test_hash.cpp"
"tests/test_hash_table.cpp"
"tests/test_html_parser.cpp"
"tests/test_hyper_ball.cpp"
"tests/test_index_builder.cpp"
"tests/test_index_iteration.cpp"
"tests/test_index_reader.cpp"
"tests/test_logger.cpp"
"tests/test_n_gram.cpp"
"tests/test_robot_parser.cpp"
"tests/test_scraper.cpp"
"tests/test_sharded_index_builder.cpp"
"tests/test_sort.cpp"
"tests/test_sum_sorted.cpp"
"tests/test_text.cpp"
"tests/test_thread_pool.cpp"
"tests/test_top_k.cpp"
"tests/test_unicode.cpp"
"tests/test_url.cpp"
"tests/test_url_record.cpp"
# This overloads the new/delete operators to keep track of memory, slows things down a lot.
"src/memory/overload.cpp"
)
# Executable targets. Every target compiles SRC_CLASSES + SRC_COMMON; only the
# entry-point file differs, and run_tests additionally compiles the test suite.
add_executable(run_tests
	"tests/main.cpp"
	${SRC_CLASSES}
	${SRC_COMMON}
	${SRC_TESTS}
)
add_executable(server
	"src/server.cpp"
	${SRC_CLASSES}
	${SRC_COMMON}
)
add_executable(scraper
	"src/scraper.cpp"
	${SRC_CLASSES}
	${SRC_COMMON}
)
add_executable(indexer
	"src/indexer.cpp"
	${SRC_CLASSES}
	${SRC_COMMON}
)
add_executable(alexandria
	"src/alexandria.cpp"
	${SRC_CLASSES}
	${SRC_COMMON}
)

# Test-only compile definitions: reduced shard counts and a local file server
# so the suite runs against the pre-downloaded test data (see README.md).
target_compile_definitions(run_tests PUBLIC IS_TEST)
target_compile_definitions(run_tests PUBLIC FT_NUM_SHARDS=16)
target_compile_definitions(run_tests PUBLIC HT_NUM_SHARDS=16)
target_compile_definitions(run_tests PUBLIC FILE_SERVER="http://127.0.0.1")
target_compile_definitions(run_tests PUBLIC COMPILE_WITH_LINK_INDEX)

# Shared link dependencies. Previously this exact list was copy-pasted for all
# five targets, which is easy to let drift out of sync; define it once and
# apply it (together with the common warning flags) in a single loop.
set(COMMON_LINK_LIBRARIES
	${FCGI_LIBRARY}
	${FCGI_LIBRARYCPP}
	${CURL_LIBRARIES}
	${Boost_LIBRARIES}
	ZLIB::ZLIB
	Threads::Threads
	absl::strings
	absl::numeric
	roaring::roaring
)

foreach(target run_tests server scraper indexer alexandria)
	target_compile_options(${target} PUBLIC -Wall -Werror)
	target_link_libraries(${target} PUBLIC ${COMMON_LINK_LIBRARIES})
endforeach()
================================================
FILE: Dockerfile
================================================
# syntax=docker/dockerfile:1
# Development/build image for Alexandria: installs the gcc-10/g++-10 toolchain,
# cmake, and the curl/ssl/boost/fcgi dev packages the build links against,
# plus nginx and spawn-fcgi for serving test data locally (see README.md).
# NOTE(review): ubuntu:latest is a moving target, so image builds are not
# reproducible — consider pinning a release (README.md says the manual build
# was tested on Ubuntu 20.04).
FROM ubuntu:latest
# Suppress interactive prompts from apt during the image build only
# (ARG, not ENV, so it does not leak into the running container).
ARG DEBIAN_FRONTEND=noninteractive
RUN apt-get update && apt-get install -y zip make cmake gcc gcc-10 g++ g++-10 libcurl4-openssl-dev libssl-dev libcrypto++-dev libboost-iostreams-dev libboost-filesystem-dev libboost-system-dev libboost-test-dev libfcgi-dev spawn-fcgi nginx vim wget git curl
================================================
FILE: LICENSE
================================================
MIT License
Alexandria.org
Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
================================================
FILE: README.md
================================================
# Alexandria.org
1. [Coding Rules](/documentation/coding_rules.md)
2. [Full text indexes](/documentation/full_text_indexes.md)
3. [Hash table](/documentation/hash_table.md)
## Build instructions with docker
1. Checkout repo
WINDOWS USERS: You need to run 'git config --global core.autocrlf false' before checking out the repository
```
git clone git@github.com:alexandria-org/alexandria.git
```
2. Build docker image
```
docker build . -t alexandria
```
3. Run container
```
docker container run --name alexandria -v ${PWD}:/alexandria -it -d alexandria
```
4. Attach to container.
```
docker exec -it alexandria /bin/bash
```
5. Navigate to directory
```
cd /alexandria
```
6. Initialize docker
```
scripts/init-docker.sh
```
7. Configure with cmake
```
mkdir build; cd build; cmake ..
```
8. Build all
```
make -j4
```
9. Run test suite
```
./run_tests
```
## How to build manually (not recommended)
1. Configure the system (Tested on Ubuntu 20.04)
```
# Will alter your system and install dependencies with apt.
./scripts/install-deps.sh
# Will download and build zlib, aws-lambda-cpp and aws-sdk-cpp will only alter the local directory.
./scripts/build-deps.sh
```
2. Build with cmake
```
mkdir build
cd build
cmake .. -DCMAKE_BUILD_TYPE=Debug
or
cmake .. -DCMAKE_BUILD_TYPE=Release
make -j24
```
3. Download test data to local server.
To run the test suite you need to install nginx and pre-download all the data: [Configure local nginx test data server](/documentation/configure_local_nginx.md)
4. Create output directories. Note: this will create a number of directories under /mnt, so make sure you don't have anything there.
```
./scripts/prepare-output-dirs.sh
```
5. Run the test suite
```
cd build
make run_tests -j24
./run_tests
```
## Notes
On nodes with spinning disks we should turn off energy saving:
```
hdparm -B 255 /dev/sda
```
## Debugging notes
### Debugging scraper with gdb:
By default, gdb catches SIGPIPE in the debugged process and pauses it. However, some programs deliberately ignore SIGPIPE, so gdb's default behaviour is not desired when debugging them. To stop gdb from pausing on SIGPIPE, use the following command in gdb:
```handle SIGPIPE nostop noprint pass```
================================================
FILE: cmake/Findfcgi.cmake
================================================
# CMake module to search for FastCGI headers and libraries.
#
# If found, FCGI_FOUND is set to TRUE and the following variables are set:
#  FCGI_INCLUDE_DIR - directory containing fcgio.h
#  FCGI_LIBRARY     - the FastCGI C library
#  FCGI_LIBRARYCPP  - the FastCGI C++ library (fcgi++)
FIND_PATH(FCGI_INCLUDE_DIR
	fcgio.h
	PATHS
	/usr/include
	/usr/local/include
	/usr/include/fastcgi
	"$ENV{LIB_DIR}/include"
	$ENV{INCLUDE}
)
FIND_LIBRARY(FCGI_LIBRARY NAMES fcgi libfcgi PATHS
	/usr/local/lib
	/usr/lib
	"$ENV{LIB_DIR}/lib"
	"$ENV{LIB}"
)
# Prefer the plain library name "fcgi++" (find_library expects names without
# the "lib" prefix / ".so" suffix, so this also works with non-ELF suffixes);
# the old hard-coded "libfcgi++.so" is kept as a fallback.
FIND_LIBRARY(FCGI_LIBRARYCPP NAMES fcgi++ libfcgi++.so PATHS
	/usr/local/lib
	/usr/lib
	"$ENV{LIB_DIR}/lib"
	"$ENV{LIB}"
)
# NOTE(review): FCGI_FOUND intentionally(?) ignores FCGI_LIBRARYCPP — confirm
# whether the C++ library should also be required here.
IF (FCGI_INCLUDE_DIR AND FCGI_LIBRARY)
	SET(FCGI_FOUND TRUE)
ENDIF (FCGI_INCLUDE_DIR AND FCGI_LIBRARY)
IF (FCGI_FOUND)
	IF (NOT FCGI_FIND_QUIETLY)
		MESSAGE(STATUS "Found FCGI: ${FCGI_LIBRARY}")
		MESSAGE(STATUS "Found FCGI: ${FCGI_LIBRARYCPP}")
	ENDIF (NOT FCGI_FIND_QUIETLY)
ELSE (FCGI_FOUND)
	IF (FCGI_FIND_REQUIRED)
		MESSAGE(FATAL_ERROR "Could not find FCGI")
	ENDIF (FCGI_FIND_REQUIRED)
ENDIF (FCGI_FOUND)
================================================
FILE: config.conf
================================================
# Cluster config
nodes_in_cluster = 3
node_id = 0
# Indexer config
batches[] = ALEXANDRIA-MANUAL-01
batches[] = CC-MAIN-2021-25
batches[] = CC-MAIN-2021-31
link_batches[] = CC-MAIN-2021-31
link_batches[] = CC-MAIN-2021-25
link_batches[] = CC-MAIN-2021-21
link_batches[] = CC-MAIN-2021-17
link_batches[] = CC-MAIN-2021-10
link_batches[] = CC-MAIN-2021-04
link_batches[] = CC-MAIN-2020-50
link_batches[] = CC-MAIN-2020-45
# Server config
worker_count = 8
query_max_words = 10 # Maximum number of words used in query.
query_max_len = 200
deduplicate_domain_count = 5
pre_result_limit = 200000
result_limit = 1000
# Full text config
ft_max_sections = 4
ft_max_results_per_section = 2000000
================================================
FILE: documentation/alexandria.md
================================================
Usage: ./alexandria [OPTIONS]...
## Options
**--downloader [commoncrawl-batch] [limit] [offset]**
Downloads files from the given commoncrawl batch. Limit and offset arguments are used for downloading a subset of the files. Example
```
./alexandria --downloader CC-MAIN-2022-27 2500 0
```
Will download the first 2500 files from CC-MAIN-2022-27 and upload them to the 'upload' host. See config documentation.
**--downloader-merge**
Merges downloaded files. This should run on the upload host to merge the different downloaded batches into our hash table.
**--hash-table-url [URL]**
Searches the local hash table called 'all_urls' for the given URL.
**--hash-table-url-hash [URL-hash]**
Searches the local hash table called 'all_urls' for the given URL-hash.
**--hash-table-count**
Counts all items in local hash table called 'all_urls'.
**--hash-table-find-all [HOST]**
Searches the local hash table called 'all_urls' for urls from specified host. This takes several days for large hash table.
**--hash-table-count [HOST]**
Estimated count of host from hash table by only counting one shard and multiply by number of shards.
**--hash-table-optimize-shard [SHARD]**
Optimizes shard for local hash table called 'all_urls'.
**--internal-harmonic**
Run the whole internal links harmonic calculator. Should run on 'upload' host.
================================================
FILE: documentation/api_response_format.md
================================================
# Api Response Format
This is a description of the endpoints available on a node.
### Perform search
```
curl http://node0002.alexandria.org/?q=the%20beatles
{
"status": "success",
"time_ms": 35.876,
"total_found": 245436,
"total_url_links_found": 4092,
"total_domain_links_found": 4092,
"links_handled": 674,
"link_domain_matches": 18059,
"link_url_matches": 589,
"results": [{
"url": "https://www.example.com/",
"title": "Example dot com",
"snippet": "Lorem ipsum dolor esit",
"score": 182.51408386230469,
"domain_hash": "2892282071861106665",
"url_hash": "2892281418178079567"
}]
}
The url flag d can be used to control deduplication:
curl http://node0002.alexandria.org/?q=the%20beatles&d=a
curl http://node0002.alexandria.org/?q=the%20beatles&d=d
d=a // No deduplication, show all results
d=d // Deduplication
Default value is d=d
```
### Perform url lookup
```
curl http://node0002.alexandria.org/?u=https://www.example.org/
{
"status": "success",
"time_ms": 35.876,
"response": "[DATA]"
}
```
### Fetch information about search result
```
curl http://node0002.alexandria.org/?s=example%20query
{
"status": "success",
"time_ms": 13.984,
"index": {
"total": 980770801,
"words": {
"example": 0.0080152416772448342,
"query": 0.0017581304401006531
}
},
"link_index": {
"total": 472012858,
"words": {
"example": 0.000581251114985516,
"query": 6.3595725182554242e-05
}
}
}
```
### Fetch status of the node.
```
curl http://node0002.alexandria.org/status
{
"status": "success",
"time_ms": 13.984,
"total_disk_space": 89374934876,
"avail_disk_space": 83975235,
"avail_disk_percent": 0.0832,
"index": {
"items": 980770801,
"full_text_disk_used": 973295875,
"full_text_disk_percent": 0.5423,
"hash_table_disk_used": 839265,
"hash_table_disk_percent": 0.05423
},
"link_index": {
"items": 980770801,
"full_text_disk_used": 973295875,
"full_text_disk_percent": 0.2423,
"hash_table_disk_used": 839265,
"hash_table_disk_percent": 0.0423
}
}
```
### Combined api response (api.alexandria.org)
```
curl https://api.alexandria.org/?q=the%20beatles&p=1
{
"status": "success",
"time_ms": 35.876,
"total_found": 245436,
"total_url_links_found": 4092,
"total_domain_links_found": 4092,
"links_handled": 674,
"link_domain_matches": 18059,
"link_url_matches": 589,
"page_max": 10,
"results": [{
"url": "https://www.example.com/",
"display_url": "https://www.example.com/",
"title": "Example dot com",
"snippet": "Lorem ipsum dolor esit",
"score": 182.51408386230469,
"domain_hash": "2892282071861106665",
"url_hash": "2892281418178079567",
"exact_match": 1,
"phrase_match": 1,
"year": 3300,
"is_old": 0,
"is_subdomain": 0,
"domain": "www.example.com"
},
...
]
}
```
================================================
FILE: documentation/caching.md
================================================
## Caching
Our nodes should try to use as much RAM as possible to store index data for common tokens in RAM. I think the best way would be to hold a list of the most commonly queried tokens.
We can use /proc/meminfo to retrieve information about available memory on the server.
================================================
FILE: documentation/coding_rules.md
================================================
## Coding rules
1. Indent with tabs.
2. Use auto for variable declarations when possible.
3. Never put "using namespace std" in any file.
4. Prefix class member variables with m_, this way you know you are using a member or local variable.
5. All namespaces, classes, functions and variables should be lower_case.
6. All files within a sub-directory must declare everything within a namespace with the same name as the directory. For example src/file/tsv_file.h must declare everything within the namespace file::
7. Prefer smart pointers over regular pointers.
8. Prefer if statements over switch statements.
## Indentation examples
Indent with tabs!
### pointers
```c++
// * and & are glued to the variable
int *ptr = new int[100];
int *ptr2 = &addr;
```
### operators
```c++
// Spaces between binary operators
int a = 1 + 2;
int b = multiple * (add1 + add2);
a += b;
// Unary operators are glued to variable
int a = 1;
a++;
int b = -a;
```
### functions
```c++
// Spaces after comma
int add(int a, int b) {
return a + b;
}
// Spaces after comma here too
add(123, 333);
```
### classes
```c++
template<typename data_record>
class index_builder {
public:
index_builder(const std::string &db_name, size_t id);
int public_func();
private:
int m_member;
int m_counter;
int private_func();
};
```
### if
```c++
// Space between "if" and "("
// Space between ")" and "{"
if (something) {
do_something();
} else if (something_else) {
do_something_else();
} else {
do_else();
}
```
### loops
```c++
// Prefer range based loops.
for (const auto &iter : m_map) {
}
// But if you need a standard loop indent it like this.
for (int i = 0; i < 100; i++) {
}
```
### memory allocation
```c++
// Avoid new/delete, use smart pointers everywhere.
// If you just need a regular pointer to memory do this:
std::unique_ptr<char[]> allocator;
try {
allocator = std::make_unique<char[]>(1000);
} catch (std::bad_alloc &error) {
// Handle allocation error.
}
char *ptr = allocator.get();
// Use ptr as regular pointer to 1000 chars.
// ptr will be deleted automatically when allocator goes out of scope.
```
================================================
FILE: documentation/configure_local_nginx.md
================================================
# Configure local nginx server.
1. Install nginx
```
apt-get install nginx
```
2. Add configuration to /etc/nginx/sites-available/default (If you are running other sites locally you should probably do something else here)
```
server {
listen 80 default_server;
listen [::]:80 default_server;
root /var/www/html/node0003.alexandria.org;
index index.html index.htm index.nginx-debian.html;
server_name _;
location / {
try_files $uri $uri/ =404;
autoindex on;
}
}
```
3. Download test data to /var/www/html
```
./scripts/download-test-data.sh /var/www/html
```
================================================
FILE: documentation/full_text_indexes.md
================================================
# The alexandria full text index
A full text index in its simplest form is a hash map from an integer word id ```key``` to a list of documents.
There are two kinds of data structures, called ```index``` and ```counted_index```. Both data structures act on a given template type
```data_record```.
The two data structures share the same data layout except for the last part, where ```index``` stores roaring bitmaps while `counted_index` stores the records.
## Data layout
The index starts with a hash table. The hash table stores the position for the page containing `key` at index `key % hash_table_size`.
```
hash table : uint64_t[hash_table_size] (8 x hash_table_size bytes)
num_records : uint64_t (8 bytes)
list of records : data_record[num_records] (sizeof(data_record) * num_records bytes)
consecutive pages : page[varying] (undetermined size)
```
A single page consists of a list of keys. Each key then has a corresponding position among the bitmaps and a length of the bitmap. The bitmaps (of varying length) are then stored consecutively.
```
num_keys : uint64_t (8 bytes)
list of keys : uint64_t[num_keys] (8 x num_keys bytes)
list of positions : uint64_t[num_keys] (8 x num_keys bytes)
list of lengths : uint64_t[num_keys] (8 x num_keys bytes)
consecutive bitmaps : bitmap[num_keys] (undetermined size)
```
================================================
FILE: documentation/ideas.md
================================================
# Similar words
To handle similar words (saluhall, saluhallen) we should create a hashtable with similar words and as an additional index create "saluhall+" by combining our existing indexes of saluhall, saluhallen, saluhallarna etc. into one additional index.
# Autocomplete
We should base our autocomplete on the most common words in titles of documents before and after each word. For example "Uppsala" could suggest "Uppsala kommun", "Uppsala universitet" and "Destination Uppsala" based on the search results.
================================================
FILE: documentation/index_file_format.md
================================================
# Index file format
```8 bytes number of keys (n)
8 * n bytes keys
8 * n bytes positions
8 * n bytes lengths (len(k) number of records for key k)
8 * n bytes total found results
[Data Records]
```
```
Data records are structured like this:
len(k) * (8 bytes unsigned long URL id, 4 bytes single precision float score)
```
================================================
FILE: documentation/indexer.md
================================================
### NAME
indexer - manually index data or analyze things
### SYNOPSIS
indexer [OPTION]
### DESCRIPTION
```
--split source_batch target_prefix
splits the urls in the local source batch and outputs them into {target_prefix}-[0-23]/files.
for example --split CC-MAIN-2021-04 /mnt/crawl-data/NODE
--split-count
--split-count-domains
--split-count-links
--split-make-scraper-urls
--tools-download-batch
--tools-upload-urls-with-links
--tools-find-links
--calculate-harmonic-hosts
--calculate-harmonic-links
--calculate-harmonic
--host-hash
--host-hash-mod
--console
run the interactive console for making debug searches.
--index-domans BATCH LIMIT OFFSET
run the indexer for our domain index adding the urls+data from BATCH
--index-links BATCH LIMIT OFFSET
run the link indexer adding url_ and domain_ links from BATCH
--index-words BATCH LIMIT OFFSET
run the word indexer adding word data from BATCH
--index-urls BATCH LIMIT OFFSET
run the url indexer on batch generating one index per domain
--index-snippets BATCH LIMIT OFFSET
run the snippet indexer
--truncate-domains
--truncate-links
--truncate-words
--truncate-urls
--truncate-snippets
--info
print info about indexes
```
================================================
FILE: documentation/installing_nodes.md
================================================
If there is a problem with RAID information on a drive, unmount all partitions and run:
```
wipefs -a /dev/nvme1n1
```
then reset and install node again.
To setup node with two drives run:
```
source <(curl -s https://raw.githubusercontent.com/alexandria-org/alexandria/main/scripts/bootstrap_node_2drives.sh)
```
================================================
FILE: documentation/performance_journal.md
================================================
## Performance journal
### File system testing
Ext2 (noatime,nodiratime,barrier=0)
```
$ dd if=/dev/zero of=/tmp/test1.img bs=10G count=1 oflag=dsync
0+1 records in
0+1 records out
2147479552 bytes (2.1 GB, 2.0 GiB) copied, 4.76649 s, 451 MB/s
$ echo 3 > /proc/sys/vm/drop_caches
$ time dd if=/tmp/test1.img of=/dev/null bs=8k
262143+1 records in
262143+1 records out
2147479552 bytes (2.1 GB, 2.0 GiB) copied, 1.43043 s, 1.5 GB/s
real 0m1.435s
user 0m0.013s
sys 0m0.763s
```
Ext2 (relatime)
```
$ dd if=/dev/zero of=/tmp/test1.img bs=10G count=1 oflag=dsync
0+1 records in
0+1 records out
2147479552 bytes (2.1 GB, 2.0 GiB) copied, 5.02563 s, 427 MB/s
$ echo 3 > /proc/sys/vm/drop_caches
$ time dd if=/tmp/test1.img of=/dev/null bs=8k
262143+1 records in
262143+1 records out
2147479552 bytes (2.1 GB, 2.0 GiB) copied, 1.48533 s, 1.4 GB/s
real 0m1.490s
user 0m0.046s
sys 0m0.604s
```
Ext4 (noatime,nodiratime,barrier=0):
```
$ dd if=/dev/zero of=/tmp/test1.img bs=10G count=1 oflag=dsync
0+1 records in
0+1 records out
2147479552 bytes (2.1 GB, 2.0 GiB) copied, 2.26469 s, 948 MB/s
$ echo 3 > /proc/sys/vm/drop_caches
$ time dd if=/tmp/test1.img of=/dev/null bs=8k
262143+1 records in
262143+1 records out
2147479552 bytes (2.1 GB, 2.0 GiB) copied, 0.821499 s, 2.6 GB/s
real 0m0.824s
user 0m0.004s
sys 0m0.648s
```
Ext4 (relatime):
```
$ dd if=/dev/zero of=/tmp/test1.img bs=10G count=1 oflag=dsync
0+1 records in
0+1 records out
2147479552 bytes (2.1 GB, 2.0 GiB) copied, 2.15461 s, 997 MB/s
$ echo 3 > /proc/sys/vm/drop_caches
$ time dd if=/tmp/test1.img of=/dev/null bs=8k
262143+1 records in
262143+1 records out
2147479552 bytes (2.1 GB, 2.0 GiB) copied, 0.822013 s, 2.6 GB/s
real 0m0.825s
user 0m0.029s
sys 0m0.568s
```
Conclusion. Run ext4
### Software load testing
2021-10-06, AX61-NVME with two discs
```
Server Software: nginx/1.18.0
Server Hostname: node0002.alexandria.org
Server Port: 80
Concurrency Level: 5
Time taken for tests: 294.451 seconds
Complete requests: 2000
Failed requests: 0
Write errors: 0
Total transferred: 294262066 bytes
HTML transferred: 293986342 bytes
Requests per second: 6.79 [#/sec] (mean)
Time per request: 736.127 [ms] (mean)
Time per request: 147.225 [ms] (mean, across all concurrent requests)
Transfer rate: 975.94 [Kbytes/sec] received
Connection Times (ms)
min mean[+/-sd] median max
Connect: 12 19 10.1 16 152
Processing: 16 717 461.5 652 2896
Waiting: 0 662 431.7 587 2770
Total: 31 736 460.4 671 2911
Percentage of the requests served within a certain time (ms)
50% 671
66% 879
75% 1009
80% 1108
90% 1344
95% 1595
98% 1864
99% 2062
100% 2911 (longest request)
```
2021-10-10, AX61-NVME with two discs
```
Server Software: nginx/1.18.0
Server Hostname: node0002.alexandria.org
Server Port: 80
Concurrency Level: 5
Time taken for tests: 328.051 seconds
Complete requests: 2000
Failed requests: 0
Write errors: 0
Total transferred: 255881934 bytes
HTML transferred: 255605934 bytes
Requests per second: 6.10 [#/sec] (mean)
Time per request: 820.128 [ms] (mean)
Time per request: 164.026 [ms] (mean, across all concurrent requests)
Transfer rate: 761.73 [Kbytes/sec] received
Connection Times (ms)
min mean[+/-sd] median max
Connect: 12 52 95.6 25 1560
Processing: 16 767 558.9 689 3961
Waiting: 15 638 427.9 594 2631
Total: 32 819 558.5 742 4113
Percentage of the requests served within a certain time (ms)
50% 742
66% 982
75% 1159
80% 1260
90% 1560
95% 1831
98% 2186
99% 2470
100% 4113 (longest request)
```
2021-10-10, AX41-NVMe with four discs
```
Server Software: nginx/1.18.0
Server Hostname: 65.21.238.146
Server Port: 80
Concurrency Level: 5
Time taken for tests: 278.694 seconds
Complete requests: 2000
Failed requests: 0
Write errors: 0
Total transferred: 232745432 bytes
HTML transferred: 232469432 bytes
Requests per second: 7.18 [#/sec] (mean)
Time per request: 696.735 [ms] (mean)
Time per request: 139.347 [ms] (mean, across all concurrent requests)
Transfer rate: 815.56 [Kbytes/sec] received
Connection Times (ms)
min mean[+/-sd] median max
Connect: 12 69 98.4 35 1107
Processing: 14 627 698.4 454 9790
Waiting: 14 435 346.5 368 4045
Total: 29 696 719.1 522 10159
Percentage of the requests served within a certain time (ms)
50% 522
66% 755
75% 927
80% 1050
90% 1382
95% 1781
98% 2415
99% 3439
100% 10159 (longest request)
```
2021-10-10, AX41-NVMe with four discs
```
Server Software: nginx/1.18.0
Server Hostname: 65.21.238.146
Server Port: 80
Concurrency Level: 5
Time taken for tests: 252.503 seconds
Complete requests: 2000
Failed requests: 0
Write errors: 0
Total transferred: 230349918 bytes
HTML transferred: 230073780 bytes
Requests per second: 7.92 [#/sec] (mean)
Time per request: 631.258 [ms] (mean)
Time per request: 126.252 [ms] (mean, across all concurrent requests)
Transfer rate: 890.88 [Kbytes/sec] received
Connection Times (ms)
min mean[+/-sd] median max
Connect: 12 54 78.2 27 1068
Processing: 15 576 519.3 436 3659
Waiting: 15 421 325.7 354 2421
Total: 30 631 527.6 491 3728
Percentage of the requests served within a certain time (ms)
50% 491
66% 707
75% 861
80% 988
90% 1355
95% 1736
98% 2100
99% 2419
100% 3728 (longest request)
```
2021-10-10, AX61-NVME with two discs, 4 partitions
```
Server Software: nginx/1.18.0
Server Hostname: 65.21.125.158
Server Port: 80
Concurrency Level: 5
Time taken for tests: 263.283 seconds
Complete requests: 2000
Failed requests: 0
Write errors: 0
Total transferred: 282821583 bytes
HTML transferred: 282545445 bytes
Requests per second: 7.60 [#/sec] (mean)
Time per request: 658.209 [ms] (mean)
Time per request: 131.642 [ms] (mean, across all concurrent requests)
Transfer rate: 1049.03 [Kbytes/sec] received
Connection Times (ms)
min mean[+/-sd] median max
Connect: 13 28 32.9 26 630
Processing: 17 629 434.1 563 3051
Waiting: 15 587 412.8 517 2949
Total: 36 657 435.8 593 3090
Percentage of the requests served within a certain time (ms)
50% 593
66% 774
75% 914
80% 1003
90% 1260
95% 1480
98% 1708
99% 1959
100% 3090 (longest request)
```
2021-10-10, AX61-NVME with two discs, 4 partitions
```
Server Software: nginx/1.18.0
Server Hostname: 65.21.125.158
Server Port: 80
Concurrency Level: 5
Time taken for tests: 249.241 seconds
Complete requests: 2000
Failed requests: 0
Write errors: 0
Total transferred: 267058842 bytes
HTML transferred: 266782842 bytes
Requests per second: 8.02 [#/sec] (mean)
Time per request: 623.101 [ms] (mean)
Time per request: 124.620 [ms] (mean, across all concurrent requests)
Transfer rate: 1046.38 [Kbytes/sec] received
Connection Times (ms)
min mean[+/-sd] median max
Connect: 13 27 19.3 25 734
Processing: 15 596 469.4 506 3785
Waiting: 0 554 449.3 467 3660
Total: 32 622 470.7 531 3805
Percentage of the requests served within a certain time (ms)
50% 531
66% 735
75% 878
80% 974
90% 1234
95% 1495
98% 1809
99% 2104
100% 3805 (longest request)
```
2021-10-12, AX61-NVME with four discs and 8 partitions
```
Server Software: nginx/1.18.0
Server Hostname: 135.181.182.4
Server Port: 80
Concurrency Level: 5
Time taken for tests: 264.412 seconds
Complete requests: 2000
Failed requests: 0
Write errors: 0
Total transferred: 274309399 bytes
HTML transferred: 274033261 bytes
Requests per second: 7.56 [#/sec] (mean)
Time per request: 661.029 [ms] (mean)
Time per request: 132.206 [ms] (mean, across all concurrent requests)
Transfer rate: 1013.12 [Kbytes/sec] received
Connection Times (ms)
min mean[+/-sd] median max
Connect: 13 27 16.1 25 348
Processing: 14 633 449.6 565 2996
Waiting: 0 590 425.7 520 2545
Total: 34 661 450.3 594 3014
Percentage of the requests served within a certain time (ms)
50% 594
66% 772
75% 905
80% 1000
90% 1271
95% 1510
98% 1834
99% 1997
100% 3014 (longest request)
```
2021-10-12, AX61-NVME with four discs and 8 partitions
```
Server Software: nginx/1.18.0
Server Hostname: 135.181.182.4
Server Port: 80
Concurrency Level: 5
Time taken for tests: 233.408 seconds
Complete requests: 2000
Failed requests: 0
Write errors: 0
Total transferred: 272488725 bytes
HTML transferred: 272213277 bytes
Requests per second: 8.57 [#/sec] (mean)
Time per request: 583.519 [ms] (mean)
Time per request: 116.704 [ms] (mean, across all concurrent requests)
Transfer rate: 1140.07 [Kbytes/sec] received
Connection Times (ms)
min mean[+/-sd] median max
Connect: 12 25 10.1 24 187
Processing: 15 558 402.0 487 2727
Waiting: 0 512 377.0 440 2051
Total: 33 583 402.8 512 2757
Percentage of the requests served within a certain time (ms)
50% 512
66% 695
75% 806
80% 882
90% 1114
95% 1373
98% 1621
99% 1779
100% 2757 (longest request)
```
================================================
FILE: documentation/search_result_ranking.md
================================================
# Search Result Ranking
This document describes how search results are indexed and ranked.
## Input
Input to our indexer is a sequence of deduplicated urls with the following data.
```
{
url: "https://www.example.com/",
title: "Example Page",
meta_description: "",
h1: "Example Domain",
text: "This domain is for use in illustrative examples in documents. You may use this domain in literature without prior coordination or asking for permission. More information..."
}
```
## 1. Domain level
Each url is added with the url hash as key. The tokens are not deduplicated throughout the domain.
```
domain_score:
    idf * sum(tf)   NOTE: the original formula here was incomplete ("idf * sum(tf_ + )"); confirm the intended expression.
```
```
domain_score = expm1(5 * link.m_score) + 0.1;
url_score = expm1(10 * link.m_score) + 0.1;
```
================================================
FILE: documentation/statues_swe.tex
================================================
\documentclass[12pt, a4paper]{article}
\usepackage[T1]{fontenc}
\usepackage[utf8]{inputenc}
\usepackage[swedish]{babel}
\title{Stadgar för Föreningen Alexandria.org}
\date{Januari 2022}
\begin{document}
\maketitle
\paragraph{§ 1 Föreningens firma}
\paragraph{}
Föreningens firma är Föreningen Alexandria.org och föreningens firmatecknare är ordförande eller annan person utsedd till firmatecknare av styrelsen.
\paragraph{§ 2 Föreningens ändamål}
\paragraph{}
Föreningen har som ändamål att göra kunskap mer tillgänglig. Föreningen ska uppfylla sitt ändamål genom att utveckla och tillhandahålla en sökmotor som är gratis och utan annonser. Källkoden till sökmotorn ska publiceras som öppen källkod.
\paragraph{§ 3 Föreningens säte}
\paragraph{}
Föreningen har sitt säte i Uppsala.
\paragraph{§ 4 Medlemskap}
\paragraph{}
Föreningens medlemmar är aktiva i föreningens verksamhet. Nya medlemmar måste godkännas av styrelsen.
\paragraph{§ 5 Medlemsavgifter}
\paragraph{}
Medlem ska betala den medlemsavgift som årligen fastställs av årsmötet.
\pagebreak
\paragraph{§ 6 Styrelsen}
\paragraph{}
Styrelsen består av en ordförande, en kassör, en suppleant och eventuellt ytterligare ledamöter enligt årsmötets beslut.
\paragraph{§ 7 Styrelsens uppdrag}
\paragraph{}
Styrelsen företräder föreningen, bevakar dess intressen och handhar dess angelägenheter. Styrelsen beslutar å föreningens vägnar såvida inte annat
föreskrivs i dessa stadgar. Styrelsen ska verkställa av årsmötet fattade beslut, handha föreningens ekonomiska angelägenheter och föra räkenskaper,
samt avge årsredovisning till årsstämman för det senaste räkenskapsåret. Styrelsen sammanträder när ordföranden finner det erforderligt eller om
minst två styrelseledamöter begär detta.
\paragraph{}
Styrelsen är beslutsför då minst hälften av ledamöterna, avrundat uppåt, är närvarande. Styrelsebeslut fattas med enkel majoritet. Vid lika röstetal gäller den mening
ordföranden biträder.
\paragraph{§ 8 Räkenskaper}
\paragraph{}
Räkenskapsår ska vara kalenderår.
\paragraph{§ 9 Revisor}
\paragraph{}
Styrelsens förvaltning ska årligen granskas av en på årsmötet utsedd revisor. Revisorn ska senast den 1 mars avge sin revisionsberättelse. Revisorn får ej vara medlem i styrelsen.
\paragraph{§ 10 Årsmöte}
\paragraph{}
Ordinarie årsmöte, vilket är föreningens högsta beslutande organ, hålls årligen före den 30 juni på tid och plats som styrelsen bestämmer. Kallelse sker via epost minst 2 veckor före utsatt möte. Motioner som har inkommit senast 7 dagar före årsmötet ska anses ha kommit i tid. Motioner skickas via epost.
\paragraph{}
Vid ordinarie årsmöte ska följande ärenden behandlas:
\begin{enumerate}
\item Val av ordförande och sekreterare för mötet.
\item Fastställande av röstlängd för mötet.
\item Fastställande av dagordning.
\item Styrelsens verksamhetsberättelse för det senaste verksamhetsåret.
\item Styrelsens förvaltningsberättelse (balans- och resultaträkning) för det senaste verksamhets-/räkenskapsåret.
\item Revisionsberättelsen för verksamhets-/räkenskapsåret.
\item Fråga om ansvarsfrihet för styrelsen för den tid revisionen avser.
\item Fastställande av medlemsavgifter.
\item Fastställande av ev. verksamhetsplan och behandling av budget för det kommande verksamhets-/räkenskapsåret.
\item Val av ordförande i föreningen för en tid av 1 år.
\item Val av kassör, övriga styrelseledamöter samt suppleanter för en tid av 1 år
\item Val av revisorer.
\item Behandling av styrelsens förslag och i rätt tid inkomna motioner.
\item Övriga frågor.
\end{enumerate}
\paragraph{§ 11 Extra årsmöte}
\paragraph{}
Extra årsmöte hålls när styrelsen eller revisorerna finner att det är nödvändigt. Kallelse sker via epost minst 2 veckor före utsatt möte.
\paragraph{§ 12 Rösträtt}
\paragraph{}
Vid årsmöte har varje medlem en röst. Rösträtten är personlig och kan inte utövas genom ombud.
\paragraph{§ 13 Beslut, omröstning och beslutsmässighet}
\paragraph{}
Beslut fattas med bifallsrop (acklamation) eller om så begärs, efter omröstning (votering).
\paragraph{}
Omröstning sker öppet, utom vid val där sluten omröstning ska äga rum om någon begär detta. Beslut fattas, såvida dessa stadgar ej föreskriver
annat, med enkel majoritet. Vid lika röstetal skall den mening som ordförande biträder vinna bifall.
\paragraph{}
Mötet är beslutsmässigt med det antal röstberättigade medlemmar som är närvarande på mötet.
\paragraph{§ 14 Regler för ändring av stadgarna}
\paragraph{}
För ändring av dessa stadgar krävs beslut av två på varandra följande ordinarie årsmöten. Förslag till ändring av stadgarna får ges såväl av medlem som styrelsen.
\paragraph{§ 15 Utträde}
\paragraph{}
Medlem som önskar utträda ur föreningen ska skriftligen anmäla detta till styrelsen och anses därmed omedelbart ha lämnat föreningen.
\paragraph{§ 16 Uteslutning}
\paragraph{}
Medlem får uteslutas från föreningen om den har försummat att betala beslutade avgifter, motarbetat föreningens
verksamhet eller ändamål, eller skadat föreningens intressen. Beslut om uteslutning fattas av styrelsen.
\end{document}
================================================
FILE: scripts/bootstrap_node_2drives.sh
================================================
#!/bin/bash
# Bootstrap a fresh node: install base packages, partition the second NVMe
# drive into four ext4 partitions and prepare the shard directory layout.
apt-get update
apt-get -y install vim parted zip unzip nginx

# _mkpart <disc> <mountpoint1> <mountpoint2> <mountpoint3> <mountpoint4>
# Partitions <disc> into four equal ext4 partitions, mounts them on the four
# given mountpoints and appends /etc/fstab entries so the mounts survive a
# reboot (barrier=0 and no atime updates, per the performance journal).
_mkpart() {
	disc=$1
	mountpoints=("$2" "$3" "$4" "$5")
	parted -s "$disc" mklabel gpt
	parted -s -a optimal "$disc" mkpart primary ext4 0% 25%
	parted -s -a optimal "$disc" mkpart primary ext4 25% 50%
	parted -s -a optimal "$disc" mkpart primary ext4 50% 75%
	parted -s -a optimal "$disc" mkpart primary ext4 75% 100%
	sleep 1 # give the kernel a moment to create the partition device nodes
	echo "" >> /etc/fstab
	for i in 1 2 3 4; do
		part="${disc}p${i}"
		mountpoint="${mountpoints[$((i - 1))]}"
		mkfs.ext4 -F "$part"
		mkdir -p "$mountpoint" # -p: idempotent if the directory already exists
		mount "$part" "$mountpoint"
		echo "$part $mountpoint ext4 noatime,nodiratime,barrier=0 0 0" >> /etc/fstab
	done
}
# /mnt/0-3 live on the system disc; /mnt/4-7 are the four partitions created
# on the second NVMe drive by _mkpart above.
mkdir /mnt/0
mkdir /mnt/1
mkdir /mnt/2
mkdir /mnt/3
_mkpart /dev/nvme1n1 /mnt/4 /mnt/5 /mnt/6 /mnt/7
# Standard shard directory layout expected by the indexer and server.
for shard in $(seq 0 7); do
	mkdir "/mnt/$shard/input";
	mkdir "/mnt/$shard/output";
	mkdir "/mnt/$shard/upload";
	mkdir "/mnt/$shard/hash_table";
	mkdir "/mnt/$shard/full_text";
	mkdir "/mnt/$shard/tmp";
done
# nginx forwards every request to the alexandria FastCGI server on port 8000.
# The \$ escapes keep the nginx runtime variables literal in the written file.
echo "server {
listen 80;
server_name localhost;
location / {
fastcgi_pass 127.0.0.1:8000;
fastcgi_param GATEWAY_INTERFACE CGI/1.1;
fastcgi_param SERVER_SOFTWARE nginx;
fastcgi_param QUERY_STRING \$query_string;
fastcgi_param REQUEST_METHOD \$request_method;
fastcgi_param CONTENT_TYPE \$content_type;
fastcgi_param CONTENT_LENGTH \$content_length;
fastcgi_param SCRIPT_FILENAME \$document_root\$fastcgi_script_name;
fastcgi_param SCRIPT_NAME \$fastcgi_script_name;
fastcgi_param REQUEST_URI \$request_uri;
fastcgi_param DOCUMENT_URI \$document_uri;
fastcgi_param DOCUMENT_ROOT \$document_root;
fastcgi_param SERVER_PROTOCOL \$server_protocol;
fastcgi_param REMOTE_ADDR \$remote_addr;
fastcgi_param REMOTE_PORT \$remote_port;
fastcgi_param SERVER_ADDR \$server_addr;
fastcgi_param SERVER_PORT \$server_port;
fastcgi_param SERVER_NAME \$server_name;
}
}" > /etc/nginx/sites-enabled/default
/etc/init.d/nginx restart
# Unprivileged system user that runs the alexandria services.
adduser --system --shell /sbin/nologin --gecos "User for running alexandria service" --disabled-password --home /alexandria alexandria
touch /var/log/alexandria.log
chown alexandria:syslog /var/log/alexandria.log
# systemd unit for the search server binary installed below.
echo "[Unit]
Description=Alexandria Server
[Service]
User=alexandria
WorkingDirectory=/alexandria
ExecStart=/alexandria/server
Nice=-20
Restart=always
[Install]
WantedBy=multi-user.target" > /etc/systemd/system/alexandria.service
# Default node configuration read via ALEXANDRIA_CONFIG=/etc/alexandria.conf.
echo "# Cluster config
nodes_in_cluster = 4
node_id = 0
# Indexer config
batches[] = NODE-0
batches[] = NODE-1
batches[] = NODE-2
batches[] = NODE-3
batches[] = NODE-4
batches[] = NODE-5
link_batches[] = LINK-0
link_batches[] = LINK-1
link_batches[] = LINK-2
link_batches[] = LINK-3
link_batches[] = LINK-4
link_batches[] = LINK-5
# Server config
worker_count = 8
query_max_words = 10 # Maximum number of words used in query.
query_max_len = 200
deduplicate_domain_count = 5
pre_result_limit = 200000
result_limit = 1000
# Full text config
ft_max_sections = 8
ft_max_results_per_section = 2000000
ft_section_depth = 4" > /etc/alexandria.conf
# Install the released binaries and give the service user access to the shards.
mkdir /alexandria
cd /alexandria
wget https://github.com/alexandria-org/alexandria/releases/download/v1.0/alexandria.zip
unzip alexandria.zip
chown -R alexandria /mnt/*
================================================
FILE: scripts/build-deps.sh
================================================
#!/bin/bash
# Build and install the bundled dependencies (zlib and CRoaring).
# Run scripts/download-deps.sh first to fetch the sources into deps/.
set -e # abort on the first failed build step instead of cascading into the next
cd "$(dirname "$0")"
cd ..
base_path=$(pwd)

# Build and install zlib.
cd "$base_path/deps/zlib-1.2.12"
./configure
make -j4
make install

# Build and install CRoaring with the system gcc/g++.
export CC=/usr/bin/gcc
export CXX=/usr/bin/g++
cd "$base_path/deps/CRoaring"
mkdir -p build # -p: do not fail on re-runs when build/ already exists
cd build
cmake ..
make
make install
================================================
FILE: scripts/clean.sh
================================================
#!/bin/bash
# Delete all local alexandria data under /mnt/0../mnt/7 after confirmation,
# then recreate the empty shard directory layout.
cd "$(dirname "$0")"
cd ..
read -p "Do you want to delete your local alexandria data? [Y/n] " -n 1 -r
echo
# Accept both 'Y' and 'y', matching what the [Y/n] prompt implies.
if [[ $REPLY =~ ^[Yy]$ ]]
then
	for shard in $(seq 0 7); do
		rm -r "/mnt/$shard/"*
		# -p: the shard directory itself survives the rm above, so a
		# plain mkdir would fail with "File exists".
		mkdir -p "/mnt/$shard"
		mkdir "/mnt/$shard/input"
		mkdir "/mnt/$shard/output"
		mkdir "/mnt/$shard/upload"
		mkdir "/mnt/$shard/hash_table"
		mkdir "/mnt/$shard/full_text"
		mkdir "/mnt/$shard/tmp"
	done
else
	echo "Ignoring"
fi
================================================
FILE: scripts/download-deps.sh
================================================
#!/bin/bash
# Download third party dependency sources into deps/.
cd "$(dirname "$0")"
cd ..
export CC=/usr/bin/gcc-10
export CXX=/usr/bin/g++-10
base_path=$(pwd)
cd "$base_path"
mkdir -p deps
cd deps
curl -L https://github.com/nlohmann/json/releases/latest/download/json.hpp > json.hpp
curl https://zlib.net/fossils/zlib-1.2.12.tar.gz > zlib-1.2.12.tar.gz
# Extract in one step instead of a separate gunzip + tar pass.
tar -xzf zlib-1.2.12.tar.gz
git clone https://github.com/abseil/abseil-cpp.git
git clone https://github.com/RoaringBitmap/CRoaring.git
wget https://raw.githubusercontent.com/google/robotstxt/master/robots.cc
wget https://raw.githubusercontent.com/google/robotstxt/master/robots.h
================================================
FILE: scripts/download-test-data.sh
================================================
#!/bin/bash
# Download the alexandria test dataset into the destination directory given
# as the first argument, and prepare the local shard directory layout.
cd "$(dirname "$0")"
if [ $# -eq 0 ]; then
	echo "Provide destination path as first argument"
	exit 1
fi
# Standard shard directory layout; -p makes re-runs idempotent.
for shard in $(seq 0 7); do
	for dir in input output upload hash_table full_text tmp; do
		mkdir -p "/mnt/$shard/$dir"
	done
done
DEST=$1
cd "$DEST" || { echo "target directory does not exist"; exit 127; }
rm -r node0003.alexandria.org
# Helper: mirror one directory/file from the data server with auth.
_fetch() {
	wget -r -l1 --no-parent "$1" --http-user=alexandria --http-password=wmXN6U4u
}
# Ten test batches, ALEXANDRIA-TEST-01 .. ALEXANDRIA-TEST-10.
for batch in $(seq -w 1 10); do
	_fetch "http://node0003.alexandria.org/crawl-data/ALEXANDRIA-TEST-$batch/"
done
_fetch http://node0003.alexandria.org/crawl-data/ALEXANDRIA-MANUAL-01/warc.paths.gz
_fetch http://node0003.alexandria.org/crawl-data/ALEXANDRIA-MANUAL-01/files/top_domains.txt.gz
_fetch http://node0003.alexandria.org/crawl-data/ALEXANDRIA-MANUAL-01/files/50_top_domains.txt.gz
_fetch http://node0003.alexandria.org/dev_files/
_fetch http://node0003.alexandria.org/example.txt
_fetch http://node0003.alexandria.org/example.txt.gz
_fetch http://node0003.alexandria.org/test-data/
# Directories used by the upload endpoint; owned by the web server user.
mkdir -p node0003.alexandria.org/nodes/test0001
mkdir -p node0003.alexandria.org/upload-tmp
chown -R www-data:www-data node0003.alexandria.org
================================================
FILE: scripts/find_missing_files_in_batch.sh
================================================
#!/bin/bash
# List the warc files of a commoncrawl batch that are missing or truncated
# under /mnt, writing the list to /mnt/crawl-data/<batch>/missing.paths.gz.
cd "$(dirname "$0")"
cd ..
if [ $# -eq 0 ]; then
	echo "Provide the batch name as first argument"
	exit 1
fi
batch=$1
files=$(curl "https://data.commoncrawl.org/crawl-data/$batch/warc.paths.gz" | gunzip)
missing_files_path="/mnt/crawl-data/$batch/missing.paths"
truncate -s 0 "$missing_files_path"
for raw_file in $files; do
	# Local copies are stored re-compressed as .gz instead of .warc.gz.
	file="/mnt/${raw_file/.warc.gz/.gz}"
	if [[ -f "$file" ]]; then
		filesize=$(stat -c%s "$file")
		# Anything under 1000 bytes is assumed to be a truncated download.
		if [[ $filesize -lt 1000 ]]; then
			echo "The file '$file' exists and is small."
			echo "$raw_file" >> "$missing_files_path"
		fi
	else
		echo "The file '$file' does not exist."
		echo "$raw_file" >> "$missing_files_path"
	fi
done
gzip "$missing_files_path"
================================================
FILE: scripts/init-docker.sh
================================================
#!/bin/bash
cd `dirname $0`
# The local docker development environment runs the data server on the local machine.
# This script sets that up and downloads the test data.
echo "Copying nginx config";
# Serve the test data directory over HTTP with autoindex, accept PUT uploads
# into upload-tmp, and forward /store requests to the FastCGI service on
# port 8001. The \$ escapes keep the nginx variables literal in the output.
echo "server {
listen 80 default_server;
listen [::]:80 default_server;
root /var/www/html/node0003.alexandria.org;
index index.html;
server_name _;
location / {
autoindex on;
client_body_temp_path /var/www/html/node0003.alexandria.org/upload-tmp;
dav_methods PUT;
create_full_put_path on;
dav_access group:rw all:r;
client_max_body_size 10000m;
}
location /store {
fastcgi_pass 127.0.0.1:8001;
fastcgi_param GATEWAY_INTERFACE CGI/1.1;
fastcgi_param SERVER_SOFTWARE nginx;
fastcgi_param QUERY_STRING \$query_string;
fastcgi_param REQUEST_METHOD \$request_method;
fastcgi_param CONTENT_TYPE \$content_type;
fastcgi_param CONTENT_LENGTH \$content_length;
fastcgi_param SCRIPT_FILENAME \$document_root\$fastcgi_script_name;
fastcgi_param SCRIPT_NAME \$fastcgi_script_name;
fastcgi_param REQUEST_URI \$request_uri;
fastcgi_param DOCUMENT_URI \$document_uri;
fastcgi_param DOCUMENT_ROOT \$document_root;
fastcgi_param SERVER_PROTOCOL \$server_protocol;
fastcgi_param REMOTE_ADDR \$remote_addr;
fastcgi_param REMOTE_PORT \$remote_port;
fastcgi_param SERVER_ADDR \$server_addr;
fastcgi_param SERVER_PORT \$server_port;
fastcgi_param SERVER_NAME \$server_name;
}
}
" > /etc/nginx/sites-enabled/default
echo "Downloading test data";
./download-test-data.sh /var/www/html
# NOTE(review): download-test-data.sh already creates these directories and
# chowns them relative to its destination argument — confirm the duplication
# here is intentional.
mkdir /var/www/html/node0003.alexandria.org/nodes
mkdir /var/www/html/node0003.alexandria.org/nodes/test0001
mkdir /var/www/html/node0003.alexandria.org/upload-tmp
chown -R www-data:www-data /var/www/html/node0003.alexandria.org
/etc/init.d/nginx restart
# Fetch and build the third party dependencies.
./download-deps.sh
./build-deps.sh
================================================
FILE: scripts/install-deps.sh
================================================
#!/bin/bash
# Install all system packages required to build and run alexandria: build
# toolchain (make, cmake, gcc/g++ 10), curl/SSL/crypto and boost development
# libraries, and the FastCGI stack (libfcgi, spawn-fcgi) served by nginx.
apt-get install -y zip make cmake gcc-10 g++-10 gcc g++ libcurl4-openssl-dev libssl-dev libcrypto++-dev libboost-iostreams-dev libboost-filesystem-dev libboost-system-dev libboost-test-dev libfcgi-dev spawn-fcgi nginx
================================================
FILE: scripts/packager.sh
================================================
#!/bin/bash
# Copyright 2018-present Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://aws.amazon.com/apache2.0
#
# or in the "license" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.
# Modified by Josef Cullhed 2021
set -euo pipefail

# Print usage information for this script.
print_help() {
	echo -e "Usage: packager [OPTIONS] <binary name>\n"
	echo -e "OPTIONS\n"
	echo -e "\t-d,--default-libc\t Use the target host libc libraries. This will not package the C library files.\n"
}

if [ $# -lt 1 ]; then
	echo -e "Error: missing arguments\n"
	print_help
	exit 1
fi

# Parse options; anything unrecognized is preserved as a positional argument.
POSITIONAL=()
INCLUDE_LIBC=true
while [[ $# -gt 0 ]]
do
	key="$1"
	case $key in
		-d|--default-libc)
			# Rely on the target host's libc instead of packaging it.
			INCLUDE_LIBC=false
			shift # past argument
			;;
		*) # unknown option
			POSITIONAL+=("$1") # save it in an array for later
			shift # past argument
			;;
	esac
done
set -- "${POSITIONAL[@]}" # restore positional parameters

# First positional argument: directory containing the built binaries.
PKG_BIN_PATH=$1
architecture=$(arch)
if [ ! -d "$PKG_BIN_PATH" ]; then
	echo "$PKG_BIN_PATH" - No such directory.;
	exit 1;
fi
if ! type zip > /dev/null 2>&1; then
	echo "zip utility is not found. Please install it and re-run this script"
	exit 1
fi
# Print the glibc shared-object files owned by pacman (Arch/Manjaro only);
# prints nothing on other distributions.
function package_libc_via_pacman {
	if grep --extended-regexp "Arch Linux|Manjaro Linux" < /etc/os-release > /dev/null 2>&1; then
		if type pacman > /dev/null 2>&1; then
			# keep only *.so and *.so.N entries from the package file list
			pacman --query --list --quiet glibc | sed -E '/\.so$|\.so\.[0-9]+$/!d'
		fi
	fi
}
# Print the libc6 shared-object files owned by dpkg (Debian/Ubuntu only);
# prints nothing where dpkg-query is unavailable.
function package_libc_via_dpkg() {
	if type dpkg-query > /dev/null 2>&1; then
		if [[ $(dpkg-query --listfiles libc6 | wc -l) -gt 0 ]]; then
			# keep only *.so and *.so.N entries from the package file list
			dpkg-query --listfiles libc6 | sed -E '/\.so$|\.so\.[0-9]+$/!d'
		fi
	fi
}
# Print the glibc shared-object files owned by rpm (Fedora/RHEL only);
# prints nothing where rpm is unavailable.
function package_libc_via_rpm() {
	if type rpm > /dev/null 2>&1; then
		if [[ $(rpm --query --list glibc.$architecture | wc -l) -gt 1 ]]; then
			# keep only *.so and *.so.N entries from the package file list
			rpm --query --list glibc.$architecture | sed -E '/\.so$|\.so\.[0-9]+$/!d'
		fi
	fi
}
# Membership test: succeeds (exit 0) when the first argument equals any of
# the remaining arguments, fails (exit 1) otherwise.
# Usage: hasElement "needle" ${haystack[@]}
function hasElement() {
	local candidate needle=$1
	shift
	for candidate in "$@"; do
		if [[ "$candidate" == "$needle" ]]; then
			return 0
		fi
	done
	return 1
}
PKG_BIN_FILENAME=alexandria
PKG_DIR=tmp
PKG_LD=""
# Every shared library the server binary links against (path column of ldd).
list=$(ldd "$PKG_BIN_PATH/server" | awk '{print $(NF-1)}')
# Union of libc files reported by whichever package manager is present.
libc_libs=()
libc_libs+=($(package_libc_via_dpkg))
libc_libs+=($(package_libc_via_rpm))
libc_libs+=($(package_libc_via_pacman))
mkdir -p "$PKG_DIR/bin" "$PKG_DIR/lib"
for i in $list
do
	if [[ ! -f $i ]]; then # ignore linux-vdso.so.1
		continue
	fi
	# Do not copy libc files which are directly linked unless it's the dynamic loader
	if hasElement "$i" "${libc_libs[@]}"; then
		filename=$(basename "$i")
		if [[ -z "${filename##ld-*}" ]]; then
			PKG_LD=$filename # Use this file as the loader
			cp "$i" "$PKG_DIR/lib"
		fi
		continue
	fi
	cp "$i" $PKG_DIR/lib
done
if [[ $INCLUDE_LIBC == true ]]; then
	for i in "${libc_libs[@]}"
	do
		filename=$(basename "$i")
		if [[ -z "${filename##ld-*}" ]]; then
			# if the loader is empty, then the binary is probably linked to a symlink of the loader. The symlink will
			# not show up when querying the package manager for libc files. So, in this case, we want to copy the loader
			if [[ -z "$PKG_LD" ]]; then
				PKG_LD=$filename
				cp "$i" "$PKG_DIR/lib" # we want to follow the symlink (default behavior)
			fi
			continue # We don't want the dynamic loader's symlink because its target is an absolute path (/lib/ld-*).
		fi
		cp --no-dereference "$i" "$PKG_DIR/lib"
	done
fi
if [[ -z "$PKG_LD" ]]; then
	echo "Failed to identify, locate or package the loader. Please file an issue on Github!" 1>&2
	exit 1
fi
# Launcher scripts that run each binary through the packaged dynamic loader
# with the packaged libraries, so the zip is self-contained.
# NOTE(review): "ulimit -n 104857" looks like a truncated 1048576 — confirm.
bootstrap_script_server=$(cat <<EOF
#!/bin/bash
set -euo pipefail
ulimit -n 104857
ALEXANDRIA_LIVE=1 ALEXANDRIA_CONFIG=/etc/alexandria.conf nice -n -20 ./lib/$PKG_LD --library-path ./lib ./bin/server
EOF
)
bootstrap_script_scraper=$(cat <<EOF
#!/bin/bash
set -euo pipefail
ulimit -n 104857
ALEXANDRIA_LIVE=1 ALEXANDRIA_CONFIG=/etc/alexandria.conf nice -n -20 ./lib/$PKG_LD --library-path ./lib ./bin/scraper
EOF
)
bootstrap_script_indexer=$(cat <<EOF
#!/bin/bash
set -euo pipefail
ulimit -n 104857
ALEXANDRIA_LIVE=1 ALEXANDRIA_CONFIG=/etc/alexandria.conf ./lib/$PKG_LD --library-path ./lib ./bin/indexer \$@
EOF
)
bootstrap_script_alexandria=$(cat <<EOF
#!/bin/bash
set -euo pipefail
ulimit -n 104857
ALEXANDRIA_LIVE=1 ALEXANDRIA_CONFIG=/etc/alexandria.conf ./lib/$PKG_LD --library-path ./lib ./bin/alexandria \$@
EOF
)
# Copy binaries and node administration scripts into the package tree.
cp "$PKG_BIN_PATH/server" "$PKG_DIR/bin"
cp "$PKG_BIN_PATH/scraper" "$PKG_DIR/bin"
cp "$PKG_BIN_PATH/indexer" "$PKG_DIR/bin"
cp "$PKG_BIN_PATH/alexandria" "$PKG_DIR/bin"
cp "$PKG_BIN_PATH/../scripts/bootstrap_node_2drives.sh" "$PKG_DIR/"
cp "$PKG_BIN_PATH/../scripts/truncate.sh" "$PKG_DIR/"
cp "$PKG_BIN_PATH/../scripts/update.sh" "$PKG_DIR/"
chmod +x "$PKG_DIR/bootstrap_node_2drives.sh"
chmod +x "$PKG_DIR/truncate.sh"
chmod +x "$PKG_DIR/update.sh"
# Write the launcher scripts and make them executable.
echo -e "$bootstrap_script_server" > "$PKG_DIR/server"
echo -e "$bootstrap_script_scraper" > "$PKG_DIR/scraper"
echo -e "$bootstrap_script_indexer" > "$PKG_DIR/indexer"
echo -e "$bootstrap_script_alexandria" > "$PKG_DIR/alexandria"
chmod +x "$PKG_DIR/server"
chmod +x "$PKG_DIR/scraper"
chmod +x "$PKG_DIR/indexer"
chmod +x "$PKG_DIR/alexandria"
# some shenanigans to create the right layout in the zip file without extraneous directories
pushd "$PKG_DIR" > /dev/null
zip --symlinks --recurse-paths "$PKG_BIN_FILENAME".zip -- *
ORIGIN_DIR=$(dirs -l +1)
mv "$PKG_BIN_FILENAME".zip "$ORIGIN_DIR"
popd > /dev/null
rm -r "$PKG_DIR"
echo Created "$ORIGIN_DIR/$PKG_BIN_FILENAME".zip
================================================
FILE: scripts/prepare-output-dirs.sh
================================================
#!/bin/bash
# Wipe and recreate an empty directory layout for every shard 0..7.
cd "$(dirname "$0")"
cd ..
for shard_id in $(seq 0 7); do
	shard="/mnt/$shard_id"
	rm -r "$shard"
	mkdir "$shard"
	for sub in input output upload hash_table full_text tmp; do
		mkdir "$shard/$sub"
	done
done
================================================
FILE: scripts/truncate.sh
================================================
#!/bin/bash
# Empty every shard and recreate its directory layout, owned by the
# alexandria service user.
cd "$(dirname "$0")"
cd ..
for shard in $(seq 0 7); do
	rm -r "/mnt/$shard/"*
	for sub in input output upload hash_table full_text tmp; do
		mkdir "/mnt/$shard/$sub"
	done
done
chown -R alexandria /mnt/*
================================================
FILE: scripts/update.sh
================================================
#!/bin/bash
# Download the latest alexandria release next to this script and unpack it
# over the current installation (-o overwrites existing files in place).
cd `dirname $0`
wget https://github.com/alexandria-org/alexandria/releases/latest/download/alexandria.zip -O alexandria.zip
unzip -o alexandria.zip
================================================
FILE: src/URL.cpp
================================================
/*
* MIT License
*
* Alexandria.org
*
* Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "URL.h"
#include "algorithm/hash.h"
#include "parser/parser.h"
#include <curl/curl.h>
#include "text/text.h"
#include "warc/tlds.h"
using namespace std;
/*
 * Default constructor: an empty, "successfully parsed" URL.
 */
URL::URL() {
	m_status = ::parser::OK;
	// Fix: m_has_www was previously left uninitialized, so has_www() on a
	// default-constructed URL read an indeterminate value.
	m_has_www = false;
}
/*
 * Copy constructor: field-by-field copy of the already-parsed state.
 * No re-parsing takes place.
 */
URL::URL(const URL &url) {
	m_url_string = url.m_url_string;
	m_host = url.m_host;
	m_host_reverse = url.m_host_reverse;
	m_scheme = url.m_scheme;
	m_path = url.m_path;
	m_query = url.m_query;
	m_status = url.m_status;
	m_has_www = url.m_has_www;
}
/*
 * Construct from a full URL string; parse() fills in the components and
 * records success/failure in m_status.
 */
URL::URL(const string &url) :
	m_url_string(url)
{
	m_status = parse();
}
/*
 * Construct directly from host + path without running the parser.
 * The scheme is hard-coded to "http" in the rebuilt URL string.
 */
URL::URL(const string &host, const string &path) :
	m_url_string("http://" + host + path), m_host(host), m_path(path),
	// Fix: m_has_www was previously left uninitialized on this path since
	// remove_www() never runs here.
	m_has_www(false)
{
	m_host_reverse = URL::host_reverse(m_host);
	m_status = ::parser::OK;
}
URL::~URL() {
}
// Replace the wrapped URL string and re-parse it into components.
void URL::set_url_string(const string &url) {
	m_url_string = url;
	m_status = parse();
}
// The raw URL string exactly as given (or as rebuilt by rebuild_url_str()).
string URL::str() const {
	return m_url_string;
}
// Canonicalization key: host + path + query concatenated WITHOUT the "?"
// separator (see the note below about changing this).
string URL::key() const {
	/*
	 * We should probably change this to:
	 * return m_host + path_with_query();
	 * but we need to do it later..
	 */
	return m_host + m_path + m_query;
}
// String fed into hash(): host plus path (and "?query" when present).
// Scheme and the stripped "www." prefix do not participate.
string URL::hash_input() const {
	return m_host + path_with_query();
}
// 64-bit identity hash of this URL.
uint64_t URL::hash() const {
	return ::algorithm::hash(hash_input());
}
// 64-bit hash of the host only (used e.g. for cluster sharding).
uint64_t URL::host_hash() const {
	return ::algorithm::hash(m_host);
}
// Hash identifying a (source host -> target URL) link.
// NOTE(review): link_text is currently unused in the hash input.
uint64_t URL::link_hash(const URL &target_url, const string &link_text) const {
	return ::algorithm::hash(host() + target_url.str());
}
// Hash identifying a (source host -> target host) link.
// NOTE(review): link_text is currently unused in the hash input.
uint64_t URL::domain_link_hash(const URL &target_url, const string &link_text) const {
	return ::algorithm::hash(host() + target_url.host());
}
// Two URLs are canonically different when their key() values differ.
bool URL::canonically_different(const URL &url) const {
	return key() != url.key();
}
bool URL::has_https() const {
	return m_scheme == "https";
}
// True when the original host started with "www." (see remove_www()).
bool URL::has_www() const {
	return m_has_www;
}
// Host with any leading "www." already stripped.
string URL::host() const {
	return m_host;
}
// Best-effort registrable-domain extraction: "news.bbc.co.uk" -> "bbc.co.uk",
// "a.b.example.com" -> "example.com". Only "co.uk" and ".au" second-level
// domains are special-cased; other multi-label TLDs are not handled.
// When no dot is found, find_last_of returns npos and the npos arithmetic
// (npos + 1 == 0, find_last_of with a huge pos scans the whole string) makes
// the function fall through to returning m_host unchanged.
string URL::host_top_domain() const {
	vector<string> parts; // NOTE(review): unused local, kept as-is.
	std::string_view host(m_host);
	size_t pos1 = host.find_last_of(".");
	if (host.substr(pos1 + 1) == "uk") {
		// Keep three labels, but only for the "co.uk" second level.
		pos1 = host.find_last_of(".", pos1 - 1);
		if (host.substr(pos1 + 1) != "co.uk") {
			return m_host;
		}
	} else if (host.substr(pos1 + 1) == "au") {
		// Australian hosts always carry a second level (com.au, net.au, ...).
		pos1 = host.find_last_of(".", pos1 - 1);
	}
	size_t pos2 = host.find_last_of(".", pos1 - 1);
	if (pos2 == string::npos) {
		return m_host;
	}
	return m_host.substr(pos2 + 1);
}
string URL::scheme() const {
	return m_scheme;
}
// Host with its labels reversed (and "www." stripped), e.g. "com.example".
string URL::host_reverse() const {
	return m_host_reverse;
}
string URL::path() const {
	return m_path;
}
// Path plus "?query" when a query string is present.
string URL::path_with_query() const {
	if (m_query.size() > 0) {
		return m_path + "?" + m_query;
	} else {
		return m_path;
	}
}
// Parse the query string into key -> url-decoded value. Keys without a
// value ("?flag") are dropped; repeated keys keep the last value; values
// containing "=" are truncated at the second "=".
map<string, string> URL::query() const {
	map<string, string> ret;
	vector<string> parts;
	boost::split(parts, m_query, boost::is_any_of("&"));
	for (const string &part : parts) {
		vector<string> pair;
		boost::split(pair, part, boost::is_any_of("="));
		if (pair.size() > 1) {
			ret[pair[0]] = parser::urldecode(pair[1]);
		}
	}
	return ret;
}
// Stub: harmonic centrality is not wired into URL yet; always returns 0.
float URL::harmonic() const {
	return 0.0f;
}
/*
 * Reverse the dot-separated labels of a host: "a.b.c" -> "c.b.a".
 * Empty labels are preserved, exactly like the previous split/join version.
 */
string URL::host_reverse(const string &host) {
	// Collect the labels left to right.
	std::vector<std::string> labels;
	size_t begin = 0;
	while (true) {
		const size_t dot = host.find('.', begin);
		if (dot == std::string::npos) {
			labels.push_back(host.substr(begin));
			break;
		}
		labels.push_back(host.substr(begin, dot - begin));
		begin = dot + 1;
	}
	// Join them back together in reverse order.
	std::string joined;
	for (auto it = labels.rbegin(); it != labels.rend(); ++it) {
		if (it != labels.rbegin()) joined += '.';
		joined += *it;
	}
	return joined;
}
// Reverse the host but keep at most its last two labels, e.g.
// "a.b.example.com" -> "com.example".
string URL::host_reverse_top_domain(const string &host) {
	/*
	 * This algorithm is OK since we only run on these tlds:
	 * {"se", "com", "nu", "net", "org", "gov", "edu", "info"}
	 * */
	vector<string> parts;
	boost::split(parts, host, boost::is_any_of("."));
	if (parts.size() > 2) {
		// Keep only (second-level, top-level) before reversing.
		parts = {parts[parts.size() - 2], parts[parts.size() - 1]};
	}
	reverse(parts.begin(), parts.end());
	return boost::algorithm::join(parts, ".");
}
// The label just before the TLD: "www.example.com" -> "example".
// Returns "" when the host contains no dot.
string URL::domain_without_tld() const {
	vector<string> parts;
	boost::split(parts, m_host, boost::is_any_of("."));
	if (parts.size() > 1) {
		return parts[parts.size() - 2];
	}
	return "";
}
// Length in bytes of the full URL string.
uint32_t URL::size() const {
	return str().size();
}
// Change the scheme and rebuild m_url_string to match.
void URL::set_scheme(const string &scheme) {
	m_scheme = scheme;
	rebuild_url_str();
}
// Toggle the "www." prefix and rebuild m_url_string to match.
void URL::set_www(bool has_www) {
	m_has_www = has_www;
	rebuild_url_str();
}
// Member-wise copy assignment; safe under self-assignment since
// std::string handles it.
URL &URL::operator=(const URL &other) {
	m_url_string = other.m_url_string;
	m_host = other.m_host;
	m_host_reverse = other.m_host_reverse;
	m_scheme = other.m_scheme;
	m_path = other.m_path;
	m_query = other.m_query;
	m_status = other.m_status;
	m_has_www = other.m_has_www;
	return *this;
}
// Read one whitespace-delimited token from the stream and parse it as a URL.
istream &operator >>(istream &ss, URL &url) {
	ss >> (url.m_url_string);
	url.m_status = url.parse();
	return ss;
}
// Stream out the raw URL string.
ostream &operator <<(ostream& os, const URL& url) {
	os << url.m_url_string;
	return os;
}
int URL::parse() {
CURLU *h = curl_url();
if (!h) return ::parser::ERROR;
CURLUcode uc = curl_url_set(h, CURLUPART_URL, m_url_string.c_str(), 0);
if (uc) {
curl_url_cleanup(h);
return ::parser::ERROR;
}
char *chost;
uc = curl_url_get(h, CURLUPART_HOST, &chost, 0);
if (!uc) {
m_host = chost;
remove_www(m_host);
curl_free(chost);
}
char *scheme;
uc = curl_url_get(h, CURLUPART_SCHEME, &scheme, 0);
if (!uc) {
m_scheme = scheme;
curl_free(scheme);
}
char *cpath;
uc = curl_url_get(h, CURLUPART_PATH, &cpath, 0);
if (!uc) {
m_path = cpath;
curl_free(cpath);
}
char *cquery;
uc = curl_url_get(h, CURLUPART_QUERY, &cquery, 0);
if (!uc) {
m_query = cquery;
curl_free(cquery);
}
curl_url_cleanup(h);
m_host_reverse = URL::host_reverse(m_host);
return ::parser::OK;
}
// Recompose m_url_string from the parsed components, re-adding the "www."
// prefix when the original host carried one.
void URL::rebuild_url_str() {
	m_url_string = m_scheme + "://" + (m_has_www ? "www." : "") + m_host + path_with_query();
}
// Strip a leading "www." from the host in place, remembering whether it was
// there in m_has_www, then trim surrounding whitespace.
inline void URL::remove_www(string &path) {
	m_has_www = (path.compare(0, 4, "www.") == 0);
	if (m_has_www) {
		path.erase(0, 4);
	}
	text::trim(path);
}
================================================
FILE: src/URL.h
================================================
/*
* MIT License
*
* Alexandria.org
*
* Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#pragma once
#include "config.h"
#include <iostream>
#include <functional>
#include <map>
#include <boost/algorithm/string/join.hpp>
/*
 * Value type representing a parsed URL. Parsing is delegated to libcurl's
 * URL API (see URL.cpp); a leading "www." is stripped from the host and
 * remembered in m_has_www instead of being kept in m_host.
 */
class URL {
public:
	URL();
	URL(const URL &url);
	explicit URL(const std::string &url);
	// Builds "http://" + host + path without running the parser.
	explicit URL(const std::string &host, const std::string &path);
	~URL();
	// "a.b.c" -> "c.b.a".
	static std::string host_reverse(const std::string &host);
	// Like host_reverse() but keeps at most the last two labels.
	static std::string host_reverse_top_domain(const std::string &host);
	void set_url_string(const std::string &url);
	std::string str() const;
	// Canonicalization key: host + path + query (no "?" separator).
	std::string key() const;
	// Input string fed to hash(): host + path_with_query().
	std::string hash_input() const;
	uint64_t hash() const;
	uint64_t host_hash() const;
	// NOTE(review): link_text is unused by the current implementations.
	uint64_t link_hash(const URL &target_url, const std::string &link_text) const;
	uint64_t domain_link_hash(const URL &target_url, const std::string &link_text) const;
	bool canonically_different(const URL &url) const;
	bool has_https() const;
	bool has_www() const;
	std::string host() const;
	std::string host_top_domain() const;
	std::string scheme() const;
	std::string path() const;
	std::string path_with_query() const;
	std::map<std::string, std::string> query() const;
	std::string host_reverse() const;
	std::string domain_without_tld() const;
	uint32_t size() const;
	void set_scheme(const std::string &scheme);
	void set_www(bool has_www);
	// Stub: always returns 0.0f.
	float harmonic() const;
	// Which cluster node is responsible for this URL's host.
	size_t index_on_node() const {
		return host_hash() % config::nodes_in_cluster;
	}
	URL &operator=(const URL &other);
	friend std::istream &operator >>(std::istream &ss, URL &url);
	friend std::ostream &operator <<(std::ostream& os, const URL& url);
private:
	std::string m_url_string;
	std::string m_host;          // host without any leading "www."
	std::string m_host_reverse;  // host labels reversed
	std::string m_scheme;
	std::string m_path;
	std::string m_query;         // query string without the leading "?"
	int m_status;                // ::parser::OK or ::parser::ERROR from parse()
	bool m_has_www;              // true when the original host began with "www."
	int parse();
	void rebuild_url_str();
	inline void remove_www(std::string &path);
};
================================================
FILE: src/alexandria.cpp
================================================
/*
* MIT License
*
* Alexandria.org
*
* Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include <iostream>
#include <sstream>
#include <numeric>
#include "logger/logger.h"
#include "downloader/warc_downloader.h"
#include "downloader/merge_downloader.h"
#include "URL.h"
#include "hash_table2/hash_table.h"
#include "hash_table2/hash_table_shard_builder.h"
#include "indexer/index.h"
#include "indexer/index_builder.h"
#include "indexer/value_record.h"
#include "algorithm/hyper_ball.h"
#include "utils/thread_pool.hpp"
#include "file/file.h"
#include "http/server.h"
#include "parser/parser.h"
#include <boost/algorithm/string.hpp>
using namespace std;
// Dump the bundled command-line documentation to stdout.
void help() {
	std::cout << file::cat("../documentation/alexandria.md") << std::endl;
}
/*
 * Command-line entry point. Loads the config file named by the
 * ALEXANDRIA_CONFIG environment variable (falling back to
 * /etc/alexandria.conf), then dispatches on argv[1]. With no arguments, or an
 * unknown flag, the documentation is printed.
 */
int main(int argc, const char **argv) {
	logger::start_logger_thread();
	logger::verbose(true);
	if (getenv("ALEXANDRIA_CONFIG") != NULL) {
		config::read_config(getenv("ALEXANDRIA_CONFIG"));
	} else {
		config::read_config("/etc/alexandria.conf");
	}
	if (argc < 2) {
		help();
		return 0;
	}
	const string arg(argc > 1 ? argv[1] : "");
	if (arg == "--hash-table-url" && argc > 2) {
		// Look up one URL in the hash table; print its version and payload.
		URL url(argv[2]);
		hash_table2::hash_table ht("all_urls", 1019, 1000000, "/slow_data");
		size_t ver = 0;
		std::string data = ht.find(url.hash(), ver);
		std::cout << ver << std::endl;
		std::cout << data << std::endl;
	} else if (arg == "--hash-table-url-hash" && argc > 2) {
		// Same lookup, but from a precomputed 64-bit URL hash.
		uint64_t url_hash = std::stoull(argv[2]);
		hash_table2::hash_table ht("all_urls", 1019, 1000000, "/slow_data");
		size_t ver = 0;
		std::string data = ht.find(url_hash, ver);
		std::cout << ver << std::endl;
		std::cout << data << std::endl;
	} else if (arg == "--hash-table-count" && argc == 2) {
		// Fix: this branch previously matched "--hash-table-count" for ANY
		// argc, which made the "--hash-table-count && argc > 2" branch below
		// unreachable dead code. Restricting it to the bare form makes the
		// per-domain counting variant reachable again.
		hash_table2::hash_table ht("all_urls", 1019, 1000000, "/slow_data");
		std::cout << ht.size() << std::endl;
	} else if (arg == "--hash-table-find-all" && argc > 2) {
		hash_table2::hash_table ht("all_urls", 1019, 1000000, "/slow_data");
		// Put given hosts in array with hashes to search for.
		std::vector<uint64_t> search_for;
		for (int i = 2; i < argc; i++) {
			search_for.push_back(URL(string("https://") + argv[i]).host_hash());
		}
		// Scan the whole table and print entries whose host matches.
		ht.for_each([&search_for](uint64_t key, std::string value) {
			URL url(value.substr(0, value.find("\t")));
			const auto my_host_hash = url.host_hash();
			for (const auto &host_hash : search_for) {
				if (host_hash == my_host_hash) {
					std::cout << key << "\t" << url.str() << std::endl;
					break;
				}
			}
		});
	} else if (arg == "--hash-table-count" && argc > 2) {
		// Count hash table entries per domain listed in domains.txt.
		std::string data = file::cat("domains.txt");
		std::vector<std::string> lines;
		boost::split(lines, data, boost::is_any_of("\n"));
		std::map<std::string, uint64_t> domains;
		std::map<uint64_t, size_t> domain_counts;
		std::vector<std::string> domain_list;
		for (const auto &line : lines) {
			if (line == "") continue;
			const std::string reversed = URL::host_reverse(line);
			std::cout << reversed << std::endl;
			const uint64_t domain_hash = URL(string("https://") + reversed).host_hash();
			domains[reversed] = domain_hash;
			domain_counts[domain_hash] = 0;
			domain_list.push_back(reversed);
		}
		hash_table2::hash_table ht("all_urls", 1019, 1000000, "/slow_data");
		uint64_t thelazy_host_hash = URL(string("https://") + argv[2]).host_hash();
		ht.for_each([thelazy_host_hash, &domain_counts](uint64_t key, std::string value) {
			URL url(value.substr(0, value.find("\t")));
			const auto my_host_hash = url.host_hash();
			for (auto &iter : domain_counts) {
				if (iter.first == my_host_hash) {
					domain_counts[iter.first]++;
					break;
				}
			}
			/*if (url.host_hash() == thelazy_host_hash) {
				std::cout << key << " => " << url.str() << std::endl;
			}*/
		});
		for (auto &domain : domain_list) {
			std::cout << domain << "\t" << domain_counts[domains[domain]] << std::endl;
		}
	} else if (arg == "--hash-table-optimize-shard" && argc > 2) {
		// Run shard optimization on a single hash table shard.
		size_t shard_id = std::stoull(argv[2]);
		hash_table2::hash_table_shard_builder ht_shard("all_urls", shard_id, 1000000, "/slow_data");
		ht_shard.optimize();
	} else if (arg == "--internal-harmonic") {
		// Experimental: harmonic centrality over one internal-link index file.
		profiler::instance prof_total("total");
		/*
		std::vector<std::string> all_files;
		file::read_directory("/mnt/0/full_text/internal_links", [&all_files](const std::string &filename) {
			all_files.push_back(filename);
		});
		size_t done_with = 0;
		profiler::instance prof("total");
		for (const auto &filename : all_files) {
			// Read the file.
			std::ifstream infile("/mnt/0/full_text/internal_links/" + filename, std::ios::binary);
			std::string infile_data(std::istreambuf_iterator<char>(infile), {});
			infile.close();
			std::istringstream reader(infile_data);
			indexer::index<indexer::value_record> idx(&reader, 1000);
			// Create vertices vector
			std::vector<uint64_t> vertices;
			std::map<uint64_t, uint64_t> vertex_map;
			size_t record_id = 0;
			for (const auto &record : idx.records()) {
				vertices.push_back(record.m_value);
				vertex_map[record.m_value] = record_id;
				record_id++;
			}
			std::vector<roaring::Roaring> edge_map(vertices.size());
			// Populate edge map
			idx.for_each([&edge_map, &vertex_map, &vertices, &record_id](uint64_t key, roaring::Roaring &bitmap) {
				if (vertex_map.count(key) == 0) {
					vertices.push_back(key);
					edge_map.push_back(roaring::Roaring());
					vertex_map[key] = record_id;
					record_id++;
				}
				edge_map[vertex_map[key]] = std::move(bitmap);
			});
			// Calculate harmonic centrality on graph.
			if (vertices.size() > 500) {
				auto harmonic = algorithm::hyper_ball(vertices.size(), edge_map.data());
			}
			// Sort the results a bit.
			std::vector<size_t> sorted(harmonic.size());
			std::iota(sorted.begin(), sorted.end(), 0);
			std::sort(sorted.begin(), sorted.end(), [&harmonic] (const auto &a, const auto &b) {
				return harmonic[a] > harmonic[b];
			});
			done_with++;
			float percent = ((float)done_with / all_files.size()) * 100.0f;
			float elapsed_milliseconds = prof.get();
			size_t items_left = all_files.size() - done_with;
			float milliseconds_per_file = elapsed_milliseconds/done_with;
			float milliseconds_left = milliseconds_per_file * items_left;
			float hours_left = milliseconds_left / (1000.0f * 3600.0f);
			std::cout << "done with " << done_with << " out of " << all_files.size() << " (" <<
				percent << "% done) time left: " << hours_left << " hours"<< std::endl;
		}
		return 0;*/
		// load the file
		std::string content = file::cat("multiple_domains.tsv");
		std::vector<std::string> lines;
		boost::split(lines, content, boost::is_any_of("\n"));
		std::vector<std::vector<std::string>> csv_data;
		for (auto line : lines) {
			std::vector<std::string> cols;
			boost::split(cols, line, boost::is_any_of("\t"));
			if (cols.size() > 1) {
				if (URL(cols[1]).host_hash() == URL("http://abc13.com").host_hash()) {
					csv_data.push_back(cols);
				}
			}
		}
		profiler::instance prof_load("load");
		//std::ifstream infile("/mnt/5/full_text/internal_links/3492248666075096845.data", std::ios::binary);
		std::ifstream infile("/mnt/6/full_text/internal_links/12854855988816217414.data", std::ios::binary);
		std::string infile_data(std::istreambuf_iterator<char>(infile), {});
		infile.close();
		std::istringstream reader(infile_data);
		indexer::index<indexer::value_record> idx(&reader, 1000);
		prof_load.stop();
		profiler::instance prof("make vertices");
		// Map vertex values to dense ids so the edge map can be an array.
		std::vector<uint64_t> vertices;
		std::map<uint64_t, uint64_t> vertex_map;
		size_t record_id = 0;
		for (const auto &record : idx.records()) {
			vertices.push_back(record.m_value);
			vertex_map[record.m_value] = record_id;
			record_id++;
		}
		std::vector<roaring::Roaring> edge_map(vertices.size());
		idx.for_each([&edge_map, &vertex_map, &vertices, &record_id](uint64_t key, roaring::Roaring &bitmap) {
			if (vertex_map.count(key) == 0) {
				vertices.push_back(key);
				edge_map.push_back(roaring::Roaring());
				vertex_map[key] = record_id;
				record_id++;
			}
			edge_map[vertex_map[key]] = std::move(bitmap);
		});
		prof.stop();
		profiler::instance prof2("run hyper_ball");
		auto harmonic = algorithm::hyper_ball(vertices.size(), edge_map.data());
		prof2.stop();
		prof_total.stop();
		// Rank vertices by descending harmonic centrality.
		std::vector<size_t> sorted(harmonic.size());
		std::iota(sorted.begin(), sorted.end(), 0);
		std::sort(sorted.begin(), sorted.end(), [&harmonic] (const auto &a, const auto &b) {
			return harmonic[a] > harmonic[b];
		});
		std::map<uint64_t, double> harmonic_by_url;
		for (size_t i = 0; i < harmonic.size(); i++) {
			harmonic_by_url[vertices[sorted[i]]] = harmonic[sorted[i]] / vertices.size();
		}
		for (auto row : csv_data) {
			uint64_t url_hash = stoull(row[0]);
			double harmonic = harmonic_by_url[url_hash];
			std::cout << row[0] << "\t" << row[1] << "\t" << harmonic << std::endl;
		}
		/*
		profiler::instance prof_load("load");
		//std::ifstream infile("/mnt/5/full_text/internal_links/3492263685688109621.data", std::ios::binary);
		//std::ifstream infile("/mnt/5/full_text/internal_links/3492528524383210893.data", std::ios::binary);
		//std::ifstream infile("/mnt/0/full_text/internal_links/7131549202223940368.data", std::ios::binary);
		std::ifstream infile("/mnt/0/full_text/internal_links/10401139885298228528.data", std::ios::binary);
		std::string infile_data(std::istreambuf_iterator<char>(infile), {});
		infile.close();
		std::istringstream reader(infile_data);
		indexer::index<indexer::value_record> idx(&reader, 1000);
		prof_load.stop();
		profiler::instance prof("make vertices");
		std::vector<uint64_t> vertices;
		std::map<uint64_t, uint64_t> vertex_map;
		size_t record_id = 0;
		for (const auto &record : idx.records()) {
			vertices.push_back(record.m_value);
			vertex_map[record.m_value] = record_id;
			record_id++;
		}
		std::vector<roaring::Roaring> edge_map(vertices.size());
		idx.for_each([&edge_map, &vertex_map, &vertices, &record_id](uint64_t key, roaring::Roaring &bitmap) {
			if (vertex_map.count(key) == 0) {
				vertices.push_back(key);
				edge_map.push_back(roaring::Roaring());
				vertex_map[key] = record_id;
				record_id++;
			}
			edge_map[vertex_map[key]] = std::move(bitmap);
		});
		prof.stop();
		profiler::instance prof2("run hyper_ball");
		auto harmonic = algorithm::hyper_ball(vertices.size(), edge_map.data());
		prof2.stop();
		prof_total.stop();
		std::vector<size_t> sorted(harmonic.size());
		std::iota(sorted.begin(), sorted.end(), 0);
		std::sort(sorted.begin(), sorted.end(), [&harmonic] (const auto &a, const auto &b) {
			return harmonic[a] > harmonic[b];
		});
		//for (size_t i = 0; i < harmonic.size(); i++) {
			//std::cout << "vertex: " << vertices[sorted[i]] << " has harmonic: " << harmonic[sorted[i]] << std::endl;
		//}
		*/
	} else if (arg == "--url-server") {
		// Spin up a simple url server.
		hash_table2::hash_table ht("all_urls", 1019, 1000000, "/slow_data");
		http::server url_server([&ht](auto request) {
			http::response res;
			URL url = request.url();
			auto query = url.query();
			URL find_url(parser::urldecode(query["url"]));
			size_t ver;
			const auto find_str = ht.find(find_url.hash(), ver);
			if (find_str == "") {
				res.code(404);
				res.body("Not found 404");
			} else {
				res.code(200);
				res.body(find_str);
			}
			return res;
		});
	} else {
		help();
	}
	logger::join_logger_thread();
	return 0;
}
================================================
FILE: src/algorithm/algorithm.cpp
================================================
/*
* MIT License
*
* Alexandria.org
*
* Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "algorithm.h"
#include "profiler/profiler.h"
#include <iostream>
#include <set>
#include <numeric>
#include <map>
#include <math.h>
#include <cassert>
#include <future>
#include <cstring>
namespace algorithm {
/*
Returns partitions with indices that are smaller than the values in the dims vector.
For example:
dims = {2,2} gives {0,0}, {1,0}, {0,1}, {1,1}
dims = {2,3} gives {0,0}, {1,0}, {0,1}, {1,1}, {0,2}, {1,2}
*/
/*
 * Breadth-first enumeration of index tuples strictly below `dims`, capped at
 * `limit` unique tuples. Starting from the all-zero tuple, each known tuple
 * spawns copies with one coordinate incremented (staying < dims[i]). The
 * result is sorted by coordinate sum, then by largest coordinate, then
 * reverse-lexicographically, so "small" partitions come first.
 */
std::vector<std::vector<int>> incremental_partitions(const std::vector<int> &dims, size_t limit) {
	std::vector<std::vector<int>> res;       // expansion frontier (grows while iterated)
	std::set<std::vector<int>> uniq;         // all tuples generated so far
	std::vector<int> initial(dims.size(), 0);
	res.push_back(initial);
	uniq.insert(initial);
	for (size_t j = 0; j < res.size(); j++) {
		std::vector<int> vec = res[j];
		for (size_t i = 0; i < vec.size(); i++) {
			if (vec[i] < dims[i]-1) {
				std::vector<int> copy(vec);
				copy[i]++;
				// Fix: only queue tuples we have not seen before. Previously
				// duplicates were appended to res and re-expanded, doing
				// redundant work without changing the result (their children
				// were already in uniq).
				if (uniq.insert(copy).second) {
					res.push_back(copy);
				}
				if (uniq.size() >= limit) break;
			}
		}
		if (uniq.size() >= limit) break;
	}
	std::vector<std::vector<int>> ret(uniq.begin(), uniq.end());
	sort(ret.begin(), ret.end(), [](const std::vector<int> &a, const std::vector<int> &b) {
		int sum1 = accumulate(a.begin(), a.end(), 0);
		int sum2 = accumulate(b.begin(), b.end(), 0);
		if (sum1 == sum2) {
			int max1 = *max_element(a.begin(), a.end());
			int max2 = *max_element(b.begin(), b.end());
			if (max1 == max2) {
				return b < a;
			}
			return max1 < max2;
		}
		return sum1 < sum2;
	});
	return ret;
}
/*
Calculates the harmonic centrality for vertices and edges. The returning vector has the harmonic centrality for vertex i at position i.
The depth parameter is the maximum level to traverse in the neighbour tree.
The edges set contains pairs of edges (from vertex, to vertex)
*/
/*
* This is the inner outer loop for calculating harmonic centrality.
* */
/*
 * Inner loop for harmonic centrality: computes the centrality of vertices
 * [start, start + len) and returns them in order. edge_map[v] lists the
 * vertices with an edge INTO v (see set_to_edge_map). For each vertex a BFS
 * runs up to `depth` levels; every newly reached vertex at distance d
 * contributes 1/d to the score.
 */
std::vector<double> harmonic_centrality_subvector(size_t vlen, const std::vector<uint32_t> *edge_map,
		size_t depth, size_t start, size_t len) {
	// Fix: std::vector instead of bare new[]/delete[] — same layout and
	// behavior, but no leak if anything below throws.
	std::vector<char> all(vlen);       // visited flags, reset per vertex
	std::vector<uint32_t> level1(vlen);
	std::vector<uint32_t> level2(vlen);
	// Two ping-pong buffers holding the current and next BFS frontier.
	uint32_t *levels[2] = {level1.data(), level2.data()};
	size_t level_len[2] = {0, 0};
	std::vector<double> harmonics;
	harmonics.reserve(len);
	profiler::instance prof("Timetaker");
	for (size_t i = start; i < start + len; i++) {
		const uint32_t vertex = i;
		level_len[0] = 0;
		level_len[1] = 0;
		memset(all.data(), 0, vlen);
		levels[0][0] = vertex;
		level_len[0]++;
		all[vertex] = 1;
		double harmonic = 0.0;
		/*
		If we can assume the average number of incoming edges per vertex to be constant these loops should be O(1) in n.
		Example, if we have n = 10 000 000 vertices and 10 inbound edges on each vertex these loops should be
		(first loop is depth) X (worst case second loop is 10^depth) X (inner loop is 10)
		depth * 10^depth * 10
		independent of n
		*/
		size_t last_level = 0;
		size_t cur_level = 1;
		for (size_t level = 1; level <= depth; level++) {
			for (size_t j = 0; j < level_len[last_level]; j++) {
				const uint32_t v = levels[last_level][j];
				for (const uint32_t &edge : edge_map[v]) {
					if (!all[edge]) {
						levels[cur_level][level_len[cur_level]++] = edge;
						all[edge] = 1;
					}
				}
			}
			if (level_len[cur_level] == 0) break;
			harmonic += (double)level_len[cur_level] / level;
			// Swap the roles of the two frontier buffers.
			level_len[last_level] = 0;
			size_t tmp = last_level;
			last_level = cur_level;
			cur_level = tmp;
		}
		harmonics.push_back(harmonic);
	}
	return harmonics;
}
std::vector<double> harmonic_centrality(size_t vlen, const std::set<std::pair<uint32_t, uint32_t>> &edges, size_t depth) {
std::vector<double> harmonics;
std::vector<uint32_t> *edge_map = new std::vector<uint32_t>[vlen];
for (const auto &edge : edges) {
/*
second -> first mapping because we want to traverse the edges in the opposite direction of the edge. Incoming edges should increase
harmonic centrality of vertex.
*/
edge_map[edge.second].push_back(edge.first);
}
std::vector<double> ret = harmonic_centrality(vlen, edge_map, depth);
delete [] edge_map;
return ret;
}
// Single-threaded harmonic centrality over all vertices: one subvector
// covering the whole [0, vlen) range.
std::vector<double> harmonic_centrality(size_t vlen, const std::vector<uint32_t> *edge_map, size_t depth) {
	return harmonic_centrality_subvector(vlen, edge_map, depth, 0, vlen);
}
std::vector<double> harmonic_centrality_threaded(size_t vlen, const std::set<std::pair<uint32_t, uint32_t>> &edges, size_t depth,
size_t num_threads) {
std::vector<uint32_t> *edge_map = new std::vector<uint32_t>[vlen];
for (const auto &edge : edges) {
/*
second -> first mapping because we want to traverse the edges in the opposite direction of the edge. Incoming edges should increase
harmonic centrality of vertex.
*/
edge_map[edge.second].push_back(edge.first);
}
std::vector<double> ret = harmonic_centrality_threaded(vlen, edge_map, depth, num_threads);
delete [] edge_map;
return ret;
}
/*
 * Threaded harmonic centrality: splits [0, vlen) into num_threads contiguous
 * slices, runs harmonic_centrality_subvector on each via std::async, and
 * concatenates the partial results in slice order (so index i in the result
 * is vertex i's centrality).
 */
std::vector<double> harmonic_centrality_threaded(size_t vlen, const std::vector<uint32_t> *edge_map, size_t depth, size_t num_threads) {
	assert(vlen >= num_threads);
	std::vector<std::future<std::vector<double>>> threads;
	// Split the vertices into several vectors.
	const size_t max_len = ceil((double)vlen / num_threads);
	for (size_t i = 0; i < vlen; i += max_len) {
		// The last slice may be shorter than max_len.
		const size_t len = std::min(max_len, vlen - i);
		threads.emplace_back(std::async(std::launch::async, harmonic_centrality_subvector, vlen, edge_map, depth, i, len));
	}
	std::vector<double> harmonic;
	for (auto &thread : threads) {
		// get() blocks until that slice is finished.
		std::vector<double> part = thread.get();
		harmonic.insert(harmonic.end(), part.begin(), part.end());
	}
	return harmonic;
}
/*
 * Build, for every vertex, the list of vertices pointing AT it. The edge
 * direction is inverted because incoming links are what raise a vertex's
 * harmonic centrality. Caller owns the returned array (delete []).
 */
std::vector<uint32_t> *set_to_edge_map(size_t n, const std::set<std::pair<uint32_t, uint32_t>> &edges) {
	auto *edge_map = new std::vector<uint32_t>[n];
	for (const auto &[from, to] : edges) {
		edge_map[to].push_back(from);
	}
	return edge_map;
}
}
================================================
FILE: src/algorithm/algorithm.h
================================================
/*
* MIT License
*
* Alexandria.org
*
* Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#pragma once
#include <vector>
#include <set>
#include <unordered_map>
#include <cstdint>
namespace algorithm {
/*
 * Append vec's elements to dest as consecutive chunks of chunk_size; the
 * final chunk holds the remainder (fewer than chunk_size elements). An empty
 * input appends nothing; chunk_size == 0 appends the whole vector as one
 * chunk (matching the accumulate-and-flush behavior of the old version).
 */
template<class T>
void vector_chunk(const std::vector<T> &vec, size_t chunk_size, std::vector<std::vector<T>> &dest) {
	if (vec.empty()) return;
	if (chunk_size == 0) {
		dest.push_back(vec);
		return;
	}
	for (size_t pos = 0; pos < vec.size(); pos += chunk_size) {
		const size_t remaining = vec.size() - pos;
		const size_t count = remaining < chunk_size ? remaining : chunk_size;
		dest.emplace_back(vec.begin() + pos, vec.begin() + pos + count);
	}
}
std::vector<std::vector<int>> incremental_partitions(const std::vector<int> &dims, size_t limit);
/*
Calculates the harmonic centrality for vertices and edges. The returning vector has the harmonic centrality for vertex i at position i.
The depth parameter is the maximum level to traverse in the neighbour tree.
The edges set contains pairs of edges (from vertex, to vertex)
*/
std::vector<double> harmonic_centrality(size_t vlen, const std::set<std::pair<uint32_t, uint32_t>> &edges, size_t depth);
std::vector<double> harmonic_centrality(size_t vlen, const std::vector<uint32_t> *edge_map, size_t depth);
std::vector<double> harmonic_centrality_threaded(size_t vlen, const std::set<std::pair<uint32_t, uint32_t>> &edges, size_t depth,
size_t num_threads);
std::vector<double> harmonic_centrality_threaded(size_t vlen, const std::vector<uint32_t> *edge_map,
size_t depth, size_t num_threads);
std::vector<uint32_t> *set_to_edge_map(size_t n, const std::set<std::pair<uint32_t, uint32_t>> &edges);
}
================================================
FILE: src/algorithm/bloom_filter.cpp
================================================
/*
* MIT License
*
* Alexandria.org
*
* Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "bloom_filter.h"
#include "algorithm/hash.h"
#include <cmath>
#include <cstring>
#include <fstream>
namespace algorithm {
/*
 * Default constructor: allocates the bitmap at the default dimension
 * (m_dim from the header), all bits clear.
 */
bloom_filter::bloom_filter()
{
	// std::make_unique<uint64_t[]> value-initializes the array, so every
	// word is already zero; the explicit clearing loop was redundant.
	m_bitmap = std::make_unique<uint64_t[]>(m_dim);
}
// Dim should be a prime number..
bloom_filter::bloom_filter(size_t dim)
	: m_dim(dim), m_bitlen(dim * 64)
{
	// Value-initialized, see above: no zero-fill loop needed.
	m_bitmap = std::make_unique<uint64_t[]>(m_dim);
}
void bloom_filter::insert(const std::string &item) {
for (size_t i = 0; i < m_seeds.size(); i++) {
const uint64_t hash = algorithm::hash_with_seed(item, m_seeds[i]);
set_bit(hash);
}
}
void bloom_filter::insert(uint64_t item) {
insert(std::to_string(item));
}
void bloom_filter::insert_many(std::vector<uint64_t> &items) {
	// Compute all hashes up front so the mutex is held only while the bits
	// are actually flipped.
	std::vector<size_t> precomputed;
	precomputed.reserve(items.size() * m_seeds.size());
	for (uint64_t item : items) {
		const std::string key = std::to_string(item);
		for (uint64_t seed : m_seeds) {
			precomputed.push_back(algorithm::hash_with_seed(key, seed));
		}
	}
	std::lock_guard guard(m_mutex);
	for (size_t hash : precomputed) {
		set_bit(hash);
	}
}
const char * bloom_filter::data() const {
	// Raw byte view of the bitmap; size() bytes long.
	return reinterpret_cast<const char *>(m_bitmap.get());
}
bool bloom_filter::exists(const std::string &item) const {
for (size_t i = 0; i < m_seeds.size(); i++) {
const uint64_t hash = algorithm::hash_with_seed(item, m_seeds[i]);
if (!get_bit(hash)) return false;
}
return true;
}
bool bloom_filter::exists(uint64_t data) const {
return exists(std::to_string(data));
}
void bloom_filter::read(char *data, size_t len) {
memcpy((char *)m_bitmap.get(), data, len);
}
void bloom_filter::merge(const bloom_filter &other) {
	// The union of two filters with identical dimensions is the bitwise OR
	// of their bitmaps.
	// NOTE(review): assumes other.m_dim == m_dim -- confirm callers.
	for (size_t word = 0; word < m_dim; word++) {
		m_bitmap[word] |= other.m_bitmap[word];
	}
}
// Fraction of bits set in the filter. Currently a stub that always reports
// 1.0; a real implementation would count the set bits in m_bitmap.
double bloom_filter::saturation() {
	return 1.0;
}
void bloom_filter::read_file(const std::string &file_name) {
	// Load a bitmap previously written by write_file. Best effort: a missing
	// or short file simply leaves the remaining words untouched.
	std::ifstream file(file_name, std::ios::binary);
	file.read(reinterpret_cast<char *>(m_bitmap.get()), size());
}
void bloom_filter::write_file(const std::string &file_name) const {
	// Serialize the raw bitmap, replacing any existing file.
	std::ofstream file(file_name, std::ios::binary | std::ios::trunc);
	file.write(reinterpret_cast<const char *>(m_bitmap.get()), size());
}
void bloom_filter::set_bit(size_t bit) {
	// Reduce the hash to a bit index, then locate the 64 bit word and the
	// offset inside it.
	const size_t index = bit % m_bitlen;
	m_bitmap[index / 64] |= 0x1ull << (index % 64);
}
bool bloom_filter::get_bit(size_t bit) const {
	// Mirror of set_bit: shift the word down and mask out the single bit.
	const size_t index = bit % m_bitlen;
	return (m_bitmap[index / 64] >> (index % 64)) & 0x1ull;
}
}
================================================
FILE: src/algorithm/bloom_filter.h
================================================
/*
* MIT License
*
* Alexandria.org
*
* Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#pragma once
#include <array>
#include <cstdint>
#include <iostream>
#include <memory>
#include <mutex>
#include <string>
#include <vector>

#include "roaring/roaring64map.hh"
namespace algorithm {
/*
 * Fixed-size bloom filter over strings / 64 bit integers.
 * Thread safety: only insert_many takes the internal mutex.
 * */
class bloom_filter {

	public:

		bloom_filter();
		// dim is the number of 64 bit words in the bitmap; should be a prime.
		bloom_filter(size_t dim);

		// Set one bit per seed for the item.
		void insert(const std::string &item);
		void insert(uint64_t item);
		// Batch insert; hashes outside the lock, safe to call concurrently.
		void insert_many(std::vector<uint64_t> &items);

		// May return false positives, never false negatives.
		bool exists(const std::string &item) const;
		bool exists(uint64_t data) const;

		// Size of the bitmap in bytes.
		size_t size() const { return m_dim * sizeof(uint64_t); }
		// Raw byte view of the bitmap, size() bytes long.
		const char *data() const;
		// Deserialize a bitmap; len is clamped to size().
		void read(char *data, size_t len);
		// Bitwise OR of the other filter's bitmap into this one.
		void merge(const bloom_filter &other);
		double saturation();

		void read_file(const std::string &file_name);
		void write_file(const std::string &file_name) const;

	private:

		std::unique_ptr<uint64_t[]> m_bitmap;
#ifdef IS_TEST
		size_t m_dim = 2695797;
#else
		size_t m_dim = 4043696581;
#endif
		size_t m_bitlen = m_dim * 64;
		// some random prime numbers
		// Bug fix: the original list contained 2695879891 twice, so only nine
		// distinct hash functions were in effect. The duplicate is replaced
		// with the prime 4294967291 (2^32 - 5).
		// NOTE(review): this changes the filter's bit pattern -- any filter
		// serialized with write_file before this fix must be rebuilt.
		std::array<uint64_t, 10> m_seeds = {3339675911, 2695798769, 2695831867, 2695857877, 2695879891, 4294967291, 2695922687, 2695935521,
			3339689791, 3339703163};
		std::mutex m_mutex;

		void set_bit(size_t bit);
		bool get_bit(size_t bit) const;

};
}
================================================
FILE: src/algorithm/hash.cpp
================================================
/*
* MIT License
*
* Alexandria.org
*
* Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include <cstdint>
#include "hash.h"
namespace algorithm {
/*
* Murmur hash by Austin Appleby
* Taken from here https://sites.google.com/site/murmurhash/
* */
// MurmurHash64A: mixes the key eight bytes at a time, then folds in the
// 1-7 byte tail via the fallthrough switch below. Do not change the
// arithmetic -- hash values may be persisted elsewhere in the project.
size_t murmur_hash(const char *key, size_t len, size_t seed) {
	const uint64_t m = 0xc6a4a7935bd1e995ull;
	const int r = 47;
	uint64_t h = seed ^ (len * m);
	// NOTE(review): reading the key through a uint64_t pointer assumes the
	// platform tolerates unaligned loads (true on x86) -- confirm before
	// porting to stricter architectures.
	const uint64_t * data = (const uint64_t *)key;
	const uint64_t * end = data + (len/8);
	while(data != end) {
		uint64_t k = *data++;
		k *= m;
		k ^= k >> r;
		k *= m;
		h ^= k;
		h *= m;
	}
	const unsigned char * data2 = (const unsigned char*)data;
	// Every case intentionally falls through so all remaining tail bytes
	// are mixed in.
	switch(len & 7) {
	case 7: h ^= uint64_t(data2[6]) << 48; // fallthrough
	case 6: h ^= uint64_t(data2[5]) << 40; // fallthrough
	case 5: h ^= uint64_t(data2[4]) << 32; // fallthrough
	case 4: h ^= uint64_t(data2[3]) << 24; // fallthrough
	case 3: h ^= uint64_t(data2[2]) << 16; // fallthrough
	case 2: h ^= uint64_t(data2[1]) << 8; // fallthrough
	case 1: h ^= uint64_t(data2[0]);
	h *= m;
	};
	// Final avalanche.
	h ^= h >> r;
	h *= m;
	h ^= h >> r;
	return h;
}
size_t hash(const std::string &str) {
	// Fixed project-wide seed for unseeded hashing.
	constexpr size_t default_seed = 0xc70f6907ul;
	return murmur_hash(str.data(), str.size(), default_seed);
}
size_t hash_with_seed(const std::string &str, size_t seed) {
	// Seeded variant; forwards straight to MurmurHash64A.
	return murmur_hash(str.data(), str.size(), seed);
}
}
================================================
FILE: src/algorithm/hash.h
================================================
/*
* MIT License
*
* Alexandria.org
*
* Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#pragma once

#include <string>

namespace algorithm {

	// 64 bit MurmurHash64A of str using a fixed project-wide seed.
	size_t hash(const std::string &str);

	// 64 bit MurmurHash64A of str using the given seed.
	size_t hash_with_seed(const std::string &str, size_t seed);

}
================================================
FILE: src/algorithm/hyper_ball.h
================================================
/*
* MIT License
*
* Alexandria.org
*
* Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#pragma once
#include <vector>
#include <cstdint>
#include "hyper_log_log.h"
#include "profiler/profiler.h"
#include "logger/logger.h"
#include <future>
namespace algorithm {
/*
 * Runs one HyperBall iteration (radius t -> t + 1) for vertices [v_begin, v_end).
 * c holds counters at radius t, a is scratch for radius t + 1 and harmonic
 * accumulates each vertex's harmonic-centrality contribution.
 * Returns true if any counter grew, i.e. another iteration is needed.
 * NOTE(review): assumes workers get disjoint [v_begin, v_end) ranges. The
 * copy-back loop below writes c[] while other workers may still read c[w]
 * for cross-range edges -- confirm this race is an accepted trade-off.
 * */
template <typename edge_map_type>
bool hyper_ball_worker(double t, size_t v_begin, size_t v_end, const edge_map_type &edge_map,
		std::vector<hyper_log_log> &c, std::vector<hyper_log_log> &a, std::vector<double> &harmonic) {
	bool counter_changed = false;
	for (uint32_t v = v_begin; v < v_end; v++) {
		// a[v] (radius t + 1) = union of c[v] with the counters of all
		// in-neighbors of v.
		a[v] = c[v];
		for (const uint32_t &w : edge_map[v]) {
			a[v] += c[w];
		}
		// a[v] is t + 1 and c[v] is at t
		const size_t counter_diff = a[v].count() - c[v].count();
		if (counter_diff) {
			counter_changed = true;
			// Vertices first reached at distance t + 1 each contribute 1/(t + 1).
			harmonic[v] += (1.0 / (t + 1.0)) * counter_diff;
		}
	}
	for (uint32_t v = v_begin; v < v_end; v++) {
		c[v] = a[v];
	}
	return counter_changed;
}
/*
 * n is the number of vertices in graph.
 * edge_map is pointing to a static array of size n.
 * each item in edge_map is a vector of variable size.
 * each vector edge_map[m] contains values between 0 and n-1 indicating edge between m and edge_map[m].
 * NOTE direction of edge in edge map has to be EDGE_FROM -> EDGE_TO.
 * so for vertex m, n = edge_map[m] indicates directed edge from n to m
 *
 * Returns the estimated harmonic centrality of every vertex, computed with
 * the HyperBall algorithm over hyper_log_log counters (b = 10).
 * */
template <typename edge_map_type>
std::vector<double> hyper_ball(uint32_t n, const edge_map_type &edge_map) {
	if (n == 0) return {};
	// Bug fix: the old std::min(32, (int)n) cast n to int, which turns
	// negative for n > INT_MAX and produced a garbage thread count.
	const size_t num_threads = std::min<size_t>(32, n);
	const size_t items_per_thread = n / num_threads;
	std::vector<hyper_log_log> c(n, hyper_log_log(10));
	std::vector<hyper_log_log> a(n, hyper_log_log(10));
	std::vector<double> harmonic(n, 0.0);
	// Each counter starts out containing only its own vertex (radius 0 ball).
	for (uint32_t v = 0; v < n; v++) {
		c[v].insert(v);
	}
	double t = 0.0;
	while (true) {
		std::vector<std::future<bool>> threads;
		for (size_t i = 0; i < num_threads; i++) {
			const size_t v_begin = i * items_per_thread;
			// The last worker also takes the division remainder.
			const size_t v_end = (i == num_threads - 1) ? n : (i + 1) * items_per_thread;
			auto fut = std::async(hyper_ball_worker<edge_map_type>, t, v_begin, v_end, std::cref(edge_map), std::ref(c), std::ref(a), std::ref(harmonic));
			threads.emplace_back(std::move(fut));
		}
		// Stop once no worker saw any counter grow.
		bool should_continue = false;
		for (auto &fut : threads) {
			should_continue = fut.get() || should_continue;
		}
		t += 1.0;
		if (!should_continue) break;
	}
	return harmonic;
}
}
================================================
FILE: src/algorithm/hyper_log_log.cpp
================================================
/*
* MIT License
*
* Alexandria.org
*
* Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include <numeric>
#include "hyper_log_log.h"
#include "algorithm/hash.h"
namespace algorithm {
hyper_log_log::hyper_log_log(size_t b)
	: m_b(b), m_len(1ull << m_b), m_alpha(0.7213/(1.0 + 1.079/m_len)) {
	// Allocate 2^b registers, all cleared.
	m_M.assign(m_len, 0);
}
hyper_log_log::hyper_log_log(const char *registers, size_t b)
	: m_b(b), m_len(1ull << m_b), m_alpha(0.7213/(1.0 + 1.079/m_len)) {
	// Deserialize: copy 2^b register bytes.
	// NOTE(review): registers must point at at least 2^b bytes -- confirm callers.
	m_M.assign(registers, registers + m_len);
}
hyper_log_log::hyper_log_log(const hyper_log_log &other)
	: m_M(other.m_M), m_b(other.m_b), m_len(other.m_len), m_alpha(other.m_alpha) {
	// Registers are copied directly in the initializer list.
}
// Move constructor: takes over other's registers via swap, leaving other
// with an empty register vector (its m_b/m_len/m_alpha are unchanged).
hyper_log_log::hyper_log_log(hyper_log_log &&other)
	: m_b(other.m_b), m_len(other.m_len), m_alpha(other.m_alpha) {
	// m_M starts default-constructed (empty), so the swap hands us the data.
	m_M.swap(other.m_M);
}
// No manual cleanup needed; m_M releases its storage automatically.
hyper_log_log::~hyper_log_log() {
}
// Inserts value v: the top m_b bits of the 64 bit hash select a register,
// which keeps the maximum "leading zeros + 1" rank seen for the remaining
// bits. Inserting a duplicate can never decrease any register.
void hyper_log_log::insert(size_t v) {
	size_t x = algorithm::hash(std::to_string(v));
	size_t j = x >> (64-m_b);
	m_M[j] = std::max(m_M[j], leading_zeros_plus_one(x << m_b));
}
// Estimated number of distinct inserted values: E = alpha * m^2 / sum(2^-M[j]).
size_t hyper_log_log::count() const {
	double Z = 0.0;
	for (size_t j = 0; j < m_len; j++) {
		// Bug fix: ldexp(1.0, -M[j]) equals 2^-M[j] exactly but, unlike the
		// old (1ull << M[j]), is well defined when a register holds 64 or 65
		// (leading_zeros_plus_one returns up to 65 for an all-zero suffix,
		// and shifting a 64 bit value by >= 64 is undefined behavior).
		Z += std::ldexp(1.0, -static_cast<int>(m_M[j]));
	}
	double E = m_alpha * m_len * m_len / Z;
	// Only small range correction implemented since we use 64 bit hash.
	if (E <= (5.0/2.0) * m_len) {
		// Linear counting when many registers are still zero.
		size_t V = num_zero_registers();
		if (V != 0) {
			E = m_len * log((double)m_len / V);
		}
	}
	return (size_t)E;
}
void hyper_log_log::reset() {
	// Clear every register back to the empty-set state.
	m_M.assign(m_M.size(), 0);
}
// Position of the most significant set bit counted from the left, plus one.
// Returns 65 when x == 0 (all 64 bits are zero).
char hyper_log_log::leading_zeros_plus_one(size_t x) const {
	char rank = 1;
	for (uint64_t mask = 0x8000000000000000ull; mask != 0; mask >>= 1) {
		if (x & mask) return rank;
		rank++;
	}
	return rank;
}
// Number of registers still holding zero (used by the small-range correction).
size_t hyper_log_log::num_zero_registers() const {
	// std::count is clearer than the previous transform_reduce and its
	// 64 bit difference_type cannot overflow, unlike the old int accumulator
	// for very large register arrays.
	return static_cast<size_t>(std::count(m_M.begin(), m_M.end(), 0));
}
double hyper_log_log::error_bound() const {
	// Standard error of the estimator is 1.04/sqrt(m); three standard
	// deviations gives the 99% confidence interval.
	return 3.0 * (1.04 / sqrt((double)m_len));
}
// Union of two sketches: element-wise maximum of the registers.
hyper_log_log hyper_log_log::operator +(const hyper_log_log &hl) const {
	// Bug fix: the result must be constructed with this sketch's b parameter.
	// The old default-constructed result always had 2^15 registers, which
	// miscounted for m_b < 15 and overflowed the result buffer for m_b > 15.
	// NOTE(review): assumes hl uses the same b as *this -- confirm callers.
	hyper_log_log res(m_b);
	std::transform(std::begin(m_M), std::end(m_M), std::begin(hl.m_M), std::begin(res.m_M), [] (char a, char b) { return std::max(a, b); });
	return res;
}
// In-place union: keep the larger rank in every register.
// NOTE(review): assumes hl has the same register count -- confirm callers.
hyper_log_log &hyper_log_log::operator +=(const hyper_log_log &hl) {
	for (size_t j = 0; j < m_M.size(); j++) {
		m_M[j] = std::max(m_M[j], hl.m_M[j]);
	}
	return *this;
}
hyper_log_log &hyper_log_log::operator =(const hyper_log_log &other) {
	// Bug fix: vector assignment resizes m_M to other's register count, so
	// assigning from a larger sketch can no longer write past the end of the
	// buffer (the old std::copy into m_M.begin() assumed equal sizes).
	// NOTE(review): m_b, m_len and m_alpha are const and keep their original
	// values, so assigning between sketches with different b still leaves the
	// object inconsistent -- confirm callers only assign equal-b sketches.
	m_M = other.m_M;
	return *this;
}
}
================================================
FILE: src/algorithm/hyper_log_log.h
================================================
/*
* MIT License
*
* Alexandria.org
*
* Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#pragma once
#include <cmath>
#include <cstring>
#include <algorithm>
#include <iostream>
#include <vector>
namespace algorithm {
/*
* Implementation of the hyper log log algorithm as described by Flajolet1 et al.
* http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf
*
* Using 64 bit hash instead of 32bit.
* */
class hyper_log_log {
	public:
		/*
		 * initializes with given b parameter. size of data structure will be 2^b bytes.
		 * */
		hyper_log_log(size_t b = 15);
		// Deserializing constructor; registers must hold at least 2^b bytes.
		hyper_log_log(const char *registers, size_t b = 15);
		hyper_log_log(const hyper_log_log &other);
		hyper_log_log(hyper_log_log &&other);
		~hyper_log_log();
		// Adds an element; duplicates never change the estimate.
		void insert(size_t v);
		// Estimated number of distinct inserted elements.
		size_t count() const;
		// Three standard deviations of the relative error (99% confidence).
		double error_bound() const;
		// Clears all registers back to the empty-set state.
		void reset();
		// Raw register bytes (data_size() long), usable with the deserializing constructor.
		const char *data() const { return m_M.data(); };
		char *data() { return m_M.data(); };
		int b() const { return m_b; }
		size_t data_size() const { return m_len; };
		// Union of two sketches (element-wise register maximum).
		hyper_log_log operator +(const hyper_log_log &hl) const;
		hyper_log_log &operator +=(const hyper_log_log &hl);
		hyper_log_log &operator =(const hyper_log_log &other);
		// Rank helper: leading zero count of x plus one (65 when x == 0).
		char leading_zeros_plus_one(size_t x) const;
	private:
		std::vector<char> m_M; // Points to registers.
		const int m_b; // Number of index bits; there are 2^m_b registers.
		const size_t m_len; // Register count, 1 << m_b.
		const double m_alpha; // Bias-correction constant from the HLL paper.
		size_t num_zero_registers() const;
	};
}
================================================
FILE: src/algorithm/intersection.cpp
================================================
/*
* MIT License
*
* Alexandria.org
*
* Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include <functional>
#include "intersection.h"
namespace algorithm {
// Bitwise AND of all bitmaps; an empty input yields an empty bitmap.
roaring::Roaring intersection(const std::vector<roaring::Roaring> &input) {
	if (input.empty()) return roaring::Roaring();
	roaring::Roaring result = input.front();
	for (size_t i = 1; i < input.size(); i++) {
		result &= input[i];
	}
	return result;
}
}
================================================
FILE: src/algorithm/intersection.h
================================================
/*
* MIT License
*
* Alexandria.org
*
* Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#pragma once
#include <cstdint>
#include <functional>
#include <memory>
#include <vector>

#include "roaring/roaring.hh"
namespace algorithm {
roaring::Roaring intersection(const std::vector<roaring::Roaring> &input);
/*
 * Intersects sorted vectors. Returns the elements present in every vector.
 * The scan is driven by the shortest vector (first among ties); for every
 * candidate that matches, sum_fun(candidate, other_element) is applied for
 * each non-driving vector, and the (possibly accumulated) candidate is
 * emitted.
 * */
template<typename item>
std::vector<item> intersection(const std::vector<std::vector<item>> &input,
		std::function<void(item &a, const item &b)> sum_fun) {
	if (input.empty()) return {};
	// Pick the shortest vector as the driver (first index among ties).
	size_t pivot = 0;
	for (size_t i = 1; i < input.size(); i++) {
		if (input[i].size() < input[pivot].size()) pivot = i;
	}
	const size_t pivot_len = input[pivot].size();
	std::vector<size_t> cursor(input.size(), 0);
	std::vector<item> result;
	while (cursor[pivot] < pivot_len) {
		item candidate = input[pivot][cursor[pivot]];
		bool found_everywhere = true;
		for (size_t i = 0; i < input.size(); i++) {
			const std::vector<item> &vec = input[i];
			size_t &pos = cursor[i];
			// Advance this vector's cursor up to the candidate.
			while (pos < vec.size() && vec[pos] < candidate) {
				pos++;
			}
			if (pos >= vec.size() || candidate < vec[pos]) {
				found_everywhere = false;
				break;
			}
			// Fold the matching element into the candidate (driver excluded).
			if (i != pivot) {
				sum_fun(candidate, vec[pos]);
			}
		}
		if (found_everywhere) {
			result.push_back(candidate);
		}
		cursor[pivot]++;
	}
	return result;
}
/*
 * Intersects sorted arrays given as unique_ptr<item[]> with explicit lengths.
 * Returns the elements present in every array.
 * Fixes: lengths is now taken by const reference instead of by value (the old
 * signature copied the vector on every call); the dead commented-out sum_fun
 * call and its empty else branch are removed.
 * */
template<typename item>
std::vector<item> intersection(const std::vector<std::unique_ptr<item[]>> &input, const std::vector<size_t> &lengths) {
	if (input.size() == 0) return {};
	// Drive the scan from the shortest array (first among ties).
	size_t shortest_vector_position = 0;
	size_t shortest_len = SIZE_MAX;
	size_t iter_index = 0;
	for (size_t len : lengths) {
		if (shortest_len > len) {
			shortest_len = len;
			shortest_vector_position = iter_index;
		}
		iter_index++;
	}
	std::vector<size_t> positions(input.size(), 0);
	std::vector<item> intersection;
	while (positions[shortest_vector_position] < shortest_len) {
		bool all_equal = true;
		item value = input[shortest_vector_position][positions[shortest_vector_position]];
		size_t iter_index = 0;
		for (const std::unique_ptr<item[]> &ptr : input) {
			const size_t len = lengths[iter_index];
			size_t *pos = &(positions[iter_index]);
			// Advance this array's cursor up to the candidate value.
			while (*pos < len && ptr[*pos] < value) {
				(*pos)++;
			}
			if (((*pos < len) && (value < ptr[*pos])) || *pos >= len) {
				all_equal = false;
				break;
			}
			iter_index++;
		}
		if (all_equal) {
			intersection.push_back(value);
		}
		positions[shortest_vector_position]++;
	}
	return intersection;
}
// Plain intersection of sorted vectors: the per-match accumulator is a no-op.
template<typename item>
std::vector<item> intersection(const std::vector<std::vector<item>> &input) {
	auto ignore = [](item &a, const item &b) {};
	return intersection<item>(input, ignore);
}
}
================================================
FILE: src/algorithm/sort.cpp
================================================
/*
* MIT License
*
* Alexandria.org
*
* Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "sort.h"
// Intentionally empty: every sort helper is a template defined in sort.h.
namespace algorithm {
}
================================================
FILE: src/algorithm/sort.h
================================================
/*
* MIT License
*
* Alexandria.org
*
* Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#pragma once
#include <vector>
#include <span>
namespace algorithm {
namespace sort {
// Two-way merge of compare-sorted vectors, appended to arr3's current
// contents. On ties (neither element compares less) arr2's element is
// emitted first.
template<typename data_record, typename F>
void merge_arrays(const std::vector<data_record> &arr1, const std::vector<data_record> &arr2, F compare, std::vector<data_record> &arr3) {
	size_t a = 0, b = 0;
	const size_t n1 = arr1.size();
	const size_t n2 = arr2.size();
	while (a < n1 && b < n2) {
		if (compare(arr1[a], arr2[b])) {
			arr3.push_back(arr1[a]);
			a++;
		} else {
			arr3.push_back(arr2[b]);
			b++;
		}
	}
	// Append whichever side still has elements left.
	arr3.insert(arr3.end(), arr1.begin() + a, arr1.end());
	arr3.insert(arr3.end(), arr2.begin() + b, arr2.end());
}
template<typename data_record, typename F>
void merge_arrays(const std::span<data_record> *arr1, const std::span<data_record> *arr2, F compare, std::vector<data_record> &arr3) {
size_t i = 0, j = 0;
while (i < arr1->size() && j < arr2->size()) {
if (compare((*arr1)[i], (*arr2)[j])) {
arr3.push_back((*arr1)[i++]);
} else {
arr3.push_back((*arr2)[j++]);
}
}
while (i < arr1->size()) arr3.push_back((*arr1)[i++]);
while (j < arr2->size()) arr3.push_back((*arr2)[j++]);
}
// Convenience overload: two-way merge ordered by operator<.
template<typename data_record>
void merge_arrays(const std::vector<data_record> &arr1, const std::vector<data_record> &arr2, std::vector<data_record> &arr3) {
	auto less_than = [](const data_record &lhs, const data_record &rhs) {
		return lhs < rhs;
	};
	merge_arrays(arr1, arr2, less_than, arr3);
}
// Convenience overload: k-way merge ordered by operator<.
template<typename data_record>
void merge_arrays(const std::vector<std::vector<data_record>> &arrays, std::vector<data_record> &res) {
	auto less_than = [](const data_record &lhs, const data_record &rhs) {
		return lhs < rhs;
	};
	merge_arrays(arrays, less_than, res);
}
// Merges the inclusive range arrays[i..j] into res, divide and conquer.
template<typename data_record, typename F>
void merge_array_range(const std::vector<std::vector<data_record>> &arrays, size_t i, size_t j, F compare, std::vector<data_record> &res) {
	if (i == j) {
		// Single array: copy it straight through.
		res.insert(res.end(), arrays[i].begin(), arrays[i].end());
	} else if (j - i == 1) {
		merge_arrays(arrays[i], arrays[j], compare, res);
	} else {
		const size_t mid = (i + j) / 2;
		std::vector<data_record> left;
		std::vector<data_record> right;
		merge_array_range(arrays, i, mid, compare, left);
		merge_array_range(arrays, mid + 1, j, compare, right);
		merge_arrays(left, right, compare, res);
	}
}
// k-way merge entry point: recursively merges all of arrays into res.
template<typename data_record, typename F>
void merge_arrays(const std::vector<std::vector<data_record>> &arrays, F compare, std::vector<data_record> &res) {
	if (arrays.empty()) return;
	merge_array_range(arrays, 0, arrays.size() - 1, compare, res);
}
// Merges the inclusive range arrays[i..j] (spans) into res, divide and conquer.
template<typename data_record, typename F>
void merge_array_range(const std::vector<std::span<data_record> *> &arrays, size_t i, size_t j, F compare, std::vector<data_record> &res) {
	if (i == j) {
		// Single span: copy it straight through.
		const std::span<data_record> &only = *(arrays[i]);
		res.insert(res.end(), only.begin(), only.end());
	} else if (j - i == 1) {
		merge_arrays(arrays[i], arrays[j], compare, res);
	} else {
		const size_t mid = (i + j) / 2;
		std::vector<data_record> left;
		std::vector<data_record> right;
		merge_array_range(arrays, i, mid, compare, left);
		merge_array_range(arrays, mid + 1, j, compare, right);
		merge_arrays(left, right, compare, res);
	}
}
// k-way merge over spans; the spans' contents are copied into res.
template<typename data_record, typename F>
void merge_arrays(const std::vector<std::span<data_record> *> &arrays, F compare, std::vector<data_record> &res) {
	if (arrays.empty()) return;
	merge_array_range(arrays, 0, arrays.size() - 1, compare, res);
}
}
}
================================================
FILE: src/algorithm/sum_sorted.h
================================================
/*
* MIT License
*
* Alexandria.org
*
* Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#pragma once
#include <vector>
#include <functional>
namespace algorithm {
/*
 * Merges sorted vectors, folding together the current head elements that
 * compare equal. For each pass the globally smallest head el is taken and
 * every later vector whose head equals el is folded in via plus_eq (and its
 * cursor advanced). Equal elements inside the same vector produce separate
 * output entries.
 * */
template<class dtype>
std::vector<dtype> sum_sorted(const std::vector<std::vector<dtype>> &input,
		std::function<void(dtype &a, const dtype &b)> plus_eq) {
	const size_t num_vecs = input.size();
	if (num_vecs == 0) return {};
	std::vector<size_t> cursor(num_vecs, 0);
	std::vector<dtype> result;
	while (true) {
		// Find the first vector holding a minimal current element
		// (ties keep the lowest index); -1 when everything is exhausted.
		int min_vec = -1;
		for (size_t i = 0; i < num_vecs; i++) {
			if (cursor[i] >= input[i].size()) continue;
			if (min_vec == -1 || input[i][cursor[i]] < input[min_vec][cursor[min_vec]]) {
				min_vec = i;
			}
		}
		if (min_vec == -1) break;
		const dtype el = input[min_vec][cursor[min_vec]];
		dtype total = el;
		cursor[min_vec]++;
		// Fold in the matching head element of every later vector.
		for (size_t i = min_vec + 1; i < num_vecs; i++) {
			// Defensive skip of heads below el (cannot trigger since el is
			// the global minimum); kept for parity with the original scan.
			while (cursor[i] < input[i].size() && input[i][cursor[i]] < el) {
				cursor[i]++;
			}
			if (cursor[i] < input[i].size() && input[i][cursor[i]] == el) {
				plus_eq(total, input[i][cursor[i]]);
				cursor[i]++;
			}
		}
		result.push_back(total);
	}
	return result;
}
}
================================================
FILE: src/algorithm/top_k.h
================================================
/*
* MIT License
*
* Alexandria.org
*
* Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#pragma once
#include <algorithm>
#include <functional>
#include <vector>
namespace algorithm {

	/*
	 * Returns the k largest elements (per the strict weak ordering `ordered`,
	 * where ordered(a, b) means a sorts before b) of an unsorted vector in
	 * linear time using a 2k element scratch buffer. The returned elements
	 * are in unspecified order.
	 */
	template<class dtype>
	std::vector<dtype> top_k(const std::vector<dtype> &input, size_t k,
			std::function<bool(const dtype &, const dtype &)> ordered) {

		// k or fewer elements: everything is in the top k.
		if (input.size() <= k) return input;

		if (input.size() <= 2 * k) {
			// Partition so the k largest elements occupy the last k slots.
			// Bug fix: the old code partitioned at buf.size() / 2 and returned
			// the upper half, which yields fewer than k elements whenever
			// k < input.size() < 2k (e.g. size 4, k 3 returned only 2).
			std::vector<dtype> buf(input.begin(), input.end());
			std::nth_element(buf.begin(), buf.end() - k, buf.end(), ordered);
			return std::vector<dtype>(buf.end() - k, buf.end());
		}

		// Invariant: buf always contains the top k of the prefix processed so far.
		std::vector<dtype> buf(input.begin(), input.begin() + (2 * k));
		size_t idx = 2 * k;
		while (idx < input.size()) {
			// After this partition buf[k..2k) hold the buffer's k largest and
			// buf[k] is the smallest of them; anything ordered before buf[k]
			// cannot be part of the final top k.
			std::nth_element(buf.begin(), buf.begin() + k, buf.end(), ordered);
			for (size_t i = 0, j = idx; i < k && j < input.size(); i++, j++) {
				// Candidates not ordered before the pivot overwrite the
				// bottom half of the buffer; the next partition sifts them.
				if (!ordered(input[j], buf[k])) {
					buf[i] = input[j];
				}
			}
			idx += k;
		}
		// Final partition: buf has size 2k, so the answer is its upper k slots.
		std::nth_element(buf.begin(), buf.begin() + k, buf.end(), ordered);
		return std::vector<dtype>(buf.begin() + k, buf.end());
	}

	/*
	 * top_k with the default less-than ordering, i.e. returns the k largest.
	 */
	template<class dtype>
	std::vector<dtype> top_k(const std::vector<dtype> &input, size_t k) {
		return top_k<dtype>(input, k, [](const dtype &a, const dtype &b) { return a < b; });
	}

}
================================================
FILE: src/api/api_response.cpp
================================================
/*
* MIT License
*
* Alexandria.org
*
* Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "api_response.h"
#include "indexer/return_record.h"
#include "full_text/search_metric.h"
#include "parser/unicode.h"
#include "json.hpp"
namespace api {

	/*
	 * Renders the search results and search metrics as the pretty-printed
	 * JSON document returned by the API.
	 */
	api_response::api_response(const std::vector<indexer::return_record> &results,
			const struct full_text::search_metric &metric, double profile) {

		using json = nlohmann::ordered_json;

		// Serialize each result; results that fail to serialize are dropped.
		json result_array;
		for (const auto &record : results) {
			try {
				json item;
				item["url"] = record.m_url.str();
				item["title"] = parser::unicode::encode(record.m_title);
				item["snippet"] = parser::unicode::encode(record.m_snippet);
				item["score"] = record.m_score;
				item["domain_hash"] = std::to_string(record.m_domain_hash);
				item["url_hash"] = std::to_string(record.m_url.hash());
				result_array.push_back(item);
			} catch (nlohmann::detail::type_error &error) {
				// Skip this result.
				// In the future: log this and fix what is wrong.
			}
		}

		json message;
		message["status"] = "success";
		message["time_ms"] = profile;
		message["total_found"] = metric.m_total_found;
		message["total_url_links_found"] = metric.m_total_url_links_found;
		message["total_domain_links_found"] = metric.m_total_domain_links_found;
		message["links_handled"] = metric.m_links_handled;
		message["link_domain_matches"] = metric.m_link_domain_matches;
		message["link_url_matches"] = metric.m_link_url_matches;
		message["results"] = result_array;

		// Pretty printed with an indent of four spaces.
		m_response = message.dump(4);
	}

	api_response::~api_response() {
	}

	// Streams the rendered JSON body.
	std::ostream &operator<<(std::ostream &os, const api_response &api_response) {
		os << api_response.m_response;
		return os;
	}

}
================================================
FILE: src/api/api_response.h
================================================
/*
* MIT License
*
* Alexandria.org
*
* Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#pragma once
#include <iostream>
#include <vector>
namespace full_text {
struct search_metric;
}
namespace indexer {
class return_record;
}
namespace api {
	// Builds the JSON body for a search API response and streams it out.
	class api_response {
	public:
		// Serializes `results` and `metric` to JSON; `profile` is reported as "time_ms".
		api_response(const std::vector<indexer::return_record> &results, const struct full_text::search_metric &metric, double profile);
		~api_response();
		// Writes the rendered JSON body to the stream.
		friend std::ostream &operator<<(std::ostream &os, const api_response &api_response);
	private:
		// The fully rendered JSON response body.
		std::string m_response;
	};
}
================================================
FILE: src/api/result_with_snippet.cpp
================================================
/*
* MIT License
*
* Alexandria.org
*
* Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "result_with_snippet.h"
#include "text/text.h"
namespace api {

	/*
	 * Parses one tab separated row of stored document data and keeps the
	 * columns needed to present a search result: url (0), title (1),
	 * meta description (3) and text used for the snippet (4).
	 */
	result_with_snippet::result_with_snippet(const std::string &tsv_data, const indexer::return_record &res)
	: m_score(res.m_score), m_domain_hash(res.m_domain_hash) {
		size_t col_start = 0;
		size_t col_num = 0;
		while (true) {
			const size_t col_end = tsv_data.find('\t', col_start);
			const std::string col = tsv_data.substr(col_start, col_end - col_start);
			switch (col_num) {
				case 0:
					m_url = URL(col);
					break;
				case 1:
					m_title = col;
					break;
				case 3:
					m_meta = col;
					break;
				case 4:
					m_snippet = make_snippet(col);
					// Fall back to the meta description when the text column is empty.
					if (m_snippet.size() == 0) {
						m_snippet = make_snippet(m_meta);
					}
					break;
			}
			if (col_end == std::string::npos) break;
			col_start = col_end + 1;
			col_num++;
		}
	}

	result_with_snippet::~result_with_snippet() {
	}

	/*
	 * Cuts the text down to at most 140 characters, trims surrounding
	 * whitespace and appends "..." when the trimmed cut still fills 140 chars.
	 */
	std::string result_with_snippet::make_snippet(const std::string &text) const {
		std::string snippet = text.substr(0, 140);
		text::trim(snippet);
		if (snippet.size() >= 140) snippet += "...";
		return snippet;
	}

}
================================================
FILE: src/api/result_with_snippet.h
================================================
/*
* MIT License
*
* Alexandria.org
*
* Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#pragma once
#include <iostream>
#include "URL.h"
#include "indexer/return_record.h"
namespace api {
	// One search result assembled from a stored TSV row plus its index record:
	// url, title and a snippet cut from the page text (or meta description).
	class result_with_snippet {
	public:
		// tsv_data is the stored document row; res supplies score and domain hash.
		result_with_snippet(const std::string &tsv_data, const indexer::return_record &res);
		~result_with_snippet();
		const URL &url() const { return m_url; };
		const std::string &title() const { return m_title; };
		const std::string &snippet() const { return m_snippet; };
		const float &score() const { return m_score; };
		const uint64_t &domain_hash() const { return m_domain_hash; };
	private:
		URL m_url;
		std::string m_title;
		// Meta description column; used as snippet fallback when the text column is empty.
		std::string m_meta;
		std::string m_snippet;
		float m_score;
		uint64_t m_domain_hash;
		// Truncates text to 140 chars, trims it, appends "..." when still full length.
		std::string make_snippet(const std::string &text) const;
	};
}
================================================
FILE: src/cluster/cluster.h
================================================
/*
* MIT License
*
* Alexandria.org
*
* Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#pragma once
================================================
FILE: src/cluster/document.cpp
================================================
/*
* MIT License
*
* Alexandria.org
*
* Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "document.h"
#include "algorithm/hash.h"
#include "text/text.h"
#include "URL.h"
namespace cluster {

	document::document()
	: m_name("unnamed document"){
	}

	document::document(const std::string &name)
	: m_name(name) {
	}

	document::~document() {
	}

	/*
	 * Tokenizes the text and counts word occurrences (by hash) in this document.
	 */
	void document::read_text(const std::string &text) {
		const std::vector<std::string> words = text::get_words(text, 0);
		for (const auto &word : words) {
			m_counts[algorithm::hash(word)]++;
		}
	}

	/*
	 * Tokenizes the text and adds word counts plus hash -> word mappings to the corpus.
	 */
	void read_text_to_corpus(corpus &corp, const std::string &text) {
		const std::vector<std::string> words = text::get_words(text, 0);
		for (const auto &word : words) {
			size_t key = algorithm::hash(word);
			corp.counts[key]++;
			if (corp.words.count(key) == 0) {
				corp.words[key] = word;
			}
		}
	}

	/*
	 * Reads url<TAB>text lines from the stream. Each line is added both to a
	 * per-host document (keyed and named by the URL host) and to the corpus.
	 * Lines without a tab are skipped.
	 */
	void read_corpus(corpus &corp, documents &documents, std::stringstream &tsv) {
		std::string line;
		while (getline(tsv, line)) {
			const size_t pos = line.find('\t');
			if (pos == std::string::npos) continue;
			URL url(line.substr(0, pos));
			const std::string doc_text = line.substr(pos);
			const size_t key = url.host_hash();
			if (!documents.count(key)) {
				documents.emplace(key, url.host());
			}
			documents[key].read_text(doc_text);
			// Removed leftover debug output that dumped the full document text
			// for the single hard-coded host "annicaviklund.se".
			read_text_to_corpus(corp, doc_text);
		}
	}

	/*
	 * Prints the (up to) 100 most frequent words of the document with counts.
	 */
	void print_document(corpus &corp, const document &document) {
		std::vector<std::pair<size_t, size_t>> keys;
		for (const auto &iter : document.m_counts) {
			keys.emplace_back(iter.first, iter.second);
		}
		std::sort(keys.begin(), keys.end(), [](const auto &a, const auto &b) {
			return a.second > b.second;
		});
		const size_t len = keys.size();
		// std::min<size_t> instead of min(100ul, len): size_t is not unsigned
		// long on every platform, so the 100ul literal was non-portable.
		for (size_t i = 0; i < std::min<size_t>(100, len); i++) {
			std::cout << corp.words[keys[i].first] << " = " << keys[i].second << std::endl;
		}
	}

}
================================================
FILE: src/cluster/document.h
================================================
/*
* MIT License
*
* Alexandria.org
*
* Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#pragma once
#include <iostream>
#include <unordered_map>
#include <cstdint>
namespace cluster {
	// Aggregated word statistics over all documents:
	// word hash -> word text, and word hash -> total occurrence count.
	typedef struct corpus_s {
		std::unordered_map<size_t, std::string> words;
		std::unordered_map<size_t, size_t> counts;
	} corpus;
	// A bag-of-words document: word hash -> occurrence count.
	class document {
	public:
		document();
		document(const std::string &name);
		~document();
		// Human readable name (read_corpus uses the URL host).
		std::string name() const { return m_name; };
		// Number of distinct words seen so far.
		size_t size() const { return m_counts.size(); };
		// Tokenizes the text and adds its word counts to this document.
		void read_text(const std::string &text);
		friend void print_document(corpus &corp, const document &document);
	private:
		std::string m_name;
		std::unordered_map<size_t, size_t> m_counts;
	};
	typedef document topic;
	// Documents keyed by URL host hash.
	typedef std::unordered_map<size_t, document> documents;
	// Reads url<TAB>text lines into per-host documents and the shared corpus.
	void read_corpus(corpus &corp, documents &documents, std::stringstream &tsv);
	// Prints the document's most frequent words with their counts.
	void print_document(corpus &corp, const document &document);
}
================================================
FILE: src/common/ThreadPool.h
================================================
/*
Copyright (c) 2012 Jakob Progsch, Václav Zeman
This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages
arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software
in a product, an acknowledgment in the product documentation would be
appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source
distribution.
*/
#ifndef THREAD_POOL_H
#define THREAD_POOL_H
#include <vector>
#include <queue>
#include <memory>
#include <thread>
#include <mutex>
#include <condition_variable>
#include <future>
#include <functional>
#include <stdexcept>
// A fixed-size thread pool (Jakob Progsch / Václav Zeman, license above).
// Workers pull type-erased std::function<void()> tasks off a shared queue;
// enqueue() wraps any callable in a std::packaged_task and returns the
// matching std::future for its result.
class ThreadPool {
public:
	explicit ThreadPool(size_t);
	// Schedules f(args...) on the pool; throws std::runtime_error after stop.
	// NOTE(review): std::result_of is deprecated in C++17 and removed in
	// C++20; std::invoke_result is the modern equivalent.
	template<class F, class... Args>
	auto enqueue(F&& f, Args&&... args)
		-> std::future<typename std::result_of<F(Args...)>::type>;
	~ThreadPool();
private:
	// need to keep track of threads so we can join them
	std::vector< std::thread > workers;
	// the task queue
	std::queue< std::function<void()> > tasks;
	// synchronization
	std::mutex queue_mutex;
	std::condition_variable condition;
	// Written only under queue_mutex; once true, workers drain and exit.
	bool stop;
};

// the constructor just launches some amount of workers
inline ThreadPool::ThreadPool(size_t threads)
	: stop(false)
{
	for(size_t i = 0;i<threads;++i)
		workers.emplace_back(
			[this]
			{
				// Worker loop: sleep until a task arrives or shutdown begins.
				for(;;)
				{
					std::function<void()> task;
					{
						std::unique_lock<std::mutex> lock(this->queue_mutex);
						this->condition.wait(lock,
							[this]{ return this->stop || !this->tasks.empty(); });
						// Exit only when stopping AND the queue is drained,
						// so tasks enqueued before destruction still run.
						if(this->stop && this->tasks.empty())
							return;
						task = std::move(this->tasks.front());
						this->tasks.pop();
					}
					// Run the task outside the lock so other workers proceed.
					task();
				}
			}
		);
}

// add new work item to the pool
template<class F, class... Args>
auto ThreadPool::enqueue(F&& f, Args&&... args)
	-> std::future<typename std::result_of<F(Args...)>::type>
{
	using return_type = typename std::result_of<F(Args...)>::type;

	// shared_ptr keeps the packaged_task alive until the queued lambda
	// (which copies the pointer) has executed.
	auto task = std::make_shared< std::packaged_task<return_type()> >(
			std::bind(std::forward<F>(f), std::forward<Args>(args)...)
		);

	std::future<return_type> res = task->get_future();
	{
		std::unique_lock<std::mutex> lock(queue_mutex);

		// don't allow enqueueing after stopping the pool
		if(stop)
			throw std::runtime_error("enqueue on stopped ThreadPool");

		tasks.emplace([task](){ (*task)(); });
	}
	condition.notify_one();
	return res;
}

// the destructor joins all threads
inline ThreadPool::~ThreadPool()
{
	{
		std::unique_lock<std::mutex> lock(queue_mutex);
		stop = true;
	}
	condition.notify_all();
	for(std::thread &worker: workers)
		worker.join();
}
#endif
================================================
FILE: src/common/datetime.cpp
================================================
/*
* MIT License
*
* Alexandria.org
*
* Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "datetime.h"
#include <ctime>
namespace common {

	/*
	 * Returns the current local date encoded as the integer YYYYMMDD.
	 */
	size_t cur_date() {
		time_t tt = time(NULL);
		struct tm tm;
		// localtime_r instead of localtime: localtime returns a pointer to
		// shared static storage and is not safe with concurrent callers.
		localtime_r(&tt, &tm);
		// tm_year counts years since 1900. The old expression
		// (2000 + (tm_year - 100)) computed the same value but underflowed
		// size_t for dates before the year 2000.
		const size_t year = 1900 + tm.tm_year;
		return (year * 100 * 100) + ((tm.tm_mon + 1) * 100) + tm.tm_mday;
	}

	/*
	 * Returns the current local time of day encoded as the integer HHMMSS.
	 */
	size_t cur_time() {
		time_t tt = time(NULL);
		struct tm tm;
		localtime_r(&tt, &tm);
		return (tm.tm_hour * 100 * 100) + (tm.tm_min * 100) + tm.tm_sec;
	}

	/*
	 * Returns the current local date and time encoded as YYYYMMDDHHMMSS.
	 */
	size_t cur_datetime() {
		size_t date = cur_date();
		return (date * 100 * 100 * 100) + cur_time();
	}

	/*
	 * Returns the current UTC time in ISO-8601 form, e.g. "2021-06-01T12:00:00Z".
	 */
	const std::string iso8601_datetime() {
		time_t now;
		time(&now);
		struct tm tm;
		// gmtime_r for the same thread-safety reason as localtime_r above.
		gmtime_r(&now, &tm);
		char buf[21];
		strftime(buf, sizeof(buf), "%FT%TZ", &tm);
		return std::string(buf);
	}

}
================================================
FILE: src/common/datetime.h
================================================
/*
* MIT License
*
* Alexandria.org
*
* Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#pragma once
#include <iostream>
namespace common {
	// Current local date as the integer YYYYMMDD.
	size_t cur_date();
	// Current local time of day as the integer HHMMSS.
	size_t cur_time();
	// Current local date and time as the integer YYYYMMDDHHMMSS.
	size_t cur_datetime();
	// Current UTC time formatted as ISO-8601, e.g. "2021-06-01T12:00:00Z".
	const std::string iso8601_datetime();
}
================================================
FILE: src/common/dictionary.cpp
================================================
/*
* MIT License
*
* Alexandria.org
*
* Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "dictionary.h"
#include "logger/logger.h"
#include "file/tsv_file.h"
#include "dictionary_row.h"
#include "algorithm/hash.h"
using namespace std;
namespace common {
dictionary::dictionary() {
}
dictionary::dictionary(file::tsv_file &tsv_file) {
load_tsv(tsv_file);
}
dictionary::~dictionary() {
}
void dictionary::load_tsv(file::tsv_file &tsv_file) {
while (!tsv_file.eof()) {
auto line = tsv_file.get_line();
std::stringstream ss(line);
std::string col;
getline(ss, col, '\t');
if (col.size()) {
size_t key = ::algorithm::hash(col);
if (m_rows.find(key) != m_rows.end()) {
handle_collision(key, col);
}
m_rows[key] = dictionary_row(ss);
}
}
}
unordered_map<size_t, dictionary_row>::const_iterator dictionary::find(const std::string &key) const {
return m_rows.find(::algorithm::hash(key));
}
unordered_map<size_t, dictionary_row>::const_iterator dictionary::find(size_t hash) const {
return m_rows.find(hash);
}
unordered_map<size_t, dictionary_row>::const_iterator dictionary::begin() const {
return m_rows.begin();
}
unordered_map<size_t, dictionary_row>::const_iterator dictionary::end() const {
return m_rows.end();
}
bool dictionary::has_key(const std::string &key) const {
return find(key) != end();
}
void dictionary::handle_collision(size_t key, const std::string &col) {
LOG_ERROR("Collision: " + std::to_string(key) + " " + col);
}
}
================================================
FILE: src/common/dictionary.h
================================================
/*
* MIT License
*
* Alexandria.org
*
* Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#pragma once
#include <map>
#include <unordered_map>
#include "dictionary_row.h"
namespace file {
class tsv_file;
}
namespace common {
	// A lookup table loaded from a TSV file: the first column of every line
	// (hashed) maps to the remaining numeric columns as a dictionary_row.
	class dictionary {
	public:
		dictionary();
		explicit dictionary(file::tsv_file &tsv_file);
		~dictionary();
		// Loads (or appends) rows from the TSV file; collisions are logged and overwritten.
		void load_tsv(file::tsv_file &tsv_file);
		// Looks up by string key (hashed internally); returns end() when absent.
		std::unordered_map<size_t, dictionary_row>::const_iterator find(const std::string &key) const;
		// Looks up by a pre-computed hash; returns end() when absent.
		std::unordered_map<size_t, dictionary_row>::const_iterator find(size_t hash) const;
		std::unordered_map<size_t, dictionary_row>::const_iterator begin() const;
		std::unordered_map<size_t, dictionary_row>::const_iterator end() const;
		bool has_key(const std::string &key) const;
		size_t size() const { return m_rows.size(); }
	private:
		// Rows keyed by the hash of their first column.
		std::unordered_map<size_t, dictionary_row> m_rows;
		// Logs the colliding key; the caller overwrites the old row.
		void handle_collision(size_t key, const std::string &col);
	};
}
================================================
FILE: src/common/dictionary_row.cpp
================================================
/*
* MIT License
*
* Alexandria.org
*
* Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "dictionary_row.h"
namespace common {

	dictionary_row::dictionary_row() {
	}

	dictionary_row::dictionary_row(const dictionary_row &row) {
		m_columns = row.m_columns;
	}

	dictionary_row::dictionary_row(const std::string &row) {
		std::stringstream stream(row);
		read_stream(stream);
	}

	dictionary_row::dictionary_row(std::stringstream &stream) {
		read_stream(stream);
	}

	dictionary_row::~dictionary_row() {
	}

	// Column value truncated to int. No bounds check: callers must pass a
	// column index below the number of parsed columns.
	int dictionary_row::get_int(int column) const {
		return (int)m_columns[column];
	}

	float dictionary_row::get_float(int column) const {
		return (float)m_columns[column];
	}

	double dictionary_row::get_double(int column) const {
		return m_columns[column];
	}

	/*
	 * Parses tab separated numeric columns from the stream into m_columns.
	 * Columns that do not parse as a number are skipped entirely (best
	 * effort), so later columns shift left by one index per skipped column.
	 * Fix: removed a local column counter that was incremented but never read.
	 */
	void dictionary_row::read_stream(std::stringstream &stream) {
		std::string col;
		while (std::getline(stream, col, '\t')) {
			try {
				m_columns.push_back(stod(col));
			} catch(const std::invalid_argument &error) {
				// Non-numeric column: deliberately ignored.
			} catch(const std::out_of_range &error) {
				// Value outside the double range: deliberately ignored.
			}
		}
	}

}
================================================
FILE: src/common/dictionary_row.h
================================================
/*
* MIT License
*
* Alexandria.org
*
* Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#pragma once
#include <iostream>
#include <sstream>
#include <vector>
// Fixed row length constant.
// NOTE(review): not referenced in this header; presumably used by files
// that include it — confirm before removing.
#define CC_ROW_LEN 5
namespace common {
// One tab-separated row of numeric columns, parsed from a string or a
// stream. All values are stored as doubles and converted on access.
class dictionary_row {
public:
dictionary_row();
dictionary_row(const dictionary_row &row);
// Parses one tab-separated line of numbers.
explicit dictionary_row(const std::string &row);
// Reads tab-separated numbers from the stream.
explicit dictionary_row(std::stringstream &stream);
~dictionary_row();
// Column accessors: value is stored as a double and cast on read.
// NOTE(review): no bounds checking — an invalid column index is UB.
int get_int(int column) const;
float get_float(int column) const;
double get_double(int column) const;
private:
// Parsed column values, in order of appearance in the input.
std::vector<double> m_columns;
// Splits the stream on tabs; unparsable fields are skipped.
void read_stream(std::stringstream &stream);
};
}
================================================
FILE: src/common/simple_thread_pool.hpp
================================================
/*
* MIT License
*
* Alexandria.org
*
* Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#pragma once
#include <iostream>
#include <thread>
#include <future>
#include <queue>
namespace common {
// Minimal fixed-size thread pool: worker threads pull std::function
// tasks from a shared queue guarded by a mutex/condition variable.
class simple_thread_pool {
public:
// Starts the given number of worker threads.
explicit simple_thread_pool(size_t);
// NOTE(review): presumably sets m_stop and joins the workers —
// implementation not visible here, confirm in the .cpp.
~simple_thread_pool();
// Queues a task; an idle worker picks it up asynchronously.
void enqueue(std::function<void()> &&fun);
private:
// Worker loop: waits on m_condition, then executes queued tasks.
void handle_work();
std::vector<std::thread> m_workers;
std::queue<std::function<void()>> m_queue; // pending tasks, guarded by m_queue_lock
std::mutex m_queue_lock;
std::condition_variable m_condition;
bool m_stop = false; // set at shutdown to stop the worker loops
};
}
================================================
FILE: src/common/system.cpp
================================================
/*
* MIT License
*
* Alexandria.org
*
* Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "system.h"
#include <thread>
#include <boost/uuid/uuid.hpp>
#include <boost/uuid/uuid_generators.hpp>
#include <boost/uuid/uuid_io.hpp>
namespace common {
// Returns true when running in a development environment.
// Production is signalled by the ALEXANDRIA_LIVE environment variable
// being set to a positive integer; anything else (unset, zero, or
// non-numeric) counts as dev.
bool is_dev() {
	// Call getenv once: the original called it twice, which both repeats
	// the lookup and races if the environment changes between calls.
	const char *live = getenv("ALEXANDRIA_LIVE");
	if (live != NULL) {
		try {
			if (std::stoi(live) > 0) {
				return false;
			}
		} catch (const std::exception &) {
			// A non-numeric or out-of-range value previously made
			// std::stoi throw and terminate the process; treat it as dev.
		}
	}
	return true;
}
// Remote path of the domain info TSV; dev and prod use different roots.
std::string domain_index_filename() {
	return is_dev() ? "/dev_files/domain_info.tsv" : "/files/domain_info.tsv";
}
// Remote path of the dictionary TSV; dev and prod use different roots.
std::string dictionary_filename() {
	return is_dev() ? "/dev_files/dictionary.tsv" : "/files/dictionary.tsv";
}
// Generates a random (version 4) UUID and returns its string form.
std::string uuid() {
	return boost::uuids::to_string(boost::uuids::random_generator()());
}
}
================================================
FILE: src/common/system.h
================================================
/*
* MIT License
*
* Alexandria.org
*
* Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#pragma once
#include <iostream>
namespace common {
// True unless the ALEXANDRIA_LIVE environment variable selects production.
bool is_dev();
// Remote path of the domain info TSV (dev or prod variant).
std::string domain_index_filename();
// Remote path of the dictionary TSV (dev or prod variant).
std::string dictionary_filename();
// Returns a freshly generated random UUID as a string.
std::string uuid();
}
================================================
FILE: src/config.cpp
================================================
/*
* MIT License
*
* Alexandria.org
*
* Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "config.h"
#include "text/text.h"
#include "logger/logger.h"
#include "file/file.h"
using namespace std;
namespace config {
// Ensures the shard directory tree exists under the default data path.
config::config() {
create_data_directories(m_data_path);
}
// Global const instance; its mutable member lets read_config update it.
const config s_instance = config();
// Returns the currently configured data path.
const std::string &data_path() {
return s_instance.data_path();
}
void create_data_directories(const std::string &data_path) {
if (file::directory_exists(data_path)) {
for (size_t shard_id = 0; shard_id < 8; shard_id++) {
const std::string base = data_path + "/" + to_string(shard_id);
file::create_directory(base);
file::create_directory(base + "/input");
file::create_directory(base + "/output");
file::create_directory(base + "/upload");
file::create_directory(base + "/hash_table");
file::create_directory(base + "/full_text");
file::create_directory(base + "/tmp");
}
}
}
// Node identity and cluster endpoints (overridable via read_config).
string node = "test0001";
string master = "localhost";
string upload = "localhost";
string data_node;
//string url_store_host = "http://localhost";
string url_store_host = "http://node0009.alexandria.org";
string url_store_path = "/alexandria/urlstore";
string url_store_cache_path = "/mnt/4/urlstore_cache";
size_t nodes_in_cluster = 1;
size_t node_id = 0;
// Indexing toggles.
bool index_snippets = true;
bool index_text = true;
// Batch names collected from batches[] / link_batches[] config lines.
vector<string> batches;
vector<string> link_batches;
// Query and result tuning.
size_t worker_count = 8;
size_t query_max_words = 10;
size_t query_max_len = 200;
size_t deduplicate_domain_count = 5;
size_t pre_result_limit = 200000;
size_t result_limit = 1000;
// File upload credentials (empty by default).
string file_upload_user = "";
string file_upload_password = "";
size_t n_grams = 1;
size_t shard_hash_table_size = 100000;
size_t html_parser_long_text_len = 1000;
size_t ft_shard_builder_buffer_len = 240000;
// Full text indexer tuning (several are also settable via read_config).
size_t ft_num_shards = 2048;
size_t ft_max_sections = 8;
size_t ft_max_results_per_section = 100000;
size_t ft_section_depth = 8;
size_t ft_max_cache_gb = 30;
size_t ft_num_threads_indexing = 24;
size_t ft_num_threads_merging = 24;
size_t ft_num_threads_appending = 8;
// Cache bytes available to each indexing thread per shard.
// The numerator is cast to double before dividing: the expression was
// previously pure integer division, truncating the fractional part even
// though the function returns a double.
double ft_cached_bytes_per_shard() {
	return static_cast<double>(ft_max_cache_gb * 1000ul * 1000ul * 1000ul) / (ft_num_shards * ft_num_threads_indexing);
}
// Loads configuration from `config_file`, overwriting the defaults above.
// Lines have the form `key = value`; `#` starts a comment and blank lines
// are skipped. Keys ending in `[]` append to a list instead of replacing.
// Unknown keys are silently ignored.
void read_config(const string &config_file) {
	batches.clear();
	link_batches.clear();
	ifstream in(config_file);
	if (!in.is_open()) {
		LOG_ERROR("Could not read config file: " + config_file);
		return;
	}
	string line;
	while (getline(in, line)) {
		// Strip trailing comments.
		size_t comment_pos = line.find("#");
		if (comment_pos != string::npos) {
			line = line.substr(0, comment_pos);
		}
		if (text::trim(line) == "") {
			continue;
		}
		vector<string> parts;
		boost::split(parts, line, boost::is_any_of("="));
		for (string &part : parts) {
			part = text::trim(part);
		}
		// A line with no '=' has no value; skip it instead of reading
		// past the end of `parts` below (previously undefined behavior).
		if (parts.size() < 2) {
			continue;
		}
		if (parts[0] == "node") {
			node = parts[1];
		} else if (parts[0] == "master") {
			// Setting master also defaults upload to the same host; an
			// explicit `upload` line can still override it afterwards.
			master = parts[1];
			upload = parts[1];
		} else if (parts[0] == "upload") {
			upload = parts[1];
		} else if (parts[0] == "data_node") {
			data_node = parts[1];
		} else if (parts[0] == "url_store_host") {
			url_store_host = parts[1];
		} else if (parts[0] == "url_store_path") {
			url_store_path = parts[1];
		} else if (parts[0] == "nodes_in_cluster") {
			nodes_in_cluster = stoi(parts[1]);
		} else if (parts[0] == "node_id") {
			node_id = stoi(parts[1]);
		} else if (parts[0] == "batches[]") {
			batches.push_back(parts[1]);
		} else if (parts[0] == "link_batches[]") {
			link_batches.push_back(parts[1]);
		} else if (parts[0] == "worker_count") {
			worker_count = stoi(parts[1]);
		} else if (parts[0] == "query_max_words") {
			query_max_words = stoi(parts[1]);
		} else if (parts[0] == "query_max_len") {
			query_max_len = stoi(parts[1]);
		} else if (parts[0] == "deduplicate_domain_count") {
			deduplicate_domain_count = stoi(parts[1]);
		} else if (parts[0] == "pre_result_limit") {
			pre_result_limit = stoi(parts[1]);
		} else if (parts[0] == "result_limit") {
			result_limit = stoi(parts[1]);
		} else if (parts[0] == "ft_num_shards") {
			ft_num_shards = stoi(parts[1]);
		} else if (parts[0] == "ft_max_sections") {
			ft_max_sections = stoi(parts[1]);
		} else if (parts[0] == "ft_max_results_per_section") {
			ft_max_results_per_section = stoi(parts[1]);
		} else if (parts[0] == "ft_section_depth") {
			ft_section_depth = stoi(parts[1]);
		} else if (parts[0] == "ft_max_cache_gb") {
			ft_max_cache_gb = stoi(parts[1]);
		} else if (parts[0] == "ft_num_threads_indexing") {
			ft_num_threads_indexing = stoi(parts[1]);
		} else if (parts[0] == "ft_num_threads_merging") {
			ft_num_threads_merging = stoi(parts[1]);
		} else if (parts[0] == "ft_num_threads_appending") {
			ft_num_threads_appending = stoi(parts[1]);
		} else if (parts[0] == "file_upload_user") {
			file_upload_user = parts[1];
		} else if (parts[0] == "file_upload_password") {
			file_upload_password = parts[1];
		} else if (parts[0] == "n_grams") {
			n_grams = stoull(parts[1]);
		} else if (parts[0] == "index_snippets") {
			index_snippets = static_cast<bool>(stoull(parts[1]));
		} else if (parts[0] == "index_text") {
			index_text = static_cast<bool>(stoull(parts[1]));
		} else if (parts[0] == "shard_hash_table_size") {
			shard_hash_table_size = stoull(parts[1]);
		} else if (parts[0] == "html_parser_long_text_len") {
			html_parser_long_text_len = stoull(parts[1]);
		} else if (parts[0] == "data_path") {
			// Also (re)creates the shard directory tree under the new path.
			s_instance.data_path(parts[1]);
		}
	}
}
}
================================================
FILE: src/config.h
================================================
/*
* MIT License
*
* Alexandria.org
*
* Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#pragma once
#include <iostream>
#include <fstream>
#include <vector>
namespace config {
// Creates the per-shard directory tree under data_path (no-op when the
// base directory does not exist — see the definition in config.cpp).
void create_data_directories(const std::string &data_path);
// Holds the configured data path. The setter is const but mutates a
// `mutable` member so the global const instance can still be updated by
// read_config() at runtime.
class config {
public:
config();
const std::string &data_path() const { return m_data_path; }
// Sets the path and (re)creates the shard directories beneath it.
void data_path(const std::string &str) const { m_data_path = str; create_data_directories(m_data_path); }
private:
mutable std::string m_data_path = "/mnt";
};
// Accessor for the global config instance's data path.
const std::string &data_path();
// Node identity and cluster endpoints.
extern std::string node;
extern std::string master;
extern std::string upload;
extern std::string data_node;
// URL store service location and local cache.
extern std::string url_store_host;
extern std::string url_store_path;
extern std::string url_store_cache_path;
const size_t url_store_shards = 24;
extern size_t nodes_in_cluster;
extern size_t node_id;
// Indexing toggles.
extern bool index_snippets;
extern bool index_text;
// Input batch names read from the config file.
extern std::vector<std::string> batches;
extern std::vector<std::string> link_batches;
// Query and result tuning.
extern size_t worker_count;
extern size_t query_max_words;
extern size_t query_max_len;
extern size_t deduplicate_domain_count;
extern size_t pre_result_limit;
extern size_t result_limit;
// File upload credentials.
extern std::string file_upload_user;
extern std::string file_upload_password;
extern size_t n_grams;
extern size_t shard_hash_table_size;
extern size_t html_parser_long_text_len;
extern size_t ft_shard_builder_buffer_len;
/*
Full text indexer config. Defaults live in config.cpp; note that these
are also settable at runtime via read_config(), despite being grouped
with the compile-time constants below.
*/
// Full text indexer config
extern size_t ft_num_shards;
extern size_t ft_max_sections;
extern size_t ft_max_results_per_section;
extern size_t ft_section_depth;
extern size_t ft_max_cache_gb;
extern size_t ft_num_threads_indexing;
extern size_t ft_num_threads_merging;
extern size_t ft_num_threads_appending;
// Cache bytes available per shard per indexing thread.
double ft_cached_bytes_per_shard();
// Link indexer config
inline const unsigned long long li_max_cache_gb = 4;
inline const unsigned long long li_num_threads_indexing = 48;
inline const unsigned long long li_num_threads_merging = 16;
// Cache bytes per shard per link-indexing thread. The numerator is cast
// to double so the division keeps its fractional part; it was previously
// integer division truncated before being stored in a double.
// NOTE(review): evaluated at static initialization, so it uses the
// default ft_num_shards, not a value later set by read_config() — confirm
// this is intended.
inline const double li_cached_bytes_per_shard = static_cast<double>(li_max_cache_gb * 1000ul*1000ul*1000ul) / (ft_num_shards * li_num_threads_indexing);
inline const unsigned long long li_indexer_max_cache_size = 500;
// Hash table indexer config
inline const unsigned long long ht_num_shards = 1031;
inline const unsigned long long ht_num_buckets = 8;
// NOTE(review): presumably the key size in bytes — confirm against the
// hash table implementation.
inline const unsigned long long ht_key_size = 8;
// Server config
// Other constants.
inline const unsigned long long num_async_file_transfers = 48;
inline const std::string test_data_path = "/var/www/html/node0003.alexandria.org/test-data/";
// Commoncrawl parser.
inline const std::string cc_target_output = "alexandria-cc-output";
inline const bool cc_run_on_lambda = false;
inline const std::string log_file_path = "/var/log/alexandria.log";
// Parses the given config file and overwrites the defaults above.
void read_config(const std::string &config_file);
}
================================================
FILE: src/debug.cpp
================================================
/*
* MIT License
*
* Alexandria.org
*
* Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "debug.h"
// Debug helper: prints the value stored under `elem` to stdout.
// Uses operator[], so a missing key is default-inserted as 0 — this is
// why the map parameter is non-const.
void print_elem(std::map<size_t, size_t> &m, size_t elem) {
	const size_t value = m[elem];
	std::cout << value << std::endl;
}
================================================
FILE: src/debug.h
================================================
/*
* MIT License
*
* Alexandria.org
*
* Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#pragma once
#include <iostream>
#include <map>
void print_elem(std::map<size_t, size_t> &m, size_t elem);
================================================
FILE: src/domain_stats/domain_stats.cpp
================================================
/*
* MIT License
*
* Alexandria.org
*
* Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "domain_stats.h"
#include <iostream>
#include "common/dictionary.h"
#include "file/tsv_file_remote.h"
#include "logger/logger.h"
#include "common/system.h"
namespace domain_stats {
// In-memory host -> stats rows, populated by download_domain_stats().
common::dictionary domain_data;
// Downloads domain_info.tsv from the remote file store and loads it
// into domain_data for harmonic_centrality lookups.
void download_domain_stats() {
LOG_INFO("download domain_info.tsv");
file::tsv_file_remote domain_info_tsv(common::domain_index_filename());
LOG_INFO("parsing.....");
domain_data.load_tsv(domain_info_tsv);
}
// Convenience overload: looks up the centrality of the URL's host.
float harmonic_centrality(const URL &url) {
return harmonic_centrality(url.host());
}
// Returns the harmonic centrality score for the given host from the
// loaded domain data, or 0.0 when the host is unknown.
// Requires download_domain_stats() to have populated domain_data first;
// before that every lookup returns 0.0.
float harmonic_centrality(const std::string &host) {
	const auto iter = domain_data.find(host);
	if (iter == domain_data.end()) {
		return 0.0f;
	}
	// Bind a reference instead of copying the whole row (and its column
	// vector) just to read one value.
	const common::dictionary_row &row = iter->second;
	return row.get_float(0);
}
}
================================================
FILE: src/domain_stats/domain_stats.h
================================================
/*
* MIT License
*
* Alexandria.org
*
* Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#pragma once
#include <iostream>
#include "URL.h"
namespace domain_stats {
// Downloads and parses domain_info.tsv into the in-memory lookup table.
void download_domain_stats();
// Harmonic centrality score of the URL's host; 0.0 when unknown.
float harmonic_centrality(const URL &url);
// Harmonic centrality score of the given host; 0.0 when unknown.
// (Parameter renamed from `domain` to `host` to match the definition.)
float harmonic_centrality(const std::string &host);
}
================================================
FILE: src/downloader/merge_downloader.cpp
================================================
/*
* MIT License
*
* Alexandria.org
*
* Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include <iostream>
#include <sstream>
#include "file/file.h"
#include "file/archive.h"
#include "hash_table2/builder.h"
#include "utils/thread_pool.hpp"
#include "indexer/index.h"
#include "indexer/index_builder.h"
#include "indexer/index_reader.h"
#include "indexer/value_record.h"
namespace downloader {
// True when all eight internal_links_<i> files exist under `path`,
// i.e. the batch's internal link data finished downloading.
bool internal_links_complete(const std::string &path) {
	for (size_t part = 0; part < 8; part++) {
		const std::string filename = path + "/internal_links_" + std::to_string(part);
		if (!file::file_exists(filename)) {
			return false;
		}
	}
	return true;
}
// True when every hash table shard under `path` has both its .pos and
// .data file, i.e. the batch's hash table finished downloading.
// Checks both files per shard in a single pass; the original made two
// full sweeps over all 1019 shard ids.
bool hash_table_complete(const std::string &path) {
	const size_t num_shards = 1019;
	for (size_t i = 0; i < num_shards; i++) {
		const std::string shard = path + "/" + std::to_string(i);
		if (!file::file_exists(shard + ".pos") || !file::file_exists(shard + ".data")) {
			return false;
		}
	}
	return true;
}
// Merges the batch's internal_links_<i> archives into the on-disk
// internal_links indexes.
// NOTE(review): the bare `return;` below disables this function entirely —
// everything after it is dead code. Presumably the merge is deliberately
// turned off; confirm before removing the return or deleting the body.
void merge_internal_links(const std::string &path, const std::string &batch_name) {
return;
/*
const std::string target_path = "/slow_data/internal_links/" + batch_name;
file::create_directory(target_path);
for (size_t i = 0; i < 8; i++) {
file::copy_file(path + "/internal_links_" + std::to_string(i), target_path + "/internal_links_" + std::to_string(i));
}
*/
// Unreachable due to the early return above: one outer task per archive,
// each untarring entries and merging them on an inner 4-thread pool.
utils::thread_pool pool(8);
for (size_t i = 0; i < 8; i++) {
pool.enqueue([i, path]() {
file::archive tar(path + "/internal_links_" + std::to_string(i));
utils::thread_pool pool(4, 10);
tar.untar([&pool](const std::string &filename, const std::string &data) {
pool.enqueue([filename, data]() {
// Entry names carry the host hash followed by a 5-character suffix
// (presumably a file extension — confirm against the archive writer).
uint64_t host_hash = std::stoull(filename.substr(0, filename.size() - 5));
std::istringstream ram_reader(data);
indexer::index_builder<indexer::value_record> idx1("internal_links", host_hash, 1000);
indexer::index<indexer::value_record> idx2(&ram_reader, 1000);
try {
idx1.merge_with(idx2);
} catch (const std::runtime_error &err) {
// The file is corrupt. Lets delete it and report.
std::cout << "internal_links: " << host_hash << " is corrupt" << std::endl;
idx1.truncate();
} catch (const std::bad_alloc &err) {
// The file is corrupt. Lets delete it and report.
std::cout << "internal_links: " << host_hash << " is corrupt" << std::endl;
idx1.truncate();
}
});
});
pool.run_all();
});
}
pool.run_all();
std::cout << "finished with the merge" << std::endl;
}
void merge_hash_table(const std::string &path) {
utils::thread_pool pool(32);
hash_table2::builder ht("all_urls", 1019, 1000000, "/slow_data");
for (size_t i = 0; i < 1019; i++) {
pool.enqueue([&ht, i, path]() {
ht.get_shard(i)->merge_with(path + "/" + std::to_string(i) + ".pos", path + "/" + std::to_string(i) + ".data");
});
}
pool.run_all();
}
// Scans <data_path>/downloader/<node_id>/<timestamp> batch directories,
// merges the first fully-downloaded batch into the persistent indexes,
// deletes it, and then terminates the process.
void merge_downloader() {
indexer::index_builder<indexer::value_record>::create_directories("internal_links");
file::read_directory(config::data_path() + "/downloader", [](const std::string &node_id) {
const std::string dir = config::data_path() + "/downloader/" + node_id;
file::read_directory(dir, [dir](const std::string &file) {
try {
// Batch directories are named by numeric timestamp; non-numeric
// entries make stoull throw and are skipped by the catch below.
size_t ts = std::stoull(file);
const std::string batch = dir + "/" + std::to_string(ts);
if (internal_links_complete(batch) && hash_table_complete(batch + "/ht")) {
std::cout << "merging directory: " << batch << std::endl;
profiler::instance prof1("merge_internal_links");
merge_internal_links(batch, std::to_string(ts));
prof1.stop();
profiler::instance prof2("merge_hash_table");
merge_hash_table(batch + "/ht");
prof2.stop();
file::delete_directory(batch);
// NOTE(review): exits after the first merged batch — presumably the
// process is re-launched externally once per batch; confirm before
// changing this to continue scanning.
exit(0);
}
} catch (...) {
// Swallows all errors so one bad entry does not stop the scan.
}
});
});
}
}
================================================
FILE: src/downloader/merge_downloader.h
================================================
/*
* MIT License
*
* Alexandria.org
*
* Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#pragma once
#include <iostream>
namespace downloader {
void merge_downloader();
}
================================================
FILE: src/downloader/warc_downloader.cpp
================================================
/*
* MIT License
*
* Alexandria.org
*
* Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include <iomanip>
#include "config.h"
#include "common/datetime.h"
#include "warc/warc.h"
#include "utils/thread_pool.hpp"
#include "utils/id_allocator.h"
#include "file/archive.h"
#include "logger/logger.h"
#include "text/text.h"
#include "transfer/transfer.h"
#include <iostream>
#include "hash_table2/builder.h"
#include "algorithm/algorithm.h"
#include "indexer/index_utils.h"
#include "indexer/index_builder.h"
#include "indexer/value_record.h"
#include "indexer/merger.h"
namespace downloader {
/*
 * Downloads a single WARC file from Common Crawl, feeds each chunk into a
 * warc::parser, then uploads the parsed result and link result.
 * The download is retried up to 3 times, sleeping 5 seconds between tries.
 * NOTE(review): if all 3 tries fail we still upload whatever the parser has
 * accumulated (possibly nothing) — this preserves the original behavior;
 * confirm that partial results are intended to be uploaded.
 */
void run_downloader(const std::string &warc_path) {
	warc::parser pp;
	for (int retry = 0; retry < 3; retry++) {
		try {
			warc::multipart_download("http://data.commoncrawl.org/" + warc_path, [&pp](const std::string &chunk) {
				std::stringstream ss(chunk);
				pp.parse_stream(ss);
			});
			break;
		} catch (const std::runtime_error &err) {
			std::cout << "GOT ERROR: " << err.what() << std::endl;
			std::cout << "Retrying... try " << retry << std::endl;
			std::this_thread::sleep_for(std::chrono::seconds(5));
		}
	}
	LOG_INFO("uploading: " + warc_path);
	// Check each upload separately. The original overwrote the first upload's
	// error code with the second one, silently dropping failures of the
	// result upload.
	if (transfer::upload_gz_file(warc::get_result_path(warc_path), pp.result())) {
		LOG_INFO("error uploading: " + warc_path);
	}
	if (transfer::upload_gz_file(warc::get_link_result_path(warc_path), pp.link_result())) {
		LOG_INFO("error uploading: " + warc_path);
	}
}
/*
 * Fetches this node's warc.paths file and returns its non-empty lines,
 * trimmed of surrounding whitespace. Returns an empty vector when the
 * transfer fails.
 */
std::vector<std::string> download_warc_paths() {
	int error;
	auto content = transfer::file_to_string("nodes/" + config::node + "/warc.paths", error);
	if (error == transfer::ERROR) return {};
	content = text::trim(content);
	std::vector<std::string> raw_warc_paths;
	boost::algorithm::split(raw_warc_paths, content, boost::is_any_of("\n"));
	std::vector<std::string> warc_paths;
	warc_paths.reserve(raw_warc_paths.size());
	for (const auto &raw_path : raw_warc_paths) {
		// Trim once and reuse; the original called text::trim() twice per line.
		auto trimmed = text::trim(raw_path);
		if (!trimmed.empty()) {
			warc_paths.push_back(std::move(trimmed));
		}
	}
	return warc_paths;
}
// Joins the given WARC paths with newlines and uploads the result to this
// node's warc.paths file. Returns true when the upload succeeded.
bool upload_warc_paths(const std::vector<std::string> &warc_paths) {
	const std::string joined = boost::algorithm::join(warc_paths, "\n");
	const int status = transfer::upload_file("nodes/" + config::node + "/warc.paths", joined);
	return status == transfer::OK;
}
/*
 * Splits warc_paths into one chunk per worker thread and processes each
 * chunk on a 12-thread pool, calling run_downloader for every path.
 * Blocks until all chunks are done.
 */
void start_downloaders(const std::vector<std::string> &warc_paths) {
	const size_t num_threads = 12;
	// Integer ceiling division. The original used
	// std::ceil(warc_paths.size() / num_threads), where the integer division
	// truncates before ceil runs (making ceil a no-op), and the "+ 1"
	// workaround produced oversized chunks that left threads idle
	// (e.g. 12 paths -> chunks of 2 -> only 6 of 12 threads used).
	size_t chunk_size = (warc_paths.size() + num_threads - 1) / num_threads;
	if (chunk_size == 0) chunk_size = 1; // guard for an empty input vector
	std::vector<std::vector<std::string>> chunks;
	algorithm::vector_chunk<std::string>(warc_paths, chunk_size, chunks);
	utils::thread_pool pool(num_threads);
	for (const auto &chunk : chunks) {
		pool.enqueue([chunk] {
			size_t count = 0;
			for (const auto &warc_path : chunk) {
				run_downloader(warc_path);
				count++;
				std::cout << "done with " << warc_path << " done with " << count << "/" << chunk.size() << std::endl;
			}
		});
	}
	pool.run_all();
}
// Currently a no-op: the whole upload pipeline below is commented out.
// When re-enabled it would (1) optimize, tar and upload each of the 8
// internal-links index directories, and (2) upload every shard of the
// "crawl_index" hash table, all under a timestamped upload id.
// NOTE(review): in the disabled code the read_directory callback does its
// work inline rather than enqueueing onto `pool`, so pool.run_all() would
// have nothing to run — verify before re-enabling.
void upload_all() {
	/*auto upload_id = std::to_string(common::cur_datetime());
	// Upload internal links.
	for (size_t i = 0; i < 8; i++) {
		// Optimize all internal links.
		utils::thread_pool pool(32);
		file::read_directory(config::data_path() + "/" + std::to_string(i) + "/full_text/internal_links", [&pool](const std::string &filename) {
			uint64_t host_hash = std::stoull(filename.substr(0, filename.size() - 5));
			indexer::index_builder<indexer::value_record> idx("internal_links", host_hash, 1000);
			idx.optimize();
		});
		pool.run_all();
		const auto filename = "internal_links_" + std::to_string(i);
		file::archive tar(filename);
		tar.read_dir(config::data_path() + "/" + std::to_string(i) + "/full_text/internal_links");
		transfer::upload_file_from_disk("downloader/" + config::node + "/" + upload_id + "/" + filename, filename);
		file::delete_file(filename);
	}
	hash_table2::hash_table ht("crawl_index", 1019);
	ht.for_each_shard([upload_id](auto shard) {
		const auto pos_filename = shard->filename_pos();
		const auto data_filename = shard->filename_data();
		const auto target_filename = std::to_string(shard->shard_id());
		transfer::upload_file_from_disk("downloader/" + config::node + "/" + upload_id + "/ht/" + target_filename + ".pos", pos_filename);
		transfer::upload_file_from_disk("downloader/" + config::node + "/" + upload_id + "/ht/" + target_filename + ".data", data_filename);
	});
	*/
}
void warc_downloader_with_url(const std::string &batch, const std::string &warc_paths_url) {
std::vector<std::string> warc_paths;
int error;
auto content = transfer::gz_file_to_string(warc_paths_url, error);
std::stringstream ss(content);
std::string lin
gitextract_46gecs8w/
├── .gdbinit
├── .gitignore
├── CMakeLists.txt
├── Dockerfile
├── LICENSE
├── README.md
├── cmake/
│ └── Findfcgi.cmake
├── config.conf
├── documentation/
│ ├── alexandria.md
│ ├── api_response_format.md
│ ├── caching.md
│ ├── coding_rules.md
│ ├── configure_local_nginx.md
│ ├── full_text_indexes.md
│ ├── ideas.md
│ ├── index_file_format.md
│ ├── indexer.md
│ ├── installing_nodes.md
│ ├── performance_journal.md
│ ├── search_result_ranking.md
│ └── statues_swe.tex
├── scripts/
│ ├── bootstrap_node_2drives.sh
│ ├── build-deps.sh
│ ├── clean.sh
│ ├── download-deps.sh
│ ├── download-test-data.sh
│ ├── find_missing_files_in_batch.sh
│ ├── init-docker.sh
│ ├── install-deps.sh
│ ├── packager.sh
│ ├── prepare-output-dirs.sh
│ ├── truncate.sh
│ └── update.sh
├── src/
│ ├── URL.cpp
│ ├── URL.h
│ ├── alexandria.cpp
│ ├── algorithm/
│ │ ├── algorithm.cpp
│ │ ├── algorithm.h
│ │ ├── bloom_filter.cpp
│ │ ├── bloom_filter.h
│ │ ├── hash.cpp
│ │ ├── hash.h
│ │ ├── hyper_ball.h
│ │ ├── hyper_log_log.cpp
│ │ ├── hyper_log_log.h
│ │ ├── intersection.cpp
│ │ ├── intersection.h
│ │ ├── sort.cpp
│ │ ├── sort.h
│ │ ├── sum_sorted.h
│ │ └── top_k.h
│ ├── api/
│ │ ├── api_response.cpp
│ │ ├── api_response.h
│ │ ├── result_with_snippet.cpp
│ │ └── result_with_snippet.h
│ ├── cluster/
│ │ ├── cluster.h
│ │ ├── document.cpp
│ │ └── document.h
│ ├── common/
│ │ ├── ThreadPool.h
│ │ ├── datetime.cpp
│ │ ├── datetime.h
│ │ ├── dictionary.cpp
│ │ ├── dictionary.h
│ │ ├── dictionary_row.cpp
│ │ ├── dictionary_row.h
│ │ ├── simple_thread_pool.hpp
│ │ ├── system.cpp
│ │ └── system.h
│ ├── config.cpp
│ ├── config.h
│ ├── debug.cpp
│ ├── debug.h
│ ├── domain_stats/
│ │ ├── domain_stats.cpp
│ │ └── domain_stats.h
│ ├── downloader/
│ │ ├── merge_downloader.cpp
│ │ ├── merge_downloader.h
│ │ ├── warc_downloader.cpp
│ │ └── warc_downloader.h
│ ├── file/
│ │ ├── archive.cpp
│ │ ├── archive.h
│ │ ├── file.cpp
│ │ ├── file.h
│ │ ├── gz_tsv_file.cpp
│ │ ├── gz_tsv_file.h
│ │ ├── tsv_file.cpp
│ │ ├── tsv_file.h
│ │ ├── tsv_file_remote.cpp
│ │ ├── tsv_file_remote.h
│ │ ├── tsv_row.cpp
│ │ └── tsv_row.h
│ ├── full_text/
│ │ ├── domain_link_record.h
│ │ ├── link_record.h
│ │ ├── record.h
│ │ ├── result_set.h
│ │ └── search_metric.h
│ ├── hash_table2/
│ │ ├── builder.cpp
│ │ ├── builder.h
│ │ ├── hash_table.cpp
│ │ ├── hash_table.h
│ │ ├── hash_table_shard.cpp
│ │ ├── hash_table_shard.h
│ │ ├── hash_table_shard_base.h
│ │ ├── hash_table_shard_builder.cpp
│ │ └── hash_table_shard_builder.h
│ ├── hash_table_helper/
│ │ ├── hash_table_helper.cpp
│ │ └── hash_table_helper.h
│ ├── http/
│ │ ├── request.cpp
│ │ ├── request.h
│ │ ├── response.h
│ │ ├── server.cpp
│ │ └── server.h
│ ├── indexer/
│ │ ├── basic_index.h
│ │ ├── basic_index_builder.h
│ │ ├── console.cpp
│ │ ├── console.h
│ │ ├── counted_record.h
│ │ ├── domain_link_record.h
│ │ ├── domain_record.h
│ │ ├── generic_record.h
│ │ ├── index.h
│ │ ├── index_base.h
│ │ ├── index_builder.h
│ │ ├── index_manager.cpp
│ │ ├── index_manager.h
│ │ ├── index_reader.cpp
│ │ ├── index_reader.h
│ │ ├── index_utils.cpp
│ │ ├── index_utils.h
│ │ ├── link_record.h
│ │ ├── merger.cpp
│ │ ├── merger.h
│ │ ├── regular_index_builder.h
│ │ ├── return_record.h
│ │ ├── score_builder.cpp
│ │ ├── score_builder.h
│ │ ├── sharded.h
│ │ ├── sharded_builder.h
│ │ ├── sharded_index.h
│ │ ├── sharded_index_builder.h
│ │ ├── url_record.h
│ │ └── value_record.h
│ ├── indexer.cpp
│ ├── logger/
│ │ ├── logger.cpp
│ │ └── logger.h
│ ├── memory/
│ │ ├── debugger.cpp
│ │ ├── debugger.h
│ │ ├── memory.cpp
│ │ ├── memory.h
│ │ └── overload.cpp
│ ├── parser/
│ │ ├── cc_parser.cpp
│ │ ├── cc_parser.h
│ │ ├── entities.cpp
│ │ ├── entities.h
│ │ ├── html_link.cpp
│ │ ├── html_link.h
│ │ ├── html_parser.cpp
│ │ ├── html_parser.h
│ │ ├── parser.cpp
│ │ ├── parser.h
│ │ ├── unicode.cpp
│ │ └── unicode.h
│ ├── profiler/
│ │ ├── profiler.cpp
│ │ └── profiler.h
│ ├── scraper/
│ │ ├── scraper.cpp
│ │ ├── scraper.h
│ │ ├── scraper_store.cpp
│ │ └── scraper_store.h
│ ├── scraper.cpp
│ ├── search_engine/
│ │ ├── search_allocation.h
│ │ ├── search_engine.cpp
│ │ └── search_engine.h
│ ├── server/
│ │ ├── search_server.cpp
│ │ ├── search_server.h
│ │ ├── url_server.cpp
│ │ └── url_server.h
│ ├── server.cpp
│ ├── stats/
│ │ └── stats.h
│ ├── text/
│ │ ├── stopwords.cpp
│ │ ├── stopwords.h
│ │ ├── text.cpp
│ │ └── text.h
│ ├── tools/
│ │ ├── calculate_harmonic.cpp
│ │ ├── calculate_harmonic.h
│ │ ├── counter.cpp
│ │ ├── counter.h
│ │ ├── find_links.cpp
│ │ ├── find_links.h
│ │ ├── generate_url_lists.cpp
│ │ ├── generate_url_lists.h
│ │ ├── splitter.cpp
│ │ └── splitter.h
│ ├── transfer/
│ │ ├── transfer.cpp
│ │ └── transfer.h
│ ├── url_link/
│ │ ├── link.cpp
│ │ └── link.h
│ ├── utils/
│ │ ├── id_allocator.h
│ │ ├── thread_pool.cpp
│ │ ├── thread_pool.hpp
│ │ └── thread_pool_arg.h
│ └── warc/
│ ├── tlds.h
│ ├── warc.cpp
│ └── warc.h
└── tests/
├── main.cpp
├── test_algorithm.cpp
├── test_bloom_filter.cpp
├── test_cc_parser.cpp
├── test_config.conf
├── test_config2.conf
├── test_configuration.cpp
├── test_counted_index_builder.cpp
├── test_datetime.h
├── test_file.cpp
├── test_hash.cpp
├── test_hash_table.cpp
├── test_html_parser.cpp
├── test_hyper_ball.cpp
├── test_hyper_log_log.cpp
├── test_index_builder.cpp
├── test_index_iteration.cpp
├── test_index_reader.cpp
├── test_logger.cpp
├── test_memory.cpp
├── test_n_gram.cpp
├── test_robot_parser.cpp
├── test_scraper.cpp
├── test_sharded_index_builder.cpp
├── test_sort.cpp
├── test_sum_sorted.cpp
├── test_text.cpp
├── test_thread_pool.cpp
├── test_top_k.cpp
├── test_unicode.cpp
├── test_url.cpp
└── test_url_record.cpp
SYMBOL INDEX (582 symbols across 191 files)
FILE: src/URL.cpp
function string (line 74) | string URL::str() const {
function string (line 78) | string URL::key() const {
function string (line 87) | string URL::hash_input() const {
function string (line 119) | string URL::host() const {
function string (line 123) | string URL::host_top_domain() const {
function string (line 143) | string URL::scheme() const {
function string (line 147) | string URL::host_reverse() const {
function string (line 151) | string URL::path() const {
function string (line 155) | string URL::path_with_query() const {
function string (line 183) | string URL::host_reverse(const string &host) {
function string (line 190) | string URL::host_reverse_top_domain(const string &host) {
function string (line 204) | string URL::domain_without_tld() const {
function URL (line 227) | URL &URL::operator=(const URL &other) {
function istream (line 240) | istream &operator >>(istream &ss, URL &url) {
function ostream (line 247) | ostream &operator <<(ostream& os, const URL& url) {
FILE: src/URL.h
function class (line 36) | class URL {
FILE: src/alexandria.cpp
function help (line 48) | void help() {
function main (line 53) | int main(int argc, const char **argv) {
FILE: src/algorithm/algorithm.cpp
type algorithm (line 38) | namespace algorithm {
function incremental_partitions (line 46) | std::vector<std::vector<int>> incremental_partitions(const std::vector...
function harmonic_centrality_subvector (line 94) | std::vector<double> harmonic_centrality_subvector(size_t vlen, const s...
function harmonic_centrality (line 158) | std::vector<double> harmonic_centrality(size_t vlen, const std::set<st...
function harmonic_centrality (line 177) | std::vector<double> harmonic_centrality(size_t vlen, const std::vector...
function harmonic_centrality_threaded (line 181) | std::vector<double> harmonic_centrality_threaded(size_t vlen, const st...
function harmonic_centrality_threaded (line 200) | std::vector<double> harmonic_centrality_threaded(size_t vlen, const st...
FILE: src/algorithm/algorithm.h
function namespace (line 34) | namespace algorithm {
FILE: src/algorithm/bloom_filter.cpp
type algorithm (line 33) | namespace algorithm {
FILE: src/algorithm/bloom_filter.h
function namespace (line 34) | namespace algorithm {
FILE: src/algorithm/hash.cpp
type algorithm (line 31) | namespace algorithm {
function murmur_hash (line 37) | size_t murmur_hash(const char *key, size_t len, size_t seed) {
function hash (line 77) | size_t hash(const std::string &str) {
function hash_with_seed (line 82) | size_t hash_with_seed(const std::string &str, size_t seed) {
FILE: src/algorithm/hash.h
function namespace (line 29) | namespace algorithm {
FILE: src/algorithm/hyper_log_log.cpp
type algorithm (line 31) | namespace algorithm {
function hyper_log_log (line 107) | hyper_log_log hyper_log_log::operator +(const hyper_log_log &hl) const {
function hyper_log_log (line 114) | hyper_log_log &hyper_log_log::operator +=(const hyper_log_log &hl) {
function hyper_log_log (line 119) | hyper_log_log &hyper_log_log::operator =(const hyper_log_log &other) {
FILE: src/algorithm/hyper_log_log.h
function namespace (line 35) | namespace algorithm {
FILE: src/algorithm/intersection.cpp
type algorithm (line 31) | namespace algorithm {
function intersection (line 33) | roaring::Roaring intersection(const std::vector<roaring::Roaring> &inp...
FILE: src/algorithm/intersection.h
function namespace (line 32) | namespace algorithm {
FILE: src/algorithm/sort.cpp
type algorithm (line 29) | namespace algorithm {
FILE: src/algorithm/sort.h
function namespace (line 32) | namespace algorithm {
FILE: src/algorithm/sum_sorted.h
function namespace (line 32) | namespace algorithm {
FILE: src/algorithm/top_k.h
function namespace (line 32) | namespace algorithm {
FILE: src/api/api_response.cpp
type api (line 33) | namespace api {
type full_text::search_metric (line 35) | struct full_text::search_metric
FILE: src/api/api_response.h
function namespace (line 32) | namespace full_text {
function namespace (line 36) | namespace indexer {
function namespace (line 40) | namespace api {
FILE: src/api/result_with_snippet.cpp
type api (line 30) | namespace api {
FILE: src/api/result_with_snippet.h
function namespace (line 33) | namespace api {
FILE: src/cluster/document.cpp
type cluster (line 32) | namespace cluster {
function read_text_to_corpus (line 55) | void read_text_to_corpus(corpus &corp, const std::string &text) {
function read_corpus (line 67) | void read_corpus(corpus &corp, documents &documents, std::stringstream...
function print_document (line 89) | void print_document(corpus &corp, const document &document) {
FILE: src/cluster/document.h
function namespace (line 33) | namespace cluster {
FILE: src/common/ThreadPool.h
function class (line 37) | class ThreadPool {
function ThreadPool (line 57) | inline ThreadPool::ThreadPool(size_t threads)
function task (line 91) | auto task = std::make_shared< std::packaged_task<return_type()> >(
function ThreadPool (line 110) | inline ThreadPool::~ThreadPool()
FILE: src/common/datetime.cpp
type common (line 29) | namespace common {
function cur_date (line 31) | size_t cur_date() {
function cur_time (line 39) | size_t cur_time() {
function cur_datetime (line 45) | size_t cur_datetime() {
function iso8601_datetime (line 50) | const std::string iso8601_datetime() {
FILE: src/common/datetime.h
function namespace (line 30) | namespace common {
FILE: src/common/dictionary.cpp
type common (line 35) | namespace common {
FILE: src/common/dictionary.h
function namespace (line 33) | namespace file {
function namespace (line 37) | namespace common {
FILE: src/common/dictionary_row.cpp
type common (line 29) | namespace common {
FILE: src/common/dictionary_row.h
function namespace (line 35) | namespace common {
FILE: src/common/simple_thread_pool.hpp
type common (line 34) | namespace common {
class simple_thread_pool (line 36) | class simple_thread_pool {
FILE: src/common/system.cpp
type common (line 33) | namespace common {
function is_dev (line 35) | bool is_dev() {
function domain_index_filename (line 42) | std::string domain_index_filename() {
function dictionary_filename (line 49) | std::string dictionary_filename() {
function uuid (line 56) | std::string uuid() {
FILE: src/common/system.h
function namespace (line 31) | namespace common {
FILE: src/config.cpp
type config (line 34) | namespace config {
function create_data_directories (line 46) | void create_data_directories(const std::string &data_path) {
function ft_cached_bytes_per_shard (line 100) | double ft_cached_bytes_per_shard() {
function read_config (line 104) | void read_config(const string &config_file) {
FILE: src/config.h
function namespace (line 33) | namespace config {
FILE: src/debug.cpp
function print_elem (line 29) | void print_elem(std::map<size_t, size_t> &m, size_t elem) {
FILE: src/domain_stats/domain_stats.cpp
type domain_stats (line 34) | namespace domain_stats {
function download_domain_stats (line 38) | void download_domain_stats() {
function harmonic_centrality (line 45) | float harmonic_centrality(const URL &url) {
function harmonic_centrality (line 49) | float harmonic_centrality(const std::string &host) {
FILE: src/domain_stats/domain_stats.h
function namespace (line 32) | namespace domain_stats {
FILE: src/downloader/merge_downloader.cpp
type downloader (line 38) | namespace downloader {
function internal_links_complete (line 40) | bool internal_links_complete(const std::string &path) {
function hash_table_complete (line 51) | bool hash_table_complete(const std::string &path) {
function merge_internal_links (line 67) | void merge_internal_links(const std::string &path, const std::string &...
function merge_hash_table (line 111) | void merge_hash_table(const std::string &path) {
function merge_downloader (line 122) | void merge_downloader() {
FILE: src/downloader/merge_downloader.h
function namespace (line 31) | namespace downloader {
FILE: src/downloader/warc_downloader.cpp
type downloader (line 46) | namespace downloader {
function run_downloader (line 48) | void run_downloader(const std::string &warc_path) {
function download_warc_paths (line 76) | std::vector<std::string> download_warc_paths() {
function upload_warc_paths (line 96) | bool upload_warc_paths(const std::vector<std::string> &warc_paths) {
function start_downloaders (line 102) | void start_downloaders(const std::vector<std::string> &warc_paths) {
function upload_all (line 125) | void upload_all() {
function warc_downloader_with_url (line 164) | void warc_downloader_with_url(const std::string &batch, const std::str...
function warc_downloader (line 186) | void warc_downloader(const std::string &batch) {
function warc_downloader_missing (line 190) | void warc_downloader_missing(const std::string &batch) {
FILE: src/downloader/warc_downloader.h
function namespace (line 32) | namespace downloader {
FILE: src/file/archive.cpp
type file (line 38) | namespace file {
FILE: src/file/archive.h
function namespace (line 32) | namespace file {
FILE: src/file/file.cpp
type file (line 32) | namespace file {
function read_test_file (line 34) | std::string read_test_file(const std::string &file_name) {
function rename (line 49) | void rename(const std::string &old_path, const std::string &new_path) {
function copy_file (line 53) | void copy_file(const std::string &source, const std::string &dest) {
function delete_file (line 60) | void delete_file(const std::string &file) {
function create_directory (line 64) | void create_directory(const std::string &path) {
function delete_directory (line 68) | void delete_directory(const std::string &path) {
function cat (line 72) | std::string cat(const std::string &filename) {
function read_directory (line 79) | void read_directory(const std::string &dirname, std::function<void(con...
function directory_exists (line 91) | bool directory_exists(const std::string &filename) {
function file_exists (line 95) | bool file_exists(const std::string &filename) {
FILE: src/file/file.h
function namespace (line 34) | namespace file {
FILE: src/file/gz_tsv_file.cpp
type file (line 33) | namespace file {
FILE: src/file/gz_tsv_file.h
function namespace (line 37) | namespace file {
FILE: src/file/tsv_file.cpp
type file (line 30) | namespace file {
FILE: src/file/tsv_file.h
function namespace (line 37) | namespace file {
FILE: src/file/tsv_file_remote.cpp
type file (line 38) | namespace file {
FILE: src/file/tsv_file_remote.h
function namespace (line 31) | namespace file {
FILE: src/file/tsv_row.cpp
type file (line 29) | namespace file {
FILE: src/file/tsv_row.h
function namespace (line 32) | namespace file {
FILE: src/full_text/domain_link_record.h
function namespace (line 29) | namespace full_text {
FILE: src/full_text/link_record.h
function namespace (line 29) | namespace full_text {
FILE: src/full_text/record.h
function namespace (line 4) | namespace full_text {
FILE: src/full_text/result_set.h
function namespace (line 36) | namespace full_text {
FILE: src/full_text/search_metric.h
function namespace (line 29) | namespace full_text {
FILE: src/hash_table2/builder.cpp
type hash_table2 (line 30) | namespace hash_table2 {
FILE: src/hash_table2/builder.h
function namespace (line 33) | namespace hash_table2 {
FILE: src/hash_table2/hash_table.cpp
type hash_table2 (line 32) | namespace hash_table2 {
FILE: src/hash_table2/hash_table.h
function namespace (line 37) | namespace hash_table2 {
FILE: src/hash_table2/hash_table_shard.cpp
type hash_table2 (line 37) | namespace hash_table2 {
FILE: src/hash_table2/hash_table_shard.h
function namespace (line 37) | namespace hash_table2 {
FILE: src/hash_table2/hash_table_shard_base.h
function namespace (line 34) | namespace hash_table2 {
FILE: src/hash_table2/hash_table_shard_builder.cpp
type hash_table2 (line 37) | namespace hash_table2 {
FILE: src/hash_table2/hash_table_shard_builder.h
function namespace (line 36) | namespace hash_table2 {
FILE: src/hash_table_helper/hash_table_helper.cpp
type hash_table_helper (line 31) | namespace hash_table_helper {
function truncate (line 33) | void truncate(const std::string &hash_table_name) {
function create_shard_builders (line 43) | std::vector<hash_table2::hash_table_shard_builder *> create_shard_buil...
function delete_shard_builders (line 52) | void delete_shard_builders(std::vector<hash_table2::hash_table_shard_b...
FILE: src/hash_table_helper/hash_table_helper.h
function namespace (line 33) | namespace hash_table_helper {
FILE: src/http/request.cpp
type http (line 29) | namespace http {
FILE: src/http/request.h
function namespace (line 32) | namespace http {
FILE: src/http/response.h
function namespace (line 31) | namespace http {
FILE: src/http/server.cpp
type http (line 35) | namespace http {
FILE: src/http/server.h
function namespace (line 35) | namespace http {
FILE: src/indexer.cpp
function help (line 44) | void help() {
function main (line 52) | int main(int argc, const char **argv) {
FILE: src/indexer/basic_index.h
function namespace (line 33) | namespace indexer {
FILE: src/indexer/basic_index_builder.h
function namespace (line 47) | namespace indexer {
FILE: src/indexer/console.cpp
type indexer (line 45) | namespace indexer {
function cmd_index (line 47) | void cmd_index(index_manager &idx_manager, const std::vector<std::stri...
function cmd_search (line 77) | void cmd_search(index_manager &idx_manager, hash_table2::hash_table &h...
function cmd_word (line 146) | void cmd_word(index_manager &idx_manager, hash_table2::hash_table &ht,...
function cmd_domain_info (line 163) | void cmd_domain_info(index_manager &idx_manager, hash_table2::hash_tab...
function cmd_word (line 182) | void cmd_word(index_manager &idx_manager, hash_table2::hash_table &ht,...
function cmd_word_num (line 201) | void cmd_word_num(index_manager &idx_manager, hash_table2::hash_table ...
function cmd_harmonic (line 212) | void cmd_harmonic(const std::vector<std::string> &args) {
function input_to_args (line 218) | std::vector<std::string> input_to_args(const std::string &input) {
function console (line 233) | void console() {
function index_link_batch (line 236) | void index_link_batch(const std::string &batch) {
function index_links (line 269) | void index_links() {
function index_url_batch (line 279) | void index_url_batch(const std::string &batch) {
function index_urls (line 311) | void index_urls() {
function truncate_links (line 321) | void truncate_links() {
function domain_info_server (line 328) | void domain_info_server() {
function make_domain_index (line 443) | void make_domain_index() {
function make_domain_index_scores (line 507) | void make_domain_index_scores() {
function make_url_bloom_filter (line 527) | void make_url_bloom_filter() {
function count_words_that_hit_max (line 541) | void count_words_that_hit_max() {
function count_urls (line 555) | size_t count_urls() {
FILE: src/indexer/console.h
function namespace (line 31) | namespace indexer {
FILE: src/indexer/counted_record.h
function namespace (line 31) | namespace indexer {
FILE: src/indexer/domain_link_record.h
function namespace (line 31) | namespace indexer {
FILE: src/indexer/domain_record.h
function namespace (line 31) | namespace indexer {
FILE: src/indexer/generic_record.h
function namespace (line 31) | namespace indexer {
FILE: src/indexer/index.h
function namespace (line 37) | namespace indexer {
FILE: src/indexer/index_base.h
function namespace (line 35) | namespace indexer {
function m_hash_table_size (line 64) | m_hash_table_size(hash_table_size)
FILE: src/indexer/index_builder.h
function namespace (line 53) | namespace indexer {
FILE: src/indexer/index_manager.cpp
type indexer (line 37) | namespace indexer {
FILE: src/indexer/index_manager.h
function namespace (line 48) | namespace indexer {
FILE: src/indexer/index_reader.cpp
type indexer (line 32) | namespace indexer {
FILE: src/indexer/index_reader.h
function namespace (line 33) | namespace indexer {
FILE: src/indexer/index_utils.cpp
type indexer (line 31) | namespace indexer {
function create_db_directories (line 33) | void create_db_directories(const std::string &db_name) {
function delete_db_directories (line 39) | void delete_db_directories(const std::string &db_name) {
FILE: src/indexer/index_utils.h
function namespace (line 31) | namespace indexer {
FILE: src/indexer/link_record.h
function namespace (line 31) | namespace indexer {
FILE: src/indexer/merger.cpp
type indexer (line 37) | namespace indexer {
type merger (line 39) | namespace merger {
function set_mem_limit (line 49) | void set_mem_limit(double mem_limit) {
function wait_for_merges (line 53) | void wait_for_merges() {
function lock (line 59) | void lock() {
function register_appender (line 65) | void register_appender(size_t id, std::function<void()> append, std:...
function register_merger (line 72) | void register_merger(size_t id, std::function<void()> merge) {
function deregister_merger (line 78) | void deregister_merger(size_t id) {
function append_all (line 89) | void append_all() {
function merge_all (line 119) | void merge_all() {
function total_sizes (line 147) | size_t total_sizes() {
function merge_thread (line 156) | void merge_thread() {
function start_merge_thread (line 167) | void start_merge_thread() {
function stop_merge_thread (line 175) | void stop_merge_thread() {
function stop_merge_thread_only_append (line 182) | void stop_merge_thread_only_append() {
function terminate_merge_thread (line 188) | void terminate_merge_thread() {
function force_append (line 193) | void force_append() {
FILE: src/indexer/merger.h
function namespace (line 34) | namespace indexer {
FILE: src/indexer/return_record.h
function namespace (line 8) | namespace indexer {
FILE: src/indexer/score_builder.cpp
type indexer (line 31) | namespace indexer {
FILE: src/indexer/score_builder.h
function namespace (line 33) | namespace indexer {
FILE: src/indexer/sharded.h
function namespace (line 37) | namespace indexer {
FILE: src/indexer/sharded_builder.h
function namespace (line 37) | namespace indexer {
FILE: src/indexer/sharded_index.h
function namespace (line 36) | namespace indexer {
FILE: src/indexer/sharded_index_builder.h
function namespace (line 36) | namespace indexer {
FILE: src/indexer/url_record.h
function namespace (line 31) | namespace indexer {
FILE: src/indexer/value_record.h
function namespace (line 31) | namespace indexer {
FILE: src/logger/logger.cpp
type logger (line 33) | namespace logger {
function verbose (line 45) | void verbose(bool verbose) {
function initialize (line 49) | void initialize() {
function de_initialize (line 55) | void de_initialize() {
function reopen (line 63) | void reopen() {
function string (line 88) | string timestamp() {
function string (line 99) | string format(const string &type, const string &file, int line, const ...
function log_message (line 109) | void log_message(const string &type, const string &file, int line, con...
function log_string (line 113) | void log_string(const string &message) {
function log (line 121) | void log(const string &type, const string &file, int line, const strin...
function write_message_to_logfile (line 125) | void write_message_to_logfile(const string &message) {
function logger_thread (line 129) | void logger_thread() {
function start_logger_thread (line 150) | void start_logger_thread() {
function join_logger_thread (line 162) | void join_logger_thread() {
function sync (line 170) | void sync() {
FILE: src/logger/logger.h
function namespace (line 39) | namespace logger {
FILE: src/memory/debugger.cpp
type memory (line 53) | namespace memory {
function incr_mem_counter (line 59) | void incr_mem_counter(size_t n) {
function decr_mem_counter (line 63) | void decr_mem_counter(size_t n) {
function allocated_memory (line 67) | size_t allocated_memory() {
function num_allocated (line 71) | size_t num_allocated() {
function reset_usage (line 79) | void reset_usage() {
function record_usage (line 84) | void record_usage() {
function get_usage (line 93) | size_t get_usage() {
function get_usage_peak (line 97) | size_t get_usage_peak() {
FILE: src/memory/debugger.h
function namespace (line 32) | namespace memory {
FILE: src/memory/memory.cpp
type memory (line 32) | namespace memory {
function get_available_memory (line 38) | size_t get_available_memory() {
function get_used_memory (line 42) | size_t get_used_memory() {
function get_total_memory (line 46) | size_t get_total_memory() {
function update (line 53) | void update() {
FILE: src/memory/memory.h
function namespace (line 31) | namespace memory {
FILE: src/parser/cc_parser.cpp
type parser (line 38) | namespace parser {
function run_downloader (line 40) | void run_downloader(const string &warc_path) {
function start_downloaders (line 58) | void start_downloaders(const vector<string> &warc_paths) {
function download_warc_paths (line 75) | vector<string> download_warc_paths() {
function upload_warc_paths (line 95) | bool upload_warc_paths(const vector<string> &warc_paths) {
function warc_downloader (line 101) | void warc_downloader() {
FILE: src/parser/cc_parser.h
function namespace (line 32) | namespace parser {
FILE: src/parser/entities.cpp
function cmp (line 272) | static int cmp(const void *key, const void *value)
function putc_utf8 (line 287) | static size_t putc_utf8(unsigned long cp, char *buffer)
function parse_entity (line 324) | static bool parse_entity(
function decode_html_entities_utf8 (line 364) | size_t decode_html_entities_utf8(char *dest, const char *src)
FILE: src/parser/html_link.cpp
type parser (line 31) | namespace parser {
FILE: src/parser/html_link.h
function namespace (line 32) | namespace parser {
FILE: src/parser/html_parser.cpp
type parser (line 36) | namespace parser {
function string (line 290) | string html_parser::title() const {
function string (line 294) | string html_parser::meta() const {
function string (line 298) | string html_parser::h1() const {
function string (line 302) | string html_parser::text() const {
function string (line 318) | string html_parser::url_tld(const string &url) {
function string (line 359) | string html_parser::get_tag_content(const string &html, const string &...
function string (line 370) | string html_parser::get_meta_tag(const string &html) {
function string (line 443) | string html_parser::get_text_content(const string &html) {
FILE: src/parser/html_parser.h
function namespace (line 49) | namespace parser {
FILE: src/parser/parser.cpp
type parser (line 32) | namespace parser {
function is_percent_encoding (line 34) | bool is_percent_encoding(const char *cstr) {
function string (line 42) | string urldecode(const string &str) {
function string (line 67) | string urlencode(const string &str) {
function string (line 83) | string get_http_header(const string &record, const string &key) {
FILE: src/parser/parser.h
function namespace (line 31) | namespace parser {
FILE: src/parser/unicode.cpp
type parser (line 31) | namespace parser {
FILE: src/parser/unicode.h
function namespace (line 37) | namespace parser {
FILE: src/profiler/profiler.cpp
type profiler (line 34) | namespace profiler {
function print_memory_status (line 89) | void print_memory_status() {
function tick (line 99) | void tick(const string &name, const string &section) {
function now_micro (line 106) | double now_micro() {
function timestamp (line 113) | size_t timestamp() {
function print_report (line 118) | void print_report() {
FILE: src/profiler/profiler.h
function namespace (line 34) | namespace profiler {
FILE: src/scraper.cpp
function custom_scraper (line 38) | void custom_scraper() {
function main (line 254) | int main(int argc, const char **argv) {
FILE: src/scraper/scraper.cpp
type scraper (line 36) | namespace scraper {
function string (line 38) | string user_agent_token() {
function string (line 42) | string user_agent() {
function string (line 344) | string scraper::simple_get(const URL &url) {
function URL (line 393) | URL scraper::filter_url(const URL &url) {
function curl_string_reader (line 408) | size_t curl_string_reader(char *ptr, size_t size, size_t nmemb, void *...
function read_max_scrapers (line 416) | size_t read_max_scrapers() {
function reset_scraper_urls (line 424) | bool reset_scraper_urls() {
function download_scraper_urls (line 430) | vector<string> download_scraper_urls() {
function run_scraper_on_urls (line 450) | void run_scraper_on_urls(const vector<string> &input_urls) {
function url_downloader (line 518) | void url_downloader() {
FILE: src/scraper/scraper.h
function namespace (line 38) | namespace scraper {
FILE: src/scraper/scraper_store.cpp
type scraper (line 36) | namespace scraper {
FILE: src/scraper/scraper_store.h
function namespace (line 34) | namespace scraper {
FILE: src/search_engine/search_allocation.h
function namespace (line 37) | namespace search_engine {
FILE: src/search_engine/search_engine.cpp
type search_engine (line 32) | namespace search_engine {
function reset_search_metric (line 34) | void reset_search_metric(struct full_text::search_metric &metric) {
function search_deduplicate (line 43) | std::vector<full_text::record> search_deduplicate(storage<full_text::r...
FILE: src/search_engine/search_engine.h
function namespace (line 48) | namespace search_engine {
FILE: src/server.cpp
function main (line 44) | int main(int argc, const char **argv) {
FILE: src/server/search_server.cpp
type server (line 41) | namespace server {
function search_server (line 43) | void search_server() {
FILE: src/server/search_server.h
function namespace (line 29) | namespace server {
FILE: src/server/url_server.cpp
type server (line 34) | namespace server {
function url_server (line 35) | void url_server() {
FILE: src/server/url_server.h
function namespace (line 29) | namespace server {
FILE: src/stats/stats.h
function namespace (line 34) | namespace stats {
FILE: src/text/stopwords.h
function class (line 32) | class stopwords {
FILE: src/text/text.cpp
type text (line 29) | namespace text {
function is_clean_char (line 31) | bool is_clean_char(const char *ch, size_t multibyte_len) {
function is_clean_word (line 40) | bool is_clean_word(const std::string &s) {
function clean_word (line 58) | std::string clean_word(const std::string &s) {
function get_words (line 80) | std::vector<std::string> get_words(const std::string &str, size_t limi...
function get_words (line 101) | std::vector<std::string> get_words(const std::string &str) {
function get_full_text_words (line 109) | std::vector<std::string> get_full_text_words(const std::string &str, s...
function get_full_text_words (line 132) | std::vector<std::string> get_full_text_words(const std::string &str) {
function get_full_text_tokens (line 137) | std::vector<uint64_t> get_full_text_tokens(const std::string &str, siz...
function get_full_text_tokens (line 150) | std::vector<uint64_t> get_full_text_tokens(const std::string &str) {
function get_unique_full_text_tokens (line 156) | std::vector<uint64_t> get_unique_full_text_tokens(const std::string &s...
function get_unique_full_text_tokens (line 168) | std::vector<uint64_t> get_unique_full_text_tokens(const std::string &s...
function get_tokens (line 178) | std::vector<uint64_t> get_tokens(const std::string &str, std::function...
function get_tokens (line 207) | std::vector<uint64_t> get_tokens(const std::string &str) {
function get_snippets (line 211) | std::vector<std::string> get_snippets(const std::string &str) {
function get_expanded_full_text_words (line 253) | std::vector<std::string> get_expanded_full_text_words(const std::strin...
function get_expanded_full_text_words (line 287) | std::vector<std::string> get_expanded_full_text_words(const std::strin...
function get_expanded_full_text_tokens (line 295) | std::vector<uint64_t> get_expanded_full_text_tokens(const std::string ...
function get_expanded_full_text_tokens (line 307) | std::vector<uint64_t> get_expanded_full_text_tokens(const std::string ...
function get_unique_expanded_full_text_tokens (line 313) | std::vector<uint64_t> get_unique_expanded_full_text_tokens(const std::...
function get_unique_expanded_full_text_tokens (line 325) | std::vector<uint64_t> get_unique_expanded_full_text_tokens(const std::...
function get_words_without_stopwords (line 334) | std::vector<std::string> get_words_without_stopwords(const std::string...
function get_words_without_stopwords (line 355) | std::vector<std::string> get_words_without_stopwords(const std::string...
function words_to_ngram_hash (line 360) | void words_to_ngram_hash(const std::vector<std::string> &words, size_t...
function words_to_ngram_hash (line 375) | void words_to_ngram_hash(const std::vector<std::string> &words, size_t...
function words_to_ngram_hash (line 390) | void words_to_ngram_hash(const std::vector<std::string> &words, size_t...
function get_word_counts (line 405) | std::map<std::string, size_t> get_word_counts(const std::string &text) {
function get_word_frequency (line 415) | std::map<std::string, float> get_word_frequency(const std::string &tex...
FILE: src/text/text.h
function namespace (line 41) | namespace text {
FILE: src/tools/calculate_harmonic.cpp
type tools (line 22) | namespace tools {
function run_uniq_host (line 24) | std::unordered_map<uint64_t, std::string> run_uniq_host(const std::vec...
type pair_hash (line 48) | struct pair_hash {
function run_uniq_link (line 54) | std::unordered_set<std::pair<uint32_t, uint32_t>, pair_hash> run_uniq_...
function calculate_harmonic_hosts (line 84) | void calculate_harmonic_hosts() {
function read_hosts_file (line 123) | std::unordered_map<uint64_t, uint32_t> read_hosts_file() {
function read_hosts_file_vec (line 143) | std::vector<uint32_t> read_hosts_file_vec() {
function read_hosts_file_with_names (line 162) | std::map<uint32_t, std::string> read_hosts_file_with_names() {
function read_edge_file (line 181) | std::unique_ptr<std::vector<uint32_t>[]> read_edge_file(size_t vlen) {
function calculate_harmonic_links (line 201) | void calculate_harmonic_links() {
function calculate_harmonic (line 243) | void calculate_harmonic() {
FILE: src/tools/calculate_harmonic.h
function namespace (line 4) | namespace tools {
FILE: src/tools/counter.cpp
type tools (line 44) | namespace tools {
function count_urls_per_domain (line 46) | std::map<std::string, size_t> count_urls_per_domain(const std::vector<...
function run_counter_per_domain (line 108) | void run_counter_per_domain(const std::string &batch) {
function run_counter (line 214) | void run_counter() {
function download_link_batch (line 309) | std::vector<std::string> download_link_batch(const std::string &batch,...
FILE: src/tools/counter.h
function namespace (line 31) | namespace tools {
FILE: src/tools/find_links.cpp
type tools (line 19) | namespace tools {
function find_links_for_hosts_chunk (line 21) | void find_links_for_hosts_chunk(const std::set<size_t> &host_hashes, c...
function find_links_for_hosts (line 70) | void find_links_for_hosts(const std::set<size_t> &host_hashes) {
function find_links (line 96) | void find_links() {
FILE: src/tools/find_links.h
function namespace (line 31) | namespace tools {
FILE: src/tools/generate_url_lists.cpp
type tools (line 40) | namespace tools {
function read_urls_with_many_links (line 42) | vector<string> read_urls_with_many_links(const std::string &file_path) {
function read_urls (line 65) | vector<string> read_urls(const std::string &path) {
function generate_url_lists (line 80) | void generate_url_lists(const std::string &batch_path) {
FILE: src/tools/generate_url_lists.h
function namespace (line 29) | namespace tools {
FILE: src/tools/splitter.cpp
type tools (line 21) | namespace tools {
function target_url_batches (line 23) | std::vector<std::string> target_url_batches() {
function target_link_batches (line 32) | std::vector<std::string> target_link_batches() {
function generate_list_with_files (line 41) | std::vector<std::string> generate_list_with_files(const std::vector<st...
function generate_list_with_url_files (line 85) | std::vector<std::string> generate_list_with_url_files() {
function generate_list_with_link_files (line 92) | std::vector<std::string> generate_list_with_link_files() {
function generate_list_with_direct_link_files (line 99) | std::vector<std::string> generate_list_with_direct_link_files() {
function generate_list_with_target_url_files (line 106) | std::vector<std::string> generate_list_with_target_url_files() {
function generate_list_with_target_link_files (line 113) | std::vector<std::string> generate_list_with_target_link_files() {
function write_cache (line 121) | std::string write_cache(size_t file_index, std::vector<std::string> &l...
function write_link_cache (line 140) | std::string write_link_cache(size_t file_index, std::vector<std::strin...
function splitter (line 158) | void splitter(const std::vector<std::string> &warc_paths, std::mutex &...
function link_splitter (line 201) | void link_splitter(const std::vector<std::string> &warc_paths, std::mu...
function link_splitter_with_hosts (line 249) | void link_splitter_with_hosts(const std::unordered_set<size_t> &hosts,...
function splitter_with_urls (line 300) | void splitter_with_urls(const std::unordered_set<size_t> &urls, const ...
function splitter_with_roaring (line 350) | void splitter_with_roaring(const ::roaring::Roaring64Map &urls, const ...
function splitter_with_bloom (line 400) | void splitter_with_bloom(const ::algorithm::bloom_filter &bloom, const...
function build_link_set (line 449) | std::unordered_set<size_t> build_link_set(const std::vector<std::strin...
function build_url_host_set (line 474) | std::unordered_set<size_t> build_url_host_set(const std::vector<std::s...
function build_url_set (line 493) | std::unordered_set<size_t> build_url_set(const std::vector<std::string...
function create_warc_directories (line 512) | void create_warc_directories() {
function run_splitter (line 524) | void run_splitter() {
function run_url_splitter_on_urls_in_set (line 565) | void run_url_splitter_on_urls_in_set(const std::unordered_set<size_t> ...
function run_url_splitter_on_urls_in_roaring (line 590) | void run_url_splitter_on_urls_in_roaring(const ::roaring::Roaring64Map...
function run_url_splitter_on_urls_in_bloom_filter (line 615) | void run_url_splitter_on_urls_in_bloom_filter(const ::algorithm::bloom...
function run_link_splitter_on_links_with_target_host_in_set (line 640) | void run_link_splitter_on_links_with_target_host_in_set(const std::uno...
function generate_set_of_urls (line 665) | std::unordered_set<size_t> generate_set_of_urls() {
function run_split_links_with_relevant_domains (line 690) | void run_split_links_with_relevant_domains() {
function split_make_bloom (line 717) | void split_make_bloom(::algorithm::bloom_filter &bloom, const std::vec...
function run_split_build_url_bloom (line 745) | void run_split_build_url_bloom() {
function split_make_direct_links (line 769) | void split_make_direct_links(const ::algorithm::bloom_filter &bloom, c...
function run_split_direct_links (line 809) | void run_split_direct_links() {
function split_make_link_bloom (line 832) | void split_make_link_bloom(::algorithm::bloom_filter &bloom, const std...
function run_split_build_direct_link_bloom (line 861) | void run_split_build_direct_link_bloom() {
function run_split_urls_with_direct_links (line 885) | void run_split_urls_with_direct_links() {
FILE: src/tools/splitter.h
function namespace (line 32) | namespace tools {
FILE: src/transfer/transfer.cpp
type transfer (line 40) | namespace transfer {
function curl_stringstream_writer (line 42) | size_t curl_stringstream_writer(void *ptr, size_t size, size_t nmemb, ...
function curl_ostream_writer (line 48) | size_t curl_ostream_writer(void *ptr, size_t size, size_t nmemb, ostre...
function curl_string_writer (line 54) | size_t curl_string_writer(void *ptr, size_t size, size_t nmemb, string...
type curl_string_read_struct (line 60) | struct curl_string_read_struct {
function curl_string_reader (line 66) | size_t curl_string_reader(char *ptr, size_t size, size_t nmemb, void *...
function curl_file_reader (line 84) | size_t curl_file_reader(char *ptr, size_t size, size_t nmemb, void *us...
function set_internal_auth (line 98) | void set_internal_auth(CURL *curl) {
function string (line 103) | string make_url(const string &url) {
function string (line 115) | string file_to_string(const string &file_path, int &error) {
function string (line 147) | string gz_file_to_string(const string &file_path, int &error) {
function file_to_stream (line 193) | void file_to_stream(const string &file_path, ostream &output_stream, i...
function gz_file_to_stream (line 221) | void gz_file_to_stream(const string &file_path, ostream &output_stream...
function url_to_string (line 259) | void url_to_string(const string &url, string &buffer, int &error) {
function string (line 290) | string run_gz_download_thread(const string &file_path) {
function download_gz_files_to_disk (line 302) | vector<string> download_gz_files_to_disk(const vector<string> &files_t...
function delete_downloaded_files (line 326) | void delete_downloaded_files(const vector<string> &files) {
function head_content_length (line 333) | size_t head_content_length(const string &url, int &error) {
function upload_file (line 386) | int upload_file(const string &path, const string &data) {
function upload_gz_file (line 420) | int upload_gz_file(const string &path, const string &data) {
function upload_file_from_disk (line 461) | int upload_file_from_disk(const string &dest_path, const string &filen...
function get (line 495) | http::response get(const string &url) {
function get (line 499) | http::response get(const string &url, const vector<string> &headers) {
function post (line 538) | http::response post(const string &url, const string &data) {
function post (line 542) | http::response post(const string &url, const string &data, const vecto...
function put (line 591) | http::response put(const string &url, const string &data) {
FILE: src/transfer/transfer.h
function namespace (line 38) | namespace transfer {
FILE: src/url_link/link.cpp
type url_link (line 32) | namespace url_link {
FILE: src/url_link/link.h
function namespace (line 32) | namespace url_link {
FILE: src/utils/id_allocator.h
function namespace (line 32) | namespace utils {
FILE: src/utils/thread_pool.cpp
type utils (line 36) | namespace utils {
FILE: src/utils/thread_pool.hpp
type utils (line 34) | namespace utils {
class thread_pool (line 36) | class thread_pool {
FILE: src/utils/thread_pool_arg.h
function namespace (line 34) | namespace utils {
FILE: src/warc/warc.cpp
type warc (line 10) | namespace warc {
function string (line 267) | string parser::get_warc_header(const string &record) {
function multipart_download (line 285) | void multipart_download(const string &url, const std::function<void(co...
function string (line 320) | string get_result_path(const string &warc_path) {
function string (line 326) | string get_link_result_path(const string &warc_path) {
function string (line 332) | string get_internal_link_result_path(const string &warc_path) {
FILE: src/warc/warc.h
function namespace (line 12) | namespace warc {
FILE: tests/main.cpp
function run_before (line 54) | void run_before() {
function run_after (line 59) | void run_after() {
FILE: tests/test_algorithm.cpp
function BOOST_AUTO_TEST_CASE (line 36) | BOOST_AUTO_TEST_CASE(intersection_test) {
function BOOST_AUTO_TEST_CASE (line 111) | BOOST_AUTO_TEST_CASE(incremental_partitions) {
function BOOST_AUTO_TEST_CASE (line 192) | BOOST_AUTO_TEST_CASE(harmonic_centrality) {
function BOOST_AUTO_TEST_CASE (line 236) | BOOST_AUTO_TEST_CASE(harmonic_centrality_threaded) {
FILE: tests/test_bloom_filter.cpp
function BOOST_AUTO_TEST_CASE (line 36) | BOOST_AUTO_TEST_CASE(test_bloom_filter) {
function BOOST_AUTO_TEST_CASE (line 47) | BOOST_AUTO_TEST_CASE(test_bloom_filter_merge) {
function BOOST_AUTO_TEST_CASE (line 70) | BOOST_AUTO_TEST_CASE(test_bloom_filter_save) {
FILE: tests/test_cc_parser.cpp
function BOOST_AUTO_TEST_CASE (line 37) | BOOST_AUTO_TEST_CASE(download_warc_paths) {
function BOOST_AUTO_TEST_CASE (line 56) | BOOST_AUTO_TEST_CASE(download_warc) {
function BOOST_AUTO_TEST_CASE (line 69) | BOOST_AUTO_TEST_CASE(parse_cc_batch) {
function BOOST_AUTO_TEST_CASE (line 133) | BOOST_AUTO_TEST_CASE(parse_cc_batch_multistream) {
function BOOST_AUTO_TEST_CASE (line 167) | BOOST_AUTO_TEST_CASE(parse_cc_batch_301) {
FILE: tests/test_configuration.cpp
function BOOST_AUTO_TEST_CASE (line 34) | BOOST_AUTO_TEST_CASE(read_config) {
FILE: tests/test_counted_index_builder.cpp
function BOOST_AUTO_TEST_CASE (line 38) | BOOST_AUTO_TEST_CASE(test_case_1) {
function BOOST_AUTO_TEST_CASE (line 62) | BOOST_AUTO_TEST_CASE(test_case_2) {
function BOOST_AUTO_TEST_CASE (line 87) | BOOST_AUTO_TEST_CASE(test_case_3) {
function BOOST_AUTO_TEST_CASE (line 115) | BOOST_AUTO_TEST_CASE(test_case_4) {
FILE: tests/test_datetime.h
function BOOST_AUTO_TEST_CASE (line 29) | BOOST_AUTO_TEST_SUITE(test_datetime)
FILE: tests/test_file.cpp
function BOOST_AUTO_TEST_CASE (line 41) | BOOST_AUTO_TEST_CASE(transfer_test) {
function BOOST_AUTO_TEST_CASE (line 84) | BOOST_AUTO_TEST_CASE(handle_errors) {
function BOOST_AUTO_TEST_CASE (line 114) | BOOST_AUTO_TEST_CASE(tsv_file_exists) {
function BOOST_AUTO_TEST_CASE (line 124) | BOOST_AUTO_TEST_CASE(tsv_file_dont_exists) {
function BOOST_AUTO_TEST_CASE (line 129) | BOOST_AUTO_TEST_CASE(local_tsv_files) {
function BOOST_AUTO_TEST_CASE (line 158) | BOOST_AUTO_TEST_CASE(head_content_len) {
function BOOST_AUTO_TEST_CASE (line 176) | BOOST_AUTO_TEST_CASE(test_upload) {
function BOOST_AUTO_TEST_CASE (line 189) | BOOST_AUTO_TEST_CASE(test_upload_gz) {
function BOOST_AUTO_TEST_CASE (line 213) | BOOST_AUTO_TEST_CASE(test_tsv_file) {
function BOOST_AUTO_TEST_CASE (line 283) | BOOST_AUTO_TEST_CASE(test_archive) {
function BOOST_AUTO_TEST_CASE (line 321) | BOOST_AUTO_TEST_CASE(test_archive2) {
function BOOST_AUTO_TEST_CASE (line 363) | BOOST_AUTO_TEST_CASE(test_rename_file) {
FILE: tests/test_hash.cpp
function BOOST_AUTO_TEST_CASE (line 32) | BOOST_AUTO_TEST_CASE(str) {
FILE: tests/test_hash_table.cpp
function BOOST_AUTO_TEST_CASE (line 38) | BOOST_AUTO_TEST_CASE(test_file_paths) {
function BOOST_AUTO_TEST_CASE (line 54) | BOOST_AUTO_TEST_CASE(single_shard_add) {
function BOOST_AUTO_TEST_CASE (line 78) | BOOST_AUTO_TEST_CASE(single_shard_add_versioned) {
function BOOST_AUTO_TEST_CASE (line 109) | BOOST_AUTO_TEST_CASE(single_shard_add_versioned2) {
function BOOST_AUTO_TEST_CASE (line 142) | BOOST_AUTO_TEST_CASE(add_to_hash_table) {
function BOOST_AUTO_TEST_CASE (line 190) | BOOST_AUTO_TEST_CASE(add_to_hash_table_reverse) {
function BOOST_AUTO_TEST_CASE (line 233) | BOOST_AUTO_TEST_CASE(optimize) {
function BOOST_AUTO_TEST_CASE (line 279) | BOOST_AUTO_TEST_CASE(optimize_empty) {
function BOOST_AUTO_TEST_CASE (line 288) | BOOST_AUTO_TEST_CASE(conditional) {
function BOOST_AUTO_TEST_CASE (line 316) | BOOST_AUTO_TEST_CASE(conditional2) {
function BOOST_AUTO_TEST_CASE (line 348) | BOOST_AUTO_TEST_CASE(more_tests) {
function BOOST_AUTO_TEST_CASE (line 385) | BOOST_AUTO_TEST_CASE(for_each) {
function BOOST_AUTO_TEST_CASE (line 443) | BOOST_AUTO_TEST_CASE(larger_test) {
function BOOST_AUTO_TEST_CASE (line 512) | BOOST_AUTO_TEST_CASE(merge_with) {
function BOOST_AUTO_TEST_CASE (line 571) | BOOST_AUTO_TEST_CASE(merge_with_files) {
function BOOST_AUTO_TEST_CASE (line 628) | BOOST_AUTO_TEST_CASE(remove_record) {
function BOOST_AUTO_TEST_CASE (line 668) | BOOST_AUTO_TEST_CASE(remove_record2) {
function BOOST_AUTO_TEST_CASE (line 735) | BOOST_AUTO_TEST_CASE(for_each_key) {
FILE: tests/test_html_parser.cpp
function BOOST_AUTO_TEST_CASE (line 36) | BOOST_AUTO_TEST_CASE(html_parse1) {
function BOOST_AUTO_TEST_CASE (line 62) | BOOST_AUTO_TEST_CASE(html_parse2) {
function BOOST_AUTO_TEST_CASE (line 72) | BOOST_AUTO_TEST_CASE(html_parse3) {
function BOOST_AUTO_TEST_CASE (line 80) | BOOST_AUTO_TEST_CASE(html_parse4) {
function BOOST_AUTO_TEST_CASE (line 88) | BOOST_AUTO_TEST_CASE(html_parse5) {
function BOOST_AUTO_TEST_CASE (line 99) | BOOST_AUTO_TEST_CASE(html_parse6) {
function BOOST_AUTO_TEST_CASE (line 109) | BOOST_AUTO_TEST_CASE(html_parse7) {
function BOOST_AUTO_TEST_CASE (line 119) | BOOST_AUTO_TEST_CASE(html_parse_links) {
function BOOST_AUTO_TEST_CASE (line 267) | BOOST_AUTO_TEST_CASE(html_parser_encodings) {
function BOOST_AUTO_TEST_CASE (line 280) | BOOST_AUTO_TEST_CASE(html_parser_long_text) {
FILE: tests/test_hyper_ball.cpp
function BOOST_AUTO_TEST_CASE (line 37) | BOOST_AUTO_TEST_CASE(harmonic_centrality_hyper_ball) {
function BOOST_AUTO_TEST_CASE (line 63) | BOOST_AUTO_TEST_CASE(harmonic_centrality_hyper_ball2) {
function BOOST_AUTO_TEST_CASE (line 92) | BOOST_AUTO_TEST_CASE(harmonic_centrality_hyper_ball3) {
FILE: tests/test_hyper_log_log.cpp
function BOOST_AUTO_TEST_CASE (line 36) | BOOST_AUTO_TEST_CASE(hyper_simple) {
function BOOST_AUTO_TEST_CASE (line 47) | BOOST_AUTO_TEST_CASE(hyper_inserts) {
function BOOST_AUTO_TEST_CASE (line 83) | BOOST_AUTO_TEST_CASE(hyper_union) {
function BOOST_AUTO_TEST_CASE (line 98) | BOOST_AUTO_TEST_CASE(hyper_log_log_data_copy) {
function BOOST_AUTO_TEST_CASE (line 122) | BOOST_AUTO_TEST_CASE(hyper_log_log_test2) {
function BOOST_AUTO_TEST_CASE (line 134) | BOOST_AUTO_TEST_CASE(hyper_log_log_move) {
FILE: tests/test_index_builder.cpp
function BOOST_AUTO_TEST_CASE (line 36) | BOOST_AUTO_TEST_CASE(test_merge_with) {
function BOOST_AUTO_TEST_CASE (line 104) | BOOST_AUTO_TEST_CASE(test_merge_with2) {
FILE: tests/test_index_iteration.cpp
function BOOST_AUTO_TEST_CASE (line 39) | BOOST_AUTO_TEST_CASE(test_index_iteration) {
function BOOST_AUTO_TEST_CASE (line 86) | BOOST_AUTO_TEST_CASE(test_index_iteration2) {
FILE: tests/test_index_reader.cpp
function BOOST_AUTO_TEST_CASE (line 40) | BOOST_AUTO_TEST_CASE(test_index_reader1) {
function BOOST_AUTO_TEST_CASE (line 82) | BOOST_AUTO_TEST_CASE(test_index_reader_2) {
FILE: tests/test_logger.cpp
function BOOST_AUTO_TEST_CASE (line 35) | BOOST_AUTO_TEST_CASE(test_logger1) {
FILE: tests/test_memory.cpp
function BOOST_AUTO_TEST_CASE (line 36) | BOOST_AUTO_TEST_CASE(test_memory) {
function BOOST_AUTO_TEST_CASE (line 65) | BOOST_AUTO_TEST_CASE(test_indexer_memory) {
FILE: tests/test_n_gram.cpp
function BOOST_AUTO_TEST_CASE (line 35) | BOOST_AUTO_TEST_CASE(words_to_ngram) {
function BOOST_AUTO_TEST_CASE (line 65) | BOOST_AUTO_TEST_CASE(n_gram2) {
FILE: tests/test_robot_parser.cpp
function BOOST_AUTO_TEST_CASE (line 34) | BOOST_AUTO_TEST_CASE(parse) {
function BOOST_AUTO_TEST_CASE (line 49) | BOOST_AUTO_TEST_CASE(parse2) {
FILE: tests/test_scraper.cpp
function BOOST_AUTO_TEST_CASE (line 37) | BOOST_AUTO_TEST_CASE(test_scraper) {
function BOOST_AUTO_TEST_CASE (line 60) | BOOST_AUTO_TEST_CASE(scraper_multithreaded) {
FILE: tests/test_sharded_index_builder.cpp
function BOOST_AUTO_TEST_CASE (line 38) | BOOST_AUTO_TEST_CASE(test_sharded_index_builder) {
function BOOST_AUTO_TEST_CASE (line 62) | BOOST_AUTO_TEST_CASE(test_group_by) {
function BOOST_AUTO_TEST_CASE (line 125) | BOOST_AUTO_TEST_CASE(test_score_mod) {
FILE: tests/test_sort.cpp
type test_data_struct1 (line 35) | struct test_data_struct1 {
function BOOST_AUTO_TEST_CASE (line 40) | BOOST_AUTO_TEST_CASE(merge_arrays) {
function BOOST_AUTO_TEST_CASE (line 77) | BOOST_AUTO_TEST_CASE(merge_arrays_of_struct) {
function BOOST_AUTO_TEST_CASE (line 111) | BOOST_AUTO_TEST_CASE(merge_many_arrays) {
function BOOST_AUTO_TEST_CASE (line 140) | BOOST_AUTO_TEST_CASE(merge_many_arrays_of_struct) {
FILE: tests/test_sum_sorted.cpp
function BOOST_AUTO_TEST_CASE (line 36) | BOOST_AUTO_TEST_CASE(test_sum_sorted1) {
function BOOST_AUTO_TEST_CASE (line 53) | BOOST_AUTO_TEST_CASE(test_sum_sorted2) {
function BOOST_AUTO_TEST_CASE (line 70) | BOOST_AUTO_TEST_CASE(test_sum_sorted3) {
function BOOST_AUTO_TEST_CASE (line 88) | BOOST_AUTO_TEST_CASE(test_sum_sorted4) {
function BOOST_AUTO_TEST_CASE (line 118) | BOOST_AUTO_TEST_CASE(test_sum_sorted5) {
FILE: tests/test_text.cpp
function BOOST_AUTO_TEST_CASE (line 34) | BOOST_AUTO_TEST_CASE(get_full_text_words) {
function BOOST_AUTO_TEST_CASE (line 54) | BOOST_AUTO_TEST_CASE(get_tokens) {
function BOOST_AUTO_TEST_CASE (line 68) | BOOST_AUTO_TEST_CASE(get_tokens2) {
function BOOST_AUTO_TEST_CASE (line 82) | BOOST_AUTO_TEST_CASE(get_tokens3) {
function BOOST_AUTO_TEST_CASE (line 98) | BOOST_AUTO_TEST_CASE(get_snippets) {
function BOOST_AUTO_TEST_CASE (line 112) | BOOST_AUTO_TEST_CASE(get_words_without_stopwords) {
function BOOST_AUTO_TEST_CASE (line 127) | BOOST_AUTO_TEST_CASE(clean_word) {
FILE: tests/test_thread_pool.cpp
function BOOST_AUTO_TEST_CASE (line 35) | BOOST_AUTO_TEST_CASE(thread_pool) {
function BOOST_AUTO_TEST_CASE (line 53) | BOOST_AUTO_TEST_CASE(thread_pool2) {
function BOOST_AUTO_TEST_CASE (line 85) | BOOST_AUTO_TEST_CASE(thread_pool3) {
FILE: tests/test_top_k.cpp
function BOOST_AUTO_TEST_CASE (line 32) | BOOST_AUTO_TEST_CASE(test_1) {
function BOOST_AUTO_TEST_CASE (line 38) | BOOST_AUTO_TEST_CASE(test_2) {
function BOOST_AUTO_TEST_CASE (line 44) | BOOST_AUTO_TEST_CASE(test_3) {
function BOOST_AUTO_TEST_CASE (line 50) | BOOST_AUTO_TEST_CASE(test_4) {
function BOOST_AUTO_TEST_CASE (line 56) | BOOST_AUTO_TEST_CASE(test_5) {
function BOOST_AUTO_TEST_CASE (line 66) | BOOST_AUTO_TEST_CASE(test_6) {
function BOOST_AUTO_TEST_CASE (line 76) | BOOST_AUTO_TEST_CASE(test_7) {
function BOOST_AUTO_TEST_CASE (line 83) | BOOST_AUTO_TEST_CASE(test_8) {
FILE: tests/test_unicode.cpp
function BOOST_AUTO_TEST_CASE (line 32) | BOOST_AUTO_TEST_CASE(unicode) {
FILE: tests/test_url.cpp
function BOOST_AUTO_TEST_CASE (line 34) | BOOST_AUTO_TEST_CASE(basic) {
function BOOST_AUTO_TEST_CASE (line 51) | BOOST_AUTO_TEST_CASE(url_parsing) {
function BOOST_AUTO_TEST_CASE (line 86) | BOOST_AUTO_TEST_CASE(url_parsing2) {
function BOOST_AUTO_TEST_CASE (line 99) | BOOST_AUTO_TEST_CASE(hash) {
function BOOST_AUTO_TEST_CASE (line 115) | BOOST_AUTO_TEST_CASE(unescape) {
function BOOST_AUTO_TEST_CASE (line 161) | BOOST_AUTO_TEST_CASE(host_top_domain) {
FILE: tests/test_url_record.cpp
function BOOST_AUTO_TEST_CASE (line 34) | BOOST_AUTO_TEST_CASE(basic) {
Condensed preview — 234 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (1,003K chars).
[
{
"path": ".gdbinit",
"chars": 22,
"preview": "\nset history save on\n\n"
},
{
"path": ".gitignore",
"chars": 373,
"preview": "\ndeps/*\ntmp/*\nsrc/*.o\ntests/*.o\nbuild/*\ndocumentation/*.aux\ndocumentation/*.log\ndocumentation/statues_swe.pdf\n.DS_Store\n"
},
{
"path": "CMakeLists.txt",
"chars": 5574,
"preview": "\nset(CMAKE_BUILD_TYPE Release)\n#set(CMAKE_BUILD_TYPE Debug)\n\ncmake_minimum_required(VERSION 3.5)\nset(CMAKE_C_COMPILER /u"
},
{
"path": "Dockerfile",
"chars": 341,
"preview": "# syntax=docker/dockerfile:1\nFROM ubuntu:latest\nARG DEBIAN_FRONTEND=noninteractive\nRUN apt-get update && apt-get install"
},
{
"path": "LICENSE",
"chars": 1117,
"preview": "MIT License\n\nAlexandria.org\n\nCopyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n\nPermission is hereby grant"
},
{
"path": "README.md",
"chars": 2196,
"preview": "# Alexandria.org\n\n1. [Coding Rules](/documentation/coding_rules.md)\n2. [Full text indexes](/documentation/full_text_inde"
},
{
"path": "cmake/Findfcgi.cmake",
"chars": 999,
"preview": "# CMake module to search for FastCGI headers\n#\n# If it's found it sets FCGI_FOUND to TRUE\n# and following variables are "
},
{
"path": "config.conf",
"chars": 693,
"preview": "\n# Cluster config\nnodes_in_cluster = 3\nnode_id = 0\n\n# Indexer config\nbatches[] = ALEXANDRIA-MANUAL-01\nbatches[] = CC-MAI"
},
{
"path": "documentation/alexandria.md",
"chars": 1342,
"preview": "Usage: ./alexandria [OPTIONS]...\n\n## Options\n\n**--downloader [commoncrawl-batch] [limit] [offset]**\n\nDownloads files fro"
},
{
"path": "documentation/api_response_format.md",
"chars": 2902,
"preview": "# Api Response Format\n\nThis is a description of the endpoints available on a node.\n\n### Perform search\n```\ncurl http://n"
},
{
"path": "documentation/caching.md",
"chars": 280,
"preview": "## Caching\n\nOur nodes should try to use as much RAM as possible to store index data for common tokens in RAM. I think th"
},
{
"path": "documentation/coding_rules.md",
"chars": 2182,
"preview": "\n## Coding rules\n1. Indent with tabs.\n2. Use auto for variable declarations when possible.\n3. Never put \"using namespace"
},
{
"path": "documentation/configure_local_nginx.md",
"chars": 577,
"preview": "# COnfigure local nginx server.\n\n1. Install nginx\n```\napt-get install nginx\n```\n\n2. Add configuration to /etc/nginx/site"
},
{
"path": "documentation/full_text_indexes.md",
"chars": 1370,
"preview": "# The alexandria full text index\n\nA full text index in its simplest form is a hash map from an integer word id ```key```"
},
{
"path": "documentation/ideas.md",
"chars": 516,
"preview": "# Similar words\nTo handle similar words (saluhall, saluhallen) we should create a hashtable with similar words and as an"
},
{
"path": "documentation/index_file_format.md",
"chars": 320,
"preview": "# Index file format\n\n```8 bytes number of keys (n)\n8 * n bytes keys\n8 * n bytes positions\n8 * n bytes lengths (len(k) nu"
},
{
"path": "documentation/indexer.md",
"chars": 1228,
"preview": "### NAME\n\nindexer - manually index data or analyze things\n\n### SYNOPSIS\n\nindexer [OPTION]\n\n### DESCRIPTION\n```\n\t--split "
},
{
"path": "documentation/installing_nodes.md",
"chars": 305,
"preview": "If problem with raid information on drive unmount all partitions and do this:\n```\nwipefs -a /dev/nvme1n1\n```\nthen reset "
},
{
"path": "documentation/performance_journal.md",
"chars": 10144,
"preview": "## Performance journal\n\n### File system testing\nExt2 (noatime,nodiratime,barrier=0)\n```\n$ dd if=/dev/zero of=/tmp/test1."
},
{
"path": "documentation/search_result_ranking.md",
"chars": 756,
"preview": "\n# Search Result Ranking\n\nThis document describes how search results are indexed and ranked.\n\n## Input\nInput to our inde"
},
{
"path": "documentation/statues_swe.tex",
"chars": 5081,
"preview": "\n\\documentclass[12pt, a4paper]{article}\n\\usepackage[T1]{fontenc}\n\\usepackage[utf8]{inputenc}\n\\usepackage[swedish]{babel}"
},
{
"path": "scripts/bootstrap_node_2drives.sh",
"chars": 3846,
"preview": "#!/bin/bash\n\napt-get update\napt-get -y install vim parted zip unzip nginx\n\n_mkpart() { \n\tdisc=$1\n\tmountpoint1=$2\n\tmountp"
},
{
"path": "scripts/build-deps.sh",
"chars": 258,
"preview": "#!/bin/bash\n\ncd `dirname $0`\ncd ..\n\nbase_path=`pwd`\n\ncd $base_path\ncd deps\n\ncd zlib-1.2.12\n./configure\nmake -j4\nmake ins"
},
{
"path": "scripts/clean.sh",
"chars": 433,
"preview": "#!/bin/bash\n\ncd `dirname $0`\ncd ..\n\nread -p \"Do you want to delete your local alexandria data? [Y/n] \" -n 1 -r\necho\nif ["
},
{
"path": "scripts/download-deps.sh",
"chars": 609,
"preview": "#!/bin/bash\n\ncd `dirname $0`\ncd ..\n\nexport CC=/usr/bin/gcc-10\nexport CXX=/usr/bin/g++-10\n\nbase_path=`pwd`\ncd $base_path\n"
},
{
"path": "scripts/download-test-data.sh",
"chars": 2903,
"preview": "#!/bin/bash\n\ncd `dirname $0`\n\nif [ $# -eq 0 ]; then\n\techo \"Provide destination path as first argument\"\n\texit 1\nfi\n\nfor s"
},
{
"path": "scripts/find_missing_files_in_batch.sh",
"chars": 602,
"preview": "#!/bin/bash\n\ncd `dirname $0`\ncd ..\n\nbatch=$1\n\nfiles=`curl https://data.commoncrawl.org/crawl-data/$batch/warc.paths.gz |"
},
{
"path": "scripts/init-docker.sh",
"chars": 1951,
"preview": "#!/bin/bash\n\ncd `dirname $0`\n\n# The local docker development environment runs the data server on the local machine.\n# Th"
},
{
"path": "scripts/install-deps.sh",
"chars": 231,
"preview": "#!/bin/bash\n\napt-get install -y zip make cmake gcc-10 g++-10 gcc g++ libcurl4-openssl-dev libssl-dev libcrypto++-dev lib"
},
{
"path": "scripts/packager.sh",
"chars": 6305,
"preview": "#!/bin/bash\n# Copyright 2018-present Amazon.com, Inc. or its affiliates. All Rights Reserved.\n#\n# Licensed under the A"
},
{
"path": "scripts/prepare-output-dirs.sh",
"chars": 271,
"preview": "#!/bin/bash\n\ncd `dirname $0`\ncd ..\n\nfor shard_id in $(seq 0 7); do\n\tshard=\"/mnt/$shard_id\"\n\trm -r $shard\n\tmkdir $shard\n\t"
},
{
"path": "scripts/truncate.sh",
"chars": 296,
"preview": "#!/bin/bash\n\ncd `dirname $0`\ncd ..\n\nfor shard in $(seq 0 7); do\n\trm -r /mnt/$shard/*\n\tmkdir \"/mnt/$shard/input\";\n\tmkdir "
},
{
"path": "scripts/update.sh",
"chars": 163,
"preview": "#!/bin/bash\n\ncd `dirname $0`\n\nwget https://github.com/alexandria-org/alexandria/releases/latest/download/alexandria.zip "
},
{
"path": "src/URL.cpp",
"chars": 6877,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/URL.h",
"chars": 3013,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/alexandria.cpp",
"chars": 12560,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/algorithm/algorithm.cpp",
"chars": 7558,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/algorithm/algorithm.h",
"chars": 2671,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/algorithm/bloom_filter.cpp",
"chars": 3956,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/algorithm/bloom_filter.h",
"chars": 2384,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/algorithm/hash.cpp",
"chars": 2441,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/algorithm/hash.h",
"chars": 1335,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/algorithm/hyper_ball.h",
"chars": 3577,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/algorithm/hyper_log_log.cpp",
"chars": 3907,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/algorithm/hyper_log_log.h",
"chars": 2532,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/algorithm/intersection.cpp",
"chars": 1549,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/algorithm/intersection.h",
"chars": 4181,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/algorithm/sort.cpp",
"chars": 1236,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/algorithm/sort.h",
"chars": 4619,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/algorithm/sum_sorted.h",
"chars": 2362,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/algorithm/top_k.h",
"chars": 2664,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/api/api_response.cpp",
"chars": 2904,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/api/api_response.h",
"chars": 1676,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/api/result_with_snippet.cpp",
"chars": 2352,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/api/result_with_snippet.h",
"chars": 1929,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/cluster/cluster.h",
"chars": 1206,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/cluster/document.cpp",
"chars": 3061,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/cluster/document.h",
"chars": 2053,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/common/ThreadPool.h",
"chars": 3508,
"preview": "/*\nCopyright (c) 2012 Jakob Progsch, Václav Zeman\n\nThis software is provided 'as-is', without any express or implied\nwar"
},
{
"path": "src/common/datetime.cpp",
"chars": 1912,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/common/datetime.h",
"chars": 1349,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/common/dictionary.cpp",
"chars": 2639,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/common/dictionary.h",
"chars": 2066,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/common/dictionary_row.cpp",
"chars": 2189,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/common/dictionary_row.h",
"chars": 1753,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/common/simple_thread_pool.hpp",
"chars": 1674,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/common/system.cpp",
"chars": 1974,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/common/system.h",
"chars": 1360,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/config.cpp",
"chars": 6671,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/config.h",
"chars": 4027,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/debug.cpp",
"chars": 1309,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/debug.h",
"chars": 1300,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/domain_stats/domain_stats.cpp",
"chars": 2001,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/domain_stats/domain_stats.h",
"chars": 1400,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/downloader/merge_downloader.cpp",
"chars": 5009,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/downloader/merge_downloader.h",
"chars": 1278,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/downloader/warc_downloader.cpp",
"chars": 6254,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/downloader/warc_downloader.h",
"chars": 1495,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/file/archive.cpp",
"chars": 5680,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/file/archive.h",
"chars": 1779,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/file/file.cpp",
"chars": 3081,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/file/file.h",
"chars": 1953,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/file/gz_tsv_file.cpp",
"chars": 2313,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/file/gz_tsv_file.h",
"chars": 1621,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/file/tsv_file.cpp",
"chars": 11102,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/file/tsv_file.h",
"chars": 3574,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/file/tsv_file_remote.cpp",
"chars": 2753,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/file/tsv_file_remote.h",
"chars": 1483,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/file/tsv_row.cpp",
"chars": 1527,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/file/tsv_row.h",
"chars": 1402,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/full_text/domain_link_record.h",
"chars": 1358,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/full_text/link_record.h",
"chars": 1349,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/full_text/record.h",
"chars": 126,
"preview": "\n#pragma once\n\nnamespace full_text {\n\n\tstruct record {\n\n\t\tuint64_t m_value;\n\t\tfloat m_score;\n\t\tuint64_t m_domain_hash;\n\n"
},
{
"path": "src/full_text/result_set.h",
"chars": 5688,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/full_text/search_metric.h",
"chars": 1476,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/hash_table2/builder.cpp",
"chars": 2589,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/hash_table2/builder.h",
"chars": 1956,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/hash_table2/hash_table.cpp",
"chars": 3128,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/hash_table2/hash_table.h",
"chars": 2186,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/hash_table2/hash_table_shard.cpp",
"chars": 6835,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/hash_table2/hash_table_shard.h",
"chars": 2623,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/hash_table2/hash_table_shard_base.h",
"chars": 4361,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/hash_table2/hash_table_shard_builder.cpp",
"chars": 11401,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/hash_table2/hash_table_shard_builder.h",
"chars": 3399,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/hash_table_helper/hash_table_helper.cpp",
"chars": 2091,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/hash_table_helper/hash_table_helper.h",
"chars": 1602,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/http/request.cpp",
"chars": 1407,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/http/request.h",
"chars": 1695,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/http/response.h",
"chars": 1749,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/http/server.cpp",
"chars": 3975,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/http/server.h",
"chars": 1644,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/indexer/basic_index.h",
"chars": 9173,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/indexer/basic_index_builder.h",
"chars": 16184,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/indexer/console.cpp",
"chars": 16816,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/indexer/console.h",
"chars": 1554,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/indexer/counted_record.h",
"chars": 2649,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/indexer/domain_link_record.h",
"chars": 2690,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/indexer/domain_record.h",
"chars": 1497,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/indexer/generic_record.h",
"chars": 2953,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/indexer/index.h",
"chars": 16443,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/indexer/index_base.h",
"chars": 6952,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/indexer/index_builder.h",
"chars": 24820,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/indexer/index_manager.cpp",
"chars": 12574,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/indexer/index_manager.h",
"chars": 3691,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/indexer/index_reader.cpp",
"chars": 2615,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/indexer/index_reader.h",
"chars": 2686,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/indexer/index_utils.cpp",
"chars": 1716,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/indexer/index_utils.h",
"chars": 1364,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/indexer/link_record.h",
"chars": 2442,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/indexer/merger.cpp",
"chars": 4847,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/indexer/merger.h",
"chars": 1725,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/indexer/regular_index_builder.h",
"chars": 0,
"preview": ""
},
{
"path": "src/indexer/return_record.h",
"chars": 1643,
"preview": "\n#pragma once\n\n#include \"URL.h\"\n#include \"generic_record.h\"\n#include \"text/text.h\"\n\nnamespace indexer {\n\n\t/*\n\tThis is th"
},
{
"path": "src/indexer/score_builder.cpp",
"chars": 1994,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/indexer/score_builder.h",
"chars": 1772,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/indexer/sharded.h",
"chars": 7100,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/indexer/sharded_builder.h",
"chars": 8582,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/indexer/sharded_index.h",
"chars": 11360,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/indexer/sharded_index_builder.h",
"chars": 9252,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/indexer/url_record.h",
"chars": 1862,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/indexer/value_record.h",
"chars": 2298,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/indexer.cpp",
"chars": 5853,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/logger/logger.cpp",
"chars": 4874,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/logger/logger.h",
"chars": 2528,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/memory/debugger.cpp",
"chars": 2972,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/memory/debugger.h",
"chars": 1558,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/memory/memory.cpp",
"chars": 2537,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/memory/memory.h",
"chars": 1700,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/memory/overload.cpp",
"chars": 2195,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/parser/cc_parser.cpp",
"chars": 3890,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/parser/cc_parser.h",
"chars": 1410,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/parser/entities.cpp",
"chars": 7834,
"preview": "/*\tCopyright 2012, 2016 Christoph Gärtner\n\tDistributed under the Boost Software License, Version 1.0\n*/\n\n#include \"entit"
},
{
"path": "src/parser/entities.h",
"chars": 545,
"preview": "/*\tCopyright 2012 Christoph Gärtner\n\tDistributed under the Boost Software License, Version 1.0\n*/\n\n#ifndef DECODE_HTML_E"
},
{
"path": "src/parser/html_link.cpp",
"chars": 1842,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/parser/html_link.h",
"chars": 2241,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/parser/html_parser.cpp",
"chars": 16032,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/parser/html_parser.h",
"chars": 3832,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/parser/parser.cpp",
"chars": 2879,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/parser/parser.h",
"chars": 1467,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/parser/unicode.cpp",
"chars": 3103,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/parser/unicode.h",
"chars": 1790,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/profiler/profiler.cpp",
"chars": 3692,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/profiler/profiler.h",
"chars": 1866,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/scraper/scraper.cpp",
"chars": 15826,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/scraper/scraper.h",
"chars": 4954,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/scraper/scraper_store.cpp",
"chars": 5359,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/scraper/scraper_store.h",
"chars": 2754,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/scraper.cpp",
"chars": 7776,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/search_engine/search_allocation.h",
"chars": 3031,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/search_engine/search_engine.cpp",
"chars": 2177,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/search_engine/search_engine.h",
"chars": 21986,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/server/search_server.cpp",
"chars": 2707,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/server/search_server.h",
"chars": 1249,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/server/url_server.cpp",
"chars": 6053,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/server/url_server.h",
"chars": 1246,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/server.cpp",
"chars": 1919,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/stats/stats.h",
"chars": 2707,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/text/stopwords.cpp",
"chars": 3635,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/text/stopwords.h",
"chars": 1418,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/text/text.cpp",
"chars": 12315,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/text/text.h",
"chars": 7177,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/tools/calculate_harmonic.cpp",
"chars": 7884,
"preview": "\n#include \"calculate_harmonic.h\"\n#include \"splitter.h\"\n\n#include \"config.h\"\n#include \"url_link/link.h\"\n#include \"URL.h\"\n"
},
{
"path": "src/tools/calculate_harmonic.h",
"chars": 133,
"preview": "\n#pragma once\n\nnamespace tools {\n\n\tvoid calculate_harmonic_hosts();\n\tvoid calculate_harmonic_links();\n\tvoid calculate_ha"
},
{
"path": "src/tools/counter.cpp",
"chars": 9924,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/tools/counter.h",
"chars": 1351,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/tools/find_links.cpp",
"chars": 3891,
"preview": "\n#include \"find_links.h\"\n#include \"file/gz_tsv_file.h\"\n#include \"URL.h\"\n#include \"algorithm/algorithm.h\"\n#include <boost"
},
{
"path": "src/tools/find_links.h",
"chars": 1269,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/tools/generate_url_lists.cpp",
"chars": 3025,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/tools/generate_url_lists.h",
"chars": 1291,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/tools/splitter.cpp",
"chars": 28185,
"preview": "\n#include \"splitter.h\"\n#include \"config.h\"\n#include \"roaring/roaring64map.hh\"\n#include \"algorithm/bloom_filter.h\"\n#inclu"
},
{
"path": "src/tools/splitter.h",
"chars": 1910,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/transfer/transfer.cpp",
"chars": 17516,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/transfer/transfer.h",
"chars": 3310,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/url_link/link.cpp",
"chars": 2298,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/url_link/link.h",
"chars": 2121,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/utils/id_allocator.h",
"chars": 2347,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/utils/thread_pool.cpp",
"chars": 2808,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/utils/thread_pool.hpp",
"chars": 1736,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/utils/thread_pool_arg.h",
"chars": 3026,
"preview": "/*\n * MIT License\n *\n * Alexandria.org\n *\n * Copyright (c) 2021 Josef Cullhed, <info@alexandria.org>, et al.\n *\n * Permi"
},
{
"path": "src/warc/tlds.h",
"chars": 6764,
"preview": "\n#pragma once\n\n#include <iostream>\n#include <unordered_set>\n\nnamespace warc {\n\n\tconst std::unordered_set<std::string> do"
}
]
// ... and 34 more files (download for full content)
About this extraction
This page contains the full source code of the alexandria-org/alexandria GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 234 files (891.5 KB), approximately 253.9k tokens, and a symbol index with 582 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — a free GitHub repo-to-text converter for AI. Built by Nikandr Surkov.