Repository: alexandria-org/alexandria Branch: main Commit: 129e162e8068 Files: 234 Total size: 891.5 KB Directory structure: gitextract_46gecs8w/ ├── .gdbinit ├── .gitignore ├── CMakeLists.txt ├── Dockerfile ├── LICENSE ├── README.md ├── cmake/ │ └── Findfcgi.cmake ├── config.conf ├── documentation/ │ ├── alexandria.md │ ├── api_response_format.md │ ├── caching.md │ ├── coding_rules.md │ ├── configure_local_nginx.md │ ├── full_text_indexes.md │ ├── ideas.md │ ├── index_file_format.md │ ├── indexer.md │ ├── installing_nodes.md │ ├── performance_journal.md │ ├── search_result_ranking.md │ └── statues_swe.tex ├── scripts/ │ ├── bootstrap_node_2drives.sh │ ├── build-deps.sh │ ├── clean.sh │ ├── download-deps.sh │ ├── download-test-data.sh │ ├── find_missing_files_in_batch.sh │ ├── init-docker.sh │ ├── install-deps.sh │ ├── packager.sh │ ├── prepare-output-dirs.sh │ ├── truncate.sh │ └── update.sh ├── src/ │ ├── URL.cpp │ ├── URL.h │ ├── alexandria.cpp │ ├── algorithm/ │ │ ├── algorithm.cpp │ │ ├── algorithm.h │ │ ├── bloom_filter.cpp │ │ ├── bloom_filter.h │ │ ├── hash.cpp │ │ ├── hash.h │ │ ├── hyper_ball.h │ │ ├── hyper_log_log.cpp │ │ ├── hyper_log_log.h │ │ ├── intersection.cpp │ │ ├── intersection.h │ │ ├── sort.cpp │ │ ├── sort.h │ │ ├── sum_sorted.h │ │ └── top_k.h │ ├── api/ │ │ ├── api_response.cpp │ │ ├── api_response.h │ │ ├── result_with_snippet.cpp │ │ └── result_with_snippet.h │ ├── cluster/ │ │ ├── cluster.h │ │ ├── document.cpp │ │ └── document.h │ ├── common/ │ │ ├── ThreadPool.h │ │ ├── datetime.cpp │ │ ├── datetime.h │ │ ├── dictionary.cpp │ │ ├── dictionary.h │ │ ├── dictionary_row.cpp │ │ ├── dictionary_row.h │ │ ├── simple_thread_pool.hpp │ │ ├── system.cpp │ │ └── system.h │ ├── config.cpp │ ├── config.h │ ├── debug.cpp │ ├── debug.h │ ├── domain_stats/ │ │ ├── domain_stats.cpp │ │ └── domain_stats.h │ ├── downloader/ │ │ ├── merge_downloader.cpp │ │ ├── merge_downloader.h │ │ ├── warc_downloader.cpp │ │ └── warc_downloader.h │ ├── file/ │ │ ├── 
archive.cpp │ │ ├── archive.h │ │ ├── file.cpp │ │ ├── file.h │ │ ├── gz_tsv_file.cpp │ │ ├── gz_tsv_file.h │ │ ├── tsv_file.cpp │ │ ├── tsv_file.h │ │ ├── tsv_file_remote.cpp │ │ ├── tsv_file_remote.h │ │ ├── tsv_row.cpp │ │ └── tsv_row.h │ ├── full_text/ │ │ ├── domain_link_record.h │ │ ├── link_record.h │ │ ├── record.h │ │ ├── result_set.h │ │ └── search_metric.h │ ├── hash_table2/ │ │ ├── builder.cpp │ │ ├── builder.h │ │ ├── hash_table.cpp │ │ ├── hash_table.h │ │ ├── hash_table_shard.cpp │ │ ├── hash_table_shard.h │ │ ├── hash_table_shard_base.h │ │ ├── hash_table_shard_builder.cpp │ │ └── hash_table_shard_builder.h │ ├── hash_table_helper/ │ │ ├── hash_table_helper.cpp │ │ └── hash_table_helper.h │ ├── http/ │ │ ├── request.cpp │ │ ├── request.h │ │ ├── response.h │ │ ├── server.cpp │ │ └── server.h │ ├── indexer/ │ │ ├── basic_index.h │ │ ├── basic_index_builder.h │ │ ├── console.cpp │ │ ├── console.h │ │ ├── counted_record.h │ │ ├── domain_link_record.h │ │ ├── domain_record.h │ │ ├── generic_record.h │ │ ├── index.h │ │ ├── index_base.h │ │ ├── index_builder.h │ │ ├── index_manager.cpp │ │ ├── index_manager.h │ │ ├── index_reader.cpp │ │ ├── index_reader.h │ │ ├── index_utils.cpp │ │ ├── index_utils.h │ │ ├── link_record.h │ │ ├── merger.cpp │ │ ├── merger.h │ │ ├── regular_index_builder.h │ │ ├── return_record.h │ │ ├── score_builder.cpp │ │ ├── score_builder.h │ │ ├── sharded.h │ │ ├── sharded_builder.h │ │ ├── sharded_index.h │ │ ├── sharded_index_builder.h │ │ ├── url_record.h │ │ └── value_record.h │ ├── indexer.cpp │ ├── logger/ │ │ ├── logger.cpp │ │ └── logger.h │ ├── memory/ │ │ ├── debugger.cpp │ │ ├── debugger.h │ │ ├── memory.cpp │ │ ├── memory.h │ │ └── overload.cpp │ ├── parser/ │ │ ├── cc_parser.cpp │ │ ├── cc_parser.h │ │ ├── entities.cpp │ │ ├── entities.h │ │ ├── html_link.cpp │ │ ├── html_link.h │ │ ├── html_parser.cpp │ │ ├── html_parser.h │ │ ├── parser.cpp │ │ ├── parser.h │ │ ├── unicode.cpp │ │ └── unicode.h │ ├── profiler/ │ │ 
├── profiler.cpp │ │ └── profiler.h │ ├── scraper/ │ │ ├── scraper.cpp │ │ ├── scraper.h │ │ ├── scraper_store.cpp │ │ └── scraper_store.h │ ├── scraper.cpp │ ├── search_engine/ │ │ ├── search_allocation.h │ │ ├── search_engine.cpp │ │ └── search_engine.h │ ├── server/ │ │ ├── search_server.cpp │ │ ├── search_server.h │ │ ├── url_server.cpp │ │ └── url_server.h │ ├── server.cpp │ ├── stats/ │ │ └── stats.h │ ├── text/ │ │ ├── stopwords.cpp │ │ ├── stopwords.h │ │ ├── text.cpp │ │ └── text.h │ ├── tools/ │ │ ├── calculate_harmonic.cpp │ │ ├── calculate_harmonic.h │ │ ├── counter.cpp │ │ ├── counter.h │ │ ├── find_links.cpp │ │ ├── find_links.h │ │ ├── generate_url_lists.cpp │ │ ├── generate_url_lists.h │ │ ├── splitter.cpp │ │ └── splitter.h │ ├── transfer/ │ │ ├── transfer.cpp │ │ └── transfer.h │ ├── url_link/ │ │ ├── link.cpp │ │ └── link.h │ ├── utils/ │ │ ├── id_allocator.h │ │ ├── thread_pool.cpp │ │ ├── thread_pool.hpp │ │ └── thread_pool_arg.h │ └── warc/ │ ├── tlds.h │ ├── warc.cpp │ └── warc.h └── tests/ ├── main.cpp ├── test_algorithm.cpp ├── test_bloom_filter.cpp ├── test_cc_parser.cpp ├── test_config.conf ├── test_config2.conf ├── test_configuration.cpp ├── test_counted_index_builder.cpp ├── test_datetime.h ├── test_file.cpp ├── test_hash.cpp ├── test_hash_table.cpp ├── test_html_parser.cpp ├── test_hyper_ball.cpp ├── test_hyper_log_log.cpp ├── test_index_builder.cpp ├── test_index_iteration.cpp ├── test_index_reader.cpp ├── test_logger.cpp ├── test_memory.cpp ├── test_n_gram.cpp ├── test_robot_parser.cpp ├── test_scraper.cpp ├── test_sharded_index_builder.cpp ├── test_sort.cpp ├── test_sum_sorted.cpp ├── test_text.cpp ├── test_thread_pool.cpp ├── test_top_k.cpp ├── test_unicode.cpp ├── test_url.cpp └── test_url_record.cpp ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gdbinit ================================================ set 
history save on ================================================ FILE: .gitignore ================================================ deps/* tmp/* src/*.o tests/*.o build/* documentation/*.aux documentation/*.log documentation/statues_swe.pdf .DS_Store config/config.h response.txt cc_parser.zip cc_parser cc_indexer.zip cc_indexer cc_api.zip cc_api cc_full_text.zip cc_full_text run_tests CMakeCache.txt CMakeFiles CMakeScripts Makefile cmake_install.cmake warc.paths .vscode .gdb_history *~ *.swp *.swo ================================================ FILE: CMakeLists.txt ================================================ set(CMAKE_BUILD_TYPE Release) #set(CMAKE_BUILD_TYPE Debug) cmake_minimum_required(VERSION 3.5) set(CMAKE_C_COMPILER /usr/bin/gcc-10) set(CMAKE_CXX_COMPILER /usr/bin/g++-10) set(CMAKE_CXX_STANDARD 20) set(CMAKE_CXX_FLAGS_RELEASE "-O3") set(CMAKE_CXX_FLAGS_DEBUG "-g") set(THREADS_PREFER_PTHREAD_FLAG ON) project(alexandria LANGUAGES CXX) add_definitions(-Wfatal-errors) list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake") add_subdirectory("deps/abseil-cpp") find_package(roaring REQUIRED) find_package(Threads REQUIRED) FIND_PACKAGE(CURL REQUIRED) find_package(Boost REQUIRED COMPONENTS system iostreams filesystem unit_test_framework) find_package(ZLIB) find_package(fcgi) include_directories(src/) include_directories(deps/) include_directories(tests/) set(SRC_CLASSES "src/url_link/link.cpp" "src/api/result_with_snippet.cpp" "src/api/api_response.cpp" "src/file/file.cpp" "src/file/archive.cpp" "src/file/tsv_file.cpp" "src/file/gz_tsv_file.cpp" "src/file/tsv_file_remote.cpp" "src/file/tsv_row.cpp" "src/transfer/transfer.cpp" "src/hash_table2/hash_table.cpp" "src/hash_table2/hash_table_shard.cpp" "src/hash_table2/hash_table_shard_builder.cpp" "src/hash_table2/builder.cpp" "src/hash_table_helper/hash_table_helper.cpp" "src/parser/parser.cpp" "src/parser/entities.cpp" "src/parser/html_link.cpp" "src/parser/html_parser.cpp" "src/parser/unicode.cpp" 
"src/parser/cc_parser.cpp" "src/downloader/warc_downloader.cpp" "src/downloader/merge_downloader.cpp" "src/URL.cpp" "src/warc/warc.cpp" "src/profiler/profiler.cpp" "src/logger/logger.cpp" "src/utils/thread_pool.cpp" "src/memory/memory.cpp" "src/memory/debugger.cpp" "src/config.cpp" "src/algorithm/algorithm.cpp" "src/algorithm/intersection.cpp" "src/algorithm/sort.cpp" "src/algorithm/hash.cpp" "src/algorithm/hyper_log_log.cpp" "src/algorithm/bloom_filter.cpp" "src/tools/splitter.cpp" "src/tools/find_links.cpp" "src/tools/counter.cpp" "src/tools/calculate_harmonic.cpp" "src/tools/generate_url_lists.cpp" "src/cluster/document.cpp" "src/scraper/scraper.cpp" "src/scraper/scraper_store.cpp" "src/indexer/index_manager.cpp" "src/indexer/console.cpp" "src/indexer/merger.cpp" "src/indexer/score_builder.cpp" "src/indexer/index_reader.cpp" "src/indexer/index_utils.cpp" "src/server/search_server.cpp" "src/server/url_server.cpp" "src/http/server.cpp" "src/http/request.cpp" "src/domain_stats/domain_stats.cpp" "src/debug.cpp" "deps/robots.cc" ) set(SRC_COMMON "src/common/dictionary.cpp" "src/common/system.cpp" "src/common/datetime.cpp" "src/common/dictionary_row.cpp" "src/text/stopwords.cpp" "src/text/text.cpp" ) set(SRC_TESTS "tests/test_hyper_log_log.cpp" "tests/test_memory.cpp" "tests/test_algorithm.cpp" "tests/test_bloom_filter.cpp" "tests/test_cc_parser.cpp" "tests/test_configuration.cpp" "tests/test_counted_index_builder.cpp" "tests/test_datetime.h" "tests/test_file.cpp" "tests/test_hash.cpp" "tests/test_hash_table.cpp" "tests/test_html_parser.cpp" "tests/test_hyper_ball.cpp" "tests/test_index_builder.cpp" "tests/test_index_iteration.cpp" "tests/test_index_reader.cpp" "tests/test_logger.cpp" "tests/test_n_gram.cpp" "tests/test_robot_parser.cpp" "tests/test_scraper.cpp" "tests/test_sharded_index_builder.cpp" "tests/test_sort.cpp" "tests/test_sum_sorted.cpp" "tests/test_text.cpp" "tests/test_thread_pool.cpp" "tests/test_top_k.cpp" "tests/test_unicode.cpp" "tests/test_url.cpp" 
"tests/test_url_record.cpp" # This overloads the new/delete operators to keep track of memory, slows things down a lot. "src/memory/overload.cpp" ) add_executable(run_tests "tests/main.cpp" ${SRC_CLASSES} ${SRC_COMMON} ${SRC_TESTS} ) add_executable(server "src/server.cpp" ${SRC_CLASSES} ${SRC_COMMON} ) add_executable(scraper "src/scraper.cpp" ${SRC_CLASSES} ${SRC_COMMON} ) add_executable(indexer "src/indexer.cpp" ${SRC_CLASSES} ${SRC_COMMON} ) add_executable(alexandria "src/alexandria.cpp" ${SRC_CLASSES} ${SRC_COMMON} ) target_compile_definitions(run_tests PUBLIC IS_TEST) target_compile_definitions(run_tests PUBLIC FT_NUM_SHARDS=16) target_compile_definitions(run_tests PUBLIC HT_NUM_SHARDS=16) target_compile_definitions(run_tests PUBLIC FILE_SERVER="http://127.0.0.1") target_compile_definitions(run_tests PUBLIC COMPILE_WITH_LINK_INDEX) target_compile_options(run_tests PUBLIC -Wall -Werror) target_compile_options(server PUBLIC -Wall -Werror) target_compile_options(scraper PUBLIC -Wall -Werror) target_compile_options(indexer PUBLIC -Wall -Werror) target_compile_options(alexandria PUBLIC -Wall -Werror) target_link_libraries(run_tests PUBLIC ${FCGI_LIBRARY} ${FCGI_LIBRARYCPP} ${CURL_LIBRARIES} ${Boost_LIBRARIES} ZLIB::ZLIB Threads::Threads absl::strings absl::numeric roaring::roaring) target_link_libraries(server PUBLIC ${FCGI_LIBRARY} ${FCGI_LIBRARYCPP} ${CURL_LIBRARIES} ${Boost_LIBRARIES} ZLIB::ZLIB Threads::Threads absl::strings absl::numeric roaring::roaring) target_link_libraries(scraper PUBLIC ${FCGI_LIBRARY} ${FCGI_LIBRARYCPP} ${CURL_LIBRARIES} ${Boost_LIBRARIES} ZLIB::ZLIB Threads::Threads absl::strings absl::numeric roaring::roaring) target_link_libraries(indexer PUBLIC ${FCGI_LIBRARY} ${FCGI_LIBRARYCPP} ${CURL_LIBRARIES} ${Boost_LIBRARIES} ZLIB::ZLIB Threads::Threads absl::strings absl::numeric roaring::roaring) target_link_libraries(alexandria PUBLIC ${FCGI_LIBRARY} ${FCGI_LIBRARYCPP} ${CURL_LIBRARIES} ${Boost_LIBRARIES} ZLIB::ZLIB Threads::Threads 
absl::strings absl::numeric roaring::roaring) ================================================ FILE: Dockerfile ================================================ # syntax=docker/dockerfile:1 FROM ubuntu:latest ARG DEBIAN_FRONTEND=noninteractive RUN apt-get update && apt-get install -y zip make cmake gcc gcc-10 g++ g++-10 libcurl4-openssl-dev libssl-dev libcrypto++-dev libboost-iostreams-dev libboost-filesystem-dev libboost-system-dev libboost-test-dev libfcgi-dev spawn-fcgi nginx vim wget git curl ================================================ FILE: LICENSE ================================================ MIT License Alexandria.org Copyright (c) 2021 Josef Cullhed, , et al. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================ # Alexandria.org 1. [Coding Rules](/documentation/coding_rules.md) 2. [Full text indexes](/documentation/full_text_indexes.md) 3. 
[Hash table](/documentation/hash_table.md) ## Build instructions with docker 1. Checkout repo WINDOWS USERS: You need to run 'git config --global core.autocrlf false' before checking out the repository ``` git clone git@github.com:alexandria-org/alexandria.git ``` 2. Build docker image ``` docker build . -t alexandria ``` 3. Run container ``` docker container run --name alexandria -v ${PWD}:/alexandria -it -d alexandria ``` 4. Attach to container. ``` docker exec -it alexandria /bin/bash ``` 5. Navigate to directory ``` cd /alexandria ``` 6. Initialize docker ``` scripts/init-docker.sh ``` 7. Configure with cmake ``` mkdir build; cd build; cmake .. ``` 8. Build all ``` make -j4 ``` 9. Run test suite ``` ./run_tests ``` ## How to build manually (not recommended) 1. Configure the system (Tested on Ubuntu 20.04) ``` # Will alter your system and install dependencies with apt. ./scripts/install-deps.sh # Will download and build zlib, aws-lambda-cpp and aws-sdk-cpp will only alter the local directory. ./scripts/build-deps.sh ``` 2. Build with cmake ``` mkdir build cd build cmake .. -DCMAKE_BUILD_TYPE=Debug or cmake .. -DCMAKE_BUILD_TYPE=Release make -j24 ``` 3. Download test data to local server. To run the test suite you need to install nginx and pre-download all the data: [Configure local nginx test data server](/documentation/configure_local_nginx.md) 4. Create output directories. Note, this will create a bunch of directories in the /mnt so make sure you don't have anything there. ``` ./scripts/prepare-output-dirs.sh ``` 5. Run the test suite ``` cd build make run_tests -j24 ./run_tests ``` ## Notes On nodes with spinning disks we should turn off energy saving: ``` hdparm -B 255 /dev/sda ``` ## Debugging notes ### Debugging scraper with gdb: By default, gdb captures SIGPIPE of a process and pauses it. However, some programs ignore SIGPIPE. So, the default behavior of gdb is not desired when debugging those programs.
To avoid gdb stopping on SIGPIPE, use the following command in gdb: ```handle SIGPIPE nostop noprint pass``` ================================================ FILE: cmake/Findfcgi.cmake ================================================ # CMake module to search for FastCGI headers # # If it's found it sets FCGI_FOUND to TRUE # and following variables are set: # FCGI_INCLUDE_DIR # FCGI_LIBRARY FIND_PATH(FCGI_INCLUDE_DIR fcgio.h PATHS /usr/include /usr/local/include /usr/include/fastcgi "$ENV{LIB_DIR}/include" $ENV{INCLUDE} ) FIND_LIBRARY(FCGI_LIBRARY NAMES fcgi libfcgi PATHS /usr/local/lib /usr/lib "$ENV{LIB_DIR}/lib" "$ENV{LIB}" ) FIND_LIBRARY(FCGI_LIBRARYCPP NAMES libfcgi++.so PATHS /usr/local/lib /usr/lib "$ENV{LIB_DIR}/lib" "$ENV{LIB}" ) IF (FCGI_INCLUDE_DIR AND FCGI_LIBRARY) SET(FCGI_FOUND TRUE) ENDIF (FCGI_INCLUDE_DIR AND FCGI_LIBRARY) IF (FCGI_FOUND) IF (NOT FCGI_FIND_QUIETLY) MESSAGE(STATUS "Found FCGI: ${FCGI_LIBRARY}") MESSAGE(STATUS "Found FCGI: ${FCGI_LIBRARYCPP}") ENDIF (NOT FCGI_FIND_QUIETLY) ELSE (FCGI_FOUND) IF (FCGI_FIND_REQUIRED) MESSAGE(FATAL_ERROR "Could not find FCGI") ENDIF (FCGI_FIND_REQUIRED) ENDIF (FCGI_FOUND) ================================================ FILE: config.conf ================================================ # Cluster config nodes_in_cluster = 3 node_id = 0 # Indexer config batches[] = ALEXANDRIA-MANUAL-01 batches[] = CC-MAIN-2021-25 batches[] = CC-MAIN-2021-31 link_batches[] = CC-MAIN-2021-31 link_batches[] = CC-MAIN-2021-25 link_batches[] = CC-MAIN-2021-21 link_batches[] = CC-MAIN-2021-17 link_batches[] = CC-MAIN-2021-10 link_batches[] = CC-MAIN-2021-04 link_batches[] = CC-MAIN-2020-50 link_batches[] = CC-MAIN-2020-45 # Server config worker_count = 8 query_max_words = 10 # Maximum number of words used in query.
query_max_len = 200 deduplicate_domain_count = 5 pre_result_limit = 200000 result_limit = 1000 # Full text config ft_max_sections = 4 ft_max_results_per_section = 2000000 ================================================ FILE: documentation/alexandria.md ================================================ Usage: ./alexandria [OPTIONS]... ## Options **--downloader [commoncrawl-batch] [limit] [offset]** Downloads files from the given commoncrawl batch. Limit and offset arguments are used for downloading a subset of the files. Example ``` ./alexandria --downloader CC-MAIN-2022-27 2500 0 ``` Will download the first 2500 files from CC-MAIN-2022-27 and upload them to the 'upload' host. See config documentation. **--downloader-merge** Merges downloaded files. This should run on the upload host to merge the different downloaded batches into our hash table. **--hash-table-url [URL]** Searches the local hash table called 'all_urls' for the given URL. **--hash-table-url-hash [URL-hash]** Searches the local hash table called 'all_urls' for the given URL-hash. **--hash-table-count** Counts all items in local hash table called 'all_urls'. **--hash-table-find-all [HOST]** Searches the local hash table called 'all_urls' for urls from specified host. This takes several days for large hash table. **--hash-table-count [HOST]** Estimated count of host from hash table by only counting one shard and multiply by number of shards. **--hash-table-optimize-shard [SHARD]** Optimizes shard for local hash table called 'all_urls'. **--internal-harmonic** Run the whole internal links harmonic calculator. Should run on 'upload' host. ================================================ FILE: documentation/api_response_format.md ================================================ # Api Response Format This is a description of the endpoints available on a node. 
### Perform search ``` curl http://node0002.alexandria.org/?q=the%20beatles { "status": "success", "time_ms": 35.876, "total_found": 245436, "total_url_links_found": 4092, "total_domain_links_found": 4092, "links_handled": 674, "link_domain_matches": 18059, "link_url_matches": 589, "results": [{ "url": "https://www.example.com/", "title": "Example dot com", "snippet": "Lorem ipsum dolor esit", "score": 182.51408386230469, "domain_hash": "2892282071861106665", "url_hash": "2892281418178079567" }] } The url flag d can be used to control deduplication: curl http://node0002.alexandria.org/?q=the%20beatles&d=a curl http://node0002.alexandria.org/?q=the%20beatles&d=d d=a // No deduplication, show all results d=d // Deduplication Default value is d=d ``` ### Perform url lookup ``` curl http://node0002.alexandria.org/?u=https://www.example.org/ { "status": "success", "time_ms": 35.876, "response": "[DATA]" } ``` ### Fetch information about search result ``` curl http://node0002.alexandria.org/?s=example%20query { "status": "success", "time_ms": 13.984, "index": { "total": 980770801, "words": { "example": 0.0080152416772448342, "query": 0.0017581304401006531 } }, "link_index": { "total": 472012858, "words": { "example": 0.000581251114985516, "query": 6.3595725182554242e-05 } } } ``` ### Fetch status of the node. 
``` curl http://node0002.alexandria.org/status { "status": "success", "time_ms": 13.984, "total_disk_space": 89374934876, "avail_disk_space": 83975235, "avail_disk_percent": 0.0832, "index": { "items": 980770801, "full_text_disk_used": 973295875, "full_text_disk_percent": 0.5423, "hash_table_disk_used": 839265, "hash_table_disk_percent": 0.05423 }, "link_index": { "items": 980770801, "full_text_disk_used": 973295875, "full_text_disk_percent": 0.2423, "hash_table_disk_used": 839265, "hash_table_disk_percent": 0.0423 } } ``` ### Combined api response (api.alexandria.org) ``` curl https://api.alexandria.org/?q=the%20beatles&p=1 { "status": "success", "time_ms": 35.876, "total_found": 245436, "total_url_links_found": 4092, "total_domain_links_found": 4092, "links_handled": 674, "link_domain_matches": 18059, "link_url_matches": 589, "page_max": 10, "results": [{ "url": "https://www.example.com/", "display_url": "https://www.example.com/", "title": "Example dot com", "snippet": "Lorem ipsum dolor esit", "score": 182.51408386230469, "domain_hash": "2892282071861106665", "url_hash": "2892281418178079567", "exact_match": 1, "phrase_match": 1, "year": 3300, "is_old": 0, "is_subdomain": 0, "domain": "www.example.com" }, ... ] } ``` ================================================ FILE: documentation/caching.md ================================================ ## Caching Our nodes should try to use as much RAM as possible to store index data for common tokens in RAM. I think the best way would be to hold a list of the most commonly queried tokens. We can use /proc/meminfo to retrieve information about available memory on the server. ================================================ FILE: documentation/coding_rules.md ================================================ ## Coding rules 1. Indent with tabs. 2. Use auto for variable declarations when possible. 3. Never put "using namespace std" in any file. 4. 
Prefix class member variables with m_; this way you know whether you are using a member or a local variable. 5. All namespaces, classes, functions and variables should be lower_case. 6. All files within a sub-directory must declare everything within a namespace with the same name as the directory. For example src/file/tsv_file.h must declare everything within the namespace file:: 7. Prefer smart pointers over regular pointers. 8. Prefer if statements over switch statements. ## Indentation examples Indent with tabs! ### pointers ```c++ // * and & are glued to the variable int *ptr = new int[100]; int *ptr2 = &addr; ``` ### operators ```c++ // Spaces between binary operators int a = 1 + 2; int b = multiple * (add1 + add2); a += b; // Unary operators are glued to variable int a = 1; a++; int b = -a; ``` ### functions ```c++ // Spaces after comma int add(int a, int b) { return a + b; } // Spaces after comma here too add(123, 333); ``` ### classes ```c++ template<typename data_record> class index_builder { public: index_builder(const std::string &db_name, size_t id); int public_func(); private: int m_member; int m_counter; int private_func(); }; ``` ### if ```c++ // Space between "if" and "(" // Space between ")" and "{" if (something) { do_something(); } else if (something_else) { do_something_else(); } else { do_else(); } ``` ### loops ```c++ // Prefer range based loops. for (const auto &iter : m_map) { } // But if you need a standard loop indent it like this. for (int i = 0; i < 100; i++) { } ``` ### memory allocation ```c++ // Avoid new/delete, use smart pointers everywhere. // If you just need a regular pointer to memory do this: std::unique_ptr<char[]> allocator; try { allocator = std::make_unique<char[]>(1000); } catch (std::bad_alloc &error) { // Handle allocation error. } char *ptr = allocator.get(); // Use ptr as regular pointer to 1000 chars. // ptr will be deleted automatically when allocator goes out of scope.
``` ================================================ FILE: documentation/configure_local_nginx.md ================================================ # Configure local nginx server. 1. Install nginx ``` apt-get install nginx ``` 2. Add configuration to /etc/nginx/sites-available/default (If you are running other sites locally you should probably do something else here) ``` server { listen 80 default_server; listen [::]:80 default_server; root /var/www/html/node0003.alexandria.org; index index.html index.htm index.nginx-debian.html; server_name _; location / { try_files $uri $uri/ =404; autoindex on; } } ``` 3. Download test data to /var/www/html ``` ./scripts/download-test-data.sh /var/www/html ``` ================================================ FILE: documentation/full_text_indexes.md ================================================ # The alexandria full text index A full text index in its simplest form is a hash map from an integer word id ```key``` to a list of documents. There are two kinds of data structures called ```index``` and ```counted_index```. Both data structures act on a given template type ```data_record```. The two data structures share the same data layout except for the last part where ```index``` stores roaring bitmaps while `counted_index` stores the records. ## Data layout The index starts with a hash table. The hash table stores the position for the page containing `key` at index `key % hash_table_size`. ``` hash table : uint64_t[hash_table_size] (8 x hash_table_size bytes) num_records : uint64_t (8 bytes) list of records : data_record[num_records] (sizeof(data_record) * num_records bytes) consecutive pages : page[varying] (undetermined size) ``` A single page consists of a list of keys. Each key then has a corresponding position among the bitmaps and a length of the bitmap. The bitmaps (of varying length) are then stored consecutively.
``` num_keys : uint64_t (8 bytes) list of keys : uint64_t[num_keys] (8 x num_keys bytes) list of positions : uint64_t[num_keys] (8 x num_keys bytes) list of lengths : uint64_t[num_keys] (8 x num_keys bytes) consecutive bitmaps : bitmap[num_keys] (undetermined size) ``` ================================================ FILE: documentation/ideas.md ================================================ # Similar words To handle similar words (saluhall, saluhallen) we should create a hashtable with similar words and as an additional index create "saluhall+" by combining our existing indexes of saluhall, saluhallen, saluhallarna etc. into one additional index. # Autocomplete We should base our autocomplete on the most common words in titles of documents before and after each word. For example "Uppsala" could suggest "Uppsala kommun", "Uppsala universitet" and "Destination Uppsala" based on the search results. ================================================ FILE: documentation/index_file_format.md ================================================ # Index file format ```8 bytes number of keys (n) 8 * n bytes keys 8 * n bytes positions 8 * n bytes lengths (len(k) number of records for key k) 8 * n bytes total found results [Data Records] ``` ``` Data records are structured like this: len(k) * (8 bytes unsigned long URL id, 4 bytes single precision float score) ================================================ FILE: documentation/indexer.md ================================================ ### NAME indexer - manually index data or analyze things ### SYNOPSIS indexer [OPTION] ### DESCRIPTION ``` --split source_batch target_prefix splits the urls in the local source batch and outputs them into {target_prefix}-[0-23]/files. 
for example --split CC-MAIN-2021-04 /mnt/crawl-data/NODE --split-count --split-count-domains --split-count-links --split-make-scraper-urls --tools-download-batch --tools-upload-urls-with-links --tools-find-links --calculate-harmonic-hosts --calculate-harmonic-links --calculate-harmonic --host-hash --host-hash-mod --console run the interactive console for making debug searches. --index-domans BATCH LIMIT OFFSET run the indexer for our domain index adding the urls+data from BATCH --index-links BATCH LIMIT OFFSET run the link indexer adding url_ and domain_ links from BATCH --index-words BATCH LIMIT OFFSET run the word indexer adding word data from BATCH --index-urls BATCH LIMIT OFFSET run the url indexer on batch generating one index per domain --index-snippets BATCH LIMIT OFFSET run the snippet indexer --truncate-domains --truncate-links --truncate-words --truncate-urls --truncate-snippets --info print info about indexes ``` ================================================ FILE: documentation/installing_nodes.md ================================================ If problem with raid information on drive unmount all partitions and do this: ``` wipefs -a /dev/nvme1n1 ``` then reset and install node again. 
To setup node with two drives run: ``` source <(curl -s https://raw.githubusercontent.com/alexandria-org/alexandria/main/scripts/bootstrap_node_2drives.sh) ``` ================================================ FILE: documentation/performance_journal.md ================================================ ## Performance journal ### File system testing Ext2 (noatime,nodiratime,barrier=0) ``` $ dd if=/dev/zero of=/tmp/test1.img bs=10G count=1 oflag=dsync 0+1 records in 0+1 records out 2147479552 bytes (2.1 GB, 2.0 GiB) copied, 4.76649 s, 451 MB/s $ echo 3 > /proc/sys/vm/drop_caches $ time dd if=/tmp/test1.img of=/dev/null bs=8k 262143+1 records in 262143+1 records out 2147479552 bytes (2.1 GB, 2.0 GiB) copied, 1.43043 s, 1.5 GB/s real 0m1.435s user 0m0.013s sys 0m0.763s ``` Ext2 (relatime) ``` $ dd if=/dev/zero of=/tmp/test1.img bs=10G count=1 oflag=dsync 0+1 records in 0+1 records out 2147479552 bytes (2.1 GB, 2.0 GiB) copied, 5.02563 s, 427 MB/s $ echo 3 > /proc/sys/vm/drop_caches $ time dd if=/tmp/test1.img of=/dev/null bs=8k 262143+1 records in 262143+1 records out 2147479552 bytes (2.1 GB, 2.0 GiB) copied, 1.48533 s, 1.4 GB/s real 0m1.490s user 0m0.046s sys 0m0.604s ``` Ext4 (noatime,nodiratime,barrier=0): ``` $ dd if=/dev/zero of=/tmp/test1.img bs=10G count=1 oflag=dsync 0+1 records in 0+1 records out 2147479552 bytes (2.1 GB, 2.0 GiB) copied, 2.26469 s, 948 MB/s $ echo 3 > /proc/sys/vm/drop_caches $ time dd if=/tmp/test1.img of=/dev/null bs=8k 262143+1 records in 262143+1 records out 2147479552 bytes (2.1 GB, 2.0 GiB) copied, 0.821499 s, 2.6 GB/s real 0m0.824s user 0m0.004s sys 0m0.648s ``` Ext4 (relatime): ``` $ dd if=/dev/zero of=/tmp/test1.img bs=10G count=1 oflag=dsync 0+1 records in 0+1 records out 2147479552 bytes (2.1 GB, 2.0 GiB) copied, 2.15461 s, 997 MB/s $ echo 3 > /proc/sys/vm/drop_caches $ time dd if=/tmp/test1.img of=/dev/null bs=8k 262143+1 records in 262143+1 records out 2147479552 bytes (2.1 GB, 2.0 GiB) copied, 0.822013 s, 2.6 GB/s real 0m0.825s 
user 0m0.029s sys 0m0.568s ``` Conclusion. Run ext4 ### Software load testing 2021-10-06, AX61-NVME with two discs ``` Server Software: nginx/1.18.0 Server Hostname: node0002.alexandria.org Server Port: 80 Concurrency Level: 5 Time taken for tests: 294.451 seconds Complete requests: 2000 Failed requests: 0 Write errors: 0 Total transferred: 294262066 bytes HTML transferred: 293986342 bytes Requests per second: 6.79 [#/sec] (mean) Time per request: 736.127 [ms] (mean) Time per request: 147.225 [ms] (mean, across all concurrent requests) Transfer rate: 975.94 [Kbytes/sec] received Connection Times (ms) min mean[+/-sd] median max Connect: 12 19 10.1 16 152 Processing: 16 717 461.5 652 2896 Waiting: 0 662 431.7 587 2770 Total: 31 736 460.4 671 2911 Percentage of the requests served within a certain time (ms) 50% 671 66% 879 75% 1009 80% 1108 90% 1344 95% 1595 98% 1864 99% 2062 100% 2911 (longest request) ``` 2021-10-10, AX61-NVME with two discs ``` Server Software: nginx/1.18.0 Server Hostname: node0002.alexandria.org Server Port: 80 Concurrency Level: 5 Time taken for tests: 328.051 seconds Complete requests: 2000 Failed requests: 0 Write errors: 0 Total transferred: 255881934 bytes HTML transferred: 255605934 bytes Requests per second: 6.10 [#/sec] (mean) Time per request: 820.128 [ms] (mean) Time per request: 164.026 [ms] (mean, across all concurrent requests) Transfer rate: 761.73 [Kbytes/sec] received Connection Times (ms) min mean[+/-sd] median max Connect: 12 52 95.6 25 1560 Processing: 16 767 558.9 689 3961 Waiting: 15 638 427.9 594 2631 Total: 32 819 558.5 742 4113 Percentage of the requests served within a certain time (ms) 50% 742 66% 982 75% 1159 80% 1260 90% 1560 95% 1831 98% 2186 99% 2470 100% 4113 (longest request) ``` 2021-10-10, AX41-NVMe with four discs ``` Server Software: nginx/1.18.0 Server Hostname: 65.21.238.146 Server Port: 80 Concurrency Level: 5 Time taken for tests: 278.694 seconds Complete requests: 2000 Failed requests: 0 Write errors: 0 
Total transferred: 232745432 bytes HTML transferred: 232469432 bytes Requests per second: 7.18 [#/sec] (mean) Time per request: 696.735 [ms] (mean) Time per request: 139.347 [ms] (mean, across all concurrent requests) Transfer rate: 815.56 [Kbytes/sec] received Connection Times (ms) min mean[+/-sd] median max Connect: 12 69 98.4 35 1107 Processing: 14 627 698.4 454 9790 Waiting: 14 435 346.5 368 4045 Total: 29 696 719.1 522 10159 Percentage of the requests served within a certain time (ms) 50% 522 66% 755 75% 927 80% 1050 90% 1382 95% 1781 98% 2415 99% 3439 100% 10159 (longest request) ``` 2021-10-10, AX41-NVMe with four discs ``` Server Software: nginx/1.18.0 Server Hostname: 65.21.238.146 Server Port: 80 Concurrency Level: 5 Time taken for tests: 252.503 seconds Complete requests: 2000 Failed requests: 0 Write errors: 0 Total transferred: 230349918 bytes HTML transferred: 230073780 bytes Requests per second: 7.92 [#/sec] (mean) Time per request: 631.258 [ms] (mean) Time per request: 126.252 [ms] (mean, across all concurrent requests) Transfer rate: 890.88 [Kbytes/sec] received Connection Times (ms) min mean[+/-sd] median max Connect: 12 54 78.2 27 1068 Processing: 15 576 519.3 436 3659 Waiting: 15 421 325.7 354 2421 Total: 30 631 527.6 491 3728 Percentage of the requests served within a certain time (ms) 50% 491 66% 707 75% 861 80% 988 90% 1355 95% 1736 98% 2100 99% 2419 100% 3728 (longest request) ``` 2021-10-10, AX61-NVME with two discs, 4 partitions ``` Server Software: nginx/1.18.0 Server Hostname: 65.21.125.158 Server Port: 80 Concurrency Level: 5 Time taken for tests: 263.283 seconds Complete requests: 2000 Failed requests: 0 Write errors: 0 Total transferred: 282821583 bytes HTML transferred: 282545445 bytes Requests per second: 7.60 [#/sec] (mean) Time per request: 658.209 [ms] (mean) Time per request: 131.642 [ms] (mean, across all concurrent requests) Transfer rate: 1049.03 [Kbytes/sec] received Connection Times (ms) min mean[+/-sd] median max Connect: 
13 28 32.9 26 630 Processing: 17 629 434.1 563 3051 Waiting: 15 587 412.8 517 2949 Total: 36 657 435.8 593 3090 Percentage of the requests served within a certain time (ms) 50% 593 66% 774 75% 914 80% 1003 90% 1260 95% 1480 98% 1708 99% 1959 100% 3090 (longest request) ``` 2021-10-10, AX61-NVME with two discs, 4 partitions ``` Server Software: nginx/1.18.0 Server Hostname: 65.21.125.158 Server Port: 80 Concurrency Level: 5 Time taken for tests: 249.241 seconds Complete requests: 2000 Failed requests: 0 Write errors: 0 Total transferred: 267058842 bytes HTML transferred: 266782842 bytes Requests per second: 8.02 [#/sec] (mean) Time per request: 623.101 [ms] (mean) Time per request: 124.620 [ms] (mean, across all concurrent requests) Transfer rate: 1046.38 [Kbytes/sec] received Connection Times (ms) min mean[+/-sd] median max Connect: 13 27 19.3 25 734 Processing: 15 596 469.4 506 3785 Waiting: 0 554 449.3 467 3660 Total: 32 622 470.7 531 3805 Percentage of the requests served within a certain time (ms) 50% 531 66% 735 75% 878 80% 974 90% 1234 95% 1495 98% 1809 99% 2104 100% 3805 (longest request) ``` 2021-10-12, AX61-NVME with four discs and 8 partitions ``` Server Software: nginx/1.18.0 Server Hostname: 135.181.182.4 Server Port: 80 Concurrency Level: 5 Time taken for tests: 264.412 seconds Complete requests: 2000 Failed requests: 0 Write errors: 0 Total transferred: 274309399 bytes HTML transferred: 274033261 bytes Requests per second: 7.56 [#/sec] (mean) Time per request: 661.029 [ms] (mean) Time per request: 132.206 [ms] (mean, across all concurrent requests) Transfer rate: 1013.12 [Kbytes/sec] received Connection Times (ms) min mean[+/-sd] median max Connect: 13 27 16.1 25 348 Processing: 14 633 449.6 565 2996 Waiting: 0 590 425.7 520 2545 Total: 34 661 450.3 594 3014 Percentage of the requests served within a certain time (ms) 50% 594 66% 772 75% 905 80% 1000 90% 1271 95% 1510 98% 1834 99% 1997 100% 3014 (longest request) ``` 2021-10-12, AX61-NVME with four 
discs and 8 partitions ``` Server Software: nginx/1.18.0 Server Hostname: 135.181.182.4 Server Port: 80 Concurrency Level: 5 Time taken for tests: 233.408 seconds Complete requests: 2000 Failed requests: 0 Write errors: 0 Total transferred: 272488725 bytes HTML transferred: 272213277 bytes Requests per second: 8.57 [#/sec] (mean) Time per request: 583.519 [ms] (mean) Time per request: 116.704 [ms] (mean, across all concurrent requests) Transfer rate: 1140.07 [Kbytes/sec] received Connection Times (ms) min mean[+/-sd] median max Connect: 12 25 10.1 24 187 Processing: 15 558 402.0 487 2727 Waiting: 0 512 377.0 440 2051 Total: 33 583 402.8 512 2757 Percentage of the requests served within a certain time (ms) 50% 512 66% 695 75% 806 80% 882 90% 1114 95% 1373 98% 1621 99% 1779 100% 2757 (longest request) ``` ================================================ FILE: documentation/search_result_ranking.md ================================================ # Search Result Ranking This document describes how search results are indexed and ranked. ## Input Input to our indexer is a sequence of deduplicated urls with the following data. ``` { url: "https://www.example.com/", title: "Example Page", meta_description: "", h1: "Example Domain", text: "This domain is for use in illustrative examples in documents. You may use this domain in literature without prior coordination or asking for permission. More information..." } ``` ## 1. Domain level Each url is added with the url hash as key. The tokens are not deduplicated throughout the domain. 
``` domain_score: idf * sum(tf_ + ) ``` ``` domain_score = expm1(5 * link.m_score) + 0.1; url_score = expm1(10 * link.m_score) + 0.1; ``` ================================================ FILE: documentation/statues_swe.tex ================================================ \documentclass[12pt, a4paper]{article} \usepackage[T1]{fontenc} \usepackage[utf8]{inputenc} \usepackage[swedish]{babel} \title{Stadgar för Föreningen Alexandria.org} \date{Januari 2022} \begin{document} \maketitle \paragraph{§ 1 Föreningens firma} \paragraph{} Föreningens firma är Föreningen Alexandria.org och föreningens firmatecknare är ordförande eller annan person utsedd till firmatecknare av styrelsen. \paragraph{§ 2 Föreningens ändamål} \paragraph{} Föreningen har som ändamål att göra kunskap mer tillgänglig. Föreningen ska uppfylla sitt ändamål genom att utveckla och tillhandahålla en sökmotor som är gratis och utan annonser. Källkoden till sökmotorn ska publiceras som öppen källkod. \paragraph{§ 3 Föreningens säte} \paragraph{} Föreningen har sitt säte i Uppsala. \paragraph{§ 4 Medlemskap} \paragraph{} Föreningens medlemmar är aktiva i föreningens verksamhet. Nya medlemmar måste godkännas av styrelsen. \paragraph{§ 5 Medlemsavgifter} \paragraph{} Medlem ska betala den medlemsavgift som årligen fastställs av årsmötet. \pagebreak \paragraph{§ 6 Styrelsen} \paragraph{} Styrelsen består av en ordförande, en kassör, en suppleant och eventuellt ytterligare ledamöter enligt årsmötets beslut. \paragraph{§ 7 Styrelsens uppdrag} \paragraph{} Styrelsen företräder föreningen, bevakar dess intressen och handhar dess angelägenheter. Styrelsen beslutar å föreningens vägnar såvida inte annat föreskrivs i dessa stadgar. Styrelsen ska verkställa av årsmötet fattade beslut, handha föreningens ekonomiska angelägenheter och föra räkenskaper, samt avge årsredovisning till årsstämman för det senaste räkenskapsåret. 
Styrelsen sammanträder när ordföranden finner det erforderligt eller om minst två styrelseledamöter begär detta. \paragraph{} Styrelsen är beslutsför då minst hälften av ledamöterna, avrundat uppåt, är närvarande. Styrelsebeslut fattas med enkel majoritet. Vid lika röstetal gäller den mening ordföranden biträder. \paragraph{§ 8 Räkenskaper} \paragraph{} Räkenskapsår ska vara kalenderår. \paragraph{§ 9 Revisor} \paragraph{} Styrelsens förvaltning ska årligen granskas av en på årsmötet utsedd revisor. Revisorn ska senast den 1 mars avge sin revisionsberättelse. Revisorn får ej vara medlem i styrelsen. \paragraph{§ 10 Årsmöte} \paragraph{} Ordinarie årsmöte, vilket är föreningens högsta beslutande organ, hålls årligen före den 30 juni på tid och plats som styrelsen bestämmer. Kallelse sker via epost minst 2 veckor före utsatt möte. Motioner som har inkommit senast 7 dagar före årsmötet ska anses ha kommit i tid. Motioner skickas via epost. \paragraph{} Vid ordinarie årsmöte ska följande ärenden behandlas: \begin{enumerate} \item Val av ordförande och sekreterare för mötet. \item Fastställande av röstlängd för mötet. \item Fastställande av dagordning. \item Styrelsens verksamhetsberättelse för det senaste verksamhetsåret. \item Styrelsens förvaltningsberättelse (balans- och resultaträkning) för det senaste verksamhets-/räkenskapsåret. \item Revisionsberättelsen för verksamhets-/räkenskapsåret. \item Fråga om ansvarsfrihet för styrelsen för den tid revisionen avser. \item Fastställande av medlemsavgifter. \item Fastställande av ev. verksamhetsplan och behandling av budget för det kommande verksamhets-/räkenskapsåret. \item Val av ordförande i föreningen för en tid av 1 år. \item Val av kassör, övriga styrelseledamöter samt suppleanter för en tid av 1 år. \item Val av revisorer. \item Behandling av styrelsens förslag och i rätt tid inkomna motioner. \item Övriga frågor. 
\end{enumerate} \paragraph{§ 11 Extra årsmöte} \paragraph{} Extra årsmöte hålls när styrelsen eller revisorerna finner att det är nödvändigt. Kallelse sker via epost minst 2 veckor före utsatt möte. \paragraph{§ 12 Rösträtt} \paragraph{} Vid årsmöte har varje medlem en röst. Rösträtten är personlig och kan inte utövas genom ombud. \paragraph{§ 13 Beslut, omröstning och beslutsmässighet} \paragraph{} Beslut fattas med bifallsrop (acklamation) eller om så begärs, efter omröstning (votering). \paragraph{} Omröstning sker öppet, utom vid val där sluten omröstning ska äga rum om någon begär detta. Beslut fattas, såvida dessa stadgar ej föreskriver annat, med enkel majoritet. Vid lika röstetal skall den mening som ordförande biträder vinna bifall. \paragraph{} Mötet är beslutsmässigt med det antal röstberättigade medlemmar som är närvarande på mötet. \paragraph{§ 14 Regler för ändring av stadgarna} \paragraph{} För ändring av dessa stadgar krävs beslut av två på varandra följande ordinarie årsmöten. Förslag till ändring av stadgarna får ges såväl av medlem som styrelsen. \paragraph{§ 15 Utträde} \paragraph{} Medlem som önskar utträda ur föreningen ska skriftligen anmäla detta till styrelsen och anses därmed omedelbart ha lämnat föreningen. \paragraph{§ 16 Uteslutning} \paragraph{} Medlem får uteslutas från föreningen om den har försummat att betala beslutade avgifter, motarbetat föreningens verksamhet eller ändamål, eller skadat föreningens intressen. Beslut om uteslutning fattas av styrelsen. 
\end{document}

================================================ FILE: scripts/bootstrap_node_2drives.sh ================================================
#!/bin/bash
#
# Bootstraps a search node:
#   1. installs the required packages,
#   2. splits /dev/nvme1n1 into four ext4 partitions mounted on /mnt/4../mnt/7
#      (/mnt/0../mnt/3 are plain directories on the existing file system),
#   3. creates the per-shard directory layout,
#   4. configures nginx as a FastCGI front end for 127.0.0.1:8000,
#   5. registers the alexandria systemd service and writes /etc/alexandria.conf,
#   6. downloads and unpacks the release archive into /alexandria.
#
# NOTE: documentation/installing_nodes.md runs this file with
# `source <(curl ...)`, i.e. inside the caller's shell. For that reason the
# script deliberately uses neither `set -e` nor `exit`.
# Every mkdir uses -p so that a second run does not fail on existing paths.

apt-get update
apt-get -y install vim parted zip unzip nginx

# _mkpart DISC MOUNT1 MOUNT2 MOUNT3 MOUNT4
#
# Creates a GPT label on DISC, splits it into four equally sized ext4
# partitions (DISCp1..DISCp4), mounts them on the given mount points and
# appends matching entries to /etc/fstab.
_mkpart() {
	# `local` keeps these names out of the shell that sources this script.
	local disc=$1
	local mountpoint1=$2
	local mountpoint2=$3
	local mountpoint3=$4
	local mountpoint4=$5

	parted -s "$disc" mklabel gpt
	parted -s -a optimal "$disc" mkpart primary ext4 0% 25%
	parted -s -a optimal "$disc" mkpart primary ext4 25% 50%
	parted -s -a optimal "$disc" mkpart primary ext4 50% 75%
	parted -s -a optimal "$disc" mkpart primary ext4 75% 100%

	# Presumably gives the kernel time to expose the new partition devices.
	sleep 1

	mkfs.ext4 -F "${disc}p1"
	mkfs.ext4 -F "${disc}p2"
	mkfs.ext4 -F "${disc}p3"
	mkfs.ext4 -F "${disc}p4"

	mkdir -p "$mountpoint1"
	mkdir -p "$mountpoint2"
	mkdir -p "$mountpoint3"
	mkdir -p "$mountpoint4"

	mount "${disc}p1" "$mountpoint1"
	mount "${disc}p2" "$mountpoint2"
	mount "${disc}p3" "$mountpoint3"
	mount "${disc}p4" "$mountpoint4"

	echo "" >> /etc/fstab
	echo "${disc}p1 $mountpoint1 ext4 noatime,nodiratime,barrier=0 0 0" >> /etc/fstab
	echo "${disc}p2 $mountpoint2 ext4 noatime,nodiratime,barrier=0 0 0" >> /etc/fstab
	echo "${disc}p3 $mountpoint3 ext4 noatime,nodiratime,barrier=0 0 0" >> /etc/fstab
	echo "${disc}p4 $mountpoint4 ext4 noatime,nodiratime,barrier=0 0 0" >> /etc/fstab
}

# Shards 0-3 live on the existing file system, shards 4-7 on the second drive.
mkdir -p /mnt/0
mkdir -p /mnt/1
mkdir -p /mnt/2
mkdir -p /mnt/3

_mkpart /dev/nvme1n1 /mnt/4 /mnt/5 /mnt/6 /mnt/7

for shard in $(seq 0 7); do
	mkdir -p "/mnt/$shard/input"
	mkdir -p "/mnt/$shard/output"
	mkdir -p "/mnt/$shard/upload"
	mkdir -p "/mnt/$shard/hash_table"
	mkdir -p "/mnt/$shard/full_text"
	mkdir -p "/mnt/$shard/tmp"
done

# nginx forwards every request to the FastCGI server on port 8000.
# The \$ escapes keep the nginx variables from being expanded by the shell.
echo "server {
	listen 80;
	server_name localhost;

	location / {
		fastcgi_pass 127.0.0.1:8000;
		fastcgi_param GATEWAY_INTERFACE CGI/1.1;
		fastcgi_param SERVER_SOFTWARE nginx;
		fastcgi_param QUERY_STRING \$query_string;
		fastcgi_param REQUEST_METHOD \$request_method;
		fastcgi_param CONTENT_TYPE \$content_type;
		fastcgi_param CONTENT_LENGTH \$content_length;
		fastcgi_param SCRIPT_FILENAME \$document_root\$fastcgi_script_name;
		fastcgi_param SCRIPT_NAME \$fastcgi_script_name;
		fastcgi_param REQUEST_URI \$request_uri;
		fastcgi_param DOCUMENT_URI \$document_uri;
		fastcgi_param DOCUMENT_ROOT \$document_root;
		fastcgi_param SERVER_PROTOCOL \$server_protocol;
		fastcgi_param REMOTE_ADDR \$remote_addr;
		fastcgi_param REMOTE_PORT \$remote_port;
		fastcgi_param SERVER_ADDR \$server_addr;
		fastcgi_param SERVER_PORT \$server_port;
		fastcgi_param SERVER_NAME \$server_name;
	}
}" > /etc/nginx/sites-enabled/default

/etc/init.d/nginx restart

# Dedicated system user that runs the service; its home is /alexandria.
adduser --system --shell /sbin/nologin --gecos "User for running alexandria service" --disabled-password --home /alexandria alexandria

touch /var/log/alexandria.log
chown alexandria:syslog /var/log/alexandria.log

# systemd unit files are line based, so the payload must keep its newlines.
echo "[Unit]
Description=Alexandria Server

[Service]
User=alexandria
WorkingDirectory=/alexandria
ExecStart=/alexandria/server
Nice=-20
Restart=always

[Install]
WantedBy=multi-user.target" > /etc/systemd/system/alexandria.service

echo "# Cluster config
nodes_in_cluster = 4
node_id = 0

# Indexer config
batches[] = NODE-0
batches[] = NODE-1
batches[] = NODE-2
batches[] = NODE-3
batches[] = NODE-4
batches[] = NODE-5

link_batches[] = LINK-0
link_batches[] = LINK-1
link_batches[] = LINK-2
link_batches[] = LINK-3
link_batches[] = LINK-4
link_batches[] = LINK-5

# Server config
worker_count = 8
query_max_words = 10 # Maximum number of words used in query.
query_max_len = 200
deduplicate_domain_count = 5
pre_result_limit = 200000
result_limit = 1000

# Full text config
ft_max_sections = 8
ft_max_results_per_section = 2000000
ft_section_depth = 4" > /etc/alexandria.conf

# adduser normally creates the home directory already; -p makes this a no-op then.
mkdir -p /alexandria
cd /alexandria
wget https://github.com/alexandria-org/alexandria/releases/download/v1.0/alexandria.zip
unzip alexandria.zip

chown -R alexandria /mnt/*

================================================ FILE: scripts/build-deps.sh ================================================
#!/bin/bash

cd `dirname $0`
cd ..
# Build and install the third-party dependencies that download-deps.sh
# placed in deps/ (zlib and CRoaring).
base_path=$(pwd)

cd "$base_path"
cd deps
cd zlib-1.2.12
./configure
make -j4
make install

cd "$base_path"
cd deps

export CC=/usr/bin/gcc
export CXX=/usr/bin/g++

cd CRoaring
# -p: the build directory is still there when the script is run again.
mkdir -p build
cd build
cmake ..
make
make install

================================================ FILE: scripts/clean.sh ================================================
#!/bin/bash
#
# Interactively wipes all local shard data under /mnt/0../mnt/7 and
# recreates the empty directory layout. Only an upper-case "Y" confirms.

cd "$(dirname "$0")"
cd ..

read -p "Do you want to delete your local alexandria data? [Y/n] " -n 1 -r
echo
if [[ $REPLY =~ ^[Y]$ ]]
then
	for shard in $(seq 0 7); do
		# -f: an already empty shard is not an error.
		rm -rf "/mnt/$shard"/*
		# -p: the shard root normally still exists at this point.
		mkdir -p "/mnt/$shard"
		mkdir -p "/mnt/$shard/input"
		mkdir -p "/mnt/$shard/output"
		mkdir -p "/mnt/$shard/upload"
		mkdir -p "/mnt/$shard/hash_table"
		mkdir -p "/mnt/$shard/full_text"
		mkdir -p "/mnt/$shard/tmp"
	done
else
	echo "Ignoring"
fi

================================================ FILE: scripts/download-deps.sh ================================================
#!/bin/bash
#
# Downloads the third-party sources into deps/ at the repository root.

cd "$(dirname "$0")"
cd ..

export CC=/usr/bin/gcc-10
export CXX=/usr/bin/g++-10

base_path=$(pwd)

cd "$base_path"
mkdir -p deps
cd deps

curl -L https://github.com/nlohmann/json/releases/latest/download/json.hpp > json.hpp

curl https://zlib.net/fossils/zlib-1.2.12.tar.gz > zlib-1.2.12.tar.gz
gunzip -f zlib-1.2.12.tar.gz
tar -xvf zlib-1.2.12.tar

git clone https://github.com/abseil/abseil-cpp.git
git clone https://github.com/RoaringBitmap/CRoaring.git

wget https://raw.githubusercontent.com/google/robotstxt/master/robots.cc
wget https://raw.githubusercontent.com/google/robotstxt/master/robots.h

================================================ FILE: scripts/download-test-data.sh ================================================
#!/bin/bash
#
# Usage: download-test-data.sh DEST
#
# Creates the shard directory layout under /mnt and mirrors the test data
# from node0003.alexandria.org into DEST/node0003.alexandria.org.

cd "$(dirname "$0")"

if [ $# -eq 0 ]; then
	echo "Provide destination path as first argument"
	exit 1
fi

for shard in $(seq 0 7); do
	mkdir -p "/mnt/$shard"
	mkdir -p "/mnt/$shard/input"
	mkdir -p "/mnt/$shard/output"
	mkdir -p "/mnt/$shard/upload"
	mkdir -p "/mnt/$shard/hash_table"
	mkdir -p "/mnt/$shard/full_text"
	mkdir -p "/mnt/$shard/tmp"
done

DEST=$1

cd "$DEST" || { echo "target directory does not exist"; exit 127; }

# -f: no error on the first run, when there is no previous mirror to remove.
rm -rf node0003.alexandria.org

base_url="http://node0003.alexandria.org"
http_user="alexandria"
http_password="wmXN6U4u"

# Paths below $base_url, fetched in this order.
remote_paths=(
	"crawl-data/ALEXANDRIA-TEST-01/"
	"crawl-data/ALEXANDRIA-TEST-02/"
	"crawl-data/ALEXANDRIA-TEST-03/"
	"crawl-data/ALEXANDRIA-TEST-04/"
	"crawl-data/ALEXANDRIA-TEST-05/"
	"crawl-data/ALEXANDRIA-TEST-06/"
	"crawl-data/ALEXANDRIA-TEST-07/"
	"crawl-data/ALEXANDRIA-TEST-08/"
	"crawl-data/ALEXANDRIA-TEST-09/"
	"crawl-data/ALEXANDRIA-TEST-10/"
	"crawl-data/ALEXANDRIA-MANUAL-01/warc.paths.gz"
	"crawl-data/ALEXANDRIA-MANUAL-01/files/top_domains.txt.gz"
	"crawl-data/ALEXANDRIA-MANUAL-01/files/50_top_domains.txt.gz"
	"dev_files/"
	"example.txt"
	"example.txt.gz"
	"test-data/"
)

for remote_path in "${remote_paths[@]}"; do
	wget -r -l1 --no-parent "$base_url/$remote_path" --http-user="$http_user" --http-password="$http_password"
done

mkdir -p node0003.alexandria.org/nodes
mkdir -p node0003.alexandria.org/nodes/test0001
mkdir -p node0003.alexandria.org/upload-tmp

chown -R www-data:www-data node0003.alexandria.org

================================================ FILE: scripts/find_missing_files_in_batch.sh ================================================
#!/bin/bash
#
# Usage: find_missing_files_in_batch.sh BATCH
#
# Walks the Common Crawl warc.paths list of BATCH and records every entry
# whose local copy under /mnt is missing or smaller than 1000 bytes in
# /mnt/crawl-data/BATCH/missing.paths, which is gzipped at the end.

cd "$(dirname "$0")"
cd ..

batch=$1

if [ -z "$batch" ]; then
	echo "Provide batch name as first argument"
	exit 1
fi

files=$(curl "https://data.commoncrawl.org/crawl-data/$batch/warc.paths.gz" | gunzip)

missing_files_path="/mnt/crawl-data/$batch/missing.paths"

truncate -s 0 "$missing_files_path"

# $files is intentionally unquoted: it is split into one path per word.
for raw_file in $files; do
	# Local files carry a .gz suffix where the remote name has .warc.gz.
	file="/mnt/${raw_file/.warc.gz/.gz}"
	if [[ -f "$file" ]]; then
		filesize=$(stat -c%s "$file")
		if [[ $filesize -lt 1000 ]]; then
			echo "The file '$file' exists and is small."
			echo "$raw_file" >> "$missing_files_path"
		fi
	else
		echo "The file '$file' does not exist."
		echo "$raw_file" >> "$missing_files_path"
	fi
done

gzip "$missing_files_path"

================================================ FILE: scripts/init-docker.sh ================================================
#!/bin/bash

cd `dirname $0`

# The local docker development environment runs the data server on the local machine.
# This script sets that up and downloads the test data.
echo "Copying nginx config";

# The data server serves the mirrored test data with WebDAV PUT enabled and
# forwards /store to the FastCGI process on port 8001.
# The \$ escapes keep the nginx variables from being expanded by the shell.
echo "server {
	listen 80 default_server;
	listen [::]:80 default_server;

	root /var/www/html/node0003.alexandria.org;
	index index.html;
	server_name _;

	location / {
		autoindex on;
		client_body_temp_path /var/www/html/node0003.alexandria.org/upload-tmp;
		dav_methods PUT;
		create_full_put_path on;
		dav_access group:rw all:r;
		client_max_body_size 10000m;
	}

	location /store {
		fastcgi_pass 127.0.0.1:8001;
		fastcgi_param GATEWAY_INTERFACE CGI/1.1;
		fastcgi_param SERVER_SOFTWARE nginx;
		fastcgi_param QUERY_STRING \$query_string;
		fastcgi_param REQUEST_METHOD \$request_method;
		fastcgi_param CONTENT_TYPE \$content_type;
		fastcgi_param CONTENT_LENGTH \$content_length;
		fastcgi_param SCRIPT_FILENAME \$document_root\$fastcgi_script_name;
		fastcgi_param SCRIPT_NAME \$fastcgi_script_name;
		fastcgi_param REQUEST_URI \$request_uri;
		fastcgi_param DOCUMENT_URI \$document_uri;
		fastcgi_param DOCUMENT_ROOT \$document_root;
		fastcgi_param SERVER_PROTOCOL \$server_protocol;
		fastcgi_param REMOTE_ADDR \$remote_addr;
		fastcgi_param REMOTE_PORT \$remote_port;
		fastcgi_param SERVER_ADDR \$server_addr;
		fastcgi_param SERVER_PORT \$server_port;
		fastcgi_param SERVER_NAME \$server_name;
	}
}
" > /etc/nginx/sites-enabled/default

echo "Downloading test data";
./download-test-data.sh /var/www/html

# download-test-data.sh creates these directories as well, so use -p to
# avoid "File exists" errors while still covering a failed download.
mkdir -p /var/www/html/node0003.alexandria.org/nodes
mkdir -p /var/www/html/node0003.alexandria.org/nodes/test0001
mkdir -p /var/www/html/node0003.alexandria.org/upload-tmp
chown -R www-data:www-data /var/www/html/node0003.alexandria.org

/etc/init.d/nginx restart

./download-deps.sh
./build-deps.sh

================================================ FILE: scripts/install-deps.sh ================================================
#!/bin/bash
#
# Installs the build tool chain and the libraries the project links against.

apt-get install -y zip make cmake gcc-10 g++-10 gcc g++ libcurl4-openssl-dev libssl-dev libcrypto++-dev libboost-iostreams-dev libboost-filesystem-dev libboost-system-dev libboost-test-dev libfcgi-dev spawn-fcgi nginx

================================================ FILE:
scripts/packager.sh ================================================
#!/bin/bash
# Copyright 2018-present Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://aws.amazon.com/apache2.0
#
# or in the "license" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.

# Modified by Josef Cullhed 2021

# Packages the binaries found in the directory given as positional argument
# together with the shared libraries that "server" links against into
# alexandria.zip.

set -euo pipefail

# Prints the command line help.
print_help() {
    echo -e "Usage: packager [OPTIONS] \n"
    echo -e "OPTIONS\n"
    echo -e "\t-d,--default-libc\t Use the target host libc libraries. This will not package the C library files.\n"
}

if [ $# -lt 1 ]; then
    echo -e "Error: missing arguments\n"
    print_help
    exit 1
fi

# Split the command line into the known flag and positional arguments.
POSITIONAL=()
INCLUDE_LIBC=true
while [[ $# -gt 0 ]]
do
    key="$1"
    case $key in
        -d|--default-libc)
            INCLUDE_LIBC=false
            shift # past argument
            ;;
        *) # unknown option
            POSITIONAL+=("$1") # save it in an array for later
            shift # past argument
            ;;
    esac
done

set -- "${POSITIONAL[@]}" # restore positional parameters

# First positional argument: directory that contains the built binaries.
PKG_BIN_PATH=$1
architecture=$(arch)

if [ ! -d "$PKG_BIN_PATH" ]; then
    echo "$PKG_BIN_PATH" - No such directory.;
    exit 1;
fi

if ! type zip > /dev/null 2>&1; then
    echo "zip utility is not found. Please install it and re-run this script"
    exit 1
fi

# Lists the glibc shared objects (*.so, *.so.N) on Arch/Manjaro via pacman.
# Prints nothing on other distributions.
function package_libc_via_pacman {
    if grep --extended-regexp "Arch Linux|Manjaro Linux" < /etc/os-release > /dev/null 2>&1; then
        if type pacman > /dev/null 2>&1; then
            pacman --query --list --quiet glibc | sed -E '/\.so$|\.so\.[0-9]+$/!d'
        fi
    fi
}

# Lists the libc6 shared objects (*.so, *.so.N) via dpkg-query, if available.
function package_libc_via_dpkg() {
    if type dpkg-query > /dev/null 2>&1; then
        if [[ $(dpkg-query --listfiles libc6 | wc -l) -gt 0 ]]; then
            dpkg-query --listfiles libc6 | sed -E '/\.so$|\.so\.[0-9]+$/!d'
        fi
    fi
}

# Lists the glibc shared objects (*.so, *.so.N) for $architecture via rpm, if available.
function package_libc_via_rpm() {
    if type rpm > /dev/null 2>&1; then
       if [[ $(rpm --query --list glibc.$architecture | wc -l) -gt 1 ]]; then
           rpm --query --list glibc.$architecture | sed -E '/\.so$|\.so\.[0-9]+$/!d'
       fi
    fi
}

# hasElement expects an element and an array parameter
# it's equivalent to array.contains(element)
# e.g. hasElement "needle" ${haystack[@]}
function hasElement() {
    local el key=$1
    shift
    for el in "$@"
    do
        [[ "$el" == "$key" ]] && return 0
    done
    return 1
}

PKG_BIN_FILENAME=alexandria
PKG_DIR=tmp
PKG_LD=""

# Second-to-last field of every ldd output line for the server binary;
# entries that are not existing files are skipped in the loop below.
list=$(ldd "$PKG_BIN_PATH/server" | awk '{print $(NF-1)}')
# libc files as reported by whichever package manager is present.
libc_libs=()
libc_libs+=($(package_libc_via_dpkg))
libc_libs+=($(package_libc_via_rpm))
libc_libs+=($(package_libc_via_pacman))

mkdir -p "$PKG_DIR/bin" "$PKG_DIR/lib"

# Copy every non-libc dependency of the server binary into $PKG_DIR/lib.
for i in $list
do
    if [[ ! -f $i ]]; then # ignore linux-vdso.so.1
        continue
    fi

    # Do not copy libc files which are directly linked unless it's the dynamic loader
    if hasElement "$i" "${libc_libs[@]}"; then
        filename=$(basename "$i")
        if [[ -z "${filename##ld-*}" ]]; then
            PKG_LD=$filename # Use this file as the loader
            cp "$i" "$PKG_DIR/lib"
        fi
        continue
    fi

    cp "$i" $PKG_DIR/lib
done

# Unless --default-libc was given, bundle the libc files as well.
if [[ $INCLUDE_LIBC == true ]]; then
    for i in "${libc_libs[@]}"
    do
        filename=$(basename "$i")
        if [[ -z "${filename##ld-*}" ]]; then
            # if the loader is empty, then the binary is probably linked to a symlink of the loader. The symlink will
            # not show up when querying the package manager for libc files. So, in this case, we want to copy the loader
            if [[ -z "$PKG_LD" ]]; then
                PKG_LD=$filename
                cp "$i" "$PKG_DIR/lib" # we want to follow the symlink (default behavior)
            fi
            continue # We don't want the dynamic loader's symlink because its target is an absolute path (/lib/ld-*).
        fi
        cp --no-dereference "$i" "$PKG_DIR/lib"
    done
fi

if [[ -z "$PKG_LD" ]]; then
    echo "Failed to identify, locate or package the loader. Please file an issue on Github!" 1>&2
    exit 1
fi

# NOTE(review): the next statement looks truncated. The definitions of
# bootstrap_script_server/_scraper/_indexer/_alexandria (presumably heredocs)
# and the step that fills $PKG_DIR/bin are not present here - restore them
# from the upstream script before relying on this file.
bootstrap_script_server=$(cat < "$PKG_DIR/server"
echo -e "$bootstrap_script_scraper" > "$PKG_DIR/scraper"
echo -e "$bootstrap_script_indexer" > "$PKG_DIR/indexer"
echo -e "$bootstrap_script_alexandria" > "$PKG_DIR/alexandria"
chmod +x "$PKG_DIR/server"
chmod +x "$PKG_DIR/scraper"
chmod +x "$PKG_DIR/indexer"
chmod +x "$PKG_DIR/alexandria"

# some shenanigans to create the right layout in the zip file without extraneous directories
pushd "$PKG_DIR" > /dev/null
zip --symlinks --recurse-paths "$PKG_BIN_FILENAME".zip -- *
# Directory we were in before the pushd; the archive is moved there.
ORIGIN_DIR=$(dirs -l +1)
mv "$PKG_BIN_FILENAME".zip "$ORIGIN_DIR"
popd > /dev/null
rm -r "$PKG_DIR"
echo Created "$ORIGIN_DIR/$PKG_BIN_FILENAME".zip

================================================ FILE: scripts/prepare-output-dirs.sh ================================================
#!/bin/bash

# Removes every shard directory /mnt/0../mnt/7 completely and recreates
# the empty directory layout.

cd `dirname $0`
cd ..

for shard_id in $(seq 0 7); do
	shard="/mnt/$shard_id"
	rm -r $shard
	mkdir $shard
	mkdir "$shard/input";
	mkdir "$shard/output";
	mkdir "$shard/upload";
	mkdir "$shard/hash_table";
	mkdir "$shard/full_text";
	mkdir "$shard/tmp";
done

================================================ FILE: scripts/truncate.sh ================================================
#!/bin/bash

cd `dirname $0`
cd ..
for shard in $(seq 0 7); do rm -r /mnt/$shard/* mkdir "/mnt/$shard/input"; mkdir "/mnt/$shard/output"; mkdir "/mnt/$shard/upload"; mkdir "/mnt/$shard/hash_table"; mkdir "/mnt/$shard/full_text"; mkdir "/mnt/$shard/tmp"; done chown -R alexandria /mnt/* ================================================ FILE: scripts/update.sh ================================================ #!/bin/bash cd `dirname $0` wget https://github.com/alexandria-org/alexandria/releases/latest/download/alexandria.zip -O alexandria.zip unzip -o alexandria.zip ================================================ FILE: src/URL.cpp ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #include "URL.h" #include "algorithm/hash.h" #include "parser/parser.h" #include #include "text/text.h" #include "warc/tlds.h" using namespace std; URL::URL() { m_status = ::parser::OK; } URL::URL(const URL &url) : m_url_string(url.m_url_string), m_host(url.m_host), m_host_reverse(url.m_host_reverse), m_scheme(url.m_scheme), m_path(url.m_path), m_query(url.m_query), m_status(url.m_status), m_has_www(url.m_has_www) { } URL::URL(const string &url) : m_url_string(url) { m_status = parse(); } URL::URL(const string &host, const string &path) : m_url_string("http://" + host + path), m_host(host), m_path(path) { m_host_reverse = URL::host_reverse(m_host); m_status = ::parser::OK; } URL::~URL() { } void URL::set_url_string(const string &url) { m_url_string = url; m_status = parse(); } string URL::str() const { return m_url_string; } string URL::key() const { /* * We should probably change this to: * return m_host + path_with_query(); * but we need to do it later.. */ return m_host + m_path + m_query; } string URL::hash_input() const { return m_host + path_with_query(); } uint64_t URL::hash() const { return ::algorithm::hash(hash_input()); } uint64_t URL::host_hash() const { return ::algorithm::hash(m_host); } uint64_t URL::link_hash(const URL &target_url, const string &link_text) const { return ::algorithm::hash(host() + target_url.str()); } uint64_t URL::domain_link_hash(const URL &target_url, const string &link_text) const { return ::algorithm::hash(host() + target_url.host()); } bool URL::canonically_different(const URL &url) const { return key() != url.key(); } bool URL::has_https() const { return m_scheme == "https"; } bool URL::has_www() const { return m_has_www; } string URL::host() const { return m_host; } string URL::host_top_domain() const { vector parts; std::string_view host(m_host); size_t pos1 = host.find_last_of("."); if (host.substr(pos1 + 1) == "uk") { pos1 = host.find_last_of(".", pos1 - 1); if (host.substr(pos1 + 1) != "co.uk") { return m_host; } } 
else if (host.substr(pos1 + 1) == "au") { pos1 = host.find_last_of(".", pos1 - 1); } size_t pos2 = host.find_last_of(".", pos1 - 1); if (pos2 == string::npos) { return m_host; } return m_host.substr(pos2 + 1); } string URL::scheme() const { return m_scheme; } string URL::host_reverse() const { return m_host_reverse; } string URL::path() const { return m_path; } string URL::path_with_query() const { if (m_query.size() > 0) { return m_path + "?" + m_query; } else { return m_path; } } map URL::query() const { map ret; vector parts; boost::split(parts, m_query, boost::is_any_of("&")); for (const string &part : parts) { vector pair; boost::split(pair, part, boost::is_any_of("=")); if (pair.size() > 1) { ret[pair[0]] = parser::urldecode(pair[1]); } } return ret; } float URL::harmonic() const { return 0.0f; } string URL::host_reverse(const string &host) { vector parts; boost::split(parts, host, boost::is_any_of(".")); reverse(parts.begin(), parts.end()); return boost::algorithm::join(parts, "."); } string URL::host_reverse_top_domain(const string &host) { /* * This algorithm is OK since we only run on these tlds: * {"se", "com", "nu", "net", "org", "gov", "edu", "info"} * */ vector parts; boost::split(parts, host, boost::is_any_of(".")); if (parts.size() > 2) { parts = {parts[parts.size() - 2], parts[parts.size() - 1]}; } reverse(parts.begin(), parts.end()); return boost::algorithm::join(parts, "."); } string URL::domain_without_tld() const { vector parts; boost::split(parts, m_host, boost::is_any_of(".")); if (parts.size() > 1) { return parts[parts.size() - 2]; } return ""; } uint32_t URL::size() const { return str().size(); } void URL::set_scheme(const string &scheme) { m_scheme = scheme; rebuild_url_str(); } void URL::set_www(bool has_www) { m_has_www = has_www; rebuild_url_str(); } URL &URL::operator=(const URL &other) { m_url_string = other.m_url_string; m_host = other.m_host; m_host_reverse = other.m_host_reverse; m_scheme = other.m_scheme; m_path = other.m_path; 
m_query = other.m_query; m_status = other.m_status; m_has_www = other.m_has_www; return *this; } istream &operator >>(istream &ss, URL &url) { ss >> (url.m_url_string); url.m_status = url.parse(); return ss; } ostream &operator <<(ostream& os, const URL& url) { os << url.m_url_string; return os; } int URL::parse() { CURLU *h = curl_url(); if (!h) return ::parser::ERROR; CURLUcode uc = curl_url_set(h, CURLUPART_URL, m_url_string.c_str(), 0); if (uc) { curl_url_cleanup(h); return ::parser::ERROR; } char *chost; uc = curl_url_get(h, CURLUPART_HOST, &chost, 0); if (!uc) { m_host = chost; remove_www(m_host); curl_free(chost); } char *scheme; uc = curl_url_get(h, CURLUPART_SCHEME, &scheme, 0); if (!uc) { m_scheme = scheme; curl_free(scheme); } char *cpath; uc = curl_url_get(h, CURLUPART_PATH, &cpath, 0); if (!uc) { m_path = cpath; curl_free(cpath); } char *cquery; uc = curl_url_get(h, CURLUPART_QUERY, &cquery, 0); if (!uc) { m_query = cquery; curl_free(cquery); } curl_url_cleanup(h); m_host_reverse = URL::host_reverse(m_host); return ::parser::OK; } void URL::rebuild_url_str() { m_url_string = m_scheme + "://" + (m_has_www ? "www." : "") + m_host + path_with_query(); } inline void URL::remove_www(string &path) { size_t pos = path.find("www."); if (pos == 0) { m_has_www = true; path.erase(0, 4); } else { m_has_www = false; } text::trim(path); } ================================================ FILE: src/URL.h ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #pragma once #include "config.h" #include #include #include #include class URL { public: URL(); URL(const URL &url); explicit URL(const std::string &url); explicit URL(const std::string &host, const std::string &path); ~URL(); static std::string host_reverse(const std::string &host); static std::string host_reverse_top_domain(const std::string &host); void set_url_string(const std::string &url); std::string str() const; std::string key() const; std::string hash_input() const; uint64_t hash() const; uint64_t host_hash() const; uint64_t link_hash(const URL &target_url, const std::string &link_text) const; uint64_t domain_link_hash(const URL &target_url, const std::string &link_text) const; bool canonically_different(const URL &url) const; bool has_https() const; bool has_www() const; std::string host() const; std::string host_top_domain() const; std::string scheme() const; std::string path() const; std::string path_with_query() const; std::map query() const; std::string host_reverse() const; std::string domain_without_tld() const; uint32_t size() const; void set_scheme(const std::string &scheme); void set_www(bool has_www); float harmonic() const; size_t index_on_node() const { return host_hash() % config::nodes_in_cluster; } URL &operator=(const URL &other); friend std::istream &operator >>(std::istream &ss, URL &url); friend std::ostream &operator <<(std::ostream& os, const URL& url); private: std::string m_url_string; std::string m_host; std::string m_host_reverse; std::string m_scheme; std::string m_path; std::string m_query; int m_status; bool m_has_www; int parse(); void rebuild_url_str(); inline void remove_www(std::string &path); }; ================================================ FILE: src/alexandria.cpp ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include #include #include #include "logger/logger.h" #include "downloader/warc_downloader.h" #include "downloader/merge_downloader.h" #include "URL.h" #include "hash_table2/hash_table.h" #include "hash_table2/hash_table_shard_builder.h" #include "indexer/index.h" #include "indexer/index_builder.h" #include "indexer/value_record.h" #include "algorithm/hyper_ball.h" #include "utils/thread_pool.hpp" #include "file/file.h" #include "http/server.h" #include "parser/parser.h" #include using namespace std; void help() { std::string content = file::cat("../documentation/alexandria.md"); std::cout << content << std::endl; } int main(int argc, const char **argv) { logger::start_logger_thread(); logger::verbose(true); if (getenv("ALEXANDRIA_CONFIG") != NULL) { config::read_config(getenv("ALEXANDRIA_CONFIG")); } else { config::read_config("/etc/alexandria.conf"); } if (argc < 2) { help(); return 0; } const string arg(argc > 1 ? 
argv[1] : ""); if (arg == "--hash-table-url" && argc > 2) { URL url(argv[2]); hash_table2::hash_table ht("all_urls", 1019, 1000000, "/slow_data"); size_t ver = 0; std::string data = ht.find(url.hash(), ver); std::cout << ver << std::endl; std::cout << data << std::endl; } else if (arg == "--hash-table-url-hash" && argc > 2) { uint64_t url_hash = std::stoull(argv[2]); hash_table2::hash_table ht("all_urls", 1019, 1000000, "/slow_data"); size_t ver = 0; std::string data = ht.find(url_hash, ver); std::cout << ver << std::endl; std::cout << data << std::endl; } else if (arg == "--hash-table-count") { hash_table2::hash_table ht("all_urls", 1019, 1000000, "/slow_data"); std::cout << ht.size() << std::endl; } else if (arg == "--hash-table-find-all" && argc > 2) { hash_table2::hash_table ht("all_urls", 1019, 1000000, "/slow_data"); // Put given hosts in array with hashes to search for. std::vector search_for; for (int i = 2; i < argc; i++) { search_for.push_back(URL(string("https://") + argv[i]).host_hash()); } ht.for_each([&search_for](uint64_t key, std::string value) { URL url(value.substr(0, value.find("\t"))); const auto my_host_hash = url.host_hash(); for (const auto &host_hash : search_for) { if (host_hash == my_host_hash) { std::cout << key << "\t" << url.str() << std::endl; break; } } }); } else if (arg == "--hash-table-count" && argc > 2) { std::string data = file::cat("domains.txt"); std::vector lines; boost::split(lines, data, boost::is_any_of("\n")); std::map domains; std::map domain_counts; std::vector domain_list; for (const auto &line : lines) { if (line == "") continue; const std::string reversed = URL::host_reverse(line); std::cout << reversed << std::endl; const uint64_t domain_hash = URL(string("https://") + reversed).host_hash(); domains[reversed] = domain_hash; domain_counts[domain_hash] = 0; domain_list.push_back(reversed); } hash_table2::hash_table ht("all_urls", 1019, 1000000, "/slow_data"); uint64_t thelazy_host_hash = URL(string("https://") + 
argv[2]).host_hash(); ht.for_each([thelazy_host_hash, &domain_counts](uint64_t key, std::string value) { URL url(value.substr(0, value.find("\t"))); const auto my_host_hash = url.host_hash(); for (auto &iter : domain_counts) { if (iter.first == my_host_hash) { domain_counts[iter.first]++; break; } } /*if (url.host_hash() == thelazy_host_hash) { std::cout << key << " => " << url.str() << std::endl; }*/ }); for (auto &domain : domain_list) { std::cout << domain << "\t" << domain_counts[domains[domain]] << std::endl; } } else if (arg == "--hash-table-optimize-shard" && argc > 2) { size_t shard_id = std::stoull(argv[2]); hash_table2::hash_table_shard_builder ht_shard("all_urls", shard_id, 1000000, "/slow_data"); ht_shard.optimize(); } else if (arg == "--internal-harmonic") { profiler::instance prof_total("total"); /* std::vector all_files; file::read_directory("/mnt/0/full_text/internal_links", [&all_files](const std::string &filename) { all_files.push_back(filename); }); size_t done_with = 0; profiler::instance prof("total"); for (const auto &filename : all_files) { // Read the file. std::ifstream infile("/mnt/0/full_text/internal_links/" + filename, std::ios::binary); std::string infile_data(std::istreambuf_iterator(infile), {}); infile.close(); std::istringstream reader(infile_data); indexer::index idx(&reader, 1000); // Create vertices vector std::vector vertices; std::map vertex_map; size_t record_id = 0; for (const auto &record : idx.records()) { vertices.push_back(record.m_value); vertex_map[record.m_value] = record_id; record_id++; } std::vector edge_map(vertices.size()); // Populate edge map idx.for_each([&edge_map, &vertex_map, &vertices, &record_id](uint64_t key, roaring::Roaring &bitmap) { if (vertex_map.count(key) == 0) { vertices.push_back(key); edge_map.push_back(roaring::Roaring()); vertex_map[key] = record_id; record_id++; } edge_map[vertex_map[key]] = std::move(bitmap); }); // Calculate harmonic centrality on graph. 
if (vertices.size() > 500) { auto harmonic = algorithm::hyper_ball(vertices.size(), edge_map.data()); } // Sort the results a bit. std::vector sorted(harmonic.size()); std::iota(sorted.begin(), sorted.end(), 0); std::sort(sorted.begin(), sorted.end(), [&harmonic] (const auto &a, const auto &b) { return harmonic[a] > harmonic[b]; }); done_with++; float percent = ((float)done_with / all_files.size()) * 100.0f; float elapsed_milliseconds = prof.get(); size_t items_left = all_files.size() - done_with; float milliseconds_per_file = elapsed_milliseconds/done_with; float milliseconds_left = milliseconds_per_file * items_left; float hours_left = milliseconds_left / (1000.0f * 3600.0f); std::cout << "done with " << done_with << " out of " << all_files.size() << " (" << percent << "% done) time left: " << hours_left << " hours"<< std::endl; } return 0;*/ // load the file std::string content = file::cat("multiple_domains.tsv"); std::vector lines; boost::split(lines, content, boost::is_any_of("\n")); std::vector> csv_data; for (auto line : lines) { std::vector cols; boost::split(cols, line, boost::is_any_of("\t")); if (cols.size() > 1) { if (URL(cols[1]).host_hash() == URL("http://abc13.com").host_hash()) { csv_data.push_back(cols); } } } profiler::instance prof_load("load"); //std::ifstream infile("/mnt/5/full_text/internal_links/3492248666075096845.data", std::ios::binary); std::ifstream infile("/mnt/6/full_text/internal_links/12854855988816217414.data", std::ios::binary); std::string infile_data(std::istreambuf_iterator(infile), {}); infile.close(); std::istringstream reader(infile_data); indexer::index idx(&reader, 1000); prof_load.stop(); profiler::instance prof("make vertices"); std::vector vertices; std::map vertex_map; size_t record_id = 0; for (const auto &record : idx.records()) { vertices.push_back(record.m_value); vertex_map[record.m_value] = record_id; record_id++; } std::vector edge_map(vertices.size()); idx.for_each([&edge_map, &vertex_map, &vertices, 
&record_id](uint64_t key, roaring::Roaring &bitmap) { if (vertex_map.count(key) == 0) { vertices.push_back(key); edge_map.push_back(roaring::Roaring()); vertex_map[key] = record_id; record_id++; } edge_map[vertex_map[key]] = std::move(bitmap); }); prof.stop(); profiler::instance prof2("run hyper_ball"); auto harmonic = algorithm::hyper_ball(vertices.size(), edge_map.data()); prof2.stop(); prof_total.stop(); std::vector sorted(harmonic.size()); std::iota(sorted.begin(), sorted.end(), 0); std::sort(sorted.begin(), sorted.end(), [&harmonic] (const auto &a, const auto &b) { return harmonic[a] > harmonic[b]; }); std::map harmonic_by_url; for (size_t i = 0; i < harmonic.size(); i++) { harmonic_by_url[vertices[sorted[i]]] = harmonic[sorted[i]] / vertices.size(); } for (auto row : csv_data) { uint64_t url_hash = stoull(row[0]); double harmonic = harmonic_by_url[url_hash]; std::cout << row[0] << "\t" << row[1] << "\t" << harmonic << std::endl; } /* profiler::instance prof_load("load"); //std::ifstream infile("/mnt/5/full_text/internal_links/3492263685688109621.data", std::ios::binary); //std::ifstream infile("/mnt/5/full_text/internal_links/3492528524383210893.data", std::ios::binary); //std::ifstream infile("/mnt/0/full_text/internal_links/7131549202223940368.data", std::ios::binary); std::ifstream infile("/mnt/0/full_text/internal_links/10401139885298228528.data", std::ios::binary); std::string infile_data(std::istreambuf_iterator(infile), {}); infile.close(); std::istringstream reader(infile_data); indexer::index idx(&reader, 1000); prof_load.stop(); profiler::instance prof("make vertices"); std::vector vertices; std::map vertex_map; size_t record_id = 0; for (const auto &record : idx.records()) { vertices.push_back(record.m_value); vertex_map[record.m_value] = record_id; record_id++; } std::vector edge_map(vertices.size()); idx.for_each([&edge_map, &vertex_map, &vertices, &record_id](uint64_t key, roaring::Roaring &bitmap) { if (vertex_map.count(key) == 0) { 
vertices.push_back(key); edge_map.push_back(roaring::Roaring()); vertex_map[key] = record_id; record_id++; } edge_map[vertex_map[key]] = std::move(bitmap); }); prof.stop(); profiler::instance prof2("run hyper_ball"); auto harmonic = algorithm::hyper_ball(vertices.size(), edge_map.data()); prof2.stop(); prof_total.stop(); std::vector sorted(harmonic.size()); std::iota(sorted.begin(), sorted.end(), 0); std::sort(sorted.begin(), sorted.end(), [&harmonic] (const auto &a, const auto &b) { return harmonic[a] > harmonic[b]; }); //for (size_t i = 0; i < harmonic.size(); i++) { //std::cout << "vertex: " << vertices[sorted[i]] << " has harmonic: " << harmonic[sorted[i]] << std::endl; //} */ } else if (arg == "--url-server") { // Spin up a simple url server. hash_table2::hash_table ht("all_urls", 1019, 1000000, "/slow_data"); http::server url_server([&ht](auto request) { http::response res; URL url = request.url(); auto query = url.query(); URL find_url(parser::urldecode(query["url"])); size_t ver; const auto find_str = ht.find(find_url.hash(), ver); if (find_str == "") { res.code(404); res.body("Not found 404"); } else { res.code(200); res.body(find_str); } return res; }); } else { help(); } logger::join_logger_thread(); return 0; } ================================================ FILE: src/algorithm/algorithm.cpp ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include "algorithm.h" #include "profiler/profiler.h" #include #include #include #include #include #include #include #include namespace algorithm { /* Returns partitions with indices that are smaller than the values in the dims vector. 
For example: dims = {2,2} gives {0,0}, {1,0}, {0,1}, {1,1} dims = {2,3} gives {0,0}, {1,0}, {0,1}, {1,1}, {0,2}, {1,2} */ std::vector> incremental_partitions(const std::vector &dims, size_t limit) { std::vector> res; std::set> uniq; std::vector initial(dims.size(), 0); res.push_back(initial); uniq.insert(initial); for (size_t j = 0; j < res.size(); j++) { std::vector vec = res[j]; for (size_t i = 0; i < vec.size(); i++) { if (vec[i] < dims[i]-1) { std::vector copy(vec); copy[i]++; res.push_back(copy); uniq.insert(copy); if (uniq.size() >= limit) break; } } if (uniq.size() >= limit) break; } std::vector> ret(uniq.begin(), uniq.end()); sort(ret.begin(), ret.end(), [](const std::vector &a, const std::vector &b) { int sum1 = accumulate(a.begin(), a.end(), 0); int sum2 = accumulate(b.begin(), b.end(), 0); if (sum1 == sum2) { int max1 = *max_element(a.begin(), a.end()); int max2 = *max_element(b.begin(), b.end()); if (max1 == max2) { return b < a; } return max1 < max2; } return sum1 < sum2; }); return ret; } /* Calculates the harmonic centrality for vertices and edges. The returning vector has the harmonic centrality for vertex i at position i. The depth parameter is the maximum level to traverse in the neighbour tree. The edges set contains pairs of edges (from vertex, to vertex) */ /* * This is the inner outer loop for calculating harmonic centrality. 
* */ std::vector harmonic_centrality_subvector(size_t vlen, const std::vector *edge_map, size_t depth, size_t start, size_t len) { char *all = new char[vlen]; uint32_t *level1 = new uint32_t[vlen]; uint32_t *level2 = new uint32_t[vlen]; uint32_t *levels[2] = {level1, level2}; size_t level_len[2] = {0, 0}; std::vector harmonics; profiler::instance prof("Timetaker"); for (size_t i = start; i < start + len; i++) { const uint32_t vertex = i; level_len[0] = 0; level_len[1] = 0; memset(all, 0, vlen); levels[0][0] = vertex; level_len[0]++; all[vertex] = 1; double harmonic = 0.0; /* If we can assume the average number of incoming edges per vertex to be constant these loops should be O(1) in n. Example, if we have n = 10 000 000 vertices and 10 inbound edges on each vertex these loops should be (first loop is depth) X (worst case second loop is 10^depth) X (inner loop is 10) depth * 10^depth * 10 independent of n */ size_t last_level = 0; size_t cur_level = 1; for (size_t level = 1; level <= depth; level++) { //for (const uint32_t &v : level[level - 1]) { for (size_t j = 0; j < level_len[last_level]; j++) { const uint32_t v = levels[last_level][j]; for (const uint32_t &edge : edge_map[v]) { if (!all[edge]) { levels[cur_level][level_len[cur_level]++] = edge; all[edge] = 1; } } } if (level_len[cur_level] == 0) break; harmonic += (double)level_len[cur_level] / level; // Swap levels level_len[last_level] = 0; size_t tmp = last_level; last_level = cur_level; cur_level = tmp; } harmonics.push_back(harmonic); } delete [] level2; delete [] level1; delete [] all; return harmonics; } std::vector harmonic_centrality(size_t vlen, const std::set> &edges, size_t depth) { std::vector harmonics; std::vector *edge_map = new std::vector[vlen]; for (const auto &edge : edges) { /* second -> first mapping because we want to traverse the edges in the opposite direction of the edge. Incoming edges should increase harmonic centrality of vertex. 
*/ edge_map[edge.second].push_back(edge.first); } std::vector ret = harmonic_centrality(vlen, edge_map, depth); delete [] edge_map; return ret; } std::vector harmonic_centrality(size_t vlen, const std::vector *edge_map, size_t depth) { return harmonic_centrality_subvector(vlen, edge_map, depth, 0, vlen); } std::vector harmonic_centrality_threaded(size_t vlen, const std::set> &edges, size_t depth, size_t num_threads) { std::vector *edge_map = new std::vector[vlen]; for (const auto &edge : edges) { /* second -> first mapping because we want to traverse the edges in the opposite direction of the edge. Incoming edges should increase harmonic centrality of vertex. */ edge_map[edge.second].push_back(edge.first); } std::vector ret = harmonic_centrality_threaded(vlen, edge_map, depth, num_threads); delete [] edge_map; return ret; } std::vector harmonic_centrality_threaded(size_t vlen, const std::vector *edge_map, size_t depth, size_t num_threads) { assert(vlen >= num_threads); std::vector>> threads; // Split the vertices into several vectors. const size_t max_len = ceil((double)vlen / num_threads); for (size_t i = 0; i < vlen; i += max_len) { const size_t len = std::min(max_len, vlen - i); threads.emplace_back(std::async(std::launch::async, harmonic_centrality_subvector, vlen, edge_map, depth, i, len)); } std::vector harmonic; for (auto &thread : threads) { std::vector part = thread.get(); harmonic.insert(harmonic.end(), part.begin(), part.end()); } return harmonic; } std::vector *set_to_edge_map(size_t n, const std::set> &edges) { std::vector *edge_map = new std::vector[n]; for (const auto &edge : edges) { /* second -> first mapping because we want to traverse the edges in the opposite direction of the edge. Incoming edges should increase harmonic centrality of vertex. 
*/ edge_map[edge.second].push_back(edge.first); } return edge_map; } } ================================================ FILE: src/algorithm/algorithm.h ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #pragma once #include #include #include #include namespace algorithm { template void vector_chunk(const std::vector &vec, size_t chunk_size, std::vector> &dest) { std::vector chunk; for (T item : vec) { chunk.push_back(item); if (chunk.size() == chunk_size) { dest.push_back(chunk); chunk.clear(); } } if (chunk.size()) { dest.push_back(chunk); } } std::vector> incremental_partitions(const std::vector &dims, size_t limit); /* Calculates the harmonic centrality for vertices and edges. The returning vector has the harmonic centrality for vertex i at position i. The depth parameter is the maximum level to traverse in the neighbour tree. 
The edges set contains pairs of edges (from vertex, to vertex) */ std::vector harmonic_centrality(size_t vlen, const std::set> &edges, size_t depth); std::vector harmonic_centrality(size_t vlen, const std::vector *edge_map, size_t depth); std::vector harmonic_centrality_threaded(size_t vlen, const std::set> &edges, size_t depth, size_t num_threads); std::vector harmonic_centrality_threaded(size_t vlen, const std::vector *edge_map, size_t depth, size_t num_threads); std::vector *set_to_edge_map(size_t n, const std::set> &edges); } ================================================ FILE: src/algorithm/bloom_filter.cpp ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #include "bloom_filter.h" #include "algorithm/hash.h" #include #include #include namespace algorithm { bloom_filter::bloom_filter() { m_bitmap = std::make_unique(m_dim); for (size_t i = 0; i < m_dim; i++) { m_bitmap[i] = 0x0ull; } } // Dim should be a prime number.. bloom_filter::bloom_filter(size_t dim) : m_dim(dim), m_bitlen(dim * 64) { m_bitmap = std::make_unique(m_dim); for (size_t i = 0; i < m_dim; i++) { m_bitmap[i] = 0x0ull; } } void bloom_filter::insert(const std::string &item) { for (size_t i = 0; i < m_seeds.size(); i++) { const uint64_t hash = algorithm::hash_with_seed(item, m_seeds[i]); set_bit(hash); } } void bloom_filter::insert(uint64_t item) { insert(std::to_string(item)); } void bloom_filter::insert_many(std::vector &items) { std::vector hashes; for (const auto &item : items) { const auto str_item = std::to_string(item); for (size_t i = 0; i < m_seeds.size(); i++) { const uint64_t hash = algorithm::hash_with_seed(str_item, m_seeds[i]); hashes.push_back(hash); } } std::lock_guard guard(m_mutex); for (const auto &hash : hashes) { set_bit(hash); } } const char * bloom_filter::data() const { return (char *)m_bitmap.get(); } bool bloom_filter::exists(const std::string &item) const { for (size_t i = 0; i < m_seeds.size(); i++) { const uint64_t hash = algorithm::hash_with_seed(item, m_seeds[i]); if (!get_bit(hash)) return false; } return true; } bool bloom_filter::exists(uint64_t data) const { return exists(std::to_string(data)); } void bloom_filter::read(char *data, size_t len) { memcpy((char *)m_bitmap.get(), data, len); } void bloom_filter::merge(const bloom_filter &other) { for (size_t i = 0; i < m_dim; i++) { m_bitmap[i] |= other.m_bitmap[i]; } } double bloom_filter::saturation() { return 1.0; } void bloom_filter::read_file(const std::string &file_name) { std::ifstream infile(file_name, std::ios::binary); infile.read((char *)m_bitmap.get(), size()); } void bloom_filter::write_file(const std::string &file_name) const { std::ofstream 
outfile(file_name, std::ios::binary | std::ios::trunc); outfile.write((char *)m_bitmap.get(), size()); } void bloom_filter::set_bit(size_t bit) { const size_t x = bit % m_bitlen; const size_t pos = static_cast(x / 64); const size_t bit_in_pos = x % 64; m_bitmap[pos] = m_bitmap[pos] | (0x1ull << bit_in_pos); } bool bloom_filter::get_bit(size_t bit) const { const size_t x = bit % m_bitlen; const size_t pos = static_cast(x / 64); const size_t bit_in_pos = x % 64; return (m_bitmap[pos] & (0x1ull << bit_in_pos)) >> bit_in_pos; } } ================================================ FILE: src/algorithm/bloom_filter.h ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #pragma once #include #include #include #include "roaring/roaring64map.hh" namespace algorithm { class bloom_filter { public: bloom_filter(); bloom_filter(size_t dim); void insert(const std::string &item); void insert(uint64_t item); void insert_many(std::vector &items); bool exists(const std::string &item) const; bool exists(uint64_t data) const; size_t size() const { return m_dim * sizeof(uint64_t); } const char *data() const; void read(char *data, size_t len); void merge(const bloom_filter &other); double saturation(); void read_file(const std::string &file_name); void write_file(const std::string &file_name) const; private: std::unique_ptr m_bitmap; #ifdef IS_TEST size_t m_dim = 2695797; #else size_t m_dim = 4043696581; #endif size_t m_bitlen = m_dim * 64; // some random prime numbers std::array m_seeds = {3339675911, 2695798769, 2695831867, 2695857877, 2695879891, 2695879891, 2695922687, 2695935521, 3339689791, 3339703163}; std::mutex m_mutex; void set_bit(size_t bit); bool get_bit(size_t bit) const; }; } ================================================ FILE: src/algorithm/hash.cpp ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. 
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include #include "hash.h" namespace algorithm { /* * Murmur hash by Austin Appleby * Taken from here https://sites.google.com/site/murmurhash/ * */ size_t murmur_hash(const char *key, size_t len, size_t seed) { const uint64_t m = 0xc6a4a7935bd1e995ull; const int r = 47; uint64_t h = seed ^ (len * m); const uint64_t * data = (const uint64_t *)key; const uint64_t * end = data + (len/8); while(data != end) { uint64_t k = *data++; k *= m; k ^= k >> r; k *= m; h ^= k; h *= m; } const unsigned char * data2 = (const unsigned char*)data; switch(len & 7) { case 7: h ^= uint64_t(data2[6]) << 48; case 6: h ^= uint64_t(data2[5]) << 40; case 5: h ^= uint64_t(data2[4]) << 32; case 4: h ^= uint64_t(data2[3]) << 24; case 3: h ^= uint64_t(data2[2]) << 16; case 2: h ^= uint64_t(data2[1]) << 8; case 1: h ^= uint64_t(data2[0]); h *= m; }; h ^= h >> r; h *= m; h ^= h >> r; return h; } size_t hash(const std::string &str) { static const size_t seed = 0xc70f6907ul; return murmur_hash(str.c_str(), str.size(), seed); } size_t hash_with_seed(const std::string &str, size_t seed) { return murmur_hash(str.c_str(), str.size(), seed); } } ================================================ FILE: src/algorithm/hash.h ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. 
*
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include

namespace algorithm {

	/*
	 * Declarations of the string hash functions implemented in hash.cpp.
	 *
	 * NOTE(review): unlike the other headers of this directory no "#pragma once" is visible here and
	 * the name of the included standard header is missing in this copy.
	 * */

	// Hashes str with murmur_hash and the fixed seed 0xc70f6907 (see hash.cpp).
	size_t hash(const std::string &str);

	// Hashes str with murmur_hash and the seed supplied by the caller (see hash.cpp).
	size_t hash_with_seed(const std::string &str, size_t seed);

}

================================================
FILE: src/algorithm/hyper_ball.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, , et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #pragma once #include #include #include "hyper_log_log.h" #include "profiler/profiler.h" #include "logger/logger.h" #include namespace algorithm { template bool hyper_ball_worker(double t, size_t v_begin, size_t v_end, const edge_map_type &edge_map, std::vector &c, std::vector &a, std::vector &harmonic) { bool counter_changed = false; for (uint32_t v = v_begin; v < v_end; v++) { a[v] = c[v]; for (const uint32_t &w : edge_map[v]) { a[v] += c[w]; } // a[v] is t + 1 and c[v] is at t const size_t counter_diff = a[v].count() - c[v].count(); if (counter_diff) { counter_changed = true; harmonic[v] += (1.0 / (t + 1.0)) * counter_diff; } } for (uint32_t v = v_begin; v < v_end; v++) { c[v] = a[v]; } return counter_changed; } /* * n is the number of vertices in graph. * edge_map is pointing to a static array of size n. * each item in edge_map is a vector of variable size. * each vector edge_map[m] contains values between 0 and n-1 indicating edge between m and edge_map[m]. * NOTE direction of edge in edge map has to be EDGE_FROM -> EDGE_TO. 
* so for vertex m, n = edge_map[m] indicates directed edge from n to m * */ template std::vector hyper_ball(uint32_t n, const edge_map_type &edge_map) { if (n == 0) return {}; const size_t num_threads = std::min(32, (int)n); const size_t items_per_thread = n / num_threads; std::vector c(n, hyper_log_log(10)); std::vector a(n, hyper_log_log(10)); std::vector harmonic(n, 0.0); for (uint32_t v = 0; v < n; v++) { c[v].insert(v); } double t = 0.0; while (true) { std::vector> threads; for (size_t i = 0; i < num_threads; i++) { const size_t v_begin = i * items_per_thread; const size_t v_end = (i == num_threads - 1) ? n : (i + 1) * items_per_thread; auto fut = std::async(hyper_ball_worker, t, v_begin, v_end, std::cref(edge_map), std::ref(c), std::ref(a), std::ref(harmonic)); threads.emplace_back(std::move(fut)); } bool should_continue = false; for (auto &fut : threads) { should_continue = fut.get() || should_continue; } t += 1.0; if (!should_continue) break; } return harmonic; } } ================================================ FILE: src/algorithm/hyper_log_log.cpp ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include #include "hyper_log_log.h" #include "algorithm/hash.h" namespace algorithm { hyper_log_log::hyper_log_log(size_t b) : m_b(b), m_len(1ull << m_b), m_alpha(0.7213/(1.0 + 1.079/m_len)) { m_M.resize(m_len); std::fill(m_M.begin(), m_M.end(), 0); } hyper_log_log::hyper_log_log(const char *registers, size_t b) : m_b(b), m_len(1ull << m_b), m_alpha(0.7213/(1.0 + 1.079/m_len)) { m_M.resize(m_len); memcpy(m_M.data(), registers, m_len); } hyper_log_log::hyper_log_log(const hyper_log_log &other) : m_b(other.m_b), m_len(other.m_len), m_alpha(other.m_alpha) { m_M.resize(m_len); std::copy(other.m_M.cbegin(), other.m_M.cend(), m_M.begin()); } hyper_log_log::hyper_log_log(hyper_log_log &&other) : m_b(other.m_b), m_len(other.m_len), m_alpha(other.m_alpha) { m_M.swap(other.m_M); } hyper_log_log::~hyper_log_log() { } void hyper_log_log::insert(size_t v) { size_t x = algorithm::hash(std::to_string(v)); size_t j = x >> (64-m_b); m_M[j] = std::max(m_M[j], leading_zeros_plus_one(x << m_b)); } size_t hyper_log_log::count() const { double Z = 0.0; for (size_t j = 0; j < m_len; j++) { Z += 1.0 / (1ull << m_M[j]); } double E = m_alpha * m_len * m_len / Z; // Only small range correction implemented since we use 64 bit hash. 
if (E <= (5.0/2.0) * m_len) { size_t V = num_zero_registers(); if (V != 0) { E = m_len * log((double)m_len / V); } } return (size_t)E; } void hyper_log_log::reset() { std::fill(m_M.begin(), m_M.end(), 0); } char hyper_log_log::leading_zeros_plus_one(size_t x) const { size_t num_zeros = 1; for (size_t i = 0; i < 64; i++) { if ((x >> (64 - 1 - i)) & 0x1ull) return num_zeros; num_zeros++; } return num_zeros; } size_t hyper_log_log::num_zero_registers() const { return std::transform_reduce(m_M.begin(), m_M.end(), 0, [](int a, int b) { return a + b; }, [](char a) { return a == 0 ? 1 : 0; }); } double hyper_log_log::error_bound() const { double stdd = 1.04 / sqrt((double)m_len); return stdd * 3; // Gives 99% confidence } hyper_log_log hyper_log_log::operator +(const hyper_log_log &hl) const { hyper_log_log res; std::transform(std::begin(m_M), std::end(m_M), std::begin(hl.m_M), std::begin(res.m_M), [] (char a, char b) { return std::max(a, b); }); return res; } hyper_log_log &hyper_log_log::operator +=(const hyper_log_log &hl) { std::transform(std::begin(m_M), std::end(m_M), std::begin(hl.m_M), std::begin(m_M), [] (char a, char b) { return std::max(a, b); }); return *this; } hyper_log_log &hyper_log_log::operator =(const hyper_log_log &other) { std::copy(other.m_M.cbegin(), other.m_M.cend(), m_M.begin()); return *this; } } ================================================ FILE: src/algorithm/hyper_log_log.h ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. 
namespace algorithm {

	/*
	 * Implementation of the hyper log log algorithm as described by Flajolet et al.
	 * http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf
	 *
	 * Using 64 bit hash instead of 32bit.
	 * */
	class hyper_log_log {

		public:

			/*
			 * initializes with given b parameter. size of data structure will be 2^b bytes.
			 * */
			hyper_log_log(size_t b = 15);

			/*
			 * initializes from existing register data with given b parameter.
			 * */
			hyper_log_log(const char *registers, size_t b = 15);
			hyper_log_log(const hyper_log_log &other);
			hyper_log_log(hyper_log_log &&other);
			~hyper_log_log();

			void insert(size_t v);
			size_t count() const;
			double error_bound() const;
			void reset();

			// Raw access to the registers, data_size() bytes.
			const char *data() const { return m_M.data(); }
			char *data() { return m_M.data(); }
			int b() const { return m_b; }
			size_t data_size() const { return m_len; }

			hyper_log_log operator +(const hyper_log_log &hl) const;
			hyper_log_log &operator +=(const hyper_log_log &hl);
			hyper_log_log &operator =(const hyper_log_log &other);

			char leading_zeros_plus_one(size_t x) const;

		private:

			std::vector<char> m_M; // The registers, one byte each.
			const int m_b;
			const size_t m_len;
			const double m_alpha;

			size_t num_zero_registers() const;

	};

}
IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include #include "intersection.h" namespace algorithm { roaring::Roaring intersection(const std::vector &input) { if (input.size() == 0) return roaring::Roaring(); roaring::Roaring intersection = input[0]; for (size_t i = 1; i < input.size(); i++) { intersection &= input[i]; } return intersection; } } ================================================ FILE: src/algorithm/intersection.h ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
// Forward declaration, the bitmap overload below is only declared here.
namespace roaring {
	class Roaring;
}

namespace algorithm {

	roaring::Roaring intersection(const std::vector<roaring::Roaring> &input);

	/*
	 * Returns the elements that are present in every vector of input. Every vector has to be sorted
	 * ascending by operator <.
	 *
	 * The returned element starts as a copy of the element from the shortest vector. sum_fun is then
	 * called with it and the matching element of each other vector, so that data of the matches can
	 * be accumulated. Matching is always done against the untouched element, so sum_fun may modify
	 * its first argument freely.
	 * */
	template<typename item>
	std::vector<item> intersection(const std::vector<std::vector<item>> &input, std::function<void(item &, const item &)> sum_fun) {

		if (input.empty()) return {};

		// The shortest vector drives the iteration. The first one wins on equal length.
		size_t shortest = 0;
		for (size_t i = 1; i < input.size(); i++) {
			if (input[i].size() < input[shortest].size()) shortest = i;
		}

		const std::vector<item> &pivot = input[shortest];
		std::vector<size_t> positions(input.size(), 0);
		std::vector<item> result;

		for (size_t &pivot_pos = positions[shortest]; pivot_pos < pivot.size(); pivot_pos++) {
			const item &key = pivot[pivot_pos];
			item sum = key;
			bool in_all = true;

			for (size_t i = 0; i < input.size(); i++) {
				const std::vector<item> &vec = input[i];
				size_t &pos = positions[i];

				while (pos < vec.size() && vec[pos] < key) {
					pos++;
				}

				// One vector is exhausted, nothing that follows can be part of the intersection.
				if (pos >= vec.size()) return result;

				if (key < vec[pos]) {
					in_all = false;
					break;
				}
				if (i != shortest) {
					sum_fun(sum, vec[pos]);
				}
			}

			if (in_all) {
				result.push_back(sum);
			}
		}

		return result;
	}

	/*
	 * Same as above but for raw arrays. input[i] holds lengths[i] elements sorted ascending by
	 * operator <. Nothing is accumulated. Returns an empty vector if the number of lengths does not
	 * match the number of arrays.
	 * */
	template<typename item>
	std::vector<item> intersection(const std::vector<std::unique_ptr<item[]>> &input, const std::vector<size_t> &lengths) {

		if (input.empty() || lengths.size() != input.size()) return {};

		size_t shortest = 0;
		for (size_t i = 1; i < lengths.size(); i++) {
			if (lengths[i] < lengths[shortest]) shortest = i;
		}

		std::vector<size_t> positions(input.size(), 0);
		std::vector<item> result;

		for (size_t &pivot_pos = positions[shortest]; pivot_pos < lengths[shortest]; pivot_pos++) {
			const item &key = input[shortest][pivot_pos];
			bool in_all = true;

			for (size_t i = 0; i < input.size(); i++) {
				const std::unique_ptr<item[]> &ptr = input[i];
				const size_t len = lengths[i];
				size_t &pos = positions[i];

				while (pos < len && ptr[pos] < key) {
					pos++;
				}

				// One array is exhausted, nothing that follows can be part of the intersection.
				if (pos >= len) return result;

				if (key < ptr[pos]) {
					in_all = false;
					break;
				}
			}

			if (in_all) {
				result.push_back(key);
			}
		}

		return result;
	}

	/*
	 * Intersection of sorted vectors without accumulating anything.
	 * */
	template<typename item>
	std::vector<item> intersection(const std::vector<std::vector<item>> &input) {
		return intersection<item>(input, [](item &, const item &) {});
	}

}
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
namespace algorithm {

	namespace sort {

		/*
		 * Merges the sorted vectors arr1 and arr2 and appends the result to arr3.
		 * compare(a, b) returns true if a should be placed before b. If it returns false for a pair
		 * the element of arr2 is taken first.
		 * */
		template<typename data_record, typename F>
		void merge_arrays(const std::vector<data_record> &arr1, const std::vector<data_record> &arr2, F compare,
				std::vector<data_record> &arr3) {
			arr3.reserve(arr3.size() + arr1.size() + arr2.size());

			size_t i = 0, j = 0;
			while (i < arr1.size() && j < arr2.size()) {
				if (compare(arr1[i], arr2[j])) {
					arr3.push_back(arr1[i++]);
				} else {
					arr3.push_back(arr2[j++]);
				}
			}
			arr3.insert(arr3.end(), arr1.begin() + i, arr1.end());
			arr3.insert(arr3.end(), arr2.begin() + j, arr2.end());
		}

		/*
		 * Merges the range [i, j] (both inclusive) of arrays by splitting it in halves recursively.
		 * The result is appended to res.
		 * */
		template<typename data_record, typename F>
		void merge_array_range(const std::vector<std::vector<data_record>> &arrays, size_t i, size_t j, F compare,
				std::vector<data_record> &res) {
			if (i == j) {
				res.insert(res.end(), arrays[i].begin(), arrays[i].end());
			} else if (j - i == 1) {
				merge_arrays(arrays[i], arrays[j], compare, res);
			} else {
				const size_t mid = (i + j) / 2;
				std::vector<data_record> out1;
				std::vector<data_record> out2;
				merge_array_range(arrays, i, mid, compare, out1);
				merge_array_range(arrays, mid + 1, j, compare, out2);
				merge_arrays(out1, out2, compare, res);
			}
		}

		/*
		 * Merges any number of sorted vectors with the given compare function. Appends to res.
		 * */
		template<typename data_record, typename F>
		void merge_arrays(const std::vector<std::vector<data_record>> &arrays, F compare, std::vector<data_record> &res) {
			if (arrays.empty()) return;
			merge_array_range(arrays, 0, arrays.size() - 1, compare, res);
		}

		/*
		 * Merges two sorted vectors using operator <.
		 * */
		template<typename data_record>
		void merge_arrays(const std::vector<data_record> &arr1, const std::vector<data_record> &arr2, std::vector<data_record> &arr3) {
			merge_arrays(arr1, arr2, [](const data_record &a, const data_record &b) {
				return a < b;
			}, arr3);
		}

		/*
		 * Merges any number of sorted vectors using operator <.
		 * */
		template<typename data_record>
		void merge_arrays(const std::vector<std::vector<data_record>> &arrays, std::vector<data_record> &res) {
			merge_arrays(arrays, [](const data_record &a, const data_record &b) {
				return a < b;
			}, res);
		}

// The overloads below need std::span. They are skipped when <span> is not available, which keeps
// the vector overloads above usable from translation units built for an older standard.
#if defined(__cpp_lib_span)

		/*
		 * Same as the vector version but the inputs are pointers to spans.
		 * */
		template<typename data_record, typename F>
		void merge_arrays(const std::span<data_record> *arr1, const std::span<data_record> *arr2, F compare,
				std::vector<data_record> &arr3) {
			arr3.reserve(arr3.size() + arr1->size() + arr2->size());

			size_t i = 0, j = 0;
			while (i < arr1->size() && j < arr2->size()) {
				if (compare((*arr1)[i], (*arr2)[j])) {
					arr3.push_back((*arr1)[i++]);
				} else {
					arr3.push_back((*arr2)[j++]);
				}
			}
			while (i < arr1->size()) arr3.push_back((*arr1)[i++]);
			while (j < arr2->size()) arr3.push_back((*arr2)[j++]);
		}

		/*
		 * Merges the range [i, j] (both inclusive) of span pointers. The result is appended to res.
		 * */
		template<typename data_record, typename F>
		void merge_array_range(const std::vector<std::span<data_record> *> &arrays, size_t i, size_t j, F compare,
				std::vector<data_record> &res) {
			if (i == j) {
				for (const data_record &rec : *(arrays[i])) {
					res.push_back(rec);
				}
			} else if (j - i == 1) {
				merge_arrays(arrays[i], arrays[j], compare, res);
			} else {
				const size_t mid = (i + j) / 2;
				std::vector<data_record> out1;
				std::vector<data_record> out2;
				merge_array_range(arrays, i, mid, compare, out1);
				merge_array_range(arrays, mid + 1, j, compare, out2);
				merge_arrays(out1, out2, compare, res);
			}
		}

		/*
		 * Merges any number of sorted spans with the given compare function. Appends to res.
		 * */
		template<typename data_record, typename F>
		void merge_arrays(const std::vector<std::span<data_record> *> &arrays, F compare, std::vector<data_record> &res) {
			if (arrays.empty()) return;
			merge_array_range(arrays, 0, arrays.size() - 1, compare, res);
		}

#endif

	}

}
namespace algorithm {

	/*
	 * Merges the sorted vectors in input into one sorted vector. Elements that compare equal across
	 * different vectors are combined into a single element: the result starts as a copy of the
	 * element from the vector with the lowest index and plus_eq(sum, other) is called for the equal
	 * element of each later vector.
	 *
	 * dtype needs operator < and operator ==.
	 * */
	template<typename dtype>
	std::vector<dtype> sum_sorted(const std::vector<std::vector<dtype>> &input, std::function<void(dtype &, const dtype &)> plus_eq) {

		const size_t n = input.size();
		if (n == 0) return {};

		std::vector<dtype> ret;
		std::vector<size_t> pos(n, 0);

		while (true) {

			// Find the vector whose current element is the smallest. n means "none left". On ties the
			// vector with the lowest index wins.
			size_t start_vec = n;
			for (size_t i = 0; i < n; i++) {
				if (pos[i] >= input[i].size()) continue;
				if (start_vec == n || input[i][pos[i]] < input[start_vec][pos[start_vec]]) {
					start_vec = i;
				}
			}
			if (start_vec == n) break;

			const dtype &el = input[start_vec][pos[start_vec]];
			dtype sum = el;
			pos[start_vec]++;

			// Vectors before start_vec hold strictly larger elements, only the later ones can match.
			for (size_t i = start_vec + 1; i < n; i++) {
				while (pos[i] < input[i].size() && input[i][pos[i]] < el) {
					pos[i]++;
				}
				if (pos[i] < input[i].size() && input[i][pos[i]] == el) {
					plus_eq(sum, input[i][pos[i]]);
					pos[i]++;
				}
			}

			ret.push_back(sum);
		}

		return ret;
	}

}
namespace algorithm {

	/*
	 * Returns top k elements in unsorted const vector in linear time using a 2k memory buffer.
	 *
	 * ordered(a, b) returns true if a ranks below b. The k highest ranked elements are returned in
	 * unspecified order. If input has k or fewer elements it is returned unchanged, k == 0 gives an
	 * empty vector.
	 * */
	template<typename dtype>
	std::vector<dtype> top_k(const std::vector<dtype> &input, size_t k, std::function<bool(const dtype &, const dtype &)> ordered) {

		if (input.size() <= k) return input;

		// Without this the main loop below would never advance.
		if (k == 0) return {};

		if (input.size() <= 2 * k) {
			// Everything fits in the buffer. Partition so that exactly the last k elements are the top k.
			std::vector<dtype> buf(input.begin(), input.end());
			const auto split = buf.end() - static_cast<std::ptrdiff_t>(k);
			std::nth_element(buf.begin(), split, buf.end(), ordered);
			return std::vector<dtype>(split, buf.end());
		}

		std::vector<dtype> buf(input.begin(), input.begin() + (2 * k));

		size_t idx = 2 * k;
		while (idx < input.size()) {
			// After the partition buf[k, 2k) holds the current top k with the lowest of them at buf[k].
			std::nth_element(buf.begin(), buf.begin() + k, buf.end(), ordered);

			// Refill the lower half with the next k candidates.
			for (size_t i = 0, j = idx; i < k && j < input.size(); i++, j++) {
				// Only insert objects that are out of order compared to pivot buf[k]
				if (!ordered(input[j], buf[k])) {
					buf[i] = input[j];
				}
			}

			idx += k;
		}

		// Run final partition.
		std::nth_element(buf.begin(), buf.begin() + k, buf.end(), ordered);

		return std::vector<dtype>(buf.begin() + k, buf.end());
	}

	/*
	 * top_k but with default less than operator.
	 * */
	template<typename dtype>
	std::vector<dtype> top_k(const std::vector<dtype> &input, size_t k) {
		return top_k<dtype>(input, k, [](const dtype &a, const dtype &b) {
			return a < b;
		});
	}

}
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include "api_response.h" #include "indexer/return_record.h" #include "full_text/search_metric.h" #include "parser/unicode.h" #include "json.hpp" namespace api { api_response::api_response(const std::vector &results, const struct full_text::search_metric &metric, double profile) { using json = nlohmann::ordered_json; json message; json result_array; for (const auto &result : results) { json json_result; try { json_result["url"] = result.m_url.str(); json_result["title"] = parser::unicode::encode(result.m_title); json_result["snippet"] = parser::unicode::encode(result.m_snippet); json_result["score"] = result.m_score; json_result["domain_hash"] = std::to_string(result.m_domain_hash); json_result["url_hash"] = std::to_string(result.m_url.hash()); result_array.push_back(json_result); } catch (nlohmann::detail::type_error &error) { // skip this result. // in future log this and fix what is wrong. 
} } message["status"] = "success"; message["time_ms"] = profile; message["total_found"] = metric.m_total_found; message["total_url_links_found"] = metric.m_total_url_links_found; message["total_domain_links_found"] = metric.m_total_domain_links_found; message["links_handled"] = metric.m_links_handled; message["link_domain_matches"] = metric.m_link_domain_matches; message["link_url_matches"] = metric.m_link_url_matches; message["results"] = result_array; //m_response = message.dump(); m_response = message.dump(4); } api_response::~api_response() { } std::ostream &operator<<(std::ostream &os, const api_response &api_response) { os << api_response.m_response; return os; } } ================================================ FILE: src/api/api_response.h ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
namespace full_text {
	struct search_metric;
}

namespace indexer {
	class return_record;
}

namespace api {

	/*
	 * Holds the serialized response of a search. The response is built in the constructor and can
	 * be written to a stream with operator <<.
	 * */
	class api_response {

		public:
			/*
			 * results are the records to output, metric holds the counters of the search and profile
			 * is the time value reported in the response.
			 * */
			api_response(const std::vector<indexer::return_record> &results, const struct full_text::search_metric &metric,
					double profile);
			~api_response();

			friend std::ostream &operator<<(std::ostream &os, const api_response &api_response);

		private:

			// The complete serialized response.
			std::string m_response;

	};

}
*/ #include "result_with_snippet.h" #include "text/text.h" namespace api { result_with_snippet::result_with_snippet(const std::string &tsv_data, const indexer::return_record &res) : m_score(res.m_score), m_domain_hash(res.m_domain_hash) { size_t pos_start = 0; size_t pos_end = 0; size_t col_num = 0; while (pos_end != std::string::npos) { pos_end = tsv_data.find('\t', pos_start); const size_t len = pos_end - pos_start; if (col_num == 0) { m_url = URL(tsv_data.substr(pos_start, len)); } if (col_num == 1) { m_title = tsv_data.substr(pos_start, len); } if (col_num == 3) { m_meta = tsv_data.substr(pos_start, len); } if (col_num == 4) { m_snippet = make_snippet(tsv_data.substr(pos_start, len)); if (m_snippet.size() == 0) { m_snippet = make_snippet(m_meta); } } pos_start = pos_end + 1; col_num++; } } result_with_snippet::~result_with_snippet() { } std::string result_with_snippet::make_snippet(const std::string &text) const { std::string response = text.substr(0, 140); text::trim(response); if (response.size() >= 140) response += "..."; return response; } } ================================================ FILE: src/api/result_with_snippet.h ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. 
*
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

// NOTE(review): the target of the first include is missing in this copy of the header; restore it.
#include
#include "URL.h"
#include "indexer/return_record.h"

namespace api {

	/*
	 * One search result: url, title, snippet, score and domain hash.
	 * The textual fields are parsed from a tab separated row, score and domain hash are taken
	 * from the index record passed to the constructor.
	 */
	class result_with_snippet {

	public:
		result_with_snippet(const std::string &tsv_data, const indexer::return_record &res);
		~result_with_snippet();

		// Read-only accessors. All of them return references to members, so the values are
		// only valid for as long as this object is alive.
		const URL &url() const { return m_url; };
		const std::string &title() const { return m_title; };
		const std::string &snippet() const { return m_snippet; };
		const float &score() const { return m_score; };
		const uint64_t &domain_hash() const { return m_domain_hash; };

	private:

		URL m_url;
		std::string m_title;
		// Meta text from the row; has no public accessor.
		std::string m_meta;
		std::string m_snippet;
		float m_score;
		uint64_t m_domain_hash;

		// Produces the snippet string stored in m_snippet from a longer text.
		std::string make_snippet(const std::string &text) const;

	};

}

================================================
FILE: src/cluster/cluster.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, , et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #pragma once ================================================ FILE: src/cluster/document.cpp ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #include "document.h" #include "algorithm/hash.h" #include "text/text.h" #include "URL.h" namespace cluster { document::document() : m_name("unnamed document"){ } document::document(const std::string &name) : m_name(name) { } document::~document() { } void document::read_text(const std::string &text) { const std::vector words = text::get_words(text, 0); for (const auto &word : words) { m_counts[algorithm::hash(word)]++; } } void read_text_to_corpus(corpus &corp, const std::string &text) { const std::vector words = text::get_words(text, 0); for (const auto &word : words) { size_t key = algorithm::hash(word); corp.counts[key]++; if (corp.words.count(key) == 0) { corp.words[key] = word; } } } void read_corpus(corpus &corp, documents &documents, std::stringstream &tsv) { std::string line; while (getline(tsv, line)) { const size_t pos = line.find('\t'); if (pos == std::string::npos) continue; URL url(line.substr(0, pos)); const std::string doc_text = line.substr(pos); const size_t key = url.host_hash(); if (!documents.count(key)) { documents.emplace(key, url.host()); } documents[key].read_text(doc_text); if (key == algorithm::hash("annicaviklund.se")) { std::cout << doc_text << std::endl; } read_text_to_corpus(corp, doc_text); } } void print_document(corpus &corp, const document &document) { std::vector> keys; for (const auto &iter : document.m_counts) { keys.emplace_back(iter.first, iter.second); } sort(keys.begin(), keys.end(), [](const auto &a, const auto &b) { return a.second > b.second; }); size_t len = keys.size(); for (size_t i = 0; i < std::min(100ul, len); i++) { std::cout << corp.words[keys[i].first] << " = " << keys[i].second << std::endl; } } } ================================================ FILE: src/cluster/document.h ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #pragma once #include #include #include namespace cluster { typedef struct corpus_s { std::unordered_map words; std::unordered_map counts; } corpus; class document { public: document(); document(const std::string &name); ~document(); std::string name() const { return m_name; }; size_t size() const { return m_counts.size(); }; void read_text(const std::string &text); friend void print_document(corpus &corp, const document &document); private: std::string m_name; std::unordered_map m_counts; }; typedef document topic; typedef std::unordered_map documents; void read_corpus(corpus &corp, documents &documents, std::stringstream &tsv); void print_document(corpus &corp, const document &document); } ================================================ FILE: src/common/ThreadPool.h ================================================ /* Copyright (c) 2012 Jakob Progsch, Václav Zeman This software is provided 'as-is', without any express or implied warranty. In no event will the authors be held liable for any damages arising from the use of this software. Permission is granted to anyone to use this software for any purpose, including commercial applications, and to alter it and redistribute it freely, subject to the following restrictions: 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. 3. This notice may not be removed or altered from any source distribution. */ #ifndef THREAD_POOL_H #define THREAD_POOL_H #include #include #include #include #include #include #include #include #include class ThreadPool { public: explicit ThreadPool(size_t); template auto enqueue(F&& f, Args&&... 
args) -> std::future::type>; ~ThreadPool(); private: // need to keep track of threads so we can join them std::vector< std::thread > workers; // the task queue std::queue< std::function > tasks; // synchronization std::mutex queue_mutex; std::condition_variable condition; bool stop; }; // the constructor just launches some amount of workers inline ThreadPool::ThreadPool(size_t threads) : stop(false) { for(size_t i = 0;i task; { std::unique_lock lock(this->queue_mutex); this->condition.wait(lock, [this]{ return this->stop || !this->tasks.empty(); }); if(this->stop && this->tasks.empty()) return; task = std::move(this->tasks.front()); this->tasks.pop(); } task(); } } ); } // add new work item to the pool template auto ThreadPool::enqueue(F&& f, Args&&... args) -> std::future::type> { using return_type = typename std::result_of::type; auto task = std::make_shared< std::packaged_task >( std::bind(std::forward(f), std::forward(args)...) ); std::future res = task->get_future(); { std::unique_lock lock(queue_mutex); // don't allow enqueueing after stopping the pool if(stop) throw std::runtime_error("enqueue on stopped ThreadPool"); tasks.emplace([task](){ (*task)(); }); } condition.notify_one(); return res; } // the destructor joins all threads inline ThreadPool::~ThreadPool() { { std::unique_lock lock(queue_mutex); stop = true; } condition.notify_all(); for(std::thread &worker: workers) worker.join(); } #endif ================================================ FILE: src/common/datetime.cpp ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include "datetime.h" #include namespace common { size_t cur_date() { time_t tt = time(NULL); struct tm tm = *localtime(&tt); size_t year_since_00 = tm.tm_year - 100; size_t year = 2000 + year_since_00; return (year * 100 * 100) + ((tm.tm_mon + 1) * 100) + tm.tm_mday; } size_t cur_time() { time_t tt = time(NULL); struct tm tm = *localtime(&tt); return (tm.tm_hour * 100 * 100) + (tm.tm_min * 100) + tm.tm_sec; } size_t cur_datetime() { size_t date = cur_date(); return (date * 100 * 100 * 100) + cur_time(); } const std::string iso8601_datetime() { time_t now; time(&now); char buf[21]; strftime(buf, sizeof(buf), "%FT%TZ", gmtime(&now)); return std::string(buf); } } ================================================ FILE: src/common/datetime.h ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. 
*
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

// NOTE(review): the include target is missing in this copy of the header; restore it.
#include

namespace common {

	// Current local date encoded as the number YYYYMMDD.
	size_t cur_date();

	// Current local time of day encoded as the number HHMMSS.
	size_t cur_time();

	// Current local date and time encoded as the number YYYYMMDDHHMMSS.
	size_t cur_datetime();

	// Current UTC time formatted with the strftime pattern "%FT%TZ".
	const std::string iso8601_datetime();

}

================================================
FILE: src/common/dictionary.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, , et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include "dictionary.h" #include "logger/logger.h" #include "file/tsv_file.h" #include "dictionary_row.h" #include "algorithm/hash.h" using namespace std; namespace common { dictionary::dictionary() { } dictionary::dictionary(file::tsv_file &tsv_file) { load_tsv(tsv_file); } dictionary::~dictionary() { } void dictionary::load_tsv(file::tsv_file &tsv_file) { while (!tsv_file.eof()) { auto line = tsv_file.get_line(); std::stringstream ss(line); std::string col; getline(ss, col, '\t'); if (col.size()) { size_t key = ::algorithm::hash(col); if (m_rows.find(key) != m_rows.end()) { handle_collision(key, col); } m_rows[key] = dictionary_row(ss); } } } unordered_map::const_iterator dictionary::find(const std::string &key) const { return m_rows.find(::algorithm::hash(key)); } unordered_map::const_iterator dictionary::find(size_t hash) const { return m_rows.find(hash); } unordered_map::const_iterator dictionary::begin() const { return m_rows.begin(); } unordered_map::const_iterator dictionary::end() const { return m_rows.end(); } bool dictionary::has_key(const std::string &key) const { return find(key) != end(); } void dictionary::handle_collision(size_t key, const std::string &col) { LOG_ERROR("Collision: " + std::to_string(key) + " " + col); } } ================================================ FILE: src/common/dictionary.h ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #pragma once #include #include #include "dictionary_row.h" namespace file { class tsv_file; } namespace common { class dictionary { public: dictionary(); explicit dictionary(file::tsv_file &tsv_file); ~dictionary(); void load_tsv(file::tsv_file &tsv_file); std::unordered_map::const_iterator find(const std::string &key) const; std::unordered_map::const_iterator find(size_t hash) const; std::unordered_map::const_iterator begin() const; std::unordered_map::const_iterator end() const; bool has_key(const std::string &key) const; size_t size() const { return m_rows.size(); } private: std::unordered_map m_rows; void handle_collision(size_t key, const std::string &col); }; } ================================================ FILE: src/common/dictionary_row.cpp ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #include "dictionary_row.h" namespace common { dictionary_row::dictionary_row() { } dictionary_row::dictionary_row(const dictionary_row &row) { m_columns = row.m_columns; } dictionary_row::dictionary_row(const std::string &row) { std::stringstream stream(row); read_stream(stream); } dictionary_row::dictionary_row(std::stringstream &stream) { read_stream(stream); } dictionary_row::~dictionary_row() { } int dictionary_row::get_int(int column) const { return (int)m_columns[column]; } float dictionary_row::get_float(int column) const { return (float)m_columns[column]; } double dictionary_row::get_double(int column) const { return m_columns[column]; } void dictionary_row::read_stream(std::stringstream &stream) { std::string col; int i = 0; while (std::getline(stream, col, '\t')) { try { m_columns.push_back(stod(col)); } catch(const std::invalid_argument &error) { } catch(const std::out_of_range &error) { } i++; } } } ================================================ FILE: src/common/dictionary_row.h ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #pragma once #include #include #include #define CC_ROW_LEN 5 namespace common { class dictionary_row { public: dictionary_row(); dictionary_row(const dictionary_row &row); explicit dictionary_row(const std::string &row); explicit dictionary_row(std::stringstream &stream); ~dictionary_row(); int get_int(int column) const; float get_float(int column) const; double get_double(int column) const; private: std::vector m_columns; void read_stream(std::stringstream &stream); }; } ================================================ FILE: src/common/simple_thread_pool.hpp ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

// NOTE(review): the four include targets and the template arguments of std::function,
// std::vector and std::queue below are missing in this copy of the header. They look like
// they were lost when the file was exported; restore them from the repository.
#include
#include
#include
#include

namespace common {

	/*
	 * Thread pool interface: a constructor taking a size_t, and enqueue() taking a callable
	 * by rvalue reference. The implementation is not part of this header.
	 */
	class simple_thread_pool {

		public:
			// NOTE(review): the argument is presumably the number of worker threads - confirm
			// against the implementation.
			explicit simple_thread_pool(size_t);
			~simple_thread_pool();

			void enqueue(std::function &&fun);

		private:

			void handle_work();

			std::vector m_workers;
			std::queue> m_queue;
			// Mutex and condition variable; presumably guard and signal m_queue - confirm
			// against the implementation.
			std::mutex m_queue_lock;
			std::condition_variable m_condition;
			bool m_stop = false;
	};

}

================================================
FILE: src/common/system.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, , et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
*/ #include "system.h" #include #include #include #include namespace common { bool is_dev() { if (getenv("ALEXANDRIA_LIVE") != NULL && std::stoi(getenv("ALEXANDRIA_LIVE")) > 0) { return false; } return true; } std::string domain_index_filename() { if (is_dev()) { return "/dev_files/domain_info.tsv"; } return "/files/domain_info.tsv"; } std::string dictionary_filename() { if (is_dev()) { return "/dev_files/dictionary.tsv"; } return "/files/dictionary.tsv"; } std::string uuid() { // Create a random UUID boost::uuids::uuid uuid = boost::uuids::random_generator()(); // Convert UUID to string and return return boost::uuids::to_string(uuid); } } ================================================ FILE: src/common/system.h ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/

#pragma once

// NOTE(review): the include target is missing in this copy of the header; restore it.
#include

namespace common {

	// True unless the environment variable ALEXANDRIA_LIVE holds a positive integer.
	bool is_dev();

	// Path of the domain info tsv file; depends on is_dev().
	std::string domain_index_filename();

	// Path of the dictionary tsv file; depends on is_dev().
	std::string dictionary_filename();

	// Returns a newly generated random UUID as a string.
	std::string uuid();

}

================================================
FILE: src/config.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, , et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
*/ #include "config.h" #include "text/text.h" #include "logger/logger.h" #include "file/file.h" using namespace std; namespace config { config::config() { create_data_directories(m_data_path); } const config s_instance = config(); const std::string &data_path() { return s_instance.data_path(); } void create_data_directories(const std::string &data_path) { if (file::directory_exists(data_path)) { for (size_t shard_id = 0; shard_id < 8; shard_id++) { const std::string base = data_path + "/" + to_string(shard_id); file::create_directory(base); file::create_directory(base + "/input"); file::create_directory(base + "/output"); file::create_directory(base + "/upload"); file::create_directory(base + "/hash_table"); file::create_directory(base + "/full_text"); file::create_directory(base + "/tmp"); } } } string node = "test0001"; string master = "localhost"; string upload = "localhost"; string data_node; //string url_store_host = "http://localhost"; string url_store_host = "http://node0009.alexandria.org"; string url_store_path = "/alexandria/urlstore"; string url_store_cache_path = "/mnt/4/urlstore_cache"; size_t nodes_in_cluster = 1; size_t node_id = 0; bool index_snippets = true; bool index_text = true; vector batches; vector link_batches; size_t worker_count = 8; size_t query_max_words = 10; size_t query_max_len = 200; size_t deduplicate_domain_count = 5; size_t pre_result_limit = 200000; size_t result_limit = 1000; string file_upload_user = ""; string file_upload_password = ""; size_t n_grams = 1; size_t shard_hash_table_size = 100000; size_t html_parser_long_text_len = 1000; size_t ft_shard_builder_buffer_len = 240000; size_t ft_num_shards = 2048; size_t ft_max_sections = 8; size_t ft_max_results_per_section = 100000; size_t ft_section_depth = 8; size_t ft_max_cache_gb = 30; size_t ft_num_threads_indexing = 24; size_t ft_num_threads_merging = 24; size_t ft_num_threads_appending = 8; double ft_cached_bytes_per_shard() { return (ft_max_cache_gb * 1000ul*1000ul*1000ul) 
/ (ft_num_shards * ft_num_threads_indexing); } void read_config(const string &config_file) { batches.clear(); link_batches.clear(); ifstream in(config_file); if (!in.is_open()) { LOG_ERROR("Could not read config file: " + config_file); return; } string line; while (getline(in, line)) { size_t comment_pos = line.find("#"); if (comment_pos != string::npos) { line = line.substr(0, comment_pos); } if (text::trim(line) == "") { continue; } vector parts; boost::split(parts, line, boost::is_any_of("=")); for (string &part : parts) { part = text::trim(part); } if (parts[0] == "node") { node = parts[1]; } else if (parts[0] == "master") { master = parts[1]; upload = parts[1]; } else if (parts[0] == "upload") { upload = parts[1]; } else if (parts[0] == "data_node") { data_node = parts[1]; } else if (parts[0] == "url_store_host") { url_store_host = parts[1]; } else if (parts[0] == "url_store_path") { url_store_path = parts[1]; } else if (parts[0] == "nodes_in_cluster") { nodes_in_cluster = stoi(parts[1]); } else if (parts[0] == "node_id") { node_id = stoi(parts[1]); } else if (parts[0] == "batches[]") { batches.push_back(parts[1]); } else if (parts[0] == "link_batches[]") { link_batches.push_back(parts[1]); } else if (parts[0] == "worker_count") { worker_count = stoi(parts[1]); } else if (parts[0] == "query_max_words") { query_max_words = stoi(parts[1]); } else if (parts[0] == "query_max_len") { query_max_len = stoi(parts[1]); } else if (parts[0] == "deduplicate_domain_count") { deduplicate_domain_count = stoi(parts[1]); } else if (parts[0] == "pre_result_limit") { pre_result_limit = stoi(parts[1]); } else if (parts[0] == "result_limit") { result_limit = stoi(parts[1]); } else if (parts[0] == "ft_num_shards") { ft_num_shards = stoi(parts[1]); } else if (parts[0] == "ft_max_sections") { ft_max_sections = stoi(parts[1]); } else if (parts[0] == "ft_max_results_per_section") { ft_max_results_per_section = stoi(parts[1]); } else if (parts[0] == "ft_section_depth") { 
ft_section_depth = stoi(parts[1]); } else if (parts[0] == "ft_max_cache_gb") { ft_max_cache_gb = stoi(parts[1]); } else if (parts[0] == "ft_num_threads_indexing") { ft_num_threads_indexing = stoi(parts[1]); } else if (parts[0] == "ft_num_threads_merging") { ft_num_threads_merging = stoi(parts[1]); } else if (parts[0] == "ft_num_threads_appending") { ft_num_threads_appending = stoi(parts[1]); } else if (parts[0] == "file_upload_user") { file_upload_user = parts[1]; } else if (parts[0] == "file_upload_password") { file_upload_password = parts[1]; } else if (parts[0] == "n_grams") { n_grams = stoull(parts[1]); } else if (parts[0] == "index_snippets") { index_snippets = static_cast(stoull(parts[1])); } else if (parts[0] == "index_text") { index_text = static_cast(stoull(parts[1])); } else if (parts[0] == "shard_hash_table_size") { shard_hash_table_size = stoull(parts[1]); } else if (parts[0] == "html_parser_long_text_len") { html_parser_long_text_len = stoull(parts[1]); } else if (parts[0] == "data_path") { s_instance.data_path(parts[1]); } } } } ================================================ FILE: src/config.h ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. 
*
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include
#include
#include

namespace config {

	// Creates the per-shard directory layout below data_path (defined in config.cpp).
	void create_data_directories(const std::string &data_path);

	/*
		Holds the data path of the process.
		The setter is declared const and m_data_path is mutable, so the path can be replaced through a const
		instance of this class.
	*/
	class config {
		public:
			config();

			// Returns the stored data path.
			const std::string &data_path() const { return m_data_path; }

			// Stores str as the new data path and calls create_data_directories for it.
			void data_path(const std::string &str) const {
				m_data_path = str;
				create_data_directories(m_data_path);
			}

		private:
			// "/mnt" until data_path(str) replaces it.
			mutable std::string m_data_path = "/mnt";
	};

	// Free accessor for the data path (defined in config.cpp).
	const std::string &data_path();

	// Settings defined in config.cpp.
	extern std::string node;
	extern std::string master;
	extern std::string upload;
	extern std::string data_node;
	extern std::string url_store_host;
	extern std::string url_store_path;
	extern std::string url_store_cache_path;
	const size_t url_store_shards = 24;
	extern size_t nodes_in_cluster;
	extern size_t node_id;
	extern bool index_snippets;
	extern bool index_text;
	extern std::vector batches;
	extern std::vector link_batches;
	extern size_t worker_count;
	extern size_t query_max_words;
	extern size_t query_max_len;
	extern size_t deduplicate_domain_count;
	extern size_t pre_result_limit;
	extern size_t result_limit;
	extern std::string file_upload_user;
	extern std::string file_upload_password;
	extern size_t n_grams;
	extern size_t shard_hash_table_size;
	extern size_t html_parser_long_text_len;
	extern size_t ft_shard_builder_buffer_len;

	/*
		Constants only configurable at compilation time.
*/ // Full text indexer config extern size_t ft_num_shards; extern size_t ft_max_sections; extern size_t ft_max_results_per_section; extern size_t ft_section_depth; extern size_t ft_max_cache_gb; extern size_t ft_num_threads_indexing; extern size_t ft_num_threads_merging; extern size_t ft_num_threads_appending; double ft_cached_bytes_per_shard(); // Link indexer config inline const unsigned long long li_max_cache_gb = 4; inline const unsigned long long li_num_threads_indexing = 48; inline const unsigned long long li_num_threads_merging = 16; inline const double li_cached_bytes_per_shard = (li_max_cache_gb * 1000ul*1000ul*1000ul) / (ft_num_shards * li_num_threads_indexing); inline const unsigned long long li_indexer_max_cache_size = 500; // Hash table indexer config inline const unsigned long long ht_num_shards = 1031; inline const unsigned long long ht_num_buckets = 8; inline const unsigned long long ht_key_size = 8; // Server config // Other constants. inline const unsigned long long num_async_file_transfers = 48; inline const std::string test_data_path = "/var/www/html/node0003.alexandria.org/test-data/"; // Commoncrawl parser. inline const std::string cc_target_output = "alexandria-cc-output"; inline const bool cc_run_on_lambda = false; inline const std::string log_file_path = "/var/log/alexandria.log"; void read_config(const std::string &config_file); } ================================================ FILE: src/debug.cpp ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include "debug.h" void print_elem(std::map &m, size_t elem) { std::cout << m[elem] << std::endl; } ================================================ FILE: src/debug.h ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. 
*
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include
#include

// Debug helper that writes the value m holds for the key elem to std::cout (defined in debug.cpp).
// NOTE(review): the template arguments of std::map are missing in this copy of the file - restore before compiling.
void print_elem(std::map &m, size_t elem);

================================================
FILE: src/domain_stats/domain_stats.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, , et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
*/ #include "domain_stats.h" #include #include "common/dictionary.h" #include "file/tsv_file_remote.h" #include "logger/logger.h" #include "common/system.h" namespace domain_stats { common::dictionary domain_data; void download_domain_stats() { LOG_INFO("download domain_info.tsv"); file::tsv_file_remote domain_info_tsv(common::domain_index_filename()); LOG_INFO("parsing....."); domain_data.load_tsv(domain_info_tsv); } float harmonic_centrality(const URL &url) { return harmonic_centrality(url.host()); } float harmonic_centrality(const std::string &host) { const auto iter = domain_data.find(host); float harmonic = 0.0f; if (iter != domain_data.end()) { const common::dictionary_row row = iter->second; harmonic = row.get_float(0); } return harmonic; } } ================================================ FILE: src/domain_stats/domain_stats.h ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include
#include "URL.h"

namespace domain_stats {

	// Loads the remote domain statistics into memory; call before using harmonic_centrality.
	void download_domain_stats();

	// Harmonic centrality stored for the host of url; 0.0f when the host is unknown.
	float harmonic_centrality(const URL &url);

	// Harmonic centrality stored for the given host name; 0.0f when the host is unknown.
	float harmonic_centrality(const std::string &domain);

}

================================================
FILE: src/downloader/merge_downloader.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, , et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
*/ #include #include #include "file/file.h" #include "file/archive.h" #include "hash_table2/builder.h" #include "utils/thread_pool.hpp" #include "indexer/index.h" #include "indexer/index_builder.h" #include "indexer/index_reader.h" #include "indexer/value_record.h" namespace downloader { bool internal_links_complete(const std::string &path) { for (size_t i = 0; i < 8; i++) { if (!file::file_exists(path + "/internal_links_" + std::to_string(i))) { return false; } } return true; } bool hash_table_complete(const std::string &path) { const size_t num_shards = 1019; for (size_t i = 0; i < num_shards; i++) { if (!file::file_exists(path + "/" + std::to_string(i) + ".pos")) { return false; } } for (size_t i = 0; i < num_shards; i++) { if (!file::file_exists(path + "/" + std::to_string(i) + ".data")) { return false; } } return true; } void merge_internal_links(const std::string &path, const std::string &batch_name) { return; /* const std::string target_path = "/slow_data/internal_links/" + batch_name; file::create_directory(target_path); for (size_t i = 0; i < 8; i++) { file::copy_file(path + "/internal_links_" + std::to_string(i), target_path + "/internal_links_" + std::to_string(i)); } */ utils::thread_pool pool(8); for (size_t i = 0; i < 8; i++) { pool.enqueue([i, path]() { file::archive tar(path + "/internal_links_" + std::to_string(i)); utils::thread_pool pool(4, 10); tar.untar([&pool](const std::string &filename, const std::string &data) { pool.enqueue([filename, data]() { uint64_t host_hash = std::stoull(filename.substr(0, filename.size() - 5)); std::istringstream ram_reader(data); indexer::index_builder idx1("internal_links", host_hash, 1000); indexer::index idx2(&ram_reader, 1000); try { idx1.merge_with(idx2); } catch (const std::runtime_error &err) { // The file is corrupt. Lets delete it and report. std::cout << "internal_links: " << host_hash << " is corrupt" << std::endl; idx1.truncate(); } catch (const std::bad_alloc &err) { // The file is corrupt. 
Lets delete it and report. std::cout << "internal_links: " << host_hash << " is corrupt" << std::endl; idx1.truncate(); } }); }); pool.run_all(); }); } pool.run_all(); std::cout << "finished with the merge" << std::endl; } void merge_hash_table(const std::string &path) { utils::thread_pool pool(32); hash_table2::builder ht("all_urls", 1019, 1000000, "/slow_data"); for (size_t i = 0; i < 1019; i++) { pool.enqueue([&ht, i, path]() { ht.get_shard(i)->merge_with(path + "/" + std::to_string(i) + ".pos", path + "/" + std::to_string(i) + ".data"); }); } pool.run_all(); } void merge_downloader() { indexer::index_builder::create_directories("internal_links"); file::read_directory(config::data_path() + "/downloader", [](const std::string &node_id) { const std::string dir = config::data_path() + "/downloader/" + node_id; file::read_directory(dir, [dir](const std::string &file) { try { size_t ts = std::stoull(file); const std::string batch = dir + "/" + std::to_string(ts); if (internal_links_complete(batch) && hash_table_complete(batch + "/ht")) { std::cout << "merging directory: " << batch << std::endl; profiler::instance prof1("merge_internal_links"); merge_internal_links(batch, std::to_string(ts)); prof1.stop(); profiler::instance prof2("merge_hash_table"); merge_hash_table(batch + "/ht"); prof2.stop(); file::delete_directory(batch); exit(0); } } catch (...) { } }); }); } } ================================================ FILE: src/downloader/merge_downloader.h ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. 
*
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include

namespace downloader {

	// Merges the first complete downloader batch found below <data_path>/downloader into the local indexes
	// (see merge_downloader.cpp). The implementation calls exit(0) after a batch has been merged.
	void merge_downloader();

}

================================================
FILE: src/downloader/warc_downloader.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, , et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include #include "config.h" #include "common/datetime.h" #include "warc/warc.h" #include "utils/thread_pool.hpp" #include "utils/id_allocator.h" #include "file/archive.h" #include "logger/logger.h" #include "text/text.h" #include "transfer/transfer.h" #include #include "hash_table2/builder.h" #include "algorithm/algorithm.h" #include "indexer/index_utils.h" #include "indexer/index_builder.h" #include "indexer/value_record.h" #include "indexer/merger.h" namespace downloader { void run_downloader(const std::string &warc_path) { warc::parser pp; for (int retry = 0; retry < 3; retry++) { try { warc::multipart_download("http://data.commoncrawl.org/" + warc_path, [&pp](const std::string &chunk) { std::stringstream ss(chunk); pp.parse_stream(ss); }); break; } catch (const std::runtime_error &err) { std::cout << "GOT ERROR: " << err.what() << std::endl; std::cout << "Retrying... 
try " << retry << std::endl; std::this_thread::sleep_for(std::chrono::seconds(5)); } } LOG_INFO("uploading: " + warc_path); int error; error = transfer::upload_gz_file(warc::get_result_path(warc_path), pp.result()); error = transfer::upload_gz_file(warc::get_link_result_path(warc_path), pp.link_result()); if (error) { LOG_INFO("error uploading: " + warc_path); } } std::vector download_warc_paths() { int error; auto content = transfer::file_to_string("nodes/" + config::node + "/warc.paths", error); if (error == transfer::ERROR) return {}; content = text::trim(content); std::vector raw_warc_paths; boost::algorithm::split(raw_warc_paths, content, boost::is_any_of("\n")); std::vector warc_paths; for (const auto &warc_path : raw_warc_paths) { if (text::trim(warc_path).size()) { warc_paths.push_back(text::trim(warc_path)); } } return warc_paths; } bool upload_warc_paths(const std::vector &warc_paths) { auto content = boost::algorithm::join(warc_paths, "\n"); int error = transfer::upload_file("nodes/" + config::node + "/warc.paths", content); return error == transfer::OK; } void start_downloaders(const std::vector &warc_paths) { const size_t num_threads = 12; std::vector> chunks; algorithm::vector_chunk(warc_paths, std::ceil(warc_paths.size() / num_threads) + 1, chunks); utils::thread_pool pool(num_threads); for (const auto &chunk : chunks) { pool.enqueue([chunk] { size_t count = 0; for (const auto &warc_path : chunk) { run_downloader(warc_path); count++; std::cout << "done with " << warc_path << " done with " << count << "/" << chunk.size() << std::endl; } }); } pool.run_all(); } void upload_all() { /*auto upload_id = std::to_string(common::cur_datetime()); // Upload internal links. for (size_t i = 0; i < 8; i++) { // Optimize all internal links. 
utils::thread_pool pool(32); file::read_directory(config::data_path() + "/" + std::to_string(i) + "/full_text/internal_links", [&pool](const std::string &filename) { uint64_t host_hash = std::stoull(filename.substr(0, filename.size() - 5)); indexer::index_builder idx("internal_links", host_hash, 1000); idx.optimize(); }); pool.run_all(); const auto filename = "internal_links_" + std::to_string(i); file::archive tar(filename); tar.read_dir(config::data_path() + "/" + std::to_string(i) + "/full_text/internal_links"); transfer::upload_file_from_disk("downloader/" + config::node + "/" + upload_id + "/" + filename, filename); file::delete_file(filename); } hash_table2::hash_table ht("crawl_index", 1019); ht.for_each_shard([upload_id](auto shard) { const auto pos_filename = shard->filename_pos(); const auto data_filename = shard->filename_data(); const auto target_filename = std::to_string(shard->shard_id()); transfer::upload_file_from_disk("downloader/" + config::node + "/" + upload_id + "/ht/" + target_filename + ".pos", pos_filename); transfer::upload_file_from_disk("downloader/" + config::node + "/" + upload_id + "/ht/" + target_filename + ".data", data_filename); }); */ } void warc_downloader_with_url(const std::string &batch, const std::string &warc_paths_url) { std::vector warc_paths; int error; auto content = transfer::gz_file_to_string(warc_paths_url, error); std::stringstream ss(content); std::string line; size_t line_num = 0; while (std::getline(ss, line)) { if (line_num % config::nodes_in_cluster == config::node_id) { warc_paths.emplace_back(std::move(line)); } line_num++; } start_downloaders(warc_paths); } void warc_downloader(const std::string &batch) { warc_downloader_with_url(batch, "https://data.commoncrawl.org/crawl-data/" + batch + "/warc.paths.gz"); } void warc_downloader_missing(const std::string &batch) { warc_downloader_with_url(batch, "crawl-data/" + batch + "/missing.paths.gz"); } } ================================================ FILE: 
src/downloader/warc_downloader.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, , et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include
#include

namespace downloader {

	// Returns the trimmed, non-empty lines of nodes/<node>/warc.paths; empty when the file cannot be fetched.
	std::vector download_warc_paths();

	// Uploads warc_paths (one path per line) to nodes/<node>/warc.paths; true when the upload succeeded.
	bool upload_warc_paths(const std::vector &warc_paths);

	// Downloads and parses this node's share of the warc files of the given commoncrawl batch.
	void warc_downloader(const std::string &batch);

	// Same as warc_downloader, but for the paths listed in crawl-data/<batch>/missing.paths.gz.
	void warc_downloader_missing(const std::string &batch);

}

================================================
FILE: src/file/archive.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, , et al.
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include "archive.h" #include "file.h" #include "algorithm/algorithm.h" #include "utils/thread_pool.hpp" #include #include #include #include #include #include namespace file { archive::archive(const std::string &filename) : m_filename(filename) { } archive::~archive() { } void archive::read_dir(const std::string &dirname) { // Truncate target file. std::ofstream outfile(m_filename, std::ios::binary | std::ios::trunc); outfile.close(); boost::filesystem::path path(dirname); std::vector paths; if (is_directory(path)) { boost::filesystem::directory_iterator iter(path); for (auto &file : boost::make_iterator_range(iter, {})) { paths.push_back(file.path()); } } std::vector> chunks; algorithm::vector_chunk(paths, std::ceil(paths.size() / m_num_threads) + 1, chunks); utils::thread_pool pool(m_num_threads); size_t worker_id = 0; for (const auto &chunk : chunks) { // Remove worker file. ::file::delete_file(m_filename + "." 
+ std::to_string(worker_id)); pool.enqueue([this, chunk, worker_id]() { for (const auto &path : chunk) { add_file(path.generic_string(), path.filename().generic_string(), worker_id); } }); worker_id++; } pool.run_all(); // Merge workers. for (size_t worker_id = 0; worker_id < m_num_threads; worker_id++) { std::filebuf infile, outfile; outfile.open(m_filename, std::ios::out | std::ios::binary | std::ios::app); infile.open(m_filename + "." + std::to_string(worker_id), std::ios::in | std::ios::binary); std::copy(std::istreambuf_iterator(&infile), {}, std::ostreambuf_iterator(&outfile)); // Remove worker file. ::file::delete_file(m_filename + "." + std::to_string(worker_id)); } } void archive::untar(const std::string &dest_dir) { std::ifstream infile(m_filename, std::ios::binary); tar_header header; while (!infile.eof()) { infile.read((char *)&header, sizeof(tar_header)); if (infile.eof()) break; // This is an unnessecary copy. char *buffer = new char[header.m_len]; infile.read(buffer, header.m_len); std::string buffer_string(buffer, header.m_len); std::stringstream buffer_stream(buffer_string); delete[] buffer; boost::iostreams::filtering_istream decompress_stream; decompress_stream.push(boost::iostreams::gzip_decompressor()); decompress_stream.push(buffer_stream); std::string decompressed_data(std::istreambuf_iterator(decompress_stream), {}); std::ofstream outfile(dest_dir + "/" + header.m_filename, std::ios::binary); outfile.write(decompressed_data.c_str(), decompressed_data.size()); } } void archive::untar(std::function cb) { std::ifstream infile(m_filename, std::ios::binary); tar_header header; while (!infile.eof()) { infile.read((char *)&header, sizeof(tar_header)); if (infile.eof()) break; // This is an unnessecary copy. 
char *buffer = new char[header.m_len]; infile.read(buffer, header.m_len); std::string buffer_string(buffer, header.m_len); std::stringstream buffer_stream(buffer_string); delete[] buffer; boost::iostreams::filtering_istream decompress_stream; decompress_stream.push(boost::iostreams::gzip_decompressor()); decompress_stream.push(buffer_stream); std::string decompressed_data(std::istreambuf_iterator(decompress_stream), {}); cb(header.m_filename, decompressed_data); } } void archive::add_file(const std::string &path, const std::string &filename, size_t worker_id) { std::ofstream outfile(m_filename + "." + std::to_string(worker_id), std::ios::binary | std::ios::app); std::string data = ::file::cat(path); std::stringstream ss(data); boost::iostreams::filtering_istream compress_stream; compress_stream.push(boost::iostreams::gzip_compressor()); compress_stream.push(ss); std::string compressed_data(std::istreambuf_iterator(compress_stream), {}); tar_header header; header.m_len = compressed_data.size(); filename.copy(header.m_filename, filename.size(), 0); header.m_filename[filename.size()] = 0; outfile.write((char *)&header, sizeof(tar_header)); outfile.write((char *)compressed_data.c_str(), compressed_data.size()); } } ================================================ FILE: src/file/archive.h ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. 
namespace file {

	/*
	 * A simple archive format that packs the gzip compressed files of one directory into a single
	 * file. Every entry is a fixed size tar_header followed by m_len bytes of compressed data.
	 * */
	class archive {

	public:

		// Binds the archive to 'filename'. No file is touched by the constructor.
		explicit archive(const std::string &filename);
		~archive();

		// Truncates the archive and fills it with the entries of the directory 'dirname'.
		void read_dir(const std::string &dirname);

		// Writes every entry of the archive as a file into 'dest_dir'.
		void untar(const std::string &dest_dir);

		// Calls cb(filename, decompressed_data) for every entry of the archive.
		void untar(std::function<void(const std::string &, const std::string &)> cb);

	private:

		// Number of worker threads (and temporary worker files) used by read_dir.
		const size_t m_num_threads = 32;
		std::string m_filename;

		// On disk header of one entry. It is written and read as raw bytes, so changing the layout
		// makes existing archives unreadable.
		struct tar_header {
			size_t m_len; // Length of the compressed data following the header.
			char m_filename[256]; // Zero terminated name of the entry.
		};

		// Compresses the file at 'path' and appends it as 'filename' to the worker file 'worker_id'.
		void add_file(const std::string &path, const std::string &filename, size_t worker_id);

	};
}
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #include "config.h" #include "file.h" #include #include namespace file { std::string read_test_file(const std::string &file_name) { std::ifstream file(config::test_data_path + file_name); if (file.is_open()) { std::string ret; file.seekg(0, std::ios::end); ret.resize(file.tellg()); file.seekg(0, std::ios::beg); file.read(&ret[0], ret.size()); file.close(); return ret; } return ""; } void rename(const std::string &old_path, const std::string &new_path) { boost::filesystem::rename(old_path, new_path); } void copy_file(const std::string &source, const std::string &dest) { std::ifstream infile(source, std::ios::binary); std::ofstream outfile(dest, std::ios::binary | std::ios::trunc); outfile << infile.rdbuf(); } void delete_file(const std::string &file) { boost::filesystem::remove(file); } void create_directory(const std::string &path) { boost::filesystem::create_directories(path); } void delete_directory(const std::string &path) { boost::filesystem::remove_all(path); } std::string cat(const std::string &filename) { std::ifstream infile(filename); std::istreambuf_iterator iter(infile), end; std::string ret(iter, end); return ret; } void read_directory(const std::string &dirname, std::function cb) { boost::filesystem::path path(dirname); if (is_directory(path)) { boost::filesystem::directory_iterator iter(path); for (auto &file : boost::make_iterator_range(iter, {})) { cb(file.path().filename().generic_string()); } } } bool directory_exists(const std::string &filename) { return boost::filesystem::is_directory(filename) && boost::filesystem::exists(filename); } bool file_exists(const std::string &filename) { std::ifstream infile(filename); return infile.good(); } } ================================================ FILE: src/file/file.h ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. 
namespace file {

	/*
	 * Returns the content of 'file_name' below the configured test data path. Returns an empty
	 * string when the file cannot be opened.
	 * */
	std::string read_test_file(const std::string &file_name);

	// Renames (moves) old_path to new_path.
	void rename(const std::string &old_path, const std::string &new_path);

	// Copies 'source' to 'dest'. An existing 'dest' is truncated first.
	void copy_file(const std::string &source, const std::string &dest);

	// Removes the file 'filename'.
	void delete_file(const std::string &filename);

	// Creates the directory 'path' including missing parent directories.
	void create_directory(const std::string &path);

	// Removes the directory 'path' together with its content.
	void delete_directory(const std::string &path);

	/*
	 * Returns the whole content of the file.
	 * */
	std::string cat(const std::string &filename);

	/*
	 * Calls cb with the file name (without directory) of every entry directly inside 'path'.
	 * Does nothing when 'path' is not a directory.
	 * */
	void read_directory(const std::string &path, std::function<void(const std::string &)> cb);

	// Returns true when 'filename' is an existing directory.
	bool directory_exists(const std::string &filename);

	// Returns true when 'filename' can be opened for reading.
	bool file_exists(const std::string &filename);

}
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #include "gz_tsv_file.h" #include #include #include #include namespace file { gz_tsv_file::gz_tsv_file() { } gz_tsv_file::gz_tsv_file(const std::string &file_name) { m_file_name = file_name; std::ifstream infile(m_file_name); if (infile.is_open()) { boost::iostreams::filtering_istream decompress_stream; decompress_stream.push(boost::iostreams::gzip_decompressor()); decompress_stream.push(infile); m_data = std::string(std::istreambuf_iterator(decompress_stream), {}); } } gz_tsv_file::~gz_tsv_file() { } size_t gz_tsv_file::read_column_into(size_t column, std::vector &container) { std::stringstream ss(m_data); std::string line; size_t rows_read = 0; while (getline(ss, line)) { std::vector cols; boost::algorithm::split(cols, line, boost::is_any_of("\t")); if (cols.size() > column) { container.push_back(cols[column]); } else { container.push_back(""); } rows_read++; } return rows_read; } } ================================================ FILE: src/file/gz_tsv_file.h ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
namespace file {

	/*
	 * A gzip compressed file with tab separated columns. The whole file is decompressed into memory
	 * by the constructor.
	 * */
	class gz_tsv_file {

	public:

		// Creates an empty object without any data.
		gz_tsv_file();

		// Loads and decompresses 'file_name'. The object stays empty when the file cannot be opened.
		explicit gz_tsv_file(const std::string &file_name);
		~gz_tsv_file();

		/*
		 * Appends column number 'column' (zero based) of every line to 'container', an empty string
		 * for lines with too few columns. Returns the number of lines read.
		 * */
		size_t read_column_into(size_t column, std::vector<std::string> &container);

	protected:

		std::string m_file_name;

		// The decompressed content of the file.
		std::string m_data;

	};
}
*/ #include "tsv_file.h" #include namespace file { tsv_file::tsv_file() { } tsv_file::tsv_file(const std::string &file_name) { set_file_name(file_name); } tsv_file::~tsv_file() { m_file.close(); } std::string tsv_file::find(const std::string &key) { size_t pos = binary_find_position(m_file_size, 0, key); if (pos == std::string::npos) { return ""; } m_file.seekg(pos, m_file.beg); std::string line; getline(m_file, line); return line; } size_t tsv_file::find_first_position(const std::string &key) { m_file.clear(); m_file.seekg(0, m_file.beg); const size_t pos = binary_find_position(m_file_size, 0, key); if (pos == std::string::npos) return std::string::npos; // pos is the position of one item. but we need the first one. size_t jump = 1000; while (pos > jump) { m_file.seekg(pos - jump, m_file.beg); // read next line. std::string line; getline(m_file, line); getline(m_file, line); auto jump_key = line.substr(0, line.find("\t")); if (jump_key < key) { // We jamp too far. break; } jump = jump << 1; } if (pos < jump) jump = pos; // The first occurance is between pos - jump and pos - (jump/2) // Linear search. m_file.seekg(pos - jump, m_file.beg); std::string line; if (pos > jump) { getline(m_file, line); } while (getline(m_file, line)) { auto jump_key = line.substr(0, line.find("\t")); if (jump_key == key) { return (size_t)m_file.tellg() - (line.size() + 1u); } } return std::string::npos; } size_t tsv_file::find_last_position(const std::string &key) { m_file.clear(); m_file.seekg(0, m_file.beg); const size_t pos = binary_find_position(m_file_size, 0, key); if (pos == std::string::npos) return std::string::npos; // pos is the position of one item. but we need the last one. size_t jump = 1000; while (pos + jump < m_file_size) { m_file.seekg(pos + jump, m_file.beg); // read next line. std::string line; getline(m_file, line); getline(m_file, line); auto jump_key = line.substr(0, line.find("\t")); if (jump_key > key) { // We jamp too far. 
break; } jump = jump << 1; } jump = jump >> 1; if (pos + jump > m_file_size) { jump = 0; } // The first occurance is between pos - jump and pos - (jump/2) // Linear search. m_file.seekg(pos + jump, m_file.beg); size_t ret_pos = pos + jump; std::string line; getline(m_file, line); size_t last_line_length = line.size() + 1u; ret_pos += line.size() + 1u; while (getline(m_file, line)) { auto jump_key = line.substr(0, line.find("\t")); if (jump_key > key) { return ret_pos - last_line_length; } ret_pos += line.size() + 1u; last_line_length = line.size() + 1u; } return ret_pos - last_line_length; } size_t tsv_file::find_next_position(const std::string &key) { m_file.clear(); m_file.seekg(0, m_file.beg); const size_t pos = binary_find_position_any(m_file_size, 0, key); // pos is the position of one item. but we need the last one. size_t jump = 1000; while (pos + jump < m_file_size) { m_file.seekg(pos + jump, m_file.beg); // read next line. std::string line; getline(m_file, line); getline(m_file, line); auto jump_key = line.substr(0, line.find("\t")); if (jump_key > key) { // We jamp too far. break; } jump = jump << 1; } jump = jump >> 1; if (pos + jump > m_file_size) { jump = 0; } // The first occurance is between pos - jump and pos - (jump/2) // Linear search. m_file.seekg(pos + jump, m_file.beg); size_t ret_pos = pos + jump; std::string line; getline(m_file, line); ret_pos += line.size() + 1u; while (getline(m_file, line)) { auto jump_key = line.substr(0, line.find("\t")); if (jump_key > key) { return ret_pos; } ret_pos += line.size() + 1u; } return m_file_size; } std::map tsv_file::find_all(const std::set &keys) { m_file.clear(); m_file.seekg(0, m_file.beg); size_t pos = 0; std::map result; std::string line; for (const auto &key : keys) { pos = binary_find_position(m_file_size, pos, key); if (pos != std::string::npos) { m_file.seekg(pos, m_file.beg); getline(m_file, line); result[key] = line; } else { // Key not found, ignore. 
} } return result; } size_t tsv_file::read_column_into(int column, std::set &container) { (void)column; m_file.clear(); m_file.seekg(0, m_file.beg); if (!m_file.is_open()) { throw std::runtime_error("File is not open any more: " + m_file_name); } std::string line; size_t rows_read = 0; while (getline(m_file, line)) { std::stringstream ss(line); std::string col; ss >> col; container.insert(col); rows_read++; } return rows_read; } size_t tsv_file::read_column_into(int column, std::set &container, size_t limit) { (void)limit; m_file.clear(); m_file.seekg(0, m_file.beg); if (!m_file.is_open()) { throw std::runtime_error("File is not open any more: " + m_file_name); } std::string line; size_t rows_read = 0; while (getline(m_file, line)) { std::stringstream ss(line); std::string col; ss >> col; container.insert(col); rows_read++; if (rows_read >= limit) break; } return rows_read; } size_t tsv_file::read_column_into(int column, std::set &container, size_t limit, size_t offset) { m_file.clear(); m_file.seekg(0, m_file.beg); if (!m_file.is_open()) { throw std::runtime_error("File is not open any more: " + m_file_name); } std::string line; size_t rows_read = 0; while (getline(m_file, line)) { std::stringstream ss(line); std::string col; ss >> col; if (rows_read >= offset) { container.insert(col); rows_read++; if ((rows_read - offset) >= limit) break; } else { rows_read++; } } return rows_read; } size_t tsv_file::size() const { return m_file_size; } bool tsv_file::eof() const { return m_file.eof(); } bool tsv_file::is_open() const { return m_file.is_open(); } std::string tsv_file::get_line() { std::string line; getline(m_file, line); return line; } size_t tsv_file::read_column_into(int column, std::vector &container) { m_file.clear(); m_file.seekg(0, m_file.beg); std::string line; size_t rows_read = 0; while (getline(m_file, line)) { std::stringstream ss(line); std::string col; ss >> col; container.push_back(col); rows_read++; } return rows_read; } size_t 
tsv_file::read_column_into(int column, std::vector &container, size_t limit) { m_file.clear(); m_file.seekg(0, m_file.beg); std::string line; size_t rows_read = 0; while (getline(m_file, line)) { std::stringstream ss(line); std::string col; ss >> col; container.push_back(col); rows_read++; if (rows_read >= limit) break; } return rows_read; } size_t tsv_file::read_column_into(int column, std::vector &container, size_t limit, size_t offset) { m_file.clear(); m_file.seekg(0, m_file.beg); std::string line; size_t rows_read = 0; while (getline(m_file, line)) { std::stringstream ss(line); std::string col; ss >> col; if (rows_read >= offset) { container.push_back(col); rows_read++; if ((rows_read - offset) >= limit) break; } else { rows_read++; } } return rows_read; } size_t tsv_file::binary_find_position(size_t file_size, size_t offset, const std::string &key) { std::string line; if (file_size - offset < 750) { // Make linear search. m_file.seekg(offset, m_file.beg); size_t bytes_read = 0; while (getline(m_file, line) && bytes_read <= file_size - offset) { bytes_read += (line.size() + 1u); if (line.starts_with(key + "\t")) { return (size_t)m_file.tellg() - (line.size() + 1u); } } return std::string::npos; } size_t pivot_len_1 = (file_size - offset) / 2; size_t pivot = offset + pivot_len_1; // Get key at pivot. m_file.seekg(pivot, m_file.beg); getline(m_file, line); getline(m_file, line); auto pivot_key = line.substr(0, line.find("\t")); if (key < pivot_key) { return binary_find_position(offset + pivot_len_1, offset, key); } else if (key > pivot_key) { return binary_find_position(file_size, pivot, key); } return (size_t)m_file.tellg() - (line.size() + 1u); } size_t tsv_file::binary_find_position_any(size_t file_size, size_t offset, const std::string &key) { std::string line; if (file_size - offset < 750) { // Make linear search. 
m_file.seekg(offset, m_file.beg); size_t bytes_read = 0; while (getline(m_file, line) && bytes_read <= file_size - offset) { bytes_read += (line.size() + 1u); const auto this_key = line.substr(0, line.find("\t")); if (this_key >= key) { return (size_t)m_file.tellg() - (line.size() + 1u); } } return m_file_size; } size_t pivot_len_1 = (file_size - offset) / 2; size_t pivot = offset + pivot_len_1; // Get key at pivot. m_file.seekg(pivot, m_file.beg); getline(m_file, line); getline(m_file, line); auto pivot_key = line.substr(0, line.find("\t")); if (key < pivot_key) { return binary_find_position(offset + pivot_len_1, offset, key); } else if (key > pivot_key) { return binary_find_position(file_size, pivot, key); } return (size_t)m_file.tellg() - (line.size() + 1u); } void tsv_file::set_file_name(const std::string &file_name) { m_file_name = file_name; m_original_file_name = file_name; m_file.open(m_file_name); if (!m_file.is_open()) { throw std::runtime_error("Could not open file: " + m_file_name + " error: " + strerror(errno)); } m_file.seekg(0, m_file.end); m_file_size = m_file.tellg(); m_file.seekg(0, m_file.beg); } } ================================================ FILE: src/file/tsv_file.h ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. 
namespace file {

	/*
	 * Read access to a tab separated text file that is sorted by its first column (the key).
	 * The find functions use binary search on the open file instead of loading it into memory.
	 * */
	class tsv_file {

	public:

		// Creates an object without an open file.
		tsv_file();

		// Opens 'file_name', throws std::runtime_error when the file cannot be opened.
		explicit tsv_file(const std::string &file_name);
		~tsv_file();

		// Returns the line with the first column equals key. Returns an empty string if not present in file.
		std::string find(const std::string &key);

		/*
			Returns the position of the FIRST line in the file with first column equals key.
			Returns std::string::npos if not present in file.
		*/
		size_t find_first_position(const std::string &key);

		/*
			Returns the position of the LAST line in the file with first column equals key.
			Returns std::string::npos if not present in file.
		*/
		size_t find_last_position(const std::string &key);

		/*
			Returns the position of the line AFTER the line in the file with first column equals key.
			If the key does not exist it returns the position to the line where this key would be inserted.
			If the key should be inserted to the end it returns m_file_size
		*/
		size_t find_next_position(const std::string &key);

		// Returns a map from every key that is present in the file to its line.
		std::map<std::string, std::string> find_all(const std::set<std::string> &keys);

		/*
			The read_column_into functions read the file from the beginning and add one value per line
			to 'container'. 'limit' is the maximum number of values to add and 'offset' the number of
			lines to skip first. They return the number of lines consumed.
			NOTE: the current implementation ignores 'column' and always takes the first whitespace
			delimited token of the line.
		*/
		size_t read_column_into(int column, std::set<std::string> &container);
		size_t read_column_into(int column, std::set<std::string> &container, size_t limit);
		size_t read_column_into(int column, std::set<std::string> &container, size_t limit, size_t offset);

		size_t read_column_into(int column, std::vector<std::string> &container);
		size_t read_column_into(int column, std::vector<std::string> &container, size_t limit);
		size_t read_column_into(int column, std::vector<std::string> &container, size_t limit, size_t offset);

		// Size of the file in bytes.
		size_t size() const;

		bool eof() const;
		bool is_open() const;

		// Returns the next line from the current read position.
		std::string get_line();

	protected:

		std::string m_file_name;
		std::string m_original_file_name;
		std::ifstream m_file;

		// Initialized because the default constructor does not open a file, size() must not return
		// an indeterminate value in that case.
		size_t m_file_size = 0;

		bool m_is_gzipped = false;

		/*
			Difference is that _any returns the position where this key WOULD be if it was inserted
			even if it is not present.
		*/
		size_t binary_find_position(size_t file_size, size_t offset, const std::string &key);
		size_t binary_find_position_any(size_t file_size, size_t offset, const std::string &key);

		// Opens 'file_name' and determines its size. Throws std::runtime_error on failure.
		void set_file_name(const std::string &file_name);

	};
}
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include "tsv_file_remote.h" #include "logger/logger.h" #include "transfer/transfer.h" #include #include #include #include //using namespace boost::iostreams; namespace file { tsv_file_remote::tsv_file_remote(const std::string &file_name) { // Check if the file exists. m_file_name = file_name; std::ifstream infile(get_path()); if (download_file() == transfer::OK) { set_file_name(get_path()); } else { infile.close(); } } tsv_file_remote::~tsv_file_remote() { } std::string tsv_file_remote::get_path() const { return config::data_path() + "/0/" + m_file_name; } int tsv_file_remote::download_file() { if (m_file_name.find(".gz") == m_file_name.size() - 3) { m_is_gzipped = true; } else { m_is_gzipped = false; } LOG_INFO("Downloading file with key: " + m_file_name); create_directory(); std::ofstream outfile(get_path(), std::ios::trunc); int error = transfer::ERROR; if (outfile.good()) { if (m_is_gzipped) { transfer::gz_file_to_stream(m_file_name, outfile, error); } else { transfer::file_to_stream(m_file_name, outfile, error); } if (error == transfer::ERROR) { LOG_INFO("Download failed..."); } } LOG_INFO("Done downloading file with key: " + m_file_name); return error; } void tsv_file_remote::create_directory() { boost::filesystem::path path(get_path()); boost::filesystem::create_directories(path.parent_path()); } } ================================================ FILE: src/file/tsv_file_remote.h ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 
namespace file {

	/*
	 * A tsv_file whose content is downloaded from the transfer layer. The constructor downloads the
	 * file to the local data directory and opens the local copy if the download succeeded.
	 * */
	class tsv_file_remote : public tsv_file {

	public:

		// Downloads the file with key 'file_name' and opens it. Nothing is opened when the download fails.
		explicit tsv_file_remote(const std::string &file_name);
		~tsv_file_remote();

		// Returns config::data_path() + "/0/" + m_file_name.
		std::string get_path() const;

	private:

		// Downloads m_file_name to get_path() and returns the error code of the transfer.
		int download_file();

		// Creates the parent directory of get_path().
		void create_directory();

	};
}
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include "tsv_row.h" namespace file { tsv_row::tsv_row(const std::string &line) { size_t pos_start = 0; size_t pos_end = 0; while (pos_end != std::string::npos) { pos_end = line.find(pos_start, '\t'); m_cols.emplace_back(line.substr(pos_start, pos_end)); pos_start = pos_end + 1; } } tsv_row::~tsv_row() { } } ================================================ FILE: src/file/tsv_row.h ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. 
namespace file {

	/*
	 * One line of a tab separated file split into its columns.
	 * */
	class tsv_row {

	public:

		// Splits 'line' at the tab characters.
		explicit tsv_row(const std::string &line);
		~tsv_row();

	private:

		// The columns of the line in their original order.
		std::vector<std::string> m_cols;

	};
}
*
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

// uint64_t is used below; include it here so the header does not depend on include order.
#include <cstdint>

namespace full_text {

	/*
	 * Plain record describing a link between two domains.
	 *
	 * NOTE(review): presumably stored and read as raw bytes by the full text index, so the member
	 * order and types should not be changed without checking the readers and writers.
	 * */
	struct domain_link_record {

		uint64_t m_value;			// Record value; presumably the link hash, confirm against the indexer.
		float m_score;				// Score attached to the link.
		uint64_t m_source_domain;	// Presumably the hash of the linking domain.
		uint64_t m_target_domain;	// Presumably the hash of the linked domain.

	};
}

================================================
FILE: src/full_text/link_record.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, , et al.
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #pragma once namespace full_text { struct link_record { uint64_t m_value; float m_score; uint64_t m_source_domain; uint64_t m_target_hash; }; } ================================================ FILE: src/full_text/record.h ================================================ #pragma once namespace full_text { struct record { uint64_t m_value; float m_score; uint64_t m_domain_hash; }; } ================================================ FILE: src/full_text/result_set.h ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #pragma once #include "config.h" #include #include #include #include #include namespace full_text { template class result_set { public: result_set(size_t size); ~result_set(); size_t size() const { return m_size; } size_t max_size() const { return m_max_size; } const data_record *data_pointer() const { return m_data_pointer; } const data_record *section_pointer(size_t section) const { return &m_data_pointer[section * config::ft_max_results_per_section]; } data_record *data_pointer() { return m_data_pointer; } data_record *section_pointer(size_t section) { return &m_data_pointer[section * config::ft_max_results_per_section]; } std::span *span_pointer() { return &m_span; } size_t total_num_results() const { return m_total_num_results ; }; void set_total_num_results(size_t total_num_results); void resize(size_t n) { m_span = std::span(m_data_pointer, n); m_size = n; } void prepare_sections(const std::string &filename, size_t offset, size_t len); void read_to_section(size_t section); bool has_next_section(); size_t num_sections(); void close_sections(); void copy_vector(const std::vector &vec); private: result_set(const result_set &res) = delete; std::span m_span; data_record *m_data_pointer; size_t m_size; // The length in first section. const size_t m_max_size; // The maximum number of elements the result set can hold. size_t m_total_size; // The lengths of all elements in all sections. size_t m_total_num_results; // The total indexed length, only used to display total number of results. 
size_t m_section_len; size_t m_records_read; int m_file_descriptor; bool m_error = false; }; template result_set::result_set(size_t size) : m_size(size), m_max_size(size), m_total_num_results(0) { m_file_descriptor = -1; m_data_pointer = new data_record[size]; m_span = std::span(m_data_pointer, size); } template result_set::~result_set() { delete []m_data_pointer; } template void result_set::set_total_num_results(size_t total_num_results) { m_total_num_results = total_num_results; } template void result_set::prepare_sections(const std::string &filename, size_t offset, size_t len) { assert(m_file_descriptor < 0); m_size = len / sizeof(data_record); m_total_size = m_size; if (m_size > config::ft_max_results_per_section) m_size = config::ft_max_results_per_section; m_file_descriptor = open(filename.c_str(), O_RDONLY); posix_fadvise(m_file_descriptor, offset, m_total_size * sizeof(data_record), POSIX_FADV_SEQUENTIAL); lseek(m_file_descriptor, offset, SEEK_SET); m_records_read = 0; resize(m_size); } /* Reads data up to and includint the section. So if the argument section equals zero the first section is read. 
*/ template void result_set::read_to_section(size_t section) { size_t read_start = m_records_read; size_t read_end = (section + 1) * config::ft_max_results_per_section; if (read_end > m_total_size) read_end = m_total_size; if (read_start > read_end) return; size_t records_to_read = read_end - read_start; int bytes_read = ::read(m_file_descriptor, (void *)&m_data_pointer[m_records_read], (size_t)records_to_read * sizeof(data_record)); if (bytes_read < 0) { m_error = true; } else { m_error = false; } m_records_read += records_to_read; } template bool result_set::has_next_section() { if (m_file_descriptor < 0) return false; return m_total_size > m_records_read; } template size_t result_set::num_sections() { // Ceiling integer division of m_total_size/config::ft_max_results_per_section; return (m_total_size + config::ft_max_results_per_section - 1) / config::ft_max_results_per_section; } template void result_set::close_sections() { if (m_file_descriptor >= 0) { close(m_file_descriptor); m_file_descriptor = -1; } } template void result_set::copy_vector(const std::vector &vec) { memcpy(&m_data_pointer[0], vec.data(), vec.size() * sizeof(data_record)); resize(vec.size()); } } ================================================ FILE: src/full_text/search_metric.h ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. 
*
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

// size_t is used below; include it here so the header does not depend on include order.
#include <cstddef>

namespace full_text {

	/*
	 * Counters collected while running a search. All counters start at zero.
	 *
	 * The members are public and carry no invariants. What exactly each counter measures is decided
	 * by the code that increments it (not visible in this header).
	 * */
	class search_metric {

		public:
			size_t m_total_found = 0;
			size_t m_total_url_links_found = 0;
			size_t m_total_domain_links_found = 0;
			size_t m_links_handled = 0;
			size_t m_link_domain_matches = 0;
			size_t m_link_url_matches = 0;

	};
}

================================================
FILE: src/hash_table2/builder.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, , et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include "builder.h" #include "utils/thread_pool.hpp" namespace hash_table2 { builder::builder(const std::string &db_name, size_t num_shards, size_t hash_table_size, const std::string &data_path) : m_db_name(db_name) { for (size_t i = 0; i < num_shards; i++) { m_shards.push_back(new hash_table_shard_builder(db_name, i, hash_table_size, data_path)); } } builder::~builder() { for (hash_table_shard_builder *shard : m_shards) { delete shard; } } void builder::add(uint64_t key, const std::string &value, size_t version) { m_shards[key % m_shards.size()]->add(key, value, version); } void builder::remove(uint64_t key) { m_shards[key % m_shards.size()]->remove(key); } void builder::merge() { utils::thread_pool pool(32); for (hash_table_shard_builder *shard : m_shards) { pool.enqueue([shard]() -> void { shard->append(); shard->merge(); }); } pool.run_all(); } void builder::optimize() { utils::thread_pool pool(32); for (hash_table_shard_builder *shard : m_shards) { pool.enqueue([shard]() -> void { shard->optimize(); }); } pool.run_all(); } void builder::truncate() { for (hash_table_shard_builder *shard : m_shards) { shard->truncate(); } } void builder::merge_with(const builder &other) { for (size_t i = 0; i < m_shards.size(); i++) { m_shards[i]->merge_with(*(other.m_shards[i])); } } } ================================================ FILE: src/hash_table2/builder.h ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. 
*
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include
#include "hash_table_shard_builder.h"
#include "config.h"

namespace hash_table2 {

	/*
	 * Write side of the sharded hash table. Owns one hash_table_shard_builder per shard and routes
	 * every key to the shard 'key % number of shards'.
	 * */
	class builder {

		public:

			/*
			 * Creates 'num_shards' shard builders for the database 'db_name'.
			 * The "{shard_id_mod_8}" placeholder in 'data_path' is expanded per shard by the shard classes.
			 * */
			explicit builder(const std::string &db_name, size_t num_shards = config::ht_num_shards, size_t hash_table_size = 1000000,
					const std::string &data_path = config::data_path() + "/{shard_id_mod_8}/hash_table");
			~builder();

			// Adds a key/value pair to the shard owning the key. 'version' is passed on to the shard.
			void add(uint64_t key, const std::string &value, size_t version = 0);

			// Forwards the removal of 'key' to the shard owning the key.
			void remove(uint64_t key);

			// Calls append() and then merge() on every shard, using a thread pool.
			void merge();

			// Calls optimize() on every shard, using a thread pool.
			void optimize();

			// Calls truncate() on every shard.
			void truncate();

			// Merges shard i of 'other' into shard i of this builder. Both should have the same number of shards.
			void merge_with(const builder &other);

			// Returns the shard builder with the given id. No bounds check is made.
			hash_table_shard_builder *get_shard(size_t shard_id) { return m_shards[shard_id]; };

		private:

			// Owned shard builders, deleted in the destructor.
			std::vector m_shards;
			const std::string m_db_name;

	};

}

================================================
FILE: src/hash_table2/hash_table.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, , et al.
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #include "config.h" #include "hash_table.h" #include "hash_table_shard_builder.h" #include "logger/logger.h" namespace hash_table2 { hash_table::hash_table(const std::string &db_name, size_t num_shards, size_t hash_table_size, const std::string &data_path) : m_db_name(db_name) { for (size_t shard_id = 0; shard_id < num_shards; shard_id++) { auto shard = new hash_table_shard(m_db_name, shard_id, hash_table_size, data_path); m_shards.push_back(shard); } } hash_table::~hash_table() { for (hash_table_shard *shard : m_shards) { delete shard; } } void hash_table::add(uint64_t key, const std::string &value) { const size_t shard_id = key % m_shards.size(); hash_table_shard_builder builder(m_db_name, shard_id); builder.add(key, value); } void hash_table::truncate() { for (size_t shard_id = 0; shard_id < m_shards.size(); shard_id++) { hash_table_shard_builder builder(m_db_name, shard_id); builder.truncate(); } } bool hash_table::has(uint64_t key) { return m_shards[key % m_shards.size()]->has(key); } std::string hash_table::find(uint64_t key) { size_t ver = 0; return find(key, ver); } std::string hash_table::find(uint64_t key, size_t &ver) { return m_shards[key % m_shards.size()]->find(key, ver); } size_t hash_table::size() const { size_t num_items = 0; for (const auto &shard : m_shards) { num_items += shard->size(); } return num_items; } void hash_table::for_each(std::function callback) const { for (const auto &shard : m_shards) { shard->for_each(callback); } } void hash_table::for_each_key(std::function callback) const { for (const auto &shard : m_shards) { shard->for_each_key(callback); } } void hash_table::for_each_shard(std::function callback) const { for (const auto &shard : m_shards) { callback(shard); } } } ================================================ FILE: src/hash_table2/hash_table.h ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/

#pragma once

#include
#include
#include
#include
#include "config.h"
#include "hash_table_shard.h"

namespace hash_table2 {

	class hash_table_shard;

	/*
	 * Read side of the sharded hash table. Owns one hash_table_shard per shard and routes every key
	 * to the shard 'key % number of shards'.
	 * */
	class hash_table {

		public:

			/*
			 * Creates 'num_shards' shards for the database 'db_name'.
			 * */
			explicit hash_table(const std::string &db_name, size_t num_shards = config::ht_num_shards, size_t hash_table_size = 1000000,
					const std::string &data_path = config::data_path() + "/{shard_id_mod_8}/hash_table");
			~hash_table();

			/*
			 * Adds the pair through a temporary hash_table_shard_builder.
			 * NOTE(review): that builder is created with only db name and shard id, so the
			 * hash_table_size and data_path given to this class are not passed on; confirm this is intended.
			 * */
			void add(uint64_t key, const std::string &value);

			// Truncates every shard through a temporary hash_table_shard_builder (same note as for add).
			void truncate();

			// Returns true if the shard owning the key reports the key as present.
			bool has(uint64_t key);

			// Returns the value stored for the key, as returned by the shard owning the key.
			std::string find(uint64_t key);

			// Same as find(key), the shard also writes the version of the entry to 'ver'.
			std::string find(uint64_t key, size_t &ver);

			// Sum of the sizes of all shards.
			size_t size() const;

			// Call the callback through for_each / for_each_key of every shard, in shard order.
			void for_each(std::function callback) const;
			void for_each_key(std::function callback) const;

			// Calls the callback once for every shard with the shard pointer.
			void for_each_shard(std::function callback) const;

		private:

			// Owned shards, deleted in the destructor.
			std::vector m_shards;
			const std::string m_db_name;

	};

}

================================================
FILE: src/hash_table2/hash_table_shard.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, , et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include #include #include #include "config.h" #include "hash_table_shard.h" #include "logger/logger.h" #include #include namespace hash_table2 { hash_table_shard::hash_table_shard(const std::string &db_name, size_t shard_id, size_t hash_table_size, const std::string &data_path) : hash_table_shard_base(db_name, shard_id, hash_table_size, data_path) { } hash_table_shard::~hash_table_shard() { } bool hash_table_shard::has(uint64_t key) const { std::ifstream reader(filename_pos(), std::ios::binary); const size_t hash_pos = key % this->m_hash_table_size; reader.seekg(hash_pos * sizeof(size_t)); // Read page pos. size_t page_pos = SIZE_MAX; reader.read((char *)&page_pos, sizeof(size_t)); if (page_pos == SIZE_MAX) return false; // Read page. size_t page_len; reader.seekg(this->hash_table_byte_size() + page_pos, std::ios::beg); reader.read((char *)&page_len, sizeof(size_t)); std::vector> page(page_len); reader.read((char *)page.data(), page_len * sizeof(std::array)); // Find key among pages. for (const auto &page_item : page) { if (page_item[0] == key) { return true; } } return false; } std::string hash_table_shard::find(uint64_t key) const { size_t ver; return find(key, ver); } std::string hash_table_shard::find(uint64_t key, size_t &ver) const { std::ifstream reader(filename_pos(), std::ios::binary); const size_t hash_pos = key % this->m_hash_table_size; reader.seekg(hash_pos * sizeof(size_t)); // Read page pos. size_t page_pos = SIZE_MAX; reader.read((char *)&page_pos, sizeof(size_t)); if (page_pos == SIZE_MAX) return ""; // Read page. 
size_t page_len; reader.seekg(this->hash_table_byte_size() + page_pos, std::ios::beg); reader.read((char *)&page_len, sizeof(size_t)); std::vector> page(page_len); reader.read((char *)page.data(), page_len * sizeof(std::array)); // Find key among pages. size_t pos = SIZE_MAX; for (const auto &page_item : page) { if (page_item[0] == key) { pos = page_item[1]; ver = page_item[2]; } } if (pos == SIZE_MAX) return ""; return data_at_position(pos); } void hash_table_shard::for_each(std::function callback) const { std::ifstream infile(filename_data(), std::ios::binary); infile.seekg(0, std::ios::beg); while (!infile.eof()) { size_t key; if (!infile.read((char *)&key, sizeof(size_t))) break; size_t data_len; if (!infile.read((char *)&data_len, sizeof(size_t))) break; if (key == 0ull) { // Skip. infile.seekg(data_len, std::ios::cur); continue; } std::unique_ptr buffer_allocator; try { buffer_allocator = std::make_unique(data_len); } catch (std::bad_alloc &exception) { std::cout << "bad_alloc detected: " << exception.what() << " file: " << __FILE__ << " line: " << __LINE__ << std::endl; std::cout << "tried to allocate: " << data_len << " bytes" << std::endl; break; } char *buffer = buffer_allocator.get(); infile.read(buffer, data_len); std::stringstream ss(std::string(buffer, data_len)); boost::iostreams::filtering_istream decompress_stream; decompress_stream.push(boost::iostreams::gzip_decompressor()); decompress_stream.push(ss); std::stringstream decompressed; decompressed << decompress_stream.rdbuf(); const std::string value = decompressed.str(); callback(key, std::move(value)); } } void hash_table_shard::for_each_key(std::function callback) const { std::ifstream infile(filename_data(), std::ios::binary); infile.seekg(0, std::ios::beg); while (!infile.eof()) { size_t key; if (!infile.read((char *)&key, sizeof(size_t))) break; size_t data_len; if (!infile.read((char *)&data_len, sizeof(size_t))) break; infile.seekg(data_len, std::ios::cur); callback(key); } } size_t 
hash_table_shard::shard_id() const { return m_shard_id; } size_t hash_table_shard::size() const { auto pages = this->read_pages(); return std::transform_reduce(pages.cbegin(), pages.cend(), 0, [](auto a, auto b) { return a + b; }, [](const auto &p) { return p.size(); }); } size_t hash_table_shard::file_size() const { std::ifstream infile(filename_data(), std::ios::binary); infile.seekg(0, std::ios::end); return infile.tellg(); } std::string hash_table_shard::data_at_position(size_t pos) const { std::ifstream infile(filename_data(), std::ios::binary); infile.seekg(pos, std::ios::beg); // Read key uint64_t read_key; infile.read((char *)&read_key, sizeof(uint64_t)); // Read data length. size_t data_len; infile.read((char *)&data_len, sizeof(size_t)); std::unique_ptr buffer_allocator; try { buffer_allocator = std::make_unique(data_len); } catch (std::bad_alloc &exception) { std::cout << "bad_alloc detected: " << exception.what() << " file: " << __FILE__ << " line: " << __LINE__ << std::endl; std::cout << "tried to allocate: " << data_len << " bytes" << std::endl; return ""; } char *buffer = buffer_allocator.get(); infile.read(buffer, data_len); std::stringstream ss(std::string(buffer, data_len)); boost::iostreams::filtering_istream decompress_stream; decompress_stream.push(boost::iostreams::gzip_decompressor()); decompress_stream.push(ss); std::stringstream decompressed; decompressed << decompress_stream.rdbuf(); return decompressed.str(); } } ================================================ FILE: src/hash_table2/hash_table_shard.h ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. 
*
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include
#include
#include
#include
#include "config.h"
#include "hash_table_shard_base.h"

namespace hash_table2 {

	/*
	 * Read-only view of one shard of the hash table. Every call opens the shard files
	 * (filename_pos() / filename_data() from hash_table_shard_base) anew, nothing is cached in memory.
	 * */
	class hash_table_shard : public hash_table_shard_base {

		public:

			/*
			 * All arguments are passed on unchanged to hash_table_shard_base.
			 * */
			hash_table_shard(const std::string &db_name, size_t shard_id, size_t hash_table_size = 1000000,
					const std::string &data_path = config::data_path() + "/{shard_id_mod_8}/hash_table");
			~hash_table_shard();

			/*
			 * Checks if the key exists in the hash table.
			 * */
			bool has(uint64_t key) const;

			/*
			 * Finds a value for the given key. Returns empty string if key is not present.
			 * */
			std::string find(uint64_t key) const;

			/*
			 * Finds a value for the given key. Returns empty string if key is not present. Also sets version in 'ver'
			 * */
			std::string find(uint64_t key, size_t &ver) const;

			/*
			 * Loop over all elements in hash table shard and call the given function.
			 * for_each passes key and decompressed value and skips records with key 0,
			 * for_each_key passes only the key and does not read the value.
			 * */
			void for_each(std::function) const;
			void for_each_key(std::function) const;

			/*
			 * Returns the id of the shard.
			 * */
			size_t shard_id() const;

			/*
			 * Returns the number of elements in the shard.
			 * */
			size_t size() const;

			/*
			 * Returns the size of the data file in bytes.
			 * */
			size_t file_size() const;

		private:

			// Reads and decompresses the record that starts at byte 'pos' of the data file.
			std::string data_at_position(size_t pos) const;

	};

}

================================================
FILE: src/hash_table2/hash_table_shard_base.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, , et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
*/ #pragma once #include #include #include #include namespace hash_table2 { class hash_table_shard_base { public: hash_table_shard_base(const std::string &db_name, size_t shard_id, size_t hash_table_size = 1000000, const std::string &data_path = config::data_path() + "/{shard_id_mod_8}/hash_table") : m_db_name(db_name), m_shard_id(shard_id), m_hash_table_size(hash_table_size), m_data_path(data_path) {} std::string file_base_data() const { const size_t disk_shard = m_shard_id % 8; std::string data_path = m_data_path; if (data_path.find("{shard_id_mod_8}") != std::string::npos) { data_path.replace(data_path.find("{shard_id_mod_8}"), 16, std::to_string(disk_shard)); } return data_path + "/ht_" + m_db_name + "_" + std::to_string(m_shard_id); } std::string file_base() const { const size_t disk_shard = m_shard_id % 8; std::string data_path = config::data_path() + "/{shard_id_mod_8}/hash_table"; if (data_path.find("{shard_id_mod_8}") != std::string::npos) { data_path.replace(data_path.find("{shard_id_mod_8}"), 16, std::to_string(disk_shard)); } return data_path + "/ht_" + m_db_name + "_" + std::to_string(m_shard_id); } std::string filename_data() const { return file_base_data() + ".data"; } std::string filename_pos() const { return file_base() + ".pos"; } std::string filename_data_tmp() const { return file_base() + ".data.tmp"; } protected: const std::string m_db_name; size_t m_shard_id; size_t m_hash_table_size; const std::string m_data_path; size_t hash_table_byte_size() const { return m_hash_table_size * sizeof(size_t); } std::vector>> read_pages() const { std::ifstream infile(filename_pos(), std::ios::binary); return read_pages(infile); } std::vector>> read_pages(std::ifstream &infile) const { const size_t max_records = 10000; const size_t record_len = sizeof(std::array); const size_t buffer_len = record_len * max_records; auto buffer_allocator = std::make_unique(buffer_len); char *buffer = buffer_allocator.get(); std::vector>> ret(this->m_hash_table_size); if 
(infile.is_open()) { infile.seekg(this->hash_table_byte_size()); do { size_t num_keys; infile.read((char *)&num_keys, sizeof(size_t)); if (infile.eof()) break; if (num_keys > max_records) { break; } const size_t len = record_len * num_keys; infile.read(buffer, len); for (size_t i = 0; i < len; i += record_len) { const uint64_t key = *((uint64_t *)&buffer[i]); const size_t page_id = key % this->m_hash_table_size; const size_t pos = *((size_t *)&buffer[i + sizeof(uint64_t)]); const size_t version = *((size_t *)&buffer[i + sizeof(uint64_t) + sizeof(size_t)]); ret[page_id].emplace_back(std::array{key, (uint64_t)pos, (uint64_t)version}); } } while (!infile.eof()); } return ret; } }; } ================================================ FILE: src/hash_table2/hash_table_shard_builder.cpp ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #include #include "config.h" #include "hash_table_shard_builder.h" #include "logger/logger.h" #include "file/file.h" #include "indexer/merger.h" #include #include namespace hash_table2 { hash_table_shard_builder::hash_table_shard_builder(const std::string &db_name, size_t shard_id, size_t hash_table_size, const std::string &data_path) : hash_table_shard_base(db_name, shard_id, hash_table_size, data_path) { indexer::merger::register_appender((size_t)this, [this]() {append();}, [this]() { return cache_size(); }); indexer::merger::register_merger((size_t)this, [this]() {merge();}); } hash_table_shard_builder::~hash_table_shard_builder() { indexer::merger::deregister_merger((size_t)this); } void hash_table_shard_builder::add(uint64_t key, const std::string &value, size_t version) { indexer::merger::lock(); std::lock_guard guard(m_lock); auto ver_iter = m_version.find(key); if (version > 0 && ver_iter != m_version.end() && ver_iter->second > version) { // do nothing } else { m_data_size += value.capacity(); m_cache[key] = value; m_version[key] = version; } } void hash_table_shard_builder::remove(uint64_t key) { m_remove_keys.push_back(key); } size_t hash_table_shard_builder::cache_size() const { // This is an OK approximation since m_data_size will be much larger than the keys. 
return m_cache.size() * sizeof(uint64_t) * 2 + m_data_size; } void hash_table_shard_builder::append() { std::lock_guard guard(m_lock); ofstream outfile(this->filename_data_tmp(), ios::binary | ios::app); for (const auto &iter : m_cache) { const size_t version = m_version[iter.first]; outfile.write((char *)&iter.first, sizeof(uint64_t)); outfile.write((char *)&version, sizeof(size_t)); // Compress data std::stringstream ss(iter.second); boost::iostreams::filtering_istream compress_stream; compress_stream.push(boost::iostreams::gzip_compressor()); compress_stream.push(ss); std::stringstream compressed; compressed << compress_stream.rdbuf(); std::string compressed_string(compressed.str()); const size_t data_len = compressed_string.size(); outfile.write((char *)&data_len, sizeof(size_t)); outfile.write(compressed_string.c_str(), data_len); } // Free RAM caches and set m_data_size to zero. m_cache = std::map{}; m_version = std::map{}; m_data_size = 0; } void hash_table_shard_builder::merge() { auto pages = this->read_pages(); const size_t buffer_len = 1024*1024*20; std::unique_ptr buffer_allocator; try { buffer_allocator = std::make_unique(buffer_len); } catch (std::bad_alloc &exception) { std::cout << "bad_alloc detected: " << exception.what() << " file: " << __FILE__ << " line: " << __LINE__ << std::endl; std::cout << "tried to allocate: " << buffer_len << " bytes" << std::endl; return; } char *buffer = buffer_allocator.get(); // Read append cache and add to pages + data file. 
std::ifstream infile(this->filename_data_tmp(), std::ios::binary); std::ofstream outfile(this->filename_data(), std::ios::binary | std::ios::app); size_t last_pos = outfile.tellp(); while (!infile.eof()) { uint64_t key; if (!infile.read((char *)&key, sizeof(uint64_t))) break; size_t version; if (!infile.read((char *)&version, sizeof(size_t))) break; size_t data_len; if (!infile.read((char *)&data_len, sizeof(size_t))) break; if (data_len > buffer_len) { LOG_INFO("data_len " + std::to_string(data_len) + "is larger than buffer_len " + std::to_string(buffer_len) + " in file " + filename_data()); infile.seekg(data_len, ios::cur); continue; } else { if (!infile.read(buffer, data_len)) break; } const size_t page_id = key % this->m_hash_table_size; const std::array elem{key, last_pos, version}; auto insert_at = std::upper_bound(pages[page_id].begin(), pages[page_id].end(), elem, [](const auto &a, const auto &b) { return a[0] < b[0]; }); // insert_at points to the element after "elem" bool add_data = false; if (pages[page_id].size() == 0) { pages[page_id].push_back(elem); add_data = true; } else { const auto elem_at = *(insert_at - 1); if (elem_at[0] == elem[0]) { // If version is bigger on the new element. Replace element. if (elem_at[2] <= elem[2]) { *(insert_at - 1) = elem; add_data = true; } } else { pages[page_id].insert(insert_at, elem); add_data = true; } } if (add_data) { outfile.write((char *)&key, sizeof(uint64_t)); outfile.write((char *)&data_len, sizeof(size_t)); outfile.write(buffer, data_len); last_pos += data_len + sizeof(uint64_t) + sizeof(size_t); } } // Delete cache file. file::delete_file(this->filename_data_tmp()); // Remove keys that are in m_remove_keys. 
remove_keys_from_pages(pages); m_remove_keys = std::vector{}; write_pages(pages); } void hash_table_shard_builder::optimize() { auto pages = this->read_pages(); std::ifstream infile(this->filename_data(), std::ios::binary); std::ofstream outfile(this->filename_data_tmp(), std::ios::binary | std::ios::trunc); read_optimized_to(pages, infile, outfile); outfile.close(); file::delete_file(filename_data()); file::delete_file(filename_pos()); merge(); } void hash_table_shard_builder::truncate() { std::lock_guard guard(m_lock); ofstream outfile(this->filename_data(), ios::binary | ios::trunc); ofstream outfile_pos(this->filename_pos(), ios::binary | ios::trunc); file::delete_file(this->filename_data_tmp()); } void hash_table_shard_builder::merge_with(const hash_table_shard_builder &other) { merge_with(other.filename_pos(), other.filename_data()); } void hash_table_shard_builder::merge_with(const std::string &pos_file, const std::string &data_file) { std::ifstream other_posfile(pos_file, std::ios::binary); auto pages1 = this->read_pages(); auto pages2 = this->read_pages(other_posfile); // Remove the pages in pages1 that have higher version number in pages2 and vise versa. 
for (size_t p = 0; p < pages1.size(); p++) { size_t i = 0, j = 0; while (i < pages1[p].size() && j < pages2[p].size()) { if (pages1[p][i][0] == pages2[p][j][0]) { if (pages1[p][i][2] < pages2[p][j][2]) { // delete pages1[p][i] pages1[p][i][1] = SIZE_MAX; } else { // delete pages2[p][j] pages2[p][j][1] = SIZE_MAX; } i++; j++; } else if (pages1[p][i][0] < pages2[p][j][0]) { i++; } else { j++; } } } std::ofstream outfile(this->filename_data_tmp(), std::ios::binary | std::ios::trunc); std::ifstream data_file_2(data_file, std::ios::binary); read_optimized_to(pages2, data_file_2, outfile); outfile.close(); merge(); } void hash_table_shard_builder::read_optimized_to(const std::vector>> &pages, std::ifstream &infile, std::ofstream &outfile) const { infile.seekg(0, std::ios::beg); while (!infile.eof()) { const size_t my_pos = infile.tellg(); size_t key; if (!infile.read((char *)&key, sizeof(size_t))) break; size_t data_len; if (!infile.read((char *)&data_len, sizeof(size_t))) break; const size_t page_id = key % this->m_hash_table_size; std::array elem{key, (uint64_t)0, (uint64_t)0}; auto iter = std::upper_bound(pages[page_id].cbegin(), pages[page_id].cend(), elem, [](const auto &a, const auto &b) { return a[0] < b[0]; }); if (pages[page_id].size() == 0) { // Skip. Did not find key. infile.seekg(data_len, std::ios::cur); continue; } elem = *(iter - 1); if (elem[0] == key && elem[1] == my_pos) { std::unique_ptr buffer_allocator; try { buffer_allocator = std::make_unique(data_len); } catch (std::bad_alloc &exception) { std::cout << "bad_alloc detected: " << exception.what() << " file: " << __FILE__ << " line: " << __LINE__ << std::endl; std::cout << "tried to allocate: " << data_len << " bytes" << std::endl; break; } char *buffer = buffer_allocator.get(); infile.read(buffer, data_len); // Keep this data. 
const size_t version = elem[2]; outfile.write((char *)&key, sizeof(uint64_t)); outfile.write((char *)&version, sizeof(size_t)); outfile.write((char *)&data_len, sizeof(size_t)); outfile.write(buffer, data_len); } else { // Ignore data. infile.seekg(data_len, std::ios::cur); } } } void hash_table_shard_builder::write_pages(const std::vector>> &pages) { std::ofstream key_writer(this->filename_pos(), std::ios::binary | std::ios::trunc); const size_t page_item_size = sizeof(std::array); const size_t empty_key = SIZE_MAX; size_t last_pos = 0; for (size_t page_id = 0; page_id < pages.size(); page_id++) { const size_t page_len = pages[page_id].size(); if (page_len) { key_writer.write((char *)&last_pos, sizeof(size_t)); last_pos += pages[page_id].size() * page_item_size + sizeof(size_t); } else { key_writer.write((char *)&empty_key, sizeof(size_t)); } } // Write pages. for (size_t page_id = 0; page_id < pages.size(); page_id++) { const size_t page_len = pages[page_id].size(); if (page_len) { key_writer.write((char *)&page_len, sizeof(size_t)); for (const auto &page_item : pages[page_id]) { key_writer.write((char *)&page_item, page_item_size); } } } } void hash_table_shard_builder::remove_keys_from_pages(std::vector>> &pages) { for (auto key : m_remove_keys) { const size_t page_id = key % this->m_hash_table_size; std::array elem{key, (uint64_t)0, (uint64_t)0}; auto iter = std::upper_bound(pages[page_id].cbegin(), pages[page_id].cend(), elem, [](const auto &a, const auto &b) { return a[0] < b[0]; }); iter--; if ((*iter)[0] == key) { // remove the key from the page. pages[page_id].erase(iter); } } } } ================================================ FILE: src/hash_table2/hash_table_shard_builder.h ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #pragma once #include #include #include #include "hash_table.h" #include "hash_table_shard_base.h" namespace hash_table2 { /* * Implementation of a hash table shard. * * usage: * hash_table_shard shard("test_db", 0); * shard.add(12345, "test data", 3); * shard.add(12345, "new test data", 4); * * shard.append(); * shard.merge(); * * */ class hash_table_shard_builder : public hash_table_shard_base { public: hash_table_shard_builder(const std::string &db_name, size_t shard_id, size_t hash_table_size = 1000000, const std::string &data_path = config::data_path() + "/{shard_id_mod_8}/hash_table"); ~hash_table_shard_builder(); /* * Add key/value pair to hash table. * */ void add(uint64_t key, const std::string &value, size_t version = 0); /* * Remove key from hash table. * */ void remove(uint64_t key); /* * Return approximation of amount of memory in cache. * */ size_t cache_size() const; /* * Write memory cache to disc cache. 
* */ void append(); /* * Write disc cache to persistant hash table. * */ void merge(); /* * Optimize persistant has table to remove data for unused versions. * */ void optimize(); /* * Delete all data in shard. * */ void truncate(); /* * Merge with another shard. Handles key collisions by keeping the one with highest version. * */ void merge_with(const hash_table_shard_builder &other); /* * Merge with another pos and datafile. * */ void merge_with(const std::string &pos_file, const std::string &data_file); private: std::map m_cache; std::map m_version; std::vector m_remove_keys; std::map m_sort_pos; std::mutex m_lock; size_t m_data_size = 0; void read_optimized_to(const std::vector>> &pages, std::ifstream &infile, std::ofstream &outfile) const; void write_pages(const std::vector>> &pages); void remove_keys_from_pages(std::vector>> &pages); }; } ================================================ FILE: src/hash_table_helper/hash_table_helper.cpp ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include "config.h" #include "hash_table_helper.h" #include "logger/logger.h" namespace hash_table_helper { void truncate(const std::string &hash_table_name) { std::vector shards = create_shard_builders(hash_table_name); for (auto shard : shards) { shard->truncate(); } delete_shard_builders(shards); } std::vector create_shard_builders(const std::string &hash_table_name) { std::vector shards; for (size_t shard_id = 0; shard_id < config::ht_num_shards; shard_id++) { shards.push_back(new hash_table2::hash_table_shard_builder(hash_table_name, shard_id)); } return shards; } void delete_shard_builders(std::vector &shards) { for (auto shard : shards) { delete shard; } shards.clear(); } } ================================================ FILE: src/hash_table_helper/hash_table_helper.h ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #pragma once #include #include "hash_table2/hash_table.h" #include "hash_table2/hash_table_shard_builder.h" namespace hash_table_helper { void truncate(const std::string &hash_table_name); std::vector create_shard_builders(const std::string &hash_table_name); void delete_shard_builders(std::vector &shards); } ================================================ FILE: src/http/request.cpp ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #include "request.h" namespace http { request::request(const URL &url, std::string request_method, std::string request_body) : m_url(url), m_request_method(request_method), m_request_body(request_body) { } } ================================================ FILE: src/http/request.h ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #pragma once #include #include "URL.h" namespace http { class request { public: request(const URL &url, std::string request_method = "POST", std::string request_body = ""); const URL& url() const { return m_url; } const std::string &request_method() const { return m_request_method; } const std::string &request_body() const { return m_request_body; } private: size_t m_code = 200; URL m_url; std::string m_request_method; std::string m_request_body; }; } ================================================ FILE: src/http/response.h ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #pragma once #include namespace http { class response { public: void code(size_t code) { m_code = code; } size_t code() const { return m_code; } void body(const std::string &body) { m_body = body; } const std::string &body() const { return m_body; } void content_type(const std::string &content_type) { m_content_type = content_type; } const std::string &content_type() const { return m_content_type; } private: size_t m_code = 200; std::string m_body = ""; std::string m_content_type = "text/html"; }; } ================================================ FILE: src/http/server.cpp ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #include "server.h" #include "fcgio.h" #include "logger/logger.h" #include "URL.h" #include #include namespace http { server::server(std::function handler) { m_handler = handler; start(); } void server::run_worker(int socket_id) { const size_t max_post_len = 1024*1024*1024; const size_t buffer_len = 1024*1024; std::unique_ptr buffer_allocator = std::make_unique(buffer_len); char *buffer = buffer_allocator.get(); FCGX_Request request; FCGX_InitRequest(&request, socket_id, 0); LOG_INFO("Server has started..."); while (true) { m_lock.lock(); int accept_response = FCGX_Accept_r(&request); m_lock.unlock(); if (accept_response < 0) { break; } const char *uri_ptr = FCGX_GetParam("REQUEST_URI", request.envp); const char *req_ptr = FCGX_GetParam("REQUEST_METHOD", request.envp); if ((uri_ptr == nullptr) || (req_ptr == nullptr)) { FCGX_Finish_r(&request); continue; } std::string uri(uri_ptr); std::string request_method(req_ptr); LOG_INFO("Serving request: " + uri); URL url("http://alexandria.org" + uri); std::string post_data; if (request_method == "POST") { while (true) { const size_t read_bytes = FCGX_GetStr(buffer, buffer_len, request.in); if (read_bytes == 0) break; if (post_data.size() + read_bytes > max_post_len) { LOG_ERROR("Posted data larger then " + std::to_string(max_post_len) + ", ignoring request"); break; } post_data.append(buffer, read_bytes); } } ::http::request http_request(url, request_method, post_data); ::http::response http_response = m_handler(http_request); const std::string data_out = http_response.body(); // Output response const std::string content_type = std::string("Content-type: ") + http_response.content_type() + "\r\n"; const std::string status = std::string("Status: ") + std::to_string(http_response.code()) + "\r\n"; const std::string end_req = "\r\n"; FCGX_FPrintF(request.out, status.c_str()); FCGX_FPrintF(request.out, content_type.c_str()); FCGX_FPrintF(request.out, end_req.c_str()); FCGX_PutStr(data_out.c_str(), data_out.size(), 
request.out); FCGX_Finish_r(&request); } FCGX_Free(&request, true); } void server::start() { FCGX_Init(); int socket_id = FCGX_OpenSocket("127.0.0.1:8000", 20); if (socket_id < 0) { LOG_INFO("Could not open socket, exiting"); return; } std::vector threads; for (size_t i = 0; i < m_workers; i++) { threads.emplace_back(std::move(std::thread([this](int socket_id){ run_worker(socket_id); }, socket_id))); } for (auto &thread : threads) { thread.join(); } close(socket_id); } } ================================================ FILE: src/http/server.h ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #pragma once #include #include #include #include "request.h" #include "response.h" namespace http { class server { public: server(std::function<::http::response(const ::http::request &)> handler); private: std::function<::http::response(const ::http::request &)> m_handler; size_t m_port = 8080; size_t m_workers = 8; std::mutex m_lock; void run_worker(int socket_id); void start(); }; } ================================================ FILE: src/indexer/basic_index.h ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #pragma once #include "index_reader.h" #include "index_base.h" #include namespace indexer { template class basic_index : public index_base { public: explicit basic_index(const std::string &file_name); explicit basic_index(const std::string &db_name, size_t id); explicit basic_index(const std::string &db_name, size_t id, size_t hash_table_size); explicit basic_index(std::istream *reader, size_t hash_table_size); ~basic_index(); std::vector find(uint64_t key) const; std::vector find(uint64_t key, size_t limit) const; std::unique_ptr find_ptr(uint64_t key, size_t &num_records) const; std::unique_ptr find_ptr(uint64_t key, size_t limit, size_t &num_records) const; size_t find_count(uint64_t key) const; /* * Iterates the keys of the index and calls the callback with key and vector of records for that key. * */ void for_each(std::function &recs)> on_each_key) const; void for_each_key(std::function on_each_key) const; private: mutable std::istream *m_reader; std::unique_ptr m_default_reader; std::string m_file_name; std::string m_db_name; size_t m_id; size_t m_unique_count = 0; size_t read_key_pos(uint64_t key) const; void read_meta(); std::string mountpoint() const; std::string filename() const; std::string meta_filename() const; }; template basic_index::basic_index(const std::string &file_name) : index_base(), m_file_name(file_name) { m_default_reader = std::make_unique(filename(), std::ios::binary); m_reader = m_default_reader.get(); } template basic_index::basic_index(const std::string &db_name, size_t id) : index_base(), m_db_name(db_name), m_id(id) { m_default_reader = std::make_unique(filename(), std::ios::binary); m_reader = m_default_reader.get(); } template basic_index::basic_index(const std::string &db_name, size_t id, size_t hash_table_size) : index_base(hash_table_size), m_db_name(db_name), m_id(id) { m_default_reader = std::make_unique(filename(), std::ios::binary); m_reader = m_default_reader.get(); } template basic_index::basic_index(std::istream 
*reader, size_t hash_table_size) : index_base(hash_table_size) { m_reader = reader; } template basic_index::~basic_index() { } template std::vector basic_index::find(uint64_t key) const { return find(key, 0); } template std::vector basic_index::find(uint64_t key, size_t limit) const { std::lock_guard lock(this->m_lock); size_t num_records; unique_ptr ptr = find_ptr(key, limit, num_records); std::vector ret; for (size_t i = 0; i < num_records; i++) { ret.push_back(ptr[i]); } return ret; } template std::unique_ptr basic_index::find_ptr(uint64_t key, size_t &num_records) const { return find_ptr(key, 0, num_records); } template std::unique_ptr basic_index::find_ptr(uint64_t key, size_t limit, size_t &num_records) const { std::lock_guard lock(this->m_lock); num_records = 0; size_t key_pos = read_key_pos(key); if (key_pos == SIZE_MAX) { return {}; } // Read page. m_reader->seekg(key_pos); size_t num_keys; m_reader->read((char *)&num_keys, sizeof(size_t)); std::unique_ptr keys_allocator = std::make_unique(num_keys); uint64_t *keys = keys_allocator.get(); m_reader->read((char *)keys, num_keys * sizeof(uint64_t)); size_t key_data_pos = SIZE_MAX; for (size_t i = 0; i < num_keys; i++) { if (keys[i] == key) { key_data_pos = i; } } if (key_data_pos == SIZE_MAX) { return {}; } char buffer[64]; // Read position and length. 
m_reader->seekg(key_pos + 8 + num_keys * 8 + key_data_pos * 8); m_reader->read(buffer, 8); size_t pos = *((size_t *)(&buffer[0])); m_reader->seekg(key_pos + 8 + (num_keys * 8)*2 + key_data_pos * 8); m_reader->read(buffer, 8); size_t len = *((size_t *)(&buffer[0])); m_reader->seekg(key_pos + 8 + (num_keys * 8)*3 + pos); num_records = len / sizeof(data_record); if (limit && num_records > limit) { num_records = limit; len = num_records * sizeof(data_record); } std::unique_ptr ret = std::make_unique(num_records); m_reader->read((char *)ret.get(), len); return ret; } template size_t basic_index::find_count(uint64_t key) const { std::lock_guard lock(this->m_lock); size_t key_pos = read_key_pos(key); if (key_pos == SIZE_MAX) { return 0; } // Read page. m_reader->seekg(key_pos); size_t num_keys; m_reader->read((char *)&num_keys, sizeof(size_t)); std::unique_ptr keys_allocator = std::make_unique(num_keys); uint64_t *keys = keys_allocator.get(); m_reader->read((char *)keys, num_keys * sizeof(uint64_t)); size_t key_data_pos = SIZE_MAX; for (size_t i = 0; i < num_keys; i++) { if (keys[i] == key) { key_data_pos = i; } } if (key_data_pos == SIZE_MAX) { return 0; } char buffer[64]; // Read length only. m_reader->seekg(key_pos + 8 + (num_keys * 8)*2 + key_data_pos * 8); m_reader->read(buffer, 8); size_t len = *((size_t *)(&buffer[0])); return len / sizeof(data_record); } /* * Iterates the keys of the index and calls the callback with key and vector of records for that key. * */ template void basic_index::for_each(std::function &recs)> on_each_key) const { std::ifstream reader(filename(), std::ios::binary); reader.seekg(this->hash_table_byte_size(), std::ios::beg); std::map> page; while (this->read_page_into(reader, page)) { for (auto &iter : page) { on_each_key(iter.first, iter.second); } page.clear(); } } /* * Reads the exact position of the key, returns SIZE_MAX if the key was not found. 
* */ template size_t basic_index::read_key_pos(uint64_t key) const { if (this->m_hash_table_size == 0) return 0; const size_t hash_pos = key % this->m_hash_table_size; if (!m_reader->seekg(hash_pos * sizeof(size_t))) return SIZE_MAX; size_t pos; m_reader->read((char *)&pos, sizeof(size_t)); return pos; } /* * Reads the count of unique recprds from the count file and puts it in the m_unique_count member. * */ template void basic_index::read_meta() { struct meta { size_t unique_count; }; meta m; std::ifstream meta_reader(meta_filename(), std::ios::binary); if (meta_reader.is_open()) { meta_reader.read((char *)(&m), sizeof(meta)); } m_unique_count = m.unique_count; } template std::string basic_index::mountpoint() const { return std::to_string(m_id % 8); } template std::string basic_index::filename() const { if (m_file_name != "") return m_file_name + ".data"; return config::data_path() + "/" + mountpoint() + "/full_text/" + m_db_name + "/" + std::to_string(m_id) + ".data"; } template std::string basic_index::meta_filename() const { if (m_file_name != "") return m_file_name + ".meta"; return config::data_path() + "/" + mountpoint() + "/full_text/" + m_db_name + "/" + std::to_string(m_id) + ".meta"; } } ================================================ FILE: src/indexer/basic_index_builder.h ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
 */

#pragma once

// NOTE(review): the targets of the following bare #include directives are missing in this copy
// of the file (the <...> parts appear to have been stripped) - restore them from version control.
#include
#include
#include
#include
#include
#include
#include
#include
#include "merger.h"
#include "score_builder.h"
#include "algorithm/hyper_log_log.h"
#include "config.h"
#include "profiler/profiler.h"
#include "logger/logger.h"
#include "memory/debugger.h"
#include "file/file.h"
#include "index_base.h"

namespace indexer {

	/*
	 * Builder for a basic index file. Records are collected in memory with add(), flushed to the
	 * cache files with append() and combined with the existing data file by merge().
	 *
	 * NOTE(review): template parameter lists and template arguments are missing throughout this copy.
	 * The member functions use a record type called data_record - confirm against the original header.
	 * */
	template
	class basic_index_builder : public index_base{
	private:
		// Non copyable
		basic_index_builder(const basic_index_builder &);
		basic_index_builder& operator=(const basic_index_builder &);
	public:

		// Builder writing to file_name + ".data" / ".cache" / ".cache.keys".
		basic_index_builder(const std::string &file_name);
		// Builders writing below config::data_path(), addressed by db_name and id.
		basic_index_builder(const std::string &db_name, size_t id);
		basic_index_builder(const std::string &db_name, size_t id, size_t hash_table_size);
		basic_index_builder(const std::string &db_name, size_t id, size_t hash_table_size, size_t max_results);
		~basic_index_builder();

		// Queues one (key, record) pair in the in-memory caches.
		void add(uint64_t key, const data_record &record);
		// Allocated size in bytes of the in-memory key and record caches.
		size_t cache_size() const;

		// Appends the in-memory caches to the cache files and releases the memory.
		void append();
		// Reads the data file and the cache files, sorts, rewrites the data file and removes the cache files.
		void merge();
		// Rewrites the data file with the given function applied to every record.
		void transform(const std::function &transform);
		// Rewrites the data file with every record list sorted by the given comparator.
		void sort_by(const std::function sort_by);

		// Deletes all data of this shard (data file and cache files).
		void truncate();
		// Clears the in-memory page cache and deletes the cache files.
		void truncate_cache_files();
		// Creates the eight mount directories for m_db_name below config::data_path().
		void create_directories();

	private:

		std::string m_file_name;
		std::string m_db_name;
		const size_t m_id;
		// Upper bound on records kept per key, see sort_record_list().
		const size_t m_max_results;

		const size_t m_buffer_len = config::ft_shard_builder_buffer_len;
		// NOTE(review): m_buffer is never initialised or used by the member functions in this header.
		char *m_buffer;
		// Guards m_key_cache / m_record_cache in add().
		std::mutex m_lock;

		// Caches
		std::vector m_key_cache;
		std::vector m_record_cache;
		// Key -> records, holds the content of the data file (plus cache files) while rewriting it.
		std::map> m_cache;

		void read_append_cache();
		void read_data_to_cache();
		void sort_cache();
		void sort_record_list(uint64_t key, std::vector &records);
		void reset_cache_variables();
		void save_file();
		void write_key(std::ofstream &key_writer, uint64_t key, size_t page_pos);
		size_t write_page(std::ofstream &writer, const std::vector &keys);
		void reset_key_map(std::ofstream &key_writer);

		std::string mountpoint() const;
		std::string cache_filename() const;
		std::string key_cache_filename() const;
		std::string target_filename() const;
		std::string meta_filename() const;

	};

	template
basic_index_builder::basic_index_builder(const std::string &file_name) : index_base(), m_file_name(file_name), m_id(0), m_max_results(config::ft_max_results_per_section) { merger::register_merger((size_t)this, [this]() {merge();}); merger::register_appender((size_t)this, [this]() {append();}, [this]() { return cache_size(); }); } template basic_index_builder::basic_index_builder(const std::string &db_name, size_t id) : index_base(), m_db_name(db_name), m_id(id), m_max_results(config::ft_max_results_per_section) { merger::register_merger((size_t)this, [this]() {merge();}); merger::register_appender((size_t)this, [this]() {append();}, [this]() { return cache_size(); }); } template basic_index_builder::basic_index_builder(const std::string &db_name, size_t id, size_t hash_table_size) : index_base(hash_table_size), m_db_name(db_name), m_id(id), m_max_results(config::ft_max_results_per_section) { merger::register_merger((size_t)this, [this]() {append();}); merger::register_appender((size_t)this, [this]() {append();}, [this]() { return cache_size(); }); } template basic_index_builder::basic_index_builder(const std::string &db_name, size_t id, size_t hash_table_size, size_t max_results) : index_base(hash_table_size), m_db_name(db_name), m_id(id), m_max_results(max_results) { merger::register_merger((size_t)this, [this]() {append();}); merger::register_appender((size_t)this, [this]() {append();}, [this]() { return cache_size(); }); } template basic_index_builder::~basic_index_builder() { merger::deregister_merger((size_t)this); } template void basic_index_builder::add(uint64_t key, const data_record &record) { indexer::merger::lock(); m_lock.lock(); // Amortized constant m_key_cache.push_back(key); m_record_cache.push_back(record); assert(m_record_cache.size() == m_key_cache.size()); m_lock.unlock(); } /* * Returns the allocated size of the cache (m_key_cache and m_record_cache). 
* */ template size_t basic_index_builder::cache_size() const { return m_key_cache.capacity() * sizeof(uint64_t) + m_record_cache.capacity() * sizeof(data_record); } template void basic_index_builder::append() { assert(m_record_cache.size() == m_key_cache.size()); std::ofstream record_writer(cache_filename(), std::ios::binary | std::ios::app); if (!record_writer.is_open()) { throw LOG_ERROR_EXCEPTION("Could not open full text shard (" + cache_filename() + "). Error: " + std::string(strerror(errno))); } std::ofstream key_writer(key_cache_filename(), std::ios::binary | std::ios::app); if (!key_writer.is_open()) { throw LOG_ERROR_EXCEPTION("Could not open full text shard (" + key_cache_filename() + "). Error: " + std::string(strerror(errno))); } record_writer.write((const char *)m_record_cache.data(), m_record_cache.size() * sizeof(data_record)); key_writer.write((const char *)m_key_cache.data(), m_key_cache.size() * sizeof(uint64_t)); m_record_cache.clear(); m_key_cache.clear(); m_record_cache.shrink_to_fit(); m_key_cache.shrink_to_fit(); } template void basic_index_builder::merge() { { read_append_cache(); sort_cache(); save_file(); truncate_cache_files(); } } /* Transforms all the bitmaps in the index. Basically generating new bitmaps with the transform applied. */ template void basic_index_builder::transform(const std::function &transform) { read_data_to_cache(); // Apply transforms. for (auto &iter : m_cache) { for (size_t i = 0; i < iter.second.size(); i++) { iter.second[i] = transform(iter.second[i], iter.second.size()); } } save_file(); truncate_cache_files(); } template void basic_index_builder::sort_by(const std::function comp) { read_data_to_cache(); for (auto &iter : m_cache) { sort(iter.second.begin(), iter.second.end(), comp); } save_file(); truncate_cache_files(); } /* Deletes ALL data from this shard. 
*/ template void basic_index_builder::truncate() { create_directories(); truncate_cache_files(); std::ofstream target_writer(target_filename(), std::ios::trunc); target_writer.close(); } /* Deletes all data from caches. */ template void basic_index_builder::truncate_cache_files() { reset_cache_variables(); file::delete_file(cache_filename()); file::delete_file(key_cache_filename()); } template void basic_index_builder::create_directories() { for (size_t i = 0; i < 8; i++) { boost::filesystem::create_directories(config::data_path() + "/" + std::to_string(i) + "/full_text/" + m_db_name); } } template void basic_index_builder::read_append_cache() { // Read the current file. read_data_to_cache(); //profiler::instance prof("index_builder::read_append_cache"); // Read the cache into memory. std::ifstream reader(cache_filename(), std::ios::binary); if (!reader.is_open()) { throw LOG_ERROR_EXCEPTION("Could not open full text shard (" + cache_filename() + "). Error: " + std::string(strerror(errno))); } std::ifstream key_reader(key_cache_filename(), std::ios::binary); if (!key_reader.is_open()) { throw LOG_ERROR_EXCEPTION("Could not open full text shard (" + key_cache_filename() + "). 
Error: " + std::string(strerror(errno))); } const size_t buffer_len = 10000; std::unique_ptr buffer_allocator; try { buffer_allocator = std::make_unique(buffer_len); } catch (std::bad_alloc &exception) { std::cout << "bad_alloc detected: " << exception.what() << " file: " << __FILE__ << " line: " << __LINE__ << std::endl; std::cout << "tried to allocate: " << buffer_len * sizeof(data_record) << " bytes" << std::endl; return; } std::unique_ptr key_buffer_allocator; try { key_buffer_allocator = std::make_unique(buffer_len); } catch (std::bad_alloc &exception) { std::cout << "bad_alloc detected: " << exception.what() << " file: " << __FILE__ << " line: " << __LINE__ << std::endl; std::cout << "tried to allocate: " << buffer_len * sizeof(uint64_t) << " bytes" << std::endl; return; } data_record *buffer = buffer_allocator.get(); uint64_t *key_buffer = key_buffer_allocator.get(); reader.seekg(0, std::ios::beg); unordered_map internal_id_map; unordered_map> bitmap_data; while (!reader.eof()) { reader.read((char *)buffer, buffer_len * sizeof(data_record)); key_reader.read((char *)key_buffer, buffer_len * sizeof(uint64_t)); const size_t read_bytes = reader.gcount(); const size_t num_records = read_bytes / sizeof(data_record); for (size_t i = 0; i < num_records; i++) { m_cache[key_buffer[i]].push_back(buffer[i]); } } } /* * Reads the file into RAM. 
* */ template void basic_index_builder::read_data_to_cache() { reset_cache_variables(); std::ifstream reader(target_filename(), std::ios::binary); if (!reader.is_open()) return; reader.seekg(0, std::ios::end); const size_t file_size = reader.tellg(); if (file_size <= this->hash_table_byte_size()) return; reader.seekg(this->hash_table_byte_size(), std::ios::beg); while (this->read_page_into(reader, m_cache)) { } } template void basic_index_builder::sort_cache() { for (auto &iter : m_cache) { sort_record_list(iter.first, iter.second); } } template void basic_index_builder::sort_record_list(uint64_t key, std::vector &records) { // Sort records. std::sort(records.begin(), records.end()); // Sum equal elements. for (size_t i = 0, j = 1; i < records.size() && j < records.size(); j++) { if (records[i] != records[j]) { i = j; } else { records[i] += records[j]; } } // Delete consecutive equal elements. Only keeping the first unique. auto last = std::unique(records.begin(), records.end()); records.erase(last, records.end()); if (records.size() > m_max_results) { // Sort before truncation std::sort(records.begin(), records.end(), typename data_record::truncate_order()); records.resize(config::ft_max_results_per_section); // Future fix here is to add hyper log log counting for words with too many urls. } std::sort(records.begin(), records.end()); } template void basic_index_builder::reset_cache_variables() { m_cache = std::map>{}; } template void basic_index_builder::save_file() { //profiler::instance prof("index_builder::save_file"); std::ofstream writer(target_filename(), std::ios::binary | std::ios::trunc); if (!writer.is_open()) { throw LOG_ERROR_EXCEPTION("Could not open full text shard. 
Error: " + std::string(strerror(errno))); } reset_key_map(writer); std::map> pages; for (auto &iter : m_cache) { if (this->m_hash_table_size) { pages[iter.first % this->m_hash_table_size].push_back(iter.first); } else { pages[0].push_back(iter.first); } } for (const auto &iter : pages) { size_t page_pos = write_page(writer, iter.second); write_key(writer, iter.first, page_pos); writer.flush(); } } template void basic_index_builder::write_key(std::ofstream &key_writer, uint64_t key, size_t page_pos) { if (this->m_hash_table_size > 0) { assert(key < this->m_hash_table_size); key_writer.seekp(key * sizeof(uint64_t)); key_writer.write((char *)&page_pos, sizeof(size_t)); } } /* * Writes the page with keys, appending it to the file stream writer. * */ template size_t basic_index_builder::write_page(std::ofstream &writer, const std::vector &keys) { writer.seekp(0, ios::end); const size_t page_pos = writer.tellp(); size_t num_keys = keys.size(); writer.write((char *)&num_keys, 8); writer.write((char *)keys.data(), keys.size() * 8); std::vector v_pos; std::vector v_len; size_t pos = 0; for (uint64_t key : keys) { // Store position and length const size_t len = m_cache[key].size() * sizeof(data_record); v_pos.push_back(pos); v_len.push_back(len); pos += len; } writer.write((char *)v_pos.data(), keys.size() * 8); writer.write((char *)v_len.data(), keys.size() * 8); // Write data. 
size_t i = 0; for (uint64_t key : keys) { const size_t len = v_len[i]; writer.write((char *)m_cache[key].data(), len); i++; } return page_pos; } template void basic_index_builder::reset_key_map(std::ofstream &key_writer) { key_writer.seekp(0); uint64_t data = SIZE_MAX; for (size_t i = 0; i < this->m_hash_table_size; i++) { key_writer.write((char *)&data, sizeof(uint64_t)); } } template std::string basic_index_builder::mountpoint() const { return std::to_string(m_id % 8); } template std::string basic_index_builder::cache_filename() const { if (m_file_name != "") return m_file_name + ".cache"; return config::data_path() + "/" + mountpoint() + "/full_text/" + m_db_name + "/" + std::to_string(m_id) + ".cache"; } template std::string basic_index_builder::key_cache_filename() const { if (m_file_name != "") return m_file_name + ".cache.keys"; return config::data_path() + "/" + mountpoint() + "/full_text/" + m_db_name + "/" + std::to_string(m_id) + ".cache.keys"; } template std::string basic_index_builder::target_filename() const { if (m_file_name != "") return m_file_name + ".data"; return config::data_path() + "/" + mountpoint() + "/full_text/" + m_db_name + "/" + std::to_string(m_id) + ".data"; } } ================================================ FILE: src/indexer/console.cpp ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #include "console.h" #include #include #include "text/text.h" #include "indexer/index_manager.h" #include "indexer/sharded.h" #include "indexer/basic_index.h" #include "indexer/counted_record.h" #include "URL.h" #include "transfer/transfer.h" #include "domain_stats/domain_stats.h" #include "merger.h" #include "file/tsv_file_remote.h" #include "algorithm/bloom_filter.h" #include "parser/parser.h" #include "http/server.h" #include "json.hpp" namespace indexer { void cmd_index(index_manager &idx_manager, const std::vector &args) { if (args.size() < 2) return; merger::start_merge_thread(); const auto batch = args[1]; size_t limit = 0; if (args.size() > 2) limit = stoull(args[2]); file::tsv_file_remote warc_paths_file(std::string("crawl-data/") + batch + "/warc.paths.gz"); std::vector warc_paths; warc_paths_file.read_column_into(0, warc_paths); if (limit && warc_paths.size() > limit) warc_paths.resize(limit); for (auto &path : warc_paths) { const size_t pos = path.find(".warc.gz"); if (pos != std::string::npos) { path.replace(pos, 8, ".gz"); } } auto local_files = transfer::download_gz_files_to_disk(warc_paths); cout << "starting indexer" << endl; idx_manager.add_index_files_threaded(local_files, 24); cout << "done with indexer" << endl; transfer::delete_downloaded_files(local_files); merger::stop_merge_thread(); } void cmd_search(index_manager &idx_manager, hash_table2::hash_table &ht, hash_table2::hash_table &url_ht, const std::string &query) { profiler::instance prof("domain search"); std::vector res = idx_manager.find(query); prof.stop(); cout << "took " << prof.get() << "ms" << endl; cout << setw(50) << "domain"; cout << setw(20) << "score"; cout << endl; std::vector domain_hashes; for (indexer::return_record &rec : res) { const auto host = ht.find(rec.m_value); domain_hashes.push_back(rec.m_value); cout << setw(50) << host; cout << setw(20) << rec.m_score; cout << endl; } profiler::instance prof2("url searches"); cout << "sending " << domain_hashes.size() << " 
domain hashes" << endl; http::response http_res = transfer::post("http://65.108.132.103/?q=" + parser::urlencode(query), std::string((char *)domain_hashes.data(), domain_hashes.size() * sizeof(uint64_t))); const auto url_res = http_res.body(); std::stringstream ss(url_res); std::map> results; while (!ss.eof()) { uint64_t incoming_domain_hash; ss.read((char *)&incoming_domain_hash, sizeof(uint64_t)); if (ss.eof()) break; size_t num_records; ss.read((char *)&num_records, sizeof(size_t)); for (size_t i = 0; i < num_records; i++) { uint64_t value; float score; ss.read((char *)&value, sizeof(uint64_t)); ss.read((char *)&score, sizeof(float)); results[incoming_domain_hash].push_back(url_record(value, score)); } } for (auto domain_hash : domain_hashes) { for (const auto &url_record : results[domain_hash]) { const auto &line = url_ht.find(url_record.m_value); std::vector cols; boost::algorithm::split(cols, line, boost::is_any_of("\t")); const auto url = cols[0]; const auto title = cols[1]; const auto snippet = cols[4]; std::cout << url << std::endl; } } cout << "took " << prof2.get() << "ms" << endl; cout << "got " << results.size() << " responses" << endl; } void cmd_word(index_manager &idx_manager, hash_table2::hash_table &ht, const std::string &query) { indexer::sharded_builder word_index_builder("word_index", 256); indexer::sharded word_index("word_index", 256); const uint64_t word_hash = ::algorithm::hash(query); std::vector res = word_index.find(word_hash, 100000); size_t pos = 0; for (auto &rec : res) { const auto host = ht.find(rec.m_value); cout << host << ": " << rec.m_count << " score: " << rec.m_score << " pos: " << pos << " m_value: " << rec.m_value << " doc_size: " << word_index_builder.document_size(rec.m_value) << endl; pos++; } } void cmd_domain_info(index_manager &idx_manager, hash_table2::hash_table &ht, const std::string &domain, size_t limit, size_t offset) { indexer::sharded idx("title_word_counter", 997); const uint64_t domain_hash = 
::algorithm::hash(domain); std::vector res = idx.find(domain_hash); sort(res.begin(), res.end(), indexer::counted_record::truncate_order()); size_t pos = 0; for (auto &rec : res) { const auto word = ht.find(rec.m_value); cout << word << ": " << rec.m_count << endl; if (pos >= limit) break; pos++; } } void cmd_word(index_manager &idx_manager, hash_table2::hash_table &ht, const std::string &query, const std::string &domain) { indexer::sharded_builder word_index_builder("word_index", 256); indexer::sharded word_index("word_index", 256); const uint64_t word_hash = ::algorithm::hash(query); std::vector res = word_index.find(word_hash); size_t pos = 0; for (auto &rec : res) { const auto host = ht.find(rec.m_value); if (host == domain) { cout << host << ": " << rec.m_count << " score: " << rec.m_score << " pos: " << pos << " m_value: " << rec.m_value << " doc_size: " << word_index_builder.document_size(rec.m_value) << endl; } pos++; } } void cmd_word_num(index_manager &idx_manager, hash_table2::hash_table &ht, const std::string &query) { indexer::sharded word_index("word_index", 256); const uint64_t word_hash = ::algorithm::hash(query); std::vector res = word_index.find(word_hash); cout << "num_records: " << res.size() << endl; } void cmd_harmonic(const std::vector &args) { if (args.size() < 2) return; float harmonic = domain_stats::harmonic_centrality(URL(args[1])); cout << "url: " << args[1] << " has harmonic centrality " << harmonic << endl; } std::vector input_to_args(const std::string &input) { const auto word_boundary = " \t,|!"; std::vector raw_words, words; boost::split(raw_words, input, boost::is_any_of(word_boundary)); for (auto &word : raw_words) { if (word.size()) { words.push_back(word); } } return words; } void console() { } void index_link_batch(const std::string &batch) { ::algorithm::bloom_filter urls_to_index(625000027); urls_to_index.read_file(config::data_path() + "/0/url_filter.bloom"); size_t limit = 1000; size_t offset = 0; while (true) { 
indexer::index_manager idx_manager; merger::start_merge_thread(); file::tsv_file_remote warc_paths_file(std::string("crawl-data/") + batch + "/warc.paths"); std::vector warc_paths; warc_paths_file.read_column_into(0, warc_paths, limit, offset); if (warc_paths.size() == 0) { merger::stop_merge_thread(); break; } auto local_files = transfer::download_gz_files_to_disk(warc_paths); cout << "starting indexer" << endl; idx_manager.add_link_files_threaded(local_files, 32, urls_to_index); cout << "done with indexer" << endl; transfer::delete_downloaded_files(local_files); merger::stop_merge_thread(); offset += limit; } } void index_links() { domain_stats::download_domain_stats(); LOG_INFO("Done download_domain_stats"); for (const std::string &batch : config::link_batches) { index_link_batch(batch); } } void index_url_batch(const std::string &batch) { size_t limit = 1000; size_t offset = 0; while (true) { indexer::index_manager idx_manager; merger::start_merge_thread(); file::tsv_file_remote warc_paths_file(std::string("crawl-data/") + batch + "/warc.paths"); std::vector warc_paths; warc_paths_file.read_column_into(0, warc_paths, limit, offset); if (warc_paths.size() == 0) { merger::stop_merge_thread(); break; } cout << "downloading " << warc_paths.size() << " to disc" << endl; auto local_files = transfer::download_gz_files_to_disk(warc_paths); cout << "starting indexer" << endl; idx_manager.add_index_files_threaded(local_files, 32); cout << "done with indexer" << endl; transfer::delete_downloaded_files(local_files); merger::stop_merge_thread(); offset += limit; } profiler::print_report(); } void index_urls() { domain_stats::download_domain_stats(); LOG_INFO("Done download_domain_stats"); for (const std::string &batch : config::batches) { index_url_batch(batch); } } void truncate_links() { { indexer::index_manager idx_manager; idx_manager.truncate_links(); } } void domain_info_server() { domain_stats::download_domain_stats(); LOG_INFO("Done download_domain_stats"); 
indexer::index_manager idx_manager; hash_table2::hash_table ht("word_hash_table"); indexer::sharded fp_title_counter("first_page_title_word_counter", 101); indexer::sharded title_counter("title_word_counter", 997); indexer::sharded link_counter("link_word_counter", 4001); cout << "starting server..." << endl; ::http::server srv([&ht, &fp_title_counter, &title_counter, &link_counter](const http::request &req) { http::response res; URL url = req.url(); auto query = url.query(); size_t limit = 1000; if (query.count("limit")) limit = std::stoi(query["limit"]); size_t offset = 0; if (query.count("offset")) offset = std::stoi(query["offset"]); if (url.path() == "/favicon.ico") { res.code(404); res.body("404"); return res; } std::stringstream body; auto domain = url.path(); domain.erase(0, 1); body << ""; body << "

" << domain << "

" << endl; body << "

harmonic: " << domain_stats::harmonic_centrality(domain) << "

" << endl; body << "

hash: " << ::algorithm::hash(domain) << "

" << endl; body << "
";

			const uint64_t domain_hash = ::algorithm::hash(domain);
			auto fp_results = fp_title_counter.find(domain_hash);
			auto results = title_counter.find(domain_hash);
			auto link_results = link_counter.find(domain_hash);

			sort(fp_results.begin(), fp_results.end(), indexer::counted_record::truncate_order());
			sort(results.begin(), results.end(), indexer::counted_record::truncate_order());
			sort(link_results.begin(), link_results.end(), indexer::counted_record::truncate_order());

			body << "Limit: " + std::to_string(limit) << endl;
			body << "Offset: " + std::to_string(offset) << endl << endl;
			const size_t original_offset = offset;
			body << "
"; body << "
"; body << "
";
			for (auto &rec : fp_results) {
				const auto word = ht.find(rec.m_value);
				body << word << ": " << rec.m_count << endl;
			}
			body << "
"; body << "
";
			double threshold = results.size() ? results[0].m_count : 0.0;
			size_t offset_start = 0;
			for (auto &rec : results) {
				if (rec.m_count >= threshold * 0.8) {
					const auto word = ht.find(rec.m_value);
					body << word << ": " << rec.m_count << endl;
					offset_start++;
				} else {
					break;
				}
			}
			if (offset < offset_start) offset = offset_start;
			body << "
"; body << "
";

			size_t pos = 0;
			for (auto &rec : results) {
				if (pos >= offset) {
					const auto word = ht.find(rec.m_value);
					body << word << ": " << rec.m_count << endl;
				}
				if (pos >= limit + offset) break;
				pos++;
			}

			body << "
";

			pos = 0;
			for (auto &rec : link_results) {
				if (pos >= original_offset) {
					const auto word = ht.find(rec.m_value);
					body << word << ": " << rec.m_count << endl;
				}
				if (pos >= limit + original_offset) break;
				pos++;
			}

			body << "
", tag.second); if (tag.second == string::npos) { break; } m_invisible_pos.push_back(tag); } } void html_parser::find_links(const string &html, const string &base_url) { size_t pos = 0; pair tag(0, 0); while (pos != string::npos) { tag = find_tag(html, "", tag.second); if (tag.second == string::npos) { break; } parse_link(html.substr(tag.first, tag.second - tag.first), base_url); } } int html_parser::parse_link(const string &link, const string &base_url) { const string href_key = "href=\""; const size_t key_len = href_key.size(); const size_t href_start = link.find(href_key); if (href_start == string::npos) return ::parser::ERROR; const size_t href_end = link.find("\"", href_start + key_len); if (href_end == string::npos) return ::parser::ERROR; string href = link.substr(href_start + key_len, href_end - href_start - key_len); const string rel_key = "rel=\""; const size_t rel_key_len = rel_key.size(); const size_t rel_start = link.find(rel_key); bool nofollow = false; if (rel_start != string::npos) { // "rel=" present in string const size_t rel_end = link.find("\"", rel_start + key_len); const string rel = link.substr(rel_start + rel_key_len, rel_end - rel_start - rel_key_len); if (rel.find("nofollow") != string::npos) nofollow = true; } string host; string path; if (parse_url(href, host, path, base_url) != ::parser::OK) return ::parser::ERROR; if (host == m_host) { // Ignore internal links for now. 
if (!nofollow) { m_internal_links.emplace_back(std::make_pair(URL(m_host, m_path).hash(), URL(host, path).hash())); } return ::parser::OK; } const size_t content_start = link.find(">", href_end) + 1; if (content_start == string::npos) return ::parser::ERROR; const size_t content_end = link.find("", content_start); string content = link.substr(content_start, content_end - content_start); if (m_encoding == ENC_ISO_8859_1) { iso_to_utf8(content); } clean_text(content); if (content == "") return ::parser::ERROR; m_links.push_back(html_link(m_host, m_path, host, path, nofollow, content)); return ::parser::OK; } int html_parser::parse_url(const string &url, string &host, string &path, const string &base_url) { CURLU *h = curl_url(); if (!h) return ::parser::ERROR; if (base_url.size()) { curl_url_set(h, CURLUPART_URL, base_url.c_str(), 0); } CURLUcode uc = curl_url_set(h, CURLUPART_URL, url.c_str(), 0); if (uc) { curl_url_cleanup(h); return ::parser::ERROR; } char *chost; uc = curl_url_get(h, CURLUPART_HOST, &chost, 0); if (!uc) { host = chost; remove_www(host); curl_free(chost); } char *cpath; uc = curl_url_get(h, CURLUPART_PATH, &cpath, 0); if (!uc) { if (strnlen(cpath, m_long_text_len) < m_long_text_len) { decode_html_entities_utf8(m_clean_buff.get(), cpath); path = m_clean_buff.get(); } else { path = cpath; } curl_free(cpath); } char *cquery; uc = curl_url_get(h, CURLUPART_QUERY, &cquery, 0); if (!uc) { if (strnlen(cquery, m_long_text_len) < m_long_text_len) { decode_html_entities_utf8(m_clean_buff.get(), cquery); path += "?" + string(m_clean_buff.get()); } else { path += "?" 
+ string(cquery); } curl_free(cquery); } curl_url_cleanup(h); return ::parser::OK; } void html_parser::remove_www(string &path) { size_t pos = path.find("www."); if (pos == 0) path.erase(0, 4); text::trim_inplace(path); } void html_parser::parse_encoding(const string &html) { m_encoding = ENC_UTF_8; const size_t pos_start = html.find("charset="); if (pos_start == string::npos || pos_start > 1024) return; string encoding = html.substr(pos_start, 40); encoding = text::lower_case(encoding); const size_t utf8_start = encoding.find("utf-8"); const size_t iso88591_start = encoding.find("iso-8859-1"); if (utf8_start != string::npos) m_encoding = ENC_UTF_8; else if (iso88591_start != string::npos) m_encoding = ENC_ISO_8859_1; else m_encoding = ENC_UNKNOWN; } void html_parser::iso_to_utf8(string &str) { string str_out; for (std::string::iterator it = str.begin(); it != str.end(); ++it) { uint8_t ch = *it; if (ch < 0x80) { str_out.push_back(ch); } else { str_out.push_back(0xc0 | ch >> 6); str_out.push_back(0x80 | (ch & 0x3f)); } } str = str_out; } string html_parser::title() const { return m_title; } string html_parser::meta() const { return m_meta; } string html_parser::h1() const { return m_h1; } string html_parser::text() const { return m_text; } vector html_parser::links() const { return m_links; } vector> html_parser::internal_links() const { return m_internal_links; } bool html_parser::should_insert() const { return m_should_insert; } string html_parser::url_tld(const string &url) { string response; string host; vector parts; CURLU *h = curl_url(); if (!h) return ""; CURLUcode uc = curl_url_set(h, CURLUPART_URL, url.c_str(), 0); if (uc) { curl_url_cleanup(h); return ""; } char *chost; uc = curl_url_get(h, CURLUPART_HOST, &chost, 0); if (!uc) { host = chost; boost::split(parts, host, boost::is_any_of(".")); curl_free(chost); if (parts.size()) { response = parts.back(); } } curl_url_cleanup(h); return response; } inline pair html_parser::find_tag(const string &html, 
const string &tag_start, const string &tag_end, size_t pos) { size_t pos_start = html.find(tag_start, pos); if (pos_start == string::npos) return pair(string::npos, string::npos); const size_t pos_end = html.find(tag_end, pos_start); if (pos_end == string::npos) return pair(string::npos, string::npos); return pair(pos_start, pos_end + tag_end.size()); } string html_parser::get_tag_content(const string &html, const string &tag_start, const string &tag_end) { size_t pos_start = html.find(tag_start); if (pos_start == string::npos || is_invisible(pos_start)) return ""; pos_start = html.find(">", pos_start); const size_t pos_end = html.find(tag_end, pos_start); const size_t len = pos_end - pos_start; if (pos_end == string::npos) return ""; return (string)html.substr(pos_start + 1, len - 1); } string html_parser::get_meta_tag(const string &html) { size_t pos_start = 0; while ((pos_start = html.find("", pos_start); const size_t pos_description = html.find("description\"", pos_start); if (pos_description < pos_end) { const size_t pos_end_tag = html.find(">", pos_description); const size_t pos_start_tag = html.rfind("<", pos_description); const string s = "content="; const size_t content_start = html.find(s, pos_start_tag); if (content_start != string::npos && content_start <= pos_end_tag) { return (string)html.substr(content_start + s.size(), pos_end_tag - content_start - s.size() - 1); } } } return ""; } void html_parser::clean_text(string &str) { strip_tags(str); if (str.size() >= m_long_text_len) return; decode_html_entities_utf8(m_clean_buff.get(), str.c_str()); str = m_clean_buff.get(); strip_whitespace(str); text::trim_both_inplace(str); } void html_parser::strip_tags(string &html) { const int len = html.size(); bool copy = true; bool last_was_space = false; int i = 0, j = 0; const char *html_s = html.c_str(); for (; i < len; i++) { if (html_s[i] == '<') copy = false; if (isspace(html_s[i])) { html[j] = ' '; if (copy && !last_was_space) j++; last_was_space = true; } 
else { html[j] = html_s[i]; if (copy) j++; last_was_space = false; } if (html_s[i] == '>') copy = true; } html.resize(j); } void html_parser::strip_whitespace(string &html) { const int len = html.size(); bool last_was_space = false; int i = 0, j = 0; const char *html_s = html.c_str(); for (; i < len; i++) { if (isspace(html_s[i])) { html[j] = ' '; if (!last_was_space) j++; last_was_space = true; } else { html[j] = html_s[i]; j++; last_was_space = false; } } html.resize(j); } /* * This function returns the text content of the html by first trying to fetch content after the first

<h1>...</h1>

tag. If no h1 tag is present
 * it tries to fetch content from the start of the <body> tag.
 *
 * The text is written to m_long_str_buf until the input ends or m_long_text_len characters have been
 * produced. Runs of whitespace are collapsed into a single space, a space is emitted at every '<' and
 * characters are not kept while `copy` is false (from a '<' until the next '>').
 * */
	string html_parser::get_text_content(const string &html) {
		// NOTE(review): the search literal is empty in this copy of the file; presumably it named the h1
		// tag mentioned in the comment above - verify against version control.
		size_t pos_start = html.find("");
		// Start from body if no h1 is present
		if (pos_start == string::npos || is_invisible(pos_start)) {
			// NOTE(review): the source text appears to be truncated between the opening quote below and
			// `first < pos_start)`. The declarations of i, j, len, copy, ignore, last_was_space, interval
			// and invisible_end that the loop below relies on are not visible here. Restore the missing
			// statements before changing this function.
			pos_start = html.find("first < pos_start) {
			interval++;
		}
		const char *html_s = html.c_str();
		for (; i < len && j < m_long_text_len; i++) {
			if (html_s[i] == '<') {
				if (interval != invisible_end && interval->first == i) {
					// Skip the whole invisible tag.
					i = interval->second - 1;
					interval++;
					continue;
				}
				// Insert a space, because we don't want to concatenate words.
				m_long_str_buf[j] = ' ';
				if (copy && !last_was_space) j++;
				last_was_space = true;
				copy = false;
			}
			if (isspace(html_s[i])) {
				// The character is always written at position j, but j only advances when the character
				// should be kept, so dropped characters are overwritten by the next one.
				if (j < m_long_text_len) m_long_str_buf[j] = ' ';
				if (copy && !last_was_space) j++;
				last_was_space = true;
			} else {
				if (j < m_long_text_len) m_long_str_buf[j] = html_s[i];
				if (copy) j++;
				last_was_space = false;
			}
			if (!ignore && html_s[i] == '>') copy = true;
		}
		string text(m_long_str_buf.get(), j);
		return text;
	}

	/*
	 * Same classification as is_exotic_language() but additionally prints the string and the three
	 * counters to stdout.
	 *
	 * Every character is classified by the number of bytes it occupies: the lead byte plus all directly
	 * following bytes for which IS_MULTIBYTE_CODEPOINT is true. More than two bytes counts as "exotic",
	 * exactly two as "seminormal" and one as "normal".
	 *
	 * Returns true if there are more than 5 exotic characters, false if the string has at most 3
	 * characters in total, and otherwise true only if more than half of the characters are not "normal".
	 * */
	bool html_parser::is_exotic_language_debug(const string &str) const {
		const size_t len = str.size();
		const char *cstr = str.c_str();
		int num_exotic = 0;
		int num_normal = 0;
		int num_seminormal = 0;
		for (size_t i = 0; i < len;) {
			int multibyte_len = 1;
			// cumsum is accumulated but never read.
			int cumsum = 0;
			for (size_t j = i + 1; (j < len) && IS_MULTIBYTE_CODEPOINT(cstr[j]); j++, multibyte_len++) {
				cumsum += (unsigned char)cstr[j];
			}
			if (multibyte_len > 2) {
				num_exotic++;
			} else if (multibyte_len == 2){
				num_seminormal++;
			} else {
				num_normal++;
			}
			// Jump to the lead byte of the next character.
			i += multibyte_len;
		}
		int total = (num_seminormal + num_exotic + num_normal);
		cout << str << " exotic: " << num_exotic << " seminormal: " << num_seminormal << " normal: " << num_normal << endl;
		if (num_exotic > 5) return true;
		if (total <= 3) return false;
		if ((float)(num_seminormal + num_exotic) / ((float)total) > 0.5) return true;
		return false;
	}

	/*
	 * Heuristic that decides whether str is dominated by multi-byte characters.
	 *
	 * Every character is classified by the number of bytes it occupies: the lead byte plus all directly
	 * following bytes for which IS_MULTIBYTE_CODEPOINT is true. More than two bytes counts as "exotic",
	 * exactly two as "seminormal" and one as "normal".
	 *
	 * Returns true if there are more than 5 exotic characters, false if the string has at most 3
	 * characters in total, and otherwise true only if more than half of the characters are not "normal".
	 * */
	bool html_parser::is_exotic_language(const string &str) const {
		const size_t len = str.size();
		const char *cstr = str.c_str();
		int num_exotic = 0;
		int num_normal = 0;
		int num_seminormal = 0;
		for (size_t i = 0; i < len;) {
			int multibyte_len = 1;
			// cumsum is accumulated but never read.
			int cumsum = 0;
			for (size_t j = i + 1; (j < len) && IS_MULTIBYTE_CODEPOINT(cstr[j]); j++, multibyte_len++) {
				cumsum += (unsigned char)cstr[j];
			}
			if (multibyte_len > 2) {
				num_exotic++;
			} else if (multibyte_len == 2){
				num_seminormal++;
			} else {
				num_normal++;
			}
			// Jump to the lead byte of the next character.
			i += multibyte_len;
		}
		int total = (num_seminormal + num_exotic + num_normal);
		if (num_exotic > 5) return true;
		if (total <= 3) return false;
		if ((float)(num_seminormal + num_exotic) / ((float)total) > 0.5) return true;
		return false;
	}

	/*
	 * Sorts m_invisible_pos in ascending order of the first member of each interval.
	 * */
	void html_parser::sort_invisible() {
		sort(m_invisible_pos.begin(), m_invisible_pos.end(), [](const pair& lhs, const pair& rhs) {
			return lhs.first < rhs.first;
		});
	}

	/*
	 * Returns true if pos lies inside any interval of m_invisible_pos. The intervals are treated as
	 * half open: first is included, second is not. All intervals are scanned linearly.
	 * */
	inline bool html_parser::is_invisible(size_t pos) {
		for (const auto &interval : m_invisible_pos) {
			if (interval.first <= pos && pos < interval.second) return true;
		}
		return false;
	}

}

================================================
FILE: src/parser/html_parser.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, , et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
*
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

// NOTE(review): the header names of the following includes are missing in this copy of the file.
#include
#include
#include
#include
#include
#include
#include
#include
#include "html_link.h"
#include "parser/unicode.h"

// NOTE(review): presumably length limits for the h1 and title fields; their use is not visible in this header.
#define HTML_PARSER_MAX_H1_LEN 400
#define HTML_PARSER_MAX_TITLE_LEN 400

// Values stored in html_parser::m_encoding by parse_encoding().
#define ENC_UTF_8 1
#define ENC_ISO_8859_1 2
#define ENC_UNKNOWN -1

namespace parser {

	/*
	 * Extracts title, h1, meta description, text and links from an html document.
	 * Call one of the parse() overloads first and read the result through the accessors.
	 * */
	class html_parser {

	public:

		html_parser();
		// NOTE(review): long_text_len presumably sets the size limit of the text buffers (m_long_text_len
		// defaults to 1000); the constructor body is not visible here.
		html_parser(size_t long_text_len);
		~html_parser();

		void parse(const std::string &html, const std::string &url);
		void parse(const std::string &html);

		// Accessors for the results of the last parse(). They return copies of the stored members.
		std::string title() const;
		std::string meta() const;
		std::string h1() const;
		std::string text() const;
		std::vector links() const;
		// Each entry is a pair of URL hashes: the parsed page and the target of a link found on it.
		std::vector> internal_links() const;
		bool should_insert() const;

		// Return top level domain, i.e. the part of the host name after its last dot. Returns an empty
		// string if the url can not be parsed.
		std::string url_tld(const std::string &url);

		// Both return true if str is dominated by multi-byte characters. The _debug variant additionally
		// prints its counters to stdout.
		bool is_exotic_language_debug(const std::string &str) const;
		bool is_exotic_language(const std::string &str) const;

	private:

		std::vector m_links;
		std::vector> m_internal_links;
		// Intervals of the html that is_invisible() reports as invisible (first included, second not).
		// NOTE(review): presumably filled by find_scripts()/find_styles() - confirm in html_parser.cpp.
		std::vector> m_invisible_pos;

		const size_t m_long_text_len = 1000;
		// Output buffer of get_text_content().
		std::unique_ptr m_long_str_buf;
		// Output buffer for decode_html_entities_utf8() in clean_text() and parse_url().
		std::unique_ptr m_clean_buff;
		std::unique_ptr m_encoding_buffer;

		bool m_should_insert;
		// One of the ENC_* values, set by parse_encoding().
		int m_encoding = ENC_UNKNOWN;

		std::string m_title;
		std::string m_h1;
		std::string m_meta;
		std::string m_text;
		// Host and path of the page being parsed; used as the source of every extracted link.
		std::string m_host;
		std::string m_path;

		void find_scripts(const std::string &html);
		void find_styles(const std::string &html);
		void find_links(const std::string &html, const std::string &base_url);
		// parse_link() and parse_url() return ::parser::OK or ::parser::ERROR.
		int parse_link(const std::string &link, const std::string &base_url);
		// Resolves url (against base_url if that is not empty) and stores the host without a leading
		// "www." in host and the path, followed by "?" and the query string if there is one, in path.
		int parse_url(const std::string &url, std::string &host, std::string &path, const std::string &base_url);
		// Removes a leading "www." from path and trims it in place.
		inline void remove_www(std::string &path);
		// Sets m_encoding from a "charset=" declaration found within the first 1024 bytes of html.
		// Defaults to ENC_UTF_8 if there is no such declaration.
		void parse_encoding(const std::string &html);
		// Converts text in place: every byte >= 0x80 is replaced by a two byte UTF-8 sequence.
		void iso_to_utf8(std::string &text);

		// Returns the position of the first tag_start at or after pos and the position just past the
		// following tag_end. Both members are std::string::npos if either one is not found.
		inline std::pair find_tag(const std::string &html, const std::string &tag_start, const std::string &tag_end,
			size_t pos);
		// Returns the text between the '>' that follows the first tag_start and the next tag_end. Returns
		// an empty string if tag_start or tag_end is missing or tag_start lies in an invisible interval.
		std::string get_tag_content(const std::string &html, const std::string &tag_start, const std::string &tag_end);
		// Returns the content attribute of the description meta tag, or an empty string.
		std::string get_meta_tag(const std::string &html);
		// Strips tags from str. Strings shorter than m_long_text_len additionally get their html
		// entities decoded, their whitespace collapsed and both ends trimmed.
		void clean_text(std::string &str);
		// Collapses every run of whitespace in html into a single space, in place.
		void strip_whitespace(std::string &html);
		// Removes everything between '<' and '>' (inclusive) and collapses whitespace, in place.
		void strip_tags(std::string &html);
		std::string get_text_content(const std::string &html);
		// Sorts m_invisible_pos by interval start.
		void sort_invisible();
		// Returns true if pos lies inside any interval of m_invisible_pos.
		inline bool is_invisible(size_t pos);

	};

}

================================================
FILE: src/parser/parser.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, , et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include "parser.h" #include using namespace std; namespace parser { bool is_percent_encoding(const char *cstr) { const char first = tolower(cstr[1]); const char second = tolower(cstr[2]); const bool first_valid = (first >= '0' && first <= '9') || (first >= 'a' && first <= 'f'); const bool second_valid = (second >= '0' && second <= '9') || (second >= 'a' && second <= 'f'); return cstr[0] == '%' && first_valid && second_valid; } string urldecode(const string &str) { const size_t len = str.size(); const char *cstr = str.c_str(); char *ret = new char[len + 1]; size_t j = 0; for (size_t i = 0; i < len; i++) { if (i < len - 2 && is_percent_encoding(&cstr[i])) { ret[j++] = (char)stoi(string(&cstr[i + 1], 2), NULL, 16); i += 2; } else if (i < len - 1 && cstr[i] == '%' && cstr[i + 1] == '%') { ret[j++] = '%'; i++; } else { ret[j++] = cstr[i]; } } ret[j] = '\0'; string ret_str(ret); delete[] ret; return ret_str; } string urlencode(const string &str) { CURL *curl = curl_easy_init(); if (curl) { char *output = curl_easy_escape(curl, str.c_str(), str.size()); if (output) { string ret(output); curl_free(output); curl_easy_cleanup(curl); return ret; } curl_easy_cleanup(curl); } return str; } string get_http_header(const string &record, const string &key) { const size_t pos = record.find(key); const size_t pos_end = record.find("\n", pos); if (pos == string::npos) { return ""; } if (pos_end == string::npos) { return record.substr(pos + key.size()); } return record.substr(pos + key.size(), pos_end - pos - key.size() - 1); } } ================================================ FILE: src/parser/parser.h ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef 
Cullhed, , et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

// NOTE(review): the header name of the following include is missing in this copy of the file.
#include

namespace parser {

	// Status codes returned by the parsing functions of this namespace (for example
	// html_parser::parse_url).
	const int OK = 0;
	const int ERROR = 1;

	// Decodes percent encoding: "%XX" becomes the byte with hexadecimal value XX, "%%" becomes "%".
	std::string urldecode(const std::string &str);

	// Percent encodes str with libcurl. Returns str unchanged if the encoding fails.
	std::string urlencode(const std::string &str);

	// Returns the rest of the line that follows the first occurrence of key in record, or an empty
	// string if key is not present. Lines are assumed to end in CRLF.
	std::string get_http_header(const std::string &record, const std::string &key);

}

================================================
FILE: src/parser/unicode.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, , et al.
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include "unicode.h" using namespace std; namespace parser { std::string unicode::encode(const std::string &str) { const char *cstr = str.c_str(); size_t len = str.size(); char *target = new char[str.size()]; size_t last_unicode = len; size_t utf8_len = 0; for (size_t i = 0; i < len; i++) { bool copy = true; if (utf8_len == 0) { if (IS_UTF8_START_1(cstr[i])) { utf8_len = 1; last_unicode = i; } else if (IS_UTF8_START_2(cstr[i])) { utf8_len = 2; last_unicode = i; } else if (IS_UTF8_START_3(cstr[i])) { utf8_len = 3; last_unicode = i; } else if (IS_UNKNOWN_UTF8_START(cstr[i])) { copy = false; } else if ('\x00' <= cstr[i] && cstr[i] <= '\x1f') { copy = false; } } else if (IS_MULTIBYTE_CODEPOINT(cstr[i])) { utf8_len--; } else { // This unicode character has been terminated too soon. 
copy = false; for (size_t j = last_unicode; j <= i; j++) { target[j] = '?'; } utf8_len = 0; } if (copy) { target[i] = cstr[i]; } else { target[i] = '?'; } } std::string ret(target, len); delete []target; if (utf8_len) { return ret.substr(0, last_unicode); } else { return ret; } } bool unicode::is_valid(const std::string &str) { const char *cstr = str.c_str(); size_t len = str.size(); size_t utf8_len = 0; for (size_t i = 0; i < len; i++) { if (utf8_len == 0) { if (IS_UTF8_START_1(cstr[i])) { utf8_len = 1; } else if (IS_UTF8_START_2(cstr[i])) { utf8_len = 2; } else if (IS_UTF8_START_3(cstr[i])) { utf8_len = 3; } else if (IS_UNKNOWN_UTF8_START(cstr[i])) { return false; } } else if (IS_MULTIBYTE_CODEPOINT(cstr[i])) { utf8_len--; } else { // This unicode character has been terminated too soon. return false; } } if (utf8_len) { return false; } return true; } } ================================================ FILE: src/parser/unicode.h ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #pragma once #include #define IS_MULTIBYTE_CODEPOINT(ch) (((unsigned char)ch >> 7) && !(((unsigned char)ch >> 6) & 0x1)) #define IS_UTF8_START_1(ch) (((unsigned char)ch >> 5) == 0b00000110 && ((unsigned char)ch & 0b00011111) >= 0b00000010) #define IS_UTF8_START_2(ch) (((unsigned char)ch >> 4) == 0b00001110) #define IS_UTF8_START_3(ch) (((unsigned char)ch >> 3) == 0b00011110) #define IS_UNKNOWN_UTF8_START(ch) (ch >> 7) namespace parser { class unicode { public: static std::string encode(const std::string &str); static bool is_valid(const std::string &str); }; } ================================================ FILE: src/profiler/profiler.cpp ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "profiler.h"
#include "logger/logger.h"
// NOTE(review): the header names of the following two includes are missing in this copy of the file.
#include
#include

using namespace std;

namespace profiler {

	// Sum of the values returned by instance::get() at stop(), keyed by profile name.
	map profiles_per_name;

	// Reference point for now_micro(), taken when this variable is initialized.
	std::chrono::_V2::system_clock::time_point start_time = std::chrono::high_resolution_clock::now();

	/*
	 * Starts a timer with the given name.
	 * */
	instance::instance(const string &name) :
		m_name(name)
	{
		m_start_time = std::chrono::high_resolution_clock::now();
	}

	/*
	 * Starts a timer with the name "unnamed profile".
	 * */
	instance::instance() :
		m_name("unnamed profile")
	{
		m_start_time = std::chrono::high_resolution_clock::now();
	}

	/*
	 * Stops the timer if stop() has not been called yet.
	 * */
	instance::~instance() {
		if (!m_has_stopped) {
			stop();
		}
	}

	void instance::enable() {
		m_enabled = true;
	}

	/*
	 * Returns the time elapsed since construction, divided by 1000.
	 * NOTE(review): the duration type passed to duration_cast is missing in this copy of the file.
	 * Judging by the variable name and the "ms" suffix used when logging, this is presumably
	 * microseconds converted to milliseconds - confirm.
	 * */
	double instance::get() const {
		auto timer_elapsed = chrono::high_resolution_clock::now() - m_start_time;
		auto microseconds = chrono::duration_cast(timer_elapsed).count();

		return (double)microseconds/1000;
	}

	/*
	 * Returns the time elapsed since construction without the division done by get(), or 0 if the
	 * instance is not enabled.
	 * */
	double instance::get_micro() const {
		if (!m_enabled) return 0;
		auto timer_elapsed = chrono::high_resolution_clock::now() - m_start_time;
		auto microseconds = chrono::duration_cast(timer_elapsed).count();

		return (double)microseconds;
	}

	/*
	 * Marks the timer as stopped and adds the elapsed time to profiles_per_name. This happens even if
	 * the instance is not enabled; only the log line depends on m_enabled.
	 * */
	void instance::stop() {
		m_has_stopped = true;
		profiles_per_name[m_name] += get();
		if (!m_enabled) return;
		// get() is evaluated again here, so the logged value is slightly larger than the recorded one.
		LOG_INFO("profiler [" + m_name + "] took " + to_string(get()) + "ms");
	}

	/*
	 * Prints the elapsed time to stdout without stopping the timer. Does nothing if not enabled.
	 * */
	void instance::print() {
		if (!m_enabled) return;
		cout << "profiler [" + m_name + "] took " + to_string(get()) + "ms" << endl;
	}

	/*
	 * Logs every line of /proc/<pid>/status of the current process. Does nothing if the file can not
	 * be opened.
	 * */
	void print_memory_status() {
		ifstream infile("/proc/" + to_string(getpid()) + "/status");
		if (infile.is_open()) {
			string line;
			while (getline(infile, line)) {
				LOG_INFO(line);
			}
		}
	}

	/*
	 * Does nothing; both arguments are ignored.
	 * */
	void tick(const string &name, const string §ion) {
		(void)name;
		(void)section;
	}

	// Declared but not defined in this file.
	void report_reset();
	void report_print();

	/*
	 * Returns the time elapsed since start_time was initialized.
	 * NOTE(review): the duration type passed to duration_cast is missing in this copy of the file;
	 * presumably microseconds, as the function name says - confirm.
	 * */
	double now_micro() {
		auto timer_elapsed = chrono::high_resolution_clock::now() - start_time;
		auto microseconds = chrono::duration_cast(timer_elapsed).count();

		return (double)microseconds;
	}

	/*
	 * Returns the time since the epoch of std::chrono::system_clock.
	 * NOTE(review): the unit is determined by the duration_cast argument, which is missing in this
	 * copy of the file - confirm before relying on it.
	 * */
	size_t timestamp() {
		const auto p1 = std::chrono::system_clock::now();
		return std::chrono::duration_cast(p1.time_since_epoch()).count();
	}

	/*
	 * Prints one line per entry of profiles_per_name to stdout: the name, the accumulated time and
	 * its share of the total of all entries.
	 * */
	void print_report() {
		double total_ms = 0.0;
		for (const auto &iter : profiles_per_name) {
			total_ms += iter.second;
		}
		for (const auto &iter : profiles_per_name) {
			// NOTE(review): if all recorded times are zero this divides 0 by 0 and prints nan.
			cout << iter.first << ": " << iter.second << "ms (" << 100.0 * (iter.second / total_ms) << "%)" << endl;
		}
	}

}

================================================
FILE: src/profiler/profiler.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, , et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
*/

#pragma once

// NOTE(review): the header names of the following includes are missing in this copy of the file.
#include
#include
#include
#include

namespace profiler {

	/*
	 * Timer that starts when it is constructed. When it is stopped, either explicitly or by the
	 * destructor, the elapsed time is added to a per-name total that print_report() prints.
	 * */
	class instance {
		public:
			explicit instance(const std::string &name);
			instance();
			// Calls stop() unless it has been called already.
			~instance();

			// Sets m_enabled, which is already true by default.
			void enable();
			// Elapsed time since construction, divided by 1000.
			double get() const;
			// Elapsed time since construction without that division, or 0 if not enabled.
			double get_micro() const;
			// Records the elapsed time under m_name and, if enabled, logs it.
			void stop();
			// Prints the elapsed time to stdout if enabled. Does not stop the timer.
			void print();

		private:
			std::string m_name;
			// Controls logging and printing and whether get_micro() returns a value.
			bool m_enabled = true;
			bool m_has_stopped = false;
			std::chrono::_V2::system_clock::time_point m_start_time;
	};

	// Logs the contents of /proc/<pid>/status of the current process.
	void print_memory_status();

	// Currently does nothing with its arguments.
	void tick(const std::string &name, const std::string §ion);
	void report_reset();
	void report_print();
	// Time elapsed since the profiler's start time was initialized.
	double now_micro();

	// Time since the epoch of std::chrono::system_clock.
	size_t timestamp();

	// Prints the accumulated time of every profile name and its share of the total to stdout.
	void print_report();

}

================================================
FILE: src/scraper/scraper.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, , et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
*/ #include "scraper.h" #include "parser/html_parser.h" #include "common/datetime.h" #include "text/text.h" #include "logger/logger.h" #include using namespace std; namespace scraper { string user_agent_token() { return "AlexandriaOrgBot"; } string user_agent() { string ua_version = "1.0"; string ua = "Mozilla/5.0 (Linux) (compatible; "+user_agent_token()+"/"+ua_version+"; +https://www.alexandria.org/bot.html)"; return ua; } scraper_stats::scraper_stats() { } scraper_stats::~scraper_stats() { m_running = false; if (m_thread.joinable()) m_thread.join(); } void scraper_stats::gather_statistics(const map> &scrapers, size_t urls_in_queue) { start_count(urls_in_queue); for (const auto &iter : scrapers) { if (iter.second->finished()) { count_finished(*(iter.second)); } else { count_unfinished(*(iter.second)); } } end_count(); } void scraper_stats::start_thread(size_t timeout) { m_timeout = timeout; m_thread = std::move(thread([this]() { this->run(); })); } void scraper_stats::start_count(size_t urls_in_queue) { m_lock.lock(); m_unfinished_scrapers = 0; m_unfinished_scraped_urls = 0; m_unfinished_scraped_urls_non200 = 0; m_unfinished_scraped_errors = 0; m_urls_in_queue = urls_in_queue; m_urls_assigned = 0; } void scraper_stats::end_count() { m_lock.unlock(); } void scraper_stats::count_finished(const scraper &scraper) { m_scraped_urls += scraper.num_scraped(); m_scraped_urls_non200 += scraper.num_scraped_non200(); m_scraped_errors += scraper.num_errors(); m_finished_scrapers += 1; m_num_blocked += scraper.blocked() ? 
1 : 0; } void scraper_stats::count_unfinished(const scraper &scraper) { m_unfinished_scraped_urls += scraper.num_scraped(); m_unfinished_scraped_urls_non200 += scraper.num_scraped_non200(); m_unfinished_scraped_errors += scraper.num_errors(); m_unfinished_scrapers += 1; m_urls_assigned += scraper.size(); } void scraper_stats::run() { size_t time_start = profiler::timestamp(); while (m_running) { std::this_thread::sleep_for(std::chrono::seconds(m_timeout)); log_report(profiler::timestamp() - time_start); } } void scraper_stats::log_report(size_t dt) { m_lock.lock(); std::stringstream ss; ss.precision(2); ss << endl; ss << "Scraper stats:" << endl; ss << m_urls_in_queue << " urls in queue (not assigned to any scraper)" << endl; ss << m_urls_assigned << " urls assigned to running scrapers" << endl; ss << (m_scraped_urls + m_unfinished_scraped_urls) << " urls done (200 response)" << endl; ss << (m_scraped_urls_non200 + m_unfinished_scraped_urls_non200) << " urls (non 200 response)" << endl; ss << (m_scraped_errors + m_unfinished_scraped_errors) << " urls (errors)" << endl; ss << fixed << (double)(m_scraped_urls + m_unfinished_scraped_urls)/dt << "/s" << endl; ss << m_finished_scrapers << " finished scrapers" << endl; ss << m_unfinished_scrapers << " unfinished scrapers" << endl; ss << m_num_blocked << " blocked scrapers" << endl; m_lock.unlock(); LOG_INFO(ss.str()); } scraper::scraper(const string &domain, scraper_store *store) : m_domain(domain), m_store(store) { //m_domain_data.m_domain = domain; m_curl = curl_easy_init(); } scraper::~scraper() { if (m_thread.joinable()) m_thread.join(); upload_domain_info(); curl_easy_cleanup(m_curl); } void scraper::push_url(const URL &url) { m_queue.push(url); } void scraper::run() { download_domain_data(); download_robots(); while (m_queue.size()) { URL url = filter_url(m_queue.front()); m_queue.pop(); if (robots_allow_url(url)) { if (m_timeout) { this_thread::sleep_for(std::chrono::seconds(m_timeout/2 + (rand() % m_timeout))); } 
handle_url(url); } if (m_consecutive_error_count > 20) break; } m_finished = true; } void scraper::handle_url(const URL &url) { cout << url.str() << endl; m_buffer.resize(0); curl_easy_setopt(m_curl, CURLOPT_USERAGENT, user_agent().c_str()); curl_easy_setopt(m_curl, CURLOPT_FOLLOWLOCATION, 1l); curl_easy_setopt(m_curl, CURLOPT_MAXREDIRS, 5l); curl_easy_setopt(m_curl, CURLOPT_WRITEFUNCTION, curl_string_reader); curl_easy_setopt(m_curl, CURLOPT_WRITEDATA, this); curl_easy_setopt(m_curl, CURLOPT_URL, url.str().c_str()); curl_easy_setopt(m_curl, CURLOPT_TIMEOUT, 30); curl_easy_setopt(m_curl, CURLOPT_ERRORBUFFER, m_curl_error_buffer); CURLcode res = curl_easy_perform(m_curl); if (res == CURLE_OK) { m_consecutive_error_count = 0; long response_code; char *new_url_str = nullptr; curl_easy_getinfo(m_curl, CURLINFO_RESPONSE_CODE, &response_code); curl_easy_getinfo(m_curl, CURLINFO_EFFECTIVE_URL, &new_url_str); // Fetch IP address. char *ip_cstr; string ip; if (!curl_easy_getinfo(m_curl, CURLINFO_PRIMARY_IP, &ip_cstr) && ip_cstr != nullptr) ip = string(ip_cstr); if (new_url_str != nullptr) { string new_u_str(new_url_str); URL new_url(new_u_str); update_url(new_url, response_code, common::cur_datetime(), URL()); if (url.canonically_different(new_url)) { update_url(url, 301, common::cur_datetime(), new_url); // A bit of cheeting heere, it is not sure the original url had a 301 response code. 
} if (response_code == 200) { handle_200_response(m_buffer, response_code, ip, new_url); } else { handle_non_200_response(m_buffer, response_code, ip, new_url); } } else { update_url(url, response_code, common::cur_datetime(), URL()); if (response_code == 200) { handle_200_response(m_buffer, response_code, ip, url); } else { handle_non_200_response(m_buffer, response_code, ip, url); } } } else { /* * Handle everything here: https://curl.se/libcurl/c/libcurl-errors.html * */ vector domain_errors = { CURLE_COULDNT_RESOLVE_HOST, CURLE_COULDNT_CONNECT, }; handle_curl_error(url, res, string(m_curl_error_buffer)); if (res == CURLE_COULDNT_RESOLVE_HOST || res == CURLE_COULDNT_CONNECT) { update_url(url, 10000 + res, common::cur_datetime(), URL()); mark_all_urls_with_error(10000 + res); } else { update_url(url, 10000 + res, common::cur_datetime(), URL()); } } m_buffer.resize(0); m_buffer.shrink_to_fit(); } void scraper::mark_all_urls_with_error(size_t error_code) { while (m_queue.size()) { URL url = filter_url(m_queue.front()); m_queue.pop(); update_url(url, error_code, common::cur_datetime(), URL()); } } void scraper::update_url(const URL &url, size_t http_code, size_t last_visited, const URL &redirect) { // Store information about URL. 
} void scraper::handle_curl_error(const URL &url, size_t curl_error, const std::string &error_msg) { m_num_errors++; m_consecutive_error_count++; m_store->add_curl_error(url.str() + "\t" + to_string(curl_error) + "\t" + error_msg + "\n"); m_store->upload_curl_errors(); } void scraper::handle_200_response(const string &data, size_t response_code, const string &ip, const URL &url) { (void)response_code; m_num_200++; parser::html_parser html_parser(100000); html_parser.parse(data, url.str()); m_num_total++; if (url.has_www()) m_num_www++; if (url.has_https()) m_num_https++; if (m_num_total == 3) upload_domain_info(); const string date = common::iso8601_datetime(); if (html_parser.should_insert()) { const string line = (url.str() + '\t' + html_parser.title() + '\t' + html_parser.h1() + '\t' + html_parser.meta() + '\t' + html_parser.text() + '\t' + date + '\t' + ip + '\n'); m_store->add_scraper_data(line); string links; for (const auto &link : html_parser.links()) { links += (link.host() + '\t' + link.path() + '\t' + link.target_host() + '\t' + link.target_path() + '\t' + link.text() + '\t' + (link.nofollow() ? 
"1" : "0") + '\n'); } m_store->add_link_data(links); m_store->upload_results(); } } void scraper::handle_non_200_response(const string &data, size_t response_code, const string &ip, const URL &url) { m_num_non200++; check_for_captcha_block(data, response_code); parser::html_parser html_parser; html_parser.parse(data, url.str()); const string date = common::iso8601_datetime(); if (html_parser.should_insert()) { const string line = (url.str() + '\t' + html_parser.title() + '\t' + html_parser.h1() + '\t' + html_parser.meta() + '\t' + html_parser.text() + '\t' + date + '\t' + ip + '\n'); m_store->add_non_200_scraper_data(line); m_store->upload_non_200_results(); } } void scraper::check_for_captcha_block(const std::string &data, size_t response_code) { if (response_code != 200 && (data.find("Captcha") != string::npos || data.find("captcha") != string::npos)) { m_blocked = true; mark_all_urls_with_error(10000 + 999); } } void scraper::download_domain_data() { } void scraper::download_robots() { const URL robots_path = filter_url(URL("http://" + m_domain + "/robots.txt")); m_robots_content = simple_get(robots_path); scraper::upload_robots_txt(m_robots_content); } bool scraper::robots_allow_url(const URL &url) const { googlebot::RobotsMatcher matcher; bool allowed = matcher.OneAgentAllowedByRobots(m_robots_content, user_agent_token(), url.str()); return allowed; } string scraper::simple_get(const URL &url) { curl_easy_setopt(m_curl, CURLOPT_USERAGENT, user_agent().c_str()); curl_easy_setopt(m_curl, CURLOPT_FOLLOWLOCATION, 1l); curl_easy_setopt(m_curl, CURLOPT_MAXREDIRS, 5l); curl_easy_setopt(m_curl, CURLOPT_WRITEFUNCTION, curl_string_reader); curl_easy_setopt(m_curl, CURLOPT_WRITEDATA, this); curl_easy_setopt(m_curl, CURLOPT_URL, url.str().c_str()); curl_easy_setopt(m_curl, CURLOPT_TIMEOUT, 30); curl_easy_setopt(m_curl, CURLOPT_ERRORBUFFER, m_curl_error_buffer); m_buffer.resize(0); CURLcode res = curl_easy_perform(m_curl); if (res == CURLE_OK) { long response_code; char 
*new_url_str = nullptr; curl_easy_getinfo(m_curl, CURLINFO_RESPONSE_CODE, &response_code); curl_easy_getinfo(m_curl, CURLINFO_EFFECTIVE_URL, &new_url_str); check_for_captcha_block(m_buffer, response_code); } else { /* * Handle everything here: https://curl.se/libcurl/c/libcurl-errors.html * */ vector domain_errors = { CURLE_COULDNT_RESOLVE_HOST, CURLE_COULDNT_CONNECT, }; handle_curl_error(url, res, string(m_curl_error_buffer)); if (res == CURLE_COULDNT_RESOLVE_HOST || res == CURLE_COULDNT_CONNECT) { mark_all_urls_with_error(10000 + res); } else { } } return m_buffer; } void scraper::upload_domain_info() { if (m_num_total > 0) { // TODO.. Upload data about domain. } } void scraper::upload_robots_txt(const string &robots_content) { // TODO.. Upload data about robots.txt } URL scraper::filter_url(const URL &url) { URL ret(url); //if (m_domain_data.m_has_https && !url.has_https()) ret.set_scheme("https"); //if (m_domain_data.m_has_www && !url.has_www()) ret.set_www(true); return ret; } void scraper::start_thread() { m_started = true; m_thread = std::move(thread([this](){ this->run(); })); } size_t curl_string_reader(char *ptr, size_t size, size_t nmemb, void *userdata) { const size_t byte_size = size * nmemb; scraper *s = static_cast(userdata); if (s->m_buffer_len < s->m_buffer.size() + byte_size) return 0; s->m_buffer.append(ptr, byte_size); return byte_size; } size_t read_max_scrapers() { ifstream infile("/tmp/num_scrapers"); if (!infile.is_open()) return 0; size_t max_scrapers; infile >> max_scrapers; return max_scrapers; } bool reset_scraper_urls() { string content = ""; int error = transfer::upload_file("nodes/" + config::node + "/scraper.urls", content); return error == transfer::OK; } vector download_scraper_urls() { int error; string content = transfer::file_to_string("nodes/" + config::node + "/scraper.urls", error); if (error == transfer::ERROR) return {}; reset_scraper_urls(); vector raw_urls; boost::algorithm::split(raw_urls, content, 
boost::is_any_of("\n")); vector urls; for (const string &url : raw_urls) { if (text::trim(url).size()) { urls.push_back(url); } } return urls; } void run_scraper_on_urls(const vector &input_urls) { size_t max_scrapers = 1000; scraper_store store; scraper_stats stats; map> scrapers; stats.start_thread(60); // Report statistics every minute. vector urls = input_urls; while (urls.size() || scrapers.size()) { LOG_INFO("Starting scrapers with: " + to_string(urls.size()) + " urls"); size_t new_max_scrapers = read_max_scrapers(); if (new_max_scrapers) { max_scrapers = new_max_scrapers; } vector unhandled_urls; for (const string &url_str : urls) { URL url(url_str); if (scrapers.count(url.host()) == 0) { if (scrapers.size() >= max_scrapers) { unhandled_urls.push_back(url_str); } else { scrapers[url.host()] = make_unique(url.host(), &store); scrapers[url.host()]->push_url(url); } } else { scrapers[url.host()]->push_url(url); } } // Start scrapers. for (auto &iter : scrapers) { if (!iter.second->started()) { iter.second->start_thread(); } } // Wait for some scrapers to finish before we assign new scrapers again. while (scrapers.size() > max_scrapers * 0.8) { stats.gather_statistics(scrapers, urls.size()); for (auto iter = scrapers.begin(); iter != scrapers.end(); ) { if (iter->second->finished()) { iter = scrapers.erase(iter); } else { iter++; } } this_thread::sleep_for(1000ms); } stats.gather_statistics(scrapers, urls.size()); urls = unhandled_urls; // Check for new urls and append them. vector new_urls = download_scraper_urls(); urls.insert(urls.end(), new_urls.begin(), new_urls.end()); if (urls.size() == 0) { // We don't have any new urls. Just sleep a bit before checking again. std::this_thread::sleep_for(std::chrono::seconds(60)); } } } void url_downloader() { const size_t timeout = 300; //const size_t limit = 500; // main loop while (true) { // Check if there are any urls to digest every 'timeout' minutes. 
vector urls = download_scraper_urls(); if (urls.size() > 0) { run_scraper_on_urls(urls); } sleep(timeout); } } } ================================================ FILE: src/scraper/scraper.h ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include #include #include #include #include #include "transfer/transfer.h" #include "robots.h" #include "scraper_store.h" #include "URL.h" #include "profiler/profiler.h" namespace scraper { std::string user_agent_token(); std::string user_agent(); /* * The scraper! 
* */ class scraper { public: scraper(const std::string &domain, scraper_store *store); ~scraper(); void set_timeout(size_t timeout_in_seconds) { m_timeout = timeout_in_seconds; } void push_url(const URL &url); void run(); void start_thread(); bool finished() const { return m_finished; }; bool started() { return m_started; } std::string domain() { return m_domain; } size_t num_scraped() const { return m_num_200; } size_t num_scraped_non200() const { return m_num_non200; } size_t num_errors() const { return m_num_errors; } size_t size() const { return m_queue.size(); } bool blocked() const { return m_blocked; } private: std::thread m_thread; bool m_started = false; bool m_finished = false; std::string m_domain; std::string m_buffer; char m_curl_error_buffer[CURL_ERROR_SIZE]; size_t m_buffer_len = 1024*1024*10; size_t m_num_200 = 0; size_t m_num_non200 = 0; size_t m_num_errors = 0; bool m_blocked = false; CURL *m_curl; scraper_store *m_store; std::queue m_queue; googlebot::RobotsMatcher m_robots; std::string m_robots_content; size_t m_num_total = 0; size_t m_num_www = 0; size_t m_num_https = 0; size_t m_consecutive_error_count = 0; size_t m_timeout = 30; void handle_curl_error(const URL &url, size_t curl_error, const std::string &error_msg); void handle_url(const URL &url); void mark_all_urls_with_error(size_t error_code); void update_url(const URL &url, size_t http_code, size_t last_visited, const URL &redirect); void handle_200_response(const std::string &data, size_t response_code, const std::string &ip, const URL &url); void handle_non_200_response(const std::string &data, size_t response_code, const std::string &ip, const URL &url); void check_for_captcha_block(const std::string &data, size_t response_code); void download_domain_data(); void download_robots(); bool robots_allow_url(const URL &url) const; std::string simple_get(const URL &url); void upload_domain_info(); void upload_robots_txt(const std::string &robots_content); URL filter_url(const URL &url); 
public: friend size_t curl_string_reader(char *ptr, size_t size, size_t nmemb, void *userdata); }; class scraper_stats { public: scraper_stats(); ~scraper_stats(); void gather_statistics(const std::map> &scrapers, size_t urls_in_queue); void start_thread(size_t timeout); void start_count(size_t urls_in_queue); void end_count(); void count_finished(const scraper &scraper); void count_unfinished(const scraper &scraper); private: std::thread m_thread; size_t m_timeout = 300; size_t m_num_blocked = 0; size_t m_finished_scrapers = 0; size_t m_unfinished_scrapers = 0; size_t m_scraped_urls = 0; size_t m_unfinished_scraped_urls = 0; size_t m_scraped_urls_non200 = 0; size_t m_unfinished_scraped_urls_non200 = 0; size_t m_scraped_errors = 0; size_t m_unfinished_scraped_errors = 0; size_t m_urls_in_queue = 0; size_t m_urls_assigned = 0; bool m_running = true; std::mutex m_lock; void run(); void log_report(size_t dt); }; size_t curl_string_reader(char *ptr, size_t size, size_t nmemb, void *userdata); size_t read_max_scrapers(); void url_downloader(); void run_scraper_on_urls(const std::vector &input_urls); } ================================================ FILE: src/scraper/scraper_store.cpp ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. 
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include "scraper_store.h" #include "common/system.h" #include "common/datetime.h" #include "warc/warc.h" #include "transfer/transfer.h" #include "logger/logger.h" using namespace std; namespace scraper { scraper_store::scraper_store() { } scraper_store::scraper_store(bool do_upload) : m_do_upload(do_upload) { } scraper_store::~scraper_store() { m_upload_limit = 0; upload_results(); upload_non_200_results(); } void scraper_store::add_scraper_data(const std::string &line) { m_lock.lock(); m_results.push_back(line); m_lock.unlock(); } void scraper_store::add_non_200_scraper_data(const std::string &line) { m_lock.lock(); m_non_200_results.push_back(line); m_lock.unlock(); } void scraper_store::add_link_data(const std::string &links) { m_lock.lock(); m_link_results.push_back(links); m_lock.unlock(); } void scraper_store::add_curl_error(const string &line) { m_lock.lock(); m_curl_errors.push_back(line); m_lock.unlock(); } void scraper_store::upload_url_datas() { if (!m_do_upload) return; m_lock.lock(); // todo upload data m_lock.unlock(); } void scraper_store::upload_domain_datas() { if (!m_do_upload) return; m_lock.lock(); // todo upload data m_lock.unlock(); } void scraper_store::upload_robots_datas() { if (!m_do_upload) return; m_lock.lock(); // todo upload data m_lock.unlock(); } void scraper_store::upload_results() { if (!m_do_upload) return; m_lock.lock(); if (m_results.size() >= m_upload_limit) { const string all_results = boost::algorithm::join(m_results, ""); const string all_link_results = 
boost::algorithm::join(m_link_results, ""); m_results.resize(0); m_link_results.resize(0); m_lock.unlock(); internal_upload_results(all_results, all_link_results); return; } m_lock.unlock(); } void scraper_store::upload_non_200_results() { if (!m_do_upload) return; m_lock.lock(); if (m_non_200_results.size() >= m_non_200_upload_limit) { const string all_results = boost::algorithm::join(m_non_200_results, ""); m_non_200_results.resize(0); m_lock.unlock(); internal_upload_non_200_results(all_results); return; } m_lock.unlock(); } void scraper_store::upload_curl_errors() { if (!m_do_upload) return; m_lock.lock(); if (m_curl_errors.size() >= m_curl_errors_upload_limit) { const string all_results = boost::algorithm::join(m_curl_errors, ""); m_curl_errors.resize(0); m_lock.unlock(); internal_upload_curl_errors(all_results); return; } m_lock.unlock(); } std::string scraper_store::tail() const { if (m_results.size() == 0) return ""; return m_results.back(); } void scraper_store::try_upload_until_complete(const string &path, const string &data) { size_t retry_num = 1; while (transfer::upload_gz_file(path, data) == transfer::ERROR) { LOG_INFO("Error uploading file " + path + " retry no " + to_string(retry_num++)); std::this_thread::sleep_for(std::chrono::seconds(30)); } } void scraper_store::internal_upload_results(const string &all_results, const string &all_link_results) { const string warc_path = "crawl-data/ALEXANDRIA-SCRAPER-01/files/" + common::uuid() + "-" + to_string(common::cur_datetime()) + "-" + to_string(m_file_index++) + ".warc.gz"; try_upload_until_complete(warc::get_result_path(warc_path), all_results); try_upload_until_complete(warc::get_link_result_path(warc_path), all_link_results); } void scraper_store::internal_upload_non_200_results(const string &all_results) { const string warc_path = "crawl-data/ALEXANDRIA-SCRAPER-01/non-200-responses/" + common::uuid() + "-" + to_string(common::cur_datetime()) + "-" + to_string(m_file_index++) + ".warc.gz"; 
try_upload_until_complete(warc::get_result_path(warc_path), all_results); } void scraper_store::internal_upload_curl_errors(const string &all_results) { const string warc_path = "crawl-data/ALEXANDRIA-SCRAPER-01/curl-errors/" + common::uuid() + "-" + to_string(common::cur_datetime()) + "-" + to_string(m_file_index++) + ".warc.gz"; try_upload_until_complete(warc::get_result_path(warc_path), all_results); } } ================================================ FILE: src/scraper/scraper_store.h ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #pragma once #include #include #include #include namespace scraper { /* * Responsible for storing scraper data on a file and upload it to our fileserver when the file reaches a number of urls. 
* */ class scraper_store { public: scraper_store(); scraper_store(bool do_upload); ~scraper_store(); void add_scraper_data(const std::string &line); void add_non_200_scraper_data(const std::string &line); void add_link_data(const std::string &links); void add_curl_error(const std::string &line); void upload_url_datas(); void upload_domain_datas(); void upload_robots_datas(); void upload_results(); void upload_non_200_results(); void upload_curl_errors(); std::string tail() const; std::vector get_results() const { return m_results; } private: std::mutex m_lock; std::vector m_results; std::vector m_non_200_results; std::vector m_link_results; std::vector m_curl_errors; size_t m_file_index = 0; size_t m_upload_limit = 50000; size_t m_non_200_upload_limit = 10000; size_t m_curl_errors_upload_limit = 10000; bool m_do_upload = true; void try_upload_until_complete(const std::string &path, const std::string &data); void internal_upload_results(const std::string &all_results, const std::string &all_link_results); void internal_upload_non_200_results(const std::string &all_results); void internal_upload_curl_errors(const std::string &all_results); }; } ================================================ FILE: src/scraper.cpp ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. 
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include #include #include #include #include "fcgio.h" #include "config.h" #include "logger/logger.h" #include "scraper/scraper.h" using namespace std; void custom_scraper() { set files = { "1081037252118226853.gz", "10929784512354426297.gz", "11734959054377540990.gz", "1231587059077024966.gz", "12502184239462757041.gz", "12938836205580400636.gz", "13296278169331508461.gz", "14413462586171452382.gz", "15525439295995440529.gz", "16672519014390713150.gz", "18394430357962364895.gz", "10327881400750748691.gz", "10670281930934377105.gz", "10803309592637608156.gz", "1081037252118226853.gz", "10834835858785818363.gz", "10929784512354426297.gz", "11126428663436160103.gz", "11147566439172409894.gz", "11190665490273023949.gz", "11494937404220367031.gz", "11734959054377540990.gz", "11828921816388240862.gz", "12060772154545358825.gz", "12162727308599252185.gz", "1231587059077024966.gz", "12422730800151531594.gz", "12502184239462757041.gz", "12607232937660003080.gz", "12718743898666138934.gz", "12938836205580400636.gz", "13296278169331508461.gz", "13298202493829067141.gz", "13361744378846796689.gz", "13490885160851937523.gz", "13574739826384812082.gz", "13587802784601809709.gz", "13631835647153009173.gz", "1367770908792956967.gz", "14046839555269968094.gz", "14413462586171452382.gz", "14541904792326560616.gz", "1482373106349460952.gz", "14837337010216722341.gz", "15086873759162732674.gz", "15141235398943116798.gz", "15184607826907101421.gz", "15202491165257081552.gz", "15282359210281111669.gz", 
"15389582257311135463.gz", "15391345478373482283.gz", "15525439295995440529.gz", "15534406110118601925.gz", "15538335442391548855.gz", "15612477389751002303.gz", "15624474507591924007.gz", "15676254393982196237.gz", "15984927866124019398.gz", "16082148041043793761.gz", "16126091541072713257.gz", "16255682052513253306.gz", "16337701239641827376.gz", "16383716280375787103.gz", "16529912269361020733.gz", "16534544105461457700.gz", "16639969140692056885.gz", "16672519014390713150.gz", "16744732358440828846.gz", "16836166158893839160.gz", "17068835535637839797.gz", "1729061688188470388.gz", "17360561405055540730.gz", "1746843565446970019.gz", "17640709097762418065.gz", "18131842535353305093.gz", "18187211227753083566.gz", "18394430357962364895.gz", "1934117982241616211.gz", "2211216046817783595.gz", "2239809113491403275.gz", "2327635888646701575.gz", "2478041411438244752.gz", "2551177065288807556.gz", "2601237824066336189.gz", "2646934360799240353.gz", "2868212837076456812.gz", "2926810779085983621.gz", "3091319073926623211.gz", "338937183383628192.gz", "3604690558929123764.gz", "3606044194188728481.gz", "3852426225324652244.gz", "3972328001646307399.gz", "4007769859008228127.gz", "4072548759689568430.gz", "4193623627004305293.gz", "4226856446620685890.gz", "4312881270332666532.gz", "4473520710685818343.gz", "4720198542499220909.gz", "4734886902380514989.gz", "4800764859071121577.gz", "4837392932044495189.gz", "493001789945179170.gz", "5263808122620003539.gz", "5284265763220135234.gz", "5322267948444699594.gz", "5339170779334172446.gz", "5496827761574196815.gz", "5683557192991319856.gz", "5772366474889297285.gz", "5790856524309526271.gz", "5853082621493931535.gz", "5936310530969939988.gz", "5958586233415593683.gz", "5969382542874041237.gz", "5969882935831645732.gz", "6133590028181400561.gz", "6168304203247739410.gz", "619121932569169133.gz", "6233832895907042056.gz", "6371233587304885182.gz", "6665598992901336677.gz", "6747719063536596803.gz", "6783121411632321193.gz", 
"6878954272251422334.gz", "6944679014837000907.gz", "7204366432079867323.gz", "7261759399318904627.gz", "7279922463899918193.gz", "7372161099870305017.gz", "7483704574748382827.gz", "7500975006697782336.gz", "7577940383110528297.gz", "7660839115654270407.gz", "7690859939878490358.gz", "7794216653216203685.gz", "7969521158007747392.gz", "7972503305086309118.gz", "7977087069524267698.gz", "801925665986995127.gz", "8357461134896215565.gz", "8473327975000475483.gz", "8558287370764624669.gz", "88637784417391575.gz", "9219910288440466216.gz", "9257832192261807811.gz", "9300442310473380111.gz", "9529889625719263624.gz", "9668036200275969373.gz", "990293958999783642.gz" }; boost::filesystem::create_directories("output"); for (string file : files) { ifstream infile("output/" + file); if (infile.is_open()) continue; stringstream ss; int error; transfer::gz_file_to_stream("crawl-data/ALEXANDRIA-TEST-SIZES/files/" + file, ss, error); if (error == transfer::OK) { string line; scraper::scraper_store store(false); map> scrapers; while (getline(ss, line)) { vector cols; boost::algorithm::split(cols, line, boost::is_any_of("\t")); URL url(cols[0]); if (scrapers.count(url.host()) == 0) { scrapers[url.host()] = make_unique(url.host(), &store); scrapers[url.host()]->set_timeout(0); } scrapers[url.host()]->push_url(url); } for (auto &_scraper : scrapers) { _scraper.second->run(); } const string filename = "output/" + file; ofstream outfile(filename, ios::trunc | ios::binary); boost::iostreams::filtering_ostream compress_stream; compress_stream.push(boost::iostreams::gzip_compressor()); compress_stream.push(outfile); for (const string row : store.get_results()) { compress_stream << row; } } return; } /* scraper::scraper_store store(false); scraper::scraper _scraper("heroes.thelazy.net", &store); _scraper.set_timeout(0); _scraper.push_url(URL("https://heroes.thelazy.net//index.php/Main_Page")); _scraper.push_url(URL("https://heroes.thelazy.net//index.php/Dungeon")); _scraper.run(); for 
(const string row : store.get_results()) { cout << row << endl; }*/ } int main(int argc, const char **argv) { struct sigaction act{SIG_IGN}; sigaction(SIGPIPE, &act, NULL); logger::start_logger_thread(); if (getenv("ALEXANDRIA_CONFIG") != NULL) { config::read_config(getenv("ALEXANDRIA_CONFIG")); } else { config::read_config("/etc/alexandria.conf"); } custom_scraper(); logger::join_logger_thread(); return 0; } ================================================ FILE: src/search_engine/search_allocation.h ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #pragma once #include "full_text/result_set.h" #include "full_text/record.h" #include "full_text/link_record.h" #include "full_text/domain_link_record.h" #include "config.h" #include #include namespace search_engine { /* The idea with this namespace is to handle all the memory allocation needed for serving a request to the search engine. */ template struct storage { /* result_sets holds pre-allocated object of class full_text::result_set. result_sets[0 ... config::query_max_words] */ std::vector>> m_result_sets; // To hold the intersection of the result sets. std::unique_ptr> m_intersected_result; }; class allocation { public: allocation() { m_storage = create_storage(); m_link_storage = std::make_unique(); m_domain_link_storage = std::make_unique(); } private: std::unique_ptr> m_storage; std::unique_ptr> m_link_storage; std::unique_ptr> m_domain_link_storage; }; template std::unique_ptr> *create_storage() { auto storage = new Storage; // Allocate result_sets. for (size_t j = 0; j < config::query_max_words; j++) { auto result_set = std::make_unique>(config::ft_max_results_per_section * config::ft_max_sections); storage->result_sets.push_back(std::move(result_set)); } storage->intersected_result = std::make_unique>(config::ft_max_results_per_section * config::ft_max_sections); return storage; } allocation *create_allocation(); void delete_allocation(allocation *allocation); } ================================================ FILE: src/search_engine/search_engine.cpp ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #include "search_engine.h" #include using namespace std; namespace search_engine { void reset_search_metric(struct full_text::search_metric &metric) { metric.m_total_found = 0; metric.m_total_url_links_found = 0; metric.m_total_domain_links_found = 0; metric.m_links_handled = 0; metric.m_link_domain_matches = 0; metric.m_link_url_matches = 0; } std::vector search_deduplicate(storage *storage, const full_text::index &index, const vector &links, const vector &domain_links, const string &query, size_t limit, struct full_text::search_metric &metric) { vector complete_result = search_wrapper(storage, index, links, domain_links, query, config::pre_result_limit, metric); vector deduped_result = deduplicate_result_vector(complete_result, limit); return deduped_result; } } ================================================ FILE: src/search_engine/search_engine.h ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #pragma once #include #include #include #include "full_text/index.h" #include "full_text/record.h" #include "full_text/link_record.h" #include "full_text/domain_link_record.h" #include "full_text/shard.h" #include "full_text/search_metric.h" #include "logger/logger.h" #include "profiler/profiler.h" #include "parser/parser.h" #include "transfer/transfer.h" #include "algorithm/hash.h" #include "algorithm/sort.h" #include "algorithm/algorithm.h" #include "search_allocation.h" #include namespace search_engine { using std::string; using std::vector; using std::future; using std::thread; using std::span; using std::pair; using std::map; using std::unordered_map; /* Public interface */ /* Our main search routine, no deduplication just raw search. */ template vector search(storage *storage, const full_text::index &index, const vector &links, const vector &domain_links, const string &query, size_t limit, struct full_text::search_metric &metric); /* Only for FullTextRecords since deduplication requires domain hashes. */ vector search_deduplicate(storage *storage, const full_text::index &index, const vector &links, const vector &domain_links, const string &query, size_t limit, struct full_text::search_metric &metric); /* Search for the exact phrase. Will treat the whole phrase as an n_gram so will only give results when num words in query are less or equal to config::n_gram. 
*/ template vector search_exact(storage *storage, const full_text::index &index, const string &query, size_t limit, struct full_text::search_metric &metric); template vector search_ids(storage *storage, const full_text::index &index, const string &query, size_t limit); template full_text::result_set *search_remote(const std::string &query, storage *storage); template class comparator_class { public: // Comparator function bool operator()(data_record &a, data_record &b) { if (a.m_score == b.m_score) return a.m_value < b.m_value; return a.m_score > b.m_score; } }; void reset_search_metric(struct full_text::search_metric &metric); template void set_total_found(const vector *> result_vector, struct full_text::search_metric &metric, double result_quote) { size_t largest_total = 0; for (full_text::result_set *result : result_vector) { if (result->total_num_results() > largest_total) { largest_total = result->total_num_results(); } } metric.m_total_found = (size_t)(largest_total * result_quote); } template size_t largest_result(const vector *> &result_vector) { size_t largest_size = 0; for (full_text::result_set *result : result_vector) { if (result->size() > largest_size) { largest_size = result->size(); } } return largest_size; } /* Add scores for the given links to the result set. The links are assumed to be ordered by link.m_target_hash ascending. 
*/
	/*
		Only acts when data_record is full_text::record. For any other record type, or when links is empty,
		nothing is changed and 0 is returned.

		Walks links and the result records side by side, comparing link.m_target_hash with record.m_value and
		advancing the side that holds the smaller hash. On a match the score of the record is raised by
		expm1(25 * link.m_score) / 50, but only the first time a (m_source_domain, m_target_hash) pair is seen.

		Returns the number of links that contributed to a score.

		NOTE(review): the side by side walk presumably relies on the results being ordered by m_value ascending
		as well, confirm against the callers.
	*/
	template size_t apply_link_scores(const vector &links, full_text::result_set *results) {

		if (typeid(data_record) != typeid(full_text::record)) return 0;

		if (links.size() == 0) return 0;

		size_t applied_links = 0;

		// i indexes links, j indexes the result records.
		size_t i = 0;
		size_t j = 0;

		// Keys already seen, used to count each source domain only once per target url.
		map, uint64_t> domain_unique;
		full_text::record *data = (full_text::record *)results->data_pointer();
		while (i < links.size() && j < results->size()) {

			const uint64_t hash1 = links[i].m_target_hash;
			const uint64_t hash2 = data[j].m_value;

			if (hash1 < hash2) {
				// The link points to an url that is not in the results.
				i++;
			} else if (hash1 == hash2) {

				if (domain_unique.count(std::make_pair(links[i].m_source_domain, links[i].m_target_hash)) == 0) {
					const float url_score = expm1(25.0f*links[i].m_score) / 50.0f;
					data[j].m_score += url_score;
					applied_links++;
					domain_unique[std::make_pair(links[i].m_source_domain, links[i].m_target_hash)] = links[i].m_source_domain;
				}

				// j is left alone since several links can target the same record.
				i++;
			} else {
				j++;
			}
		}

		return applied_links;
	}

	/*
		Only acts when data_record is full_text::record. For any other record type, or when links is empty,
		nothing is changed and 0 is returned.

		First accumulates a score and a link count per target domain, counting each
		(m_source_domain, m_target_domain) pair once. Then adds the accumulated score of the matching domain to
		every result record.

		Returns the sum, taken over all result records, of the link count of the record's domain.
	*/
	template size_t apply_domain_link_scores(const vector &links, full_text::result_set *results) {

		if (typeid(data_record) != typeid(full_text::record)) return 0;

		if (links.size() == 0) return 0;

		size_t applied_links = 0;
		{
			std::unordered_map domain_scores;
			std::unordered_map domain_counts;
			std::map, uint64_t> domain_unique;
			{
				for (const full_text::domain_link_record &link : links) {

					if (domain_unique.count(std::make_pair(link.m_source_domain, link.m_target_domain)) == 0) {

						const float domain_score = expm1(25.0f*link.m_score) / 50.0f;
						domain_scores[link.m_target_domain] += domain_score;
						domain_counts[link.m_target_domain]++;
						domain_unique[std::make_pair(link.m_source_domain, link.m_target_domain)] = link.m_source_domain;
					}
				}
			}

			// Loop over the results and add the calculated domain scores.
			// operator[] is used for the lookups, so a domain without links yields (and inserts) a zero entry.
			full_text::record *data = (full_text::record *)results->data_pointer();
			for (size_t i = 0; i < results->size(); i++) {
				const float domain_score = domain_scores[data[i].m_domain_hash];
				data[i].m_score += domain_score;
				applied_links += domain_counts[data[i].m_domain_hash];
			}
		}

		return applied_links;
	}

	/*
		Binary search in data[pos, len). Returns the first index whose m_value is not less than value, or the
		given len when there is no such element.

		NOTE(review): assumes the range is ordered by m_value ascending.
	*/
	template size_t lower_bound(const data_record *data, size_t pos, size_t len, uint64_t value) {

		while (pos < len) {
			size_t m = (pos + len) >> 1;
			if (data[m].m_value < value) {
				pos = m + 1;
			} else {
				len = m;
			}
		}
		return pos;
	}

	/*
		Appends to dest every record whose m_value is present in all of the given result sets. sections[i]
		selects which section of result_sets[i] is read (through section_pointer).

		The result set with the smallest size() drives the loop. For each of its values the read position of
		every result set is moved forward until it reaches a value that is not smaller. The record that is
		appended is the one from the driving set, with m_score replaced by the sum of the matching scores divided
		by the number of result sets.

		Does nothing when result_sets is empty.
	*/
	template void value_intersection(const vector *> &result_sets, vector sections, vector &dest) {

		if (result_sets.size() == 0) {
			return;
		}

		// Find the result set with the fewest records.
		size_t shortest_vector_position = 0;
		size_t shortest_len = SIZE_MAX;
		{
			size_t iter_index = 0;
			for (full_text::result_set *result_set : result_sets) {
				if (shortest_len > result_set->size()) {
					shortest_len = result_set->size();
					shortest_vector_position = iter_index;
				}
				iter_index++;
			}
		}

		// One read position per result set. They are kept between iterations of the outer loop.
		vector positions(result_sets.size(), 0);
		const data_record *shortest_data = result_sets[shortest_vector_position]->section_pointer(sections[shortest_vector_position]);

		while (positions[shortest_vector_position] < shortest_len) {

			bool all_equal = true;

			uint64_t value = shortest_data[positions[shortest_vector_position]].m_value;

			float score_sum = 0.0f;
			size_t iter_index = 0;
			for (full_text::result_set *result_set : result_sets) {
				const data_record *data_arr = result_set->section_pointer(sections[iter_index]);
				const size_t len = result_set->size();

				size_t *pos = &(positions[iter_index]);
				// this is a linear search.
				while (*pos < len && value > data_arr[*pos].m_value) {
					(*pos)++;
				}
				if (*pos < len && value == data_arr[*pos].m_value) {
					const float score = data_arr[*pos].m_score;
					score_sum += score;
				}
				// Either this set has moved past the value or it is exhausted, so the value can not be in all sets.
				if ((*pos < len && value < data_arr[*pos].m_value) || *pos >= len) {
					all_equal = false;
					break;
				}
				iter_index++;
			}
			if (all_equal) {
				dest.push_back(shortest_data[positions[shortest_vector_position]]);
				dest.back().m_score = score_sum / result_sets.size();
			}

			positions[shortest_vector_position]++;
		}
	}

	/*
		Calculates the intersection of the given result sets and copies it into dest.

		Returns without touching dest as soon as one of the result sets is empty. Otherwise the result sets are
		ordered by total_num_results() ascending and Algorithm::incremental_partitions produces, from the number
		of sections of each set and config::ft_section_depth, the section combinations to intersect.

		The first partition is tried on its own and if it gives at least config::result_limit records that is the
		result. If not, every result set is read up to the highest section index any partition refers to, all
		partitions are intersected on a ThreadPool with 8 threads and the partial results are merged by m_value.
	*/
	template void calculate_intersection(const vector *> &result_sets, full_text::result_set *dest) {

		for (full_text::result_set *result : result_sets) {
			if (result->size() == 0) return;
		}

		// Copy of the pointers, the order of the caller's vector is not changed.
		vector *> sorted_result_sets(result_sets);
		sort(sorted_result_sets.begin(), sorted_result_sets.end(), [](const full_text::result_set *a, const full_text::result_set *b) {
			return a->total_num_results() < b->total_num_results();
		});

		vector lengths;
		for (full_text::result_set *result : sorted_result_sets) {
			lengths.push_back(result->num_sections());
		}

		vector> partitions = Algorithm::incremental_partitions(lengths, config::ft_section_depth);

		// First just try the top sections.
		// NOTE(review): partitions[0] is read without checking that partitions is non-empty.
		{
			vector result;
			value_intersection(sorted_result_sets, partitions[0], result);
			if (result.size() >= config::result_limit) {
				dest->copy_vector(result);
				return;
			}
		}

		// For each result set, the highest section index used by any partition.
		vector maximum(sorted_result_sets.size(), 0);
		for (const vector &vec : partitions) {
			for (size_t i = 0; i < vec.size(); i++) {
				if (vec[i] > maximum[i]) maximum[i] = vec[i];
			}
		}

		for (size_t i = 0; i < maximum.size(); i++) {
			sorted_result_sets[i]->read_to_section(maximum[i]);
		}

		size_t idx = 0;
		const size_t num_threads = 8;
		ThreadPool pool(num_threads);
		vector> results(partitions.size());
		std::vector>> thread_results;

		// One task per partition. The lambda captures copies of the pointer vector and of the partition.
		for (const vector &partition : partitions) {

			thread_results.emplace_back(pool.enqueue([sorted_result_sets, partition]() {
				vector result;
				value_intersection(sorted_result_sets, partition, result);
				return result;
			}));

			// idx is not read in this loop, it is reset before the collection loop below.
			idx++;
		}

		// Collect the partial results in the same order as the partitions.
		idx = 0;
		for (auto && result: thread_results) {
			results[idx] = result.get();
			idx++;
		}

		// merge
		vector merged_vec;
		Sort::merge_arrays(results, [](const data_record &a, const data_record &b) {
			return a.m_value < b.m_value;
		}, merged_vec);

		// copy.
		dest->copy_vector(merged_vec);
	}

	/*
		Sorts the records by m_score, highest score first.
	*/
	template void sort_by_score(vector &results) {
		sort(results.begin(), results.end(), [](const data_record &a, const data_record &b) {
			return a.m_score > b.m_score;
		});
	}

	/*
		puts the top n elements in the first n slots of results. Then sorts those top n elements by value.

		this function assumes that the input results are sorted by value!
so it does nothing for n < results.size() */ template void get_unsorted_results_with_top_scores(full_text::result_set *result, size_t n) { if (result->size() > n) { span *arr = result->span_pointer(); nth_element(arr->begin(), arr->begin() + (n - 1), arr->end(), SearchEngine::comparator_class{}); sort(arr->begin(), arr->begin() + n, [](const data_record &a, const data_record &b) { return a.m_value < b.m_value; }); result->resize(n); } } template bool result_has_many_domains(const full_text::result_set *results) { if (results->size() == 0) return false; const data_record *data = results->data_pointer(); const uint64_t first_domain_hash = data[0].m_domain_hash; for (size_t i = 0; i < results->size(); i++) { if (data[i].m_domain_hash != first_domain_hash) { return true; } } return false; } template void deduplicate_domains(full_text::result_set *results, size_t results_per_domain, size_t limit) { vector deduplicate; unordered_map domain_counts; data_record *records = results->data_pointer(); size_t j = 0; for (size_t i = 0; i < results->size() && j < limit; i++) { records[j] = records[i]; if (domain_counts[records[i].m_domain_hash] < results_per_domain) { j++; domain_counts[records[i].m_domain_hash]++; } } results->resize(j); } template vector deduplicate_result_vector(const vector &results, size_t limit) { vector deduped; vector non_deduped; map d_count; for (const data_record &result : results) { if (d_count[result.m_domain_hash] < config::deduplicate_domain_count) { deduped.push_back(result); } else { non_deduped.push_back(result); } d_count[result.m_domain_hash]++; } if (deduped.size() < limit) { const size_t num_missing = limit - deduped.size(); if (non_deduped.size() > num_missing) { non_deduped.resize(num_missing); } vector ret; Sort::merge_arrays(deduped, non_deduped, [] (const data_record &a, const data_record &b) { return a.m_score > b.m_score; }, ret); return ret; } deduped.resize(limit); return deduped; } template vector *> search_shards(vector *> 
&result_sets, const vector *> &shards, const vector &words) { assert(words.size() <= config::query_max_words); assert(words.size() <= result_sets.size()); vector *> result_vector; vector searched_words; size_t word_id = 0; for (const string &word : words) { // One word should only be searched once. if (find(searched_words.begin(), searched_words.end(), word) != searched_words.end()) continue; searched_words.push_back(word); uint64_t word_hash = Hash::str(word); shards[word_hash % config::ft_num_shards]->find(word_hash, result_sets[word_id]); result_vector.push_back(result_sets[word_id]); word_id++; } return result_vector; } template vector *> search_shards_exact(vector *> &result_sets, const vector *> &shards, const vector &words) { assert(words.size() <= config::query_max_words); assert(words.size() <= result_sets.size()); vector *> result_vector; uint64_t n_gram_hash = Hash::str(boost::join(words, " ")); shards[n_gram_hash % config::ft_num_shards]->find(n_gram_hash, result_sets[0]); result_vector.push_back(result_sets[0]); return result_vector; } template full_text::result_set *make_search(storage *storage, const vector *> &shards, const vector &links, const vector &domain_links, const string &query, size_t limit, struct full_text::search_metric &metric) { reset_search_metric(metric); vector words = Text::get_full_text_words(query, config::query_max_words); if (words.size() == 0) return new full_text::result_set(0); vector *> result_vector = search_shards(storage->result_sets, shards, words); full_text::result_set *flat_result; if (result_vector.size() > 1) { // We need to calculate the intersection of the given results. flat_result = storage->intersected_result; flat_result->resize(0); calculate_intersection(result_vector, flat_result); set_total_found(result_vector, metric, (double)flat_result->size() / largest_result(result_vector)); } else { flat_result = result_vector[0]; set_total_found(result_vector, metric, 1.0); } // Close file pointers. 
for (full_text::result_set *result_set : result_vector) { result_set->close_sections(); } metric.m_link_domain_matches = apply_domain_link_scores(domain_links, flat_result); metric.m_link_url_matches = apply_link_scores(links, flat_result); get_unsorted_results_with_top_scores(flat_result, limit); return flat_result; } template full_text::result_set *make_search_exact(storage *storage, const vector *> &shards, const string &query, size_t limit, struct full_text::search_metric &metric) { reset_search_metric(metric); vector words = Text::get_full_text_words(query, config::query_max_words); if (words.size() == 0) return new full_text::result_set(0); vector *> result_vector = search_shards_exact(storage->result_sets, shards, words); full_text::result_set *flat_result; if (result_vector.size() > 1) { // We need to calculate the intersection of the given results. flat_result = storage->intersected_result; flat_result->resize(0); calculate_intersection(result_vector, flat_result); set_total_found(result_vector, metric, (double)flat_result->size() / largest_result(result_vector)); } else { flat_result = result_vector[0]; set_total_found(result_vector, metric, 1.0); } // Close file pointers. for (full_text::result_set *result_set : result_vector) { result_set->close_sections(); } get_unsorted_results_with_top_scores(flat_result, limit); return flat_result; } template vector search_wrapper(storage *storage, const full_text::index &index, const vector &links, const vector &domain_links, const string &query, size_t limit, struct full_text::search_metric &metric) { full_text::result_set *result = make_search(storage, index.shards(), links, domain_links, query, limit, metric); vector complete_result(result->span_pointer()->begin(), result->span_pointer()->end()); // Sort. 
sort_by_score(complete_result); return complete_result; } template vector search_wrapper_exact(storage *storage, const full_text::index &index, const string &query, size_t limit, struct full_text::search_metric &metric) { full_text::result_set *result = make_search_exact(storage, index.shards(), query, limit, metric); vector complete_result(result->span_pointer()->begin(), result->span_pointer()->end()); // Sort. sort_by_score(complete_result); return complete_result; } template vector search(storage *storage, const full_text::index &index, const vector &links, const vector &domain_links, const string &query, size_t limit, struct full_text::search_metric &metric) { vector complete_result = search_wrapper(storage, index, links, domain_links, query, limit, metric); if (complete_result.size() > limit) { complete_result.resize(limit); } return complete_result; } template vector search_exact(storage *storage, const full_text::index &index, const string &query, size_t limit, struct full_text::search_metric &metric) { vector complete_result = search_wrapper_exact(storage, index, query, limit, metric); if (complete_result.size() > limit) { complete_result.resize(limit); } return complete_result; } template vector search_ids(storage *storage, const full_text::index &index, const string &query, size_t limit) { vector words = text::get_expanded_full_text_words(query); uint64_t key = algorithm::hash(boost::algorithm::join(words, " ")); index.shards()[key % config::ft_num_shards]->find(key, storage->result_sets[0]); vector ret(storage->result_sets[0]->span_pointer()->begin(), storage->result_sets[0]->span_pointer()->end()); storage->result_sets[0]->close_sections(); return ret; } template full_text::result_set *search_remote(const std::string &query, storage *storage) { storage->result_sets[0]->resize(0); string buffer; int error; transfer::url_to_string(config::data_node + "/?i=" + parser::urlencode(query), buffer, error); if (error == transfer::OK) { const size_t num_records 
= buffer.size() / sizeof(data_record); data_record *data_ptr = storage->result_sets[0]->data_pointer(); memcpy(data_ptr, buffer.c_str(), buffer.size()); storage->result_sets[0]->resize(num_records); } return storage->result_sets[0]; } } ================================================ FILE: src/server/search_server.cpp ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include "search_server.h" #include #include "http/server.h" #include "indexer/index_manager.h" #include "indexer/url_record.h" #include "hash_table2/hash_table.h" #include "transfer/transfer.h" #include "parser/parser.h" #include "parser/unicode.h" #include "api/result_with_snippet.h" #include "api/api_response.h" #include "full_text/search_metric.h" namespace server { void search_server() { indexer::index_manager idx_manager; cout << "starting server..." 
<< endl; ::http::server srv([&idx_manager](const http::request &req) { http::response res; res.content_type("application/json"); URL url = req.url(); auto query = url.query(); size_t limit = 1000; if (query.count("limit")) limit = std::stoi(query["limit"]); (void)limit; if (url.path() == "/favicon.ico") { res.code(404); res.body("404"); return res; } stringstream body; // implement the same search server logic we have on alexandria.org now. LOG_INFO("Serving request: " + url.path()); bool deduplicate = true; if (query.find("d") != query.end()) { if (query["d"] == "a") { deduplicate = false; } } if (query.find("q") != query.end() && deduplicate) { full_text::search_metric metric; profiler::instance profiler; auto results = idx_manager.find(query["q"], metric); api::api_response api_res(results, metric, profiler.get()); body << api_res; } res.code(200); res.body(body.str()); return res; }); } } ================================================ FILE: src/server/search_server.h ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #pragma once namespace server { void search_server(); } ================================================ FILE: src/server/url_server.cpp ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include "url_server.h" #include #include "http/server.h" #include "indexer/index_manager.h" #include "indexer/url_record.h" namespace server { void url_server() { cout << "starting server..." 
<< endl; ::http::server srv([](const http::request &req) { http::response res; URL url = req.url(); auto query = url.query(); stringstream body; if (req.request_method() == "POST") { const string req_body = req.request_body(); const size_t num_hashes = req_body.size() / sizeof(uint64_t); std::vector domain_hashes(num_hashes); memcpy((char *)domain_hashes.data(), req_body.c_str(), num_hashes * sizeof(uint64_t)); auto tokens = text::get_tokens(query["q"]); size_t len = std::stoull(query["len"]); std::map> results; utils::thread_pool pool(32); std::mutex result_lock; cout << "received " << domain_hashes.size() << " hashes" << endl; size_t all_total_num_results = 0; for (auto dom_hash : domain_hashes) { pool.enqueue([dom_hash, tokens, &query, &result_lock, &results, &all_total_num_results, len]() { std::vector res; vector links; { // read links const string file = config::data_path() + "/" + to_string(dom_hash % 8) + "/full_text/url_links/" + to_string(dom_hash) + ".data"; indexer::index_reader_file reader(file); if (reader.size()) { if (reader.size() > 10 * 1024* 1024) { indexer::index idx("url_links", dom_hash, 1000); links = idx.find_top(tokens, 1000); } else { const size_t size = reader.size(); std::unique_ptr buffer = std::make_unique(size); reader.seek(0); reader.read(buffer.get(), size); std::istringstream ram_reader(string(buffer.get(), size)); indexer::index idx(&ram_reader, 1000); links = idx.find_top(tokens, 1000); } } std::sort(links.begin(), links.end(), indexer::link_record::storage_order()); auto link_formula = [](float score) { return expm1(20.0f * score) / 10.0f; }; std::vector grouped; for (auto rec : links) { if (grouped.size() && grouped.back().storage_equal(rec)) { grouped.back().m_score += link_formula(rec.m_score); } else { grouped.emplace_back(rec); grouped.back().m_score = link_formula(rec.m_score); } } links = grouped; } const string file = config::data_path() + "/" + to_string(dom_hash % 8) + "/full_text/url/" + to_string(dom_hash) + ".data"; 
indexer::index_reader_file reader(file); size_t mod_incr = 0; auto score_mod = [&mod_incr, &links](const indexer::url_record &record) { while (mod_incr < links.size() && links[mod_incr].m_target_hash < record.m_value) { mod_incr++; } float link_score = 0.0f; if (mod_incr < links.size() && links[mod_incr].m_target_hash == record.m_value) { link_score += links[mod_incr].m_score; } return record.m_score + ((1000.0f - record.url_length()) / 500.0f) + link_score; }; size_t total_num_results = 0; if (reader.size()) { if (reader.size() > 10 * 1024* 1024) { indexer::index idx("url", dom_hash, 1000); res = idx.find_top(total_num_results, tokens, len, score_mod); } else { const size_t size = reader.size(); std::unique_ptr buffer = std::make_unique(size); reader.seek(0); reader.read(buffer.get(), size); std::istringstream ram_reader(std::string(buffer.get(), size)); indexer::index idx(&ram_reader, 1000); res = idx.find_top(total_num_results, tokens, len, score_mod); } } std::lock_guard lock(result_lock); all_total_num_results += total_num_results; results[dom_hash] = res; }); } pool.run_all(); // Output result. body.write((char *)&all_total_num_results, sizeof(size_t)); for (auto domain_hash : domain_hashes) { body.write((char *)&domain_hash, sizeof(uint64_t)); size_t num_records = results[domain_hash].size(); body.write((char *)&num_records, sizeof(size_t)); for (const auto &record : results[domain_hash]) { body.write((char *)&(record.m_value), sizeof(uint64_t)); body.write((char *)&(record.m_score), sizeof(float)); } } res.content_type("application/octet-stream"); } res.code(200); const string res_str = body.str(); cout << "outputting: " << res_str.size() << endl; res.body(res_str); return res; }); } } ================================================ FILE: src/server/url_server.h ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. 
*
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

namespace server {

	/*
		Runs the url server. Defined in src/server/url_server.cpp.
	*/
	void url_server();

}

================================================
FILE: src/server.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, , et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include #include #include #include "fcgio.h" #include "config.h" #include "logger/logger.h" #include "profiler/profiler.h" #include "indexer/console.h" #include "json.hpp" #include "server/search_server.h" #include "server/url_server.h" #include using namespace std; int main(int argc, const char **argv) { struct sigaction act{SIG_IGN}; sigaction(SIGPIPE, &act, NULL); logger::start_logger_thread(); if (getenv("ALEXANDRIA_CONFIG") != NULL) { config::read_config(getenv("ALEXANDRIA_CONFIG")); } else { config::read_config("/etc/alexandria.conf"); } const string arg(argc > 1 ? argv[1] : ""); server::search_server(); logger::join_logger_thread(); return 0; } ================================================ FILE: src/stats/stats.h ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. 
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #pragma once #include "config.h" #include "text/text.h" #include "full_text/full_text_index.h" #include "full_text/full_text_shard.h" namespace stats { std::hash hasher; template std::map word_stats(const full_text::full_text_index &index, const std::string &query, size_t index_size); template std::map get_word_counts(const std::vector *> &shards, const std::string &query) { std::vector words = text::get_full_text_words(query); if (words.size() == 0) return {}; std::map result; std::vector searched_words; for (const std::string &word : words) { // One word should only be searched once. if (find(searched_words.begin(), searched_words.end(), word) != searched_words.end()) continue; searched_words.push_back(word); uint64_t word_hash = hasher(word); result[word] = shards[word_hash % config::ft_num_shards]->total_num_results(word_hash); } return result; } template std::map word_stats(const full_text::full_text_index &index, const std::string &query, size_t index_size) { std::map complete_result = get_word_counts(index.shards(), query); for (const auto &iter : complete_result) { complete_result[iter.first] /= index_size; } return complete_result; } } ================================================ FILE: src/text/stopwords.cpp ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #include "stopwords.h" using namespace std; bool stopwords::is_stop_word(const string &word) { return (s_english.find(word) != s_english.end()) || (s_swedish.find(word) != s_swedish.end()); } set stopwords::s_english{ "the", "of", "and", "in", "to", "a", "is", "as", "for", "was", "by", "that", "with", "on", "from", "are", "an", "or", "it", "at", "his", "be", "which", "this", "he", "were", "not", "also", "has", "have", "its", "their", "but", "first", "had", "one", "other", "new", "they", "such", "been", "can", "after", "more", "who", "two", "all", "some", "most", "may", "into", "when", "between", "than", "there", "these", "during", "only", "many", "time", "would", "states", "no", "over", "about", "while", "use", "both", "if", "where", "then", "i", "through", "since", "being", "made", "became", "part", "her", "de", "three", "any", "up", "each", "them", "often", "will", "him", "so", "out", "same", "because", "well", "several", "form", "name", "could", "although", "set", "different", "1", "2", "3", "4", "5", "6", "7", "8", "9", "0" }; set stopwords::s_swedish{ "och", "i", "av", "som", "en", "att", "till", "den", "med", "på", "är", "för", "det", "de", "ett", "var", "från", "har", "om", "vid", "inte", "även", "eller", "sig", "men", "efter", "man", "kan", "sin", "där", "andra", "hade", "blev", "då", "första", "finns", "mot", "sedan", "så", "genom", "över", "detta", "också", "bland", "mellan", "två", "när", "fick", "samt", "skulle", "annat", "dock", "denna", "inom", "olika", "vilket", "ut", "flera", "se", "vara", "upp", "ha", "senare", "många", "kom", "än", "dessa", "alla", "samma", "del", "stora", "sitt", "sina", "mycket", "tre", "mer", "utan", "nya", "ofta", "enligt", "blir", "några", "kunde", "hela", "gjorde", "varit", "här", "ska", "eftersom", "få", "fanns", "bara", "något", "kommer", "både", "kallas", "vissa", "får", "cirka", "ur", "endast", "tog", "dem", "medan", "redan", "fyra", "någon", "nu", "går", "innan", "bli", "allt", "därefter", "därför", "hur", "varje", 
"per", "åt", "antal", "delen", "vilken", "vad", "helt", "sätt", "vill", "åren", "gör", "kallade", "främst", "båda", "själv", "1", "2", "3", "4", "5", "6", "7", "8", "9", "0" }; ================================================ FILE: src/text/stopwords.h ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #pragma once #include #include class stopwords { public: static bool is_stop_word(const std::string &word); private: static std::set s_english; static std::set s_swedish; }; ================================================ FILE: src/text/text.cpp ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #include "text.h" namespace text { bool is_clean_char(const char *ch, size_t multibyte_len) { if (multibyte_len == 1) { return (ch[0] >= 'a' && ch[0] <= 'z') || (ch[0] >= '0' && ch[0] <= '9'); } else if (multibyte_len == 2) { return (strncmp(ch, "å", 2) == 0) || (strncmp(ch, "ä", 2) == 0) || (strncmp(ch, "ö", 2) == 0); } return false; } bool is_clean_word(const std::string &s) { const char *str = s.c_str(); size_t len = s.size(); for (size_t i = 0; i < len; ) { size_t multibyte_len = 1; for (size_t j = i + 1; IS_MULTIBYTE_CODEPOINT(str[j]) && (j < len); j++, multibyte_len++) { } if (!is_clean_char(&str[i], multibyte_len)) { return false; } i += multibyte_len; } return true; } std::string clean_word(const std::string &s) { std::string result; const char *str = s.c_str(); size_t len = s.size(); for (size_t i = 0; i < len; ) { size_t multibyte_len = 1; for (size_t j = i + 1; IS_MULTIBYTE_CODEPOINT(str[j]) && (j < len); j++, multibyte_len++) { } if (is_clean_char(&str[i], multibyte_len)) { result.append(&str[i], multibyte_len); } i += multibyte_len; } return result; } /* Returns a vector of words lower case, punctuation trimmed and less or equal than CC_MAX_WORD_LEN length. */ std::vector get_words(const std::string &str, size_t limit) { const std::string word_boundary = " \t,|!"; std::string str_lc = lower_case(str); std::vector raw_words, words; boost::split(raw_words, str_lc, boost::is_any_of(word_boundary)); for (std::string &word : raw_words) { trim_both_inplace(word); if (is_clean_word(word) && word.size() <= CC_MAX_WORD_LEN && word.size() > 0) { words.push_back(word); } if (limit && words.size() == limit) break; } return words; } std::vector get_words(const std::string &str) { return get_words(str, 0); } /* Returns a vector of words lower case, punctuation trimmed and less or equal than CC_MAX_WORD_LEN length. 
*/ std::vector get_full_text_words(const std::string &str, size_t limit) { const std::string word_boundary = " \t,|!"; std::string str_lc = lower_case(str); std::vector raw_words, words; boost::split(raw_words, str_lc, boost::is_any_of(word_boundary)); for (std::string &word : raw_words) { if (parser::unicode::is_valid(word)) { trim_both_inplace(word); if (word.size() <= CC_MAX_WORD_LEN && word.size() > 0) { words.push_back(word); } if (limit && words.size() == limit) break; } } return words; } std::vector get_full_text_words(const std::string &str) { return get_full_text_words(str, 0); } std::vector get_full_text_tokens(const std::string &str, size_t limit) { const auto words = get_full_text_words(str, limit); std::vector ret(words.size()); std::transform(words.cbegin(), words.cend(), ret.begin(), [](const std::string &word) { return algorithm::hash(word); }); return ret; } std::vector get_full_text_tokens(const std::string &str) { return get_full_text_tokens(str, 0); } std::vector get_unique_full_text_tokens(const std::string &str, size_t limit) { auto vec = get_full_text_tokens(str, 0); std::set s; const unsigned size = vec.size(); for (unsigned i = 0; i < size; ++i) s.insert(vec[i]); vec.assign(s.begin(), s.end()); return vec; } std::vector get_unique_full_text_tokens(const std::string &str) { return get_unique_full_text_tokens(str, 0); } /* This should be the fast way of getting tokens out of a string. It should just read the whole string and store tokens using the str2token hash function. */ std::vector get_tokens(const std::string &str, std::function str2token) { const char *word_boundary = " \t,|!"; std::string cur_token; std::vector tokens; for (const char &ch : str) { // If is word boundary. if (strchr(word_boundary, ch)) { if (cur_token.size() && parser::unicode::is_valid(cur_token)) { trim_punct_inplace(cur_token); tokens.push_back(str2token(cur_token)); } cur_token.clear(); } else { // This if statement trims the token. 
if (!isspace(ch)) { cur_token.insert(cur_token.end(), tolower(ch)); } } } // Remember the last token. if (cur_token.size() && parser::unicode::is_valid(cur_token)) { trim_punct_inplace(cur_token); tokens.push_back(str2token(cur_token)); } return tokens; } std::vector get_tokens(const std::string &str) { return get_tokens(str, algorithm::hash); } std::vector get_snippets(const std::string &str) { const size_t snippet_len = 300; const char *word_boundary = " \t,|!"; std::string cur_snippet; std::string cur_token; std::vector snippets; for (const char &ch : str) { // If is word boundary. if (strchr(word_boundary, ch)) { if (cur_token.size() && parser::unicode::is_valid(cur_token)) { if (cur_snippet.size() + cur_token.size() <= snippet_len) { cur_snippet.insert(cur_snippet.end(), cur_token.begin(), cur_token.end()); cur_snippet.insert(cur_snippet.end(), ' '); } else { trim_inplace(cur_snippet); snippets.push_back(cur_snippet); cur_snippet.clear(); cur_snippet.insert(cur_snippet.end(), cur_token.begin(), cur_token.end()); cur_snippet.insert(cur_snippet.end(), ' '); } } cur_token.clear(); } else { // This if statement trims the token. cur_token.insert(cur_token.end(), ch); } } if (cur_token.size() && parser::unicode::is_valid(cur_token)) { cur_snippet.insert(cur_snippet.end(), cur_token.begin(), cur_token.end()); } trim_inplace(cur_snippet); snippets.push_back(cur_snippet); return snippets; } /* Returns a vector of words lower case, punctuation trimmed and less or equal than CC_MAX_WORD_LEN length. These functions also expand on blend chars. 
*/
	std::vector get_expanded_full_text_words(const std::string &str, size_t limit) {
		const std::string word_boundary = " \t,|!";
		const std::string blend_chars = ".-:";

		std::string str_lc = lower_case(str);

		std::vector raw_words, words, blended;
		boost::split(raw_words, str_lc, boost::is_any_of(word_boundary));

		for (std::string &word : raw_words) {
			if (!parser::unicode::is_valid(word)) continue;

			trim_both_inplace(word);
			if (word.size() == 0 || word.size() > CC_MAX_WORD_LEN) continue;

			// The whole word always goes in first.
			words.push_back(word);
			if (limit && words.size() == limit) break;

			// Then every part of the word, if it contains any of the blend chars.
			boost::split(blended, word, boost::is_any_of(blend_chars));
			if (blended.size() > 1) {
				for (std::string &blended_word : blended) {
					trim_both_inplace(blended_word);
					words.push_back(blended_word);
					if (limit && words.size() == limit) break;
				}
			}

			// The break above only leaves the blend loop. Without this check the outer loop kept adding
			// words after the limit had been reached and the result could grow without bound.
			if (limit && words.size() >= limit) break;
		}

		return words;
	}

	/* Same as get_expanded_full_text_words(str, limit) without any limit. */
	std::vector get_expanded_full_text_words(const std::string &str) {
		return get_expanded_full_text_words(str, 0);
	}

	/*
		Exactly the same algorithm as above but returns tokens, that is algorithm::hash of every word.
	*/
	std::vector get_expanded_full_text_tokens(const std::string &str, size_t limit) {
		const auto words = get_expanded_full_text_words(str, limit);
		std::vector ret(words.size());
		std::transform(words.cbegin(), words.cend(), ret.begin(), [](const std::string &word) {
			return algorithm::hash(word);
		});
		return ret;
	}

	/* Same as get_expanded_full_text_tokens(str, limit) without any limit. */
	std::vector get_expanded_full_text_tokens(const std::string &str) {
		return get_expanded_full_text_tokens(str, 0);
	}

	/*
		Returns the tokens of get_expanded_full_text_tokens with duplicates removed. The tokens pass through a
		std::set so they come back in the iteration order of that set.

		NOTE(review): limit is not used, all tokens of str are always considered. Kept as is since callers may
		depend on it, confirm whether the limit was meant to apply here.
	*/
	std::vector get_unique_expanded_full_text_tokens(const std::string &str, size_t limit) {
		auto vec = get_expanded_full_text_tokens(str, 0);
		std::set s;
		s.insert(vec.begin(), vec.end());
		vec.assign(s.begin(), s.end());
		return vec;
	}

	/* Same as get_unique_expanded_full_text_tokens(str, limit) with limit 0. */
	std::vector get_unique_expanded_full_text_tokens(const std::string &str) {
		return get_unique_expanded_full_text_tokens(str, 0);
	}

	/*
		Returns a vector of words lower case, punctuation trimmed and less or equal than CC_MAX_WORD_LEN length.
*/ std::vector get_words_without_stopwords(const std::string &str, size_t limit) { const std::string word_boundary = " \t,|!,"; std::string str_lc = lower_case(str); std::vector raw_words, words; boost::split(raw_words, str_lc, boost::is_any_of(word_boundary)); for (std::string &word : raw_words) { trim_both_inplace(word); if (is_clean_word(word) && !stopwords::is_stop_word(word) && word.size() <= CC_MAX_WORD_LEN && word.size() > 0) { words.push_back(word); } if (limit && words.size() == limit) break; } return words; } std::vector get_words_without_stopwords(const std::string &str) { return get_words_without_stopwords(str, 0); } void words_to_ngram_hash(const std::vector &words, size_t n_grams, const std::function &ins) { const size_t word_iter_max = words.size(); for (size_t i = 0; i < word_iter_max; i++) { for (size_t j = 0; j < n_grams && (j + i) < word_iter_max; j++) { std::string n_gram = words[i]; for (size_t k = i + 1; k <= i + j; k++) { n_gram += " " + words[k]; } ins(algorithm::hash(n_gram)); } } } void words_to_ngram_hash(const std::vector &words, size_t n_grams, const std::function &ins) { const size_t word_iter_max = words.size(); for (size_t i = 0; i < word_iter_max; i++) { for (size_t j = 0; j < n_grams && (j + i) < word_iter_max; j++) { std::string n_gram = words[i]; for (size_t k = i + 1; k <= i + j; k++) { n_gram += " " + words[k]; } ins(algorithm::hash(n_gram), n_gram); } } } void words_to_ngram_hash(const std::vector &words, size_t n_grams, const std::function &ins) { const size_t word_iter_max = words.size(); for (size_t i = 0; i < word_iter_max; i++) { for (size_t j = 0; j < n_grams && (j + i) < word_iter_max; j++) { std::string n_gram = words[i]; for (size_t k = i + 1; k <= i + j; k++) { n_gram += " " + words[k]; } ins(algorithm::hash(n_gram), n_gram, j + 1); } } } std::map get_word_counts(const std::string &text) { std::vector words = get_full_text_words(text); std::map counts; for (const std::string &word : words) { counts[word]++; } return 
counts; } std::map get_word_frequency(const std::string &text) { std::vector words = get_full_text_words(text); std::map counts; size_t total = 0; for (const std::string &word : words) { counts[word]++; total++; } std::map freq; for (const auto &iter : counts) { freq[iter.first] = (float)iter.second / total; } return freq; } } ================================================ FILE: src/text/text.h ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #pragma once #define CC_MAX_WORD_LEN 100 #include #include #include #include #include #include #include "stopwords.h" #include "parser/unicode.h" #include "algorithm/hash.h" namespace text { /* * excludes + from punctuation trim since we want to be able to search for c++ */ inline bool my_ispunct(int ch) { if (ch == '+') return false; if (ch == '#') return false; return ispunct(ch); } /* * trim whitespace from beginning (in place) * */ inline void ltrim_inplace(std::string &s) { s.erase(s.begin(), find_if(s.begin(), s.end(), [](int ch) { return !isspace(ch); })); } /* * trim whitespace from end (in place) * */ inline void rtrim_inplace(std::string &s) { s.erase(find_if(s.rbegin(), s.rend(), [](int ch) { return !isspace(ch); }).base(), s.end()); } /* * trim whitespace from both beginning and end (in place) * */ inline void trim_inplace(std::string &s) { ltrim_inplace(s); rtrim_inplace(s); } /* * trim whitespace from both beginning and end (return result) * */ inline std::string trim(const std::string &s) { std::string copy = s; ltrim_inplace(copy); rtrim_inplace(copy); return copy; } /* * trim punctuation from beginning (in place) * */ inline void ltrim_punct_inplace(std::string &s) { s.erase(s.begin(), find_if(s.begin(), s.end(), [](int ch) { return !my_ispunct(ch); })); } /* * trim punctuation from end (in place) * */ inline void rtrim_punct_inplace(std::string &s) { s.erase(find_if(s.rbegin(), s.rend(), [](int ch) { return !my_ispunct(ch); }).base(), s.end()); } /* * trim punctuation from both beginning and end (in place) * */ inline void trim_punct_inplace(std::string &s) { ltrim_punct_inplace(s); rtrim_punct_inplace(s); } /* * trim punctuation from both beginning and end (return result) * */ inline std::string trim_punct(const std::string &s) { std::string copy = s; ltrim_punct_inplace(copy); rtrim_punct_inplace(copy); return copy; } /* * trim both whitespace and punctuation from beginning (in place) * */ inline void ltrim_both_inplace(std::string &s) { 
s.erase(s.begin(), find_if(s.begin(), s.end(), [](int ch) { return !isspace(ch) && !my_ispunct(ch); })); } /* * trim both whitespace and punctuation from end (in place) * */ inline void rtrim_both_inplace(std::string &s) { s.erase(find_if(s.rbegin(), s.rend(), [](int ch) { return !isspace(ch) && !my_ispunct(ch); }).base(), s.end()); } /* * trim both whitespace and punctuation from both beginning and end (in place) * */ inline void trim_both_inplace(std::string &s) { ltrim_both_inplace(s); rtrim_both_inplace(s); } /* * trim both whitespace and punctuation from both beginning and end (return result) * */ inline std::string trim_both(const std::string &s) { std::string copy = s; ltrim_both_inplace(copy); rtrim_both_inplace(copy); return copy; } inline std::string lower_case(const std::string &str) { std::string ret = str; transform(ret.begin(), ret.end(), ret.begin(), [](unsigned char c){ return tolower(c); }); return ret; } bool is_clean_char(const char *ch, size_t multibyte_len); bool is_clean_word(const std::string &s); std::string clean_word(const std::string &s); /* Returns a vector of words lower case, punctuation trimmed and less or equal than CC_MAX_WORD_LEN length. */ std::vector get_words(const std::string &str, size_t limit); std::vector get_words(const std::string &str); /* Returns a vector of words lower case, punctuation trimmed and less or equal than CC_MAX_WORD_LEN length. 
*/
	std::vector get_full_text_words(const std::string &str, size_t limit);
	/* Same as above with limit 0, which means no limit. */
	std::vector get_full_text_words(const std::string &str);

	/* Returns algorithm::hash of every word returned by get_full_text_words, in the same order. */
	std::vector get_full_text_tokens(const std::string &str, size_t limit);
	std::vector get_full_text_tokens(const std::string &str);

	/*
		Returns the full text tokens of str with duplicates removed.
		NOTE(review): the implementation does not use limit, confirm whether that is intended.
	*/
	std::vector get_unique_full_text_tokens(const std::string &str, size_t limit);
	std::vector get_unique_full_text_tokens(const std::string &str);

	/*
		Reads str once and returns str2token(token) for every token found. The overload without str2token
		uses algorithm::hash.
	*/
	std::vector get_tokens(const std::string &str, std::function str2token);
	std::vector get_tokens(const std::string &str);

	/* Splits str into whitespace trimmed snippets made of whole tokens joined with a single space. */
	std::vector get_snippets(const std::string &str);

	/*
		Returns a vector of words lower case, punctuation trimmed and less or equal than CC_MAX_WORD_LEN length.
		These functions also expand on blend chars.
	*/
	std::vector get_expanded_full_text_words(const std::string &str, size_t limit);
	std::vector get_expanded_full_text_words(const std::string &str);

	/* Returns algorithm::hash of every word returned by get_expanded_full_text_words, in the same order. */
	std::vector get_expanded_full_text_tokens(const std::string &str, size_t limit);
	std::vector get_expanded_full_text_tokens(const std::string &str);

	/*
		Returns the expanded full text tokens of str with duplicates removed.
		NOTE(review): the implementation does not use limit, confirm whether that is intended.
	*/
	std::vector get_unique_expanded_full_text_tokens(const std::string &str, size_t limit);
	std::vector get_unique_expanded_full_text_tokens(const std::string &str);

	/*
		Returns a vector of words lower case, punctuation trimmed and less or equal than CC_MAX_WORD_LEN length.
*/ std::vector get_words_without_stopwords(const std::string &str, size_t limit); std::vector get_words_without_stopwords(const std::string &str); void words_to_ngram_hash(const std::vector &words, size_t n_grams, const std::function &ins); void words_to_ngram_hash(const std::vector &words, size_t n_grams, const std::function &ins); void words_to_ngram_hash(const std::vector &words, size_t n_grams, const std::function &ins); std::map get_word_counts(const std::string &text); std::map get_word_frequency(const std::string &text); } ================================================ FILE: src/tools/calculate_harmonic.cpp ================================================ #include "calculate_harmonic.h" #include "splitter.h" #include "config.h" #include "url_link/link.h" #include "URL.h" #include "common/ThreadPool.h" #include "algorithm/algorithm.h" #include "algorithm/hyper_ball.h" #include #include #include #include #include #include #include #include #include #include namespace tools { std::unordered_map run_uniq_host(const std::vector files) { std::unordered_map hosts; for (const std::string &warc_path : files) { std::ifstream infile(warc_path); boost::iostreams::filtering_istream decompress_stream; decompress_stream.push(boost::iostreams::gzip_decompressor()); decompress_stream.push(infile); std::string line; while (getline(decompress_stream, line)) { const URL url(line.substr(0, line.find("\t"))); uint64_t host_hash = url.host_hash(); if (hosts.count(host_hash) == 0) { hosts[host_hash] = url.host(); } } } return hosts; } struct pair_hash { inline size_t operator() (const std::pair &p) const { return (uint64_t)p.first << 32 | (uint64_t)p.second; } }; std::unordered_set, pair_hash> run_uniq_link(const std::vector files, const std::unordered_map &hosts) { std::unordered_set, pair_hash> edges; for (const std::string &warc_path : files) { std::ifstream infile(warc_path); boost::iostreams::filtering_istream decompress_stream; 
decompress_stream.push(boost::iostreams::gzip_decompressor()); decompress_stream.push(infile); std::string line; while (getline(decompress_stream, line)) { const url_link::link link(line); const uint64_t source_hash = link.source_url().host_hash(); const uint64_t target_hash = link.target_url().host_hash(); const size_t source_count = hosts.count(source_hash); const size_t target_count = hosts.count(target_hash); if (source_count && target_count) { // Link between two hosts in the host map. edges.insert(std::make_pair(hosts.at(source_hash), hosts.at(target_hash))); } } } return edges; } void calculate_harmonic_hosts() { auto files = generate_list_with_target_url_files(); std::vector> chunks; algorithm::vector_chunk(files, files.size() / s_num_threads, chunks); ThreadPool pool(s_num_threads); std::vector>> results; for (const std::vector &chunk : chunks) { results.emplace_back(pool.enqueue([chunk] { return run_uniq_host(chunk); })); } std::unordered_map hosts; size_t idx = 0; std::cout.precision(2); for (auto &result : results) { const std::unordered_map result_map = result.get(); for (const auto &iter : result_map) { hosts[iter.first] = iter.second; } const double percent = (100.0*(double)idx/results.size()); std::cout << "hosts contains " << hosts.size() << " elements " << percent << "% done" << std::endl; idx++; } idx = 0; std::ofstream outfile(config::data_path() + "/hosts.txt", std::ios::trunc); for (const auto &iter : hosts) { outfile << idx << '\t' << iter.first << '\t' << iter.second << '\n'; idx++; } outfile.close(); } std::unordered_map read_hosts_file() { // Load the hosts std::ifstream infile(config::data_path() + "/hosts.txt"); std::unordered_map ret; std::string line; while (getline(infile, line)) { std::vector parts; boost::algorithm::split(parts, line, boost::is_any_of("\t")); uint32_t id = std::stoi(parts[0]); uint64_t hash = std::stoull(parts[1]); ret[hash] = id; } return ret; } std::vector read_hosts_file_vec() { // Load the hosts std::ifstream 
infile(config::data_path() + "/hosts.txt"); std::vector ret; std::string line; while (getline(infile, line)) { std::vector parts; boost::algorithm::split(parts, line, boost::is_any_of("\t")); uint32_t id = std::stoi(parts[0]); ret.push_back(id); } return ret; } std::map read_hosts_file_with_names() { // Load the hosts std::ifstream infile(config::data_path() + "/hosts.txt"); std::map ret; std::string line; while (getline(infile, line)) { std::vector parts; boost::algorithm::split(parts, line, boost::is_any_of("\t")); uint32_t id = std::stoi(parts[0]); ret[id] = parts[2]; } return ret; } std::unique_ptr[]> read_edge_file(size_t vlen) { // Load the hosts std::ifstream infile(config::data_path() + "/edges.txt"); auto edge_map = std::make_unique[]>(vlen); std::string line; while (getline(infile, line)) { std::vector parts; boost::algorithm::split(parts, line, boost::is_any_of("\t")); uint32_t from = std::stoi(parts[0]); // I think we are counting from 0 now but from 1 when we created the edge file. 
uint32_t to = std::stoi(parts[1]); edge_map[to].push_back(from); } return edge_map; } void calculate_harmonic_links() { std::unordered_map hosts = read_hosts_file(); std::cout << "loaded " << hosts.size() << " hosts" << std::endl; auto files = generate_list_with_target_link_files(); std::vector> chunks; algorithm::vector_chunk(files, files.size() / (s_num_threads * 500), chunks); ThreadPool pool(s_num_threads); std::vector, pair_hash>>> results; for (const std::vector &chunk : chunks) { results.emplace_back(pool.enqueue([chunk, &hosts] { return run_uniq_link(chunk, hosts); })); } std::unordered_set, pair_hash> edges; size_t idx = 0; std::cout.precision(2); for (auto &result : results) { const std::unordered_set, pair_hash> result_set = result.get(); size_t idasd = 0; for (const std::pair &edge : result_set) { edges.insert(edge); idasd++; } const double percent = (100.0*(double)idx/results.size()); std::cout << "edges contains " << edges.size() << " elements " << percent << "% done" << std::endl; idx++; } std::ofstream outfile(config::data_path() + "/edges.txt", std::ios::trunc); for (const std::pair& edge : edges) { outfile << edge.first << '\t' << edge.second << '\n'; } outfile.close(); } void calculate_harmonic() { std::vector hosts = read_hosts_file_vec(); auto edge_map = read_edge_file(hosts.size()); const size_t num_hosts = hosts.size(); std::cout << "loaded " << hosts.size() << " hosts" << std::endl; std::cout << "running harmonic centrality algorithm on " << s_num_threads << " threads" << std::endl; //vector harmonic = algorithm::harmonic_centrality_threaded(hosts.size(), edge_map, 3, num_threads); std::vector harmonic = algorithm::hyper_ball(hosts.size(), edge_map); edge_map.reset(nullptr); std::map host_names = read_hosts_file_with_names(); // Save harmonic centrality. 
std::ofstream outfile(config::data_path() + "/harmonic.txt", std::ios::trunc); for (size_t i = 0; i < hosts.size(); i++) { const double harmonic_float = harmonic[i] / num_hosts; outfile << std::setprecision(15) << host_names.at(hosts[i]) << '\t' << harmonic_float << '\n'; } } } ================================================ FILE: src/tools/calculate_harmonic.h ================================================ #pragma once namespace tools { void calculate_harmonic_hosts(); void calculate_harmonic_links(); void calculate_harmonic(); } ================================================ FILE: src/tools/counter.cpp ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #include "counter.h" #include #include #include #include #include #include #include "config.h" #include "URL.h" #include "url_link/link.h" #include "transfer/transfer.h" #include "algorithm/hyper_log_log.h" #include "algorithm/algorithm.h" #include "file/tsv_file_remote.h" #include "common/system.h" namespace tools { std::map count_urls_per_domain(const std::vector &warc_paths) { const std::set domains = { "theinstantpottable.com", "thehighlineboutique.com", "harveyspet.com", "finertech.com", "canadiantiresucks.net", "thecounter.org", "learningworksforkids.com", "doodlecraftblog.com", "heroes.thelazy.net", "stedmansonline.com", "restaurantbusinessonline.com", "gotohomerepair.com", "aboutbail.com", "spacefuture.com", "personaltelco.net", "helis.com" }; std::vector saved_rows; std::map counts; size_t idx = 0; for (const std::string &warc_path : warc_paths) { std::ifstream infile(warc_path); boost::iostreams::filtering_istream decompress_stream; decompress_stream.push(boost::iostreams::gzip_decompressor()); decompress_stream.push(infile); std::string line; while (getline(decompress_stream, line)) { const URL url(line.substr(0, line.find("\t"))); if (domains.find(url.host()) != domains.end()) { saved_rows.push_back(line); } counts[url.host()]++; } if (idx % 100 == 0) { std::cout << warc_path << " done " << idx << "/" << warc_paths.size() << std::endl; } idx++; } // Save rows. 
if (saved_rows.size() > 0) { boost::filesystem::create_directories(config::data_path() + "/crawl-data/ALEXANDRIA-TEST-SIZES/files/"); std::ofstream outfile(config::data_path() + "/crawl-data/ALEXANDRIA-TEST-SIZES/files/" + common::uuid() + ".gz"); boost::iostreams::filtering_ostream compress_stream; compress_stream.push(boost::iostreams::gzip_compressor()); compress_stream.push(outfile); for (const std::string& row : saved_rows) { compress_stream << row << "\n"; } } return counts; } void run_counter_per_domain(const std::string &batch) { const size_t num_threads = 12; std::vector files; std::vector link_files; const std::string file_name = config::data_path() + "/crawl-data/" + batch + "/warc.paths.gz"; std::ifstream infile(file_name); boost::iostreams::filtering_istream decompress_stream; decompress_stream.push(boost::iostreams::gzip_decompressor()); decompress_stream.push(infile); std::string line; while (getline(decompress_stream, line)) { std::string warc_path = config::data_path() + "/" + line; const size_t pos = warc_path.find(".warc.gz"); if (pos != std::string::npos) { warc_path.replace(pos, 8, ".gz"); } files.push_back(warc_path); } std::vector> thread_input; algorithm::vector_chunk(files, ceil((double)files.size() / num_threads), thread_input); /* Run url counters */ std::vector>> futures; for (size_t i = 0; i < num_threads && i < thread_input.size(); i++) { futures.emplace_back(std::async(std::launch::async, count_urls_per_domain, thread_input[i])); } std::map all_counts; for (auto &future : futures) { std::map result = future.get(); for (const auto &iter : result) { all_counts[iter.first] += iter.second; } } futures.clear(); for (const auto &iter : all_counts) { std::cout << iter.first << "\t" << iter.second << std::endl; } } algorithm::hyper_log_log *count_urls(const std::vector &warc_paths) { algorithm::hyper_log_log *counter = new algorithm::hyper_log_log(); size_t idx = 0; for (const std::string &warc_path : warc_paths) { std::ifstream 
infile(warc_path); boost::iostreams::filtering_istream decompress_stream; decompress_stream.push(boost::iostreams::gzip_decompressor()); decompress_stream.push(infile); std::string line; while (getline(decompress_stream, line)) { const URL url(line.substr(0, line.find("\t"))); counter->insert(url.hash()); } if (idx % 100 == 0) { std::cout << warc_path << " done " << idx << "/" << warc_paths.size() << std::endl; } idx++; } return counter; } algorithm::hyper_log_log *count_links(const std::vector &warc_paths) { algorithm::hyper_log_log *counter = new algorithm::hyper_log_log(); size_t idx = 0; for (const std::string &warc_path : warc_paths) { std::ifstream infile(warc_path); boost::iostreams::filtering_istream decompress_stream; decompress_stream.push(boost::iostreams::gzip_decompressor()); decompress_stream.push(infile); std::string line; while (getline(decompress_stream, line)) { const url_link::link link(line); counter->insert(link.target_url().hash()); } if (idx % 100 == 0) { std::cout << warc_path << " done " << idx << "/" << warc_paths.size() << std::endl; } idx++; } return counter; } void run_counter() { const size_t num_threads = 12; std::vector files; std::vector link_files; for (const std::string &batch : config::batches) { const std::string file_name = config::data_path() + "/crawl-data/" + batch + "/warc.paths.gz"; std::ifstream infile(file_name); boost::iostreams::filtering_istream decompress_stream; decompress_stream.push(boost::iostreams::gzip_decompressor()); decompress_stream.push(infile); std::string line; while (getline(decompress_stream, line)) { std::string warc_path = config::data_path() + "/" + line; const size_t pos = warc_path.find(".warc.gz"); if (pos != std::string::npos) { warc_path.replace(pos, 8, ".gz"); } files.push_back(warc_path); } } for (const std::string &batch : config::link_batches) { const std::string file_name = config::data_path() + "/crawl-data/" + batch + "/warc.paths.gz"; std::ifstream infile(file_name); 
boost::iostreams::filtering_istream decompress_stream; decompress_stream.push(boost::iostreams::gzip_decompressor()); decompress_stream.push(infile); std::string line; while (getline(decompress_stream, line)) { std::string warc_path = config::data_path() + "/" + line; const size_t pos = warc_path.find(".warc.gz"); if (pos != std::string::npos) { warc_path.replace(pos, 8, ".links.gz"); } link_files.push_back(warc_path); } } std::vector> thread_input; algorithm::vector_chunk(files, ceil((double)files.size() / num_threads), thread_input); std::vector> link_thread_input; algorithm::vector_chunk(link_files, ceil((double)link_files.size() / num_threads), link_thread_input); std::mutex write_file_mutex; /* Run url counters */ std::vector> futures; for (size_t i = 0; i < num_threads && i < thread_input.size(); i++) { futures.emplace_back(std::async(std::launch::async, count_urls, thread_input[i])); } algorithm::hyper_log_log url_counter; for (auto &future : futures) { algorithm::hyper_log_log *result = future.get(); url_counter += *(result); delete result; } futures.clear(); /* Run link counters */ for (size_t i = 0; i < num_threads && i < link_thread_input.size(); i++) { futures.emplace_back(std::async(std::launch::async, count_links, link_thread_input[i])); } algorithm::hyper_log_log link_counter; for (auto &future : futures) { algorithm::hyper_log_log *result = future.get(); link_counter += *(result); delete result; } std::cout << "Uniq urls: " << url_counter.count() << std::endl; std::cout << "Uniq links: " << link_counter.count() << std::endl; } std::vector download_link_batch(const std::string &batch, size_t limit, size_t offset) { file::tsv_file_remote warc_paths_file(std::string("crawl-data/") + batch + "/warc.paths.gz"); std::vector warc_paths; warc_paths_file.read_column_into(0, warc_paths); std::vector files_to_download; for (size_t i = offset; i < warc_paths.size() && i < (offset + limit); i++) { std::string warc_path = warc_paths[i]; const size_t pos = 
warc_path.find(".warc.gz"); if (pos != std::string::npos) { warc_path.replace(pos, 8, ".links.gz"); } files_to_download.push_back(warc_path); } return transfer::download_gz_files_to_disk(files_to_download); } } ================================================ FILE: src/tools/counter.h ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #pragma once #include namespace tools { void run_counter_per_domain(const std::string &batch); void run_counter(); void count_all_links(); } ================================================ FILE: src/tools/find_links.cpp ================================================ #include "find_links.h" #include "file/gz_tsv_file.h" #include "URL.h" #include "algorithm/algorithm.h" #include #include #include #include #include #include #include #include #include "utils/thread_pool.hpp" #include "algorithm/hash.h" #include "common/system.h" #include "config.h" namespace tools { void find_links_for_hosts_chunk(const std::set &host_hashes, const std::vector &files) { size_t links_written = 0; const size_t links_per_file = 1000000; std::ofstream outfile; outfile.open(config::data_path() + "/crawl-data/SMALL-LINK-MIX/files/" + common::uuid() + "_" + std::to_string(links_written) + "-" + std::to_string(links_written + links_per_file) + ".gz", std::ios::binary); boost::iostreams::filtering_ostream compress_stream; compress_stream.push(boost::iostreams::gzip_compressor()); compress_stream.push(outfile); for (auto file : files) { std::ifstream infile(config::data_path() + "/" + file); boost::iostreams::filtering_istream decompress_stream; decompress_stream.push(boost::iostreams::gzip_decompressor()); decompress_stream.push(infile); std::string line; while (getline(decompress_stream, line)) { std::vector col_values; boost::algorithm::split(col_values, line, boost::is_any_of("\t")); const size_t host_hash = algorithm::hash(col_values[2]); if (host_hashes.find(host_hash) != host_hashes.end()) { // Write link to current file. 
compress_stream << line << "\n"; links_written++; if ((links_written % links_per_file) == 0) { std::cout << "writing file" << std::endl; compress_stream.strict_sync(); compress_stream.pop(); outfile.close(); outfile.open(config::data_path() + "/crawl-data/SMALL-LINK-MIX/files/" + common::uuid() + "_" + std::to_string(links_written) + "-" + std::to_string(links_written + links_per_file) + ".gz", std::ios::binary); compress_stream.push(outfile); } } } } } void find_links_for_hosts(const std::set &host_hashes) { const std::string batch = "LINK-MIX"; const size_t num_threads = 12; size_t limit = 4000; file::gz_tsv_file batch_file(config::data_path() + "/crawl-data/" + batch + "/warc.paths.gz"); std::vector rows; batch_file.read_column_into(0, rows); if (rows.size() > limit) rows.resize(limit); std::vector> chunks; algorithm::vector_chunk(rows, ceil(rows.size() / num_threads) + 1, chunks); utils::thread_pool threads(num_threads); for (auto chunk : chunks) { threads.enqueue([&host_hashes, chunk]() { find_links_for_hosts_chunk(host_hashes, chunk); }); } threads.run_all(); } void find_links() { const auto batch = "SMALL-MIX"; size_t limit = 20; file::gz_tsv_file batch_file(config::data_path() + "/crawl-data/"+batch+"/warc.paths.gz"); std::vector rows; batch_file.read_column_into(0, rows); if (rows.size() > limit) rows.resize(limit); // Load all the host hashes into a set std::set host_hashes; for (auto row : rows) { std::ifstream infile(config::data_path() + "/" + row); boost::iostreams::filtering_istream decompress_stream; decompress_stream.push(boost::iostreams::gzip_decompressor()); decompress_stream.push(infile); std::string line; while (getline(decompress_stream, line)) { std::vector col_values; boost::algorithm::split(col_values, line, boost::is_any_of("\t")); URL url(col_values[0]); host_hashes.insert(url.host_hash()); } } std::cout << "found " << host_hashes.size() << " hosts" << std::endl; find_links_for_hosts(host_hashes); } } 
================================================
FILE: src/tools/find_links.h
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, , et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

/* NOTE(review): the target of the include below is missing in this copy of the file. */
#include

namespace tools {

	/*
	 * Entry point of the link finder tool (defined in find_links.cpp). Takes no arguments and returns
	 * nothing; the batches it reads and the directory it writes to are hard coded in the implementation.
	 * */
	void find_links();

}

================================================
FILE: src/tools/generate_url_lists.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, , et al.
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include #include #include #include "generate_url_lists.h" #include #include #include #include using namespace std; using namespace boost::filesystem; namespace tools { vector read_urls_with_many_links(const std::string &file_path) { std::ifstream infile(file_path); if (!infile.is_open()) return {}; vector ret_urls; boost::iostreams::filtering_istream decompress_stream; decompress_stream.push(boost::iostreams::gzip_decompressor()); decompress_stream.push(infile); string line; while (getline(decompress_stream, line)) { vector cols; boost::algorithm::split(cols, line, boost::is_any_of("\t")); if (stoull(cols[1]) > 1) { ret_urls.push_back(cols[0]); } } return ret_urls; } vector read_urls(const std::string &path) { // Only read the first 10 files. 
vector urls; for (size_t i = 1; i <= 10; i++) { string file_path = path + "/top_" + to_string(i) + ".gz"; if (is_regular_file(file_path)) { vector new_urls = read_urls_with_many_links(file_path); if (new_urls.size() == 0) break; urls.insert(urls.end(), new_urls.begin(), new_urls.end()); } } return urls; } void generate_url_lists(const std::string &batch_path) { path pth(batch_path); directory_iterator end_iter; vector urls; for (directory_iterator iter(pth); iter != end_iter; iter++) { if (is_directory(iter->path())) { string current_file = iter->path().string(); vector new_urls = read_urls(current_file); urls.insert(urls.end(), new_urls.begin(), new_urls.end()); } } for (const string &url : urls) { cout << url << endl; } } } ================================================ FILE: src/tools/generate_url_lists.h ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include "config.h" namespace tools { void generate_url_lists(const std::string &batch_path); } ================================================ FILE: src/tools/splitter.cpp ================================================ #include "splitter.h" #include "config.h" #include "roaring/roaring64map.hh" #include "algorithm/bloom_filter.h" #include #include #include #include #include #include #include #include #include #include #include "url_link/link.h" #include "algorithm/algorithm.h" #include "URL.h" #include "common/system.h" namespace tools { std::vector target_url_batches() { std::vector batches; for (size_t node_id = 0; node_id < config::nodes_in_cluster; node_id++) { batches.push_back("NODE-" + std::to_string(node_id) + s_suffix); } return batches; } std::vector target_link_batches() { std::vector batches; for (size_t node_id = 0; node_id < config::nodes_in_cluster; node_id++) { batches.push_back("LINK-" + std::to_string(node_id) + s_suffix); } return batches; } std::vector generate_list_with_files(const std::vector &batches, const std::string &suffix = ".gz", const std::string &warc_paths_suffix = ".gz") { std::vector file_names; for (const auto &batch : batches) { const std::string file_name = config::data_path() + "/crawl-data/" + batch + "/warc.paths" + warc_paths_suffix; std::ifstream infile(file_name); if (warc_paths_suffix == ".gz") { boost::iostreams::filtering_istream decompress_stream; decompress_stream.push(boost::iostreams::gzip_decompressor()); decompress_stream.push(infile); std::string line; while (getline(decompress_stream, line)) { std::string warc_path = config::data_path() + "/" + line; const size_t pos = warc_path.find(".warc.gz"); if (pos != std::string::npos) { 
warc_path.replace(pos, 8, suffix); } file_names.push_back(warc_path); } } else { std::string line; while (getline(infile, line)) { std::string warc_path = config::data_path() + "/" + line; const size_t pos = warc_path.find(".warc.gz"); if (pos != std::string::npos) { warc_path.replace(pos, 8, suffix); } file_names.push_back(warc_path); } } } return file_names; } std::vector generate_list_with_url_files() { // create a list with .gz files that contains urls return generate_list_with_files(config::batches, ".gz"); } std::vector generate_list_with_link_files() { // create a list with .gz files that contains links return generate_list_with_files(config::link_batches, ".links.gz"); } std::vector generate_list_with_direct_link_files() { // create a list with .gz files that contains links return generate_list_with_files(config::link_batches, ".direct.links.gz"); } std::vector generate_list_with_target_url_files() { // create a list with .gz files that contains urls return generate_list_with_files(target_url_batches(), "", ""); } std::vector generate_list_with_target_link_files() { // create a list with .gz files that contains links return generate_list_with_files(target_link_batches(), "", ""); } // File structure is [data_path]/crawl-data/NODE-[node_id]/files/uuid-file_index.gz std::string write_cache(size_t file_index, std::vector &lines, size_t node_id) { auto uuid = common::uuid(); const std::string filename = "crawl-data/NODE-" + std::to_string(node_id) + s_suffix + "/files/" + uuid + "-" + std::to_string(file_index) + ".gz"; std::ofstream outfile(config::data_path() + "/" + filename, std::ios::trunc | std::ios::binary); boost::iostreams::filtering_ostream compress_stream; compress_stream.push(boost::iostreams::gzip_compressor()); compress_stream.push(outfile); for (const std::string &line : lines) { compress_stream << line << "\n"; } lines.clear(); return filename; } // File structure is [DATA_PATH]/crawl-data/NODE-[node_id]/files/uuid-file_index.gz std::string 
write_link_cache(size_t file_index, std::vector &lines, size_t node_id) { auto uuid = common::uuid(); const std::string filename = "crawl-data/LINK-" + std::to_string(node_id) + s_suffix + "/files/" + uuid + "-" + std::to_string(file_index) + ".gz"; std::ofstream outfile(config::data_path() + "/" + filename, std::ios::trunc | std::ios::binary); boost::iostreams::filtering_ostream compress_stream; compress_stream.push(boost::iostreams::gzip_compressor()); compress_stream.push(outfile); for (const std::string &line : lines) { compress_stream << line << "\n"; } lines.clear(); return filename; } void splitter(const std::vector &warc_paths, std::mutex &write_file_mutex) { const size_t max_cache_size = 10000; size_t file_index = 1; using vec2d_str = std::vector>; vec2d_str file_names(config::nodes_in_cluster); vec2d_str cache(config::nodes_in_cluster); for (const std::string &warc_path : warc_paths) { std::ifstream infile(warc_path); boost::iostreams::filtering_istream decompress_stream; decompress_stream.push(boost::iostreams::gzip_decompressor()); decompress_stream.push(infile); std::string line; while (getline(decompress_stream, line)) { const URL url(line.substr(0, line.find("\t"))); const size_t node_id = url.index_on_node(); cache[node_id].push_back(line); } for (size_t node_id = 0; node_id < config::nodes_in_cluster; node_id++) { if (cache[node_id].size() > max_cache_size) { file_names[node_id].push_back(write_cache(file_index++, cache[node_id], node_id)); } } } for (size_t node_id = 0; node_id < config::nodes_in_cluster; node_id++) { file_names[node_id].push_back(write_cache(file_index++, cache[node_id], node_id)); } write_file_mutex.lock(); for (size_t node_id = 0; node_id < config::nodes_in_cluster; node_id++) { const std::string filename = config::data_path() + "/crawl-data/NODE-" + std::to_string(node_id) + s_suffix + "/warc.paths"; std::ofstream outfile(filename, std::ios::app); for (const std::string &file : file_names[node_id]) { outfile << file << "\n"; } 
} write_file_mutex.unlock(); } void link_splitter(const std::vector &warc_paths, std::mutex &write_file_mutex) { const size_t max_cache_size = 1000000; size_t file_index = 1; using vec2d_str = std::vector>; vec2d_str file_names(config::nodes_in_cluster); vec2d_str cache(config::nodes_in_cluster); size_t done = 0; for (const std::string &warc_path : warc_paths) { std::cout << "done " << done << "/" << warc_paths.size() << std::endl; done++; std::ifstream infile(warc_path); boost::iostreams::filtering_istream decompress_stream; decompress_stream.push(boost::iostreams::gzip_decompressor()); decompress_stream.push(infile); std::string line; while (getline(decompress_stream, line)) { const url_link::link link(line); const size_t node_id = link.index_on_node(); cache[node_id].push_back(line); } for (size_t node_id = 0; node_id < config::nodes_in_cluster; node_id++) { if (cache[node_id].size() > max_cache_size) { file_names[node_id].push_back(write_link_cache(file_index++, cache[node_id], node_id)); } } } for (size_t node_id = 0; node_id < config::nodes_in_cluster; node_id++) { file_names[node_id].push_back(write_link_cache(file_index++, cache[node_id], node_id)); } write_file_mutex.lock(); for (size_t node_id = 0; node_id < config::nodes_in_cluster; node_id++) { const auto filename = config::data_path() + "/crawl-data/LINK-" + std::to_string(node_id) + s_suffix + "/warc.paths"; std::ofstream outfile(filename, std::ios::app); for (const std::string &file : file_names[node_id]) { outfile << file << "\n"; } } write_file_mutex.unlock(); } void link_splitter_with_hosts(const std::unordered_set &hosts, const std::vector &warc_paths, std::mutex &write_file_mutex) { const size_t max_cache_size = 1000000; size_t file_index = 1; using vec2d_str = std::vector>; vec2d_str file_names(config::nodes_in_cluster); vec2d_str cache(config::nodes_in_cluster); size_t done = 0; for (const std::string &warc_path : warc_paths) { std::cout << "done " << done << "/" << warc_paths.size() << 
std::endl; done++; std::ifstream infile(warc_path); boost::iostreams::filtering_istream decompress_stream; decompress_stream.push(boost::iostreams::gzip_decompressor()); decompress_stream.push(infile); std::string line; while (getline(decompress_stream, line)) { const url_link::link link(line); const auto target_host = link.target_host_hash(); if (hosts.count(target_host)) { const size_t node_id = link.index_on_node(); cache[node_id].push_back(line); } } for (size_t node_id = 0; node_id < config::nodes_in_cluster; node_id++) { if (cache[node_id].size() > max_cache_size) { file_names[node_id].push_back(write_link_cache(file_index++, cache[node_id], node_id)); } } } for (size_t node_id = 0; node_id < config::nodes_in_cluster; node_id++) { file_names[node_id].push_back(write_link_cache(file_index++, cache[node_id], node_id)); } write_file_mutex.lock(); for (size_t node_id = 0; node_id < config::nodes_in_cluster; node_id++) { const auto filename = config::data_path() + "/crawl-data/LINK-" + std::to_string(node_id) + s_suffix + "/warc.paths"; std::ofstream outfile(filename, std::ios::app); for (const std::string &file : file_names[node_id]) { outfile << file << "\n"; } } write_file_mutex.unlock(); } void splitter_with_urls(const std::unordered_set &urls, const std::vector &warc_paths, std::mutex &write_file_mutex) { const size_t max_cache_size = 150000; size_t file_index = 1; std::vector> file_names(config::nodes_in_cluster); std::vector> cache(config::nodes_in_cluster); size_t idx = 0; for (const std::string &warc_path : warc_paths) { std::cout << warc_path << std::endl; std::ifstream infile(warc_path); boost::iostreams::filtering_istream decompress_stream; decompress_stream.push(boost::iostreams::gzip_decompressor()); decompress_stream.push(infile); std::string line; while (getline(decompress_stream, line)) { const URL url(line.substr(0, line.find("\t"))); if (urls.count(url.hash())) { const size_t node_id = url.index_on_node(); cache[node_id].push_back(line); } } for 
(size_t node_id = 0; node_id < config::nodes_in_cluster; node_id++) { if (cache[node_id].size() > max_cache_size) { file_names[node_id].push_back(write_cache(file_index++, cache[node_id], node_id)); } } if (idx % 100 == 0) { std::cout << warc_path << " done " << idx << "/" << warc_paths.size() << std::endl; } idx++; } for (size_t node_id = 0; node_id < config::nodes_in_cluster; node_id++) { file_names[node_id].push_back(write_cache(file_index++, cache[node_id], node_id)); } write_file_mutex.lock(); for (size_t node_id = 0; node_id < config::nodes_in_cluster; node_id++) { const std::string filename = config::data_path() + "/crawl-data/NODE-" + std::to_string(node_id) + s_suffix + "/warc.paths"; std::ofstream outfile(filename, std::ios::app); for (const std::string &file : file_names[node_id]) { outfile << file << "\n"; } } write_file_mutex.unlock(); } void splitter_with_roaring(const ::roaring::Roaring64Map &urls, const std::vector &warc_paths, std::mutex &write_file_mutex) { const size_t max_cache_size = 150000; size_t file_index = 1; std::vector> file_names(config::nodes_in_cluster); std::vector> cache(config::nodes_in_cluster); size_t idx = 0; for (const std::string &warc_path : warc_paths) { std::cout << warc_path << std::endl; std::ifstream infile(warc_path); boost::iostreams::filtering_istream decompress_stream; decompress_stream.push(boost::iostreams::gzip_decompressor()); decompress_stream.push(infile); std::string line; while (getline(decompress_stream, line)) { const URL url(line.substr(0, line.find("\t"))); if (urls.contains(url.hash() >> 20)) { const size_t node_id = url.index_on_node(); cache[node_id].push_back(line); } } for (size_t node_id = 0; node_id < config::nodes_in_cluster; node_id++) { if (cache[node_id].size() > max_cache_size) { file_names[node_id].push_back(write_cache(file_index++, cache[node_id], node_id)); } } if (idx % 100 == 0) { std::cout << warc_path << " done " << idx << "/" << warc_paths.size() << std::endl; } idx++; } for (size_t 
node_id = 0; node_id < config::nodes_in_cluster; node_id++) { file_names[node_id].push_back(write_cache(file_index++, cache[node_id], node_id)); } write_file_mutex.lock(); for (size_t node_id = 0; node_id < config::nodes_in_cluster; node_id++) { const std::string filename = config::data_path() + "/crawl-data/NODE-" + std::to_string(node_id) + s_suffix + "/warc.paths"; std::ofstream outfile(filename, std::ios::app); for (const std::string &file : file_names[node_id]) { outfile << file << "\n"; } } write_file_mutex.unlock(); } void splitter_with_bloom(const ::algorithm::bloom_filter &bloom, const std::vector &warc_paths, std::mutex &write_file_mutex) { const size_t max_cache_size = 10000; size_t file_index = 1; std::vector> file_names(config::nodes_in_cluster); std::vector> cache(config::nodes_in_cluster); size_t idx = 0; for (const std::string &warc_path : warc_paths) { std::ifstream infile(warc_path); boost::iostreams::filtering_istream decompress_stream; decompress_stream.push(boost::iostreams::gzip_decompressor()); decompress_stream.push(infile); std::string line; while (getline(decompress_stream, line)) { const URL url(line.substr(0, line.find("\t"))); if (bloom.exists(url.hash())) { const size_t node_id = url.index_on_node(); cache[node_id].push_back(line); } } for (size_t node_id = 0; node_id < config::nodes_in_cluster; node_id++) { if (cache[node_id].size() > max_cache_size) { file_names[node_id].push_back(write_cache(file_index++, cache[node_id], node_id)); } } if (idx % 100 == 0) { std::cout << warc_path << " done " << idx << "/" << warc_paths.size() << std::endl; } idx++; } for (size_t node_id = 0; node_id < config::nodes_in_cluster; node_id++) { file_names[node_id].push_back(write_cache(file_index++, cache[node_id], node_id)); } write_file_mutex.lock(); for (size_t node_id = 0; node_id < config::nodes_in_cluster; node_id++) { const std::string filename = config::data_path() + "/crawl-data/NODE-" + std::to_string(node_id) + s_suffix + "/warc.paths"; 
std::ofstream outfile(filename, std::ios::app); for (const std::string &file : file_names[node_id]) { outfile << file << "\n"; } } write_file_mutex.unlock(); } std::unordered_set build_link_set(const std::vector &warc_paths, size_t hash_min, size_t hash_max) { std::unordered_set result; for (const std::string &warc_path : warc_paths) { std::ifstream infile(warc_path); boost::iostreams::filtering_istream decompress_stream; decompress_stream.push(boost::iostreams::gzip_decompressor()); decompress_stream.push(infile); std::string line; while (getline(decompress_stream, line)) { const url_link::link link(line); const size_t hash = link.target_url().hash(); if (hash >= hash_min && hash <= hash_max) { result.insert(hash); } } } return result; } /* * Input is a vector with paths to url files. Returns an unordered set with all the host hashes. * */ std::unordered_set build_url_host_set(const std::vector &warc_paths) { std::unordered_set hosts; for (const std::string &warc_path : warc_paths) { std::ifstream infile(warc_path); boost::iostreams::filtering_istream decompress_stream; decompress_stream.push(boost::iostreams::gzip_decompressor()); decompress_stream.push(infile); std::string line; while (getline(decompress_stream, line)) { const URL url(line.substr(0, line.find("\t"))); hosts.insert(url.host_hash()); } } return hosts; } std::unordered_set build_url_set(const std::vector &warc_paths) { std::unordered_set url_hashes; for (const std::string &warc_path : warc_paths) { std::ifstream infile(warc_path); boost::iostreams::filtering_istream decompress_stream; decompress_stream.push(boost::iostreams::gzip_decompressor()); decompress_stream.push(infile); std::string line; while (getline(decompress_stream, line)) { const URL url(line.substr(0, line.find("\t"))); url_hashes.insert(url.hash()); } } return url_hashes; } void create_warc_directories() { // Create directories. 
for (const auto &batch : target_url_batches()) { boost::filesystem::create_directories(config::data_path() + "/crawl-data/" + batch); boost::filesystem::create_directories(config::data_path() + "/crawl-data/" + batch + "/files"); } for (const auto &batch : target_link_batches()) { boost::filesystem::create_directories(config::data_path() + "/crawl-data/" + batch); boost::filesystem::create_directories(config::data_path() + "/crawl-data/" + batch + "/files"); } } void run_splitter() { tools::create_warc_directories(); std::vector threads; auto files = generate_list_with_url_files(); auto link_files = generate_list_with_link_files(); std::vector> thread_input; algorithm::vector_chunk(files, ceil((double)files.size() / s_num_threads), thread_input); std::vector> link_thread_input; algorithm::vector_chunk(link_files, ceil((double)link_files.size() / s_num_threads), link_thread_input); std::mutex write_file_mutex; /* Run splitter threads */ for (size_t i = 0; i < thread_input.size(); i++) { threads.emplace_back(std::thread(splitter, thread_input[i], ref(write_file_mutex))); } for (std::thread &one_thread : threads) { one_thread.join(); } threads.clear(); /* Run link_splitter threads for (size_t i = 0; i < link_thread_input.size(); i++) { threads.emplace_back(thread(link_splitter, link_thread_input[i], ref(write_file_mutex))); } for (thread &one_thread : threads) { one_thread.join(); } */ } void run_url_splitter_on_urls_in_set(const std::unordered_set &urls) { tools::create_warc_directories(); std::vector threads; auto files = generate_list_with_url_files(); std::vector> thread_input; algorithm::vector_chunk(files, ceil((double)files.size() / s_num_threads), thread_input); std::mutex write_file_mutex; /* Run splitter threads */ for (size_t i = 0; i < thread_input.size(); i++) { threads.emplace_back(std::thread(splitter_with_urls, std::cref(urls), std::cref(thread_input[i]), ref(write_file_mutex))); } for (std::thread &one_thread : threads) { one_thread.join(); } } void 
run_url_splitter_on_urls_in_roaring(const ::roaring::Roaring64Map &urls) { tools::create_warc_directories(); std::vector threads; auto files = generate_list_with_url_files(); std::vector> thread_input; algorithm::vector_chunk(files, ceil((double)files.size() / s_num_threads), thread_input); std::mutex write_file_mutex; /* Run splitter threads */ for (size_t i = 0; i < thread_input.size(); i++) { threads.emplace_back(std::thread(splitter_with_roaring, std::cref(urls), std::cref(thread_input[i]), ref(write_file_mutex))); } for (std::thread &one_thread : threads) { one_thread.join(); } } void run_url_splitter_on_urls_in_bloom_filter(const ::algorithm::bloom_filter &bloom) { tools::create_warc_directories(); std::vector threads; auto files = generate_list_with_url_files(); std::vector> thread_input; algorithm::vector_chunk(files, ceil((double)files.size() / s_num_threads), thread_input); std::mutex write_file_mutex; /* Run splitter threads */ for (size_t i = 0; i < thread_input.size(); i++) { threads.emplace_back(std::thread(splitter_with_bloom, std::cref(bloom), std::cref(thread_input[i]), ref(write_file_mutex))); } for (std::thread &one_thread : threads) { one_thread.join(); } } void run_link_splitter_on_links_with_target_host_in_set(const std::unordered_set &hosts) { tools::create_warc_directories(); std::vector threads; auto files = generate_list_with_link_files(); std::vector> thread_input; algorithm::vector_chunk(files, ceil((double)files.size() / s_num_threads), thread_input); std::mutex write_file_mutex; /* Run splitter threads */ for (size_t i = 0; i < thread_input.size(); i++) { threads.emplace_back(std::thread(link_splitter_with_hosts, std::cref(hosts), std::cref(thread_input[i]), ref(write_file_mutex))); } for (std::thread &one_thread : threads) { one_thread.join(); } } std::unordered_set generate_set_of_urls() { auto url_files = generate_list_with_url_files(); // create an unordered set that contains host hashes of all the urls. 
std::cout << "building url hashes map" << std::endl; std::unordered_set url_hashes; std::vector> thread_input; algorithm::vector_chunk(url_files, ceil((double)url_files.size() / s_num_threads), thread_input); std::vector>> futures; for (size_t i = 0; i < thread_input.size(); i++) { futures.emplace_back(std::async(std::launch::async, build_url_set, thread_input[i])); } for (auto &fut : futures) { auto result = fut.get(); url_hashes.insert(result.begin(), result.end()); } return url_hashes; } void run_split_links_with_relevant_domains() { auto url_files = generate_list_with_target_url_files(); // create an unordered set that contains host hashes of all the urls. std::cout << "building host hashes map" << std::endl; std::unordered_set host_hashes; std::vector> thread_input; algorithm::vector_chunk(url_files, ceil((double)url_files.size() / s_num_threads), thread_input); std::vector>> futures; for (size_t i = 0; i < thread_input.size(); i++) { futures.emplace_back(std::async(std::launch::async, build_url_host_set, thread_input[i])); } for (auto &fut : futures) { auto result = fut.get(); host_hashes.insert(result.begin(), result.end()); } std::cout << "done. 
the map size is " << host_hashes.size() << std::endl; run_link_splitter_on_links_with_target_host_in_set(host_hashes); } void split_make_bloom(::algorithm::bloom_filter &bloom, const std::vector &warc_paths) { std::vector cache; size_t idx = 0; for (const std::string &warc_path : warc_paths) { std::ifstream infile(warc_path); boost::iostreams::filtering_istream decompress_stream; decompress_stream.push(boost::iostreams::gzip_decompressor()); decompress_stream.push(infile); std::string line; while (getline(decompress_stream, line)) { const URL url(line.substr(0, line.find("\t"))); cache.push_back(url.hash()); } bloom.insert_many(cache); cache.clear(); if (idx % 100 == 0) { std::cout << warc_path << " done " << idx << "/" << warc_paths.size() << std::endl; } idx++; } } void run_split_build_url_bloom() { std::vector threads; auto files = generate_list_with_url_files(); std::vector> thread_input; algorithm::vector_chunk(files, ceil((double)files.size() / s_num_threads), thread_input); ::algorithm::bloom_filter bloom; /* Run splitter threads */ for (size_t i = 0; i < thread_input.size(); i++) { threads.emplace_back(std::thread(split_make_bloom, std::ref(bloom), std::cref(thread_input[i]))); } for (std::thread &one_thread : threads) { one_thread.join(); } bloom.write_file(config::data_path() + "/0/url_filter_main.bloom"); } void split_make_direct_links(const ::algorithm::bloom_filter &bloom, const std::vector &warc_paths) { size_t done = 0; for (const std::string &warc_path : warc_paths) { std::cout << "done " << done << "/" << warc_paths.size() << std::endl; done++; auto target_warc_path = warc_path; const size_t pos = target_warc_path.find(".links.gz"); if (pos != std::string::npos) { target_warc_path.replace(pos, 9, ".direct.links.gz"); } else { std::cout << "ERROR: " << warc_path << std::endl; return; } std::ofstream outfile(target_warc_path, std::ios::trunc | std::ios::binary); boost::iostreams::filtering_ostream compress_stream; 
compress_stream.push(boost::iostreams::gzip_compressor()); compress_stream.push(outfile); std::ifstream infile(warc_path); boost::iostreams::filtering_istream decompress_stream; decompress_stream.push(boost::iostreams::gzip_decompressor()); decompress_stream.push(infile); std::string line; while (getline(decompress_stream, line)) { const url_link::link link(line); if (bloom.exists(link.target_url().hash())) { compress_stream << line << "\n"; } } } } void run_split_direct_links() { ::algorithm::bloom_filter bloom; bloom.read_file(config::data_path() + "/0/url_filter_main.bloom"); std::vector threads; auto files = generate_list_with_link_files(); std::vector> thread_input; algorithm::vector_chunk(files, ceil((double)files.size() / s_num_threads), thread_input); /* Run splitter threads */ for (size_t i = 0; i < thread_input.size(); i++) { threads.emplace_back(std::thread(split_make_direct_links, std::cref(bloom), std::cref(thread_input[i]))); } for (std::thread &one_thread : threads) { one_thread.join(); } } void split_make_link_bloom(::algorithm::bloom_filter &bloom, const std::vector &warc_paths) { std::vector cache; size_t idx = 0; for (const std::string &warc_path : warc_paths) { std::ifstream infile(warc_path); boost::iostreams::filtering_istream decompress_stream; decompress_stream.push(boost::iostreams::gzip_decompressor()); decompress_stream.push(infile); std::string line; while (getline(decompress_stream, line)) { const url_link::link link(line); const size_t hash = link.target_url().hash(); cache.push_back(hash); } bloom.insert_many(cache); cache.clear(); if (idx % 100 == 0) { std::cout << warc_path << " done " << idx << "/" << warc_paths.size() << std::endl; } idx++; } } void run_split_build_direct_link_bloom() { std::vector threads; auto files = generate_list_with_direct_link_files(); std::vector> thread_input; algorithm::vector_chunk(files, ceil((double)files.size() / s_num_threads), thread_input); ::algorithm::bloom_filter bloom; /* Run splitter threads 
*/ for (size_t i = 0; i < thread_input.size(); i++) { threads.emplace_back(std::thread(split_make_link_bloom, std::ref(bloom), std::cref(thread_input[i]))); } for (std::thread &one_thread : threads) { one_thread.join(); } bloom.write_file(config::data_path() + "/0/direct_link_filter_main.bloom"); } void run_split_urls_with_direct_links() { ::algorithm::bloom_filter bloom; bloom.read_file(config::data_path() + "/0/direct_link_filter_main.bloom"); run_url_splitter_on_urls_in_bloom_filter(bloom); } } ================================================ FILE: src/tools/splitter.h ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #pragma once #include #include namespace tools { const std::string s_suffix = "-small"; const size_t s_num_threads = 12; std::vector target_url_batches(); std::vector target_link_batches(); std::vector generate_list_with_url_files(); std::vector generate_list_with_link_files(); std::vector generate_list_with_target_url_files(); std::vector generate_list_with_target_link_files(); void run_splitter(); void run_split_urls_with_direct_links(); void run_split_links_with_relevant_domains(); void run_split_build_url_bloom(); void run_split_direct_links(); void run_split_build_direct_link_bloom(); } ================================================ FILE: src/transfer/transfer.cpp ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #include "config.h" #include "transfer.h" #include #include "common/ThreadPool.h" #include "logger/logger.h" #include "profiler/profiler.h" #include "file/file.h" #include "text/text.h" #include "parser/parser.h" #include "algorithm/hash.h" using namespace std;

namespace transfer {

/*
 * libcurl write callback: appends the size * nmemb bytes at ptr to the stringstream ss and reports
 * all of them as consumed.
 * */
size_t curl_stringstream_writer(void *ptr, size_t size, size_t nmemb, stringstream *ss) {
	size_t byte_size = size * nmemb;
	ss->write((char *)ptr, byte_size);
	return byte_size;
}

/*
 * libcurl write callback: same as curl_stringstream_writer but targets any ostream.
 * */
size_t curl_ostream_writer(void *ptr, size_t size, size_t nmemb, ostream *os) {
	size_t byte_size = size * nmemb;
	os->write((char *)ptr, byte_size);
	return byte_size;
}

/*
 * libcurl write callback: appends the received bytes to the string str.
 * */
size_t curl_string_writer(void *ptr, size_t size, size_t nmemb, string *str) {
	size_t byte_size = size * nmemb;
	str->append((char *)ptr, byte_size);
	return byte_size;
}

/*
 * Cursor over an in-memory buffer, handed to curl_string_reader as userdata.
 * buffer is not owned; offset is the number of bytes already handed to libcurl.
 * */
struct curl_string_read_struct {
	const char *buffer;
	size_t buffer_len;
	size_t offset;
};

/*
 * libcurl read callback: copies at most size * nmemb bytes from the curl_string_read_struct in userdata
 * into ptr, advances its offset and returns the number of bytes copied. Returns 0 once the whole buffer
 * has been consumed.
 * */
size_t curl_string_reader(char *ptr, size_t size, size_t nmemb, void *userdata) {
	struct curl_string_read_struct *arg = (struct curl_string_read_struct *)userdata;
	if (arg->offset >= arg->buffer_len) {
		return 0ull;
	}
	size_t max_read = size * nmemb;
	size_t read_bytes = arg->buffer_len - arg->offset;
	// Never hand libcurl more than it asked for.
	if (read_bytes > max_read) read_bytes = max_read;
	memcpy(ptr, &arg->buffer[arg->offset], read_bytes);
	arg->offset += read_bytes;
	return read_bytes;
}

/*
 * libcurl read callback: reads at most size * nmemb bytes from the std::ifstream in userdata into ptr
 * and returns the number of bytes actually read. Returns 0 once the stream's eof flag is set.
 *
 * NOTE(review): only eof() is tested. A stream that failed to open is not at eof, so read() extracts
 * nothing and 0 is returned, which the caller cannot tell apart from an empty file.
 * */
size_t curl_file_reader(char *ptr, size_t size, size_t nmemb, void *userdata) {
	std::ifstream *infile = (std::ifstream *)userdata;
	if (infile->eof()) {
		return 0ull;
	}
	size_t max_read = size * nmemb;
	infile->read(ptr, max_read);
	return infile->gcount();
}

/*
 * Sets CURLOPT_USERNAME / CURLOPT_PASSWORD on curl from the username and password names in scope.
 * */
void set_internal_auth(CURL *curl) {
	curl_easy_setopt(curl, CURLOPT_USERNAME, username.c_str());
	curl_easy_setopt(curl, CURLOPT_PASSWORD, password.c_str());
}

/*
 * Turns a path into a full url. Strings that already start with "http://" or "https://" are returned
 * unchanged; everything else is prefixed with "http://" + config::master, inserting a "/" when the
 * path is non-empty and does not start with one.
 * */
string make_url(const string &url) {
	if (url.find("http://") == 0 || url.find("https://") == 0) {
		return url;
	}
	if (url.size() && url[0] != '/') {
		return "http://" + config::master + "/" + url;
	}
	return "http://" + config::master + url;
}
string file_to_string(const string &file_path, int &error) { CURL *curl = curl_easy_init(); error = ERROR; if (curl) { CURLcode res; LOG_INFO("Downloading url: " + make_url(file_path)); curl_easy_setopt(curl, CURLOPT_URL, make_url(file_path).c_str()); set_internal_auth(curl); stringstream response; curl_easy_setopt(curl, CURLOPT_WRITEDATA, &response); curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, curl_stringstream_writer); res = curl_easy_perform(curl); if (res == CURLE_OK) { long response_code; curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &response_code); if (response_code == 200) { error = OK; } } curl_easy_cleanup(curl); return response.str(); } return ""; } string gz_file_to_string(const string &file_path, int &error) { CURL *curl = curl_easy_init(); error = ERROR; if (curl) { CURLcode res; LOG_INFO("Downloading url: " + make_url(file_path)); curl_easy_setopt(curl, CURLOPT_URL, make_url(file_path).c_str()); set_internal_auth(curl); stringstream response; curl_easy_setopt(curl, CURLOPT_WRITEDATA, &response); curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, curl_stringstream_writer); res = curl_easy_perform(curl); string response_str; try { boost::iostreams::filtering_istream decompress_stream; decompress_stream.push(boost::iostreams::gzip_decompressor()); decompress_stream.push(response); response_str = string(istreambuf_iterator(decompress_stream), {}); } catch (...) 
{ curl_easy_cleanup(curl); error = ERROR; return ""; } if (res == CURLE_OK) { long response_code; curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &response_code); if (response_code == 200) { error = OK; } } curl_easy_cleanup(curl); return response_str; } return ""; } void file_to_stream(const string &file_path, ostream &output_stream, int &error) { CURL *curl = curl_easy_init(); error = ERROR; if (curl) { CURLcode res; LOG_INFO("Downloading url: " + make_url(file_path)); curl_easy_setopt(curl, CURLOPT_URL, make_url(file_path).c_str()); set_internal_auth(curl); curl_easy_setopt(curl, CURLOPT_WRITEDATA, &output_stream); curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, curl_ostream_writer); res = curl_easy_perform(curl); if (res == CURLE_OK) { long response_code; curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &response_code); if (response_code == 200) { error = OK; } } curl_easy_cleanup(curl); } } void gz_file_to_stream(const string &file_path, ostream &output_stream, int &error) { CURL *curl = curl_easy_init(); error = ERROR; if (curl) { CURLcode res; LOG_INFO("Downloading url: " + make_url(file_path)); curl_easy_setopt(curl, CURLOPT_URL, make_url(file_path).c_str()); set_internal_auth(curl); stringstream response; curl_easy_setopt(curl, CURLOPT_WRITEDATA, &response); curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, curl_stringstream_writer); res = curl_easy_perform(curl); if (res == CURLE_OK) { long response_code; curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &response_code); if (response_code == 200) { error = OK; } } try { boost::iostreams::filtering_istream decompress_stream; decompress_stream.push(boost::iostreams::gzip_decompressor()); decompress_stream.push(response); output_stream << decompress_stream.rdbuf(); } catch(...) 
{ error = ERROR; } curl_easy_cleanup(curl); } } void url_to_string(const string &url, string &buffer, int &error) { CURL *curl = curl_easy_init(); error = ERROR; const size_t original_buffer_size = buffer.size(); if (curl) { CURLcode res; LOG_INFO("Downloading url: " + url); curl_easy_setopt(curl, CURLOPT_URL, url.c_str()); curl_easy_setopt(curl, CURLOPT_LOW_SPEED_LIMIT, 5000); curl_easy_setopt(curl, CURLOPT_LOW_SPEED_TIME, 5); curl_easy_setopt(curl, CURLOPT_WRITEDATA, &buffer); curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, curl_string_writer); res = curl_easy_perform(curl); if (res == CURLE_OK) { long response_code; curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &response_code); if (response_code >= 200 && response_code < 300) { error = OK; } } else { // If an error ocurred we set the size of the buffer to the original size, removing any appended data. buffer.resize(original_buffer_size); } curl_easy_cleanup(curl); } } string run_gz_download_thread(const string &file_path) { size_t hsh = algorithm::hash(file_path); const string target_filename = config::data_path() + "/" + to_string(hsh % 8) + "/tmp/tmp_" + to_string(hsh); ofstream target_file(target_filename, ios::binary | ios::trunc); int error; gz_file_to_stream(file_path, target_file, error); if (error != OK) { return ""; } return target_filename; } vector download_gz_files_to_disk(const vector &files_to_download) { ThreadPool pool(config::num_async_file_transfers); std::vector> results; for (const string &file : files_to_download) { results.emplace_back( pool.enqueue([file] { return run_gz_download_thread(file); }) ); } vector local_filenames; for(auto && result: results) { const string filename = result.get(); if (filename != "") { local_filenames.push_back(filename); } } return local_filenames; } void delete_downloaded_files(const vector &files) { LOG_INFO("Deleting " + to_string(files.size()) + " downloaded files"); for (const string &file : files) { file::delete_file(file); } } size_t 
head_content_length(const string &url, int &error) { CURL *curl = curl_easy_init(); error = ERROR; if (curl) { CURLcode res; LOG_INFO("Making head request to:" + url); curl_easy_setopt(curl, CURLOPT_URL, url.c_str()); stringstream response; curl_easy_setopt(curl, CURLOPT_NOBODY, 1); curl_easy_setopt(curl, CURLOPT_HEADER, 1); curl_easy_setopt(curl, CURLOPT_WRITEDATA, &response); curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, curl_stringstream_writer); res = curl_easy_perform(curl); string response_str; try { response_str = string(istreambuf_iterator(response), {}); } catch (...) { curl_easy_cleanup(curl); error = ERROR; return 0; } if (res == CURLE_OK) { long response_code; curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &response_code); if (response_code == 200) { error = OK; } else { curl_easy_cleanup(curl); return 0; } } curl_easy_cleanup(curl); const string content_len_str = parser::get_http_header(text::lower_case(response_str), "content-length: "); size_t content_len; try { content_len = stoull(content_len_str); } catch (...) 
{ error = ERROR; return 0; } return content_len; } return 0; } int upload_file(const string &path, const string &data) { CURL *curl = curl_easy_init(); if (curl) { CURLcode res; const string url = "http://" + config::upload + "/" + path; LOG_INFO("Uploading file to:" + url); curl_easy_setopt(curl, CURLOPT_URL, url.c_str()); curl_easy_setopt(curl, CURLOPT_LOW_SPEED_TIME, 30L); curl_easy_setopt(curl, CURLOPT_LOW_SPEED_LIMIT, 30L); struct curl_string_read_struct arg; arg.buffer = data.c_str(); arg.buffer_len = data.size(); arg.offset = 0; curl_easy_setopt(curl, CURLOPT_UPLOAD, 1l); curl_easy_setopt(curl, CURLOPT_USERNAME, config::file_upload_user.c_str()); curl_easy_setopt(curl, CURLOPT_PASSWORD, config::file_upload_password.c_str()); curl_easy_setopt(curl, CURLOPT_READFUNCTION, curl_string_reader); curl_easy_setopt(curl, CURLOPT_READDATA, &arg); res = curl_easy_perform(curl); curl_easy_cleanup(curl); if (res == CURLE_OK) { return OK; } return ERROR; } return ERROR; } int upload_gz_file(const string &path, const string &data) { CURL *curl = curl_easy_init(); if (curl) { CURLcode res; const string url = "http://" + config::upload + "/" + path; LOG_INFO("Uploading file to:" + url); curl_easy_setopt(curl, CURLOPT_URL, url.c_str()); curl_easy_setopt(curl, CURLOPT_LOW_SPEED_TIME, 30L); curl_easy_setopt(curl, CURLOPT_LOW_SPEED_LIMIT, 30L); stringstream ss(data); boost::iostreams::filtering_istream compress_stream; compress_stream.push(boost::iostreams::gzip_compressor()); compress_stream.push(ss); string compressed_data = string(istreambuf_iterator(compress_stream), {}); struct curl_string_read_struct arg; arg.buffer = compressed_data.c_str(); arg.buffer_len = compressed_data.size(); arg.offset = 0; curl_easy_setopt(curl, CURLOPT_UPLOAD, 1l); curl_easy_setopt(curl, CURLOPT_USERNAME, config::file_upload_user.c_str()); curl_easy_setopt(curl, CURLOPT_PASSWORD, config::file_upload_password.c_str()); curl_easy_setopt(curl, CURLOPT_READFUNCTION, curl_string_reader); 
curl_easy_setopt(curl, CURLOPT_READDATA, &arg); res = curl_easy_perform(curl); curl_easy_cleanup(curl); if (res == CURLE_OK) { return OK; } return ERROR; } return ERROR; } int upload_file_from_disk(const string &dest_path, const string &filename) { CURL *curl = curl_easy_init(); if (curl) { CURLcode res; const string url = "http://" + config::upload + "/" + dest_path; LOG_INFO("Uploading file to:" + url); curl_easy_setopt(curl, CURLOPT_URL, url.c_str()); curl_easy_setopt(curl, CURLOPT_LOW_SPEED_TIME, 30L); curl_easy_setopt(curl, CURLOPT_LOW_SPEED_LIMIT, 30L); std::ifstream infile(filename, std::ios::in | std::ios::binary); curl_easy_setopt(curl, CURLOPT_UPLOAD, 1l); curl_easy_setopt(curl, CURLOPT_USERNAME, config::file_upload_user.c_str()); curl_easy_setopt(curl, CURLOPT_PASSWORD, config::file_upload_password.c_str()); curl_easy_setopt(curl, CURLOPT_READFUNCTION, curl_file_reader); curl_easy_setopt(curl, CURLOPT_READDATA, &infile); res = curl_easy_perform(curl); curl_easy_cleanup(curl); if (res == CURLE_OK) { return OK; } return ERROR; } return ERROR; } /* * Perform simple GET request and return response. 
* */
http::response get(const string &url) {
	return get(url, vector<string>{});
}

/*
 * Performs a GET request to url with the extra request headers in headers (one "Name: value" string
 * per element) and the file upload credentials. The returned response carries the HTTP status code
 * (0 if none was received) and whatever body was received. A default constructed response is
 * returned if no curl handle could be created.
 * */
http::response get(const string &url, const vector<string> &headers) {
	http::response response;

	CURL *curl = curl_easy_init();
	if (!curl) {
		return response;
	}

	struct curl_slist *header_list = nullptr;
	for (const string &header : headers) {
		// curl_slist_append returns NULL on failure; keep the existing list so it can still be freed.
		struct curl_slist *appended = curl_slist_append(header_list, header.c_str());
		if (appended != nullptr) {
			header_list = appended;
		}
	}

	curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
	curl_easy_setopt(curl, CURLOPT_USERNAME, config::file_upload_user.c_str());
	curl_easy_setopt(curl, CURLOPT_PASSWORD, config::file_upload_password.c_str());
	curl_easy_setopt(curl, CURLOPT_HTTPHEADER, header_list);

	stringstream response_stream;
	curl_easy_setopt(curl, CURLOPT_WRITEDATA, &response_stream);
	curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, curl_stringstream_writer);

	curl_easy_perform(curl);

	// CURLINFO_RESPONSE_CODE writes a long; reading it through a size_t pointer is only correct
	// on platforms where both types happen to have the same size.
	long code = 0;
	curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &code);

	response.code(static_cast<size_t>(code));
	response.body(response_stream.str());

	curl_slist_free_all(header_list);
	curl_easy_cleanup(curl);

	return response;
}

/* * Perform simple POST request and return response.
* */
http::response post(const string &url, const string &data) {
	return post(url, data, {});
}

/*
 * Performs a POST request to url with data as the request body, the extra request headers in headers
 * (one "Name: value" string per element) and the file upload credentials. On a successful transfer the
 * response carries the HTTP status code and the body; if the transfer fails the code is 0 and the body
 * is empty. A default constructed response is returned if no curl handle could be created.
 * */
http::response post(const string &url, const string &data, const vector<string> &headers) {
	http::response response;

	CURL *curl = curl_easy_init();
	if (!curl) {
		return response;
	}

	struct curl_slist *header_list = nullptr;
	for (const string &header : headers) {
		// curl_slist_append returns NULL on failure; keep the existing list so it can still be freed.
		struct curl_slist *appended = curl_slist_append(header_list, header.c_str());
		if (appended != nullptr) {
			header_list = appended;
		}
	}

	curl_easy_setopt(curl, CURLOPT_URL, url.c_str());

	struct curl_string_read_struct arg;
	arg.buffer = data.c_str();
	arg.buffer_len = data.size();
	arg.offset = 0;

	curl_easy_setopt(curl, CURLOPT_POST, 1L);
	curl_easy_setopt(curl, CURLOPT_USERNAME, config::file_upload_user.c_str());
	curl_easy_setopt(curl, CURLOPT_PASSWORD, config::file_upload_password.c_str());
	curl_easy_setopt(curl, CURLOPT_HTTPHEADER, header_list);
	curl_easy_setopt(curl, CURLOPT_READFUNCTION, curl_string_reader);
	curl_easy_setopt(curl, CURLOPT_READDATA, &arg);

	stringstream response_stream;
	curl_easy_setopt(curl, CURLOPT_WRITEDATA, &response_stream);
	curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, curl_stringstream_writer);

	const CURLcode curl_result = curl_easy_perform(curl);

	if (curl_result == CURLE_OK) {
		// CURLINFO_RESPONSE_CODE writes a long, not a size_t.
		long code = 0;
		curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &code);
		response.code(static_cast<size_t>(code));
		response.body(response_stream.str());
	} else {
		response.code(0);
		response.body("");
	}

	// The header list is owned by the caller of curl_easy_setopt and has to be released after the transfer.
	curl_slist_free_all(header_list);
	curl_easy_cleanup(curl);

	return response;
}

/* * Perform simple PUT request and return response.
* */ http::response put(const string &url, const string &data) { CURL *curl = curl_easy_init(); http::response response; if (curl) { curl_easy_setopt(curl, CURLOPT_URL, url.c_str()); curl_easy_setopt(curl, CURLOPT_LOW_SPEED_TIME, 30L); curl_easy_setopt(curl, CURLOPT_LOW_SPEED_LIMIT, 30L); struct curl_string_read_struct arg; arg.buffer = data.c_str(); arg.buffer_len = data.size(); arg.offset = 0; curl_easy_setopt(curl, CURLOPT_UPLOAD, 1l); curl_easy_setopt(curl, CURLOPT_USERNAME, config::file_upload_user.c_str()); curl_easy_setopt(curl, CURLOPT_PASSWORD, config::file_upload_password.c_str()); curl_easy_setopt(curl, CURLOPT_READFUNCTION, curl_string_reader); curl_easy_setopt(curl, CURLOPT_READDATA, &arg); stringstream response_stream; curl_easy_setopt(curl, CURLOPT_WRITEDATA, &response_stream); curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, curl_stringstream_writer); curl_easy_perform(curl); size_t code = 0; curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &code); response.code(code); response.body(response_stream.str()); curl_easy_cleanup(curl); } return response; } } ================================================ FILE: src/transfer/transfer.h ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. 
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #pragma once #include #include #include #include "http/response.h" #include #include namespace transfer { const std::string username = "alexandria"; const std::string password = "wmXN6U4u"; const int OK = 0; const int ERROR = 1; size_t curl_stringstream_writer(void *ptr, size_t size, size_t nmemb, std::stringstream *ss); size_t curl_ostream_writer(void *ptr, size_t size, size_t nmemb, std::ostream *os); std::string file_to_string(const std::string &file_path, int &error); std::string gz_file_to_string(const std::string &file_path, int &error); void file_to_stream(const std::string &file_path, std::ostream &output_stream, int &error); void gz_file_to_stream(const std::string &file_path, std::ostream &output_stream, int &error); void url_to_string(const std::string &url, std::string &buffer, int &error); std::vector download_gz_files_to_disk(const std::vector &files_to_download); void delete_downloaded_files(const std::vector &files); // Make a http HEAD request and return the content length. Return 0 on failure and sets the error parameter to transfer::ERROR size_t head_content_length(const std::string &url, int &error); int upload_file(const std::string &path, const std::string &data); int upload_gz_file(const std::string &path, const std::string &data); int upload_file_from_disk(const std::string &dest_path, const std::string &filename); /* * Perform simple GET request and return response. 
* */ http::response get(const std::string &url); http::response get(const std::string &url, const std::vector &headers); /* * Perform simple POST request and return response. * */ http::response post(const std::string &url, const std::string &data); http::response post(const std::string &url, const std::string &data, const std::vector &headers); /* * Perform simple PUT request and return response. * */ http::response put(const std::string &url, const std::string &data); } ================================================ FILE: src/url_link/link.cpp ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #include "link.h" #include using namespace std; namespace url_link { link::link() { } link::link(const string &standard_link_data) { vector col_values; boost::algorithm::split(col_values, standard_link_data, boost::is_any_of("\t")); m_source_url = URL(col_values[0], col_values[1]); m_target_url = URL(col_values[2], col_values[3]); m_link_text = col_values[4].substr(0, 1000); m_target_host_hash = m_target_url.host_hash(); m_source_harmonic = 0; m_target_harmonic = 0; } link::link(const URL &source_url, const URL &target_url, float source_harmonic, float target_harmonic) : m_source_url(source_url), m_target_url(target_url), m_target_host_hash(target_url.host_hash()), m_source_harmonic(source_harmonic), m_target_harmonic(target_harmonic) { } link::~link() { } float link::url_score() const { return max(m_source_harmonic - m_target_harmonic, m_source_harmonic / 100.0f); } float link::domain_score() const { return max(m_source_harmonic - m_target_harmonic, m_source_harmonic / 100.0f)/100.0; } } ================================================ FILE: src/url_link/link.h ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. 
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */
#pragma once

#include "URL.h"
#include "config.h"

namespace url_link {

	/*
	 * A link from a source url to a target url, together with the harmonic value of both ends and the
	 * hash of the target host.
	 * */
	class link {

	public:
		link();

		/*
		 * Parses a line of tab separated link data (see link.cpp for the column layout).
		 * */
		explicit link(const std::string &standard_link_data);

		/*
		 * Creates a link from already parsed urls and known harmonic values.
		 * */
		link(const URL &source_url, const URL &target_url, float source_harmonic, float target_harmonic);
		~link();

		// Scores derived from the two harmonic values; the formulas live in link.cpp.
		float url_score() const;
		float domain_score() const;

		// Plain accessors. They return references to the members, so the returned references are only
		// valid for as long as this object is.
		const URL &source_url() const { return m_source_url; }
		const URL &target_url() const { return m_target_url; }
		const uint64_t &target_host_hash() const { return m_target_host_hash; }
		const float &source_harmonic() const { return m_source_harmonic; }
		const float &target_harmonic() const { return m_target_harmonic; }

		// Target host hash modulo config::nodes_in_cluster.
		// NOTE(review): presumably the index of the cluster node responsible for this link; confirm against callers.
		size_t index_on_node() const {
			return target_url().host_hash() % config::nodes_in_cluster;
		}

	private:
		URL m_source_url;
		URL m_target_url;
		// Host hash of m_target_url, stored by the constructors that take urls or link data.
		uint64_t m_target_host_hash;
		float m_source_harmonic;
		float m_target_harmonic;
		// Only set by the parsing constructor; there is no accessor for it in this class.
		std::string m_link_text;

	};

}
================================================ FILE: src/utils/id_allocator.h ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al.
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #pragma once #include #include

namespace utils {

	/*
	 * Very simple helper that lets multiple threads share one object per id. The first call to get()
	 * for an id constructs the object; later calls for the same id return the same pointer. The objects
	 * are owned by the allocator and are destroyed together with it.
	 *
	 * Every call to get() takes the allocator's mutex, so each thread should keep its own cache of the
	 * returned pointers:
	 *
	 * - thread A
	 * std::unordered_map<uint64_t, alloc_type *> local_cache;
	 * for (...) {
	 * 		if (!local_cache.count(id)) {
	 * 			local_cache[id] = alloc.get(id, ...); // alloc is the shared instance of id_allocator
	 * 		}
	 *
	 * 		local_cache[id] can be used now.
	 * }
	 *
	 * NOTE(review): the template parameter lists of the class and of get() are missing in this copy of
	 * the file. Presumably they read template<typename alloc_type> and template<typename... type_args>,
	 * with make_unique/forward/unordered_map instantiated accordingly; confirm against the repository.
	 * */
	template class id_allocator {
		public:

			/*
			 * Returns the "alloc_type" object associated with id, constructing it first if this is the
			 * first request for that id. The remaining arguments are forwarded to the constructor of
			 * alloc_type and are only used when the object is created. The whole call runs under m_lock.
			 * */
			template alloc_type *get(uint64_t id, type_args&&... args) {
				std::lock_guard guard(m_lock);
				if (m_map.count(id) == 0) {
					m_map[id] = std::make_unique(std::forward(args)...);
				}
				return m_map[id].get();
			}

		private:
			// Guards m_map.
			std::mutex m_lock;
			// id -> owned object.
			std::unordered_map> m_map;
	};
}
================================================ FILE: src/utils/thread_pool.cpp ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE.
*/ #include "thread_pool.hpp" #include #include #include #include #include using namespace std::chrono_literals; namespace utils { thread_pool::thread_pool(size_t num_threads, size_t max_queue_len) : m_max_queue_len(max_queue_len) { for (size_t i = 0; i < num_threads; i++) { m_workers.emplace_back([this]() { this->handle_work(); }); } } thread_pool::~thread_pool() { run_all(); } void thread_pool::enqueue(std::function &&fun) { if (m_stop) { throw std::runtime_error("enqueue on stopped thread_pool not allowed"); } if (m_max_queue_len > 0) { while (true) { { std::lock_guard lock(m_queue_lock); if (m_queue.size() < m_max_queue_len) { m_queue.emplace(std::move(fun)); break; } } std::this_thread::sleep_for(100ms); } } else { m_queue_lock.lock(); m_queue.emplace(std::move(fun)); m_queue_lock.unlock(); } m_condition.notify_one(); } void thread_pool::run_all() { if (m_stop) return; // Already stopped.. m_queue_lock.lock(); m_stop = true; m_queue_lock.unlock(); m_condition.notify_all(); for (std::thread &thread : m_workers) { if (thread.joinable()) { thread.join(); } } } void thread_pool::handle_work() { while (true) { std::function task; { std::unique_lock lock(m_queue_lock); m_condition.wait(lock, [this] { return m_stop || !m_queue.empty(); }); if (m_stop && m_queue.empty()) return; task = std::move(m_queue.front()); m_queue.pop(); } task(); } } } ================================================ FILE: src/utils/thread_pool.hpp ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #pragma once #include #include #include #include namespace utils { class thread_pool { public: explicit thread_pool(size_t num_workers, size_t max_queue_len = 0); ~thread_pool(); void enqueue(std::function &&fun); void run_all(); private: void handle_work(); std::vector m_workers; std::queue> m_queue; std::mutex m_queue_lock; std::condition_variable m_condition; bool m_stop = false; size_t m_max_queue_len; }; } ================================================ FILE: src/utils/thread_pool_arg.h ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #pragma once #include #include #include #include namespace utils { template class thread_pool_arg { public: explicit thread_pool_arg(size_t); ~thread_pool_arg(); void enqueue(std::function &&fun); void run_all(); private: void handle_work(); std::vector m_workers; std::queue> m_queue; std::mutex m_queue_lock; std::condition_variable m_condition; bool m_stop = false; }; template thread_pool_arg::thread_pool_arg(size_t num_threads) { for (size_t i = 0; i < num_threads; i++) { m_workers.emplace_back([this]() { this->handle_work(); }); } } template thread_pool_arg::~thread_pool_arg() { run_all(); } template void thread_pool_arg::enqueue(std::function &&fun) { if (m_stop) { throw std::runtime_error("enqueue on stopped thread_pool_arg not allowed"); } m_queue_lock.lock(); m_queue.emplace(std::move(fun)); m_queue_lock.unlock(); m_condition.notify_one(); } template void thread_pool_arg::run_all() { if (m_stop) return; // Already stopped.. m_queue_lock.lock(); m_stop = true; m_queue_lock.unlock(); m_condition.notify_all(); for (std::thread &thread : m_workers) { if (thread.joinable()) { thread.join(); } } } template void thread_pool_arg::handle_work() { arg a; while (true) { std::function task; { std::unique_lock lock(m_queue_lock); m_condition.wait(lock, [this] { return m_stop || !m_queue.empty(); }); if (m_stop && m_queue.empty()) return; task = std::move(m_queue.front()); m_queue.pop(); } task(a); } } } ================================================ FILE: src/warc/tlds.h ================================================ #pragma once #include #include namespace warc { const std::unordered_set double_tlds({ "co.uk" }); const std::unordered_set tlds({ "se", "com", "org", "net", "int", "edu", "gov", "mil", "ad", "as", "az", "bz", "cc", "cd", "co", "dj", "fm", "gg", "io", "la", "me", "ms", "nu", "sc", "tf", "tv", "ws", "ai", "as", "au", "bm", "bs", "gi", "gu", "uk", "us", "sh", "ca", "to", "ac", "academy", "accountant", "accountants", "active", "actor", "ads", "adult", 
"aero", "africa", "agency", "airforce", "amazon", "analytics", "apartments", "app", "apple", "archi", "army", "art", "associates", "attorney", "auction", "audible", "audio", "author", "auto", "autos", "aws", "baby", "band", "bank", "bar", "barefoot", "bargains", "baseball", "basketball", "beauty", "beer", "best", "bestbuy", "bet", "bible", "bid", "bike", "bingo", "bio", "biz", "black", "blackfriday", "blockbuster", "blog", "blue", "boo", "book", "boots", "boston", "bot", "boutique", "box", "broadway", "broker", "build", "builders", "business", "buy", "buzz", "cab", "cafe", "call", "cam", "camera", "camp", "cancerresearch", "capital", "car", "cards", "care", "career", "careers", "cars", "case", "cash", "casino", "catering", "catholic", "center", "cern", "ceo", "cfd", "channel", "chat", "charity", "cheap", "christmas", "church", "circle", "city", "claims", "cleaning", "click", "clinic", "clothing", "cloud", "club", "coach", "codes", "coffee", "college", "community", "company", "compare", "computer", "condos", "construction", "consulting", "contact", "contractors", "cooking", "cool", "coop", "country", "coupon", "coupons", "courses", "cpa", "credit", "creditcard", "cruise", "cricket", "cruises", "cyou", "dad", "dance", "data", "date", "dating", "day", "deal", "deals", "degree", "delivery", "democrat", "dental", "dentist", "design", "dev", "diamonds", "diet", "digital", "direct", "directory", "discount", "diy", "docs", "doctor", "dog", "domains", "dot", "download", "drive", "duck", "earth", "eat", "eco", "education", "email", "energy", "engineer", "engineering", "edeka", "entertainment", "enterprises", "equipment", "esq", "estate", "events", "exchange", "expert", "exposed", "express", "fail", "faith", "family", "fan", "fans", "farm", "fashion", "fast", "feedback", "fiat", "film", "final", "finance", "financial", "fire", "fish", "fishing", "fit", "fitness", "flights", "florist", "flowers", "fly", "foo", "food", "foodnetwork", "football", "forsale", "forum", 
"foundation", "free", "frontdoor", "fun", "fund", "furniture", "fyi", "gallery", "game", "games", "garden", "gay", "gdn", "gift", "gifts", "gives", "glass", "gle", "global", "gold", "golf", "google", "gop", "graphics", "green", "gripe", "grocery", "group", "guide", "guitars", "guru", "hair", "hangout", "health", "healthcare", "help", "here", "hiphop", "hiv", "hockey", "holdings", "holiday", "homegoods", "homes", "homesense", "horse", "hospital", "host", "hosting", "hot", "hotels", "house", "how", "ice", "icu", "inc", "industries", "info", "ing", "ink", "institute[50]", "insurance", "insure", "international", "investments", "irish", "jewelry", "jobs", "joy", "kim", "kitchen", "kosher", "land", "lat", "law", "lawyer", "lease", "leclerc", "legal", "lgbt", "life", "lifeinsurance", "lighting", "like", "limited", "limo", "link", "live", "living", "loan", "loans", "locker", "lol", "lotto", "love", "ltd", "luxury", "makeup", "management", "map", "market", "marketing", "markets", "mba", "med", "media", "meet", "meme", "memorial", "men", "menu", "mint", "mobi", "mobile", "mobily", "moe", "mom", "money", "monster", "mortgage", "motorcycles", "mov", "movie", "museum", "music", "name", "navy", "network", "new", "news", "ngo", "ninja", "now", "ntt", "observer", "off", "org", "one", "ong", "onl", "online", "ooo", "open", "organic", "origins", "page", "partners", "parts", "party", "pay", "pet", "pharmacy", "phone", "photo", "photography", "photos", "physio", "pics", "pictures", "pid", "pin", "pink", "pizza", "place", "plumbing", "plus", "poker", "porn", "post", "press", "prime", "pro", "productions", "prof", "promo", "properties", "property", "protection", "pub", "qpon", "racing", "radio", "read", "realestate", "realtor", "realty", "recipes", "red", "rehab", "reit", "rent", "rentals", "repair", "report", "republican", "rest", "restaurant", "review", "reviews", "rich", "rip", "rocks", "rodeo", "room", "rugby", "run", "safe", "sale", "salon", "save", "sbi", "scholarships", "school", 
"science", "search", "secure", "security", "select", "services", "sex", "sexy", "shoes", "shop", "shopping", "show", "showtime", "silk", "singles", "site", "ski", "skin", "sky", "sling", "smile", "sncf", "soccer", "social", "software", "solar", "solutions", "song", "space", "spreadbetting", "spot", "sport", "storage", "store", "stream", "studio", "study", "style", "sucks", "supplies", "supply", "support", "surf", "surgery", "systems", "talk", "tattoo", "tax", "taxi", "team", "tech", "technology", "tel", "tennis", "theater", "theatre", "tickets", "tips", "tires", "today", "tools", "top", "tours", "town", "toys", "trade", "trading", "training", "travel", "travelersinsurance", "trust", "tube", "tunes", "uconnect", "university", "uno", "vacations", "ventures", "vet", "video", "villas", "vin", "vip", "vision", "vodka", "volvo", "vote", "voting", "voyage", "wang", "watch", "watches", "weather", "webcam", "website", "wed", "wedding", "whoswho", "wiki", "win", "wine", "winners", "work", "works", "world", "wow", "wtf", "xxx", "xyz", "yachts", "yoga", "you", "youtube", "zero", "zip", "zone" }); } ================================================ FILE: src/warc/warc.cpp ================================================ #include "warc.h" #include "tlds.h" #include "text/text.h" #include "logger/logger.h" #include "transfer/transfer.h" using namespace std; namespace warc { parser::parser() { m_z_buffer_in = new char[WARC_PARSER_ZLIB_IN]; m_z_buffer_out = new char[WARC_PARSER_ZLIB_OUT]; } parser::~parser() { delete [] m_z_buffer_in; delete [] m_z_buffer_out; } bool parser::parse_stream(istream &stream) { return parse_stream(stream, [this](const std::string &url, const ::parser::html_parser &html, const std::string &ip, const std::string &date) { handle_html(url, html, ip, date); }); } bool parser::parse_stream(std::istream &stream, std::function callback) { m_callback = callback; size_t total_bytes_read = 0; while (stream.good()) { stream.read(m_z_buffer_in, WARC_PARSER_ZLIB_IN); 
auto bytes_read = stream.gcount(); total_bytes_read += bytes_read; if (bytes_read > 0) { if (unzip_chunk(bytes_read) < 0) { cout << "Stopped because fatal error" << endl; break; } } } return true; } void parser::handle_html(const std::string &url, const ::parser::html_parser &html, const std::string &ip, const std::string &date) { m_result += (url + '\t' + html.title() + '\t' + html.h1() + '\t' + html.meta() + '\t' + html.text() + '\t' + date + '\t' + ip + '\n'); for (const auto &link : html.links()) { m_links += (link.host() + '\t' + link.path() + '\t' + link.target_host() + '\t' + link.target_path() + '\t' + link.text() + '\t' + (link.nofollow() ? "1" : "0") + '\n'); } // internal links are too messy for us now. /*for (const auto &link : html.internal_links()) { // link is a std::pair m_internal_links.append((char *)&link.first, sizeof(uint64_t)); m_internal_links.append((char *)&link.second, sizeof(uint64_t)); }*/ } int parser::unzip_record(char *data, int size) { /* data is: #|------------------|-----|------------------------|--|----#-------| |doc_a______________doc_b_doc_c_____| WARC_PARSER_ZLIB_IN |_________________________________________________________| size */ int data_size = size; int consumed = 0, consumed_total = 0; int avail_in_before_inflate; int ret = Z_OK; unsigned have; if (!m_continue_inflate) { m_zstream.zalloc = Z_NULL; m_zstream.zfree = Z_NULL; m_zstream.opaque = Z_NULL; m_zstream.avail_in = 0; m_zstream.next_in = Z_NULL; int err = inflateInit2(&m_zstream, 16); if (err != Z_OK) { cout << "zlib error" << endl; } } else { // just continue on the last one. 
} /* decompress until deflate stream ends or end of file */ do { m_zstream.next_in = (unsigned char *)(data + consumed_total); m_zstream.avail_in = min(WARC_PARSER_ZLIB_IN, data_size); if (m_zstream.avail_in == 0) break; /* run inflate() on input until output buffer not full */ do { m_zstream.avail_out = WARC_PARSER_ZLIB_OUT; m_zstream.next_out = (unsigned char *)m_z_buffer_out; avail_in_before_inflate = m_zstream.avail_in; ret = inflate(&m_zstream, Z_NO_FLUSH); // consumed is the number of bytes read from input in this inflate consumed = (avail_in_before_inflate - m_zstream.avail_in); data_size -= consumed; consumed_total += consumed; assert(ret != Z_STREAM_ERROR); /* state not clobbered */ switch (ret) { case Z_BUF_ERROR: //cout << "Z_BUF_ERROR" << endl; // Not fatal, just keep going. break; case Z_NEED_DICT: ret = Z_DATA_ERROR; /* and fall through */ cout << "Z_MEM_ERROR" << endl; (void)inflateEnd(&m_zstream); return -1; case Z_DATA_ERROR: case Z_MEM_ERROR: cout << "Z_MEM_ERROR" << endl; (void)inflateEnd(&m_zstream); return -1; } have = WARC_PARSER_ZLIB_OUT - m_zstream.avail_out; handle_record_chunk((char *)m_z_buffer_out, have); } while (m_zstream.avail_out == 0); if (data_size <= 0) { break; } /* done when inflate() says it's done */ } while (ret != Z_STREAM_END); //cout << "ret: " << ret << endl; //cout << "Ending with code: " << ret << endl; if (ret == Z_OK || ret == Z_BUF_ERROR) { m_continue_inflate = true; } else { m_continue_inflate = false; (void)inflateEnd(&m_zstream); } /* clean up and return */ return consumed_total; } int parser::unzip_chunk(int bytes_in) { int consumed = 0; int consumed_total = 0; char *ptr = m_z_buffer_in; int len = bytes_in; while (len > 0) { consumed = unzip_record(ptr, len); //cout << "consumed: " << consumed << " len: " << len << endl; if (consumed == 0) { cout << "Nothing consumed, done..." 
<< endl; break; } if (consumed < 0) { cout << "Encountered fatal error" << endl; return -1; } ptr += consumed; len -= consumed; consumed_total += consumed; } return 0; } /* * Handles unzipped data. The data pointer is either pointing to a new warc record or it is the continuation of a previous warc record. * */ void parser::handle_record_chunk(char *data, int len) { m_handled += len; m_num_handled++; if (len > 8 && strncmp(data, "WARC/1.0", 8) == 0) { // data is the start of a warc record string record(data, len); m_current_record.assign(data, len); } else { m_current_record.append(data, len); } if (m_current_record.find("\r\n\r\n") != string::npos) { const string warc_header = get_warc_header(m_current_record); const string content_len_str = ::parser::get_http_header(warc_header, "Content-Length: "); size_t content_len = stoull(content_len_str); size_t received_content = m_current_record.size() - (warc_header.size() + 8); if (content_len == received_content) { const string type = ::parser::get_http_header(warc_header, "WARC-Type: "); if (type == "response") { parse_record(warc_header, m_current_record); } } } } void parser::parse_record(const string &warc_header, const string &warc_record) { const string url = ::parser::get_http_header(warc_header, "WARC-Target-URI: "); const string tld = m_html_parser.url_tld(url); if (tlds.count(tld) == 0) return; const string ip = ::parser::get_http_header(warc_header, "WARC-IP-Address: "); const string date = ::parser::get_http_header(warc_header, "WARC-Date: "); const size_t warc_response_start = warc_record.find("\r\n\r\n"); const size_t response_body_start = warc_record.find("\r\n\r\n", warc_response_start + 4); string http_header = warc_record.substr(warc_response_start + 4, response_body_start - warc_response_start - 4); text::lower_case(http_header); //const size_t http_code = http_response_code(http_header); //const string location = ::parser::get_http_header(warc_header, "location: "); string html = 
warc_record.substr(response_body_start + 4); m_html_parser.parse(html, url); if (m_html_parser.should_insert()) { m_callback(url, m_html_parser, ip, date); } } string parser::get_warc_header(const string &record) { const size_t pos = record.find("\r\n\r\n"); return record.substr(0, pos); } size_t parser::http_response_code(const string &http_header) { const size_t return_on_invalid = 500; const size_t code_start = http_header.find(' '); const size_t code_end = http_header.find(' ', code_start); if (code_start == string::npos || code_end == string::npos) return return_on_invalid; size_t response_code = stoull(http_header.substr(code_start + 1, 3)); if (response_code < 100 || response_code >= 600) return return_on_invalid; return response_code; } void multipart_download(const string &url, const std::function &callback) { int error; size_t content_len = transfer::head_content_length(url, error); if (error == transfer::ERROR) { throw std::runtime_error("Could not make HEAD request to: " + url); } const size_t max_parts = 50; const size_t max_retries = 3; size_t part = 1; size_t read_bytes = 0; while (read_bytes < content_len && part < max_parts) { size_t retry = 0; while (retry < max_retries) { string buffer; transfer::url_to_string(url + "?partNumber=" + to_string(part), buffer, error); if (error == transfer::OK) { read_bytes += buffer.size(); callback(buffer); break; } else { throw std::runtime_error("Got error response"); } retry++; } if (retry == max_retries) { break; } part++; } } string get_result_path(const string &warc_path) { string path = warc_path; path.replace(path.find(".warc.gz"), 8, string(".gz")); return path; } string get_link_result_path(const string &warc_path) { string path = warc_path; path.replace(path.find(".warc.gz"), 8, string(".links.gz")); return path; } string get_internal_link_result_path(const string &warc_path) { string path = warc_path; path.replace(path.find(".warc.gz"), 8, string(".internal.gz")); return path; } } 
================================================ FILE: src/warc/warc.h ================================================ #pragma once #include #include "parser/html_parser.h" #include "parser/parser.h" #include "zlib.h" #define WARC_PARSER_ZLIB_IN 1024*1024*16 #define WARC_PARSER_ZLIB_OUT 1024*1024*16 namespace warc { using std::string; class parser { public: parser(); ~parser(); bool parse_stream(std::istream &stream); bool parse_stream(std::istream &stream, std::function); const string &result() const { return m_result; }; const string &link_result() const { return m_links; }; const string &internal_link_result() const { return m_internal_links; }; void handle_html(const std::string &url, const ::parser::html_parser &html, const std::string &ip, const std::string &date); private: int m_cur_offset = 0; bool m_continue_inflate = false; std::string m_result; std::string m_links; std::string m_internal_links; ::parser::html_parser m_html_parser; std::function m_callback; char *m_z_buffer_in; char *m_z_buffer_out; z_stream m_zstream; /* decompression stream */ size_t m_handled = 0; size_t m_num_handled = 0; string m_current_record; int unzip_record(char *data, int size); int unzip_chunk(int bytes_in); void handle_record_chunk(char *data, int len); void parse_record(const std::string &warc_header, const std::string &warc_record); std::string get_warc_header(const std::string &record); size_t http_response_code(const string &http_header); }; void multipart_download(const string &url, const std::function &callback); string get_result_path(const string &warc_path); string get_link_result_path(const string &warc_path); string get_internal_link_result_path(const string &warc_path); } ================================================ FILE: tests/main.cpp ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. 
*
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

/*
 * Entry point of the unit test binary. BOOST_TEST_NO_MAIN is defined so that the main() below can set up configuration and
 * logging before the Boost.Test runner starts.
 *
 * NOTE(review): the header names of the bare #include directives below are missing in this copy of the file.
 * */
#define BOOST_TEST_MODULE "Unit tests for alexandria.org"
#define BOOST_TEST_NO_MAIN
#define BOOST_TEST_DYN_LINK

#include
#include
#include "config.h"
#include "logger/logger.h"
#include
#include
#include
#include
#include
#include
#include
#include

using std::string;
using std::vector;
using std::ifstream;
using std::stringstream;
using std::set;
using std::map;
using std::pair;

/*
 * Runs once before all tests: loads the test configuration (path is relative to the working directory) and starts the logger
 * thread.
 * */
void run_before() {
	config::read_config("../tests/test_config.conf");
	logger::start_logger_thread();
}

/*
 * Runs once after all tests: joins the logger thread started in run_before.
 * */
void run_after() {
	logger::join_logger_thread();
}

/*
 * Wraps the Boost.Test runner between run_before and run_after and returns the runner's exit code.
 * */
int BOOST_TEST_CALL_DECL main(int argc, char* argv[]) {
	run_before();

	int ret = ::boost::unit_test::unit_test_main(&init_unit_test, argc, argv);

	run_after();

	return ret;
}

================================================
FILE: tests/test_algorithm.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, , et al.
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/

// NOTE(review): the header name of the bare #include below and the template arguments of the containers in this file
// (vector, set, pair) are missing in this copy of the file.
#include
#include "algorithm/algorithm.h"
#include "algorithm/intersection.h"
#include "algorithm/hyper_ball.h"

using namespace std;

BOOST_AUTO_TEST_SUITE(test_algorithm)

/*
 * algorithm::intersection on lists of sorted inputs, including the empty input, inputs without common elements and a custom
 * element type with a merge callback.
 * */
BOOST_AUTO_TEST_CASE(intersection_test) {
	{
		// Common elements of all three inputs are 2 and 3.
		const vector result = algorithm::intersection({
			{1, 2, 3},
			{2, 3},
			{2, 3, 4}
		});

		BOOST_CHECK_EQUAL(2, result.size());
		BOOST_CHECK_EQUAL(2, result[0]);
		BOOST_CHECK_EQUAL(3, result[1]);
	}

	{
		const vector result = algorithm::intersection({
			{1, 2, 3, 5},
			{2, 3, 5, 7},
			{2, 3, 4, 5}
		});

		BOOST_CHECK_EQUAL(3, result.size());
		BOOST_CHECK_EQUAL(2, result[0]);
		BOOST_CHECK_EQUAL(3, result[1]);
		BOOST_CHECK_EQUAL(5, result[2]);
	}

	{
		// No inputs at all gives an empty result.
		const vector result = algorithm::intersection({});

		BOOST_CHECK_EQUAL(0, result.size());
	}

	{
		// The second input shares nothing with the others.
		const vector result = algorithm::intersection({
			{1, 2, 3, 5, 6, 7, 8},
			{9, 10},
			{1, 2, 3, 4, 5}
		});

		BOOST_CHECK_EQUAL(0, result.size());
	}

	{
		// Element type that is compared on m_v only, m_s is a payload that the callback accumulates.
		class T {
		public:
			size_t m_v;
			float m_s;
			T(size_t v, float s) : m_v(v), m_s(s) {}
			bool operator<(const T &other) const {
				return m_v < other.m_v;
			}
			bool operator==(const T &other) const {
				return m_v == other.m_v;
			}
		};

		// Only m_v == 4 is present in all inputs, the expected m_s is the sum 1 + 2 + 3.
		const vector result = algorithm::intersection({
			{T(1, 1.0f), T(2, 1.0f), T(3, 1.0f), T(4, 1.0f)},
			{T(3, 2.0f), T(4, 2.0f), T(5, 2.0f)},
			{T(4, 3.0f), T(5, 3.0f), T(6, 3.0f), T(7, 3.0f), T(8, 3.0f)}
		}, [](T &a, const T &b) {
			return a.m_s += b.m_s;
		});

		BOOST_CHECK_EQUAL(1, result.size());
		BOOST_CHECK_EQUAL(result[0].m_v, 4);
		BOOST_CHECK_EQUAL(result[0].m_s, 6.0f);
	}
}

/*
 * algorithm::incremental_partitions: pins the number of returned index tuples and their exact order for a few dimension lists.
 * The second argument caps the number of results (see the {3, 3}, 5 case).
 * */
BOOST_AUTO_TEST_CASE(incremental_partitions) {
	{
		vector> res = algorithm::incremental_partitions({5}, 64);
		BOOST_CHECK_EQUAL(res.size(), 5);
	}

	{
		vector> res = algorithm::incremental_partitions({6}, 64);
		BOOST_CHECK_EQUAL(res.size(), 6);
	}

	{
		vector> res = algorithm::incremental_partitions({3}, 64);
		BOOST_CHECK_EQUAL(res.size(), 3);
		BOOST_CHECK(res[0] == vector{0});
		BOOST_CHECK(res[1] == vector{1});
		BOOST_CHECK(res[2] == vector{2});
	}

	{
		vector> res = algorithm::incremental_partitions({2, 2}, 64);
		BOOST_CHECK_EQUAL(res.size(), 4);
		BOOST_CHECK((res[0] == vector{0, 0}));
		BOOST_CHECK((res[1] == vector{1, 0}));
		BOOST_CHECK((res[2] == vector{0, 1}));
		BOOST_CHECK((res[3] == vector{1, 1}));
	}

	{
		vector> res = algorithm::incremental_partitions({3, 3}, 64);
		BOOST_CHECK_EQUAL(res.size(), 9);
		BOOST_CHECK((res[0] == vector{0, 0}));
		BOOST_CHECK((res[1] == vector{1, 0}));
		BOOST_CHECK((res[2] == vector{0, 1}));
		BOOST_CHECK((res[3] == vector{1, 1}));
		BOOST_CHECK((res[4] == vector{2, 0}));
		BOOST_CHECK((res[5] == vector{0, 2}));
		BOOST_CHECK((res[6] == vector{2, 1}));
		BOOST_CHECK((res[7] == vector{1, 2}));
		BOOST_CHECK((res[8] == vector{2, 2}));
	}

	{
		// Same dimensions as above but limited to 5 results: the first 5 tuples of the sequence above are expected.
		vector> res = algorithm::incremental_partitions({3, 3}, 5);
		BOOST_CHECK_EQUAL(res.size(), 5);
		BOOST_CHECK((res[0] == vector{0, 0}));
		BOOST_CHECK((res[1] == vector{1, 0}));
		BOOST_CHECK((res[2] == vector{0, 1}));
		BOOST_CHECK((res[3] == vector{1, 1}));
		BOOST_CHECK((res[4] == vector{2, 0}));
	}

	{
		// Only the first 16 of the 27 tuples are pinned.
		vector> res = algorithm::incremental_partitions({3, 3, 3}, 64);
		BOOST_CHECK_EQUAL(res.size(), 27);
		BOOST_CHECK((res[0] == vector{0, 0, 0}));
		BOOST_CHECK((res[1] == vector{1, 0, 0}));
		BOOST_CHECK((res[2] == vector{0, 1, 0}));
		BOOST_CHECK((res[3] == vector{0, 0, 1}));
		BOOST_CHECK((res[4] == vector{1, 1, 0}));
		BOOST_CHECK((res[5] == vector{1, 0, 1}));
		BOOST_CHECK((res[6] == vector{0, 1, 1}));
		BOOST_CHECK((res[7] == vector{2, 0, 0}));
		BOOST_CHECK((res[8] == vector{0, 2, 0}));
		BOOST_CHECK((res[9] == vector{0, 0, 2}));
		BOOST_CHECK((res[10] == vector{1, 1, 1}));
		BOOST_CHECK((res[11] == vector{2, 1, 0}));
		BOOST_CHECK((res[12] == vector{2, 0, 1}));
		BOOST_CHECK((res[13] == vector{1, 2, 0}));
		BOOST_CHECK((res[14] == vector{1, 0, 2}));
		BOOST_CHECK((res[15] == vector{0, 2, 1}));
	}

	{
		// Dimensions of different size.
		vector> res = algorithm::incremental_partitions({2, 3}, 64);
		BOOST_CHECK_EQUAL(res.size(), 6);
		BOOST_CHECK((res[0] == vector{0, 0}));
		BOOST_CHECK((res[1] == vector{1, 0}));
		BOOST_CHECK((res[2] == vector{0, 1}));
		BOOST_CHECK((res[3] == vector{1, 1}));
		BOOST_CHECK((res[4] == vector{0, 2}));
		BOOST_CHECK((res[5] == vector{1, 2}));
	}
}

/*
 * algorithm::harmonic_centrality on three small graphs given as sets of pairs. The first argument matches the size of the
 * returned vector in every case.
 * */
BOOST_AUTO_TEST_CASE(harmonic_centrality) {
	{
		// Cycle over three vertices, all vertices get the same value.
		set> e = {std::make_pair(0, 1), std::make_pair(1, 2), std::make_pair(2, 0)};
		vector h = algorithm::harmonic_centrality(3, e, 6);

		BOOST_CHECK(h.size() == 3);
		BOOST_CHECK((h == vector{1.5, 1.5, 1.5}));
	}

	{
		// Vertex 6 does not occur in any pair and is expected to get 0.
		set> e = {
			std::make_pair(0, 1),
			std::make_pair(1, 2),
			std::make_pair(2, 0),
			std::make_pair(2, 3),
			std::make_pair(3, 4),
			std::make_pair(3, 5),
			std::make_pair(4, 2),
			std::make_pair(5, 4),
		};
		vector h = algorithm::harmonic_centrality(7, e, 6);

		BOOST_CHECK(h.size() == 7);
		BOOST_CHECK_CLOSE(h[0], 8.0/3.0, 0.000001);
		BOOST_CHECK_CLOSE(h[1], 7.0/3.0, 0.000001);
		BOOST_CHECK_CLOSE(h[2], 7.0/2.0, 0.000001);
		BOOST_CHECK_EQUAL(h[6], 0.0);
	}

	{
		// Every other vertex is paired with vertex 1.
		set> e = {
			std::make_pair(0, 1),
			std::make_pair(1, 2),
			std::make_pair(2, 1),
			std::make_pair(3, 1),
			std::make_pair(4, 1),
			std::make_pair(5, 1),
			std::make_pair(6, 1),
			std::make_pair(7, 1),
		};
		vector h = algorithm::harmonic_centrality(8, e, 6);

		BOOST_CHECK(h.size() == 8);
		BOOST_CHECK_CLOSE(h[1], 7, 0.000001);
	}
}

/*
 * Same graphs and expectations as harmonic_centrality, run through algorithm::harmonic_centrality_threaded with 3, 2 and 1 as
 * the additional last argument.
 * */
BOOST_AUTO_TEST_CASE(harmonic_centrality_threaded) {
	{
		set> e = {std::make_pair(0, 1), std::make_pair(1, 2), std::make_pair(2, 0)};
		vector h = algorithm::harmonic_centrality_threaded(3, e, 6, 3);

		BOOST_CHECK(h.size() == 3);
		BOOST_CHECK((h == vector{1.5, 1.5, 1.5}));
	}

	{
		set> e = {
			std::make_pair(0, 1),
			std::make_pair(1, 2),
			std::make_pair(2, 0),
			std::make_pair(2, 3),
			std::make_pair(3, 4),
			std::make_pair(3, 5),
			std::make_pair(4, 2),
			std::make_pair(5, 4),
		};
		vector h = algorithm::harmonic_centrality_threaded(7, e, 6, 2);

		BOOST_CHECK(h.size() == 7);
		BOOST_CHECK_CLOSE(h[0], 8.0/3.0, 0.000001);
		BOOST_CHECK_CLOSE(h[1], 7.0/3.0, 0.000001);
		BOOST_CHECK_CLOSE(h[2], 7.0/2.0, 0.000001);
		BOOST_CHECK_EQUAL(h[6], 0.0);
	}

	{
		set> e = {
			std::make_pair(0, 1),
			std::make_pair(1, 2),
			std::make_pair(2, 1),
			std::make_pair(3, 1),
			std::make_pair(4, 1),
			std::make_pair(5, 1),
			std::make_pair(6, 1),
			std::make_pair(7, 1),
		};
		vector h = algorithm::harmonic_centrality_threaded(8, e, 6, 1);

		BOOST_CHECK(h.size() == 8);
		BOOST_CHECK_CLOSE(h[1], 7, 0.000001);
	}
}

BOOST_AUTO_TEST_SUITE_END()

================================================
FILE: tests/test_bloom_filter.cpp
================================================
/*
 * MIT License
 *
 * Alexandria.org
 *
 * Copyright (c) 2021 Josef Cullhed, , et al.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
*/ #include #include #include "algorithm/bloom_filter.h" #include "algorithm/hash.h" using namespace std; BOOST_AUTO_TEST_SUITE(test_bloom_filter) BOOST_AUTO_TEST_CASE(test_bloom_filter) { algorithm::bloom_filter bf; bf.insert("test"); BOOST_CHECK(bf.exists("test")); BOOST_CHECK(!bf.exists("test2")); bf.insert("test2"); BOOST_CHECK(bf.exists("test2")); } BOOST_AUTO_TEST_CASE(test_bloom_filter_merge) { algorithm::bloom_filter bf1; bf1.insert("test1"); bf1.insert("test2"); algorithm::bloom_filter bf2; bf2.insert("test3"); bf2.insert("test4"); bf1.merge(bf2); BOOST_CHECK(bf1.exists("test1")); BOOST_CHECK(bf1.exists("test2")); BOOST_CHECK(bf1.exists("test3")); BOOST_CHECK(bf1.exists("test4")); BOOST_CHECK(!bf1.exists("test0")); BOOST_CHECK(!bf1.exists("test5")); BOOST_CHECK(!bf1.exists("random")); BOOST_CHECK(!bf1.exists("random2")); } BOOST_AUTO_TEST_CASE(test_bloom_filter_save) { { algorithm::bloom_filter bf; bf.insert("test1"); bf.insert("test2"); bf.write_file("/tmp/bloom"); } { algorithm::bloom_filter bf; bf.read_file("/tmp/bloom"); BOOST_CHECK(bf.exists("test1")); BOOST_CHECK(bf.exists("test2")); BOOST_CHECK(!bf.exists("test3")); } } BOOST_AUTO_TEST_SUITE_END() ================================================ FILE: tests/test_cc_parser.cpp ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. 
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include #include "config.h" #include "warc/warc.h" #include "URL.h" #include "parser/cc_parser.h" using namespace std; BOOST_AUTO_TEST_SUITE(cc_parser) BOOST_AUTO_TEST_CASE(download_warc_paths) { { vector paths = parser::download_warc_paths(); BOOST_CHECK_EQUAL(paths.size(), 0); paths.push_back("test_path/testing1"); paths.push_back("test_path/testing2"); BOOST_CHECK(parser::upload_warc_paths(paths)); } { vector paths = parser::download_warc_paths(); BOOST_CHECK_EQUAL(paths.size(), 2); BOOST_CHECK_EQUAL(paths[0], "test_path/testing1"); BOOST_CHECK_EQUAL(paths[1], "test_path/testing2"); } BOOST_CHECK(parser::upload_warc_paths({})); } BOOST_AUTO_TEST_CASE(download_warc) { // This amazon bucket is gone /* string buffer; warc::multipart_download("http://alexandria-test-data.s3.amazonaws.com/multipart_test", [&buffer](const string &data) { buffer.append(data); }); BOOST_CHECK_EQUAL(buffer.size(), 15728640); BOOST_CHECK_EQUAL(algorithm::hash(buffer), 1803966798292769636ull); */ } BOOST_AUTO_TEST_CASE(parse_cc_batch) { ifstream infile(config::test_data_path + "bokus_test.warc.gz", std::ios::binary); warc::parser pp; pp.parse_stream(infile); { stringstream ss(pp.result()); string line; bool found_url = false; while (getline(ss, line)) { vector cols; boost::algorithm::split(cols, line, boost::is_any_of("\t")); if (cols[0] == "https://www.bokus.com/recension/670934") { BOOST_CHECK(cols[1].substr(0, 26) == "Mycket intressant läsning"); BOOST_CHECK(cols[2].substr(0, 25) == "Recension av Lena Klippvi"); 
BOOST_CHECK(cols[3].substr(0, 25) == "Mycket intressant läsnin"); BOOST_CHECK(cols[4].substr(0, 120) == "Recenserad produkt Los Angeles's Original Farmers Market Häftad (Trade Paper) Mycket intressant läsning om hur Farmers"); BOOST_CHECK(cols[5] == "2021-07-31T20:08:45Z"); BOOST_CHECK(cols[6] == "213.187.205.190"); found_url = true; } } BOOST_CHECK(found_url); } { stringstream ss(pp.link_result()); string line; int links_found = 0; while (getline(ss, line)) { vector cols; boost::algorithm::split(cols, line, boost::is_any_of("\t")); if (links_found == 0) { BOOST_CHECK(cols[0] == "bokus.com"); BOOST_CHECK(cols[1] == "/recension/670934"); BOOST_CHECK(cols[2] == "help.bokus.com"); BOOST_CHECK(cols[3] == "/"); BOOST_CHECK(cols[4] == "Vanliga frågor & svar"); } links_found++; } BOOST_CHECK_EQUAL(links_found, 8); } /*{ const char *internal_links = pp.internal_link_result().c_str(); { const uint64_t hash1 = *((uint64_t *)&internal_links[0]); const uint64_t hash2 = *((uint64_t *)&internal_links[8]); BOOST_CHECK_EQUAL(hash1, URL("https://www.bokus.com/recension/670934").hash()); BOOST_CHECK_EQUAL(hash2, URL("https://www.bokus.com/cgi-bin/logout_user_info.cgi").hash()); } { const uint64_t hash1 = *((uint64_t *)&internal_links[16]); const uint64_t hash2 = *((uint64_t *)&internal_links[24]); BOOST_CHECK_EQUAL(hash1, URL("https://www.bokus.com/recension/670934").hash()); BOOST_CHECK_EQUAL(hash2, URL("https://www.bokus.com/cgi-bin/log_in_real.cgi").hash()); } }*/ } BOOST_AUTO_TEST_CASE(parse_cc_batch_multistream) { string response; { warc::parser pp; ifstream infile(config::test_data_path + "warc_test.gz", std::ios::binary); pp.parse_stream(infile); response = pp.result(); } vector files = { config::test_data_path + "warc_test.gz.aa", config::test_data_path + "warc_test.gz.ab", config::test_data_path + "warc_test.gz.ac", config::test_data_path + "warc_test.gz.ad", config::test_data_path + "warc_test.gz.ae", config::test_data_path + "warc_test.gz.af", config::test_data_path + 
"warc_test.gz.ag", config::test_data_path + "warc_test.gz.ah", config::test_data_path + "warc_test.gz.ai", config::test_data_path + "warc_test.gz.aj" }; warc::parser pp; for (const string &filename : files) { ifstream infile(filename, std::ios::binary); pp.parse_stream(infile); } BOOST_CHECK_EQUAL(pp.result().size(), response.size()); } BOOST_AUTO_TEST_CASE(parse_cc_batch_301) { } BOOST_AUTO_TEST_SUITE_END() ================================================ FILE: tests/test_config.conf ================================================ # Cluster config nodes_in_cluster = 3 node_id = 0 url_store_host = "http://localhost"; data_path = . index_snippets = 1 # Indexer config batches[] = ALEXANDRIA-MANUAL-01 batches[] = CC-MAIN-2021-25 batches[] = CC-MAIN-2021-31 link_batches[] = CC-MAIN-2021-31 link_batches[] = CC-MAIN-2021-25 link_batches[] = CC-MAIN-2021-21 link_batches[] = CC-MAIN-2021-17 link_batches[] = CC-MAIN-2021-10 link_batches[] = CC-MAIN-2021-04 link_batches[] = CC-MAIN-2020-50 link_batches[] = CC-MAIN-2020-45 # Server config worker_count = 8 query_max_words = 10 # Maximum number of words used in query. 
query_max_len = 200 deduplicate_domain_count = 5 pre_result_limit = 200000 result_limit = 1000 # Full text config ft_max_sections = 4 ft_max_results_per_section = 2000000 n_grams = 1 shard_hash_table_size = 100000 ================================================ FILE: tests/test_config2.conf ================================================ # Cluster config nodes_in_cluster = 8; node_id = 1; index_snippets = 0 # Indexer config batches[] = ALEXANDRIA-MANUAL-02 batches[] = CC-MAIN-2021-20 batches[] = CC-MAIN-2021-30 link_batches[] = CC-MAIN-2021-30 link_batches[] = CC-MAIN-2021-20 link_batches[] = CC-MAIN-2021-20 link_batches[] = CC-MAIN-2021-10 link_batches[] = CC-MAIN-2021-11 link_batches[] = CC-MAIN-2021-00 link_batches[] = CC-MAIN-2020-51 link_batches[] = CC-MAIN-2020-40 # Server config worker_count = 9 query_max_words = 100 # Maximum number of words used in query. query_max_len = 0 deduplicate_domain_count = 5000 pre_result_limit = 2 result_limit = 10 # Full text config ft_max_sections = 2 ft_max_results_per_section = 20 n_grams = 5 shard_hash_table_size = 100000 ================================================ FILE: tests/test_configuration.cpp ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. 
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include #include "config.h" using namespace std; BOOST_AUTO_TEST_SUITE(test_config) BOOST_AUTO_TEST_CASE(read_config) { config::read_config("../tests/test_config.conf"); BOOST_CHECK_EQUAL(config::nodes_in_cluster, 3); BOOST_CHECK_EQUAL(config::node_id, 0); vector batches{"ALEXANDRIA-MANUAL-01", "CC-MAIN-2021-25", "CC-MAIN-2021-31"}; BOOST_CHECK(config::batches == batches); vector link_batches{ "CC-MAIN-2021-31", "CC-MAIN-2021-25", "CC-MAIN-2021-21", "CC-MAIN-2021-17", "CC-MAIN-2021-10", "CC-MAIN-2021-04", "CC-MAIN-2020-50", "CC-MAIN-2020-45" }; BOOST_CHECK(config::link_batches == link_batches); BOOST_CHECK_EQUAL(config::worker_count, 8); BOOST_CHECK_EQUAL(config::query_max_words, 10); BOOST_CHECK_EQUAL(config::query_max_len, 200); BOOST_CHECK_EQUAL(config::deduplicate_domain_count, 5); BOOST_CHECK_EQUAL(config::pre_result_limit, 200000); BOOST_CHECK_EQUAL(config::result_limit, 1000); BOOST_CHECK_EQUAL(config::ft_max_sections, 4); BOOST_CHECK_EQUAL(config::ft_max_results_per_section, 2000000); config::read_config("../tests/test_config2.conf"); BOOST_CHECK_EQUAL(config::nodes_in_cluster, 8); BOOST_CHECK_EQUAL(config::node_id, 1); vector batches2{"ALEXANDRIA-MANUAL-02", "CC-MAIN-2021-20", "CC-MAIN-2021-30"}; BOOST_CHECK(config::batches == batches2); vector link_batches2{ "CC-MAIN-2021-30", "CC-MAIN-2021-20", "CC-MAIN-2021-20", "CC-MAIN-2021-10", "CC-MAIN-2021-11", "CC-MAIN-2021-00", "CC-MAIN-2020-51", "CC-MAIN-2020-40" }; BOOST_CHECK(config::link_batches == link_batches2); 
BOOST_CHECK_EQUAL(config::worker_count, 9); BOOST_CHECK_EQUAL(config::query_max_words, 100); BOOST_CHECK_EQUAL(config::query_max_len, 0); BOOST_CHECK_EQUAL(config::deduplicate_domain_count, 5000); BOOST_CHECK_EQUAL(config::pre_result_limit, 2); BOOST_CHECK_EQUAL(config::result_limit, 10); BOOST_CHECK_EQUAL(config::ft_max_sections, 2); BOOST_CHECK_EQUAL(config::ft_max_results_per_section, 20); BOOST_CHECK_EQUAL(config::n_grams, 5); BOOST_CHECK_EQUAL(config::index_snippets, false); config::read_config("../tests/test_config.conf"); } BOOST_AUTO_TEST_SUITE_END() ================================================ FILE: tests/test_counted_index_builder.cpp ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/
// NOTE(review): the header name on the bare #include line was lost in this dump; restore it before compiling.
#include
#include "indexer/basic_index_builder.h"
#include "indexer/basic_index.h"
#include "indexer/counted_record.h"
#include "indexer/sharded_builder.h"
#include "indexer/sharded.h"

using namespace indexer;

// NOTE(review): template arguments on basic_index_builder, basic_index, sharded_builder, sharded and
// std::vector appear to have been stripped in this dump; the code tokens are kept exactly as found.
BOOST_AUTO_TEST_SUITE(test_basic_index_builder)

/*
 * One counted_record (value 1000) added under key 101 is found again with m_count == 1.
 * */
BOOST_AUTO_TEST_CASE(test_case_1) {
	{
		// Build phase: truncate, add, then append() and merge() before the builder goes out of scope.
		basic_index_builder idx("test_index", 0);
		idx.truncate();
		idx.add(101, counted_record(1000, 1.0f));
		idx.append();
		idx.merge();
	}
	{
		// Read phase: open the same index ("test_index", 0) and look up key 101.
		basic_index idx("test_index", 0);
		std::vector res = idx.find(101);
		// REQUIRE so that res[0] below is never read from an empty result.
		BOOST_REQUIRE(res.size() == 1);
		BOOST_CHECK(res[0].m_value == 1000);
		BOOST_CHECK(res[0].m_count == 1);
	}
}

/*
 * The same record value (1000) added twice under key 101 comes back as a single
 * record with m_count == 2.
 * */
BOOST_AUTO_TEST_CASE(test_case_2) {
	{
		basic_index_builder idx("test_index", 0);
		idx.truncate();
		idx.add(101, counted_record(1000));
		idx.add(101, counted_record(1000));
		idx.append();
		idx.merge();
	}
	{
		basic_index idx("test_index", 0);
		std::vector res = idx.find(101);
		BOOST_REQUIRE(res.size() == 1);
		BOOST_CHECK(res[0].m_value == 1000);
		BOOST_CHECK(res[0].m_count == 2);
	}
}

/*
 * Values 1000, 1001, 1000 added under key 101 come back as two records:
 * 1000 with m_count == 2 first, then 1001 with m_count == 1.
 * */
BOOST_AUTO_TEST_CASE(test_case_3) {
	{
		basic_index_builder idx("test_index", 0);
		idx.truncate();
		idx.add(101, counted_record(1000));
		idx.add(101, counted_record(1001));
		idx.add(101, counted_record(1000));
		idx.append();
		idx.merge();
	}
	{
		basic_index idx("test_index", 0);
		std::vector res = idx.find(101);
		BOOST_REQUIRE(res.size() == 2);
		BOOST_CHECK(res[0].m_value == 1000);
		BOOST_CHECK(res[0].m_count == 2);
		BOOST_CHECK(res[1].m_value == 1001);
		BOOST_CHECK(res[1].m_count == 1);
	}
}

/*
 * Same scenario as test_case_3 but through sharded_builder / sharded, with one extra record
 * (value 1002) under key 102. document_count() is expected to be 3 after the merge.
 * NOTE(review): the second constructor argument (10) is presumably the number of shards, and
 * document_count() presumably counts distinct record values (1000, 1001, 1002) — confirm.
 * */
BOOST_AUTO_TEST_CASE(test_case_4) {
	{
		sharded_builder idx("test_index", 10);
		idx.truncate();
		idx.add(101, indexer::counted_record(1000));
		idx.add(101, indexer::counted_record(1001));
		idx.add(101, indexer::counted_record(1000));
		idx.add(102, indexer::counted_record(1002));
		idx.append();
		idx.merge();
		BOOST_CHECK(idx.document_count() == 3);
	}
	{
		sharded idx("test_index", 10);
		std::vector res = idx.find(101);
		BOOST_REQUIRE(res.size() == 2);
		BOOST_CHECK(res[0].m_value == 1000);
		BOOST_CHECK(res[0].m_count == 2);
		BOOST_CHECK(res[1].m_value == 1001);
		BOOST_CHECK(res[1].m_count == 1);
	}
}

BOOST_AUTO_TEST_SUITE_END()

================================================ FILE: tests/test_datetime.h ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */
#include "common/datetime.h"

BOOST_AUTO_TEST_SUITE(test_datetime)

/*
 * Placeholder case: the body only holds commented-out print statements, so nothing is asserted.
 * */
BOOST_AUTO_TEST_CASE(cur_date) {
	/*std::cout << System::cur_date() << std::endl;
	std::cout << System::cur_datetime() << std::endl;
	std::cout << System::iso8601_datetime() << std::endl;*/
}

BOOST_AUTO_TEST_SUITE_END()

================================================ FILE: tests/test_file.cpp ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al.
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #include #include "transfer/transfer.h" #include "text/text.h" #include "file/file.h" #include "file/tsv_file_remote.h" #include "file/tsv_file.h" #include "file/archive.h" #include "algorithm/hash.h" #include "config.h" using namespace std; BOOST_AUTO_TEST_SUITE(test_file) BOOST_AUTO_TEST_CASE(transfer_test) { int error; { string result = transfer::file_to_string("/test-data/example.txt", error); BOOST_CHECK(error == transfer::OK); BOOST_CHECK(text::trim(result) == "An example file"); } { string result = transfer::gz_file_to_string("/test-data/example.txt.gz", error); BOOST_CHECK(error == transfer::OK); BOOST_CHECK(text::trim(result) == "An example file"); } { string result = transfer::file_to_string("test-data/example.txt", error); BOOST_CHECK(error == transfer::OK); BOOST_CHECK(text::trim(result) == "An example file"); } { string result = transfer::gz_file_to_string("test-data/example.txt.gz", error); BOOST_CHECK(error == transfer::OK); BOOST_CHECK(text::trim(result) == "An example file"); } { stringstream ss; transfer::file_to_stream("/test-data/example.txt", ss, error); string result = ss.str(); BOOST_CHECK(error == transfer::OK); BOOST_CHECK(text::trim(result) == "An example file"); } { stringstream ss; transfer::gz_file_to_stream("/test-data/example.txt.gz", ss, error); string result = ss.str(); BOOST_CHECK(error == transfer::OK); BOOST_CHECK(text::trim(result) == "An example file"); } } BOOST_AUTO_TEST_CASE(handle_errors) { int error; { string result = transfer::file_to_string("/non-existing.txt", error); BOOST_CHECK(error == transfer::ERROR); } { string result = transfer::gz_file_to_string("/non-existing.txt.gz", error); BOOST_CHECK(error == transfer::ERROR); } { stringstream ss; transfer::file_to_stream("/non-existing.txt", ss, error); BOOST_CHECK(error == transfer::ERROR); } { stringstream ss; transfer::gz_file_to_stream("/non-existing.txt.gz", ss, error); BOOST_CHECK(error == transfer::ERROR); } { vector downloaded = 
transfer::download_gz_files_to_disk({"/non-existing.txt.gz"}); BOOST_CHECK(downloaded.size() == 0); } } BOOST_AUTO_TEST_CASE(tsv_file_exists) { file::tsv_file_remote manual_paths_file("crawl-data/ALEXANDRIA-MANUAL-01/warc.paths.gz"); vector warc_paths; manual_paths_file.read_column_into(0, warc_paths); BOOST_CHECK(manual_paths_file.is_open()); BOOST_CHECK(warc_paths.size() > 0); BOOST_CHECK(warc_paths[0] == "crawl-data/ALEXANDRIA-MANUAL-01/files/top_domains.txt.gz"); } BOOST_AUTO_TEST_CASE(tsv_file_dont_exists) { file::tsv_file_remote manual_paths_file("non-existing-file.gz"); BOOST_CHECK(!manual_paths_file.is_open()); } BOOST_AUTO_TEST_CASE(local_tsv_files) { file::tsv_file my_file(config::test_data_path + "tsvtest.tsv"); BOOST_CHECK_EQUAL(my_file.find_first_position("aaa"), 0); BOOST_CHECK_EQUAL(my_file.find_first_position("aab"), 126); BOOST_CHECK_EQUAL(my_file.find_first_position("european"), string::npos); BOOST_CHECK_EQUAL(my_file.find_last_position("aaa"), 112); BOOST_CHECK_EQUAL(my_file.find_last_position("aab"), 126); BOOST_CHECK_EQUAL(my_file.find_last_position("european"), string::npos); file::tsv_file my_file2(config::test_data_path + "tsvtest2.tsv"); BOOST_CHECK_EQUAL(my_file2.find_first_position("aaa"), 0); BOOST_CHECK(my_file2.find_first_position("aab") > 0); BOOST_CHECK_EQUAL(my_file2.find_first_position("european"), string::npos); BOOST_CHECK(my_file2.find_last_position("aaa") > 0 && my_file2.find_last_position("aaa") < my_file2.size()); BOOST_CHECK(my_file2.find_last_position("aab") > 0 && my_file2.find_last_position("aab") < my_file2.size()); BOOST_CHECK(my_file2.find_last_position("aac") > 0 && my_file2.find_last_position("aac") == my_file2.size() - 115); BOOST_CHECK(my_file2.find_last_position("european") == string::npos); BOOST_CHECK_EQUAL(my_file2.find_next_position("aaa"), my_file2.find_first_position("aab")); BOOST_CHECK_EQUAL(my_file2.find_next_position("aab"), my_file2.find_first_position("aac")); 
BOOST_CHECK_EQUAL(my_file2.find_next_position("aabb"), my_file2.find_first_position("aac")); BOOST_CHECK_EQUAL(my_file2.find_next_position("aac"), my_file2.size()); } BOOST_AUTO_TEST_CASE(head_content_len) { { int error; size_t content_len = transfer::head_content_length("http://127.0.0.1/test-data/automobileszone.com", error); BOOST_CHECK_EQUAL(error, transfer::OK); BOOST_CHECK_EQUAL(content_len, 8084); } { int error; size_t content_len = transfer::head_content_length("http://127.0.0.1/test-data/automobileszone.com-not-here", error); BOOST_CHECK_EQUAL(error, transfer::ERROR); BOOST_CHECK_EQUAL(content_len, 0); } } BOOST_AUTO_TEST_CASE(test_upload) { // This amazon bucket is gone. /*{ int error; string buffer; transfer::url_to_string("http://alexandria-test-data.s3.amazonaws.com/multipart_test", buffer, error); BOOST_CHECK_EQUAL(error, transfer::OK); error = transfer::upload_file("multipart_test", buffer); BOOST_CHECK_EQUAL(error, transfer::OK); }*/ } BOOST_AUTO_TEST_CASE(test_upload_gz) { // This amazon bucket is gone. /*{ int error; string buffer; transfer::url_to_string("http://alexandria-test-data.s3.amazonaws.com/multipart_test", buffer, error); BOOST_CHECK_EQUAL(error, transfer::OK); error = transfer::upload_gz_file("multipart_test.gz", buffer); BOOST_CHECK_EQUAL(error, transfer::OK); // Download it again as gz file and see if we get the same result. const string result_back = transfer::gz_file_to_string("multipart_test.gz", error); BOOST_CHECK_EQUAL(error, transfer::OK); BOOST_CHECK_EQUAL(result_back.size(), buffer.size()); BOOST_CHECK_EQUAL(algorithm::hash(result_back), algorithm::hash(buffer)); }*/ } /* * Test the tsv_file::read_column_into function that is used a lot. 
* */ BOOST_AUTO_TEST_CASE(test_tsv_file) { { file::tsv_file tsv(config::test_data_path + "tsvtest3.tsv"); vector vec; tsv.read_column_into(0, vec, 2, 3); BOOST_CHECK(vec.size() == 2); BOOST_CHECK(vec[0] == "line4"); BOOST_CHECK(vec[1] == "line5"); } { file::tsv_file tsv(config::test_data_path + "tsvtest3.tsv"); set data; tsv.read_column_into(0, data, 2, 3); BOOST_CHECK(data.size() == 2); BOOST_CHECK(data.count("line4") == 1); BOOST_CHECK(data.count("line5") == 1); } { file::tsv_file tsv(config::test_data_path + "tsvtest3.tsv"); vector vec; tsv.read_column_into(0, vec, 100, 3); BOOST_CHECK(vec.size() == 3); BOOST_CHECK(vec[0] == "line4"); BOOST_CHECK(vec[1] == "line5"); BOOST_CHECK(vec[2] == "line6"); } { file::tsv_file tsv(config::test_data_path + "tsvtest3.tsv"); set data; tsv.read_column_into(0, data, 100, 3); BOOST_CHECK(data.size() == 3); BOOST_CHECK(data.count("line4") == 1); BOOST_CHECK(data.count("line5") == 1); BOOST_CHECK(data.count("line6") == 1); } { file::tsv_file tsv(config::test_data_path + "tsvtest3.tsv"); vector vec; tsv.read_column_into(0, vec, 3, 0); BOOST_CHECK(vec.size() == 3); BOOST_CHECK(vec[0] == "line1"); BOOST_CHECK(vec[1] == "line2"); BOOST_CHECK(vec[2] == "line3"); } { file::tsv_file tsv(config::test_data_path + "tsvtest3.tsv"); set data; tsv.read_column_into(0, data, 3, 0); BOOST_CHECK(data.size() == 3); BOOST_CHECK(data.count("line1") == 1); BOOST_CHECK(data.count("line2") == 1); BOOST_CHECK(data.count("line3") == 1); } } /* * Test the file::archive simple tarball * */ BOOST_AUTO_TEST_CASE(test_archive) { { file::archive tar("test_dir.tar"); file::create_directory("test_dir1"); std::ofstream file1("test_dir1/file1.txt"); file1 << "hello world 1"; file1.close(); std::ofstream file2("test_dir1/file2.txt"); file2 << "hello world 2"; file2.close(); std::ofstream file3("test_dir1/file3.txt"); file3 << "hello world 3"; file3.close(); tar.read_dir("test_dir1"); } { file::archive tar("test_dir.tar"); file::create_directory("test_dir2"); 
tar.untar("test_dir2"); BOOST_CHECK_EQUAL(file::cat("test_dir2/file1.txt"), "hello world 1"); BOOST_CHECK_EQUAL(file::cat("test_dir2/file2.txt"), "hello world 2"); BOOST_CHECK_EQUAL(file::cat("test_dir2/file3.txt"), "hello world 3"); } file::delete_directory("test_dir1"); file::delete_directory("test_dir2"); file::delete_file("test_dir.tar"); } BOOST_AUTO_TEST_CASE(test_archive2) { { file::archive tar("test_dir.tar"); file::create_directory("test_dir1"); // Create 500 files. for (size_t i = 1; i <= 500; i++) { std::ofstream file1("test_dir1/file" + std::to_string(i) + ".txt"); for (size_t j = 0; j < i; j++) { file1 << "hello world " << j << std::endl; } } tar.read_dir("test_dir1"); } { file::archive tar("test_dir.tar"); file::create_directory("test_dir2"); tar.untar("test_dir2"); // Check 500 files. for (size_t i = 1; i <= 500; i++) { std::ifstream file1("test_dir2/file" + std::to_string(i) + ".txt"); std::string line; size_t j = 0; while (std::getline(file1, line)) { BOOST_CHECK_EQUAL(line, "hello world " + std::to_string(j)); j++; } BOOST_CHECK_EQUAL(j, i); } } file::delete_directory("test_dir1"); file::delete_directory("test_dir2"); file::delete_file("test_dir.tar"); } BOOST_AUTO_TEST_CASE(test_rename_file) { file::create_directory("/tmp/alexandria_test_98237593257"); file::create_directory("/tmp/alexandria_test_98237593257/testdir"); file::rename("/tmp/alexandria_test_98237593257/testdir", "/tmp/alexandria_test_98237593257/testdir2"); BOOST_CHECK(file::file_exists("/tmp/alexandria_test_98237593257/testdir2")); BOOST_CHECK(!file::file_exists("/tmp/alexandria_test_98237593257/testdir")); file::delete_directory("/tmp/alexandria_test_98237593257"); BOOST_CHECK(!file::file_exists("/tmp/alexandria_test_98237593257/testdir")); BOOST_CHECK(!file::file_exists("/tmp/alexandria_test_98237593257/testdir2")); BOOST_CHECK(!file::file_exists("/tmp/alexandria_test_98237593257")); file::create_directory("/tmp/alexandria_test_98237593257/testdir"); 
BOOST_CHECK(file::file_exists("/tmp/alexandria_test_98237593257")); file::delete_directory("/tmp/alexandria_test_98237593257"); BOOST_CHECK(!file::file_exists("/tmp/alexandria_test_98237593257")); } BOOST_AUTO_TEST_SUITE_END() ================================================ FILE: tests/test_hash.cpp ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #include #include "algorithm/hash.h" BOOST_AUTO_TEST_SUITE(hash) BOOST_AUTO_TEST_CASE(str) { BOOST_CHECK_EQUAL(algorithm::hash("testing"), 4540905123118180926ull); BOOST_CHECK_EQUAL(algorithm::hash(""), 6142509188972423790ull); BOOST_CHECK_EQUAL(algorithm::hash("abcdefghijklmnopqrstuvxyz"), 17219978627035894604ull); BOOST_CHECK_EQUAL(algorithm::hash("123"), 10089081994332581363ull); BOOST_CHECK_EQUAL(algorithm::hash("1234"), 15651099383784684535ull); } BOOST_AUTO_TEST_SUITE_END() ================================================ FILE: tests/test_hash_table.cpp ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #include #include "hash_table2/hash_table.h" #include "hash_table2/builder.h" #include "hash_table_helper/hash_table_helper.h" #include "indexer/merger.h" #include BOOST_AUTO_TEST_SUITE(test_hash_table) BOOST_AUTO_TEST_CASE(test_file_paths) { { hash_table2::hash_table_shard_builder ht_builder("test_index", 8); BOOST_CHECK_EQUAL(ht_builder.file_base_data(), "./0/hash_table/ht_test_index_8"); BOOST_CHECK_EQUAL(ht_builder.filename_data(), "./0/hash_table/ht_test_index_8.data"); } { hash_table2::hash_table_shard_builder ht_builder("test_index", 8, 1000, "/data_path"); BOOST_CHECK_EQUAL(ht_builder.file_base_data(), "/data_path/ht_test_index_8"); BOOST_CHECK_EQUAL(ht_builder.filename_data(), "/data_path/ht_test_index_8.data"); BOOST_CHECK_EQUAL(ht_builder.filename_pos(), "./0/hash_table/ht_test_index_8.pos"); } } BOOST_AUTO_TEST_CASE(single_shard_add) { hash_table_helper::truncate("test_index"); { hash_table2::hash_table_shard_builder idx("test_index", 0); idx.truncate(); idx.add(123, "hello world"); idx.append(); idx.merge(); } { hash_table2::hash_table_shard idx("test_index", 0); BOOST_CHECK(idx.has(123)); BOOST_CHECK(!idx.has(1234)); BOOST_CHECK_EQUAL(idx.find(123), "hello world"); } } BOOST_AUTO_TEST_CASE(single_shard_add_versioned) { { hash_table2::hash_table_shard_builder idx("test_index", 0); idx.truncate(); idx.add(123, "hello world", 5); idx.append(); idx.merge(); idx.add(123, "new value", 6); idx.append(); idx.merge(); idx.add(123, "old value", 4); idx.append(); idx.merge(); idx.add(123, "old value 2", 3); idx.add(123, "newest value", 7); idx.append(); idx.merge(); } { hash_table2::hash_table_shard idx("test_index", 0); BOOST_CHECK_EQUAL(idx.find(123), "newest value"); } } BOOST_AUTO_TEST_CASE(single_shard_add_versioned2) { { hash_table2::hash_table_shard_builder idx("test_index", 0); idx.truncate(); idx.add(101, "an old value", 1000); idx.append(); idx.merge(); idx.optimize(); idx.add(101, "another old value", 1000); idx.append(); idx.merge(); 
idx.optimize(); idx.add(101, "a new value", 1001); idx.append(); idx.merge(); idx.optimize(); idx.add(101, "an older value", 999); idx.append(); idx.merge(); idx.optimize(); } { hash_table2::hash_table_shard idx("test_index", 0); BOOST_CHECK_EQUAL(idx.find(101), "a new value"); } } BOOST_AUTO_TEST_CASE(add_to_hash_table) { hash_table_helper::truncate("test_index"); { hash_table2::builder idx("test_index", 43); idx.truncate(); // Add 1000 elements. for (size_t i = 0; i < 1000; i++) { idx.add(i, "Random test data with id: " + std::to_string(i)); } idx.merge(); } { hash_table2::hash_table hash_table("test_index", 43); for (size_t i = 0; i < 1000; i++) { BOOST_CHECK_EQUAL(hash_table.find(i), "Random test data with id: " + std::to_string(i)); } } { hash_table2::builder idx("test_index", 43); idx.truncate(); // Add 1000 elements. for (size_t i = 1000; i < 2000; i++) { idx.add(i, "Random test data with id: " + std::to_string(i)); } idx.merge(); } { hash_table2::hash_table hash_table("test_index", 43); for (size_t i = 1000; i < 2000; i++) { BOOST_CHECK_EQUAL(hash_table.find(i), "Random test data with id: " + std::to_string(i)); } } } BOOST_AUTO_TEST_CASE(add_to_hash_table_reverse) { hash_table_helper::truncate("test_index"); { hash_table2::builder idx("test_index", 17); idx.truncate(); // Add 1000 elements. for (size_t i = 100000; i < 200000; i++) { idx.add(i, "Random test data with id: " + std::to_string(i)); } idx.merge(); } { hash_table2::hash_table hash_table("test_index", 17); BOOST_CHECK_EQUAL(hash_table.size(), 100000); } { // Add more elements. hash_table2::builder idx("test_index", 17); // Add 1000 elements. 
for (size_t i = 0; i < 100000; i++) { idx.add(i, "Random test data with id: " + std::to_string(i)); } idx.merge(); } { hash_table2::hash_table hash_table("test_index", 17); BOOST_CHECK_EQUAL(hash_table.size(), 200000); } } BOOST_AUTO_TEST_CASE(optimize) { hash_table_helper::truncate("test_index"); size_t shard_size = 0; size_t shard_file_size = 0; { hash_table2::hash_table_shard_builder builder("test_index", 0); builder.add(1, "data element 1 v1"); builder.add(2, "data element 2 v1"); builder.add(3, "data element 3 v1"); builder.append(); builder.merge(); hash_table2::hash_table_shard shard("test_index", 0); shard_size = shard.size(); shard_file_size = shard.file_size(); } { // Add some more elements with identical keys. hash_table2::hash_table_shard_builder builder("test_index", 0); builder.add(1, "data element 1 v2"); builder.add(2, "data element 2 v2"); builder.add(3, "data element 3 v2"); builder.append(); builder.merge(); builder.optimize(); hash_table2::hash_table_shard shard("test_index", 0); BOOST_CHECK_EQUAL(shard.size(), shard_size); BOOST_CHECK_EQUAL(shard.file_size(), shard_file_size); BOOST_CHECK_EQUAL(shard.find(1), "data element 1 v2"); BOOST_CHECK_EQUAL(shard.find(2), "data element 2 v2"); BOOST_CHECK_EQUAL(shard.find(3), "data element 3 v2"); } } BOOST_AUTO_TEST_CASE(optimize_empty) { hash_table_helper::truncate("main_index"); hash_table2::hash_table_shard_builder idx("main_index", 0); idx.optimize(); } BOOST_AUTO_TEST_CASE(conditional) { hash_table_helper::truncate("main_index"); { hash_table2::builder ht("main_index", 10); ht.truncate(); ht.add(101, "an old value", 1000); ht.add(101, "another old value", 1000); ht.add(101, "a new value", 1001); ht.add(101, "an older value", 999); ht.merge(); } { hash_table2::hash_table ht("main_index", 10); std::string value = ht.find(101); BOOST_CHECK_EQUAL(value, "a new value"); } } BOOST_AUTO_TEST_CASE(conditional2) { hash_table_helper::truncate("main_index"); { hash_table2::builder ht("main_index", 10); 
ht.truncate(); // Merge between each. Should still get the same value. ht.add(101, "an old value", 1000); ht.merge(); ht.add(101, "another old value", 1000); ht.merge(); ht.add(101, "a new value", 1001); ht.merge(); ht.add(101, "an older value", 999); ht.merge(); } { hash_table2::hash_table ht("main_index", 10); std::string value = ht.find(101); BOOST_CHECK_EQUAL(value, "a new value"); } } BOOST_AUTO_TEST_CASE(more_tests) { hash_table_helper::truncate("main_index"); { hash_table2::builder ht("main_index", 10); ht.truncate(); ht.add(101, "first value", 1000); ht.add(101, "second value", 1001); ht.add(101, "third value", 1002); ht.add(102, "first value", 1000); ht.add(102, "second value", 1001); ht.add(102, "third value", 1002); ht.add(103, "first value", 1); ht.add(103, "second value", 100000); ht.add(103, "third value", 99999999999); ht.add(50, "third value"); ht.merge(); } { hash_table2::hash_table ht("main_index", 10); BOOST_CHECK_EQUAL(ht.find(101), "third value"); BOOST_CHECK_EQUAL(ht.find(102), "third value"); BOOST_CHECK_EQUAL(ht.find(103), "third value"); BOOST_CHECK_EQUAL(ht.find(50), "third value"); } } BOOST_AUTO_TEST_CASE(for_each) { hash_table_helper::truncate("main_index"); { hash_table2::builder ht("main_index", 10); ht.truncate(); ht.add(101, "first value", 1000); ht.merge(); ht.add(101, "second value", 1001); ht.merge(); ht.add(101, "third value", 1002); ht.add(102, "first value", 1000); ht.merge(); ht.add(102, "second value", 1001); ht.merge(); ht.add(102, "third value", 1002); ht.add(103, "third value", 99999999999); ht.add(103, "first value", 1); ht.merge(); ht.add(103, "second value", 100000); ht.merge(); ht.add(50, "third value"); ht.merge(); ht.optimize(); } { hash_table2::hash_table ht("main_index", 10); BOOST_CHECK_EQUAL(ht.find(101), "third value"); BOOST_CHECK_EQUAL(ht.find(102), "third value"); BOOST_CHECK_EQUAL(ht.find(103), "third value"); BOOST_CHECK_EQUAL(ht.find(50), "third value"); std::set keys; std::set values; ht.for_each([&keys, 
&values](uint64_t key, const std::string &val) { keys.insert(key); values.insert(val); }); BOOST_CHECK_EQUAL(keys.size(), 4); BOOST_CHECK_EQUAL(values.size(), 1); for (const auto &val : values) { BOOST_CHECK_EQUAL(val, "third value"); } } } BOOST_AUTO_TEST_CASE(larger_test) { { indexer::merger::start_merge_thread(); hash_table2::builder ht("main_index", 10); ht.truncate(); for (size_t key = 1000; key < 10000; key++) { ht.add(key, std::string(key, 'x')); } for (size_t key = 1000; key < 10000; key++) { ht.add(key, std::string(key, 'y'), 1); } indexer::merger::stop_merge_thread(); } { indexer::merger::start_merge_thread(); hash_table2::builder ht("main_index", 10); for (size_t key = 1000; key < 10000; key++) { ht.add(key, std::string(key, 'z'), 2); } indexer::merger::stop_merge_thread(); } { indexer::merger::start_merge_thread(); hash_table2::builder ht("main_index", 10); for (size_t key = 1000; key < 10000; key++) { ht.add(key, std::string(key, 'a'), 2); } indexer::merger::stop_merge_thread(); } { hash_table2::builder ht("main_index", 10); ht.optimize(); } { hash_table2::hash_table ht("main_index", 10); for (size_t key = 1000; key < 10000; key++) { BOOST_REQUIRE_EQUAL(ht.find(key), std::string(key, 'a')); } std::map> vals; ht.for_each([&vals](uint64_t key, const std::string &val) { vals[key].push_back(val); }); for (const auto &iter : vals) { BOOST_REQUIRE_EQUAL(iter.second.size(), 1); BOOST_REQUIRE_EQUAL(iter.second[0], std::string(iter.first, 'a')); } } } BOOST_AUTO_TEST_CASE(merge_with) { { hash_table2::builder ht("main_index", 11); ht.truncate(); ht.add(123, "a1", 10); ht.add(1230, "a2", 10); ht.add(1231, "a3", 10); ht.add(1231, "a3_n2", 11); ht.add(3828540, "a4", 10); ht.add(2234645, "a5", 10); ht.add(8424878, "a6", 10); ht.add(4174861, "a7", 10); ht.add(7013344, "a8", 10); ht.merge(); } { hash_table2::builder ht("main_index2", 11); ht.truncate(); ht.add(123, "b1", 11); ht.add(1230, "b2", 12); ht.add(1231, "b3", 9); ht.add(1231, "b3", 8); ht.add(8321508, "b4", 
10); ht.add(7309646, "b5", 10); ht.add(2809224, "b6", 10); ht.add(6543485, "b7", 10); ht.add(6078858, "b8", 10); ht.merge(); } { hash_table2::builder ht1("main_index", 11); hash_table2::builder ht2("main_index2", 11); ht1.merge_with(ht2); } { hash_table2::hash_table ht("main_index", 11); BOOST_CHECK_EQUAL(ht.find(123), "b1"); BOOST_CHECK_EQUAL(ht.find(1230), "b2"); BOOST_CHECK_EQUAL(ht.find(1231), "a3_n2"); BOOST_CHECK_EQUAL(ht.find(6543485), "b7"); BOOST_CHECK_EQUAL(ht.find(2234645), "a5"); } } BOOST_AUTO_TEST_CASE(merge_with_files) { { hash_table2::builder ht("main_index", 1); ht.truncate(); ht.add(123, "a1", 10); ht.add(1230, "a2", 10); ht.add(1231, "a3", 10); ht.add(1231, "a3_n2", 11); ht.add(3828540, "a4", 10); ht.add(2234645, "a5", 10); ht.add(8424878, "a6", 10); ht.add(4174861, "a7", 10); ht.add(7013344, "a8", 10); ht.merge(); } { hash_table2::builder ht("main_index2", 1); ht.truncate(); ht.add(123, "b1", 11); ht.add(1230, "b2", 12); ht.add(1231, "b3", 9); ht.add(1231, "b3", 8); ht.add(8321508, "b4", 10); ht.add(7309646, "b5", 10); ht.add(2809224, "b6", 10); ht.add(6543485, "b7", 10); ht.add(6078858, "b8", 10); ht.merge(); } { hash_table2::builder ht("main_index2", 1); ht.get_shard(0)->merge_with("./0/hash_table/ht_main_index_0.pos", "./0/hash_table/ht_main_index_0.data"); } { hash_table2::hash_table ht("main_index2", 1); BOOST_CHECK_EQUAL(ht.find(123), "b1"); BOOST_CHECK_EQUAL(ht.find(1230), "b2"); BOOST_CHECK_EQUAL(ht.find(1231), "a3_n2"); BOOST_CHECK_EQUAL(ht.find(6543485), "b7"); BOOST_CHECK_EQUAL(ht.find(2234645), "a5"); } } BOOST_AUTO_TEST_CASE(remove_record) { { hash_table2::builder ht("main_index", 1); ht.truncate(); ht.add(10000, "data1", 10); ht.add(10001, "data2", 10); ht.add(10002, "data3", 10); ht.merge(); } { hash_table2::hash_table ht("main_index", 1); BOOST_CHECK_EQUAL(ht.find(10000), "data1"); BOOST_CHECK_EQUAL(ht.find(10001), "data2"); BOOST_CHECK_EQUAL(ht.find(10002), "data3"); } { hash_table2::builder ht("main_index", 1); 
ht.remove(10001); ht.merge(); } { hash_table2::hash_table ht("main_index", 1); BOOST_CHECK_EQUAL(ht.find(10000), "data1"); BOOST_CHECK_EQUAL(ht.find(10001), ""); BOOST_CHECK_EQUAL(ht.find(10002), "data3"); } } BOOST_AUTO_TEST_CASE(remove_record2) { { hash_table2::builder ht("main_index", 1); ht.truncate(); ht.add(10000, "data1", 10); ht.add(10001, "data2", 10); ht.add(10002, "data3", 10); ht.merge(); } { hash_table2::builder ht("main_index2", 1); ht.truncate(); ht.add(10000, "data1", 10); ht.add(10002, "data3", 10); ht.merge(); } { hash_table2::hash_table ht("main_index", 1); BOOST_CHECK_EQUAL(ht.find(10000), "data1"); BOOST_CHECK_EQUAL(ht.find(10001), "data2"); BOOST_CHECK_EQUAL(ht.find(10002), "data3"); } { hash_table2::hash_table ht("main_index2", 1); BOOST_CHECK_EQUAL(ht.find(10000), "data1"); BOOST_CHECK_EQUAL(ht.find(10002), "data3"); } { hash_table2::builder ht("main_index", 1); ht.remove(10001); ht.merge(); } { hash_table2::hash_table ht1("main_index", 1); hash_table2::hash_table ht2("main_index", 1); size_t total_size1 = 0; ht1.for_each_shard([&total_size1](auto shard) { total_size1 += shard->file_size(); }); size_t total_size2 = 0; ht2.for_each_shard([&total_size2](auto shard) { total_size2 += shard->file_size(); }); BOOST_CHECK_EQUAL(total_size1, total_size2); } } BOOST_AUTO_TEST_CASE(for_each_key) { { hash_table2::builder ht("main_index", 1); ht.truncate(); ht.add(100, "data1"); ht.add(101, "other data"); ht.add(102, "data3"); ht.merge(); } { hash_table2::hash_table ht("main_index", 1); int num = 0; ht.for_each_key([&num](uint64_t key) { BOOST_CHECK(key == 100 || key == 101 || key == 102); num++; }); BOOST_CHECK_EQUAL(num, 3); } } BOOST_AUTO_TEST_SUITE_END() ================================================ FILE: tests/test_html_parser.cpp ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include #include "parser/html_parser.h" #include "text/text.h" #include "file/file.h" using namespace std; BOOST_AUTO_TEST_SUITE(html_parser) BOOST_AUTO_TEST_CASE(html_parse1) { parser::html_parser parser; parser.parse("test1"); BOOST_CHECK_EQUAL(parser.title(), "test1"); parser.parse("test1

test2

"); BOOST_CHECK_EQUAL(parser.h1(), "test2"); parser.parse("he oisjdf osdjfo idjsofi djsof

"); BOOST_CHECK_EQUAL(parser.title(), ""); BOOST_CHECK_EQUAL(parser.h1(), ""); parser.parse("test1"); BOOST_CHECK_EQUAL(parser.meta(), "Recensioner av Vår vid sommen och andra böcker"); parser.parse(file::read_test_file("test1.html")); BOOST_CHECK_EQUAL(parser.meta(), "Pris: 199 kr. Inbunden, 2021. Finns i lager. Köp Sammetsdiktaturen : motstånd och medlöpare i dagens Ryssland av Anna-Lena Laurén på Bokus.com. Boken har 3 st läsarrecensioner"); parser.parse("test1

Hej Hopp

"); BOOST_CHECK_EQUAL(parser.h1(), "Hej Hopp"); parser.parse("test1

test2

lite text efter"); BOOST_CHECK_EQUAL(parser.text(), "lite text efter"); } BOOST_AUTO_TEST_CASE(html_parse2) { parser::html_parser parser; parser.parse(file::read_test_file("test5.html")); BOOST_CHECK_EQUAL(parser.text().substr(0, 50), string("Nya lån 2021 Nya lån 2020 Nya lån 2019 Nya lån 2018 Nya lån 2017 Nya lån 2016 Uppdaterad 2021-10-01.").substr(0, 50)); parser.parse(file::read_test_file("test6.html")); } BOOST_AUTO_TEST_CASE(html_parse3) { parser::html_parser parser; parser.parse(file::read_test_file("test7.html")); BOOST_CHECK_EQUAL(parser.text().substr(0, 20), "Add to wishlist Adde"); } BOOST_AUTO_TEST_CASE(html_parse4) { parser::html_parser parser; parser.parse(file::read_test_file("test8.html")); BOOST_CHECK_EQUAL(parser.text().substr(0, 107), "Hacker News new | past | comments | ask | show | jobs | submit login 1. Apple Broke Up with Me ( merecivili"); } BOOST_AUTO_TEST_CASE(html_parse5) { parser::html_parser parser; parser.parse(file::read_test_file("test10.html")); BOOST_CHECK_EQUAL(parser.meta(), ""); BOOST_CHECK_EQUAL(parser.title(), "Association for Progressive Communications | Internet for social justice and sustainable development"); BOOST_CHECK_EQUAL(parser.h1(), ""); } BOOST_AUTO_TEST_CASE(html_parse6) { parser::html_parser parser; parser.parse(file::read_test_file("test11.html")); BOOST_CHECK_EQUAL(parser.meta(), "Svenska Dagbladet står för seriös och faktabaserad kvalitetsjournalistik som utmanar, ifrågasätter och inspirerar"); BOOST_CHECK_EQUAL(parser.title(), "SvD | Sveriges kvalitetssajt för nyheter"); } BOOST_AUTO_TEST_CASE(html_parse7) { parser::html_parser parser; parser.parse(file::read_test_file("test12.html")); BOOST_CHECK_EQUAL(parser.meta(), "The systematic thinking in our industry is that settings are the result of design failure. As designers, our goal is to create product experiences that don’t require any adjustment by the user. So offering customization options is often seen as a failure to make firm product decisions. 
I think there is a misunderstanding about what settings really are"); BOOST_CHECK_EQUAL(parser.title(), "Settings are not a design failure"); } BOOST_AUTO_TEST_CASE(html_parse_links) { string html; vector links; string test2_html = file::read_test_file("test2.html"); parser::html_parser parser; parser.parse(test2_html); BOOST_CHECK_EQUAL(parser.title(), "Resebyrån Främmande Världar - L. D. Lapinski - inbunden (9789178937943) | Adlibris Bokhandel"); BOOST_CHECK_EQUAL(parser.meta(), "inbunden, 2021. Köp boken Resebyrån Främmande Världar av L. D. Lapinski (ISBN 9789178937943) hos Adlibris. Fraktfritt över 229 kr Alltid bra priser och snabb leverans. | Adlibris"); BOOST_CHECK_EQUAL(parser.h1(), "Resebyrån Främmande Världar - inbunden, Svenska, 2021"); BOOST_CHECK_EQUAL(parser.text(), ""); BOOST_CHECK(parser.should_insert()); string test4_html = file::read_test_file("test4.html"); parser.parse(test4_html); BOOST_CHECK_EQUAL(parser.title(), "Corona – samlad information för privatpersoner | Skatteverket"); BOOST_CHECK_EQUAL(parser.h1(), "Corona – information för privatpersoner"); BOOST_CHECK_EQUAL(parser.meta(), "Här har vi samlat information för privatpersoner som påverkas av corona på olika sätt"); BOOST_CHECK(parser.should_insert()); string stackoverflow_html = file::read_test_file("stackoverflow.html"); parser.parse(stackoverflow_html); BOOST_CHECK_EQUAL(parser.title(), "node.js - How to use Async and Await with AWS SDK Javascript - Stack Overflow"); BOOST_CHECK_EQUAL(parser.h1(), "How to use Async and Await with AWS SDK Javascript"); BOOST_CHECK_EQUAL(parser.meta(), "I am working with the AWS SDK using the KMS libary. I would like to use async and await instead of callbacks. 
import AWS, { KMS } from \"aws-sdk\"; this.kms = new AWS.KMS(); const key = await this"); BOOST_CHECK(parser.should_insert()); html = file::read_test_file("hallakonsument.html"); parser.parse(html, "https://www.hallakonsument.se/konsumentratt-kopsatt/innan-du-tar-ett-lan/"); BOOST_CHECK_EQUAL(parser.title(), "Innan du tar ett lån | Hallå konsument – Konsumentverket"); BOOST_CHECK_EQUAL(parser.h1(), "Innan du tar ett lån"); BOOST_CHECK_EQUAL(parser.meta(), "Om du har ett behov av att låna pengar är det viktigt att läsa på om vilken typ av lån som passar dig. Prata med flera banker, jämför villkoren och kostnaderna för olika lån"); BOOST_CHECK(parser.should_insert()); links = parser.links(); bool found_link = false; for (const auto &link : links) { if (link.target_host() == "konsumenternas.se" && link.target_path() == "/lan--betalningar/lan/sa-fungerar-ett-lan/forhandsinformation/" && link.text() == "Läs mer om förhandsinformation på webbplatsen konsumenternas.se") { found_link = true; } } BOOST_CHECK(found_link); html = file::read_test_file("konsumenternas.html"); parser.parse(html, "https://www.konsumenternas.se/lan--betalningar/lan/"); BOOST_CHECK_EQUAL(parser.title(), "Lån"); BOOST_CHECK_EQUAL(parser.h1(), "Lån"); BOOST_CHECK_EQUAL(parser.meta(), "Att låna pengar kan vara ett sätt att finansiera något som du behöver eller gärna vill köpa, men inte har råd att betala direkt. 
Men ett lån kostar pengar i form av avgifter och räntor"); BOOST_CHECK(parser.should_insert()); links = parser.links(); found_link = false; for (const auto &link : links) { if (link.target_host() == "konsumenternas.us17.list-manage.com" && link.target_path() == "/subscribe?u=a63ab96c95e9b06c9a857d5f9&id=132436ec8d" && link.text() == "Nyhetsbrev") { found_link = true; } } BOOST_CHECK(found_link); html = file::read_test_file("sbab.html"); parser.parse(html, "https://www.sbab.se/1/privat/lana/privatlan/privatlan_-_sa_funkar_det.html#/berakna_manadskostnad"); BOOST_CHECK_EQUAL(parser.title(), "Privatlån - låna pengar till bra ränta - SBAB"); BOOST_CHECK_EQUAL(parser.h1(), "Privatlån – låna pengar till bra ränta"); BOOST_CHECK_EQUAL(parser.meta(), "Ansök om ett privatlån mellan 30 000 och 500 000 kronor. Låna pengar utan säkerhet. Ansök och få besked direkt"); BOOST_CHECK(parser.should_insert()); links = parser.links(); found_link = false; for (const auto &link : links) { if (link.target_host() == "sbab.kundo.se" && link.target_path() == "/org/sbab/" && link.text() == "Kundforum") { found_link = true; } } BOOST_CHECK(found_link); html = file::read_test_file("kronofogden.html"); parser.parse(html, "https://www.kronofogden.se/82374.html"); BOOST_CHECK_EQUAL(parser.title(), "Fem tips om ekonomin förändras | Kronofogden"); BOOST_CHECK_EQUAL(parser.h1(), "Fem tips om ekonomin förändras"); BOOST_CHECK_EQUAL(parser.meta(), ""); BOOST_CHECK(parser.should_insert()); links = parser.links(); found_link = false; for (const auto &link : links) { if (link.target_host() == "hallakonsument.se" && link.target_path() == "/" && link.text() == "Välkommen till Hallå konsument") { found_link = true; } } BOOST_CHECK(found_link); html = file::read_test_file("uppsala.html"); parser.parse(html, "https://www.uppsala.se/stod-och-omsorg/privatekonomi-och-ekonomiskt-stod/boka-tid-for-budget--och-skuldradgivning/"); BOOST_CHECK_EQUAL(parser.title(), "Budget- och skuldrådgivning hos Konsument Uppsala 
- Uppsala kommun"); BOOST_CHECK_EQUAL(parser.h1(), "Budget- och skuldrådgivning hos Konsument Uppsala"); BOOST_CHECK_EQUAL(parser.meta(), "Om du vill göra din egen hushållsbudget, vill ha ekonomisk rådgivning eller har skulder och inte får pengarna att räcka till kan du vända dig till Konsument Uppsala. "); BOOST_CHECK(parser.should_insert()); links = parser.links(); found_link = false; for (const auto &link : links) { if (link.target_host() == "outlook.office365.com" && link.target_path() == "/owa/calendar/Budgetochskuldrdgivning@uppsalakommun1.onmicrosoft.com/bookings/" && link.text() == "Boka tid online") { found_link = true; } } BOOST_CHECK(found_link); html = file::read_test_file("chessgames.com"); parser.parse(html, "http://store.chessgames.com/chess-books/chess-notation-type/an---algebraic/author/s/alexander-cherniaev-anatoly-karpov-joe-gallagher-joel-r.-steed-miguel-a.-sanchez-richard-obrien/hardware-requirements/windows.html"); BOOST_CHECK_EQUAL(parser.title(), "Chess Books : Windows, AN - Algebraic, Alexander Cherniaev, Anatoly Karpov, Joe Gallagher, Joel R. Steed, Miguel A. Sanchez and Richard O'Brien"); BOOST_CHECK_EQUAL(parser.h1(), "Chess Books"); BOOST_CHECK_EQUAL(parser.meta(), "Shop for Chess Books at US Chess Federation Sales. We offer the widest selection of Chess Books at the lowest prices with same-day shipping.Windows, AN - Algebraic, Alexander Cherniaev, Anatoly Karpov, Joe Gallagher, Joel R. Steed, Miguel A. 
Sanchez and Richard O'Brien"); BOOST_CHECK_EQUAL(parser.links().size(), 0); BOOST_CHECK(parser.should_insert()); html = file::read_test_file("acomesf.org"); parser.parse(html, "http://acomesf.org/download/42104960-3er-congreso-acomesf/"); BOOST_CHECK_EQUAL(parser.title(), "42104960 3er Congreso ACOMESF | Asociación Colombiana de Médicos Especialistas en Salud Familiar (ACOMESF"); BOOST_CHECK_EQUAL(parser.h1(), "42104960 3er Congreso ACOMESF"); BOOST_CHECK_EQUAL(parser.meta(), ""); BOOST_CHECK(parser.should_insert()); html = file::read_test_file("automobileszone.com"); parser.parse(html, "http://automobileszone.com/wp-login.php?redirect_to=http%3A%2F%2Fautomobileszone.com%2Fbest-bronco-build-off-our-editors-weigh-in-on-their-ideal-suvs%2F"); BOOST_CHECK_EQUAL(parser.text(), "Username or Email Address Password Remember Me Lost your password? ← Back to Automobiles Zone Log in with WordPress.com"); BOOST_CHECK(parser.should_insert()); html = file::read_test_file("vcareprojectmanagement.com"); parser.parse(html, "https://vcareprojectmanagement.com/products/project-manager-project-management-certification-pmi-atp-authorised-training-provider-pmp-capm-2021-online-training-course-class"); BOOST_CHECK_EQUAL(parser.h1(), ""); BOOST_CHECK_EQUAL(parser.text(), ""); } BOOST_AUTO_TEST_CASE(html_parser_encodings) { parser::html_parser parser; BOOST_CHECK(!parser.is_exotic_language("hej jag heter josef cullhed")); BOOST_CHECK(!parser.is_exotic_language("åäö")); BOOST_CHECK(!parser.is_exotic_language("Đảng,Đoàn thể - tnxp.hochiminhcity.gov.vn")); BOOST_CHECK(!parser.is_exotic_language("Maktspelet i Volvo : en skildring inifrån - Hans Nyman - Kartonnage (9789189323056) | Bokus")); BOOST_CHECK(parser.is_exotic_language("В КФУ проходят съемки короткометражного фильма в рамках проекта «Кино за 7 дней» | ВидеоПрокат+")); BOOST_CHECK(parser.is_exotic_language("2015-09-09から1日間の記事一覧 - Nani-Sore 何それ?")); BOOST_CHECK(parser.is_exotic_language("Ремонт Принтеров Hp в Спб Адреса | Ремонт 
принтеров")); } BOOST_AUTO_TEST_CASE(html_parser_long_text) { parser::html_parser parser(100000); string html = file::read_test_file("zlib_manual.html"); parser.parse(html, "https://zlib.net/manual.html"); string text = parser.text(); BOOST_CHECK_EQUAL(text.substr(text.size() - 14), "# endif #endif"); vector words = text::get_expanded_full_text_words(text); bool has_word = false; for (const string &word : words) { if (word == "inflateinit2") has_word = true; } BOOST_CHECK(has_word); } /* test these links: Skatteverket here: http://nomell.se/2009/03/24/prisa-gud-har-kommer-skatteaterbaringen/ */ BOOST_AUTO_TEST_SUITE_END() ================================================ FILE: tests/test_hyper_ball.cpp ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #include #include "algorithm/hyper_ball.h" #include "algorithm/algorithm.h" #include #include using namespace std; BOOST_AUTO_TEST_SUITE(hyper_ball) BOOST_AUTO_TEST_CASE(harmonic_centrality_hyper_ball) { { set> e = { std::make_pair(0, 1), std::make_pair(1, 2), std::make_pair(2, 0), std::make_pair(2, 3), std::make_pair(3, 4), std::make_pair(3, 5), std::make_pair(4, 2), std::make_pair(5, 4), }; const size_t n = 1000; vector *edge_map = algorithm::set_to_edge_map(n, e); vector h = algorithm::hyper_ball(n, edge_map); delete [] edge_map; BOOST_CHECK(h.size() == n); BOOST_CHECK_CLOSE(h[0], 8.0/3.0, 0.000001); BOOST_CHECK_CLOSE(h[1], 7.0/3.0, 0.000001); BOOST_CHECK_CLOSE(h[2], 7.0/2.0, 0.000001); BOOST_CHECK_EQUAL(h[6], 0.0); } } BOOST_AUTO_TEST_CASE(harmonic_centrality_hyper_ball2) { { set> e = { std::make_pair(0, 1), std::make_pair(1, 5), std::make_pair(2, 5), std::make_pair(3, 2), std::make_pair(6, 2), std::make_pair(7, 3), std::make_pair(10, 7), std::make_pair(7, 9), std::make_pair(9, 3), std::make_pair(9, 6), std::make_pair(8, 9), std::make_pair(4, 8), }; const size_t n = 1000; vector *edge_map = algorithm::set_to_edge_map(n, e); vector h = algorithm::hyper_ball(n, edge_map); delete [] edge_map; BOOST_CHECK(h.size() == n); BOOST_CHECK_CLOSE(h[5], 4.86666666667, 0.000001); BOOST_CHECK_CLOSE(h[8], 1.0, 0.000001); BOOST_CHECK_CLOSE(h[2], 3.91666666667, 0.000001); } } BOOST_AUTO_TEST_CASE(harmonic_centrality_hyper_ball3) { { set> e = { std::make_pair(0, 11), std::make_pair(1, 0), std::make_pair(2, 1), std::make_pair(3, 2), std::make_pair(3, 8), std::make_pair(4, 7), std::make_pair(5, 7), std::make_pair(6, 7), std::make_pair(7, 8), std::make_pair(10, 12), std::make_pair(11, 1), std::make_pair(11, 10), std::make_pair(12, 25), std::make_pair(13, 9), std::make_pair(13, 14), std::make_pair(14, 9), std::make_pair(14, 8), std::make_pair(14, 15), std::make_pair(15, 7), std::make_pair(19, 15), std::make_pair(20, 21), std::make_pair(21, 16), std::make_pair(21, 17), 
std::make_pair(21, 18), std::make_pair(21, 22), std::make_pair(22, 23), std::make_pair(23, 19), std::make_pair(24, 20), std::make_pair(24, 21), std::make_pair(24, 25), std::make_pair(25, 24), }; const size_t n = 1000; vector *edge_map = algorithm::set_to_edge_map(n, e); vector h = algorithm::hyper_ball(n, edge_map); delete [] edge_map; BOOST_CHECK(h.size() == n); BOOST_CHECK_CLOSE(h[0], 2.33333333333, 0.000001); BOOST_CHECK_CLOSE(h[7], 7.25156232656, 0.000001); } } BOOST_AUTO_TEST_SUITE_END() ================================================ FILE: tests/test_hyper_log_log.cpp ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #include #include "algorithm/hyper_log_log.h" #include #include using namespace std; BOOST_AUTO_TEST_SUITE(hyper_log_log) BOOST_AUTO_TEST_CASE(hyper_simple) { { algorithm::hyper_log_log hl; BOOST_CHECK(hl.leading_zeros_plus_one(0x0ull) == 65); BOOST_CHECK(hl.leading_zeros_plus_one(0x1ull) == 64); BOOST_CHECK(hl.leading_zeros_plus_one(0xFFFFFFFFull) == 33); BOOST_CHECK(hl.leading_zeros_plus_one(0xFFFFFFFFull) == 33); } } BOOST_AUTO_TEST_CASE(hyper_inserts) { { algorithm::hyper_log_log hl; hl.insert(0); hl.insert(1); hl.insert(2); hl.insert(3); hl.insert(4); hl.insert(5); hl.insert(6); algorithm::hyper_log_log hl2; hl2.insert(0); hl2.insert(1); hl2.insert(2); hl2.insert(3); hl2.insert(4); hl2.insert(5); hl2.insert(7); algorithm::hyper_log_log hl3 = hl + hl2; } vector intervals = {400000, 500000, 1000000, 10000000}; for (size_t interval : intervals) { algorithm::hyper_log_log hl; for (size_t i = 0; i < interval; i++) { hl.insert(i); } BOOST_CHECK(std::abs((int)hl.count() - (int)interval) < interval * hl.error_bound()); } } BOOST_AUTO_TEST_CASE(hyper_union) { algorithm::hyper_log_log hl1; algorithm::hyper_log_log hl2; for (size_t i = 0; i < 250000; i++) { hl1.insert(i); } for (size_t i = 250000; i < 500000; i++) { hl2.insert(i); } algorithm::hyper_log_log hl3 = hl1 + hl2; BOOST_CHECK(std::abs((int)hl3.count() - 500000) < 500000 * hl3.error_bound()); } BOOST_AUTO_TEST_CASE(hyper_log_log_data_copy) { algorithm::hyper_log_log hl1; for (size_t i = 0; i < 250000; i++) { hl1.insert(i); } algorithm::hyper_log_log hl2(hl1.data(), hl1.b()); BOOST_CHECK(std::abs((int)hl2.count() - 250000) < 250000 * hl1.error_bound()); std::vector sizes = {25000, 50000, 75000, 100000, 200000, 300000, 400000}; srand(100); for (size_t size : sizes) { algorithm::hyper_log_log hll; for (size_t i = 0; i < size; i++) { size_t rnd = (((size_t)rand()) << 32) | ((size_t)rand()); hll.insert(rnd); } BOOST_CHECK(std::abs((int)hll.count() - (int)size) < size * hl1.error_bound()); } } 
BOOST_AUTO_TEST_CASE(hyper_log_log_test2) { algorithm::hyper_log_log hl1(10); const int sz = 100000; for (size_t i = 0; i < sz; i++) { hl1.insert(rand()); } BOOST_CHECK(std::abs((int)hl1.count() - sz) < sz * hl1.error_bound()); } BOOST_AUTO_TEST_CASE(hyper_log_log_move) { algorithm::hyper_log_log hl1(10); const int sz = 100000; for (size_t i = 0; i < sz; i++) { hl1.insert(rand()); } auto hl2 = std::move(hl1); BOOST_CHECK(std::abs((int)hl2.count() - sz) < sz * hl1.error_bound()); } BOOST_AUTO_TEST_SUITE_END() ================================================ FILE: tests/test_index_builder.cpp ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #include #include "file/file.h" #include "indexer/index_builder.h" #include "indexer/index.h" #include "indexer/generic_record.h" #include "indexer/value_record.h" BOOST_AUTO_TEST_SUITE(test_index_builder) BOOST_AUTO_TEST_CASE(test_merge_with) { file::delete_directory("./0/full_text/test_index"); file::create_directory("./0/full_text/test_index"); { indexer::index_builder idx("test_index", 0, 1000); idx.add(123, indexer::value_record(1000)); idx.add(123, indexer::value_record(1001)); idx.add(124, indexer::value_record(1000)); idx.append(); idx.merge(); } { indexer::index idx("test_index", 0, 1000); auto res1 = idx.find(123); auto res2 = idx.find(124); BOOST_REQUIRE_EQUAL(res1.size(), 2); BOOST_REQUIRE_EQUAL(res2.size(), 1); BOOST_CHECK_EQUAL(res1[0].m_value, 1000); BOOST_CHECK_EQUAL(res1[1].m_value, 1001); BOOST_CHECK_EQUAL(res2[0].m_value, 1000); } { indexer::index_builder idx("test_index", 8, 1000); idx.add(123, indexer::value_record(1002)); idx.add(123, indexer::value_record(1003)); idx.add(124, indexer::value_record(1010)); idx.add(125, indexer::value_record(1011)); idx.append(); idx.merge(); } { indexer::index_builder idx1("test_index", 0, 1000); indexer::index idx2("test_index", 8, 1000); idx1.merge_with(idx2); } { indexer::index idx("test_index", 0, 1000); auto res1 = idx.find(123); auto res2 = idx.find(124); auto res3 = idx.find(125); BOOST_REQUIRE_EQUAL(res1.size(), 4); BOOST_REQUIRE_EQUAL(res2.size(), 2); BOOST_REQUIRE_EQUAL(res3.size(), 1); BOOST_CHECK_EQUAL(res1[0].m_value, 1000); BOOST_CHECK_EQUAL(res1[1].m_value, 1001); BOOST_CHECK_EQUAL(res1[2].m_value, 1002); BOOST_CHECK_EQUAL(res1[3].m_value, 1003); BOOST_CHECK_EQUAL(res2[0].m_value, 1000); BOOST_CHECK_EQUAL(res2[1].m_value, 1010); BOOST_CHECK_EQUAL(res3[0].m_value, 1011); } } BOOST_AUTO_TEST_CASE(test_merge_with2) { file::delete_directory("./0/full_text/test_index"); file::create_directory("./0/full_text/test_index"); { indexer::index_builder idx("test_index", 0, 1000); idx.add(123, 
indexer::value_record(1000)); idx.add(123, indexer::value_record(1001)); idx.add(124, indexer::value_record(1000)); idx.append(); idx.merge(); } { indexer::index idx("test_index", 0, 1000); auto res1 = idx.find(123); auto res2 = idx.find(124); BOOST_REQUIRE_EQUAL(res1.size(), 2); BOOST_REQUIRE_EQUAL(res2.size(), 1); BOOST_CHECK_EQUAL(res1[0].m_value, 1000); BOOST_CHECK_EQUAL(res1[1].m_value, 1001); BOOST_CHECK_EQUAL(res2[0].m_value, 1000); } { indexer::index_builder idx("test_index", 8, 1000); idx.add(123, indexer::value_record(1002)); idx.add(123, indexer::value_record(1003)); idx.add(124, indexer::value_record(1010)); idx.add(125, indexer::value_record(1011)); idx.append(); idx.merge(); } { indexer::index_builder idx1("test_index", 0, 1000); indexer::index idx2("test_index", 8, 1000); idx1.merge_with(idx2); } { indexer::index idx("test_index", 0, 1000); auto res1 = idx.find(123); auto res2 = idx.find(124); auto res3 = idx.find(125); BOOST_REQUIRE_EQUAL(res1.size(), 4); BOOST_REQUIRE_EQUAL(res2.size(), 2); BOOST_REQUIRE_EQUAL(res3.size(), 1); BOOST_CHECK_EQUAL(res1[0].m_value, 1000); BOOST_CHECK_EQUAL(res1[1].m_value, 1001); BOOST_CHECK_EQUAL(res1[2].m_value, 1002); BOOST_CHECK_EQUAL(res1[3].m_value, 1003); BOOST_CHECK_EQUAL(res2[0].m_value, 1000); BOOST_CHECK_EQUAL(res2[1].m_value, 1010); BOOST_CHECK_EQUAL(res3[0].m_value, 1011); } } BOOST_AUTO_TEST_SUITE_END() ================================================ FILE: tests/test_index_iteration.cpp ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #include #include #include "indexer/sharded_builder.h" #include "indexer/sharded.h" #include "indexer/basic_index_builder.h" #include "indexer/basic_index.h" #include "indexer/counted_record.h" using namespace std; BOOST_AUTO_TEST_SUITE(test_index_iteration) BOOST_AUTO_TEST_CASE(test_index_iteration) { { indexer::sharded_builder idx("test_index", 10); idx.truncate(); idx.add(100, indexer::counted_record(1000)); idx.add(101, indexer::counted_record(1001)); idx.add(101, indexer::counted_record(1002)); idx.add(102, indexer::counted_record(1003)); idx.append(); idx.merge(); } indexer::sharded idx("test_index", 10); std::vector found_keys; std::vector found_values; std::mutex lock; idx.for_each([&lock, &found_keys, &found_values](uint64_t key, const std::vector &recs) { std::lock_guard grd(lock); found_keys.push_back(key); for (auto &rec : recs) { found_values.push_back(rec.m_value); } }); std::sort(found_keys.begin(), found_keys.end()); std::sort(found_values.begin(), found_values.end()); BOOST_CHECK(found_keys[0] == 100); BOOST_CHECK(found_keys[1] == 101); BOOST_CHECK(found_keys[2] == 102); BOOST_CHECK(found_keys.size() == 3); BOOST_CHECK(found_values[0] == 1000); BOOST_CHECK(found_values[1] == 1001); BOOST_CHECK(found_values[2] == 1002); BOOST_CHECK(found_values[3] == 1003); BOOST_CHECK(found_values.size() == 4); } BOOST_AUTO_TEST_CASE(test_index_iteration2) { { indexer::sharded_builder idx("test_index", 10); idx.truncate(); for (size_t i = 1; i <= 10000; i++) { idx.add(i % 10, indexer::counted_record(i)); idx.add(i % 100, indexer::counted_record(i)); idx.add(i % 7, indexer::counted_record(i)); idx.add(i % 13, indexer::counted_record(i)); } idx.append(); idx.merge(); } indexer::sharded idx("test_index", 10); std::map> records; std::mutex lock; idx.for_each([&lock, &records](uint64_t key, const std::vector &recs) { std::lock_guard grd(lock); for (auto &rec : recs) { records[key].push_back(rec.m_value); } }); for (size_t i = 1; i <= 10000; i++) { 
BOOST_CHECK(std::find(records[i % 10].begin(), records[i % 10].end(), i) != records[i % 10].end()); BOOST_CHECK(std::find(records[i % 100].begin(), records[i % 100].end(), i) != records[i % 100].end()); BOOST_CHECK(std::find(records[i % 7].begin(), records[i % 7].end(), i) != records[i % 7].end()); BOOST_CHECK(std::find(records[i % 13].begin(), records[i % 13].end(), i) != records[i % 13].end()); } } BOOST_AUTO_TEST_SUITE_END() ================================================ FILE: tests/test_index_reader.cpp ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #include #include "indexer/index_builder.h" #include "indexer/index.h" #include "indexer/generic_record.h" #include #include #include "URL.h" #include "text/text.h" #include "profiler/profiler.h" #include "roaring/roaring.hh" BOOST_AUTO_TEST_SUITE(test_index_reader) BOOST_AUTO_TEST_CASE(test_index_reader1) { { indexer::index_builder idx("test_db", 0, 1000); idx.truncate(); idx.add(100, indexer::generic_record(1000)); idx.add(100, indexer::generic_record(1001)); idx.add(100, indexer::generic_record(1002)); idx.append(); idx.merge(); } { ifstream reader("./0/full_text/test_db/0.data", ios::binary); reader.seekg(0, ios::end); size_t file_size = reader.tellg(); reader.seekg(0, ios::beg); char *buffer = new char[file_size]; reader.read(buffer, file_size); std::string file_data(buffer, file_size); std::istringstream ram_reader(file_data); indexer::index idx(&ram_reader, 1000); vector res = idx.find(100); BOOST_REQUIRE(res.size() == 3); BOOST_CHECK(res[0].m_value == 1000); BOOST_CHECK(res[1].m_value == 1001); BOOST_CHECK(res[2].m_value == 1002); delete[] buffer; } } BOOST_AUTO_TEST_CASE(test_index_reader_2) { /* { indexer::index_builder idx("restaurantbusinessonline.com"); idx.set_hash_table_size(1000); idx.truncate(); const vector cols = {1, 2, 3, 4}; vector files; boost::filesystem::path p ("./output"); boost::filesystem::directory_iterator end_itr; for (boost::filesystem::directory_iterator itr(p); itr != end_itr; ++itr) { // If it's not a directory, list it. If you want to list directories too, just remove this check. if (boost::filesystem::is_regular_file(itr->path())) { // assign current file name to current_file and echo it out to the console. 
string current_file = itr->path().string(); files.push_back(current_file); } } size_t num_added = 0; size_t num_bytes_added = 0; for (const string &local_path : files) { ifstream infile(local_path, ios::in); boost::iostreams::filtering_istream decompress_stream; decompress_stream.push(boost::iostreams::gzip_decompressor()); decompress_stream.push(infile); string line; while (getline(decompress_stream, line)) { vector col_values; boost::algorithm::split(col_values, line, boost::is_any_of("\t")); URL url(col_values[0]); if (url.host() != "doodlecraftblog.com") continue; num_added++; uint64_t url_hash = url.hash(); for (size_t col : cols) { vector words = text::get_full_text_words(col_values[col]); for (const string &word : words) { num_bytes_added += word.size(); idx.add(::algorithm::hash(word), ::indexer::url_record(url_hash)); } } } } num_added++; cout << "ADDED " << num_added << " URLS" << endl; cout << num_bytes_added << " bytes" << endl; idx.append(); idx.merge(); } { logger::verbose(true); profiler::instance prof("load index file to ram"); ifstream reader("restaurantbusinessonline.com.data", ios::binary); reader.seekg(0, ios::end); size_t file_size = reader.tellg(); reader.seekg(0, ios::beg); char *buffer = new char[file_size]; reader.read(buffer, file_size); prof.stop(); indexer::index_reader_ram ram_reader(buffer, file_size); indexer::index idx((indexer::index_reader *)&ram_reader, 1000); cout << "file_size: " << file_size << endl; idx.print_stats(); vector res = idx.find(::algorithm::hash("helicopter")); BOOST_REQUIRE(res.size() > 0); delete buffer; }*/ } BOOST_AUTO_TEST_SUITE_END() ================================================ FILE: tests/test_logger.cpp ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include #include "logger/logger.h" #include "config.h" using namespace std; BOOST_AUTO_TEST_SUITE(test_logger) BOOST_AUTO_TEST_CASE(test_logger1) { logger::log_string("test1"); logger::log_string("test2"); logger::sync(); ifstream logfile(config::log_file_path); logfile.seekg(-12, std::ios::end); string line1, line2; getline(logfile, line1); getline(logfile, line2); BOOST_CHECK_EQUAL(line1, "test1"); BOOST_CHECK_EQUAL(line2, "test2"); } BOOST_AUTO_TEST_SUITE_END() ================================================ FILE: tests/test_memory.cpp ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #include #include "memory/memory.h" #include "memory/debugger.h" #include "indexer/index_builder.h" #include "indexer/basic_index_builder.h" #include "indexer/domain_link_record.h" BOOST_AUTO_TEST_SUITE(test_memory) BOOST_AUTO_TEST_CASE(test_memory) { memory::update(); BOOST_CHECK(memory::get_available_memory() > 0); BOOST_CHECK(memory::get_total_memory() > 0); const size_t used1 = memory::allocated_memory(); const size_t memlen = 1000000; char *some_mem = new char[memlen]; for (size_t i = 0; i < memlen; i++) { some_mem[i] = 1; } memory::update(); const size_t used2 = memory::allocated_memory(); delete[] some_mem; const size_t used3 = memory::allocated_memory(); std::cout << "used1: " << used1 << std::endl; std::cout << "used2: " << used2 << std::endl; std::cout << "used3: " << used3 << std::endl; BOOST_CHECK(used1 + 1000000 == used2); BOOST_CHECK(used1 == used3); } /* * Test memory consumtion during merge, should end with same amount. * */ BOOST_AUTO_TEST_CASE(test_indexer_memory) { memory::update(); indexer::create_db_directories("domain_link_index"); BOOST_CHECK(memory::get_available_memory() > 0); BOOST_CHECK(memory::get_total_memory() > 0); size_t memuse1, memuse2, memuse3, memuse4; memuse1 = memory::allocated_memory(); { indexer::basic_index_builder idx("domain_link_index", 97ull); memuse2 = memory::allocated_memory(); idx.append(); idx.merge(); memuse3 = memory::allocated_memory(); } memuse4 = memory::allocated_memory(); BOOST_CHECK(memuse1 == memuse4); BOOST_CHECK(memuse2 == memuse3); } BOOST_AUTO_TEST_SUITE_END() ================================================ FILE: tests/test_n_gram.cpp ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #include #include "text/text.h" #include "algorithm/hash.h" using namespace std; BOOST_AUTO_TEST_SUITE(n_gram) BOOST_AUTO_TEST_CASE(words_to_ngram) { vector ngrams; text::words_to_ngram_hash({"the", "quick", "brown", "fox", "jumps", "over", "the", "lazy", "dog"}, 3, [&ngrams](const uint64_t hash) { ngrams.push_back(hash); }); BOOST_CHECK_EQUAL(ngrams[0], algorithm::hash("the")); BOOST_CHECK_EQUAL(ngrams[1], algorithm::hash("the quick")); BOOST_CHECK_EQUAL(ngrams[2], algorithm::hash("the quick brown")); BOOST_CHECK_EQUAL(ngrams[3], algorithm::hash("quick")); BOOST_CHECK_EQUAL(ngrams[4], algorithm::hash("quick brown")); BOOST_CHECK_EQUAL(ngrams[5], algorithm::hash("quick brown fox")); BOOST_CHECK_EQUAL(ngrams[6], algorithm::hash("brown")); BOOST_CHECK_EQUAL(ngrams[7], algorithm::hash("brown fox")); BOOST_CHECK_EQUAL(ngrams[8], algorithm::hash("brown fox jumps")); BOOST_CHECK_EQUAL(ngrams[18], algorithm::hash("the")); BOOST_CHECK_EQUAL(ngrams[19], algorithm::hash("the lazy")); BOOST_CHECK_EQUAL(ngrams[20], algorithm::hash("the lazy dog")); BOOST_CHECK_EQUAL(ngrams[21], algorithm::hash("lazy")); BOOST_CHECK_EQUAL(ngrams[22], algorithm::hash("lazy dog")); BOOST_CHECK_EQUAL(ngrams[23], algorithm::hash("dog")); BOOST_CHECK_EQUAL(ngrams.size(), 24); } BOOST_AUTO_TEST_CASE(n_gram2) { vector ngrams; text::words_to_ngram_hash({"i", "liberoklubben", "här"}, 3, [&ngrams](const uint64_t hash, const std::string &word) { ngrams.push_back(hash); }); BOOST_CHECK_EQUAL(ngrams[0], algorithm::hash("i")); BOOST_CHECK_EQUAL(ngrams[1], algorithm::hash("i liberoklubben")); BOOST_CHECK_EQUAL(ngrams[2], algorithm::hash("i liberoklubben här")); BOOST_CHECK_EQUAL(ngrams[3], algorithm::hash("liberoklubben")); BOOST_CHECK_EQUAL(ngrams[4], algorithm::hash("liberoklubben här")); BOOST_CHECK_EQUAL(ngrams[5], algorithm::hash("här")); BOOST_CHECK_EQUAL(ngrams.size(), 6); } BOOST_AUTO_TEST_SUITE_END() ================================================ FILE: tests/test_robot_parser.cpp 
================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #include #include "robots.h" using namespace std; BOOST_AUTO_TEST_SUITE(robot_parser) BOOST_AUTO_TEST_CASE(parse) { std::string robots_content = "Sitemap: https://www.omnible.se/sitemap.xml\n" "User-agent: AlexandriaBot\n" "Disallow: *\n" "User-agent: * # all agents\n" "Disallow: /*crawl=no*\n" "Disallow: /basket/add*\n" ; std::string user_agent = "AlexandriaBot"; googlebot::RobotsMatcher matcher; std::string url = "/visit"; bool allowed = matcher.OneAgentAllowedByRobots(robots_content, user_agent, url); BOOST_CHECK(!allowed); } BOOST_AUTO_TEST_CASE(parse2) { std::string robots_content = string("Sitemap: https://www.omnible.se/sitemap.xml\n" "User-agent: *\n" "Disallow: /visit\n" "User-agent: AlexandriaBot\n" "Disallow: /10126597891759986715\n"); std::string user_agent = "AlexandriaBot"; googlebot::RobotsMatcher matcher; { std::string url = "https://www.omnible.se/10126597891759986715"; bool allowed = matcher.OneAgentAllowedByRobots(robots_content, user_agent, url); BOOST_CHECK(!allowed); } { std::string url = "https://www.omnible.se/1012659789175998671"; bool allowed = matcher.OneAgentAllowedByRobots(robots_content, user_agent, url); BOOST_CHECK(allowed); } } BOOST_AUTO_TEST_SUITE_END() ================================================ FILE: tests/test_scraper.cpp ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. 
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include #include #include "scraper/scraper.h" #include #include using namespace std; BOOST_AUTO_TEST_SUITE(test_scraper) BOOST_AUTO_TEST_CASE(test_scraper) { scraper::scraper_store store; scraper::scraper scraper("omnible.se", &store); scraper.set_timeout(0); scraper.push_url(URL("http://omnible.se/")); scraper.push_url(URL("http://omnible.se/10126597891759986715")); scraper.push_url(URL("http://omnible.se/10123997891267016458")); scraper.push_url(URL("http://omnible.se/gtin/9789180230865")); scraper.push_url(URL("http://omnible.se/10123697814011564169")); scraper.push_url(URL("https://www.omnible.se/notfound")); scraper.push_url(URL("https://www.omnible.se/gtin/9789177714958")); scraper.run(); string last = store.tail(); vector cols; boost::algorithm::split(cols, last, boost::is_any_of("\t")); BOOST_CHECK_EQUAL(cols[0], "https://www.omnible.se/10123697814011564169"); BOOST_CHECK_EQUAL(cols[1], "Den sista gåvan av Abdulrazak Gurnah - recensioner & prisjämförelse - Omnible"); } BOOST_AUTO_TEST_CASE(scraper_multithreaded) { return; vector urls = { /*"http://omnible.se/", "http://omnible.se/10126597891759986715", "http://omnible.se/10123997891267016458", "https://spelagratis.nu/", "https://spelagratis.nu/super_mario_world.html", "http://omnible.se/gtin/9789180230865", "http://omnible.se/10123697814011564169", "https://spelagratis.nu/dirt_bike.html"*/ "http://optout.aboutads.info/", "http://tabernus.com/", "http://tabernus.com/test", "http://apnews.excite.com/article/20071031/D8SKBRKO0.html", 
"http://thebetter.wiki/en/Jeb_Magruder", "https://www.thebetter.wiki/en/testing" }; scraper::run_scraper_on_urls(urls); } BOOST_AUTO_TEST_SUITE_END() ================================================ FILE: tests/test_sharded_index_builder.cpp ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #include #include "indexer/index_manager.h" #include "indexer/sharded_index_builder.h" #include "indexer/sharded_index.h" #include "indexer/merger.h" #include "text/text.h" #include "algorithm/hash.h" #include "transfer/transfer.h" BOOST_AUTO_TEST_SUITE(test_sharded_index_builder) BOOST_AUTO_TEST_CASE(test_sharded_index_builder) { { indexer::sharded_index_builder idx("test_index", 10); idx.truncate(); idx.add(101, indexer::generic_record(1000, 1.0f)); idx.add(102, indexer::generic_record(1001, 1.0f)); idx.append(); idx.merge(); } { indexer::sharded_index idx("test_index", 10); vector res = idx.find(101); BOOST_REQUIRE(res.size() == 1); BOOST_CHECK(res[0].m_value == 1000); } } BOOST_AUTO_TEST_CASE(test_group_by) { using indexer::domain_link_record; { indexer::sharded_index_builder idx("test_index", 1); idx.truncate(); idx.add(101, domain_link_record(1000, 1.0f, 200)); idx.add(101, domain_link_record(1004, 1.0f, 300)); idx.add(101, domain_link_record(1001, 1.0f, 200)); idx.add(101, domain_link_record(1003, 1.0f, 300)); idx.add(101, domain_link_record(1002, 1.0f, 200)); idx.add(102, domain_link_record(1000, 1.0f, 200)); idx.add(102, domain_link_record(1001, 1.0f, 200)); idx.add(102, domain_link_record(1005, 1.0f, 300)); idx.add(102, domain_link_record(1002, 1.0f, 200)); idx.add(103, domain_link_record(1000, 1.0f, 200)); idx.add(103, domain_link_record(1001, 1.0f, 200)); idx.add(103, domain_link_record(1004, 1.0f, 300)); idx.add(103, domain_link_record(1002, 1.0f, 200)); idx.append(); idx.merge(); idx.optimize(); } { indexer::sharded_index idx("test_index", 1); auto identity = [](float score) { return score; }; std::vector counts; vector res = idx.find_group_by({101, 102}, identity, counts); BOOST_REQUIRE(res.size() == 1); BOOST_CHECK(res[0].m_score == 3.0f); BOOST_CHECK(counts[0] == 3); } { indexer::sharded_index idx("test_index", 1); auto times_two = [](float score) { return 2.0f * score; }; std::vector counts; vector res = idx.find_group_by({101, 103}, times_two, 
counts); BOOST_REQUIRE(res.size() == 2); sort(res.begin(), res.end(), domain_link_record::storage_order()); BOOST_CHECK(res[0].m_score == 2.0f * (3.0f)); BOOST_CHECK(res[1].m_score == 2.0f * (1.0f)); BOOST_CHECK(counts[0] == 3); BOOST_CHECK(counts[1] == 1); } } BOOST_AUTO_TEST_CASE(test_score_mod) { using indexer::domain_record; { indexer::sharded_index_builder idx("test_index", 1); idx.truncate(); idx.add(101, domain_record(1000, 1.0f)); idx.add(101, domain_record(1004, 1.0f)); idx.add(101, domain_record(1001, 1.0f)); idx.add(101, domain_record(1003, 1.0f)); idx.add(101, domain_record(1002, 1.0f)); idx.add(102, domain_record(1000, 1.0f)); idx.add(102, domain_record(1001, 1.0f)); idx.add(102, domain_record(1005, 1.0f)); idx.add(102, domain_record(1002, 1.0f)); idx.append(); idx.merge(); idx.optimize(); } { /* * intersected records will be in this order: * 1000 * 1001 * 1002 * * so score modification will take place in that order. * * */ indexer::sharded_index idx("test_index", 1); uint64_t sum_id = 0; vector res = idx.find_top({101, 102}, 2, [&sum_id](const domain_record &val) -> float { return (float)(sum_id++); }); BOOST_REQUIRE(res.size() == 2); BOOST_CHECK(res[0].m_score == 2.0f); BOOST_CHECK(res[0].m_value == 1002); BOOST_CHECK(res[1].m_score == 1.0f); BOOST_CHECK(res[1].m_value == 1001); } } BOOST_AUTO_TEST_SUITE_END() ================================================ FILE: tests/test_sort.cpp ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #include #include "algorithm/sort.h" #include using namespace std; BOOST_AUTO_TEST_SUITE(test_sort) struct test_data_struct1 { int data1; int data2; }; BOOST_AUTO_TEST_CASE(merge_arrays) { { vector arr1 = {1, 2, 3}; vector arr2 = {4, 5, 6}; vector arr3; vector arr4{1, 2, 3, 4, 5, 6}; algorithm::sort::merge_arrays(arr1, arr2, arr3); BOOST_CHECK(arr3 == arr4); } { vector arr1 = {1, 2, 3}; vector arr2 = {3, 4, 5, 6}; vector arr3; vector arr4{1, 2, 3, 3, 4, 5, 6}; algorithm::sort::merge_arrays(arr1, arr2, arr3); BOOST_CHECK(arr3 == arr4); } { vector arr1 = {}; vector arr2 = {3, 4, 5, 6}; vector arr3; vector arr4{3, 4, 5, 6}; algorithm::sort::merge_arrays(arr1, arr2, arr3); BOOST_CHECK(arr3 == arr4); } } BOOST_AUTO_TEST_CASE(merge_arrays_of_struct) { { vector arr1{test_data_struct1{.data1 = 1, .data2 = 2}}; vector arr2{test_data_struct1{.data1 = 2, .data2 = 3}}; vector arr3; vector arr4{test_data_struct1{.data1 = 1, .data2 = 2}, test_data_struct1{.data1 = 2, .data2 = 3}}; algorithm::sort::merge_arrays(arr1, arr2, [](const struct test_data_struct1 &a, const struct test_data_struct1 &b) { return a.data1 < b.data1; }, arr3); BOOST_CHECK(arr3[0].data1 == arr4[0].data1 && arr3[0].data2 == arr4[0].data2); BOOST_CHECK(arr3[1].data1 == arr4[1].data1 && arr3[1].data2 == arr4[1].data2); } { vector arr1{test_data_struct1{.data1 = 1, .data2 = 2}, test_data_struct1{.data1 = 3, .data2 = 4}}; vector arr2{test_data_struct1{.data1 = 2, .data2 = 3}}; vector arr3; vector arr4{test_data_struct1{.data1 = 1, .data2 = 2}, test_data_struct1{.data1 = 2, .data2 = 3}, test_data_struct1{.data1 = 3, .data2 = 4}}; algorithm::sort::merge_arrays(arr1, arr2, [](const struct test_data_struct1 &a, const struct test_data_struct1 &b) { return a.data1 < b.data1; }, arr3); BOOST_CHECK(arr3[0].data1 == arr4[0].data1 && arr3[0].data2 == arr4[0].data2); BOOST_CHECK(arr3[1].data1 == arr4[1].data1 && arr3[1].data2 == arr4[1].data2); BOOST_CHECK(arr3[2].data1 == arr4[2].data1 && arr3[2].data2 == arr4[2].data2); 
} } BOOST_AUTO_TEST_CASE(merge_many_arrays) { { vector arr1 = {1, 2, 3}; vector arr2 = {4, 5, 6}; vector arr3 = {7, 8, 9}; vector res; vector> inp{arr1, arr2, arr3}; vector corr{1, 2, 3, 4, 5, 6, 7, 8, 9}; algorithm::sort::merge_arrays(inp, res); BOOST_CHECK(res == corr); } { vector arr1 = {1, 3, 6}; vector arr2 = {2, 4, 9}; vector arr3 = {1, 5, 7, 8}; vector res; vector> inp{arr1, arr2, arr3}; vector corr{1, 1, 2, 3, 4, 5, 6, 7, 8, 9}; algorithm::sort::merge_arrays(inp, res); BOOST_CHECK(res == corr); } } BOOST_AUTO_TEST_CASE(merge_many_arrays_of_struct) { { vector arr1{ test_data_struct1{.data1 = 1, .data2 = 11}, test_data_struct1{.data1 = 2, .data2 = 12}, test_data_struct1{.data1 = 3, .data2 = 13} }; vector arr2 = { test_data_struct1{.data1 = 4, .data2 = 14}, test_data_struct1{.data1 = 5, .data2 = 15}, test_data_struct1{.data1 = 6, .data2 = 16} }; vector arr3 = { test_data_struct1{.data1 = 7, .data2 = 17}, test_data_struct1{.data1 = 8, .data2 = 18}, test_data_struct1{.data1 = 9, .data2 = 19} }; vector res; vector> inp{arr1, arr2, arr3}; vector corr{ test_data_struct1{.data1 = 1, .data2 = 11}, test_data_struct1{.data1 = 2, .data2 = 12}, test_data_struct1{.data1 = 3, .data2 = 13}, test_data_struct1{.data1 = 4, .data2 = 14}, test_data_struct1{.data1 = 5, .data2 = 15}, test_data_struct1{.data1 = 6, .data2 = 16}, test_data_struct1{.data1 = 7, .data2 = 17}, test_data_struct1{.data1 = 8, .data2 = 18}, test_data_struct1{.data1 = 9, .data2 = 19} }; algorithm::sort::merge_arrays(inp, [](const struct test_data_struct1 &a, const struct test_data_struct1 &b) { return a.data1 < b.data1; }, res); BOOST_CHECK(corr.size() == res.size()); for (size_t i = 0; i < corr.size(); i++) { BOOST_CHECK(res[i].data1 == corr[i].data1 && res[i].data2 == corr[i].data2); } } } BOOST_AUTO_TEST_SUITE_END() ================================================ FILE: tests/test_sum_sorted.cpp ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 
Josef Cullhed, , et al. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/
#include
#include
#include "algorithm/sum_sorted.h"
#include "indexer/counted_record.h"

using namespace std;

/*
 * Tests for ::algorithm::sum_sorted. Judging by the expectations below, the function takes several
 * sorted lists and returns one sorted list in which equal elements have been folded together with
 * the supplied callback.
 *
 * NOTE(review): according to the Boost.Test documentation the tolerance decorator is only consulted
 * by BOOST_TEST. The floating point checks in this suite use BOOST_CHECK_EQUAL, which looks like an
 * exact comparison - confirm whether that is intended.
 */
BOOST_AUTO_TEST_SUITE(test_sum_sorted, * boost::unit_test::tolerance(0.00001))

// Ints, longest list first: 1 occurs once, 2 twice and 3 three times -> {1, 4, 9}.
BOOST_AUTO_TEST_CASE(test_sum_sorted1) {
	vector> sorted = {
		{1, 2, 3},
		{2, 3},
		{3}
	};
	vector res = ::algorithm::sum_sorted(sorted, [](int &a, const int &b) {
		a += b;
	});

	BOOST_REQUIRE(res.size() == 3);
	BOOST_CHECK(res[0] == 1);
	BOOST_CHECK(res[1] == 4);
	BOOST_CHECK(res[2] == 9);
}

// Same data as test_sum_sorted1 with the lists in the opposite order; the result must not change.
BOOST_AUTO_TEST_CASE(test_sum_sorted2) {
	vector> sorted = {
		{3},
		{2, 3},
		{1, 2, 3},
	};
	vector res = ::algorithm::sum_sorted(sorted, [](int &a, const int &b) {
		a += b;
	});

	BOOST_REQUIRE(res.size() == 3);
	BOOST_CHECK(res[0] == 1);
	BOOST_CHECK(res[1] == 4);
	BOOST_CHECK(res[2] == 9);
}

// counted_record elements: the callback accumulates m_score of records that compare equal.
BOOST_AUTO_TEST_CASE(test_sum_sorted3) {
	vector> sorted = {
		{indexer::counted_record(3, 0.1)},
		{indexer::counted_record(2, 0.1), indexer::counted_record(3, 0.1)},
		{indexer::counted_record(1, 0.1), indexer::counted_record(2, 0.1), indexer::counted_record(3, 0.1)},
	};
	vector res = ::algorithm::sum_sorted(sorted, [](indexer::counted_record &a, const indexer::counted_record &b) {
		a.m_score += b.m_score;
	});

	BOOST_REQUIRE(res.size() == 3);
	BOOST_CHECK_EQUAL(res[0].m_score, 0.1f);
	BOOST_CHECK_EQUAL(res[1].m_score, 0.2f);
	BOOST_CHECK_EQUAL(res[2].m_score, 0.3f);
}

// Mostly disjoint lists: only the records with value 1 and value 25 occur twice and get summed.
BOOST_AUTO_TEST_CASE(test_sum_sorted4) {
	vector> sorted = {
		{indexer::counted_record(1, 0.1), indexer::counted_record(2, 0.2), indexer::counted_record(3, 0.3)},
		{indexer::counted_record(10, 0.4), indexer::counted_record(25, 0.5), indexer::counted_record(30, 0.6)},
		{indexer::counted_record(1, 0.7), indexer::counted_record(25, 0.8), indexer::counted_record(40, 0.9)},
	};
	vector res = ::algorithm::sum_sorted(sorted, [](indexer::counted_record &a, const indexer::counted_record &b) {
		a.m_score += b.m_score;
	});

	BOOST_REQUIRE(res.size() == 7);

	// Scores: 0.1 + 0.7 for value 1 and 0.5 + 0.8 for value 25, all others unchanged.
	BOOST_CHECK_EQUAL(res[0].m_score, 0.8f);
	BOOST_CHECK_EQUAL(res[1].m_score, 0.2f);
	BOOST_CHECK_EQUAL(res[2].m_score, 0.3f);
	BOOST_CHECK_EQUAL(res[3].m_score, 0.4f);
	BOOST_CHECK_EQUAL(res[4].m_score, 1.3f);
	BOOST_CHECK_EQUAL(res[5].m_score, 0.6f);
	BOOST_CHECK_EQUAL(res[6].m_score, 0.9f);

	// Values come back in ascending order with duplicates removed.
	BOOST_CHECK_EQUAL(res[0].m_value, 1);
	BOOST_CHECK_EQUAL(res[1].m_value, 2);
	BOOST_CHECK_EQUAL(res[2].m_value, 3);
	BOOST_CHECK_EQUAL(res[3].m_value, 10);
	BOOST_CHECK_EQUAL(res[4].m_value, 25);
	BOOST_CHECK_EQUAL(res[5].m_value, 30);
	BOOST_CHECK_EQUAL(res[6].m_value, 40);
}

// An empty list among the inputs must not disturb the result.
BOOST_AUTO_TEST_CASE(test_sum_sorted5) {
	vector> sorted = {
		{1, 2, 3},
		{}
	};
	vector res = ::algorithm::sum_sorted(sorted, [](int &a, const int &b) {
		a += b;
	});

	BOOST_REQUIRE(res.size() == 3);
	BOOST_CHECK(res[0] == 1);
	BOOST_CHECK(res[1] == 2);
	BOOST_CHECK(res[2] == 3);
}

BOOST_AUTO_TEST_SUITE_END()
================================================ FILE: tests/test_text.cpp ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include #include "text/text.h" using namespace std; BOOST_AUTO_TEST_SUITE(test_text) BOOST_AUTO_TEST_CASE(get_full_text_words) { { vector words = text::get_full_text_words("C++ map. is the, best thing"); BOOST_CHECK_EQUAL(words[0], "c++"); BOOST_CHECK_EQUAL(words[1], "map"); BOOST_CHECK_EQUAL(words[2], "is"); BOOST_CHECK_EQUAL(words[3], "the"); BOOST_CHECK_EQUAL(words[4], "best"); BOOST_CHECK_EQUAL(words[5], "thing"); } { vector words = text::get_full_text_words("C# is also good."); BOOST_CHECK_EQUAL(words[0], "c#"); BOOST_CHECK_EQUAL(words[1], "is"); BOOST_CHECK_EQUAL(words[2], "also"); BOOST_CHECK_EQUAL(words[3], "good"); } } BOOST_AUTO_TEST_CASE(get_tokens) { vector tokens = text::get_tokens("My name is Josef Cullhed"); vector targets = { algorithm::hash("my"), algorithm::hash("name"), algorithm::hash("is"), algorithm::hash("josef"), algorithm::hash("cullhed"), }; BOOST_CHECK(tokens == targets); } BOOST_AUTO_TEST_CASE(get_tokens2) { vector tokens = text::get_tokens("Test. Ing! the test +function+"); vector targets = { algorithm::hash("test"), algorithm::hash("ing"), algorithm::hash("the"), algorithm::hash("test"), algorithm::hash("+function+"), }; BOOST_CHECK(tokens == targets); } BOOST_AUTO_TEST_CASE(get_tokens3) { vector tokens = text::get_expanded_full_text_tokens("Test. Ing! 
the test +func-tion+"); vector targets = { algorithm::hash("test"), algorithm::hash("ing"), algorithm::hash("the"), algorithm::hash("test"), algorithm::hash("+func-tion+"), algorithm::hash("+func"), algorithm::hash("tion+"), }; BOOST_CHECK(tokens == targets); } BOOST_AUTO_TEST_CASE(get_snippets) { { vector snippets = text::get_snippets("A small text that should fit in one snippet"); BOOST_REQUIRE(snippets.size() == 1); BOOST_CHECK(snippets[0] == "A small text that should fit in one snippet"); } { vector snippets = text::get_snippets(" The zlib compression library provides in-memory compression and decompression functions, including integrity checks of the uncompressed data. This version of the library supports only one compression method (deflation) but other algorithms will be added later and will have the same stream interface. Compression can be done in a single step if the buffers are large enough (for example if an input file is mmap'ed), or can be done by repeated calls of the compression function. In the latter case, the application must provide more input and/or consume the output (providing more output space) before each call. "); BOOST_REQUIRE(snippets.size() == 3); } } BOOST_AUTO_TEST_CASE(get_words_without_stopwords) { vector words = text::get_words_without_stopwords("Hej asd!asd jag, heter! !josef. 
cullhed \ jfoidjfoai823hr9hfhwe9f8hshgohewogiqhoih"); BOOST_CHECK_EQUAL(words.size(), 8); BOOST_CHECK_EQUAL(words[0], "hej"); BOOST_CHECK_EQUAL(words[1], "asd"); BOOST_CHECK_EQUAL(words[2], "asd"); BOOST_CHECK_EQUAL(words[3], "jag"); BOOST_CHECK_EQUAL(words[4], "heter"); BOOST_CHECK_EQUAL(words[5], "josef"); BOOST_CHECK_EQUAL(words[6], "cullhed"); BOOST_CHECK_EQUAL(words[7], "jfoidjfoai823hr9hfhwe9f8hshgohewogiqhoih"); } BOOST_AUTO_TEST_CASE(clean_word) { BOOST_CHECK_EQUAL(text::clean_word("hej"), "hej"); BOOST_CHECK_EQUAL(text::clean_word("åäö"), "åäö"); BOOST_CHECK_EQUAL(text::clean_word("123"), "123"); BOOST_CHECK_EQUAL(text::clean_word("$Üç"), ""); BOOST_CHECK_EQUAL(text::clean_word("hejç"), "hej"); BOOST_CHECK_EQUAL(text::clean_word("açd"), "ad"); BOOST_CHECK(text::is_clean_word("hej")); BOOST_CHECK(text::is_clean_word("åäö")); BOOST_CHECK(text::is_clean_word("123")); BOOST_CHECK(!text::is_clean_word("$Üç")); BOOST_CHECK(!text::is_clean_word("hejç")); BOOST_CHECK(!text::is_clean_word("açd")); BOOST_CHECK_EQUAL(text::get_words_without_stopwords("hej")[0], "hej"); BOOST_CHECK_EQUAL(text::get_words_without_stopwords("åäö")[0], "åäö"); BOOST_CHECK_EQUAL(text::get_words_without_stopwords("123")[0], "123"); BOOST_CHECK_EQUAL(text::get_words_without_stopwords("$Üç").size(), 0); BOOST_CHECK_EQUAL(text::get_words_without_stopwords("hejç").size(), 0); BOOST_CHECK_EQUAL(text::get_words_without_stopwords("açd").size(), 0); BOOST_CHECK(text::get_words_without_stopwords("hej josef") == vector({"hej", "josef"})); BOOST_CHECK(text::get_words_without_stopwords("hej, josef!") == vector({"hej", "josef"})); BOOST_CHECK(text::get_words_without_stopwords("hej jÜsef cullhed du är bäst") == vector({"hej", "cullhed", "du", "bäst"})); BOOST_CHECK(text::get_words_without_stopwords("Låna! 
(Pengar till bilar)") == vector({"låna", "pengar", "bilar"})); BOOST_CHECK(text::get_words_without_stopwords("Dallas Swarner | Character | zKillboard", 3) == vector({"dallas", "swarner", "character"})); BOOST_CHECK(text::get_words_without_stopwords("Tapis Fleur des Champs Moutarde | Zen Dos", 3) == vector({"tapis", "fleur", "des"})); BOOST_CHECK(text::get_words_without_stopwords("Gina Osorno & The Dreamers", 3) == vector({"gina", "osorno", "dreamers"})); BOOST_CHECK(text::get_words_without_stopwords("IMG_2190 | Zhenyu (Tony) Tian") == vector({"zhenyu", "tony", "tian"})); BOOST_CHECK(text::get_words_without_stopwords("Tills alla dör - Diamant Salihu - Bok (9789189061842) | Bokus", 3) == vector({"tills", "dör", "diamant"})); BOOST_CHECK(text::get_words_without_stopwords("Messages postés par Prechan • Forum • Zeste de Savoir", 3) == vector({"messages", "par", "prechan"})); BOOST_CHECK(text::get_words_without_stopwords("Science SARU – 紙本分格") == vector({"science", "saru"})); BOOST_CHECK(text::get_words_without_stopwords("Realiteti i trishtë shqiptar përmes fotove të gazetarit gjerman që komunizmi nuk i lejoi \ të bëheshin publike | Gazeta Malesia", 3) == vector({"realiteti", "shqiptar", "fotove"})); BOOST_CHECK(text::get_words_without_stopwords("York County, VA") == vector({"york", "county", "va"})); BOOST_CHECK(text::get_words_without_stopwords("HTML Sitemap 14 - zfreeti.com", 3) == vector({"html", "sitemap", "14"})); BOOST_CHECK(text::get_words_without_stopwords("HTML Sitemap 14 - zfreeti.com") == vector({"html", "sitemap", "14"})); BOOST_CHECK(text::get_words_without_stopwords("Archives.com zfreeti.com best. stream. 
in .the world") == vector({"best", "stream", "world"})); } BOOST_AUTO_TEST_CASE(word_freq, * boost::unit_test::tolerance(0.00001)) { auto freq = text::get_word_frequency("hello my name is josef and it is good"); BOOST_TEST(freq["hello"] == 1.0/9.0); BOOST_TEST(freq["is"] == 2.0/9.0); } BOOST_AUTO_TEST_SUITE_END() ================================================ FILE: tests/test_thread_pool.cpp ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/
#include
#include "utils/thread_pool.hpp"
#include "profiler/profiler.h"

using namespace std;

/*
 * Tests for utils::thread_pool. The second and third test measure wall clock time and therefore
 * depend on the machine not being heavily loaded.
 */
BOOST_AUTO_TEST_SUITE(test_thread_pool)

// Ten jobs each increment their own element of vec; after run_all every element must be 1.
BOOST_AUTO_TEST_CASE(thread_pool) {

	utils::thread_pool pool(10);

	vector vec(10);

	for (int &i : vec) {
		// Each job captures a reference to a distinct element, so jobs do not share data.
		pool.enqueue([&i]() {
			i++;
		});
	}

	pool.run_all();

	for (int i : vec) {
		BOOST_CHECK(i == 1);
	}
}

/*
 * 24 jobs that sleep 200ms each on a pool constructed with 12. The time spent in run_all has to
 * stay below (200*2 + 10)*1000, i.e. two rounds of 200ms plus 10ms of slack - assuming
 * profiler::now_micro returns microseconds (TODO confirm).
 */
BOOST_AUTO_TEST_CASE(thread_pool2) {

	utils::thread_pool pool(12);

	vector vec(24);

	for (int &i : vec) {
		pool.enqueue([&i]() {
			std::this_thread::sleep_for(200ms);
			i = 1;
		});
	}

	double now = profiler::now_micro();
	pool.run_all();
	double dt = profiler::now_micro() - now;

	BOOST_CHECK(dt < (200*2 + 10)*1000);

	for (int i : vec) {
		BOOST_CHECK(i == 1);
	}
}

/*
 * Test limit of queue length. The idea here is that if you pass a second parameter to the pool
 * you get a maximum queue length. Then if the workers are all working and the queue is full
 * the next call to enqueue will wait for the queue to become smaller.
 *
 * This is useful if you want X workers to work but you don't want to fill up the queue because of
 * limited memory.
 *
 * NOTE(review): vec is read after a 300ms sleep and before pool.run_all() is called, so the test
 * relies on timing for the jobs to have finished - confirm that this is acceptable.
 * */
BOOST_AUTO_TEST_CASE(thread_pool3) {

	utils::thread_pool pool(4, 1);

	vector vec(4);

	int idx = 1;
	for (int &i : vec) {
		// idx is captured by value so every job stores its own 1-based position.
		pool.enqueue([&i, idx]() {
			std::this_thread::sleep_for(200ms);
			i = idx;
		});
		// Allow some time for the work to be picked from the queue.
		std::this_thread::sleep_for(10ms);
		idx++;
	}

	// Now the 4 workers are working.
	// Enqueue one more. It should fit in the queue (maximum length 1) without waiting.
	double now1 = profiler::now_micro();
	pool.enqueue([]() {
		std::this_thread::sleep_for(200ms);
	});
	double now2 = profiler::now_micro();

	// Should be quick.
	BOOST_CHECK(now2 - now1 < 10 * 1000); // < 10 milliseconds.

	// Now the next enqueue should wait around 200ms, the queue is full until a worker becomes free.
	double now3 = profiler::now_micro();
	pool.enqueue([]() {
		std::this_thread::sleep_for(200ms);
	});
	double now4 = profiler::now_micro();

	BOOST_CHECK(now4 - now3 > 180 * 1000);

	std::this_thread::sleep_for(300ms);

	// All threads should be done now, element k of vec should hold k + 1.
	idx = 1;
	for (int i : vec) {
		BOOST_CHECK(i == idx);
		idx++;
	}

	pool.run_all();
}

BOOST_AUTO_TEST_SUITE_END()
================================================ FILE: tests/test_top_k.cpp ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE.
*/
#include
#include

/*
 * Tests for ::algorithm::top_k. The order of the returned elements is not part of the contract that
 * is tested here: the exact-match tests accept both orders.
 *
 * NOTE(review): test_5, test_6 and test_8 only inspect the returned elements and would also pass
 * for an empty result - confirm whether a size check is wanted.
 */
BOOST_AUTO_TEST_SUITE(test_top_k)

// Even number of elements, k = 2: the two largest values in any order.
BOOST_AUTO_TEST_CASE(test_1) {
	const std::vector res = ::algorithm::top_k({1,2,3,4,5,6}, 2);
	bool is_correct = (res == std::vector{5,6} || res == std::vector{6,5});
	BOOST_CHECK(is_correct);
}

// Odd number of elements, k = 2: the two largest values in any order.
BOOST_AUTO_TEST_CASE(test_2) {
	const std::vector res = ::algorithm::top_k({1,2,3,4,5,6,7}, 2);
	bool is_correct = (res == std::vector{6,7} || res == std::vector{7,6});
	BOOST_CHECK(is_correct);
}

// Empty input gives an empty result.
BOOST_AUTO_TEST_CASE(test_3) {
	const std::vector res = ::algorithm::top_k({}, 2);
	bool is_correct = (res == std::vector{});
	BOOST_CHECK(is_correct);
}

// Unsorted input, k = 2.
BOOST_AUTO_TEST_CASE(test_4) {
	const std::vector res = ::algorithm::top_k({2,3,1}, 2);
	bool is_correct = (res == std::vector{2,3} || res == std::vector{3,2});
	BOOST_CHECK(is_correct);
}

// Input with duplicates, k = 3: the three largest are 8, 7 and 5, so nothing below 5 may be returned.
BOOST_AUTO_TEST_CASE(test_5) {
	const std::vector res = ::algorithm::top_k({7,5,3,4,4,8,4,1,1,3,4}, 3);
	bool is_correct = true;
	for (int i : res) {
		if (i < 5) is_correct = false;
	}
	BOOST_CHECK(is_correct);
}

// Same input, k = 6: the result has to be filled up with 4s, so nothing below 4 may be returned.
BOOST_AUTO_TEST_CASE(test_6) {
	const std::vector res = ::algorithm::top_k({7,5,3,4,4,8,4,1,1,3,4}, 6);
	bool is_correct = true;
	for (int i : res) {
		if (i < 4) is_correct = false;
	}
	BOOST_CHECK(is_correct);
}

// k = 1 returns only the maximum.
BOOST_AUTO_TEST_CASE(test_7) {
	const std::vector res = ::algorithm::top_k({1,3,0,1,4,3,9,2,0,3}, 1);
	bool is_correct = res == std::vector{9};
	BOOST_CHECK(is_correct);
}

// With the comparator a > b the three smallest values (0, 0, 1) are expected: nothing above 1.
BOOST_AUTO_TEST_CASE(test_8) {
	const std::vector res = ::algorithm::top_k({1,3,0,1,4,3,9,2,0,3}, 3, [](const int &a, const int &b) {
		return a > b;
	});
	bool is_correct = true;
	for (int i : res) {
		if (i > 1) is_correct = false;
	}
	BOOST_CHECK(is_correct);
}

BOOST_AUTO_TEST_SUITE_END()
================================================ FILE: tests/test_unicode.cpp ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al.
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */
#include
#include "parser/unicode.h"

/*
 * Tests for parser::unicode::encode and parser::unicode::is_valid.
 */
BOOST_AUTO_TEST_SUITE(unicode)

/*
 * The first three inputs (ascii, Swedish characters with '$' and a euro sign, Chinese) are expected
 * to come back from encode unchanged. The last input contains broken characters; for that one the
 * test only requires that the output of encode passes is_valid.
 */
BOOST_AUTO_TEST_CASE(unicode) {
	BOOST_CHECK_EQUAL(parser::unicode::encode("hej jag heter josef"), "hej jag heter josef");
	BOOST_CHECK_EQUAL(parser::unicode::encode("hej jag heter josef och jag tillåter utf8 åäö chars$€"), "hej jag heter josef och jag tillåter utf8 åäö chars$€");
	BOOST_CHECK_EQUAL(parser::unicode::encode("是美国民主党政治家,于19世纪下半叶担"), "是美国民主党政治家,于19世纪下半叶担");

	BOOST_CHECK(parser::unicode::is_valid(parser::unicode::encode("L�gg i varukorg Om produkten Specifikation Anv�ndning Våra bönor är \ rika på protein, mineraler och fibrer. Smaken är söt och konsistensen le")));
}

BOOST_AUTO_TEST_SUITE_END()
================================================ FILE: tests/test_url.cpp ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al.
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #include #include "URL.h" using namespace std; BOOST_AUTO_TEST_SUITE(test_url) BOOST_AUTO_TEST_CASE(basic) { BOOST_CHECK_EQUAL(URL("https://www.facebook.com/test.html?key=value").str(), "https://www.facebook.com/test.html?key=value"); { URL url("https://www.facebook.com/test.html?key=value"); url.set_scheme("http"); url.set_www(false); BOOST_CHECK_EQUAL(url.str(), "http://facebook.com/test.html?key=value"); url.set_scheme("https"); url.set_www(true); BOOST_CHECK_EQUAL(url.str(), "https://www.facebook.com/test.html?key=value"); } } BOOST_AUTO_TEST_CASE(url_parsing) { { URL url("https://www.facebook.com/test.html?key=value"); BOOST_CHECK_EQUAL(url.str(), "https://www.facebook.com/test.html?key=value"); BOOST_CHECK_EQUAL(url.domain_without_tld(), "facebook"); BOOST_CHECK_EQUAL(url.host(), "facebook.com"); BOOST_CHECK_EQUAL(url.host_reverse(), "com.facebook"); BOOST_CHECK_EQUAL(url.scheme(), "https"); BOOST_CHECK_EQUAL(url.path(), "/test.html"); BOOST_CHECK_EQUAL(url.path_with_query(), "/test.html?key=value"); BOOST_CHECK_EQUAL(url.size(), strlen("https://www.facebook.com/test.html?key=value")); BOOST_CHECK_EQUAL(url.has_https(), true); BOOST_CHECK_EQUAL(url.has_www(), true); auto query = url.query(); BOOST_CHECK_EQUAL(query.size(), 1); BOOST_CHECK_EQUAL(query["key"], "value"); } { URL url("http://example.com/"); BOOST_CHECK_EQUAL(url.has_https(), false); BOOST_CHECK_EQUAL(url.has_www(), false); } { URL url("http://example.com/"); BOOST_CHECK_EQUAL(url.path(), "/"); } { URL url("http://example.com"); BOOST_CHECK_EQUAL(url.path(), "/"); } } BOOST_AUTO_TEST_CASE(url_parsing2) { URL url("https://github.com/joscul/alexandria/blob/main/tests/File.h"); BOOST_CHECK_EQUAL(url.domain_without_tld(), "github"); BOOST_CHECK_EQUAL(url.host(), "github.com"); BOOST_CHECK_EQUAL(url.scheme(), "https"); BOOST_CHECK_EQUAL(url.path(), "/joscul/alexandria/blob/main/tests/File.h"); BOOST_CHECK_EQUAL(url.path_with_query(), "/joscul/alexandria/blob/main/tests/File.h"); auto query = 
url.query(); BOOST_CHECK_EQUAL(query.size(), 0); } BOOST_AUTO_TEST_CASE(hash) { URL url("https://github.com/joscul/alexandria/blob/main/tests/File.h"); size_t hash1 = URL("https://github.com/joscul/alexandria/blob/main/tests/File.h").hash(); size_t hash2 = URL("https://github.com/joscul/alexandria/blob/main/tests/File.h?query=param").hash(); size_t hash3 = URL("https://github.com/joscul/alexandria/blob/main/tests/File.h?hej=hopp").hash(); size_t hash4 = URL("https://www.github.com/joscul/alexandria/blob/main/tests/File.h?hej=hopp").hash(); size_t hash5 = URL("http://github.com/joscul/alexandria/blob/main/tests/File.h?hej=hopp").hash(); BOOST_CHECK(hash1 != hash2); BOOST_CHECK(hash2 != hash3); BOOST_CHECK(hash3 == hash4); BOOST_CHECK(hash4 == hash5); } BOOST_AUTO_TEST_CASE(unescape) { { URL url("https://github.com/?q=test%20test"); map query = url.query(); BOOST_CHECK_EQUAL(query["q"], "test test"); } { URL url("https://github.com/?q=test%2020"); map query = url.query(); BOOST_CHECK_EQUAL(query["q"], "test 20"); } { URL url("https://github.com/search?q=targumical&cp=0&hl=en-US&pq=%targumical%&sourceid=chrome&ie=UTF-8"); map query = url.query(); BOOST_CHECK_EQUAL(query["pq"], "%targumical%"); } { URL url("https://github.com/search?q=stress%%c3%C3%a5%C3%A4%c3%b6%0G"); map query = url.query(); BOOST_CHECK_EQUAL(query["q"], "stress%c3åäö%0G"); } { // Test double encoding. URL url("https://github.com/search?q=%25C3%25A5%25C3%25A4%25C3%25B6"); map query = url.query(); BOOST_CHECK_EQUAL(query["q"], "%C3%A5%C3%A4%C3%B6"); } { // Test double encoding. 
URL url("https://github.com/search?q=%josef%0"); map query = url.query(); BOOST_CHECK_EQUAL(query["q"], "%josef%0"); } } BOOST_AUTO_TEST_CASE(host_top_domain) { { URL url("https://test.uk"); BOOST_CHECK_EQUAL(url.host_top_domain(), "test.uk"); } { URL url("https://testing.com.au"); BOOST_CHECK_EQUAL(url.host_top_domain(), "testing.com.au"); } { URL url("https://subdomain.testing.com.au"); BOOST_CHECK_EQUAL(url.host_top_domain(), "testing.com.au"); } { URL url("https://github.com/"); BOOST_CHECK_EQUAL(url.host_top_domain(), "github.com"); } { URL url("https://test.github.com/"); BOOST_CHECK_EQUAL(url.host_top_domain(), "github.com"); } { URL url("https://bbc.co.uk/"); BOOST_CHECK_EQUAL(url.host_top_domain(), "bbc.co.uk"); } { URL url("https://testing.bbc.co.uk/"); BOOST_CHECK_EQUAL(url.host_top_domain(), "bbc.co.uk"); } { URL url("."); BOOST_CHECK_EQUAL(url.host_top_domain(), ""); } { URL url(""); BOOST_CHECK_EQUAL(url.host_top_domain(), ""); } } BOOST_AUTO_TEST_SUITE_END() ================================================ FILE: tests/test_url_record.cpp ================================================ /* * MIT License * * Alexandria.org * * Copyright (c) 2021 Josef Cullhed, , et al. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */
#include
#include "indexer/url_record.h"

using namespace std;

/*
 * Tests for indexer::url_record.
 */
BOOST_AUTO_TEST_SUITE(test_url_record)

// url_length(x) stores a value and url_length() reads it back; checked for 442, 4 and 0.
BOOST_AUTO_TEST_CASE(basic) {
	indexer::url_record record(123ull);

	record.url_length(442);
	BOOST_CHECK_EQUAL(record.url_length(), 442);

	// Overwriting with a smaller value has to replace the previous one completely.
	record.url_length(4);
	BOOST_CHECK_EQUAL(record.url_length(), 4);

	record.url_length(0);
	BOOST_CHECK_EQUAL(record.url_length(), 0);
}

BOOST_AUTO_TEST_SUITE_END()