Repository: PhilipHazel/pcre2 Branch: main Commit: 4f460e5edaa6 Files: 496 Total size: 14.2 MB Directory structure: gitextract_ehge08ch/ ├── .editorconfig ├── .gitattributes ├── .github/ │ ├── codecov.yml │ ├── dependabot.yml │ ├── scripts/ │ │ └── merge_sarif.py │ └── workflows/ │ ├── build.yml │ ├── cifuzz.yml │ ├── clang-analyzer.yml │ ├── codeql.yml │ ├── dev.yml │ ├── pages.yml │ ├── scorecards.yml │ └── sync.yml ├── .gitignore ├── .gitmodules ├── AUTHORS.md ├── BUILD.bazel ├── CMakeLists.txt ├── COPYING ├── ChangeLog ├── HACKING ├── INSTALL ├── LICENCE.md ├── MODULE.bazel ├── Makefile.am ├── NEWS ├── NON-AUTOTOOLS-BUILD ├── README ├── README.md ├── RunGrepTest ├── RunGrepTest.bat ├── RunTest ├── RunTest.bat ├── SECURITY.md ├── autogen.sh ├── build.zig ├── cmake/ │ ├── COPYING-CMAKE-SCRIPTS │ ├── FindEditline.cmake │ ├── FindReadline.cmake │ ├── PCRE2CheckVscript.cmake │ ├── PCRE2UseSystemExtensions.cmake │ ├── PCRE2WarningAsError.cmake │ └── pcre2-config.cmake.in ├── configure.ac ├── doc/ │ ├── html/ │ │ ├── NON-AUTOTOOLS-BUILD.txt │ │ ├── README.txt │ │ ├── index.html │ │ ├── pcre2-config.html │ │ ├── pcre2.html │ │ ├── pcre2_callout_enumerate.html │ │ ├── pcre2_code_copy.html │ │ ├── pcre2_code_copy_with_tables.html │ │ ├── pcre2_code_free.html │ │ ├── pcre2_compile.html │ │ ├── pcre2_compile_context_copy.html │ │ ├── pcre2_compile_context_create.html │ │ ├── pcre2_compile_context_free.html │ │ ├── pcre2_config.html │ │ ├── pcre2_convert_context_copy.html │ │ ├── pcre2_convert_context_create.html │ │ ├── pcre2_convert_context_free.html │ │ ├── pcre2_converted_pattern_free.html │ │ ├── pcre2_dfa_match.html │ │ ├── pcre2_general_context_copy.html │ │ ├── pcre2_general_context_create.html │ │ ├── pcre2_general_context_free.html │ │ ├── pcre2_get_error_message.html │ │ ├── pcre2_get_mark.html │ │ ├── pcre2_get_match_data_heapframes_size.html │ │ ├── pcre2_get_match_data_size.html │ │ ├── pcre2_get_ovector_count.html │ │ ├── pcre2_get_ovector_pointer.html │ │ 
├── pcre2_get_startchar.html │ │ ├── pcre2_jit_compile.html │ │ ├── pcre2_jit_free_unused_memory.html │ │ ├── pcre2_jit_match.html │ │ ├── pcre2_jit_stack_assign.html │ │ ├── pcre2_jit_stack_create.html │ │ ├── pcre2_jit_stack_free.html │ │ ├── pcre2_maketables.html │ │ ├── pcre2_maketables_free.html │ │ ├── pcre2_match.html │ │ ├── pcre2_match_context_copy.html │ │ ├── pcre2_match_context_create.html │ │ ├── pcre2_match_context_free.html │ │ ├── pcre2_match_data_create.html │ │ ├── pcre2_match_data_create_from_pattern.html │ │ ├── pcre2_match_data_free.html │ │ ├── pcre2_next_match.html │ │ ├── pcre2_pattern_convert.html │ │ ├── pcre2_pattern_info.html │ │ ├── pcre2_serialize_decode.html │ │ ├── pcre2_serialize_encode.html │ │ ├── pcre2_serialize_free.html │ │ ├── pcre2_serialize_get_number_of_codes.html │ │ ├── pcre2_set_bsr.html │ │ ├── pcre2_set_callout.html │ │ ├── pcre2_set_character_tables.html │ │ ├── pcre2_set_compile_extra_options.html │ │ ├── pcre2_set_compile_recursion_guard.html │ │ ├── pcre2_set_depth_limit.html │ │ ├── pcre2_set_glob_escape.html │ │ ├── pcre2_set_glob_separator.html │ │ ├── pcre2_set_heap_limit.html │ │ ├── pcre2_set_match_limit.html │ │ ├── pcre2_set_max_pattern_compiled_length.html │ │ ├── pcre2_set_max_pattern_length.html │ │ ├── pcre2_set_max_varlookbehind.html │ │ ├── pcre2_set_newline.html │ │ ├── pcre2_set_offset_limit.html │ │ ├── pcre2_set_optimize.html │ │ ├── pcre2_set_parens_nest_limit.html │ │ ├── pcre2_set_recursion_limit.html │ │ ├── pcre2_set_recursion_memory_management.html │ │ ├── pcre2_set_substitute_callout.html │ │ ├── pcre2_set_substitute_case_callout.html │ │ ├── pcre2_substitute.html │ │ ├── pcre2_substring_copy_byname.html │ │ ├── pcre2_substring_copy_bynumber.html │ │ ├── pcre2_substring_free.html │ │ ├── pcre2_substring_get_byname.html │ │ ├── pcre2_substring_get_bynumber.html │ │ ├── pcre2_substring_length_byname.html │ │ ├── pcre2_substring_length_bynumber.html │ │ ├── pcre2_substring_list_free.html │ │ 
├── pcre2_substring_list_get.html │ │ ├── pcre2_substring_nametable_scan.html │ │ ├── pcre2_substring_number_from_name.html │ │ ├── pcre2api.html │ │ ├── pcre2build.html │ │ ├── pcre2callout.html │ │ ├── pcre2compat.html │ │ ├── pcre2convert.html │ │ ├── pcre2demo.html │ │ ├── pcre2grep.html │ │ ├── pcre2jit.html │ │ ├── pcre2limits.html │ │ ├── pcre2matching.html │ │ ├── pcre2partial.html │ │ ├── pcre2pattern.html │ │ ├── pcre2perform.html │ │ ├── pcre2posix.html │ │ ├── pcre2sample.html │ │ ├── pcre2serialize.html │ │ ├── pcre2syntax.html │ │ ├── pcre2test.html │ │ └── pcre2unicode.html │ ├── index.html.src │ ├── pcre2-config.1 │ ├── pcre2-config.txt │ ├── pcre2.3 │ ├── pcre2.txt │ ├── pcre2_callout_enumerate.3 │ ├── pcre2_code_copy.3 │ ├── pcre2_code_copy_with_tables.3 │ ├── pcre2_code_free.3 │ ├── pcre2_compile.3 │ ├── pcre2_compile_context_copy.3 │ ├── pcre2_compile_context_create.3 │ ├── pcre2_compile_context_free.3 │ ├── pcre2_config.3 │ ├── pcre2_convert_context_copy.3 │ ├── pcre2_convert_context_create.3 │ ├── pcre2_convert_context_free.3 │ ├── pcre2_converted_pattern_free.3 │ ├── pcre2_dfa_match.3 │ ├── pcre2_general_context_copy.3 │ ├── pcre2_general_context_create.3 │ ├── pcre2_general_context_free.3 │ ├── pcre2_get_error_message.3 │ ├── pcre2_get_mark.3 │ ├── pcre2_get_match_data_heapframes_size.3 │ ├── pcre2_get_match_data_size.3 │ ├── pcre2_get_ovector_count.3 │ ├── pcre2_get_ovector_pointer.3 │ ├── pcre2_get_startchar.3 │ ├── pcre2_jit_compile.3 │ ├── pcre2_jit_free_unused_memory.3 │ ├── pcre2_jit_match.3 │ ├── pcre2_jit_stack_assign.3 │ ├── pcre2_jit_stack_create.3 │ ├── pcre2_jit_stack_free.3 │ ├── pcre2_maketables.3 │ ├── pcre2_maketables_free.3 │ ├── pcre2_match.3 │ ├── pcre2_match_context_copy.3 │ ├── pcre2_match_context_create.3 │ ├── pcre2_match_context_free.3 │ ├── pcre2_match_data_create.3 │ ├── pcre2_match_data_create_from_pattern.3 │ ├── pcre2_match_data_free.3 │ ├── pcre2_next_match.3 │ ├── pcre2_pattern_convert.3 │ ├── 
pcre2_pattern_info.3 │ ├── pcre2_serialize_decode.3 │ ├── pcre2_serialize_encode.3 │ ├── pcre2_serialize_free.3 │ ├── pcre2_serialize_get_number_of_codes.3 │ ├── pcre2_set_bsr.3 │ ├── pcre2_set_callout.3 │ ├── pcre2_set_character_tables.3 │ ├── pcre2_set_compile_extra_options.3 │ ├── pcre2_set_compile_recursion_guard.3 │ ├── pcre2_set_depth_limit.3 │ ├── pcre2_set_glob_escape.3 │ ├── pcre2_set_glob_separator.3 │ ├── pcre2_set_heap_limit.3 │ ├── pcre2_set_match_limit.3 │ ├── pcre2_set_max_pattern_compiled_length.3 │ ├── pcre2_set_max_pattern_length.3 │ ├── pcre2_set_max_varlookbehind.3 │ ├── pcre2_set_newline.3 │ ├── pcre2_set_offset_limit.3 │ ├── pcre2_set_optimize.3 │ ├── pcre2_set_parens_nest_limit.3 │ ├── pcre2_set_recursion_limit.3 │ ├── pcre2_set_recursion_memory_management.3 │ ├── pcre2_set_substitute_callout.3 │ ├── pcre2_set_substitute_case_callout.3 │ ├── pcre2_substitute.3 │ ├── pcre2_substring_copy_byname.3 │ ├── pcre2_substring_copy_bynumber.3 │ ├── pcre2_substring_free.3 │ ├── pcre2_substring_get_byname.3 │ ├── pcre2_substring_get_bynumber.3 │ ├── pcre2_substring_length_byname.3 │ ├── pcre2_substring_length_bynumber.3 │ ├── pcre2_substring_list_free.3 │ ├── pcre2_substring_list_get.3 │ ├── pcre2_substring_nametable_scan.3 │ ├── pcre2_substring_number_from_name.3 │ ├── pcre2api.3 │ ├── pcre2build.3 │ ├── pcre2callout.3 │ ├── pcre2compat.3 │ ├── pcre2convert.3 │ ├── pcre2demo.3 │ ├── pcre2grep.1 │ ├── pcre2grep.txt │ ├── pcre2jit.3 │ ├── pcre2limits.3 │ ├── pcre2matching.3 │ ├── pcre2partial.3 │ ├── pcre2pattern.3 │ ├── pcre2perform.3 │ ├── pcre2posix.3 │ ├── pcre2sample.3 │ ├── pcre2serialize.3 │ ├── pcre2syntax.3 │ ├── pcre2test.1 │ ├── pcre2test.txt │ └── pcre2unicode.3 ├── libpcre2-16.pc.in ├── libpcre2-32.pc.in ├── libpcre2-8.pc.in ├── libpcre2-posix.pc.in ├── m4/ │ ├── ax_pthread.m4 │ ├── pcre2_check_vscript.m4 │ ├── pcre2_visibility.m4 │ └── pcre2_zos.m4 ├── maint/ │ ├── .gitignore │ ├── 132html │ ├── CheckMan │ ├── CheckTxt │ ├── CleanTxt │ ├── 
Detrail │ ├── FetchUcd.sh │ ├── FilterCoverage.py │ ├── GenerateCommon.py │ ├── GenerateTest.py │ ├── GenerateUcd.py │ ├── GenerateUcpHeader.py │ ├── GenerateUcpTables.py │ ├── LintMan │ ├── ManyConfigTests │ ├── README │ ├── RunCoverage │ ├── RunManifestTest │ ├── RunManifestTest.ps1 │ ├── RunPerlTest │ ├── RunSymbolTest │ ├── RunSymbolTest.ps1 │ ├── Unicode.tables/ │ │ ├── BidiMirroring.txt │ │ ├── CaseFolding.txt │ │ ├── DerivedBidiClass.txt │ │ ├── DerivedCoreProperties.txt │ │ ├── DerivedGeneralCategory.txt │ │ ├── GraphemeBreakProperty.txt │ │ ├── PropList.txt │ │ ├── PropertyAliases.txt │ │ ├── PropertyValueAliases.txt │ │ ├── ScriptExtensions.txt │ │ ├── Scripts.txt │ │ ├── UnicodeData.txt │ │ └── emoji-data.txt │ ├── UpdateAlways │ ├── UpdateCommon.py │ ├── UpdateDates.py │ ├── UpdateRelease.py │ ├── cmake-tests/ │ │ ├── build-interface/ │ │ │ ├── CMakeLists.txt │ │ │ └── main.c │ │ └── install-interface/ │ │ ├── CMakeLists.txt │ │ └── main.c │ ├── manifest-cmakeinstall-freebsd │ ├── manifest-cmakeinstall-linux │ ├── manifest-cmakeinstall-macos │ ├── manifest-cmakeinstall-solaris │ ├── manifest-cmakeinstall-windows │ ├── manifest-makeinstall-freebsd │ ├── manifest-makeinstall-linux │ ├── manifest-makeinstall-solaris │ ├── manifest-tarball │ ├── pcre2_chartables.c.non-standard │ ├── ucptest.c │ └── ucptestdata/ │ ├── testinput1 │ ├── testinput2 │ ├── testoutput1 │ └── testoutput2 ├── pcre2-config.in ├── perltest.sh ├── src/ │ ├── config-cmake.h.in │ ├── config.h.generic │ ├── libpcre2-16.sym.in │ ├── libpcre2-32.sym.in │ ├── libpcre2-8.sym.in │ ├── libpcre2-posix.sym.in │ ├── pcre2.h.generic │ ├── pcre2.h.in │ ├── pcre2_auto_possess.c │ ├── pcre2_chartables.c.dist │ ├── pcre2_chartables.c.ebcdic-1047-nl15 │ ├── pcre2_chartables.c.ebcdic-1047-nl25 │ ├── pcre2_chkdint.c │ ├── pcre2_compile.c │ ├── pcre2_compile.h │ ├── pcre2_compile_cgroup.c │ ├── pcre2_compile_class.c │ ├── pcre2_config.c │ ├── pcre2_context.c │ ├── pcre2_convert.c │ ├── pcre2_dfa_match.c │ 
├── pcre2_dftables.c │ ├── pcre2_error.c │ ├── pcre2_extuni.c │ ├── pcre2_find_bracket.c │ ├── pcre2_fuzzsupport.c │ ├── pcre2_internal.h │ ├── pcre2_intmodedep.h │ ├── pcre2_jit_char_inc.h │ ├── pcre2_jit_compile.c │ ├── pcre2_jit_match_inc.h │ ├── pcre2_jit_misc_inc.h │ ├── pcre2_jit_simd_inc.h │ ├── pcre2_jit_test.c │ ├── pcre2_maketables.c │ ├── pcre2_match.c │ ├── pcre2_match_data.c │ ├── pcre2_match_next.c │ ├── pcre2_newline.c │ ├── pcre2_ord2utf.c │ ├── pcre2_pattern_info.c │ ├── pcre2_printint_inc.h │ ├── pcre2_script_run.c │ ├── pcre2_serialize.c │ ├── pcre2_string_utils.c │ ├── pcre2_study.c │ ├── pcre2_substitute.c │ ├── pcre2_substring.c │ ├── pcre2_tables.c │ ├── pcre2_ucd.c │ ├── pcre2_ucp.h │ ├── pcre2_ucptables_inc.h │ ├── pcre2_util.h │ ├── pcre2_valid_utf.c │ ├── pcre2_xclass.c │ ├── pcre2demo.c │ ├── pcre2grep.c │ ├── pcre2posix.c │ ├── pcre2posix.h │ ├── pcre2posix_test.c │ ├── pcre2test.c │ └── pcre2test_inc.h ├── testdata/ │ ├── fuzzing/ │ │ ├── pcre2_fuzzer.dict │ │ ├── pcre2_fuzzer.options │ │ ├── pcre2_fuzzer_16.dict │ │ ├── pcre2_fuzzer_16.options │ │ ├── pcre2_fuzzer_32.dict │ │ └── pcre2_fuzzer_32.options │ ├── grepbinary │ ├── grepfilelist │ ├── grepinput │ ├── grepinput3 │ ├── grepinput8 │ ├── grepinputBad8 │ ├── grepinputBad8_Trail │ ├── grepinputC.bz2 │ ├── grepinputM │ ├── grepinputUN │ ├── grepinputv │ ├── grepinputx │ ├── greplist │ ├── greplistBad │ ├── grepnot.bz2 │ ├── grepoutput │ ├── grepoutput8 │ ├── grepoutputC │ ├── grepoutputCN │ ├── grepoutputCNU │ ├── grepoutputCU │ ├── grepoutputCbz2 │ ├── grepoutputCgz │ ├── grepoutputN │ ├── grepoutputUN │ ├── greppatN4 │ ├── testinput1 │ ├── testinput10 │ ├── testinput11 │ ├── testinput12 │ ├── testinput13 │ ├── testinput14 │ ├── testinput15 │ ├── testinput16 │ ├── testinput17 │ ├── testinput18 │ ├── testinput19 │ ├── testinput2 │ ├── testinput20 │ ├── testinput21 │ ├── testinput22 │ ├── testinput23 │ ├── testinput24 │ ├── testinput25 │ ├── testinput26 │ ├── testinput27 │ ├── 
testinput28 │ ├── testinput29 │ ├── testinput3 │ ├── testinput4 │ ├── testinput5 │ ├── testinput6 │ ├── testinput7 │ ├── testinput8 │ ├── testinput9 │ ├── testinputheap │ ├── testoutput1 │ ├── testoutput10 │ ├── testoutput11-16 │ ├── testoutput11-32 │ ├── testoutput12-16 │ ├── testoutput12-32 │ ├── testoutput13 │ ├── testoutput14-16 │ ├── testoutput14-32 │ ├── testoutput14-8 │ ├── testoutput15 │ ├── testoutput17 │ ├── testoutput18 │ ├── testoutput19 │ ├── testoutput2 │ ├── testoutput20 │ ├── testoutput21 │ ├── testoutput22-16 │ ├── testoutput22-32 │ ├── testoutput22-8 │ ├── testoutput23 │ ├── testoutput24 │ ├── testoutput25 │ ├── testoutput26 │ ├── testoutput27 │ ├── testoutput28 │ ├── testoutput29 │ ├── testoutput3 │ ├── testoutput3A │ ├── testoutput3B │ ├── testoutput3C │ ├── testoutput4 │ ├── testoutput5 │ ├── testoutput6 │ ├── testoutput7 │ ├── testoutput8-16-2 │ ├── testoutput8-16-4 │ ├── testoutput8-32-4 │ ├── testoutput8-8-2 │ ├── testoutput8-8-3 │ ├── testoutput8-8-4 │ ├── testoutput9 │ ├── testoutputheap-16 │ ├── testoutputheap-32 │ ├── testoutputheap-8 │ ├── valgrind-jit.supp │ ├── wintestinput3 │ └── wintestoutput3 └── vms/ ├── configure.com ├── openvms_readme.txt ├── pcre2.h_patch └── stdint.h ================================================ FILE CONTENTS ================================================ ================================================ FILE: .editorconfig ================================================ # EditorConfig helps ensure that files are opened in editors with the correct # settings, regardless of the editor or platform. See http://editorconfig.org. 
root = true [*] charset = utf-8 indent_style = space indent_size = 2 end_of_line = lf [Makefile.am] indent_style = tab [*.bat] end_of_line = crlf [testdata/*] insert_final_newline = false trim_trailing_whitespace = false [testdata/test{input,output}{1,2,3,3A,3B,3C,6,28,29}] charset = latin1 ================================================ FILE: .gitattributes ================================================ testdata/* -text maint/manifest-* -text maint/ucptestdata -text *.sh text eol=lf pcre2-config.in text eol=lf RunTest text eol=lf RunGrepTest text eol=lf ================================================ FILE: .github/codecov.yml ================================================ codecov: strict_yaml_branch: default require_ci_to_pass: false notify: wait_for_ci: false notify_error: true coverage: range: 75..90 round: nearest precision: 2 status: project: false patch: default: target: 0% github_checks: annotations: false comment: false # layout: "condensed_header, condensed_files, condensed_footer" # hide_project_coverage: true # require_head: true # require_base: true # require_changes: "coverage_drop OR uncovered_patch" component_management: individual_components: - component_id: library name: "Core library" paths: - '!src/((pcre2test|pcre2grep|pcre2_jit_test|pcre2posix_test|pcre2_printint)\.c|pcre2test_inc\.h)' statuses: - type: project target: auto threshold: 0.5% - component_id: test_binaries name: "Test binaries" paths: - 'src/((pcre2test|pcre2grep|pcre2_jit_test|pcre2posix_test|pcre2_printint)\.c|pcre2test_inc\.h)' statuses: - type: project target: auto threshold: 2% ================================================ FILE: .github/dependabot.yml ================================================ version: 2 updates: - package-ecosystem: github-actions directory: / schedule: interval: monthly groups: minor-and-patch: update-types: - "minor" - "patch" ================================================ FILE: .github/scripts/merge_sarif.py 
================================================ #! /usr/bin/env python3 # The purpose of this file is to adapt the output from # Clang's static analyzer into a format suitable for GitHub # Actions. The problem is that Clang outputs a separate "run" # per file in its SARIF output, but GitHub requires a single # run per tool (Clang is wrong here). import sys import json if len(sys.argv) < 2: print("Usage: munge-sarif.py INPUT", file=sys.stderr) sys.exit(1) data = None with open(sys.argv[1], 'rb') as f: data = json.load(f) # Arbitrarily pick the first run as the one from which to copy all the properties base_run = data['runs'][0] # We don't need these, GitHub ignores them base_run['artifacts'] = [] # Concatenate results for r in data['runs'][1:]: base_run['results'].extend(r['results']) data['runs'] = [base_run] def fix_region(region): startLine = region.get('startLine', None) startColumn = region.get('startColumn', 1) endLine = region.get('endLine', None) endColumn = region.get('endColumn', None) if startLine is None: raise ValueError("Region must have startLine") if endLine is not None and endLine < startLine: region['endLine'] = startLine del region['endColumn'] endLine = startLine endColumn = None if endColumn is not None and (endLine == startLine or endLine is None) and endColumn < startColumn: region['endColumn'] = startColumn endColumn = startColumn # Recursively scan the data dictionary, and apply the fix_region() function # to all "region":Region key-value pairs. 
def fix_regions(data): if isinstance(data, dict): if 'region' in data: fix_region(data['region']) for key, value in data.items(): fix_regions(value) elif isinstance(data, list): for item in data: fix_regions(item) fix_regions(data) with open(sys.argv[1], 'w') as f: json.dump(data, f, indent=2) ================================================ FILE: .github/workflows/build.yml ================================================ name: Build on: workflow_dispatch: inputs: job_id: type: choice description: Specific job to run default: all required: true options: - all - linux - alpine - macos - windows - freebsd - openbsd - solaris - zos - distcheck - coverage push: branches: [ main, "release/**" ] pull_request: branches: [ main ] permissions: contents: read env: CFLAGS_GCC_STYLE: '-Wall -Wextra -pedantic -Wdeclaration-after-statement -Wshadow -Wno-overlength-strings -Wimplicit-fallthrough' CFLAGS_MSVC: '/W3' CFLAGS_SOLARIS_CC: '-errtags=yes -erroff=E_STATEMENT_NOT_REACHED' CMAKE_FLAGS: '-Wdev -Werror=dev -Wdeprecated -Werror=deprecated --warn-uninitialized' jobs: linux: name: Linux runs-on: ubuntu-latest if: github.event_name != 'workflow_dispatch' || (inputs.job_id == 'all' || inputs.job_id == 'linux') steps: - name: Setup run: | sudo apt-get -qq update sudo apt-get -qq install zlib1g-dev libbz2-dev - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: submodules: true - name: Prepare run: ./autogen.sh - name: Configure run: ./configure CFLAGS="$CFLAGS_GCC_STYLE" --enable-jit --enable-pcre2-16 --enable-pcre2-32 --enable-pcre2grep-libz --enable-pcre2grep-libbz2 --enable-Werror - name: Build run: make -j3 - name: Test (main test script) run: ./RunTest - name: Test (JIT test program) run: ./pcre2_jit_test - name: Test (pcre2grep test script) run: ./RunGrepTest - name: Test (pcre2posix program) run: ./pcre2posix_test -v - name: Install run: | make install "DESTDIR=`pwd`/install-dir" maint/RunManifestTest install-dir 
maint/manifest-makeinstall-linux maint/RunSymbolTest install-dir/usr/local/lib/ maint/ alpine: name: alpine runs-on: ubuntu-latest container: alpine if: github.event_name != 'workflow_dispatch' || (inputs.job_id == 'all' || inputs.job_id == 'alpine') steps: - name: Setup run: apk add --no-cache automake autoconf gcc libtool make musl-dev git zlib zlib-dev bzip2 bzip2-dev - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: submodules: true - name: Prepare run: ./autogen.sh - name: Configure run: ./configure CFLAGS="$CFLAGS_GCC_STYLE" --enable-jit --enable-pcre2-16 --enable-pcre2-32 --enable-pcre2grep-libz --enable-pcre2grep-libbz2 --enable-Werror - name: Build run: make -j3 - name: Test (main test script) run: ./RunTest - name: Test (JIT test program) run: ./pcre2_jit_test - name: Test (pcre2grep test script) run: ./RunGrepTest - name: Test (pcre2posix program) run: ./pcre2posix_test -v - name: Install run: | make install "DESTDIR=`pwd`/install-dir" maint/RunManifestTest install-dir maint/manifest-makeinstall-linux maint/RunSymbolTest install-dir/usr/local/lib/ maint/ macos: name: macOS universal runs-on: macos-latest if: github.event_name != 'workflow_dispatch' || (inputs.job_id == 'all' || inputs.job_id == 'macos') steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: submodules: true - name: Configure run: cmake $CMAKE_FLAGS -DPCRE2_SUPPORT_JIT=ON -DPCRE2_BUILD_PCRE2_16=ON -DPCRE2_BUILD_PCRE2_32=ON -DPCRE2_SUPPORT_LIBZ=ON -DPCRE2_SUPPORT_LIBBZ2=ON -DBUILD_SHARED_LIBS=ON -DBUILD_STATIC_LIBS=ON -DCMAKE_OSX_ARCHITECTURES='arm64;x86_64' -DCMAKE_C_FLAGS="$CFLAGS_GCC_STYLE" -DCMAKE_COMPILE_WARNING_AS_ERROR=ON -DCMAKE_BUILD_TYPE=Release -B build - name: Build run: cd build && make -j3 - name: Test run: cd build && ctest -j3 --output-on-failure && (cat ./Testing/Temporary/LastTest.log || true) - name: Install run: | cd build cmake --install . 
--prefix install-dir ../maint/RunManifestTest install-dir ../maint/manifest-cmakeinstall-macos ../maint/RunSymbolTest install-dir/lib/ ../maint/ - name: Test CMake install interface run: | INSTALL_PREFIX=`pwd`/build/install-dir cd maint/cmake-tests/install-interface for useStaticLibs in ON OFF; do echo "== Testing CMake install interface with PCRE2_USE_STATIC_LIBS=$useStaticLibs ==" rm -rf build cmake $CMAKE_FLAGS -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$INSTALL_PREFIX" -DPCRE2_USE_STATIC_LIBS=$useStaticLibs -B build (cd build; make) ./build/test_executable otool -L ./build/test_executable if [ $useStaticLibs = ON ]; then (otool -L ./build/test_executable | grep -q "pcre2") && (echo "Error: PCRE2 found in otool output" && exit 1) else # Test that the shared library is actually linked in (otool -L ./build/test_executable | grep -q "@rpath/libpcre2-8.0.dylib") || (echo "Error: Shared library not linked in" && exit 1) fi done - name: Test CMake build interface run: | BUILD_DIR=`pwd` cp -rp maint/cmake-tests/build-interface ../cmake-tests-build-interface cd ../cmake-tests-build-interface ln -s "$BUILD_DIR" pcre2 for buildLibs in "ON;OFF" "OFF;ON"; do static=`echo $buildLibs | cut -d';' -f1` shared=`echo $buildLibs | cut -d';' -f2` echo "== Testing CMake build interface with BUILD_STATIC_LIBS=$static and BUILD_SHARED_LIBS=$shared ==" rm -rf build cmake $CMAKE_FLAGS -DCMAKE_BUILD_TYPE=Debug -DBUILD_STATIC_LIBS=$static -DBUILD_SHARED_LIBS=$shared -B build (cd build; make) ./build/test_executable otool -L ./build/test_executable if [ $static = ON ]; then (otool -L ./build/test_executable | grep -q "pcre2") && (echo "Error: PCRE2 found in ldd output" && exit 1) else # Test that the shared library is actually linked in (otool -L ./build/test_executable | grep -q "@rpath/libpcre2-8.0.dylib") || (echo "Error: Shared library not linked in" && exit 1) fi done windows: name: Windows runs-on: windows-latest if: github.event_name != 'workflow_dispatch' || (inputs.job_id == 
'all' || inputs.job_id == 'windows') strategy: fail-fast: false matrix: arch: ["Win32", "x64"] steps: - name: Setup run: | # GitHub Actions Windows images ship with Git for Windows, which is great, # but it also pollutes the PATH with a lot of Unix tools which we don't # want to require as build dependencies. This filters out the Unix tools. # The GitHub images still include an absolute ton of junk in the PATH, # but it seems to be rare for unintended dependencies to be added to our # build scripts, so we can live with it for now. $PATCHED_PATH = ($env:PATH -split ';' | Where-Object { $_ -notmatch 'C:\\Program Files\\Git\\usr\\bin|C:\\Program Files\\Git\\mingw64\\bin' }) -join ';' # We can't seem to use $GITHUB_PATH here because that only allows # appending to the PATH, not replacing it. echo "PATH=$PATCHED_PATH" >> "$env:GITHUB_ENV" - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: submodules: true - name: Configure run: | echo "PATH=$env:PATH" cmake $CMAKE_FLAGS -DPCRE2_SUPPORT_JIT=ON -DPCRE2_BUILD_PCRE2_16=ON -DPCRE2_BUILD_PCRE2_32=ON -DBUILD_SHARED_LIBS=ON -DBUILD_STATIC_LIBS=ON -DCMAKE_C_FLAGS="$CFLAGS_MSVC" -DCMAKE_COMPILE_WARNING_AS_ERROR=ON -B build -A ${{ matrix.arch }} - name: Build run: cmake --build build --config Release - name: Test run: cd build && ctest -C Release -j3 --output-on-failure && (cat ./Testing/Temporary/LastTest.log || true) - name: Install run: | cd build cmake --install . 
--config Release --prefix install-dir ../maint/RunManifestTest.ps1 install-dir ../maint/manifest-cmakeinstall-windows ../maint/RunSymbolTest.ps1 install-dir/bin ../maint/ - name: Test CMake install interface run: | $INSTALL_PREFIX = (pwd).Path + "\build\install-dir" cd maint/cmake-tests/install-interface $vswhere = "C:\Program Files (x86)\Microsoft Visual Studio\Installer\vswhere.exe" $dumpbin = & $vswhere -latest -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -find VC\Tools\MSVC\*\bin\Hostx64\x64\dumpbin.exe | Select-Object -First 1 foreach ($useStaticLibs in @("ON", "OFF")) { echo "== Testing CMake install interface with PCRE2_USE_STATIC_LIBS=$useStaticLibs ==" if (Test-Path build) { rm -Recurse -Force build } cmake $CMAKE_FLAGS "-DCMAKE_PREFIX_PATH=$INSTALL_PREFIX" "-DPCRE2_USE_STATIC_LIBS=$useStaticLibs" -B build -A ${{ matrix.arch }} cmake --build build --config Release ./build/Release/test_executable.exe & $dumpbin /dependents ./build/Release/test_executable.exe if ($useStaticLibs -eq "ON") { if ((& $dumpbin /dependents ./build/Release/test_executable.exe | Out-String).Contains("pcre2")) { Write-Error "Error: PCRE2 found in dumpbin output" exit 1 } } else { # Test that the shared library is actually linked in if (-not ((& $dumpbin /dependents ./build/Release/test_executable.exe | Out-String).Contains("pcre2-8.dll"))) { Write-Error "Error: Shared library not linked in" exit 1 } } } - name: Test CMake build interface run: | $BUILD_DIR = (pwd).Path cp -Recurse -Path maint/cmake-tests/build-interface ../cmake-tests-build-interface cd ../cmake-tests-build-interface New-Item -ItemType SymbolicLink -Path "pcre2" -Target "$BUILD_DIR" $vswhere = "C:\Program Files (x86)\Microsoft Visual Studio\Installer\vswhere.exe" $dumpbin = & $vswhere -latest -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -find VC\Tools\MSVC\*\bin\Hostx64\x64\dumpbin.exe | Select-Object -First 1 foreach ($buildLibs in @(@{static="ON"; shared="OFF"}, @{static="OFF"; 
shared="ON"})) { echo "== Testing CMake build interface with BUILD_STATIC_LIBS=$($buildLibs.static) ==" if (Test-Path build) { rm -Recurse -Force build } cmake $CMAKE_FLAGS "-DBUILD_STATIC_LIBS=$($buildLibs.static)" "-DBUILD_SHARED_LIBS=$($buildLibs.shared)" -B build -A ${{ matrix.arch }} cmake --build build --config Debug ./build/Debug/test_executable.exe & $dumpbin /dependents ./build/Debug/test_executable.exe if ($buildLibs.static -eq "ON") { if ((& $dumpbin /dependents ./build/Debug/test_executable.exe | Out-String).Contains("pcre2")) { Write-Error "Error: PCRE2 found in dumpbin output" exit 1 } } else { # Test that the shared library is actually linked in if (-not ((& $dumpbin /dependents ./build/Debug/test_executable.exe | Out-String).Contains("pcre2-8d.dll"))) { Write-Error "Error: Shared library not linked in" exit 1 } } } freebsd: name: FreeBSD runs-on: ubuntu-latest if: | (github.event_name == 'workflow_dispatch' && (inputs.job_id == 'all' || inputs.job_id == 'freebsd')) || github.event_name == 'push' steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: submodules: true - name: Prepare run: ./autogen.sh - name: Build & test uses: vmactions/freebsd-vm@d1e65811565151536c0c894fff74f06351ed26e6 # v1.4.5 with: envs: 'CFLAGS_GCC_STYLE CMAKE_FLAGS' usesh: true prepare: | set -e pkg install -y cmake run: | set -e cp -rp . ../build-autoconf cp -rp . 
../build-cmake

            echo "== Autoconf =="
            cd ../build-autoconf
            ./configure CFLAGS="$CFLAGS_GCC_STYLE" --enable-jit --enable-pcre2-16 --enable-pcre2-32 --enable-Werror
            make -j3
            (make check; rc=$?; for i in test-suite.log Run*Test.log pcre2*_test.log; do echo "== $i =="; cat $i; done; exit $rc)
            make install "DESTDIR=`pwd`/install-dir"
            maint/RunManifestTest install-dir maint/manifest-makeinstall-freebsd
            maint/RunSymbolTest install-dir/usr/local/lib/ maint/

            echo "== CMake =="
            cd ../build-cmake
            cmake $CMAKE_FLAGS -DPCRE2_SUPPORT_JIT=ON -DPCRE2_BUILD_PCRE2_16=ON -DPCRE2_BUILD_PCRE2_32=ON -DBUILD_SHARED_LIBS=ON -DBUILD_STATIC_LIBS=ON -DPCRE2_DEBUG=ON -DCMAKE_C_FLAGS="$CFLAGS_GCC_STYLE" -DCMAKE_COMPILE_WARNING_AS_ERROR=ON -DCMAKE_BUILD_TYPE=Release -B build
            cd build
            make -j3
            # ctest must be a command of its own: `set -e` is ignored for any
            # member of an `&&` list other than the last, so the former
            # `ctest ... && (cat ...)` let a failing test run fall through to
            # the install commands below instead of failing the job.
            ctest -j3 --output-on-failure
            cat ./Testing/Temporary/LastTest.log || true
            cmake --install . --prefix install-dir
            ../maint/RunManifestTest install-dir ../maint/manifest-cmakeinstall-freebsd
            ../maint/RunSymbolTest install-dir/lib/ ../maint/

  openbsd:
    name: OpenBSD
    runs-on: ubuntu-latest
    if: |
      (github.event_name == 'workflow_dispatch' && (inputs.job_id == 'all' || inputs.job_id == 'openbsd')) ||
      github.event_name == 'push'
    steps:
      - name: Checkout
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          submodules: true

      - name: Prepare
        run: ./autogen.sh

      - name: Build & test
        uses: vmactions/openbsd-vm@d7d892b7b9ba97ed2747b0fc201be65037d64c3e # v1.4.0
        with:
          envs: 'CFLAGS_GCC_STYLE CMAKE_FLAGS'
          usesh: true
          prepare: |
            set -e

            pkg_add cmake

          run: |
            set -e

            export MALLOC_OPTIONS="USRJGFC>>"
            EXTRA_CFLAGS="-DSLJIT_WX_EXECUTABLE_ALLOCATOR"

            cp -rp . ../build-autoconf
            cp -rp .
../build-cmake

            echo "== Autoconf =="
            cd ../build-autoconf
            ./configure CFLAGS="$CFLAGS_GCC_STYLE $EXTRA_CFLAGS" --enable-jit --enable-pcre2-16 --enable-pcre2-32 --enable-Werror
            make -j3
            (make check; rc=$?; for i in test-suite.log Run*Test.log pcre2*_test.log; do echo "== $i =="; cat $i; done; exit $rc)
            make install "DESTDIR=`pwd`/install-dir"
            # I don't really know enough about OpenBSD to say whether the unusually-numbered .so files
            # with no symlinks are correct or not.
            # maint/RunManifestTest install-dir maint/manifest-makeinstall-openbsd
            # maint/RunSymbolTest install-dir/usr/local/lib/ maint/

            echo "== CMake =="
            cd ../build-cmake
            cmake $CMAKE_FLAGS -DPCRE2_SUPPORT_JIT=ON -DPCRE2_BUILD_PCRE2_16=ON -DPCRE2_BUILD_PCRE2_32=ON -DBUILD_SHARED_LIBS=ON -DBUILD_STATIC_LIBS=ON -DPCRE2_DEBUG=ON -DCMAKE_C_FLAGS="$CFLAGS_GCC_STYLE $EXTRA_CFLAGS" -DCMAKE_COMPILE_WARNING_AS_ERROR=ON -DCMAKE_BUILD_TYPE=Release -B build
            cd build
            make -j3
            # ctest must be a command of its own: `set -e` is ignored for any
            # member of an `&&` list other than the last, so the former
            # `ctest ... && (cat ...)` let a failing test run fall through to
            # the install command below instead of failing the job.
            ctest -j3 --output-on-failure
            cat ./Testing/Temporary/LastTest.log || true
            cmake --install .
--prefix install-dir # ../maint/RunManifestTest install-dir ../maint/manifest-cmakeinstall-openbsd # ../maint/RunSymbolTest install-dir/lib/ ../maint/ solaris: name: Solaris runs-on: ubuntu-latest if: | (github.event_name == 'workflow_dispatch' && (inputs.job_id == 'all' || inputs.job_id == 'solaris')) || github.event_name == 'push' steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: submodules: true - name: Transfer Oracle Studio certificates env: PKG_ORACLE_COM_CERTIFICATE_PEM: ${{ secrets.PKG_ORACLE_COM_CERTIFICATE_PEM }} PKG_ORACLE_COM_KEY_PEM: ${{ secrets.PKG_ORACLE_COM_KEY_PEM }} run: | printenv PKG_ORACLE_COM_CERTIFICATE_PEM > pkg.oracle.com.certificate.pem printenv PKG_ORACLE_COM_KEY_PEM > pkg.oracle.com.key.pem - name: Prepare run: ./autogen.sh - name: Build & test uses: vmactions/solaris-vm@c20562b2c69737b06be9e828915761703e487373 # v1.3.3 with: envs: 'CFLAGS_SOLARIS_CC CMAKE_FLAGS' usesh: true # Seriously! Solaris is the only OS to actually ship without a C # compiler, and not even to provide a simple download to get one! # You have to actually register with Oracle to get an X.509 # certificate before you can even download their compiler. Whatever. prepare: | set -e cp "$GITHUB_WORKSPACE/pkg.oracle.com.key.pem" /root/pkg.oracle.com.key.pem cp "$GITHUB_WORKSPACE/pkg.oracle.com.certificate.pem" /root/pkg.oracle.com.certificate.pem sudo pkg set-publisher \ -k /root/pkg.oracle.com.key.pem \ -c /root/pkg.oracle.com.certificate.pem \ -G "*" -g https://pkg.oracle.com/solarisstudio/release solarisstudio pkg install developer/build/make developer/build/cmake system/header pkg install --accept developerstudio-126/cc run: | set -e PATH=/opt/developerstudio12.6/bin:"$PATH" export PATH cp -rp . ../build-autoconf-32 cp -rp . ../build-autoconf-64 cp -rp . 
../build-cmake-64

            echo "== Autoconf, 32-bit =="
            cd ../build-autoconf-32
            ./configure CC="cc -m32" CFLAGS="$CFLAGS_SOLARIS_CC" --enable-jit --enable-pcre2-16 --enable-pcre2-32 --enable-errwarn
            make
            (make check; rc=$?; for i in test-suite.log Run*Test.log pcre2*_test.log; do echo "== $i =="; cat $i; done; exit $rc)
            make install "DESTDIR=`pwd`/install-dir"
            maint/RunManifestTest install-dir maint/manifest-makeinstall-solaris
            maint/RunSymbolTest install-dir/usr/local/lib/ maint/

            echo "== Autoconf, 64-bit =="
            cd ../build-autoconf-64
            ./configure CC="cc -m64" CFLAGS="$CFLAGS_SOLARIS_CC" --enable-jit --enable-pcre2-16 --enable-pcre2-32 --enable-errwarn
            make
            (make check; rc=$?; for i in test-suite.log Run*Test.log pcre2*_test.log; do echo "== $i =="; cat $i; done; exit $rc)
            make install "DESTDIR=`pwd`/install-dir"
            maint/RunManifestTest install-dir maint/manifest-makeinstall-solaris
            maint/RunSymbolTest install-dir/usr/local/lib/ maint/

            echo "== CMake, 64-bit =="
            cd ../build-cmake-64
            CC="cc -m64" cmake $CMAKE_FLAGS -DPCRE2_BUILD_PCRE2_16=ON -DPCRE2_BUILD_PCRE2_32=ON -DBUILD_SHARED_LIBS=ON -DBUILD_STATIC_LIBS=ON -DPCRE2_DEBUG=ON -DCMAKE_C_FLAGS="$CFLAGS_SOLARIS_CC" -DCMAKE_COMPILE_WARNING_AS_ERROR=ON -DCMAKE_BUILD_TYPE=Release -B build
            cd build
            make
            # ctest must be a command of its own: `set -e` is ignored for any
            # member of an `&&` list other than the last, so the former
            # `ctest ... && (cat ...)` let a failing test run fall through to
            # the install command below instead of failing the job.
            ctest -j3 --output-on-failure
            cat ./Testing/Temporary/LastTest.log || true
            cmake --install .
--prefix install-dir ../maint/RunManifestTest install-dir ../maint/manifest-cmakeinstall-solaris ../maint/RunSymbolTest install-dir/lib/ ../maint/ zos: name: z/OS runs-on: ubuntu-latest # No longer running on push events, due to flaky z/OS runner if: | (github.event_name == 'workflow_dispatch' && (inputs.job_id == 'all' || inputs.job_id == 'zos')) concurrency: group: zos-ssh-build steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: submodules: true - name: Prepare run: ./autogen.sh - name: Build & test env: ZOS_HOST: ${{ secrets.ZOS_HOST }} ZOS_PORT: ${{ secrets.ZOS_PORT }} ZOS_PRIVATE_KEY: ${{ secrets.ZOS_PRIVATE_KEY }} ZOS_KNOWN_HOSTS: ${{ secrets.ZOS_KNOWN_HOSTS }} run: | (umask 0077 && printenv ZOS_PRIVATE_KEY > id_rsa_zos) mkdir -p ~/.ssh printenv ZOS_KNOWN_HOSTS > ~/.ssh/known_hosts tar czf ../pcre2-build.tar.gz --exclude=.git . mv ../pcre2-build.tar.gz . scp -i id_rsa_zos -P "$ZOS_PORT" pcre2-build.tar.gz "$ZOS_HOST:/data/" ssh -i id_rsa_zos -p "$ZOS_PORT" "$ZOS_HOST" /data/zopen/usr/local/bin/bash -c \ 'export _BPXK_AUTOCVT=ON; export _CEE_RUNOPTS="FILETAG(AUTOCVT,AUTOTAG) POSIX(ON)"; export _TAG_REDIR_ERR=txt; export _TAG_REDIR_IN=txt; export _TAG_REDIR_OUT=txt; export PATH="/data/zopen/usr/local/bin:/data/zopen/usr/bin:/data/zopen/bin:/data/zopen/boot:/bin:/usr/lpp/IBM/cnw/v2r1/openxl/bin"; . 
/data/zopen/etc/zopen-config;
             set -e;
             set -x;

             cd /data;

             echo "== Autoconf, XLC compiler ==";
             rm -rf pcre2-build;
             mkdir pcre2-build;
             gtar xzf pcre2-build.tar.gz -C pcre2-build;
             cd pcre2-build;
             chtag -R -tc ISO8859-1 .;
             MAKE=gmake CC=xlc ./configure --enable-ebcdic --disable-unicode;
             gmake;
             (gmake check; rc=$?; for i in test-suite.log Run*Test.log pcre2*_test.log; do echo "== $i =="; cat $i; done; exit $rc);

             echo "== CMake, IBM-Clang -m64 compiler ==";
             cd ..;
             rm -rf pcre2-build;
             mkdir pcre2-build;
             gtar xzf pcre2-build.tar.gz -C pcre2-build;
             cd pcre2-build;
             chtag -R -tc ISO8859-1 .;
             cmake $CMAKE_FLAGS -G Ninja -DPCRE2_EBCDIC=ON -DPCRE2_SUPPORT_UNICODE=OFF -DCMAKE_C_COMPILER=ibm-clang -DCMAKE_C_FLAGS="-m64 $CFLAGS_GCC_STYLE" -DCMAKE_COMPILE_WARNING_AS_ERROR=ON -DCMAKE_BUILD_TYPE=Release -B build;
             cd build;
             ninja;
             ctest -j3 --output-on-failure;
             cat ./Testing/Temporary/LastTest.log || true;
            '
          # Notes on the remote script above: every command is `;`-terminated
          # (the cmake configure line previously was not), and ctest runs as its
          # own command so that `set -e` aborts on a test failure. The previous
          # `ctest ...; && (cat ...)` was a shell syntax error.

  distcheck:
    name: Build & verify distribution
    runs-on: ubuntu-latest
    if: github.event_name != 'workflow_dispatch' || (inputs.job_id == 'all' || inputs.job_id == 'distcheck')
    permissions:
      id-token: write # Needed to make calls to the Sigstore service
      attestations: write # Needed to write the attestation to GitHub's database
      artifact-metadata: write # As detailed in the action documentation
      contents: read
    steps:
      - name: Checkout
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          submodules: true

      - name: Prepare
        run: |
          ./autogen.sh

          # Workaround for incorrect filesystem permissions on /usr/share/aclocal, which
          # causes the m4 macros to be copied with incorrect permissions.
# https://github.com/actions/runner-images/issues/11212 chmod u=rw,go=r m4/*.m4 - name: Configure run: ./configure - name: Distcheck run: make distcheck -j3 - name: Manifest run: | mkdir tarball-dir tar -C tarball-dir -xzf pcre2-*.tar.gz # Budge the directory, so we don't bake the version number into the # `manifest-tarball` file: mv tarball-dir/pcre2-* tarball-dir/pcre2-SNAPSHOT maint/RunManifestTest tarball-dir maint/manifest-tarball - name: Upload to GitHub artifacts uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: name: "Distribution release" path: | pcre2-*.tar.bz2 pcre2-*.tar.gz pcre2-*.zip if-no-files-found: error - name: Attest uses: actions/attest@59d89421af93a897026c735860bf21b6eb4f7b26 # v4.1.0 if: | github.event_name != 'pull_request' && (startsWith(github.ref, 'refs/heads/release/') || startsWith(github.ref, 'refs/tags/pcre2-')) with: subject-path: 'pcre2-*.tar.bz2, pcre2-*.tar.gz, pcre2-*.zip' coverage: name: Code coverage runs-on: ubuntu-latest if: github.event_name != 'workflow_dispatch' || (inputs.job_id == 'all' || inputs.job_id == 'coverage') steps: - name: Setup run: | sudo apt-get -qq update sudo apt-get -qq install zlib1g-dev libbz2-dev libedit-dev lcov - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: submodules: true - name: Configure # We use DEBUG=OFF here in order to suppress the coverage misses due to # assertions, which obviously always pass. 
run: CC="clang -fprofile-instr-generate -fcoverage-mapping" cmake $CMAKE_FLAGS -DCMAKE_BUILD_TYPE=Debug -DPCRE2_DEBUG=OFF -DPCRE2_SUPPORT_JIT=ON -DPCRE2_BUILD_PCRE2_16=ON -DPCRE2_BUILD_PCRE2_32=ON -DPCRE2_SUPPORT_LIBZ=ON -DPCRE2_SUPPORT_LIBBZ2=ON -DPCRE2_SUPPORT_LIBEDIT=ON -DPCRE2_SUPPORT_LIBREADLINE=OFF -B build - name: Build run: cd build && make -j3 - name: Test run: | cd build ../maint/RunCoverage - name: Upload report to GitHub artifacts uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: name: "Coverage report" path: './build/coverage-html' if-no-files-found: error - name: Upload report to Codecov uses: codecov/codecov-action@57e3a136b779b570ffcdbf80b3bdc90e7fab3de2 # v6.0.0 with: token: ${{ secrets.CODECOV_TOKEN }} fail_ci_if_error: true disable_search: true files: ./build/coverage-lcov.info ================================================ FILE: .github/workflows/cifuzz.yml ================================================ name: CIFuzz on: workflow_dispatch: pull_request: branches: [ main ] permissions: contents: read jobs: Fuzzing: runs-on: ubuntu-latest steps: - name: Build Fuzzers id: build uses: google/oss-fuzz/infra/cifuzz/actions/build_fuzzers@3d38acd485bc848e33396e7523b9a4f2aff9027e # master with: oss-fuzz-project-name: 'pcre2' dry-run: false - name: Run Fuzzers uses: google/oss-fuzz/infra/cifuzz/actions/run_fuzzers@3d38acd485bc848e33396e7523b9a4f2aff9027e # master with: oss-fuzz-project-name: 'pcre2' fuzz-seconds: 300 dry-run: false - name: Upload Crash uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 if: failure() && steps.build.outcome == 'success' with: name: artifacts path: ./out/artifacts ================================================ FILE: .github/workflows/clang-analyzer.yml ================================================ name: Clang Static Analyzer on: workflow_dispatch: push: branches: [ main, "release/**" ] pull_request: branches: [ main ] permissions: contents: read jobs: 
Analyze: runs-on: ubuntu-latest permissions: # Needed to upload the results to code-scanning dashboard. security-events: write contents: read steps: - name: Setup run: | echo "set man-db/auto-update false" | sudo debconf-communicate && sudo dpkg-reconfigure man-db sudo apt-get -qq update sudo apt-get -qq install -y ninja-build clang-tools - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: submodules: true - name: Configure & Build run: | LLVM_VER=`clang --version | head -n1 | grep -Eo '[0-9]+\.[0-9]+\.[0-9]+' | cut -d. -f1` echo "Using LLVM version $LLVM_VER" mkdir build cd build scan-build-py-$LLVM_VER cmake -G Ninja -DPCRE2_SUPPORT_JIT=ON -DCMAKE_BUILD_TYPE=Debug .. scan-build-py-$LLVM_VER -o clang-sarif-root/ --sarif-html ninja rm clang-sarif-root/*/result-*.sarif mv clang-sarif-root/* ../clang-report ../.github/scripts/merge_sarif.py ../clang-report/results-merged.sarif # Upload the browsable HTML report as an artifact. - name: Upload report uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: name: "Clang Static Analyzer report" path: './clang-report' # Upload the results to GitHub's code scanning dashboard. - name: "Upload to code-scanning" uses: github/codeql-action/upload-sarif@95e58e9a2cdfd71adc6e0353d5c52f41a045d225 # v3.29.5 with: sarif_file: ./clang-report/results-merged.sarif category: clang-analyzer ================================================ FILE: .github/workflows/codeql.yml ================================================ # For most projects, this workflow file will not need changing; you simply need # to commit it to your repository. # # You may wish to alter this file to override the set of languages analyzed, # or to provide custom queries or build logic. # # ******** NOTE ******** # We have attempted to detect the languages in your repository. Please check # the `language` matrix defined below to confirm you have the correct set of # supported CodeQL languages. 
# name: "CodeQL" on: push: branches: [ main, "release/**" ] pull_request: # The branches below must be a subset of the branches above branches: [ main ] schedule: - cron: '27 6 * * 4' permissions: contents: read jobs: analyze: name: Analyze runs-on: ubuntu-latest permissions: # Needed to upload the results to code-scanning dashboard. security-events: write actions: read contents: read strategy: fail-fast: false matrix: language: [ 'cpp' ] # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ] # Learn more about CodeQL language support at https://git.io/codeql-language-support steps: - name: Checkout repository uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: submodules: true # Initializes the CodeQL tools for scanning. - name: Initialize CodeQL uses: github/codeql-action/init@95e58e9a2cdfd71adc6e0353d5c52f41a045d225 # v3.29.5 with: languages: ${{ matrix.language }} # If you wish to specify custom queries, you can do so here or in a config file. # By default, queries listed here will override any specified in a config file. # Prefix the list here with "+" to use these queries and those in the config file. # queries: ./path/to/local/query, your-org/your-repo/queries@main # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). # If this step fails, then you should remove it and run the build manually (see below) - name: Autobuild uses: github/codeql-action/autobuild@95e58e9a2cdfd71adc6e0353d5c52f41a045d225 # v3.29.5 # ℹ️ Command-line programs to run using the OS shell. 
# 📚 https://git.io/JvXDl # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines # and modify them (or add more) to build your code if your project # uses a compiled language #- run: | # make bootstrap # make release - name: Perform CodeQL Analysis uses: github/codeql-action/analyze@95e58e9a2cdfd71adc6e0353d5c52f41a045d225 # v3.29.5 ================================================ FILE: .github/workflows/dev.yml ================================================ name: Dev on: workflow_dispatch: inputs: job_id: type: choice description: Specific job to run default: all required: true options: - all - canary - dragon - puffin - dodo - passenger - greatawk - wasp - bat - pterodactyl - bigbird - camel - chaffinch - fruitbat - ptarmigan - zebrilus - bee push: branches: [ main, "release/**" ] pull_request: branches: [ main ] permissions: contents: read env: CFLAGS_GCC_STYLE: '-Wall -Wextra -pedantic -Wdeclaration-after-statement -Wshadow -Wno-overlength-strings -Wimplicit-fallthrough' CFLAGS_MSVC: '/W3' CMAKE_FLAGS: '-Wdev -Werror=dev -Wdeprecated -Werror=deprecated --warn-uninitialized' jobs: canary: # Tests with: Debug & assertions; link-size=4; libedit name: GCC -O0 runs-on: ubuntu-latest if: github.event_name != 'workflow_dispatch' || (inputs.job_id == 'all' || inputs.job_id == 'canary') steps: - name: Setup run: | echo "set man-db/auto-update false" | sudo debconf-communicate && sudo dpkg-reconfigure man-db sudo apt-get -qq update sudo apt-get -qq install -y libedit-dev - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: submodules: true - name: Prepare run: ./autogen.sh - name: Configure run: ./configure CC='gcc -fsanitize=undefined,address -fsanitize-undefined-trap-on-error' CFLAGS="-O0 $CFLAGS_GCC_STYLE" --enable-jit --enable-pcre2-16 --enable-pcre2-32 --enable-debug --enable-Werror --enable-pcre2test-libedit --with-link-size=4 - name: Build run: make -j3 - name: Test run: (make check; rc=$?; 
for i in test-suite.log Run*Test.log pcre2*_test.log; do echo "== $i =="; cat $i; done; exit $rc) dragon: # Tests with: clang AB/UB; link-size=3. Clang's logo is a dragon. name: Clang runs-on: ubuntu-latest if: github.event_name != 'workflow_dispatch' || (inputs.job_id == 'all' || inputs.job_id == 'dragon') strategy: fail-fast: false matrix: opt: ["-O0", "-O2"] steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: submodules: true - name: Prepare run: ./autogen.sh - name: Configure run: ./configure CC='clang -fsanitize=undefined,address,integer -fno-sanitize-recover=undefined,integer -fno-sanitize=unsigned-integer-overflow,unsigned-shift-base,function' CFLAGS="${{ matrix.opt }} $CFLAGS_GCC_STYLE" --enable-jit --enable-pcre2-16 --enable-pcre2-32 --enable-debug --enable-Werror --with-link-size=3 - name: Build run: make -j3 - name: Test run: | ulimit -S -s 49152 # Raise stack limit; ASAN with -O0 is very stack-hungry (make check; rc=$?; for i in test-suite.log Run*Test.log pcre2*_test.log; do echo "== $i =="; cat $i; done; exit $rc) puffin: # Tests with: GCC, -O3, very latest CMake, libedit name: GCC -O3, CMake runs-on: ubuntu-latest if: github.event_name != 'workflow_dispatch' || (inputs.job_id == 'all' || inputs.job_id == 'puffin') steps: - name: Setup run: | echo "set man-db/auto-update false" | sudo debconf-communicate && sudo dpkg-reconfigure man-db sudo apt-get -qq update sudo apt-get -qq install -y git build-essential cmake zlib1g-dev libbz2-dev libedit-dev ninja-build - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: submodules: true - name: Check latest CMake id: get-cmake-ver run: | CMAKE_VER=$(curl -s https://api.github.com/repos/Kitware/CMake/releases/latest | jq -r '.tag_name' | sed 's/^v//') if ! 
echo "$CMAKE_VER" | grep -qE '^[0-9]+\.[0-9]+\.[0-9]+$' ; then echo "Extracted CMake version: '$CMAKE_VER'" >&2 echo "This does not match the expected version format" >&2 exit 1 fi echo "CMAKE_VER=$CMAKE_VER" >> $GITHUB_OUTPUT echo "CMAKE_VER=$CMAKE_VER" >> $GITHUB_ENV echo "Latest CMake version is $CMAKE_VER" - name: Cache CMake uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5 with: key: cmake-${{ steps.get-cmake-ver.outputs.CMAKE_VER }}-linux-x86_64 path: cmake-${{ steps.get-cmake-ver.outputs.CMAKE_VER }}-linux-x86_64.tar.gz - name: Install CMake run: | [ -f cmake-${CMAKE_VER}-linux-x86_64.tar.gz ] || curl -L -S -O "https://github.com/Kitware/CMake/releases/download/v${CMAKE_VER}/cmake-${CMAKE_VER}-linux-x86_64.tar.gz" tar -xz -f cmake-${CMAKE_VER}-linux-x86_64.tar.gz -C "$RUNNER_TEMP" realpath "$RUNNER_TEMP/cmake-${CMAKE_VER}-linux-x86_64/bin" >> "$GITHUB_PATH" - name: Configure run: | cmake --version | grep "version ${CMAKE_VER}" || (echo "CMake version mismatch" && exit 1) cmake $CMAKE_FLAGS -DPCRE2_SUPPORT_JIT=ON -DPCRE2_BUILD_PCRE2_16=ON -DPCRE2_BUILD_PCRE2_32=ON -DBUILD_SHARED_LIBS=ON -DBUILD_STATIC_LIBS=ON -DPCRE2_DEBUG=ON -DPCRE2_SUPPORT_LIBEDIT=ON -DPCRE2_SUPPORT_LIBREADLINE=OFF -DCMAKE_C_FLAGS="$CFLAGS_GCC_STYLE" -DCMAKE_POLICY_VERSION_MINIMUM=$CMAKE_VER -DCMAKE_COMPILE_WARNING_AS_ERROR=ON -DCMAKE_BUILD_TYPE=Release -B build - name: Build run: cd build && make -j3 - name: Test run: cd build && ctest -j3 --output-on-failure && (cat ./Testing/Temporary/LastTest.log || true) - name: Install run: | cd build cmake --install . 
--prefix install-dir ../maint/RunManifestTest install-dir ../maint/manifest-cmakeinstall-linux ../maint/RunSymbolTest install-dir/lib/ ../maint/ - name: Test CMake install interface run: | INSTALL_PREFIX=`pwd`/build/install-dir cd maint/cmake-tests/install-interface for useStaticLibs in ON OFF; do echo "== Testing CMake install interface with PCRE2_USE_STATIC_LIBS=$useStaticLibs ==" rm -rf build cmake -GNinja $CMAKE_FLAGS -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$INSTALL_PREFIX" -DPCRE2_USE_STATIC_LIBS=$useStaticLibs -B build (cd build; ninja) ./build/test_executable ldd ./build/test_executable if [ $useStaticLibs = ON ]; then (ldd ./build/test_executable | grep -q "pcre2") && (echo "Error: PCRE2 found in ldd output" && exit 1) else # Test that the shared library is actually linked in (ldd ./build/test_executable | grep -q "$INSTALL_PREFIX/lib/libpcre2-8.so.0") || (echo "Error: Shared library not linked in" && exit 1) fi done - name: Test CMake build interface run: | BUILD_DIR=`pwd` cp -rp maint/cmake-tests/build-interface ../cmake-tests-build-interface cd ../cmake-tests-build-interface ln -s "$BUILD_DIR" pcre2 for buildLibs in "ON;OFF" "OFF;ON"; do static=`echo $buildLibs | cut -d';' -f1` shared=`echo $buildLibs | cut -d';' -f2` echo "== Testing CMake build interface with BUILD_STATIC_LIBS=$static and BUILD_SHARED_LIBS=$shared ==" rm -rf build cmake -GNinja $CMAKE_FLAGS -DCMAKE_BUILD_TYPE=Debug -DBUILD_STATIC_LIBS=$static -DBUILD_SHARED_LIBS=$shared -B build (cd build; ninja) ./build/test_executable ldd ./build/test_executable if [ $static = ON ]; then (ldd ./build/test_executable | grep -q "pcre2") && (echo "Error: PCRE2 found in ldd output" && exit 1) else # Test that the shared library is actually linked in (ldd ./build/test_executable | grep -q "`pwd`/build/pcre2/libpcre2-8.so.0") || (echo "Error: Shared library not linked in" && exit 1) fi done dodo: # Tests with: Autoconf on oldest supported Ubuntu (in non-extended support) name: GCC -Os, old 
Autotools runs-on: ubuntu-latest container: ubuntu:22.04 if: github.event_name != 'workflow_dispatch' || (inputs.job_id == 'all' || inputs.job_id == 'dodo') steps: - name: Setup run: | echo "set man-db/auto-update false" | debconf-communicate && dpkg-reconfigure man-db export DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC apt-get -qq update apt-get -qq install -y git build-essential autoconf automake libtool - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: submodules: true - name: Prepare run: ./autogen.sh - name: Configure run: ./configure CFLAGS="-Os $CFLAGS_GCC_STYLE" --enable-jit --enable-pcre2-16 --enable-pcre2-32 --enable-debug --enable-Werror - name: Build run: make -j3 - name: Test run: (make check; rc=$?; for i in test-suite.log Run*Test.log pcre2*_test.log; do echo "== $i =="; cat $i; done; exit $rc) - name: Install run: | make install "DESTDIR=`pwd`/install-dir" maint/RunManifestTest install-dir maint/manifest-makeinstall-linux maint/RunSymbolTest install-dir/usr/local/lib/ maint/ passenger: # Tests with: Autoconf on oldest RHEL (in extended support). # That's the absolute limit to how old a Linux version I'll tolerate regular testing on. 
name: GCC, very old Autotools runs-on: ubuntu-latest container: redhat/ubi8:8.6 if: github.event_name != 'workflow_dispatch' || (inputs.job_id == 'all' || inputs.job_id == 'passenger') steps: - name: Setup run: | yum -q makecache yum -q install -y gcc git make automake autoconf libtool diffutils file glibc-langpack-en yum -q update -y glibc-common - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: submodules: true - name: Prepare run: ./autogen.sh - name: Configure run: ./configure CFLAGS="-O0 $CFLAGS_GCC_STYLE" --enable-jit --enable-pcre2-16 --enable-pcre2-32 --enable-debug --enable-Werror - name: Build run: make -j3 - name: Test run: (make check; rc=$?; for i in test-suite.log Run*Test.log pcre2*_test.log; do echo "== $i =="; cat $i; done; exit $rc) - name: Install run: | make install "DESTDIR=`pwd`/install-dir" maint/RunManifestTest install-dir maint/manifest-makeinstall-linux maint/RunSymbolTest install-dir/usr/local/lib/ maint/ greatawk: # Tests with: GCC, -O2, oldest supported Ubuntu (in non-extended support) name: GCC -O2, old CMake runs-on: ubuntu-latest container: ubuntu:22.04 if: github.event_name != 'workflow_dispatch' || (inputs.job_id == 'all' || inputs.job_id == 'greatawk') steps: - name: Setup run: | echo "set man-db/auto-update false" | debconf-communicate && dpkg-reconfigure man-db export DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC apt-get -qq update apt-get -qq install -y git build-essential cmake zlib1g-dev libbz2-dev libreadline-dev - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: submodules: true - name: Configure run: cmake $CMAKE_FLAGS -DPCRE2_SUPPORT_JIT=ON -DPCRE2_BUILD_PCRE2_16=ON -DPCRE2_BUILD_PCRE2_32=ON -DBUILD_SHARED_LIBS=ON -DBUILD_STATIC_LIBS=ON -DPCRE2_DEBUG=ON -DCMAKE_C_FLAGS="$CFLAGS_GCC_STYLE" -DCMAKE_COMPILE_WARNING_AS_ERROR=ON -DCMAKE_BUILD_TYPE=RelWithDebInfo -B build - name: Build run: cd build && make -j3 - name: Test run: cd build && 
ctest -j3 --output-on-failure && (cat ./Testing/Temporary/LastTest.log || true) - name: Install run: | cd build cmake --install . --prefix install-dir ../maint/RunManifestTest install-dir ../maint/manifest-cmakeinstall-linux relwithdebinfo ../maint/RunSymbolTest install-dir/lib/ ../maint/ - name: Test CMake install interface run: | INSTALL_PREFIX=`pwd`/build/install-dir cd maint/cmake-tests/install-interface for useStaticLibs in ON OFF; do echo "== Testing CMake install interface with PCRE2_USE_STATIC_LIBS=$useStaticLibs ==" rm -rf build cmake $CMAKE_FLAGS -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_PREFIX_PATH="$INSTALL_PREFIX" -DPCRE2_USE_STATIC_LIBS=$useStaticLibs -B build (cd build; make) ./build/test_executable ldd ./build/test_executable if [ $useStaticLibs = ON ]; then (ldd ./build/test_executable | grep -q "pcre2") && (echo "Error: PCRE2 found in ldd output" && exit 1) else # Test that the shared library is actually linked in (ldd ./build/test_executable | grep -q "$INSTALL_PREFIX/lib/libpcre2-8.so.0") || (echo "Error: Shared library not linked in" && exit 1) fi done - name: Test CMake build interface run: | BUILD_DIR=`pwd` cp -rp maint/cmake-tests/build-interface ../cmake-tests-build-interface cd ../cmake-tests-build-interface ln -s "$BUILD_DIR" pcre2 for buildLibs in "ON;OFF" "OFF;ON"; do static=`echo $buildLibs | cut -d';' -f1` shared=`echo $buildLibs | cut -d';' -f2` echo "== Testing CMake build interface with BUILD_STATIC_LIBS=$static and BUILD_SHARED_LIBS=$shared ==" rm -rf build cmake $CMAKE_FLAGS -DCMAKE_BUILD_TYPE=Debug -DBUILD_STATIC_LIBS=$static -DBUILD_SHARED_LIBS=$shared -B build (cd build; make) ./build/test_executable ldd ./build/test_executable if [ $static = ON ]; then (ldd ./build/test_executable | grep -q "pcre2") && (echo "Error: PCRE2 found in ldd output" && exit 1) else # Test that the shared library is actually linked in (ldd ./build/test_executable | grep -q "`pwd`/build/pcre2/libpcre2-8.so.0") || (echo "Error: Shared library not 
linked in" && exit 1) fi done wasp: # Tests with: French locale; oldest supported CMake; no JIT; -Os; libreadline name: GCC -Os, very old CMake, ninja, no JIT runs-on: ubuntu-latest if: github.event_name != 'workflow_dispatch' || (inputs.job_id == 'all' || inputs.job_id == 'wasp') env: CMAKE_VER: "3.15.7" steps: - name: Setup run: | echo "set man-db/auto-update false" | sudo debconf-communicate && sudo dpkg-reconfigure man-db sudo apt-get -qq update sudo apt-get -qq install -y language-pack-fr ninja-build zlib1g-dev libbz2-dev libreadline-dev - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: submodules: true - name: Cache CMake uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5 with: key: cmake-${{ env.CMAKE_VER }}-Linux-x86_64 path: cmake-${{ env.CMAKE_VER }}-Linux-x86_64.tar.gz - name: Install CMake run: | [ -f cmake-${CMAKE_VER}-Linux-x86_64.tar.gz ] || curl -L -S -O "https://github.com/Kitware/CMake/releases/download/v${CMAKE_VER}/cmake-${CMAKE_VER}-Linux-x86_64.tar.gz" tar -xz -f cmake-${CMAKE_VER}-Linux-x86_64.tar.gz -C "$RUNNER_TEMP" realpath "$RUNNER_TEMP/cmake-${CMAKE_VER}-Linux-x86_64/bin" >> "$GITHUB_PATH" - name: Configure run: | cmake --version | grep "version ${CMAKE_VER}" || (echo "CMake version mismatch" && exit 1) CC='clang' cmake $CMAKE_FLAGS -G Ninja -DPCRE2_SUPPORT_JIT=ON -DPCRE2_BUILD_PCRE2_16=ON -DPCRE2_BUILD_PCRE2_32=ON -DBUILD_SHARED_LIBS=ON -DBUILD_STATIC_LIBS=ON -DPCRE2_DEBUG=ON -DPCRE2_SUPPORT_LIBREADLINE=ON -DCMAKE_C_FLAGS="$CFLAGS_GCC_STYLE" -DCMAKE_COMPILE_WARNING_AS_ERROR=ON -DCMAKE_BUILD_TYPE=MinSizeRel -B build - name: Build run: ninja -C build - name: Test run: cd build && ctest -j3 --output-on-failure && (cat ./Testing/Temporary/LastTest.log || true) - name: Install run: | cd build cmake --install . 
--prefix install-dir ../maint/RunManifestTest install-dir ../maint/manifest-cmakeinstall-linux minsizerel ../maint/RunSymbolTest install-dir/lib/ ../maint/ bat: # Tests with: MSVC 32-bit, and a variety of CMake options. Windows has "bat" files. name: Windows (Win32) runs-on: windows-latest if: github.event_name != 'workflow_dispatch' || (inputs.job_id == 'all' || inputs.job_id == 'bat') steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: submodules: true - name: Configure run: cmake $CMAKE_FLAGS -DPCRE2_SUPPORT_JIT=ON -DPCRE2_BUILD_PCRE2_16=ON -DPCRE2_BUILD_PCRE2_32=ON -DPCRE2GREP_SUPPORT_CALLOUT_FORK=OFF -DPCRE2_DEBUG=ON -DPCRE2_NEWLINE=ANYCRLF -DPCRE2_STATIC_PIC=ON -DPCRE2_SUPPORT_BSR_ANYCRLF=ON -DBUILD_SHARED_LIBS=OFF -DBUILD_STATIC_LIBS=ON -DCMAKE_C_FLAGS="$CFLAGS_MSVC" -DCMAKE_COMPILE_WARNING_AS_ERROR=ON -DCMAKE_MSVC_RUNTIME_LIBRARY=MultiThreaded -B build -A Win32 - name: Build run: cmake --build build --config RelWithDebInfo - name: Test run: cd build && ctest -C RelWithDebInfo -j3 --output-on-failure && (cat ./Testing/Temporary/LastTest.log || true) pterodactyl: # Tests with: MSVC 64-bit, Debug, shared libraries name: Windows (x64) runs-on: windows-latest if: github.event_name != 'workflow_dispatch' || (inputs.job_id == 'all' || inputs.job_id == 'pterodactyl') steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: submodules: true - name: Configure run: cmake $CMAKE_FLAGS -DPCRE2_SUPPORT_JIT=OFF -DPCRE2_BUILD_PCRE2_16=ON -DPCRE2_BUILD_PCRE2_32=ON -DPCRE2_DEBUG=ON -DBUILD_SHARED_LIBS=ON -DBUILD_STATIC_LIBS=OFF -DCMAKE_C_FLAGS="$CFLAGS_MSVC" -DCMAKE_COMPILE_WARNING_AS_ERROR=ON -B build -A x64 - name: Build run: cmake --build build --config Debug - name: Test run: cd build && ctest -C Debug -j3 --output-on-failure && (cat ./Testing/Temporary/LastTest.log || true) bigbird: # Job to execute ManyConfigTests name: manyconfig runs-on: ubuntu-latest if: 
github.event_name != 'workflow_dispatch' || (inputs.job_id == 'all' || inputs.job_id == 'bigbird') steps: - name: Setup run: | echo "set man-db/auto-update false" | sudo debconf-communicate && sudo dpkg-reconfigure man-db sudo apt-get -qq update sudo apt-get -qq install -y valgrind - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: submodules: true - name: Run run: | ./autogen.sh ./maint/ManyConfigTests camel: # Job to execute RunPerlTest. "Camel bird" is another name for an ostrich (and it's Perl's logo). name: perl runs-on: ubuntu-latest container: perl:devel if: github.event_name != 'workflow_dispatch' || (inputs.job_id == 'all' || inputs.job_id == 'camel') steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: submodules: true - name: Test run: | perl -v maint/RunPerlTest chaffinch: # Job to verify that the CMake "unity" build (single-file / jumbo build) passes. # If this fails, it's usually because two different files define some file-static # functions or macros which collide. name: CMake unity build runs-on: ubuntu-latest if: github.event_name != 'workflow_dispatch' || (inputs.job_id == 'all' || inputs.job_id == 'chaffinch') env: # Disallowing shadowing would be very spammy for unity builds, because the # same variable name can be used in multiple files.
CFLAGS_UNITY: "-Wno-shadow" steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: submodules: true - name: Configure run: cmake $CMAKE_FLAGS -DCMAKE_UNITY_BUILD=ON -DCMAKE_UNITY_BUILD_BATCH_SIZE=0 -DPCRE2_SUPPORT_JIT=ON -DPCRE2_BUILD_PCRE2_16=ON -DPCRE2_BUILD_PCRE2_32=ON -DPCRE2_DEBUG=ON -DCMAKE_C_FLAGS="$CFLAGS_GCC_STYLE $CFLAGS_UNITY" -DCMAKE_COMPILE_WARNING_AS_ERROR=ON -DCMAKE_BUILD_TYPE=Release -B build - name: Build run: cd build && make -j3 - name: Test run: cd build && ctest -j3 --output-on-failure && (cat ./Testing/Temporary/LastTest.log || true) fruitbat: # Tests with: MSYS2 unix-on-Windows environment name: MSYS2 runs-on: windows-latest if: | (github.event_name == 'workflow_dispatch' && (inputs.job_id == 'all' || inputs.job_id == 'fruitbat')) || github.event_name == 'push' strategy: fail-fast: false matrix: # UCRT64 is the new default MSYS2 runtime, which builds native 64-bit # binaries which can then be shipped and run on systems without MSYS2 # installed (using MinGW-x64 + the UCRT). # MSYS is the Unix-variant runtime, which builds binaries that have a # dependency on MSYS2 being installed, but those binaries then use a # full emulated Unix environment at runtime. 
msystem: ["UCRT64", "MSYS"] steps: - name: Pre-checkout run: git config --global core.autocrlf input - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: submodules: true - name: Setup uses: msys2/setup-msys2@e9898307ac31d1a803454791be09ab9973336e1c # v2.31.1 with: msystem: ${{ matrix.msystem }} update: true pacboy: diffutils gcc:p cmake:p ninja:p ${{ matrix.msystem == 'MSYS' && 'libreadline:p' || 'readline:p' }} - name: Configure shell: msys2 {0} run: cmake $CMAKE_FLAGS -G Ninja -DPCRE2_BUILD_PCRE2_16=ON -DPCRE2_BUILD_PCRE2_32=ON -DBUILD_SHARED_LIBS=ON -DBUILD_STATIC_LIBS=ON -DPCRE2_DEBUG=ON -DCMAKE_C_FLAGS="$CFLAGS_GCC_STYLE" -DCMAKE_COMPILE_WARNING_AS_ERROR=ON -DCMAKE_BUILD_TYPE=Release -B build - name: Build shell: msys2 {0} run: ninja -C build - name: Test shell: msys2 {0} run: cd build && ctest -j3 --output-on-failure && (cat ./Testing/Temporary/LastTest.log || true) ptarmigan: # Tests with various unusual processor architectures name: Multiarch strategy: fail-fast: false matrix: include: # S390x is important, because it's basically the only supported big-endian # architecture I can find anywhere. I used to work on SPARC and PPC-be systems # a long time ago, but even Debian has dropped those architectures now, so # it's nice that there's *at least one* arch remaining to shake out endian # assumptions. - arch: "s390x" distro: ubuntu_latest # Big-iron POWER only (this is not the PowerPC arch used in old Apple Macs) - arch: "ppc64le" distro: "ubuntu_latest" # A 32-bit Linux build. i386 is mostly gone now, so ARMv7 is all that's left. - arch: "armv7" distro: "ubuntu_latest" # The only really widely-deployed non-x86 architecture, at least that's likely # to be running PCRE2. - arch: "aarch64" distro: "ubuntu_latest" # Not used by anyone yet, really, but potentially the "next big thing".
- arch: "riscv64" distro: "ubuntu_latest" runs-on: ubuntu-latest permissions: contents: read packages: write # Necessary for uraimo/run-on-arch-action to use GitHub's Docker repository as a cache if: | (github.event_name == 'workflow_dispatch' && (inputs.job_id == 'all' || inputs.job_id == 'ptarmigan')) || github.event_name == 'push' steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: submodules: true - name: Prepare run: ./autogen.sh - uses: uraimo/run-on-arch-action@f9b26e3a1a408d5fd530d20c17b9f3f4428ff8d9 # v3.1.0 name: Configure, build, and test with: arch: ${{ matrix.arch }} distro: ${{ matrix.distro }} # Not required, but speeds up builds by storing container images in # a GitHub package registry. githubToken: ${{ github.token }} env: | # YAML, but pipe character is necessary CFLAGS_GCC_STYLE: ${{ env.CFLAGS_GCC_STYLE }} CMAKE_FLAGS: ${{ env.CMAKE_FLAGS }} install: | echo "set man-db/auto-update false" | debconf-communicate && dpkg-reconfigure man-db apt-get -qq update apt-get -qq install -y gcc cmake ninja-build zlib1g-dev libbz2-dev libreadline-dev run: | set -e # TODO: Set -DCMAKE_COMPILE_WARNING_AS_ERROR=ON (there's currently a build failure on S390x) cmake $CMAKE_FLAGS -G Ninja -DPCRE2_SUPPORT_JIT=ON -DPCRE2_BUILD_PCRE2_16=ON -DPCRE2_BUILD_PCRE2_32=ON -DBUILD_SHARED_LIBS=ON -DBUILD_STATIC_LIBS=ON -DPCRE2_DEBUG=ON -DCMAKE_C_FLAGS="$CFLAGS_GCC_STYLE" -DCMAKE_COMPILE_WARNING_AS_ERROR=OFF -DCMAKE_BUILD_TYPE=RelWithDebInfo -B build cd build ninja ctest -j3 --output-on-failure && (cat ./Testing/Temporary/LastTest.log || true) zebrilus: # Tests with: Zig compiler. A "zebrilus" is known as a "zigzag heron". 
name: Zig runs-on: ubuntu-latest if: | (github.event_name == 'workflow_dispatch' && (inputs.job_id == 'all' || inputs.job_id == 'zebrilus')) || github.event_name == 'push' steps: - name: Setup run: | sudo snap install zig --classic --beta - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: submodules: true - name: Build run: zig build -Dsupport_jit - name: Test run: | srcdir=`pwd` pcre2test=`pwd`/zig-out/bin/pcre2test ./RunTest -bigstack bee: # Tests with: Bazel build system. A bee goes "buzz buzz buzz(el)". name: Bazel strategy: fail-fast: false matrix: os: ["ubuntu-latest", "windows-latest"] runs-on: ${{ matrix.os }} if: | (github.event_name == 'workflow_dispatch' && (inputs.job_id == 'all' || inputs.job_id == 'bee')) || github.event_name == 'push' steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: submodules: true - name: Build run: bazelisk build //... --enable_runfiles --incompatible_strict_action_env - name: Test run: bazelisk test //... 
--enable_runfiles --incompatible_strict_action_env --test_output=all ================================================ FILE: .github/workflows/pages.yml ================================================ name: Deploy Pages on: workflow_dispatch: workflow_run: workflows: [ 'Sync' ] types: - completed branches: [ main ] push: branches: [ pages ] pull_request: branches: [ pages ] permissions: contents: read concurrency: group: "pages" cancel-in-progress: false jobs: Build: runs-on: ubuntu-latest steps: - name: Setup run: | echo "set man-db/auto-update false" | sudo debconf-communicate && sudo dpkg-reconfigure man-db sudo apt-get -qq update sudo apt-get -qq install -y hugo - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: ref: pages - name: Setup Pages id: pages uses: actions/configure-pages@45bfe0192ca1faeb007ade9deae92b16b8254a0d # v6.0.0 - name: Build with Hugo env: HUGO_BASE_URL: ${{ steps.pages.outputs.base_url }} run: pages/maint/Build.py - name: Upload artifact uses: actions/upload-pages-artifact@fc324d3547104276b827a68afc52ff2a11cc49c9 # v5.0.0 with: path: ./pages/public Deploy: needs: Build if: github.event_name != 'pull_request' && github.ref == 'refs/heads/pages' permissions: pages: write # to deploy to Pages id-token: write # to verify the deployment originates from an appropriate source environment: name: github-pages url: ${{ steps.deployment.outputs.page_url }} runs-on: ubuntu-latest steps: - name: Deploy to GitHub Pages id: deployment uses: actions/deploy-pages@cd2ce8fcbc39b97be8ca5fce6e763baed58fa128 # v5.0.0 ================================================ FILE: .github/workflows/scorecards.yml ================================================ name: Scorecards supply-chain security on: workflow_dispatch: # Only the default branch is supported. 
branch_protection_rule: schedule: - cron: '23 17 * * 1' push: branches: [ main ] permissions: read-all jobs: analysis: name: Scorecards analysis runs-on: ubuntu-latest permissions: # Needed to upload the results to code-scanning dashboard. security-events: write # Needed to publish the results to Scorecard's service. id-token: write actions: read contents: read steps: - name: "Checkout code" uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: submodules: true persist-credentials: false - name: "Run analysis" uses: ossf/scorecard-action@4eaacf0543bb3f2c246792bd56e8cdeffafb205a # tag=v2.4.3 with: results_file: results.sarif results_format: sarif # repo_token: ${{ secrets.GITHUB_TOKEN }} # Publish the results to enable scorecard badges. For more details, see # https://github.com/ossf/scorecard-action#publishing-results. # For private repositories, `publish_results` will automatically be set to `false`, # regardless of the value entered here. publish_results: true # Upload the results as artifacts (optional). - name: "Upload artifact" uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: name: SARIF file path: results.sarif retention-days: 5 # Upload the results to GitHub's code scanning dashboard. - name: "Upload to code-scanning" uses: github/codeql-action/upload-sarif@95e58e9a2cdfd71adc6e0353d5c52f41a045d225 # v3.29.5 with: sarif_file: results.sarif category: ossf-scorecard ================================================ FILE: .github/workflows/sync.yml ================================================ name: Sync on: workflow_dispatch: push: branches: [ main, "release/**" ] pull_request: branches: [ main ] permissions: contents: read jobs: sync-autogenerated: # Job to verify that the tasks performed by UpdateAlways have been done. It is # the committer's responsibility (currently) to run UpdateAlways themselves when # making a PR, so that everything is kept in-sync. 
name: Check autogenerated file freshness runs-on: ubuntu-latest permissions: contents: write steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: submodules: false fetch-depth: 0 # Necessary for maint/UpdateAlways fetch-tags: false # Check out the unmerged source branch for `pull_request`-triggered runs; # otherwise use the tip of the branch for `workflow_dispatch` and `push` triggers. ref: ${{ github.event.pull_request.head.ref || github.ref }} repository: ${{ github.event.pull_request.head.repo.full_name || github.repository }} - name: UpdateDates if: | github.event_name != 'pull_request' && (startsWith(github.ref, 'refs/heads/release/') || startsWith(github.ref, 'refs/tags/pcre2-')) run: maint/UpdateDates.py - name: UpdateAlways run: maint/UpdateAlways - name: 'Rebuild *.h.generic' run: | ./autogen.sh && ./configure # Workaround for incorrect filesystem permissions on /usr/share/aclocal, which # causes the m4 macros to be copied with incorrect permissions. # https://github.com/actions/runner-images/issues/11212 chmod u=rw,go=r m4/*.m4 rm -f src/*.generic make src/config.h.generic src/pcre2.h.generic # If we're in a forked repo, it's too onerous to expect contributors to run the # checks locally to keep these files up to date (since the tool versions are very # fussy and brittle). # # However, we still want to run the steps above, to check that the UpdateAlways # process is able to run to completion, since it can pick up errors in the man pages. - name: Commit and push, if not in a forked repo if: github.event_name != 'pull_request' || ( !
github.event.pull_request.head.repo.fork && github.actor != 'dependabot[bot]' ) run: | if [ -n "`git status --porcelain`" ] ; then # Dirty working tree: push it git config user.name "github-actions[bot]" git config user.email "41898282+github-actions[bot]@users.noreply.github.com" git add -u git commit -m "Sync autogenerated files #noupdate" git push fi sync-docs: name: Sync content from main to pages runs-on: ubuntu-latest if: github.event_name != 'pull_request' && github.ref == 'refs/heads/main' needs: ['sync-autogenerated'] permissions: contents: write steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: submodules: false fetch-depth: 0 # Necessary to get both the main and pages branches fetch-tags: false ref: pages - name: Commit and push, if docs have changed run: | if ! git diff --exit-code origin/main -- \ ./doc ./AUTHORS.md ./LICENCE.md ./SECURITY.md ./README.md \ ./README ./NON-AUTOTOOLS-BUILD >/dev/null ; then # Differences from main: merge and push git config user.name "github-actions[bot]" git config user.email "41898282+github-actions[bot]@users.noreply.github.com" git merge origin/main --no-edit -m"Sync content from main to pages" git push else echo "No content changes to sync" fi ================================================ FILE: .gitignore ================================================ # Public .gitignore file for PCRE2 build/ build-*/ pages/ *.a *.gcda *.gcno *.profraw *.lo *.la *.pc *.o *~ *-coverage* __pycache__ .deps .libs Makefile Makefile.in RunGrepTest.log RunGrepTest.trs RunTest.log RunTest.trs aclocal.m4 ar-lib autom4te.cache compile config.guess config.log config.status config.sub config.lt configure depcomp install-sh libtool ltmain.sh missing pcre2-config pcre2_dftables pcre2_jit_test pcre2_jit_test.exe pcre2_jit_test.log pcre2_jit_test.trs pcre2posix_test pcre2posix_test.exe pcre2posix_test.log pcre2posix_test.trs pcre2demo pcre2fuzzcheck-* pcre2fuzzer-* pcre2grep pcre2grep.exe 
pcre2test pcre2test.exe test-driver test-suite.log test3input test3output test3outputA test3outputB testNinput testNinputgrep teststderr teststderrM teststderrgrep teststdout teststdoutM testtemp1 testtemp1grep testtemp2 testtemp2grep testtry testtry2 testtrygrep testSinput testSoutput testbtables testsaved1 testsaved2 testoutput8 testoutput8-jit testoutput8-dfa testoutput16 testoutput16-jit testoutput16-dfa testoutput32 testoutput32-jit testoutput32-dfa m4/libtool.m4 m4/ltoptions.m4 m4/ltsugar.m4 m4/ltversion.m4 m4/lt~obsolete.m4 src/.deps src/.dirstamp src/config.h src/config.h.in src/pcre2.h src/pcre2_chartables.c src/libpcre2-8.sym src/libpcre2-16.sym src/libpcre2-32.sym src/libpcre2-posix.sym src/stamp-h1 /bazel-* *.bazel.lock zig-out/ zig-cache/ .zig-cache/ # Folders that may be used by individual developers, without appearing in git # status output. .vscode/ .devcontainer/ .personal/ # End ================================================ FILE: .gitmodules ================================================ [submodule "deps/sljit"] path = deps/sljit url = https://github.com/zherczeg/sljit.git ================================================ FILE: AUTHORS.md ================================================ PCRE2 Authorship and Contributors ================================= Copyright --------- Please see the file [LICENCE](./LICENCE.md) in the PCRE2 distribution for copyright details. Maintainers ----------- The PCRE and PCRE2 libraries were authored and maintained by Philip Hazel. Since 2024, the contributors with administrator access to the project are now Nicholas Wilson and Zoltán Herczeg. See the file [SECURITY](./SECURITY.md) for GPG keys. Both administrators are volunteers acting in a personal capacity.
Name Role
Nicholas Wilson
`nicholas@nicholaswilson.me.uk`
Currently of Microsoft Research Cambridge, UK
* General project administration & maintenance * Release management * Code maintenance
Zoltán Herczeg
`hzmester@freemail.hu`
Currently of the University of Szeged, Hungary
* Code maintenance * Ownership of `sljit` and PCRE2's JIT
Contributors ------------ Many others have participated and contributed to PCRE2 over its history. The maintainers are grateful for all contributions and participation over the years. We apologise for any names we have forgotten. We are especially grateful to Philip Hazel, creator of PCRE and PCRE2, and maintainer from 1997 to 2024. All names listed alphabetically. ### Contributors to PCRE2 This list includes names up until the PCRE2 10.47 release. New names will be added from the Git history on each release. Scott Bell Carlo Marcelo Arenas Belón Edward Betts Jan-Willem Blokland Ross Burton Dmitry Cherniachenko Alexey Chupahin Jessica Clarke Alejandro Colomar Jeremie Courreges-Anglas Addison Crump Alex Dowad Daniel Engberg Marco Feuerstein Daniel Richard G Isaac Oscar Gariano David Gaussmann Andrey Gorbachev Jordan Griege Jason Hood Bumsu Hyeon Roy Ivy Nobuhiro Iwamatsu Martin Joerg Guillem Jover Ralf Junker Ayesh Karunaratne Michael Kaufmann Yunho Kim Joshua Kinard David Korczynski Uwe Korn Jonas Kvinge Kristian Larsson Kai Lu Behzod Mansurov B. Scott Michel Greg Minshall Nathan Moinvaziri Mike Munday Marc Mutz Fabio Pagani Christian Persch Alex Reinking Joshua Rogers Tristan Ross William A Rowe Jr Rocco Ruscitti David Seifert Yaakov Selkowitz Rich Siegel Karl Skomski Maciej Sroczyński Wolfgang Stöggl Thomas Tempelmann Greg Thain Lucas Trzesniewski Theodore Tsirpanis Aaron M. Ucko Matthew Vernon Rémi Verschelde Thomas Voss Ezekiel Warren Carl Weaver Chris Wilson Amin Yahyaabadi Joe Zhang ### Contributors to PCRE1 These people contributed either by sending patches or reporting serious issues. 
Irfan Adilovic Alexander Barkov Daniel Bergström David Burgess Ross Burton David Byron Fred Cox Christian Ehrlicher Tom Fortmann Lionel Fourquaux Mike Frysinger Daniel Richard G Dair Gran "Graycode" (Red Hat Product Security) Viktor Griph Wen Guanxing Robin Houston Martin Jerabek Peter Kankowski Stephen Kelly Yunho Kim Joshua Kinard Carsten Klein Evgeny Kotkov Ronald Landheer-Cieslak Alan Lehotsky Dmitry V. Levin Nuno Lopes Kai Lu Giuseppe Maxia Dan Mooney Marc Mutz Markus Oberhumer Sheri Pierce Petr Pisar Ari Pollak Bob Rossi Ruiger Rill Michael Shigorin Rich Siegel Craig Silverstein (C++ wrapper) Karl Skomski Paul Sokolovsky Stan Switzer Ian Taylor Mark Tetrode Jeff Trawick Steven Van Ingelgem Lawrence Velazquez Jiong Wang Stefan Weber Chris Wilson Thanks go to Jeffrey Friedl for testing and debugging assistance. ================================================ FILE: BUILD.bazel ================================================ load("@bazel_skylib//rules:copy_file.bzl", "copy_file") load("@bazel_skylib//rules:native_binary.bzl", "native_test") load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library") copy_file( name = "config_h_generic", src = "src/config.h.generic", out = "src/config.h", ) copy_file( name = "pcre2_h_generic", src = "src/pcre2.h.generic", out = "src/pcre2.h", ) copy_file( name = "pcre2_chartables_c", src = "src/pcre2_chartables.c.dist", out = "src/pcre2_chartables.c", ) LOCAL_DEFINES = [ "HAVE_CONFIG_H", "SUPPORT_PCRE2_8", "SUPPORT_UNICODE", ] + select({ "@platforms//os:windows": [], "//conditions:default": ["HAVE_UNISTD_H"], }) # Workaround for a Bazel quirk. It is extremely strict about the #include path # used for internal headers. We have our headers inside src/, but we #include # them as '#include "pcre2_internal.h"', assuming that src/ is added to the # compiler's include path. Unfortunately, that violates the conventions used by # Bazel. # # This is a workaround. 
Note that we can't use the "includes = [...]" property # to add the src/ directory, since that pollutes the include path for projects # depending on PCRE2 (we must not make our config.h file visible to consumers # of PCRE2). cc_library( name = "pcre2_internal_headers", hdrs = [ "src/pcre2_compile.h", "src/pcre2_internal.h", "src/pcre2_intmodedep.h", "src/pcre2_jit_match_inc.h", "src/pcre2_jit_misc_inc.h", "src/pcre2_printint_inc.h", "src/pcre2_ucp.h", "src/pcre2_ucptables_inc.h", "src/pcre2_util.h", "src/pcre2test_inc.h", ":config_h_generic", ], strip_include_prefix = "src", visibility = ["//visibility:private"], ) cc_library( name = "pcre2", srcs = [ "src/pcre2_auto_possess.c", "src/pcre2_chkdint.c", "src/pcre2_compile.c", "src/pcre2_compile_cgroup.c", "src/pcre2_compile_class.c", "src/pcre2_config.c", "src/pcre2_context.c", "src/pcre2_convert.c", "src/pcre2_dfa_match.c", "src/pcre2_error.c", "src/pcre2_extuni.c", "src/pcre2_find_bracket.c", "src/pcre2_jit_compile.c", "src/pcre2_maketables.c", "src/pcre2_match.c", "src/pcre2_match_data.c", "src/pcre2_match_next.c", "src/pcre2_newline.c", "src/pcre2_ord2utf.c", "src/pcre2_pattern_info.c", "src/pcre2_script_run.c", "src/pcre2_serialize.c", "src/pcre2_string_utils.c", "src/pcre2_study.c", "src/pcre2_substitute.c", "src/pcre2_substring.c", "src/pcre2_tables.c", "src/pcre2_ucd.c", "src/pcre2_valid_utf.c", "src/pcre2_xclass.c", ":pcre2_chartables_c", ], hdrs = [":pcre2_h_generic"], implementation_deps = [":pcre2_internal_headers"], defines = [ "PCRE2_STATIC", ], local_defines = LOCAL_DEFINES + [ "PCRE2_CODE_UNIT_WIDTH=8", ], strip_include_prefix = "src", linkstatic = True, visibility = ["//visibility:public"], ) # See below for explanation of why we need this.
# # https://github.com/bazelbuild/bazel/issues/680 cc_library( name = "pcre2posix_dotc_headers", hdrs = [ "src/pcre2_tables.c", ], strip_include_prefix = "src", visibility = ["//visibility:private"], ) cc_library( name = "pcre2-posix", srcs = ["src/pcre2posix.c"], hdrs = ["src/pcre2posix.h"], implementation_deps = [ ":pcre2_internal_headers", ":pcre2posix_dotc_headers", ], local_defines = LOCAL_DEFINES + [ "PCRE2_CODE_UNIT_WIDTH=8", ], strip_include_prefix = "src", linkstatic = True, visibility = ["//visibility:public"], deps = [ ":pcre2", ], ) # Totally weird issue in Bazel. It won't let you #include any files unless they # are declared to the build system. OK, fair enough. But - for a cc_binary it # uses the file extension to determine whether it's a header or a compilation # unit. But... we have several .c files which are #included, rather than treated # as a compilation unit. # # For cc_library() above, we can overcome this with textual_hdrs. But that # doesn't work for cc_binary(). Here's our workaround. 
# # https://github.com/bazelbuild/bazel/issues/680 cc_library( name = "pcre2test_dotc_headers", hdrs = [ "src/pcre2_chkdint.c", "src/pcre2_tables.c", "src/pcre2_ucd.c", "src/pcre2_valid_utf.c", ], strip_include_prefix = "src", visibility = ["//visibility:private"], ) cc_binary( name = "pcre2test", srcs = ["src/pcre2test.c"], linkopts = select({ "@platforms//os:windows": ["-STACK:2500000"], "//conditions:default": [], }), local_defines = LOCAL_DEFINES, visibility = ["//visibility:public"], deps = [ ":pcre2", ":pcre2-posix", ":pcre2_internal_headers", ":pcre2test_dotc_headers", ], ) filegroup( name = "testdata", srcs = glob(["testdata/*"]), ) native_test( name = "pcre2_test", size = "small", src = select({ "@platforms//os:windows": "RunTest.bat", "//conditions:default": "RunTest", }), out = select({ "@platforms//os:windows": "RunTest.bat", "//conditions:default": "RunTest", }), data = [ ":pcre2test", ":testdata", ], ) ================================================ FILE: CMakeLists.txt ================================================ # CMakeLists.txt # # This file enables PCRE2 to be built with the CMake configuration and build # tool. Download CMake in source or binary form from http://www.cmake.org/ # Converted to support PCRE2 from the original PCRE file, August 2014. # # Original listfile by Christian Ehrlicher # Refined and expanded by Daniel Richard G. # 2007-09-14 mod by Sheri so 7.4 supported configuration options can be entered # 2007-09-19 Adjusted by PH to retain previous default settings # 2007-12-26 (a) On UNIX, use names libpcre instead of just pcre # (b) Ensure pcretest and pcregrep link with the local library, # not a previously-installed one. # (c) Add PCRE_SUPPORT_LIBREADLINE, PCRE_SUPPORT_LIBZ, and # PCRE_SUPPORT_LIBBZ2. # 2008-01-20 Brought up to date to include several new features by Christian # Ehrlicher. 
# 2008-01-22 Sheri added options for backward compatibility of library names # when building with MinGW: # if "ON", NON_STANDARD_LIB_PREFIX causes shared libraries to # be built without "lib" as prefix. (The libraries will be named # pcre.dll, pcreposix.dll and pcrecpp.dll). # if "ON", NON_STANDARD_LIB_SUFFIX causes shared libraries to # be built with suffix of "-0.dll". (The libraries will be named # libpcre-0.dll, libpcreposix-0.dll and libpcrecpp-0.dll - same names # built by default with Configure and Make). # 2008-01-23 PH removed the automatic build of pcredemo. # 2008-04-22 PH modified READLINE support so it finds NCURSES when needed. # 2008-07-03 PH updated for revised UCP property support (change of files) # 2009-03-23 PH applied Steven Van Ingelgem's patch to change the name # CMAKE_BINARY_DIR to PROJECT_BINARY_DIR so that it works when PCRE # is included within another project. # 2009-03-23 PH applied a modified version of Steven Van Ingelgem's patches to # add options to stop the building of pcregrep and the tests, and # to disable the final configuration report. # 2009-04-11 PH applied Christian Ehrlicher's patch to show compiler flags that # are set by specifying a release type.
# 2010-01-02 PH added test for stdint.h # 2010-03-02 PH added test for inttypes.h # 2011-08-01 PH added PCREGREP_BUFSIZE # 2011-08-22 PH added PCRE_SUPPORT_JIT # 2011-09-06 PH modified WIN32 ADD_TEST line as suggested by Sergey Cherepanov # 2011-09-06 PH added PCRE_SUPPORT_PCREGREP_JIT # 2011-10-04 Sheri added support for including coff data in windows shared libraries # compiled with MINGW if pcre.rc and/or pcreposix.rc are placed in # the source dir by the user prior to building # 2011-10-04 Sheri changed various add_test's to use exes' location built instead # of DEBUG location only (likely only matters in MSVC) # 2011-10-04 Sheri added scripts to provide needed variables to RunTest and # RunGrepTest (used for UNIX and Msys) # 2011-10-04 Sheri added scripts to provide needed variables and to execute # RunTest.bat in Win32 (for effortless testing with "make test") # 2011-10-04 Sheri Increased minimum required cmake version # 2012-01-06 PH removed pcre_info.c and added pcre_string_utils.c # 2012-01-10 Zoltan Herczeg added libpcre16 support # 2012-01-13 Stephen Kelly added out of source build support # 2012-01-17 PH applied Stephen Kelly's patch to parse the version data out # of the configure.ac file # 2012-02-26 PH added support for libedit # 2012-09-06 PH added support for PCRE_EBCDIC_NL25 # 2012-09-08 ChPe added PCRE32 support # 2012-10-23 PH added support for VALGRIND and GCOV # 2012-12-08 PH added patch from Daniel Richard G to quash some MSVC warnings # 2013-07-01 PH realized that the "support" for GCOV was a total nonsense and # so it has been removed. # 2013-10-08 PH got rid of the "source" command, which is a bash-ism (use ".") # 2013-11-05 PH added support for PARENS_NEST_LIMIT # 2014-08-29 PH converted the file for PCRE2 (which has no C++). 
# 2015-04-24 PH added support for PCRE2_DEBUG # 2015-07-16 PH updated for new pcre2_find_bracket source module # 2015-08-24 PH correct C_FLAGS setting (patch from Roy Ivy III) # 2015-10-16 PH added support for never-backslash-C # 2016-03-01 PH applied Chris Wilson's patch for MSVC static # 2016-06-24 PH applied Chris Wilson's second patch, putting the first under # a new option instead of being unconditional. # 2016-10-05 PH fixed a typo (PCRE should be PCRE2) in above patch # fix by David Gaussmann # 2016-10-07 PH added PCREGREP_MAX_BUFSIZE # 2017-03-11 PH turned HEAP_MATCH_RECURSE into a NO-OP for 10.30 # 2017-04-08 PH added HEAP_LIMIT # 2017-06-15 ZH added SUPPORT_JIT_SEALLOC support # 2018-06-19 PH added checks for stdint.h and inttypes.h (later removed) # 2018-06-27 PH added Daniel's patch to increase the stack for MSVC # 2018-11-14 PH removed unnecessary checks for stdint.h and inttypes.h # 2018-11-16 PH added PCRE2GREP_SUPPORT_CALLOUT_FORK support and tidied # 2019-02-16 PH hacked to avoid CMP0026 policy issue (see comments below) # 2020-03-16 PH renamed dftables as pcre2_dftables (as elsewhere) # 2020-03-24 PH changed CMAKE_MODULE_PATH definition to add, not replace # 2020-04-08 Carlo added function check for secure_getenv, fixed strerror # 2020-04-16 enh added check for __attribute__((uninitialized)) # 2020-04-25 PH applied patches from Uwe Korn to support pkg-config and # library versioning. # 2020-04-25 Carlo added function check for mkostemp used in ProtExecAllocator # 2020-04-28 PH added function check for memfd_create based on Carlo's patch # 2020-05-25 PH added a check for Intel CET # 2020-12-03 PH altered the definition of pcre2test as suggested by Daniel # 2021-06-29 JWSB added the option to build static library with PIC. # 2021-07-05 JWSB modified such that both the static and shared libraries can be # built in one go.
# 2021-08-28 PH increased minimum version # 2021-08-28 PH added test for realpath() # 2022-12-10 PH added support for pcre2posix_test # 2023-01-15 Carlo added C99 as the minimum required # 2023-08-06 PH added support for setting variable length lookbehind maximum # 2025-03-27 Theodore used standard CMake constructs to export the library's # targets. ################################################################################ # We have used `gersemi` for auto-formatting our CMake files. # Applied to all CMake files using: # > pip3 install gersemi # > gersemi --in-place --line-length 120 --indent 2 \ # --definitions cmake/*.cmake \ # -- ./CMakeLists.txt ./cmake/*.cmake ./cmake/*.cmake.in ################################################################################ message(STATUS "Using CMake version ${CMAKE_VERSION} (${CMAKE_COMMAND})") # Increased minimum to 3.15 to allow use of string(REPEAT). cmake_minimum_required(VERSION 3.15 FATAL_ERROR) project(PCRE2 C) set(CMAKE_C_STANDARD 99) set(CMAKE_C_STANDARD_REQUIRED TRUE) set(CMAKE_C_VISIBILITY_PRESET hidden) # Solaris-specific fix for "CMAKE_C_VISIBILITY_PRESET": this feature was only # recently added to CMake for the `cc` compiler (Oracle Developer Studio). The # CMake version from OpenCSW and Oracle's package repository is too old and # requires this fix. if( CMAKE_SYSTEM_NAME STREQUAL "SunOS" AND CMAKE_VERSION VERSION_LESS 3.31 AND CMAKE_C_COMPILER_ID STREQUAL "SunPro" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL 5.15 ) set(CMAKE_C_COMPILE_OPTIONS_VISIBILITY "-fvisibility=") endif() # The following policies have been set in the PCRE2 CMake file in the past. # Since we specify a minimum of CMake 3.15, these are no longer required. # cmake_policy(SET CMP0063 NEW) # cmake_policy(SET CMP0026 OLD) # cmake_policy(SET CMP0074 NEW) # For our modules in cmake/. This uses list(APPEND) rather than set() to allow # setting CMAKE_MODULE_PATH on the command line. 
list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake) # External packages find_package(BZip2) find_package(ZLIB) find_package(Readline) find_package(Editline) # Configuration checks include(CheckCSourceCompiles) include(CheckFunctionExists) include(CheckSymbolExists) include(CheckIncludeFile) include(CheckTypeSize) include(CMakePackageConfigHelpers) include(CMakePushCheckState) include(GNUInstallDirs) # for CMAKE_INSTALL_LIBDIR include(PCRE2CheckVscript) include(PCRE2UseSystemExtensions) include(PCRE2WarningAsError) check_include_file(assert.h HAVE_ASSERT_H) check_include_file(dirent.h HAVE_DIRENT_H) check_include_file(sys/stat.h HAVE_SYS_STAT_H) check_include_file(sys/types.h HAVE_SYS_TYPES_H) check_include_file(unistd.h HAVE_UNISTD_H) check_include_file(windows.h HAVE_WINDOWS_H) # Check whether any system-wide extensions need to be enabled, in order for # OS functionality to be exposed. pcre2_use_system_extensions() cmake_push_check_state(RESET) # Propagate the _GNU_SOURCE definition (added to COMPILE_DEFINITIONS by # pcre2_use_system_extensions()) so that these check_symbol_exists() calls # can find symbols only exposed with _GNU_SOURCE. # We only pass through known plain-text definitions (like _GNU_SOURCE) and # skip anything else (e.g. generator expressions inherited from a parent # project), because try_compile() cannot handle generator expressions. 
get_directory_property(_pcre2_compile_definitions COMPILE_DEFINITIONS) if(DEFINED _pcre2_compile_definitions) set(CMAKE_REQUIRED_DEFINITIONS "") list(FIND _pcre2_compile_definitions "_GNU_SOURCE" _idx) if(_idx GREATER -1) list(APPEND CMAKE_REQUIRED_DEFINITIONS "-D_GNU_SOURCE") endif() endif() check_symbol_exists(mkostemp stdlib.h HAVE_MKOSTEMP) # glibc 2.7 check_symbol_exists(memfd_create "sys/mman.h" HAVE_MEMFD_CREATE) # glibc 2.27 check_symbol_exists(secure_getenv "stdlib.h" HAVE_SECURE_GETENV) # glibc 2.17 check_symbol_exists(setrlimit "sys/resource.h" HAVE_SETRLIMIT) cmake_pop_check_state() check_c_source_compiles( [=[ #include #include int main(int c, char *v[]) { char buf[PATH_MAX]; realpath(v[c], buf); return 0; } ]=] HAVE_REALPATH ) # Many CMake tests use highly-dubious C source code which generates warnings # (for example, calling a function with the wrong signature just to see whether # the linker can find it). These couple of tests here though need to be compiled # with -Werror to be effective. cmake_push_check_state() if(NOT DEFINED CMAKE_REQUIRED_FLAGS) set(CMAKE_REQUIRED_FLAGS "") endif() pcre2_warning_as_error(WARNING_AS_ERROR_FLAGS) set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${WARNING_AS_ERROR_FLAGS}") check_c_source_compiles( "int main(void) { char buf[128] __attribute__((uninitialized)); (void)buf; return 0; }" HAVE_ATTRIBUTE_UNINITIALIZED ) if(NOT MSVC) check_c_source_compiles( [=[ extern __attribute__ ((visibility ("default"))) int f(void); int main(void) { return f(); } int f(void) { return 42; } ]=] HAVE_VISIBILITY ) endif() if(CMAKE_C_COMPILE_OPTIONS_VISIBILITY AND NOT HAVE_VISIBILITY) # Hmm. We don't know of any way to set a symbol to have default visibility, other # than the common "__attribute__((visibility("default")))". But CMake thinks that # the compiler supports CMAKE_C_VISIBILITY_PRESET. The build is likely to fail to # generate a usable shared library. 
message( WARNING "C compiler uses the visibility flag ${CMAKE_C_COMPILE_OPTIONS_VISIBILITY}. We did not detect support for __attribute__((visibility(\"default\"))). We do not know how to set a symbol to have default visibility." ) elseif(HAVE_VISIBILITY AND NOT CMAKE_C_COMPILE_OPTIONS_VISIBILITY) # Here, the build may well work, but it's possible that CMake isn't correctly # suppressing the visibility of private symbols. message( WARNING "C compiler appears to support __attribute__((visibility(\"default\"))). However, CMake is not using a visibility flag. The visibility of private symbols may not be correct." ) endif() if(HAVE_VISIBILITY) set(PCRE2_EXPORT [=[__attribute__ ((visibility ("default")))]=]) else() set(PCRE2_EXPORT) endif() cmake_pop_check_state() check_c_source_compiles("int main(void) { __assume(1); return 0; }" HAVE_BUILTIN_ASSUME) check_c_source_compiles( [=[ #include int main(void) { int a,b; size_t m; __builtin_mul_overflow(a,b,&m); return 0; } ]=] HAVE_BUILTIN_MUL_OVERFLOW ) check_c_source_compiles( "int main(int c, char *v[]) { if (c) __builtin_unreachable(); return (int)(*v[0]); }" HAVE_BUILTIN_UNREACHABLE ) # Detect support for linker scripts. pcre2_check_vscript(HAVE_VSCRIPT VSCRIPT_FLAG HAVE_VSCRIPT_NO_STAR) # Check whether Intel CET is enabled, and if so, adjust compiler flags. This # code was written by PH, trying to imitate the logic from the autotools # configuration. check_c_source_compiles( [=[ #ifndef __CET__ #error CET is not enabled #endif int main() { return 0; } ]=] INTEL_CET_ENABLED ) if(INTEL_CET_ENABLED) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mshstk") endif() # User-configurable options # # Note: CMakeSetup displays these in alphabetical order, regardless of # the order we use here. set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build shared libraries.") option(BUILD_STATIC_LIBS "Build static libraries." 
ON) option(PCRE2_BUILD_PCRE2_8 "Build 8 bit PCRE2 library" ON) option(PCRE2_BUILD_PCRE2_16 "Build 16 bit PCRE2 library" OFF) option(PCRE2_BUILD_PCRE2_32 "Build 32 bit PCRE2 library" OFF) option(PCRE2_STATIC_PIC "Build the static library with the option position independent code enabled." OFF) set(PCRE2_DEBUG "IfDebugBuild" CACHE STRING "Include debugging code") set_property(CACHE PCRE2_DEBUG PROPERTY STRINGS "IfDebugBuild" "ON" "OFF") option(PCRE2_DISABLE_PERCENT_ZT "Disable the use of %zu and %td (rarely needed)" OFF) set( PCRE2_EBCDIC OFF CACHE BOOL "Use EBCDIC coding instead of ASCII. (This is rarely used outside of mainframe systems.)" ) set(PCRE2_EBCDIC_NL25 OFF CACHE BOOL "Use 0x25 as EBCDIC NL character instead of 0x15; implies EBCDIC.") set( PCRE2_EBCDIC_IGNORING_COMPILER OFF CACHE BOOL "Force EBCDIC 1047 using numeric literals rather than C character literals; implies EBCDIC." ) option(PCRE2_REBUILD_CHARTABLES "Rebuild char tables" OFF) set( PCRE2_LINK_SIZE "2" CACHE STRING "Internal link size (2, 3 or 4 allowed). See LINK_SIZE in config.h.in for details." ) set( PCRE2_PARENS_NEST_LIMIT "250" CACHE STRING "Default nested parentheses limit. See PARENS_NEST_LIMIT in config.h.in for details." ) set( PCRE2_HEAP_LIMIT "20000000" CACHE STRING "Default limit on heap memory (kibibytes). See HEAP_LIMIT in config.h.in for details." ) set(PCRE2_MAX_VARLOOKBEHIND "255" CACHE STRING "Default limit on variable lookbehinds.") set( PCRE2_MATCH_LIMIT "10000000" CACHE STRING "Default limit on internal looping. See MATCH_LIMIT in config.h.in for details." ) set( PCRE2_MATCH_LIMIT_DEPTH "MATCH_LIMIT" CACHE STRING "Default limit on internal depth of search. See MATCH_LIMIT_DEPTH in config.h.in for details." ) set( PCRE2GREP_BUFSIZE "20480" CACHE STRING "Buffer starting size parameter for pcre2grep. See PCRE2GREP_BUFSIZE in config.h.in for details." ) set( PCRE2GREP_MAX_BUFSIZE "1048576" CACHE STRING "Buffer maximum size parameter for pcre2grep. 
See PCRE2GREP_MAX_BUFSIZE in config.h.in for details." ) set(PCRE2_NEWLINE "LF" CACHE STRING "What to recognize as a newline (one of CR, LF, CRLF, ANY, ANYCRLF, NUL).") set(PCRE2_HEAP_MATCH_RECURSE OFF CACHE BOOL "Obsolete option: do not use") mark_as_advanced(PCRE2_HEAP_MATCH_RECURSE) set(PCRE2_SUPPORT_JIT OFF CACHE BOOL "Enable support for Just-in-time compiling.") if(${CMAKE_SYSTEM_NAME} MATCHES Linux|NetBSD) set(PCRE2_SUPPORT_JIT_SEALLOC OFF CACHE BOOL "Enable SELinux compatible execmem allocator in JIT (experimental).") else() set(PCRE2_SUPPORT_JIT_SEALLOC IGNORE) endif() set(PCRE2GREP_SUPPORT_JIT ON CACHE BOOL "Enable use of Just-in-time compiling in pcre2grep.") set(PCRE2GREP_SUPPORT_CALLOUT ON CACHE BOOL "Enable callout string support in pcre2grep.") set(PCRE2GREP_SUPPORT_CALLOUT_FORK ON CACHE BOOL "Enable callout string fork support in pcre2grep.") set(PCRE2_SUPPORT_UNICODE ON CACHE BOOL "Enable support for Unicode and UTF-8/UTF-16/UTF-32 encoding.") set( PCRE2_SUPPORT_BSR_ANYCRLF OFF CACHE BOOL "ON=Backslash-R matches only LF CR and CRLF, OFF=Backslash-R matches all Unicode Linebreaks" ) set(PCRE2_NEVER_BACKSLASH_C OFF CACHE BOOL "If ON, backslash-C (upper case C) is locked out.") set(PCRE2_SUPPORT_VALGRIND OFF CACHE BOOL "Enable Valgrind support.") option(PCRE2_SHOW_REPORT "Show the final configuration report" ON) option(PCRE2_BUILD_PCRE2GREP "Build pcre2grep" ON) option(PCRE2_BUILD_TESTS "Build the tests" ON) set( PCRE2_INSTALL_CMAKEDIR "${CMAKE_INSTALL_LIBDIR}/cmake/pcre2" CACHE STRING "Path used during CMake install for placing PCRE2's CMake config files, relative to the installation root (prefix)" ) if(MINGW) option( NON_STANDARD_LIB_PREFIX "ON=Shared libraries built in mingw will be named pcre2.dll, etc., instead of libpcre2.dll, etc." OFF ) option( NON_STANDARD_LIB_SUFFIX "ON=Shared libraries built in mingw will be named libpcre2-0.dll, etc., instead of libpcre2.dll, etc." 
OFF ) endif() if(MSVC) option(INSTALL_MSVC_PDB "ON=Install .pdb files built by MSVC, if generated" OFF) endif() if(HAVE_VSCRIPT) option(PCRE2_SYMVERS "Enable library symbol versioning" ON) endif() # bzip2 lib if(BZip2_FOUND) option(PCRE2_SUPPORT_LIBBZ2 "Enable support for linking pcre2grep with libbz2." ON) endif() # zlib if(ZLIB_FOUND) option(PCRE2_SUPPORT_LIBZ "Enable support for linking pcre2grep with libz." ON) endif() # editline lib if(Editline_FOUND) option(PCRE2_SUPPORT_LIBEDIT "Enable support for linking pcre2test with libedit." OFF) endif() # readline lib if(Readline_FOUND) option(PCRE2_SUPPORT_LIBREADLINE "Enable support for linking pcre2test with libreadline." ON) endif() # Prepare build configuration include_directories(${PROJECT_BINARY_DIR}/interface ${PROJECT_BINARY_DIR}/src ${PROJECT_SOURCE_DIR}/src) if(NOT BUILD_SHARED_LIBS AND NOT BUILD_STATIC_LIBS) message(FATAL_ERROR "At least one of BUILD_SHARED_LIBS or BUILD_STATIC_LIBS must be enabled.") endif() if(NOT PCRE2_BUILD_PCRE2_8 AND NOT PCRE2_BUILD_PCRE2_16 AND NOT PCRE2_BUILD_PCRE2_32) message( FATAL_ERROR "At least one of PCRE2_BUILD_PCRE2_8, PCRE2_BUILD_PCRE2_16 or PCRE2_BUILD_PCRE2_32 must be enabled" ) endif() if(PCRE2_BUILD_PCRE2_8) set(SUPPORT_PCRE2_8 1) endif() if(PCRE2_BUILD_PCRE2_16) set(SUPPORT_PCRE2_16 1) endif() if(PCRE2_BUILD_PCRE2_32) set(SUPPORT_PCRE2_32 1) endif() if(PCRE2_BUILD_PCRE2GREP AND NOT PCRE2_BUILD_PCRE2_8) message(STATUS "** PCRE2_BUILD_PCRE2_8 must be enabled for the pcre2grep program") set(PCRE2_BUILD_PCRE2GREP OFF) endif() if(PCRE2_SUPPORT_LIBBZ2) if(BZip2_FOUND) include_directories(${BZIP2_INCLUDE_DIRS}) else() message( FATAL_ERROR " libbz2 not found. Set BZIP2_INCLUDE_DIRS to a compatible header\n" " or set BZip2_ROOT to a full bzip2 installed tree, as needed." ) endif() endif() if(PCRE2_SUPPORT_LIBZ) if(ZLIB_FOUND) include_directories(${ZLIB_INCLUDE_DIRS}) else() message( FATAL_ERROR " zlib not found. 
Set ZLIB_INCLUDE_DIRS to a compatible header\n" " or set ZLIB_ROOT to a full zlib installed tree, as needed." ) endif() endif() if(PCRE2_SUPPORT_LIBEDIT) if(Editline_FOUND) include_directories(${EDITLINE_INCLUDE_DIRS}) else() message( FATAL_ERROR " libedit not found. Set EDITLINE_INCLUDE_DIRS to a compatible header\n" " or set Editline_ROOT to a full libedit installed tree, as needed." ) endif() endif() if(PCRE2_SUPPORT_LIBREADLINE) if(Readline_FOUND) include_directories(${READLINE_INCLUDE_DIRS}) else() message( FATAL_ERROR " libreadline not found. Set READLINE_INCLUDE_DIRS to a compatible header\n" " or set Readline_ROOT to a full libreadline installed tree, as needed." ) endif() endif() if(PCRE2_SUPPORT_LIBREADLINE AND PCRE2_SUPPORT_LIBEDIT) if(Readline_FOUND) message( FATAL_ERROR " Only one of the readline compatible libraries can be enabled.\n" " Disable libreadline with -DPCRE2_SUPPORT_LIBREADLINE=OFF" ) endif() endif() if(PCRE2_SUPPORT_BSR_ANYCRLF) set(BSR_ANYCRLF 1) endif() if(PCRE2_NEVER_BACKSLASH_C) set(NEVER_BACKSLASH_C 1) endif() if(PCRE2_SUPPORT_UNICODE) set(SUPPORT_UNICODE 1) endif() if(PCRE2_SUPPORT_JIT) set(SUPPORT_JIT 1) if(UNIX) find_package(Threads REQUIRED) if(CMAKE_USE_PTHREADS_INIT) set(REQUIRE_PTHREAD 1) endif() endif() endif() if(PCRE2_SUPPORT_JIT_SEALLOC) set(SLJIT_PROT_EXECUTABLE_ALLOCATOR 1) endif() if(PCRE2GREP_SUPPORT_JIT) set(SUPPORT_PCRE2GREP_JIT 1) endif() if(PCRE2GREP_SUPPORT_CALLOUT) set(SUPPORT_PCRE2GREP_CALLOUT 1) if(PCRE2GREP_SUPPORT_CALLOUT_FORK) set(SUPPORT_PCRE2GREP_CALLOUT_FORK 1) endif() endif() if(PCRE2_SUPPORT_VALGRIND) set(SUPPORT_VALGRIND 1) endif() if(PCRE2_DISABLE_PERCENT_ZT) set(DISABLE_PERCENT_ZT 1) endif() set(PCRE2TEST_LIBS) set(PCRE2GREP_LIBS) if(PCRE2_SUPPORT_LIBREADLINE) set(SUPPORT_LIBREADLINE 1) list(APPEND PCRE2TEST_LIBS ${READLINE_LIBRARIES}) endif() # libedit is a plug-compatible alternative to libreadline if(PCRE2_SUPPORT_LIBEDIT) set(SUPPORT_LIBEDIT 1) list(APPEND PCRE2TEST_LIBS ${EDITLINE_LIBRARIES}) 
endif() # closes the if(PCRE2_SUPPORT_LIBEDIT) block opened just above

# Optional compression libraries: record the SUPPORT_* flag and add the
# library to the pcre2grep link list.
if(PCRE2_SUPPORT_LIBZ)
  set(SUPPORT_LIBZ 1)
  list(APPEND PCRE2GREP_LIBS ${ZLIB_LIBRARIES})
endif()

if(PCRE2_SUPPORT_LIBBZ2)
  set(SUPPORT_LIBBZ2 1)
  list(APPEND PCRE2GREP_LIBS ${BZIP2_LIBRARIES})
endif()

# Translate the symbolic PCRE2_NEWLINE name into the numeric NEWLINE_DEFAULT
# code. An unrecognised name leaves NEWLINE_DEFAULT empty, which is rejected
# below.
set(NEWLINE_DEFAULT "")
if(PCRE2_NEWLINE STREQUAL "CR")
  set(NEWLINE_DEFAULT "1")
endif()
if(PCRE2_NEWLINE STREQUAL "LF")
  set(NEWLINE_DEFAULT "2")
endif()
if(PCRE2_NEWLINE STREQUAL "CRLF")
  set(NEWLINE_DEFAULT "3")
endif()
if(PCRE2_NEWLINE STREQUAL "ANY")
  set(NEWLINE_DEFAULT "4")
endif()
if(PCRE2_NEWLINE STREQUAL "ANYCRLF")
  set(NEWLINE_DEFAULT "5")
endif()
if(PCRE2_NEWLINE STREQUAL "NUL")
  set(NEWLINE_DEFAULT "6")
endif()

if(NEWLINE_DEFAULT STREQUAL "")
  # The message must name every value accepted above, including "NUL".
  message(
    FATAL_ERROR
    "The PCRE2_NEWLINE variable must be set to one of the following values: \"LF\", \"CR\", \"CRLF\", \"ANY\", \"ANYCRLF\", \"NUL\"."
  )
endif()

# Derive the internal REBUILD_CHARTABLES / EBCDIC* flags from the user-facing
# PCRE2_* options. Either of the two EBCDIC sub-options switches EBCDIC on by
# itself.
set(REBUILD_CHARTABLES OFF)
if(PCRE2_REBUILD_CHARTABLES)
  set(REBUILD_CHARTABLES ON)
endif()

set(EBCDIC OFF)
if(PCRE2_EBCDIC)
  set(EBCDIC ON)
endif()

if(PCRE2_EBCDIC_NL25)
  set(EBCDIC ON)
  set(EBCDIC_NL25 ON)
endif()

if(PCRE2_EBCDIC_IGNORING_COMPILER)
  set(EBCDIC ON)
  set(EBCDIC_IGNORING_COMPILER ON)
endif()

# Make sure that if EBCDIC is set (without EBCDIC_IGNORING_COMPILER), then
# REBUILD_CHARTABLES is also enabled.
# Also check that UTF support is not requested, because PCRE2 cannot handle
# EBCDIC and UTF in the same build. To do so it would need to use different
# character constants depending on the mode.
# Also, EBCDIC cannot be used with 16-bit and 32-bit libraries.
if(EBCDIC)
  if(NOT EBCDIC_IGNORING_COMPILER)
    set(REBUILD_CHARTABLES ON)
  endif()
  if(PCRE2_SUPPORT_UNICODE)
    message(FATAL_ERROR "Support for EBCDIC and Unicode cannot be enabled at the same time")
  endif()
  if(PCRE2_BUILD_PCRE2_16 OR PCRE2_BUILD_PCRE2_32)
    message(FATAL_ERROR "EBCDIC support is available only for the 8-bit library")
  endif()
endif()

# Remove the Autotools-generated copy of config.h and pcre2.h.
# I sometimes switch between Autoconf and CMake, and the conflicting copies of
# config.h generate some very unintuitive build errors.
file(REMOVE ${PROJECT_SOURCE_DIR}/src/config.h)
file(REMOVE ${PROJECT_SOURCE_DIR}/src/pcre2.h)

# Parse version numbers and date out of configure.ac
file(
  STRINGS
  ${PROJECT_SOURCE_DIR}/configure.ac
  configure_lines
  LIMIT_COUNT 50 # Read only the first 50 lines of the file
)

# Names of the m4_define() entries to extract. Each one is stored in a
# variable with the upper-cased name (e.g. pcre2_major -> PCRE2_MAJOR).
set(
  SEARCHED_VARIABLES
  "pcre2_major"
  "pcre2_minor"
  "pcre2_prerelease"
  "pcre2_date"
  "libpcre2_posix_version"
  "libpcre2_8_version"
  "libpcre2_16_version"
  "libpcre2_32_version"
)

foreach(configure_line ${configure_lines})
  foreach(substitution_variable ${SEARCHED_VARIABLES})
    string(TOUPPER ${substitution_variable} substitution_variable_upper)
    # Only the first matching line wins: once the variable is defined, later
    # lines are ignored for it.
    if(NOT DEFINED ${substitution_variable_upper})
      if("${configure_line}" MATCHES "m4_define\\(${substitution_variable}, *\\[(.*)\\]")
        set(${substitution_variable_upper} "${CMAKE_MATCH_1}")
      endif()
    endif()
  endforeach()
endforeach()

# PARSE_LIB_VERSION(<prefix>)
#
# Splits the colon-separated "current:revision:age" triple held in
# <prefix>_VERSION and sets, in the caller's scope:
#   <prefix>_VERSION_LIST                 the triple as a CMake list
#   <prefix>_VERSION_CURRENT / _REVISION / _AGE
#   <prefix>_SOVERSION                    current - age
#   <prefix>_MACHO_COMPATIBILITY_VERSION  current + 1
#   <prefix>_MACHO_CURRENT_VERSION        (current + 1).revision
#   <prefix>_VERSION                      overwritten with soversion.age.revision
macro(PARSE_LIB_VERSION variable_prefix)
  string(REPLACE ":" ";" ${variable_prefix}_VERSION_LIST ${${variable_prefix}_VERSION})
  list(GET ${variable_prefix}_VERSION_LIST 0 ${variable_prefix}_VERSION_CURRENT)
  list(GET ${variable_prefix}_VERSION_LIST 1 ${variable_prefix}_VERSION_REVISION)
  list(GET ${variable_prefix}_VERSION_LIST 2 ${variable_prefix}_VERSION_AGE)

  math(EXPR ${variable_prefix}_SOVERSION "${${variable_prefix}_VERSION_CURRENT} - ${${variable_prefix}_VERSION_AGE}")
  math(EXPR ${variable_prefix}_MACHO_COMPATIBILITY_VERSION "${${variable_prefix}_VERSION_CURRENT} + 1")
  math(EXPR ${variable_prefix}_MACHO_CURRENT_VERSION "${${variable_prefix}_VERSION_CURRENT} + 1")

  # Append the revision. (A stray trailing "}" used to end up in this value,
  # giving a malformed version such as "N.R}".)
  set(
    ${variable_prefix}_MACHO_CURRENT_VERSION
    "${${variable_prefix}_MACHO_CURRENT_VERSION}.${${variable_prefix}_VERSION_REVISION}"
  )
  set(
    ${variable_prefix}_VERSION
    "${${variable_prefix}_SOVERSION}.${${variable_prefix}_VERSION_AGE}.${${variable_prefix}_VERSION_REVISION}"
  )
endmacro()

parse_lib_version(LIBPCRE2_POSIX)
parse_lib_version(LIBPCRE2_8)
parse_lib_version(LIBPCRE2_16)
parse_lib_version(LIBPCRE2_32)

# Output files

configure_file(src/config-cmake.h.in ${PROJECT_BINARY_DIR}/src/config.h @ONLY)
configure_file(src/pcre2.h.in ${PROJECT_BINARY_DIR}/interface/pcre2.h @ONLY)
configure_file(src/pcre2posix.h ${PROJECT_BINARY_DIR}/interface/pcre2posix.h COPYONLY)

# Configure the version script files.
if(HAVE_VSCRIPT AND PCRE2_SYMVERS)
  # PCRE2_EXTRA_LOCAL_SYMS is presumably substituted into the .sym.in
  # templates; it is left empty when HAVE_VSCRIPT_NO_STAR is set.
  if(HAVE_VSCRIPT_NO_STAR)
    set(PCRE2_EXTRA_LOCAL_SYMS "")
  else()
    set(PCRE2_EXTRA_LOCAL_SYMS " local: *;")
  endif()
  configure_file(src/libpcre2-8.sym.in ${PROJECT_BINARY_DIR}/src/libpcre2-8.sym @ONLY)
  configure_file(src/libpcre2-16.sym.in ${PROJECT_BINARY_DIR}/src/libpcre2-16.sym @ONLY)
  configure_file(src/libpcre2-32.sym.in ${PROJECT_BINARY_DIR}/src/libpcre2-32.sym @ONLY)
  configure_file(src/libpcre2-posix.sym.in ${PROJECT_BINARY_DIR}/src/libpcre2-posix.sym @ONLY)
endif()

# Character table generation
#
# Either build and run pcre2_dftables to generate pcre2_chartables.c, or copy
# one of the pre-built tables: the ASCII ".dist" file for non-EBCDIC builds,
# otherwise the EBCDIC 1047 table matching the chosen NL code point.

if(REBUILD_CHARTABLES)
  add_executable(pcre2_dftables src/pcre2_dftables.c)
  add_custom_command(
    OUTPUT ${PROJECT_BINARY_DIR}/src/pcre2_chartables.c
    COMMAND pcre2_dftables
    ARGS ${PROJECT_BINARY_DIR}/src/pcre2_chartables.c
    DEPENDS pcre2_dftables
    COMMENT "Generating character tables (pcre2_chartables.c) for current locale"
    VERBATIM
  )
elseif(NOT EBCDIC)
  # Test the derived EBCDIC flag rather than PCRE2_EBCDIC: the
  # PCRE2_EBCDIC_NL25 and PCRE2_EBCDIC_IGNORING_COMPILER options each imply
  # EBCDIC without PCRE2_EBCDIC itself being set, and such a build must not
  # receive the ASCII tables.
  configure_file(
    ${PROJECT_SOURCE_DIR}/src/pcre2_chartables.c.dist
    ${PROJECT_BINARY_DIR}/src/pcre2_chartables.c
    COPYONLY
  )
elseif(PCRE2_EBCDIC_NL25)
  configure_file(
    ${PROJECT_SOURCE_DIR}/src/pcre2_chartables.c.ebcdic-1047-nl25
    ${PROJECT_BINARY_DIR}/src/pcre2_chartables.c
    COPYONLY
  )
else()
  configure_file(
    ${PROJECT_SOURCE_DIR}/src/pcre2_chartables.c.ebcdic-1047-nl15
    ${PROJECT_BINARY_DIR}/src/pcre2_chartables.c
    COPYONLY
  )
endif()

# Source code

set(PCRE2_HEADERS ${PROJECT_BINARY_DIR}/interface/pcre2.h)

set(
  PCRE2_SOURCES
  src/pcre2_auto_possess.c
  ${PROJECT_BINARY_DIR}/src/pcre2_chartables.c
  src/pcre2_chkdint.c
  src/pcre2_compile.c
  src/pcre2_compile_cgroup.c
  src/pcre2_compile_class.c
  src/pcre2_config.c
  src/pcre2_context.c
  src/pcre2_convert.c
  src/pcre2_dfa_match.c
  src/pcre2_error.c
  src/pcre2_extuni.c
  src/pcre2_find_bracket.c
  src/pcre2_jit_compile.c
  src/pcre2_maketables.c
  src/pcre2_match.c
  src/pcre2_match_data.c
  src/pcre2_match_next.c
  src/pcre2_newline.c
  src/pcre2_ord2utf.c
  src/pcre2_pattern_info.c
  src/pcre2_script_run.c
  src/pcre2_serialize.c
  src/pcre2_string_utils.c
  src/pcre2_study.c
  src/pcre2_substitute.c
  src/pcre2_substring.c
  src/pcre2_tables.c
  src/pcre2_ucd.c
  src/pcre2_valid_utf.c
  src/pcre2_xclass.c
)

set(PCRE2POSIX_HEADERS ${PROJECT_BINARY_DIR}/interface/pcre2posix.h)
set(PCRE2POSIX_SOURCES src/pcre2posix.c)

# MinGW shared builds: if the user has dropped pcre2.rc / pcre2posix.rc into
# the source directory, compile them with windres and link the resulting
# object into the corresponding library.
# NOTE(review): "PRE-LINK" is not a keyword of the OUTPUT form of
# add_custom_command(), so it looks like it is parsed as an additional output
# name - confirm the intent before changing it.
if(MINGW AND BUILD_SHARED_LIBS)
  if(EXISTS ${PROJECT_SOURCE_DIR}/pcre2.rc)
    add_custom_command(
      OUTPUT ${PROJECT_SOURCE_DIR}/pcre2.o
      PRE-LINK
      COMMAND windres
      ARGS pcre2.rc pcre2.o
      WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
      COMMENT "Using pcre2 coff info in mingw build"
    )
    list(APPEND PCRE2_SOURCES ${PROJECT_SOURCE_DIR}/pcre2.o)
  endif()

  if(EXISTS ${PROJECT_SOURCE_DIR}/pcre2posix.rc)
    add_custom_command(
      OUTPUT ${PROJECT_SOURCE_DIR}/pcre2posix.o
      PRE-LINK
      COMMAND windres
      ARGS pcre2posix.rc pcre2posix.o
      WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
      COMMENT "Using pcre2posix coff info in mingw build"
    )
    list(APPEND PCRE2POSIX_SOURCES ${PROJECT_SOURCE_DIR}/pcre2posix.o)
  endif()
endif()

# MSVC shared builds: the .rc files, when present, are added to the source
# lists directly.
if(MSVC AND BUILD_SHARED_LIBS)
  if(EXISTS ${PROJECT_SOURCE_DIR}/pcre2.rc)
    list(APPEND PCRE2_SOURCES pcre2.rc)
  endif()

  if(EXISTS ${PROJECT_SOURCE_DIR}/pcre2posix.rc)
    list(APPEND PCRE2POSIX_SOURCES pcre2posix.rc)
  endif()
endif()

# Build setup

add_compile_definitions(HAVE_CONFIG_H)

# PCRE2_DEBUG is a tri-state cache string: "IfDebugBuild", or a true/false
# value.
# NOTE(review): the generator expression in the first branch appears
# truncated in this copy of the file (the condition inside "$<...>" is
# missing); it is reproduced unchanged - confirm against the repository.
if(PCRE2_DEBUG STREQUAL "IfDebugBuild")
  add_compile_definitions("$<$:PCRE2_DEBUG>")
elseif(PCRE2_DEBUG)
  add_compile_definitions("PCRE2_DEBUG")
endif()

if(MSVC)
  add_compile_definitions(_CRT_SECURE_NO_DEPRECATE _CRT_SECURE_NO_WARNINGS)
endif()

# Make sure to not link debug libs against release libs and vice versa.
if(WIN32)
  set(CMAKE_DEBUG_POSTFIX "d")
endif()

# Accumulators filled in below: the targets handed to install(TARGETS ...),
# and the .pdb files that accompany the shared libraries.
set(TARGETS)
set(DLL_PDB_FILES)
set(DLL_PDB_DEBUG_FILES)

# Each code-unit width is built from the same PCRE2_SOURCES; only
# PCRE2_CODE_UNIT_WIDTH and the per-library version variables differ. Public
# headers are picked up from <build>/interface while building and from the
# install include directory once installed.

# 8-bit library (also provides the POSIX wrapper library)

if(PCRE2_BUILD_PCRE2_8)
  if(BUILD_STATIC_LIBS)
    add_library(pcre2-8-static STATIC ${PCRE2_HEADERS} ${PCRE2_SOURCES})
    set_target_properties(
      pcre2-8-static
      PROPERTIES
        COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=8
        MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_8_MACHO_COMPATIBILITY_VERSION}"
        MACHO_CURRENT_VERSION "${LIBPCRE2_8_MACHO_CURRENT_VERSION}"
        VERSION ${LIBPCRE2_8_VERSION}
        SOVERSION ${LIBPCRE2_8_SOVERSION}
    )
    # Consumers of the static library must see PCRE2_STATIC as well.
    target_compile_definitions(pcre2-8-static PUBLIC PCRE2_STATIC)
    target_include_directories(
      pcre2-8-static
      PUBLIC "$<BUILD_INTERFACE:${PROJECT_BINARY_DIR}/interface>" "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>"
    )
    if(REQUIRE_PTHREAD)
      target_link_libraries(pcre2-8-static Threads::Threads)
    endif()
    list(APPEND TARGETS pcre2-8-static)

    add_library(pcre2-posix-static STATIC ${PCRE2POSIX_HEADERS} ${PCRE2POSIX_SOURCES})
    set_target_properties(
      pcre2-posix-static
      PROPERTIES
        COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=8
        MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_POSIX_MACHO_COMPATIBILITY_VERSION}"
        MACHO_CURRENT_VERSION "${LIBPCRE2_POSIX_MACHO_CURRENT_VERSION}"
        VERSION ${LIBPCRE2_POSIX_VERSION}
        SOVERSION ${LIBPCRE2_POSIX_SOVERSION}
    )
    target_link_libraries(pcre2-posix-static pcre2-8-static)
    target_include_directories(
      pcre2-posix-static
      PUBLIC "$<BUILD_INTERFACE:${PROJECT_BINARY_DIR}/interface>" "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>"
    )
    list(APPEND TARGETS pcre2-posix-static)

    # With MSVC the static libraries keep the "-static" suffix in their file
    # names; elsewhere they use the plain library names.
    if(MSVC)
      set_target_properties(pcre2-8-static PROPERTIES OUTPUT_NAME pcre2-8-static)
      set_target_properties(pcre2-posix-static PROPERTIES OUTPUT_NAME pcre2-posix-static)
    else()
      set_target_properties(pcre2-8-static PROPERTIES OUTPUT_NAME pcre2-8)
      set_target_properties(pcre2-posix-static PROPERTIES OUTPUT_NAME pcre2-posix)
    endif()
    if(PCRE2_STATIC_PIC)
      set_target_properties(pcre2-8-static pcre2-posix-static PROPERTIES POSITION_INDEPENDENT_CODE 1)
    endif()
  endif()

  if(BUILD_SHARED_LIBS)
    add_library(pcre2-8-shared SHARED ${PCRE2_HEADERS} ${PCRE2_SOURCES})
    target_include_directories(
      pcre2-8-shared
      PUBLIC "$<BUILD_INTERFACE:${PROJECT_BINARY_DIR}/interface>" "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>"
    )
    set_target_properties(
      pcre2-8-shared
      PROPERTIES
        COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=8
        MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_8_MACHO_COMPATIBILITY_VERSION}"
        MACHO_CURRENT_VERSION "${LIBPCRE2_8_MACHO_CURRENT_VERSION}"
        VERSION ${LIBPCRE2_8_VERSION}
        SOVERSION ${LIBPCRE2_8_SOVERSION}
        OUTPUT_NAME pcre2-8
    )
    if(REQUIRE_PTHREAD)
      target_link_libraries(pcre2-8-shared Threads::Threads)
    endif()
    # Pass the symbol-version script to the linker, and relink when it changes.
    if(HAVE_VSCRIPT AND PCRE2_SYMVERS)
      target_link_options(pcre2-8-shared PRIVATE -Wl,${VSCRIPT_FLAG},${PROJECT_BINARY_DIR}/src/libpcre2-8.sym)
      set_target_properties(pcre2-8-shared PROPERTIES LINK_DEPENDS ${PROJECT_BINARY_DIR}/src/libpcre2-8.sym)
    endif()
    list(APPEND TARGETS pcre2-8-shared)
    list(APPEND DLL_PDB_FILES $<TARGET_PDB_FILE_DIR:pcre2-8-shared>/pcre2-8.pdb)
    list(APPEND DLL_PDB_DEBUG_FILES $<TARGET_PDB_FILE_DIR:pcre2-8-shared>/pcre2-8d.pdb)

    add_library(pcre2-posix-shared SHARED ${PCRE2POSIX_HEADERS} ${PCRE2POSIX_SOURCES})
    target_include_directories(
      pcre2-posix-shared
      PUBLIC "$<BUILD_INTERFACE:${PROJECT_BINARY_DIR}/interface>" "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>"
    )
    set_target_properties(
      pcre2-posix-shared
      PROPERTIES
        COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=8
        MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_POSIX_MACHO_COMPATIBILITY_VERSION}"
        MACHO_CURRENT_VERSION "${LIBPCRE2_POSIX_MACHO_CURRENT_VERSION}"
        VERSION ${LIBPCRE2_POSIX_VERSION}
        SOVERSION ${LIBPCRE2_POSIX_SOVERSION}
        OUTPUT_NAME pcre2-posix
    )
    if(HAVE_VSCRIPT AND PCRE2_SYMVERS)
      target_link_options(pcre2-posix-shared PRIVATE -Wl,${VSCRIPT_FLAG},${PROJECT_BINARY_DIR}/src/libpcre2-posix.sym)
      set_target_properties(pcre2-posix-shared PROPERTIES LINK_DEPENDS ${PROJECT_BINARY_DIR}/src/libpcre2-posix.sym)
    endif()
    target_compile_definitions(pcre2-posix-shared PUBLIC PCRE2POSIX_SHARED)
    target_link_libraries(pcre2-posix-shared pcre2-8-shared)
    list(APPEND TARGETS pcre2-posix-shared)
    list(APPEND DLL_PDB_FILES $<TARGET_PDB_FILE_DIR:pcre2-posix-shared>/pcre2-posix.pdb)
    list(APPEND DLL_PDB_DEBUG_FILES $<TARGET_PDB_FILE_DIR:pcre2-posix-shared>/pcre2-posixd.pdb)

    if(MINGW)
      if(NON_STANDARD_LIB_PREFIX)
        set_target_properties(pcre2-8-shared pcre2-posix-shared PROPERTIES PREFIX "")
      endif()
      if(NON_STANDARD_LIB_SUFFIX)
        set_target_properties(pcre2-8-shared pcre2-posix-shared PROPERTIES SUFFIX "-0.dll")
      endif()
    endif()
  endif()

  # The unsuffixed names alias the static build when it exists, otherwise the
  # shared build.
  if(BUILD_STATIC_LIBS)
    add_library(pcre2-8 ALIAS pcre2-8-static)
    add_library(pcre2-posix ALIAS pcre2-posix-static)
  else()
    add_library(pcre2-8 ALIAS pcre2-8-shared)
    add_library(pcre2-posix ALIAS pcre2-posix-shared)
  endif()
endif()

# 16-bit library

if(PCRE2_BUILD_PCRE2_16)
  if(BUILD_STATIC_LIBS)
    add_library(pcre2-16-static STATIC ${PCRE2_HEADERS} ${PCRE2_SOURCES})
    target_include_directories(
      pcre2-16-static
      PUBLIC "$<BUILD_INTERFACE:${PROJECT_BINARY_DIR}/interface>" "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>"
    )
    # The Mach-O versions use the 16-bit library's own variables, matching
    # VERSION and SOVERSION below.
    set_target_properties(
      pcre2-16-static
      PROPERTIES
        COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=16
        MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_16_MACHO_COMPATIBILITY_VERSION}"
        MACHO_CURRENT_VERSION "${LIBPCRE2_16_MACHO_CURRENT_VERSION}"
        VERSION ${LIBPCRE2_16_VERSION}
        SOVERSION ${LIBPCRE2_16_SOVERSION}
    )
    target_compile_definitions(pcre2-16-static PUBLIC PCRE2_STATIC)
    if(REQUIRE_PTHREAD)
      target_link_libraries(pcre2-16-static Threads::Threads)
    endif()
    list(APPEND TARGETS pcre2-16-static)

    if(MSVC)
      set_target_properties(pcre2-16-static PROPERTIES OUTPUT_NAME pcre2-16-static)
    else()
      set_target_properties(pcre2-16-static PROPERTIES OUTPUT_NAME pcre2-16)
    endif()
    if(PCRE2_STATIC_PIC)
      set_target_properties(pcre2-16-static PROPERTIES POSITION_INDEPENDENT_CODE 1)
    endif()
  endif()

  if(BUILD_SHARED_LIBS)
    add_library(pcre2-16-shared SHARED ${PCRE2_HEADERS} ${PCRE2_SOURCES})
    target_include_directories(
      pcre2-16-shared
      PUBLIC "$<BUILD_INTERFACE:${PROJECT_BINARY_DIR}/interface>" "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>"
    )
    set_target_properties(
      pcre2-16-shared
      PROPERTIES
        COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=16
        MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_16_MACHO_COMPATIBILITY_VERSION}"
        MACHO_CURRENT_VERSION "${LIBPCRE2_16_MACHO_CURRENT_VERSION}"
        VERSION ${LIBPCRE2_16_VERSION}
        SOVERSION ${LIBPCRE2_16_SOVERSION}
        OUTPUT_NAME pcre2-16
    )
    if(REQUIRE_PTHREAD)
      target_link_libraries(pcre2-16-shared Threads::Threads)
    endif()
    if(HAVE_VSCRIPT AND PCRE2_SYMVERS)
      target_link_options(pcre2-16-shared PRIVATE -Wl,${VSCRIPT_FLAG},${PROJECT_BINARY_DIR}/src/libpcre2-16.sym)
      set_target_properties(pcre2-16-shared PROPERTIES LINK_DEPENDS ${PROJECT_BINARY_DIR}/src/libpcre2-16.sym)
    endif()
    list(APPEND TARGETS pcre2-16-shared)
    list(APPEND DLL_PDB_FILES $<TARGET_PDB_FILE_DIR:pcre2-16-shared>/pcre2-16.pdb)
    list(APPEND DLL_PDB_DEBUG_FILES $<TARGET_PDB_FILE_DIR:pcre2-16-shared>/pcre2-16d.pdb)

    if(MINGW)
      if(NON_STANDARD_LIB_PREFIX)
        set_target_properties(pcre2-16-shared PROPERTIES PREFIX "")
      endif()
      if(NON_STANDARD_LIB_SUFFIX)
        set_target_properties(pcre2-16-shared PROPERTIES SUFFIX "-0.dll")
      endif()
    endif()
  endif()

  if(BUILD_STATIC_LIBS)
    add_library(pcre2-16 ALIAS pcre2-16-static)
  else()
    add_library(pcre2-16 ALIAS pcre2-16-shared)
  endif()
endif()

# 32-bit library

if(PCRE2_BUILD_PCRE2_32)
  if(BUILD_STATIC_LIBS)
    add_library(pcre2-32-static STATIC ${PCRE2_HEADERS} ${PCRE2_SOURCES})
    target_include_directories(
      pcre2-32-static
      PUBLIC "$<BUILD_INTERFACE:${PROJECT_BINARY_DIR}/interface>" "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>"
    )
    set_target_properties(
      pcre2-32-static
      PROPERTIES
        COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=32
        MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_32_MACHO_COMPATIBILITY_VERSION}"
        MACHO_CURRENT_VERSION "${LIBPCRE2_32_MACHO_CURRENT_VERSION}"
        VERSION ${LIBPCRE2_32_VERSION}
        SOVERSION ${LIBPCRE2_32_SOVERSION}
    )
    target_compile_definitions(pcre2-32-static PUBLIC PCRE2_STATIC)
    if(REQUIRE_PTHREAD)
      target_link_libraries(pcre2-32-static Threads::Threads)
    endif()
    list(APPEND TARGETS pcre2-32-static)

    if(MSVC)
      set_target_properties(pcre2-32-static PROPERTIES OUTPUT_NAME pcre2-32-static)
    else()
      set_target_properties(pcre2-32-static PROPERTIES OUTPUT_NAME pcre2-32)
    endif()
    if(PCRE2_STATIC_PIC)
      set_target_properties(pcre2-32-static PROPERTIES POSITION_INDEPENDENT_CODE 1)
    endif()
  endif()

  if(BUILD_SHARED_LIBS)
    add_library(pcre2-32-shared SHARED ${PCRE2_HEADERS} ${PCRE2_SOURCES})
    target_include_directories(
      pcre2-32-shared
      PUBLIC "$<BUILD_INTERFACE:${PROJECT_BINARY_DIR}/interface>" "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>"
    )
    set_target_properties(
      pcre2-32-shared
      PROPERTIES
        COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=32
        MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_32_MACHO_COMPATIBILITY_VERSION}"
        MACHO_CURRENT_VERSION "${LIBPCRE2_32_MACHO_CURRENT_VERSION}"
        VERSION ${LIBPCRE2_32_VERSION}
        SOVERSION ${LIBPCRE2_32_SOVERSION}
        OUTPUT_NAME pcre2-32
    )
    if(REQUIRE_PTHREAD)
      target_link_libraries(pcre2-32-shared Threads::Threads)
    endif()
    if(HAVE_VSCRIPT AND PCRE2_SYMVERS)
      target_link_options(pcre2-32-shared PRIVATE -Wl,${VSCRIPT_FLAG},${PROJECT_BINARY_DIR}/src/libpcre2-32.sym)
      set_target_properties(pcre2-32-shared PROPERTIES LINK_DEPENDS ${PROJECT_BINARY_DIR}/src/libpcre2-32.sym)
    endif()
    list(APPEND TARGETS pcre2-32-shared)
    list(APPEND DLL_PDB_FILES $<TARGET_PDB_FILE_DIR:pcre2-32-shared>/pcre2-32.pdb)
    list(APPEND DLL_PDB_DEBUG_FILES $<TARGET_PDB_FILE_DIR:pcre2-32-shared>/pcre2-32d.pdb)

    if(MINGW)
      if(NON_STANDARD_LIB_PREFIX)
        set_target_properties(pcre2-32-shared PROPERTIES PREFIX "")
      endif()
      if(NON_STANDARD_LIB_SUFFIX)
        set_target_properties(pcre2-32-shared PROPERTIES SUFFIX "-0.dll")
      endif()
    endif()
  endif()

  if(BUILD_STATIC_LIBS)
    add_library(pcre2-32 ALIAS pcre2-32-static)
  else()
    add_library(pcre2-32 ALIAS pcre2-32-shared)
  endif()
endif()

# Generate pkg-config files

# These variables are substituted into the *.pc.in templates. The escaped
# \${prefix} / \${exec_prefix} stay literal so pkg-config expands them itself.
set(PACKAGE_VERSION "${PCRE2_MAJOR}.${PCRE2_MINOR}")
set(prefix ${CMAKE_INSTALL_PREFIX})

set(exec_prefix "\${prefix}")
set(libdir "\${exec_prefix}/${CMAKE_INSTALL_LIBDIR}")
set(includedir "\${prefix}/include")
set(LIB_POSTFIX "")
if(WIN32 AND (CMAKE_BUILD_TYPE MATCHES Debug))
  set(LIB_POSTFIX ${CMAKE_DEBUG_POSTFIX})
endif()

set(PCRE2_STATIC_CFLAG "")
if(NOT BUILD_SHARED_LIBS)
  set(PCRE2_STATIC_CFLAG "-DPCRE2_STATIC")
endif()

set(PCRE2POSIX_CFLAG "")
if(BUILD_SHARED_LIBS)
  set(PCRE2POSIX_CFLAG "-DPCRE2POSIX_SHARED")
endif()

# The thread flags are generator expressions; they are resolved when
# file(GENERATE) writes the final .pc files.
set(PTHREAD_CFLAGS "")
set(PTHREAD_LIBS "")
if(REQUIRE_PTHREAD)
  set(PTHREAD_CFLAGS "$<TARGET_PROPERTY:Threads::Threads,INTERFACE_COMPILE_OPTIONS>")
  set(PTHREAD_LIBS "$<TARGET_PROPERTY:Threads::Threads,INTERFACE_LINK_LIBRARIES>")
endif()

# Two-step generation: configure_file() fills in the @VAR@ placeholders, then
# file(GENERATE) evaluates any generator expressions left in the result.
if(PCRE2_BUILD_PCRE2_8)
  configure_file(libpcre2-posix.pc.in libpcre2-posix.pc.generator @ONLY)
  file(GENERATE OUTPUT libpcre2-posix.pc INPUT "${PROJECT_BINARY_DIR}/libpcre2-posix.pc.generator")
  list(APPEND pkg_config_files "${PROJECT_BINARY_DIR}/libpcre2-posix.pc")
  configure_file(libpcre2-8.pc.in libpcre2-8.pc.generator @ONLY)
  file(GENERATE OUTPUT libpcre2-8.pc INPUT "${PROJECT_BINARY_DIR}/libpcre2-8.pc.generator")
  list(APPEND pkg_config_files "${PROJECT_BINARY_DIR}/libpcre2-8.pc")
  set(enable_pcre2_8 "yes")
else()
  set(enable_pcre2_8 "no")
endif()
# pkg-config files for the 16-bit and 32-bit libraries. enable_pcre2_16 and
# enable_pcre2_32 are set to "yes"/"no" before pcre2-config.in is configured
# below.
if(PCRE2_BUILD_PCRE2_16)
  configure_file(libpcre2-16.pc.in libpcre2-16.pc.generator @ONLY)
  file(GENERATE OUTPUT libpcre2-16.pc INPUT "${PROJECT_BINARY_DIR}/libpcre2-16.pc.generator")
  list(APPEND pkg_config_files "${PROJECT_BINARY_DIR}/libpcre2-16.pc")
  set(enable_pcre2_16 "yes")
else()
  set(enable_pcre2_16 "no")
endif()

if(PCRE2_BUILD_PCRE2_32)
  configure_file(libpcre2-32.pc.in libpcre2-32.pc.generator @ONLY)
  file(GENERATE OUTPUT libpcre2-32.pc INPUT "${PROJECT_BINARY_DIR}/libpcre2-32.pc.generator")
  list(APPEND pkg_config_files "${PROJECT_BINARY_DIR}/libpcre2-32.pc")
  set(enable_pcre2_32 "yes")
else()
  set(enable_pcre2_32 "no")
endif()

# The pcre2-config script is always written with LF line endings.
configure_file(pcre2-config.in pcre2-config @ONLY NEWLINE_STYLE LF)

# Executables

if(PCRE2_BUILD_PCRE2GREP)
  add_executable(pcre2grep src/pcre2grep.c)
  set_property(TARGET pcre2grep PROPERTY COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=8)
  list(APPEND TARGETS pcre2grep)
  target_link_libraries(pcre2grep pcre2-posix ${PCRE2GREP_LIBS})
endif()

# Testing

if(PCRE2_BUILD_TESTS)
  enable_testing()

  set(PCRE2TEST_SOURCES src/pcre2test.c)
  set(PCRE2TEST_LINKER_FLAGS "")

  if(MSVC)
    # This is needed to avoid a stack overflow error in the standard tests. The
    # flag should be indicated with a forward-slash instead of a hyphen, but
    # then CMake treats it as a file path.
    set(PCRE2TEST_LINKER_FLAGS -STACK:2500000)
  endif()

  add_executable(pcre2test ${PCRE2TEST_SOURCES})
  list(APPEND TARGETS pcre2test)
  # pcre2test links against every library width that is being built.
  if(PCRE2_BUILD_PCRE2_8)
    list(APPEND PCRE2TEST_LIBS pcre2-posix pcre2-8)
  endif()
  if(PCRE2_BUILD_PCRE2_16)
    list(APPEND PCRE2TEST_LIBS pcre2-16)
  endif()
  if(PCRE2_BUILD_PCRE2_32)
    list(APPEND PCRE2TEST_LIBS pcre2-32)
  endif()
  target_link_libraries(pcre2test ${PCRE2TEST_LIBS} ${PCRE2TEST_LINKER_FLAGS})

  if(PCRE2_BUILD_PCRE2_8)
    add_executable(pcre2posix_test src/pcre2posix_test.c)
    target_link_libraries(pcre2posix_test pcre2-posix pcre2-8)
  endif()

  if(PCRE2_SUPPORT_JIT)
    add_executable(pcre2_jit_test src/pcre2_jit_test.c)
    set(PCRE2_JIT_TEST_LIBS)
    if(PCRE2_BUILD_PCRE2_8)
      list(APPEND PCRE2_JIT_TEST_LIBS pcre2-8)
    endif()
    if(PCRE2_BUILD_PCRE2_16)
      list(APPEND PCRE2_JIT_TEST_LIBS pcre2-16)
    endif()
    if(PCRE2_BUILD_PCRE2_32)
      list(APPEND PCRE2_JIT_TEST_LIBS pcre2-32)
    endif()
    target_link_libraries(pcre2_jit_test ${PCRE2_JIT_TEST_LIBS})
  endif()

  # =================================================
  # Write out a CTest configuration file
  #
  file(
    WRITE
    ${PROJECT_BINARY_DIR}/CTestCustom.ctest
    "# This is a generated file.
MESSAGE(\"When testing is complete, review test output in the \\\"${PROJECT_BINARY_DIR}/Testing/Temporary\\\" folder.\")
MESSAGE(\" \")
"
  )

  # Shell wrapper around RunTest. ${...} is expanded by CMake when the file is
  # written; $CMAKE_CONFIG_TYPE and $? are left for the shell at test time.
  file(
    WRITE
    ${PROJECT_BINARY_DIR}/pcre2_test.sh
    "#! /bin/sh
# This is a generated file.
srcdir=${PROJECT_SOURCE_DIR}
pcre2test=${PROJECT_BINARY_DIR}/pcre2test
test -z \"$CMAKE_CONFIG_TYPE\" || pcre2test=${PROJECT_BINARY_DIR}/$CMAKE_CONFIG_TYPE/pcre2test
. ${PROJECT_SOURCE_DIR}/RunTest
if test \"$?\" != \"0\"; then exit 1; fi
# End
"
  )

  if(UNIX)
    add_test(pcre2_test sh ${PROJECT_BINARY_DIR}/pcre2_test.sh)
  endif()

  if(PCRE2_BUILD_PCRE2GREP)
    # Shell wrapper around RunGrepTest; it sets both pcre2grep and pcre2test.
    file(
      WRITE
      ${PROJECT_BINARY_DIR}/pcre2_grep_test.sh
      "#! /bin/sh
# This is a generated file.
srcdir=${PROJECT_SOURCE_DIR}
pcre2grep=${PROJECT_BINARY_DIR}/pcre2grep
test -z \"$CMAKE_CONFIG_TYPE\" || pcre2grep=${PROJECT_BINARY_DIR}/$CMAKE_CONFIG_TYPE/pcre2grep
pcre2test=${PROJECT_BINARY_DIR}/pcre2test
test -z \"$CMAKE_CONFIG_TYPE\" || pcre2test=${PROJECT_BINARY_DIR}/$CMAKE_CONFIG_TYPE/pcre2test
. ${PROJECT_SOURCE_DIR}/RunGrepTest
if test \"$?\" != \"0\"; then exit 1; fi
# End
"
    )

    if(UNIX)
      add_test(pcre2_grep_test sh ${PROJECT_BINARY_DIR}/pcre2_grep_test.sh)
      if(PCRE2_EBCDIC)
        # The grep tests currently fail in EBCDIC mode because the test data
        # files are in ASCII. This could be fixed properly, but for now, we
        # have very few EBCDIC users and the pcre2grep utility is hardly even
        # part of the official project artifacts.
        set_property(TEST pcre2_grep_test PROPERTY WILL_FAIL TRUE)
      endif()
    endif()
  endif()

  if(WIN32)
    # Provide environment for executing the bat file version of RunTest
    file(TO_NATIVE_PATH ${PROJECT_SOURCE_DIR} winsrc)
    file(TO_NATIVE_PATH ${PROJECT_BINARY_DIR} winbin)

    file(
      WRITE
      ${PROJECT_BINARY_DIR}/pcre2_test.bat
      "\@REM This is a generated file.
\@echo off
setlocal
SET srcdir=\"${winsrc}\"
SET pcre2test=\"${winbin}\\pcre2test.exe\"
if not [%CMAKE_CONFIG_TYPE%]==[] SET pcre2test=\"${winbin}\\%CMAKE_CONFIG_TYPE%\\pcre2test.exe\"
call %srcdir%\\RunTest.bat
if errorlevel 1 exit /b 1
echo RunTest.bat tests successfully completed
"
    )

    # The test passes only if the final "successfully completed" line that the
    # batch file echoes appears in its output.
    add_test(NAME pcre2_test_bat COMMAND pcre2_test.bat)
    set_tests_properties(pcre2_test_bat PROPERTIES PASS_REGULAR_EXPRESSION "RunTest\\.bat tests successfully completed")

    if(PCRE2_BUILD_PCRE2GREP)
      file(
        WRITE
        ${PROJECT_BINARY_DIR}/pcre2_grep_test.bat
        "\@REM This is a generated file.
\@echo off
setlocal
SET srcdir=\"${winsrc}\"
SET pcre2test=\"${winbin}\\pcre2test.exe\"
if not [%CMAKE_CONFIG_TYPE%]==[] SET pcre2test=\"${winbin}\\%CMAKE_CONFIG_TYPE%\\pcre2test.exe\"
SET pcre2grep=\"${winbin}\\pcre2grep.exe\"
if not [%CMAKE_CONFIG_TYPE%]==[] SET pcre2grep=\"${winbin}\\%CMAKE_CONFIG_TYPE%\\pcre2grep.exe\"
call %srcdir%\\RunGrepTest.bat
if errorlevel 1 exit /b 1
echo RunGrepTest.bat tests successfully completed
"
      )

      add_test(NAME pcre2_grep_test_bat COMMAND pcre2_grep_test.bat)
      set_tests_properties(
        pcre2_grep_test_bat
        PROPERTIES PASS_REGULAR_EXPRESSION "RunGrepTest\\.bat tests successfully completed"
      )
    endif()

    # MSYS2 is set when the OSTYPE environment variable is "msys".
    if(DEFINED ENV{OSTYPE})
      if("$ENV{OSTYPE}" STREQUAL "msys")
        set(MSYS2 TRUE)
      endif()
    endif()

    if(MSYS2)
      # Both the sh and bat file versions of RunTest are run if make test is used
      # in msys
      add_test(pcre2_test_sh sh.exe ${PROJECT_BINARY_DIR}/pcre2_test.sh)
      if(PCRE2_BUILD_PCRE2GREP)
        add_test(pcre2_grep_test sh.exe ${PROJECT_BINARY_DIR}/pcre2_grep_test.sh)
      endif()
    endif()
  endif()

  # Changed to accommodate testing whichever location was just built

  if(PCRE2_SUPPORT_JIT)
    add_test(pcre2_jit_test pcre2_jit_test)
  endif()

  if(PCRE2_BUILD_PCRE2_8)
    add_test(pcre2posix_test pcre2posix_test)
  endif()
endif()

# Installation

set(CMAKE_INSTALL_ALWAYS 1)

# Install every collected target and record it in the pcre2-targets export
# set, which is then installed under the pcre2:: namespace.
install(
  TARGETS ${TARGETS}
  EXPORT pcre2-targets
  RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
  LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
  ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
)
install(EXPORT pcre2-targets DESTINATION ${PCRE2_INSTALL_CMAKEDIR} NAMESPACE pcre2::)
install(FILES ${pkg_config_files} DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)
install(
  FILES "${PROJECT_BINARY_DIR}/pcre2-config"
  DESTINATION ${CMAKE_INSTALL_BINDIR}
  # Set 0755 permissions
  PERMISSIONS OWNER_WRITE OWNER_READ OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE
)

install(FILES ${PCRE2_HEADERS} ${PCRE2POSIX_HEADERS} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})

# CMake config files.
# Generate pcre2-config.cmake and its version file in the build tree, then
# install both into PCRE2_INSTALL_CMAKEDIR.
set(PCRE2_CONFIG_IN ${PROJECT_SOURCE_DIR}/cmake/pcre2-config.cmake.in)
set(PCRE2_CONFIG_OUT ${PROJECT_BINARY_DIR}/cmake/pcre2-config.cmake)
configure_package_config_file(${PCRE2_CONFIG_IN} ${PCRE2_CONFIG_OUT} INSTALL_DESTINATION ${PCRE2_INSTALL_CMAKEDIR})
set(PCRE2_CONFIG_VERSION_OUT ${PROJECT_BINARY_DIR}/cmake/pcre2-config-version.cmake)
write_basic_package_version_file(
  ${PCRE2_CONFIG_VERSION_OUT}
  VERSION ${PCRE2_MAJOR}.${PCRE2_MINOR}.0
  COMPATIBILITY SameMajorVersion
)
install(FILES ${PCRE2_CONFIG_OUT} ${PCRE2_CONFIG_VERSION_OUT} DESTINATION ${PCRE2_INSTALL_CMAKEDIR})

# Documentation: HTML pages, plain-text files and man pages.
file(GLOB html ${PROJECT_SOURCE_DIR}/doc/html/*.html ${PROJECT_SOURCE_DIR}/doc/html/*.txt)
file(
  GLOB txts
  ${PROJECT_SOURCE_DIR}/doc/*.txt
  AUTHORS.md
  COPYING
  ChangeLog
  LICENCE.md
  NEWS
  README
  SECURITY.md
)
file(GLOB man1 ${PROJECT_SOURCE_DIR}/doc/*.1)
file(GLOB man3 ${PROJECT_SOURCE_DIR}/doc/*.3)

install(FILES ${man1} DESTINATION ${CMAKE_INSTALL_MANDIR}/man1)
install(FILES ${man3} DESTINATION ${CMAKE_INSTALL_MANDIR}/man3)
install(FILES ${txts} DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/doc/pcre2)
install(FILES ${html} DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/doc/pcre2/html)

# The .pdb lists are installed per configuration: release-with-debug-info
# files for RelWithDebInfo, and the "d"-suffixed ones for Debug.
if(MSVC AND INSTALL_MSVC_PDB)
  install(FILES ${DLL_PDB_FILES} DESTINATION ${CMAKE_INSTALL_BINDIR} CONFIGURATIONS RelWithDebInfo)
  install(FILES ${DLL_PDB_DEBUG_FILES} DESTINATION ${CMAKE_INSTALL_BINDIR} CONFIGURATIONS Debug)
endif()

# Help, only for nice output
if(BUILD_STATIC_LIBS)
  set(BUILD_STATIC_LIBS ON)
else()
  set(BUILD_STATIC_LIBS OFF)
endif()

if(PCRE2_HEAP_MATCH_RECURSE)
  message(WARNING "HEAP_MATCH_RECURSE is obsolete and does nothing.")
endif()

# Configuration summary, printed only when PCRE2_SHOW_REPORT is enabled.
if(PCRE2_SHOW_REPORT)
  message(STATUS "")
  message(STATUS "")
  message(STATUS "PCRE2-${PCRE2_MAJOR}.${PCRE2_MINOR} configuration summary:")
  message(STATUS "")
  message(STATUS " Install prefix .................... : ${CMAKE_INSTALL_PREFIX}")
  message(STATUS " C compiler ........................ : ${CMAKE_C_COMPILER}")

  # CFSP separates the common flags from the per-configuration flags, and is
  # empty when there are no common flags.
  set(CFSP "")
  if(CMAKE_C_FLAGS)
    set(CFSP " ")
  endif()

  if(CMAKE_CONFIGURATION_TYPES)
    # Multi-configuration generators: one flags line per configuration, padded
    # with dots so the colons line up with the other rows.
    foreach(config IN LISTS CMAKE_CONFIGURATION_TYPES)
      string(TOUPPER "${config}" buildtype)
      string(LENGTH " (${config})" buildtypelen)
      math(EXPR dotslen "18 - ${buildtypelen}")
      if(dotslen LESS 0)
        # A long configuration name leaves no room for padding, and
        # string(REPEAT) rejects a negative count, so clamp to zero.
        set(dotslen 0)
      endif()
      string(REPEAT "." ${dotslen} dots)
      message(STATUS " C compiler flags (${config}) ${dots} : ${CMAKE_C_FLAGS}${CFSP}${CMAKE_C_FLAGS_${buildtype}}")
    endforeach()
  else()
    string(TOUPPER "${CMAKE_BUILD_TYPE}" buildtype)
    if(buildtype STREQUAL "")
      set(CFBLD "")
    else()
      set(CFBLD "${CMAKE_C_FLAGS_${buildtype}}")
    endif()
    message(STATUS " C compiler flags .................. : ${CMAKE_C_FLAGS}${CFSP}${CFBLD}")
  endif()

  message(STATUS "")
  if(CMAKE_CONFIGURATION_TYPES)
    message(STATUS " Build configurations .............. : ${CMAKE_CONFIGURATION_TYPES}")
  else()
    message(STATUS " Build type ........................ : ${CMAKE_BUILD_TYPE}")
  endif()
  message(STATUS " Build 8 bit pcre2 library ......... : ${PCRE2_BUILD_PCRE2_8}")
  message(STATUS " Build 16 bit pcre2 library ........ : ${PCRE2_BUILD_PCRE2_16}")
  message(STATUS " Build 32 bit pcre2 library ........ : ${PCRE2_BUILD_PCRE2_32}")
  message(STATUS " Include debugging code ............ : ${PCRE2_DEBUG}")
  message(STATUS " Enable JIT compiling support ...... : ${PCRE2_SUPPORT_JIT}")
  message(STATUS " Use SELinux allocator in JIT ...... : ${PCRE2_SUPPORT_JIT_SEALLOC}")
  message(STATUS " Enable Unicode support ............ : ${PCRE2_SUPPORT_UNICODE}")
  message(STATUS " Newline char/sequence ............. : ${PCRE2_NEWLINE}")
  message(STATUS " \\R matches only ANYCRLF ........... : ${PCRE2_SUPPORT_BSR_ANYCRLF}")
  message(STATUS " \\C is disabled .................... : ${PCRE2_NEVER_BACKSLASH_C}")

  # The NL code is only meaningful when EBCDIC is enabled.
  if(NOT EBCDIC)
    set(EBCDIC_NL_CODE "n/a")
  elseif(EBCDIC_NL25)
    set(EBCDIC_NL_CODE "0x25")
  else()
    set(EBCDIC_NL_CODE "0x15")
  endif()

  message(STATUS " EBCDIC coding ..................... : ${EBCDIC}")
  message(STATUS " EBCDIC code for NL ................ : ${EBCDIC_NL_CODE}")
  message(STATUS " EBCDIC coding ignoring compiler ... : ${PCRE2_EBCDIC_IGNORING_COMPILER}")
  message(STATUS " Rebuild char tables ............... : ${REBUILD_CHARTABLES}")
  message(STATUS " Internal link size ................ : ${PCRE2_LINK_SIZE}")
  message(STATUS " Maximum variable lookbehind ....... : ${PCRE2_MAX_VARLOOKBEHIND}")
  message(STATUS " Nested parentheses limit .......... : ${PCRE2_PARENS_NEST_LIMIT}")
  message(STATUS " Heap limit ........................ : ${PCRE2_HEAP_LIMIT}")
  message(STATUS " Match limit ....................... : ${PCRE2_MATCH_LIMIT}")
  message(STATUS " Match depth limit ................. : ${PCRE2_MATCH_LIMIT_DEPTH}")
  message(STATUS " Build shared libs ................. : ${BUILD_SHARED_LIBS}")
  if(NOT HAVE_VSCRIPT)
    message(STATUS " with symbol versioning ........ : n/a")
  else()
    message(STATUS " with symbol versioning ........ : ${PCRE2_SYMVERS}")
  endif()
  message(STATUS " Build static libs ................. : ${BUILD_STATIC_LIBS}")
  message(STATUS " with PIC enabled .............. : ${PCRE2_STATIC_PIC}")
  message(STATUS " Build pcre2grep ................... : ${PCRE2_BUILD_PCRE2GREP}")
  message(STATUS " Enable JIT in pcre2grep ........... : ${PCRE2GREP_SUPPORT_JIT}")
  message(STATUS " Enable callouts in pcre2grep ...... : ${PCRE2GREP_SUPPORT_CALLOUT}")
  message(STATUS " Enable callout fork in pcre2grep .. : ${PCRE2GREP_SUPPORT_CALLOUT_FORK}")
  message(STATUS " Initial buffer size for pcre2grep . : ${PCRE2GREP_BUFSIZE}")
  message(STATUS " Maximum buffer size for pcre2grep . : ${PCRE2GREP_MAX_BUFSIZE}")
  message(STATUS " Build tests (implies pcre2test .... : ${PCRE2_BUILD_TESTS}")
  message(STATUS " and pcre2grep)")

  # For each optional library, report the option value if the library was
  # found, otherwise say that it was not found.
  if(ZLIB_FOUND)
    message(STATUS " Link pcre2grep with libz .......... : ${PCRE2_SUPPORT_LIBZ}")
  else()
    message(STATUS " Link pcre2grep with libz .......... : Library not found")
  endif()
  if(BZip2_FOUND)
    message(STATUS " Link pcre2grep with libbz2 ........ : ${PCRE2_SUPPORT_LIBBZ2}")
  else()
    message(STATUS " Link pcre2grep with libbz2 ........ : Library not found")
  endif()
  if(Editline_FOUND)
    message(STATUS " Link pcre2test with libeditline ... : ${PCRE2_SUPPORT_LIBEDIT}")
  else()
    message(STATUS " Link pcre2test with libeditline ... : Library not found")
  endif()
  if(Readline_FOUND)
    message(STATUS " Link pcre2test with libreadline ... : ${PCRE2_SUPPORT_LIBREADLINE}")
  else()
    message(STATUS " Link pcre2test with libreadline ... : Library not found")
  endif()
  message(STATUS " Enable Valgrind support ........... : ${PCRE2_SUPPORT_VALGRIND}")
  if(PCRE2_DISABLE_PERCENT_ZT)
    message(STATUS " Use %zu and %td ................... : OFF")
  else()
    message(STATUS " Use %zu and %td ................... : AUTO")
  endif()

  if(MINGW AND BUILD_SHARED_LIBS)
    message(STATUS " Non-standard dll names (prefix) ... : ${NON_STANDARD_LIB_PREFIX}")
    message(STATUS " Non-standard dll names (suffix) ... : ${NON_STANDARD_LIB_SUFFIX}")
  endif()

  if(MSVC)
    message(STATUS " Install MSVC .pdb files ........... : ${INSTALL_MSVC_PDB}")
  endif()

  message(STATUS "")
endif()

# end CMakeLists.txt
================================================ FILE: COPYING ================================================ PCRE2 LICENCE Please see the file LICENCE in the PCRE2 distribution for licensing details. End ================================================ FILE: ChangeLog ================================================ Change Log for PCRE2 -------------------- Before the move to GitHub, this was the only record of changes to PCRE2. Now there is also the log of commit messages. Internal changes which are not visible to clients of the library are mostly not listed here. Version 10.48 xx-xxx-2026 ------------------------- ... Version 10.47 21-October-2025 ----------------------------- 1. (#660, #655, #663) Expanded platforms tested by CI: FreeBSD, OpenBSD, Solaris, MSYS (Cygwin), S390x, PPC64le, ARMv7, AARCH64, RiscV. 2. (#655) Made build clean of `/W3` warnings on MSVC.
Further suppression of minor build warnings on other platforms (various commits). 3. (#651) Added `--enable-Werror` flag to `./configure` to treat warnings as errors (or `--enable-errwarn` for Solaris cc). Previously, you had to hackily override the CPPFLAGS when calling make since you can't pass `-Werror` as a CFLAG into `./configure` (it breaks compiler feature detection). 4. (#682) Added AM_MAINTAINER_MODE for Git tag releases. Users building with the `./configure` script can check out the release tag using Git, which assigns the current time as the modification time to each checked-out file. This caused Autoconf to attempt to regenerate the configure script. 5. (#692) Add LICENSE file for sljit to the tarball release, to clarify that the sljit code is governed separately by the 2-clause BSD license. 6. (#656, #695) Add full support for z/OS and native EBCDIC support. The z/OS support is tested nightly using the XLC and IBM-Clang compilers, with Autoconf and CMake. In addition, for test purposes, the EBCDIC support can now be enabled on any platform using the new Autoconf `--enable-ebcdic-ignoring-compiler` and CMake `-DPCRE2_EBCDIC_IGNORING_COMPILER` options. 7. (#700) Faster lookup of named capture groups during pattern compilation using a hash table. 8. (#697, #756, #778) Improvements to pcre2test to increase code coverage: -malloc argument; more detailed test assertions. 9. (#705, #710, #737, #738) Powerful new feature: pattern recursion of the form "(?1(GROUP_NAME_OR_NUM,...))" acts as a subroutine call which additionally returns the listed capturing groups to the calling context. 10. (#721) Add linker scripts to enable symbol versioning for the PCRE2 dynamic libraries. Downstream Linux distributions may make use of this, or disable it with the new Autoconf `--disable-symvers` and CMake `-DPCRE2_SYMVERS` options. Currently, Linux, Solaris, and FreeBSD (GNU ld, LLVM lld, and Solaris ld) are tested and supported. 11. 
(#733) New API function: pcre2_next_match(). This function makes it both simpler and safer for clients to iterate over all matches in a subject. The documentation in `pcre2api` also provides improved guidance in the section "Iterating over all matches". 12. (#739, #744, #753) Modernize the CMake build files, to use standard commands to export the PCRE2 targets. This makes use of the "$<BUILD_INTERFACE>" and "$<INSTALL_INTERFACE>" expressions alongside the built-in "install(EXPORT...)" command. This brings the CMake files in line with the patches used by vcpkg to distribute PCRE2. The minimum CMake version remains 3.15. 13. (#756) Improved error offsets and diagnostics for syntax errors during pattern compilation. 14. (#708, #729, #724, #731, #777) Various updates to Bazel and Zig build support. 15. (#775) Added PCRE2_CONFIG_EFFECTIVE_LINKSIZE option to pcre2_config(), to report the actual number of code units used in compiled patterns for recording string lengths and offsets. 16. (#801) Significant bugfix: Fix a crash in pcre2_callout_enumerate() which is easily reachable on any pattern that contains a Unicode character class. If your application uses this function, please read the details for this change and evaluate its severity for your application. 17. (#806, #807) Improved input validation for pcre2_substitute() used with PCRE2_SUBSTITUTE_MATCHED. 18. (#817) Add support for $+ replacement to pcre2_substitute(). 19. (#818) New SIMD code generation in the JIT for AArch64. Version 10.46 27-August-2025 ---------------------------- 1. (#771) (CVE-2025-58050) Security fix to prevent a read-past-the-end memory error, of arbitrary length. An attacker-controlled regex pattern is required, and it cannot be triggered by providing crafted subject (match) text. The (*ACCEPT) and (*scs:) pattern features must be used together. Release 10.44 and earlier are not affected.
This could have implications of denial-of-service or information disclosure, and could potentially be used to escalate other vulnerabilities in a system (such as information disclosure being used to escalate the severity of an unrelated bug in another system). Version 10.45 05-February-2025 ------------------------------ 1. (#418) Change 6 of 10.44 broke 32-bit tests because pcre2test's reporting of memory size was changed to the entire compiled data block, instead of just the pattern and tables data, so as to align with the new length restriction. Because the block's header contains pointers, this meant the pcre2test output was different in 32-bit mode. A patch by Carlo reverts to the previous state and makes sure that any limit set by pcre2_set_max_pattern_compiled_length() also avoids the internal struct overhead. 2. (#416, #622) Updates to build.zig. 3. (#427, et al.) Various fixes to pacify static analyzers. 4. (#428) Add --posix-pattern-file to pcre2grep to allow processing of empty patterns through the -f option, as well as patterns that end in space characters, for compatibility with other grep tools. 5. (4fa5b8bd) Fix a bug in the fuzz support quantifier-limiting code. It ignores strings of more than 5 digits because they are necessarily numbers greater than 65535, the largest legal quantifier. However, it wasn't ignoring non-significant leading zeros. 6. (6d82f0cd) The case-independent processing of the letter-matching Unicode properties Ll, Lt, and Lu have been changed to match Perl (which changed a while ago). When caseless matching is in force, all three of these properties are now treated as Lc (cased letter). 7. (#433) The pcre2_jit_compile() function was updated by the addition of a new option PCRE2_JIT_TEST_ALLOC which, if called with a NULL first argument, tests not only the availability of JIT, but also its ability to allocate executable memory. Update pcre2test to use this support to extend the -C option. 8. 
(75b1025a) The code for parsing Unicode property descriptions for \p and \P has been changed as follows: . White space etc. before ^ in a negated value such as \p{ ^L } was not being ignored. . The code wouldn't have worked if PCRE2 was compiled for UTF-8 support within an EBCDIC environment. Possibly nobody does this any more, but it should now work. . The documentation of the syntax of what can follow \p and \P has been updated. 9. (1c24ba01) There was an error in the table of lengths for parsed items for the OPTIONS item, but fortuitously it could never have actually bitten. While fixing this, some other code that could never be obeyed was discovered and removed. 10. (674b6640) Removed some incorrect optimization code from DFA matching that has been there since PCRE1, but has just been found to cause a no match return instead of a partial match in some cases. It involves partial matching when (*F) is present so is unlikely to have actually affected anyone. 11. (b0f4ac17) Tidy the wording and formatting of some pcre2test error messages concerned with bad modifiers. Also restrict single-letter modifier sequences to the first item in a modifier list, as documented and always intended. 12. (1415565c) An iterator at the end of many assertions can always be auto-possessified, but not at the end of variable-length lookbehinds. There was a bug in the code that checks for such a lookbehind; it was looking only at the first branch, which is wrong because some branches can be fixed length when others are not, for example (?<=AB|CD?). Now all branches are checked for variability. 13. (ead08288) Matching with pcre2_match() could give an incorrect result if a variable-length lookbehind was used as the condition in a conditional group. The condition could erroneously be treated as true if a branch matched but overran the current position. This bug was in the interpreter only; matching with JIT was correct. 14. (#443) Split out the sljit sub-project into a "Git submodule".
Git users must now run `git submodule init; git submodule update` after a Git checkout, or the build will fail due to missing files in deps/sljit. 15. (#441) Add a new error code (PCRE2_ERROR_JIT_UNSUPPORTED) which is yielded for unsupported jit features. 16. (#444) Fix bug in 'first code unit' and 'last code unit' optimization combined with lookahead assertions. 17. (#445, #447, #449, #451, #452, #459, #563) Add a new feature called scan substring. This feature is a new type of assertion which matches the content of a capturing block to a sub-pattern. 18. (#450) Improvements to 'first code unit' / 'starting code units' optimisation. 19. (#455) Many, many improvements to the JIT compiler. 20. Item 43 of 10.43 was incomplete because it addressed only \z and not \Z, which was still misbehaving when matching fragments inside invalid UTF strings. 21. (d29e7290) Octal escapes of the form \045 or \111 were not being recognized in substitution strings, and if encountered gave an error, though the \o{...} form was recognized. This bug is now fixed. 22. (#463, #487) Fix 1 byte out-of-bounds read when parsing malformed limits (e.g. LIMIT_HEAP). 23. Many improvements to test infrastructure. Many more platforms and configurations are now run in Continuous Integration, and all the platforms now run the full test suite, rather than a partial subset. 24. (#475) Implement title casing in substitution strings using Perl syntax. 25. (#478, #504) Disallow \x if not followed by { or a hex digit. 26. (#473) Implements Python-style backrefs in substitutions. 27. (#472) Fix error reporting for certain over-large octal escapes. 28. (#482) Fix parsing of named captures in replacement strings, allowing non-ASCII capture names to be used. 29. (#477, #474, #488, #494, #496, #506, #508, #511, #518, #524, #540) Many improvements to parsing and optimising of character classes. 30. (#483, #498) Add support for \g<NAME> and $<NAME> to replacement strings. 31.
(#470) Add option flags PCRE2_EXTRA_NO_BS0 and PCRE2_EXTRA_PYTHON_OCTAL. 32. (#471) Add new API function pcre2_set_optimize() for controlling which optimizations are enabled. 33. (#491) Adds $& $` $' and $_ to substitution replacements, as well as interpreting \b and \v as characters. 34. (#499) Add option PCRE2_EXTRA_NEVER_CALLOUT to disable callouts. 35. (#503, #513) Update Unicode support to UCD 16. 36. (#512, #618, #638) Add new function pcre2_set_substitute_case_callout() to allow clients to provide a custom callback with locale-aware case transformation. 37. (#516) Fix case-insensitive matching of backreferences when using the PCRE2_EXTRA_CASELESS_RESTRICT option. 38. (#519) In pcre2grep, add $& as an alias for $0 39. (c9bf8339, #534) Updated perltest.sh to enable locale setting. 40. (#521) Add support for Turkish I casefolding, using new options PCRE2_EXTRA_TURKISH_CASING, and added pre-pattern flags (*TURKISH_CASING) and (*CASELESS_RESTRICT). 41. (#523, #546, #547) Add support for UTS#18 compatible character classes, using the new option PCRE2_ALT_EXTENDED_CLASS. This adds '[' as a metacharacter within character classes and the operators '&&', '--' and '~~', allowing subtractions and intersections of character classes to be easily expressed. 42. (#553, #586, #596, #597) Add support for Perl-style extended character classes, using the syntax (?[...]). This also allows expressing subtractions and intersections of character classes, but using a different syntax to UTS#18. 43. (#554) Fixed a bug in JIT affecting greedy bounded repeats. The upper limit of repeats inside a repeated bracket might be incorrectly checked. 44. (#556) Fixed a bug in JIT affecting caseful matching of backreferences. When utf is disabled, and dupnames is enabled, caseless matching was used even if caseful matching was needed. 45. (f34fc0a3) Fixed a bug in pcre2grep reported by Alejandro Colomar (GitHub issue #577). 
In certain cases, when lines of above and below context were contiguous, a separator line was incorrectly being inserted. 46. (#594) Fix a small (one/two byte) out-of-bounds read on invalid UTF-8 input in pcre2grep. 47. (#370) Fix the INSTALL_MSVC_PDB CMake flag. 48. (#366) Install cmake files in prefix/lib/cmake/pcre2 rather than prefix/cmake. The new CMake flag PCRE2_INSTALL_CMAKEDIR allows customising this location. 49. (#624, #626, #628, #632, #639, #641) Reduce code size of generated JIT code for repeated character classes. 50. (#623) Update the Bazel build files. Version 10.44 07-June-2024 -------------------------- 1. If a pattern contained a variable-length lookbehind in which the first branch was not the one with the shortest minimum length, and the lookbehind contained a capturing group, and elsewhere in the pattern there was another lookbehind that referenced that group, the pattern was incorrectly compiled, leading to unpredictable results, including crashes in JIT compiling. An example pattern is: /(((?<=123?456456|ABC)))(?<=\2)/ 2. Further updates to the oss-fuzz support: (a) Limit quantifiers for groups and classes to be no more than 10. This avoids very long JIT compile times that happen in some cases when groups are replicated for quantification, and very long match times when classes contain a lot of non-ascii characters. (b) Added PCRE2_EXTENDED_MORE to the list of allowed options. (c) Arranged for text error messages to be shown in 16-bit and 32-bit modes. (d) Made the output in standalone mode more readable. (e) General code tidies. (f) Limit the size of compiled patterns to 10MB (see 6 below). (g) Do not run JIT on patterns whose compiled length is greater than 200K bytes because this takes a long time, causing oss-fuzz to time out. (h) Avoid compiling or matching twice with the same options (this could happen if the input didn't set any options). 3. 
Increase the maximum length of a name for a group from 32 to 128 because there is a user for whom 32 is too small. 4. Cause pcre2test to output a message when pcre2_jit_compile() gives an error return if either jitverify or info is specified. 5. Some auxiliary files for building under OpenVMS that were contributed by Alexey Chupahin have been installed. 6. Added pcre2_set_max_pattern_compiled_length() to limit the size of compiled patterns. 7. There was a bug in the implementation of \X caused by my (PH) misreading or misunderstanding one of the grapheme sequence breaking rules in Unicode Annex #29. A break should occur between two characters with the Extended Pictographic break property unless a zero-width joiner intervenes. PCRE2 was not insisting on the ZWJ, causing \X to match more than it should. See GitHub issue #410. 8. Avoid compilation issues with proprietary compilers in UNIX since 10.43. Version 10.43 16-February-2024 ------------------------------ 1. The test program added by change 2 of 10.42 didn't work when the default newline setting didn't include \n as a newline. One test needed (*LF) to ensure that it worked. 2. Added the new freestanding POSIX test program to the ManyConfigTests script in the maint directory (overlooked in 2 below). Also improved the selection facilities in that script, and added a test with JIT in a non-source directory, fixing an oversight that would have made such a test fail before. 3. Added pcre2_get_match_data_heapframes_size() and related pcre2test flags to allow for finer control of the heap used when pcre2_match() without JIT is used and the match_data might be reused. This began as PR #191, but has had further refinement and documentation edits. 4. Applied PR #181, which tidies some casts in pcre2_valid_utf.c. 5. Applied PR #184, which avoids overflow issues with the heap limit (introduced in 10.41/9). 6. Applied PR #192, which changes the timing units for pcre2test from milliseconds to microseconds. 
This is more useful for modern CPUs. 7. Applied PR #193, which makes the requirement for C99 explicit in configure.ac and CMakeLists.txt. 8. Fixed a bug in pcre2test when a ridiculously large string repeat required a stupid amount of memory. It now gives a clean realloc() failure error. 9. Updates to restrict the interaction between ASCII and non-ASCII characters for caseless matching and items like \d: (a) Added PCRE2_EXTRA_CASELESS_RESTRICT to lock out mixing of ASCII and non-ASCII when matching caselessly. This is also /r in pcre2test and (?r) within patterns. (b) Added PCRE2_EXTRA_ASCII_{BSD,BSS,BSW,POSIX} and corresponding (?aD) etc in patterns and /a in pcre2test. (c) Corresponding updates to pcre2test. 10. Unicode has been updated to 15.0.0. 11. The Python scripts and ucptest.c in maint have been updated: (a) a minor change needed for 9(a) above; (b) fix bugs in ucptest. 12. Integer overflow testing is now centralized in a new function. 13. Made PCRE2_UCP the default in UTF mode in pcre2grep, and added new options --case-restrict and --no-ucp. 14. In the debugging printint module (which is normally only linked into pcre2test), avoid the use of a variable called "not" because that's deprecated in C and forbidden in C++. Also rewrite some code to avoid a goto into a block that bypassed its initialization (though it didn't actually matter). 15. More minor code adjustments to avoid using reserved C++ words as variable names ("new" and "typename") and another jump that bypassed an (irrelevant) initialization. 16. Merged a pull request that removed pcre2_ucptables.c from the list of files to compile in NON-AUTOTOOLS-BUILD because it is #included in pcre2_tables.c. Also adjusted the BUILD.bazel and build.zig files, which had the same issue. At the same time, fixed a typo in the Bazel file. 17. Add PCRE2_EXTRA_ASCII_DIGIT to allow [:digit:] to be kept in sync with \d even in UCP mode. 18. Fix an invalid match of ascii word classes when invalid utf is enabled. 19.
Add a --posix-digit to pcre2grep for compatibility with GNU grep, and other tools that prefer the POSIX compatible unicode definition for \d. 20. Report the bit width of the library in use by pcre2test for usability. 21. A pathological pattern conversion test could result in a string longer than the available input buffer. Cause such a test to fail. 22. Add a check that forces a compiler error if PCRE2_CODE_UNIT_WIDTH is not 8, 16, or 32 when compiling any of the library modules. 23. Update pcre2_compile() to treat a NULL pattern with zero length as an empty string. 24. Add support for limited-length variable-length lookbehind assertions, with default maximum length 255 characters (same as Perl) but with a function to adjust the limit. 25. Applied pull request #262, which updates the zig configuration, and #278 which fixes a bug with out-of-source-tree CMake build testing. 26. Add support for LoongArch to JIT. 27. Fixed a bug in pcre2_match() in the code for handling the vector of backtracking frames on the heap, which caused a heap overflow if *LIMIT_HEAP restricted an attempt to extend to less than the frame size. Generally tidy up the code for extending the heap frames vector. This fixes GitHub issue #275. 28. Update pcre2_fuzzsupport.c to avoid clang sanitize complaint about shifting left by 16 when there are non-zeros in the top 16 bits. 29. Perl 5.34.0 changed the meaning of (for example) {,3} which did not used to be treated as a quantifier. Now it is interpreted as {0,3} and PCRE2 has changed to match. Note that {,} is still not a quantifier. 30. Perl allows spaces and/or horizontal tabs after { or before } in all items that use braces, and also before or after the comma in quantifiers. PCRE2 now does the same, except for \u{...}, which is recognized only when PCRE2_EXTRA_ALT_BSUX is set. This is an ECMAScript, non-Perl compatible, extension, so PCRE2 follows ECMAScript rather than Perl. 31. Applied pull request #300 by Carlo, which fixes #261.
The bug was that pcre2_match() was not fully resetting all captures that had been set within a (possibly recursive) subroutine call such as (?3). 32. Changed the meaning of \w (and its synonyms) in UCP mode to match Perl. It now matches characters whose general categories are L or N or whose particular categories are Mn (non-spacing mark) or Pc (connector punctuation). The latter includes underscore. 33. Changed the meaning of [:xdigit:] in UCP mode to match Perl. It now also matches the "fullwidth" versions of the hex digits. Just like it is done for [:digit:], PCRE2_EXTRA_ASCII_DIGIT can be used to keep this class ASCII only without affecting other POSIX classes. 34. GitHub PR305 fixes a potential integer overflow in pcre2_dfa_match(). 35. Updated handling of \b and \B in UCP mode to match the changes to \w in 32 above because \b and \B are defined in terms of \w. 36. Within a pattern (?aT) and (?-aT) set and reset the PCRE2_EXTRA_ASCII_DIGIT option, and (?aP) also sets (?aT) so that (?-aP) disables all ASCII restrictions on POSIX classes. 37. If PCRE2_FIRSTLINE was set on an anchored pattern, pcre2_match() and pcre2_dfa_match() misbehaved. PCRE2_FIRSTLINE is now ignored for anchored patterns. 38. Add a test for ridiculous ovector offset values to the substring extraction functions. 39. Make OP_REVERSE use IMM2_SIZE for its data instead of LINK_SIZE, for consistency with OP_VREVERSE. 40. In some legacy environments with a pre C99 snprintf, pcre2_regerror could return an incorrect value when the provided buffer was too small. 41. Applied pull request #342 which adds sanity checks for ctype functions and locks out any accidental sign-extension. 42. In the 32-bit library, in non-UTF mode, a quantifier that followed a literal character with a value greater than or equal to 0x80000000u caused undefined behaviour. 43. \z was misbehaving when matching fragments inside invalid UTF strings. 44. Implement --group-separator and --no-group-separator for pcre2grep. 45.
Fix \X matching in 32 bit mode without UTF in JIT. 46. Fix backref iterators when PCRE2_MATCH_UNSET_BACKREF is set in JIT. 47. Refactor the handling of whole-pattern recursion (?0) in pcre2_match() so that its end is handled similarly to other recursions. This has altered the behaviour of /|(?0)./endanchored which was previously not right. 48. Improved the test for looping recursion by checking the last referenced character as well as the current character. This allows some patterns that previously triggered the check to run to completion instead of giving the loop error. 49. In 32-bit mode, the compiler looped for the pattern /[\x{ffffffff}]/ when PCRE2_CASELESS and PCRE2_UCP (but not PCRE2_UTF) were set. Fixed by not trying to look for other cases for characters above the Unicode range. 50. In caseless 32-bit mode with UCP (but not UTF) set, the character 0xffffffff incorrectly matched any character that has more than one other case, in particular k and s. 51. Fix accept and endanchored interaction in JIT. 52. Fix backreferences with unset backref and non-greedy iterators in JIT. 53. Improve the logic that checks for a list of starting code units -- positive lookahead assertions are now ignored if the immediately following item is one that sets a mandatory starting character. For example, /a?(?=bc|)d/ used to set all of a, b, and d as possible starting code units; now it sets only a and d. 54. Fix incorrect class character matches in JIT. 55. In pcre2test, ensure pcre2_jit_match() is used when jitfast is used with substitution testing. 56. Insert omitted setting of subject length in match data at the end of pcre2_jit_match(). 57. Implemented PCRE2_DISABLE_RECURSELOOP_CHECK for pcre2_match() to enable some apparently looping recursions to run to completion and therefore match the JIT behaviour. With this set, real loops will eventually get caught by match or heap limits or run out of resource. 58. 
AC did a lot of work on pcre2_fuzzsupport.c to extend it to 16-bit and 32-bit libraries and to compare JIT and non-JIT matching. Version 10.42 11-December-2022 ------------------------------ 1. Change 19 of 10.41 wasn't quite right; it put the definition of a default, empty value for PCRE2_CALL_CONVENTION in src/pcre2posix.c instead of src/pcre2posix.h, which meant that programs that included pcre2posix.h but not pcre2.h failed to compile. 2. To catch similar issues to the above in future, a new small test program that includes pcre2posix.h but not pcre2.h has been added to the test suite. 3. When the -S option of pcre2test was used to set a stack size greater than the allowed maximum, the error message displayed the hard limit incorrectly. This was pointed out on GitHub pull request #171, but the suggested patch didn't cope with all cases. Some further modification was required. 4. Supplying an ovector count of more than 65535 to pcre2_match_data_create() caused a crash because the field in the match data block is only 16 bits. A maximum of 65535 is now silently applied. 5. Merged @carenas patch #175 which fixes #86 - segfault on aarch64 (ARM), 6. The prototype for pcre2_substring_list_free() specified its argument as PCRE2_SPTR * which is a const data type, whereas the yield from pcre2_substring_list() is not const. This caused compiler warnings. I have changed the argument of pcre2_substring_list_free() to be PCRE2_UCHAR ** to remove this anomaly. This might cause new warnings in existing code where a cast has been used to avoid previous ones. Version 10.41 06-December-2022 ------------------------------ 1. Add fflush() before and after a fork callout in pcre2grep to get its output to be the same on all systems. (There were previously ordering differences in Alpine Linux). 2. Merged patch from @carenas (GitHub #110) for pthreads support in CMake. 3. SSF scorecards grumbled about possible overflow in an expression in pcre2test. 
It never would have overflowed in practice, but some casts have been added and at the same time there's been some tidying of fprints that output size_t values. 4. PR #94 showed up an unused enum in pcre2_convert.c, which is now removed. 5. Minor code re-arrangement to remove gcc warning about realloc() in pcre2test. 6. Change a number of int variables that hold buffer and line lengths in pcre2grep to PCRE2_SIZE (aka size_t). 7. Added an #ifdef to cut out a call to PRIV(jit_free) when JIT is not supported (even though that function would do nothing in that case) at the request of a user who doesn't even want to link with pcre_jit_compile.o. Also tidied up an untidy #ifdef arrangement in pcre2test. 8. Fixed an issue in the backtracking optimization of character repeats in JIT. Furthermore optimize star repetitions, not just plus repetitions. 9. Removed the use of an initial backtracking frames vector on the system stack in pcre2_match() so that it now always uses the heap. (In a multi-thread environment with very small stacks there had been an issue.) This also is tidier for JIT matching, which didn't need that vector. The heap vector is now remembered in the match data block and re-used if that block itself is re-used. It is freed with the match data block. 10. Adjusted the find_limits code in pcre2test to work with change 9 above. 11. Added find_limits_noheap to pcre2test, because the heap limits are now different in different environments and so cannot be included in the standard tests. 12. Created a test for pcre2_match() heap processing that is not part of the tests run by 'make check', but can be run manually. The current output is from a 64-bit system. 13. Implemented -Z aka --null in pcre2grep. 14. A minor change to pcre2test and the addition of several new pcre2grep tests have improved LCOV coverage statistics. At the same time, code in pcre2grep and elsewhere that can never be obeyed in normal testing has been excluded from coverage. 15.
Fixed a bug in pcre2grep that could cause an extra newline to be written after output generated by --output. 16. If a file has a .bz2 extension but is not in fact compressed, pcre2grep should process it as a plain text file. A bug stopped this happening; now fixed and added to the tests. 17. When pcre2grep was running not in UTF mode, if a string specified by --output or obtained from a callout in a pattern contained a character (byte) greater than 127, it was incorrectly output in UTF-8 format. 18. Added some casts after warnings from Clang sanitize. 19. Merged patch from cbouc (GitHub #139): 4 function prototypes were missing PCRE2_CALL_CONVENTION in src/pcre2posix.h. All function prototypes returning pointers had out of place PCRE2_CALL_CONVENTION in src/pcre2.h.*. These produced errors when building for Windows with #define PCRE2_CALL_CONVENTION __stdcall. 20. A negative repeat value in a pcre2test subject line was not being diagnosed, leading to infinite looping. 21. Updated RunGrepTest to discard the warning that Bash now gives when setting LC_CTYPE to a bad value (because older versions didn't). 22. Updated pcre2grep so that it behaves like GNU grep when matching more than one pattern and a later pattern matches at an earlier point in the subject when the matched substrings are being identified by colour or by offsets. 23. Updated the PrepareRelease script so that the man page that it makes for the pcre2demo demonstration program is more standard and does not cause errors when processed by lexgrog or mandb -c (GitHub issue #160). 24. The JIT compiler was updated. Version 10.40 15-April-2022 --------------------------- 1. Merged patch from @carenas (GitHub #35, 7db87842) to fix pcre2grep incorrect handling of multiple passes. 2. Merged patch from @carenas (GitHub #36, dae47509) to fix portability issue in pcre2grep with buffered fseek(stdin). 3. Merged patch from @carenas (GitHub #37, acc520924) to fix tests when -S is not supported. 4. 
Revert an unintended change in JIT repeat detection. 5. Merged patch from @carenas (GitHub #52, b037bfa1) to fix build on GNU Hurd. 6. Merged documentation and comments patches from @carenas (GitHub #47). 7. Merged patch from @carenas (GitHub #49) to remove obsolete JFriedl test code from pcre2grep. 8. Merged patch from @carenas (GitHub #48) to fix CMake install issue #46. 9. Merged patch from @carenas (GitHub #53) fixing NULL checks in matching and substituting. 10. Add null_subject and null_replacement modifiers to pcre2test. 11. Add check for NULL subject to POSIX regexec() function. 12. Add check for NULL replacement to pcre2_substitute(). 13. For the subject arguments of pcre2_match(), pcre2_dfa_match(), and pcre2_substitute(), and the replacement argument of the latter, if the pointer is NULL and the length is zero, treat as an empty string. Apparently a number of applications treat NULL/0 in this way. 14. Added support for Bidi_Class and a number of binary Unicode properties, including Bidi_Control. 15. Fix some minor issues raised by clang sanitize. 16. Very minor code speed up for maximizing character property matches. 17. A number of changes to script matching for \p and \P: (a) Script extensions for a character are now coded as a bitmap instead of a list of script numbers, which should be faster and does not need a loop. (b) Added the syntax \p{script:xxx} and \p{script_extensions:xxx} (synonyms sc and scx). (c) Changed \p{scriptname} from being the same as \p{sc:scriptname} to being the same as \p{scx:scriptname} because this change happened in Perl at release 5.26. (d) The standard Unicode 4-letter abbreviations for script names are now recognized. (e) In accordance with Unicode and Perl's "loose matching" rules, spaces, hyphens, and underscores are ignored in property names, which are then matched independent of case. 18. The Python scripts in the maint directory have been refactored. 
There are now three scripts that generate pcre2_ucd.c, pcre2_ucp.h, and pcre2_ucptables.c (which is #included by pcre2_tables.c). The data lists that used to be duplicated are now held in a single common Python module. 19. On CHERI, and thus Arm's Morello prototype, pointers are represented as hardware capabilities, which consist of both an integer address and additional metadata, meaning they are twice the size of the platform's size_t type, i.e. 16 bytes on a 64-bit system. The ovector member of heapframe happens to only be 8 byte aligned, and so computing frame_size ended up with a multiple of 8 but not 16. Whilst the first frame was always suitably aligned, this then misaligned the frame that follows, resulting in an alignment fault when storing a pointer to Fecode at the start of match. Patch to fix this issue by Jessica Clarke PR#72. 20. Added -LP and -LS listing options to pcre2test. 21. A user discovered that the library names in CMakeLists.txt for MSVC debugger (PDB) files were incorrect - perhaps never tried for PCRE2? 22. An item such as [Aa] is optimized into a caseless single character match. When this was quantified (e.g. [Aa]{2}) and was also the last literal item in a pattern, the optimizing "must be present for a match" character check was not being flagged as caseless, causing some matches that should have succeeded to fail. 23. Fixed a unicode property matching issue in JIT. The character was not fully read in caseless matching. 24. Fixed an issue affecting recursions in JIT caused by duplicated data transfers. 25. Merged patch from @carenas (GitHub #96) which fixes some problems with pcre2test and readline/readedit: * Use the right header for libedit in FreeBSD with autoconf * Really allow libedit with cmake * Avoid using readline headers with libedit Version 10.39 29-October-2021 ----------------------------- 1. Fix incorrect detection of alternatives in first character search in JIT. 2. 
Merged patch from @carenas (GitHub #28): Visual Studio 2013 includes support for %zu and %td, so let newer versions of it avoid the fallback, and while at it, make sure that the first check is for DISABLE_PERCENT_ZT so it will be always honoured if chosen. ptrdiff_t is signed, so use a signed type instead, and make sure that an appropriate width is chosen if pointers are 64bit wide and long is not (ex: Windows 64bit). IMHO removing the cast (and therefore the possibility of truncation) make the code cleaner and the fallback is likely portable enough with all 64-bit POSIX systems doing LP64 except for Windows. 3. Merged patch from @carenas (GitHub #29) to update to Unicode 14.0.0. 4. Merged patch from @carenas (GitHub #30): * Cleanup: remove references to no longer used stdint.h Since 19c50b9d (Unconditionally use inttypes.h instead of trying for stdint.h (simplification) and remove the now unnecessary inclusion in pcre2_internal.h., 2018-11-14), stdint.h is no longer used. Remove checks for it in autotools and CMake and document better the expected build failures for systems that might have stdint.h (C99) and not inttypes.h (from POSIX), like old Windows. * Cleanup: remove detection for inttypes.h which is a hard dependency CMake checks for standard headers are not meant to be used for hard dependencies, so will prevent a possible fallback to work. Alternatively, the header could be checked to make the configuration fail instead of breaking the build, but that was punted, as it was missing anyway from autotools. 5. Merged patch from @carenas (GitHub #32): * jit: allow building with ancient MSVC versions Visual Studio older than 2013 fails to build with JIT enabled, because it is unable to parse non C89 compatible syntax, with mixed declarations and code. While most recent compilers wouldn't even report this as a warning since it is valid C99, it could be also made visible by adding to gcc/clang the -Wdeclaration-after-statement flag at build time.
Move the code below the affected definitions. * pcre2grep: avoid mixing declarations with code Since d5a61ee8 (Patch to detect (and ignore) symlink loops in pcre2grep, 2021-08-28), code will fail to build in a strict C89 compiler. Reformat slightly to make it C89 compatible again. Version 10.38 01-October-2021 ----------------------------- 1. Fix invalid single character repetition issues in JIT when the repetition is inside a capturing bracket and the bracket is preceded by character literals. 2. Installed revised CMake configuration files provided by Jan-Willem Blokland. This extends the CMake build system to build both static and shared libraries in one go, builds the static library with PIC, and exposes PCRE2 libraries using the CMake config files. JWB provided these notes: - Introduced CMake variable BUILD_STATIC_LIBS to build the static library. - Make a small modification to config-cmake.h.in by removing the PCRE2_STATIC variable. Added PCRE2_STATIC variable to the static build using the target_compile_definitions() function. - Extended the CMake config files. - Introduced CMake variable PCRE2_USE_STATIC_LIBS to easily switch between the static and shared libraries. - Added the PCRE_STATIC variable to the target compile definitions for the import of the static library. Building static and shared libraries using MSVC results in a name clash of the libraries. Both static and shared library builds create, for example, the file pcre2-8.lib. Therefore, I decided to change the static library names by adding "-static". For example, pcre2-8.lib has become pcre2-8-static.lib. [Comment by PH: this is MSVC-specific. It doesn't happen on Linux.] 3. Increased the minimum release number for CMake to 3.0.0 because older than 2.8.12 is deprecated (it was set to 2.8.5) and causes warnings. Even 3.0.0 is quite old; it was released in 2014. 4. Implemented a modified version of Thomas Tempelmann's pcre2grep patch for detecting symlink loops. 
This is dependent on the availability of realpath(), which is now tested for in ./configure and CMakeLists.txt. 5. Implemented a modified version of Thomas Tempelmann's patch for faster case-independent "first code unit" searches for unanchored patterns in 8-bit mode in the interpreters. Instead of just remembering whether one case matched or not, it remembers the position of a previous match so as to avoid unnecessary repeated searching. 6. Perl now locks out \K in lookarounds, so PCRE2 now does the same by default. However, just in case anybody was relying on the old behaviour, there is an option called PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK that enables the old behaviour. An option has also been added to pcre2grep to enable this. 7. Re-enable a JIT optimization which was unintentionally disabled in 10.35. 8. There is a loop counter to catch excessively crazy patterns when checking the lengths of lookbehinds at compile time. This was incorrectly getting reset whenever a lookahead was processed, leading to some fuzzer-generated patterns taking a very long time to compile when (?|) was present in the pattern, because (?|) disables caching of group lengths. Version 10.37 26-May-2021 ------------------------- 1. Change RunGrepTest to use tr instead of sed when testing with binary zero bytes, because sed varies a lot from system to system and has problems with binary zeros. This is from Bugzilla #2681. Patch from Jeremie Courreges-Anglas via Nam Nguyen. This fixes RunGrepTest for OpenBSD. Later: it broke it for at least one version of Solaris, where tr can't handle binary zeros. However, that system had /usr/xpg4/bin/tr installed, which works OK, so RunGrepTest now checks for that command and uses it if found. 2. Compiling with gcc 10.2's -fanalyzer option showed up a hypothetical problem with a NULL dereference. I don't think this case could ever occur in practice, but I have put in a check in order to get rid of the compiler error. 3. 
An alternative patch for CMakeLists.txt because 10.36 #4 breaks CMake on Windows. Patch from email@cs-ware.de fixes bugzilla #2688. 4. Two bugs related to over-large numbers have been fixed so the behaviour is now the same as Perl. (a) A pattern such as /\214748364/ gave an overflow error instead of being treated as the octal number \214 followed by literal digits. (b) A sequence such as {65536 that has no terminating } so is not a quantifier was nevertheless complaining that a quantifier number was too big. 5. A run of autoconf suggested that configure.ac was out-of-date with respect to the latest autoconf. Running autoupdate made some valid changes, some valid suggestions, and also some invalid changes, which were fixed by hand. Autoconf now runs clean and the resulting "configure" seems to work, so I hope nothing is broken. Later: the requirement for autoconf 2.70 broke some automatic test robots. It doesn't seem to be necessary: trying a reduction to 2.60. 6. The pattern /a\K.(?0)*/ when matched against "abac" by the interpreter gave the answer "bac", whereas Perl and JIT both yield "c". This was because the effect of \K was not propagating back from the full pattern recursion. Other recursions such as /(a\K.(?1)*)/ did not have this problem. 7. Restore single character repetition optimization in JIT. Currently fewer character repetitions are optimized than in 10.34. 8. When the names of the functions in the POSIX wrapper were changed to pcre2_regcomp() etc. (see change 10.33 #4 below), functions with the original names were left in the library so that pre-compiled programs would still work. However, this has proved troublesome when programs link with several libraries, some of which use PCRE2 via the POSIX interface while others use a native POSIX library. For this reason, the POSIX function names are removed in this release. The macros in pcre2posix.h should ensure that re-compiling fixes any programs that haven't been compiled since before 10.33. 
Version 10.36 04-December-2020 ------------------------------ 1. Add CET_CFLAGS so that when Intel CET is enabled, pass -mshstk to compiler. This fixes https://bugs.exim.org/show_bug.cgi?id=2578. Patch for Makefile.am and configure.ac by H.J. Lu. Equivalent patch for CMakeLists.txt invented by PH. 2. Fix infinite loop when a single byte newline is searched in JIT when invalid utf8 mode is enabled. 3. Updated CMakeLists.txt with patch from Wolfgang Stöggl (Bugzilla #2584): - Include GNUInstallDirs and use ${CMAKE_INSTALL_LIBDIR} instead of hardcoded lib. This allows differentiation between lib and lib64. CMAKE_INSTALL_LIBDIR is used for installation of libraries and also for pkgconfig file generation. - Add the version of PCRE2 to the configuration summary like ./configure does. - Fix typo: MACTHED_STRING->MATCHED_STRING 4. Updated CMakeLists.txt with another patch from Wolfgang Stöggl (Bugzilla #2588): - Add escaped double quotes around include directory in CMakeLists.txt to allow spaces in directory names. - This fixes a cmake error, if the path of the pcre2 source contains a space. 5. Updated CMakeLists.txt with a patch from B. Scott Michel: CMake's documentation suggests using CHECK_SYMBOL_EXISTS over CHECK_FUNCTION_EXIST. Moreover, these functions come from specific header files, which need to be specified (and, thankfully, are the same on both the Linux and WinXX platforms.) 6. Added a (uint32_t) cast to prevent a compiler warning in pcre2_compile.c. 7. Applied a patch from Wolfgang Stöggl (Bugzilla #2600) to fix postfix for debug Windows builds using CMake. This also updated configure so that it generates *.pc files and pcre2-config with the same content, as in the past. 8. If a pattern ended with (?(VERSION=n.d where n is any number but d is just a single digit, the code unit beyond d was being read (i.e. there was a read buffer overflow). Fixes ClusterFuzz 23779. 9. 
After the rework in r1235, certain character ranges were incorrectly handled by an optimization in JIT. Furthermore a wrong offset was used to read a value from a buffer which could lead to memory overread. 10. Unnoticed for many years was the fact that delimiters other than / in the testinput1 and testinput4 files could cause incorrect behaviour when these files were processed by perltest.sh. There were several tests that used quotes as delimiters, and it was just luck that they didn't go wrong with perltest.sh. All the patterns in testinput1 and testinput4 now use / as their delimiter. This fixes Bugzilla #2641. 11. Perl has started to give an error for \K within lookarounds (though there are cases where it doesn't). PCRE2 still allows this, so the tests that include this case have been moved from test 1 to test 2. 12. Further to 10 above, pcre2test has been updated to detect and grumble if a delimiter other than / is used after #perltest. 13. Fixed a bug with PCRE2_MATCH_INVALID_UTF in 8-bit mode when PCRE2_CASELESS was set and PCRE2_NO_START_OPTIMIZE was not set. The optimization for finding the start of a match was not resetting correctly after a failed match on the first valid fragment of the subject, possibly causing incorrect "no match" returns on subsequent fragments. For example, the pattern /A/ failed to match the subject \xe5A. Fixes Bugzilla #2642. 14. Fixed a bug in character set matching when JIT is enabled and both unicode scripts and unicode classes are present at the same time. 15. Added GNU grep's -m (aka --max-count) option to pcre2grep. 16. Refactored substitution processing in pcre2grep strings, both for the -O option and when dealing with callouts. There is now a single function that handles $ expansion in all cases (instead of multiple copies of almost identical code). This means that the same escape sequences are available everywhere, which was not previously the case. 
At the same time, the escape sequences $x{...} and $o{...} have been introduced, to allow for characters whose code points are greater than 255 in Unicode mode. 17. Applied the patch from Bugzilla #2628 to RunGrepTest. This does an explicit test for a version of sed that can handle binary zero, instead of assuming that any Linux version will work. Later: replaced $(...) by `...` because not all shells recognize the former. 18. Fixed a word boundary check bug in JIT when partial matching is enabled. 19. Fix ARM64 compilation warning in JIT. Patch by Carlo. 20. A bug in the RunTest script meant that if the first part of test 2 failed, the failure was not reported. 21. Test 2 was failing when run from a directory other than the source directory. This failure was previously missed in RunTest because of 20 above. Fixes added to both RunTest and RunTest.bat. 22. Patch to CMakeLists.txt from Daniel to fix problem with testing under Windows. Version 10.35 09-May-2020 --------------------------- 1. Use PCRE2_MATCH_EMPTY flag to detect empty matches in JIT. 2. Fix ARMv5 JIT improper handling of labels right after a constant pool. 3. A JIT bug is fixed which allowed the fields of the compiled pattern to be read before its existence is checked. 4. Back in the PCRE1 day, capturing groups that contained recursive back references to themselves were made atomic (version 8.01, change 18) because after the end of a repeated group, the captured substrings had their values from the final repetition, not from an earlier repetition that might be the destination of a backtrack. This feature was documented, and was carried over into PCRE2. However, it has now been realized that the major refactoring that was done for 10.30 has made this atomizing unnecessary, and it is confusing when users are unaware of it, making some patterns appear not to be working as expected.
Capture values of recursive back references in repeated groups are now correctly backtracked, so this unnecessary restriction has been removed. 5. Added PCRE2_SUBSTITUTE_LITERAL. 6. Avoid some VS compiler warnings. 7. Added PCRE2_SUBSTITUTE_MATCHED. 8. Added (?* and (?<* as synonyms for (*napla: and (*naplb: to match another regex engine. The Perl regex folks are aware of this usage and have made a note about it. 9. When an assertion is repeated, PCRE2 used to limit the maximum repetition to 1, believing that repeating an assertion is pointless. However, if a positive assertion contains capturing groups, repetition can be useful. In any case, an assertion could always be wrapped in a repeated group. The only restriction that is now imposed is that an unlimited maximum is changed to one more than the minimum. 10. Fix *THEN verbs in lookahead assertions in JIT. 11. Added PCRE2_SUBSTITUTE_REPLACEMENT_ONLY. 12. The JIT stack should be freed when the low-level stack allocation fails. 13. In pcre2grep, if the final line in a scanned file is output but does not end with a newline sequence, add a newline according to the --newline setting. 14. (?(DEFINE)...) groups were not being handled correctly when checking for the fixed length of a lookbehind assertion. Such a group within a lookbehind should be skipped, as it does not contribute to the length of the group. Instead, the (DEFINE) group was being processed, and if at the end of the lookbehind, that end was not correctly recognized. Errors such as "lookbehind assertion is not fixed length" and also "internal error: bad code value in parsed_skip()" could result. 15. Put a limit of 1000 on recursive calls in pcre2_study() when searching nested groups for starting code units, in order to avoid stack overflow issues. If the limit is reached, it just gives up trying for this optimization. 16. The control verb chain list must always be restored when exiting from a recurse function in JIT. 17. 
Fix a crash which occurs when the character type of an invalid UTF character is decoded in JIT. 18. Changes in many areas of the code so that when Unicode is supported and PCRE2_UCP is set without PCRE2_UTF, Unicode character properties are used for upper/lower case computations on characters whose code points are greater than 127. 19. The function for checking UTF-16 validity was returning an incorrect offset for the start of the error when a high surrogate was not followed by a valid low surrogate. This caused incorrect behaviour, for example when PCRE2_MATCH_INVALID_UTF was set and a match started immediately following the invalid high surrogate, such as /aa/ matching "\x{d800}aa". 20. If a DEFINE group immediately preceded a lookbehind assertion, the pattern could be mis-compiled and therefore not match correctly. This is the example that found this: /(?(DEFINE)(?bar))(? has been raised to 50, (b) the new --om-capture option changes the limit, (c) an error is raised if -o asks for a group that is above the limit. 12. The quantifier {1} was always being ignored, but this is incorrect when it is made possessive and applied to an item in parentheses, because a parenthesized item may contain multiple branches or other backtracking points, for example /(a|ab){1}+c/ or /(a+){1}+a/. 13. For partial matches, pcre2test was always showing the maximum lookbehind characters, flagged with "<", which is misleading when the lookbehind didn't actually look behind the start (because it was later in the pattern). Showing all consulted preceding characters for partial matches is now controlled by the existing "allusedtext" modifier and, as for complete matches, this facility is available only for non-JIT matching, because JIT does not maintain the first and last consulted characters. 14. DFA matching (using pcre2_dfa_match()) was not recognising a partial match if the end of the subject was encountered in a lookahead (conditional or otherwise), an atomic group, or a recursion. 
15. Give error if pcre2test -t, -T, -tm or -TM is given an argument of zero. 16. Check for integer overflow when computing lookbehind lengths. Fixes Clusterfuzz issue 15636. 17. Implemented non-atomic positive lookaround assertions. 18. If a lookbehind contained a lookahead that contained another lookbehind within it, the nested lookbehind was not correctly processed. For example, if /(?<=(?=(?<=a)))b/ was matched to "ab" it gave no match instead of matching "b". 19. Implemented pcre2_get_match_data_size(). 20. Two alterations to partial matching: (a) The definition of a partial match is slightly changed: if a pattern contains any lookbehinds, an empty partial match may be given, because this is another situation where adding characters to the current subject can lead to a full match. Example: /c*+(?<=[bc])/ with subject "ab". (b) Similarly, if a pattern could match an empty string, an empty partial match may be given. Example: /(?![ab]).*/ with subject "ab". This case applies only to PCRE2_PARTIAL_HARD. (c) An empty string partial hard match can be returned for \z and \Z as it is documented that they shouldn't match. 21. A branch that started with (*ACCEPT) was not being recognized as one that could match an empty string. 22. Corrected pcre2_set_character_tables() tables data type: was const unsigned char * instead of const uint8_t *, as generated by pcre2_maketables(). 23. Upgraded to Unicode 12.1.0. 24. Add -jitfast command line option to pcre2test (to make all the jit options available directly). 25. Make pcre2test -C show if libreadline or libedit is supported. 26. If the length of one branch of a group exceeded 65535 (the maximum value that is remembered as a minimum length), the whole group's length was incorrectly recorded as 65535, leading to incorrect "no match" when start-up optimizations were in force. 27. 
The "rightmost consulted character" value was not always correct; in particular, if a pattern ended with a negative lookahead, characters that were inspected in that lookahead were not included. 28. Add the pcre2_maketables_free() function. 29. The start-up optimization that looks for a unique initial matching code unit in the interpretive engines uses memchr() in 8-bit mode. When the search is caseless, it was doing so inefficiently, which ended up slowing down the match drastically when the subject was very long. The revised code (a) remembers if one case is not found, so it never repeats the search for that case after a bumpalong and (b) when one case has been found, it searches only up to that position for an earlier occurrence of the other case. This fix applies to both interpretive pcre2_match() and to pcre2_dfa_match(). 30. While scanning to find the minimum length of a group, if any branch has minimum length zero, there is no need to scan any subsequent branches (a small compile-time performance improvement). 31. Installed a .gitignore file on a user's suggestion. When using the svn repository with git (through git svn) this helps keep it tidy. 32. Add underflow check in JIT which may occur when the value of subject string pointer is close to 0. 33. Arrange for classes such as [Aa] which contain just the two cases of the same character, to be treated as a single caseless character. This causes the first and required code unit optimizations to kick in where relevant. 34. Improve the bitmap of starting bytes for positive classes that include wide characters, but no property types, in UTF-8 mode. Previously, on encountering such a class, the bits for all bytes greater than \xc4 were set, thus specifying any character with codepoint >= 0x100. Now the only bits that are set are for the relevant bytes that start the wide characters. This can give a noticeable performance improvement. 35. 
If the bitmap of starting code units contains only 1 or 2 bits, replace it with a single starting code unit (1 bit) or a caseless single starting code unit if the two relevant characters are case-partners. This is particularly relevant to the 8-bit library, though it applies to all. It can give a performance boost for patterns such as [Ww]ord and (word|WORD). However, this optimization doesn't happen if there is a "required" code unit of the same value (because the search for a "required" code unit starts at the match start for non-unique first code unit patterns, but after a unique first code unit, and patterns such as a*a need the former action). 36. Small patch to pcre2posix.c to set the erroroffset field to -1 immediately after a successful compile, instead of at the start of matching to avoid a sanitizer complaint (regexec is supposed to be thread safe). 37. Add NEON vectorization to JIT to speed up matching of first character and pairs of characters on ARM64 CPUs. 38. If a non-ASCII character was the first in a starting assertion in a caseless match, the "first code unit" optimization did not get the casing right, and the assertion failed to match a character in the other case if it did not start with the same code unit. 39. Fixed the incorrect computation of jump sizes on x86 CPUs in JIT. A masking operation was incorrectly removed in r1136. Reported by Ralf Junker. Version 10.33 16-April-2019 --------------------------- 1. Added "allvector" to pcre2test to make it easy to check the part of the ovector that shouldn't be changed, in particular after substitute and failed or partial matches. 2. Fix subject buffer overread in JIT when UTF is disabled and \X or \R has a greater than 1 fixed quantifier. This issue was found by Yunho Kim. 3. Added support for callouts from pcre2_substitute(). After 10.33-RC1, but prior to release, fixed a bug that caused a crash if pcre2_substitute() was called with a NULL match context. 4. 
The POSIX functions are now all called pcre2_regcomp() etc., with wrapper functions that use the standard POSIX names. However, in pcre2posix.h the POSIX names are defined as macros. This should help avoid linking with the wrong library in some environments while still exporting the POSIX names for pre-existing programs that use them. (The Debian alternative names are also defined as macros, but not documented.) 5. Fix an xclass matching issue in JIT. 6. Implement PCRE2_EXTRA_ESCAPED_CR_IS_LF (see Bugzilla 2315). 7. Implement the Perl 5.28 experimental alphabetic names for atomic groups and lookaround assertions, for example, (*pla:...) and (*atomic:...). These are characterized by a lower case letter following (* and to simplify coding for this, the character tables created by pcre2_maketables() were updated to add a new "is lower case letter" bit. At the same time, the now unused "is hexadecimal digit" bit was removed. The default tables in src/pcre2_chartables.c.dist are updated. 8. Implement the new Perl "script run" features (*script_run:...) and (*atomic_script_run:...) aka (*sr:...) and (*asr:...). 9. Fixed two typos in change 22 for 10.21, which added special handling for ranges such as a-z in EBCDIC environments. The original code probably never worked, though there were no bug reports. 10. Implement PCRE2_COPY_MATCHED_SUBJECT for pcre2_match() (including JIT via pcre2_match()) and pcre2_dfa_match(), but *not* the pcre2_jit_match() fast path. Also, when a match fails, set the subject field in the match data to NULL for tidiness - none of the substring extractors should reference this after match failure. 11. If a pattern started with a subroutine call that had a quantifier with a minimum of zero, an incorrect "match must start with this character" could be recorded. Example: /(?&xxx)*ABC(?<xxx>XYZ)/ would (incorrectly) expect 'A' to be the first character of a match. 12.
The heap limit checking code in pcre2_dfa_match() could suffer from overflow if the heap limit was set very large. This could cause incorrect "heap limit exceeded" errors. 13. Add "kibibytes" to the heap limit output from pcre2test -C to make the units clear. 14. Add a call to pcre2_jit_free_unused_memory() in pcre2grep, for tidiness. 15. Updated the VMS-specific code in pcre2test on the advice of a VMS user. 16. Removed the unnecessary inclusion of stdint.h (or inttypes.h) from pcre2_internal.h as it is now included by pcre2.h. Also, change 17 for 10.32 below was unnecessarily complicated, as inttypes.h is a Standard C header, which is defined to be a superset of stdint.h. Instead of conditionally including stdint.h or inttypes.h, pcre2.h now unconditionally includes inttypes.h. This supports environments that do not have stdint.h but do have inttypes.h, which are known to exist. A note in the autotools documentation says (November 2018) that there are none known that are the other way round. 17. Added --disable-percent-zt to "configure" (and equivalent to CMake) to forcibly disable the use of %zu and %td in formatting strings because there is at least one version of VMS that claims to be C99 but does not support these modifiers. 18. Added --disable-pcre2grep-callout-fork, which restricts the callout support in pcre2grep to the inbuilt echo facility. This may be useful in environments that do not support fork(). 19. Fix two instances of <= 0 being applied to unsigned integers (the VMS compiler complains). 20. Added "fork" support for VMS to pcre2grep, for running an external program via a string callout. 21. Improve MAP_JIT flag usage on MacOS. Patch by Rich Siegel. 22. If a pattern started with (*MARK), (*COMMIT), (*PRUNE), (*SKIP), or (*THEN) followed by ^ it was not recognized as anchored. 23. The RunGrepTest script used to cut out the test of NUL characters for Solaris and MacOS as printf and sed can't handle them. It seems that the *BSD systems can't either. 
I've inverted the test so that only those OS that are known to work (currently only Linux) try to run this test. 24. Some tests in RunGrepTest appended to testtrygrep from two different file descriptors instead of redirecting stderr to stdout. This worked on Linux, but it was reported not to on other systems, causing the tests to fail. 25. In the RunTest script, make the test for stack setting use the same value for the stack as it needs for -bigstack. 26. Insert a cast in pcre2_dfa_match.c to suppress a compiler warning. 26. With PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL set, escape sequences such as \s which are valid in character classes, but not as the end of ranges, were being treated as literals. An example is [_-\s] (but not [\s-_] because that gave an error at the *start* of a range). Now an "invalid range" error is given independently of PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL. 27. Related to 26 above, PCRE2_BAD_ESCAPE_IS_LITERAL was affecting known escape sequences such as \eX when they appeared invalidly in a character class. Now the option applies only to unrecognized or malformed escape sequences. 28. Fix word boundary in JIT compiler. Patch by Mike Munday. 29. The pcre2_dfa_match() function was incorrectly handling conditional version tests such as (?(VERSION>=0)...) when the version test was true. Incorrect processing or a crash could result. 30. When PCRE2_UTF is set, allow non-ASCII letters and decimal digits in group names, as Perl does. There was a small bug in this new code, found by ClusterFuzz 12950, fixed before release. 31. Implemented PCRE2_EXTRA_ALT_BSUX to support ECMAScript 6's \u{hhh} construct. 32. Compile \p{Any} to be the same as . in DOTALL mode, so that it benefits from auto-anchoring if \p{Any}* starts a pattern. 33. Compile invalid UTF check in JIT test when only pcre32 is enabled. 34. 
For some time now, CMake has been warning about the setting of policy CMP0026 to "OLD" in CMakeLists.txt, and hinting that the feature might be removed in a future version. A request for CMake expertise on the list produced no result, so I have now hacked CMakeLists.txt along the lines of some changes I found on the Internet. The new code no longer needs the policy setting, and it appears to work fine on Linux. 35. Setting --enable-jit=auto for an out-of-tree build failed because the source directory wasn't in the search path for AC_TRY_COMPILE always. Patch from Ross Burton. 36. Disable SSE2 JIT optimizations in x86 CPUs when SSE2 is not available. Patch by Guillem Jover. 37. Changed expressions such as 1<<10 to 1u<<10 in many places because compiler warnings were reported. 38. Using the clang compiler with sanitizing options causes runtime complaints about truncation for statements such as x = ~x when x is an 8-bit value; it seems to compute ~x as a 32-bit value. Changing such statements to x = 255 ^ x gets rid of the warnings. There were also two missing casts in pcre2test. Version 10.32 10-September-2018 ------------------------------- 1. When matching using the REG_STARTEND feature of the POSIX API with a non-zero starting offset, unset capturing groups with lower numbers than a group that did capture something were not being correctly returned as "unset" (that is, with offset values of -1). 2. When matching using the POSIX API, pcre2test used to omit listing unset groups altogether. Now it shows those that come before any actual captures as "<unset>", as happens for non-POSIX matching. 3. Running "pcre2test -C" always stated "\R matches CR, LF, or CRLF only", whatever the build configuration was. It now correctly says "\R matches all Unicode newlines" in the default case when --enable-bsr-anycrlf has not been specified. Similarly, running "pcre2test -C bsr" never produced the result ANY. 4.
Matching the pattern /(*UTF)\C[^\v]+\x80/ against an 8-bit string containing multi-code-unit characters caused bad behaviour and possibly a crash. This issue was fixed for other kinds of repeat in release 10.20 by change 19, but repeating character classes were overlooked. 5. pcre2grep now supports the inclusion of binary zeros in patterns that are read from files via the -f option. 6. A small fix to pcre2grep to avoid compiler warnings for -Wformat-overflow=2. 7. Added --enable-jit=auto support to configure.ac. 8. Added some dummy variables to the heapframe structure in 16-bit and 32-bit modes for the benefit of m68k, where pointers can be 16-bit aligned. The dummies force 32-bit alignment and this ensures that the structure is a multiple of PCRE2_SIZE, a requirement that is tested at compile time. In other architectures, alignment requirements take care of this automatically. 9. When returning an error from pcre2_pattern_convert(), ensure the error offset is set zero for early errors. 10. A number of patches for Windows support from Daniel Richard G: (a) List of error numbers in Runtest.bat corrected (it was not the same as in Runtest). (b) pcre2grep snprintf() workaround as used elsewhere in the tree. (c) Support for non-C99 snprintf() that returns -1 in the overflow case. 11. Minor tidy of pcre2_dfa_match() code. 12. Refactored pcre2_dfa_match() so that the internal recursive calls no longer use the stack for local workspace and local ovectors. Instead, an initial block of stack is reserved, but if this is insufficient, heap memory is used. The heap limit parameter now applies to pcre2_dfa_match(). 13. If a "find limits" test of DFA matching in pcre2test resulted in too many matches for the ovector, no matches were displayed. 14. Removed an occurrence of ctrl/Z from test 6 because Windows treats it as EOF. The test looks to have come from a fuzzer. 15. 
If PCRE2 was built with a default match limit a lot greater than the default default of 10 000 000, some JIT tests of the match limit no longer failed. All such tests now set 10 000 000 as the upper limit. 16. Another Windows related patch for pcre2grep to ensure that WIN32 is undefined under Cygwin. 17. Test for the presence of stdint.h and inttypes.h in configure and CMake and include whichever exists (stdint preferred) instead of unconditionally including stdint. This makes life easier for old and non-standard systems. 18. Further changes to improve portability, especially to old and/or non-standard systems: (a) Put all printf arguments in RunGrepTest into single, not double, quotes, and use \0 not \x00 for binary zero. (b) Avoid the use of C++ (i.e. BCPL) // comments. (c) Parameterize the use of %zu in pcre2test to make it like %td. For both of these now, if using MSVC or a standard C before C99, %lu is used with a cast if necessary. 19. Applied a contributed patch to CMakeLists.txt to increase the stack size when linking pcre2test with MSVC. This gets rid of a stack overflow error in the standard set of tests. 20. Output a warning in pcre2test when ignoring the "altglobal" modifier when it is given with the "replace" modifier. 21. In both pcre2test and pcre2_substitute(), with global matching, a pattern that matched an empty string, but never at the starting match offset, was not handled in a Perl-compatible way. The pattern /(a(*:1))(?>b)(*SKIP:1)x|.*/ matched against "abc", where the *SKIP shouldn't find a MARK (because it is in an atomic group), but it did. 26. Upgraded the perltest.sh script: (1) #pattern lines can now be used to set a list of modifiers for all subsequent patterns - only those that the script recognizes are meaningful; (2) #subject lines can be used to set or unset a default "mark" modifier; (3) Unsupported #command lines give a warning when they are ignored; (4) Mark data is output only if the "mark" modifier is present. 27.
(*ACCEPT:ARG), (*FAIL:ARG), and (*COMMIT:ARG) are now supported. 28. A (*MARK) name was not being passed back for positive assertions that were terminated by (*ACCEPT). 29. Add support for \N{U+dddd}, but only in Unicode mode. 30. Add support for (?^) for unsetting all imnsx options. 31. The PCRE2_EXTENDED (/x) option only ever discarded space characters whose code point was less than 256 and that were recognized by the lookup table generated by pcre2_maketables(), which uses isspace() to identify white space. Now, when Unicode support is compiled, PCRE2_EXTENDED also discards U+0085, U+200E, U+200F, U+2028, and U+2029, which are additional characters defined by Unicode as "Pattern White Space". This makes PCRE2 compatible with Perl. 32. In certain circumstances, option settings within patterns were not being correctly processed. For example, the pattern /((?i)A)(?m)B/ incorrectly matched "ab". (The (?m) setting lost the fact that (?i) should be reset at the end of its group during the parse process, but without another setting such as (?m) the compile phase got it right.) This bug was introduced by the refactoring in release 10.23. 33. PCRE2 uses bcopy() if available when memmove() is not, and it used just to define memmove() as function call to bcopy(). This hasn't been tested for a long time because in pcre2test the result of memmove() was being used, whereas bcopy() doesn't return a result. This feature is now refactored always to call an emulation function when there is no memmove(). The emulation makes use of bcopy() when available. 34. When serializing a pattern, set the memctl, executable_jit, and tables fields (that is, all the fields that contain pointers) to zeros so that the result of serializing is always the same. These fields are re-set when the pattern is deserialized. 35. 
In a pattern such as /[^\x{100}-\x{ffff}]*[\x80-\xff]/ which has a repeated negative class with no characters less than 0x100 followed by a positive class with only characters less than 0x100, the first class was incorrectly being auto-possessified, causing incorrect match failures. 36. Removed the character type bit ctype_meta, which dates from PCRE1 and is not used in PCRE2. 37. Tidied up unnecessarily complicated macros used in the escapes table. 38. Since 10.21, the new testoutput8-16-4 file has accidentally been omitted from distribution tarballs, owing to a typo in Makefile.am which had testoutput8-16-3 twice. Now fixed. 39. If the only branch in a conditional subpattern was anchored, the whole subpattern was treated as anchored, when it should not have been, since the assumed empty second branch cannot be anchored. Demonstrated by test patterns such as /(?(1)^())b/ or /(?(?=^))b/. 40. A repeated conditional subpattern that could match an empty string was always assumed to be unanchored. Now it is checked just like any other repeated conditional subpattern, and can be found to be anchored if the minimum quantifier is one or more. I can't see much use for a repeated anchored pattern, but the behaviour is now consistent. 41. Minor addition to pcre2_jit_compile.c to avoid static analyzer complaint (for an event that could never occur but you had to have external information to know that). 42. If before the first match in a file that was being searched by pcre2grep there was a line that was sufficiently long to cause the input buffer to be expanded, the variable holding the location of the end of the previous match was being adjusted incorrectly, and could cause an overflow warning from a code sanitizer. However, as the value is used only to print pending "after" lines when the next match is reached (and there are no such lines in this case) this bug could do no damage. Version 10.31 12-February-2018 ------------------------------ 1. 
Fix typo (missing ]) in VMS code in pcre2test.c. 2. Replace the replicated code for matching extended Unicode grapheme sequences (which got a lot more complicated by change 10.30/49) by a single subroutine that is called by both pcre2_match() and pcre2_dfa_match(). 3. Add idempotent guard to pcre2_internal.h. 4. Add new pcre2_config() options: PCRE2_CONFIG_NEVER_BACKSLASH_C and PCRE2_CONFIG_COMPILED_WIDTHS. 5. Cut out \C tests in the JIT regression tests when NEVER_BACKSLASH_C is defined (e.g. by --enable-never-backslash-C). 6. Defined public names for all the pcre2_compile() error numbers, and used the public names in pcre2_convert.c. 7. Fixed a small memory leak in pcre2test (convert contexts). 8. Added two casts to compile.c and one to match.c to avoid compiler warnings. 9. Added code to pcre2grep when compiled under VMS to set the symbol PCRE2GREP_RC to the exit status, because VMS does not distinguish between exit(0) and exit(1). 10. Added the -LM (list modifiers) option to pcre2test. Also made -C complain about a bad option only if the following argument item does not start with a hyphen. 11. pcre2grep was truncating components of file names to 128 characters when processing files with the -r option, and also (some very odd code) truncating path names to 512 characters. There is now a check on the absolute length of full path file names, which may be up to 2047 characters long. 12. When an assertion contained (*ACCEPT) it caused all open capturing groups to be closed (as for a non-assertion ACCEPT), which was wrong and could lead to misbehaviour for subsequent references to groups that started outside the assertion. ACCEPT in an assertion now closes only those groups that were started within that assertion. Fixes oss-fuzz issues 3852 and 3891. 13. Multiline matching in pcre2grep was misbehaving if the pattern matched within a line, and then matched again at the end of the line and over into subsequent lines. 
Behaviour was different with and without colouring, and sometimes context lines were incorrectly printed and/or line endings were lost. All these issues should now be fixed. 14. If --line-buffered was specified for pcre2grep when input was from a compressed file (.gz or .bz2) a segfault occurred. (Line buffering should be ignored for compressed files.) 15. Although pcre2_jit_match checks whether the pattern is compiled in a given mode, it was also expected that at least one mode is available. This is fixed and pcre2_jit_match returns with PCRE2_ERROR_JIT_BADOPTION when the pattern is not optimized by JIT at all. 16. The line number and related variables such as match counts in pcre2grep were all int variables, causing overflow when files with more than 2147483647 lines were processed (assuming 32-bit ints). They have all been changed to unsigned long ints. 17. If a backreference with a minimum repeat count of zero was first in a pattern, apart from assertions, an incorrect first matching character could be recorded. For example, for the pattern /(?=(a))\1?b/, "b" was incorrectly set as the first character of a match. 18. Characters in a leading positive assertion are considered for recording a first character of a match when the rest of the pattern does not provide one. However, a character in a non-assertive group within a leading assertion such as in the pattern /(?=(a))\1?b/ caused this process to fail. This was an infelicity rather than an outright bug, because it did not affect the result of a match, just its speed. (In fact, in this case, the starting 'a' was subsequently picked up in the study.) 19. A minor tidy in pcre2_match(): making all PCRE2_ERROR_ returns use "return" instead of "RRETURN" saves unwinding the backtracks in these cases (only one didn't). 20. Allocate a single callout block on the stack at the start of pcre2_match() and set its never-changing fields once only. Do the same for pcre2_dfa_match(). 21. 
Save the extra compile options (set in the compile context) with the compiled pattern (they were not previously saved), add PCRE2_INFO_EXTRAOPTIONS to retrieve them, and update pcre2test to show them. 22. Added PCRE2_CALLOUT_STARTMATCH and PCRE2_CALLOUT_BACKTRACK bits to a new field callout_flags in callout blocks. The bits are set by pcre2_match(), but not by JIT or pcre2_dfa_match(). Their settings are shown in pcre2test callouts if the callout_extra subject modifier is set. These bits are provided to help with tracking how a backtracking match is proceeding. 23. Updated the pcre2demo.c demonstration program, which was missing the extra code for -g that handles the case when \K in an assertion causes the match to end at the original start point. Also arranged for it to detect when \K causes the end of a match to be before its start. 24. Similar to 23 above, strange things (including loops) could happen in pcre2grep when \K was used in an assertion when --colour was used or in multiline mode. The "end at original start point" bug is fixed, and if the end point is found to be before the start point, they are swapped. 25. When PCRE2_FIRSTLINE without PCRE2_NO_START_OPTIMIZE was used in non-JIT matching (both pcre2_match() and pcre2_dfa_match()) and the matched string started with the first code unit of a newline sequence, matching failed because it was not tried at the newline. 26. Code for giving up a non-partial match after failing to find a starting code unit anywhere in the subject was missing when searching for one of a number of code units (the bitmap case) in both pcre2_match() and pcre2_dfa_match(). This was a missing optimization rather than a bug. 27. Tidied up the ACROSSCHAR macro to be like FORWARDCHAR and BACKCHAR, using a pointer argument rather than a code unit value. This should not have affected the generated code. 28. The JIT compiler has been updated. 29. Avoid pointer overflow for unset captures in pcre2_substring_list_get(). 
This could not actually cause a crash because it was always used in a memcpy() call with zero length. 30. Some internal structures have a variable-length ovector[] as their last element. Their actual memory is obtained dynamically, giving an ovector of appropriate length. However, they are defined in the structure as ovector[NUMBER], where NUMBER is large so that array bound checkers don't grumble. The value of NUMBER was 10000, but a fuzzer exceeded 5000 capturing groups, making the ovector larger than this. The number has been increased to 131072, which allows for the maximum number of captures (65535) plus the overall match. This fixes oss-fuzz issue 5415. 31. Auto-possessification at the end of a capturing group was dependent on what follows the group (e.g. /(a+)b/ would auto-possessify the a+) but this caused incorrect behaviour when the group was called recursively from elsewhere in the pattern where something different might follow. This bug is an unforeseen consequence of change #1 for 10.30 - the implementation of backtracking into recursions. Iterators at the ends of capturing groups are no longer considered for auto-possessification if the pattern contains any recursions. Fixes Bugzilla #2232. Version 10.30 14-August-2017 ---------------------------- 1. The main interpreter, pcre2_match(), has been refactored into a new version that does not use recursive function calls (and therefore the stack) for remembering backtracking positions. This makes --disable-stack-for-recursion a NOOP. The new implementation allows backtracking into recursive group calls in patterns, making it more compatible with Perl, and also fixes some other hard-to-do issues such as #1887 in Bugzilla. The code is also cleaner because the old code had a number of fudges to try to reduce stack usage. It seems to run no slower than the old code.
A number of bugs in the refactored code were subsequently fixed during testing before release, but after the code was made available in the repository. These bugs were never in fully released code, but are noted here for the record. (a) If a pattern had fewer capturing parentheses than the ovector supplied in the match data block, a memory error (detectable by ASAN) occurred after a match, because the external block was being set from non-existent internal ovector fields. Fixes oss-fuzz issue 781. (b) A pattern with very many capturing parentheses (when the internal frame size was greater than the initial frame vector on the stack) caused a crash. A vector on the heap is now set up at the start of matching if the vector on the stack is not big enough to handle at least 10 frames. Fixes oss-fuzz issue 783. (c) Handling of (*VERB)s in recursions was wrong in some cases. (d) Captures in negative assertions that were used as conditions were not happening if the assertion matched via (*ACCEPT). (e) Mark values were not being passed out of recursions. (f) Refactor some code in do_callout() to avoid picky compiler warnings about negative indices. Fixes oss-fuzz issue 1454. (g) Similarly refactor the way the variable length ovector is addressed for similar reasons. Fixes oss-fuzz issue 1465. 2. Now that pcre2_match() no longer uses recursive function calls (see above), the "match limit recursion" value seems misnamed. It still exists, and limits the depth of tree that is searched. To avoid future confusion, it has been renamed as "depth limit" in all relevant places (--with-depth-limit, (*LIMIT_DEPTH), pcre2_set_depth_limit(), etc) but the old names are still available for backwards compatibility. 3. Hardened pcre2test so as to reduce the number of bugs reported by fuzzers: (a) Check for malloc failures when getting memory for the ovector (POSIX) or the match data block (non-POSIX). 4. 
In the 32-bit library in non-UTF mode, an attempt to find a Unicode property for a character with a code point greater than 0x10ffff (the Unicode maximum) caused a crash. 5. If a lookbehind assertion that contained a back reference to a group appearing later in the pattern was compiled with the PCRE2_ANCHORED option, undefined actions (often a segmentation fault) could occur, depending on what other options were set. An example assertion is (?" should be ">=" in opcode check in pcre2_auto_possess.c. (b) Added some casts to avoid "suspicious implicit sign extension". (c) Resource leaks in pcre2test in rare error cases. (d) Avoid warning for never-use case OP_TABLE_LENGTH which is just a fudge for checking at compile time that tables are the right size. (e) Add missing "fall through" comment. 29. Implemented PCRE2_EXTENDED_MORE and related /xx and (?xx) features. 30. Implement (?n: for PCRE2_NO_AUTO_CAPTURE, because Perl now has this. 31. If more than one of "push", "pushcopy", or "pushtablescopy" were set in pcre2test, a crash could occur. 32. Make -bigstack in RunTest allocate a 64MiB stack (instead of 16MiB) so that all the tests can run with clang's sanitizing options. 33. Implement extra compile options in the compile context and add the first one: PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES. 34. Implement newline type PCRE2_NEWLINE_NUL. 35. A lookbehind assertion that had a zero-length branch caused undefined behaviour when processed by pcre2_dfa_match(). This is oss-fuzz issue 1859. 36. The match limit value now also applies to pcre2_dfa_match() as there are patterns that can use up a lot of resources without necessarily recursing very deeply. (Compare item 10.23/36.) This should fix oss-fuzz #1761. 37. Implement PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL. 38. Fix returned offsets from regexec() when REG_STARTEND is used with a starting offset greater than zero. 39. Implement REG_PEND (GNU extension) for the POSIX wrapper. 40. 
Implement the subject_literal modifier in pcre2test, and allow jitstack on pattern lines. 41. Implement PCRE2_LITERAL and use it to support REG_NOSPEC. 42. Implement PCRE2_EXTRA_MATCH_LINE and PCRE2_EXTRA_MATCH_WORD for the benefit of pcre2grep. 43. Re-implement pcre2grep's -F, -w, and -x options using PCRE2_LITERAL, PCRE2_EXTRA_MATCH_WORD, and PCRE2_EXTRA_MATCH_LINE. This fixes two bugs: (a) The -F option did not work for fixed strings containing \E. (b) The -w option did not work for patterns with multiple branches. 44. Added configuration options for the SELinux compatible execmem allocator in JIT. 45. Increased the limit for searching for a "must be present" code unit in subjects from 1000 to 2000 for 8-bit searches, since they use memchr() and are much faster. 46. Arrange for anchored patterns to record and use "first code unit" data, because this can give a fast "no match" without searching for a "required code unit". Previously only non-anchored patterns did this. 47. Upgraded the Unicode tables from Unicode 8.0.0 to Unicode 10.0.0. 48. Add the callout_no_where modifier to pcre2test. 49. Update extended grapheme breaking rules to the latest set that are in Unicode Standard Annex #29. 50. Added experimental foreign pattern conversion facilities (pcre2_pattern_convert() and friends). 51. Change the macro FWRITE, used in pcre2grep, to FWRITE_IGNORE because FWRITE is defined in a system header in cygwin. Also modified some of the #ifdefs in pcre2grep related to Windows and Cygwin support. 52. Change 3(g) for 10.23 was a bit too zealous. If a hyphen that follows a character class is the last character in the class, Perl does not give a warning. PCRE2 now also treats this as a literal. 53. Related to 52, though PCRE2 was throwing an error for [[:digit:]-X] it was not doing so for [\d-X] (and similar escapes), as is documented. 54. Fixed a MIPS issue in the JIT compiler reported by Joshua Kinard. 55. 
Fixed a "maybe uninitialized" warning for class_uchardata in \p handling in pcre2_compile() which could never actually trigger (code should have been cut out when Unicode support is disabled). Version 10.23 14-February-2017 ------------------------------ 1. Extended pcre2test with the utf8_input modifier so that it is able to generate all possible 16-bit and 32-bit code unit values in non-UTF modes. 2. In any wide-character mode (8-bit UTF or any 16-bit or 32-bit mode), without PCRE2_UCP set, a negative character type such as \D in a positive class should cause all characters greater than 255 to match, whatever else is in the class. There was a bug that caused this not to happen if a Unicode property item was added to such a class, for example [\D\P{Nd}] or [\W\pL]. 3. There has been a major re-factoring of the pcre2_compile.c file. Most syntax checking is now done in the pre-pass that identifies capturing groups. This has reduced the amount of duplication and made the code tidier. While doing this, some minor bugs and Perl incompatibilities were fixed, including: (a) \Q\E in the middle of a quantifier such as A+\Q\E+ is now ignored instead of giving an invalid quantifier error. (b) {0} can now be used after a group in a lookbehind assertion; previously this caused an "assertion is not fixed length" error. (c) Perl always treats (?(DEFINE) as a "define" group, even if a group with the name "DEFINE" exists. PCRE2 now does likewise. (d) A recursion condition test such as (?(R2)...) must now refer to an existing subpattern. (e) A conditional recursion test such as (?(R)...) misbehaved if there was a group whose name began with "R". (f) When testing zero-terminated patterns under valgrind, the terminating zero is now marked "no access". This catches bugs that would otherwise show up only with non-zero-terminated patterns. (g) A hyphen appearing immediately after a POSIX character class (for example /[[:ascii:]-z]/) now generates an error. 
Perl does accept this as a literal, but gives a warning, so it seems best to fail it in PCRE. (h) An empty \Q\E sequence may appear after a callout that precedes an assertion condition (it is, of course, ignored). One effect of the refactoring is that some error numbers and messages have changed, and the pattern offset given for compiling errors is not always the right-most character that has been read. In particular, for a variable-length lookbehind assertion it now points to the start of the assertion. Another change is that when a callout appears before a group, the "length of next pattern item" that is passed now just gives the length of the opening parenthesis item, not the length of the whole group. A length of zero is now given only for a callout at the end of the pattern. Automatic callouts are no longer inserted before and after explicit callouts in the pattern. A number of bugs in the refactored code were subsequently fixed during testing before release, but after the code was made available in the repository. Many of the bugs were discovered by fuzzing testing. Several of them were related to the change from assuming a zero-terminated pattern (which previously had required non-zero terminated strings to be copied). These bugs were never in fully released code, but are noted here for the record. (a) An overall recursion such as (?0) inside a lookbehind assertion was not being diagnosed as an error. (b) In utf mode, the length of a *MARK (or other verb) name was being checked in characters instead of code units, which could lead to bad code being compiled, leading to unpredictable behaviour. (c) In extended /x mode, characters whose code was greater than 255 caused a lookup outside one of the global tables. A similar bug existed for wide characters in *VERB names. (d) The amount of memory needed for a compiled pattern was miscalculated if a lookbehind contained more than one toplevel branch and the first branch was of length zero. 
(e) In UTF-8 or UTF-16 modes with PCRE2_EXTENDED (/x) set and a non-zero- terminated pattern, if a # comment ran on to the end of the pattern, one or more code units past the end were being read. (f) An unterminated repeat at the end of a non-zero-terminated pattern (e.g. "{2,2") could cause reading beyond the pattern. (g) When reading a callout string, if the end delimiter was at the end of the pattern one further code unit was read. (h) An unterminated number after \g' could cause reading beyond the pattern. (i) An insufficient memory size was being computed for compiling with PCRE2_AUTO_CALLOUT. (j) A conditional group with an assertion condition used more memory than was allowed for it during parsing, so too many of them could therefore overrun a buffer. (k) If parsing a pattern exactly filled the buffer, the internal test for overrun did not check when the final META_END item was added. (l) If a lookbehind contained a subroutine call, and the called group contained an option setting such as (?s), and the PCRE2_ANCHORED option was set, unpredictable behaviour could occur. The underlying bug was incorrect code and insufficient checking while searching for the end of the called subroutine in the parsed pattern. (m) Quantifiers following (*VERB)s were not being diagnosed as errors. (n) The use of \Q...\E in a (*VERB) name when PCRE2_ALT_VERBNAMES and PCRE2_AUTO_CALLOUT were both specified caused undetermined behaviour. (o) If \Q was preceded by a quantified item, and the following \E was followed by '?' or '+', and there was at least one literal character between them, an internal error "unexpected repeat" occurred (example: /.+\QX\E+/). (p) A buffer overflow could occur while sorting the names in the group name list (depending on the order in which the names were seen). (q) A conditional group that started with a callout was not doing the right check for a following assertion, leading to compiling bad code. 
Example: /(?(C'XX))?!XX/ (r) If a character whose code point was greater than 0xffff appeared within a lookbehind that was within another lookbehind, the calculation of the lookbehind length went wrong and could provoke an internal error. (t) The sequence \E- or \Q\E- after a POSIX class in a character class caused an internal error. Now the hyphen is treated as a literal. 4. Back references are now permitted in lookbehind assertions when there are no duplicated group numbers (that is, (?| has not been used), and, if the reference is by name, there is only one group of that name. The referenced group must, of course be of fixed length. 5. pcre2test has been upgraded so that, when run under valgrind with valgrind support enabled, reading past the end of the pattern is detected, both when compiling and during callout processing. 6. \g{+} (e.g. \g{+2} ) is now supported. It is a "forward back reference" and can be useful in repetitions (compare \g{-} ). Perl does not recognize this syntax. 7. Automatic callouts are no longer generated before and after callouts in the pattern. 8. When pcre2test was outputting information from a callout, the caret indicator for the current position in the subject line was incorrect if it was after an escape sequence for a character whose code point was greater than \x{ff}. 9. Change 19 for 10.22 had a typo (PCRE_STATIC_RUNTIME should be PCRE2_STATIC_RUNTIME). Fix from David Gaussmann. 10. Added --max-buffer-size to pcre2grep, to allow for automatic buffer expansion when long lines are encountered. Original patch by Dmitry Cherniachenko. 11. If pcre2grep was compiled with JIT support, but the library was compiled without it (something that neither ./configure nor CMake allow, but it can be done by editing config.h), pcre2grep was giving a JIT error. Now it detects this situation and does not try to use JIT. 12. Added some "const" qualifiers to variables in pcre2grep. 13. 
Added Dmitry Cherniachenko's patch for colouring output in Windows (untested by me). Also, look for GREP_COLOUR or GREP_COLOR if the environment variables PCRE2GREP_COLOUR and PCRE2GREP_COLOR are not found. 14. Add the -t (grand total) option to pcre2grep. 15. A number of bugs have been mended relating to match start-up optimizations when the first thing in a pattern is a positive lookahead. These all applied only when PCRE2_NO_START_OPTIMIZE was *not* set: (a) A pattern such as (?=.*X)X$ was incorrectly optimized as if it needed both an initial 'X' and a following 'X'. (b) Some patterns starting with an assertion that started with .* were incorrectly optimized as having to match at the start of the subject or after a newline. There are cases where this is not true, for example, (?=.*[A-Z])(?=.{8,16})(?!.*[\s]) matches after the start in lines that start with spaces. Starting .* in an assertion is no longer taken as an indication of matching at the start (or after a newline). 16. The "offset" modifier in pcre2test was not being ignored (as documented) when the POSIX API was in use. 17. Added --enable-fuzz-support to "configure", causing a non-installed library containing a test function that can be called by fuzzers to be compiled. A non-installed binary to run the test function locally, called pcre2fuzzcheck, is also compiled. 18. A pattern with PCRE2_DOTALL (/s) set but not PCRE2_NO_DOTSTAR_ANCHOR, and which started with .* inside a positive lookahead was incorrectly being compiled as implicitly anchored. 19. Removed all instances of "register" declarations, as they are considered obsolete these days and in any case had become very haphazard. 20. Add strerror() to pcre2test for failed file opening. 21. Make pcre2test -C list valgrind support when it is enabled. 22. Add the use_length modifier to pcre2test. 23. Fix an off-by-one bug in pcre2test for the list of names for 'get' and 'copy' modifiers. 24.
Add PCRE2_CALL_CONVENTION into the prototype declarations in pcre2.h as it is apparently needed there as well as in the function definitions. (Why did nobody ask for this in PCRE1?) 25. Change the _PCRE2_H and _PCRE2_UCP_H guard macros in the header files to PCRE2_H_IDEMPOTENT_GUARD and PCRE2_UCP_H_IDEMPOTENT_GUARD to be more standard compliant and unique. 26. pcre2-config --libs-posix was listing -lpcre2posix instead of -lpcre2-posix. Also, the CMake build process was building the library with the wrong name. 27. In pcre2test, give some offset information for errors in hex patterns. This uses the C99 formatting sequence %td, except for MSVC which doesn't support it - %lu is used instead. 28. Implemented pcre2_code_copy_with_tables(), and added pushtablescopy to pcre2test for testing it. 29. Fix small memory leak in pcre2test. 30. Fix out-of-bounds read for partial matching of /./ against an empty string when the newline type is CRLF. 31. Fix a bug in pcre2test that caused a crash when a locale was set either in the current pattern or a previous one and a wide character was matched. 32. The appearance of \p, \P, or \X in a substitution string when PCRE2_SUBSTITUTE_EXTENDED was set caused a segmentation fault (NULL dereference). 33. If the starting offset was specified as greater than the subject length in a call to pcre2_substitute() an out-of-bounds memory reference could occur. 34. When PCRE2 was compiled to use the heap instead of the stack for recursive calls to match(), a repeated minimizing caseless back reference, or a maximizing one where the two cases had different numbers of code units, followed by a caseful back reference, could lose the caselessness of the first repeated back reference (example: /(Z)(a)\2{1,2}?(?-i)\1X/i should match ZaAAZX but didn't). 35. When a pattern is too complicated, PCRE2 gives up trying to find a minimum matching length and just records zero. Typically this happens when there are too many nested or recursive back references. 
If the limit was reached in certain recursive cases it failed to be triggered and an internal error could be the result. 36. The pcre2_dfa_match() function now takes note of the recursion limit for the internal recursive calls that are used for lookrounds and recursions within the pattern. 37. More refactoring has got rid of the internal could_be_empty_branch() function (around 400 lines of code, including comments) by keeping track of could-be-emptiness as the pattern is compiled instead of scanning compiled groups. (This would have been much harder before the refactoring of #3 above.) This lifts a restriction on the number of branches in a group (more than about 1100 would give "pattern is too complicated"). 38. Add the "-ac" command line option to pcre2test as a synonym for "-pattern auto_callout". 39. In a library with Unicode support, incorrect data was compiled for a pattern with PCRE2_UCP set without PCRE2_UTF if a class required all wide characters to match (for example, /[\s[:^ascii:]]/). 40. The callout_error modifier has been added to pcre2test to make it possible to return PCRE2_ERROR_CALLOUT from a callout. 41. A minor change to pcre2grep: colour reset is now "[0m" instead of "[00m". 42. The limit in the auto-possessification code that was intended to catch overly-complicated patterns and not spend too much time auto-possessifying was being reset too often, resulting in very long compile times for some patterns. Now such patterns are no longer completely auto-possessified. 43. Applied Jason Hood's revised patch for RunTest.bat. 44. Added a new Windows script RunGrepTest.bat, courtesy of Jason Hood. 45. Minor cosmetic fix to pcre2test: move a variable that is not used under Windows into the "not Windows" code. 46. 
Applied Jason Hood's patches to upgrade pcre2grep under Windows and tidy some of the code: * normalised the Windows condition by ensuring WIN32 is defined; * enables the callout feature under Windows; * adds globbing (Microsoft's implementation expands quoted args), using a tweaked opendirectory; * implements the is_*_tty functions for Windows; * --color=always will write the ANSI sequences to file; * add sequences 4 (underline works on Win10) and 5 (blink as bright background, relatively standard on DOS/Win); * remove the (char *) casts for the now-const strings; * remove GREP_COLOUR (grep's command line allowed the 'u', but not the environment), parsing GREP_COLORS instead; * uses the current colour if not set, rather than black; * add print_match for the undefined case; * fixes a typo. In addition, colour settings containing anything other than digits and semicolon are ignored, and the colour controls are no longer output for empty strings. 47. Detecting patterns that are too large inside the length-measuring loop saves processing ridiculously long patterns to their end. 48. Ignore PCRE2_CASELESS when processing \h, \H, \v, and \V in classes as it just wastes time. In the UTF case it can also produce redundant entries in XCLASS lists caused by characters with multiple other cases and pairs of characters in the same "not-x" sublists. 49. A pattern such as /(?=(a\K))/ can report the end of the match being before its start; pcre2test was not handling this correctly when using the POSIX interface (it was OK with the native interface). 50. In pcre2grep, ignore all JIT compile errors. This means that pcre2grep will continue to work, falling back to interpretation if anything goes wrong with JIT. 51. Applied patches from Christian Persch to configure.ac to make use of the AC_USE_SYSTEM_EXTENSIONS macro and to test for functions used by the JIT modules. 52. 
Minor fixes to pcre2grep from Jason Hood: * fixed some spacing; * Windows doesn't usually use single quotes, so I've added a define to use appropriate quotes [in an example]; * LC_ALL was displayed as "LCC_ALL"; * numbers 11, 12 & 13 should end in "th"; * use double quotes in usage message. 53. When autopossessifying, skip empty branches without recursion, to reduce stack usage for the benefit of clang with -fsanitize-address, which uses huge stack frames. Example pattern: /X?(R||){3335}/. Fixes oss-fuzz issue 553. 54. A pattern with very many explicit back references to a group that is a long way from the start of the pattern could take a long time to compile because searching for the referenced group in order to find the minimum length was being done repeatedly. Now up to 128 group minimum lengths are cached and the attempt to find a minimum length is abandoned if there is a back reference to a group whose number is greater than 128. (In that case, the pattern is so complicated that this optimization probably isn't worth it.) This fixes oss-fuzz issue 557. 55. Issue 32 for 10.22 below was not correctly fixed. If pcre2grep in multiline mode with --only-matching matched several lines, it restarted scanning at the next line instead of moving on to the end of the matched string, which can be several lines after the start. 56. Applied Jason Hood's new patch for RunGrepTest.bat that updates it in line with updates to the non-Windows version. Version 10.22 29-July-2016 -------------------------- 1. Applied Jason Hood's patches to RunTest.bat and testdata/wintestoutput3 to fix problems with running the tests under Windows. 2. Implemented a facility for quoting literal characters within hexadecimal patterns in pcre2test, to make it easier to create patterns with just a few non-printing characters. 3. Binary zeros are not supported in pcre2test input files. It now detects them and gives an error. 4. 
Updated the valgrind parameters in RunTest: (a) changed smc-check=all to smc-check=all-non-file; (b) changed obj:* in the suppression file to obj:??? so that it matches only unknown objects. 5. Updated the maintenance script maint/ManyConfigTests to make it easier to select individual groups of tests. 6. When the POSIX wrapper function regcomp() is called, the REG_NOSUB option used to set PCRE2_NO_AUTO_CAPTURE when calling pcre2_compile(). However, this disables the use of back references (and subroutine calls), which are supported by other implementations of regcomp() with REG_NOSUB. Therefore, REG_NOSUB no longer causes PCRE2_NO_AUTO_CAPTURE to be set, though it still ignores nmatch and pmatch when regexec() is called. 7. Because of 6 above, pcre2test has been modified with a new modifier called posix_nosub, to call regcomp() with REG_NOSUB. Previously the no_auto_capture modifier had this effect. That option is now ignored when the POSIX API is in use. 8. Minor tidies to the pcre2demo.c sample program, including more comments about its 8-bit-ness. 9. Detect unmatched closing parentheses and give the error in the pre-scan instead of later. Previously the pre-scan carried on and could give a misleading incorrect error message. For example, /(?J)(?'a'))(?'a')/ gave a message about invalid duplicate group names. 10. It has happened that pcre2test was accidentally linked with another POSIX regex library instead of libpcre2-posix. In this situation, a call to regcomp() (in the other library) may succeed, returning zero, but of course putting its own data into the regex_t block. In one example the re_pcre2_code field was left as NULL, which made pcre2test think it had not got a compiled POSIX regex, so it treated the next line as another pattern line, resulting in a confusing error message. A check has been added to pcre2test to see if the data returned from a successful call of regcomp() are valid for PCRE2's regcomp().
If they are not, an error message is output and the pcre2test run is abandoned. The message points out the possibility of a mis-linking. Hopefully this will avoid some head-scratching the next time this happens. 11. A pattern such as /(?<=((?C)0))/, which has a callout inside a lookbehind assertion, caused pcre2test to output a very large number of spaces when the callout was taken, making the program appear to loop. 12. A pattern that included (*ACCEPT) in the middle of a sufficiently deeply nested set of parentheses of sufficient size caused an overflow of the compiling workspace (which was diagnosed, but of course is not desirable). 13. Detect missing closing parentheses during the pre-pass for group identification. 14. Changed some integer variable types and put in a number of casts, following a report of compiler warnings from Visual Studio 2013 and a few tests with gcc's -Wconversion (which still throws up a lot). 15. Implemented pcre2_code_copy(), and added pushcopy and #popcopy to pcre2test for testing it. 16. Change 66 for 10.21 introduced the use of snprintf() in PCRE2's version of regerror(). When the error buffer is too small, my version of snprintf() puts a binary zero in the final byte. Bug #1801 seems to show that other versions do not do this, leading to bad output from pcre2test when it was checking for buffer overflow. It no longer assumes a binary zero at the end of a too-small regerror() buffer. 17. Fixed typo ("&&" for "&") in pcre2_study(). Fortunately, this could not actually affect anything, by sheer luck. 18. Two minor fixes for MSVC compilation: (a) removal of apparently incorrect "const" qualifiers in pcre2test and (b) defining snprintf as _snprintf for older MSVC compilers. This has been done both in src/pcre2_internal.h for most of the library, and also in src/pcre2posix.c, which no longer includes pcre2_internal.h (see 24 below). 19. Applied Chris Wilson's patch (Bugzilla #1681) to CMakeLists.txt for MSVC static compilation.
Subsequently applied Chris Wilson's second patch, putting the first patch under a new option instead of being unconditional when PCRE_STATIC is set. 20. Updated pcre2grep to set stdout as binary when run under Windows, so as not to convert \r\n at the ends of reflected lines into \r\r\n. This required ensuring that other output that is written to stdout (e.g. file names) uses the appropriate line terminator: \r\n for Windows, \n otherwise. 21. When a line is too long for pcre2grep's internal buffer, show the maximum length in the error message. 22. Added support for string callouts to pcre2grep (Zoltan's patch with PH additions). 23. RunTest.bat was missing a "set type" line for test 22. 24. The pcre2posix.c file was including pcre2_internal.h, and using some "private" knowledge of the data structures. This is unnecessary; the code has been re-factored and no longer includes pcre2_internal.h. 25. A race condition in JIT, reported by Mozilla, is fixed. 26. Minor code refactor to avoid "array subscript is below array bounds" compiler warning. 27. Minor code refactor to avoid "left shift of negative number" warning. 28. Add a bit more sanity checking to pcre2_serialize_decode() and document that it expects trusted data. 29. Fix typo in pcre2_jit_test.c. 30. Due to an oversight, pcre2grep was not making use of JIT when available. This is now fixed. 31. The RunGrepTest script is updated to use the valgrind suppressions file when testing with JIT under valgrind (compare 10.21/51 below). The suppressions file is updated so that it is now the same as for PCRE1: it suppresses the Memcheck warnings Addr16 and Cond in unknown objects (that is, JIT-compiled code). Also changed smc-check=all to smc-check=all-non-file as was done for RunTest (see 4 above). 32. Implemented the PCRE2_NO_JIT option for pcre2_match(). 33. Fix typo that gave a compiler error when JIT not supported. 34. Fix comment describing the returns from find_fixedlength(). 35.
Fix potential negative index in pcre2test. 36. Calls to pcre2_get_error_message() with error numbers that are never returned by PCRE2 functions were returning empty strings. Now the error code PCRE2_ERROR_BADDATA is returned. A facility has been added to pcre2test to show the texts for given error numbers (i.e. to call pcre2_get_error_message() and display what it returns) and a few representative error codes are now checked in RunTest. 37. Added "&& !defined(__INTEL_COMPILER)" to the test for __GNUC__ in pcre2_match.c, in anticipation that this is needed for the same reason it was recently added to pcrecpp.cc in PCRE1. 38. Using -o with -M in pcre2grep could cause unnecessary repeated output when the match extended over a line boundary, as it tried to find more matches "on the same line" - but it was already over the end. 39. Allow \C in lookbehinds and DFA matching in UTF-32 mode (by converting it to the same code as '.' when PCRE2_DOTALL is set). 40. Fix two clang compiler warnings in pcre2test when only one code unit width is supported. 41. Upgrade RunTest to automatically re-run test 2 with a large (64MiB) stack if it fails when running the interpreter with a 16MiB stack (and if changing the stack size via pcre2test is possible). This avoids having to manually set a large stack size when testing with clang. 42. Fix register overwrite in JIT when SSE2 acceleration is enabled. 43. Detect integer overflow in pcre2test pattern and data repetition counts. 44. In pcre2test, ignore "allcaptures" after DFA matching. 45. Fix unaligned accesses on x86. Patch by Marc Mutz. 46. Fix some more clang compiler warnings. Version 10.21 12-January-2016 ----------------------------- 1. Improve matching speed of patterns starting with + or * in JIT. 2. Use memchr() to find the first character in an unanchored match in 8-bit mode in the interpreter. This gives a significant speed improvement. 3. 
Removed a redundant copy of the opcode_possessify table in the pcre2_auto_possessify.c source. 4. Fix typos in dftables.c for z/OS. 5. Change 36 for 10.20 broke the handling of [[:>:]] and [[:<:]] in that processing them could involve a buffer overflow if the following character was an opening parenthesis. 6. Change 36 for 10.20 also introduced a bug in processing this pattern: /((?x)(*:0))#(?'/. Specifically: if a setting of (?x) was followed by a (*MARK) setting (which (*:0) is), then (?x) did not get unset at the end of its group during the scan for named groups, and hence the external # was incorrectly treated as a comment and the invalid (?' at the end of the pattern was not diagnosed. This caused a buffer overflow during the real compile. This bug was discovered by Karl Skomski with the LLVM fuzzer. 7. Moved the pcre2_find_bracket() function from src/pcre2_compile.c into its own source module to avoid a circular dependency between src/pcre2_compile.c and src/pcre2_study.c 8. A callout with a string argument containing an opening square bracket, for example /(?C$[$)(?<]/, was incorrectly processed and could provoke a buffer overflow. This bug was discovered by Karl Skomski with the LLVM fuzzer. 9. The handling of callouts during the pre-pass for named group identification has been tightened up. 10. The quantifier {1} can be ignored, whether greedy, non-greedy, or possessive. This is a very minor optimization. 11. A possessively repeated conditional group that could match an empty string, for example, /(?(R))*+/, was incorrectly compiled. 12. The Unicode tables have been updated to Unicode 8.0.0 (thanks to Christian Persch). 13. An empty comment (?#) in a pattern was incorrectly processed and could provoke a buffer overflow. This bug was discovered by Karl Skomski with the LLVM fuzzer. 14. Fix infinite recursion in the JIT compiler when certain patterns such as /(?:|a|){100}x/ are analysed. 15. 
Some patterns with character classes involving [: and \\ were incorrectly compiled and could cause reading from uninitialized memory or an incorrect error diagnosis. Examples are: /[[:\\](?<[::]/ and /[[:\\](?'abc')[a:]. The first of these bugs was discovered by Karl Skomski with the LLVM fuzzer. 16. Pathological patterns containing many nested occurrences of [: caused pcre2_compile() to run for a very long time. This bug was found by the LLVM fuzzer. 17. A missing closing parenthesis for a callout with a string argument was not being diagnosed, possibly leading to a buffer overflow. This bug was found by the LLVM fuzzer. 18. A conditional group with only one branch has an implicit empty alternative branch and must therefore be treated as potentially matching an empty string. 19. If (?R was followed by - or + incorrect behaviour happened instead of a diagnostic. This bug was discovered by Karl Skomski with the LLVM fuzzer. 20. Another bug that was introduced by change 36 for 10.20: conditional groups whose condition was an assertion preceded by an explicit callout with a string argument might be incorrectly processed, especially if the string contained \Q. This bug was discovered by Karl Skomski with the LLVM fuzzer. 21. Compiling PCRE2 with the sanitize options of clang showed up a number of very pedantic coding infelicities and a buffer overflow while checking a UTF-8 string if the final multi-byte UTF-8 character was truncated. 22. For Perl compatibility in EBCDIC environments, ranges such as a-z in a class, where both values are literal letters in the same case, omit the non-letter EBCDIC code points within the range. 23. Finding the minimum matching length of complex patterns with back references and/or recursions can take a long time. There is now a cut-off that gives up trying to find a minimum length when things get too complex. 24. 
An optimization has been added that speeds up finding the minimum matching length for patterns containing repeated capturing groups or recursions. 25. If a pattern contained a back reference to a group whose number was duplicated as a result of appearing in a (?|...) group, the computation of the minimum matching length gave a wrong result, which could cause incorrect "no match" errors. For such patterns, a minimum matching length cannot at present be computed. 26. Added a check for integer overflow in conditions (?() and (?(R). This omission was discovered by Karl Skomski with the LLVM fuzzer. 27. Fixed an issue when \p{Any} inside an xclass did not read the current character. 28. If pcre2grep was given the -q option with -c or -l, or when handling a binary file, it incorrectly wrote output to stdout. 29. The JIT compiler did not restore the control verb head in case of *THEN control verbs. This issue was found by Karl Skomski with a custom LLVM fuzzer. 30. The way recursive references such as (?3) are compiled has been re-written because the old way was the cause of many issues. Now, conversion of the group number into a pattern offset does not happen until the pattern has been completely compiled. This does mean that detection of all infinitely looping recursions is postponed till match time. In the past, some easy ones were detected at compile time. This re-writing was done in response to yet another bug found by the LLVM fuzzer. 31. A test for a back reference to a non-existent group was missing for items such as \987. This caused incorrect code to be compiled. This issue was found by Karl Skomski with a custom LLVM fuzzer. 32. Error messages for syntax errors following \g and \k were giving inaccurate offsets in the pattern. 33. Improve the performance of starting single character repetitions in JIT. 34. (*LIMIT_MATCH=) now gives an error instead of setting the value to 0. 35. 
Error messages for syntax errors in *LIMIT_MATCH and *LIMIT_RECURSION now give the right offset instead of zero. 36. The JIT compiler should not check repeats after a {0,1} repeat byte code. This issue was found by Karl Skomski with a custom LLVM fuzzer. 37. The JIT compiler should restore the control chain for empty possessive repeats. This issue was found by Karl Skomski with a custom LLVM fuzzer. 38. A bug which was introduced by the single character repetition optimization was fixed. 39. Match limit check added to recursion. This issue was found by Karl Skomski with a custom LLVM fuzzer. 40. Arrange for the UTF check in pcre2_match() and pcre2_dfa_match() to look only at the part of the subject that is relevant when the starting offset is non-zero. 41. Improve first character match in JIT with SSE2 on x86. 42. Fix two assertion fails in JIT. These issues were found by Karl Skomski with a custom LLVM fuzzer. 43. Correct the setting of CMAKE_C_FLAGS in CMakeLists.txt (patch from Roy Ivy III). 44. Fix bug in RunTest.bat for new test 14, and adjust the script for the added test (there are now 20 in total). 45. Fixed a corner case of range optimization in JIT. 46. Add the ${*MARK} facility to pcre2_substitute(). 47. Modifier lists in pcre2test were splitting at spaces without the required commas. 48. Implemented PCRE2_ALT_VERBNAMES. 49. Fixed two issues in JIT. These were found by Karl Skomski with a custom LLVM fuzzer. 50. The pcre2test program has been extended by adding the #newline_default command. This has made it possible to run the standard tests when PCRE2 is compiled with either CR or CRLF as the default newline convention. As part of this work, the new command was added to several test files and the testing scripts were modified. The pcre2grep tests can now also be run when there is no LF in the default newline convention. 51. 
The RunTest script has been modified so that, when JIT is used and valgrind is specified, a valgrind suppressions file is set up to ignore "Invalid read of size 16" errors because these are false positives when the hardware supports the SSE2 instruction set. 52. It is now possible to have comment lines amid the subject strings in pcre2test (and perltest.sh) input. 53. Implemented PCRE2_USE_OFFSET_LIMIT and pcre2_set_offset_limit(). 54. Add the null_context modifier to pcre2test so that calling pcre2_compile() and the matching functions with NULL contexts can be tested. 55. Implemented PCRE2_SUBSTITUTE_EXTENDED. 56. In a character class such as [\W\p{Any}] where both a negative-type escape ("not a word character") and a property escape were present, the property escape was being ignored. 57. Fixed integer overflow for patterns whose minimum matching length is very, very large. 58. Implemented --never-backslash-C. 59. Change 55 above introduced a bug by which certain patterns provoked the erroneous error "\ at end of pattern". 60. The special sequences [[:<:]] and [[:>:]] gave rise to incorrect compiling errors or other strange effects if compiled in UCP mode. Found with libFuzzer and AddressSanitizer. 61. Whitespace at the end of a pcre2test pattern line caused a spurious error message if there were only single-character modifiers. It should be ignored. 62. The use of PCRE2_NO_AUTO_CAPTURE could cause incorrect compilation results or segmentation errors for some patterns. Found with libFuzzer and AddressSanitizer. 63. Very long names in (*MARK) or (*THEN) etc. items could provoke a buffer overflow. 64. Improve error message for overly-complicated patterns. 65. Implemented an optional replication feature for patterns in pcre2test, to make it easier to test long repetitive patterns. The tests for 63 above are converted to use the new feature. 66. In the POSIX wrapper, if regerror() was given too small a buffer, it could misbehave. 67. 
In pcre2_substitute() in UTF mode, the UTF validity check on the replacement string was happening before the length setting when the replacement string was zero-terminated. 68. In pcre2_substitute() in UTF mode, PCRE2_NO_UTF_CHECK can be set for the second and subsequent calls to pcre2_match(). 69. There was no check for integer overflow for a replacement group number in pcre2_substitute(). An added check for a number greater than the largest group number in the pattern means this is not now needed. 70. The PCRE2-specific VERSION condition didn't work correctly if only one digit was given after the decimal point, or if more than two digits were given. It now works with one or two digits, and gives a compile time error if more are given. 71. In pcre2_substitute() there was the possibility of reading one code unit beyond the end of the replacement string. 72. The code for checking a subject's UTF-32 validity for a pattern with a lookbehind involved an out-of-bounds pointer, which could potentially cause trouble in some environments. 73. The maximum lookbehind length was incorrectly calculated for patterns such as /(?<=(a)(?-1))x/ which have a recursion within a backreference. 74. Give an error if a lookbehind assertion is longer than 65535 code units. 75. Give an error in pcre2_substitute() if a match ends before it starts (as a result of the use of \K). 76. Check the length of subpattern names and the names in (*MARK:xx) etc. dynamically to avoid the possibility of integer overflow. 77. Implement pcre2_set_max_pattern_length() so that programs can restrict the size of patterns that they are prepared to handle. 78. (*NO_AUTO_POSSESS) was not working. 79. Adding group information caching improves the speed of compiling when checking whether a group has a fixed length and/or could match an empty string, especially when recursion or subroutine calls are involved. 
However, this cannot be used when (?| is present in the pattern because the same number may be used for groups of different sizes. To catch runaway patterns in this situation, counts have been introduced to the functions that scan for empty branches or compute fixed lengths. 80. Allow for the possibility of the size of the nest_save structure not being a factor of the size of the compiling workspace (it currently is). 81. Check for integer overflow in minimum length calculation and cap it at 65535. 82. Small optimizations in code for finding the minimum matching length. 83. Lock out configuring for EBCDIC with non-8-bit libraries. 84. Test for error code <= 0 in regerror(). 85. Check for too many replacements (more than INT_MAX) in pcre2_substitute(). 86. Avoid the possibility of computing with an out-of-bounds pointer (though not dereferencing it) while handling lookbehind assertions. 87. Failure to get memory for the match data in regcomp() is now given as a regcomp() error instead of waiting for regexec() to pick it up. 88. In pcre2_substitute(), ensure that CRLF is not split when it is a valid newline sequence. 89. Paranoid check in regcomp() for bad error code from pcre2_compile(). 90. Run test 8 (internal offsets and code sizes) for link sizes 3 and 4 as well as for link size 2. 91. Document that JIT has a limit on pattern size, and give more information about JIT compile failures in pcre2test. 92. Implement PCRE2_INFO_HASBACKSLASHC. 93. Re-arrange valgrind support code in pcre2test to avoid spurious reports with JIT (possibly caused by SSE2?). 94. Support offset_limit in JIT. 95. A sequence such as [[:punct:]b] that is, a POSIX character class followed by a single ASCII character in a class item, was incorrectly compiled in UCP mode. The POSIX class got lost, but only if the single character followed it. 96. [:punct:] in UCP mode was matching some characters in the range 128-255 that should not have been matched. 97. 
If [:^ascii:] or [:^xdigit:] are present in a non-negated class, all characters with code points greater than 255 are in the class. When a Unicode property was also in the class (if PCRE2_UCP is set, escapes such as \w are turned into Unicode properties), wide characters were not correctly handled, and could fail to match. 98. In pcre2test, make the "startoffset" modifier a synonym of "offset", because it sets the "startoffset" parameter for pcre2_match(). 99. If PCRE2_AUTO_CALLOUT was set on a pattern that had a (?# comment between an item and its qualifier (for example, A(?#comment)?B) pcre2_compile() misbehaved. This bug was found by the LLVM fuzzer. 100. The error for an invalid UTF pattern string always gave the code unit offset as zero instead of where the invalidity was found. 101. Further to 97 above, negated classes such as [^[:^ascii:]\d] were also not working correctly in UCP mode. 102. Similar to 99 above, if an isolated \E was present between an item and its qualifier when PCRE2_AUTO_CALLOUT was set, pcre2_compile() misbehaved. This bug was found by the LLVM fuzzer. 103. The POSIX wrapper function regexec() crashed if the option REG_STARTEND was set when the pmatch argument was NULL. It now returns REG_INVARG. 104. Allow for up to 32-bit numbers in the ordin() function in pcre2grep. 105. An empty \Q\E sequence between an item and its qualifier caused pcre2_compile() to misbehave when auto callouts were enabled. This bug was found by the LLVM fuzzer. 106. If both PCRE2_ALT_VERBNAMES and PCRE2_EXTENDED were set, and a (*MARK) or other verb "name" ended with whitespace immediately before the closing parenthesis, pcre2_compile() misbehaved. Example: /(*:abc )/, but only when both those options were set. 107. 
In a number of places pcre2_compile() was not handling NULL characters correctly, and pcre2test with the "bincode" modifier was not always correctly displaying fields containing NULLS: (a) Within /x extended #-comments (b) Within the "name" part of (*MARK) and other *verbs (c) Within the text argument of a callout 108. If a pattern that was compiled with PCRE2_EXTENDED started with white space or a #-type comment that was followed by (?-x), which turns off PCRE2_EXTENDED, and there was no subsequent (?x) to turn it on again, pcre2_compile() assumed that (?-x) applied to the whole pattern and consequently mis-compiled it. This bug was found by the LLVM fuzzer. The fix for this bug means that a setting of any of the (?imsxJU) options at the start of a pattern is no longer transferred to the options that are returned by PCRE2_INFO_ALLOPTIONS. In fact, this was an anachronism that should have changed when the effects of those options were all moved to compile time. 109. An escaped closing parenthesis in the "name" part of a (*verb) when PCRE2_ALT_VERBNAMES was set caused pcre2_compile() to malfunction. This bug was found by the LLVM fuzzer. 110. Implemented PCRE2_SUBSTITUTE_UNSET_EMPTY, and updated pcre2test to make it possible to test it. 111. "Harden" pcre2test against ridiculously large values in modifiers and command line arguments. 112. Implemented PCRE2_SUBSTITUTE_UNKNOWN_UNSET and PCRE2_SUBSTITUTE_OVERFLOW_LENGTH. 113. Fix printing of *MARK names that contain binary zeroes in pcre2test. Version 10.20 30-June-2015 -------------------------- 1. Callouts with string arguments have been added. 2. Assertion code generator in JIT has been optimized. 3. The invalid pattern (?(?C) has a missing assertion condition at the end. The pcre2_compile() function read past the end of the input before diagnosing an error. This bug was discovered by the LLVM fuzzer. 4. Implemented pcre2_callout_enumerate(). 5.
Fix JIT compilation of conditional blocks whose assertion is converted to (*FAIL). E.g: /(?(?!))/. 6. The pattern /(?(?!)^)/ caused references to random memory. This bug was discovered by the LLVM fuzzer. 7. The assertion (?!) is optimized to (*FAIL). This was not handled correctly when this assertion was used as a condition, for example (?(?!)a|b). In pcre2_match() it worked by luck; in pcre2_dfa_match() it gave an incorrect error about an unsupported item. 8. For some types of pattern, for example /Z*(|d*){216}/, the auto-possessification code could take exponential time to complete. A recursion depth limit of 1000 has been imposed to limit the resources used by this optimization. This infelicity was discovered by the LLVM fuzzer. 9. A pattern such as /(*UTF)[\S\V\H]/, which contains a negated special class such as \S in non-UCP mode, explicit wide characters (> 255) can be ignored because \S ensures they are all in the class. The code for doing this was interacting badly with the code for computing the amount of space needed to compile the pattern, leading to a buffer overflow. This bug was discovered by the LLVM fuzzer. 10. A pattern such as /((?2)+)((?1))/ which has mutual recursion nested inside other kinds of group caused stack overflow at compile time. This bug was discovered by the LLVM fuzzer. 11. A pattern such as /(?1)(?#?'){8}(a)/ which had a parenthesized comment between a subroutine call and its quantifier was incorrectly compiled, leading to buffer overflow or other errors. This bug was discovered by the LLVM fuzzer. 12. The illegal pattern /(?(?<E>.*!.*)?)/ was not being diagnosed as missing an assertion after (?(. The code was failing to check the character after (?(?< for the ! or = that would indicate a lookbehind assertion. This bug was discovered by the LLVM fuzzer. 13.
A pattern such as /X((?2)()*+){2}+/ which has a possessive quantifier with a fixed maximum following a group that contains a subroutine reference was incorrectly compiled and could trigger buffer overflow. This bug was discovered by the LLVM fuzzer. 14. Negative relative recursive references such as (?-7) to non-existent subpatterns were not being diagnosed and could lead to unpredictable behaviour. This bug was discovered by the LLVM fuzzer. 15. The bug fixed in 14 was due to an integer variable that was unsigned when it should have been signed. Some other "int" variables, having been checked, have either been changed to uint32_t or commented as "must be signed". 16. A mutual recursion within a lookbehind assertion such as (?<=((?2))((?1))) caused a stack overflow instead of the diagnosis of a non-fixed length lookbehind assertion. This bug was discovered by the LLVM fuzzer. 17. The use of \K in a positive lookbehind assertion in a non-anchored pattern (e.g. /(?<=\Ka)/) could make pcre2grep loop. 18. There was a similar problem to 17 in pcre2test for global matches, though the code there did catch the loop. 19. If a greedy quantified \X was preceded by \C in UTF mode (e.g. \C\X*), and a subsequent item in the pattern caused a non-match, backtracking over the repeated \X did not stop, but carried on past the start of the subject, causing reference to random memory and/or a segfault. There were also some other cases where backtracking after \C could crash. This set of bugs was discovered by the LLVM fuzzer. 20. The function for finding the minimum length of a matching string could take a very long time if mutual recursion was present many times in a pattern, for example, /((?2){73}(?2))((?1))/. A better mutual recursion detection method has been implemented. This infelicity was discovered by the LLVM fuzzer. 21. Implemented PCRE2_NEVER_BACKSLASH_C. 22. 
The feature for string replication in pcre2test could read from freed memory if the replication required a buffer to be extended, and it was not working properly in 16-bit and 32-bit modes. This issue was discovered by a fuzzer: see http://lcamtuf.coredump.cx/afl/. 23. Added the PCRE2_ALT_CIRCUMFLEX option. 24. Adjust the treatment of \8 and \9 to be the same as the current Perl behaviour. 25. Static linking against the PCRE2 library using the pkg-config module was failing on missing pthread symbols. 26. If a group that contained a recursive back reference also contained a forward reference subroutine call followed by a non-forward-reference subroutine call, for example /.((?2)(?R)\1)()/, pcre2_compile() failed to compile correct code, leading to undefined behaviour or an internally detected error. This bug was discovered by the LLVM fuzzer. 27. Quantification of certain items (e.g. atomic back references) could cause incorrect code to be compiled when recursive forward references were involved. For example, in this pattern: /(?1)()((((((\1++))\x85)+)|))/. This bug was discovered by the LLVM fuzzer. 28. A repeated conditional group whose condition was a reference by name caused a buffer overflow if there was more than one group with the given name. This bug was discovered by the LLVM fuzzer. 29. A recursive back reference by name within a group that had the same name as another group caused a buffer overflow. For example: /(?J)(?'d'(?'d'\g{d}))/. This bug was discovered by the LLVM fuzzer. 30. A forward reference by name to a group whose number is the same as the current group, for example in this pattern: /(?|(\k'Pm')|(?'Pm'))/, caused a buffer overflow at compile time. This bug was discovered by the LLVM fuzzer. 31. Fix -fsanitize=undefined warnings for left shifts of 1 by 31 (it treats 1 as an int; fixed by writing it as 1u). 32. Fix pcre2grep compile when -std=c99 is used with gcc, though it still gives a warning for "fileno" unless -std=gnu99 is used. 33.
A lookbehind assertion within a set of mutually recursive subpatterns could provoke a buffer overflow. This bug was discovered by the LLVM fuzzer. 34. Give an error for an empty subpattern name such as (?''). 35. Make pcre2test give an error if a pattern that follows #forbid_utf contains \P, \p, or \X. 36. The way named subpatterns are handled has been refactored. There is now a pre-pass over the regex which does nothing other than identify named subpatterns and count the total captures. This means that information about named patterns is known before the rest of the compile. In particular, it means that forward references can be checked as they are encountered. Previously, the code for handling forward references was contorted and led to several errors in computing the memory requirements for some patterns, leading to buffer overflows. 37. There was no check for integer overflow in subroutine calls such as (?123). 38. The table entry for \l in EBCDIC environments was incorrect, leading to its being treated as a literal 'l' instead of causing an error. 39. If a non-capturing group containing a conditional group that could match an empty string was repeated, it was not identified as matching an empty string itself. For example: /^(?:(?(1)x|)+)+$()/. 40. In an EBCDIC environment, pcretest was mishandling the escape sequences \a and \e in test subject lines. 41. In an EBCDIC environment, \a in a pattern was converted to the ASCII instead of the EBCDIC value. 42. The handling of \c in an EBCDIC environment has been revised so that it is now compatible with the specification in Perl's perlebcdic page. 43. Single character repetition in JIT has been improved. 20-30% speedup was achieved on certain patterns. 44. The EBCDIC character 0x41 is a non-breaking space, equivalent to 0xa0 in ASCII/Unicode. This has now been added to the list of characters that are recognized as white space in EBCDIC. 45.
When PCRE2 was compiled without Unicode support, the use of \p and \P gave an error (correctly) when used outside a class, but did not give an error within a class. 46. \h within a class was incorrectly compiled in EBCDIC environments. 47. JIT should return with error when the compiled pattern requires more stack space than the maximum. 48. Fixed a memory leak in pcre2grep when a locale is set. Version 10.10 06-March-2015 --------------------------- 1. When a pattern is compiled, it remembers the highest back reference so that when matching, if the ovector is too small, extra memory can be obtained to use instead. A conditional subpattern whose condition is a check on a capture having happened, such as, for example in the pattern /^(?:(a)|b)(?(1)A|B)/, is another kind of back reference, but it was not setting the highest backreference number. This mattered only if pcre2_match() was called with an ovector that was too small to hold the capture, and there was no other kind of back reference (a situation which is probably quite rare). The effect of the bug was that the condition was always treated as FALSE when the capture could not be consulted, leading to incorrect behaviour by pcre2_match(). This bug has been fixed. 2. Functions for serialization and deserialization of sets of compiled patterns have been added. 3. The value that is returned by PCRE2_INFO_SIZE has been corrected to remove excess code units at the end of the data block that may occasionally occur if the code for calculating the size over-estimates. This change stops the serialization code copying uninitialized data, to which valgrind objects. The documentation of PCRE2_INFO_SIZE was incorrect in stating that the size did not include the general overhead. This has been corrected. 4. All code units in every slot in the table of group names are now set, again in order to avoid accessing uninitialized data when serializing. 5. The (*NO_JIT) feature is implemented. 6.
If a bug that caused pcre2_compile() to use more memory than allocated was triggered when using valgrind, the code in (3) above passed a stupidly large value to valgrind. This caused a crash instead of an "internal error" return. 7. A reference to a duplicated named group (either a back reference or a test for being set in a conditional) that occurred in a part of the pattern where PCRE2_DUPNAMES was not set caused the amount of memory needed for the pattern to be incorrectly calculated, leading to overwriting. 8. A mutually recursive set of back references such as (\2)(\1) caused a segfault at compile time (while trying to find the minimum matching length). The infinite loop is now broken (with the minimum length unset, that is, zero). 9. If an assertion that was used as a condition was quantified with a minimum of zero, matching went wrong. In particular, if the whole group had unlimited repetition and could match an empty string, a segfault was likely. The pattern (?(?=0)?)+ is an example that caused this. Perl allows assertions to be quantified, but not if they are being used as conditions, so the above pattern is faulted by Perl. PCRE2 has now been changed so that it also rejects such patterns. 10. The error message for an invalid quantifier has been changed from "nothing to repeat" to "quantifier does not follow a repeatable item". 11. If a bad UTF string is compiled with NO_UTF_CHECK, it may succeed, but scanning the compiled pattern in subsequent auto-possessification can get out of step and lead to an unknown opcode. Previously this could have caused an infinite loop. Now it generates an "internal error" error. This is a tidyup, not a bug fix; passing bad UTF with NO_UTF_CHECK is documented as having an undefined outcome. 12. A UTF pattern containing a "not" match of a non-ASCII character and a subroutine reference could loop at compile time. Example: /[^\xff]((?1))/. 13. The locale test (RunTest 3) has been upgraded. 
It now checks that a locale that is found in the output of "locale -a" can actually be set by pcre2test before it is accepted. Previously, in an environment where a locale was listed but would not set (an example does exist), the test would "pass" without actually doing anything. Also the fr_CA locale has been added to the list of locales that can be used. 14. Fixed a bug in pcre2_substitute(). If a replacement string ended in a capturing group number without parentheses, the last character was incorrectly literally included at the end of the replacement string. 15. A possessive capturing group such as (a)*+ with a minimum repeat of zero failed to allow the zero-repeat case if pcre2_match() was called with an ovector too small to capture the group. 16. Improved error message in pcre2test when setting the stack size (-S) fails. 17. Fixed two bugs in CMakeLists.txt: (1) Some lines had got lost in the transfer from PCRE1, meaning that CMake configuration failed if "build tests" was selected. (2) The file src/pcre2_serialize.c had not been added to the list of PCRE2 sources, which caused a failure to build pcre2test. 18. Fixed typo in pcre2_serialize.c (DECL instead of DEFN) that causes problems only on Windows. 19. Use binary input when reading back saved serialized patterns in pcre2test. 20. Added RunTest.bat for running the tests under Windows. 21. "make distclean" was not removing config.h, a file that may be created for use with CMake. 22. A pattern such as "((?2){0,1999}())?", which has a group containing a forward reference repeated a large (but limited) number of times within a repeated outer group that has a zero minimum quantifier, caused incorrect code to be compiled, leading to the error "internal error: previously-checked referenced subpattern not found" when an incorrect memory address was read. This bug was reported as "heap overflow", discovered by Kai Lu of Fortinet's FortiGuard Labs. (Added 24-March-2015: CVE-2015-2325 was given to this.) 23. 
A pattern such as "((?+1)(\1))/" containing a forward reference subroutine call within a group that also contained a recursive back reference caused incorrect code to be compiled. This bug was reported as "heap overflow", discovered by Kai Lu of Fortinet's FortiGuard Labs. (Added 24-March-2015: CVE-2015-2326 was given to this.) 24. Computing the size of the JIT read-only data in advance has been a source of various issues, and new ones still appear, unfortunately. To fix existing and future issues, size computation is eliminated from the code, and replaced by on-demand memory allocation. 25. A pattern such as /(?i)[A-`]/, where characters in the other case are adjacent to the end of the range, and the range contained characters with more than one other case, caused incorrect behaviour when compiled in UTF mode. In that example, the range a-j was left out of the class. Version 10.00 05-January-2015 ----------------------------- Version 10.00 is the first release of PCRE2, a revised API for the PCRE library. Changes prior to 10.00 are logged in the ChangeLog file for the old API, up to item 20 for release 8.36. The code of the library was heavily revised as part of the new API implementation. Details of each and every modification were not individually logged. In addition to the API changes, the following changes were made. They are either new functionality, or bug fixes and other noticeable changes of behaviour that were implemented after the code had been forked. 1. Including Unicode support at build time is now enabled by default, but it can optionally be disabled. It is not enabled by default at run time (no change). 2. The test program, now called pcre2test, was re-specified and almost completely re-written. Its input is not compatible with input for pcretest. 3. Patterns may start with (*NOTEMPTY) or (*NOTEMPTY_ATSTART) to set the PCRE2_NOTEMPTY or PCRE2_NOTEMPTY_ATSTART options for every subject line that is matched by that pattern. 4.
For the benefit of those who use PCRE2 via some other application, that is, not writing the function calls themselves, it is possible to check the PCRE2 version by matching a pattern such as /(?(VERSION>=10)yes|no)/ against a string such as "yesno". 5. There are case-equivalent Unicode characters whose encodings use different numbers of code units in UTF-8. U+023A and U+2C65 are one example. (It is theoretically possible for this to happen in UTF-16 too.) If a backreference to a group containing one of these characters was greedily repeated, and during the match a backtrack occurred, the subject might be backtracked by the wrong number of code units. For example, if /^(\x{23a})\1*(.)/ is matched caselessly (and in UTF-8 mode) against "\x{23a}\x{2c65}\x{2c65}\x{2c65}", group 2 should capture the final character, which is the three bytes E2, B1, and A5 in UTF-8. Incorrect backtracking meant that group 2 captured only the last two bytes. This bug has been fixed; the new code is slower, but it is used only when the strings matched by the repetition are not all the same length. 6. A pattern such as /()a/ was not setting the "first character must be 'a'" information. This applied to any pattern with a group that matched no characters, for example: /(?:(?=.)|(? start of atomic group META_CIRCUMFLEX ^ metacharacter META_CLASS [ start of non-empty class META_CLASS_EMPTY [] empty class - only with PCRE2_ALLOW_EMPTY_CLASS META_CLASS_EMPTY_NOT [^] negative empty class - ditto META_CLASS_END ] end of non-empty class META_CLASS_NOT [^ start non-empty negative class META_COMMIT (*COMMIT) - no argument (see below for with argument) META_COND_ASSERT (?(?assertion) META_DOLLAR $ metacharacter META_DOT . metacharacter META_END End of pattern (this value is 0x80000000) META_FAIL (*FAIL) META_KET ) closing parenthesis META_LOOKAHEAD (?= start of lookahead META_LOOKAHEAD_NA (*napla: start of non-atomic lookahead META_LOOKAHEADNOT (?! 
start of negative lookahead META_NOCAPTURE (?: no capture parens META_PLUS + META_PLUS_PLUS ++ META_PLUS_QUERY +? META_PRUNE (*PRUNE) - no argument (see below for with argument) META_QUERY ? META_QUERY_PLUS ?+ META_QUERY_QUERY ?? META_RANGE_ESCAPED hyphen in class range with at least one escape META_RANGE_LITERAL hyphen in class range defined literally META_SKIP (*SKIP) - no argument (see below for with argument) META_THEN (*THEN) - no argument (see below for with argument) META_ECLASS_AND && (or &) in an extended character class META_ECLASS_OR || (or |, +) in an extended character class META_ECLASS_SUB -- (or -) in an extended character class META_ECLASS_XOR ~~ (or ^) in an extended character class META_ECLASS_NOT ! in an extended character class The two RANGE values occur only in character classes. They are positioned between two literals that define the start and end of the range. In an EBCDIC environment it is necessary to know whether either of the range values was specified as an escape. In an ASCII/Unicode environment the distinction is not relevant. The following have data in the lower 16 bits, and may be followed by other data elements: META_ALT | alternation META_BACKREF back reference META_CAPTURE start of capturing group META_ESCAPE non-literal escape sequence META_RECURSE recursion call If the data for META_ALT is non-zero, it is inside a lookbehind, and the data is the maximum length of its branch (see META_LOOKBEHIND below for more detail). META_BACKREF, META_CAPTURE, and META_RECURSE have the capture group number as their data in the lower 16 bits of the element. META_RECURSE is followed by an offset, for use in error messages. META_BACKREF is followed by an offset if the back reference group number is 10 or more. The offsets of the first occurrences of references to groups whose numbers are less than 10 are put in cb->small_ref_offset[] (only the first occurrence is useful). 
On 64-bit systems this avoids using more than two parsed pattern elements for items such as \3. The offset is used when an error occurs because the reference is to a non-existent group. META_ESCAPE is used for escapes such as \d that match a character. It has an ESC_xxx value as its data. For ESC_P and ESC_p, the next element contains the 16-bit type and data property values, packed together. Escape sequences such as \g and \k are turned into other items like META_RECURSE or META_BACKREF and their ESC_xxx values never occur with META_ESCAPE. The following have one data item that follows in the next vector element: META_BIGVALUE Next is a literal >= META_END META_POSIX POSIX class item (data identifies the class) META_POSIX_NEG negative POSIX class item (ditto) The following are followed by a length element, then a number of character code values (which should match with the length): META_MARK (*MARK:xxxx) META_COMMIT_ARG (*COMMIT:xxxx) META_PRUNE_ARG (*PRUNE:xxxx) META_SKIP_ARG (*SKIP:xxxx) META_THEN_ARG (*THEN:xxxx) The following are followed by a length element, then an offset in the pattern that identifies the name: META_COND_NAME (?(<name>) or (?('name') or (?(name) META_COND_RNAME (?(R&name) META_COND_RNUMBER (?(Rdigits) META_RECURSE_BYNAME (?&name) META_BACKREF_BYNAME \k'name' or \k<name> or \k{name} or \g{name} META_SCS_NAME (*scs:(<name>)...) META_COND_RNUMBER is used for names that start with R and continue with digits, because this is an ambiguous case. It could be a back reference to a group with that name, or it could be a recursion test on a numbered group. These are followed by an offset, for use in error messages, then a number: META_COND_NUMBER (?([+-]digits) META_SCS_NUMBER (*scs:(digits)...) The following is followed just by an offset, for use in error messages: META_COND_DEFINE (?(DEFINE) The following are at first also followed just by an offset for use in error messages.
After the lengths of the branches of a lookbehind group have been checked the error offset is no longer needed. The lower 16 bits of the main word are now set to the maximum length of the first branch of the lookbehind group, and the second word is set to the minimum matching length for a variable-length lookbehind group, or to LOOKBEHIND_MAX for a group whose branches are all of fixed length. These values are used when generating OP_REVERSE or OP_VREVERSE for the first branch. The minimum value is also used for any subsequent branches because there is only room for one value (the branch maximum length) in a META_ALT item. META_LOOKBEHIND (?<= start of lookbehind META_LOOKBEHIND_NA (*naplb: start of non-atomic lookbehind META_LOOKBEHINDNOT (?' and 1 for '>='; the next two are the major and minor numbers: META_COND_VERSION (?(VERSIONx.y) Callouts are converted into one of two items: META_CALLOUT_NUMBER (?C with numerical argument META_CALLOUT_STRING (?C with string argument In both cases, the next two elements contain the offset and length of the next item in the pattern. Then there is either one callout number, or a length and an offset for the string argument. The length includes both delimiters. Traditional matching function ----------------------------- The "traditional", and original, matching function is called pcre2_match(), and it implements an NFA algorithm, similar to the original Henry Spencer algorithm and the way that Perl works. This is not surprising, since it is intended to be as compatible with Perl as possible. This is the function most users of PCRE2 will use most of the time. If PCRE2 is compiled with just-in-time (JIT) support, and studying a compiled pattern with JIT is successful, the JIT code is run instead of the normal pcre2_match() code, but the result is the same.
The interpreter used to implement backtracking by means of recursive function calls, but this gave rise to regular complaints when patterns with large search trees ran out of stack. There was for a while a fudge that used the heap instead, but this was inefficient and slow. In 2017 I re-wrote pcre2_match() as a single, non-recursive function that implements backtracking via a vector of "frames" on the heap, each frame representing a backtracking point. As well as standard information such as the position in the pattern and position in the subject, each frame has a number of unassigned variables that can be used locally to preserve values at a backtracking point. C macros are used extensively to implement all of this. Supplementary matching function ------------------------------- There is a supplementary matching function called pcre2_dfa_match() that implements a DFA matching algorithm that searches simultaneously for all possible matches that start at one point in the subject string. (Going back to my roots: see Historical Note 1 above.) This function interprets the same compiled pattern data as pcre2_match(); however, not all the facilities are available, and those that are do not always work in quite the same way. In particular, capturing parentheses and backreferences are not supported. See the user documentation for details. The algorithm that is used for pcre2_dfa_match() is not a traditional FSM, because it may have a number of states active at one time. More work would be needed at compile time to produce a traditional FSM where only one state is ever active at once. I believe some other regex matchers work this way. JIT support is not available for this kind of matching. Changeable options ------------------ The /i, /m, or /s options (PCRE2_CASELESS, PCRE2_MULTILINE, PCRE2_DOTALL) and some others may be changed in the middle of patterns by items such as (?i).
Their processing is handled entirely at compile time by generating different opcodes for the different settings. Some options are copied into the opcode's data, for opcodes such as OP_REFI which depends on the (?r) (PCRE2_EXTRA_CASELESS_RESTRICT) option. The runtime functions do not need to keep track of an option's state. PCRE2_DUPNAMES, PCRE2_EXTENDED, PCRE2_EXTENDED_MORE, and PCRE2_NO_AUTO_CAPTURE are tracked and processed during the parsing pre-pass. The others are handled from META_OPTIONS items during the main compile phase. Format of compiled patterns --------------------------- The compiled form of a pattern is a vector of unsigned code units (bytes in 8-bit mode, shorts in 16-bit mode, 32-bit words in 32-bit mode), containing items of variable length. The first code unit in an item contains an opcode, and the length of the item is either implicit in the opcode or contained in the data that follows it. In many cases listed below, LINK_SIZE data values are specified for offsets within the compiled pattern. LINK_SIZE always specifies a number of bytes. The default value for LINK_SIZE is 2, except for the 32-bit library, where it can only be 4. The 8-bit library can be compiled to use 3-byte or 4-byte values, and the 16-bit library can be compiled to use 4-byte values, though this impairs performance. Specifying a LINK_SIZE larger than 2 for these libraries is necessary only when patterns whose compiled length is greater than 65535 code units are going to be processed. When a LINK_SIZE value uses more than one code unit, the most significant unit is first. In this description, we assume the "normal" compilation options. Data values that are counts (e.g. quantifiers) are always two bytes long in 8-bit mode (most significant byte first), and one code unit in 16-bit and 32-bit modes. 
Opcodes with no following data ------------------------------ These items are all just one code unit long: OP_END end of pattern OP_ANY match any one character other than newline OP_ALLANY match any one character, including newline OP_ANYBYTE match any single code unit, even in UTF-8/16 mode OP_SOD match start of data: \A OP_SOM, start of match (subject + offset): \G OP_SET_SOM, set start of match (\K) OP_CIRC ^ (start of data) OP_CIRCM ^ multiline mode (start of data or after newline) OP_NOT_WORD_BOUNDARY \B OP_WORD_BOUNDARY \b OP_NOT_DIGIT \D OP_DIGIT \d OP_NOT_HSPACE \H OP_HSPACE \h OP_NOT_WHITESPACE \S OP_WHITESPACE \s OP_NOT_VSPACE \V OP_VSPACE \v OP_NOT_WORDCHAR \W OP_WORDCHAR \w OP_EODN match end of data or newline at end: \Z OP_EOD match end of data: \z OP_DOLL $ (end of data, or before final newline) OP_DOLLM $ multiline mode (end of data or before newline) OP_EXTUNI match an extended Unicode grapheme cluster OP_ANYNL match any Unicode newline sequence OP_ASSERT_ACCEPT ) OP_ACCEPT ) These are Perl 5.10's "backtracking control OP_COMMIT ) verbs". If OP_ACCEPT is inside capturing OP_FAIL ) parentheses, it may be preceded by one or more OP_PRUNE ) OP_CLOSE, each followed by a number that OP_SKIP ) indicates which parentheses must be closed. OP_THEN ) OP_ASSERT_ACCEPT is used when (*ACCEPT) is encountered within an assertion. This ends the assertion, not the entire pattern match. The assertion (?!) is always optimized to OP_FAIL. OP_ALLANY is used for '.' when PCRE2_DOTALL is set. It is also used for \C in non-UTF modes and in UTF-32 mode (since one code unit still equals one character). Another use is for [^] when empty classes are permitted (PCRE2_ALLOW_EMPTY_CLASS is set). Backtracking control verbs -------------------------- Verbs with no arguments generate opcodes with no following data (as listed in the section above). (*MARK:NAME) generates OP_MARK followed by the mark name, preceded by a length in one code unit, and followed by a binary zero.
The name length is limited by the size of the code unit. (*ACCEPT:NAME) and (*FAIL:NAME) are compiled as (*MARK:NAME)(*ACCEPT) and (*MARK:NAME)(*FAIL) respectively. For (*COMMIT:NAME), (*PRUNE:NAME), (*SKIP:NAME), and (*THEN:NAME), the opcodes OP_COMMIT_ARG, OP_PRUNE_ARG, OP_SKIP_ARG, and OP_THEN_ARG are used, with the name following in the same format as for OP_MARK. Matching literal characters --------------------------- The OP_CHAR opcode is followed by a single character that is to be matched casefully. For caseless matching of characters that have at most two case-equivalent code points, OP_CHARI is used. In UTF-8 or UTF-16 modes, the character may be more than one code unit long. In UTF-32 mode, characters are always exactly one code unit long. If there is only one character in a character class, OP_CHAR or OP_CHARI is used for a positive class, and OP_NOT or OP_NOTI for a negative one (that is, for something like [^a]). Caseless matching (positive or negative) of characters that have more than two case-equivalent code points (which is possible only in UTF mode) is handled by compiling a Unicode property item (see below), with the pseudo-property PT_CLIST. The value of this property is an offset in a vector called "ucd_caseless_sets" which identifies the start of a short list of case equivalent characters, terminated by the value NOTACHAR (0xffffffff). Repeating single characters --------------------------- The common repeats (*, +, ?), when applied to a single character, use the following opcodes, which come in caseful and caseless versions: Caseful Caseless OP_STAR OP_STARI OP_MINSTAR OP_MINSTARI OP_POSSTAR OP_POSSTARI OP_PLUS OP_PLUSI OP_MINPLUS OP_MINPLUSI OP_POSPLUS OP_POSPLUSI OP_QUERY OP_QUERYI OP_MINQUERY OP_MINQUERYI OP_POSQUERY OP_POSQUERYI Each opcode is followed by the character that is to be repeated. In ASCII or UTF-32 modes, these are two-code-unit items; in UTF-8 or UTF-16 modes, the length is variable. 
Those with "MIN" in their names are the minimizing versions. Those with "POS" in their names are possessive versions. Other kinds of repeat make use of these opcodes: Caseful Caseless OP_UPTO OP_UPTOI OP_MINUPTO OP_MINUPTOI OP_POSUPTO OP_POSUPTOI OP_EXACT OP_EXACTI Each of these is followed by a count and then the repeated character. The count is two bytes long in 8-bit mode (most significant byte first), or one code unit in 16-bit and 32-bit modes. OP_UPTO matches from 0 to the given number. A repeat with a non-zero minimum and a fixed maximum is coded as an OP_EXACT followed by an OP_UPTO (or OP_MINUPTO or OP_POSUPTO). Another set of matching repeating opcodes (called OP_NOTSTAR, OP_NOTSTARI, etc.) are used for repeated, negated, single-character classes such as [^a]*. The normal single-character opcodes (OP_STAR, etc.) are used for repeated positive single-character classes. Repeating character types ------------------------- Repeats of things like \d are done exactly as for single characters, except that instead of a character, the opcode for the type (e.g. OP_DIGIT) is stored in the next code unit. The opcodes are: OP_TYPESTAR OP_TYPEMINSTAR OP_TYPEPOSSTAR OP_TYPEPLUS OP_TYPEMINPLUS OP_TYPEPOSPLUS OP_TYPEQUERY OP_TYPEMINQUERY OP_TYPEPOSQUERY OP_TYPEUPTO OP_TYPEMINUPTO OP_TYPEPOSUPTO OP_TYPEEXACT Match by Unicode property ------------------------- OP_PROP and OP_NOTPROP are used for positive and negative matches of a character by testing its Unicode property (the \p and \P escape sequences). Each is followed by two code units that encode the desired property as a type and a value. The types are a set of #defines of the form PT_xxx, and the values are enumerations of the form ucp_xx, defined in the pcre2_ucp.h source file.
The value is relevant only for PT_GC (General Category), PT_PC (Particular Category), PT_SC (Script), PT_BIDICL (Bidi Class), PT_BOOL (Boolean property), and the pseudo-property PT_CLIST, which is used to identify a list of case-equivalent characters when there are three or more (see above). Repeats of these items use the OP_TYPESTAR etc. set of opcodes, followed by three code units: OP_PROP or OP_NOTPROP, and then the desired property type and value. Character classes ----------------- If there is only one character in a class, OP_CHAR or OP_CHARI is used for a positive class, and OP_NOT or OP_NOTI for a negative one (that is, for something like [^a]), except when caselessly matching a character that has more than two case-equivalent code points (which can happen only in UTF mode). In this case a Unicode property item is used, as described above in "Matching literal characters". A set of repeating opcodes (called OP_NOTSTAR etc.) are used for repeated, negated, single-character classes. The normal single-character opcodes (OP_STAR, etc.) are used for repeated positive single-character classes. When there is more than one character in a class, and all the code points are less than 256, OP_CLASS is used for a positive class, and OP_NCLASS for a negative one. In either case, the opcode is followed by a 32-byte (16-short, 8-word) bit map containing a 1 bit for every character that is acceptable. The bits are counted from the least significant end of each unit. In caseless mode, bits for both cases are set. The reason for having both OP_CLASS and OP_NCLASS is so that, in UTF-8 and 16-bit and 32-bit modes, subject characters with values greater than 255 can be handled correctly. For OP_CLASS they do not match, whereas for OP_NCLASS they do. For classes containing characters with values greater than 255 or that contain \p or \P, OP_XCLASS is used. It optionally uses a bit map if any acceptable code points are less than 256. 
After the bit map, the properties of the character class are listed, if they are present. The items in the list follow the declaration order of the pattern string. The property list is followed by single characters and/or character ranges, if they are present. The characters/ranges are sorted in ascending order, and at least one non-matching character must be present between any two of them. In caseless mode, all equivalent characters are explicitly listed. OP_XCLASS is followed by a LINK_SIZE value containing the total length of the opcode and its data. This is followed by a code unit containing flag bits: XCL_NOT indicates that this is a negative class, and XCL_MAP indicates that a bit map is present. There follows the bit map, if XCL_MAP is set, and then a sequence of items coded as follows: XCL_END marks the end of the list XCL_SINGLE one character follows XCL_RANGE two characters follow XCL_PROP a Unicode property (type, value) follows XCL_NOTPROP a Unicode property (type, value) follows If a range starts with a code point less than 256 and ends with one greater than 255, it is split into two ranges, with characters less than 256 being indicated in the bit map, and the rest with XCL_RANGE. When XCL_NOT is set, the bit map, if present, contains bits for characters that are allowed (exactly as for OP_NCLASS), but the list of items that follow it specifies characters and properties that are not allowed. The meaning of the bitmap indicated by XCL_MAP is that, if one is present, then it fully describes which code points < 256 match the class (without needing to invert the check according to XCL_NOT); the other items in the OP_XCLASS need not be consulted. However, if a bitmap is not present, then code points < 256 may still match, so the other items in the OP_XCLASS must be consulted. For classes containing logical expressions, such as "[\p{Greek} && \p{Lu}]" for "uppercase Greek letters", OP_ECLASS is used.
The expression is encoded as a stack-based series of operands and operators, in Reverse Polish Notation. Like an OP_XCLASS, the OP_ECLASS is first followed by a LINK_SIZE value containing the total length of the opcode and its data. That is followed by a code unit containing flags: currently just ECL_MAP indicating that a bit map is present. There follows the bit map, if ECL_MAP is set. Finally, there is a sequence of items that are either an operand or operator. Each item starts with a single code unit containing its type: ECL_AND AND; no additional data ECL_OR OR; no additional data ECL_XOR XOR; no additional data ECL_NOT NOT; no additional data ECL_XCLASS The additional data which follows ECL_XCLASS is the same as for an OP_XCLASS, except that this data is preceded by ECL_XCLASS rather than OP_XCLASS. Because the OP_ECLASS has its own bitmap (if required), an ECL_XCLASS should not contain a bitmap. Additionally, there are two intermediate values used during compilation, but these are folded away during generation of the opcode, and so never appear inside an OP_ECLASS at match time. They are: ECL_ANY match all characters; no additional data ECL_NONE match no characters; no additional data The meaning of the bitmap indicated by ECL_MAP is the same as XCL_MAP. If the bitmap is present, all codepoints < 256 are checked against the bitmap. Back references --------------- OP_REF (caseful) or OP_REFI (caseless) is followed by a count containing the reference number when the reference is to a unique capturing group (either by number or by name). When named groups are used, there may be more than one group with the same name. In this case, a reference to such a group by name generates OP_DNREF or OP_DNREFI. These are followed by two counts: the index (not the byte offset) in the group name table of the first entry for the required name, followed by the number of groups with the same name. The matching code can then search for the first one that is set.
OP_REFI and OP_DNREFI are further followed by an item containing any case-insensitivity flags. Repeating character classes and back references ----------------------------------------------- Single-character classes are handled specially (see above). This section applies to other classes and also to back references. In both cases, the repeat information follows the base item. The matching code looks at the following opcode to see if it is one of these: OP_CRSTAR OP_CRMINSTAR OP_CRPOSSTAR OP_CRPLUS OP_CRMINPLUS OP_CRPOSPLUS OP_CRQUERY OP_CRMINQUERY OP_CRPOSQUERY OP_CRRANGE OP_CRMINRANGE OP_CRPOSRANGE All but the last three are single-code-unit items, with no data. The range opcodes are followed by the minimum and maximum repeat counts. Brackets and alternation ------------------------ A pair of non-capturing round brackets is wrapped round each expression at compile time, so alternation always happens in the context of brackets. [Note for North Americans: "bracket" to some English speakers, including myself, can be round, square, curly, or pointy. Hence this usage rather than "parentheses".] Non-capturing brackets use the opcode OP_BRA, capturing brackets use OP_CBRA. A bracket opcode is followed by a LINK_SIZE value which gives the offset to the next alternative OP_ALT or, if there aren't any branches, to the terminating opcode. Each OP_ALT is followed by a LINK_SIZE value giving the offset to the next one, or to the final opcode. For capturing brackets, the bracket number is a count that immediately follows the offset. There are several opcodes that mark the end of a subpattern group. OP_KET is used for subpatterns that do not repeat indefinitely, OP_KETRMIN and OP_KETRMAX are used for indefinite repetitions, minimally or maximally respectively, and OP_KETRPOS for possessive repetitions (see below for more details). All four are followed by a LINK_SIZE value giving (as a positive number) the offset back to the matching opening bracket opcode. 
If a subpattern is quantified such that it is permitted to match zero times, it is preceded by one of OP_BRAZERO, OP_BRAMINZERO, or OP_SKIPZERO. These are single-unit opcodes that tell the matcher that skipping the following subpattern entirely is a valid match. In the case of the first two, not skipping the pattern is also valid (greedy and non-greedy). The third is used when a pattern has the quantifier {0,0}. It cannot be entirely discarded, because it may be called as a subroutine from elsewhere in the pattern. A subpattern with an indefinite maximum repetition is replicated in the compiled data its minimum number of times (or once with OP_BRAZERO if the minimum is zero), with the final copy terminating with OP_KETRMIN or OP_KETRMAX as appropriate. A subpattern with a bounded maximum repetition is replicated in a nested fashion up to the maximum number of times, with OP_BRAZERO or OP_BRAMINZERO before each replication after the minimum, so that, for example, (abc){2,5} is compiled as (abc)(abc)((abc)((abc)(abc)?)?)?, except that each bracketed group has the same number. When a repeated subpattern has an unbounded upper limit, it is checked to see whether it could match an empty string. If this is the case, the opcode in the final replication is changed to OP_SBRA or OP_SCBRA. This tells the matcher that it needs to check for matching an empty string when it hits OP_KETRMIN or OP_KETRMAX, and if so, to break the loop. Possessive brackets ------------------- When a repeated group (capturing or non-capturing) is marked as possessive by the "+" notation, e.g. (abc)++, different opcodes are used. Their names all have POS on the end, e.g. OP_BRAPOS instead of OP_BRA and OP_SCBRAPOS instead of OP_SCBRA. The end of such a group is marked by OP_KETRPOS. If the minimum repetition is zero, the group is preceded by OP_BRAPOSZERO. Once-only (atomic) groups ------------------------- These are just like other subpatterns, but they start with the opcode OP_ONCE. 
The check for matching an empty string in an unbounded repeat is handled entirely at runtime, so there is just this one opcode for atomic groups. Assertions ---------- Forward assertions are also just like other subpatterns, but starting with one of the opcodes OP_ASSERT, OP_ASSERT_NA (non-atomic assertion), or OP_ASSERT_NOT. Backward assertions use the opcodes OP_ASSERTBACK, OP_ASSERTBACK_NA, and OP_ASSERTBACK_NOT. If all the branches of a backward assertion are of fixed length (not necessarily the same), the first opcode inside each branch is OP_REVERSE, followed by an IMM2_SIZE count of the number of characters to move back the pointer in the subject string, thus allowing each branch to have a different (but fixed) length. Variable-length backward assertions whose maximum matching length is limited are also supported. For such assertions, the first opcode inside each branch is OP_VREVERSE, followed by the minimum and maximum lengths for that branch, unless these happen to be equal, in which case OP_REVERSE is used. These IMM2_SIZE values occupy two code units each in 8-bit mode, and 1 code unit in 16/32 bit modes. In ASCII or UTF-32 mode, the character counts in OP_REVERSE and OP_VREVERSE are also the number of code units, but in UTF-8/16 mode each character may occupy more than one code unit. The "scan substring" assertion compiles as OP_ASSERT_SCS. This opcode is followed by a list of arguments. Each argument is either an OP_CREF or OP_DNCREF byte code sequence. The details of these sequences are described in the next section. For example (*scs:(1,'NAME')...PATTERN...) is translated to: [OP_ASSERT_SCS] [OP_CREF] [OP_CREF] ...PATTERN... [OP_KET] If 'NAME' is a duplicated name, the second [OP_CREF] is [OP_DNCREF] instead. Conditional subpatterns ----------------------- These are like other subpatterns, but they start with the opcode OP_COND, or OP_SCOND for one that might match an empty string in an unbounded repeat. 
If the condition is a back reference, this is stored at the start of the subpattern using the opcode OP_CREF followed by a count containing the reference number, provided that the reference is to a unique capturing group. If the reference was by name and there is more than one group with that name, OP_DNCREF is used instead. It is followed by two counts: the index in the group names table, and the number of groups with the same name. This allows the matcher to check if any group with the given name is set. If the condition is "in recursion" (coded as "(?(R)"), or "in recursion of group x" (coded as "(?(Rx)"), the group number is stored at the start of the subpattern using the opcode OP_RREF (with a value of RREF_ANY (0xffff) for "the whole pattern") or OP_DNRREF (with data as for OP_DNCREF). For a DEFINE condition, OP_FALSE is used (with no associated data). During compilation, however, a DEFINE condition is coded as OP_DEFINE so that, when the conditional group is complete, there can be a check to ensure that it contains only one top-level branch. Once this has happened, the opcode is changed to OP_FALSE, so the matcher never sees OP_DEFINE. There is a special PCRE2-specific condition of the form (VERSION[>]=x.y), which tests the PCRE2 version number. This compiles into one of the opcodes OP_TRUE or OP_FALSE. If a condition is not a back reference, recursion test, DEFINE, or VERSION, it must start with a parenthesized atomic assertion, whose opcode normally immediately follows OP_COND or OP_SCOND. However, if automatic callouts are enabled, a callout is inserted immediately before the assertion. It is also possible to insert a manual callout at this point. Only assertion conditions may have callouts preceding the condition. A condition that is the negative assertion (?!) is optimized to OP_FAIL in all parts of the pattern, so this is another opcode that may appear as a condition. It is treated the same as OP_FALSE.
Recursion --------- Recursion either matches the current pattern, or some subexpression. The opcode OP_RECURSE is followed by a LINK_SIZE value that is the offset to the starting bracket from the start of the whole pattern. OP_RECURSE is also used for "subroutine" calls, even though they are not strictly a recursion. Up till release 10.30 recursions were treated as atomic groups, making them incompatible with Perl (but PCRE had them well before Perl did). From 10.30, backtracking into recursions is supported. Repeated recursions used to be wrapped inside OP_ONCE brackets, which not only forced no backtracking, but also allowed repetition to be handled as for other bracketed groups. From 10.30 onwards, repeated recursions are duplicated for their minimum repetitions, and then wrapped in non-capturing brackets for the remainder. For example, (?1){3} is treated as (?1)(?1)(?1), and (?1){2,4} is treated as (?1)(?1)(?:(?1)){0,2}. Callouts -------- A callout may have either a numerical argument or a string argument. These use OP_CALLOUT or OP_CALLOUT_STR, respectively. In each case these are followed by two LINK_SIZE values giving the offset in the pattern string to the start of the following item, and another count giving the length of this item. These values make it possible for pcre2test to output useful tracing information using callouts. In the case of a numeric callout, after these two values there is a single code unit containing the callout number, in the range 0-255, with 255 being used for callouts that are automatically inserted as a result of the PCRE2_AUTO_CALLOUT option. 
Thus, this opcode item is of fixed length: [OP_CALLOUT] [PATTERN_OFFSET] [PATTERN_LENGTH] [NUMBER] For callouts with string arguments, OP_CALLOUT_STR has three more data items: a LINK_SIZE value giving the complete length of the entire opcode item, a LINK_SIZE item containing the offset within the pattern string to the start of the string argument, and the string itself, preceded by its starting delimiter and followed by a binary zero. When a callout function is called, a pointer to the actual string is passed, but the delimiter can be accessed as string[-1] if the application needs it. In the 8-bit library, the callout in /X(?C'abc')Y/ is compiled as the following bytes (decimal numbers represent binary values): [OP_CALLOUT_STR] [0] [10] [0] [1] [0] [14] [0] [5] ['] [a] [b] [c] [0] -------- ------- -------- ------- | | | | ------- LINK_SIZE items ------ Opcode table checking --------------------- The last opcode that is defined in pcre2_internal.h is OP_TABLE_LENGTH. This is not a real opcode, but is used to check at compile time that tables indexed by opcode are the correct length, in order to catch updating errors. See also -------- The file maint/README contains additional information. Philip Hazel August 2024 ================================================ FILE: INSTALL ================================================ Installation Instructions ************************* Copyright (C) 1994-1996, 1999-2002, 2004-2017, 2020-2021 Free Software Foundation, Inc. Copying and distribution of this file, with or without modification, are permitted in any medium without royalty provided the copyright notice and this notice are preserved. This file is offered as-is, without warranty of any kind. Basic Installation ================== Briefly, the shell command './configure && make && make install' should configure, build, and install this package. The following more-detailed instructions are generic; see the 'README' file for instructions specific to this package. 
Some packages provide this 'INSTALL' file but do not implement all of the features documented below. The lack of an optional feature in a given package is not necessarily a bug. More recommendations for GNU packages can be found in *note Makefile Conventions: (standards)Makefile Conventions. The 'configure' shell script attempts to guess correct values for various system-dependent variables used during compilation. It uses those values to create a 'Makefile' in each directory of the package. It may also create one or more '.h' files containing system-dependent definitions. Finally, it creates a shell script 'config.status' that you can run in the future to recreate the current configuration, and a file 'config.log' containing compiler output (useful mainly for debugging 'configure'). It can also use an optional file (typically called 'config.cache' and enabled with '--cache-file=config.cache' or simply '-C') that saves the results of its tests to speed up reconfiguring. Caching is disabled by default to prevent problems with accidental use of stale cache files. If you need to do unusual things to compile the package, please try to figure out how 'configure' could check whether to do them, and mail diffs or instructions to the address given in the 'README' so they can be considered for the next release. If you are using the cache, and at some point 'config.cache' contains results you don't want to keep, you may remove or edit it. The file 'configure.ac' (or 'configure.in') is used to create 'configure' by a program called 'autoconf'. You need 'configure.ac' if you want to change it or regenerate 'configure' using a newer version of 'autoconf'. The simplest way to compile this package is: 1. 'cd' to the directory containing the package's source code and type './configure' to configure the package for your system. Running 'configure' might take a while. While running, it prints some messages telling which features it is checking for. 2. 
Type 'make' to compile the package. 3. Optionally, type 'make check' to run any self-tests that come with the package, generally using the just-built uninstalled binaries. 4. Type 'make install' to install the programs and any data files and documentation. When installing into a prefix owned by root, it is recommended that the package be configured and built as a regular user, and only the 'make install' phase executed with root privileges. 5. Optionally, type 'make installcheck' to repeat any self-tests, but this time using the binaries in their final installed location. This target does not install anything. Running this target as a regular user, particularly if the prior 'make install' required root privileges, verifies that the installation completed correctly. 6. You can remove the program binaries and object files from the source code directory by typing 'make clean'. To also remove the files that 'configure' created (so you can compile the package for a different kind of computer), type 'make distclean'. There is also a 'make maintainer-clean' target, but that is intended mainly for the package's developers. If you use it, you may have to get all sorts of other programs in order to regenerate files that came with the distribution. 7. Often, you can also type 'make uninstall' to remove the installed files again. In practice, not all packages have tested that uninstallation works correctly, even though it is required by the GNU Coding Standards. 8. Some packages, particularly those that use Automake, provide 'make distcheck', which can be used by developers to test that all other targets like 'make install' and 'make uninstall' work correctly. This target is generally not run by end users. Compilers and Options ===================== Some systems require unusual options for compilation or linking that the 'configure' script does not know about. Run './configure --help' for details on some of the pertinent environment variables.
You can give 'configure' initial values for configuration parameters by setting variables in the command line or in the environment. Here is an example: ./configure CC=c99 CFLAGS=-g LIBS=-lposix *Note Defining Variables::, for more details. Compiling For Multiple Architectures ==================================== You can compile the package for more than one kind of computer at the same time, by placing the object files for each architecture in their own directory. To do this, you can use GNU 'make'. 'cd' to the directory where you want the object files and executables to go and run the 'configure' script. 'configure' automatically checks for the source code in the directory that 'configure' is in and in '..'. This is known as a "VPATH" build. With a non-GNU 'make', it is safer to compile the package for one architecture at a time in the source code directory. After you have installed the package for one architecture, use 'make distclean' before reconfiguring for another architecture. On MacOS X 10.5 and later systems, you can create libraries and executables that work on multiple system types--known as "fat" or "universal" binaries--by specifying multiple '-arch' options to the compiler but only a single '-arch' option to the preprocessor. Like this: ./configure CC="gcc -arch i386 -arch x86_64 -arch ppc -arch ppc64" \ CXX="g++ -arch i386 -arch x86_64 -arch ppc -arch ppc64" \ CPP="gcc -E" CXXCPP="g++ -E" This is not guaranteed to produce working output in all cases; you may have to build one architecture at a time and combine the results using the 'lipo' tool if you have problems. Installation Names ================== By default, 'make install' installs the package's commands under '/usr/local/bin', include files under '/usr/local/include', etc. You can specify an installation prefix other than '/usr/local' by giving 'configure' the option '--prefix=PREFIX', where PREFIX must be an absolute file name.
You can specify separate installation prefixes for architecture-specific files and architecture-independent files. If you pass the option '--exec-prefix=PREFIX' to 'configure', the package uses PREFIX as the prefix for installing programs and libraries. Documentation and other data files still use the regular prefix. In addition, if you use an unusual directory layout you can give options like '--bindir=DIR' to specify different values for particular kinds of files. Run 'configure --help' for a list of the directories you can set and what kinds of files go in them. In general, the default for these options is expressed in terms of '${prefix}', so that specifying just '--prefix' will affect all of the other directory specifications that were not explicitly provided. The most portable way to affect installation locations is to pass the correct locations to 'configure'; however, many packages provide one or both of the following shortcuts of passing variable assignments to the 'make install' command line to change installation locations without having to reconfigure or recompile. The first method involves providing an override variable for each affected directory. For example, 'make install prefix=/alternate/directory' will choose an alternate location for all directory configuration variables that were expressed in terms of '${prefix}'. Any directories that were specified during 'configure', but not in terms of '${prefix}', must each be overridden at install time for the entire installation to be relocated. The approach of makefile variable overrides for each directory variable is required by the GNU Coding Standards, and ideally causes no recompilation. However, some platforms have known limitations with the semantics of shared libraries that end up requiring recompilation when using this method, particularly noticeable in packages that use GNU Libtool. The second method involves providing the 'DESTDIR' variable. 
For example, 'make install DESTDIR=/alternate/directory' will prepend '/alternate/directory' before all installation names. The approach of 'DESTDIR' overrides is not required by the GNU Coding Standards, and does not work on platforms that have drive letters. On the other hand, it does better at avoiding recompilation issues, and works well even when some directory options were not specified in terms of '${prefix}' at 'configure' time. Optional Features ================= If the package supports it, you can cause programs to be installed with an extra prefix or suffix on their names by giving 'configure' the option '--program-prefix=PREFIX' or '--program-suffix=SUFFIX'. Some packages pay attention to '--enable-FEATURE' options to 'configure', where FEATURE indicates an optional part of the package. They may also pay attention to '--with-PACKAGE' options, where PACKAGE is something like 'gnu-as' or 'x' (for the X Window System). The 'README' should mention any '--enable-' and '--with-' options that the package recognizes. For packages that use the X Window System, 'configure' can usually find the X include and library files automatically, but if it doesn't, you can use the 'configure' options '--x-includes=DIR' and '--x-libraries=DIR' to specify their locations. Some packages offer the ability to configure how verbose the execution of 'make' will be. For these packages, running './configure --enable-silent-rules' sets the default to minimal output, which can be overridden with 'make V=1'; while running './configure --disable-silent-rules' sets the default to verbose, which can be overridden with 'make V=0'. Particular systems ================== On HP-UX, the default C compiler is not ANSI C compatible. If GNU CC is not installed, it is recommended to use the following options in order to use an ANSI C compiler: ./configure CC="cc -Ae -D_XOPEN_SOURCE=500" and if that doesn't work, install pre-built binaries of GCC for HP-UX. 
HP-UX 'make' updates targets which have the same timestamps as their prerequisites, which makes it generally unusable when shipped generated files such as 'configure' are involved. Use GNU 'make' instead. On OSF/1 a.k.a. Tru64, some versions of the default C compiler cannot parse its '<wchar.h>' header file. The option '-nodtk' can be used as a workaround. If GNU CC is not installed, it is therefore recommended to try ./configure CC="cc" and if that doesn't work, try ./configure CC="cc -nodtk" On Solaris, don't put '/usr/ucb' early in your 'PATH'. This directory contains several dysfunctional programs; working variants of these programs are available in '/usr/bin'. So, if you need '/usr/ucb' in your 'PATH', put it _after_ '/usr/bin'. On Haiku, software installed for all users goes in '/boot/common', not '/usr/local'. It is recommended to use the following options: ./configure --prefix=/boot/common Specifying the System Type ========================== There may be some features 'configure' cannot figure out automatically, but needs to determine by the type of machine the package will run on. Usually, assuming the package is built to be run on the _same_ architectures, 'configure' can figure that out, but if it prints a message saying it cannot guess the machine type, give it the '--build=TYPE' option. TYPE can either be a short name for the system type, such as 'sun4', or a canonical name which has the form: CPU-COMPANY-SYSTEM where SYSTEM can have one of these forms: OS KERNEL-OS See the file 'config.sub' for the possible values of each field. If 'config.sub' isn't included in this package, then this package doesn't need to know the machine type. If you are _building_ compiler tools for cross-compiling, you should use the option '--target=TYPE' to select the type of system they will produce code for.
If you want to _use_ a cross compiler, that generates code for a platform different from the build platform, you should specify the "host" platform (i.e., that on which the generated programs will eventually be run) with '--host=TYPE'. Sharing Defaults ================ If you want to set default values for 'configure' scripts to share, you can create a site shell script called 'config.site' that gives default values for variables like 'CC', 'cache_file', and 'prefix'. 'configure' looks for 'PREFIX/share/config.site' if it exists, then 'PREFIX/etc/config.site' if it exists. Or, you can set the 'CONFIG_SITE' environment variable to the location of the site script. A warning: not all 'configure' scripts look for a site script. Defining Variables ================== Variables not defined in a site shell script can be set in the environment passed to 'configure'. However, some packages may run configure again during the build, and the customized values of these variables may be lost. In order to avoid this problem, you should set them in the 'configure' command line, using 'VAR=value'. For example: ./configure CC=/usr/local2/bin/gcc causes the specified 'gcc' to be used as the C compiler (unless it is overridden in the site shell script). Unfortunately, this technique does not work for 'CONFIG_SHELL' due to an Autoconf limitation. Until the limitation is lifted, you can use this workaround: CONFIG_SHELL=/bin/bash ./configure CONFIG_SHELL=/bin/bash 'configure' Invocation ====================== 'configure' recognizes the following options to control how it operates. '--help' '-h' Print a summary of all of the options to 'configure', and exit. '--help=short' '--help=recursive' Print a summary of the options unique to this package's 'configure', and exit. The 'short' variant lists options used only in the top level, while the 'recursive' variant lists options also present in any nested packages. 
'--version' '-V' Print the version of Autoconf used to generate the 'configure' script, and exit. '--cache-file=FILE' Enable the cache: use and save the results of the tests in FILE, traditionally 'config.cache'. FILE defaults to '/dev/null' to disable caching. '--config-cache' '-C' Alias for '--cache-file=config.cache'. '--quiet' '--silent' '-q' Do not print messages saying which checks are being made. To suppress all normal output, redirect it to '/dev/null' (any error messages will still be shown). '--srcdir=DIR' Look for the package's source code in directory DIR. Usually 'configure' can determine that directory automatically. '--prefix=DIR' Use DIR as the installation prefix. *note Installation Names:: for more details, including other options available for fine-tuning the installation locations. '--no-create' '-n' Run the configure checks, but stop before creating any output files. 'configure' also accepts some other, not widely useful, options. Run 'configure --help' for more details. ================================================ FILE: LICENCE.md ================================================ PCRE2 Licence ============= | SPDX-License-Identifier: | BSD-3-Clause WITH PCRE2-exception | |---------|-------| PCRE2 is a library of functions to support regular expressions whose syntax and semantics are as close as possible to those of the Perl 5 language. Releases 10.00 and above of PCRE2 are distributed under the terms of the "BSD" licence, as specified below, with one exemption for certain binary redistributions. The documentation for PCRE2, supplied in the "doc" directory, is distributed under the same terms as the software itself. The data in the testdata directory is not copyrighted and is in the public domain. The basic library functions are written in C and are freestanding. Also included in the distribution is a just-in-time compiler that can be used to optimize pattern matching. 
This is an optional feature that can be omitted when the library is built. The just-in-time compiler is separately licensed under the "2-clause BSD" licence. COPYRIGHT --------- ### The basic library functions Written by: Philip Hazel Email local part: Philip.Hazel Email domain: gmail.com Retired from University of Cambridge Computing Service, Cambridge, England. Copyright (c) 1997-2007 University of Cambridge Copyright (c) 2007-2024 Philip Hazel All rights reserved. ### PCRE2 Just-In-Time compilation support Written by: Zoltan Herczeg Email local part: hzmester Email domain: freemail.hu Copyright (c) 2010-2024 Zoltan Herczeg All rights reserved. ### Stack-less Just-In-Time compiler Written by: Zoltan Herczeg Email local part: hzmester Email domain: freemail.hu Copyright (c) 2009-2024 Zoltan Herczeg All rights reserved. The code in the `deps/sljit` directory has its own LICENSE file. ### All other contributions Many other contributors have participated in the authorship of PCRE2. As PCRE2 has never required a Contributor Licensing Agreement, or other copyright assignment agreement, all contributions have copyright retained by each original contributor or their employer. THE "BSD" LICENCE ----------------- Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notices, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notices, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of the University of Cambridge nor the names of any contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. EXEMPTION FOR BINARY LIBRARY-LIKE PACKAGES ------------------------------------------ The second condition in the BSD licence (covering binary redistributions) does not apply all the way down a chain of software. If binary package A includes PCRE2, it must respect the condition, but if package B is software that includes package A, the condition is not imposed on package B unless it uses PCRE2 independently. ================================================ FILE: MODULE.bazel ================================================ module( name = "pcre2", version = "10.48-DEV", compatibility_level = 1, ) bazel_dep(name = "rules_cc", version = "0.2.8") bazel_dep(name = "bazel_skylib", version = "1.8.1") bazel_dep(name = "platforms", version = "1.0.0") ================================================ FILE: Makefile.am ================================================ ## Process this file with automake to produce Makefile.in. AUTOMAKE_OPTIONS = subdir-objects ACLOCAL_AMFLAGS = -I m4 ## This seems to have become necessary for building in non-source directory. AM_CPPFLAGS="-I$(srcdir)/src" ## Set some variables to accumulate conditionally-set Automake targets. 
CLEAN_LOCAL_TARGETS= DISTCLEAN_LOCAL_TARGETS= PHONY_TARGETS= ## Specify the documentation files that are distributed. dist_doc_DATA = \ AUTHORS.md \ COPYING \ ChangeLog \ LICENCE.md \ NEWS \ README \ SECURITY.md \ doc/pcre2.txt \ doc/pcre2-config.txt \ doc/pcre2grep.txt \ doc/pcre2test.txt dist_html_DATA = \ doc/html/NON-AUTOTOOLS-BUILD.txt \ doc/html/README.txt \ doc/html/index.html \ doc/html/pcre2-config.html \ doc/html/pcre2.html \ doc/html/pcre2_callout_enumerate.html \ doc/html/pcre2_code_copy.html \ doc/html/pcre2_code_copy_with_tables.html \ doc/html/pcre2_code_free.html \ doc/html/pcre2_compile.html \ doc/html/pcre2_compile_context_copy.html \ doc/html/pcre2_compile_context_create.html \ doc/html/pcre2_compile_context_free.html \ doc/html/pcre2_config.html \ doc/html/pcre2_convert_context_copy.html \ doc/html/pcre2_convert_context_create.html \ doc/html/pcre2_convert_context_free.html \ doc/html/pcre2_converted_pattern_free.html \ doc/html/pcre2_dfa_match.html \ doc/html/pcre2_general_context_copy.html \ doc/html/pcre2_general_context_create.html \ doc/html/pcre2_general_context_free.html \ doc/html/pcre2_get_error_message.html \ doc/html/pcre2_get_mark.html \ doc/html/pcre2_get_match_data_heapframes_size.html \ doc/html/pcre2_get_match_data_size.html \ doc/html/pcre2_get_ovector_count.html \ doc/html/pcre2_get_ovector_pointer.html \ doc/html/pcre2_get_startchar.html \ doc/html/pcre2_jit_compile.html \ doc/html/pcre2_jit_free_unused_memory.html \ doc/html/pcre2_jit_match.html \ doc/html/pcre2_jit_stack_assign.html \ doc/html/pcre2_jit_stack_create.html \ doc/html/pcre2_jit_stack_free.html \ doc/html/pcre2_maketables.html \ doc/html/pcre2_maketables_free.html \ doc/html/pcre2_match.html \ doc/html/pcre2_match_context_copy.html \ doc/html/pcre2_match_context_create.html \ doc/html/pcre2_match_context_free.html \ doc/html/pcre2_match_data_create.html \ doc/html/pcre2_match_data_create_from_pattern.html \ doc/html/pcre2_match_data_free.html \ 
doc/html/pcre2_next_match.html \ doc/html/pcre2_pattern_convert.html \ doc/html/pcre2_pattern_info.html \ doc/html/pcre2_serialize_decode.html \ doc/html/pcre2_serialize_encode.html \ doc/html/pcre2_serialize_free.html \ doc/html/pcre2_serialize_get_number_of_codes.html \ doc/html/pcre2_set_bsr.html \ doc/html/pcre2_set_callout.html \ doc/html/pcre2_set_character_tables.html \ doc/html/pcre2_set_compile_extra_options.html \ doc/html/pcre2_set_compile_recursion_guard.html \ doc/html/pcre2_set_depth_limit.html \ doc/html/pcre2_set_glob_escape.html \ doc/html/pcre2_set_glob_separator.html \ doc/html/pcre2_set_heap_limit.html \ doc/html/pcre2_set_match_limit.html \ doc/html/pcre2_set_max_pattern_compiled_length.html \ doc/html/pcre2_set_max_pattern_length.html \ doc/html/pcre2_set_max_varlookbehind.html \ doc/html/pcre2_set_offset_limit.html \ doc/html/pcre2_set_optimize.html \ doc/html/pcre2_set_newline.html \ doc/html/pcre2_set_parens_nest_limit.html \ doc/html/pcre2_set_recursion_limit.html \ doc/html/pcre2_set_recursion_memory_management.html \ doc/html/pcre2_set_substitute_callout.html \ doc/html/pcre2_set_substitute_case_callout.html \ doc/html/pcre2_substitute.html \ doc/html/pcre2_substring_copy_byname.html \ doc/html/pcre2_substring_copy_bynumber.html \ doc/html/pcre2_substring_free.html \ doc/html/pcre2_substring_get_byname.html \ doc/html/pcre2_substring_get_bynumber.html \ doc/html/pcre2_substring_length_byname.html \ doc/html/pcre2_substring_length_bynumber.html \ doc/html/pcre2_substring_list_free.html \ doc/html/pcre2_substring_list_get.html \ doc/html/pcre2_substring_nametable_scan.html \ doc/html/pcre2_substring_number_from_name.html \ doc/html/pcre2api.html \ doc/html/pcre2build.html \ doc/html/pcre2callout.html \ doc/html/pcre2compat.html \ doc/html/pcre2convert.html \ doc/html/pcre2demo.html \ doc/html/pcre2grep.html \ doc/html/pcre2jit.html \ doc/html/pcre2limits.html \ doc/html/pcre2matching.html \ doc/html/pcre2partial.html \ 
doc/html/pcre2pattern.html \ doc/html/pcre2perform.html \ doc/html/pcre2posix.html \ doc/html/pcre2sample.html \ doc/html/pcre2serialize.html \ doc/html/pcre2syntax.html \ doc/html/pcre2test.html \ doc/html/pcre2unicode.html dist_man_MANS = \ doc/pcre2-config.1 \ doc/pcre2.3 \ doc/pcre2_callout_enumerate.3 \ doc/pcre2_code_copy.3 \ doc/pcre2_code_copy_with_tables.3 \ doc/pcre2_code_free.3 \ doc/pcre2_compile.3 \ doc/pcre2_compile_context_copy.3 \ doc/pcre2_compile_context_create.3 \ doc/pcre2_compile_context_free.3 \ doc/pcre2_config.3 \ doc/pcre2_convert_context_copy.3 \ doc/pcre2_convert_context_create.3 \ doc/pcre2_convert_context_free.3 \ doc/pcre2_converted_pattern_free.3 \ doc/pcre2_dfa_match.3 \ doc/pcre2_general_context_copy.3 \ doc/pcre2_general_context_create.3 \ doc/pcre2_general_context_free.3 \ doc/pcre2_get_error_message.3 \ doc/pcre2_get_mark.3 \ doc/pcre2_get_match_data_heapframes_size.3 \ doc/pcre2_get_match_data_size.3 \ doc/pcre2_get_ovector_count.3 \ doc/pcre2_get_ovector_pointer.3 \ doc/pcre2_get_startchar.3 \ doc/pcre2_jit_compile.3 \ doc/pcre2_jit_free_unused_memory.3 \ doc/pcre2_jit_match.3 \ doc/pcre2_jit_stack_assign.3 \ doc/pcre2_jit_stack_create.3 \ doc/pcre2_jit_stack_free.3 \ doc/pcre2_maketables.3 \ doc/pcre2_maketables_free.3 \ doc/pcre2_match.3 \ doc/pcre2_match_context_copy.3 \ doc/pcre2_match_context_create.3 \ doc/pcre2_match_context_free.3 \ doc/pcre2_match_data_create.3 \ doc/pcre2_match_data_create_from_pattern.3 \ doc/pcre2_match_data_free.3 \ doc/pcre2_next_match.3 \ doc/pcre2_pattern_convert.3 \ doc/pcre2_pattern_info.3 \ doc/pcre2_serialize_decode.3 \ doc/pcre2_serialize_encode.3 \ doc/pcre2_serialize_free.3 \ doc/pcre2_serialize_get_number_of_codes.3 \ doc/pcre2_set_bsr.3 \ doc/pcre2_set_callout.3 \ doc/pcre2_set_character_tables.3 \ doc/pcre2_set_compile_extra_options.3 \ doc/pcre2_set_compile_recursion_guard.3 \ doc/pcre2_set_depth_limit.3 \ doc/pcre2_set_glob_escape.3 \ doc/pcre2_set_glob_separator.3 \ 
doc/pcre2_set_heap_limit.3 \ doc/pcre2_set_match_limit.3 \ doc/pcre2_set_max_pattern_compiled_length.3 \ doc/pcre2_set_max_pattern_length.3 \ doc/pcre2_set_max_varlookbehind.3 \ doc/pcre2_set_offset_limit.3 \ doc/pcre2_set_optimize.3 \ doc/pcre2_set_newline.3 \ doc/pcre2_set_parens_nest_limit.3 \ doc/pcre2_set_recursion_limit.3 \ doc/pcre2_set_recursion_memory_management.3 \ doc/pcre2_set_substitute_callout.3 \ doc/pcre2_set_substitute_case_callout.3 \ doc/pcre2_substitute.3 \ doc/pcre2_substring_copy_byname.3 \ doc/pcre2_substring_copy_bynumber.3 \ doc/pcre2_substring_free.3 \ doc/pcre2_substring_get_byname.3 \ doc/pcre2_substring_get_bynumber.3 \ doc/pcre2_substring_length_byname.3 \ doc/pcre2_substring_length_bynumber.3 \ doc/pcre2_substring_list_free.3 \ doc/pcre2_substring_list_get.3 \ doc/pcre2_substring_nametable_scan.3 \ doc/pcre2_substring_number_from_name.3 \ doc/pcre2api.3 \ doc/pcre2build.3 \ doc/pcre2callout.3 \ doc/pcre2compat.3 \ doc/pcre2convert.3 \ doc/pcre2demo.3 \ doc/pcre2grep.1 \ doc/pcre2jit.3 \ doc/pcre2limits.3 \ doc/pcre2matching.3 \ doc/pcre2partial.3 \ doc/pcre2pattern.3 \ doc/pcre2perform.3 \ doc/pcre2posix.3 \ doc/pcre2sample.3 \ doc/pcre2serialize.3 \ doc/pcre2syntax.3 \ doc/pcre2test.1 \ doc/pcre2unicode.3 # The Libtool libraries to install. We'll add to this later. lib_LTLIBRARIES = # Unit tests you want to run when people type 'make check'. # TESTS is for binary unit tests, check_SCRIPTS for script-based tests TESTS = XFAIL_TESTS = check_SCRIPTS = dist_noinst_SCRIPTS = # Some of the binaries we make are to be installed, and others are # (non-user-visible) helper programs needed to build the libraries. bin_PROGRAMS = noinst_PROGRAMS = # Additional files to delete on 'make clean', 'make distclean', # and 'make maintainer-clean'. It turns out that the default is to delete only # those binaries that *this* configuration has created. If the configuration # has been changed, some binaries may not get automatically deleted. 
Therefore # we list them here. CLEANFILES = \ pcre2_dftables \ pcre2_jit_test \ pcre2fuzzcheck-8 \ pcre2fuzzcheck-16 \ pcre2fuzzcheck-32 \ pcre2demo DISTCLEANFILES = src/config.h.in~ MAINTAINERCLEANFILES = # Additional files to bundle with the distribution, over and above what # the Autotools include by default. EXTRA_DIST = # These files contain additional m4 macros that are used by autoconf. EXTRA_DIST += \ m4/ax_pthread.m4 \ m4/pcre2_check_vscript.m4 \ m4/pcre2_visibility.m4 \ m4/pcre2_zos.m4 # These files contain maintenance information EXTRA_DIST += \ NON-AUTOTOOLS-BUILD \ HACKING # These are support files for building with Bazel or Zig EXTRA_DIST += \ BUILD.bazel \ MODULE.bazel \ build.zig # These are support files for building under VMS EXTRA_DIST += \ vms/configure.com \ vms/openvms_readme.txt \ vms/pcre2.h_patch \ vms/stdint.h # These files are usable versions of pcre2.h and config.h that are distributed # for the benefit of people who are building PCRE2 manually, without the # Autotools support. EXTRA_DIST += \ src/pcre2.h.generic \ src/config.h.generic # The only difference between pcre2.h.in and pcre2.h is the setting of the PCRE # version number. Therefore, we can create the generic version just by copying. src/pcre2.h.generic: src/pcre2.h.in configure.ac rm -f $@ cp -p src/pcre2.h $@ # It is more complicated for config.h.generic. We need the version that results # from a default configuration so as to get all the default values for PCRE # configuration macros such as MATCH_LIMIT and NEWLINE. We can get this by # doing a configure in a temporary directory. However, some trickery is needed, # because the source directory may already be configured. If you just try # running configure in a new directory, it complains. For this reason, we move # config.status out of the way while doing the default configuration. 
The # resulting config.h is munged by perl to put #ifdefs round any #defines for # macros with values, and to #undef all boolean macros such as HAVE_xxx, # SUPPORT_xxx, and PCRE2_STATIC/PCRE2POSIX_SHARED. We also get rid of any # GCC-specific visibility settings. src/config.h.generic: configure.ac rm -rf $@ _generic mkdir _generic cs=$(srcdir)/config.status; test ! -f $$cs || mv -f $$cs $$cs.aside cd _generic && $(abs_top_srcdir)/configure || : cs=$(srcdir)/config.status; test ! -f $$cs.aside || mv -f $$cs.aside $$cs test -f _generic/src/config.h perl -n \ -e 'BEGIN{$$blank=0;}' \ -e 'if(/(.+?)\s*__attribute__ \(\(visibility/){print"$$1\n";$$blank=0;next;}' \ -e 'if(/LT_OBJDIR/){print"/* This is ignored unless you are using libtool. */\n";}' \ -e 'if(/^#define\s((?:HAVE|SUPPORT|STDC)_\w+|PCRE2_STATIC|PCRE2POSIX_SHARED)/){print"/* #undef $$1 */\n";$$blank=0;next;}' \ -e 'if(/^#define\s(?!PACKAGE|VERSION)(\w+)/){print"#ifndef $$1\n$$_#endif\n";$$blank=0;next;}' \ -e 'if(/^\s*$$/){print unless $$blank; $$blank=1;} else{print;$$blank=0;}' \ _generic/src/config.h >$@ rm -rf _generic MAINTAINERCLEANFILES += src/pcre2.h.generic src/config.h.generic # These are the header files we'll install. We do not distribute pcre2.h # because it is generated from pcre2.h.in. nodist_include_HEADERS = src/pcre2.h include_HEADERS = src/pcre2posix.h # This is the "config" script. bin_SCRIPTS = pcre2-config ## --------------------------------------------------------------- ## The pcre2_dftables program is used to rebuild character tables before ## compiling PCRE2, if --enable-rebuild-chartables is specified. It is not an ## installed program. The default (when --enable-rebuild-chartables is not ## specified) is to copy a distributed set of tables that are defined for ASCII ## code. In this case, pcre2_dftables is not needed. 
# Select how src/pcre2_chartables.c is produced. The conditionals below are
# mutually exclusive, so exactly one rule for that target is active in any
# given configuration.
if WITH_REBUILD_CHARTABLES

# Build the (uninstalled) generator and run it; the name of the file to write
# is passed as its only argument.
noinst_PROGRAMS += pcre2_dftables
pcre2_dftables_SOURCES = src/pcre2_dftables.c

src/pcre2_chartables.c: pcre2_dftables$(EXEEXT)
	rm -f $@
	./pcre2_dftables$(EXEEXT) $@
else
if WITH_EBCDIC
if WITH_EBCDIC_NL25

# EBCDIC 1047 tables, nl25 variant. Any stale target is removed first, then
# the distributed file is linked into the build tree with $(LN_S).
src/pcre2_chartables.c: $(srcdir)/src/pcre2_chartables.c.ebcdic-1047-nl25
	rm -f $@
	$(LN_S) $(abs_srcdir)/src/pcre2_chartables.c.ebcdic-1047-nl25 $(abs_builddir)/src/pcre2_chartables.c
else # WITH_EBCDIC_NL25

# EBCDIC 1047 tables, nl15 variant.
src/pcre2_chartables.c: $(srcdir)/src/pcre2_chartables.c.ebcdic-1047-nl15
	rm -f $@
	$(LN_S) $(abs_srcdir)/src/pcre2_chartables.c.ebcdic-1047-nl15 $(abs_builddir)/src/pcre2_chartables.c
endif # WITH_EBCDIC_NL25
else # WITH_EBCDIC

# Default case: use the distributed tables file.
src/pcre2_chartables.c: $(srcdir)/src/pcre2_chartables.c.dist
	rm -f $@
	$(LN_S) $(abs_srcdir)/src/pcre2_chartables.c.dist $(abs_builddir)/src/pcre2_chartables.c
endif # WITH_EBCDIC
endif # WITH_REBUILD_CHARTABLES

# The tables file is generated (or linked) at build time, so it is listed in
# BUILT_SOURCES. NODIST_SOURCES is a local variable holding the same file; it
# is referenced by the nodist_*_SOURCES variable of each library.
BUILT_SOURCES = src/pcre2_chartables.c
NODIST_SOURCES = src/pcre2_chartables.c

## Define the list of common sources, then arrange to build whichever of the
## 8-, 16-, or 32-bit libraries are configured.
# Source and header files shared by every code-unit width of the library.
COMMON_SOURCES = \
  src/pcre2_auto_possess.c \
  src/pcre2_chkdint.c \
  src/pcre2_compile.c \
  src/pcre2_compile.h \
  src/pcre2_compile_cgroup.c \
  src/pcre2_compile_class.c \
  src/pcre2_config.c \
  src/pcre2_context.c \
  src/pcre2_convert.c \
  src/pcre2_dfa_match.c \
  src/pcre2_error.c \
  src/pcre2_extuni.c \
  src/pcre2_find_bracket.c \
  src/pcre2_internal.h \
  src/pcre2_intmodedep.h \
  src/pcre2_jit_char_inc.h \
  src/pcre2_jit_compile.c \
  src/pcre2_jit_match_inc.h \
  src/pcre2_jit_misc_inc.h \
  src/pcre2_jit_simd_inc.h \
  src/pcre2_maketables.c \
  src/pcre2_match.c \
  src/pcre2_match_data.c \
  src/pcre2_match_next.c \
  src/pcre2_newline.c \
  src/pcre2_ord2utf.c \
  src/pcre2_pattern_info.c \
  src/pcre2_printint_inc.h \
  src/pcre2_script_run.c \
  src/pcre2_serialize.c \
  src/pcre2_string_utils.c \
  src/pcre2_study.c \
  src/pcre2_substitute.c \
  src/pcre2_substring.c \
  src/pcre2_tables.c \
  src/pcre2_ucd.c \
  src/pcre2_ucp.h \
  src/pcre2_ucptables_inc.h \
  src/pcre2_util.h \
  src/pcre2_valid_utf.c \
  src/pcre2_xclass.c

# Each enabled width compiles the same COMMON_SOURCES plus the non-distributed
# NODIST_SOURCES, differing only in the -DPCRE2_CODE_UNIT_WIDTH value.
if WITH_PCRE2_8
lib_LTLIBRARIES += libpcre2-8.la
libpcre2_8_la_SOURCES = \
  $(COMMON_SOURCES)
nodist_libpcre2_8_la_SOURCES = \
  $(NODIST_SOURCES)
libpcre2_8_la_CFLAGS = \
  -DPCRE2_CODE_UNIT_WIDTH=8 \
  $(VISIBILITY_CFLAGS) \
  $(CET_CFLAGS) \
  $(AM_CFLAGS)
libpcre2_8_la_LIBADD =
endif # WITH_PCRE2_8

if WITH_PCRE2_16
lib_LTLIBRARIES += libpcre2-16.la
libpcre2_16_la_SOURCES = \
  $(COMMON_SOURCES)
nodist_libpcre2_16_la_SOURCES = \
  $(NODIST_SOURCES)
libpcre2_16_la_CFLAGS = \
  -DPCRE2_CODE_UNIT_WIDTH=16 \
  $(VISIBILITY_CFLAGS) \
  $(CET_CFLAGS) \
  $(AM_CFLAGS)
libpcre2_16_la_LIBADD =
endif # WITH_PCRE2_16

if WITH_PCRE2_32
lib_LTLIBRARIES += libpcre2-32.la
libpcre2_32_la_SOURCES = \
  $(COMMON_SOURCES)
nodist_libpcre2_32_la_SOURCES = \
  $(NODIST_SOURCES)
libpcre2_32_la_CFLAGS = \
  -DPCRE2_CODE_UNIT_WIDTH=32 \
  $(VISIBILITY_CFLAGS) \
  $(CET_CFLAGS) \
  $(AM_CFLAGS)
libpcre2_32_la_LIBADD =
endif # WITH_PCRE2_32

# The pcre2_chartables.c.dist file is the default version of
# pcre2_chartables.c, used unless --enable-rebuild-chartables is specified.

EXTRA_DIST += \
  src/pcre2_chartables.c.dist \
  src/pcre2_chartables.c.ebcdic-1047-nl15 \
  src/pcre2_chartables.c.ebcdic-1047-nl25

CLEANFILES += src/pcre2_chartables.c

# The JIT compiler lives in a separate directory, but its files are #included
# when pcre2_jit_compile.c is processed, so they must be distributed.

EXTRA_DIST += \
  deps/sljit/LICENSE \
  deps/sljit/README.md \
  deps/sljit/sljit_src/sljitConfig.h \
  deps/sljit/sljit_src/sljitConfigCPU.h \
  deps/sljit/sljit_src/sljitConfigInternal.h \
  deps/sljit/sljit_src/sljitLir.c \
  deps/sljit/sljit_src/sljitLir.h \
  deps/sljit/sljit_src/sljitNativeARM_32.c \
  deps/sljit/sljit_src/sljitNativeARM_64.c \
  deps/sljit/sljit_src/sljitNativeARM_T2_32.c \
  deps/sljit/sljit_src/sljitNativeLOONGARCH_64.c \
  deps/sljit/sljit_src/sljitNativeMIPS_32.c \
  deps/sljit/sljit_src/sljitNativeMIPS_64.c \
  deps/sljit/sljit_src/sljitNativeMIPS_common.c \
  deps/sljit/sljit_src/sljitNativePPC_32.c \
  deps/sljit/sljit_src/sljitNativePPC_64.c \
  deps/sljit/sljit_src/sljitNativePPC_common.c \
  deps/sljit/sljit_src/sljitNativeRISCV_32.c \
  deps/sljit/sljit_src/sljitNativeRISCV_64.c \
  deps/sljit/sljit_src/sljitNativeRISCV_common.c \
  deps/sljit/sljit_src/sljitNativeS390X.c \
  deps/sljit/sljit_src/sljitNativeX86_32.c \
  deps/sljit/sljit_src/sljitNativeX86_64.c \
  deps/sljit/sljit_src/sljitNativeX86_common.c \
  deps/sljit/sljit_src/sljitSerialize.c \
  deps/sljit/sljit_src/sljitUtils.c \
  deps/sljit/sljit_src/allocator_src/sljitExecAllocatorApple.c \
  deps/sljit/sljit_src/allocator_src/sljitExecAllocatorCore.c \
  deps/sljit/sljit_src/allocator_src/sljitExecAllocatorFreeBSD.c \
  deps/sljit/sljit_src/allocator_src/sljitExecAllocatorPosix.c \
  deps/sljit/sljit_src/allocator_src/sljitExecAllocatorWindows.c \
  deps/sljit/sljit_src/allocator_src/sljitProtExecAllocatorNetBSD.c \
  deps/sljit/sljit_src/allocator_src/sljitProtExecAllocatorPosix.c \
  deps/sljit/sljit_src/allocator_src/sljitWXExecAllocatorPosix.c \
  deps/sljit/sljit_src/allocator_src/sljitWXExecAllocatorWindows.c

# Per-library linker flags; the EXTRA_LIBPCRE2_*_LDFLAGS values are defined
# outside this part of the file.
if WITH_PCRE2_8
libpcre2_8_la_LDFLAGS = $(EXTRA_LIBPCRE2_8_LDFLAGS)
endif # WITH_PCRE2_8
if WITH_PCRE2_16
libpcre2_16_la_LDFLAGS = $(EXTRA_LIBPCRE2_16_LDFLAGS)
endif # WITH_PCRE2_16
if WITH_PCRE2_32
libpcre2_32_la_LDFLAGS = $(EXTRA_LIBPCRE2_32_LDFLAGS)
endif # WITH_PCRE2_32

# Optional instrumentation flags are appended to each enabled library.
if WITH_VALGRIND
if WITH_PCRE2_8
libpcre2_8_la_CFLAGS += $(VALGRIND_CFLAGS)
endif # WITH_PCRE2_8
if WITH_PCRE2_16
libpcre2_16_la_CFLAGS += $(VALGRIND_CFLAGS)
endif # WITH_PCRE2_16
if WITH_PCRE2_32
libpcre2_32_la_CFLAGS += $(VALGRIND_CFLAGS)
endif # WITH_PCRE2_32
endif # WITH_VALGRIND

if WITH_GCOV
if WITH_PCRE2_8
libpcre2_8_la_CFLAGS += $(GCOV_CFLAGS)
endif # WITH_PCRE2_8
if WITH_PCRE2_16
libpcre2_16_la_CFLAGS += $(GCOV_CFLAGS)
endif # WITH_PCRE2_16
if WITH_PCRE2_32
libpcre2_32_la_CFLAGS += $(GCOV_CFLAGS)
endif # WITH_PCRE2_32
endif # WITH_GCOV

## A version of the 8-bit library that has a POSIX API.

if WITH_PCRE2_8
lib_LTLIBRARIES += libpcre2-posix.la
libpcre2_posix_la_SOURCES = src/pcre2posix.c
libpcre2_posix_la_CFLAGS = \
  -DPCRE2_CODE_UNIT_WIDTH=8 @PCRE2POSIX_CFLAG@ \
  $(VISIBILITY_CFLAGS) $(AM_CFLAGS)
libpcre2_posix_la_LDFLAGS = $(EXTRA_LIBPCRE2_POSIX_LDFLAGS)
# The wrapper is linked against the 8-bit library.
libpcre2_posix_la_LIBADD = libpcre2-8.la
if WITH_GCOV
libpcre2_posix_la_CFLAGS += $(GCOV_CFLAGS)
endif # WITH_GCOV
endif # WITH_PCRE2_8

## Build pcre2grep and optional fuzzer stuff if the 8-bit library is enabled

if WITH_PCRE2_8
bin_PROGRAMS += pcre2grep
pcre2grep_SOURCES = src/pcre2grep.c
pcre2grep_CFLAGS = $(AM_CFLAGS)
# $(LIBZ) and $(LIBBZ2) are defined outside this part of the file.
pcre2grep_LDADD = $(LIBZ) $(LIBBZ2)
pcre2grep_LDADD += libpcre2-8.la
if WITH_GCOV
pcre2grep_CFLAGS += $(GCOV_CFLAGS)
pcre2grep_LDADD += $(GCOV_LIBS)
endif # WITH_GCOV
endif # WITH_PCRE2_8

## If fuzzer support is enabled, build a non-distributed library containing the
## fuzzing function. Also build the standalone checking binary from the same
## source but using -DSTANDALONE.
# For every enabled code-unit width, src/pcre2_fuzzsupport.c is compiled twice:
# once into a static library placed under .libs/, and once with -DSTANDALONE
# into a pcre2fuzzcheck-<width> program linked against the matching library.
if WITH_FUZZ_SUPPORT
noinst_LIBRARIES =
if WITH_PCRE2_8
# NOTE(review): unlike the 16- and 32-bit variants below, no
# -DPCRE2_CODE_UNIT_WIDTH is passed for the 8-bit build; presumably the source
# defaults to 8 -- confirm against src/pcre2_fuzzsupport.c.
noinst_LIBRARIES += .libs/libpcre2-fuzzsupport.a
_libs_libpcre2_fuzzsupport_a_SOURCES = src/pcre2_fuzzsupport.c
_libs_libpcre2_fuzzsupport_a_CFLAGS = $(AM_CFLAGS)
_libs_libpcre2_fuzzsupport_a_LIBADD =
noinst_PROGRAMS += pcre2fuzzcheck-8
pcre2fuzzcheck_8_SOURCES = src/pcre2_fuzzsupport.c
pcre2fuzzcheck_8_CFLAGS = -DSTANDALONE $(AM_CFLAGS)
pcre2fuzzcheck_8_LDADD = libpcre2-8.la
if WITH_GCOV
pcre2fuzzcheck_8_CFLAGS += $(GCOV_CFLAGS)
pcre2fuzzcheck_8_LDADD += $(GCOV_LIBS)
endif # WITH_GCOV
endif # WITH_PCRE2_8

if WITH_PCRE2_16
noinst_LIBRARIES += .libs/libpcre2-fuzzsupport-16.a
_libs_libpcre2_fuzzsupport_16_a_SOURCES = src/pcre2_fuzzsupport.c
_libs_libpcre2_fuzzsupport_16_a_CFLAGS = $(AM_CFLAGS) -DPCRE2_CODE_UNIT_WIDTH=16
_libs_libpcre2_fuzzsupport_16_a_LIBADD =
noinst_PROGRAMS += pcre2fuzzcheck-16
pcre2fuzzcheck_16_SOURCES = src/pcre2_fuzzsupport.c
pcre2fuzzcheck_16_CFLAGS = -DSTANDALONE $(AM_CFLAGS) -DPCRE2_CODE_UNIT_WIDTH=16
pcre2fuzzcheck_16_LDADD = libpcre2-16.la
if WITH_GCOV
pcre2fuzzcheck_16_CFLAGS += $(GCOV_CFLAGS)
pcre2fuzzcheck_16_LDADD += $(GCOV_LIBS)
endif # WITH_GCOV
endif # WITH_PCRE2_16

if WITH_PCRE2_32
noinst_LIBRARIES += .libs/libpcre2-fuzzsupport-32.a
_libs_libpcre2_fuzzsupport_32_a_SOURCES = src/pcre2_fuzzsupport.c
_libs_libpcre2_fuzzsupport_32_a_CFLAGS = $(AM_CFLAGS) -DPCRE2_CODE_UNIT_WIDTH=32
_libs_libpcre2_fuzzsupport_32_a_LIBADD =
noinst_PROGRAMS += pcre2fuzzcheck-32
pcre2fuzzcheck_32_SOURCES = src/pcre2_fuzzsupport.c
pcre2fuzzcheck_32_CFLAGS = -DSTANDALONE $(AM_CFLAGS) -DPCRE2_CODE_UNIT_WIDTH=32
pcre2fuzzcheck_32_LDADD = libpcre2-32.la
if WITH_GCOV
pcre2fuzzcheck_32_CFLAGS += $(GCOV_CFLAGS)
pcre2fuzzcheck_32_LDADD += $(GCOV_LIBS)
endif # WITH_GCOV
endif # WITH_PCRE2_32
endif # WITH_FUZZ_SUPPORT

## -------- Testing ----------

## If the 8-bit library is enabled, build the POSIX wrapper test program and
## arrange for it to run.
# pcre2posix_test is both built (uninstalled) and registered in TESTS, so it
# runs under 'make check'. It links the POSIX wrapper and the 8-bit library.
if WITH_PCRE2_8
TESTS += pcre2posix_test
noinst_PROGRAMS += pcre2posix_test
pcre2posix_test_SOURCES = src/pcre2posix_test.c
pcre2posix_test_CFLAGS = $(AM_CFLAGS) @PCRE2POSIX_CFLAG@
pcre2posix_test_LDADD = libpcre2-posix.la libpcre2-8.la
endif # WITH_PCRE2_8

## If JIT support is enabled, arrange for the JIT test program to run.

if WITH_JIT
TESTS += pcre2_jit_test
noinst_PROGRAMS += pcre2_jit_test
pcre2_jit_test_SOURCES = src/pcre2_jit_test.c
pcre2_jit_test_CFLAGS = $(AM_CFLAGS)
# Start empty, then add one library per enabled code-unit width.
pcre2_jit_test_LDADD =
if WITH_PCRE2_8
pcre2_jit_test_LDADD += libpcre2-8.la
endif # WITH_PCRE2_8
if WITH_PCRE2_16
pcre2_jit_test_LDADD += libpcre2-16.la
endif # WITH_PCRE2_16
if WITH_PCRE2_32
pcre2_jit_test_LDADD += libpcre2-32.la
endif # WITH_PCRE2_32
if WITH_GCOV
pcre2_jit_test_CFLAGS += $(GCOV_CFLAGS)
pcre2_jit_test_LDADD += $(GCOV_LIBS)
endif # WITH_GCOV
endif # WITH_JIT

# Build the general pcre2test program.

# pcre2test is an installed program (bin_PROGRAMS). It links every enabled
# library width, plus the POSIX wrapper when the 8-bit library is enabled.
bin_PROGRAMS += pcre2test
pcre2test_SOURCES = src/pcre2test.c src/pcre2test_inc.h
pcre2test_CFLAGS = $(AM_CFLAGS)
pcre2test_LDADD = $(LIBREADLINE)
if WITH_PCRE2_8
pcre2test_LDADD += libpcre2-8.la libpcre2-posix.la
endif # WITH_PCRE2_8
if WITH_PCRE2_16
pcre2test_LDADD += libpcre2-16.la
endif # WITH_PCRE2_16
if WITH_PCRE2_32
pcre2test_LDADD += libpcre2-32.la
endif # WITH_PCRE2_32
if WITH_VALGRIND
pcre2test_CFLAGS += $(VALGRIND_CFLAGS)
endif # WITH_VALGRIND
if WITH_GCOV
pcre2test_CFLAGS += $(GCOV_CFLAGS)
pcre2test_LDADD += $(GCOV_LIBS)
endif # WITH_GCOV

## The main library tests. Each test is a binary plus a script that runs that
## binary in various ways. We install these test binaries in case folks find it
## helpful. The two .bat files are for running the tests under Windows.

# RunTest is run by 'make check' and distributed but not installed; its
# Windows counterpart RunTest.bat is only distributed.
TESTS += RunTest
EXTRA_DIST += RunTest.bat
dist_noinst_SCRIPTS += RunTest

## When the 8-bit library is configured, pcre2grep will have been built.
# RunGrepTest is only registered when the 8-bit library is enabled. Under an
# EBCDIC configuration it is listed in XFAIL_TESTS, i.e. expected to fail.
if WITH_PCRE2_8
TESTS += RunGrepTest
EXTRA_DIST += RunGrepTest.bat
dist_noinst_SCRIPTS += RunGrepTest
if WITH_EBCDIC
XFAIL_TESTS += RunGrepTest
endif # WITH_EBCDIC
endif # WITH_PCRE2_8

## Distribute all the test data files

EXTRA_DIST += \
  testdata/grepbinary \
  testdata/grepfilelist \
  testdata/grepinput \
  testdata/grepinput3 \
  testdata/grepinput8 \
  testdata/grepinputBad8 \
  testdata/grepinputBad8_Trail \
  testdata/grepinputC.bz2 \
  testdata/grepinputC.gz \
  testdata/grepinputM \
  testdata/grepinputUN \
  testdata/grepinputv \
  testdata/grepinputx \
  testdata/greplist \
  testdata/grepnot.bz2 \
  testdata/grepoutput \
  testdata/grepoutput8 \
  testdata/grepoutputC \
  testdata/grepoutputCN \
  testdata/grepoutputCNU \
  testdata/grepoutputCU \
  testdata/grepoutputCbz2 \
  testdata/grepoutputCgz \
  testdata/grepoutputN \
  testdata/grepoutputUN \
  testdata/greppatN4 \
  testdata/testbtables \
  testdata/testinput1 \
  testdata/testinput2 \
  testdata/testinput3 \
  testdata/testinput4 \
  testdata/testinput5 \
  testdata/testinput6 \
  testdata/testinput7 \
  testdata/testinput8 \
  testdata/testinput9 \
  testdata/testinput10 \
  testdata/testinput11 \
  testdata/testinput12 \
  testdata/testinput13 \
  testdata/testinput14 \
  testdata/testinput15 \
  testdata/testinput16 \
  testdata/testinput17 \
  testdata/testinput18 \
  testdata/testinput19 \
  testdata/testinput20 \
  testdata/testinput21 \
  testdata/testinput22 \
  testdata/testinput23 \
  testdata/testinput24 \
  testdata/testinput25 \
  testdata/testinput26 \
  testdata/testinput27 \
  testdata/testinput28 \
  testdata/testinput29 \
  testdata/testinputheap \
  testdata/testoutput1 \
  testdata/testoutput2 \
  testdata/testoutput3 \
  testdata/testoutput3A \
  testdata/testoutput3B \
  testdata/testoutput4 \
  testdata/testoutput5 \
  testdata/testoutput6 \
  testdata/testoutput7 \
  testdata/testoutput8-16-2 \
  testdata/testoutput8-16-4 \
  testdata/testoutput8-32-4 \
  testdata/testoutput8-8-2 \
  testdata/testoutput8-8-3 \
  testdata/testoutput8-8-4 \
  testdata/testoutput9 \
  testdata/testoutput10 \
  testdata/testoutput11-16 \
  testdata/testoutput11-32 \
  testdata/testoutput12-16 \
  testdata/testoutput12-32 \
  testdata/testoutput13 \
  testdata/testoutput14-16 \
  testdata/testoutput14-32 \
  testdata/testoutput14-8 \
  testdata/testoutput15 \
  testdata/testoutput16 \
  testdata/testoutput17 \
  testdata/testoutput18 \
  testdata/testoutput19 \
  testdata/testoutput20 \
  testdata/testoutput21 \
  testdata/testoutput22-16 \
  testdata/testoutput22-32 \
  testdata/testoutput22-8 \
  testdata/testoutput23 \
  testdata/testoutput24 \
  testdata/testoutput25 \
  testdata/testoutput26 \
  testdata/testoutput27 \
  testdata/testoutput28 \
  testdata/testoutput29 \
  testdata/testoutputheap-16 \
  testdata/testoutputheap-32 \
  testdata/testoutputheap-8 \
  testdata/valgrind-jit.supp \
  testdata/wintestinput3 \
  testdata/wintestoutput3 \
  perltest.sh

# RunTest and RunGrepTest should clean up after themselves, but just in case
# they don't, add their working files to CLEANFILES.

CLEANFILES += \
  testSinput \
  testSoutput \
  test3input \
  test3output \
  test3outputA \
  test3outputB \
  testtry \
  teststdout \
  teststderr \
  teststderrgrep \
  testtemp1grep \
  testtemp2grep \
  testtrygrep \
  testNinputgrep

# Remove the per-width output directories. Each rm is prefixed with '-' so a
# failure does not abort 'make clean'. The target is hooked into clean-local
# through CLEAN_LOCAL_TARGETS and declared phony through PHONY_TARGETS.
testoutput-clean:
	-rm -rf testoutput8 testoutput8-jit testoutput8-dfa
	-rm -rf testoutput16 testoutput16-jit testoutput16-dfa
	-rm -rf testoutput32 testoutput32-jit testoutput32-dfa

CLEAN_LOCAL_TARGETS += testoutput-clean
PHONY_TARGETS += testoutput-clean

## ------------ End of testing -------------


# PCRE2 demonstration program. Not built automatically. The point is that the
# users should build it themselves. So just distribute the source.

EXTRA_DIST += src/pcre2demo.c


# We have .pc files for pkg-config users.

# One .pc file is installed per enabled library width; the POSIX wrapper's
# .pc file accompanies the 8-bit one.
pkgconfigdir = $(libdir)/pkgconfig
pkgconfig_DATA =
if WITH_PCRE2_8
pkgconfig_DATA += libpcre2-8.pc libpcre2-posix.pc
endif
if WITH_PCRE2_16
pkgconfig_DATA += libpcre2-16.pc
endif
if WITH_PCRE2_32
pkgconfig_DATA += libpcre2-32.pc
endif


# Symbol version template files for use with GNU and Sun ld.
EXTRA_DIST += \
  src/libpcre2-8.sym.in src/libpcre2-16.sym.in src/libpcre2-32.sym.in \
  src/libpcre2-posix.sym.in


# gcov/lcov code coverage reporting
#
# Coverage reporting targets:
#
# coverage: Create a coverage report from 'make check'
# coverage-baseline: Capture baseline coverage information
# coverage-reset: This zeros the coverage counters only
# coverage-report: This creates the coverage report only
# coverage-clean-report: This removes the generated coverage report
#   without cleaning the coverage data itself
# coverage-clean-data: This removes the captured coverage data without
#   removing the coverage files created at compile time (*.gcno)
# coverage-clean: This cleans all coverage data including the generated
#   coverage report.

if WITH_GCOV
COVERAGE_TEST_NAME = $(PACKAGE)
COVERAGE_NAME = $(PACKAGE)-$(VERSION)
COVERAGE_OUTPUT_FILE = $(COVERAGE_NAME)-coverage.info
COVERAGE_OUTPUT_DIR = $(COVERAGE_NAME)-coverage
COVERAGE_LCOV_EXTRA_FLAGS =
COVERAGE_GENHTML_EXTRA_FLAGS =

# Pass --quiet to lcov/genhtml when V=0, or when V is unset and
# AM_DEFAULT_VERBOSITY is 0; otherwise the variable expands to nothing.
coverage_quiet = $(coverage_quiet_$(V))
coverage_quiet_ = $(coverage_quiet_$(AM_DEFAULT_VERBOSITY))
coverage_quiet_0 = --quiet

# Run the test suite, continuing past failures (-k) and ignoring the overall
# exit status so that a report is still produced.
coverage-check: all
	-$(MAKE) $(AM_MAKEFLAGS) -k check

coverage-baseline:
	$(LCOV) $(coverage_quiet) \
	  --directory $(top_builddir) \
	  --output-file "$(COVERAGE_OUTPUT_FILE)" \
	  --capture \
	  --initial

# Capture into a temporary file, strip system and installed headers from it,
# then render the HTML report.
coverage-report:
	$(LCOV) $(coverage_quiet) \
	  --directory $(top_builddir) \
	  --capture \
	  --output-file "$(COVERAGE_OUTPUT_FILE).tmp" \
	  --test-name "$(COVERAGE_TEST_NAME)" \
	  --no-checksum \
	  --compat-libtool \
	  $(COVERAGE_LCOV_EXTRA_FLAGS)
	$(LCOV) $(coverage_quiet) \
	  --directory $(top_builddir) \
	  --output-file "$(COVERAGE_OUTPUT_FILE)" \
	  --remove "$(COVERAGE_OUTPUT_FILE).tmp" \
	  "/tmp/*" \
	  "/usr/include/*" \
	  "$(includedir)/*"
	-@rm -f "$(COVERAGE_OUTPUT_FILE).tmp"
	LANG=C $(GENHTML) $(coverage_quiet) \
	  --prefix $(top_builddir) \
	  --output-directory "$(COVERAGE_OUTPUT_DIR)" \
	  --title "$(PACKAGE) $(VERSION) Code Coverage Report" \
	  --show-details "$(COVERAGE_OUTPUT_FILE)" \
	  --legend \
	  $(COVERAGE_GENHTML_EXTRA_FLAGS)
	@echo "Code coverage report written to file://$(abs_builddir)/$(COVERAGE_OUTPUT_DIR)/index.html"

coverage-reset:
	-$(LCOV) $(coverage_quiet) --zerocounters --directory $(top_builddir)

coverage-clean-report:
	-rm -f "$(COVERAGE_OUTPUT_FILE)" "$(COVERAGE_OUTPUT_FILE).tmp"
	-rm -rf "$(COVERAGE_OUTPUT_DIR)"

coverage-clean-data:
	-find $(top_builddir) -name "*.gcda" -delete

coverage-clean: coverage-reset coverage-clean-report coverage-clean-data
	-find $(top_builddir) -name "*.gcno" -delete

coverage-distclean: coverage-clean

coverage: coverage-reset coverage-baseline coverage-check coverage-report

CLEAN_LOCAL_TARGETS += coverage-clean
DISTCLEAN_LOCAL_TARGETS += coverage-distclean
PHONY_TARGETS += coverage coverage-baseline coverage-check coverage-report coverage-reset coverage-clean-report coverage-clean-data coverage-clean coverage-distclean

else

coverage:
	@echo "Configuring with --enable-coverage is required to generate code coverage report."

# Without coverage support, still arrange for 'make distclean' to get rid of
# any coverage files that may have been left from a different configuration.

DISTCLEANFILES += src/*.gcda src/*.gcno

coverage-distclean:
	rm -rf $(PACKAGE)-$(VERSION)-coverage*

DISTCLEAN_LOCAL_TARGETS += coverage-distclean
# 'coverage' is declared phony here as well, matching the WITH_GCOV branch, so
# that a stray file or directory named 'coverage' cannot suppress the hint.
PHONY_TARGETS += coverage coverage-distclean

endif # WITH_GCOV

## CMake support

EXTRA_DIST += \
  cmake/COPYING-CMAKE-SCRIPTS \
  cmake/FindEditline.cmake \
  cmake/FindReadline.cmake \
  cmake/pcre2-config.cmake.in \
  cmake/PCRE2CheckVscript.cmake \
  cmake/PCRE2UseSystemExtensions.cmake \
  cmake/PCRE2WarningAsError.cmake \
  src/config-cmake.h.in \
  CMakeLists.txt

## Set the special make and Automake targets. We could very easily simply
## redefine these targets to append prerequisites to them, rather than
## collecting the prerequisites in variables. However, there is an annoying
## Automake behaviour where it emits a warning if a target has prerequisites
## appended in a conditional block.
clean-local: $(CLEAN_LOCAL_TARGETS) distclean-local: $(DISTCLEAN_LOCAL_TARGETS) .PHONY: $(PHONY_TARGETS) ## end Makefile.am ================================================ FILE: NEWS ================================================ News about PCRE2 releases ------------------------- Version 10.48 xx-xxx-2026 ------------------------- In development. A list of potential items for inclusion is here: https://github.com/PCRE2Project/pcre2/milestone/5 If you are reading this, contributions are welcome! Version 10.47 21-October-2025 ----------------------------- This is a regular semi-annual release, incorporating a few new features and several maintenance and build improvements. Only changes to behaviour, changes to the API, and other significant changes are described here. Please see the ChangeLog and Git log for further details. * (Powerful new feature) Pattern recursion of the form "(?1(GROUP_NAME_OR_NUM,...))" acts as a subroutine call which additionally returns the listed capturing groups to the calling context. * (Significant bugfix) Fixed a crash in pcre2_callout_enumerate() which is easily reachable on any pattern that contains a Unicode character class. If your application uses this function, please read the details for this change and evaluate its severity for your application. * (Build change) There are now linker scripts to enable symbol versioning for the PCRE2 dynamic libraries. Downstream Linux distributions may make use of this, or disable it with the new Autoconf `--disable-symvers` and CMake `-DPCRE2_SYMVERS` options. Linux, Solaris, and FreeBSD (GNU ld, LLVM lld, and Solaris ld) are tested and supported. * (New API function) Added pcre2_next_match(). This function makes it both simpler and safer for clients to iterate over all matches in a subject. The documentation in `pcre2api` also provides improved guidance in the section "Iterating over all matches". * (Minor API addition) Added the PCRE2_CONFIG_EFFECTIVE_LINKSIZE option to pcre2_config(). 
* (Minor replacement syntax extension) Added support for $+ replacement to pcre2_substitute(). * (Build change) Modernize the CMake build files, to use the "$<BUILD_INTERFACE:...>", "$<INSTALL_INTERFACE:...>" and "install(EXPORT...)" expressions to export the PCRE2 targets. Version 10.46 27-August-2025 ---------------------------- This is a security-only release, to address CVE-2025-58050. Compared to 10.45, this release has only a minimal code change to prevent a read-past-the-end memory error, of arbitrary length. An attacker-controlled regex pattern is required, and it cannot be triggered by providing crafted subject (match) text. The (*ACCEPT) and (*scs:) pattern features must be used together. Releases 10.44 and earlier are not affected. This could have implications of denial-of-service or information disclosure, and could potentially be used to escalate other vulnerabilities in a system (such as information disclosure being used to escalate the severity of an unrelated bug in another system). Version 10.45 05-February-2025 ------------------------------ This is a comparatively large release, incorporating new features, some bugfixes, and a few changes with slight backwards compatibility implications. Please see the ChangeLog and Git log for further details. Only changes to behaviour, changes to the API, and major changes to the pattern syntax are described here. This release is the first to be available as a (signed) Git tag, or alternatively as a (signed) tarball of the Git tag. This is also the first release to be made by the new maintainers of PCRE2, and we would like to thank Philip Hazel, creator and maintainer of PCRE and PCRE2. * (Git change) The sljit project has been split out into a separate Git repository. Git users must now run `git submodule init; git submodule update` after a Git checkout. * (Behaviour change) Update Unicode support to UCD 16. * (Match behaviour change) Case-insensitive matching of Unicode properties Ll, Lt, and Lu has been changed to match Perl.
Previously, /\p{Ll}/i would match only lower-case characters (even though case-insensitive matching was specified). This also affects case-insensitive matching of POSIX classes such as [:lower:]. * (Minor match behaviour change) Case-insensitive matching of backreferences now respects the PCRE2_EXTRA_CASELESS_RESTRICT option. * (Minor pattern syntax change) Parsing of the \x escape is stricter, and is no longer parsed as an escape for the NUL character if not followed by '{' or a hexadecimal digit. Use \x00 instead. * (Major new feature) Add a new feature called scan substring. This is a new type of assertion which matches the content of a capturing block to a sub-pattern. Example: to find a word that contains the rare (in English) sequence of letters "rh" not at the start: \b(\w++)(*scan_substring:(1).+rh) The first group captures a word which is then scanned by the (*scan_substring:(1) ... ) assertion, which tests whether the pattern ".+rh" matches the capture group "(1)". * (Major new feature) Add support for UTS#18 compatible character classes, using the new option PCRE2_ALT_EXTENDED_CLASS. This adds '[' as a metacharacter within character classes and the operators '&&', '--' and '~~', allowing subtractions and intersections of character classes to be easily expressed. Example: to match Thai or Greek letters (but not digits or other non-letter characters in those scripts), use [\p{L}&&[\p{Thai}||\p{Greek}]]. * (Major new feature) Add support for Perl-style extended character classes, using the syntax (?[...]). This also allows expressing subtractions and intersections of character classes, but using a different syntax to UTS#18. Example: to match Thai or Greek letters (but not digits or other non-letter characters in those scripts), use (?[\p{L} & (\p{Thai} + \p{Greek})]). * (Minor feature) Significant improvements to the character class match engine.
Compiled character classes are now more compact, and have faster matching for large or complex character sets, using binary search through the set. * JIT compilation now fails with the new error code PCRE2_ERROR_JIT_UNSUPPORTED for patterns which use features not supported by the JIT compiler. * (Minor feature) New options PCRE2_EXTRA_NO_BS0 (disallow \0 as an escape for the NUL character); PCRE2_EXTRA_PYTHON_OCTAL (use Python disambiguation rules for deciding whether \12 is a backreference or an octal escape); PCRE2_EXTRA_NEVER_CALLOUT (disable callout syntax entirely); PCRE2_EXTRA_TURKISH_CASING (use Turkish rules for case-insensitive matching). * (Minor feature) Add new API function pcre2_set_optimize() for controlling which optimizations are enabled. * (Minor new features) A variety of extensions have been made to pcre2_substitute() and its syntax for replacement strings. These now support: \123 octal escapes; titlecasing \u\L; \1 backreferences; \g<1> and $<NAME> backreferences; $& $` $' and $_; new function pcre2_set_substitute_case_callout() to allow locale-aware case transformation. Version 10.44 07-June-2024 -------------------------- This is mostly a bug-fix and tidying release. There is one new function, to set a maximum size for a compiled pattern. The maximum name length for groups is increased to 128. Some auxiliary files for building under VMS are added. Version 10.43 16-February-2024 ------------------------------ There are quite a lot of changes in this release (see ChangeLog and Git log for a list). Those that are not bugfixes or code tidies are: * The JIT code no longer supports the ARMv5 architecture. * A new function pcre2_get_match_data_heapframes_size() for finer heap control. * New option flags to restrict the interaction between ASCII and non-ASCII characters for caseless matching and \d and friends. There are also new pattern constructs to control these flags from within a pattern. * Upgrade to Unicode 15.0.0.
* Treat a NULL pattern with zero length as an empty string. * Added support for limited-length variable-length lookbehind assertions, with a default maximum length of 255 characters (same as Perl) but with a function to adjust the limit. * Support for LoongArch in JIT. * Perl changed the meaning of (for example) {,3} which did not use to be recognized as a quantifier. Now it means {0,3} and PCRE2 has also changed. Note that {,} is still not a quantifier. * Following Perl, allow spaces and tabs after { and before } in all Perl-compatible items that use braces, and also around commas in quantifiers. The one exception in PCRE2 is \u{...}, which is from ECMAScript, not Perl, and PCRE2 follows ECMAScript usage. * Changed the meaning of \w and its synonyms and derivatives (\b and \B) in UCP mode to follow Perl. It now matches characters whose general categories are L or N or whose particular categories are Mn (non-spacing mark) or Pc (connector punctuation). * Changed the default meaning of [:xdigit:] in UCP mode to follow Perl. It now also matches the "fullwidth" versions of hex digits. PCRE2_EXTRA_ASCII_DIGIT can be used to keep it ASCII only. * Make PCRE2_UCP the default in UTF mode in pcre2grep and add --no-ucp, --case-restrict and --posix-digit. * Add --group-separator and --no-group-separator to pcre2grep. Version 10.42 11-December-2022 ------------------------------ This is an unexpectedly early release to fix a problem that was introduced in 10.41. ChangeLog number 19 (GitHub #139) added the default definition of PCRE2_CALL_CONVENTION to pcre2posix.c instead of pcre2posix.h, which meant that programs including pcre2posix.h but not pcre2.h couldn't compile. A new test that checks this case has been added. A couple of other minor issues are also fixed, and a patch for an intermittent JIT fault is also included. See ChangeLog and the Git log. Version 10.41 06-December-2022 ------------------------------ This is another mainly bug-fixing and code-tidying release.
There is one significant upgrade to pcre2grep: it now behaves like GNU grep when matching more than one pattern and a later pattern matches at an earlier point in the subject when the matched substrings are being identified by colour or by offsets. Version 10.40 15-April-2022 --------------------------- This is mostly a bug-fixing and code-tidying release. However, there are some extensions to Unicode property handling: * Added support for Bidi_Class and a number of binary Unicode properties, including Bidi_Control. * A number of changes to script matching for \p and \P: (a) Script extensions for a character are now coded as a bitmap instead of a list of script numbers, which should be faster and does not need a loop. (b) Added the syntax \p{script:xxx} and \p{script_extensions:xxx} (synonyms sc and scx). (c) Changed \p{scriptname} from being the same as \p{sc:scriptname} to being the same as \p{scx:scriptname} because this change happened in Perl at release 5.26. (d) The standard Unicode 4-letter abbreviations for script names are now recognized. (e) In accordance with Unicode and Perl's "loose matching" rules, spaces, hyphens, and underscores are ignored in property names, which are then matched independent of case. As always, see ChangeLog for a list of all changes (also the Git log). Version 10.39 29-October-2021 ----------------------------- This release is happening soon after 10.38 because the bug fix is important. 1. Fix incorrect detection of alternatives in first character search in JIT. 2. Update to Unicode 14.0.0. 3. Some code cleanups (see ChangeLog). Version 10.38 01-October-2021 ----------------------------- As well as some bug fixes and tidies (as always, see ChangeLog for details), the documentation is updated to list the new URLs, following the move of the source repository to GitHub and the mailing list to Google Groups. * The CMake build system can now build both static and shared libraries in one go. 
* Following Perl's lead, \K is now locked out in lookaround assertions by default, but an option is provided to re-enable the previous behaviour. Version 10.37 26-May-2021 ------------------------- A few more bug fixes and tidies. The only change of real note is the removal of the actual POSIX names regcomp etc. from the POSIX wrapper library because these have caused issues for some applications (see 10.33 #2 below). Version 10.36 04-December-2020 ------------------------------ Again, mainly bug fixes and tidies. The only enhancements are the addition of GNU grep's -m (aka --max-count) option to pcre2grep, and also unifying the handling of substitution strings for both -O and callouts in pcre2grep, with the addition of $x{...} and $o{...} to allow for characters whose code points are greater than 255 in Unicode mode. NOTE: there is an outstanding issue with JIT support for MacOS on arm64 hardware. For details, please see Bugzilla issue #2618. Version 10.35 15-April-2020 --------------------------- Bugfixes, tidies, and a few new enhancements. 1. Capturing groups that contain recursive backreferences to themselves are no longer automatically atomic, because the restriction is no longer necessary as a result of the 10.30 restructuring. 2. Several new options for pcre2_substitute(). 3. When Unicode is supported and PCRE2_UCP is set without PCRE2_UTF, Unicode character properties are used for upper/lower case computations on characters whose code points are greater than 127. 4. The character tables (for low-valued characters) can now more easily be saved and restored in binary. 5. Updated to Unicode 13.0.0. Version 10.34 21-November-2019 ------------------------------ Another release with a few enhancements as well as bugfixes and tidies. The main new features are: 1. There is now some support for matching in invalid UTF strings. 2. Non-atomic positive lookarounds are implemented in the pcre2_match() interpreter, but not in JIT. 3. 
Added two new functions: pcre2_get_match_data_size() and pcre2_maketables_free(). 4. Upgraded to Unicode 12.1.0. Version 10.33 16-April-2019 --------------------------- Yet more bugfixes, tidies, and a few enhancements, summarized here (see ChangeLog for the full list): 1. Callouts from pcre2_substitute() are now available. 2. The POSIX functions are now all called pcre2_regcomp() etc., with wrapper functions that use the standard POSIX names. However, in pcre2posix.h the POSIX names are defined as macros. This should help avoid linking with the wrong library in some environments, while still exporting the POSIX names for pre-existing programs that use them. 3. Some new options: (a) PCRE2_EXTRA_ESCAPED_CR_IS_LF makes \r behave as \n. (b) PCRE2_EXTRA_ALT_BSUX enables support for ECMAScript 6's \u{hh...} construct. (c) PCRE2_COPY_MATCHED_SUBJECT causes a copy of a matched subject to be made, instead of just remembering a pointer. 4. Some new Perl features: (a) Perl 5.28's experimental alphabetic names for atomic groups and lookaround assertions, for example, (*pla:...) and (*atomic:...). (b) The new Perl "script run" features (*script_run:...) and (*atomic_script_run:...) aka (*sr:...) and (*asr:...). (c) When PCRE2_UTF is set, allow non-ASCII letters and decimal digits in capture group names. 5. --disable-percent-zt disables the use of %zu and %td in formatting strings in pcre2test. They were already automatically disabled for VC and older C compilers. 6. Some changes related to callouts in pcre2grep: (a) Support for running an external program under VMS has been added, in addition to Windows and fork() support. (b) --disable-pcre2grep-callout-fork restricts the callout support to the inbuilt echo facility. Version 10.32 10-September-2018 ------------------------------- This is another mainly bugfix and tidying release with a few minor enhancements. These are the main ones: 1.
pcre2grep now supports the inclusion of binary zeros in patterns that are read from files via the -f option. 2. ./configure now supports --enable-jit=auto, which automatically enables JIT if the hardware supports it. 3. In pcre2_dfa_match(), internal recursive calls no longer use the stack for local workspace and local ovectors. Instead, an initial block of stack is reserved, but if this is insufficient, heap memory is used. The heap limit parameter now applies to pcre2_dfa_match(). 4. Updated to Unicode version 11.0.0. 5. (*ACCEPT:ARG), (*FAIL:ARG), and (*COMMIT:ARG) are now supported. 6. Added support for \N{U+dddd}, but only in Unicode mode. 7. Added support for (?^) to unset all imnsx options. Version 10.31 12-February-2018 ------------------------------ This is mainly a bugfix and tidying release (see ChangeLog for full details). However, there are some minor enhancements. 1. New pcre2_config() options: PCRE2_CONFIG_NEVER_BACKSLASH_C and PCRE2_CONFIG_COMPILED_WIDTHS. 2. New pcre2_pattern_info() option PCRE2_INFO_EXTRAOPTIONS to retrieve the extra compile time options. 3. There are now public names for all the pcre2_compile() error numbers. 4. Added PCRE2_CALLOUT_STARTMATCH and PCRE2_CALLOUT_BACKTRACK bits to a new field callout_flags in callout blocks. Version 10.30 14-August-2017 ---------------------------- The full list of changes that includes bugfixes and tidies is, as always, in ChangeLog. These are the most important new features: 1. The main interpreter, pcre2_match(), has been refactored into a new version that does not use recursive function calls (and therefore the system stack) for remembering backtracking positions. This makes --disable-stack-for-recursion a NOOP. The new implementation allows backtracking into recursive group calls in patterns, making it more compatible with Perl, and also fixes some other previously hard-to-do issues. 
For patterns that have a lot of backtracking, the heap is now used, and there is an explicit limit on the amount, settable by pcre2_set_heap_limit() or (*LIMIT_HEAP=xxx). The "recursion limit" is retained, but is renamed as "depth limit" (though the old names remain for compatibility). There is also a change in the way callouts from pcre2_match() are handled. The offset_vector field in the callout block is no longer a pointer to the actual ovector that was passed to the matching function in the match data block. Instead it points to an internal ovector of a size large enough to hold all possible captured substrings in the pattern. 2. The new option PCRE2_ENDANCHORED insists that a pattern match must end at the end of the subject. 3. The new option PCRE2_EXTENDED_MORE implements Perl's /xx feature, and pcre2test is upgraded to support it. Setting within the pattern by (?xx) is also supported. 4. (?n) can be used to set PCRE2_NO_AUTO_CAPTURE, because Perl now has this. 5. Additional compile options in the compile context are now available, and the first two are: PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES and PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL. 6. The newline type PCRE2_NEWLINE_NUL is now available. 7. The match limit value now also applies to pcre2_dfa_match() as there are patterns that can use up a lot of resources without necessarily recursing very deeply. 8. The option REG_PEND (a GNU extension) is now available for the POSIX wrapper. Also there is a new option PCRE2_LITERAL which is used to support REG_NOSPEC. 9. PCRE2_EXTRA_MATCH_LINE and PCRE2_EXTRA_MATCH_WORD are implemented for the benefit of pcre2grep, and pcre2grep's -F, -w, and -x options are re-implemented using PCRE2_LITERAL, PCRE2_EXTRA_MATCH_WORD, and PCRE2_EXTRA_MATCH_LINE. This is tidier and also fixes some bugs. 10. The Unicode tables are upgraded from Unicode 8.0.0 to Unicode 10.0.0. 11. There are some experimental functions for converting foreign patterns (globs and POSIX patterns) into PCRE2 patterns. 
Version 10.23 14-February-2017 ------------------------------ 1. ChangeLog has the details of a lot of bug fixes and tidies. 2. There has been a major re-factoring of the pcre2_compile.c file. Most syntax checking is now done in the pre-pass that identifies capturing groups. This has reduced the amount of duplication and made the code tidier. While doing this, some minor bugs and Perl incompatibilities were fixed (see ChangeLog for details.) 3. Back references are now permitted in lookbehind assertions when there are no duplicated group numbers (that is, (?| has not been used), and, if the reference is by name, there is only one group of that name. The referenced group must, of course be of fixed length. 4. \g{+<number>} (e.g. \g{+2} ) is now supported. It is a "forward back reference" and can be useful in repetitions (compare \g{-<number>} ). Perl does not recognize this syntax. 5. pcre2grep now automatically expands its buffer up to a maximum set by --max-buffer-size. 6. The -t option (grand total) has been added to pcre2grep. 7. A new function called pcre2_code_copy_with_tables() exists to copy a compiled pattern along with a private copy of the character tables that it uses. 8. A user supplied a number of patches to upgrade pcre2grep under Windows and tidy the code. 9. Several updates have been made to pcre2test and test scripts (see ChangeLog). Version 10.22 29-July-2016 -------------------------- 1. ChangeLog has the details of a number of bug fixes. 2. The POSIX wrapper function regcomp() did not used to support back references and subroutine calls if called with the REG_NOSUB option. It now does. 3. A new function, pcre2_code_copy(), is added, to make a copy of a compiled pattern. 4. Support for string callouts is added to pcre2grep. 5. Added the PCRE2_NO_JIT option to pcre2_match(). 6. The pcre2_get_error_message() function now returns with a negative error code if the error number it is given is unknown. 7.
Several updates have been made to pcre2test and test scripts (see ChangeLog). Version 10.21 12-January-2016 ----------------------------- 1. Many bugs have been fixed. A large number of them were provoked only by very strange pattern input, and were discovered by fuzzers. Some others were discovered by code auditing. See ChangeLog for details. 2. The Unicode tables have been updated to Unicode version 8.0.0. 3. For Perl compatibility in EBCDIC environments, ranges such as a-z in a class, where both values are literal letters in the same case, omit the non-letter EBCDIC code points within the range. 4. There have been a number of enhancements to the pcre2_substitute() function, giving more flexibility to replacement facilities. It is now also possible to cause the function to return the needed buffer size if the one given is too small. 5. The PCRE2_ALT_VERBNAMES option causes the "name" parts of special verbs such as (*THEN:name) to be processed for backslashes and to take note of PCRE2_EXTENDED. 6. PCRE2_INFO_HASBACKSLASHC makes it possible for a client to find out if a pattern uses \C, and --never-backslash-C makes it possible to compile a version of PCRE2 in which the use of \C is always forbidden. 7. A limit to the length of pattern that can be handled can now be set by calling pcre2_set_max_pattern_length(). 8. When matching an unanchored pattern, a match can be required to begin within a given number of code units after the start of the subject by calling pcre2_set_offset_limit(). 9. The pcre2test program has been extended to test new facilities, and it can now run the tests when LF on its own is not a valid newline sequence. 10. The RunTest script has also been updated to enable more tests to be run. 11. There have been some minor performance enhancements. Version 10.20 30-June-2015 -------------------------- 1. Callouts with string arguments and the pcre2_callout_enumerate() function have been implemented. 2.
The PCRE2_NEVER_BACKSLASH_C option, which locks out the use of \C, is added. 3. The PCRE2_ALT_CIRCUMFLEX option lets ^ match after a newline at the end of a subject in multiline mode. 4. The way named subpatterns are handled has been refactored. The previous approach had several bugs. 5. The handling of \c in EBCDIC environments has been changed to conform to the perlebcdic document. This is an incompatible change. 6. Bugs have been mended, many of them discovered by fuzzers. Version 10.10 06-March-2015 --------------------------- 1. Serialization and de-serialization functions have been added to the API, making it possible to save and restore sets of compiled patterns, though restoration must be done in the same environment that was used for compilation. 2. The (*NO_JIT) feature has been added; this makes it possible for a pattern creator to specify that JIT is not to be used. 3. A number of bugs have been fixed. In particular, bugs that caused building on Windows using CMake to fail have been mended. Version 10.00 05-January-2015 ----------------------------- Version 10.00 is the first release of PCRE2, a revised API for the PCRE library. Changes prior to 10.00 are logged in the ChangeLog file for the old API, up to item 20 for release 8.36. New programs are recommended to use the new library. Programs that use the original (PCRE1) API will need changing before linking with the new library. 
**** ================================================ FILE: NON-AUTOTOOLS-BUILD ================================================ Building PCRE2 without using autotools ====================================== This document contains the following sections: General Generic instructions for the PCRE2 C libraries Stack size in Windows environments Linking programs in Windows environments Calling conventions in Windows environments Comments about Win32 builds Building PCRE2 on Windows with CMake Building PCRE2 on Windows with Visual Studio Testing with RunTest.bat Building PCRE2 on z/OS and z/VM Building PCRE2 under VMS General ------- The source of the PCRE2 libraries consists entirely of code written in Standard C, and so should compile successfully on any system that has a Standard C compiler and library. The PCRE2 distribution includes a "configure" file for use by the configure/make (autotools) build system, as found in many Unix-like environments. The README file contains information about the options for "configure". There is also support for CMake, which some users prefer, especially in Windows environments, though it can also be run in Unix-like environments. See the section entitled "Building PCRE2 on Windows with CMake" below. Versions of src/config.h and src/pcre2.h are distributed in the PCRE2 tarballs under the names src/config.h.generic and src/pcre2.h.generic. These are provided for those who build PCRE2 without using "configure" or CMake. If you use "configure" or CMake, the .generic versions are not used. Generic instructions for the PCRE2 C libraries ---------------------------------------------- There are three possible PCRE2 libraries, each handling data with a specific code unit width: 8, 16, or 32 bits. You can build any combination of them. The following are generic instructions for building a PCRE2 C library "by hand". If you are going to use CMake, this section does not apply to you; you can skip ahead to the CMake section. 
Note that the settings concerned with 8-bit, 16-bit, and 32-bit code units relate to the type of data string that PCRE2 processes. They are NOT referring to the underlying operating system bit width. You do not have to do anything special to compile in a 64-bit environment, for example. (1) Copy or rename the file src/config.h.generic as src/config.h, and edit the macro settings that it contains to whatever is appropriate for your environment. In particular, you can alter the definition of the NEWLINE macro to specify what character(s) you want to be interpreted as line terminators by default. You need to #define at least one of SUPPORT_PCRE2_8, SUPPORT_PCRE2_16, or SUPPORT_PCRE2_32, depending on which libraries you are going to build. You must set all that apply. When you subsequently compile any of the PCRE2 modules, you must specify -DHAVE_CONFIG_H to your compiler so that src/config.h is included in the sources. An alternative approach is not to edit src/config.h, but to use -D on the compiler command line to make any changes that you need to the configuration options. In this case -DHAVE_CONFIG_H must not be set. NOTE: There have been occasions when the way in which certain parameters in src/config.h are used has changed between releases. (In the configure/make world, this is handled automatically.) When upgrading to a new release, you are strongly advised to review src/config.h.generic before re-using what you had previously. Note also that the src/config.h.generic file is created from a config.h that was generated by Autotools, which automatically includes settings of a number of macros that are not actually used by PCRE2 (for example, HAVE_DLFCN_H). (2) Copy or rename the file src/pcre2.h.generic as src/pcre2.h. (3) EITHER: Copy or rename file src/pcre2_chartables.c.dist as src/pcre2_chartables.c. 
OR: Compile src/pcre2_dftables.c as a stand-alone program (using -DHAVE_CONFIG_H if you have set up src/config.h), and then run it with the single argument "src/pcre2_chartables.c". This generates a set of standard character tables and writes them to that file. The tables are generated using the default C locale for your system. If you want to use a locale that is specified by LC_xxx environment variables, add the -L option to the pcre2_dftables command. You must use this method if you are building on a system that uses EBCDIC code. The tables in src/pcre2_chartables.c are defaults. The caller of PCRE2 can specify alternative tables at run time. (4) For a library that supports 8-bit code units in the character strings that it processes, compile the following source files from the src directory, setting -DPCRE2_CODE_UNIT_WIDTH=8 as a compiler option. Also set -DHAVE_CONFIG_H if you have set up src/config.h with your configuration, or else use other -D settings to change the configuration as required. pcre2_auto_possess.c pcre2_chkdint.c pcre2_chartables.c pcre2_compile.c pcre2_compile_cgroup.c pcre2_compile_class.c pcre2_config.c pcre2_context.c pcre2_convert.c pcre2_dfa_match.c pcre2_error.c pcre2_extuni.c pcre2_find_bracket.c pcre2_jit_compile.c pcre2_maketables.c pcre2_match.c pcre2_match_data.c pcre2_match_next.c pcre2_newline.c pcre2_ord2utf.c pcre2_pattern_info.c pcre2_script_run.c pcre2_serialize.c pcre2_string_utils.c pcre2_study.c pcre2_substitute.c pcre2_substring.c pcre2_tables.c pcre2_ucd.c pcre2_valid_utf.c pcre2_xclass.c Make sure that you include -I. in the compiler command (or equivalent for an unusual compiler) so that all included PCRE2 header files are first sought in the src directory under the current directory. Otherwise you run the risk of picking up a previously-installed file from somewhere else. 
Note that you must compile pcre2_jit_compile.c, even if you have not defined SUPPORT_JIT in src/config.h, because when JIT support is not configured, dummy functions are compiled. When JIT support IS configured, pcre2_jit_compile.c #includes other files from the sljit dependency, all of whose names begin with "sljit". Note also that the pcre2_fuzzsupport.c file contains special code that is useful to those who want to run fuzzing tests on the PCRE2 library. Unless you are doing that, you can ignore it. (5) Now link all the compiled code into an object library in whichever form your system keeps such libraries. This is the PCRE2 C 8-bit library, typically called something like libpcre2-8. If your system has static and shared libraries, you may have to do this once for each type. (6) If you want to build a library that supports 16-bit or 32-bit code units, set 16 or 32 as the value of -DPCRE2_CODE_UNIT_WIDTH when obeying step 4 above. If you want to build more than one PCRE2 library, repeat steps 4 and 5 as necessary. (7) If you want to build the POSIX wrapper functions (which apply only to the 8-bit library), ensure that you have the src/pcre2posix.h file and then compile src/pcre2posix.c. Link the result (on its own) as the pcre2posix library. If targeting a DLL in Windows, make sure to include -DPCRE2POSIX_SHARED with your compiler flags. (8) The pcre2test program can be linked with any combination of the 8-bit, 16-bit and 32-bit libraries (depending on what you specified in src/config.h). Compile src/pcre2test.c; don't forget -DHAVE_CONFIG_H if necessary, but do NOT define PCRE2_CODE_UNIT_WIDTH. Then link with the appropriate library/ies. If you compiled an 8-bit library, pcre2test also needs the pcre2posix wrapper library when linking. (9) Run pcre2test on the testinput files in the testdata directory, and check that the output matches the corresponding testoutput files.
There are comments about what each test does in the section entitled "Testing PCRE2" in the README file. If you compiled more than one of the 8-bit, 16-bit and 32-bit libraries, you need to run pcre2test with the -16 option to do 16-bit tests and with the -32 option to do 32-bit tests. Some tests are relevant only when certain build-time options are selected. For example, test 4 is for Unicode support, and will not run if you have built PCRE2 without it. See the comments at the start of each testinput file. If you have a suitable Unix-like shell, the RunTest script will run the appropriate tests for you. The command "RunTest list" will output a list of all the tests. Note that the supplied files are in Unix format, with just LF characters as line terminators. You may need to edit them to change this if your system uses a different convention. (10) If you have built PCRE2 with SUPPORT_JIT, the JIT features can be tested by running pcre2test with the -jit option. This is done automatically by the RunTest script. You might also like to build and run the freestanding JIT test program, src/pcre2_jit_test.c. (11) The pcre2test program tests the POSIX wrapper library, but there is also a freestanding test program in src/pcre2posix_test.c. It must be linked with both the pcre2posix library and the 8-bit PCRE2 library. (12) If you want to use the pcre2grep command, compile and link src/pcre2grep.c; it uses only the 8-bit PCRE2 library (it does not need the pcre2posix library). If you have built the PCRE2 library with JIT support by defining SUPPORT_JIT in src/config.h, you can also define SUPPORT_PCRE2GREP_JIT, which causes pcre2grep to make use of JIT (unless it is run with --no-jit). If you define SUPPORT_PCRE2GREP_JIT without defining SUPPORT_JIT, pcre2grep does not try to make use of JIT. 
Stack size in Windows environments ---------------------------------- Prior to release 10.30 the default system stack size of 1MiB in some Windows environments caused issues with some tests. This should no longer be the case for 10.30 and later releases. Linking programs in Windows environments ---------------------------------------- If you want to statically link a program against a PCRE2 library in the form of a non-dll .a file, you must define PCRE2_STATIC before including src/pcre2.h. Calling conventions in Windows environments ------------------------------------------- It is possible to compile programs to use different calling conventions using MSVC. Search the web for "calling conventions" for more information. To make it easier to change the calling convention for the exported functions in a PCRE2 library, the macro PCRE2_CALL_CONVENTION is present in all the external definitions. It can be set externally when compiling (e.g. in CFLAGS). If it is not set, it defaults to empty; the default calling convention is then used (which is what is wanted most of the time). Comments about Win32 builds (see also "Building PCRE2 on Windows with CMake") --------------------------- There are two ways of building PCRE2 using the "configure, make, make install" paradigm on Windows systems: using MinGW or using Cygwin. These are not at all the same thing; they are completely different from each other. There is also support for building using CMake, which some users find a more straightforward way of building PCRE2 under Windows. The MinGW home page (http://www.mingw.org/) says this: MinGW: A collection of freely available and freely distributable Windows specific header files and import libraries combined with GNU toolsets that allow one to produce native Windows programs that do not rely on any 3rd-party C runtime DLLs. The Cygwin home page (http://www.cygwin.com/) says this: Cygwin is a Linux-like environment for Windows. It consists of two parts: . 
A DLL (cygwin1.dll) which acts as a Linux API emulation layer providing substantial Linux API functionality . A collection of tools which provide Linux look and feel. On both MinGW and Cygwin, PCRE2 should build correctly using: ./configure && make && make install This should create two libraries called libpcre2-8 and libpcre2-posix. These are independent libraries: when you link with libpcre2-posix you must also link with libpcre2-8, which contains the basic functions. Using Cygwin's compiler generates libraries and executables that depend on cygwin1.dll. If a library that is generated this way is distributed, cygwin1.dll has to be distributed as well. Since cygwin1.dll is under the GPL licence, this forces not only PCRE2 to be under the GPL, but also the entire application. A distributor who wants to keep their own code proprietary must purchase an appropriate Cygwin licence. MinGW has no such restrictions. The MinGW compiler generates a library or executable that can run standalone on Windows without any third party dll or licensing issues. But there is more complication: If a Cygwin user uses the -mno-cygwin Cygwin gcc flag, what that really does is to tell Cygwin's gcc to use the MinGW gcc. Cygwin's gcc is only acting as a front end to MinGW's gcc (if you install Cygwin's gcc, you get both Cygwin's gcc and MinGW's gcc). So, a user can: . Build native binaries by using MinGW or by getting Cygwin and using -mno-cygwin. . Build binaries that depend on cygwin1.dll by using Cygwin with the normal compiler flags. The test files that are supplied with PCRE2 are in UNIX format, with LF characters as line terminators. Unless your PCRE2 library uses a default newline option that includes LF as a valid newline, it may be necessary to change the line terminators in the test files to get some of the tests to work. Building PCRE2 on Windows with CMake ------------------------------------ CMake is an alternative configuration facility that can be used instead of "configure". 
CMake creates project files (make files, solution files, etc.) tailored to numerous development environments, including Visual Studio, Borland, Msys, MinGW, NMake, and Unix. If possible, use short paths with no spaces in the names for your CMake installation and your PCRE2 source and build directories. If you are using CMake and encounter errors, deleting the CMake cache and restarting from a fresh build may fix the error. In the CMake GUI, the cache can be deleted by selecting "File > Delete Cache"; or the folder "CMakeCache" can be deleted. 1. Install the latest CMake version available from http://www.cmake.org/, and ensure that cmake\bin is on your path. 2. Unzip (retaining folder structure) the PCRE2 source tree into a source directory such as C:\pcre2. You should ensure your local date and time is not earlier than the file dates in your source dir if the release is very new. 3. Create a new, empty build directory, preferably a subdirectory of the source dir. For example, C:\pcre2\pcre2-xx\build. 4. Run CMake. - Using the CLI, simply run `cmake ..` inside the `build/` directory. You can use the `ccmake` ncurses GUI to select and configure PCRE2 features. - Using the CMake GUI: a) Run cmake-gui from the Shell environment of your build tool, for example, Msys for Msys/MinGW or Visual Studio Command Prompt for VC/VC++. b) Enter C:\pcre2\pcre2-xx and C:\pcre2\pcre2-xx\build for the source and build directories, respectively. c) Press the "Configure" button. d) Select the particular IDE / build tool that you are using (Visual Studio, MSYS makefiles, MinGW makefiles, etc.) e) The GUI will then list several configuration options. This is where you can disable Unicode support or select other PCRE2 optional features. f) Press "Configure" again. The adjacent "Generate" button should now be active. g) Press "Generate". 5. The build directory should now contain a usable build system, be it a solution file for Visual Studio, makefiles for MinGW, etc. 
Exit from cmake-gui and use the generated build system with your compiler or IDE. E.g., for MinGW you can run "make", or for Visual Studio, open the PCRE2 solution, select the desired configuration (Debug, or Release, etc.) and build the ALL_BUILD project. Regardless of build system used, `cmake --build .` will build it. 6. If during configuration with cmake-gui you've elected to build the test programs, you can execute them by building the test project. E.g., for MinGW: "make test"; for Visual Studio build the RUN_TESTS project. The most recent build configuration is targeted by the tests. A summary of test results is presented. Complete test output is subsequently available for review in Testing\Temporary under your build dir. Regardless of build system used, `ctest` will run the tests. Building PCRE2 on Windows with Visual Studio -------------------------------------------- The code currently cannot be compiled without an inttypes.h header, which is available only with Visual Studio 2013 or newer. However, this portable and permissively-licensed implementation of the stdint.h header could be used as an alternative: http://www.azillionmonkeys.com/qed/pstdint.h Just rename it and drop it into the top level of the build tree. Testing with RunTest.bat ------------------------ If configured with CMake, building the test project ("make test" or building ALL_TESTS in Visual Studio) creates (and runs) pcre2_test.bat (and depending on your configuration options, possibly other test programs) in the build directory. The pcre2_test.bat script runs RunTest.bat with correct source and exe paths. For manual testing with RunTest.bat, provided the build dir is a subdirectory of the source directory: Open command shell window. Chdir to the location of your pcre2test.exe and pcre2grep.exe programs. Call RunTest.bat with "..\RunTest.Bat" or "..\..\RunTest.bat" as appropriate. To run only a particular test with RunTest.Bat provide a test number argument. Otherwise: 1. 
Copy RunTest.bat into the directory where pcre2test.exe and pcre2grep.exe have been created. 2. Edit RunTest.bat to identify the full or relative location of the pcre2 source (within which the testdata folder resides), e.g.: set srcdir=C:\pcre2\pcre2-10.00 3. In a Windows command environment, chdir to the location of your bat and exe programs. 4. Run RunTest.bat. Test outputs will automatically be compared to expected results, and discrepancies will be identified in the console output. To independently test the just-in-time compiler, run pcre2_jit_test.exe. Building PCRE2 on z/OS and z/VM ------------------------------- z/OS and z/VM are operating systems for mainframe computers, produced by IBM. The character code used is EBCDIC, not ASCII or Unicode. In z/OS, UNIX APIs and applications can be supported through UNIX System Services. The PCRE2 codebase compiles and runs with native EBCDIC support on modern z/OS systems, using the pre-installed tools (/bin/sh, ./configure, and the XLC or IBM-Clang compilers). PCRE2 supports z/OS using both Autoconf (./configure) and CMake (which IBM distributes via "zopen install cmake"). Note that as of the time of writing, IBM's port of CMake to z/OS has only partial support for EBCDIC. It is recommended to build PCRE2 using the ./configure script, if you require an EBCDIC build. Any EBCDIC codepage should work (PCRE2 does not assume or require IBM-1047), or PCRE2 can be compiled for ASCII/Latin-1/Unicode. After unpacking the PCRE2 tarball, you must subsequently tag the files as ASCII in order for the z/OS shell and compiler to interpret them correctly: chtag -R -tc ISO8859-1 Some unusual features on the IBM platform are: - The _ALL_SOURCE macro must be provided. Unlike on Linux or macOS, even quite standard POSIX APIs are not made visible by default. PCRE2's Autoconf and CMake system both provide this for you.
- The `cc`, `c89`, and even `c99` compilers provided by IBM do not default to the same argument ordering as other Unix platforms. - The XLC compiler requires `-qhaltonmsg=CCN3296`, otherwise it will treat any preprocessor #include errors as a warning rather than an error. Needless to say this default wrecks Autoconf and CMake's feature-detection tests. PCRE2's build system is aware of this. - The test suite (in the testdata/ directory) is entirely in ASCII/UTF-8. When running the tests, you must ensure that the EBCDIC-native build of pcre2test receives an EBCDIC version of these files. The easiest way to achieve this is via filesystem tagging (chtag). Alternatively, you could manually re-encode the testdata files as EBCDIC, and tag them as EBCDIC. (Latin-1 and EBCDIC are one-to-one convertible encodings, a simple byte-by-byte permutation of the 256 values.) In native z/OS (without UNIX System Services) and in z/VM, a user has provided a special port of PCRE2. For details, please see file 939 on this web site: http://www.cbttape.org The user-provided port also provides an API for LE languages such as COBOL and for the z/OS and z/VM versions of the Rexx languages. Building PCRE2 under VMS ------------------------ Alexey Chuphin has contributed some auxiliary files for building PCRE2 under OpenVMS. They are in the "vms" directory in the distribution tarball. Please read the file called vms/openvms_readme.txt. The pcre2test and pcre2grep programs contain some VMS-specific code. This has not been tested for some time. The PCRE2 maintainers would be grateful to learn whether it still works (or if anyone still uses it). 
============================= Last updated: 17 October 2025 ============================= ================================================ FILE: README ================================================ README file for PCRE2 (Perl-compatible regular expression library) ================================================================== PCRE2 is a re-working of the original PCRE1 library to provide an entirely new API. Since its initial release in 2015, there has been further development of the code and it now differs from PCRE1 in more than just the API. There are new features, and the internals have been improved. The original PCRE1 library is now obsolete and no longer maintained. The latest release of PCRE2 is available in .tar.gz, .tar.bz2, or .zip form from this GitHub repository: https://github.com/PCRE2Project/pcre2/releases There is a mailing list for discussion about the development of PCRE2 at pcre2-dev@googlegroups.com. You can subscribe by sending an email to pcre2-dev+subscribe@googlegroups.com. You can access the archives and also subscribe or manage your subscription here: https://groups.google.com/g/pcre2-dev Please read the NEWS file if you are upgrading from a previous release. The contents of this README file are: The PCRE2 APIs Documentation for PCRE2 Building PCRE2 on non-Unix-like systems Building PCRE2 without using autotools Building PCRE2 using autotools Retrieving configuration information Shared libraries Cross-compiling using autotools Making new tarballs Testing PCRE2 Character tables File manifest The PCRE2 APIs -------------- PCRE2 is written in C, and it has its own API. There are three sets of functions, one for the 8-bit library, which processes strings of bytes, one for the 16-bit library, which processes strings of 16-bit values, and one for the 32-bit library, which processes strings of 32-bit values. Unlike PCRE1, there are no C++ wrappers.
The distribution does contain a set of C wrapper functions for the 8-bit library that are based on the POSIX regular expression API (see the pcre2posix man page). These are built into a library called libpcre2-posix. Note that this just provides a POSIX calling interface to PCRE2; the regular expressions themselves still follow Perl syntax and semantics. The POSIX API is restricted, and does not give full access to all of PCRE2's facilities. The header file for the POSIX-style functions is called pcre2posix.h. The official POSIX name is regex.h, but I did not want to risk possible problems with existing files of that name by distributing it that way. To use PCRE2 with an existing program that uses the POSIX API, pcre2posix.h will have to be renamed or pointed at by a link (or the program modified, of course). See the pcre2posix documentation for more details. Documentation for PCRE2 ----------------------- If you install PCRE2 in the normal way on a Unix-like system, you will end up with a set of man pages whose names all start with "pcre2". The one that is just called "pcre2" lists all the others. In addition to these man pages, the PCRE2 documentation is supplied in two other forms: 1. There are files called doc/pcre2.txt, doc/pcre2grep.txt, and doc/pcre2test.txt in the source distribution. The first of these is a concatenation of the text forms of all the section 3 man pages except the listing of pcre2demo.c and those that summarize individual functions. The other two are the text forms of the section 1 man pages for the pcre2grep and pcre2test commands. These text forms are provided for ease of scanning with text editors or similar tools. They are installed in <prefix>/share/doc/pcre2, where <prefix> is the installation prefix (defaulting to /usr/local). 2. A set of files containing all the documentation in HTML form, hyperlinked in various ways, and rooted in a file called index.html, is distributed in doc/html and installed in <prefix>/share/doc/pcre2/html.
Building PCRE2 on non-Unix-like systems --------------------------------------- For a non-Unix-like system, please read the file NON-AUTOTOOLS-BUILD, though if your system supports the use of "configure" and "make" you may be able to build PCRE2 using autotools in the same way as for many Unix-like systems. This file also contains useful information on building for some unusual Unix environments (such as EBCDIC mainframes). PCRE2 can also be configured using CMake, which can be run in various ways (command line, GUI, etc). This creates Makefiles, solution files, etc. The file NON-AUTOTOOLS-BUILD has information about CMake. PCRE2 has been compiled on many different operating systems. It should be straightforward to build PCRE2 on any system that has a C99 or later compiler and library. Building PCRE2 without using autotools -------------------------------------- The use of autotools (in particular, libtool) is problematic in some environments, even some that are Unix or Unix-like. See the NON-AUTOTOOLS-BUILD file for ways of building PCRE2 without using autotools. Building PCRE2 using autotools ------------------------------ The following instructions assume the use of the widely used "configure; make; make install" (autotools) process. If you have downloaded and unpacked a PCRE2 release tarball, run the "configure" command from the PCRE2 directory, with your current directory set to the directory where you want the files to be created. This command is a standard GNU "autoconf" configuration script, for which generic instructions are supplied in the file INSTALL. The files in the GitHub repository do not contain "configure". If you have downloaded the PCRE2 source files from GitHub, before you can run "configure" you must run the shell script called autogen.sh. This runs a number of autotools to create a "configure" script (you must of course have the autotools commands installed in order to do this). 
Most commonly, people build PCRE2 within its own distribution directory, and in this case, on many systems, just running "./configure" is sufficient. However, the usual methods of changing standard defaults are available. For example: CFLAGS='-O2 -Wall' ./configure --prefix=/opt/local This command specifies that the C compiler should be run with the flags '-O2 -Wall' instead of the default, and that "make install" should install PCRE2 under /opt/local instead of the default /usr/local. If you want to build in a different directory, just run "configure" with that directory as current. For example, suppose you have unpacked the PCRE2 source into /source/pcre2/pcre2-xxx, but you want to build it in /build/pcre2/pcre2-xxx: cd /build/pcre2/pcre2-xxx /source/pcre2/pcre2-xxx/configure PCRE2 is written in C and is normally compiled as a C library. However, it is possible to build it as a C++ library, though the provided building apparatus does not have any features to support this. There are some optional features that can be included or omitted from the PCRE2 library. They are also documented in the pcre2build man page. . By default, both shared and static libraries are built. You can change this by adding one of these options to the "configure" command: --disable-shared --disable-static Setting --disable-shared ensures that PCRE2 libraries are built as static libraries. The binaries that are then created as part of the build process (for example, pcre2test and pcre2grep) are linked statically with one or more PCRE2 libraries, but may also be dynamically linked with other libraries such as libc. If you want these binaries to be fully statically linked, you can set LDFLAGS like this: LDFLAGS=--static ./configure --disable-shared Note the two hyphens in --static. Of course, this works only if static versions of all the relevant libraries are available for linking. See also "Shared libraries" below. 
Shared libraries are compiled with symbol versioning enabled on platforms that support this, but this can be disabled by adding --disable-symvers. . By default, only the 8-bit library is built. If you add --enable-pcre2-16 to the "configure" command, the 16-bit library is also built. If you add --enable-pcre2-32 to the "configure" command, the 32-bit library is also built. If you want only the 16-bit or 32-bit library, use --disable-pcre2-8 to disable building the 8-bit library. . If you want to include support for just-in-time (JIT) compiling, which can give large performance improvements on certain platforms, add --enable-jit to the "configure" command. This support is available only for certain hardware architectures. If you try to enable it on an unsupported architecture, there will be a compile time error. If in doubt, use --enable-jit=auto, which enables JIT only if the current hardware is supported. . If you are enabling JIT under SELinux environment you may also want to add --enable-jit-sealloc, which enables the use of an executable memory allocator that is compatible with SELinux. Warning: this allocator is experimental! It does not support fork() operation and may crash when no disk space is available. This option has no effect if JIT is disabled. . If you do not want to make use of the default support for UTF-8 Unicode character strings in the 8-bit library, UTF-16 Unicode character strings in the 16-bit library, or UTF-32 Unicode character strings in the 32-bit library, you can add --disable-unicode to the "configure" command. This reduces the size of the libraries. It is not possible to configure one library with Unicode support, and another without, in the same configuration. It is also not possible to use --enable-ebcdic (see below) with Unicode support, so if this option is set, you must also use --disable-unicode. 
When Unicode support is available, the use of a UTF encoding still has to be enabled by setting the PCRE2_UTF option at run time or starting a pattern with (*UTF). When PCRE2 is compiled with Unicode support, its input can only either be ASCII or UTF-8/16/32, even when running on EBCDIC platforms. As well as supporting UTF strings, Unicode support includes support for the \P, \p, and \X sequences that recognize Unicode character properties. However, only a subset of Unicode properties are supported; see the pcre2pattern man page for details. Escape sequences such as \d and \w in patterns do not by default make use of Unicode properties, but can be made to do so by setting the PCRE2_UCP option or starting a pattern with (*UCP). . You can build PCRE2 to recognize either CR or LF or the sequence CRLF, or any of the preceding, or any of the Unicode newline sequences, or the NUL (zero) character as indicating the end of a line. Whatever you specify at build time is the default; the caller of PCRE2 can change the selection at run time. The default newline indicator is a single LF character (the Unix standard). You can specify the default newline indicator by adding --enable-newline-is-cr, --enable-newline-is-lf, --enable-newline-is-crlf, --enable-newline-is-anycrlf, --enable-newline-is-any, or --enable-newline-is-nul to the "configure" command, respectively. . By default, the sequence \R in a pattern matches any Unicode line ending sequence. This is independent of the option specifying what PCRE2 considers to be the end of a line (see above). However, the caller of PCRE2 can restrict \R to match only CR, LF, or CRLF. You can make this the default by adding --enable-bsr-anycrlf to the "configure" command (bsr = "backslash R"). . In a pattern, the escape sequence \C matches a single code unit, even in a UTF mode. This can be dangerous because it breaks up multi-code-unit characters. 
You can build PCRE2 with the use of \C permanently locked out by adding --enable-never-backslash-C (note the upper case C) to the "configure" command. When \C is allowed by the library, individual applications can lock it out by calling pcre2_compile() with the PCRE2_NEVER_BACKSLASH_C option. . PCRE2 has a counter that limits the depth of nesting of parentheses in a pattern. This limits the amount of system stack that a pattern uses when it is compiled. The default is 250, but you can change it by setting, for example, --with-parens-nest-limit=500 . PCRE2 has a counter that can be set to limit the amount of computing resource it uses when matching a pattern. If the limit is exceeded during a match, the match fails. The default is ten million. You can change the default by setting, for example, --with-match-limit=500000 on the "configure" command. This is just the default; individual calls to pcre2_match() or pcre2_dfa_match() can supply their own value. There is more discussion in the pcre2api man page (search for pcre2_set_match_limit). . There is a separate counter that limits the depth of nested backtracking (pcre2_match()) or nested function calls (pcre2_dfa_match()) during a matching process, which indirectly limits the amount of heap memory that is used, and in the case of pcre2_dfa_match() the amount of stack as well. This counter also has a default of ten million, which is essentially "unlimited". You can change the default by setting, for example, --with-match-limit-depth=5000 There is more discussion in the pcre2api man page (search for pcre2_set_depth_limit). . You can also set an explicit limit on the amount of heap memory used by the pcre2_match() and pcre2_dfa_match() interpreters: --with-heap-limit=500 The units are kibibytes (units of 1024 bytes). This limit does not apply when the JIT optimization (which has its own memory control features) is used. There is more discussion on the pcre2api man page (search for pcre2_set_heap_limit). . 
In the 8-bit library, the default maximum compiled pattern size is around 64 kibibytes. You can increase this by adding --with-link-size=3 to the "configure" command. PCRE2 then uses three bytes instead of two for offsets to different parts of the compiled pattern. In the 16-bit library, --with-link-size=3 is the same as --with-link-size=4, which (in both libraries) uses four-byte offsets. Increasing the internal link size reduces performance in the 8-bit and 16-bit libraries. In the 32-bit library, the link size setting is ignored, as 4-byte offsets are always used. . Lookbehind assertions in which one or more branches can match a variable number of characters are supported only if there is a maximum matching length for each top-level branch. There is a limit to this maximum that defaults to 255 characters. You can alter this default by a setting such as --with-max-varlookbehind=100 The limit can be changed at runtime by calling pcre2_set_max_varlookbehind(). Lookbehind assertions in which every branch matches a fixed number of characters (not necessarily all the same) are not constrained by this limit. . For speed, PCRE2 uses four tables for manipulating and identifying characters whose code point values are less than 256. By default, it uses a set of tables for ASCII encoding that is part of the distribution. If you specify --enable-rebuild-chartables a program called pcre2_dftables is compiled and run in the default C locale when you obey "make". It builds a source file called pcre2_chartables.c. If you do not specify this option, pcre2_chartables.c is created as a copy of pcre2_chartables.c.dist. See "Character tables" below for further information. . 
It is possible to compile PCRE2 for use on systems that use EBCDIC as their character code (as opposed to ASCII/Unicode) by specifying --enable-ebcdic --disable-unicode This automatically implies --enable-rebuild-chartables (see above), in order to ensure that you have the correct default character tables for your system's codepage. There is an exception when you set --enable-ebcdic-ignoring-compiler (see below), which allows using a default set of EBCDIC 1047 character tables rather than forcing use of --enable-rebuild-chartables. When PCRE2 is built with EBCDIC support, it always operates in EBCDIC. It cannot support both EBCDIC and ASCII or UTF-8/16/32. There is a second option, --enable-ebcdic-nl25, which specifies that the code value for the EBCDIC NL character is 0x25 instead of the default 0x15. There is a third option, --enable-ebcdic-ignoring-compiler, which disregards the compiler's codepage for determining the numeric value of C character constants such as 'z', and instead forces PCRE2 to use numeric constants for the EBCDIC 1047 codepage instead. . If you specify --enable-debug, additional debugging code is included in the build. This option is intended for use by the PCRE2 maintainers. . In environments where valgrind is installed, if you specify --enable-valgrind PCRE2 will use valgrind annotations to mark certain memory regions as unaddressable. This allows it to detect invalid memory accesses, and is mostly useful for debugging PCRE2 itself. . In environments where the gcc compiler is used and lcov is installed, if you specify --enable-coverage the build process implements a code coverage report for the test suite. The report is generated by running "make coverage". If ccache is installed on your system, it must be disabled when building PCRE2 for coverage reporting. You can do this by setting the environment variable CCACHE_DISABLE=1 before running "make" to build PCRE2. 
There is more information about coverage reporting in the "pcre2build" documentation. . When JIT support is enabled, pcre2grep automatically makes use of it, unless you add --disable-pcre2grep-jit to the "configure" command. . There is support for calling external programs during matching in the pcre2grep command, using PCRE2's callout facility with string arguments. This support can be disabled by adding --disable-pcre2grep-callout to the "configure" command. There are two kinds of callout: one that generates output from inbuilt code, and another that calls an external program. The latter has special support for Windows and VMS; otherwise it assumes the existence of the fork() function. This facility can be disabled by adding --disable-pcre2grep-callout-fork to the "configure" command. . The pcre2grep program currently supports only 8-bit data files, and so requires the 8-bit PCRE2 library. It is possible to compile pcre2grep to use libz and/or libbz2, in order to read .gz and .bz2 files (respectively), by specifying one or both of --enable-pcre2grep-libz --enable-pcre2grep-libbz2 Of course, the relevant libraries must be installed on your system. . The default starting size (in bytes) of the internal buffer used by pcre2grep can be set by, for example: --with-pcre2grep-bufsize=51200 The value must be a plain integer. The default is 20480. The amount of memory used by pcre2grep is actually three times this number, to allow for "before" and "after" lines. If very long lines are encountered, the buffer is automatically enlarged, up to a fixed maximum size. . The default maximum size of pcre2grep's internal buffer can be set by, for example: --with-pcre2grep-max-bufsize=2097152 The default is either 1048576 or the value of --with-pcre2grep-bufsize, whichever is the larger. . 
It is possible to compile pcre2test so that it links with the libreadline or libedit libraries, by specifying, respectively, --enable-pcre2test-libreadline or --enable-pcre2test-libedit If this is done, when pcre2test's input is from a terminal, it reads it using the readline() function. This provides line-editing and history facilities. Note that libreadline is GPL-licensed, so if you distribute a binary of pcre2test linked in this way, there may be licensing issues. These can be avoided by linking with libedit (which has a BSD licence) instead. Enabling libreadline causes the -lreadline option to be added to the pcre2test build. In many operating environments with a system-installed readline library this is sufficient. However, in some environments (e.g. if an unmodified distribution version of readline is in use), it may be necessary to specify something like LIBS="-lncurses" as well. This is because, to quote the readline INSTALL, "Readline uses the termcap functions, but does not link with the termcap or curses library itself, allowing applications which link with readline the option to choose an appropriate library." If you get error messages about missing functions tgetstr, tgetent, tputs, tgetflag, or tgoto, this is the problem, and linking with the ncurses library should fix it. . The C99 standard defines formatting modifiers z and t for size_t and ptrdiff_t values, respectively. By default, PCRE2 uses these modifiers in environments other than Microsoft Visual Studio versions earlier than 2013 when __STDC_VERSION__ is defined and has a value greater than or equal to 199901L (indicating C99). However, there is at least one environment that claims to be C99 but does not support these modifiers. If --disable-percent-zt is specified, no use is made of the z or t modifiers. Instead of %td or %zu, %lu is used, with a cast for size_t values. . There is a special option called --enable-fuzz-support for use by people who want to run fuzzing tests on PCRE2. 
If set, it causes an extra library called libpcre2-fuzzsupport.a to be built, but not installed. This contains a single function called LLVMFuzzerTestOneInput() whose arguments are a pointer to a string and the length of the string. When called, this function tries to compile the string as a pattern, and if that succeeds, to match it. This is done both with no options and with some random options bits that are generated from the string. Setting --enable-fuzz-support also causes an executable called pcre2fuzzcheck-{8,16,32} to be created. This is normally run under valgrind or used when PCRE2 is compiled with address sanitizing enabled. It calls the fuzzing function and outputs information about what it is doing. The input strings are specified by arguments: if an argument starts with "=" the rest of it is a literal input string. Otherwise, it is assumed to be a file name, and the contents of the file are the test string. . Releases before 10.30 could be compiled with --disable-stack-for-recursion, which caused pcre2_match() to use individual blocks on the heap for backtracking instead of recursive function calls (which use the stack). This is now obsolete because pcre2_match() was refactored always to use the heap (in a much more efficient way than before). This option is retained for backwards compatibility, but has no effect other than to output a warning. The "configure" script builds the following files for the basic C library: . Makefile the makefile that builds the library . src/config.h build-time configuration options for the library . src/pcre2.h the public PCRE2 header file . pcre2-config script that shows the building settings such as CFLAGS that were set for "configure" . libpcre2-8.pc ) . libpcre2-16.pc ) data for the pkg-config command . libpcre2-32.pc ) . libpcre2-posix.pc ) . 
libtool script that builds shared and/or static libraries Versions of config.h and pcre2.h are distributed in the src directory of PCRE2 tarballs under the names config.h.generic and pcre2.h.generic. These are provided for those who have to build PCRE2 without using "configure" or CMake. If you use "configure" or CMake, the .generic versions are not used. The "configure" script also creates config.status, which is an executable script that can be run to recreate the configuration, and config.log, which contains compiler output from tests that "configure" runs. Once "configure" has run, you can run "make". This builds whichever of the libraries libpcre2-8, libpcre2-16 and libpcre2-32 are configured, and a test program called pcre2test. If you enabled JIT support with --enable-jit, another test program called pcre2_jit_test is built as well. If the 8-bit library is built, libpcre2-posix, pcre2posix_test, and the pcre2grep command are also built. Running "make" with the -j option may speed up compilation on multiprocessor systems. The command "make check" runs all the appropriate tests. Details of the PCRE2 tests are given below in a separate section of this document. The -j option of "make" can also be used when running the tests. You can use "make install" to install PCRE2 into live directories on your system. 
The following are installed (file names are all relative to the <prefix> that is set when "configure" is run): Commands (bin): pcre2test pcre2grep (if 8-bit support is enabled) pcre2-config Libraries (lib): libpcre2-8 (if 8-bit support is enabled) libpcre2-16 (if 16-bit support is enabled) libpcre2-32 (if 32-bit support is enabled) libpcre2-posix (if 8-bit support is enabled) Configuration information (lib/pkgconfig): libpcre2-8.pc libpcre2-16.pc libpcre2-32.pc libpcre2-posix.pc Header files (include): pcre2.h pcre2posix.h Man pages (share/man/man{1,3}): pcre2grep.1 pcre2test.1 pcre2-config.1 pcre2.3 pcre2*.3 (lots more pages, all starting "pcre2") HTML documentation (share/doc/pcre2/html): index.html *.html (lots more pages, hyperlinked from index.html) Text file documentation (share/doc/pcre2): AUTHORS COPYING ChangeLog LICENCE NEWS README SECURITY pcre2.txt (a concatenation of the man(3) pages) pcre2test.txt the pcre2test man page pcre2grep.txt the pcre2grep man page pcre2-config.txt the pcre2-config man page If you want to remove PCRE2 from your system, you can run "make uninstall". This removes all the files that "make install" installed. However, it does not remove any directories, because these are often shared with other programs. Retrieving configuration information ------------------------------------ Running "make install" installs the command pcre2-config, which can be used to recall information about the PCRE2 configuration and installation. For example: pcre2-config --version prints the version number, and pcre2-config --libs8 outputs information about where the 8-bit library is installed. This command can be included in makefiles for programs that use PCRE2, saving the programmer from having to remember too many details. Run pcre2-config with no arguments to obtain a list of possible arguments. The pkg-config command is another system for saving and retrieving information about installed libraries.
Instead of separate commands for each library, a single command is used. For example: pkg-config --libs libpcre2-16 The data is held in *.pc files that are installed in a directory called <prefix>/lib/pkgconfig. Shared libraries ---------------- The default distribution builds PCRE2 as shared libraries and static libraries, as long as the operating system supports shared libraries. Shared library support relies on the "libtool" script which is built as part of the "configure" process. The libtool script is used to compile and link both shared and static libraries. They are placed in a subdirectory called .libs when they are newly built. The programs pcre2test and pcre2grep are built to use these uninstalled libraries (by means of wrapper scripts in the case of shared libraries). When you use "make install" to install shared libraries, pcre2grep and pcre2test are automatically re-built to use the newly installed shared libraries before being installed themselves. However, the versions left in the build directory still use the uninstalled libraries. To build PCRE2 using static libraries only you must use --disable-shared when configuring it. For example: ./configure --prefix=/usr/gnu --disable-shared Then run "make" in the usual way. Similarly, you can use --disable-static to build only shared libraries. Note, however, that when you build only static libraries, binary programs such as pcre2test and pcre2grep may still be dynamically linked with other libraries (for example, libc) unless you set LDFLAGS to --static when running "configure". Cross-compiling using autotools ------------------------------- You can specify CC and CFLAGS in the normal way to the "configure" command, in order to cross-compile PCRE2 for some other host. However, you should NOT specify --enable-rebuild-chartables, because if you do, the pcre2_dftables.c source file is compiled and run on the local host, in order to generate the inbuilt character tables (the pcre2_chartables.c file).
This will probably not work, because pcre2_dftables.c needs to be compiled with the local compiler, not the cross compiler. When --enable-rebuild-chartables is not specified, pcre2_chartables.c is created by making a copy of pcre2_chartables.c.dist, which is a default set of tables that assumes ASCII code. Cross-compiling with the default tables should not be a problem. If you need to modify the character tables when cross-compiling, you should move pcre2_chartables.c.dist out of the way, then compile pcre2_dftables.c by hand and run it on the local host to make a new version of pcre2_chartables.c.dist. See the pcre2build section "Creating character tables at build time" for more details. Making new tarballs ------------------- The command "make dist" creates three PCRE2 tarballs, in tar.gz, tar.bz2, and zip formats. The command "make distcheck" does the same, but then does a trial build of the new distribution to ensure that it works. If you have modified any of the man page sources in the doc directory, you should first run the maint/UpdateAlways script before making a distribution. This script creates the .txt and HTML forms of the documentation from the man pages. Testing PCRE2 ------------- To test the basic PCRE2 library on a Unix-like system, run the RunTest script. There is another script called RunGrepTest that tests the pcre2grep command. When the 8-bit library is built, a test program for the POSIX wrapper, called pcre2posix_test, is compiled, and when JIT support is enabled, a test program called pcre2_jit_test is built. The scripts and the program tests are all run when you obey "make check". For other environments, see the instructions in NON-AUTOTOOLS-BUILD. The RunTest script runs the pcre2test test program (which is documented in its own man page) on each of the relevant testinput files in the testdata directory, and compares the output with the contents of the corresponding testoutput files. 
RunTest places its output in directories testoutput{8,16,32}{,-jit,-dfa}. Other files whose names begin with "test" are used as working files in some tests. Some tests are relevant only when certain build-time options were selected. For example, the tests for UTF-8/16/32 features are run only when Unicode support is available. RunTest outputs a comment when it skips a test. Many (but not all) of the tests that are not skipped are run twice if JIT support is available. On the second run, JIT compilation is forced. This testing can be suppressed by putting "-nojit" on the RunTest command line. The entire set of tests is run once for each of the 8-bit, 16-bit and 32-bit libraries that are enabled. If you want to run just one set of tests, call RunTest with either the -8, -16 or -32 option. If valgrind is installed, you can run the tests under it by putting "-valgrind" on the RunTest command line. To run pcre2test on just one or more specific test files, give their numbers as arguments to RunTest, for example: RunTest 2 7 11 You can also specify ranges of tests such as 3-6 or 3- (meaning 3 to the end), or a number preceded by ~ to exclude a test. For example: RunTest 3-15 ~10 This runs tests 3 to 15, excluding test 10, and just ~13 runs all the tests except test 13. Whatever order the arguments are in, the tests are always run in numerical order. You can also call RunTest with the single argument "list" to cause it to output a list of tests. The test sequence starts with "test 0", which is a special test that has no input file, and whose output is not checked. This is because it will be different on different hardware and with different configurations. The test exists in order to exercise some of pcre2test's code that would not otherwise be run. Tests 1 and 2 can always be run, as they expect only plain text strings (not UTF) and make no use of Unicode properties.
The first test file can be fed directly into the perltest.sh script to check that Perl gives the same results. The only difference you should see is in the first few lines, where the Perl version is given instead of the PCRE2 version. The second set of tests check auxiliary functions, error detection, and run-time flags that are specific to PCRE2. It also uses the debugging flags to check some of the internals of pcre2_compile(). If you build PCRE2 with a locale setting that is not the standard C locale, the character tables may be different (see next paragraph). In some cases, this may cause failures in the second set of tests. For example, in a locale where the isprint() function yields TRUE for characters in the range 128-255, the use of [:isascii:] inside a character class defines a different set of characters, and this shows up in this test as a difference in the compiled code, which is being listed for checking. For example, where the comparison test output contains [\x00-\x7f] the test might contain [\x00-\xff], and similarly in some other cases. This is not a bug in PCRE2. Test 3 checks pcre2_maketables(), the facility for building a set of character tables for a specific locale and using them instead of the default tables. The script uses the "locale" command to check for the availability of the "fr_FR", "french", or "fr" locale, and uses the first one that it finds. If the "locale" command fails, or if its output doesn't include "fr_FR", "french", or "fr" in the list of available locales, the third test cannot be run, and a comment is output to say why. If running this test produces an error like this: ** Failed to set locale "fr_FR" it means that the given locale is not available on your system, despite being listed by "locale". This does not mean that PCRE2 is broken. There are three alternative output files for the third test, because three different versions of the French locale have been encountered. 
The test passes if its output matches any one of them. Tests 4 and 5 check UTF and Unicode property support, test 4 being compatible with the perltest.sh script, and test 5 checking PCRE2-specific things. Tests 6 and 7 check the pcre2_dfa_match() alternative matching function, in non-UTF mode and UTF-mode with Unicode property support, respectively. Test 8 checks some internal offsets and code size features, but it is run only when Unicode support is enabled. The output is different in 8-bit, 16-bit, and 32-bit modes and for different link sizes, so there are different output files for each mode and link size. Tests 9 and 10 are run only in 8-bit mode, and tests 11 and 12 are run only in 16-bit and 32-bit modes. These are tests that generate different output in 8-bit mode. Each pair are for general cases and Unicode support, respectively. Test 13 checks the handling of non-UTF characters greater than 255 by pcre2_dfa_match() in 16-bit and 32-bit modes. Test 14 contains some special UTF and UCP tests that give different output for different code unit widths. Test 15 contains a number of tests that must not be run with JIT. They check, among other non-JIT things, the match-limiting features of the interpretive matcher. Test 16 is run only when JIT support is not available. It checks that an attempt to use JIT has the expected behaviour. Test 17 is run only when JIT support is available. It checks JIT complete and partial modes, match-limiting under JIT, and other JIT-specific features. Tests 18 and 19 are run only in 8-bit mode. They check the POSIX interface to the 8-bit library, without and with Unicode support, respectively. Test 20 checks the serialization functions by writing a set of compiled patterns to a file, and then reloading and checking them. Tests 21 and 22 test \C support when the use of \C is not locked out, without and with UTF support, respectively. Test 23 tests \C when it is locked out. 
Tests 24 and 25 test the experimental pattern conversion functions, without and with UTF support, respectively. Test 26 checks Unicode property support using tests that were generated automatically from the Unicode data tables. These are the archived version of the tests from Unicode 15. Test 27 checks Unicode property support using tests that are generated automatically from the currently-used Unicode data tables. Test 28 tests EBCDIC support, and is only run when PCRE2 is specifically compiled for EBCDIC. Test 29 tests EBCDIC when NL has been configured to be 0x25. Character tables ---------------- For speed, PCRE2 uses four tables for manipulating and identifying characters whose code point values are less than 256. By default, a set of tables that is built into the library is used. The pcre2_maketables() function can be called by an application to create a new set of tables in the current locale. These are passed to PCRE2 by calling pcre2_set_character_tables() to put a pointer into a compile context. The source file called pcre2_chartables.c contains the default set of tables. By default, this is created as a copy of pcre2_chartables.c.dist, which contains tables for ASCII coding. However, if --enable-rebuild-chartables is specified for ./configure, a new version of pcre2_chartables.c is built by the program pcre2_dftables (compiled from pcre2_dftables.c), which uses the ANSI C character handling functions such as isalnum(), isalpha(), isupper(), islower(), etc. to build the table sources. This means that the default C locale that is set for your system will control the contents of these default tables. You can change the default tables by editing pcre2_chartables.c and then re-building PCRE2. If you do this, you should take care to ensure that the file does not get automatically re-generated. The best way to do this is to move pcre2_chartables.c.dist out of the way and replace it with your customized tables.
When the pcre2_dftables program is run as a result of specifying --enable-rebuild-chartables, it uses the default C locale that is set on your system. It does not pay attention to the LC_xxx environment variables. In other words, it uses the system's default locale rather than whatever the compiling user happens to have set. If you really do want to build a source set of character tables in a locale that is specified by the LC_xxx variables, you can run the pcre2_dftables program by hand with the -L option. For example: ./pcre2_dftables -L pcre2_chartables.c.special The second argument names the file where the source code for the tables is written. The first two 256-byte tables provide lower casing and case flipping functions, respectively. The next table consists of a number of 32-byte bit maps which identify certain character classes such as digits, "word" characters, white space, etc. These are used when building 32-byte bit maps that represent character classes for code points less than 256. The final 256-byte table has bits indicating various character types, as follows: 1 white space character 2 letter 4 lower case letter 8 decimal digit 16 alphanumeric or '_' You can also specify -b (with or without -L) when running pcre2_dftables. This causes the tables to be written in binary instead of as source code. A set of binary tables can be loaded into memory by an application and passed to pcre2_compile() in the same way as tables created dynamically by calling pcre2_maketables(). The tables are just a string of bytes, independent of hardware characteristics such as endianness. This means they can be bundled with an application that runs in different environments, to ensure consistent behaviour. See also the pcre2build section "Creating character tables at build time". File manifest ------------- The distribution should contain the files listed below. 
(A) Source files for the PCRE2 library functions and their headers are found in the src directory: src/pcre2_dftables.c auxiliary program for building pcre2_chartables.c when --enable-rebuild-chartables is specified src/pcre2_chartables.c.dist a default set of character tables that assume ASCII coding; unless --enable-rebuild-chartables is specified, used by copying to pcre2_chartables.c src/pcre2_chartables.c.ebcdic-1047-{nl15,nl25} a default set of character tables for EBCDIC 1047; used if --enable-ebcdic-ignoring-compiler is specified without --enable-rebuild-chartables src/pcre2posix.c ) src/pcre2_auto_possess.c ) src/pcre2_chkdint.c ) src/pcre2_compile.c ) src/pcre2_compile_cgroup.c ) src/pcre2_compile_class.c ) src/pcre2_config.c ) src/pcre2_context.c ) src/pcre2_convert.c ) src/pcre2_dfa_match.c ) src/pcre2_error.c ) src/pcre2_extuni.c ) src/pcre2_find_bracket.c ) src/pcre2_jit_compile.c ) src/pcre2_maketables.c ) sources for the functions in the library, src/pcre2_match.c ) and some internal functions that they use src/pcre2_match_data.c ) src/pcre2_match_next.c ) src/pcre2_newline.c ) src/pcre2_ord2utf.c ) src/pcre2_pattern_info.c ) src/pcre2_script_run.c ) src/pcre2_serialize.c ) src/pcre2_string_utils.c ) src/pcre2_study.c ) src/pcre2_substitute.c ) src/pcre2_substring.c ) src/pcre2_tables.c ) src/pcre2_ucd.c ) src/pcre2_valid_utf.c ) src/pcre2_xclass.c ) src/pcre2_fuzzsupport.c function for (optional) fuzzing support src/config.h.in template for config.h, when built by "configure" src/pcre2.h.in template for pcre2.h when built by "configure" src/pcre2posix.h header for the external POSIX wrapper API src/pcre2_compile.h header for internal use src/pcre2_internal.h header for internal use src/pcre2_intmodedep.h a mode-specific internal header src/pcre2_jit_char_inc.h header used by JIT src/pcre2_jit_match_inc.h header used by JIT src/pcre2_jit_misc_inc.h header used by JIT src/pcre2_jit_simd_inc.h header used by JIT src/pcre2_printint_inc.h debugging 
function that is used by pcre2test src/pcre2_ucp.h header for Unicode property handling src/pcre2_ucptables_inc.h header with Unicode data tables src/pcre2_util.h header for internal utils deps/sljit/sljit_src/* source files for the JIT compiler (B) Source files for programs that use PCRE2: src/pcre2demo.c simple demonstration of coding calls to PCRE2 src/pcre2grep.c source of a grep utility that uses PCRE2 src/pcre2test.c comprehensive test program src/pcre2test_inc.h header used by pcre2test src/pcre2_jit_test.c JIT test program src/pcre2posix_test.c POSIX wrapper API test program (C) Auxiliary files: AUTHORS.md information about the authors of PCRE2 ChangeLog log of changes to the code HACKING some notes about the internals of PCRE2 INSTALL generic installation instructions LICENCE.md conditions for the use of PCRE2 COPYING the same, using GNU's standard name SECURITY.md information on reporting vulnerabilities Makefile.in ) template for Unix Makefile, which is built by ) "configure" Makefile.am ) the automake input that was used to create ) Makefile.in NEWS important changes in this release NON-AUTOTOOLS-BUILD notes on building PCRE2 without using autotools README this file RunTest a Unix shell script for running tests RunGrepTest a Unix shell script for pcre2grep tests RunTest.bat a Windows batch file for running tests RunGrepTest.bat a Windows batch file for pcre2grep tests aclocal.m4 m4 macros (generated by "aclocal") m4/* m4 macros (used by autoconf) configure a configuring shell script (built by autoconf) configure.ac ) the autoconf input that was used to build ) "configure" and config.h doc/*.3 man page sources for PCRE2 doc/*.1 man page sources for pcre2grep and pcre2test doc/html/* HTML documentation doc/pcre2.txt plain text version of the man pages doc/pcre2-config.txt plain text documentation of pcre2-config script doc/pcre2grep.txt plain text documentation of grep utility program doc/pcre2test.txt plain text documentation of test program 
libpcre2-8.pc.in template for libpcre2-8.pc for pkg-config libpcre2-16.pc.in template for libpcre2-16.pc for pkg-config libpcre2-32.pc.in template for libpcre2-32.pc for pkg-config libpcre2-posix.pc.in template for libpcre2-posix.pc for pkg-config ar-lib ) config.guess ) config.sub ) depcomp ) helper tools generated by libtool and compile ) automake, used internally by ./configure install-sh ) ltmain.sh ) missing ) test-driver ) perltest.sh Script for running a Perl test program pcre2-config.in source of script which retains PCRE2 information testdata/testinput* test data for main library tests testdata/testoutput* expected test results testdata/grep* input and output for pcre2grep tests testdata/* other supporting test files src/libpcre2-8.sym.in ) src/libpcre2-16.sym.in ) symbol version script templates for the src/libpcre2-32.sym.in ) GNU, BSD and Sun linkers src/libpcre2-posix.sym.in ) (D) Auxiliary files for CMake support cmake/COPYING-CMAKE-SCRIPTS cmake/FindEditline.cmake cmake/FindReadline.cmake cmake/pcre2-config.cmake.in cmake/PCRE2CheckVscript.cmake cmake/PCRE2UseSystemExtensions.cmake cmake/PCRE2WarningAsError.cmake src/config-cmake.h.in CMakeLists.txt (E) Auxiliary files for building PCRE2 "by hand" src/pcre2.h.generic ) a version of the public PCRE2 header file ) for use in non-"configure" environments src/config.h.generic ) a version of config.h for use in non-"configure" ) environments (F) Auxiliary files for building PCRE2 using other build systems BUILD.bazel ) files used by the Bazel MODULE.bazel ) build system build.zig file used by zig's build system (G) Auxiliary files for building PCRE2 under OpenVMS vms/configure.com ) vms/openvms_readme.txt ) These files were contributed by a PCRE2 user. 
vms/pcre2.h_patch ) vms/stdint.h ) ============================= Last updated: 15 October 2025 ============================= ================================================ FILE: README.md ================================================ PCRE2: Perl-Compatible Regular Expressions ## Overview The PCRE2 library is a set of C functions that implement **regular expression pattern matching**. It is **self-contained and portable**, and designed to be **easy to embed** into existing projects and build systems, on almost **any platform** or build target. The PCRE2 library is **free and open-source** (BSD licence), and permitted in proprietary software. It supports Unicode matching and a very wide range of regular expression features. It accepts input in various character encodings, and optionally includes a highly **performant JIT matching engine**. PCRE2 is **mature and highly-trusted**: bundled in dozens or hundreds of open-source and commercial products, such as Excel, Safari, Apache, and Git, and used as the basis for regular expressions in several programming languages including PHP and R.
Website https://pcre2project.github.io/pcre2/
Distribution [![GitHub Release](https://img.shields.io/github/v/release/PCRE2Project/pcre2?display_name=release&style=flat-square&label=Latest%20release&color=006094)](https://github.com/PCRE2Project/pcre2/releases)  [![BSD licence](https://img.shields.io/badge/Licence-BSD%203--clause-006094?style=flat-square)](https://github.com/PCRE2Project/pcre2/blob/main/LICENCE.md)
Testing [![Codecov](https://img.shields.io/codecov/c/github/PCRE2Project/pcre2?component=library&style=flat-square&logo=codecov&label=Coverage&color=009400)](https://app.codecov.io/gh/PCRE2Project/pcre2/components)  [![Clang Sanitizers](https://img.shields.io/badge/Clang-Sanitizers-262D3A?style=flat-square&logo=llvm&color=006094)](https://github.com/PCRE2Project/pcre2/actions/workflows/dev.yml)  [![Clang Static Analyzer](https://img.shields.io/badge/Clang-Static%20Analyzer-262D3A?style=flat-square&logo=llvm&color=006094)](https://github.com/PCRE2Project/pcre2/actions/workflows/clang-analyzer.yml)  [![Valgrind](https://img.shields.io/badge/Valgrind-006094?style=flat-square)](https://github.com/PCRE2Project/pcre2/actions/workflows/dev.yml)  [![Coverity Scan](https://img.shields.io/coverity/scan/pcre2?style=flat-square&label=Coverity&color=009400)](https://scan.coverity.com/projects/pcre2?tab=overview)  [![CodeQL](https://img.shields.io/badge/GitHub-CodeQL-006094?style=flat-square)](https://github.com/PCRE2Project/pcre2/actions/workflows/codeql.yml)  [![OSS-Fuzz](https://img.shields.io/badge/Google-OSS--Fuzz-006094?style=flat-square)](https://google.github.io/oss-fuzz/)  [![OSSF-Scorecard Score](https://img.shields.io/ossf-scorecard/github.com/PCRE2Project/pcre2?style=flat-square&label=OSSF-Scorecard&color=009400)](https://scorecard.dev/viewer/?uri=github.com%2FPCRE2Project%2Fpcre2) 
Platforms Tested continuously on Linux, Windows, macOS, FreeBSD, OpenBSD, Solaris, z/OS;
x86, ARM, RISC-V, POWER, S390X; others known to work
## Quickstart Recording of a terminal session showing the PCRE2 quickstart; reproduced in text form below
Show script ```bash session # Fetch PCRE2 with 'git clone', or use curl/wget to download a release. # Here, let's use git to check out a release tag: git clone https://github.com/PCRE2Project/pcre2.git ./pcre2 \ --branch pcre2-$PCRE2_VERSION \ -c advice.detachedHead=false --depth 1 # If using the JIT, remember to fetch the Git submodule: (cd ./pcre2; git submodule update --init) # Now let's build PCRE2: (cd ./pcre2; \ cmake -G Ninja -DCMAKE_BUILD_TYPE=Debug \ -DPCRE2_SUPPORT_JIT=ON -B build; \ cmake --build build/) # Great, PCRE2 is built. # Here's a quick little demo to show how we can make use of PCRE2. # For a fuller example, see './pcre2/src/pcre2demo.c'. # See below for the demo code. # Compile the demo: gcc -g -I./pcre2/build -L./pcre2/build demo.c -o demo -lpcre2-8 # Finally, run our demo: ./demo 'c.t' 'dogs and cats' # We fetched, built, and called PCRE2 successfully! :) ``` File `demo.c`: ```c /* Set PCRE2_CODE_UNIT_WIDTH to indicate we will use 8-bit input. */ #define PCRE2_CODE_UNIT_WIDTH 8 #include <pcre2.h> #include <string.h> /* for strlen */ #include <stdio.h> /* for printf */ int main(int argc, char* argv[]) { if (argc != 3) { fprintf(stderr, "Usage: %s <pattern> <subject>\n", argv[0]); return 1; } const char *pattern = argv[1]; const char *subject = argv[2]; /* Compile the pattern. */ int error_number; PCRE2_SIZE error_offset; pcre2_code *re = pcre2_compile( pattern, /* the pattern */ PCRE2_ZERO_TERMINATED, /* indicates pattern is zero-terminated */ 0, /* default options */ &error_number, /* for error number */ &error_offset, /* for error offset */ NULL); /* use default compile context */ if (re == NULL) { fprintf(stderr, "Invalid pattern: %s\n", pattern); return 1; } /* Match the pattern against the subject text.
*/ pcre2_match_data *match_data = pcre2_match_data_create_from_pattern(re, NULL); int rc = pcre2_match( re, /* the compiled pattern */ subject, /* the subject text */ strlen(subject), /* the length of the subject */ 0, /* start at offset 0 in the subject */ 0, /* default options */ match_data, /* block for storing the result */ NULL); /* use default match context */ /* Print the match result. */ if (rc == PCRE2_ERROR_NOMATCH) { printf("No match\n"); } else if (rc < 0) { fprintf(stderr, "Matching error\n"); } else { PCRE2_SIZE *ovector = pcre2_get_ovector_pointer(match_data); printf("Found match: '%.*s'\n", (int)(ovector[1] - ovector[0]), subject + ovector[0]); } pcre2_match_data_free(match_data); /* Free resources */ pcre2_code_free(re); return 0; } ```
--- The main ways of obtaining PCRE2 are: 1. Via Git clone: ``` git clone https://github.com/PCRE2Project/pcre2.git ``` Please use a release tag in production, not the development branch! Because PCRE2's JIT uses code from a Git submodule, you must check this out after a fresh clone: ``` git submodule update --init ``` 2. Via download of the [release tarball](https://github.com/PCRE2Project/pcre2/releases/latest). 3. Finally, PCRE2 is also bundled by various downstream package managers (such as Linux distributions, or [vcpkg](https://vcpkg.io/)). These are provided by third parties, not the PCRE2 project. The main ways of building PCRE2 are: 1. Via CMake (Linux/Windows/macOS, and others) ``` cd pcre2/ cmake -B build . cmake --build build/ ``` 2. Via Autoconf (Linux/Unix) ``` cd pcre2/ ./configure make ``` See ["Platforms"](#platforms) below for links to more detailed build documentation. ## API Overview The PCRE2 API supports strings in 8-bit, 16-bit, and 32-bit encodings, with or without UTF encoding. There is also EBCDIC support. The default regular expression dialect closely matches the syntax and behaviour of Perl 5, with PCRE2-specific extensions. A wide variety of granular flags can be passed to the PCRE2 API to customise this to more closely follow other dialects such as JavaScript or Python. The default matching engine uses a depth-first tree search with backtracking, which is highly feature-rich but has worst-case exponential time (PCRE2 allows aborting the match if a time limit is exceeded, expressed as a maximum number of steps in the tree search). The second matching engine uses a JIT for greatly improved performance, compiling the regular expression to a block of equivalent native machine code. PCRE2 has a third matching engine, using a DFA engine which is generally slower, but has worst-case polynomial matching time and is able to find the POSIX-style "leftmost-longest" match.
There are accompanying utility functions for converting glob patterns and POSIX BRE/ERE patterns to PCRE2 regular expressions; and also for performing high-level regular expression operations such as search-and-replace with a powerful replacement string syntax. As well as the PCRE2 API, the library also offers a POSIX-compatible `<regex.h>` header and `regexec()` function. However, this does not provide the ability to pass PCRE2 flags, so we recommend users consume the PCRE2 API if possible. See the [full library and API documentation](https://pcre2project.github.io/pcre2/doc/) for further details. For third-party documentation, see further: - A curated summary of changes for each PCRE release, and some excellent tutorials on PCRE2 on the [RexEgg website](http://www.rexegg.com/pcre-documentation.html). - Jan Goyvaerts' popular Regular-Expressions.info site includes [information about PCRE2](https://www.regular-expressions.info/pcre.html) as well as tutorials and highly detailed comparisons of PCRE2 to other regular expression dialects. - Jeffrey Friedl's book [_Mastering Regular Expressions_](https://regex.info/book.html) includes chapters on Perl and PCRE, and is available in print and online via O'Reilly Media. ## Platforms PCRE2 is portable C code, and is likely to work on any system with a C99 compiler.
Operating systems
Our continuous integration tests on Linux (GCC and Clang, glibc and musl), Windows (MSVC and MinGW-x64), and macOS (Clang), as well as FreeBSD, OpenBSD, Solaris (Oracle Studio cc), and z/OS (xlc and ibm-clang).
Processors
PCRE2 is tested continuously on x86 (i686 and amd64), ARM 32- and 64-bit (armv7 and aarch64), RISC-V (riscv64), POWER (ppc64le), and the big-endian S390x.
Other systems are likely to work (including mobile, embedded platforms, and commercial UNIX systems), but these are not tested continuously by the PCRE2 maintainers. Users are encouraged to run the full PCRE2 test suite when compiling for any new platform. We are aware of working ports to VMS and z/OS (PCRE2 supports EBCDIC). PCRE2 releases support CMake for building, and for UNIX platforms include a `./configure` script built by Autoconf. Build files for the Bazel build system and `zig build` are also included. Integrating PCRE2 with other systems can be done by including the `.c` files in an existing project. Please see the files [README](./README) and [NON-AUTOTOOLS-BUILD](./NON-AUTOTOOLS-BUILD) for full build documentation, as well as the man pages, including [`man pcre2/doc/pcre2build.3`](https://pcre2project.github.io/pcre2/doc/pcre2build/). ## Licence PCRE2 is released under the **BSD 3-clause licence** with a PCRE2 Exception. It is open-source and also corporate-friendly. - See [LICENCE](./LICENCE.md) for legal text. - See [AUTHORS](./AUTHORS.md) for details of the current maintainers of PCRE2 and acknowledgements of its contributors, including Philip Hazel, the original author. ## Contributing & support Join the community by reporting issues or asking questions via [GitHub issues](https://github.com/PCRE2Project/pcre2/issues). We welcome feedback and proposals. Contributions ranging from bug fixes to feature requests are welcome, and can be made via GitHub pull requests. Please review our [SECURITY](./SECURITY.md) policy for information on reporting security issues. Release announcements will be made via the [pcre2-dev@googlegroups.com](https://groups.google.com/g/pcre2-dev) mailing list, where you can also start discussions about PCRE2 issues and development. You can browse the [list archives](https://groups.google.com/g/pcre2-dev). ================================================ FILE: RunGrepTest ================================================ #! 
/bin/sh # Run pcre2grep tests. The assumption is that the PCRE2 tests check the library # itself. What we are checking here is the file handling and options that are # supported by pcre2grep. This script must be run in the build directory. # CODING CONVENTIONS: # * Put printf arguments in single, not double quotes to avoid unwanted # escaping. # * Use \0 for binary zero in printf, not \x0, for the benefit of older # versions (and use octal for other special values). # Set the C locale, so that sort(1) behaves predictably. LC_ALL=C export LC_ALL # Remove any non-default colouring and aliases that the caller may have set. unset PCRE2GREP_COLOUR PCRE2GREP_COLOR PCREGREP_COLOUR PCREGREP_COLOR unset GREP_COLOR GREP_COLORS unset cp ls mv rm # Remember the current (build) directory, set the program to be tested, and # valgrind settings when requested. builddir=`pwd` : ${pcre2grep:=$builddir/pcre2grep} : ${pcre2test:=$builddir/pcre2test} if [ ! -x $pcre2grep ] ; then echo "** $pcre2grep does not exist or is not executable." exit 1 fi if [ ! -x $pcre2test ] ; then echo "** $pcre2test does not exist or is not executable." exit 1 fi valgrind= while [ $# -gt 0 ] ; do case $1 in valgrind|-valgrind) valgrind="valgrind -q --leak-check=no --smc-check=all-non-file --error-exitcode=70";; *) echo "RunGrepTest: Unknown argument $1"; exit 1;; esac shift done pcre2grep_version=`$pcre2grep -V` if [ "$valgrind" = "" ] ; then echo "Testing $pcre2grep_version" else echo "Testing $pcre2grep_version using valgrind" fi # Set up a suitable "diff" command for comparison. Some systems have a diff # that lacks a -u option. Try to deal with this; better do the test for the -b # option as well. cf="diff" diff -b /dev/null /dev/null 2>/dev/null && cf="diff -b" diff -u /dev/null /dev/null 2>/dev/null && cf="diff -u" diff -ub /dev/null /dev/null 2>/dev/null && cf="diff -ub" # Add a -a (always treat as text) if available. 
This was added in an attempt # to get more detail from an Alpine Linux test failure on GitHub. $cf -a /dev/null /dev/null 2>/dev/null && cf="$cf -a" # Some tests involve NUL characters. It seems impossible to handle them easily # in many operating systems. An earlier version of this script used sed to # translate NUL into the string ZERO, but this didn't work on Solaris (aka # SunOS), where the version of sed explicitly doesn't like them, and also MacOS # (Darwin), OpenBSD, FreeBSD, NetBSD, and some Linux distributions like Alpine, # even when using GNU sed. A user suggested using tr instead, which # necessitates translating to a single character. However, on (some versions # of?) Solaris, the normal "tr" cannot handle binary zeros, but if # /usr/xpg4/bin/tr is available, it can do so, so test for that. if [ -x /usr/xpg4/bin/tr ] ; then tr=/usr/xpg4/bin/tr else tr=tr fi # If this test is being run from "make check", $srcdir will be set. If not, set # it to the current or parent directory, whichever one contains the test data. # Subsequently, we run most of the pcre2grep tests in the source directory so # that the file names in the output are always the same. if [ -z "$srcdir" -o ! -d "$srcdir/testdata" ] ; then if [ -d "./testdata" ] ; then srcdir=. elif [ -d "../testdata" ] ; then srcdir=.. else echo "Cannot find the testdata directory" exit 1 fi fi # Set up the path to the valgrind JIT suppressions vjs= if [ "$valgrind" != "" ] ; then $pcre2test -C jit >/dev/null if [ $? -ne 0 ]; then vjs="--suppressions=`realpath "$srcdir"`/testdata/valgrind-jit.supp" fi fi # Check for the availability of UTF-8 support $pcre2test -C unicode >/dev/null utf8=$? # Check default newline convention. If it does not include LF, force LF. 
nl=`$pcre2test -C newline`
if [ "$nl" != "LF" -a "$nl" != "ANY" -a "$nl" != "ANYCRLF" ]; then
  # The expected output assumes LF-terminated lines, so override a build-time
  # default that would not recognise LF as a line ending.
  pcre2grep="$pcre2grep -N LF"
  echo "Default newline setting forced to LF"
fi

# ------ Function to run and check a special pcre2grep arguments test -------

# Usage: checkspecial ARGS EXPECTED_RC
#   ARGS         a single string of pcre2grep arguments; it is deliberately
#                expanded unquoted so that it splits into separate words
#   EXPECTED_RC  the exit status that pcre2grep must return
# Both stdout and stderr of the run are appended to testtrygrep. If the exit
# status differs from EXPECTED_RC, a message is printed and the whole script
# exits with status 1.
#
# $vjs is passed exactly as in the numbered tests: it holds the valgrind JIT
# suppressions option when running under valgrind with JIT available, and is
# empty otherwise. Without it, valgrind's --error-exitcode could replace the
# exit status being checked here.

checkspecial()
  {
  $valgrind $vjs $pcre2grep $1 >>testtrygrep 2>&1
  if [ $? -ne $2 ] ; then
    echo "** pcre2grep $1 failed - check testtrygrep"
    exit 1
  fi
  }

# ------ Normal tests ------

# Each test writes a header line to testtrygrep, runs pcre2grep from the
# source directory (so that file names in the output are always the same),
# and then records the exit status. Test 1 truncates testtrygrep; all later
# output is appended.

echo "Testing pcre2grep main features"

echo "---------------------------- Test 1 ------------------------------" >testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep PATTERN ./testdata/grepinput) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 2 ------------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep '^PATTERN' ./testdata/grepinput) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 3 ------------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -in PATTERN ./testdata/grepinput) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 4 ------------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -ic PATTERN ./testdata/grepinput) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 5 ------------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -in PATTERN ./testdata/grepinput ./testdata/grepinputx) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 6 ------------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -inh PATTERN ./testdata/grepinput ./testdata/grepinputx) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 7 ------------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -il PATTERN ./testdata/grepinput ./testdata/grepinputx) >>testtrygrep
echo "RC=$?"
>>testtrygrep echo "---------------------------- Test 8 ------------------------------" >>testtrygrep (cd $srcdir; $valgrind $vjs $pcre2grep -l PATTERN ./testdata/grepinput ./testdata/grepinputx) >>testtrygrep echo "RC=$?" >>testtrygrep echo "---------------------------- Test 9 ------------------------------" >>testtrygrep (cd $srcdir; $valgrind $vjs $pcre2grep -q PATTERN ./testdata/grepinput ./testdata/grepinputx) >>testtrygrep echo "RC=$?" >>testtrygrep echo "---------------------------- Test 10 -----------------------------" >>testtrygrep (cd $srcdir; $valgrind $vjs $pcre2grep -q NEVER-PATTERN ./testdata/grepinput ./testdata/grepinputx) >>testtrygrep echo "RC=$?" >>testtrygrep echo "---------------------------- Test 11 -----------------------------" >>testtrygrep (cd $srcdir; $valgrind $vjs $pcre2grep -vn pattern ./testdata/grepinputx) >>testtrygrep echo "RC=$?" >>testtrygrep echo "---------------------------- Test 12 -----------------------------" >>testtrygrep (cd $srcdir; $valgrind $vjs $pcre2grep -ix pattern ./testdata/grepinputx) >>testtrygrep echo "RC=$?" >>testtrygrep echo "---------------------------- Test 13 -----------------------------" >>testtrygrep echo seventeen >testtemp1grep (cd $srcdir; $valgrind $vjs $pcre2grep -f./testdata/greplist -f $builddir/testtemp1grep ./testdata/grepinputx) >>testtrygrep echo "RC=$?" >>testtrygrep echo "---------------------------- Test 14 -----------------------------" >>testtrygrep (cd $srcdir; $valgrind $vjs $pcre2grep -w pat ./testdata/grepinput ./testdata/grepinputx) >>testtrygrep echo "RC=$?" >>testtrygrep echo "---------------------------- Test 15 -----------------------------" >>testtrygrep (cd $srcdir; $valgrind $vjs $pcre2grep 'abc^*' ./testdata/grepinput) >>testtrygrep 2>&1 echo "RC=$?" >>testtrygrep echo "---------------------------- Test 16 -----------------------------" >>testtrygrep (cd $srcdir; $valgrind $vjs $pcre2grep abc ./testdata/grepinput ./testdata/nonexistfile) >>testtrygrep 2>&1 echo "RC=$?" 
>>testtrygrep

# Tests 17-20: -M with patterns that contain \n.
# Tests 21-25: context options -A, -B and -C.

echo "---------------------------- Test 17 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -M 'the\noutput' ./testdata/grepinput) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 18 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -Mn '(the\noutput|dog\.\n--)' ./testdata/grepinput) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 19 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -Mix 'Pattern' ./testdata/grepinputx) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 20 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -Mixn 'complete pair\nof lines' ./testdata/grepinputx) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 21 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -nA3 'four' ./testdata/grepinputx) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 22 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -nB3 'four' ./testdata/grepinputx) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 23 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -C3 'four' ./testdata/grepinputx) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 24 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -A9 'four' ./testdata/grepinputx) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 25 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -nB9 'four' ./testdata/grepinputx) >>testtrygrep
# Output redirection for this echo follows on the next line.
echo "RC=$?" \
>>testtrygrep

# Tests 26-31: more context-option combinations.
# Tests 32-34: -L, and a nonexistent input file with and without -s.

echo "---------------------------- Test 26 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -A9 -B9 'four' ./testdata/grepinputx) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 27 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -A10 'four' ./testdata/grepinputx) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 28 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -nB10 'four' ./testdata/grepinputx) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 29 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -C12 -B10 'four' ./testdata/grepinputx) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 30 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -inB3 'pattern' ./testdata/grepinput ./testdata/grepinputx) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 31 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -inA3 'pattern' ./testdata/grepinput ./testdata/grepinputx) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 32 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -L 'fox' ./testdata/grepinput ./testdata/grepinputx) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 33 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep 'fox' ./testdata/grepnonexist) >>testtrygrep 2>&1
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 34 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -s 'fox' ./testdata/grepnonexist) >>testtrygrep 2>&1
# Output redirection for this echo follows on the next line.
echo "RC=$?" \
>>testtrygrep

# Tests 35-36: recursive search with include/exclude filters; the file list is
# piped through sort, so RC is the exit status of the pipeline's last command.
# Test 37: stderr is captured separately and appended after the RC line.
# Tests 38-42: lines around a binary zero in the input.

echo "---------------------------- Test 35 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -L -r --include=grepinputx --include grepinput8 --exclude-dir='^\.' 'fox' ./testdata | sort) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 36 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -L -r --include='grepinput[^C]' --exclude 'grepinput$' --exclude='grepinput(Bad)?8' --exclude=grepinputM --exclude=grepinputUN --exclude-dir='^\.' 'fox' ./testdata | sort) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 37 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep '^(a+)*\d' ./testdata/grepinput) >>testtrygrep 2>teststderrgrep
echo "RC=$?" >>testtrygrep
echo "======== STDERR ========" >>testtrygrep
cat teststderrgrep >>testtrygrep

echo "---------------------------- Test 38 ------------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep '>\x00<' ./testdata/grepinput) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 39 ------------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -A1 'before the binary zero' ./testdata/grepinput) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 40 ------------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -B1 'after the binary zero' ./testdata/grepinput) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 41 ------------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -B1 -o '\w+ the binary zero' ./testdata/grepinput) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 42 ------------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -B1 -onH '\w+ the binary zero' ./testdata/grepinput) >>testtrygrep
# Output redirection for this echo follows on the next line.
echo "RC=$?" \
>>testtrygrep

# Tests 43-45: -o with several patterns (alternation, -e, -f).
# Test 46: four runs, each with a syntactically bad pattern in a different
# position; stderr is captured.
# Tests 47-49: fixed-string matching (-F).

echo "---------------------------- Test 43 ------------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -on 'before|zero|after' ./testdata/grepinput) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 44 ------------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -on -e before -ezero -e after ./testdata/grepinput) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 45 ------------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -on -f ./testdata/greplist -e binary ./testdata/grepinput) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 46 ------------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -e 'unopened)' -e abc ./testdata/grepinput) >>testtrygrep 2>&1
echo "RC=$?" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -eabc -e '(unclosed' ./testdata/grepinput) >>testtrygrep 2>&1
echo "RC=$?" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -eabc -e xyz -e '[unclosed' ./testdata/grepinput) >>testtrygrep 2>&1
echo "RC=$?" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep --regex=123 -eabc -e xyz -e '[unclosed' ./testdata/grepinput) >>testtrygrep 2>&1
echo "RC=$?" >>testtrygrep

# NOTE(review): if the quoted pattern in tests 47-49 originally contained a
# literal newline between "AB.VE" and "elephant", whitespace collapsing will
# have turned it into the space shown here - confirm against expected output.
echo "---------------------------- Test 47 ------------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -Fx "AB.VE elephant" ./testdata/grepinput) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 48 ------------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -F "AB.VE elephant" ./testdata/grepinput) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 49 ------------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -F -e DATA -e "AB.VE elephant" ./testdata/grepinput) >>testtrygrep
# Output redirection for this echo follows on the next line.
echo "RC=$?" \
>>testtrygrep

# Tests 50-58: alternation, -Mv, colour output, offset reporting, counting
# with --exclude, and the --regex long option.

echo "---------------------------- Test 50 ------------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep "^(abc|def|ghi|jkl)" ./testdata/grepinputx) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 51 ------------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -Mv "brown\sfox" ./testdata/grepinputv) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 52 ------------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep --colour=always jumps ./testdata/grepinputv) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 53 ------------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep --file-offsets 'before|zero|after' ./testdata/grepinput) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 54 ------------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep --line-offsets 'before|zero|after' ./testdata/grepinput) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 55 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -f./testdata/greplist --color=always ./testdata/grepinputx) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 56 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -c --exclude=grepinputC lazy ./testdata/grepinput*) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 57 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -c -l --exclude=grepinputC lazy ./testdata/grepinput*) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 58 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep --regex=PATTERN ./testdata/grepinput) >>testtrygrep
# Output redirection for this echo follows on the next line.
echo "RC=$?" \
>>testtrygrep

# Tests 59-61: remaining spellings of the --regex/--regexp option.
# Tests 62-63: resource limits; these two run with --no-jit and without $vjs.
# Tests 64-66: -oN selecting a capture group.

echo "---------------------------- Test 59 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep --regexp=PATTERN ./testdata/grepinput) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 60 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep --regex PATTERN ./testdata/grepinput) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 61 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep --regexp PATTERN ./testdata/grepinput) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 62 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $pcre2grep --match-limit=1000 --no-jit -M 'This is a file(.|\R)*file.' ./testdata/grepinput) >>testtrygrep 2>&1
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 63 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $pcre2grep --recursion-limit=1K --no-jit -M 'This is a file(.|\R)*file.' ./testdata/grepinput) >>testtrygrep 2>&1
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 64 ------------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -o1 '(?<=PAT)TERN (ap(pear)s)' ./testdata/grepinput) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 65 ------------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -o2 '(?<=PAT)TERN (ap(pear)s)' ./testdata/grepinput) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 66 ------------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -o3 '(?<=PAT)TERN (ap(pear)s)' ./testdata/grepinput) >>testtrygrep
# Output redirection for this echo follows on the next line.
echo "RC=$?" \
>>testtrygrep

# Tests 67-68: more capture-group selection with -o / --only-matching.
# Tests 69-73: colour output combined with -v, -M, -n and -o.

echo "---------------------------- Test 67 ------------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -o12 '(?<=PAT)TERN (ap(pear)s)' ./testdata/grepinput) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 68 ------------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep --only-matching=2 '(?<=PAT)TERN (ap(pear)s)' ./testdata/grepinput) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 69 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -vn --colour=always pattern ./testdata/grepinputx) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 70 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep --color=always -M "triple:\t.*\n\n" ./testdata/grepinput3) >>testtrygrep
echo "RC=$?" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep --color=always -M -n "triple:\t.*\n\n" ./testdata/grepinput3) >>testtrygrep
echo "RC=$?" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -M "triple:\t.*\n\n" ./testdata/grepinput3) >>testtrygrep
echo "RC=$?" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -M -n "triple:\t.*\n\n" ./testdata/grepinput3) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 71 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -o "^01|^02|^03" ./testdata/grepinput) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 72 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep --color=always "^01|^02|^03" ./testdata/grepinput) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 73 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -o --colour=always "^01|^02|^03" ./testdata/grepinput) >>testtrygrep
# Output redirection for this echo follows on the next line.
echo "RC=$?" \
>>testtrygrep

# Tests 74-82: the same -o / colour / -o+colour triple applied to patterns
# that differ in which alternatives are anchored.

echo "---------------------------- Test 74 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -o "^01|02|^03" ./testdata/grepinput) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 75 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep --color=always "^01|02|^03" ./testdata/grepinput) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 76 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -o --colour=always "^01|02|^03" ./testdata/grepinput) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 77 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -o "^01|^02|03" ./testdata/grepinput) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 78 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep --color=always "^01|^02|03" ./testdata/grepinput) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 79 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -o --colour=always "^01|^02|03" ./testdata/grepinput) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 80 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -o "\b01|\b02" ./testdata/grepinput) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 81 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep --color=always "\\b01|\\b02" ./testdata/grepinput) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 82 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -o --colour=always "\\b01|\\b02" ./testdata/grepinput) >>testtrygrep
# Output redirection for this echo follows on the next line.
echo "RC=$?" \
>>testtrygrep

# Test 83: buffer-size options.
# Tests 84-85: --file-list.
# Tests 86-90: searching testdata/grepbinary with various options.

echo "---------------------------- Test 83 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep --buffer-size=10 --max-buffer-size=100 "^a" ./testdata/grepinput3) >>testtrygrep 2>&1
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 84 -----------------------------" >>testtrygrep
# A second file list is created in the build directory.
echo testdata/grepinput3 >testtemp1grep
(cd $srcdir; $valgrind $vjs $pcre2grep --file-list ./testdata/grepfilelist --file-list $builddir/testtemp1grep "fox|complete|t7") >>testtrygrep 2>&1
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 85 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep --file-list=./testdata/grepfilelist "dolor" ./testdata/grepinput3) >>testtrygrep 2>&1
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 86 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep "dog" ./testdata/grepbinary) >>testtrygrep 2>&1
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 87 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep "cat" ./testdata/grepbinary) >>testtrygrep 2>&1
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 88 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -v "cat" ./testdata/grepbinary) >>testtrygrep 2>&1
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 89 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -I "dog" ./testdata/grepbinary) >>testtrygrep 2>&1
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 90 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep --binary-files=without-match "dog" ./testdata/grepbinary) >>testtrygrep 2>&1
# Output redirection for this echo follows on the next line.
echo "RC=$?" \
>>testtrygrep

# Tests 91-93: the three spellings that treat grepbinary as text.
# Tests 94-97: include/exclude filtering, including --exclude-from with a
# list file written to the build directory.

echo "---------------------------- Test 91 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -a "dog" ./testdata/grepbinary) >>testtrygrep 2>&1
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 92 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep --binary-files=text "dog" ./testdata/grepbinary) >>testtrygrep 2>&1
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 93 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep --text "dog" ./testdata/grepbinary) >>testtrygrep 2>&1
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 94 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -L -r --include=grepinputx --include grepinput8 'fox' ./testdata/grepinput* | sort) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 95 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep --file-list ./testdata/grepfilelist --exclude grepinputv "fox|complete") >>testtrygrep 2>&1
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 96 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -L -r --include-dir=testdata --exclude '^(?!grepinput)' --exclude=grepinput[MCU] 'fox' ./test* | sort) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 97 -----------------------------" >>testtrygrep
echo "grepinput$" >testtemp1grep
echo "grepinput8" >>testtemp1grep
echo "grepinputBad8" >>testtemp1grep
(cd $srcdir; $valgrind $vjs $pcre2grep -L -r --include=grepinput --exclude=grepinput[MCU] --exclude-from $builddir/testtemp1grep --exclude-dir='^\.' 'fox' ./testdata | sort) >>testtrygrep
# Output redirection for this echo follows on the next line.
echo "RC=$?" \
>>testtrygrep

# Tests 98-99: --exclude-from with one and with two list files.
# Tests 100-101: several -o / --only-matching group numbers in one run.
# Tests 102-103: a pattern that matches only empty lines.

echo "---------------------------- Test 98 -----------------------------" >>testtrygrep
echo "grepinput$" >testtemp1grep
echo "grepinput8" >>testtemp1grep
echo "grepinputBad8" >>testtemp1grep
(cd $srcdir; $valgrind $vjs $pcre2grep -L -r --exclude=grepinput3 --exclude=grepinput[MCU] --include=grepinput --exclude-from $builddir/testtemp1grep --exclude-dir='^\.' 'fox' ./testdata | sort) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 99 -----------------------------" >>testtrygrep
# Here the exclusions are split across two files: testtemp1grep receives the
# first and third entries, testtemp2grep the second.
echo "grepinput$" >testtemp1grep
echo "grepinput8" >testtemp2grep
echo "grepinputBad8" >>testtemp1grep
(cd $srcdir; $valgrind $vjs $pcre2grep -L -r --include grepinput --exclude=grepinput[MCU] --exclude-from $builddir/testtemp1grep --exclude-from=$builddir/testtemp2grep --exclude-dir='^\.' 'fox' ./testdata | sort) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 100 ------------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -Ho2 --only-matching=1 -o3 '(\w+) binary (\w+)(\.)?' ./testdata/grepinput) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 101 ------------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -o3 -Ho2 -o12 --only-matching=1 -o3 --colour=always --om-separator='|' '(\w+) binary (\w+)(\.)?' ./testdata/grepinput) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 102 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -n "^$" ./testdata/grepinput3) >>testtrygrep 2>&1
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 103 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep --only-matching "^$" ./testdata/grepinput3) >>testtrygrep 2>&1
# Output redirection for this echo follows on the next line.
echo "RC=$?" \
>>testtrygrep

# Tests 104-107: patterns that can match an empty string, and \K inside a
# lookbehind (--allow-lookaround-bsk).
# Tests 108-111: -q combined with -l/-c, and multi-line -o / offset output.

echo "---------------------------- Test 104 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -n --only-matching "^$" ./testdata/grepinput3) >>testtrygrep 2>&1
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 105 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep --colour=always "ipsum|" ./testdata/grepinput3) >>testtrygrep 2>&1
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 106 -----------------------------" >>testtrygrep
# Input comes from a pipe rather than a file.
(cd $srcdir; echo "a" | $valgrind $vjs $pcre2grep -M "|a" ) >>testtrygrep 2>&1
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 107 -----------------------------" >>testtrygrep
echo "a" >testtemp1grep
echo "aaaaa" >>testtemp1grep
(cd $srcdir; $valgrind $vjs $pcre2grep --line-offsets --allow-lookaround-bsk '(?<=\Ka)' $builddir/testtemp1grep) >>testtrygrep 2>&1
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 108 ------------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -lq PATTERN ./testdata/grepinput ./testdata/grepinputx) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 109 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -cq --exclude=grepinputC lazy ./testdata/grepinput*) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 110 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep --om-separator / -Mo0 -o1 -o2 'match (\d+):\n (.)\n' testdata/grepinput) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 111 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep --line-offsets -M 'match (\d+):\n (.)\n' testdata/grepinput) >>testtrygrep
# Output redirection for this echo follows on the next line.
echo "RC=$?" \
>>testtrygrep

# Test 112: --file-offsets with -M.
# Tests 113-118: --total-count / -t combined with -c, -l, -h and -L.
# Test 119: -Mo on an input file generated in the current directory.

echo "---------------------------- Test 112 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep --file-offsets -M 'match (\d+):\n (.)\n' testdata/grepinput) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 113 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep --total-count --exclude=grepinputC 'the' testdata/grepinput*) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 114 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -tc --exclude=grepinputC 'the' testdata/grepinput*) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 115 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -tlc --exclude=grepinputC 'the' testdata/grepinput*) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 116 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep --exclude=grepinput[MCU] -th 'the' testdata/grepinput*) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 117 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -tch --exclude=grepinputC 'the' testdata/grepinput*) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 118 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -tL --exclude=grepinputC 'the' testdata/grepinput*) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 119 -----------------------------" >>testtrygrep
# This test runs in the current directory, not in $srcdir.
printf '123\n456\n789\n---abc\ndef\nxyz\n---\n' >testNinputgrep
$valgrind $vjs $pcre2grep -Mo '(\n|[^-])*---' testNinputgrep >>testtrygrep
# Output redirection for this echo follows on the next line.
echo "RC=$?" \
>>testtrygrep

# Test 120: -O / --output templates. The first three runs capture stdout only;
# the remaining five also capture stderr.
# Tests 121-123: -F with regex metacharacters, and -w with alternation.

echo "---------------------------- Test 120 ------------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -HO '$0:$2$1$3' '(\w+) binary (\w+)(\.)?' ./testdata/grepinput) >>testtrygrep
echo "RC=$?" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -HO '$&:$2$1$3' '(\w+) binary (\w+)(\.)?' ./testdata/grepinput) >>testtrygrep
echo "RC=$?" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -m 1 -O '$0:$a$b$e$f$r$t$v' '(\w+) binary (\w+)(\.)?' ./testdata/grepinput) >>testtrygrep
echo "RC=$?" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -HO '${X}' '(\w+) binary (\w+)(\.)?' ./testdata/grepinput) >>testtrygrep 2>&1
echo "RC=$?" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -HO 'XX$' '(\w+) binary (\w+)(\.)?' ./testdata/grepinput) >>testtrygrep 2>&1
echo "RC=$?" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -O '$x{12345678}' '(\w+) binary (\w+)(\.)?' ./testdata/grepinput) >>testtrygrep 2>&1
echo "RC=$?" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -O '$x{123Z' '(\w+) binary (\w+)(\.)?' ./testdata/grepinput) >>testtrygrep 2>&1
echo "RC=$?" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep --output '$x{1234}' '(\w+) binary (\w+)(\.)?' ./testdata/grepinput) >>testtrygrep 2>&1
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 121 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -F '\E and (regex)' testdata/grepinputv) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 122 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -w 'cat|dog' testdata/grepinputv) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 123 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -w 'dog|cat' testdata/grepinputv) >>testtrygrep
# Output redirection for this echo follows on the next line.
echo "RC=$?" \
>>testtrygrep

# Test 124: -Mn on grepinputM, with and without colour and -A2.
# Test 125: \K inside lookarounds with colour output; the last run sets
# GREP_COLORS for that one command only.
# Test 126: pattern files written with printf (one containing \0, one with a
# syntactically bad pattern on its last, unterminated line).

echo "---------------------------- Test 124 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -Mn --colour=always 'start[\s]+end' testdata/grepinputM) >>testtrygrep
echo "RC=$?" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -Mn --colour=always -A2 'start[\s]+end' testdata/grepinputM) >>testtrygrep
echo "RC=$?" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -Mn 'start[\s]+end' testdata/grepinputM) >>testtrygrep
echo "RC=$?" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -Mn -A2 'start[\s]+end' testdata/grepinputM) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 125 -----------------------------" >>testtrygrep
printf 'abcd\n' >testNinputgrep
$valgrind $vjs $pcre2grep --colour=always --allow-lookaround-bsk '(?<=\K.)' testNinputgrep >>testtrygrep
echo "RC=$?" >>testtrygrep
$valgrind $vjs $pcre2grep --colour=always --allow-lookaround-bsk '(?=.\K)' testNinputgrep >>testtrygrep
echo "RC=$?" >>testtrygrep
$valgrind $vjs $pcre2grep --colour=always --allow-lookaround-bsk '(?<=\K[ac])' testNinputgrep >>testtrygrep
echo "RC=$?" >>testtrygrep
$valgrind $vjs $pcre2grep --colour=always --allow-lookaround-bsk '(?=[ac]\K)' testNinputgrep >>testtrygrep
echo "RC=$?" >>testtrygrep
GREP_COLORS='ms=1;20' $valgrind $vjs $pcre2grep --colour=always --allow-lookaround-bsk '(?=[ac]\K)' testNinputgrep >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 126 -----------------------------" >>testtrygrep
printf 'Next line pattern has binary zero\nABC\0XYZ\n' >testtemp1grep
printf 'ABC\0XYZ\nABCDEF\nDEFABC\n' >testtemp2grep
$valgrind $vjs $pcre2grep -a -f testtemp1grep testtemp2grep >>testtrygrep
echo "RC=$?" >>testtrygrep
printf 'Next line pattern is erroneous.\n^abc)(xy' >testtemp1grep
$valgrind $vjs $pcre2grep -a -f testtemp1grep testtemp2grep >>testtrygrep 2>&1
# Output redirection for this echo follows on the next line.
echo "RC=$?" \
>>testtrygrep

# Tests 127-128: --om-capture.
# Tests 129-134: -m / --max-count.
# Test 135 (first run): -Z output, with NUL bytes mapped to '@' by $tr.

echo "---------------------------- Test 127 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -o --om-capture=0 'pattern()()()()' testdata/grepinput) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 128 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -m1M -o1 --om-capture=0 'pattern()()()()' testdata/grepinput) >>testtrygrep 2>&1
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 129 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -m 2 'fox' testdata/grepinput) >>testtrygrep 2>&1
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 130 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -o -m2 'fox' testdata/grepinput) >>testtrygrep 2>&1
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 131 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -oc -m2 'fox' testdata/grepinput) >>testtrygrep 2>&1
echo "RC=$?" >>testtrygrep

# NOTE(review): in this copy of the script the subshell bodies of tests 132
# and 133 were lost - only "(cd $srcdir; exec 3" and ">testtrygrep 2>&1"
# survived, which left an unclosed "(" and a redirection that truncates the
# results file. The two command lines below are a RECONSTRUCTION (input opened
# on fd 3, pcre2grep -m1 reading from it, then a second reader on the same
# descriptor). Confirm them against the upstream script and the expected
# output file before relying on these two tests.
echo "---------------------------- Test 132 -----------------------------" >>testtrygrep
(cd $srcdir; exec 3<testdata/grepinput; $valgrind $vjs $pcre2grep -m1 -A3 '^match' <&3; echo '---'; head -1 <&3; exec 3<&-) >>testtrygrep 2>&1
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 133 -----------------------------" >>testtrygrep
(cd $srcdir; exec 3<testdata/grepinput; $valgrind $vjs $pcre2grep -m1 -A3 '^match' <&3; echo '---'; $valgrind $vjs $pcre2grep -m1 -A3 '^match' <&3; exec 3<&-) >>testtrygrep 2>&1
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 134 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep --max-count=1 -nH -O '=$x{41}$x423$o{103}$o1045=' 'fox' -) <$srcdir/testdata/grepinputv >>testtrygrep 2>&1
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 135 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -HZ 'word' ./testdata/grepinputv) | $tr '\000' '@' >>testtrygrep
# Output redirection for this echo follows on the next line.
echo "RC=$?" \
>>testtrygrep

# Remaining runs of Test 135 (-Z with -l, with context, and with -M).
(cd $srcdir; $valgrind $vjs $pcre2grep -lZ 'word' ./testdata/grepinputv ./testdata/grepinputv) | $tr '\000' '@' >>testtrygrep
echo "RC=$?" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -A 1 -B 1 -HZ 'word' ./testdata/grepinputv) | $tr '\000' '@' >>testtrygrep
echo "RC=$?" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -MHZn 'start[\s]+end' testdata/grepinputM) >>testtrygrep
echo "RC=$?" >>testtrygrep

# Test 136: max-count values with a "MK" suffix, short and long option forms.
echo "---------------------------- Test 136 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -m1MK -o1 --om-capture=0 'pattern()()()()' testdata/grepinput) >>testtrygrep 2>&1
echo "RC=$?" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep --max-count=1MK -o1 --om-capture=0 'pattern()()()()' testdata/grepinput) >>testtrygrep 2>&1
echo "RC=$?" >>testtrygrep

# Test 137: the generated input's last line has no terminating newline.
echo "---------------------------- Test 137 -----------------------------" >>testtrygrep
printf 'Last line\nhas no newline' >testtemp1grep
$valgrind $vjs $pcre2grep -A1 Last testtemp1grep >>testtrygrep
echo "RC=$?" >>testtrygrep

# Test 138: --heap-limit=0 with JIT disabled.
echo "---------------------------- Test 138 -----------------------------" >>testtrygrep
printf 'AbC\nAbC\nAbC\nAbC\nAbC\nAbC\nAbC\nAbC\nAbC\nAbC\nAbC\nAbC\nAbC\nAbC\nAbC\nAbC\nAbC\nAbC\nAbC\nAbC\nAbC\nAbC\nAbC\nAbC\n' >testtemp1grep
$valgrind $vjs $pcre2grep --no-jit --heap-limit=0 b testtemp1grep >>testtrygrep 2>&1
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 139 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep --line-buffered 'fox' testdata/grepinputv) >>testtrygrep
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 140 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep --buffer-size=10 -A1 'brown' testdata/grepinputv) >>testtrygrep
# Output redirection for this echo follows on the next line.
echo "RC=$?" \
>>testtrygrep

# Tests 141-147: "-" (standard input) as a file name or pattern source, and
# nonexistent pattern/list files. These run in the current directory.
#
# Tests 141, 143 and 146 pass "-" to pcre2grep and create the file that is
# meant to be its standard input just beforehand, so that file is supplied
# with "<file". Their output is APPENDED (">>") to testtrygrep like every
# other test; a plain ">" would discard all earlier results.

echo "---------------------------- Test 141 -----------------------------" >>testtrygrep
# The file list names grepinputv and "-"; stdin is fed from testtemp2grep.
printf "%s/testdata/grepinputv\n-\n" "$srcdir" >testtemp1grep
printf 'This is a line from stdin.' >testtemp2grep
$valgrind $vjs $pcre2grep --file-list testtemp1grep "line from stdin" <testtemp2grep >>testtrygrep 2>&1
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 142 -----------------------------" >>testtrygrep
printf "/does/not/exist\n" >testtemp1grep
printf 'This is a line from stdin.' >testtemp2grep
$valgrind $vjs $pcre2grep --file-list testtemp1grep "line from stdin" >>testtrygrep 2>&1
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 143 -----------------------------" >>testtrygrep
# Patterns are read from stdin via "-f -".
printf 'fox|cat' >testtemp1grep
$valgrind $vjs $pcre2grep -f - $srcdir/testdata/grepinputv <testtemp1grep >>testtrygrep 2>&1
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 144 -----------------------------" >>testtrygrep
$valgrind $vjs $pcre2grep -f /non/exist $srcdir/testdata/grepinputv >>testtrygrep 2>&1
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 145 -----------------------------" >>testtrygrep
printf '*meta*\rdog.' >testtemp1grep
$valgrind $vjs $pcre2grep -Ncr -F -f testtemp1grep $srcdir/testdata/grepinputv >>testtrygrep 2>&1
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 146 -----------------------------" >>testtrygrep
printf 'A123B' >testtemp1grep
$valgrind $vjs $pcre2grep -H -e '123|fox' - <testtemp1grep >>testtrygrep 2>&1
echo "RC=$?" >>testtrygrep
$valgrind $vjs $pcre2grep -h -e '123|fox' - $srcdir/testdata/grepinputv <testtemp1grep >>testtrygrep 2>&1
echo "RC=$?" >>testtrygrep
$valgrind $vjs $pcre2grep - $srcdir/testdata/grepinputv <testtemp1grep >>testtrygrep 2>&1
echo "RC=$?" >>testtrygrep

echo "---------------------------- Test 147 -----------------------------" >>testtrygrep
$valgrind $vjs $pcre2grep -e '123|fox' -- -nonfile >>testtrygrep 2>&1
# Output redirection for this echo follows on the next line.
echo "RC=$?" \
>>testtrygrep echo "---------------------------- Test 148 -----------------------------" >>testtrygrep $valgrind $vjs $pcre2grep --nonexist >>testtrygrep 2>&1 echo "RC=$?" >>testtrygrep $valgrind $vjs $pcre2grep -n-n-bad >>testtrygrep 2>&1 echo "RC=$?" >>testtrygrep $valgrind $vjs $pcre2grep --context >>testtrygrep 2>&1 echo "RC=$?" >>testtrygrep $valgrind $vjs $pcre2grep --only-matching --output=xx >>testtrygrep 2>&1 echo "RC=$?" >>testtrygrep $valgrind $vjs $pcre2grep --colour=badvalue >>testtrygrep 2>&1 echo "RC=$?" >>testtrygrep $valgrind $vjs $pcre2grep --newline=badvalue >>testtrygrep 2>&1 echo "RC=$?" >>testtrygrep $valgrind $vjs $pcre2grep -d badvalue >>testtrygrep 2>&1 echo "RC=$?" >>testtrygrep $valgrind $vjs $pcre2grep -D badvalue >>testtrygrep 2>&1 echo "RC=$?" >>testtrygrep $valgrind $vjs $pcre2grep --buffer-size=0 >>testtrygrep 2>&1 echo "RC=$?" >>testtrygrep $valgrind $vjs $pcre2grep --exclude '(badpat' abc /dev/null >>testtrygrep 2>&1 echo "RC=$?" >>testtrygrep $valgrind $vjs $pcre2grep --exclude-from /non/exist abc /dev/null >>testtrygrep 2>&1 echo "RC=$?" >>testtrygrep $valgrind $vjs $pcre2grep --include-from /non/exist abc /dev/null >>testtrygrep 2>&1 echo "RC=$?" >>testtrygrep $valgrind $vjs $pcre2grep --file-list=/non/exist abc /dev/null >>testtrygrep 2>&1 echo "RC=$?" >>testtrygrep echo "---------------------------- Test 149 -----------------------------" >>testtrygrep (cd $srcdir; $valgrind $vjs $pcre2grep --binary-files=binary "dog" ./testdata/grepbinary) >>testtrygrep 2>&1 echo "RC=$?" >>testtrygrep (cd $srcdir; $valgrind $vjs $pcre2grep --binary-files=wrong "dog" ./testdata/grepbinary) >>testtrygrep 2>&1 echo "RC=$?" >>testtrygrep # This test runs the code that tests locale support. However, on some systems # (e.g. Alpine Linux) there is no locale support and running this test just # generates a "no match" result. 
Therefore, we test for locale support, and if # it is found missing, we pretend that the test has run as expected so that the # output matches. echo "---------------------------- Test 150 -----------------------------" >>testtrygrep which locale >/dev/null 2>&1 if [ $? -ne 0 ]; then echo "pcre2grep: Failed to set locale locale.bad (obtained from LC_CTYPE)" >>testtrygrep echo "RC=2" >>testtrygrep else (cd $srcdir; unset LC_ALL; LC_CTYPE=locale.bad; export LC_CTYPE; $valgrind $vjs $pcre2grep abc /dev/null >>$builddir/testtrygrep 2>&1) >testtemp1grep 2>&1 echo "RC=$?" >>testtrygrep shell_errors=`cat testtemp1grep` if [ x"$shell_errors" != x ] ; then printf "shell errors during locale test: " echo "$shell_errors" fi fi echo "---------------------------- Test 151 -----------------------------" >>testtrygrep (cd $srcdir; $valgrind $vjs $pcre2grep --colour=always -e this -e The -e 'The wo' testdata/grepinputv) >>testtrygrep echo "RC=$?" >>testtrygrep echo "---------------------------- Test 152 -----------------------------" >>testtrygrep (cd $srcdir; $valgrind $vjs $pcre2grep -nA3 --group-separator='++' 'four' ./testdata/grepinputx) >>testtrygrep echo "RC=$?" >>testtrygrep echo "---------------------------- Test 153 -----------------------------" >>testtrygrep (cd $srcdir; $valgrind $vjs $pcre2grep -nA3 --no-group-separator 'four' ./testdata/grepinputx) >>testtrygrep echo "RC=$?" >>testtrygrep echo "---------------------------- Test 154 -----------------------------" >>testtrygrep >testtemp1grep (cd $srcdir; $valgrind $vjs $pcre2grep -f $builddir/testtemp1grep ./testdata/grepinputv) >>testtrygrep echo "RC=$?" >>testtrygrep echo "---------------------------- Test 155 -----------------------------" >>testtrygrep echo "" >testtemp1grep (cd $srcdir; $valgrind $vjs $pcre2grep -f $builddir/testtemp1grep ./testdata/grepinputv) >>testtrygrep echo "RC=$?" 
>>testtrygrep echo "---------------------------- Test 156 -----------------------------" >>testtrygrep echo "" >testtemp1grep (cd $srcdir; $valgrind $vjs $pcre2grep --posix-pattern-file --file $builddir/testtemp1grep ./testdata/grepinputv) >>testtrygrep echo "RC=$?" >>testtrygrep echo "---------------------------- Test 157 -----------------------------" >>testtrygrep echo "spaces " >testtemp1grep (cd $srcdir; $valgrind $vjs $pcre2grep -o --posix-pattern-file --file=$builddir/testtemp1grep ./testdata/grepinputv >$builddir/testtemp2grep && $valgrind $vjs $pcre2grep -q "s " $builddir/testtemp2grep) >>testtrygrep echo "RC=$?" >>testtrygrep echo "---------------------------- Test 158 -----------------------------" >>testtrygrep echo "spaces." >testtemp1grep (cd $srcdir; $valgrind $vjs $pcre2grep -f $builddir/testtemp1grep ./testdata/grepinputv) >>testtrygrep echo "RC=$?" >>testtrygrep echo "---------------------------- Test 159 -----------------------------" >>testtrygrep printf "spaces.\r\n" >testtemp1grep (cd $srcdir; $valgrind $vjs $pcre2grep --posix-pattern-file -f$builddir/testtemp1grep ./testdata/grepinputv) >>testtrygrep echo "RC=$?" >>testtrygrep echo "---------------------------- Test 160 -----------------------------" >>testtrygrep (cd $srcdir; $valgrind $vjs $pcre2grep -nC3 '^(ert|jkl)' ./testdata/grepinput) >>testtrygrep echo "RC=$?" >>testtrygrep (cd $srcdir; $valgrind $vjs $pcre2grep -n -B4 -A2 '^(ert|dfg)' ./testdata/grepinput) >>testtrygrep echo "RC=$?" >>testtrygrep # Now compare the results. $cf $srcdir/testdata/grepoutput testtrygrep if [ $? != 0 ] ; then exit 1; fi # These tests require UTF-8 support if [ $utf8 -ne 0 ] ; then echo "Testing pcre2grep UTF-8 features" echo "---------------------------- Test U1 ------------------------------" >testtrygrep (cd $srcdir; $valgrind $vjs $pcre2grep -n -u --newline=any "^X" ./testdata/grepinput8) >>testtrygrep echo "RC=$?" 
>>testtrygrep echo "---------------------------- Test U2 ------------------------------" >>testtrygrep (cd $srcdir; $valgrind $vjs $pcre2grep -n -u -C 3 --newline=any "Match" ./testdata/grepinput8) >>testtrygrep echo "RC=$?" >>testtrygrep echo "---------------------------- Test U3 ------------------------------" >>testtrygrep (cd $srcdir; $valgrind $vjs $pcre2grep --line-offsets -u --newline=any --allow-lookaround-bsk '(?<=\K\x{17f})' ./testdata/grepinput8) >>testtrygrep echo "RC=$?" >>testtrygrep echo "---------------------------- Test U4 ------------------------------" >>testtrygrep (cd $srcdir; $valgrind $vjs $pcre2grep -u -o '....' ./testdata/grepinputBad8) >>testtrygrep 2>&1 echo "RC=$?" >>testtrygrep echo "---------------------------- Test U5 ------------------------------" >>testtrygrep (cd $srcdir; $valgrind $vjs $pcre2grep -U -o '....' ./testdata/grepinputBad8) >>testtrygrep echo "RC=$?" >>testtrygrep echo "---------------------------- Test U6 -----------------------------" >>testtrygrep (cd $srcdir; $valgrind $vjs $pcre2grep -u -m1 -O '=$x{1d3}$o{744}=' 'fox') <$srcdir/testdata/grepinputv >>testtrygrep 2>&1 echo "RC=$?" >>testtrygrep echo "---------------------------- Test U7 ------------------------------" >>testtrygrep (cd $srcdir; $valgrind $vjs $pcre2grep -ui --colour=always 'k+|\babc\b' ./testdata/grepinput8) >>testtrygrep echo "RC=$?" >>testtrygrep echo "---------------------------- Test U8 ------------------------------" >>testtrygrep (cd $srcdir; $valgrind $vjs $pcre2grep -UiEP --colour=always 'k+|\babc\b' ./testdata/grepinput8) >>testtrygrep echo "RC=$?" >>testtrygrep echo "---------------------------- Test U9 ------------------------------" >>testtrygrep (cd $srcdir; $valgrind $vjs $pcre2grep -u --colour=always 'A\d' ./testdata/grepinput8) >>testtrygrep echo "RC=$?" 
>>testtrygrep echo "---------------------------- Test U10 ------------------------------" >>testtrygrep (cd $srcdir; $valgrind $vjs $pcre2grep -u --posix-digit --colour=always 'A\d' ./testdata/grepinput8) >>testtrygrep echo "RC=$?" >>testtrygrep $cf $srcdir/testdata/grepoutput8 testtrygrep if [ $? != 0 ] ; then exit 1; fi else echo "Skipping pcre2grep UTF-8 tests: no UTF-8 support in PCRE2 library" fi # We go to some contortions to try to ensure that the tests for the various # newline settings will work in environments where the normal newline sequence # is not \n. Do not use exported files, whose line endings might be changed. # Instead, create an input file using printf so that its contents are exactly # what we want. Note the messy fudge to get printf to write a string that # starts with a hyphen. These tests are run in the build directory. echo "Testing pcre2grep newline settings" printf 'abc\rdef\r\nghi\njkl' >testNinputgrep printf '%c--------------------------- Test N1 ------------------------------\r\n' - >testtrygrep $valgrind $vjs $pcre2grep -n -N CR "^(abc|def|ghi|jkl)" testNinputgrep >>testtrygrep echo "RC=$?" >>testtrygrep $valgrind $vjs $pcre2grep -B1 -n -N CR "^def" testNinputgrep >>testtrygrep echo "RC=$?" >>testtrygrep printf '%c--------------------------- Test N2 ------------------------------\r\n' - >>testtrygrep $valgrind $vjs $pcre2grep -n --newline=crlf "^(abc|def|ghi|jkl)" testNinputgrep >>testtrygrep echo "RC=$?" >>testtrygrep $valgrind $vjs $pcre2grep -B1 -n -N CRLF "^ghi" testNinputgrep >>testtrygrep echo "RC=$?" >>testtrygrep printf '%c--------------------------- Test N3 ------------------------------\r\n' - >>testtrygrep pattern=`printf 'def\rjkl'` $valgrind $vjs $pcre2grep -n --newline=cr -F "$pattern" testNinputgrep >>testtrygrep echo "RC=$?" 
>>testtrygrep printf '%c--------------------------- Test N4 ------------------------------\r\n' - >>testtrygrep $valgrind $vjs $pcre2grep -n --newline=crlf -F -f $srcdir/testdata/greppatN4 testNinputgrep >>testtrygrep echo "RC=$?" >>testtrygrep printf '%c--------------------------- Test N5 ------------------------------\r\n' - >>testtrygrep $valgrind $vjs $pcre2grep -n --newline=any "^(abc|def|ghi|jkl)" testNinputgrep >>testtrygrep echo "RC=$?" >>testtrygrep $valgrind $vjs $pcre2grep -B1 -n --newline=any "^def" testNinputgrep >>testtrygrep echo "RC=$?" >>testtrygrep printf '%c--------------------------- Test N6 ------------------------------\r\n' - >>testtrygrep $valgrind $vjs $pcre2grep -n --newline=anycrlf "^(abc|def|ghi|jkl)" testNinputgrep >>testtrygrep echo "RC=$?" >>testtrygrep $valgrind $vjs $pcre2grep -B1 -n --newline=anycrlf "^jkl" testNinputgrep >>testtrygrep echo "RC=$?" >>testtrygrep printf '%c--------------------------- Test N7 ------------------------------\r\n' - >>testtrygrep printf 'xyz\0abc\0def' >testNinputgrep $valgrind $vjs $pcre2grep -na --newline=nul "^(abc|def)" testNinputgrep | $tr '\000' '@' >>testtrygrep echo "RC=$?" >>testtrygrep $valgrind $vjs $pcre2grep -B1 -na --newline=nul "^(abc|def)" testNinputgrep | $tr '\000' '@' >>testtrygrep echo "RC=$?" >>testtrygrep printf '%c--------------------------- Test N8 ------------------------------\r\n' - >>testtrygrep $valgrind $vjs $pcre2grep -na --newline=anycrlf "^a" $srcdir/testdata/grepinputBad8_Trail >>testtrygrep echo "RC=$?" >>testtrygrep echo "" >>testtrygrep $cf $srcdir/testdata/grepoutputN testtrygrep if [ $? != 0 ] ; then exit 1; fi # These newline tests need UTF support. if [ $utf8 -ne 0 ] ; then echo "Testing pcre2grep newline settings with UTF-8 features" printf '%c--------------------------- Test UN1 ------------------------------\r\n' - >testtrygrep $valgrind $vjs $pcre2grep -nau --newline=anycrlf "^(abc|def)" $srcdir/testdata/grepinputUN >>testtrygrep echo "RC=$?" 
>>testtrygrep printf '%c--------------------------- Test UN2 ------------------------------\r\n' - >testtrygrep $valgrind $vjs $pcre2grep -nauU --newline=anycrlf "^a" $srcdir/testdata/grepinputBad8_Trail >>testtrygrep echo "RC=$?" >>testtrygrep echo "" >>testtrygrep $cf $srcdir/testdata/grepoutputUN testtrygrep if [ $? != 0 ] ; then exit 1; fi else echo "Skipping pcre2grep newline UTF-8 tests: no UTF-8 support in PCRE2 library" fi # If pcre2grep supports script callouts, run some tests on them. It is possible # to restrict these callouts to the non-fork case, either for security, or for # environments that do not support fork(). This is handled by comparing to a # different output. if $valgrind $vjs $pcre2grep --help | $valgrind $vjs $pcre2grep -q 'callout scripts in patterns are supported'; then echo "Testing pcre2grep script callouts" echo "--- Test 1 ---" >testtrygrep $valgrind $vjs $pcre2grep '(T)(..(.))(?C"/bin/echo|Arg1: [$1] [$2] [$3]|Arg2: $|${1}$| ($4) ($14) ($0)")()' $srcdir/testdata/grepinputv >>testtrygrep echo "RC=$?" >>testtrygrep echo "--- Test 2 ---" >>testtrygrep $valgrind $vjs $pcre2grep '(T)(..(.))()()()()()()()(..)(?C"/bin/echo|Arg1: [$11] [${11}]")' $srcdir/testdata/grepinputv >>testtrygrep echo "RC=$?" >>testtrygrep echo "--- Test 3 ---" >>testtrygrep $valgrind $vjs $pcre2grep '(T)(?C"|$0:$1$n")' $srcdir/testdata/grepinputv >>testtrygrep echo "RC=$?" >>testtrygrep echo "--- Test 4 ---" >>testtrygrep $valgrind $vjs $pcre2grep '(T)(?C"/bin/echo|$0:$1$n")' $srcdir/testdata/grepinputv >>testtrygrep echo "RC=$?" >>testtrygrep echo "--- Test 5 ---" >>testtrygrep $valgrind $vjs $pcre2grep '(T)(?C"|$1$n")(*F)' $srcdir/testdata/grepinputv >>testtrygrep echo "RC=$?" >>testtrygrep echo "--- Test 6 ---" >>testtrygrep $valgrind $vjs $pcre2grep -m1 '(T)(?C"|$0:$1:$x{41}$o{101}$n")' $srcdir/testdata/grepinputv >>testtrygrep echo "RC=$?" 
>>testtrygrep if $valgrind $vjs $pcre2grep --help | $valgrind $vjs $pcre2grep -q 'Non-fork callout scripts in patterns are supported'; then nonfork=1 $cf $srcdir/testdata/grepoutputCN testtrygrep else nonfork=0 $cf $srcdir/testdata/grepoutputC testtrygrep fi if [ $? != 0 ] ; then exit 1; fi # These callout tests need UTF support. if [ $utf8 -ne 0 ] ; then echo "Testing pcre2grep script callout with UTF-8 features" echo "--- Test 1 ---" >testtrygrep $valgrind $vjs $pcre2grep -u '(T)(?C"|$0:$x{a6}$n")' $srcdir/testdata/grepinputv >>testtrygrep echo "RC=$?" >>testtrygrep echo "--- Test 2 ---" >>testtrygrep $valgrind $vjs $pcre2grep -u '(T)(?C"/bin/echo|$0:$x{a6}$n")' $srcdir/testdata/grepinputv >>testtrygrep echo "RC=$?" >>testtrygrep if [ $nonfork = 1 ] ; then $cf $srcdir/testdata/grepoutputCNU testtrygrep else $cf $srcdir/testdata/grepoutputCU testtrygrep fi if [ $? != 0 ] ; then exit 1; fi else echo "Skipping pcre2grep script callout UTF-8 tests: no UTF-8 support in PCRE2 library" fi unset nonfork else echo "Script callouts are not supported" fi # Test reading .gz and .bz2 files when supported. if $valgrind $vjs $pcre2grep --help | $valgrind $vjs $pcre2grep -q '\.gz are read using zlib'; then echo "Testing reading .gz file" $valgrind $vjs $pcre2grep 'one|two' $srcdir/testdata/grepinputC.gz >testtrygrep echo "RC=$?" >>testtrygrep $cf $srcdir/testdata/grepoutputCgz testtrygrep if [ $? != 0 ] ; then exit 1; fi fi if $valgrind $vjs $pcre2grep --help | $valgrind $vjs $pcre2grep -q '\.bz2 are read using bzlib2'; then echo "Testing reading .bz2 file" $valgrind $vjs $pcre2grep 'one|two' $srcdir/testdata/grepinputC.bz2 >testtrygrep echo "RC=$?" >>testtrygrep $valgrind $vjs $pcre2grep 'one|two' $srcdir/testdata/grepnot.bz2 >>testtrygrep echo "RC=$?" >>testtrygrep $cf $srcdir/testdata/grepoutputCbz2 testtrygrep if [ $? != 0 ] ; then exit 1; fi fi # Finally, some tests to exercise code that is not tested above, just to be # sure that it runs OK. 
Doing this improves the coverage statistics. The output # is not checked. echo "Testing miscellaneous pcre2grep arguments (unchecked)" echo '' >testtrygrep checkspecial '-xxxxx' 2 checkspecial '--help' 0 checkspecial '--line-buffered --colour=auto abc /dev/null' 1 checkspecial '--line-buffered --color abc /dev/null' 1 checkspecial '-dskip abc .' 1 checkspecial '-Dread -Dskip abc /dev/null' 1 checkspecial "-f $srcdir/testdata/greplistBad /dev/null" 2 checkspecial "(unpaired /dev/null" 2 checkspecial "-e (unpaired1 -e (unpaired2 /dev/null" 2 # Clean up local working files rm -f testNinputgrep teststderrgrep testtrygrep testtemp1grep testtemp2grep exit 0 # End ================================================ FILE: RunGrepTest.bat ================================================ @echo off :: Run pcre2grep tests. The assumption is that the PCRE2 tests check the library :: itself. What we are checking here is the file handling and options that are :: supported by pcre2grep. This script must be run in the build directory. :: (jmh: I've only tested in the main directory, using my own builds.) setlocal enabledelayedexpansion :: Remove any non-default colouring that the caller may have set. set PCRE2GREP_COLOUR= set PCRE2GREP_COLOR= set PCREGREP_COLOUR= set PCREGREP_COLOR= set GREP_COLORS= set GREP_COLOR= :: Remember the current (build) directory and set the program to be tested. set builddir="%CD%" if [%pcre2grep%]==[] set pcre2grep=%builddir%\pcre2grep.exe if [%pcre2test%]==[] set pcre2test=%builddir%\pcre2test.exe if NOT exist %pcre2grep% ( echo ** %pcre2grep% does not exist. exit /b 1 ) if NOT exist %pcre2test% ( echo ** %pcre2test% does not exist. exit /b 1 ) for /f "delims=" %%a in ('"%pcre2grep%" -V') do set pcre2grep_version=%%a echo Testing %pcre2grep_version% :: Set up a suitable "diff" command for comparison. Some systems have a diff :: that lacks a -u option. Try to deal with this; better do the test for the -b :: option as well. 
Use FC if there's no diff, taking care to ignore equality. set cf= set cfout= diff -b nul nul 2>nul && set cf=diff -b diff -u nul nul 2>nul && set cf=diff -u diff -ub nul nul 2>nul && set cf=diff -ub if NOT defined cf ( set cf=fc /n set "cfout=>testcf || (type testcf & cmd /c exit /b 1)" ) :: Set srcdir to the current or parent directory, whichever one contains the :: test data. Subsequently, we run most of the pcre2grep tests in the source :: directory so that the file names in the output are always the same. if NOT defined srcdir set srcdir=. if NOT exist %srcdir%\testdata\ ( if exist testdata\ ( set srcdir=. ) else if exist ..\testdata\ ( set srcdir=.. ) else if exist ..\..\testdata\ ( set srcdir=..\.. ) else ( echo Cannot find the testdata directory exit /b 1 ) ) :: Check for the availability of UTF-8 support %pcre2test% -C unicode >nul set utf8=%ERRORLEVEL% :: Check default newline convention. If it does not include LF, force LF. for /f %%a in ('"%pcre2test%" -C newline') do set nl=%%a if NOT "%nl%" == "LF" if NOT "%nl%" == "ANY" if NOT "%nl%" == "ANYCRLF" ( set pcre2grep=%pcre2grep% -N LF echo Default newline setting forced to LF ) :: Create a simple printf via cscript/JScript (an actual printf may translate :: LF to CRLF, which this one does not). We only support the barebones we need: :: \r, \n, \0, and %s (but only once). echo WScript.StdOut.Write(WScript.Arguments(0).replace(/\\r/g, "\r").replace(/\\n/g, "\n").replace(/\\0/g, "\x00").replace(/%%s/g, function() { return WScript.Arguments(1) })) >printf.js set printf=cscript //nologo printf.js :: Create a simple 'tr' via cscript/JScript. 
echo WScript.StdOut.Write(WScript.StdIn.ReadAll().replace(/\x00/g, "@")) >trnull.js set trnull=cscript //nologo trnull.js :: ------ Normal tests ------ echo Testing pcre2grep main features echo ---------------------------- Test 1 ------------------------------>testtrygrep (pushd %srcdir% & %pcre2grep% PATTERN ./testdata/grepinput & popd) >>testtrygrep echo RC=^%ERRORLEVEL%>>testtrygrep echo ---------------------------- Test 2 ------------------------------>>testtrygrep (pushd %srcdir% & %pcre2grep% "^PATTERN" ./testdata/grepinput & popd) >>testtrygrep echo RC=^%ERRORLEVEL%>>testtrygrep echo ---------------------------- Test 3 ------------------------------>>testtrygrep (pushd %srcdir% & %pcre2grep% -in PATTERN ./testdata/grepinput & popd) >>testtrygrep echo RC=^%ERRORLEVEL%>>testtrygrep echo ---------------------------- Test 4 ------------------------------>>testtrygrep (pushd %srcdir% & %pcre2grep% -ic PATTERN ./testdata/grepinput & popd) >>testtrygrep echo RC=^%ERRORLEVEL%>>testtrygrep echo ---------------------------- Test 5 ------------------------------>>testtrygrep (pushd %srcdir% & %pcre2grep% -in PATTERN ./testdata/grepinput ./testdata/grepinputx & popd) >>testtrygrep echo RC=^%ERRORLEVEL%>>testtrygrep echo ---------------------------- Test 6 ------------------------------>>testtrygrep (pushd %srcdir% & %pcre2grep% -inh PATTERN ./testdata/grepinput ./testdata/grepinputx & popd) >>testtrygrep echo RC=^%ERRORLEVEL%>>testtrygrep echo ---------------------------- Test 7 ------------------------------>>testtrygrep (pushd %srcdir% & %pcre2grep% -il PATTERN ./testdata/grepinput ./testdata/grepinputx & popd) >>testtrygrep echo RC=^%ERRORLEVEL%>>testtrygrep echo ---------------------------- Test 8 ------------------------------>>testtrygrep (pushd %srcdir% & %pcre2grep% -l PATTERN ./testdata/grepinput ./testdata/grepinputx & popd) >>testtrygrep echo RC=^%ERRORLEVEL%>>testtrygrep echo ---------------------------- Test 9 ------------------------------>>testtrygrep 
(pushd %srcdir% & %pcre2grep% -q PATTERN ./testdata/grepinput ./testdata/grepinputx & popd) >>testtrygrep echo RC=^%ERRORLEVEL%>>testtrygrep echo ---------------------------- Test 10 ----------------------------->>testtrygrep (pushd %srcdir% & %pcre2grep% -q NEVER-PATTERN ./testdata/grepinput ./testdata/grepinputx & popd) >>testtrygrep echo RC=^%ERRORLEVEL%>>testtrygrep echo ---------------------------- Test 11 ----------------------------->>testtrygrep (pushd %srcdir% & %pcre2grep% -vn pattern ./testdata/grepinputx & popd) >>testtrygrep echo RC=^%ERRORLEVEL%>>testtrygrep echo ---------------------------- Test 12 ----------------------------->>testtrygrep (pushd %srcdir% & %pcre2grep% -ix pattern ./testdata/grepinputx & popd) >>testtrygrep echo RC=^%ERRORLEVEL%>>testtrygrep echo ---------------------------- Test 13 ----------------------------->>testtrygrep echo seventeen >testtemp1grep (pushd %srcdir% & %pcre2grep% -f./testdata/greplist -f %builddir%\testtemp1grep ./testdata/grepinputx & popd) >>testtrygrep echo RC=^%ERRORLEVEL%>>testtrygrep echo ---------------------------- Test 14 ----------------------------->>testtrygrep (pushd %srcdir% & %pcre2grep% -w pat ./testdata/grepinput ./testdata/grepinputx & popd) >>testtrygrep echo RC=^%ERRORLEVEL%>>testtrygrep echo ---------------------------- Test 15 ----------------------------->>testtrygrep (pushd %srcdir% & %pcre2grep% "abc^*" ./testdata/grepinput & popd) >>testtrygrep 2>&1 echo RC=^%ERRORLEVEL%>>testtrygrep echo ---------------------------- Test 16 ----------------------------->>testtrygrep (pushd %srcdir% & %pcre2grep% abc ./testdata/grepinput ./testdata/nonexistfile & popd) >>testtrygrep 2>&1 echo RC=^%ERRORLEVEL%>>testtrygrep echo ---------------------------- Test 17 ----------------------------->>testtrygrep (pushd %srcdir% & %pcre2grep% -M "the\noutput" ./testdata/grepinput & popd) >>testtrygrep echo RC=^%ERRORLEVEL%>>testtrygrep echo ---------------------------- Test 18 
----------------------------->>testtrygrep (pushd %srcdir% & %pcre2grep% -Mn "(the\noutput|dog\.\n--)" ./testdata/grepinput & popd) >>testtrygrep echo RC=^%ERRORLEVEL%>>testtrygrep echo ---------------------------- Test 19 ----------------------------->>testtrygrep (pushd %srcdir% & %pcre2grep% -Mix "Pattern" ./testdata/grepinputx & popd) >>testtrygrep echo RC=^%ERRORLEVEL%>>testtrygrep echo ---------------------------- Test 20 ----------------------------->>testtrygrep (pushd %srcdir% & %pcre2grep% -Mixn "complete pair\nof lines" ./testdata/grepinputx & popd) >>testtrygrep echo RC=^%ERRORLEVEL%>>testtrygrep echo ---------------------------- Test 21 ----------------------------->>testtrygrep (pushd %srcdir% & %pcre2grep% -nA3 "four" ./testdata/grepinputx & popd) >>testtrygrep echo RC=^%ERRORLEVEL%>>testtrygrep echo ---------------------------- Test 22 ----------------------------->>testtrygrep (pushd %srcdir% & %pcre2grep% -nB3 "four" ./testdata/grepinputx & popd) >>testtrygrep echo RC=^%ERRORLEVEL%>>testtrygrep echo ---------------------------- Test 23 ----------------------------->>testtrygrep (pushd %srcdir% & %pcre2grep% -C3 "four" ./testdata/grepinputx & popd) >>testtrygrep echo RC=^%ERRORLEVEL%>>testtrygrep echo ---------------------------- Test 24 ----------------------------->>testtrygrep (pushd %srcdir% & %pcre2grep% -A9 "four" ./testdata/grepinputx & popd) >>testtrygrep echo RC=^%ERRORLEVEL%>>testtrygrep echo ---------------------------- Test 25 ----------------------------->>testtrygrep (pushd %srcdir% & %pcre2grep% -nB9 "four" ./testdata/grepinputx & popd) >>testtrygrep echo RC=^%ERRORLEVEL%>>testtrygrep echo ---------------------------- Test 26 ----------------------------->>testtrygrep (pushd %srcdir% & %pcre2grep% -A9 -B9 "four" ./testdata/grepinputx & popd) >>testtrygrep echo RC=^%ERRORLEVEL%>>testtrygrep echo ---------------------------- Test 27 ----------------------------->>testtrygrep (pushd %srcdir% & %pcre2grep% -A10 "four" 
./testdata/grepinputx & popd) >>testtrygrep echo RC=^%ERRORLEVEL%>>testtrygrep echo ---------------------------- Test 28 ----------------------------->>testtrygrep (pushd %srcdir% & %pcre2grep% -nB10 "four" ./testdata/grepinputx & popd) >>testtrygrep echo RC=^%ERRORLEVEL%>>testtrygrep echo ---------------------------- Test 29 ----------------------------->>testtrygrep (pushd %srcdir% & %pcre2grep% -C12 -B10 "four" ./testdata/grepinputx & popd) >>testtrygrep echo RC=^%ERRORLEVEL%>>testtrygrep echo ---------------------------- Test 30 ----------------------------->>testtrygrep (pushd %srcdir% & %pcre2grep% -inB3 "pattern" ./testdata/grepinput ./testdata/grepinputx & popd) >>testtrygrep echo RC=^%ERRORLEVEL%>>testtrygrep echo ---------------------------- Test 31 ----------------------------->>testtrygrep (pushd %srcdir% & %pcre2grep% -inA3 "pattern" ./testdata/grepinput ./testdata/grepinputx & popd) >>testtrygrep echo RC=^%ERRORLEVEL%>>testtrygrep echo ---------------------------- Test 32 ----------------------------->>testtrygrep (pushd %srcdir% & %pcre2grep% -L "fox" ./testdata/grepinput ./testdata/grepinputx & popd) >>testtrygrep echo RC=^%ERRORLEVEL%>>testtrygrep echo ---------------------------- Test 33 ----------------------------->>testtrygrep (pushd %srcdir% & %pcre2grep% "fox" ./testdata/grepnonexist & popd) >>testtrygrep 2>&1 echo RC=^%ERRORLEVEL%>>testtrygrep echo ---------------------------- Test 34 ----------------------------->>testtrygrep (pushd %srcdir% & %pcre2grep% -s "fox" ./testdata/grepnonexist & popd) >>testtrygrep 2>&1 echo RC=^%ERRORLEVEL%>>testtrygrep echo ---------------------------- Test 35 ----------------------------->>testtrygrep (pushd %srcdir% & %pcre2grep% -L -r --include=grepinputx --include grepinput8 --exclude-dir="^\." 
"fox" ./testdata | sort & popd) >>testtrygrep echo RC=^%ERRORLEVEL%>>testtrygrep echo ---------------------------- Test 36 ----------------------------->>testtrygrep (pushd %srcdir% & %pcre2grep% -L -r --include="grepinput[^C]" --exclude "grepinput$" --exclude="grepinput(Bad)?8" --exclude=grepinputM --exclude=grepinputUN --exclude-dir="^\." "fox" ./testdata | sort & popd) >>testtrygrep echo RC=^%ERRORLEVEL%>>testtrygrep echo ---------------------------- Test 37 ----------------------------->>testtrygrep (pushd %srcdir% & %pcre2grep% "^(a+)*\d" ./testdata/grepinput & popd) >>testtrygrep 2>teststderrgrep echo RC=^%ERRORLEVEL%>>testtrygrep echo ======== STDERR ========>>testtrygrep type teststderrgrep >>testtrygrep echo ---------------------------- Test 38 ------------------------------>>testtrygrep (pushd %srcdir% & %pcre2grep% ">\x00<" ./testdata/grepinput & popd) >>testtrygrep echo RC=^%ERRORLEVEL%>>testtrygrep echo ---------------------------- Test 39 ------------------------------>>testtrygrep (pushd %srcdir% & %pcre2grep% -A1 "before the binary zero" ./testdata/grepinput & popd) >>testtrygrep echo RC=^%ERRORLEVEL%>>testtrygrep echo ---------------------------- Test 40 ------------------------------>>testtrygrep (pushd %srcdir% & %pcre2grep% -B1 "after the binary zero" ./testdata/grepinput & popd) >>testtrygrep echo RC=^%ERRORLEVEL%>>testtrygrep echo ---------------------------- Test 41 ------------------------------>>testtrygrep (pushd %srcdir% & %pcre2grep% -B1 -o "\w+ the binary zero" ./testdata/grepinput & popd) >>testtrygrep echo RC=^%ERRORLEVEL%>>testtrygrep echo ---------------------------- Test 42 ------------------------------>>testtrygrep (pushd %srcdir% & %pcre2grep% -B1 -onH "\w+ the binary zero" ./testdata/grepinput & popd) >>testtrygrep echo RC=^%ERRORLEVEL%>>testtrygrep echo ---------------------------- Test 43 ------------------------------>>testtrygrep (pushd %srcdir% & %pcre2grep% -on "before|zero|after" ./testdata/grepinput & popd) 
>>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

:: Tests 44-130. Each test appends a banner line to testtrygrep, runs pcre2grep
:: with its output appended to the same file, and then appends the exit code
:: as an "RC=" line. Commands wrapped in (pushd %srcdir% ^& ... ^& popd) are
:: executed after changing into %srcdir%, so relative paths such as
:: ./testdata/... are resolved there. Where "2>&1" is present, stderr is
:: written to testtrygrep as well.

echo ---------------------------- Test 44 ------------------------------>>testtrygrep
(pushd %srcdir% & %pcre2grep% -on -e before -ezero -e after ./testdata/grepinput & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 45 ------------------------------>>testtrygrep
(pushd %srcdir% & %pcre2grep% -on -f ./testdata/greplist -e binary ./testdata/grepinput & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

:: Test 46: four invocations, each given one syntactically invalid pattern
:: among valid ones.
echo ---------------------------- Test 46 ------------------------------>>testtrygrep
(pushd %srcdir% & %pcre2grep% -e "unopened)" -e abc ./testdata/grepinput & popd) >>testtrygrep 2>&1
echo RC=^%ERRORLEVEL%>>testtrygrep
(pushd %srcdir% & %pcre2grep% -eabc -e "(unclosed" ./testdata/grepinput & popd) >>testtrygrep 2>&1
echo RC=^%ERRORLEVEL%>>testtrygrep
(pushd %srcdir% & %pcre2grep% -eabc -e xyz -e "[unclosed" ./testdata/grepinput & popd) >>testtrygrep 2>&1
echo RC=^%ERRORLEVEL%>>testtrygrep
(pushd %srcdir% & %pcre2grep% --regex=123 -eabc -e xyz -e "[unclosed" ./testdata/grepinput & popd) >>testtrygrep 2>&1
echo RC=^%ERRORLEVEL%>>testtrygrep

:: NOTE(review): in Tests 47-49 the caret at the end of the line followed by an
:: empty line is presumably how a line feed is embedded in the pattern argument
:: (AB.VE<LF>elephant) - confirm against the Unix RunGrepTest script.
echo ---------------------------- Test 47 ------------------------------>>testtrygrep
(pushd %srcdir% & %pcre2grep% -Fx AB.VE^

elephant ./testdata/grepinput & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 48 ------------------------------>>testtrygrep
(pushd %srcdir% & %pcre2grep% -F AB.VE^

elephant ./testdata/grepinput & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 49 ------------------------------>>testtrygrep
(pushd %srcdir% & %pcre2grep% -F -e DATA -e AB.VE^

elephant ./testdata/grepinput & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 50 ------------------------------>>testtrygrep
(pushd %srcdir% & %pcre2grep% "^(abc|def|ghi|jkl)" ./testdata/grepinputx & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 51 ------------------------------>>testtrygrep
(pushd %srcdir% & %pcre2grep% -Mv "brown\sfox" ./testdata/grepinputv & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 52 ------------------------------>>testtrygrep
(pushd %srcdir% & %pcre2grep% --colour=always jumps ./testdata/grepinputv & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 53 ------------------------------>>testtrygrep
(pushd %srcdir% & %pcre2grep% --file-offsets "before|zero|after" ./testdata/grepinput & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 54 ------------------------------>>testtrygrep
(pushd %srcdir% & %pcre2grep% --line-offsets "before|zero|after" ./testdata/grepinput & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 55 ----------------------------->>testtrygrep
(pushd %srcdir% & %pcre2grep% -f./testdata/greplist --color=always ./testdata/grepinputx & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 56 ----------------------------->>testtrygrep
(pushd %srcdir% & %pcre2grep% -c --exclude=grepinputC lazy ./testdata/grepinput* & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 57 ----------------------------->>testtrygrep
(pushd %srcdir% & %pcre2grep% -c -l --exclude=grepinputC lazy ./testdata/grepinput* & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

:: Tests 58-61: the same pattern supplied through the four spellings
:: --regex=, --regexp=, --regex <arg> and --regexp <arg>.
echo ---------------------------- Test 58 ----------------------------->>testtrygrep
(pushd %srcdir% & %pcre2grep% --regex=PATTERN ./testdata/grepinput & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 59 ----------------------------->>testtrygrep
(pushd %srcdir% & %pcre2grep% --regexp=PATTERN ./testdata/grepinput & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 60 ----------------------------->>testtrygrep
(pushd %srcdir% & %pcre2grep% --regex PATTERN ./testdata/grepinput & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 61 ----------------------------->>testtrygrep
(pushd %srcdir% & %pcre2grep% --regexp PATTERN ./testdata/grepinput & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 62 ----------------------------->>testtrygrep
(pushd %srcdir% & %pcre2grep% --match-limit=1000 --no-jit -M "This is a file(.|\R)*file." ./testdata/grepinput & popd) >>testtrygrep 2>&1
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 63 ----------------------------->>testtrygrep
(pushd %srcdir% & %pcre2grep% --recursion-limit=1000 --no-jit -M "This is a file(.|\R)*file." ./testdata/grepinput & popd) >>testtrygrep 2>&1
echo RC=^%ERRORLEVEL%>>testtrygrep

:: Tests 64-68: the same pattern with -o1, -o2, -o3, -o12 and --only-matching=2.
echo ---------------------------- Test 64 ------------------------------>>testtrygrep
(pushd %srcdir% & %pcre2grep% -o1 "(?<=PAT)TERN (ap(pear)s)" ./testdata/grepinput & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 65 ------------------------------>>testtrygrep
(pushd %srcdir% & %pcre2grep% -o2 "(?<=PAT)TERN (ap(pear)s)" ./testdata/grepinput & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 66 ------------------------------>>testtrygrep
(pushd %srcdir% & %pcre2grep% -o3 "(?<=PAT)TERN (ap(pear)s)" ./testdata/grepinput & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 67 ------------------------------>>testtrygrep
(pushd %srcdir% & %pcre2grep% -o12 "(?<=PAT)TERN (ap(pear)s)" ./testdata/grepinput & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 68 ------------------------------>>testtrygrep
(pushd %srcdir% & %pcre2grep% --only-matching=2 "(?<=PAT)TERN (ap(pear)s)" ./testdata/grepinput & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 69 ----------------------------->>testtrygrep
(pushd %srcdir% & %pcre2grep% -vn --colour=always pattern ./testdata/grepinputx & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

:: Test 70: the same multiline pattern with and without --color=always and -n.
echo ---------------------------- Test 70 ----------------------------->>testtrygrep
(pushd %srcdir% & %pcre2grep% --color=always -M "triple:\t.*\n\n" ./testdata/grepinput3 & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep
(pushd %srcdir% & %pcre2grep% --color=always -M -n "triple:\t.*\n\n" ./testdata/grepinput3 & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep
(pushd %srcdir% & %pcre2grep% -M "triple:\t.*\n\n" ./testdata/grepinput3 & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep
(pushd %srcdir% & %pcre2grep% -M -n "triple:\t.*\n\n" ./testdata/grepinput3 & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

:: Tests 71-82: four patterns, each run with -o, with --color=always, and with
:: both options together.
echo ---------------------------- Test 71 ----------------------------->>testtrygrep
(pushd %srcdir% & %pcre2grep% -o "^01|^02|^03" ./testdata/grepinput & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 72 ----------------------------->>testtrygrep
(pushd %srcdir% & %pcre2grep% --color=always "^01|^02|^03" ./testdata/grepinput & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 73 ----------------------------->>testtrygrep
(pushd %srcdir% & %pcre2grep% -o --colour=always "^01|^02|^03" ./testdata/grepinput & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 74 ----------------------------->>testtrygrep
(pushd %srcdir% & %pcre2grep% -o "^01|02|^03" ./testdata/grepinput & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 75 ----------------------------->>testtrygrep
(pushd %srcdir% & %pcre2grep% --color=always "^01|02|^03" ./testdata/grepinput & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 76 ----------------------------->>testtrygrep
(pushd %srcdir% & %pcre2grep% -o --colour=always "^01|02|^03" ./testdata/grepinput & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 77 ----------------------------->>testtrygrep
(pushd %srcdir% & %pcre2grep% -o "^01|^02|03" ./testdata/grepinput & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 78 ----------------------------->>testtrygrep
(pushd %srcdir% & %pcre2grep% --color=always "^01|^02|03" ./testdata/grepinput & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 79 ----------------------------->>testtrygrep
(pushd %srcdir% & %pcre2grep% -o --colour=always "^01|^02|03" ./testdata/grepinput & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 80 ----------------------------->>testtrygrep
(pushd %srcdir% & %pcre2grep% -o "\b01|\b02" ./testdata/grepinput & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 81 ----------------------------->>testtrygrep
(pushd %srcdir% & %pcre2grep% --color=always "\b01|\b02" ./testdata/grepinput & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 82 ----------------------------->>testtrygrep
(pushd %srcdir% & %pcre2grep% -o --colour=always "\b01|\b02" ./testdata/grepinput & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 83 ----------------------------->>testtrygrep
(pushd %srcdir% & %pcre2grep% --buffer-size=10 --max-buffer-size=100 "^a" ./testdata/grepinput3 & popd) >>testtrygrep 2>&1
echo RC=^%ERRORLEVEL%>>testtrygrep

:: Test 84: a second file list is written to testtemp1grep in the current
:: directory and passed to pcre2grep through its %builddir% path.
echo ---------------------------- Test 84 ----------------------------->>testtrygrep
echo testdata/grepinput3 >testtemp1grep
(pushd %srcdir% & %pcre2grep% --file-list ./testdata/grepfilelist --file-list %builddir%\testtemp1grep "fox|complete|t7" & popd) >>testtrygrep 2>&1
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 85 ----------------------------->>testtrygrep
(pushd %srcdir% & %pcre2grep% --file-list=./testdata/grepfilelist "dolor" ./testdata/grepinput3 & popd) >>testtrygrep 2>&1
echo RC=^%ERRORLEVEL%>>testtrygrep

:: Tests 86-93: searches of ./testdata/grepbinary with the various
:: binary-file options (-I, -a, --binary-files=..., --text).
echo ---------------------------- Test 86 ----------------------------->>testtrygrep
(pushd %srcdir% & %pcre2grep% "dog" ./testdata/grepbinary & popd) >>testtrygrep 2>&1
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 87 ----------------------------->>testtrygrep
(pushd %srcdir% & %pcre2grep% "cat" ./testdata/grepbinary & popd) >>testtrygrep 2>&1
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 88 ----------------------------->>testtrygrep
(pushd %srcdir% & %pcre2grep% -v "cat" ./testdata/grepbinary & popd) >>testtrygrep 2>&1
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 89 ----------------------------->>testtrygrep
(pushd %srcdir% & %pcre2grep% -I "dog" ./testdata/grepbinary & popd) >>testtrygrep 2>&1
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 90 ----------------------------->>testtrygrep
(pushd %srcdir% & %pcre2grep% --binary-files=without-match "dog" ./testdata/grepbinary & popd) >>testtrygrep 2>&1
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 91 ----------------------------->>testtrygrep
(pushd %srcdir% & %pcre2grep% -a "dog" ./testdata/grepbinary & popd) >>testtrygrep 2>&1
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 92 ----------------------------->>testtrygrep
(pushd %srcdir% & %pcre2grep% --binary-files=text "dog" ./testdata/grepbinary & popd) >>testtrygrep 2>&1
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 93 ----------------------------->>testtrygrep
(pushd %srcdir% & %pcre2grep% --text "dog" ./testdata/grepbinary & popd) >>testtrygrep 2>&1
echo RC=^%ERRORLEVEL%>>testtrygrep

:: Tests 94 and 96-99 pipe the file names printed by -L through sort before
:: they are appended to testtrygrep.
echo ---------------------------- Test 94 ----------------------------->>testtrygrep
(pushd %srcdir% & %pcre2grep% -L -r --include=grepinputx --include grepinput8 "fox" ./testdata/grepinput* | sort & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 95 ----------------------------->>testtrygrep
(pushd %srcdir% & %pcre2grep% --file-list ./testdata/grepfilelist --exclude grepinputv "fox|complete" & popd) >>testtrygrep 2>&1
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 96 ----------------------------->>testtrygrep
(pushd %srcdir% & %pcre2grep% -L -r --include-dir=testdata --exclude "^^(?^!grepinput)" --exclude=grepinput[MCU] "fox" ./test* | sort & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

:: Tests 97-99 build their exclusion lists in testtemp1grep (and testtemp2grep
:: for Test 99) immediately before use.
echo ---------------------------- Test 97 ----------------------------->>testtrygrep
echo grepinput$>testtemp1grep
echo grepinput8>>testtemp1grep
echo grepinputBad8>>testtemp1grep
(pushd %srcdir% & %pcre2grep% -L -r --include=grepinput --exclude=grepinput[MCU] --exclude-from %builddir%\testtemp1grep --exclude-dir="^\." "fox" ./testdata | sort & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 98 ----------------------------->>testtrygrep
echo grepinput$>testtemp1grep
echo grepinput8>>testtemp1grep
echo grepinputBad8>>testtemp1grep
(pushd %srcdir% & %pcre2grep% -L -r --exclude=grepinput3 --exclude=grepinput[MCU] --include=grepinput --exclude-from %builddir%\testtemp1grep --exclude-dir="^\." "fox" ./testdata | sort & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 99 ----------------------------->>testtrygrep
echo grepinput$>testtemp1grep
echo grepinput8>testtemp2grep
echo grepinputBad8>>testtemp1grep
(pushd %srcdir% & %pcre2grep% -L -r --include grepinput --exclude=grepinput[MCU] --exclude-from %builddir%\testtemp1grep --exclude-from=%builddir%\testtemp2grep --exclude-dir="^\." "fox" ./testdata | sort & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 100 ------------------------------>>testtrygrep
(pushd %srcdir% & %pcre2grep% -Ho2 --only-matching=1 -o3 "(\w+) binary (\w+)(\.)?" ./testdata/grepinput & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 101 ------------------------------>>testtrygrep
(pushd %srcdir% & %pcre2grep% -o3 -Ho2 -o12 --only-matching=1 -o3 --colour=always --om-separator="|" "(\w+) binary (\w+)(\.)?" ./testdata/grepinput & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 102 ----------------------------->>testtrygrep
(pushd %srcdir% & %pcre2grep% -n "^$" ./testdata/grepinput3 & popd) >>testtrygrep 2>&1
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 103 ----------------------------->>testtrygrep
(pushd %srcdir% & %pcre2grep% --only-matching "^$" ./testdata/grepinput3 & popd) >>testtrygrep 2>&1
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 104 ----------------------------->>testtrygrep
(pushd %srcdir% & %pcre2grep% -n --only-matching "^$" ./testdata/grepinput3 & popd) >>testtrygrep 2>&1
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 105 ----------------------------->>testtrygrep
(pushd %srcdir% & %pcre2grep% --colour=always "ipsum|" ./testdata/grepinput3 & popd) >>testtrygrep 2>&1
echo RC=^%ERRORLEVEL%>>testtrygrep

:: Test 106: the input line is piped to pcre2grep from "echo a".
echo ---------------------------- Test 106 ----------------------------->>testtrygrep
(pushd %srcdir% & echo a| %pcre2grep% -M "|a" & popd) >>testtrygrep 2>&1
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 107 ----------------------------->>testtrygrep
echo a>testtemp1grep
echo aaaaa>>testtemp1grep
(pushd %srcdir% & %pcre2grep% --line-offsets --allow-lookaround-bsk "(?<=\Ka)" %builddir%\testtemp1grep & popd) >>testtrygrep 2>&1
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 108 ------------------------------>>testtrygrep
(pushd %srcdir% & %pcre2grep% -lq PATTERN ./testdata/grepinput ./testdata/grepinputx & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 109 ----------------------------->>testtrygrep
(pushd %srcdir% & %pcre2grep% -cq --exclude=grepinputC lazy ./testdata/grepinput* & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 110 ----------------------------->>testtrygrep
(pushd %srcdir% & %pcre2grep% --om-separator / -Mo0 -o1 -o2 "match (\d+):\n (.)\n" testdata/grepinput & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 111 ----------------------------->>testtrygrep
(pushd %srcdir% & %pcre2grep% --line-offsets -M "match (\d+):\n (.)\n" testdata/grepinput & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 112 ----------------------------->>testtrygrep
(pushd %srcdir% & %pcre2grep% --file-offsets -M "match (\d+):\n (.)\n" testdata/grepinput & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

:: Tests 113-118: --total-count and the short-option combinations -tc, -tlc,
:: -th, -tch and -tL over testdata/grepinput*.
echo ---------------------------- Test 113 ----------------------------->>testtrygrep
(pushd %srcdir% & %pcre2grep% --total-count --exclude=grepinputC "the" testdata/grepinput* & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 114 ----------------------------->>testtrygrep
(pushd %srcdir% & %pcre2grep% -tc --exclude=grepinputC "the" testdata/grepinput* & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 115 ----------------------------->>testtrygrep
(pushd %srcdir% & %pcre2grep% -tlc --exclude=grepinputC "the" testdata/grepinput* & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 116 ----------------------------->>testtrygrep
(pushd %srcdir% & %pcre2grep% --exclude=grepinput[MCU] -th "the" testdata/grepinput* & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 117 ----------------------------->>testtrygrep
(pushd %srcdir% & %pcre2grep% -tch --exclude=grepinputC "the" testdata/grepinput* & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 118 ----------------------------->>testtrygrep
(pushd %srcdir% & %pcre2grep% -tL --exclude=grepinputC "the" testdata/grepinput* & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

:: Test 119: the input is generated with %printf% into testNinputgrep in the
:: current directory, so no pushd is used.
echo ---------------------------- Test 119 ----------------------------->>testtrygrep
%printf% "123\n456\n789\n---abc\ndef\nxyz\n---\n" >testNinputgrep
%pcre2grep% -Mo "(\n|[^-])*---" testNinputgrep >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

:: Test 120: eight invocations exercising different -O / --output templates;
:: the last five also capture stderr.
echo ---------------------------- Test 120 ------------------------------>>testtrygrep
(pushd %srcdir% & %pcre2grep% -HO "$0:$2$1$3" "(\w+) binary (\w+)(\.)?" ./testdata/grepinput & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep
(pushd %srcdir% & %pcre2grep% -HO "$&:$2$1$3" "(\w+) binary (\w+)(\.)?" ./testdata/grepinput & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep
(pushd %srcdir% & %pcre2grep% -m 1 -O "$0:$a$b$e$f$r$t$v" "(\w+) binary (\w+)(\.)?" ./testdata/grepinput & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep
(pushd %srcdir% & %pcre2grep% -HO "${X}" "(\w+) binary (\w+)(\.)?" ./testdata/grepinput & popd) >>testtrygrep 2>&1
echo RC=^%ERRORLEVEL%>>testtrygrep
(pushd %srcdir% & %pcre2grep% -HO "XX$" "(\w+) binary (\w+)(\.)?" ./testdata/grepinput & popd) >>testtrygrep 2>&1
echo RC=^%ERRORLEVEL%>>testtrygrep
(pushd %srcdir% & %pcre2grep% -O "$x{12345678}" "(\w+) binary (\w+)(\.)?" ./testdata/grepinput & popd) >>testtrygrep 2>&1
echo RC=^%ERRORLEVEL%>>testtrygrep
(pushd %srcdir% & %pcre2grep% -O "$x{123Z" "(\w+) binary (\w+)(\.)?" ./testdata/grepinput & popd) >>testtrygrep 2>&1
echo RC=^%ERRORLEVEL%>>testtrygrep
(pushd %srcdir% & %pcre2grep% --output "$x{1234}" "(\w+) binary (\w+)(\.)?" ./testdata/grepinput & popd) >>testtrygrep 2>&1
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 121 ----------------------------->>testtrygrep
(pushd %srcdir% & %pcre2grep% -F "\E and (regex)" testdata/grepinputv & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 122 ----------------------------->>testtrygrep
(pushd %srcdir% & %pcre2grep% -w "cat|dog" testdata/grepinputv & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 123 ----------------------------->>testtrygrep
(pushd %srcdir% & %pcre2grep% -w "dog|cat" testdata/grepinputv & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 124 ----------------------------->>testtrygrep
(pushd %srcdir% & %pcre2grep% -Mn --colour=always "start[\s]+end" testdata/grepinputM & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep
(pushd %srcdir% & %pcre2grep% -Mn --colour=always -A2 "start[\s]+end" testdata/grepinputM & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep
(pushd %srcdir% & %pcre2grep% -Mn "start[\s]+end" testdata/grepinputM & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep
(pushd %srcdir% & %pcre2grep% -Mn -A2 "start[\s]+end" testdata/grepinputM & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

:: Test 125: the last invocation runs with GREP_COLORS set to ms=1;20; the
:: variable is cleared again before the exit code is recorded.
echo ---------------------------- Test 125 ----------------------------->>testtrygrep
%printf% "abcd\n" >testNinputgrep
%pcre2grep% --colour=always --allow-lookaround-bsk "(?<=\K.)" testNinputgrep >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep
%pcre2grep% --colour=always --allow-lookaround-bsk "(?=.\K)" testNinputgrep >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep
%pcre2grep% --colour=always --allow-lookaround-bsk "(?<=\K[ac])" testNinputgrep >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep
%pcre2grep% --colour=always --allow-lookaround-bsk "(?=[ac]\K)" testNinputgrep >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep
set GREP_COLORS=ms=1;20
%pcre2grep% --colour=always --allow-lookaround-bsk "(?=[ac]\K)" testNinputgrep >>testtrygrep
set GREP_COLORS=
echo RC=^%ERRORLEVEL%>>testtrygrep

:: Test 126: pattern files written with %printf%; the first contains a binary
:: zero, the second an invalid pattern.
echo ---------------------------- Test 126 ----------------------------->>testtrygrep
%printf% "Next line pattern has binary zero\nABC\0XYZ\n" >testtemp1grep
%printf% "ABC\0XYZ\nABCDEF\nDEFABC\n" >testtemp2grep
%pcre2grep% -a -f testtemp1grep testtemp2grep >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep
%printf% "Next line pattern is erroneous.\n^abc)(xy" >testtemp1grep
%pcre2grep% -a -f testtemp1grep testtemp2grep >>testtrygrep 2>&1
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 127 ----------------------------->>testtrygrep
(pushd %srcdir% & %pcre2grep% -o --om-capture=0 "pattern()()()()" testdata/grepinput & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 128 ----------------------------->>testtrygrep
(pushd %srcdir% & %pcre2grep% -m1M -o1 --om-capture=0 "pattern()()()()" testdata/grepinput & popd) >>testtrygrep 2>&1
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 129 ----------------------------->>testtrygrep
(pushd %srcdir% & %pcre2grep% -m 2 "fox" testdata/grepinput & popd) >>testtrygrep 2>&1
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 130 ----------------------------->>testtrygrep
(pushd %srcdir% & %pcre2grep% -o -m2 "fox" testdata/grepinput & popd) >>testtrygrep 2>&1
echo RC=^%ERRORLEVEL%>>testtrygrep

:: The "echo" below is the head of the Test 131 banner statement, whose
:: remainder sits on the following line of the file.
echo
---------------------------- Test 131 ----------------------------->>testtrygrep
(pushd %srcdir% & %pcre2grep% -oc -m2 "fox" testdata/grepinput & popd) >>testtrygrep 2>&1
echo RC=^%ERRORLEVEL%>>testtrygrep

:: Tests 131-148 (first part). Each test appends a banner line, the pcre2grep
:: output and an "RC=" exit-code line to testtrygrep. Every redirection into
:: testtrygrep in this section must append (>>): the file is compared against
:: the reference output only after Test 160, so truncating it with ">" would
:: discard the results of all earlier tests.

echo ---------------------------- Test 132 ----------------------------->>testtrygrep
:: The Unix tests use fd3 here, but Windows only has StdIn/StdOut/StdErr (which, at the kernel
:: level, are not even numbered). Use a subshell instead.
:: Both commands of the inner group share the single stdin redirection from
:: testdata\grepinput; the path is relative because the redirection is opened
:: after "pushd %srcdir%" has run.
(pushd %srcdir% & (%pcre2grep% -m1 -A3 "^match" & echo ---& %pcre2grep% -m1 ".*") <testdata\grepinput & popd) >>testtrygrep 2>&1
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 133 ----------------------------->>testtrygrep
:: The Unix tests use fd3 here, but Windows only has StdIn/StdOut/StdErr (which, at the kernel
:: level, are not even numbered). Use a subshell instead.
(pushd %srcdir% & (%pcre2grep% -m1 -A3 "^match" & echo ---& %pcre2grep% -m1 -A3 "^match") <testdata\grepinput & popd) >>testtrygrep 2>&1
echo RC=^%ERRORLEVEL%>>testtrygrep

:: Test 134: "-" names stdin as the input file; stdin is redirected on the
:: outer group, so the path is given through %srcdir%.
echo ---------------------------- Test 134 ----------------------------->>testtrygrep
(pushd %srcdir% & %pcre2grep% --max-count=1 -nH -O "=$x{41}$x423$o{103}$o1045=" "fox" - & popd) <%srcdir%\testdata\grepinputv >>testtrygrep 2>&1
echo RC=^%ERRORLEVEL%>>testtrygrep

:: Test 135: the first three invocations pipe their output through %trnull%
:: before it is appended to testtrygrep.
echo ---------------------------- Test 135 ----------------------------->>testtrygrep
(pushd %srcdir% & %pcre2grep% -HZ "word" ./testdata/grepinputv & popd) | %trnull% >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep
(pushd %srcdir% & %pcre2grep% -lZ "word" ./testdata/grepinputv ./testdata/grepinputv & popd) | %trnull% >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep
(pushd %srcdir% & %pcre2grep% -A 1 -B 1 -HZ "word" ./testdata/grepinputv & popd) | %trnull% >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep
(pushd %srcdir% & %pcre2grep% -MHZn "start[\s]+end" testdata/grepinputM & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 136 ----------------------------->>testtrygrep
(pushd %srcdir% & %pcre2grep% -m1MK -o1 --om-capture=0 "pattern()()()()" testdata/grepinput & popd) >>testtrygrep 2>&1
echo RC=^%ERRORLEVEL%>>testtrygrep
(pushd %srcdir% & %pcre2grep% --max-count=1MK -o1 --om-capture=0 "pattern()()()()" testdata/grepinput & popd) >>testtrygrep 2>&1
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 137 ----------------------------->>testtrygrep
%printf% "Last line\nhas no newline" >testtemp1grep
%pcre2grep% -A1 Last testtemp1grep >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 138 ----------------------------->>testtrygrep
%printf% "AbC\nAbC\nAbC\nAbC\nAbC\nAbC\nAbC\nAbC\nAbC\nAbC\nAbC\nAbC\nAbC\nAbC\nAbC\nAbC\nAbC\nAbC\nAbC\nAbC\nAbC\nAbC\nAbC\nAbC\n" >testtemp1grep
%pcre2grep% --no-jit --heap-limit=0 b testtemp1grep >>testtrygrep 2>&1
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 139 ----------------------------->>testtrygrep
(pushd %srcdir% & %pcre2grep% --line-buffered "fox" testdata/grepinputv & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 140 ----------------------------->>testtrygrep
(pushd %srcdir% & %pcre2grep% --buffer-size=10 -A1 "brown" testdata/grepinputv & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

:: Test 141: the file list names grepinputv and "-" (stdin). The stdin text is
:: written to testtemp2grep and redirected into pcre2grep.
:: NOTE(review): the format string contains "\testdata"; whether %printf%
:: treats "\t" as a tab escape depends on how %printf% is defined earlier in
:: the file - confirm.
echo ---------------------------- Test 141 ----------------------------->>testtrygrep
%printf% "%%s\testdata\grepinputv\n-\n" "%srcdir%" >testtemp1grep
%printf% "This is a line from stdin." >testtemp2grep
%pcre2grep% --file-list testtemp1grep "line from stdin" <testtemp2grep >>testtrygrep 2>&1
echo RC=^%ERRORLEVEL%>>testtrygrep

:: Test 142: the file list names only a non-existent file.
echo ---------------------------- Test 142 ----------------------------->>testtrygrep
%printf% "/does/not/exist\n" >testtemp1grep
%printf% "This is a line from stdin." >testtemp2grep
%pcre2grep% --file-list testtemp1grep "line from stdin" >>testtrygrep 2>&1
echo RC=^%ERRORLEVEL%>>testtrygrep

:: Test 143: "-f -" reads the patterns from stdin, supplied from testtemp1grep.
echo ---------------------------- Test 143 ----------------------------->>testtrygrep
%printf% "fox|cat" >testtemp1grep
%pcre2grep% -f - %srcdir%\testdata\grepinputv <testtemp1grep >>testtrygrep 2>&1
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 144 ----------------------------->>testtrygrep
%pcre2grep% -f /non/exist %srcdir%\testdata\grepinputv >>testtrygrep 2>&1
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 145 ----------------------------->>testtrygrep
%printf% "*meta*\rdog." >testtemp1grep
%pcre2grep% -Ncr -F -f testtemp1grep %srcdir%\testdata\grepinputv >>testtrygrep 2>&1
echo RC=^%ERRORLEVEL%>>testtrygrep

:: Test 146: "-" as a file argument means stdin, supplied from testtemp1grep
:: for all three invocations.
echo ---------------------------- Test 146 ----------------------------->>testtrygrep
%printf% "A123B" >testtemp1grep
%pcre2grep% -H -e "123|fox" - <testtemp1grep >>testtrygrep 2>&1
echo RC=^%ERRORLEVEL%>>testtrygrep
%pcre2grep% -h -e "123|fox" - %srcdir%\testdata\grepinputv <testtemp1grep >>testtrygrep 2>&1
echo RC=^%ERRORLEVEL%>>testtrygrep
%pcre2grep% - %srcdir%\testdata\grepinputv <testtemp1grep >>testtrygrep 2>&1
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 147 ----------------------------->>testtrygrep
%pcre2grep% -e "123|fox" -- -nonfile >>testtrygrep 2>&1
echo RC=^%ERRORLEVEL%>>testtrygrep

:: Test 148: a series of invalid command lines; the messages on stderr and
:: the exit codes are recorded.
echo ---------------------------- Test 148 ----------------------------->>testtrygrep
%pcre2grep% --nonexist >>testtrygrep 2>&1
echo RC=^%ERRORLEVEL%>>testtrygrep
%pcre2grep% -n-n-bad >>testtrygrep 2>&1
echo RC=^%ERRORLEVEL%>>testtrygrep
%pcre2grep% --context >>testtrygrep 2>&1
echo RC=^%ERRORLEVEL%>>testtrygrep
%pcre2grep% --only-matching --output=xx >>testtrygrep 2>&1
echo RC=^%ERRORLEVEL%>>testtrygrep
%pcre2grep% --colour=badvalue >>testtrygrep 2>&1
echo RC=^%ERRORLEVEL%>>testtrygrep
%pcre2grep% --newline=badvalue >>testtrygrep 2>&1
echo RC=^%ERRORLEVEL%>>testtrygrep
%pcre2grep% -d badvalue >>testtrygrep 2>&1

:: The "echo" below is the head of an "RC=" statement whose remainder sits on
:: the following line of the file.
echo
RC=^%ERRORLEVEL%>>testtrygrep

:: Remainder of Test 148: further invalid option values and non-existent
:: files; stderr and the exit code are recorded for each invocation.
%pcre2grep% -D badvalue >>testtrygrep 2>&1
echo RC=^%ERRORLEVEL%>>testtrygrep
%pcre2grep% --buffer-size=0 >>testtrygrep 2>&1
echo RC=^%ERRORLEVEL%>>testtrygrep
%pcre2grep% --exclude "(badpat" abc /dev/null >>testtrygrep 2>&1
echo RC=^%ERRORLEVEL%>>testtrygrep
%pcre2grep% --exclude-from /non/exist abc /dev/null >>testtrygrep 2>&1
echo RC=^%ERRORLEVEL%>>testtrygrep
%pcre2grep% --include-from /non/exist abc /dev/null >>testtrygrep 2>&1
echo RC=^%ERRORLEVEL%>>testtrygrep
%pcre2grep% --file-list=/non/exist abc /dev/null >>testtrygrep 2>&1
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 149 ----------------------------->>testtrygrep
(pushd %srcdir% & %pcre2grep% --binary-files=binary "dog" ./testdata/grepbinary & popd) >>testtrygrep 2>&1
echo RC=^%ERRORLEVEL%>>testtrygrep
(pushd %srcdir% & %pcre2grep% --binary-files=wrong "dog" ./testdata/grepbinary & popd) >>testtrygrep 2>&1
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 150 ----------------------------->>testtrygrep
:: The Unix version of this test checks whether locales are supported. On Windows,
:: we assume they always are.
:: LC_ALL is cleared and LC_CTYPE is set to locale.bad for this one invocation;
:: LC_CTYPE is cleared again afterwards.
set LC_ALL=
set LC_CTYPE=locale.bad
(pushd %srcdir% & %pcre2grep% abc /dev/null & popd) >>testtrygrep 2>&1
echo RC=^%ERRORLEVEL%>>testtrygrep
set LC_CTYPE=

echo ---------------------------- Test 151 ----------------------------->>testtrygrep
(pushd %srcdir% & %pcre2grep% --colour=always -e this -e The -e "The wo" testdata/grepinputv & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 152 ----------------------------->>testtrygrep
(pushd %srcdir% & %pcre2grep% -nA3 --group-separator="++" "four" ./testdata/grepinputx & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 153 ----------------------------->>testtrygrep
(pushd %srcdir% & %pcre2grep% -nA3 --no-group-separator "four" ./testdata/grepinputx & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

:: Test 154: "echo. >nul 2>testtemp1grep" sends the echoed line to nul and only
:: stderr to the file, which leaves testtemp1grep empty.
echo ---------------------------- Test 154 ----------------------------->>testtrygrep
echo. >nul 2>testtemp1grep
(pushd %srcdir% & %pcre2grep% -f %builddir%\testtemp1grep ./testdata/grepinputv & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

:: Test 155: here the echoed line itself is written to the pattern file.
echo ---------------------------- Test 155 ----------------------------->>testtrygrep
echo. >testtemp1grep
(pushd %srcdir% & %pcre2grep% -f %builddir%\testtemp1grep ./testdata/grepinputv & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

:: Tests 156-159: pattern files written with %printf% so that their exact
:: bytes (including the line terminator) are controlled by the format string.
echo ---------------------------- Test 156 ----------------------------->>testtrygrep
%printf% "\n" >testtemp1grep
(pushd %srcdir% & %pcre2grep% --posix-pattern-file --file %builddir%\testtemp1grep ./testdata/grepinputv & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

:: Test 157: the first pcre2grep writes its -o output to testtemp2grep; the
:: second runs only if the first succeeded (&&) and checks that file quietly.
echo ---------------------------- Test 157 ----------------------------->>testtrygrep
%printf% "spaces \n" >testtemp1grep
(pushd %srcdir% & %pcre2grep% -o --posix-pattern-file --file=%builddir%\testtemp1grep ./testdata/grepinputv >%builddir%\testtemp2grep && %pcre2grep% -q "s " %builddir%\testtemp2grep & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 158 ----------------------------->>testtrygrep
%printf% "spaces.\n" >testtemp1grep
(pushd %srcdir% & %pcre2grep% -f %builddir%\testtemp1grep ./testdata/grepinputv & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 159 ----------------------------->>testtrygrep
%printf% "spaces.\r\n" >testtemp1grep
(pushd %srcdir% & %pcre2grep% --posix-pattern-file -f%builddir%\testtemp1grep ./testdata/grepinputv & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

echo ---------------------------- Test 160 ----------------------------->>testtrygrep
(pushd %srcdir% & %pcre2grep% -nC3 "^(ert|jkl)" ./testdata/grepinput & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep
(pushd %srcdir% & %pcre2grep% -n -B4 -A2 "^(ert|dfg)" ./testdata/grepinput & popd) >>testtrygrep
echo RC=^%ERRORLEVEL%>>testtrygrep

:: Now compare the results.
:: Compare the accumulated output with the reference file using %cf%, and
:: leave the script with exit code 1 if %cf% set ERRORLEVEL to 1 or more.
%cf% %srcdir%\testdata\grepoutput testtrygrep %cfout%
if ERRORLEVEL 1 exit /b 1

:: These tests require UTF-8 support
:: The whole group runs inside one parenthesised block, and the exit code is
:: read as !ERRORLEVEL! instead of %ERRORLEVEL% there.
:: NOTE(review): this relies on delayed expansion having been enabled earlier
:: in the file - confirm. Test U1 starts a fresh testtrygrep (single ">"),
:: which is compared against grepoutput8 at the end of the block.
if %utf8% neq 0 (
  echo Testing pcre2grep UTF-8 features

  echo ---------------------------- Test U1 ------------------------------>testtrygrep
  (pushd %srcdir% & %pcre2grep% -n -u --newline=any "^X" ./testdata/grepinput8 & popd) >>testtrygrep
  echo RC=^!ERRORLEVEL!>>testtrygrep

  echo ---------------------------- Test U2 ------------------------------>>testtrygrep
  (pushd %srcdir% & %pcre2grep% -n -u -C 3 --newline=any "Match" ./testdata/grepinput8 & popd) >>testtrygrep
  echo RC=^!ERRORLEVEL!>>testtrygrep

  echo ---------------------------- Test U3 ------------------------------>>testtrygrep
  (pushd %srcdir% & %pcre2grep% --line-offsets -u --newline=any --allow-lookaround-bsk "(?<=\K\x{17f})" ./testdata/grepinput8 & popd) >>testtrygrep
  echo RC=^!ERRORLEVEL!>>testtrygrep

  echo ---------------------------- Test U4 ------------------------------>>testtrygrep
  (pushd %srcdir% & %pcre2grep% -u -o "...." ./testdata/grepinputBad8 & popd) >>testtrygrep 2>&1
  echo RC=^!ERRORLEVEL!>>testtrygrep

  echo ---------------------------- Test U5 ------------------------------>>testtrygrep
  (pushd %srcdir% & %pcre2grep% -U -o "...." ./testdata/grepinputBad8 & popd) >>testtrygrep
  echo RC=^!ERRORLEVEL!>>testtrygrep

  echo ---------------------------- Test U6 ----------------------------->>testtrygrep
  (pushd %srcdir% & %pcre2grep% -u -m1 -O "=$x{1d3}$o{744}=" "fox" & popd) <%srcdir%\testdata\grepinputv >>testtrygrep 2>&1
  echo RC=^!ERRORLEVEL!>>testtrygrep

  echo ---------------------------- Test U7 ------------------------------>>testtrygrep
  (pushd %srcdir% & %pcre2grep% -ui --colour=always "k+|\babc\b" ./testdata/grepinput8 & popd) >>testtrygrep
  echo RC=^!ERRORLEVEL!>>testtrygrep

  echo ---------------------------- Test U8 ------------------------------>>testtrygrep
  (pushd %srcdir% & %pcre2grep% -UiEP --colour=always "k+|\babc\b" ./testdata/grepinput8 & popd) >>testtrygrep
  echo RC=^!ERRORLEVEL!>>testtrygrep

  echo ---------------------------- Test U9 ------------------------------>>testtrygrep
  (pushd %srcdir% & %pcre2grep% -u --colour=always "A\d" ./testdata/grepinput8 & popd) >>testtrygrep
  echo RC=^!ERRORLEVEL!>>testtrygrep

  echo ---------------------------- Test U10 ------------------------------>>testtrygrep
  (pushd %srcdir% & %pcre2grep% -u --posix-digit --colour=always "A\d" ./testdata/grepinput8 & popd) >>testtrygrep
  echo RC=^!ERRORLEVEL!>>testtrygrep

  %cf% %srcdir%\testdata\grepoutput8 testtrygrep %cfout%
  if ERRORLEVEL 1 exit /b 1
) else (
  echo Skipping pcre2grep UTF-8 tests: no UTF-8 support in PCRE2 library
)

:: We go to some contortions to try to ensure that the tests for the various
:: newline settings will work in environments where the normal newline sequence
:: is not \n. Do not use exported files, whose line endings might be changed.
:: Instead, create an input file so that its contents are exactly what we want.
:: These tests are run in the build directory.
echo Testing pcre2grep newline settings %printf% "abc\rdef\r\nghi\njkl" >testNinputgrep echo ---------------------------- Test N1 ------------------------------>testtrygrep %pcre2grep% -n -N CR "^(abc|def|ghi|jkl)" testNinputgrep >>testtrygrep echo RC=^%ERRORLEVEL%>>testtrygrep %pcre2grep% -B1 -n -N CR "^def" testNinputgrep >>testtrygrep echo RC=^%ERRORLEVEL%>>testtrygrep echo ---------------------------- Test N2 ------------------------------>>testtrygrep %pcre2grep% -n --newline=crlf "^(abc|def|ghi|jkl)" testNinputgrep >>testtrygrep echo RC=^%ERRORLEVEL%>>testtrygrep %pcre2grep% -B1 -n -N CRLF "^ghi" testNinputgrep >>testtrygrep echo RC=^%ERRORLEVEL%>>testtrygrep echo ---------------------------- Test N3 ------------------------------>>testtrygrep for /f %%a in ('%printf% "def\rjkl"') do set pattern=%%a %pcre2grep% -n --newline=cr -F "!pattern!" testNinputgrep >>testtrygrep echo RC=^%ERRORLEVEL%>>testtrygrep echo ---------------------------- Test N4 ------------------------------>>testtrygrep %pcre2grep% -n --newline=crlf -F -f %srcdir%\testdata\greppatN4 testNinputgrep >>testtrygrep echo RC=^%ERRORLEVEL%>>testtrygrep echo ---------------------------- Test N5 ------------------------------>>testtrygrep %pcre2grep% -n --newline=any "^(abc|def|ghi|jkl)" testNinputgrep >>testtrygrep echo RC=^%ERRORLEVEL%>>testtrygrep %pcre2grep% -B1 -n --newline=any "^def" testNinputgrep >>testtrygrep echo RC=^%ERRORLEVEL%>>testtrygrep echo ---------------------------- Test N6 ------------------------------>>testtrygrep %pcre2grep% -n --newline=anycrlf "^(abc|def|ghi|jkl)" testNinputgrep >>testtrygrep echo RC=^%ERRORLEVEL%>>testtrygrep %pcre2grep% -B1 -n --newline=anycrlf "^jkl" testNinputgrep >>testtrygrep echo RC=^%ERRORLEVEL%>>testtrygrep echo ---------------------------- Test N7 ------------------------------>>testtrygrep %printf% "xyz\0abc\0def" >testNinputgrep %pcre2grep% -na --newline=nul "^(abc|def)" testNinputgrep | %trnull% >>testtrygrep echo RC=^%ERRORLEVEL%>>testtrygrep 
%pcre2grep% -B1 -na --newline=nul "^(abc|def)" testNinputgrep | %trnull% >>testtrygrep echo RC=^%ERRORLEVEL%>>testtrygrep echo ---------------------------- Test N8 ------------------------------>>testtrygrep %pcre2grep% -na --newline=anycrlf "^a" %srcdir%\testdata\grepinputBad8_Trail >>testtrygrep echo RC=^%ERRORLEVEL%>>testtrygrep %printf% "\n" >>testtrygrep %cf% %srcdir%\testdata\grepoutputN testtrygrep %cfout% if ERRORLEVEL 1 exit /b 1 :: These newline tests need UTF support. if %utf8% neq 0 ( echo Testing pcre2grep newline settings with UTF-8 features echo ---------------------------- Test UN1 ------------------------------>testtrygrep %pcre2grep% -nau --newline=anycrlf "^(abc|def)" %srcdir%\testdata\grepinputUN >>testtrygrep echo RC=^!ERRORLEVEL!>>testtrygrep echo ---------------------------- Test UN2 ------------------------------>testtrygrep %pcre2grep% -nauU --newline=anycrlf "^a" %srcdir%\testdata\grepinputBad8_Trail >>testtrygrep echo RC=^!ERRORLEVEL!>>testtrygrep %printf% "\n" >>testtrygrep %cf% %srcdir%\testdata\grepoutputUN testtrygrep %cfout% if ERRORLEVEL 1 exit /b 1 ) else ( echo Skipping pcre2grep newline UTF-8 tests: no UTF-8 support in PCRE2 library ) :: If pcre2grep supports script callouts, run some tests on them. It is possible :: to restrict these callouts to the non-fork case, either for security, or for :: environments that do not support fork(). This is handled by comparing to a :: different output. 
%pcre2grep% --help | %pcre2grep% -q "callout scripts in patterns are supported" if %ERRORLEVEL% equ 0 ( echo Testing pcre2grep script callouts echo --- Test 1 --->testtrygrep %pcre2grep% "(T)(..(.))(?C'cmd|/c echo|Arg1: [$1] [$2] [$3]|Arg2: ^$|${1}^$| ($4) ($14) ($0)')()" %srcdir%\testdata\grepinputv >>testtrygrep echo RC=^!ERRORLEVEL!>>testtrygrep echo --- Test 2 --->>testtrygrep %pcre2grep% "(T)(..(.))()()()()()()()(..)(?C'cmd|/c echo|Arg1: [$11] [${11}]')" %srcdir%\testdata\grepinputv >>testtrygrep echo RC=^!ERRORLEVEL!>>testtrygrep echo --- Test 3 --->>testtrygrep %pcre2grep% "(T)(?C'|$0:$1$n')" %srcdir%\testdata\grepinputv >>testtrygrep echo RC=^!ERRORLEVEL!>>testtrygrep echo --- Test 4 --->>testtrygrep %pcre2grep% "(T)(?C'cscript|//nologo|printf.js|%%s\r\n|$0:$1$n')" %srcdir%\testdata\grepinputv >>testtrygrep echo RC=^!ERRORLEVEL!>>testtrygrep echo --- Test 5 --->>testtrygrep %pcre2grep% "(T)(?C'|$1$n')(*F)" %srcdir%\testdata\grepinputv >>testtrygrep echo RC=^!ERRORLEVEL!>>testtrygrep echo --- Test 6 --->>testtrygrep %pcre2grep% -m1 "(T)(?C'|$0:$1:$x{41}$o{101}$n')" %srcdir%\testdata\grepinputv >>testtrygrep echo RC=^!ERRORLEVEL!>>testtrygrep %pcre2grep% --help | %pcre2grep% -q "Non-fork callout scripts in patterns are supported" if ^!ERRORLEVEL! equ 0 ( set nonfork=1 %cf% %srcdir%\testdata\grepoutputCN testtrygrep %cfout% ) else ( set nonfork=0 %cf% %srcdir%\testdata\grepoutputC testtrygrep %cfout% ) if ERRORLEVEL 1 exit /b 1 @REM These callout tests need UTF support. if %utf8% neq 0 ( echo Testing pcre2grep script callout with UTF-8 features echo --- Test 1 --->testtrygrep %pcre2grep% -u "(T)(?C'|$0:$x{a6}$n')" %srcdir%\testdata\grepinputv >>testtrygrep echo RC=^!ERRORLEVEL!>>testtrygrep echo --- Test 2 --->>testtrygrep %pcre2grep% -u "(T)(?C'cscript|//nologo|printf.js|%%s\r\n|$0:$x{a6}$n')" %srcdir%\testdata\grepinputv >>testtrygrep echo RC=^!ERRORLEVEL!>>testtrygrep if ^!nonfork! 
equ 1 ( %cf% %srcdir%\testdata\grepoutputCNU testtrygrep %cfout% ) else ( %cf% %srcdir%\testdata\grepoutputCU testtrygrep %cfout% ) if ERRORLEVEL 1 exit /b 1 ) else ( echo Skipping pcre2grep script callout UTF-8 tests: no UTF-8 support in PCRE2 library ) ) else ( echo Script callouts are not supported ) :: Finally, some tests to exercise code that is not tested above, just to be :: sure that it runs OK. Doing this improves the coverage statistics. The output :: is not checked. echo Testing miscellaneous pcre2grep arguments (unchecked) echo. >nul 2>testtrygrep call :checkspecial "-xxxxx" 2 || exit /b 1 call :checkspecial "--help" 0 || exit /b 1 call :checkspecial "--line-buffered --colour=auto abc nul" 1 || exit /b 1 call :checkspecial "--line-buffered --color abc nul" 1 || exit /b 1 call :checkspecial "-dskip abc ." 1 || exit /b 1 call :checkspecial "-Dread -Dskip abc nul" 1 || exit /b 1 call :checkspecial "-f %srcdir%\testdata\greplistBad nul" 2 || exit /b 1 call :checkspecial "(unpaired nul" 2 || exit /b 1 call :checkspecial "-e (unpaired1 -e (unpaired2 nul" 2 || exit /b 1 :: Clean up local working files del testcf printf.js trnull.js testNinputgrep teststderrgrep testtrygrep testtemp1grep testtemp2grep exit /b 0 :: ------ Function to run and check a special pcre2grep arguments test ------- :checkspecial %pcre2grep% %~1 >>testtrygrep 2>&1 if %ERRORLEVEL% neq %2 ( echo ** pcre2grep %~1 failed - check testtrygrep exit /b 1 ) exit /b 0 :: End ================================================ FILE: RunTest ================================================ #! /bin/sh ############################################################################### # Run the PCRE2 tests using the pcre2test program. The appropriate tests are # selected, depending on which build-time options were used. # # When JIT support is available, all appropriate tests are run with and without # JIT, unless "-nojit" is given on the command line. 
There are also two tests # for JIT-specific features, one to be run when JIT support is available # (unless "-nojit" is specified), and one when it is not. # # Whichever of the 8-, 16- and 32-bit libraries exist are tested. It is also # possible to select which to test by giving "-8", "-16" or "-32" on the # command line. # # As well as "-nojit", "-8", "-16", and "-32", arguments for this script are # individual test numbers, ranges of tests such as 3-6 or 3- (meaning 3 to the # end), or a number preceded by ~ to exclude a test. For example, "3-15 ~10" # runs tests 3 to 15, excluding test 10, and just "~10" runs all the tests # except test 10. Whatever order the arguments are in, these tests are always # run in numerical order. # # If no specific tests are selected (which is the case when this script is run # via 'make check') the default is to run all the numbered tests. # # There may also be named (as well as numbered) tests for special purposes. At # present there is just one, called "heap". This test's output contains the # sizes of heap frames and frame vectors, which depend on the environment. It # is therefore not run unless explicitly requested. # # Inappropriate tests are automatically skipped (with a comment to say so). For # example, if JIT support is not compiled, test 17 is skipped, whereas if JIT # support is compiled, test 16 is skipped. # # Other arguments can be one of the words "-valgrind", "-valgrind-log", or # "-sim" followed by an argument to run cross-compiled executables under a # simulator, for example: # # RunTest 3 -sim "qemu-arm -s 8388608" # # For backwards compatibility, -nojit, -valgrind, -valgrind-log, and -sim may # be given without the leading "-" character. # # When PCRE2 is compiled by clang with -fsanitize arguments, some tests need # very much more stack than normal. In environments where the stack can be # set at runtime, -bigstack sets a gigantic stack.
#
# Special case, valid only when it is the sole argument:
#   - Invoking the script as "RunTest list" prints the list of available
#     tests; none of them are run.
###############################################################################

# The test titles are held in variables so that they can be printed as a list.
# Several of them are extended (e.g. with -8 or -16) when the tests are run.

title0="Test 0: Unchecked pcre2test argument tests (to improve coverage)"
title1="Test 1: Main non-UTF, non-UCP functionality (compatible with Perl >= 5.10)"
title2="Test 2: API, errors, internals and non-Perl stuff"
title3="Test 3: Locale-specific features"
title4A="Test 4: UTF"
title4B=" and Unicode property support (compatible with Perl >= 5.10)"
title5A="Test 5: API, internals, and non-Perl stuff for UTF"
title5B=" and UCP support"
title6="Test 6: DFA matching main non-UTF, non-UCP functionality"
title7A="Test 7: DFA matching with UTF"
title7B=" and Unicode property support"
title8="Test 8: Internal offsets and code size tests"
title9="Test 9: Specials for the basic 8-bit library"
title10="Test 10: Specials for the 8-bit library with UTF-8 and UCP support"
title11="Test 11: Specials for the basic 16-bit and 32-bit libraries"
title12="Test 12: Specials for the 16-bit and 32-bit libraries UTF and UCP support"
title13="Test 13: DFA specials for the basic 16-bit and 32-bit libraries"
title14="Test 14: DFA specials for UTF and UCP support"
title15="Test 15: Non-JIT limits and other non-JIT tests"
title16="Test 16: JIT-specific features when JIT is not available"
title17="Test 17: JIT-specific features when JIT is available"
title18="Test 18: Tests of the POSIX interface, excluding UTF/UCP"
title19="Test 19: Tests of the POSIX interface with UTF/UCP"
title20="Test 20: Serialization and code copy tests"
title21="Test 21: \C tests without UTF (supported for DFA matching)"
title22="Test 22: \C tests with UTF (not supported for DFA matching)"
title23="Test 23: \C disabled test"
title24="Test 24: Non-UTF pattern conversion tests"
title25="Test 25: UTF pattern conversion tests"
title26="Test 26: Unicode property tests (compatible with Perl >= 5.38)"
title27="Test 27: Auto-generated unicode property tests"
title28="Test 28: EBCDIC-specific tests"
title29="Test 29: EBCDIC-specific tests (for NL=0x25)"
maxtest=29
titleheap="Test 'heap': Environment-specific heap tests"

# "RunTest list": print every title, then stop without running anything.

if [ $# -eq 1 ] && [ "$1" = "list" ]; then
  # Titles that are printed with a suffix, or that come in two halves.
  echo $title0
  echo $title1
  echo $title2 "(not UTF or UCP)"
  echo $title3
  echo $title4A $title4B
  echo $title5A $title5B
  echo $title6
  echo $title7A $title7B

  # The remaining numbered titles are single variables, printed as they are.
  n=8
  while [ $n -le $maxtest ]; do
    eval echo \$title$n
    n=`expr $n + 1`
  done

  echo ""
  echo $titleheap
  echo ""
  echo "Numbered tests are automatically run if nothing selected."
  echo "Named tests must be explicitly selected."
  exit 0
fi

# Choose the "diff" command used for comparisons: gdiff is preferred when it
# runs, and -u is added only when the chosen command accepts it (some systems
# have a diff that lacks a -u option).

cf="diff"
if gdiff /dev/null /dev/null 2>/dev/null; then
  cf="gdiff"
fi
if $cf -u /dev/null /dev/null 2>/dev/null; then
  cf="$cf -u"
fi

# Find the test data: below $srcdir when that names a directory, otherwise in
# ./testdata or ../testdata, whichever is found first.

testdata=
if [ -n "$srcdir" ] && [ -d "$srcdir" ]; then
  testdata="$srcdir/testdata"
else
  for dir in ./testdata ../testdata; do
    if [ -d "$dir" ]; then
      testdata=$dir
      break
    fi
  done
fi

if [ -z "$testdata" ]; then
  echo "Cannot find the testdata directory"
  exit 1
fi

# Overall exit status of the script; set to 1 by any failing test.

yield=0

# ------ Function to check results of a test -------

# This function is called with three parameters:
#
#  $1 the value of $? after a call to pcre2test
#  $2 the suffix of the output file to compare with
#  $3 the $opt value (empty, -jit, or -dfa)
#
# Note: must define using name(), not "function name", for Solaris.
checkresult() {
  # Where the pcre2test run that is being checked wrote its output.
  actual=testoutput$bits$3/testoutput$2

  # A non-zero status from pcre2test is a failure; nothing is compared.
  if [ $1 -ne 0 ] ; then
    echo "** pcre2test failed - check $actual"
    yield=1
    return
  fi

  # Text appended to the verdict to say which matching mode was in use.
  with=""
  case "$3" in
    -jit) with=" with JIT";;
    -dfa) with=" with DFA";;
  esac

  expected="$testdata/testoutput$2"
  if [ $ebcdic -eq 1 ] ; then
    # The #if ... #endif support in pcre2test is currently used only for
    # EBCDIC testing. Pass the expected-output file through pcre2test's
    # "preprocess-only" mode (-E) so that the lines belonging to discarded
    # input are trimmed, and compare against the trimmed copy instead.
    $sim $pcre2test -q -E "$expected" >$actual-expected
    expected=$actual-expected
  fi

  if $cf "$expected" $actual ; then
    echo " OK$with"
  else
    echo ""
    echo "** Test $2 failed$with"
    yield=1
  fi
}


# ------ Function to run and check a special pcre2test arguments test -------

# Parameters:
#
#  $1 the arguments to give to pcre2test (split into words when used)
#  $2 the exit status that pcre2test is expected to return (default 0)
#
# All output is appended to testSoutput. Returns 0 when the exit status is the
# expected one; otherwise reports the failure, sets yield=1 and returns 1.

checkspecial() {
  wanted=${2:-0}
  $sim $valgrind $vjs $pcre2test $1 >>testSoutput 2>&1
  got=$?
  if [ $got -ne "$wanted" ] ; then
    echo "** pcre2test $1 failed - check testSoutput"
    yield=1
    return 1
  fi
  return 0
}


# ------ Test setup ------

# Default values: everything starts empty except for the options that are
# always given to pcre2test.

arg8=
arg16=
arg32=
nojit=
bigstack=
malloc=
sim=
skip=
valgrind=
vjs=
globalopts="-q"

# The program under test can be supplied by the caller in $pcre2test.

if [ -z "$pcre2test" ] ; then
  pcre2test=./pcre2test
fi

# This is in case the caller has set aliases (as I do - PH)

unset cp ls mv rm

[ -x $pcre2test ] || {
  echo "** $pcre2test does not exist or is not executable."
  exit 1
}

# Process options and select which tests to run; for those that are explicitly
# requested, check that the necessary optional facilities are available.
# Every numbered test, and the named "heap" test, starts off unselected.

for n in 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 \
         20 21 22 23 24 25 26 27 28 29; do
  eval do$n=no
done
doheap=no

# Work through the arguments. Each one is a test number, a test name, a range
# of test numbers, an exclusion (~N) or an option. The order of the patterns
# matters: -8, -16, -32 and valgrind-log must be tried before the generic
# range pattern *-*.

while [ $# -gt 0 ] ; do
  case $1 in
    # A single test number, 0 to 29 with no leading zero.
    [0-9]|[12][0-9]) eval do$1=yes;;

    heap) doheap=yes;;
    -8) arg8=yes;;
    -16) arg16=yes;;
    -32) arg32=yes;;
    bigstack|-bigstack) bigstack=yes;;
    malloc|-malloc) malloc=yes;;
    nojit|-nojit) nojit=yes;;

    # -sim takes the next argument as the simulator command. Insist that there
    # is one: otherwise the shift at the bottom of the loop would try to shift
    # past the end of the argument list, which some shells treat as a fatal
    # error and others silently ignore.
    sim|-sim)
      if [ $# -lt 2 ] ; then
        echo "Missing argument for '$1'"; exit 1
      fi
      shift; sim=$1;;

    valgrind|-valgrind) valgrind="valgrind --tool=memcheck -q --leak-check=yes --errors-for-leak-kinds=all --smc-check=all-non-file --error-exitcode=70";;
    valgrind-log|-valgrind-log) valgrind="valgrind --tool=memcheck --num-callers=30 --leak-check=yes --errors-for-leak-kinds=all --error-limit=no --smc-check=all-non-file --log-file=report.%p ";;

    # ~N excludes test N; the numbers are collected in $skip for later.
    ~*)
      if expr "$1" : '~[0-9][0-9]*$' >/dev/null; then
        skip="$skip `expr "$1" : '~\([0-9]*\)*$'`"
      else
        echo "Unknown option or test selector '$1'"; exit 1
      fi
    ;;

    # A range, either N-M or N- (meaning N up to the last numbered test).
    *-*)
      if expr "$1" : '[0-9][0-9]*-[0-9]*$' >/dev/null; then
        tf=`expr "$1" : '\([0-9]*\)'`
        tt=`expr "$1" : '.*-\([0-9]*\)'`
        if [ "$tt" = "" ] ; then tt=$maxtest; fi
        if expr \( "$tt" ">" "$maxtest" \) >/dev/null; then
          echo "Invalid test range '$1'"; exit 1
        fi
        # A reversed range such as 5-3 selects nothing. If it were the only
        # selector, every test would then be run by default, so reject it.
        if expr "$tf" ">" "$tt" >/dev/null; then
          echo "Invalid test range '$1'"; exit 1
        fi
        while expr "$tf" "<=" "$tt" >/dev/null; do
          eval do${tf}=yes
          tf=`expr $tf + 1`
        done
      else
        echo "Invalid test range '$1'"; exit 1
      fi
    ;;

    *) echo "Unknown option or test selector '$1'"; exit 1;;
  esac
  shift
done

# If it is possible to set the system stack size and -bigstack was given,
# set up a large stack.
# Try pcre2test with "-S 32"; a zero exit status is taken to mean that the
# stack size can be set.

$sim $pcre2test -S 32 /dev/null /dev/null >/dev/null 2>&1
support_setstack=$?
if [ $support_setstack -eq 0 ] && [ -n "$bigstack" ] ; then
  globalopts="$globalopts -S 32"
fi

# If the malloc option is given, then call pcre2test with -malloc.

if [ -n "$malloc" ] ; then
  globalopts="$globalopts -malloc"
fi

# The "pcre2test -C <feature>" probes below report through the exit status,
# which is saved for each feature. The code that uses these values treats
# non-zero as "available" and zero as "not available".

# All of 8-bit, 16-bit, and 32-bit character strings may be supported, but only
# one need be.

for width in 8 16 32; do
  $sim $pcre2test -C pcre2-$width >/dev/null
  eval support$width=$?
done

# \C may be disabled

$sim $pcre2test -C backslash-C >/dev/null
supportBSC=$?

# Check if compiled in EBCDIC mode, and whether we have EBCDIC I/O and NL=0x25

$sim $pcre2test -C ebcdic >/dev/null
ebcdic=$?
$sim $pcre2test -C ebcdic-io >/dev/null
ebcdic_io=$?
$sim $pcre2test -C ebcdic-nl25 >/dev/null
ebcdic_nl25=$?

# In EBCDIC mode, say which kind of test data is expected.

if [ $ebcdic -eq 1 ] && [ $ebcdic_io -eq 0 ]; then
  echo "Running tests in EBCDIC mode, and expecting ASCII test data"
elif [ $ebcdic -eq 1 ]; then
  echo "Running tests in EBCDIC mode, and expecting EBCDIC test data"
  echo "If you are on an EBCDIC machine, you will need to convert the PCRE2"
  echo "testdata/ directory from ISO8859-1, so the data match the EBCDIC"
  echo "codepage that your C compiler is using for C character literals."
  echo "For example:"
  echo " iconv -f ISO8859-1 -t IBM-1047 ..."
fi

# Initialize all bitsizes skipped

test8=skip
test16=skip
test32=skip

if [ -z "$arg8$arg16$arg32" ] ; then
  # No bitsize arguments: select every width that is available.
  for width in 8 16 32; do
    eval "avail=\$support$width"
    if [ $avail -ne 0 ] ; then
      eval "test$width=-$width"
    fi
  done
else
  # Otherwise select the requested widths, in the order 8, 16, 32, and give
  # up at the first one that was requested but is not compiled.
  for width in 8 16 32; do
    eval "wanted=\$arg$width avail=\$support$width"
    if [ "$wanted" = yes ] ; then
      if [ $avail -eq 0 ] ; then
        echo "Cannot run $width-bit library tests: $width-bit library not compiled"
        exit 1
      fi
      eval "test$width=-$width"
    fi
  done
fi

# UTF support is implied by Unicode support, and it always applies to all bit
# sizes if both are supported; we can't have UTF-8 support without UTF-16 or
# UTF-32 support.

$sim $pcre2test -C unicode >/dev/null
utf=$?

# When JIT is used with valgrind, we need to set up valgrind suppressions as
# otherwise there are a lot of false positive valgrind reports when the
# hardware supports SSE2.

jitopt=
$sim $pcre2test -C jit >/dev/null
jit=$?
if [ $jit -ne 0 ] && [ "$nojit" != "yes" ] ; then
  jitopt=-jit
  if [ -n "$valgrind" ] ; then
    vjs="--suppressions=$testdata/valgrind-jit.supp"
  fi
fi

# If no specific tests were requested, select all the numbered tests. Those
# that are not relevant will be automatically skipped.
if [ $do0 = no -a $do1 = no -a $do2 = no -a $do3 = no -a \ $do4 = no -a $do5 = no -a $do6 = no -a $do7 = no -a \ $do8 = no -a $do9 = no -a $do10 = no -a $do11 = no -a \ $do12 = no -a $do13 = no -a $do14 = no -a $do15 = no -a \ $do16 = no -a $do17 = no -a $do18 = no -a $do19 = no -a \ $do20 = no -a $do21 = no -a $do22 = no -a $do23 = no -a \ $do24 = no -a $do25 = no -a $do26 = no -a $do27 = no -a \ $do28 = no -a $do29 = no -a $doheap = no \ ]; then do0=yes do1=yes do2=yes do3=yes do4=yes do5=yes do6=yes do7=yes do8=yes do9=yes do10=yes do11=yes do12=yes do13=yes do14=yes do15=yes do16=yes do17=yes do18=yes do19=yes do20=yes do21=yes do22=yes do23=yes do24=yes do25=yes do26=yes do27=yes do28=yes do29=yes fi # Handle any explicit skips at this stage, so that an argument list may consist # only of explicit skips. for i in $skip; do eval do$i=no; done # Show which release and which test data echo "" echo PCRE2 C library tests using test data from $testdata $sim $pcre2test /dev/null echo "" # ------ Normal Tests ------ for bmode in "$test8" "$test16" "$test32"; do case "$bmode" in skip) continue;; -16) if [ "$test8$test32" != "skipskip" ] ; then echo ""; fi bits=16; echo "---- Testing 16-bit library ----"; echo "";; -32) if [ "$test8$test16" != "skipskip" ] ; then echo ""; fi bits=32; echo "---- Testing 32-bit library ----"; echo "";; -8) bits=8; echo "---- Testing 8-bit library ----"; echo "";; esac # Set up directories for test output. [ -d testoutput$bits ] || mkdir testoutput$bits [ -d testoutput$bits$jitopt ] || mkdir testoutput$bits$jitopt [ -d testoutput$bits-dfa ] || mkdir testoutput$bits-dfa # Test 0 is a special test. Its output is not checked, because it will # be different on different hardware and with different configurations. # Running this test just exercises the code. if [ $do0 = yes ] ; then echo $title0 echo '/abc/jit,memory,framesize' >testSinput echo ' abc' >>testSinput echo '' >testSoutput saverc=0 checkspecial "$bmode -C" || saverc=$? 
checkspecial '--help' || saverc=$? checkspecial "$bmode testSinput" || saverc=$? checkspecial "$bmode $testdata/testinputheap" || saverc=$? if [ $support_setstack -eq 0 ] ; then checkspecial "$bmode -S 1 -t 10 testSinput" || saverc=$? fi checkspecial "$bmode reallydoesnotexist" 1 || saverc=$? checkspecial "$bmode testSinput reallydoesnotexist/outfile" 1 || saverc=$? checkspecial "$bmode -pattern debug testSinput" || saverc=$? checkspecial "$bmode -pattern INVALID testSinput" 1 2>/dev/null || saverc=$? checkspecial "$bmode -subject notempty testSinput" || saverc=$? checkspecial "$bmode -subject INVALID testSinput" 1 2>/dev/null || saverc=$? checkspecial -LM || saverc=$? checkspecial -LP || saverc=$? checkspecial -LS || saverc=$? checkspecial "$bmode -unittest" || saverc=$? if [ $saverc -eq 0 ] ; then echo " OK" fi fi # Primary non-UTF test, compatible with JIT and all versions of Perl >= 5.8 if [ $do1 = yes ] ; then echo $title1 for opt in "" $jitopt; do $sim $valgrind ${opt:+$vjs} $pcre2test $globalopts $bmode $opt $testdata/testinput1 testoutput$bits$opt/testoutput1 checkresult $? 1 "$opt" done fi # PCRE2 tests that are not Perl-compatible: API, errors, internals. We copy # the testbtables file to the current directory for use by this test. if [ $do2 = yes ] ; then echo $title2 "(excluding UTF-$bits)" cp $testdata/testbtables . for opt in "" $jitopt; do $sim $valgrind ${opt:+$vjs} $pcre2test $globalopts $bmode $opt $testdata/testinput2 testoutput$bits$opt/testoutput2 saverc=$? if [ $saverc = 0 ] ; then $sim $valgrind ${opt:+$vjs} $pcre2test $globalopts $bmode $opt -error -80,-62,-2,-1,0,100,101,191,300 >>testoutput$bits$opt/testoutput2 checkresult $? 2 "$opt" else checkresult $saverc 2 "$opt" fi done fi # Locale-specific tests, provided that either the "fr_FR", "fr_CA", "french" # or "fr" locale is available. The first two are Unix-like standards; the # last two are for Windows. 
Unfortunately, different versions of the French # locale give different outputs for some items. This test passes if the # output matches any one of the alternative output files. if [ $do3 = yes ] ; then locale= # In some environments locales that are listed by the "locale -a" # command do not seem to work with setlocale(). Therefore, we do # a preliminary test to see if pcre2test can set one before going # on to use it. for loc in 'fr_FR' 'french' 'fr' 'fr_CA'; do locale -a | grep "^$loc\$" >/dev/null if [ $? -eq 0 ] ; then echo "/a/locale=$loc" | \ $sim $valgrind $pcre2test -q $bmode | \ grep "Failed to set locale" >/dev/null if [ $? -ne 0 ] ; then locale=$loc if [ "$locale" = "fr_FR" ] ; then infile=$testdata/testinput3 outfile=$testdata/testoutput3 outfile2=$testdata/testoutput3A outfile3=$testdata/testoutput3B outfile4=$testdata/testoutput3C else infile=test3input outfile=test3output outfile2=test3outputA outfile3=test3outputB outfile4=test3outputC sed "s/fr_FR/$loc/" $testdata/testinput3 >test3input sed "s/fr_FR/$loc/" $testdata/testoutput3 >test3output sed "s/fr_FR/$loc/" $testdata/testoutput3A >test3outputA sed "s/fr_FR/$loc/" $testdata/testoutput3B >test3outputB sed "s/fr_FR/$loc/" $testdata/testoutput3C >test3outputC fi break fi fi done if [ "$locale" != "" ] ; then echo $title3 "(using '$locale' locale)" for opt in "" $jitopt; do $sim $valgrind ${opt:+$vjs} $pcre2test $globalopts $bmode $opt $infile testoutput$bits/testoutput3 if [ $? = 0 ] ; then case "$opt" in -jit) with=" with JIT";; *) with="";; esac if $cf $outfile testoutput$bits/testoutput3 >teststdout || \ $cf $outfile2 testoutput$bits/testoutput3 >>teststdout || \ $cf $outfile3 testoutput$bits/testoutput3 >>teststdout || \ $cf $outfile4 testoutput$bits/testoutput3 >>teststdout then echo " OK$with" else cat teststdout echo "" echo "** Locale test did not run successfully$with. The output did not match" echo " $outfile, $outfile2, $outfile3 or $outfile4." 
echo " This may mean that there is a problem with the locale settings rather" echo " than a bug in PCRE2." yield=1 fi else echo "** pcre2test failed - check testoutput$bits/testoutput3" yield=1 fi done else echo "Cannot test locale-specific features - none of the 'fr_FR', 'fr_CA'," echo "'fr' or 'french' locales can be set, or the \"locale\" command is" echo "not available to check for them." echo " " fi fi # Tests for UTF and Unicode property support if [ $do4 = yes ] ; then echo ${title4A}-${bits}${title4B} if [ $utf -eq 0 ] ; then echo " Skipped because UTF-$bits support is not available" else for opt in "" $jitopt; do $sim $valgrind ${opt:+$vjs} $pcre2test $globalopts $bmode $opt $testdata/testinput4 testoutput$bits$opt/testoutput4 checkresult $? 4 "$opt" done fi fi if [ $do5 = yes ] ; then echo ${title5A}-${bits}$title5B if [ $utf -eq 0 ] ; then echo " Skipped because UTF-$bits support is not available" else for opt in "" $jitopt; do $sim $valgrind ${opt:+$vjs} $pcre2test $globalopts $bmode $opt $testdata/testinput5 testoutput$bits$opt/testoutput5 checkresult $? 5 "$opt" done fi fi # Tests for DFA matching support if [ $do6 = yes ] ; then echo $title6 $sim $valgrind $pcre2test $globalopts $bmode $testdata/testinput6 testoutput$bits/testoutput6 checkresult $? 6 "" fi if [ $do7 = yes ] ; then echo ${title7A}-${bits}$title7B if [ $utf -eq 0 ] ; then echo " Skipped because UTF-$bits support is not available" else $sim $valgrind $pcre2test $globalopts $bmode $testdata/testinput7 testoutput$bits/testoutput7 checkresult $? 7 "" fi fi # Test of internal offsets and code sizes. This test is run only when there # is UTF/UCP support. The actual tests are mostly the same as in some of the # above, but in this test we inspect some offsets and sizes. This is a # doublecheck for the maintainer, just in case something changes unexpectedly. 
# The output from this test is different in 8-bit, 16-bit, and 32-bit modes # and for different link sizes, so there are different output files for each # mode and link size. if [ $do8 = yes ] ; then echo $title8 $sim $pcre2test -$bits -C linksize >/dev/null bits_link_size=$? if [ $utf -eq 0 ] ; then echo " Skipped because UTF-$bits support is not available" else $sim $valgrind $pcre2test $globalopts $bmode $testdata/testinput8 testoutput$bits/testoutput8-$bits-$bits_link_size checkresult $? 8-$bits-$bits_link_size "" fi fi # Tests for 8-bit-specific features if [ "$do9" = yes ] ; then echo $title9 if [ "$bits" = "16" -o "$bits" = "32" ] ; then echo " Skipped when running 16/32-bit tests" else for opt in "" $jitopt; do $sim $valgrind ${opt:+$vjs} $pcre2test $globalopts $bmode $opt $testdata/testinput9 testoutput$bits$opt/testoutput9 checkresult $? 9 "$opt" done fi fi # Tests for UTF-8 and UCP 8-bit-specific features if [ "$do10" = yes ] ; then echo $title10 if [ "$bits" = "16" -o "$bits" = "32" ] ; then echo " Skipped when running 16/32-bit tests" elif [ $utf -eq 0 ] ; then echo " Skipped because UTF-$bits support is not available" else for opt in "" $jitopt; do $sim $valgrind ${opt:+$vjs} $pcre2test $globalopts $bmode $opt $testdata/testinput10 testoutput$bits$opt/testoutput10 checkresult $? 10 "$opt" done fi fi # Tests for 16-bit and 32-bit features. Output is different for the two widths. if [ $do11 = yes ] ; then echo $title11 if [ "$bits" = "8" ] ; then echo " Skipped when running 8-bit tests" else for opt in "" $jitopt; do $sim $valgrind ${opt:+$vjs} $pcre2test $globalopts $bmode $opt $testdata/testinput11 testoutput$bits$opt/testoutput11-$bits checkresult $? 11-$bits "$opt" done fi fi # Tests for 16-bit and 32-bit features with UTF-16/32 and UCP support. Output # is different for the two widths. 
if [ $do12 = yes ] ; then echo $title12 if [ "$bits" = "8" ] ; then echo " Skipped when running 8-bit tests" elif [ $utf -eq 0 ] ; then echo " Skipped because UTF-$bits support is not available" else for opt in "" $jitopt; do $sim $valgrind ${opt:+$vjs} $pcre2test $globalopts $bmode $opt $testdata/testinput12 testoutput$bits$opt/testoutput12-$bits checkresult $? 12-$bits "$opt" done fi fi # Tests for 16/32-bit-specific features in DFA non-UTF modes if [ $do13 = yes ] ; then echo $title13 if [ "$bits" = "8" ] ; then echo " Skipped when running 8-bit tests" else $sim $valgrind $pcre2test $globalopts $bmode $testdata/testinput13 testoutput$bits/testoutput13 checkresult $? 13 "" fi fi # Tests for DFA UTF and UCP features. Output is different for the different widths. if [ $do14 = yes ] ; then echo $title14 if [ $utf -eq 0 ] ; then echo " Skipped because UTF-$bits support is not available" else $sim $valgrind $pcre2test $globalopts $bmode $testdata/testinput14 testoutput$bits/testoutput14-$bits checkresult $? 14-$bits "" fi fi # Test non-JIT match and recursion limits if [ $do15 = yes ] ; then echo $title15 $sim $valgrind $pcre2test $globalopts $bmode $testdata/testinput15 testoutput$bits/testoutput15 checkresult $? 15 "" fi # Test JIT-specific features when JIT is not available if [ $do16 = yes ] ; then echo $title16 if [ $jit -ne 0 ] ; then echo " Skipped because JIT is available" else $sim $valgrind $pcre2test $globalopts $bmode $testdata/testinput16 testoutput$bits/testoutput16 checkresult $? 16 "" fi fi # Test JIT-specific features when JIT is available if [ $do17 = yes ] ; then echo $title17 if [ $jit -eq 0 -o "$nojit" = "yes" ] ; then echo " Skipped because JIT is not available or nojit was specified" else $sim $valgrind $vjs $pcre2test $globalopts $bmode $testdata/testinput17 testoutput$bits/testoutput17 checkresult $? 
17 "" fi fi # Tests for the POSIX interface without UTF/UCP (8-bit only) if [ $do18 = yes ] ; then echo $title18 if [ "$bits" = "16" -o "$bits" = "32" ] ; then echo " Skipped when running 16/32-bit tests" else $sim $valgrind $pcre2test $globalopts $bmode $testdata/testinput18 testoutput$bits/testoutput18 checkresult $? 18 "" fi fi # Tests for the POSIX interface with UTF/UCP (8-bit only) if [ $do19 = yes ] ; then echo $title19 if [ "$bits" = "16" -o "$bits" = "32" ] ; then echo " Skipped when running 16/32-bit tests" elif [ $utf -eq 0 ] ; then echo " Skipped because UTF-$bits support is not available" else $sim $valgrind $pcre2test $globalopts $bmode $testdata/testinput19 testoutput$bits/testoutput19 checkresult $? 19 "" fi fi # Serialization tests if [ $do20 = yes ] ; then echo $title20 $sim $valgrind $pcre2test $globalopts $bmode $testdata/testinput20 testoutput$bits/testoutput20 checkresult $? 20 "" fi # \C tests without UTF - DFA matching is supported if [ "$do21" = yes ] ; then echo $title21 if [ $supportBSC -eq 0 ] ; then echo " Skipped because \C is disabled" else for opt in "" $jitopt -dfa; do $sim $valgrind ${opt:+$vjs} $pcre2test $globalopts $bmode $opt $testdata/testinput21 testoutput$bits$opt/testoutput21 checkresult $? 21 "$opt" done fi fi # \C tests with UTF - DFA matching is not supported for \C in UTF mode if [ "$do22" = yes ] ; then echo $title22 if [ $supportBSC -eq 0 ] ; then echo " Skipped because \C is disabled" elif [ $utf -eq 0 ] ; then echo " Skipped because UTF-$bits support is not available" else for opt in "" $jitopt; do $sim $valgrind ${opt:+$vjs} $pcre2test $globalopts $bmode $opt $testdata/testinput22 testoutput$bits$opt/testoutput22-$bits checkresult $? 22-$bits "$opt" done fi fi # Test when \C is disabled if [ "$do23" = yes ] ; then echo $title23 if [ $supportBSC -ne 0 ] ; then echo " Skipped because \C is not disabled" else $sim $valgrind $pcre2test $globalopts $bmode $testdata/testinput23 testoutput$bits/testoutput23 checkresult $? 
23 "" fi fi # Non-UTF pattern conversion tests if [ "$do24" = yes ] ; then echo $title24 $sim $valgrind $pcre2test $globalopts $bmode $testdata/testinput24 testoutput$bits/testoutput24 checkresult $? 24 "" fi # UTF pattern conversion tests if [ "$do25" = yes ] ; then echo $title25 if [ $utf -eq 0 ] ; then echo " Skipped because UTF-$bits support is not available" else $sim $valgrind $pcre2test $globalopts $bmode $testdata/testinput25 testoutput$bits/testoutput25 checkresult $? 25 "" fi fi # Unicode property tests if [ $do26 = yes ] ; then echo $title26 if [ $utf -eq 0 ] ; then echo " Skipped because UTF-$bits support is not available" else for opt in "" $jitopt; do $sim $valgrind ${opt:+$vjs} $pcre2test $globalopts $bmode $opt $testdata/testinput26 testoutput$bits$opt/testoutput26 checkresult $? 26 "$opt" done fi fi # Auto-generated Unicode property tests if [ $do27 = yes ] ; then echo $title27 if [ $utf -eq 0 ] ; then echo " Skipped because UTF-$bits support is not available" else for opt in "" $jitopt; do $sim $valgrind ${opt:+$vjs} $pcre2test $globalopts $bmode $opt $testdata/testinput27 testoutput$bits$opt/testoutput27 checkresult $? 27 "$opt" done fi fi # EBCDIC tests if [ $do28 = yes ] ; then echo $title28 if [ $ebcdic -eq 0 ] ; then echo " Skipped when not targetting EBCDIC" else for opt in "" $jitopt "-dfa"; do $sim $valgrind ${opt:+$vjs} $pcre2test $globalopts $bmode $opt $testdata/testinput28 testoutput$bits$opt/testoutput28 checkresult $? 28 "$opt" done fi fi # EBCDIC tests (for NL=0x25) if [ $do29 = yes ] ; then echo $title29 if [ $ebcdic -eq 0 ] ; then echo " Skipped when not targetting EBCDIC" elif [ $ebcdic_nl25 -eq 0 ] ; then echo " Skipped because EBCDIC newline is not 0x25" else for opt in "" $jitopt "-dfa"; do $sim $valgrind ${opt:+$vjs} $pcre2test $globalopts $bmode $opt $testdata/testinput29 testoutput$bits$opt/testoutput29 checkresult $? 
29 "$opt" done fi fi # Manually selected heap tests - output may vary in different environments, # which is why they are not automatically run. if [ $doheap = yes ] ; then echo $titleheap $sim $valgrind $pcre2test $globalopts $bmode $testdata/testinputheap testoutput$bits/testoutputheap-$bits checkresult $? heap-$bits "" fi # End of loop for 8/16/32-bit tests done if [ $yield -eq 0 ] ; then echo "" echo "All tests passed." # Clean up local working files rm -f testbtables testSinput testSoutput test3input test3output test3outputA test3outputB testsaved1 testsaved2 teststdout teststderr rm -rf testoutput8 testoutput8-jit testoutput8-dfa \ testoutput16 testoutput16-jit testoutput16-dfa \ testoutput32 testoutput32-jit testoutput32-dfa else echo "" echo "** Tests failed. See output above for details." fi exit $yield # End ================================================ FILE: RunTest.bat ================================================ @echo off @rem @rem MS Windows batch file to run pcre2test on testfiles with the correct @rem options. This file must use CRLF linebreaks to function properly. @rem @rem ------------------------ HISTORY ---------------------------------- @rem This file was originally contributed to PCRE1 by Ralf Junker, and touched @rem up by Daniel Richard G. Tests 10-12 added by Philip H. @rem Philip H also changed test 3 to use "wintest" files. @rem @rem Updated by Tom Fortmann to support explicit test numbers on the command @rem line. Added argument validation and added error reporting.
@rem @rem Sheri Pierce added logic to skip feature dependent tests @rem tests 4 5 7 10 12 14 19 22 25 and 26 require Unicode support @rem 8 requires Unicode @rem 16 requires absence of jit support @rem 17 requires presence of jit support @rem Sheri P also added override tests for study and jit testing @rem Zoltan Herczeg added libpcre16 support @rem Zoltan Herczeg added libpcre32 support @rem ------------------------------------------------------------------- @rem @rem The file was converted for PCRE2 by PH, February 2015. @rem Updated for new test 14 (moving others up a number), August 2015. @rem Tidied and updated for new tests 21, 22, 23 by PH, October 2015. @rem PH added missing "set type" for test 22, April 2016. @rem PH added copy command for new testbtables file, November 2020 @rem PH caused it to show comparison output when comparison failed, July 2023 @rem PH updated unknown error number in test setlocal enabledelayedexpansion if [%srcdir%]==[] ( if exist testdata\ set srcdir=.) if [%srcdir%]==[] ( if exist ..\testdata\ set srcdir=..) if [%srcdir%]==[] ( if exist ..\..\testdata\ set srcdir=..\..) if NOT exist %srcdir%\testdata\ ( echo Error: distribution testdata folder not found! call :conferror exit /b 1 goto :eof ) if [%pcre2test%]==[] set pcre2test=.\pcre2test.exe echo source dir is %srcdir% echo pcre2test=%pcre2test% if NOT exist %pcre2test% ( echo Error: %pcre2test% not found! echo. 
call :conferror exit /b 1 ) %pcre2test% -C pcre2-8 >NUL set support8=%ERRORLEVEL% %pcre2test% -C pcre2-16 >NUL set support16=%ERRORLEVEL% %pcre2test% -C pcre2-32 >NUL set support32=%ERRORLEVEL% %pcre2test% -C unicode >NUL set unicode=%ERRORLEVEL% %pcre2test% -C jit >NUL set jit=%ERRORLEVEL% %pcre2test% -C backslash-C >NUL set supportBSC=%ERRORLEVEL% %pcre2test% -C ebcdic >NUL set ebcdic=%ERRORLEVEL% %pcre2test% -C ebcdic-nl25 >NUL set ebcdic_nl25=%ERRORLEVEL% if %support8% EQU 1 ( if not exist testout8 md testout8 if not exist testoutjit8 md testoutjit8 ) if %support16% EQU 1 ( if not exist testout16 md testout16 if not exist testoutjit16 md testoutjit16 ) if %support32% EQU 1 ( if not exist testout32 md testout32 if not exist testoutjit32 md testoutjit32 ) set do1=no set do2=no set do3=no set do4=no set do5=no set do6=no set do7=no set do8=no set do9=no set do10=no set do11=no set do12=no set do13=no set do14=no set do15=no set do16=no set do17=no set do18=no set do19=no set do20=no set do21=no set do22=no set do23=no set do24=no set do25=no set do26=no set do27=no set do28=no set do29=no set all=yes for %%a in (%*) do ( set valid=no for %%v in (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29) do if %%v == %%a set valid=yes if "!valid!" == "yes" ( set do%%a=yes set all=no ) else ( echo Invalid test number - %%a! echo Usage %0 [ test_number ] ... echo Where test_number is one or more optional test numbers 1 through 29, default is all tests. 
exit /b 1 ) ) set failed="no" if "%all%" == "yes" ( set do1=yes set do2=yes set do3=yes set do4=yes set do5=yes set do6=yes set do7=yes set do8=yes set do9=yes set do10=yes set do11=yes set do12=yes set do13=yes set do14=yes set do15=yes set do16=yes set do17=yes set do18=yes set do19=yes set do20=yes set do21=yes set do22=yes set do23=yes set do24=yes set do25=yes set do26=yes set do27=yes set do28=yes set do29=yes ) @echo RunTest.bat's pcre2test output is written to newly created subfolders @echo named testout{8,16,32} and testoutjit{8,16,32}. @echo. set mode= set bits=8 :nextMode if "%mode%" == "" ( if %support8% EQU 0 goto modeSkip echo. echo ---- Testing 8-bit library ---- echo. ) if "%mode%" == "-16" ( if %support16% EQU 0 goto modeSkip echo. echo ---- Testing 16-bit library ---- echo. ) if "%mode%" == "-32" ( if %support32% EQU 0 goto modeSkip echo. echo ---- Testing 32-bit library ---- echo. ) if "%do1%" == "yes" call :do1 if "%do2%" == "yes" call :do2 if "%do3%" == "yes" call :do3 if "%do4%" == "yes" call :do4 if "%do5%" == "yes" call :do5 if "%do6%" == "yes" call :do6 if "%do7%" == "yes" call :do7 if "%do8%" == "yes" call :do8 if "%do9%" == "yes" call :do9 if "%do10%" == "yes" call :do10 if "%do11%" == "yes" call :do11 if "%do12%" == "yes" call :do12 if "%do13%" == "yes" call :do13 if "%do14%" == "yes" call :do14 if "%do15%" == "yes" call :do15 if "%do16%" == "yes" call :do16 if "%do17%" == "yes" call :do17 if "%do18%" == "yes" call :do18 if "%do19%" == "yes" call :do19 if "%do20%" == "yes" call :do20 if "%do21%" == "yes" call :do21 if "%do22%" == "yes" call :do22 if "%do23%" == "yes" call :do23 if "%do24%" == "yes" call :do24 if "%do25%" == "yes" call :do25 if "%do26%" == "yes" call :do26 if "%do27%" == "yes" call :do27 if "%do28%" == "yes" call :do28 if "%do29%" == "yes" call :do29 :modeSkip if "%mode%" == "" ( set mode=-16 set bits=16 goto nextMode ) if "%mode%" == "-16" ( set mode=-32 set bits=32 goto nextMode ) @rem If mode is -32, testing is 
finished if %failed% == "yes" ( echo In above output, one or more of the various tests failed! exit /b 1 ) echo All OK goto :eof :runsub @rem Function to execute pcre2test and compare the output @rem Arguments are as follows: @rem @rem 1 = test number @rem 2 = outputdir @rem 3 = test name use double quotes @rem 4 - 9 = pcre2test options if [%1] == [] ( echo Missing test number argument! exit /b 1 ) if [%2] == [] ( echo Missing outputdir! exit /b 1 ) if [%3] == [] ( echo Missing test name argument! exit /b 1 ) if %1 == 8 ( %pcre2test% -%bits% -C linksize >NUL set bits_link_size=!ERRORLEVEL! set outnum=%1-%bits%-!bits_link_size! ) else if %1 == 11 ( set outnum=%1-%bits% ) else if %1 == 12 ( set outnum=%1-%bits% ) else if %1 == 14 ( set outnum=%1-%bits% ) else if %1 == 22 ( set outnum=%1-%bits% ) else ( set outnum=%1 ) set testinput=testinput%1 set testoutput=testoutput%outnum% if exist %srcdir%\testdata\win%testinput% ( set testinput=wintestinput%1 set testoutput=wintestoutput%outnum% ) echo Test %1: %3 %pcre2test% %mode% %4 %5 %6 %7 %8 %9 %srcdir%\testdata\%testinput% >%2%bits%\%testoutput% if errorlevel 1 ( echo. failed executing command-line: echo. %pcre2test% %mode% %4 %5 %6 %7 %8 %9 %srcdir%\testdata\%testinput% ^>%2%bits%\%testoutput% set failed="yes" goto :eof ) else if [%1]==[2] ( %pcre2test% %mode% %4 %5 %6 %7 %8 %9 -error -80,-62,-2,-1,0,100,101,191,300 >>%2%bits%\%testoutput% ) set testexpected=%srcdir%\testdata\%testoutput% if %ebcdic% EQU 1 ( @rem We currently only use the #if ... #endif support in pcre2test for EBCDIC @rem testing. Run in "preprocess-only" mode (-E) on the testoutput file to trim @rem the output lines matching the input lines which are discarded. %pcre2test% -q -E %testexpected% >%2%bits%\%testoutput%-trimmed set testexpected=%2%bits%\%testoutput%-trimmed ) fc /n %testexpected% %2%bits%\%testoutput% >NUL if errorlevel 1 ( echo. failed comparison: fc /n %testexpected% %2%bits%\%testoutput% if [%1]==[3] ( echo. 
echo ** Test 3 failure usually means french locale is not echo ** available on the system, rather than a bug or problem with PCRE2. echo. goto :eof ) fc /n %testexpected% %2%bits%\%testoutput% set failed="yes" goto :eof ) echo. Passed. goto :eof :do1 call :runsub 1 testout "Main non-UTF, non-UCP functionality (Compatible with Perl >= 5.10)" -q if %jit% EQU 1 call :runsub 1 testoutjit "Test with JIT Override" -q -jit goto :eof :do2 copy /y %srcdir%\testdata\testbtables testbtables call :runsub 2 testout "API, errors, internals, and non-Perl stuff" -q if %jit% EQU 1 call :runsub 2 testoutjit "Test with JIT Override" -q -jit goto :eof :do3 call :runsub 3 testout "Locale-specific features" -q if %jit% EQU 1 call :runsub 3 testoutjit "Test with JIT Override" -q -jit goto :eof :do4 if %unicode% EQU 0 ( echo Test 4 Skipped due to absence of Unicode support. goto :eof ) call :runsub 4 testout "UTF-%bits% and Unicode property support - (Compatible with Perl >= 5.10)" -q if %jit% EQU 1 call :runsub 4 testoutjit "Test with JIT Override" -q -jit goto :eof :do5 if %unicode% EQU 0 ( echo Test 5 Skipped due to absence of Unicode support. goto :eof ) call :runsub 5 testout "API, internals, and non-Perl stuff for UTF-%bits% and UCP" -q if %jit% EQU 1 call :runsub 5 testoutjit "Test with JIT Override" -q -jit goto :eof :do6 call :runsub 6 testout "DFA matching main non-UTF, non-UCP functionality" -q goto :eof :do7 if %unicode% EQU 0 ( echo Test 7 Skipped due to absence of Unicode support. goto :eof ) call :runsub 7 testout "DFA matching with UTF-%bits% and Unicode property support" -q goto :eof :do8 if %unicode% EQU 0 ( echo Test 8 Skipped due to absence of Unicode support. goto :eof ) call :runsub 8 testout "Internal offsets and code size tests" -q goto :eof :do9 if NOT %bits% EQU 8 ( echo Test 9 Skipped when running 16/32-bit tests. 
goto :eof ) call :runsub 9 testout "Specials for the basic 8-bit library" -q if %jit% EQU 1 call :runsub 9 testoutjit "Test with JIT Override" -q -jit goto :eof :do10 if NOT %bits% EQU 8 ( echo Test 10 Skipped when running 16/32-bit tests. goto :eof ) if %unicode% EQU 0 ( echo Test 10 Skipped due to absence of Unicode support. goto :eof ) call :runsub 10 testout "Specials for the 8-bit library with Unicode support" -q if %jit% EQU 1 call :runsub 10 testoutjit "Test with JIT Override" -q -jit goto :eof :do11 if %bits% EQU 8 ( echo Test 11 Skipped when running 8-bit tests. goto :eof ) call :runsub 11 testout "Specials for the basic 16/32-bit library" -q if %jit% EQU 1 call :runsub 11 testoutjit "Test with JIT Override" -q -jit goto :eof :do12 if %bits% EQU 8 ( echo Test 12 Skipped when running 8-bit tests. goto :eof ) if %unicode% EQU 0 ( echo Test 12 Skipped due to absence of Unicode support. goto :eof ) call :runsub 12 testout "Specials for the 16/32-bit library with Unicode support" -q if %jit% EQU 1 call :runsub 12 testoutjit "Test with JIT Override" -q -jit goto :eof :do13 if %bits% EQU 8 ( echo Test 13 Skipped when running 8-bit tests. goto :eof ) call :runsub 13 testout "DFA specials for the basic 16/32-bit library" -q goto :eof :do14 if %unicode% EQU 0 ( echo Test 14 Skipped due to absence of Unicode support. goto :eof ) call :runsub 14 testout "DFA specials for UTF and UCP support" -q goto :eof :do15 call :runsub 15 testout "Non-JIT limits and other non_JIT tests" -q goto :eof :do16 if %jit% EQU 1 ( echo Test 16 Skipped due to presence of JIT support. goto :eof ) call :runsub 16 testout "JIT-specific features when JIT is not available" -q goto :eof :do17 if %jit% EQU 0 ( echo Test 17 Skipped due to absence of JIT support. goto :eof ) call :runsub 17 testout "JIT-specific features when JIT is available" -q goto :eof :do18 if %bits% EQU 16 ( echo Test 18 Skipped when running 16-bit tests. 
goto :eof ) if %bits% EQU 32 ( echo Test 18 Skipped when running 32-bit tests. goto :eof ) call :runsub 18 testout "POSIX interface, excluding UTF-8 and UCP" -q goto :eof :do19 if %bits% EQU 16 ( echo Test 19 Skipped when running 16-bit tests. goto :eof ) if %bits% EQU 32 ( echo Test 19 Skipped when running 32-bit tests. goto :eof ) if %unicode% EQU 0 ( echo Test 19 Skipped due to absence of Unicode support. goto :eof ) call :runsub 19 testout "POSIX interface with UTF-8 and UCP" -q goto :eof :do20 call :runsub 20 testout "Serialization tests" -q goto :eof :do21 if %supportBSC% EQU 0 ( echo Test 21 Skipped due to absence of backslash-C support. goto :eof ) call :runsub 21 testout "Backslash-C tests without UTF" -q call :runsub 21 testout "Backslash-C tests without UTF (DFA)" -q -dfa if %jit% EQU 1 call :runsub 21 testoutjit "Test with JIT Override" -q -jit goto :eof :do22 if %supportBSC% EQU 0 ( echo Test 22 Skipped due to absence of backslash-C support. goto :eof ) if %unicode% EQU 0 ( echo Test 22 Skipped due to absence of Unicode support. goto :eof ) call :runsub 22 testout "Backslash-C tests with UTF" -q if %jit% EQU 1 call :runsub 22 testoutjit "Test with JIT Override" -q -jit goto :eof :do23 if %supportBSC% EQU 1 ( echo Test 23 Skipped due to presence of backslash-C support. goto :eof ) call :runsub 23 testout "Backslash-C disabled test" -q goto :eof :do24 call :runsub 24 testout "Non-UTF pattern conversion tests" -q goto :eof :do25 if %unicode% EQU 0 ( echo Test 25 Skipped due to absence of Unicode support. goto :eof ) call :runsub 25 testout "UTF pattern conversion tests" -q goto :eof :do26 if %unicode% EQU 0 ( echo Test 26 Skipped due to absence of Unicode support. goto :eof ) call :runsub 26 testout "Unicode property tests (Compatible with Perl >= 5.38)" -q if %jit% EQU 1 call :runsub 26 testoutjit "Test with JIT Override" -q -jit goto :eof :do27 if %unicode% EQU 0 ( echo Test 27 Skipped due to absence of Unicode support. 
goto :eof ) call :runsub 27 testout "Auto-generated unicode property tests" -q if %jit% EQU 1 call :runsub 27 testoutjit "Test with JIT Override" -q -jit goto :eof :do28 if %ebcdic% EQU 0 ( echo Test 28 Skipped when not targetting EBCDIC. goto :eof ) call :runsub 28 testout "EBCDIC-specific tests" -q call :runsub 28 testout "EBCDIC-specific tests (DFA)" -q -dfa if %jit% EQU 1 call :runsub 28 testoutjit "Test with JIT Override" -q -jit goto :eof :do29 if %ebcdic% EQU 0 ( echo Test 29 Skipped when not targetting EBCDIC. goto :eof ) if %ebcdic_nl25% EQU 0 ( echo Test 29 Skipped because EBCDIC newline is not 0x25. goto :eof ) call :runsub 29 testout "EBCDIC-specific tests for NL=0x25" -q call :runsub 29 testout "EBCDIC-specific tests for NL=0x25 (DFA)" -q -dfa if %jit% EQU 1 call :runsub 29 testoutjit "Test with JIT Override" -q -jit goto :eof :conferror @echo. @echo Either your build is incomplete or you have a configuration error. @echo. @echo If configured with cmake and executed via "make test" or the MSVC "RUN_TESTS" @echo project, pcre2_test.bat defines variables and automatically calls RunTest.bat. @echo For manual testing of all available features, after configuring with cmake @echo and building, you can run the built pcre2_test.bat. For best results with @echo cmake builds and tests avoid directories with full path names that include @echo spaces for source or build. @echo. @echo Otherwise, if the build dir is in a subdir of the source dir, testdata needed @echo for input and verification should be found automatically when (from the @echo location of the the built exes) you call RunTest.bat. By default RunTest.bat @echo runs all tests compatible with the linked pcre2 library but it can be given @echo a test number as an argument. @echo. @echo If the build dir is not under the source dir you can either copy your exes @echo to the source folder or copy RunTest.bat and the testdata folder to the @echo location of your built exes and then run RunTest.bat. @echo. 
goto :eof ================================================ FILE: SECURITY.md ================================================ Security policies ================= Release security ---------------- The PCRE2 project provides source-only releases, with no binaries. These source releases can be downloaded from the [GitHub Releases](https://github.com/PCRE2Project/pcre2/releases) page. Each release file is GPG-signed. * Releases up to and including 10.44 are signed by Philip Hazel (GPG key: 45F68D54BBE23FB3039B46E59766E084FB0F43D8) * Releases from 10.45 onwards will be signed by Nicholas Wilson (GPG key: A95536204A3BB489715231282A98E77EB6F24CA8, cross-signed by Philip Hazel's key for release continuity) From releases 10.45 onwards, the source code will additionally be provided via Git checkout of the (GPG-signed) release tag. Please contact the maintainers for any queries about release integrity or the project's supply-chain. Previous vulnerabilities ------------------------ * CVE-2025-58050 (August 2025). Affects 10.45 only (not earlier), and is fixed in 10.46. Reporting vulnerabilities ------------------------- The PCRE2 project prioritises security. We appreciate third-party testing and security research, and would be grateful if you could responsibly disclose your findings to us. We will make every effort to acknowledge your contributions. To report a security issue, please use the GitHub Security Advisory ["Report a Vulnerability"](https://github.com/PCRE2Project/pcre2/security/advisories/new) tab. (Alternatively, if you prefer you may send a GPG-encrypted email to one of the maintainers.) ### Timeline As a very small volunteer team, we cannot guarantee rapid response, but would aim to respond within 1 week, or perhaps 2 during holidays. ### Response procedure PCRE2 has in the past made at least one rapid release in response to security incidents. We have never produced an embargoed release, or provided preferential access to security fixes to any clients. 
We would aim to notify security managers from trusted downstream distributors, such as major Linux distributions, via the `pcre2-dev` mailing list, by publicly signalling an upcoming security release before disclosing the vulnerability publicly, where advance notification is possible. ================================================ FILE: autogen.sh ================================================ #!/bin/sh # Running aclocal here first (as happened for a while) caused the macros that # libtoolize puts in the m4 directory to be newer than the aclocal.m4 file that # aclocal creates. This meant that the next "make" caused aclocal to be run # again. Moving aclocal to after libtoolize does not seem to cause any # problems, and it fixes this issue. # GNU libtool is named differently on some systems. This code tries several # variants like glibtoolize (MacOSX) and libtoolize1x (FreeBSD) set +ex echo "Looking for a version of libtoolize (which can have different names)..." libtoolize="" for l in glibtoolize libtoolize15 libtoolize14 libtoolize ; do $l --version > /dev/null 2>&1 if [ $? = 0 ]; then libtoolize=$l echo "Found $l" break fi echo "Did not find $l" done if [ "x$libtoolize" = "x" ]; then echo "Can't find libtoolize on your system" exit 1 fi $libtoolize --version | head -n1 autoconf --version | head -n1 automake --version | head -n1 set -ex $libtoolize -c -f rm -rf autom4te.cache Makefile.in aclocal.m4 aclocal --force -I m4 autoconf -f -W all,no-obsolete autoheader -f -W all # Added no-portability to suppress automake 1.12's warning about the use # of recursive variables.
automake -a -c -f -W all,no-portability rm -rf autom4te.cache exit 0 # end autogen.sh ================================================ FILE: build.zig ================================================ const std = @import("std"); pub const CodeUnitWidth = enum { @"8", @"16", @"32", }; pub fn build(b: *std.Build) !void { const optimize = b.standardOptimizeOption(.{}); const target = b.standardTargetOptions(.{}); const rt = target.result; const linkage = b.option(std.builtin.LinkMode, "linkage", "whether to statically or dynamically link the library") orelse @as(std.builtin.LinkMode, if (rt.isGnuLibC()) .dynamic else .static); const sanitize_c = b.option(std.zig.SanitizeC, "sanitize_c", "whether to build with undefined behaviour sanitizer enabled") orelse .off; const codeUnitWidth = b.option(CodeUnitWidth, "code-unit-width", "Sets the code unit width") orelse .@"8"; const jit = b.option(bool, "support_jit", "Enable/disable JIT compiler support") orelse false; const pcre2_h_dir = b.addWriteFiles(); const pcre2_h = pcre2_h_dir.addCopyFile(b.path("src/pcre2.h.generic"), "pcre2.h"); b.addNamedLazyPath("pcre2.h", pcre2_h); const is_unix = rt.os.tag != .windows; const is_mingw = rt.isMinGW(); const is_musl = rt.isMuslLibC(); const is_glibc = rt.isGnuLibC(); const is_freebsd = rt.isFreeBSDLibC(); const cflags = &.{ "-fvisibility=hidden", }; const config_h = b.addConfigHeader( .{ .style = .{ .cmake = b.path("src/config-cmake.h.in"), }, .include_path = "config.h", }, // These options should be kept in-sync with those in config-cmake.h.in, and // should be the same set of options (no more needed). It is permitted to // specify fewer options here than in config-cmake.h.in, if an option is // disabled in all zig build configurations. 
.{ .HAVE_ASSERT_H = true, .HAVE_DIRENT_H = is_unix or is_mingw, .HAVE_SYS_STAT_H = true, .HAVE_SYS_TYPES_H = true, .HAVE_UNISTD_H = is_unix or is_mingw, .HAVE_WINDOWS_H = rt.os.tag == .windows, .HAVE_MEMFD_CREATE = is_musl or is_glibc or is_freebsd, .HAVE_SECURE_GETENV = is_musl or is_glibc or is_freebsd, .HAVE_SETRLIMIT = is_unix and !is_mingw, // all compilation is using the Zig bundled c compiler .HAVE_BUILTIN_ASSUME = null, .HAVE_BUILTIN_MUL_OVERFLOW = true, .HAVE_BUILTIN_UNREACHABLE = true, .HAVE_ATTRIBUTE_UNINITIALIZED = true, .SUPPORT_PCRE2_8 = codeUnitWidth == .@"8", .SUPPORT_PCRE2_16 = codeUnitWidth == .@"16", .SUPPORT_PCRE2_32 = codeUnitWidth == .@"32", .SUPPORT_UNICODE = true, .SUPPORT_JIT = jit, // As for CMake builds, use visibility attributes for both shared and static // builds. Internal symbols should be hidden even in static builds, because // the user could be statically linking PCRE2 into their own shared library. .PCRE2_EXPORT = "__attribute__ ((visibility (\"default\")))", .PCRE2_LINK_SIZE = 2, .PCRE2_PARENS_NEST_LIMIT = 250, .PCRE2_HEAP_LIMIT = 20000000, .PCRE2_MAX_VARLOOKBEHIND = 255, .PCRE2_MATCH_LIMIT = 10000000, .PCRE2_MATCH_LIMIT_DEPTH = "MATCH_LIMIT", .PCRE2GREP_BUFSIZE = 20480, .PCRE2GREP_MAX_BUFSIZE = 1048576, .NEWLINE_DEFAULT = 2, }, ); // pcre2-8/16/32 library const lib_mod = b.createModule(.{ .target = target, .optimize = optimize, .sanitize_c = sanitize_c, .link_libc = true, }); lib_mod.addCMacro("HAVE_CONFIG_H", ""); lib_mod.addCMacro("PCRE2_CODE_UNIT_WIDTH", @tagName(codeUnitWidth)); switch (linkage) { .static => lib_mod.addCMacro("PCRE2_STATIC", ""), .dynamic => {}, } lib_mod.addConfigHeader(config_h); lib_mod.addIncludePath(pcre2_h_dir.getDirectory()); lib_mod.addIncludePath(b.path("src")); lib_mod.addCSourceFile(.{ .file = b.addWriteFiles().addCopyFile(b.path("src/pcre2_chartables.c.dist"), "pcre2_chartables.c"), .flags = cflags, }); lib_mod.addCSourceFiles(.{ .files = &.{ "src/pcre2_auto_possess.c", "src/pcre2_chkdint.c",
"src/pcre2_compile.c", "src/pcre2_compile_cgroup.c", "src/pcre2_compile_class.c", "src/pcre2_config.c", "src/pcre2_context.c", "src/pcre2_convert.c", "src/pcre2_dfa_match.c", "src/pcre2_error.c", "src/pcre2_extuni.c", "src/pcre2_find_bracket.c", "src/pcre2_jit_compile.c", "src/pcre2_maketables.c", "src/pcre2_match.c", "src/pcre2_match_data.c", "src/pcre2_match_next.c", "src/pcre2_newline.c", "src/pcre2_ord2utf.c", "src/pcre2_pattern_info.c", "src/pcre2_script_run.c", "src/pcre2_serialize.c", "src/pcre2_string_utils.c", "src/pcre2_study.c", "src/pcre2_substitute.c", "src/pcre2_substring.c", "src/pcre2_tables.c", "src/pcre2_ucd.c", "src/pcre2_valid_utf.c", "src/pcre2_xclass.c", }, .flags = cflags, }); const lib = b.addLibrary(.{ .name = b.fmt("pcre2-{t}", .{codeUnitWidth}), .root_module = lib_mod, .linkage = linkage, }); lib.installHeader(pcre2_h, "pcre2.h"); b.installArtifact(lib); // pcre2test const pcre2test_mod = b.createModule(.{ .target = target, .optimize = optimize, .sanitize_c = sanitize_c, .link_libc = true, }); pcre2test_mod.addCMacro("HAVE_CONFIG_H", ""); // Note: On Windows, consumers linking against the static library should // probably define PCRE2_STATIC themselves (e.g. via // addCMacro("PCRE2_STATIC", "")). // As far as I know, Zig's build system does not currently have a mechanism // to propagate compile definitions to downstream consumers (like CMake's // PUBLIC target_compile_definitions). On non-Windows targets this is a no-op. 
switch (linkage) { .static => pcre2test_mod.addCMacro("PCRE2_STATIC", ""), .dynamic => pcre2test_mod.addCMacro("PCRE2POSIX_SHARED", ""), } const pcre2test = b.addExecutable(.{ .name = "pcre2test", .root_module = pcre2test_mod, }); pcre2test_mod.addConfigHeader(config_h); pcre2test_mod.addIncludePath(pcre2_h_dir.getDirectory()); pcre2test_mod.addIncludePath(b.path("src")); pcre2test_mod.addCSourceFile(.{ .file = b.path("src/pcre2test.c"), .flags = cflags, }); pcre2test_mod.linkLibrary(lib); b.installArtifact(pcre2test); // pcre2-posix library if (codeUnitWidth == CodeUnitWidth.@"8") { const posixLib_mod = b.createModule(.{ .target = target, .optimize = optimize, .sanitize_c = sanitize_c, .link_libc = true, }); posixLib_mod.addCMacro("HAVE_CONFIG_H", ""); posixLib_mod.addCMacro("PCRE2_CODE_UNIT_WIDTH", @tagName(codeUnitWidth)); switch (linkage) { .static => posixLib_mod.addCMacro("PCRE2_STATIC", ""), .dynamic => posixLib_mod.addCMacro("PCRE2POSIX_SHARED", ""), } posixLib_mod.addConfigHeader(config_h); posixLib_mod.addIncludePath(pcre2_h_dir.getDirectory()); posixLib_mod.addIncludePath(b.path("src")); posixLib_mod.addCSourceFiles(.{ .files = &.{ "src/pcre2posix.c", }, .flags = cflags, }); posixLib_mod.linkLibrary(lib); const posixLib = b.addLibrary(.{ .name = "pcre2-posix", .root_module = posixLib_mod, .linkage = linkage, }); pcre2test_mod.linkLibrary(posixLib); b.addNamedLazyPath("pcre2posix.h", b.path("src/pcre2posix.h")); posixLib.installHeader(b.path("src/pcre2posix.h"), "pcre2posix.h"); b.installArtifact(posixLib); } } ================================================ FILE: cmake/COPYING-CMAKE-SCRIPTS ================================================ Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the copyright notice, this list of conditions and the following disclaimer. 2. 
Redistributions in binary form must reproduce the copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
================================================ FILE: cmake/FindEditline.cmake ================================================ # Modified from FindReadline.cmake (PH Feb 2012) find_path(EDITLINE_INCLUDE_DIR readline.h PATH_SUFFIXES editline edit/readline) mark_as_advanced(EDITLINE_INCLUDE_DIR) find_library(EDITLINE_LIBRARY NAMES edit) mark_as_advanced(EDITLINE_LIBRARY) include(FindPackageHandleStandardArgs) find_package_handle_standard_args(Editline DEFAULT_MSG EDITLINE_LIBRARY EDITLINE_INCLUDE_DIR) if(Editline_FOUND) set(EDITLINE_LIBRARIES "${EDITLINE_LIBRARY}") set(EDITLINE_INCLUDE_DIRS "${EDITLINE_INCLUDE_DIR}") endif() ================================================ FILE: cmake/FindReadline.cmake ================================================ # from http://websvn.kde.org/trunk/KDE/kdeedu/cmake/modules/FindReadline.cmake # http://websvn.kde.org/trunk/KDE/kdeedu/cmake/modules/COPYING-CMAKE-SCRIPTS # --> BSD licensed # # GNU Readline library finder find_path(READLINE_INCLUDE_DIR readline/readline.h PATH_SUFFIXES include) mark_as_advanced(READLINE_INCLUDE_DIR) find_library(READLINE_LIBRARY NAMES readline) mark_as_advanced(READLINE_LIBRARY) if(READLINE_INCLUDE_DIR AND READLINE_LIBRARY) # Check if we need to link to ncurses as well include(CheckSymbolExists) include(CMakePushCheckState) set(first_run FALSE) if(NOT DEFINED HAVE_READLINE_FUNC) set(first_run TRUE) endif() cmake_push_check_state(RESET) set(CMAKE_REQUIRED_LIBRARIES "${READLINE_LIBRARY}") set(CMAKE_REQUIRED_INCLUDES "${READLINE_INCLUDE_DIR}") check_symbol_exists("readline" "stdio.h;readline/readline.h" HAVE_READLINE_FUNC) if(NOT HAVE_READLINE_FUNC) foreach( lib IN ITEMS tinfo curses ncurses ncursesw termcap ) find_library(NCURSES_LIBRARY_${lib} NAMES ${lib}) mark_as_advanced(NCURSES_LIBRARY_${lib}) if(NCURSES_LIBRARY_${lib}) cmake_reset_check_state() set(CMAKE_REQUIRED_LIBRARIES "${READLINE_LIBRARY}" "${NCURSES_LIBRARY_${lib}}") set(CMAKE_REQUIRED_INCLUDES "${READLINE_INCLUDE_DIR}") 
check_symbol_exists("readline" "stdio.h;readline/readline.h" HAVE_READLINE_FUNC_${lib}) if(HAVE_READLINE_FUNC_${lib}) if(first_run) message(STATUS "Looking for readline - readline needs ${lib}") endif() set(NCURSES_LIBRARY "${NCURSES_LIBRARY_${lib}}" CACHE FILEPATH "Path to the ncurses library") mark_as_advanced(NCURSES_LIBRARY) break() endif() endif() endforeach() endif() cmake_pop_check_state() endif() include(FindPackageHandleStandardArgs) find_package_handle_standard_args(Readline DEFAULT_MSG READLINE_LIBRARY READLINE_INCLUDE_DIR) if(Readline_FOUND) set(READLINE_LIBRARIES "${READLINE_LIBRARY}") if(DEFINED NCURSES_LIBRARY) list(APPEND READLINE_LIBRARIES "${NCURSES_LIBRARY}") endif() set(READLINE_INCLUDE_DIRS "${READLINE_INCLUDE_DIR}") endif() ================================================ FILE: cmake/PCRE2CheckVscript.cmake ================================================ # Similarly to Autoconf's ax_check_vscript.m4, check whether the linker supports # version scripts (GNU ld) or map files (Sun linker). # Sets the "have_var" to TRUE or FALSE depending on the detected support; and if # support is detected then sets "flag_var" to the appropriate flag to pass to # the linker (namely, --version-script or -M). # Helper function: try to compile a shared library with a given linker flag and # version script. This properly tests version script support by building a # shared library rather than an executable, avoiding issues with # executable-specific symbols (e.g. FreeBSD's crt1.o symbols, Solaris linker # symbols in values-Xc.o). 
function(_pcre2_try_vscript_shared_lib link_flag map_file result_var)
  # Builds a throw-away shared library, passing "-Wl,<link_flag>,<map_file>"
  # to the linker, and reports whether the build succeeded.
  #
  #   link_flag   - linker option that introduces the map file
  #   map_file    - path of the version script / map file to test
  #   result_var  - name of the variable that receives the outcome; it is set
  #                 in the caller's scope and also stored in the cache
  #
  # Nothing is done when <result_var> already has a value.
  if(DEFINED ${result_var})
    return()
  endif()

  message(STATUS "Performing Test ${result_var}")

  # Assume failure until the trial build proves otherwise.
  set(${result_var} FALSE PARENT_SCOPE)

  # Start from an empty scratch directory on every probe.
  set(scratch_dir "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/CMakeScratch/CheckVscript")
  file(REMOVE_RECURSE "${scratch_dir}")
  file(MAKE_DIRECTORY "${scratch_dir}")

  # Source file: one symbol meant to stay hidden, one meant to be exported.
  file(WRITE "${scratch_dir}/test_vscript.c" "
int hidethis(void) { return 0; }
int exposethis(void) { return hidethis(); }
")

  # Project file: a shared library linked with the flag under test.
  file(WRITE "${scratch_dir}/CMakeLists.txt" "
cmake_minimum_required(VERSION 3.15)
project(test_vscript C)
add_library(test_vscript SHARED test_vscript.c)
target_link_options(test_vscript PRIVATE \"-Wl,${link_flag},${map_file}\")
")

  try_compile(
    build_ok
    "${scratch_dir}/build" # BINARY_DIR
    "${scratch_dir}" # SOURCE_DIR
    test_vscript # Project name
    OUTPUT_VARIABLE build_log
  )

  # Publish the outcome both to the caller and to the cache.
  if(build_ok)
    set(${result_var} TRUE PARENT_SCOPE)
    set(${result_var} 1 CACHE INTERNAL "Test ${result_var}")
    message(STATUS "Performing Test ${result_var} - Success")
  else()
    set(${result_var} "" CACHE INTERNAL "Test ${result_var}")
    message(STATUS "Performing Test ${result_var} - Failed")
  endif()
endfunction()

function(pcre2_check_vscript have_var flag_var no_star_var)
  # Detects linker support for symbol version scripts.
  #
  #   have_var     - receives TRUE when a working flag was found, else FALSE
  #   flag_var     - receives the linker flag (--version-script or -M), else ""
  #   no_star_var  - receives the result of the "no wildcard" probe, else FALSE
  #
  # All three outputs are set in the caller's scope.
  set(${have_var} FALSE PARENT_SCOPE)
  set(${flag_var} "" PARENT_SCOPE)
  set(${no_star_var} FALSE PARENT_SCOPE)

  if(MSVC)
    return()
  endif()

  # Progress messages are only printed when the probes have not run before.
  if(DEFINED HAVE_VSCRIPT_GNU)
    set(announce FALSE)
  else()
    set(announce TRUE)
    message(STATUS "Detecting linker version script support")
  endif()

  # Three map files: a valid one, a deliberately malformed one, and a valid
  # one that hides symbols by name instead of with a wildcard.
  set(map_good "${PROJECT_BINARY_DIR}/test-map-file.sym")
  set(map_broken "${PROJECT_BINARY_DIR}/test-map-file-broken.sym")
  set(map_no_star "${PROJECT_BINARY_DIR}/test-map-file-no-star.sym")

  file(WRITE "${map_good}" "PCRE2_10.00 { global: exposethis; local: *; };")
  file(WRITE "${map_broken}" "PCRE2_10.00 { global: exposethis; local: *; }; {")
  file(WRITE "${map_no_star}" "PCRE2_10.00 { global: exposethis; local: hidethis; };")

  # GNU ld syntax first; the Sun linker syntax is only tried as a fallback.
  _pcre2_try_vscript_shared_lib("--version-script" "${map_good}" HAVE_VSCRIPT_GNU)
  if(NOT HAVE_VSCRIPT_GNU)
    _pcre2_try_vscript_shared_lib("-M" "${map_good}" HAVE_VSCRIPT_SUN)
  endif()

  set(vscript_usable FALSE)
  if(HAVE_VSCRIPT_GNU)
    set(vscript_flag --version-script)
    set(vscript_usable TRUE)
  elseif(HAVE_VSCRIPT_SUN)
    set(vscript_flag -M)
    set(vscript_usable TRUE)
  endif()

  if(vscript_usable)
    # Same idea as ax_check_vscript.m4: a linker that also accepts the
    # malformed script is not really interpreting scripts at all.
    _pcre2_try_vscript_shared_lib("${vscript_flag}" "${map_broken}" HAVE_VSCRIPT_BROKEN)
    if(HAVE_VSCRIPT_BROKEN)
      set(vscript_usable FALSE)
    endif()
  endif()

  if(announce)
    if(vscript_usable)
      message(STATUS "Detecting linker version script support - yes (${vscript_flag})")
    elseif(HAVE_VSCRIPT_BROKEN)
      message(STATUS "Detecting linker version script support - no (linker overwrites unknown scripts)")
    else()
      message(STATUS "Detecting linker version script support - none detected")
    endif()
  endif()

  if(vscript_usable)
    if(announce)
      message(STATUS "Detecting if version scripts work without wildcard")
    endif()

    # Can the library be linked when platform-specific symbols (_init, _fini,
    # etc.) are not swept up by a "local: *" wildcard?
    _pcre2_try_vscript_shared_lib("${vscript_flag}" "${map_no_star}" HAVE_VSCRIPT_NO_STAR)

    if(announce)
      message(STATUS "Detecting if version scripts work without wildcard - ${HAVE_VSCRIPT_NO_STAR}")
    endif()
  endif()

  # The temporary map files are not needed once the probes have run.
  file(REMOVE "${map_good}" "${map_broken}" "${map_no_star}")

  if(vscript_usable)
    set(${have_var} TRUE PARENT_SCOPE)
    set(${flag_var} "${vscript_flag}" PARENT_SCOPE)
    set(${no_star_var} "${HAVE_VSCRIPT_NO_STAR}" PARENT_SCOPE)
  endif()
endfunction()

================================================
FILE: cmake/PCRE2UseSystemExtensions.cmake
================================================
# This CMake module is supposed to give similar results to the
# AC_USE_SYSTEM_EXTENSIONS Autoconf macro, which turns on a load of
# system feature-check macros, including _ALL_SOURCE, _GNU_SOURCE,
# _NETBSD_SOURCE, and many more.
#
# Because PCRE2 uses so few OS features, we don't seem to actually need to
# enable many of these. Modern platforms with CMake users generally enable
# all the basic POSIX features by default.
#
# So far, we know that we require:
# - _ALL_SOURCE on IBM systems (z/OS, probably AIX) in order to call
# getrlimit() in pcre2test.
# - _GNU_SOURCE on Linux in order to call mkostemp() in some (non-default)
# configurations of the JIT.
#
# Autoconf enables this unconditionally. However, our CMake script potentially
# supports *more* platforms than Autoconf, so we use a feature check.
function(pcre2_use_system_extensions)
  # Adds the feature-test macros (_ALL_SOURCE, _GNU_SOURCE) that are needed to
  # make getrlimit() and mkostemp() visible, but only when a probe shows that
  # the symbol is unavailable without the macro and available with it.
  #
  # Takes no arguments. Definitions are added with add_compile_definitions().
  # Does nothing on WIN32.
  if(WIN32)
    return()
  endif()

  # Status messages are only printed when the probes have not run before.
  set(first_run FALSE)
  set(found_macro FALSE)
  if(NOT DEFINED HAVE_GETRLIMIT_NAKED)
    set(first_run TRUE)
    message(STATUS "Detecting platform feature test macros")
  endif()

  include(CheckSymbolExists)
  include(CheckCSourceCompiles)
  include(CMakePushCheckState)

  cmake_push_check_state(RESET)

  # getrlimit(), struct rlimit and RLIMIT_STACK are declared by
  # <sys/resource.h>; <sys/time.h> is included first for the benefit of older
  # systems. Without the header names the probe can never compile.
  set(
    _pcre2_test_src
    [=[
#include <sys/time.h>
#include <sys/resource.h>

int main(void) {
    struct rlimit rlim;
    getrlimit(RLIMIT_STACK, &rlim);
    return 0;
}
]=]
  )
  set(CMAKE_REQUIRED_QUIET TRUE)

  check_c_source_compiles("${_pcre2_test_src}" HAVE_GETRLIMIT_NAKED)

  if(NOT HAVE_GETRLIMIT_NAKED)
    # Try again with _ALL_SOURCE
    set(CMAKE_REQUIRED_DEFINITIONS "-D_ALL_SOURCE")
    check_c_source_compiles("${_pcre2_test_src}" HAVE_GETRLIMIT_ALLSOURCE)
    unset(CMAKE_REQUIRED_DEFINITIONS)

    if(HAVE_GETRLIMIT_ALLSOURCE)
      add_compile_definitions(_ALL_SOURCE)
      set(found_macro TRUE)
      if(first_run)
        message(STATUS "Detecting platform feature test macros - _ALL_SOURCE")
      endif()
    endif()
  endif()

  check_symbol_exists(mkostemp stdlib.h HAVE_MKOSTEMP_NAKED)

  if(NOT HAVE_MKOSTEMP_NAKED)
    # Try again with _GNU_SOURCE
    set(CMAKE_REQUIRED_DEFINITIONS "-D_GNU_SOURCE")
    check_symbol_exists(mkostemp stdlib.h HAVE_MKOSTEMP_GNUSOURCE)
    unset(CMAKE_REQUIRED_DEFINITIONS)

    if(HAVE_MKOSTEMP_GNUSOURCE)
      add_compile_definitions(_GNU_SOURCE)
      set(found_macro TRUE)
      if(first_run)
        message(STATUS "Detecting platform feature test macros - _GNU_SOURCE")
      endif()
    endif()
  endif()

  if(first_run AND NOT found_macro)
    message(STATUS "Detecting platform feature test macros - none")
  endif()

  cmake_pop_check_state()
endfunction()

================================================
FILE: cmake/PCRE2WarningAsError.cmake
================================================
# This file can be removed once the minimum CMake version is increased to 3.24
# or higher. Calls to pcre2_warning_as_error can be changed to the built in
# CMAKE_C_COMPILE_OPTIONS_WARNING_AS_ERROR.
function(pcre2_warning_as_error out_var)
  # Stores in <out_var> (caller's scope) the compiler option that turns
  # warnings into errors, or an empty string when none is known.
  set(${out_var} "" PARENT_SCOPE)

  if(NOT CMAKE_VERSION VERSION_LESS 3.24)
    # From CMake 3.24 onwards CMake itself knows the right option, which gives
    # the best compiler coverage.
    if(DEFINED CMAKE_C_COMPILE_OPTIONS_WARNING_AS_ERROR)
      set(${out_var} "${CMAKE_C_COMPILE_OPTIONS_WARNING_AS_ERROR}" PARENT_SCOPE)
    endif()
    return()
  endif()

  # Older CMake: no probing is attempted for MSVC.
  if(MSVC)
    return()
  endif()

  include(CheckCCompilerFlag)
  include(CMakePushCheckState)

  # Probe a couple of common spellings; the second is only tried when the
  # first one is rejected.
  cmake_push_check_state(RESET)
  check_c_compiler_flag("-Werror" HAVE_WERROR)
  if(NOT HAVE_WERROR)
    check_c_compiler_flag("-errwarn=%all" HAVE_ERRWARN_ALL)
  endif()
  cmake_pop_check_state()

  if(HAVE_WERROR)
    set(${out_var} "-Werror" PARENT_SCOPE)
  elseif(HAVE_ERRWARN_ALL)
    set(${out_var} "-errwarn=%all" PARENT_SCOPE)
  endif()
endfunction()

================================================
FILE: cmake/pcre2-config.cmake.in
================================================
# pcre2-config.cmake
# ----------------
#
# Finds the PCRE2 library, specify the starting search path in PCRE2_ROOT.
#
# Static vs. shared
# -----------------
# To force using the static library instead of the shared one, one needs
# to set the variable PCRE2_USE_STATIC_LIBS to ON before calling find_package.
# If the variable is not set, the static library will be used if only that has
# been built, otherwise the shared library will be used.
#
# The following components are supported: 8BIT, 16BIT, 32BIT and POSIX.
# They used to be required but not anymore; all available targets will
# be defined regardless of the requested components.
# Example:
# set(PCRE2_USE_STATIC_LIBS ON)
# find_package(PCRE2 CONFIG)
#
# This will define the following variables:
#
# PCRE2_FOUND - True if the system has the PCRE2 library.
# PCRE2_VERSION - The version of the PCRE2 library which was found.
#
# and the following imported targets:
#
# PCRE2::8BIT - The 8 bit PCRE2 library.
# PCRE2::16BIT - The 16 bit PCRE2 library.
# PCRE2::32BIT - The 32 bit PCRE2 library.
# PCRE2::POSIX - The POSIX PCRE2 library.

@PACKAGE_INIT@

include(CMakeFindDependencyMacro)
if("@REQUIRE_PTHREAD@") # REQUIRE_PTHREAD
  find_dependency(Threads)
endif()

include("${CMAKE_CURRENT_LIST_DIR}/pcre2-targets.cmake")

# Set version
set(PCRE2_VERSION "@PCRE2_MAJOR@.@PCRE2_MINOR@.0")

# Chooses the linkage of the library to expose in the
# unsuffixed edition of the target.
#
#   component - suffix of the public target to create (PCRE2::<component>)
#   target    - infix of the exported targets to alias
#               (pcre2::pcre2-<target>-static / pcre2::pcre2-<target>-shared)
#
# On success PCRE2_<component>_FOUND is set to TRUE and the IMPORTED_LOCATION
# of the chosen target is appended to PCRE2_LIBRARIES. Nothing happens when
# PCRE2::<component> already exists.
macro(_pcre2_add_component_target component target)
  # If the static library exists and either PCRE2_USE_STATIC_LIBS
  # is set to a true value, or the dynamic library does not exist, use the
  # static library.
  if(NOT TARGET PCRE2::${component})
    if(TARGET pcre2::pcre2-${target}-static AND (PCRE2_USE_STATIC_LIBS OR NOT TARGET pcre2::pcre2-${target}-shared))
      add_library(PCRE2::${component} ALIAS pcre2::pcre2-${target}-static)
      set(PCRE2_${component}_FOUND TRUE)
      # Otherwise use the dynamic library if it exists.
    elseif(TARGET pcre2::pcre2-${target}-shared AND NOT PCRE2_USE_STATIC_LIBS)
      add_library(PCRE2::${component} ALIAS pcre2::pcre2-${target}-shared)
      set(PCRE2_${component}_FOUND TRUE)
    endif()
    if(PCRE2_${component}_FOUND)
      get_target_property(PCRE2_${component}_LIBRARY PCRE2::${component} IMPORTED_LOCATION)
      set(PCRE2_LIBRARIES ${PCRE2_LIBRARIES} ${PCRE2_${component}_LIBRARY})
    endif()
  endif()
endmacro()
_pcre2_add_component_target(8BIT 8)
_pcre2_add_component_target(16BIT 16)
_pcre2_add_component_target(32BIT 32)
_pcre2_add_component_target(POSIX posix)

# When POSIX component has been specified make sure that also 8BIT component is specified.
# The two flags below are scratch variables; they are unset again once the
# check has been made.
set(PCRE2_8BIT_COMPONENT FALSE)
set(PCRE2_POSIX_COMPONENT FALSE)
foreach(component ${PCRE2_FIND_COMPONENTS})
  if(component STREQUAL "8BIT")
    set(PCRE2_8BIT_COMPONENT TRUE)
  elseif(component STREQUAL "POSIX")
    set(PCRE2_POSIX_COMPONENT TRUE)
  endif()
endforeach()

if(PCRE2_POSIX_COMPONENT AND NOT PCRE2_8BIT_COMPONENT)
  message(
    FATAL_ERROR
    "The component POSIX is specified while the 8BIT one is not. This is not allowed. Please, also specify the 8BIT component."
  )
endif()
unset(PCRE2_8BIT_COMPONENT)
unset(PCRE2_POSIX_COMPONENT)

# Check for required components.
check_required_components("PCRE2")

================================================
FILE: configure.ac
================================================
dnl Process this file with autoconf to produce a configure script.

dnl NOTE FOR MAINTAINERS: Do not use minor version numbers 08 or 09 because
dnl the leading zeros may cause them to be treated as invalid octal constants
dnl if a PCRE2 user writes code that uses PCRE2_MINOR as a number. There is now
dnl a check further down that throws an error if 08 or 09 are used.

dnl The PCRE2_PRERELEASE feature is for identifying release candidates. It might
dnl be defined as -RC2, for example. For real releases, it should be empty.

m4_define(pcre2_major, [10])
m4_define(pcre2_minor, [48])
m4_define(pcre2_prerelease, [-DEV])
m4_define(pcre2_date, [2025-10-21])

# Libtool shared library interface versions (current:revision:age)
m4_define(libpcre2_8_version, [15:0:15])
m4_define(libpcre2_16_version, [15:0:15])
m4_define(libpcre2_32_version, [15:0:15])
m4_define(libpcre2_posix_version, [3:7:0])

# NOTE: The CMakeLists.txt file searches for the above variables in the first
# 50 lines of this file. Please update that if the variables above are moved.

AC_PREREQ([2.69])
AC_INIT([PCRE2],pcre2_major.pcre2_minor[]pcre2_prerelease,[],[pcre2])
AC_CONFIG_SRCDIR([src/pcre2.h.in])
AM_INIT_AUTOMAKE([dist-bzip2 dist-zip foreign])
ifelse(pcre2_prerelease, [-DEV],
  [dnl For development builds, ./configure is not checked in to Git, so we are
   dnl happy to have it regenerated as needed.
   AM_MAINTAINER_MODE([enable])],
  [dnl For a release build (or RC), the ./configure script we ship in the
   dnl tarball (and check in to the Git tag) should not be regenerated
   dnl implicitly. This is important if users want to check out a release tag
   dnl using Git.
   AM_MAINTAINER_MODE])
m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
AC_CONFIG_HEADERS(src/config.h)

# This was added at the suggestion of libtoolize (03-Jan-10)
AC_CONFIG_MACRO_DIR([m4])

# The default CFLAGS in Autoconf are "-g -O2" for gcc and just "-g" for any
# other compiler. There doesn't seem to be a standard way of getting rid of the
# -g (which I don't think is needed for a production library). This fudge seems
# to achieve the necessary. First, we remember the externally set values of
# CFLAGS. Then call the AC_PROG_CC macro to find the compiler - if CFLAGS is
# not set, it will be set to Autoconf's defaults. Afterwards, if the original
# values were not set, remove the -g from the Autoconf defaults.

remember_set_CFLAGS="$CFLAGS"

m4_version_prereq(2.70, [AC_PROG_CC], [AC_PROG_CC_C99])
AM_PROG_CC_C_O
AC_USE_SYSTEM_EXTENSIONS

if test "x$remember_set_CFLAGS" = "x"
then
  if test "$CFLAGS" = "-g -O2"
  then
    CFLAGS="-O2"
  elif test "$CFLAGS" = "-g"
  then
    CFLAGS=""
  fi
fi

# This is a new thing required to stop a warning from automake 1.12
m4_ifdef([AM_PROG_AR], [AM_PROG_AR])

# Check for a 64-bit integer type
AC_TYPE_INT64_T

# Check for xlc which has some special (broken/non-standard) behaviour on z/OS.
PCRE2_ZOS_FIXES

AC_PROG_INSTALL
AC_PROG_LN_S

# As well as LT_INIT, we use LT_OUTPUT so that PCRE2_CHECK_VSCRIPT can invoke
# libtool to test version scripts correctly.
LT_INIT([win32-dll])
LT_OUTPUT
AC_SYS_LARGEFILE

# Check for GCC visibility feature

PCRE2_VISIBILITY

# Check for the availability of -Wl,--version-script (or -Wl,-M on Solaris)

PCRE2_CHECK_VSCRIPT

# Check for Clang __attribute__((uninitialized)) feature
#
# An unknown attribute normally only produces a warning, so the probe is run
# with -Werror when WORKING_WERROR is 1. WORKING_WERROR is presumably set by
# PCRE2_VISIBILITY above (TODO confirm); it is quoted and defaulted here so
# that an empty value does not make "test" fail with a syntax error.

AC_MSG_CHECKING([for __attribute__((uninitialized))])
AC_LANG_PUSH([C])
tmp_CFLAGS=$CFLAGS
if test "${WORKING_WERROR:-0}" -eq 1; then
  CFLAGS="$CFLAGS -Werror"
fi
AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,
                   [[char buf[128] __attribute__((uninitialized));(void)buf]])],
                   [pcre2_cc_cv_attribute_uninitialized=yes],
                   [pcre2_cc_cv_attribute_uninitialized=no])
AC_MSG_RESULT([$pcre2_cc_cv_attribute_uninitialized])
if test "$pcre2_cc_cv_attribute_uninitialized" = yes; then
  AC_DEFINE([HAVE_ATTRIBUTE_UNINITIALIZED], 1, [Define this if your compiler
             supports __attribute__((uninitialized))])
fi
CFLAGS=$tmp_CFLAGS
AC_LANG_POP([C])

# Check for the assume() builtin

AC_MSG_CHECKING([for __assume()])
AC_LANG_PUSH([C])
AC_LINK_IFELSE([AC_LANG_PROGRAM([[]], [[__assume(1)]])],
               [pcre2_cc_cv_builtin_assume=yes],
               [pcre2_cc_cv_builtin_assume=no])
AC_MSG_RESULT([$pcre2_cc_cv_builtin_assume])
if test "$pcre2_cc_cv_builtin_assume" = yes; then
  AC_DEFINE([HAVE_BUILTIN_ASSUME], 1,
            [Define this if your compiler provides __assume()])
fi
AC_LANG_POP([C])

# Check for the mul_overflow() builtin
#
# The probe needs a definition of size_t, which <stddef.h> provides;
# <sys/types.h> is included as well when it is known to exist.

AC_MSG_CHECKING([for __builtin_mul_overflow()])
AC_LANG_PUSH([C])
AC_LINK_IFELSE([AC_LANG_PROGRAM([[
#ifdef HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif
#include <stddef.h>

int a, b;
size_t m;
]], [[__builtin_mul_overflow(a, b, &m)]])],
               [pcre2_cc_cv_builtin_mul_overflow=yes],
               [pcre2_cc_cv_builtin_mul_overflow=no])
AC_MSG_RESULT([$pcre2_cc_cv_builtin_mul_overflow])
if test "$pcre2_cc_cv_builtin_mul_overflow" = yes; then
  AC_DEFINE([HAVE_BUILTIN_MUL_OVERFLOW], 1,
            [Define this if your compiler provides __builtin_mul_overflow()])
fi
AC_LANG_POP([C])

# Check for the unreachable() builtin

AC_MSG_CHECKING([for __builtin_unreachable()])
AC_LANG_PUSH([C])
AC_LINK_IFELSE([AC_LANG_PROGRAM([[int r;]], [[if (r) __builtin_unreachable()]])],
               [pcre2_cc_cv_builtin_unreachable=yes],
               [pcre2_cc_cv_builtin_unreachable=no])
AC_MSG_RESULT([$pcre2_cc_cv_builtin_unreachable])
if test "$pcre2_cc_cv_builtin_unreachable" = yes; then
  AC_DEFINE([HAVE_BUILTIN_UNREACHABLE], 1,
            [Define this if your compiler provides __builtin_unreachable()])
fi
AC_LANG_POP([C])

# Versioning

PCRE2_MAJOR="pcre2_major"
PCRE2_MINOR="pcre2_minor"
PCRE2_PRERELEASE="pcre2_prerelease"
PCRE2_DATE="pcre2_date"

if test "$PCRE2_MINOR" = "08" -o "$PCRE2_MINOR" = "09"
then
  echo "***"
  echo "*** Minor version number $PCRE2_MINOR must not be used. ***"
  echo "*** Use only 00 to 07 or 10 onwards, to avoid octal issues. ***"
  echo "***"
  exit 1
fi

AC_SUBST(PCRE2_MAJOR)
AC_SUBST(PCRE2_MINOR)
AC_SUBST(PCRE2_PRERELEASE)
AC_SUBST(PCRE2_DATE)

# Set a more sensible default value for $(htmldir).
if test "x$htmldir" = 'x${docdir}'
then
  htmldir='${docdir}/html'
fi

# Force an error for PCRE1 size options
AC_ARG_ENABLE(pcre8,,,enable_pcre8=no)
AC_ARG_ENABLE(pcre16,,,enable_pcre16=no)
AC_ARG_ENABLE(pcre32,,,enable_pcre32=no)

if test "$enable_pcre8$enable_pcre16$enable_pcre32" != "nonono"
then
  echo "** ERROR: Use --[[en|dis]]able-pcre2-[[8|16|32]], not --[[en|dis]]able-pcre[[8|16|32]]"
  exit 1
fi

# Handle --disable-pcre2-8 (enabled by default)
AC_ARG_ENABLE(pcre2-8,
              AS_HELP_STRING([--disable-pcre2-8],
                             [disable 8 bit character support]),
              , enable_pcre2_8=unset)
AC_SUBST(enable_pcre2_8)

# Handle --enable-pcre2-16 (disabled by default)
AC_ARG_ENABLE(pcre2-16,
              AS_HELP_STRING([--enable-pcre2-16],
                             [enable 16 bit character support]),
              , enable_pcre2_16=unset)
AC_SUBST(enable_pcre2_16)

# Handle --enable-pcre2-32 (disabled by default)
AC_ARG_ENABLE(pcre2-32,
              AS_HELP_STRING([--enable-pcre2-32],
                             [enable 32 bit character support]),
              , enable_pcre2_32=unset)
AC_SUBST(enable_pcre2_32)

# Handle --enable-debug (disabled by default)
AC_ARG_ENABLE(debug,
              AS_HELP_STRING([--enable-debug],
                             [enable debugging code]),
              , enable_debug=no)

# Handle --enable-jit (disabled
by default) AC_ARG_ENABLE(jit, AS_HELP_STRING([--enable-jit], [enable Just-In-Time compiling support]), , enable_jit=no) # This code enables JIT if the hardware supports it. if test "$enable_jit" = "auto"; then AC_LANG(C) SAVE_CPPFLAGS=$CPPFLAGS CPPFLAGS=-I$srcdir AC_COMPILE_IFELSE([AC_LANG_SOURCE([[ #define SLJIT_CONFIG_AUTO 1 #include "deps/sljit/sljit_src/sljitConfigCPU.h" #if (defined SLJIT_CONFIG_UNSUPPORTED && SLJIT_CONFIG_UNSUPPORTED) #error unsupported #endif]])], enable_jit=yes, enable_jit=no) CPPFLAGS=$SAVE_CPPFLAGS echo checking for JIT support on this hardware... $enable_jit fi # Handle --enable-jit-sealloc (disabled by default and only experimental) case $host_os in linux* | netbsd*) AC_ARG_ENABLE(jit-sealloc, AS_HELP_STRING([--enable-jit-sealloc], [enable SELinux compatible execmem allocator in JIT (experimental)]), ,enable_jit_sealloc=no) ;; *) enable_jit_sealloc=unsupported ;; esac # Handle --disable-pcre2grep-jit (enabled by default) AC_ARG_ENABLE(pcre2grep-jit, AS_HELP_STRING([--disable-pcre2grep-jit], [disable JIT support in pcre2grep]), , enable_pcre2grep_jit=yes) # Handle --disable-pcre2grep-callout (enabled by default) AC_ARG_ENABLE(pcre2grep-callout, AS_HELP_STRING([--disable-pcre2grep-callout], [disable callout script support in pcre2grep]), , enable_pcre2grep_callout=yes) # Handle --disable-pcre2grep-callout-fork (enabled by default) AC_ARG_ENABLE(pcre2grep-callout-fork, AS_HELP_STRING([--disable-pcre2grep-callout-fork], [disable callout script fork support in pcre2grep]), , enable_pcre2grep_callout_fork=yes) # Handle --enable-rebuild-chartables AC_ARG_ENABLE(rebuild-chartables, AS_HELP_STRING([--enable-rebuild-chartables], [rebuild character tables in current locale]), , enable_rebuild_chartables=no) # Handle --disable-unicode (enabled by default) AC_ARG_ENABLE(unicode, AS_HELP_STRING([--disable-unicode], [disable Unicode support]), , enable_unicode=unset) # Handle newline options ac_pcre2_newline=lf AC_ARG_ENABLE(newline-is-cr, 
AS_HELP_STRING([--enable-newline-is-cr], [use CR as newline character]), ac_pcre2_newline=cr) AC_ARG_ENABLE(newline-is-lf, AS_HELP_STRING([--enable-newline-is-lf], [use LF as newline character (default)]), ac_pcre2_newline=lf) AC_ARG_ENABLE(newline-is-crlf, AS_HELP_STRING([--enable-newline-is-crlf], [use CRLF as newline sequence]), ac_pcre2_newline=crlf) AC_ARG_ENABLE(newline-is-anycrlf, AS_HELP_STRING([--enable-newline-is-anycrlf], [use CR, LF, or CRLF as newline sequence]), ac_pcre2_newline=anycrlf) AC_ARG_ENABLE(newline-is-any, AS_HELP_STRING([--enable-newline-is-any], [use any valid Unicode newline sequence]), ac_pcre2_newline=any) AC_ARG_ENABLE(newline-is-nul, AS_HELP_STRING([--enable-newline-is-nul], [use NUL (binary zero) as newline character]), ac_pcre2_newline=nul) enable_newline="$ac_pcre2_newline" # Handle --enable-bsr-anycrlf AC_ARG_ENABLE(bsr-anycrlf, AS_HELP_STRING([--enable-bsr-anycrlf], [\R matches only CR, LF, CRLF by default]), , enable_bsr_anycrlf=no) # Handle --enable-never-backslash-C AC_ARG_ENABLE(never-backslash-C, AS_HELP_STRING([--enable-never-backslash-C], [use of \C causes an error]), , enable_never_backslash_C=no) # Handle --enable-ebcdic AC_ARG_ENABLE(ebcdic, AS_HELP_STRING([--enable-ebcdic], [assume EBCDIC coding rather than ASCII; incompatible with --enable-unicode; use only in (uncommon) EBCDIC environments]), , enable_ebcdic=no) # Handle --enable-ebcdic-nl25 AC_ARG_ENABLE(ebcdic-nl25, AS_HELP_STRING([--enable-ebcdic-nl25], [set EBCDIC code for NL to 0x25 instead of 0x15; it implies --enable-ebcdic]), , enable_ebcdic_nl25=no) # Handle --enable-ebcdic-ignoring-compiler AC_ARG_ENABLE(ebcdic-ignoring-compiler, AS_HELP_STRING([--enable-ebcdic-ignoring-compiler], [force EBCDIC 1047 using numeric literals rather than C character literals; it implies --enable-ebcdic]), , enable_ebcdic_ignoring_compiler=no) # Handle --enable-pcre2grep-libz AC_ARG_ENABLE(pcre2grep-libz, AS_HELP_STRING([--enable-pcre2grep-libz], [link pcre2grep with libz to 
handle .gz files]), , enable_pcre2grep_libz=no) # Handle --enable-pcre2grep-libbz2 AC_ARG_ENABLE(pcre2grep-libbz2, AS_HELP_STRING([--enable-pcre2grep-libbz2], [link pcre2grep with libbz2 to handle .bz2 files]), , enable_pcre2grep_libbz2=no) # Handle --with-pcre2grep-bufsize=N AC_ARG_WITH(pcre2grep-bufsize, AS_HELP_STRING([--with-pcre2grep-bufsize=N], [pcre2grep initial buffer size (default=20480, minimum=8192)]), , with_pcre2grep_bufsize=20480) # Handle --with-pcre2grep-max-bufsize=N AC_ARG_WITH(pcre2grep-max-bufsize, AS_HELP_STRING([--with-pcre2grep-max-bufsize=N], [pcre2grep maximum buffer size (default=1048576, minimum=8192)]), , with_pcre2grep_max_bufsize=1048576) # Handle --enable-pcre2test-libedit AC_ARG_ENABLE(pcre2test-libedit, AS_HELP_STRING([--enable-pcre2test-libedit], [link pcre2test with libedit]), , enable_pcre2test_libedit=no) # Handle --enable-pcre2test-libreadline AC_ARG_ENABLE(pcre2test-libreadline, AS_HELP_STRING([--enable-pcre2test-libreadline], [link pcre2test with libreadline]), , enable_pcre2test_libreadline=no) # Handle --with-link-size=N AC_ARG_WITH(link-size, AS_HELP_STRING([--with-link-size=N], [internal link size (2, 3, or 4 allowed; default=2)]), , with_link_size=2) # Handle --with-max-varlookbehind=N AC_ARG_WITH(max-varlookbehind, AS_HELP_STRING([--with-max-varlookbehind=N], [maximum length of variable lookbehind (default=255)]), , with_max_varlookbehind=255) # Handle --with-parens-nest-limit=N AC_ARG_WITH(parens-nest-limit, AS_HELP_STRING([--with-parens-nest-limit=N], [nested parentheses limit (default=250)]), , with_parens_nest_limit=250) # Handle --with-heap-limit AC_ARG_WITH(heap-limit, AS_HELP_STRING([--with-heap-limit=N], [default limit on heap memory (kibibytes, default=20000000)]), , with_heap_limit=20000000) # Handle --with-match-limit=N AC_ARG_WITH(match-limit, AS_HELP_STRING([--with-match-limit=N], [default limit on internal looping (default=10000000)]), , with_match_limit=10000000) # Handle --with-match-limit-depth=N # 
Recognize old synonym --with-match-limit-recursion # # Note: In config.h, the default is to define MATCH_LIMIT_DEPTH symbolically as # MATCH_LIMIT, which in turn is defined to be some numeric value (e.g. # 10000000). MATCH_LIMIT_DEPTH can otherwise be set to some different numeric # value (or even the same numeric value as MATCH_LIMIT, though no longer # defined in terms of the latter). # AC_ARG_WITH(match-limit-depth, AS_HELP_STRING([--with-match-limit-depth=N], [default limit on match tree depth (default=MATCH_LIMIT)]), , with_match_limit_depth=MATCH_LIMIT) AC_ARG_WITH(match-limit-recursion,, , with_match_limit_recursion=UNSET) # Handle --enable-valgrind AC_ARG_ENABLE(valgrind, AS_HELP_STRING([--enable-valgrind], [enable valgrind support]), , enable_valgrind=no) # Enable code coverage reports using gcov AC_ARG_ENABLE(coverage, AS_HELP_STRING([--enable-coverage], [enable code coverage reports using gcov]), , enable_coverage=no) # Handle --enable-fuzz-support AC_ARG_ENABLE(fuzz_support, AS_HELP_STRING([--enable-fuzz-support], [enable fuzzer support]), , enable_fuzz_support=no) # Handle --enable-diff-fuzz-support AC_ARG_ENABLE(diff_fuzz_support, AS_HELP_STRING([--enable-diff-fuzz-support], [enable differential fuzzer support]), , enable_diff_fuzz_support=no) # Handle --disable-stack-for-recursion # This option became obsolete at release 10.30. 
AC_ARG_ENABLE(stack-for-recursion,, , enable_stack_for_recursion=yes) # Original code # AC_ARG_ENABLE(stack-for-recursion, # AS_HELP_STRING([--disable-stack-for-recursion], # [don't use stack recursion when matching]), # , enable_stack_for_recursion=yes) # Handle --disable-percent_zt (set as "auto" by default) AC_ARG_ENABLE(percent-zt, AS_HELP_STRING([--disable-percent-zt], [disable the use of z and t formatting modifiers]), , enable_percent_zt=auto) # Handle --enable-Werror/errwarn AC_ARG_ENABLE(Werror, AS_HELP_STRING([--enable-Werror], [Add -Werror to CFLAGS (GCC/Clang style); if -Werror is passed to ./configure via CFLAGS it interferes with feature detection]), , enable_Werror=no) AC_ARG_ENABLE(errwarn, AS_HELP_STRING([--enable-errwarn], [Add -errwarn=%all to CFLAGS (Sun cc style)]), , enable_errwarn=no) # Set the default value for pcre2-8 if test "x$enable_pcre2_8" = "xunset" then enable_pcre2_8=yes fi # Set the default value for pcre2-16 if test "x$enable_pcre2_16" = "xunset" then enable_pcre2_16=no fi # Set the default value for pcre2-32 if test "x$enable_pcre2_32" = "xunset" then enable_pcre2_32=no fi # Make sure at least one library is selected if test "x$enable_pcre2_8$enable_pcre2_16$enable_pcre2_32" = "xnonono" then AC_MSG_ERROR([At least one of the 8, 16 or 32 bit libraries must be enabled]) fi # Unicode is enabled by default. if test "x$enable_unicode" = "xunset" then enable_unicode=yes fi # Convert the newline identifier into the appropriate integer value. These must # agree with the PCRE2_NEWLINE_xxx values in pcre2.h. 
case "$enable_newline" in cr) ac_pcre2_newline_value=1 ;; lf) ac_pcre2_newline_value=2 ;; crlf) ac_pcre2_newline_value=3 ;; any) ac_pcre2_newline_value=4 ;; anycrlf) ac_pcre2_newline_value=5 ;; nul) ac_pcre2_newline_value=6 ;; *) AC_MSG_ERROR([invalid argument "$enable_newline" to --enable-newline option]) ;; esac # --enable-ebcdic-nl25 and --enable-ebcdic-ignoring-compiler imply --enable-ebcdic if test "x$enable_ebcdic_nl25" = "xyes" -o "x$enable_ebcdic_ignoring_compiler" = "xyes"; then enable_ebcdic=yes fi # Make sure that if enable_ebcdic is set (without # enable_ebcdic_ignoring_compiler), rebuild_chartables is also enabled. # Also check that UTF support is not requested, because PCRE2 cannot handle # EBCDIC and UTF in the same build. To do so it would need to use different # character constants depending on the mode. # Also, EBCDIC cannot be used with 16-bit and 32-bit libraries. if test "x$enable_ebcdic" = "xyes"; then if test "x$enable_ebcdic_ignoring_compiler" != "xyes"; then enable_rebuild_chartables=yes fi if test "x$enable_unicode" = "xyes"; then AC_MSG_ERROR([support for EBCDIC and Unicode cannot be enabled at the same time]) fi if test "x$enable_pcre2_16" = "xyes" -o "x$enable_pcre2_32" = "xyes"; then AC_MSG_ERROR([EBCDIC support is available only for the 8-bit library]) fi fi # Check argument to --with-link-size case "$with_link_size" in 2|3|4) ;; *) AC_MSG_ERROR([invalid argument "$with_link_size" to --with-link-size option]) ;; esac AH_TOP([ /* PCRE2 is written in Standard C, but there are a few non-standard things it can cope with, allowing it to run on SunOS4 and other "close to standard" systems. In environments that support the GNU autotools, config.h.in is converted into config.h by the "configure" script. In environments that use CMake, config-cmake.in is converted into config.h. 
If you are going to build PCRE2 "by hand" without using "configure" or CMake, you should copy the distributed config.h.generic to config.h, and edit the macro definitions to be the way you need them. You must then add -DHAVE_CONFIG_H to all of your compile commands, so that config.h is included at the start of every source. Alternatively, you can avoid editing by using -D on the compiler command line to set the macro values. In this case, you do not have to set -DHAVE_CONFIG_H, but if you do, default values will be taken from config.h for non-boolean macros that are not defined on the command line. Boolean macros such as HAVE_STDLIB_H and SUPPORT_PCRE2_8 should either be defined (conventionally to 1) for TRUE, and not defined at all for FALSE. All such macros are listed as a commented #undef in config.h.generic. Macros such as MATCH_LIMIT, whose actual value is relevant, have defaults defined, but are surrounded by #ifndef/#endif lines so that the value can be overridden by -D. */]) # Checks for header files. 
AC_CHECK_HEADERS(assert.h limits.h sys/types.h sys/stat.h dirent.h) AC_CHECK_HEADERS([windows.h], [HAVE_WINDOWS_H=1]) AC_CHECK_HEADERS([sys/wait.h], [HAVE_SYS_WAIT_H=1]) # Conditional compilation AM_CONDITIONAL(WITH_PCRE2_8, test "x$enable_pcre2_8" = "xyes") AM_CONDITIONAL(WITH_PCRE2_16, test "x$enable_pcre2_16" = "xyes") AM_CONDITIONAL(WITH_PCRE2_32, test "x$enable_pcre2_32" = "xyes") AM_CONDITIONAL(WITH_REBUILD_CHARTABLES, test "x$enable_rebuild_chartables" = "xyes") AM_CONDITIONAL(WITH_JIT, test "x$enable_jit" = "xyes") AM_CONDITIONAL(WITH_UNICODE, test "x$enable_unicode" = "xyes") AM_CONDITIONAL(WITH_EBCDIC, test "x$enable_ebcdic" = "xyes") AM_CONDITIONAL(WITH_EBCDIC_NL25, test "x$enable_ebcdic_nl25" = "xyes") AM_CONDITIONAL(WITH_VALGRIND, test "x$enable_valgrind" = "xyes") AM_CONDITIONAL(WITH_FUZZ_SUPPORT, test "x$enable_fuzz_support" = "xyes") AM_CONDITIONAL(WITH_DIFF_FUZZ_SUPPORT, test "x$enable_diff_fuzz_support" = "xyes") if test "$enable_fuzz_support" = "yes" -a "$enable_pcre2_8" = "no"; then echo "** ERROR: Fuzzer support requires the 8-bit library" exit 1 fi if test "$enable_diff_fuzz_support" = "yes"; then if test "$enable_fuzz_support" = "no"; then echo "** ERROR: Differential fuzzing support requires fuzzing support" exit 1 fi if test "$enable_jit" = "no"; then echo "** ERROR: Differential fuzzing support requires Just-in-Time compilation support" exit 1 fi AC_DEFINE([SUPPORT_DIFF_FUZZ], [], [ Define to any value to enable differential fuzzing support.]) fi # Checks for typedefs, structures, and compiler characteristics. AC_C_CONST AC_TYPE_SIZE_T # Checks for library functions. 
AC_CHECK_FUNCS(memfd_create mkostemp secure_getenv setrlimit)

# realpath() is probed with a real call rather than AC_CHECK_FUNCS. The test
# program needs <stdlib.h> for the realpath() declaration and <limits.h> for
# PATH_MAX; without the header names the probe cannot compile.
AC_MSG_CHECKING([for realpath])
AC_LINK_IFELSE([AC_LANG_PROGRAM([[
#include <stdlib.h>
#include <limits.h>
]],[[
char buffer[PATH_MAX]; realpath(".", buffer);
]])],
   [AC_MSG_RESULT([yes])
    AC_DEFINE([HAVE_REALPATH], 1,
      [Define to 1 if you have the `realpath' function.])
   ],
   AC_MSG_RESULT([no]))

# Check for the availability of libz (aka zlib)

AC_CHECK_HEADERS([zlib.h], [HAVE_ZLIB_H=1])
AC_CHECK_LIB([z], [gzopen], [HAVE_LIBZ=1])

# Check for the availability of libbz2. Originally we just used AC_CHECK_LIB,
# as for libz. However, this had the following problem, diagnosed and fixed by
# a user:
#
# - libbz2 uses the Pascal calling convention (WINAPI) for the functions
# under Win32.
# - The standard autoconf AC_CHECK_LIB fails to include "bzlib.h",
# therefore missing the function definition.
# - The compiler thus generates a "C" signature for the test function.
# - The linker fails to find the "C" function.
# - PCRE2 fails to configure if asked to do so against libbz2.
#
# Solution:
#
# - Replace the AC_CHECK_LIB test with a custom test.
AC_CHECK_HEADERS([bzlib.h], [HAVE_BZLIB_H=1])
# Original test
# AC_CHECK_LIB([bz2], [BZ2_bzopen], [HAVE_LIBBZ2=1])
#
# Custom test follows

# Link a small program against -lbz2 with bzlib.h included, so that the
# compiler sees the real declaration of BZ2_bzopen. LIBS is restored afterwards
# whatever the outcome. The success action must not use "break": this test is
# not inside a shell loop.

AC_MSG_CHECKING([for libbz2])
OLD_LIBS="$LIBS"
LIBS="$LIBS -lbz2"
AC_LINK_IFELSE([AC_LANG_PROGRAM([[
#ifdef HAVE_BZLIB_H
#include <bzlib.h>
#endif]],
[[return (int)BZ2_bzopen("conftest", "rb");]])],
[AC_MSG_RESULT([yes])
 HAVE_LIBBZ2=1],
[AC_MSG_RESULT([no])])
LIBS="$OLD_LIBS"

# Check for the availability of libreadline. If the plain -lreadline probe
# fails, it is retried with one extra library at a time (-ltinfo, -lcurses,
# -lncurses, -lncursesw, -ltermcap). The cached result is unset before each
# retry so that AC_CHECK_LIB runs the probe again.

if test "$enable_pcre2test_libreadline" = "yes"; then
 AC_CHECK_HEADERS([readline/readline.h], [HAVE_READLINE_H=1])
 AC_CHECK_HEADERS([readline/history.h], [HAVE_HISTORY_H=1])
 AC_CHECK_LIB([readline], [readline], [LIBREADLINE="-lreadline"],
   [unset ac_cv_lib_readline_readline;
    AC_CHECK_LIB([readline], [readline], [LIBREADLINE="-ltinfo"],
     [unset ac_cv_lib_readline_readline;
      AC_CHECK_LIB([readline], [readline], [LIBREADLINE="-lcurses"],
       [unset ac_cv_lib_readline_readline;
        AC_CHECK_LIB([readline], [readline], [LIBREADLINE="-lncurses"],
         [unset ac_cv_lib_readline_readline;
          AC_CHECK_LIB([readline], [readline], [LIBREADLINE="-lncursesw"],
           [unset ac_cv_lib_readline_readline;
            AC_CHECK_LIB([readline], [readline], [LIBREADLINE="-ltermcap"],
             [LIBREADLINE=""],
             [-ltermcap])],
           [-lncursesw])],
         [-lncurses])],
       [-lcurses])],
     [-ltinfo])])
 AC_SUBST(LIBREADLINE)
 # When an extra library was needed, link with -lreadline followed by it.
 if test -n "$LIBREADLINE"; then
   if test "$LIBREADLINE" != "-lreadline"; then
     echo "-lreadline needs $LIBREADLINE"
     LIBREADLINE="-lreadline $LIBREADLINE"
   fi
 fi
fi

# Check for the availability of libedit. Different distributions put its
# headers in different places. Try to cover the most common ones.

if test "$enable_pcre2test_libedit" = "yes"; then
  AC_CHECK_HEADERS([editline/readline.h edit/readline/readline.h readline.h], [
    HAVE_LIBEDIT_HEADER=1
    break
  ])
  AC_CHECK_LIB([edit], [readline], [LIBEDIT="-ledit"])
fi

# Set up shared/static library flags

PCRE2_STATIC_CFLAG=""
if test "x$enable_shared" = "xno" ; then
  AC_DEFINE([PCRE2_STATIC], [1], [
    Define to any value if linking statically. Ideally, if both static and
    shared libraries are being built, then PCRE2_STATIC would be defined only
    for the static build. Indeed, this is a requirement on Windows. With
    Autoconf and libtool however, it is idiomatic to compile the sources once
    to create both the static and shared library, so in this case,
    PCRE2_STATIC should only be defined if no shared library is being built.])
  PCRE2_STATIC_CFLAG="-DPCRE2_STATIC"
fi
AC_SUBST(PCRE2_STATIC_CFLAG)

PCRE2POSIX_CFLAG=""
if test "x$enable_shared" = "xyes" ; then
  AC_DEFINE([PCRE2POSIX_SHARED], [1], [
    Define to any value if linking libpcre2-posix dynamically. Ideally, if
    both static and shared libraries are being built, then PCRE2POSIX_SHARED
    would be defined only for the shared build. Indeed, this is a requirement
    on Windows. However, when building with Autoconf and libtool, we compile
    the sources once only to create both the static and shared library, so in
    this case, PCRE2POSIX_SHARED should only be defined if the shared library
    is being built, regardless of whether or not the static library is also
    being built.])
  PCRE2POSIX_CFLAG="-DPCRE2POSIX_SHARED"
fi
AC_SUBST(PCRE2POSIX_CFLAG)

# Here is where PCRE2-specific defines are handled

if test "$enable_pcre2_8" = "yes"; then
  AC_DEFINE([SUPPORT_PCRE2_8], [], [
    Define to any value to enable the 8 bit PCRE2 library.])
fi

if test "$enable_pcre2_16" = "yes"; then
  AC_DEFINE([SUPPORT_PCRE2_16], [], [
    Define to any value to enable the 16 bit PCRE2 library.])
fi

if test "$enable_pcre2_32" = "yes"; then
  AC_DEFINE([SUPPORT_PCRE2_32], [], [
    Define to any value to enable the 32 bit PCRE2 library.])
fi

if test "$enable_debug" = "yes"; then
  AC_DEFINE([PCRE2_DEBUG], [], [
    Define to any value to include debugging code.])
fi

if test "$enable_percent_zt" = "no"; then
  AC_DEFINE([DISABLE_PERCENT_ZT], [], [
    Define to any value to disable the use of the z and t modifiers in
    formatting settings such as %zu or %td (this is rarely needed).])
else
  enable_percent_zt=auto
fi

# Unless running under
Windows, JIT support requires pthreads. if test "$enable_jit" = "yes"; then if test "$HAVE_WINDOWS_H" != "1"; then AX_PTHREAD([], [AC_MSG_ERROR([JIT support requires pthreads])]) CC="$PTHREAD_CC" CFLAGS="$PTHREAD_CFLAGS $CFLAGS" LIBS="$PTHREAD_LIBS $LIBS" fi AC_DEFINE([SUPPORT_JIT], [], [ Define to any value to enable support for Just-In-Time compiling.]) else enable_pcre2grep_jit="no" fi if test "$enable_jit_sealloc" = "yes"; then AC_DEFINE([SLJIT_PROT_EXECUTABLE_ALLOCATOR], [1], [ Define to any non-zero number to enable support for SELinux compatible executable memory allocator in JIT. Note that this will have no effect unless SUPPORT_JIT is also defined.]) fi if test "$enable_pcre2grep_jit" = "yes"; then AC_DEFINE([SUPPORT_PCRE2GREP_JIT], [], [ Define to any value to enable JIT support in pcre2grep. Note that this will have no effect unless SUPPORT_JIT is also defined.]) fi if test "$enable_pcre2grep_callout" = "yes"; then if test "$enable_pcre2grep_callout_fork" = "yes"; then if test "$HAVE_WINDOWS_H" != "1"; then if test "$HAVE_SYS_WAIT_H" != "1"; then AC_MSG_ERROR([Callout script support needs sys/wait.h.]) fi fi AC_DEFINE([SUPPORT_PCRE2GREP_CALLOUT_FORK], [], [ Define to any value to enable fork support in pcre2grep callout scripts. This will have no effect unless SUPPORT_PCRE2GREP_CALLOUT is also defined.]) fi AC_DEFINE([SUPPORT_PCRE2GREP_CALLOUT], [], [ Define to any value to enable callout script support in pcre2grep.]) else enable_pcre2grep_callout_fork="no" fi if test "$enable_unicode" = "yes"; then AC_DEFINE([SUPPORT_UNICODE], [], [ Define to any value to enable support for Unicode and UTF encoding. This will work even in an EBCDIC environment, but it is incompatible with the EBCDIC macro. 
That is, PCRE2 can support *either* EBCDIC code *or* ASCII/Unicode, but not both at once.]) fi if test "$enable_pcre2grep_libz" = "yes"; then AC_DEFINE([SUPPORT_LIBZ], [], [ Define to any value to allow pcre2grep to be linked with libz, so that it is able to handle .gz files.]) fi if test "$enable_pcre2grep_libbz2" = "yes"; then AC_DEFINE([SUPPORT_LIBBZ2], [], [ Define to any value to allow pcre2grep to be linked with libbz2, so that it is able to handle .bz2 files.]) fi if test $with_pcre2grep_bufsize -lt 8192 ; then AC_MSG_WARN([$with_pcre2grep_bufsize is too small for --with-pcre2grep-bufsize; using 8192]) with_pcre2grep_bufsize="8192" else if test $? -gt 1 ; then AC_MSG_ERROR([Bad value for --with-pcre2grep-bufsize]) fi fi if test $with_pcre2grep_max_bufsize -lt $with_pcre2grep_bufsize ; then with_pcre2grep_max_bufsize="$with_pcre2grep_bufsize" else if test $? -gt 1 ; then AC_MSG_ERROR([Bad value for --with-pcre2grep-max-bufsize]) fi fi AC_DEFINE_UNQUOTED([PCRE2GREP_BUFSIZE], [$with_pcre2grep_bufsize], [ The value of PCRE2GREP_BUFSIZE is the starting size of the buffer used by pcre2grep to hold parts of the file it is searching. The buffer will be expanded up to PCRE2GREP_MAX_BUFSIZE if necessary, for files containing very long lines. The actual amount of memory used by pcre2grep is three times this number, because it allows for the buffering of "before" and "after" lines.]) AC_DEFINE_UNQUOTED([PCRE2GREP_MAX_BUFSIZE], [$with_pcre2grep_max_bufsize], [ The value of PCRE2GREP_MAX_BUFSIZE specifies the maximum size of the buffer used by pcre2grep to hold parts of the file it is searching. 
The actual amount of memory used by pcre2grep is three times this number, because it allows for the buffering of "before" and "after" lines.]) if test "$enable_pcre2test_libedit" = "yes"; then AC_DEFINE([SUPPORT_LIBEDIT], [], [ Define to any value to allow pcre2test to be linked with libedit.]) LIBREADLINE="$LIBEDIT" elif test "$enable_pcre2test_libreadline" = "yes"; then AC_DEFINE([SUPPORT_LIBREADLINE], [], [ Define to any value to allow pcre2test to be linked with libreadline.]) fi AC_DEFINE_UNQUOTED([NEWLINE_DEFAULT], [$ac_pcre2_newline_value], [ The value of NEWLINE_DEFAULT determines the default newline character sequence. PCRE2 client programs can override this by selecting other values at run time. The valid values are 1 (CR), 2 (LF), 3 (CRLF), 4 (ANY), 5 (ANYCRLF), and 6 (NUL).]) if test "$enable_bsr_anycrlf" = "yes"; then AC_DEFINE([BSR_ANYCRLF], [], [ By default, the \R escape sequence matches any Unicode line ending character or sequence of characters. If BSR_ANYCRLF is defined (to any value), this is changed so that backslash-R matches only CR, LF, or CRLF. The build-time default can be overridden by the user of PCRE2 at runtime.]) fi if test "$enable_never_backslash_C" = "yes"; then AC_DEFINE([NEVER_BACKSLASH_C], [], [ Defining NEVER_BACKSLASH_C locks out the use of \C in all patterns.]) fi AC_DEFINE_UNQUOTED([LINK_SIZE], [$with_link_size], [ The value of LINK_SIZE determines the number of bytes used to store links as offsets within the compiled regex. The default is 2, which allows for compiled patterns up to 65535 code units long. This covers the vast majority of cases. However, PCRE2 can also be compiled to use 3 or 4 bytes instead. 
This allows for longer patterns in extreme cases.]) AC_DEFINE_UNQUOTED([MAX_VARLOOKBEHIND], [$with_max_varlookbehind], [ The value of MAX_VARLOOKBEHIND specifies the default maximum length, in characters, for a variable-length lookbehind assertion.]) AC_DEFINE_UNQUOTED([PARENS_NEST_LIMIT], [$with_parens_nest_limit], [ The value of PARENS_NEST_LIMIT specifies the maximum depth of nested parentheses (of any kind) in a pattern. This limits the amount of system stack that is used while compiling a pattern.]) AC_DEFINE_UNQUOTED([MATCH_LIMIT], [$with_match_limit], [ The value of MATCH_LIMIT determines the default number of times the pcre2_match() function can record a backtrack position during a single matching attempt. The value is also used to limit a loop counter in pcre2_dfa_match(). There is a runtime interface for setting a different limit. The limit exists in order to catch runaway regular expressions that take forever to determine that they do not match. The default is set very large so that it does not accidentally catch legitimate cases.]) # --with-match-limit-recursion is an obsolete synonym for --with-match-limit-depth if test "$with_match_limit_recursion" != "UNSET"; then cat < Delete Cache"; or the folder "CMakeCache" can be deleted. 1. Install the latest CMake version available from http://www.cmake.org/, and ensure that cmake\bin is on your path. 2. Unzip (retaining folder structure) the PCRE2 source tree into a source directory such as C:\pcre2. You should ensure your local date and time is not earlier than the file dates in your source dir if the release is very new. 3. Create a new, empty build directory, preferably a subdirectory of the source dir. For example, C:\pcre2\pcre2-xx\build. 4. Run CMake. - Using the CLI, simply run `cmake ..` inside the `build/` directory. You can use the `ccmake` ncurses GUI to select and configure PCRE2 features. 
- Using the CMake GUI: a) Run cmake-gui from the Shell environment of your build tool, for example, Msys for Msys/MinGW or Visual Studio Command Prompt for VC/VC++. b) Enter C:\pcre2\pcre2-xx and C:\pcre2\pcre2-xx\build for the source and build directories, respectively. c) Press the "Configure" button. d) Select the particular IDE / build tool that you are using (Visual Studio, MSYS makefiles, MinGW makefiles, etc.) e) The GUI will then list several configuration options. This is where you can disable Unicode support or select other PCRE2 optional features. f) Press "Configure" again. The adjacent "Generate" button should now be active. g) Press "Generate". 5. The build directory should now contain a usable build system, be it a solution file for Visual Studio, makefiles for MinGW, etc. Exit from cmake-gui and use the generated build system with your compiler or IDE. E.g., for MinGW you can run "make", or for Visual Studio, open the PCRE2 solution, select the desired configuration (Debug, or Release, etc.) and build the ALL_BUILD project. Regardless of build system used, `cmake --build .` will build it. 6. If during configuration with cmake-gui you've elected to build the test programs, you can execute them by building the test project. E.g., for MinGW: "make test"; for Visual Studio build the RUN_TESTS project. The most recent build configuration is targeted by the tests. A summary of test results is presented. Complete test output is subsequently available for review in Testing\Temporary under your build dir. Regardless of build system used, `ctest` will run the tests. Building PCRE2 on Windows with Visual Studio -------------------------------------------- The code currently cannot be compiled without an inttypes.h header, which is available only with Visual Studio 2013 or newer. 
However, this portable and permissively-licensed implementation of the stdint.h header could be used as an alternative: http://www.azillionmonkeys.com/qed/pstdint.h Just rename it and drop it into the top level of the build tree. Testing with RunTest.bat ------------------------ If configured with CMake, building the test project ("make test" or building ALL_TESTS in Visual Studio) creates (and runs) pcre2_test.bat (and depending on your configuration options, possibly other test programs) in the build directory. The pcre2_test.bat script runs RunTest.bat with correct source and exe paths. For manual testing with RunTest.bat, provided the build dir is a subdirectory of the source directory: Open command shell window. Chdir to the location of your pcre2test.exe and pcre2grep.exe programs. Call RunTest.bat with "..\RunTest.Bat" or "..\..\RunTest.bat" as appropriate. To run only a particular test with RunTest.Bat provide a test number argument. Otherwise: 1. Copy RunTest.bat into the directory where pcre2test.exe and pcre2grep.exe have been created. 2. Edit RunTest.bat to identify the full or relative location of the pcre2 source (in which the testdata folder resides), e.g.: set srcdir=C:\pcre2\pcre2-10.00 3. In a Windows command environment, chdir to the location of your bat and exe programs. 4. Run RunTest.bat. Test outputs will automatically be compared to expected results, and discrepancies will be identified in the console output. To independently test the just-in-time compiler, run pcre2_jit_test.exe. Building PCRE2 on z/OS and z/VM ------------------------------- z/OS and z/VM are operating systems for mainframe computers, produced by IBM. The character code used is EBCDIC, not ASCII or Unicode. In z/OS, UNIX APIs and applications can be supported through UNIX System Services. The PCRE2 codebase compiles and runs with native EBCDIC support on modern z/OS systems, using the pre-installed tools (/bin/sh, ./configure, and the XLC or IBM-Clang compilers). 
PCRE2 supports z/OS using both Autoconf (./configure) and CMake (which IBM distributes via "zopen install cmake"). Note that as of the time of writing, IBM's port of CMake to z/OS has only partial support for EBCDIC. It is recommended to build PCRE2 using the ./configure script, if you require an EBCDIC build. Any EBCDIC codepage should work (PCRE2 does not assume or require IBM-1047), or PCRE2 can be compiled for ASCII/Latin-1/Unicode. After unpacking the PCRE2 tarball, you must subsequently tag the files as ASCII in order for the z/OS shell and compiler to interpret them correctly: chtag -R -tc ISO8859-1 Some unusual features on the IBM platform are: - The _ALL_SOURCE macro must be provided. Unlike on Linux or macOS, even quite standard POSIX APIs are not made visible by default. PCRE2's Autoconf and CMake system both provide this for you. - The `cc`, `c89`, and even `c99` compilers provided by IBM do not default to the same argument ordering as other Unix platforms. - The XLC compiler requires `-qhaltonmsg=CCN3296`, otherwise it will treat any preprocessor #include errors as a warning rather than an error. Needless to say this default wrecks Autoconf and CMake's feature-detection tests. PCRE2's build system is aware of this. - The test suite (in the testdata/ directory) is entirely in ASCII/UTF-8. When running the tests, you must ensure that the EBCDIC-native build of pcre2test receives an EBCDIC version of these files. The easiest way to achieve this is via filesystem tagging (chtag). Alternatively, you could manually re-encode the testdata files as EBCDIC, and tag them as EBCDIC. (Latin-1 and EBCDIC are one-to-one convertible encodings, a simple byte-by-byte permutation of the 256 values.) In native z/OS (without UNIX System Services) and in z/VM, a user has provided a special port of PCRE2. 
For details, please see file 939 on this web site: http://www.cbttape.org The user-provided port also provides an API for LE languages such as COBOL and for the z/OS and z/VM versions of the Rexx languages. Building PCRE2 under VMS ------------------------ Alexey Chuphin has contributed some auxiliary files for building PCRE2 under OpenVMS. They are in the "vms" directory in the distribution tarball. Please read the file called vms/openvms_readme.txt. The pcre2test and pcre2grep programs contain some VMS-specific code. This has not been tested for some time. The PCRE2 maintainers would be grateful to learn whether it still works (or if anyone still uses it). ============================= Last updated: 17 October 2025 ============================= ================================================ FILE: doc/html/README.txt ================================================ README file for PCRE2 (Perl-compatible regular expression library) ================================================================== PCRE2 is a re-working of the original PCRE1 library to provide an entirely new API. Since its initial release in 2015, there has been further development of the code and it now differs from PCRE1 in more than just the API. There are new features, and the internals have been improved. The original PCRE1 library is now obsolete and no longer maintained. The latest release of PCRE2 is available in .tar.gz, tar.bz2, or .zip form from this GitHub repository: https://github.com/PCRE2Project/pcre2/releases There is a mailing list for discussion about the development of PCRE2 at pcre2-dev@googlegroups.com. You can subscribe by sending an email to pcre2-dev+subscribe@googlegroups.com. You can access the archives and also subscribe or manage your subscription here: https://groups.google.com/g/pcre2-dev Please read the NEWS file if you are upgrading from a previous release. 
The contents of this README file are: The PCRE2 APIs Documentation for PCRE2 Building PCRE2 on non-Unix-like systems Building PCRE2 without using autotools Building PCRE2 using autotools Retrieving configuration information Shared libraries Cross-compiling using autotools Making new tarballs Testing PCRE2 Character tables File manifest The PCRE2 APIs -------------- PCRE2 is written in C, and it has its own API. There are three sets of functions, one for the 8-bit library, which processes strings of bytes, one for the 16-bit library, which processes strings of 16-bit values, and one for the 32-bit library, which processes strings of 32-bit values. Unlike PCRE1, there are no C++ wrappers. The distribution does contain a set of C wrapper functions for the 8-bit library that are based on the POSIX regular expression API (see the pcre2posix man page). These are built into a library called libpcre2-posix. Note that this just provides a POSIX calling interface to PCRE2; the regular expressions themselves still follow Perl syntax and semantics. The POSIX API is restricted, and does not give full access to all of PCRE2's facilities. The header file for the POSIX-style functions is called pcre2posix.h. The official POSIX name is regex.h, but I did not want to risk possible problems with existing files of that name by distributing it that way. To use PCRE2 with an existing program that uses the POSIX API, pcre2posix.h will have to be renamed or pointed at by a link (or the program modified, of course). See the pcre2posix documentation for more details. Documentation for PCRE2 ----------------------- If you install PCRE2 in the normal way on a Unix-like system, you will end up with a set of man pages whose names all start with "pcre2". The one that is just called "pcre2" lists all the others. In addition to these man pages, the PCRE2 documentation is supplied in two other forms: 1. 
There are files called doc/pcre2.txt, doc/pcre2grep.txt, and doc/pcre2test.txt in the source distribution. The first of these is a concatenation of the text forms of all the section 3 man pages except the listing of pcre2demo.c and those that summarize individual functions. The other two are the text forms of the section 1 man pages for the pcre2grep and pcre2test commands. These text forms are provided for ease of scanning with text editors or similar tools. They are installed in <prefix>/share/doc/pcre2, where <prefix> is the installation prefix (defaulting to /usr/local). 2. A set of files containing all the documentation in HTML form, hyperlinked in various ways, and rooted in a file called index.html, is distributed in doc/html and installed in <prefix>/share/doc/pcre2/html. Building PCRE2 on non-Unix-like systems --------------------------------------- For a non-Unix-like system, please read the file NON-AUTOTOOLS-BUILD, though if your system supports the use of "configure" and "make" you may be able to build PCRE2 using autotools in the same way as for many Unix-like systems. This file also contains useful information on building for some unusual Unix environments (such as EBCDIC mainframes). PCRE2 can also be configured using CMake, which can be run in various ways (command line, GUI, etc). This creates Makefiles, solution files, etc. The file NON-AUTOTOOLS-BUILD has information about CMake. PCRE2 has been compiled on many different operating systems. It should be straightforward to build PCRE2 on any system that has a C99 or later compiler and library. Building PCRE2 without using autotools -------------------------------------- The use of autotools (in particular, libtool) is problematic in some environments, even some that are Unix or Unix-like. See the NON-AUTOTOOLS-BUILD file for ways of building PCRE2 without using autotools. 
Building PCRE2 using autotools ------------------------------ The following instructions assume the use of the widely used "configure; make; make install" (autotools) process. If you have downloaded and unpacked a PCRE2 release tarball, run the "configure" command from the PCRE2 directory, with your current directory set to the directory where you want the files to be created. This command is a standard GNU "autoconf" configuration script, for which generic instructions are supplied in the file INSTALL. The files in the GitHub repository do not contain "configure". If you have downloaded the PCRE2 source files from GitHub, before you can run "configure" you must run the shell script called autogen.sh. This runs a number of autotools to create a "configure" script (you must of course have the autotools commands installed in order to do this). Most commonly, people build PCRE2 within its own distribution directory, and in this case, on many systems, just running "./configure" is sufficient. However, the usual methods of changing standard defaults are available. For example: CFLAGS='-O2 -Wall' ./configure --prefix=/opt/local This command specifies that the C compiler should be run with the flags '-O2 -Wall' instead of the default, and that "make install" should install PCRE2 under /opt/local instead of the default /usr/local. If you want to build in a different directory, just run "configure" with that directory as current. For example, suppose you have unpacked the PCRE2 source into /source/pcre2/pcre2-xxx, but you want to build it in /build/pcre2/pcre2-xxx: cd /build/pcre2/pcre2-xxx /source/pcre2/pcre2-xxx/configure PCRE2 is written in C and is normally compiled as a C library. However, it is possible to build it as a C++ library, though the provided building apparatus does not have any features to support this. There are some optional features that can be included or omitted from the PCRE2 library. They are also documented in the pcre2build man page. . 
By default, both shared and static libraries are built. You can change this by adding one of these options to the "configure" command: --disable-shared --disable-static Setting --disable-shared ensures that PCRE2 libraries are built as static libraries. The binaries that are then created as part of the build process (for example, pcre2test and pcre2grep) are linked statically with one or more PCRE2 libraries, but may also be dynamically linked with other libraries such as libc. If you want these binaries to be fully statically linked, you can set LDFLAGS like this: LDFLAGS=--static ./configure --disable-shared Note the two hyphens in --static. Of course, this works only if static versions of all the relevant libraries are available for linking. See also "Shared libraries" below. Shared libraries are compiled with symbol versioning enabled on platforms that support this, but this can be disabled by adding --disable-symvers. . By default, only the 8-bit library is built. If you add --enable-pcre2-16 to the "configure" command, the 16-bit library is also built. If you add --enable-pcre2-32 to the "configure" command, the 32-bit library is also built. If you want only the 16-bit or 32-bit library, use --disable-pcre2-8 to disable building the 8-bit library. . If you want to include support for just-in-time (JIT) compiling, which can give large performance improvements on certain platforms, add --enable-jit to the "configure" command. This support is available only for certain hardware architectures. If you try to enable it on an unsupported architecture, there will be a compile time error. If in doubt, use --enable-jit=auto, which enables JIT only if the current hardware is supported. . If you are enabling JIT under SELinux environment you may also want to add --enable-jit-sealloc, which enables the use of an executable memory allocator that is compatible with SELinux. Warning: this allocator is experimental! 
It does not support fork() operation and may crash when no disk space is available. This option has no effect if JIT is disabled. . If you do not want to make use of the default support for UTF-8 Unicode character strings in the 8-bit library, UTF-16 Unicode character strings in the 16-bit library, or UTF-32 Unicode character strings in the 32-bit library, you can add --disable-unicode to the "configure" command. This reduces the size of the libraries. It is not possible to configure one library with Unicode support, and another without, in the same configuration. It is also not possible to use --enable-ebcdic (see below) with Unicode support, so if this option is set, you must also use --disable-unicode. When Unicode support is available, the use of a UTF encoding still has to be enabled by setting the PCRE2_UTF option at run time or starting a pattern with (*UTF). When PCRE2 is compiled with Unicode support, its input can only be either ASCII or UTF-8/16/32, even when running on EBCDIC platforms. As well as supporting UTF strings, Unicode support includes support for the \P, \p, and \X sequences that recognize Unicode character properties. However, only a subset of Unicode properties are supported; see the pcre2pattern man page for details. Escape sequences such as \d and \w in patterns do not by default make use of Unicode properties, but can be made to do so by setting the PCRE2_UCP option or starting a pattern with (*UCP). . You can build PCRE2 to recognize either CR or LF or the sequence CRLF, or any of the preceding, or any of the Unicode newline sequences, or the NUL (zero) character as indicating the end of a line. Whatever you specify at build time is the default; the caller of PCRE2 can change the selection at run time. The default newline indicator is a single LF character (the Unix standard). 
You can specify the default newline indicator by adding --enable-newline-is-cr, --enable-newline-is-lf, --enable-newline-is-crlf, --enable-newline-is-anycrlf, --enable-newline-is-any, or --enable-newline-is-nul to the "configure" command, respectively. . By default, the sequence \R in a pattern matches any Unicode line ending sequence. This is independent of the option specifying what PCRE2 considers to be the end of a line (see above). However, the caller of PCRE2 can restrict \R to match only CR, LF, or CRLF. You can make this the default by adding --enable-bsr-anycrlf to the "configure" command (bsr = "backslash R"). . In a pattern, the escape sequence \C matches a single code unit, even in a UTF mode. This can be dangerous because it breaks up multi-code-unit characters. You can build PCRE2 with the use of \C permanently locked out by adding --enable-never-backslash-C (note the upper case C) to the "configure" command. When \C is allowed by the library, individual applications can lock it out by calling pcre2_compile() with the PCRE2_NEVER_BACKSLASH_C option. . PCRE2 has a counter that limits the depth of nesting of parentheses in a pattern. This limits the amount of system stack that a pattern uses when it is compiled. The default is 250, but you can change it by setting, for example, --with-parens-nest-limit=500 . PCRE2 has a counter that can be set to limit the amount of computing resource it uses when matching a pattern. If the limit is exceeded during a match, the match fails. The default is ten million. You can change the default by setting, for example, --with-match-limit=500000 on the "configure" command. This is just the default; individual calls to pcre2_match() or pcre2_dfa_match() can supply their own value. There is more discussion in the pcre2api man page (search for pcre2_set_match_limit). . 
There is a separate counter that limits the depth of nested backtracking (pcre2_match()) or nested function calls (pcre2_dfa_match()) during a matching process, which indirectly limits the amount of heap memory that is used, and in the case of pcre2_dfa_match() the amount of stack as well. This counter also has a default of ten million, which is essentially "unlimited". You can change the default by setting, for example, --with-match-limit-depth=5000 There is more discussion in the pcre2api man page (search for pcre2_set_depth_limit). . You can also set an explicit limit on the amount of heap memory used by the pcre2_match() and pcre2_dfa_match() interpreters: --with-heap-limit=500 The units are kibibytes (units of 1024 bytes). This limit does not apply when the JIT optimization (which has its own memory control features) is used. There is more discussion on the pcre2api man page (search for pcre2_set_heap_limit). . In the 8-bit library, the default maximum compiled pattern size is around 64 kibibytes. You can increase this by adding --with-link-size=3 to the "configure" command. PCRE2 then uses three bytes instead of two for offsets to different parts of the compiled pattern. In the 16-bit library, --with-link-size=3 is the same as --with-link-size=4, which (in both libraries) uses four-byte offsets. Increasing the internal link size reduces performance in the 8-bit and 16-bit libraries. In the 32-bit library, the link size setting is ignored, as 4-byte offsets are always used. . Lookbehind assertions in which one or more branches can match a variable number of characters are supported only if there is a maximum matching length for each top-level branch. There is a limit to this maximum that defaults to 255 characters. You can alter this default by a setting such as --with-max-varlookbehind=100 The limit can be changed at runtime by calling pcre2_set_max_varlookbehind(). 
Lookbehind assertions in which every branch matches a fixed number of characters (not necessarily all the same) are not constrained by this limit. . For speed, PCRE2 uses four tables for manipulating and identifying characters whose code point values are less than 256. By default, it uses a set of tables for ASCII encoding that is part of the distribution. If you specify --enable-rebuild-chartables a program called pcre2_dftables is compiled and run in the default C locale when you obey "make". It builds a source file called pcre2_chartables.c. If you do not specify this option, pcre2_chartables.c is created as a copy of pcre2_chartables.c.dist. See "Character tables" below for further information. . It is possible to compile PCRE2 for use on systems that use EBCDIC as their character code (as opposed to ASCII/Unicode) by specifying --enable-ebcdic --disable-unicode This automatically implies --enable-rebuild-chartables (see above), in order to ensure that you have the correct default character tables for your system's codepage. There is an exception when you set --enable-ebcdic-ignoring-compiler (see below), which allows using a default set of EBCDIC 1047 character tables rather than forcing use of --enable-rebuild-chartables. When PCRE2 is built with EBCDIC support, it always operates in EBCDIC. It cannot support both EBCDIC and ASCII or UTF-8/16/32. There is a second option, --enable-ebcdic-nl25, which specifies that the code value for the EBCDIC NL character is 0x25 instead of the default 0x15. There is a third option, --enable-ebcdic-ignoring-compiler, which disregards the compiler's codepage for determining the numeric value of C character constants such as 'z', and instead forces PCRE2 to use numeric constants for the EBCDIC 1047 codepage. . If you specify --enable-debug, additional debugging code is included in the build. This option is intended for use by the PCRE2 maintainers. . 
In environments where valgrind is installed, if you specify --enable-valgrind PCRE2 will use valgrind annotations to mark certain memory regions as unaddressable. This allows it to detect invalid memory accesses, and is mostly useful for debugging PCRE2 itself. . In environments where the gcc compiler is used and lcov is installed, if you specify --enable-coverage the build process implements a code coverage report for the test suite. The report is generated by running "make coverage". If ccache is installed on your system, it must be disabled when building PCRE2 for coverage reporting. You can do this by setting the environment variable CCACHE_DISABLE=1 before running "make" to build PCRE2. There is more information about coverage reporting in the "pcre2build" documentation. . When JIT support is enabled, pcre2grep automatically makes use of it, unless you add --disable-pcre2grep-jit to the "configure" command. . There is support for calling external programs during matching in the pcre2grep command, using PCRE2's callout facility with string arguments. This support can be disabled by adding --disable-pcre2grep-callout to the "configure" command. There are two kinds of callout: one that generates output from inbuilt code, and another that calls an external program. The latter has special support for Windows and VMS; otherwise it assumes the existence of the fork() function. This facility can be disabled by adding --disable-pcre2grep-callout-fork to the "configure" command. . The pcre2grep program currently supports only 8-bit data files, and so requires the 8-bit PCRE2 library. It is possible to compile pcre2grep to use libz and/or libbz2, in order to read .gz and .bz2 files (respectively), by specifying one or both of --enable-pcre2grep-libz --enable-pcre2grep-libbz2 Of course, the relevant libraries must be installed on your system. . 
The default starting size (in bytes) of the internal buffer used by pcre2grep can be set by, for example: --with-pcre2grep-bufsize=51200 The value must be a plain integer. The default is 20480. The amount of memory used by pcre2grep is actually three times this number, to allow for "before" and "after" lines. If very long lines are encountered, the buffer is automatically enlarged, up to a fixed maximum size. . The default maximum size of pcre2grep's internal buffer can be set by, for example: --with-pcre2grep-max-bufsize=2097152 The default is either 1048576 or the value of --with-pcre2grep-bufsize, whichever is the larger. . It is possible to compile pcre2test so that it links with the libreadline or libedit libraries, by specifying, respectively, --enable-pcre2test-libreadline or --enable-pcre2test-libedit If this is done, when pcre2test's input is from a terminal, it reads it using the readline() function. This provides line-editing and history facilities. Note that libreadline is GPL-licensed, so if you distribute a binary of pcre2test linked in this way, there may be licensing issues. These can be avoided by linking with libedit (which has a BSD licence) instead. Enabling libreadline causes the -lreadline option to be added to the pcre2test build. In many operating environments with a system-installed readline library this is sufficient. However, in some environments (e.g. if an unmodified distribution version of readline is in use), it may be necessary to specify something like LIBS="-lncurses" as well. This is because, to quote the readline INSTALL, "Readline uses the termcap functions, but does not link with the termcap or curses library itself, allowing applications which link with readline the option to choose an appropriate library." If you get error messages about missing functions tgetstr, tgetent, tputs, tgetflag, or tgoto, this is the problem, and linking with the ncurses library should fix it. . 
The C99 standard defines formatting modifiers z and t for size_t and ptrdiff_t values, respectively. By default, PCRE2 uses these modifiers in environments other than Microsoft Visual Studio versions earlier than 2013 when __STDC_VERSION__ is defined and has a value greater than or equal to 199901L (indicating C99). However, there is at least one environment that claims to be C99 but does not support these modifiers. If --disable-percent-zt is specified, no use is made of the z or t modifiers. Instead of %td or %zu, %lu is used, with a cast for size_t values. . There is a special option called --enable-fuzz-support for use by people who want to run fuzzing tests on PCRE2. If set, it causes an extra library called libpcre2-fuzzsupport.a to be built, but not installed. This contains a single function called LLVMFuzzerTestOneInput() whose arguments are a pointer to a string and the length of the string. When called, this function tries to compile the string as a pattern, and if that succeeds, to match it. This is done both with no options and with some random options bits that are generated from the string. Setting --enable-fuzz-support also causes an executable called pcre2fuzzcheck-{8,16,32} to be created. This is normally run under valgrind or used when PCRE2 is compiled with address sanitizing enabled. It calls the fuzzing function and outputs information about what it is doing. The input strings are specified by arguments: if an argument starts with "=" the rest of it is a literal input string. Otherwise, it is assumed to be a file name, and the contents of the file are the test string. . Releases before 10.30 could be compiled with --disable-stack-for-recursion, which caused pcre2_match() to use individual blocks on the heap for backtracking instead of recursive function calls (which use the stack). This is now obsolete because pcre2_match() was refactored always to use the heap (in a much more efficient way than before). 
This option is retained for backwards compatibility, but has no effect other than to output a warning. The "configure" script builds the following files for the basic C library: . Makefile the makefile that builds the library . src/config.h build-time configuration options for the library . src/pcre2.h the public PCRE2 header file . pcre2-config script that shows the building settings such as CFLAGS that were set for "configure" . libpcre2-8.pc ) . libpcre2-16.pc ) data for the pkg-config command . libpcre2-32.pc ) . libpcre2-posix.pc ) . libtool script that builds shared and/or static libraries Versions of config.h and pcre2.h are distributed in the src directory of PCRE2 tarballs under the names config.h.generic and pcre2.h.generic. These are provided for those who have to build PCRE2 without using "configure" or CMake. If you use "configure" or CMake, the .generic versions are not used. The "configure" script also creates config.status, which is an executable script that can be run to recreate the configuration, and config.log, which contains compiler output from tests that "configure" runs. Once "configure" has run, you can run "make". This builds whichever of the libraries libpcre2-8, libpcre2-16 and libpcre2-32 are configured, and a test program called pcre2test. If you enabled JIT support with --enable-jit, another test program called pcre2_jit_test is built as well. If the 8-bit library is built, libpcre2-posix, pcre2posix_test, and the pcre2grep command are also built. Running "make" with the -j option may speed up compilation on multiprocessor systems. The command "make check" runs all the appropriate tests. Details of the PCRE2 tests are given below in a separate section of this document. The -j option of "make" can also be used when running the tests. You can use "make install" to install PCRE2 into live directories on your system. 
The following are installed (file names are all relative to the installation prefix that is set when "configure" is run): Commands (bin): pcre2test pcre2grep (if 8-bit support is enabled) pcre2-config Libraries (lib): libpcre2-8 (if 8-bit support is enabled) libpcre2-16 (if 16-bit support is enabled) libpcre2-32 (if 32-bit support is enabled) libpcre2-posix (if 8-bit support is enabled) Configuration information (lib/pkgconfig): libpcre2-8.pc libpcre2-16.pc libpcre2-32.pc libpcre2-posix.pc Header files (include): pcre2.h pcre2posix.h Man pages (share/man/man{1,3}): pcre2grep.1 pcre2test.1 pcre2-config.1 pcre2.3 pcre2*.3 (lots more pages, all starting "pcre2") HTML documentation (share/doc/pcre2/html): index.html *.html (lots more pages, hyperlinked from index.html) Text file documentation (share/doc/pcre2): AUTHORS COPYING ChangeLog LICENCE NEWS README SECURITY pcre2.txt (a concatenation of the man(3) pages) pcre2test.txt the pcre2test man page pcre2grep.txt the pcre2grep man page pcre2-config.txt the pcre2-config man page If you want to remove PCRE2 from your system, you can run "make uninstall". This removes all the files that "make install" installed. However, it does not remove any directories, because these are often shared with other programs. Retrieving configuration information ------------------------------------ Running "make install" installs the command pcre2-config, which can be used to recall information about the PCRE2 configuration and installation. For example: pcre2-config --version prints the version number, and pcre2-config --libs8 outputs information about where the 8-bit library is installed. This command can be included in makefiles for programs that use PCRE2, saving the programmer from having to remember too many details. Run pcre2-config with no arguments to obtain a list of possible arguments. The pkg-config command is another system for saving and retrieving information about installed libraries. 
Instead of separate commands for each library, a single command is used. For example: pkg-config --libs libpcre2-16 The data is held in *.pc files that are installed in a directory called lib/pkgconfig under the installation prefix. Shared libraries ---------------- The default distribution builds PCRE2 as shared libraries and static libraries, as long as the operating system supports shared libraries. Shared library support relies on the "libtool" script which is built as part of the "configure" process. The libtool script is used to compile and link both shared and static libraries. They are placed in a subdirectory called .libs when they are newly built. The programs pcre2test and pcre2grep are built to use these uninstalled libraries (by means of wrapper scripts in the case of shared libraries). When you use "make install" to install shared libraries, pcre2grep and pcre2test are automatically re-built to use the newly installed shared libraries before being installed themselves. However, the versions left in the build directory still use the uninstalled libraries. To build PCRE2 using static libraries only you must use --disable-shared when configuring it. For example: ./configure --prefix=/usr/gnu --disable-shared Then run "make" in the usual way. Similarly, you can use --disable-static to build only shared libraries. Note, however, that when you build only static libraries, binary programs such as pcre2test and pcre2grep may still be dynamically linked with other libraries (for example, libc) unless you set LDFLAGS to --static when running "configure". Cross-compiling using autotools ------------------------------- You can specify CC and CFLAGS in the normal way to the "configure" command, in order to cross-compile PCRE2 for some other host. However, you should NOT specify --enable-rebuild-chartables, because if you do, the pcre2_dftables.c source file is compiled and run on the local host, in order to generate the inbuilt character tables (the pcre2_chartables.c file). 
This will probably not work, because pcre2_dftables.c needs to be compiled with the local compiler, not the cross compiler. When --enable-rebuild-chartables is not specified, pcre2_chartables.c is created by making a copy of pcre2_chartables.c.dist, which is a default set of tables that assumes ASCII code. Cross-compiling with the default tables should not be a problem. If you need to modify the character tables when cross-compiling, you should move pcre2_chartables.c.dist out of the way, then compile pcre2_dftables.c by hand and run it on the local host to make a new version of pcre2_chartables.c.dist. See the pcre2build section "Creating character tables at build time" for more details. Making new tarballs ------------------- The command "make dist" creates three PCRE2 tarballs, in tar.gz, tar.bz2, and zip formats. The command "make distcheck" does the same, but then does a trial build of the new distribution to ensure that it works. If you have modified any of the man page sources in the doc directory, you should first run the maint/UpdateAlways script before making a distribution. This script creates the .txt and HTML forms of the documentation from the man pages. Testing PCRE2 ------------- To test the basic PCRE2 library on a Unix-like system, run the RunTest script. There is another script called RunGrepTest that tests the pcre2grep command. When the 8-bit library is built, a test program for the POSIX wrapper, called pcre2posix_test, is compiled, and when JIT support is enabled, a test program called pcre2_jit_test is built. The scripts and the program tests are all run when you obey "make check". For other environments, see the instructions in NON-AUTOTOOLS-BUILD. The RunTest script runs the pcre2test test program (which is documented in its own man page) on each of the relevant testinput files in the testdata directory, and compares the output with the contents of the corresponding testoutput files. 
RunTest places its output in directories testoutput{8,16,32}{,-jit,-dfa}. Other files whose names begin with "test" are used as working files in some tests. Some tests are relevant only when certain build-time options were selected. For example, the tests for UTF-8/16/32 features are run only when Unicode support is available. RunTest outputs a comment when it skips a test. Many (but not all) of the tests that are not skipped are run twice if JIT support is available. On the second run, JIT compilation is forced. This testing can be suppressed by putting "-nojit" on the RunTest command line. The entire set of tests is run once for each of the 8-bit, 16-bit and 32-bit libraries that are enabled. If you want to run just one set of tests, call RunTest with either the -8, -16 or -32 option. If valgrind is installed, you can run the tests under it by putting "-valgrind" on the RunTest command line. To run pcre2test on just one or more specific test files, give their numbers as arguments to RunTest, for example: RunTest 2 7 11 You can also specify ranges of tests such as 3-6 or 3- (meaning 3 to the end), or a number preceded by ~ to exclude a test. For example: RunTest 3-15 ~10 This runs tests 3 to 15, excluding test 10, and just ~13 runs all the tests except test 13. Whatever order the arguments are in, the tests are always run in numerical order. You can also call RunTest with the single argument "list" to cause it to output a list of tests. The test sequence starts with "test 0", which is a special test that has no input file, and whose output is not checked. This is because it will be different on different hardware and with different configurations. The test exists in order to exercise some of pcre2test's code that would not otherwise be run. Tests 1 and 2 can always be run, as they expect only plain text strings (not UTF) and make no use of Unicode properties. 
The first test file can be fed directly into the perltest.sh script to check that Perl gives the same results. The only difference you should see is in the first few lines, where the Perl version is given instead of the PCRE2 version. The second set of tests check auxiliary functions, error detection, and run-time flags that are specific to PCRE2. It also uses the debugging flags to check some of the internals of pcre2_compile(). If you build PCRE2 with a locale setting that is not the standard C locale, the character tables may be different (see next paragraph). In some cases, this may cause failures in the second set of tests. For example, in a locale where the isprint() function yields TRUE for characters in the range 128-255, the use of [:isascii:] inside a character class defines a different set of characters, and this shows up in this test as a difference in the compiled code, which is being listed for checking. For example, where the comparison test output contains [\x00-\x7f] the test might contain [\x00-\xff], and similarly in some other cases. This is not a bug in PCRE2. Test 3 checks pcre2_maketables(), the facility for building a set of character tables for a specific locale and using them instead of the default tables. The script uses the "locale" command to check for the availability of the "fr_FR", "french", or "fr" locale, and uses the first one that it finds. If the "locale" command fails, or if its output doesn't include "fr_FR", "french", or "fr" in the list of available locales, the third test cannot be run, and a comment is output to say why. If running this test produces an error like this: ** Failed to set locale "fr_FR" it means that the given locale is not available on your system, despite being listed by "locale". This does not mean that PCRE2 is broken. There are three alternative output files for the third test, because three different versions of the French locale have been encountered. 
The test passes if its output matches any one of them. Tests 4 and 5 check UTF and Unicode property support, test 4 being compatible with the perltest.sh script, and test 5 checking PCRE2-specific things. Tests 6 and 7 check the pcre2_dfa_match() alternative matching function, in non-UTF mode and UTF-mode with Unicode property support, respectively. Test 8 checks some internal offsets and code size features, but it is run only when Unicode support is enabled. The output is different in 8-bit, 16-bit, and 32-bit modes and for different link sizes, so there are different output files for each mode and link size. Tests 9 and 10 are run only in 8-bit mode, and tests 11 and 12 are run only in 16-bit and 32-bit modes. These are tests that generate different output in 8-bit mode. Each pair are for general cases and Unicode support, respectively. Test 13 checks the handling of non-UTF characters greater than 255 by pcre2_dfa_match() in 16-bit and 32-bit modes. Test 14 contains some special UTF and UCP tests that give different output for different code unit widths. Test 15 contains a number of tests that must not be run with JIT. They check, among other non-JIT things, the match-limiting features of the interpretive matcher. Test 16 is run only when JIT support is not available. It checks that an attempt to use JIT has the expected behaviour. Test 17 is run only when JIT support is available. It checks JIT complete and partial modes, match-limiting under JIT, and other JIT-specific features. Tests 18 and 19 are run only in 8-bit mode. They check the POSIX interface to the 8-bit library, without and with Unicode support, respectively. Test 20 checks the serialization functions by writing a set of compiled patterns to a file, and then reloading and checking them. Tests 21 and 22 test \C support when the use of \C is not locked out, without and with UTF support, respectively. Test 23 tests \C when it is locked out. 
Tests 24 and 25 test the experimental pattern conversion functions, without and with UTF support, respectively. Test 26 checks Unicode property support using tests that were generated automatically from the Unicode data tables. These are the archived version of the tests from Unicode 15. Test 27 checks Unicode property support using tests that are generated automatically from the currently-used Unicode data tables. Test 28 tests EBCDIC support, and is only run when PCRE2 is specifically compiled for EBCDIC. Test 29 tests EBCDIC when NL has been configured to be 0x25. Character tables ---------------- For speed, PCRE2 uses four tables for manipulating and identifying characters whose code point values are less than 256. By default, a set of tables that is built into the library is used. The pcre2_maketables() function can be called by an application to create a new set of tables in the current locale. These tables are passed to PCRE2 by calling pcre2_set_character_tables() to put a pointer into a compile context. The source file called pcre2_chartables.c contains the default set of tables. By default, this is created as a copy of pcre2_chartables.c.dist, which contains tables for ASCII coding. However, if --enable-rebuild-chartables is specified for ./configure, a new version of pcre2_chartables.c is built by the program pcre2_dftables (compiled from pcre2_dftables.c), which uses the ANSI C character handling functions such as isalnum(), isalpha(), isupper(), islower(), etc. to build the table sources. This means that the default C locale that is set for your system will control the contents of these default tables. You can change the default tables by editing pcre2_chartables.c and then re-building PCRE2. If you do this, you should take care to ensure that the file does not get automatically re-generated. The best way to do this is to move pcre2_chartables.c.dist out of the way and replace it with your customized tables. 
When the pcre2_dftables program is run as a result of specifying --enable-rebuild-chartables, it uses the default C locale that is set on your system. It does not pay attention to the LC_xxx environment variables. In other words, it uses the system's default locale rather than whatever the compiling user happens to have set. If you really do want to build a source set of character tables in a locale that is specified by the LC_xxx variables, you can run the pcre2_dftables program by hand with the -L option. For example: ./pcre2_dftables -L pcre2_chartables.c.special The second argument names the file where the source code for the tables is written. The first two 256-byte tables provide lower casing and case flipping functions, respectively. The next table consists of a number of 32-byte bit maps which identify certain character classes such as digits, "word" characters, white space, etc. These are used when building 32-byte bit maps that represent character classes for code points less than 256. The final 256-byte table has bits indicating various character types, as follows: 1 white space character 2 letter 4 lower case letter 8 decimal digit 16 alphanumeric or '_' You can also specify -b (with or without -L) when running pcre2_dftables. This causes the tables to be written in binary instead of as source code. A set of binary tables can be loaded into memory by an application and passed to pcre2_compile() in the same way as tables created dynamically by calling pcre2_maketables(). The tables are just a string of bytes, independent of hardware characteristics such as endianness. This means they can be bundled with an application that runs in different environments, to ensure consistent behaviour. See also the pcre2build section "Creating character tables at build time". File manifest ------------- The distribution should contain the files listed below. 
(A) Source files for the PCRE2 library functions and their headers are found in the src directory: src/pcre2_dftables.c auxiliary program for building pcre2_chartables.c when --enable-rebuild-chartables is specified src/pcre2_chartables.c.dist a default set of character tables that assume ASCII coding; unless --enable-rebuild-chartables is specified, used by copying to pcre2_chartables.c src/pcre2_chartables.c.ebcdic-1047-{nl15,nl25} a default set of character tables for EBCDIC 1047; used if --enable-ebcdic-ignoring-compiler is specified without --enable-rebuild-chartables src/pcre2posix.c ) src/pcre2_auto_possess.c ) src/pcre2_chkdint.c ) src/pcre2_compile.c ) src/pcre2_compile_cgroup.c ) src/pcre2_compile_class.c ) src/pcre2_config.c ) src/pcre2_context.c ) src/pcre2_convert.c ) src/pcre2_dfa_match.c ) src/pcre2_error.c ) src/pcre2_extuni.c ) src/pcre2_find_bracket.c ) src/pcre2_jit_compile.c ) src/pcre2_maketables.c ) sources for the functions in the library, src/pcre2_match.c ) and some internal functions that they use src/pcre2_match_data.c ) src/pcre2_match_next.c ) src/pcre2_newline.c ) src/pcre2_ord2utf.c ) src/pcre2_pattern_info.c ) src/pcre2_script_run.c ) src/pcre2_serialize.c ) src/pcre2_string_utils.c ) src/pcre2_study.c ) src/pcre2_substitute.c ) src/pcre2_substring.c ) src/pcre2_tables.c ) src/pcre2_ucd.c ) src/pcre2_valid_utf.c ) src/pcre2_xclass.c ) src/pcre2_fuzzsupport.c function for (optional) fuzzing support src/config.h.in template for config.h, when built by "configure" src/pcre2.h.in template for pcre2.h when built by "configure" src/pcre2posix.h header for the external POSIX wrapper API src/pcre2_compile.h header for internal use src/pcre2_internal.h header for internal use src/pcre2_intmodedep.h a mode-specific internal header src/pcre2_jit_char_inc.h header used by JIT src/pcre2_jit_match_inc.h header used by JIT src/pcre2_jit_misc_inc.h header used by JIT src/pcre2_jit_simd_inc.h header used by JIT src/pcre2_printint_inc.h debugging 
function that is used by pcre2test src/pcre2_ucp.h header for Unicode property handling src/pcre2_ucptables_inc.h header with Unicode data tables src/pcre2_util.h header for internal utils deps/sljit/sljit_src/* source files for the JIT compiler (B) Source files for programs that use PCRE2: src/pcre2demo.c simple demonstration of coding calls to PCRE2 src/pcre2grep.c source of a grep utility that uses PCRE2 src/pcre2test.c comprehensive test program src/pcre2test_inc.h header used by pcre2test src/pcre2_jit_test.c JIT test program src/pcre2posix_test.c POSIX wrapper API test program (C) Auxiliary files: AUTHORS.md information about the authors of PCRE2 ChangeLog log of changes to the code HACKING some notes about the internals of PCRE2 INSTALL generic installation instructions LICENCE.md conditions for the use of PCRE2 COPYING the same, using GNU's standard name SECURITY.md information on reporting vulnerabilities Makefile.in ) template for Unix Makefile, which is built by ) "configure" Makefile.am ) the automake input that was used to create ) Makefile.in NEWS important changes in this release NON-AUTOTOOLS-BUILD notes on building PCRE2 without using autotools README this file RunTest a Unix shell script for running tests RunGrepTest a Unix shell script for pcre2grep tests RunTest.bat a Windows batch file for running tests RunGrepTest.bat a Windows batch file for pcre2grep tests aclocal.m4 m4 macros (generated by "aclocal") m4/* m4 macros (used by autoconf) configure a configuring shell script (built by autoconf) configure.ac ) the autoconf input that was used to build ) "configure" and config.h doc/*.3 man page sources for PCRE2 doc/*.1 man page sources for pcre2grep and pcre2test doc/html/* HTML documentation doc/pcre2.txt plain text version of the man pages doc/pcre2-config.txt plain text documentation of pcre2-config script doc/pcre2grep.txt plain text documentation of grep utility program doc/pcre2test.txt plain text documentation of test program 
libpcre2-8.pc.in template for libpcre2-8.pc for pkg-config libpcre2-16.pc.in template for libpcre2-16.pc for pkg-config libpcre2-32.pc.in template for libpcre2-32.pc for pkg-config libpcre2-posix.pc.in template for libpcre2-posix.pc for pkg-config ar-lib ) config.guess ) config.sub ) depcomp ) helper tools generated by libtool and compile ) automake, used internally by ./configure install-sh ) ltmain.sh ) missing ) test-driver ) perltest.sh Script for running a Perl test program pcre2-config.in source of script which retains PCRE2 information testdata/testinput* test data for main library tests testdata/testoutput* expected test results testdata/grep* input and output for pcre2grep tests testdata/* other supporting test files src/libpcre2-8.sym.in ) src/libpcre2-16.sym.in ) symbol version script templates for the src/libpcre2-32.sym.in ) GNU, BSD and Sun linkers src/libpcre2-posix.sym.in ) (D) Auxiliary files for CMake support cmake/COPYING-CMAKE-SCRIPTS cmake/FindEditline.cmake cmake/FindReadline.cmake cmake/pcre2-config.cmake.in cmake/PCRE2CheckVscript.cmake cmake/PCRE2UseSystemExtensions.cmake cmake/PCRE2WarningAsError.cmake src/config-cmake.h.in CMakeLists.txt (E) Auxiliary files for building PCRE2 "by hand" src/pcre2.h.generic ) a version of the public PCRE2 header file ) for use in non-"configure" environments src/config.h.generic ) a version of config.h for use in non-"configure" ) environments (F) Auxiliary files for building PCRE2 using other build systems BUILD.bazel ) files used by the Bazel MODULE.bazel ) build system build.zig file used by zig's build system (G) Auxiliary files for building PCRE2 under OpenVMS vms/configure.com ) vms/openvms_readme.txt ) These files were contributed by a PCRE2 user. 
vms/pcre2.h_patch ) vms/stdint.h ) ============================= Last updated: 15 October 2025 ============================= ================================================ FILE: doc/html/index.html ================================================ PCRE2 specification

Perl-compatible Regular Expressions (revised API: PCRE2)

The HTML documentation for PCRE2 consists of a number of pages that are listed below in alphabetical order. If you are new to PCRE2, please read the first one first.

pcre2 Introductory page
pcre2-config Information about the installation configuration
pcre2api PCRE2's native API
pcre2build Building PCRE2
pcre2callout The callout facility
pcre2compat Compatibility with Perl
pcre2convert Experimental foreign pattern conversion functions
pcre2demo A demonstration C program that uses the PCRE2 library
pcre2grep The pcre2grep command
pcre2jit Discussion of the just-in-time optimization support
pcre2limits Details of size and other limits
pcre2matching Discussion of the two matching algorithms
pcre2partial Using PCRE2 for partial matching
pcre2pattern Specification of the regular expressions supported by PCRE2
pcre2perform Some comments on performance
pcre2posix The POSIX API to the PCRE2 8-bit library
pcre2sample Discussion of the pcre2demo program
pcre2serialize Serializing functions for saving precompiled patterns
pcre2syntax Syntax quick-reference summary
pcre2test The pcre2test command for testing PCRE2
pcre2unicode Discussion of Unicode and UTF-8/UTF-16/UTF-32 support

There are also individual pages that summarize the interface for each function in the library.

pcre2_callout_enumerate Enumerate callouts in a compiled pattern
pcre2_code_copy Copy a compiled pattern
pcre2_code_copy_with_tables Copy a compiled pattern and its character tables
pcre2_code_free Free a compiled pattern
pcre2_compile Compile a regular expression pattern
pcre2_compile_context_copy Copy a compile context
pcre2_compile_context_create Create a compile context
pcre2_compile_context_free Free a compile context
pcre2_config Show build-time related configuration options
pcre2_convert_context_copy Copy a convert context
pcre2_convert_context_create Create a convert context
pcre2_convert_context_free Free a convert context
pcre2_converted_pattern_free Free converted foreign pattern
pcre2_dfa_match Match a compiled pattern to a subject string (DFA algorithm; not Perl compatible)
pcre2_general_context_copy Copy a general context
pcre2_general_context_create Create a general context
pcre2_general_context_free Free a general context
pcre2_get_error_message Get textual error message for error number
pcre2_get_mark Get a (*MARK) name
pcre2_get_match_data_size Get the size of a match data block
pcre2_get_ovector_count Get the ovector count
pcre2_get_ovector_pointer Get a pointer to the ovector
pcre2_get_startchar Get the starting character offset
pcre2_jit_compile Process a compiled pattern with the JIT compiler
pcre2_jit_free_unused_memory Free unused JIT memory
pcre2_jit_match Fast path interface to JIT matching
pcre2_jit_stack_assign Assign stack for JIT matching
pcre2_jit_stack_create Create a stack for JIT matching
pcre2_jit_stack_free Free a JIT matching stack
pcre2_maketables Build character tables in current locale
pcre2_maketables_free Free character tables
pcre2_match Match a compiled pattern to a subject string (Perl compatible)
pcre2_match_context_copy Copy a match context
pcre2_match_context_create Create a match context
pcre2_match_context_free Free a match context
pcre2_match_data_create Create a match data block
pcre2_match_data_create_from_pattern Create a match data block getting size from pattern
pcre2_match_data_free Free a match data block
pcre2_next_match Get the match parameters for the next match
pcre2_pattern_convert Experimental foreign pattern converter
pcre2_pattern_info Extract information about a pattern
pcre2_serialize_decode Decode serialized compiled patterns
pcre2_serialize_encode Serialize compiled patterns for save/restore
pcre2_serialize_free Free serialized compiled patterns
pcre2_serialize_get_number_of_codes Get number of serialized compiled patterns
pcre2_set_bsr Set \R convention
pcre2_set_callout Set up a callout function
pcre2_set_character_tables Set character tables
pcre2_set_compile_extra_options Set compile time extra options
pcre2_set_compile_recursion_guard Set up a compile recursion guard function
pcre2_set_depth_limit Set the match backtracking depth limit
pcre2_set_glob_escape Set glob escape character
pcre2_set_glob_separator Set glob separator character
pcre2_set_heap_limit Set the match backtracking heap limit
pcre2_set_match_limit Set the match limit
pcre2_set_max_pattern_compiled_length Set the maximum length of a compiled pattern
pcre2_set_max_pattern_length Set the maximum length of a pattern
pcre2_set_max_varlookbehind Set the maximum match length for a variable-length lookbehind
pcre2_set_newline Set the newline convention
pcre2_set_offset_limit Set the offset limit
pcre2_set_optimize Set an optimization directive
pcre2_set_parens_nest_limit Set the parentheses nesting limit
pcre2_set_recursion_limit Obsolete: use pcre2_set_depth_limit
pcre2_set_recursion_memory_management Obsolete function that (from 10.30 onwards) does nothing
pcre2_set_substitute_callout Set a substitution callout function
pcre2_set_substitute_case_callout Set a substitution case callout function
pcre2_substitute Match a compiled pattern to a subject string and do substitutions
pcre2_substring_copy_byname Extract named substring into given buffer
pcre2_substring_copy_bynumber Extract numbered substring into given buffer
pcre2_substring_free Free extracted substring
pcre2_substring_get_byname Extract named substring into new memory
pcre2_substring_get_bynumber Extract numbered substring into new memory
pcre2_substring_length_byname Find length of named substring
pcre2_substring_length_bynumber Find length of numbered substring
pcre2_substring_list_free Free list of extracted substrings
pcre2_substring_list_get Extract all substrings into new memory
pcre2_substring_nametable_scan Find table entries for given string name
pcre2_substring_number_from_name Convert captured string name to number
================================================ FILE: doc/html/pcre2-config.html ================================================ pcre2-config specification

pcre2-config man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

pcre2-config [--prefix] [--exec-prefix] [--version] [--libs8] [--libs16] [--libs32] [--libs-posix] [--cflags] [--cflags-posix]

DESCRIPTION

pcre2-config returns the configuration of the installed PCRE2 libraries and the options required to compile a program to use them. Some of the options apply only to the 8-bit, 16-bit, or 32-bit libraries, respectively, and are not available for libraries that have not been built. If an unavailable option is encountered, the "usage" information is output.

OPTIONS

--prefix Writes the directory prefix used in the PCRE2 installation for architecture-independent files (/usr on many systems, /usr/local on some systems) to the standard output.

--exec-prefix Writes the directory prefix used in the PCRE2 installation for architecture-dependent files (normally the same as --prefix) to the standard output.

--version Writes the version number of the installed PCRE2 libraries to the standard output.

--libs8 Writes to the standard output the command line options required to link with the 8-bit PCRE2 library (-lpcre2-8 on many systems).

--libs16 Writes to the standard output the command line options required to link with the 16-bit PCRE2 library (-lpcre2-16 on many systems).

--libs32 Writes to the standard output the command line options required to link with the 32-bit PCRE2 library (-lpcre2-32 on many systems).

--libs-posix Writes to the standard output the command line options required to link with PCRE2's POSIX API wrapper library (-lpcre2-posix -lpcre2-8 on many systems).

--cflags Writes to the standard output the command line options required to compile files that use PCRE2 (this may include some -I options, but is blank on many systems).

--cflags-posix Writes to the standard output the command line options required to compile files that use PCRE2's POSIX API wrapper library (this may include some -I options, but is blank on many systems).

SEE ALSO

pcre2(3)

AUTHOR

This manual page was originally written by Mark Baker for the Debian GNU/Linux system. It has been subsequently revised as a generic PCRE2 man page.

REVISION

Last updated: 22 February 2025

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2.html ================================================ pcre2 specification

pcre2 man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

INTRODUCTION

PCRE2 is the name used for a revised API for the PCRE library, which is a set of functions, written in C, that implement regular expression pattern matching using the same syntax and semantics as Perl, with just a few differences. After nearly two decades, the limitations of the original API were making development increasingly difficult. The new API is more extensible, and it was simplified by abolishing the separate "study" optimizing function; in PCRE2, patterns are automatically optimized where possible. Since forking from PCRE1, the code has been extensively refactored and new features introduced. The old library is now obsolete and is no longer maintained.

As well as Perl-style regular expression patterns, some features that appeared in Python and the original PCRE before they appeared in Perl are available using the Python syntax. There is also support for some .NET and Oniguruma syntax items, and there are options for requesting minor changes that give better ECMAScript (JavaScript) compatibility.

The source code for PCRE2 can be compiled to support strings of 8-bit, 16-bit, or 32-bit code units, which means that up to three separate libraries may be installed, one for each code unit size. The size of a code unit is not related to the bit size of the underlying hardware. In a 64-bit environment that also supports 32-bit applications, versions of PCRE2 that are compiled in both 64-bit and 32-bit modes may be needed.

The original work to extend PCRE to 16-bit and 32-bit code units was done by Zoltan Herczeg and Christian Persch, respectively. In all three cases, strings can be interpreted either as one character per code unit, or as UTF-encoded Unicode, with support for Unicode general category properties. Unicode support is optional at build time (but is the default). However, processing strings as UTF code units must be enabled explicitly at run time. The version of Unicode in use can be discovered by running

  pcre2test -C

The three libraries contain identical sets of functions, with names ending in _8, _16, or _32, respectively (for example, pcre2_compile_8()). However, by defining PCRE2_CODE_UNIT_WIDTH to be 8, 16, or 32, a program that uses just one code unit width can be written using generic names such as pcre2_compile(), and the documentation is written assuming that this is the case.

In addition to the Perl-compatible matching function, PCRE2 contains an alternative function that matches the same compiled patterns in a different way. In certain circumstances, the alternative function has some advantages. For a discussion of the two matching algorithms, see the pcre2matching page.

Details of exactly which Perl regular expression features are and are not supported by PCRE2 are given in separate documents. See the pcre2pattern and pcre2compat pages. There is a syntax summary in the pcre2syntax page.

Some features of PCRE2 can be included, excluded, or changed when the library is built. The pcre2_config() function makes it possible for a client to discover which features are available. The features themselves are described in the pcre2build page. Documentation about building PCRE2 for various operating systems can be found in the README and NON-AUTOTOOLS-BUILD files in the source distribution.

The libraries contain a number of undocumented internal functions and data tables that are used by more than one of the exported external functions, but which are not intended for use by external callers. Their names all begin with "_pcre2", which hopefully will not provoke any name clashes. In some environments, it is possible to control which external symbols are exported when a shared library is built, and in these cases the undocumented symbols are not exported.

SECURITY CONSIDERATIONS

If you are using PCRE2 in a non-UTF application that permits users to supply arbitrary patterns for compilation, you should be aware of a feature that allows users to turn on UTF support from within a pattern. For example, an 8-bit pattern that begins with "(*UTF)" turns on UTF-8 mode, which interprets patterns and subjects as strings of UTF-8 code units instead of individual 8-bit characters. This causes both the pattern and any data against which it is matched to be checked for UTF-8 validity. If the data string is very long, such a check might use enough resources to cause your application to lose performance.

One way of guarding against this possibility is to use the pcre2_pattern_info() function to check the compiled pattern's options for PCRE2_UTF. Alternatively, you can set the PCRE2_NEVER_UTF option when calling pcre2_compile(). This causes a compile time error if the pattern contains a UTF-setting sequence.

The use of Unicode properties for character types such as \d can also be enabled from within the pattern, by specifying "(*UCP)". This feature can be disallowed by setting the PCRE2_NEVER_UCP option.

If your application is one that supports UTF, be aware that validity checking can take time. If the same data string is to be matched many times, you can use the PCRE2_NO_UTF_CHECK option for the second and subsequent matches to avoid running redundant checks.

The use of the \C escape sequence in a UTF-8 or UTF-16 pattern can lead to problems, because it may leave the current matching point in the middle of a multi-code-unit character. The PCRE2_NEVER_BACKSLASH_C option can be used by an application to lock out the use of \C, causing a compile-time error if it is encountered. It is also possible to build PCRE2 with the use of \C permanently disabled.

Another way that performance can be hit is by running a pattern that has a very large search tree against a string that will never match. Nested unlimited repeats in a pattern are a common example. PCRE2 provides some protection against this: see the pcre2_set_match_limit() function in the pcre2api page. There is a similar function called pcre2_set_depth_limit() that can be used to restrict the amount of memory that is used.

USER DOCUMENTATION

The user documentation for PCRE2 comprises a number of different sections. In the "man" format, each of these is a separate "man page". In the HTML format, each is a separate page, linked from the index page. In the plain text format, the descriptions of the pcre2grep and pcre2test programs are in files called pcre2grep.txt and pcre2test.txt, respectively. The remaining sections, except for the pcre2demo section (which is a program listing), and the short pages for individual functions, are concatenated in pcre2.txt, for ease of searching. The sections are as follows:

  pcre2              this document
  pcre2-config       show PCRE2 installation configuration information
  pcre2api           details of PCRE2's native C API
  pcre2build         building PCRE2
  pcre2callout       details of the pattern callout feature
  pcre2compat        discussion of Perl compatibility
  pcre2convert       details of pattern conversion functions
  pcre2demo          a demonstration C program that uses PCRE2
  pcre2grep          description of the pcre2grep command (8-bit only)
  pcre2jit           discussion of just-in-time optimization support
  pcre2limits        details of size and other limits
  pcre2matching      discussion of the two matching algorithms
  pcre2partial       details of the partial matching facility
  pcre2pattern       syntax and semantics of supported regular expression patterns
  pcre2perform       discussion of performance issues
  pcre2posix         the POSIX-compatible C API for the 8-bit library
  pcre2sample        discussion of the pcre2demo program
  pcre2serialize     details of pattern serialization
  pcre2syntax        quick syntax reference
  pcre2test          description of the pcre2test command
  pcre2unicode       discussion of Unicode and UTF support
In the "man" and HTML formats, there is also a short page for each C library function, listing its arguments and results.

AUTHORS

The current maintainers of PCRE2 are Nicholas Wilson and Zoltan Herczeg.

PCRE2 was written by Philip Hazel, of the University Computing Service, Cambridge, England. Many others have also contributed.

To contact the maintainers, please use the GitHub issues tracker or PCRE2 mailing list, as described at the project page: https://github.com/PCRE2Project/pcre2

REVISION

Last updated: 22 February 2025
Copyright © 1997-2021 University of Cambridge.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_callout_enumerate.html ================================================ pcre2_callout_enumerate specification

pcre2_callout_enumerate man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

int pcre2_callout_enumerate(const pcre2_code *code, int (*callback)(pcre2_callout_enumerate_block *, void *), void *callout_data);

DESCRIPTION

This function scans a compiled regular expression and calls the callback() function for each callout within the pattern. The yield of the function is zero for success and non-zero otherwise. The arguments are:

  code           Points to the compiled pattern
  callback       The callback function
  callout_data   User data that is passed to the callback
The callback() function is passed a pointer to a data block containing the following fields (not necessarily in this order):
  uint32_t   version                Block version number
  uint32_t   callout_number         Number for numbered callouts
  PCRE2_SIZE pattern_position       Offset to next item in pattern
  PCRE2_SIZE next_item_length       Length of next item in pattern
  PCRE2_SIZE callout_string_offset  Offset to string within pattern
  PCRE2_SIZE callout_string_length  Length of callout string
  PCRE2_SPTR callout_string         Points to callout string or is NULL
The second argument passed to the callback() function is the callout data that was passed to pcre2_callout_enumerate(). The callback() function must return zero for success. Any other value causes the pattern scan to stop, with the value being passed back as the result of pcre2_callout_enumerate().

There is a complete description of the PCRE2 native API in the pcre2api page and a description of the POSIX API in the pcre2posix page.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_code_copy.html ================================================ pcre2_code_copy specification

pcre2_code_copy man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

pcre2_code *pcre2_code_copy(const pcre2_code *code);

DESCRIPTION

This function makes a copy of the memory used for a compiled pattern, excluding any memory used by the JIT compiler. Without a subsequent call to pcre2_jit_compile(), the copy can be used only for non-JIT matching. The pointer to the character tables is copied, not the tables themselves (see pcre2_code_copy_with_tables()). The yield of the function is NULL if code is NULL or if sufficient memory cannot be obtained.

There is a complete description of the PCRE2 native API in the pcre2api page and a description of the POSIX API in the pcre2posix page.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_code_copy_with_tables.html ================================================ pcre2_code_copy_with_tables specification

pcre2_code_copy_with_tables man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

pcre2_code *pcre2_code_copy_with_tables(const pcre2_code *code);

DESCRIPTION

This function makes a copy of the memory used for a compiled pattern, excluding any memory used by the JIT compiler. Without a subsequent call to pcre2_jit_compile(), the copy can be used only for non-JIT matching. Unlike pcre2_code_copy(), a separate copy of the character tables is also made, with the new code pointing to it. This memory will be automatically freed when pcre2_code_free() is called. The yield of the function is NULL if code is NULL or if sufficient memory cannot be obtained.

There is a complete description of the PCRE2 native API in the pcre2api page and a description of the POSIX API in the pcre2posix page.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_code_free.html ================================================ pcre2_code_free specification

pcre2_code_free man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

void pcre2_code_free(pcre2_code *code);

DESCRIPTION

If code is NULL, this function does nothing. Otherwise, code must point to a compiled pattern. This function frees its memory, including any memory used by the JIT compiler. If the compiled pattern was created by a call to pcre2_code_copy_with_tables(), the memory for the character tables is also freed.

There is a complete description of the PCRE2 native API in the pcre2api page and a description of the POSIX API in the pcre2posix page.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_compile.html ================================================ pcre2_compile specification

pcre2_compile man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

pcre2_code *pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE length, uint32_t options, int *errorcode, PCRE2_SIZE *erroroffset, pcre2_compile_context *ccontext);

DESCRIPTION

This function compiles a regular expression pattern into an internal form. Its arguments are:

  pattern       A string containing the expression to be compiled
  length        The length of the string or PCRE2_ZERO_TERMINATED
  options       Primary option bits
  errorcode     Where to put an error code
  erroroffset   Where to put an error offset
  ccontext      Pointer to a compile context or NULL
The length of the pattern and any error offset that is returned are in code units, not characters. A NULL pattern with zero length is treated as an empty string. A compile context is needed only if you want to provide custom memory allocation functions, or to provide an external function for system stack size checking (see pcre2_set_compile_recursion_guard()), or to change one or more of these parameters:
  What \R matches (Unicode newlines, or CR, LF, CRLF only);
  PCRE2's character tables;
  The newline character sequence;
  The compile time nested parentheses limit;
  The maximum pattern length (in code units) that is allowed;
  The additional options bits.
The primary option bits are:
  PCRE2_ANCHORED           Force pattern anchoring
  PCRE2_ALLOW_EMPTY_CLASS  Allow empty classes
  PCRE2_ALT_BSUX           Alternative handling of \u, \U, and \x
  PCRE2_ALT_CIRCUMFLEX     Alternative handling of ^ in multiline mode
  PCRE2_ALT_EXTENDED_CLASS Alternative extended character class syntax
  PCRE2_ALT_VERBNAMES      Process backslashes in verb names
  PCRE2_AUTO_CALLOUT       Compile automatic callouts
  PCRE2_CASELESS           Do caseless matching
  PCRE2_DOLLAR_ENDONLY     $ not to match newline at end
  PCRE2_DOTALL             . matches anything including NL
  PCRE2_DUPNAMES           Allow duplicate names for subpatterns
  PCRE2_ENDANCHORED        Pattern can match only at end of subject
  PCRE2_EXTENDED           Ignore white space and # comments
  PCRE2_FIRSTLINE          Force matching to be before newline
  PCRE2_LITERAL            Pattern characters are all literal
  PCRE2_MATCH_INVALID_UTF  Enable support for matching invalid UTF
  PCRE2_MATCH_UNSET_BACKREF  Match unset backreferences
  PCRE2_MULTILINE          ^ and $ match newlines within data
  PCRE2_NEVER_BACKSLASH_C  Lock out the use of \C in patterns
  PCRE2_NEVER_UCP          Lock out PCRE2_UCP, e.g. via (*UCP)
  PCRE2_NEVER_UTF          Lock out PCRE2_UTF, e.g. via (*UTF)
  PCRE2_NO_AUTO_CAPTURE    Disable numbered capturing paren-
                            theses (named ones available)
  PCRE2_NO_AUTO_POSSESS    Disable auto-possessification
  PCRE2_NO_DOTSTAR_ANCHOR  Disable automatic anchoring for .*
  PCRE2_NO_START_OPTIMIZE  Disable match-time start optimizations
  PCRE2_NO_UTF_CHECK       Do not check the pattern for UTF validity
                             (only relevant if PCRE2_UTF is set)
  PCRE2_UCP                Use Unicode properties for \d, \w, etc.
  PCRE2_UNGREEDY           Invert greediness of quantifiers
  PCRE2_USE_OFFSET_LIMIT   Enable offset limit for unanchored matching
  PCRE2_UTF                Treat pattern and subjects as UTF strings
PCRE2 must be built with Unicode support (the default) in order to use PCRE2_UTF, PCRE2_UCP and related options.

Additional options may be set in the compile context via the pcre2_set_compile_extra_options function.

If either of errorcode or erroroffset is NULL, the function returns NULL immediately. Otherwise, the yield of this function is a pointer to a private data structure that contains the compiled pattern, or NULL if an error was detected. In the error case, a text error message can be obtained by passing the value returned via the errorcode argument to the pcre2_get_error_message() function. The offset (in code units) where the error was encountered is returned via the erroroffset argument.

If there is no error, the value returned via errorcode yields the message "no error" when passed to pcre2_get_error_message(), and the value returned via erroroffset is zero.

There is a complete description of the PCRE2 native API, with more detail on each option, in the pcre2api page, and a description of the POSIX API in the pcre2posix page.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_compile_context_copy.html ================================================ pcre2_compile_context_copy specification

pcre2_compile_context_copy man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

pcre2_compile_context *pcre2_compile_context_copy( pcre2_compile_context *ccontext);

DESCRIPTION

This function makes a new copy of a compile context, using the memory allocation function that was used for the original context. The result is NULL if the memory cannot be obtained.

There is a complete description of the PCRE2 native API in the pcre2api page and a description of the POSIX API in the pcre2posix page.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_compile_context_create.html ================================================ pcre2_compile_context_create specification

pcre2_compile_context_create man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

pcre2_compile_context *pcre2_compile_context_create( pcre2_general_context *gcontext);

DESCRIPTION

This function creates and initializes a new compile context. If its argument is NULL, malloc() is used to get the necessary memory; otherwise the memory allocation function within the general context is used. The result is NULL if the memory could not be obtained.

There is a complete description of the PCRE2 native API in the pcre2api page and a description of the POSIX API in the pcre2posix page.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_compile_context_free.html ================================================ pcre2_compile_context_free specification

pcre2_compile_context_free man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

void pcre2_compile_context_free(pcre2_compile_context *ccontext);

DESCRIPTION

This function frees the memory occupied by a compile context, using the memory freeing function from the general context with which it was created, or free() if that was not set. If the argument is NULL, the function returns immediately without doing anything.

There is a complete description of the PCRE2 native API in the pcre2api page and a description of the POSIX API in the pcre2posix page.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_config.html ================================================ pcre2_config specification

pcre2_config man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

int pcre2_config(uint32_t what, void *where);

DESCRIPTION

This function makes it possible for a client program to find out which optional features are available in the version of the PCRE2 library it is using. The arguments are as follows:

  what     A code specifying what information is required
  where    Points to where to put the information
If where is NULL, the function returns the amount of memory needed for the requested information. When the information is a string, the value is in code units; for other types of data it is in bytes.

If where is not NULL, for PCRE2_CONFIG_JITTARGET, PCRE2_CONFIG_UNICODE_VERSION, and PCRE2_CONFIG_VERSION it must point to a buffer that is large enough to hold the string. For all other codes it must point to a uint32_t integer variable. The available codes are:

  PCRE2_CONFIG_BSR                Indicates what \R matches by default:
                                    PCRE2_BSR_UNICODE
                                    PCRE2_BSR_ANYCRLF
  PCRE2_CONFIG_COMPILED_WIDTHS    Which of 8/16/32 support was compiled
  PCRE2_CONFIG_DEPTHLIMIT         Default backtracking depth limit
  PCRE2_CONFIG_EFFECTIVE_LINKSIZE How many bytes are used for link size
  PCRE2_CONFIG_HEAPLIMIT          Default heap memory limit
  PCRE2_CONFIG_JIT                Availability of just-in-time compiler support (1=yes 0=no)
  PCRE2_CONFIG_JITTARGET          Information (a string) about the target architecture for the JIT compiler
  PCRE2_CONFIG_LINKSIZE           Configured internal link size (2, 3, 4)
  PCRE2_CONFIG_MATCHLIMIT         Default internal resource limit
  PCRE2_CONFIG_NEVER_BACKSLASH_C  Whether or not \C is disabled
  PCRE2_CONFIG_NEWLINE            Code for the default newline sequence:
                                    PCRE2_NEWLINE_CR
                                    PCRE2_NEWLINE_LF
                                    PCRE2_NEWLINE_CRLF
                                    PCRE2_NEWLINE_ANY
                                    PCRE2_NEWLINE_ANYCRLF
                                    PCRE2_NEWLINE_NUL
  PCRE2_CONFIG_PARENSLIMIT        Default parentheses nesting limit
  PCRE2_CONFIG_RECURSIONLIMIT     Obsolete: use PCRE2_CONFIG_DEPTHLIMIT
  PCRE2_CONFIG_STACKRECURSE       Obsolete: always returns 0
  PCRE2_CONFIG_UNICODE            Availability of Unicode support (1=yes 0=no)
  PCRE2_CONFIG_UNICODE_VERSION    The Unicode version (a string)
  PCRE2_CONFIG_VERSION            The PCRE2 version (a string)
The function yields a non-negative value on success or the negative value PCRE2_ERROR_BADOPTION otherwise. This is also the result for the PCRE2_CONFIG_JITTARGET code if JIT support is not available. When a string is requested, the function returns the number of code units used, including the terminating zero.

There is a complete description of the PCRE2 native API in the pcre2api page and a description of the POSIX API in the pcre2posix page.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_convert_context_copy.html ================================================ pcre2_convert_context_copy specification

pcre2_convert_context_copy man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

pcre2_convert_context *pcre2_convert_context_copy( pcre2_convert_context *cvcontext);

DESCRIPTION

This function is part of an experimental set of pattern conversion functions. It makes a new copy of a convert context, using the memory allocation function that was used for the original context. The result is NULL if the memory cannot be obtained.

The pattern conversion functions are described in the pcre2convert documentation.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_convert_context_create.html ================================================ pcre2_convert_context_create specification

pcre2_convert_context_create man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

pcre2_convert_context *pcre2_convert_context_create( pcre2_general_context *gcontext);

DESCRIPTION

This function is part of an experimental set of pattern conversion functions. It creates and initializes a new convert context. If its argument is NULL, malloc() is used to get the necessary memory; otherwise the memory allocation function within the general context is used. The result is NULL if the memory could not be obtained.

The pattern conversion functions are described in the pcre2convert documentation.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_convert_context_free.html ================================================ pcre2_convert_context_free specification

pcre2_convert_context_free man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

void pcre2_convert_context_free(pcre2_convert_context *cvcontext);

DESCRIPTION

This function is part of an experimental set of pattern conversion functions. It frees the memory occupied by a convert context, using the memory freeing function from the general context with which it was created, or free() if that was not set. If the argument is NULL, the function returns immediately without doing anything.

The pattern conversion functions are described in the pcre2convert documentation.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_converted_pattern_free.html ================================================ pcre2_converted_pattern_free specification

pcre2_converted_pattern_free man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

void pcre2_converted_pattern_free(PCRE2_UCHAR *converted_pattern);

DESCRIPTION

This function is part of an experimental set of pattern conversion functions. It frees the memory occupied by a converted pattern that was obtained by calling pcre2_pattern_convert() with arguments that caused it to place the converted pattern into newly obtained heap memory. If the argument is NULL, the function returns immediately without doing anything.

The pattern conversion functions are described in the pcre2convert documentation.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_dfa_match.html ================================================ pcre2_dfa_match specification

pcre2_dfa_match man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

int pcre2_dfa_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length, PCRE2_SIZE startoffset, uint32_t options, pcre2_match_data *match_data, pcre2_match_context *mcontext, int *workspace, PCRE2_SIZE wscount);

DESCRIPTION

This function matches a compiled regular expression against a given subject string, using an alternative matching algorithm that scans the subject string just once (except when processing lookaround assertions). This function is not Perl-compatible (the Perl-compatible matching function is pcre2_match()). The arguments for this function are:

  code         Points to the compiled pattern
  subject      Points to the subject string
  length       Length of the subject string
  startoffset  Offset in the subject at which to start matching
  options      Option bits
  match_data   Points to a match data block, for results
  mcontext     Points to a match context, or is NULL
  workspace    Points to a vector of ints used as working space
  wscount      Number of elements in the vector
The size of the output vector needed to contain all the results depends on the number of simultaneous matches, not on the number of parentheses in the pattern. Using pcre2_match_data_create_from_pattern() to create the match data block is therefore not advisable when using this function.

A match context is needed only if you want to set up a callout function or specify the heap limit, the match limit, or the recursion depth limit. The length and startoffset values are code units, not characters. The options are:

  PCRE2_ANCHORED          Match only at the first position
  PCRE2_COPY_MATCHED_SUBJECT
                          On success, make a private subject copy
  PCRE2_ENDANCHORED       Pattern can match only at end of subject
  PCRE2_NOTBOL            Subject is not the beginning of a line
  PCRE2_NOTEOL            Subject is not the end of a line
  PCRE2_NOTEMPTY          An empty string is not a valid match
  PCRE2_NOTEMPTY_ATSTART  An empty string at the start of the subject is not a valid match
  PCRE2_NO_UTF_CHECK      Do not check the subject for UTF validity (only relevant if PCRE2_UTF
                           was set at compile time)
  PCRE2_PARTIAL_HARD      Return PCRE2_ERROR_PARTIAL for a partial match even if there is a full match
  PCRE2_PARTIAL_SOFT      Return PCRE2_ERROR_PARTIAL for a partial match if no full matches are found
  PCRE2_DFA_RESTART       Restart after a partial match
  PCRE2_DFA_SHORTEST      Return only the shortest match
There are restrictions on what may appear in a pattern when using this matching function. Details are given in the pcre2matching documentation. For details of partial matching, see the pcre2partial page. There is a complete description of the PCRE2 native API in the pcre2api page and a description of the POSIX API in the pcre2posix page.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_general_context_copy.html ================================================ pcre2_general_context_copy specification

pcre2_general_context_copy man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

pcre2_general_context *pcre2_general_context_copy( pcre2_general_context *gcontext);

DESCRIPTION

This function makes a new copy of a general context, using the memory allocation functions in the context, if set, to get the necessary memory. Otherwise malloc() is used. The result is NULL if the memory cannot be obtained.

There is a complete description of the PCRE2 native API in the pcre2api page and a description of the POSIX API in the pcre2posix page.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_general_context_create.html ================================================ pcre2_general_context_create specification

pcre2_general_context_create man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

pcre2_general_context *pcre2_general_context_create( void *(*private_malloc)(size_t, void *), void (*private_free)(void *, void *), void *memory_data);

DESCRIPTION

This function creates and initializes a general context. The arguments define custom memory management functions and a data value that is passed to them when they are called. The private_malloc() function is used to get memory for the context. If either of the first two arguments is NULL, the system memory management function is used. The result is NULL if no memory could be obtained.

There is a complete description of the PCRE2 native API in the pcre2api page and a description of the POSIX API in the pcre2posix page.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_general_context_free.html ================================================ pcre2_general_context_free specification

pcre2_general_context_free man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

void pcre2_general_context_free(pcre2_general_context *gcontext);

DESCRIPTION

This function frees the memory occupied by a general context, using the memory freeing function within the context, if set. If the argument is NULL, the function returns immediately without doing anything.

There is a complete description of the PCRE2 native API in the pcre2api page and a description of the POSIX API in the pcre2posix page.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_get_error_message.html ================================================ pcre2_get_error_message specification

pcre2_get_error_message man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

int pcre2_get_error_message(int errorcode, PCRE2_UCHAR *buffer, PCRE2_SIZE bufflen);

DESCRIPTION

This function provides a textual error message for each PCRE2 error code. Compilation errors are positive numbers; UTF formatting errors and matching errors are negative numbers. The arguments are:

  errorcode   an error code (positive or negative)
  buffer      where to put the message
  bufflen     the length of the buffer (code units)
The function returns the length of the message in code units, excluding the trailing zero, or the negative error code PCRE2_ERROR_NOMEMORY if the buffer is too small. In this case, the returned message is truncated (but still with a trailing zero). If errorcode does not contain a recognized error code number, the negative value PCRE2_ERROR_BADDATA is returned.

There is a complete description of the PCRE2 native API in the pcre2api page and a description of the POSIX API in the pcre2posix page.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_get_mark.html ================================================ pcre2_get_mark specification

pcre2_get_mark man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

PCRE2_SPTR pcre2_get_mark(pcre2_match_data *match_data);

DESCRIPTION

After a call of pcre2_match() that was passed the match block that is this function's argument, this function returns a pointer to the last (*MARK), (*PRUNE), or (*THEN) name that was encountered during the matching process. The name is zero-terminated, and is within the compiled pattern. The length of the name is in the preceding code unit. If no name is available, NULL is returned.

After a successful match, the name that is returned is the last one on the matching path. After a failed match or a partial match, the last encountered name is returned.

There is a complete description of the PCRE2 native API in the pcre2api page and a description of the POSIX API in the pcre2posix page.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_get_match_data_heapframes_size.html ================================================ pcre2_get_match_data_heapframes_size specification

pcre2_get_match_data_heapframes_size man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

PCRE2_SIZE pcre2_get_match_data_heapframes_size( pcre2_match_data *match_data);

DESCRIPTION

This function returns the size, in bytes, of the heapframes data block that is owned by its argument.

There is a complete description of the PCRE2 native API in the pcre2api page and a description of the POSIX API in the pcre2posix page.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_get_match_data_size.html ================================================ pcre2_get_match_data_size specification

pcre2_get_match_data_size man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

PCRE2_SIZE pcre2_get_match_data_size(pcre2_match_data *match_data);

DESCRIPTION

This function returns the size, in bytes, of the match data block that is its argument.

There is a complete description of the PCRE2 native API in the pcre2api page and a description of the POSIX API in the pcre2posix page.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_get_ovector_count.html ================================================ pcre2_get_ovector_count specification

pcre2_get_ovector_count man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

uint32_t pcre2_get_ovector_count(pcre2_match_data *match_data);

DESCRIPTION

This function returns the number of pairs of offsets in the ovector that forms part of the given match data block.

There is a complete description of the PCRE2 native API in the pcre2api page and a description of the POSIX API in the pcre2posix page.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_get_ovector_pointer.html ================================================ pcre2_get_ovector_pointer specification

pcre2_get_ovector_pointer man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

PCRE2_SIZE *pcre2_get_ovector_pointer(pcre2_match_data *match_data);

DESCRIPTION

This function returns a pointer to the vector of offsets that forms part of the given match data block. The number of pairs can be found by calling pcre2_get_ovector_count().

There is a complete description of the PCRE2 native API in the pcre2api page and a description of the POSIX API in the pcre2posix page.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_get_startchar.html ================================================ pcre2_get_startchar specification

pcre2_get_startchar man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

PCRE2_SIZE pcre2_get_startchar(pcre2_match_data *match_data);

DESCRIPTION

After a successful call of pcre2_match() that was passed the match block that is this function's argument, this function returns the code unit offset of the character at which the successful match started. For a non-partial match, this can be different to the value of ovector[0] if the pattern contains the \K escape sequence. After a partial match, however, this value is always the same as ovector[0] because \K does not affect the result of a partial match.

There is a complete description of the PCRE2 native API in the pcre2api page and a description of the POSIX API in the pcre2posix page.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_jit_compile.html ================================================ pcre2_jit_compile specification

pcre2_jit_compile man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

int pcre2_jit_compile(pcre2_code *code, uint32_t options);

DESCRIPTION

This function requests JIT compilation, which, if the just-in-time compiler is available, further processes a compiled pattern into machine code that executes much faster than the pcre2_match() interpretive matching function. Full details are given in the pcre2jit documentation.

The availability of JIT support can be tested by calling pcre2_jit_compile() with a single option PCRE2_JIT_TEST_ALLOC (the code argument is ignored, so a NULL value is accepted). Such a call returns zero if JIT is available and has a working allocator. Otherwise it returns PCRE2_ERROR_NOMEMORY if JIT is available but cannot allocate executable memory, or PCRE2_ERROR_JIT_UNSUPPORTED if JIT support is not compiled.

Otherwise, the first argument must be a pointer that was returned by a successful call to pcre2_compile(), and the second must contain one or more of the following bits:

  PCRE2_JIT_COMPLETE      compile code for full matching
  PCRE2_JIT_PARTIAL_SOFT  compile code for soft partial matching
  PCRE2_JIT_PARTIAL_HARD  compile code for hard partial matching
There is also an obsolete option called PCRE2_JIT_INVALID_UTF, which has been superseded by the pcre2_compile() option PCRE2_MATCH_INVALID_UTF. The old option is deprecated and may be removed in the future.

The yield of the function when called with any of the three options above is 0 for success, or a negative error code otherwise. In particular, PCRE2_ERROR_JIT_BADOPTION is returned if JIT is not supported or if an unknown bit is set in options. The function can also return PCRE2_ERROR_NOMEMORY if JIT is unable to allocate executable memory for the compiler, even if it was because of a system security restriction. In a few cases, the function may return with PCRE2_ERROR_JIT_UNSUPPORTED for unsupported features.

There is a complete description of the PCRE2 native API in the pcre2api page and a description of the POSIX API in the pcre2posix page.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_jit_free_unused_memory.html ================================================ pcre2_jit_free_unused_memory specification

pcre2_jit_free_unused_memory man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

void pcre2_jit_free_unused_memory(pcre2_general_context *gcontext);

DESCRIPTION

This function frees unused JIT executable memory. The argument is a general context, for custom memory management, or NULL for standard memory management. JIT memory allocation retains some memory in order to improve future JIT compilation speed. In low memory conditions, pcre2_jit_free_unused_memory() can be used to cause this memory to be freed.

There is a complete description of the PCRE2 native API in the pcre2api page and a description of the POSIX API in the pcre2posix page.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_jit_match.html ================================================ pcre2_jit_match specification

pcre2_jit_match man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

int pcre2_jit_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length, PCRE2_SIZE startoffset, uint32_t options, pcre2_match_data *match_data, pcre2_match_context *mcontext);

DESCRIPTION

This function matches a compiled regular expression that has been successfully processed by the JIT compiler against a given subject string, using a matching algorithm that is similar to Perl's. It is a "fast path" interface to JIT, and it bypasses some of the sanity checks that pcre2_match() applies.

In UTF mode, the subject string is not checked for UTF validity. Unless PCRE2_MATCH_INVALID_UTF was set when the pattern was compiled, passing an invalid UTF string results in undefined behaviour. Your program may crash or loop or give wrong results. In the absence of PCRE2_MATCH_INVALID_UTF you should only call pcre2_jit_match() in UTF mode if you are sure the subject is valid.

The arguments for pcre2_jit_match() are exactly the same as for pcre2_match(), except that the subject string must be specified with a length; PCRE2_ZERO_TERMINATED is not supported.

The supported options are PCRE2_NOTBOL, PCRE2_NOTEOL, PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART, PCRE2_PARTIAL_HARD, and PCRE2_PARTIAL_SOFT. Unsupported options are ignored.

The return values are the same as for pcre2_match() plus PCRE2_ERROR_JIT_BADOPTION if a matching mode (partial or complete) is requested that was not compiled. For details of partial matching, see the pcre2partial page.

There is a complete description of the PCRE2 native API in the pcre2api page and a description of the JIT API in the pcre2jit page.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_jit_stack_assign.html ================================================ pcre2_jit_stack_assign specification

pcre2_jit_stack_assign man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

void pcre2_jit_stack_assign(pcre2_match_context *mcontext, pcre2_jit_callback callback_function, void *callback_data);

DESCRIPTION

This function provides control over the memory used by JIT as a run-time stack when pcre2_match() or pcre2_jit_match() is called with a pattern that has been successfully processed by the JIT compiler. The information that determines which stack is used is put into a match context that is subsequently passed to a matching function. The arguments of this function are:

  mcontext       a pointer to a match context
  callback       a callback function
  callback_data  a JIT stack or a value to be passed to the callback

If mcontext is NULL, the function returns immediately, without doing anything.

If callback is NULL and callback_data is NULL, an internal 32KiB block on the machine stack is used.

If callback is NULL and callback_data is not NULL, callback_data must be a valid JIT stack, the result of calling pcre2_jit_stack_create().

If callback is not NULL, it is called with callback_data as an argument at the start of matching, in order to set up a JIT stack. If the result is NULL, the internal 32KiB stack is used; otherwise the return value must be a valid JIT stack, the result of calling pcre2_jit_stack_create().

You may safely use the same JIT stack for multiple patterns, as long as they are all matched in the same thread. In a multithread application, each thread must use its own JIT stack. For more details, see the pcre2jit page.

There is a complete description of the PCRE2 native API in the pcre2api page and a description of the POSIX API in the pcre2posix page.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_jit_stack_create.html ================================================ pcre2_jit_stack_create specification

pcre2_jit_stack_create man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

pcre2_jit_stack *pcre2_jit_stack_create(size_t startsize, size_t maxsize, pcre2_general_context *gcontext);

DESCRIPTION

This function is used to create a stack for use by the code compiled by the JIT compiler. The first two arguments are a starting size for the stack, and a maximum size to which it is allowed to grow. The final argument is a general context, for memory allocation functions, or NULL for standard memory allocation. The result can be passed to the JIT run-time code by calling pcre2_jit_stack_assign() to associate the stack with a compiled pattern, which can then be processed by pcre2_match() or pcre2_jit_match(). A maximum stack size of 512KiB to 1MiB should be more than enough for any pattern. If the stack couldn't be allocated or the values passed were not reasonable, NULL will be returned. For more details, see the pcre2jit page.

There is a complete description of the PCRE2 native API in the pcre2api page and a description of the POSIX API in the pcre2posix page.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_jit_stack_free.html ================================================ pcre2_jit_stack_free specification

pcre2_jit_stack_free man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

void pcre2_jit_stack_free(pcre2_jit_stack *jit_stack);

DESCRIPTION

This function is used to free a JIT stack that was created by pcre2_jit_stack_create() when it is no longer needed. If the argument is NULL, the function returns immediately without doing anything. For more details, see the pcre2jit page.

There is a complete description of the PCRE2 native API in the pcre2api page and a description of the POSIX API in the pcre2posix page.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_maketables.html ================================================ pcre2_maketables specification

pcre2_maketables man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

const uint8_t *pcre2_maketables(pcre2_general_context *gcontext);

DESCRIPTION

This function builds a set of character tables for character code points that are less than 256. These can be passed to pcre2_compile() in a compile context in order to override the internal, built-in tables (which were either defaulted or made by pcre2_maketables() when PCRE2 was compiled). See the pcre2_set_character_tables() page. You might want to do this if you are using a non-standard locale.

If the argument is NULL, malloc() is used to get memory for the tables. Otherwise it must point to a general context, which can supply pointers to a custom memory manager. The function yields a pointer to the tables.

There is a complete description of the PCRE2 native API in the pcre2api page and a description of the POSIX API in the pcre2posix page.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_maketables_free.html ================================================ pcre2_maketables_free specification

pcre2_maketables_free man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

void pcre2_maketables_free(pcre2_general_context *gcontext, const uint8_t *tables);

DESCRIPTION

This function discards a set of character tables that were created by a call to pcre2_maketables().

The gcontext parameter should match what was used in that call to account for any custom allocators that might be in use; if it is NULL the system free() is used.

There is a complete description of the PCRE2 native API in the pcre2api page.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_match.html ================================================ pcre2_match specification

pcre2_match man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

int pcre2_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length, PCRE2_SIZE startoffset, uint32_t options, pcre2_match_data *match_data, pcre2_match_context *mcontext);

DESCRIPTION

This function matches a compiled regular expression against a given subject string, using a matching algorithm that is similar to Perl's. It returns offsets to what it has matched and to captured substrings via the match_data block, which can be processed by functions with names that start with pcre2_get_ovector_...() or pcre2_substring_...(). The return from pcre2_match() is one more than the highest numbered capturing pair that has been set (for example, 1 if there are no captures), zero if the vector of offsets is too small, or a negative error code for no match and other errors. The function arguments are:

  code         Points to the compiled pattern
  subject      Points to the subject string
  length       Length of the subject string
  startoffset  Offset in the subject at which to start matching
  options      Option bits
  match_data   Points to a match data block, for results
  mcontext     Points to a match context, or is NULL
A match context is needed only if you want to:
  Set up a callout function
  Set a matching offset limit
  Change the heap memory limit
  Change the backtracking match limit
  Change the backtracking depth limit
  Set custom memory management specifically for the match
The length and startoffset values are code units, not characters. The length may be given as PCRE2_ZERO_TERMINATED for a subject that is terminated by a binary zero code unit. The options are:
  PCRE2_ANCHORED          Match only at the first position
  PCRE2_COPY_MATCHED_SUBJECT
                          On success, make a private subject copy
  PCRE2_DISABLE_RECURSELOOP_CHECK
                          Only useful in rare cases; use with care
  PCRE2_ENDANCHORED       Pattern can match only at end of subject
  PCRE2_NOTBOL            Subject string is not the beginning of a line
  PCRE2_NOTEOL            Subject string is not the end of a line
  PCRE2_NOTEMPTY          An empty string is not a valid match
  PCRE2_NOTEMPTY_ATSTART  An empty string at the start of the subject is not a valid match
  PCRE2_NO_JIT            Do not use JIT matching
  PCRE2_NO_UTF_CHECK      Do not check the subject for UTF validity (only relevant if PCRE2_UTF
                           was set at compile time)
  PCRE2_PARTIAL_HARD      Return PCRE2_ERROR_PARTIAL for a partial match even if there is a full match
  PCRE2_PARTIAL_SOFT      Return PCRE2_ERROR_PARTIAL for a partial match if no full matches are found
For details of partial matching, see the pcre2partial page. There is a complete description of the PCRE2 native API in the pcre2api page and a description of the POSIX API in the pcre2posix page.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_match_context_copy.html ================================================ pcre2_match_context_copy specification

pcre2_match_context_copy man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

pcre2_match_context *pcre2_match_context_copy( pcre2_match_context *mcontext);

DESCRIPTION

This function makes a new copy of a match context, using the memory allocation function that was used for the original context. The result is NULL if the memory cannot be obtained.

There is a complete description of the PCRE2 native API in the pcre2api page and a description of the POSIX API in the pcre2posix page.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_match_context_create.html ================================================ pcre2_match_context_create specification

pcre2_match_context_create man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

pcre2_match_context *pcre2_match_context_create( pcre2_general_context *gcontext);

DESCRIPTION

This function creates and initializes a new match context. If its argument is NULL, malloc() is used to get the necessary memory; otherwise the memory allocation function within the general context is used. The result is NULL if the memory could not be obtained.

There is a complete description of the PCRE2 native API in the pcre2api page and a description of the POSIX API in the pcre2posix page.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_match_context_free.html ================================================ pcre2_match_context_free specification

pcre2_match_context_free man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

void pcre2_match_context_free(pcre2_match_context *mcontext);

DESCRIPTION

This function frees the memory occupied by a match context, using the memory freeing function from the general context with which it was created, or free() if that was not set. If the argument is NULL, the function returns immediately without doing anything.

There is a complete description of the PCRE2 native API in the pcre2api page and a description of the POSIX API in the pcre2posix page.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_match_data_create.html ================================================ pcre2_match_data_create specification

pcre2_match_data_create man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

pcre2_match_data *pcre2_match_data_create(uint32_t ovecsize, pcre2_general_context *gcontext);

DESCRIPTION

This function creates a new match data block, which is used for holding the result of a match. The first argument specifies the number of pairs of offsets that are required. These form the "output vector" (ovector) within the match data block, and are used to identify the matched string and any captured substrings when matching with pcre2_match(), or a number of different matches at the same point when used with pcre2_dfa_match(). There is always one pair of offsets; if ovecsize is zero, it is treated as one.

The second argument points to a general context, for custom memory management, or is NULL for system memory management. The result of the function is NULL if the memory for the block could not be obtained.

There is a complete description of the PCRE2 native API in the pcre2api page and a description of the POSIX API in the pcre2posix page.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_match_data_create_from_pattern.html ================================================ pcre2_match_data_create_from_pattern specification

pcre2_match_data_create_from_pattern man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

pcre2_match_data *pcre2_match_data_create_from_pattern( const pcre2_code *code, pcre2_general_context *gcontext);

DESCRIPTION

This function creates a new match data block for holding the result of a match. If the first argument is NULL, this function returns NULL, otherwise the first argument points to a compiled pattern. The number of capturing parentheses within the pattern is used to compute the number of pairs of offsets that are required in the match data block. These form the "output vector" (ovector) within the match data block, and are used to identify the matched string and any captured substrings when matching with pcre2_match(). If you are using pcre2_dfa_match(), which uses the output vector in a different way, you should use pcre2_match_data_create() instead of this function.

The second argument points to a general context, for custom memory management, or is NULL to use the same memory allocator that was used for the compiled pattern. The result of the function is NULL if the memory for the block could not be obtained or if NULL was provided as the first argument.

There is a complete description of the PCRE2 native API in the pcre2api page and a description of the POSIX API in the pcre2posix page.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_match_data_free.html ================================================ pcre2_match_data_free specification

pcre2_match_data_free man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

void pcre2_match_data_free(pcre2_match_data *match_data);

DESCRIPTION

If match_data is NULL, this function does nothing. Otherwise, match_data must point to a match data block, which this function frees, using the memory freeing function from the general context or compiled pattern with which it was created, or free() if that was not set. If the match data block was previously passed to pcre2_match(), it will have an attached heapframe vector; this is also freed.

If the PCRE2_COPY_MATCHED_SUBJECT was used for a successful match using this match data block, the copy of the subject that was referenced within the block is also freed.

There is a complete description of the PCRE2 native API in the pcre2api page and a description of the POSIX API in the pcre2posix page.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_next_match.html ================================================ pcre2_next_match specification

pcre2_next_match man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

int pcre2_next_match(pcre2_match_data *match_data, PCRE2_SIZE *pstart_offset, uint32_t *poptions);

DESCRIPTION

This function can be called after one of the match functions (pcre2_match(), pcre2_dfa_match(), or pcre2_jit_match()), and must be provided with the same match_data parameter. It outputs the appropriate parameters for searching for the next match in the same subject string, and is suitable for applications providing "global" matching behaviour (for example, replacing all matches in the subject, or splitting the subject on all matches, or simply counting the number of matches).

It returns 0 ("false") if there is no need to make any further match attempts, or 1 ("true") if another match should be attempted.

The *pstart_offset and *poptions are set if the function returns 1. The *pstart_offset should be passed to the next match attempt directly, and the *poptions should be passed to the next match attempt by combining with the application's match options using OR.

There is a complete description of the PCRE2 native API in the pcre2api page and a description of the POSIX API in the pcre2posix page.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_pattern_convert.html ================================================ pcre2_pattern_convert specification

pcre2_pattern_convert man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

int pcre2_pattern_convert(PCRE2_SPTR pattern, PCRE2_SIZE length, uint32_t options, PCRE2_UCHAR **buffer, PCRE2_SIZE *blength, pcre2_convert_context *cvcontext);

DESCRIPTION

This function is part of an experimental set of pattern conversion functions. It converts a foreign pattern (for example, a glob) into a PCRE2 regular expression pattern. Its arguments are:

  pattern     The foreign pattern
  length      The length of the input pattern or PCRE2_ZERO_TERMINATED
  options     Option bits
  buffer      Pointer to pointer to output buffer, or NULL
  blength     Pointer to output length field
  cvcontext   Pointer to a convert context or NULL
The length of the converted pattern (excluding the terminating zero) is returned via blength. If buffer is NULL, the function just returns the output length. If buffer points to a NULL pointer, heap memory is obtained for the converted pattern, using the allocator in the context if present (or else malloc()), and the field pointed to by buffer is updated. If buffer points to a non-NULL field, that must point to a buffer whose size is in the variable pointed to by blength. This value is updated.

The option bits are:

  PCRE2_CONVERT_UTF                     Input is UTF
  PCRE2_CONVERT_NO_UTF_CHECK            Do not check UTF validity
  PCRE2_CONVERT_POSIX_BASIC             Convert POSIX basic pattern
  PCRE2_CONVERT_POSIX_EXTENDED          Convert POSIX extended pattern
  PCRE2_CONVERT_GLOB                    ) Convert
  PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR  )   various types
  PCRE2_CONVERT_GLOB_NO_STARSTAR        )     of glob
The return value from pcre2_pattern_convert() is zero on success or a non-zero PCRE2 error code.

The pattern conversion functions are described in the pcre2convert documentation.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_pattern_info.html ================================================ pcre2_pattern_info specification

pcre2_pattern_info man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

int pcre2_pattern_info(const pcre2_code *code, uint32_t what, void *where);

DESCRIPTION

This function returns information about a compiled pattern. Its arguments are:

  code     Pointer to a compiled regular expression pattern
  what     What information is required
  where    Where to put the information
The recognized values for the what argument, and the information they request are as follows:
  PCRE2_INFO_ALLOPTIONS      Final options after compiling
  PCRE2_INFO_ARGOPTIONS      Options passed to pcre2_compile()
  PCRE2_INFO_BACKREFMAX      Number of highest backreference
  PCRE2_INFO_BSR             What \R matches:
                               PCRE2_BSR_UNICODE: Unicode line endings
                               PCRE2_BSR_ANYCRLF: CR, LF, or CRLF only
  PCRE2_INFO_CAPTURECOUNT    Number of capturing subpatterns
  PCRE2_INFO_DEPTHLIMIT      Backtracking depth limit if set, otherwise PCRE2_ERROR_UNSET
  PCRE2_INFO_EXTRAOPTIONS    Extra options that were passed in the
                               compile context
  PCRE2_INFO_FIRSTBITMAP     Bitmap of first code units, or NULL
  PCRE2_INFO_FIRSTCODETYPE   Type of start-of-match information
                               0 nothing set
                               1 first code unit is set
                               2 start of string or after newline
  PCRE2_INFO_FIRSTCODEUNIT   First code unit when type is 1
  PCRE2_INFO_FRAMESIZE       Size of backtracking frame
  PCRE2_INFO_HASBACKSLASHC   Return 1 if pattern contains \C
  PCRE2_INFO_HASCRORLF       Return 1 if explicit CR or LF matches exist in the pattern
  PCRE2_INFO_HEAPLIMIT       Heap memory limit if set, otherwise PCRE2_ERROR_UNSET
  PCRE2_INFO_JCHANGED        Return 1 if (?J) or (?-J) was used
  PCRE2_INFO_JITSIZE         Size of JIT compiled code, or 0
  PCRE2_INFO_LASTCODETYPE    Type of must-be-present information
                               0 nothing set
                               1 code unit is set
  PCRE2_INFO_LASTCODEUNIT    Last code unit when type is 1
  PCRE2_INFO_MATCHEMPTY      1 if the pattern can match an empty string, 0 otherwise
  PCRE2_INFO_MATCHLIMIT      Match limit if set, otherwise PCRE2_ERROR_UNSET
  PCRE2_INFO_MAXLOOKBEHIND   Length (in characters) of the longest lookbehind assertion
  PCRE2_INFO_MINLENGTH       Lower bound length of matching strings
  PCRE2_INFO_NAMECOUNT       Number of named subpatterns
  PCRE2_INFO_NAMEENTRYSIZE   Size of name table entries
  PCRE2_INFO_NAMETABLE       Pointer to name table
  PCRE2_CONFIG_NEWLINE       Code for the newline sequence:
                               PCRE2_NEWLINE_CR
                               PCRE2_NEWLINE_LF
                               PCRE2_NEWLINE_CRLF
                               PCRE2_NEWLINE_ANY
                               PCRE2_NEWLINE_ANYCRLF
                               PCRE2_NEWLINE_NUL
  PCRE2_INFO_RECURSIONLIMIT  Obsolete synonym for PCRE2_INFO_DEPTHLIMIT
  PCRE2_INFO_SIZE            Size of compiled pattern
If where is NULL, the function returns the amount of memory needed for the requested information, in bytes. Otherwise, the where argument must point to an unsigned 32-bit integer (uint32_t variable), except for the following what values, when it must point to a variable of the type shown:
  PCRE2_INFO_FIRSTBITMAP     const uint8_t *
  PCRE2_INFO_JITSIZE         size_t
  PCRE2_INFO_NAMETABLE       PCRE2_SPTR
  PCRE2_INFO_SIZE            size_t
The yield of the function is zero on success or:
  PCRE2_ERROR_NULL           the argument code is NULL
  PCRE2_ERROR_BADMAGIC       the "magic number" was not found
  PCRE2_ERROR_BADOPTION      the value of what is invalid
  PCRE2_ERROR_BADMODE        the pattern was compiled in the wrong mode
  PCRE2_ERROR_UNSET          the requested information is not set

There is a complete description of the PCRE2 native API in the pcre2api page and a description of the POSIX API in the pcre2posix page.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_serialize_decode.html ================================================ pcre2_serialize_decode specification

pcre2_serialize_decode man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

int32_t pcre2_serialize_decode(pcre2_code **codes, int32_t number_of_codes, const uint8_t *bytes, pcre2_general_context *gcontext);

DESCRIPTION

This function decodes a serialized set of compiled patterns back into a list of individual patterns. This is possible only on a host that is running the same version of PCRE2, with the same code unit width, and the host must also have the same endianness, pointer width and PCRE2_SIZE type. The arguments for pcre2_serialize_decode() are:

  codes            pointer to a vector in which to build the list
  number_of_codes  number of slots in the vector
  bytes            the serialized byte stream
  gcontext         pointer to a general context or NULL
The bytes argument must point to a block of data that was originally created by pcre2_serialize_encode(), though it may have been saved on disc or elsewhere in the meantime. If there are more codes in the serialized data than slots in the list, only those compiled patterns that will fit are decoded. The yield of the function is the number of decoded patterns, or one of the following negative error codes:
  PCRE2_ERROR_BADDATA   number_of_codes is zero or less
  PCRE2_ERROR_BADMAGIC  mismatch of id bytes in bytes
  PCRE2_ERROR_BADMODE   mismatch of variable unit size or PCRE version
  PCRE2_ERROR_NOMEMORY  memory allocation failed
  PCRE2_ERROR_NULL      codes or bytes is NULL
PCRE2_ERROR_BADMAGIC may mean that the data is corrupt, or that it was compiled on a system with different endianness.

There is a complete description of the PCRE2 native API in the pcre2api page and a description of the serialization functions in the pcre2serialize page.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_serialize_encode.html ================================================ pcre2_serialize_encode specification

pcre2_serialize_encode man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

int32_t pcre2_serialize_encode(const pcre2_code **codes, int32_t number_of_codes, uint8_t **serialized_bytes, PCRE2_SIZE *serialized_size, pcre2_general_context *gcontext);

DESCRIPTION

This function encodes a list of compiled patterns into a byte stream that can be saved on disc or elsewhere. Note that this is not an abstract format like Java or .NET. Conversion of the byte stream back into usable compiled patterns can only happen on a host that is running the same version of PCRE2, with the same code unit width, and the host must also have the same endianness, pointer width and PCRE2_SIZE type. The arguments for pcre2_serialize_encode() are:

  codes             pointer to a vector containing the list
  number_of_codes   number of slots in the vector
  serialized_bytes  set to point to the serialized byte stream
  serialized_size   set to the number of bytes in the byte stream
  gcontext          pointer to a general context or NULL
The context argument is used to obtain memory for the byte stream. When the serialized data is no longer needed, it must be freed by calling pcre2_serialize_free(). The yield of the function is the number of serialized patterns, or one of the following negative error codes:
  PCRE2_ERROR_BADDATA      number_of_codes is zero or less
  PCRE2_ERROR_BADMAGIC     mismatch of id bytes in one of the patterns
  PCRE2_ERROR_MEMORY       memory allocation failed
  PCRE2_ERROR_MIXEDTABLES  the patterns do not all use the same tables
  PCRE2_ERROR_NULL         an argument other than gcontext is NULL
PCRE2_ERROR_BADMAGIC means either that a pattern's code has been corrupted, or that a slot in the vector does not point to a compiled pattern.

There is a complete description of the PCRE2 native API in the pcre2api page and a description of the serialization functions in the pcre2serialize page.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_serialize_free.html ================================================ pcre2_serialize_free specification

pcre2_serialize_free man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

void pcre2_serialize_free(uint8_t *bytes);

DESCRIPTION

This function frees the memory that was obtained by pcre2_serialize_encode() to hold a serialized byte stream. The argument must point to such a byte stream or be NULL, in which case the function returns without doing anything.

There is a complete description of the PCRE2 native API in the pcre2api page and a description of the serialization functions in the pcre2serialize page.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_serialize_get_number_of_codes.html ================================================ pcre2_serialize_get_number_of_codes specification

pcre2_serialize_get_number_of_codes man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

int32_t pcre2_serialize_get_number_of_codes(const uint8_t *bytes);

DESCRIPTION

The bytes argument must point to a serialized byte stream that was originally created by pcre2_serialize_encode() (though it may have been saved on disc or elsewhere in the meantime). The function returns the number of serialized patterns in the byte stream, or one of the following negative error codes:

  PCRE2_ERROR_BADMAGIC  mismatch of id bytes in bytes
  PCRE2_ERROR_BADMODE   mismatch of variable unit size or PCRE version
  PCRE2_ERROR_NULL      the argument is NULL
PCRE2_ERROR_BADMAGIC may mean that the data is corrupt, or that it was compiled on a system with different endianness.

There is a complete description of the PCRE2 native API in the pcre2api page and a description of the serialization functions in the pcre2serialize page.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_set_bsr.html ================================================ pcre2_set_bsr specification

pcre2_set_bsr man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

int pcre2_set_bsr(pcre2_compile_context *ccontext, uint32_t value);

DESCRIPTION

This function sets the convention for processing \R within a compile context. The second argument must be one of PCRE2_BSR_ANYCRLF or PCRE2_BSR_UNICODE. The result is zero for success or PCRE2_ERROR_BADDATA if the second argument is invalid.

There is a complete description of the PCRE2 native API in the pcre2api page and a description of the POSIX API in the pcre2posix page.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_set_callout.html ================================================ pcre2_set_callout specification

pcre2_set_callout man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

int pcre2_set_callout(pcre2_match_context *mcontext, int (*callout_function)(pcre2_callout_block *), void *callout_data);

DESCRIPTION

This function sets the callout fields in a match context (the first argument). The second argument specifies a callout function, and the third argument is an opaque data item that is passed to it. The result of this function is always zero.

There is a complete description of the PCRE2 native API in the pcre2api page and a description of the POSIX API in the pcre2posix page.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_set_character_tables.html ================================================ pcre2_set_character_tables specification

pcre2_set_character_tables man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

int pcre2_set_character_tables(pcre2_compile_context *ccontext, const uint8_t *tables);

DESCRIPTION

This function sets a pointer to custom character tables within a compile context. The second argument must point to a set of PCRE2 character tables or be NULL to request the default tables. The result is always zero. Character tables can be created by calling pcre2_maketables() or by running the pcre2_dftables maintenance command in binary mode (see the pcre2build documentation).

There is a complete description of the PCRE2 native API in the pcre2api page and a description of the POSIX API in the pcre2posix page.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_set_compile_extra_options.html ================================================ pcre2_set_compile_extra_options specification

pcre2_set_compile_extra_options man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

int pcre2_set_compile_extra_options(pcre2_compile_context *ccontext, uint32_t extra_options);

DESCRIPTION

This function sets additional option bits for pcre2_compile() that are housed in a compile context. It completely replaces all the bits. The extra options are:

  PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK     Allow \K in lookarounds
  PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES  Allow \x{d800} to \x{dfff} in UTF-8 and UTF-32 modes
  PCRE2_EXTRA_ALT_BSUX                 Extended alternate \u, \U, and \x handling
  PCRE2_EXTRA_ASCII_BSD                \d remains ASCII in UCP mode
  PCRE2_EXTRA_ASCII_BSS                \s remains ASCII in UCP mode
  PCRE2_EXTRA_ASCII_BSW                \w remains ASCII in UCP mode
  PCRE2_EXTRA_ASCII_DIGIT              [:digit:] and [:xdigit:] POSIX classes remain ASCII in UCP mode
  PCRE2_EXTRA_ASCII_POSIX              POSIX classes remain ASCII in UCP mode
  PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL    Treat all invalid escapes as a literal following character
  PCRE2_EXTRA_CASELESS_RESTRICT        Disable mixed ASCII/non-ASCII case folding
  PCRE2_EXTRA_ESCAPED_CR_IS_LF         Interpret \r as \n
  PCRE2_EXTRA_MATCH_LINE               Pattern matches whole lines
  PCRE2_EXTRA_MATCH_WORD               Pattern matches "words"
  PCRE2_EXTRA_NEVER_CALLOUT            Disallow callouts in pattern
  PCRE2_EXTRA_NO_BS0                   Disallow \0 (but not \00 or \000)
  PCRE2_EXTRA_PYTHON_OCTAL             Use Python rules for octal
  PCRE2_EXTRA_TURKISH_CASING           Use Turkish I case folding
There is a complete description of the PCRE2 native API in the pcre2api page and a description of the POSIX API in the pcre2posix page.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_set_compile_recursion_guard.html ================================================ pcre2_set_compile_recursion_guard specification

pcre2_set_compile_recursion_guard man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

int pcre2_set_compile_recursion_guard(pcre2_compile_context *ccontext, int (*guard_function)(uint32_t, void *), void *user_data);

DESCRIPTION

This function defines, within a compile context, a function that is called whenever pcre2_compile() starts to compile a parenthesized part of a pattern. The first argument to the function gives the current depth of parenthesis nesting, and the second is user data that is supplied when the function is set up. The callout function should return zero if all is well, or non-zero to force an error. This feature is provided so that applications can check the available system stack space, in order to avoid running out. The result of pcre2_set_compile_recursion_guard() is always zero.

There is a complete description of the PCRE2 native API in the pcre2api page and a description of the POSIX API in the pcre2posix page.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_set_depth_limit.html ================================================ pcre2_set_depth_limit specification

pcre2_set_depth_limit man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

int pcre2_set_depth_limit(pcre2_match_context *mcontext, uint32_t value);

DESCRIPTION

This function sets the backtracking depth limit field in a match context. The result is always zero.

There is a complete description of the PCRE2 native API in the pcre2api page and a description of the POSIX API in the pcre2posix page.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_set_glob_escape.html ================================================ pcre2_set_glob_escape specification

pcre2_set_glob_escape man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

int pcre2_set_glob_escape(pcre2_convert_context *cvcontext, uint32_t escape_char);

DESCRIPTION

This function is part of an experimental set of pattern conversion functions. It sets the escape character that is used when converting globs. The second argument must either be zero (meaning there is no escape character) or a punctuation character whose code point is less than 256. The default is grave accent if running under Windows, otherwise backslash. The result of the function is zero for success or PCRE2_ERROR_BADDATA if the second argument is invalid.

The pattern conversion functions are described in the pcre2convert documentation.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_set_glob_separator.html ================================================ pcre2_set_glob_separator specification

pcre2_set_glob_separator man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

int pcre2_set_glob_separator(pcre2_convert_context *cvcontext, uint32_t separator_char);

DESCRIPTION

This function is part of an experimental set of pattern conversion functions. It sets the component separator character that is used when converting globs. The second argument must be one of the characters forward slash, backslash, or dot. The default is backslash when running under Windows, otherwise forward slash. The result of the function is zero for success or PCRE2_ERROR_BADDATA if the second argument is invalid.

The pattern conversion functions are described in the pcre2convert documentation.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_set_heap_limit.html ================================================ pcre2_set_heap_limit specification

pcre2_set_heap_limit man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

int pcre2_set_heap_limit(pcre2_match_context *mcontext, uint32_t value);

DESCRIPTION

This function sets the backtracking heap limit field in a match context. The result is always zero.

There is a complete description of the PCRE2 native API in the pcre2api page and a description of the POSIX API in the pcre2posix page.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_set_match_limit.html ================================================ pcre2_set_match_limit specification

pcre2_set_match_limit man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

int pcre2_set_match_limit(pcre2_match_context *mcontext, uint32_t value);

DESCRIPTION

This function sets the match limit field in a match context. The result is always zero.

There is a complete description of the PCRE2 native API in the pcre2api page and a description of the POSIX API in the pcre2posix page.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_set_max_pattern_compiled_length.html ================================================ pcre2_set_max_pattern_compiled_length specification

pcre2_set_max_pattern_compiled_length man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

int pcre2_set_max_pattern_compiled_length( pcre2_compile_context *ccontext, PCRE2_SIZE value);

DESCRIPTION

This function sets, in a compile context, the maximum size (in bytes) for the memory needed to hold the compiled version of a pattern that is using this context. The result is always zero. If a pattern that is passed to pcre2_compile() referencing this context needs more memory, an error is generated. The default is the largest number that a PCRE2_SIZE variable can hold, which is effectively unlimited.

There is a complete description of the PCRE2 native API in the pcre2api page and a description of the POSIX API in the pcre2posix page.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_set_max_pattern_length.html ================================================ pcre2_set_max_pattern_length specification

pcre2_set_max_pattern_length man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

int pcre2_set_max_pattern_length(pcre2_compile_context *ccontext, PCRE2_SIZE value);

DESCRIPTION

This function sets, in a compile context, the maximum text length (in code units) of the pattern that can be compiled. The result is always zero. If a longer pattern is passed to pcre2_compile() there is an immediate error return. The default is effectively unlimited, being the largest value a PCRE2_SIZE variable can hold.

There is a complete description of the PCRE2 native API in the pcre2api page and a description of the POSIX API in the pcre2posix page.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_set_max_varlookbehind.html ================================================ pcre2_set_max_varlookbehind specification

pcre2_set_max_varlookbehind man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

int pcre2_set_max_varlookbehind(pcre2_compile_context *ccontext, uint32_t value);

DESCRIPTION

This sets a maximum length for the number of characters matched by a variable-length lookbehind assertion. The default is set when PCRE2 is built, with the ultimate default being 255, the same as Perl. Lookbehind assertions without a bounding length are not supported. The result is always zero.

There is a complete description of the PCRE2 native API in the pcre2api page and a description of the POSIX API in the pcre2posix page.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_set_newline.html ================================================ pcre2_set_newline specification

pcre2_set_newline man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

int pcre2_set_newline(pcre2_compile_context *ccontext, uint32_t value);

DESCRIPTION

This function sets the newline convention within a compile context. This specifies which character(s) are recognized as newlines when compiling and matching patterns. The second argument must be one of:

  PCRE2_NEWLINE_CR        Carriage return only
  PCRE2_NEWLINE_LF        Linefeed only
  PCRE2_NEWLINE_CRLF      CR followed by LF only
  PCRE2_NEWLINE_ANYCRLF   Any of the above
  PCRE2_NEWLINE_ANY       Any Unicode newline sequence
  PCRE2_NEWLINE_NUL       The NUL character (binary zero)
The result is zero for success or PCRE2_ERROR_BADDATA if the second argument is invalid.

There is a complete description of the PCRE2 native API in the pcre2api page and a description of the POSIX API in the pcre2posix page.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_set_offset_limit.html ================================================ pcre2_set_offset_limit specification

pcre2_set_offset_limit man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

int pcre2_set_offset_limit(pcre2_match_context *mcontext, PCRE2_SIZE value);

DESCRIPTION

This function sets the offset limit field in a match context. The result is always zero.

There is a complete description of the PCRE2 native API in the pcre2api page and a description of the POSIX API in the pcre2posix page.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_set_optimize.html ================================================ pcre2_set_optimize specification

pcre2_set_optimize man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

int pcre2_set_optimize(pcre2_compile_context *ccontext, uint32_t directive);

DESCRIPTION

This function controls which performance optimizations will be applied by pcre2_compile(). It can be called multiple times with the same compile context; the effects are cumulative, with the effects of later calls taking precedence over earlier ones.

The result is zero for success, PCRE2_ERROR_NULL if ccontext is NULL, or PCRE2_ERROR_BADOPTION if directive is unknown. The latter could be useful to detect if a certain optimization is available.

The possible values for the directive parameter are:

  PCRE2_OPTIMIZATION_FULL   Enable all optimizations (default)
  PCRE2_OPTIMIZATION_NONE   Disable all optimizations
  PCRE2_AUTO_POSSESS        Enable auto-possessification
  PCRE2_AUTO_POSSESS_OFF    Disable auto-possessification
  PCRE2_DOTSTAR_ANCHOR      Enable implicit dotstar anchoring
  PCRE2_DOTSTAR_ANCHOR_OFF  Disable implicit dotstar anchoring
  PCRE2_START_OPTIMIZE      Enable start-up optimizations at match time
  PCRE2_START_OPTIMIZE_OFF  Disable start-up optimizations at match time
There is a complete description of the PCRE2 native API, including detailed descriptions of the directive parameter values, in the pcre2api page.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_set_parens_nest_limit.html ================================================ pcre2_set_parens_nest_limit specification

pcre2_set_parens_nest_limit man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

int pcre2_set_parens_nest_limit(pcre2_compile_context *ccontext, uint32_t value);

DESCRIPTION

This function sets, in a compile context, the maximum depth of nested parentheses in a pattern. The result is always zero.

There is a complete description of the PCRE2 native API in the pcre2api page and a description of the POSIX API in the pcre2posix page.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_set_recursion_limit.html ================================================ pcre2_set_recursion_limit specification

pcre2_set_recursion_limit man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

int pcre2_set_recursion_limit(pcre2_match_context *mcontext, uint32_t value);

DESCRIPTION

This function is obsolete and should not be used in new code. Use pcre2_set_depth_limit() instead.

There is a complete description of the PCRE2 native API in the pcre2api page and a description of the POSIX API in the pcre2posix page.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_set_recursion_memory_management.html ================================================ pcre2_set_recursion_memory_management specification

pcre2_set_recursion_memory_management man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

int pcre2_set_recursion_memory_management( pcre2_match_context *mcontext, void *(*private_malloc)(size_t, void *), void (*private_free)(void *, void *), void *memory_data);

DESCRIPTION

From release 10.30 onwards, this function is obsolete and does nothing. The result is always zero.

There is a complete description of the PCRE2 native API in the pcre2api page and a description of the POSIX API in the pcre2posix page.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_set_substitute_callout.html ================================================ pcre2_set_substitute_callout specification

pcre2_set_substitute_callout man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

int pcre2_set_substitute_callout(pcre2_match_context *mcontext, int (*callout_function)(pcre2_substitute_callout_block *, void *), void *callout_data);

DESCRIPTION

This function sets the substitute callout fields in a match context (the first argument). The second argument specifies a callout function, and the third argument is an opaque data item that is passed to it. The result of this function is always zero.

There is a complete description of the PCRE2 native API in the pcre2api page and a description of the POSIX API in the pcre2posix page.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_set_substitute_case_callout.html ================================================ pcre2_set_substitute_case_callout specification

pcre2_set_substitute_case_callout man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

int pcre2_set_substitute_case_callout(pcre2_match_context *mcontext, PCRE2_SIZE (*callout_function)(PCRE2_SPTR, PCRE2_SIZE, PCRE2_UCHAR *, PCRE2_SIZE, int, void *), void *callout_data);

DESCRIPTION

This function sets the substitute case callout fields in a match context (the first argument). The second argument specifies a callout function, and the third argument is an opaque data item that is passed to it. The result of this function is always zero.

There is a complete description of the PCRE2 native API in the pcre2api page and a description of the POSIX API in the pcre2posix page.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_substitute.html ================================================ pcre2_substitute specification

pcre2_substitute man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

int pcre2_substitute(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length, PCRE2_SIZE startoffset, uint32_t options, pcre2_match_data *match_data, pcre2_match_context *mcontext, PCRE2_SPTR replacement, PCRE2_SIZE rlength, PCRE2_UCHAR *outputbuffer, PCRE2_SIZE *outlengthptr);

DESCRIPTION

This function matches a compiled regular expression against a given subject string, using a matching algorithm that is similar to Perl's. It then makes a copy of the subject, substituting a replacement string for what was matched. Its arguments are:

  code          Points to the compiled pattern
  subject       Points to the subject string
  length        Length of the subject string
  startoffset   Offset in the subject at which to start matching
  options       Option bits
  match_data    Points to a match data block, or is NULL
  mcontext      Points to a match context, or is NULL
  replacement   Points to the replacement string
  rlength       Length of the replacement string
  outputbuffer  Points to the output buffer
  outlengthptr  Points to the length of the output buffer
A match data block is needed only if you want to inspect the data from the final match that is returned in that block or if PCRE2_SUBSTITUTE_MATCHED is set. A match context is needed only if you want to:
  Set up a callout function
  Set a matching offset limit
  Change the backtracking match limit
  Change the backtracking depth limit
  Set custom memory management in the match context
The length, startoffset and rlength values are code units, not characters, as are the contents of the variable pointed at by outlengthptr. This variable must contain the length of the output buffer when the function is called. If the function is successful, the value is changed to the length of the new string, excluding the trailing zero that is automatically added.

The subject and replacement lengths can be given as PCRE2_ZERO_TERMINATED for zero-terminated strings. The options are:

  PCRE2_ANCHORED                     Match only at the first position
  PCRE2_ENDANCHORED                  Match only at end of subject
  PCRE2_NOTBOL                       Subject is not the beginning of a line
  PCRE2_NOTEOL                       Subject is not the end of a line
  PCRE2_NOTEMPTY                     An empty string is not a valid match
  PCRE2_NOTEMPTY_ATSTART             An empty string at the start of the subject is not a valid match
  PCRE2_NO_JIT                       Do not use JIT matching
  PCRE2_NO_UTF_CHECK                 Do not check for UTF validity in the subject or replacement
                                      (only relevant if PCRE2_UTF was set at compile time)
  PCRE2_SUBSTITUTE_EXTENDED          Do extended replacement processing
  PCRE2_SUBSTITUTE_GLOBAL            Replace all occurrences in the subject
  PCRE2_SUBSTITUTE_LITERAL           The replacement string is literal
  PCRE2_SUBSTITUTE_MATCHED           Use pre-existing match data for first match
  PCRE2_SUBSTITUTE_OVERFLOW_LENGTH   If overflow, compute needed length
  PCRE2_SUBSTITUTE_REPLACEMENT_ONLY  Return only replacement string(s)
  PCRE2_SUBSTITUTE_UNKNOWN_UNSET     Treat unknown group as unset
  PCRE2_SUBSTITUTE_UNSET_EMPTY       Simple unset insert = empty string
If PCRE2_SUBSTITUTE_LITERAL is set, PCRE2_SUBSTITUTE_EXTENDED, PCRE2_SUBSTITUTE_UNKNOWN_UNSET, and PCRE2_SUBSTITUTE_UNSET_EMPTY are ignored.

If PCRE2_SUBSTITUTE_MATCHED is set, match_data must be non-NULL; its contents must be the result of a call to pcre2_match() (or pcre2_jit_match()) using the same pattern, subject pointer, effective subject length, start offset, and match options.

The function returns the number of substitutions, which may be zero if there are no matches. The result may be greater than one only when PCRE2_SUBSTITUTE_GLOBAL is set. In the event of an error, a negative error code is returned.

There is a complete description of the PCRE2 native API in the pcre2api page and a description of the POSIX API in the pcre2posix page.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_substring_copy_byname.html ================================================ pcre2_substring_copy_byname specification

pcre2_substring_copy_byname man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

int pcre2_substring_copy_byname(pcre2_match_data *match_data, PCRE2_SPTR name, PCRE2_UCHAR *buffer, PCRE2_SIZE *bufflen);

DESCRIPTION

This is a convenience function for extracting a captured substring, identified by name, into a given buffer. The arguments are:

  match_data    The match data block for the match
  name          Name of the required substring
  buffer        Buffer to receive the string
  bufflen       Length of buffer (code units)
The bufflen variable is updated to contain the length of the extracted string, excluding the trailing zero. The yield of the function is zero for success or one of the following error numbers:
  PCRE2_ERROR_NOSUBSTRING   there are no groups of that name
  PCRE2_ERROR_UNAVAILABLE   the ovector was too small for that group
  PCRE2_ERROR_UNSET         the group did not participate in the match
  PCRE2_ERROR_NOMEMORY      the buffer is not big enough
If there is more than one group with the given name, the first one that is set is returned. In this situation PCRE2_ERROR_UNSET means that no group with the given name was set.

There is a complete description of the PCRE2 native API in the pcre2api page and a description of the POSIX API in the pcre2posix page.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_substring_copy_bynumber.html ================================================ pcre2_substring_copy_bynumber specification

pcre2_substring_copy_bynumber man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

int pcre2_substring_copy_bynumber(pcre2_match_data *match_data, uint32_t number, PCRE2_UCHAR *buffer, PCRE2_SIZE *bufflen);

DESCRIPTION

This is a convenience function for extracting a captured substring into a given buffer. The arguments are:

  match_data    The match data block for the match
  number        Number of the required substring
  buffer        Buffer to receive the string
  bufflen       Length of buffer
The bufflen variable is updated with the length of the extracted string, excluding the terminating zero. The yield of the function is zero for success or one of the following error numbers:
  PCRE2_ERROR_NOSUBSTRING   there are no groups of that number
  PCRE2_ERROR_UNAVAILABLE   the ovector was too small for that group
  PCRE2_ERROR_UNSET         the group did not participate in the match
  PCRE2_ERROR_NOMEMORY      the buffer is too small

There is a complete description of the PCRE2 native API in the pcre2api page and a description of the POSIX API in the pcre2posix page.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_substring_free.html ================================================ pcre2_substring_free specification

pcre2_substring_free man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

void pcre2_substring_free(PCRE2_UCHAR *buffer);

DESCRIPTION

This is a convenience function for freeing the memory obtained by a previous call to pcre2_substring_get_byname() or pcre2_substring_get_bynumber(). Its only argument is a pointer to the string. If the argument is NULL, the function does nothing.

There is a complete description of the PCRE2 native API in the pcre2api page and a description of the POSIX API in the pcre2posix page.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_substring_get_byname.html ================================================ pcre2_substring_get_byname specification

pcre2_substring_get_byname man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

int pcre2_substring_get_byname(pcre2_match_data *match_data, PCRE2_SPTR name, PCRE2_UCHAR **bufferptr, PCRE2_SIZE *bufflen);

DESCRIPTION

This is a convenience function for extracting a captured substring by name into newly acquired memory. The arguments are:

  match_data    The match data for the match
  name          Name of the required substring
  bufferptr     Where to put the string pointer
  bufflen       Where to put the string length
The memory in which the substring is placed is obtained by calling the same memory allocation function that was used for the match data block. The convenience function pcre2_substring_free() can be used to free it when it is no longer needed. The yield of the function is zero for success or one of the following error numbers:
  PCRE2_ERROR_NOSUBSTRING   there are no groups of that name
  PCRE2_ERROR_UNAVAILABLE   the ovector was too small for that group
  PCRE2_ERROR_UNSET         the group did not participate in the match
  PCRE2_ERROR_NOMEMORY      memory could not be obtained
If there is more than one group with the given name, the first one that is set is returned. In this situation PCRE2_ERROR_UNSET means that no group with the given name was set.

There is a complete description of the PCRE2 native API in the pcre2api page and a description of the POSIX API in the pcre2posix page.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_substring_get_bynumber.html ================================================ pcre2_substring_get_bynumber specification

pcre2_substring_get_bynumber man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

int pcre2_substring_get_bynumber(pcre2_match_data *match_data, uint32_t number, PCRE2_UCHAR **bufferptr, PCRE2_SIZE *bufflen);

DESCRIPTION

This is a convenience function for extracting a captured substring by number into newly acquired memory. The arguments are:

  match_data    The match data for the match
  number        Number of the required substring
  bufferptr     Where to put the string pointer
  bufflen       Where to put the string length
The memory in which the substring is placed is obtained by calling the same memory allocation function that was used for the match data block. The convenience function pcre2_substring_free() can be used to free it when it is no longer needed. The yield of the function is zero for success or one of the following error numbers:
  PCRE2_ERROR_NOSUBSTRING   there are no groups of that number
  PCRE2_ERROR_UNAVAILABLE   the ovector was too small for that group
  PCRE2_ERROR_UNSET         the group did not participate in the match
  PCRE2_ERROR_NOMEMORY      memory could not be obtained

There is a complete description of the PCRE2 native API in the pcre2api page and a description of the POSIX API in the pcre2posix page.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_substring_length_byname.html ================================================ pcre2_substring_length_byname specification

pcre2_substring_length_byname man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

int pcre2_substring_length_byname(pcre2_match_data *match_data, PCRE2_SPTR name, PCRE2_SIZE *length);

DESCRIPTION

This function returns the length of a matched substring, identified by name. The arguments are:

  match_data   The match data block for the match
  name         The substring name
  length       Where to return the length, or NULL
The third argument may be NULL if all you want to know is whether or not a substring is set. The yield is zero on success, or a negative error code otherwise.

There is a complete description of the PCRE2 native API in the pcre2api page and a description of the POSIX API in the pcre2posix page.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_substring_length_bynumber.html ================================================ pcre2_substring_length_bynumber specification

pcre2_substring_length_bynumber man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

int pcre2_substring_length_bynumber(pcre2_match_data *match_data, uint32_t number, PCRE2_SIZE *length);

DESCRIPTION

This function returns the length of a matched substring, identified by number. The arguments are:

  match_data   The match data block for the match
  number       The substring number
  length       Where to return the length, or NULL
The third argument may be NULL if all you want to know is whether or not a substring is set. The yield is zero on success, or a negative error code otherwise. After a partial match, only substring 0 is available.

There is a complete description of the PCRE2 native API in the pcre2api page and a description of the POSIX API in the pcre2posix page.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_substring_list_free.html ================================================ pcre2_substring_list_free specification

pcre2_substring_list_free man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

void pcre2_substring_list_free(PCRE2_UCHAR **list);

DESCRIPTION

This is a convenience function for freeing the store obtained by a previous call to pcre2_substring_list_get(). Its only argument is a pointer to the list of string pointers. If the argument is NULL, the function returns immediately, without doing anything.

There is a complete description of the PCRE2 native API in the pcre2api page and a description of the POSIX API in the pcre2posix page.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_substring_list_get.html ================================================ pcre2_substring_list_get specification

pcre2_substring_list_get man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

int pcre2_substring_list_get(pcre2_match_data *match_data, PCRE2_UCHAR ***listptr, PCRE2_SIZE **lengthsptr);

DESCRIPTION

This is a convenience function for extracting all the captured substrings after a pattern match. It builds a list of pointers to the strings, and (optionally) a second list that contains their lengths (in code units), excluding a terminating zero that is added to each of them. All this is done in a single block of memory that is obtained using the same memory allocation function that was used to get the match data block. The convenience function pcre2_substring_list_free() can be used to free it when it is no longer needed. The arguments are:

  match_data    The match data block
  listptr       Where to put a pointer to the list
  lengthsptr    Where to put a pointer to the lengths, or NULL
A pointer to a list of pointers is put in the variable whose address is in listptr. The list is terminated by a NULL pointer. If lengthsptr is not NULL, a matching list of lengths is created, and its address is placed in lengthsptr. The yield of the function is zero on success or PCRE2_ERROR_NOMEMORY if sufficient memory could not be obtained.

There is a complete description of the PCRE2 native API in the pcre2api page and a description of the POSIX API in the pcre2posix page.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_substring_nametable_scan.html ================================================ pcre2_substring_nametable_scan specification

pcre2_substring_nametable_scan man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

int pcre2_substring_nametable_scan(const pcre2_code *code, PCRE2_SPTR name, PCRE2_SPTR *first, PCRE2_SPTR *last);

DESCRIPTION

This convenience function finds, for a compiled pattern, the first and last entries for a given name in the table that translates capture group names into numbers.

  code    Compiled regular expression
  name    Name whose entries required
  first   Where to return a pointer to the first entry
  last    Where to return a pointer to the last entry
When the name is found in the table, if first is NULL, the function returns a group number, but if there is more than one matching entry, it is not defined which one. Otherwise, when both pointers have been set, the yield of the function is the length of each entry in code units. If the name is not found, PCRE2_ERROR_NOSUBSTRING is returned.

There is a complete description of the PCRE2 native API, including the format of the table entries, in the pcre2api page, and a description of the POSIX API in the pcre2posix page.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2_substring_number_from_name.html ================================================ pcre2_substring_number_from_name specification

pcre2_substring_number_from_name man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

int pcre2_substring_number_from_name(const pcre2_code *code, PCRE2_SPTR name);

DESCRIPTION

This convenience function finds the number of a named substring capturing parenthesis in a compiled pattern, provided that it is a unique name. The function arguments are:

  code    Compiled regular expression
  name    Name whose number is required
The yield of the function is the number of the parenthesis if the name is found, or PCRE2_ERROR_NOSUBSTRING if it is not found. When duplicate names are allowed (PCRE2_DUPNAMES is set), if the name is not unique, PCRE2_ERROR_NOUNIQUESUBSTRING is returned. You can obtain the list of numbers with the same name by calling pcre2_substring_nametable_scan().

There is a complete description of the PCRE2 native API in the pcre2api page and a description of the POSIX API in the pcre2posix page.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2api.html ================================================ pcre2api specification

pcre2api man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

#include <pcre2.h>

PCRE2 is a new API for PCRE, starting at release 10.0. This document contains a description of all its native functions. See the pcre2 document for an overview of all the PCRE2 documentation.

PCRE2 NATIVE API BASIC FUNCTIONS

pcre2_code *pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE length, uint32_t options, int *errorcode, PCRE2_SIZE *erroroffset, pcre2_compile_context *ccontext);

void pcre2_code_free(pcre2_code *code);

pcre2_match_data *pcre2_match_data_create(uint32_t ovecsize, pcre2_general_context *gcontext);

pcre2_match_data *pcre2_match_data_create_from_pattern( const pcre2_code *code, pcre2_general_context *gcontext);

int pcre2_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length, PCRE2_SIZE startoffset, uint32_t options, pcre2_match_data *match_data, pcre2_match_context *mcontext);

int pcre2_dfa_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length, PCRE2_SIZE startoffset, uint32_t options, pcre2_match_data *match_data, pcre2_match_context *mcontext, int *workspace, PCRE2_SIZE wscount);

void pcre2_match_data_free(pcre2_match_data *match_data);

PCRE2 NATIVE API AUXILIARY MATCH FUNCTIONS

PCRE2_SPTR pcre2_get_mark(pcre2_match_data *match_data);

PCRE2_SIZE pcre2_get_match_data_size(pcre2_match_data *match_data);

PCRE2_SIZE pcre2_get_match_data_heapframes_size( pcre2_match_data *match_data);

uint32_t pcre2_get_ovector_count(pcre2_match_data *match_data);

PCRE2_SIZE *pcre2_get_ovector_pointer(pcre2_match_data *match_data);

PCRE2_SIZE pcre2_get_startchar(pcre2_match_data *match_data);

PCRE2 NATIVE API GENERAL CONTEXT FUNCTIONS

pcre2_general_context *pcre2_general_context_create( void *(*private_malloc)(PCRE2_SIZE, void *), void (*private_free)(void *, void *), void *memory_data);

pcre2_general_context *pcre2_general_context_copy( pcre2_general_context *gcontext);

void pcre2_general_context_free(pcre2_general_context *gcontext);

PCRE2 NATIVE API COMPILE CONTEXT FUNCTIONS

pcre2_compile_context *pcre2_compile_context_create( pcre2_general_context *gcontext);

pcre2_compile_context *pcre2_compile_context_copy( pcre2_compile_context *ccontext);

void pcre2_compile_context_free(pcre2_compile_context *ccontext);

int pcre2_set_bsr(pcre2_compile_context *ccontext, uint32_t value);

int pcre2_set_character_tables(pcre2_compile_context *ccontext, const uint8_t *tables);

int pcre2_set_compile_extra_options(pcre2_compile_context *ccontext, uint32_t extra_options);

int pcre2_set_max_pattern_length(pcre2_compile_context *ccontext, PCRE2_SIZE value);

int pcre2_set_max_pattern_compiled_length( pcre2_compile_context *ccontext, PCRE2_SIZE value);

int pcre2_set_max_varlookbehind(pcre2_compile_context *ccontext, uint32_t value);

int pcre2_set_newline(pcre2_compile_context *ccontext, uint32_t value);

int pcre2_set_parens_nest_limit(pcre2_compile_context *ccontext, uint32_t value);

int pcre2_set_compile_recursion_guard(pcre2_compile_context *ccontext, int (*guard_function)(uint32_t, void *), void *user_data);

int pcre2_set_optimize(pcre2_compile_context *ccontext, uint32_t directive);

PCRE2 NATIVE API MATCH CONTEXT FUNCTIONS

pcre2_match_context *pcre2_match_context_create( pcre2_general_context *gcontext);

pcre2_match_context *pcre2_match_context_copy( pcre2_match_context *mcontext);

void pcre2_match_context_free(pcre2_match_context *mcontext);

int pcre2_set_callout(pcre2_match_context *mcontext, int (*callout_function)(pcre2_callout_block *, void *), void *callout_data);

int pcre2_set_substitute_callout(pcre2_match_context *mcontext, int (*callout_function)(pcre2_substitute_callout_block *, void *), void *callout_data);

int pcre2_set_substitute_case_callout(pcre2_match_context *mcontext, PCRE2_SIZE (*callout_function)(PCRE2_SPTR, PCRE2_SIZE, PCRE2_UCHAR *, PCRE2_SIZE, int, void *), void *callout_data);

int pcre2_set_offset_limit(pcre2_match_context *mcontext, PCRE2_SIZE value);

int pcre2_set_heap_limit(pcre2_match_context *mcontext, uint32_t value);

int pcre2_set_match_limit(pcre2_match_context *mcontext, uint32_t value);

int pcre2_set_depth_limit(pcre2_match_context *mcontext, uint32_t value);

PCRE2 NATIVE API STRING EXTRACTION FUNCTIONS

int pcre2_substring_copy_byname(pcre2_match_data *match_data, PCRE2_SPTR name, PCRE2_UCHAR *buffer, PCRE2_SIZE *bufflen);

int pcre2_substring_copy_bynumber(pcre2_match_data *match_data, uint32_t number, PCRE2_UCHAR *buffer, PCRE2_SIZE *bufflen);

void pcre2_substring_free(PCRE2_UCHAR *buffer);

int pcre2_substring_get_byname(pcre2_match_data *match_data, PCRE2_SPTR name, PCRE2_UCHAR **bufferptr, PCRE2_SIZE *bufflen);

int pcre2_substring_get_bynumber(pcre2_match_data *match_data, uint32_t number, PCRE2_UCHAR **bufferptr, PCRE2_SIZE *bufflen);

int pcre2_substring_length_byname(pcre2_match_data *match_data, PCRE2_SPTR name, PCRE2_SIZE *length);

int pcre2_substring_length_bynumber(pcre2_match_data *match_data, uint32_t number, PCRE2_SIZE *length);

int pcre2_substring_nametable_scan(const pcre2_code *code, PCRE2_SPTR name, PCRE2_SPTR *first, PCRE2_SPTR *last);

int pcre2_substring_number_from_name(const pcre2_code *code, PCRE2_SPTR name);

void pcre2_substring_list_free(PCRE2_UCHAR **list);

int pcre2_substring_list_get(pcre2_match_data *match_data, PCRE2_UCHAR ***listptr, PCRE2_SIZE **lengthsptr);

PCRE2 NATIVE API STRING SUBSTITUTION FUNCTION

int pcre2_substitute(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length, PCRE2_SIZE startoffset, uint32_t options, pcre2_match_data *match_data, pcre2_match_context *mcontext, PCRE2_SPTR replacement, PCRE2_SIZE rlength, PCRE2_UCHAR *outputbuffer, PCRE2_SIZE *outlengthptr);

PCRE2 NATIVE API JIT FUNCTIONS

int pcre2_jit_compile(pcre2_code *code, uint32_t options);

int pcre2_jit_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length, PCRE2_SIZE startoffset, uint32_t options, pcre2_match_data *match_data, pcre2_match_context *mcontext);

void pcre2_jit_free_unused_memory(pcre2_general_context *gcontext);

pcre2_jit_stack *pcre2_jit_stack_create(size_t startsize, size_t maxsize, pcre2_general_context *gcontext);

void pcre2_jit_stack_assign(pcre2_match_context *mcontext, pcre2_jit_callback callback_function, void *callback_data);

void pcre2_jit_stack_free(pcre2_jit_stack *jit_stack);

PCRE2 NATIVE API SERIALIZATION FUNCTIONS

int32_t pcre2_serialize_decode(pcre2_code **codes, int32_t number_of_codes, const uint8_t *bytes, pcre2_general_context *gcontext);

int32_t pcre2_serialize_encode(const pcre2_code **codes, int32_t number_of_codes, uint8_t **serialized_bytes, PCRE2_SIZE *serialized_size, pcre2_general_context *gcontext);

void pcre2_serialize_free(uint8_t *bytes);

int32_t pcre2_serialize_get_number_of_codes(const uint8_t *bytes);

PCRE2 NATIVE API AUXILIARY FUNCTIONS

pcre2_code *pcre2_code_copy(const pcre2_code *code);

pcre2_code *pcre2_code_copy_with_tables(const pcre2_code *code);

int pcre2_get_error_message(int errorcode, PCRE2_UCHAR *buffer, PCRE2_SIZE bufflen);

const uint8_t *pcre2_maketables(pcre2_general_context *gcontext);

void pcre2_maketables_free(pcre2_general_context *gcontext, const uint8_t *tables);

int pcre2_pattern_info(const pcre2_code *code, uint32_t what, void *where);

int pcre2_callout_enumerate(const pcre2_code *code, int (*callback)(pcre2_callout_enumerate_block *, void *), void *user_data);

int pcre2_config(uint32_t what, void *where);

PCRE2 NATIVE API OBSOLETE FUNCTIONS

int pcre2_set_recursion_limit(pcre2_match_context *mcontext, uint32_t value);

int pcre2_set_recursion_memory_management( pcre2_match_context *mcontext, void *(*private_malloc)(size_t, void *), void (*private_free)(void *, void *), void *memory_data);

These functions became obsolete at release 10.30 and are retained only for backward compatibility. They should not be used in new code. The first is replaced by pcre2_set_depth_limit(); the second is no longer needed and has no effect (it always returns zero).

PCRE2 EXPERIMENTAL PATTERN CONVERSION FUNCTIONS

pcre2_convert_context *pcre2_convert_context_create( pcre2_general_context *gcontext);

pcre2_convert_context *pcre2_convert_context_copy( pcre2_convert_context *cvcontext);

void pcre2_convert_context_free(pcre2_convert_context *cvcontext);

int pcre2_set_glob_escape(pcre2_convert_context *cvcontext, uint32_t escape_char);

int pcre2_set_glob_separator(pcre2_convert_context *cvcontext, uint32_t separator_char);

int pcre2_pattern_convert(PCRE2_SPTR pattern, PCRE2_SIZE length, uint32_t options, PCRE2_UCHAR **buffer, PCRE2_SIZE *blength, pcre2_convert_context *cvcontext);

void pcre2_converted_pattern_free(PCRE2_UCHAR *converted_pattern);

These functions provide a way of converting non-PCRE2 patterns into patterns that can be processed by pcre2_compile(). This facility is experimental and may be changed in future releases. At present, "globs" and POSIX basic and extended patterns can be converted. Details are given in the pcre2convert documentation.

PCRE2 8-BIT, 16-BIT, AND 32-BIT LIBRARIES

There are three PCRE2 libraries, supporting 8-bit, 16-bit, and 32-bit code units, respectively. However, there is just one header file, pcre2.h. This contains the function prototypes and other definitions for all three libraries. One, two, or all three can be installed simultaneously. On Unix-like systems the libraries are called libpcre2-8, libpcre2-16, and libpcre2-32, and they can also co-exist with the original PCRE libraries. Every PCRE2 function comes in three different forms, one for each library, for example:

  pcre2_compile_8()
  pcre2_compile_16()
  pcre2_compile_32()
There are also three different sets of data types:
  PCRE2_UCHAR8, PCRE2_UCHAR16, PCRE2_UCHAR32
  PCRE2_SPTR8,  PCRE2_SPTR16,  PCRE2_SPTR32
The UCHAR types define unsigned code units of the appropriate widths. For example, PCRE2_UCHAR16 is usually defined as `uint16_t'. The SPTR types are pointers to constants of the equivalent UCHAR types, that is, they are pointers to vectors of unsigned code units.

Character strings are passed to a PCRE2 library as sequences of unsigned integers in code units of the appropriate width. The length of a string may be given as a number of code units, or the string may be specified as zero-terminated.

Many applications use only one code unit width. For their convenience, macros are defined whose names are the generic forms such as pcre2_compile() and PCRE2_SPTR. These macros use the value of the macro PCRE2_CODE_UNIT_WIDTH to generate the appropriate width-specific function and macro names. PCRE2_CODE_UNIT_WIDTH is not defined by default. An application must define it to be 8, 16, or 32 before including pcre2.h in order to make use of the generic names.

Applications that use more than one code unit width can be linked with more than one PCRE2 library, but must define PCRE2_CODE_UNIT_WIDTH to be 0 before including pcre2.h, and then use the real function names. Any code that is to be included in an environment where the value of PCRE2_CODE_UNIT_WIDTH is unknown should also use the real function names. (Unfortunately, it is not possible in C code to save and restore the value of a macro.)

If PCRE2_CODE_UNIT_WIDTH is not defined before including pcre2.h, a compiler error occurs.

When using multiple libraries in an application, you must take care when processing any particular pattern to use only functions from a single library. For example, if you want to run a match using a pattern that was compiled with pcre2_compile_16(), you must do so with pcre2_match_16(), not pcre2_match_8() or pcre2_match_32().

In the function summaries above, and in the rest of this document and other PCRE2 documents, functions and data types are described using their generic names, without the _8, _16, or _32 suffix.

PCRE2 API OVERVIEW

PCRE2 has its own native API, which is described in this document. There are also some wrapper functions for the 8-bit library that correspond to the POSIX regular expression API, but they do not give access to all the functionality of PCRE2 and they are not thread-safe. They are described in the pcre2posix documentation. Both these APIs define a set of C function calls.

The native API C data types, function prototypes, option values, and error codes are defined in the header file pcre2.h, which also contains definitions of PCRE2_MAJOR and PCRE2_MINOR, the major and minor release numbers for the library. Applications can use these to include support for different releases of PCRE2.

In a Windows environment, if you want to statically link an application program against a non-dll PCRE2 library, you must define PCRE2_STATIC before including pcre2.h.

The functions pcre2_compile() and pcre2_match() are used for compiling and matching regular expressions in a Perl-compatible manner. A sample program that demonstrates the simplest way of using them is provided in the file called pcre2demo.c in the PCRE2 source distribution. A listing of this program is given in the pcre2demo documentation, and the pcre2sample documentation describes how to compile and run it.

The compiling and matching functions recognize various options that are passed as bits in an options argument. There are also some more complicated parameters such as custom memory management functions and resource limits that are passed in "contexts" (which are just memory blocks, described below). Simple applications do not need to make use of contexts.

Just-in-time (JIT) compiler support is an optional feature of PCRE2 that can be built in appropriate hardware environments. It greatly speeds up the matching performance of many patterns. Programs can request that it be used if available by calling pcre2_jit_compile() after a pattern has been successfully compiled by pcre2_compile(). This does nothing if JIT support is not available.

More complicated programs might need to make use of the specialist functions pcre2_jit_stack_create(), pcre2_jit_stack_free(), and pcre2_jit_stack_assign() in order to control the JIT code's memory usage.

JIT matching is automatically used by pcre2_match() if it is available, unless the PCRE2_NO_JIT option is set. There is also a direct interface for JIT matching, which gives improved performance at the expense of less sanity checking. The JIT-specific functions are discussed in the pcre2jit documentation.

A second matching function, pcre2_dfa_match(), which is not Perl-compatible, is also provided. This uses a different algorithm for the matching. The alternative algorithm finds all possible matches (at a given point in the subject), and scans the subject just once (unless there are lookaround assertions). However, this algorithm does not return captured substrings. A description of the two matching algorithms and their advantages and disadvantages is given in the pcre2matching documentation. There is no JIT support for pcre2_dfa_match().

In addition to the main compiling and matching functions, there are convenience functions for extracting captured substrings from a subject string that has been matched by pcre2_match(). They are:

  pcre2_substring_copy_byname()
  pcre2_substring_copy_bynumber()
  pcre2_substring_get_byname()
  pcre2_substring_get_bynumber()
  pcre2_substring_list_get()
  pcre2_substring_length_byname()
  pcre2_substring_length_bynumber()
  pcre2_substring_nametable_scan()
  pcre2_substring_number_from_name()
pcre2_substring_free() and pcre2_substring_list_free() are also provided, to free memory used for extracted strings. If either of these functions is called with a NULL argument, the function returns immediately without doing anything.

The function pcre2_substitute() can be called to match a pattern and return a copy of the subject string with substitutions for parts that were matched.

Functions whose names begin with pcre2_serialize_ are used for saving compiled patterns on disc or elsewhere, and reloading them later.

Finally, there are functions for finding out information about a compiled pattern (pcre2_pattern_info()) and about the configuration with which the PCRE2 library that is in use was built (pcre2_config()).

Functions with names ending with _free() are used for freeing memory blocks of various sorts. In all cases, if one of these functions is called with a NULL argument, it does nothing.

STRING LENGTHS AND OFFSETS

The PCRE2 API uses string lengths and offsets into strings of code units in several places. These values are always of type PCRE2_SIZE, which is an unsigned integer type, currently always defined as size_t. The largest value that can be stored in such a type (that is ~(PCRE2_SIZE)0) is reserved as a special indicator for zero-terminated strings and unset offsets. Therefore, the longest string that can be handled is one less than this maximum. Note that string lengths are always given in code units. Only in the 8-bit library is such a length the same as the number of bytes in the string.

NEWLINES

PCRE2 supports five different conventions for indicating line breaks in strings: a single CR (carriage return) character, a single LF (linefeed) character, the two-character sequence CRLF, any of the three preceding, or any Unicode newline sequence. The Unicode newline sequences are the three just mentioned, plus the single characters VT (vertical tab, U+000B), FF (form feed, U+000C), NEL (next line, U+0085), LS (line separator, U+2028), and PS (paragraph separator, U+2029).

Each of the first three conventions is used by at least one operating system as its standard newline sequence. When PCRE2 is built, a default can be specified. If it is not, the default is set to LF, which is the Unix standard. However, the newline convention can be changed by an application when calling pcre2_compile(), or it can be specified by special text at the start of the pattern itself; this overrides any other settings. See the pcre2pattern page for details of the special character sequences.

In the PCRE2 documentation the word "newline" is used to mean "the character or pair of characters that indicate a line break". The choice of newline convention affects the handling of the dot, circumflex, and dollar metacharacters, the handling of #-comments in /x mode, and, when CRLF is a recognized line ending sequence, the match position advancement for a non-anchored pattern. There is more detail about this in the section on pcre2_match() options below.

The choice of newline convention does not affect the interpretation of the \n or \r escape sequences, nor does it affect what \R matches; this has its own separate convention.

MULTITHREADING

In a multithreaded application it is important to keep thread-specific data separate from data that can be shared between threads. The PCRE2 library code itself is thread-safe: it contains no static or global variables. The API is designed to be fairly simple for non-threaded applications while at the same time ensuring that multithreaded applications can use it.

There are several different blocks of data that are used to pass information between the application and the PCRE2 libraries.

The compiled pattern

A pointer to the compiled form of a pattern is returned to the user when pcre2_compile() is successful. The data in the compiled pattern is fixed, and does not change when the pattern is matched. Therefore, it is thread-safe, that is, the same compiled pattern can be used by more than one thread simultaneously. For example, an application can compile all its patterns at the start, before forking off multiple threads that use them. However, if the just-in-time (JIT) optimization feature is being used, it needs separate memory stack areas for each thread. See the pcre2jit documentation for more details.

In a more complicated situation, where patterns are compiled only when they are first needed, but are still shared between threads, pointers to compiled patterns must be protected from simultaneous writing by multiple threads. This is somewhat tricky to do correctly. If you know that writing to a pointer is atomic in your environment, you can use logic like this:

  Get a read-only (shared) lock (mutex) for pointer
  if (pointer == NULL)
    {
    Get a write (unique) lock for pointer
    if (pointer == NULL) pointer = pcre2_compile(...
    }
  Release the lock
  Use pointer in pcre2_match()
Of course, testing for compilation errors should also be included in the code.

The reason for checking the pointer a second time is as follows: Several threads may have acquired the shared lock and tested the pointer for being NULL, but only one of them will be given the write lock, with the rest kept waiting. The winning thread will compile the pattern and store the result. After this thread releases the write lock, another thread will get it, and if it does not retest pointer for being NULL, will recompile the pattern and overwrite the pointer, creating a memory leak and possibly causing other issues.

In an environment where writing to a pointer may not be atomic, the above logic is not sufficient. The thread that is doing the compiling may be descheduled after writing only part of the pointer, which could cause other threads to use an invalid value. Instead of checking the pointer itself, a separate "pointer is valid" flag (that can be updated atomically) must be used:

  Get a read-only (shared) lock (mutex) for pointer
  if (!pointer_is_valid)
    {
    Get a write (unique) lock for pointer
    if (!pointer_is_valid)
      {
      pointer = pcre2_compile(...
      pointer_is_valid = TRUE
      }
    }
  Release the lock
  Use pointer in pcre2_match()
If JIT is being used, but the JIT compilation is not being done immediately (perhaps waiting to see if the pattern is used often enough), similar logic is required. JIT compilation updates a value within the compiled code block, so a thread must gain unique write access to the pointer before calling pcre2_jit_compile(). Alternatively, pcre2_code_copy() or pcre2_code_copy_with_tables() can be used to obtain a private copy of the compiled code before calling the JIT compiler.

Context blocks

The next main section below introduces the idea of "contexts" in which PCRE2 functions are called. A context is nothing more than a collection of parameters that control the way PCRE2 operates. Grouping a number of parameters together in a context is a convenient way of passing them to a PCRE2 function without using lots of arguments. The parameters that are stored in contexts are in some sense "advanced features" of the API. Many straightforward applications will not need to use contexts.

In a multithreaded application, if the parameters in a context are values that are never changed, the same context can be used by all the threads. However, if any thread needs to change any value in a context, it must make its own thread-specific copy.

Match blocks

The matching functions need a block of memory for storing the results of a match. This includes details of what was matched, as well as additional information such as the name of a (*MARK) setting. Each thread must provide its own copy of this memory.

PCRE2 CONTEXTS

Some PCRE2 functions have a lot of parameters, many of which are used only by specialist applications, for example, those that use custom memory management or non-standard character tables. To keep function argument lists at a reasonable size, and at the same time to keep the API extensible, "uncommon" parameters are passed to certain functions in a context instead of directly. A context is just a block of memory that holds the parameter values. Applications that do not need to adjust any of the context parameters can pass NULL when a context pointer is required.

There are three different types of context: a general context that is relevant for several PCRE2 operations, a compile-time context, and a match-time context.

The general context

At present, this context just contains pointers to (and data for) external memory management functions that are called from several places in the PCRE2 library. The context is named `general' rather than specifically `memory' because in future other fields may be added. If you do not want to supply your own custom memory management functions, you do not need to bother with a general context. A general context is created by:

pcre2_general_context *pcre2_general_context_create( void *(*private_malloc)(PCRE2_SIZE, void *), void (*private_free)(void *, void *), void *memory_data);

The two function pointers specify custom memory management functions, whose prototypes are:

  void *private_malloc(PCRE2_SIZE, void *);
  void  private_free(void *, void *);
Whenever code in PCRE2 calls these functions, the final argument is the value of memory_data. Either of the first two arguments of the creation function may be NULL, in which case the system memory management functions malloc() and free() are used. (This is not currently useful, as there are no other fields in a general context, but in future there might be.) The private_malloc() function is used (if supplied) to obtain memory for storing the context, and all three values are saved as part of the context.

Whenever PCRE2 creates a data block of any kind, the block contains a pointer to the free() function that matches the malloc() function that was used. When the time comes to free the block, this function is called.

A general context can be copied by calling:

pcre2_general_context *pcre2_general_context_copy( pcre2_general_context *gcontext);

The memory used for a general context should be freed by calling:

void pcre2_general_context_free(pcre2_general_context *gcontext);

If this function is passed a NULL argument, it returns immediately without doing anything.

The compile context

A compile context is required if you want to provide an external function for stack checking during compilation or to change the default values of any of the following compile-time parameters:

  What \R matches (Unicode newlines or CR, LF, CRLF only)
  PCRE2's character tables
  The newline character sequence
  The compile time nested parentheses limit
  The maximum length of the pattern string
  The extra options bits (none set by default)
  Which performance optimizations the compiler should apply
A compile context is also required if you are using custom memory management. If none of these apply, just pass NULL as the context argument of pcre2_compile().

A compile context is created, copied, and freed by the following functions:

pcre2_compile_context *pcre2_compile_context_create( pcre2_general_context *gcontext);

pcre2_compile_context *pcre2_compile_context_copy( pcre2_compile_context *ccontext);

void pcre2_compile_context_free(pcre2_compile_context *ccontext);

A compile context is created with default values for its parameters. These can be changed by calling the following functions, which return 0 on success, or PCRE2_ERROR_BADDATA if invalid data is detected.

int pcre2_set_bsr(pcre2_compile_context *ccontext, uint32_t value);

The value must be PCRE2_BSR_ANYCRLF, to specify that \R matches only CR, LF, or CRLF, or PCRE2_BSR_UNICODE, to specify that \R matches any Unicode line ending sequence. The value is used by the JIT compiler and by the two interpreted matching functions, pcre2_match() and pcre2_dfa_match().

int pcre2_set_character_tables(pcre2_compile_context *ccontext, const uint8_t *tables);

The value must be the result of a call to pcre2_maketables(), whose only argument is a general context. This function builds a set of character tables in the current locale.

int pcre2_set_compile_extra_options(pcre2_compile_context *ccontext, uint32_t extra_options);

As PCRE2 has developed, almost all the 32 option bits that are available in the options argument of pcre2_compile() have been used up. To avoid running out, the compile context contains a set of extra option bits which are used for some newer, assumed rarer, options. This function sets those bits. It always sets all the bits (either on or off): the value supplied replaces the existing setting as a whole, rather than being combined with it. The available options are defined in the section entitled "Extra compile options" below.

int pcre2_set_max_pattern_length(pcre2_compile_context *ccontext, PCRE2_SIZE value);

This sets a maximum length, in code units, for any pattern string that is compiled with this context. If the pattern is longer, an error is generated. This facility is provided so that applications that accept patterns from external sources can limit their size. The default is the largest number that a PCRE2_SIZE variable can hold, which is effectively unlimited.

int pcre2_set_max_pattern_compiled_length( pcre2_compile_context *ccontext, PCRE2_SIZE value);

This sets a maximum size, in bytes, for the memory needed to hold the compiled version of a pattern that is compiled with this context. If the pattern needs more memory, an error is generated. This facility is provided so that applications that accept patterns from external sources can limit the amount of memory they use. The default is the largest number that a PCRE2_SIZE variable can hold, which is effectively unlimited.

int pcre2_set_max_varlookbehind(pcre2_compile_context *ccontext, uint32_t value);

This sets a maximum length for the number of characters matched by a variable-length lookbehind assertion. The default is set when PCRE2 is built, with the ultimate default being 255, the same as Perl. Lookbehind assertions without a bounding length are not supported.

int pcre2_set_newline(pcre2_compile_context *ccontext, uint32_t value);

This specifies which characters or character sequences are to be recognized as newlines. The value must be one of PCRE2_NEWLINE_CR (carriage return only), PCRE2_NEWLINE_LF (linefeed only), PCRE2_NEWLINE_CRLF (the two-character sequence CR followed by LF), PCRE2_NEWLINE_ANYCRLF (any of the above), PCRE2_NEWLINE_ANY (any Unicode newline sequence), or PCRE2_NEWLINE_NUL (the NUL character, that is a binary zero).

A pattern can override the value set in the compile context by starting with a sequence such as (*CRLF). See the pcre2pattern page for details.

When a pattern is compiled with the PCRE2_EXTENDED or PCRE2_EXTENDED_MORE option, the newline convention affects the recognition of the end of internal comments starting with #. The value is saved with the compiled pattern for subsequent use by the JIT compiler and by the two interpreted matching functions, pcre2_match() and pcre2_dfa_match().

int pcre2_set_parens_nest_limit(pcre2_compile_context *ccontext, uint32_t value);

This parameter adjusts the limit, set when PCRE2 is built (default 250), on the depth of parenthesis nesting in a pattern. This limit stops rogue patterns using up too much system stack when being compiled. The limit applies to parentheses of all kinds, not just capturing parentheses.

int pcre2_set_compile_recursion_guard(pcre2_compile_context *ccontext, int (*guard_function)(uint32_t, void *), void *user_data);

There is at least one application that runs PCRE2 in threads with very limited system stack, where running out of stack is to be avoided at all costs. The parenthesis limit above cannot take account of how much stack is actually available during compilation. For a finer control, you can supply a function that is called whenever pcre2_compile() starts to compile a parenthesized part of a pattern. This function can check the actual stack size (or anything else that it wants to, of course).

The first argument to the guard function gives the current depth of nesting, and the second is user data that is set up by the last argument of pcre2_set_compile_recursion_guard(). The guard function should return zero if all is well, or non-zero to force an error.

int pcre2_set_optimize(pcre2_compile_context *ccontext, uint32_t directive);

PCRE2 can apply various performance optimizations during compilation, in order to make matching faster. For example, the compiler might convert some regex constructs into an equivalent construct which pcre2_match() can execute faster. By default, all available optimizations are enabled. However, in rare cases, one might wish to disable specific optimizations. For example, if it is known that some optimizations cannot benefit a certain regex, it might be desirable to disable them, in order to speed up compilation.

The permitted values of directive are as follows:

  PCRE2_OPTIMIZATION_FULL
Enable all optional performance optimizations. This is the default value.
  PCRE2_OPTIMIZATION_NONE
Disable all optional performance optimizations.
  PCRE2_AUTO_POSSESS
  PCRE2_AUTO_POSSESS_OFF
Enable/disable "auto-possessification" of variable quantifiers such as * and +. This optimization, for example, turns a+b into a++b in order to avoid backtracks into a+ that can never be successful. However, if callouts are in use, auto-possessification means that some callouts are never taken. You can disable this optimization if you want the matching functions to do a full, unoptimized search and run all the callouts.
  PCRE2_DOTSTAR_ANCHOR
  PCRE2_DOTSTAR_ANCHOR_OFF
Enable/disable an optimization that is applied when .* is the first significant item in a top-level branch of a pattern, and all the other branches also start with .* or with \A or \G or ^. Such a pattern is automatically anchored if PCRE2_DOTALL is set for all the .* items and PCRE2_MULTILINE is not set for any ^ items. Otherwise, the fact that any match must start either at the start of the subject or following a newline is remembered. Like other optimizations, this can cause callouts to be skipped.

Dotstar anchor optimization is automatically disabled for .* if it is inside an atomic group or a capture group that is the subject of a backreference, or if the pattern contains (*PRUNE) or (*SKIP).

  PCRE2_START_OPTIMIZE
  PCRE2_START_OPTIMIZE_OFF
Enable/disable optimizations which cause matching functions to scan the subject string for specific code unit values before attempting a match. For example, if it is known that an unanchored match must start with a specific value, the matching code searches the subject for that value, and fails immediately if it cannot find it, without actually running the main matching function. This means that a special item such as (*COMMIT) at the start of a pattern is not considered until after a suitable starting point for the match has been found. Also, when callouts or (*MARK) items are in use, these "start-up" optimizations can cause them to be skipped if the pattern is never actually used. The start-up optimizations are in effect a pre-scan of the subject that takes place before the pattern is run.

Disabling start-up optimizations ensures that in cases where the result is "no match", the callouts do occur, and that items such as (*COMMIT) and (*MARK) are considered at every possible starting position in the subject string.

Disabling start-up optimizations may change the outcome of a matching operation. Consider the pattern

  (*COMMIT)ABC
When this is compiled, PCRE2 records the fact that a match must start with the character "A". Suppose the subject string is "DEFABC". The start-up optimization scans along the subject, finds "A" and runs the first match attempt from there. The (*COMMIT) item means that the pattern must match the current starting position, which in this case, it does. However, if the same match is run without start-up optimizations, the initial scan along the subject string does not happen. The first match attempt is run starting from "D" and when this fails, (*COMMIT) prevents any further matches being tried, so the overall result is "no match".

Another start-up optimization makes use of a minimum length for a matching subject, which is recorded when possible. Consider the pattern

  (*MARK:1)B(*MARK:2)(X|Y)
The minimum length for a match is two characters. If the subject is "XXBB", the "starting character" optimization skips "XX", then tries to match "BB", which is long enough. In the process, (*MARK:2) is encountered and remembered. When the match attempt fails, the next "B" is found, but there is only one character left, so there are no more attempts, and "no match" is returned with the "last mark seen" set to "2". Without start-up optimizations, however, matches are tried at every possible starting position, including at the end of the subject, where (*MARK:1) is encountered, but there is no "B", so the "last mark seen" that is returned is "1". In this case, the optimizations do not affect the overall match result, which is still "no match", but they do affect the auxiliary information that is returned.

The match context

A match context is required if you want to:

  Set up a callout function
  Set an offset limit for matching an unanchored pattern
  Change the limit on the amount of heap used when matching
  Change the backtracking match limit
  Change the backtracking depth limit
  Set custom memory management specifically for the match
If none of these apply, just pass NULL as the context argument of pcre2_match(), pcre2_dfa_match(), or pcre2_jit_match().

A match context is created, copied, and freed by the following functions:

pcre2_match_context *pcre2_match_context_create( pcre2_general_context *gcontext);

pcre2_match_context *pcre2_match_context_copy( pcre2_match_context *mcontext);

void pcre2_match_context_free(pcre2_match_context *mcontext);

A match context is created with default values for its parameters. These can be changed by calling the following functions, which return 0 on success, or PCRE2_ERROR_BADDATA if invalid data is detected.

int pcre2_set_callout(pcre2_match_context *mcontext, int (*callout_function)(pcre2_callout_block *, void *), void *callout_data);

This sets up a callout function for PCRE2 to call at specified points during a matching operation. Details are given in the pcre2callout documentation.

int pcre2_set_substitute_callout(pcre2_match_context *mcontext, int (*callout_function)(pcre2_substitute_callout_block *, void *), void *callout_data);

This sets up a callout function for PCRE2 to call after each substitution made by pcre2_substitute(). Details are given in the section entitled "Creating a new string with substitutions" below.

int pcre2_set_substitute_case_callout(pcre2_match_context *mcontext, PCRE2_SIZE (*callout_function)(PCRE2_SPTR, PCRE2_SIZE, PCRE2_UCHAR *, PCRE2_SIZE, int, void *), void *callout_data);

This sets up a callout function for PCRE2 to call when performing case transformations inside pcre2_substitute(). Details are given in the section entitled "Creating a new string with substitutions" below.

int pcre2_set_offset_limit(pcre2_match_context *mcontext, PCRE2_SIZE value);

The offset_limit parameter limits how far an unanchored search can advance in the subject string. The default value is PCRE2_UNSET. The pcre2_match() and pcre2_dfa_match() functions return PCRE2_ERROR_NOMATCH if a match with a starting point before or at the given offset is not found. The pcre2_substitute() function makes no more substitutions.

For example, if the pattern /abc/ is matched against "123abc" with an offset limit less than 3, the result is PCRE2_ERROR_NOMATCH. A match can never be found if the startoffset argument of pcre2_match(), pcre2_dfa_match(), or pcre2_substitute() is greater than the offset limit set in the match context.

When using this facility, you must set the PCRE2_USE_OFFSET_LIMIT option when calling pcre2_compile() so that when JIT is in use, different code can be compiled. If a match is started with a non-default offset limit when PCRE2_USE_OFFSET_LIMIT is not set, an error is generated.

The offset limit facility can be used to track progress when searching large subject strings or to limit the extent of global substitutions. See also the PCRE2_FIRSTLINE option, which requires a match to start before or at the first newline that follows the start of matching in the subject. If this is set with an offset limit, a match must occur in the first line and also within the offset limit. In other words, whichever limit comes first is used.

int pcre2_set_heap_limit(pcre2_match_context *mcontext, uint32_t value);

The heap_limit parameter specifies, in units of kibibytes (1024 bytes), the maximum amount of heap memory that pcre2_match() may use to hold backtracking information when running an interpretive match. This limit also applies to pcre2_dfa_match(), which may use the heap when processing patterns with a lot of nested pattern recursion or lookarounds or atomic groups. This limit does not apply to matching with the JIT optimization, which has its own memory control arrangements (see the pcre2jit documentation for more details). If the limit is reached, the negative error code PCRE2_ERROR_HEAPLIMIT is returned. The default limit can be set when PCRE2 is built; if it is not, the default is set very large and is essentially unlimited.

A value for the heap limit may also be supplied by an item at the start of a pattern of the form

  (*LIMIT_HEAP=ddd)
where ddd is a decimal number. However, such a setting is ignored unless ddd is less than the limit set by the caller of pcre2_match() or, if no such limit is set, less than the default.

The pcre2_match() function always needs some heap memory, so setting a value of zero guarantees a "heap limit exceeded" error. Details of how pcre2_match() uses the heap are given in the pcre2perform documentation.

For pcre2_dfa_match(), a vector on the system stack is used when processing pattern recursions, lookarounds, or atomic groups, and only if this is not big enough is heap memory used. In this case, setting a value of zero disables the use of the heap.

int pcre2_set_match_limit(pcre2_match_context *mcontext, uint32_t value);

The match_limit parameter provides a means of preventing PCRE2 from using up too many computing resources when processing patterns that are not going to match, but which have a very large number of possibilities in their search trees. The classic example is a pattern that uses nested unlimited repeats.

There is an internal counter in pcre2_match() that is incremented each time round its main matching loop. If this value reaches the match limit, pcre2_match() returns the negative value PCRE2_ERROR_MATCHLIMIT. This has the effect of limiting the amount of backtracking that can take place. For patterns that are not anchored, the count restarts from zero for each position in the subject string. This limit also applies to pcre2_dfa_match(), though the counting is done in a different way.

When pcre2_match() is called with a pattern that was successfully processed by pcre2_jit_compile(), the way in which matching is executed is entirely different. However, there is still the possibility of runaway matching that goes on for a very long time, and so the match_limit value is also used in this case (but in a different way) to limit how long the matching can continue.

The default value for the limit can be set when PCRE2 is built; the default is 10 million, which handles all but the most extreme cases. A value for the match limit may also be supplied by an item at the start of a pattern of the form

  (*LIMIT_MATCH=ddd)
where ddd is a decimal number. However, such a setting is ignored unless ddd is less than the limit set by the caller of pcre2_match() or pcre2_dfa_match() or, if no such limit is set, less than the default.

int pcre2_set_depth_limit(pcre2_match_context *mcontext, uint32_t value);

This parameter limits the depth of nested backtracking in pcre2_match(). Each time a nested backtracking point is passed, a new memory frame is used to remember the state of matching at that point. Thus, this parameter indirectly limits the amount of memory that is used in a match. However, because the size of each memory frame depends on the number of capturing parentheses, the actual memory limit varies from pattern to pattern. This limit was more useful in versions before 10.30, where function recursion was used for backtracking.

The depth limit is not relevant, and is ignored, when matching is done using JIT compiled code. However, it is supported by pcre2_dfa_match(), which uses it to limit the depth of nested internal recursive function calls that implement atomic groups, lookaround assertions, and pattern recursions. This limits, indirectly, the amount of system stack that is used. It was more useful in versions before 10.32, when stack memory was used for local workspace vectors for recursive function calls. From version 10.32, only local variables are allocated on the stack and as each call uses only a few hundred bytes, even a small stack can support quite a lot of recursion.

If the depth of internal recursive function calls is great enough, local workspace vectors are allocated on the heap from version 10.32 onwards, so the depth limit also indirectly limits the amount of heap memory that is used. A recursive pattern such as /(.(?2))((?1)|)/, when matched to a very long string using pcre2_dfa_match(), can use a great deal of memory. However, it is probably better to limit heap usage directly by calling pcre2_set_heap_limit().

The default value for the depth limit can be set when PCRE2 is built; if it is not, the default is set to the same value as the default for the match limit. If the limit is exceeded, pcre2_match() or pcre2_dfa_match() returns PCRE2_ERROR_DEPTHLIMIT. A value for the depth limit may also be supplied by an item at the start of a pattern of the form

  (*LIMIT_DEPTH=ddd)
where ddd is a decimal number. However, such a setting is ignored unless ddd is less than the limit set by the caller of pcre2_match() or pcre2_dfa_match() or, if no such limit is set, less than the default.

CHECKING BUILD-TIME OPTIONS

int pcre2_config(uint32_t what, void *where);

The function pcre2_config() makes it possible for a PCRE2 client to find the value of certain configuration parameters and to discover which optional features have been compiled into the PCRE2 library. The pcre2build documentation has more details about these features.

The first argument for pcre2_config() specifies which information is required. The second argument is a pointer to memory into which the information is placed. If NULL is passed, the function returns the amount of memory that is needed for the requested information. For calls that return numerical values, the value is in bytes; when requesting these values, where should point to appropriately aligned memory. For calls that return strings, the required length is given in code units, not counting the terminating zero.

When requesting information, the returned value from pcre2_config() is non-negative on success, or the negative error code PCRE2_ERROR_BADOPTION if the value in the first argument is not recognized. The following information is available:

  PCRE2_CONFIG_BSR
The output is a uint32_t integer whose value indicates what character sequences the \R escape sequence matches by default. A value of PCRE2_BSR_UNICODE means that \R matches any Unicode line ending sequence; a value of PCRE2_BSR_ANYCRLF means that \R matches only CR, LF, or CRLF. The default can be overridden when a pattern is compiled.
  PCRE2_CONFIG_COMPILED_WIDTHS
The output is a uint32_t integer whose lower bits indicate which code unit widths were selected when PCRE2 was built. The 1-bit indicates 8-bit support, and the 2-bit and 4-bit indicate 16-bit and 32-bit support, respectively.
  PCRE2_CONFIG_DEPTHLIMIT
The output is a uint32_t integer that gives the default limit for the depth of nested backtracking in pcre2_match() or the depth of nested recursions, lookarounds, and atomic groups in pcre2_dfa_match(). Further details are given with pcre2_set_depth_limit() above.
  PCRE2_CONFIG_EFFECTIVE_LINKSIZE
The output is a uint32_t integer that contains the number of bytes the library uses for internal linkage in compiled regular expressions. Its value is derived from the value that was provided at build time and that is described below by PCRE2_CONFIG_LINKSIZE.
  PCRE2_CONFIG_HEAPLIMIT
The output is a uint32_t integer that gives, in kibibytes, the default limit for the amount of heap memory used by pcre2_match() or pcre2_dfa_match(). Further details are given with pcre2_set_heap_limit() above.
  PCRE2_CONFIG_JIT
The output is a uint32_t integer that is set to one if support for just-in-time compiling is included in the library; otherwise it is set to zero. Note that having the support in the library does not guarantee that JIT will be used for any given match, and neither does it guarantee that JIT will actually be able to function, because it may not be able to allocate executable memory in some environments. There is a special call to pcre2_jit_compile() that can be used to check this. See the pcre2jit documentation for more details.
  PCRE2_CONFIG_JITTARGET
The where argument should point to a code-unit-aligned buffer. All previous versions of PCRE2 have required no more than 128 code units of buffer capacity. However, this requirement is not guaranteed to be maintained, so applications should call pcre2_config() with where set to NULL to receive the required buffer size, then assert or allocate a suitably-sized buffer for a second call to pcre2_config(). The buffer is filled with a string that contains the name of the architecture for which the JIT compiler is configured at build time, for example, a 64-bit ARM CPU that supports the Armv8.1 extension writes "ARM-64 (LSE) 64bit (little endian + unaligned)". If JIT support is not available, PCRE2_ERROR_BADOPTION is returned; otherwise the number of code units used is returned. This is the length of the string plus one unit for the terminating zero.
  PCRE2_CONFIG_LINKSIZE
The output is a uint32_t integer that contains the number of bytes the library was instructed to use for internal linkage in compiled regular expressions. When PCRE2 is configured, the value can be set to 2, 3, or 4, with the default being 2 for most libraries.

The actual number of bytes used depends on the size of the code units that the library supports and can be higher. See PCRE2_CONFIG_EFFECTIVE_LINKSIZE above for details.

The default value of 2 for the 8-bit and 16-bit libraries is sufficient for all but the most massive patterns, since it allows the size of the compiled pattern to be up to 65535 code units. Larger values allow larger regular expressions to be compiled by those two libraries, but at the expense of slower matching.

  PCRE2_CONFIG_MATCHLIMIT
The output is a uint32_t integer that gives the default match limit for pcre2_match(). Further details are given with pcre2_set_match_limit() above.
  PCRE2_CONFIG_NEWLINE
The output is a uint32_t integer whose value specifies the default character sequence that is recognized as meaning "newline". The values are:
  PCRE2_NEWLINE_CR       Carriage return (CR)
  PCRE2_NEWLINE_LF       Linefeed (LF)
  PCRE2_NEWLINE_CRLF     Carriage return, linefeed (CRLF)
  PCRE2_NEWLINE_ANY      Any Unicode line ending
  PCRE2_NEWLINE_ANYCRLF  Any of CR, LF, or CRLF
  PCRE2_NEWLINE_NUL      The NUL character (binary zero)
The default should normally correspond to the standard sequence for your operating system.
  PCRE2_CONFIG_NEVER_BACKSLASH_C
The output is a uint32_t integer that is set to one if the use of \C was permanently disabled when PCRE2 was built; otherwise it is set to zero.
  PCRE2_CONFIG_PARENSLIMIT
The output is a uint32_t integer that gives the maximum depth of nesting of parentheses (of any kind) in a pattern. This limit is imposed to cap the amount of system stack used when a pattern is compiled. It is specified when PCRE2 is built; the default is 250. This limit does not take into account the stack that may already be used by the calling application. For finer control over compilation stack usage, see pcre2_set_compile_recursion_guard().
  PCRE2_CONFIG_STACKRECURSE
This parameter is obsolete and should not be used in new code. The output is a uint32_t integer that is always set to zero.
  PCRE2_CONFIG_TABLES_LENGTH
The output is a uint32_t integer that gives the length of PCRE2's character processing tables in bytes. For details of these tables see the section on locale support below.
  PCRE2_CONFIG_UNICODE_VERSION
The where argument should point to a code-unit-aligned buffer. All previous versions of PCRE2 have required no more than 24 code units of buffer capacity. However, applications should call pcre2_config() with where set to NULL to receive the required buffer size, then assert or allocate a suitably-sized buffer for a second call to pcre2_config(). If PCRE2 has been compiled without Unicode support, the buffer is filled with the text "Unicode not supported". Otherwise, the Unicode version string (for example, "8.0.0") is written. The number of code units used is returned. This is the length of the string plus one unit for the terminating zero.
  PCRE2_CONFIG_UNICODE
The output is a uint32_t integer that is set to one if Unicode support is available; otherwise it is set to zero. Unicode support implies UTF support.
  PCRE2_CONFIG_VERSION
The where argument should point to a code-unit-aligned buffer. All previous versions of PCRE2 have required no more than 24 code units of buffer capacity. However, applications should call pcre2_config() with where set to NULL to receive the required buffer size, then assert or allocate a suitably-sized buffer for a second call to pcre2_config(). The buffer is filled with the PCRE2 version string, zero-terminated. The number of code units used is returned. This is the length of the string plus one unit for the terminating zero.

COMPILING A PATTERN

pcre2_code *pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE length, uint32_t options, int *errorcode, PCRE2_SIZE *erroroffset, pcre2_compile_context *ccontext);

void pcre2_code_free(pcre2_code *code);

pcre2_code *pcre2_code_copy(const pcre2_code *code);

pcre2_code *pcre2_code_copy_with_tables(const pcre2_code *code);

The pcre2_compile() function compiles a pattern into an internal form. The pattern is defined by a pointer to a string of code units and a length in code units. If the pattern is zero-terminated, the length can be specified as PCRE2_ZERO_TERMINATED. A NULL pattern pointer with a length of zero is treated as an empty string (NULL with a non-zero length causes an error return). The function returns a pointer to a block of memory that contains the compiled pattern and related data, or NULL if an error occurred.

If the compile context argument ccontext is NULL, memory for the compiled pattern is obtained by calling malloc(). Otherwise, it is obtained from the same memory function that was used for the compile context. The caller must free the memory by calling pcre2_code_free() when it is no longer needed. If pcre2_code_free() is called with a NULL argument, it returns immediately, without doing anything.

The function pcre2_code_copy() makes a copy of the compiled code in new memory, using the same memory allocator as was used for the original. However, if the code has been processed by the JIT compiler (see below), the JIT information cannot be copied (because it is position-dependent). The new copy can initially be used only for non-JIT matching, though it can be passed to pcre2_jit_compile() if required. If pcre2_code_copy() is called with a NULL argument, it returns NULL.

The pcre2_code_copy() function provides a way for individual threads in a multithreaded application to acquire a private copy of shared compiled code. However, it does not make a copy of the character tables used by the compiled pattern; the new pattern code points to the same tables as the original code. (See "Locale Support" below for details of these character tables.) In many applications the same tables are used throughout, so this behaviour is appropriate. Nevertheless, there are occasions when a copy of a compiled pattern and the relevant tables are needed. The pcre2_code_copy_with_tables() function provides this facility. Copies of both the code and the tables are made, with the new code pointing to the new tables. The memory for the new tables is automatically freed when pcre2_code_free() is called for the new copy of the compiled code. If pcre2_code_copy_with_tables() is called with a NULL argument, it returns NULL.

NOTE: When one of the matching functions is called, pointers to the compiled pattern and the subject string are set in the match data block so that they can be referenced by the substring extraction functions after a successful match. After running a match, you must not free a compiled pattern or a subject string until after all operations on the match data block have taken place, unless, in the case of the subject string, you have used the PCRE2_COPY_MATCHED_SUBJECT option, which is described in the section entitled "Option bits for pcre2_match()" below.

The options argument for pcre2_compile() contains various bit settings that affect the compilation. It should be zero if none of them are required. The available options are described below. Some of them (in particular, those that are compatible with Perl, but some others as well) can also be set and unset from within the pattern (see the detailed description in the pcre2pattern documentation).

For those options that can be different in different parts of the pattern, the contents of the options argument specify their settings at the start of compilation. The PCRE2_ANCHORED, PCRE2_ENDANCHORED, and PCRE2_NO_UTF_CHECK options can be set at the time of matching as well as at compile time.

Some additional options and less frequently required compile-time parameters (for example, the newline setting) can be provided in a compile context (as described above).

If errorcode or erroroffset is NULL, pcre2_compile() returns NULL immediately. Otherwise, the variables to which these point are set to an error code and an offset (number of code units) within the pattern, respectively, when pcre2_compile() returns NULL because a compilation error has occurred.

There are over 100 positive error codes that pcre2_compile() may return if it finds an error in the pattern. There are also some negative error codes that are used for invalid UTF strings when validity checking is in force. These are the same as given by pcre2_match() and pcre2_dfa_match(), and are described in the pcre2unicode documentation. There is no separate documentation for the positive error codes, because the textual error messages that are obtained by calling the pcre2_get_error_message() function (see "Obtaining a textual error message" below) should be self-explanatory. Macro names starting with PCRE2_ERROR_ are defined for both positive and negative error codes in pcre2.h. When compilation is successful errorcode is set to a value that returns the message "no error" if passed to pcre2_get_error_message().

The value returned in erroroffset is an indication of where in the pattern an error occurred. When there is no error, zero is returned. A non-zero value is not necessarily the furthest point in the pattern that was read. For example, after the error "lookbehind assertion is not fixed length", the error offset points to the start of the failing assertion. For an invalid UTF-8 or UTF-16 string, the offset is that of the first code unit of the failing character.

Some errors are not detected until the whole pattern has been scanned; in these cases, the offset passed back is the length of the pattern. Note that the offset is in code units, not characters, even in a UTF mode. It may sometimes point into the middle of a UTF-8 or UTF-16 character.

This code fragment shows a typical straightforward call to pcre2_compile():

  pcre2_code *re;
  PCRE2_SIZE erroffset;
  int errorcode;
  re = pcre2_compile(
    "^A.*Z",                /* the pattern */
    PCRE2_ZERO_TERMINATED,  /* the pattern is zero-terminated */
    0,                      /* default options */
    &errorcode,             /* for error code */
    &erroffset,             /* for error offset */
    NULL);                  /* no compile context */

Main compile options

The following names for option bits are defined in the pcre2.h header file:

  PCRE2_ANCHORED
If this bit is set, the pattern is forced to be "anchored", that is, it is constrained to match only at the first matching point in the string that is being searched (the "subject string"). This effect can also be achieved by appropriate constructs in the pattern itself, which is the only way to do it in Perl.
  PCRE2_ALLOW_EMPTY_CLASS
By default, for compatibility with Perl, a closing square bracket that immediately follows an opening one is treated as a data character for the class. When PCRE2_ALLOW_EMPTY_CLASS is set, it terminates the class, which therefore contains no characters and so can never match.
  PCRE2_ALT_BSUX
This option requests alternative handling of three escape sequences, which makes PCRE2's behaviour more like ECMAscript (aka JavaScript). When it is set:

(1) \U matches an upper case "U" character; by default \U causes a compile time error (Perl uses \U to upper case subsequent characters).

(2) \u matches a lower case "u" character unless it is followed by four hexadecimal digits, in which case the hexadecimal number defines the code point to match. By default, \u causes a compile time error (Perl uses it to upper case the following character).

(3) \x matches a lower case "x" character unless it is followed by two hexadecimal digits, in which case the hexadecimal number defines the code point to match. By default, as in Perl, a hexadecimal number is always expected after \x, but it may have one or two digits.

ECMAscript 6 added additional functionality to \u. This can be accessed using the PCRE2_EXTRA_ALT_BSUX extra option (see "Extra compile options" below). Note that this alternative escape handling applies only to patterns. Neither of these options affects the processing of replacement strings passed to pcre2_substitute().

  PCRE2_ALT_CIRCUMFLEX
In multiline mode (when PCRE2_MULTILINE is set), the circumflex metacharacter matches at the start of the subject (unless PCRE2_NOTBOL is set), and also after any internal newline. However, it does not match after a newline at the end of the subject, for compatibility with Perl. If you want a multiline circumflex also to match after a terminating newline, you must set PCRE2_ALT_CIRCUMFLEX.
  PCRE2_ALT_EXTENDED_CLASS
Alters the parsing of character classes to follow the extended syntax described by Unicode UTS#18. The PCRE2_ALT_EXTENDED_CLASS option has no impact on the behaviour of the Perl-specific "(?[...])" syntax for extended classes, but instead enables the alternative syntax of extended class behaviour inside ordinary "[...]" character classes. See the pcre2pattern documentation for details of the character classes supported.
  PCRE2_ALT_VERBNAMES
By default, for compatibility with Perl, the name in any verb sequence such as (*MARK:NAME) is any sequence of characters that does not include a closing parenthesis. The name is not processed in any way, and it is not possible to include a closing parenthesis in the name. However, if the PCRE2_ALT_VERBNAMES option is set, normal backslash processing is applied to verb names and only an unescaped closing parenthesis terminates the name. A closing parenthesis can be included in a name either as \) or between \Q and \E. If the PCRE2_EXTENDED or PCRE2_EXTENDED_MORE option is set with PCRE2_ALT_VERBNAMES, unescaped white space in verb names is skipped and #-comments are recognized, exactly as in the rest of the pattern.
  PCRE2_AUTO_CALLOUT
If this bit is set, pcre2_compile() automatically inserts callout items, all with number 255, before each pattern item, except immediately before or after an explicit callout in the pattern. For discussion of the callout facility, see the pcre2callout documentation.
  PCRE2_CASELESS
If this bit is set, letters in the pattern match both upper and lower case letters in the subject. It is equivalent to Perl's /i option, and it can be changed within a pattern by a (?i) option setting. If either PCRE2_UTF or PCRE2_UCP is set, Unicode properties are used for all characters with more than one other case, and for all characters whose code points are greater than U+007F.

Note that there are two ASCII characters, K and S, that, in addition to their lower case ASCII equivalents, are case-equivalent with U+212A (Kelvin sign) and U+017F (long S) respectively. If you do not want this case equivalence, you can suppress it by setting PCRE2_EXTRA_CASELESS_RESTRICT.

One language family, Turkish and Azeri, has its own case-insensitivity rules, which can be selected by setting PCRE2_EXTRA_TURKISH_CASING. This alters the behaviour of the 'i', 'I', U+0130 (capital I with dot above), and U+0131 (small dotless i) characters.

For lower valued characters with only one other case, a lookup table is used for speed. When neither PCRE2_UTF nor PCRE2_UCP is set, a lookup table is used for all code points less than 256, and higher code points (available only in 16-bit or 32-bit mode) are treated as not having another case.

From release 10.45 PCRE2_CASELESS also affects what some of the letter-related Unicode property escapes (\p and \P) match. The properties Lu (upper case letter), Ll (lower case letter), and Lt (title case letter) are all treated as LC (cased letter) when PCRE2_CASELESS is set.

  PCRE2_DOLLAR_ENDONLY
If this bit is set, a dollar metacharacter in the pattern matches only at the end of the subject string. Without this option, a dollar also matches immediately before a newline at the end of the string (but not before any other newlines). The PCRE2_DOLLAR_ENDONLY option is ignored if PCRE2_MULTILINE is set. There is no equivalent to this option in Perl, and no way to set it within a pattern.
  PCRE2_DOTALL
If this bit is set, a dot metacharacter in the pattern matches any character, including one that indicates a newline. However, it only ever matches one character, even if newlines are coded as CRLF. Without this option, a dot does not match when the current position in the subject is at a newline. This option is equivalent to Perl's /s option, and it can be changed within a pattern by a (?s) option setting. A negative class such as [^a] always matches newline characters, and the \N escape sequence always matches a non-newline character, independent of the setting of PCRE2_DOTALL.
  PCRE2_DUPNAMES
If this bit is set, names used to identify capture groups need not be unique. This can be helpful for certain types of pattern when it is known that only one instance of the named group can ever be matched. There are more details of named capture groups below; see also the pcre2pattern documentation.
  PCRE2_ENDANCHORED
If this bit is set, the end of any pattern match must be right at the end of the string being searched (the "subject string"). If the pattern match succeeds by reaching (*ACCEPT), but does not reach the end of the subject, the match fails at the current starting point. For unanchored patterns, a new match is then tried at the next starting point. However, if the match succeeds by reaching the end of the pattern, but not the end of the subject, backtracking occurs and an alternative match may be found. Consider these two patterns:
  .(*ACCEPT)|..
  .|..
If matched against "abc" with PCRE2_ENDANCHORED set, the first matches "c" whereas the second matches "bc". The effect of PCRE2_ENDANCHORED can also be achieved by appropriate constructs in the pattern itself, which is the only way to do it in Perl.

For DFA matching with pcre2_dfa_match(), PCRE2_ENDANCHORED applies only to the first (that is, the longest) matched string. Other parallel matches, which are necessarily substrings of the first one, must obviously end before the end of the subject.

  PCRE2_EXTENDED
If this bit is set, most white space characters in the pattern are totally ignored except when escaped, inside a character class, or inside a \Q...\E sequence. However, white space is not allowed within sequences such as (?> that introduce various parenthesized groups, nor within numerical quantifiers such as {1,3}. Ignorable white space is permitted between an item and a following quantifier and between a quantifier and a following + that indicates possessiveness. PCRE2_EXTENDED is equivalent to Perl's /x option, and it can be changed within a pattern by a (?x) option setting.

When PCRE2 is compiled without Unicode support, PCRE2_EXTENDED recognizes as white space only those characters with code points less than 256 that are flagged as white space in its low-character table. The table is normally created by pcre2_maketables(), which uses the isspace() function to identify space characters. In most ASCII environments, the relevant characters are those with code points 0x0009 (tab), 0x000A (linefeed), 0x000B (vertical tab), 0x000C (formfeed), 0x000D (carriage return), and 0x0020 (space).

When PCRE2 is compiled with Unicode support, in addition to these characters, five more Unicode "Pattern White Space" characters are recognized by PCRE2_EXTENDED. These are U+0085 (next line), U+200E (left-to-right mark), U+200F (right-to-left mark), U+2028 (line separator), and U+2029 (paragraph separator). This set of characters is the same as recognized by Perl's /x option. Note that the horizontal and vertical space characters that are matched by the \h and \v escapes in patterns are a much bigger set.

As well as ignoring most white space, PCRE2_EXTENDED also causes characters between an unescaped # outside a character class and the next newline, inclusive, to be ignored, which makes it possible to include comments inside complicated patterns. Note that the end of this type of comment is a literal newline sequence in the pattern; escape sequences that happen to represent a newline do not count.

Which characters are interpreted as newlines can be specified by a setting in the compile context that is passed to pcre2_compile() or by a special sequence at the start of the pattern, as described in the section entitled "Newline conventions" in the pcre2pattern documentation. A default is defined when PCRE2 is built.

  PCRE2_EXTENDED_MORE
This option has the effect of PCRE2_EXTENDED, but, in addition, unescaped space and horizontal tab characters are ignored inside a character class. Note: only these two characters are ignored, not the full set of pattern white space characters that are ignored outside a character class. PCRE2_EXTENDED_MORE is equivalent to Perl's /xx option, and it can be changed within a pattern by a (?xx) option setting.
  PCRE2_FIRSTLINE
If this option is set, the start of an unanchored pattern match must be before or at the first newline in the subject string following the start of matching, though the matched text may continue over the newline. If startoffset is non-zero, the limiting newline is not necessarily the first newline in the subject. For example, if the subject string is "abc\nxyz" (where \n represents a single-character newline) a pattern match for "yz" succeeds with PCRE2_FIRSTLINE if startoffset is greater than 3. See also PCRE2_USE_OFFSET_LIMIT, which provides a more general limiting facility. If PCRE2_FIRSTLINE is set with an offset limit, a match must occur in the first line and also within the offset limit. In other words, whichever limit comes first is used. This option has no effect for anchored patterns.
  PCRE2_LITERAL
If this option is set, all meta-characters in the pattern are disabled, and it is treated as a literal string. Matching literal strings with a regular expression engine is not the most efficient way of doing it. If you are doing a lot of literal matching and are worried about efficiency, you should consider using other approaches. The only other main options that are allowed with PCRE2_LITERAL are: PCRE2_ANCHORED, PCRE2_ENDANCHORED, PCRE2_AUTO_CALLOUT, PCRE2_CASELESS, PCRE2_FIRSTLINE, PCRE2_MATCH_INVALID_UTF, PCRE2_NO_START_OPTIMIZE, PCRE2_NO_UTF_CHECK, PCRE2_UTF, and PCRE2_USE_OFFSET_LIMIT. The extra options PCRE2_EXTRA_MATCH_LINE and PCRE2_EXTRA_MATCH_WORD are also supported. Any other options cause an error.
  PCRE2_MATCH_INVALID_UTF
This option forces PCRE2_UTF (see below) and also enables support for matching by pcre2_match() in subject strings that contain invalid UTF sequences. Note, however, that the 16-bit and 32-bit PCRE2 libraries process strings as sequences of uint16_t or uint32_t code points. They cannot find valid UTF sequences within an arbitrary string of bytes unless such sequences are suitably aligned. This facility is not supported for DFA matching. For details, see the pcre2unicode documentation.
  PCRE2_MATCH_UNSET_BACKREF
If this option is set, a backreference to an unset capture group matches an empty string (by default this causes the current matching alternative to fail). A pattern such as (\1)(a) succeeds when this option is set (assuming it can find an "a" in the subject), whereas it fails by default, for Perl compatibility. Setting this option makes PCRE2 behave more like ECMAscript (aka JavaScript).
  PCRE2_MULTILINE
By default, for the purposes of matching "start of line" and "end of line", PCRE2 treats the subject string as consisting of a single line of characters, even if it actually contains newlines. The "start of line" metacharacter (^) matches only at the start of the string, and the "end of line" metacharacter ($) matches only at the end of the string, or before a terminating newline (except when PCRE2_DOLLAR_ENDONLY is set). Note, however, that unless PCRE2_DOTALL is set, the "any character" metacharacter (.) does not match at a newline. This behaviour (for ^, $, and dot) is the same as Perl.

When PCRE2_MULTILINE is set, the "start of line" and "end of line" constructs match immediately following or immediately before internal newlines in the subject string, respectively, as well as at the very start and end. This is equivalent to Perl's /m option, and it can be changed within a pattern by a (?m) option setting. Note that the "start of line" metacharacter does not match after a newline at the end of the subject, for compatibility with Perl. However, you can change this by setting the PCRE2_ALT_CIRCUMFLEX option. If there are no newlines in a subject string, or no occurrences of ^ or $ in a pattern, setting PCRE2_MULTILINE has no effect.

  PCRE2_NEVER_BACKSLASH_C
This option locks out the use of \C in the pattern that is being compiled. This escape can cause unpredictable behaviour in UTF-8 or UTF-16 modes, because it may leave the current matching point in the middle of a multi-code-unit character. This option may be useful in applications that process patterns from external sources. Note that there is also a build-time option that permanently locks out the use of \C.
  PCRE2_NEVER_UCP
This option locks out the use of Unicode properties for handling \B, \b, \D, \d, \S, \s, \W, \w, and some of the POSIX character classes, as described for the PCRE2_UCP option below. In particular, it prevents the creator of the pattern from enabling this facility by starting the pattern with (*UCP). This option may be useful in applications that process patterns from external sources. The option combination PCRE2_UCP and PCRE2_NEVER_UCP causes an error.
  PCRE2_NEVER_UTF
This option locks out interpretation of the pattern as UTF-8, UTF-16, or UTF-32, depending on which library is in use. In particular, it prevents the creator of the pattern from switching to UTF interpretation by starting the pattern with (*UTF). This option may be useful in applications that process patterns from external sources. The combination of PCRE2_UTF and PCRE2_NEVER_UTF causes an error.
  PCRE2_NO_AUTO_CAPTURE
If this option is set, it disables the use of numbered capturing parentheses in the pattern. Any opening parenthesis that is not followed by ? behaves as if it were followed by ?: but named parentheses can still be used for capturing (and they acquire numbers in the usual way). This is the same as Perl's /n option. Note that, when this option is set, references to capture groups (backreferences or recursion/subroutine calls) may only refer to named groups, though the reference can be by name or by number.
  PCRE2_NO_AUTO_POSSESS
If this (deprecated) option is set, it disables "auto-possessification", which is an optimization that, for example, turns a+b into a++b in order to avoid backtracks into a+ that can never be successful. However, if callouts are in use, auto-possessification means that some callouts are never taken. You can set this option if you want the matching functions to do a full unoptimized search and run all the callouts, but it is mainly provided for testing purposes.

If a compile context is available, it is recommended to use pcre2_set_optimize() with the directive PCRE2_AUTO_POSSESS_OFF rather than the compile option PCRE2_NO_AUTO_POSSESS. Note that PCRE2_NO_AUTO_POSSESS takes precedence over the pcre2_set_optimize() optimization directives PCRE2_AUTO_POSSESS and PCRE2_AUTO_POSSESS_OFF.

  PCRE2_NO_DOTSTAR_ANCHOR
If this (deprecated) option is set, it disables an optimization that is applied when .* is the first significant item in a top-level branch of a pattern, and all the other branches also start with .* or with \A or \G or ^. The optimization is automatically disabled for .* if it is inside an atomic group or a capture group that is the subject of a backreference, or if the pattern contains (*PRUNE) or (*SKIP). When the optimization is not disabled, such a pattern is automatically anchored if PCRE2_DOTALL is set for all the .* items and PCRE2_MULTILINE is not set for any ^ items. Otherwise, the fact that any match must start either at the start of the subject or following a newline is remembered. Like other optimizations, this can cause callouts to be skipped. (If a compile context is available, it is recommended to use pcre2_set_optimize() with the directive PCRE2_DOTSTAR_ANCHOR_OFF instead.)
  PCRE2_NO_START_OPTIMIZE
This is an option whose main effect is at matching time. It does not change what pcre2_compile() generates, but it does affect the output of the JIT compiler. Setting this option is equivalent to calling pcre2_set_optimize() with the directive parameter set to PCRE2_START_OPTIMIZE_OFF.

There are a number of optimizations that may occur at the start of a match, in order to speed up the process. For example, if it is known that an unanchored match must start with a specific code unit value, the matching code searches the subject for that value, and fails immediately if it cannot find it, without actually running the main matching function. The start-up optimizations are in effect a pre-scan of the subject that takes place before the pattern is run.

Disabling the start-up optimizations may cause performance to suffer. However, this may be desirable for patterns which contain callouts or items such as (*COMMIT) and (*MARK). See the above description of PCRE2_START_OPTIMIZE_OFF for further details.

  PCRE2_NO_UTF_CHECK
When PCRE2_UTF is set, the validity of the pattern as a UTF string is automatically checked. There are discussions about the validity of UTF-8 strings, UTF-16 strings, and UTF-32 strings in the pcre2unicode document. If an invalid UTF sequence is found, pcre2_compile() returns a negative error code.

If you know that your pattern is a valid UTF string, and you want to skip this check for performance reasons, you can set the PCRE2_NO_UTF_CHECK option. When it is set, the effect of passing an invalid UTF string as a pattern is undefined. It may cause your program to crash or loop.

Note that this option can also be passed to pcre2_match() and pcre2_dfa_match(), to suppress UTF validity checking of the subject string.

Note also that setting PCRE2_NO_UTF_CHECK at compile time does not disable the error that is given if an escape sequence for an invalid Unicode code point is encountered in the pattern. In particular, the so-called "surrogate" code points (0xd800 to 0xdfff) are invalid. If you want to allow escape sequences such as \x{d800} you can set the PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES extra option, as described in the section entitled "Extra compile options" below. However, this is possible only in UTF-8 and UTF-32 modes, because these values are not representable in UTF-16.

  PCRE2_UCP
This option has two effects. Firstly, it changes the way PCRE2 processes \B, \b, \D, \d, \S, \s, \W, \w, and some of the POSIX character classes. By default, only ASCII characters are recognized, but if PCRE2_UCP is set, Unicode properties are used to classify characters. There are some PCRE2_EXTRA options (see below) that add finer control to this behaviour. More details are given in the section on generic character types in the pcre2pattern page.

The second effect of PCRE2_UCP is to force the use of Unicode properties for upper/lower casing operations, even when PCRE2_UTF is not set. This makes it possible to process strings in the 16-bit UCS-2 code. This option is available only if PCRE2 has been compiled with Unicode support (which is the default).

The PCRE2_EXTRA_CASELESS_RESTRICT option (see above) restricts caseless matching such that ASCII characters match only ASCII characters and non-ASCII characters match only non-ASCII characters. The PCRE2_EXTRA_TURKISH_CASING option (see above) alters the matching of the 'i' characters to follow their behaviour in Turkish and Azeri languages. For further details on PCRE2_EXTRA_CASELESS_RESTRICT and PCRE2_EXTRA_TURKISH_CASING, see the pcre2unicode page.

  PCRE2_UNGREEDY
This option inverts the "greediness" of the quantifiers so that they are not greedy by default, but become greedy if followed by "?". It is not compatible with Perl. It can also be set by a (?U) option setting within the pattern.
  PCRE2_USE_OFFSET_LIMIT
This option must be set for pcre2_compile() if pcre2_set_offset_limit() is going to be used to set a non-default offset limit in a match context for matches that use this pattern. An error is generated if an offset limit is set without this option. For more details, see the description of pcre2_set_offset_limit() in the section that describes match contexts. See also the PCRE2_FIRSTLINE option above.
  PCRE2_UTF
This option causes PCRE2 to regard both the pattern and the subject strings that are subsequently processed as strings of UTF characters instead of single-code-unit strings. It is available when PCRE2 is built to include Unicode support (which is the default). If Unicode support is not available, the use of this option provokes an error. Details of how PCRE2_UTF changes the behaviour of PCRE2 are given in the pcre2unicode page. In particular, note that it changes the way PCRE2_CASELESS works.

Extra compile options

The option bits that can be set in a compile context by calling the pcre2_set_compile_extra_options() function are as follows:

  PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK
Since release 10.38 PCRE2 has forbidden the use of \K within lookaround assertions, following Perl's lead. This option is provided to re-enable the previous behaviour (act in positive lookarounds, ignore in negative ones) in case anybody is relying on it.
  PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES
This option applies when compiling a pattern in UTF-8 or UTF-32 mode. It is forbidden in UTF-16 mode, and ignored in non-UTF modes. Unicode "surrogate" code points in the range 0xd800 to 0xdfff are used in pairs in UTF-16 to encode code points with values in the range 0x10000 to 0x10ffff. The surrogates cannot therefore be represented in UTF-16. They can be represented in UTF-8 and UTF-32, but are defined as invalid code points, and cause errors if encountered in a UTF-8 or UTF-32 string that is being checked for validity by PCRE2.

These values also cause errors if encountered in escape sequences such as \x{d912} within a pattern. However, it seems that some applications, when using PCRE2 to check for unwanted characters in UTF-8 strings, explicitly test for the surrogates using escape sequences. The PCRE2_NO_UTF_CHECK option does not disable the error that occurs, because it applies only to the testing of input strings for UTF validity.

If the extra option PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is set, surrogate code point values in UTF-8 and UTF-32 patterns no longer provoke errors and are incorporated in the compiled pattern. However, they can only match subject characters if the matching function is called with PCRE2_NO_UTF_CHECK set.

  PCRE2_EXTRA_ALT_BSUX
The original option PCRE2_ALT_BSUX causes PCRE2 to process \U, \u, and \x in the way that ECMAscript (aka JavaScript) does. Additional functionality was defined by ECMAscript 6; setting PCRE2_EXTRA_ALT_BSUX has the effect of PCRE2_ALT_BSUX, but in addition it recognizes \u{hhh..} as a hexadecimal character code, where hhh.. is any number of hexadecimal digits.
  PCRE2_EXTRA_ASCII_BSD
This option forces \d to match only ASCII digits, even when PCRE2_UCP is set. It can be changed within a pattern by means of the (?aD) option setting.
  PCRE2_EXTRA_ASCII_BSS
This option forces \s to match only ASCII space characters, even when PCRE2_UCP is set. It can be changed within a pattern by means of the (?aS) option setting.
  PCRE2_EXTRA_ASCII_BSW
This option forces \w to match only ASCII word characters, even when PCRE2_UCP is set. It can be changed within a pattern by means of the (?aW) option setting.
  PCRE2_EXTRA_ASCII_DIGIT
This option forces the POSIX character classes [:digit:] and [:xdigit:] to match only ASCII digits, even when PCRE2_UCP is set. It can be changed within a pattern by means of the (?aT) option setting.
  PCRE2_EXTRA_ASCII_POSIX
This option forces all the POSIX character classes, including [:digit:] and [:xdigit:], to match only ASCII characters, even when PCRE2_UCP is set. It can be changed within a pattern by means of the (?aP) option setting, but note that this also sets PCRE2_EXTRA_ASCII_DIGIT in order to ensure that (?-aP) unsets all ASCII restrictions for POSIX classes.
  PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL
This is a dangerous option. Use with care. By default, an unrecognized escape such as \j or a malformed one such as \x{2z} causes a compile-time error when detected by pcre2_compile(). Perl is somewhat inconsistent in handling such items: for example, \j is treated as a literal "j", and non-hexadecimal digits in \x{} are just ignored, though warnings are given in both cases if Perl's warning switch is enabled. However, a malformed octal number after \o{ always causes an error in Perl.

If the PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL extra option is passed to pcre2_compile(), all unrecognized or malformed escape sequences are treated as single-character escapes. For example, \j is a literal "j" and \x{2z} is treated as the literal string "x{2z}". Setting this option means that typos in patterns may go undetected and have unexpected results. Also note that a sequence such as [\N{] is interpreted as a malformed attempt at [\N{...}] and so is treated as [N{] whereas [\N] gives an error because an unqualified \N is a valid escape sequence but is not supported in a character class. To reiterate: this is a dangerous option. Use with great care.

  PCRE2_EXTRA_CASELESS_RESTRICT
When either PCRE2_UCP or PCRE2_UTF is set, caseless matching follows Unicode rules, which allow for more than two cases per character. There are two case-equivalent character sets that contain both ASCII and non-ASCII characters. The ASCII letter S is case-equivalent to U+017f (long S) and the ASCII letter K is case-equivalent to U+212a (Kelvin sign). This option disables recognition of case-equivalences that cross the ASCII/non-ASCII boundary. In a caseless match, the two characters must either both be ASCII or both be non-ASCII. The option can be changed within a pattern by the (*CASELESS_RESTRICT) or (?r) option settings.
  PCRE2_EXTRA_ESCAPED_CR_IS_LF
There are some legacy applications where the escape sequence \r in a pattern is expected to match a newline. If this option is set, \r in a pattern is converted to \n so that it matches a LF (linefeed) instead of a CR (carriage return) character. The option does not affect a literal CR in the pattern, nor does it affect CR specified as an explicit code point such as \x{0D}.
  PCRE2_EXTRA_MATCH_LINE
This option is provided for use by the -x option of pcre2grep. It causes the pattern only to match complete lines. This is achieved by automatically inserting the code for "^(?:" at the start of the compiled pattern and ")$" at the end. Thus, when PCRE2_MULTILINE is set, the matched line may be in the middle of the subject string. This option can be used with PCRE2_LITERAL.
  PCRE2_EXTRA_MATCH_WORD
This option is provided for use by the -w option of pcre2grep. It causes the pattern only to match strings that have a word boundary at the start and the end. This is achieved by automatically inserting the code for "\b(?:" at the start of the compiled pattern and ")\b" at the end. The option may be used with PCRE2_LITERAL. However, it is ignored if PCRE2_EXTRA_MATCH_LINE is also set.
  PCRE2_EXTRA_NO_BS0
If this option is set (note that its final character is the digit 0) it locks out the use of the sequence \0 unless at least one more octal digit follows.
  PCRE2_EXTRA_PYTHON_OCTAL
If this option is set, PCRE2 follows Python's rules for interpreting octal escape sequences. The rules for handling sequences such as \14, which could be an octal number or a backreference, are different. Details are given in the pcre2pattern documentation.
  PCRE2_EXTRA_NEVER_CALLOUT
If this option is set, PCRE2 treats callouts in the pattern as a syntax error, returning PCRE2_ERROR_CALLOUT_CALLER_DISABLED. This is useful if the application knows that a callout will not be provided to pcre2_match(), so that callouts in the pattern are not silently ignored.
  PCRE2_EXTRA_TURKISH_CASING
This option alters case-equivalence of the 'i' letters to follow the alphabet used by Turkish and Azeri languages. The option can be changed within a pattern by the (*TURKISH_CASING) start-of-pattern setting. Either the UTF or the UCP option must be set. In the 8-bit library, UTF must be set. This option cannot be combined with PCRE2_EXTRA_CASELESS_RESTRICT.

JUST-IN-TIME (JIT) COMPILATION

int pcre2_jit_compile(pcre2_code *code, uint32_t options);

int pcre2_jit_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length, PCRE2_SIZE startoffset, uint32_t options, pcre2_match_data *match_data, pcre2_match_context *mcontext);

void pcre2_jit_free_unused_memory(pcre2_general_context *gcontext);

pcre2_jit_stack *pcre2_jit_stack_create(size_t startsize, size_t maxsize, pcre2_general_context *gcontext);

void pcre2_jit_stack_assign(pcre2_match_context *mcontext, pcre2_jit_callback callback_function, void *callback_data);

void pcre2_jit_stack_free(pcre2_jit_stack *jit_stack);

These functions provide support for JIT compilation, which, if the just-in-time compiler is available, further processes a compiled pattern into machine code that executes much faster than the pcre2_match() interpretive matching function. Full details are given in the pcre2jit documentation.

JIT compilation is a heavyweight optimization. It can take some time for patterns to be analyzed, and for one-off matches and simple patterns the benefit of faster execution might be offset by a much slower compilation time. Most (but not all) patterns can be optimized by the JIT compiler.

LOCALE SUPPORT

const uint8_t *pcre2_maketables(pcre2_general_context *gcontext);

void pcre2_maketables_free(pcre2_general_context *gcontext, const uint8_t *tables);

PCRE2 handles caseless matching, and determines whether characters are letters, digits, or whatever, by reference to a set of tables, indexed by character code point. However, this applies only to characters whose code points are less than 256. By default, higher-valued code points never match escapes such as \w or \d.

When PCRE2 is built with Unicode support (the default), certain Unicode character properties can be tested with \p and \P, or, alternatively, the PCRE2_UCP option can be set when a pattern is compiled; this causes \w and friends to use Unicode property support instead of the built-in tables. PCRE2_UCP also causes upper/lower casing operations on characters with code points greater than 127 to use Unicode properties. These effects apply even when PCRE2_UTF is not set. There are, however, some PCRE2_EXTRA options (see above) that can be used to modify or suppress them.

The use of locales with Unicode is discouraged. If you are handling characters with code points greater than 127, you should either use Unicode support, or use locales, but not try to mix the two.

PCRE2 contains a built-in set of character tables that are used by default. These are sufficient for many applications. Normally, the internal tables recognize only ASCII characters. However, when PCRE2 is built, it is possible to cause the internal tables to be rebuilt in the default "C" locale of the local system, which may cause them to be different.

The built-in tables can be overridden by tables supplied by the application that calls PCRE2. These may be created in a different locale from the default. As more and more applications change to using Unicode, the need for this locale support is expected to die away.

External tables are built by calling the pcre2_maketables() function, in the relevant locale. The only argument to this function is a general context, which can be used to pass a custom memory allocator. If the argument is NULL, the system malloc() is used. The result can be passed to pcre2_compile() as often as necessary, by creating a compile context and calling pcre2_set_character_tables() to set the tables pointer therein.

For example, to build and use tables that are appropriate for the French locale (where accented characters with values greater than 127 are treated as letters), the following code could be used:

  setlocale(LC_CTYPE, "fr_FR");
  tables = pcre2_maketables(NULL);
  ccontext = pcre2_compile_context_create(NULL);
  pcre2_set_character_tables(ccontext, tables);
  re = pcre2_compile(..., ccontext);
The locale name "fr_FR" is used on Linux and other Unix-like systems; if you are using Windows, the name for the French locale is "french".

The pointer that is passed (via the compile context) to pcre2_compile() is saved with the compiled pattern, and the same tables are used by the matching functions. Thus, for any single pattern, compilation and matching both happen in the same locale, but different patterns can be processed in different locales.

It is the caller's responsibility to ensure that the memory containing the tables remains available while they are still in use. When they are no longer needed, you can discard them using pcre2_maketables_free(), which should pass as its first parameter the same general context that was used to create the tables.

Saving locale tables

The tables described above are just a sequence of binary bytes, which makes them independent of hardware characteristics such as endianness or whether the processor is 32-bit or 64-bit. A copy of the result of pcre2_maketables() can therefore be saved in a file or elsewhere and re-used later, even in a different program or on another computer. The size of the tables (number of bytes) must be obtained by calling pcre2_config() with the PCRE2_CONFIG_TABLES_LENGTH option because pcre2_maketables() does not return this value. Note that the pcre2_dftables program, which is part of the PCRE2 build system, can be used stand-alone to create a file that contains a set of binary tables. See the pcre2build documentation for details.

INFORMATION ABOUT A COMPILED PATTERN

int pcre2_pattern_info(const pcre2_code *code, uint32_t what, void *where);

The pcre2_pattern_info() function returns general information about a compiled pattern. For information about callouts, see the next section. The first argument for pcre2_pattern_info() is a pointer to the compiled pattern. The second argument specifies which piece of information is required, and the third argument is a pointer to a variable to receive the data. If the third argument is NULL, the first argument is ignored, and the function returns the size in bytes of the variable that is required for the information requested. Otherwise, the yield of the function is zero for success, or one of the following negative numbers:

  PCRE2_ERROR_NULL           the argument code was NULL
  PCRE2_ERROR_BADMAGIC       the "magic number" was not found
  PCRE2_ERROR_BADOPTION      the value of what was invalid
  PCRE2_ERROR_UNSET          the requested field is not set
The "magic number" is placed at the start of each compiled pattern as a simple check against passing an arbitrary memory pointer. Here is a typical call of pcre2_pattern_info(), to obtain the length of the compiled pattern:
  int rc;
  size_t length;
  rc = pcre2_pattern_info(
    re,               /* result of pcre2_compile() */
    PCRE2_INFO_SIZE,  /* what is required */
    &length);         /* where to put the data */
The possible values for the second argument are defined in pcre2.h, and are as follows:
  PCRE2_INFO_ALLOPTIONS
  PCRE2_INFO_ARGOPTIONS
  PCRE2_INFO_EXTRAOPTIONS
Return copies of the pattern's options. The third argument should point to a uint32_t variable. PCRE2_INFO_ARGOPTIONS returns exactly the options that were passed to pcre2_compile(), whereas PCRE2_INFO_ALLOPTIONS returns the compile options as modified by any top-level (*XXX) option settings such as (*UTF) at the start of the pattern itself. PCRE2_INFO_EXTRAOPTIONS returns the extra options that were set in the compile context by calling the pcre2_set_compile_extra_options() function.

For example, if the pattern /(*UTF)abc/ is compiled with the PCRE2_EXTENDED option, the result for PCRE2_INFO_ALLOPTIONS is PCRE2_EXTENDED and PCRE2_UTF. Option settings such as (?i) that can change within a pattern do not affect the result of PCRE2_INFO_ALLOPTIONS, even if they appear right at the start of the pattern. (This was different in some earlier releases.)

A pattern compiled without PCRE2_ANCHORED is automatically anchored by PCRE2 if the first significant item in every top-level branch is one of the following:

  ^     unless PCRE2_MULTILINE is set
  \A    always
  \G    always
  .*    sometimes - see below
When .* is the first significant item, anchoring is possible only when all the following are true:
  .* is not in an atomic group
  .* is not in a capture group that is the subject of a backreference
  PCRE2_DOTALL is in force for .*
  Neither (*PRUNE) nor (*SKIP) appears in the pattern
  PCRE2_NO_DOTSTAR_ANCHOR is not set
  Dotstar anchoring has not been disabled with PCRE2_DOTSTAR_ANCHOR_OFF
For patterns that are auto-anchored, the PCRE2_ANCHORED bit is set in the options returned for PCRE2_INFO_ALLOPTIONS.
  PCRE2_INFO_BACKREFMAX
Return the number of the highest backreference in the pattern. The third argument should point to a uint32_t variable. Named capture groups acquire numbers as well as names, and these count towards the highest backreference. Backreferences such as \4 or \g{12} match the captured characters of the given group, but in addition, the check that a capture group is set in a conditional group such as (?(3)a|b) is also a backreference. Zero is returned if there are no backreferences.
  PCRE2_INFO_BSR
The output is a uint32_t integer whose value indicates what character sequences the \R escape sequence matches. A value of PCRE2_BSR_UNICODE means that \R matches any Unicode line ending sequence; a value of PCRE2_BSR_ANYCRLF means that \R matches only CR, LF, or CRLF.
  PCRE2_INFO_CAPTURECOUNT
Return the highest capture group number in the pattern. In patterns where (?| is not used, this is also the total number of capture groups. The third argument should point to a uint32_t variable.
  PCRE2_INFO_DEPTHLIMIT
If the pattern set a backtracking depth limit by including an item of the form (*LIMIT_DEPTH=nnnn) at the start, the value is returned. The third argument should point to a uint32_t integer. If no such value has been set, the call to pcre2_pattern_info() returns the error PCRE2_ERROR_UNSET. Note that this limit will only be used during matching if it is less than the limit set or defaulted by the caller of the match function.
  PCRE2_INFO_FIRSTBITMAP
In the absence of a single first code unit for a non-anchored pattern, pcre2_compile() may construct a 256-bit table that defines a fixed set of values for the first code unit in any match. For example, a pattern that starts with [abc] results in a table with three bits set. When code unit values greater than 255 are supported, the flag bit for 255 means "any code unit of value 255 or above". If such a table was constructed, a pointer to it is returned. Otherwise NULL is returned. The third argument should point to a const uint8_t * variable.
  PCRE2_INFO_FIRSTCODETYPE
Return information about the first code unit of any matched string, for a non-anchored pattern. The third argument should point to a uint32_t variable. If there is a fixed first value, for example, the letter "c" from a pattern such as (cat|cow|coyote), 1 is returned, and the value can be retrieved using PCRE2_INFO_FIRSTCODEUNIT. If there is no fixed first value, but it is known that a match can occur only at the start of the subject or following a newline in the subject, 2 is returned. Otherwise, and for anchored patterns, 0 is returned.
  PCRE2_INFO_FIRSTCODEUNIT
Return the value of the first code unit of any matched string for a pattern where PCRE2_INFO_FIRSTCODETYPE returns 1; otherwise return 0. The third argument should point to a uint32_t variable. In the 8-bit library, the value is always less than 256. In the 16-bit library the value can be up to 0xffff. In the 32-bit library in UTF-32 mode the value can be up to 0x10ffff, and up to 0xffffffff when not using UTF-32 mode.
  PCRE2_INFO_FRAMESIZE
Return the size (in bytes) of the data frames that are used to remember backtracking positions when the pattern is processed by pcre2_match() without the use of JIT. The third argument should point to a size_t variable. The frame size depends on the number of capturing parentheses in the pattern. Each additional capture group adds two PCRE2_SIZE variables.
  PCRE2_INFO_HASBACKSLASHC
Return 1 if the pattern contains any instances of \C, otherwise 0. The third argument should point to a uint32_t variable.
  PCRE2_INFO_HASCRORLF
Return 1 if the pattern contains any explicit matches for CR or LF characters, otherwise 0. The third argument should point to a uint32_t variable. An explicit match is either a literal CR or LF character, or \r or \n or one of the equivalent hexadecimal or octal escape sequences.
  PCRE2_INFO_HEAPLIMIT
If the pattern set a heap memory limit by including an item of the form (*LIMIT_HEAP=nnnn) at the start, the value is returned. The third argument should point to a uint32_t integer. If no such value has been set, the call to pcre2_pattern_info() returns the error PCRE2_ERROR_UNSET. Note that this limit will only be used during matching if it is less than the limit set or defaulted by the caller of the match function.
  PCRE2_INFO_JCHANGED
Return 1 if the (?J) or (?-J) option setting is used in the pattern, otherwise 0. The third argument should point to a uint32_t variable. (?J) and (?-J) set and unset the local PCRE2_DUPNAMES option, respectively.
  PCRE2_INFO_JITSIZE
If the compiled pattern was successfully processed by pcre2_jit_compile(), return the size of the JIT compiled code, otherwise return zero. The third argument should point to a size_t variable.
  PCRE2_INFO_LASTCODETYPE
Return 1 if there is a rightmost literal code unit that must exist in any matched string, other than at its start. The third argument should point to a uint32_t variable. If there is no such value, 0 is returned. When 1 is returned, the code unit value itself can be retrieved using PCRE2_INFO_LASTCODEUNIT. For anchored patterns, a last literal value is recorded only if it follows something of variable length. For example, for the pattern /^a\d+z\d+/ the returned value is 1 (with "z" returned from PCRE2_INFO_LASTCODEUNIT), but for /^a\dz\d/ the returned value is 0.
  PCRE2_INFO_LASTCODEUNIT
Return the value of the rightmost literal code unit that must exist in any matched string, other than at its start, for a pattern where PCRE2_INFO_LASTCODETYPE returns 1. Otherwise, return 0. The third argument should point to a uint32_t variable.
  PCRE2_INFO_MATCHEMPTY
Return 1 if the pattern might match an empty string, otherwise 0. The third argument should point to a uint32_t variable. When a pattern contains recursive subroutine calls it is not always possible to determine whether or not it can match an empty string. PCRE2 takes a cautious approach and returns 1 in such cases.
  PCRE2_INFO_MATCHLIMIT
If the pattern set a match limit by including an item of the form (*LIMIT_MATCH=nnnn) at the start, the value is returned. The third argument should point to a uint32_t integer. If no such value has been set, the call to pcre2_pattern_info() returns the error PCRE2_ERROR_UNSET. Note that this limit will only be used during matching if it is less than the limit set or defaulted by the caller of the match function.
  PCRE2_INFO_MAXLOOKBEHIND
A lookbehind assertion moves back a certain number of characters (not code units) when it starts to process each of its branches. This request returns the largest of these backward moves. The third argument should point to a uint32_t integer. The simple assertions \b and \B require a one-character lookbehind and cause PCRE2_INFO_MAXLOOKBEHIND to return 1 in the absence of anything longer. \A also registers a one-character lookbehind, though it does not actually inspect the previous character.

Note that this information is useful for multi-segment matching only if the pattern contains no nested lookbehinds. For example, the pattern (?<=a(?<=ba)c) returns a maximum lookbehind of 2, but when it is processed, the first lookbehind moves back by two characters, matches one character, then the nested lookbehind also moves back by two characters. This puts the matching point three characters earlier than it was at the start. PCRE2_INFO_MAXLOOKBEHIND is really only useful as a debugging tool. See the pcre2partial documentation for a discussion of multi-segment matching.

  PCRE2_INFO_MINLENGTH
If a minimum length for matching subject strings was computed, its value is returned. Otherwise the returned value is 0. This value is not computed when PCRE2_NO_START_OPTIMIZE is set. The value is a number of characters, which in UTF mode may be different from the number of code units. The third argument should point to a uint32_t variable. The value is a lower bound to the length of any matching string. There may not be any strings of that length that do actually match, but every string that does match is at least that long.
  PCRE2_INFO_NAMECOUNT
  PCRE2_INFO_NAMEENTRYSIZE
  PCRE2_INFO_NAMETABLE
PCRE2 supports the use of named as well as numbered capturing parentheses. The names are just an additional way of identifying the parentheses, which still acquire numbers. Several convenience functions such as pcre2_substring_get_byname() are provided for extracting captured substrings by name. It is also possible to extract the data directly, by first converting the name to a number in order to access the correct pointers in the output vector (described with pcre2_match() below). To do the conversion, you need to use the name-to-number map, which is described by these three values.

The map consists of a number of fixed-size entries. PCRE2_INFO_NAMECOUNT gives the number of entries, and PCRE2_INFO_NAMEENTRYSIZE gives the size of each entry in code units; both of these return a uint32_t value. The entry size depends on the length of the longest name.

PCRE2_INFO_NAMETABLE returns a pointer to the first entry of the table. This is a PCRE2_SPTR pointer to a block of code units. In the 8-bit library, the first two bytes of each entry are the number of the capturing parenthesis, most significant byte first. In the 16-bit library, the pointer points to 16-bit code units, the first of which contains the parenthesis number. In the 32-bit library, the pointer points to 32-bit code units, the first of which contains the parenthesis number. The rest of the entry is the corresponding name, zero terminated.

The names are in alphabetical order. If (?| is used to create multiple capture groups with the same number, as described in the section on duplicate group numbers in the pcre2pattern page, the groups may be given the same name, but there is only one entry in the table. Different names for groups of the same number are not permitted.

Duplicate names for capture groups with different numbers are permitted, but only if PCRE2_DUPNAMES is set. They appear in the table in the order in which they were found in the pattern. In the absence of (?| this is the order of increasing number; when (?| is used this is not necessarily the case because later capture groups may have lower numbers.

As a simple example of the name/number table, consider the following pattern after compilation by the 8-bit library (assume PCRE2_EXTENDED is set, so white space - including newlines - is ignored):

  (?<date> (?<year>(\d\d)?\d\d) - (?<month>\d\d) - (?<day>\d\d) )
There are four named capture groups, so the table has four entries, and each entry in the table is eight bytes long. The table is as follows, with non-printing bytes shown in hexadecimal, and undefined bytes shown as ??:
  00 01 d  a  t  e  00 ??
  00 05 d  a  y  00 ?? ??
  00 04 m  o  n  t  h  00
  00 02 y  e  a  r  00 ??
When writing code to extract data from named capture groups using the name-to-number map, remember that the length of the entries is likely to be different for each compiled pattern.
  PCRE2_INFO_NEWLINE
The output is one of the following uint32_t values:
  PCRE2_NEWLINE_CR       Carriage return (CR)
  PCRE2_NEWLINE_LF       Linefeed (LF)
  PCRE2_NEWLINE_CRLF     Carriage return, linefeed (CRLF)
  PCRE2_NEWLINE_ANY      Any Unicode line ending
  PCRE2_NEWLINE_ANYCRLF  Any of CR, LF, or CRLF
  PCRE2_NEWLINE_NUL      The NUL character (binary zero)
This identifies the character sequence that will be recognized as meaning "newline" while matching.
  PCRE2_INFO_SIZE
Return the size of the compiled pattern in bytes (for all three libraries). The third argument should point to a size_t variable. This value includes the size of the general data block that precedes the code units of the compiled pattern itself. The value that is used when pcre2_compile() is getting memory in which to place the compiled pattern may be slightly larger than the value returned by this option, because there are cases where the code that calculates the size has to over-estimate. Processing a pattern with the JIT compiler does not alter the value returned by this option.

INFORMATION ABOUT A PATTERN'S CALLOUTS

int pcre2_callout_enumerate(const pcre2_code *code, int (*callback)(pcre2_callout_enumerate_block *, void *), void *user_data);

A script language that supports the use of string arguments in callouts might like to scan all the callouts in a pattern before running the match. This can be done by calling pcre2_callout_enumerate(). The first argument is a pointer to a compiled pattern, the second points to a callback function, and the third is arbitrary user data. The callback function is called for every callout in the pattern in the order in which they appear. Its first argument is a pointer to a callout enumeration block, and its second argument is the user_data value that was passed to pcre2_callout_enumerate(). The contents of the callout enumeration block are described in the pcre2callout documentation, which also gives further details about callouts.

SERIALIZATION AND PRECOMPILING

It is possible to save compiled patterns on disc or elsewhere, and reload them later, subject to a number of restrictions. The host on which the patterns are reloaded must be running the same version of PCRE2, with the same code unit width, and must also have the same endianness, pointer width, and PCRE2_SIZE type. Before compiled patterns can be saved, they must be converted to a "serialized" form, which in the case of PCRE2 is really just a bytecode dump. The functions whose names begin with pcre2_serialize_ are used for converting to and from the serialized form. They are described in the pcre2serialize documentation. Note that PCRE2 serialization does not convert compiled patterns to an abstract format like Java or .NET serialization.

THE MATCH DATA BLOCK

pcre2_match_data *pcre2_match_data_create(uint32_t ovecsize, pcre2_general_context *gcontext);

pcre2_match_data *pcre2_match_data_create_from_pattern( const pcre2_code *code, pcre2_general_context *gcontext);

void pcre2_match_data_free(pcre2_match_data *match_data);

Information about a successful or unsuccessful match is placed in a match data block, which is an opaque structure that is accessed by function calls. In particular, the match data block contains a vector of offsets into the subject string that define the matched parts of the subject. This is known as the ovector.

Before calling pcre2_match(), pcre2_dfa_match(), or pcre2_jit_match() you must create a match data block by calling one of the creation functions above. For pcre2_match_data_create(), the first argument is the number of pairs of offsets in the ovector.

When using pcre2_match(), one pair of offsets is required to identify the string that matched the whole pattern, with an additional pair for each captured substring. For example, a value of 4 creates enough space to record the matched portion of the subject plus three captured substrings.

When using pcre2_dfa_match() there may be multiple matched substrings of different lengths at the same point in the subject. The ovector should be made large enough to hold as many as are expected.

A minimum of 1 pair is imposed by pcre2_match_data_create(), so it is always possible to return the overall matched string in the case of pcre2_match() or the longest match in the case of pcre2_dfa_match(). The maximum number of pairs is 65535; if the first argument of pcre2_match_data_create() is greater than this, 65535 is used.

The second argument of pcre2_match_data_create() is a pointer to a general context, which can specify custom memory management for obtaining the memory for the match data block. If you are not using custom memory management, pass NULL, which causes malloc() to be used.

For pcre2_match_data_create_from_pattern(), the first argument is a pointer to a compiled pattern. The ovector is created to be exactly the right size to hold all the substrings a pattern might capture when matched using pcre2_match(). You should not use this call when matching with pcre2_dfa_match(). The second argument is again a pointer to a general context, but in this case if NULL is passed, the memory is obtained using the same allocator that was used for the compiled pattern (custom or default).

A match data block can be used many times, with the same or different compiled patterns. You can extract information from a match data block after a match operation has finished, using functions that are described in the sections on matched strings and other match data below.

When a call of pcre2_match() fails, valid data is available in the match block only when the error is PCRE2_ERROR_NOMATCH, PCRE2_ERROR_PARTIAL, or one of the error codes for an invalid UTF string. Exactly what is available depends on the error, and is detailed below.

When one of the matching functions is called, pointers to the compiled pattern and the subject string are set in the match data block so that they can be referenced by the extraction functions after a successful match. After running a match, you must not free a compiled pattern or a subject string until after all operations on the match data block (for that match) have taken place, unless, in the case of the subject string, you have used the PCRE2_COPY_MATCHED_SUBJECT option, which is described in the section entitled "Option bits for pcre2_match()" below.

When a match data block itself is no longer needed, it should be freed by calling pcre2_match_data_free(). If this function is called with a NULL argument, it returns immediately, without doing anything.

MEMORY USE FOR MATCH DATA BLOCKS

PCRE2_SIZE pcre2_get_match_data_size(pcre2_match_data *match_data);

PCRE2_SIZE pcre2_get_match_data_heapframes_size( pcre2_match_data *match_data);

The size of a match data block depends on the size of the ovector that it contains. The function pcre2_get_match_data_size() returns the size, in bytes, of the block that is its argument.

When pcre2_match() runs interpretively (that is, without using JIT), it makes use of a vector of data frames for remembering backtracking positions. The size of each individual frame depends on the number of capturing parentheses in the pattern and can be obtained by calling pcre2_pattern_info() with the PCRE2_INFO_FRAMESIZE option (see the section entitled "Information about a compiled pattern" above).

Heap memory is used for the frames vector; if the initial memory block turns out to be too small during matching, it is automatically expanded. When pcre2_match() returns, the memory is not freed, but remains attached to the match data block, for use by any subsequent matches that use the same block. It is automatically freed when the match data block itself is freed.

You can find the current size of the frames vector that a match data block owns by calling pcre2_get_match_data_heapframes_size(). For a newly created match data block the size will be zero. Some types of match may require a lot of frames and thus a large vector; applications that run in environments where memory is constrained can check this and free the match data block if the heap frames vector has become too big.

MATCHING A PATTERN: THE TRADITIONAL FUNCTION

int pcre2_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length, PCRE2_SIZE startoffset, uint32_t options, pcre2_match_data *match_data, pcre2_match_context *mcontext);

The function pcre2_match() is called to match a subject string against a compiled pattern, which is passed in the code argument. You can call pcre2_match() with the same code argument as many times as you like, in order to find multiple matches in the subject string or to match different subject strings with the same pattern.

This function is the main matching facility of the library, and it operates in a Perl-like manner. For specialist use there is also an alternative matching function, which is described below in the section about the pcre2_dfa_match() function.

Here is an example of a simple call to pcre2_match():

  pcre2_match_data *md = pcre2_match_data_create(4, NULL);
  int rc = pcre2_match(
    re,             /* result of pcre2_compile() */
    "some string",  /* the subject string */
    11,             /* the length of the subject string */
    0,              /* start at offset 0 in the subject */
    0,              /* default options */
    md,             /* the match data block */
    NULL);          /* a match context; NULL means use defaults */
If the subject string is zero-terminated, the length can be given as PCRE2_ZERO_TERMINATED. A match context must be provided if certain less common matching parameters are to be changed. For details, see the section on the match context above.

The string to be matched by pcre2_match()

The subject string is passed to pcre2_match() as a pointer in subject, a length in length, and a starting offset in startoffset. The length and offset are in code units, not characters. That is, they are in bytes for the 8-bit library, 16-bit code units for the 16-bit library, and 32-bit code units for the 32-bit library, whether or not UTF processing is enabled. As a special case, if subject is NULL and length is zero, the subject is assumed to be an empty string. If length is non-zero, an error occurs if subject is NULL.

If startoffset is greater than the length of the subject, pcre2_match() returns PCRE2_ERROR_BADOFFSET. When the starting offset is zero, the search for a match starts at the beginning of the subject, and this is by far the most common case. In UTF-8 or UTF-16 mode, the starting offset must point to the start of a character, or to the end of the subject (in UTF-32 mode, one code unit equals one character, so all offsets are valid). Like the pattern string, the subject may contain binary zeros.

A non-zero starting offset is useful when searching for another match in the same subject by calling pcre2_match() again after a previous success. Setting startoffset differs from passing over a shortened string and setting PCRE2_NOTBOL in the case of a pattern that begins with any kind of lookbehind. For example, consider the pattern

  \Biss\B
which finds occurrences of "iss" in the middle of words. (\B matches only if the current position in the subject is not a word boundary.) When applied to the string "Mississippi" the first call to pcre2_match() finds the first occurrence. If pcre2_match() is called again with just the remainder of the subject, namely "issippi", it does not match, because \B is always false at the start of the subject, which is deemed to be a word boundary. However, if pcre2_match() is passed the entire string again, but with startoffset set to 4, it finds the second occurrence of "iss" because it is able to look behind the starting point to discover that it is preceded by a letter.

Finding all the matches in a subject is tricky when the pattern can match an empty string. PCRE2 includes a helper API to assist with this; see the section entitled "Iterating over all matches" below for details.

If a non-zero starting offset is passed when the pattern is anchored, a single attempt to match at the given offset is made. This can only succeed if the pattern does not require the match to be at the start of the subject. In other words, the anchoring must be the result of setting the PCRE2_ANCHORED option or the use of .* with PCRE2_DOTALL, not by starting the pattern with ^ or \A.

Option bits for pcre2_match()

The unused bits of the options argument for pcre2_match() must be zero. The only bits that may be set are PCRE2_ANCHORED, PCRE2_COPY_MATCHED_SUBJECT, PCRE2_DISABLE_RECURSELOOP_CHECK, PCRE2_ENDANCHORED, PCRE2_NOTBOL, PCRE2_NOTEOL, PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART, PCRE2_NO_JIT, PCRE2_NO_UTF_CHECK, PCRE2_PARTIAL_HARD, and PCRE2_PARTIAL_SOFT. Their action is described below.

Setting PCRE2_ANCHORED or PCRE2_ENDANCHORED at match time is not supported by the just-in-time (JIT) compiler. If either is set, JIT matching is disabled and the interpretive code in pcre2_match() is run. PCRE2_DISABLE_RECURSELOOP_CHECK is ignored by JIT, but apart from PCRE2_NO_JIT (obviously), the remaining options are supported for JIT matching.

  PCRE2_ANCHORED
The PCRE2_ANCHORED option limits pcre2_match() to matching at the first matching position. If a pattern was compiled with PCRE2_ANCHORED, or turned out to be anchored by virtue of its contents, it cannot be made unanchored at matching time. Note that setting the option at match time disables JIT matching.
  PCRE2_COPY_MATCHED_SUBJECT
By default, a pointer to the subject is remembered in the match data block so that, after a successful match, it can be referenced by the substring extraction functions. This means that the subject's memory must not be freed until all such operations are complete. For some applications where the lifetime of the subject string is not guaranteed, it may be necessary to make a copy of the subject string, but it is wasteful to do this unless the match is successful. After a successful match, if PCRE2_COPY_MATCHED_SUBJECT is set, the subject is copied and the new pointer is remembered in the match data block instead of the original subject pointer. The memory allocator that was used for the match block itself is used. The copy is automatically freed when pcre2_match_data_free() is called to free the match data block. It is also automatically freed if the match data block is re-used for another match operation.
  PCRE2_DISABLE_RECURSELOOP_CHECK
This option is relevant only to pcre2_match() for interpretive matching. It is ignored when JIT is used, and is forbidden for pcre2_dfa_match().

The use of recursion in patterns can lead to infinite loops. In the interpretive matcher these would be eventually caught by the match or heap limits, but this could take a long time and/or use a lot of memory if the limits are large. There is therefore a check at the start of each recursion. If the same group is still active from a previous call, and the current subject pointer is the same as it was at the start of that group, and the furthest inspected character of the subject has not changed, an error is generated.

There are rare cases of matches that would complete, but nevertheless trigger this error. This option disables the check. It is provided mainly for testing when comparing JIT and interpretive behaviour.

  PCRE2_ENDANCHORED
If the PCRE2_ENDANCHORED option is set, any string that pcre2_match() matches must be right at the end of the subject string. Note that setting the option at match time disables JIT matching.
  PCRE2_NOTBOL
This option specifies that the first character of the subject string is not the beginning of a line, so the circumflex metacharacter should not match before it. Setting this without having set PCRE2_MULTILINE at compile time causes circumflex never to match. This option affects only the behaviour of the circumflex metacharacter. It does not affect \A.
  PCRE2_NOTEOL
This option specifies that the end of the subject string is not the end of a line, so the dollar metacharacter should not match it nor (except in multiline mode) a newline immediately before it. Setting this without having set PCRE2_MULTILINE at compile time causes dollar never to match. This option affects only the behaviour of the dollar metacharacter. It does not affect \Z or \z.
  PCRE2_NOTEMPTY
An empty string is not considered to be a valid match if this option is set. If there are alternatives in the pattern, they are tried. If all the alternatives match the empty string, the entire match fails. For example, if the pattern
  a?b?
is applied to a string not beginning with "a" or "b", it matches an empty string at the start of the subject. With PCRE2_NOTEMPTY set, this match is not valid, so pcre2_match() searches further into the string for occurrences of "a" or "b".
  PCRE2_NOTEMPTY_ATSTART
This is like PCRE2_NOTEMPTY, except that it locks out an empty string match only at the first matching position, that is, at the start of the subject plus the starting offset. An empty string match later in the subject is permitted. If the pattern is anchored, such a match can occur only if the pattern contains \K.
  PCRE2_NO_JIT
By default, if a pattern has been successfully processed by pcre2_jit_compile(), JIT is automatically used when pcre2_match() is called with options that JIT supports. Setting PCRE2_NO_JIT disables the use of JIT; it forces matching to be done by the interpreter.
  PCRE2_NO_UTF_CHECK
When PCRE2_UTF is set at compile time, the validity of the subject as a UTF string is checked unless PCRE2_NO_UTF_CHECK is passed to pcre2_match() or PCRE2_MATCH_INVALID_UTF was passed to pcre2_compile(). The latter special case is discussed in detail in the pcre2unicode documentation.

In the default case, if a non-zero starting offset is given, the check is applied only to that part of the subject that could be inspected during matching, and there is a check that the starting offset points to the first code unit of a character or to the end of the subject. If there are no lookbehind assertions in the pattern, the check starts at the starting offset. Otherwise, it starts at the length of the longest lookbehind before the starting offset, or at the start of the subject if there are not that many characters before the starting offset. Note that the sequences \b and \B are one-character lookbehinds.

The check is carried out before any other processing takes place, and a negative error code is returned if the check fails. There are several UTF error codes for each code unit width, corresponding to different problems with the code unit sequence. There are discussions about the validity of UTF-8 strings, UTF-16 strings, and UTF-32 strings in the pcre2unicode documentation.

If you know that your subject is valid, and you want to skip this check for performance reasons, you can set the PCRE2_NO_UTF_CHECK option when calling pcre2_match(). You might want to do this for the second and subsequent calls to pcre2_match() if you are making repeated calls to find multiple matches in the same subject string.

Warning: Unless PCRE2_MATCH_INVALID_UTF was set at compile time, when PCRE2_NO_UTF_CHECK is set at match time the effect of passing an invalid string as a subject, or an invalid value of startoffset, is undefined. Your program may crash or loop indefinitely or give wrong results.

  PCRE2_PARTIAL_HARD
  PCRE2_PARTIAL_SOFT
These options turn on the partial matching feature. A partial match occurs if the end of the subject string is reached successfully, but there are not enough subject characters to complete the match. In addition, either at least one character must have been inspected or the pattern must contain a lookbehind, or the pattern must be one that could match an empty string.

If this situation arises when PCRE2_PARTIAL_SOFT (but not PCRE2_PARTIAL_HARD) is set, matching continues by testing any remaining alternatives. Only if no complete match can be found is PCRE2_ERROR_PARTIAL returned instead of PCRE2_ERROR_NOMATCH. In other words, PCRE2_PARTIAL_SOFT specifies that the caller is prepared to handle a partial match, but only if no complete match can be found.

If PCRE2_PARTIAL_HARD is set, it overrides PCRE2_PARTIAL_SOFT. In this case, if a partial match is found, pcre2_match() immediately returns PCRE2_ERROR_PARTIAL, without considering any other alternatives. In other words, when PCRE2_PARTIAL_HARD is set, a partial match is considered to be more important than an alternative complete match.

There is a more detailed discussion of partial and multi-segment matching, with examples, in the pcre2partial documentation.

NEWLINE HANDLING WHEN MATCHING

When PCRE2 is built, a default newline convention is set; this is usually the standard convention for the operating system. The default can be overridden in a compile context by calling pcre2_set_newline(). It can also be overridden by starting a pattern string with, for example, (*CRLF), as described in the section on newline conventions in the pcre2pattern page. During matching, the newline choice affects the behaviour of the dot, circumflex, and dollar metacharacters. It may also alter the way the match starting position is advanced after a match failure for an unanchored pattern.

When PCRE2_NEWLINE_CRLF, PCRE2_NEWLINE_ANYCRLF, or PCRE2_NEWLINE_ANY is set as the newline convention, and a match attempt for an unanchored pattern fails when the current starting position is at a CRLF sequence, and the pattern contains no explicit matches for CR or LF characters, the match position is advanced by two characters instead of one, in other words, to after the CRLF.

The above rule is a compromise that makes the most common cases work as expected. For example, if the pattern is .+A (and the PCRE2_DOTALL option is not set), it does not match the string "\r\nA" because, after failing at the start, it skips both the CR and the LF before retrying. However, the pattern [\r\n]A does match that string, because it contains an explicit CR or LF reference, and so advances only by one character after the first failure.

An explicit match for CR or LF is either a literal appearance of one of those characters in the pattern, or one of the \r or \n or equivalent octal or hexadecimal escape sequences. Implicit matches such as [^X] do not count, nor does \s, even though it includes CR and LF in the characters that it matches.

Notwithstanding the above, anomalous effects may still occur when CRLF is a valid newline sequence and explicit \r or \n escapes appear in the pattern.

HOW PCRE2_MATCH() RETURNS A STRING AND CAPTURED SUBSTRINGS

uint32_t pcre2_get_ovector_count(pcre2_match_data *match_data);

PCRE2_SIZE *pcre2_get_ovector_pointer(pcre2_match_data *match_data);

In general, a pattern matches a certain portion of the subject, and in addition, further substrings from the subject may be picked out by parenthesized parts of the pattern. Following the usage in Jeffrey Friedl's book, this is called "capturing" in what follows, and the phrase "capture group" (Perl terminology) is used for a fragment of a pattern that picks out a substring. PCRE2 supports several other kinds of parenthesized group that do not cause substrings to be captured. The pcre2_pattern_info() function can be used to find out how many capture groups there are in a compiled pattern.

You can use auxiliary functions for accessing captured substrings by number or by name, as described in sections below.

Alternatively, you can make direct use of the vector of PCRE2_SIZE values, called the ovector, which contains the offsets of captured strings. It is part of the match data block. The function pcre2_get_ovector_pointer() returns the address of the ovector, and pcre2_get_ovector_count() returns the number of pairs of values it contains.

Within the ovector, the first in each pair of values is set to the offset of the first code unit of a substring, and the second is set to the offset of the first code unit after the end of a substring. These values are always code unit offsets, not character offsets. That is, they are byte offsets in the 8-bit library, 16-bit offsets in the 16-bit library, and 32-bit offsets in the 32-bit library.

After a partial match (error return PCRE2_ERROR_PARTIAL), only the first pair of offsets (that is, ovector[0] and ovector[1]) are set. They identify the part of the subject that was partially matched. See the pcre2partial documentation for details of partial matching.

After a fully successful match, the first pair of offsets identifies the portion of the subject string that was matched by the entire pattern. The next pair is used for the first captured substring, and so on. The value returned by pcre2_match() is one more than the highest numbered pair that has been set. For example, if two substrings have been captured, the returned value is 3. If there are no captured substrings, the return value from a successful match is 1, indicating that just the first pair of offsets has been set.

If a pattern uses the \K escape sequence within a positive lookahead assertion, the reported start of a successful match can be greater than the end of the match. For example, if the pattern (?=ab\K) is matched against "ab", the start and end offset values for the match are 2 and 0.

If a capture group is matched repeatedly within a single match operation, it is the last portion of the subject that it matched that is returned.

If the ovector is too small to hold all the captured substring offsets, as much as possible is filled in, and the function returns a value of zero. If captured substrings are not of interest, pcre2_match() may be called with a match data block whose ovector is of minimum length (that is, one pair).

It is possible for capture group number n+1 to match some part of the subject when group n has not been used at all. For example, if the string "abc" is matched against the pattern (a|(z))(bc) the return from the function is 4, and groups 1 and 3 are matched, but 2 is not. When this happens, both values in the offset pairs corresponding to unused groups are set to PCRE2_UNSET.

Offset values that correspond to unused groups at the end of the expression are also set to PCRE2_UNSET. For example, if the string "abc" is matched against the pattern (abc)(x(yz)?)? groups 2 and 3 are not matched. The return from the function is 2, because the highest used capture group number is 1. The offsets for the second and third capture groups (assuming the vector is large enough, of course) are set to PCRE2_UNSET.

Elements in the ovector that do not correspond to capturing parentheses in the pattern are never changed. That is, if a pattern contains n capturing parentheses, no more than ovector[0] to ovector[2n+1] are set by pcre2_match(). The other elements retain whatever values they previously had. After a failed match attempt, the contents of the ovector are unchanged.

OTHER INFORMATION ABOUT A MATCH

PCRE2_SPTR pcre2_get_mark(pcre2_match_data *match_data);

PCRE2_SIZE pcre2_get_startchar(pcre2_match_data *match_data);

As well as the offsets in the ovector, other information about a match is retained in the match data block and can be retrieved by the above functions in appropriate circumstances. If they are called at other times, the result is undefined.

After a successful match, a partial match (PCRE2_ERROR_PARTIAL), or a failure to match (PCRE2_ERROR_NOMATCH), a mark name may be available. The function pcre2_get_mark() can be called to access this name, which can be specified in the pattern by any of the backtracking control verbs, not just (*MARK). The same function applies to all the verbs. It returns a pointer to the zero-terminated name, which is within the compiled pattern. If no name is available, NULL is returned. The length of the name (excluding the terminating zero) is stored in the code unit that precedes the name. You should use this length instead of relying on the terminating zero if the name might contain a binary zero.

After a successful match, the name that is returned is the last mark name encountered on the matching path through the pattern. Instances of backtracking verbs without names do not count. Thus, for example, if the matching path contains (*MARK:A)(*PRUNE), the name "A" is returned. After a "no match" or a partial match, the last encountered name is returned. For example, consider this pattern:

  ^(*MARK:A)((*MARK:B)a|b)c
When it matches "bc", the returned name is A. The B mark is "seen" in the first branch of the group, but it is not on the matching path. On the other hand, when this pattern fails to match "bx", the returned name is B.

Warning: By default, certain start-of-match optimizations are used to give a fast "no match" result in some situations. For example, if the anchoring is removed from the pattern above, there is an initial check for the presence of "c" in the subject before running the matching engine. This check fails for "bx", causing a match failure without seeing any marks. You can disable the start-of-match optimizations by setting the PCRE2_NO_START_OPTIMIZE option for pcre2_compile() or by starting the pattern with (*NO_START_OPT).

After a successful match, a partial match, or one of the invalid UTF errors (for example, PCRE2_ERROR_UTF8_ERR5), pcre2_get_startchar() can be called. After a successful or partial match it returns the code unit offset of the character at which the match started. For a non-partial match, this can be different to the value of ovector[0] if the pattern contains the \K escape sequence. After a partial match, however, this value is always the same as ovector[0] because \K does not affect the result of a partial match.

After a UTF check failure, pcre2_get_startchar() can be used to obtain the code unit offset of the invalid UTF character. Details are given in the pcre2unicode page.

ERROR RETURNS FROM pcre2_match()

If pcre2_match() fails, it returns a negative number. This can be converted to a text string by calling the pcre2_get_error_message() function (see "Obtaining a textual error message" below). Negative error codes are also returned by other functions, and are documented with them. The codes are given names in the header file. If UTF checking is in force and an invalid UTF subject string is detected, one of a number of UTF-specific negative error codes is returned. Details are given in the pcre2unicode page. The following are the other errors that may be returned by pcre2_match():

  PCRE2_ERROR_NOMATCH
The subject string did not match the pattern.
  PCRE2_ERROR_PARTIAL
The subject string did not match, but it did match partially. See the pcre2partial documentation for details of partial matching.
  PCRE2_ERROR_BADMAGIC
PCRE2 stores a 4-byte "magic number" at the start of the compiled code, to catch the case when it is passed a junk pointer. This is the error that is returned when the magic number is not present.
  PCRE2_ERROR_BADMODE
This error is given when a compiled pattern is passed to a function in a library of a different code unit width, for example, a pattern compiled by the 8-bit library is passed to a 16-bit or 32-bit library function.
  PCRE2_ERROR_BADOFFSET
The value of startoffset was greater than the length of the subject.
  PCRE2_ERROR_BADOPTION
An unrecognized bit was set in the options argument.
  PCRE2_ERROR_BADUTFOFFSET
The UTF code unit sequence that was passed as a subject was checked and found to be valid (the PCRE2_NO_UTF_CHECK option was not set), but the value of startoffset did not point to the beginning of a UTF character or the end of the subject.
  PCRE2_ERROR_CALLOUT
This error is never generated by pcre2_match() itself. It is provided for use by callout functions that want to cause pcre2_match() or pcre2_callout_enumerate() to return a distinctive error code. See the pcre2callout documentation for details.
  PCRE2_ERROR_DEPTHLIMIT
The nested backtracking depth limit was reached.
  PCRE2_ERROR_HEAPLIMIT
The heap limit was reached.
  PCRE2_ERROR_INTERNAL
An unexpected internal error has occurred. This error could be caused by a bug in PCRE2 or by overwriting of the compiled pattern.
  PCRE2_ERROR_JIT_STACKLIMIT
This error is returned when a pattern that was successfully studied using JIT is being matched, but the memory available for the just-in-time processing stack is not large enough. See the pcre2jit documentation for more details.
  PCRE2_ERROR_MATCHLIMIT
The backtracking match limit was reached.
  PCRE2_ERROR_NOMEMORY
Heap memory is used to remember backtracking points. This error is given when the memory allocation function (default or custom) fails. Note that a different error, PCRE2_ERROR_HEAPLIMIT, is given if the amount of memory needed exceeds the heap limit. PCRE2_ERROR_NOMEMORY is also returned if PCRE2_COPY_MATCHED_SUBJECT is set and memory allocation fails.
  PCRE2_ERROR_NULL
Either the code, subject, or match_data argument was passed as NULL.
  PCRE2_ERROR_RECURSELOOP
This error is returned when pcre2_match() detects a recursion loop within the pattern. Specifically, it means that either the whole pattern or a capture group has been called recursively for the second time at the same position in the subject string. Some simple patterns that might do this are detected and faulted at compile time, but more complicated cases, in particular mutual recursions between two different groups, cannot be detected until matching is attempted.

OBTAINING A TEXTUAL ERROR MESSAGE

int pcre2_get_error_message(int errorcode, PCRE2_UCHAR *buffer, PCRE2_SIZE bufflen);

A text message for an error code from any PCRE2 function (compile, match, or auxiliary) can be obtained by calling pcre2_get_error_message(). The code is passed as the first argument, with the remaining two arguments specifying a code unit buffer and its length in code units, into which the text message is placed. The message is returned in code units of the appropriate width for the library that is being used.

The returned message is terminated with a trailing zero, and the function returns the number of code units used, excluding the trailing zero. If the error number is unknown, the negative error code PCRE2_ERROR_BADDATA is returned. If the buffer is too small, the message is truncated (but still with a trailing zero), and the negative error code PCRE2_ERROR_NOMEMORY is returned. None of the messages is very long; a buffer size of 120 code units is ample.

ITERATING OVER ALL MATCHES

int pcre2_next_match(pcre2_match_data *match_data, PCRE2_SIZE *pstart_offset, uint32_t *poptions);

A common task for applications is to implement "global" matching behaviour, for example, replacing all matches in the subject; splitting the subject on all matches; or simply counting the number of matches. The pcre2_next_match() function helps with this task by providing the appropriate parameters for the next match attempt (available since PCRE2 10.47).

First, a match attempt should be made using one of the matching functions (pcre2_match(), pcre2_dfa_match(), or pcre2_jit_match()). Then, pcre2_next_match() can be called, providing the same match_data parameter.

It returns 0 ("false") if there is no need to make a further match attempt, or 1 ("true") if another match should be attempted. Returning 1 does not imply that there is another match, only that another match should be attempted (which may return PCRE2_ERROR_NOMATCH).

The *pstart_offset and *poptions are set if the function returns 1. The *pstart_offset should be passed to the next match attempt directly, and the *poptions should be passed to the next match attempt by combining with the application's match options using OR.

There is some code that demonstrates how to do this in the pcre2demo sample program. The general pattern is:

  uint32_t app_options = ...;
  uint32_t global_options = 0;
  PCRE2_SIZE start_offset = 0;
  while (1)
    {
    int rc = pcre2_match(re, subject, subject_len, start_offset,
                         app_options | global_options, match_data,
                         match_context);

    if (rc == PCRE2_ERROR_NOMATCH) break; /* no match, and no more attempts */
    if (rc < 0) { ... exit }

    ...handle the match

    if (!pcre2_next_match(match_data, &start_offset, &global_options))
      break; /* no more attempts */
    }

The guarantees provided by pcre2_next_match() are that the start_offset will advance, so the loop will definitely terminate. The conditions which ensure this are that either: (a) pcre2_next_match() returns 0 (false); or (b) the returned *pstart_offset is strictly greater than the previous start_offset; or (c) if the previous match was a successful match of the empty string then the returned *pstart_offset is equal to the previous ovector[1], and *poptions will be set to PCRE2_NOTEMPTY_ATSTART to prevent another empty match from being returned.

A loop implemented as shown above will always terminate, unless there is a bug in PCRE2. As a measure of "defensive programming", applications are encouraged to add an assertion or check to break their loop if it does not make progress (and report the issue as a bug).

If an application does not use the flag PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK, then each match is "well-behaved" and satisfies:

  start_offset <= ovector[0] <= ovector[1].
In this case, the matches found by pcre2_match() with pcre2_next_match() will be sorted, non-overlapping (possibly touching), and with no duplicates.

Otherwise, if PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK is used, then the guarantees are considerably weaker. We do not guarantee that the matches will always advance: only that the start_offset will. The matches found by pcre2_match() with pcre2_next_match() will be a finite sequence (as pcre2_next_match() ensures that start_offset advances, so the search will terminate). The matches can however be overlapping, can contain duplicates, and (in truly pathological examples) may not even be sorted by ovector[0]. Additionally, each match itself can end before it starts (ovector[1] < ovector[0]). We recommend that applications do not set PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK.

EXTRACTING CAPTURED SUBSTRINGS BY NUMBER

int pcre2_substring_length_bynumber(pcre2_match_data *match_data, uint32_t number, PCRE2_SIZE *length);

int pcre2_substring_copy_bynumber(pcre2_match_data *match_data, uint32_t number, PCRE2_UCHAR *buffer, PCRE2_SIZE *bufflen);

int pcre2_substring_get_bynumber(pcre2_match_data *match_data, uint32_t number, PCRE2_UCHAR **bufferptr, PCRE2_SIZE *bufflen);

void pcre2_substring_free(PCRE2_UCHAR *buffer);

Captured substrings can be accessed directly by using the ovector as described above. For convenience, auxiliary functions are provided for extracting captured substrings as new, separate, zero-terminated strings. A substring that contains a binary zero is correctly extracted and has a further zero added on the end, but the result is not, of course, a C string.

The functions in this section identify substrings by number. The number zero refers to the entire matched substring, with higher numbers referring to substrings captured by parenthesized groups. After a partial match, only substring zero is available. An attempt to extract any other substring gives the error PCRE2_ERROR_PARTIAL. The next section describes similar functions for extracting captured substrings by name.

If a pattern uses the \K escape sequence within a positive lookahead assertion, the reported start of a successful match can be greater than the end of the match. For example, if the pattern (?=ab\K) is matched against "ab", the start and end offset values for the match are 2 and 0. In this situation, calling these functions with a zero substring number extracts a zero-length empty string.

You can find the length in code units of a captured substring without extracting it by calling pcre2_substring_length_bynumber(). The first argument is a pointer to the match data block, the second is the group number, and the third is a pointer to a variable into which the length is placed. If you just want to know whether or not the substring has been captured, you can pass the third argument as NULL.

The pcre2_substring_copy_bynumber() function copies a captured substring into a supplied buffer, whereas pcre2_substring_get_bynumber() copies it into new memory, obtained using the same memory allocation function that was used for the match data block. The first two arguments of these functions are a pointer to the match data block and a capture group number.

The final arguments of pcre2_substring_copy_bynumber() are a pointer to the buffer and a pointer to a variable that contains its length in code units. This is updated to contain the actual number of code units used for the extracted substring, excluding the terminating zero.

For pcre2_substring_get_bynumber() the third and fourth arguments point to variables that are updated with a pointer to the new memory and the number of code units that comprise the substring, again excluding the terminating zero. When the substring is no longer needed, the memory should be freed by calling pcre2_substring_free().

The return value from all these functions is zero for success, or a negative error code. If the pattern match failed, the match failure code is returned. If a substring number greater than zero is used after a partial match, PCRE2_ERROR_PARTIAL is returned. Other possible error codes are:

  PCRE2_ERROR_NOMEMORY
The buffer was too small for pcre2_substring_copy_bynumber(), or the attempt to get memory failed for pcre2_substring_get_bynumber().
  PCRE2_ERROR_NOSUBSTRING
There is no substring with that number in the pattern, that is, the number is greater than the number of capturing parentheses.
  PCRE2_ERROR_UNAVAILABLE
The substring number, though not greater than the number of captures in the pattern, is greater than the number of slots in the ovector, so the substring could not be captured.
  PCRE2_ERROR_UNSET
The substring did not participate in the match. For example, if the pattern is (abc)|(def) and the subject is "def", and the ovector contains at least two capturing slots, substring number 1 is unset.

EXTRACTING A LIST OF ALL CAPTURED SUBSTRINGS

int pcre2_substring_list_get(pcre2_match_data *match_data, PCRE2_UCHAR ***listptr, PCRE2_SIZE **lengthsptr);

void pcre2_substring_list_free(PCRE2_UCHAR **list);

The pcre2_substring_list_get() function extracts all available substrings and builds a list of pointers to them. It also (optionally) builds a second list that contains their lengths (in code units), excluding a terminating zero that is added to each of them. All this is done in a single block of memory that is obtained using the same memory allocation function that was used to get the match data block.

This function must be called only after a successful match. If called after a partial match, the error code PCRE2_ERROR_PARTIAL is returned.

The address of the memory block is returned via listptr, which is also the start of the list of string pointers. The end of the list is marked by a NULL pointer. The address of the list of lengths is returned via lengthsptr. If your strings do not contain binary zeros and you do not therefore need the lengths, you may supply NULL as the lengthsptr argument to disable the creation of a list of lengths. The yield of the function is zero if all went well, or PCRE2_ERROR_NOMEMORY if the memory block could not be obtained. When the list is no longer needed, it should be freed by calling pcre2_substring_list_free().

If this function encounters a substring that is unset, which can happen when capture group number n+1 matches some part of the subject, but group n has not been used at all, it returns an empty string. This can be distinguished from a genuine zero-length substring by inspecting the appropriate offset in the ovector, which contains PCRE2_UNSET for unset substrings, or by calling pcre2_substring_length_bynumber().

EXTRACTING CAPTURED SUBSTRINGS BY NAME

int pcre2_substring_number_from_name(const pcre2_code *code, PCRE2_SPTR name);

int pcre2_substring_length_byname(pcre2_match_data *match_data, PCRE2_SPTR name, PCRE2_SIZE *length);

int pcre2_substring_copy_byname(pcre2_match_data *match_data, PCRE2_SPTR name, PCRE2_UCHAR *buffer, PCRE2_SIZE *bufflen);

int pcre2_substring_get_byname(pcre2_match_data *match_data, PCRE2_SPTR name, PCRE2_UCHAR **bufferptr, PCRE2_SIZE *bufflen);

void pcre2_substring_free(PCRE2_UCHAR *buffer);

To extract a substring by name, you first have to find the associated number. For example, for this pattern:

  (a+)b(?<xxx>\d+)...
the number of the capture group called "xxx" is 2. If the name is known to be unique (PCRE2_DUPNAMES was not set), you can find the number from the name by calling pcre2_substring_number_from_name(). The first argument is the compiled pattern, and the second is the name. The yield of the function is the group number, PCRE2_ERROR_NOSUBSTRING if there is no group with that name, or PCRE2_ERROR_NOUNIQUESUBSTRING if there is more than one group with that name. Given the number, you can extract the substring directly from the ovector, or use one of the "bynumber" functions described above.

For convenience, there are also "byname" functions that correspond to the "bynumber" functions, the only difference being that the second argument is a name instead of a number. If PCRE2_DUPNAMES is set and there are duplicate names, these functions scan all the groups with the given name, and return the captured substring from the first named group that is set.

If there are no groups with the given name, PCRE2_ERROR_NOSUBSTRING is returned. If all groups with the name have numbers that are greater than the number of slots in the ovector, PCRE2_ERROR_UNAVAILABLE is returned. If there is at least one group with a slot in the ovector, but no group is found to be set, PCRE2_ERROR_UNSET is returned.

Warning: If the pattern uses the (?| feature to set up multiple capture groups with the same number, as described in the section on duplicate group numbers in the pcre2pattern page, you cannot use names to distinguish the different capture groups, because names are not included in the compiled code. The matching process uses only numbers. For this reason, the use of different names for groups with the same number causes an error at compile time.

CREATING A NEW STRING WITH SUBSTITUTIONS

int pcre2_substitute(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length, PCRE2_SIZE startoffset, uint32_t options, pcre2_match_data *match_data, pcre2_match_context *mcontext, PCRE2_SPTR replacement, PCRE2_SIZE rlength, PCRE2_UCHAR *outputbuffer, PCRE2_SIZE *outlengthptr);

This function optionally calls pcre2_match() and then makes a copy of the subject string in outputbuffer, replacing parts that were matched with the replacement string, whose length is supplied in rlength, which can be given as PCRE2_ZERO_TERMINATED for a zero-terminated string. As a special case, if replacement is NULL and rlength is zero, the replacement is assumed to be an empty string. If rlength is non-zero, an error occurs if replacement is NULL.

There is an option (see PCRE2_SUBSTITUTE_REPLACEMENT_ONLY below) to return just the replacement string(s). The default action is to perform just one replacement if the pattern matches, but there is an option that requests multiple replacements (see PCRE2_SUBSTITUTE_GLOBAL below).

If successful, pcre2_substitute() returns the number of substitutions that were carried out. This may be zero if no match was found, and is never greater than one unless PCRE2_SUBSTITUTE_GLOBAL is set. A negative value is returned if an error is detected.

Matches in which a \K item in a lookahead in the pattern causes the match to end before it starts are not supported, and give rise to an error return. For global replacements, matches in which \K in a lookbehind causes the match to start earlier than the point that was reached in the previous iteration are also not supported. (These cases are only possible if the pattern was compiled with the backwards-compatibility option PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK.)

The first seven arguments of pcre2_substitute() are the same as for pcre2_match(), except that the partial matching options are not permitted, and match_data may be passed as NULL, in which case a match data block is obtained and freed within this function, using memory management functions from the match context, if provided, or else those that were used to allocate memory for the compiled code.

If match_data is not NULL and PCRE2_SUBSTITUTE_MATCHED is not set, the provided block is used for all calls to pcre2_match(), and its contents afterwards are the result of the final call made internally by pcre2_substitute() to the matching function. For global changes, this will always be a no-match error. The contents of the ovector within the match data block may or may not have been changed.

As well as the usual options for pcre2_match(), a number of additional options can be set in the options argument of pcre2_substitute(). One such option is PCRE2_SUBSTITUTE_MATCHED. When this is set, an external match_data block must be provided, and it must have already been used for an external call to pcre2_match() (or pcre2_jit_match()) with the same pattern, subject pointer, effective subject length, start offset, and match option arguments (substitute-specific options can be added to the options argument). If any of these parameters is changed, pcre2_substitute() returns an error. The data in the match_data block (return code, offset vector) is used for the first substitution instead of calling pcre2_match() from within pcre2_substitute(). This allows an application to check for a match before choosing to substitute, without having to repeat the match.

If the contents of the subject buffer are mutated in between pcre2_match() and a call to pcre2_substitute() with PCRE2_SUBSTITUTE_MATCHED, the behaviour is unsafe; in particular, in this case, PCRE2 is unable to ensure that the offsets in the ovector point to the start of characters (with UTF-encoded input).

The contents of the externally supplied match data block are not changed when PCRE2_SUBSTITUTE_MATCHED is set, and so the match block is permitted for use in another call using PCRE2_SUBSTITUTE_MATCHED. If PCRE2_SUBSTITUTE_GLOBAL is also set, pcre2_match() is called after the first substitution to check for further matches, but this is done using an internally obtained match data block, thus always leaving the external block unchanged.

The code argument is not used for matching before the first substitution when PCRE2_SUBSTITUTE_MATCHED is set, but it must be provided, even when PCRE2_SUBSTITUTE_GLOBAL is not set, because it contains information such as the UTF setting and the number of capturing parentheses in the pattern.

When using PCRE2_SUBSTITUTE_MATCHED, you should not modify the subject string in between the prior call to pcre2_match() and pcre2_substitute(), as the substitution assumes that the passed-in ovector is compatible with the subject string. Although PCRE2 does verify that the subject is a pointer to the same buffer, it cannot in general verify whether the contents of the buffer have changed. For example, if the subject buffer is mutated from one valid UTF-8 string to another valid string, of the same length in code units, the ovector offsets are no longer guaranteed to point to the start of a character. Beware that with PCRE2_SUBSTITUTE_MATCHED in UTF mode, the subject string is not re-scanned for UTF validity when pcre2_substitute() first uses it.

The default action of pcre2_substitute() is to return a copy of the subject string with matched substrings replaced. However, if PCRE2_SUBSTITUTE_REPLACEMENT_ONLY is set, only the replacement substrings are returned. In the global case, multiple replacements are concatenated in the output buffer. Substitution callouts (see below) can be used to separate them if necessary.

Partial matching is supported, with limitations: if matching succeeds but with a partial match, then pcre2_substitute returns PCRE2_ERROR_PARTIAL. When partial-matching (either of PCRE2_PARTIAL_HARD or PCRE2_PARTIAL_SOFT is passed), then PCRE2_SUBSTITUTE_REPLACEMENT_ONLY must also be set, or else PCRE2_ERROR_BADOPTION is returned. Similarly, certain replacement items ($' and $_) cause PCRE2_ERROR_PARTIALSUBS to be returned when partial-matching, even if a complete match is found.

The outlengthptr argument of pcre2_substitute() must point to a variable that contains the length, in code units, of the output buffer. If the function is successful, the value is updated to contain the length in code units of the new string, excluding the trailing zero that is automatically added.

If the function is not successful, the value set via outlengthptr depends on the type of error. For syntax errors in the replacement string, the value is the offset in the replacement string where the error was detected. For other errors, the value is PCRE2_UNSET by default. This includes the case of the output buffer being too small, unless PCRE2_SUBSTITUTE_OVERFLOW_LENGTH is set.

PCRE2_SUBSTITUTE_OVERFLOW_LENGTH changes what happens when the output buffer is too small. The default action is to return PCRE2_ERROR_NOMEMORY immediately. If this option is set, however, pcre2_substitute() continues to go through the motions of matching and substituting (without, of course, writing anything) in order to compute the size of buffer that is needed, which will include the extra space for the terminating NUL. This value is passed back via the outlengthptr variable, with the result of the function still being PCRE2_ERROR_NOMEMORY.

Passing a buffer size of zero is a permitted way of finding out how much memory is needed for a given substitution. However, this does mean that the entire operation is carried out twice. Depending on the application, it may be more efficient to allocate a large buffer and free the excess afterwards, instead of using PCRE2_SUBSTITUTE_OVERFLOW_LENGTH.

The replacement string, which is interpreted as a UTF string in UTF mode, is checked for UTF validity unless PCRE2_NO_UTF_CHECK is set. An invalid UTF replacement string causes an immediate return with the relevant UTF error code.

If PCRE2_SUBSTITUTE_LITERAL is set, the replacement string is not interpreted in any way. By default, however, a dollar character is an escape character that can specify the insertion of characters from capture groups and names from (*MARK) or other control verbs in the pattern. Dollar is the only escape character (backslash is treated as literal). The following forms are recognized:

  $$                  insert a dollar character
  $n or ${n}          insert the contents of group n
  $0 or $&            insert the entire matched substring
  $`                  insert the substring that precedes the match
  $'                  insert the substring that follows the match
  $_                  insert the entire input string
  $+                  insert the highest-numbered capture group which matched
  $*MARK or ${*MARK}  insert a control verb name
Either a group number or a group name can be given for n, for example $2 or $NAME. Curly brackets are required only if the following character would be interpreted as part of the number or name. The number may be zero to include the entire matched string. For example, if the pattern a(b)c is matched with "=abc=" and the replacement string "+$1$0$1+", the result is "=+babcb+=".

The JavaScript form $<name>, where the angle brackets are part of the syntax, is also recognized for group names, but not for group numbers or *MARK.

$*MARK inserts the name from the last encountered backtracking control verb on the matching path that has a name. (*MARK) must always include a name, but the other verbs need not. For example, in the case of (*MARK:A)(*PRUNE) the name inserted is "A", but for (*MARK:A)(*PRUNE:B) the relevant name is "B". This facility can be used to perform simple simultaneous substitutions, as this pcre2test example shows:

  /(*MARK:pear)apple|(*MARK:orange)lemon/g,replace=${*MARK}
      apple lemon
   2: pear orange
PCRE2_SUBSTITUTE_GLOBAL causes the function to iterate over the subject string, replacing every matching substring. If this option is not set, only the first matching substring is replaced. The search for matches takes place in the original subject string (that is, previous replacements do not affect it). Iteration is implemented by advancing the startoffset value for each search, which is always passed the entire subject string. If an offset limit is set in the match context, searching stops when that limit is reached.

Because global substitutions apply the pattern repeatedly to the subject string, and always iterate over non-overlapping matches, the substitutions done by pcre2_substitute() do not match and substitute text inside the replacement strings themselves (no recursive/iterative substitution). However, applications can easily implement other alternative replacement strategies, such as iteratively replacing, then matching and replacing on the result. The replacement loop inside pcre2_substitute() is simple and can be emulated in client code by allocating a buffer, searching for matches in a loop, and calling pcre2_substitute() with PCRE2_SUBSTITUTE_REPLACEMENT_ONLY and PCRE2_SUBSTITUTE_MATCHED, and without PCRE2_SUBSTITUTE_GLOBAL.

You can restrict the effect of a global substitution to a portion of the subject string by setting either or both of startoffset and an offset limit. Here is a pcre2test example:

  /B/g,replace=!,use_offset_limit
  ABC ABC ABC ABC\=offset=3,offset_limit=12
   2: ABC A!C A!C ABC
When continuing with global substitutions after matching a substring with zero length, an attempt to find a non-empty match at the same offset is performed. If this is not successful, the offset is advanced by one character except when CRLF is a valid newline sequence and the next two characters are CR, LF. In this case, the offset is advanced by two characters.

PCRE2_SUBSTITUTE_UNKNOWN_UNSET causes references to capture groups that do not appear in the pattern to be treated as unset groups. This option should be used with care, because it means that a typo in a group name or number no longer causes the PCRE2_ERROR_NOSUBSTRING error.

PCRE2_SUBSTITUTE_UNSET_EMPTY causes unset capture groups (including unknown groups when PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set) to be treated as empty strings when inserted as described above. If this option is not set, an attempt to insert an unset group causes the PCRE2_ERROR_UNSET error. This option does not influence the extended substitution syntax described below.

PCRE2_SUBSTITUTE_EXTENDED causes extra processing to be applied to the replacement string. Without this option, only the dollar character is special, and only the group insertion forms listed above are valid. When PCRE2_SUBSTITUTE_EXTENDED is set, several things change:

Firstly, backslash in a replacement string is interpreted as an escape character. The usual forms such as \x{ddd} can be used to specify particular character codes, and backslash followed by any non-alphanumeric character quotes that character. Extended quoting can be coded using \Q...\E, exactly as in pattern strings. The escapes \b and \v are interpreted as the characters backspace and vertical tab, respectively.

The interpretation of backslash followed by one or more digits is the same as in a pattern, which in Perl has some ambiguities. Details are given in the pcre2pattern page.

The Python form \g<n>, where the angle brackets are part of the syntax and n is either a group name or number, is recognized as an alternative way of inserting the contents of a group, for example \g<3>.

There are also four escape sequences for forcing the case of inserted letters. Case forcing applies to all inserted characters, including those from capture groups and letters within \Q...\E quoted sequences. The insertion mechanism has three states: no case forcing, force upper case, and force lower case. The escape sequences change the current state: \U and \L change to upper or lower case forcing, respectively, and \E (when not terminating a \Q quoted sequence) reverts to no case forcing. The sequences \u and \l force the next character (if it is a letter) to upper or lower case, respectively, and then the state automatically reverts to no case forcing.

However, if \u is immediately followed by \L or \l is immediately followed by \U, the next character's case is forced by the first escape sequence, and subsequent characters by the second. This provides a "title casing" facility that can be applied to group captures. For example, if group 1 has captured "heLLo", the replacement string "\u\L$1" becomes "Hello".

If either PCRE2_UTF or PCRE2_UCP was set when the pattern was compiled, Unicode properties are used for case forcing characters whose code points are greater than 127. However, only simple case folding, as determined by the Unicode file CaseFolding.txt, is supported. PCRE2 does not support language-specific special casing rules such as using different lower case Greek sigmas in the middle and ends of words (as defined in the Unicode file SpecialCasing.txt).

Note that case forcing sequences such as \U...\E do not nest. For example, the result of processing "\Uaa\LBB\Ecc\E" is "AAbbcc"; the final \E has no effect. Note also that the PCRE2_ALT_BSUX and PCRE2_EXTRA_ALT_BSUX options do not apply to replacement strings.

The final effect of setting PCRE2_SUBSTITUTE_EXTENDED is to add more flexibility to capture group substitution. The syntax is similar to that used by Bash:

  ${n:-string}
  ${n:+string1:string2}
As in the simple case, n may be a group number or a name. The first form specifies a default value. If group n is set, its value is inserted; if not, the string is expanded and the result inserted. The second form specifies strings that are expanded and inserted when group n is set or unset, respectively. The first form is just a convenient shorthand for
  ${n:+${n}:string}
Backslash can be used to escape colons and closing curly brackets in the replacement strings. A change of the case forcing state within a replacement string remains in force afterwards, as shown in this pcre2test example:
  /(some)?(body)/substitute_extended,replace=${1:+\U:\L}HeLLo
      body
   1: hello
      somebody
   1: HELLO
The PCRE2_SUBSTITUTE_UNSET_EMPTY option does not affect these extended substitutions. However, PCRE2_SUBSTITUTE_UNKNOWN_UNSET does cause unknown groups in the extended syntax forms to be treated as unset.

If PCRE2_SUBSTITUTE_LITERAL is set, PCRE2_SUBSTITUTE_UNKNOWN_UNSET, PCRE2_SUBSTITUTE_UNSET_EMPTY, and PCRE2_SUBSTITUTE_EXTENDED are irrelevant and are ignored.

Substitution errors

In the event of an error, pcre2_substitute() returns a negative error code. Except for PCRE2_ERROR_NOMATCH (which is never returned), errors from pcre2_match() are passed straight back.

PCRE2_ERROR_NOSUBSTRING is returned for a non-existent substring insertion, unless PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set.

PCRE2_ERROR_UNSET is returned for an unset substring insertion (including an unknown substring when PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set) when the simple (non-extended) syntax is used and PCRE2_SUBSTITUTE_UNSET_EMPTY is not set.

PCRE2_ERROR_NOMEMORY is returned if the output buffer is not big enough. If the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option is set, the size of buffer that is needed is returned via outlengthptr. Note that this does not happen by default.

PCRE2_ERROR_NULL is returned if PCRE2_SUBSTITUTE_MATCHED is set but the match_data argument is NULL or if the subject or replacement arguments are NULL. For backward compatibility reasons an exception is made for the replacement argument if the rlength argument is also 0.

PCRE2_ERROR_BADREPLACEMENT is used for miscellaneous syntax errors in the replacement string, with more particular errors being PCRE2_ERROR_BADREPESCAPE (invalid escape sequence), PCRE2_ERROR_REPMISSINGBRACE (closing curly bracket not found), PCRE2_ERROR_BADSUBSTITUTION (syntax error in extended group substitution), and PCRE2_ERROR_BADSUBSPATTERN (the pattern match ended before it started or the match started earlier than the current position in the subject, which can happen if \K is used in a lookaround assertion).

As for all PCRE2 errors, a text message that describes the error can be obtained by calling the pcre2_get_error_message() function (see "Obtaining a textual error message" above).

Substitution callouts

int pcre2_set_substitute_callout(pcre2_match_context *mcontext, int (*callout_function)(pcre2_substitute_callout_block *, void *), void *callout_data);

The pcre2_set_substitute_callout() function can be used to specify a callout function for pcre2_substitute(). This information is passed in a match context. The callout function is called after each substitution has been processed, but it can cause the replacement not to happen.

The callout function is not called for simulated substitutions that happen as a result of the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option. In this mode, when substitution processing exceeds the buffer space provided by the caller, processing continues by counting code units. The simulation is unable to populate the callout block, and so the simulation is pessimistic about the required buffer size. Whichever is larger of accepted or rejected substitution is reported as the required size. Therefore, the returned buffer length may be an overestimate (without a substitution callout, it is normally an exact measurement).

The first argument of the callout function is a pointer to a substitute callout block structure, which contains the following fields, not necessarily in this order:

  uint32_t    version;
  uint32_t    subscount;
  PCRE2_SPTR  input;
  PCRE2_SPTR  output;
  PCRE2_SIZE *ovector;
  uint32_t    oveccount;
  PCRE2_SIZE  output_offsets[2];
The version field contains the version number of the block format. The current version is 0. The version number will increase in future if more fields are added, but the intention is never to remove any of the existing fields.

The subscount field is the number of the current match. It is 1 for the first callout, 2 for the second, and so on. The input and output pointers are copies of the values passed to pcre2_substitute().

The ovector field points to the ovector, which contains the result of the most recent match. The oveccount field contains the number of pairs that are set in the ovector, and is always greater than zero.

The output_offsets vector contains the offsets of the replacement in the output string. This has already been processed for dollar and (if requested) backslash substitutions as described above.

The second argument of the callout function is the value passed as callout_data when the function was registered. The value returned by the callout function is interpreted as follows:

If the value is zero, the replacement is accepted, and, if PCRE2_SUBSTITUTE_GLOBAL is set, processing continues with a search for the next match. If the value is not zero, the current replacement is not accepted. If the value is greater than zero, processing continues when PCRE2_SUBSTITUTE_GLOBAL is set. Otherwise (the value is less than zero or PCRE2_SUBSTITUTE_GLOBAL is not set), the rest of the input is copied to the output and the call to pcre2_substitute() exits, returning the number of matches so far.

Substitution case callouts

int pcre2_set_substitute_case_callout(pcre2_match_context *mcontext, PCRE2_SIZE (*callout_function)(PCRE2_SPTR, PCRE2_SIZE, PCRE2_UCHAR *, PCRE2_SIZE, int, void *), void *callout_data);

The pcre2_set_substitute_case_callout() function can be used to specify a callout function for pcre2_substitute() to use when performing case transformations. This does not affect any case insensitivity behaviour when performing a match, but only the user-visible transformations performed when processing a substitution such as:

    pcre2_substitute(..., "\\U$1", ...)

The default case transformations applied by PCRE2 are reasonably complete, and, in UTF or UCP mode, perform the simple locale-invariant case transformations as specified by Unicode. This is suitable for the internal (invisible) case-equivalence procedures used during pattern matching, but an application may wish to use more sophisticated locale-aware processing for the user-visible substitution transformations.

One example implementation of the callout_function using the ICU library would be:

    PCRE2_SIZE
    icu_case_callout(
      PCRE2_SPTR input, PCRE2_SIZE input_len,
      PCRE2_UCHAR *output, PCRE2_SIZE output_cap,
      int to_case, void *data_ptr)
    {
      UErrorCode err = U_ZERO_ERROR;
      int32_t r = to_case == PCRE2_SUBSTITUTE_CASE_LOWER
        ? u_strToLower(output, output_cap, input, input_len, NULL, &err)
        : to_case == PCRE2_SUBSTITUTE_CASE_UPPER
        ? u_strToUpper(output, output_cap, input, input_len, NULL, &err)
        : u_strToTitle(output, output_cap, input, input_len, &first_char_only,
                       NULL, &err);
      if (U_FAILURE(err)) return (~(PCRE2_SIZE)0);
      return r;
    }

The first and second arguments of the case callout function are the Unicode string to transform.

The third and fourth arguments are the output buffer and its capacity.

The fifth is one of the constants PCRE2_SUBSTITUTE_CASE_LOWER, PCRE2_SUBSTITUTE_CASE_UPPER, or PCRE2_SUBSTITUTE_CASE_TITLE_FIRST. PCRE2_SUBSTITUTE_CASE_LOWER and PCRE2_SUBSTITUTE_CASE_UPPER are passed to the callout to indicate that the case of the entire callout input should be case-transformed. PCRE2_SUBSTITUTE_CASE_TITLE_FIRST is passed to indicate that only the first character or glyph should be transformed to Unicode titlecase and the rest to Unicode lowercase (note that titlecasing sometimes uses Unicode properties to titlecase each word in a string; but PCRE2 is requesting that only the single leading character is to be titlecased).

The sixth argument is the callout_data supplied to pcre2_set_substitute_case_callout().

The resulting string in the destination buffer may be larger or smaller than the input, if the casing rules merge or split characters. The return value is the length required for the output string. If a buffer of sufficient size was provided to the callout, then the result must be written to the buffer and the number of code units returned. If the result does not fit in the provided buffer, then the required capacity must be returned and PCRE2 will not make use of the output buffer. PCRE2 provides input and output buffers which overlap, so the callout must support this by suitable internal buffering.

Alternatively, if the callout wishes to indicate an error, then it may return (~(PCRE2_SIZE)0). In this case pcre2_substitute() will immediately fail with error PCRE2_ERROR_REPLACECASE.

When a case callout is combined with the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option, there are situations when pcre2_substitute() will return an underestimate of the required buffer size. If you call pcre2_substitute() once with PCRE2_SUBSTITUTE_OVERFLOW_LENGTH, and the input buffer is too small for the replacement string to be constructed, then instead of calling the case callout, pcre2_substitute() will make an estimate of the required buffer size. The second call should also pass PCRE2_SUBSTITUTE_OVERFLOW_LENGTH, because that second call is not guaranteed to succeed either, if the case callout requires more buffer space than expected. The caller must make repeated attempts in a loop.

DUPLICATE CAPTURE GROUP NAMES

int pcre2_substring_nametable_scan(const pcre2_code *code, PCRE2_SPTR name, PCRE2_SPTR *first, PCRE2_SPTR *last);

When a pattern is compiled with the PCRE2_DUPNAMES option, names for capture groups are not required to be unique. Duplicate names are always allowed for groups with the same number, created by using the (?| feature. Indeed, if such groups are named, they are required to use the same names.

Normally, patterns that use duplicate names are such that in any one match, only one of each set of identically-named groups participates. An example is shown in the pcre2pattern documentation.

When duplicates are present, pcre2_substring_copy_byname() and pcre2_substring_get_byname() return the first substring corresponding to the given name that is set. Only if none are set is PCRE2_ERROR_UNSET returned. The pcre2_substring_number_from_name() function returns the error PCRE2_ERROR_NOUNIQUESUBSTRING when there are duplicate names.

If you want to get full details of all captured substrings for a given name, you must use the pcre2_substring_nametable_scan() function. The first argument is the compiled pattern, and the second is the name. If the third and fourth arguments are NULL, the function returns a group number for a unique name, or PCRE2_ERROR_NOUNIQUESUBSTRING otherwise.

When the third and fourth arguments are not NULL, they must be pointers to variables that are updated by the function. After it has run, they point to the first and last entries in the name-to-number table for the given name, and the function returns the length of each entry in code units. In both cases, PCRE2_ERROR_NOSUBSTRING is returned if there are no entries for the given name.

The format of the name table is described above in the section entitled Information about a pattern. Given all the relevant entries for the name, you can extract each of their numbers, and hence the captured data.

FINDING ALL POSSIBLE MATCHES AT ONE POSITION

The traditional matching function uses a similar algorithm to Perl, which stops when it finds the first match at a given point in the subject. If you want to find all possible matches, or the longest possible match at a given position, consider using the alternative matching function (see below) instead. If you cannot use the alternative function, you can kludge it up by making use of the callout facility, which is described in the pcre2callout documentation.

What you have to do is to insert a callout right at the end of the pattern. When your callout function is called, extract and save the current matched substring. Then return 1, which forces pcre2_match() to backtrack and try other alternatives. Ultimately, when it runs out of matches, pcre2_match() will yield PCRE2_ERROR_NOMATCH.

MATCHING A PATTERN: THE ALTERNATIVE FUNCTION

int pcre2_dfa_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length, PCRE2_SIZE startoffset, uint32_t options, pcre2_match_data *match_data, pcre2_match_context *mcontext, int *workspace, PCRE2_SIZE wscount);

The function pcre2_dfa_match() is called to match a subject string against a compiled pattern, using a matching algorithm that scans the subject string just once (not counting lookaround assertions), and does not backtrack (except when processing lookaround assertions). This has different characteristics to the normal algorithm, and is not compatible with Perl. Some of the features of PCRE2 patterns are not supported. Nevertheless, there are times when this kind of matching can be useful. For a discussion of the two matching algorithms, and a list of features that pcre2_dfa_match() does not support, see the pcre2matching documentation.

The arguments for the pcre2_dfa_match() function are the same as for pcre2_match(), plus two extras. The ovector within the match data block is used in a different way, and this is described below. The other common arguments are used in the same way as for pcre2_match(), so their description is not repeated here.

The two additional arguments provide workspace for the function. The workspace vector should contain at least 20 elements. It is used for keeping track of multiple paths through the pattern tree. More workspace is needed for patterns and subjects where there are a lot of potential matches.

Here is an example of a simple call to pcre2_dfa_match():

  int wspace[20];
  pcre2_match_data *md = pcre2_match_data_create(4, NULL);
  int rc = pcre2_dfa_match(
    re,             /* result of pcre2_compile() */
    "some string",  /* the subject string */
    11,             /* the length of the subject string */
    0,              /* start at offset 0 in the subject */
    0,              /* default options */
    md,             /* the match data block */
    NULL,           /* a match context; NULL means use defaults */
    wspace,         /* working space vector */
    20);            /* number of elements (NOT size in bytes) */

Option bits for pcre2_dfa_match()

The unused bits of the options argument for pcre2_dfa_match() must be zero. The only bits that may be set are PCRE2_ANCHORED, PCRE2_COPY_MATCHED_SUBJECT, PCRE2_ENDANCHORED, PCRE2_NOTBOL, PCRE2_NOTEOL, PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART, PCRE2_NO_UTF_CHECK, PCRE2_PARTIAL_HARD, PCRE2_PARTIAL_SOFT, PCRE2_DFA_SHORTEST, and PCRE2_DFA_RESTART. All but the last four of these are exactly the same as for pcre2_match(), so their description is not repeated here.

  PCRE2_PARTIAL_HARD
  PCRE2_PARTIAL_SOFT
These have the same general effect as they do for pcre2_match(), but the details are slightly different. When PCRE2_PARTIAL_HARD is set for pcre2_dfa_match(), it returns PCRE2_ERROR_PARTIAL if the end of the subject is reached and there is still at least one matching possibility that requires additional characters. This happens even if some complete matches have already been found. When PCRE2_PARTIAL_SOFT is set, the return code PCRE2_ERROR_NOMATCH is converted into PCRE2_ERROR_PARTIAL if the end of the subject is reached, there have been no complete matches, but there is still at least one matching possibility. The portion of the string that was inspected when the longest partial match was found is set as the first matching string in both cases. There is a more detailed discussion of partial and multi-segment matching, with examples, in the pcre2partial documentation.
  PCRE2_DFA_SHORTEST
Setting the PCRE2_DFA_SHORTEST option causes the matching algorithm to stop as soon as it has found one match. Because of the way the alternative algorithm works, this is necessarily the shortest possible match at the first possible matching point in the subject string.
  PCRE2_DFA_RESTART
When pcre2_dfa_match() returns a partial match, it is possible to call it again, with additional subject characters, and have it continue with the same match. The PCRE2_DFA_RESTART option requests this action; when it is set, the workspace and wscount arguments must reference the same vector as before because data about the match so far is left in them after a partial match. There is more discussion of this facility in the pcre2partial documentation.

Successful returns from pcre2_dfa_match()

When pcre2_dfa_match() succeeds, it may have matched more than one substring in the subject. Note, however, that all the matches from one run of the function start at the same point in the subject. The shorter matches are all initial substrings of the longer matches. For example, if the pattern

  <.*>
is matched against the string
  This is <something> <something else> <something further> no more
the three matched strings are
  <something> <something else> <something further>
  <something> <something else>
  <something>
On success, the yield of the function is a number greater than zero, which is the number of matched substrings. The offsets of the substrings are returned in the ovector, and can be extracted by number in the same way as for pcre2_match(), but the numbers bear no relation to any capture groups that may exist in the pattern, because DFA matching does not support capturing.

Calls to the convenience functions that extract substrings by name return the error PCRE2_ERROR_DFA_UFUNC (unsupported function) if used after a DFA match. The convenience functions that extract substrings by number never return PCRE2_ERROR_NOSUBSTRING.

The matched strings are stored in the ovector in reverse order of length; that is, the longest matching string is first. If there were too many matches to fit into the ovector, the yield of the function is zero, and the vector is filled with the longest matches.

NOTE: PCRE2's "auto-possessification" optimization usually applies to character repeats at the end of a pattern (as well as internally). For example, the pattern "a\d+" is compiled as if it were "a\d++". For DFA matching, this means that only one possible match is found. If you really do want multiple matches in such cases, either use an ungreedy repeat such as "a\d+?" or set the PCRE2_NO_AUTO_POSSESS option when compiling.

Error returns from pcre2_dfa_match()

The pcre2_dfa_match() function returns a negative number when it fails. Many of the errors are the same as for pcre2_match(), as described above. There are in addition the following errors that are specific to pcre2_dfa_match():

  PCRE2_ERROR_DFA_UITEM
This return is given if pcre2_dfa_match() encounters an item in the pattern that it does not support, for instance, the use of \C in a UTF mode or a backreference.
  PCRE2_ERROR_DFA_UCOND
This return is given if pcre2_dfa_match() encounters a condition item that uses a backreference for the condition, or a test for recursion in a specific capture group. These are not supported.
  PCRE2_ERROR_DFA_UINVALID_UTF
This return is given if pcre2_dfa_match() is called for a pattern that was compiled with PCRE2_MATCH_INVALID_UTF. This is not supported for DFA matching.
  PCRE2_ERROR_DFA_WSSIZE
This return is given if pcre2_dfa_match() runs out of space in the workspace vector.
  PCRE2_ERROR_DFA_RECURSE
When a recursion or subroutine call is processed, the matching function calls itself recursively, using private memory for the ovector and workspace. This error is given if the internal ovector is not large enough. This should be extremely rare, as a vector of size 1000 is used.
  PCRE2_ERROR_DFA_BADRESTART
When pcre2_dfa_match() is called with the PCRE2_DFA_RESTART option, some plausibility checks are made on the contents of the workspace, which should contain data about the previous partial match. If any of these checks fail, this error is given.

SEE ALSO

pcre2build(3), pcre2callout(3), pcre2demo(3), pcre2matching(3), pcre2partial(3), pcre2posix(3), pcre2sample(3), pcre2unicode(3).

AUTHOR

Philip Hazel
Retired from University Computing Service
Cambridge, England.

REVISION

Last updated: 29 October 2025
Copyright © 1997-2024 University of Cambridge.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2build.html ================================================ pcre2build specification

pcre2build man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

BUILDING PCRE2

PCRE2 is distributed with a configure script that can be used to build the library in Unix-like environments using the Autotools applications. Also in the distribution are files to support building using CMake instead of configure. The text file README contains general information about building with Autotools (some of which is repeated below), and also has some comments about building on various operating systems. The files in the vms directory support building under OpenVMS. There is a lot more information about building PCRE2 without using Autotools (including information about using CMake and building "by hand") in the text file called NON-AUTOTOOLS-BUILD. You should consult this file as well as the README file if you are building in a non-Unix-like environment.

PCRE2 BUILD-TIME OPTIONS

The rest of this document describes the optional features of PCRE2 that can be selected when the library is compiled. It assumes use of the configure script, where the optional features are selected or deselected by providing options to configure before running the make command. However, the same options can be selected in both Unix-like and non-Unix-like environments if you are using CMake instead of configure to build PCRE2.

If you are not using Autotools or CMake, option selection can be done by editing the config.h file, or by passing parameter settings to the compiler, as described in NON-AUTOTOOLS-BUILD.

The complete list of options for configure (which includes the standard ones such as the selection of the installation directory) can be obtained by running

  ./configure --help
The following sections include descriptions of "on/off" options whose names begin with --enable or --disable. Because of the way that configure works, --enable and --disable always come in pairs, so the complementary option always exists as well, but as it specifies the default, it is not described. Options that specify values have names that start with --with. At the end of a configure run, a summary of the configuration is output.

BUILDING 8-BIT, 16-BIT AND 32-BIT LIBRARIES

By default, a library called libpcre2-8 is built, containing functions that take string arguments contained in arrays of bytes, interpreted either as single-byte characters, or UTF-8 strings. You can also build two other libraries, called libpcre2-16 and libpcre2-32, which process strings that are contained in arrays of 16-bit and 32-bit code units, respectively. These can be interpreted either as single-unit characters or UTF-16/UTF-32 strings. To build these additional libraries, add one or both of the following to the configure command:

  --enable-pcre2-16
  --enable-pcre2-32
If you do not want the 8-bit library, add
  --disable-pcre2-8
as well. At least one of the three libraries must be built. Note that the POSIX wrapper is for the 8-bit library only, and that pcre2grep is an 8-bit program. Neither of these is built if you select only the 16-bit or 32-bit libraries.

BUILDING SHARED AND STATIC LIBRARIES

The Autotools PCRE2 building process uses libtool to build both shared and static libraries by default. You can suppress an unwanted library by adding one of

  --disable-shared
  --disable-static
to the configure command. Setting --disable-shared ensures that PCRE2 libraries are built as static libraries. The binaries that are then created as part of the build process (for example, pcre2test and pcre2grep) are linked statically with one or more PCRE2 libraries, but may also be dynamically linked with other libraries such as libc. If you want these binaries to be fully statically linked, you can set LDFLAGS like this:

LDFLAGS=--static ./configure --disable-shared

Note the two hyphens in --static. Of course, this works only if static versions of all the relevant libraries are available for linking.

UNICODE AND UTF SUPPORT

By default, PCRE2 is built with support for Unicode and UTF character strings. To build it without Unicode support, add

  --disable-unicode
to the configure command. This setting applies to all three libraries. It is not possible to build one library with Unicode support and another without in the same configuration.

Of itself, Unicode support does not make PCRE2 treat strings as UTF-8, UTF-16 or UTF-32. To do that, applications that use the library can set the PCRE2_UTF option when they call pcre2_compile() to compile a pattern. Alternatively, patterns may be started with (*UTF) unless the application has locked this out by setting PCRE2_NEVER_UTF.

UTF support allows the libraries to process character code points up to 0x10ffff in the strings that they handle. Unicode support also gives access to the Unicode properties of characters, using pattern escapes such as \P, \p, and \X. Only the general category properties such as Lu and Nd, script names, and some bi-directional and binary properties are supported. Details are given in the pcre2pattern documentation.

Pattern escapes such as \d and \w do not by default make use of Unicode properties. The application can request that they do by setting the PCRE2_UCP option. Unless the application has set PCRE2_NEVER_UCP, a pattern may also request this by starting with (*UCP).

DISABLING THE USE OF \C

The \C escape sequence, which matches a single code unit, even in a UTF mode, can cause unpredictable behaviour because it may leave the current matching point in the middle of a multi-code-unit character. The application can lock it out by setting the PCRE2_NEVER_BACKSLASH_C option when calling pcre2_compile(). There is also a build-time option

  --enable-never-backslash-C
(note the upper case C) which locks out the use of \C entirely.

JUST-IN-TIME COMPILER SUPPORT

Just-in-time (JIT) compiler support is included in the build by specifying

  --enable-jit
This support is available only for certain hardware architectures. If this option is set for an unsupported architecture, a building error occurs. If in doubt, use
  --enable-jit=auto
which enables JIT only if the current hardware is supported. You can check if JIT is enabled in the configuration summary that is output at the end of a configure run. If you are enabling JIT under SELinux you may also want to add
  --enable-jit-sealloc
which enables the use of an execmem allocator in JIT that is compatible with SELinux. This has no effect if JIT is not enabled. See the pcre2jit documentation for a discussion of JIT usage. When JIT support is enabled, pcre2grep automatically makes use of it, unless you add
  --disable-pcre2grep-jit
to the configure command.

NEWLINE RECOGNITION

By default, PCRE2 interprets the linefeed (LF) character as indicating the end of a line. This is the normal newline character on Unix-like systems. You can compile PCRE2 to use carriage return (CR) instead, by adding

  --enable-newline-is-cr
to the configure command. There is also an --enable-newline-is-lf option, which explicitly specifies linefeed as the newline character.

Alternatively, you can specify that line endings are to be indicated by the two-character sequence CRLF (CR immediately followed by LF). If you want this, add

  --enable-newline-is-crlf
to the configure command. There is a fourth option, specified by
  --enable-newline-is-anycrlf
which causes PCRE2 to recognize any of the three sequences CR, LF, or CRLF as indicating a line ending. A fifth option, specified by
  --enable-newline-is-any
causes PCRE2 to recognize any Unicode newline sequence. The Unicode newline sequences are the three just mentioned, plus the single characters VT (vertical tab, U+000B), FF (form feed, U+000C), NEL (next line, U+0085), LS (line separator, U+2028), and PS (paragraph separator, U+2029). The final option is
  --enable-newline-is-nul
which causes NUL (binary zero) to be set as the default line-ending character.

Whatever default line ending convention is selected when PCRE2 is built can be overridden by applications that use the library. At build time it is recommended to use the standard for your operating system.

WHAT \R MATCHES

By default, the sequence \R in a pattern matches any Unicode newline sequence, independently of what has been selected as the line ending sequence. If you specify

  --enable-bsr-anycrlf
the default is changed so that \R matches only CR, LF, or CRLF. Whatever is selected when PCRE2 is built can be overridden by applications that use the library.

HANDLING VERY LARGE PATTERNS

Within a compiled pattern, offset values are used to point from one part to another (for example, from an opening parenthesis to an alternation metacharacter). By default, in the 8-bit and 16-bit libraries, two-byte values are used for these offsets, leading to a maximum size for a compiled pattern of around 64 thousand code units. This is sufficient to handle all but the most gigantic patterns. Nevertheless, some people do want to process truly enormous patterns, so it is possible to compile PCRE2 to use three-byte or four-byte offsets by adding a setting such as

  --with-link-size=3
to the configure command. The value given must be 2, 3, or 4. For the 16-bit library, a value of 3 is rounded up to 4. In these libraries, using longer offsets slows down the operation of PCRE2 because it has to load additional data when handling them. For the 32-bit library the value is always 4 and cannot be overridden; the value of --with-link-size is ignored.

LIMITING PCRE2 RESOURCE USAGE

The pcre2_match() function increments a counter each time it goes round its main loop. Putting a limit on this counter controls the amount of computing resource used by a single call to pcre2_match(). The limit can be changed at run time, as described in the pcre2api documentation. The default is 10 million, but this can be changed by adding a setting such as

  --with-match-limit=500000
to the configure command. This setting also applies to the pcre2_dfa_match() matching function, and to JIT matching (though the counting is done differently).

The pcre2_match() function uses heap memory to record backtracking points. The more nested backtracking points there are (that is, the deeper the search tree), the more memory is needed. There is an upper limit, specified in kibibytes (units of 1024 bytes). This limit can be changed at run time, as described in the pcre2api documentation. The default limit (in effect unlimited) is 20 million. You can change this by a setting such as

  --with-heap-limit=500
which limits the amount of heap to 500 KiB. This limit applies only to interpretive matching in pcre2_match() and pcre2_dfa_match(), which may also use the heap for internal workspace when processing complicated patterns. This limit does not apply when JIT (which has its own memory arrangements) is used.

You can also explicitly limit the depth of nested backtracking in the pcre2_match() interpreter. This limit defaults to the value that is set for --with-match-limit. You can set a lower default limit by adding, for example,

  --with-match-limit-depth=10000
to the configure command. This value can be overridden at run time. This depth limit indirectly limits the amount of heap memory that is used, but because the size of each backtracking "frame" depends on the number of capturing parentheses in a pattern, the amount of heap that is used before the limit is reached varies from pattern to pattern. This limit was more useful in versions before 10.30, where function recursion was used for backtracking.

As well as applying to pcre2_match(), the depth limit also controls the depth of recursive function calls in pcre2_dfa_match(). These are used for lookaround assertions, atomic groups, and recursion within patterns. The limit does not apply to JIT matching.

LIMITING VARIABLE-LENGTH LOOKBEHIND ASSERTIONS

Lookbehind assertions in which one or more branches can match a variable number of characters are supported only if there is a maximum matching length for each top-level branch. There is a limit to this maximum that defaults to 255 characters. You can alter this default by a setting such as

  --with-max-varlookbehind=100
The limit can be changed at runtime by calling pcre2_set_max_varlookbehind(). Lookbehind assertions in which every branch matches a fixed number of characters (not necessarily all the same) are not constrained by this limit.

CREATING CHARACTER TABLES AT BUILD TIME

PCRE2 uses fixed tables for processing characters whose code points are less than 256. By default, PCRE2 is built with a set of tables that are distributed in the file src/pcre2_chartables.c.dist. These tables are for ASCII codes only. If you add

  --enable-rebuild-chartables
to the configure command, the distributed tables are no longer used. Instead, a program called pcre2_dftables is compiled and run. This outputs the source for a new set of tables, created in the default locale of your C run-time system. This method of replacing the tables does not work if you are cross compiling, because pcre2_dftables needs to be run on the local host and therefore not compiled with the cross compiler.

If you need to create alternative tables when cross compiling, you will have to do so "by hand". There may also be other reasons for creating tables manually. To cause pcre2_dftables to be built on the local host, run a normal compiling command, and then run the program with the output file as its argument, for example:

  cc src/pcre2_dftables.c -o pcre2_dftables
  ./pcre2_dftables src/pcre2_chartables.c
This builds the tables in the default locale of the local host. If you want to specify a locale, you must use the -L option:
  LC_ALL=fr_FR ./pcre2_dftables -L src/pcre2_chartables.c
You can also specify -b (with or without -L). This causes the tables to be written in binary instead of as source code. A set of binary tables can be loaded into memory by an application and passed to pcre2_compile() in the same way as tables created by calling pcre2_maketables(). The tables are just a string of bytes, independent of hardware characteristics such as endianness. This means they can be bundled with an application that runs in different environments, to ensure consistent behaviour.

USING EBCDIC CODE

PCRE2 assumes by default that it will run in an environment where the character code is ASCII or Unicode, which is a superset of ASCII. This is the case for most computer operating systems. PCRE2 can, however, be compiled to run in an 8-bit EBCDIC environment by adding

  --enable-ebcdic --disable-unicode
to the configure command. You should only use it if you know that you are in an EBCDIC environment (for example, an IBM mainframe operating system).

This setting implies --enable-rebuild-chartables, in order to ensure that you have the correct default character tables for your system's codepage. There is an exception when you set --enable-ebcdic-ignoring-compiler (see below), which allows using a default set of EBCDIC 1047 character tables rather than forcing use of --enable-rebuild-chartables.

It is not supported to enable both EBCDIC input and either ASCII or UTF-8/16/32 in the same build of the library. When PCRE2 is built with EBCDIC support, it always operates in EBCDIC, and consequently --enable-unicode and --enable-ebcdic are mutually exclusive.

The EBCDIC character that corresponds to an ASCII LF is assumed to have the value 0x15 by default. However, in some EBCDIC environments, 0x25 is used. In such an environment you should use

  --enable-ebcdic-nl25
(which implies --enable-ebcdic). The EBCDIC character for CR has the same value as in ASCII, namely, 0x0d. Whichever of 0x15 and 0x25 is not chosen as LF is made to correspond to the Unicode NEL character (which, in Unicode, is 0x85).

The options that select newline behaviour, such as --enable-newline-is-cr, and equivalent run-time options, refer to these character values in an EBCDIC environment.

On systems requiring an EBCDIC build of PCRE2, the compiler should be set to use the correct codepage, so that C character literals such as 'z' use the correct numeric value for whichever EBCDIC codepage is in use. (PCRE2 cannot support multiple EBCDIC codepages dynamically.) However, if this is not possible, then you can use

  --enable-ebcdic-ignoring-compiler
in order to disregard the compiler's codepage, and instead force PCRE2 to use numeric constants corresponding to the EBCDIC 1047 codepage. This can be used to build (or test) EBCDIC support on an ASCII/UTF-8 system such as Linux.

PCRE2GREP SUPPORT FOR EXTERNAL SCRIPTS

By default pcre2grep supports the use of callouts with string arguments within the patterns it is matching. There are two kinds: one that generates output using local code, and another that calls an external program or script. If --disable-pcre2grep-callout-fork is added to the configure command, only the first kind of callout is supported; if --disable-pcre2grep-callout is used, all callouts are completely ignored. For more details of pcre2grep callouts, see the pcre2grep documentation.

PCRE2GREP OPTIONS FOR COMPRESSED FILE SUPPORT

By default, pcre2grep reads all files as plain text. You can build it so that it recognizes files whose names end in .gz or .bz2, and reads them with libz or libbz2, respectively, by adding one or both of

  --enable-pcre2grep-libz
  --enable-pcre2grep-libbz2
to the configure command. These options naturally require that the relevant libraries are installed on your system. Configuration will fail if they are not.

PCRE2GREP BUFFER SIZE

pcre2grep uses an internal buffer to hold a "window" on the file it is scanning, in order to be able to output "before" and "after" lines when it finds a match. The default starting size of the buffer is 20KiB. The buffer itself is three times this size, but because of the way it is used for holding "before" lines, the longest line that is guaranteed to be processable is the notional buffer size. If a longer line is encountered, pcre2grep automatically expands the buffer, up to a specified maximum size, whose default is 1MiB or the starting size, whichever is the larger. You can change the default parameter values by adding, for example,

  --with-pcre2grep-bufsize=51200
  --with-pcre2grep-max-bufsize=2097152
to the configure command. The caller of pcre2grep can override these values by using --buffer-size and --max-buffer-size on the command line.

PCRE2TEST OPTION FOR LIBREADLINE SUPPORT

If you add one of

  --enable-pcre2test-libreadline
  --enable-pcre2test-libedit
to the configure command, pcre2test is linked with the libreadline or libedit library, respectively, and when its input is from a terminal, it reads it using the readline() function. This provides line-editing and history facilities. Note that libreadline is GPL-licensed, so if you distribute a binary of pcre2test linked in this way, there may be licensing issues. These can be avoided by linking instead with libedit, which has a BSD licence.

Setting --enable-pcre2test-libreadline causes the -lreadline option to be added to the pcre2test build. In many operating environments with a system-installed readline library this is sufficient. However, in some environments (e.g. if an unmodified distribution version of readline is in use), some extra configuration may be necessary. The INSTALL file for libreadline says this:

  "Readline uses the termcap functions, but does not link with
  the termcap or curses library itself, allowing applications
  which link with readline the to choose an appropriate library."
If your environment has not been set up so that an appropriate library is automatically included, you may need to add something like
  LIBS="-lncurses"
immediately before the configure command.

INCLUDING DEBUGGING CODE

If you add

  --enable-debug
to the configure command, additional debugging code is included in the build. This feature is intended for use by the PCRE2 maintainers.

DEBUGGING WITH VALGRIND SUPPORT

If you add

  --enable-valgrind
to the configure command, PCRE2 will use valgrind annotations to mark certain memory regions as unaddressable. This allows it to detect invalid memory accesses, and is mostly useful for debugging PCRE2 itself.

CODE COVERAGE REPORTING

If your C compiler is gcc, you can build a version of PCRE2 that can generate a code coverage report for its test suite. To enable this, you must install lcov version 1.6 or above. Then specify

  --enable-coverage
to the configure command and build PCRE2 in the usual way.

Note that using ccache (a caching C compiler) is incompatible with code coverage reporting. If you have configured ccache to run automatically on your system, you must set the environment variable

  CCACHE_DISABLE=1
before running make to build PCRE2, so that ccache is not used.

When --enable-coverage is used, the following additional targets are added to the Makefile:

  make coverage
This creates a fresh coverage report for the PCRE2 test suite. It is equivalent to running "make coverage-reset", "make coverage-baseline", "make check", and then "make coverage-report".
  make coverage-reset
This zeroes the coverage counters, but does nothing else.
  make coverage-baseline
This captures baseline coverage information.
  make coverage-report
This creates the coverage report.
  make coverage-clean-report
This removes the generated coverage report without cleaning the coverage data itself.
  make coverage-clean-data
This removes the captured coverage data without removing the coverage files created at compile time (*.gcno).
  make coverage-clean
This cleans all coverage data including the generated coverage report. For more information about code coverage, see the gcov and lcov documentation.

DISABLING THE Z AND T FORMATTING MODIFIERS

The C99 standard defines formatting modifiers z and t for size_t and ptrdiff_t values, respectively. By default, PCRE2 uses these modifiers in environments other than old versions of Microsoft Visual Studio when __STDC_VERSION__ is defined and has a value greater than or equal to 199901L (indicating support for C99). However, there is at least one environment that claims to be C99 but does not support these modifiers. If

  --disable-percent-zt
is specified, no use is made of the z or t modifiers. Instead of %td or %zu, a suitable format is used depending on the size of long for the platform.

SUPPORT FOR FUZZERS

There is a special option for use by people who want to run fuzzing tests on PCRE2:

  --enable-fuzz-support
At present this applies only to the 8-bit library. If set, it causes an extra library called libpcre2-fuzzsupport.a to be built, but not installed. This contains a single function called LLVMFuzzerTestOneInput() whose arguments are a pointer to a string and the length of the string. When called, this function tries to compile the string as a pattern, and if that succeeds, to match it. This is done both with no options and with some random option bits that are generated from the string.

Setting --enable-fuzz-support also causes a binary called pcre2fuzzcheck to be created. This is normally run under valgrind or used when PCRE2 is compiled with address sanitizing enabled. It calls the fuzzing function and outputs information about what it is doing. The input strings are specified by arguments: if an argument starts with "=" the rest of it is a literal input string. Otherwise, it is assumed to be a file name, and the contents of the file are the test string.

OBSOLETE OPTION

In versions of PCRE2 prior to 10.30, there were two ways of handling backtracking in the pcre2_match() function. The default was to use the system stack, but if

  --disable-stack-for-recursion
was set, memory on the heap was used. From release 10.30 onwards this has changed (the stack is no longer used) and this option now does nothing except give a warning.

SEE ALSO

pcre2api(3), pcre2-config(3).

AUTHOR

Philip Hazel
Retired from University Computing Service
Cambridge, England.

REVISION

Last updated: 17 October 2025
Copyright © 1997-2024 University of Cambridge.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2callout.html ================================================ pcre2callout specification

pcre2callout man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2.h>

int (*pcre2_callout)(pcre2_callout_block *, void *);

int pcre2_callout_enumerate(const pcre2_code *code, int (*callback)(pcre2_callout_enumerate_block *, void *), void *user_data);

DESCRIPTION

PCRE2 provides a feature called "callout", which is a means of temporarily passing control to the caller of PCRE2 in the middle of pattern matching. The caller of PCRE2 provides an external function by putting its entry point in a match context (see pcre2_set_callout() in the pcre2api documentation).

When using the pcre2_substitute() function, an additional callout feature is available. This does a callout after each change to the subject string and is described in the pcre2api documentation; the rest of this document is concerned with callouts during pattern matching.

Within a regular expression, (?C<arg>) indicates a point at which the external function is to be called. Different callout points can be identified by putting a number less than 256 after the letter C. The default value is zero. Alternatively, the argument may be a delimited string. The starting delimiter must be one of ` ' " ^ % # $ { and the ending delimiter is the same as the start, except for {, where the ending delimiter is }. If the ending delimiter is needed within the string, it must be doubled. For example, this pattern has two callout points:

  (?C1)abc(?C"some ""arbitrary"" text")def
If the PCRE2_AUTO_CALLOUT option bit is set when a pattern is compiled, PCRE2 automatically inserts callouts, all with number 255, before each item in the pattern except for immediately before or after an explicit callout. For example, if PCRE2_AUTO_CALLOUT is used with the pattern
  A(?C3)B
it is processed as if it were
  (?C255)A(?C3)B(?C255)
Here is a more complicated example:
  A(\d{2}|--)
With PCRE2_AUTO_CALLOUT, this pattern is processed as if it were
  (?C255)A(?C255)((?C255)\d{2}(?C255)|(?C255)-(?C255)-(?C255))(?C255)
Notice that there is a callout before and after each parenthesis and alternation bar. If the pattern contains a conditional group whose condition is an assertion, an automatic callout is inserted immediately before the condition. Such a callout may also be inserted explicitly, for example:
  (?(?C9)(?=a)ab|de)  (?(?C%text%)(?!=d)ab|de)
This applies only to assertion conditions (because they are themselves independent groups).

Callouts can be useful for tracking the progress of pattern matching. The pcre2test program has a pattern qualifier (/auto_callout) that sets automatic callouts. When any callouts are present, the output from pcre2test indicates how the pattern is being matched. This is useful information when you are trying to optimize the performance of a particular pattern.

MISSING CALLOUTS

You should be aware that, because of optimizations in the way PCRE2 compiles and matches patterns, callouts sometimes do not happen exactly as you might expect.

Auto-possessification

At compile time, PCRE2 "auto-possessifies" repeated items when it knows that what follows cannot be part of the repeat. For example, a+[bc] is compiled as if it were a++[bc]. The pcre2test output when this pattern is compiled with PCRE2_ANCHORED and PCRE2_AUTO_CALLOUT and then applied to the string "aaaa" is:

  --->aaaa
   +0 ^        a+
   +2 ^   ^    [bc]
  No match
This indicates that when matching [bc] fails, there is no backtracking into a+ (because it is being treated as a++) and therefore the callouts that would be taken for the backtracks do not occur. You can disable the auto-possessify feature by passing PCRE2_NO_AUTO_POSSESS to pcre2_compile(), or starting the pattern with (*NO_AUTO_POSSESS). In this case, the output changes to this:
  --->aaaa
   +0 ^        a+
   +2 ^   ^    [bc]
   +2 ^  ^     [bc]
   +2 ^ ^      [bc]
   +2 ^^       [bc]
  No match
This time, when matching [bc] fails, the matcher backtracks into a+ and tries again, repeatedly, until a+ itself fails.

Automatic .* anchoring

By default, an optimization is applied when .* is the first significant item in a pattern. If PCRE2_DOTALL is set, so that the dot can match any character, the pattern is automatically anchored. If PCRE2_DOTALL is not set, a match can start only after an internal newline or at the beginning of the subject, and pcre2_compile() remembers this. If a pattern has more than one top-level branch, automatic anchoring occurs if all branches are anchorable.

This optimization is disabled, however, if .* is in an atomic group or if there is a backreference to the capture group in which it appears. It is also disabled if the pattern contains (*PRUNE) or (*SKIP). However, the presence of callouts does not affect it.

For example, if the pattern .*\d is compiled with PCRE2_AUTO_CALLOUT and applied to the string "aa", the pcre2test output is:

  --->aa
   +0 ^      .*
   +2 ^ ^    \d
   +2 ^^     \d
   +2 ^      \d
  No match
This shows that all match attempts start at the beginning of the subject. In other words, the pattern is anchored. You can disable this optimization by passing PCRE2_NO_DOTSTAR_ANCHOR to pcre2_compile(), or starting the pattern with (*NO_DOTSTAR_ANCHOR). In this case, the output changes to:
  --->aa
   +0 ^      .*
   +2 ^ ^    \d
   +2 ^^     \d
   +2 ^      \d
   +0  ^     .*
   +2  ^^    \d
   +2  ^     \d
  No match
This shows more match attempts, starting at the second subject character. Another optimization, described in the next section, means that there is no subsequent attempt to match with an empty subject.

Other optimizations

Other optimizations that provide fast "no match" results also affect callouts. For example, if the pattern is

  ab(?C4)cd
PCRE2 knows that any matching string must contain the letter "d". If the subject string is "abyz", the lack of "d" means that matching doesn't ever start, and the callout is never reached. However, with "abyd", though the result is still no match, the callout is obeyed.

For most patterns PCRE2 also knows the minimum length of a matching string, and will immediately give a "no match" return without actually running a match if the subject is not long enough, or, for unanchored patterns, if it has been scanned far enough.

You can disable these optimizations by passing the PCRE2_NO_START_OPTIMIZE option to pcre2_compile(), or by starting the pattern with (*NO_START_OPT). This slows down the matching process, but does ensure that callouts such as the example above are obeyed.

THE CALLOUT INTERFACE

During matching, when PCRE2 reaches a callout point, if an external function is provided in the match context, it is called. This applies to normal, DFA, and JIT matching alike. The first argument to the callout function is a pointer to a pcre2_callout block. The second argument is the void * callout data that was supplied when the callout was set up by calling pcre2_set_callout() (see the pcre2api documentation). The callout block structure contains the following fields, not necessarily in this order:

  uint32_t      version;
  uint32_t      callout_number;
  uint32_t      capture_top;
  uint32_t      capture_last;
  uint32_t      callout_flags;
  PCRE2_SIZE   *offset_vector;
  PCRE2_SPTR    mark;
  PCRE2_SPTR    subject;
  PCRE2_SIZE    subject_length;
  PCRE2_SIZE    start_match;
  PCRE2_SIZE    current_position;
  PCRE2_SIZE    pattern_position;
  PCRE2_SIZE    next_item_length;
  PCRE2_SIZE    callout_string_offset;
  PCRE2_SIZE    callout_string_length;
  PCRE2_SPTR    callout_string;
The version field contains the version number of the block format. The current version is 2; the three callout string fields were added for version 1, and the callout_flags field for version 2. If you are writing an application that might use an earlier release of PCRE2, you should check the version number before accessing any of these fields. The version number will increase in future if more fields are added, but the intention is never to remove any of the existing fields.

Fields for numerical callouts

For a numerical callout, callout_string is NULL, and callout_number contains the number of the callout, in the range 0-255. This is the number that follows (?C for callouts that are part of the pattern; it is 255 for automatically generated callouts.

Fields for string callouts

For callouts with string arguments, callout_number is always zero, and callout_string points to the string that is contained within the compiled pattern. Its length is given by callout_string_length. Duplicated ending delimiters that were present in the original pattern string have been turned into single characters, but there is no other processing of the callout string argument. An additional code unit containing binary zero is present after the string, but is not included in the length. The delimiter that was used to start the string is also stored within the pattern, immediately before the string itself. You can access this delimiter as callout_string[-1] if you need it.

The callout_string_offset field is the code unit offset to the start of the callout argument string within the original pattern string. This is provided for the benefit of applications such as script languages that might need to report errors in the callout string within the pattern.

Fields for all callouts

The remaining fields in the callout block are the same for both kinds of callout.

The offset_vector field is a pointer to a vector of capturing offsets (the "ovector"). You may read the elements in this vector, but you must not change any of them.

For calls to pcre2_match(), the offset_vector field is not (since release 10.30) a pointer to the actual ovector that was passed to the matching function in the match data block. Instead it points to an internal ovector of a size large enough to hold all possible captured substrings in the pattern. Note that whenever a recursion or subroutine call within a pattern completes, the capturing state is reset to what it was before.

The capture_last field contains the number of the most recently captured substring, and the capture_top field contains one more than the number of the highest numbered captured substring so far. If no substrings have yet been captured, the value of capture_last is 0 and the value of capture_top is 1. The values of these fields do not always differ by one; for example, when the callout in the pattern ((a)(b))(?C2) is taken, capture_last is 1 but capture_top is 4.

The contents of ovector[2] to ovector[<capture_top>*2-1] can be inspected in order to extract substrings that have been matched so far, in the same way as extracting substrings after a match has completed. The values in ovector[0] and ovector[1] are always PCRE2_UNSET because the match is by definition not complete. Substrings that have not been captured but whose numbers are less than capture_top also have both of their ovector slots set to PCRE2_UNSET.

For DFA matching, the offset_vector field points to the ovector that was passed to the matching function in the match data block for callouts at the top level, but to an internal ovector during the processing of pattern recursions, lookarounds, and atomic groups. However, these ovectors hold no useful information because pcre2_dfa_match() does not support substring capturing. The value of capture_top is always 1 and the value of capture_last is always 0 for DFA matching.

The subject and subject_length fields contain copies of the values that were passed to the matching function.

The start_match field normally contains the offset within the subject at which the current match attempt started. However, if the escape sequence \K has been encountered, this value is changed to reflect the modified starting point. If the pattern is not anchored, the callout function may be called several times from the same point in the pattern for different starting points in the subject.

The current_position field contains the offset within the subject of the current match pointer.

The pattern_position field contains the offset in the pattern string to the next item to be matched.

The next_item_length field contains the length of the next item to be processed in the pattern string. When the callout is at the end of the pattern, the length is zero. When the callout precedes an opening parenthesis, the length includes meta characters that follow the parenthesis. For example, in a callout before an assertion such as (?=ab) the length is 3. For an alternation bar or a closing parenthesis, the length is one, unless a closing parenthesis is followed by a quantifier, in which case its length is included. (This changed in release 10.23. In earlier releases, before an opening parenthesis the length was that of the entire group, and before an alternation bar or a closing parenthesis the length was zero.)

The pattern_position and next_item_length fields are intended to help in distinguishing between different automatic callouts, which all have the same callout number. However, they are set for all callouts, and are used by pcre2test to show the next item to be matched when displaying callout information.

In callouts from pcre2_match() the mark field contains a pointer to the zero-terminated name of the most recently passed (*MARK), (*PRUNE), or (*THEN) item in the match, or NULL if no such items have been passed. Instances of (*PRUNE) or (*THEN) without a name do not obliterate a previous (*MARK). In callouts from the DFA matching function this field always contains NULL.

The callout_flags field is always zero in callouts from pcre2_dfa_match() or when JIT is being used. When pcre2_match() without JIT is used, the following bits may be set:

  PCRE2_CALLOUT_STARTMATCH
This is set for the first callout after the start of matching for each new starting position in the subject.
  PCRE2_CALLOUT_BACKTRACK
This is set if there has been a matching backtrack since the previous callout, or since the start of matching if this is the first callout from a pcre2_match() run.

Both bits are set when a backtrack has caused a "bumpalong" to a new starting position in the subject. Output from pcre2test does not indicate the presence of these bits unless the callout_extra modifier is set.

The information in the callout_flags field is provided so that applications can track and tell their users how matching with backtracking is done. This can be useful when trying to optimize patterns, or just to understand how PCRE2 works. There is no support in pcre2_dfa_match() because there is no backtracking in DFA matching, and there is no support in JIT because JIT is all about maximizing matching performance. In both these cases the callout_flags field is always zero.

RETURN VALUES FROM CALLOUTS

The external callout function returns an integer to PCRE2. If the value is zero, matching proceeds as normal. If the value is greater than zero, matching fails at the current point, but the testing of other matching possibilities goes ahead, just as if a lookahead assertion had failed. If the value is less than zero, the match is abandoned, and the matching function returns the negative value.

Negative values should normally be chosen from the set of PCRE2_ERROR_xxx values. In particular, PCRE2_ERROR_NOMATCH forces a standard "no match" failure. The error number PCRE2_ERROR_CALLOUT is reserved for use by callout functions; it will never be used by PCRE2 itself.

CALLOUT ENUMERATION

int pcre2_callout_enumerate(const pcre2_code *code, int (*callback)(pcre2_callout_enumerate_block *, void *), void *user_data);

A script language that supports the use of string arguments in callouts might like to scan all the callouts in a pattern before running the match. This can be done by calling pcre2_callout_enumerate(). The first argument is a pointer to a compiled pattern, the second points to a callback function, and the third is arbitrary user data. The callback function is called for every callout in the pattern in the order in which they appear. Its first argument is a pointer to a callout enumeration block, and its second argument is the user_data value that was passed to pcre2_callout_enumerate(). The data block contains the following fields:

  version                Block version number
  pattern_position       Offset to next item in pattern
  next_item_length       Length of next item in pattern
  callout_number         Number for numbered callouts
  callout_string_offset  Offset to string within pattern
  callout_string_length  Length of callout string
  callout_string         Points to callout string or is NULL
The version number is currently 0. It will increase if new fields are ever added to the block. The remaining fields are the same as their namesakes in the pcre2_callout block that is used for callouts during matching, as described above.

Note that the value of pattern_position is unique for each callout. However, if a callout occurs inside a group that is quantified with a non-zero minimum or a fixed maximum, the group is replicated inside the compiled pattern. For example, a pattern such as /(a){2}/ is compiled as if it were /(a)(a)/. This means that the callout will be enumerated more than once, but with the same value for pattern_position in each case.

The callback function should normally return zero. If it returns a non-zero value, scanning the pattern stops, and that value is returned from pcre2_callout_enumerate().

AUTHOR

Philip Hazel
Retired from University Computing Service
Cambridge, England.

REVISION

Last updated: 26 February 2025
Copyright © 1997-2024 University of Cambridge.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2compat.html ================================================ pcre2compat specification

pcre2compat man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

DIFFERENCES BETWEEN PCRE2 AND PERL

This document describes some of the known differences in the ways that PCRE2 and Perl handle regular expressions. The differences described here are with respect to Perl version 5.38.0, but as both Perl and PCRE2 are continually changing, the information may at times be out of date.

1. When PCRE2_DOTALL (equivalent to Perl's /s qualifier) is not set, the behaviour of the '.' metacharacter differs from Perl. In PCRE2, '.' matches the next character unless it is the start of a newline sequence. This means that, if the newline setting is CR, CRLF, or NUL, '.' will match the code point LF (0x0A) in ASCII/Unicode environments, and NL (either 0x15 or 0x25) when using EBCDIC. In Perl, '.' appears never to match LF, even when 0x0A is not a newline indicator.

2. PCRE2 has only a subset of Perl's Unicode support. Details of what it does have are given in the pcre2unicode page.

3. Like Perl, PCRE2 allows repeat quantifiers on parenthesized assertions, but they do not mean what you might think. For example, (?!a){3} does not assert that the next three characters are not "a". It just asserts that the next character is not "a" three times (in principle; PCRE2 optimizes this to run the assertion just once). Perl allows some repeat quantifiers on other assertions, for example, \b* , but these do not seem to have any use. PCRE2 does not allow any kind of quantifier on non-lookaround assertions.

4. If a braced quantifier such as {1,2} appears where there is nothing to repeat (for example, at the start of a branch), PCRE2 raises an error whereas Perl treats the quantifier characters as literal. When a braced quantifier (...){min,max} has min > max, Perl treats it as an item which fails to match any portion of the subject (as no number of repetitions can meet the condition), and additionally issues a warning when in warning mode. PCRE2 has no warning features, so it gives an error in this case.

5. Capture groups that occur inside negative lookaround assertions are counted, but their entries in the offsets vector are set only when a negative assertion is a condition that has a matching branch (that is, the condition is false). Perl may set such capture groups in other circumstances.

6. The following Perl escape sequences are not supported: \F, \l, \L, \u, \U, and \N when followed by a character name. \N on its own, matching a non-newline character, and \N{U+dd..}, matching a Unicode code point, are supported. The escapes that modify the case of following letters are implemented by Perl's general string-handling and are not part of its pattern matching engine. If any of these are encountered by PCRE2, an error is generated by default. However, if either of the PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX options is set, \U and \u are interpreted as ECMAScript interprets them.

7. The Perl escape sequences \p, \P, and \X are supported only if PCRE2 is built with Unicode support (the default). The properties that can be tested with \p and \P are limited to the general category properties such as Lu and Nd, the derived properties Any and Lc (synonym L&), script names such as Greek or Han, Bidi_Class, Bidi_Control, and a few binary properties. Both PCRE2 and Perl support the Cs (surrogate) property, but in PCRE2 its use is limited. See the pcre2pattern documentation for details. The long synonyms for property names that Perl supports (such as \p{Letter}) are not supported by PCRE2, nor is it permitted to prefix any of these properties with "Is".

8. PCRE2 supports the \Q...\E escape for quoting substrings. Characters in between are treated as literals. However, this is slightly different from Perl in that $ and @ are also handled as literals inside the quotes. In Perl, they cause variable interpolation (PCRE2 does not have variables). Also, Perl does "double-quotish backslash interpolation" on any backslashes between \Q and \E which, its documentation says, "may lead to confusing results". PCRE2 treats a backslash between \Q and \E just like any other character. Note the following examples:

    Pattern            PCRE2 matches     Perl matches

    \Qabc$xyz\E        abc$xyz           abc followed by the contents of $xyz
    \Qabc\$xyz\E       abc\$xyz          abc\$xyz
    \Qabc\E\$\Qxyz\E   abc$xyz           abc$xyz
    \QA\B\E            A\B               A\B
    \Q\\E              \                 \\E
The \Q...\E sequence is recognized both inside and outside character classes by both PCRE2 and Perl. Another difference from Perl is that any appearance of \Q or \E inside what might otherwise be a quantifier causes PCRE2 not to recognize the sequence as a quantifier. Perl recognizes a quantifier if (redundantly) either of the numbers is inside \Q...\E, but not if the separating comma is. When not recognized as a quantifier a sequence such as {\Q1\E,2} is treated as the literal string "{1,2}".

9. Fairly obviously, PCRE2 does not support the (?{code}) and (??{code}) constructions. However, PCRE2 does have a "callout" feature, which allows an external function to be called during pattern matching. See the pcre2callout documentation for details.

10. Subroutine calls (whether recursive or not) were treated as atomic groups up to PCRE2 release 10.23, but from release 10.30 this changed, and backtracking into subroutine calls is now supported, as in Perl.

11. In PCRE2, if any of the backtracking control verbs are used in a group that is called as a subroutine (whether or not recursively), their effect is confined to that group; it does not extend to the surrounding pattern. This is not always the case in Perl. In particular, if (*THEN) is present in a group that is called as a subroutine, its action is limited to that group, even if the group does not contain any | characters. Note that such groups are processed as anchored at the point where they are tested. PCRE2 also confines all control verbs within atomic assertions, again including (*THEN) in assertions with only one branch.

12. If a pattern contains more than one backtracking control verb, the first one that is backtracked onto acts. For example, in the pattern A(*COMMIT)B(*PRUNE)C a failure in B triggers (*COMMIT), but a failure in C triggers (*PRUNE). Perl's behaviour is more complex; in many cases it is the same as PCRE2, but there are cases where it differs.

13. There are some differences that are concerned with the settings of captured strings when part of a pattern is repeated. For example, matching "aba" against the pattern /^(a(b)?)+$/ in Perl leaves $2 unset, but in PCRE2 it is set to "b".

14. PCRE2's handling of duplicate capture group numbers and names is not as general as Perl's. This is a consequence of the fact that PCRE2 works internally just with numbers, using an external table to translate between numbers and names. In particular, a pattern such as (?|(?<a>A)|(?<b>B)), where the two capture groups have the same number but different names, is not supported, and causes an error at compile time. If it were allowed, it would not be possible to distinguish which group matched, because both names map to capture group number 1. To avoid this confusing situation, an error is given at compile time.

15. Perl used to recognize comments in some places that PCRE2 does not, for example, between the ( and ? at the start of a group. If the /x modifier is set, Perl allowed white space between ( and ? though the latest Perls give an error (for a while it was just deprecated). There may still be some cases where Perl behaves differently.

16. Perl, when in warning mode, gives warnings for character classes such as [A-\d] or [a-[:digit:]]. It then treats the hyphens as literals. PCRE2 has no warning features, so it gives an error in these cases because they are almost certainly user mistakes.

17. In PCRE2, until release 10.45, the upper/lower case character properties Lu and Ll were not affected when case-independent matching was specified. Perl has changed in this respect, and PCRE2 has now changed to match. When caseless matching is in force, Lu, Ll, and Lt (title case) are all treated as Lc (cased letter).

18. From release 5.32.0, Perl locks out the use of \K in lookaround assertions. From release 10.38 PCRE2 does the same by default. However, there is an option for re-enabling the previous behaviour. When this option is set, \K is acted on when it occurs in positive assertions, but is ignored in negative assertions.

19. PCRE2 provides some extensions to the Perl regular expression facilities. Perl 5.10 included new features that were not in earlier versions of Perl, some of which (such as named parentheses) were in PCRE2 for some time before. This list is with respect to Perl 5.38:

(a) If PCRE2_DOLLAR_ENDONLY is set and PCRE2_MULTILINE is not set, the $ meta-character matches only at the very end of the string.

(b) A backslash followed by a letter with no special meaning is faulted. (Perl can be made to issue a warning.)

(c) If PCRE2_UNGREEDY is set, the greediness of the repetition quantifiers is inverted, that is, by default they are not greedy, but if followed by a question mark they are.

(d) PCRE2_ANCHORED can be used at matching time to force a pattern to be tried only at the first matching position in the subject string.

(e) The PCRE2_NOTBOL, PCRE2_NOTEOL, PCRE2_NOTEMPTY and PCRE2_NOTEMPTY_ATSTART options have no Perl equivalents.

(f) The \R escape sequence can be restricted to match only CR, LF, or CRLF by the PCRE2_BSR_ANYCRLF option.

(g) The callout facility is PCRE2-specific. Perl supports codeblocks and variable interpolation, but not general hooks on every match.

(h) The partial matching facility is PCRE2-specific.

(i) The alternative matching function (pcre2_dfa_match()) matches in a different way and is not Perl-compatible.

(j) PCRE2 recognizes some special sequences such as (*CR) or (*NO_JIT) at the start of a pattern. These set overall options that cannot be changed within the pattern.

(k) PCRE2 supports non-atomic positive lookaround assertions. This is an extension to the lookaround facilities. The default, Perl-compatible lookarounds are atomic.

(l) There are three syntactical items in patterns that can refer to a capturing group by number: back references such as \g{2}, subroutine calls such as (?3), and condition references such as (?(4)...). PCRE2 supports relative group numbers such as +2 and -4 in all three cases. Perl supports both plus and minus for subroutine calls, but only minus for back references, and no relative numbering at all for conditions.

(m) The scan substring assertion (syntax (*scs:(n)...)) is a PCRE2 extension that is not available in Perl.

20. Perl has different limits than PCRE2. See the pcre2limits documentation for details. With release 5.10, Perl switched from recursion to iteration, keeping the intermediate matches on the heap, which is ~10% slower but does not run into any stack-overflow limit. PCRE2 made a similar change at release 10.30, and also has many build-time and run-time customizable limits.

21. Unlike Perl, PCRE2 doesn't have character set modifiers, and especially has no way to set characters by context as Perl's "/d" does. A regular expression using PCRE2_UTF and PCRE2_UCP will use similar rules to Perl's "/u"; something closer to "/a" could be selected by adding other PCRE2_EXTRA_ASCII* options on top.

22. Some recursive patterns that Perl diagnoses as infinite recursions can be handled by PCRE2, either by the interpreter or the JIT. An example is /(?:|(?0)abcd)(?(R)|\z)/, which matches a sequence of any number of repeated "abcd" substrings at the end of the subject.

23. Both PCRE2 and Perl error when \x{ escapes are invalid, but Perl tries to recover and prints a warning if the problem was that an invalid hexadecimal digit was found. Since PCRE2 doesn't have warnings it returns an error instead. Additionally, Perl accepts \x{} and generates NUL unlike PCRE2.

24. From release 10.45, PCRE2 gives an error if \x is not followed by a hexadecimal digit or a curly bracket. It used to interpret this as the NUL character. Perl still generates NUL, but warns when in warning mode in most cases.

AUTHOR

Philip Hazel
Retired from University Computing Service
Cambridge, England.

REVISION

Last updated: 02 June 2025
Copyright © 1997-2024 University of Cambridge.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2convert.html ================================================ pcre2convert specification

pcre2convert man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

EXPERIMENTAL PATTERN CONVERSION FUNCTIONS

This document describes a set of functions that can be used to convert "foreign" patterns into PCRE2 regular expressions. This facility is currently experimental, and may be changed in future releases. Two kinds of pattern, globs and POSIX patterns, are supported.

THE CONVERT CONTEXT

pcre2_convert_context *pcre2_convert_context_create( pcre2_general_context *gcontext);

pcre2_convert_context *pcre2_convert_context_copy( pcre2_convert_context *cvcontext);

void pcre2_convert_context_free(pcre2_convert_context *cvcontext);

int pcre2_set_glob_escape(pcre2_convert_context *cvcontext, uint32_t escape_char);

int pcre2_set_glob_separator(pcre2_convert_context *cvcontext, uint32_t separator_char);

A convert context is used to hold parameters that affect the way that pattern conversion works. Like all PCRE2 contexts, you need to use a context only if you want to override the defaults. There are the usual create, copy, and free functions. If custom memory management functions are set in a general context that is passed to pcre2_convert_context_create(), they are used for all memory management within the conversion functions.

There are only two parameters in the convert context at present. Both apply only to glob conversions. The escape character defaults to grave accent under Windows, otherwise backslash. It can be set to zero, meaning no escape character, or to any punctuation character with a code point less than 256. The separator character defaults to backslash under Windows, otherwise forward slash. It can be set to forward slash, backslash, or dot.

The two setting functions return zero on success, or PCRE2_ERROR_BADDATA if their second argument is invalid.

THE CONVERSION FUNCTION

int pcre2_pattern_convert(PCRE2_SPTR pattern, PCRE2_SIZE length, uint32_t options, PCRE2_UCHAR **buffer, PCRE2_SIZE *blength, pcre2_convert_context *cvcontext);

void pcre2_converted_pattern_free(PCRE2_UCHAR *converted_pattern);

The first two arguments of pcre2_pattern_convert() define the foreign pattern that is to be converted. The length may be given as PCRE2_ZERO_TERMINATED. The options argument defines how the pattern is to be processed. If the input is UTF, the PCRE2_CONVERT_UTF option should be set. PCRE2_CONVERT_NO_UTF_CHECK may also be set if you are sure the input is valid. One or more of the glob options, or one of the following POSIX options must be set to define the type of conversion that is required:

  PCRE2_CONVERT_GLOB
  PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR
  PCRE2_CONVERT_GLOB_NO_STARSTAR
  PCRE2_CONVERT_POSIX_BASIC
  PCRE2_CONVERT_POSIX_EXTENDED
Details of the conversions are given below. The buffer and blength arguments define how the output is handled:

If buffer is NULL, the function just returns the length of the converted pattern via blength. This is one less than the length of buffer needed, because a terminating zero is always added to the output.

If buffer points to a NULL pointer, an output buffer is obtained using the allocator in the context or malloc() if no context is supplied. A pointer to this buffer is placed in the variable to which buffer points. When no longer needed the output buffer must be freed by calling pcre2_converted_pattern_free(). If this function is called with a NULL argument, it returns immediately without doing anything.

If buffer points to a non-NULL pointer, blength must be set to the actual length of the buffer provided (in code units).

In all cases, after successful conversion, the variable pointed to by blength is updated to the length actually used (in code units), excluding the terminating zero that is always added.

If an error occurs, the length (via blength) is set to the offset within the input pattern where the error was detected. Only gross syntax errors are caught; there are plenty of errors that will get passed on for pcre2_compile() to discover.

The return from pcre2_pattern_convert() is zero on success or a non-zero PCRE2 error code. Note that PCRE2 error codes may be positive or negative: pcre2_compile() uses mostly positive codes and pcre2_match() negative ones; pcre2_pattern_convert() uses existing codes of both kinds. A textual error message can be obtained by calling pcre2_get_error_message().

CONVERTING GLOBS

Globs are used to match file names, and consequently have the concept of a "path separator", which defaults to backslash under Windows and forward slash otherwise. If PCRE2_CONVERT_GLOB is set, the wildcards * and ? are not permitted to match separator characters, but the double-star (**) feature (which does match separators) is supported.

PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR matches globs with wildcards allowed to match separator characters. PCRE2_CONVERT_GLOB_NO_STARSTAR matches globs with the double-star feature disabled. These options may be given together.

CONVERTING POSIX PATTERNS

POSIX defines two kinds of regular expression pattern: basic and extended. These can be processed by setting PCRE2_CONVERT_POSIX_BASIC or PCRE2_CONVERT_POSIX_EXTENDED, respectively.

In POSIX patterns, backslash is not special in a character class. Unmatched closing parentheses are treated as literals.

In basic patterns, ? + | {} and () must be escaped to be recognized as metacharacters outside a character class. If the first character in the pattern is * it is treated as a literal. ^ is a metacharacter only at the start of a branch.

In extended patterns, a backslash not in a character class always makes the next character literal, whatever it is. There are no backreferences.

Note: POSIX mandates that the longest possible match at the first matching position must be found. This is not what pcre2_match() does; it yields the first match that is found. An application can use pcre2_dfa_match() to find the longest match, but that does not support backreferences (but then neither do POSIX extended patterns).

AUTHOR

Philip Hazel
Retired from University Computing Service
Cambridge, England.

REVISION

Last updated: 14 November 2023
Copyright © 1997-2018 University of Cambridge.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2demo.html ================================================ pcre2demo specification

pcre2demo man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SOURCE CODE

/*************************************************
*           PCRE2 DEMONSTRATION PROGRAM          *
*************************************************/

/* This is a demonstration program to illustrate a straightforward way of
using the PCRE2 regular expression library from a C program. See the
pcre2sample documentation for a short discussion ("man pcre2sample" if you have
the PCRE2 man pages installed). PCRE2 is a revised API for the library, and is
incompatible with the original PCRE API.

There are actually three libraries, each supporting a different code unit
width. This demonstration program uses the 8-bit library. The default is to
process each code unit as a separate character, but if the pattern begins with
"(*UTF)", both it and the subject are treated as UTF-8 strings, where
characters may occupy multiple code units.

In Unix-like environments, if PCRE2 is installed in your standard system
libraries, you should be able to compile this program using this command:

cc -Wall pcre2demo.c -lpcre2-8 -o pcre2demo

If PCRE2 is not installed in a standard place, it is likely to be installed
with support for the pkg-config mechanism. If you have pkg-config, you can
compile this program using this command:

cc -Wall pcre2demo.c `pkg-config --cflags --libs libpcre2-8` -o pcre2demo

If you do not have pkg-config, you may have to use something like this:

cc -Wall pcre2demo.c -I/usr/local/include -L/usr/local/lib \
  -R/usr/local/lib -lpcre2-8 -o pcre2demo

Replace "/usr/local/include" and "/usr/local/lib" with wherever the include and
library files for PCRE2 are installed on your system. Only some operating
systems (Solaris is one) use the -R option.

Building under Windows:

If you want to statically link this program against a non-dll .a file, you must
define PCRE2_STATIC before including pcre2.h, so in this environment, uncomment
the following line. */

/* #define PCRE2_STATIC */

/* The PCRE2_CODE_UNIT_WIDTH macro must be defined before including pcre2.h.
For a program that uses only one code unit width, setting it to 8, 16, or 32
makes it possible to use generic function names such as pcre2_compile(). Note
that just changing 8 to 16 (for example) is not sufficient to convert this
program to process 16-bit characters. Even in a fully 16-bit environment, where
string-handling functions such as strcmp() and printf() work with 16-bit
characters, the code for handling the table of named substrings will still need
to be modified. */

#define PCRE2_CODE_UNIT_WIDTH 8

#include <stdio.h>
#include <string.h>
#include <pcre2.h>


/**************************************************************************
* Here is the program. The API includes the concept of "contexts" for     *
* setting up unusual interface requirements for compiling and matching,   *
* such as custom memory managers and non-standard newline definitions.    *
* This program does not do any of this, so it makes no use of contexts,   *
* always passing NULL where a context could be given.                     *
**************************************************************************/

int main(int argc, char **argv)
{
pcre2_code *re;
PCRE2_SPTR pattern;     /* PCRE2_SPTR is a pointer to unsigned code units of */
PCRE2_SPTR subject;     /* the appropriate width (in this case, 8 bits). */
PCRE2_SPTR name_table;

int errornumber;
int find_all, caseless_match;
int i;
int rc;

uint32_t namecount;
uint32_t name_entry_size;

PCRE2_SIZE erroroffset;
PCRE2_SIZE *ovector;
PCRE2_SIZE ovector_last[2];
PCRE2_SIZE subject_length;

pcre2_match_data *match_data;


/**************************************************************************
* First, sort out the command line. Options:                              *
* - "-g" to request repeated matching to find all occurrences,            *
*   like Perl's /g option. We set the variable find_all to a non-zero     *
*   value if the -g option is present.                                    *
* - "-i" to request caseless matching, like Perl's /i option.  We set the *
*   variable caseless_match to PCRE2_CASELESS if the -i option is         *
*   present.                                                              *
**************************************************************************/

find_all = 0;
caseless_match = 0;
for (i = 1; i < argc; i++)
  {
  if (strcmp(argv[i], "-g") == 0) find_all = 1;
  else if (strcmp(argv[i], "-i") == 0) caseless_match = PCRE2_CASELESS;
  else if (argv[i][0] == '-')
    {
    printf("Unrecognised option %s\n", argv[i]);
    return 1;
    }
  else break;
  }

/* After the options, we require exactly two arguments, which are the pattern,
and the subject string. */

if (argc - i != 2)
  {
  printf("Exactly two arguments required: a regex and a subject string\n");
  return 1;
  }

/* Pattern and subject are char arguments, so they can be straightforwardly
cast to PCRE2_SPTR because we are working in 8-bit code units. The subject
length is cast to PCRE2_SIZE for completeness, though PCRE2_SIZE is in fact
defined to be size_t. */

pattern = (PCRE2_SPTR)argv[i];
subject = (PCRE2_SPTR)argv[i+1];
subject_length = (PCRE2_SIZE)strlen((char *)subject);


/*************************************************************************
* Now we are going to compile the regular expression pattern, and handle *
* any errors that are detected.                                          *
*************************************************************************/

re = pcre2_compile(
  pattern,               /* the pattern */
  PCRE2_ZERO_TERMINATED, /* indicates pattern is zero-terminated */
  caseless_match,        /* possibly enable caseless */
  &errornumber,          /* for error number */
  &erroroffset,          /* for error offset */
  NULL);                 /* use default compile context */

/* Compilation failed: print the error message and exit. */

if (re == NULL)
  {
  PCRE2_UCHAR buffer[256];
  pcre2_get_error_message(errornumber, buffer, sizeof(buffer));
  printf("PCRE2 compilation failed at offset %d: %s\n", (int)erroroffset,
    buffer);
  return 1;
  }


/*************************************************************************
* If the compilation succeeded, we call PCRE2 again, in order to do a    *
* pattern match against the subject string. This does just ONE match. If *
* further matching is needed, it will be done below. Before running the  *
* match we must set up a match_data block for holding the result. Using  *
* pcre2_match_data_create_from_pattern() ensures that the block is       *
* exactly the right size for the number of capturing parentheses in the  *
* pattern. If you need to know the actual size of a match_data block as  *
* a number of bytes, you can find it like this:                          *
*                                                                        *
* PCRE2_SIZE match_data_size = pcre2_get_match_data_size(match_data);    *
*************************************************************************/

match_data = pcre2_match_data_create_from_pattern(re, NULL);

/* Now run the match. */

rc = pcre2_match(
  re,                   /* the compiled pattern */
  subject,              /* the subject string */
  subject_length,       /* the length of the subject */
  0,                    /* start at offset 0 in the subject */
  0,                    /* default options */
  match_data,           /* block for storing the result */
  NULL);                /* use default match context */

/* Matching failed: handle error cases */

if (rc < 0)
  {
  switch(rc)
    {
    case PCRE2_ERROR_NOMATCH: printf("No match\n"); break;
    /*
    Handle other special cases if you like
    */
    default: printf("Matching error %d\n", rc); break;
    }
  pcre2_match_data_free(match_data);   /* Release memory used for the match */
  pcre2_code_free(re);                 /*   data and the compiled pattern. */
  return 1;
  }

/* Match succeeded. Get a pointer to the output vector, where string offsets
are stored. */

ovector = pcre2_get_ovector_pointer(match_data);
printf("Match succeeded at offset %d\n", (int)ovector[0]);


/*************************************************************************
* We have found the first match within the subject string. If the output *
* vector wasn't big enough, say so. Then output any substrings that were *
* captured.                                                              *
*************************************************************************/

/* The output vector wasn't big enough. This should not happen, because we used
pcre2_match_data_create_from_pattern() above. */

if (rc == 0)
  printf("ovector was not big enough for all the captured substrings\n");

/* Since release 10.38 PCRE2 has locked out the use of \K in lookaround
assertions. This is the recommended behaviour. However, the option
PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK allows applications to re-enable the old
behaviour. If that is set, it is possible to run patterns such as /(?=.\K)/ that
use \K in an assertion to set the start of a match later than its end. In this
demonstration program, we show how to detect this case, although it cannot arise
because the option is never set. */

if (ovector[0] > ovector[1])
  {
  printf("\\K was used in an assertion to set the match start after its end.\n"
    "From end to start the match was: %.*s\n", (int)(ovector[0] - ovector[1]),
      (char *)(subject + ovector[1]));
  printf("Run abandoned\n");
  pcre2_match_data_free(match_data);
  pcre2_code_free(re);
  return 1;
  }

/* Show substrings stored in the output vector by number. Obviously, in a real
application you might want to do things other than print them. */

for (i = 0; i < rc; i++)
  {
  PCRE2_SPTR substring_start = subject + ovector[2*i];
  PCRE2_SIZE substring_length = ovector[2*i+1] - ovector[2*i];
  printf("%2d: %.*s\n", i, (int)substring_length, (char *)substring_start);
  }


/**************************************************************************
* That concludes the basic part of this demonstration program. We have    *
* compiled a pattern, and performed a single match. The code that follows *
* shows first how to access named substrings, and then how to code for    *
* repeated matches on the same subject.                                   *
**************************************************************************/

/* See if there are any named substrings, and if so, show them by name. First
we have to extract the count of named parentheses from the pattern. */

(void)pcre2_pattern_info(
  re,                   /* the compiled pattern */
  PCRE2_INFO_NAMECOUNT, /* get the number of named substrings */
  &namecount);          /* where to put the answer */

if (namecount == 0)
  printf("No named substrings\n");
else
  {
  PCRE2_SPTR tabptr;
  printf("Named substrings\n");

  /* Before we can access the substrings, we must extract the table for
  translating names to numbers, and the size of each entry in the table. */

  (void)pcre2_pattern_info(
    re,                       /* the compiled pattern */
    PCRE2_INFO_NAMETABLE,     /* address of the table */
    &name_table);             /* where to put the answer */

  (void)pcre2_pattern_info(
    re,                       /* the compiled pattern */
    PCRE2_INFO_NAMEENTRYSIZE, /* size of each entry in the table */
    &name_entry_size);        /* where to put the answer */

  /* Now we can scan the table and, for each entry, print the number, the name,
  and the substring itself. In the 8-bit library the number is held in two
  bytes, most significant first. */

  tabptr = name_table;
  for (i = 0; i < namecount; i++)
    {
    int n = (tabptr[0] << 8) | tabptr[1];
    printf("(%d) %*s: %.*s\n", n, name_entry_size - 3, tabptr + 2,
      (int)(ovector[2*n+1] - ovector[2*n]), subject + ovector[2*n]);
    tabptr += name_entry_size;
    }
  }


/*************************************************************************
* If the "-g" option was given on the command line, we want to continue  *
* to search for additional matches in the subject string, in a similar   *
* way to the /g option in Perl. This turns out to be trickier than you   *
* might think because of the possibility of matching an empty string.    *
*                                                                        *
* To help with this task, PCRE2 provides the pcre2_next_match() helper.  *
*************************************************************************/

if (!find_all)     /* Check for -g */
  {
  pcre2_match_data_free(match_data);  /* Release the memory that was used */
  pcre2_code_free(re);                /* for the match data and the pattern. */
  return 0;                           /* Exit the program. */
  }

/* Loop for second and subsequent matches */

ovector_last[0] = ovector[0];
ovector_last[1] = ovector[1];

for (;;)
  {
  PCRE2_SIZE start_offset;
  uint32_t options;

  /* After each successful match, we use pcre2_next_match() to obtain the match
  parameters for subsequent match attempts. */

  if (!pcre2_next_match(match_data, &start_offset, &options))
    break;

  /* Run the next matching operation */

  rc = pcre2_match(
    re,                   /* the compiled pattern */
    subject,              /* the subject string */
    subject_length,       /* the length of the subject */
    start_offset,         /* starting offset in the subject */
    options,              /* options */
    match_data,           /* block for storing the result */
    NULL);                /* use default match context */

  /* If this match attempt fails, exit the loop for subsequent matches. */

  if (rc == PCRE2_ERROR_NOMATCH)
    break;

  /* Other matching errors are not recoverable. */

  if (rc < 0)
    {
    printf("Matching error %d\n", rc);
    pcre2_match_data_free(match_data);
    pcre2_code_free(re);
    return 1;
    }

  /* This demonstration program depends on pcre2_next_match() to ensure that the
  loop for second and subsequent matches does not run forever. However, it would
  be robust practice for a production application to verify this. The following
  block of code shows how to do this. This error case is not reachable unless
  there is a bug in PCRE2.

  Because this program does not set the PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK option,
  the logic is simple. We verify that either ovector[1] has advanced, or that we
  have an empty match touching the end of a previous non-empty match. See the
  API documentation for guidance if your application uses
  PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK and searches for multiple matches. */

  if (!(ovector[1] > ovector_last[1] ||
        (ovector[1] == ovector[0] && ovector_last[1] > ovector_last[0] &&
         ovector[1] == ovector_last[1])))
    {
    printf("\\K was used in an assertion to yield non-advancing matches.\n");
    printf("Run abandoned\n");
    pcre2_match_data_free(match_data);
    pcre2_code_free(re);
    return 1;
    }

  ovector_last[0] = ovector[0];
  ovector_last[1] = ovector[1];

  /* Match succeeded. */

  printf("\nMatch succeeded again at offset %d\n", (int)ovector[0]);

  /* The match succeeded, but the output vector wasn't big enough. This
  should not happen. */

  if (rc == 0)
    printf("ovector was not big enough for all the captured substrings\n");

  /* We guard against patterns such as /(?=.\K)/ that use \K in an assertion to
  set the start of a match later than its end. As explained above, this case
  should not occur because this demonstration program does not set the
  PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK option, however, we do include code showing
  how to detect it. */

  if (ovector[0] > ovector[1])
    {
    printf("\\K was used in an assertion to set the match start after its end.\n"
      "From end to start the match was: %.*s\n", (int)(ovector[0] - ovector[1]),
        (char *)(subject + ovector[1]));
    printf("Run abandoned\n");
    pcre2_match_data_free(match_data);
    pcre2_code_free(re);
    return 1;
    }

  /* As before, show substrings stored in the output vector by number, and then
  also any named substrings. */

  for (i = 0; i < rc; i++)
    {
    PCRE2_SPTR substring_start = subject + ovector[2*i];
    size_t substring_length = ovector[2*i+1] - ovector[2*i];
    printf("%2d: %.*s\n", i, (int)substring_length, (char *)substring_start);
    }

  if (namecount == 0)
    printf("No named substrings\n");
  else
    {
    PCRE2_SPTR tabptr = name_table;
    printf("Named substrings\n");
    for (i = 0; i < namecount; i++)
      {
      int n = (tabptr[0] << 8) | tabptr[1];
      printf("(%d) %*s: %.*s\n", n, name_entry_size - 3, tabptr + 2,
        (int)(ovector[2*n+1] - ovector[2*n]), subject + ovector[2*n]);
      tabptr += name_entry_size;
      }
    }
  }      /* End of loop to find second and subsequent matches */

printf("\n");

pcre2_match_data_free(match_data);
pcre2_code_free(re);
return 0;
}

/* End of pcre2demo.c */

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2grep.html ================================================ pcre2grep specification

pcre2grep man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

pcre2grep [options] [long options] [pattern] [path1 path2 ...]

DESCRIPTION

pcre2grep searches files for character patterns, in the same way as other grep commands do, but it uses the PCRE2 regular expression library to support patterns that are compatible with the regular expressions of Perl 5. See pcre2syntax(3) for a quick-reference summary of pattern syntax, or pcre2pattern(3) for a full description of the syntax and semantics of the regular expressions that PCRE2 supports.

Patterns, whether supplied on the command line or in a separate file, are given without delimiters. For example:

  pcre2grep Thursday /etc/motd
If you attempt to use delimiters (for example, by surrounding a pattern with slashes, as is common in Perl scripts), they are interpreted as part of the pattern. Quotes can of course be used to delimit patterns on the command line because they are interpreted by the shell, and indeed quotes are required if a pattern contains white space or shell metacharacters.

The first argument that follows any option settings is treated as the single pattern to be matched when neither -e nor -f is present. Conversely, when one or both of these options are used to specify patterns, all arguments are treated as path names. At least one of -e, -f, or an argument pattern must be provided.

If no files are specified, pcre2grep reads the standard input. The standard input can also be referenced by a name consisting of a single hyphen. For example:

  pcre2grep some-pattern file1 - file3
By default, input files are searched line by line, so pattern assertions about the beginning and end of a subject string (^, $, \A, \Z, and \z) match at the beginning and end of each line. When a line matches a pattern, it is copied to the standard output, and if there is more than one file, the file name is output at the start of each line, followed by a colon. However, there are options that can change how pcre2grep behaves. For example, the -M option makes it possible to search for strings that span line boundaries. What defines a line boundary is controlled by the -N (--newline) option. The -h and -H options control whether or not file names are shown, and the -Z option changes the file name terminator to a zero byte.

The amount of memory used for buffering files that are being scanned is controlled by parameters that can be set by the --buffer-size and --max-buffer-size options. The first of these sets the size of buffer that is obtained at the start of processing. If an input file contains very long lines, a larger buffer may be needed; this is handled by automatically extending the buffer, up to the limit specified by --max-buffer-size. The default values for these parameters can be set when pcre2grep is built; if nothing is specified, the defaults are set to 20KiB and 1MiB respectively. An error occurs if a line is too long and the buffer can no longer be expanded.

The block of memory that is actually used is three times the "buffer size", to allow for buffering "before" and "after" lines. If the buffer size is too small, fewer than requested "before" and "after" lines may be output.

When matching with a multiline pattern, the size of the buffer must be at least half of the maximum match expected or the pattern might fail to match.

Patterns can be no longer than 8KiB or BUFSIZ bytes, whichever is the greater. BUFSIZ is defined in <stdio.h>. When there is more than one pattern (specified by the use of -e and/or -f), each pattern is applied to each line in the order in which they are defined, except that all the -e patterns are tried before the -f patterns.

By default, as soon as one pattern matches a line, no further patterns are considered. However, if --colour (or --color) is used to colour the matching substrings, or if --only-matching, --file-offsets, --line-offsets, or --output is used to output only the part of the line that matched (either shown literally, or as an offset), the behaviour is different. In this situation, all the patterns are applied to the line. If there is more than one match, the one that begins nearest to the start of the subject is processed; if there is more than one match at that position, the one with the longest matching substring is processed; if the matching substrings are equal, the first match found is processed.

Scanning with all the patterns resumes immediately following the match, so that later matches on the same line can be found. Note, however, that an overlapping match that starts in the middle of another match will not be processed.

The above behaviour was changed at release 10.41 to be more compatible with GNU grep. In earlier releases, pcre2grep did not recognize matches from later patterns that were earlier in the subject.

Patterns that can match an empty string are accepted, but empty string matches are never recognized. An example is the pattern "(super)?(man)?", in which all components are optional. This pattern finds all occurrences of both "super" and "man"; the output differs from matching with "super|man" when only the matching substrings are being shown.

If the LC_ALL or LC_CTYPE environment variable is set, pcre2grep uses the value to set a locale when calling the PCRE2 library. The --locale option can be used to override this.

SUPPORT FOR COMPRESSED FILES

Compile-time options for pcre2grep can set it up to use libz or libbz2 for reading compressed files whose names end in .gz or .bz2, respectively. You can find out whether your pcre2grep binary has support for one or both of these file types by running it with the --help option. If the appropriate support is not present, all files are treated as plain text. The standard input is always so treated. If a file with a .gz or .bz2 extension is not in fact compressed, it is read as a plain text file. When input is from a compressed .gz or .bz2 file, the --line-buffered option is ignored.

BINARY FILES

By default, a file that contains a binary zero byte within the first 1024 bytes is identified as a binary file, and is processed specially. However, if the newline type is specified as NUL, that is, the line terminator is a binary zero, the test for a binary file is not applied. See the --binary-files option for a means of changing the way binary files are handled.

BINARY ZEROS IN PATTERNS

Patterns passed from the command line are strings that are terminated by a binary zero, so cannot contain internal zeros. However, patterns that are read from a file via the -f option may contain binary zeros.

OPTIONS

The order in which some of the options appear can affect the output. For example, both the -H and -l options affect the printing of file names. Whichever comes later in the command line will be the one that takes effect. Similarly, except where noted below, if an option is given twice, the later setting is used. Numerical values for options may be followed by K or M, to signify multiplication by 1024 or 1024*1024 respectively.

-- This terminates the list of options. It is useful if the next item on the command line starts with a hyphen but is not an option. This allows for the processing of patterns and file names that start with hyphens.

-A number, --after-context=number Output up to number lines of context after each matching line. Fewer lines are output if the next match or the end of the file is reached, or if the processing buffer size has been set too small. If file names and/or line numbers are being output, a hyphen separator is used instead of a colon for the context lines (the -Z option can be used to change the file name terminator to a zero byte). A line containing "--" is output between each group of lines, unless they are in fact contiguous in the input file. The value of number is expected to be relatively small. When -c is used, -A is ignored.

-a, --text Treat binary files as text. This is equivalent to --binary-files=text.

--allow-lookaround-bsk PCRE2 now forbids the use of \K in lookarounds by default, in line with Perl. This option causes pcre2grep to set the PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK option, which enables this somewhat dangerous usage.

-B number, --before-context=number Output up to number lines of context before each matching line. Fewer lines are output if the previous match or the start of the file is within number lines, or if the processing buffer size has been set too small. If file names and/or line numbers are being output, a hyphen separator is used instead of a colon for the context lines (the -Z option can be used to change the file name terminator to a zero byte). A line containing "--" is output between each group of lines, unless they are in fact contiguous in the input file. The value of number is expected to be relatively small. When -c is used, -B is ignored.

--binary-files=word Specify how binary files are to be processed. If the word is "binary" (the default), pattern matching is performed on binary files, but the only output is "Binary file <name> matches" when a match succeeds. If the word is "text", which is equivalent to the -a or --text option, binary files are processed in the same way as any other file. In this case, when a match succeeds, the output may be binary garbage, which can have nasty effects if sent to a terminal. If the word is "without-match", which is equivalent to the -I option, binary files are not processed at all; they are assumed not to be of interest and are skipped without causing any output or affecting the return code.

--buffer-size=number Set the parameter that controls how much memory is obtained at the start of processing for buffering files that are being scanned. See also --max-buffer-size below.

-C number, --context=number Output number lines of context both before and after each matching line. This is equivalent to setting both -A and -B to the same value.

-c, --count Do not output lines from the files that are being scanned; instead output the number of lines that would have been shown, either because they matched, or, if -v is set, because they failed to match. By default, this count is exactly the same as the number of lines that would have been output, but if the -M (multiline) option is used (without -v), there may be more suppressed lines than the count (that is, the number of matches).

If no lines are selected, the number zero is output. If several files are being scanned, a count is output for each of them and the -t option can be used to cause a total to be output at the end. However, if the --files-with-matches option is also used, only those files whose counts are greater than zero are listed. When -c is used, the -A, -B, and -C options are ignored.

--colour, --color If this option is given without any data, it is equivalent to "--colour=auto". If data is required, it must be given in the same shell item, separated by an equals sign.

--colour=value, --color=value This option specifies under what circumstances the parts of a line that matched a pattern should be coloured in the output. It is ignored if --file-offsets, --line-offsets, or --output is set. By default, output is not coloured. The value for the --colour option (which is optional, see above) may be "never", "always", or "auto". In the last case, colouring happens only if the standard output is connected to a terminal. More resources are used when colouring is enabled, because pcre2grep has to search for all possible matches in a line, not just one, in order to colour them all.

The colour that is used can be specified by setting one of the environment variables PCRE2GREP_COLOUR, PCRE2GREP_COLOR, PCREGREP_COLOUR, or PCREGREP_COLOR, which are checked in that order. If none of these are set, pcre2grep looks for GREP_COLORS or GREP_COLOR (in that order). The value of the variable should be a string of two numbers, separated by a semicolon, except in the case of GREP_COLORS, which must start with "ms=" or "mt=" followed by two semicolon-separated colours, terminated by the end of the string or by a colon. If GREP_COLORS does not start with "ms=" or "mt=" it is ignored, and GREP_COLOR is checked.

If the string obtained from one of the above variables contains any characters other than semicolon or digits, the setting is ignored and the default colour is used. The string is copied directly into the control string for setting colour on a terminal, so it is your responsibility to ensure that the values make sense. If no relevant environment variable is set, the default is "1;31", which gives red.

-D action, --devices=action If an input path is not a regular file or a directory, "action" specifies how it is to be processed. Valid values are "read" (the default) or "skip" (silently skip the path).

-d action, --directories=action If an input path is a directory, "action" specifies how it is to be processed. Valid values are "read" (the default in non-Windows environments, for compatibility with GNU grep), "recurse" (equivalent to the -r option), or "skip" (silently skip the path, the default in Windows environments). In the "read" case, directories are read as if they were ordinary files. In some operating systems the effect of reading a directory like this is an immediate end-of-file; in others it may provoke an error.

--depth-limit=number See --match-limit below.

-E, --case-restrict When case distinctions are being ignored in Unicode mode, two ASCII letters (K and S) will by default match Unicode characters U+212A (Kelvin sign) and U+017F (long S) respectively, as well as their lower case ASCII counterparts. When this option is set, case equivalences are restricted such that no ASCII character matches a non-ASCII character, and vice versa.

-e pattern, --regex=pattern, --regexp=pattern Specify a pattern to be matched. This option can be used multiple times in order to specify several patterns. It can also be used as a way of specifying a single pattern that starts with a hyphen. When -e is used, no argument pattern is taken from the command line; all arguments are treated as file names. There is no limit to the number of patterns. They are applied to each line in the order in which they are defined.

If -f is used with -e, the command line patterns are matched first, followed by the patterns from the file(s), independent of the order in which these options are specified.

--exclude=pattern Files (but not directories) whose names match the pattern are skipped without being processed. This applies to all files, whether listed on the command line, obtained from --file-list, or by scanning a directory. The pattern is a PCRE2 regular expression, and is matched against the final component of the file name, not the entire path. The -F, -w, and -x options do not apply to this pattern. The option may be given any number of times in order to specify multiple patterns. If a file name matches both an --include and an --exclude pattern, it is excluded. There is no short form for this option.

--exclude-from=filename Treat each non-empty line of the file as the data for an --exclude option. What constitutes a newline when reading the file is the operating system's default. The --newline option has no effect on this option. This option may be given more than once in order to specify a number of files to read.

--exclude-dir=pattern Directories whose names match the pattern are skipped without being processed, whatever the setting of the --recursive option. This applies to all directories, whether listed on the command line, obtained from --file-list, or by scanning a parent directory. The pattern is a PCRE2 regular expression, and is matched against the final component of the directory name, not the entire path. The -F, -w, and -x options do not apply to this pattern. The option may be given any number of times in order to specify more than one pattern. If a directory matches both --include-dir and --exclude-dir, it is excluded. There is no short form for this option.

-F, --fixed-strings Interpret each data-matching pattern as a list of fixed strings, separated by newlines, instead of as a regular expression. What constitutes a newline for this purpose is controlled by the --newline option. The -w (match as a word) and -x (match whole line) options can be used with -F. They apply to each of the fixed strings. A line is selected if any of the fixed strings are found in it (subject to -w or -x, if present). This option applies only to the patterns that are matched against the contents of files; it does not apply to patterns specified by any of the --include or --exclude options.

-f filename, --file=filename Read patterns from the file, one per line. As is the case with patterns on the command line, no delimiters should be used. What constitutes a newline when reading the file is the operating system's default interpretation of \n. The --newline option has no effect on this option. Trailing white space is removed from each line, and blank lines are ignored unless the --posix-pattern-file option is also provided. An empty file contains no patterns and therefore matches nothing. Patterns read from a file in this way may contain binary zeros, which are treated as ordinary character literals.

If this option is given more than once, all the specified files are read. A data line is output if any of the patterns match it. A file name can be given as "-" to refer to the standard input. When -f is used, patterns specified on the command line using -e may also be present; they are matched before the file's patterns. However, no pattern is taken from the command line; all arguments are treated as the names of paths to be searched.

--file-list=filename Read a list of files and/or directories that are to be scanned from the given file, one per line. What constitutes a newline when reading the file is the operating system's default. Trailing white space is removed from each line, and blank lines are ignored. These paths are processed before any that are listed on the command line. The file name can be given as "-" to refer to the standard input. If --file and --file-list are both specified as "-", patterns are read first. This is useful only when the standard input is a terminal, from which further lines (the list of files) can be read after an end-of-file indication. If this option is given more than once, all the specified files are read.

--file-offsets Instead of showing lines or parts of lines that match, show each match as an offset from the start of the file and a length, separated by a comma. In this mode, --colour has no effect, and no context is shown. That is, the -A, -B, and -C options are ignored. If there is more than one match in a line, each of them is shown separately. This option is mutually exclusive with --output, --line-offsets, and --only-matching.

--group-separator=text Output this text string instead of two hyphens between groups of lines when -A, -B, or -C is in use. See also --no-group-separator.

-H, --with-filename Force the inclusion of the file name at the start of output lines when searching a single file. The file name is not normally shown in this case. By default, for matching lines, the file name is followed by a colon; for context lines, a hyphen separator is used. The -Z option can be used to change the terminator to a zero byte. If a line number is also being output, it follows the file name. When the -M option causes a pattern to match more than one line, only the first is preceded by the file name. This option overrides any previous -h, -l, or -L options.

-h, --no-filename Suppress the output of file names when searching multiple files. File names are normally shown when multiple files are searched. By default, for matching lines, the file name is followed by a colon; for context lines, a hyphen separator is used. The -Z option can be used to change the terminator to a zero byte. If a line number is also being output, it follows the file name. This option overrides any previous -H, -L, or -l options.

--heap-limit=number See --match-limit below.

--help Output a help message, giving brief details of the command options and file type support, and then exit. Anything else on the command line is ignored.

-I Ignore binary files. This is equivalent to --binary-files=without-match.

-i, --ignore-case Ignore upper/lower case distinctions when pattern matching. This applies when matching path names for inclusion or exclusion as well as when matching lines in files.

--include=pattern If any --include patterns are specified, the only files that are processed are those whose names match one of the patterns and do not match an --exclude pattern. This option does not affect directories, but it applies to all files, whether listed on the command line, obtained from --file-list, or by scanning a directory. The pattern is a PCRE2 regular expression, and is matched against the final component of the file name, not the entire path. The -F, -w, and -x options do not apply to this pattern. The option may be given any number of times. If a file name matches both an --include and an --exclude pattern, it is excluded. There is no short form for this option.

--include-from=filename Treat each non-empty line of the file as the data for an --include option. What constitutes a newline for this purpose is the operating system's default. The --newline option has no effect on this option. This option may be given any number of times; all the files are read.

--include-dir=pattern If any --include-dir patterns are specified, the only directories that are processed are those whose names match one of the patterns and do not match an --exclude-dir pattern. This applies to all directories, whether listed on the command line, obtained from --file-list, or by scanning a parent directory. The pattern is a PCRE2 regular expression, and is matched against the final component of the directory name, not the entire path. The -F, -w, and -x options do not apply to this pattern. The option may be given any number of times. If a directory matches both --include-dir and --exclude-dir, it is excluded. There is no short form for this option.

-L, --files-without-match Instead of outputting lines from the files, just output the names of the files that do not contain any lines that would have been output. Each file name is output once, on a separate line by default, but if the -Z option is set, they are separated by zero bytes instead of newlines. This option overrides any previous -H, -h, or -l options.

-l, --files-with-matches Instead of outputting lines from the files, just output the names of the files containing lines that would have been output. Each file name is output once, on a separate line, but if the -Z option is set, they are separated by zero bytes instead of newlines. Searching normally stops as soon as a matching line is found in a file. However, if the -c (count) option is also used, matching continues in order to obtain the correct count, and those files that have at least one match are listed along with their counts. Using this option with -c is a way of suppressing the listing of files with no matches that occurs with -c on its own. This option overrides any previous -H, -h, or -L options.

--label=name This option supplies a name to be used for the standard input when file names are being output. If not supplied, "(standard input)" is used. There is no short form for this option.

--line-buffered When this option is given, non-compressed input is read and processed line by line, and the output is flushed after each write. By default, input is read in large chunks, unless pcre2grep can determine that it is reading from a terminal, which is currently possible only in Unix-like environments or Windows. Output to a terminal is normally automatically flushed by the operating system. This option can be useful when the input or output is attached to a pipe and you do not want pcre2grep to buffer up large amounts of data. However, its use will affect performance, and the -M (multiline) option ceases to work. When input is from a compressed .gz or .bz2 file, --line-buffered is ignored.

--line-offsets Instead of showing lines or parts of lines that match, show each match as a line number, the offset from the start of the line, and a length. The line number is terminated by a colon (as usual; see the -n option), and the offset and length are separated by a comma. In this mode, --colour has no effect, and no context is shown. That is, the -A, -B, and -C options are ignored. If there is more than one match in a line, each of them is shown separately. This option is mutually exclusive with --output, --file-offsets, and --only-matching.

--locale=locale-name This option specifies a locale to be used for pattern matching. It overrides the value in the LC_ALL or LC_CTYPE environment variables. If no locale is specified, the PCRE2 library's default (usually the "C" locale) is used. There is no short form for this option.

-M, --multiline Allow patterns to match more than one line. When this option is set, the PCRE2 library is called in "multiline" mode, and a match is allowed to continue past the end of the initial line and onto one or more subsequent lines.

Patterns used with -M may usefully contain literal newline characters and internal occurrences of ^ and $ characters, because in multiline mode these can match at internal newlines. Because pcre2grep is scanning multiple lines, the \Z and \z assertions match only at the end of the last line in the file. The \A assertion matches at the start of the first line of a match. This can be any line in the file; it is not anchored to the first line.

The output for a successful match may consist of more than one line. The first line is the line in which the match started, and the last line is the line in which the match ended. If the matched string ends with a newline sequence, the output ends at the end of that line. If -v is set, none of the lines in a multi-line match are output. Once a match has been handled, scanning restarts at the beginning of the line after the one in which the match ended.

The newline sequence that separates multiple lines must be matched as part of the pattern. For example, to find the phrase "regular expression" in a file where "regular" might be at the end of a line and "expression" at the start of the next line, you could use this command:

  pcre2grep -M 'regular\s+expression' <file>
The \s escape sequence matches any white space character, including newlines, and is followed by + so as to match trailing white space on the first line as well as possibly handling a two-character newline sequence.

There is a limit to the number of lines that can be matched, imposed by the way that pcre2grep buffers the input file as it scans it. With a sufficiently large processing buffer, this should not be a problem.

The -M option does not work when input is read line by line (see --line-buffered).

-m number, --max-count=number Stop processing after finding number matching lines, or non-matching lines if -v is also set. Any trailing context lines are output after the final match. In multiline mode, each multiline match counts as just one line for this purpose. If this limit is reached when reading the standard input from a regular file, the file is left positioned just after the last matching line. If -c is also set, the count that is output is never greater than number. This option has no effect if used with -L, -l, or -q, or when just checking for a match in a binary file.

--match-limit=number Processing some regular expression patterns may take a very long time to search for all possible matching strings. Others may require a very large amount of memory. There are three options that set resource limits for matching.

The --match-limit option provides a means of limiting computing resource usage when processing patterns that are not going to match, but which have a very large number of possibilities in their search trees. The classic example is a pattern that uses nested unlimited repeats. Internally, PCRE2 has a counter that is incremented each time around its main processing loop. If the value set by --match-limit is reached, an error occurs.

The --heap-limit option specifies, as a number of kibibytes (units of 1024 bytes), the maximum amount of heap memory that may be used for matching.

The --depth-limit option limits the depth of nested backtracking points, which indirectly limits the amount of memory that is used. The amount of memory needed for each backtracking point depends on the number of capturing parentheses in the pattern, so the amount of memory that is used before this limit acts varies from pattern to pattern. This limit is of use only if it is set smaller than --match-limit.

There are no short forms for these options. The default limits can be set when the PCRE2 library is compiled; if they are not specified, the defaults are very large and so effectively unlimited.

--max-buffer-size=number This limits the expansion of the processing buffer, whose initial size can be set by --buffer-size. The maximum buffer size is silently forced to be no smaller than the starting buffer size.

-N newline-type, --newline=newline-type Six different conventions for indicating the ends of lines in scanned files are supported. For example:

  pcre2grep -N CRLF 'some pattern' <file>
The newline type may be specified in upper, lower, or mixed case. If the newline type is NUL, lines are separated by binary zero characters. The other types are the single-character sequences CR (carriage return) and LF (linefeed), the two-character sequence CRLF, an "anycrlf" type, which recognizes any of the preceding three types, and an "any" type, for which any Unicode line ending sequence is assumed to end a line. The Unicode sequences are the three just mentioned, plus VT (vertical tab, U+000B), FF (form feed, U+000C), NEL (next line, U+0085), LS (line separator, U+2028), and PS (paragraph separator, U+2029).

When the PCRE2 library is built, a default line-ending sequence is specified. This is normally the standard sequence for the operating system. Unless otherwise specified by this option, pcre2grep uses the library's default.

This option makes it possible to use pcre2grep to scan files that have come from other environments without having to modify their line endings. If the data that is being scanned does not agree with the convention set by this option, pcre2grep may behave in strange ways. Note that this option does not apply to files specified by the -f, --exclude-from, or --include-from options, which are expected to use the operating system's standard newline sequence.

-n, --line-number Precede each output line by its line number in the file, followed by a colon for matching lines or a hyphen for context lines. If the file name is also being output, it precedes the line number. When the -M option causes a pattern to match more than one line, only the first is preceded by its line number. This option is forced if --line-offsets is used.

--no-group-separator Do not output a separator between groups of lines when -A, -B, or -C is in use. The default is to output a line containing two hyphens. See also --group-separator.

--no-jit If the PCRE2 library is built with support for just-in-time compiling (which speeds up matching), pcre2grep automatically makes use of this, unless it was explicitly disabled at build time. This option can be used to disable the use of JIT at run time. It is provided for testing and working around problems. It should never be needed in normal use.

-O text, --output=text When there is a match, instead of outputting the line that matched, output just the text specified in this option, followed by an operating-system standard newline. In this mode, --colour has no effect, and no context is shown. That is, the -A, -B, and -C options are ignored. The --newline option has no effect on this option, which is mutually exclusive with --only-matching, --file-offsets, and --line-offsets. However, like --only-matching, if there is more than one match in a line, each of them causes a line of output.

Escape sequences starting with a dollar character may be used to insert the contents of the matched part of the line and/or captured substrings into the text.

$<digits> or ${<digits>} is replaced by the captured substring of the given decimal number; $& (or the legacy $0) substitutes the whole match. If the number is greater than the number of capturing substrings, or if the capture is unset, the replacement is empty.

$a is replaced by bell; $b by backspace; $e by escape; $f by form feed; $n by newline; $r by carriage return; $t by tab; $v by vertical tab.

$o<digits> or $o{<digits>} is replaced by the character whose code point is the given octal number. In the first form, up to three octal digits are processed. When more digits are needed in Unicode mode to specify a wide character, the second form must be used.

$x<digits> or $x{<digits>} is replaced by the character represented by the given hexadecimal number. In the first form, up to two hexadecimal digits are processed. When more digits are needed in Unicode mode to specify a wide character, the second form must be used.

Any other character is substituted by itself. In particular, $$ is replaced by a single dollar.

-o, --only-matching Show only the part of the line that matched a pattern instead of the whole line. In this mode, no context is shown. That is, the -A, -B, and -C options are ignored. If there is more than one match in a line, each of them is shown separately, on a separate line of output. If -o is combined with -v (invert the sense of the match to find non-matching lines), no output is generated, but the return code is set appropriately. If the matched portion of the line is empty, nothing is output unless the file name or line number are being printed, in which case they are shown on an otherwise empty line. This option is mutually exclusive with --output, --file-offsets and --line-offsets.

-onumber, --only-matching=number Show only the part of the line that matched the capturing parentheses of the given number. Up to 50 capturing parentheses are supported by default. This limit can be changed via the --om-capture option. A pattern may contain any number of capturing parentheses, but only those whose number is within the limit can be accessed by -o. An error occurs if the number specified by -o is greater than the limit.

-o0 is the same as -o without a number. Because these options can be given without an argument (see above), if an argument is present, it must be given in the same shell item, for example, -o3 or --only-matching=2. The comments given for the non-argument case above also apply to this option. If the specified capturing parentheses do not exist in the pattern, or were not set in the match, nothing is output unless the file name or line number are being output.

If this option is given multiple times, multiple substrings are output for each match, in the order the options are given, and all on one line. For example, -o3 -o1 -o3 causes the substrings matched by capturing parentheses 3 and 1 and then 3 again to be output. By default, there is no separator (but see the next but one option).

--om-capture=number Set the number of capturing parentheses that can be accessed by -o. The default is 50.

--om-separator=text Specify a separating string for multiple occurrences of -o. The default is an empty string. Separating strings are never coloured.

-P, --no-ucp Starting from release 10.43, when UTF/Unicode mode is specified with -u or -U, the PCRE2_UCP option is used by default. This means that the POSIX classes in patterns match more than just ASCII characters. For example, [:digit:] matches any Unicode decimal digit. The --no-ucp option suppresses PCRE2_UCP, thus restricting the POSIX classes to ASCII characters, as was the case in earlier releases. Note that there are now more fine-grained option settings within patterns that affect individual classes. For example, when in UCP mode, the sequence (?aP) restricts [:word:] to ASCII letters, while allowing \w to match Unicode letters and digits.

--posix-pattern-file When patterns are provided with the -f option, do not trim trailing spaces or ignore empty lines, in a similar way to other grep tools. To keep the behaviour consistent with older versions, if the pattern read was terminated with CRLF (as character literals) then neither character is included as part of it, so if you really need to have a pattern ending in '\r', use an escape sequence or provide it by a different method.

-q, --quiet Work quietly, that is, display nothing except error messages. The exit status indicates whether or not any matches were found.

-r, --recursive If any given path is a directory, recursively scan the files it contains, taking note of any --include and --exclude settings. By default, a directory is read as a normal file; in some operating systems this gives an immediate end-of-file. This option is a shorthand for setting the -d option to "recurse".

--recursion-limit=number This is an obsolete synonym for --depth-limit. See --match-limit above for details.

-s, --no-messages Suppress error messages about non-existent or unreadable files. Such files are quietly skipped. However, the return code is still 2, even if matches were found in other files.

-t, --total-count This option is useful when scanning more than one file. If used on its own, -t suppresses all output except for a grand total number of matching lines (or non-matching lines if -v is used) in all the files. If -t is used with -c, a grand total is output except when the previous output is just one line. In other words, it is not output when just one file's count is listed. If file names are being output, the grand total is preceded by "TOTAL:". Otherwise, it appears as just another number. The -t option is ignored when used with -L (list files without matches), because the grand total would always be zero.

-u, --utf Operate in UTF/Unicode mode. This option is available only if PCRE2 has been compiled with UTF-8 support. All patterns (including those for any --exclude and --include options) and all lines that are scanned must be valid strings of UTF-8 characters. If an invalid UTF-8 string is encountered, an error occurs.

-U, --utf-allow-invalid As --utf, but in addition subject lines may contain invalid UTF-8 code unit sequences. These can never form part of any pattern match. Patterns themselves, however, must still be valid UTF-8 strings. This facility allows valid UTF-8 strings to be sought within arbitrary byte sequences in executable or other binary files. For more details about matching in non-valid UTF-8 strings, see the pcre2unicode(3) documentation.

-V, --version Write the version numbers of pcre2grep and the PCRE2 library to the standard output and then exit. Anything else on the command line is ignored.

-v, --invert-match Invert the sense of the match, so that lines which do not match any of the patterns are the ones that are found. When this option is set, options such as --only-matching and --output, which specify parts of a match that are to be output, are ignored.

-w, --word-regex, --word-regexp Force the patterns only to match "words". That is, there must be a word boundary at the start and end of each matched string. This is equivalent to having "\b(?:" at the start of each pattern, and ")\b" at the end. This option applies only to the patterns that are matched against the contents of files; it does not apply to patterns specified by any of the --include or --exclude options.

-x, --line-regex, --line-regexp Force the patterns to start matching only at the beginnings of lines, and in addition, require them to match entire lines. In multiline mode the match may be more than one line. This is equivalent to having "^(?:" at the start of each pattern and ")$" at the end. This option applies only to the patterns that are matched against the contents of files; it does not apply to patterns specified by any of the --include or --exclude options.

-Z, --null Terminate file names in the regular output with a zero byte (the NUL character) instead of what would normally appear. This is useful when file names contain unusual characters such as colons, hyphens, or even newlines. The option does not apply to file names in error messages.

ENVIRONMENT VARIABLES

The environment variables LC_ALL and LC_CTYPE are examined, in that order, for a locale. The first one that is set is used. This can be overridden by the --locale option. If no locale is set, the PCRE2 library's default (usually the "C" locale) is used.

NEWLINES

The -N (--newline) option allows pcre2grep to scan files with newline conventions that differ from the default. This option affects only the way scanned files are processed. It does not affect the interpretation of files specified by the -f, --file-list, --exclude-from, or --include-from options.

Any parts of the scanned input files that are written to the standard output are copied with whatever newline sequences they have in the input. However, if the final line of a file is output, and it does not end with a newline sequence, a newline sequence is added. If the newline setting is CR, LF, CRLF or NUL, that line ending is output; for the other settings (ANYCRLF or ANY) a single NL is used.

The newline setting does not affect the way in which pcre2grep writes newlines in informational messages to the standard output and error streams. Under Windows, the standard output is set to be binary, so that "\r\n" at the ends of output lines that are copied from the input is not converted to "\r\r\n" by the C I/O library. This means that any messages written to the standard output must end with "\r\n". For all other operating systems, and for all messages to the standard error stream, "\n" is used.

OPTIONS COMPATIBILITY WITH GNU GREP

Many of the short and long forms of pcre2grep's options are the same as in the GNU grep program. Any long option of the form --xxx-regexp (GNU terminology) is also available as --xxx-regex (PCRE2 terminology). However, the --case-restrict, --depth-limit, -E, --file-list, --file-offsets, --heap-limit, --include-dir, --line-offsets, --locale, --match-limit, -M, --multiline, -N, --newline, --no-ucp, --om-separator, --output, -P, -u, --utf, -U, and --utf-allow-invalid options are specific to pcre2grep, as is the use of the --only-matching option with a capturing parentheses number.

Although most of the common options work the same way, a few are different in pcre2grep. For example, the --include option's argument is a glob for GNU grep, but in pcre2grep it is a regular expression to which the -i option applies. If both the -c and -l options are given, GNU grep lists only file names, without counts, but pcre2grep gives the counts as well.

OPTIONS WITH DATA

There are four different ways in which an option with data can be specified. If a short form option is used, the data may follow immediately, or (with one exception) in the next command line item. For example:

  -f/some/file
  -f /some/file
The exception is the -o option, which may appear with or without data. Because of this, if data is present, it must follow immediately in the same item, for example -o3.

If a long form option is used, the data may appear in the same command line item, separated by an equals character, or (with two exceptions) it may appear in the next command line item. For example:

  --file=/some/file
  --file /some/file
Note, however, that if you want to supply a file name beginning with ~ as data in a shell command, and have the shell expand ~ to a home directory, you must separate the file name from the option, because the shell does not treat ~ specially unless it is at the start of an item.

The exceptions to the above are the --colour (or --color) and --only-matching options, for which the data is optional. If one of these options does have data, it must be given in the first form, using an equals character. Otherwise pcre2grep will assume that it has no data.

USING PCRE2'S CALLOUT FACILITY

pcre2grep has, by default, support for calling external programs or scripts or echoing specific strings during matching by making use of PCRE2's callout facility. However, this support can be completely or partially disabled when pcre2grep is built. You can find out whether your binary has support for callouts by running it with the --help option. If callout support is completely disabled, callouts in patterns are forbidden by pcre2grep. If the facility is partially disabled, calling external programs is not supported, and callouts that request it are ignored.

A callout in a PCRE2 pattern is of the form (?C<arg>) where the argument is either a number or a quoted string (see the pcre2callout documentation for details). Numbered callouts are ignored by pcre2grep; only callouts with string arguments are useful.

Echoing a specific string

Starting the callout string with a pipe character invokes an echoing facility that avoids calling an external program or script. This facility is always available, provided that callouts were not completely disabled when pcre2grep was built. The rest of the callout string is processed as a zero-terminated string, which means it should not contain any internal binary zeros. It is written to the output, having first been passed through the same escape processing as text from the --output (-O) option (see above). However, $0 or $& cannot be used to insert a matched substring because the match is still in progress. Instead, the single character '0' is inserted. Any syntax error in the string (for example, a dollar not followed by another character) causes the callout to be ignored. No terminator is added to the output string, so if you want a newline, you must include it explicitly using the escape $n. For example:

  pcre2grep '(.)(..(.))(?C"|[$1] [$2] [$3]$n")' <some file>
Matching continues normally after the string is output. If you want to see only the callout output but not any output from an actual match, you should end the pattern with (*FAIL).

Calling external programs or scripts

This facility can be independently disabled when pcre2grep is built. It is supported for Windows, where a call to _spawnvp() is used, for VMS, where lib$spawn() is used, and for any Unix-like environment where fork() and execv() are available.

If the callout string does not start with a pipe (vertical bar) character, it is parsed into a list of substrings separated by pipe characters. The first substring must be an executable name, with the following substrings specifying arguments:

  executable_name|arg1|arg2|...
Any substring (including the executable name) may contain escape sequences started by a dollar character. These are the same as for the --output (-O) option documented above, except that $0 or $& cannot insert the matched string because the match is still in progress. Instead, the character '0' is inserted. If you need a literal dollar or pipe character in any substring, use $$ or $| respectively. Here is an example:
  echo -e "abcde\n12345" | pcre2grep \
    '(?x)(.)(..(.))
    (?C"/bin/echo|Arg1: [$1] [$2] [$3]|Arg2: $|${1}$| ($4)")()' -

  Output:

    Arg1: [a] [bcd] [d] Arg2: |a| ()
    abcde
    Arg1: [1] [234] [4] Arg2: |1| ()
    12345
The parameters for the system call that is used to run the program or script are zero-terminated strings. This means that binary zero characters in the callout argument will cause premature termination of their substrings, and therefore should not be present. Any syntax error in the string (for example, a dollar not followed by another character) causes the callout to be ignored. If running the program fails for any reason (including the non-existence of the executable), a local matching failure occurs and the matcher backtracks in the normal way.

MATCHING ERRORS

It is possible to supply a regular expression that takes a very long time to fail to match certain lines. Such patterns normally involve nested indefinite repeats, for example: (a+)*\d when matched against a line of a's with no final digit. The PCRE2 matching function has a resource limit that causes it to abort in these circumstances. If this happens, pcre2grep outputs an error message and the line that caused the problem to the standard error stream. If there are more than 20 such errors, pcre2grep gives up.

The --match-limit option of pcre2grep can be used to set the overall resource limit. There are also other limits that affect the amount of memory used during matching; see the discussion of --heap-limit and --depth-limit above.

DIAGNOSTICS

Exit status is 0 if any matches were found, 1 if no matches were found, and 2 for syntax errors, overlong lines, non-existent or inaccessible files (even if matches were found in other files) or too many matching errors. Using the -s option to suppress error messages about inaccessible files does not affect the return code.

When run under VMS, the return code is placed in the symbol PCRE2GREP_RC because VMS does not distinguish between exit(0) and exit(1).

SEE ALSO

pcre2pattern(3), pcre2syntax(3), pcre2callout(3), pcre2unicode(3).

AUTHOR

Philip Hazel
Retired from University Computing Service
Cambridge, England.

REVISION

Last updated: 24 January 2025
Copyright © 1997-2023 University of Cambridge.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2jit.html ================================================ pcre2jit specification

pcre2jit man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

PCRE2 JUST-IN-TIME COMPILER SUPPORT

Just-in-time compiling is a heavyweight optimization that can greatly speed up pattern matching. However, it comes at the cost of extra processing before the match is performed, so it is of most benefit when the same pattern is going to be matched many times. This does not necessarily mean many calls of a matching function; if the pattern is not anchored, matching attempts may take place many times at various positions in the subject, even for a single call. Therefore, if the subject string is very long, it may still pay to use JIT even for one-off matches. JIT support is available for all of the 8-bit, 16-bit and 32-bit PCRE2 libraries.

JIT support applies only to the traditional Perl-compatible matching function. It does not apply when the DFA matching function is being used. The code for JIT support was written by Zoltan Herczeg.

AVAILABILITY OF JIT SUPPORT

JIT support is an optional feature of PCRE2. The "configure" option --enable-jit (or equivalent CMake option) must be set when PCRE2 is built if you want to use JIT. The support is limited to the following hardware platforms:

  ARM 32-bit (v7, and Thumb2)
  ARM 64-bit
  IBM s390x 64 bit
  Intel x86 32-bit and 64-bit
  LoongArch 64 bit
  MIPS 32-bit and 64-bit
  Power PC 32-bit and 64-bit
  RISC-V 32-bit and 64-bit
If --enable-jit is set on an unsupported platform, compilation fails.

A client program can tell if JIT support has been compiled by calling pcre2_config() with the PCRE2_CONFIG_JIT option. The result is one if PCRE2 was built with JIT support, and zero otherwise. However, having the JIT code available does not guarantee that it will be used for any particular match. One reason for this is that there are a number of options and pattern items that are not supported by JIT (see below). Another reason is that in some environments JIT is unable to get executable memory in which to build its compiled code. The only guarantee from pcre2_config() is that if it returns zero, JIT will definitely not be used.

As of release 10.45 there is a more informative way to test for JIT support. If pcre2_jit_compile() is called with the single option PCRE2_JIT_TEST_ALLOC it returns zero if JIT is available and has a working allocator. Otherwise it returns PCRE2_ERROR_NOMEMORY if JIT is available but cannot allocate executable memory, or PCRE2_ERROR_JIT_UNSUPPORTED if JIT support is not compiled. The code argument is ignored, so it can be a NULL value.

A simple program does not need to check availability in order to use JIT when possible. The API is implemented in a way that falls back to the interpretive code if JIT is not available or cannot be used for a given match. For programs that need the best possible performance, there is a "fast path" API that is JIT-specific.

SIMPLE USE OF JIT

To make use of the JIT support in the simplest way, all you have to do is to call pcre2_jit_compile() after successfully compiling a pattern with pcre2_compile(). This function has two arguments: the first is the compiled pattern pointer that was returned by pcre2_compile(), and the second is zero or more of the following option bits: PCRE2_JIT_COMPLETE, PCRE2_JIT_PARTIAL_HARD, or PCRE2_JIT_PARTIAL_SOFT.

If JIT support is not available, a call to pcre2_jit_compile() does nothing and returns PCRE2_ERROR_JIT_BADOPTION. Otherwise, the compiled pattern is passed to the JIT compiler, which turns it into machine code that executes much faster than the normal interpretive code, but yields exactly the same results. The returned value from pcre2_jit_compile() is zero on success, or a negative error code.

There is a limit to the size of pattern that JIT supports, imposed by the size of machine stack that it uses. The exact rules are not documented because they may change at any time, in particular, when new optimizations are introduced. If a pattern is too big, a call to pcre2_jit_compile() returns PCRE2_ERROR_NOMEMORY.

PCRE2_JIT_COMPLETE requests the JIT compiler to generate code for complete matches. If you want to run partial matches using the PCRE2_PARTIAL_HARD or PCRE2_PARTIAL_SOFT options of pcre2_match(), you should set one or both of the other options as well as, or instead of PCRE2_JIT_COMPLETE. The JIT compiler generates different optimized code for each of the three modes (normal, soft partial, hard partial). When pcre2_match() is called, the appropriate code is run if it is available. Otherwise, the pattern is matched using interpretive code.

You can call pcre2_jit_compile() multiple times for the same compiled pattern. It does nothing if it has previously compiled code for any of the option bits. For example, you can call it once with PCRE2_JIT_COMPLETE and (perhaps later, when you find you need partial matching) again with PCRE2_JIT_COMPLETE and PCRE2_JIT_PARTIAL_HARD. This time it will ignore PCRE2_JIT_COMPLETE and just compile code for partial matching. If pcre2_jit_compile() is called with no option bits set, it immediately returns zero. This is an alternative way of testing whether JIT support has been compiled.

At present, it is not possible to free JIT compiled code except when the entire compiled pattern is freed by calling pcre2_code_free().

In some circumstances you may need to call additional functions. These are described in the section entitled "Controlling the JIT stack" below.

There are some pcre2_match() options that are not supported by JIT, and there are also some pattern items that JIT cannot handle. Details are given below. In both cases, matching automatically falls back to the interpretive code. If you want to know whether JIT was actually used for a particular match, you should arrange for a JIT callback function to be set up as described in the section entitled "Controlling the JIT stack" below, even if you do not need to supply a non-default JIT stack. Such a callback function is called whenever JIT code is about to be obeyed. If the match-time options are not right for JIT execution, the callback function is not obeyed.

If the JIT compiler finds an unsupported item, no JIT data is generated. You can find out if JIT compilation was successful for a compiled pattern by calling pcre2_pattern_info() with the PCRE2_INFO_JITSIZE option. A non-zero result means that JIT compilation was successful. A result of 0 means that JIT support is not available, or the pattern was not processed by pcre2_jit_compile(), or the JIT compiler was not able to handle the pattern. Successful JIT compilation does not, however, guarantee the use of JIT at match time because there are some match time options that are not supported by JIT.

MATCHING SUBJECTS CONTAINING INVALID UTF

When a pattern is compiled with the PCRE2_UTF option, subject strings are normally expected to be a valid sequence of UTF code units. By default, this is checked at the start of matching and an error is generated if invalid UTF is detected. The PCRE2_NO_UTF_CHECK option can be passed to pcre2_match() to skip the check (for improved performance) if you are sure that a subject string is valid. If this option is used with an invalid string, the result is undefined. The calling program may crash or loop or otherwise misbehave.

However, a way of running matches on strings that may contain invalid UTF sequences is available. Calling pcre2_compile() with the PCRE2_MATCH_INVALID_UTF option has two effects: it tells the interpreter in pcre2_match() to support invalid UTF, and, if pcre2_jit_compile() is subsequently called, the compiled JIT code also supports invalid UTF. Details of how this support works, in both the JIT and the interpretive cases, are given in the pcre2unicode documentation.

There is also an obsolete option for pcre2_jit_compile() called PCRE2_JIT_INVALID_UTF, which currently exists only for backward compatibility. It is superseded by the pcre2_compile() option PCRE2_MATCH_INVALID_UTF and should no longer be used. It may be removed in future.

UNSUPPORTED OPTIONS AND PATTERN ITEMS

The pcre2_match() options that are supported for JIT matching are PCRE2_COPY_MATCHED_SUBJECT, PCRE2_NOTBOL, PCRE2_NOTEOL, PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART, PCRE2_NO_UTF_CHECK, PCRE2_PARTIAL_HARD, and PCRE2_PARTIAL_SOFT. The PCRE2_ANCHORED and PCRE2_ENDANCHORED options are not supported at match time.

If the PCRE2_NO_JIT option is passed to pcre2_match() it disables the use of JIT, forcing matching by the interpreter code.

The only unsupported pattern items are \C (match a single data unit) when running in a UTF mode, and a callout immediately before an assertion condition in a conditional group.

RETURN VALUES FROM JIT MATCHING

When a pattern is matched using JIT, the return values are the same as those given by the interpretive pcre2_match() code, with the addition of one new error code: PCRE2_ERROR_JIT_STACKLIMIT. This means that the memory used for the JIT stack was insufficient. See "Controlling the JIT stack" below for a discussion of JIT stack usage.

The error code PCRE2_ERROR_MATCHLIMIT is returned by the JIT code if searching a very large pattern tree goes on for too long, as it is in the same circumstance when JIT is not used, but the details of exactly what is counted are not the same. The PCRE2_ERROR_DEPTHLIMIT error code is never returned when JIT matching is used.

CONTROLLING THE JIT STACK

When the compiled JIT code runs, it needs a block of memory to use as a stack. By default, it uses 32KiB on the machine stack. However, some large or complicated patterns need more than this. The error PCRE2_ERROR_JIT_STACKLIMIT is given when there is not enough stack. Three functions are provided for managing blocks of memory for use as JIT stacks. There is further discussion about the use of JIT stacks in the section entitled "JIT stack FAQ" below.

The pcre2_jit_stack_create() function creates a JIT stack. Its arguments are a starting size, a maximum size, and a general context (for memory allocation functions, or NULL for standard memory allocation). It returns a pointer to an opaque structure of type pcre2_jit_stack, or NULL if there is an error. The pcre2_jit_stack_free() function is used to free a stack that is no longer needed. If its argument is NULL, this function returns immediately, without doing anything. (For the technically minded: the address space is allocated by mmap or VirtualAlloc.) A maximum stack size of 512KiB to 1MiB should be more than enough for any pattern.

The pcre2_jit_stack_assign() function specifies which stack JIT code should use. Its arguments are as follows:

  pcre2_match_context  *mcontext
  pcre2_jit_callback    callback
  void                 *data
The first argument is a pointer to a match context. When this is subsequently passed to a matching function, its information determines which JIT stack is used. If this argument is NULL, the function returns immediately, without doing anything. There are three cases for the values of the other two arguments:
  (1) If callback is NULL and data is NULL, an internal 32KiB block
      on the machine stack is used. This is the default when a match
      context is created.

  (2) If callback is NULL and data is not NULL, data must be
      a pointer to a valid JIT stack, the result of calling
      pcre2_jit_stack_create().

  (3) If callback is not NULL, it must point to a function that is
      called with data as an argument at the start of matching, in
      order to set up a JIT stack. If the return from the callback
      function is NULL, the internal 32KiB stack is used; otherwise the
      return value must be a valid JIT stack, the result of calling
      pcre2_jit_stack_create().
A callback function is obeyed whenever JIT code is about to be run; it is not obeyed when pcre2_match() is called with options that are incompatible for JIT matching. A callback function can therefore be used to determine whether a match operation was executed by JIT or by the interpreter.

You may safely use the same JIT stack for more than one pattern (either by assigning directly or by callback), as long as the patterns are matched sequentially in the same thread. Currently, the only way to set up non-sequential matches in one thread is to use callouts: if a callout function starts another match, that match must use a different JIT stack to the one used for currently suspended match(es).

In a multithread application, if you do not specify a JIT stack, or if you assign or pass back NULL from a callback, that is thread-safe, because each thread has its own machine stack. However, if you assign or pass back a non-NULL JIT stack, this must be a different stack for each thread so that the application is thread-safe.

Strictly speaking, even more is allowed. You can assign the same non-NULL stack to a match context that is used by any number of patterns, as long as they are not used for matching by multiple threads at the same time. For example, you could use the same stack in all compiled patterns, with a global mutex in the callback to wait until the stack is available for use. However, this is an inefficient solution, and not recommended.

This is a suggestion for how a multithreaded program that needs to set up non-default JIT stacks might operate:

  During thread initialization
    thread_local_var = pcre2_jit_stack_create(...)

  During thread exit
    pcre2_jit_stack_free(thread_local_var)

  Use a one-line callback function
    return thread_local_var
All the functions described in this section do nothing if JIT is not available.

JIT STACK FAQ

(1) Why do we need JIT stacks?

PCRE2 (and JIT) is a recursive, depth-first engine, so it needs a stack where the local data of the current node is pushed before checking its child nodes. Allocating real machine stack on some platforms is difficult. For example, the stack chain needs to be updated every time if we extend the stack on PowerPC. Although it is possible, its updating time overhead decreases performance. So we do the recursion in memory.

(2) Why don't we simply allocate blocks of memory with malloc()?

Modern operating systems have a nice feature: they can reserve an address space instead of allocating memory. We can safely allocate memory pages inside this address space, so the stack could grow without moving memory data (this is important because of pointers). Thus we can allocate 1MiB address space, and use only a single memory page (usually 4KiB) if that is enough. However, we can still grow up to 1MiB anytime if needed.

(3) Who "owns" a JIT stack?

The owner of the stack is the user program, not the JIT studied pattern or anything else. The user program must ensure that if a stack is being used by pcre2_match(), (that is, it is assigned to a match context that is passed to the pattern currently running), that stack must not be used by any other threads (to avoid overwriting the same memory area). The best practice for multithreaded programs is to allocate a stack for each thread, and return this stack through the JIT callback function.

(4) When should a JIT stack be freed?

You can free a JIT stack at any time, as long as it will not be used by pcre2_match() again. When you assign the stack to a match context, only a pointer is set. There is no reference counting or any other magic. You can free compiled patterns, contexts, and stacks in any order, anytime. Just do not call pcre2_match() with a match context pointing to an already freed stack, as that will cause SEGFAULT. (Also, do not free a stack currently used by pcre2_match() in another thread). You can also replace the stack in a context at any time when it is not in use. You should free the previous stack before assigning a replacement.

(5) Should I allocate/free a stack every time before/after calling pcre2_match()?

No, because this is too costly in terms of resources. However, you could implement some clever idea which releases the stack if it is not used in, let's say, two minutes. The JIT callback can help to achieve this without keeping a list of patterns.

(6) OK, the stack is for long term memory allocation. But what happens if a pattern causes stack overflow with a stack of 1MiB? Is that 1MiB kept until the stack is freed?

Especially on embedded systems, it might be a good idea to release memory sometimes without freeing the stack. There is no API for this at the moment. Probably a function call which returns with the currently allocated memory for any stack and another which allows releasing memory (shrinking the stack) would be a good idea if someone needs this.

(7) This is too much of a headache. Isn't there any better solution for JIT stack handling?

No, thanks to Windows. If POSIX threads were used everywhere, we could throw out this complicated API.

FREEING JIT SPECULATIVE MEMORY

void pcre2_jit_free_unused_memory(pcre2_general_context *gcontext);

The JIT executable allocator does not free all memory when it is possible. It expects new allocations, and keeps some free memory around to improve allocation speed. However, in low memory conditions, it might be better to free all possible memory. You can cause this to happen by calling pcre2_jit_free_unused_memory(). Its argument is a general context, for custom memory management, or NULL for standard memory management.

EXAMPLE CODE

This is a single-threaded example that specifies a JIT stack without using a callback. A real program should include error checking after all the function calls.

  int rc;
  pcre2_code *re;
  pcre2_match_data *match_data;
  pcre2_match_context *mcontext;
  pcre2_jit_stack *jit_stack;

  re = pcre2_compile(pattern, PCRE2_ZERO_TERMINATED, 0,
    &errornumber, &erroffset, NULL);
  rc = pcre2_jit_compile(re, PCRE2_JIT_COMPLETE);
  mcontext = pcre2_match_context_create(NULL);
  jit_stack = pcre2_jit_stack_create(32*1024, 512*1024, NULL);
  pcre2_jit_stack_assign(mcontext, NULL, jit_stack);
  match_data = pcre2_match_data_create(10, NULL);
  rc = pcre2_match(re, subject, length, 0, 0, match_data, mcontext);
  /* Process result */

  pcre2_code_free(re);
  pcre2_match_data_free(match_data);
  pcre2_match_context_free(mcontext);
  pcre2_jit_stack_free(jit_stack);

JIT FAST PATH API

Because the API described above falls back to interpreted matching when JIT is not available, it is convenient for programs that are written for general use in many environments. However, calling JIT via pcre2_match() does have a performance impact. Programs that are written for use where JIT is known to be available, and which need the best possible performance, can instead use a "fast path" API to call JIT matching directly instead of calling pcre2_match() (obviously only for patterns that have been successfully processed by pcre2_jit_compile()).

The fast path function is called pcre2_jit_match(), and it takes exactly the same arguments as pcre2_match(). However, the subject string must be specified with a length; PCRE2_ZERO_TERMINATED is not supported. Unsupported option bits (for example, PCRE2_ANCHORED and PCRE2_ENDANCHORED) are ignored, as is the PCRE2_NO_JIT option. The return values are also the same as for pcre2_match(), plus PCRE2_ERROR_JIT_BADOPTION if a matching mode (partial or complete) is requested that was not compiled.

When you call pcre2_match(), as well as testing for invalid options, a number of other sanity checks are performed on the arguments. For example, if the subject pointer is NULL but the length is non-zero, an immediate error is given. Also, unless PCRE2_NO_UTF_CHECK is set, a UTF subject string is tested for validity. In the interests of speed, these checks do not happen on the JIT fast path. If invalid UTF data is passed when PCRE2_MATCH_INVALID_UTF was not set for pcre2_compile(), the result is undefined. The program may crash or loop or give wrong results. In the absence of PCRE2_MATCH_INVALID_UTF you should call pcre2_jit_match() in UTF mode only if you are sure the subject is valid.

Bypassing the sanity checks and the pcre2_match() wrapping can give speedups of more than 10%.

SEE ALSO

pcre2api(3), pcre2unicode(3)

AUTHOR

Philip Hazel (FAQ by Zoltan Herczeg)
Retired from University Computing Service
Cambridge, England.

REVISION

Last updated: 22 August 2024
Copyright © 1997-2024 University of Cambridge.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2limits.html ================================================ pcre2limits specification

pcre2limits man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SIZE AND OTHER LIMITATIONS

There are some size limitations in PCRE2 but it is hoped that they will never in practice be relevant.

The maximum size of a compiled pattern is approximately 64 thousand code units for the 8-bit and 16-bit libraries if PCRE2 is compiled with the default internal linkage size, which is 2 bytes for these libraries. If you want to process regular expressions that are truly enormous, you can compile PCRE2 with an internal linkage size of 3 or 4 (when building the 16-bit library, 3 is rounded up to 4). See the README file in the source distribution and the pcre2build documentation for details. In these cases the limit is substantially larger. However, the speed of execution is slower. In the 32-bit library, the internal linkage size is always 4.

The maximum length of a source pattern string is essentially unlimited; it is the largest number a PCRE2_SIZE variable can hold. However, the program that calls pcre2_compile() can specify a smaller limit.

The maximum length (in code units) of a subject string is one less than the largest number a PCRE2_SIZE variable can hold. PCRE2_SIZE is an unsigned integer type, usually defined as size_t. Its maximum value (that is ~(PCRE2_SIZE)0) is reserved as a special indicator for zero-terminated strings and unset offsets.

All values in repeating quantifiers must be less than 65536.

There are two different limits that apply to branches of lookbehind assertions. If every branch in such an assertion matches a fixed number of characters, the maximum length of any branch is 65535 characters. If any branch matches a variable number of characters, then the maximum matching length for every branch is limited. The default limit is set at compile time, defaulting to 255, but can be changed by the calling program.

There is no limit to the number of parenthesized groups, but there can be no more than 65535 capture groups, and there is a limit to the depth of nesting of parenthesized subpatterns of all kinds. This is imposed in order to limit the amount of system stack used at compile time. The default limit can be specified when PCRE2 is built; if not, the default is set to 250. An application can change this limit by calling pcre2_set_parens_nest_limit() to set the limit in a compile context.

The maximum length of the name for a named capture group as well as the number of such groups is configurable at build time. The maximum length for the name defaults to 128 code units, and the maximum number of such groups to 10000.

The maximum length of a name in a (*MARK), (*PRUNE), (*SKIP), or (*THEN) verb is 255 code units for the 8-bit library and 65535 code units for the 16-bit and 32-bit libraries.

The maximum length of a string argument to a callout is the largest number a 32-bit unsigned integer can hold.

The maximum amount of heap memory used for matching is controlled by the heap limit, which can be set in a pattern or in a match context. The default is a very large number, effectively unlimited.

AUTHOR

Philip Hazel
Retired from University Computing Service
Cambridge, England.

REVISION

Last updated: 03 September 2025
Copyright © 1997-2023 University of Cambridge.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2matching.html ================================================ pcre2matching specification

pcre2matching man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

PCRE2 MATCHING ALGORITHMS

This document describes the two different algorithms that are available in PCRE2 for matching a compiled regular expression against a given subject string. The "standard" algorithm is the one provided by the pcre2_match() function. This works in the same way as Perl's matching function, and provides a Perl-compatible matching operation. The just-in-time (JIT) optimization that is described in the pcre2jit documentation is compatible with this function.

An alternative algorithm is provided by the pcre2_dfa_match() function; it operates in a different way, and is not Perl-compatible. This alternative has advantages and disadvantages compared with the standard algorithm, and these are described below.

When there is only one possible way in which a given subject string can match a pattern, the two algorithms give the same answer. A difference arises, however, when there are multiple possibilities. For example, if the anchored pattern

  ^<.*>
is matched against the string
  <something> <something else> <something further>
there are three possible answers. The standard algorithm finds only one of them, whereas the alternative algorithm finds all three.

REGULAR EXPRESSIONS AS TREES

The set of strings that are matched by a regular expression can be represented as a tree structure. An unlimited repetition in the pattern makes the tree of infinite size, but it is still a tree. Matching the pattern to a given subject string (from a given starting point) can be thought of as a search of the tree. There are two ways to search a tree: depth-first and breadth-first, and these correspond to the two matching algorithms provided by PCRE2.

THE STANDARD MATCHING ALGORITHM

In the terminology of Jeffrey Friedl's book "Mastering Regular Expressions", the standard algorithm is an "NFA algorithm". It conducts a depth-first search of the pattern tree. That is, it proceeds along a single path through the tree, checking that the subject matches what is required. When there is a mismatch, the algorithm tries any alternatives at the current point, and if they all fail, it backs up to the previous branch point in the tree, and tries the next alternative branch at that level. This often involves backing up (moving to the left) in the subject string as well. The order in which repetition branches are tried is controlled by the greedy or ungreedy nature of the quantifier.

If a leaf node is reached, a matching string has been found, and at that point the algorithm stops. Thus, if there is more than one possible match, this algorithm returns the first one that it finds. Whether this is the shortest, the longest, or some intermediate length depends on the way the alternations and the greedy or ungreedy repetition quantifiers are specified in the pattern.

Because it ends up with a single path through the tree, it is relatively straightforward for this algorithm to keep track of the substrings that are matched by portions of the pattern in parentheses. This provides support for capturing parentheses and backreferences.

THE ALTERNATIVE MATCHING ALGORITHM

This algorithm conducts a breadth-first search of the tree. Starting from the first matching point in the subject, it scans the subject string from left to right, once, character by character, and as it does this, it remembers all the paths through the tree that represent valid matches. In Friedl's terminology, this is a kind of "DFA algorithm", though it is not implemented as a traditional finite state machine (it keeps multiple states active simultaneously).

Although the general principle of this matching algorithm is that it scans the subject string only once, without backtracking, there is one exception: when a lookaround assertion is encountered, the characters following or preceding the current point have to be independently inspected.

The scan continues until either the end of the subject is reached, or there are no more unterminated paths. At this point, terminated paths represent the different matching possibilities (if there are none, the match has failed). Thus, if there is more than one possible match, this algorithm finds all of them, and in particular, it finds the longest. The matches are returned in the output vector in decreasing order of length. There is an option to stop the algorithm after the first match (which is necessarily the shortest) is found.

Note that the size of vector needed to contain all the results depends on the number of simultaneous matches, not on the number of capturing parentheses in the pattern. Using pcre2_match_data_create_from_pattern() to create the match data block is therefore not advisable when doing DFA matching.

Note also that all the matches that are found start at the same point in the subject. If the pattern

  cat(er(pillar)?)?
is matched against the string "the caterpillar catchment", the result is the three strings "caterpillar", "cater", and "cat" that start at the fifth character of the subject. The algorithm does not automatically move on to find matches that start at later positions.

PCRE2's "auto-possessification" optimization usually applies to character repeats at the end of a pattern (as well as internally). For example, the pattern "a\d+" is compiled as if it were "a\d++" because there is no point even considering the possibility of backtracking into the repeated digits. For DFA matching, this means that only one possible match is found. If you really do want multiple matches in such cases, either use an ungreedy repeat ("a\d+?") or set the PCRE2_NO_AUTO_POSSESS option when compiling.

There are a number of features of PCRE2 regular expressions that are not supported or behave differently in the alternative matching function. Those that are not supported cause an error if encountered.

1. Because the algorithm finds all possible matches, the greedy or ungreedy nature of repetition quantifiers is not relevant (though it may affect auto-possessification, as just described). During matching, greedy and ungreedy quantifiers are treated in exactly the same way. However, possessive quantifiers can make a difference when what follows could also match what is quantified, for example in a pattern like this:

  ^a++\w!
This pattern matches "aaab!" but not "aaa!", which would be matched by a non-possessive quantifier. Similarly, if an atomic group is present, it is matched as if it were a standalone pattern at the current point, and the longest match is then "locked in" for the rest of the overall pattern.

2. When dealing with multiple paths through the tree simultaneously, it is not straightforward to keep track of captured substrings for the different matching possibilities, and PCRE2's implementation of this algorithm does not attempt to do this. This means that no captured substrings are available.

3. Because no substrings are captured, a number of related features are not available:

(a) Backreferences;

(b) Conditional expressions that use a backreference as the condition or test for a specific group recursion;

(c) Script runs;

(d) Scan substring assertions.

4. Because many paths through the tree may be active, the \K escape sequence, which resets the start of the match when encountered (but may be on some paths and not on others), is not supported.

5. Callouts are supported, but the value of the capture_top field is always 1, and the value of the capture_last field is always 0.

6. The \C escape sequence, which (in the standard algorithm) always matches a single code unit, even in a UTF mode, is not supported in UTF modes because the alternative algorithm moves through the subject string one character (not code unit) at a time, for all active paths through the tree.

7. Except for (*FAIL), the backtracking control verbs such as (*PRUNE) are not supported. (*FAIL) is supported, and behaves like a failing negative assertion.

8. The PCRE2_MATCH_INVALID_UTF option for pcre2_compile() is not supported by pcre2_dfa_match().

ADVANTAGES OF THE ALTERNATIVE ALGORITHM

The main advantage of the alternative algorithm is that all possible matches (at a single point in the subject) are automatically found, and in particular, the longest match is found. To find more than one match at the same point using the standard algorithm, you have to do kludgy things with callouts.

Partial matching is possible with this algorithm, though it has some limitations. The pcre2partial documentation gives details of partial matching and discusses multi-segment matching.

DISADVANTAGES OF THE ALTERNATIVE ALGORITHM

The alternative algorithm suffers from a number of disadvantages:

1. It is substantially slower than the standard algorithm. This is partly because it has to search for all possible matches, but is also because it is less susceptible to optimization.

2. Capturing parentheses and other features such as backreferences that rely on them are not supported.

3. Matching within invalid UTF strings is not supported.

4. Although atomic groups are supported, their use does not provide the performance advantage that it does for the standard algorithm.

5. JIT optimization is not supported.

AUTHOR

Philip Hazel
Retired from University Computing Service
Cambridge, England.

REVISION

Last updated: 22 February 2025
Copyright © 1997-2024 University of Cambridge.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2partial.html ================================================ pcre2partial specification

pcre2partial man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

PARTIAL MATCHING IN PCRE2

In normal use of PCRE2, if there is a match up to the end of a subject string, but more characters are needed to match the entire pattern, PCRE2_ERROR_NOMATCH is returned, just like any other failing match. There are circumstances where it might be helpful to distinguish this "partial match" case.

One example is an application where the subject string is very long, and not all available at once. The requirement here is to be able to do the matching segment by segment, but special action is needed when a matched substring spans the boundary between two segments.

Another example is checking a user input string as it is typed, to ensure that it conforms to a required format. Invalid characters can be immediately diagnosed and rejected, giving instant feedback.

Partial matching is a PCRE2-specific feature; it is not Perl-compatible. It is requested by setting one of the PCRE2_PARTIAL_HARD or PCRE2_PARTIAL_SOFT options when calling a matching function. The difference between the two options is whether or not a partial match is preferred to an alternative complete match, though the details differ between the two types of matching function. If both options are set, PCRE2_PARTIAL_HARD takes precedence.

If you want to use partial matching with just-in-time optimized code, as well as setting a partial match option for the matching function, you must also call pcre2_jit_compile() with one or both of these options:

  PCRE2_JIT_PARTIAL_HARD
  PCRE2_JIT_PARTIAL_SOFT
PCRE2_JIT_COMPLETE should also be set if you are going to run non-partial matches on the same pattern. Separate code is compiled for each mode. If the appropriate JIT mode has not been compiled, interpretive matching code is used.

Setting a partial matching option disables two of PCRE2's standard optimization hints. PCRE2 remembers the last literal code unit in a pattern, and abandons matching immediately if it is not present in the subject string. This optimization cannot be used for a subject string that might match only partially. PCRE2 also remembers a minimum length of a matching string, and does not bother to run the matching function on shorter strings. This optimization is also disabled for partial matching.

REQUIREMENTS FOR A PARTIAL MATCH

A possible partial match occurs during matching when the end of the subject string is reached successfully, but either more characters are needed to complete the match, or the addition of more characters might change what is matched.

Example 1: if the pattern is /abc/ and the subject is "ab", more characters are definitely needed to complete a match. In this case both hard and soft matching options yield a partial match.

Example 2: if the pattern is /ab+/ and the subject is "ab", a complete match can be found, but the addition of more characters might change what is matched. In this case, only PCRE2_PARTIAL_HARD returns a partial match; PCRE2_PARTIAL_SOFT returns the complete match.

On reaching the end of the subject, when PCRE2_PARTIAL_HARD is set, if the next pattern item is \z, \Z, \b, \B, or $ there is always a partial match. Otherwise, for both options, the next pattern item must be one that inspects a character, and at least one of the following must be true:

(1) At least one character has already been inspected. An inspected character need not form part of the final matched string; lookbehind assertions and the \K escape sequence provide ways of inspecting characters before the start of a matched string.

(2) The pattern contains one or more lookbehind assertions. This condition exists in case there is a lookbehind that inspects characters before the start of the match.

(3) There is a special case when the whole pattern can match an empty string. When the starting point is at the end of the subject, the empty string match is a possibility, and if PCRE2_PARTIAL_SOFT is set and neither of the above conditions is true, it is returned. However, because adding more characters might result in a non-empty match, PCRE2_PARTIAL_HARD returns a partial match, which in this case means "there is going to be a match at this point, but until some more characters are added, we do not know if it will be an empty string or something longer".

PARTIAL MATCHING USING pcre2_match()

When a partial matching option is set, the result of calling pcre2_match() can be one of the following:

A successful match: A complete match has been found, starting and ending within this subject.

PCRE2_ERROR_NOMATCH: No match can start anywhere in this subject.

PCRE2_ERROR_PARTIAL: Adding more characters may result in a complete match that uses one or more characters from the end of this subject.

When a partial match is returned, the first two elements in the ovector point to the portion of the subject that was matched, but the values in the rest of the ovector are undefined. The appearance of \K in the pattern has no effect for a partial match. Consider this pattern:

  /abc\K123/
If it is matched against "456abc123xyz" the result is a complete match, and the ovector defines the matched string as "123", because \K resets the "start of match" point. However, if a partial match is requested and the subject string is "456abc12", a partial match is found for the string "abc12", because all these characters are needed for a subsequent re-match with additional characters.

If there is more than one partial match, the first one that was found provides the data that is returned. Consider this pattern:

  /123\w+X|dogY/
If this is matched against the subject string "abc123dog", both alternatives fail to match, but the end of the subject is reached during matching, so PCRE2_ERROR_PARTIAL is returned. The offsets are set to 3 and 9, identifying "123dog" as the first partial match. (In this example, there are two partial matches, because "dog" on its own partially matches the second alternative.)

How a partial match is processed by pcre2_match()

What happens when a partial match is identified depends on which of the two partial matching options is set.

If PCRE2_PARTIAL_HARD is set, PCRE2_ERROR_PARTIAL is returned as soon as a partial match is found, without continuing to search for possible complete matches. This option is "hard" because it prefers an earlier partial match over a later complete match. For this reason, the assumption is made that the end of the supplied subject string is not the true end of the available data, which is why \z, \Z, \b, \B, and $ always give a partial match.

If PCRE2_PARTIAL_SOFT is set, the partial match is remembered, but matching continues as normal, and other alternatives in the pattern are tried. If no complete match can be found, PCRE2_ERROR_PARTIAL is returned instead of PCRE2_ERROR_NOMATCH. This option is "soft" because it prefers a complete match over a partial match. All the various matching items in a pattern behave as if the subject string is potentially complete; \z, \Z, and $ match at the end of the subject, as normal, and for \b and \B the end of the subject is treated as a non-alphanumeric.

The difference between the two partial matching options can be illustrated by a pattern such as:

  /dog(sbody)?/
This matches either "dog" or "dogsbody", greedily (that is, it prefers the longer string if possible). If it is matched against the string "dog" with PCRE2_PARTIAL_SOFT, it yields a complete match for "dog". However, if PCRE2_PARTIAL_HARD is set, the result is PCRE2_ERROR_PARTIAL. On the other hand, if the pattern is made ungreedy the result is different:
  /dog(sbody)??/
In this case the result is always a complete match because that is found first, and matching never continues after finding a complete match. It might be easier to follow this explanation by thinking of the two patterns like this:
  /dog(sbody)?/    is the same as  /dogsbody|dog/
  /dog(sbody)??/   is the same as  /dog|dogsbody/
The second pattern will never match "dogsbody", because it will always find the shorter match first.

Example of partial matching using pcre2test

The pcre2test data modifiers partial_hard (or ph) and partial_soft (or ps) set PCRE2_PARTIAL_HARD and PCRE2_PARTIAL_SOFT, respectively, when calling pcre2_match(). Here is a run of pcre2test using a pattern that matches the whole subject in the form of a date:

    re> /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/
  data> 25dec3\=ph
  Partial match: 25dec3
  data> 3ju\=ph
  Partial match: 3ju
  data> 3juj\=ph
  No match
This example gives the same results for both hard and soft partial matching options. Here is an example where there is a difference:
    re> /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/
  data> 25jun04\=ps
   0: 25jun04
   1: jun
  data> 25jun04\=ph
  Partial match: 25jun04
With PCRE2_PARTIAL_SOFT, the subject is matched completely. For PCRE2_PARTIAL_HARD, however, the subject is assumed not to be complete, so there is only a partial match.

MULTI-SEGMENT MATCHING WITH pcre2_match()

PCRE was not originally designed with multi-segment matching in mind. However, over time, features (including partial matching) that make multi-segment matching possible have been added. A very long string can be searched segment by segment by calling pcre2_match() repeatedly, with the aim of achieving the same results that would happen if the entire string was available for searching all the time. Normally, the strings that are being sought are much shorter than each individual segment, and are in the middle of very long strings, so the pattern is normally not anchored.

Special logic must be implemented to handle a matched substring that spans a segment boundary. PCRE2_PARTIAL_HARD should be used, because it returns a partial match at the end of a segment whenever there is the possibility of changing the match by adding more characters. The PCRE2_NOTBOL option should also be set for all but the first segment.

When a partial match occurs, the next segment must be added to the current subject and the match re-run, using the startoffset argument of pcre2_match() to begin at the point where the partial match started. For example:

    re> /\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d/
  data> ...the date is 23ja\=ph
  Partial match: 23ja
  data> ...the date is 23jan19 and on that day...\=offset=15
   0: 23jan19
   1: jan
Note the use of the offset modifier to start the new match where the partial match was found. In this example, the next segment was added to the one in which the partial match was found. This is the most straightforward approach, typically using a memory buffer that is twice the size of each segment. After a partial match, the first half of the buffer is discarded, the second half is moved to the start of the buffer, and a new segment is added before repeating the match as in the example above. After a no match, the entire buffer can be discarded.

If there are memory constraints, you may want to discard text that precedes a partial match before adding the next segment. Unfortunately, this is not at present straightforward. In cases such as the above, where the pattern does not contain any lookbehinds, it is sufficient to retain only the partially matched substring. However, if the pattern contains a lookbehind assertion, characters that precede the start of the partial match may have been inspected during the matching process. When pcre2test displays a partial match, it indicates these characters with '<' if the allusedtext modifier is set:

    re> "(?<=123)abc"
  data> xx123ab\=ph,allusedtext
  Partial match: 123ab
                 <<<
However, the allusedtext modifier is not available for JIT matching, because JIT matching does not record the first (or last) consulted characters. For this reason, this information is not available via the API. It is therefore not possible in general to obtain the exact number of characters that must be retained in order to get the right match result. If you cannot retain the entire segment, you must find some heuristic way of choosing.

If you know the approximate length of the matching substrings, you can use that to decide how much text to retain. The only lookbehind information that is currently available via the API is the length of the longest individual lookbehind in a pattern, but this can be misleading if there are nested lookbehinds. The value returned by calling pcre2_pattern_info() with the PCRE2_INFO_MAXLOOKBEHIND option is the maximum number of characters (not code units) that any individual lookbehind moves back when it is processed. A pattern such as "(?<=(?<!b)a)" has a maximum lookbehind value of one, but inspects two characters before its starting point.

In a non-UTF or a 32-bit case, moving back is just a subtraction, but in UTF-8 or UTF-16 you have to count characters while moving back through the code units.

PARTIAL MATCHING USING pcre2_dfa_match()

The DFA function moves along the subject string character by character, without backtracking, searching for all possible matches simultaneously. If the end of the subject is reached before the end of the pattern, there is the possibility of a partial match.

When PCRE2_PARTIAL_SOFT is set, PCRE2_ERROR_PARTIAL is returned only if there have been no complete matches. Otherwise, the complete matches are returned. If PCRE2_PARTIAL_HARD is set, a partial match takes precedence over any complete matches. The portion of the string that was matched when the longest partial match was found is set as the first matching string.

Because the DFA function always searches for all possible matches, and there is no difference between greedy and ungreedy repetition, its behaviour is different from that of pcre2_match(). Consider the string "dog" matched against this ungreedy pattern:

  /dog(sbody)??/
Whereas the standard function stops as soon as it finds the complete match for "dog", the DFA function also finds the partial match for "dogsbody", and so returns that when PCRE2_PARTIAL_HARD is set.

MULTI-SEGMENT MATCHING WITH pcre2_dfa_match()

When a partial match has been found using the DFA matching function, it is possible to continue the match by providing additional subject data and calling the function again with the same compiled regular expression, this time setting the PCRE2_DFA_RESTART option. You must pass the same working space as before, because this is where details of the previous partial match are stored. You can set the PCRE2_PARTIAL_SOFT or PCRE2_PARTIAL_HARD options with PCRE2_DFA_RESTART to continue partial matching over multiple segments. Here is an example using pcre2test:

    re> /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/
  data> 23ja\=dfa,ps
  Partial match: 23ja
  data> n05\=dfa,dfa_restart
   0: n05
The first call has "23ja" as the subject, and requests partial matching; the second call has "n05" as the subject for the continued (restarted) match. Notice that when the match is complete, only the last part is shown; PCRE2 does not retain the previously partially-matched string. It is up to the calling program to do that if it needs to. This means that, for an unanchored pattern, if a continued match fails, it is not possible to try again at a new starting point. All this facility is capable of doing is continuing with the previous match attempt. For example, consider this pattern:
  1234|3789
If the first part of the subject is "ABC123", a partial match of the first alternative is found at offset 3. There is no partial match for the second alternative, because such a match does not start at the same point in the subject string. Attempting to continue with the string "7890" does not yield a match because only those alternatives that match at one point in the subject are remembered. Depending on the application, this may or may not be what you want.

If you do want to allow for starting again at the next character, one way of doing it is to retain some or all of the segment and try a new complete match, as described for pcre2_match() above. Another possibility is to work with two buffers. If a partial match at offset n in the first buffer is followed by "no match" when PCRE2_DFA_RESTART is used on the second buffer, you can then try a new match starting at offset n+1 in the first buffer.

AUTHOR

Philip Hazel
Retired from University Computing Service
Cambridge, England.

REVISION

Last updated: 27 November 2024
Copyright © 1997-2019 University of Cambridge.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2pattern.html ================================================ pcre2pattern specification

pcre2pattern man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

PCRE2 REGULAR EXPRESSION DETAILS

The syntax and semantics of the regular expressions that are supported by PCRE2 are described in detail below. There is a quick-reference syntax summary in the pcre2syntax page. PCRE2 tries to match Perl syntax and semantics as closely as it can. PCRE2 also supports some alternative regular expression syntax that does not conflict with the Perl syntax in order to provide some compatibility with regular expressions in Python, .NET, and Oniguruma. There are in addition some options that enable alternative syntax and semantics that are not the same as in Perl.

Perl's regular expressions are described in its own documentation, and regular expressions in general are covered in a number of books, some of which have copious examples. Jeffrey Friedl's "Mastering Regular Expressions", published by O'Reilly, covers regular expressions in great detail. This description of PCRE2's regular expressions is intended as reference material.

This document discusses the regular expression patterns that are supported by PCRE2 when its main matching function, pcre2_match(), is used. PCRE2 also has an alternative matching function, pcre2_dfa_match(), which matches using a different algorithm that is not Perl-compatible. Some of the features discussed below are not available when DFA matching is used. The advantages and disadvantages of the alternative function, and how it differs from the normal function, are discussed in the pcre2matching page.

EBCDIC CHARACTER CODES

Most computers use ASCII or Unicode for encoding characters, and PCRE2 assumes this by default. However, it can be compiled to run in an environment that uses the EBCDIC code, which is the case for some IBM mainframe operating systems. In the sections below, character code values are ASCII or Unicode; in an EBCDIC environment these characters may have different code values, and there are no code points greater than 255. Differences in behaviour when PCRE2 is running in an EBCDIC environment are described in the section "EBCDIC environments" below, which you can ignore unless you really are in an EBCDIC environment.

SPECIAL START-OF-PATTERN ITEMS

A number of options that can be passed to pcre2_compile() can also be set by special items at the start of a pattern. These are not Perl-compatible, but are provided to make these options accessible to pattern writers who are not able to change the program that processes the pattern. Any number of these items may appear, but they must all be together right at the start of the pattern string, and the letters must be in upper case.

UTF support

In the 8-bit and 16-bit PCRE2 libraries, characters may be coded either as single code units, or as multiple UTF-8 or UTF-16 code units. UTF-32 can be specified for the 32-bit library, in which case it constrains the character values to valid Unicode code points. To process UTF strings, PCRE2 must be built to include Unicode support (which is the default). When using UTF strings you must either call the compiling function with one or both of the PCRE2_UTF or PCRE2_MATCH_INVALID_UTF options, or the pattern must start with the special sequence (*UTF), which is equivalent to setting the PCRE2_UTF option. How setting a UTF mode affects pattern matching is mentioned in several places below. There is also a summary of features in the pcre2unicode page.

Some applications that allow their users to supply patterns may wish to restrict them to non-UTF data for security reasons. If the PCRE2_NEVER_UTF option is passed to pcre2_compile(), (*UTF) is not allowed, and its appearance in a pattern causes an error.

Unicode property support

Another special sequence that may appear at the start of a pattern is (*UCP). This has the same effect as setting the PCRE2_UCP option: it causes sequences such as \d and \w to use Unicode properties to determine character types, instead of recognizing only characters with codes less than 256 via a lookup table. It also causes upper/lower casing operations to use Unicode properties for characters with code points greater than 127, even when UTF is not set. These behaviours can be changed within the pattern; see the section entitled "Internal Option Setting" below.

Some applications that allow their users to supply patterns may wish to restrict them for security reasons. If the PCRE2_NEVER_UCP option is passed to pcre2_compile(), (*UCP) is not allowed, and its appearance in a pattern causes an error.

Locking out empty string matching

Starting a pattern with (*NOTEMPTY) or (*NOTEMPTY_ATSTART) has the same effect as passing the PCRE2_NOTEMPTY or PCRE2_NOTEMPTY_ATSTART option to whichever matching function is subsequently called to match the pattern. These options lock out the matching of empty strings, either entirely, or only at the start of the subject.

Disabling auto-possessification

If a pattern starts with (*NO_AUTO_POSSESS), it has the same effect as setting the PCRE2_NO_AUTO_POSSESS option, or calling pcre2_set_optimize() with a PCRE2_AUTO_POSSESS_OFF directive. This stops PCRE2 from making quantifiers possessive when what follows cannot match the repeated item. For example, by default a+b is treated as a++b. For more details, see the pcre2api documentation.

Disabling start-up optimizations

If a pattern starts with (*NO_START_OPT), it has the same effect as setting the PCRE2_NO_START_OPTIMIZE option, or calling pcre2_set_optimize() with a PCRE2_START_OPTIMIZE_OFF directive. This disables several optimizations for quickly reaching "no match" results. For more details, see the pcre2api documentation.

Disabling automatic anchoring

If a pattern starts with (*NO_DOTSTAR_ANCHOR), it has the same effect as setting the PCRE2_NO_DOTSTAR_ANCHOR option, or calling pcre2_set_optimize() with a PCRE2_DOTSTAR_ANCHOR_OFF directive. This disables optimizations that apply to patterns whose top-level branches all start with .* (match any number of arbitrary characters). For more details, see the pcre2api documentation.

Disabling JIT compilation

If a pattern that starts with (*NO_JIT) is successfully compiled, an attempt by the application to apply the JIT optimization by calling pcre2_jit_compile() is ignored.

Setting match resource limits

The pcre2_match() function contains a counter that is incremented every time it goes round its main loop. The caller of pcre2_match() can set a limit on this counter, which therefore limits the amount of computing resource used for a match. The maximum depth of nested backtracking can also be limited; this indirectly restricts the amount of heap memory that is used, but there is also an explicit memory limit that can be set.

These facilities are provided to catch runaway matches that are provoked by patterns with huge matching trees. A common example is a pattern with nested unlimited repeats applied to a long string that does not match. When one of these limits is reached, pcre2_match() gives an error return. The limits can also be set by items at the start of the pattern of the form

  (*LIMIT_HEAP=d)
  (*LIMIT_MATCH=d)
  (*LIMIT_DEPTH=d)
where d is any number of decimal digits. However, the value of the setting must be less than the value set (or defaulted) by the caller of pcre2_match() for it to have any effect. In other words, the pattern writer can lower the limits set by the programmer, but not raise them. If there is more than one setting of one of these limits, the lower value is used. The heap limit is specified in kibibytes (units of 1024 bytes).

Prior to release 10.30, LIMIT_DEPTH was called LIMIT_RECURSION. This name is still recognized for backwards compatibility.

The heap limit applies only when the pcre2_match() or pcre2_dfa_match() interpreters are used for matching. It does not apply to JIT. The match limit is used (but in a different way) when JIT is being used, or when pcre2_dfa_match() is called, to limit computing resource usage by those matching functions. The depth limit is ignored by JIT but is relevant for DFA matching, which uses function recursion for recursions within the pattern and for lookaround assertions and atomic groups. In this case, the depth limit controls the depth of such recursion.

Newline conventions

PCRE2 supports six different conventions for indicating line breaks in strings: a single CR (carriage return) character, a single LF (linefeed) character, the two-character sequence CRLF, any of the three preceding, any Unicode newline sequence, or the NUL character (binary zero). The pcre2api page has further discussion about newlines, and shows how to set the newline convention when calling pcre2_compile().

It is also possible to specify a newline convention by starting a pattern string with one of the following sequences:

  (*CR)        carriage return
  (*LF)        linefeed
  (*CRLF)      carriage return, followed by linefeed
  (*ANYCRLF)   any of the three above
  (*ANY)       all Unicode newline sequences
  (*NUL)       the NUL character (binary zero)
These override the default and the options given to the compiling function. For example, on a Unix system where LF is the default newline sequence, the pattern
  (*CR)a.b
changes the convention to CR. That pattern matches "a\nb" because LF is no longer a newline. If more than one of these settings is present, the last one is used.

The newline convention affects where the circumflex and dollar assertions are true. It also affects the interpretation of the dot metacharacter when PCRE2_DOTALL is not set, and the behaviour of \N when not followed by an opening brace. However, it does not affect what the \R escape sequence matches. By default, this is any Unicode newline sequence, for Perl compatibility. However, this can be changed; see the next section and the description of \R in the section entitled "Newline sequences" below. A change of \R setting can be combined with a change of newline convention.

Specifying what \R matches

It is possible to restrict \R to match only CR, LF, or CRLF (instead of the complete set of Unicode line endings) by setting the option PCRE2_BSR_ANYCRLF at compile time. This effect can also be achieved by starting a pattern with (*BSR_ANYCRLF). For completeness, (*BSR_UNICODE) is also recognized, corresponding to PCRE2_BSR_UNICODE.

CHARACTERS AND METACHARACTERS

A regular expression is a pattern that is matched against a subject string from left to right. Most characters stand for themselves in a pattern, and match the corresponding characters in the subject. As a trivial example, the pattern

  The quick brown fox
matches a portion of a subject string that is identical to itself. When caseless matching is specified (the PCRE2_CASELESS option or (?i) within the pattern), letters are matched independently of case. Note that there are two ASCII characters, K and S, that, in addition to their lower case ASCII equivalents, are case-equivalent with Unicode U+212A (Kelvin sign) and U+017F (long S) respectively when either PCRE2_UTF or PCRE2_UCP is set, unless the PCRE2_EXTRA_CASELESS_RESTRICT option is in force (either passed to pcre2_compile() or set by (*CASELESS_RESTRICT) or (?r) within the pattern). If the PCRE2_EXTRA_TURKISH_CASING option is in force (either passed to pcre2_compile() or set by (*TURKISH_CASING) within the pattern), then the 'i' letters are matched according to the casing rules of the Turkish and Azeri languages.

The power of regular expressions comes from the ability to include wild cards, character classes, alternatives, and repetitions in the pattern. These are encoded in the pattern by the use of metacharacters, which do not stand for themselves but instead are interpreted in some special way.

There are two different sets of metacharacters: those that are recognized anywhere in the pattern except within square brackets, and those that are recognized within square brackets. Outside square brackets, the metacharacters are as follows:

  \      general escape character with several uses
  ^      assert start of string (or line, in multiline mode)
  $      assert end of string (or line, in multiline mode)
  .      match any character except newline (by default)
  [      start character class definition
  |      start of alternative branch
  (      start group or control verb
  )      end group or control verb
  *      0 or more quantifier
  +      1 or more quantifier; also "possessive quantifier"
  ?      0 or 1 quantifier; also quantifier minimizer
  {      potential start of min/max quantifier
Brace characters { and } are also used to enclose data for constructions such as \g{2} or \k{name}. In almost all uses of braces, space and/or horizontal tab characters that follow { or precede } are allowed and are ignored. In the case of quantifiers, they may also appear before or after the comma. The exception to this is \u{...} which is an ECMAScript compatibility feature that is recognized only when the PCRE2_EXTRA_ALT_BSUX option is set. ECMAScript does not ignore such white space; it causes the item to be interpreted as literal.

Part of a pattern that is in square brackets is called a "character class". In a character class the only metacharacters are:

  \      general escape character
  ^      negate the class, but only if it is the first character
  -      indicates character range
  [      POSIX character class (if followed by POSIX syntax)
  ]      terminates the character class
If a pattern is compiled with the PCRE2_EXTENDED option, most white space in the pattern, other than in a character class, within a \Q...\E sequence, or between a # outside a character class and the next newline, inclusive, is ignored. An escaping backslash can be used to include a white space or a # character as part of the pattern. If the PCRE2_EXTENDED_MORE option is set, the same applies, but in addition unescaped space and horizontal tab characters are ignored inside a character class. Note: only these two characters are ignored, not the full set of pattern white space characters that are ignored outside a character class. Option settings can be changed within a pattern; see the section entitled "Internal Option Setting" below.

The following sections describe the use of each of the metacharacters.

BACKSLASH

The backslash character has several uses. Firstly, if it is followed by a character that is not a digit or a letter, it takes away any special meaning that character may have. This use of backslash as an escape character applies both inside and outside character classes.

For example, if you want to match a * character, you must write \* in the pattern. This escaping action applies whether or not the following character would otherwise be interpreted as a metacharacter, so it is always safe to precede a non-alphanumeric with backslash to specify that it stands for itself. In particular, if you want to match a backslash, you write \\.

Only ASCII digits and letters have any special meaning after a backslash. All other characters (in particular, those whose code points are greater than 127) are treated as literals.

If you want to treat all characters in a sequence as literals, you can do so by putting them between \Q and \E. Note that this includes white space even when the PCRE2_EXTENDED option is set so that most other white space is ignored. The behaviour is different from Perl in that $ and @ are handled as literals in \Q...\E sequences in PCRE2, whereas in Perl, $ and @ cause variable interpolation. Also, Perl does "double-quotish backslash interpolation" on any backslashes between \Q and \E which, its documentation says, "may lead to confusing results". PCRE2 treats a backslash between \Q and \E just like any other character. Note the following examples:

  Pattern            PCRE2 matches   Perl matches

  \Qabc$xyz\E        abc$xyz        abc followed by the contents of $xyz
  \Qabc\$xyz\E       abc\$xyz       abc\$xyz
  \Qabc\E\$\Qxyz\E   abc$xyz        abc$xyz
  \QA\B\E            A\B            A\B
  \Q\\E              \              \\E
The \Q...\E sequence is recognized both inside and outside character classes. An isolated \E that is not preceded by \Q is ignored. If \Q is not followed by \E later in the pattern, the literal interpretation continues to the end of the pattern (that is, \E is assumed at the end). If the isolated \Q is inside a character class, this causes an error, because the character class is then not terminated by a closing square bracket.

Another difference from Perl is that any appearance of \Q or \E inside what might otherwise be a quantifier causes PCRE2 not to recognize the sequence as a quantifier. Perl recognizes a quantifier if (redundantly) either of the numbers is inside \Q...\E, but not if the separating comma is. When not recognized as a quantifier a sequence such as {\Q1\E,2} is treated as the literal string "{1,2}".

Non-printing characters

A second use of backslash provides a way of encoding non-printing characters in patterns in a visible manner. There is no restriction on the appearance of non-printing characters in a pattern, but when a pattern is being prepared by text editing, it is often easier to use one of the following escape sequences instead of the binary character it represents. In an ASCII or Unicode environment, these escapes are as follows:

  \a          alarm, that is, the BEL character (hex 07)
  \cx         "control-x", where x is a non-control ASCII character
  \e          escape (hex 1B)
  \f          form feed (hex 0C)
  \n          linefeed (hex 0A)
  \r          carriage return (hex 0D) (but see below)
  \t          tab (hex 09)
  \0dd        character with octal code 0dd
  \ddd        character with octal code ddd, or back reference
  \o{ddd..}   character with octal code ddd..
  \xhh        character with hex code hh
  \x{hhh..}   character with hex code hhh..
  \N{U+hhh..} character with Unicode hex code point hhh..
A description of how back references work is given later, following the discussion of parenthesized groups.

By default, after \x that is not followed by {, one or two hexadecimal digits are read (letters can be in upper or lower case). If the character that follows \x is neither { nor a hexadecimal digit, an error occurs. This is different from Perl's default behaviour, which generates a NUL character, but is in line with the behaviour of Perl's 'strict' mode in re.

Any number of hexadecimal digits may appear between \x{ and }. If a character other than a hexadecimal digit appears between \x{ and }, or if there is no terminating }, an error occurs.

Characters whose code points are less than 256 can be defined by either of the two syntaxes for \x or by an octal sequence. There is no difference in the way they are handled. For example, \xdc is exactly the same as \x{dc} or \334. However, using the braced versions does make such sequences easier to read.

Support is available for some ECMAScript (aka JavaScript) escape sequences via two compile-time options. If PCRE2_ALT_BSUX is set, the sequence \x followed by { is not recognized. Only if \x is followed by two hexadecimal digits is it recognized as a character escape. Otherwise it is interpreted as a literal "x" character. In this mode, support for code points greater than 255 is provided by \u, which must be followed by four hexadecimal digits; otherwise it is interpreted as a literal "u" character.

PCRE2_EXTRA_ALT_BSUX has the same effect as PCRE2_ALT_BSUX and, in addition, \u{hhh..} is recognized as the character specified by hexadecimal code point. There may be any number of hexadecimal digits, but unlike other places that also use curly brackets, spaces are not allowed and would result in the string being interpreted as a literal. This syntax is from ECMAScript 6.

The \N{U+hhh..} escape sequence is recognized only when PCRE2 is operating in UTF mode. Perl also uses \N{name} to specify characters by Unicode name; PCRE2 does not support this. Note that when \N is not followed by an opening brace (curly bracket) it has an entirely different meaning, matching any character that is not a newline.

There are some legacy applications where the escape sequence \r is expected to match a newline. If the PCRE2_EXTRA_ESCAPED_CR_IS_LF option is set, \r in a pattern is converted to \n so that it matches a LF (linefeed) instead of a CR (carriage return) character.

An error occurs if \c is not followed by a character whose ASCII code point is in the range 32 to 126. The precise effect of \cx is as follows: if x is a lower case letter, it is converted to upper case. Then bit 6 of the character (hex 40) is inverted. Thus \cA to \cZ become hex 01 to hex 1A (A is 41, Z is 5A), but \c{ becomes hex 3B ({ is 7B), and \c; becomes hex 7B (; is 3B). If the code unit following \c has a code point less than 32 or greater than 126, a compile-time error occurs.

For differences in the way some escapes behave in EBCDIC environments, see section "EBCDIC environments" below.

Octal escapes and back references

The escape \o must be followed by a sequence of octal digits, enclosed in braces. An error occurs if this is not the case. This escape provides a way of specifying character code points as octal numbers greater than 0777, and it also allows octal numbers and backreferences to be unambiguously distinguished.

If braces are not used, after \0 up to two further octal digits are read. However, if the PCRE2_EXTRA_NO_BS0 option is set, at least one more octal digit must follow \0 (use \00 to generate a NUL character). Make sure you supply two digits after the initial zero if the pattern character that follows is itself an octal digit.

Inside a character class, when a backslash is followed by any octal digit, up to three octal digits are read to generate a code point. Any subsequent digits stand for themselves. The sequences \8 and \9 are treated as the literal characters "8" and "9".

Outside a character class, Perl's handling of a backslash followed by a digit other than 0 is complicated by ambiguity, and Perl has changed over time, causing PCRE2 also to change. From PCRE2 release 10.45 there is an option called PCRE2_EXTRA_PYTHON_OCTAL that causes PCRE2 to use Python's unambiguous rules. The next two subsections describe the two sets of rules.

For greater clarity and unambiguity, it is best to avoid following \ by a digit greater than zero. Instead, use \o{...} or \x{...} to specify numerical character code points, and \g{...} to specify backreferences.

Perl rules for non-class backslash 1-9

All the digits that follow the backslash are read as a decimal number. If the number is less than 10, begins with the digit 8 or 9, or if there are at least that many previous capture groups in the expression, the entire sequence is taken as a back reference. Otherwise, up to three octal digits are read to form a character code. For example:

  \040   is another way of writing an ASCII space
  \40    is the same, provided there are fewer than 40 previous capture groups
  \7     is always a backreference
  \11    might be a backreference, or another way of writing a tab
  \011   is always a tab
  \0113  is a tab followed by the character "3"
  \113   might be a backreference, otherwise the character with octal code 113
  \377   might be a backreference, otherwise the value 255 (decimal)
  \81    is always a backreference
Note that octal values of 100 or greater that are specified using this syntax must not be introduced by a leading zero, because no more than three octal digits are ever read.

Python rules for non-class backslash 1-9

If there are at least three octal digits after the backslash, exactly three are read as an octal code point number, but the value must be no greater than \377, even in modes where higher code point values are supported. Any subsequent digits stand for themselves. If there are fewer than three octal digits, the sequence is taken as a decimal back reference. Thus, for example, \12 is always a back reference, independent of how many captures there are in the pattern. An error is generated for a reference to a non-existent capturing group.

Constraints on character values

Characters that are specified using octal or hexadecimal numbers are limited to certain values, as follows:

  8-bit non-UTF mode    no greater than 0xff
  16-bit non-UTF mode   no greater than 0xffff
  32-bit non-UTF mode   no greater than 0xffffffff
  All UTF modes         no greater than 0x10ffff and a valid code point
Invalid Unicode code points are all those in the range 0xd800 to 0xdfff (the so-called "surrogate" code points). The check for these can be disabled by the caller of pcre2_compile() by setting the option PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES. However, this is possible only in UTF-8 and UTF-32 modes, because these values are not representable in UTF-16.

Escape sequences in character classes

All the sequences that define a single character value can be used both inside and outside character classes. In addition, inside a character class, \b is interpreted as the backspace character (hex 08).

When not followed by an opening brace, \N is not allowed in a character class. \B, \R, and \X are not special inside a character class. Like other unrecognized alphabetic escape sequences, they cause an error. Outside a character class, these sequences have different meanings.

Unsupported escape sequences

In Perl, the sequences \F, \l, \L, \u, and \U are recognized by its string handler and used to modify the case of following characters. By default, PCRE2 does not support these escape sequences in patterns. However, if either of the PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX options is set, \U matches a "U" character, and \u can be used to define a character by code point, as described above.

Absolute and relative backreferences

The sequence \g followed by a signed or unsigned number, optionally enclosed in braces, is an absolute or relative backreference. A named backreference can be coded as \g{name}. Backreferences are discussed later, following the discussion of parenthesized groups.

Absolute and relative subroutine calls

For compatibility with Oniguruma, the non-Perl syntax \g followed by a name or a number enclosed either in angle brackets or single quotes, is an alternative syntax for referencing a capture group as a subroutine. Details are discussed later. Note that \g{...} (Perl syntax) and \g<...> (Oniguruma syntax) are not synonymous. The former is a backreference; the latter is a subroutine call.

Generic character types

Another use of backslash is for specifying generic character types:

  \d     any decimal digit
  \D     any character that is not a decimal digit
  \h     any horizontal white space character
  \H     any character that is not a horizontal white space character
  \N     any character that is not a newline
  \s     any white space character
  \S     any character that is not a white space character
  \v     any vertical white space character
  \V     any character that is not a vertical white space character
  \w     any "word" character
  \W     any "non-word" character
The \N escape sequence has the same meaning as the "." metacharacter when PCRE2_DOTALL is not set, but setting PCRE2_DOTALL does not change the meaning of \N. Note that when \N is followed by an opening brace it has a different meaning. See the section entitled "Non-printing characters" above for details. Perl also uses \N{name} to specify characters by Unicode name; PCRE2 does not support this.

Each pair of lower and upper case escape sequences partitions the complete set of characters into two disjoint sets. Any given character matches one, and only one, of each pair. The sequences can appear both inside and outside character classes. They each match one character of the appropriate type. If the current matching point is at the end of the subject string, all of them fail, because there is no character to match.

The default \s characters are HT (9), LF (10), VT (11), FF (12), CR (13), and space (32), which are defined as white space in the "C" locale. This list may vary if locale-specific matching is taking place. For example, in some locales the "non-breaking space" character (\xA0) is recognized as white space, and in others the VT character is not.

A "word" character is an underscore or any character that is a letter or digit. By default, the definition of letters and digits is controlled by PCRE2's low-valued character tables, and may vary if locale-specific matching is taking place (see "Locale support" in the pcre2api page). For example, in a French locale such as "fr_FR" in Unix-like systems, or "french" in Windows, some character codes greater than 127 are used for accented letters, and these are then matched by \w. The use of locales with Unicode is discouraged.

By default, characters whose code points are greater than 127 never match \d, \s, or \w, and always match \D, \S, and \W, although this may be different for characters in the range 128-255 when locale-specific matching is happening. These escape sequences retain their original meanings from before Unicode support was available, mainly for efficiency reasons. If the PCRE2_UCP option is set, the behaviour is changed so that Unicode properties are used to determine character types, as follows:

  \d  any character that matches \p{Nd} (decimal digit)
  \s  any character that matches \p{Z} or \h or \v
  \w  any character that matches \p{L}, \p{N}, \p{Mn}, or \p{Pc}
The addition of \p{Mn} (non-spacing mark) and the replacement of an explicit test for underscore with a test for \p{Pc} (connector punctuation) happened in PCRE2 release 10.43. This brings PCRE2 into line with Perl.

The upper case escapes match the inverse sets of characters. Note that \d matches only decimal digits, whereas \w matches any Unicode digit, as well as other character categories. Note also that PCRE2_UCP affects \b and \B because they are defined in terms of \w and \W. Matching these sequences is noticeably slower when PCRE2_UCP is set.

The effect of PCRE2_UCP on any one of these escape sequences can be negated by the options PCRE2_EXTRA_ASCII_BSD, PCRE2_EXTRA_ASCII_BSS, and PCRE2_EXTRA_ASCII_BSW, respectively. These options can be set and reset within a pattern by means of an internal option setting (see below).

The sequences \h, \H, \v, and \V, in contrast to the other sequences, which match only ASCII characters by default, always match a specific list of code points, whether or not PCRE2_UCP is set. The horizontal space characters are:

  U+0009     Horizontal tab (HT)
  U+0020     Space
  U+00A0     Non-break space
  U+1680     Ogham space mark
  U+180E     Mongolian vowel separator
  U+2000     En quad
  U+2001     Em quad
  U+2002     En space
  U+2003     Em space
  U+2004     Three-per-em space
  U+2005     Four-per-em space
  U+2006     Six-per-em space
  U+2007     Figure space
  U+2008     Punctuation space
  U+2009     Thin space
  U+200A     Hair space
  U+202F     Narrow no-break space
  U+205F     Medium mathematical space
  U+3000     Ideographic space
The vertical space characters are:
  U+000A     Linefeed (LF)
  U+000B     Vertical tab (VT)
  U+000C     Form feed (FF)
  U+000D     Carriage return (CR)
  U+0085     Next line (NEL)
  U+2028     Line separator
  U+2029     Paragraph separator
In 8-bit, non-UTF-8 mode, only the characters with code points less than 256 are relevant.

Newline sequences

Outside a character class, by default, the escape sequence \R matches any Unicode newline sequence. In 8-bit non-UTF-8 mode \R is equivalent to the following:

  (?>\r\n|\n|\x0b|\f|\r|\x85)
This is an example of an "atomic group", details of which are given below. This particular group matches either the two-character sequence CR followed by LF, or one of the single characters LF (linefeed, U+000A), VT (vertical tab, U+000B), FF (form feed, U+000C), CR (carriage return, U+000D), or NEL (next line, U+0085). Because this is an atomic group, the two-character sequence is treated as a single unit that cannot be split.

In other modes, two additional characters whose code points are greater than 255 are added: LS (line separator, U+2028) and PS (paragraph separator, U+2029). Unicode support is not needed for these characters to be recognized.

It is possible to restrict \R to match only CR, LF, or CRLF (instead of the complete set of Unicode line endings) by setting the option PCRE2_BSR_ANYCRLF at compile time. (BSR is an abbreviation for "backslash R".) This can be made the default when PCRE2 is built; if this is the case, the other behaviour can be requested via the PCRE2_BSR_UNICODE option. It is also possible to specify these settings by starting a pattern string with one of the following sequences:

  (*BSR_ANYCRLF)   CR, LF, or CRLF only
  (*BSR_UNICODE)   any Unicode newline sequence
These override the default and the options given to the compiling function. Note that these special settings, which are not Perl-compatible, are recognized only at the very start of a pattern, and that they must be in upper case. If more than one of them is present, the last one is used. They can be combined with a change of newline convention; for example, a pattern can start with:
  (*ANY)(*BSR_ANYCRLF)
They can also be combined with the (*UTF) or (*UCP) special sequences. Inside a character class, \R is treated as an unrecognized escape sequence, and causes an error.

Unicode character properties

When PCRE2 is built with Unicode support (the default), three additional escape sequences that match characters with specific properties are available. They can be used in any mode, though in 8-bit and 16-bit non-UTF modes these sequences are of course limited to testing characters whose code points are less than U+0100 or U+10000, respectively. In 32-bit non-UTF mode, code points greater than 0x10ffff (the Unicode limit) may be encountered. These are all treated as being in the Unknown script and with an unassigned type.

Matching characters by Unicode property is not fast, because PCRE2 has to do a multistage table lookup in order to find a character's property. That is why the traditional escape sequences such as \d and \w do not use Unicode properties in PCRE2 by default, though you can make them do so by setting the PCRE2_UCP option or by starting the pattern with (*UCP).

The extra escape sequences that provide property support are:

  \p{xx}   a character with the xx property
  \P{xx}   a character without the xx property
  \X       a Unicode extended grapheme cluster
For compatibility with Perl, negation can be specified by including a circumflex between the opening brace and the property. For example, \p{^Lu} is the same as \P{Lu}.

In accordance with Unicode's "loose matching" rules, ASCII white space characters, hyphens, and underscores are ignored in the properties represented by xx above. As well as the space character, ASCII white space can be tab, linefeed, vertical tab, formfeed, or carriage return.

Some properties are specified as a name only; others as a name and a value, separated by a colon or an equals sign. The names and values consist of ASCII letters and digits (with one Perl-specific exception, see below). They are not case sensitive. Note, however, that the escapes themselves, \p and \P, are case sensitive. There are abbreviations for many names. The following examples are all equivalent:

  \p{bidiclass=al}
  \p{BC=al}
  \p{ Bidi_Class : AL }
  \p{ Bi-di class = Al }
  \P{ ^ Bi-di class = Al }
There is support for Unicode script names, Unicode general category properties, "Any", which matches any character (including newline), Bidi_Class, a number of binary (yes/no) properties, and some special PCRE2 properties (described below). Certain other Perl properties such as "InMusicalSymbols" are not supported by PCRE2. Note that \P{Any} does not match any characters, so always causes a match failure.

Script properties for \p and \P

There are three different syntax forms for matching a script. Each Unicode character has a basic script and, optionally, a list of other scripts ("Script Extensions") with which it is commonly used. Using the Adlam script as an example, \p{sc:Adlam} matches characters whose basic script is Adlam, whereas \p{scx:Adlam} matches, in addition, characters that have Adlam in their extensions list. The full names "script" and "script extensions" for the property types are recognized and, as for all property specifications, an equals sign is an alternative to the colon. If a script name is given without a property type, for example, \p{Adlam}, it is treated as \p{scx:Adlam}. Perl changed to this interpretation at release 5.26 and PCRE2 changed at release 10.40.

Unassigned characters (and in non-UTF 32-bit mode, characters with code points greater than 0x10FFFF) are assigned the "Unknown" script. Others that are not part of an identified script are lumped together as "Common". The current list of recognized script names and their 4-character abbreviations can be obtained by running this command:

  pcre2test -LS

The general category property for \p and \P

Each character has exactly one Unicode general category property, specified by a two-letter abbreviation. If only one letter is specified with \p or \P, it includes all the general category properties that start with that letter. In this case, in the absence of negation, the curly brackets in the escape sequence are optional; these two examples have the same effect:

  \p{L}
  \pL
The following general category property codes are supported:
  C     Other
  Cc    Control
  Cf    Format
  Cn    Unassigned
  Co    Private use
  Cs    Surrogate

  L     Letter
  Lc    Cased letter
  Ll    Lower case letter
  Lm    Modifier letter
  Lo    Other letter
  Lt    Title case letter
  Lu    Upper case letter

  M     Mark
  Mc    Spacing mark
  Me    Enclosing mark
  Mn    Non-spacing mark

  N     Number
  Nd    Decimal number
  Nl    Letter number
  No    Other number

  P     Punctuation
  Pc    Connector punctuation
  Pd    Dash punctuation
  Pe    Close punctuation
  Pf    Final punctuation
  Pi    Initial punctuation
  Po    Other punctuation
  Ps    Open punctuation

  S     Symbol
  Sc    Currency symbol
  Sk    Modifier symbol
  Sm    Mathematical symbol
  So    Other symbol

  Z     Separator
  Zl    Line separator
  Zp    Paragraph separator
  Zs    Space separator
Perl originally used the name L& for the Lc property. This is still supported by Perl, but discouraged. PCRE2 also still supports it. This property matches any character that has the Lu, Ll, or Lt property, in other words, any letter that is not classified as a modifier or "other". From release 10.45 of PCRE2 the properties Lu, Ll, and Lt are all treated as Lc when case-independent matching is set by the PCRE2_CASELESS option or (?i) within the pattern. The other properties are not affected by caseless matching.

The Cs (Surrogate) property applies only to characters whose code points are in the range U+D800 to U+DFFF. These characters are no different to any other character when PCRE2 is not in UTF mode (using the 16-bit or 32-bit library). However, they are not valid in Unicode strings and so cannot be tested by PCRE2 in UTF mode, unless UTF validity checking has been turned off (see the discussion of PCRE2_NO_UTF_CHECK in the pcre2api page).

The long synonyms for property names that Perl supports (such as \p{Letter}) are not supported by PCRE2, nor is it permitted to prefix any of these properties with "Is".

No character that is in the Unicode table has the Cn (unassigned) property. Instead, this property is assumed for any code point that is not in the Unicode table.

Binary (yes/no) properties for \p and \P

Unicode defines a number of binary properties, that is, properties whose only values are true or false. You can obtain a list of those that are recognized by \p and \P, along with their abbreviations, by running this command:

  pcre2test -LP

The Bidi_Class property for \p and \P

  \p{Bidi_Class:<class>}   matches a character with the given class
  \p{BC:<class>}           matches a character with the given class
The recognized classes are:
  AL          Arabic letter
  AN          Arabic number
  B           paragraph separator
  BN          boundary neutral
  CS          common separator
  EN          European number
  ES          European separator
  ET          European terminator
  FSI         first strong isolate
  L           left-to-right
  LRE         left-to-right embedding
  LRI         left-to-right isolate
  LRO         left-to-right override
  NSM         non-spacing mark
  ON          other neutral
  PDF         pop directional format
  PDI         pop directional isolate
  R           right-to-left
  RLE         right-to-left embedding
  RLI         right-to-left isolate
  RLO         right-to-left override
  S           segment separator
  WS          white space
As in all property specifications, an equals sign may be used instead of a colon and the class names are case-insensitive. Only the short names listed above are recognized; PCRE2 does not at present support any long alternatives.

Extended grapheme clusters

The \X escape matches any number of Unicode characters that form an "extended grapheme cluster", and treats the sequence as an atomic group (see below). Unicode supports various kinds of composite character by giving each character a grapheme breaking property, and having rules that use these properties to define the boundaries of extended grapheme clusters. The rules are defined in Unicode Standard Annex 29, "Unicode Text Segmentation". Unicode 11.0.0 abandoned the use of some previous properties that had been used for emojis. Instead it introduced various emoji-specific properties. PCRE2 uses only the Extended Pictographic property.

\X always matches at least one character. Then it decides whether to add additional characters according to the following rules for ending a cluster:

1. End at the end of the subject string.

2. Do not end between CR and LF; otherwise end after any control character.

3. Do not break Hangul (a Korean script) syllable sequences. Hangul characters are of five types: L, V, T, LV, and LVT. An L character may be followed by an L, V, LV, or LVT character; an LV or V character may be followed by a V or T character; an LVT or T character may be followed only by a T character.

4. Do not end before extending characters or spacing marks or the zero-width joiner (ZWJ) character. Characters with the "mark" property always have the "extend" grapheme breaking property.

5. Do not end after prepend characters.

6. Do not end within emoji modifier sequences or emoji ZWJ (zero-width joiner) sequences. An emoji ZWJ sequence consists of a character with the Extended_Pictographic property, optionally followed by one or more characters with the Extend property, followed by the ZWJ character, followed by another Extended_Pictographic character.

7. Do not break within emoji flag sequences. That is, do not break between regional indicator (RI) characters if there are an odd number of RI characters before the break point.

8. Otherwise, end the cluster.

PCRE2's additional properties

As well as the standard Unicode properties described above, PCRE2 supports four more that make it possible to convert traditional escape sequences such as \w and \s to use Unicode properties. PCRE2 uses these non-standard, non-Perl properties internally when PCRE2_UCP is set. However, they may also be used explicitly. These properties are:

  Xan   Any alphanumeric character
  Xps   Any POSIX space character
  Xsp   Any Perl space character
  Xwd   Any Perl "word" character
Xan matches characters that have either the L (letter) or the N (number) property. Xps matches the characters tab, linefeed, vertical tab, form feed, or carriage return, and any other character that has the Z (separator) property (this includes the space character). Xsp is the same as Xps; in PCRE1 it used to exclude vertical tab, for Perl compatibility, but Perl changed. Xwd matches the same characters as Xan, plus those that match Mn (non-spacing mark) or Pc (connector punctuation, which includes underscore).

There is another non-standard property, Xuc, which matches any character that can be represented by a Universal Character Name in C++ and other programming languages. These are the characters $, @, ` (grave accent), and all characters with Unicode code points greater than or equal to U+00A0, except for the surrogates U+D800 to U+DFFF. Note that most base (ASCII) characters are excluded. (Universal Character Names are of the form \uHHHH or \UHHHHHHHH where H is a hexadecimal digit. Note that the Xuc property does not match these sequences but the characters that they represent.)

Resetting the match start

In normal use, the escape sequence \K causes any previously matched characters not to be included in the final matched sequence that is returned. For example, the pattern:

  foo\Kbar
matches "foobar", but reports that it has matched "bar". \K does not interact with anchoring in any way. The pattern:
  ^foo\Kbar
matches only when the subject begins with "foobar" (in single line mode), though it again reports the matched string as "bar". This feature is similar to a lookbehind assertion (described below), but the part of the pattern that precedes \K is not constrained to match a limited number of characters, as is required for a lookbehind assertion. The use of \K does not interfere with the setting of captured substrings. For example, when the pattern
  (foo)\Kbar
matches "foobar", the first substring is still set to "foo".

From version 5.32.0 Perl forbids the use of \K in lookaround assertions. From release 10.38 PCRE2 also forbids this by default. However, the PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK option can be used when calling pcre2_compile() to re-enable the previous behaviour. When this option is set, \K is acted upon when it occurs inside positive assertions, but is ignored in negative assertions. Note that when a pattern such as (?=ab\K) matches, the reported start of the match can be greater than the end of the match. Using \K in a lookbehind assertion at the start of a pattern can also lead to odd effects. For example, consider this pattern:

  (?<=\Kfoo)bar
If the subject is "foobar", a call to pcre2_match() with a starting offset of 3 succeeds and reports the matching string as "foobar", that is, the start of the reported match is earlier than where the match started.

Simple assertions

The final use of backslash is for certain simple assertions. An assertion specifies a condition that has to be met at a particular point in a match, without consuming any characters from the subject string. The use of groups for more complicated assertions is described below. The backslashed assertions are:

  \b     matches at a word boundary
  \B     matches when not at a word boundary
  \A     matches at the start of the subject
  \Z     matches at the end of the subject
          also matches before a newline at the end of the subject
  \z     matches only at the end of the subject
  \G     matches at the first matching position in the subject
Inside a character class, \b has a different meaning; it matches the backspace character. If any other of these assertions appears in a character class, an "invalid escape sequence" error is generated.

A word boundary is a position in the subject string where the current character and the previous character do not both match \w or \W (i.e. one matches \w and the other matches \W), or the start or end of the string if the first or last character matches \w, respectively. When PCRE2 is built with Unicode support, the meanings of \w and \W can be changed by setting the PCRE2_UCP option. When this is done, it also affects \b and \B. Neither PCRE2 nor Perl has a separate "start of word" or "end of word" metasequence. However, whatever follows \b normally determines which it is. For example, the fragment \ba matches "a" at the start of a word.

The \A, \Z, and \z assertions differ from the traditional circumflex and dollar (described in the next section) in that they only ever match at the very start and end of the subject string, whatever options are set. Thus, they are independent of multiline mode. These three assertions are not affected by the PCRE2_NOTBOL or PCRE2_NOTEOL options, which affect only the behaviour of the circumflex and dollar metacharacters. However, if the startoffset argument of pcre2_match() is non-zero, indicating that matching is to start at a point other than the beginning of the subject, \A can never match. The difference between \Z and \z is that \Z matches before a newline at the end of the string as well as at the very end, whereas \z matches only at the end.

The \G assertion is true only when the current matching position is at the start point of the matching process, as specified by the startoffset argument of pcre2_match(). It differs from \A when the value of startoffset is non-zero. By calling pcre2_match() multiple times with appropriate arguments, you can mimic Perl's /g option, and it is in this kind of implementation where \G can be useful.

Note, however, that PCRE2's implementation of \G, being true at the starting character of the matching process, is subtly different from Perl's, which defines it as true at the end of the previous match. In Perl, these can be different when the previously matched string was empty. Because PCRE2 does just one match at a time, it cannot reproduce this behaviour.

If all the alternatives of a pattern begin with \G, the expression is anchored to the starting match position, and the "anchored" flag is set in the compiled regular expression.

CIRCUMFLEX AND DOLLAR

The circumflex and dollar metacharacters are zero-width assertions. That is, they test for a particular condition being true without consuming any characters from the subject string. These two metacharacters are concerned with matching the starts and ends of lines. If the newline convention is set so that only the two-character sequence CRLF is recognized as a newline, isolated CR and LF characters are treated as ordinary data characters, and are not recognized as newlines.

Outside a character class, in the default matching mode, the circumflex character is an assertion that is true only if the current matching point is at the start of the subject string. If the startoffset argument of pcre2_match() is non-zero, or if PCRE2_NOTBOL is set, circumflex can never match if the PCRE2_MULTILINE option is unset. Inside a character class, circumflex has an entirely different meaning (see below).

Circumflex need not be the first character of the pattern if a number of alternatives are involved, but it should be the first thing in each alternative in which it appears if the pattern is ever to match that branch. If all possible alternatives start with a circumflex, that is, if the pattern is constrained to match only at the start of the subject, it is said to be an "anchored" pattern. (There are also other constructs that can cause a pattern to be anchored.)

The dollar character is an assertion that is true only if the current matching point is at the end of the subject string, or immediately before a newline at the end of the string (by default), unless PCRE2_NOTEOL is set. Note, however, that it does not actually match the newline. Dollar need not be the last character of the pattern if a number of alternatives are involved, but it should be the last item in any branch in which it appears. Dollar has no special meaning in a character class.

The meaning of dollar can be changed so that it matches only at the very end of the string, by setting the PCRE2_DOLLAR_ENDONLY option at compile time. This does not affect the \Z assertion.

The meanings of the circumflex and dollar metacharacters are changed if the PCRE2_MULTILINE option is set. When this is the case, a dollar character matches before any newlines in the string, as well as at the very end, and a circumflex matches immediately after internal newlines as well as at the start of the subject string. It does not match after a newline that ends the string, for compatibility with Perl. However, this can be changed by setting the PCRE2_ALT_CIRCUMFLEX option.

For example, the pattern /^abc$/ matches the subject string "def\nabc" (where \n represents a newline) in multiline mode, but not otherwise. Consequently, patterns that are anchored in single line mode because all branches start with ^ are not anchored in multiline mode, and a match for circumflex is possible when the startoffset argument of pcre2_match() is non-zero. The PCRE2_DOLLAR_ENDONLY option is ignored if PCRE2_MULTILINE is set.

When the newline convention (see "Newline conventions" above) recognizes the two-character sequence CRLF as a newline, this is preferred, even if the single characters CR and LF are also recognized as newlines. For example, if the newline convention is "any", a multiline mode circumflex matches before "xyz" in the string "abc\r\nxyz" rather than after CR, even though CR on its own is a valid newline. (It also matches at the very start of the string, of course.)

Note that the sequences \A, \Z, and \z can be used to match the start and end of the subject in both modes, and if all branches of a pattern start with \A it is always anchored, whether or not PCRE2_MULTILINE is set.

FULL STOP (PERIOD, DOT) AND \N

Outside a character class, a dot in the pattern matches any one character in the subject string except (by default) a character that signifies the end of a line. One or more characters may be specified as line terminators (see "Newline conventions" above).

Dot never matches a single line-ending character. When the two-character sequence CRLF is the only line ending, dot does not match CR if it is immediately followed by LF, but otherwise it matches all characters (including isolated CRs and LFs). When ANYCRLF is selected for line endings, no occurrences of CR or LF match dot. When all Unicode line endings are being recognized, dot does not match CR or LF or any of the other line ending characters.

The behaviour of dot with regard to newlines can be changed. If the PCRE2_DOTALL option is set, a dot matches any one character, without exception. If the two-character sequence CRLF is present in the subject string, it takes two dots to match it.

The handling of dot is entirely independent of the handling of circumflex and dollar, the only relationship being that they both involve newlines. Dot has no special meaning in a character class.

The escape sequence \N when not followed by an opening brace behaves like a dot, except that it is not affected by the PCRE2_DOTALL option. In other words, it matches any character except one that signifies the end of a line.

When \N is followed by an opening brace it has a different meaning. See the section entitled "Non-printing characters" above for details. Perl also uses \N{name} to specify characters by Unicode name; PCRE2 does not support this.

MATCHING A SINGLE CODE UNIT

Outside a character class, the escape sequence \C matches any one code unit, whether or not a UTF mode is set. In the 8-bit library, one code unit is one byte; in the 16-bit library it is a 16-bit unit; in the 32-bit library it is a 32-bit unit. Unlike a dot, \C always matches line-ending characters. The feature is provided in Perl in order to match individual bytes in UTF-8 mode, but it is unclear how it can usefully be used.

Because \C breaks up characters into individual code units, matching one unit with \C in UTF-8 or UTF-16 mode means that the rest of the string may start with a malformed UTF character. This has undefined results, because PCRE2 assumes that it is matching character by character in a valid UTF string (by default it checks the subject string's validity at the start of processing unless the PCRE2_NO_UTF_CHECK or PCRE2_MATCH_INVALID_UTF option is used).

An application can lock out the use of \C by setting the PCRE2_NEVER_BACKSLASH_C option when compiling a pattern. It is also possible to build PCRE2 with the use of \C permanently disabled.

PCRE2 does not allow \C to appear in lookbehind assertions (described below) in UTF-8 or UTF-16 modes, because this would make it impossible to calculate the length of the lookbehind. Neither the alternative matching function pcre2_dfa_match() nor the JIT optimizer supports \C in these UTF modes. The former gives a match-time error; the latter fails to optimize and so the match is always run using the interpreter.

In the 32-bit library, however, \C is always supported (when not explicitly locked out) because it always matches a single code unit, whether or not UTF-32 is specified.

In general, the \C escape sequence is best avoided. However, one way of using it that avoids the problem of malformed UTF-8 or UTF-16 characters is to use a lookahead to check the length of the next character, as in this pattern, which could be used with a UTF-8 string (ignore white space and line breaks):

  (?| (?=[\x00-\x7f])(\C) |
      (?=[\x80-\x{7ff}])(\C)(\C) |
      (?=[\x{800}-\x{ffff}])(\C)(\C)(\C) |
      (?=[\x{10000}-\x{1fffff}])(\C)(\C)(\C)(\C))
In this example, a group that starts with (?| resets the capturing parentheses numbers in each alternative (see "Duplicate Group Numbers" below). The assertions at the start of each branch check the next UTF-8 character for values whose encoding uses 1, 2, 3, or 4 bytes, respectively. The character's individual bytes are then captured by the appropriate number of \C groups.

SQUARE BRACKETS AND CHARACTER CLASSES

An opening square bracket introduces a character class, terminated by a closing square bracket. A closing square bracket on its own is not special by default. If a closing square bracket is required as a member of the class, it should be the first data character in the class (after an initial circumflex, if present) or escaped with a backslash. This means that, by default, an empty class cannot be defined. However, if the PCRE2_ALLOW_EMPTY_CLASS option is set, a closing square bracket at the start does end the (empty) class.

A character class matches a single character in the subject. A matched character must be in the set of characters defined by the class, unless the first character in the class definition is a circumflex, in which case the subject character must not be in the set defined by the class. If a circumflex is actually required as a member of the class, ensure it is not the first character, or escape it with a backslash.

For example, the character class [aeiou] matches any lower case English vowel, whereas [^aeiou] matches all other characters. Note that a circumflex is just a convenient notation for specifying the characters that are in the class by enumerating those that are not. A class that starts with a circumflex is not an assertion; it still consumes a character from the subject string, and therefore it fails to match if the current pointer is at the end of the string.

Characters in a class may be specified by their code points using \o, \x, or \N{U+hh..} in the usual way. When caseless matching is set, any letters in a class represent both their upper case and lower case versions, so for example, a caseless [aeiou] matches "A" as well as "a", and a caseless [^aeiou] does not match "A", whereas a caseful version would. Note that there are two ASCII characters, K and S, that, in addition to their lower case ASCII equivalents, are case-equivalent with Unicode U+212A (Kelvin sign) and U+017F (long S) respectively when either PCRE2_UTF or PCRE2_UCP is set. If you do not want these ASCII/non-ASCII case equivalences, you can suppress them by setting PCRE2_EXTRA_CASELESS_RESTRICT, either as an option in a compile context, or by including (*CASELESS_RESTRICT) or (?r) within a pattern.

Characters that might indicate line breaks are never treated in any special way when matching character classes, whatever line-ending sequence is in use, and whatever setting of the PCRE2_DOTALL and PCRE2_MULTILINE options is used. A class such as [^a] always matches one of these characters.

The generic character type escape sequences \d, \D, \h, \H, \p, \P, \s, \S, \v, \V, \w, and \W may appear in a character class, and add the characters that they match to the class. For example, [\dABCDEF] matches any hexadecimal digit. In UTF modes, the PCRE2_UCP option affects the meanings of \d, \s, \w and their upper case partners, just as it does when they appear outside a character class, as described in the section entitled "Generic character types" above. The escape sequence \b has a different meaning inside a character class; it matches the backspace character. The sequences \B, \R, and \X are not special inside a character class. Like any other unrecognized escape sequences, they cause an error. The same is true for \N when not followed by an opening brace.

The minus (hyphen) character can be used to specify a range of characters in a character class. For example, [d-m] matches any letter between d and m, inclusive. If a minus character is required in a class, it must be escaped with a backslash or appear in a position where it cannot be interpreted as indicating a range, typically as the first or last character in the class, or immediately after a range. For example, [b-d-z] matches letters in the range b to d, a hyphen character, or z.

There is some special treatment for alphabetic ranges in EBCDIC environments; see the section "EBCDIC environments" below.

Perl treats a hyphen as a literal if it appears before or after a POSIX class (see below) or before or after a character type escape such as \d or \H. However, unless the hyphen is the last character in the class, Perl outputs a warning in its warning mode, as this is most likely a user error. As PCRE2 has no facility for warning, an error is given in these cases.

It is not possible to have the literal character "]" as the end character of a range. A pattern such as [W-]46] is interpreted as a class of two characters ("W" and "-") followed by a literal string "46]", so it would match "W46]" or "-46]". However, if the "]" is escaped with a backslash it is interpreted as the end of a range, so [W-\]46] is interpreted as a class containing a range and two other characters. The octal or hexadecimal representation of "]" can also be used to end a range.

Ranges normally include all code points between the start and end characters, inclusive. They can also be used for code points specified numerically, for example [\000-\037]. Ranges can include any characters that are valid for the current mode. In any UTF mode, the so-called "surrogate" characters (those whose code points lie between 0xd800 and 0xdfff inclusive) may not be specified explicitly by default (the PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES option disables this check). However, ranges such as [\x{d7ff}-\x{e000}], which include the surrogates, are always permitted.

If a range that includes letters is used when caseless matching is set, it matches the letters in either case. For example, [W-c] is equivalent to [][\\^_`wxyzabc], matched caselessly, and in a non-UTF mode, if character tables for a French locale are in use, [\xc8-\xcb] matches accented E characters in both cases.

A circumflex can conveniently be used with the upper case character types to specify a more restricted set of characters than the matching lower case type. For example, the class [^\W_] matches any letter or digit, but not underscore, whereas [\w] includes underscore. A positive character class should be read as "something OR something OR ..." and a negative class as "NOT something AND NOT something AND NOT ...".

The metacharacters that are recognized in character classes are backslash, hyphen (when it can be interpreted as specifying a range), circumflex (only at the start), and the terminating closing square bracket. An opening square bracket is also special when it can be interpreted as introducing a POSIX class (see "Posix character classes" below), or a special compatibility feature (see "Compatibility feature for word boundaries" below). Escaping any non-alphanumeric character in a class turns it into a literal, whether or not it would otherwise be a metacharacter.

PERL EXTENDED CHARACTER CLASSES

From release 10.45 PCRE2 supports Perl's (?[...]) extended character class syntax. This can be used to perform set operations such as intersection on character classes.

The syntax permitted within (?[...]) is quite different to ordinary character classes. Inside the extended class, there is an expression syntax consisting of "atoms", operators, and ordinary parentheses "()" used for grouping. Such classes always have the Perl /xx modifier (PCRE2 option PCRE2_EXTENDED_MORE) turned on within them. This means that literal space and tab characters are ignored everywhere in the class.

The allowed atoms are individual characters specified by escape sequences such as \n or \x{123}, character types such as \d, POSIX classes such as [:alpha:], and nested ordinary (non-extended) character classes. For example, in (?[\d & [...]]) the nested class [...] follows the usual rules for ordinary character classes, in which parentheses are not metacharacters, and character literals and ranges are permitted.

Character literals and ranges may not appear outside a nested ordinary character class because they are not atoms in the extended syntax. The extended syntax does not introduce any additional escape sequences, so (?[\y]) is an unknown escape, as it would be in [\y].

In the extended syntax, ^ does not negate a class (except within an ordinary class nested inside an extended class); it is instead a binary operator.

The binary operators are "&" (intersection), "|" or "+" (union), "-" (subtraction) and "^" (symmetric difference). These are left-associative and "&" has higher (tighter) precedence, while the others have equal lower precedence. The one prefix unary operator is "!" (complement), with highest precedence.

UTS#18 EXTENDED CHARACTER CLASSES

The PCRE2_ALT_EXTENDED_CLASS option enables an alternative to Perl's (?[...]) syntax, allowing instead extended class behaviour inside ordinary [...] character classes. This altered syntax for [...] classes is loosely described by the Unicode standard UTS#18. The PCRE2_ALT_EXTENDED_CLASS option does not prevent use of (?[...]) classes; it just changes the meaning of all [...] classes that are not nested inside a Perl (?[...]) class.

Firstly, in ordinary Perl [...] syntax, an expression such as "[a[]" is a character class with two literal characters "a" and "[", but in UTS#18 extended classes the "[" character becomes an additional metacharacter within classes, denoting the start of a nested class, so a literal "[" must be escaped as "\[".

Secondly, within the UTS#18 extended syntax, there are operators "||", "&&", "--" and "~~" which denote character class union, intersection, subtraction, and symmetric difference respectively. In standard Perl syntax, these would simply be needlessly-repeated literals (except for "--" which could be the start or end of a range). In UTS#18 extended classes these operators can be used in constructs such as [\p{L}--[QW]] for "Unicode letters, other than Q and W". A literal "-" at the start or end of a range must be escaped, so while "[--1]" in Perl syntax is the range from hyphen to "1", it must be escaped as "[\--1]" in UTS#18 extended classes.

Unlike Perl's (?[...]) extended classes, the PCRE2_EXTENDED_MORE option to ignore space and tab characters is not automatically enabled for UTS#18 extended classes, but it is honoured if set.

Extended UTS#18 classes can be nested, and nested classes are themselves extended classes (unlike Perl, where nested classes must be simple classes). For example, [\p{L}&&[\p{Thai}||\p{Greek}]] matches any letter that is in the Thai or Greek scripts. Note that this means that no special grouping characters (such as the parentheses used in Perl's (?[...]) class syntax) are needed.

Individual class items (literal characters, literal ranges, properties such as \d or \p{...}, and nested classes) can be combined by juxtaposition or by an operator. Juxtaposition is the implicit union operator, and binds more tightly than any explicit operator. Thus a sequence of literals and/or ranges behaves as if it is enclosed in square brackets. For example, [A-Z0-9&&[^E8]] is the same as [[A-Z0-9]&&[^E8]], which matches any upper case alphanumeric character except "E" or "8".

Precedence between the explicit operators is not defined, so mixing operators is a syntax error. For example, [A&&B--C] is an error, but [A&&[B--C]] is valid.

This is an emerging syntax which is being adopted gradually across the regex ecosystem: for example JavaScript adopted the "/v" flag in ECMAScript 2024; Python's "re" module reserves the syntax for future use with a FutureWarning for unescaped use of "[" as a literal within character classes. Due to UTS#18 providing insufficient guidance, engines interpret the syntax differently. Rust's "regex" crate and Python's "regex" PyPI module both implement UTS#18 extended classes, but with slight incompatibilities ([A||B&&C] is parsed as [A||[B&&C]] in Python's "regex" but as [[A||B]&&C] in Rust's "regex").

PCRE2's syntax adds syntax restrictions similar to ECMAScript's /v flag, so that all the UTS#18 extended classes accepted as valid by PCRE2 have the property that they are interpreted either with the same behaviour, or as invalid, by all other major engines. Please file an issue if you are aware of cross-engine differences in behaviour between PCRE2 and another major engine.

POSIX CHARACTER CLASSES

Perl supports the POSIX notation for character classes. This uses names enclosed by [: and :] within the enclosing square brackets. PCRE2 also supports this notation, in both ordinary and extended classes. For example,

  [01[:alpha:]%]
matches "0", "1", any alphabetic character, or "%". The supported class names are:
  alnum    letters and digits
  alpha    letters
  ascii    character codes 0 - 127
  blank    space or tab only
  cntrl    control characters
  digit    decimal digits (same as \d)
  graph    printing characters, excluding space
  lower    lower case letters
  print    printing characters, including space
  punct    printing characters, excluding letters and digits and space
  space    white space (the same as \s from PCRE 8.34)
  upper    upper case letters
  word     "word" characters (same as \w)
  xdigit   hexadecimal digits
The default "space" characters are HT (9), LF (10), VT (11), FF (12), CR (13), and space (32). If locale-specific matching is taking place, the list of space characters may be different; there may be fewer or more of them. "Space" and \s match the same set of characters, as do "word" and \w.

The name "word" is a Perl extension, and "blank" is a GNU extension from Perl 5.8. Another Perl extension is negation, which is indicated by a ^ character after the colon. For example,

  [12[:^digit:]]
matches "1", "2", or any non-digit. PCRE2 (and Perl) also recognize the POSIX syntax [.ch.] and [=ch=] where "ch" is a "collating element", but these are not supported, and an error is given if they are encountered.

By default, characters with values greater than 127 do not match any of the POSIX character classes, although this may be different for characters in the range 128-255 when locale-specific matching is happening. However, in UCP mode, unless certain options are set (see below), some of the classes are changed so that Unicode character properties are used. This is achieved by replacing POSIX classes with other sequences, as follows:

  [:alnum:]  becomes  \p{Xan}
  [:alpha:]  becomes  \p{L}
  [:blank:]  becomes  \h
  [:cntrl:]  becomes  \p{Cc}
  [:digit:]  becomes  \p{Nd}
  [:lower:]  becomes  \p{Ll}
  [:space:]  becomes  \p{Xps}
  [:upper:]  becomes  \p{Lu}
  [:word:]   becomes  \p{Xwd}
Negated versions, such as [:^alpha:], use \P instead of \p. Four other POSIX classes are handled specially in UCP mode:

[:graph:] This matches characters that have glyphs that mark the page when printed. In Unicode property terms, it matches all characters with the L, M, N, P, S, or Cf properties, except for:

  U+061C           Arabic Letter Mark
  U+180E           Mongolian Vowel Separator
  U+2066 - U+2069  Various "isolate"s

[:print:] This matches the same characters as [:graph:] plus space characters that are not controls, that is, characters with the Zs property.

[:punct:] This matches all characters that have the Unicode P (punctuation) property, plus those characters with code points less than 256 that have the S (Symbol) property.

[:xdigit:] In addition to the ASCII hexadecimal digits, this also matches the "fullwidth" versions of those characters, whose Unicode code points start at U+FF10. This is a change that was made in PCRE2 release 10.43 for Perl compatibility.

The other POSIX classes are unchanged by PCRE2_UCP, and match only characters with code points less than 256.

There are two options that can be used to restrict the POSIX classes to ASCII characters when PCRE2_UCP is set. The option PCRE2_EXTRA_ASCII_DIGIT affects just [:digit:] and [:xdigit:]. Within a pattern, this can be set and unset by (?aT) and (?-aT). The PCRE2_EXTRA_ASCII_POSIX option disables UCP processing for all POSIX classes, including [:digit:] and [:xdigit:]. Within a pattern, (?aP) and (?-aP) set and unset both these options for consistency.

COMPATIBILITY FEATURE FOR WORD BOUNDARIES

In the POSIX.2 compliant library that was included in 4.4BSD Unix, the ugly syntax [[:<:]] and [[:>:]] is used for matching "start of word" and "end of word". PCRE2 treats these items as follows:

  [[:<:]]  is converted to  \b(?=\w)
  [[:>:]]  is converted to  \b(?<=\w)
Only these exact character sequences are recognized. A sequence such as [a[:<:]b] provokes an error for an unrecognized POSIX class name. This support is not compatible with Perl. It is provided to help migrations from other environments, and is best not used in any new patterns. Note that \b matches at the start and the end of a word (see "Simple assertions" above), and in a Perl-style pattern the preceding or following character normally shows which is wanted, without the need for the assertions that are used above in order to give exactly the POSIX behaviour. Note also that the PCRE2_UCP option changes the meaning of \w (and therefore \b) by default, so it also affects these POSIX sequences.

VERTICAL BAR

Vertical bar characters are used to separate alternative patterns. For example, the pattern

  gilbert|sullivan
matches either "gilbert" or "sullivan". Any number of alternatives may appear, and an empty alternative is permitted (matching the empty string). The matching process tries each alternative in turn, from left to right, and the first one that succeeds is used. If the alternatives are within a group (defined below), "succeeds" means matching the rest of the main pattern as well as the alternative in the group.

INTERNAL OPTION SETTING

The settings of several options can be changed within a pattern by a sequence of letters enclosed between "(?" and ")". The following are Perl-compatible, and are described in detail in the pcre2api documentation. The option letters are:

  i  for PCRE2_CASELESS
  m  for PCRE2_MULTILINE
  n  for PCRE2_NO_AUTO_CAPTURE
  s  for PCRE2_DOTALL
  x  for PCRE2_EXTENDED
  xx for PCRE2_EXTENDED_MORE
For example, (?im) sets caseless, multiline matching. It is also possible to unset these options by preceding the relevant letters with a hyphen, for example (?-im). The two "extended" options are not independent; unsetting either one cancels the effects of both of them.

A combined setting and unsetting such as (?im-sx), which sets PCRE2_CASELESS and PCRE2_MULTILINE while unsetting PCRE2_DOTALL and PCRE2_EXTENDED, is also permitted. Only one hyphen may appear in the options string. If a letter appears both before and after the hyphen, the option is unset. An empty options setting "(?)" is allowed. Needless to say, it has no effect.

If the first character following (? is a circumflex, it causes all of the above options to be unset. Letters may follow the circumflex to cause some options to be re-instated, but a hyphen may not appear.

Some PCRE2-specific options can be changed by the same mechanism using these pairs or individual letters:

  aD for PCRE2_EXTRA_ASCII_BSD
  aS for PCRE2_EXTRA_ASCII_BSS
  aW for PCRE2_EXTRA_ASCII_BSW
  aP for PCRE2_EXTRA_ASCII_POSIX and PCRE2_EXTRA_ASCII_DIGIT
  aT for PCRE2_EXTRA_ASCII_DIGIT
  r  for PCRE2_EXTRA_CASELESS_RESTRICT
  J  for PCRE2_DUPNAMES
  U  for PCRE2_UNGREEDY
However, except for 'r', these are not unset by (?^), which is equivalent to (?-imnrsx). If 'a' is not followed by any of the upper case letters shown above, it sets (or unsets) all the ASCII options.

PCRE2_EXTRA_ASCII_DIGIT has no additional effect when PCRE2_EXTRA_ASCII_POSIX is set, but including it in (?aP) means that (?-aP) suppresses all ASCII restrictions for POSIX classes.

When one of these option changes occurs at top level (that is, not inside group parentheses), the change applies until a subsequent change, or the end of the pattern. An option change within a group (see below for a description of groups) affects only that part of the group that follows it. At the end of the group these options are reset to the state they were before the group. For example,

  (a(?i)b)c
matches abc and aBc and no other strings (assuming PCRE2_CASELESS is not set externally). Any changes made in one alternative do carry on into subsequent branches within the same group. For example,
  (a(?i)b|c)
matches "ab", "aB", "c", and "C", even though when matching "C" the first branch is abandoned before the option setting. This is because the effects of option settings happen at compile time. There would be some very weird behaviour otherwise.

As a convenient shorthand, if any option settings are required at the start of a non-capturing group (see the next section), the option letters may appear between the "?" and the ":". Thus the two patterns

  (?i:saturday|sunday)
  (?:(?i)saturday|sunday)
match exactly the same set of strings.

Note: There are other PCRE2-specific options, applying to the whole pattern, which can be set by the application when the compiling function is called. In addition, the pattern can contain special leading sequences such as (*CRLF) to override what the application has set or what has been defaulted. Details are given in the section entitled "Newline sequences" above. There are also the (*UTF) and (*UCP) leading sequences that can be used to set UTF and Unicode property modes; they are equivalent to setting the PCRE2_UTF and PCRE2_UCP options, respectively. However, the application can set the PCRE2_NEVER_UTF or PCRE2_NEVER_UCP options, which lock out the use of the (*UTF) and (*UCP) sequences.

GROUPS

Groups are delimited by parentheses (round brackets), which can be nested. Turning part of a pattern into a group does two things:

1. It localizes a set of alternatives. For example, the pattern

  cat(aract|erpillar|)
matches "cataract", "caterpillar", or "cat". Without the parentheses, it would match "cataract", "erpillar" or an empty string.

2. It creates a "capture group". This means that, when the whole pattern matches, the portion of the subject string that matched the group is passed back to the caller, separately from the portion that matched the whole pattern. (This applies only to the traditional matching function; the DFA matching function does not support capturing.)

Opening parentheses are counted from left to right (starting from 1) to obtain numbers for capture groups. For example, if the string "the red king" is matched against the pattern

  the ((red|white) (king|queen))
the captured substrings are "red king", "red", and "king", and are numbered 1, 2, and 3, respectively.

The fact that plain parentheses fulfil two functions is not always helpful. There are often times when grouping is required without capturing. If an opening parenthesis is followed by a question mark and a colon, the group does not do any capturing, and is not counted when computing the number of any subsequent capture groups. For example, if the string "the white queen" is matched against the pattern

  the ((?:red|white) (king|queen))
the captured substrings are "white queen" and "queen", and are numbered 1 and 2. The maximum number of capture groups is 65535.

As a convenient shorthand, if any option settings are required at the start of a non-capturing group, the option letters may appear between the "?" and the ":". Thus the two patterns

  (?i:saturday|sunday)
  (?:(?i)saturday|sunday)
match exactly the same set of strings. Because alternative branches are tried from left to right, and options are not reset until the end of the group is reached, an option setting in one branch does affect subsequent branches, so the above patterns match "SUNDAY" as well as "Saturday".

DUPLICATE GROUP NUMBERS

Perl 5.10 introduced a feature whereby each alternative in a group uses the same numbers for its capturing parentheses. Such a group starts with (?| and is itself a non-capturing group. For example, consider this pattern:

  (?|(Sat)ur|(Sun))day
Because the two alternatives are inside a (?| group, both sets of capturing parentheses are numbered one. Thus, when the pattern matches, you can look at captured substring number one, whichever alternative matched. This construct is useful when you want to capture part, but not all, of one of a number of alternatives. Inside a (?| group, parentheses are numbered as usual, but the number is reset at the start of each branch. The numbers of any capturing parentheses that follow the whole group start after the highest number used in any branch. The following example is taken from the Perl documentation. The numbers underneath show in which buffer the captured content will be stored.
  # before  ---------------branch-reset----------- after
  / ( a )  (?| x ( y ) z | (p (q) r) | (t) u (v) ) ( z ) /x
  # 1            2         2  3        2     3     4
A backreference to a capture group uses the most recent value that is set for the group. The following pattern matches "abcabc" or "defdef":
  /(?|(abc)|(def))\1/
In contrast, a subroutine call to a capture group always refers to the first one in the pattern with the given number. The following pattern matches "abcabc" or "defabc":
  /(?|(abc)|(def))(?1)/
A relative reference such as (?-1) is no different: it is just a convenient way of computing an absolute group number.

If a condition test for a group's having matched refers to a non-unique number, the test is true if any group with that number has matched.

An alternative approach to using this "branch reset" feature is to use duplicate named groups, as described in the next section.

NAMED CAPTURE GROUPS

Identifying capture groups by number is simple, but it can be very hard to keep track of the numbers in complicated patterns. Furthermore, if an expression is modified, the numbers may change. To help with this difficulty, PCRE2 supports the naming of capture groups. This feature was not added to Perl until release 5.10. Python had the feature earlier, and PCRE1 introduced it at release 4.0, using the Python syntax. PCRE2 supports both the Perl and the Python syntax.

In PCRE2, a capture group can be named in one of three ways: (?<name>...) or (?'name'...) as in Perl, or (?P<name>...) as in Python. Names may be up to 128 code units long. When PCRE2_UTF is not set, they may contain only ASCII alphanumeric characters and underscores, but must start with a non-digit. When PCRE2_UTF is set, the syntax of group names is extended to allow any Unicode letter or Unicode decimal digit. In other words, group names must match one of these patterns:

  ^[_A-Za-z][_A-Za-z0-9]*\z   when PCRE2_UTF is not set
  ^[_\p{L}][_\p{L}\p{Nd}]*\z  when PCRE2_UTF is set
References to capture groups from other parts of the pattern, such as backreferences, recursion, and conditions, can all be made by name as well as by number.

Named capture groups are allocated numbers as well as names, exactly as if the names were not present. In both PCRE2 and Perl, capture groups are primarily identified by numbers; any names are just aliases for these numbers. The PCRE2 API provides function calls for extracting the complete name-to-number translation table from a compiled pattern, as well as convenience functions for extracting captured substrings by name.

Warning: When more than one capture group has the same number, as described in the previous section, a name given to one of them applies to all of them. Perl allows identically numbered groups to have different names. Consider this pattern, where there are two capture groups, both numbered 1:

  (?|(?<AA>aa)|(?<BB>bb))
Perl allows this, with both names AA and BB as aliases of group 1. Thus, after a successful match, both names yield the same value (either "aa" or "bb").

In an attempt to reduce confusion, PCRE2 does not allow the same group number to be associated with more than one name. The example above provokes a compile-time error. However, there is still scope for confusion. Consider this pattern:

  (?|(?<AA>aa)|(bb))
Although the second group number 1 is not explicitly named, the name AA is still an alias for any group 1. Whether the pattern matches "aa" or "bb", a reference by name to group AA yields the matched string.

By default, a name must be unique within a pattern, except that duplicate names are permitted for groups with the same number, for example:

  (?|(?<AA>aa)|(?<AA>bb))
The duplicate name constraint can be disabled by setting the PCRE2_DUPNAMES option at compile time, or by the use of (?J) within the pattern, as described in the section entitled "Internal Option Setting" above.

Duplicate names can be useful for patterns where only one instance of the named capture group can match. Suppose you want to match the name of a weekday, either as a 3-letter abbreviation or as the full name, and in both cases you want to extract the abbreviation. This pattern (ignoring the line breaks) does the job:

  (?J)
  (?<DN>Mon|Fri|Sun)(?:day)?|
  (?<DN>Tue)(?:sday)?|
  (?<DN>Wed)(?:nesday)?|
  (?<DN>Thu)(?:rsday)?|
  (?<DN>Sat)(?:urday)?
There are five capture groups, but only one is ever set after a match. The convenience functions for extracting the data by name return the substring for the first (and in this example, the only) group of that name that matched. This saves searching to find which numbered group it was. (An alternative way of solving this problem is to use a "branch reset" group, as described in the previous section.)

If you make a backreference to a non-unique named group from elsewhere in the pattern, the groups to which the name refers are checked in the order in which they appear in the overall pattern. The first one that is set is used for the reference. For example, this pattern matches both "foofoo" and "barbar" but not "foobar" or "barfoo":

  (?J)(?:(?<n>foo)|(?<n>bar))\k<n>

If you make a subroutine call to a non-unique named group, the one that corresponds to the first occurrence of the name is used. In the absence of duplicate numbers this is the one with the lowest number.

If you use a named reference in a condition test (see the section about conditions below), either to check whether a capture group has matched, or to check for recursion, all groups with the same name are tested. If the condition is true for any one of them, the overall condition is true. This is the same behaviour as testing by number. For further details of the interfaces for handling named capture groups, see the pcre2api documentation.

REPETITION

Repetition is specified by quantifiers, which may follow any one of these items:

  a literal data character
  the dot metacharacter
  the \C escape sequence
  the \R escape sequence
  the \X escape sequence
  any escape sequence that matches a single character
  a character class
  a backreference
  a parenthesized group (including lookaround assertions)
  a subroutine call (recursive or otherwise)
If a quantifier does not follow a repeatable item, an error occurs. The general repetition quantifier specifies a minimum and maximum number of permitted matches by giving two numbers in curly brackets (braces), separated by a comma. The numbers must be less than 65536, and the first must be less than or equal to the second. For example,
  z{2,4}
matches "zz", "zzz", or "zzzz". A closing brace on its own is not a special character. If the second number is omitted, but the comma is present, there is no upper limit; if the second number and the comma are both omitted, the quantifier specifies an exact number of required matches. Thus
  [aeiou]{3,}
matches at least 3 successive vowels, but may match many more, whereas
  \d{8}
matches exactly 8 digits. If the first number is omitted, the lower limit is taken as zero; in this case the upper limit must be present.
  X{,4} is interpreted as X{0,4}
This is a change in behaviour that happened in Perl 5.34.0 and PCRE2 10.43. In earlier versions such a sequence was not interpreted as a quantifier. Other regular expression engines may behave either way.

If the characters that follow an opening brace do not match the syntax of a quantifier, the brace is taken as a literal character. In particular, this means that {,} is a literal string of three characters.

Note that not every opening brace is potentially the start of a quantifier because braces are used in other items such as \N{U+345} or \k{name}.

In UTF modes, quantifiers apply to characters rather than to individual code units. Thus, for example, \x{100}{2} matches two characters, each of which is represented by a two-byte sequence in a UTF-8 string. Similarly, \X{3} matches three Unicode extended grapheme clusters, each of which may be several code units long (and they may be of different lengths).

The quantifier {0} is permitted, causing the expression to behave as if the previous item and the quantifier were not present. This may be useful for capture groups that are referenced as subroutines from elsewhere in the pattern (but see also the section entitled "Defining capture groups for use by reference only" below). Except for parenthesized groups, items that have a {0} quantifier are omitted from the compiled pattern.

For convenience, the three most common quantifiers have single-character abbreviations:

  *    is equivalent to {0,}
  +    is equivalent to {1,}
  ?    is equivalent to {0,1}
It is possible to construct infinite loops by following a group that can match no characters with a quantifier that has no upper limit, for example:
  (a?)*
Earlier versions of Perl and PCRE1 used to give an error at compile time for such patterns. However, because there are cases where this can be useful, such patterns are now accepted, but whenever an iteration of such a group matches no characters, matching moves on to the next item in the pattern instead of repeatedly matching an empty string. This does not prevent backtracking into any of the iterations if a subsequent item fails to match.

By default, quantifiers are "greedy", that is, they match as much as possible (up to the maximum number of permitted repetitions), without causing the rest of the pattern to fail. The classic example of where this gives problems is in trying to match comments in C programs. These appear between /* and */ and within the comment, individual * and / characters may appear. An attempt to match C comments by applying the pattern

  /\*.*\*/
to the string
  /* first comment */  not comment  /* second comment */
fails, because it matches the entire string owing to the greediness of the .* item. However, if a quantifier is followed by a question mark, it ceases to be greedy, and instead matches the minimum number of times possible, so the pattern
  /\*.*?\*/
does the right thing with C comments. The meaning of the various quantifiers is not otherwise changed, just the preferred number of matches. Do not confuse this use of question mark with its use as a quantifier in its own right. Because it has two uses, it can sometimes appear doubled, as in
  \d??\d
which matches one digit by preference, but can match two if that is the only way the rest of the pattern matches.

If the PCRE2_UNGREEDY option is set (an option that is not available in Perl), the quantifiers are not greedy by default, but individual ones can be made greedy by following them with a question mark. In other words, it inverts the default behaviour.

When a parenthesized group is quantified with a minimum repeat count that is greater than 1 or with a limited maximum, more memory is required for the compiled pattern, in proportion to the size of the minimum or maximum.

If a pattern starts with .* or .{0,} and the PCRE2_DOTALL option (equivalent to Perl's /s) is set, thus allowing the dot to match newlines, the pattern is implicitly anchored, because whatever follows will be tried against every character position in the subject string, so there is no point in retrying the overall match at any position after the first. PCRE2 normally treats such a pattern as though it were preceded by \A.

In cases where it is known that the subject string contains no newlines, it is worth setting PCRE2_DOTALL in order to obtain this optimization, or alternatively, using ^ to indicate anchoring explicitly.

However, there are some cases where the optimization cannot be used. When .* is inside capturing parentheses that are the subject of a backreference elsewhere in the pattern, a match at the start may fail where a later one succeeds. Consider, for example:

  (.*)abc\1
If the subject is "xyz123abc123" the match point is the fourth character. For this reason, such a pattern is not implicitly anchored.

Another case where implicit anchoring is not applied is when the leading .* is inside an atomic group. Once again, a match at the start may fail where a later one succeeds. Consider this pattern:

  (?>.*?a)b
It matches "ab" in the subject "aab". The use of the backtracking control verbs (*PRUNE) and (*SKIP) also disables this optimization. To disable it explicitly, either pass the compile option PCRE2_NO_DOTSTAR_ANCHOR, or call pcre2_set_optimize() with a PCRE2_DOTSTAR_ANCHOR_OFF directive.

When a capture group is repeated, the value captured is the substring that matched the final iteration. For example, after

  (tweedle[dume]{3}\s*)+
has matched "tweedledum tweedledee" the value of the captured substring is "tweedledee". However, if there are nested capture groups, the corresponding captured values may have been set in previous iterations. For example, after
  (a|(b))+
matches "aba" the value of the second captured substring is "b".

ATOMIC GROUPING AND POSSESSIVE QUANTIFIERS

With both maximizing ("greedy") and minimizing ("ungreedy" or "lazy") repetition, failure of what follows normally causes the repeated item to be re-evaluated to see if a different number of repeats allows the rest of the pattern to match. Sometimes it is useful to prevent this, either to change the nature of the match, or to cause it to fail earlier than it otherwise might, when the author of the pattern knows there is no point in carrying on.

Consider, for example, the pattern \d+foo when applied to the subject line

  123456bar
After matching all 6 digits and then failing to match "foo", the normal action of the matcher is to try again with only 5 digits matching the \d+ item, and then with 4, and so on, before ultimately failing. "Atomic grouping" (a term taken from Jeffrey Friedl's book) provides the means for specifying that once a group has matched, it is not to be re-evaluated in this way.

If we use atomic grouping for the previous example, the matcher gives up immediately on failing to match "foo" the first time. The notation is a kind of special parenthesis, starting with (?> as in this example:

  (?>\d+)foo
Perl 5.28 introduced an experimental alphabetic form starting with (* which may be easier to remember:
  (*atomic:\d+)foo
This kind of parenthesized group "locks up" the part of the pattern it contains once it has matched, and a failure further into the pattern is prevented from backtracking into it. Backtracking past it to previous items, however, works as normal.

An alternative description is that a group of this type matches exactly the string of characters that an identical standalone pattern would match, if anchored at the current point in the subject string.

Atomic groups are not capture groups. Simple cases such as the above example can be thought of as a maximizing repeat that must swallow everything it can. So, while both \d+ and \d+? are prepared to adjust the number of digits they match in order to make the rest of the pattern match, (?>\d+) can only match an entire sequence of digits.

Atomic groups in general can of course contain arbitrarily complicated expressions, and can be nested. However, when the contents of an atomic group is just a single repeated item, as in the example above, a simpler notation, called a "possessive quantifier" can be used. This consists of an additional + character following a quantifier. Using this notation, the previous example can be rewritten as

  \d++foo
Note that a possessive quantifier can be used with an entire group, for example:
  (abc|xyz){2,3}+
Possessive quantifiers are always greedy; the setting of the PCRE2_UNGREEDY option is ignored. They are a convenient notation for the simpler forms of atomic group. However, there is no difference in the meaning of a possessive quantifier and the equivalent atomic group, though there may be a performance difference; possessive quantifiers should be slightly faster.

The possessive quantifier syntax is an extension to the Perl 5.8 syntax. Jeffrey Friedl originated the idea (and the name) in the first edition of his book. Mike McCloskey liked it, so implemented it when he built Sun's Java package, and PCRE1 copied it from there. It found its way into Perl at release 5.10.

PCRE2 has an optimization that automatically "possessifies" certain simple pattern constructs. For example, the sequence A+B is treated as A++B because there is no point in backtracking into a sequence of A's when B must follow. This feature can be disabled by the PCRE2_NO_AUTO_POSSESS option, by calling pcre2_set_optimize() with a PCRE2_AUTO_POSSESS_OFF directive, or by starting the pattern with (*NO_AUTO_POSSESS).

When a pattern contains an unlimited repeat inside a group that can itself be repeated an unlimited number of times, the use of an atomic group is the only way to avoid some failing matches taking a very long time indeed. The pattern

  (\D+|<\d+>)*[!?]
matches an unlimited number of substrings that either consist of non-digits, or digits enclosed in <>, followed by either ! or ?. When it matches, it runs quickly. However, if it is applied to
  aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
it takes a long time before reporting failure. This is because the string can be divided between the internal \D+ repeat and the external * repeat in a large number of ways, and all have to be tried. (The example uses [!?] rather than a single character at the end, because both PCRE2 and Perl have an optimization that allows for fast failure when a single character is used. They remember the last single character that is required for a match, and fail early if it is not present in the string.) If the pattern is changed so that it uses an atomic group, like this:
  ((?>\D+)|<\d+>)*[!?]
sequences of non-digits cannot be broken, and failure happens quickly.

BACKREFERENCES

Outside a character class, a backslash followed by a digit greater than 0 (and possibly further digits) is a backreference to a capture group earlier (that is, to its left) in the pattern, provided there have been that many previous capture groups.

However, if the decimal number following the backslash is less than 8, it is always taken as a backreference, and causes an error only if there are not that many capture groups in the entire pattern. In other words, the group that is referenced need not be to the left of the reference for numbers less than 8. A "forward backreference" of this type can make sense when a repetition is involved and the group to the right has participated in an earlier iteration.

It is not possible to have a numerical "forward backreference" to a group whose number is 8 or more using this syntax because a sequence such as \50 is interpreted as a character defined in octal. See the subsection entitled "Non-printing characters" above for further details of the handling of digits following a backslash. Other forms of backreferencing do not suffer from this restriction. In particular, there is no problem when named capture groups are used (see below).

Another way of avoiding the ambiguity inherent in the use of digits following a backslash is to use the \g escape sequence. This escape must be followed by a signed or unsigned number, optionally enclosed in braces. These examples are all identical:

  (ring), \1
  (ring), \g1
  (ring), \g{1}
An unsigned number specifies an absolute reference without the ambiguity that is present in the older syntax. It is also useful when literal digits follow the reference. A signed number is a relative reference. Consider this example:
  (abc(def)ghi)\g{-1}
The sequence \g{-1} is a reference to the capture group whose number is one less than the number of the next group to be started, so in this example (where the next group would be numbered 3) it is equivalent to \2, and \g{-2} would be equivalent to \1. Note that if this construct is inside a capture group, that group is included in the count, so in this example \g{-2} also refers to group 1:
  (A)(\g{-2}B)
The use of relative references can be helpful in long patterns, and also in patterns that are created by joining together fragments that contain references within themselves.

The sequence \g{+1} is a reference to the next capture group that is started after this item, and \g{+2} refers to the one after that, and so on. This kind of forward reference can be useful in patterns that repeat. Perl does not support the use of + in this way.

A backreference matches whatever actually most recently matched the capture group in the current subject string, rather than anything at all that matches the group (see "Groups as subroutines" below for a way of doing that). So the pattern

  (sens|respons)e and \1ibility
matches "sense and sensibility" and "response and responsibility", but not "sense and responsibility". If caseful matching is in force at the time of the backreference, the case of letters is relevant. For example,
  ((?i)rah)\s+\1
matches "rah rah" and "RAH RAH", but not "RAH rah", even though the original capture group is matched caselessly.

There are several different ways of writing backreferences to named capture groups. The .NET syntax is \k{name}, the Python syntax is (?P=name), and the original Perl syntax is \k<name> or \k'name'. All of these are now supported by both Perl and PCRE2. Perl 5.10's unified backreference syntax, in which \g can be used for both numeric and named references, is also supported by PCRE2. We could rewrite the above example in any of the following ways:

  (?<p1>(?i)rah)\s+\k<p1>
  (?'p1'(?i)rah)\s+\k{p1}
  (?P<p1>(?i)rah)\s+(?P=p1)
  (?<p1>(?i)rah)\s+\g{p1}
A capture group that is referenced by name may appear in the pattern before or after the reference.

There may be more than one backreference to the same group. If a group has not actually been used in a particular match, backreferences to it always fail by default. For example, the pattern

  (a|(bc))\2
always fails if it starts to match "a" rather than "bc". However, if the PCRE2_MATCH_UNSET_BACKREF option is set at compile time, a backreference to an unset value matches an empty string.

Because there may be many capture groups in a pattern, all digits following a backslash are taken as part of a potential backreference number. If the pattern continues with a digit character, some delimiter must be used to terminate the backreference. If the PCRE2_EXTENDED or PCRE2_EXTENDED_MORE option is set, this can be white space. Otherwise, the \g{} syntax or an empty comment (see "Comments" below) can be used.

Recursive backreferences

A backreference that occurs inside the group to which it refers fails when the group is first used, so, for example, (a\1) never matches. However, such references can be useful inside repeated groups. For example, the pattern

  (a|b\1)+
matches any number of "a"s and also "aba", "ababbaa" etc. At each iteration of the group, the backreference matches the character string corresponding to the previous iteration. In order for this to work, the pattern must be such that the first iteration does not need to match the backreference. This can be done using alternation, as in the example above, or by a quantifier with a minimum of zero.

For versions of PCRE2 less than 10.25, backreferences of this type used to cause the group that they reference to be treated as an atomic group. This restriction no longer applies, and backtracking into such groups can occur as normal.

ASSERTIONS

An assertion is a test that does not consume any characters. The test must succeed for the match to continue. The simple assertions coded as \b, \B, \A, \G, \Z, \z, ^ and $ are described above.

More complicated assertions are coded as parenthesized groups. If matching such a group succeeds, matching continues after it, but with the matching position in the subject string reset to what it was before the assertion was processed.

A special kind of assertion, called a "scan substring" assertion, matches a subpattern against a previously captured substring. This is described in the section entitled "Scan substring assertions" below. It is a PCRE2 extension, not compatible with Perl.

The other group-based assertions are of two kinds: those that look ahead of the current position in the subject string, and those that look behind it, and in each case an assertion may be positive (must match for the assertion to be true) or negative (must not match for the assertion to be true).

The Perl-compatible lookaround assertions are atomic. If an assertion is true, but there is a subsequent matching failure, there is no backtracking into the assertion. However, there are some cases where non-atomic assertions can be useful. PCRE2 has some support for these, described in the section entitled "Non-atomic assertions" below, but they are not Perl-compatible.

A lookaround assertion may appear as the condition in a conditional group (see below). In this case, the result of matching the assertion determines which branch of the condition is followed.

Assertion groups are not capture groups. If an assertion contains capture groups within it, these are counted for the purposes of numbering the capture groups in the whole pattern. Within each branch of an assertion, locally captured substrings may be referenced in the usual way. For example, a sequence such as (.)\g{-1} can be used to check that two adjacent characters are the same.

When a branch within an assertion fails to match, any substrings that were captured are discarded (as happens with any pattern branch that fails to match). A negative assertion is true only when all its branches fail to match; this means that no captured substrings are ever retained after a successful negative assertion. When an assertion contains a matching branch, what happens depends on the type of assertion.

For a positive assertion, internally captured substrings in the successful branch are retained, and matching continues with the next pattern item after the assertion. For a negative assertion, a matching branch means that the assertion is not true. If such an assertion is being used as a condition in a conditional group (see below), captured substrings are retained, because matching continues with the "no" branch of the condition. For other failing negative assertions, control passes to the previous backtracking point, thus discarding any captured strings within the assertion.

Most assertion groups may be repeated; though it makes no sense to assert the same thing several times, the side effect of capturing in positive assertions may occasionally be useful. However, an assertion that forms the condition for a conditional group may not be quantified. PCRE2 used to restrict the repetition of assertions, but from release 10.35 the only restriction is that an unlimited maximum repetition is changed to be one more than the minimum. For example, {3,} is treated as {3,4}.

Alphabetic assertion names

Traditionally, symbolic sequences such as (?= and (?<= have been used to specify lookaround assertions. Perl 5.28 introduced some experimental alphabetic alternatives which might be easier to remember. They all start with (* instead of (? and must be written using lower case letters. PCRE2 supports the following synonyms:

  (*positive_lookahead:  or (*pla: is the same as (?=
  (*negative_lookahead:  or (*nla: is the same as (?!
  (*positive_lookbehind: or (*plb: is the same as (?<=
  (*negative_lookbehind: or (*nlb: is the same as (?<!
For example, (*pla:foo) is the same assertion as (?=foo). In the following sections, the various assertions are described using the original symbolic forms.

Lookahead assertions

Lookahead assertions start with (?= for positive assertions and (?! for negative assertions. For example,

  \w+(?=;)
matches a word followed by a semicolon, but does not include the semicolon in the match, and
  foo(?!bar)
matches any occurrence of "foo" that is not followed by "bar". Note that the apparently similar pattern
  (?!foo)bar
does not find an occurrence of "bar" that is preceded by something other than "foo"; it finds any occurrence of "bar" whatsoever, because the assertion (?!foo) is always true when the next three characters are "bar". A lookbehind assertion is needed to achieve the other effect.

If you want to force a matching failure at some point in a pattern, the most convenient way to do it is with (?!) because an empty string always matches, so an assertion that requires there not to be an empty string must always fail. The backtracking control verb (*FAIL) or (*F) is a synonym for (?!).

Lookbehind assertions

Lookbehind assertions start with (?<= for positive assertions and (?<! for negative assertions. For example,

  (?<!foo)bar
does find an occurrence of "bar" that is not preceded by "foo". The contents of a lookbehind assertion are restricted such that there must be a known maximum to the lengths of all the strings it matches. There are two cases:

If every top-level alternative matches a fixed length, for example

  (?<=colour|color)
there is a limit of 65535 characters to the lengths, which do not have to be the same, as this example demonstrates. This is the only kind of lookbehind supported by PCRE2 versions earlier than 10.43 and by the alternative matching function pcre2_dfa_match().

In PCRE2 10.43 and later, pcre2_match() supports lookbehind assertions in which one or more top-level alternatives can match more than one string length, for example

  (?<=colou?r)
The maximum matching length for any branch of the lookbehind is limited to a value set by the calling program (default 255 characters). Unlimited repetition (for example \d*) is not supported. In some cases, the escape sequence \K (see above) can be used instead of a lookbehind assertion at the start of a pattern to get round the length limit restriction.

In UTF-8 and UTF-16 modes, PCRE2 does not allow the \C escape (which matches a single code unit even in a UTF mode) to appear in lookbehind assertions, because it makes it impossible to calculate the length of the lookbehind. The \X and \R escapes, which can match different numbers of code units, are never permitted in lookbehinds.

"Subroutine" calls (see below) such as (?2) or (?&X) are permitted in lookbehinds, as long as the called capture group matches a limited-length string. However, recursion, that is, a "subroutine" call into a group that is already active, is not supported.

PCRE2 supports backreferences in lookbehinds, but only if certain conditions are met. The PCRE2_MATCH_UNSET_BACKREF option must not be set, there must be no use of (?| in the pattern (it creates duplicate group numbers), and if the backreference is by name, the name must be unique. Of course, the referenced group must itself match a limited length substring. The following pattern matches words containing at least two characters that begin and end with the same character:

   \b(\w)\w++(?<=\1)

Possessive quantifiers can be used in conjunction with lookbehind assertions to specify efficient matching at the end of subject strings. Consider a simple pattern such as

  abcd$
when applied to a long string that does not match. Because matching proceeds from left to right, PCRE2 will look for each "a" in the subject and then see if what follows matches the rest of the pattern. If the pattern is specified as
  ^.*abcd$
the initial .* matches the entire string at first, but when this fails (because there is no following "a"), it backtracks to match all but the last character, then all but the last two characters, and so on. Once again the search for "a" covers the entire string, from right to left, so we are no better off. However, if the pattern is written as
  ^.*+(?<=abcd)
there can be no backtracking for the .*+ item because of the possessive quantifier; it can match only the entire string. The subsequent lookbehind assertion does a single test on the last four characters. If it fails, the match fails immediately. For long strings, this approach makes a significant difference to the processing time.

Using multiple assertions

Several assertions (of any sort) may occur in succession. For example,

  (?<=\d{3})(?<!999)foo
matches "foo" preceded by three digits that are not "999". Notice that each of the assertions is applied independently at the same point in the subject string. First there is a check that the previous three characters are all digits, and then there is a check that the same three characters are not "999". This pattern does not match "foo" preceded by six characters, the first three of which are digits and the last three of which are not "999". For example, it doesn't match "123abcfoo". A pattern to do that is
  (?<=\d{3}...)(?<!999)foo
This time the first assertion looks at the preceding six characters, checking that the first three are digits, and then the second assertion checks that the preceding three characters are not "999".

Assertions can be nested in any combination. For example,

  (?<=(?<!foo)bar)baz
matches an occurrence of "baz" that is preceded by "bar" which in turn is not preceded by "foo", while
  (?<=\d{3}(?!999)...)foo
is another pattern that matches "foo" preceded by three digits and any three characters that are not "999".

NON-ATOMIC ASSERTIONS

Traditional lookaround assertions are atomic. That is, if an assertion is true, but there is a subsequent matching failure, there is no backtracking into the assertion. However, there are some cases where non-atomic positive assertions can be useful. PCRE2 provides these using the following syntax:

  (*non_atomic_positive_lookahead:  or (*napla: or (?*
  (*non_atomic_positive_lookbehind: or (*naplb: or (?<*
Consider the problem of finding the right-most word in a string that also appears earlier in the string, that is, it must appear at least twice in total. This pattern returns the required result as captured substring 1:
  ^(?x)(*napla: .* \b(\w++)) (?> .*? \b\1\b ){2}
For a subject such as "word1 word2 word3 word2 word3 word4" the result is "word3". How does it work? At the start, ^(?x) anchors the pattern and sets the "x" option, which causes white space (introduced for readability) to be ignored. Inside the assertion, the greedy .* at first consumes the entire string, but then has to backtrack until the rest of the assertion can match a word, which is captured by group 1. In other words, when the assertion first succeeds, it captures the right-most word in the string.

The current matching point is then reset to the start of the subject, and the rest of the pattern match checks for two occurrences of the captured word, using an ungreedy .*? to scan from the left. If this succeeds, we are done, but if the last word in the string does not occur twice, this part of the pattern fails. If a traditional atomic lookahead (?= or (*pla: had been used, the assertion could not be re-entered, and the whole match would fail. The pattern would succeed only if the very last word in the subject was found twice.

Using a non-atomic lookahead, however, means that when the last word does not occur twice in the string, the lookahead can backtrack and find the second-last word, and so on, until either the match succeeds, or all words have been tested.

Two conditions must be met for a non-atomic assertion to be useful: the contents of one or more capturing groups must change after a backtrack into the assertion, and there must be a backreference to a changed group later in the pattern. If this is not the case, the rest of the pattern match fails exactly as before because nothing has changed, so using a non-atomic assertion just wastes resources.

There is one exception to backtracking into a non-atomic assertion. If an (*ACCEPT) control verb is triggered, the assertion succeeds atomically. That is, a subsequent match failure cannot backtrack into the assertion.

Non-atomic assertions are not supported by the alternative matching function pcre2_dfa_match(). They are supported by JIT, but only if they do not contain any control verbs such as (*ACCEPT). (This may change in future). Note that assertions that appear as conditions for conditional groups (see below) must be atomic.

SCAN SUBSTRING ASSERTIONS

A special kind of assertion, not compatible with Perl, makes it possible to check the contents of a captured substring by matching it with a subpattern. Because this involves capturing, this feature is not supported by pcre2_dfa_match().

A scan substring assertion starts with the sequence (*scan_substring: or (*scs: which is followed by a list of substring numbers (absolute or relative) and/or substring names enclosed in single quotes or angle brackets, all within parentheses. The rest of the item is the subpattern that is applied to the substring, as shown in these examples:

  (*scan_substring:(1)...)
  (*scs:(-2)...)
  (*scs:('AB')...)
  (*scs:(1,'AB',-2)...)
The list of groups is checked in the order they are given, and it is the contents of the first one that is found to be set that are scanned. When PCRE2_DUPNAMES is set and there are ambiguous group names, all groups with the same name are checked in numerical order. A scan substring assertion fails if none of the groups it references have been set.

The pattern match on the substring is always anchored, that is, it must match from the start of the substring. There is no "bumpalong" if it does not match at the start. The end of the subject is temporarily reset to be the end of the substring, so \Z, \z, and $ will match there. However, the start of the subject is not reset. This means that ^ matches only if the substring is actually at the start of the main subject, but it also means that lookbehind assertions into what precedes the substring are possible.

Here is a very simple example: find a word that contains the rare (in English) sequence of letters "rh" not at the start:

  \b(\w++)(*scs:(1).+rh)
The first group captures a word which is then scanned by the second group. This example does not actually need this heavyweight feature; the same match can be achieved with:
  \b\w+?rh\w*\b
When things are more complicated, however, scanning a captured substring can be a useful way to describe the required match. For example, there is a rather complicated pattern in the PCRE2 test data that checks an entire subject string for a palindrome, that is, the sequence of letters is the same in both directions. Suppose you want to search for individual words of two or more characters such as "level" that are palindromes:
  (\b\w{2,}+\b)(*scs:(1)...palindrome-matching-pattern...)
Within a substring scanning subpattern, references to other groups work as normal. Capturing groups may appear, and will retain their values during ongoing matching if the assertion succeeds.

SCRIPT RUNS

In concept, a script run is a sequence of characters that are all from the same Unicode script such as Latin or Greek. However, because some scripts are commonly used together, and because some diacritical and other marks are used with multiple scripts, it is not that simple. There is a full description of the rules that PCRE2 uses in the section entitled "Script Runs" in the pcre2unicode documentation.

If part of a pattern is enclosed between (*script_run: or (*sr: and a closing parenthesis, it fails if the sequence of characters that it matches is not a script run. After a failure, normal backtracking occurs. Script runs can be used to detect spoofing attacks using characters that look the same, but are from different scripts. The string "paypal.com" is an infamous example, where the letters could be a mixture of Latin and Cyrillic. This pattern ensures that the matched characters in a sequence of non-spaces that follow white space are a script run:

  \s+(*sr:\S+)
To be sure that they are all from the Latin script (for example), a lookahead can be used:
  \s+(?=\p{Latin})(*sr:\S+)
This works as long as the first character is expected to be a character in that script, and not (for example) punctuation, which is allowed with any script. If this is not the case, a more creative lookahead is needed. For example, if digits, underscore, and dots are permitted at the start:
  \s+(?=[0-9_.]*\p{Latin})(*sr:\S+)

In many cases, backtracking into a script run pattern fragment is not desirable. The script run can employ an atomic group to prevent this. Because this is a common requirement, a shorthand notation is provided by (*atomic_script_run: or (*asr:

  (*asr:...) is the same as (*sr:(?>...))
Note that the atomic group is inside the script run. Putting it outside would not prevent backtracking into the script run pattern.

Support for script runs is not available if PCRE2 is compiled without Unicode support. A compile-time error is given if any of the above constructs is encountered. Script runs are not supported by the alternative matching function pcre2_dfa_match(), because they use the same mechanism as capturing parentheses.

Warning: The (*ACCEPT) control verb (see below) should not be used within a script run group, because it causes an immediate exit from the group, bypassing the script run checking.

CONDITIONAL GROUPS

It is possible to cause the matching process to obey a pattern fragment conditionally or to choose between two alternative fragments, depending on the result of an assertion, or whether a specific capture group has already been matched. The two possible forms of conditional group are:

  (?(condition)yes-pattern)
  (?(condition)yes-pattern|no-pattern)
If the condition is satisfied, the yes-pattern is used; otherwise the no-pattern (if present) is used. An absent no-pattern is equivalent to an empty string (it always matches). If there are more than two alternatives in the group, a compile-time error occurs. Each of the two alternatives may itself contain nested groups of any form, including conditional groups; the restriction to two alternatives applies only at the level of the condition itself. This pattern fragment is an example where the alternatives are complex:
  (?(1) (A|B|C) | (D | (?(2)E|F) | E) )

There are five kinds of condition: references to capture groups, references to recursion, two pseudo-conditions called DEFINE and VERSION, and assertions.

Checking for a used capture group by number

If the text between the parentheses consists of a sequence of digits, the condition is true if a capture group of that number has previously matched. If there is more than one capture group with the same number (see the earlier section about duplicate group numbers), the condition is true if any of them have matched. An alternative notation, which is a PCRE2 extension, not supported by Perl, is to precede the digits with a plus or minus sign. In this case, the group number is relative rather than absolute. The most recently opened capture group (which could be enclosing this condition) can be referenced by (?(-1), the next most recent by (?(-2), and so on. Inside loops it can also make sense to refer to subsequent groups. The next capture group to be opened can be referenced as (?(+1), and so on. The value zero in any of these forms is not used; it provokes a compile-time error.

Consider the following pattern, which contains non-significant white space to make it more readable (assume the PCRE2_EXTENDED option) and to divide it into three parts for ease of discussion:

  ( \( )?    [^()]+    (?(1) \) )
The first part matches an optional opening parenthesis, and if that character is present, sets it as the first captured substring. The second part matches one or more characters that are not parentheses. The third part is a conditional group that tests whether or not the first capture group matched. If it did, that is, if the subject started with an opening parenthesis, the condition is true, and so the yes-pattern is executed and a closing parenthesis is required. Otherwise, since no-pattern is not present, the conditional group matches nothing. In other words, this pattern matches a sequence of non-parentheses, optionally enclosed in parentheses.

If you were embedding this pattern in a larger one, you could use a relative reference:

  ...other stuff... ( \( )?    [^()]+    (?(-1) \) ) ...
This makes the fragment independent of the parentheses in the larger pattern.

Checking for a used capture group by name

Perl uses the syntax (?(<name>)...) or (?('name')...) to test for a used capture group by name. For compatibility with earlier versions of PCRE1, which had this facility before Perl, the syntax (?(name)...) is also recognized. Note, however, that undelimited names consisting of the letter R followed by digits are ambiguous (see the following section). Rewriting the above example to use a named group gives this:

  (?<OPEN> \( )?    [^()]+    (?(<OPEN>) \) )
If the name used in a condition of this kind is a duplicate, the test is applied to all groups of the same name, and is true if any one of them has matched.

Checking for pattern recursion

"Recursion" in this sense refers to any subroutine-like call from one part of the pattern to another, whether or not it is actually recursive. See the sections entitled "Recursive patterns" and "Groups as subroutines" below for details of recursion and subroutine calls.

If a condition is the string (R), and there is no capture group with the name R, the condition is true if matching is currently in a recursion or subroutine call to the whole pattern or any capture group. If digits follow the letter R, and there is no group with that name, the condition is true if the most recent call is into a group with the given number, which must exist somewhere in the overall pattern. This is a contrived example that is equivalent to a+b:

  ((?(R1)a+|(?1)b))
However, in both cases, if there is a capture group with a matching name, the condition tests for its being set, as described in the section above, instead of testing for recursion. For example, creating a group with the name R1 by adding (?<R1>) to the above pattern completely changes its meaning.

If a name preceded by ampersand follows the letter R, for example:

  (?(R&name)...)
the condition is true if the most recent recursion is into a group of that name (which must exist within the pattern).

This condition does not check the entire recursion stack. It tests only the current level. If the name used in a condition of this kind is a duplicate, the test is applied to all groups of the same name, and is true if any one of them is the most recent recursion.

At "top level", all these recursion test conditions are false.

Defining capture groups for use by reference only

If the condition is the string (DEFINE), the condition is always false, even if there is a group with the name DEFINE. In this case, there may be only one alternative in the rest of the conditional group. It is always skipped if control reaches this point in the pattern; the idea of DEFINE is that it can be used to define subroutines that can be referenced from elsewhere. (The use of subroutines is described below.) For example, a pattern to match an IPv4 address such as "192.168.23.245" could be written like this (ignore white space and line breaks):

  (?(DEFINE) (?<byte> 2[0-4]\d | 25[0-5] | 1\d\d | [1-9]?\d) )
  \b (?&byte) (\.(?&byte)){3} \b
The first part of the pattern is a DEFINE group inside which another group named "byte" is defined. This matches an individual component of an IPv4 address (a number less than 256). When matching takes place, this part of the pattern is skipped because DEFINE acts like a false condition. The rest of the pattern uses references to the named group to match the four dot-separated components of an IPv4 address, insisting on a word boundary at each end.

Checking the PCRE2 version

Programs that link with a PCRE2 library can check the version by calling pcre2_config() with appropriate arguments. Users of applications that do not have access to the underlying code cannot do this. A special "condition" called VERSION exists to allow such users to discover which version of PCRE2 they are dealing with by using this condition to match a string such as "yesno". VERSION must be followed either by "=" or ">=" and a version number. For example:

  (?(VERSION>=10.4)yes|no)
This pattern matches "yes" if the PCRE2 version is greater than or equal to 10.4, or "no" otherwise. The fractional part of the version number may be omitted.

Assertion conditions

If the condition is not in any of the above formats, it must be a parenthesized assertion. This may be a positive or negative lookahead or lookbehind assertion. However, it must be a traditional atomic assertion, not one of the non-atomic assertions.

Consider this pattern, again containing non-significant white space, and with the two alternatives on the second line:

  (?(?=[^a-z]*[a-z])
  \d{2}-[a-z]{3}-\d{2}  |  \d{2}-\d{2}-\d{2} )
The condition is a positive lookahead assertion that matches an optional sequence of non-letters followed by a letter. In other words, it tests for the presence of at least one letter in the subject. If a letter is found, the subject is matched against the first alternative; otherwise it is matched against the second. This pattern matches strings in one of the two forms dd-aaa-dd or dd-dd-dd, where aaa are letters and dd are digits.

When an assertion that is a condition contains capture groups, any capturing that occurs in a matching branch is retained afterwards, for both positive and negative assertions, because matching always continues after the assertion, whether it succeeds or fails. (Compare non-conditional assertions, for which captures are retained only for positive assertions that succeed.)

COMMENTS

There are two ways of including comments in patterns that are processed by PCRE2. In both cases, the start of the comment must not be in a character class, nor in the middle of any other sequence of related characters such as (?: or a group name or number or a Unicode property name. The characters that make up a comment play no part in the pattern matching.

The sequence (?# marks the start of a comment that continues up to the next closing parenthesis. Nested parentheses are not permitted. If the PCRE2_EXTENDED or PCRE2_EXTENDED_MORE option is set, an unescaped # character also introduces a comment, which in this case continues to immediately after the next newline character or character sequence in the pattern. Which characters are interpreted as newlines is controlled by an option passed to the compiling function or by a special sequence at the start of the pattern, as described in the section entitled "Newline conventions" above. Note that the end of this type of comment is a literal newline sequence in the pattern; escape sequences that happen to represent a newline do not count. For example, consider this pattern when PCRE2_EXTENDED is set, and the default newline convention (a single linefeed character) is in force:

  abc #comment \n still comment
On encountering the # character, pcre2_compile() skips along, looking for a newline in the pattern. The sequence \n is still literal at this stage, so it does not terminate the comment. Only an actual character with the code value 0x0a (the default newline) does so.

RECURSIVE PATTERNS

Consider the problem of matching a string in parentheses, allowing for unlimited nested parentheses. Without the use of recursion, the best that can be done is to use a pattern that matches up to some fixed depth of nesting. It is not possible to handle an arbitrary nesting depth.

For some time, Perl has provided a facility that allows regular expressions to recurse (amongst other things). It does this by interpolating Perl code in the expression at run time, and the code can refer to the expression itself. A Perl pattern using code interpolation to solve the parentheses problem can be created like this:

  $re = qr{\( (?: (?>[^()]+) | (?p{$re}) )* \)}x;
The (?p{...}) item interpolates Perl code at run time, and in this case refers recursively to the pattern in which it appears.

Obviously, PCRE2 cannot support the interpolation of Perl code. Instead, it supports special syntax for recursion of the entire pattern, and also for individual capture group recursion. After its introduction in PCRE1 and Python, this kind of recursion was subsequently introduced into Perl at release 5.10.

A special item that consists of (? followed by a number greater than zero and a closing parenthesis is a recursive subroutine call of the capture group of the given number, provided that it occurs inside that group. (If not, it is a non-recursive subroutine call, which is described in the next section.) The special item (?R) or (?0) is a recursive call of the entire regular expression.

This PCRE2 pattern solves the nested parentheses problem (assume the PCRE2_EXTENDED option is set so that white space is ignored):

  \( ( [^()]++ | (?R) )* \)
First it matches an opening parenthesis. Then it matches any number of substrings which can either be a sequence of non-parentheses, or a recursive match of the pattern itself (that is, a correctly parenthesized substring). Finally there is a closing parenthesis. Note the use of a possessive quantifier to avoid backtracking into sequences of non-parentheses.

If this were part of a larger pattern, you would not want to recurse the entire pattern, so instead you could use this:

  ( \( ( [^()]++ | (?1) )* \) )
We have put the pattern into parentheses, and caused the recursion to refer to them instead of the whole pattern.

In a larger pattern, keeping track of parenthesis numbers can be tricky. This is made easier by the use of relative references. Instead of (?1) in the pattern above you can write (?-2) to refer to the second most recently opened parentheses preceding the recursion. In other words, a negative number counts capturing parentheses leftwards from the point at which it is encountered.

Be aware however, that if duplicate capture group numbers are in use, relative references refer to the earliest group with the appropriate number. Consider, for example:

  (?|(a)|(b)) (c) (?-2)
The first two capture groups (a) and (b) are both numbered 1, and group (c) is number 2. When the reference (?-2) is encountered, the second most recently opened parentheses has the number 1, but it is the first such group (the (a) group) to which the recursion refers. This would be the same if an absolute reference (?1) was used. In other words, relative references are just a shorthand for computing a group number.

It is also possible to refer to subsequent capture groups, by writing references such as (?+2). However, these cannot be recursive because the reference is not inside the parentheses that are referenced. They are always non-recursive subroutine calls, as described in the next section.

An alternative approach is to use named parentheses. The Perl syntax for this is (?&name); PCRE1's earlier syntax (?P>name) is also supported. We could rewrite the above example as follows:

  (?<pn> \( ( [^()]++ | (?&pn) )* \) )
If there is more than one group with the same name, the earliest one is used.

The example pattern that we have been looking at contains nested unlimited repeats, and so the use of a possessive quantifier for matching strings of non-parentheses is important when applying the pattern to strings that do not match. For example, when this pattern is applied to

  (aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa()
it yields "no match" quickly. However, if a possessive quantifier is not used, the match runs for a very long time indeed because there are so many different ways the + and * repeats can carve up the subject, and all have to be tested before failure can be reported.

At the end of a match, the values of capturing parentheses are those from the outermost level. If you want to obtain intermediate values, a callout function can be used (see below and the pcre2callout documentation). If the pattern above is matched against

  (ab(cd)ef)
the value for the inner capturing parentheses (numbered 2) is "ef", which is the last value taken on at the top level. If a capture group is not matched at the top level, its final captured value is unset, even if it was (temporarily) set at a deeper level during the matching process.

Do not confuse the (?R) item with the condition (R), which tests for recursion. Consider this pattern, which matches text in angle brackets, allowing for arbitrary nesting. Only digits are allowed in nested brackets (that is, when recursing), whereas any characters are permitted at the outer level.

  < (?: (?(R) \d++  | [^<>]*+) | (?R)) * >
In this pattern, (?(R) is the start of a conditional group, with two different alternatives for the recursive and non-recursive cases. The (?R) item is the actual recursive call.

Differences in recursion processing between PCRE2 and Perl

Some former differences between PCRE2 and Perl no longer exist.

Before release 10.30, recursion processing in PCRE2 differed from Perl in that a recursive subroutine call was always treated as an atomic group. That is, once it had matched some of the subject string, it was never re-entered, even if it contained untried alternatives and there was a subsequent matching failure. (Historical note: PCRE implemented recursion before Perl did.)

Starting with release 10.30, recursive subroutine calls are no longer treated as atomic. That is, they can be re-entered to try unused alternatives if there is a matching failure later in the pattern. This is now compatible with the way Perl works. If you want a subroutine call to be atomic, you must explicitly enclose it in an atomic group.

Supporting backtracking into recursions simplifies certain types of recursive pattern. For example, this pattern matches palindromic strings:

  ^((.)(?1)\2|.?)$
The second branch in the group matches a single central character in the palindrome when there are an odd number of characters, or nothing when there are an even number of characters, but in order to work it has to be able to try the second case when the rest of the pattern match fails. If you want to match typical palindromic phrases, the pattern has to ignore all non-word characters, which can be done like this:
  ^\W*+((.)\W*+(?1)\W*+\2|\W*+.?)\W*+$
If run with the PCRE2_CASELESS option, this pattern matches phrases such as "A man, a plan, a canal: Panama!". Note the use of the possessive quantifier *+ to avoid backtracking into sequences of non-word characters. Without this, PCRE2 takes a great deal longer (ten times or more) to match typical phrases, and Perl takes so long that you think it has gone into a loop.

Another way in which PCRE2 and Perl used to differ in their recursion processing is in the handling of captured values. Formerly in Perl, when a group was called recursively or as a subroutine (see the next section), it had no access to any values that were captured outside the recursion, whereas in PCRE2 these values can be referenced. Consider this pattern:

  ^(.)(\1|a(?2))
This pattern matches "bab". The first capturing parentheses match "b", then in the second group, when the backreference \1 fails to match "b", the second alternative matches "a" and then recurses. In the recursion, \1 does now match "b" and so the whole match succeeds. This match used to fail in Perl, but in later versions (I tried 5.024) it now works.

Groups as subroutines

If the syntax for a recursive group call (either by number or by name) is used outside the parentheses to which it refers, it operates a bit like a subroutine in a programming language. More accurately, PCRE2 treats the referenced group as an independent subpattern which it tries to match at the current matching position. The called group may be defined before or after the reference. A numbered reference can be absolute or relative, as in these examples:

  (...(absolute)...)...(?2)...
  (...(relative)...)...(?-1)...
  (...(?+1)...(relative)...
An earlier example pointed out that the pattern
  (sens|respons)e and \1ibility
matches "sense and sensibility" and "response and responsibility", but not "sense and responsibility". If instead the pattern
  (sens|respons)e and (?1)ibility
is used, it does match "sense and responsibility" as well as the other two strings. Another example is given in the discussion of DEFINE above.

Like recursions, subroutine calls used to be treated as atomic, but this changed at PCRE2 release 10.30, so backtracking into subroutine calls can now occur. However, any capturing parentheses that are set during the subroutine call revert to their previous values afterwards.

Processing options such as case-independence are fixed when a group is defined, so if it is used as a subroutine, such options cannot be changed for different calls. For example, consider this pattern:

  (abc)(?i:(?-1))
It matches "abcabc". It does not match "abcABC" because the change of processing option does not affect the called group.

The behaviour of backtracking control verbs in groups when called as subroutines is described in the section entitled "Backtracking verbs in subroutines" below.

Recursion and subroutines with returned capture groups

Since PCRE2 10.47, recursion and subroutine calls may also specify a list of capture groups to return. This is a PCRE2 syntax extension not supported by Perl. The pattern matching recurses into the referenced expression as described above; however, when the recursion returns to the calling expression, the subgroups captured during the recursion can be retained when the calling expression's context is restored.

When used as a subroutine, this allows the subroutine's capture groups to be used as return values.

Only the specific capture groups listed by the caller will be retained, using the following syntax:

  (?R(grouplist))       recurse whole pattern, returning capture groups
  (?n(grouplist))       )
  (?+n(grouplist))      )
  (?-n(grouplist))      ) call subroutine, returning capture groups
  (?&name(grouplist))   )
  (?P>name(grouplist))  )

The list of capture groups "grouplist" is a comma-separated list of (absolute or relative) group numbers, and group names enclosed in single quotes or angle brackets.

Here is an example which first uses the DEFINE condition to create a re-usable routine for matching a weekend day, then calls that subroutine and retains the groups it captures for use later:

  (?x: # ignore whitespace for clarity
    # Define the routine "weekendday" which matches Saturday or
    # Sunday, and returns the Sat/Sun prefix as \k<short>.
    (?(DEFINE) (?<weekendday>
        (?|(?<short>Sat)urday|(?<short>Sun)day) ) )
    # Call the routine. Matches "Saturday,Sat" or "Sunday,Sun".
    (?&weekendday(<short>)),\k<short> )

This feature is not available using the Oniguruma syntax \g<...> or \g'...' below.

Oniguruma subroutine syntax

For compatibility with Oniguruma, the non-Perl syntax \g followed by a name or a number enclosed either in angle brackets or single quotes, is an alternative syntax for calling a group as a subroutine, possibly recursively. Here are two of the examples used above, rewritten using this syntax:

  (?<pn> \( ( (?>[^()]+) | \g<pn> )* \) )
  (sens|respons)e and \g'1'ibility
PCRE2 supports an extension to Oniguruma: if a number is preceded by a plus or a minus sign it is taken as a relative reference. For example:
  (abc)(?i:\g<-1>)
Note that \g{...} (Perl syntax) and \g<...> (Oniguruma syntax) are not synonymous. The former is a backreference; the latter is a subroutine call.

CALLOUTS

Perl has a feature whereby using the sequence (?{...}) causes arbitrary Perl code to be obeyed in the middle of matching a regular expression. This makes it possible, amongst other things, to extract different substrings that match the same pair of parentheses when there is a repetition.

PCRE2 provides a similar feature, but of course it cannot obey arbitrary Perl code. The feature is called "callout". The caller of PCRE2 provides an external function by putting its entry point in a match context using the function pcre2_set_callout(), and then passing that context to pcre2_match() or pcre2_dfa_match(). If no match context is passed, or if the callout entry point is set to NULL, callout points will be passed over silently during matching. To disallow callouts in the pattern syntax, you may use the PCRE2_EXTRA_NEVER_CALLOUT option.

Within a regular expression, (?C<arg>) indicates a point at which the external function is to be called. There are two kinds of callout: those with a numerical argument and those with a string argument. (?C) on its own with no argument is treated as (?C0). A numerical argument allows the application to distinguish between different callouts. String arguments were added for release 10.20 to make it possible for script languages that use PCRE2 to embed short scripts within patterns in a similar way to Perl.

During matching, when PCRE2 reaches a callout point, the external function is called. It is provided with the number or string argument of the callout, the position in the pattern, and one item of data that is also set in the match block. The callout function may cause matching to proceed, to backtrack, or to fail.

By default, PCRE2 implements a number of optimizations at matching time, and one side-effect is that sometimes callouts are skipped. If you need all possible callouts to happen, you need to set options that disable the relevant optimizations. More details, including a complete description of the programming interface to the callout function, are given in the pcre2callout documentation.

Callouts with numerical arguments

If you just want to have a means of identifying different callout points, put a number less than 256 after the letter C. For example, this pattern has two callout points:

  (?C1)abc(?C2)def
If the PCRE2_AUTO_CALLOUT flag is passed to pcre2_compile(), numerical callouts are automatically installed before each item in the pattern. They are all numbered 255. If there is a conditional group in the pattern whose condition is an assertion, an additional callout is inserted just before the condition. An explicit callout may also be set at this position, as in this example:
  (?(?C9)(?=a)abc|def)
Note that this applies only to assertion conditions, not to other types of condition.

Callouts with string arguments

A delimited string may be used instead of a number as a callout argument. The starting delimiter must be one of ` ' " ^ % # $ { and the ending delimiter is the same as the start, except for {, where the ending delimiter is }. If the ending delimiter is needed within the string, it must be doubled. For example:

  (?C'ab ''c'' d')xyz(?C{any text})pqr
The doubling is removed before the string is passed to the callout function.

BACKTRACKING CONTROL

There are a number of special "Backtracking Control Verbs" (to use Perl's terminology) that modify the behaviour of backtracking during matching. They are generally of the form (*VERB) or (*VERB:NAME). Some verbs take either form, and may behave differently depending on whether or not a name argument is present. The names are not required to be unique within the pattern.

By default, for compatibility with Perl, a name is any sequence of characters that does not include a closing parenthesis. The name is not processed in any way, and it is not possible to include a closing parenthesis in the name. This can be changed by setting the PCRE2_ALT_VERBNAMES option, but the result is no longer Perl-compatible.

When PCRE2_ALT_VERBNAMES is set, backslash processing is applied to verb names and only an unescaped closing parenthesis terminates the name. However, the only backslash items that are permitted are \Q, \E, and sequences such as \x{100} that define character code points. Character type escapes such as \d are faulted.

A closing parenthesis can be included in a name either as \) or between \Q and \E. In addition to backslash processing, if the PCRE2_EXTENDED or PCRE2_EXTENDED_MORE option is also set, unescaped white space in verb names is skipped, and #-comments are recognized, exactly as in the rest of the pattern. PCRE2_EXTENDED and PCRE2_EXTENDED_MORE do not affect verb names unless PCRE2_ALT_VERBNAMES is also set.

The maximum length of a name is 255 in the 8-bit library and 65535 in the 16-bit and 32-bit libraries. If the name is empty, that is, if the closing parenthesis immediately follows the colon, the effect is as if the colon were not there. Any number of these verbs may occur in a pattern. Except for (*ACCEPT), they may not be quantified.

Since these verbs are specifically related to backtracking, most of them can be used only when the pattern is to be matched using the traditional matching function or JIT, because they use backtracking algorithms. With the exception of (*FAIL), which behaves like a failing negative assertion, the backtracking control verbs cause an error if encountered by the DFA matching function.

The behaviour of these verbs in repeated groups, assertions, and in capture groups called as subroutines (whether or not recursively) is documented below.

Optimizations that affect backtracking verbs

PCRE2 contains some optimizations that are used to speed up matching by running some checks at the start of each match attempt. For example, it may know the minimum length of matching subject, or that a particular character must be present. When one of these optimizations bypasses the running of a match, any included backtracking verbs will not, of course, be processed. You can suppress the start-of-match optimizations by setting the PCRE2_NO_START_OPTIMIZE option when calling pcre2_compile(), by calling pcre2_set_optimize() with a PCRE2_START_OPTIMIZE_OFF directive, or by starting the pattern with (*NO_START_OPT). There is more discussion of this option in the section entitled "Compiling a pattern" in the pcre2api documentation.

Experiments with Perl suggest that it too has similar optimizations, and like PCRE2, turning them off can change the result of a match.

Verbs that act immediately

The following verbs act as soon as they are encountered.

   (*ACCEPT) or (*ACCEPT:NAME)
This verb causes the match to end successfully, skipping the remainder of the pattern. However, when it is inside a capture group that is called as a subroutine, only that group is ended successfully. Matching then continues at the outer level. If (*ACCEPT) is triggered in a positive assertion, the assertion succeeds; in a negative assertion, the assertion fails.

If (*ACCEPT) is inside capturing parentheses, the data so far is captured. For example:

  A((?:A|B(*ACCEPT)|C)D)
This matches "AB", "AAD", or "ACD"; when it matches "AB", "B" is captured by the outer parentheses.

(*ACCEPT) is the only backtracking verb that is allowed to be quantified because an ungreedy quantification with a minimum of zero acts only when a backtrack happens. Consider, for example,

  (A(*ACCEPT)??B)C
where A, B, and C may be complex expressions. After matching "A", the matcher processes "BC"; if that fails, causing a backtrack, (*ACCEPT) is triggered and the match succeeds. In both cases, all but C is captured. Whereas (*COMMIT) (see below) means "fail on backtrack", a repeated (*ACCEPT) of this type means "succeed on backtrack".

Warning: (*ACCEPT) should not be used within a script run group, because it causes an immediate exit from the group, bypassing the script run checking.

  (*FAIL) or (*FAIL:NAME)
This verb causes a matching failure, forcing backtracking to occur. It may be abbreviated to (*F). It is equivalent to (?!) but easier to read. The Perl documentation notes that it is probably useful only when combined with (?{}) or (??{}). Those are, of course, Perl features that are not present in PCRE2. The nearest equivalent is the callout feature, as for example in this pattern:
  a+(?C)(*FAIL)
A match with the string "aaaa" always fails, but the callout is taken before each backtrack happens (in this example, 10 times).

(*ACCEPT:NAME) and (*FAIL:NAME) behave the same as (*MARK:NAME)(*ACCEPT) and (*MARK:NAME)(*FAIL), respectively, that is, a (*MARK) is recorded just before the verb acts.

Recording which path was taken

There is one verb whose main purpose is to track how a match was arrived at, though it also has a secondary use in conjunction with advancing the match starting point (see (*SKIP) below).

  (*MARK:NAME) or (*:NAME)
A name is always required with this verb. For all the other backtracking control verbs, a NAME argument is optional.

When a match succeeds, the name of the last-encountered mark name on the matching path is passed back to the caller as described in the section entitled "Other information about the match" in the pcre2api documentation. This applies to all instances of (*MARK) and other verbs, including those inside assertions and atomic groups. However, there are differences in those cases when (*MARK) is used in conjunction with (*SKIP) as described below.

The mark name that was last encountered on the matching path is passed back. A verb without a NAME argument is ignored for this purpose. Here is an example of pcre2test output, where the "mark" modifier requests the retrieval and outputting of (*MARK) data:

    re> /X(*MARK:A)Y|X(*MARK:B)Z/mark
  data> XY
   0: XY
  MK: A
  data> XZ
   0: XZ
  MK: B
The (*MARK) name is tagged with "MK:" in this output, and in this example it indicates which of the two alternatives matched. This is a more efficient way of obtaining this information than putting each alternative in its own capturing parentheses.

If a verb with a name is encountered in a positive assertion that is true, the name is recorded and passed back if it is the last-encountered. This does not happen for negative assertions or failing positive assertions.

After a partial match or a failed match, the last encountered name in the entire match process is returned. For example:

    re> /X(*MARK:A)Y|X(*MARK:B)Z/mark
  data> XP
  No match, mark = B
Note that in this unanchored example the mark is retained from the match attempt that started at the letter "X" in the subject. Subsequent match attempts starting at "P" and then with an empty string do not get as far as the (*MARK) item, but nevertheless do not reset it.

If you are interested in (*MARK) values after failed matches, you should probably either set the PCRE2_NO_START_OPTIMIZE option or call pcre2_set_optimize() with a PCRE2_START_OPTIMIZE_OFF directive (see above) to ensure that the match is always attempted.

Verbs that act after backtracking

The following verbs do nothing when they are encountered. Matching continues with what follows, but if there is a subsequent match failure, causing a backtrack to the verb, a failure is forced. That is, backtracking cannot pass to the left of the verb. However, when one of these verbs appears inside an atomic group or in an atomic lookaround assertion that is true, its effect is confined to that group, because once the group has been matched, there is never any backtracking into it. Backtracking from beyond an atomic assertion or group ignores the entire group, and seeks a preceding backtracking point.

These verbs differ in exactly what kind of failure occurs when backtracking reaches them. The behaviour described below is what happens when the verb is not in a subroutine or an assertion. Subsequent sections cover these special cases.

  (*COMMIT) or (*COMMIT:NAME)
This verb causes the whole match to fail outright if there is a later matching failure that causes backtracking to reach it. Even if the pattern is unanchored, no further attempts to find a match by advancing the starting point take place. If (*COMMIT) is the only backtracking verb that is encountered, once it has been passed pcre2_match() is committed to finding a match at the current starting point, or not at all. For example:
  a+(*COMMIT)b
This matches "xxaab" but not "aacaab". It can be thought of as a kind of dynamic anchor, or "I've started, so I must finish."

The behaviour of (*COMMIT:NAME) is not the same as (*MARK:NAME)(*COMMIT). It is like (*MARK:NAME) in that the name is remembered for passing back to the caller. However, (*SKIP:NAME) searches only for names that are set with (*MARK), ignoring those set by any of the other backtracking verbs.

If there is more than one backtracking verb in a pattern, a different one that follows (*COMMIT) may be triggered first, so merely passing (*COMMIT) during a match does not always guarantee that a match must be at this starting point.

Note that (*COMMIT) at the start of a pattern is not the same as an anchor, unless PCRE2's start-of-match optimizations are turned off, as shown in this output from pcre2test:

    re> /(*COMMIT)abc/
  data> xyzabc
   0: abc
  data>
    re> /(*COMMIT)abc/no_start_optimize
  data> xyzabc
  No match
For the first pattern, PCRE2 knows that any match must start with "a", so the optimization skips along the subject to "a" before applying the pattern to the first set of data. The match attempt then succeeds. The second pattern disables the optimization that skips along to the first character. The pattern is now applied starting at "x", and so the (*COMMIT) causes the match to fail without trying any other starting points.
  (*PRUNE) or (*PRUNE:NAME)
This verb causes the match to fail at the current starting position in the subject if there is a later matching failure that causes backtracking to reach it. If the pattern is unanchored, the normal "bumpalong" advance to the next starting character then happens. Backtracking can occur as usual to the left of (*PRUNE), before it is reached, or when matching to the right of (*PRUNE), but if there is no match to the right, backtracking cannot cross (*PRUNE). In simple cases, the use of (*PRUNE) is just an alternative to an atomic group or possessive quantifier, but there are some uses of (*PRUNE) that cannot be expressed in any other way. In an anchored pattern (*PRUNE) has the same effect as (*COMMIT).

The behaviour of (*PRUNE:NAME) is not the same as (*MARK:NAME)(*PRUNE). It is like (*MARK:NAME) in that the name is remembered for passing back to the caller. However, (*SKIP:NAME) searches only for names set with (*MARK), ignoring those set by other backtracking verbs.

  (*SKIP)
This verb, when given without a name, is like (*PRUNE), except that if the pattern is unanchored, the "bumpalong" advance is not to the next character, but to the position in the subject where (*SKIP) was encountered. (*SKIP) signifies that whatever text was matched leading up to it cannot be part of a successful match if there is a later mismatch. Consider:
  a+(*SKIP)b
If the subject is "aaaac...", after the first match attempt fails (starting at the first character in the string), the starting point skips on to start the next attempt at "c". Note that a possessive quantifier does not have the same effect as this example; although it would suppress backtracking during the first match attempt, the second attempt would start at the second character instead of skipping on to "c".

If (*SKIP) is used to specify a new starting position that is the same as the starting position of the current match, or (by being inside a lookbehind) earlier, the position specified by (*SKIP) is ignored, and instead the normal "bumpalong" occurs.

  (*SKIP:NAME)
When (*SKIP) has an associated name, its behaviour is modified. When such a (*SKIP) is triggered, the previous path through the pattern is searched for the most recent (*MARK) that has the same name. If one is found, the "bumpalong" advance is to the subject position that corresponds to that (*MARK) instead of to where (*SKIP) was encountered. If no (*MARK) with a matching name is found, the (*SKIP) is ignored.

The search for a (*MARK) name uses the normal backtracking mechanism, which means that it does not see (*MARK) settings that are inside atomic groups or assertions, because they are never re-entered by backtracking. Compare the following pcre2test examples:

    re> /a(?>(*MARK:X))(*SKIP:X)(*F)|(.)/
  data> abc
   0: a
   1: a
  data>
    re> /a(?:(*MARK:X))(*SKIP:X)(*F)|(.)/
  data> abc
   0: b
   1: b
In the first example, the (*MARK) setting is in an atomic group, so it is not seen when (*SKIP:X) triggers, causing the (*SKIP) to be ignored. This allows the second branch of the pattern to be tried at the first character position. In the second example, the (*MARK) setting is not in an atomic group. This allows (*SKIP:X) to find the (*MARK) when it backtracks, and this causes a new matching attempt to start at the second character. This time, the (*MARK) is never seen because "a" does not match "b", so the matcher immediately jumps to the second branch of the pattern.

Note that (*SKIP:NAME) searches only for names set by (*MARK:NAME). It ignores names that are set by other backtracking verbs.

  (*THEN) or (*THEN:NAME)
This verb causes a skip to the next innermost alternative when backtracking reaches it. That is, it cancels any further backtracking within the current alternative. Its name comes from the observation that it can be used for a pattern-based if-then-else block:
  ( COND1 (*THEN) FOO | COND2 (*THEN) BAR | COND3 (*THEN) BAZ ) ...
If the COND1 pattern matches, FOO is tried (and possibly further items after the end of the group if FOO succeeds); on failure, the matcher skips to the second alternative and tries COND2, without backtracking into COND1. If that succeeds and BAR fails, COND3 is tried. If subsequently BAZ fails, there are no more alternatives, so there is a backtrack to whatever came before the entire group. If (*THEN) is not inside an alternation, it acts like (*PRUNE).

The behaviour of (*THEN:NAME) is not the same as (*MARK:NAME)(*THEN). It is like (*MARK:NAME) in that the name is remembered for passing back to the caller. However, (*SKIP:NAME) searches only for names set with (*MARK), ignoring those set by other backtracking verbs.

A group that does not contain a | character is just a part of the enclosing alternative; it is not a nested alternation with only one alternative. The effect of (*THEN) extends beyond such a group to the enclosing alternative. Consider this pattern, where A, B, etc. are complex pattern fragments that do not contain any | characters at this level:

  A (B(*THEN)C) | D
If A and B are matched, but there is a failure in C, matching does not backtrack into A; instead it moves to the next alternative, that is, D. However, if the group containing (*THEN) is given an alternative, it behaves differently:
  A (B(*THEN)C | (*FAIL)) | D
The effect of (*THEN) is now confined to the inner group. After a failure in C, matching moves to (*FAIL), which causes the whole group to fail because there are no more alternatives to try. In this case, matching does backtrack into A.

Note that a conditional group is not considered as having two alternatives, because only one is ever used. In other words, the | character in a conditional group has a different meaning. Ignoring white space, consider:

  ^.*? (?(?=a) a | b(*THEN)c )
If the subject is "ba", this pattern does not match. Because .*? is ungreedy, it initially matches zero characters. The condition (?=a) then fails, the character "b" is matched, but "c" is not. At this point, matching does not backtrack to .*? as might perhaps be expected from the presence of the | character. The conditional group is part of the single alternative that comprises the whole pattern, and so the match fails. (If there was a backtrack into .*?, allowing it to match "b", the match would succeed.)

The verbs just described provide four different "strengths" of control when subsequent matching fails. (*THEN) is the weakest, carrying on the match at the next alternative. (*PRUNE) comes next, failing the match at the current starting position, but allowing an advance to the next character (for an unanchored pattern). (*SKIP) is similar, except that the advance may be more than one character. (*COMMIT) is the strongest, causing the entire match to fail.

More than one backtracking verb

If more than one backtracking verb is present in a pattern, the one that is backtracked onto first acts. For example, consider this pattern, where A, B, etc. are complex pattern fragments:

  (A(*COMMIT)B(*THEN)C|ABD)
If A matches but B fails, the backtrack to (*COMMIT) causes the entire match to fail. However, if A and B match, but C fails, the backtrack to (*THEN) causes the next alternative (ABD) to be tried. This behaviour is consistent, but is not always the same as Perl's. It means that if two or more backtracking verbs appear in succession, all but the last of them have no effect. Consider this example:
  ...(*COMMIT)(*PRUNE)...
If there is a matching failure to the right, backtracking onto (*PRUNE) causes it to be triggered, and its action is taken. There can never be a backtrack onto (*COMMIT).

Backtracking verbs in repeated groups

PCRE2 sometimes differs from Perl in its handling of backtracking verbs in repeated groups. For example, consider:

  /(a(*COMMIT)b)+ac/
If the subject is "abac", Perl matches unless its optimizations are disabled, but PCRE2 always fails because the (*COMMIT) in the second repeat of the group acts.

Backtracking verbs in assertions

(*FAIL) in any assertion has its normal effect: it forces an immediate backtrack. The behaviour of the other backtracking verbs depends on whether the assertion is standalone or is acting as the condition in a conditional group.

(*ACCEPT) in a standalone positive assertion causes the assertion to succeed without any further processing; captured strings and a mark name (if set) are retained. In a standalone negative assertion, (*ACCEPT) causes the assertion to fail without any further processing; captured substrings and any mark name are discarded.

If the assertion is a condition, (*ACCEPT) causes the condition to be true for a positive assertion and false for a negative one; captured substrings are retained in both cases.

The remaining verbs act only when a later failure causes a backtrack to reach them. This means that, for the Perl-compatible assertions, their effect is confined to the assertion, because Perl lookaround assertions are atomic. A backtrack that occurs after such an assertion is complete does not jump back into the assertion. Note in particular that a (*MARK) name that is set in an assertion is not "seen" by an instance of (*SKIP:NAME) later in the pattern.

PCRE2 now supports non-atomic positive assertions and also "scan substring" assertions, as described in the sections entitled "Non-atomic assertions" and "Scan substring assertions" above. These assertions must be standalone (not used as conditions). They are not Perl-compatible. For these assertions, a later backtrack does jump back into the assertion, and therefore verbs such as (*COMMIT) can be triggered by backtracks from later in the pattern.

The effect of (*THEN) is not allowed to escape beyond an assertion. If there are no more branches to try, (*THEN) causes a positive assertion to be false, and a negative assertion to be true. This behaviour differs from Perl when the assertion has only one branch.

The other backtracking verbs are not treated specially if they appear in a standalone positive assertion. In a conditional positive assertion, backtracking (from within the assertion) into (*COMMIT), (*SKIP), or (*PRUNE) causes the condition to be false. However, for both standalone and conditional negative assertions, backtracking into (*COMMIT), (*SKIP), or (*PRUNE) causes the assertion to be true, without considering any further alternative branches.

Backtracking verbs in subroutines

These behaviours occur whether or not the group is called recursively.

(*ACCEPT) in a group called as a subroutine causes the subroutine match to succeed without any further processing. Matching then continues after the subroutine call. Perl documents this behaviour. Perl's treatment of the other verbs in subroutines is different in some cases.

(*FAIL) in a group called as a subroutine has its normal effect: it forces an immediate backtrack.

(*COMMIT), (*SKIP), and (*PRUNE) cause the subroutine match to fail when triggered by being backtracked to in a group called as a subroutine. There is then a backtrack at the outer level.

(*THEN), when triggered, skips to the next alternative in the innermost enclosing group that has alternatives (its normal behaviour). However, if there is no such group within the subroutine's group, the subroutine match fails and there is a backtrack at the outer level.

EBCDIC ENVIRONMENTS

Differences in the way PCRE2 behaves when it is running in an EBCDIC environment are covered in this section.

Escape sequences

When PCRE2 is compiled in EBCDIC mode, \N{U+hhh..} is not supported. \a, \e, \f, \n, \r, and \t generate the appropriate EBCDIC code values. The \c escape is processed as specified for Perl in the perlebcdic document. The only characters that are allowed after \c are A-Z, a-z, or one of @, [, \, ], ^, _, or ?. Any other character provokes a compile-time error. The sequence \c@ encodes character code 0; after \c the letters (in either case) encode characters 1-26 (hex 01 to hex 1A); [, \, ], ^, and _ encode characters 27-31 (hex 1B to hex 1F), and \c? becomes either 255 (hex FF) or 95 (hex 5F).

Thus, apart from \c?, these escapes generate the same character code values as they do in an ASCII or Unicode environment, though the meanings of the values mostly differ. For example, \cG always generates code value 7, which is BEL in ASCII but DEL in EBCDIC.

The sequence \c? generates DEL (127, hex 7F) in an ASCII environment, but because 127 is not a control character in EBCDIC, Perl makes it generate the APC character. Unfortunately, there are several variants of EBCDIC. In most of them the APC character has the value 255 (hex FF), but in the one Perl calls POSIX-BC its value is 95 (hex 5F). If certain other characters have POSIX-BC values, PCRE2 makes \c? generate 95; otherwise it generates 255.

Character classes

In character classes there is a special case in EBCDIC environments for ranges whose end points are both specified as literal letters in the same case. For compatibility with Perl, EBCDIC code points within the range that are not letters are omitted. For example, [h-k] matches only four characters, even though the EBCDIC codes for h and k are 0x88 and 0x92, a range of 11 code points. However, if the range is specified numerically, for example, [\x88-\x92] or [h-\x92], all code points are included.

SEE ALSO

pcre2api(3), pcre2callout(3), pcre2matching(3), pcre2syntax(3), pcre2(3).

AUTHOR

Philip Hazel
Retired from University Computing Service
Cambridge, England.

REVISION

Last updated: 03 September 2025
Copyright © 1997-2024 University of Cambridge.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2perform.html ================================================ pcre2perform specification

pcre2perform man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

PCRE2 PERFORMANCE

Two aspects of performance are discussed below: memory usage and processing time. The way you express your pattern as a regular expression can affect both of them.

COMPILED PATTERN MEMORY USAGE

Patterns are compiled by PCRE2 into a reasonably efficient interpretive code, so that most simple patterns do not use much memory for storing the compiled version. However, there is one case where the memory usage of a compiled pattern can be unexpectedly large. If a parenthesized group has a quantifier with a minimum greater than 1 and/or a limited maximum, the whole group is repeated in the compiled code. For example, the pattern

  (abc|def){2,4}
is compiled as if it were
  (abc|def)(abc|def)((abc|def)(abc|def)?)?
(Technical aside: It is done this way so that backtrack points within each of the repetitions can be independently maintained.)

For regular expressions whose quantifiers use only small numbers, this is not usually a problem. However, if the numbers are large, and particularly if such repetitions are nested, the memory usage can become an embarrassment. For example, the very simple pattern

  ((ab){1,1000}c){1,3}
uses over 50KiB when compiled using the 8-bit library. When PCRE2 is compiled with its default internal pointer size of two bytes, the size limit on a compiled pattern is 65535 code units in the 8-bit and 16-bit libraries, and this is reached with the above pattern if the outer repetition is increased from 3 to 4. PCRE2 can be compiled to use larger internal pointers and thus handle larger compiled patterns, but it is better to try to rewrite your pattern to use less memory if you can.

One way of reducing the memory usage for such patterns is to make use of PCRE2's "subroutine" facility. Re-writing the above pattern as

  ((ab)(?2){0,999}c)(?1){0,2}
reduces the memory requirements to around 16KiB, and indeed it remains under 20KiB even with the outer repetition increased to 100. However, this kind of pattern is not always exactly equivalent, because any captures within subroutine calls are lost when the subroutine completes. If this is not a problem, this kind of rewriting will allow you to process patterns that PCRE2 cannot otherwise handle. The matching performance of the two different versions of the pattern is roughly the same. (This applies from release 10.30 - things were different in earlier releases.)

STACK AND HEAP USAGE AT RUN TIME

From release 10.30, the interpretive (non-JIT) version of pcre2_match() uses very little system stack at run time. In earlier releases recursive function calls could use a great deal of stack, and this could cause problems, but this usage has been eliminated. Backtracking positions are now explicitly remembered in memory frames controlled by the code.

The size of each frame depends on the size of pointer variables and the number of capturing parenthesized groups in the pattern being matched. On a 64-bit system the frame size for a pattern with no captures is 128 bytes. For each capturing group the size increases by 16 bytes.

Until release 10.41, an initial 20KiB frames vector was allocated on the system stack, but this still caused some issues for multi-thread applications where each thread has a very small stack. From release 10.41 backtracking memory frames are always held in heap memory. An initial heap allocation is obtained the first time any match data block is passed to pcre2_match(). This is remembered with the match data block and re-used if that block is used for another match. It is freed when the match data block itself is freed.

The size of the initial block is the larger of 20KiB or ten times the pattern's frame size, unless the heap limit is less than this, in which case the heap limit is used. If the initial block proves to be too small during matching, it is replaced by a larger block, subject to the heap limit. The heap limit is checked only when a new block is to be allocated. Reducing the heap limit between calls to pcre2_match() with the same match data block does not affect the saved block.

In contrast to pcre2_match(), pcre2_dfa_match() does use recursive function calls, but only for processing atomic groups, lookaround assertions, and recursion within the pattern. The original version of the code used to allocate quite large internal workspace vectors on the stack, which caused some problems for some patterns in environments with small stacks. From release 10.32 the code for pcre2_dfa_match() has been re-factored to use heap memory when necessary for internal workspace when recursing, though recursive function calls are still used.

The "match depth" parameter can be used to limit the depth of function recursion, and the "match heap" parameter to limit heap memory in pcre2_dfa_match().

PROCESSING TIME

Certain items in regular expression patterns are processed more efficiently than others. It is more efficient to use a character class like [aeiou] than a set of single-character alternatives such as (a|e|i|o|u). In general, the simplest construction that provides the required behaviour is usually the most efficient. Jeffrey Friedl's book contains a lot of useful general discussion about optimizing regular expressions for efficient performance. This document contains a few observations about PCRE2.

Using Unicode character properties (the \p, \P, and \X escapes) is slow, because PCRE2 has to use a multi-stage table lookup whenever it needs a character's property. If you can find an alternative pattern that does not use character properties, it will probably be faster.

By default, the escape sequences \b, \d, \s, and \w, and the POSIX character classes such as [:alpha:] do not use Unicode properties, partly for backwards compatibility, and partly for performance reasons. However, you can set the PCRE2_UCP option or start the pattern with (*UCP) if you want Unicode character properties to be used. This can double the matching time for items such as \d, when matched with pcre2_match(); the performance loss is less with a DFA matching function, and in both cases there is not much difference for \b.

When a pattern begins with .* not in atomic parentheses, nor in parentheses that are the subject of a backreference, and the PCRE2_DOTALL option is set, the pattern is implicitly anchored by PCRE2, since it can match only at the start of a subject string. If the pattern has multiple top-level branches, they must all be anchorable. The optimization can be disabled by the PCRE2_NO_DOTSTAR_ANCHOR option, and is automatically disabled if the pattern contains (*PRUNE) or (*SKIP).

If PCRE2_DOTALL is not set, PCRE2 cannot make this optimization, because the dot metacharacter does not then match a newline, and if the subject string contains newlines, the pattern may match from the character immediately following one of them instead of from the very start. For example, the pattern

  .*second
matches the subject "first\nand second" (where \n stands for a newline character), with the match starting at the seventh character. In order to do this, PCRE2 has to retry the match starting after every newline in the subject.

If you are using such a pattern with subject strings that do not contain newlines, the best performance is obtained by setting PCRE2_DOTALL, or starting the pattern with ^.* or ^.*? to indicate explicit anchoring. That saves PCRE2 from having to scan along the subject looking for a newline to restart at.

Beware of patterns that contain nested indefinite repeats. These can take a long time to run when applied to a string that does not match. Consider the pattern fragment

  ^(a+)*
This can match "aaaa" in 16 different ways, and this number increases very rapidly as the string gets longer. (The * repeat can match 0, 1, 2, 3, or 4 times, and for each of those cases other than 0 or 4, the + repeats can match different numbers of times.) When the remainder of the pattern is such that the entire match is going to fail, PCRE2 has in principle to try every possible variation, and this can take an extremely long time, even for relatively short strings.

An optimization catches some of the more simple cases such as

  (a+)*b
where a literal character follows. Before embarking on the standard matching procedure, PCRE2 checks that there is a "b" later in the subject string, and if there is not, it fails the match immediately. However, when there is no following literal this optimization cannot be used. You can see the difference by comparing the behaviour of
  (a+)*\d
with the pattern above. The pattern above, which ends with a literal "b", gives a failure almost instantly when applied to a whole line of "a" characters, whereas the version ending in \d takes an appreciable time with strings longer than about 20 characters.

In many cases, the solution to this kind of performance issue is to use an atomic group or a possessive quantifier. This can often reduce memory requirements as well. As another example, consider this pattern:

  ([^<]|<(?!inet))+
It matches from wherever it starts until it encounters "<inet" or the end of the data, and is the kind of pattern that might be used when processing an XML file. Each iteration of the outer parentheses matches either one character that is not "<" or a "<" that is not followed by "inet". However, each time a parenthesis is processed, a backtracking position is passed, so this formulation uses a memory frame for each matched character. For a long string, a lot of memory is required. Consider now this rewritten pattern, which matches exactly the same strings:
  ([^<]++|<(?!inet))+
This runs much faster, because sequences of characters that do not contain "<" are "swallowed" in one item inside the parentheses, and a possessive quantifier is used to stop any backtracking into the runs of non-"<" characters. This version also uses a lot less memory because entry to a new set of parentheses happens only when a "<" character that is not followed by "inet" is encountered (and we assume this is relatively rare).

This example shows that one way of optimizing performance when matching long subject strings is to write repeated parenthesized subpatterns to match more than one character whenever possible.

SETTING RESOURCE LIMITS

You can set limits on the amount of processing that takes place when matching, and on the amount of heap memory that is used. The default values of the limits are very large, and unlikely ever to operate. They can be changed when PCRE2 is built, and they can also be set when pcre2_match() or pcre2_dfa_match() is called. For details of these interfaces, see the pcre2build documentation and the section entitled "The match context" in the pcre2api documentation.

The pcre2test test program has a modifier called "find_limits" which, if applied to a subject line, causes it to find the smallest limits that allow a pattern to match. This is done by repeatedly matching with different limits.

AUTHOR

Philip Hazel
Retired from University Computing Service
Cambridge, England.

REVISION

Last updated: 06 December 2022
Copyright © 1997-2022 University of Cambridge.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2posix.html ================================================ pcre2posix specification

pcre2posix man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

#include <pcre2posix.h>

int pcre2_regcomp(regex_t *preg, const char *pattern, int cflags);

int pcre2_regexec(const regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags);

size_t pcre2_regerror(int errcode, const regex_t *preg, char *errbuf, size_t errbuf_size);

void pcre2_regfree(regex_t *preg);

DESCRIPTION

This set of functions provides a POSIX-style API for the PCRE2 regular expression 8-bit library. There are no POSIX-style wrappers for PCRE2's 16-bit and 32-bit libraries. See the pcre2api documentation for a description of PCRE2's native API, which contains much additional functionality.

IMPORTANT NOTE: The functions described here are NOT thread-safe, and should not be used in multi-threaded applications. They are also limited to processing subjects that are not bigger than 2GB. Use the native API instead.

These functions are wrapper functions that ultimately call the PCRE2 native API. Their prototypes are defined in the pcre2posix.h header file, and they all have unique names starting with pcre2_. However, the pcre2posix.h header also contains macro definitions that convert the standard POSIX names such as regcomp() into pcre2_regcomp() etc. This means that a program can use the usual POSIX names without running the risk of accidentally linking with POSIX functions from a different library.

On Unix-like systems the PCRE2 POSIX library is called libpcre2-posix, so can be accessed by adding -lpcre2-posix to the command for linking an application. Because the POSIX functions call the native ones, it is also necessary to add -lpcre2-8.

On Windows systems, if you are linking to a DLL version of the library, it is recommended that PCRE2POSIX_SHARED is defined before including the pcre2posix.h header, as it will allow for a more efficient way to invoke the functions by adding the __declspec(dllimport) decorator.

Although they were not defined as prototypes in pcre2posix.h, releases 10.33 to 10.36 of the library contained functions with the POSIX names regcomp() etc. These simply passed their arguments to the PCRE2 functions. These functions were provided for backwards compatibility with earlier versions of PCRE2, which had only POSIX names. However, this has proved troublesome in situations where a program links with several libraries, some of which use PCRE2's POSIX interface while others use the real POSIX functions. For this reason, the POSIX names have been removed since release 10.37.

Calling the header file pcre2posix.h avoids any conflict with other POSIX libraries. It can, of course, be renamed or aliased as regex.h, which is the "correct" name, if there is no clash. It provides two structure types, regex_t for compiled internal forms, and regmatch_t for returning captured substrings. It also defines some constants whose names start with "REG_"; these are used for setting options and identifying error codes.

USING THE POSIX FUNCTIONS

Note that these functions are just POSIX-style wrappers for PCRE2's native API. They do not give POSIX regular expression behaviour, and they are not thread-safe or even POSIX compatible.

Those POSIX option bits that can reasonably be mapped to PCRE2 native options have been implemented. In addition, the option REG_EXTENDED is defined with the value zero. This has no effect, but since programs that are written to the POSIX interface often use it, this makes it easier to slot in PCRE2 as a replacement library. Other POSIX options are not even defined.

There are also some options that are not defined by POSIX. These have been added at the request of users who want to make use of certain PCRE2-specific features via the POSIX calling interface or to add BSD or GNU functionality.

When PCRE2 is called via these functions, it is only the API that is POSIX-like in style. The syntax and semantics of the regular expressions themselves are still those of Perl, subject to the setting of various PCRE2 options, as described below. "POSIX-like in style" means that the API approximates to the POSIX definition; it is not fully POSIX-compatible, and in multi-unit encoding domains it is probably even less compatible.

The descriptions below use the actual names of the functions, but, as described above, the standard POSIX names (without the pcre2_ prefix) may also be used.

COMPILING A PATTERN

The function pcre2_regcomp() is called to compile a pattern into an internal form. By default, the pattern is a C string terminated by a binary zero (but see REG_PEND below). The preg argument is a pointer to a regex_t structure that is used as a base for storing information about the compiled regular expression. It is also used for input when REG_PEND is set. The regex_t structure used by pcre2_regcomp() is defined in pcre2posix.h and is not the same as the structure used by other libraries that provide POSIX-style matching.

The argument cflags is either zero, or contains one or more of the bits defined by the following macros:

  REG_DOTALL
The PCRE2_DOTALL option is set when the regular expression is passed for compilation to the native function. Note that REG_DOTALL is not part of the POSIX standard.
  REG_ICASE
The PCRE2_CASELESS option is set when the regular expression is passed for compilation to the native function.
  REG_NEWLINE
The PCRE2_MULTILINE option is set when the regular expression is passed for compilation to the native function. Note that this does not mimic the defined POSIX behaviour for REG_NEWLINE (see the following section).
  REG_NOSPEC
The PCRE2_LITERAL option is set when the regular expression is passed for compilation to the native function. This disables all meta characters in the pattern, causing it to be treated as a literal string. The only other options that are allowed with REG_NOSPEC are REG_ICASE, REG_NOSUB, REG_PEND, and REG_UTF. Note that REG_NOSPEC is not part of the POSIX standard.
  REG_NOSUB
When a pattern that is compiled with this flag is passed to pcre2_regexec() for matching, the nmatch and pmatch arguments are ignored, and no captured strings are returned. Versions of the PCRE2 library prior to 10.22 used to set the PCRE2_NO_AUTO_CAPTURE compile option, but this no longer happens because it disables the use of backreferences.
  REG_PEND
If this option is set, the re_endp field in the preg structure (which has the type const char *) must be set to point to the character beyond the end of the pattern before calling pcre2_regcomp(). The pattern itself may now contain binary zeros, which are treated as data characters. Without REG_PEND, a binary zero terminates the pattern and the re_endp field is ignored. This is a GNU extension to the POSIX standard and should be used with caution in software intended to be portable to other systems.
  REG_UCP
The PCRE2_UCP option is set when the regular expression is passed for compilation to the native function. This causes PCRE2 to use Unicode properties when matching \d, \w, etc., instead of just recognizing ASCII values. Note that REG_UCP is not part of the POSIX standard.
  REG_UNGREEDY
The PCRE2_UNGREEDY option is set when the regular expression is passed for compilation to the native function. Note that REG_UNGREEDY is not part of the POSIX standard.
  REG_UTF
The PCRE2_UTF option is set when the regular expression is passed for compilation to the native function. This causes the pattern itself and all data strings used for matching it to be treated as UTF-8 strings. Note that REG_UTF is not part of the POSIX standard.

In the absence of these flags, no options are passed to the native function. This means that the regex is compiled with PCRE2 default semantics. In particular, the way it handles newline characters in the subject string is the Perl way, not the POSIX way. Note that setting PCRE2_MULTILINE has only some of the effects specified for REG_NEWLINE. It does not affect the way newlines are matched by the dot metacharacter (they are not) or by a negative class such as [^a] (they are).

The yield of pcre2_regcomp() is zero on success, and non-zero otherwise. The preg structure is filled in on success, and one other member of the structure (as well as re_endp) is public: re_nsub contains the number of capturing subpatterns in the regular expression. Various error codes are defined in the header file.

NOTE: If the yield of pcre2_regcomp() is non-zero, you must not attempt to use the contents of the preg structure. If, for example, you pass it to pcre2_regexec(), the result is undefined and your program is likely to crash.

MATCHING NEWLINE CHARACTERS

This area is not simple, because POSIX and Perl take different views of things. It is not possible to get PCRE2 to obey POSIX semantics, but then PCRE2 was never intended to be a POSIX engine. The following table lists the different possibilities for matching newline characters in Perl and PCRE2:

                          Default   Change with

  . matches newline          no     PCRE2_DOTALL
  newline matches [^a]       yes    not changeable
  $ matches \n at end        yes    PCRE2_DOLLAR_ENDONLY
  $ matches \n in middle     no     PCRE2_MULTILINE
  ^ matches \n in middle     no     PCRE2_MULTILINE
This is the equivalent table for a POSIX-compatible pattern matcher:
                          Default   Change with

  . matches newline          yes    REG_NEWLINE
  newline matches [^a]       yes    REG_NEWLINE
  $ matches \n at end        no     REG_NEWLINE
  $ matches \n in middle     no     REG_NEWLINE
  ^ matches \n in middle     no     REG_NEWLINE
This behaviour is not what happens when PCRE2 is called via its POSIX API. By default, PCRE2's behaviour is the same as Perl's, except that there is no equivalent for PCRE2_DOLLAR_ENDONLY in Perl. In both PCRE2 and Perl, there is no way to stop newline from matching [^a].

Default POSIX newline handling can be obtained by setting PCRE2_DOTALL and PCRE2_DOLLAR_ENDONLY when calling pcre2_compile() directly, but there is no way to make PCRE2 behave exactly as for the REG_NEWLINE action. When using the POSIX API, passing REG_NEWLINE to PCRE2's pcre2_regcomp() function causes PCRE2_MULTILINE to be passed to pcre2_compile(), and REG_DOTALL passes PCRE2_DOTALL. There is no way to pass PCRE2_DOLLAR_ENDONLY.

MATCHING A PATTERN

The function pcre2_regexec() is called to match a compiled pattern preg against a given string, which is by default terminated by a zero byte (but see REG_STARTEND below), subject to the options in eflags. These can be:

  REG_NOTBOL
The PCRE2_NOTBOL option is set when calling the underlying PCRE2 matching function.
  REG_NOTEMPTY
The PCRE2_NOTEMPTY option is set when calling the underlying PCRE2 matching function. Note that REG_NOTEMPTY is not part of the POSIX standard. However, setting this option can give more POSIX-like behaviour in some situations.
  REG_NOTEOL
The PCRE2_NOTEOL option is set when calling the underlying PCRE2 matching function.
  REG_STARTEND
When this option is set, the subject string starts at string + pmatch[0].rm_so and ends at string + pmatch[0].rm_eo, which should point to the first character beyond the string. There may be binary zeros within the subject string, and indeed, using REG_STARTEND is the only way to pass a subject string that contains a binary zero.

Whatever the value of pmatch[0].rm_so, the offsets of the matched string and any captured substrings are still given relative to the start of string itself. (Before PCRE2 release 10.30 these were given relative to string + pmatch[0].rm_so, but this differs from other implementations.)

This is a BSD extension, compatible with but not specified by IEEE Standard 1003.2 (POSIX.2), and should be used with caution in software intended to be portable to other systems. Note that a non-zero rm_so does not imply REG_NOTBOL; REG_STARTEND affects only the location and length of the string, not how it is matched. Setting REG_STARTEND and passing pmatch as NULL are mutually exclusive; if both are done, the error REG_INVARG is returned.

If the pattern was compiled with the REG_NOSUB flag, no data about any matched strings is returned. The nmatch and pmatch arguments of pcre2_regexec() are ignored (except possibly as input for REG_STARTEND).

The value of nmatch may be zero, and the value of pmatch may be NULL (unless REG_STARTEND is set); in both these cases no data about any matched strings is returned.

Otherwise, the portion of the string that was matched, and also any captured substrings, are returned via the pmatch argument, which points to an array of nmatch structures of type regmatch_t, containing the members rm_so and rm_eo. These contain the byte offset to the first character of each substring and the offset to the first character after the end of each substring, respectively. The 0th element of the vector relates to the entire portion of string that was matched; subsequent elements relate to the capturing subpatterns of the regular expression. Unused entries in the array have both structure members set to -1.

regmatch_t as well as the regoff_t typedef it uses are defined in pcre2posix.h and are not warranted to have the same size or layout as other similarly named types from other libraries that provide POSIX-style matching.

A successful match yields a zero return; various error codes are defined in the header file, of which REG_NOMATCH is the "expected" failure code.

ERROR MESSAGES

The pcre2_regerror() function maps a non-zero errorcode from either pcre2_regcomp() or pcre2_regexec() to a printable message. If preg is not NULL, the error should have arisen from the use of that structure. A message terminated by a binary zero is placed in errbuf. If the buffer is too short, only the first errbuf_size - 1 characters of the error message are used. The yield of the function is the size of buffer needed to hold the whole message, including the terminating zero. This value is greater than errbuf_size if the message was truncated.

MEMORY USAGE

Compiling a regular expression causes memory to be allocated and associated with the preg structure. The function pcre2_regfree() frees all such memory, after which preg may no longer be used as a compiled expression.

AUTHOR

Philip Hazel
Retired from University Computing Service
Cambridge, England.

REVISION

Last updated: 27 November 2024
Copyright © 1997-2024 University of Cambridge.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2sample.html ================================================ pcre2sample specification

pcre2sample man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

PCRE2 SAMPLE PROGRAM

A simple, complete demonstration program to get you started with using PCRE2 is supplied in the file pcre2demo.c in the src directory in the PCRE2 distribution. A listing of this program is given in the pcre2demo documentation. If you do not have a copy of the PCRE2 distribution, you can save this listing to recreate the contents of pcre2demo.c.

The demonstration program compiles the regular expression that is its first argument, and matches it against the subject string in its second argument. No PCRE2 options are set, and default character tables are used. If matching succeeds, the program outputs the portion of the subject that matched, together with the contents of any captured substrings.

If the -g option is given on the command line, the program then goes on to check for further matches of the same regular expression in the same subject string. The logic is a little bit tricky because of the possibility of matching an empty string. Comments in the code explain what is going on.

The code in pcre2demo.c is an 8-bit program that uses the PCRE2 8-bit library. It handles strings and characters that are stored in 8-bit code units. By default, one character corresponds to one code unit, but if the pattern starts with "(*UTF)", both it and the subject are treated as UTF-8 strings, where characters may occupy multiple code units.

If PCRE2 is installed in the standard include and library directories for your operating system, you should be able to compile the demonstration program using a command like this:

  cc -o pcre2demo pcre2demo.c -lpcre2-8
If PCRE2 is installed elsewhere, you may need to add additional options to the command line. For example, on a Unix-like system that has PCRE2 installed in /usr/local, you can compile the demonstration program using a command like this:
  cc -o pcre2demo -I/usr/local/include pcre2demo.c -L/usr/local/lib -lpcre2-8
Once you have built the demonstration program, you can run simple tests like this:
  ./pcre2demo 'cat|dog' 'the cat sat on the mat'
  ./pcre2demo -g 'cat|dog' 'the dog sat on the cat'
  ./pcre2demo -i 'cat' 'the dog sat on the CAT'
Note that there is a much more comprehensive test program, called pcre2test, which supports many more facilities for testing regular expressions using all three PCRE2 libraries (8-bit, 16-bit, and 32-bit, though not all three need be installed). The pcre2demo program is provided as a relatively simple coding example.

If you try to run pcre2demo when PCRE2 is not installed in the standard library directory, you may get an error like this on some operating systems (e.g. Solaris):

  ld.so.1: pcre2demo: fatal: libpcre2-8.so.0: open failed: No such file or directory
This is caused by the way shared library support works on those systems. You need to add
  -R/usr/local/lib
(for example) to the compile command to get round this problem.

AUTHOR

Philip Hazel
Retired from University Computing Service
Cambridge, England.

REVISION

Last updated: 28 February 2025
Copyright © 1997-2016 University of Cambridge.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2serialize.html ================================================ pcre2serialize specification

pcre2serialize man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SAVING AND RE-USING PRECOMPILED PCRE2 PATTERNS

int32_t pcre2_serialize_decode(pcre2_code **codes, int32_t number_of_codes, const uint8_t *bytes, pcre2_general_context *gcontext);

int32_t pcre2_serialize_encode(const pcre2_code **codes, int32_t number_of_codes, uint8_t **serialized_bytes, PCRE2_SIZE *serialized_size, pcre2_general_context *gcontext);

void pcre2_serialize_free(uint8_t *bytes);

int32_t pcre2_serialize_get_number_of_codes(const uint8_t *bytes);

If you are running an application that uses a large number of regular expression patterns, it may be useful to store them in a precompiled form instead of having to compile them every time the application is run. However, if you are using the just-in-time optimization feature, it is not possible to save and reload the JIT data, because it is position-dependent. The host on which the patterns are reloaded must be running the same version of PCRE2, with the same code unit width, and must also have the same endianness, pointer width and PCRE2_SIZE type. For example, patterns compiled on a 32-bit system using PCRE2's 16-bit library cannot be reloaded on a 64-bit system, nor can they be reloaded using the 8-bit library.

Note that "serialization" in PCRE2 does not convert compiled patterns to an abstract format like Java or .NET serialization. The serialized output is really just a bytecode dump, which is why it can only be reloaded in the same environment as the one that created it. Hence the restrictions mentioned above. Applications that are not statically linked with a fixed version of PCRE2 must be prepared to recompile patterns from their sources, in order to be immune to PCRE2 upgrades.

SECURITY CONCERNS

The facility for saving and restoring compiled patterns is intended for use within individual applications. As such, the data supplied to pcre2_serialize_decode() is expected to be trusted data, not data from arbitrary external sources. There is only some simple consistency checking, not complete validation of what is being re-loaded. Corrupted data may cause undefined results. For example, if the length field of a pattern in the serialized data is corrupted, the deserializing code may read beyond the end of the byte stream that is passed to it.

SAVING COMPILED PATTERNS

Before compiled patterns can be saved they must be serialized, which in PCRE2 means converting the pattern to a stream of bytes. A single byte stream may contain any number of compiled patterns, but they must all use the same character tables. A single copy of the tables is included in the byte stream (its size is 1088 bytes). For more details of character tables, see the section on locale support in the pcre2api documentation.

The function pcre2_serialize_encode() creates a serialized byte stream from a list of compiled patterns. Its first two arguments specify the list, being a pointer to a vector of pointers to compiled patterns, and the length of the vector. The third and fourth arguments point to variables which are set to point to the created byte stream and its length, respectively. The final argument is a pointer to a general context, which can be used to specify custom memory management functions. If this argument is NULL, malloc() is used to obtain memory for the byte stream. The yield of the function is the number of serialized patterns, or one of the following negative error codes:

  PCRE2_ERROR_BADDATA      the number of patterns is zero or less
  PCRE2_ERROR_BADMAGIC     mismatch of id bytes in one of the patterns
  PCRE2_ERROR_NOMEMORY     memory allocation failed
  PCRE2_ERROR_MIXEDTABLES  the patterns do not all use the same tables
  PCRE2_ERROR_NULL         the 1st, 3rd, or 4th argument is NULL
PCRE2_ERROR_BADMAGIC means either that a pattern's code has been corrupted, or that a slot in the vector does not point to a compiled pattern.

Once a set of patterns has been serialized you can save the data in any appropriate manner. Here is sample code that compiles two patterns and writes them to a file. It assumes that the variable fd refers to a file that is open for output. The error checking that should be present in a real application has been omitted for simplicity.

  int errorcode;
  uint8_t *bytes;
  PCRE2_SIZE erroroffset;
  PCRE2_SIZE bytescount;
  pcre2_code *list_of_codes[2];
  list_of_codes[0] = pcre2_compile("first pattern",
    PCRE2_ZERO_TERMINATED, 0, &errorcode, &erroroffset, NULL);
  list_of_codes[1] = pcre2_compile("second pattern",
    PCRE2_ZERO_TERMINATED, 0, &errorcode, &erroroffset, NULL);
  errorcode = pcre2_serialize_encode(list_of_codes, 2, &bytes,
    &bytescount, NULL);
  errorcode = fwrite(bytes, 1, bytescount, fd);
Note that the serialized data is binary data that may contain any of the 256 possible byte values. On systems that make a distinction between binary and non-binary data, be sure that the file is opened for binary output.

Serializing a set of patterns leaves the original data untouched, so they can still be used for matching. Their memory must eventually be freed in the usual way by calling pcre2_code_free(). When you have finished with the byte stream, it too must be freed by calling pcre2_serialize_free(). If this function is called with a NULL argument, it returns immediately without doing anything.

RE-USING PRECOMPILED PATTERNS

In order to re-use a set of saved patterns you must first make the serialized byte stream available in main memory (for example, by reading from a file). The management of this memory block is up to the application. You can use the pcre2_serialize_get_number_of_codes() function to find out how many compiled patterns are in the serialized data without actually decoding the patterns:

  uint8_t *bytes = <serialized data>;
  int32_t number_of_codes = pcre2_serialize_get_number_of_codes(bytes);
The pcre2_serialize_decode() function reads a byte stream and recreates the compiled patterns in new memory blocks, setting pointers to them in a vector. The first two arguments are a pointer to a suitable vector and its length, and the third argument points to a byte stream. The final argument is a pointer to a general context, which can be used to specify custom memory management functions for the decoded patterns. If this argument is NULL, malloc() and free() are used. After deserialization, the byte stream is no longer needed and can be discarded.
  pcre2_code *list_of_codes[2];
  uint8_t *bytes = <serialized data>;
  int32_t number_of_codes =
    pcre2_serialize_decode(list_of_codes, 2, bytes, NULL);
If the vector is not large enough for all the patterns in the byte stream, it is filled with those that fit, and the remainder are ignored. The yield of the function is the number of decoded patterns, or one of the following negative error codes:
  PCRE2_ERROR_BADDATA    second argument is zero or less
  PCRE2_ERROR_BADMAGIC   mismatch of id bytes in the data
  PCRE2_ERROR_BADMODE    mismatch of code unit size or PCRE2 version
  PCRE2_ERROR_BADSERIALIZEDDATA  other sanity check failure
  PCRE2_ERROR_NOMEMORY   memory allocation failed
  PCRE2_ERROR_NULL       first or third argument is NULL
PCRE2_ERROR_BADMAGIC may mean that the data is corrupt, or that it was compiled on a system with different endianness.

Decoded patterns can be used for matching in the usual way, and must be freed by calling pcre2_code_free(). However, be aware that there is a potential race issue if you are using multiple patterns that were decoded from a single byte stream in a multithreaded application. A single copy of the character tables is used by all the decoded patterns and a reference count is used to arrange for its memory to be automatically freed when the last pattern is freed, but there is no locking on this reference count. Therefore, if you want to call pcre2_code_free() for these patterns in different threads, you must arrange your own locking, and ensure that pcre2_code_free() cannot be called by two threads at the same time.

If a pattern was processed by pcre2_jit_compile() before being serialized, the JIT data is discarded and so is no longer available after a save/restore cycle. You can, however, process a restored pattern with pcre2_jit_compile() if you wish.

AUTHOR

Philip Hazel
Retired from University Computing Service
Cambridge, England.

REVISION

Last updated: 19 January 2024
Copyright © 1997-2018 University of Cambridge.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2syntax.html ================================================ pcre2syntax specification

pcre2syntax man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

PCRE2 REGULAR EXPRESSION SYNTAX SUMMARY

The full syntax and semantics of the regular expression patterns that are supported by PCRE2 are described in the pcre2pattern documentation. This document contains a quick-reference summary of the pattern syntax followed by the syntax of replacement strings in the substitution function. The full description of the latter is in the pcre2api documentation.

QUOTING

  \x         where x is non-alphanumeric is a literal x
  \Q...\E    treat enclosed characters as literal
Note that white space inside \Q...\E is always treated as literal, even if PCRE2_EXTENDED is set, causing most other white space to be ignored. Note also that PCRE2's handling of \Q...\E has some differences from Perl's. See the pcre2pattern documentation for details.

BRACED ITEMS

With one exception, wherever brace characters { and } are required to enclose data for constructions such as \g{2} or \k{name}, space and/or horizontal tab characters that follow { or precede } are allowed and are ignored. In the case of quantifiers, they may also appear before or after the comma. The exception is \u{...} which is not Perl-compatible and is recognized only when PCRE2_EXTRA_ALT_BSUX is set. This is an ECMAScript compatibility feature, and follows ECMAScript's behaviour.

ESCAPED CHARACTERS

This table applies to ASCII and Unicode environments. An unrecognized escape sequence causes an error.

  \a         alarm, that is, the BEL character (hex 07)
  \cx        "control-x", where x is a non-control ASCII character
  \e         escape (hex 1B)
  \f         form feed (hex 0C)
  \n         newline (hex 0A)
  \r         carriage return (hex 0D)
  \t         tab (hex 09)
  \0dd       character with octal code 0dd
  \ddd       character with octal code ddd, or backreference
  \o{ddd..}  character with octal code ddd..
  \N{U+hh..} character with Unicode code point hh.. (Unicode mode only)
  \xhh       character with hex code hh
  \x{hh..}   character with hex code hh..
\N{U+hh..} is synonymous with \x{hh..} but is not supported in environments that use EBCDIC code (mainly IBM mainframes). Note that \N not followed by an opening curly bracket has a different meaning (see below).

If PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set ("ALT_BSUX mode"), the following are also recognized:

  \U         the character "U"
  \uhhhh     character with hex code hhhh
  \u{hh..}   character with hex code hh.. but only for EXTRA_ALT_BSUX
When \x is not followed by {, one or two hexadecimal digits are read, but in ALT_BSUX mode \x must be followed by two hexadecimal digits to be recognized as a hexadecimal escape; otherwise it matches a literal "x". Likewise, if \u (in ALT_BSUX mode) is not followed by four hexadecimal digits or (in EXTRA_ALT_BSUX mode) a sequence of hex digits in curly brackets, it matches a literal "u".

Note that \0dd is always an octal code. The treatment of backslash followed by a non-zero digit is complicated; for details see the section "Non-printing characters" in the pcre2pattern documentation, where details of escape processing in EBCDIC environments are also given.

CHARACTER TYPES

  .          any character except newline;
               in dotall mode, any character whatsoever
  \C         one code unit, even in UTF mode (best avoided)
  \d         a decimal digit
  \D         a character that is not a decimal digit
  \h         a horizontal white space character
  \H         a character that is not a horizontal white space character
  \N         a character that is not a newline
  \p{xx}     a character with the xx property
  \P{xx}     a character without the xx property
  \R         a newline sequence
  \s         a white space character
  \S         a character that is not a white space character
  \v         a vertical white space character
  \V         a character that is not a vertical white space character
  \w         a "word" character
  \W         a "non-word" character
  \X         a Unicode extended grapheme cluster
\C is dangerous because it may leave the current matching point in the middle of a UTF-8 or UTF-16 character. The application can lock out the use of \C by setting the PCRE2_NEVER_BACKSLASH_C option. It is also possible to build PCRE2 with the use of \C permanently disabled.

By default, \d, \s, and \w match only ASCII characters, even in UTF-8 mode or in the 16-bit and 32-bit libraries. However, if locale-specific matching is happening, \s and \w may also match characters with code points in the range 128-255. If the PCRE2_UCP option is set, the behaviour of these escape sequences is changed to use Unicode properties and they match many more characters, but there are some option settings that can restrict individual sequences to matching only ASCII characters.

Property descriptions in \p and \P are matched caselessly; hyphens, underscores, and ASCII white space characters are ignored, in accordance with Unicode's "loose matching" rules. For example, \p{Bidi_Class=al} is the same as \p{ bidi class = AL }.

GENERAL CATEGORY PROPERTIES FOR \p and \P

  C          Other
  Cc         Control
  Cf         Format
  Cn         Unassigned
  Co         Private use
  Cs         Surrogate

  L          Letter
  Lc         Cased letter, the union of Ll, Lu, and Lt
  L&         Synonym of Lc
  Ll         Lower case letter
  Lm         Modifier letter
  Lo         Other letter
  Lt         Title case letter
  Lu         Upper case letter

  M          Mark
  Mc         Spacing mark
  Me         Enclosing mark
  Mn         Non-spacing mark

  N          Number
  Nd         Decimal number
  Nl         Letter number
  No         Other number

  P          Punctuation
  Pc         Connector punctuation
  Pd         Dash punctuation
  Pe         Close punctuation
  Pf         Final punctuation
  Pi         Initial punctuation
  Po         Other punctuation
  Ps         Open punctuation

  S          Symbol
  Sc         Currency symbol
  Sk         Modifier symbol
  Sm         Mathematical symbol
  So         Other symbol

  Z          Separator
  Zl         Line separator
  Zp         Paragraph separator
  Zs         Space separator
From release 10.45, when caseless matching is set, Ll, Lu, and Lt are all equivalent to Lc.

PCRE2 SPECIAL CATEGORY PROPERTIES FOR \p and \P

  Xan        Alphanumeric: union of properties L and N
  Xps        POSIX space: property Z or tab, NL, VT, FF, CR
  Xsp        Perl space: property Z or tab, NL, VT, FF, CR
  Xuc        Universally-named character: one that can be
               represented by a Universal Character Name
  Xwd        Perl word: property Xan or underscore
Perl and POSIX space are now the same. Perl added VT to its space character set at release 5.18.

BINARY PROPERTIES FOR \p AND \P

Unicode defines a number of binary properties, that is, properties whose only values are true or false. You can obtain a list of those that are recognized by \p and \P, along with their abbreviations, by running this command:

  pcre2test -LP

SCRIPT MATCHING WITH \p AND \P

Many script names and their 4-letter abbreviations are recognized in \p{sc:...} or \p{scx:...} items, or on their own with \p (and also \P of course). You can obtain a list of these scripts by running this command:

  pcre2test -LS

THE BIDI_CLASS PROPERTY FOR \p AND \P

  \p{Bidi_Class:<class>}   matches a character with the given class
  \p{BC:<class>}           matches a character with the given class
The recognized classes are:
  AL          Arabic letter
  AN          Arabic number
  B           paragraph separator
  BN          boundary neutral
  CS          common separator
  EN          European number
  ES          European separator
  ET          European terminator
  FSI         first strong isolate
  L           left-to-right
  LRE         left-to-right embedding
  LRI         left-to-right isolate
  LRO         left-to-right override
  NSM         non-spacing mark
  ON          other neutral
  PDF         pop directional format
  PDI         pop directional isolate
  R           right-to-left
  RLE         right-to-left embedding
  RLI         right-to-left isolate
  RLO         right-to-left override
  S           segment separator
  WS          white space

CHARACTER CLASSES

  [...]       positive character class
  [^...]      negative character class
  [x-y]       range (can be used for hex characters)
  [[:xxx:]]   positive POSIX named set
  [[:^xxx:]]  negative POSIX named set

  alnum       alphanumeric
  alpha       alphabetic
  ascii       0-127
  blank       space or tab
  cntrl       control character
  digit       decimal digit
  graph       printing, excluding space
  lower       lower case letter
  print       printing, including space
  punct       printing, excluding alphanumeric
  space       white space
  upper       upper case letter
  word        same as \w
  xdigit      hexadecimal digit
In PCRE2, POSIX character set names recognize only ASCII characters by default, but some of them use Unicode properties if PCRE2_UCP is set. You can use \Q...\E inside a character class.

When PCRE2_ALT_EXTENDED_CLASS is set, UTS#18 extended character classes may be used, allowing nested character classes, combined using set operators.

  [x&&[^y]]   UTS#18 extended character class

  x||y        set union (OR)
  x&&y        set intersection (AND)
  x--y        set difference (AND NOT)
  x~~y        set symmetric difference (XOR)

PERL EXTENDED CHARACTER CLASSES

  (?[...])                Perl extended character class
  (?[\p{Thai} & \p{Nd}])  operators; white space ignored
  (?[(x - y) & z])        parentheses for grouping

  (?[ [^3] & \p{Nd} ])    [...] is a nested ordinary class
  (?[ [:alpha:] - [z] ])  POSIX set is allowed outside [...]
  (?[ \d - [3] ])         backslash-escaped set is allowed outside [...]
  (?[ !\n & [:ascii:] ])  backslash-escaped character is allowed outside [...]
                      all other characters or ranges must be enclosed in [...]

  x|y, x+y                set union (OR)
  x&y                     set intersection (AND)
  x-y                     set difference (AND NOT)
  x^y                     set symmetric difference (XOR)
  !x                      set complement (NOT)
Inside a Perl extended character class, [...] switches mode to be interpreted as an ordinary character class. Outside of a nested [...], the only items permitted are backslash-escapes, POSIX sets, operators, and parentheses. Inside a nested ordinary class, ^ has its usual meaning (inverts the class when used as the first character); outside of a nested class, ^ is the XOR operator.

QUANTIFIERS

  ?           0 or 1, greedy
  ?+          0 or 1, possessive
  ??          0 or 1, lazy
  *           0 or more, greedy
  *+          0 or more, possessive
  *?          0 or more, lazy
  +           1 or more, greedy
  ++          1 or more, possessive
  +?          1 or more, lazy
  {n}         exactly n
  {n,m}       at least n, no more than m, greedy
  {n,m}+      at least n, no more than m, possessive
  {n,m}?      at least n, no more than m, lazy
  {n,}        n or more, greedy
  {n,}+       n or more, possessive
  {n,}?       n or more, lazy
  {,m}        zero up to m, greedy
  {,m}+       zero up to m, possessive
  {,m}?       zero up to m, lazy

ANCHORS AND SIMPLE ASSERTIONS

  \b          word boundary
  \B          not a word boundary
  ^           start of subject
                also after an internal newline in multiline mode
                (after any newline if PCRE2_ALT_CIRCUMFLEX is set)
  \A          start of subject
  $           end of subject
                also before newline at end of subject
                also before internal newline in multiline mode
  \Z          end of subject
                also before newline at end of subject
  \z          end of subject
  \G          first matching position in subject

REPORTED MATCH POINT SETTING

  \K          set reported start of match
From release 10.38 \K is not permitted by default in lookaround assertions, for compatibility with Perl. However, if the PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK option is set, the previous behaviour is re-enabled. When this option is set, \K is honoured in positive assertions, but ignored in negative ones.

ALTERNATION

  expr|expr|expr...

CAPTURING

  (...)           capture group
  (?<name>...)    named capture group (Perl)
  (?'name'...)    named capture group (Perl)
  (?P<name>...)   named capture group (Python)
  (?:...)         non-capture group
  (?|...)         non-capture group; reset group numbers for
                   capture groups in each alternative
In non-UTF modes, names may contain underscores and ASCII letters and digits; in UTF modes, any Unicode letters and Unicode decimal digits are permitted. In both cases, a name must not start with a digit.

ATOMIC GROUPS

  (?>...)         atomic non-capture group
  (*atomic:...)   atomic non-capture group

COMMENT

  (?#....)        comment (not nestable)

OPTION SETTING

Changes of these options within a group are automatically cancelled at the end of the group.

  (?a)            all ASCII options
  (?aD)           restrict \d to ASCII in UCP mode
  (?aS)           restrict \s to ASCII in UCP mode
  (?aW)           restrict \w to ASCII in UCP mode
  (?aP)           restrict all POSIX classes to ASCII in UCP mode
  (?aT)           restrict POSIX digit classes to ASCII in UCP mode
  (?i)            caseless
  (?J)            allow duplicate named groups
  (?m)            multiline
  (?n)            no auto capture
  (?r)            restrict caseless to either ASCII or non-ASCII
  (?s)            single line (dotall)
  (?U)            default ungreedy (lazy)
  (?x)            ignore white space except in classes or \Q...\E
  (?xx)           as (?x) but also ignore space and tab in classes
  (?-...)         unset the given option(s)
  (?^)            unset imnrsx options
(?aP) implies (?aT) as well, though this has no additional effect. However, it means that (?-aP) also implies (?-aT) and disables all ASCII restrictions for POSIX classes.

Unsetting x or xx unsets both. Several options may be set at once, and a mixture of setting and unsetting such as (?i-x) is allowed, but there may be only one hyphen. Setting (but not unsetting) is allowed after (?^, for example (?^in). An option setting may appear at the start of a non-capture group, for example (?i:...).

The following are recognized only at the very start of a pattern or after one of the newline or \R sequences or options with similar syntax. More than one of them may appear. For the first three, d is a decimal number.

  (*LIMIT_DEPTH=d)     set the backtracking limit to d
  (*LIMIT_HEAP=d)      set the heap size limit to d * 1024 bytes
  (*LIMIT_MATCH=d)     set the match limit to d
  (*CASELESS_RESTRICT) set PCRE2_EXTRA_CASELESS_RESTRICT when matching
  (*NOTEMPTY)          set PCRE2_NOTEMPTY when matching
  (*NOTEMPTY_ATSTART)  set PCRE2_NOTEMPTY_ATSTART when matching
  (*NO_AUTO_POSSESS)   no auto-possessification (PCRE2_NO_AUTO_POSSESS)
  (*NO_DOTSTAR_ANCHOR) no .* anchoring (PCRE2_NO_DOTSTAR_ANCHOR)
  (*NO_JIT)            disable JIT optimization
  (*NO_START_OPT)      no start-match optimization (PCRE2_NO_START_OPTIMIZE)
  (*TURKISH_CASING)    set PCRE2_EXTRA_TURKISH_CASING when matching
  (*UTF)               set appropriate UTF mode for the library in use
  (*UCP)               set PCRE2_UCP (use Unicode properties for \d etc)
Note that LIMIT_DEPTH, LIMIT_HEAP, and LIMIT_MATCH can only reduce the value of the limits set by the caller of pcre2_match() or pcre2_dfa_match(), not increase them. LIMIT_RECURSION is an obsolete synonym for LIMIT_DEPTH. The application can lock out the use of (*UTF) and (*UCP) by setting the PCRE2_NEVER_UTF or PCRE2_NEVER_UCP options, respectively, at compile time.

NEWLINE CONVENTION

These are recognized only at the very start of the pattern or after option settings with a similar syntax.

  (*CR)           carriage return only
  (*LF)           linefeed only
  (*CRLF)         carriage return followed by linefeed
  (*ANYCRLF)      all three of the above
  (*ANY)          any Unicode newline sequence
  (*NUL)          the NUL character (binary zero)

WHAT \R MATCHES

These are recognized only at the very start of the pattern or after option settings with a similar syntax.

  (*BSR_ANYCRLF)  CR, LF, or CRLF
  (*BSR_UNICODE)  any Unicode newline sequence

LOOKAHEAD AND LOOKBEHIND ASSERTIONS

  (?=...)                     )
  (*pla:...)                  ) positive lookahead
  (*positive_lookahead:...)   )

  (?!...)                     )
  (*nla:...)                  ) negative lookahead
  (*negative_lookahead:...)   )

  (?<=...)                    )
  (*plb:...)                  ) positive lookbehind
  (*positive_lookbehind:...)  )

  (?<!...)                    )
  (*nlb:...)                  ) negative lookbehind
  (*negative_lookbehind:...)  )
Each top-level branch of a lookbehind must have a limit for the number of characters it matches. If any branch can match a variable number of characters, the maximum for each branch is limited to a value set by the caller of pcre2_compile() or defaulted. The default is set when PCRE2 is built (ultimate default 255). If every branch matches a fixed number of characters, the limit for each branch is 65535 characters.

NON-ATOMIC LOOKAROUND ASSERTIONS

These assertions are specific to PCRE2 and are not Perl-compatible.

  (?*...)                                )
  (*napla:...)                           ) synonyms
  (*non_atomic_positive_lookahead:...)   )

  (?<*...)                               )
  (*naplb:...)                           ) synonyms
  (*non_atomic_positive_lookbehind:...)  )

SUBSTRING SCAN ASSERTION

This feature is not Perl-compatible.

  (*scan_substring:(grouplist)...)  scan captured substring
  (*scs:(grouplist)...)             scan captured substring
The comma-separated list "grouplist" may identify groups in any of the following ways:
  n       absolute reference
  +n      relative reference
  -n      relative reference
  <name>  name
  'name'  name

SCRIPT RUNS

  (*script_run:...)           ) script run, can be backtracked into
  (*sr:...)                   )

  (*atomic_script_run:...)    ) atomic script run
  (*asr:...)                  )

BACKREFERENCES

  \n              reference by number (can be ambiguous)
  \gn             reference by number
  \g{n}           reference by number
  \g+n            relative reference by number (PCRE2 extension)
  \g-n            relative reference by number
  \g{+n}          relative reference by number (PCRE2 extension)
  \g{-n}          relative reference by number
  \k<name>        reference by name (Perl)
  \k'name'        reference by name (Perl)
  \g{name}        reference by name (Perl)
  \k{name}        reference by name (.NET)
  (?P=name)       reference by name (Python)

SUBROUTINE REFERENCES (POSSIBLY RECURSIVE)

  (?R)            recurse whole pattern
  (?n)            call subroutine by absolute number
  (?+n)           call subroutine by relative number
  (?-n)           call subroutine by relative number
  (?&name)        call subroutine by name (Perl)
  (?P>name)       call subroutine by name (Python)
  \g<name>        call subroutine by name (Oniguruma)
  \g'name'        call subroutine by name (Oniguruma)
  \g<n>           call subroutine by absolute number (Oniguruma)
  \g'n'           call subroutine by absolute number (Oniguruma)
  \g<+n>          call subroutine by relative number (PCRE2 extension)
  \g'+n'          call subroutine by relative number (PCRE2 extension)
  \g<-n>          call subroutine by relative number (PCRE2 extension)
  \g'-n'          call subroutine by relative number (PCRE2 extension)
The variants using parentheses (?...) may also specify a list of capture groups to return, which shall be retained in the calling subexpression if set during the recursion (this feature is not supported by Perl).
  (?R(grouplist))       recurse whole pattern, returning capture groups
                          (PCRE2 extension)
  (?n(grouplist))       )
  (?+n(grouplist))      ) call subroutine, returning capture groups
  (?-n(grouplist))      )   (PCRE2 extension)
  (?&name(grouplist))   )
  (?P>name(grouplist))  )
The comma-separated list "grouplist" uses the same syntax as (*scan_substring:(grouplist)...), and may identify groups in any of the following ways:
  n       absolute reference
  +n      relative reference
  -n      relative reference
  <name>  name
  'name'  name

CONDITIONAL PATTERNS

  (?(condition)yes-pattern)
  (?(condition)yes-pattern|no-pattern)

  (?(n)                absolute reference condition
  (?(+n)               relative reference condition (PCRE2 extension)
  (?(-n)               relative reference condition (PCRE2 extension)
  (?(<name>)           named reference condition (Perl)
  (?('name')           named reference condition (Perl)
  (?(name)             named reference condition (PCRE2, deprecated)
  (?(R)                overall recursion condition
  (?(Rn)               specific numbered group recursion condition
  (?(R&name)           specific named group recursion condition
  (?(DEFINE)           define groups for reference
  (?(VERSION[>]=n[.m]) test PCRE2 version
  (?(assert)           assertion condition
Note the ambiguity of (?(R) and (?(Rn) which might be named reference conditions or recursion tests. Such a condition is interpreted as a reference condition if the relevant named group exists.

The parts shown within square brackets in the VERSION conditional syntax may be omitted. If the fractional part of the version number is omitted, it defaults to 0.

BACKTRACKING CONTROL

All backtracking control verbs may be in the form (*VERB:NAME). For (*MARK) the name is mandatory, for the others it is optional. (*SKIP) changes its behaviour if :NAME is present. The others just set a name for passing back to the caller, but this is not a name that (*SKIP) can see. The following act immediately they are reached:

  (*ACCEPT)       force successful match
  (*FAIL)         force backtrack; synonym (*F)
  (*MARK:NAME)    set name to be passed back; synonym (*:NAME)
The following act only when a subsequent match failure causes a backtrack to reach them. They all force a match failure, but they differ in what happens afterwards. Those that advance the start-of-match point do so only if the pattern is not anchored.
  (*COMMIT)       overall failure, no advance of starting point
  (*PRUNE)        advance to next starting character
  (*SKIP)         advance to current matching position
  (*SKIP:NAME)    advance to position corresponding to an earlier
                  (*MARK:NAME); if not found, the (*SKIP) is ignored
  (*THEN)         local failure, backtrack to next alternation
The effect of one of these verbs in a group called as a subroutine is confined to the subroutine call.

CALLOUTS

  (?C)            callout (assumed number 0)
  (?Cn)           callout with numerical data n
  (?C"text")      callout with string data
The allowed string delimiters are ` ' " ^ % # $ (which are the same for the start and the end), and the starting delimiter { matched with the ending delimiter }. To encode the ending delimiter within the string, double it.

REPLACEMENT STRINGS

If the PCRE2_SUBSTITUTE_LITERAL option is set, a replacement string for pcre2_substitute() is not interpreted. Otherwise, by default, the only special character is the dollar character in one of the following forms:

  $$                  insert a dollar character
  $n or ${n}          insert the contents of group n
  $<name>             insert the contents of named group
  $0 or $&            insert the entire matched substring
  $`                  insert the substring that precedes the match
  $'                  insert the substring that follows the match
  $_                  insert the entire input string
  $+                  insert the highest-numbered capture group which matched
  $*MARK or ${*MARK}  insert a control verb name
For ${n}, n can be a name or a number. If PCRE2_SUBSTITUTE_EXTENDED is set, there is additional interpretation:

1. Backslash is an escape character, and the forms described in "ESCAPED CHARACTERS" above are recognized. Also:

  \Q...\E can be used to suppress interpretation
  \l      force the next character to lower case
  \u      force the next character to upper case
  \L      force subsequent characters to lower case
  \U      force subsequent characters to upper case
  \u\L    force next character to upper case, then all lower
  \l\U    force next character to lower case, then all upper
  \E      end \L or \U case forcing
  \b      backspace character (note: as in character class in pattern)
  \v      vertical tab character (note: not the same as in a pattern)
2. The Python form \g<n>, where the angle brackets are part of the syntax and n is either a group name or a number, is recognized as an alternative way of inserting the contents of a group, for example \g<3>.

3. Capture substitution supports the following additional forms:

  ${n:-string}             default for unset group
  ${n:+string1:string2}    values for set/unset group
The substitution strings themselves are expanded. Backslash can be used to escape colons and closing curly brackets.

SEE ALSO

pcre2pattern(3), pcre2api(3), pcre2callout(3), pcre2matching(3), pcre2(3).

AUTHOR

Philip Hazel
Retired from University Computing Service
Cambridge, England.

REVISION

Last updated: 14 October 2025
Copyright © 1997-2024 University of Cambridge.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2test.html ================================================ pcre2test specification

pcre2test man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

SYNOPSIS

pcre2test [options] [input file [output file]]

pcre2test is a test program for the PCRE2 regular expression libraries, but it can also be used for experimenting with regular expressions. This document describes the features of the test program; for details of the regular expressions themselves, see the pcre2pattern documentation. For details of the PCRE2 library function calls and their options, see the pcre2api documentation.

The input for pcre2test is a sequence of regular expression patterns and subject strings to be matched. There are also command lines for setting defaults and controlling some special actions. The output shows the result of each match attempt. Modifiers on external or internal command lines, the patterns, and the subject lines specify PCRE2 function options, control how the subject is processed, and what output is produced.

There are many obscure modifiers, some of which are specifically designed for use in conjunction with the test script and data files that are distributed as part of PCRE2. All the modifiers are documented here, some without much justification, but many of them are unlikely to be of use except when testing the libraries.

PCRE2's 8-BIT, 16-BIT AND 32-BIT LIBRARIES

Different versions of the PCRE2 library can be built to support character strings that are encoded in 8-bit, 16-bit, or 32-bit code units. One, two, or all three of these libraries may be simultaneously installed. The pcre2test program can be used to test all the libraries. However, its own input and output are always in 8-bit format. When testing the 16-bit or 32-bit libraries, patterns and subject strings are converted to 16-bit or 32-bit format before being passed to the library functions. Results are converted back to 8-bit code units for output.

In the rest of this document, the names of library functions and structures are given in generic form, for example, pcre2_compile(). The actual names used in the libraries have a suffix _8, _16, or _32, as appropriate.

INPUT ENCODING

Input to pcre2test is processed line by line, either by calling the C library's fgets() function, or via the libreadline or libedit library. In some Windows environments character 26 (hex 1A) causes an immediate end of file, and no further data is read, so this character should be avoided unless you really want that action.

The input is processed using C's string functions, so must not contain binary zeros, even though in Unix-like environments, fgets() treats any bytes other than newline as data characters. An error is generated if a binary zero is encountered. By default subject lines are processed for backslash escapes, which makes it possible to include any data value in strings that are passed to the library for matching. For patterns, there is a facility for specifying some or all of the 8-bit input characters as hexadecimal pairs, which makes it possible to include binary zeros.

Input for the 16-bit and 32-bit libraries

When testing the 16-bit or 32-bit libraries, there is a need to be able to generate character code points greater than 255 in the strings that are passed to the library. For subject lines and some patterns, backslash escapes can be used. In addition, when the utf modifier (see "Setting compilation options" below) is set, the pattern and any following subject lines are interpreted as UTF-8 strings and translated to UTF-16 or UTF-32 as appropriate.

For non-UTF testing of wide characters, the utf8_input modifier can be used. This is mutually exclusive with utf, and is allowed only in 16-bit or 32-bit mode. It causes the pattern and following subject lines to be treated as UTF-8 according to the original definition (RFC 2279), which allows for character values up to 0x7fffffff. Each character is placed in one 16-bit or 32-bit code unit (in the 16-bit case, values greater than 0xffff cause an error to occur).

UTF-8 (in its original definition) is not capable of encoding values greater than 0x7fffffff, but such values can be handled by the 32-bit library. When testing this library in non-UTF mode with utf8_input set, if any character is preceded by the byte 0xff (which is an invalid byte in UTF-8) 0x80000000 is added to the character's value. For subject strings, using an escape sequence is preferable.

COMMAND LINE OPTIONS

-8 If the 8-bit library has been built, this option causes it to be used (this is the default). If the 8-bit library has not been built, this option causes an error.

-16 If the 16-bit library has been built, this option causes it to be used. If the 8-bit library has not been built, this is the default. If the 16-bit library has not been built, this option causes an error.

-32 If the 32-bit library has been built, this option causes it to be used. If no other library has been built, this is the default. If the 32-bit library has not been built, this option causes an error.

-ac Behave as if each pattern has the auto_callout modifier, that is, insert automatic callouts into every pattern that is compiled.

-AC As for -ac, but in addition behave as if each subject line has the callout_extra modifier, that is, show additional information from callouts.

-b Behave as if each pattern has the fullbincode modifier; the full internal binary form of the pattern is output after compilation.

-C Output the version number of the PCRE2 library, and all available information about the optional features that are included, and then exit with zero exit code. All other options are ignored. If both -C and any -Lx options are present, whichever is first is recognized.

-C option Output information about a specific build-time option, then exit. This functionality is intended for use in scripts such as RunTest. The following options output the value and set the exit code as indicated:

  linksize   the configured internal link size (2, 3, or 4)
               exit code is set to the link size
  newline    the default newline setting:
               CR, LF, CRLF, ANYCRLF, ANY, or NUL
               exit code is always 0
  bsr        the default setting for what \R matches:
               ANYCRLF or ANY
               exit code is always 0
The following options output 1 for true or 0 for false, and set the exit code to the same value:
  backslash-C  \C is supported (not locked out)
  ebcdic       compiled for an EBCDIC environment
  ebcdic-io    if PCRE2 is compiled for EBCDIC, whether pcre2test's input and
                 output is EBCDIC or ASCII
  ebcdic-nl25  if PCRE2 is compiled for EBCDIC, whether NL (= LF) is 0x25
                 (otherwise it is 0x15, the default)
  jit          just-in-time support is available
  pcre2-16     the 16-bit library was built
  pcre2-32     the 32-bit library was built
  pcre2-8      the 8-bit library was built
  unicode      Unicode support is available
Note that the availability of JIT support in the library does not guarantee that it can actually be used because in some environments it is unable to allocate executable memory. The option "jitusable" gives more detailed information. It returns one of the following values:
  0  JIT is available and usable
  1  JIT is available but cannot allocate executable memory
  2  JIT is not available
  3  Unexpected return from test call to pcre2_jit_compile()
If an unknown option is given, an error message is output; the exit code is 0.

--colo[u]r[=<always,auto,never>] By default, the output is coloured if the output file is a terminal (auto). Force or suppress output of ANSI colour escapes with always and never respectively.

-d Behave as if each pattern has the debug modifier; the internal form and information about the compiled pattern is output after compilation; -d is equivalent to -b -i.

-dfa Behave as if each subject line has the dfa modifier; matching is done using the pcre2_dfa_match() function instead of the default pcre2_match().

-E Run in "preprocess only" mode (similar to "gcc -E"). The "#if ... #endif" commands are processed, and all other lines are printed verbatim.

-error number[,number,...] Call pcre2_get_error_message() for each of the error numbers in the comma-separated list, display the resulting messages on the standard output, then exit with zero exit code. The numbers may be positive or negative. This is a convenience facility for PCRE2 maintainers.

-help Output a brief summary of these options and then exit.

-i Behave as if each pattern has the info modifier; information about the compiled pattern is given after compilation.

-jit Behave as if each pattern line has the jit modifier; after successful compilation, each pattern is passed to the just-in-time compiler, if available.

-jitfast Behave as if each pattern line has the jitfast modifier; after successful compilation, each pattern is passed to the just-in-time compiler, if available, and each subject line is passed directly to the JIT matcher via its "fast path".

-jitverify Behave as if each pattern line has the jitverify modifier; after successful compilation, each pattern is passed to the just-in-time compiler, if available, and the use of JIT for matching is verified.

-LM List modifiers: write a list of available pattern and subject modifiers to the standard output, then exit with zero exit code. All other options are ignored. If both -C and any -Lx options are present, whichever is first is recognized.

-LP List properties: write a list of recognized Unicode properties to the standard output, then exit with zero exit code. All other options are ignored. If both -C and any -Lx options are present, whichever is first is recognized.

-LS List scripts: write a list of recognized Unicode script names to the standard output, then exit with zero exit code. All other options are ignored. If both -C and any -Lx options are present, whichever is first is recognized.

-malloc Exercise malloc() failures, by first counting the number of calls made to malloc during pattern compilation and matching, then re-running the compilation and matching that many times, exercising a failure of each malloc() call.

-pattern modifier-list Behave as if each pattern line contains the given modifiers.

-q Do not output the version number of pcre2test at the start of execution.

-S size On Unix-like systems, set the size of the run-time stack to size mebibytes (units of 1024*1024 bytes).

-subject modifier-list Behave as if each subject line contains the given modifiers.

-t Run each compile and match many times with a timer, and output the resulting times per compile or match. When JIT is used, separate times are given for the initial compile and the JIT compile. You can control the number of iterations that are used for timing by following -t with a number (as a separate item on the command line). For example, "-t 1000" iterates 1000 times. The default is to iterate 500,000 times.

-tm This is like -t except that it times only the matching phase, not the compile phase.

-T -TM These behave like -t and -tm, but in addition, at the end of a run, the total times for all compiles and matches are output.

-unittest Run a fixed set of additional tests of the PCRE2 API which are not driven by the test input files, and then exit.

-version Output the PCRE2 version number and then exit.

DESCRIPTION

If pcre2test is given two filename arguments, it reads from the first and writes to the second. If the first name is "-", input is taken from the standard input. If pcre2test is given only one argument, it reads from that file and writes to stdout. Otherwise, it reads from stdin and writes to stdout.

When pcre2test is built, a configuration option can specify that it should be linked with the libreadline or libedit library. When this is done, if the input is from a terminal, it is read using the readline() function. This provides line-editing and history facilities. The output from the -help option states whether or not readline() will be used.

The program handles any number of tests, each of which consists of a set of input lines. Each set starts with a regular expression pattern, followed by any number of subject lines to be matched against that pattern. In between sets of test data, command lines that begin with # may appear. This file format, with some restrictions, can also be processed by the perltest.sh script that is distributed with PCRE2 as a means of checking that the behaviour of PCRE2 and Perl is the same. For a specification of perltest.sh, see the comments near its beginning. See also the #perltest command below.

When the input is a terminal, pcre2test prompts for each line of input, using "re>" to prompt for regular expression patterns, and "data>" to prompt for subject lines. Command lines starting with # can be entered only in response to the "re>" prompt.

Each subject line is matched separately and independently. If you want to do multi-line matches, you have to use the \n escape sequence (or \r or \r\n, etc., depending on the newline setting) in a single line of input to encode the newline sequences. There is no limit on the length of subject lines; the input buffer is automatically extended if it is too small. There are replication features that make it possible to generate long repetitive pattern or subject lines without having to supply them explicitly.

An empty line or the end of the file signals the end of the subject lines for a test, at which point a new pattern or command line is expected if there is still input to be read.

COMMAND LINES

In between sets of test data, a line that begins with # is interpreted as a command line. If the first character is followed by white space or an exclamation mark, the line is treated as a comment, and ignored. Otherwise, the following commands are recognized:

  #forbid_utf
Subsequent patterns automatically have the PCRE2_NEVER_UTF and PCRE2_NEVER_UCP options set, which locks out the use of the PCRE2_UTF and PCRE2_UCP options and the use of (*UTF) and (*UCP) at the start of patterns. This command also forces an error if a subsequent pattern contains any occurrences of \P, \p, or \X, which are still supported when PCRE2_UTF is not set, but which require Unicode property support to be included in the library.

This is a trigger guard that is used in test files to ensure that UTF or Unicode property tests are not accidentally added to files that are used when Unicode support is not included in the library. Setting PCRE2_NEVER_UTF and PCRE2_NEVER_UCP as a default can also be obtained by the use of #pattern; the difference is that #forbid_utf cannot be unset, and the automatic options are not displayed in pattern information, to avoid cluttering up test output.

  #load <filename>
This command is used to load a set of precompiled patterns from a file, as described in the section entitled "Saving and restoring compiled patterns" below.
  #loadtables <filename>
This command is used to load a set of binary character tables that can be accessed by the tables=3 qualifier. Such tables can be created by the pcre2_dftables program with the -b option.
  #newline_default [<newline-list>]
When PCRE2 is built, a default newline convention can be specified. This determines which characters and/or character pairs are recognized as indicating a newline in a pattern or subject string. The default can be overridden when a pattern is compiled. The standard test files contain tests of various newline conventions, but the majority of the tests expect a single linefeed to be recognized as a newline by default. Without special action the tests would fail when PCRE2 is compiled with either CR or CRLF as the default newline.

The #newline_default command specifies a list of newline types that are acceptable as the default. The types must be one of CR, LF, CRLF, ANYCRLF, ANY, or NUL (in upper or lower case), for example:

  #newline_default LF Any anyCRLF
If the default newline is in the list, this command has no effect. Otherwise, except when testing the POSIX API, a newline modifier that specifies the first newline convention in the list (LF in the above example) is added to any pattern that does not already have a newline modifier. If the newline list is empty, the feature is turned off. This command is present in a number of the standard test input files.

When the POSIX API is being tested there is no way to override the default newline convention, though it is possible to set the newline convention from within the pattern. A warning is given if the posix or posix_nosub modifier is used when #newline_default would set a default for the non-POSIX API.

  #pattern <modifier-list>
This command sets a default modifier list that applies to all subsequent patterns. Modifiers on a pattern can change these settings.
  #perltest
This line is used in test files that can also be processed by perltest.sh to confirm that Perl gives the same results as PCRE2. Subsequent tests are checked for the use of pcre2test features that are incompatible with the perltest.sh script.

Patterns must use '/' as their delimiter, and only certain modifiers are supported. Comment lines, #pattern commands, and #subject commands that set or unset "mark" are recognized and acted on. The #perltest, #forbid_utf, and #newline_default commands, which are needed in the relevant pcre2test files, are silently ignored. All other command lines are ignored, but give a warning message. The #perltest command helps detect tests that are accidentally put in the wrong file or use the wrong delimiter. For more details of the perltest.sh script see the comments it contains.

  #pop [<modifiers>]
  #popcopy [<modifiers>]
These commands are used to manipulate the stack of compiled patterns, as described in the section entitled "Saving and restoring compiled patterns" below.
  #save <filename>
This command is used to save a set of compiled patterns to a file, as described in the section entitled "Saving and restoring compiled patterns" below.
  #subject <modifier-list>
This command sets a default modifier list that applies to all subsequent subject lines. Modifiers on a subject line can change these settings.
  #if CONDITION
  ...
  #endif
If CONDITION is true, then the "#if" command line is printed, and the lines that follow it are processed as normal, including printing any command lines to the output. If CONDITION is false, then all lines between the "#if" and "#endif" are skipped and not printed. The CONDITION can be any of the conditions which are tested by the "-C" command-line option and which set pcre2test's exit code to a boolean value. The CONDITION may also be preceded by "!".

MODIFIER SYNTAX

Modifier lists are used with both pattern and subject lines. Items in a list are separated by commas followed by optional white space. Trailing white space in a modifier list is ignored. Some modifiers may be given for both patterns and subject lines, whereas others are valid only for one or the other. Each modifier has a long name, for example "anchored", and some of them must be followed by an equals sign and a value, for example, "offset=12". Values cannot contain comma characters, but may contain spaces. Modifiers that do not take values may be preceded by a minus sign to turn off a previous setting.

A few of the more common modifiers can also be specified as single letters, for example "i" for "caseless". In documentation, following the Perl convention, these are written with a slash ("the /i modifier") for clarity. Abbreviated modifiers must all be concatenated in the first item of a modifier list. If the first item is not recognized as a long modifier name, it is interpreted as a sequence of these abbreviations. For example:

  /abc/ig,newline=cr,jit=3
This is a pattern line whose modifier list starts with two one-letter modifiers (/i and /g). The lower-case abbreviated modifiers are the same as used in Perl.

PATTERN SYNTAX

A pattern line must start with one of the following characters (common symbols, excluding pattern meta-characters):

  / ! " ' ` - = _ : ; , % & @ ~
This is interpreted as the pattern's delimiter. A regular expression may be continued over several input lines, in which case the newline characters are included within it. It is possible to include the delimiter as a literal within the pattern by escaping it with a backslash, for example
  /abc\/def/
If you do this, the escape and the delimiter form part of the pattern, but since the delimiters are all non-alphanumeric, the inclusion of the backslash does not affect the pattern's interpretation. Note, however, that this trick does not work within \Q...\E literal bracketing because the backslash will itself be interpreted as a literal. If the terminating delimiter is immediately followed by a backslash, for example,
  /abc/\
a backslash is added to the end of the pattern. This is done to provide a way of testing the error condition that arises if a pattern finishes with a backslash, because
  /abc\/
is interpreted as the first line of a pattern that starts with "abc/", causing pcre2test to read the next line as a continuation of the regular expression.

A pattern can be followed by a modifier list (details below).

SUBJECT LINE SYNTAX

Before each subject line is passed to pcre2_match(), pcre2_dfa_match(), or pcre2_jit_match(), leading and trailing white space is removed, and the line is scanned for backslash escapes, unless the subject_literal modifier was set for the pattern. The following provide a means of encoding non-printing characters in a visible way:

  \a          alarm (BEL, \x07)
  \b          backspace (\x08)
  \e          escape (\x1b)
  \f          form feed (\x0c)
  \n          newline (\x0a)
  \N{U+hh...} unicode character (any number of hex digits)
  \r          carriage return (\x0d)
  \t          tab (\x09)
  \v          vertical tab (\x0b)
  \ddd        octal number (up to 3 octal digits); represent a single
                code point unless larger than 255 with the 8-bit library
  \o{dd...}   octal number (any number of octal digits) representing a
                character in UTF mode or a code point
  \xhh        hexadecimal byte (up to 2 hex digits)
  \x{hh...}   hexadecimal number (up to 8 hex digits) representing a
                character in UTF mode or a code point
Invoking \N{U+hh...} or \x{hh...} doesn't require the use of the utf modifier on the pattern. It is always recognized. There may be any number of hexadecimal digits inside the braces; invalid values provoke error messages, but when using \N{U+hh...}, some invalid Unicode characters are accepted with a warning instead.

Note that even in UTF-8 mode, \xhh (and, depending on how large the value is, \ddd) describe one byte rather than one character; this makes it possible to construct invalid UTF-8 sequences for testing purposes. On the other hand, \x{hh...} is interpreted as a UTF-8 character in UTF-8 mode, only generating more than one byte if the value is greater than 127. To avoid the ambiguity it is preferred to use \N{U+hh...} when describing characters. When testing the 8-bit library not in UTF-8 mode, \x{hh} generates one byte for values that fit in a single byte, and causes an error for greater values.

When testing the 16-bit library, not in UTF-16 mode, all 4-digit \x{hhhh} values are accepted. This makes it possible to construct invalid UTF-16 sequences for testing purposes.

When testing the 32-bit library, not in UTF-32 mode, all 4 to 8-digit \x{...} values are accepted. This makes it possible to construct invalid UTF-32 sequences for testing purposes.

There is a special backslash sequence that specifies replication of one or more characters:

  \[<characters>]{<count>}
This makes it possible to test long strings without having to provide them as part of the file. For example:
  \[abc]{4}
is converted to "abcabcabcabc". This feature does not support nesting. To include a closing square bracket in the characters, code it as \x5D.

A backslash followed by an equals sign marks the end of the subject string and the start of a modifier list. For example:

  abc\=notbol,notempty
If the subject string is empty and \= is followed by white space, the line is treated as a comment line, and is not used for matching. For example:
  \= This is a comment.
  abc\= This is an invalid modifier list.
A backslash followed by any other non-alphanumeric character just escapes that character. A backslash followed by anything else causes an error. However, if the very last character in the line is a backslash (and there is no modifier list), it is ignored. This gives a way of passing an empty line as data, since a real empty line terminates the data input.

If the subject_literal modifier is set for a pattern, all subject lines that follow are treated as literals, with no special treatment of backslashes. No replication is possible, and any subject modifiers must be set as defaults by a #subject command.

PATTERN MODIFIERS

There are several types of modifier that can appear in pattern lines. Except where noted below, they may also be used in #pattern commands. A pattern's modifier list can add to or override default modifiers that were set by a previous #pattern command.

Setting compilation options

The following modifiers set options for pcre2_compile(). Most of them set bits in the options argument of that function, but those whose names start with PCRE2_EXTRA are additional options that are set in the compile context. Some of these options have single-letter abbreviations. There is special handling for /x: if a second x is present, PCRE2_EXTENDED is converted into PCRE2_EXTENDED_MORE as in Perl. A third appearance adds PCRE2_EXTENDED as well, though this makes no difference to the way pcre2_compile() behaves. See pcre2api for a description of the effects of these options.

      allow_empty_class         set PCRE2_ALLOW_EMPTY_CLASS
      allow_lookaround_bsk      set PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK
      allow_surrogate_escapes   set PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES
      alt_bsux                  set PCRE2_ALT_BSUX
      alt_circumflex            set PCRE2_ALT_CIRCUMFLEX
      alt_extended_class        set PCRE2_ALT_EXTENDED_CLASS
      alt_verbnames             set PCRE2_ALT_VERBNAMES
      anchored                  set PCRE2_ANCHORED
  /a  ascii_all                 set all ASCII options
      ascii_bsd                 set PCRE2_EXTRA_ASCII_BSD
      ascii_bss                 set PCRE2_EXTRA_ASCII_BSS
      ascii_bsw                 set PCRE2_EXTRA_ASCII_BSW
      ascii_digit               set PCRE2_EXTRA_ASCII_DIGIT
      ascii_posix               set PCRE2_EXTRA_ASCII_POSIX
      auto_callout              set PCRE2_AUTO_CALLOUT
      bad_escape_is_literal     set PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL
  /i  caseless                  set PCRE2_CASELESS
  /r  caseless_restrict         set PCRE2_EXTRA_CASELESS_RESTRICT
      dollar_endonly            set PCRE2_DOLLAR_ENDONLY
  /s  dotall                    set PCRE2_DOTALL
      dupnames                  set PCRE2_DUPNAMES
      endanchored               set PCRE2_ENDANCHORED
      escaped_cr_is_lf          set PCRE2_EXTRA_ESCAPED_CR_IS_LF
  /x  extended                  set PCRE2_EXTENDED
  /xx extended_more             set PCRE2_EXTENDED_MORE
      extra_alt_bsux            set PCRE2_EXTRA_ALT_BSUX
      firstline                 set PCRE2_FIRSTLINE
      literal                   set PCRE2_LITERAL
      match_line                set PCRE2_EXTRA_MATCH_LINE
      match_invalid_utf         set PCRE2_MATCH_INVALID_UTF
      match_unset_backref       set PCRE2_MATCH_UNSET_BACKREF
      match_word                set PCRE2_EXTRA_MATCH_WORD
  /m  multiline                 set PCRE2_MULTILINE
      never_backslash_c         set PCRE2_NEVER_BACKSLASH_C
      never_callout             set PCRE2_EXTRA_NEVER_CALLOUT
      never_ucp                 set PCRE2_NEVER_UCP
      never_utf                 set PCRE2_NEVER_UTF
  /n  no_auto_capture           set PCRE2_NO_AUTO_CAPTURE
      no_auto_possess           set PCRE2_NO_AUTO_POSSESS
      no_bs0                    set PCRE2_EXTRA_NO_BS0
      no_dotstar_anchor         set PCRE2_NO_DOTSTAR_ANCHOR
      no_start_optimize         set PCRE2_NO_START_OPTIMIZE
      no_utf_check              set PCRE2_NO_UTF_CHECK
      python_octal              set PCRE2_EXTRA_PYTHON_OCTAL
      turkish_casing            set PCRE2_EXTRA_TURKISH_CASING
      ucp                       set PCRE2_UCP
      ungreedy                  set PCRE2_UNGREEDY
      use_offset_limit          set PCRE2_USE_OFFSET_LIMIT
      utf                       set PCRE2_UTF
As well as turning on the PCRE2_UTF option, the utf modifier causes all non-printing characters in output strings to be printed using the \x{hh...} notation. Otherwise, those less than 0x100 are output in hex without the curly brackets. Setting utf in 16-bit or 32-bit mode also causes pattern and subject strings to be translated to UTF-16 or UTF-32, respectively, before being passed to library functions.

The following modifiers enable or disable performance optimizations by calling pcre2_set_optimize() before invoking the regex compiler.
      optimization_full      enable all optional optimizations
      optimization_none      disable all optional optimizations
      auto_possess           auto-possessify variable quantifiers
      auto_possess_off       don't auto-possessify variable quantifiers
      dotstar_anchor         anchor patterns starting with .*
      dotstar_anchor_off     don't anchor patterns starting with .*
      start_optimize         enable pre-scan of subject string
      start_optimize_off     disable pre-scan of subject string
See the pcre2_set_optimize documentation for details on these optimizations.

Setting compilation controls

The following modifiers affect the compilation process or request information about the pattern. There are single-letter abbreviations for some that are heavily used in the test files.

  /B  bincode                   show binary code without lengths
      bsr=[anycrlf|unicode]     specify \R handling
      callout_info              show callout information
      convert=<options>         request foreign pattern conversion
      convert_glob_escape=c     set glob escape character
      convert_glob_separator=c  set glob separator character
      convert_length            set convert buffer length
      debug                     same as info,fullbincode
      expand                    expand repetition syntax in pattern
      framesize                 show matching frame size
      fullbincode               show binary code with lengths
  /I  info                      show info about compiled pattern
      hex                       unquoted characters are hexadecimal
      jit[=<number>]            use JIT
      jitfast                   use JIT fast path
      jitverify                 verify JIT use
      locale=<name>             use this locale
      max_pattern_compiled      ) set maximum compiled pattern
                 _length=<n>    )   length (bytes)
      max_pattern_length=<n>    set maximum pattern length (code units)
      max_varlookbehind=<n>     set maximum variable lookbehind length
      memory                    show memory used
      newline=<type>            set newline type
      null_context              compile with a NULL context
      null_pattern              pass pattern as NULL
      parens_nest_limit=<n>     set maximum parentheses depth
      posix                     use the POSIX API
      posix_nosub               use the POSIX API with REG_NOSUB
      push                      push compiled pattern onto the stack
      pushcopy                  push a copy onto the stack
      pushtablescopy            push a copy with tables onto the stack
      stackguard=<number>       test the stackguard feature
      subject_literal           treat all subject lines as literal
      tables=[0|1|2|3]          select internal tables
      use_length                do not zero-terminate the pattern
      utf8_input                treat input as UTF-8
The effects of these modifiers are described in the following sections.

Newline and \R handling

The bsr modifier specifies what \R in a pattern should match. If it is set to "anycrlf", \R matches CR, LF, or CRLF only. If it is set to "unicode", \R matches any Unicode newline sequence. The default can be specified when PCRE2 is built; if it is not, the default is set to Unicode.

The newline modifier specifies which characters are to be interpreted as newlines, both in the pattern and in subject lines. The type must be one of CR, LF, CRLF, ANYCRLF, ANY, or NUL (in upper or lower case).

Information about a pattern

The debug modifier is a shorthand for info,fullbincode, requesting all available information.

The bincode modifier causes a representation of the compiled code to be output after compilation. This information does not contain length and offset values, which ensures that the same output is generated for different internal link sizes and different code unit widths. By using bincode, the same regression tests can be used in different environments.

The fullbincode modifier, by contrast, does include length and offset values. This is used in a few special tests that run only for specific code unit widths and link sizes, and is also useful for one-off tests.

The info modifier requests information about the compiled pattern (whether it is anchored, has a fixed first character, and so on). The information is obtained from the pcre2_pattern_info() function. Here are some typical examples:

    re> /(?i)(^a|^b)/m,info
  Capture group count = 1
  Compile options: multiline
  Overall options: caseless multiline
  First code unit at start or follows newline
  Subject length lower bound = 1

    re> /(?i)abc/info
  Capture group count = 0
  Compile options: <none>
  Overall options: caseless
  First code unit = 'a' (caseless)
  Last code unit = 'c' (caseless)
  Subject length lower bound = 3
"Compile options" are those specified by modifiers; "overall options" have added options that are taken or deduced from the pattern. If both sets of options are the same, just a single "options" line is output; if there are no options, the line is omitted. "First code unit" is where any match must start; if there is more than one they are listed as "starting code units". "Last code unit" is the last literal code unit that must be present in any match. This is not necessarily the last character. These lines are omitted if no starting or ending code units are recorded. The subject length line is omitted when no_start_optimize is set because the minimum length is not calculated when it can never be used.

The framesize modifier shows the size, in bytes, of each storage frame used by pcre2_match() for handling backtracking. The size depends on the number of capturing parentheses in the pattern. A vector of these frames is used at matching time; its overall size is shown when the heapframes_size subject modifier is set.

The callout_info modifier requests information about all the callouts in the pattern. A list of them is output at the end of any other information that is requested. For each callout, either its number or string is given, followed by the item that follows it in the pattern.

Passing a NULL context

Normally, pcre2test passes a context block to pcre2_compile(). If the null_context modifier is set, however, NULL is passed. This is for testing that pcre2_compile() behaves correctly in this case (it uses default values).

Passing a NULL pattern

The null_pattern modifier is for testing the behaviour of pcre2_compile() when the pattern argument is NULL. The length value passed is the default PCRE2_ZERO_TERMINATED unless use_length is set. Any length other than zero causes an error.

Specifying pattern characters in hexadecimal

The hex modifier specifies that the characters of the pattern, except for substrings enclosed in single or double quotes, are to be interpreted as pairs of hexadecimal digits. This feature is provided as a way of creating patterns that contain binary zeros and other non-printing characters. White space is permitted between pairs of digits. For example, this pattern contains three characters:

  /ab 32 59/hex
Parts of such a pattern are taken literally if quoted. This pattern contains nine characters, only two of which are specified in hexadecimal:
  /ab "literal" 32/hex
Either single or double quotes may be used. There is no way of including the delimiter within a substring. The hex and expand modifiers are mutually exclusive.

Specifying the pattern's length

By default, patterns are passed to the compiling functions as zero-terminated strings but can be passed by length instead of being zero-terminated. The use_length modifier causes this to happen. Using a length happens automatically (whether or not use_length is set) when hex is set, because patterns specified in hexadecimal may contain binary zeros.

If hex or use_length is used with the POSIX wrapper API (see "Using the POSIX wrapper API" below), the REG_PEND extension is used to pass the pattern's length.

Specifying a maximum for variable lookbehinds

Variable lookbehind assertions are supported only if, for each one, there is a maximum length (in characters) that it can match. There is a limit on this, whose default can be set at build time, with an ultimate default of 255. The max_varlookbehind modifier uses the pcre2_set_max_varlookbehind() function to change the limit. Lookbehinds whose branches each match a fixed length are limited to 65535 characters per branch.

Specifying wide characters in 16-bit and 32-bit modes

In 16-bit and 32-bit modes, all input is automatically treated as UTF-8 and translated to UTF-16 or UTF-32 when the utf modifier is set. For testing the 16-bit and 32-bit libraries in non-UTF mode, the utf8_input modifier can be used. It is mutually exclusive with utf. Input lines are interpreted as UTF-8 as a means of specifying wide characters. More details are given in "Input encoding" above.

Generating long repetitive patterns

Some tests use long patterns that are very repetitive. Instead of creating a very long input line for such a pattern, you can use a special repetition feature, similar to the one described for subject lines above. If the expand modifier is present on a pattern, parts of the pattern that have the form

  \[<characters>]{<count>}
are expanded before the pattern is passed to pcre2_compile(). For example, \[AB]{6000} is expanded to "ABAB..." 6000 times. This construction cannot be nested. An initial "\[" sequence is recognized only if "]{" followed by decimal digits and "}" is found later in the pattern. If not, the characters remain in the pattern unaltered. The expand and hex modifiers are mutually exclusive.

If part of an expanded pattern looks like an expansion, but is really part of the actual pattern, unwanted expansion can be avoided by giving two values in the quantifier. For example, \[AB]{6000,6000} is not recognized as an expansion item.

If the info modifier is set on an expanded pattern, the result of the expansion is included in the information that is output.

JIT compilation

Just-in-time (JIT) compiling is a heavyweight optimization that can greatly speed up pattern matching. See the pcre2jit documentation for details. JIT compiling happens, optionally, after a pattern has been successfully compiled into an internal form. The JIT compiler converts this to optimized machine code. It needs to know whether the match-time options PCRE2_PARTIAL_HARD and PCRE2_PARTIAL_SOFT are going to be used, because different code is generated for the different cases. See the partial modifier in "Subject Modifiers" below for details of how these options are specified for each match attempt.

JIT compilation is requested by the jit pattern modifier, which may optionally be followed by an equals sign and a number in the range 0 to 7. The three bits that make up the number specify which of the three JIT operating modes are to be compiled:

  1  compile JIT code for non-partial matching
  2  compile JIT code for soft partial matching
  4  compile JIT code for hard partial matching
The possible values for the jit modifier are therefore:
  0  disable JIT
  1  normal matching only
  2  soft partial matching only
  3  normal and soft partial matching
  4  hard partial matching only
  5  normal and hard partial matching
  6  soft and hard partial matching only
  7  all three modes
If no number is given, 7 is assumed. The phrase "partial matching" means a call to pcre2_match() with either the PCRE2_PARTIAL_SOFT or the PCRE2_PARTIAL_HARD option set. Note that such a call may return a complete match; the options enable the possibility of a partial match, but do not require it. Note also that if you request JIT compilation only for partial matching (for example, jit=2) but do not set the partial modifier on a subject line, that match will not use JIT code because none was compiled for non-partial matching.

If JIT compilation is successful, the compiled JIT code will automatically be used when an appropriate type of match is run, except when incompatible run-time options are specified. For more details, see the pcre2jit documentation. See also the jitstack modifier below for a way of setting the size of the JIT stack.

If the jitfast modifier is specified, matching is done using the JIT "fast path" interface, pcre2_jit_match(), which skips some of the sanity checks that are done by pcre2_match(), and of course does not work when JIT is not supported. If jitfast is specified without jit, jit=7 is assumed.

If the jitverify modifier is specified, information about the compiled pattern shows whether JIT compilation was or was not successful. If jitverify is specified without jit, jit=7 is assumed. If JIT compilation is successful when jitverify is set, the text "(JIT)" is added to the first output line after a match or non-match when JIT-compiled code was actually used in the match.

Setting a locale

The locale modifier must specify the name of a locale, for example:

  /pattern/locale=fr_FR
The given locale is set, pcre2_maketables() is called to build a set of character tables for the locale, and this is then passed to pcre2_compile() when compiling the regular expression. The same tables are used when matching the following subject lines. The locale modifier applies only to the pattern on which it appears, but can be given in a #pattern command if a default is needed. Setting a locale and alternate character tables are mutually exclusive.

Showing pattern memory

The memory modifier causes the size in bytes of the memory used to hold the compiled pattern to be output. This does not include the size of the pcre2_code block; it is just the actual compiled data. If the pattern is subsequently passed to the JIT compiler, the size of the JIT compiled code is also output. Here is an example:

    re> /a(b)c/jit,memory
  Memory allocation (code space): 21
  Memory allocation (JIT code): 1910

Limiting nested parentheses

The parens_nest_limit modifier sets a limit on the depth of nested parentheses in a pattern. Breaching the limit causes a compilation error. The default for the library is set when PCRE2 is built, but pcre2test sets its own default of 220, which is required for running the standard test suite.

Limiting the pattern length

The max_pattern_length modifier sets a limit, in code units, to the length of pattern that pcre2_compile() will accept. Breaching the limit causes a compilation error. The default is the largest number a PCRE2_SIZE variable can hold (essentially unlimited).

Limiting the size of a compiled pattern

The max_pattern_compiled_length modifier sets a limit, in bytes, to the amount of memory used by a compiled pattern. Breaching the limit causes a compilation error. The default is the largest number a PCRE2_SIZE variable can hold (essentially unlimited).

Using the POSIX wrapper API

The posix and posix_nosub modifiers cause pcre2test to call PCRE2 via the POSIX wrapper API rather than its native API. When posix_nosub is used, the POSIX option REG_NOSUB is passed to regcomp(). The POSIX wrapper supports only the 8-bit library. Note that it does not imply POSIX matching semantics; for more detail see the pcre2posix documentation. The following pattern modifiers set options for the regcomp() function:

  caseless           REG_ICASE
  multiline          REG_NEWLINE
  dotall             REG_DOTALL     )
  ungreedy           REG_UNGREEDY   ) These options are not part of
  ucp                REG_UCP        )   the POSIX standard
  utf                REG_UTF8       )
The regerror_buffsize modifier specifies a size for the error buffer that is passed to regerror() in the event of a compilation error. For example:
  /abc/posix,regerror_buffsize=20
This provides a means of testing the behaviour of regerror() when the buffer is too small for the error message. If this modifier has not been set, a large buffer is used.

The aftertext and allaftertext subject modifiers work as described below. All other modifiers are either ignored, with a warning message, or cause an error.

The pattern is passed to regcomp() as a zero-terminated string by default, but if the use_length or hex modifiers are set, the REG_PEND extension is used to pass it by length.

Testing the stack guard feature

The stackguard modifier is used to test the use of pcre2_set_compile_recursion_guard(), a function that is provided to enable stack availability to be checked during compilation (see the pcre2api documentation for details). If the number specified by the modifier is greater than zero, pcre2_set_compile_recursion_guard() is called to set up a callback from pcre2_compile() to a local function. The argument it receives is the current parenthesis nesting depth; if this is greater than the value given by the modifier, non-zero is returned, causing the compilation to be aborted.

Using alternative character tables

The value specified for the tables modifier must be one of the digits 0, 1, 2, or 3. It causes a specific set of built-in character tables to be passed to pcre2_compile(). This is used in the PCRE2 tests to check behaviour with different character tables. The digit specifies the tables as follows:

  0   do not pass any special character tables
  1   the default ASCII tables, as distributed in
        pcre2_chartables.c.dist
  2   a set of tables defining ISO 8859 characters
  3   a set of tables loaded by the #loadtables command
In tables 2, some characters whose codes are greater than 128 are identified as letters, digits, spaces, etc. Tables 3 can be used only after a #loadtables command has loaded them from a binary file. Setting alternate character tables and a locale are mutually exclusive.

Setting certain match controls

The following modifiers are really subject modifiers, and are described under "Subject Modifiers" below. However, they may be included in a pattern's modifier list, in which case they are applied to every subject line that is processed with that pattern. These modifiers do not affect the compilation process.

      aftertext                   show text after match
      allaftertext                show text after captures
      allcaptures                 show all captures
      allvector                   show the entire ovector
      allusedtext                 show all consulted text
      altglobal                   alternative global matching
  /g  global                      global matching
      heapframes_size             show match data heapframes size
      jitstack=<n>                set size of JIT stack
      mark                        show mark values
      null_substitute_match_data  substitute with NULL match data
      replace=<str>               specify a replacement string
      startchar                   show starting character when relevant
      substitute_callout          use substitution callouts
      substitute_case_callout     use substitution case callouts
      substitute_extended         use PCRE2_SUBSTITUTE_EXTENDED
      substitute_literal          use PCRE2_SUBSTITUTE_LITERAL
      substitute_matched          use PCRE2_SUBSTITUTE_MATCHED
      substitute_overflow_length  use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
      substitute_replacement_only use PCRE2_SUBSTITUTE_REPLACEMENT_ONLY
      substitute_skip=<n>         skip substitution <n>
      substitute_stop=<n>         skip substitution <n> and following
      substitute_unknown_unset    use PCRE2_SUBSTITUTE_UNKNOWN_UNSET
      substitute_unset_empty      use PCRE2_SUBSTITUTE_UNSET_EMPTY
These modifiers may not appear in a #pattern command. If you want them as defaults, set them in a #subject command.

Specifying literal subject lines

If the subject_literal modifier is present on a pattern, all the subject lines that it matches are taken as literal strings, with no interpretation of backslashes. It is not possible to set subject modifiers on such lines, but any that are set as defaults by a #subject command are recognized.

Saving a compiled pattern

When a pattern with the push modifier is successfully compiled, it is pushed onto a stack of compiled patterns, and pcre2test expects the next line to contain a new pattern (or a command) instead of a subject line. This facility is used when saving compiled patterns to a file, as described in the section entitled "Saving and restoring compiled patterns" below. If pushcopy is used instead of push, a copy of the compiled pattern is stacked, leaving the original as current, ready to match the following input lines. This provides a way of testing the pcre2_code_copy() function. The push and pushcopy modifiers are incompatible with modifiers such as global that act at match time. Any that are specified are ignored (for the stacked copy), with a warning message, except for replace, which causes an error. Note that jitverify, which is allowed, does not carry through to any subsequent matching that uses a stacked pattern.

Testing foreign pattern conversion

The experimental foreign pattern conversion functions in PCRE2 can be tested by setting the convert modifier. Its argument is a colon-separated list of options, which set the equivalent option for the pcre2_pattern_convert() function:

  glob                    PCRE2_CONVERT_GLOB
  glob_no_starstar        PCRE2_CONVERT_GLOB_NO_STARSTAR
  glob_no_wild_separator  PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR
  posix_basic             PCRE2_CONVERT_POSIX_BASIC
  posix_extended          PCRE2_CONVERT_POSIX_EXTENDED
  unset                   Unset all options
The "unset" value is useful for turning off a default that has been set by a #pattern command. When one of these options is set, the input pattern is passed to pcre2_pattern_convert(). If the conversion is successful, the result is reflected in the output and then passed to pcre2_compile(). The normal utf and no_utf_check options, if set, cause the PCRE2_CONVERT_UTF and PCRE2_CONVERT_NO_UTF_CHECK options to be passed to pcre2_pattern_convert().

By default, the conversion function is allowed to allocate a buffer for its output. However, if the convert_length modifier is set to a value greater than zero, pcre2test passes a buffer of the given length. This makes it possible to test the length check.

The convert_glob_escape and convert_glob_separator modifiers can be used to specify the escape and separator characters for glob processing, overriding the defaults, which are operating-system dependent.

SUBJECT MODIFIERS

The modifiers that can appear in subject lines and the #subject command are of two types.

Setting match options

The following modifiers set options for pcre2_match() or pcre2_dfa_match(). See pcre2api for a description of their effects.

      anchored                   set PCRE2_ANCHORED
      copy_matched_subject       set PCRE2_COPY_MATCHED_SUBJECT
      endanchored                set PCRE2_ENDANCHORED
      dfa_restart                set PCRE2_DFA_RESTART
      dfa_shortest               set PCRE2_DFA_SHORTEST
      disable_recurseloop_check  set PCRE2_DISABLE_RECURSELOOP_CHECK
      no_jit                     set PCRE2_NO_JIT
      no_utf_check               set PCRE2_NO_UTF_CHECK
      notbol                     set PCRE2_NOTBOL
      notempty                   set PCRE2_NOTEMPTY
      notempty_atstart           set PCRE2_NOTEMPTY_ATSTART
      noteol                     set PCRE2_NOTEOL
      partial_hard (or ph)       set PCRE2_PARTIAL_HARD
      partial_soft (or ps)       set PCRE2_PARTIAL_SOFT
The partial matching modifiers are provided with abbreviations because they appear frequently in tests.

If the posix or posix_nosub modifier was present on the pattern, causing the POSIX wrapper API to be used, the only option-setting modifiers that have any effect are notbol, notempty, and noteol, causing REG_NOTBOL, REG_NOTEMPTY, and REG_NOTEOL, respectively, to be passed to regexec(). The other modifiers are ignored, with a warning message.

There is one additional modifier that can be used with the POSIX wrapper. It is ignored (with a warning) if used for non-POSIX matching.

      posix_startend=<n>[:<m>]
This causes the subject string to be passed to regexec() using the REG_STARTEND option, which uses offsets to specify which part of the string is searched. If only one number is given, the end offset is passed as the end of the subject string. For more detail of REG_STARTEND, see the pcre2posix documentation. If the subject string contains binary zeros (coded as escapes such as \x{00} because pcre2test does not support actual binary zeros in its input), you must use posix_startend to specify its length.

Setting match controls

The following modifiers affect the matching process or request additional information. Some of them may also be specified on a pattern line (see above), in which case they apply to every subject line that is matched against that pattern, but can be overridden by modifiers on the subject.

      aftertext                  show text after match
      allaftertext               show text after captures
      allcaptures                show all captures
      allusedtext                show all consulted text (non-JIT only)
      allvector                  show the entire ovector
      altglobal                  alternative global matching
      callout_capture            show captures at callout time
      callout_data=<n>           set a value to pass via callouts
      callout_error=<n>[:<m>]    control callout error
      callout_extra              show extra callout information
      callout_fail=<n>[:<m>]     control callout failure
      callout_no_where           do not show position of a callout
      callout_none               do not supply a callout function
      copy=<number or name>      copy captured substring
      depth_limit=<n>            set a depth limit
      dfa                        use pcre2_dfa_match()
      find_limits                find heap, match and depth limits
      find_limits_noheap         find match and depth limits
      get=<number or name>       extract captured substring
      getall                     extract all captured substrings
  /g  global                     global matching
      heapframes_size            show match data heapframes size
      heap_limit=<n>             set a limit on heap memory (Kbytes)
      jitstack=<n>               set size of JIT stack
      mark                       show mark values
      match_limit=<n>            set a match limit
      memory                     show heap memory usage
      null_context               match with a NULL context
      null_replacement           substitute with NULL replacement
      null_subject               match with NULL subject
      null_substitute_match_data substitute with NULL match data
      offset=<n>                 set starting offset
      offset_limit=<n>           set offset limit
      ovector=<n>                set size of output vector
      recursion_limit=<n>        obsolete synonym for depth_limit
      replace=<str>              specify a replacement string
      startchar                  show startchar when relevant
      startoffset=<n>            same as offset=<n>
      substitute_callout         use substitution callouts
      substitute_case_callout    use substitution case callouts
      substitute_extended        use PCRE2_SUBSTITUTE_EXTENDED
      substitute_literal         use PCRE2_SUBSTITUTE_LITERAL
      substitute_matched         use PCRE2_SUBSTITUTE_MATCHED
      substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
      substitute_replacement_only use PCRE2_SUBSTITUTE_REPLACEMENT_ONLY
      substitute_skip=<n>        skip substitution number n
      substitute_stop=<n>        skip substitution number n and greater
      substitute_subject=<str>   specify a different subject for substitution
      substitute_unknown_unset   use PCRE2_SUBSTITUTE_UNKNOWN_UNSET
      substitute_unset_empty     use PCRE2_SUBSTITUTE_UNSET_EMPTY
      zero_terminate             pass the subject as zero-terminated
The effects of these modifiers are described in the following sections. When matching via the POSIX wrapper API, the aftertext, allaftertext, and ovector subject modifiers work as described below. All other modifiers are either ignored, with a warning message, or cause an error.

Showing more text

The aftertext modifier requests that as well as outputting the part of the subject string that matched the entire pattern, pcre2test should in addition output the remainder of the subject string. This is useful for tests where the subject contains multiple copies of the same substring. The allaftertext modifier requests the same action for captured substrings as well as the main matched substring. In each case the remainder is output on the following line with a plus character following the capture number.

The allusedtext modifier requests that all the text that was consulted during a successful pattern match by the interpreter should be shown, for both full and partial matches. This feature is not supported for JIT matching, and if requested with JIT it is ignored (with a warning message). Setting this modifier affects the output if there is a lookbehind at the start of a match, or, for a complete match, a lookahead at the end, or if \K is used in the pattern. Characters that precede or follow the start and end of the actual match are indicated in the output by '<' or '>' characters underneath them. Here is an example:

    re> /(?<=pqr)abc(?=xyz)/
  data> 123pqrabcxyz456\=allusedtext
   0: pqrabcxyz
      <<<   >>>
  data> 123pqrabcxy\=ph,allusedtext
  Partial match: pqrabcxy
                 <<<
The first, complete match shows that the matched string is "abc", with the preceding and following strings "pqr" and "xyz" having been consulted during the match (when processing the assertions). The partial match can indicate only the preceding string.

The startchar modifier requests that the starting character for the match be indicated, if it is different to the start of the matched string. The only time when this occurs is when \K has been processed as part of the match. In this situation, the output for the matched string is displayed from the starting character instead of from the match point, with circumflex characters under the earlier characters. For example:

    re> /abc\Kxyz/
  data> abcxyz\=startchar
   0: abcxyz
      ^^^
Unlike allusedtext, the startchar modifier can be used with JIT. However, these two modifiers are mutually exclusive.

Showing the value of all capture groups

The allcaptures modifier requests that the values of all potential captured parentheses be output after a match. By default, only those up to the highest one actually used in the match are output (corresponding to the return code from pcre2_match()). Groups that did not take part in the match are output as "<unset>". This modifier is not relevant for DFA matching (which does no capturing) and does not apply when replace is specified; it is ignored, with a warning message, if present.

Showing the entire ovector, for all outcomes

The allvector modifier requests that the entire ovector be shown, whatever the outcome of the match. Compare allcaptures, which shows only up to the maximum number of capture groups for the pattern, and then only for a successful complete non-DFA match. This modifier, which acts after any match result, and also for DFA matching, provides a means of checking that there are no unexpected modifications to ovector fields. Before each match attempt, the ovector is filled with a special value, and if this is found in both elements of a capturing pair, "<unchanged>" is output. After a successful match, this applies to all groups after the maximum capture group for the pattern. In other cases it applies to the entire ovector. After a partial match, the first two elements are the only ones that should be set. After a DFA match, the amount of ovector that is used depends on the number of matches that were found.

Testing pattern callouts

A callout function is supplied when pcre2test calls the library matching functions, unless callout_none is specified. Its behaviour can be controlled by various modifiers listed above whose names begin with callout_. Details are given in the section entitled "Callouts" below. Testing callouts from pcre2_substitute() is described separately in "Testing the substitution function" below.

Finding all matches in a string

Searching for all possible matches within a subject can be requested by the global or altglobal modifier. After finding a match, the matching function is called again to search the remainder of the subject. The difference between global and altglobal is that the former uses the start_offset argument to pcre2_match() or pcre2_dfa_match() to start searching at a new point within the entire string (which is what Perl does), whereas the latter passes over a shortened subject. This makes a difference to the matching process if the pattern begins with a lookbehind assertion (including \b or \B).

If an empty string is matched, the next match is done with the PCRE2_NOTEMPTY_ATSTART flag set, in order to search for another, non-empty, match at the same point in the subject. This imitates the way Perl handles such cases when using the /g modifier or the split() function.

Testing substring extraction functions

The copy and get modifiers can be used to test the pcre2_substring_copy_xxx() and pcre2_substring_get_xxx() functions. They can be given more than once, and each can specify a capture group name or number, for example:

   abcd\=copy=1,copy=3,get=G1
If the #subject command is used to set default copy and/or get lists, these can be unset by specifying a negative number to cancel all numbered groups and an empty name to cancel all named groups.

The getall modifier tests pcre2_substring_list_get(), which extracts all captured substrings.

If the subject line is successfully matched, the substrings extracted by the convenience functions are output with C, G, or L after the string number instead of a colon. This is in addition to the normal full list. The string length (that is, the return from the extraction function) is given in parentheses after each substring, followed by the name when the extraction was by name.

Testing the substitution function

If the replace modifier is set, the pcre2_substitute() function is called instead of one of the matching functions (or after one call of pcre2_match() in the case of PCRE2_SUBSTITUTE_MATCHED). Note that replacement strings cannot contain commas, because a comma signifies the end of a modifier. This is not thought to be an issue in a test program.

Specifying a completely empty replacement string disables this modifier. However, it is possible to specify an empty replacement by providing a buffer length, as described below, for an otherwise empty replacement.

Unlike subject strings, pcre2test does not process replacement strings for escape sequences. In UTF mode, a replacement string is checked to see if it is a valid UTF-8 string. If so, it is correctly converted to a UTF string of the appropriate code unit width. If it is not a valid UTF-8 string, the individual code units are copied directly. This provides a means of passing an invalid UTF-8 string for testing purposes.

The following modifiers set options (in addition to the normal match options) for pcre2_substitute():

  global                      PCRE2_SUBSTITUTE_GLOBAL
  substitute_extended         PCRE2_SUBSTITUTE_EXTENDED
  substitute_literal          PCRE2_SUBSTITUTE_LITERAL
  substitute_matched          PCRE2_SUBSTITUTE_MATCHED
  substitute_overflow_length  PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
  substitute_replacement_only PCRE2_SUBSTITUTE_REPLACEMENT_ONLY
  substitute_unknown_unset    PCRE2_SUBSTITUTE_UNKNOWN_UNSET
  substitute_unset_empty      PCRE2_SUBSTITUTE_UNSET_EMPTY
See the pcre2api documentation for details of these options.

After a successful substitution, the modified string is output, preceded by the number of replacements. This may be zero if there were no matches. Here is a simple example of a substitution test:

  /abc/replace=xxx
      =abc=abc=
   1: =xxx=abc=
      =abc=abc=\=global
   2: =xxx=xxx=
Subject and replacement strings should be kept relatively short (fewer than 256 characters) for substitution tests, as fixed-size buffers are used. To make it easy to test for buffer overflow, if the replacement string starts with a number in square brackets, that number is passed to pcre2_substitute() as the size of the output buffer, with the replacement string starting at the next character. Here is an example that tests the edge case:
  /abc/
      123abc123\=replace=[10]XYZ
   1: 123XYZ123
      123abc123\=replace=[9]XYZ
  Failed: error -48: no more memory
The default action of pcre2_substitute() is to return PCRE2_ERROR_NOMEMORY when the output buffer is too small. However, if the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option is set (by using the substitute_overflow_length modifier), pcre2_substitute() continues to go through the motions of matching and substituting (but not doing any callouts), in order to compute the size of buffer that is required. When this happens, pcre2test shows the required buffer length (which includes space for the trailing zero) as part of the error message. For example:
  /abc/substitute_overflow_length
      123abc123\=replace=[9]XYZ
  Failed: error -48: no more memory: 10 code units are needed
A replacement string is ignored with POSIX and DFA matching. Specifying partial matching provokes an error return ("bad option value") from pcre2_substitute().

The substitute_subject modifier may be used to test the use of the PCRE2 API, in which a client calls pcre2_match() followed by pcre2_substitute() with PCRE2_SUBSTITUTE_MATCHED, but the client performs an unexpected and unsupported modification of the subject buffer in-place, in between the match and substitution.

Testing substitute callouts

If the substitute_callout modifier is set, a substitution callout function is set up. The null_context modifier must not be set, because the address of the callout function is passed in a match context. When the callout function is called (after each substitution), details of the input and output strings are output. For example:

  /abc/g,replace=<$0>,substitute_callout
      abcdefabcpqr
   1(1) Old 0 3 "abc" New 0 5 "<abc>"
   2(1) Old 6 9 "abc" New 8 13 "<abc>"
   2: <abc>def<abc>pqr
The first number on each callout line is the count of matches. The parenthesized number is the number of pairs that are set in the ovector (that is, one more than the number of capturing groups that were set). Then are listed the offsets of the old substring, its contents, and the same for the replacement.

By default, the substitution callout function returns zero, which accepts the replacement and causes matching to continue if /g was used. Two further modifiers can be used to test other return values. If substitute_skip is set to a value greater than zero the callout function returns +1 for the match of that number, and similarly substitute_stop returns -1. These cause the replacement to be rejected, and -1 causes no further matching to take place. If either of them is set, substitute_callout is assumed. For example:

  /abc/g,replace=<$0>,substitute_skip=1
      abcdefabcpqr
   1(1) Old 0 3 "abc" New 0 5 "<abc> SKIPPED"
   2(1) Old 6 9 "abc" New 6 11 "<abc>"
   2: abcdef<abc>pqr
      abcdefabcpqr\=substitute_stop=1
   1(1) Old 0 3 "abc" New 0 5 "<abc> STOPPED"
   1: abcdefabcpqr
If both are set for the same number, stop takes precedence. Only a single skip or stop is supported, which is sufficient for testing that the feature works.

Testing substitute case callouts

If the substitute_case_callout modifier is set, a substitution case callout function is set up. The callout function is called for each substituted chunk which is to be case-transformed.

The callout function passed is a fixed function with implementation for certain behaviours: inputs which shrink when case-transformed; inputs which grow; inputs with distinct upper/lower/titlecase forms. The characters which are not special-cased for testing purposes are left unmodified, as if they are caseless characters.

Setting the JIT stack size

The jitstack modifier provides a way of setting the maximum stack size that is used by the just-in-time optimization code. It is ignored if JIT optimization is not being used. The value is a number of kibibytes (units of 1024 bytes). Setting zero reverts to the default of 32KiB. Providing a stack that is larger than the default is necessary only for very complicated patterns. If jitstack is set non-zero on a subject line it overrides any value that was set on the pattern.

Setting heap, match, and depth limits

The heap_limit, match_limit, and depth_limit modifiers set the appropriate limits in the match context. These values are ignored when the find_limits or find_limits_noheap modifier is specified.

Finding minimum limits

If the find_limits modifier is present on a subject line, pcre2test calls the relevant matching function several times, setting different values in the match context via pcre2_set_heap_limit(), pcre2_set_match_limit(), or pcre2_set_depth_limit() until it finds the smallest value for each parameter that allows the match to complete without a "limit exceeded" error. The match itself may succeed or fail. An alternative modifier, find_limits_noheap, omits the heap limit. This is used in the standard tests, because the minimum heap limit varies between systems. If JIT is being used, only the match limit is relevant, and the other two are automatically omitted.

When using this modifier, the pattern should not contain any limit settings such as (*LIMIT_MATCH=...) within it. If such a setting is present and is lower than the minimum matching value, the minimum value cannot be found because pcre2_set_match_limit() etc. are only able to reduce the value of an in-pattern limit; they cannot increase it.

For non-DFA matching, the minimum depth_limit number is a measure of how much nested backtracking happens (that is, how deeply the pattern's tree is searched). In the case of DFA matching, depth_limit controls the depth of recursive calls of the internal function that is used for handling pattern recursion, lookaround assertions, and atomic groups.

For non-DFA matching, the match_limit number is a measure of the amount of backtracking that takes place, and learning the minimum value can be instructive. For most simple matches, the number is quite small, but for patterns with very large numbers of matching possibilities, it can become large very quickly with increasing length of subject string. In the case of DFA matching, match_limit controls the total number of calls, both recursive and non-recursive, to the internal matching function, thus controlling the overall amount of computing resource that is used.

For both kinds of matching, the heap_limit number, which is in kibibytes (units of 1024 bytes), limits the amount of heap memory used for matching.

Showing MARK names

The mark modifier causes the names from backtracking control verbs that are returned from calls to pcre2_match() to be displayed. If a mark is returned for a match, non-match, or partial match, pcre2test shows it. For a match, it is on a line by itself, tagged with "MK:". Otherwise, it is added to the non-match message.

Showing memory usage

The memory modifier causes pcre2test to log the sizes of all heap memory allocation and freeing calls that occur during a call to pcre2_match() or pcre2_dfa_match(). In the latter case, heap memory is used only when a match requires more internal workspace than the default allocation on the stack, so in many cases there will be no output. No heap memory is allocated during matching with JIT. For this modifier to work, the null_context modifier must not be set on both the pattern and the subject, though it can be set on one or the other.

Showing the heap frame overall vector size

The heapframes_size modifier is relevant for matches using pcre2_match() without JIT. After a match has run (whether successful or not) the size, in bytes, of the allocated heap frames vector that is left attached to the match data block is shown. If the matching action involved several calls to pcre2_match() (for example, global matching or for timing) only the final value is shown.

This modifier is ignored, with a warning, for POSIX or DFA matching. JIT matching does not use the heap frames vector, so the size is always zero, unless there was a previous non-JIT match. Note that specifying a size of zero for the output vector (see below) causes pcre2test to free its match data block (and associated heap frames vector) and allocate a new one.

Setting a starting offset

The offset modifier sets an offset in the subject string at which matching starts. Its value is a number of code units, not characters.

Setting an offset limit

The offset_limit modifier sets a limit for unanchored matches. If a match cannot be found starting at or before this offset in the subject, a "no match" return is given. The data value is a number of code units, not characters. When this modifier is used, the use_offset_limit modifier must have been set for the pattern; if not, an error is generated.

Setting the size of the output vector

The ovector modifier applies only to the subject line in which it appears, though of course it can also be used to set a default in a #subject command. It specifies the number of pairs of offsets that are available for storing matching information. The default is 15.

A value of zero is useful when testing the POSIX API because it causes regexec() to be called with a NULL capture vector. When not testing the POSIX API, a value of zero is used to cause pcre2_match_data_create_from_pattern() to be called, in order to create a new match block of exactly the right size for the pattern. (It is not possible to create a match block with a zero-length ovector; there is always at least one pair of offsets.) The old match data block is freed.

Passing the subject as zero-terminated

By default, the subject string is passed to a native API matching function with its correct length. In order to test the facility for passing a zero-terminated string, the zero_terminate modifier is provided. It causes the length to be passed as PCRE2_ZERO_TERMINATED. When matching via the POSIX interface, this modifier is ignored, with a warning.

When testing pcre2_substitute(), this modifier also has the effect of passing the replacement string as zero-terminated.

Passing a NULL context, subject, or replacement

Normally, pcre2test passes a context block to pcre2_match(), pcre2_dfa_match(), pcre2_jit_match() or pcre2_substitute(). If the null_context modifier is set, however, NULL is passed. This is for testing that the matching and substitution functions behave correctly in this case (they use default values). This modifier cannot be used with the find_limits, find_limits_noheap, or substitute_callout modifiers.

Similarly, for testing purposes, if the null_subject or null_replacement modifier is set, the subject or replacement string pointers are passed as NULL, respectively, to the relevant functions.

THE ALTERNATIVE MATCHING FUNCTION

By default, pcre2test uses the standard PCRE2 matching function, pcre2_match(), to match each subject line. PCRE2 also supports an alternative matching function, pcre2_dfa_match(), which operates in a different way, and has some restrictions. The differences between the two functions are described in the pcre2matching documentation.

If the dfa modifier is set, the alternative matching function is used. This function finds all possible matches at a given point in the subject. If, however, the dfa_shortest modifier is set, processing stops after the first match is found. This is always the shortest possible match.

DEFAULT OUTPUT FROM pcre2test

This section describes the output when the normal matching function, pcre2_match(), is being used.

When a match succeeds, pcre2test outputs the list of captured substrings, starting with number 0 for the string that matched the whole pattern. Otherwise, it outputs "No match" when the return is PCRE2_ERROR_NOMATCH, or "Partial match:" followed by the partially matching substring when the return is PCRE2_ERROR_PARTIAL. (Note that this is the entire substring that was inspected during the partial match; it may include characters before the actual match start if a lookbehind assertion, \K, \b, or \B was involved.)

For any other return, pcre2test outputs the PCRE2 negative error number and a short descriptive phrase. If the error is a failed UTF string check, the code unit offset of the start of the failing character is also output. Here is an example of an interactive pcre2test run.

  $ pcre2test
  PCRE2 version 10.22 2016-07-29

    re> /^abc(\d+)/
  data> abc123
   0: abc123
   1: 123
  data> xyz
  No match
Unset capturing substrings that are not followed by one that is set are not shown by pcre2test unless the allcaptures modifier is specified. In the following example, there are two capturing substrings, but when the first data line is matched, the second, unset substring is not shown. An "internal" unset substring is shown as "<unset>", as for the second data line.
    re> /(a)|(b)/
  data> a
   0: a
   1: a
  data> b
   0: b
   1: <unset>
   2: b
If the strings contain any non-printing characters, they are output as \xhh escapes if the value is less than 256 and UTF mode is not set. Otherwise they are output as \x{hh...} escapes. See below for the definition of non-printing characters. If the aftertext modifier is set, the output for substring 0 is followed by the rest of the subject string, identified by "0+" like this:
    re> /cat/aftertext
  data> cataract
   0: cat
   0+ aract
If global matching is requested, the results of successive matching attempts are output in sequence, like this:
    re> /\Bi(\w\w)/g
  data> Mississippi
   0: iss
   1: ss
   0: iss
   1: ss
   0: ipp
   1: pp
"No match" is output only if the first match attempt fails. Here is an example of a failure message (the offset 4 that is specified by the offset modifier is past the end of the subject string):
    re> /xyz/
  data> xyz\=offset=4
  Error -24 (bad offset value)

Note that whereas patterns can be continued over several lines (a plain ">" prompt is used for continuations), subject lines may not. However newlines can be included in a subject by means of the \n escape (or \r, \r\n, etc., depending on the newline sequence setting).

OUTPUT FROM THE ALTERNATIVE MATCHING FUNCTION

When the alternative matching function, pcre2_dfa_match(), is used, the output consists of a list of all the matches that start at the first point in the subject where there is at least one match. For example:

    re> /(tang|tangerine|tan)/
  data> yellow tangerine\=dfa
   0: tangerine
   1: tang
   2: tan
Using the normal matching function on this data finds only "tang". The longest matching string is always given first (and numbered zero). After a PCRE2_ERROR_PARTIAL return, the output is "Partial match:", followed by the partially matching substring. Note that this is the entire substring that was inspected during the partial match; it may include characters before the actual match start if a lookbehind assertion, \b, or \B was involved. (\K is not supported for DFA matching.)

If global matching is requested, the search for further matches resumes at the end of the longest match. For example:

    re> /(tang|tangerine|tan)/g
  data> yellow tangerine and tangy sultana\=dfa
   0: tangerine
   1: tang
   2: tan
   0: tang
   1: tan
   0: tan
The alternative matching function does not support substring capture, so the modifiers that are concerned with captured substrings are not relevant.

RESTARTING AFTER A PARTIAL MATCH

When the alternative matching function has given the PCRE2_ERROR_PARTIAL return, indicating that the subject partially matched the pattern, you can restart the match with additional subject data by means of the dfa_restart modifier. For example:

    re> /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/
  data> 23ja\=ps,dfa
  Partial match: 23ja
  data> n05\=dfa,dfa_restart
   0: n05
For further information about partial matching, see the pcre2partial documentation.

CALLOUTS

If the pattern contains any callout requests, pcre2test's callout function is called during matching unless callout_none is specified. This works with both matching functions, and with JIT, though there are some differences in behaviour. The output for callouts with numerical arguments and those with string arguments is slightly different.

Callouts with numerical arguments

By default, the callout function displays the callout number, the start and current positions in the subject text at the callout time, and the next pattern item to be tested. For example:

  --->pqrabcdef
    0    ^  ^     \d
This output indicates that callout number 0 occurred for a match attempt starting at the fourth character of the subject string, when the pointer was at the seventh character, and when the next pattern item was \d. Just one circumflex is output if the start and current positions are the same, or if the current position precedes the start position, which can happen if the callout is in a lookbehind assertion.

Callouts numbered 255 are assumed to be automatic callouts, inserted as a result of the auto_callout pattern modifier. In this case, instead of showing the callout number, the offset in the pattern, preceded by a plus, is output. For example:

    re> /\d?[A-E]\*/auto_callout
  data> E*
  --->E*
   +0 ^      \d?
   +3 ^      [A-E]
   +8 ^^     \*
  +10 ^ ^
   0: E*
If a pattern contains (*MARK) items, an additional line is output whenever a change of latest mark is passed to the callout function. For example:
    re> /a(*MARK:X)bc/auto_callout
  data> abc
  --->abc
   +0 ^       a
   +1 ^^      (*MARK:X)
  +10 ^^      b
  Latest Mark: X
  +11 ^ ^     c
  +12 ^  ^
   0: abc
The mark changes between matching "a" and "b", but stays the same for the rest of the match, so nothing more is output. If, as a result of backtracking, the mark reverts to being unset, the text "<unset>" is output.

Callouts with string arguments

The output for a callout with a string argument is similar, except that instead of outputting a callout number before the position indicators, the callout string and its offset in the pattern string are output before the reflection of the subject string, and the subject string is reflected for each callout. For example:

    re> /^ab(?C'first')cd(?C"second")ef/
  data> abcdefg
  Callout (7): 'first'
  --->abcdefg
      ^ ^         c
  Callout (20): "second"
  --->abcdefg
      ^   ^       e
   0: abcdef

Callout modifiers

The callout function in pcre2test returns zero (carry on matching) by default, but you can use a callout_fail modifier in a subject line to change this and other parameters of the callout (see below).

If the callout_capture modifier is set, the current captured groups are output when a callout occurs. This is useful only for non-DFA matching, as pcre2_dfa_match() does not support capturing, so no captures are ever shown.

The normal callout output, showing the callout number or pattern offset (as described above) is suppressed if the callout_no_where modifier is set.

When using the interpretive matching function pcre2_match() without JIT, setting the callout_extra modifier causes additional output from pcre2test's callout function to be generated. For the first callout in a match attempt at a new starting position in the subject, "New match attempt" is output. If there has been a backtrack since the last callout (or start of matching if this is the first callout), "Backtrack" is output, followed by "No other matching paths" if the backtrack ended the previous match attempt. For example:

   re> /(a+)b/auto_callout,no_start_optimize,no_auto_possess
  data> aac\=callout_extra
  New match attempt
  --->aac
   +0 ^       (
   +1 ^       a+
   +3 ^ ^     )
   +4 ^ ^     b
  Backtrack
  --->aac
   +3 ^^      )
   +4 ^^      b
  Backtrack
  No other matching paths
  New match attempt
  --->aac
   +0  ^      (
   +1  ^      a+
   +3  ^^     )
   +4  ^^     b
  Backtrack
  No other matching paths
  New match attempt
  --->aac
   +0   ^     (
   +1   ^     a+
  Backtrack
  No other matching paths
  New match attempt
  --->aac
   +0    ^    (
   +1    ^    a+
  No match
Notice that various optimizations must be turned off if you want all possible matching paths to be scanned. If no_start_optimize is not used, there is an immediate "no match", without any callouts, because the starting optimization fails to find "b" in the subject, which it knows must be present for any match. If no_auto_possess is not used, the "a+" item is turned into "a++", which reduces the number of backtracks.

The callout_extra modifier has no effect if used with the DFA matching function, or with JIT.

Return values from callouts

The default return from the callout function is zero, which allows matching to continue. The callout_fail modifier can be given one or two numbers. If there is only one number, 1 is returned instead of 0 (causing matching to backtrack) when a callout of that number is reached. If two numbers (<n>:<m>) are given, 1 is returned when callout <n> is reached and there have been at least <m> callouts. The callout_error modifier is similar, except that PCRE2_ERROR_CALLOUT is returned, causing the entire matching process to be aborted. If both these modifiers are set for the same callout number, callout_error takes precedence. Note that callouts with string arguments are always given the number zero.

The callout_data modifier can be given an unsigned or a negative number. This is set as the "user data" that is passed to the matching function, and passed back when the callout function is invoked. Any value other than zero is used as a return from pcre2test's callout function.

Inserting callouts can be helpful when using pcre2test to check complicated regular expressions. For further information about callouts, see the pcre2callout documentation.

NON-PRINTING CHARACTERS

When pcre2test is outputting text in the compiled version of a pattern, bytes other than 32-126 are always treated as non-printing characters and are therefore shown as hex escapes.

When pcre2test is outputting text that is a matched part of a subject string, it behaves in the same way, unless a different locale has been set for the pattern (using the locale modifier). In this case, the isprint() function is used to distinguish printing and non-printing characters.

SAVING AND RESTORING COMPILED PATTERNS

It is possible to save compiled patterns on disc or elsewhere, and reload them later, subject to a number of restrictions. JIT data cannot be saved. The host on which the patterns are reloaded must be running the same version of PCRE2, with the same code unit width, and must also have the same endianness, pointer width and PCRE2_SIZE type. Before compiled patterns can be saved they must be serialized, that is, converted to a stream of bytes. A single byte stream may contain any number of compiled patterns, but they must all use the same character tables. A single copy of the tables is included in the byte stream (its size is 1088 bytes).

The functions whose names begin with pcre2_serialize_ are used for serializing and de-serializing. They are described in the pcre2serialize documentation. In this section we describe the features of pcre2test that can be used to test these functions.

Note that "serialization" in PCRE2 does not convert compiled patterns to an abstract format like Java or .NET. It just makes a reloadable byte code stream. Hence the restrictions on reloading mentioned above.

In pcre2test, when a pattern with the push modifier is successfully compiled, it is pushed onto a stack of compiled patterns, and pcre2test expects the next line to contain a new pattern (or command) instead of a subject line. By contrast, the pushcopy modifier causes a copy of the compiled pattern to be stacked, leaving the original available for immediate matching. By using push and/or pushcopy, a number of patterns can be compiled and retained. These modifiers are incompatible with posix, and control modifiers that act at match time are ignored (with a message) for the stacked patterns. The jitverify modifier applies only at compile time.

The command

  #save <filename>
causes all the stacked patterns to be serialized and the result written to the named file. Afterwards, all the stacked patterns are freed. The command
  #load <filename>
reads the data in the file, and then arranges for it to be de-serialized, with the resulting compiled patterns added to the pattern stack. The pattern on the top of the stack can be retrieved by the #pop command, which must be followed by lines of subjects that are to be matched with the pattern, terminated as usual by an empty line or end of file. This command may be followed by a modifier list containing only control modifiers that act after a pattern has been compiled. In particular, hex, posix, posix_nosub, push, and pushcopy are not allowed, nor are any option-setting modifiers. The JIT modifiers are, however, permitted. Here is an example that saves and reloads two patterns.
  /abc/push
  /xyz/push
  #save tempfile
  #load tempfile
  #pop info
  xyz

  #pop jit,bincode
  abc
If jitverify is used with #pop, it does not automatically imply jit, which is different behaviour from when it is used on a pattern.

The #popcopy command is analogous to the pushcopy modifier in that it makes current a copy of the topmost stack pattern, leaving the original still on the stack.

SEE ALSO

pcre2(3), pcre2api(3), pcre2callout(3), pcre2jit(3), pcre2matching(3), pcre2partial(3), pcre2pattern(3), pcre2serialize(3).

AUTHOR

Philip Hazel
Retired from University Computing Service
Cambridge, England.

REVISION

Last updated: 12 October 2025
Copyright © 1997-2024 University of Cambridge.

Return to the PCRE2 index page.

================================================ FILE: doc/html/pcre2unicode.html ================================================ pcre2unicode specification

pcre2unicode man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.

UNICODE AND UTF SUPPORT

PCRE2 is normally built with Unicode support, though if you do not need it, you can build it without, in which case the library will be smaller. With Unicode support, PCRE2 has knowledge of Unicode character properties and can process strings of text in UTF-8, UTF-16, and UTF-32 format (depending on the code unit width), but this is not the default. Unless specifically requested, PCRE2 treats each code unit in a string as one character.

There are two ways of telling PCRE2 to switch to UTF mode, where characters may consist of more than one code unit and the range of values is constrained. The program can call pcre2_compile() with the PCRE2_UTF option, or the pattern may start with the sequence (*UTF). However, the latter facility can be locked out by the PCRE2_NEVER_UTF option. That is, the programmer can prevent the supplier of the pattern from switching to UTF mode.

Note that the PCRE2_MATCH_INVALID_UTF option (see below) forces PCRE2_UTF to be set.

In UTF mode, both the pattern and any subject strings that are matched against it are treated as UTF strings instead of strings of individual one-code-unit characters. There are also some other changes to the way characters are handled, as documented below.

UNICODE PROPERTY SUPPORT

When PCRE2 is built with Unicode support, the escape sequences \p{..}, \P{..}, and \X can be used. This is not dependent on the PCRE2_UTF setting. The Unicode properties that can be tested are a subset of those that Perl supports. Currently they are limited to the general category properties such as Lu for an upper case letter or Nd for a decimal number, the derived properties Any and Lc (synonym L&), the Unicode script names such as Arabic or Han, Bidi_Class, Bidi_Control, and a few binary properties.

The full lists are given in the pcre2pattern and pcre2syntax documentation. In general, only the short names for properties are supported. For example, \p{L} matches a letter. Its longer synonym, \p{Letter}, is not supported. Furthermore, in Perl, many properties may optionally be prefixed by "Is", for compatibility with Perl 5.6. PCRE2 does not support this.

WIDE CHARACTERS AND UTF MODES

Code points less than 256 can be specified in patterns by either braced or unbraced hexadecimal escape sequences (for example, \x{b3} or \xb3). Larger values have to use braced sequences. Unbraced octal code points up to \777 are also recognized; larger ones can be coded using \o{...}.

The escape sequence \N{U+<hex digits>} is recognized as another way of specifying a Unicode character by code point in a UTF mode. It is not allowed in non-UTF mode.

In UTF mode, repeat quantifiers apply to complete UTF characters, not to individual code units.

In UTF mode, the dot metacharacter matches one UTF character instead of a single code unit.

In UTF mode, capture group names are not restricted to ASCII, and may contain any Unicode letters and decimal digits, as well as underscore.

The escape sequence \C can be used to match a single code unit in UTF mode, but its use can lead to some strange effects because it breaks up multi-unit characters (see the description of \C in the pcre2pattern documentation). For this reason, there is a build-time option that disables support for \C completely. There is also a less draconian compile-time option for locking out the use of \C when a pattern is compiled.

The use of \C is not supported by the alternative matching function pcre2_dfa_match() when in UTF-8 or UTF-16 mode, that is, when a character may consist of more than one code unit. The use of \C in these modes provokes a match-time error. Also, the JIT optimization does not support \C in these modes. If JIT optimization is requested for a UTF-8 or UTF-16 pattern that contains \C, it will not succeed, and so when pcre2_match() is called, the matching will be carried out by the interpretive function.

The character escapes \b, \B, \d, \D, \s, \S, \w, and \W correctly test characters of any code value, but, by default, the characters that PCRE2 recognizes as digits, spaces, or word characters remain the same set as in non-UTF mode, all with code points less than 256. This remains true even when PCRE2 is built to include Unicode support, because to do otherwise would slow down matching in many common cases. Note that this also applies to \b and \B, because they are defined in terms of \w and \W. If you want to test for a wider sense of, say, "digit", you can use explicit Unicode property tests such as \p{Nd}. Alternatively, if you set the PCRE2_UCP option, the way that the character escapes work is changed so that Unicode properties are used to determine which characters match, though there are some options that suppress this for individual escapes. For details see the section on generic character types in the pcre2pattern documentation.

Like the escapes, characters that match the POSIX named character classes are all low-valued characters unless the PCRE2_UCP option is set, but there is an option to override this.

In contrast to the character escapes and character classes, the special horizontal and vertical white space escapes (\h, \H, \v, and \V) do match all the appropriate Unicode characters, whether or not PCRE2_UCP is set.

UNICODE CASE-EQUIVALENCE

If either PCRE2_UTF or PCRE2_UCP is set, upper/lower case processing makes use of Unicode properties except for characters whose code points are less than 128 and that have at most two case-equivalent values. For these, a direct table lookup is used for speed. A few Unicode characters such as Greek sigma have more than two code points that are case-equivalent, and these are treated specially. Setting PCRE2_UCP without PCRE2_UTF allows Unicode-style case processing for non-UTF character encodings such as UCS-2.

There are two ASCII characters (S and K) that, in addition to their ASCII lower case equivalents, have a non-ASCII one as well (long S and Kelvin sign). Recognition of these non-ASCII characters as case-equivalent to their ASCII counterparts can be disabled by setting the PCRE2_EXTRA_CASELESS_RESTRICT option. When this is set, all characters in a case equivalence must either be ASCII or non-ASCII; there can be no mixing.

    Without PCRE2_EXTRA_CASELESS_RESTRICT:
      'k' = 'K' = U+212A (Kelvin sign)
      's' = 'S' = U+017F (long S)
    With PCRE2_EXTRA_CASELESS_RESTRICT:
      'k' = 'K'
      U+212A (Kelvin sign)  only case-equivalent to itself
      's' = 'S'
      U+017F (long S)       only case-equivalent to itself

One language family, Turkish and Azeri, has its own case-insensitivity rules, which can be selected by setting PCRE2_EXTRA_TURKISH_CASING. This alters the behaviour of the 'i', 'I', U+0130 (capital I with dot above), and U+0131 (small dotless i) characters.

    Without PCRE2_EXTRA_TURKISH_CASING:
      'i' = 'I'
      U+0130 (capital I with dot above)  only case-equivalent to itself
      U+0131 (small dotless i)           only case-equivalent to itself
    With PCRE2_EXTRA_TURKISH_CASING:
      'i' = U+0130 (capital I with dot above)
      U+0131 (small dotless i) = 'I'

It is not allowed to specify both PCRE2_EXTRA_CASELESS_RESTRICT and PCRE2_EXTRA_TURKISH_CASING together.

From release 10.45 the Unicode letter properties Lu (upper case), Ll (lower case), and Lt (title case) are all treated as Lc (cased letter) when caseless matching is set by the PCRE2_CASELESS option or (?i) within the pattern.

SCRIPT RUNS

The pattern constructs (*script_run:...) and (*atomic_script_run:...), with synonyms (*sr:...) and (*asr:...), verify that the string matched within the parentheses is a script run. In concept, a script run is a sequence of characters that are all from the same Unicode script. However, because some scripts are commonly used together, and because some diacritical and other marks are used with multiple scripts, it is not that simple.

Every Unicode character has a Script property, mostly with a value corresponding to the name of a script, such as Latin, Greek, or Cyrillic. There are also three special values:

"Unknown" is used for code points that have not been assigned, and also for the surrogate code points. In the PCRE2 32-bit library, characters whose code points are greater than the Unicode maximum (U+10FFFF), which are accessible only in non-UTF mode, are assigned the Unknown script.

"Common" is used for characters that are used with many scripts. These include punctuation, emoji, mathematical, musical, and currency symbols, and the ASCII digits 0 to 9.

"Inherited" is used for characters such as diacritical marks that modify a previous character. These are considered to take on the script of the character that they modify.

Some Inherited characters are used with many scripts, but many of them are only normally used with a small number of scripts. For example, U+102E0 (Coptic Epact thousands mark) is used only with Arabic and Coptic. In order to make it possible to check this, a Unicode property called Script Extension exists. Its value is a list of scripts that apply to the character. For the majority of characters, the list contains just one script, the same one as the Script property. However, for characters such as U+102E0 more than one Script is listed. There are also some Common characters that have a single, non-Common script in their Script Extension list.

The next section describes the basic rules for deciding whether a given string of characters is a script run. Note, however, that there are some special cases involving the Chinese Han script, and an additional constraint for decimal digits. These are covered in subsequent sections.

Basic script run rules

A string that is less than two characters long is a script run. This is the only case in which an Unknown character can be part of a script run. Longer strings are checked using only the Script Extensions property, not the basic Script property.

If a character's Script Extension property is the single value "Inherited", it is always accepted as part of a script run. This is also true for the property "Common", subject to the checking of decimal digits described below. All the remaining characters in a script run must have at least one script in common in their Script Extension lists. In set-theoretic terminology, the intersection of all the sets of scripts must not be empty.

A simple example is an Internet name such as "google.com". The letters are all in the Latin script, and the dot is Common, so this string is a script run. However, the Cyrillic letter "o" looks exactly the same as the Latin "o"; a string that looks the same, but with Cyrillic "o"s is not a script run.

More interesting examples involve characters with more than one script in their Script Extension. Consider the following characters:

  U+060C  Arabic comma
  U+06D4  Arabic full stop
The first has the Script Extension list Arabic, Hanifi Rohingya, Syriac, and Thaana; the second has just Arabic and Hanifi Rohingya. Both of them could appear in script runs of either Arabic or Hanifi Rohingya. The first could also appear in Syriac or Thaana script runs, but the second could not.

The Chinese Han script

The Chinese Han script is commonly used in conjunction with other scripts for writing certain languages. Japanese uses the Hiragana and Katakana scripts together with Han; Korean uses Hangul and Han; Taiwanese Mandarin uses Bopomofo and Han. These three combinations are treated as special cases when checking script runs and are, in effect, "virtual scripts". Thus, a script run may contain a mixture of Hiragana, Katakana, and Han, or a mixture of Hangul and Han, or a mixture of Bopomofo and Han, but not, for example, a mixture of Hangul and Bopomofo and Han. PCRE2 (like Perl) follows Unicode's Technical Standard 39 ("Unicode Security Mechanisms", http://unicode.org/reports/tr39/) in allowing such mixtures.

Decimal digits

Unicode contains many sets of 10 decimal digits in different scripts, and some scripts (including the Common script) contain more than one set. Some of these decimal digits are visually indistinguishable from the common ASCII digits. In addition to the script checking described above, if a script run contains any decimal digits, they must all come from the same set of 10 adjacent characters.

VALIDITY OF UTF STRINGS

When the PCRE2_UTF option is set, the strings passed as patterns and subjects are (by default) checked for validity on entry to the relevant functions. If an invalid UTF string is passed, a negative error code is returned. The code unit offset to the offending character can be extracted from the match data block by calling pcre2_get_startchar(), which is used for this purpose after a UTF error.

In some situations, you may already know that your strings are valid, and therefore want to skip these checks in order to improve performance, for example in the case of a long subject string that is being scanned repeatedly. If you set the PCRE2_NO_UTF_CHECK option at compile time or at match time, PCRE2 assumes that the pattern or subject it is given (respectively) contains only valid UTF code unit sequences.

If you pass an invalid UTF string when PCRE2_NO_UTF_CHECK is set, the result is undefined and your program may crash or loop indefinitely or give incorrect results. There is, however, one mode of matching that can handle invalid UTF subject strings. This is enabled by passing PCRE2_MATCH_INVALID_UTF to pcre2_compile() and is discussed below in the next section. The rest of this section covers the case when PCRE2_MATCH_INVALID_UTF is not set.

Passing PCRE2_NO_UTF_CHECK to pcre2_compile() just disables the UTF check for the pattern; it does not also apply to subject strings. If you want to disable the check for a subject string you must pass this same option to pcre2_match() or pcre2_dfa_match().

UTF-16 and UTF-32 strings can indicate their endianness by a special code known as a byte-order mark (BOM). The PCRE2 functions do not handle this, expecting strings to be in host byte order.

Unless PCRE2_NO_UTF_CHECK is set, a UTF string is checked before any other processing takes place. In the case of pcre2_match() and pcre2_dfa_match() calls with a non-zero starting offset, the check is applied only to that part of the subject that could be inspected during matching, and there is a check that the starting offset points to the first code unit of a character or to the end of the subject. If there are no lookbehind assertions in the pattern, the check starts at the starting offset. Otherwise, it starts at the length of the longest lookbehind before the starting offset, or at the start of the subject if there are not that many characters before the starting offset. Note that the sequences \b and \B are one-character lookbehinds.

In addition to checking the format of the string, there is a check to ensure that all code points lie in the range U+0 to U+10FFFF, excluding the surrogate area. The so-called "non-character" code points are not excluded because Unicode corrigendum #9 makes it clear that they should not be.

Characters in the "Surrogate Area" of Unicode are reserved for use by UTF-16, where they are used in pairs to encode code points with values greater than 0xFFFF. The code points that are encoded by UTF-16 pairs are available independently in the UTF-8 and UTF-32 encodings. (In other words, the whole surrogate thing is a fudge for UTF-16 which unfortunately messes up UTF-8 and UTF-32.)

Setting PCRE2_NO_UTF_CHECK at compile time does not disable the error that is given if an escape sequence for an invalid Unicode code point is encountered in the pattern. If you want to allow escape sequences such as \x{d800} (a surrogate code point) you can set the PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES extra option. However, this is possible only in UTF-8 and UTF-32 modes, because these values are not representable in UTF-16.

Errors in UTF-8 strings

The following negative error codes are given for invalid UTF-8 strings:

  PCRE2_ERROR_UTF8_ERR1
  PCRE2_ERROR_UTF8_ERR2
  PCRE2_ERROR_UTF8_ERR3
  PCRE2_ERROR_UTF8_ERR4
  PCRE2_ERROR_UTF8_ERR5
The string ends with a truncated UTF-8 character; the code specifies how many bytes are missing (1 to 5). Although RFC 3629 restricts UTF-8 characters to be no longer than 4 bytes, the encoding scheme (originally defined by RFC 2279) allows for up to 6 bytes, and this is checked first; hence the possibility of 4 or 5 missing bytes.
  PCRE2_ERROR_UTF8_ERR6
  PCRE2_ERROR_UTF8_ERR7
  PCRE2_ERROR_UTF8_ERR8
  PCRE2_ERROR_UTF8_ERR9
  PCRE2_ERROR_UTF8_ERR10
The two most significant bits of the 2nd, 3rd, 4th, 5th, or 6th byte of the character do not have the binary value 0b10 (that is, either the most significant bit is 0, or the next bit is 1).
  PCRE2_ERROR_UTF8_ERR11
  PCRE2_ERROR_UTF8_ERR12
A character that is valid by the RFC 2279 rules is either 5 or 6 bytes long; these code points are excluded by RFC 3629.
  PCRE2_ERROR_UTF8_ERR13
A 4-byte character has a value greater than 0x10ffff; these code points are excluded by RFC 3629.
  PCRE2_ERROR_UTF8_ERR14
A 3-byte character has a value in the range 0xd800 to 0xdfff; this range of code points is reserved by RFC 3629 for use with UTF-16, and so is excluded from UTF-8.
  PCRE2_ERROR_UTF8_ERR15
  PCRE2_ERROR_UTF8_ERR16
  PCRE2_ERROR_UTF8_ERR17
  PCRE2_ERROR_UTF8_ERR18
  PCRE2_ERROR_UTF8_ERR19
A 2-, 3-, 4-, 5-, or 6-byte character is "overlong", that is, it codes for a value that can be represented by fewer bytes, which is invalid. For example, the two bytes 0xc0, 0xae give the value 0x2e, whose correct coding uses just one byte.
  PCRE2_ERROR_UTF8_ERR20
The two most significant bits of the first byte of a character have the binary value 0b10 (that is, the most significant bit is 1 and the second is 0). Such a byte can only validly occur as the second or subsequent byte of a multi-byte character.
  PCRE2_ERROR_UTF8_ERR21
The first byte of a character has the value 0xfe or 0xff. These values can never occur in a valid UTF-8 string.

Errors in UTF-16 strings

The following negative error codes are given for invalid UTF-16 strings:

  PCRE2_ERROR_UTF16_ERR1  Missing low surrogate at end of string
  PCRE2_ERROR_UTF16_ERR2  Invalid low surrogate follows high surrogate
  PCRE2_ERROR_UTF16_ERR3  Isolated low surrogate

Errors in UTF-32 strings

The following negative error codes are given for invalid UTF-32 strings:

  PCRE2_ERROR_UTF32_ERR1  Surrogate character (0xd800 to 0xdfff)
  PCRE2_ERROR_UTF32_ERR2  Code point is greater than 0x10ffff

MATCHING IN INVALID UTF STRINGS

You can run pattern matches on subject strings that may contain invalid UTF sequences if you call pcre2_compile() with the PCRE2_MATCH_INVALID_UTF option. This is supported by pcre2_match(), including JIT matching, but not by pcre2_dfa_match(). When PCRE2_MATCH_INVALID_UTF is set, it forces PCRE2_UTF to be set as well. Note, however, that the pattern itself must be a valid UTF string.

If you do not set PCRE2_MATCH_INVALID_UTF when calling pcre2_compile(), and you are not certain that your subject strings are valid UTF sequences, you should not make use of the JIT "fast path" function pcre2_jit_match() because it bypasses sanity checks, including the one for UTF validity. An invalid string may cause undefined behaviour, including looping, crashing, or giving the wrong answer.

Setting PCRE2_MATCH_INVALID_UTF does not affect what pcre2_compile() generates, but if pcre2_jit_compile() is subsequently called, it does generate different code. If JIT is not used, the option affects the behaviour of the interpretive code in pcre2_match(). When PCRE2_MATCH_INVALID_UTF is set at compile time, PCRE2_NO_UTF_CHECK is ignored at match time.

In this mode, an invalid code unit sequence in the subject never matches any pattern item. It does not match dot, it does not match \p{Any}, it does not even match negative items such as [^X]. A lookbehind assertion fails if it encounters an invalid sequence while moving the current point backwards. In other words, an invalid UTF code unit sequence acts as a barrier which no match can cross.

You can also think of this as the subject being split up into fragments of valid UTF, delimited internally by invalid code unit sequences. The pattern is matched fragment by fragment. The result of a successful match, however, is given as code unit offsets in the entire subject string in the usual way. There are a few points to consider:

The internal boundaries are not interpreted as the beginnings or ends of lines and so do not match circumflex or dollar characters in the pattern.

If pcre2_match() is called with an offset that points to an invalid UTF-sequence, that sequence is skipped, and the match starts at the next valid UTF character, or the end of the subject.

At internal fragment boundaries, \b and \B behave in the same way as at the beginning and end of the subject. For example, a sequence such as \bWORD\b would match an instance of WORD that is surrounded by invalid UTF code units.

Using PCRE2_MATCH_INVALID_UTF, an application can run matches on arbitrary data, knowing that any matched strings that are returned are valid UTF. This can be useful when searching for UTF text in executable or other binary files.

Note, however, that the 16-bit and 32-bit PCRE2 libraries process strings as sequences of uint16_t or uint32_t code points. They cannot find valid UTF sequences within an arbitrary string of bytes unless such sequences are suitably aligned.

AUTHOR

Philip Hazel
Retired from University Computing Service
Cambridge, England.

REVISION

Last updated: 27 November 2024
Copyright © 1997-2024 University of Cambridge.

Return to the PCRE2 index page.

================================================ FILE: doc/index.html.src ================================================ PCRE2 specification

Perl-compatible Regular Expressions (revised API: PCRE2)

The HTML documentation for PCRE2 consists of a number of pages that are listed below in alphabetical order. If you are new to PCRE2, please read the first one first.

pcre2 Introductory page
pcre2-config Information about the installation configuration
pcre2api PCRE2's native API
pcre2build Building PCRE2
pcre2callout The callout facility
pcre2compat Compatibility with Perl
pcre2convert Experimental foreign pattern conversion functions
pcre2demo A demonstration C program that uses the PCRE2 library
pcre2grep The pcre2grep command
pcre2jit Discussion of the just-in-time optimization support
pcre2limits Details of size and other limits
pcre2matching Discussion of the two matching algorithms
pcre2partial Using PCRE2 for partial matching
pcre2pattern Specification of the regular expressions supported by PCRE2
pcre2perform Some comments on performance
pcre2posix The POSIX API to the PCRE2 8-bit library
pcre2sample Discussion of the pcre2demo program
pcre2serialize Serializing functions for saving precompiled patterns
pcre2syntax Syntax quick-reference summary
pcre2test The pcre2test command for testing PCRE2
pcre2unicode Discussion of Unicode and UTF-8/UTF-16/UTF-32 support

There are also individual pages that summarize the interface for each function in the library.

pcre2_callout_enumerate Enumerate callouts in a compiled pattern
pcre2_code_copy Copy a compiled pattern
pcre2_code_copy_with_tables Copy a compiled pattern and its character tables
pcre2_code_free Free a compiled pattern
pcre2_compile Compile a regular expression pattern
pcre2_compile_context_copy Copy a compile context
pcre2_compile_context_create Create a compile context
pcre2_compile_context_free Free a compile context
pcre2_config Show build-time related configuration options
pcre2_convert_context_copy Copy a convert context
pcre2_convert_context_create Create a convert context
pcre2_convert_context_free Free a convert context
pcre2_converted_pattern_free Free converted foreign pattern
pcre2_dfa_match Match a compiled pattern to a subject string (DFA algorithm; not Perl compatible)
pcre2_general_context_copy Copy a general context
pcre2_general_context_create Create a general context
pcre2_general_context_free Free a general context
pcre2_get_error_message Get textual error message for error number
pcre2_get_mark Get a (*MARK) name
pcre2_get_match_data_size Get the size of a match data block
pcre2_get_ovector_count Get the ovector count
pcre2_get_ovector_pointer Get a pointer to the ovector
pcre2_get_startchar Get the starting character offset
pcre2_jit_compile Process a compiled pattern with the JIT compiler
pcre2_jit_free_unused_memory Free unused JIT memory
pcre2_jit_match Fast path interface to JIT matching
pcre2_jit_stack_assign Assign stack for JIT matching
pcre2_jit_stack_create Create a stack for JIT matching
pcre2_jit_stack_free Free a JIT matching stack
pcre2_maketables Build character tables in current locale
pcre2_maketables_free Free character tables
pcre2_match Match a compiled pattern to a subject string (Perl compatible)
pcre2_match_context_copy Copy a match context
pcre2_match_context_create Create a match context
pcre2_match_context_free Free a match context
pcre2_match_data_create Create a match data block
pcre2_match_data_create_from_pattern Create a match data block getting size from pattern
pcre2_match_data_free Free a match data block
pcre2_next_match Get the match parameters for the next match
pcre2_pattern_convert Experimental foreign pattern converter
pcre2_pattern_info Extract information about a pattern
pcre2_serialize_decode Decode serialized compiled patterns
pcre2_serialize_encode Serialize compiled patterns for save/restore
pcre2_serialize_free Free serialized compiled patterns
pcre2_serialize_get_number_of_codes Get number of serialized compiled patterns
pcre2_set_bsr Set \R convention
pcre2_set_callout Set up a callout function
pcre2_set_character_tables Set character tables
pcre2_set_compile_extra_options Set compile time extra options
pcre2_set_compile_recursion_guard Set up a compile recursion guard function
pcre2_set_depth_limit Set the match backtracking depth limit
pcre2_set_glob_escape Set glob escape character
pcre2_set_glob_separator Set glob separator character
pcre2_set_heap_limit Set the match backtracking heap limit
pcre2_set_match_limit Set the match limit
pcre2_set_max_pattern_compiled_length Set the maximum length of a compiled pattern
pcre2_set_max_pattern_length Set the maximum length of a pattern
pcre2_set_max_varlookbehind Set the maximum match length for a variable-length lookbehind
pcre2_set_newline Set the newline convention
pcre2_set_offset_limit Set the offset limit
pcre2_set_optimize Set an optimization directive
pcre2_set_parens_nest_limit Set the parentheses nesting limit
pcre2_set_recursion_limit Obsolete: use pcre2_set_depth_limit
pcre2_set_recursion_memory_management Obsolete function that (from 10.30 onwards) does nothing
pcre2_set_substitute_callout Set a substitution callout function
pcre2_set_substitute_case_callout Set a substitution case callout function
pcre2_substitute Match a compiled pattern to a subject string and do substitutions
pcre2_substring_copy_byname Extract named substring into given buffer
pcre2_substring_copy_bynumber Extract numbered substring into given buffer
pcre2_substring_free Free extracted substring
pcre2_substring_get_byname Extract named substring into new memory
pcre2_substring_get_bynumber Extract numbered substring into new memory
pcre2_substring_length_byname Find length of named substring
pcre2_substring_length_bynumber Find length of numbered substring
pcre2_substring_list_free Free list of extracted substrings
pcre2_substring_list_get Extract all substrings into new memory
pcre2_substring_nametable_scan Find table entries for given string name
pcre2_substring_number_from_name Convert captured string name to number
================================================ FILE: doc/pcre2-config.1 ================================================ .TH PCRE2-CONFIG 1 "22 February 2025" "PCRE2 10.48-DEV" .SH NAME pcre2-config - program to return PCRE2 configuration .SH SYNOPSIS .rs .sp .nf .B pcre2-config [--prefix] [--exec-prefix] [--version] .B " [--libs8] [--libs16] [--libs32] [--libs-posix]" .B " [--cflags] [--cflags-posix]" .fi . . .SH DESCRIPTION .rs .sp \fBpcre2-config\fP returns the configuration of the installed PCRE2 libraries and the options required to compile a program to use them. Some of the options apply only to the 8-bit, 16-bit, or 32-bit libraries, respectively, and are not available for libraries that have not been built. If an unavailable option is encountered, the "usage" information is output. . . .SH OPTIONS .rs .TP 10 \fB--prefix\fP Writes the directory prefix used in the PCRE2 installation for architecture-independent files (\fI/usr\fP on many systems, \fI/usr/local\fP on some systems) to the standard output. .TP 10 \fB--exec-prefix\fP Writes the directory prefix used in the PCRE2 installation for architecture-dependent files (normally the same as \fB--prefix\fP) to the standard output. .TP 10 \fB--version\fP Writes the version number of the installed PCRE2 libraries to the standard output. .TP 10 \fB--libs8\fP Writes to the standard output the command line options required to link with the 8-bit PCRE2 library (\fB-lpcre2-8\fP on many systems). .TP 10 \fB--libs16\fP Writes to the standard output the command line options required to link with the 16-bit PCRE2 library (\fB-lpcre2-16\fP on many systems). .TP 10 \fB--libs32\fP Writes to the standard output the command line options required to link with the 32-bit PCRE2 library (\fB-lpcre2-32\fP on many systems). .TP 10 \fB--libs-posix\fP Writes to the standard output the command line options required to link with PCRE2's POSIX API wrapper library (\fB-lpcre2-posix\fP \fB-lpcre2-8\fP on many systems). 
.TP 10 \fB--cflags\fP Writes to the standard output the command line options required to compile files that use PCRE2 (this may include some \fB-I\fP options, but is blank on many systems). .TP 10 \fB--cflags-posix\fP Writes to the standard output the command line options required to compile files that use PCRE2's POSIX API wrapper library (this may include some \fB-I\fP options, but is blank on many systems). . . .SH "SEE ALSO" .rs .sp \fBpcre2(3)\fP . . .SH AUTHOR .rs .sp This manual page was originally written by Mark Baker for the Debian GNU/Linux system. It has been subsequently revised as a generic PCRE2 man page. . . .SH REVISION .rs .sp .nf Last updated: 22 February 2025 .fi ================================================ FILE: doc/pcre2-config.txt ================================================ PCRE2-CONFIG(1) General Commands Manual PCRE2-CONFIG(1) NAME pcre2-config - program to return PCRE2 configuration SYNOPSIS pcre2-config [--prefix] [--exec-prefix] [--version] [--libs8] [--libs16] [--libs32] [--libs-posix] [--cflags] [--cflags-posix] DESCRIPTION pcre2-config returns the configuration of the installed PCRE2 libraries and the options required to compile a program to use them. Some of the options apply only to the 8-bit, 16-bit, or 32-bit libraries, respec- tively, and are not available for libraries that have not been built. If an unavailable option is encountered, the "usage" information is output. OPTIONS --prefix Writes the directory prefix used in the PCRE2 installation for architecture-independent files (/usr on many systems, /usr/local on some systems) to the standard output. --exec-prefix Writes the directory prefix used in the PCRE2 installation for architecture-dependent files (normally the same as --pre- fix) to the standard output. --version Writes the version number of the installed PCRE2 libraries to the standard output. 
--libs8 Writes to the standard output the command line options re- quired to link with the 8-bit PCRE2 library (-lpcre2-8 on many systems). --libs16 Writes to the standard output the command line options re- quired to link with the 16-bit PCRE2 library (-lpcre2-16 on many systems). --libs32 Writes to the standard output the command line options re- quired to link with the 32-bit PCRE2 library (-lpcre2-32 on many systems). --libs-posix Writes to the standard output the command line options re- quired to link with PCRE2's POSIX API wrapper library (-lpcre2-posix -lpcre2-8 on many systems). --cflags Writes to the standard output the command line options re- quired to compile files that use PCRE2 (this may include some -I options, but is blank on many systems). --cflags-posix Writes to the standard output the command line options re- quired to compile files that use PCRE2's POSIX API wrapper library (this may include some -I options, but is blank on many systems). SEE ALSO pcre2(3) AUTHOR This manual page was originally written by Mark Baker for the Debian GNU/Linux system. It has been subsequently revised as a generic PCRE2 man page. REVISION Last updated: 22 February 2025 PCRE2 10.48-DEV 22 February 2025 PCRE2-CONFIG(1) ================================================ FILE: doc/pcre2.3 ================================================ .TH PCRE2 3 "22 February 2025" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH INTRODUCTION .rs .sp PCRE2 is the name used for a revised API for the PCRE library, which is a set of functions, written in C, that implement regular expression pattern matching using the same syntax and semantics as Perl, with just a few differences. After nearly two decades, the limitations of the original API were making development increasingly difficult. 
The new API is more extensible, and it was simplified by abolishing the separate "study" optimizing function; in PCRE2, patterns are automatically optimized where possible. Since forking from PCRE1, the code has been extensively refactored and new features introduced. The old library is now obsolete and is no longer maintained. .P As well as Perl-style regular expression patterns, some features that appeared in Python and the original PCRE before they appeared in Perl are available using the Python syntax. There is also support for some .NET and Oniguruma syntax items, and there are options for requesting minor changes that give better ECMAScript (JavaScript) compatibility. .P The source code for PCRE2 can be compiled to support strings of 8-bit, 16-bit, or 32-bit code units, which means that up to three separate libraries may be installed, one for each code unit size. The size of a code unit is not related to the bit size of the underlying hardware. In a 64-bit environment that also supports 32-bit applications, versions of PCRE2 that are compiled in both 64-bit and 32-bit modes may be needed. .P The original work to extend PCRE to 16-bit and 32-bit code units was done by Zoltan Herczeg and Christian Persch, respectively. In all three cases, strings can be interpreted either as one character per code unit, or as UTF-encoded Unicode, with support for Unicode general category properties. Unicode support is optional at build time (but is the default). However, processing strings as UTF code units must be enabled explicitly at run time. The version of Unicode in use can be discovered by running .sp pcre2test -C .P The three libraries contain identical sets of functions, with names ending in _8, _16, or _32, respectively (for example, \fBpcre2_compile_8()\fP). 
However, by defining PCRE2_CODE_UNIT_WIDTH to be 8, 16, or 32, a program that uses just one code unit width can be written using generic names such as \fBpcre2_compile()\fP, and the documentation is written assuming that this is the case. .P In addition to the Perl-compatible matching function, PCRE2 contains an alternative function that matches the same compiled patterns in a different way. In certain circumstances, the alternative function has some advantages. For a discussion of the two matching algorithms, see the .\" HREF \fBpcre2matching\fP .\" page. .P Details of exactly which Perl regular expression features are and are not supported by PCRE2 are given in separate documents. See the .\" HREF \fBpcre2pattern\fP .\" and .\" HREF \fBpcre2compat\fP .\" pages. There is a syntax summary in the .\" HREF \fBpcre2syntax\fP .\" page. .P Some features of PCRE2 can be included, excluded, or changed when the library is built. The .\" HREF \fBpcre2_config()\fP .\" function makes it possible for a client to discover which features are available. The features themselves are described in the .\" HREF \fBpcre2build\fP .\" page. Documentation about building PCRE2 for various operating systems can be found in the .\" HTML .\" \fBREADME\fP .\" and .\" HTML .\" \fBNON-AUTOTOOLS-BUILD\fP .\" files in the source distribution. .P The libraries contain a number of undocumented internal functions and data tables that are used by more than one of the exported external functions, but which are not intended for use by external callers. Their names all begin with "_pcre2", which hopefully will not provoke any name clashes. In some environments, it is possible to control which external symbols are exported when a shared library is built, and in these cases the undocumented symbols are not exported. . .
.SH "SECURITY CONSIDERATIONS" .rs .sp If you are using PCRE2 in a non-UTF application that permits users to supply arbitrary patterns for compilation, you should be aware of a feature that allows users to turn on UTF support from within a pattern. For example, an 8-bit pattern that begins with "(*UTF)" turns on UTF-8 mode, which interprets patterns and subjects as strings of UTF-8 code units instead of individual 8-bit characters. This causes both the pattern and any data against which it is matched to be checked for UTF-8 validity. If the data string is very long, such a check might use sufficiently many resources as to cause your application to lose performance. .P One way of guarding against this possibility is to use the \fBpcre2_pattern_info()\fP function to check the compiled pattern's options for PCRE2_UTF. Alternatively, you can set the PCRE2_NEVER_UTF option when calling \fBpcre2_compile()\fP. This causes a compile time error if the pattern contains a UTF-setting sequence. .P The use of Unicode properties for character types such as \ed can also be enabled from within the pattern, by specifying "(*UCP)". This feature can be disallowed by setting the PCRE2_NEVER_UCP option. .P If your application is one that supports UTF, be aware that validity checking can take time. If the same data string is to be matched many times, you can use the PCRE2_NO_UTF_CHECK option for the second and subsequent matches to avoid running redundant checks. .P The use of the \eC escape sequence in a UTF-8 or UTF-16 pattern can lead to problems, because it may leave the current matching point in the middle of a multi-code-unit character. The PCRE2_NEVER_BACKSLASH_C option can be used by an application to lock out the use of \eC, causing a compile-time error if it is encountered. It is also possible to build PCRE2 with the use of \eC permanently disabled. 
.P Another way that performance can be hit is by running a pattern that has a very large search tree against a string that will never match. Nested unlimited repeats in a pattern are a common example. PCRE2 provides some protection against this: see the \fBpcre2_set_match_limit()\fP function in the .\" HREF \fBpcre2api\fP .\" page. There is a similar function called \fBpcre2_set_depth_limit()\fP that can be used to restrict the amount of memory that is used. . . .SH "USER DOCUMENTATION" .rs .sp The user documentation for PCRE2 comprises a number of different sections. In the "man" format, each of these is a separate "man page". In the HTML format, each is a separate page, linked from the index page. In the plain text format, the descriptions of the \fBpcre2grep\fP and \fBpcre2test\fP programs are in files called \fBpcre2grep.txt\fP and \fBpcre2test.txt\fP, respectively. The remaining sections, except for the \fBpcre2demo\fP section (which is a program listing), and the short pages for individual functions, are concatenated in \fBpcre2.txt\fP, for ease of searching. 
The sections are as follows: .sp pcre2 this document pcre2-config show PCRE2 installation configuration information pcre2api details of PCRE2's native C API pcre2build building PCRE2 pcre2callout details of the pattern callout feature pcre2compat discussion of Perl compatibility pcre2convert details of pattern conversion functions pcre2demo a demonstration C program that uses PCRE2 pcre2grep description of the \fBpcre2grep\fP command (8-bit only) pcre2jit discussion of just-in-time optimization support pcre2limits details of size and other limits pcre2matching discussion of the two matching algorithms pcre2partial details of the partial matching facility .\" JOIN pcre2pattern syntax and semantics of supported regular expression patterns pcre2perform discussion of performance issues pcre2posix the POSIX-compatible C API for the 8-bit library pcre2sample discussion of the pcre2demo program pcre2serialize details of pattern serialization pcre2syntax quick syntax reference pcre2test description of the \fBpcre2test\fP command pcre2unicode discussion of Unicode and UTF support .sp In the "man" and HTML formats, there is also a short page for each C library function, listing its arguments and results. . . .SH AUTHORS .rs .sp The current maintainers of PCRE2 are Nicholas Wilson and Zoltan Herczeg. .P PCRE2 was written by Philip Hazel, of the University Computing Service, Cambridge, England. Many others have also contributed. .P To contact the maintainers, please use the GitHub issues tracker or PCRE2 mailing list, as described at the project page: .\" HTML .\" https://github.com/PCRE2Project/pcre2 .\" . . .SH REVISION .rs .sp .nf Last updated: 22 February 2025 Copyright (c) 1997-2021 University of Cambridge. 
.fi ================================================ FILE: doc/pcre2.txt ================================================ ----------------------------------------------------------------------------- This file contains a concatenation of the PCRE2 man pages, converted to plain text format for ease of searching with a text editor, or for use on systems that do not have a man page processor. The small individual files that give synopses of each function in the library have not been included. Neither has the pcre2demo program. There are separate text files for the pcre2grep and pcre2test commands. ----------------------------------------------------------------------------- PCRE2(3) Library Functions Manual PCRE2(3) NAME PCRE2 - Perl-compatible regular expressions (revised API) INTRODUCTION PCRE2 is the name used for a revised API for the PCRE library, which is a set of functions, written in C, that implement regular expression pattern matching using the same syntax and semantics as Perl, with just a few differences. After nearly two decades, the limitations of the original API were making development increasingly difficult. The new API is more extensible, and it was simplified by abolishing the sepa- rate "study" optimizing function; in PCRE2, patterns are automatically optimized where possible. Since forking from PCRE1, the code has been extensively refactored and new features introduced. The old library is now obsolete and is no longer maintained. As well as Perl-style regular expression patterns, some features that appeared in Python and the original PCRE before they appeared in Perl are available using the Python syntax. There is also support for some .NET and Oniguruma syntax items, and there are options for requesting minor changes that give better ECMAScript (JavaScript) compatibility. 
The source code for PCRE2 can be compiled to support strings of 8-bit, 16-bit, or 32-bit code units, which means that up to three separate li- braries may be installed, one for each code unit size. The size of a code unit is not related to the bit size of the underlying hardware. In a 64-bit environment that also supports 32-bit applications, versions of PCRE2 that are compiled in both 64-bit and 32-bit modes may be needed. The original work to extend PCRE to 16-bit and 32-bit code units was done by Zoltan Herczeg and Christian Persch, respectively. In all three cases, strings can be interpreted either as one character per code unit, or as UTF-encoded Unicode, with support for Unicode general cate- gory properties. Unicode support is optional at build time (but is the default). However, processing strings as UTF code units must be enabled explicitly at run time. The version of Unicode in use can be discovered by running pcre2test -C The three libraries contain identical sets of functions, with names ending in _8, _16, or _32, respectively (for example, pcre2_com- pile_8()). However, by defining PCRE2_CODE_UNIT_WIDTH to be 8, 16, or 32, a program that uses just one code unit width can be written using generic names such as pcre2_compile(), and the documentation is written assuming that this is the case. In addition to the Perl-compatible matching function, PCRE2 contains an alternative function that matches the same compiled patterns in a dif- ferent way. In certain circumstances, the alternative function has some advantages. For a discussion of the two matching algorithms, see the pcre2matching page. Details of exactly which Perl regular expression features are and are not supported by PCRE2 are given in separate documents. See the pcre2pattern and pcre2compat pages. There is a syntax summary in the pcre2syntax page. Some features of PCRE2 can be included, excluded, or changed when the library is built. 
The pcre2_config() function makes it possible for a client to discover which features are available. The features them- selves are described in the pcre2build page. Documentation about build- ing PCRE2 for various operating systems can be found in the README and NON-AUTOTOOLS-BUILD files in the source distribution. The libraries contain a number of undocumented internal functions and data tables that are used by more than one of the exported external functions, but which are not intended for use by external callers. Their names all begin with "_pcre2", which hopefully will not provoke any name clashes. In some environments, it is possible to control which external symbols are exported when a shared library is built, and in these cases the undocumented symbols are not exported. SECURITY CONSIDERATIONS If you are using PCRE2 in a non-UTF application that permits users to supply arbitrary patterns for compilation, you should be aware of a feature that allows users to turn on UTF support from within a pattern. For example, an 8-bit pattern that begins with "(*UTF)" turns on UTF-8 mode, which interprets patterns and subjects as strings of UTF-8 code units instead of individual 8-bit characters. This causes both the pat- tern and any data against which it is matched to be checked for UTF-8 validity. If the data string is very long, such a check might use suf- ficiently many resources as to cause your application to lose perfor- mance. One way of guarding against this possibility is to use the pcre2_pat- tern_info() function to check the compiled pattern's options for PCRE2_UTF. Alternatively, you can set the PCRE2_NEVER_UTF option when calling pcre2_compile(). This causes a compile time error if the pat- tern contains a UTF-setting sequence. The use of Unicode properties for character types such as \d can also be enabled from within the pattern, by specifying "(*UCP)". This fea- ture can be disallowed by setting the PCRE2_NEVER_UCP option.
If your application is one that supports UTF, be aware that validity checking can take time. If the same data string is to be matched many times, you can use the PCRE2_NO_UTF_CHECK option for the second and subsequent matches to avoid running redundant checks. The use of the \C escape sequence in a UTF-8 or UTF-16 pattern can lead to problems, because it may leave the current matching point in the middle of a multi-code-unit character. The PCRE2_NEVER_BACKSLASH_C op- tion can be used by an application to lock out the use of \C, causing a compile-time error if it is encountered. It is also possible to build PCRE2 with the use of \C permanently disabled. Another way that performance can be hit is by running a pattern that has a very large search tree against a string that will never match. Nested unlimited repeats in a pattern are a common example. PCRE2 pro- vides some protection against this: see the pcre2_set_match_limit() function in the pcre2api page. There is a similar function called pcre2_set_depth_limit() that can be used to restrict the amount of mem- ory that is used. USER DOCUMENTATION The user documentation for PCRE2 comprises a number of different sec- tions. In the "man" format, each of these is a separate "man page". In the HTML format, each is a separate page, linked from the index page. In the plain text format, the descriptions of the pcre2grep and pcre2test programs are in files called pcre2grep.txt and pcre2test.txt, respectively. The remaining sections, except for the pcre2demo section (which is a program listing), and the short pages for individual func- tions, are concatenated in pcre2.txt, for ease of searching. 
The sec- tions are as follows: pcre2 this document pcre2-config show PCRE2 installation configuration information pcre2api details of PCRE2's native C API pcre2build building PCRE2 pcre2callout details of the pattern callout feature pcre2compat discussion of Perl compatibility pcre2convert details of pattern conversion functions pcre2demo a demonstration C program that uses PCRE2 pcre2grep description of the pcre2grep command (8-bit only) pcre2jit discussion of just-in-time optimization support pcre2limits details of size and other limits pcre2matching discussion of the two matching algorithms pcre2partial details of the partial matching facility pcre2pattern syntax and semantics of supported regular expression patterns pcre2perform discussion of performance issues pcre2posix the POSIX-compatible C API for the 8-bit library pcre2sample discussion of the pcre2demo program pcre2serialize details of pattern serialization pcre2syntax quick syntax reference pcre2test description of the pcre2test command pcre2unicode discussion of Unicode and UTF support In the "man" and HTML formats, there is also a short page for each C library function, listing its arguments and results. AUTHORS The current maintainers of PCRE2 are Nicholas Wilson and Zoltan Her- czeg. PCRE2 was written by Philip Hazel, of the University Computing Service, Cambridge, England. Many others have also contributed. To contact the maintainers, please use the GitHub issues tracker or PCRE2 mailing list, as described at the project page: https://github.com/PCRE2Project/pcre2 REVISION Last updated: 22 February 2025 Copyright (c) 1997-2021 University of Cambridge. PCRE2 10.48-DEV 22 February 2025 PCRE2(3) ------------------------------------------------------------------------------ PCRE2API(3) Library Functions Manual PCRE2API(3) NAME PCRE2 - Perl-compatible regular expressions (revised API) #include <pcre2.h> PCRE2 is a new API for PCRE, starting at release 10.0.
This document contains a description of all its native functions. See the pcre2 docu- ment for an overview of all the PCRE2 documentation. PCRE2 NATIVE API BASIC FUNCTIONS pcre2_code *pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE length, uint32_t options, int *errorcode, PCRE2_SIZE *erroroffset, pcre2_compile_context *ccontext); void pcre2_code_free(pcre2_code *code); pcre2_match_data *pcre2_match_data_create(uint32_t ovecsize, pcre2_general_context *gcontext); pcre2_match_data *pcre2_match_data_create_from_pattern( const pcre2_code *code, pcre2_general_context *gcontext); int pcre2_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length, PCRE2_SIZE startoffset, uint32_t options, pcre2_match_data *match_data, pcre2_match_context *mcontext); int pcre2_dfa_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length, PCRE2_SIZE startoffset, uint32_t options, pcre2_match_data *match_data, pcre2_match_context *mcontext, int *workspace, PCRE2_SIZE wscount); void pcre2_match_data_free(pcre2_match_data *match_data); PCRE2 NATIVE API AUXILIARY MATCH FUNCTIONS PCRE2_SPTR pcre2_get_mark(pcre2_match_data *match_data); PCRE2_SIZE pcre2_get_match_data_size(pcre2_match_data *match_data); PCRE2_SIZE pcre2_get_match_data_heapframes_size( pcre2_match_data *match_data); uint32_t pcre2_get_ovector_count(pcre2_match_data *match_data); PCRE2_SIZE *pcre2_get_ovector_pointer(pcre2_match_data *match_data); PCRE2_SIZE pcre2_get_startchar(pcre2_match_data *match_data); PCRE2 NATIVE API GENERAL CONTEXT FUNCTIONS pcre2_general_context *pcre2_general_context_create( void *(*private_malloc)(PCRE2_SIZE, void *), void (*private_free)(void *, void *), void *memory_data); pcre2_general_context *pcre2_general_context_copy( pcre2_general_context *gcontext); void pcre2_general_context_free(pcre2_general_context *gcontext); PCRE2 NATIVE API COMPILE CONTEXT FUNCTIONS pcre2_compile_context *pcre2_compile_context_create( pcre2_general_context *gcontext); pcre2_compile_context 
*pcre2_compile_context_copy( pcre2_compile_context *ccontext); void pcre2_compile_context_free(pcre2_compile_context *ccontext); int pcre2_set_bsr(pcre2_compile_context *ccontext, uint32_t value); int pcre2_set_character_tables(pcre2_compile_context *ccontext, const uint8_t *tables); int pcre2_set_compile_extra_options(pcre2_compile_context *ccontext, uint32_t extra_options); int pcre2_set_max_pattern_length(pcre2_compile_context *ccontext, PCRE2_SIZE value); int pcre2_set_max_pattern_compiled_length( pcre2_compile_context *ccontext, PCRE2_SIZE value); int pcre2_set_max_varlookbehind(pcre2_compile_context *ccontext, uint32_t value); int pcre2_set_newline(pcre2_compile_context *ccontext, uint32_t value); int pcre2_set_parens_nest_limit(pcre2_compile_context *ccontext, uint32_t value); int pcre2_set_compile_recursion_guard(pcre2_compile_context *ccontext, int (*guard_function)(uint32_t, void *), void *user_data); int pcre2_set_optimize(pcre2_compile_context *ccontext, uint32_t directive); PCRE2 NATIVE API MATCH CONTEXT FUNCTIONS pcre2_match_context *pcre2_match_context_create( pcre2_general_context *gcontext); pcre2_match_context *pcre2_match_context_copy( pcre2_match_context *mcontext); void pcre2_match_context_free(pcre2_match_context *mcontext); int pcre2_set_callout(pcre2_match_context *mcontext, int (*callout_function)(pcre2_callout_block *, void *), void *callout_data); int pcre2_set_substitute_callout(pcre2_match_context *mcontext, int (*callout_function)(pcre2_substitute_callout_block *, void *), void *callout_data); int pcre2_set_substitute_case_callout(pcre2_match_context *mcontext, PCRE2_SIZE (*callout_function)(PCRE2_SPTR, PCRE2_SIZE, PCRE2_UCHAR *, PCRE2_SIZE, int, void *), void *callout_data); int pcre2_set_offset_limit(pcre2_match_context *mcontext, PCRE2_SIZE value); int pcre2_set_heap_limit(pcre2_match_context *mcontext, uint32_t value); int pcre2_set_match_limit(pcre2_match_context *mcontext, uint32_t value); int
pcre2_set_depth_limit(pcre2_match_context *mcontext, uint32_t value); PCRE2 NATIVE API STRING EXTRACTION FUNCTIONS int pcre2_substring_copy_byname(pcre2_match_data *match_data, PCRE2_SPTR name, PCRE2_UCHAR *buffer, PCRE2_SIZE *bufflen); int pcre2_substring_copy_bynumber(pcre2_match_data *match_data, uint32_t number, PCRE2_UCHAR *buffer, PCRE2_SIZE *bufflen); void pcre2_substring_free(PCRE2_UCHAR *buffer); int pcre2_substring_get_byname(pcre2_match_data *match_data, PCRE2_SPTR name, PCRE2_UCHAR **bufferptr, PCRE2_SIZE *bufflen); int pcre2_substring_get_bynumber(pcre2_match_data *match_data, uint32_t number, PCRE2_UCHAR **bufferptr, PCRE2_SIZE *bufflen); int pcre2_substring_length_byname(pcre2_match_data *match_data, PCRE2_SPTR name, PCRE2_SIZE *length); int pcre2_substring_length_bynumber(pcre2_match_data *match_data, uint32_t number, PCRE2_SIZE *length); int pcre2_substring_nametable_scan(const pcre2_code *code, PCRE2_SPTR name, PCRE2_SPTR *first, PCRE2_SPTR *last); int pcre2_substring_number_from_name(const pcre2_code *code, PCRE2_SPTR name); void pcre2_substring_list_free(PCRE2_UCHAR **list); int pcre2_substring_list_get(pcre2_match_data *match_data, PCRE2_UCHAR ***listptr, PCRE2_SIZE **lengthsptr); PCRE2 NATIVE API STRING SUBSTITUTION FUNCTION int pcre2_substitute(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length, PCRE2_SIZE startoffset, uint32_t options, pcre2_match_data *match_data, pcre2_match_context *mcontext, PCRE2_SPTR replacement, PCRE2_SIZE rlength, PCRE2_UCHAR *outputbuffer, PCRE2_SIZE *outlengthptr); PCRE2 NATIVE API JIT FUNCTIONS int pcre2_jit_compile(pcre2_code *code, uint32_t options); int pcre2_jit_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length, PCRE2_SIZE startoffset, uint32_t options, pcre2_match_data *match_data, pcre2_match_context *mcontext); void pcre2_jit_free_unused_memory(pcre2_general_context *gcontext); pcre2_jit_stack *pcre2_jit_stack_create(size_t startsize, size_t maxsize, pcre2_general_context 
*gcontext); void pcre2_jit_stack_assign(pcre2_match_context *mcontext, pcre2_jit_callback callback_function, void *callback_data); void pcre2_jit_stack_free(pcre2_jit_stack *jit_stack); PCRE2 NATIVE API SERIALIZATION FUNCTIONS int32_t pcre2_serialize_decode(pcre2_code **codes, int32_t number_of_codes, const uint8_t *bytes, pcre2_general_context *gcontext); int32_t pcre2_serialize_encode(const pcre2_code **codes, int32_t number_of_codes, uint8_t **serialized_bytes, PCRE2_SIZE *serialized_size, pcre2_general_context *gcontext); void pcre2_serialize_free(uint8_t *bytes); int32_t pcre2_serialize_get_number_of_codes(const uint8_t *bytes); PCRE2 NATIVE API AUXILIARY FUNCTIONS pcre2_code *pcre2_code_copy(const pcre2_code *code); pcre2_code *pcre2_code_copy_with_tables(const pcre2_code *code); int pcre2_get_error_message(int errorcode, PCRE2_UCHAR *buffer, PCRE2_SIZE bufflen); const uint8_t *pcre2_maketables(pcre2_general_context *gcontext); void pcre2_maketables_free(pcre2_general_context *gcontext, const uint8_t *tables); int pcre2_pattern_info(const pcre2_code *code, uint32_t what, void *where); int pcre2_callout_enumerate(const pcre2_code *code, int (*callback)(pcre2_callout_enumerate_block *, void *), void *user_data); int pcre2_config(uint32_t what, void *where); PCRE2 NATIVE API OBSOLETE FUNCTIONS int pcre2_set_recursion_limit(pcre2_match_context *mcontext, uint32_t value); int pcre2_set_recursion_memory_management( pcre2_match_context *mcontext, void *(*private_malloc)(size_t, void *), void (*private_free)(void *, void *), void *memory_data); These functions became obsolete at release 10.30 and are retained only for backward compatibility. They should not be used in new code. The first is replaced by pcre2_set_depth_limit(); the second is no longer needed and has no effect (it always returns zero). 
PCRE2 EXPERIMENTAL PATTERN CONVERSION FUNCTIONS pcre2_convert_context *pcre2_convert_context_create( pcre2_general_context *gcontext); pcre2_convert_context *pcre2_convert_context_copy( pcre2_convert_context *cvcontext); void pcre2_convert_context_free(pcre2_convert_context *cvcontext); int pcre2_set_glob_escape(pcre2_convert_context *cvcontext, uint32_t escape_char); int pcre2_set_glob_separator(pcre2_convert_context *cvcontext, uint32_t separator_char); int pcre2_pattern_convert(PCRE2_SPTR pattern, PCRE2_SIZE length, uint32_t options, PCRE2_UCHAR **buffer, PCRE2_SIZE *blength, pcre2_convert_context *cvcontext); void pcre2_converted_pattern_free(PCRE2_UCHAR *converted_pattern); These functions provide a way of converting non-PCRE2 patterns into patterns that can be processed by pcre2_compile(). This facility is ex- perimental and may be changed in future releases. At present, "globs" and POSIX basic and extended patterns can be converted. Details are given in the pcre2convert documentation. PCRE2 8-BIT, 16-BIT, AND 32-BIT LIBRARIES There are three PCRE2 libraries, supporting 8-bit, 16-bit, and 32-bit code units, respectively. However, there is just one header file, pcre2.h. This contains the function prototypes and other definitions for all three libraries. One, two, or all three can be installed simul- taneously. On Unix-like systems the libraries are called libpcre2-8, libpcre2-16, and libpcre2-32, and they can also co-exist with the orig- inal PCRE libraries. Every PCRE2 function comes in three different forms, one for each library, for example: pcre2_compile_8() pcre2_compile_16() pcre2_compile_32() There are also three different sets of data types: PCRE2_UCHAR8, PCRE2_UCHAR16, PCRE2_UCHAR32 PCRE2_SPTR8, PCRE2_SPTR16, PCRE2_SPTR32 The UCHAR types define unsigned code units of the appropriate widths. For example, PCRE2_UCHAR16 is usually defined as `uint16_t'. 
The SPTR types are pointers to constants of the equivalent UCHAR types, that is, they are pointers to vectors of unsigned code units. Character strings are passed to a PCRE2 library as sequences of un- signed integers in code units of the appropriate width. The length of a string may be given as a number of code units, or the string may be specified as zero-terminated. Many applications use only one code unit width. For their convenience, macros are defined whose names are the generic forms such as pcre2_com- pile() and PCRE2_SPTR. These macros use the value of the macro PCRE2_CODE_UNIT_WIDTH to generate the appropriate width-specific func- tion and macro names. PCRE2_CODE_UNIT_WIDTH is not defined by default. An application must define it to be 8, 16, or 32 before including pcre2.h in order to make use of the generic names. Applications that use more than one code unit width can be linked with more than one PCRE2 library, but must define PCRE2_CODE_UNIT_WIDTH to be 0 before including pcre2.h, and then use the real function names. Any code that is to be included in an environment where the value of PCRE2_CODE_UNIT_WIDTH is unknown should also use the real function names. (Unfortunately, it is not possible in C code to save and restore the value of a macro.) If PCRE2_CODE_UNIT_WIDTH is not defined before including pcre2.h, a compiler error occurs. When using multiple libraries in an application, you must take care when processing any particular pattern to use only functions from a single library. For example, if you want to run a match using a pat- tern that was compiled with pcre2_compile_16(), you must do so with pcre2_match_16(), not pcre2_match_8() or pcre2_match_32(). In the function summaries above, and in the rest of this document and other PCRE2 documents, functions and data types are described using their generic names, without the _8, _16, or _32 suffix. PCRE2 API OVERVIEW PCRE2 has its own native API, which is described in this document. 
There are also some wrapper functions for the 8-bit library that corre- spond to the POSIX regular expression API, but they do not give access to all the functionality of PCRE2 and they are not thread-safe. They are described in the pcre2posix documentation. Both these APIs define a set of C function calls. The native API C data types, function prototypes, option values, and error codes are defined in the header file pcre2.h, which also contains definitions of PCRE2_MAJOR and PCRE2_MINOR, the major and minor release numbers for the library. Applications can use these to include support for different releases of PCRE2. In a Windows environment, if you want to statically link an application program against a non-dll PCRE2 library, you must define PCRE2_STATIC before including pcre2.h. The functions pcre2_compile() and pcre2_match() are used for compiling and matching regular expressions in a Perl-compatible manner. A sample program that demonstrates the simplest way of using them is provided in the file called pcre2demo.c in the PCRE2 source distribution. A listing of this program is given in the pcre2demo documentation, and the pcre2sample documentation describes how to compile and run it. The compiling and matching functions recognize various options that are passed as bits in an options argument. There are also some more compli- cated parameters such as custom memory management functions and re- source limits that are passed in "contexts" (which are just memory blocks, described below). Simple applications do not need to make use of contexts. Just-in-time (JIT) compiler support is an optional feature of PCRE2 that can be built in appropriate hardware environments. It greatly speeds up the matching performance of many patterns. Programs can re- quest that it be used if available by calling pcre2_jit_compile() after a pattern has been successfully compiled by pcre2_compile(). This does nothing if JIT support is not available. 
More complicated programs might need to make use of the specialist functions pcre2_jit_stack_create(), pcre2_jit_stack_free(), and pcre2_jit_stack_assign() in order to control the JIT code's memory us- age. JIT matching is automatically used by pcre2_match() if it is available, unless the PCRE2_NO_JIT option is set. There is also a direct interface for JIT matching, which gives improved performance at the expense of less sanity checking. The JIT-specific functions are discussed in the pcre2jit documentation. A second matching function, pcre2_dfa_match(), which is not Perl-com- patible, is also provided. This uses a different algorithm for the matching. The alternative algorithm finds all possible matches (at a given point in the subject), and scans the subject just once (unless there are lookaround assertions). However, this algorithm does not re- turn captured substrings. A description of the two matching algorithms and their advantages and disadvantages is given in the pcre2matching documentation. There is no JIT support for pcre2_dfa_match(). In addition to the main compiling and matching functions, there are convenience functions for extracting captured substrings from a subject string that has been matched by pcre2_match(). They are: pcre2_substring_copy_byname() pcre2_substring_copy_bynumber() pcre2_substring_get_byname() pcre2_substring_get_bynumber() pcre2_substring_list_get() pcre2_substring_length_byname() pcre2_substring_length_bynumber() pcre2_substring_nametable_scan() pcre2_substring_number_from_name() pcre2_substring_free() and pcre2_substring_list_free() are also pro- vided, to free memory used for extracted strings. If either of these functions is called with a NULL argument, the function returns immedi- ately without doing anything. The function pcre2_substitute() can be called to match a pattern and return a copy of the subject string with substitutions for parts that were matched. 
Functions whose names begin with pcre2_serialize_ are used for saving compiled patterns on disc or elsewhere, and reloading them later. Finally, there are functions for finding out information about a com- piled pattern (pcre2_pattern_info()) and about the configuration with which PCRE2 was built (pcre2_config()) and that it is using. Functions with names ending with _free() are used for freeing memory blocks of various sorts. In all cases, if one of these functions is called with a NULL argument, it does nothing. STRING LENGTHS AND OFFSETS The PCRE2 API uses string lengths and offsets into strings of code units in several places. These values are always of type PCRE2_SIZE, which is an unsigned integer type, currently always defined as size_t. The largest value that can be stored in such a type (that is ~(PCRE2_SIZE)0) is reserved as a special indicator for zero-terminated strings and unset offsets. Therefore, the longest string that can be handled is one less than this maximum. Note that string lengths are al- ways given in code units. Only in the 8-bit library is such a length the same as the number of bytes in the string. NEWLINES PCRE2 supports five different conventions for indicating line breaks in strings: a single CR (carriage return) character, a single LF (line- feed) character, the two-character sequence CRLF, any of the three pre- ceding, or any Unicode newline sequence. The Unicode newline sequences are the three just mentioned, plus the single characters VT (vertical tab, U+000B), FF (form feed, U+000C), NEL (next line, U+0085), LS (line separator, U+2028), and PS (paragraph separator, U+2029). Each of the first three conventions is used by at least one operating system as its standard newline sequence. When PCRE2 is built, a default can be specified. If it is not, the default is set to LF, which is the Unix standard. 
However, the newline convention can be changed by an ap- plication when calling pcre2_compile(), or it can be specified by spe- cial text at the start of the pattern itself; this overrides any other settings. See the pcre2pattern page for details of the special charac- ter sequences. In the PCRE2 documentation the word "newline" is used to mean "the character or pair of characters that indicate a line break". The choice of newline convention affects the handling of the dot, circumflex, and dollar metacharacters, the handling of #-comments in /x mode, and, when CRLF is a recognized line ending sequence, the match position advance- ment for a non-anchored pattern. There is more detail about this in the section on pcre2_match() options below. The choice of newline convention does not affect the interpretation of the \n or \r escape sequences, nor does it affect what \R matches; this has its own separate convention. MULTITHREADING In a multithreaded application it is important to keep thread-specific data separate from data that can be shared between threads. The PCRE2 library code itself is thread-safe: it contains no static or global variables. The API is designed to be fairly simple for non-threaded ap- plications while at the same time ensuring that multithreaded applica- tions can use it. There are several different blocks of data that are used to pass infor- mation between the application and the PCRE2 libraries. The compiled pattern A pointer to the compiled form of a pattern is returned to the user when pcre2_compile() is successful. The data in the compiled pattern is fixed, and does not change when the pattern is matched. Therefore, it is thread-safe, that is, the same compiled pattern can be used by more than one thread simultaneously. For example, an application can compile all its patterns at the start, before forking off multiple threads that use them. 
However, if the just-in-time (JIT) optimization feature is being used, it needs separate memory stack areas for each thread. See the pcre2jit documentation for more details. In a more complicated situation, where patterns are compiled only when they are first needed, but are still shared between threads, pointers to compiled patterns must be protected from simultaneous writing by multiple threads. This is somewhat tricky to do correctly. If you know that writing to a pointer is atomic in your environment, you can use logic like this: Get a read-only (shared) lock (mutex) for pointer if (pointer == NULL) { Get a write (unique) lock for pointer if (pointer == NULL) pointer = pcre2_compile(... } Release the lock Use pointer in pcre2_match() Of course, testing for compilation errors should also be included in the code. The reason for checking the pointer a second time is as follows: Sev- eral threads may have acquired the shared lock and tested the pointer for being NULL, but only one of them will be given the write lock, with the rest kept waiting. The winning thread will compile the pattern and store the result. After this thread releases the write lock, another thread will get it, and if it does not retest pointer for being NULL, will recompile the pattern and overwrite the pointer, creating a memory leak and possibly causing other issues. In an environment where writing to a pointer may not be atomic, the above logic is not sufficient. The thread that is doing the compiling may be descheduled after writing only part of the pointer, which could cause other threads to use an invalid value. Instead of checking the pointer itself, a separate "pointer is valid" flag (that can be updated atomically) must be used: Get a read-only (shared) lock (mutex) for pointer if (!pointer_is_valid) { Get a write (unique) lock for pointer if (!pointer_is_valid) { pointer = pcre2_compile(... 
pointer_is_valid = TRUE } } Release the lock Use pointer in pcre2_match() If JIT is being used, but the JIT compilation is not being done immedi- ately (perhaps waiting to see if the pattern is used often enough), similar logic is required. JIT compilation updates a value within the compiled code block, so a thread must gain unique write access to the pointer before calling pcre2_jit_compile(). Alternatively, pcre2_code_copy() or pcre2_code_copy_with_tables() can be used to ob- tain a private copy of the compiled code before calling the JIT com- piler. Context blocks The next main section below introduces the idea of "contexts" in which PCRE2 functions are called. A context is nothing more than a collection of parameters that control the way PCRE2 operates. Grouping a number of parameters together in a context is a convenient way of passing them to a PCRE2 function without using lots of arguments. The parameters that are stored in contexts are in some sense "advanced features" of the API. Many straightforward applications will not need to use contexts. In a multithreaded application, if the parameters in a context are val- ues that are never changed, the same context can be used by all the threads. However, if any thread needs to change any value in a context, it must make its own thread-specific copy. Match blocks The matching functions need a block of memory for storing the results of a match. This includes details of what was matched, as well as addi- tional information such as the name of a (*MARK) setting. Each thread must provide its own copy of this memory. PCRE2 CONTEXTS Some PCRE2 functions have a lot of parameters, many of which are used only by specialist applications, for example, those that use custom memory management or non-standard character tables. To keep function argument lists at a reasonable size, and at the same time to keep the API extensible, "uncommon" parameters are passed to certain functions in a context instead of directly. 
A context is just a block of memory that holds the parameter values. Applications that do not need to ad- just any of the context parameters can pass NULL when a context pointer is required. There are three different types of context: a general context that is relevant for several PCRE2 operations, a compile-time context, and a match-time context. The general context At present, this context just contains pointers to (and data for) ex- ternal memory management functions that are called from several places in the PCRE2 library. The context is named `general' rather than specifically `memory' because in future other fields may be added. If you do not want to supply your own custom memory management functions, you do not need to bother with a general context. A general context is created by: pcre2_general_context *pcre2_general_context_create( void *(*private_malloc)(PCRE2_SIZE, void *), void (*private_free)(void *, void *), void *memory_data); The two function pointers specify custom memory management functions, whose prototypes are: void *private_malloc(PCRE2_SIZE, void *); void private_free(void *, void *); Whenever code in PCRE2 calls these functions, the final argument is the value of memory_data. Either of the first two arguments of the creation function may be NULL, in which case the system memory management func- tions malloc() and free() are used. (This is not currently useful, as there are no other fields in a general context, but in future there might be.) The private_malloc() function is used (if supplied) to ob- tain memory for storing the context, and all three values are saved as part of the context. Whenever PCRE2 creates a data block of any kind, the block contains a pointer to the free() function that matches the malloc() function that was used. When the time comes to free the block, this function is called. 
A general context can be copied by calling: pcre2_general_context *pcre2_general_context_copy( pcre2_general_context *gcontext); The memory used for a general context should be freed by calling: void pcre2_general_context_free(pcre2_general_context *gcontext); If this function is passed a NULL argument, it returns immediately without doing anything. The compile context A compile context is required if you want to provide an external func- tion for stack checking during compilation or to change the default values of any of the following compile-time parameters: What \R matches (Unicode newlines or CR, LF, CRLF only) PCRE2's character tables The newline character sequence The compile time nested parentheses limit The maximum length of the pattern string The extra options bits (none set by default) Which performance optimizations the compiler should apply A compile context is also required if you are using custom memory man- agement. If none of these apply, just pass NULL as the context argu- ment of pcre2_compile(). A compile context is created, copied, and freed by the following func- tions: pcre2_compile_context *pcre2_compile_context_create( pcre2_general_context *gcontext); pcre2_compile_context *pcre2_compile_context_copy( pcre2_compile_context *ccontext); void pcre2_compile_context_free(pcre2_compile_context *ccontext); A compile context is created with default values for its parameters. These can be changed by calling the following functions, which return 0 on success, or PCRE2_ERROR_BADDATA if invalid data is detected. int pcre2_set_bsr(pcre2_compile_context *ccontext, uint32_t value); The value must be PCRE2_BSR_ANYCRLF, to specify that \R matches only CR, LF, or CRLF, or PCRE2_BSR_UNICODE, to specify that \R matches any Unicode line ending sequence. The value is used by the JIT compiler and by the two interpreted matching functions, pcre2_match() and pcre2_dfa_match(). 
int pcre2_set_character_tables(pcre2_compile_context *ccontext, const uint8_t *tables); The value must be the result of a call to pcre2_maketables(), whose only argument is a general context. This function builds a set of char- acter tables in the current locale. int pcre2_set_compile_extra_options(pcre2_compile_context *ccontext, uint32_t extra_options); As PCRE2 has developed, almost all the 32 option bits that are avail- able in the options argument of pcre2_compile() have been used up. To avoid running out, the compile context contains a set of extra option bits which are used for some newer, assumed rarer, options. This func- tion sets those bits. It always sets all the bits (either on or off); that is, it replaces the existing setting as a whole rather than modifying individual bits. The available options are de- fined in the section entitled "Extra compile options" below. int pcre2_set_max_pattern_length(pcre2_compile_context *ccontext, PCRE2_SIZE value); This sets a maximum length, in code units, for any pattern string that is compiled with this context. If the pattern is longer, an error is generated. This facility is provided so that applications that accept patterns from external sources can limit their size. The default is the largest number that a PCRE2_SIZE variable can hold, which is effec- tively unlimited. int pcre2_set_max_pattern_compiled_length( pcre2_compile_context *ccontext, PCRE2_SIZE value); This sets a maximum size, in bytes, for the memory needed to hold the compiled version of a pattern that is compiled with this context. If the pattern needs more memory, an error is generated. This facility is provided so that applications that accept patterns from external sources can limit the amount of memory they use. The default is the largest number that a PCRE2_SIZE variable can hold, which is effec- tively unlimited.
int pcre2_set_max_varlookbehind(pcre2_compile_context *ccontext, uint32_t value); This sets a maximum length for the number of characters matched by a variable-length lookbehind assertion. The default is set when PCRE2 is built, with the ultimate default being 255, the same as Perl. Lookbe- hind assertions without a bounding length are not supported. int pcre2_set_newline(pcre2_compile_context *ccontext, uint32_t value); This specifies which characters or character sequences are to be recog- nized as newlines. The value must be one of PCRE2_NEWLINE_CR (carriage return only), PCRE2_NEWLINE_LF (linefeed only), PCRE2_NEWLINE_CRLF (the two-character sequence CR followed by LF), PCRE2_NEWLINE_ANYCRLF (any of the above), PCRE2_NEWLINE_ANY (any Unicode newline sequence), or PCRE2_NEWLINE_NUL (the NUL character, that is a binary zero). A pattern can override the value set in the compile context by starting with a sequence such as (*CRLF). See the pcre2pattern page for details. When a pattern is compiled with the PCRE2_EXTENDED or PCRE2_EX- TENDED_MORE option, the newline convention affects the recognition of the end of internal comments starting with #. The value is saved with the compiled pattern for subsequent use by the JIT compiler and by the two interpreted matching functions, pcre2_match() and pcre2_dfa_match(). int pcre2_set_parens_nest_limit(pcre2_compile_context *ccontext, uint32_t value); This parameter adjusts the limit, set when PCRE2 is built (default 250), on the depth of parenthesis nesting in a pattern. This limit stops rogue patterns using up too much system stack when being com- piled. The limit applies to parentheses of all kinds, not just captur- ing parentheses. int pcre2_set_compile_recursion_guard(pcre2_compile_context *ccontext, int (*guard_function)(uint32_t, void *), void *user_data); There is at least one application that runs PCRE2 in threads with very limited system stack, where running out of stack is to be avoided at all costs.
The parenthesis limit above cannot take account of how much stack is actually available during compilation. For a finer control, you can supply a function that is called whenever pcre2_compile() starts to compile a parenthesized part of a pattern. This function can check the actual stack size (or anything else that it wants to, of course). The first argument to the callout function gives the current depth of nesting, and the second is user data that is set up by the last argu- ment of pcre2_set_compile_recursion_guard(). The callout function should return zero if all is well, or non-zero to force an error. int pcre2_set_optimize(pcre2_compile_context *ccontext, uint32_t directive); PCRE2 can apply various performance optimizations during compilation, in order to make matching faster. For example, the compiler might con- vert some regex constructs into an equivalent construct which pcre2_match() can execute faster. By default, all available optimiza- tions are enabled. However, in rare cases, one might wish to disable specific optimizations. For example, if it is known that some optimiza- tions cannot benefit a certain regex, it might be desirable to disable them, in order to speed up compilation. The permitted values of directive are as follows: PCRE2_OPTIMIZATION_FULL Enable all optional performance optimizations. This is the default value. PCRE2_OPTIMIZATION_NONE Disable all optional performance optimizations. PCRE2_AUTO_POSSESS PCRE2_AUTO_POSSESS_OFF Enable/disable "auto-possessification" of variable quantifiers such as * and +. This optimization, for example, turns a+b into a++b in order to avoid backtracks into a+ that can never be successful. However, if callouts are in use, auto-possessification means that some callouts are never taken. You can disable this optimization if you want the matching functions to do a full, unoptimized search and run all the callouts. 
PCRE2_DOTSTAR_ANCHOR PCRE2_DOTSTAR_ANCHOR_OFF Enable/disable an optimization that is applied when .* is the first significant item in a top-level branch of a pattern, and all the other branches also start with .* or with \A or \G or ^. Such a pattern is automatically anchored if PCRE2_DOTALL is set for all the .* items and PCRE2_MULTILINE is not set for any ^ items. Otherwise, the fact that any match must start either at the start of the subject or following a newline is remembered. Like other optimizations, this can cause call- outs to be skipped. Dotstar anchor optimization is automatically disabled for .* if it is inside an atomic group or a capture group that is the subject of a backreference, or if the pattern contains (*PRUNE) or (*SKIP). PCRE2_START_OPTIMIZE PCRE2_START_OPTIMIZE_OFF Enable/disable optimizations which cause matching functions to scan the subject string for specific code unit values before attempting a match. For example, if it is known that an unanchored match must start with a specific value, the matching code searches the subject for that value, and fails immediately if it cannot find it, without actually running the main matching function. This means that a special item such as (*COMMIT) at the start of a pattern is not considered until after a suitable starting point for the match has been found. Also, when call- outs or (*MARK) items are in use, these "start-up" optimizations can cause them to be skipped if the pattern is never actually used. The start-up optimizations are in effect a pre-scan of the subject that takes place before the pattern is run. Disabling start-up optimizations ensures that in cases where the result is "no match", the callouts do occur, and that items such as (*COMMIT) and (*MARK) are considered at every possible starting position in the subject string. Disabling start-up optimizations may change the outcome of a matching operation. 
Consider the pattern (*COMMIT)ABC When this is compiled, PCRE2 records the fact that a match must start with the character "A". Suppose the subject string is "DEFABC". The start-up optimization scans along the subject, finds "A" and runs the first match attempt from there. The (*COMMIT) item means that the pat- tern must match at the current starting position, which in this case, it does. However, if the same match is run without start-up optimizations, the initial scan along the subject string does not happen. The first match attempt is run starting from "D" and when this fails, (*COMMIT) prevents any further matches being tried, so the overall result is "no match". Another start-up optimization makes use of a minimum length for a matching subject, which is recorded when possible. Consider the pattern (*MARK:1)B(*MARK:2)(X|Y) The minimum length for a match is two characters. If the subject is "XXBB", the "starting character" optimization skips "XX", then tries to match "BB", which is long enough. In the process, (*MARK:2) is encoun- tered and remembered. When the match attempt fails, the next "B" is found, but there is only one character left, so there are no more at- tempts, and "no match" is returned with the "last mark seen" set to "2". Without start-up optimizations, however, matches are tried at every possible starting position, including at the end of the subject, where (*MARK:1) is encountered, but there is no "B", so the "last mark seen" that is returned is "1". In this case, the optimizations do not affect the overall match result, which is still "no match", but they do affect the auxiliary information that is returned.
The match context A match context is required if you want to: Set up a callout function Set an offset limit for matching an unanchored pattern Change the limit on the amount of heap used when matching Change the backtracking match limit Change the backtracking depth limit Set custom memory management specifically for the match If none of these apply, just pass NULL as the context argument of pcre2_match(), pcre2_dfa_match(), or pcre2_jit_match(). A match context is created, copied, and freed by the following func- tions: pcre2_match_context *pcre2_match_context_create( pcre2_general_context *gcontext); pcre2_match_context *pcre2_match_context_copy( pcre2_match_context *mcontext); void pcre2_match_context_free(pcre2_match_context *mcontext); A match context is created with default values for its parameters. These can be changed by calling the following functions, which return 0 on success, or PCRE2_ERROR_BADDATA if invalid data is detected. int pcre2_set_callout(pcre2_match_context *mcontext, int (*callout_function)(pcre2_callout_block *, void *), void *callout_data); This sets up a callout function for PCRE2 to call at specified points during a matching operation. Details are given in the pcre2callout doc- umentation. int pcre2_set_substitute_callout(pcre2_match_context *mcontext, int (*callout_function)(pcre2_substitute_callout_block *, void *), void *callout_data); This sets up a callout function for PCRE2 to call after each substitu- tion made by pcre2_substitute(). Details are given in the section enti- tled "Creating a new string with substitutions" below. int pcre2_set_substitute_case_callout(pcre2_match_context *mcontext, PCRE2_SIZE (*callout_function)(PCRE2_SPTR, PCRE2_SIZE, PCRE2_UCHAR *, PCRE2_SIZE, int, void *), void *callout_data); This sets up a callout function for PCRE2 to call when performing case transformations inside pcre2_substitute(). Details are given in the section entitled "Creating a new string with substitutions" below. 
int pcre2_set_offset_limit(pcre2_match_context *mcontext, PCRE2_SIZE value); The offset_limit parameter limits how far an unanchored search can ad- vance in the subject string. The default value is PCRE2_UNSET. The pcre2_match() and pcre2_dfa_match() functions return PCRE2_ERROR_NO- MATCH if a match with a starting point before or at the given offset is not found. The pcre2_substitute() function makes no more substitutions. For example, if the pattern /abc/ is matched against "123abc" with an offset limit less than 3, the result is PCRE2_ERROR_NOMATCH. A match can never be found if the startoffset argument of pcre2_match(), pcre2_dfa_match(), or pcre2_substitute() is greater than the offset limit set in the match context. When using this facility, you must set the PCRE2_USE_OFFSET_LIMIT op- tion when calling pcre2_compile() so that when JIT is in use, different code can be compiled. If a match is started with a non-default offset limit when PCRE2_USE_OFFSET_LIMIT is not set, an error is generated. The offset limit facility can be used to track progress when searching large subject strings or to limit the extent of global substitutions. See also the PCRE2_FIRSTLINE option, which requires a match to start before or at the first newline that follows the start of matching in the subject. If this is set with an offset limit, a match must occur in the first line and also within the offset limit. In other words, whichever limit comes first is used. int pcre2_set_heap_limit(pcre2_match_context *mcontext, uint32_t value); The heap_limit parameter specifies, in units of kibibytes (1024 bytes), the maximum amount of heap memory that pcre2_match() may use to hold backtracking information when running an interpretive match. This limit also applies to pcre2_dfa_match(), which may use the heap when process- ing patterns with a lot of nested pattern recursion or lookarounds or atomic groups. 
This limit does not apply to matching with the JIT opti- mization, which has its own memory control arrangements (see the pcre2jit documentation for more details). If the limit is reached, the negative error code PCRE2_ERROR_HEAPLIMIT is returned. The default limit can be set when PCRE2 is built; if it is not, the default is set very large and is essentially unlimited. A value for the heap limit may also be supplied by an item at the start of a pattern of the form (*LIMIT_HEAP=ddd) where ddd is a decimal number. However, such a setting is ignored un- less ddd is less than the limit set by the caller of pcre2_match() or, if no such limit is set, less than the default. The pcre2_match() function always needs some heap memory, so setting a value of zero guarantees a "heap limit exceeded" error. Details of how pcre2_match() uses the heap are given in the pcre2perform documenta- tion. For pcre2_dfa_match(), a vector on the system stack is used when pro- cessing pattern recursions, lookarounds, or atomic groups, and only if this is not big enough is heap memory used. In this case, setting a value of zero disables the use of the heap. int pcre2_set_match_limit(pcre2_match_context *mcontext, uint32_t value); The match_limit parameter provides a means of preventing PCRE2 from us- ing up too many computing resources when processing patterns that are not going to match, but which have a very large number of possibilities in their search trees. The classic example is a pattern that uses nested unlimited repeats. There is an internal counter in pcre2_match() that is incremented each time round its main matching loop. If this value reaches the match limit, pcre2_match() returns the negative value PCRE2_ERROR_MATCHLIMIT. This has the effect of limiting the amount of backtracking that can take place. For patterns that are not anchored, the count restarts from zero for each position in the subject string. 
This limit also applies to pcre2_dfa_match(), though the counting is done in a different way. When pcre2_match() is called with a pattern that was successfully processed by pcre2_jit_compile(), the way in which matching is executed is entirely different. However, there is still the possibility of run- away matching that goes on for a very long time, and so the match_limit value is also used in this case (but in a different way) to limit how long the matching can continue. The default value for the limit can be set when PCRE2 is built; the de- fault is 10 million, which handles all but the most extreme cases. A value for the match limit may also be supplied by an item at the start of a pattern of the form (*LIMIT_MATCH=ddd) where ddd is a decimal number. However, such a setting is ignored un- less ddd is less than the limit set by the caller of pcre2_match() or pcre2_dfa_match() or, if no such limit is set, less than the default. int pcre2_set_depth_limit(pcre2_match_context *mcontext, uint32_t value); This parameter limits the depth of nested backtracking in pcre2_match(). Each time a nested backtracking point is passed, a new memory frame is used to remember the state of matching at that point. Thus, this parameter indirectly limits the amount of memory that is used in a match. However, because the size of each memory frame depends on the number of capturing parentheses, the actual memory limit varies from pattern to pattern. This limit was more useful in versions before 10.30, where function recursion was used for backtracking. The depth limit is not relevant, and is ignored, when matching is done using JIT compiled code. However, it is supported by pcre2_dfa_match(), which uses it to limit the depth of nested internal recursive function calls that implement atomic groups, lookaround assertions, and pattern recursions. This limits, indirectly, the amount of system stack that is used. 
It was more useful in versions before 10.32, when stack memory was used for local workspace vectors for recursive function calls. From version 10.32, only local variables are allocated on the stack and as each call uses only a few hundred bytes, even a small stack can support quite a lot of recursion. If the depth of internal recursive function calls is great enough, lo- cal workspace vectors are allocated on the heap from version 10.32 on- wards, so the depth limit also indirectly limits the amount of heap memory that is used. A recursive pattern such as /(.(?2))((?1)|)/, when matched to a very long string using pcre2_dfa_match(), can use a great deal of memory. However, it is probably better to limit heap usage di- rectly by calling pcre2_set_heap_limit(). The default value for the depth limit can be set when PCRE2 is built; if it is not, the default is set to the same value as the default for the match limit. If the limit is exceeded, pcre2_match() or pcre2_dfa_match() returns PCRE2_ERROR_DEPTHLIMIT. A value for the depth limit may also be supplied by an item at the start of a pattern of the form (*LIMIT_DEPTH=ddd) where ddd is a decimal number. However, such a setting is ignored un- less ddd is less than the limit set by the caller of pcre2_match() or pcre2_dfa_match() or, if no such limit is set, less than the default. CHECKING BUILD-TIME OPTIONS int pcre2_config(uint32_t what, void *where); The function pcre2_config() makes it possible for a PCRE2 client to find the value of certain configuration parameters and to discover which optional features have been compiled into the PCRE2 library. The pcre2build documentation has more details about these features. The first argument for pcre2_config() specifies which information is required. The second argument is a pointer to memory into which the in- formation is placed. If NULL is passed, the function returns the amount of memory that is needed for the requested information. 
For calls that return numerical values, the value is in bytes; when requesting these values, where should point to appropriately aligned memory. For calls that return strings, the required length is given in code units, not counting the terminating zero. When requesting information, the returned value from pcre2_config() is non-negative on success, or the negative error code PCRE2_ERROR_BADOP- TION if the value in the first argument is not recognized. The follow- ing information is available: PCRE2_CONFIG_BSR The output is a uint32_t integer whose value indicates what character sequences the \R escape sequence matches by default. A value of PCRE2_BSR_UNICODE means that \R matches any Unicode line ending se- quence; a value of PCRE2_BSR_ANYCRLF means that \R matches only CR, LF, or CRLF. The default can be overridden when a pattern is compiled. PCRE2_CONFIG_COMPILED_WIDTHS The output is a uint32_t integer whose lower bits indicate which code unit widths were selected when PCRE2 was built. The 1-bit indicates 8-bit support, and the 2-bit and 4-bit indicate 16-bit and 32-bit sup- port, respectively. PCRE2_CONFIG_DEPTHLIMIT The output is a uint32_t integer that gives the default limit for the depth of nested backtracking in pcre2_match() or the depth of nested recursions, lookarounds, and atomic groups in pcre2_dfa_match(). Fur- ther details are given with pcre2_set_depth_limit() above. PCRE2_CONFIG_EFFECTIVE_LINKSIZE The output is a uint32_t integer that contains the number of bytes the library uses for internal linkage in compiled regular expressions. Its value is derived from the value that was provided at build time and that is described below by PCRE2_CONFIG_LINKSIZE. PCRE2_CONFIG_HEAPLIMIT The output is a uint32_t integer that gives, in kibibytes, the default limit for the amount of heap memory used by pcre2_match() or pcre2_dfa_match(). Further details are given with pcre2_set_heap_limit() above. 
PCRE2_CONFIG_JIT The output is a uint32_t integer that is set to one if support for just-in-time compiling is included in the library; otherwise it is set to zero. Note that having the support in the library does not guarantee that JIT will be used for any given match, and neither does it guaran- tee that JIT will actually be able to function, because it may not be able to allocate executable memory in some environments. There is a special call to pcre2_jit_compile() that can be used to check this. See the pcre2jit documentation for more details. PCRE2_CONFIG_JITTARGET The where argument should point to a code-unit-aligned buffer. All pre- vious versions of PCRE2 have required no more than 128 code units of buffer capacity. However, this requirement is not guaranteed to be maintained, so applications should call pcre2_config() with where set to NULL to receive the required buffer size, then assert or allocate a suitably-sized buffer for a second call to pcre2_config(). The buffer is filled with a string that contains the name of the architecture for which the JIT compiler is configured at build time, for example, a 64-bit ARM CPU that supports the Armv8.1 extension writes "ARM-64 (LSE) 64bit (little endian + unaligned)". If JIT support is not available, PCRE2_ERROR_BADOPTION is returned; otherwise the number of code units used is returned. This is the length of the string plus one unit for the terminating zero. PCRE2_CONFIG_LINKSIZE The output is a uint32_t integer that contains the number of bytes the library was instructed to use for internal linkage in compiled regular expressions. When PCRE2 is configured, the value can be set to 2, 3, or 4, with the default being 2 for most libraries. The actual number of bytes used depends on the size of the code units that the library supports and can be higher. See PCRE2_CONFIG_EFFEC- TIVE_LINKSIZE above for details. 
The default value of 2 for the 8-bit and 16-bit libraries is sufficient for all but the most massive patterns, since it allows the size of the compiled pattern to be up to 65535 code units. Larger values allow larger regular expressions to be compiled by those two libraries, but at the expense of slower matching. PCRE2_CONFIG_MATCHLIMIT The output is a uint32_t integer that gives the default match limit for pcre2_match(). Further details are given with pcre2_set_match_limit() above. PCRE2_CONFIG_NEWLINE The output is a uint32_t integer whose value specifies the default character sequence that is recognized as meaning "newline". The values are: PCRE2_NEWLINE_CR Carriage return (CR) PCRE2_NEWLINE_LF Linefeed (LF) PCRE2_NEWLINE_CRLF Carriage return, linefeed (CRLF) PCRE2_NEWLINE_ANY Any Unicode line ending PCRE2_NEWLINE_ANYCRLF Any of CR, LF, or CRLF PCRE2_NEWLINE_NUL The NUL character (binary zero) The default should normally correspond to the standard sequence for your operating system. PCRE2_CONFIG_NEVER_BACKSLASH_C The output is a uint32_t integer that is set to one if the use of \C was permanently disabled when PCRE2 was built; otherwise it is set to zero. PCRE2_CONFIG_PARENSLIMIT The output is a uint32_t integer that gives the maximum depth of nest- ing of parentheses (of any kind) in a pattern. This limit is imposed to cap the amount of system stack used when a pattern is compiled. It is specified when PCRE2 is built; the default is 250. This limit does not take into account the stack that may already be used by the calling ap- plication. For finer control over compilation stack usage, see pcre2_set_compile_recursion_guard(). PCRE2_CONFIG_STACKRECURSE This parameter is obsolete and should not be used in new code. The out- put is a uint32_t integer that is always set to zero. PCRE2_CONFIG_TABLES_LENGTH The output is a uint32_t integer that gives the length of PCRE2's char- acter processing tables in bytes. 
For details of these tables see the section on locale support below. PCRE2_CONFIG_UNICODE_VERSION The where argument should point to a code-unit-aligned buffer. All pre- vious versions of PCRE2 have required no more than 24 code units of buffer capacity. However, applications should call pcre2_config() with where set to NULL to receive the required buffer size, then assert or allocate a suitably-sized buffer for a second call to pcre2_config(). If PCRE2 has been compiled without Unicode support, the buffer is filled with the text "Unicode not supported". Otherwise, the Unicode version string (for example, "8.0.0") is written. The number of code units used is returned. This is the length of the string plus one unit for the terminating zero. PCRE2_CONFIG_UNICODE The output is a uint32_t integer that is set to one if Unicode support is available; otherwise it is set to zero. Unicode support implies UTF support. PCRE2_CONFIG_VERSION The where argument should point to a code-unit-aligned buffer. All pre- vious versions of PCRE2 have required no more than 24 code units of buffer capacity. However, applications should call pcre2_config() with where set to NULL to receive the required buffer size, then assert or allocate a suitably-sized buffer for a second call to pcre2_config(). The buffer is filled with the PCRE2 version string, zero-terminated. The number of code units used is returned. This is the length of the string plus one unit for the terminating zero. COMPILING A PATTERN pcre2_code *pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE length, uint32_t options, int *errorcode, PCRE2_SIZE *erroroffset, pcre2_compile_context *ccontext); void pcre2_code_free(pcre2_code *code); pcre2_code *pcre2_code_copy(const pcre2_code *code); pcre2_code *pcre2_code_copy_with_tables(const pcre2_code *code); The pcre2_compile() function compiles a pattern into an internal form. The pattern is defined by a pointer to a string of code units and a length in code units. 
If the pattern is zero-terminated, the length can be specified as PCRE2_ZERO_TERMINATED. A NULL pattern pointer with a length of zero is treated as an empty string (NULL with a non-zero length causes an error return). The function returns a pointer to a block of memory that contains the compiled pattern and related data, or NULL if an error occurred. If the compile context argument ccontext is NULL, memory for the com- piled pattern is obtained by calling malloc(). Otherwise, it is ob- tained from the same memory function that was used for the compile con- text. The caller must free the memory by calling pcre2_code_free() when it is no longer needed. If pcre2_code_free() is called with a NULL ar- gument, it returns immediately, without doing anything. The function pcre2_code_copy() makes a copy of the compiled code in new memory, using the same memory allocator as was used for the original. However, if the code has been processed by the JIT compiler (see be- low), the JIT information cannot be copied (because it is position-de- pendent). The new copy can initially be used only for non-JIT match- ing, though it can be passed to pcre2_jit_compile() if required. If pcre2_code_copy() is called with a NULL argument, it returns NULL. The pcre2_code_copy() function provides a way for individual threads in a multithreaded application to acquire a private copy of shared com- piled code. However, it does not make a copy of the character tables used by the compiled pattern; the new pattern code points to the same tables as the original code. (See "Locale Support" below for details of these character tables.) In many applications the same tables are used throughout, so this behaviour is appropriate. Nevertheless, there are occasions when a copy of a compiled pattern and the relevant tables are needed. The pcre2_code_copy_with_tables() provides this facility. Copies of both the code and the tables are made, with the new code pointing to the new tables. 
The memory for the new tables is automati- cally freed when pcre2_code_free() is called for the new copy of the compiled code. If pcre2_code_copy_with_tables() is called with a NULL argument, it returns NULL. NOTE: When one of the matching functions is called, pointers to the compiled pattern and the subject string are set in the match data block so that they can be referenced by the substring extraction functions after a successful match. After running a match, you must not free a compiled pattern or a subject string until after all operations on the match data block have taken place, unless, in the case of the subject string, you have used the PCRE2_COPY_MATCHED_SUBJECT option, which is described in the section entitled "Option bits for pcre2_match()" be- low. The options argument for pcre2_compile() contains various bit settings that affect the compilation. It should be zero if none of them are re- quired. The available options are described below. Some of them (in particular, those that are compatible with Perl, but some others as well) can also be set and unset from within the pattern (see the de- tailed description in the pcre2pattern documentation). For those options that can be different in different parts of the pat- tern, the contents of the options argument specifies their settings at the start of compilation. The PCRE2_ANCHORED, PCRE2_ENDANCHORED, and PCRE2_NO_UTF_CHECK options can be set at the time of matching as well as at compile time. Some additional options and less frequently required compile-time para- meters (for example, the newline setting) can be provided in a compile context (as described above). If errorcode or erroroffset is NULL, pcre2_compile() returns NULL imme- diately. Otherwise, the variables to which these point are set to an error code and an offset (number of code units) within the pattern, re- spectively, when pcre2_compile() returns NULL because a compilation er- ror has occurred. 
There are over 100 positive error codes that pcre2_compile() may return if it finds an error in the pattern. There are also some negative error codes that are used for invalid UTF strings when validity checking is in force. These are the same as given by pcre2_match() and pcre2_dfa_match(), and are described in the pcre2unicode documentation. There is no separate documentation for the positive error codes, be- cause the textual error messages that are obtained by calling the pcre2_get_error_message() function (see "Obtaining a textual error mes- sage" below) should be self-explanatory. Macro names starting with PCRE2_ERROR_ are defined for both positive and negative error codes in pcre2.h. When compilation is successful errorcode is set to a value that returns the message "no error" if passed to pcre2_get_error_mes- sage(). The value returned in erroroffset is an indication of where in the pat- tern an error occurred. When there is no error, zero is returned. A non-zero value is not necessarily the furthest point in the pattern that was read. For example, after the error "lookbehind assertion is not fixed length", the error offset points to the start of the failing assertion. For an invalid UTF-8 or UTF-16 string, the offset is that of the first code unit of the failing character. Some errors are not detected until the whole pattern has been scanned; in these cases, the offset passed back is the length of the pattern. Note that the offset is in code units, not characters, even in a UTF mode. It may sometimes point into the middle of a UTF-8 or UTF-16 char- acter. 
This code fragment shows a typical straightforward call to pcre2_com- pile(): pcre2_code *re; PCRE2_SIZE erroffset; int errorcode; re = pcre2_compile( "^A.*Z", /* the pattern */ PCRE2_ZERO_TERMINATED, /* the pattern is zero-terminated */ 0, /* default options */ &errorcode, /* for error code */ &erroffset, /* for error offset */ NULL); /* no compile context */ Main compile options The following names for option bits are defined in the pcre2.h header file: PCRE2_ANCHORED If this bit is set, the pattern is forced to be "anchored", that is, it is constrained to match only at the first matching point in the string that is being searched (the "subject string"). This effect can also be achieved by appropriate constructs in the pattern itself, which is the only way to do it in Perl. PCRE2_ALLOW_EMPTY_CLASS By default, for compatibility with Perl, a closing square bracket that immediately follows an opening one is treated as a data character for the class. When PCRE2_ALLOW_EMPTY_CLASS is set, it terminates the class, which therefore contains no characters and so can never match. PCRE2_ALT_BSUX This option requests alternative handling of three escape sequences, which makes PCRE2's behaviour more like ECMAscript (aka JavaScript). When it is set: (1) \U matches an upper case "U" character; by default \U causes a com- pile time error (Perl uses \U to upper case subsequent characters). (2) \u matches a lower case "u" character unless it is followed by four hexadecimal digits, in which case the hexadecimal number defines the code point to match. By default, \u causes a compile time error (Perl uses it to upper case the following character). (3) \x matches a lower case "x" character unless it is followed by two hexadecimal digits, in which case the hexadecimal number defines the code point to match. By default, as in Perl, a hexadecimal number is always expected after \x, but it may have one or two digits. ECMAscript 6 added additional functionality to \u. 
This can be accessed using the PCRE2_EXTRA_ALT_BSUX extra option (see "Extra compile op- tions" below). Note that this alternative escape handling applies only to patterns. Neither of these options affects the processing of re- placement strings passed to pcre2_substitute(). PCRE2_ALT_CIRCUMFLEX In multiline mode (when PCRE2_MULTILINE is set), the circumflex metacharacter matches at the start of the subject (unless PCRE2_NOTBOL is set), and also after any internal newline. However, it does not match after a newline at the end of the subject, for compatibility with Perl. If you want a multiline circumflex also to match after a termi- nating newline, you must set PCRE2_ALT_CIRCUMFLEX. PCRE2_ALT_EXTENDED_CLASS Alters the parsing of character classes to follow the extended syntax described by Unicode UTS#18. The PCRE2_ALT_EXTENDED_CLASS option has no impact on the behaviour of the Perl-specific "(?[...])" syntax for ex- tended classes, but instead enables the alternative syntax of extended class behaviour inside ordinary "[...]" character classes. See the pcre2pattern documentation for details of the character classes sup- ported. PCRE2_ALT_VERBNAMES By default, for compatibility with Perl, the name in any verb sequence such as (*MARK:NAME) is any sequence of characters that does not in- clude a closing parenthesis. The name is not processed in any way, and it is not possible to include a closing parenthesis in the name. How- ever, if the PCRE2_ALT_VERBNAMES option is set, normal backslash pro- cessing is applied to verb names and only an unescaped closing paren- thesis terminates the name. A closing parenthesis can be included in a name either as \) or between \Q and \E. If the PCRE2_EXTENDED or PCRE2_EXTENDED_MORE option is set with PCRE2_ALT_VERBNAMES, unescaped white space in verb names is skipped and #-comments are recognized, ex- actly as in the rest of the pattern. 
PCRE2_AUTO_CALLOUT If this bit is set, pcre2_compile() automatically inserts callout items, all with number 255, before each pattern item, except immedi- ately before or after an explicit callout in the pattern. For discus- sion of the callout facility, see the pcre2callout documentation. PCRE2_CASELESS If this bit is set, letters in the pattern match both upper and lower case letters in the subject. It is equivalent to Perl's /i option, and it can be changed within a pattern by a (?i) option setting. If either PCRE2_UTF or PCRE2_UCP is set, Unicode properties are used for all characters with more than one other case, and for all characters whose code points are greater than U+007F. Note that there are two ASCII characters, K and S, that, in addition to their lower case ASCII equivalents, are case-equivalent with U+212A (Kelvin sign) and U+017F (long S) respectively. If you do not want this case equivalence, you can suppress it by setting PCRE2_EXTRA_CASE- LESS_RESTRICT. One language family, Turkish and Azeri, has its own case-insensitivity rules, which can be selected by setting PCRE2_EXTRA_TURKISH_CASING. This alters the behaviour of the 'i', 'I', U+0130 (capital I with dot above), and U+0131 (small dotless i) characters. For lower valued characters with only one other case, a lookup table is used for speed. When neither PCRE2_UTF nor PCRE2_UCP is set, a lookup table is used for all code points less than 256, and higher code points (available only in 16-bit or 32-bit mode) are treated as not having an- other case. From release 10.45 PCRE2_CASELESS also affects what some of the letter- related Unicode property escapes (\p and \P) match. The properties Lu (upper case letter), Ll (lower case letter), and Lt (title case letter) are all treated as LC (cased letter) when PCRE2_CASELESS is set. PCRE2_DOLLAR_ENDONLY If this bit is set, a dollar metacharacter in the pattern matches only at the end of the subject string. 
Without this option, a dollar also matches immediately before a newline at the end of the string (but not before any other newlines). The PCRE2_DOLLAR_ENDONLY option is ignored if PCRE2_MULTILINE is set. There is no equivalent to this option in Perl, and no way to set it within a pattern. PCRE2_DOTALL If this bit is set, a dot metacharacter in the pattern matches any character, including one that indicates a newline. However, it only ever matches one character, even if newlines are coded as CRLF. Without this option, a dot does not match when the current position in the sub- ject is at a newline. This option is equivalent to Perl's /s option, and it can be changed within a pattern by a (?s) option setting. A neg- ative class such as [^a] always matches newline characters, and the \N escape sequence always matches a non-newline character, independent of the setting of PCRE2_DOTALL. PCRE2_DUPNAMES If this bit is set, names used to identify capture groups need not be unique. This can be helpful for certain types of pattern when it is known that only one instance of the named group can ever be matched. There are more details of named capture groups below; see also the pcre2pattern documentation. PCRE2_ENDANCHORED If this bit is set, the end of any pattern match must be right at the end of the string being searched (the "subject string"). If the pattern match succeeds by reaching (*ACCEPT), but does not reach the end of the subject, the match fails at the current starting point. For unanchored patterns, a new match is then tried at the next starting point. How- ever, if the match succeeds by reaching the end of the pattern, but not the end of the subject, backtracking occurs and an alternative match may be found. Consider these two patterns: .(*ACCEPT)|.. .|.. If matched against "abc" with PCRE2_ENDANCHORED set, the first matches "c" whereas the second matches "bc". 
The effect of PCRE2_ENDANCHORED can also be achieved by appropriate constructs in the pattern itself, which is the only way to do it in Perl. For DFA matching with pcre2_dfa_match(), PCRE2_ENDANCHORED applies only to the first (that is, the longest) matched string. Other parallel matches, which are necessarily substrings of the first one, must obvi- ously end before the end of the subject. PCRE2_EXTENDED If this bit is set, most white space characters in the pattern are to- tally ignored except when escaped, inside a character class, or inside a \Q...\E sequence. However, white space is not allowed within se- quences such as (?> that introduce various parenthesized groups, nor within numerical quantifiers such as {1,3}. Ignorable white space is permitted between an item and a following quantifier and between a quantifier and a following + that indicates possessiveness. PCRE2_EX- TENDED is equivalent to Perl's /x option, and it can be changed within a pattern by a (?x) option setting. When PCRE2 is compiled without Unicode support, PCRE2_EXTENDED recog- nizes as white space only those characters with code points less than 256 that are flagged as white space in its low-character table. The ta- ble is normally created by pcre2_maketables(), which uses the isspace() function to identify space characters. In most ASCII environments, the relevant characters are those with code points 0x0009 (tab), 0x000A (linefeed), 0x000B (vertical tab), 0x000C (formfeed), 0x000D (carriage return), and 0x0020 (space). When PCRE2 is compiled with Unicode support, in addition to these char- acters, five more Unicode "Pattern White Space" characters are recog- nized by PCRE2_EXTENDED. These are U+0085 (next line), U+200E (left-to- right mark), U+200F (right-to-left mark), U+2028 (line separator), and U+2029 (paragraph separator). This set of characters is the same as recognized by Perl's /x option. 
Note that the horizontal and vertical space characters that are matched by the \h and \v escapes in patterns are a much bigger set. As well as ignoring most white space, PCRE2_EXTENDED also causes char- acters between an unescaped # outside a character class and the next newline, inclusive, to be ignored, which makes it possible to include comments inside complicated patterns. Note that the end of this type of comment is a literal newline sequence in the pattern; escape sequences that happen to represent a newline do not count. Which characters are interpreted as newlines can be specified by a set- ting in the compile context that is passed to pcre2_compile() or by a special sequence at the start of the pattern, as described in the sec- tion entitled "Newline conventions" in the pcre2pattern documentation. A default is defined when PCRE2 is built. PCRE2_EXTENDED_MORE This option has the effect of PCRE2_EXTENDED, but, in addition, un- escaped space and horizontal tab characters are ignored inside a char- acter class. Note: only these two characters are ignored, not the full set of pattern white space characters that are ignored outside a char- acter class. PCRE2_EXTENDED_MORE is equivalent to Perl's /xx option, and it can be changed within a pattern by a (?xx) option setting. PCRE2_FIRSTLINE If this option is set, the start of an unanchored pattern match must be before or at the first newline in the subject string following the start of matching, though the matched text may continue over the new- line. If startoffset is non-zero, the limiting newline is not necessar- ily the first newline in the subject. For example, if the subject string is "abc\nxyz" (where \n represents a single-character newline) a pattern match for "yz" succeeds with PCRE2_FIRSTLINE if startoffset is greater than 3. See also PCRE2_USE_OFFSET_LIMIT, which provides a more general limiting facility. 
If PCRE2_FIRSTLINE is set with an offset limit, a match must occur in the first line and also within the offset limit. In other words, whichever limit comes first is used. This option has no effect for anchored patterns. PCRE2_LITERAL If this option is set, all meta-characters in the pattern are disabled, and it is treated as a literal string. Matching literal strings with a regular expression engine is not the most efficient way of doing it. If you are doing a lot of literal matching and are worried about effi- ciency, you should consider using other approaches. The only other main options that are allowed with PCRE2_LITERAL are: PCRE2_ANCHORED, PCRE2_ENDANCHORED, PCRE2_AUTO_CALLOUT, PCRE2_CASELESS, PCRE2_FIRSTLINE, PCRE2_MATCH_INVALID_UTF, PCRE2_NO_START_OPTIMIZE, PCRE2_NO_UTF_CHECK, PCRE2_UTF, and PCRE2_USE_OFFSET_LIMIT. The extra options PCRE2_EX- TRA_MATCH_LINE and PCRE2_EXTRA_MATCH_WORD are also supported. Any other options cause an error. PCRE2_MATCH_INVALID_UTF This option forces PCRE2_UTF (see below) and also enables support for matching by pcre2_match() in subject strings that contain invalid UTF sequences. Note, however, that the 16-bit and 32-bit PCRE2 libraries process strings as sequences of uint16_t or uint32_t code points. They cannot find valid UTF sequences within an arbitrary string of bytes un- less such sequences are suitably aligned. This facility is not sup- ported for DFA matching. For details, see the pcre2unicode documenta- tion. PCRE2_MATCH_UNSET_BACKREF If this option is set, a backreference to an unset capture group matches an empty string (by default this causes the current matching alternative to fail). A pattern such as (\1)(a) succeeds when this op- tion is set (assuming it can find an "a" in the subject), whereas it fails by default, for Perl compatibility. Setting this option makes PCRE2 behave more like ECMAscript (aka JavaScript). 
PCRE2_MULTILINE By default, for the purposes of matching "start of line" and "end of line", PCRE2 treats the subject string as consisting of a single line of characters, even if it actually contains newlines. The "start of line" metacharacter (^) matches only at the start of the string, and the "end of line" metacharacter ($) matches only at the end of the string, or before a terminating newline (except when PCRE2_DOLLAR_EN- DONLY is set). Note, however, that unless PCRE2_DOTALL is set, the "any character" metacharacter (.) does not match at a newline. This behav- iour (for ^, $, and dot) is the same as Perl. When PCRE2_MULTILINE is set, the "start of line" and "end of line" constructs match immediately following or immediately before internal newlines in the subject string, respectively, as well as at the very start and end. This is equivalent to Perl's /m option, and it can be changed within a pattern by a (?m) option setting. Note that the "start of line" metacharacter does not match after a newline at the end of the subject, for compatibility with Perl. However, you can change this by setting the PCRE2_ALT_CIRCUMFLEX option. If there are no newlines in a subject string, or no occurrences of ^ or $ in a pattern, setting PCRE2_MULTILINE has no effect. PCRE2_NEVER_BACKSLASH_C This option locks out the use of \C in the pattern that is being com- piled. This escape can cause unpredictable behaviour in UTF-8 or UTF-16 modes, because it may leave the current matching point in the middle of a multi-code-unit character. This option may be useful in ap- plications that process patterns from external sources. Note that there is also a build-time option that permanently locks out the use of \C. PCRE2_NEVER_UCP This option locks out the use of Unicode properties for handling \B, \b, \D, \d, \S, \s, \W, \w, and some of the POSIX character classes, as described for the PCRE2_UCP option below.
In particular, it prevents the creator of the pattern from enabling this facility by starting the pattern with (*UCP). This option may be useful in applications that process patterns from external sources. The option combination PCRE2_UCP and PCRE2_NEVER_UCP causes an error. PCRE2_NEVER_UTF This option locks out interpretation of the pattern as UTF-8, UTF-16, or UTF-32, depending on which library is in use. In particular, it pre- vents the creator of the pattern from switching to UTF interpretation by starting the pattern with (*UTF). This option may be useful in ap- plications that process patterns from external sources. The combination of PCRE2_UTF and PCRE2_NEVER_UTF causes an error. PCRE2_NO_AUTO_CAPTURE If this option is set, it disables the use of numbered capturing paren- theses in the pattern. Any opening parenthesis that is not followed by ? behaves as if it were followed by ?: but named parentheses can still be used for capturing (and they acquire numbers in the usual way). This is the same as Perl's /n option. Note that, when this option is set, references to capture groups (backreferences or recursion/subroutine calls) may only refer to named groups, though the reference can be by name or by number. PCRE2_NO_AUTO_POSSESS If this (deprecated) option is set, it disables "auto-possessifica- tion", which is an optimization that, for example, turns a+b into a++b in order to avoid backtracks into a+ that can never be successful. How- ever, if callouts are in use, auto-possessification means that some callouts are never taken. You can set this option if you want the matching functions to do a full unoptimized search and run all the callouts, but it is mainly provided for testing purposes. If a compile context is available, it is recommended to use pcre2_set_optimize() with the directive PCRE2_AUTO_POSSESS_OFF rather than the compile option PCRE2_NO_AUTO_POSSESS. 
Note that PCRE2_NO_AUTO_POSSESS takes precedence over the pcre2_set_optimize() optimization directives PCRE2_AUTO_POSSESS and PCRE2_AUTO_POSSESS_OFF. PCRE2_NO_DOTSTAR_ANCHOR If this (deprecated) option is set, it disables an optimization that is applied when .* is the first significant item in a top-level branch of a pattern, and all the other branches also start with .* or with \A or \G or ^. The optimization is automatically disabled for .* if it is in- side an atomic group or a capture group that is the subject of a back- reference, or if the pattern contains (*PRUNE) or (*SKIP). When the op- timization is not disabled, such a pattern is automatically anchored if PCRE2_DOTALL is set for all the .* items and PCRE2_MULTILINE is not set for any ^ items. Otherwise, the fact that any match must start either at the start of the subject or following a newline is remembered. Like other optimizations, this can cause callouts to be skipped. (If a com- pile context is available, it is recommended to use pcre2_set_opti- mize() with the directive PCRE2_DOTSTAR_ANCHOR_OFF instead.) PCRE2_NO_START_OPTIMIZE This is an option whose main effect is at matching time. It does not change what pcre2_compile() generates, but it does affect the output of the JIT compiler. Setting this option is equivalent to calling pcre2_set_optimize() with the directive parameter set to PCRE2_START_OPTIMIZE_OFF. There are a number of optimizations that may occur at the start of a match, in order to speed up the process. For example, if it is known that an unanchored match must start with a specific code unit value, the matching code searches the subject for that value, and fails imme- diately if it cannot find it, without actually running the main match- ing function. The start-up optimizations are in effect a pre-scan of the subject that takes place before the pattern is run. Disabling the start-up optimizations may cause performance to suffer. 
However, this may be desirable for patterns which contain callouts or items such as (*COMMIT) and (*MARK). See the above description of PCRE2_START_OPTIMIZE_OFF for further details. PCRE2_NO_UTF_CHECK When PCRE2_UTF is set, the validity of the pattern as a UTF string is automatically checked. There are discussions about the validity of UTF-8 strings, UTF-16 strings, and UTF-32 strings in the pcre2unicode document. If an invalid UTF sequence is found, pcre2_compile() returns a negative error code. If you know that your pattern is a valid UTF string, and you want to skip this check for performance reasons, you can set the PCRE2_NO_UTF_CHECK option. When it is set, the effect of passing an in- valid UTF string as a pattern is undefined. It may cause your program to crash or loop. Note that this option can also be passed to pcre2_match() and pcre2_dfa_match(), to suppress UTF validity checking of the subject string. Note also that setting PCRE2_NO_UTF_CHECK at compile time does not dis- able the error that is given if an escape sequence for an invalid Uni- code code point is encountered in the pattern. In particular, the so- called "surrogate" code points (0xd800 to 0xdfff) are invalid. If you want to allow escape sequences such as \x{d800} you can set the PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES extra option, as described in the section entitled "Extra compile options" below. However, this is pos- sible only in UTF-8 and UTF-32 modes, because these values are not rep- resentable in UTF-16. PCRE2_UCP This option has two effects. Firstly, it changes the way PCRE2 processes \B, \b, \D, \d, \S, \s, \W, \w, and some of the POSIX character classes. By default, only ASCII characters are recognized, but if PCRE2_UCP is set, Unicode properties are used to classify characters. There are some PCRE2_EXTRA options (see below) that add finer control to this behaviour. More details are given in the section on generic character types in the pcre2pattern page.
The second effect of PCRE2_UCP is to force the use of Unicode proper- ties for upper/lower casing operations, even when PCRE2_UTF is not set. This makes it possible to process strings in the 16-bit UCS-2 code. This option is available only if PCRE2 has been compiled with Unicode support (which is the default). The PCRE2_EXTRA_CASELESS_RESTRICT option (see above) restricts caseless matching such that ASCII characters match only ASCII characters and non-ASCII characters match only non-ASCII characters. The PCRE2_EX- TRA_TURKISH_CASING option (see above) alters the matching of the 'i' characters to follow their behaviour in Turkish and Azeri languages. For further details on PCRE2_EXTRA_CASELESS_RESTRICT and PCRE2_EX- TRA_TURKISH_CASING, see the pcre2unicode page. PCRE2_UNGREEDY This option inverts the "greediness" of the quantifiers so that they are not greedy by default, but become greedy if followed by "?". It is not compatible with Perl. It can also be set by a (?U) option setting within the pattern. PCRE2_USE_OFFSET_LIMIT This option must be set for pcre2_compile() if pcre2_set_offset_limit() is going to be used to set a non-default offset limit in a match con- text for matches that use this pattern. An error is generated if an offset limit is set without this option. For more details, see the de- scription of pcre2_set_offset_limit() in the section that describes match contexts. See also the PCRE2_FIRSTLINE option above. PCRE2_UTF This option causes PCRE2 to regard both the pattern and the subject strings that are subsequently processed as strings of UTF characters instead of single-code-unit strings. It is available when PCRE2 is built to include Unicode support (which is the default). If Unicode support is not available, the use of this option provokes an error. De- tails of how PCRE2_UTF changes the behaviour of PCRE2 are given in the pcre2unicode page. In particular, note that it changes the way PCRE2_CASELESS works. 
Extra compile options The option bits that can be set in a compile context by calling the pcre2_set_compile_extra_options() function are as follows: PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK Since release 10.38 PCRE2 has forbidden the use of \K within lookaround assertions, following Perl's lead. This option is provided to re-enable the previous behaviour (act in positive lookarounds, ignore in negative ones) in case anybody is relying on it. PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES This option applies when compiling a pattern in UTF-8 or UTF-32 mode. It is forbidden in UTF-16 mode, and ignored in non-UTF modes. Unicode "surrogate" code points in the range 0xd800 to 0xdfff are used in pairs in UTF-16 to encode code points with values in the range 0x10000 to 0x10ffff. The surrogates cannot therefore be represented in UTF-16. They can be represented in UTF-8 and UTF-32, but are defined as invalid code points, and cause errors if encountered in a UTF-8 or UTF-32 string that is being checked for validity by PCRE2. These values also cause errors if encountered in escape sequences such as \x{d912} within a pattern. However, it seems that some applications, when using PCRE2 to check for unwanted characters in UTF-8 strings, ex- plicitly test for the surrogates using escape sequences. The PCRE2_NO_UTF_CHECK option does not disable the error that occurs, be- cause it applies only to the testing of input strings for UTF validity. If the extra option PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is set, surro- gate code point values in UTF-8 and UTF-32 patterns no longer provoke errors and are incorporated in the compiled pattern. However, they can only match subject characters if the matching function is called with PCRE2_NO_UTF_CHECK set. PCRE2_EXTRA_ALT_BSUX The original option PCRE2_ALT_BSUX causes PCRE2 to process \U, \u, and \x in the way that ECMAscript (aka JavaScript) does. 
Additional func- tionality was defined by ECMAscript 6; setting PCRE2_EXTRA_ALT_BSUX has the effect of PCRE2_ALT_BSUX, but in addition it recognizes \u{hhh..} as a hexadecimal character code, where hhh.. is any number of hexadeci- mal digits. PCRE2_EXTRA_ASCII_BSD This option forces \d to match only ASCII digits, even when PCRE2_UCP is set. It can be changed within a pattern by means of the (?aD) op- tion setting. PCRE2_EXTRA_ASCII_BSS This option forces \s to match only ASCII space characters, even when PCRE2_UCP is set. It can be changed within a pattern by means of the (?aS) option setting. PCRE2_EXTRA_ASCII_BSW This option forces \w to match only ASCII word characters, even when PCRE2_UCP is set. It can be changed within a pattern by means of the (?aW) option setting. PCRE2_EXTRA_ASCII_DIGIT This option forces the POSIX character classes [:digit:] and [:xdigit:] to match only ASCII digits, even when PCRE2_UCP is set. It can be changed within a pattern by means of the (?aT) option setting. PCRE2_EXTRA_ASCII_POSIX This option forces all the POSIX character classes, including [:digit:] and [:xdigit:], to match only ASCII characters, even when PCRE2_UCP is set. It can be changed within a pattern by means of the (?aP) option setting, but note that this also sets PCRE2_EXTRA_ASCII_DIGIT in order to ensure that (?-aP) unsets all ASCII restrictions for POSIX classes. PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL This is a dangerous option. Use with care. By default, an unrecognized escape such as \j or a malformed one such as \x{2z} causes a compile- time error when detected by pcre2_compile(). Perl is somewhat inconsis- tent in handling such items: for example, \j is treated as a literal "j", and non-hexadecimal digits in \x{} are just ignored, though warn- ings are given in both cases if Perl's warning switch is enabled. How- ever, a malformed octal number after \o{ always causes an error in Perl. 
If the PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL extra option is passed to pcre2_compile(), all unrecognized or malformed escape sequences are treated as single-character escapes. For example, \j is a literal "j" and \x{2z} is treated as the literal string "x{2z}". Setting this op- tion means that typos in patterns may go undetected and have unexpected results. Also note that a sequence such as [\N{] is interpreted as a malformed attempt at [\N{...}] and so is treated as [N{] whereas [\N] gives an error because an unqualified \N is a valid escape sequence but is not supported in a character class. To reiterate: this is a danger- ous option. Use with great care. PCRE2_EXTRA_CASELESS_RESTRICT When either PCRE2_UCP or PCRE2_UTF is set, caseless matching follows Unicode rules, which allow for more than two cases per character. There are two case-equivalent character sets that contain both ASCII and non- ASCII characters. The ASCII letter S is case-equivalent to U+017f (long S) and the ASCII letter K is case-equivalent to U+212a (Kelvin sign). This option disables recognition of case-equivalences that cross the ASCII/non-ASCII boundary. In a caseless match, both characters must ei- ther be ASCII or non-ASCII. The option can be changed within a pattern by the (*CASELESS_RESTRICT) or (?r) option settings. PCRE2_EXTRA_ESCAPED_CR_IS_LF There are some legacy applications where the escape sequence \r in a pattern is expected to match a newline. If this option is set, \r in a pattern is converted to \n so that it matches a LF (linefeed) instead of a CR (carriage return) character. The option does not affect a lit- eral CR in the pattern, nor does it affect CR specified as an explicit code point such as \x{0D}. PCRE2_EXTRA_MATCH_LINE This option is provided for use by the -x option of pcre2grep. It causes the pattern only to match complete lines. This is achieved by automatically inserting the code for "^(?:" at the start of the com- piled pattern and ")$" at the end. 
Thus, when PCRE2_MULTILINE is set, the matched line may be in the middle of the subject string. This op- tion can be used with PCRE2_LITERAL. PCRE2_EXTRA_MATCH_WORD This option is provided for use by the -w option of pcre2grep. It causes the pattern only to match strings that have a word boundary at the start and the end. This is achieved by automatically inserting the code for "\b(?:" at the start of the compiled pattern and ")\b" at the end. The option may be used with PCRE2_LITERAL. However, it is ignored if PCRE2_EXTRA_MATCH_LINE is also set. PCRE2_EXTRA_NO_BS0 If this option is set (note that its final character is the digit 0) it locks out the use of the sequence \0 unless at least one more octal digit follows. PCRE2_EXTRA_PYTHON_OCTAL If this option is set, PCRE2 follows Python's rules for interpreting octal escape sequences. The rules for handling sequences such as \14, which could be an octal number or a back reference are different. De- tails are given in the pcre2pattern documentation. PCRE2_EXTRA_NEVER_CALLOUT If this option is set, PCRE2 treats callouts in the pattern as a syntax error, returning PCRE2_ERROR_CALLOUT_CALLER_DISABLED. This is useful if the application knows that a callout will not be provided to pcre2_match(), so that callouts in the pattern are not silently ig- nored. PCRE2_EXTRA_TURKISH_CASING This option alters case-equivalence of the 'i' letters to follow the alphabet used by Turkish and Azeri languages. The option can be changed within a pattern by the (*TURKISH_CASING) start-of-pattern setting. Ei- ther the UTF or UCP options must be set. In the 8-bit library, UTF must be set. This option cannot be combined with PCRE2_EXTRA_CASELESS_RE- STRICT. 
JUST-IN-TIME (JIT) COMPILATION int pcre2_jit_compile(pcre2_code *code, uint32_t options); int pcre2_jit_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length, PCRE2_SIZE startoffset, uint32_t options, pcre2_match_data *match_data, pcre2_match_context *mcontext); void pcre2_jit_free_unused_memory(pcre2_general_context *gcontext); pcre2_jit_stack *pcre2_jit_stack_create(size_t startsize, size_t maxsize, pcre2_general_context *gcontext); void pcre2_jit_stack_assign(pcre2_match_context *mcontext, pcre2_jit_callback callback_function, void *callback_data); void pcre2_jit_stack_free(pcre2_jit_stack *jit_stack); These functions provide support for JIT compilation, which, if the just-in-time compiler is available, further processes a compiled pat- tern into machine code that executes much faster than the pcre2_match() interpretive matching function. Full details are given in the pcre2jit documentation. JIT compilation is a heavyweight optimization. It can take some time for patterns to be analyzed, and for one-off matches and simple pat- terns the benefit of faster execution might be offset by a much slower compilation time. Most (but not all) patterns can be optimized by the JIT compiler. LOCALE SUPPORT const uint8_t *pcre2_maketables(pcre2_general_context *gcontext); void pcre2_maketables_free(pcre2_general_context *gcontext, const uint8_t *tables); PCRE2 handles caseless matching, and determines whether characters are letters, digits, or whatever, by reference to a set of tables, indexed by character code point. However, this applies only to characters whose code points are less than 256. By default, higher-valued code points never match escapes such as \w or \d. When PCRE2 is built with Unicode support (the default), certain Unicode character properties can be tested with \p and \P, or, alternatively, the PCRE2_UCP option can be set when a pattern is compiled; this causes \w and friends to use Unicode property support instead of the built-in tables. 
PCRE2_UCP also causes upper/lower casing operations on charac- ters with code points greater than 127 to use Unicode properties. These effects apply even when PCRE2_UTF is not set. There are, however, some PCRE2_EXTRA options (see above) that can be used to modify or suppress them. The use of locales with Unicode is discouraged. If you are handling characters with code points greater than 127, you should either use Unicode support, or use locales, but not try to mix the two. PCRE2 contains a built-in set of character tables that are used by de- fault. These are sufficient for many applications. Normally, the in- ternal tables recognize only ASCII characters. However, when PCRE2 is built, it is possible to cause the internal tables to be rebuilt in the default "C" locale of the local system, which may cause them to be dif- ferent. The built-in tables can be overridden by tables supplied by the appli- cation that calls PCRE2. These may be created in a different locale from the default. As more and more applications change to using Uni- code, the need for this locale support is expected to die away. External tables are built by calling the pcre2_maketables() function, in the relevant locale. The only argument to this function is a general context, which can be used to pass a custom memory allocator. If the argument is NULL, the system malloc() is used. The result can be passed to pcre2_compile() as often as necessary, by creating a compile context and calling pcre2_set_character_tables() to set the tables pointer therein. 
For example, to build and use tables that are appropriate for the French locale (where accented characters with values greater than 127 are treated as letters), the following code could be used: setlocale(LC_CTYPE, "fr_FR"); tables = pcre2_maketables(NULL); ccontext = pcre2_compile_context_create(NULL); pcre2_set_character_tables(ccontext, tables); re = pcre2_compile(..., ccontext); The locale name "fr_FR" is used on Linux and other Unix-like systems; if you are using Windows, the name for the French locale is "french". The pointer that is passed (via the compile context) to pcre2_compile() is saved with the compiled pattern, and the same tables are used by the matching functions. Thus, for any single pattern, compilation and matching both happen in the same locale, but different patterns can be processed in different locales. It is the caller's responsibility to ensure that the memory containing the tables remains available while they are still in use. When they are no longer needed, you can discard them using pcre2_maketables_free(), which should pass as its first parameter the same general context that was used to create the tables. Saving locale tables The tables described above are just a sequence of binary bytes, which makes them independent of hardware characteristics such as endianness or whether the processor is 32-bit or 64-bit. A copy of the result of pcre2_maketables() can therefore be saved in a file or elsewhere and re-used later, even in a different program or on another computer. The size of the tables (number of bytes) must be obtained by calling pcre2_config() with the PCRE2_CONFIG_TABLES_LENGTH option because pcre2_maketables() does not return this value. Note that the pcre2_dftables program, which is part of the PCRE2 build system, can be used stand-alone to create a file that contains a set of binary tables. See the pcre2build documentation for details.
INFORMATION ABOUT A COMPILED PATTERN int pcre2_pattern_info(const pcre2_code *code, uint32_t what, void *where); The pcre2_pattern_info() function returns general information about a compiled pattern. For information about callouts, see the next section. The first argument for pcre2_pattern_info() is a pointer to the com- piled pattern. The second argument specifies which piece of information is required, and the third argument is a pointer to a variable to re- ceive the data. If the third argument is NULL, the first argument is ignored, and the function returns the size in bytes of the variable that is required for the information requested. Otherwise, the yield of the function is zero for success, or one of the following negative num- bers: PCRE2_ERROR_NULL the argument code was NULL PCRE2_ERROR_BADMAGIC the "magic number" was not found PCRE2_ERROR_BADOPTION the value of what was invalid PCRE2_ERROR_UNSET the requested field is not set The "magic number" is placed at the start of each compiled pattern as a simple check against passing an arbitrary memory pointer. Here is a typical call of pcre2_pattern_info(), to obtain the length of the com- piled pattern: int rc; size_t length; rc = pcre2_pattern_info( re, /* result of pcre2_compile() */ PCRE2_INFO_SIZE, /* what is required */ &length); /* where to put the data */ The possible values for the second argument are defined in pcre2.h, and are as follows: PCRE2_INFO_ALLOPTIONS PCRE2_INFO_ARGOPTIONS PCRE2_INFO_EXTRAOPTIONS Return copies of the pattern's options. The third argument should point to a uint32_t variable. PCRE2_INFO_ARGOPTIONS returns exactly the op- tions that were passed to pcre2_compile(), whereas PCRE2_INFO_ALLOP- TIONS returns the compile options as modified by any top-level (*XXX) option settings such as (*UTF) at the start of the pattern itself. PCRE2_INFO_EXTRAOPTIONS returns the extra options that were set in the compile context by calling the pcre2_set_compile_extra_options() func- tion.
For example, if the pattern /(*UTF)abc/ is compiled with the PCRE2_EX- TENDED option, the result for PCRE2_INFO_ALLOPTIONS is PCRE2_EXTENDED and PCRE2_UTF. Option settings such as (?i) that can change within a pattern do not affect the result of PCRE2_INFO_ALLOPTIONS, even if they appear right at the start of the pattern. (This was different in some earlier releases.) A pattern compiled without PCRE2_ANCHORED is automatically anchored by PCRE2 if the first significant item in every top-level branch is one of the following: ^ unless PCRE2_MULTILINE is set \A always \G always .* sometimes - see below When .* is the first significant item, anchoring is possible only when all the following are true: .* is not in an atomic group .* is not in a capture group that is the subject of a backreference PCRE2_DOTALL is in force for .* Neither (*PRUNE) nor (*SKIP) appears in the pattern PCRE2_NO_DOTSTAR_ANCHOR is not set Dotstar anchoring has not been disabled with PCRE2_DOTSTAR_ANCHOR_OFF For patterns that are auto-anchored, the PCRE2_ANCHORED bit is set in the options returned for PCRE2_INFO_ALLOPTIONS. PCRE2_INFO_BACKREFMAX Return the number of the highest backreference in the pattern. The third argument should point to a uint32_t variable. Named capture groups acquire numbers as well as names, and these count towards the highest backreference. Backreferences such as \4 or \g{12} match the captured characters of the given group, but in addition, the check that a capture group is set in a conditional group such as (?(3)a|b) is also a backreference. Zero is returned if there are no backreferences. PCRE2_INFO_BSR The output is a uint32_t integer whose value indicates what character sequences the \R escape sequence matches. A value of PCRE2_BSR_UNICODE means that \R matches any Unicode line ending sequence; a value of PCRE2_BSR_ANYCRLF means that \R matches only CR, LF, or CRLF. PCRE2_INFO_CAPTURECOUNT Return the highest capture group number in the pattern. 
In patterns where (?| is not used, this is also the total number of capture groups. The third argument should point to a uint32_t variable. PCRE2_INFO_DEPTHLIMIT If the pattern set a backtracking depth limit by including an item of the form (*LIMIT_DEPTH=nnnn) at the start, the value is returned. The third argument should point to a uint32_t integer. If no such value has been set, the call to pcre2_pattern_info() returns the error PCRE2_ER- ROR_UNSET. Note that this limit will only be used during matching if it is less than the limit set or defaulted by the caller of the match function. PCRE2_INFO_FIRSTBITMAP In the absence of a single first code unit for a non-anchored pattern, pcre2_compile() may construct a 256-bit table that defines a fixed set of values for the first code unit in any match. For example, a pattern that starts with [abc] results in a table with three bits set. When code unit values greater than 255 are supported, the flag bit for 255 means "any code unit of value 255 or above". If such a table was con- structed, a pointer to it is returned. Otherwise NULL is returned. The third argument should point to a const uint8_t * variable. PCRE2_INFO_FIRSTCODETYPE Return information about the first code unit of any matched string, for a non-anchored pattern. The third argument should point to a uint32_t variable. If there is a fixed first value, for example, the letter "c" from a pattern such as (cat|cow|coyote), 1 is returned, and the value can be retrieved using PCRE2_INFO_FIRSTCODEUNIT. If there is no fixed first value, but it is known that a match can occur only at the start of the subject or following a newline in the subject, 2 is returned. Otherwise, and for anchored patterns, 0 is returned. PCRE2_INFO_FIRSTCODEUNIT Return the value of the first code unit of any matched string for a pattern where PCRE2_INFO_FIRSTCODETYPE returns 1; otherwise return 0. The third argument should point to a uint32_t variable. 
In the 8-bit library, the value is always less than 256. In the 16-bit library the value can be up to 0xffff. In the 32-bit library in UTF-32 mode the value can be up to 0x10ffff, and up to 0xffffffff when not using UTF-32 mode. PCRE2_INFO_FRAMESIZE Return the size (in bytes) of the data frames that are used to remember backtracking positions when the pattern is processed by pcre2_match() without the use of JIT. The third argument should point to a size_t variable. The frame size depends on the number of capturing parentheses in the pattern. Each additional capture group adds two PCRE2_SIZE vari- ables. PCRE2_INFO_HASBACKSLASHC Return 1 if the pattern contains any instances of \C, otherwise 0. The third argument should point to a uint32_t variable. PCRE2_INFO_HASCRORLF Return 1 if the pattern contains any explicit matches for CR or LF characters, otherwise 0. The third argument should point to a uint32_t variable. An explicit match is either a literal CR or LF character, or \r or \n or one of the equivalent hexadecimal or octal escape se- quences. PCRE2_INFO_HEAPLIMIT If the pattern set a heap memory limit by including an item of the form (*LIMIT_HEAP=nnnn) at the start, the value is returned. The third argu- ment should point to a uint32_t integer. If no such value has been set, the call to pcre2_pattern_info() returns the error PCRE2_ERROR_UNSET. Note that this limit will only be used during matching if it is less than the limit set or defaulted by the caller of the match function. PCRE2_INFO_JCHANGED Return 1 if the (?J) or (?-J) option setting is used in the pattern, otherwise 0. The third argument should point to a uint32_t variable. (?J) and (?-J) set and unset the local PCRE2_DUPNAMES option, respec- tively. PCRE2_INFO_JITSIZE If the compiled pattern was successfully processed by pcre2_jit_com- pile(), return the size of the JIT compiled code, otherwise return zero. The third argument should point to a size_t variable. 
PCRE2_INFO_LASTCODETYPE Returns 1 if there is a rightmost literal code unit that must exist in any matched string, other than at its start. The third argument should point to a uint32_t variable. If there is no such value, 0 is returned. When 1 is returned, the code unit value itself can be retrieved using PCRE2_INFO_LASTCODEUNIT. For anchored patterns, a last literal value is recorded only if it follows something of variable length. For example, for the pattern /^a\d+z\d+/ the returned value is 1 (with "z" returned from PCRE2_INFO_LASTCODEUNIT), but for /^a\dz\d/ the returned value is 0. PCRE2_INFO_LASTCODEUNIT Return the value of the rightmost literal code unit that must exist in any matched string, other than at its start, for a pattern where PCRE2_INFO_LASTCODETYPE returns 1. Otherwise, return 0. The third argu- ment should point to a uint32_t variable. PCRE2_INFO_MATCHEMPTY Return 1 if the pattern might match an empty string, otherwise 0. The third argument should point to a uint32_t variable. When a pattern con- tains recursive subroutine calls it is not always possible to determine whether or not it can match an empty string. PCRE2 takes a cautious ap- proach and returns 1 in such cases. PCRE2_INFO_MATCHLIMIT If the pattern set a match limit by including an item of the form (*LIMIT_MATCH=nnnn) at the start, the value is returned. The third ar- gument should point to a uint32_t integer. If no such value has been set, the call to pcre2_pattern_info() returns the error PCRE2_ERROR_UN- SET. Note that this limit will only be used during matching if it is less than the limit set or defaulted by the caller of the match func- tion. PCRE2_INFO_MAXLOOKBEHIND A lookbehind assertion moves back a certain number of characters (not code units) when it starts to process each of its branches. This re- quest returns the largest of these backward moves. The third argument should point to a uint32_t integer. 
The simple assertions \b and \B re- quire a one-character lookbehind and cause PCRE2_INFO_MAXLOOKBEHIND to return 1 in the absence of anything longer. \A also registers a one- character lookbehind, though it does not actually inspect the previous character. Note that this information is useful for multi-segment matching only if the pattern contains no nested lookbehinds. For example, the pattern (?<=a(?<=ba)c) returns a maximum lookbehind of 2, but when it is processed, the first lookbehind moves back by two characters, matches one character, then the nested lookbehind also moves back by two char- acters. This puts the matching point three characters earlier than it was at the start. PCRE2_INFO_MAXLOOKBEHIND is really only useful as a debugging tool. See the pcre2partial documentation for a discussion of multi-segment matching. PCRE2_INFO_MINLENGTH If a minimum length for matching subject strings was computed, its value is returned. Otherwise the returned value is 0. This value is not computed when PCRE2_NO_START_OPTIMIZE is set. The value is a number of characters, which in UTF mode may be different from the number of code units. The third argument should point to a uint32_t variable. The value is a lower bound to the length of any matching string. There may not be any strings of that length that do actually match, but every string that does match is at least that long. PCRE2_INFO_NAMECOUNT PCRE2_INFO_NAMEENTRYSIZE PCRE2_INFO_NAMETABLE PCRE2 supports the use of named as well as numbered capturing parenthe- ses. The names are just an additional way of identifying the parenthe- ses, which still acquire numbers. Several convenience functions such as pcre2_substring_get_byname() are provided for extracting captured sub- strings by name. It is also possible to extract the data directly, by first converting the name to a number in order to access the correct pointers in the output vector (described with pcre2_match() below). 
To do the conversion, you need to use the name-to-number map, which is de- scribed by these three values. The map consists of a number of fixed-size entries. PCRE2_INFO_NAME- COUNT gives the number of entries, and PCRE2_INFO_NAMEENTRYSIZE gives the size of each entry in code units; both of these return a uint32_t value. The entry size depends on the length of the longest name. PCRE2_INFO_NAMETABLE returns a pointer to the first entry of the table. This is a PCRE2_SPTR pointer to a block of code units. In the 8-bit li- brary, the first two bytes of each entry are the number of the captur- ing parenthesis, most significant byte first. In the 16-bit library, the pointer points to 16-bit code units, the first of which contains the parenthesis number. In the 32-bit library, the pointer points to 32-bit code units, the first of which contains the parenthesis number. The rest of the entry is the corresponding name, zero terminated. The names are in alphabetical order. If (?| is used to create multiple capture groups with the same number, as described in the section on du- plicate group numbers in the pcre2pattern page, the groups may be given the same name, but there is only one entry in the table. Different names for groups of the same number are not permitted. Duplicate names for capture groups with different numbers are permit- ted, but only if PCRE2_DUPNAMES is set. They appear in the table in the order in which they were found in the pattern. In the absence of (?| this is the order of increasing number; when (?| is used this is not necessarily the case because later capture groups may have lower num- bers. As a simple example of the name/number table, consider the following pattern after compilation by the 8-bit library (assume PCRE2_EXTENDED is set, so white space - including newlines - is ignored): (?<date> (?<year>(\d\d)?\d\d) - (?<month>\d\d) - (?<day>\d\d) ) There are four named capture groups, so the table has four entries, and each entry in the table is eight bytes long. 
The table is as follows, with non-printing bytes shown in hexadecimal, and undefined bytes shown as ??: 00 01 d a t e 00 ?? 00 05 d a y 00 ?? ?? 00 04 m o n t h 00 00 02 y e a r 00 ?? When writing code to extract data from named capture groups using the name-to-number map, remember that the length of the entries is likely to be different for each compiled pattern. PCRE2_INFO_NEWLINE The output is one of the following uint32_t values: PCRE2_NEWLINE_CR Carriage return (CR) PCRE2_NEWLINE_LF Linefeed (LF) PCRE2_NEWLINE_CRLF Carriage return, linefeed (CRLF) PCRE2_NEWLINE_ANY Any Unicode line ending PCRE2_NEWLINE_ANYCRLF Any of CR, LF, or CRLF PCRE2_NEWLINE_NUL The NUL character (binary zero) This identifies the character sequence that will be recognized as mean- ing "newline" while matching. PCRE2_INFO_SIZE Return the size of the compiled pattern in bytes (for all three li- braries). The third argument should point to a size_t variable. This value includes the size of the general data block that precedes the code units of the compiled pattern itself. The value that is used when pcre2_compile() is getting memory in which to place the compiled pat- tern may be slightly larger than the value returned by this option, be- cause there are cases where the code that calculates the size has to over-estimate. Processing a pattern with the JIT compiler does not al- ter the value returned by this option. INFORMATION ABOUT A PATTERN'S CALLOUTS int pcre2_callout_enumerate(const pcre2_code *code, int (*callback)(pcre2_callout_enumerate_block *, void *), void *user_data); A script language that supports the use of string arguments in callouts might like to scan all the callouts in a pattern before running the match. This can be done by calling pcre2_callout_enumerate(). The first argument is a pointer to a compiled pattern, the second points to a callback function, and the third is arbitrary user data. 
The callback function is called for every callout in the pattern in the order in which they appear. Its first argument is a pointer to a callout enumer- ation block, and its second argument is the user_data value that was passed to pcre2_callout_enumerate(). The contents of the callout enu- meration block are described in the pcre2callout documentation, which also gives further details about callouts. SERIALIZATION AND PRECOMPILING It is possible to save compiled patterns on disc or elsewhere, and re- load them later, subject to a number of restrictions. The host on which the patterns are reloaded must be running the same version of PCRE2, with the same code unit width, and must also have the same endianness, pointer width, and PCRE2_SIZE type. Before compiled patterns can be saved, they must be converted to a "serialized" form, which in the case of PCRE2 is really just a bytecode dump. The functions whose names be- gin with pcre2_serialize_ are used for converting to and from the seri- alized form. They are described in the pcre2serialize documentation. Note that PCRE2 serialization does not convert compiled patterns to an abstract format like Java or .NET serialization. THE MATCH DATA BLOCK pcre2_match_data *pcre2_match_data_create(uint32_t ovecsize, pcre2_general_context *gcontext); pcre2_match_data *pcre2_match_data_create_from_pattern( const pcre2_code *code, pcre2_general_context *gcontext); void pcre2_match_data_free(pcre2_match_data *match_data); Information about a successful or unsuccessful match is placed in a match data block, which is an opaque structure that is accessed by function calls. In particular, the match data block contains a vector of offsets into the subject string that define the matched parts of the subject. This is known as the ovector. Before calling pcre2_match(), pcre2_dfa_match(), or pcre2_jit_match() you must create a match data block by calling one of the creation func- tions above. 
For pcre2_match_data_create(), the first argument is the number of pairs of offsets in the ovector. When using pcre2_match(), one pair of offsets is required to identify the string that matched the whole pattern, with an additional pair for each captured substring. For example, a value of 4 creates enough space to record the matched portion of the subject plus three captured sub- strings. When using pcre2_dfa_match() there may be multiple matched substrings of different lengths at the same point in the subject. The ovector should be made large enough to hold as many as are expected. A minimum of at least 1 pair is imposed by pcre2_match_data_create(), so it is always possible to return the overall matched string in the case of pcre2_match() or the longest match in the case of pcre2_dfa_match(). The maximum number of pairs is 65535; if the first argument of pcre2_match_data_create() is greater than this, 65535 is used. The second argument of pcre2_match_data_create() is a pointer to a gen- eral context, which can specify custom memory management for obtaining the memory for the match data block. If you are not using custom memory management, pass NULL, which causes malloc() to be used. For pcre2_match_data_create_from_pattern(), the first argument is a pointer to a compiled pattern. The ovector is created to be exactly the right size to hold all the substrings a pattern might capture when matched using pcre2_match(). You should not use this call when matching with pcre2_dfa_match(). The second argument is again a pointer to a general context, but in this case if NULL is passed, the memory is ob- tained using the same allocator that was used for the compiled pattern (custom or default). A match data block can be used many times, with the same or different compiled patterns. You can extract information from a match data block after a match operation has finished, using functions that are de- scribed in the sections on matched strings and other match data below. 
When a call of pcre2_match() fails, valid data is available in the match block only when the error is PCRE2_ERROR_NOMATCH, PCRE2_ER- ROR_PARTIAL, or one of the error codes for an invalid UTF string. Ex- actly what is available depends on the error, and is detailed below. When one of the matching functions is called, pointers to the compiled pattern and the subject string are set in the match data block so that they can be referenced by the extraction functions after a successful match. After running a match, you must not free a compiled pattern or a subject string until after all operations on the match data block (for that match) have taken place, unless, in the case of the subject string, you have used the PCRE2_COPY_MATCHED_SUBJECT option, which is described in the section entitled "Option bits for pcre2_match()" be- low. When a match data block itself is no longer needed, it should be freed by calling pcre2_match_data_free(). If this function is called with a NULL argument, it returns immediately, without doing anything. MEMORY USE FOR MATCH DATA BLOCKS PCRE2_SIZE pcre2_get_match_data_size(pcre2_match_data *match_data); PCRE2_SIZE pcre2_get_match_data_heapframes_size( pcre2_match_data *match_data); The size of a match data block depends on the size of the ovector that it contains. The function pcre2_get_match_data_size() returns the size, in bytes, of the block that is its argument. When pcre2_match() runs interpretively (that is, without using JIT), it makes use of a vector of data frames for remembering backtracking posi- tions. The size of each individual frame depends on the number of cap- turing parentheses in the pattern and can be obtained by calling pcre2_pattern_info() with the PCRE2_INFO_FRAMESIZE option (see the sec- tion entitled "Information about a compiled pattern" above). Heap memory is used for the frames vector; if the initial memory block turns out to be too small during matching, it is automatically ex- panded. 
When pcre2_match() returns, the memory is not freed, but re- mains attached to the match data block, for use by any subsequent matches that use the same block. It is automatically freed when the match data block itself is freed. You can find the current size of the frames vector that a match data block owns by calling pcre2_get_match_data_heapframes_size(). For a newly created match data block the size will be zero. Some types of match may require a lot of frames and thus a large vector; applications that run in environments where memory is constrained can check this and free the match data block if the heap frames vector has become too big. MATCHING A PATTERN: THE TRADITIONAL FUNCTION int pcre2_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length, PCRE2_SIZE startoffset, uint32_t options, pcre2_match_data *match_data, pcre2_match_context *mcontext); The function pcre2_match() is called to match a subject string against a compiled pattern, which is passed in the code argument. You can call pcre2_match() with the same code argument as many times as you like, in order to find multiple matches in the subject string or to match dif- ferent subject strings with the same pattern. This function is the main matching facility of the library, and it op- erates in a Perl-like manner. For specialist use there is also an al- ternative matching function, which is described below in the section about the pcre2_dfa_match() function. Here is an example of a simple call to pcre2_match(): pcre2_match_data *md = pcre2_match_data_create(4, NULL); int rc = pcre2_match( re, /* result of pcre2_compile() */ "some string", /* the subject string */ 11, /* the length of the subject string */ 0, /* start at offset 0 in the subject */ 0, /* default options */ md, /* the match data block */ NULL); /* a match context; NULL means use defaults */ If the subject string is zero-terminated, the length can be given as PCRE2_ZERO_TERMINATED. 
A match context must be provided if certain less common matching parameters are to be changed. For details, see the sec- tion on the match context above. The string to be matched by pcre2_match() The subject string is passed to pcre2_match() as a pointer in subject, a length in length, and a starting offset in startoffset. The length and offset are in code units, not characters. That is, they are in bytes for the 8-bit library, 16-bit code units for the 16-bit library, and 32-bit code units for the 32-bit library, whether or not UTF pro- cessing is enabled. As a special case, if subject is NULL and length is zero, the subject is assumed to be an empty string. If length is non- zero, an error occurs if subject is NULL. If startoffset is greater than the length of the subject, pcre2_match() returns PCRE2_ERROR_BADOFFSET. When the starting offset is zero, the search for a match starts at the beginning of the subject, and this is by far the most common case. In UTF-8 or UTF-16 mode, the starting off- set must point to the start of a character, or to the end of the sub- ject (in UTF-32 mode, one code unit equals one character, so all off- sets are valid). Like the pattern string, the subject may contain bi- nary zeros. A non-zero starting offset is useful when searching for another match in the same subject by calling pcre2_match() again after a previous success. Setting startoffset differs from passing over a shortened string and setting PCRE2_NOTBOL in the case of a pattern that begins with any kind of lookbehind. For example, consider the pattern \Biss\B which finds occurrences of "iss" in the middle of words. (\B matches only if the current position in the subject is not a word boundary.) When applied to the string "Mississippi" the first call to pcre2_match() finds the first occurrence. 
If pcre2_match() is called again with just the remainder of the subject, namely "issippi", it does not match, because \B is always false at the start of the subject, which is deemed to be a word boundary. However, if pcre2_match() is passed the entire string again, but with startoffset set to 4, it finds the second occurrence of "iss" because it is able to look behind the starting point to discover that it is preceded by a letter. Finding all the matches in a subject is tricky when the pattern can match an empty string. PCRE2 includes a helper API to assist with this; see the section entitled "Iterating over all matches" below for de- tails. If a non-zero starting offset is passed when the pattern is anchored, a single attempt to match at the given offset is made. This can only suc- ceed if the pattern does not require the match to be at the start of the subject. In other words, the anchoring must be the result of set- ting the PCRE2_ANCHORED option or the use of .* with PCRE2_DOTALL, not by starting the pattern with ^ or \A. Option bits for pcre2_match() The unused bits of the options argument for pcre2_match() must be zero. The only bits that may be set are PCRE2_ANCHORED, PCRE2_COPY_MATCHED_SUBJECT, PCRE2_DISABLE_RECURSELOOP_CHECK, PCRE2_EN- DANCHORED, PCRE2_NOTBOL, PCRE2_NOTEOL, PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART, PCRE2_NO_JIT, PCRE2_NO_UTF_CHECK, PCRE2_PAR- TIAL_HARD, and PCRE2_PARTIAL_SOFT. Their action is described below. Setting PCRE2_ANCHORED or PCRE2_ENDANCHORED at match time is not sup- ported by the just-in-time (JIT) compiler. If it is set, JIT matching is disabled and the interpretive code in pcre2_match() is run. PCRE2_DISABLE_RECURSELOOP_CHECK is ignored by JIT, but apart from PCRE2_NO_JIT (obviously), the remaining options are supported for JIT matching. PCRE2_ANCHORED The PCRE2_ANCHORED option limits pcre2_match() to matching at the first matching position. 
If a pattern was compiled with PCRE2_ANCHORED, or turned out to be anchored by virtue of its contents, it cannot be made unanchored at matching time. Note that setting the option at match time disables JIT matching. PCRE2_COPY_MATCHED_SUBJECT By default, a pointer to the subject is remembered in the match data block so that, after a successful match, it can be referenced by the substring extraction functions. This means that the subject's memory must not be freed until all such operations are complete. For some ap- plications where the lifetime of the subject string is not guaranteed, it may be necessary to make a copy of the subject string, but it is wasteful to do this unless the match is successful. After a successful match, if PCRE2_COPY_MATCHED_SUBJECT is set, the subject is copied and the new pointer is remembered in the match data block instead of the original subject pointer. The memory allocator that was used for the match block itself is used. The copy is automatically freed when pcre2_match_data_free() is called to free the match data block. It is also automatically freed if the match data block is re-used for another match operation. PCRE2_DISABLE_RECURSELOOP_CHECK This option is relevant only to pcre2_match() for interpretive match- ing. It is ignored when JIT is used, and is forbidden for pcre2_dfa_match(). The use of recursion in patterns can lead to infinite loops. In the in- terpretive matcher these would be eventually caught by the match or heap limits, but this could take a long time and/or use a lot of memory if the limits are large. There is therefore a check at the start of each recursion. If the same group is still active from a previous call, and the current subject pointer is the same as it was at the start of that group, and the furthest inspected character of the sub- ject has not changed, an error is generated. There are rare cases of matches that would complete, but nevertheless trigger this error. This option disables the check. 
It is provided mainly for testing when comparing JIT and interpretive behaviour. PCRE2_ENDANCHORED If the PCRE2_ENDANCHORED option is set, any string that pcre2_match() matches must be right at the end of the subject string. Note that set- ting the option at match time disables JIT matching. PCRE2_NOTBOL This option specifies that the first character of the subject string is not the beginning of a line, so the circumflex metacharacter should not match before it. Setting this without having set PCRE2_MULTILINE at compile time causes circumflex never to match. This option affects only the behaviour of the circumflex metacharacter. It does not affect \A. PCRE2_NOTEOL This option specifies that the end of the subject string is not the end of a line, so the dollar metacharacter should not match it nor (except in multiline mode) a newline immediately before it. Setting this with- out having set PCRE2_MULTILINE at compile time causes dollar never to match. This option affects only the behaviour of the dollar metacharac- ter. It does not affect \Z or \z. PCRE2_NOTEMPTY An empty string is not considered to be a valid match if this option is set. If there are alternatives in the pattern, they are tried. If all the alternatives match the empty string, the entire match fails. For example, if the pattern a?b? is applied to a string not beginning with "a" or "b", it matches an empty string at the start of the subject. With PCRE2_NOTEMPTY set, this match is not valid, so pcre2_match() searches further into the string for occurrences of "a" or "b". PCRE2_NOTEMPTY_ATSTART This is like PCRE2_NOTEMPTY, except that it locks out an empty string match only at the first matching position, that is, at the start of the subject plus the starting offset. An empty string match later in the subject is permitted. If the pattern is anchored, such a match can oc- cur only if the pattern contains \K. 
PCRE2_NO_JIT By default, if a pattern has been successfully processed by pcre2_jit_compile(), JIT is automatically used when pcre2_match() is called with options that JIT supports. Setting PCRE2_NO_JIT disables the use of JIT; it forces matching to be done by the interpreter. PCRE2_NO_UTF_CHECK When PCRE2_UTF is set at compile time, the validity of the subject as a UTF string is checked unless PCRE2_NO_UTF_CHECK is passed to pcre2_match() or PCRE2_MATCH_INVALID_UTF was passed to pcre2_compile(). The latter special case is discussed in detail in the pcre2unicode doc- umentation. In the default case, if a non-zero starting offset is given, the check is applied only to that part of the subject that could be inspected during matching, and there is a check that the starting offset points to the first code unit of a character or to the end of the subject. If there are no lookbehind assertions in the pattern, the check starts at the starting offset. Otherwise, it starts at the length of the longest lookbehind before the starting offset, or at the start of the subject if there are not that many characters before the starting offset. Note that the sequences \b and \B are one-character lookbehinds. The check is carried out before any other processing takes place, and a negative error code is returned if the check fails. There are several UTF error codes for each code unit width, corresponding to different problems with the code unit sequence. There are discussions about the validity of UTF-8 strings, UTF-16 strings, and UTF-32 strings in the pcre2unicode documentation. If you know that your subject is valid, and you want to skip this check for performance reasons, you can set the PCRE2_NO_UTF_CHECK option when calling pcre2_match(). You might want to do this for the second and subsequent calls to pcre2_match() if you are making repeated calls to find multiple matches in the same subject string. 
Warning: Unless PCRE2_MATCH_INVALID_UTF was set at compile time, when PCRE2_NO_UTF_CHECK is set at match time the effect of passing an in- valid string as a subject, or an invalid value of startoffset, is unde- fined. Your program may crash or loop indefinitely or give wrong re- sults. PCRE2_PARTIAL_HARD PCRE2_PARTIAL_SOFT These options turn on the partial matching feature. A partial match oc- curs if the end of the subject string is reached successfully, but there are not enough subject characters to complete the match. In addi- tion, either at least one character must have been inspected or the pattern must contain a lookbehind, or the pattern must be one that could match an empty string. If this situation arises when PCRE2_PARTIAL_SOFT (but not PCRE2_PAR- TIAL_HARD) is set, matching continues by testing any remaining alterna- tives. Only if no complete match can be found is PCRE2_ERROR_PARTIAL returned instead of PCRE2_ERROR_NOMATCH. In other words, PCRE2_PAR- TIAL_SOFT specifies that the caller is prepared to handle a partial match, but only if no complete match can be found. If PCRE2_PARTIAL_HARD is set, it overrides PCRE2_PARTIAL_SOFT. In this case, if a partial match is found, pcre2_match() immediately returns PCRE2_ERROR_PARTIAL, without considering any other alternatives. In other words, when PCRE2_PARTIAL_HARD is set, a partial match is consid- ered to be more important than an alternative complete match. There is a more detailed discussion of partial and multi-segment match- ing, with examples, in the pcre2partial documentation. NEWLINE HANDLING WHEN MATCHING When PCRE2 is built, a default newline convention is set; this is usu- ally the standard convention for the operating system. The default can be overridden in a compile context by calling pcre2_set_newline(). It can also be overridden by starting a pattern string with, for example, (*CRLF), as described in the section on newline conventions in the pcre2pattern page. 
During matching, the newline choice affects the be- haviour of the dot, circumflex, and dollar metacharacters. It may also alter the way the match starting position is advanced after a match failure for an unanchored pattern. When PCRE2_NEWLINE_CRLF, PCRE2_NEWLINE_ANYCRLF, or PCRE2_NEWLINE_ANY is set as the newline convention, and a match attempt for an unanchored pattern fails when the current starting position is at a CRLF sequence, and the pattern contains no explicit matches for CR or LF characters, the match position is advanced by two characters instead of one, in other words, to after the CRLF. The above rule is a compromise that makes the most common cases work as expected. For example, if the pattern is .+A (and the PCRE2_DOTALL op- tion is not set), it does not match the string "\r\nA" because, after failing at the start, it skips both the CR and the LF before retrying. However, the pattern [\r\n]A does match that string, because it con- tains an explicit CR or LF reference, and so advances only by one char- acter after the first failure. An explicit match for CR or LF is either a literal appearance of one of those characters in the pattern, or one of the \r or \n or equivalent octal or hexadecimal escape sequences. Implicit matches such as [^X] do not count, nor does \s, even though it includes CR and LF in the char- acters that it matches. Notwithstanding the above, anomalous effects may still occur when CRLF is a valid newline sequence and explicit \r or \n escapes appear in the pattern. HOW PCRE2_MATCH() RETURNS A STRING AND CAPTURED SUBSTRINGS uint32_t pcre2_get_ovector_count(pcre2_match_data *match_data); PCRE2_SIZE *pcre2_get_ovector_pointer(pcre2_match_data *match_data); In general, a pattern matches a certain portion of the subject, and in addition, further substrings from the subject may be picked out by parenthesized parts of the pattern. 
Following the usage in Jeffrey Friedl's book, this is called "capturing" in what follows, and the phrase "capture group" (Perl terminology) is used for a fragment of a pattern that picks out a substring. PCRE2 supports several other kinds of parenthesized group that do not cause substrings to be captured. The pcre2_pattern_info() function can be used to find out how many capture groups there are in a compiled pattern. You can use auxiliary functions for accessing captured substrings by number or by name, as described in sections below. Alternatively, you can make direct use of the vector of PCRE2_SIZE val- ues, called the ovector, which contains the offsets of captured strings. It is part of the match data block. The function pcre2_get_ovector_pointer() returns the address of the ovector, and pcre2_get_ovector_count() returns the number of pairs of values it con- tains. Within the ovector, the first in each pair of values is set to the off- set of the first code unit of a substring, and the second is set to the offset of the first code unit after the end of a substring. These val- ues are always code unit offsets, not character offsets. That is, they are byte offsets in the 8-bit library, 16-bit offsets in the 16-bit li- brary, and 32-bit offsets in the 32-bit library. After a partial match (error return PCRE2_ERROR_PARTIAL), only the first pair of offsets (that is, ovector[0] and ovector[1]) are set. They identify the part of the subject that was partially matched. See the pcre2partial documentation for details of partial matching. After a fully successful match, the first pair of offsets identifies the portion of the subject string that was matched by the entire pat- tern. The next pair is used for the first captured substring, and so on. The value returned by pcre2_match() is one more than the highest numbered pair that has been set. For example, if two substrings have been captured, the returned value is 3. 
If there are no captured sub- strings, the return value from a successful match is 1, indicating that just the first pair of offsets has been set. If a pattern uses the \K escape sequence within a positive lookahead assertion, the reported start of a successful match can be greater than the end of the match. For example, if the pattern (?=ab\K) is matched against "ab", the start and end offset values for the match are 2 and 0. If a capture group is matched repeatedly within a single match opera- tion, it is the last portion of the subject that it matched that is re- turned. If the ovector is too small to hold all the captured substring offsets, as much as possible is filled in, and the function returns a value of zero. If captured substrings are not of interest, pcre2_match() may be called with a match data block whose ovector is of minimum length (that is, one pair). It is possible for capture group number n+1 to match some part of the subject when group n has not been used at all. For example, if the string "abc" is matched against the pattern (a|(z))(bc) the return from the function is 4, and groups 1 and 3 are matched, but 2 is not. When this happens, both values in the offset pairs corresponding to unused groups are set to PCRE2_UNSET. Offset values that correspond to unused groups at the end of the ex- pression are also set to PCRE2_UNSET. For example, if the string "abc" is matched against the pattern (abc)(x(yz)?)? groups 2 and 3 are not matched. The return from the function is 2, because the highest used capture group number is 1. The offsets for the second and third capture groups (assuming the vector is large enough, of course) are set to PCRE2_UNSET. Elements in the ovector that do not correspond to capturing parentheses in the pattern are never changed. That is, if a pattern contains n cap- turing parentheses, no more than ovector[0] to ovector[2n+1] are set by pcre2_match(). The other elements retain whatever values they previ- ously had. 
After a failed match attempt, the contents of the ovector are unchanged. OTHER INFORMATION ABOUT A MATCH PCRE2_SPTR pcre2_get_mark(pcre2_match_data *match_data); PCRE2_SIZE pcre2_get_startchar(pcre2_match_data *match_data); As well as the offsets in the ovector, other information about a match is retained in the match data block and can be retrieved by the above functions in appropriate circumstances. If they are called at other times, the result is undefined. After a successful match, a partial match (PCRE2_ERROR_PARTIAL), or a failure to match (PCRE2_ERROR_NOMATCH), a mark name may be available. The function pcre2_get_mark() can be called to access this name, which can be specified in the pattern by any of the backtracking control verbs, not just (*MARK). The same function applies to all the verbs. It returns a pointer to the zero-terminated name, which is within the com- piled pattern. If no name is available, NULL is returned. The length of the name (excluding the terminating zero) is stored in the code unit that precedes the name. You should use this length instead of relying on the terminating zero if the name might contain a binary zero. After a successful match, the name that is returned is the last mark name encountered on the matching path through the pattern. Instances of backtracking verbs without names do not count. Thus, for example, if the matching path contains (*MARK:A)(*PRUNE), the name "A" is returned. After a "no match" or a partial match, the last encountered name is re- turned. For example, consider this pattern: ^(*MARK:A)((*MARK:B)a|b)c When it matches "bc", the returned name is A. The B mark is "seen" in the first branch of the group, but it is not on the matching path. On the other hand, when this pattern fails to match "bx", the returned name is B. Warning: By default, certain start-of-match optimizations are used to give a fast "no match" result in some situations. 
For example, if the anchoring is removed from the pattern above, there is an initial check for the presence of "c" in the subject before running the matching en- gine. This check fails for "bx", causing a match failure without seeing any marks. You can disable the start-of-match optimizations by setting the PCRE2_NO_START_OPTIMIZE option for pcre2_compile() or by starting the pattern with (*NO_START_OPT). After a successful match, a partial match, or one of the invalid UTF errors (for example, PCRE2_ERROR_UTF8_ERR5), pcre2_get_startchar() can be called. After a successful or partial match it returns the code unit offset of the character at which the match started. For a non-partial match, this can be different to the value of ovector[0] if the pattern contains the \K escape sequence. After a partial match, however, this value is always the same as ovector[0] because \K does not affect the result of a partial match. After a UTF check failure, pcre2_get_startchar() can be used to obtain the code unit offset of the invalid UTF character. Details are given in the pcre2unicode page. ERROR RETURNS FROM pcre2_match() If pcre2_match() fails, it returns a negative number. This can be con- verted to a text string by calling the pcre2_get_error_message() func- tion (see "Obtaining a textual error message" below). Negative error codes are also returned by other functions, and are documented with them. The codes are given names in the header file. If UTF checking is in force and an invalid UTF subject string is detected, one of a number of UTF-specific negative error codes is returned. Details are given in the pcre2unicode page. The following are the other errors that may be returned by pcre2_match(): PCRE2_ERROR_NOMATCH The subject string did not match the pattern. PCRE2_ERROR_PARTIAL The subject string did not match, but it did match partially. See the pcre2partial documentation for details of partial matching. 
PCRE2_ERROR_BADMAGIC PCRE2 stores a 4-byte "magic number" at the start of the compiled code, to catch the case when it is passed a junk pointer. This is the error that is returned when the magic number is not present. PCRE2_ERROR_BADMODE This error is given when a compiled pattern is passed to a function in a library of a different code unit width, for example, a pattern com- piled by the 8-bit library is passed to a 16-bit or 32-bit library function. PCRE2_ERROR_BADOFFSET The value of startoffset was greater than the length of the subject. PCRE2_ERROR_BADOPTION An unrecognized bit was set in the options argument. PCRE2_ERROR_BADUTFOFFSET The UTF code unit sequence that was passed as a subject was checked and found to be valid (the PCRE2_NO_UTF_CHECK option was not set), but the value of startoffset did not point to the beginning of a UTF character or the end of the subject. PCRE2_ERROR_CALLOUT This error is never generated by pcre2_match() itself. It is provided for use by callout functions that want to cause pcre2_match() or pcre2_callout_enumerate() to return a distinctive error code. See the pcre2callout documentation for details. PCRE2_ERROR_DEPTHLIMIT The nested backtracking depth limit was reached. PCRE2_ERROR_HEAPLIMIT The heap limit was reached. PCRE2_ERROR_INTERNAL An unexpected internal error has occurred. This error could be caused by a bug in PCRE2 or by overwriting of the compiled pattern. PCRE2_ERROR_JIT_STACKLIMIT This error is returned when a pattern that was successfully studied us- ing JIT is being matched, but the memory available for the just-in-time processing stack is not large enough. See the pcre2jit documentation for more details. PCRE2_ERROR_MATCHLIMIT The backtracking match limit was reached. PCRE2_ERROR_NOMEMORY Heap memory is used to remember backtracking points. This error is given when the memory allocation function (default or custom) fails. 
Note that a different error, PCRE2_ERROR_HEAPLIMIT, is given if the amount of memory needed exceeds the heap limit. PCRE2_ERROR_NOMEMORY is also returned if PCRE2_COPY_MATCHED_SUBJECT is set and memory alloca- tion fails. PCRE2_ERROR_NULL Either the code, subject, or match_data argument was passed as NULL. PCRE2_ERROR_RECURSELOOP This error is returned when pcre2_match() detects a recursion loop within the pattern. Specifically, it means that either the whole pat- tern or a capture group has been called recursively for the second time at the same position in the subject string. Some simple patterns that might do this are detected and faulted at compile time, but more com- plicated cases, in particular mutual recursions between two different groups, cannot be detected until matching is attempted. OBTAINING A TEXTUAL ERROR MESSAGE int pcre2_get_error_message(int errorcode, PCRE2_UCHAR *buffer, PCRE2_SIZE bufflen); A text message for an error code from any PCRE2 function (compile, match, or auxiliary) can be obtained by calling pcre2_get_error_mes- sage(). The code is passed as the first argument, with the remaining two arguments specifying a code unit buffer and its length in code units, into which the text message is placed. The message is returned in code units of the appropriate width for the library that is being used. The returned message is terminated with a trailing zero, and the func- tion returns the number of code units used, excluding the trailing zero. If the error number is unknown, the negative error code PCRE2_ER- ROR_BADDATA is returned. If the buffer is too small, the message is truncated (but still with a trailing zero), and the negative error code PCRE2_ERROR_NOMEMORY is returned. None of the messages is very long; a buffer size of 120 code units is ample. 
ITERATING OVER ALL MATCHES int pcre2_next_match(pcre2_match_data *match_data, PCRE2_SIZE *pstart_offset, uint32_t *poptions); A common task for applications is to implement "global" matching behav- iour, for example, replacing all matches in the subject; splitting the subject on all matches; or simply counting the number of matches. The pcre2_next_match() function helps with this task by providing the ap- propriate parameters for the next match attempt (available since PCRE2 10.47). First, a match attempt should be made using one of the matching func- tions (pcre2_match(), pcre2_dfa_match(), or pcre2_jit_match()). Then, pcre2_next_match() can be called, providing the same match_data parame- ter. It returns 0 ("false") if there is no need to make a further match at- tempt, or 1 ("true") if another match should be attempted. Returning 1 does not imply that there is another match, only that another match should be attempted (which may return PCRE2_ERROR_NOMATCH). The *pstart_offset and *poptions are set if the function returns 1. The *pstart_offset should be passed to the next match attempt directly, and the *poptions should be passed to the next match attempt by combin- ing with the application's match options using OR. There is some code that demonstrates how to do this in the pcre2demo sample program. The general pattern is: uint32_t app_options = ...; uint32_t global_options = 0; PCRE2_SIZE start_offset = 0; while (1) { int rc = pcre2_match(re, subject, subject_len, start_offset, app_options | global_options, match_data, match_context); if (rc == PCRE2_ERROR_NOMATCH) break; /* no match, and no more attempts */ if (rc < 0) { ... exit } ...handle the match if (!pcre2_next_match(match_data, &start_offset, &global_options)) break; /* no more attempts */ } The guarantees provided by pcre2_next_match() are that the start_offset will advance, so the loop will definitely terminate. 
The conditions which ensure this are that either: (a) pcre2_next_match() returns 0 (false); or (b) the returned *pstart_offset is strictly greater than the previous start_offset; or (c) if the previous match was a success- ful match of the empty string then the returned *pstart_offset is equal to the previous ovector[1], and *poptions will be set to PCRE2_NOTEMPTY_ATSTART to prevent another empty match from being re- turned. A loop implemented as shown above will always terminate, unless there is a bug in PCRE2. As a measure of "defensive programming", applica- tions are encouraged to add an assertion or check to break their loop if it does not make progress (and report the issue as a bug). If an application does not use the flag PCRE2_EXTRA_AL- LOW_LOOKAROUND_BSK, then each match is "well-behaved" and satisfies: start_offset <= ovector[0] <= ovector[1]. In this case, the matches found by pcre2_match() with pcre2_next_match() will be sorted, non-overlapping (possibly touching), and with no duplicates. Otherwise, if PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK is used, then the guar- antees are considerably weaker. We do not guarantee that the matches will always advance: only that the start_offset will. The matches found by pcre2_match() with pcre2_next_match() will be a finite sequence (as pcre2_next_match() ensures that start_offset advances, so the search will terminate). The matches can however be overlapping, can contain duplicates, and (in truly pathological examples) may not even be sorted by ovector[0]. Additionally, each match itself can end before it starts (ovector[1] < ovector[0]). We recommend that applications do not set PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK. 
EXTRACTING CAPTURED SUBSTRINGS BY NUMBER int pcre2_substring_length_bynumber(pcre2_match_data *match_data, uint32_t number, PCRE2_SIZE *length); int pcre2_substring_copy_bynumber(pcre2_match_data *match_data, uint32_t number, PCRE2_UCHAR *buffer, PCRE2_SIZE *bufflen); int pcre2_substring_get_bynumber(pcre2_match_data *match_data, uint32_t number, PCRE2_UCHAR **bufferptr, PCRE2_SIZE *bufflen); void pcre2_substring_free(PCRE2_UCHAR *buffer); Captured substrings can be accessed directly by using the ovector as described above. For convenience, auxiliary functions are provided for extracting captured substrings as new, separate, zero-terminated strings. A substring that contains a binary zero is correctly extracted and has a further zero added on the end, but the result is not, of course, a C string. The functions in this section identify substrings by number. The number zero refers to the entire matched substring, with higher numbers refer- ring to substrings captured by parenthesized groups. After a partial match, only substring zero is available. An attempt to extract any other substring gives the error PCRE2_ERROR_PARTIAL. The next section describes similar functions for extracting captured substrings by name. If a pattern uses the \K escape sequence within a positive lookahead assertion, the reported start of a successful match can be greater than the end of the match. For example, if the pattern (?=ab\K) is matched against "ab", the start and end offset values for the match are 2 and 0. In this situation, calling these functions with a zero substring number extracts a zero-length empty string. You can find the length in code units of a captured substring without extracting it by calling pcre2_substring_length_bynumber(). The first argument is a pointer to the match data block, the second is the group number, and the third is a pointer to a variable into which the length is placed. 
If you just want to know whether or not the substring has been captured, you can pass the third argument as NULL. The pcre2_substring_copy_bynumber() function copies a captured sub- string into a supplied buffer, whereas pcre2_substring_get_bynumber() copies it into new memory, obtained using the same memory allocation function that was used for the match data block. The first two argu- ments of these functions are a pointer to the match data block and a capture group number. The final arguments of pcre2_substring_copy_bynumber() are a pointer to the buffer and a pointer to a variable that contains its length in code units. This is updated to contain the actual number of code units used for the extracted substring, excluding the terminating zero. For pcre2_substring_get_bynumber() the third and fourth arguments point to variables that are updated with a pointer to the new memory and the number of code units that comprise the substring, again excluding the terminating zero. When the substring is no longer needed, the memory should be freed by calling pcre2_substring_free(). The return value from all these functions is zero for success, or a negative error code. If the pattern match failed, the match failure code is returned. If a substring number greater than zero is used af- ter a partial match, PCRE2_ERROR_PARTIAL is returned. Other possible error codes are: PCRE2_ERROR_NOMEMORY The buffer was too small for pcre2_substring_copy_bynumber(), or the attempt to get memory failed for pcre2_substring_get_bynumber(). PCRE2_ERROR_NOSUBSTRING There is no substring with that number in the pattern, that is, the number is greater than the number of capturing parentheses. PCRE2_ERROR_UNAVAILABLE The substring number, though not greater than the number of captures in the pattern, is greater than the number of slots in the ovector, so the substring could not be captured. PCRE2_ERROR_UNSET The substring did not participate in the match. 
For example, if the pattern is (abc)|(def) and the subject is "def", and the ovector con- tains at least two capturing slots, substring number 1 is unset. EXTRACTING A LIST OF ALL CAPTURED SUBSTRINGS int pcre2_substring_list_get(pcre2_match_data *match_data, PCRE2_UCHAR ***listptr, PCRE2_SIZE **lengthsptr); void pcre2_substring_list_free(PCRE2_UCHAR **list); The pcre2_substring_list_get() function extracts all available sub- strings and builds a list of pointers to them. It also (optionally) builds a second list that contains their lengths (in code units), ex- cluding a terminating zero that is added to each of them. All this is done in a single block of memory that is obtained using the same memory allocation function that was used to get the match data block. This function must be called only after a successful match. If called after a partial match, the error code PCRE2_ERROR_PARTIAL is returned. The address of the memory block is returned via listptr, which is also the start of the list of string pointers. The end of the list is marked by a NULL pointer. The address of the list of lengths is returned via lengthsptr. If your strings do not contain binary zeros and you do not therefore need the lengths, you may supply NULL as the lengthsptr argu- ment to disable the creation of a list of lengths. The yield of the function is zero if all went well, or PCRE2_ERROR_NOMEMORY if the mem- ory block could not be obtained. When the list is no longer needed, it should be freed by calling pcre2_substring_list_free(). If this function encounters a substring that is unset, which can happen when capture group number n+1 matches some part of the subject, but group n has not been used at all, it returns an empty string. This can be distinguished from a genuine zero-length substring by inspecting the appropriate offset in the ovector, which contains PCRE2_UNSET for unset substrings, or by calling pcre2_substring_length_bynumber().
EXTRACTING CAPTURED SUBSTRINGS BY NAME int pcre2_substring_number_from_name(const pcre2_code *code, PCRE2_SPTR name); int pcre2_substring_length_byname(pcre2_match_data *match_data, PCRE2_SPTR name, PCRE2_SIZE *length); int pcre2_substring_copy_byname(pcre2_match_data *match_data, PCRE2_SPTR name, PCRE2_UCHAR *buffer, PCRE2_SIZE *bufflen); int pcre2_substring_get_byname(pcre2_match_data *match_data, PCRE2_SPTR name, PCRE2_UCHAR **bufferptr, PCRE2_SIZE *bufflen); void pcre2_substring_free(PCRE2_UCHAR *buffer); To extract a substring by name, you first have to find the associated num- ber. For example, for this pattern: (a+)b(?<xxx>\d+)... the number of the capture group called "xxx" is 2. If the name is known to be unique (PCRE2_DUPNAMES was not set), you can find the number from the name by calling pcre2_substring_number_from_name(). The first argu- ment is the compiled pattern, and the second is the name. The yield of the function is the group number, PCRE2_ERROR_NOSUBSTRING if there is no group with that name, or PCRE2_ERROR_NOUNIQUESUBSTRING if there is more than one group with that name. Given the number, you can extract the substring directly from the ovector, or use one of the "bynumber" functions described above. For convenience, there are also "byname" functions that correspond to the "bynumber" functions, the only difference being that the second ar- gument is a name instead of a number. If PCRE2_DUPNAMES is set and there are duplicate names, these functions scan all the groups with the given name, and return the captured substring from the first named group that is set. If there are no groups with the given name, PCRE2_ERROR_NOSUBSTRING is returned. If all groups with the name have numbers that are greater than the number of slots in the ovector, PCRE2_ERROR_UNAVAILABLE is re- turned. If there is at least one group with a slot in the ovector, but no group is found to be set, PCRE2_ERROR_UNSET is returned.
Warning: If the pattern uses the (?| feature to set up multiple capture groups with the same number, as described in the section on duplicate group numbers in the pcre2pattern page, you cannot use names to distin- guish the different capture groups, because names are not included in the compiled code. The matching process uses only numbers. For this reason, the use of different names for groups with the same number causes an error at compile time. CREATING A NEW STRING WITH SUBSTITUTIONS int pcre2_substitute(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length, PCRE2_SIZE startoffset, uint32_t options, pcre2_match_data *match_data, pcre2_match_context *mcontext, PCRE2_SPTR replacement, PCRE2_SIZE rlength, PCRE2_UCHAR *outputbuffer, PCRE2_SIZE *outlengthptr); This function optionally calls pcre2_match() and then makes a copy of the subject string in outputbuffer, replacing parts that were matched with the replacement string, whose length is supplied in rlength, which can be given as PCRE2_ZERO_TERMINATED for a zero-terminated string. As a special case, if replacement is NULL and rlength is zero, the re- placement is assumed to be an empty string. If rlength is non-zero, an error occurs if replacement is NULL. There is an option (see PCRE2_SUBSTITUTE_REPLACEMENT_ONLY below) to re- turn just the replacement string(s). The default action is to perform just one replacement if the pattern matches, but there is an option that requests multiple replacements (see PCRE2_SUBSTITUTE_GLOBAL be- low). If successful, pcre2_substitute() returns the number of substitutions that were carried out. This may be zero if no match was found, and is never greater than one unless PCRE2_SUBSTITUTE_GLOBAL is set. A nega- tive value is returned if an error is detected. Matches in which a \K item in a lookahead in the pattern causes the match to end before it starts are not supported, and give rise to an error return. 
For global replacements, matches in which \K in a lookbe- hind causes the match to start earlier than the point that was reached in the previous iteration are also not supported. (These cases are only possible if the pattern was compiled with the backwards-compatibility option PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK.) The first seven arguments of pcre2_substitute() are the same as for pcre2_match(), except that the partial matching options are not permit- ted, and match_data may be passed as NULL, in which case a match data block is obtained and freed within this function, using memory manage- ment functions from the match context, if provided, or else those that were used to allocate memory for the compiled code. If match_data is not NULL and PCRE2_SUBSTITUTE_MATCHED is not set, the provided block is used for all calls to pcre2_match(), and its contents afterwards are the result of the final call made internally by pcre2_substitute() to the matching function. For global changes, this will always be a no-match error. The contents of the ovector within the match data block may or may not have been changed. As well as the usual options for pcre2_match(), a number of additional options can be set in the options argument of pcre2_substitute(). One such option is PCRE2_SUBSTITUTE_MATCHED. When this is set, an external match_data block must be provided, and it must have already been used for an external call to pcre2_match() (or pcre2_jit_match()) with the same pattern, subject pointer, effective subject length, start offset, and match option arguments (substitute-specific options can be added to the options argument). If any of these parameters is changed, pcre2_substitute() returns an error. The data in the match_data block (return code, offset vector) is used for the first substitution instead of calling pcre2_match() from within pcre2_substitute(). This allows an application to check for a match before choosing to substitute, without having to repeat the match. 
If the contents of the subject buffer are mutated in between pcre2_match() and a call to pcre2_substitute() with PCRE2_SUBSTI- TUTE_MATCHED, the behaviour is unsafe; in particular, in this case, PCRE2 is unable to ensure that the offsets in the ovector point to the start of characters (with UTF-encoded input). The contents of the externally supplied match data block are not changed when PCRE2_SUBSTITUTE_MATCHED is set, and so the match block is permitted for use in another call using PCRE2_SUBSTITUTE_MATCHED. If PCRE2_SUBSTITUTE_GLOBAL is also set, pcre2_match() is called after the first substitution to check for further matches, but this is done using an internally obtained match data block, thus always leaving the exter- nal block unchanged. The code argument is not used for matching before the first substitu- tion when PCRE2_SUBSTITUTE_MATCHED is set, but it must be provided, even when PCRE2_SUBSTITUTE_GLOBAL is not set, because it contains in- formation such as the UTF setting and the number of capturing parenthe- ses in the pattern. When using PCRE2_SUBSTITUTE_MATCHED, you should not modify the subject string in between the prior call to pcre2_match() and pcre2_substi- tute(), as the substitution assumes that the passed-in ovector is com- patible with the subject string. Although PCRE2 does verify that the subject is a pointer to the same buffer, it cannot in general verify whether the contents of the buffer have changed. For example, if the subject buffer is mutated from one valid UTF-8 string to another valid string, of the same length in code units, the ovector offsets are no longer guaranteed to point to the start of a character. Beware that with PCRE2_SUBSTITUTE_MATCHED in UTF mode, the subject string is not re-scanned for UTF validity when pcre2_substitute() first uses it. The default action of pcre2_substitute() is to return a copy of the subject string with matched substrings replaced.
However, if PCRE2_SUB- STITUTE_REPLACEMENT_ONLY is set, only the replacement substrings are returned. In the global case, multiple replacements are concatenated in the output buffer. Substitution callouts (see below) can be used to separate them if necessary. Partial matching is supported, with limitations: if matching succeeds but with a partial match, then pcre2_substitute returns PCRE2_ER- ROR_PARTIAL. When partial-matching (either of PCRE2_PARTIAL_HARD or PCRE2_PARTIAL_SOFT is passed), then PCRE2_SUBSTITUTE_REPLACEMENT_ONLY must also be set, or else PCRE2_ERROR_BADOPTION is returned. Similarly, certain replacement items ($' and $_) cause PCRE2_ERROR_PARTIALSUBS to be returned when partial-matching, even if a complete match is found. The outlengthptr argument of pcre2_substitute() must point to a vari- able that contains the length, in code units, of the output buffer. If the function is successful, the value is updated to contain the length in code units of the new string, excluding the trailing zero that is automatically added. If the function is not successful, the value set via outlengthptr de- pends on the type of error. For syntax errors in the replacement string, the value is the offset in the replacement string where the er- ror was detected. For other errors, the value is PCRE2_UNSET by de- fault. This includes the case of the output buffer being too small, un- less PCRE2_SUBSTITUTE_OVERFLOW_LENGTH is set. PCRE2_SUBSTITUTE_OVERFLOW_LENGTH changes what happens when the output buffer is too small. The default action is to return PCRE2_ERROR_NOMEM- ORY immediately. If this option is set, however, pcre2_substitute() continues to go through the motions of matching and substituting (with- out, of course, writing anything) in order to compute the size of buffer that is needed, which will include the extra space for the ter- minating NUL. This value is passed back via the outlengthptr variable, with the result of the function still being PCRE2_ERROR_NOMEMORY. 
Passing a buffer size of zero is a permitted way of finding out how much memory is needed for a given substitution. However, this does mean that the entire operation is carried out twice. Depending on the appli- cation, it may be more efficient to allocate a large buffer and free the excess afterwards, instead of using PCRE2_SUBSTITUTE_OVER- FLOW_LENGTH. The replacement string, which is interpreted as a UTF string in UTF mode, is checked for UTF validity unless PCRE2_NO_UTF_CHECK is set. An invalid UTF replacement string causes an immediate return with the rel- evant UTF error code. If PCRE2_SUBSTITUTE_LITERAL is set, the replacement string is not in- terpreted in any way. By default, however, a dollar character is an es- cape character that can specify the insertion of characters from cap- ture groups and names from (*MARK) or other control verbs in the pat- tern. Dollar is the only escape character (backslash is treated as lit- eral). The following forms are recognized: $$ insert a dollar character $n or ${n} insert the contents of group n $0 or $& insert the entire matched substring $` insert the substring that precedes the match $' insert the substring that follows the match $_ insert the entire input string $+ insert the highest-numbered capture group which matched $*MARK or ${*MARK} insert a control verb name Either a group number or a group name can be given for n, for example $2 or $NAME. Curly brackets are required only if the following charac- ter would be interpreted as part of the number or name. The number may be zero to include the entire matched string. For example, if the pat- tern a(b)c is matched with "=abc=" and the replacement string "+$1$0$1+", the result is "=+babcb+=". The JavaScript form $<n>, where the angle brackets are part of the syntax, is also recognized for group names, but not for group numbers or *MARK. $*MARK inserts the name from the last encountered backtracking control verb on the matching path that has a name.
(*MARK) must always include a name, but the other verbs need not. For example, in the case of (*MARK:A)(*PRUNE) the name inserted is "A", but for (*MARK:A)(*PRUNE:B) the relevant name is "B". This facility can be used to perform simple simultaneous substitutions, as this pcre2test example shows: /(*MARK:pear)apple|(*MARK:orange)lemon/g,replace=${*MARK} apple lemon 2: pear orange PCRE2_SUBSTITUTE_GLOBAL causes the function to iterate over the subject string, replacing every matching substring. If this option is not set, only the first matching substring is replaced. The search for matches takes place in the original subject string (that is, previous replace- ments do not affect it). Iteration is implemented by advancing the startoffset value for each search, which is always passed the entire subject string. If an offset limit is set in the match context, search- ing stops when that limit is reached. Because global substitutions apply the pattern repeatedly to the sub- ject string, and always iterate over non-overlapping matches, the sub- stitutions done by pcre2_substitute() do not match and substitute text inside the replacement strings themselves (no recursive/iterative sub- stitution). However, applications can easily implement other alterna- tive replacement strategies, such as iteratively replacing, then match- ing and replacing on the result. The replacement loop inside pcre2_sub- stitute() is simple and can be emulated in client code by allocating a buffer, searching for matches in a loop, and calling pcre2_substitute() with PCRE2_SUBSTITUTE_REPLACEMENT_ONLY and PCRE2_SUBSTITUTE_MATCHED, and without PCRE2_SUBSTITUTE_GLOBAL. You can restrict the effect of a global substitution to a portion of the subject string by setting either or both of startoffset and an off- set limit.
Here is a pcre2test example: /B/g,replace=!,use_offset_limit ABC ABC ABC ABC\=offset=3,offset_limit=12 2: ABC A!C A!C ABC When continuing with global substitutions after matching a substring with zero length, an attempt to find a non-empty match at the same off- set is performed. If this is not successful, the offset is advanced by one character except when CRLF is a valid newline sequence and the next two characters are CR, LF. In this case, the offset is advanced by two characters. PCRE2_SUBSTITUTE_UNKNOWN_UNSET causes references to capture groups that do not appear in the pattern to be treated as unset groups. This option should be used with care, because it means that a typo in a group name or number no longer causes the PCRE2_ERROR_NOSUBSTRING error. PCRE2_SUBSTITUTE_UNSET_EMPTY causes unset capture groups (including un- known groups when PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set) to be treated as empty strings when inserted as described above. If this option is not set, an attempt to insert an unset group causes the PCRE2_ERROR_UN- SET error. This option does not influence the extended substitution syntax described below. PCRE2_SUBSTITUTE_EXTENDED causes extra processing to be applied to the replacement string. Without this option, only the dollar character is special, and only the group insertion forms listed above are valid. When PCRE2_SUBSTITUTE_EXTENDED is set, several things change: Firstly, backslash in a replacement string is interpreted as an escape character. The usual forms such as \x{ddd} can be used to specify par- ticular character codes, and backslash followed by any non-alphanumeric character quotes that character. Extended quoting can be coded using \Q...\E, exactly as in pattern strings. The escapes \b and \v are in- terpreted as the characters backspace and vertical tab, respectively. The interpretation of backslash followed by one or more digits is the same as in a pattern, which in Perl has some ambiguities. 
Details are given in the pcre2pattern page. The Python form \g<n>, where the angle brackets are part of the syntax and n is either a group name or number, is recognized as an alternative way of inserting the contents of a group, for example \g<3>. There are also four escape sequences for forcing the case of inserted letters. Case forcing applies to all inserted characters, including those from capture groups and letters within \Q...\E quoted sequences. The insertion mechanism has three states: no case forcing, force upper case, and force lower case. The escape sequences change the current state: \U and \L change to upper or lower case forcing, respectively, and \E (when not terminating a \Q quoted sequence) reverts to no case forcing. The sequences \u and \l force the next character (if it is a letter) to upper or lower case, respectively, and then the state auto- matically reverts to no case forcing. However, if \u is immediately followed by \L or \l is immediately fol- lowed by \U, the next character's case is forced by the first escape sequence, and subsequent characters by the second. This provides a "ti- tle casing" facility that can be applied to group captures. For exam- ple, if group 1 has captured "heLLo", the replacement string "\u\L$1" becomes "Hello". If either PCRE2_UTF or PCRE2_UCP was set when the pattern was compiled, Unicode properties are used for case forcing characters whose code points are greater than 127. However, only simple case folding, as de- termined by the Unicode file CaseFolding.txt is supported. PCRE2 does not support language-specific special casing rules such as using dif- ferent lower case Greek sigmas in the middle and ends of words (as de- fined in the Unicode file SpecialCasing.txt). Note that case forcing sequences such as \U...\E do not nest. For exam- ple, the result of processing "\Uaa\LBB\Ecc\E" is "AAbbcc"; the final \E has no effect. 
Note also that the PCRE2_ALT_BSUX and PCRE2_EX- TRA_ALT_BSUX options do not apply to replacement strings. The final effect of setting PCRE2_SUBSTITUTE_EXTENDED is to add more flexibility to capture group substitution. The syntax is similar to that used by Bash: ${n:-string} ${n:+string1:string2} As in the simple case, n may be a group number or a name. The first form specifies a default value. If group n is set, its value is in- serted; if not, the string is expanded and the result inserted. The second form specifies strings that are expanded and inserted when group n is set or unset, respectively. The first form is just a convenient shorthand for ${n:+${n}:string} Backslash can be used to escape colons and closing curly brackets in the replacement strings. A change of the case forcing state within a replacement string remains in force afterwards, as shown in this pcre2test example: /(some)?(body)/substitute_extended,replace=${1:+\U:\L}HeLLo body 1: hello somebody 1: HELLO The PCRE2_SUBSTITUTE_UNSET_EMPTY option does not affect these extended substitutions. However, PCRE2_SUBSTITUTE_UNKNOWN_UNSET does cause un- known groups in the extended syntax forms to be treated as unset. If PCRE2_SUBSTITUTE_LITERAL is set, PCRE2_SUBSTITUTE_UNKNOWN_UNSET, PCRE2_SUBSTITUTE_UNSET_EMPTY, and PCRE2_SUBSTITUTE_EXTENDED are irrele- vant and are ignored. Substitution errors In the event of an error, pcre2_substitute() returns a negative error code. Except for PCRE2_ERROR_NOMATCH (which is never returned), errors from pcre2_match() are passed straight back. PCRE2_ERROR_NOSUBSTRING is returned for a non-existent substring inser- tion, unless PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set. PCRE2_ERROR_UNSET is returned for an unset substring insertion (includ- ing an unknown substring when PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set) when the simple (non-extended) syntax is used and PCRE2_SUBSTITUTE_UN- SET_EMPTY is not set. PCRE2_ERROR_NOMEMORY is returned if the output buffer is not big enough. 
If the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option is set, the size of buffer that is needed is returned via outlengthptr. Note that this does not happen by default. PCRE2_ERROR_NULL is returned if PCRE2_SUBSTITUTE_MATCHED is set but the match_data argument is NULL or if the subject or replacement arguments are NULL. For backward compatibility reasons an exception is made for the replacement argument if the rlength argument is also 0. PCRE2_ERROR_BADREPLACEMENT is used for miscellaneous syntax errors in the replacement string, with more particular errors being PCRE2_ER- ROR_BADREPESCAPE (invalid escape sequence), PCRE2_ERROR_REPMISSINGBRACE (closing curly bracket not found), PCRE2_ERROR_BADSUBSTITUTION (syntax error in extended group substitution), and PCRE2_ERROR_BADSUBSPATTERN (the pattern match ended before it started or the match started earlier than the current position in the subject, which can happen if \K is used in a lookaround assertion). As for all PCRE2 errors, a text message that describes the error can be obtained by calling the pcre2_get_error_message() function (see "Ob- taining a textual error message" above). Substitution callouts int pcre2_set_substitute_callout(pcre2_match_context *mcontext, int (*callout_function)(pcre2_substitute_callout_block *, void *), void *callout_data); The pcre2_set_substitute_callout() function can be used to specify a callout function for pcre2_substitute(). This information is passed in a match context. The callout function is called after each substitution has been processed, but it can cause the replacement not to happen. The callout function is not called for simulated substitutions that happen as a result of the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option. In this mode, when substitution processing exceeds the buffer space pro- vided by the caller, processing continues by counting code units. The simulation is unable to populate the callout block, and so the simula- tion is pessimistic about the required buffer size. 
Whichever is larger of accepted or rejected substitution is reported as the required size. Therefore, the returned buffer length may be an overestimate (without a substitution callout, it is normally an exact measurement). The first argument of the callout function is a pointer to a substitute callout block structure, which contains the following fields, not nec- essarily in this order: uint32_t version; uint32_t subscount; PCRE2_SPTR input; PCRE2_SPTR output; PCRE2_SIZE *ovector; uint32_t oveccount; PCRE2_SIZE output_offsets[2]; The version field contains the version number of the block format. The current version is 0. The version number will increase in future if more fields are added, but the intention is never to remove any of the existing fields. The subscount field is the number of the current match. It is 1 for the first callout, 2 for the second, and so on. The input and output point- ers are copies of the values passed to pcre2_substitute(). The ovector field points to the ovector, which contains the result of the most recent match. The oveccount field contains the number of pairs that are set in the ovector, and is always greater than zero. The output_offsets vector contains the offsets of the replacement in the output string. This has already been processed for dollar and (if requested) backslash substitutions as described above. The second argument of the callout function is the value passed as callout_data when the function was registered. The value returned by the callout function is interpreted as follows: If the value is zero, the replacement is accepted, and, if PCRE2_SUB- STITUTE_GLOBAL is set, processing continues with a search for the next match. If the value is not zero, the current replacement is not ac- cepted. If the value is greater than zero, processing continues when PCRE2_SUBSTITUTE_GLOBAL is set. 
Otherwise (the value is less than zero or PCRE2_SUBSTITUTE_GLOBAL is not set), the rest of the input is copied to the output and the call to pcre2_substitute() exits, returning the number of matches so far. Substitution case callouts int pcre2_set_substitute_case_callout(pcre2_match_context *mcontext, PCRE2_SIZE (*callout_function)(PCRE2_SPTR, PCRE2_SIZE, PCRE2_UCHAR *, PCRE2_SIZE, int, void *), void *callout_data); The pcre2_set_substitute_case_callout() function can be used to specify a callout function for pcre2_substitute() to use when performing case transformations. This does not affect any case insensitivity behaviour when performing a match, but only the user-visible transformations per- formed when processing a substitution such as: pcre2_substitute(..., "\\U$1", ...) The default case transformations applied by PCRE2 are reasonably com- plete, and, in UTF or UCP mode, perform the simple locale-invariant case transformations as specified by Unicode. This is suitable for the internal (invisible) case-equivalence procedures used during pattern matching, but an application may wish to use more sophisticated locale- aware processing for the user-visible substitution transformations. One example implementation of the callout_function using the ICU li- brary would be: PCRE2_SIZE icu_case_callout( PCRE2_SPTR input, PCRE2_SIZE input_len, PCRE2_UCHAR *output, PCRE2_SIZE output_cap, int to_case, void *data_ptr) { UErrorCode err = U_ZERO_ERROR; int32_t r = to_case == PCRE2_SUBSTITUTE_CASE_LOWER ? u_strToLower(output, output_cap, input, input_len, NULL, &err) : to_case == PCRE2_SUBSTITUTE_CASE_UPPER ? u_strToUpper(output, output_cap, input, input_len, NULL, &err) : u_strToTitle(output, output_cap, input, input_len, &first_char_only, NULL, &err); if (U_FAILURE(err)) return (~(PCRE2_SIZE)0); return r; } The first and second arguments of the case callout function are the Unicode string to transform. The third and fourth arguments are the output buffer and its capacity. 
The fifth is one of the constants PCRE2_SUBSTITUTE_CASE_LOWER, PCRE2_SUBSTITUTE_CASE_UPPER, or PCRE2_SUBSTITUTE_CASE_TITLE_FIRST. PCRE2_SUBSTITUTE_CASE_LOWER and PCRE2_SUBSTITUTE_CASE_UPPER are passed to the callout to indicate that the case of the entire callout input should be case-transformed. PCRE2_SUBSTITUTE_CASE_TITLE_FIRST is passed to indicate that only the first character or glyph should be trans- formed to Unicode titlecase and the rest to Unicode lowercase (note that titlecasing sometimes uses Unicode properties to titlecase each word in a string; but PCRE2 is requesting that only the single leading character is to be titlecased). The sixth argument is the callout_data supplied to pcre2_set_substi- tute_case_callout(). The resulting string in the destination buffer may be larger or smaller than the input, if the casing rules merge or split characters. The re- turn value is the length required for the output string. If a buffer of sufficient size was provided to the callout, then the result must be written to the buffer and the number of code units returned. If the re- sult does not fit in the provided buffer, then the required capacity must be returned and PCRE2 will not make use of the output buffer. PCRE2 provides input and output buffers which overlap, so the callout must support this by suitable internal buffering. Alternatively, if the callout wishes to indicate an error, then it may return (~(PCRE2_SIZE)0). In this case pcre2_substitute() will immedi- ately fail with error PCRE2_ERROR_REPLACECASE. When a case callout is combined with the PCRE2_SUBSTITUTE_OVER- FLOW_LENGTH option, there are situations when pcre2_substitute() will return an underestimate of the required buffer size. If you call pcre2_substitute() once with PCRE2_SUBSTITUTE_OVERFLOW_LENGTH, and the input buffer is too small for the replacement string to be constructed, then instead of calling the case callout, pcre2_substitute() will make an estimate of the required buffer size. 
The second call should also pass PCRE2_SUBSTITUTE_OVERFLOW_LENGTH, because that second call is not guaranteed to succeed either, if the case callout requires more buffer space than expected. The caller must make repeated attempts in a loop. DUPLICATE CAPTURE GROUP NAMES int pcre2_substring_nametable_scan(const pcre2_code *code, PCRE2_SPTR name, PCRE2_SPTR *first, PCRE2_SPTR *last); When a pattern is compiled with the PCRE2_DUPNAMES option, names for capture groups are not required to be unique. Duplicate names are al- ways allowed for groups with the same number, created by using the (?| feature. Indeed, if such groups are named, they are required to use the same names. Normally, patterns that use duplicate names are such that in any one match, only one of each set of identically-named groups participates. An example is shown in the pcre2pattern documentation. When duplicates are present, pcre2_substring_copy_byname() and pcre2_substring_get_byname() return the first substring corresponding to the given name that is set. Only if none are set is PCRE2_ERROR_UN- SET returned. The pcre2_substring_number_from_name() function re- turns the error PCRE2_ERROR_NOUNIQUESUBSTRING when there are duplicate names. If you want to get full details of all captured substrings for a given name, you must use the pcre2_substring_nametable_scan() function. The first argument is the compiled pattern, and the second is the name. If the third and fourth arguments are NULL, the function returns a group number for a unique name, or PCRE2_ERROR_NOUNIQUESUBSTRING otherwise. When the third and fourth arguments are not NULL, they must be pointers to variables that are updated by the function. After it has run, they point to the first and last entries in the name-to-number table for the given name, and the function returns the length of each entry in code units. In both cases, PCRE2_ERROR_NOSUBSTRING is returned if there are no entries for the given name. 
The format of the name table is described above in the section entitled Information about a pattern. Given all the relevant entries for the name, you can extract each of their numbers, and hence the captured data. FINDING ALL POSSIBLE MATCHES AT ONE POSITION The traditional matching function uses a similar algorithm to Perl, which stops when it finds the first match at a given point in the sub- ject. If you want to find all possible matches, or the longest possible match at a given position, consider using the alternative matching function (see below) instead. If you cannot use the alternative func- tion, you can kludge it up by making use of the callout facility, which is described in the pcre2callout documentation. What you have to do is to insert a callout right at the end of the pat- tern. When your callout function is called, extract and save the cur- rent matched substring. Then return 1, which forces pcre2_match() to backtrack and try other alternatives. Ultimately, when it runs out of matches, pcre2_match() will yield PCRE2_ERROR_NOMATCH. MATCHING A PATTERN: THE ALTERNATIVE FUNCTION int pcre2_dfa_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length, PCRE2_SIZE startoffset, uint32_t options, pcre2_match_data *match_data, pcre2_match_context *mcontext, int *workspace, PCRE2_SIZE wscount); The function pcre2_dfa_match() is called to match a subject string against a compiled pattern, using a matching algorithm that scans the subject string just once (not counting lookaround assertions), and does not backtrack (except when processing lookaround assertions). This has different characteristics to the normal algorithm, and is not compati- ble with Perl. Some of the features of PCRE2 patterns are not sup- ported. Nevertheless, there are times when this kind of matching can be useful. For a discussion of the two matching algorithms, and a list of features that pcre2_dfa_match() does not support, see the pcre2matching documentation. 
The arguments for the pcre2_dfa_match() function are the same as for pcre2_match(), plus two extras. The ovector within the match data block is used in a different way, and this is described below. The other com- mon arguments are used in the same way as for pcre2_match(), so their description is not repeated here. The two additional arguments provide workspace for the function. The workspace vector should contain at least 20 elements. It is used for keeping track of multiple paths through the pattern tree. More work- space is needed for patterns and subjects where there are a lot of po- tential matches. Here is an example of a simple call to pcre2_dfa_match(): int wspace[20]; pcre2_match_data *md = pcre2_match_data_create(4, NULL); int rc = pcre2_dfa_match( re, /* result of pcre2_compile() */ "some string", /* the subject string */ 11, /* the length of the subject string */ 0, /* start at offset 0 in the subject */ 0, /* default options */ md, /* the match data block */ NULL, /* a match context; NULL means use defaults */ wspace, /* working space vector */ 20); /* number of elements (NOT size in bytes) */ Option bits for pcre2_dfa_match() The unused bits of the options argument for pcre2_dfa_match() must be zero. The only bits that may be set are PCRE2_ANCHORED, PCRE2_COPY_MATCHED_SUBJECT, PCRE2_ENDANCHORED, PCRE2_NOTBOL, PCRE2_NO- TEOL, PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART, PCRE2_NO_UTF_CHECK, PCRE2_PARTIAL_HARD, PCRE2_PARTIAL_SOFT, PCRE2_DFA_SHORTEST, and PCRE2_DFA_RESTART. All but the last four of these are exactly the same as for pcre2_match(), so their description is not repeated here. PCRE2_PARTIAL_HARD PCRE2_PARTIAL_SOFT These have the same general effect as they do for pcre2_match(), but the details are slightly different. When PCRE2_PARTIAL_HARD is set for pcre2_dfa_match(), it returns PCRE2_ERROR_PARTIAL if the end of the subject is reached and there is still at least one matching possibility that requires additional characters. 
This happens even if some complete matches have already been found. When PCRE2_PARTIAL_SOFT is set, the return code PCRE2_ERROR_NOMATCH is converted into PCRE2_ERROR_PARTIAL if the end of the subject is reached, there have been no complete matches, but there is still at least one matching possibility. The por- tion of the string that was inspected when the longest partial match was found is set as the first matching string in both cases. There is a more detailed discussion of partial and multi-segment matching, with examples, in the pcre2partial documentation. PCRE2_DFA_SHORTEST Setting the PCRE2_DFA_SHORTEST option causes the matching algorithm to stop as soon as it has found one match. Because of the way the alterna- tive algorithm works, this is necessarily the shortest possible match at the first possible matching point in the subject string. PCRE2_DFA_RESTART When pcre2_dfa_match() returns a partial match, it is possible to call it again, with additional subject characters, and have it continue with the same match. The PCRE2_DFA_RESTART option requests this action; when it is set, the workspace and wscount options must reference the same vector as before because data about the match so far is left in them after a partial match. There is more discussion of this facility in the pcre2partial documentation. Successful returns from pcre2_dfa_match() When pcre2_dfa_match() succeeds, it may have matched more than one sub- string in the subject. Note, however, that all the matches from one run of the function start at the same point in the subject. The shorter matches are all initial substrings of the longer matches. 
For example, if the pattern <.*> is matched against the string This <something> <something else> <something further> no more the three matched strings are <something> <something else> <something further> <something> <something else> <something> On success, the yield of the function is a number greater than zero, which is the number of matched substrings. 
The offsets of the sub- strings are returned in the ovector, and can be extracted by number in the same way as for pcre2_match(), but the numbers bear no relation to any capture groups that may exist in the pattern, because DFA matching does not support capturing. Calls to the convenience functions that extract substrings by name re- turn the error PCRE2_ERROR_DFA_UFUNC (unsupported function) if used af- ter a DFA match. The convenience functions that extract substrings by number never return PCRE2_ERROR_NOSUBSTRING. The matched strings are stored in the ovector in reverse order of length; that is, the longest matching string is first. If there were too many matches to fit into the ovector, the yield of the function is zero, and the vector is filled with the longest matches. NOTE: PCRE2's "auto-possessification" optimization usually applies to character repeats at the end of a pattern (as well as internally). For example, the pattern "a\d+" is compiled as if it were "a\d++". For DFA matching, this means that only one possible match is found. If you re- ally do want multiple matches in such cases, either use an ungreedy re- peat such as "a\d+?" or set the PCRE2_NO_AUTO_POSSESS option when com- piling. Error returns from pcre2_dfa_match() The pcre2_dfa_match() function returns a negative number when it fails. Many of the errors are the same as for pcre2_match(), as described above. There are in addition the following errors that are specific to pcre2_dfa_match(): PCRE2_ERROR_DFA_UITEM This return is given if pcre2_dfa_match() encounters an item in the pattern that it does not support, for instance, the use of \C in a UTF mode or a backreference. PCRE2_ERROR_DFA_UCOND This return is given if pcre2_dfa_match() encounters a condition item that uses a backreference for the condition, or a test for recursion in a specific capture group. These are not supported. 
PCRE2_ERROR_DFA_UINVALID_UTF This return is given if pcre2_dfa_match() is called for a pattern that was compiled with PCRE2_MATCH_INVALID_UTF. This is not supported for DFA matching. PCRE2_ERROR_DFA_WSSIZE This return is given if pcre2_dfa_match() runs out of space in the workspace vector. PCRE2_ERROR_DFA_RECURSE When a recursion or subroutine call is processed, the matching function calls itself recursively, using private memory for the ovector and workspace. This error is given if the internal ovector is not large enough. This should be extremely rare, as a vector of size 1000 is used. PCRE2_ERROR_DFA_BADRESTART When pcre2_dfa_match() is called with the PCRE2_DFA_RESTART option, some plausibility checks are made on the contents of the workspace, which should contain data about the previous partial match. If any of these checks fail, this error is given. SEE ALSO pcre2build(3), pcre2callout(3), pcre2demo(3), pcre2matching(3), pcre2partial(3), pcre2posix(3), pcre2sample(3), pcre2unicode(3). AUTHOR Philip Hazel Retired from University Computing Service Cambridge, England. REVISION Last updated: 29 October 2025 Copyright (c) 1997-2024 University of Cambridge. PCRE2 10.48-DEV 29 October 2025 PCRE2API(3) ------------------------------------------------------------------------------ PCRE2BUILD(3) Library Functions Manual PCRE2BUILD(3) NAME PCRE2 - Perl-compatible regular expressions (revised API) BUILDING PCRE2 PCRE2 is distributed with a configure script that can be used to build the library in Unix-like environments using the Autotools applications. Also in the distribution are files to support building using CMake in- stead of configure. The text file README contains general information about building with Autotools (some of which is repeated below), and also has some comments about building on various operating systems. The files in the vms directory support building under OpenVMS. 
There is a lot more information about building PCRE2 without using Autotools (in- cluding information about using CMake and building "by hand") in the text file called NON-AUTOTOOLS-BUILD. You should consult this file as well as the README file if you are building in a non-Unix-like environ- ment. PCRE2 BUILD-TIME OPTIONS The rest of this document describes the optional features of PCRE2 that can be selected when the library is compiled. It assumes use of the configure script, where the optional features are selected or dese- lected by providing options to configure before running the make com- mand. However, the same options can be selected in both Unix-like and non-Unix-like environments if you are using CMake instead of configure to build PCRE2. If you are not using Autotools or CMake, option selection can be done by editing the config.h file, or by passing parameter settings to the compiler, as described in NON-AUTOTOOLS-BUILD. The complete list of options for configure (which includes the standard ones such as the selection of the installation directory) can be ob- tained by running ./configure --help The following sections include descriptions of "on/off" options whose names begin with --enable or --disable. Because of the way that config- ure works, --enable and --disable always come in pairs, so the comple- mentary option always exists as well, but as it specifies the default, it is not described. Options that specify values have names that start with --with. At the end of a configure run, a summary of the configura- tion is output. BUILDING 8-BIT, 16-BIT AND 32-BIT LIBRARIES By default, a library called libpcre2-8 is built, containing functions that take string arguments contained in arrays of bytes, interpreted either as single-byte characters, or UTF-8 strings. You can also build two other libraries, called libpcre2-16 and libpcre2-32, which process strings that are contained in arrays of 16-bit and 32-bit code units, respectively. 
These can be interpreted either as single-unit characters or UTF-16/UTF-32 strings. To build these additional libraries, add one or both of the following to the configure command: --enable-pcre2-16 --enable-pcre2-32 If you do not want the 8-bit library, add --disable-pcre2-8 as well. At least one of the three libraries must be built. Note that the POSIX wrapper is for the 8-bit library only, and that pcre2grep is an 8-bit program. Neither of these is built if you select only the 16-bit or 32-bit libraries. BUILDING SHARED AND STATIC LIBRARIES The Autotools PCRE2 building process uses libtool to build both shared and static libraries by default. You can suppress an unwanted library by adding one of --disable-shared --disable-static to the configure command. Setting --disable-shared ensures that PCRE2 libraries are built as static libraries. The binaries that are then created as part of the build process (for example, pcre2test and pcre2grep) are linked statically with one or more PCRE2 libraries, but may also be dynamically linked with other libraries such as libc. If you want these binaries to be fully statically linked, you can set LD- FLAGS like this: LDFLAGS=--static ./configure --disable-shared Note the two hyphens in --static. Of course, this works only if static versions of all the relevant libraries are available for linking. UNICODE AND UTF SUPPORT By default, PCRE2 is built with support for Unicode and UTF character strings. To build it without Unicode support, add --disable-unicode to the configure command. This setting applies to all three libraries. It is not possible to build one library with Unicode support and an- other without in the same configuration. Of itself, Unicode support does not make PCRE2 treat strings as UTF-8, UTF-16 or UTF-32. To do that, applications that use the library can set the PCRE2_UTF option when they call pcre2_compile() to compile a pat- tern. 
Alternatively, patterns may be started with (*UTF) unless the application has locked this out by setting PCRE2_NEVER_UTF. UTF support allows the libraries to process character code points up to 0x10ffff in the strings that they handle. Unicode support also gives access to the Unicode properties of characters, using pattern escapes such as \P, \p, and \X. Only the general category properties such as Lu and Nd, script names, and some bi-directional and binary properties are supported. Details are given in the pcre2pattern documentation. Pattern escapes such as \d and \w do not by default make use of Unicode properties. The application can request that they do by setting the PCRE2_UCP option. Unless the application has set PCRE2_NEVER_UCP, a pattern may also request this by starting with (*UCP). DISABLING THE USE OF \C The \C escape sequence, which matches a single code unit, even in a UTF mode, can cause unpredictable behaviour because it may leave the cur- rent matching point in the middle of a multi-code-unit character. The application can lock it out by setting the PCRE2_NEVER_BACKSLASH_C op- tion when calling pcre2_compile(). There is also a build-time option --enable-never-backslash-C (note the upper case C) which locks out the use of \C entirely. JUST-IN-TIME COMPILER SUPPORT Just-in-time (JIT) compiler support is included in the build by speci- fying --enable-jit This support is available only for certain hardware architectures. If this option is set for an unsupported architecture, a building error occurs. If in doubt, use --enable-jit=auto which enables JIT only if the current hardware is supported. You can check if JIT is enabled in the configuration summary that is output at the end of a configure run. If you are enabling JIT under SELinux you may also want to add --enable-jit-sealloc which enables the use of an execmem allocator in JIT that is compatible with SELinux. This has no effect if JIT is not enabled. 
See the pcre2jit documentation for a discussion of JIT usage. When JIT support is enabled, pcre2grep automatically makes use of it, unless you add --disable-pcre2grep-jit to the configure command. NEWLINE RECOGNITION By default, PCRE2 interprets the linefeed (LF) character as indicating the end of a line. This is the normal newline character on Unix-like systems. You can compile PCRE2 to use carriage return (CR) instead, by adding --enable-newline-is-cr to the configure command. There is also an --enable-newline-is-lf op- tion, which explicitly specifies linefeed as the newline character. Alternatively, you can specify that line endings are to be indicated by the two-character sequence CRLF (CR immediately followed by LF). If you want this, add --enable-newline-is-crlf to the configure command. There is a fourth option, specified by --enable-newline-is-anycrlf which causes PCRE2 to recognize any of the three sequences CR, LF, or CRLF as indicating a line ending. A fifth option, specified by --enable-newline-is-any causes PCRE2 to recognize any Unicode newline sequence. The Unicode newline sequences are the three just mentioned, plus the single charac- ters VT (vertical tab, U+000B), FF (form feed, U+000C), NEL (next line, U+0085), LS (line separator, U+2028), and PS (paragraph separator, U+2029). The final option is --enable-newline-is-nul which causes NUL (binary zero) to be set as the default line-ending character. Whatever default line ending convention is selected when PCRE2 is built can be overridden by applications that use the library. At build time it is recommended to use the standard for your operating system. WHAT \R MATCHES By default, the sequence \R in a pattern matches any Unicode newline sequence, independently of what has been selected as the line ending sequence. If you specify --enable-bsr-anycrlf the default is changed so that \R matches only CR, LF, or CRLF. 
What- ever is selected when PCRE2 is built can be overridden by applications that use the library. HANDLING VERY LARGE PATTERNS Within a compiled pattern, offset values are used to point from one part to another (for example, from an opening parenthesis to an alter- nation metacharacter). By default, in the 8-bit and 16-bit libraries, two-byte values are used for these offsets, leading to a maximum size for a compiled pattern of around 64 thousand code units. This is suffi- cient to handle all but the most gigantic patterns. Nevertheless, some people do want to process truly enormous patterns, so it is possible to compile PCRE2 to use three-byte or four-byte offsets by adding a set- ting such as --with-link-size=3 to the configure command. The value given must be 2, 3, or 4. For the 16-bit library, a value of 3 is rounded up to 4. In these libraries, using longer offsets slows down the operation of PCRE2 because it has to load additional data when handling them. For the 32-bit library the value is always 4 and cannot be overridden; the value of --with-link- size is ignored. LIMITING PCRE2 RESOURCE USAGE The pcre2_match() function increments a counter each time it goes round its main loop. Putting a limit on this counter controls the amount of computing resource used by a single call to pcre2_match(). The limit can be changed at run time, as described in the pcre2api documentation. The default is 10 million, but this can be changed by adding a setting such as --with-match-limit=500000 to the configure command. This setting also applies to the pcre2_dfa_match() matching function, and to JIT matching (though the counting is done differently). The pcre2_match() function uses heap memory to record backtracking points. The more nested backtracking points there are (that is, the deeper the search tree), the more memory is needed. There is an upper limit, specified in kibibytes (units of 1024 bytes). 
This limit can be changed at run time, as described in the pcre2api documentation. The default limit (in effect unlimited) is 20 million. You can change this by a setting such as --with-heap-limit=500 which limits the amount of heap to 500 KiB. This limit applies only to interpretive matching in pcre2_match() and pcre2_dfa_match(), which may also use the heap for internal workspace when processing complicated patterns. This limit does not apply when JIT (which has its own memory arrangements) is used. You can also explicitly limit the depth of nested backtracking in the pcre2_match() interpreter. This limit defaults to the value that is set for --with-match-limit. You can set a lower default limit by adding, for example, --with-match-limit-depth=10000 to the configure command. This value can be overridden at run time. This depth limit indirectly limits the amount of heap memory that is used, but because the size of each backtracking "frame" depends on the number of capturing parentheses in a pattern, the amount of heap that is used before the limit is reached varies from pattern to pattern. This limit was more useful in versions before 10.30, where function re- cursion was used for backtracking. As well as applying to pcre2_match(), the depth limit also controls the depth of recursive function calls in pcre2_dfa_match(). These are used for lookaround assertions, atomic groups, and recursion within pat- terns. The limit does not apply to JIT matching. LIMITING VARIABLE-LENGTH LOOKBEHIND ASSERTIONS Lookbehind assertions in which one or more branches can match a vari- able number of characters are supported only if there is a maximum matching length for each top-level branch. There is a limit to this maximum that defaults to 255 characters. You can alter this default by a setting such as --with-max-varlookbehind=100 The limit can be changed at runtime by calling pcre2_set_max_varlookbe- hind(). 
Lookbehind assertions in which every branch matches a fixed number of characters (not necessarily all the same) are not constrained by this limit. CREATING CHARACTER TABLES AT BUILD TIME PCRE2 uses fixed tables for processing characters whose code points are less than 256. By default, PCRE2 is built with a set of tables that are distributed in the file src/pcre2_chartables.c.dist. These tables are for ASCII codes only. If you add --enable-rebuild-chartables to the configure command, the distributed tables are no longer used. Instead, a program called pcre2_dftables is compiled and run. This out- puts the source for a new set of tables, created in the default locale of your C run-time system. This method of replacing the tables does not work if you are cross compiling, because pcre2_dftables needs to be run on the local host and therefore not compiled with the cross compiler. If you need to create alternative tables when cross compiling, you will have to do so "by hand". There may also be other reasons for creating tables manually. To cause pcre2_dftables to be built on the local host, run a normal compiling command, and then run the program with the output file as its argument, for example: cc src/pcre2_dftables.c -o pcre2_dftables ./pcre2_dftables src/pcre2_chartables.c This builds the tables in the default locale of the local host. If you want to specify a locale, you must use the -L option: LC_ALL=fr_FR ./pcre2_dftables -L src/pcre2_chartables.c You can also specify -b (with or without -L). This causes the tables to be written in binary instead of as source code. A set of binary tables can be loaded into memory by an application and passed to pcre2_com- pile() in the same way as tables created by calling pcre2_maketables(). The tables are just a string of bytes, independent of hardware charac- teristics such as endianness. This means they can be bundled with an application that runs in different environments, to ensure consistent behaviour.
USING EBCDIC CODE PCRE2 assumes by default that it will run in an environment where the character code is ASCII or Unicode, which is a superset of ASCII. This is the case for most computer operating systems. PCRE2 can, however, be compiled to run in an 8-bit EBCDIC environment by adding --enable-ebcdic --disable-unicode to the configure command. You should only use it if you know that you are in an EBCDIC environment (for example, an IBM mainframe operating system). This setting implies --enable-rebuild-chartables, in order to ensure that you have the correct default character tables for your system's codepage. There is an exception when you set --enable-ebcdic-ignoring- compiler (see below), which allows using a default set of EBCDIC 1047 character tables rather than forcing use of --enable-rebuild-charta- bles. It is not supported to enable both EBCDIC input and either ASCII or UTF-8/16/32 in the same build of the library. When PCRE2 is built with EBCDIC support, it always operates in EBCDIC, and consequently --en- able-unicode and --enable-ebcdic are mutually exclusive. The EBCDIC character that corresponds to an ASCII LF is assumed to have the value 0x15 by default. However, in some EBCDIC environments, 0x25 is used. In such an environment you should use --enable-ebcdic-nl25 (which implies --enable-ebcdic). The EBCDIC character for CR has the same value as in ASCII, namely, 0x0d. Whichever of 0x15 and 0x25 is not chosen as LF is made to correspond to the Unicode NEL character (which, in Unicode, is 0x85). The options that select newline behaviour, such as --enable-newline-is- cr, and equivalent run-time options, refer to these character values in an EBCDIC environment. On systems requiring an EBCDIC build of PCRE2, the compiler should be set to use the correct codepage, so that C character literals such as 'z' use the correct numeric value for whichever EBCDIC codepage is in use. (PCRE2 cannot support multiple EBCDIC codepages dynamically.)
How- ever, if this is not possible, then you can use --enable-ebcdic-ignoring-compiler in order to disregard the compiler's codepage, and instead force PCRE2 to use numeric constants corresponding to the EBCDIC 1047 codepage. This can be used to build (or test) EBCDIC support on an ASCII/UTF-8 system such as Linux. PCRE2GREP SUPPORT FOR EXTERNAL SCRIPTS By default pcre2grep supports the use of callouts with string arguments within the patterns it is matching. There are two kinds: one that gen- erates output using local code, and another that calls an external pro- gram or script. If --disable-pcre2grep-callout-fork is added to the configure command, only the first kind of callout is supported; if --disable-pcre2grep-callout is used, all callouts are completely ig- nored. For more details of pcre2grep callouts, see the pcre2grep docu- mentation. PCRE2GREP OPTIONS FOR COMPRESSED FILE SUPPORT By default, pcre2grep reads all files as plain text. You can build it so that it recognizes files whose names end in .gz or .bz2, and reads them with libz or libbz2, respectively, by adding one or both of --enable-pcre2grep-libz --enable-pcre2grep-libbz2 to the configure command. These options naturally require that the rel- evant libraries are installed on your system. Configuration will fail if they are not. PCRE2GREP BUFFER SIZE pcre2grep uses an internal buffer to hold a "window" on the file it is scanning, in order to be able to output "before" and "after" lines when it finds a match. The default starting size of the buffer is 20KiB. The buffer itself is three times this size, but because of the way it is used for holding "before" lines, the longest line that is guaranteed to be processable is the notional buffer size. If a longer line is encoun- tered, pcre2grep automatically expands the buffer, up to a specified maximum size, whose default is 1MiB or the starting size, whichever is the larger.
You can change the default parameter values by adding, for example, --with-pcre2grep-bufsize=51200 --with-pcre2grep-max-bufsize=2097152 to the configure command. The caller of pcre2grep can override these values by using --buffer-size and --max-buffer-size on the command line. PCRE2TEST OPTION FOR LIBREADLINE SUPPORT If you add one of --enable-pcre2test-libreadline --enable-pcre2test-libedit to the configure command, pcre2test is linked with the libreadline or libedit library, respectively, and when its input is from a terminal, it reads it using the readline() function. This provides line-editing and history facilities. Note that libreadline is GPL-licensed, so if you distribute a binary of pcre2test linked in this way, there may be licensing issues. These can be avoided by linking instead with libedit, which has a BSD licence. Setting --enable-pcre2test-libreadline causes the -lreadline option to be added to the pcre2test build. In many operating environments with a system-installed readline library this is sufficient. However, in some environments (e.g. if an unmodified distribution version of readline is in use), some extra configuration may be necessary. The INSTALL file for libreadline says this: "Readline uses the termcap functions, but does not link with the termcap or curses library itself, allowing applications which link with readline the to choose an appropriate library." If your environment has not been set up so that an appropriate library is automatically included, you may need to add something like LIBS="-lncurses" immediately before the configure command. INCLUDING DEBUGGING CODE If you add --enable-debug to the configure command, additional debugging code is included in the build. This feature is intended for use by the PCRE2 maintainers. DEBUGGING WITH VALGRIND SUPPORT If you add --enable-valgrind to the configure command, PCRE2 will use valgrind annotations to mark certain memory regions as unaddressable.
This allows it to detect in- valid memory accesses, and is mostly useful for debugging PCRE2 itself. CODE COVERAGE REPORTING If your C compiler is gcc, you can build a version of PCRE2 that can generate a code coverage report for its test suite. To enable this, you must install lcov version 1.6 or above. Then specify --enable-coverage to the configure command and build PCRE2 in the usual way. Note that using ccache (a caching C compiler) is incompatible with code coverage reporting. If you have configured ccache to run automatically on your system, you must set the environment variable CCACHE_DISABLE=1 before running make to build PCRE2, so that ccache is not used. When --enable-coverage is used, the following additional targets are added to the Makefile: make coverage This creates a fresh coverage report for the PCRE2 test suite. It is equivalent to running "make coverage-reset", "make coverage-baseline", "make check", and then "make coverage-report". make coverage-reset This zeroes the coverage counters, but does nothing else. make coverage-baseline This captures baseline coverage information. make coverage-report This creates the coverage report. make coverage-clean-report This removes the generated coverage report without cleaning the cover- age data itself. make coverage-clean-data This removes the captured coverage data without removing the coverage files created at compile time (*.gcno). make coverage-clean This cleans all coverage data including the generated coverage report. For more information about code coverage, see the gcov and lcov docu- mentation. DISABLING THE Z AND T FORMATTING MODIFIERS The C99 standard defines formatting modifiers z and t for size_t and ptrdiff_t values, respectively. By default, PCRE2 uses these modifiers in environments other than old versions of Microsoft Visual Studio when __STDC_VERSION__ is defined and has a value greater than or equal to 199901L (indicating support for C99).
However, there is at least one environment that claims to be C99 but does not support these modifiers. If --disable-percent-zt is specified, no use is made of the z or t modifiers. Instead of %td or %zu, a suitable format is used depending on the size of long for the platform. SUPPORT FOR FUZZERS There is a special option for use by people who want to run fuzzing tests on PCRE2: --enable-fuzz-support At present this applies only to the 8-bit library. If set, it causes an extra library called libpcre2-fuzzsupport.a to be built, but not in- stalled. This contains a single function called LLVMFuzzerTestOneIn- put() whose arguments are a pointer to a string and the length of the string. When called, this function tries to compile the string as a pattern, and if that succeeds, to match it. This is done both with no options and with some random options bits that are generated from the string. Setting --enable-fuzz-support also causes a binary called pcre2fuz- zcheck to be created. This is normally run under valgrind or used when PCRE2 is compiled with address sanitizing enabled. It calls the fuzzing function and outputs information about what it is doing. The input strings are specified by arguments: if an argument starts with "=" the rest of it is a literal input string. Otherwise, it is assumed to be a file name, and the contents of the file are the test string. OBSOLETE OPTION In versions of PCRE2 prior to 10.30, there were two ways of handling backtracking in the pcre2_match() function. The default was to use the system stack, but if --disable-stack-for-recursion was set, memory on the heap was used. From release 10.30 onwards this has changed (the stack is no longer used) and this option now does nothing except give a warning. SEE ALSO pcre2api(3), pcre2-config(3). AUTHOR Philip Hazel Retired from University Computing Service Cambridge, England. REVISION Last updated: 17 October 2025 Copyright (c) 1997-2024 University of Cambridge.
PCRE2 10.48-DEV 17 October 2025 PCRE2BUILD(3) ------------------------------------------------------------------------------ PCRE2CALLOUT(3) Library Functions Manual PCRE2CALLOUT(3) NAME PCRE2 - Perl-compatible regular expressions (revised API) SYNOPSIS #include <pcre2.h> int (*pcre2_callout)(pcre2_callout_block *, void *); int pcre2_callout_enumerate(const pcre2_code *code, int (*callback)(pcre2_callout_enumerate_block *, void *), void *user_data); DESCRIPTION PCRE2 provides a feature called "callout", which is a means of tem- porarily passing control to the caller of PCRE2 in the middle of pat- tern matching. The caller of PCRE2 provides an external function by putting its entry point in a match context (see pcre2_set_callout() in the pcre2api documentation). When using the pcre2_substitute() function, an additional callout fea- ture is available. This does a callout after each change to the subject string and is described in the pcre2api documentation; the rest of this document is concerned with callouts during pattern matching. Within a regular expression, (?C) indicates a point at which the external function is to be called. Different callout points can be identified by putting a number less than 256 after the letter C. The default value is zero. Alternatively, the argument may be a delimited string. The starting delimiter must be one of ` ' " ^ % # $ { and the ending delimiter is the same as the start, except for {, where the end- ing delimiter is }. If the ending delimiter is needed within the string, it must be doubled. For example, this pattern has two callout points: (?C1)abc(?C"some ""arbitrary"" text")def If the PCRE2_AUTO_CALLOUT option bit is set when a pattern is compiled, PCRE2 automatically inserts callouts, all with number 255, before each item in the pattern except for immediately before or after an explicit callout.
For example, if PCRE2_AUTO_CALLOUT is used with the pattern A(?C3)B it is processed as if it were (?C255)A(?C3)B(?C255) Here is a more complicated example: A(\d{2}|--) With PCRE2_AUTO_CALLOUT, this pattern is processed as if it were (?C255)A(?C255)((?C255)\d{2}(?C255)|(?C255)-(?C255)-(?C255))(?C255) Notice that there is a callout before and after each parenthesis and alternation bar. If the pattern contains a conditional group whose con- dition is an assertion, an automatic callout is inserted immediately before the condition. Such a callout may also be inserted explicitly, for example: (?(?C9)(?=a)ab|de) (?(?C%text%)(?!=d)ab|de) This applies only to assertion conditions (because they are themselves independent groups). Callouts can be useful for tracking the progress of pattern matching. The pcre2test program has a pattern qualifier (/auto_callout) that sets automatic callouts. When any callouts are present, the output from pcre2test indicates how the pattern is being matched. This is useful information when you are trying to optimize the performance of a par- ticular pattern. MISSING CALLOUTS You should be aware that, because of optimizations in the way PCRE2 compiles and matches patterns, callouts sometimes do not happen exactly as you might expect. Auto-possessification At compile time, PCRE2 "auto-possessifies" repeated items when it knows that what follows cannot be part of the repeat. For example, a+[bc] is compiled as if it were a++[bc]. The pcre2test output when this pattern is compiled with PCRE2_ANCHORED and PCRE2_AUTO_CALLOUT and then applied to the string "aaaa" is: --->aaaa +0 ^ a+ +2 ^ ^ [bc] No match This indicates that when matching [bc] fails, there is no backtracking into a+ (because it is being treated as a++) and therefore the callouts that would be taken for the backtracks do not occur. You can disable the auto-possessify feature by passing PCRE2_NO_AUTO_POSSESS to pcre2_compile(), or starting the pattern with (*NO_AUTO_POSSESS). 
In this case, the output changes to this: --->aaaa +0 ^ a+ +2 ^ ^ [bc] +2 ^ ^ [bc] +2 ^ ^ [bc] +2 ^^ [bc] No match This time, when matching [bc] fails, the matcher backtracks into a+ and tries again, repeatedly, until a+ itself fails. Automatic .* anchoring By default, an optimization is applied when .* is the first significant item in a pattern. If PCRE2_DOTALL is set, so that the dot can match any character, the pattern is automatically anchored. If PCRE2_DOTALL is not set, a match can start only after an internal newline or at the beginning of the subject, and pcre2_compile() remembers this. If a pat- tern has more than one top-level branch, automatic anchoring occurs if all branches are anchorable. This optimization is disabled, however, if .* is in an atomic group or if there is a backreference to the capture group in which it appears. It is also disabled if the pattern contains (*PRUNE) or (*SKIP). How- ever, the presence of callouts does not affect it. For example, if the pattern .*\d is compiled with PCRE2_AUTO_CALLOUT and applied to the string "aa", the pcre2test output is: --->aa +0 ^ .* +2 ^ ^ \d +2 ^^ \d +2 ^ \d No match This shows that all match attempts start at the beginning of the sub- ject. In other words, the pattern is anchored. You can disable this op- timization by passing PCRE2_NO_DOTSTAR_ANCHOR to pcre2_compile(), or starting the pattern with (*NO_DOTSTAR_ANCHOR). In this case, the out- put changes to: --->aa +0 ^ .* +2 ^ ^ \d +2 ^^ \d +2 ^ \d +0 ^ .* +2 ^^ \d +2 ^ \d No match This shows more match attempts, starting at the second subject charac- ter. Another optimization, described in the next section, means that there is no subsequent attempt to match with an empty subject. Other optimizations Other optimizations that provide fast "no match" results also affect callouts. For example, if the pattern is ab(?C4)cd PCRE2 knows that any matching string must contain the letter "d". 
If the subject string is "abyz", the lack of "d" means that matching doesn't ever start, and the callout is never reached. However, with "abyd", though the result is still no match, the callout is obeyed. For most patterns PCRE2 also knows the minimum length of a matching string, and will immediately give a "no match" return without actually running a match if the subject is not long enough, or, for unanchored patterns, if it has been scanned far enough. You can disable these optimizations by passing the PCRE2_NO_START_OPTI- MIZE option to pcre2_compile(), or by starting the pattern with (*NO_START_OPT). This slows down the matching process, but does ensure that callouts such as the example above are obeyed. THE CALLOUT INTERFACE During matching, when PCRE2 reaches a callout point, if an external function is provided in the match context, it is called. This applies to both normal, DFA, and JIT matching. The first argument to the call- out function is a pointer to a pcre2_callout block. The second argument is the void * callout data that was supplied when the callout was set up by calling pcre2_set_callout() (see the pcre2api documentation). The callout block structure contains the following fields, not necessarily in this order: uint32_t version; uint32_t callout_number; uint32_t capture_top; uint32_t capture_last; uint32_t callout_flags; PCRE2_SIZE *offset_vector; PCRE2_SPTR mark; PCRE2_SPTR subject; PCRE2_SIZE subject_length; PCRE2_SIZE start_match; PCRE2_SIZE current_position; PCRE2_SIZE pattern_position; PCRE2_SIZE next_item_length; PCRE2_SIZE callout_string_offset; PCRE2_SIZE callout_string_length; PCRE2_SPTR callout_string; The version field contains the version number of the block format. The current version is 2; the three callout string fields were added for version 1, and the callout_flags field for version 2. 
If you are writ- ing an application that might use an earlier release of PCRE2, you should check the version number before accessing any of these fields. The version number will increase in future if more fields are added, but the intention is never to remove any of the existing fields. Fields for numerical callouts For a numerical callout, callout_string is NULL, and callout_number contains the number of the callout, in the range 0-255. This is the number that follows (?C for callouts that are part of the pattern; it is 255 for automatically generated callouts. Fields for string callouts For callouts with string arguments, callout_number is always zero, and callout_string points to the string that is contained within the com- piled pattern. Its length is given by callout_string_length. Duplicated ending delimiters that were present in the original pattern string have been turned into single characters, but there is no other processing of the callout string argument. An additional code unit containing binary zero is present after the string, but is not included in the length. The delimiter that was used to start the string is also stored within the pattern, immediately before the string itself. You can access this delimiter as callout_string[-1] if you need it. The callout_string_offset field is the code unit offset to the start of the callout argument string within the original pattern string. This is provided for the benefit of applications such as script languages that might need to report errors in the callout string within the pattern. Fields for all callouts The remaining fields in the callout block are the same for both kinds of callout. The offset_vector field is a pointer to a vector of capturing offsets (the "ovector"). You may read the elements in this vector, but you must not change any of them.
For calls to pcre2_match(), the offset_vector field is not (since re- lease 10.30) a pointer to the actual ovector that was passed to the matching function in the match data block. Instead it points to an in- ternal ovector of a size large enough to hold all possible captured substrings in the pattern. Note that whenever a recursion or subroutine call within a pattern completes, the capturing state is reset to what it was before. The capture_last field contains the number of the most recently cap- tured substring, and the capture_top field contains one more than the number of the highest numbered captured substring so far. If no sub- strings have yet been captured, the value of capture_last is 0 and the value of capture_top is 1. The values of these fields do not always differ by one; for example, when the callout in the pattern ((a)(b))(?C2) is taken, capture_last is 1 but capture_top is 4. The contents of ovector[2] to ovector[<capture_top>*2-1] can be in- spected in order to extract substrings that have been matched so far, in the same way as extracting substrings after a match has completed. The values in ovector[0] and ovector[1] are always PCRE2_UNSET because the match is by definition not complete. Substrings that have not been captured but whose numbers are less than capture_top also have both of their ovector slots set to PCRE2_UNSET. For DFA matching, the offset_vector field points to the ovector that was passed to the matching function in the match data block for call- outs at the top level, but to an internal ovector during the processing of pattern recursions, lookarounds, and atomic groups. However, these ovectors hold no useful information because pcre2_dfa_match() does not support substring capturing. The value of capture_top is always 1 and the value of capture_last is always 0 for DFA matching. The subject and subject_length fields contain copies of the values that were passed to the matching function.
The start_match field normally contains the offset within the subject at which the current match attempt started. However, if the escape se- quence \K has been encountered, this value is changed to reflect the modified starting point. If the pattern is not anchored, the callout function may be called several times from the same point in the pattern for different starting points in the subject. The current_position field contains the offset within the subject of the current match pointer. The pattern_position field contains the offset in the pattern string to the next item to be matched. The next_item_length field contains the length of the next item to be processed in the pattern string. When the callout is at the end of the pattern, the length is zero. When the callout precedes an opening parenthesis, the length includes meta characters that follow the paren- thesis. For example, in a callout before an assertion such as (?=ab) the length is 3. For an alternation bar or a closing parenthesis, the length is one, unless a closing parenthesis is followed by a quanti- fier, in which case its length is included. (This changed in release 10.23. In earlier releases, before an opening parenthesis the length was that of the entire group, and before an alternation bar or a clos- ing parenthesis the length was zero.) The pattern_position and next_item_length fields are intended to help in distinguishing between different automatic callouts, which all have the same callout number. However, they are set for all callouts, and are used by pcre2test to show the next item to be matched when display- ing callout information. In callouts from pcre2_match() the mark field contains a pointer to the zero-terminated name of the most recently passed (*MARK), (*PRUNE), or (*THEN) item in the match, or NULL if no such items have been passed. Instances of (*PRUNE) or (*THEN) without a name do not obliterate a previous (*MARK). 
In callouts from the DFA matching function this field always contains NULL. The callout_flags field is always zero in callouts from pcre2_dfa_match() or when JIT is being used. When pcre2_match() without JIT is used, the following bits may be set: PCRE2_CALLOUT_STARTMATCH This is set for the first callout after the start of matching for each new starting position in the subject. PCRE2_CALLOUT_BACKTRACK This is set if there has been a matching backtrack since the previous callout, or since the start of matching if this is the first callout from a pcre2_match() run. Both bits are set when a backtrack has caused a "bumpalong" to a new starting position in the subject. Output from pcre2test does not indi- cate the presence of these bits unless the callout_extra modifier is set. The information in the callout_flags field is provided so that applica- tions can track and tell their users how matching with backtracking is done. This can be useful when trying to optimize patterns, or just to understand how PCRE2 works. There is no support in pcre2_dfa_match() because there is no backtracking in DFA matching, and there is no sup- port in JIT because JIT is all about maximizing matching performance. In both these cases the callout_flags field is always zero. RETURN VALUES FROM CALLOUTS The external callout function returns an integer to PCRE2. If the value is zero, matching proceeds as normal. If the value is greater than zero, matching fails at the current point, but the testing of other matching possibilities goes ahead, just as if a lookahead assertion had failed. If the value is less than zero, the match is abandoned, and the matching function returns the negative value. Negative values should normally be chosen from the set of PCRE2_ER- ROR_xxx values. In particular, PCRE2_ERROR_NOMATCH forces a standard "no match" failure. The error number PCRE2_ERROR_CALLOUT is reserved for use by callout functions; it will never be used by PCRE2 itself.
CALLOUT ENUMERATION int pcre2_callout_enumerate(const pcre2_code *code, int (*callback)(pcre2_callout_enumerate_block *, void *), void *user_data); A script language that supports the use of string arguments in callouts might like to scan all the callouts in a pattern before running the match. This can be done by calling pcre2_callout_enumerate(). The first argument is a pointer to a compiled pattern, the second points to a callback function, and the third is arbitrary user data. The callback function is called for every callout in the pattern in the order in which they appear. Its first argument is a pointer to a callout enumer- ation block, and its second argument is the user_data value that was passed to pcre2_callout_enumerate(). The data block contains the fol- lowing fields: version Block version number pattern_position Offset to next item in pattern next_item_length Length of next item in pattern callout_number Number for numbered callouts callout_string_offset Offset to string within pattern callout_string_length Length of callout string callout_string Points to callout string or is NULL The version number is currently 0. It will increase if new fields are ever added to the block. The remaining fields are the same as their namesakes in the pcre2_callout block that is used for callouts during matching, as described above. Note that the value of pattern_position is unique for each callout. However, if a callout occurs inside a group that is quantified with a non-zero minimum or a fixed maximum, the group is replicated inside the compiled pattern. For example, a pattern such as /(a){2}/ is compiled as if it were /(a)(a)/. This means that the callout will be enumerated more than once, but with the same value for pattern_position in each case. The callback function should normally return zero. If it returns a non- zero value, scanning the pattern stops, and that value is returned from pcre2_callout_enumerate(). 
AUTHOR Philip Hazel Retired from University Computing Service Cambridge, England. REVISION Last updated: 26 February 2025 Copyright (c) 1997-2024 University of Cambridge. PCRE2 10.48-DEV 26 February 2025 PCRE2CALLOUT(3) ------------------------------------------------------------------------------ PCRE2COMPAT(3) Library Functions Manual PCRE2COMPAT(3) NAME PCRE2 - Perl-compatible regular expressions (revised API) DIFFERENCES BETWEEN PCRE2 AND PERL This document describes some of the known differences in the ways that PCRE2 and Perl handle regular expressions. The differences described here are with respect to Perl version 5.38.0, but as both Perl and PCRE2 are continually changing, the information may at times be out of date. 1. When PCRE2_DOTALL (equivalent to Perl's /s qualifier) is not set, the behaviour of the '.' metacharacter differs from Perl. In PCRE2, '.' matches the next character unless it is the start of a newline se- quence. This means that, if the newline setting is CR, CRLF, or NUL, '.' will match the code point LF (0x0A) in ASCII/Unicode environments, and NL (either 0x15 or 0x25) when using EBCDIC. In Perl, '.' appears never to match LF, even when 0x0A is not a newline indicator. 2. PCRE2 has only a subset of Perl's Unicode support. Details of what it does have are given in the pcre2unicode page. 3. Like Perl, PCRE2 allows repeat quantifiers on parenthesized asser- tions, but they do not mean what you might think. For example, (?!a){3} does not assert that the next three characters are not "a". It just as- serts that the next character is not "a" three times (in principle; PCRE2 optimizes this to run the assertion just once). Perl allows some repeat quantifiers on other assertions, for example, \b* , but these do not seem to have any use. PCRE2 does not allow any kind of quantifier on non-lookaround assertions. 4. 
If a braced quantifier such as {1,2} appears where there is nothing to repeat (for example, at the start of a branch), PCRE2 raises an er- ror whereas Perl treats the quantifier characters as literal. When a braced quantifier (...){min,max} has min > max, Perl treats it as an item which fails to match any portion of the subject (as no number of repetitions can meet the condition), and additionally issues a warning when in warning mode. PCRE2 has no warning features, so it gives an er- ror in this case. 5. Capture groups that occur inside negative lookaround assertions are counted, but their entries in the offsets vector are set only when a negative assertion is a condition that has a matching branch (that is, the condition is false). Perl may set such capture groups in other circumstances. 6. The following Perl escape sequences are not supported: \F, \l, \L, \u, \U, and \N when followed by a character name. \N on its own, match- ing a non-newline character, and \N{U+dd..}, matching a Unicode code point, are supported. The escapes that modify the case of following letters are implemented by Perl's general string-handling and are not part of its pattern matching engine. If any of these are encountered by PCRE2, an error is generated by default. However, if either of the PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX options is set, \U and \u are interpreted as ECMAScript interprets them. 7. The Perl escape sequences \p, \P, and \X are supported only if PCRE2 is built with Unicode support (the default). The properties that can be tested with \p and \P are limited to the general category properties such as Lu and Nd, the derived properties Any and Lc (synonym L&), script names such as Greek or Han, Bidi_Class, Bidi_Control, and a few binary properties. Both PCRE2 and Perl support the Cs (surrogate) prop- erty, but in PCRE2 its use is limited. See the pcre2pattern documenta- tion for details. 
The long synonyms for property names that Perl sup- ports (such as \p{Letter}) are not supported by PCRE2, nor is it per- mitted to prefix any of these properties with "Is". 8. PCRE2 supports the \Q...\E escape for quoting substrings. Characters in between are treated as literals. However, this is slightly different from Perl in that $ and @ are also handled as literals inside the quotes. In Perl, they cause variable interpolation (PCRE2 does not have variables). Also, Perl does "double-quotish backslash interpolation" on any backslashes between \Q and \E which, its documentation says, "may lead to confusing results". PCRE2 treats a backslash between \Q and \E just like any other character. Note the following examples: Pattern PCRE2 matches Perl matches \Qabc$xyz\E abc$xyz abc followed by the contents of $xyz \Qabc\$xyz\E abc\$xyz abc\$xyz \Qabc\E\$\Qxyz\E abc$xyz abc$xyz \QA\B\E A\B A\B \Q\\E \ \\E The \Q...\E sequence is recognized both inside and outside character classes by both PCRE2 and Perl. Another difference from Perl is that any appearance of \Q or \E inside what might otherwise be a quantifier causes PCRE2 not to recognize the sequence as a quantifier. Perl recog- nizes a quantifier if (redundantly) either of the numbers is inside \Q...\E, but not if the separating comma is. When not recognized as a quantifier a sequence such as {\Q1\E,2} is treated as the literal string "{1,2}". 9. Fairly obviously, PCRE2 does not support the (?{code}) and (??{code}) constructions. However, PCRE2 does have a "callout" feature, which allows an external function to be called during pattern matching. See the pcre2callout documentation for details. 10. Subroutine calls (whether recursive or not) were treated as atomic groups up to PCRE2 release 10.23, but from release 10.30 this changed, and backtracking into subroutine calls is now supported, as in Perl. 11. 
In PCRE2, if any of the backtracking control verbs are used in a group that is called as a subroutine (whether or not recursively), their effect is confined to that group; it does not extend to the sur- rounding pattern. This is not always the case in Perl. In particular, if (*THEN) is present in a group that is called as a subroutine, its action is limited to that group, even if the group does not contain any | characters. Note that such groups are processed as anchored at the point where they are tested. PCRE2 also confines all control verbs within atomic assertions, again including (*THEN) in assertions with only one branch. 12. If a pattern contains more than one backtracking control verb, the first one that is backtracked onto acts. For example, in the pattern A(*COMMIT)B(*PRUNE)C a failure in B triggers (*COMMIT), but a failure in C triggers (*PRUNE). Perl's behaviour is more complex; in many cases it is the same as PCRE2, but there are cases where it differs. 13. There are some differences that are concerned with the settings of captured strings when part of a pattern is repeated. For example, matching "aba" against the pattern /^(a(b)?)+$/ in Perl leaves $2 un- set, but in PCRE2 it is set to "b". 14. PCRE2's handling of duplicate capture group numbers and names is not as general as Perl's. This is a consequence of the fact that PCRE2 works internally just with numbers, using an external table to translate between numbers and names. In particular, a pattern such as (?|(?<a>A)|(?<b>B)), where the two capture groups have the same number but different names, is not supported, and causes an error at compile time. If it were allowed, it would not be possible to distinguish which group matched, because both names map to capture group number 1. To avoid this confusing situation, an error is given at compile time. 15. Perl used to recognize comments in some places that PCRE2 does not, for example, between the ( and ? at the start of a group.
If the /x modifier is set, Perl allowed white space between ( and ? though the latest Perls give an error (for a while it was just deprecated). There may still be some cases where Perl behaves differently. 16. Perl, when in warning mode, gives warnings for character classes such as [A-\d] or [a-[:digit:]]. It then treats the hyphens as liter- als. PCRE2 has no warning features, so it gives an error in these cases because they are almost certainly user mistakes. 17. In PCRE2, until release 10.45, the upper/lower case character prop- erties Lu and Ll were not affected when case-independent matching was specified. Perl has changed in this respect, and PCRE2 has now changed to match. When caseless matching is in force, Lu, Ll, and Lt (title case) are all treated as Lc (cased letter). 18. From release 5.32.0, Perl locks out the use of \K in lookaround as- sertions. From release 10.38 PCRE2 does the same by default. However, there is an option for re-enabling the previous behaviour. When this option is set, \K is acted on when it occurs in positive assertions, but is ignored in negative assertions. 19. PCRE2 provides some extensions to the Perl regular expression fa- cilities. Perl 5.10 included new features that were not in earlier versions of Perl, some of which (such as named parentheses) were in PCRE2 for some time before. This list is with respect to Perl 5.38: (a) If PCRE2_DOLLAR_ENDONLY is set and PCRE2_MULTILINE is not set, the $ meta-character matches only at the very end of the string. (b) A backslash followed by a letter with no special meaning is faulted. (Perl can be made to issue a warning.) (c) If PCRE2_UNGREEDY is set, the greediness of the repetition quanti- fiers is inverted, that is, by default they are not greedy, but if fol- lowed by a question mark they are. (d) PCRE2_ANCHORED can be used at matching time to force a pattern to be tried only at the first matching position in the subject string. 
(e) The PCRE2_NOTBOL, PCRE2_NOTEOL, PCRE2_NOTEMPTY and PCRE2_NOTEMPTY_ATSTART options have no Perl equivalents. (f) The \R escape sequence can be restricted to match only CR, LF, or CRLF by the PCRE2_BSR_ANYCRLF option. (g) The callout facility is PCRE2-specific. Perl supports codeblocks and variable interpolation, but not general hooks on every match. (h) The partial matching facility is PCRE2-specific. (i) The alternative matching function (pcre2_dfa_match()) matches in a different way and is not Perl-compatible. (j) PCRE2 recognizes some special sequences such as (*CR) or (*NO_JIT) at the start of a pattern. These set overall options that cannot be changed within the pattern. (k) PCRE2 supports non-atomic positive lookaround assertions. This is an extension to the lookaround facilities. The default, Perl-compatible lookarounds are atomic. (l) There are three syntactical items in patterns that can refer to a capturing group by number: back references such as \g{2}, subroutine calls such as (?3), and condition references such as (?(4)...). PCRE2 supports relative group numbers such as +2 and -4 in all three cases. Perl supports both plus and minus for subroutine calls, but only minus for back references, and no relative numbering at all for conditions. (m) The scan substring assertion (syntax (*scs:(n)...)) is a PCRE2 ex- tension that is not available in Perl. 20. Perl has different limits than PCRE2. See the pcre2limits documen- tation for details. In release 5.10, Perl changed from recursion to iteration, keeping the intermediate matches on the heap, which is ~10% slower but is not subject to any stack-overflow limit. PCRE2 made a similar change at release 10.30, and also has many build-time and run-time cus- tomizable limits. 21. Unlike Perl, PCRE2 doesn't have character set modifiers and, in particular, has no way to choose character rules by context in the way that Perl's "/d" does.
A regular expression using PCRE2_UTF and PCRE2_UCP will use similar rules to Perl's "/u"; something closer to "/a" could be selected by adding other PCRE2_EXTRA_ASCII* options on top. 22. Some recursive patterns that Perl diagnoses as infinite recursions can be handled by PCRE2, either by the interpreter or the JIT. An exam- ple is /(?:|(?0)abcd)(?(R)|\z)/, which matches a sequence of any number of repeated "abcd" substrings at the end of the subject. 23. Both PCRE2 and Perl error when \x{ escapes are invalid, but Perl tries to recover and prints a warning if the problem was that an in- valid hexadecimal digit was found. Since PCRE2 doesn't have warnings it returns an error instead. Additionally, Perl accepts \x{} and gener- ates NUL unlike PCRE2. 24. From release 10.45, PCRE2 gives an error if \x is not followed by a hexadecimal digit or a curly bracket. It used to interpret this as the NUL character. Perl still generates NUL, but warns when in warning mode in most cases. AUTHOR Philip Hazel Retired from University Computing Service Cambridge, England. REVISION Last updated: 02 June 2025 Copyright (c) 1997-2024 University of Cambridge. PCRE2 10.48-DEV 02 June 2025 PCRE2COMPAT(3) ------------------------------------------------------------------------------ PCRE2JIT(3) Library Functions Manual PCRE2JIT(3) NAME PCRE2 - Perl-compatible regular expressions (revised API) PCRE2 JUST-IN-TIME COMPILER SUPPORT Just-in-time compiling is a heavyweight optimization that can greatly speed up pattern matching. However, it comes at the cost of extra pro- cessing before the match is performed, so it is of most benefit when the same pattern is going to be matched many times. This does not nec- essarily mean many calls of a matching function; if the pattern is not anchored, matching attempts may take place many times at various posi- tions in the subject, even for a single call. Therefore, if the subject string is very long, it may still pay to use JIT even for one-off matches. 
JIT support is available for all of the 8-bit, 16-bit and 32-bit PCRE2 libraries. JIT support applies only to the traditional Perl-compatible matching function. It does not apply when the DFA matching function is being used. The code for JIT support was written by Zoltan Herczeg. AVAILABILITY OF JIT SUPPORT JIT support is an optional feature of PCRE2. The "configure" option --enable-jit (or equivalent CMake option) must be set when PCRE2 is built if you want to use JIT. The support is limited to the following hardware platforms: ARM 32-bit (v7, and Thumb2) ARM 64-bit IBM s390x 64 bit Intel x86 32-bit and 64-bit LoongArch 64 bit MIPS 32-bit and 64-bit Power PC 32-bit and 64-bit RISC-V 32-bit and 64-bit If --enable-jit is set on an unsupported platform, compilation fails. A client program can tell if JIT support has been compiled by calling pcre2_config() with the PCRE2_CONFIG_JIT option. The result is one if PCRE2 was built with JIT support, and zero otherwise. However, having the JIT code available does not guarantee that it will be used for any particular match. One reason for this is that there are a number of op- tions and pattern items that are not supported by JIT (see below). An- other reason is that in some environments JIT is unable to get exe- cutable memory in which to build its compiled code. The only guarantee from pcre2_config() is that if it returns zero, JIT will definitely not be used. As of release 10.45 there is a more informative way to test for JIT support. If pcre2_jit_compile() is called with the single option PCRE2_JIT_TEST_ALLOC it returns zero if JIT is available and has a working allocator. Otherwise it returns PCRE2_ERROR_NOMEMORY if JIT is available but cannot allocate executable memory, or PCRE2_ERROR_JIT_UNSUPPORTED if JIT support is not compiled. The code argument is ignored, so it can be a NULL value. A simple program does not need to check availability in order to use JIT when possible.
The API is implemented in a way that falls back to the interpretive code if JIT is not available or cannot be used for a given match. For programs that need the best possible performance, there is a "fast path" API that is JIT-specific. SIMPLE USE OF JIT To make use of the JIT support in the simplest way, all you have to do is to call pcre2_jit_compile() after successfully compiling a pattern with pcre2_compile(). This function has two arguments: the first is the compiled pattern pointer that was returned by pcre2_compile(), and the second is zero or more of the following option bits: PCRE2_JIT_COM- PLETE, PCRE2_JIT_PARTIAL_HARD, or PCRE2_JIT_PARTIAL_SOFT. If JIT support is not available, a call to pcre2_jit_compile() does nothing and returns PCRE2_ERROR_JIT_BADOPTION. Otherwise, the compiled pattern is passed to the JIT compiler, which turns it into machine code that executes much faster than the normal interpretive code, but yields exactly the same results. The returned value from pcre2_jit_compile() is zero on success, or a negative error code. There is a limit to the size of pattern that JIT supports, imposed by the size of machine stack that it uses. The exact rules are not docu- mented because they may change at any time, in particular, when new op- timizations are introduced. If a pattern is too big, a call to pcre2_jit_compile() returns PCRE2_ERROR_NOMEMORY. PCRE2_JIT_COMPLETE requests the JIT compiler to generate code for com- plete matches. If you want to run partial matches using the PCRE2_PAR- TIAL_HARD or PCRE2_PARTIAL_SOFT options of pcre2_match(), you should set one or both of the other options as well as, or instead of PCRE2_JIT_COMPLETE. The JIT compiler generates different optimized code for each of the three modes (normal, soft partial, hard partial). When pcre2_match() is called, the appropriate code is run if it is avail- able. Otherwise, the pattern is matched using interpretive code. 
You can call pcre2_jit_compile() multiple times for the same compiled pattern. It does nothing if it has previously compiled code for any of the option bits. For example, you can call it once with PCRE2_JIT_COM- PLETE and (perhaps later, when you find you need partial matching) again with PCRE2_JIT_COMPLETE and PCRE2_JIT_PARTIAL_HARD. This time it will ignore PCRE2_JIT_COMPLETE and just compile code for partial match- ing. If pcre2_jit_compile() is called with no option bits set, it imme- diately returns zero. This is an alternative way of testing whether JIT support has been compiled. At present, it is not possible to free JIT compiled code except when the entire compiled pattern is freed by calling pcre2_code_free(). In some circumstances you may need to call additional functions. These are described in the section entitled "Controlling the JIT stack" be- low. There are some pcre2_match() options that are not supported by JIT, and there are also some pattern items that JIT cannot handle. Details are given below. In both cases, matching automatically falls back to the interpretive code. If you want to know whether JIT was actually used for a particular match, you should arrange for a JIT callback function to be set up as described in the section entitled "Controlling the JIT stack" below, even if you do not need to supply a non-default JIT stack. Such a callback function is called whenever JIT code is about to be obeyed. If the match-time options are not right for JIT execution, the callback function is not obeyed. If the JIT compiler finds an unsupported item, no JIT data is gener- ated. You can find out if JIT compilation was successful for a compiled pattern by calling pcre2_pattern_info() with the PCRE2_INFO_JITSIZE op- tion. A non-zero result means that JIT compilation was successful. A result of 0 means that JIT support is not available, or the pattern was not processed by pcre2_jit_compile(), or the JIT compiler was not able to handle the pattern. 
Successful JIT compilation does not, however, guarantee the use of JIT at match time because there are some match time options that are not supported by JIT. MATCHING SUBJECTS CONTAINING INVALID UTF When a pattern is compiled with the PCRE2_UTF option, subject strings are normally expected to be a valid sequence of UTF code units. By de- fault, this is checked at the start of matching and an error is gener- ated if invalid UTF is detected. The PCRE2_NO_UTF_CHECK option can be passed to pcre2_match() to skip the check (for improved performance) if you are sure that a subject string is valid. If this option is used with an invalid string, the result is undefined. The calling program may crash or loop or otherwise misbehave. However, a way of running matches on strings that may contain invalid UTF sequences is available. Calling pcre2_compile() with the PCRE2_MATCH_INVALID_UTF option has two effects: it tells the inter- preter in pcre2_match() to support invalid UTF, and, if pcre2_jit_com- pile() is subsequently called, the compiled JIT code also supports in- valid UTF. Details of how this support works, in both the JIT and the interpretive cases, are given in the pcre2unicode documentation. There is also an obsolete option for pcre2_jit_compile() called PCRE2_JIT_INVALID_UTF, which currently exists only for backward compat- ibility. It is superseded by the pcre2_compile() option PCRE2_MATCH_INVALID_UTF and should no longer be used. It may be removed in future. UNSUPPORTED OPTIONS AND PATTERN ITEMS The pcre2_match() options that are supported for JIT matching are PCRE2_COPY_MATCHED_SUBJECT, PCRE2_NOTBOL, PCRE2_NOTEOL, PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART, PCRE2_NO_UTF_CHECK, PCRE2_PARTIAL_HARD, and PCRE2_PARTIAL_SOFT. The PCRE2_ANCHORED and PCRE2_ENDANCHORED options are not supported at match time. If the PCRE2_NO_JIT option is passed to pcre2_match() it disables the use of JIT, forcing matching by the interpreter code.
The only unsupported pattern items are \C (match a single data unit) when running in a UTF mode, and a callout immediately before an asser- tion condition in a conditional group. RETURN VALUES FROM JIT MATCHING When a pattern is matched using JIT, the return values are the same as those given by the interpretive pcre2_match() code, with the addition of one new error code: PCRE2_ERROR_JIT_STACKLIMIT. This means that the memory used for the JIT stack was insufficient. See "Controlling the JIT stack" below for a discussion of JIT stack usage. The error code PCRE2_ERROR_MATCHLIMIT is returned by the JIT code if searching a very large pattern tree goes on for too long, as it is in the same circumstance when JIT is not used, but the details of exactly what is counted are not the same. The PCRE2_ERROR_DEPTHLIMIT error code is never returned when JIT matching is used. CONTROLLING THE JIT STACK When the compiled JIT code runs, it needs a block of memory to use as a stack. By default, it uses 32KiB on the machine stack. However, some large or complicated patterns need more than this. The error PCRE2_ER- ROR_JIT_STACKLIMIT is given when there is not enough stack. Three func- tions are provided for managing blocks of memory for use as JIT stacks. There is further discussion about the use of JIT stacks in the section entitled "JIT stack FAQ" below. The pcre2_jit_stack_create() function creates a JIT stack. Its argu- ments are a starting size, a maximum size, and a general context (for memory allocation functions, or NULL for standard memory allocation). It returns a pointer to an opaque structure of type pcre2_jit_stack, or NULL if there is an error. The pcre2_jit_stack_free() function is used to free a stack that is no longer needed. If its argument is NULL, this function returns immediately, without doing anything. (For the techni- cally minded: the address space is allocated by mmap or VirtualAlloc.) 
A maximum stack size of 512KiB to 1MiB should be more than enough for any pattern. The pcre2_jit_stack_assign() function specifies which stack JIT code should use. Its arguments are as follows: pcre2_match_context *mcontext pcre2_jit_callback callback void *data The first argument is a pointer to a match context. When this is subse- quently passed to a matching function, its information determines which JIT stack is used. If this argument is NULL, the function returns imme- diately, without doing anything. There are three cases for the values of the other two options: (1) If callback is NULL and data is NULL, an internal 32KiB block on the machine stack is used. This is the default when a match context is created. (2) If callback is NULL and data is not NULL, data must be a pointer to a valid JIT stack, the result of calling pcre2_jit_stack_create(). (3) If callback is not NULL, it must point to a function that is called with data as an argument at the start of matching, in order to set up a JIT stack. If the return from the callback function is NULL, the internal 32KiB stack is used; otherwise the return value must be a valid JIT stack, the result of calling pcre2_jit_stack_create(). A callback function is obeyed whenever JIT code is about to be run; it is not obeyed when pcre2_match() is called with options that are incom- patible for JIT matching. A callback function can therefore be used to determine whether a match operation was executed by JIT or by the in- terpreter. You may safely use the same JIT stack for more than one pattern (either by assigning directly or by callback), as long as the patterns are matched sequentially in the same thread. Currently, the only way to set up non-sequential matches in one thread is to use callouts: if a call- out function starts another match, that match must use a different JIT stack to the one used for currently suspended match(es). 
In a multithread application, if you do not specify a JIT stack, or if you assign or pass back NULL from a callback, that is thread-safe, be- cause each thread has its own machine stack. However, if you assign or pass back a non-NULL JIT stack, this must be a different stack for each thread so that the application is thread-safe. Strictly speaking, even more is allowed. You can assign the same non- NULL stack to a match context that is used by any number of patterns, as long as they are not used for matching by multiple threads at the same time. For example, you could use the same stack in all compiled patterns, with a global mutex in the callback to wait until the stack is available for use. However, this is an inefficient solution, and not recommended. This is a suggestion for how a multithreaded program that needs to set up non-default JIT stacks might operate: During thread initialization thread_local_var = pcre2_jit_stack_create(...) During thread exit pcre2_jit_stack_free(thread_local_var) Use a one-line callback function return thread_local_var All the functions described in this section do nothing if JIT is not available. JIT STACK FAQ (1) Why do we need JIT stacks? PCRE2 (and JIT) is a recursive, depth-first engine, so it needs a stack where the local data of the current node is pushed before checking its child nodes. Allocating real machine stack on some platforms is diffi- cult. For example, the stack chain needs to be updated every time if we extend the stack on PowerPC. Although it is possible, its updating time overhead decreases performance. So we do the recursion in memory. (2) Why don't we simply allocate blocks of memory with malloc()? Modern operating systems have a nice feature: they can reserve an ad- dress space instead of allocating memory. We can safely allocate memory pages inside this address space, so the stack could grow without moving memory data (this is important because of pointers). 
Thus we can allo- cate 1MiB address space, and use only a single memory page (usually 4KiB) if that is enough. However, we can still grow up to 1MiB anytime if needed. (3) Who "owns" a JIT stack? The owner of the stack is the user program, not the JIT studied pattern or anything else. The user program must ensure that if a stack is being used by pcre2_match(), (that is, it is assigned to a match context that is passed to the pattern currently running), that stack must not be used by any other threads (to avoid overwriting the same memory area). The best practice for multithreaded programs is to allocate a stack for each thread, and return this stack through the JIT callback function. (4) When should a JIT stack be freed? You can free a JIT stack at any time, as long as it will not be used by pcre2_match() again. When you assign the stack to a match context, only a pointer is set. There is no reference counting or any other magic. You can free compiled patterns, contexts, and stacks in any order, any- time. Just do not call pcre2_match() with a match context pointing to an already freed stack, as that will cause a SEGFAULT. (Also, do not free a stack currently used by pcre2_match() in another thread). You can also replace the stack in a context at any time when it is not in use. You should free the previous stack before assigning a replacement. (5) Should I allocate/free a stack every time before/after calling pcre2_match()? No, because this is too costly in terms of resources. However, you could implement some clever scheme that releases the stack if it has not been used for, say, two minutes. The JIT callback can help to achieve this without keeping a list of patterns. (6) OK, the stack is for long term memory allocation. But what happens if a pattern causes stack overflow with a stack of 1MiB? Is that 1MiB kept until the stack is freed? Especially on embedded systems, it might be a good idea to release mem- ory sometimes without freeing the stack.
There is no API for this at the moment. Probably a function call which returns with the currently allocated memory for any stack and another which allows releasing mem- ory (shrinking the stack) would be a good idea if someone needs this. (7) This is too much of a headache. Isn't there any better solution for JIT stack handling? No, thanks to Windows. If POSIX threads were used everywhere, we could throw out this complicated API. FREEING JIT SPECULATIVE MEMORY void pcre2_jit_free_unused_memory(pcre2_general_context *gcontext); The JIT executable allocator does not free all memory when it is possi- ble. It expects new allocations, and keeps some free memory around to improve allocation speed. However, in low memory conditions, it might be better to free all possible memory. You can cause this to happen by calling pcre2_jit_free_unused_memory(). Its argument is a general con- text, for custom memory management, or NULL for standard memory manage- ment. EXAMPLE CODE This is a single-threaded example that specifies a JIT stack without using a callback. A real program should include error checking after all the function calls. 
int rc; pcre2_code *re; pcre2_match_data *match_data; pcre2_match_context *mcontext; pcre2_jit_stack *jit_stack; re = pcre2_compile(pattern, PCRE2_ZERO_TERMINATED, 0, &errornumber, &erroffset, NULL); rc = pcre2_jit_compile(re, PCRE2_JIT_COMPLETE); mcontext = pcre2_match_context_create(NULL); jit_stack = pcre2_jit_stack_create(32*1024, 512*1024, NULL); pcre2_jit_stack_assign(mcontext, NULL, jit_stack); match_data = pcre2_match_data_create(10, NULL); rc = pcre2_match(re, subject, length, 0, 0, match_data, mcontext); /* Process result */ pcre2_code_free(re); pcre2_match_data_free(match_data); pcre2_match_context_free(mcontext); pcre2_jit_stack_free(jit_stack); JIT FAST PATH API Because the API described above falls back to interpreted matching when JIT is not available, it is convenient for programs that are written for general use in many environments. However, calling JIT via pcre2_match() does have a performance impact. Programs that are written for use where JIT is known to be available, and which need the best possible performance, can instead use a "fast path" API to call JIT matching directly instead of calling pcre2_match() (obviously only for patterns that have been successfully processed by pcre2_jit_compile()). The fast path function is called pcre2_jit_match(), and it takes ex- actly the same arguments as pcre2_match(). However, the subject string must be specified with a length; PCRE2_ZERO_TERMINATED is not sup- ported. Unsupported option bits (for example, PCRE2_ANCHORED and PCRE2_ENDANCHORED) are ignored, as is the PCRE2_NO_JIT option. The re- turn values are also the same as for pcre2_match(), plus PCRE2_ER- ROR_JIT_BADOPTION if a matching mode (partial or complete) is requested that was not compiled. When you call pcre2_match(), as well as testing for invalid options, a number of other sanity checks are performed on the arguments. For exam- ple, if the subject pointer is NULL but the length is non-zero, an im- mediate error is given. 
Also, unless PCRE2_NO_UTF_CHECK is set, a UTF subject string is tested for validity. In the interests of speed, these checks do not happen on the JIT fast path. If invalid UTF data is passed when PCRE2_MATCH_INVALID_UTF was not set for pcre2_compile(), the result is undefined. The program may crash or loop or give wrong results. In the absence of PCRE2_MATCH_INVALID_UTF you should call pcre2_jit_match() in UTF mode only if you are sure the subject is valid. Bypassing the sanity checks and the pcre2_match() wrapping can give speedups of more than 10%. SEE ALSO pcre2api(3), pcre2unicode(3) AUTHOR Philip Hazel (FAQ by Zoltan Herczeg) Retired from University Computing Service Cambridge, England. REVISION Last updated: 22 August 2024 Copyright (c) 1997-2024 University of Cambridge. PCRE2 10.48-DEV 22 August 2024 PCRE2JIT(3) ------------------------------------------------------------------------------ PCRE2LIMITS(3) Library Functions Manual PCRE2LIMITS(3) NAME PCRE2 - Perl-compatible regular expressions (revised API) SIZE AND OTHER LIMITATIONS There are some size limitations in PCRE2 but it is hoped that they will never in practice be relevant. The maximum size of a compiled pattern is approximately 64 thousand code units for the 8-bit and 16-bit libraries if PCRE2 is compiled with the default internal linkage size, which is 2 bytes for these li- braries. If you want to process regular expressions that are truly enormous, you can compile PCRE2 with an internal linkage size of 3 or 4 (when building the 16-bit library, 3 is rounded up to 4). See the README file in the source distribution and the pcre2build documentation for details. In these cases the limit is substantially larger. How- ever, the speed of execution is slower. In the 32-bit library, the in- ternal linkage size is always 4. The maximum length of a source pattern string is essentially unlimited; it is the largest number a PCRE2_SIZE variable can hold. 
However, the program that calls pcre2_compile() can specify a smaller limit. The maximum length (in code units) of a subject string is one less than the largest number a PCRE2_SIZE variable can hold. PCRE2_SIZE is an un- signed integer type, usually defined as size_t. Its maximum value (that is ~(PCRE2_SIZE)0) is reserved as a special indicator for zero-termi- nated strings and unset offsets. All values in repeating quantifiers must be less than 65536. There are two different limits that apply to branches of lookbehind as- sertions. If every branch in such an assertion matches a fixed number of characters, the maximum length of any branch is 65535 characters. If any branch matches a variable number of characters, then the maximum matching length for every branch is limited. The default limit is set at compile time, defaulting to 255, but can be changed by the calling program. There is no limit to the number of parenthesized groups, but there can be no more than 65535 capture groups, and there is a limit to the depth of nesting of parenthesized subpatterns of all kinds. This is imposed in order to limit the amount of system stack used at compile time. The default limit can be specified when PCRE2 is built; if not, the default is set to 250. An application can change this limit by calling pcre2_set_parens_nest_limit() to set the limit in a compile context. The maximum length of the name for a named capture group as well as the number of such groups is configurable at build time. The maximum length for the name defaults to 128 code units, and the maximum number of such groups to 10000. The maximum length of a name in a (*MARK), (*PRUNE), (*SKIP), or (*THEN) verb is 255 code units for the 8-bit library and 65535 code units for the 16-bit and 32-bit libraries. The maximum length of a string argument to a callout is the largest number a 32-bit unsigned integer can hold. 
The maximum amount of heap memory used for matching is controlled by the heap limit, which can be set in a pattern or in a match context. The default is a very large number, effectively unlimited. AUTHOR Philip Hazel Retired from University Computing Service Cambridge, England. REVISION Last updated: 03 September 2025 Copyright (c) 1997-2023 University of Cambridge. PCRE2 10.48-DEV 03 September 2025 PCRE2LIMITS(3) ------------------------------------------------------------------------------ PCRE2MATCHING(3) Library Functions Manual PCRE2MATCHING(3) NAME PCRE2 - Perl-compatible regular expressions (revised API) PCRE2 MATCHING ALGORITHMS This document describes the two different algorithms that are available in PCRE2 for matching a compiled regular expression against a given subject string. The "standard" algorithm is the one provided by the pcre2_match() function. This works in the same way as Perl's matching function, and provides a Perl-compatible matching operation. The just- in-time (JIT) optimization that is described in the pcre2jit documenta- tion is compatible with this function. An alternative algorithm is provided by the pcre2_dfa_match() function; it operates in a different way, and is not Perl-compatible. This alter- native has advantages and disadvantages compared with the standard al- gorithm, and these are described below. When there is only one possible way in which a given subject string can match a pattern, the two algorithms give the same answer. A difference arises, however, when there are multiple possibilities. For example, if the anchored pattern ^<.*> is matched against the string <a> <b> <c> there are three possible answers. The standard algorithm finds only one of them, whereas the alternative algorithm finds all three. REGULAR EXPRESSIONS AS TREES The set of strings that are matched by a regular expression can be rep- resented as a tree structure. An unlimited repetition in the pattern makes the tree of infinite size, but it is still a tree. 
Matching the pattern to a given subject string (from a given starting point) can be thought of as a search of the tree. There are two ways to search a tree: depth-first and breadth-first, and these correspond to the two matching algorithms provided by PCRE2. THE STANDARD MATCHING ALGORITHM In the terminology of Jeffrey Friedl's book "Mastering Regular Expres- sions", the standard algorithm is an "NFA algorithm". It conducts a depth-first search of the pattern tree. That is, it proceeds along a single path through the tree, checking that the subject matches what is required. When there is a mismatch, the algorithm tries any alterna- tives at the current point, and if they all fail, it backs up to the previous branch point in the tree, and tries the next alternative branch at that level. This often involves backing up (moving to the left) in the subject string as well. The order in which repetition branches are tried is controlled by the greedy or ungreedy nature of the quantifier. If a leaf node is reached, a matching string has been found, and at that point the algorithm stops. Thus, if there is more than one possi- ble match, this algorithm returns the first one that it finds. Whether this is the shortest, the longest, or some intermediate length depends on the way the alternations and the greedy or ungreedy repetition quan- tifiers are specified in the pattern. Because it ends up with a single path through the tree, it is rela- tively straightforward for this algorithm to keep track of the sub- strings that are matched by portions of the pattern in parentheses. This provides support for capturing parentheses and backreferences. THE ALTERNATIVE MATCHING ALGORITHM This algorithm conducts a breadth-first search of the tree. Starting from the first matching point in the subject, it scans the subject string from left to right, once, character by character, and as it does this, it remembers all the paths through the tree that represent valid matches. 
In Friedl's terminology, this is a kind of "DFA algorithm", though it is not implemented as a traditional finite state machine (it keeps multiple states active simultaneously). Although the general principle of this matching algorithm is that it scans the subject string only once, without backtracking, there is one exception: when a lookaround assertion is encountered, the characters following or preceding the current point have to be independently in- spected. The scan continues until either the end of the subject is reached, or there are no more unterminated paths. At this point, terminated paths represent the different matching possibilities (if there are none, the match has failed). Thus, if there is more than one possible match, this algorithm finds all of them, and in particular, it finds the longest. The matches are returned in the output vector in decreasing order of length. There is an option to stop the algorithm after the first match (which is necessarily the shortest) is found. Note that the size of vector needed to contain all the results depends on the number of simultaneous matches, not on the number of capturing parentheses in the pattern. Using pcre2_match_data_create_from_pat- tern() to create the match data block is therefore not advisable when doing DFA matching. Note also that all the matches that are found start at the same point in the subject. If the pattern cat(er(pillar)?)? is matched against the string "the caterpillar catchment", the result is the three strings "caterpillar", "cater", and "cat" that start at the fifth character of the subject. The algorithm does not automati- cally move on to find matches that start at later positions. PCRE2's "auto-possessification" optimization usually applies to charac- ter repeats at the end of a pattern (as well as internally). For exam- ple, the pattern "a\d+" is compiled as if it were "a\d++" because there is no point even considering the possibility of backtracking into the repeated digits. 
For DFA matching, this means that only one possible match is found. If you really do want multiple matches in such cases, either use an ungreedy repeat ("a\d+?") or set the PCRE2_NO_AUTO_POS- SESS option when compiling. There are a number of features of PCRE2 regular expressions that are not supported or behave differently in the alternative matching func- tion. Those that are not supported cause an error if encountered. 1. Because the algorithm finds all possible matches, the greedy or un- greedy nature of repetition quantifiers is not relevant (though it may affect auto-possessification, as just described). During matching, greedy and ungreedy quantifiers are treated in exactly the same way. However, possessive quantifiers can make a difference when what follows could also match what is quantified, for example in a pattern like this: ^a++\w! This pattern matches "aaab!" but not "aaa!", which would be matched by a non-possessive quantifier. Similarly, if an atomic group is present, it is matched as if it were a standalone pattern at the current point, and the longest match is then "locked in" for the rest of the overall pattern. 2. When dealing with multiple paths through the tree simultaneously, it is not straightforward to keep track of captured substrings for the different matching possibilities, and PCRE2's implementation of this algorithm does not attempt to do this. This means that no captured sub- strings are available. 3. Because no substrings are captured, a number of related features are not available: (a) Backreferences; (b) Conditional expressions that use a backreference as the condition or test for a specific group recursion; (c) Script runs; (d) Scan substring assertions. 4. Because many paths through the tree may be active, the \K escape se- quence, which resets the start of the match when encountered (but may be on some paths and not on others), is not supported. 5. 
Callouts are supported, but the value of the capture_top field is always 1, and the value of the capture_last field is always 0. 6. The \C escape sequence, which (in the standard algorithm) always matches a single code unit, even in a UTF mode, is not supported in UTF modes because the alternative algorithm moves through the subject string one character (not code unit) at a time, for all active paths through the tree. 7. Except for (*FAIL), the backtracking control verbs such as (*PRUNE) are not supported. (*FAIL) is supported, and behaves like a failing negative assertion. 8. The PCRE2_MATCH_INVALID_UTF option for pcre2_compile() is not sup- ported by pcre2_dfa_match(). ADVANTAGES OF THE ALTERNATIVE ALGORITHM The main advantage of the alternative algorithm is that all possible matches (at a single point in the subject) are automatically found, and in particular, the longest match is found. To find more than one match at the same point using the standard algorithm, you have to do kludgy things with callouts. Partial matching is possible with this algorithm, though it has some limitations. The pcre2partial documentation gives details of partial matching and discusses multi-segment matching. DISADVANTAGES OF THE ALTERNATIVE ALGORITHM The alternative algorithm suffers from a number of disadvantages: 1. It is substantially slower than the standard algorithm. This is partly because it has to search for all possible matches, but is also because it is less susceptible to optimization. 2. Capturing parentheses and other features such as backreferences that rely on them are not supported. 3. Matching within invalid UTF strings is not supported. 4. Although atomic groups are supported, their use does not provide the performance advantage that it does for the standard algorithm. 5. JIT optimization is not supported. AUTHOR Philip Hazel Retired from University Computing Service Cambridge, England. 
REVISION Last updated: 22 February 2025 Copyright (c) 1997-2024 University of Cambridge. PCRE2 10.48-DEV 22 February 2025 PCRE2MATCHING(3) ------------------------------------------------------------------------------ PCRE2PARTIAL(3) Library Functions Manual PCRE2PARTIAL(3) NAME PCRE2 - Perl-compatible regular expressions (revised API) PARTIAL MATCHING IN PCRE2 In normal use of PCRE2, if there is a match up to the end of a subject string, but more characters are needed to match the entire pattern, PCRE2_ERROR_NOMATCH is returned, just like any other failing match. There are circumstances where it might be helpful to distinguish this "partial match" case. One example is an application where the subject string is very long, and not all available at once. The requirement here is to be able to do the matching segment by segment, but special action is needed when a matched substring spans the boundary between two segments. Another example is checking a user input string as it is typed, to en- sure that it conforms to a required format. Invalid characters can be immediately diagnosed and rejected, giving instant feedback. Partial matching is a PCRE2-specific feature; it is not Perl-compati- ble. It is requested by setting one of the PCRE2_PARTIAL_HARD or PCRE2_PARTIAL_SOFT options when calling a matching function. The dif- ference between the two options is whether or not a partial match is preferred to an alternative complete match, though the details differ between the two types of matching function. If both options are set, PCRE2_PARTIAL_HARD takes precedence. If you want to use partial matching with just-in-time optimized code, as well as setting a partial match option for the matching function, you must also call pcre2_jit_compile() with one or both of these op- tions: PCRE2_JIT_PARTIAL_HARD PCRE2_JIT_PARTIAL_SOFT PCRE2_JIT_COMPLETE should also be set if you are going to run non-par- tial matches on the same pattern. Separate code is compiled for each mode. 
If the appropriate JIT mode has not been compiled, interpretive matching code is used. Setting a partial matching option disables two of PCRE2's standard op- timization hints. PCRE2 remembers the last literal code unit in a pat- tern, and abandons matching immediately if it is not present in the subject string. This optimization cannot be used for a subject string that might match only partially. PCRE2 also remembers a minimum length of a matching string, and does not bother to run the matching function on shorter strings. This optimization is also disabled for partial matching. REQUIREMENTS FOR A PARTIAL MATCH A possible partial match occurs during matching when the end of the subject string is reached successfully, but either more characters are needed to complete the match, or the addition of more characters might change what is matched. Example 1: if the pattern is /abc/ and the subject is "ab", more char- acters are definitely needed to complete a match. In this case both hard and soft matching options yield a partial match. Example 2: if the pattern is /ab+/ and the subject is "ab", a complete match can be found, but the addition of more characters might change what is matched. In this case, only PCRE2_PARTIAL_HARD returns a par- tial match; PCRE2_PARTIAL_SOFT returns the complete match. On reaching the end of the subject, when PCRE2_PARTIAL_HARD is set, if the next pattern item is \z, \Z, \b, \B, or $ there is always a partial match. Otherwise, for both options, the next pattern item must be one that inspects a character, and at least one of the following must be true: (1) At least one character has already been inspected. An inspected character need not form part of the final matched string; lookbehind assertions and the \K escape sequence provide ways of inspecting char- acters before the start of a matched string. (2) The pattern contains one or more lookbehind assertions. 
This condi- tion exists in case there is a lookbehind that inspects characters be- fore the start of the match. (3) There is a special case when the whole pattern can match an empty string. When the starting point is at the end of the subject, the empty string match is a possibility, and if PCRE2_PARTIAL_SOFT is set and neither of the above conditions is true, it is returned. However, because adding more characters might result in a non-empty match, PCRE2_PARTIAL_HARD returns a partial match, which in this case means "there is going to be a match at this point, but until some more char- acters are added, we do not know if it will be an empty string or some- thing longer". PARTIAL MATCHING USING pcre2_match() When a partial matching option is set, the result of calling pcre2_match() can be one of the following: A successful match A complete match has been found, starting and ending within this sub- ject. PCRE2_ERROR_NOMATCH No match can start anywhere in this subject. PCRE2_ERROR_PARTIAL Adding more characters may result in a complete match that uses one or more characters from the end of this subject. When a partial match is returned, the first two elements in the ovector point to the portion of the subject that was matched, but the values in the rest of the ovector are undefined. The appearance of \K in the pat- tern has no effect for a partial match. Consider this pattern: /abc\K123/ If it is matched against "456abc123xyz" the result is a complete match, and the ovector defines the matched string as "123", because \K resets the "start of match" point. However, if a partial match is requested and the subject string is "456abc12", a partial match is found for the string "abc12", because all these characters are needed for a subse- quent re-match with additional characters. If there is more than one partial match, the first one that was found provides the data that is returned. 
Consider this pattern: /123\w+X|dogY/ If this is matched against the subject string "abc123dog", both alter- natives fail to match, but the end of the subject is reached during matching, so PCRE2_ERROR_PARTIAL is returned. The offsets are set to 3 and 9, identifying "123dog" as the first partial match. (In this exam- ple, there are two partial matches, because "dog" on its own partially matches the second alternative.) How a partial match is processed by pcre2_match() What happens when a partial match is identified depends on which of the two partial matching options is set. If PCRE2_PARTIAL_HARD is set, PCRE2_ERROR_PARTIAL is returned as soon as a partial match is found, without continuing to search for possible complete matches. This option is "hard" because it prefers an earlier partial match over a later complete match. For this reason, the assump- tion is made that the end of the supplied subject string is not the true end of the available data, which is why \z, \Z, \b, \B, and $ al- ways give a partial match. If PCRE2_PARTIAL_SOFT is set, the partial match is remembered, but matching continues as normal, and other alternatives in the pattern are tried. If no complete match can be found, PCRE2_ERROR_PARTIAL is re- turned instead of PCRE2_ERROR_NOMATCH. This option is "soft" because it prefers a complete match over a partial match. All the various matching items in a pattern behave as if the subject string is potentially com- plete; \z, \Z, and $ match at the end of the subject, as normal, and for \b and \B the end of the subject is treated as a non-alphanumeric. The difference between the two partial matching options can be illus- trated by a pattern such as: /dog(sbody)?/ This matches either "dog" or "dogsbody", greedily (that is, it prefers the longer string if possible). If it is matched against the string "dog" with PCRE2_PARTIAL_SOFT, it yields a complete match for "dog". However, if PCRE2_PARTIAL_HARD is set, the result is PCRE2_ERROR_PAR- TIAL. 
On the other hand, if the pattern is made ungreedy the result is different: /dog(sbody)??/ In this case the result is always a complete match because that is found first, and matching never continues after finding a complete match. It might be easier to follow this explanation by thinking of the two patterns like this: /dog(sbody)?/ is the same as /dogsbody|dog/ /dog(sbody)??/ is the same as /dog|dogsbody/ The second pattern will never match "dogsbody", because it will always find the shorter match first. Example of partial matching using pcre2test The pcre2test data modifiers partial_hard (or ph) and partial_soft (or ps) set PCRE2_PARTIAL_HARD and PCRE2_PARTIAL_SOFT, respectively, when calling pcre2_match(). Here is a run of pcre2test using a pattern that matches the whole subject in the form of a date: re> /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/ data> 25dec3\=ph Partial match: 25dec3 data> 3ju\=ph Partial match: 3ju data> 3juj\=ph No match This example gives the same results for both hard and soft partial matching options. Here is an example where there is a difference: re> /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/ data> 25jun04\=ps 0: 25jun04 1: jun data> 25jun04\=ph Partial match: 25jun04 With PCRE2_PARTIAL_SOFT, the subject is matched completely. For PCRE2_PARTIAL_HARD, however, the subject is assumed not to be complete, so there is only a partial match. MULTI-SEGMENT MATCHING WITH pcre2_match() PCRE was not originally designed with multi-segment matching in mind. However, over time, features (including partial matching) that make multi-segment matching possible have been added. A very long string can be searched segment by segment by calling pcre2_match() repeatedly, with the aim of achieving the same results that would happen if the en- tire string was available for searching all the time. 
Normally, the strings that are being sought are much shorter than each individual segment, and are in the middle of very long strings, so the pattern is normally not anchored. Special logic must be implemented to handle a matched substring that spans a segment boundary. PCRE2_PARTIAL_HARD should be used, because it returns a partial match at the end of a segment whenever there is the possibility of changing the match by adding more characters. The PCRE2_NOTBOL option should also be set for all but the first segment. When a partial match occurs, the next segment must be added to the cur- rent subject and the match re-run, using the startoffset argument of pcre2_match() to begin at the point where the partial match started. For example: re> /\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d/ data> ...the date is 23ja\=ph Partial match: 23ja data> ...the date is 23jan19 and on that day...\=offset=15 0: 23jan19 1: jan Note the use of the offset modifier to start the new match where the partial match was found. In this example, the next segment was added to the one in which the partial match was found. This is the most straightforward approach, typically using a memory buffer that is twice the size of each segment. After a partial match, the first half of the buffer is discarded, the second half is moved to the start of the buffer, and a new segment is added before repeating the match as in the example above. After a no match, the entire buffer can be discarded. If there are memory constraints, you may want to discard text that pre- cedes a partial match before adding the next segment. Unfortunately, this is not at present straightforward. In cases such as the above, where the pattern does not contain any lookbehinds, it is sufficient to retain only the partially matched substring. However, if the pattern contains a lookbehind assertion, characters that precede the start of the partial match may have been inspected during the matching process. 
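When pcre2test displays a partial match, it indicates these characters with '<' if the allusedtext modifier is set: re> "(?<=123)abc" data> xx123ab\=ph,allusedtext Partial match: 123ab <<< However, the allusedtext modifier is not available for JIT matching, because JIT matching does not record the first (or last) consulted characters. For this reason, this information is not available via the API. It is therefore not possible in general to obtain the exact number of characters that must be retained in order to get the right match re- sult. If you cannot retain the entire segment, you must find some heuristic way of choosing. If you know the approximate length of the matching substrings, you can use that to decide how much text to retain. The only lookbehind infor- mation that is currently available via the API is the length of the longest individual lookbehind in a pattern, but this can be misleading if there are nested lookbehinds. The value returned by calling pcre2_pattern_info() with the PCRE2_INFO_MAXLOOKBEHIND option is the maximum number of characters (not code units) that any individual look- behind moves back when it is processed. A pattern such as "(?<=(?<!b)a)" has a maximum lookbehind value of one, but inspects two characters before its starting point. In a non-UTF or a 32-bit case, moving back is just a subtraction, but in UTF-8 or UTF-16 you have to count characters while moving back through the code units. PARTIAL MATCHING USING pcre2_dfa_match() The DFA function moves along the subject string character by character, without backtracking, searching for all possible matches simultaneously. If the end of the subject is reached before the end of the pattern, there is the possibility of a partial match. When PCRE2_PARTIAL_SOFT is set, PCRE2_ERROR_PARTIAL is returned only if there have been no complete matches. Otherwise, the complete matches are returned. If PCRE2_PARTIAL_HARD is set, a partial match takes precedence over any complete matches. The portion of the string that was matched when the longest partial match was found is set as the first matching string. Because the DFA function always searches for all possible matches, and there is no difference between greedy and ungreedy repetition, its behaviour is different from that of pcre2_match(). Consider the string "dog" matched against this ungreedy pattern: /dog(sbody)??/ Whereas the standard function stops as soon as it finds the complete match for "dog", the DFA function also finds the partial match for "dogsbody", and so returns that when PCRE2_PARTIAL_HARD is set. MULTI-SEGMENT MATCHING WITH pcre2_dfa_match() When a partial match has been found using the DFA matching function, it is possible to continue the match by providing additional subject data and calling the function again with the same compiled regular expression, this time setting the PCRE2_DFA_RESTART option. You must pass the same working space as before, because this is where details of the previous partial match are stored. You can set the PCRE2_PARTIAL_SOFT or PCRE2_PARTIAL_HARD options with PCRE2_DFA_RESTART to continue partial matching over multiple segments. Here is an example using pcre2test: re> /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/ data> 23ja\=dfa,ps Partial match: 23ja data> n05\=dfa,dfa_restart 0: n05 The first call has "23ja" as the subject, and requests partial match- ing; the second call has "n05" as the subject for the continued (restarted) match. Notice that when the match is complete, only the last part is shown; PCRE2 does not retain the previously partially- matched string. It is up to the calling program to do that if it needs to. This means that, for an unanchored pattern, if a continued match fails, it is not possible to try again at a new starting point. All this facility is capable of doing is continuing with the previous match attempt. 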
For example, consider this pattern: 1234|3789 If the first part of the subject is "ABC123", a partial match of the first alternative is found at offset 3. There is no partial match for the second alternative, because such a match does not start at the same point in the subject string. Attempting to continue with the string "7890" does not yield a match because only those alternatives that match at one point in the subject are remembered. Depending on the ap- plication, this may or may not be what you want. If you do want to allow for starting again at the next character, one way of doing it is to retain some or all of the segment and try a new complete match, as described for pcre2_match() above. Another possibil- ity is to work with two buffers. If a partial match at offset n in the first buffer is followed by "no match" when PCRE2_DFA_RESTART is used on the second buffer, you can then try a new match starting at offset n+1 in the first buffer. AUTHOR Philip Hazel Retired from University Computing Service Cambridge, England. REVISION Last updated: 27 November 2024 Copyright (c) 1997-2019 University of Cambridge. PCRE2 10.48-DEV 27 November 2024 PCRE2PARTIAL(3) ------------------------------------------------------------------------------ PCRE2PATTERN(3) Library Functions Manual PCRE2PATTERN(3) NAME PCRE2 - Perl-compatible regular expressions (revised API) PCRE2 REGULAR EXPRESSION DETAILS The syntax and semantics of the regular expressions that are supported by PCRE2 are described in detail below. There is a quick-reference syn- tax summary in the pcre2syntax page. PCRE2 tries to match Perl syntax and semantics as closely as it can. PCRE2 also supports some alterna- tive regular expression syntax that does not conflict with the Perl syntax in order to provide some compatibility with regular expressions in Python, .NET, and Oniguruma. There are in addition some options that enable alternative syntax and semantics that are not the same as in Perl. 
Perl's regular expressions are described in its own documentation, and regular expressions in general are covered in a number of books, some of which have copious examples. Jeffrey Friedl's "Mastering Regular Ex- pressions", published by O'Reilly, covers regular expressions in great detail. This description of PCRE2's regular expressions is intended as reference material. This document discusses the regular expression patterns that are sup- ported by PCRE2 when its main matching function, pcre2_match(), is used. PCRE2 also has an alternative matching function, pcre2_dfa_match(), which matches using a different algorithm that is not Perl-compatible. Some of the features discussed below are not available when DFA matching is used. The advantages and disadvantages of the alternative function, and how it differs from the normal func- tion, are discussed in the pcre2matching page. EBCDIC CHARACTER CODES Most computers use ASCII or Unicode for encoding characters, and PCRE2 assumes this by default. However, it can be compiled to run in an envi- ronment that uses the EBCDIC code, which is the case for some IBM main- frame operating systems. In the sections below, character code values are ASCII or Unicode; in an EBCDIC environment these characters may have different code values, and there are no code points greater than 255. Differences in behaviour when PCRE2 is running in an EBCDIC envi- ronment are described in the section "EBCDIC environments" below, which you can ignore unless you really are in an EBCDIC environment. SPECIAL START-OF-PATTERN ITEMS A number of options that can be passed to pcre2_compile() can also be set by special items at the start of a pattern. These are not Perl-com- patible, but are provided to make these options accessible to pattern writers who are not able to change the program that processes the pat- tern. 
Any number of these items may appear, but they must all be to- gether right at the start of the pattern string, and the letters must be in upper case. UTF support In the 8-bit and 16-bit PCRE2 libraries, characters may be coded either as single code units, or as multiple UTF-8 or UTF-16 code units. UTF-32 can be specified for the 32-bit library, in which case it constrains the character values to valid Unicode code points. To process UTF strings, PCRE2 must be built to include Unicode support (which is the default). When using UTF strings you must either call the compiling function with one or both of the PCRE2_UTF or PCRE2_MATCH_INVALID_UTF options, or the pattern must start with the special sequence (*UTF), which is equivalent to setting the relevant PCRE2_UTF. How setting a UTF mode affects pattern matching is mentioned in several places below. There is also a summary of features in the pcre2unicode page. Some applications that allow their users to supply patterns may wish to restrict them to non-UTF data for security reasons. If the PCRE2_NEVER_UTF option is passed to pcre2_compile(), (*UTF) is not al- lowed, and its appearance in a pattern causes an error. Unicode property support Another special sequence that may appear at the start of a pattern is (*UCP). This has the same effect as setting the PCRE2_UCP option: it causes sequences such as \d and \w to use Unicode properties to deter- mine character types, instead of recognizing only characters with codes less than 256 via a lookup table. If also causes upper/lower casing op- erations to use Unicode properties for characters with code points greater than 127, even when UTF is not set. These behaviours can be changed within the pattern; see the section entitled "Internal Option Setting" below. Some applications that allow their users to supply patterns may wish to restrict them for security reasons. 
If the PCRE2_NEVER_UCP option is passed to pcre2_compile(), (*UCP) is not allowed, and its appearance in a pattern causes an error. Locking out empty string matching Starting a pattern with (*NOTEMPTY) or (*NOTEMPTY_ATSTART) has the same effect as passing the PCRE2_NOTEMPTY or PCRE2_NOTEMPTY_ATSTART option to whichever matching function is subsequently called to match the pat- tern. These options lock out the matching of empty strings, either en- tirely, or only at the start of the subject. Disabling auto-possessification If a pattern starts with (*NO_AUTO_POSSESS), it has the same effect as setting the PCRE2_NO_AUTO_POSSESS option, or calling pcre2_set_opti- mize() with a PCRE2_AUTO_POSSESS_OFF directive. This stops PCRE2 from making quantifiers possessive when what follows cannot match the re- peated item. For example, by default a+b is treated as a++b. For more details, see the pcre2api documentation. Disabling start-up optimizations If a pattern starts with (*NO_START_OPT), it has the same effect as setting the PCRE2_NO_START_OPTIMIZE option, or calling pcre2_set_opti- mize() with a PCRE2_START_OPTIMIZE_OFF directive. This disables several optimizations for quickly reaching "no match" results. For more de- tails, see the pcre2api documentation. Disabling automatic anchoring If a pattern starts with (*NO_DOTSTAR_ANCHOR), it has the same effect as setting the PCRE2_NO_DOTSTAR_ANCHOR option, or calling pcre2_set_op- timize() with a PCRE2_DOTSTAR_ANCHOR_OFF directive. This disables op- timizations that apply to patterns whose top-level branches all start with .* (match any number of arbitrary characters). For more details, see the pcre2api documentation. Disabling JIT compilation If a pattern that starts with (*NO_JIT) is successfully compiled, an attempt by the application to apply the JIT optimization by calling pcre2_jit_compile() is ignored. 
Setting match resource limits The pcre2_match() function contains a counter that is incremented every time it goes round its main loop. The caller of pcre2_match() can set a limit on this counter, which therefore limits the amount of computing resource used for a match. The maximum depth of nested backtracking can also be limited; this indirectly restricts the amount of heap memory that is used, but there is also an explicit memory limit that can be set. These facilities are provided to catch runaway matches that are pro- voked by patterns with huge matching trees. A common example is a pat- tern with nested unlimited repeats applied to a long string that does not match. When one of these limits is reached, pcre2_match() gives an error return. The limits can also be set by items at the start of the pattern of the form (*LIMIT_HEAP=d) (*LIMIT_MATCH=d) (*LIMIT_DEPTH=d) where d is any number of decimal digits. However, the value of the set- ting must be less than the value set (or defaulted) by the caller of pcre2_match() for it to have any effect. In other words, the pattern writer can lower the limits set by the programmer, but not raise them. If there is more than one setting of one of these limits, the lower value is used. The heap limit is specified in kibibytes (units of 1024 bytes). Prior to release 10.30, LIMIT_DEPTH was called LIMIT_RECURSION. This name is still recognized for backwards compatibility. The heap limit applies only when the pcre2_match() or pcre2_dfa_match() interpreters are used for matching. It does not apply to JIT. The match limit is used (but in a different way) when JIT is being used, or when pcre2_dfa_match() is called, to limit computing resource usage by those matching functions. The depth limit is ignored by JIT but is relevant for DFA matching, which uses function recursion for recursions within the pattern and for lookaround assertions and atomic groups. In this case, the depth limit controls the depth of such recursion. 
Newline conventions PCRE2 supports six different conventions for indicating line breaks in strings: a single CR (carriage return) character, a single LF (line- feed) character, the two-character sequence CRLF, any of the three pre- ceding, any Unicode newline sequence, or the NUL character (binary zero). The pcre2api page has further discussion about newlines, and shows how to set the newline convention when calling pcre2_compile(). It is also possible to specify a newline convention by starting a pat- tern string with one of the following sequences: (*CR) carriage return (*LF) linefeed (*CRLF) carriage return, followed by linefeed (*ANYCRLF) any of the three above (*ANY) all Unicode newline sequences (*NUL) the NUL character (binary zero) These override the default and the options given to the compiling func- tion. For example, on a Unix system where LF is the default newline se- quence, the pattern (*CR)a.b changes the convention to CR. That pattern matches "a\nb" because LF is no longer a newline. If more than one of these settings is present, the last one is used. The newline convention affects where the circumflex and dollar asser- tions are true. It also affects the interpretation of the dot metachar- acter when PCRE2_DOTALL is not set, and the behaviour of \N when not followed by an opening brace. However, it does not affect what the \R escape sequence matches. By default, this is any Unicode newline se- quence, for Perl compatibility. However, this can be changed; see the next section and the description of \R in the section entitled "Newline sequences" below. A change of \R setting can be combined with a change of newline convention. Specifying what \R matches It is possible to restrict \R to match only CR, LF, or CRLF (instead of the complete set of Unicode line endings) by setting the option PCRE2_BSR_ANYCRLF at compile time. This effect can also be achieved by starting a pattern with (*BSR_ANYCRLF). 
For completeness, (*BSR_UNI- CODE) is also recognized, corresponding to PCRE2_BSR_UNICODE. CHARACTERS AND METACHARACTERS A regular expression is a pattern that is matched against a subject string from left to right. Most characters stand for themselves in a pattern, and match the corresponding characters in the subject. As a trivial example, the pattern The quick brown fox matches a portion of a subject string that is identical to itself. When caseless matching is specified (the PCRE2_CASELESS option or (?i) within the pattern), letters are matched independently of case. Note that there are two ASCII characters, K and S, that, in addition to their lower case ASCII equivalents, are case-equivalent with Unicode U+212A (Kelvin sign) and U+017F (long S) respectively when either PCRE2_UTF or PCRE2_UCP is set, unless the PCRE2_EXTRA_CASELESS_RESTRICT option is in force (either passed to pcre2_compile() or set by (*CASE- LESS_RESTRICT) or (?r) within the pattern). If the PCRE2_EXTRA_TURK- ISH_CASING option is in force (either passed to pcre2_compile() or set by (*TURKISH_CASING) within the pattern), then the 'i' letters are matched according to Turkish and Azeri languages. The power of regular expressions comes from the ability to include wild cards, character classes, alternatives, and repetitions in the pattern. These are encoded in the pattern by the use of metacharacters, which do not stand for themselves but instead are interpreted in some special way. There are two different sets of metacharacters: those that are recog- nized anywhere in the pattern except within square brackets, and those that are recognized within square brackets. Outside square brackets, the metacharacters are as follows: \ general escape character with several uses ^ assert start of string (or line, in multiline mode) $ assert end of string (or line, in multiline mode) . 
match any character except newline (by default) [ start character class definition | start of alternative branch ( start group or control verb ) end group or control verb * 0 or more quantifier + 1 or more quantifier; also "possessive quantifier" ? 0 or 1 quantifier; also quantifier minimizer { potential start of min/max quantifier Brace characters { and } are also used to enclose data for construc- tions such as \g{2} or \k{name}. In almost all uses of braces, space and/or horizontal tab characters that follow { or precede } are allowed and are ignored. In the case of quantifiers, they may also appear be- fore or after the comma. The exception to this is \u{...} which is an ECMAScript compatibility feature that is recognized only when the PCRE2_EXTRA_ALT_BSUX option is set. ECMAScript does not ignore such white space; it causes the item to be interpreted as literal. Part of a pattern that is in square brackets is called a "character class". In a character class the only metacharacters are: \ general escape character ^ negate the class, but only if the first character - indicates character range [ POSIX character class (if followed by POSIX syntax) ] terminates the character class If a pattern is compiled with the PCRE2_EXTENDED option, most white space in the pattern, other than in a character class, within a \Q...\E sequence, or between a # outside a character class and the next new- line, inclusive, is ignored. An escaping backslash can be used to in- clude a white space or a # character as part of the pattern. If the PCRE2_EXTENDED_MORE option is set, the same applies, but in addition unescaped space and horizontal tab characters are ignored inside a character class. Note: only these two characters are ignored, not the full set of pattern white space characters that are ignored outside a character class. Option settings can be changed within a pattern; see the section entitled "Internal Option Setting" below. 
The following sections describe the use of each of the metacharacters. BACKSLASH The backslash character has several uses. Firstly, if it is followed by a character that is not a digit or a letter, it takes away any special meaning that character may have. This use of backslash as an escape character applies both inside and outside character classes. For example, if you want to match a * character, you must write \* in the pattern. This escaping action applies whether or not the following character would otherwise be interpreted as a metacharacter, so it is always safe to precede a non-alphanumeric with backslash to specify that it stands for itself. In particular, if you want to match a back- slash, you write \\. Only ASCII digits and letters have any special meaning after a back- slash. All other characters (in particular, those whose code points are greater than 127) are treated as literals. If you want to treat all characters in a sequence as literals, you can do so by putting them between \Q and \E. Note that this includes white space even when the PCRE2_EXTENDED option is set so that most other white space is ignored. The behaviour is different from Perl in that $ and @ are handled as literals in \Q...\E sequences in PCRE2, whereas in Perl, $ and @ cause variable interpolation. Also, Perl does "double- quotish backslash interpolation" on any backslashes between \Q and \E which, its documentation says, "may lead to confusing results". PCRE2 treats a backslash between \Q and \E just like any other character. Note the following examples: Pattern PCRE2 matches Perl matches \Qabc$xyz\E abc$xyz abc followed by the contents of $xyz \Qabc\$xyz\E abc\$xyz abc\$xyz \Qabc\E\$\Qxyz\E abc$xyz abc$xyz \QA\B\E A\B A\B \Q\\E \ \\E The \Q...\E sequence is recognized both inside and outside character classes. An isolated \E that is not preceded by \Q is ignored. 
If \Q is not followed by \E later in the pattern, the literal interpretation continues to the end of the pattern (that is, \E is assumed at the end). If the isolated \Q is inside a character class, this causes an error, because the character class is then not terminated by a closing square bracket. Another difference from Perl is that any appearance of \Q or \E inside what might otherwise be a quantifier causes PCRE2 not to recognize the sequence as a quantifier. Perl recognizes a quantifier if (redundantly) either of the numbers is inside \Q...\E, but not if the separating comma is. When not recognized as a quantifier a sequence such as {\Q1\E,2} is treated as the literal string "{1,2}". Non-printing characters A second use of backslash provides a way of encoding non-printing char- acters in patterns in a visible manner. There is no restriction on the appearance of non-printing characters in a pattern, but when a pattern is being prepared by text editing, it is often easier to use one of the following escape sequences instead of the binary character it repre- sents. In an ASCII or Unicode environment, these escapes are as fol- lows: \a alarm, that is, the BEL character (hex 07) \cx "control-x", where x is a non-control ASCII character \e escape (hex 1B) \f form feed (hex 0C) \n linefeed (hex 0A) \r carriage return (hex 0D) (but see below) \t tab (hex 09) \0dd character with octal code 0dd \ddd character with octal code ddd, or back reference \o{ddd..} character with octal code ddd.. \xhh character with hex code hh \x{hhh..} character with hex code hhh.. \N{U+hhh..} character with Unicode hex code point hhh.. A description of how back references work is given later, following the discussion of parenthesized groups. By default, after \x that is not followed by {, one or two hexadecimal digits are read (letters can be in upper or lower case). If the charac- ter that follows \x is neither { nor a hexadecimal digit, an error oc- curs. 
This is different from Perl's default behaviour, which generates a NUL character, but is in line with the behaviour of Perl's 'strict' mode in re. Any number of hexadecimal digits may appear between \x{ and }. If a character other than a hexadecimal digit appears between \x{ and }, or if there is no terminating }, an error occurs. Characters whose code points are less than 256 can be defined by either of the two syntaxes for \x or by an octal sequence. There is no differ- ence in the way they are handled. For example, \xdc is exactly the same as \x{dc} or \334. However, using the braced versions does make such sequences easier to read. Support is available for some ECMAScript (aka JavaScript) escape se- quences via two compile-time options. If PCRE2_ALT_BSUX is set, the se- quence \x followed by { is not recognized. Only if \x is followed by two hexadecimal digits is it recognized as a character escape. Other- wise it is interpreted as a literal "x" character. In this mode, sup- port for code points greater than 256 is provided by \u, which must be followed by four hexadecimal digits; otherwise it is interpreted as a literal "u" character. PCRE2_EXTRA_ALT_BSUX has the same effect as PCRE2_ALT_BSUX and, in ad- dition, \u{hhh..} is recognized as the character specified by hexadeci- mal code point. There may be any number of hexadecimal digits, but un- like other places that also use curly brackets, spaces are not allowed and would result in the string being interpreted as a literal. This syntax is from ECMAScript 6. The \N{U+hhh..} escape sequence is recognized only when PCRE2 is oper- ating in UTF mode. Perl also uses \N{name} to specify characters by Unicode name; PCRE2 does not support this. Note that when \N is not followed by an opening brace (curly bracket) it has an entirely differ- ent meaning, matching any character that is not a newline. There are some legacy applications where the escape sequence \r is ex- pected to match a newline. 
If the PCRE2_EXTRA_ESCAPED_CR_IS_LF option is set, \r in a pattern is converted to \n so that it matches a LF (linefeed) instead of a CR (carriage return) character. An error occurs if \c is not followed by a character whose ASCII code point is in the range 32 to 126. The precise effect of \cx is as fol- lows: if x is a lower case letter, it is converted to upper case. Then bit 6 of the character (hex 40) is inverted. Thus \cA to \cZ become hex 01 to hex 1A (A is 41, Z is 5A), but \c{ becomes hex 3B ({ is 7B), and \c; becomes hex 7B (; is 3B). If the code unit following \c has a code point less than 32 or greater than 126, a compile-time error occurs. For differences in the way some escapes behave in EBCDIC environments, see section "EBCDIC environments" below. Octal escapes and back references The escape \o must be followed by a sequence of octal digits, enclosed in braces. An error occurs if this is not the case. This escape pro- vides a way of specifying character code points as octal numbers greater than 0777, and it also allows octal numbers and backreferences to be unambiguously distinguished. If braces are not used, after \0 up to two further octal digits are read. However, if the PCRE2_EXTRA_NO_BS0 option is set, at least one more octal digit must follow \0 (use \00 to generate a NUL character). Make sure you supply two digits after the initial zero if the pattern character that follows is itself an octal digit. Inside a character class, when a backslash is followed by any octal digit, up to three octal digits are read to generate a code point. Any subsequent digits stand for themselves. The sequences \8 and \9 are treated as the literal characters "8" and "9". Outside a character class, Perl's handling of a backslash followed by a digit other than 0 is complicated by ambiguity, and Perl has changed over time, causing PCRE2 also to change. 
From PCRE2 release 10.45 there is an option called PCRE2_EXTRA_PYTHON_OCTAL that causes PCRE2 to use Python's unambiguous rules. The next two subsections describe the two sets of rules. For greater clarity and unambiguity, it is best to avoid following \ by a digit greater than zero. Instead, use \o{...} or \x{...} to specify numerical character code points, and \g{...} to specify backreferences. Perl rules for non-class backslash 1-9 All the digits that follow the backslash are read as a decimal number. If the number is less than 10, begins with the digit 8 or 9, or if there are at least that many previous capture groups in the expression, the entire sequence is taken as a back reference. Otherwise, up to three octal digits are read to form a character code. For example: \040 is another way of writing an ASCII space \40 is the same, provided there are fewer than 40 previous capture groups \7 is always a backreference \11 might be a backreference, or another way of writing a tab \011 is always a tab \0113 is a tab followed by the character "3" \113 might be a backreference, otherwise the character with octal code 113 \377 might be a backreference, otherwise the value 255 (decimal) \81 is always a backreference Note that octal values of 100 or greater that are specified using this syntax must not be introduced by a leading zero, because no more than three octal digits are ever read. Python rules for non_class backslash 1-9 If there are at least three octal digits after the backslash, exactly three are read as an octal code point number, but the value must be no greater than \377, even in modes where higher code point values are supported. Any subsequent digits stand for themselves. If there are fewer than three octal digits, the sequence is taken as a decimal back reference. Thus, for example, \12 is always a back reference, indepen- dent of how many captures there are in the pattern. An error is gener- ated for a reference to a non-existent capturing group. 
Constraints on character values Characters that are specified using octal or hexadecimal numbers are limited to certain values, as follows: 8-bit non-UTF mode no greater than 0xff 16-bit non-UTF mode no greater than 0xffff 32-bit non-UTF mode no greater than 0xffffffff All UTF modes no greater than 0x10ffff and a valid code point Invalid Unicode code points are all those in the range 0xd800 to 0xdfff (the so-called "surrogate" code points). The check for these can be disabled by the caller of pcre2_compile() by setting the option PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES. However, this is possible only in UTF-8 and UTF-32 modes, because these values are not representable in UTF-16. Escape sequences in character classes All the sequences that define a single character value can be used both inside and outside character classes. In addition, inside a character class, \b is interpreted as the backspace character (hex 08). When not followed by an opening brace, \N is not allowed in a character class. \B, \R, and \X are not special inside a character class. Like other unrecognized alphabetic escape sequences, they cause an error. Outside a character class, these sequences have different meanings. Unsupported escape sequences In Perl, the sequences \F, \l, \L, \u, and \U are recognized by its string handler and used to modify the case of following characters. By default, PCRE2 does not support these escape sequences in patterns. However, if either of the PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX op- tions is set, \U matches a "U" character, and \u can be used to define a character by code point, as described above. Absolute and relative backreferences The sequence \g followed by a signed or unsigned number, optionally en- closed in braces, is an absolute or relative backreference. A named backreference can be coded as \g{name}. Backreferences are discussed later, following the discussion of parenthesized groups. 
Absolute and relative subroutine calls For compatibility with Oniguruma, the non-Perl syntax \g followed by a name or a number enclosed either in angle brackets or single quotes, is an alternative syntax for referencing a capture group as a subroutine. Details are discussed later. Note that \g{...} (Perl syntax) and \g<...> (Oniguruma syntax) are not synonymous. The former is a backref- erence; the latter is a subroutine call. Generic character types Another use of backslash is for specifying generic character types: \d any decimal digit \D any character that is not a decimal digit \h any horizontal white space character \H any character that is not a horizontal white space character \N any character that is not a newline \s any white space character \S any character that is not a white space character \v any vertical white space character \V any character that is not a vertical white space character \w any "word" character \W any "non-word" character The \N escape sequence has the same meaning as the "." metacharacter when PCRE2_DOTALL is not set, but setting PCRE2_DOTALL does not change the meaning of \N. Note that when \N is followed by an opening brace it has a different meaning. See the section entitled "Non-printing charac- ters" above for details. Perl also uses \N{name} to specify characters by Unicode name; PCRE2 does not support this. Each pair of lower and upper case escape sequences partitions the com- plete set of characters into two disjoint sets. Any given character matches one, and only one, of each pair. The sequences can appear both inside and outside character classes. They each match one character of the appropriate type. If the current matching point is at the end of the subject string, all of them fail, because there is no character to match. The default \s characters are HT (9), LF (10), VT (11), FF (12), CR (13), and space (32), which are defined as white space in the "C" lo- cale. 
This list may vary if locale-specific matching is taking place. For example, in some locales the "non-breaking space" character (\xA0) is recognized as white space, and in others the VT character is not. A "word" character is an underscore or any character that is a letter or digit. By default, the definition of letters and digits is con- trolled by PCRE2's low-valued character tables, and may vary if locale- specific matching is taking place (see "Locale support" in the pcre2api page). For example, in a French locale such as "fr_FR" in Unix-like systems, or "french" in Windows, some character codes greater than 127 are used for accented letters, and these are then matched by \w. The use of locales with Unicode is discouraged. By default, characters whose code points are greater than 127 never match \d, \s, or \w, and always match \D, \S, and \W, although this may be different for characters in the range 128-255 when locale-specific matching is happening. These escape sequences retain their original meanings from before Unicode support was available, mainly for effi- ciency reasons. If the PCRE2_UCP option is set, the behaviour is changed so that Unicode properties are used to determine character types, as follows: \d any character that matches \p{Nd} (decimal digit) \s any character that matches \p{Z} or \h or \v \w any character that matches \p{L}, \p{N}, \p{Mn}, or \p{Pc} The addition of \p{Mn} (non-spacing mark) and the replacement of an ex- plicit test for underscore with a test for \p{Pc} (connector punctua- tion) happened in PCRE2 release 10.43. This brings PCRE2 into line with Perl. The upper case escapes match the inverse sets of characters. Note that \d matches only decimal digits, whereas \w matches any Unicode digit, as well as other character categories. Note also that PCRE2_UCP affects \b, and \B because they are defined in terms of \w and \W. Matching these sequences is noticeably slower when PCRE2_UCP is set. 
The effect of PCRE2_UCP on any one of these escape sequences can be negated by the options PCRE2_EXTRA_ASCII_BSD, PCRE2_EXTRA_ASCII_BSS, and PCRE2_EXTRA_ASCII_BSW, respectively. These options can be set and reset within a pattern by means of an internal option setting (see be- low). The sequences \h, \H, \v, and \V, in contrast to the other sequences, which match only ASCII characters by default, always match a specific list of code points, whether or not PCRE2_UCP is set. The horizontal space characters are: U+0009 Horizontal tab (HT) U+0020 Space U+00A0 Non-break space U+1680 Ogham space mark U+180E Mongolian vowel separator U+2000 En quad U+2001 Em quad U+2002 En space U+2003 Em space U+2004 Three-per-em space U+2005 Four-per-em space U+2006 Six-per-em space U+2007 Figure space U+2008 Punctuation space U+2009 Thin space U+200A Hair space U+202F Narrow no-break space U+205F Medium mathematical space U+3000 Ideographic space The vertical space characters are: U+000A Linefeed (LF) U+000B Vertical tab (VT) U+000C Form feed (FF) U+000D Carriage return (CR) U+0085 Next line (NEL) U+2028 Line separator U+2029 Paragraph separator In 8-bit, non-UTF-8 mode, only the characters with code points less than 256 are relevant. Newline sequences Outside a character class, by default, the escape sequence \R matches any Unicode newline sequence. In 8-bit non-UTF-8 mode \R is equivalent to the following: (?>\r\n|\n|\x0b|\f|\r|\x85) This is an example of an "atomic group", details of which are given be- low. This particular group matches either the two-character sequence CR followed by LF, or one of the single characters LF (linefeed, U+000A), VT (vertical tab, U+000B), FF (form feed, U+000C), CR (car- riage return, U+000D), or NEL (next line, U+0085). Because this is an atomic group, the two-character sequence is treated as a single unit that cannot be split. 
In other modes, two additional characters whose code points are greater than 255 are added: LS (line separator, U+2028) and PS (paragraph sepa- rator, U+2029). Unicode support is not needed for these characters to be recognized. It is possible to restrict \R to match only CR, LF, or CRLF (instead of the complete set of Unicode line endings) by setting the option PCRE2_BSR_ANYCRLF at compile time. (BSR is an abbreviation for "back- slash R".) This can be made the default when PCRE2 is built; if this is the case, the other behaviour can be requested via the PCRE2_BSR_UNI- CODE option. It is also possible to specify these settings by starting a pattern string with one of the following sequences: (*BSR_ANYCRLF) CR, LF, or CRLF only (*BSR_UNICODE) any Unicode newline sequence These override the default and the options given to the compiling func- tion. Note that these special settings, which are not Perl-compatible, are recognized only at the very start of a pattern, and that they must be in upper case. If more than one of them is present, the last one is used. They can be combined with a change of newline convention; for ex- ample, a pattern can start with: (*ANY)(*BSR_ANYCRLF) They can also be combined with the (*UTF) or (*UCP) special sequences. Inside a character class, \R is treated as an unrecognized escape se- quence, and causes an error. Unicode character properties When PCRE2 is built with Unicode support (the default), three addi- tional escape sequences that match characters with specific properties are available. They can be used in any mode, though in 8-bit and 16-bit non-UTF modes these sequences are of course limited to testing charac- ters whose code points are less than U+0100 or U+10000, respectively. In 32-bit non-UTF mode, code points greater than 0x10ffff (the Unicode limit) may be encountered. These are all treated as being in the Un- known script and with an unassigned type. 
Matching characters by Unicode property is not fast, because PCRE2 has to do a multistage table lookup in order to find a character's prop- erty. That is why the traditional escape sequences such as \d and \w do not use Unicode properties in PCRE2 by default, though you can make them do so by setting the PCRE2_UCP option or by starting the pattern with (*UCP). The extra escape sequences that provide property support are: \p{xx} a character with the xx property \P{xx} a character without the xx property \X a Unicode extended grapheme cluster For compatibility with Perl, negation can be specified by including a circumflex between the opening brace and the property. For example, \p{^Lu} is the same as \P{Lu}. In accordance with Unicode's "loose matching" rules, ASCII white space characters, hyphens, and underscores are ignored in the properties rep- resented by xx above. As well as the space character, ASCII white space can be tab, linefeed, vertical tab, formfeed, or carriage return. Some properties are specified as a name only; others as a name and a value, separated by a colon or an equals sign. The names and values consist of ASCII letters and digits (with one Perl-specific exception, see below). They are not case sensitive. Note, however, that the es- capes themselves, \p and \P, are case sensitive. There are abbrevia- tions for many names. The following examples are all equivalent: \p{bidiclass=al} \p{BC=al} \p{ Bidi_Class : AL } \p{ Bi-di class = Al } \P{ ^ Bi-di class = Al } There is support for Unicode script names, Unicode general category properties, "Any", which matches any character (including newline), Bidi_Class, a number of binary (yes/no) properties, and some special PCRE2 properties (described below). Certain other Perl properties such as "InMusicalSymbols" are not supported by PCRE2. Note that \P{Any} does not match any characters, so always causes a match failure. 
Script properties for \p and \P There are three different syntax forms for matching a script. Each Uni- code character has a basic script and, optionally, a list of other scripts ("Script Extensions") with which it is commonly used. Using the Adlam script as an example, \p{sc:Adlam} matches characters whose basic script is Adlam, whereas \p{scx:Adlam} matches, in addition, characters that have Adlam in their extensions list. The full names "script" and "script extensions" for the property types are recognized and, as for all property specifications, an equals sign is an alternative to the colon. If a script name is given without a property type, for example, \p{Adlam}, it is treated as \p{scx:Adlam}. Perl changed to this inter- pretation at release 5.26 and PCRE2 changed at release 10.40. Unassigned characters (and in non-UTF 32-bit mode, characters with code points greater than 0x10FFFF) are assigned the "Unknown" script. Others that are not part of an identified script are lumped together as "Com- mon". The current list of recognized script names and their 4-character abbreviations can be obtained by running this command: pcre2test -LS The general category property for \p and \P Each character has exactly one Unicode general category property, spec- ified by a two-letter abbreviation. If only one letter is specified with \p or \P, it includes all the general category properties that start with that letter. 
In this case, in the absence of negation, the curly brackets in the escape sequence are optional; these two examples have the same effect: \p{L} \pL The following general category property codes are supported: C Other Cc Control Cf Format Cn Unassigned Co Private use Cs Surrogate L Letter Lc Cased letter Ll Lower case letter Lm Modifier letter Lo Other letter Lt Title case letter Lu Upper case letter M Mark Mc Spacing mark Me Enclosing mark Mn Non-spacing mark N Number Nd Decimal number Nl Letter number No Other number P Punctuation Pc Connector punctuation Pd Dash punctuation Pe Close punctuation Pf Final punctuation Pi Initial punctuation Po Other punctuation Ps Open punctuation S Symbol Sc Currency symbol Sk Modifier symbol Sm Mathematical symbol So Other symbol Z Separator Zl Line separator Zp Paragraph separator Zs Space separator Perl originally used the name L& for the Lc property. This is still supported by Perl, but discouraged. PCRE2 also still supports it. This property matches any character that has the Lu, Ll, or Lt property, in other words, any letter that is not classified as a modifier or "other". From release 10.45 of PCRE2 the properties Lu, Ll, and Lt are all treated as Lc when case-independent matching is set by the PCRE2_CASELESS option or (?i) within the pattern. The other properties are not affected by caseless matching. The Cs (Surrogate) property applies only to characters whose code points are in the range U+D800 to U+DFFF. These characters are no dif- ferent to any other character when PCRE2 is not in UTF mode (using the 16-bit or 32-bit library). However, they are not valid in Unicode strings and so cannot be tested by PCRE2 in UTF mode, unless UTF valid- ity checking has been turned off (see the discussion of PCRE2_NO_UTF_CHECK in the pcre2api page). The long synonyms for property names that Perl supports (such as \p{Letter}) are not supported by PCRE2, nor is it permitted to prefix any of these properties with "Is". 
No character that is in the Unicode table has the Cn (unassigned) prop- erty. Instead, this property is assumed for any code point that is not in the Unicode table. Binary (yes/no) properties for \p and \P Unicode defines a number of binary properties, that is, properties whose only values are true or false. You can obtain a list of those that are recognized by \p and \P, along with their abbreviations, by running this command: pcre2test -LP The Bidi_Class property for \p and \P \p{Bidi_Class:<class>} matches a character with the given class \p{BC:<class>} matches a character with the given class The recognized classes are: AL Arabic letter AN Arabic number B paragraph separator BN boundary neutral CS common separator EN European number ES European separator ET European terminator FSI first strong isolate L left-to-right LRE left-to-right embedding LRI left-to-right isolate LRO left-to-right override NSM non-spacing mark ON other neutral PDF pop directional format PDI pop directional isolate R right-to-left RLE right-to-left embedding RLI right-to-left isolate RLO right-to-left override S segment separator WS white space As in all property specifications, an equals sign may be used instead of a colon and the class names are case-insensitive. Only the short names listed above are recognized; PCRE2 does not at present support any long alternatives. Extended grapheme clusters The \X escape matches any number of Unicode characters that form an "extended grapheme cluster", and treats the sequence as an atomic group (see below). Unicode supports various kinds of composite character by giving each character a grapheme breaking property, and having rules that use these properties to define the boundaries of extended grapheme clusters. The rules are defined in Unicode Standard Annex 29, "Unicode Text Segmentation". Unicode 11.0.0 abandoned the use of some previous properties that had been used for emojis. Instead it introduced vari- ous emoji-specific properties.
PCRE2 uses only the Extended Picto- graphic property. \X always matches at least one character. Then it decides whether to add additional characters according to the following rules for ending a cluster: 1. End at the end of the subject string. 2. Do not end between CR and LF; otherwise end after any control char- acter. 3. Do not break Hangul (a Korean script) syllable sequences. Hangul characters are of five types: L, V, T, LV, and LVT. An L character may be followed by an L, V, LV, or LVT character; an LV or V character may be followed by a V or T character; an LVT or T character may be fol- lowed only by a T character. 4. Do not end before extending characters or spacing marks or the zero- width joiner (ZWJ) character. Characters with the "mark" property al- ways have the "extend" grapheme breaking property. 5. Do not end after prepend characters. 6. Do not end within emoji modifier sequences or emoji ZWJ (zero-width joiner) sequences. An emoji ZWJ sequence consists of a character with the Extended_Pictographic property, optionally followed by one or more characters with the Extend property, followed by the ZWJ character, followed by another Extended_Pictographic character. 7. Do not break within emoji flag sequences. That is, do not break be- tween regional indicator (RI) characters if there are an odd number of RI characters before the break point. 8. Otherwise, end the cluster. PCRE2's additional properties As well as the standard Unicode properties described above, PCRE2 sup- ports four more that make it possible to convert traditional escape se- quences such as \w and \s to use Unicode properties. PCRE2 uses these non-standard, non-Perl properties internally when PCRE2_UCP is set. However, they may also be used explicitly. These properties are: Xan Any alphanumeric character Xps Any POSIX space character Xsp Any Perl space character Xwd Any Perl "word" character Xan matches characters that have either the L (letter) or the N (num- ber) property. 
Xps matches the characters tab, linefeed, vertical tab, form feed, or carriage return, and any other character that has the Z (separator) property (this includes the space character). Xsp is the same as Xps; in PCRE1 it used to exclude vertical tab, for Perl compat- ibility, but Perl changed. Xwd matches the same characters as Xan, plus those that match Mn (non-spacing mark) or Pc (connector punctuation, which includes underscore). There is another non-standard property, Xuc, which matches any charac- ter that can be represented by a Universal Character Name in C++ and other programming languages. These are the characters $, @, ` (grave accent), and all characters with Unicode code points greater than or equal to U+00A0, except for the surrogates U+D800 to U+DFFF. Note that most base (ASCII) characters are excluded. (Universal Character Names are of the form \uHHHH or \UHHHHHHHH where H is a hexadecimal digit. Note that the Xuc property does not match these sequences but the char- acters that they represent.) Resetting the match start In normal use, the escape sequence \K causes any previously matched characters not to be included in the final matched sequence that is re- turned. For example, the pattern: foo\Kbar matches "foobar", but reports that it has matched "bar". \K does not interact with anchoring in any way. The pattern: ^foo\Kbar matches only when the subject begins with "foobar" (in single line mode), though it again reports the matched string as "bar". This fea- ture is similar to a lookbehind assertion (described below), but the part of the pattern that precedes \K is not constrained to match a lim- ited number of characters, as is required for a lookbehind assertion. The use of \K does not interfere with the setting of captured sub- strings. For example, when the pattern (foo)\Kbar matches "foobar", the first substring is still set to "foo". From version 5.32.0 Perl forbids the use of \K in lookaround asser- tions. 
From release 10.38 PCRE2 also forbids this by default. However, the PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK option can be used when calling pcre2_compile() to re-enable the previous behaviour. When this option is set, \K is acted upon when it occurs inside positive assertions, but is ignored in negative assertions. Note that when a pattern such as (?=ab\K) matches, the reported start of the match can be greater than the end of the match. Using \K in a lookbehind assertion at the start of a pattern can also lead to odd effects. For example, consider this pattern: (?<=\Kfoo)bar If the subject is "foobar", a call to pcre2_match() with a starting offset of 3 succeeds and reports the matching string as "foobar", that is, the start of the reported match is earlier than where the match started. Simple assertions The final use of backslash is for certain simple assertions. An asser- tion specifies a condition that has to be met at a particular point in a match, without consuming any characters from the subject string. The use of groups for more complicated assertions is described below. The backslashed assertions are: \b matches at a word boundary \B matches when not at a word boundary \A matches at the start of the subject \Z matches at the end of the subject also matches before a newline at the end of the subject \z matches only at the end of the subject \G matches at the first matching position in the subject Inside a character class, \b has a different meaning; it matches the backspace character. If any other of these assertions appears in a character class, an "invalid escape sequence" error is generated. A word boundary is a position in the subject string where the current character and the previous character do not both match \w or \W (i.e. one matches \w and the other matches \W), or the start or end of the string if the first or last character matches \w, respectively. 
When PCRE2 is built with Unicode support, the meanings of \w and \W can be changed by setting the PCRE2_UCP option. When this is done, it also af- fects \b and \B. Neither PCRE2 nor Perl has a separate "start of word" or "end of word" metasequence. However, whatever follows \b normally determines which it is. For example, the fragment \ba matches "a" at the start of a word. The \A, \Z, and \z assertions differ from the traditional circumflex and dollar (described in the next section) in that they only ever match at the very start and end of the subject string, whatever options are set. Thus, they are independent of multiline mode. These three asser- tions are not affected by the PCRE2_NOTBOL or PCRE2_NOTEOL options, which affect only the behaviour of the circumflex and dollar metachar- acters. However, if the startoffset argument of pcre2_match() is non- zero, indicating that matching is to start at a point other than the beginning of the subject, \A can never match. The difference between \Z and \z is that \Z matches before a newline at the end of the string as well as at the very end, whereas \z matches only at the end. The \G assertion is true only when the current matching position is at the start point of the matching process, as specified by the startoff- set argument of pcre2_match(). It differs from \A when the value of startoffset is non-zero. By calling pcre2_match() multiple times with appropriate arguments, you can mimic Perl's /g option, and it is in this kind of implementation where \G can be useful. Note, however, that PCRE2's implementation of \G, being true at the starting character of the matching process, is subtly different from Perl's, which defines it as true at the end of the previous match. In Perl, these can be different when the previously matched string was empty. Because PCRE2 does just one match at a time, it cannot reproduce this behaviour. 
If all the alternatives of a pattern begin with \G, the expression is anchored to the starting match position, and the "anchored" flag is set in the compiled regular expression. CIRCUMFLEX AND DOLLAR The circumflex and dollar metacharacters are zero-width assertions. That is, they test for a particular condition being true without con- suming any characters from the subject string. These two metacharacters are concerned with matching the starts and ends of lines. If the new- line convention is set so that only the two-character sequence CRLF is recognized as a newline, isolated CR and LF characters are treated as ordinary data characters, and are not recognized as newlines. Outside a character class, in the default matching mode, the circumflex character is an assertion that is true only if the current matching point is at the start of the subject string. If the startoffset argu- ment of pcre2_match() is non-zero, or if PCRE2_NOTBOL is set, circum- flex can never match if the PCRE2_MULTILINE option is unset. Inside a character class, circumflex has an entirely different meaning (see be- low). Circumflex need not be the first character of the pattern if a number of alternatives are involved, but it should be the first thing in each alternative in which it appears if the pattern is ever to match that branch. If all possible alternatives start with a circumflex, that is, if the pattern is constrained to match only at the start of the sub- ject, it is said to be an "anchored" pattern. (There are also other constructs that can cause a pattern to be anchored.) The dollar character is an assertion that is true only if the current matching point is at the end of the subject string, or immediately be- fore a newline at the end of the string (by default), unless PCRE2_NO- TEOL is set. Note, however, that it does not actually match the new- line. 
Dollar need not be the last character of the pattern if a number of alternatives are involved, but it should be the last item in any branch in which it appears. Dollar has no special meaning in a charac- ter class. The meaning of dollar can be changed so that it matches only at the very end of the string, by setting the PCRE2_DOLLAR_ENDONLY option at compile time. This does not affect the \Z assertion. The meanings of the circumflex and dollar metacharacters are changed if the PCRE2_MULTILINE option is set. When this is the case, a dollar character matches before any newlines in the string, as well as at the very end, and a circumflex matches immediately after internal newlines as well as at the start of the subject string. It does not match after a newline that ends the string, for compatibility with Perl. However, this can be changed by setting the PCRE2_ALT_CIRCUMFLEX option. For example, the pattern /^abc$/ matches the subject string "def\nabc" (where \n represents a newline) in multiline mode, but not otherwise. Consequently, patterns that are anchored in single line mode because all branches start with ^ are not anchored in multiline mode, and a match for circumflex is possible when the startoffset argument of pcre2_match() is non-zero. The PCRE2_DOLLAR_ENDONLY option is ignored if PCRE2_MULTILINE is set. When the newline convention (see "Newline conventions" above) recog- nizes the two-character sequence CRLF as a newline, this is preferred, even if the single characters CR and LF are also recognized as new- lines. For example, if the newline convention is "any", a multiline mode circumflex matches before "xyz" in the string "abc\r\nxyz" rather than after CR, even though CR on its own is a valid newline. (It also matches at the very start of the string, of course.)
Note that the sequences \A, \Z, and \z can be used to match the start and end of the subject in both modes, and if all branches of a pattern start with \A it is always anchored, whether or not PCRE2_MULTILINE is set. FULL STOP (PERIOD, DOT) AND \N Outside a character class, a dot in the pattern matches any one charac- ter in the subject string except (by default) a character that signi- fies the end of a line. One or more characters may be specified as line terminators (see "Newline conventions" above). Dot never matches a single line-ending character. When the two-charac- ter sequence CRLF is the only line ending, dot does not match CR if it is immediately followed by LF, but otherwise it matches all characters (including isolated CRs and LFs). When ANYCRLF is selected for line endings, no occurrences of CR or LF match dot. When all Unicode line endings are being recognized, dot does not match CR or LF or any of the other line ending characters. The behaviour of dot with regard to newlines can be changed. If the PCRE2_DOTALL option is set, a dot matches any one character, without exception. If the two-character sequence CRLF is present in the sub- ject string, it takes two dots to match it. The handling of dot is entirely independent of the handling of circum- flex and dollar, the only relationship being that they both involve newlines. Dot has no special meaning in a character class. The escape sequence \N when not followed by an opening brace behaves like a dot, except that it is not affected by the PCRE2_DOTALL option. In other words, it matches any character except one that signifies the end of a line. When \N is followed by an opening brace it has a different meaning. See the section entitled "Non-printing characters" above for details. Perl also uses \N{name} to specify characters by Unicode name; PCRE2 does not support this.
MATCHING A SINGLE CODE UNIT Outside a character class, the escape sequence \C matches any one code unit, whether or not a UTF mode is set. In the 8-bit library, one code unit is one byte; in the 16-bit library it is a 16-bit unit; in the 32-bit library it is a 32-bit unit. Unlike a dot, \C always matches line-ending characters. The feature is provided in Perl in order to match individual bytes in UTF-8 mode, but it is unclear how it can use- fully be used. Because \C breaks up characters into individual code units, matching one unit with \C in UTF-8 or UTF-16 mode means that the rest of the string may start with a malformed UTF character. This has undefined re- sults, because PCRE2 assumes that it is matching character by character in a valid UTF string (by default it checks the subject string's valid- ity at the start of processing unless the PCRE2_NO_UTF_CHECK or PCRE2_MATCH_INVALID_UTF option is used). An application can lock out the use of \C by setting the PCRE2_NEVER_BACKSLASH_C option when compiling a pattern. It is also possible to build PCRE2 with the use of \C permanently disabled. PCRE2 does not allow \C to appear in lookbehind assertions (described below) in UTF-8 or UTF-16 modes, because this would make it impossible to calculate the length of the lookbehind. Neither the alternative matching function pcre2_dfa_match() nor the JIT optimizer supports \C in these UTF modes. The former gives a match-time error; the latter fails to optimize and so the match is always run using the interpreter. In the 32-bit library, however, \C is always supported (when not ex- plicitly locked out) because it always matches a single code unit, whether or not UTF-32 is specified. In general, the \C escape sequence is best avoided.
However, one way of using it that avoids the problem of malformed UTF-8 or UTF-16 charac- ters is to use a lookahead to check the length of the next character, as in this pattern, which could be used with a UTF-8 string (ignore white space and line breaks): (?| (?=[\x00-\x7f])(\C) | (?=[\x80-\x{7ff}])(\C)(\C) | (?=[\x{800}-\x{ffff}])(\C)(\C)(\C) | (?=[\x{10000}-\x{1fffff}])(\C)(\C)(\C)(\C)) In this example, a group that starts with (?| resets the capturing parentheses numbers in each alternative (see "Duplicate Group Numbers" below). The assertions at the start of each branch check the next UTF-8 character for values whose encoding uses 1, 2, 3, or 4 bytes, respec- tively. The character's individual bytes are then captured by the ap- propriate number of \C groups. SQUARE BRACKETS AND CHARACTER CLASSES An opening square bracket introduces a character class, terminated by a closing square bracket. A closing square bracket on its own is not spe- cial by default. If a closing square bracket is required as a member of the class, it should be the first data character in the class (after an initial circumflex, if present) or escaped with a backslash. This means that, by default, an empty class cannot be defined. However, if the PCRE2_ALLOW_EMPTY_CLASS option is set, a closing square bracket at the start does end the (empty) class. A character class matches a single character in the subject. A matched character must be in the set of characters defined by the class, unless the first character in the class definition is a circumflex, in which case the subject character must not be in the set defined by the class. If a circumflex is actually required as a member of the class, ensure it is not the first character, or escape it with a backslash. For example, the character class [aeiou] matches any lower case English vowel, whereas [^aeiou] matches all other characters. 
Note that a cir- cumflex is just a convenient notation for specifying the characters that are in the class by enumerating those that are not. A class that starts with a circumflex is not an assertion; it still consumes a char- acter from the subject string, and therefore it fails to match if the current pointer is at the end of the string. Characters in a class may be specified by their code points using \o, \x, or \N{U+hh..} in the usual way. When caseless matching is set, any letters in a class represent both their upper case and lower case ver- sions, so for example, a caseless [aeiou] matches "A" as well as "a", and a caseless [^aeiou] does not match "A", whereas a caseful version would. Note that there are two ASCII characters, K and S, that, in ad- dition to their lower case ASCII equivalents, are case-equivalent with Unicode U+212A (Kelvin sign) and U+017F (long S) respectively when ei- ther PCRE2_UTF or PCRE2_UCP is set. If you do not want these ASCII/non- ASCII case equivalences, you can suppress them by setting PCRE2_EX- TRA_CASELESS_RESTRICT, either as an option in a compile context, or by including (*CASELESS_RESTRICT) or (?r) within a pattern. Characters that might indicate line breaks are never treated in any special way when matching character classes, whatever line-ending se- quence is in use, and whatever setting of the PCRE2_DOTALL and PCRE2_MULTILINE options is used. A class such as [^a] always matches one of these characters. The generic character type escape sequences \d, \D, \h, \H, \p, \P, \s, \S, \v, \V, \w, and \W may appear in a character class, and add the characters that they match to the class. For example, [\dABCDEF] matches any hexadecimal digit. In UTF modes, the PCRE2_UCP option af- fects the meanings of \d, \s, \w and their upper case partners, just as it does when they appear outside a character class, as described in the section entitled "Generic character types" above. 
The escape sequence \b has a different meaning inside a character class; it matches the backspace character. The sequences \B, \R, and \X are not special in- side a character class. Like any other unrecognized escape sequences, they cause an error. The same is true for \N when not followed by an opening brace. The minus (hyphen) character can be used to specify a range of charac- ters in a character class. For example, [d-m] matches any letter be- tween d and m, inclusive. If a minus character is required in a class, it must be escaped with a backslash or appear in a position where it cannot be interpreted as indicating a range, typically as the first or last character in the class, or immediately after a range. For example, [b-d-z] matches letters in the range b to d, a hyphen character, or z. There is some special treatment for alphabetic ranges in EBCDIC envi- ronments; see the section "EBCDIC environments" below. Perl treats a hyphen as a literal if it appears before or after a POSIX class (see below) or before or after a character type escape such as \d or \H. However, unless the hyphen is the last character in the class, Perl outputs a warning in its warning mode, as this is most likely a user error. As PCRE2 has no facility for warning, an error is given in these cases. It is not possible to have the literal character "]" as the end charac- ter of a range. A pattern such as [W-]46] is interpreted as a class of two characters ("W" and "-") followed by a literal string "46]", so it would match "W46]" or "-46]". However, if the "]" is escaped with a backslash it is interpreted as the end of a range, so [W-\]46] is in- terpreted as a class containing a range and two other characters. The octal or hexadecimal representation of "]" can also be used to end a range. Ranges normally include all code points between the start and end char- acters, inclusive. They can also be used for code points specified nu- merically, for example [\000-\037]. 
Ranges can include any characters that are valid for the current mode. In any UTF mode, the so-called "surrogate" characters (those whose code points lie between 0xd800 and 0xdfff inclusive) may not be specified explicitly by default (the PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES option disables this check). How- ever, ranges such as [\x{d7ff}-\x{e000}], which include the surrogates, are always permitted. If a range that includes letters is used when caseless matching is set, it matches the letters in either case. For example, [W-c] is equivalent to [][\\^_`wxyzabc], matched caselessly, and in a non-UTF mode, if character tables for a French locale are in use, [\xc8-\xcb] matches accented E characters in both cases. A circumflex can conveniently be used with the upper case character types to specify a more restricted set of characters than the matching lower case type. For example, the class [^\W_] matches any letter or digit, but not underscore, whereas [\w] includes underscore. A positive character class should be read as "something OR something OR ..." and a negative class as "NOT something AND NOT something AND NOT ...". The metacharacters that are recognized in character classes are back- slash, hyphen (when it can be interpreted as specifying a range), cir- cumflex (only at the start), and the terminating closing square bracket. An opening square bracket is also special when it can be in- terpreted as introducing a POSIX class (see "Posix character classes" below), or a special compatibility feature (see "Compatibility feature for word boundaries" below). Escaping any non-alphanumeric character in a class turns it into a literal, whether or not it would otherwise be a metacharacter. PERL EXTENDED CHARACTER CLASSES From release 10.45 PCRE2 supports Perl's (?[...]) extended character class syntax. This can be used to perform set operations such as inter- section on character classes.
The syntax permitted within (?[...]) is quite different to ordinary character classes. Inside the extended class, there is an expression syntax consisting of "atoms", operators, and ordinary parentheses "()" used for grouping. Such classes always have the Perl /xx modifier (PCRE2 option PCRE2_EXTENDED_MORE) turned on within them. This means that literal space and tab characters are ignored everywhere in the class. The allowed atoms are individual characters specified by escape se- quences such as \n or \x{123}, character types such as \d, POSIX classes such as [:alpha:], and nested ordinary (non-extended) character classes. For example, in (?[\d & [...]]) the nested class [...] follows the usual rules for ordinary character classes, in which parentheses are not metacharacters, and character literals and ranges are permit- ted. Character literals and ranges may not appear outside a nested ordinary character class because they are not atoms in the extended syntax. The extended syntax does not introduce any additional escape sequences, so (?[\y]) is an unknown escape, as it would be in [\y]. In the extended syntax, ^ does not negate a class (except within an or- dinary class nested inside an extended class); it is instead a binary operator. The binary operators are "&" (intersection), "|" or "+" (union), "-" (subtraction) and "^" (symmetric difference). These are left-associa- tive and "&" has higher (tighter) precedence, while the others have equal lower precedence. The one prefix unary operator is "!" (comple- ment), with highest precedence. UTS#18 EXTENDED CHARACTER CLASSES The PCRE2_ALT_EXTENDED_CLASS option enables an alternative to Perl's (?[...]) syntax, allowing instead extended class behaviour inside or- dinary [...] character classes. This altered syntax for [...] classes is loosely described by the Unicode standard UTS#18. The PCRE2_ALT_EX- TENDED_CLASS option does not prevent use of (?[...]) classes; it just changes the meaning of all [...] 
classes that are not nested inside a Perl (?[...]) class. Firstly, in ordinary Perl [...] syntax, an expression such as "[a[]" is a character class with two literal characters "a" and "[", but in UTS#18 extended classes the "[" character becomes an additional metacharacter within classes, denoting the start of a nested class, so a literal "[" must be escaped as "\[". Secondly, within the UTS#18 extended syntax, there are operators "||", "&&", "--" and "~~" which denote character class union, intersection, subtraction, and symmetric difference respectively. In standard Perl syntax, these would simply be needlessly-repeated literals (except for "--" which could be the start or end of a range). In UTS#18 extended classes these operators can be used in constructs such as [\p{L}--[QW]] for "Unicode letters, other than Q and W". A literal "-" at the start or end of a range must be escaped, so while "[--1]" in Perl syntax is the range from hyphen to "1", it must be escaped as "[\--1]" in UTS#18 extended classes. Unlike Perl's (?[...]) extended classes, the PCRE2_EXTENDED_MORE option to ignore space and tab characters is not automatically enabled for UTS#18 extended classes, but it is honoured if set. Extended UTS#18 classes can be nested, and nested classes are them- selves extended classes (unlike Perl, where nested classes must be sim- ple classes). For example, [\p{L}&&[\p{Thai}||\p{Greek}]] matches any letter that is in the Thai or Greek scripts. Note that this means that no special grouping characters (such as the parentheses used in Perl's (?[...]) class syntax) are needed. Individual class items (literal characters, literal ranges, properties such as \d or \p{...}, and nested classes) can be combined by juxtapo- sition or by an operator. Juxtaposition is the implicit union operator, and binds more tightly than any explicit operator. Thus a sequence of literals and/or ranges behaves as if it is enclosed in square brackets. 
For example, [A-Z0-9&&[^E8]] is the same as [[A-Z0-9]&&[^E8]], which matches any upper case alphanumeric character except "E" or "8". Precedence between the explicit operators is not defined, so mixing op- erators is a syntax error. For example, [A&&B--C] is an error, but [A&&[B--C]] is valid. This is an emerging syntax which is being adopted gradually across the regex ecosystem: for example JavaScript adopted the "/v" flag in EC- MAScript 2024; Python's "re" module reserves the syntax for future use with a FutureWarning for unescaped use of "[" as a literal within char- acter classes. Due to UTS#18 providing insufficient guidance, engines interpret the syntax differently. Rust's "regex" crate and Python's "regex" PyPI module both implement UTS#18 extended classes, but with slight incompatibilities ([A||B&&C] is parsed as [A||[B&&C]] in Python's "regex" but as [[A||B]&&C] in Rust's "regex"). PCRE2's syntax adds syntax restrictions similar to ECMAScript's /v flag, so that all the UTS#18 extended classes accepted as valid by PCRE2 have the property that they are interpreted either with the same behaviour, or as invalid, by all other major engines. Please file an issue if you are aware of cross-engine differences in behaviour between PCRE2 and another major engine. POSIX CHARACTER CLASSES Perl supports the POSIX notation for character classes. This uses names enclosed by [: and :] within the enclosing square brackets. PCRE2 also supports this notation, in both ordinary and extended classes. For ex- ample, [01[:alpha:]%] matches "0", "1", any alphabetic character, or "%".
The supported class names are: alnum letters and digits alpha letters ascii character codes 0 - 127 blank space or tab only cntrl control characters digit decimal digits (same as \d) graph printing characters, excluding space lower lower case letters print printing characters, including space punct printing characters, excluding letters and digits and space space white space (the same as \s from PCRE2 8.34) upper upper case letters word "word" characters (same as \w) xdigit hexadecimal digits The default "space" characters are HT (9), LF (10), VT (11), FF (12), CR (13), and space (32). If locale-specific matching is taking place, the list of space characters may be different; there may be fewer or more of them. "Space" and \s match the same set of characters, as do "word" and \w. The name "word" is a Perl extension, and "blank" is a GNU extension from Perl 5.8. Another Perl extension is negation, which is indicated by a ^ character after the colon. For example, [12[:^digit:]] matches "1", "2", or any non-digit. PCRE2 (and Perl) also recognize the POSIX syntax [.ch.] and [=ch=] where "ch" is a "collating element", but these are not supported, and an error is given if they are encountered. By default, characters with values greater than 127 do not match any of the POSIX character classes, although this may be different for charac- ters in the range 128-255 when locale-specific matching is happening. However, in UCP mode, unless certain options are set (see below), some of the classes are changed so that Unicode character properties are used. This is achieved by replacing POSIX classes with other sequences, as follows: [:alnum:] becomes \p{Xan} [:alpha:] becomes \p{L} [:blank:] becomes \h [:cntrl:] becomes \p{Cc} [:digit:] becomes \p{Nd} [:lower:] becomes \p{Ll} [:space:] becomes \p{Xps} [:upper:] becomes \p{Lu} [:word:] becomes \p{Xwd} Negated versions, such as [:^alpha:] use \P instead of \p. 
Four other POSIX classes are handled specially in UCP mode: [:graph:] This matches characters that have glyphs that mark the page when printed. In Unicode property terms, it matches all char- acters with the L, M, N, P, S, or Cf properties, except for: U+061C Arabic Letter Mark U+180E Mongolian Vowel Separator U+2066 - U+2069 Various "isolate"s [:print:] This matches the same characters as [:graph:] plus space characters that are not controls, that is, characters with the Zs property. [:punct:] This matches all characters that have the Unicode P (punctua- tion) property, plus those characters with code points less than 256 that have the S (Symbol) property. [:xdigit:] In addition to the ASCII hexadecimal digits, this also matches the "fullwidth" versions of those characters, whose Unicode code points start at U+FF10. This is a change that was made in PCRE2 release 10.43 for Perl compatibility. The other POSIX classes are unchanged by PCRE2_UCP, and match only characters with code points less than 256. There are two options that can be used to restrict the POSIX classes to ASCII characters when PCRE2_UCP is set. The option PCRE2_EX- TRA_ASCII_DIGIT affects just [:digit:] and [:xdigit:]. Within a pat- tern, this can be set and unset by (?aT) and (?-aT). The PCRE2_EX- TRA_ASCII_POSIX option disables UCP processing for all POSIX classes, including [:digit:] and [:xdigit:]. Within a pattern, (?aP) and (?-aP) set and unset both these options for consistency. COMPATIBILITY FEATURE FOR WORD BOUNDARIES In the POSIX.2 compliant library that was included in 4.4BSD Unix, the ugly syntax [[:<:]] and [[:>:]] is used for matching "start of word" and "end of word". PCRE2 treats these items as follows: [[:<:]] is converted to \b(?=\w) [[:>:]] is converted to \b(?<=\w) Only these exact character sequences are recognized. A sequence such as [a[:<:]b] provokes an error for an unrecognized POSIX class name. This support is not compatible with Perl.
It is provided to help migrations from other environments, and is best not used in any new patterns. Note that \b matches at the start and the end of a word (see "Simple asser- tions" above), and in a Perl-style pattern the preceding or following character normally shows which is wanted, without the need for the as- sertions that are used above in order to give exactly the POSIX behav- iour. Note also that the PCRE2_UCP option changes the meaning of \w (and therefore \b) by default, so it also affects these POSIX se- quences. VERTICAL BAR Vertical bar characters are used to separate alternative patterns. For example, the pattern gilbert|sullivan matches either "gilbert" or "sullivan". Any number of alternatives may appear, and an empty alternative is permitted (matching the empty string). The matching process tries each alternative in turn, from left to right, and the first one that succeeds is used. If the alternatives are within a group (defined below), "succeeds" means matching the rest of the main pattern as well as the alternative in the group. INTERNAL OPTION SETTING The settings of several options can be changed within a pattern by a sequence of letters enclosed between "(?" and ")". The following are Perl-compatible, and are described in detail in the pcre2api documenta- tion. The option letters are: i for PCRE2_CASELESS m for PCRE2_MULTILINE n for PCRE2_NO_AUTO_CAPTURE s for PCRE2_DOTALL x for PCRE2_EXTENDED xx for PCRE2_EXTENDED_MORE For example, (?im) sets caseless, multiline matching. It is also possi- ble to unset these options by preceding the relevant letters with a hy- phen, for example (?-im). The two "extended" options are not indepen- dent; unsetting either one cancels the effects of both of them. A combined setting and unsetting such as (?im-sx), which sets PCRE2_CASELESS and PCRE2_MULTILINE while unsetting PCRE2_DOTALL and PCRE2_EXTENDED, is also permitted. Only one hyphen may appear in the options string. 
If a letter appears both before and after the hyphen, the option is unset. An empty options setting "(?)" is allowed. Need- less to say, it has no effect. If the first character following (? is a circumflex, it causes all of the above options to be unset. Letters may follow the circumflex to cause some options to be re-instated, but a hyphen may not appear. Some PCRE2-specific options can be changed by the same mechanism using these pairs or individual letters: aD for PCRE2_EXTRA_ASCII_BSD aS for PCRE2_EXTRA_ASCII_BSS aW for PCRE2_EXTRA_ASCII_BSW aP for PCRE2_EXTRA_ASCII_POSIX and PCRE2_EXTRA_ASCII_DIGIT aT for PCRE2_EXTRA_ASCII_DIGIT r for PCRE2_EXTRA_CASELESS_RESTRICT J for PCRE2_DUPNAMES U for PCRE2_UNGREEDY However, except for 'r', these are not unset by (?^), which is equiva- lent to (?-imnrsx). If 'a' is not followed by any of the upper case letters shown above, it sets (or unsets) all the ASCII options. PCRE2_EXTRA_ASCII_DIGIT has no additional effect when PCRE2_EX- TRA_ASCII_POSIX is set, but including it in (?aP) means that (?-aP) suppresses all ASCII restrictions for POSIX classes. When one of these option changes occurs at top level (that is, not in- side group parentheses), the change applies until a subsequent change, or the end of the pattern. An option change within a group (see below for a description of groups) affects only that part of the group that follows it. At the end of the group these options are reset to the state they were before the group. For example, (a(?i)b)c matches abc and aBc and no other strings (assuming PCRE2_CASELESS is not set externally). Any changes made in one alternative do carry on into subsequent branches within the same group. For example, (a(?i)b|c) matches "ab", "aB", "c", and "C", even though when matching "C" the first branch is abandoned before the option setting. This is because the effects of option settings happen at compile time. There would be some very weird behaviour otherwise. 
As a convenient shorthand, if any option settings are required at the start of a non-capturing group (see the next section), the option let- ters may appear between the "?" and the ":". Thus the two patterns (?i:saturday|sunday) (?:(?i)saturday|sunday) match exactly the same set of strings. Note: There are other PCRE2-specific options, applying to the whole pattern, which can be set by the application when the compiling func- tion is called. In addition, the pattern can contain special leading sequences such as (*CRLF) to override what the application has set or what has been defaulted. Details are given in the section entitled "Newline sequences" above. There are also the (*UTF) and (*UCP) leading sequences that can be used to set UTF and Unicode property modes; they are equivalent to setting the PCRE2_UTF and PCRE2_UCP options, respec- tively. However, the application can set the PCRE2_NEVER_UTF or PCRE2_NEVER_UCP options, which lock out the use of the (*UTF) and (*UCP) sequences. GROUPS Groups are delimited by parentheses (round brackets), which can be nested. Turning part of a pattern into a group does two things: 1. It localizes a set of alternatives. For example, the pattern cat(aract|erpillar|) matches "cataract", "caterpillar", or "cat". Without the parentheses, it would match "cataract", "erpillar" or an empty string. 2. It creates a "capture group". This means that, when the whole pat- tern matches, the portion of the subject string that matched the group is passed back to the caller, separately from the portion that matched the whole pattern. (This applies only to the traditional matching function; the DFA matching function does not support capturing.) Opening parentheses are counted from left to right (starting from 1) to obtain numbers for capture groups. 
For example, if the string "the red king" is matched against the pattern the ((red|white) (king|queen)) the captured substrings are "red king", "red", and "king", and are num- bered 1, 2, and 3, respectively. The fact that plain parentheses fulfil two functions is not always helpful. There are often times when grouping is required without cap- turing. If an opening parenthesis is followed by a question mark and a colon, the group does not do any capturing, and is not counted when computing the number of any subsequent capture groups. For example, if the string "the white queen" is matched against the pattern the ((?:red|white) (king|queen)) the captured substrings are "white queen" and "queen", and are numbered 1 and 2. The maximum number of capture groups is 65535. As a convenient shorthand, if any option settings are required at the start of a non-capturing group, the option letters may appear between the "?" and the ":". Thus the two patterns (?i:saturday|sunday) (?:(?i)saturday|sunday) match exactly the same set of strings. Because alternative branches are tried from left to right, and options are not reset until the end of the group is reached, an option setting in one branch does affect sub- sequent branches, so the above patterns match "SUNDAY" as well as "Sat- urday". DUPLICATE GROUP NUMBERS Perl 5.10 introduced a feature whereby each alternative in a group uses the same numbers for its capturing parentheses. Such a group starts with (?| and is itself a non-capturing group. For example, consider this pattern: (?|(Sat)ur|(Sun))day Because the two alternatives are inside a (?| group, both sets of cap- turing parentheses are numbered one. Thus, when the pattern matches, you can look at captured substring number one, whichever alternative matched. This construct is useful when you want to capture part, but not all, of one of a number of alternatives. Inside a (?| group, paren- theses are numbered as usual, but the number is reset at the start of each branch. 
The numbers of any capturing parentheses that follow the whole group start after the highest number used in any branch. The fol- lowing example is taken from the Perl documentation. The numbers under- neath show in which buffer the captured content will be stored. # before ---------------branch-reset----------- after / ( a ) (?| x ( y ) z | (p (q) r) | (t) u (v) ) ( z ) /x # 1 2 2 3 2 3 4 A backreference to a capture group uses the most recent value that is set for the group. The following pattern matches "abcabc" or "defdef": /(?|(abc)|(def))\1/ In contrast, a subroutine call to a capture group always refers to the first one in the pattern with the given number. The following pattern matches "abcabc" or "defabc": /(?|(abc)|(def))(?1)/ A relative reference such as (?-1) is no different: it is just a conve- nient way of computing an absolute group number. If a condition test for a group's having matched refers to a non-unique number, the test is true if any group with that number has matched. An alternative approach to using this "branch reset" feature is to use duplicate named groups, as described in the next section. NAMED CAPTURE GROUPS Identifying capture groups by number is simple, but it can be very hard to keep track of the numbers in complicated patterns. Furthermore, if an expression is modified, the numbers may change. To help with this difficulty, PCRE2 supports the naming of capture groups. This feature was not added to Perl until release 5.10. Python had the feature ear- lier, and PCRE1 introduced it at release 4.0, using the Python syntax. PCRE2 supports both the Perl and the Python syntax. In PCRE2, a capture group can be named in one of three ways: (?<name>...) or (?'name'...) as in Perl, or (?P<name>...) as in Python. Names may be up to 128 code units long. When PCRE2_UTF is not set, they may contain only ASCII alphanumeric characters and underscores, but must start with a non-digit.
When PCRE2_UTF is set, the syntax of group names is extended to allow any Unicode letter or Unicode decimal digit. In other words, group names must match one of these patterns: ^[_A-Za-z][_A-Za-z0-9]*\z when PCRE2_UTF is not set ^[_\p{L}][_\p{L}\p{Nd}]*\z when PCRE2_UTF is set References to capture groups from other parts of the pattern, such as backreferences, recursion, and conditions, can all be made by name as well as by number. Named capture groups are allocated numbers as well as names, exactly as if the names were not present. In both PCRE2 and Perl, capture groups are primarily identified by numbers; any names are just aliases for these numbers. The PCRE2 API provides function calls for extracting the complete name-to-number translation table from a compiled pattern, as well as convenience functions for extracting captured substrings by name. Warning: When more than one capture group has the same number, as de- scribed in the previous section, a name given to one of them applies to all of them. Perl allows identically numbered groups to have different names. Consider this pattern, where there are two capture groups, both numbered 1: (?|(?<AA>aa)|(?<BB>bb)) Perl allows this, with both names AA and BB as aliases of group 1. Thus, after a successful match, both names yield the same value (either "aa" or "bb"). In an attempt to reduce confusion, PCRE2 does not allow the same group number to be associated with more than one name. The example above pro- vokes a compile-time error. However, there is still scope for confu- sion. Consider this pattern: (?|(?<AA>aa)|(bb)) Although the second group number 1 is not explicitly named, the name AA is still an alias for any group 1. Whether the pattern matches "aa" or "bb", a reference by name to group AA yields the matched string.
By default, a name must be unique within a pattern, except that dupli- cate names are permitted for groups with the same number, for example: (?|(?<AA>aa)|(?<AA>bb)) The duplicate name constraint can be disabled by setting the PCRE2_DUP- NAMES option at compile time, or by the use of (?J) within the pattern, as described in the section entitled "Internal Option Setting" above. Duplicate names can be useful for patterns where only one instance of the named capture group can match. Suppose you want to match the name of a weekday, either as a 3-letter abbreviation or as the full name, and in both cases you want to extract the abbreviation. This pattern (ignoring the line breaks) does the job: (?J) (?<DN>Mon|Fri|Sun)(?:day)?| (?<DN>Tue)(?:sday)?| (?<DN>Wed)(?:nesday)?| (?<DN>Thu)(?:rsday)?| (?<DN>Sat)(?:urday)? There are five capture groups, but only one is ever set after a match. The convenience functions for extracting the data by name return the substring for the first (and in this example, the only) group of that name that matched. This saves searching to find which numbered group it was. (An alternative way of solving this problem is to use a "branch reset" group, as described in the previous section.) If you make a backreference to a non-unique named group from elsewhere in the pattern, the groups to which the name refers are checked in the order in which they appear in the overall pattern. The first one that is set is used for the reference. For example, this pattern matches both "foofoo" and "barbar" but not "foobar" or "barfoo": (?J)(?:(?<n>foo)|(?<n>bar))\k<n> If you make a subroutine call to a non-unique named group, the one that corresponds to the first occurrence of the name is used. In the absence of duplicate numbers this is the one with the lowest number. If you use a named reference in a condition test (see the section about conditions below), either to check whether a capture group has matched, or to check for recursion, all groups with the same name are tested.
If the condition is true for any one of them, the overall condition is true. This is the same behaviour as testing by number. For further de- tails of the interfaces for handling named capture groups, see the pcre2api documentation. REPETITION Repetition is specified by quantifiers, which may follow any one of these items: a literal data character the dot metacharacter the \C escape sequence the \R escape sequence the \X escape sequence any escape sequence that matches a single character a character class a backreference a parenthesized group (including lookaround assertions) a subroutine call (recursive or otherwise) If a quantifier does not follow a repeatable item, an error occurs. The general repetition quantifier specifies a minimum and maximum number of permitted matches by giving two numbers in curly brackets (braces), separated by a comma. The numbers must be less than 65536, and the first must be less than or equal to the second. For example, z{2,4} matches "zz", "zzz", or "zzzz". A closing brace on its own is not a special character. If the second number is omitted, but the comma is present, there is no upper limit; if the second number and the comma are both omitted, the quantifier specifies an exact number of required matches. Thus [aeiou]{3,} matches at least 3 successive vowels, but may match many more, whereas \d{8} matches exactly 8 digits. If the first number is omitted, the lower limit is taken as zero; in this case the upper limit must be present. X{,4} is interpreted as X{0,4} This is a change in behaviour that happened in Perl 5.34.0 and PCRE2 10.43. In earlier versions such a sequence was not interpreted as a quantifier. Other regular expression engines may behave either way. If the characters that follow an opening brace do not match the syntax of a quantifier, the brace is taken as a literal character. In particu- lar, this means that {,} is a literal string of three characters. 
Note that not every opening brace is potentially the start of a quanti- fier because braces are used in other items such as \N{U+345} or \k{name}. In UTF modes, quantifiers apply to characters rather than to individual code units. Thus, for example, \x{100}{2} matches two characters, each of which is represented by a two-byte sequence in a UTF-8 string. Simi- larly, \X{3} matches three Unicode extended grapheme clusters, each of which may be several code units long (and they may be of different lengths). The quantifier {0} is permitted, causing the expression to behave as if the previous item and the quantifier were not present. This may be use- ful for capture groups that are referenced as subroutines from else- where in the pattern (but see also the section entitled "Defining cap- ture groups for use by reference only" below). Except for parenthesized groups, items that have a {0} quantifier are omitted from the compiled pattern. For convenience, the three most common quantifiers have single-charac- ter abbreviations: * is equivalent to {0,} + is equivalent to {1,} ? is equivalent to {0,1} It is possible to construct infinite loops by following a group that can match no characters with a quantifier that has no upper limit, for example: (a?)* Earlier versions of Perl and PCRE1 used to give an error at compile time for such patterns. However, because there are cases where this can be useful, such patterns are now accepted, but whenever an iteration of such a group matches no characters, matching moves on to the next item in the pattern instead of repeatedly matching an empty string. This does not prevent backtracking into any of the iterations if a subse- quent item fails to match. By default, quantifiers are "greedy", that is, they match as much as possible (up to the maximum number of permitted repetitions), without causing the rest of the pattern to fail. The classic example of where this gives problems is in trying to match comments in C programs. 
These appear between /* and */ and within the comment, individual * and / characters may appear. An attempt to match C comments by applying the pattern /\*.*\*/ to the string /* first comment */ not comment /* second comment */ fails, because it matches the entire string owing to the greediness of the .* item. However, if a quantifier is followed by a question mark, it ceases to be greedy, and instead matches the minimum number of times possible, so the pattern /\*.*?\*/ does the right thing with C comments. The meaning of the various quan- tifiers is not otherwise changed, just the preferred number of matches. Do not confuse this use of question mark with its use as a quantifier in its own right. Because it has two uses, it can sometimes appear doubled, as in \d??\d which matches one digit by preference, but can match two if that is the only way the rest of the pattern matches. If the PCRE2_UNGREEDY option is set (an option that is not available in Perl), the quantifiers are not greedy by default, but individual ones can be made greedy by following them with a question mark. In other words, it inverts the default behaviour. When a parenthesized group is quantified with a minimum repeat count that is greater than 1 or with a limited maximum, more memory is re- quired for the compiled pattern, in proportion to the size of the mini- mum or maximum. If a pattern starts with .* or .{0,} and the PCRE2_DOTALL option (equivalent to Perl's /s) is set, thus allowing the dot to match new- lines, the pattern is implicitly anchored, because whatever follows will be tried against every character position in the subject string, so there is no point in retrying the overall match at any position af- ter the first. PCRE2 normally treats such a pattern as though it were preceded by \A. 
In cases where it is known that the subject string contains no new- lines, it is worth setting PCRE2_DOTALL in order to obtain this opti- mization, or alternatively, using ^ to indicate anchoring explicitly. However, there are some cases where the optimization cannot be used. When .* is inside capturing parentheses that are the subject of a backreference elsewhere in the pattern, a match at the start may fail where a later one succeeds. Consider, for example: (.*)abc\1 If the subject is "xyz123abc123" the match point is the fourth charac- ter. For this reason, such a pattern is not implicitly anchored. Another case where implicit anchoring is not applied is when the lead- ing .* is inside an atomic group. Once again, a match at the start may fail where a later one succeeds. Consider this pattern: (?>.*?a)b It matches "ab" in the subject "aab". The use of the backtracking con- trol verbs (*PRUNE) and (*SKIP) also disables this optimization. To do so explicitly, either pass the compile option PCRE2_NO_DOTSTAR_ANCHOR, or call pcre2_set_optimize() with a PCRE2_DOTSTAR_ANCHOR_OFF directive. When a capture group is repeated, the value captured is the substring that matched the final iteration. For example, after (tweedle[dume]{3}\s*)+ has matched "tweedledum tweedledee" the value of the captured substring is "tweedledee". However, if there are nested capture groups, the cor- responding captured values may have been set in previous iterations. For example, after (a|(b))+ matches "aba" the value of the second captured substring is "b". ATOMIC GROUPING AND POSSESSIVE QUANTIFIERS With both maximizing ("greedy") and minimizing ("ungreedy" or "lazy") repetition, failure of what follows normally causes the repeated item to be re-evaluated to see if a different number of repeats allows the rest of the pattern to match.
Sometimes it is useful to prevent this, either to change the nature of the match, or to cause it to fail earlier than it otherwise might, when the author of the pattern knows there is no point in carrying on. Consider, for example, the pattern \d+foo when applied to the subject line 123456bar After matching all 6 digits and then failing to match "foo", the normal action of the matcher is to try again with only 5 digits matching the \d+ item, and then with 4, and so on, before ultimately failing. "Atomic grouping" (a term taken from Jeffrey Friedl's book) provides the means for specifying that once a group has matched, it is not to be re-evaluated in this way. If we use atomic grouping for the previous example, the matcher gives up immediately on failing to match "foo" the first time. The notation is a kind of special parenthesis, starting with (?> as in this example: (?>\d+)foo Perl 5.28 introduced an experimental alphabetic form starting with (* which may be easier to remember: (*atomic:\d+)foo This kind of parenthesized group "locks up" the part of the pattern it contains once it has matched, and a failure further into the pattern is prevented from backtracking into it. Backtracking past it to previous items, however, works as normal. An alternative description is that a group of this type matches exactly the string of characters that an identical standalone pattern would match, if anchored at the current point in the subject string. Atomic groups are not capture groups. Simple cases such as the above example can be thought of as a maximizing repeat that must swallow everything it can. So, while both \d+ and \d+? are prepared to adjust the number of digits they match in order to make the rest of the pat- tern match, (?>\d+) can only match an entire sequence of digits. Atomic groups in general can of course contain arbitrarily complicated expressions, and can be nested.
However, when the contents of an atomic group is just a single repeated item, as in the example above, a sim- pler notation, called a "possessive quantifier" can be used. This con- sists of an additional + character following a quantifier. Using this notation, the previous example can be rewritten as \d++foo Note that a possessive quantifier can be used with an entire group, for example: (abc|xyz){2,3}+ Possessive quantifiers are always greedy; the setting of the PCRE2_UN- GREEDY option is ignored. They are a convenient notation for the sim- pler forms of atomic group. However, there is no difference in the meaning of a possessive quantifier and the equivalent atomic group, though there may be a performance difference; possessive quantifiers should be slightly faster. The possessive quantifier syntax is an extension to the Perl 5.8 syn- tax. Jeffrey Friedl originated the idea (and the name) in the first edition of his book. Mike McCloskey liked it, so implemented it when he built Sun's Java package, and PCRE1 copied it from there. It found its way into Perl at release 5.10. PCRE2 has an optimization that automatically "possessifies" certain simple pattern constructs. For example, the sequence A+B is treated as A++B because there is no point in backtracking into a sequence of A's when B must follow. This feature can be disabled by the PCRE2_NO_AUTO_POSSESS option, by calling pcre2_set_optimize() with a PCRE2_AUTO_POSSESS_OFF directive, or by starting the pattern with (*NO_AUTO_POSSESS). When a pattern contains an unlimited repeat inside a group that can it- self be repeated an unlimited number of times, the use of an atomic group is the only way to avoid some failing matches taking a very long time indeed. The pattern (\D+|<\d+>)*[!?] matches an unlimited number of substrings that either consist of non- digits, or digits enclosed in <>, followed by either ! or ?. When it matches, it runs quickly. 
However, if it is applied to aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa it takes a long time before reporting failure. This is because the string can be divided between the internal \D+ repeat and the external * repeat in a large number of ways, and all have to be tried. (The ex- ample uses [!?] rather than a single character at the end, because both PCRE2 and Perl have an optimization that allows for fast failure when a single character is used. They remember the last single character that is required for a match, and fail early if it is not present in the string.) If the pattern is changed so that it uses an atomic group, like this: ((?>\D+)|<\d+>)*[!?] sequences of non-digits cannot be broken, and failure happens quickly. BACKREFERENCES Outside a character class, a backslash followed by a digit greater than 0 (and possibly further digits) is a backreference to a capture group earlier (that is, to its left) in the pattern, provided there have been that many previous capture groups. However, if the decimal number following the backslash is less than 8, it is always taken as a backreference, and causes an error only if there are not that many capture groups in the entire pattern. In other words, the group that is referenced need not be to the left of the ref- erence for numbers less than 8. A "forward backreference" of this type can make sense when a repetition is involved and the group to the right has participated in an earlier iteration. It is not possible to have a numerical "forward backreference" to a group whose number is 8 or more using this syntax because a sequence such as \50 is interpreted as a character defined in octal. See the subsection entitled "Non-printing characters" above for further details of the handling of digits following a backslash. Other forms of back- referencing do not suffer from this restriction. In particular, there is no problem when named capture groups are used (see below). 
Another way of avoiding the ambiguity inherent in the use of digits following a backslash is to use the \g escape sequence. This escape must be followed by a signed or unsigned number, optionally enclosed in braces. These examples are all identical: (ring), \1 (ring), \g1 (ring), \g{1} An unsigned number specifies an absolute reference without the ambigu- ity that is present in the older syntax. It is also useful when literal digits follow the reference. A signed number is a relative reference. Consider this example: (abc(def)ghi)\g{-1} The sequence \g{-1} is a reference to the capture group whose number is one less than the number of the next group to be started, so in this example (where the next group would be numbered 3) it is equivalent to \2, and \g{-2} would be equivalent to \1. Note that if this construct is inside a capture group, that group is included in the count, so in this example \g{-2} also refers to group 1: (A)(\g{-2}B) The use of relative references can be helpful in long patterns, and also in patterns that are created by joining together fragments that contain references within themselves. The sequence \g{+1} is a reference to the next capture group that is started after this item, and \g{+2} refers to the one after that, and so on. This kind of forward reference can be useful in patterns that repeat. Perl does not support the use of + in this way. A backreference matches whatever actually most recently matched the capture group in the current subject string, rather than anything at all that matches the group (see "Groups as subroutines" below for a way of doing that). So the pattern (sens|respons)e and \1ibility matches "sense and sensibility" and "response and responsibility", but not "sense and responsibility". If caseful matching is in force at the time of the backreference, the case of letters is relevant. 
For exam- ple, ((?i)rah)\s+\1 matches "rah rah" and "RAH RAH", but not "RAH rah", even though the original capture group is matched caselessly. There are several different ways of writing backreferences to named capture groups. The .NET syntax is \k{name}, the Python syntax is (?P=name), and the original Perl syntax is \k<name> or \k'name'. All of these are now supported by both Perl and PCRE2. Perl 5.10's unified backreference syntax, in which \g can be used for both numeric and named references, is also supported by PCRE2. We could rewrite the above example in any of the following ways: (?<p1>(?i)rah)\s+\k<p1> (?'p1'(?i)rah)\s+\k{p1} (?P<p1>(?i)rah)\s+(?P=p1) (?<p1>(?i)rah)\s+\g{p1} A capture group that is referenced by name may appear in the pattern before or after the reference. There may be more than one backreference to the same group. If a group has not actually been used in a particular match, backreferences to it always fail by default. For example, the pattern (a|(bc))\2 always fails if it starts to match "a" rather than "bc". However, if the PCRE2_MATCH_UNSET_BACKREF option is set at compile time, a backref- erence to an unset value matches an empty string. Because there may be many capture groups in a pattern, all digits fol- lowing a backslash are taken as part of a potential backreference num- ber. If the pattern continues with a digit character, some delimiter must be used to terminate the backreference. If the PCRE2_EXTENDED or PCRE2_EXTENDED_MORE option is set, this can be white space. Otherwise, the \g{} syntax or an empty comment (see "Comments" below) can be used. Recursive backreferences A backreference that occurs inside the group to which it refers fails when the group is first used, so, for example, (a\1) never matches. However, such references can be useful inside repeated groups. For ex- ample, the pattern (a|b\1)+ matches any number of "a"s and also "aba", "ababbaa" etc. 
At each iter- ation of the group, the backreference matches the character string cor- responding to the previous iteration. In order for this to work, the pattern must be such that the first iteration does not need to match the backreference. This can be done using alternation, as in the exam- ple above, or by a quantifier with a minimum of zero. For versions of PCRE2 less than 10.25, backreferences of this type used to cause the group that they reference to be treated as an atomic group. This restriction no longer applies, and backtracking into such groups can occur as normal. ASSERTIONS An assertion is a test that does not consume any characters. The test must succeed for the match to continue. The simple assertions coded as \b, \B, \A, \G, \Z, \z, ^ and $ are described above. More complicated assertions are coded as parenthesized groups. If matching such a group succeeds, matching continues after it, but with the matching position in the subject string reset to what it was before the assertion was processed. A special kind of assertion, called a "scan substring" assertion, matches a subpattern against a previously captured substring. This is described in the section entitled "Scan substring assertions" below. It is a PCRE2 extension, not compatible with Perl. The other group-based assertions are of two kinds: those that look ahead of the current position in the subject string, and those that look be- hind it, and in each case an assertion may be positive (must match for the assertion to be true) or negative (must not match for the assertion to be true). The Perl-compatible lookaround assertions are atomic. If an assertion is true, but there is a subsequent matching failure, there is no back- tracking into the assertion. However, there are some cases where non- atomic assertions can be useful. PCRE2 has some support for these, de- scribed in the section entitled "Non-atomic assertions" below, but they are not Perl-compatible. 
A lookaround assertion may appear as the condition in a conditional group (see below). In this case, the result of matching the assertion determines which branch of the condition is followed. Assertion groups are not capture groups. If an assertion contains cap- ture groups within it, these are counted for the purposes of numbering the capture groups in the whole pattern. Within each branch of an as- sertion, locally captured substrings may be referenced in the usual way. For example, a sequence such as (.)\g{-1} can be used to check that two adjacent characters are the same. When a branch within an assertion fails to match, any substrings that were captured are discarded (as happens with any pattern branch that fails to match). A negative assertion is true only when all its branches fail to match; this means that no captured substrings are ever retained after a successful negative assertion. When an assertion con- tains a matching branch, what happens depends on the type of assertion. For a positive assertion, internally captured substrings in the suc- cessful branch are retained, and matching continues with the next pat- tern item after the assertion. For a negative assertion, a matching branch means that the assertion is not true. If such an assertion is being used as a condition in a conditional group (see below), captured substrings are retained, because matching continues with the "no" branch of the condition. For other failing negative assertions, control passes to the previous backtracking point, thus discarding any captured strings within the assertion. Most assertion groups may be repeated; though it makes no sense to as- sert the same thing several times, the side effect of capturing in pos- itive assertions may occasionally be useful. However, an assertion that forms the condition for a conditional group may not be quantified. 
PCRE2 used to restrict the repetition of assertions, but from release 10.35 the only restriction is that an unlimited maximum repetition is changed to be one more than the minimum. For example, {3,} is treated as {3,4}. Alphabetic assertion names Traditionally, symbolic sequences such as (?= and (?<= have been used to specify lookaround assertions. Perl 5.28 introduced some experimen- tal alphabetic alternatives which might be easier to remember. They all start with (* instead of (? and must be written using lower case let- ters. PCRE2 supports the following synonyms: (*positive_lookahead: or (*pla: is the same as (?= (*negative_lookahead: or (*nla: is the same as (?! (*positive_lookbehind: or (*plb: is the same as (?<= (*negative_lookbehind: or (*nlb: is the same as (? .*? \b\1\b ){2} For a subject such as "word1 word2 word3 word2 word3 word4" the result is "word3". How does it work? At the start, ^(?x) anchors the pattern and sets the "x" option, which causes white space (introduced for read- ability) to be ignored. Inside the assertion, the greedy .* at first consumes the entire string, but then has to backtrack until the rest of the assertion can match a word, which is captured by group 1. In other words, when the assertion first succeeds, it captures the right-most word in the string. The current matching point is then reset to the start of the subject, and the rest of the pattern match checks for two occurrences of the captured word, using an ungreedy .*? to scan from the left. If this succeeds, we are done, but if the last word in the string does not oc- cur twice, this part of the pattern fails. If a traditional atomic lookahead (?= or (*pla: had been used, the assertion could not be re- entered, and the whole match would fail. The pattern would succeed only if the very last word in the subject was found twice. 
Using a non-atomic lookahead, however, means that when the last word does not occur twice in the string, the lookahead can backtrack and find the second-last word, and so on, until either the match succeeds, or all words have been tested. Two conditions must be met for a non-atomic assertion to be useful: the contents of one or more capturing groups must change after a backtrack into the assertion, and there must be a backreference to a changed group later in the pattern. If this is not the case, the rest of the pattern match fails exactly as before because nothing has changed, so using a non-atomic assertion just wastes resources. There is one exception to backtracking into a non-atomic assertion. If an (*ACCEPT) control verb is triggered, the assertion succeeds atomi- cally. That is, a subsequent match failure cannot backtrack into the assertion. Non-atomic assertions are not supported by the alternative matching function pcre2_dfa_match(). They are supported by JIT, but only if they do not contain any control verbs such as (*ACCEPT). (This may change in future). Note that assertions that appear as conditions for conditional groups (see below) must be atomic. SCAN SUBSTRING ASSERTIONS A special kind of assertion, not compatible with Perl, makes it possi- ble to check the contents of a captured substring by matching it with a subpattern. Because this involves capturing, this feature is not sup- ported by pcre2_dfa_match(). A scan substring assertion starts with the sequence (*scan_substring: or (*scs: which is followed by a list of substring numbers (absolute or relative) and/or substring names enclosed in single quotes or angle brackets, all within parentheses. The rest of the item is the subpat- tern that is applied to the substring, as shown in these examples: (*scan_substring:(1)...) (*scs:(-2)...) (*scs:('AB')...) (*scs:(1,'AB',-2)...) 
The list of groups is checked in the order they are given, and it is the contents of the first one that is found to be set that are scanned. When PCRE2_DUPNAMES is set and there are ambiguous group names, all groups with the same name are checked in numerical order. A scan sub- string assertion fails if none of the groups it references have been set. The pattern match on the substring is always anchored, that is, it must match from the start of the substring. There is no "bumpalong" if it does not match at the start. The end of the subject is temporarily re- set to be the end of the substring, so \Z, \z, and $ will match there. However, the start of the subject is not reset. This means that ^ matches only if the substring is actually at the start of the main sub- ject, but it also means that lookbehind assertions into what precedes the substring are possible. Here is a very simple example: find a word that contains the rare (in English) sequence of letters "rh" not at the start: \b(\w++)(*scs:(1).+rh) The first group captures a word which is then scanned by the second group. This example does not actually need this heavyweight feature; the same match can be achieved with: \b\w+?rh\w*\b When things are more complicated, however, scanning a captured sub- string can be a useful way to describe the required match. For example, there is a rather complicated pattern in the PCRE2 test data that checks an entire subject string for a palindrome, that is, the sequence of letters is the same in both directions. Suppose you want to search for individual words of two or more characters such as "level" that are palindromes: (\b\w{2,}+\b)(*scs:(1)...palindrome-matching-pattern...) Within a substring scanning subpattern, references to other groups work as normal. Capturing groups may appear, and will retain their values during ongoing matching if the assertion succeeds. 
SCRIPT RUNS In concept, a script run is a sequence of characters that are all from the same Unicode script such as Latin or Greek. However, because some scripts are commonly used together, and because some diacritical and other marks are used with multiple scripts, it is not that simple. There is a full description of the rules that PCRE2 uses in the section entitled "Script Runs" in the pcre2unicode documentation. If part of a pattern is enclosed between (*script_run: or (*sr: and a closing parenthesis, it fails if the sequence of characters that it matches is not a script run. After a failure, normal backtracking oc- curs. Script runs can be used to detect spoofing attacks using charac- ters that look the same, but are from different scripts. The string "paypal.com" is an infamous example, where the letters could be a mix- ture of Latin and Cyrillic. This pattern ensures that the matched char- acters in a sequence of non-spaces that follow white space are a script run: \s+(*sr:\S+) To be sure that they are all from the Latin script (for example), a lookahead can be used: \s+(?=\p{Latin})(*sr:\S+) This works as long as the first character is expected to be a character in that script, and not (for example) punctuation, which is allowed with any script. If this is not the case, a more creative lookahead is needed. For example, if digits, underscore, and dots are permitted at the start: \s+(?=[0-9_.]*\p{Latin})(*sr:\S+) In many cases, backtracking into a script run pattern fragment is not desirable. The script run can employ an atomic group to prevent this. Because this is a common requirement, a shorthand notation is provided by (*atomic_script_run: or (*asr: (*asr:...) is the same as (*sr:(?>...)) Note that the atomic group is inside the script run. Putting it outside would not prevent backtracking into the script run pattern. Support for script runs is not available if PCRE2 is compiled without Unicode support. 
A compile-time error is given if any of the above con- structs is encountered. Script runs are not supported by the alternate matching function, pcre2_dfa_match() because they use the same mecha- nism as capturing parentheses. Warning: The (*ACCEPT) control verb (see below) should not be used within a script run group, because it causes an immediate exit from the group, bypassing the script run checking. CONDITIONAL GROUPS It is possible to cause the matching process to obey a pattern fragment conditionally or to choose between two alternative fragments, depending on the result of an assertion, or whether a specific capture group has already been matched. The two possible forms of conditional group are: (?(condition)yes-pattern) (?(condition)yes-pattern|no-pattern) If the condition is satisfied, the yes-pattern is used; otherwise the no-pattern (if present) is used. An absent no-pattern is equivalent to an empty string (it always matches). If there are more than two alter- natives in the group, a compile-time error occurs. Each of the two al- ternatives may itself contain nested groups of any form, including con- ditional groups; the restriction to two alternatives applies only at the level of the condition itself. This pattern fragment is an example where the alternatives are complex: (?(1) (A|B|C) | (D | (?(2)E|F) | E) ) There are five kinds of condition: references to capture groups, refer- ences to recursion, two pseudo-conditions called DEFINE and VERSION, and assertions. Checking for a used capture group by number If the text between the parentheses consists of a sequence of digits, the condition is true if a capture group of that number has previously matched. If there is more than one capture group with the same number (see the earlier section about duplicate group numbers), the condition is true if any of them have matched. An alternative notation, which is a PCRE2 extension, not supported by Perl, is to precede the digits with a plus or minus sign. 
In this case, the group number is relative rather than absolute. The most recently opened capture group (which could be enclosing this condition) can be referenced by (?(-1), the next most recent by (?(-2), and so on. Inside loops it can also make sense to re- fer to subsequent groups. The next capture group to be opened can be referenced as (?(+1), and so on. The value zero in any of these forms is not used; it provokes a compile-time error. Consider the following pattern, which contains non-significant white space to make it more readable (assume the PCRE2_EXTENDED option) and to divide it into three parts for ease of discussion: ( \( )? [^()]+ (?(1) \) ) The first part matches an optional opening parenthesis, and if that character is present, sets it as the first captured substring. The sec- ond part matches one or more characters that are not parentheses. The third part is a conditional group that tests whether or not the first capture group matched. If it did, that is, if the subject started with an opening parenthesis, the condition is true, and so the yes-pattern is executed and a closing parenthesis is required. Otherwise, since no- pattern is not present, the conditional group matches nothing. In other words, this pattern matches a sequence of non-parentheses, optionally enclosed in parentheses. If you were embedding this pattern in a larger one, you could use a relative reference: ...other stuff... ( \( )? [^()]+ (?(-1) \) ) ... This makes the fragment independent of the parentheses in the larger pattern. Checking for a used capture group by name Perl uses the syntax (?(<name>)...) or (?('name')...) to test for a used capture group by name. For compatibility with earlier versions of PCRE1, which had this facility before Perl, the syntax (?(name)...) is also recognized. Note, however, that undelimited names consisting of the letter R followed by digits are ambiguous (see the following sec- tion). Rewriting the above example to use a named group gives this: (?<OPEN> \( )? 
[^()]+ (?(<OPEN>) \) ) If the name used in a condition of this kind is a duplicate, the test is applied to all groups of the same name, and is true if any one of them has matched. Checking for pattern recursion "Recursion" in this sense refers to any subroutine-like call from one part of the pattern to another, whether or not it is actually recur- sive. See the sections entitled "Recursive patterns" and "Groups as subroutines" below for details of recursion and subroutine calls. If a condition is the string (R), and there is no capture group with the name R, the condition is true if matching is currently in a recur- sion or subroutine call to the whole pattern or any capture group. If digits follow the letter R, and there is no group with that name, the condition is true if the most recent call is into a group with the given number, which must exist somewhere in the overall pattern. This is a contrived example that is equivalent to a+b: ((?(R1)a+|(?1)b)) However, in both cases, if there is a capture group with a matching name, the condition tests for its being set, as described in the sec- tion above, instead of testing for recursion. For example, creating a group with the name R1 by adding (?<R1>) to the above pattern com- pletely changes its meaning. If a name preceded by ampersand follows the letter R, for example: (?(R&name)...) the condition is true if the most recent recursion is into a group of that name (which must exist within the pattern). This condition does not check the entire recursion stack. It tests only the current level. If the name used in a condition of this kind is a duplicate, the test is applied to all groups of the same name, and is true if any one of them is the most recent recursion. At "top level", all these recursion test conditions are false. Defining capture groups for use by reference only If the condition is the string (DEFINE), the condition is always false, even if there is a group with the name DEFINE. 
In this case, there may be only one alternative in the rest of the conditional group. It is al- ways skipped if control reaches this point in the pattern; the idea of DEFINE is that it can be used to define subroutines that can be refer- enced from elsewhere. (The use of subroutines is described below.) For example, a pattern to match an IPv4 address such as "192.168.23.245" could be written like this (ignore white space and line breaks): (?(DEFINE) (?<byte> 2[0-4]\d | 25[0-5] | 1\d\d | [1-9]?\d) ) \b (?&byte) (\.(?&byte)){3} \b The first part of the pattern is a DEFINE group inside which another group named "byte" is defined. This matches an individual component of an IPv4 address (a number less than 256). When matching takes place, this part of the pattern is skipped because DEFINE acts like a false condition. The rest of the pattern uses references to the named group to match the four dot-separated components of an IPv4 address, insist- ing on a word boundary at each end. Checking the PCRE2 version Programs that link with a PCRE2 library can check the version by call- ing pcre2_config() with appropriate arguments. Users of applications that do not have access to the underlying code cannot do this. A spe- cial "condition" called VERSION exists to allow such users to discover which version of PCRE2 they are dealing with by using this condition to match a string such as "yesno". VERSION must be followed either by "=" or ">=" and a version number. For example: (?(VERSION>=10.4)yes|no) This pattern matches "yes" if the PCRE2 version is greater than or equal to 10.4, or "no" otherwise. The fractional part of the version number could be omitted. Assertion conditions If the condition is not in any of the above formats, it must be a parenthesized assertion. This may be a positive or negative lookahead or lookbehind assertion. However, it must be a traditional atomic as- sertion, not one of the non-atomic assertions. 
Consider this pattern, again containing non-significant white space, and with the two alternatives on the second line: (?(?=[^a-z]*[a-z]) \d{2}-[a-z]{3}-\d{2} | \d{2}-\d{2}-\d{2} ) The condition is a positive lookahead assertion that matches an op- tional sequence of non-letters followed by a letter. In other words, it tests for the presence of at least one letter in the subject. If a let- ter is found, the subject is matched against the first alternative; otherwise it is matched against the second. This pattern matches strings in one of the two forms dd-aaa-dd or dd-dd-dd, where aaa are letters and dd are digits. When an assertion that is a condition contains capture groups, any cap- turing that occurs in a matching branch is retained afterwards, for both positive and negative assertions, because matching always contin- ues after the assertion, whether it succeeds or fails. (Compare non- conditional assertions, for which captures are retained only for posi- tive assertions that succeed.) COMMENTS There are two ways of including comments in patterns that are processed by PCRE2. In both cases, the start of the comment must not be in a character class, nor in the middle of any other sequence of related characters such as (?: or a group name or number or a Unicode property name. The characters that make up a comment play no part in the pattern matching. The sequence (?# marks the start of a comment that continues up to the next closing parenthesis. Nested parentheses are not permitted. If the PCRE2_EXTENDED or PCRE2_EXTENDED_MORE option is set, an unescaped # character also introduces a comment, which in this case continues to immediately after the next newline character or character sequence in the pattern. Which characters are interpreted as newlines is controlled by an option passed to the compiling function or by a special sequence at the start of the pattern, as described in the section entitled "New- line conventions" above. 
Note that the end of this type of comment is a literal newline sequence in the pattern; escape sequences that happen to represent a newline do not count. For example, consider this pattern when PCRE2_EXTENDED is set, and the default newline convention (a sin- gle linefeed character) is in force: abc #comment \n still comment On encountering the # character, pcre2_compile() skips along, looking for a newline in the pattern. The sequence \n is still literal at this stage, so it does not terminate the comment. Only an actual character with the code value 0x0a (the default newline) does so. RECURSIVE PATTERNS Consider the problem of matching a string in parentheses, allowing for unlimited nested parentheses. Without the use of recursion, the best that can be done is to use a pattern that matches up to some fixed depth of nesting. It is not possible to handle an arbitrary nesting depth. For some time, Perl has provided a facility that allows regular expres- sions to recurse (amongst other things). It does this by interpolating Perl code in the expression at run time, and the code can refer to the expression itself. A Perl pattern using code interpolation to solve the parentheses problem can be created like this: $re = qr{\( (?: (?>[^()]+) | (?p{$re}) )* \)}x; The (?p{...}) item interpolates Perl code at run time, and in this case refers recursively to the pattern in which it appears. Obviously, PCRE2 cannot support the interpolation of Perl code. In- stead, it supports special syntax for recursion of the entire pattern, and also for individual capture group recursion. After its introduction in PCRE1 and Python, this kind of recursion was subsequently introduced into Perl at release 5.10. A special item that consists of (? followed by a number greater than zero and a closing parenthesis is a recursive subroutine call of the capture group of the given number, provided that it occurs inside that group. 
(If not, it is a non-recursive subroutine call, which is de- scribed in the next section.) The special item (?R) or (?0) is a recur- sive call of the entire regular expression. This PCRE2 pattern solves the nested parentheses problem (assume the PCRE2_EXTENDED option is set so that white space is ignored): \( ( [^()]++ | (?R) )* \) First it matches an opening parenthesis. Then it matches any number of substrings which can either be a sequence of non-parentheses, or a re- cursive match of the pattern itself (that is, a correctly parenthesized substring). Finally there is a closing parenthesis. Note the use of a possessive quantifier to avoid backtracking into sequences of non- parentheses. If this were part of a larger pattern, you would not want to recurse the entire pattern, so instead you could use this: ( \( ( [^()]++ | (?1) )* \) ) We have put the pattern into parentheses, and caused the recursion to refer to them instead of the whole pattern. In a larger pattern, keeping track of parenthesis numbers can be tricky. This is made easier by the use of relative references. Instead of (?1) in the pattern above you can write (?-2) to refer to the second most recently opened parentheses preceding the recursion. In other words, a negative number counts capturing parentheses leftwards from the point at which it is encountered. Be aware however, that if duplicate capture group numbers are in use, relative references refer to the earliest group with the appropriate number. Consider, for example: (?|(a)|(b)) (c) (?-2) The first two capture groups (a) and (b) are both numbered 1, and group (c) is number 2. When the reference (?-2) is encountered, the second most recently opened parentheses has the number 1, but it is the first such group (the (a) group) to which the recursion refers. This would be the same if an absolute reference (?1) was used. In other words, rela- tive references are just a shorthand for computing a group number. 
It is also possible to refer to subsequent capture groups, by writing references such as (?+2). However, these cannot be recursive because the reference is not inside the parentheses that are referenced. They are always non-recursive subroutine calls, as described in the next section. An alternative approach is to use named parentheses. The Perl syntax for this is (?&name); PCRE1's earlier syntax (?P>name) is also sup- ported. We could rewrite the above example as follows: (?<pn> \( ( [^()]++ | (?&pn) )* \) ) If there is more than one group with the same name, the earliest one is used. The example pattern that we have been looking at contains nested unlim- ited repeats, and so the use of a possessive quantifier for matching strings of non-parentheses is important when applying the pattern to strings that do not match. For example, when this pattern is applied to (aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa() it yields "no match" quickly. However, if a possessive quantifier is not used, the match runs for a very long time indeed because there are so many different ways the + and * repeats can carve up the subject, and all have to be tested before failure can be reported. At the end of a match, the values of capturing parentheses are those from the outermost level. If you want to obtain intermediate values, a callout function can be used (see below and the pcre2callout documenta- tion). If the pattern above is matched against (ab(cd)ef) the value for the inner capturing parentheses (numbered 2) is "ef", which is the last value taken on at the top level. If a capture group is not matched at the top level, its final captured value is unset, even if it was (temporarily) set at a deeper level during the matching process. Do not confuse the (?R) item with the condition (R), which tests for recursion. Consider this pattern, which matches text in angle brack- ets, allowing for arbitrary nesting. 
Only digits are allowed in nested brackets (that is, when recursing), whereas any characters are permit- ted at the outer level. < (?: (?(R) \d++ | [^<>]*+) | (?R)) * > In this pattern, (?(R) is the start of a conditional group, with two different alternatives for the recursive and non-recursive cases. The (?R) item is the actual recursive call. Differences in recursion processing between PCRE2 and Perl Some former differences between PCRE2 and Perl no longer exist. Before release 10.30, recursion processing in PCRE2 differed from Perl in that a recursive subroutine call was always treated as an atomic group. That is, once it had matched some of the subject string, it was never re-entered, even if it contained untried alternatives and there was a subsequent matching failure. (Historical note: PCRE implemented recursion before Perl did.) Starting with release 10.30, recursive subroutine calls are no longer treated as atomic. That is, they can be re-entered to try unused alter- natives if there is a matching failure later in the pattern. This is now compatible with the way Perl works. If you want a subroutine call to be atomic, you must explicitly enclose it in an atomic group. Supporting backtracking into recursions simplifies certain types of re- cursive pattern. For example, this pattern matches palindromic strings: ^((.)(?1)\2|.?)$ The second branch in the group matches a single central character in the palindrome when there are an odd number of characters, or nothing when there are an even number of characters, but in order to work it has to be able to try the second case when the rest of the pattern match fails. If you want to match typical palindromic phrases, the pat- tern has to ignore all non-word characters, which can be done like this: ^\W*+((.)\W*+(?1)\W*+\2|\W*+.?)\W*+$ If run with the PCRE2_CASELESS option, this pattern matches phrases such as "A man, a plan, a canal: Panama!". 
Note the use of the posses- sive quantifier *+ to avoid backtracking into sequences of non-word characters. Without this, PCRE2 takes a great deal longer (ten times or more) to match typical phrases, and Perl takes so long that you think it has gone into a loop. Another way in which PCRE2 and Perl used to differ in their recursion processing is in the handling of captured values. Formerly in Perl, when a group was called recursively or as a subroutine (see the next section), it had no access to any values that were captured outside the recursion, whereas in PCRE2 these values can be referenced. Consider this pattern: ^(.)(\1|a(?2)) This pattern matches "bab". The first capturing parentheses match "b", then in the second group, when the backreference \1 fails to match "b", the second alternative matches "a" and then recurses. In the recursion, \1 does now match "b" and so the whole match succeeds. This match used to fail in Perl, but in later versions (I tried 5.024) it now works. Groups as subroutines If the syntax for a recursive group call (either by number or by name) is used outside the parentheses to which it refers, it operates a bit like a subroutine in a programming language. More accurately, PCRE2 treats the referenced group as an independent subpattern which it tries to match at the current matching position. The called group may be de- fined before or after the reference. A numbered reference can be ab- solute or relative, as in these examples: (...(absolute)...)...(?2)... (...(relative)...)...(?-1)... (...(?+1)...(relative)... An earlier example pointed out that the pattern (sens|respons)e and \1ibility matches "sense and sensibility" and "response and responsibility", but not "sense and responsibility". If instead the pattern (sens|respons)e and (?1)ibility is used, it does match "sense and responsibility" as well as the other two strings. Another example is given in the discussion of DEFINE above. 
Like recursions, subroutine calls used to be treated as atomic, but this changed at PCRE2 release 10.30, so backtracking into subroutine calls can now occur. However, any capturing parentheses that are set during the subroutine call revert to their previous values afterwards. Processing options such as case-independence are fixed when a group is defined, so if it is used as a subroutine, such options cannot be changed for different calls. For example, consider this pattern: (abc)(?i:(?-1)) It matches "abcabc". It does not match "abcABC" because the change of processing option does not affect the called group. The behaviour of backtracking control verbs in groups when called as subroutines is described in the section entitled "Backtracking verbs in subroutines" below. Recursion and subroutines with returned capture groups Since PCRE2 10.47, recursion and subroutine calls may also specify a list of capture groups to return. This is a PCRE2 syntax extension not supported by Perl. The pattern matching recurses into the referenced expression as described above, however, when the recursion returns to the calling expression the subgroups captured during the recursion can be retained when the calling expression's context is restored. When used as a subroutine, this allows the subroutine's capture groups to be used as return values. Only the specific capture groups listed by the caller will be retained, using the following syntax: (?R(grouplist)) recurse whole pattern, returning capture groups (?n(grouplist)) ) (?+n(grouplist)) ) (?-n(grouplist)) ) call subroutine, returning capture groups (?&name(grouplist)) ) (?P>name(grouplist)) ) The list of capture groups "grouplist" is a comma-separated list of (absolute or relative) group numbers, and group names enclosed in sin- gle quotes or angle brackets. 
Here is an example which first uses the DEFINE condition to create a re-usable routine for matching a weekday, then calls that subroutine and retains the groups it captures for use later: (?x: # ignore whitespace for clarity # Define the routine "weekendday" which matches Saturday or # Sunday, and returns the Sat/Sun prefix as \k<short>. (?(DEFINE) (?<weekendday> (?|(?<short>Sat)urday|(?<short>Sun)day) ) ) # Call the routine. Matches "Saturday,Sat" or "Sunday,Sun". (?&weekendday(<short>)),\k<short> ) This feature is not available using the Oniguruma syntax \g<...> or \g'...' below. Oniguruma subroutine syntax For compatibility with Oniguruma, the non-Perl syntax \g followed by a name or a number enclosed either in angle brackets or single quotes, is an alternative syntax for calling a group as a subroutine, possibly re- cursively. Here are two of the examples used above, rewritten using this syntax: (?<pn> \( ( (?>[^()]+) | \g<pn> )* \) ) (sens|respons)e and \g'1'ibility PCRE2 supports an extension to Oniguruma: if a number is preceded by a plus or a minus sign it is taken as a relative reference. For example: (abc)(?i:\g<-1>) Note that \g{...} (Perl syntax) and \g<...> (Oniguruma syntax) are not synonymous. The former is a backreference; the latter is a subroutine call. CALLOUTS Perl has a feature whereby using the sequence (?{...}) causes arbitrary Perl code to be obeyed in the middle of matching a regular expression. This makes it possible, amongst other things, to extract different sub- strings that match the same pair of parentheses when there is a repeti- tion. PCRE2 provides a similar feature, but of course it cannot obey arbi- trary Perl code. The feature is called "callout". The caller of PCRE2 provides an external function by putting its entry point in a match context using the function pcre2_set_callout(), and then passing that context to pcre2_match() or pcre2_dfa_match(). 
If no match context is passed, or if the callout entry point is set to NULL, callout points will be passed over silently during matching. To disallow callouts in the pattern syntax, you may use the PCRE2_EXTRA_NEVER_CALLOUT option. Within a regular expression, (?C) indicates a point at which the external function is to be called. There are two kinds of callout: those with a numerical argument and those with a string argument. (?C) on its own with no argument is treated as (?C0). A numerical argument allows the application to distinguish between different callouts. String arguments were added for release 10.20 to make it possible for script languages that use PCRE2 to embed short scripts within patterns in a similar way to Perl. During matching, when PCRE2 reaches a callout point, the external func- tion is called. It is provided with the number or string argument of the callout, the position in the pattern, and one item of data that is also set in the match block. The callout function may cause matching to proceed, to backtrack, or to fail. By default, PCRE2 implements a number of optimizations at matching time, and one side-effect is that sometimes callouts are skipped. If you need all possible callouts to happen, you need to set options that disable the relevant optimizations. More details, including a complete description of the programming interface to the callout function, are given in the pcre2callout documentation. Callouts with numerical arguments If you just want to have a means of identifying different callout points, put a number less than 256 after the letter C. For example, this pattern has two callout points: (?C1)abc(?C2)def If the PCRE2_AUTO_CALLOUT flag is passed to pcre2_compile(), numerical callouts are automatically installed before each item in the pattern. They are all numbered 255. If there is a conditional group in the pat- tern whose condition is an assertion, an additional callout is inserted just before the condition. 
An explicit callout may also be set at this position, as in this example: (?(?C9)(?=a)abc|def) Note that this applies only to assertion conditions, not to other types of condition. Callouts with string arguments A delimited string may be used instead of a number as a callout argu- ment. The starting delimiter must be one of ` ' " ^ % # $ { and the ending delimiter is the same as the start, except for {, where the end- ing delimiter is }. If the ending delimiter is needed within the string, it must be doubled. For example: (?C'ab ''c'' d')xyz(?C{any text})pqr The doubling is removed before the string is passed to the callout function. BACKTRACKING CONTROL There are a number of special "Backtracking Control Verbs" (to use Perl's terminology) that modify the behaviour of backtracking during matching. They are generally of the form (*VERB) or (*VERB:NAME). Some verbs take either form, and may behave differently depending on whether or not a name argument is present. The names are not required to be unique within the pattern. By default, for compatibility with Perl, a name is any sequence of characters that does not include a closing parenthesis. The name is not processed in any way, and it is not possible to include a closing parenthesis in the name. This can be changed by setting the PCRE2_ALT_VERBNAMES option, but the result is no longer Perl-compati- ble. When PCRE2_ALT_VERBNAMES is set, backslash processing is applied to verb names and only an unescaped closing parenthesis terminates the name. However, the only backslash items that are permitted are \Q, \E, and sequences such as \x{100} that define character code points. Char- acter type escapes such as \d are faulted. A closing parenthesis can be included in a name either as \) or between \Q and \E. 
In addition to backslash processing, if the PCRE2_EXTENDED or PCRE2_EXTENDED_MORE option is also set, unescaped white space in verb names is skipped, and #-comments are recognized, exactly as in the rest of the pattern. PCRE2_EXTENDED and PCRE2_EXTENDED_MORE do not af- fect verb names unless PCRE2_ALT_VERBNAMES is also set. The maximum length of a name is 255 in the 8-bit library and 65535 in the 16-bit and 32-bit libraries. If the name is empty, that is, if the closing parenthesis immediately follows the colon, the effect is as if the colon were not there. Any number of these verbs may occur in a pat- tern. Except for (*ACCEPT), they may not be quantified. Since these verbs are specifically related to backtracking, most of them can be used only when the pattern is to be matched using the tra- ditional matching function or JIT, because they use backtracking algo- rithms. With the exception of (*FAIL), which behaves like a failing negative assertion, the backtracking control verbs cause an error if encountered by the DFA matching function. The behaviour of these verbs in repeated groups, assertions, and in capture groups called as subroutines (whether or not recursively) is documented below. Optimizations that affect backtracking verbs PCRE2 contains some optimizations that are used to speed up matching by running some checks at the start of each match attempt. For example, it may know the minimum length of matching subject, or that a particular character must be present. When one of these optimizations bypasses the running of a match, any included backtracking verbs will not, of course, be processed. You can suppress the start-of-match optimizations by setting the PCRE2_NO_START_OPTIMIZE option when calling pcre2_com- pile(), by calling pcre2_set_optimize() with a PCRE2_START_OPTIMIZE_OFF directive, or by starting the pattern with (*NO_START_OPT). There is more discussion of this option in the section entitled "Compiling a pattern" in the pcre2api documentation. 
Experiments with Perl suggest that it too has similar optimizations, and like PCRE2, turning them off can change the result of a match. Verbs that act immediately The following verbs act as soon as they are encountered. (*ACCEPT) or (*ACCEPT:NAME) This verb causes the match to end successfully, skipping the remainder of the pattern. However, when it is inside a capture group that is called as a subroutine, only that group is ended successfully. Matching then continues at the outer level. If (*ACCEPT) is triggered in a posi- tive assertion, the assertion succeeds; in a negative assertion, the assertion fails. If (*ACCEPT) is inside capturing parentheses, the data so far is cap- tured. For example: A((?:A|B(*ACCEPT)|C)D) This matches "AB", "AAD", or "ACD"; when it matches "AB", "B" is cap- tured by the outer parentheses. (*ACCEPT) is the only backtracking verb that is allowed to be quanti- fied because an ungreedy quantification with a minimum of zero acts only when a backtrack happens. Consider, for example, (A(*ACCEPT)??B)C where A, B, and C may be complex expressions. After matching "A", the matcher processes "BC"; if that fails, causing a backtrack, (*ACCEPT) is triggered and the match succeeds. In both cases, all but C is cap- tured. Whereas (*COMMIT) (see below) means "fail on backtrack", a re- peated (*ACCEPT) of this type means "succeed on backtrack". Warning: (*ACCEPT) should not be used within a script run group, be- cause it causes an immediate exit from the group, bypassing the script run checking. (*FAIL) or (*FAIL:NAME) This verb causes a matching failure, forcing backtracking to occur. It may be abbreviated to (*F). It is equivalent to (?!) but easier to read. The Perl documentation notes that it is probably useful only when combined with (?{}) or (??{}). Those are, of course, Perl features that are not present in PCRE2. 
The nearest equivalent is the callout fea- ture, as for example in this pattern: a+(?C)(*FAIL) A match with the string "aaaa" always fails, but the callout is taken before each backtrack happens (in this example, 10 times). (*ACCEPT:NAME) and (*FAIL:NAME) behave the same as (*MARK:NAME)(*AC- CEPT) and (*MARK:NAME)(*FAIL), respectively, that is, a (*MARK) is recorded just before the verb acts. Recording which path was taken There is one verb whose main purpose is to track how a match was ar- rived at, though it also has a secondary use in conjunction with ad- vancing the match starting point (see (*SKIP) below). (*MARK:NAME) or (*:NAME) A name is always required with this verb. For all the other backtrack- ing control verbs, a NAME argument is optional. When a match succeeds, the name of the last-encountered mark name on the matching path is passed back to the caller as described in the sec- tion entitled "Other information about the match" in the pcre2api docu- mentation. This applies to all instances of (*MARK) and other verbs, including those inside assertions and atomic groups. However, there are differences in those cases when (*MARK) is used in conjunction with (*SKIP) as described below. The mark name that was last encountered on the matching path is passed back. A verb without a NAME argument is ignored for this purpose. Here is an example of pcre2test output, where the "mark" modifier requests the retrieval and outputting of (*MARK) data: re> /X(*MARK:A)Y|X(*MARK:B)Z/mark data> XY 0: XY MK: A XZ 0: XZ MK: B The (*MARK) name is tagged with "MK:" in this output, and in this exam- ple it indicates which of the two alternatives matched. This is a more efficient way of obtaining this information than putting each alterna- tive in its own capturing parentheses. If a verb with a name is encountered in a positive assertion that is true, the name is recorded and passed back if it is the last-encoun- tered. 
This does not happen for negative assertions or failing positive assertions. After a partial match or a failed match, the last encountered name in the entire match process is returned. For example: re> /X(*MARK:A)Y|X(*MARK:B)Z/mark data> XP No match, mark = B Note that in this unanchored example the mark is retained from the match attempt that started at the letter "X" in the subject. Subsequent match attempts starting at "P" and then with an empty string do not get as far as the (*MARK) item, but nevertheless do not reset it. If you are interested in (*MARK) values after failed matches, you should probably either set the PCRE2_NO_START_OPTIMIZE option or call pcre2_set_optimize() with a PCRE2_START_OPTIMIZE_OFF directive (see above) to ensure that the match is always attempted. Verbs that act after backtracking The following verbs do nothing when they are encountered. Matching con- tinues with what follows, but if there is a subsequent match failure, causing a backtrack to the verb, a failure is forced. That is, back- tracking cannot pass to the left of the verb. However, when one of these verbs appears inside an atomic group or in an atomic lookaround assertion that is true, its effect is confined to that group, because once the group has been matched, there is never any backtracking into it. Backtracking from beyond an atomic assertion or group ignores the entire group, and seeks a preceding backtracking point. These verbs differ in exactly what kind of failure occurs when back- tracking reaches them. The behaviour described below is what happens when the verb is not in a subroutine or an assertion. Subsequent sec- tions cover these special cases. (*COMMIT) or (*COMMIT:NAME) This verb causes the whole match to fail outright if there is a later matching failure that causes backtracking to reach it. Even if the pat- tern is unanchored, no further attempts to find a match by advancing the starting point take place. 
If (*COMMIT) is the only backtracking verb that is encountered, once it has been passed pcre2_match() is com- mitted to finding a match at the current starting point, or not at all. For example: a+(*COMMIT)b This matches "xxaab" but not "aacaab". It can be thought of as a kind of dynamic anchor, or "I've started, so I must finish." The behaviour of (*COMMIT:NAME) is not the same as (*MARK:NAME)(*COM- MIT). It is like (*MARK:NAME) in that the name is remembered for pass- ing back to the caller. However, (*SKIP:NAME) searches only for names that are set with (*MARK), ignoring those set by any of the other back- tracking verbs. If there is more than one backtracking verb in a pattern, a different one that follows (*COMMIT) may be triggered first, so merely passing (*COMMIT) during a match does not always guarantee that a match must be at this starting point. Note that (*COMMIT) at the start of a pattern is not the same as an an- chor, unless PCRE2's start-of-match optimizations are turned off, as shown in this output from pcre2test: re> /(*COMMIT)abc/ data> xyzabc 0: abc data> re> /(*COMMIT)abc/no_start_optimize data> xyzabc No match For the first pattern, PCRE2 knows that any match must start with "a", so the optimization skips along the subject to "a" before applying the pattern to the first set of data. The match attempt then succeeds. The second pattern disables the optimization that skips along to the first character. The pattern is now applied starting at "x", and so the (*COMMIT) causes the match to fail without trying any other starting points. (*PRUNE) or (*PRUNE:NAME) This verb causes the match to fail at the current starting position in the subject if there is a later matching failure that causes backtrack- ing to reach it. If the pattern is unanchored, the normal "bumpalong" advance to the next starting character then happens. 
Backtracking can occur as usual to the left of (*PRUNE), before it is reached, or when matching to the right of (*PRUNE), but if there is no match to the right, backtracking cannot cross (*PRUNE). In simple cases, the use of (*PRUNE) is just an alternative to an atomic group or possessive quan- tifier, but there are some uses of (*PRUNE) that cannot be expressed in any other way. In an anchored pattern (*PRUNE) has the same effect as (*COMMIT). The behaviour of (*PRUNE:NAME) is not the same as (*MARK:NAME)(*PRUNE). It is like (*MARK:NAME) in that the name is remembered for passing back to the caller. However, (*SKIP:NAME) searches only for names set with (*MARK), ignoring those set by other backtracking verbs. (*SKIP) This verb, when given without a name, is like (*PRUNE), except that if the pattern is unanchored, the "bumpalong" advance is not to the next character, but to the position in the subject where (*SKIP) was encoun- tered. (*SKIP) signifies that whatever text was matched leading up to it cannot be part of a successful match if there is a later mismatch. Consider: a+(*SKIP)b If the subject is "aaaac...", after the first match attempt fails (starting at the first character in the string), the starting point skips on to start the next attempt at "c". Note that a possessive quan- tifier does not have the same effect as this example; although it would suppress backtracking during the first match attempt, the second at- tempt would start at the second character instead of skipping on to "c". If (*SKIP) is used to specify a new starting position that is the same as the starting position of the current match, or (by being inside a lookbehind) earlier, the position specified by (*SKIP) is ignored, and instead the normal "bumpalong" occurs. (*SKIP:NAME) When (*SKIP) has an associated name, its behaviour is modified. When such a (*SKIP) is triggered, the previous path through the pattern is searched for the most recent (*MARK) that has the same name. 
If one is found, the "bumpalong" advance is to the subject position that corre- sponds to that (*MARK) instead of to where (*SKIP) was encountered. If no (*MARK) with a matching name is found, the (*SKIP) is ignored. The search for a (*MARK) name uses the normal backtracking mechanism, which means that it does not see (*MARK) settings that are inside atomic groups or assertions, because they are never re-entered by back- tracking. Compare the following pcre2test examples: re> /a(?>(*MARK:X))(*SKIP:X)(*F)|(.)/ data: abc 0: a 1: a data: re> /a(?:(*MARK:X))(*SKIP:X)(*F)|(.)/ data: abc 0: b 1: b In the first example, the (*MARK) setting is in an atomic group, so it is not seen when (*SKIP:X) triggers, causing the (*SKIP) to be ignored. This allows the second branch of the pattern to be tried at the first character position. In the second example, the (*MARK) setting is not in an atomic group. This allows (*SKIP:X) to find the (*MARK) when it backtracks, and this causes a new matching attempt to start at the sec- ond character. This time, the (*MARK) is never seen because "a" does not match "b", so the matcher immediately jumps to the second branch of the pattern. Note that (*SKIP:NAME) searches only for names set by (*MARK:NAME). It ignores names that are set by other backtracking verbs. (*THEN) or (*THEN:NAME) This verb causes a skip to the next innermost alternative when back- tracking reaches it. That is, it cancels any further backtracking within the current alternative. Its name comes from the observation that it can be used for a pattern-based if-then-else block: ( COND1 (*THEN) FOO | COND2 (*THEN) BAR | COND3 (*THEN) BAZ ) ... If the COND1 pattern matches, FOO is tried (and possibly further items after the end of the group if FOO succeeds); on failure, the matcher skips to the second alternative and tries COND2, without backtracking into COND1. If that succeeds and BAR fails, COND3 is tried. 
If subse- quently BAZ fails, there are no more alternatives, so there is a back- track to whatever came before the entire group. If (*THEN) is not in- side an alternation, it acts like (*PRUNE). The behaviour of (*THEN:NAME) is not the same as (*MARK:NAME)(*THEN). It is like (*MARK:NAME) in that the name is remembered for passing back to the caller. However, (*SKIP:NAME) searches only for names set with (*MARK), ignoring those set by other backtracking verbs. A group that does not contain a | character is just a part of the en- closing alternative; it is not a nested alternation with only one al- ternative. The effect of (*THEN) extends beyond such a group to the en- closing alternative. Consider this pattern, where A, B, etc. are com- plex pattern fragments that do not contain any | characters at this level: A (B(*THEN)C) | D If A and B are matched, but there is a failure in C, matching does not backtrack into A; instead it moves to the next alternative, that is, D. However, if the group containing (*THEN) is given an alternative, it behaves differently: A (B(*THEN)C | (*FAIL)) | D The effect of (*THEN) is now confined to the inner group. After a fail- ure in C, matching moves to (*FAIL), which causes the whole group to fail because there are no more alternatives to try. In this case, matching does backtrack into A. Note that a conditional group is not considered as having two alterna- tives, because only one is ever used. In other words, the | character in a conditional group has a different meaning. Ignoring white space, consider: ^.*? (?(?=a) a | b(*THEN)c ) If the subject is "ba", this pattern does not match. Because .*? is un- greedy, it initially matches zero characters. The condition (?=a) then fails, the character "b" is matched, but "c" is not. At this point, matching does not backtrack to .*? as might perhaps be expected from the presence of the | character. 
The conditional group is part of the single alternative that comprises the whole pattern, and so the match fails. (If there was a backtrack into .*?, allowing it to match "b", the match would succeed.) The verbs just described provide four different "strengths" of control when subsequent matching fails. (*THEN) is the weakest, carrying on the match at the next alternative. (*PRUNE) comes next, failing the match at the current starting position, but allowing an advance to the next character (for an unanchored pattern). (*SKIP) is similar, except that the advance may be more than one character. (*COMMIT) is the strongest, causing the entire match to fail. More than one backtracking verb If more than one backtracking verb is present in a pattern, the one that is backtracked onto first acts. For example, consider this pat- tern, where A, B, etc. are complex pattern fragments: (A(*COMMIT)B(*THEN)C|ABD) If A matches but B fails, the backtrack to (*COMMIT) causes the entire match to fail. However, if A and B match, but C fails, the backtrack to (*THEN) causes the next alternative (ABD) to be tried. This behaviour is consistent, but is not always the same as Perl's. It means that if two or more backtracking verbs appear in succession, all but the last of them has no effect. Consider this example: ...(*COMMIT)(*PRUNE)... If there is a matching failure to the right, backtracking onto (*PRUNE) causes it to be triggered, and its action is taken. There can never be a backtrack onto (*COMMIT). Backtracking verbs in repeated groups PCRE2 sometimes differs from Perl in its handling of backtracking verbs in repeated groups. For example, consider: /(a(*COMMIT)b)+ac/ If the subject is "abac", Perl matches unless its optimizations are disabled, but PCRE2 always fails because the (*COMMIT) in the second repeat of the group acts. Backtracking verbs in assertions (*FAIL) in any assertion has its normal effect: it forces an immediate backtrack. 
The behaviour of the other backtracking verbs depends on whether or not the assertion is standalone or acting as the condition in a conditional group. (*ACCEPT) in a standalone positive assertion causes the assertion to succeed without any further processing; captured strings and a mark name (if set) are retained. In a standalone negative assertion, (*AC- CEPT) causes the assertion to fail without any further processing; cap- tured substrings and any mark name are discarded. If the assertion is a condition, (*ACCEPT) causes the condition to be true for a positive assertion and false for a negative one; captured substrings are retained in both cases. The remaining verbs act only when a later failure causes a backtrack to reach them. This means that, for the Perl-compatible assertions, their effect is confined to the assertion, because Perl lookaround assertions are atomic. A backtrack that occurs after such an assertion is complete does not jump back into the assertion. Note in particular that a (*MARK) name that is set in an assertion is not "seen" by an instance of (*SKIP:NAME) later in the pattern. PCRE2 now supports non-atomic positive assertions and also "scan sub- string" assertions, as described in the sections entitled "Non-atomic assertions" and "Scan substring assertions" above. These assertions must be standalone (not used as conditions). They are not Perl-compati- ble. For these assertions, a later backtrack does jump back into the assertion, and therefore verbs such as (*COMMIT) can be triggered by backtracks from later in the pattern. The effect of (*THEN) is not allowed to escape beyond an assertion. If there are no more branches to try, (*THEN) causes a positive assertion to be false, and a negative assertion to be true. This behaviour dif- fers from Perl when the assertion has only one branch. The other backtracking verbs are not treated specially if they appear in a standalone positive assertion. 
In a conditional positive asser- tion, backtracking (from within the assertion) into (*COMMIT), (*SKIP), or (*PRUNE) causes the condition to be false. However, for both stand- alone and conditional negative assertions, backtracking into (*COMMIT), (*SKIP), or (*PRUNE) causes the assertion to be true, without consider- ing any further alternative branches. Backtracking verbs in subroutines These behaviours occur whether or not the group is called recursively. (*ACCEPT) in a group called as a subroutine causes the subroutine match to succeed without any further processing. Matching then continues af- ter the subroutine call. Perl documents this behaviour. Perl's treat- ment of the other verbs in subroutines is different in some cases. (*FAIL) in a group called as a subroutine has its normal effect: it forces an immediate backtrack. (*COMMIT), (*SKIP), and (*PRUNE) cause the subroutine match to fail when triggered by being backtracked to in a group called as a subrou- tine. There is then a backtrack at the outer level. (*THEN), when triggered, skips to the next alternative in the innermost enclosing group that has alternatives (its normal behaviour). However, if there is no such group within the subroutine's group, the subroutine match fails and there is a backtrack at the outer level. EBCDIC ENVIRONMENTS Differences in the way PCRE2 behaves when it is running in an EBCDIC en- vironment are covered in this section. Escape sequences When PCRE2 is compiled in EBCDIC mode, \N{U+hhh..} is not supported. \a, \e, \f, \n, \r, and \t generate the appropriate EBCDIC code values. The \c escape is processed as specified for Perl in the perlebcdic doc- ument. The only characters that are allowed after \c are A-Z, a-z, or one of @, [, \, ], ^, _, or ?. Any other character provokes a compile- time error. 
The sequence \c@ encodes character code 0; after \c the letters (in either case) encode characters 1-26 (hex 01 to hex 1A); [, \, ], ^, and _ encode characters 27-31 (hex 1B to hex 1F), and \c? be- comes either 255 (hex FF) or 95 (hex 5F). Thus, apart from \c?, these escapes generate the same character code values as they do in an ASCII or Unicode environment, though the mean- ings of the values mostly differ. For example, \cG always generates code value 7, which is BEL in ASCII but DEL in EBCDIC. The sequence \c? generates DEL (127, hex 7F) in an ASCII environment, but because 127 is not a control character in EBCDIC, Perl makes it generate the APC character. Unfortunately, there are several variants of EBCDIC. In most of them the APC character has the value 255 (hex FF), but in the one Perl calls POSIX-BC its value is 95 (hex 5F). If certain other characters have POSIX-BC values, PCRE2 makes \c? generate 95; otherwise it generates 255. Character classes In character classes there is a special case in EBCDIC environments for ranges whose end points are both specified as literal letters in the same case. For compatibility with Perl, EBCDIC code points within the range that are not letters are omitted. For example, [h-k] matches only four characters, even though the EBCDIC codes for h and k are 0x88 and 0x92, a range of 11 code points. However, if the range is specified nu- merically, for example, [\x88-\x92] or [h-\x92], all code points are included. SEE ALSO pcre2api(3), pcre2callout(3), pcre2matching(3), pcre2syntax(3), pcre2(3). AUTHOR Philip Hazel Retired from University Computing Service Cambridge, England. REVISION Last updated: 03 September 2025 Copyright (c) 1997-2024 University of Cambridge. 
PCRE2 10.48-DEV 03 September 2025 PCRE2PATTERN(3) ------------------------------------------------------------------------------ PCRE2PERFORM(3) Library Functions Manual PCRE2PERFORM(3) NAME PCRE2 - Perl-compatible regular expressions (revised API) PCRE2 PERFORMANCE Two aspects of performance are discussed below: memory usage and pro- cessing time. The way you express your pattern as a regular expression can affect both of them. COMPILED PATTERN MEMORY USAGE Patterns are compiled by PCRE2 into a reasonably efficient interpretive code, so that most simple patterns do not use much memory for storing the compiled version. However, there is one case where the memory usage of a compiled pattern can be unexpectedly large. If a parenthesized group has a quantifier with a minimum greater than 1 and/or a limited maximum, the whole group is repeated in the compiled code. For example, the pattern (abc|def){2,4} is compiled as if it were (abc|def)(abc|def)((abc|def)(abc|def)?)? (Technical aside: It is done this way so that backtrack points within each of the repetitions can be independently maintained.) For regular expressions whose quantifiers use only small numbers, this is not usually a problem. However, if the numbers are large, and par- ticularly if such repetitions are nested, the memory usage can become an embarrassment. For example, the very simple pattern ((ab){1,1000}c){1,3} uses over 50KiB when compiled using the 8-bit library. When PCRE2 is compiled with its default internal pointer size of two bytes, the size limit on a compiled pattern is 65535 code units in the 8-bit and 16-bit libraries, and this is reached with the above pattern if the outer rep- etition is increased from 3 to 4. PCRE2 can be compiled to use larger internal pointers and thus handle larger compiled patterns, but it is better to try to rewrite your pattern to use less memory if you can. One way of reducing the memory usage for such patterns is to make use of PCRE2's "subroutine" facility. 
Re-writing the above pattern as ((ab)(?2){0,999}c)(?1){0,2} reduces the memory requirements to around 16KiB, and indeed it remains under 20KiB even with the outer repetition increased to 100. However, this kind of pattern is not always exactly equivalent, because any captures within subroutine calls are lost when the subroutine completes. If this is not a problem, this kind of rewriting will allow you to process patterns that PCRE2 cannot otherwise handle. The matching performance of the two different versions of the pattern is roughly the same. (This applies from release 10.30 - things were different in earlier releases.) STACK AND HEAP USAGE AT RUN TIME From release 10.30, the interpretive (non-JIT) version of pcre2_match() uses very little system stack at run time. In earlier releases recursive function calls could use a great deal of stack, and this could cause problems, but this usage has been eliminated. Backtracking positions are now explicitly remembered in memory frames controlled by the code. The size of each frame depends on the size of pointer variables and the number of capturing parenthesized groups in the pattern being matched. On a 64-bit system the frame size for a pattern with no captures is 128 bytes. For each capturing group the size increases by 16 bytes. Until release 10.41, an initial 20KiB frames vector was allocated on the system stack, but this still caused some issues for multi-thread applications where each thread has a very small stack. From release 10.41 backtracking memory frames are always held in heap memory. An initial heap allocation is obtained the first time any match data block is passed to pcre2_match(). This is remembered with the match data block and re-used if that block is used for another match. It is freed when the match data block itself is freed.
The size of the initial block is the larger of 20KiB or ten times the pattern's frame size, unless the heap limit is less than this, in which case the heap limit is used. If the initial block proves to be too small during matching, it is replaced by a larger block, subject to the heap limit. The heap limit is checked only when a new block is to be allocated. Reducing the heap limit between calls to pcre2_match() with the same match data block does not affect the saved block. In contrast to pcre2_match(), pcre2_dfa_match() does use recursive function calls, but only for processing atomic groups, lookaround as- sertions, and recursion within the pattern. The original version of the code used to allocate quite large internal workspace vectors on the stack, which caused some problems for some patterns in environments with small stacks. From release 10.32 the code for pcre2_dfa_match() has been re-factored to use heap memory when necessary for internal workspace when recursing, though recursive function calls are still used. The "match depth" parameter can be used to limit the depth of function recursion, and the "match heap" parameter to limit heap memory in pcre2_dfa_match(). PROCESSING TIME Certain items in regular expression patterns are processed more effi- ciently than others. It is more efficient to use a character class like [aeiou] than a set of single-character alternatives such as (a|e|i|o|u). In general, the simplest construction that provides the required behaviour is usually the most efficient. Jeffrey Friedl's book contains a lot of useful general discussion about optimizing regular expressions for efficient performance. This document contains a few ob- servations about PCRE2. Using Unicode character properties (the \p, \P, and \X escapes) is slow, because PCRE2 has to use a multi-stage table lookup whenever it needs a character's property. If you can find an alternative pattern that does not use character properties, it will probably be faster. 
By default, the escape sequences \b, \d, \s, and \w, and the POSIX character classes such as [:alpha:] do not use Unicode properties, partly for backwards compatibility, and partly for performance reasons. However, you can set the PCRE2_UCP option or start the pattern with (*UCP) if you want Unicode character properties to be used. This can double the matching time for items such as \d, when matched with pcre2_match(); the performance loss is less with a DFA matching func- tion, and in both cases there is not much difference for \b. When a pattern begins with .* not in atomic parentheses, nor in paren- theses that are the subject of a backreference, and the PCRE2_DOTALL option is set, the pattern is implicitly anchored by PCRE2, since it can match only at the start of a subject string. If the pattern has multiple top-level branches, they must all be anchorable. The optimiza- tion can be disabled by the PCRE2_NO_DOTSTAR_ANCHOR option, and is au- tomatically disabled if the pattern contains (*PRUNE) or (*SKIP). If PCRE2_DOTALL is not set, PCRE2 cannot make this optimization, be- cause the dot metacharacter does not then match a newline, and if the subject string contains newlines, the pattern may match from the char- acter immediately following one of them instead of from the very start. For example, the pattern .*second matches the subject "first\nand second" (where \n stands for a newline character), with the match starting at the seventh character. In order to do this, PCRE2 has to retry the match starting after every newline in the subject. If you are using such a pattern with subject strings that do not con- tain newlines, the best performance is obtained by setting PCRE2_DOTALL, or starting the pattern with ^.* or ^.*? to indicate ex- plicit anchoring. That saves PCRE2 from having to scan along the sub- ject looking for a newline to restart at. Beware of patterns that contain nested indefinite repeats. 
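These can take a long time to run when applied to a string that does not match. Consider the pattern fragment ^(a+)* This can match "aaaa" in 16 different ways, and this number increases very rapidly as the string gets longer. (The * repeat can match 0, 1, 2, 3, or 4 times, and for each of those cases other than 0 or 4, the + repeats can match different numbers of times.) When the remainder of the pattern is such that the entire match is going to fail, PCRE2 has in principle to try every possible variation, and this can take an extremely long time, even for relatively short strings. An optimization catches some of the more simple cases such as (a+)*b where a literal character follows. Before embarking on the standard matching procedure, PCRE2 checks that there is a "b" later in the subject string, and if there is not, it fails the match immediately. However, when there is no following literal this optimization cannot be used. You can see the difference by comparing the behaviour of (a+)*\d with the pattern above. The pattern above, (a+)*b, gives a failure almost instantly when applied to a whole line of "a" characters, whereas (a+)*\d takes an appreciable time with strings longer than about 20 characters. In many cases, the solution to this kind of performance issue is to use an atomic group or a possessive quantifier. This can often reduce memory requirements as well. As another example, consider this pattern: ([^<]|<(?!inet))+ It matches from wherever it starts until it encounters "<inet" or the end of the data, and is the kind of pattern that might be used when processing an XML file. Each iteration of the outer parentheses matches either one character that is not "<" or a "<" that is not followed by "inet". However, each time a parenthesis is processed, a backtracking position is passed, so this formulation uses a memory frame for each matched character. For a long string, a lot of memory is required. Consider now this rewritten pattern, which matches exactly the same strings: ([^<]++|<(?!inet))+ This runs much faster, because sequences of characters that do not contain "<" are "swallowed" in one item inside the parentheses, and a possessive quantifier is used to stop any backtracking into the runs of non-"<" characters. This version also uses a lot less memory because entry to a new set of parentheses happens only when a "<" character that is not followed by "inet" is encountered (and we assume this is relatively rare). This example shows that one way of optimizing performance when matching long subject strings is to write repeated parenthesized subpatterns to match more than one character whenever possible. SETTING RESOURCE LIMITS You can set limits on the amount of processing that takes place when matching, and on the amount of heap memory that is used. The default values of the limits are very large, and unlikely ever to operate. They can be changed when PCRE2 is built, and they can also be set when pcre2_match() or pcre2_dfa_match() is called. For details of these interfaces, see the pcre2build documentation and the section entitled "The match context" in the pcre2api documentation. The pcre2test test program has a modifier called "find_limits" which, if applied to a subject line, causes it to find the smallest limits that allow a pattern to match. This is done by repeatedly matching with different limits. ------------------------------------------------------------------------------ PCRE2POSIX(3) Library Functions Manual PCRE2POSIX(3) NAME PCRE2 - Perl-compatible regular expressions (revised API) SYNOPSIS #include <pcre2posix.h> int pcre2_regcomp(regex_t *preg, const char *pattern, int cflags); int pcre2_regexec(const regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags); size_t pcre2_regerror(int errcode, const regex_t *preg, char *errbuf, size_t errbuf_size); void pcre2_regfree(regex_t *preg); DESCRIPTION This set of functions provides a POSIX-style API for the PCRE2 regular expression 8-bit library. There are no POSIX-style wrappers for PCRE2's 16-bit and 32-bit libraries.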
See the pcre2api documentation for a description of PCRE2's native API, which contains much additional functionality. IMPORTANT NOTE: The functions described here are NOT thread-safe, and should not be used in multi-threaded applications. They are also limited to processing subjects that are not bigger than 2GB. Use the native API instead. These functions are wrapper functions that ultimately call the PCRE2 native API. Their prototypes are defined in the pcre2posix.h header file, and they all have unique names starting with pcre2_. However, the pcre2posix.h header also contains macro definitions that convert the standard POSIX names such as regcomp() into pcre2_regcomp() etc. This means that a program can use the usual POSIX names without running the risk of accidentally linking with POSIX functions from a different library. On Unix-like systems the PCRE2 POSIX library is called libpcre2-posix, so can be accessed by adding -lpcre2-posix to the command for linking an application. Because the POSIX functions call the native ones, it is also necessary to add -lpcre2-8. On Windows systems, if you are linking to a DLL version of the library, it is recommended that PCRE2POSIX_SHARED is defined before including the pcre2posix.h header, as it will allow for a more efficient way to invoke the functions by adding the __declspec(dllimport) decorator. Although they were not defined as prototypes in pcre2posix.h, releases 10.33 to 10.36 of the library contained functions with the POSIX names regcomp() etc. These simply passed their arguments to the PCRE2 functions. These functions were provided for backwards compatibility with earlier versions of PCRE2, which had only POSIX names. However, this has proved troublesome in situations where a program links with several libraries, some of which use PCRE2's POSIX interface while others use the real POSIX functions. For this reason, the POSIX names have been removed since release 10.37.
Calling the header file pcre2posix.h avoids any conflict with other POSIX libraries. It can, of course, be renamed or aliased as regex.h, which is the "correct" name, if there is no clash. It provides two structure types, regex_t for compiled internal forms, and regmatch_t for returning captured substrings. It also defines some constants whose names start with "REG_"; these are used for setting options and identi- fying error codes. USING THE POSIX FUNCTIONS Note that these functions are just POSIX-style wrappers for PCRE2's na- tive API. They do not give POSIX regular expression behaviour, and they are not thread-safe or even POSIX compatible. Those POSIX option bits that can reasonably be mapped to PCRE2 native options have been implemented. In addition, the option REG_EXTENDED is defined with the value zero. This has no effect, but since programs that are written to the POSIX interface often use it, this makes it easier to slot in PCRE2 as a replacement library. Other POSIX options are not even defined. There are also some options that are not defined by POSIX. These have been added at the request of users who want to make use of certain PCRE2-specific features via the POSIX calling interface or to add BSD or GNU functionality. When PCRE2 is called via these functions, it is only the API that is POSIX-like in style. The syntax and semantics of the regular expres- sions themselves are still those of Perl, subject to the setting of various PCRE2 options, as described below. "POSIX-like in style" means that the API approximates to the POSIX definition; it is not fully POSIX-compatible, and in multi-unit encoding domains it is probably even less compatible. The descriptions below use the actual names of the functions, but, as described above, the standard POSIX names (without the pcre2_ prefix) may also be used. COMPILING A PATTERN The function pcre2_regcomp() is called to compile a pattern into an in- ternal form. 
By default, the pattern is a C string terminated by a bi- nary zero (but see REG_PEND below). The preg argument is a pointer to a regex_t structure that is used as a base for storing information about the compiled regular expression. It is also used for input when REG_PEND is set. The regex_t structure used by pcre2_regcomp() is de- fined in pcre2posix.h and is not the same as the structure used by other libraries that provide POSIX-style matching. The argument cflags is either zero, or contains one or more of the bits defined by the following macros: REG_DOTALL The PCRE2_DOTALL option is set when the regular expression is passed for compilation to the native function. Note that REG_DOTALL is not part of the POSIX standard. REG_ICASE The PCRE2_CASELESS option is set when the regular expression is passed for compilation to the native function. REG_NEWLINE The PCRE2_MULTILINE option is set when the regular expression is passed for compilation to the native function. Note that this does not mimic the defined POSIX behaviour for REG_NEWLINE (see the following sec- tion). REG_NOSPEC The PCRE2_LITERAL option is set when the regular expression is passed for compilation to the native function. This disables all meta charac- ters in the pattern, causing it to be treated as a literal string. The only other options that are allowed with REG_NOSPEC are REG_ICASE, REG_NOSUB, REG_PEND, and REG_UTF. Note that REG_NOSPEC is not part of the POSIX standard. REG_NOSUB When a pattern that is compiled with this flag is passed to pcre2_regexec() for matching, the nmatch and pmatch arguments are ig- nored, and no captured strings are returned. Versions of the PCRE2 li- brary prior to 10.22 used to set the PCRE2_NO_AUTO_CAPTURE compile op- tion, but this no longer happens because it disables the use of back- references. 
REG_PEND If this option is set, the re_endp field in the preg structure (which has the type const char *) must be set to point to the character beyond the end of the pattern before calling pcre2_regcomp(). The pattern itself may now contain binary zeros, which are treated as data characters. Without REG_PEND, a binary zero terminates the pattern and the re_endp field is ignored. This is a GNU extension to the POSIX standard and should be used with caution in software intended to be portable to other systems. REG_UCP The PCRE2_UCP option is set when the regular expression is passed for compilation to the native function. This causes PCRE2 to use Unicode properties when matching \d, \w, etc., instead of just recognizing ASCII values. Note that REG_UCP is not part of the POSIX standard. REG_UNGREEDY The PCRE2_UNGREEDY option is set when the regular expression is passed for compilation to the native function. Note that REG_UNGREEDY is not part of the POSIX standard. REG_UTF The PCRE2_UTF option is set when the regular expression is passed for compilation to the native function. This causes the pattern itself and all data strings used for matching it to be treated as UTF-8 strings. Note that REG_UTF is not part of the POSIX standard. In the absence of these flags, no options are passed to the native function. This means that the regex is compiled with PCRE2 default semantics. In particular, the way it handles newline characters in the subject string is the Perl way, not the POSIX way. Note that setting PCRE2_MULTILINE has only some of the effects specified for REG_NEWLINE. It does not affect the way newlines are matched by the dot metacharacter (they are not) or by a negative class such as [^a] (they are). The yield of pcre2_regcomp() is zero on success, and non-zero otherwise.
The preg structure is filled in on success, and one other member of the structure (as well as re_endp) is public: re_nsub contains the number of capturing subpatterns in the regular expression. Various er- ror codes are defined in the header file. NOTE: If the yield of pcre2_regcomp() is non-zero, you must not attempt to use the contents of the preg structure. If, for example, you pass it to pcre2_regexec(), the result is undefined and your program is likely to crash. MATCHING NEWLINE CHARACTERS This area is not simple, because POSIX and Perl take different views of things. It is not possible to get PCRE2 to obey POSIX semantics, but then PCRE2 was never intended to be a POSIX engine. The following table lists the different possibilities for matching newline characters in Perl and PCRE2: Default Change with . matches newline no PCRE2_DOTALL newline matches [^a] yes not changeable $ matches \n at end yes PCRE2_DOLLAR_ENDONLY $ matches \n in middle no PCRE2_MULTILINE ^ matches \n in middle no PCRE2_MULTILINE This is the equivalent table for a POSIX-compatible pattern matcher: Default Change with . matches newline yes REG_NEWLINE newline matches [^a] yes REG_NEWLINE $ matches \n at end no REG_NEWLINE $ matches \n in middle no REG_NEWLINE ^ matches \n in middle no REG_NEWLINE This behaviour is not what happens when PCRE2 is called via its POSIX API. By default, PCRE2's behaviour is the same as Perl's, except that there is no equivalent for PCRE2_DOLLAR_ENDONLY in Perl. In both PCRE2 and Perl, there is no way to stop newline from matching [^a]. Default POSIX newline handling can be obtained by setting PCRE2_DOTALL and PCRE2_DOLLAR_ENDONLY when calling pcre2_compile() directly, but there is no way to make PCRE2 behave exactly as for the REG_NEWLINE ac- tion. When using the POSIX API, passing REG_NEWLINE to PCRE2's pcre2_regcomp() function causes PCRE2_MULTILINE to be passed to pcre2_compile(), and REG_DOTALL passes PCRE2_DOTALL. 
There is no way to pass PCRE2_DOLLAR_ENDONLY. MATCHING A PATTERN The function pcre2_regexec() is called to match a compiled pattern preg against a given string, which is by default terminated by a zero byte (but see REG_STARTEND below), subject to the options in eflags. These can be: REG_NOTBOL The PCRE2_NOTBOL option is set when calling the underlying PCRE2 matching function. REG_NOTEMPTY The PCRE2_NOTEMPTY option is set when calling the underlying PCRE2 matching function. Note that REG_NOTEMPTY is not part of the POSIX standard. However, setting this option can give more POSIX-like behaviour in some situations. REG_NOTEOL The PCRE2_NOTEOL option is set when calling the underlying PCRE2 matching function. REG_STARTEND When this option is set, the subject string starts at string + pmatch[0].rm_so and ends at string + pmatch[0].rm_eo, which should point to the first character beyond the string. There may be binary zeros within the subject string, and indeed, using REG_STARTEND is the only way to pass a subject string that contains a binary zero. Whatever the value of pmatch[0].rm_so, the offsets of the matched string and any captured substrings are still given relative to the start of string itself. (Before PCRE2 release 10.30 these were given relative to string + pmatch[0].rm_so, but this differs from other implementations.) This is a BSD extension, compatible with but not specified by IEEE Standard 1003.2 (POSIX.2), and should be used with caution in software intended to be portable to other systems. Note that a non-zero rm_so does not imply REG_NOTBOL; REG_STARTEND affects only the location and length of the string, not how it is matched. Setting REG_STARTEND and passing pmatch as NULL are mutually exclusive; the error REG_INVARG is returned. If the pattern was compiled with the REG_NOSUB flag, no data about any matched strings is returned. The nmatch and pmatch arguments of pcre2_regexec() are ignored (except possibly as input for REG_STARTEND).
The value of nmatch may be zero, and the value of pmatch may be NULL (unless REG_STARTEND is set); in both these cases no data about any matched strings is returned. Otherwise, the portion of the string that was matched, and also any captured substrings, are returned via the pmatch argument, which points to an array of nmatch structures of type regmatch_t, containing the members rm_so and rm_eo. These contain the byte offset to the first character of each substring and the offset to the first character after the end of each substring, respectively. The 0th element of the vector relates to the entire portion of string that was matched; subsequent elements relate to the capturing subpatterns of the regular expression. Unused entries in the array have both structure members set to -1. regmatch_t as well as the regoff_t typedef it uses are defined in pcre2posix.h and are not warranted to have the same size or layout as other similarly named types from other libraries that provide POSIX-style matching. A successful match yields a zero return; various error codes are defined in the header file, of which REG_NOMATCH is the "expected" failure code. ERROR MESSAGES The pcre2_regerror() function maps a non-zero errorcode from either pcre2_regcomp() or pcre2_regexec() to a printable message. If preg is not NULL, the error should have arisen from the use of that structure. A message terminated by a binary zero is placed in errbuf. If the buffer is too short, only the first errbuf_size - 1 characters of the error message are used. The yield of the function is the size of buffer needed to hold the whole message, including the terminating zero. This value is greater than errbuf_size if the message was truncated. MEMORY USAGE Compiling a regular expression causes memory to be allocated and associated with the preg structure. The function pcre2_regfree() frees all such memory, after which preg may no longer be used as a compiled expression.
AUTHOR Philip Hazel Retired from University Computing Service Cambridge, England. REVISION Last updated: 27 November 2024 Copyright (c) 1997-2024 University of Cambridge. PCRE2 10.48-DEV 27 November 2024 PCRE2POSIX(3) ------------------------------------------------------------------------------ PCRE2SAMPLE(3) Library Functions Manual PCRE2SAMPLE(3) NAME PCRE2 - Perl-compatible regular expressions (revised API) PCRE2 SAMPLE PROGRAM A simple, complete demonstration program to get you started with using PCRE2 is supplied in the file pcre2demo.c in the src directory in the PCRE2 distribution. A listing of this program is given in the pcre2demo documentation. If you do not have a copy of the PCRE2 distribution, you can save this listing to recreate the contents of pcre2demo.c. The demonstration program compiles the regular expression that is its first argument, and matches it against the subject string in its second argument. No PCRE2 options are set, and default character tables are used. If matching succeeds, the program outputs the portion of the sub- ject that matched, together with the contents of any captured sub- strings. If the -g option is given on the command line, the program then goes on to check for further matches of the same regular expression in the same subject string. The logic is a little bit tricky because of the possi- bility of matching an empty string. Comments in the code explain what is going on. The code in pcre2demo.c is an 8-bit program that uses the PCRE2 8-bit library. It handles strings and characters that are stored in 8-bit code units. By default, one character corresponds to one code unit, but if the pattern starts with "(*UTF)", both it and the subject are treated as UTF-8 strings, where characters may occupy multiple code units. 
If PCRE2 is installed in the standard include and library directories for your operating system, you should be able to compile the demonstra- tion program using a command like this: cc -o pcre2demo pcre2demo.c -lpcre2-8 If PCRE2 is installed elsewhere, you may need to add additional options to the command line. For example, on a Unix-like system that has PCRE2 installed in /usr/local, you can compile the demonstration program us- ing a command like this: cc -o pcre2demo -I/usr/local/include pcre2demo.c \ -L/usr/local/lib -lpcre2-8 Once you have built the demonstration program, you can run simple tests like this: ./pcre2demo 'cat|dog' 'the cat sat on the mat' ./pcre2demo -g 'cat|dog' 'the dog sat on the cat' ./pcre2demo -i 'cat' 'the dog sat on the CAT' Note that there is a much more comprehensive test program, called pcre2test, which supports many more facilities for testing regular ex- pressions using all three PCRE2 libraries (8-bit, 16-bit, and 32-bit, though not all three need be installed). The pcre2demo program is pro- vided as a relatively simple coding example. If you try to run pcre2demo when PCRE2 is not installed in the standard library directory, you may get an error like this on some operating systems (e.g. Solaris): ld.so.1: pcre2demo: fatal: libpcre2-8.so.0: open failed: No such file or directory This is caused by the way shared library support works on those sys- tems. You need to add -R/usr/local/lib (for example) to the compile command to get round this problem. AUTHOR Philip Hazel Retired from University Computing Service Cambridge, England. REVISION Last updated: 28 February 2025 Copyright (c) 1997-2016 University of Cambridge. 
PCRE2 10.48-DEV 28 February 2025 PCRE2SAMPLE(3) ------------------------------------------------------------------------------ PCRE2SERIALIZE(3) Library Functions Manual PCRE2SERIALIZE(3) NAME PCRE2 - Perl-compatible regular expressions (revised API) SAVING AND RE-USING PRECOMPILED PCRE2 PATTERNS int32_t pcre2_serialize_decode(pcre2_code **codes, int32_t number_of_codes, const uint8_t *bytes, pcre2_general_context *gcontext); int32_t pcre2_serialize_encode(const pcre2_code **codes, int32_t number_of_codes, uint8_t **serialized_bytes, PCRE2_SIZE *serialized_size, pcre2_general_context *gcontext); void pcre2_serialize_free(uint8_t *bytes); int32_t pcre2_serialize_get_number_of_codes(const uint8_t *bytes); If you are running an application that uses a large number of regular expression patterns, it may be useful to store them in a precompiled form instead of having to compile them every time the application is run. However, if you are using the just-in-time optimization feature, it is not possible to save and reload the JIT data, because it is posi- tion-dependent. The host on which the patterns are reloaded must be running the same version of PCRE2, with the same code unit width, and must also have the same endianness, pointer width and PCRE2_SIZE type. For example, patterns compiled on a 32-bit system using PCRE2's 16-bit library cannot be reloaded on a 64-bit system, nor can they be reloaded using the 8-bit library. Note that "serialization" in PCRE2 does not convert compiled patterns to an abstract format like Java or .NET serialization. The serialized output is really just a bytecode dump, which is why it can only be re- loaded in the same environment as the one that created it. Hence the restrictions mentioned above. Applications that are not statically linked with a fixed version of PCRE2 must be prepared to recompile pat- terns from their sources, in order to be immune to PCRE2 upgrades. 
SECURITY CONCERNS The facility for saving and restoring compiled patterns is intended for use within individual applications. As such, the data supplied to pcre2_serialize_decode() is expected to be trusted data, not data from arbitrary external sources. There is only some simple consistency checking, not complete validation of what is being re-loaded. Corrupted data may cause undefined results. For example, if the length field of a pattern in the serialized data is corrupted, the deserializing code may read beyond the end of the byte stream that is passed to it. SAVING COMPILED PATTERNS Before compiled patterns can be saved they must be serialized, which in PCRE2 means converting the pattern to a stream of bytes. A single byte stream may contain any number of compiled patterns, but they must all use the same character tables. A single copy of the tables is included in the byte stream (its size is 1088 bytes). For more details of char- acter tables, see the section on locale support in the pcre2api docu- mentation. The function pcre2_serialize_encode() creates a serialized byte stream from a list of compiled patterns. Its first two arguments specify the list, being a pointer to a vector of pointers to compiled patterns, and the length of the vector. The third and fourth arguments point to vari- ables which are set to point to the created byte stream and its length, respectively. The final argument is a pointer to a general context, which can be used to specify custom memory management functions. If this argument is NULL, malloc() is used to obtain memory for the byte stream. 
The yield of the function is the number of serialized patterns, or one of the following negative error codes: PCRE2_ERROR_BADDATA the number of patterns is zero or less PCRE2_ERROR_BADMAGIC mismatch of id bytes in one of the patterns PCRE2_ERROR_NOMEMORY memory allocation failed PCRE2_ERROR_MIXEDTABLES the patterns do not all use the same tables PCRE2_ERROR_NULL the 1st, 3rd, or 4th argument is NULL PCRE2_ERROR_BADMAGIC means either that a pattern's code has been corrupted, or that a slot in the vector does not point to a compiled pattern. Once a set of patterns has been serialized you can save the data in any appropriate manner. Here is sample code that compiles two patterns and writes them to a file. It assumes that the variable fd refers to a file that is open for output. The error checking that should be present in a real application has been omitted for simplicity. int errorcode; uint8_t *bytes; PCRE2_SIZE erroroffset; PCRE2_SIZE bytescount; pcre2_code *list_of_codes[2]; list_of_codes[0] = pcre2_compile("first pattern", PCRE2_ZERO_TERMINATED, 0, &errorcode, &erroroffset, NULL); list_of_codes[1] = pcre2_compile("second pattern", PCRE2_ZERO_TERMINATED, 0, &errorcode, &erroroffset, NULL); errorcode = pcre2_serialize_encode(list_of_codes, 2, &bytes, &bytescount, NULL); errorcode = fwrite(bytes, 1, bytescount, fd); Note that the serialized data is binary data that may contain any of the 256 possible byte values. On systems that make a distinction between binary and non-binary data, be sure that the file is opened for binary output. Serializing a set of patterns leaves the original data untouched, so they can still be used for matching. Their memory must eventually be freed in the usual way by calling pcre2_code_free(). When you have finished with the byte stream, it too must be freed by calling pcre2_serialize_free(). If this function is called with a NULL argument, it returns immediately without doing anything.
RE-USING PRECOMPILED PATTERNS In order to re-use a set of saved patterns you must first make the se- rialized byte stream available in main memory (for example, by reading from a file). The management of this memory block is up to the applica- tion. You can use the pcre2_serialize_get_number_of_codes() function to find out how many compiled patterns are in the serialized data without actually decoding the patterns: uint8_t *bytes = <serialized data>; int32_t number_of_codes = pcre2_serialize_get_number_of_codes(bytes); The pcre2_serialize_decode() function reads a byte stream and recreates the compiled patterns in new memory blocks, setting pointers to them in a vector. The first two arguments are a pointer to a suitable vector and its length, and the third argument points to a byte stream. The fi- nal argument is a pointer to a general context, which can be used to specify custom memory management functions for the decoded patterns. If this argument is NULL, malloc() and free() are used. After deserializa- tion, the byte stream is no longer needed and can be discarded. pcre2_code *list_of_codes[2]; uint8_t *bytes = <serialized data>; int32_t number_of_codes = pcre2_serialize_decode(list_of_codes, 2, bytes, NULL); If the vector is not large enough for all the patterns in the byte stream, it is filled with those that fit, and the remainder are ig- nored. The yield of the function is the number of decoded patterns, or one of the following negative error codes: PCRE2_ERROR_BADDATA second argument is zero or less PCRE2_ERROR_BADMAGIC mismatch of id bytes in the data PCRE2_ERROR_BADMODE mismatch of code unit size or PCRE2 version PCRE2_ERROR_BADSERIALIZEDDATA other sanity check failure PCRE2_ERROR_NOMEMORY memory allocation failed PCRE2_ERROR_NULL first or third argument is NULL PCRE2_ERROR_BADMAGIC may mean that the data is corrupt, or that it was compiled on a system with different endianness. Decoded patterns can be used for matching in the usual way, and must be freed by calling pcre2_code_free().
However, be aware that there is a potential race issue if you are using multiple patterns that were de- coded from a single byte stream in a multithreaded application. A sin- gle copy of the character tables is used by all the decoded patterns and a reference count is used to arrange for its memory to be automati- cally freed when the last pattern is freed, but there is no locking on this reference count. Therefore, if you want to call pcre2_code_free() for these patterns in different threads, you must arrange your own locking, and ensure that pcre2_code_free() cannot be called by two threads at the same time. If a pattern was processed by pcre2_jit_compile() before being serial- ized, the JIT data is discarded and so is no longer available after a save/restore cycle. You can, however, process a restored pattern with pcre2_jit_compile() if you wish. AUTHOR Philip Hazel Retired from University Computing Service Cambridge, England. REVISION Last updated: 19 January 2024 Copyright (c) 1997-2018 University of Cambridge. PCRE2 10.48-DEV 19 January 2024 PCRE2SERIALIZE(3) ------------------------------------------------------------------------------ PCRE2SYNTAX(3) Library Functions Manual PCRE2SYNTAX(3) NAME PCRE2 - Perl-compatible regular expressions (revised API) PCRE2 REGULAR EXPRESSION SYNTAX SUMMARY The full syntax and semantics of the regular expression patterns that are supported by PCRE2 are described in the pcre2pattern documentation. This document contains a quick-reference summary of the pattern syntax followed by the syntax of replacement strings in substitution function. The full description of the latter is in the pcre2api documentation. QUOTING \x where x is non-alphanumeric is a literal x \Q...\E treat enclosed characters as literal Note that white space inside \Q...\E is always treated as literal, even if PCRE2_EXTENDED is set, causing most other white space to be ignored. Note also that PCRE2's handling of \Q...\E has some differences from Perl's. 
See the pcre2pattern documentation for details. BRACED ITEMS With one exception, wherever brace characters { and } are required to enclose data for constructions such as \g{2} or \k{name}, space and/or horizontal tab characters that follow { or precede } are allowed and are ignored. In the case of quantifiers, they may also appear before or after the comma. The exception is \u{...} which is not Perl-compatible and is recognized only when PCRE2_EXTRA_ALT_BSUX is set. This is an EC- MAScript compatibility feature, and follows ECMAScript's behaviour. ESCAPED CHARACTERS This table applies to ASCII and Unicode environments. An unrecognized escape sequence causes an error. \a alarm, that is, the BEL character (hex 07) \cx "control-x", where x is a non-control ASCII character \e escape (hex 1B) \f form feed (hex 0C) \n newline (hex 0A) \r carriage return (hex 0D) \t tab (hex 09) \0dd character with octal code 0dd \ddd character with octal code ddd, or backreference \o{ddd..} character with octal code ddd.. \N{U+hh..} character with Unicode code point hh.. (Unicode mode only) \xhh character with hex code hh \x{hh..} character with hex code hh.. \N{U+hh..} is synonymous with \x{hh..} but is not supported in environ- ments that use EBCDIC code (mainly IBM mainframes). Note that \N not followed by an opening curly bracket has a different meaning (see be- low). If PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set ("ALT_BSUX mode"), the following are also recognized: \U the character "U" \uhhhh character with hex code hhhh \u{hh..} character with hex code hh.. but only for EXTRA_ALT_BSUX When \x is not followed by {, one or two hexadecimal digits are read, but in ALT_BSUX mode \x must be followed by two hexadecimal digits to be recognized as a hexadecimal escape; otherwise it matches a literal "x". Likewise, if \u (in ALT_BSUX mode) is not followed by four hexa- decimal digits or (in EXTRA_ALT_BSUX mode) a sequence of hex digits in curly brackets, it matches a literal "u". 
Note that \0dd is always an octal code. The treatment of backslash fol- lowed by a non-zero digit is complicated; for details see the section "Non-printing characters" in the pcre2pattern documentation, where de- tails of escape processing in EBCDIC environments are also given. CHARACTER TYPES . any character except newline; in dotall mode, any character whatsoever \C one code unit, even in UTF mode (best avoided) \d a decimal digit \D a character that is not a decimal digit \h a horizontal white space character \H a character that is not a horizontal white space character \N a character that is not a newline \p{xx} a character with the xx property \P{xx} a character without the xx property \R a newline sequence \s a white space character \S a character that is not a white space character \v a vertical white space character \V a character that is not a vertical white space character \w a "word" character \W a "non-word" character \X a Unicode extended grapheme cluster \C is dangerous because it may leave the current matching point in the middle of a UTF-8 or UTF-16 character. The application can lock out the use of \C by setting the PCRE2_NEVER_BACKSLASH_C option. It is also possible to build PCRE2 with the use of \C permanently disabled. By default, \d, \s, and \w match only ASCII characters, even in UTF-8 mode or in the 16-bit and 32-bit libraries. However, if locale-specific matching is happening, \s and \w may also match characters with code points in the range 128-255. If the PCRE2_UCP option is set, the behav- iour of these escape sequences is changed to use Unicode properties and they match many more characters, but there are some option settings that can restrict individual sequences to matching only ASCII charac- ters. Property descriptions in \p and \P are matched caselessly; hyphens, un- derscores, and ASCII white space characters are ignored, in accordance with Unicode's "loose matching" rules. 
For example, \p{Bidi_Class=al} is the same as \p{ bidi class = AL }. GENERAL CATEGORY PROPERTIES FOR \p and \P C Other Cc Control Cf Format Cn Unassigned Co Private use Cs Surrogate L Letter Lc Cased letter, the union of Ll, Lu, and Lt L& Synonym of Lc Ll Lower case letter Lm Modifier letter Lo Other letter Lt Title case letter Lu Upper case letter M Mark Mc Spacing mark Me Enclosing mark Mn Non-spacing mark N Number Nd Decimal number Nl Letter number No Other number P Punctuation Pc Connector punctuation Pd Dash punctuation Pe Close punctuation Pf Final punctuation Pi Initial punctuation Po Other punctuation Ps Open punctuation S Symbol Sc Currency symbol Sk Modifier symbol Sm Mathematical symbol So Other symbol Z Separator Zl Line separator Zp Paragraph separator Zs Space separator From release 10.45, when caseless matching is set, Ll, Lu, and Lt are all equivalent to Lc. PCRE2 SPECIAL CATEGORY PROPERTIES FOR \p and \P Xan Alphanumeric: union of properties L and N Xps POSIX space: property Z or tab, NL, VT, FF, CR Xsp Perl space: property Z or tab, NL, VT, FF, CR Xuc Universally-named character: one that can be represented by a Universal Character Name Xwd Perl word: property Xan or underscore Perl and POSIX space are now the same. Perl added VT to its space char- acter set at release 5.18. BINARY PROPERTIES FOR \p AND \P Unicode defines a number of binary properties, that is, properties whose only values are true or false. You can obtain a list of those that are recognized by \p and \P, along with their abbreviations, by running this command: pcre2test -LP SCRIPT MATCHING WITH \p AND \P Many script names and their 4-letter abbreviations are recognized in \p{sc:...} or \p{scx:...} items, or on their own with \p (and also \P of course). 
You can obtain a list of these scripts by running this com- mand: pcre2test -LS THE BIDI_CLASS PROPERTY FOR \p AND \P \p{Bidi_Class:<class>} matches a character with the given class \p{BC:<class>} matches a character with the given class The recognized classes are: AL Arabic letter AN Arabic number B paragraph separator BN boundary neutral CS common separator EN European number ES European separator ET European terminator FSI first strong isolate L left-to-right LRE left-to-right embedding LRI left-to-right isolate LRO left-to-right override NSM non-spacing mark ON other neutral PDF pop directional format PDI pop directional isolate R right-to-left RLE right-to-left embedding RLI right-to-left isolate RLO right-to-left override S segment separator WS white space CHARACTER CLASSES [...] positive character class [^...] negative character class [x-y] range (can be used for hex characters) [[:xxx:]] positive POSIX named set [[:^xxx:]] negative POSIX named set alnum alphanumeric alpha alphabetic ascii 0-127 blank space or tab cntrl control character digit decimal digit graph printing, excluding space lower lower case letter print printing, including space punct printing, excluding alphanumeric space white space upper upper case letter word same as \w xdigit hexadecimal digit In PCRE2, POSIX character set names recognize only ASCII characters by default, but some of them use Unicode properties if PCRE2_UCP is set. You can use \Q...\E inside a character class. When PCRE2_ALT_EXTENDED_CLASS is set, UTS#18 extended character classes may be used, allowing nested character classes, combined using set op- erators. [x&&[^y]] UTS#18 extended character class x||y set union (OR) x&&y set intersection (AND) x--y set difference (AND NOT) x~~y set symmetric difference (XOR) PERL EXTENDED CHARACTER CLASSES (?[...]) Perl extended character class (?[\p{Thai} & \p{Nd}]) operators; white space ignored (?[(x - y) & z]) parentheses for grouping (?[ [^3] & \p{Nd} ]) [...]
is a nested ordinary class (?[ [:alpha:] - [z] ]) POSIX set is allowed outside [...] (?[ \d - [3] ]) backslash-escaped set is allowed outside [...] (?[ !\n & [:ascii:] ]) backslash-escaped character is allowed out- side [...] all other characters or ranges must be enclosed in [...] x|y, x+y set union (OR) x&y set intersection (AND) x-y set difference (AND NOT) x^y set symmetric difference (XOR) !x set complement (NOT) Inside a Perl extended character class, [...] switches mode to be in- terpreted as an ordinary character class. Outside of a nested [...], the only items permitted are backslash-escapes, POSIX sets, operators, and parentheses. Inside a nested ordinary class, ^ has its usual mean- ing (inverts the class when used as the first character); outside of a nested class, ^ is the XOR operator. QUANTIFIERS ? 0 or 1, greedy ?+ 0 or 1, possessive ?? 0 or 1, lazy * 0 or more, greedy *+ 0 or more, possessive *? 0 or more, lazy + 1 or more, greedy ++ 1 or more, possessive +? 1 or more, lazy {n} exactly n {n,m} at least n, no more than m, greedy {n,m}+ at least n, no more than m, possessive {n,m}? at least n, no more than m, lazy {n,} n or more, greedy {n,}+ n or more, possessive {n,}? n or more, lazy {,m} zero up to m, greedy {,m}+ zero up to m, possessive {,m}? zero up to m, lazy ANCHORS AND SIMPLE ASSERTIONS \b word boundary \B not a word boundary ^ start of subject also after an internal newline in multiline mode (after any newline if PCRE2_ALT_CIRCUMFLEX is set) \A start of subject $ end of subject also before newline at end of subject also before internal newline in multiline mode \Z end of subject also before newline at end of subject \z end of subject \G first matching position in subject REPORTED MATCH POINT SETTING \K set reported start of match From release 10.38 \K is not permitted by default in lookaround asser- tions, for compatibility with Perl. However, if the PCRE2_EXTRA_AL- LOW_LOOKAROUND_BSK option is set, the previous behaviour is re-enabled. 
When this option is set, \K is honoured in positive assertions, but ig- nored in negative ones. ALTERNATION expr|expr|expr... CAPTURING (...) capture group (?<name>...) named capture group (Perl) (?'name'...) named capture group (Perl) (?P<name>...) named capture group (Python) (?:...) non-capture group (?|...) non-capture group; reset group numbers for capture groups in each alternative In non-UTF modes, names may contain underscores and ASCII letters and digits; in UTF modes, any Unicode letters and Unicode decimal digits are permitted. In both cases, a name must not start with a digit. ATOMIC GROUPS (?>...) atomic non-capture group (*atomic:...) atomic non-capture group COMMENT (?#....) comment (not nestable) OPTION SETTING Changes of these options within a group are automatically cancelled at the end of the group. (?a) all ASCII options (?aD) restrict \d to ASCII in UCP mode (?aS) restrict \s to ASCII in UCP mode (?aW) restrict \w to ASCII in UCP mode (?aP) restrict all POSIX classes to ASCII in UCP mode (?aT) restrict POSIX digit classes to ASCII in UCP mode (?i) caseless (?J) allow duplicate named groups (?m) multiline (?n) no auto capture (?r) restrict caseless to either ASCII or non-ASCII (?s) single line (dotall) (?U) default ungreedy (lazy) (?x) ignore white space except in classes or \Q...\E (?xx) as (?x) but also ignore space and tab in classes (?-...) unset the given option(s) (?^) unset imnrsx options (?aP) implies (?aT) as well, though this has no additional effect. How- ever, it means that (?-aP) also implies (?-aT) and disables all ASCII restrictions for POSIX classes. Unsetting x or xx unsets both. Several options may be set at once, and a mixture of setting and unsetting such as (?i-x) is allowed, but there may be only one hyphen. Setting (but no unsetting) is allowed after (?^ for example (?^in). An option setting may appear at the start of a non- capture group, for example (?i:...).
The following are recognized only at the very start of a pattern or af- ter one of the newline or \R sequences or options with similar syntax. More than one of them may appear. For the first three, d is a decimal number. (*LIMIT_DEPTH=d) set the backtracking limit to d (*LIMIT_HEAP=d) set the heap size limit to d * 1024 bytes (*LIMIT_MATCH=d) set the match limit to d (*CASELESS_RESTRICT) set PCRE2_EXTRA_CASELESS_RESTRICT when matching (*NOTEMPTY) set PCRE2_NOTEMPTY when matching (*NOTEMPTY_ATSTART) set PCRE2_NOTEMPTY_ATSTART when matching (*NO_AUTO_POSSESS) no auto-possessification (PCRE2_NO_AUTO_POSSESS) (*NO_DOTSTAR_ANCHOR) no .* anchoring (PCRE2_NO_DOTSTAR_ANCHOR) (*NO_JIT) disable JIT optimization (*NO_START_OPT) no start-match optimization (PCRE2_NO_START_OP- TIMIZE) (*TURKISH_CASING) set PCRE2_EXTRA_TURKISH_CASING when matching (*UTF) set appropriate UTF mode for the library in use (*UCP) set PCRE2_UCP (use Unicode properties for \d etc) Note that LIMIT_DEPTH, LIMIT_HEAP, and LIMIT_MATCH can only reduce the value of the limits set by the caller of pcre2_match() or pcre2_dfa_match(), not increase them. LIMIT_RECURSION is an obsolete synonym for LIMIT_DEPTH. The application can lock out the use of (*UTF) and (*UCP) by setting the PCRE2_NEVER_UTF or PCRE2_NEVER_UCP options, respectively, at compile time. NEWLINE CONVENTION These are recognized only at the very start of the pattern or after op- tion settings with a similar syntax. (*CR) carriage return only (*LF) linefeed only (*CRLF) carriage return followed by linefeed (*ANYCRLF) all three of the above (*ANY) any Unicode newline sequence (*NUL) the NUL character (binary zero) WHAT \R MATCHES These are recognized only at the very start of the pattern or after op- tion setting with a similar syntax. (*BSR_ANYCRLF) CR, LF, or CRLF (*BSR_UNICODE) any Unicode newline sequence LOOKAHEAD AND LOOKBEHIND ASSERTIONS (?=...) ) (*pla:...) ) positive lookahead (*positive_lookahead:...) ) (?!...) ) (*nla:...) 
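) negative lookahead (*negative_lookahead:...) ) (?<=...) ) (*plb:...) ) positive lookbehind (*positive_lookbehind:...) ) (?<!...) ) (*nlb:...) ) negative lookbehind (*negative_lookbehind:...) ) Each top-level branch of a lookbehind must have a limit for the number of characters it matches. If any branch can match a variable number of characters, the maximum for each branch is limited to a value set by the caller of pcre2_compile() or defaulted. The default is set when PCRE2 is built (ultimate default 255). If every branch matches a fixed number of characters, the limit for each branch is 65535 characters. NON-ATOMIC LOOKAROUND ASSERTIONS These assertions are specific to PCRE2 and are not Perl-compatible. (?*...) ) (*napla:...) ) synonyms (*non_atomic_positive_lookahead:...) ) (?<*...) ) (*naplb:...) ) synonyms (*non_atomic_positive_lookbehind:...) ) SUBSTRING SCAN ASSERTION This feature is not Perl-compatible. (*scan_substring:(grouplist)...) scan captured substring (*scs:(grouplist)...) scan captured substring The comma-separated list may identify groups in any of the following ways: n absolute reference +n relative reference -n relative reference <name> name 'name' name SCRIPT RUNS (*script_run:...) ) script run, can be backtracked into (*sr:...) ) (*atomic_script_run:...) ) atomic script run (*asr:...) ) BACKREFERENCES \n reference by number (can be ambiguous) \gn reference by number \g{n} reference by number \g+n relative reference by number (PCRE2 extension) \g-n relative reference by number \g{+n} relative reference by number (PCRE2 extension) \g{-n} relative reference by number \k<name> reference by name (Perl) \k'name' reference by name (Perl) \g{name} reference by name (Perl) \k{name} reference by name (.NET) (?P=name) reference by name (Python) SUBROUTINE REFERENCES (POSSIBLY RECURSIVE) (?R) recurse whole pattern (?n) call subroutine by absolute number (?+n) call subroutine by relative number (?-n) call subroutine by relative number (?&name) call subroutine by name (Perl) (?P>name) call subroutine by name (Python) \g<name> call subroutine by name (Oniguruma) \g'name' call subroutine by name (Oniguruma) \g<n> call subroutine by absolute number (Oniguruma) \g'n' call subroutine by absolute number (Oniguruma) \g<+n> call subroutine by relative number (PCRE2 extension) \g'+n' call subroutine by relative number (PCRE2 extension) \g<-n> call subroutine by relative number (PCRE2 extension) \g'-n' call subroutine by relative number (PCRE2 extension) The variants using parentheses (?...) may also specify a list of cap- ture groups to return, which shall be retained in the calling subex- pression if set during the recursion (this feature is not supported by Perl).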
(?R(grouplist)) recurse whole pattern, returning capture groups (PCRE2 extension) (?n(grouplist)) ) (?+n(grouplist)) ) call subroutine, returning capture groups (?-n(grouplist)) ) (PCRE2 extension) (?&name(grouplist)) ) (?P>name(grouplist)) ) The comma-separated list "grouplist" uses the same syntax as (*scan_substring:(grouplist)...), and may identify groups in any of the following ways: n absolute reference +n relative reference -n relative reference <name> name 'name' name CONDITIONAL PATTERNS (?(condition)yes-pattern) (?(condition)yes-pattern|no-pattern) (?(n) absolute reference condition (?(+n) relative reference condition (PCRE2 extension) (?(-n) relative reference condition (PCRE2 extension) (?(<name>) named reference condition (Perl) (?('name') named reference condition (Perl) (?(name) named reference condition (PCRE2, deprecated) (?(R) overall recursion condition (?(Rn) specific numbered group recursion condition (?(R&name) specific named group recursion condition (?(DEFINE) define groups for reference (?(VERSION[>]=n[.m]) test PCRE2 version (?(assert) assertion condition Note the ambiguity of (?(R) and (?(Rn) which might be named reference conditions or recursion tests. Such a condition is interpreted as a reference condition if the relevant named group exists. The parts within square brackets in the VERSION conditional syntax may be omitted. The fractional part of the version number defaults to 0 in that case. BACKTRACKING CONTROL All backtracking control verbs may be in the form (*VERB:NAME). For (*MARK) the name is mandatory, for the others it is optional. (*SKIP) changes its behaviour if :NAME is present. The others just set a name for passing back to the caller, but this is not a name that (*SKIP) can see.
The following act immediately they are reached: (*ACCEPT) force successful match (*FAIL) force backtrack; synonym (*F) (*MARK:NAME) set name to be passed back; synonym (*:NAME) The following act only when a subsequent match failure causes a back- track to reach them. They all force a match failure, but they differ in what happens afterwards. Those that advance the start-of-match point do so only if the pattern is not anchored. (*COMMIT) overall failure, no advance of starting point (*PRUNE) advance to next starting character (*SKIP) advance to current matching position (*SKIP:NAME) advance to position corresponding to an earlier (*MARK:NAME); if not found, the (*SKIP) is ignored (*THEN) local failure, backtrack to next alternation The effect of one of these verbs in a group called as a subroutine is confined to the subroutine call. CALLOUTS (?C) callout (assumed number 0) (?Cn) callout with numerical data n (?C"text") callout with string data The allowed string delimiters are ` ' " ^ % # $ (which are the same for the start and the end), and the starting delimiter { matched with the ending delimiter }. To encode the ending delimiter within the string, double it. REPLACEMENT STRINGS If the PCRE2_SUBSTITUTE_LITERAL option is set, a replacement string for pcre2_substitute() is not interpreted. Otherwise, by default, the only special character is the dollar character in one of the following forms: $$ insert a dollar character $n or ${n} insert the contents of group n $<name> insert the contents of named group $0 or $& insert the entire matched substring $` insert the substring that precedes the match $' insert the substring that follows the match $_ insert the entire input string $+ insert the highest-numbered capture group which matched $*MARK or ${*MARK} insert a control verb name For ${n}, n can be a name or a number. If PCRE2_SUBSTITUTE_EXTENDED is set, there is additional interpretation: 1.
Backslash is an escape character, and the forms described in "ES- CAPED CHARACTERS" above are recognized. Also: \Q...\E can be used to suppress interpretation \l force the next character to lower case \u force the next character to upper case \L force subsequent characters to lower case \U force subsequent characters to upper case \u\L force next character to upper case, then all lower \l\U force next character to lower case, then all upper \E end \L or \U case forcing \b backspace character (note: as in character class in pattern) \v vertical tab character (note: not the same as in a pattern) 2. The Python form \g<n>, where the angle brackets are part of the syn- tax and n is either a group name or a number, is recognized as an al- ternative way of inserting the contents of a group, for example \g<3>. 3. Capture substitution supports the following additional forms: ${n:-string} default for unset group ${n:+string1:string2} values for set/unset group The substitution strings themselves are expanded. Backslash can be used to escape colons and closing curly brackets. SEE ALSO pcre2pattern(3), pcre2api(3), pcre2callout(3), pcre2matching(3), pcre2(3). AUTHOR Philip Hazel Retired from University Computing Service Cambridge, England. REVISION Last updated: 14 October 2025 Copyright (c) 1997-2024 University of Cambridge. PCRE2 10.48-DEV 14 October 2025 PCRE2SYNTAX(3) ------------------------------------------------------------------------------ PCRE2UNICODE(3) Library Functions Manual PCRE2UNICODE(3) NAME PCRE2 - Perl-compatible regular expressions (revised API) UNICODE AND UTF SUPPORT PCRE2 is normally built with Unicode support, though if you do not need it, you can build it without, in which case the library will be smaller. With Unicode support, PCRE2 has knowledge of Unicode character properties and can process strings of text in UTF-8, UTF-16, and UTF-32 format (depending on the code unit width), but this is not the default.
Unless specifically requested, PCRE2 treats each code unit in a string as one character. There are two ways of telling PCRE2 to switch to UTF mode, where char- acters may consist of more than one code unit and the range of values is constrained. The program can call pcre2_compile() with the PCRE2_UTF option, or the pattern may start with the sequence (*UTF). However, the latter facility can be locked out by the PCRE2_NEVER_UTF option. That is, the programmer can prevent the supplier of the pattern from switching to UTF mode. Note that the PCRE2_MATCH_INVALID_UTF option (see below) forces PCRE2_UTF to be set. In UTF mode, both the pattern and any subject strings that are matched against it are treated as UTF strings instead of strings of individual one-code-unit characters. There are also some other changes to the way characters are handled, as documented below. UNICODE PROPERTY SUPPORT When PCRE2 is built with Unicode support, the escape sequences \p{..}, \P{..}, and \X can be used. This is not dependent on the PCRE2_UTF set- ting. The Unicode properties that can be tested are a subset of those that Perl supports. Currently they are limited to the general category properties such as Lu for an upper case letter or Nd for a decimal num- ber, the derived properties Any and Lc (synonym L&), the Unicode script names such as Arabic or Han, Bidi_Class, Bidi_Control, and a few binary properties. The full lists are given in the pcre2pattern and pcre2syntax documenta- tion. In general, only the short names for properties are supported. For example, \p{L} matches a letter. Its longer synonym, \p{Letter}, is not supported. Furthermore, in Perl, many properties may optionally be prefixed by "Is", for compatibility with Perl 5.6. PCRE2 does not sup- port this. WIDE CHARACTERS AND UTF MODES Code points less than 256 can be specified in patterns by either braced or unbraced hexadecimal escape sequences (for example, \x{b3} or \xb3). Larger values have to use braced sequences. 
Unbraced octal code points up to \777 are also recognized; larger ones can be coded using \o{...}. The escape sequence \N{U+<hex digits>} is recognized as another way of specifying a Unicode character by code point in a UTF mode. It is not allowed in non-UTF mode. In UTF mode, repeat quantifiers apply to complete UTF characters, not to individual code units. In UTF mode, the dot metacharacter matches one UTF character instead of a single code unit. In UTF mode, capture group names are not restricted to ASCII, and may contain any Unicode letters and decimal digits, as well as underscore. The escape sequence \C can be used to match a single code unit in UTF mode, but its use can lead to some strange effects because it breaks up multi-unit characters (see the description of \C in the pcre2pattern documentation). For this reason, there is a build-time option that dis- ables support for \C completely. There is also a less draconian com- pile-time option for locking out the use of \C when a pattern is com- piled. The use of \C is not supported by the alternative matching function pcre2_dfa_match() when in UTF-8 or UTF-16 mode, that is, when a charac- ter may consist of more than one code unit. The use of \C in these modes provokes a match-time error. Also, the JIT optimization does not support \C in these modes. If JIT optimization is requested for a UTF-8 or UTF-16 pattern that contains \C, it will not succeed, and so when pcre2_match() is called, the matching will be carried out by the inter- pretive function. The character escapes \b, \B, \d, \D, \s, \S, \w, and \W correctly test characters of any code value, but, by default, the characters that PCRE2 recognizes as digits, spaces, or word characters remain the same set as in non-UTF mode, all with code points less than 256. This re- mains true even when PCRE2 is built to include Unicode support, because to do otherwise would slow down matching in many common cases.
Note that this also applies to \b and \B, because they are defined in terms of \w and \W. If you want to test for a wider sense of, say, "digit", you can use explicit Unicode property tests such as \p{Nd}. Alterna- tively, if you set the PCRE2_UCP option, the way that the character es- capes work is changed so that Unicode properties are used to determine which characters match, though there are some options that suppress this for individual escapes. For details see the section on generic character types in the pcre2pattern documentation. Like the escapes, characters that match the POSIX named character classes are all low-valued characters unless the PCRE2_UCP option is set, but there is an option to override this. In contrast to the character escapes and character classes, the special horizontal and vertical white space escapes (\h, \H, \v, and \V) do match all the appropriate Unicode characters, whether or not PCRE2_UCP is set. UNICODE CASE-EQUIVALENCE If either PCRE2_UTF or PCRE2_UCP is set, upper/lower case processing makes use of Unicode properties except for characters whose code points are less than 128 and that have at most two case-equivalent values. For these, a direct table lookup is used for speed. A few Unicode charac- ters such as Greek sigma have more than two code points that are case- equivalent, and these are treated specially. Setting PCRE2_UCP without PCRE2_UTF allows Unicode-style case processing for non-UTF character encodings such as UCS-2. There are two ASCII characters (S and K) that, in addition to their ASCII lower case equivalents, have a non-ASCII one as well (long S and Kelvin sign). Recognition of these non-ASCII characters as case-equiv- alent to their ASCII counterparts can be disabled by setting the PCRE2_EXTRA_CASELESS_RESTRICT option. When this is set, all characters in a case equivalence must either be ASCII or non-ASCII; there can be no mixing. 
Without PCRE2_EXTRA_CASELESS_RESTRICT: 'k' = 'K' = U+212A (Kelvin sign) 's' = 'S' = U+017F (long S) With PCRE2_EXTRA_CASELESS_RESTRICT: 'k' = 'K' U+212A (Kelvin sign) only case-equivalent to itself 's' = 'S' U+017F (long S) only case-equivalent to itself One language family, Turkish and Azeri, has its own case-insensitivity rules, which can be selected by setting PCRE2_EXTRA_TURKISH_CASING. This alters the behaviour of the 'i', 'I', U+0130 (capital I with dot above), and U+0131 (small dotless i) characters. Without PCRE2_EXTRA_TURKISH_CASING: 'i' = 'I' U+0130 (capital I with dot above) only case-equivalent to itself U+0131 (small dotless i) only case-equivalent to itself With PCRE2_EXTRA_TURKISH_CASING: 'i' = U+0130 (capital I with dot above) U+0131 (small dotless i) = 'I' It is not allowed to specify both PCRE2_EXTRA_CASELESS_RESTRICT and PCRE2_EXTRA_TURKISH_CASING together. From release 10.45 the Unicode letter properties Lu (upper case), Ll (lower case), and Lt (title case) are all treated as Lc (cased letter) when caseless matching is set by the PCRE2_CASELESS option or (?i) within the pattern. SCRIPT RUNS The pattern constructs (*script_run:...) and (*atomic_script_run:...), with synonyms (*sr:...) and (*asr:...), verify that the string matched within the parentheses is a script run. In concept, a script run is a sequence of characters that are all from the same Unicode script. How- ever, because some scripts are commonly used together, and because some diacritical and other marks are used with multiple scripts, it is not that simple. Every Unicode character has a Script property, mostly with a value cor- responding to the name of a script, such as Latin, Greek, or Cyrillic. There are also three special values: "Unknown" is used for code points that have not been assigned, and also for the surrogate code points. 
In the PCRE2 32-bit library, characters whose code points are greater than the Unicode maximum (U+10FFFF), which are accessible only in non-UTF mode, are assigned the Unknown script. "Common" is used for characters that are used with many scripts. These include punctuation, emoji, mathematical, musical, and currency sym- bols, and the ASCII digits 0 to 9. "Inherited" is used for characters such as diacritical marks that mod- ify a previous character. These are considered to take on the script of the character that they modify. Some Inherited characters are used with many scripts, but many of them are only normally used with a small number of scripts. For example, U+102E0 (Coptic Epact thousands mark) is used only with Arabic and Cop- tic. In order to make it possible to check this, a Unicode property called Script Extension exists. Its value is a list of scripts that ap- ply to the character. For the majority of characters, the list contains just one script, the same one as the Script property. However, for characters such as U+102E0 more than one Script is listed. There are also some Common characters that have a single, non-Common script in their Script Extension list. The next section describes the basic rules for deciding whether a given string of characters is a script run. Note, however, that there are some special cases involving the Chinese Han script, and an additional constraint for decimal digits. These are covered in subsequent sec- tions. Basic script run rules A string that is less than two characters long is a script run. This is the only case in which an Unknown character can be part of a script run. Longer strings are checked using only the Script Extensions prop- erty, not the basic Script property. If a character's Script Extension property is the single value "Inher- ited", it is always accepted as part of a script run. This is also true for the property "Common", subject to the checking of decimal digits described below. 
All the remaining characters in a script run must have at least one script in common in their Script Extension lists. In set- theoretic terminology, the intersection of all the sets of scripts must not be empty. A simple example is an Internet name such as "google.com". The letters are all in the Latin script, and the dot is Common, so this string is a script run. However, the Cyrillic letter "o" looks exactly the same as the Latin "o"; a string that looks the same, but with Cyrillic "o"s is not a script run. More interesting examples involve characters with more than one script in their Script Extension. Consider the following characters: U+060C Arabic comma U+06D4 Arabic full stop The first has the Script Extension list Arabic, Hanifi Rohingya, Syr- iac, and Thaana; the second has just Arabic and Hanifi Rohingya. Both of them could appear in script runs of either Arabic or Hanifi Ro- hingya. The first could also appear in Syriac or Thaana script runs, but the second could not. The Chinese Han script The Chinese Han script is commonly used in conjunction with other scripts for writing certain languages. Japanese uses the Hiragana and Katakana scripts together with Han; Korean uses Hangul and Han; Tai- wanese Mandarin uses Bopomofo and Han. These three combinations are treated as special cases when checking script runs and are, in effect, "virtual scripts". Thus, a script run may contain a mixture of Hira- gana, Katakana, and Han, or a mixture of Hangul and Han, or a mixture of Bopomofo and Han, but not, for example, a mixture of Hangul and Bopomofo and Han. PCRE2 (like Perl) follows Unicode's Technical Stan- dard 39 ("Unicode Security Mechanisms", http://unicode.org/re- ports/tr39/) in allowing such mixtures. Decimal digits Unicode contains many sets of 10 decimal digits in different scripts, and some scripts (including the Common script) contain more than one set. Some of these decimal digits are visually indistinguishable from the common ASCII digits.
In addition to the script checking de- scribed above, if a script run contains any decimal digits, they must all come from the same set of 10 adjacent characters. VALIDITY OF UTF STRINGS When the PCRE2_UTF option is set, the strings passed as patterns and subjects are (by default) checked for validity on entry to the relevant functions. If an invalid UTF string is passed, a negative error code is returned. The code unit offset to the offending character can be ex- tracted from the match data block by calling pcre2_get_startchar(), which is used for this purpose after a UTF error. In some situations, you may already know that your strings are valid, and therefore want to skip these checks in order to improve perfor- mance, for example in the case of a long subject string that is being scanned repeatedly. If you set the PCRE2_NO_UTF_CHECK option at com- pile time or at match time, PCRE2 assumes that the pattern or subject it is given (respectively) contains only valid UTF code unit sequences. If you pass an invalid UTF string when PCRE2_NO_UTF_CHECK is set, the result is undefined and your program may crash or loop indefinitely or give incorrect results. There is, however, one mode of matching that can handle invalid UTF subject strings. This is enabled by passing PCRE2_MATCH_INVALID_UTF to pcre2_compile() and is discussed below in the next section. The rest of this section covers the case when PCRE2_MATCH_INVALID_UTF is not set. Passing PCRE2_NO_UTF_CHECK to pcre2_compile() just disables the UTF check for the pattern; it does not also apply to subject strings. If you want to disable the check for a subject string you must pass this same option to pcre2_match() or pcre2_dfa_match(). UTF-16 and UTF-32 strings can indicate their endianness by a special code known as a byte-order mark (BOM). The PCRE2 functions do not handle this, expecting strings to be in host byte order. Unless PCRE2_NO_UTF_CHECK is set, a UTF string is checked before any other processing takes place.
In the case of pcre2_match() and pcre2_dfa_match() calls with a non-zero starting offset, the check is applied only to that part of the subject that could be inspected during matching, and there is a check that the starting offset points to the first code unit of a character or to the end of the subject. If there are no lookbehind assertions in the pattern, the check starts at the starting offset. Otherwise, it starts at the length of the longest lookbehind before the starting offset, or at the start of the subject if there are not that many characters before the starting offset. Note that the sequences \b and \B are one-character lookbehinds. In addition to checking the format of the string, there is a check to ensure that all code points lie in the range U+0 to U+10FFFF, excluding the surrogate area. The so-called "non-character" code points are not excluded because Unicode corrigendum #9 makes it clear that they should not be. Characters in the "Surrogate Area" of Unicode are reserved for use by UTF-16, where they are used in pairs to encode code points with values greater than 0xFFFF. The code points that are encoded by UTF-16 pairs are available independently in the UTF-8 and UTF-32 encodings. (In other words, the whole surrogate thing is a fudge for UTF-16 which un- fortunately messes up UTF-8 and UTF-32.) Setting PCRE2_NO_UTF_CHECK at compile time does not disable the error that is given if an escape sequence for an invalid Unicode code point is encountered in the pattern. If you want to allow escape sequences such as \x{d800} (a surrogate code point) you can set the PCRE2_EX- TRA_ALLOW_SURROGATE_ESCAPES extra option. However, this is possible only in UTF-8 and UTF-32 modes, because these values are not repre- sentable in UTF-16. 
Errors in UTF-8 strings The following negative error codes are given for invalid UTF-8 strings: PCRE2_ERROR_UTF8_ERR1 PCRE2_ERROR_UTF8_ERR2 PCRE2_ERROR_UTF8_ERR3 PCRE2_ERROR_UTF8_ERR4 PCRE2_ERROR_UTF8_ERR5 The string ends with a truncated UTF-8 character; the code specifies how many bytes are missing (1 to 5). Although RFC 3629 restricts UTF-8 characters to be no longer than 4 bytes, the encoding scheme (origi- nally defined by RFC 2279) allows for up to 6 bytes, and this is checked first; hence the possibility of 4 or 5 missing bytes. PCRE2_ERROR_UTF8_ERR6 PCRE2_ERROR_UTF8_ERR7 PCRE2_ERROR_UTF8_ERR8 PCRE2_ERROR_UTF8_ERR9 PCRE2_ERROR_UTF8_ERR10 The two most significant bits of the 2nd, 3rd, 4th, 5th, or 6th byte of the character do not have the binary value 0b10 (that is, either the most significant bit is 0, or the next bit is 1). PCRE2_ERROR_UTF8_ERR11 PCRE2_ERROR_UTF8_ERR12 A character that is valid by the RFC 2279 rules is either 5 or 6 bytes long; these code points are excluded by RFC 3629. PCRE2_ERROR_UTF8_ERR13 A 4-byte character has a value greater than 0x10ffff; these code points are excluded by RFC 3629. PCRE2_ERROR_UTF8_ERR14 A 3-byte character has a value in the range 0xd800 to 0xdfff; this range of code points is reserved by RFC 3629 for use with UTF-16, and so is excluded from UTF-8. PCRE2_ERROR_UTF8_ERR15 PCRE2_ERROR_UTF8_ERR16 PCRE2_ERROR_UTF8_ERR17 PCRE2_ERROR_UTF8_ERR18 PCRE2_ERROR_UTF8_ERR19 A 2-, 3-, 4-, 5-, or 6-byte character is "overlong", that is, it codes for a value that can be represented by fewer bytes, which is invalid. For example, the two bytes 0xc0, 0xae give the value 0x2e, whose cor- rect coding uses just one byte. PCRE2_ERROR_UTF8_ERR20 The two most significant bits of the first byte of a character have the binary value 0b10 (that is, the most significant bit is 1 and the sec- ond is 0). Such a byte can only validly occur as the second or subse- quent byte of a multi-byte character.
PCRE2_ERROR_UTF8_ERR21 The first byte of a character has the value 0xfe or 0xff. These values can never occur in a valid UTF-8 string. Errors in UTF-16 strings The following negative error codes are given for invalid UTF-16 strings: PCRE2_ERROR_UTF16_ERR1 Missing low surrogate at end of string PCRE2_ERROR_UTF16_ERR2 Invalid low surrogate follows high surrogate PCRE2_ERROR_UTF16_ERR3 Isolated low surrogate Errors in UTF-32 strings The following negative error codes are given for invalid UTF-32 strings: PCRE2_ERROR_UTF32_ERR1 Surrogate character (0xd800 to 0xdfff) PCRE2_ERROR_UTF32_ERR2 Code point is greater than 0x10ffff MATCHING IN INVALID UTF STRINGS You can run pattern matches on subject strings that may contain invalid UTF sequences if you call pcre2_compile() with the PCRE2_MATCH_IN- VALID_UTF option. This is supported by pcre2_match(), including JIT matching, but not by pcre2_dfa_match(). When PCRE2_MATCH_INVALID_UTF is set, it forces PCRE2_UTF to be set as well. Note, however, that the pattern itself must be a valid UTF string. If you do not set PCRE2_MATCH_INVALID_UTF when calling pcre2_compile, and you are not certain that your subject strings are valid UTF se- quences, you should not make use of the JIT "fast path" function pcre2_jit_match() because it bypasses sanity checks, including the one for UTF validity. An invalid string may cause undefined behaviour, in- cluding looping, crashing, or giving the wrong answer. Setting PCRE2_MATCH_INVALID_UTF does not affect what pcre2_compile() generates, but if pcre2_jit_compile() is subsequently called, it does generate different code. If JIT is not used, the option affects the be- haviour of the interpretive code in pcre2_match(). When PCRE2_MATCH_IN- VALID_UTF is set at compile time, PCRE2_NO_UTF_CHECK is ignored at match time. In this mode, an invalid code unit sequence in the subject never matches any pattern item. 
It does not match dot, it does not match \p{Any}, it does not even match negative items such as [^X]. A lookbe- hind assertion fails if it encounters an invalid sequence while moving the current point backwards. In other words, an invalid UTF code unit sequence acts as a barrier which no match can cross. You can also think of this as the subject being split up into fragments of valid UTF, delimited internally by invalid code unit sequences. The pattern is matched fragment by fragment. The result of a successful match, however, is given as code unit offsets in the entire subject string in the usual way. There are a few points to consider: The internal boundaries are not interpreted as the beginnings or ends of lines and so do not match circumflex or dollar characters in the pattern. If pcre2_match() is called with an offset that points to an invalid UTF-sequence, that sequence is skipped, and the match starts at the next valid UTF character, or the end of the subject. At internal fragment boundaries, \b and \B behave in the same way as at the beginning and end of the subject. For example, a sequence such as \bWORD\b would match an instance of WORD that is surrounded by invalid UTF code units. Using PCRE2_MATCH_INVALID_UTF, an application can run matches on arbi- trary data, knowing that any matched strings that are returned are valid UTF. This can be useful when searching for UTF text in executable or other binary files. Note, however, that the 16-bit and 32-bit PCRE2 libraries process strings as sequences of uint16_t or uint32_t code points. They cannot find valid UTF sequences within an arbitrary string of bytes unless such sequences are suitably aligned. AUTHOR Philip Hazel Retired from University Computing Service Cambridge, England. REVISION Last updated: 27 November 2024 Copyright (c) 1997-2024 University of Cambridge. 
PCRE2 10.48-DEV 27 November 2024 PCRE2UNICODE(3) ------------------------------------------------------------------------------ ================================================ FILE: doc/pcre2_callout_enumerate.3 ================================================ .TH PCRE2_CALLOUT_ENUMERATE 3 "23 March 2017" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include .PP .nf .B int pcre2_callout_enumerate(const pcre2_code *\fIcode\fP, .B " int (*\fIcallback\fP)(pcre2_callout_enumerate_block *, void *)," .B " void *\fIcallout_data\fP);" .fi . .SH DESCRIPTION .rs .sp This function scans a compiled regular expression and calls the \fIcallback()\fP function for each callout within the pattern. The yield of the function is zero for success and non-zero otherwise. The arguments are: .sp \fIcode\fP Points to the compiled pattern \fIcallback\fP The callback function \fIcallout_data\fP User data that is passed to the callback .sp The \fIcallback()\fP function is passed a pointer to a data block containing the following fields (not necessarily in this order): .sp uint32_t \fIversion\fP Block version number uint32_t \fIcallout_number\fP Number for numbered callouts PCRE2_SIZE \fIpattern_position\fP Offset to next item in pattern PCRE2_SIZE \fInext_item_length\fP Length of next item in pattern PCRE2_SIZE \fIcallout_string_offset\fP Offset to string within pattern PCRE2_SIZE \fIcallout_string_length\fP Length of callout string PCRE2_SPTR \fIcallout_string\fP Points to callout string or is NULL .sp The second argument passed to the \fBcallback()\fP function is the callout data that was passed to \fBpcre2_callout_enumerate()\fP. The \fBcallback()\fP function must return zero for success. Any other value causes the pattern scan to stop, with the value being passed back as the result of \fBpcre2_callout_enumerate()\fP.
.P There is a complete description of the PCRE2 native API in the .\" HREF \fBpcre2api\fP .\" page and a description of the POSIX API in the .\" HREF \fBpcre2posix\fP .\" page. ================================================ FILE: doc/pcre2_code_copy.3 ================================================ .TH PCRE2_CODE_COPY 3 "22 November 2016" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include .PP .nf .B pcre2_code *pcre2_code_copy(const pcre2_code *\fIcode\fP); .fi . .SH DESCRIPTION .rs .sp This function makes a copy of the memory used for a compiled pattern, excluding any memory used by the JIT compiler. Without a subsequent call to \fBpcre2_jit_compile()\fP, the copy can be used only for non-JIT matching. The pointer to the character tables is copied, not the tables themselves (see \fBpcre2_code_copy_with_tables()\fP). The yield of the function is NULL if \fIcode\fP is NULL or if sufficient memory cannot be obtained. .P There is a complete description of the PCRE2 native API in the .\" HREF \fBpcre2api\fP .\" page and a description of the POSIX API in the .\" HREF \fBpcre2posix\fP .\" page. ================================================ FILE: doc/pcre2_code_copy_with_tables.3 ================================================ .TH PCRE2_CODE_COPY_WITH_TABLES 3 "16 January 2017" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include .PP .nf .B pcre2_code *pcre2_code_copy_with_tables(const pcre2_code *\fIcode\fP); .fi . .SH DESCRIPTION .rs .sp This function makes a copy of the memory used for a compiled pattern, excluding any memory used by the JIT compiler. Without a subsequent call to \fBpcre2_jit_compile()\fP, the copy can be used only for non-JIT matching. Unlike \fBpcre2_code_copy()\fP, a separate copy of the character tables is also made, with the new code pointing to it.
This memory will be automatically freed when \fBpcre2_code_free()\fP is called. The yield of the function is NULL if \fIcode\fP is NULL or if sufficient memory cannot be obtained. .P There is a complete description of the PCRE2 native API in the .\" HREF \fBpcre2api\fP .\" page and a description of the POSIX API in the .\" HREF \fBpcre2posix\fP .\" page. ================================================ FILE: doc/pcre2_code_free.3 ================================================ .TH PCRE2_CODE_FREE 3 "28 June 2018" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include .PP .nf .B void pcre2_code_free(pcre2_code *\fIcode\fP); .fi . .SH DESCRIPTION .rs .sp If \fIcode\fP is NULL, this function does nothing. Otherwise, \fIcode\fP must point to a compiled pattern. This function frees its memory, including any memory used by the JIT compiler. If the compiled pattern was created by a call to \fBpcre2_code_copy_with_tables()\fP, the memory for the character tables is also freed. .P There is a complete description of the PCRE2 native API in the .\" HREF \fBpcre2api\fP .\" page and a description of the POSIX API in the .\" HREF \fBpcre2posix\fP .\" page. ================================================ FILE: doc/pcre2_compile.3 ================================================ .TH PCRE2_COMPILE 3 "30 October 2024" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include .PP .nf .B pcre2_code *pcre2_compile(PCRE2_SPTR \fIpattern\fP, PCRE2_SIZE \fIlength\fP, .B " uint32_t \fIoptions\fP, int *\fIerrorcode\fP, PCRE2_SIZE *\fIerroroffset,\fP" .B " pcre2_compile_context *\fIccontext\fP);" .fi . .SH DESCRIPTION .rs .sp This function compiles a regular expression pattern into an internal form. 
Its arguments are: .sp \fIpattern\fP A string containing the expression to be compiled \fIlength\fP The length of the string or PCRE2_ZERO_TERMINATED \fIoptions\fP Primary option bits \fIerrorcode\fP Where to put an error code \fIerroroffset\fP Where to put an error offset \fIccontext\fP Pointer to a compile context or NULL .sp The length of the pattern and any error offset that is returned are in code units, not characters. A NULL pattern with zero length is treated as an empty string. A compile context is needed only if you want to provide custom memory allocation functions, or to provide an external function for system stack size checking (see \fBpcre2_set_compile_recursion_guard()\fP), or to change one or more of these parameters: .sp What \eR matches (Unicode newlines, or CR, LF, CRLF only); PCRE2's character tables; The newline character sequence; The compile time nested parentheses limit; The maximum pattern length (in code units) that is allowed; The additional options bits. .sp The primary option bits are: .sp PCRE2_ANCHORED Force pattern anchoring PCRE2_ALLOW_EMPTY_CLASS Allow empty classes PCRE2_ALT_BSUX Alternative handling of \eu, \eU, and \ex PCRE2_ALT_CIRCUMFLEX Alternative handling of ^ in multiline mode PCRE2_ALT_EXTENDED_CLASS Alternative extended character class syntax PCRE2_ALT_VERBNAMES Process backslashes in verb names PCRE2_AUTO_CALLOUT Compile automatic callouts PCRE2_CASELESS Do caseless matching PCRE2_DOLLAR_ENDONLY $ not to match newline at end PCRE2_DOTALL . 
matches anything including NL PCRE2_DUPNAMES Allow duplicate names for subpatterns PCRE2_ENDANCHORED Pattern can match only at end of subject PCRE2_EXTENDED Ignore white space and # comments PCRE2_FIRSTLINE Force matching to be before newline PCRE2_LITERAL Pattern characters are all literal PCRE2_MATCH_INVALID_UTF Enable support for matching invalid UTF PCRE2_MATCH_UNSET_BACKREF Match unset backreferences PCRE2_MULTILINE ^ and $ match newlines within data PCRE2_NEVER_BACKSLASH_C Lock out the use of \eC in patterns PCRE2_NEVER_UCP Lock out PCRE2_UCP, e.g. via (*UCP) PCRE2_NEVER_UTF Lock out PCRE2_UTF, e.g. via (*UTF) PCRE2_NO_AUTO_CAPTURE Disable numbered capturing paren- theses (named ones available) PCRE2_NO_AUTO_POSSESS Disable auto-possessification PCRE2_NO_DOTSTAR_ANCHOR Disable automatic anchoring for .* PCRE2_NO_START_OPTIMIZE Disable match-time start optimizations PCRE2_NO_UTF_CHECK Do not check the pattern for UTF validity (only relevant if PCRE2_UTF is set) PCRE2_UCP Use Unicode properties for \ed, \ew, etc. PCRE2_UNGREEDY Invert greediness of quantifiers PCRE2_USE_OFFSET_LIMIT Enable offset limit for unanchored matching PCRE2_UTF Treat pattern and subjects as UTF strings .sp PCRE2 must be built with Unicode support (the default) in order to use PCRE2_UTF, PCRE2_UCP and related options. .P Additional options may be set in the compile context via the .\" HREF \fBpcre2_set_compile_extra_options\fP .\" function. .P If either of \fIerrorcode\fP or \fIerroroffset\fP is NULL, the function returns NULL immediately. Otherwise, the yield of this function is a pointer to a private data structure that contains the compiled pattern, or NULL if an error was detected. In the error case, a text error message can be obtained by passing the value returned via the \fIerrorcode\fP argument to the \fBpcre2_get_error_message()\fP function. The offset (in code units) where the error was encountered is returned via the \fIerroroffset\fP argument. 
.P If there is no error, the value passed via \fIerrorcode\fP returns the message "no error" if passed to \fBpcre2_get_error_message()\fP, and the value passed via \fIerroroffset\fP is zero. .P There is a complete description of the PCRE2 native API, with more detail on each option, in the .\" HREF \fBpcre2api\fP .\" page, and a description of the POSIX API in the .\" HREF \fBpcre2posix\fP .\" page. ================================================ FILE: doc/pcre2_compile_context_copy.3 ================================================ .TH PCRE2_COMPILE_CONTEXT_COPY 3 "25 October 2014" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include .PP .nf .B pcre2_compile_context *pcre2_compile_context_copy( .B " pcre2_compile_context *\fIccontext\fP);" .fi . .SH DESCRIPTION .rs .sp This function makes a new copy of a compile context, using the memory allocation function that was used for the original context. The result is NULL if the memory cannot be obtained. .P There is a complete description of the PCRE2 native API in the .\" HREF \fBpcre2api\fP .\" page and a description of the POSIX API in the .\" HREF \fBpcre2posix\fP .\" page. ================================================ FILE: doc/pcre2_compile_context_create.3 ================================================ .TH PCRE2_COMPILE_CONTEXT_CREATE 3 "25 October 2014" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include .PP .nf .B pcre2_compile_context *pcre2_compile_context_create( .B " pcre2_general_context *\fIgcontext\fP);" .fi . .SH DESCRIPTION .rs .sp This function creates and initializes a new compile context. If its argument is NULL, \fBmalloc()\fP is used to get the necessary memory; otherwise the memory allocation function within the general context is used. The result is NULL if the memory could not be obtained. 
.P There is a complete description of the PCRE2 native API in the .\" HREF \fBpcre2api\fP .\" page and a description of the POSIX API in the .\" HREF \fBpcre2posix\fP .\" page. ================================================ FILE: doc/pcre2_compile_context_free.3 ================================================ .TH PCRE2_COMPILE_CONTEXT_FREE 3 "28 June 2018" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include .PP .nf .B void pcre2_compile_context_free(pcre2_compile_context *\fIccontext\fP); .fi . .SH DESCRIPTION .rs .sp This function frees the memory occupied by a compile context, using the memory freeing function from the general context with which it was created, or \fBfree()\fP if that was not set. If the argument is NULL, the function returns immediately without doing anything. .P There is a complete description of the PCRE2 native API in the .\" HREF \fBpcre2api\fP .\" page and a description of the POSIX API in the .\" HREF \fBpcre2posix\fP .\" page. ================================================ FILE: doc/pcre2_config.3 ================================================ .TH PCRE2_CONFIG 3 "03 September 2025" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include .PP .nf .B int pcre2_config(uint32_t \fIwhat\fP, void *\fIwhere\fP); .fi . .SH DESCRIPTION .rs .sp This function makes it possible for a client program to find out which optional features are available in the version of the PCRE2 library it is using. The arguments are as follows: .sp \fIwhat\fP A code specifying what information is required \fIwhere\fP Points to where to put the information .sp If \fIwhere\fP is NULL, the function returns the amount of memory needed for the requested information. When the information is a string, the value is in code units; for other types of data it is in bytes. 
.P If \fBwhere\fP is not NULL, for PCRE2_CONFIG_JITTARGET, PCRE2_CONFIG_UNICODE_VERSION, and PCRE2_CONFIG_VERSION it must point to a buffer that is large enough to hold the string. For all other codes it must point to a uint32_t integer variable. The available codes are: .sp PCRE2_CONFIG_BSR Indicates what \eR matches by default: PCRE2_BSR_UNICODE PCRE2_BSR_ANYCRLF PCRE2_CONFIG_COMPILED_WIDTHS Which of 8/16/32 support was compiled PCRE2_CONFIG_DEPTHLIMIT Default backtracking depth limit PCRE2_CONFIG_EFFECTIVE_LINKSIZE How many bytes are used for link size PCRE2_CONFIG_HEAPLIMIT Default heap memory limit .\" JOIN PCRE2_CONFIG_JIT Availability of just-in-time compiler support (1=yes 0=no) .\" JOIN PCRE2_CONFIG_JITTARGET Information (a string) about the target architecture for the JIT compiler PCRE2_CONFIG_LINKSIZE Configured internal link size (2, 3, 4) PCRE2_CONFIG_MATCHLIMIT Default internal resource limit PCRE2_CONFIG_NEVER_BACKSLASH_C Whether or not \eC is disabled PCRE2_CONFIG_NEWLINE Code for the default newline sequence: PCRE2_NEWLINE_CR PCRE2_NEWLINE_LF PCRE2_NEWLINE_CRLF PCRE2_NEWLINE_ANY PCRE2_NEWLINE_ANYCRLF PCRE2_NEWLINE_NUL PCRE2_CONFIG_PARENSLIMIT Default parentheses nesting limit PCRE2_CONFIG_RECURSIONLIMIT Obsolete: use PCRE2_CONFIG_DEPTHLIMIT PCRE2_CONFIG_STACKRECURSE Obsolete: always returns 0 .\" JOIN PCRE2_CONFIG_UNICODE Availability of Unicode support (1=yes 0=no) PCRE2_CONFIG_UNICODE_VERSION The Unicode version (a string) PCRE2_CONFIG_VERSION The PCRE2 version (a string) .sp The function yields a non-negative value on success or the negative value PCRE2_ERROR_BADOPTION otherwise. This is also the result for the PCRE2_CONFIG_JITTARGET code if JIT support is not available. When a string is requested, the function returns the number of code units used, including the terminating zero. 
.P There is a complete description of the PCRE2 native API in the .\" HREF \fBpcre2api\fP .\" page and a description of the POSIX API in the .\" HREF \fBpcre2posix\fP .\" page. ================================================ FILE: doc/pcre2_convert_context_copy.3 ================================================ .TH PCRE2_CONVERT_CONTEXT_COPY 3 "12 July 2017" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include .PP .nf .B pcre2_convert_context *pcre2_convert_context_copy( .B " pcre2_convert_context *\fIcvcontext\fP);" .fi . .SH DESCRIPTION .rs .sp This function is part of an experimental set of pattern conversion functions. It makes a new copy of a convert context, using the memory allocation function that was used for the original context. The result is NULL if the memory cannot be obtained. .P The pattern conversion functions are described in the .\" HREF \fBpcre2convert\fP .\" documentation. ================================================ FILE: doc/pcre2_convert_context_create.3 ================================================ .TH PCRE2_CONVERT_CONTEXT_CREATE 3 "12 July 2017" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include .PP .nf .B pcre2_convert_context *pcre2_convert_context_create( .B " pcre2_general_context *\fIgcontext\fP);" .fi . .SH DESCRIPTION .rs .sp This function is part of an experimental set of pattern conversion functions. It creates and initializes a new convert context. If its argument is NULL, \fBmalloc()\fP is used to get the necessary memory; otherwise the memory allocation function within the general context is used. The result is NULL if the memory could not be obtained. .P The pattern conversion functions are described in the .\" HREF \fBpcre2convert\fP .\" documentation. 
================================================ FILE: doc/pcre2_convert_context_free.3 ================================================ .TH PCRE2_CONVERT_CONTEXT_FREE 3 "13 August 2018" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include .PP .nf .B void pcre2_convert_context_free(pcre2_convert_context *\fIcvcontext\fP); .fi . .SH DESCRIPTION .rs .sp This function is part of an experimental set of pattern conversion functions. It frees the memory occupied by a convert context, using the memory freeing function from the general context with which it was created, or \fBfree()\fP if that was not set. If the argument is NULL, the function returns immediately without doing anything. .P The pattern conversion functions are described in the .\" HREF \fBpcre2convert\fP .\" documentation. ================================================ FILE: doc/pcre2_converted_pattern_free.3 ================================================ .TH PCRE2_CONVERTED_PATTERN_FREE 3 "13 August 2018" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include .PP .nf .B void pcre2_converted_pattern_free(PCRE2_UCHAR *\fIconverted_pattern\fP); .fi . .SH DESCRIPTION .rs .sp This function is part of an experimental set of pattern conversion functions. It frees the memory occupied by a converted pattern that was obtained by calling \fBpcre2_pattern_convert()\fP with arguments that caused it to place the converted pattern into newly obtained heap memory. If the argument is NULL, the function returns immediately without doing anything. .P The pattern conversion functions are described in the .\" HREF \fBpcre2convert\fP .\" documentation. 
================================================ FILE: doc/pcre2_dfa_match.3 ================================================ .TH PCRE2_DFA_MATCH 3 "31 August 2021" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include .PP .nf .B int pcre2_dfa_match(const pcre2_code *\fIcode\fP, PCRE2_SPTR \fIsubject\fP, .B " PCRE2_SIZE \fIlength\fP, PCRE2_SIZE \fIstartoffset\fP," .B " uint32_t \fIoptions\fP, pcre2_match_data *\fImatch_data\fP," .B " pcre2_match_context *\fImcontext\fP," .B " int *\fIworkspace\fP, PCRE2_SIZE \fIwscount\fP);" .fi . .SH DESCRIPTION .rs .sp This function matches a compiled regular expression against a given subject string, using an alternative matching algorithm that scans the subject string just once (except when processing lookaround assertions). This function is \fInot\fP Perl-compatible (the Perl-compatible matching function is \fBpcre2_match()\fP). The arguments for this function are: .sp \fIcode\fP Points to the compiled pattern \fIsubject\fP Points to the subject string \fIlength\fP Length of the subject string \fIstartoffset\fP Offset in the subject at which to start matching \fIoptions\fP Option bits \fImatch_data\fP Points to a match data block, for results \fImcontext\fP Points to a match context, or is NULL \fIworkspace\fP Points to a vector of ints used as working space \fIwscount\fP Number of elements in the vector .sp The size of output vector needed to contain all the results depends on the number of simultaneous matches, not on the number of parentheses in the pattern. Using \fBpcre2_match_data_create_from_pattern()\fP to create the match data block is therefore not advisable when using this function. .P A match context is needed only if you want to set up a callout function or specify the heap limit or the match or the recursion depth limits. The \fIlength\fP and \fIstartoffset\fP values are code units, not characters. 
The options are: .sp PCRE2_ANCHORED Match only at the first position PCRE2_COPY_MATCHED_SUBJECT On success, make a private subject copy PCRE2_ENDANCHORED Pattern can match only at end of subject PCRE2_NOTBOL Subject is not the beginning of a line PCRE2_NOTEOL Subject is not the end of a line PCRE2_NOTEMPTY An empty string is not a valid match .\" JOIN PCRE2_NOTEMPTY_ATSTART An empty string at the start of the subject is not a valid match .\" JOIN PCRE2_NO_UTF_CHECK Do not check the subject for UTF validity (only relevant if PCRE2_UTF was set at compile time) .\" JOIN PCRE2_PARTIAL_HARD Return PCRE2_ERROR_PARTIAL for a partial match even if there is a full match .\" JOIN PCRE2_PARTIAL_SOFT Return PCRE2_ERROR_PARTIAL for a partial match if no full matches are found PCRE2_DFA_RESTART Restart after a partial match PCRE2_DFA_SHORTEST Return only the shortest match .sp There are restrictions on what may appear in a pattern when using this matching function. Details are given in the .\" HREF \fBpcre2matching\fP .\" documentation. For details of partial matching, see the .\" HREF \fBpcre2partial\fP .\" page. There is a complete description of the PCRE2 native API in the .\" HREF \fBpcre2api\fP .\" page and a description of the POSIX API in the .\" HREF \fBpcre2posix\fP .\" page. ================================================ FILE: doc/pcre2_general_context_copy.3 ================================================ .TH PCRE2_GENERAL_CONTEXT_COPY 3 "25 October 2014" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include .PP .nf .B pcre2_general_context *pcre2_general_context_copy( .B " pcre2_general_context *\fIgcontext\fP);" .fi . .SH DESCRIPTION .rs .sp This function makes a new copy of a general context, using the memory allocation functions in the context, if set, to get the necessary memory. Otherwise \fBmalloc()\fP is used. The result is NULL if the memory cannot be obtained. 
.P There is a complete description of the PCRE2 native API in the .\" HREF \fBpcre2api\fP .\" page and a description of the POSIX API in the .\" HREF \fBpcre2posix\fP .\" page. ================================================ FILE: doc/pcre2_general_context_create.3 ================================================ .TH PCRE2_GENERAL_CONTEXT_CREATE 3 "23 January 2023" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include .PP .nf .B pcre2_general_context *pcre2_general_context_create( .B " void *(*\fIprivate_malloc\fP)(size_t, void *)," .B " void (*\fIprivate_free\fP)(void *, void *), void *\fImemory_data\fP);" .fi . .SH DESCRIPTION .rs .sp This function creates and initializes a general context. The arguments define custom memory management functions and a data value that is passed to them when they are called. The \fBprivate_malloc()\fP function is used to get memory for the context. If either of the first two arguments is NULL, the system memory management function is used. The result is NULL if no memory could be obtained. .P There is a complete description of the PCRE2 native API in the .\" HREF \fBpcre2api\fP .\" page and a description of the POSIX API in the .\" HREF \fBpcre2posix\fP .\" page. ================================================ FILE: doc/pcre2_general_context_free.3 ================================================ .TH PCRE2_GENERAL_CONTEXT_FREE 3 "28 June 2018" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include .PP .nf .B void pcre2_general_context_free(pcre2_general_context *\fIgcontext\fP); .fi . .SH DESCRIPTION .rs .sp This function frees the memory occupied by a general context, using the memory freeing function within the context, if set. If the argument is NULL, the function returns immediately without doing anything. 
.P There is a complete description of the PCRE2 native API in the .\" HREF \fBpcre2api\fP .\" page and a description of the POSIX API in the .\" HREF \fBpcre2posix\fP .\" page. ================================================ FILE: doc/pcre2_get_error_message.3 ================================================ .TH PCRE2_GET_ERROR_MESSAGE 3 "24 March 2017" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include .PP .nf .B int pcre2_get_error_message(int \fIerrorcode\fP, PCRE2_UCHAR *\fIbuffer\fP, .B " PCRE2_SIZE \fIbufflen\fP);" .fi . .SH DESCRIPTION .rs .sp This function provides a textual error message for each PCRE2 error code. Compilation errors are positive numbers; UTF formatting errors and matching errors are negative numbers. The arguments are: .sp \fIerrorcode\fP an error code (positive or negative) \fIbuffer\fP where to put the message \fIbufflen\fP the length of the buffer (code units) .sp The function returns the length of the message in code units, excluding the trailing zero, or the negative error code PCRE2_ERROR_NOMEMORY if the buffer is too small. In this case, the returned message is truncated (but still with a trailing zero). If \fIerrorcode\fP does not contain a recognized error code number, the negative value PCRE2_ERROR_BADDATA is returned. .P There is a complete description of the PCRE2 native API in the .\" HREF \fBpcre2api\fP .\" page and a description of the POSIX API in the .\" HREF \fBpcre2posix\fP .\" page. ================================================ FILE: doc/pcre2_get_mark.3 ================================================ .TH PCRE2_GET_MARK 3 "13 January 2018" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include .PP .nf .B PCRE2_SPTR pcre2_get_mark(pcre2_match_data *\fImatch_data\fP); .fi . 
.SH DESCRIPTION .rs .sp After a call of \fBpcre2_match()\fP that was passed the match block that is this function's argument, this function returns a pointer to the last (*MARK), (*PRUNE), or (*THEN) name that was encountered during the matching process. The name is zero-terminated, and is within the compiled pattern. The length of the name is in the preceding code unit. If no name is available, NULL is returned. .P After a successful match, the name that is returned is the last one on the matching path. After a failed match or a partial match, the last encountered name is returned. .P There is a complete description of the PCRE2 native API in the .\" HREF \fBpcre2api\fP .\" page and a description of the POSIX API in the .\" HREF \fBpcre2posix\fP .\" page. ================================================ FILE: doc/pcre2_get_match_data_heapframes_size.3 ================================================ .TH PCRE2_GET_MATCH_DATA_HEAPFRAMES_SIZE 3 "18 January 2023" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include .PP .nf .B PCRE2_SIZE pcre2_get_match_data_heapframes_size( .B " pcre2_match_data *\fImatch_data\fP);" .fi . .SH DESCRIPTION .rs .sp This function returns the size, in bytes, of the heapframes data block that is owned by its argument. .P There is a complete description of the PCRE2 native API in the .\" HREF \fBpcre2api\fP .\" page and a description of the POSIX API in the .\" HREF \fBpcre2posix\fP .\" page. ================================================ FILE: doc/pcre2_get_match_data_size.3 ================================================ .TH PCRE2_GET_MATCH_DATA_SIZE 3 "17 October 2019" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include .PP .nf .B PCRE2_SIZE pcre2_get_match_data_size(pcre2_match_data *\fImatch_data\fP); .fi . 
.SH DESCRIPTION .rs .sp This function returns the size, in bytes, of the match data block that is its argument. .P There is a complete description of the PCRE2 native API in the .\" HREF \fBpcre2api\fP .\" page and a description of the POSIX API in the .\" HREF \fBpcre2posix\fP .\" page. ================================================ FILE: doc/pcre2_get_ovector_count.3 ================================================ .TH PCRE2_GET_OVECTOR_COUNT 3 "25 October 2014" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include .PP .nf .B uint32_t pcre2_get_ovector_count(pcre2_match_data *\fImatch_data\fP); .fi . .SH DESCRIPTION .rs .sp This function returns the number of pairs of offsets in the ovector that forms part of the given match data block. .P There is a complete description of the PCRE2 native API in the .\" HREF \fBpcre2api\fP .\" page and a description of the POSIX API in the .\" HREF \fBpcre2posix\fP .\" page. ================================================ FILE: doc/pcre2_get_ovector_pointer.3 ================================================ .TH PCRE2_GET_OVECTOR_POINTER 3 "25 October 2014" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include .PP .nf .B PCRE2_SIZE *pcre2_get_ovector_pointer(pcre2_match_data *\fImatch_data\fP); .fi . .SH DESCRIPTION .rs .sp This function returns a pointer to the vector of offsets that forms part of the given match data block. The number of pairs can be found by calling \fBpcre2_get_ovector_count()\fP. .P There is a complete description of the PCRE2 native API in the .\" HREF \fBpcre2api\fP .\" page and a description of the POSIX API in the .\" HREF \fBpcre2posix\fP .\" page. 
================================================ FILE: doc/pcre2_get_startchar.3 ================================================ .TH PCRE2_GET_STARTCHAR 3 "25 October 2014" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include .PP .nf .B PCRE2_SIZE pcre2_get_startchar(pcre2_match_data *\fImatch_data\fP); .fi . .SH DESCRIPTION .rs .sp After a successful call of \fBpcre2_match()\fP that was passed the match block that is this function's argument, this function returns the code unit offset of the character at which the successful match started. For a non-partial match, this can be different to the value of \fIovector[0]\fP if the pattern contains the \eK escape sequence. After a partial match, however, this value is always the same as \fIovector[0]\fP because \eK does not affect the result of a partial match. .P There is a complete description of the PCRE2 native API in the .\" HREF \fBpcre2api\fP .\" page and a description of the POSIX API in the .\" HREF \fBpcre2posix\fP .\" page. ================================================ FILE: doc/pcre2_jit_compile.3 ================================================ .TH PCRE2_JIT_COMPILE 3 "22 August 2024" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include .PP .nf .B int pcre2_jit_compile(pcre2_code *\fIcode\fP, uint32_t \fIoptions\fP); .fi . .SH DESCRIPTION .rs .sp This function requests JIT compilation, which, if the just-in-time compiler is available, further processes a compiled pattern into machine code that executes much faster than the \fBpcre2_match()\fP interpretive matching function. Full details are given in the .\" HREF \fBpcre2jit\fP .\" documentation. .P The availability of JIT support can be tested by calling \fBpcre2_jit_compile()\fP with a single option PCRE2_JIT_TEST_ALLOC (the code argument is ignored, so a NULL value is accepted).
Such a call returns zero if JIT is available and has a working allocator. Otherwise it returns PCRE2_ERROR_NOMEMORY if JIT is available but cannot allocate executable memory, or PCRE2_ERROR_JIT_UNSUPPORTED if JIT support is not compiled. .P Otherwise, the first argument must be a pointer that was returned by a successful call to \fBpcre2_compile()\fP, and the second must contain one or more of the following bits: .sp PCRE2_JIT_COMPLETE compile code for full matching PCRE2_JIT_PARTIAL_SOFT compile code for soft partial matching PCRE2_JIT_PARTIAL_HARD compile code for hard partial matching .sp There is also an obsolete option called PCRE2_JIT_INVALID_UTF, which has been superseded by the \fBpcre2_compile()\fP option PCRE2_MATCH_INVALID_UTF. The old option is deprecated and may be removed in the future. .P The yield of the function when called with any of the three options above is 0 for success, or a negative error code otherwise. In particular, PCRE2_ERROR_JIT_BADOPTION is returned if JIT is not supported or if an unknown bit is set in \fIoptions\fP. The function can also return PCRE2_ERROR_NOMEMORY if JIT is unable to allocate executable memory for the compiler, even if it was because of a system security restriction. In a few cases, the function may return with PCRE2_ERROR_JIT_UNSUPPORTED for unsupported features. .P There is a complete description of the PCRE2 native API in the .\" HREF \fBpcre2api\fP .\" page and a description of the POSIX API in the .\" HREF \fBpcre2posix\fP .\" page. ================================================ FILE: doc/pcre2_jit_free_unused_memory.3 ================================================ .TH PCRE2_JIT_FREE_UNUSED_MEMORY 3 "24 April 2020" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include .PP .nf .B void pcre2_jit_free_unused_memory(pcre2_general_context *\fIgcontext\fP); .fi . .SH DESCRIPTION .rs .sp This function frees unused JIT executable memory. 
The argument is a general context, for custom memory management, or NULL for standard memory management. JIT memory allocation retains some memory in order to improve future JIT compilation speed. In low memory conditions, \fBpcre2_jit_free_unused_memory()\fP can be used to cause this memory to be freed. .P There is a complete description of the PCRE2 native API in the .\" HREF \fBpcre2api\fP .\" page and a description of the POSIX API in the .\" HREF \fBpcre2posix\fP .\" page. ================================================ FILE: doc/pcre2_jit_match.3 ================================================ .TH PCRE2_JIT_MATCH 3 "20 January 2023" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include .PP .nf .B int pcre2_jit_match(const pcre2_code *\fIcode\fP, PCRE2_SPTR \fIsubject\fP, .B " PCRE2_SIZE \fIlength\fP, PCRE2_SIZE \fIstartoffset\fP," .B " uint32_t \fIoptions\fP, pcre2_match_data *\fImatch_data\fP," .B " pcre2_match_context *\fImcontext\fP);" .fi . .SH DESCRIPTION .rs .sp This function matches a compiled regular expression that has been successfully processed by the JIT compiler against a given subject string, using a matching algorithm that is similar to Perl's. It is a "fast path" interface to JIT, and it bypasses some of the sanity checks that \fBpcre2_match()\fP applies. .P In UTF mode, the subject string is not checked for UTF validity. Unless PCRE2_MATCH_INVALID_UTF was set when the pattern was compiled, passing an invalid UTF string results in undefined behaviour. Your program may crash or loop or give wrong results. In the absence of PCRE2_MATCH_INVALID_UTF you should only call \fBpcre2_jit_match()\fP in UTF mode if you are sure the subject is valid. .P The arguments for \fBpcre2_jit_match()\fP are exactly the same as for .\" HREF \fBpcre2_match()\fP, .\" except that the subject string must be specified with a length; PCRE2_ZERO_TERMINATED is not supported. 
.P The supported options are PCRE2_NOTBOL, PCRE2_NOTEOL, PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART, PCRE2_PARTIAL_HARD, and PCRE2_PARTIAL_SOFT. Unsupported options are ignored. .P The return values are the same as for \fBpcre2_match()\fP plus PCRE2_ERROR_JIT_BADOPTION if a matching mode (partial or complete) is requested that was not compiled. For details of partial matching, see the .\" HREF \fBpcre2partial\fP .\" page. .P There is a complete description of the PCRE2 native API in the .\" HREF \fBpcre2api\fP .\" page and a description of the JIT API in the .\" HREF \fBpcre2jit\fP .\" page. ================================================ FILE: doc/pcre2_jit_stack_assign.3 ================================================ .TH PCRE2_JIT_STACK_ASSIGN 3 "13 August 2018" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include .PP .nf .B void pcre2_jit_stack_assign(pcre2_match_context *\fImcontext\fP, .B " pcre2_jit_callback \fIcallback_function\fP, void *\fIcallback_data\fP);" .fi . .SH DESCRIPTION .rs .sp This function provides control over the memory used by JIT as a run-time stack when \fBpcre2_match()\fP or \fBpcre2_jit_match()\fP is called with a pattern that has been successfully processed by the JIT compiler. The information that determines which stack is used is put into a match context that is subsequently passed to a matching function. The arguments of this function are: .sp mcontext a pointer to a match context callback a callback function callback_data a JIT stack or a value to be passed to the callback .P If \fImcontext\fP is NULL, the function returns immediately, without doing anything. .P If \fIcallback\fP is NULL and \fIcallback_data\fP is NULL, an internal 32KiB block on the machine stack is used. .P If \fIcallback\fP is NULL and \fIcallback_data\fP is not NULL, \fIcallback_data\fP must be a valid JIT stack, the result of calling \fBpcre2_jit_stack_create()\fP. 
.P If \fIcallback\fP is not NULL, it is called with \fIcallback_data\fP as an argument at the start of matching, in order to set up a JIT stack. If the result is NULL, the internal 32KiB stack is used; otherwise the return value must be a valid JIT stack, the result of calling \fBpcre2_jit_stack_create()\fP. .P You may safely use the same JIT stack for multiple patterns, as long as they are all matched in the same thread. In a multithread application, each thread must use its own JIT stack. For more details, see the .\" HREF \fBpcre2jit\fP .\" page. .P There is a complete description of the PCRE2 native API in the .\" HREF \fBpcre2api\fP .\" page and a description of the POSIX API in the .\" HREF \fBpcre2posix\fP .\" page. ================================================ FILE: doc/pcre2_jit_stack_create.3 ================================================ .TH PCRE2_JIT_STACK_CREATE 3 "23 January 2023" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include .PP .nf .B pcre2_jit_stack *pcre2_jit_stack_create(size_t \fIstartsize\fP, .B " size_t \fImaxsize\fP, pcre2_general_context *\fIgcontext\fP);" .fi . .SH DESCRIPTION .rs .sp This function is used to create a stack for use by the code compiled by the JIT compiler. The first two arguments are a starting size for the stack, and a maximum size to which it is allowed to grow. The final argument is a general context, for memory allocation functions, or NULL for standard memory allocation. The result can be passed to the JIT run-time code by calling \fBpcre2_jit_stack_assign()\fP to associate the stack with a compiled pattern, which can then be processed by \fBpcre2_match()\fP or \fBpcre2_jit_match()\fP. A maximum stack size of 512KiB to 1MiB should be more than enough for any pattern. If the stack couldn't be allocated or the values passed were not reasonable, NULL will be returned. For more details, see the .\" HREF \fBpcre2jit\fP .\" page.
.P There is a complete description of the PCRE2 native API in the .\" HREF \fBpcre2api\fP .\" page and a description of the POSIX API in the .\" HREF \fBpcre2posix\fP .\" page. ================================================ FILE: doc/pcre2_jit_stack_free.3 ================================================ .TH PCRE2_JIT_STACK_FREE 3 "26 February 2025" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include .PP .nf .B void pcre2_jit_stack_free(pcre2_jit_stack *\fIjit_stack\fP); .fi . .SH DESCRIPTION .rs .sp This function is used to free a JIT stack that was created by \fBpcre2_jit_stack_create()\fP when it is no longer needed. If the argument is NULL, the function returns immediately without doing anything. For more details, see the .\" HREF \fBpcre2jit\fP .\" page. .P There is a complete description of the PCRE2 native API in the .\" HREF \fBpcre2api\fP .\" page and a description of the POSIX API in the .\" HREF \fBpcre2posix\fP .\" page. ================================================ FILE: doc/pcre2_maketables.3 ================================================ .TH PCRE2_MAKETABLES 3 "26 February 2025" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include .PP .nf .B const uint8_t *pcre2_maketables(pcre2_general_context *\fIgcontext\fP); .fi . .SH DESCRIPTION .rs .sp This function builds a set of character tables for character code points that are less than 256. These can be passed to \fBpcre2_compile()\fP in a compile context in order to override the internal, built-in tables (which were either defaulted or made by \fBpcre2_maketables()\fP when PCRE2 was compiled). See the .\" HREF \fBpcre2_set_character_tables()\fP .\" page. You might want to do this if you are using a non-standard locale. .P If the argument is NULL, \fBmalloc()\fP is used to get memory for the tables. 
Otherwise it must point to a general context, which can supply pointers to a custom memory manager. The function yields a pointer to the tables. .P There is a complete description of the PCRE2 native API in the .\" HREF \fBpcre2api\fP .\" page and a description of the POSIX API in the .\" HREF \fBpcre2posix\fP .\" page. ================================================ FILE: doc/pcre2_maketables_free.3 ================================================ .TH PCRE2_MAKETABLES_FREE 3 "03 September 2019" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include .PP .nf .B void pcre2_maketables_free(pcre2_general_context *\fIgcontext\fP, .B " const uint8_t *\fItables\fP);" .fi . .SH DESCRIPTION .rs .sp This function discards a set of character tables that were created by a call to .\" HREF \fBpcre2_maketables()\fP. .\" .P The \fIgcontext\fP parameter should match what was used in that call to account for any custom allocators that might be in use; if it is NULL the system \fBfree()\fP is used. .P There is a complete description of the PCRE2 native API in the .\" HREF \fBpcre2api\fP .\" page. ================================================ FILE: doc/pcre2_match.3 ================================================ .TH PCRE2_MATCH 3 "27 January 2024" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include .PP .nf .B int pcre2_match(const pcre2_code *\fIcode\fP, PCRE2_SPTR \fIsubject\fP, .B " PCRE2_SIZE \fIlength\fP, PCRE2_SIZE \fIstartoffset\fP," .B " uint32_t \fIoptions\fP, pcre2_match_data *\fImatch_data\fP," .B " pcre2_match_context *\fImcontext\fP);" .fi . .SH DESCRIPTION .rs .sp This function matches a compiled regular expression against a given subject string, using a matching algorithm that is similar to Perl's. 
It returns offsets to what it has matched and to captured substrings via the \fBmatch_data\fP block, which can be processed by functions with names that start with \fBpcre2_get_ovector_...()\fP or \fBpcre2_substring_...()\fP. The return from \fBpcre2_match()\fP is one more than the highest numbered capturing pair that has been set (for example, 1 if there are no captures), zero if the vector of offsets is too small, or a negative error code for no match and other errors. The function arguments are: .sp \fIcode\fP Points to the compiled pattern \fIsubject\fP Points to the subject string \fIlength\fP Length of the subject string \fIstartoffset\fP Offset in the subject at which to start matching \fIoptions\fP Option bits \fImatch_data\fP Points to a match data block, for results \fImcontext\fP Points to a match context, or is NULL .sp A match context is needed only if you want to: .sp Set up a callout function Set a matching offset limit Change the heap memory limit Change the backtracking match limit Change the backtracking depth limit Set custom memory management specifically for the match .sp The \fIlength\fP and \fIstartoffset\fP values are code units, not characters. The length may be given as PCRE2_ZERO_TERMINATED for a subject that is terminated by a binary zero code unit. 
The options are: .sp PCRE2_ANCHORED Match only at the first position PCRE2_COPY_MATCHED_SUBJECT On success, make a private subject copy PCRE2_DISABLE_RECURSELOOP_CHECK Only useful in rare cases; use with care PCRE2_ENDANCHORED Pattern can match only at end of subject PCRE2_NOTBOL Subject string is not the beginning of a line PCRE2_NOTEOL Subject string is not the end of a line PCRE2_NOTEMPTY An empty string is not a valid match .\" JOIN PCRE2_NOTEMPTY_ATSTART An empty string at the start of the subject is not a valid match PCRE2_NO_JIT Do not use JIT matching .\" JOIN PCRE2_NO_UTF_CHECK Do not check the subject for UTF validity (only relevant if PCRE2_UTF was set at compile time) .\" JOIN PCRE2_PARTIAL_HARD Return PCRE2_ERROR_PARTIAL for a partial match even if there is a full match .\" JOIN PCRE2_PARTIAL_SOFT Return PCRE2_ERROR_PARTIAL for a partial match if no full matches are found .sp For details of partial matching, see the .\" HREF \fBpcre2partial\fP .\" page. There is a complete description of the PCRE2 native API in the .\" HREF \fBpcre2api\fP .\" page and a description of the POSIX API in the .\" HREF \fBpcre2posix\fP .\" page. ================================================ FILE: doc/pcre2_match_context_copy.3 ================================================ .TH PCRE2_MATCH_CONTEXT_COPY 3 "25 October 2014" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include .PP .nf .B pcre2_match_context *pcre2_match_context_copy( .B " pcre2_match_context *\fImcontext\fP);" .fi . .SH DESCRIPTION .rs .sp This function makes a new copy of a match context, using the memory allocation function that was used for the original context. The result is NULL if the memory cannot be obtained. .P There is a complete description of the PCRE2 native API in the .\" HREF \fBpcre2api\fP .\" page and a description of the POSIX API in the .\" HREF \fBpcre2posix\fP .\" page. 
================================================ FILE: doc/pcre2_match_context_create.3 ================================================ .TH PCRE2_MATCH_CONTEXT_CREATE 3 "25 October 2014" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include .PP .nf .B pcre2_match_context *pcre2_match_context_create( .B " pcre2_general_context *\fIgcontext\fP);" .fi . .SH DESCRIPTION .rs .sp This function creates and initializes a new match context. If its argument is NULL, \fBmalloc()\fP is used to get the necessary memory; otherwise the memory allocation function within the general context is used. The result is NULL if the memory could not be obtained. .P There is a complete description of the PCRE2 native API in the .\" HREF \fBpcre2api\fP .\" page and a description of the POSIX API in the .\" HREF \fBpcre2posix\fP .\" page. ================================================ FILE: doc/pcre2_match_context_free.3 ================================================ .TH PCRE2_MATCH_CONTEXT_FREE 3 "28 June 2018" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include .PP .nf .B void pcre2_match_context_free(pcre2_match_context *\fImcontext\fP); .fi . .SH DESCRIPTION .rs .sp This function frees the memory occupied by a match context, using the memory freeing function from the general context with which it was created, or \fBfree()\fP if that was not set. If the argument is NULL, the function returns immediately without doing anything. .P There is a complete description of the PCRE2 native API in the .\" HREF \fBpcre2api\fP .\" page and a description of the POSIX API in the .\" HREF \fBpcre2posix\fP .\" page. 
================================================ FILE: doc/pcre2_match_data_create.3 ================================================ .TH PCRE2_MATCH_DATA_CREATE 3 "28 August 2021" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include .PP .nf .B pcre2_match_data *pcre2_match_data_create(uint32_t \fIovecsize\fP, .B " pcre2_general_context *\fIgcontext\fP);" .fi . .SH DESCRIPTION .rs .sp This function creates a new match data block, which is used for holding the result of a match. The first argument specifies the number of pairs of offsets that are required. These form the "output vector" (ovector) within the match data block, and are used to identify the matched string and any captured substrings when matching with \fBpcre2_match()\fP, or a number of different matches at the same point when used with \fBpcre2_dfa_match()\fP. There is always one pair of offsets; if \fBovecsize\fP is zero, it is treated as one. .P The second argument points to a general context, for custom memory management, or is NULL for system memory management. The result of the function is NULL if the memory for the block could not be obtained. .P There is a complete description of the PCRE2 native API in the .\" HREF \fBpcre2api\fP .\" page and a description of the POSIX API in the .\" HREF \fBpcre2posix\fP .\" page. ================================================ FILE: doc/pcre2_match_data_create_from_pattern.3 ================================================ .TH PCRE2_MATCH_DATA_CREATE_FROM_PATTERN 3 "11 August 2025" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include .PP .nf .B pcre2_match_data *pcre2_match_data_create_from_pattern( .B " const pcre2_code *\fIcode\fP, pcre2_general_context *\fIgcontext\fP);" .fi . .SH DESCRIPTION .rs .sp This function creates a new match data block for holding the result of a match. 
If the first argument is NULL, this function returns NULL, otherwise the first argument points to a compiled pattern. The number of capturing parentheses within the pattern is used to compute the number of pairs of offsets that are required in the match data block. These form the "output vector" (ovector) within the match data block, and are used to identify the matched string and any captured substrings when matching with \fBpcre2_match()\fP. If you are using \fBpcre2_dfa_match()\fP, which uses the output vector in a different way, you should use \fBpcre2_match_data_create()\fP instead of this function. .P The second argument points to a general context, for custom memory management, or is NULL to use the same memory allocator that was used for the compiled pattern. The result of the function is NULL if the memory for the block could not be obtained or if NULL was provided as the first argument. .P There is a complete description of the PCRE2 native API in the .\" HREF \fBpcre2api\fP .\" page and a description of the POSIX API in the .\" HREF \fBpcre2posix\fP .\" page. ================================================ FILE: doc/pcre2_match_data_free.3 ================================================ .TH PCRE2_MATCH_DATA_FREE 3 "16 August 2023" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include .PP .nf .B void pcre2_match_data_free(pcre2_match_data *\fImatch_data\fP); .fi . .SH DESCRIPTION .rs .sp If \fImatch_data\fP is NULL, this function does nothing. Otherwise, \fImatch_data\fP must point to a match data block, which this function frees, using the memory freeing function from the general context or compiled pattern with which it was created, or \fBfree()\fP if that was not set. If the match data block was previously passed to \fBpcre2_match()\fP, it will have an attached heapframe vector; this is also freed. 
.P If the PCRE2_COPY_MATCHED_SUBJECT option was used for a successful match using this match data block, the copy of the subject that was referenced within the block is also freed. .P There is a complete description of the PCRE2 native API in the .\" HREF \fBpcre2api\fP .\" page and a description of the POSIX API in the .\" HREF \fBpcre2posix\fP .\" page. ================================================ FILE: doc/pcre2_next_match.3 ================================================ .TH PCRE2_NEXT_MATCH 3 "24 March 2025" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include .PP .nf .B int pcre2_next_match(pcre2_match_data *\fImatch_data\fP, .B " PCRE2_SIZE *\fIpstart_offset\fP, uint32_t *\fIpoptions\fP);" .fi . .SH DESCRIPTION .rs .sp This function can be called after one of the match functions (\fBpcre2_match()\fP, \fBpcre2_dfa_match()\fP, or \fBpcre2_jit_match()\fP), and must be provided with the same \fImatch_data\fP parameter. It outputs the appropriate parameters for searching for the next match in the same subject string, and is suitable for applications providing "global" matching behaviour (for example, replacing all matches in the subject, or splitting the subject on all matches, or simply counting the number of matches). .P It returns 0 ("false") if there is no need to make any further match attempts, or 1 ("true") if another match should be attempted. .P The *\fIpstart_offset\fP and *\fIpoptions\fP are set if the function returns 1. The *\fIpstart_offset\fP should be passed to the next match attempt directly, and the *\fIpoptions\fP should be passed to the next match attempt by combining with the application's match options using OR. .P There is a complete description of the PCRE2 native API in the .\" HREF \fBpcre2api\fP .\" page and a description of the POSIX API in the .\" HREF \fBpcre2posix\fP .\" page.
================================================ FILE: doc/pcre2_pattern_convert.3 ================================================ .TH PCRE2_PATTERN_CONVERT 3 "12 July 2017" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include .PP .nf .B int pcre2_pattern_convert(PCRE2_SPTR \fIpattern\fP, PCRE2_SIZE \fIlength\fP, .B " uint32_t \fIoptions\fP, PCRE2_UCHAR **\fIbuffer\fP," .B " PCRE2_SIZE *\fIblength\fP, pcre2_convert_context *\fIcvcontext\fP);" .fi . .SH DESCRIPTION .rs .sp This function is part of an experimental set of pattern conversion functions. It converts a foreign pattern (for example, a glob) into a PCRE2 regular expression pattern. Its arguments are: .sp \fIpattern\fP The foreign pattern \fIlength\fP The length of the input pattern or PCRE2_ZERO_TERMINATED \fIoptions\fP Option bits \fIbuffer\fP Pointer to pointer to output buffer, or NULL \fIblength\fP Pointer to output length field \fIcvcontext\fP Pointer to a convert context or NULL .sp The length of the converted pattern (excluding the terminating zero) is returned via \fIblength\fP. If \fIbuffer\fP is NULL, the function just returns the output length. If \fIbuffer\fP points to a NULL pointer, heap memory is obtained for the converted pattern, using the allocator in the context if present (or else \fBmalloc()\fP), and the field pointed to by \fIbuffer\fP is updated. If \fIbuffer\fP points to a non-NULL field, that must point to a buffer whose size is in the variable pointed to by \fIblength\fP. This value is updated. 
.P The option bits are: .sp PCRE2_CONVERT_UTF Input is UTF PCRE2_CONVERT_NO_UTF_CHECK Do not check UTF validity PCRE2_CONVERT_POSIX_BASIC Convert POSIX basic pattern PCRE2_CONVERT_POSIX_EXTENDED Convert POSIX extended pattern PCRE2_CONVERT_GLOB ) Convert PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR ) various types PCRE2_CONVERT_GLOB_NO_STARSTAR ) of glob .sp The return value from \fBpcre2_pattern_convert()\fP is zero on success or a non-zero PCRE2 error code. .P The pattern conversion functions are described in the .\" HREF \fBpcre2convert\fP .\" documentation. ================================================ FILE: doc/pcre2_pattern_info.3 ================================================ .TH PCRE2_PATTERN_INFO 3 "14 February 2019" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include .PP .nf .B int pcre2_pattern_info(const pcre2_code *\fIcode\fP, uint32_t \fIwhat\fP, .B " void *\fIwhere\fP);" .fi . .SH DESCRIPTION .rs .sp This function returns information about a compiled pattern. 
Its arguments are: .sp \fIcode\fP Pointer to a compiled regular expression pattern \fIwhat\fP What information is required \fIwhere\fP Where to put the information .sp The recognized values for the \fIwhat\fP argument, and the information they request are as follows: .sp PCRE2_INFO_ALLOPTIONS Final options after compiling PCRE2_INFO_ARGOPTIONS Options passed to \fBpcre2_compile()\fP PCRE2_INFO_BACKREFMAX Number of highest backreference PCRE2_INFO_BSR What \eR matches: PCRE2_BSR_UNICODE: Unicode line endings PCRE2_BSR_ANYCRLF: CR, LF, or CRLF only PCRE2_INFO_CAPTURECOUNT Number of capturing subpatterns .\" JOIN PCRE2_INFO_DEPTHLIMIT Backtracking depth limit if set, otherwise PCRE2_ERROR_UNSET PCRE2_INFO_EXTRAOPTIONS Extra options that were passed in the compile context PCRE2_INFO_FIRSTBITMAP Bitmap of first code units, or NULL PCRE2_INFO_FIRSTCODETYPE Type of start-of-match information 0 nothing set 1 first code unit is set 2 start of string or after newline PCRE2_INFO_FIRSTCODEUNIT First code unit when type is 1 PCRE2_INFO_FRAMESIZE Size of backtracking frame PCRE2_INFO_HASBACKSLASHC Return 1 if pattern contains \eC .\" JOIN PCRE2_INFO_HASCRORLF Return 1 if explicit CR or LF matches exist in the pattern .\" JOIN PCRE2_INFO_HEAPLIMIT Heap memory limit if set, otherwise PCRE2_ERROR_UNSET PCRE2_INFO_JCHANGED Return 1 if (?J) or (?-J) was used PCRE2_INFO_JITSIZE Size of JIT compiled code, or 0 PCRE2_INFO_LASTCODETYPE Type of must-be-present information 0 nothing set 1 code unit is set PCRE2_INFO_LASTCODEUNIT Last code unit when type is 1 .\" JOIN PCRE2_INFO_MATCHEMPTY 1 if the pattern can match an empty string, 0 otherwise .\" JOIN PCRE2_INFO_MATCHLIMIT Match limit if set, otherwise PCRE2_ERROR_UNSET .\" JOIN PCRE2_INFO_MAXLOOKBEHIND Length (in characters) of the longest lookbehind assertion PCRE2_INFO_MINLENGTH Lower bound length of matching strings PCRE2_INFO_NAMECOUNT Number of named subpatterns PCRE2_INFO_NAMEENTRYSIZE Size of name table entries 
PCRE2_INFO_NAMETABLE Pointer to name table PCRE2_INFO_NEWLINE Code for the newline sequence: PCRE2_NEWLINE_CR PCRE2_NEWLINE_LF PCRE2_NEWLINE_CRLF PCRE2_NEWLINE_ANY PCRE2_NEWLINE_ANYCRLF PCRE2_NEWLINE_NUL PCRE2_INFO_RECURSIONLIMIT Obsolete synonym for PCRE2_INFO_DEPTHLIMIT PCRE2_INFO_SIZE Size of compiled pattern .sp If \fIwhere\fP is NULL, the function returns the amount of memory needed for the requested information, in bytes. Otherwise, the \fIwhere\fP argument must point to an unsigned 32-bit integer (uint32_t variable), except for the following \fIwhat\fP values, when it must point to a variable of the type shown: .sp PCRE2_INFO_FIRSTBITMAP const uint8_t * PCRE2_INFO_JITSIZE size_t PCRE2_INFO_NAMETABLE PCRE2_SPTR PCRE2_INFO_SIZE size_t .sp The yield of the function is zero on success or: .sp PCRE2_ERROR_NULL the argument \fIcode\fP is NULL PCRE2_ERROR_BADMAGIC the "magic number" was not found PCRE2_ERROR_BADOPTION the value of \fIwhat\fP is invalid PCRE2_ERROR_BADMODE the pattern was compiled in the wrong mode PCRE2_ERROR_UNSET the requested information is not set .P There is a complete description of the PCRE2 native API in the .\" HREF \fBpcre2api\fP .\" page and a description of the POSIX API in the .\" HREF \fBpcre2posix\fP .\" page. ================================================ FILE: doc/pcre2_serialize_decode.3 ================================================ .TH PCRE2_SERIALIZE_DECODE 3 "22 April 2022" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include <pcre2.h> .PP .nf .B int32_t pcre2_serialize_decode(pcre2_code **\fIcodes\fP, .B " int32_t \fInumber_of_codes\fP, const uint8_t *\fIbytes\fP," .B " pcre2_general_context *\fIgcontext\fP);" .fi . .SH DESCRIPTION .rs .sp This function decodes a serialized set of compiled patterns back into a list of individual patterns.
This is possible only on a host that is running the same version of PCRE2, with the same code unit width, and the host must also have the same endianness, pointer width and PCRE2_SIZE type. The arguments for \fBpcre2_serialize_decode()\fP are: .sp \fIcodes\fP pointer to a vector in which to build the list \fInumber_of_codes\fP number of slots in the vector \fIbytes\fP the serialized byte stream \fIgcontext\fP pointer to a general context or NULL .sp The \fIbytes\fP argument must point to a block of data that was originally created by \fBpcre2_serialize_encode()\fP, though it may have been saved on disc or elsewhere in the meantime. If there are more codes in the serialized data than slots in the list, only those compiled patterns that will fit are decoded. The yield of the function is the number of decoded patterns, or one of the following negative error codes: .sp PCRE2_ERROR_BADDATA \fInumber_of_codes\fP is zero or less PCRE2_ERROR_BADMAGIC mismatch of id bytes in \fIbytes\fP PCRE2_ERROR_BADMODE mismatch of variable unit size or PCRE version PCRE2_ERROR_NOMEMORY memory allocation failed PCRE2_ERROR_NULL \fIcodes\fP or \fIbytes\fP is NULL .sp PCRE2_ERROR_BADMAGIC may mean that the data is corrupt, or that it was compiled on a system with different endianness. .P There is a complete description of the PCRE2 native API in the .\" HREF \fBpcre2api\fP .\" page and a description of the serialization functions in the .\" HREF \fBpcre2serialize\fP .\" page. 
================================================ FILE: doc/pcre2_serialize_encode.3 ================================================ .TH PCRE2_SERIALIZE_ENCODE 3 "13 August 2018" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include .PP .nf .B int32_t pcre2_serialize_encode(const pcre2_code **\fIcodes\fP, .B " int32_t \fInumber_of_codes\fP, uint8_t **\fIserialized_bytes\fP," .B " PCRE2_SIZE *\fIserialized_size\fP, pcre2_general_context *\fIgcontext\fP);" .fi . .SH DESCRIPTION .rs .sp This function encodes a list of compiled patterns into a byte stream that can be saved on disc or elsewhere. Note that this is not an abstract format like Java or .NET. Conversion of the byte stream back into usable compiled patterns can only happen on a host that is running the same version of PCRE2, with the same code unit width, and the host must also have the same endianness, pointer width and PCRE2_SIZE type. The arguments for \fBpcre2_serialize_encode()\fP are: .sp \fIcodes\fP pointer to a vector containing the list \fInumber_of_codes\fP number of slots in the vector \fIserialized_bytes\fP set to point to the serialized byte stream \fIserialized_size\fP set to the number of bytes in the byte stream \fIgcontext\fP pointer to a general context or NULL .sp The context argument is used to obtain memory for the byte stream. When the serialized data is no longer needed, it must be freed by calling \fBpcre2_serialize_free()\fP. 
The yield of the function is the number of serialized patterns, or one of the following negative error codes: .sp PCRE2_ERROR_BADDATA \fInumber_of_codes\fP is zero or less PCRE2_ERROR_BADMAGIC mismatch of id bytes in one of the patterns PCRE2_ERROR_NOMEMORY memory allocation failed PCRE2_ERROR_MIXEDTABLES the patterns do not all use the same tables PCRE2_ERROR_NULL an argument other than \fIgcontext\fP is NULL .sp PCRE2_ERROR_BADMAGIC means either that a pattern's code has been corrupted, or that a slot in the vector does not point to a compiled pattern. .P There is a complete description of the PCRE2 native API in the .\" HREF \fBpcre2api\fP .\" page and a description of the serialization functions in the .\" HREF \fBpcre2serialize\fP .\" page. ================================================ FILE: doc/pcre2_serialize_free.3 ================================================ .TH PCRE2_SERIALIZE_FREE 3 "13 August 2018" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include <pcre2.h> .PP .nf .B void pcre2_serialize_free(uint8_t *\fIbytes\fP); .fi . .SH DESCRIPTION .rs .sp This function frees the memory that was obtained by \fBpcre2_serialize_encode()\fP to hold a serialized byte stream. The argument must point to such a byte stream or be NULL, in which case the function returns without doing anything. .P There is a complete description of the PCRE2 native API in the .\" HREF \fBpcre2api\fP .\" page and a description of the serialization functions in the .\" HREF \fBpcre2serialize\fP .\" page. ================================================ FILE: doc/pcre2_serialize_get_number_of_codes.3 ================================================ .TH PCRE2_SERIALIZE_GET_NUMBER_OF_CODES 3 "13 August 2018" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include <pcre2.h> .PP .nf .B int32_t pcre2_serialize_get_number_of_codes(const uint8_t *\fIbytes\fP); .fi .
.SH DESCRIPTION .rs .sp The \fIbytes\fP argument must point to a serialized byte stream that was originally created by \fBpcre2_serialize_encode()\fP (though it may have been saved on disc or elsewhere in the meantime). The function returns the number of serialized patterns in the byte stream, or one of the following negative error codes: .sp PCRE2_ERROR_BADMAGIC mismatch of id bytes in \fIbytes\fP PCRE2_ERROR_BADMODE mismatch of variable unit size or PCRE version PCRE2_ERROR_NULL the argument is NULL .sp PCRE2_ERROR_BADMAGIC may mean that the data is corrupt, or that it was compiled on a system with different endianness. .P There is a complete description of the PCRE2 native API in the .\" HREF \fBpcre2api\fP .\" page and a description of the serialization functions in the .\" HREF \fBpcre2serialize\fP .\" page. ================================================ FILE: doc/pcre2_set_bsr.3 ================================================ .TH PCRE2_SET_BSR 3 "25 October 2014" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include .PP .nf .B int pcre2_set_bsr(pcre2_compile_context *\fIccontext\fP, .B " uint32_t \fIvalue\fP);" .fi . .SH DESCRIPTION .rs .sp This function sets the convention for processing \eR within a compile context. The second argument must be one of PCRE2_BSR_ANYCRLF or PCRE2_BSR_UNICODE. The result is zero for success or PCRE2_ERROR_BADDATA if the second argument is invalid. .P There is a complete description of the PCRE2 native API in the .\" HREF \fBpcre2api\fP .\" page and a description of the POSIX API in the .\" HREF \fBpcre2posix\fP .\" page. 
================================================ FILE: doc/pcre2_set_callout.3 ================================================ .TH PCRE2_SET_CALLOUT 3 "25 March 2017" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include .PP .nf .B int pcre2_set_callout(pcre2_match_context *\fImcontext\fP, .B " int (*\fIcallout_function\fP)(pcre2_callout_block *)," .B " void *\fIcallout_data\fP);" .fi . .SH DESCRIPTION .rs .sp This function sets the callout fields in a match context (the first argument). The second argument specifies a callout function, and the third argument is an opaque data item that is passed to it. The result of this function is always zero. .P There is a complete description of the PCRE2 native API in the .\" HREF \fBpcre2api\fP .\" page and a description of the POSIX API in the .\" HREF \fBpcre2posix\fP .\" page. ================================================ FILE: doc/pcre2_set_character_tables.3 ================================================ .TH PCRE2_SET_CHARACTER_TABLES 3 "15 April 2020" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include .PP .nf .B int pcre2_set_character_tables(pcre2_compile_context *\fIccontext\fP, .B " const uint8_t *\fItables\fP);" .fi . .SH DESCRIPTION .rs .sp This function sets a pointer to custom character tables within a compile context. The second argument must point to a set of PCRE2 character tables or be NULL to request the default tables. The result is always zero. Character tables can be created by calling \fBpcre2_maketables()\fP or by running the \fBpcre2_dftables\fP maintenance command in binary mode (see the .\" HREF \fBpcre2build\fP .\" documentation). .P There is a complete description of the PCRE2 native API in the .\" HREF \fBpcre2api\fP .\" page and a description of the POSIX API in the .\" HREF \fBpcre2posix\fP .\" page. 
================================================ FILE: doc/pcre2_set_compile_extra_options.3 ================================================ .TH PCRE2_SET_COMPILE_EXTRA_OPTIONS 3 "14 October 2024" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include .PP .nf .B int pcre2_set_compile_extra_options(pcre2_compile_context *\fIccontext\fP, .B " uint32_t \fIextra_options\fP);" .fi . .SH DESCRIPTION .rs .sp This function sets additional option bits for \fBpcre2_compile()\fP that are housed in a compile context. It completely replaces all the bits. The extra options are: .sp PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK Allow \eK in lookarounds .\" JOIN PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES Allow \ex{d800} to \ex{dfff} in UTF-8 and UTF-32 modes .\" JOIN PCRE2_EXTRA_ALT_BSUX Extended alternate \eu, \eU, and \ex handling PCRE2_EXTRA_ASCII_BSD \ed remains ASCII in UCP mode PCRE2_EXTRA_ASCII_BSS \es remains ASCII in UCP mode PCRE2_EXTRA_ASCII_BSW \ew remains ASCII in UCP mode .\" JOIN PCRE2_EXTRA_ASCII_DIGIT [:digit:] and [:xdigit:] POSIX classes remain ASCII in UCP mode .\" JOIN PCRE2_EXTRA_ASCII_POSIX POSIX classes remain ASCII in UCP mode .\" JOIN PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL Treat all invalid escapes as a literal following character .\" JOIN PCRE2_EXTRA_CASELESS_RESTRICT Disable mixed ASCII/non-ASCII case folding PCRE2_EXTRA_ESCAPED_CR_IS_LF Interpret \er as \en PCRE2_EXTRA_MATCH_LINE Pattern matches whole lines PCRE2_EXTRA_MATCH_WORD Pattern matches "words" PCRE2_EXTRA_NEVER_CALLOUT Disallow callouts in pattern PCRE2_EXTRA_NO_BS0 Disallow \e0 (but not \e00 or \e000) PCRE2_EXTRA_PYTHON_OCTAL Use Python rules for octal PCRE2_EXTRA_TURKISH_CASING Use Turkish I case folding .sp There is a complete description of the PCRE2 native API in the .\" HREF \fBpcre2api\fP .\" page and a description of the POSIX API in the .\" HREF \fBpcre2posix\fP .\" page. 
================================================ FILE: doc/pcre2_set_compile_recursion_guard.3 ================================================ .TH PCRE2_SET_COMPILE_RECURSION_GUARD 3 "26 November 2014" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include <pcre2.h> .PP .nf .B int pcre2_set_compile_recursion_guard(pcre2_compile_context *\fIccontext\fP, .B " int (*\fIguard_function\fP)(uint32_t, void *), void *\fIuser_data\fP);" .fi . .SH DESCRIPTION .rs .sp This function defines, within a compile context, a function that is called whenever \fBpcre2_compile()\fP starts to compile a parenthesized part of a pattern. The first argument to the function gives the current depth of parenthesis nesting, and the second is user data that is supplied when the function is set up. The guard function should return zero if all is well, or non-zero to force an error. This feature is provided so that applications can check the available system stack space, in order to avoid running out. The result of \fBpcre2_set_compile_recursion_guard()\fP is always zero. .P There is a complete description of the PCRE2 native API in the .\" HREF \fBpcre2api\fP .\" page and a description of the POSIX API in the .\" HREF \fBpcre2posix\fP .\" page. ================================================ FILE: doc/pcre2_set_depth_limit.3 ================================================ .TH PCRE2_SET_DEPTH_LIMIT 3 "25 March 2017" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include <pcre2.h> .PP .nf .B int pcre2_set_depth_limit(pcre2_match_context *\fImcontext\fP, .B " uint32_t \fIvalue\fP);" .fi . .SH DESCRIPTION .rs .sp This function sets the backtracking depth limit field in a match context. The result is always zero. .P There is a complete description of the PCRE2 native API in the .\" HREF \fBpcre2api\fP .\" page and a description of the POSIX API in the .\" HREF \fBpcre2posix\fP .\" page.
================================================ FILE: doc/pcre2_set_glob_escape.3 ================================================ .TH PCRE2_SET_GLOB_ESCAPE 3 "12 July 2017" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include .PP .nf .B int pcre2_set_glob_escape(pcre2_convert_context *\fIcvcontext\fP, .B " uint32_t \fIescape_char\fP);" .fi . .SH DESCRIPTION .rs .sp This function is part of an experimental set of pattern conversion functions. It sets the escape character that is used when converting globs. The second argument must either be zero (meaning there is no escape character) or a punctuation character whose code point is less than 256. The default is grave accent if running under Windows, otherwise backslash. The result of the function is zero for success or PCRE2_ERROR_BADDATA if the second argument is invalid. .P The pattern conversion functions are described in the .\" HREF \fBpcre2convert\fP .\" documentation. ================================================ FILE: doc/pcre2_set_glob_separator.3 ================================================ .TH PCRE2_SET_GLOB_SEPARATOR 3 "17 June 2018" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include .PP .nf .B int pcre2_set_glob_separator(pcre2_convert_context *\fIcvcontext\fP, .B " uint32_t \fIseparator_char\fP);" .fi . .SH DESCRIPTION .rs .sp This function is part of an experimental set of pattern conversion functions. It sets the component separator character that is used when converting globs. The second argument must be one of the characters forward slash, backslash, or dot. The default is backslash when running under Windows, otherwise forward slash. The result of the function is zero for success or PCRE2_ERROR_BADDATA if the second argument is invalid. .P The pattern conversion functions are described in the .\" HREF \fBpcre2convert\fP .\" documentation. 
================================================ FILE: doc/pcre2_set_heap_limit.3 ================================================ .TH PCRE2_SET_HEAP_LIMIT 3 "17 June 2018" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include .PP .nf .B int pcre2_set_heap_limit(pcre2_match_context *\fImcontext\fP, .B " uint32_t \fIvalue\fP);" .fi . .SH DESCRIPTION .rs .sp This function sets the backtracking heap limit field in a match context. The result is always zero. .P There is a complete description of the PCRE2 native API in the .\" HREF \fBpcre2api\fP .\" page and a description of the POSIX API in the .\" HREF \fBpcre2posix\fP .\" page. ================================================ FILE: doc/pcre2_set_match_limit.3 ================================================ .TH PCRE2_SET_MATCH_LIMIT 3 "25 October 2014" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include .PP .nf .B int pcre2_set_match_limit(pcre2_match_context *\fImcontext\fP, .B " uint32_t \fIvalue\fP);" .fi . .SH DESCRIPTION .rs .sp This function sets the match limit field in a match context. The result is always zero. .P There is a complete description of the PCRE2 native API in the .\" HREF \fBpcre2api\fP .\" page and a description of the POSIX API in the .\" HREF \fBpcre2posix\fP .\" page. ================================================ FILE: doc/pcre2_set_max_pattern_compiled_length.3 ================================================ .TH PCRE2_SET_MAX_PATTERN_COMPILED_LENGTH 3 "09 June 2024" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include .PP .nf .B int pcre2_set_max_pattern_compiled_length( .B " pcre2_compile_context *\fIccontext\fP, PCRE2_SIZE \fIvalue\fP);" .fi . 
.SH DESCRIPTION .rs .sp This function sets, in a compile context, the maximum size (in bytes) for the memory needed to hold the compiled version of a pattern that is using this context. The result is always zero. If a pattern that is passed to \fBpcre2_compile()\fP referencing this context needs more memory, an error is generated. The default is the largest number that a PCRE2_SIZE variable can hold, which is effectively unlimited. .P There is a complete description of the PCRE2 native API in the .\" HREF \fBpcre2api\fP .\" page and a description of the POSIX API in the .\" HREF \fBpcre2posix\fP .\" page. ================================================ FILE: doc/pcre2_set_max_pattern_length.3 ================================================ .TH PCRE2_SET_MAX_PATTERN_LENGTH 3 "05 October 2016" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include <pcre2.h> .PP .nf .B int pcre2_set_max_pattern_length(pcre2_compile_context *\fIccontext\fP, .B " PCRE2_SIZE \fIvalue\fP);" .fi . .SH DESCRIPTION .rs .sp This function sets, in a compile context, the maximum text length (in code units) of the pattern that can be compiled. The result is always zero. If a longer pattern is passed to \fBpcre2_compile()\fP there is an immediate error return. The default is effectively unlimited, being the largest value a PCRE2_SIZE variable can hold. .P There is a complete description of the PCRE2 native API in the .\" HREF \fBpcre2api\fP .\" page and a description of the POSIX API in the .\" HREF \fBpcre2posix\fP .\" page. ================================================ FILE: doc/pcre2_set_max_varlookbehind.3 ================================================ .TH PCRE2_SET_MAX_VARLOOKBEHIND 3 "11 August 2023" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include <pcre2.h> .PP .nf .B int pcre2_set_max_varlookbehind(pcre2_compile_context *\fIccontext\fP, .B " uint32_t \fIvalue\fP);" .fi .
.SH DESCRIPTION .rs .sp This function sets, in a compile context, the maximum number of characters that can be matched by a variable-length lookbehind assertion. The default is set when PCRE2 is built, with the ultimate default being 255, the same as Perl. Lookbehind assertions without a bounding length are not supported. The result is always zero. .P There is a complete description of the PCRE2 native API in the .\" HREF \fBpcre2api\fP .\" page and a description of the POSIX API in the .\" HREF \fBpcre2posix\fP .\" page. ================================================ FILE: doc/pcre2_set_newline.3 ================================================ .TH PCRE2_SET_NEWLINE 3 "19 July 2017" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include <pcre2.h> .PP .nf .B int pcre2_set_newline(pcre2_compile_context *\fIccontext\fP, .B " uint32_t \fIvalue\fP);" .fi . .SH DESCRIPTION .rs .sp This function sets the newline convention within a compile context. This specifies which character(s) are recognized as newlines when compiling and matching patterns. The second argument must be one of: .sp PCRE2_NEWLINE_CR Carriage return only PCRE2_NEWLINE_LF Linefeed only PCRE2_NEWLINE_CRLF CR followed by LF only PCRE2_NEWLINE_ANYCRLF Any of the above PCRE2_NEWLINE_ANY Any Unicode newline sequence PCRE2_NEWLINE_NUL The NUL character (binary zero) .sp The result is zero for success or PCRE2_ERROR_BADDATA if the second argument is invalid. .P There is a complete description of the PCRE2 native API in the .\" HREF \fBpcre2api\fP .\" page and a description of the POSIX API in the .\" HREF \fBpcre2posix\fP .\" page.
================================================ FILE: doc/pcre2_set_offset_limit.3 ================================================ .TH PCRE2_SET_OFFSET_LIMIT 3 "22 September 2015" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include .PP .nf .B int pcre2_set_offset_limit(pcre2_match_context *\fImcontext\fP, .B " PCRE2_SIZE \fIvalue\fP);" .fi . .SH DESCRIPTION .rs .sp This function sets the offset limit field in a match context. The result is always zero. .P There is a complete description of the PCRE2 native API in the .\" HREF \fBpcre2api\fP .\" page and a description of the POSIX API in the .\" HREF \fBpcre2posix\fP .\" page. ================================================ FILE: doc/pcre2_set_optimize.3 ================================================ .TH PCRE2_SET_OPTIMIZE 3 "22 September 2024" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include .PP .nf .B int pcre2_set_optimize(pcre2_compile_context *\fIccontext\fP, .B " uint32_t \fIdirective\fP);" .fi . .SH DESCRIPTION .rs .sp This function controls which performance optimizations will be applied by \fBpcre2_compile()\fP. It can be called multiple times with the same compile context; the effects are cumulative, with the effects of later calls taking precedence over earlier ones. .P The result is zero for success, PCRE2_ERROR_NULL if \fIccontext\fP is NULL, or PCRE2_ERROR_BADOPTION if \fIdirective\fP is unknown. The latter could be useful to detect if a certain optimization is available. 
.P The possible values for the \fIdirective\fP parameter are: .sp PCRE2_OPTIMIZATION_FULL Enable all optimizations (default) PCRE2_OPTIMIZATION_NONE Disable all optimizations PCRE2_AUTO_POSSESS Enable auto-possessification PCRE2_AUTO_POSSESS_OFF Disable auto-possessification PCRE2_DOTSTAR_ANCHOR Enable implicit dotstar anchoring PCRE2_DOTSTAR_ANCHOR_OFF Disable implicit dotstar anchoring PCRE2_START_OPTIMIZE Enable start-up optimizations at match time PCRE2_START_OPTIMIZE_OFF Disable start-up optimizations at match time .sp There is a complete description of the PCRE2 native API, including detailed descriptions of the \fIdirective\fP parameter values, in the .\" HREF \fBpcre2api\fP .\" page. ================================================ FILE: doc/pcre2_set_parens_nest_limit.3 ================================================ .TH PCRE2_SET_PARENS_NEST_LIMIT 3 "25 October 2014" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include <pcre2.h> .PP .nf .B int pcre2_set_parens_nest_limit(pcre2_compile_context *\fIccontext\fP, .B " uint32_t \fIvalue\fP);" .fi . .SH DESCRIPTION .rs .sp This function sets, in a compile context, the maximum depth of nested parentheses in a pattern. The result is always zero. .P There is a complete description of the PCRE2 native API in the .\" HREF \fBpcre2api\fP .\" page and a description of the POSIX API in the .\" HREF \fBpcre2posix\fP .\" page. ================================================ FILE: doc/pcre2_set_recursion_limit.3 ================================================ .TH PCRE2_SET_RECURSION_LIMIT 3 "19 July 2017" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include <pcre2.h> .PP .nf .B int pcre2_set_recursion_limit(pcre2_match_context *\fImcontext\fP, .B " uint32_t \fIvalue\fP);" .fi . .SH DESCRIPTION .rs .sp This function is obsolete and should not be used in new code. Use \fBpcre2_set_depth_limit()\fP instead.
.P There is a complete description of the PCRE2 native API in the .\" HREF \fBpcre2api\fP .\" page and a description of the POSIX API in the .\" HREF \fBpcre2posix\fP .\" page. ================================================ FILE: doc/pcre2_set_recursion_memory_management.3 ================================================ .TH PCRE2_SET_RECURSION_MEMORY_MANAGEMENT 3 "23 January 2023" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include .PP .nf .B int pcre2_set_recursion_memory_management( .B " pcre2_match_context *\fImcontext\fP," .B " void *(*\fIprivate_malloc\fP)(size_t, void *)," .B " void (*\fIprivate_free\fP)(void *, void *), void *\fImemory_data\fP);" .fi . .SH DESCRIPTION .rs .sp From release 10.30 onwards, this function is obsolete and does nothing. The result is always zero. .P There is a complete description of the PCRE2 native API in the .\" HREF \fBpcre2api\fP .\" page and a description of the POSIX API in the .\" HREF \fBpcre2posix\fP .\" page. ================================================ FILE: doc/pcre2_set_substitute_callout.3 ================================================ .TH PCRE2_SET_SUBSTITUTE_CALLOUT 3 "04 October 2024" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include .PP .nf .B int pcre2_set_substitute_callout(pcre2_match_context *\fImcontext\fP, .B " int (*\fIcallout_function\fP)(pcre2_substitute_callout_block *, void *)," .B " void *\fIcallout_data\fP);" .fi . .SH DESCRIPTION .rs .sp This function sets the substitute callout fields in a match context (the first argument). The second argument specifies a callout function, and the third argument is an opaque data item that is passed to it. The result of this function is always zero. .P There is a complete description of the PCRE2 native API in the .\" HREF \fBpcre2api\fP .\" page and a description of the POSIX API in the .\" HREF \fBpcre2posix\fP .\" page. 
================================================ FILE: doc/pcre2_set_substitute_case_callout.3 ================================================ .TH PCRE2_SET_SUBSTITUTE_CASE_CALLOUT 3 "26 December 2024" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include .PP .nf .B int pcre2_set_substitute_case_callout(pcre2_match_context *\fImcontext\fP, .B " PCRE2_SIZE (*\fIcallout_function\fP)(PCRE2_SPTR, PCRE2_SIZE," .B " PCRE2_UCHAR *, PCRE2_SIZE," .B " int, void *)," .B " void *\fIcallout_data\fP);" .fi . .SH DESCRIPTION .rs .sp This function sets the substitute case callout fields in a match context (the first argument). The second argument specifies a callout function, and the third argument is an opaque data item that is passed to it. The result of this function is always zero. .P There is a complete description of the PCRE2 native API in the .\" HREF \fBpcre2api\fP .\" page and a description of the POSIX API in the .\" HREF \fBpcre2posix\fP .\" page. ================================================ FILE: doc/pcre2_substitute.3 ================================================ .TH PCRE2_SUBSTITUTE 3 "03 October 2025" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include .PP .nf .B int pcre2_substitute(const pcre2_code *\fIcode\fP, PCRE2_SPTR \fIsubject\fP, .B " PCRE2_SIZE \fIlength\fP, PCRE2_SIZE \fIstartoffset\fP," .B " uint32_t \fIoptions\fP, pcre2_match_data *\fImatch_data\fP," .B " pcre2_match_context *\fImcontext\fP, PCRE2_SPTR \fIreplacement\fP," .B " PCRE2_SIZE \fIrlength\fP, PCRE2_UCHAR *\fIoutputbuffer\fP," .B " PCRE2_SIZE *\fIoutlengthptr\fP);" .fi . .SH DESCRIPTION .rs .sp This function matches a compiled regular expression against a given subject string, using a matching algorithm that is similar to Perl's. It then makes a copy of the subject, substituting a replacement string for what was matched. 
Its arguments are: .sp \fIcode\fP Points to the compiled pattern \fIsubject\fP Points to the subject string \fIlength\fP Length of the subject string \fIstartoffset\fP Offset in the subject at which to start matching \fIoptions\fP Option bits \fImatch_data\fP Points to a match data block, or is NULL \fImcontext\fP Points to a match context, or is NULL \fIreplacement\fP Points to the replacement string \fIrlength\fP Length of the replacement string \fIoutputbuffer\fP Points to the output buffer \fIoutlengthptr\fP Points to the length of the output buffer .sp A match data block is needed only if you want to inspect the data from the final match that is returned in that block or if PCRE2_SUBSTITUTE_MATCHED is set. A match context is needed only if you want to: .sp Set up a callout function Set a matching offset limit Change the backtracking match limit Change the backtracking depth limit Set custom memory management in the match context .sp The \fIlength\fP, \fIstartoffset\fP and \fIrlength\fP values are code units, not characters, as is the contents of the variable pointed at by \fIoutlengthptr\fP. This variable must contain the length of the output buffer when the function is called. If the function is successful, the value is changed to the length of the new string, excluding the trailing zero that is automatically added. .P The subject and replacement lengths can be given as PCRE2_ZERO_TERMINATED for zero-terminated strings. 
The options are: .sp PCRE2_ANCHORED Match only at the first position PCRE2_ENDANCHORED Match only at end of subject .\" JOIN PCRE2_NOTBOL Subject is not the beginning of a line PCRE2_NOTEOL Subject is not the end of a line .\" JOIN PCRE2_NOTEMPTY An empty string is not a valid match .\" JOIN PCRE2_NOTEMPTY_ATSTART An empty string at the start of the subject is not a valid match PCRE2_NO_JIT Do not use JIT matching .\" JOIN PCRE2_NO_UTF_CHECK Do not check for UTF validity in the subject or replacement .\" JOIN (only relevant if PCRE2_UTF was set at compile time) PCRE2_SUBSTITUTE_EXTENDED Do extended replacement processing .\" JOIN PCRE2_SUBSTITUTE_GLOBAL Replace all occurrences in the subject PCRE2_SUBSTITUTE_LITERAL The replacement string is literal .\" JOIN PCRE2_SUBSTITUTE_MATCHED Use pre-existing match data for first match PCRE2_SUBSTITUTE_OVERFLOW_LENGTH If overflow, compute needed length PCRE2_SUBSTITUTE_REPLACEMENT_ONLY Return only replacement string(s) PCRE2_SUBSTITUTE_UNKNOWN_UNSET Treat unknown group as unset PCRE2_SUBSTITUTE_UNSET_EMPTY Simple unset insert = empty string .sp If PCRE2_SUBSTITUTE_LITERAL is set, PCRE2_SUBSTITUTE_EXTENDED, PCRE2_SUBSTITUTE_UNKNOWN_UNSET, and PCRE2_SUBSTITUTE_UNSET_EMPTY are ignored. .P If PCRE2_SUBSTITUTE_MATCHED is set, \fImatch_data\fP must be non-NULL; its contents must be the result of a call to \fBpcre2_match()\fP (or \fBpcre2_jit_match()\fP) using the same pattern, subject pointer, effective subject length, start offset, and match options. .P The function returns the number of substitutions, which may be zero if there are no matches. The result may be greater than one only when PCRE2_SUBSTITUTE_GLOBAL is set. In the event of an error, a negative error code is returned. .P There is a complete description of the PCRE2 native API in the .\" HREF \fBpcre2api\fP .\" page and a description of the POSIX API in the .\" HREF \fBpcre2posix\fP .\" page. 
================================================ FILE: doc/pcre2_substring_copy_byname.3 ================================================ .TH PCRE2_SUBSTRING_COPY_BYNAME 3 "19 December 2014" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include <pcre2.h> .PP .nf .B int pcre2_substring_copy_byname(pcre2_match_data *\fImatch_data\fP, .B " PCRE2_SPTR \fIname\fP, PCRE2_UCHAR *\fIbuffer\fP, PCRE2_SIZE *\fIbufflen\fP);" .fi . .SH DESCRIPTION .rs .sp This is a convenience function for extracting a captured substring, identified by name, into a given buffer. The arguments are: .sp \fImatch_data\fP The match data block for the match \fIname\fP Name of the required substring \fIbuffer\fP Buffer to receive the string \fIbufflen\fP Length of buffer (code units) .sp The \fIbufflen\fP variable is updated to contain the length of the extracted string, excluding the trailing zero. The yield of the function is zero for success or one of the following error numbers: .sp PCRE2_ERROR_NOSUBSTRING there are no groups of that name PCRE2_ERROR_UNAVAILABLE the ovector was too small for that group PCRE2_ERROR_UNSET the group did not participate in the match PCRE2_ERROR_NOMEMORY the buffer is not big enough .sp If there is more than one group with the given name, the first one that is set is returned. In this situation PCRE2_ERROR_UNSET means that no group with the given name was set. .P There is a complete description of the PCRE2 native API in the .\" HREF \fBpcre2api\fP .\" page and a description of the POSIX API in the .\" HREF \fBpcre2posix\fP .\" page.
================================================ FILE: doc/pcre2_substring_copy_bynumber.3 ================================================ .TH PCRE2_SUBSTRING_COPY_BYNUMBER 3 "13 December 2014" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include <pcre2.h> .PP .nf .B int pcre2_substring_copy_bynumber(pcre2_match_data *\fImatch_data\fP, .B " uint32_t \fInumber\fP, PCRE2_UCHAR *\fIbuffer\fP," .B " PCRE2_SIZE *\fIbufflen\fP);" .fi . .SH DESCRIPTION .rs .sp This is a convenience function for extracting a captured substring into a given buffer. The arguments are: .sp \fImatch_data\fP The match data block for the match \fInumber\fP Number of the required substring \fIbuffer\fP Buffer to receive the string \fIbufflen\fP Length of buffer .sp The \fIbufflen\fP variable is updated with the length of the extracted string, excluding the terminating zero. The yield of the function is zero for success or one of the following error numbers: .sp PCRE2_ERROR_NOSUBSTRING there are no groups of that number PCRE2_ERROR_UNAVAILABLE the ovector was too small for that group PCRE2_ERROR_UNSET the group did not participate in the match PCRE2_ERROR_NOMEMORY the buffer is too small .sp .P There is a complete description of the PCRE2 native API in the .\" HREF \fBpcre2api\fP .\" page and a description of the POSIX API in the .\" HREF \fBpcre2posix\fP .\" page. ================================================ FILE: doc/pcre2_substring_free.3 ================================================ .TH PCRE2_SUBSTRING_FREE 3 "26 February 2025" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include <pcre2.h> .PP .nf .B void pcre2_substring_free(PCRE2_UCHAR *\fIbuffer\fP); .fi . .SH DESCRIPTION .rs .sp This is a convenience function for freeing the memory obtained by a previous call to \fBpcre2_substring_get_byname()\fP or \fBpcre2_substring_get_bynumber()\fP.
Its only argument is a pointer to the string. If the argument is NULL, the function does nothing. .P There is a complete description of the PCRE2 native API in the .\" HREF \fBpcre2api\fP .\" page and a description of the POSIX API in the .\" HREF \fBpcre2posix\fP .\" page. ================================================ FILE: doc/pcre2_substring_get_byname.3 ================================================ .TH PCRE2_SUBSTRING_GET_BYNAME 3 "19 December 2014" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include <pcre2.h> .PP .nf .B int pcre2_substring_get_byname(pcre2_match_data *\fImatch_data\fP, .B " PCRE2_SPTR \fIname\fP, PCRE2_UCHAR **\fIbufferptr\fP, PCRE2_SIZE *\fIbufflen\fP);" .fi . .SH DESCRIPTION .rs .sp This is a convenience function for extracting a captured substring by name into newly acquired memory. The arguments are: .sp \fImatch_data\fP The match data for the match \fIname\fP Name of the required substring \fIbufferptr\fP Where to put the string pointer \fIbufflen\fP Where to put the string length .sp The memory in which the substring is placed is obtained by calling the same memory allocation function that was used for the match data block. The convenience function \fBpcre2_substring_free()\fP can be used to free it when it is no longer needed. The yield of the function is zero for success or one of the following error numbers: .sp PCRE2_ERROR_NOSUBSTRING there are no groups of that name PCRE2_ERROR_UNAVAILABLE the ovector was too small for that group PCRE2_ERROR_UNSET the group did not participate in the match PCRE2_ERROR_NOMEMORY memory could not be obtained .sp If there is more than one group with the given name, the first one that is set is returned. In this situation PCRE2_ERROR_UNSET means that no group with the given name was set.
.P There is a complete description of the PCRE2 native API in the .\" HREF \fBpcre2api\fP .\" page and a description of the POSIX API in the .\" HREF \fBpcre2posix\fP .\" page. ================================================ FILE: doc/pcre2_substring_get_bynumber.3 ================================================ .TH PCRE2_SUBSTRING_GET_BYNUMBER 3 "13 December 2014" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include <pcre2.h> .PP .nf .B int pcre2_substring_get_bynumber(pcre2_match_data *\fImatch_data\fP, .B " uint32_t \fInumber\fP, PCRE2_UCHAR **\fIbufferptr\fP, PCRE2_SIZE *\fIbufflen\fP);" .fi . .SH DESCRIPTION .rs .sp This is a convenience function for extracting a captured substring by number into newly acquired memory. The arguments are: .sp \fImatch_data\fP The match data for the match \fInumber\fP Number of the required substring \fIbufferptr\fP Where to put the string pointer \fIbufflen\fP Where to put the string length .sp The memory in which the substring is placed is obtained by calling the same memory allocation function that was used for the match data block. The convenience function \fBpcre2_substring_free()\fP can be used to free it when it is no longer needed. The yield of the function is zero for success or one of the following error numbers: .sp PCRE2_ERROR_NOSUBSTRING there are no groups of that number PCRE2_ERROR_UNAVAILABLE the ovector was too small for that group PCRE2_ERROR_UNSET the group did not participate in the match PCRE2_ERROR_NOMEMORY memory could not be obtained .sp .P There is a complete description of the PCRE2 native API in the .\" HREF \fBpcre2api\fP .\" page and a description of the POSIX API in the .\" HREF \fBpcre2posix\fP .\" page.
================================================ FILE: doc/pcre2_substring_length_byname.3 ================================================ .TH PCRE2_SUBSTRING_LENGTH_BYNAME 3 "26 September 2025" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include .PP .nf .B int pcre2_substring_length_byname(pcre2_match_data *\fImatch_data\fP, .B " PCRE2_SPTR \fIname\fP, PCRE2_SIZE *\fIlength\fP);" .fi . .SH DESCRIPTION .rs .sp This function returns the length of a matched substring, identified by name. The arguments are: .sp \fImatch_data\fP The match data block for the match \fIname\fP The substring name \fIlength\fP Where to return the length, or NULL .sp The third argument may be NULL if all you want to know is whether or not a substring is set. The yield is zero on success, or a negative error code otherwise. .P There is a complete description of the PCRE2 native API in the .\" HREF \fBpcre2api\fP .\" page and a description of the POSIX API in the .\" HREF \fBpcre2posix\fP .\" page. ================================================ FILE: doc/pcre2_substring_length_bynumber.3 ================================================ .TH PCRE2_SUBSTRING_LENGTH_BYNUMBER 3 "22 December 2014" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include .PP .nf .B int pcre2_substring_length_bynumber(pcre2_match_data *\fImatch_data\fP, .B " uint32_t \fInumber\fP, PCRE2_SIZE *\fIlength\fP);" .fi . .SH DESCRIPTION .rs .sp This function returns the length of a matched substring, identified by number. The arguments are: .sp \fImatch_data\fP The match data block for the match \fInumber\fP The substring number \fIlength\fP Where to return the length, or NULL .sp The third argument may be NULL if all you want to know is whether or not a substring is set. The yield is zero on success, or a negative error code otherwise. After a partial match, only substring 0 is available. 
.P There is a complete description of the PCRE2 native API in the .\" HREF \fBpcre2api\fP .\" page and a description of the POSIX API in the .\" HREF \fBpcre2posix\fP .\" page. ================================================ FILE: doc/pcre2_substring_list_free.3 ================================================ .TH PCRE2_SUBSTRING_LIST_FREE 3 "26 February 2025" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include <pcre2.h> .PP .nf .B void pcre2_substring_list_free(PCRE2_UCHAR **\fIlist\fP); .fi . .SH DESCRIPTION .rs .sp This is a convenience function for freeing the store obtained by a previous call to \fBpcre2_substring_list_get()\fP. Its only argument is a pointer to the list of string pointers. If the argument is NULL, the function returns immediately, without doing anything. .P There is a complete description of the PCRE2 native API in the .\" HREF \fBpcre2api\fP .\" page and a description of the POSIX API in the .\" HREF \fBpcre2posix\fP .\" page. ================================================ FILE: doc/pcre2_substring_list_get.3 ================================================ .TH PCRE2_SUBSTRING_LIST_GET 3 "21 October 2014" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include <pcre2.h> .PP .nf .B int pcre2_substring_list_get(pcre2_match_data *\fImatch_data\fP, .B " PCRE2_UCHAR ***\fIlistptr\fP, PCRE2_SIZE **\fIlengthsptr\fP);" .fi . .SH DESCRIPTION .rs .sp This is a convenience function for extracting all the captured substrings after a pattern match. It builds a list of pointers to the strings, and (optionally) a second list that contains their lengths (in code units), excluding a terminating zero that is added to each of them. All this is done in a single block of memory that is obtained using the same memory allocation function that was used to get the match data block.
The convenience function \fBpcre2_substring_list_free()\fP can be used to free it when it is no longer needed. The arguments are: .sp \fImatch_data\fP The match data block \fIlistptr\fP Where to put a pointer to the list \fIlengthsptr\fP Where to put a pointer to the lengths, or NULL .sp A pointer to a list of pointers is put in the variable whose address is in \fIlistptr\fP. The list is terminated by a NULL pointer. If \fIlengthsptr\fP is not NULL, a matching list of lengths is created, and its address is placed in \fIlengthsptr\fP. The yield of the function is zero on success or PCRE2_ERROR_NOMEMORY if sufficient memory could not be obtained. .P There is a complete description of the PCRE2 native API in the .\" HREF \fBpcre2api\fP .\" page and a description of the POSIX API in the .\" HREF \fBpcre2posix\fP .\" page. ================================================ FILE: doc/pcre2_substring_nametable_scan.3 ================================================ .TH PCRE2_SUBSTRING_NAMETABLE_SCAN 3 "06 February 2019" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include .PP .nf .B int pcre2_substring_nametable_scan(const pcre2_code *\fIcode\fP, .B " PCRE2_SPTR \fIname\fP, PCRE2_SPTR *\fIfirst\fP, PCRE2_SPTR *\fIlast\fP);" .fi . .SH DESCRIPTION .rs .sp This convenience function finds, for a compiled pattern, the first and last entries for a given name in the table that translates capture group names into numbers. .sp \fIcode\fP Compiled regular expression \fIname\fP Name whose entries required \fIfirst\fP Where to return a pointer to the first entry \fIlast\fP Where to return a pointer to the last entry .sp When the name is found in the table, if \fIfirst\fP is NULL, the function returns a group number, but if there is more than one matching entry, it is not defined which one. Otherwise, when both pointers have been set, the yield of the function is the length of each entry in code units. 
If the name is not found, PCRE2_ERROR_NOSUBSTRING is returned. .P There is a complete description of the PCRE2 native API, including the format of the table entries, in the .\" HREF \fBpcre2api\fP .\" page, and a description of the POSIX API in the .\" HREF \fBpcre2posix\fP .\" page. ================================================ FILE: doc/pcre2_substring_number_from_name.3 ================================================ .TH PCRE2_SUBSTRING_NUMBER_FROM_NAME 3 "03 November 2014" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include .PP .nf .B int pcre2_substring_number_from_name(const pcre2_code *\fIcode\fP, .B " PCRE2_SPTR \fIname\fP);" .fi . .SH DESCRIPTION .rs .sp This convenience function finds the number of a named substring capturing parenthesis in a compiled pattern, provided that it is a unique name. The function arguments are: .sp \fIcode\fP Compiled regular expression \fIname\fP Name whose number is required .sp The yield of the function is the number of the parenthesis if the name is found, or PCRE2_ERROR_NOSUBSTRING if it is not found. When duplicate names are allowed (PCRE2_DUPNAMES is set), if the name is not unique, PCRE2_ERROR_NOUNIQUESUBSTRING is returned. You can obtain the list of numbers with the same name by calling \fBpcre2_substring_nametable_scan()\fP. .P There is a complete description of the PCRE2 native API in the .\" HREF \fBpcre2api\fP .\" page and a description of the POSIX API in the .\" HREF \fBpcre2posix\fP .\" page. ================================================ FILE: doc/pcre2api.3 ================================================ .TH PCRE2API 3 "29 October 2025" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .sp .B #include .sp PCRE2 is a new API for PCRE, starting at release 10.0. This document contains a description of all its native functions. 
See the .\" HREF \fBpcre2\fP .\" document for an overview of all the PCRE2 documentation. . . .SH "PCRE2 NATIVE API BASIC FUNCTIONS" .rs .sp .nf .B pcre2_code *pcre2_compile(PCRE2_SPTR \fIpattern\fP, PCRE2_SIZE \fIlength\fP, .B " uint32_t \fIoptions\fP, int *\fIerrorcode\fP, PCRE2_SIZE *\fIerroroffset,\fP" .B " pcre2_compile_context *\fIccontext\fP);" .sp .B void pcre2_code_free(pcre2_code *\fIcode\fP); .sp .B pcre2_match_data *pcre2_match_data_create(uint32_t \fIovecsize\fP, .B " pcre2_general_context *\fIgcontext\fP);" .sp .B pcre2_match_data *pcre2_match_data_create_from_pattern( .B " const pcre2_code *\fIcode\fP, pcre2_general_context *\fIgcontext\fP);" .sp .B int pcre2_match(const pcre2_code *\fIcode\fP, PCRE2_SPTR \fIsubject\fP, .B " PCRE2_SIZE \fIlength\fP, PCRE2_SIZE \fIstartoffset\fP," .B " uint32_t \fIoptions\fP, pcre2_match_data *\fImatch_data\fP," .B " pcre2_match_context *\fImcontext\fP);" .sp .B int pcre2_dfa_match(const pcre2_code *\fIcode\fP, PCRE2_SPTR \fIsubject\fP, .B " PCRE2_SIZE \fIlength\fP, PCRE2_SIZE \fIstartoffset\fP," .B " uint32_t \fIoptions\fP, pcre2_match_data *\fImatch_data\fP," .B " pcre2_match_context *\fImcontext\fP," .B " int *\fIworkspace\fP, PCRE2_SIZE \fIwscount\fP);" .sp .B void pcre2_match_data_free(pcre2_match_data *\fImatch_data\fP); .fi . . .SH "PCRE2 NATIVE API AUXILIARY MATCH FUNCTIONS" .rs .sp .nf .B PCRE2_SPTR pcre2_get_mark(pcre2_match_data *\fImatch_data\fP); .sp .B PCRE2_SIZE pcre2_get_match_data_size(pcre2_match_data *\fImatch_data\fP); .sp .B PCRE2_SIZE pcre2_get_match_data_heapframes_size( .B " pcre2_match_data *\fImatch_data\fP);" .sp .B uint32_t pcre2_get_ovector_count(pcre2_match_data *\fImatch_data\fP); .sp .B PCRE2_SIZE *pcre2_get_ovector_pointer(pcre2_match_data *\fImatch_data\fP); .sp .B PCRE2_SIZE pcre2_get_startchar(pcre2_match_data *\fImatch_data\fP); .fi . . 
.SH "PCRE2 NATIVE API GENERAL CONTEXT FUNCTIONS" .rs .sp .nf .B pcre2_general_context *pcre2_general_context_create( .B " void *(*\fIprivate_malloc\fP)(PCRE2_SIZE, void *)," .B " void (*\fIprivate_free\fP)(void *, void *), void *\fImemory_data\fP);" .sp .B pcre2_general_context *pcre2_general_context_copy( .B " pcre2_general_context *\fIgcontext\fP);" .sp .B void pcre2_general_context_free(pcre2_general_context *\fIgcontext\fP); .fi . . .SH "PCRE2 NATIVE API COMPILE CONTEXT FUNCTIONS" .rs .sp .nf .B pcre2_compile_context *pcre2_compile_context_create( .B " pcre2_general_context *\fIgcontext\fP);" .sp .B pcre2_compile_context *pcre2_compile_context_copy( .B " pcre2_compile_context *\fIccontext\fP);" .sp .B void pcre2_compile_context_free(pcre2_compile_context *\fIccontext\fP); .sp .B int pcre2_set_bsr(pcre2_compile_context *\fIccontext\fP, .B " uint32_t \fIvalue\fP);" .sp .B int pcre2_set_character_tables(pcre2_compile_context *\fIccontext\fP, .B " const uint8_t *\fItables\fP);" .sp .B int pcre2_set_compile_extra_options(pcre2_compile_context *\fIccontext\fP, .B " uint32_t \fIextra_options\fP);" .sp .B int pcre2_set_max_pattern_length(pcre2_compile_context *\fIccontext\fP, .B " PCRE2_SIZE \fIvalue\fP);" .sp .B int pcre2_set_max_pattern_compiled_length( .B " pcre2_compile_context *\fIccontext\fP, PCRE2_SIZE \fIvalue\fP);" .sp .B int pcre2_set_max_varlookbehind(pcre2_compile_context *\fIccontext\fP, .B " uint32_t \fIvalue\fP);" .sp .B int pcre2_set_newline(pcre2_compile_context *\fIccontext\fP, .B " uint32_t \fIvalue\fP);" .sp .B int pcre2_set_parens_nest_limit(pcre2_compile_context *\fIccontext\fP, .B " uint32_t \fIvalue\fP);" .sp .B int pcre2_set_compile_recursion_guard(pcre2_compile_context *\fIccontext\fP, .B " int (*\fIguard_function\fP)(uint32_t, void *), void *\fIuser_data\fP);" .sp .B int pcre2_set_optimize(pcre2_compile_context *\fIccontext\fP, .B " uint32_t \fIdirective\fP);" .fi . .
.SH "PCRE2 NATIVE API MATCH CONTEXT FUNCTIONS" .rs .sp .nf .B pcre2_match_context *pcre2_match_context_create( .B " pcre2_general_context *\fIgcontext\fP);" .sp .B pcre2_match_context *pcre2_match_context_copy( .B " pcre2_match_context *\fImcontext\fP);" .sp .B void pcre2_match_context_free(pcre2_match_context *\fImcontext\fP); .sp .B int pcre2_set_callout(pcre2_match_context *\fImcontext\fP, .B " int (*\fIcallout_function\fP)(pcre2_callout_block *, void *)," .B " void *\fIcallout_data\fP);" .sp .B int pcre2_set_substitute_callout(pcre2_match_context *\fImcontext\fP, .B " int (*\fIcallout_function\fP)(pcre2_substitute_callout_block *, void *)," .B " void *\fIcallout_data\fP);" .sp .B int pcre2_set_substitute_case_callout(pcre2_match_context *\fImcontext\fP, .B " PCRE2_SIZE (*\fIcallout_function\fP)(PCRE2_SPTR, PCRE2_SIZE," .B " PCRE2_UCHAR *, PCRE2_SIZE," .B " int, void *)," .B " void *\fIcallout_data\fP);" .sp .B int pcre2_set_offset_limit(pcre2_match_context *\fImcontext\fP, .B " PCRE2_SIZE \fIvalue\fP);" .sp .B int pcre2_set_heap_limit(pcre2_match_context *\fImcontext\fP, .B " uint32_t \fIvalue\fP);" .sp .B int pcre2_set_match_limit(pcre2_match_context *\fImcontext\fP, .B " uint32_t \fIvalue\fP);" .sp .B int pcre2_set_depth_limit(pcre2_match_context *\fImcontext\fP, .B " uint32_t \fIvalue\fP);" .fi . . 
.SH "PCRE2 NATIVE API STRING EXTRACTION FUNCTIONS" .rs .sp .nf .B int pcre2_substring_copy_byname(pcre2_match_data *\fImatch_data\fP, .B " PCRE2_SPTR \fIname\fP, PCRE2_UCHAR *\fIbuffer\fP, PCRE2_SIZE *\fIbufflen\fP);" .sp .B int pcre2_substring_copy_bynumber(pcre2_match_data *\fImatch_data\fP, .B " uint32_t \fInumber\fP, PCRE2_UCHAR *\fIbuffer\fP," .B " PCRE2_SIZE *\fIbufflen\fP);" .sp .B void pcre2_substring_free(PCRE2_UCHAR *\fIbuffer\fP); .sp .B int pcre2_substring_get_byname(pcre2_match_data *\fImatch_data\fP, .B " PCRE2_SPTR \fIname\fP, PCRE2_UCHAR **\fIbufferptr\fP, PCRE2_SIZE *\fIbufflen\fP);" .sp .B int pcre2_substring_get_bynumber(pcre2_match_data *\fImatch_data\fP, .B " uint32_t \fInumber\fP, PCRE2_UCHAR **\fIbufferptr\fP," .B " PCRE2_SIZE *\fIbufflen\fP);" .sp .B int pcre2_substring_length_byname(pcre2_match_data *\fImatch_data\fP, .B " PCRE2_SPTR \fIname\fP, PCRE2_SIZE *\fIlength\fP);" .sp .B int pcre2_substring_length_bynumber(pcre2_match_data *\fImatch_data\fP, .B " uint32_t \fInumber\fP, PCRE2_SIZE *\fIlength\fP);" .sp .B int pcre2_substring_nametable_scan(const pcre2_code *\fIcode\fP, .B " PCRE2_SPTR \fIname\fP, PCRE2_SPTR *\fIfirst\fP, PCRE2_SPTR *\fIlast\fP);" .sp .B int pcre2_substring_number_from_name(const pcre2_code *\fIcode\fP, .B " PCRE2_SPTR \fIname\fP);" .sp .B void pcre2_substring_list_free(PCRE2_UCHAR **\fIlist\fP); .sp .B int pcre2_substring_list_get(pcre2_match_data *\fImatch_data\fP, .B " PCRE2_UCHAR ***\fIlistptr\fP, PCRE2_SIZE **\fIlengthsptr\fP);" .fi . . .SH "PCRE2 NATIVE API STRING SUBSTITUTION FUNCTION" .rs .sp .nf .B int pcre2_substitute(const pcre2_code *\fIcode\fP, PCRE2_SPTR \fIsubject\fP, .B " PCRE2_SIZE \fIlength\fP, PCRE2_SIZE \fIstartoffset\fP," .B " uint32_t \fIoptions\fP, pcre2_match_data *\fImatch_data\fP," .B " pcre2_match_context *\fImcontext\fP, PCRE2_SPTR \fIreplacement\fP," .B " PCRE2_SIZE \fIrlength\fP, PCRE2_UCHAR *\fIoutputbuffer\fP," .B " PCRE2_SIZE *\fIoutlengthptr\fP);" .fi . . 
.SH "PCRE2 NATIVE API JIT FUNCTIONS" .rs .sp .nf .B int pcre2_jit_compile(pcre2_code *\fIcode\fP, uint32_t \fIoptions\fP); .sp .B int pcre2_jit_match(const pcre2_code *\fIcode\fP, PCRE2_SPTR \fIsubject\fP, .B " PCRE2_SIZE \fIlength\fP, PCRE2_SIZE \fIstartoffset\fP," .B " uint32_t \fIoptions\fP, pcre2_match_data *\fImatch_data\fP," .B " pcre2_match_context *\fImcontext\fP);" .sp .B void pcre2_jit_free_unused_memory(pcre2_general_context *\fIgcontext\fP); .sp .B pcre2_jit_stack *pcre2_jit_stack_create(size_t \fIstartsize\fP, .B " size_t \fImaxsize\fP, pcre2_general_context *\fIgcontext\fP);" .sp .B void pcre2_jit_stack_assign(pcre2_match_context *\fImcontext\fP, .B " pcre2_jit_callback \fIcallback_function\fP, void *\fIcallback_data\fP);" .sp .B void pcre2_jit_stack_free(pcre2_jit_stack *\fIjit_stack\fP); .fi . . .SH "PCRE2 NATIVE API SERIALIZATION FUNCTIONS" .rs .sp .nf .B int32_t pcre2_serialize_decode(pcre2_code **\fIcodes\fP, .B " int32_t \fInumber_of_codes\fP, const uint8_t *\fIbytes\fP," .B " pcre2_general_context *\fIgcontext\fP);" .sp .B int32_t pcre2_serialize_encode(const pcre2_code **\fIcodes\fP, .B " int32_t \fInumber_of_codes\fP, uint8_t **\fIserialized_bytes\fP," .B " PCRE2_SIZE *\fIserialized_size\fP, pcre2_general_context *\fIgcontext\fP);" .sp .B void pcre2_serialize_free(uint8_t *\fIbytes\fP); .sp .B int32_t pcre2_serialize_get_number_of_codes(const uint8_t *\fIbytes\fP); .fi . . 
.SH "PCRE2 NATIVE API AUXILIARY FUNCTIONS" .rs .sp .nf .B pcre2_code *pcre2_code_copy(const pcre2_code *\fIcode\fP); .sp .B pcre2_code *pcre2_code_copy_with_tables(const pcre2_code *\fIcode\fP); .sp .B int pcre2_get_error_message(int \fIerrorcode\fP, PCRE2_UCHAR *\fIbuffer\fP, .B " PCRE2_SIZE \fIbufflen\fP);" .sp .B const uint8_t *pcre2_maketables(pcre2_general_context *\fIgcontext\fP); .sp .B void pcre2_maketables_free(pcre2_general_context *\fIgcontext\fP, .B " const uint8_t *\fItables\fP);" .sp .B int pcre2_pattern_info(const pcre2_code *\fIcode\fP, uint32_t \fIwhat\fP, .B " void *\fIwhere\fP);" .sp .B int pcre2_callout_enumerate(const pcre2_code *\fIcode\fP, .B " int (*\fIcallback\fP)(pcre2_callout_enumerate_block *, void *)," .B " void *\fIuser_data\fP);" .sp .B int pcre2_config(uint32_t \fIwhat\fP, void *\fIwhere\fP); .fi . . .SH "PCRE2 NATIVE API OBSOLETE FUNCTIONS" .rs .sp .nf .B int pcre2_set_recursion_limit(pcre2_match_context *\fImcontext\fP, .B " uint32_t \fIvalue\fP);" .sp .B int pcre2_set_recursion_memory_management( .B " pcre2_match_context *\fImcontext\fP," .B " void *(*\fIprivate_malloc\fP)(size_t, void *)," .B " void (*\fIprivate_free\fP)(void *, void *), void *\fImemory_data\fP);" .fi .sp These functions became obsolete at release 10.30 and are retained only for backward compatibility. They should not be used in new code. The first is replaced by \fBpcre2_set_depth_limit()\fP; the second is no longer needed and has no effect (it always returns zero). . . 
.SH "PCRE2 EXPERIMENTAL PATTERN CONVERSION FUNCTIONS" .rs .sp .nf .B pcre2_convert_context *pcre2_convert_context_create( .B " pcre2_general_context *\fIgcontext\fP);" .sp .B pcre2_convert_context *pcre2_convert_context_copy( .B " pcre2_convert_context *\fIcvcontext\fP);" .sp .B void pcre2_convert_context_free(pcre2_convert_context *\fIcvcontext\fP); .sp .B int pcre2_set_glob_escape(pcre2_convert_context *\fIcvcontext\fP, .B " uint32_t \fIescape_char\fP);" .sp .B int pcre2_set_glob_separator(pcre2_convert_context *\fIcvcontext\fP, .B " uint32_t \fIseparator_char\fP);" .sp .B int pcre2_pattern_convert(PCRE2_SPTR \fIpattern\fP, PCRE2_SIZE \fIlength\fP, .B " uint32_t \fIoptions\fP, PCRE2_UCHAR **\fIbuffer\fP," .B " PCRE2_SIZE *\fIblength\fP, pcre2_convert_context *\fIcvcontext\fP);" .sp .B void pcre2_converted_pattern_free(PCRE2_UCHAR *\fIconverted_pattern\fP); .fi .sp These functions provide a way of converting non-PCRE2 patterns into patterns that can be processed by \fBpcre2_compile()\fP. This facility is experimental and may be changed in future releases. At present, "globs" and POSIX basic and extended patterns can be converted. Details are given in the .\" HREF \fBpcre2convert\fP .\" documentation. . . .SH "PCRE2 8-BIT, 16-BIT, AND 32-BIT LIBRARIES" .rs .sp There are three PCRE2 libraries, supporting 8-bit, 16-bit, and 32-bit code units, respectively. However, there is just one header file, \fBpcre2.h\fP. This contains the function prototypes and other definitions for all three libraries. One, two, or all three can be installed simultaneously. On Unix-like systems the libraries are called \fBlibpcre2-8\fP, \fBlibpcre2-16\fP, and \fBlibpcre2-32\fP, and they can also co-exist with the original PCRE libraries. 
Every PCRE2 function comes in three different forms, one for each library, for example: .sp \fBpcre2_compile_8()\fP \fBpcre2_compile_16()\fP \fBpcre2_compile_32()\fP .sp There are also three different sets of data types: .sp \fBPCRE2_UCHAR8, PCRE2_UCHAR16, PCRE2_UCHAR32\fP \fBPCRE2_SPTR8, PCRE2_SPTR16, PCRE2_SPTR32\fP .sp The UCHAR types define unsigned code units of the appropriate widths. For example, PCRE2_UCHAR16 is usually defined as `uint16_t'. The SPTR types are pointers to constants of the equivalent UCHAR types, that is, they are pointers to vectors of unsigned code units. .P Character strings are passed to a PCRE2 library as sequences of unsigned integers in code units of the appropriate width. The length of a string may be given as a number of code units, or the string may be specified as zero-terminated. .P Many applications use only one code unit width. For their convenience, macros are defined whose names are the generic forms such as \fBpcre2_compile()\fP and PCRE2_SPTR. These macros use the value of the macro PCRE2_CODE_UNIT_WIDTH to generate the appropriate width-specific function and macro names. PCRE2_CODE_UNIT_WIDTH is not defined by default. An application must define it to be 8, 16, or 32 before including \fBpcre2.h\fP in order to make use of the generic names. .P Applications that use more than one code unit width can be linked with more than one PCRE2 library, but must define PCRE2_CODE_UNIT_WIDTH to be 0 before including \fBpcre2.h\fP, and then use the real function names. Any code that is to be included in an environment where the value of PCRE2_CODE_UNIT_WIDTH is unknown should also use the real function names. (Unfortunately, it is not possible in C code to save and restore the value of a macro.) .P If PCRE2_CODE_UNIT_WIDTH is not defined before including \fBpcre2.h\fP, a compiler error occurs. 
.P When using multiple libraries in an application, you must take care when processing any particular pattern to use only functions from a single library. For example, if you want to run a match using a pattern that was compiled with \fBpcre2_compile_16()\fP, you must do so with \fBpcre2_match_16()\fP, not \fBpcre2_match_8()\fP or \fBpcre2_match_32()\fP. .P In the function summaries above, and in the rest of this document and other PCRE2 documents, functions and data types are described using their generic names, without the _8, _16, or _32 suffix. . . .SH "PCRE2 API OVERVIEW" .rs .sp PCRE2 has its own native API, which is described in this document. There are also some wrapper functions for the 8-bit library that correspond to the POSIX regular expression API, but they do not give access to all the functionality of PCRE2 and they are not thread-safe. They are described in the .\" HREF \fBpcre2posix\fP .\" documentation. Both these APIs define a set of C function calls. .P The native API C data types, function prototypes, option values, and error codes are defined in the header file \fBpcre2.h\fP, which also contains definitions of PCRE2_MAJOR and PCRE2_MINOR, the major and minor release numbers for the library. Applications can use these to include support for different releases of PCRE2. .P In a Windows environment, if you want to statically link an application program against a non-dll PCRE2 library, you must define PCRE2_STATIC before including \fBpcre2.h\fP. .P The functions \fBpcre2_compile()\fP and \fBpcre2_match()\fP are used for compiling and matching regular expressions in a Perl-compatible manner. A sample program that demonstrates the simplest way of using them is provided in the file called \fIpcre2demo.c\fP in the PCRE2 source distribution. A listing of this program is given in the .\" HREF \fBpcre2demo\fP .\" documentation, and the .\" HREF \fBpcre2sample\fP .\" documentation describes how to compile and run it. 
.P The compiling and matching functions recognize various options that are passed as bits in an options argument. There are also some more complicated parameters such as custom memory management functions and resource limits that are passed in "contexts" (which are just memory blocks, described below). Simple applications do not need to make use of contexts. .P Just-in-time (JIT) compiler support is an optional feature of PCRE2 that can be built in appropriate hardware environments. It greatly speeds up the matching performance of many patterns. Programs can request that it be used if available by calling \fBpcre2_jit_compile()\fP after a pattern has been successfully compiled by \fBpcre2_compile()\fP. This does nothing if JIT support is not available. .P More complicated programs might need to make use of the specialist functions \fBpcre2_jit_stack_create()\fP, \fBpcre2_jit_stack_free()\fP, and \fBpcre2_jit_stack_assign()\fP in order to control the JIT code's memory usage. .P JIT matching is automatically used by \fBpcre2_match()\fP if it is available, unless the PCRE2_NO_JIT option is set. There is also a direct interface for JIT matching, which gives improved performance at the expense of less sanity checking. The JIT-specific functions are discussed in the .\" HREF \fBpcre2jit\fP .\" documentation. .P A second matching function, \fBpcre2_dfa_match()\fP, which is not Perl-compatible, is also provided. This uses a different algorithm for the matching. The alternative algorithm finds all possible matches (at a given point in the subject), and scans the subject just once (unless there are lookaround assertions). However, this algorithm does not return captured substrings. A description of the two matching algorithms and their advantages and disadvantages is given in the .\" HREF \fBpcre2matching\fP .\" documentation. There is no JIT support for \fBpcre2_dfa_match()\fP. 
.P In addition to the main compiling and matching functions, there are convenience functions for extracting captured substrings from a subject string that has been matched by \fBpcre2_match()\fP. They are: .sp \fBpcre2_substring_copy_byname()\fP \fBpcre2_substring_copy_bynumber()\fP \fBpcre2_substring_get_byname()\fP \fBpcre2_substring_get_bynumber()\fP \fBpcre2_substring_list_get()\fP \fBpcre2_substring_length_byname()\fP \fBpcre2_substring_length_bynumber()\fP \fBpcre2_substring_nametable_scan()\fP \fBpcre2_substring_number_from_name()\fP .sp \fBpcre2_substring_free()\fP and \fBpcre2_substring_list_free()\fP are also provided, to free memory used for extracted strings. If either of these functions is called with a NULL argument, the function returns immediately without doing anything. .P The function \fBpcre2_substitute()\fP can be called to match a pattern and return a copy of the subject string with substitutions for parts that were matched. .P Functions whose names begin with \fBpcre2_serialize_\fP are used for saving compiled patterns on disc or elsewhere, and reloading them later. .P Finally, there are functions for finding out information about a compiled pattern (\fBpcre2_pattern_info()\fP) and about the configuration with which PCRE2 was built and which it is therefore using (\fBpcre2_config()\fP). .P Functions with names ending with \fB_free()\fP are used for freeing memory blocks of various sorts. In all cases, if one of these functions is called with a NULL argument, it does nothing. . . .SH "STRING LENGTHS AND OFFSETS" .rs .sp The PCRE2 API uses string lengths and offsets into strings of code units in several places. These values are always of type PCRE2_SIZE, which is an unsigned integer type, currently always defined as \fIsize_t\fP. The largest value that can be stored in such a type (that is ~(PCRE2_SIZE)0) is reserved as a special indicator for zero-terminated strings and unset offsets. 
Therefore, the longest string that can be handled is one less than this maximum. Note that string lengths are always given in code units. Only in the 8-bit library is such a length the same as the number of bytes in the string. . . .\" HTML .SH NEWLINES .rs .sp PCRE2 supports six different conventions for indicating line breaks in strings: a single CR (carriage return) character, a single LF (linefeed) character, the two-character sequence CRLF, any of the three preceding, any Unicode newline sequence, or the NUL character (binary zero). The Unicode newline sequences are CR, LF, and CRLF, plus the single characters VT (vertical tab, U+000B), FF (form feed, U+000C), NEL (next line, U+0085), LS (line separator, U+2028), and PS (paragraph separator, U+2029). .P Each of the first three conventions is used by at least one operating system as its standard newline sequence. When PCRE2 is built, a default can be specified. If it is not, the default is set to LF, which is the Unix standard. However, the newline convention can be changed by an application when calling \fBpcre2_compile()\fP, or it can be specified by special text at the start of the pattern itself; this overrides any other settings. See the .\" HREF \fBpcre2pattern\fP .\" page for details of the special character sequences. .P In the PCRE2 documentation the word "newline" is used to mean "the character or pair of characters that indicate a line break". The choice of newline convention affects the handling of the dot, circumflex, and dollar metacharacters, the handling of #-comments in /x mode, and, when CRLF is a recognized line ending sequence, the match position advancement for a non-anchored pattern. There is more detail about this in the .\" HTML .\" section on \fBpcre2_match()\fP options .\" below. .P The choice of newline convention does not affect the interpretation of the \en or \er escape sequences, nor does it affect what \eR matches; this has its own separate convention. . . 
.SH MULTITHREADING .rs .sp In a multithreaded application it is important to keep thread-specific data separate from data that can be shared between threads. The PCRE2 library code itself is thread-safe: it contains no static or global variables. The API is designed to be fairly simple for non-threaded applications while at the same time ensuring that multithreaded applications can use it. .P There are several different blocks of data that are used to pass information between the application and the PCRE2 libraries. . . .SS "The compiled pattern" .rs .sp A pointer to the compiled form of a pattern is returned to the user when \fBpcre2_compile()\fP is successful. The data in the compiled pattern is fixed, and does not change when the pattern is matched. Therefore, it is thread-safe, that is, the same compiled pattern can be used by more than one thread simultaneously. For example, an application can compile all its patterns at the start, before forking off multiple threads that use them. However, if the just-in-time (JIT) optimization feature is being used, it needs separate memory stack areas for each thread. See the .\" HREF \fBpcre2jit\fP .\" documentation for more details. .P In a more complicated situation, where patterns are compiled only when they are first needed, but are still shared between threads, pointers to compiled patterns must be protected from simultaneous writing by multiple threads. This is somewhat tricky to do correctly. If you know that writing to a pointer is atomic in your environment, you can use logic like this: .sp Get a read-only (shared) lock (mutex) for pointer if (pointer == NULL) { Get a write (unique) lock for pointer if (pointer == NULL) pointer = pcre2_compile(... } Release the lock Use pointer in pcre2_match() .sp Of course, testing for compilation errors should also be included in the code. 
.P The reason for checking the pointer a second time is as follows: Several threads may have acquired the shared lock and tested the pointer for being NULL, but only one of them will be given the write lock, with the rest kept waiting. The winning thread will compile the pattern and store the result. After this thread releases the write lock, another thread will get it, and if it does not retest pointer for being NULL, will recompile the pattern and overwrite the pointer, creating a memory leak and possibly causing other issues. .P In an environment where writing to a pointer may not be atomic, the above logic is not sufficient. The thread that is doing the compiling may be descheduled after writing only part of the pointer, which could cause other threads to use an invalid value. Instead of checking the pointer itself, a separate "pointer is valid" flag (that can be updated atomically) must be used: .sp Get a read-only (shared) lock (mutex) for pointer if (!pointer_is_valid) { Get a write (unique) lock for pointer if (!pointer_is_valid) { pointer = pcre2_compile(... pointer_is_valid = TRUE } } Release the lock Use pointer in pcre2_match() .sp If JIT is being used, but the JIT compilation is not being done immediately (perhaps waiting to see if the pattern is used often enough), similar logic is required. JIT compilation updates a value within the compiled code block, so a thread must gain unique write access to the pointer before calling \fBpcre2_jit_compile()\fP. Alternatively, \fBpcre2_code_copy()\fP or \fBpcre2_code_copy_with_tables()\fP can be used to obtain a private copy of the compiled code before calling the JIT compiler. . . .SS "Context blocks" .rs .sp The next main section below introduces the idea of "contexts" in which PCRE2 functions are called. A context is nothing more than a collection of parameters that control the way PCRE2 operates. 
Grouping a number of parameters together in a context is a convenient way of passing them to a PCRE2 function without using lots of arguments. The parameters that are stored in contexts are in some sense "advanced features" of the API. Many straightforward applications will not need to use contexts. .P In a multithreaded application, if the parameters in a context are values that are never changed, the same context can be used by all the threads. However, if any thread needs to change any value in a context, it must make its own thread-specific copy. . . .SS "Match blocks" .rs .sp The matching functions need a block of memory for storing the results of a match. This includes details of what was matched, as well as additional information such as the name of a (*MARK) setting. Each thread must provide its own copy of this memory. . . .SH "PCRE2 CONTEXTS" .rs .sp Some PCRE2 functions have a lot of parameters, many of which are used only by specialist applications, for example, those that use custom memory management or non-standard character tables. To keep function argument lists at a reasonable size, and at the same time to keep the API extensible, "uncommon" parameters are passed to certain functions in a \fBcontext\fP instead of directly. A context is just a block of memory that holds the parameter values. Applications that do not need to adjust any of the context parameters can pass NULL when a context pointer is required. .P There are three different types of context: a general context that is relevant for several PCRE2 operations, a compile-time context, and a match-time context. . . .SS "The general context" .rs .sp At present, this context just contains pointers to (and data for) external memory management functions that are called from several places in the PCRE2 library. The context is named `general' rather than specifically `memory' because in future other fields may be added. 
If you do not want to supply your own custom memory management functions, you do not need to bother with a general context. A general context is created by: .sp .nf .B pcre2_general_context *pcre2_general_context_create( .B " void *(*\fIprivate_malloc\fP)(PCRE2_SIZE, void *)," .B " void (*\fIprivate_free\fP)(void *, void *), void *\fImemory_data\fP);" .fi .sp The two function pointers specify custom memory management functions, whose prototypes are: .sp \fBvoid *private_malloc(PCRE2_SIZE, void *);\fP \fBvoid private_free(void *, void *);\fP .sp Whenever code in PCRE2 calls these functions, the final argument is the value of \fImemory_data\fP. Either of the first two arguments of the creation function may be NULL, in which case the system memory management functions \fImalloc()\fP and \fIfree()\fP are used. (This is not currently useful, as there are no other fields in a general context, but in future there might be.) The \fIprivate_malloc()\fP function is used (if supplied) to obtain memory for storing the context, and all three values are saved as part of the context. .P Whenever PCRE2 creates a data block of any kind, the block contains a pointer to the \fIfree()\fP function that matches the \fImalloc()\fP function that was used. When the time comes to free the block, this function is called. .P A general context can be copied by calling: .sp .nf .B pcre2_general_context *pcre2_general_context_copy( .B " pcre2_general_context *\fIgcontext\fP);" .fi .sp The memory used for a general context should be freed by calling: .sp .nf .B void pcre2_general_context_free(pcre2_general_context *\fIgcontext\fP); .fi .sp If this function is passed a NULL argument, it returns immediately without doing anything. . . 
.\" HTML .SS "The compile context" .rs .sp A compile context is required if you want to provide an external function for stack checking during compilation or to change the default values of any of the following compile-time parameters: .sp What \eR matches (Unicode newlines or CR, LF, CRLF only) PCRE2's character tables The newline character sequence The compile time nested parentheses limit The maximum length of the pattern string The extra options bits (none set by default) Which performance optimizations the compiler should apply .sp A compile context is also required if you are using custom memory management. If none of these apply, just pass NULL as the context argument of \fIpcre2_compile()\fP. .P A compile context is created, copied, and freed by the following functions: .sp .nf .B pcre2_compile_context *pcre2_compile_context_create( .B " pcre2_general_context *\fIgcontext\fP);" .sp .B pcre2_compile_context *pcre2_compile_context_copy( .B " pcre2_compile_context *\fIccontext\fP);" .sp .B void pcre2_compile_context_free(pcre2_compile_context *\fIccontext\fP); .fi .sp A compile context is created with default values for its parameters. These can be changed by calling the following functions, which return 0 on success, or PCRE2_ERROR_BADDATA if invalid data is detected. .sp .nf .B int pcre2_set_bsr(pcre2_compile_context *\fIccontext\fP, .B " uint32_t \fIvalue\fP);" .fi .sp The value must be PCRE2_BSR_ANYCRLF, to specify that \eR matches only CR, LF, or CRLF, or PCRE2_BSR_UNICODE, to specify that \eR matches any Unicode line ending sequence. The value is used by the JIT compiler and by the two interpreted matching functions, \fIpcre2_match()\fP and \fIpcre2_dfa_match()\fP. .sp .nf .B int pcre2_set_character_tables(pcre2_compile_context *\fIccontext\fP, .B " const uint8_t *\fItables\fP);" .fi .sp The value must be the result of a call to \fBpcre2_maketables()\fP, whose only argument is a general context. 
This function builds a set of character tables in the current locale. .sp .nf .B int pcre2_set_compile_extra_options(pcre2_compile_context *\fIccontext\fP, .B " uint32_t \fIextra_options\fP);" .fi .sp As PCRE2 has developed, almost all the 32 option bits that are available in the \fIoptions\fP argument of \fBpcre2_compile()\fP have been used up. To avoid running out, the compile context contains a set of extra option bits which are used for some newer, assumed rarer, options. This function sets those bits. It always sets all the bits (either on or off); that is, it replaces the existing setting as a whole rather than modifying individual bits of it. The available options are defined in the section entitled "Extra compile options" .\" HTML .\" below. .\" .sp .nf .B int pcre2_set_max_pattern_length(pcre2_compile_context *\fIccontext\fP, .B " PCRE2_SIZE \fIvalue\fP);" .fi .sp This sets a maximum length, in code units, for any pattern string that is compiled with this context. If the pattern is longer, an error is generated. This facility is provided so that applications that accept patterns from external sources can limit their size. The default is the largest number that a PCRE2_SIZE variable can hold, which is effectively unlimited. .sp .nf .B int pcre2_set_max_pattern_compiled_length( .B " pcre2_compile_context *\fIccontext\fP, PCRE2_SIZE \fIvalue\fP);" .fi .sp This sets a maximum size, in bytes, for the memory needed to hold the compiled version of a pattern that is compiled with this context. If the pattern needs more memory, an error is generated. This facility is provided so that applications that accept patterns from external sources can limit the amount of memory they use. The default is the largest number that a PCRE2_SIZE variable can hold, which is effectively unlimited. .sp .nf .B int pcre2_set_max_varlookbehind(pcre2_compile_context *\fIccontext\fP, .B " uint32_t \fIvalue\fP);" .fi .sp This sets a maximum length for the number of characters matched by a variable-length lookbehind assertion. 
The default is set when PCRE2 is built, with the ultimate default being 255, the same as Perl. Lookbehind assertions without a bounding length are not supported. .sp .nf .B int pcre2_set_newline(pcre2_compile_context *\fIccontext\fP, .B " uint32_t \fIvalue\fP);" .fi .sp This specifies which characters or character sequences are to be recognized as newlines. The value must be one of PCRE2_NEWLINE_CR (carriage return only), PCRE2_NEWLINE_LF (linefeed only), PCRE2_NEWLINE_CRLF (the two-character sequence CR followed by LF), PCRE2_NEWLINE_ANYCRLF (any of the above), PCRE2_NEWLINE_ANY (any Unicode newline sequence), or PCRE2_NEWLINE_NUL (the NUL character, that is a binary zero). .P A pattern can override the value set in the compile context by starting with a sequence such as (*CRLF). See the .\" HREF \fBpcre2pattern\fP .\" page for details. .P When a pattern is compiled with the PCRE2_EXTENDED or PCRE2_EXTENDED_MORE option, the newline convention affects the recognition of the end of internal comments starting with #. The value is saved with the compiled pattern for subsequent use by the JIT compiler and by the two interpreted matching functions, \fIpcre2_match()\fP and \fIpcre2_dfa_match()\fP. .sp .nf .B int pcre2_set_parens_nest_limit(pcre2_compile_context *\fIccontext\fP, .B " uint32_t \fIvalue\fP);" .fi .sp This parameter adjusts the limit, set when PCRE2 is built (default 250), on the depth of parenthesis nesting in a pattern. This limit stops rogue patterns using up too much system stack when being compiled. The limit applies to parentheses of all kinds, not just capturing parentheses. .sp .nf .B int pcre2_set_compile_recursion_guard(pcre2_compile_context *\fIccontext\fP, .B " int (*\fIguard_function\fP)(uint32_t, void *), void *\fIuser_data\fP);" .fi .sp There is at least one application that runs PCRE2 in threads with very limited system stack, where running out of stack is to be avoided at all costs. 
The parenthesis limit above cannot take account of how much stack is actually available during compilation. For a finer control, you can supply a function that is called whenever \fBpcre2_compile()\fP starts to compile a parenthesized part of a pattern. This function can check the actual stack size (or anything else that it wants to, of course). .P The first argument to the callout function gives the current depth of nesting, and the second is user data that is set up by the last argument of \fBpcre2_set_compile_recursion_guard()\fP. The callout function should return zero if all is well, or non-zero to force an error. .sp .nf .B int pcre2_set_optimize(pcre2_compile_context *\fIccontext\fP, .B " uint32_t \fIdirective\fP);" .fi .sp PCRE2 can apply various performance optimizations during compilation, in order to make matching faster. For example, the compiler might convert some regex constructs into an equivalent construct which \fBpcre2_match()\fP can execute faster. By default, all available optimizations are enabled. However, in rare cases, one might wish to disable specific optimizations. For example, if it is known that some optimizations cannot benefit a certain regex, it might be desirable to disable them, in order to speed up compilation. .P The permitted values of \fIdirective\fP are as follows: .sp PCRE2_OPTIMIZATION_FULL .sp Enable all optional performance optimizations. This is the default value. .sp PCRE2_OPTIMIZATION_NONE .sp Disable all optional performance optimizations. .sp PCRE2_AUTO_POSSESS PCRE2_AUTO_POSSESS_OFF .sp Enable/disable "auto-possessification" of variable quantifiers such as * and +. This optimization, for example, turns a+b into a++b in order to avoid backtracks into a+ that can never be successful. However, if callouts are in use, auto-possessification means that some callouts are never taken. You can disable this optimization if you want the matching functions to do a full, unoptimized search and run all the callouts. 
.sp PCRE2_DOTSTAR_ANCHOR PCRE2_DOTSTAR_ANCHOR_OFF .sp Enable/disable an optimization that is applied when .* is the first significant item in a top-level branch of a pattern, and all the other branches also start with .* or with \eA or \eG or ^. Such a pattern is automatically anchored if PCRE2_DOTALL is set for all the .* items and PCRE2_MULTILINE is not set for any ^ items. Otherwise, the fact that any match must start either at the start of the subject or following a newline is remembered. Like other optimizations, this can cause callouts to be skipped. .P Dotstar anchor optimization is automatically disabled for .* if it is inside an atomic group or a capture group that is the subject of a backreference, or if the pattern contains (*PRUNE) or (*SKIP). .sp PCRE2_START_OPTIMIZE PCRE2_START_OPTIMIZE_OFF .sp Enable/disable optimizations which cause matching functions to scan the subject string for specific code unit values before attempting a match. For example, if it is known that an unanchored match must start with a specific value, the matching code searches the subject for that value, and fails immediately if it cannot find it, without actually running the main matching function. This means that a special item such as (*COMMIT) at the start of a pattern is not considered until after a suitable starting point for the match has been found. Also, when callouts or (*MARK) items are in use, these "start-up" optimizations can cause them to be skipped if the pattern is never actually used. The start-up optimizations are in effect a pre-scan of the subject that takes place before the pattern is run. .P Disabling start-up optimizations ensures that in cases where the result is "no match", the callouts do occur, and that items such as (*COMMIT) and (*MARK) are considered at every possible starting position in the subject string. .P Disabling start-up optimizations may change the outcome of a matching operation. 
Consider the pattern .sp (*COMMIT)ABC .sp When this is compiled, PCRE2 records the fact that a match must start with the character "A". Suppose the subject string is "DEFABC". The start-up optimization scans along the subject, finds "A" and runs the first match attempt from there. The (*COMMIT) item means that the pattern must match the current starting position, which in this case, it does. However, if the same match is run without start-up optimizations, the initial scan along the subject string does not happen. The first match attempt is run starting from "D" and when this fails, (*COMMIT) prevents any further matches being tried, so the overall result is "no match". .P Another start-up optimization makes use of a minimum length for a matching subject, which is recorded when possible. Consider the pattern .sp (*MARK:1)B(*MARK:2)(X|Y) .sp The minimum length for a match is two characters. If the subject is "XXBB", the "starting character" optimization skips "XX", then tries to match "BB", which is long enough. In the process, (*MARK:2) is encountered and remembered. When the match attempt fails, the next "B" is found, but there is only one character left, so there are no more attempts, and "no match" is returned with the "last mark seen" set to "2". Without start-up optimizations, however, matches are tried at every possible starting position, including at the end of the subject, where (*MARK:1) is encountered, but there is no "B", so the "last mark seen" that is returned is "1". In this case, the optimizations do not affect the overall match result, which is still "no match", but they do affect the auxiliary information that is returned. . . 
.\" HTML .SS "The match context" .rs .sp A match context is required if you want to: .sp Set up a callout function Set an offset limit for matching an unanchored pattern Change the limit on the amount of heap used when matching Change the backtracking match limit Change the backtracking depth limit Set custom memory management specifically for the match .sp If none of these apply, just pass NULL as the context argument of \fBpcre2_match()\fP, \fBpcre2_dfa_match()\fP, or \fBpcre2_jit_match()\fP. .P A match context is created, copied, and freed by the following functions: .sp .nf .B pcre2_match_context *pcre2_match_context_create( .B " pcre2_general_context *\fIgcontext\fP);" .sp .B pcre2_match_context *pcre2_match_context_copy( .B " pcre2_match_context *\fImcontext\fP);" .sp .B void pcre2_match_context_free(pcre2_match_context *\fImcontext\fP); .fi .sp A match context is created with default values for its parameters. These can be changed by calling the following functions, which return 0 on success, or PCRE2_ERROR_BADDATA if invalid data is detected. .sp .nf .B int pcre2_set_callout(pcre2_match_context *\fImcontext\fP, .B " int (*\fIcallout_function\fP)(pcre2_callout_block *, void *)," .B " void *\fIcallout_data\fP);" .fi .sp This sets up a callout function for PCRE2 to call at specified points during a matching operation. Details are given in the .\" HREF \fBpcre2callout\fP .\" documentation. .sp .nf .B int pcre2_set_substitute_callout(pcre2_match_context *\fImcontext\fP, .B " int (*\fIcallout_function\fP)(pcre2_substitute_callout_block *, void *)," .B " void *\fIcallout_data\fP);" .fi .sp This sets up a callout function for PCRE2 to call after each substitution made by \fBpcre2_substitute()\fP. Details are given in the section entitled "Creating a new string with substitutions" .\" HTML .\" below. 
.\" .sp .nf .B int pcre2_set_substitute_case_callout(pcre2_match_context *\fImcontext\fP, .B " PCRE2_SIZE (*\fIcallout_function\fP)(PCRE2_SPTR, PCRE2_SIZE," .B " PCRE2_UCHAR *, PCRE2_SIZE," .B " int, void *)," .B " void *\fIcallout_data\fP);" .fi .sp This sets up a callout function for PCRE2 to call when performing case transformations inside \fBpcre2_substitute()\fP. Details are given in the section entitled "Creating a new string with substitutions" .\" HTML .\" below. .\" .sp .nf .B int pcre2_set_offset_limit(pcre2_match_context *\fImcontext\fP, .B " PCRE2_SIZE \fIvalue\fP);" .fi .sp The \fIoffset_limit\fP parameter limits how far an unanchored search can advance in the subject string. The default value is PCRE2_UNSET. The \fBpcre2_match()\fP and \fBpcre2_dfa_match()\fP functions return PCRE2_ERROR_NOMATCH if a match with a starting point before or at the given offset is not found. The \fBpcre2_substitute()\fP function makes no more substitutions. .P For example, if the pattern /abc/ is matched against "123abc" with an offset limit less than 3, the result is PCRE2_ERROR_NOMATCH. A match can never be found if the \fIstartoffset\fP argument of \fBpcre2_match()\fP, \fBpcre2_dfa_match()\fP, or \fBpcre2_substitute()\fP is greater than the offset limit set in the match context. .P When using this facility, you must set the PCRE2_USE_OFFSET_LIMIT option when calling \fBpcre2_compile()\fP so that when JIT is in use, different code can be compiled. If a match is started with a non-default offset limit when PCRE2_USE_OFFSET_LIMIT is not set, an error is generated. .P The offset limit facility can be used to track progress when searching large subject strings or to limit the extent of global substitutions. See also the PCRE2_FIRSTLINE option, which requires a match to start before or at the first newline that follows the start of matching in the subject. If this is set with an offset limit, a match must occur in the first line and also within the offset limit. 
In other words, whichever limit comes first is used. .sp .nf .B int pcre2_set_heap_limit(pcre2_match_context *\fImcontext\fP, .B " uint32_t \fIvalue\fP);" .fi .sp The \fIheap_limit\fP parameter specifies, in units of kibibytes (1024 bytes), the maximum amount of heap memory that \fBpcre2_match()\fP may use to hold backtracking information when running an interpretive match. This limit also applies to \fBpcre2_dfa_match()\fP, which may use the heap when processing patterns with a lot of nested pattern recursion or lookarounds or atomic groups. This limit does not apply to matching with the JIT optimization, which has its own memory control arrangements (see the .\" HREF \fBpcre2jit\fP .\" documentation for more details). If the limit is reached, the negative error code PCRE2_ERROR_HEAPLIMIT is returned. The default limit can be set when PCRE2 is built; if it is not, the default is set very large and is essentially unlimited. .P A value for the heap limit may also be supplied by an item at the start of a pattern of the form .sp (*LIMIT_HEAP=ddd) .sp where ddd is a decimal number. However, such a setting is ignored unless ddd is less than the limit set by the caller of \fBpcre2_match()\fP or, if no such limit is set, less than the default. .P The \fBpcre2_match()\fP function always needs some heap memory, so setting a value of zero guarantees a "heap limit exceeded" error. Details of how \fBpcre2_match()\fP uses the heap are given in the .\" HREF \fBpcre2perform\fP .\" documentation. .P For \fBpcre2_dfa_match()\fP, a vector on the system stack is used when processing pattern recursions, lookarounds, or atomic groups, and only if this is not big enough is heap memory used. In this case, setting a value of zero disables the use of the heap. 
.sp .nf .B int pcre2_set_match_limit(pcre2_match_context *\fImcontext\fP, .B " uint32_t \fIvalue\fP);" .fi .sp The \fImatch_limit\fP parameter provides a means of preventing PCRE2 from using up too many computing resources when processing patterns that are not going to match, but which have a very large number of possibilities in their search trees. The classic example is a pattern that uses nested unlimited repeats. .P There is an internal counter in \fBpcre2_match()\fP that is incremented each time round its main matching loop. If this value reaches the match limit, \fBpcre2_match()\fP returns the negative value PCRE2_ERROR_MATCHLIMIT. This has the effect of limiting the amount of backtracking that can take place. For patterns that are not anchored, the count restarts from zero for each position in the subject string. This limit also applies to \fBpcre2_dfa_match()\fP, though the counting is done in a different way. .P When \fBpcre2_match()\fP is called with a pattern that was successfully processed by \fBpcre2_jit_compile()\fP, the way in which matching is executed is entirely different. However, there is still the possibility of runaway matching that goes on for a very long time, and so the \fImatch_limit\fP value is also used in this case (but in a different way) to limit how long the matching can continue. .P The default value for the limit can be set when PCRE2 is built; the default is 10 million, which handles all but the most extreme cases. A value for the match limit may also be supplied by an item at the start of a pattern of the form .sp (*LIMIT_MATCH=ddd) .sp where ddd is a decimal number. However, such a setting is ignored unless ddd is less than the limit set by the caller of \fBpcre2_match()\fP or \fBpcre2_dfa_match()\fP or, if no such limit is set, less than the default. 
.sp .nf .B int pcre2_set_depth_limit(pcre2_match_context *\fImcontext\fP, .B " uint32_t \fIvalue\fP);" .fi .sp This parameter limits the depth of nested backtracking in \fBpcre2_match()\fP. Each time a nested backtracking point is passed, a new memory frame is used to remember the state of matching at that point. Thus, this parameter indirectly limits the amount of memory that is used in a match. However, because the size of each memory frame depends on the number of capturing parentheses, the actual memory limit varies from pattern to pattern. This limit was more useful in versions before 10.30, where function recursion was used for backtracking. .P The depth limit is not relevant, and is ignored, when matching is done using JIT compiled code. However, it is supported by \fBpcre2_dfa_match()\fP, which uses it to limit the depth of nested internal recursive function calls that implement atomic groups, lookaround assertions, and pattern recursions. This limits, indirectly, the amount of system stack that is used. It was more useful in versions before 10.32, when stack memory was used for local workspace vectors for recursive function calls. From version 10.32, only local variables are allocated on the stack and as each call uses only a few hundred bytes, even a small stack can support quite a lot of recursion. .P If the depth of internal recursive function calls is great enough, local workspace vectors are allocated on the heap from version 10.32 onwards, so the depth limit also indirectly limits the amount of heap memory that is used. A recursive pattern such as /(.(?2))((?1)|)/, when matched to a very long string using \fBpcre2_dfa_match()\fP, can use a great deal of memory. However, it is probably better to limit heap usage directly by calling \fBpcre2_set_heap_limit()\fP. .P The default value for the depth limit can be set when PCRE2 is built; if it is not, the default is set to the same value as the default for the match limit. 
If the limit is exceeded, \fBpcre2_match()\fP or \fBpcre2_dfa_match()\fP returns PCRE2_ERROR_DEPTHLIMIT. A value for the depth limit may also be supplied by an item at the start of a pattern of the form .sp (*LIMIT_DEPTH=ddd) .sp where ddd is a decimal number. However, such a setting is ignored unless ddd is less than the limit set by the caller of \fBpcre2_match()\fP or \fBpcre2_dfa_match()\fP or, if no such limit is set, less than the default. . . .SH "CHECKING BUILD-TIME OPTIONS" .rs .sp .B int pcre2_config(uint32_t \fIwhat\fP, void *\fIwhere\fP); .P The function \fBpcre2_config()\fP makes it possible for a PCRE2 client to find the value of certain configuration parameters and to discover which optional features have been compiled into the PCRE2 library. The .\" HREF \fBpcre2build\fP .\" documentation has more details about these features. .P The first argument for \fBpcre2_config()\fP specifies which information is required. The second argument is a pointer to memory into which the information is placed. If NULL is passed, the function returns the amount of memory that is needed for the requested information. For calls that return numerical values, the value is in bytes; when requesting these values, \fIwhere\fP should point to appropriately aligned memory. For calls that return strings, the required length is given in code units, not counting the terminating zero. .P When requesting information, the returned value from \fBpcre2_config()\fP is non-negative on success, or the negative error code PCRE2_ERROR_BADOPTION if the value in the first argument is not recognized. The following information is available: .sp PCRE2_CONFIG_BSR .sp The output is a uint32_t integer whose value indicates what character sequences the \eR escape sequence matches by default. A value of PCRE2_BSR_UNICODE means that \eR matches any Unicode line ending sequence; a value of PCRE2_BSR_ANYCRLF means that \eR matches only CR, LF, or CRLF. 
The default can be overridden when a pattern is compiled. .sp PCRE2_CONFIG_COMPILED_WIDTHS .sp The output is a uint32_t integer whose lower bits indicate which code unit widths were selected when PCRE2 was built. The 1-bit indicates 8-bit support, and the 2-bit and 4-bit indicate 16-bit and 32-bit support, respectively. .sp PCRE2_CONFIG_DEPTHLIMIT .sp The output is a uint32_t integer that gives the default limit for the depth of nested backtracking in \fBpcre2_match()\fP or the depth of nested recursions, lookarounds, and atomic groups in \fBpcre2_dfa_match()\fP. Further details are given with \fBpcre2_set_depth_limit()\fP above. .sp PCRE2_CONFIG_EFFECTIVE_LINKSIZE .sp The output is a uint32_t integer that contains the number of bytes the library uses for internal linkage in compiled regular expressions. Its value is derived from the value that was provided at build time and that is described below by PCRE2_CONFIG_LINKSIZE. .sp PCRE2_CONFIG_HEAPLIMIT .sp The output is a uint32_t integer that gives, in kibibytes, the default limit for the amount of heap memory used by \fBpcre2_match()\fP or \fBpcre2_dfa_match()\fP. Further details are given with \fBpcre2_set_heap_limit()\fP above. .sp PCRE2_CONFIG_JIT .sp The output is a uint32_t integer that is set to one if support for just-in-time compiling is included in the library; otherwise it is set to zero. Note that having the support in the library does not guarantee that JIT will be used for any given match, and neither does it guarantee that JIT will actually be able to function, because it may not be able to allocate executable memory in some environments. There is a special call to \fBpcre2_jit_compile()\fP that can be used to check this. See the .\" HREF \fBpcre2jit\fP .\" documentation for more details. .sp PCRE2_CONFIG_JITTARGET .sp The \fIwhere\fP argument should point to a code-unit-aligned buffer. All previous versions of PCRE2 have required no more than 128 code units of buffer capacity. 
However, this requirement is not guaranteed to be maintained, so applications should call \fBpcre2_config()\fP with \fIwhere\fP set to NULL to receive the required buffer size, then assert or allocate a suitably-sized buffer for a second call to \fBpcre2_config()\fP. The buffer is filled with a string that contains the name of the architecture for which the JIT compiler is configured at build time, for example, a 64-bit ARM CPU that supports the Armv8.1 extension writes "ARM-64 (LSE) 64bit (little endian + unaligned)". If JIT support is not available, PCRE2_ERROR_BADOPTION is returned; otherwise the number of code units used is returned. This is the length of the string plus one unit for the terminating zero. .sp PCRE2_CONFIG_LINKSIZE .sp The output is a uint32_t integer that contains the number of bytes the library was instructed to use for internal linkage in compiled regular expressions. When PCRE2 is configured, the value can be set to 2, 3, or 4, with the default being 2 for most libraries. .P The actual number of bytes used depends on the size of the code units that the library supports and can be higher. See PCRE2_CONFIG_EFFECTIVE_LINKSIZE above for details. .P The default value of 2 for the 8-bit and 16-bit libraries is sufficient for all but the most massive patterns, since it allows the size of the compiled pattern to be up to 65535 code units. Larger values allow larger regular expressions to be compiled by those two libraries, but at the expense of slower matching. .sp PCRE2_CONFIG_MATCHLIMIT .sp The output is a uint32_t integer that gives the default match limit for \fBpcre2_match()\fP. Further details are given with \fBpcre2_set_match_limit()\fP above. .sp PCRE2_CONFIG_NEWLINE .sp The output is a uint32_t integer whose value specifies the default character sequence that is recognized as meaning "newline".
The values are: .sp PCRE2_NEWLINE_CR Carriage return (CR) PCRE2_NEWLINE_LF Linefeed (LF) PCRE2_NEWLINE_CRLF Carriage return, linefeed (CRLF) PCRE2_NEWLINE_ANY Any Unicode line ending PCRE2_NEWLINE_ANYCRLF Any of CR, LF, or CRLF PCRE2_NEWLINE_NUL The NUL character (binary zero) .sp The default should normally correspond to the standard sequence for your operating system. .sp PCRE2_CONFIG_NEVER_BACKSLASH_C .sp The output is a uint32_t integer that is set to one if the use of \eC was permanently disabled when PCRE2 was built; otherwise it is set to zero. .sp PCRE2_CONFIG_PARENSLIMIT .sp The output is a uint32_t integer that gives the maximum depth of nesting of parentheses (of any kind) in a pattern. This limit is imposed to cap the amount of system stack used when a pattern is compiled. It is specified when PCRE2 is built; the default is 250. This limit does not take into account the stack that may already be used by the calling application. For finer control over compilation stack usage, see \fBpcre2_set_compile_recursion_guard()\fP. .sp PCRE2_CONFIG_STACKRECURSE .sp This parameter is obsolete and should not be used in new code. The output is a uint32_t integer that is always set to zero. .sp PCRE2_CONFIG_TABLES_LENGTH .sp The output is a uint32_t integer that gives the length of PCRE2's character processing tables in bytes. For details of these tables see the .\" HTML .\" section on locale support .\" below. .sp PCRE2_CONFIG_UNICODE_VERSION .sp The \fIwhere\fP argument should point to a code-unit-aligned buffer. All previous versions of PCRE2 have required no more than 24 code units of buffer capacity. However, applications should call \fBpcre2_config()\fP with \fIwhere\fP set to NULL to receive the required buffer size, then assert or allocate a suitably-sized buffer for a second call to \fBpcre2_config()\fP. If PCRE2 has been compiled without Unicode support, the buffer is filled with the text "Unicode not supported".
Otherwise, the Unicode version string (for example, "8.0.0") is written. The number of code units used is returned. This is the length of the string plus one unit for the terminating zero. .sp PCRE2_CONFIG_UNICODE .sp The output is a uint32_t integer that is set to one if Unicode support is available; otherwise it is set to zero. Unicode support implies UTF support. .sp PCRE2_CONFIG_VERSION .sp The \fIwhere\fP argument should point to a code-unit-aligned buffer. All previous versions of PCRE2 have required no more than 24 code units of buffer capacity. However, applications should call \fBpcre2_config()\fP with \fIwhere\fP set to NULL to receive the required buffer size, then assert or allocate a suitably-sized buffer for a second call to \fBpcre2_config()\fP. The buffer is filled with the PCRE2 version string, zero-terminated. The number of code units used is returned. This is the length of the string plus one unit for the terminating zero. . . .\" HTML .SH "COMPILING A PATTERN" .rs .sp .nf .B pcre2_code *pcre2_compile(PCRE2_SPTR \fIpattern\fP, PCRE2_SIZE \fIlength\fP, .B " uint32_t \fIoptions\fP, int *\fIerrorcode\fP, PCRE2_SIZE *\fIerroroffset,\fP" .B " pcre2_compile_context *\fIccontext\fP);" .sp .B void pcre2_code_free(pcre2_code *\fIcode\fP); .sp .B pcre2_code *pcre2_code_copy(const pcre2_code *\fIcode\fP); .sp .B pcre2_code *pcre2_code_copy_with_tables(const pcre2_code *\fIcode\fP); .fi .P The \fBpcre2_compile()\fP function compiles a pattern into an internal form. The pattern is defined by a pointer to a string of code units and a length in code units. If the pattern is zero-terminated, the length can be specified as PCRE2_ZERO_TERMINATED. A NULL pattern pointer with a length of zero is treated as an empty string (NULL with a non-zero length causes an error return). The function returns a pointer to a block of memory that contains the compiled pattern and related data, or NULL if an error occurred.
.P If the compile context argument \fIccontext\fP is NULL, memory for the compiled pattern is obtained by calling \fBmalloc()\fP. Otherwise, it is obtained from the same memory function that was used for the compile context. The caller must free the memory by calling \fBpcre2_code_free()\fP when it is no longer needed. If \fBpcre2_code_free()\fP is called with a NULL argument, it returns immediately, without doing anything. .P The function \fBpcre2_code_copy()\fP makes a copy of the compiled code in new memory, using the same memory allocator as was used for the original. However, if the code has been processed by the JIT compiler (see .\" HTML .\" below), .\" the JIT information cannot be copied (because it is position-dependent). The new copy can initially be used only for non-JIT matching, though it can be passed to \fBpcre2_jit_compile()\fP if required. If \fBpcre2_code_copy()\fP is called with a NULL argument, it returns NULL. .P The \fBpcre2_code_copy()\fP function provides a way for individual threads in a multithreaded application to acquire a private copy of shared compiled code. However, it does not make a copy of the character tables used by the compiled pattern; the new pattern code points to the same tables as the original code. (See .\" HTML .\" "Locale Support" .\" below for details of these character tables.) In many applications the same tables are used throughout, so this behaviour is appropriate. Nevertheless, there are occasions when a copy of a compiled pattern and the relevant tables are needed. The \fBpcre2_code_copy_with_tables()\fP provides this facility. Copies of both the code and the tables are made, with the new code pointing to the new tables. The memory for the new tables is automatically freed when \fBpcre2_code_free()\fP is called for the new copy of the compiled code. If \fBpcre2_code_copy_with_tables()\fP is called with a NULL argument, it returns NULL. 
.P NOTE: When one of the matching functions is called, pointers to the compiled pattern and the subject string are set in the match data block so that they can be referenced by the substring extraction functions after a successful match. After running a match, you must not free a compiled pattern or a subject string until after all operations on the .\" HTML .\" match data block .\" have taken place, unless, in the case of the subject string, you have used the PCRE2_COPY_MATCHED_SUBJECT option, which is described in the section entitled "Option bits for \fBpcre2_match()\fP" .\" HTML .\" below. .\" .P The \fIoptions\fP argument for \fBpcre2_compile()\fP contains various bit settings that affect the compilation. It should be zero if none of them are required. The available options are described below. Some of them (in particular, those that are compatible with Perl, but some others as well) can also be set and unset from within the pattern (see the detailed description in the .\" HREF \fBpcre2pattern\fP .\" documentation). .P For those options that can be different in different parts of the pattern, the contents of the \fIoptions\fP argument specifies their settings at the start of compilation. The PCRE2_ANCHORED, PCRE2_ENDANCHORED, and PCRE2_NO_UTF_CHECK options can be set at the time of matching as well as at compile time. .P Some additional options and less frequently required compile-time parameters (for example, the newline setting) can be provided in a compile context (as described .\" HTML .\" above). .\" .P If \fIerrorcode\fP or \fIerroroffset\fP is NULL, \fBpcre2_compile()\fP returns NULL immediately. Otherwise, the variables to which these point are set to an error code and an offset (number of code units) within the pattern, respectively, when \fBpcre2_compile()\fP returns NULL because a compilation error has occurred. .P There are over 100 positive error codes that \fBpcre2_compile()\fP may return if it finds an error in the pattern. 
There are also some negative error codes that are used for invalid UTF strings when validity checking is in force. These are the same as given by \fBpcre2_match()\fP and \fBpcre2_dfa_match()\fP, and are described in the .\" HREF \fBpcre2unicode\fP .\" documentation. There is no separate documentation for the positive error codes, because the textual error messages that are obtained by calling the \fBpcre2_get_error_message()\fP function (see "Obtaining a textual error message" .\" HTML .\" below) .\" should be self-explanatory. Macro names starting with PCRE2_ERROR_ are defined for both positive and negative error codes in \fBpcre2.h\fP. When compilation is successful \fIerrorcode\fP is set to a value that returns the message "no error" if passed to \fBpcre2_get_error_message()\fP. .P The value returned in \fIerroroffset\fP is an indication of where in the pattern an error occurred. When there is no error, zero is returned. A non-zero value is not necessarily the furthest point in the pattern that was read. For example, after the error "lookbehind assertion is not fixed length", the error offset points to the start of the failing assertion. For an invalid UTF-8 or UTF-16 string, the offset is that of the first code unit of the failing character. .P Some errors are not detected until the whole pattern has been scanned; in these cases, the offset passed back is the length of the pattern. Note that the offset is in code units, not characters, even in a UTF mode. It may sometimes point into the middle of a UTF-8 or UTF-16 character. .P This code fragment shows a typical straightforward call to \fBpcre2_compile()\fP: .sp pcre2_code *re; PCRE2_SIZE erroffset; int errorcode; re = pcre2_compile( "^A.*Z", /* the pattern */ PCRE2_ZERO_TERMINATED, /* the pattern is zero-terminated */ 0, /* default options */ &errorcode, /* for error code */ &erroffset, /* for error offset */ NULL); /* no compile context */ .sp . . 
.SS "Main compile options" .rs .sp The following names for option bits are defined in the \fBpcre2.h\fP header file: .sp PCRE2_ANCHORED .sp If this bit is set, the pattern is forced to be "anchored", that is, it is constrained to match only at the first matching point in the string that is being searched (the "subject string"). This effect can also be achieved by appropriate constructs in the pattern itself, which is the only way to do it in Perl. .sp PCRE2_ALLOW_EMPTY_CLASS .sp By default, for compatibility with Perl, a closing square bracket that immediately follows an opening one is treated as a data character for the class. When PCRE2_ALLOW_EMPTY_CLASS is set, it terminates the class, which therefore contains no characters and so can never match. .sp PCRE2_ALT_BSUX .sp This option requests alternative handling of three escape sequences, which makes PCRE2's behaviour more like ECMAscript (aka JavaScript). When it is set: .P (1) \eU matches an upper case "U" character; by default \eU causes a compile time error (Perl uses \eU to upper case subsequent characters). .P (2) \eu matches a lower case "u" character unless it is followed by four hexadecimal digits, in which case the hexadecimal number defines the code point to match. By default, \eu causes a compile time error (Perl uses it to upper case the following character). .P (3) \ex matches a lower case "x" character unless it is followed by two hexadecimal digits, in which case the hexadecimal number defines the code point to match. By default, as in Perl, a hexadecimal number is always expected after \ex, but it may have one or two digits. .P ECMAscript 6 added additional functionality to \eu. This can be accessed using the PCRE2_EXTRA_ALT_BSUX extra option (see "Extra compile options" .\" HTML .\" below). .\" Note that this alternative escape handling applies only to patterns. Neither of these options affects the processing of replacement strings passed to \fBpcre2_substitute()\fP.
.sp PCRE2_ALT_CIRCUMFLEX .sp In multiline mode (when PCRE2_MULTILINE is set), the circumflex metacharacter matches at the start of the subject (unless PCRE2_NOTBOL is set), and also after any internal newline. However, it does not match after a newline at the end of the subject, for compatibility with Perl. If you want a multiline circumflex also to match after a terminating newline, you must set PCRE2_ALT_CIRCUMFLEX. .sp PCRE2_ALT_EXTENDED_CLASS .sp Alters the parsing of character classes to follow the extended syntax described by Unicode UTS#18. The PCRE2_ALT_EXTENDED_CLASS option has no impact on the behaviour of the Perl-specific "(?[...])" syntax for extended classes, but instead enables the alternative syntax of extended class behaviour inside ordinary "[...]" character classes. See the .\" HREF \fBpcre2pattern\fP .\" documentation for details of the character classes supported. .sp PCRE2_ALT_VERBNAMES .sp By default, for compatibility with Perl, the name in any verb sequence such as (*MARK:NAME) is any sequence of characters that does not include a closing parenthesis. The name is not processed in any way, and it is not possible to include a closing parenthesis in the name. However, if the PCRE2_ALT_VERBNAMES option is set, normal backslash processing is applied to verb names and only an unescaped closing parenthesis terminates the name. A closing parenthesis can be included in a name either as \e) or between \eQ and \eE. If the PCRE2_EXTENDED or PCRE2_EXTENDED_MORE option is set with PCRE2_ALT_VERBNAMES, unescaped white space in verb names is skipped and #-comments are recognized, exactly as in the rest of the pattern. .sp PCRE2_AUTO_CALLOUT .sp If this bit is set, \fBpcre2_compile()\fP automatically inserts callout items, all with number 255, before each pattern item, except immediately before or after an explicit callout in the pattern. For discussion of the callout facility, see the .\" HREF \fBpcre2callout\fP .\" documentation. 
.sp PCRE2_CASELESS .sp If this bit is set, letters in the pattern match both upper and lower case letters in the subject. It is equivalent to Perl's /i option, and it can be changed within a pattern by a (?i) option setting. If either PCRE2_UTF or PCRE2_UCP is set, Unicode properties are used for all characters with more than one other case, and for all characters whose code points are greater than U+007F. .P Note that there are two ASCII characters, K and S, that, in addition to their lower case ASCII equivalents, are case-equivalent with U+212A (Kelvin sign) and U+017F (long S) respectively. If you do not want this case equivalence, you can suppress it by setting PCRE2_EXTRA_CASELESS_RESTRICT. .P One language family, Turkish and Azeri, has its own case-insensitivity rules, which can be selected by setting PCRE2_EXTRA_TURKISH_CASING. This alters the behaviour of the 'i', 'I', U+0130 (capital I with dot above), and U+0131 (small dotless i) characters. .P For lower valued characters with only one other case, a lookup table is used for speed. When neither PCRE2_UTF nor PCRE2_UCP is set, a lookup table is used for all code points less than 256, and higher code points (available only in 16-bit or 32-bit mode) are treated as not having another case. .P From release 10.45 PCRE2_CASELESS also affects what some of the letter-related Unicode property escapes (\ep and \eP) match. The properties Lu (upper case letter), Ll (lower case letter), and Lt (title case letter) are all treated as LC (cased letter) when PCRE2_CASELESS is set. .sp PCRE2_DOLLAR_ENDONLY .sp If this bit is set, a dollar metacharacter in the pattern matches only at the end of the subject string. Without this option, a dollar also matches immediately before a newline at the end of the string (but not before any other newlines). The PCRE2_DOLLAR_ENDONLY option is ignored if PCRE2_MULTILINE is set. There is no equivalent to this option in Perl, and no way to set it within a pattern. 
.sp PCRE2_DOTALL .sp If this bit is set, a dot metacharacter in the pattern matches any character, including one that indicates a newline. However, it only ever matches one character, even if newlines are coded as CRLF. Without this option, a dot does not match when the current position in the subject is at a newline. This option is equivalent to Perl's /s option, and it can be changed within a pattern by a (?s) option setting. A negative class such as [^a] always matches newline characters, and the \eN escape sequence always matches a non-newline character, independent of the setting of PCRE2_DOTALL. .sp PCRE2_DUPNAMES .sp If this bit is set, names used to identify capture groups need not be unique. This can be helpful for certain types of pattern when it is known that only one instance of the named group can ever be matched. There are more details of named capture groups below; see also the .\" HREF \fBpcre2pattern\fP .\" documentation. .sp PCRE2_ENDANCHORED .sp If this bit is set, the end of any pattern match must be right at the end of the string being searched (the "subject string"). If the pattern match succeeds by reaching (*ACCEPT), but does not reach the end of the subject, the match fails at the current starting point. For unanchored patterns, a new match is then tried at the next starting point. However, if the match succeeds by reaching the end of the pattern, but not the end of the subject, backtracking occurs and an alternative match may be found. Consider these two patterns: .sp .(*ACCEPT)|.. .|.. .sp If matched against "abc" with PCRE2_ENDANCHORED set, the first matches "c" whereas the second matches "bc". The effect of PCRE2_ENDANCHORED can also be achieved by appropriate constructs in the pattern itself, which is the only way to do it in Perl. .P For DFA matching with \fBpcre2_dfa_match()\fP, PCRE2_ENDANCHORED applies only to the first (that is, the longest) matched string. 
Other parallel matches, which are necessarily substrings of the first one, must obviously end before the end of the subject. .sp PCRE2_EXTENDED .sp If this bit is set, most white space characters in the pattern are totally ignored except when escaped, inside a character class, or inside a \eQ...\eE sequence. However, white space is not allowed within sequences such as (?> that introduce various parenthesized groups, nor within numerical quantifiers such as {1,3}. Ignorable white space is permitted between an item and a following quantifier and between a quantifier and a following + that indicates possessiveness. PCRE2_EXTENDED is equivalent to Perl's /x option, and it can be changed within a pattern by a (?x) option setting. .P When PCRE2 is compiled without Unicode support, PCRE2_EXTENDED recognizes as white space only those characters with code points less than 256 that are flagged as white space in its low-character table. The table is normally created by .\" HREF \fBpcre2_maketables()\fP, .\" which uses the \fBisspace()\fP function to identify space characters. In most ASCII environments, the relevant characters are those with code points 0x0009 (tab), 0x000A (linefeed), 0x000B (vertical tab), 0x000C (formfeed), 0x000D (carriage return), and 0x0020 (space). .P When PCRE2 is compiled with Unicode support, in addition to these characters, five more Unicode "Pattern White Space" characters are recognized by PCRE2_EXTENDED. These are U+0085 (next line), U+200E (left-to-right mark), U+200F (right-to-left mark), U+2028 (line separator), and U+2029 (paragraph separator). This set of characters is the same as recognized by Perl's /x option. Note that the horizontal and vertical space characters that are matched by the \eh and \ev escapes in patterns are a much bigger set. 
.P As well as ignoring most white space, PCRE2_EXTENDED also causes characters between an unescaped # outside a character class and the next newline, inclusive, to be ignored, which makes it possible to include comments inside complicated patterns. Note that the end of this type of comment is a literal newline sequence in the pattern; escape sequences that happen to represent a newline do not count. .P Which characters are interpreted as newlines can be specified by a setting in the compile context that is passed to \fBpcre2_compile()\fP or by a special sequence at the start of the pattern, as described in the section entitled .\" HTML .\" "Newline conventions" .\" in the \fBpcre2pattern\fP documentation. A default is defined when PCRE2 is built. .sp PCRE2_EXTENDED_MORE .sp This option has the effect of PCRE2_EXTENDED, but, in addition, unescaped space and horizontal tab characters are ignored inside a character class. Note: only these two characters are ignored, not the full set of pattern white space characters that are ignored outside a character class. PCRE2_EXTENDED_MORE is equivalent to Perl's /xx option, and it can be changed within a pattern by a (?xx) option setting. .sp PCRE2_FIRSTLINE .sp If this option is set, the start of an unanchored pattern match must be before or at the first newline in the subject string following the start of matching, though the matched text may continue over the newline. If \fIstartoffset\fP is non-zero, the limiting newline is not necessarily the first newline in the subject. For example, if the subject string is "abc\enxyz" (where \en represents a single-character newline) a pattern match for "yz" succeeds with PCRE2_FIRSTLINE if \fIstartoffset\fP is greater than 3. See also PCRE2_USE_OFFSET_LIMIT, which provides a more general limiting facility. If PCRE2_FIRSTLINE is set with an offset limit, a match must occur in the first line and also within the offset limit. In other words, whichever limit comes first is used. 
This option has no effect for anchored patterns. .sp PCRE2_LITERAL .sp If this option is set, all meta-characters in the pattern are disabled, and it is treated as a literal string. Matching literal strings with a regular expression engine is not the most efficient way of doing it. If you are doing a lot of literal matching and are worried about efficiency, you should consider using other approaches. The only other main options that are allowed with PCRE2_LITERAL are: PCRE2_ANCHORED, PCRE2_ENDANCHORED, PCRE2_AUTO_CALLOUT, PCRE2_CASELESS, PCRE2_FIRSTLINE, PCRE2_MATCH_INVALID_UTF, PCRE2_NO_START_OPTIMIZE, PCRE2_NO_UTF_CHECK, PCRE2_UTF, and PCRE2_USE_OFFSET_LIMIT. The extra options PCRE2_EXTRA_MATCH_LINE and PCRE2_EXTRA_MATCH_WORD are also supported. Any other options cause an error. .sp PCRE2_MATCH_INVALID_UTF .sp This option forces PCRE2_UTF (see below) and also enables support for matching by \fBpcre2_match()\fP in subject strings that contain invalid UTF sequences. Note, however, that the 16-bit and 32-bit PCRE2 libraries process strings as sequences of uint16_t or uint32_t code points. They cannot find valid UTF sequences within an arbitrary string of bytes unless such sequences are suitably aligned. This facility is not supported for DFA matching. For details, see the .\" HREF \fBpcre2unicode\fP .\" documentation. .sp PCRE2_MATCH_UNSET_BACKREF .sp If this option is set, a backreference to an unset capture group matches an empty string (by default this causes the current matching alternative to fail). A pattern such as (\e1)(a) succeeds when this option is set (assuming it can find an "a" in the subject), whereas it fails by default, for Perl compatibility. Setting this option makes PCRE2 behave more like ECMAscript (aka JavaScript). .sp PCRE2_MULTILINE .sp By default, for the purposes of matching "start of line" and "end of line", PCRE2 treats the subject string as consisting of a single line of characters, even if it actually contains newlines. 
The "start of line" metacharacter (^) matches only at the start of the string, and the "end of line" metacharacter ($) matches only at the end of the string, or before a terminating newline (except when PCRE2_DOLLAR_ENDONLY is set). Note, however, that unless PCRE2_DOTALL is set, the "any character" metacharacter (.) does not match at a newline. This behaviour (for ^, $, and dot) is the same as Perl. .P When PCRE2_MULTILINE is set, the "start of line" and "end of line" constructs match immediately following or immediately before internal newlines in the subject string, respectively, as well as at the very start and end. This is equivalent to Perl's /m option, and it can be changed within a pattern by a (?m) option setting. Note that the "start of line" metacharacter does not match after a newline at the end of the subject, for compatibility with Perl. However, you can change this by setting the PCRE2_ALT_CIRCUMFLEX option. If there are no newlines in a subject string, or no occurrences of ^ or $ in a pattern, setting PCRE2_MULTILINE has no effect. .sp PCRE2_NEVER_BACKSLASH_C .sp This option locks out the use of \eC in the pattern that is being compiled. This escape can cause unpredictable behaviour in UTF-8 or UTF-16 modes, because it may leave the current matching point in the middle of a multi-code-unit character. This option may be useful in applications that process patterns from external sources. Note that there is also a build-time option that permanently locks out the use of \eC. .sp PCRE2_NEVER_UCP .sp This option locks out the use of Unicode properties for handling \eB, \eb, \eD, \ed, \eS, \es, \eW, \ew, and some of the POSIX character classes, as described for the PCRE2_UCP option below. In particular, it prevents the creator of the pattern from enabling this facility by starting the pattern with (*UCP). This option may be useful in applications that process patterns from external sources.
The option combination PCRE2_UCP and PCRE2_NEVER_UCP causes an error. .sp PCRE2_NEVER_UTF .sp This option locks out interpretation of the pattern as UTF-8, UTF-16, or UTF-32, depending on which library is in use. In particular, it prevents the creator of the pattern from switching to UTF interpretation by starting the pattern with (*UTF). This option may be useful in applications that process patterns from external sources. The combination of PCRE2_UTF and PCRE2_NEVER_UTF causes an error. .sp PCRE2_NO_AUTO_CAPTURE .sp If this option is set, it disables the use of numbered capturing parentheses in the pattern. Any opening parenthesis that is not followed by ? behaves as if it were followed by ?: but named parentheses can still be used for capturing (and they acquire numbers in the usual way). This is the same as Perl's /n option. Note that, when this option is set, references to capture groups (backreferences or recursion/subroutine calls) may only refer to named groups, though the reference can be by name or by number. .sp PCRE2_NO_AUTO_POSSESS .sp If this (deprecated) option is set, it disables "auto-possessification", which is an optimization that, for example, turns a+b into a++b in order to avoid backtracks into a+ that can never be successful. However, if callouts are in use, auto-possessification means that some callouts are never taken. You can set this option if you want the matching functions to do a full unoptimized search and run all the callouts, but it is mainly provided for testing purposes. .P If a compile context is available, it is recommended to use \fBpcre2_set_optimize()\fP with the \fIdirective\fP PCRE2_AUTO_POSSESS_OFF rather than the compile option PCRE2_NO_AUTO_POSSESS. Note that PCRE2_NO_AUTO_POSSESS takes precedence over the \fBpcre2_set_optimize()\fP optimization directives PCRE2_AUTO_POSSESS and PCRE2_AUTO_POSSESS_OFF. 
.sp PCRE2_NO_DOTSTAR_ANCHOR .sp If this (deprecated) option is set, it disables an optimization that is applied when .* is the first significant item in a top-level branch of a pattern, and all the other branches also start with .* or with \eA or \eG or ^. The optimization is automatically disabled for .* if it is inside an atomic group or a capture group that is the subject of a backreference, or if the pattern contains (*PRUNE) or (*SKIP). When the optimization is not disabled, such a pattern is automatically anchored if PCRE2_DOTALL is set for all the .* items and PCRE2_MULTILINE is not set for any ^ items. Otherwise, the fact that any match must start either at the start of the subject or following a newline is remembered. Like other optimizations, this can cause callouts to be skipped. (If a compile context is available, it is recommended to use \fBpcre2_set_optimize()\fP with the \fIdirective\fP PCRE2_DOTSTAR_ANCHOR_OFF instead.) .sp PCRE2_NO_START_OPTIMIZE .sp This is an option whose main effect is at matching time. It does not change what \fBpcre2_compile()\fP generates, but it does affect the output of the JIT compiler. Setting this option is equivalent to calling \fBpcre2_set_optimize()\fP with the \fIdirective\fP parameter set to PCRE2_START_OPTIMIZE_OFF. .P There are a number of optimizations that may occur at the start of a match, in order to speed up the process. For example, if it is known that an unanchored match must start with a specific code unit value, the matching code searches the subject for that value, and fails immediately if it cannot find it, without actually running the main matching function. The start-up optimizations are in effect a pre-scan of the subject that takes place before the pattern is run. .P Disabling the start-up optimizations may cause performance to suffer. However, this may be desirable for patterns which contain callouts or items such as (*COMMIT) and (*MARK). 
See the above description of PCRE2_START_OPTIMIZE_OFF for further details. .sp PCRE2_NO_UTF_CHECK .sp When PCRE2_UTF is set, the validity of the pattern as a UTF string is automatically checked. There are discussions about the validity of .\" HTML .\" UTF-8 strings, .\" .\" HTML .\" UTF-16 strings, .\" and .\" HTML .\" UTF-32 strings .\" in the .\" HREF \fBpcre2unicode\fP .\" document. If an invalid UTF sequence is found, \fBpcre2_compile()\fP returns a negative error code. .P If you know that your pattern is a valid UTF string, and you want to skip this check for performance reasons, you can set the PCRE2_NO_UTF_CHECK option. When it is set, the effect of passing an invalid UTF string as a pattern is undefined. It may cause your program to crash or loop. .P Note that this option can also be passed to \fBpcre2_match()\fP and \fBpcre2_dfa_match()\fP, to suppress UTF validity checking of the subject string. .P Note also that setting PCRE2_NO_UTF_CHECK at compile time does not disable the error that is given if an escape sequence for an invalid Unicode code point is encountered in the pattern. In particular, the so-called "surrogate" code points (0xd800 to 0xdfff) are invalid. If you want to allow escape sequences such as \ex{d800} you can set the PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES extra option, as described in the section entitled "Extra compile options" .\" HTML .\" below. .\" However, this is possible only in UTF-8 and UTF-32 modes, because these values are not representable in UTF-16. .sp PCRE2_UCP .sp This option has two effects. Firstly, it changes the way PCRE2 processes \eB, \eb, \eD, \ed, \eS, \es, \eW, \ew, and some of the POSIX character classes. By default, only ASCII characters are recognized, but if PCRE2_UCP is set, Unicode properties are used to classify characters. There are some PCRE2_EXTRA options (see below) that add finer control to this behaviour.
More details are given in the section on .\" HTML .\" generic character types .\" in the .\" HREF \fBpcre2pattern\fP .\" page. .P The second effect of PCRE2_UCP is to force the use of Unicode properties for upper/lower casing operations, even when PCRE2_UTF is not set. This makes it possible to process strings in the 16-bit UCS-2 code. This option is available only if PCRE2 has been compiled with Unicode support (which is the default). .P The PCRE2_EXTRA_CASELESS_RESTRICT option (see above) restricts caseless matching such that ASCII characters match only ASCII characters and non-ASCII characters match only non-ASCII characters. The PCRE2_EXTRA_TURKISH_CASING option (see above) alters the matching of the 'i' characters to follow their behaviour in Turkish and Azeri languages. For further details on PCRE2_EXTRA_CASELESS_RESTRICT and PCRE2_EXTRA_TURKISH_CASING, see the .\" HREF \fBpcre2unicode\fP .\" page. .sp PCRE2_UNGREEDY .sp This option inverts the "greediness" of the quantifiers so that they are not greedy by default, but become greedy if followed by "?". It is not compatible with Perl. It can also be set by a (?U) option setting within the pattern. .sp PCRE2_USE_OFFSET_LIMIT .sp This option must be set for \fBpcre2_compile()\fP if \fBpcre2_set_offset_limit()\fP is going to be used to set a non-default offset limit in a match context for matches that use this pattern. An error is generated if an offset limit is set without this option. For more details, see the description of \fBpcre2_set_offset_limit()\fP in the .\" HTML .\" section .\" that describes match contexts. See also the PCRE2_FIRSTLINE option above. .sp PCRE2_UTF .sp This option causes PCRE2 to regard both the pattern and the subject strings that are subsequently processed as strings of UTF characters instead of single-code-unit strings. It is available when PCRE2 is built to include Unicode support (which is the default). If Unicode support is not available, the use of this option provokes an error. 
Details of how PCRE2_UTF changes the behaviour of PCRE2 are given in the .\" HREF \fBpcre2unicode\fP .\" page. In particular, note that it changes the way PCRE2_CASELESS works. . . .\" HTML .SS "Extra compile options" .rs .sp The option bits that can be set in a compile context by calling the \fBpcre2_set_compile_extra_options()\fP function are as follows: .sp PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK .sp Since release 10.38 PCRE2 has forbidden the use of \eK within lookaround assertions, following Perl's lead. This option is provided to re-enable the previous behaviour (act in positive lookarounds, ignore in negative ones) in case anybody is relying on it. .sp PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES .sp This option applies when compiling a pattern in UTF-8 or UTF-32 mode. It is forbidden in UTF-16 mode, and ignored in non-UTF modes. Unicode "surrogate" code points in the range 0xd800 to 0xdfff are used in pairs in UTF-16 to encode code points with values in the range 0x10000 to 0x10ffff. The surrogates cannot therefore be represented in UTF-16. They can be represented in UTF-8 and UTF-32, but are defined as invalid code points, and cause errors if encountered in a UTF-8 or UTF-32 string that is being checked for validity by PCRE2. .P These values also cause errors if encountered in escape sequences such as \ex{d912} within a pattern. However, it seems that some applications, when using PCRE2 to check for unwanted characters in UTF-8 strings, explicitly test for the surrogates using escape sequences. The PCRE2_NO_UTF_CHECK option does not disable the error that occurs, because it applies only to the testing of input strings for UTF validity. .P If the extra option PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is set, surrogate code point values in UTF-8 and UTF-32 patterns no longer provoke errors and are incorporated in the compiled pattern. However, they can only match subject characters if the matching function is called with PCRE2_NO_UTF_CHECK set. 
.sp PCRE2_EXTRA_ALT_BSUX .sp The original option PCRE2_ALT_BSUX causes PCRE2 to process \eU, \eu, and \ex in the way that ECMAscript (aka JavaScript) does. Additional functionality was defined by ECMAscript 6; setting PCRE2_EXTRA_ALT_BSUX has the effect of PCRE2_ALT_BSUX, but in addition it recognizes \eu{hhh..} as a hexadecimal character code, where hhh.. is any number of hexadecimal digits. .sp PCRE2_EXTRA_ASCII_BSD .sp This option forces \ed to match only ASCII digits, even when PCRE2_UCP is set. It can be changed within a pattern by means of the (?aD) option setting. .sp PCRE2_EXTRA_ASCII_BSS .sp This option forces \es to match only ASCII space characters, even when PCRE2_UCP is set. It can be changed within a pattern by means of the (?aS) option setting. .sp PCRE2_EXTRA_ASCII_BSW .sp This option forces \ew to match only ASCII word characters, even when PCRE2_UCP is set. It can be changed within a pattern by means of the (?aW) option setting. .sp PCRE2_EXTRA_ASCII_DIGIT .sp This option forces the POSIX character classes [:digit:] and [:xdigit:] to match only ASCII digits, even when PCRE2_UCP is set. It can be changed within a pattern by means of the (?aT) option setting. .sp PCRE2_EXTRA_ASCII_POSIX .sp This option forces all the POSIX character classes, including [:digit:] and [:xdigit:], to match only ASCII characters, even when PCRE2_UCP is set. It can be changed within a pattern by means of the (?aP) option setting, but note that this also sets PCRE2_EXTRA_ASCII_DIGIT in order to ensure that (?-aP) unsets all ASCII restrictions for POSIX classes. .sp PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL .sp This is a dangerous option. Use with care. By default, an unrecognized escape such as \ej or a malformed one such as \ex{2z} causes a compile-time error when detected by \fBpcre2_compile()\fP. 
Perl is somewhat inconsistent in handling such items: for example, \ej is treated as a literal "j", and non-hexadecimal digits in \ex{} are just ignored, though warnings are given in both cases if Perl's warning switch is enabled. However, a malformed octal number after \eo{ always causes an error in Perl. .P If the PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL extra option is passed to \fBpcre2_compile()\fP, all unrecognized or malformed escape sequences are treated as single-character escapes. For example, \ej is a literal "j" and \ex{2z} is treated as the literal string "x{2z}". Setting this option means that typos in patterns may go undetected and have unexpected results. Also note that a sequence such as [\eN{] is interpreted as a malformed attempt at [\eN{...}] and so is treated as [N{] whereas [\eN] gives an error because an unqualified \eN is a valid escape sequence but is not supported in a character class. To reiterate: this is a dangerous option. Use with great care. .sp PCRE2_EXTRA_CASELESS_RESTRICT .sp When either PCRE2_UCP or PCRE2_UTF is set, caseless matching follows Unicode rules, which allow for more than two cases per character. There are two case-equivalent character sets that contain both ASCII and non-ASCII characters. The ASCII letter S is case-equivalent to U+017f (long S) and the ASCII letter K is case-equivalent to U+212a (Kelvin sign). This option disables recognition of case-equivalences that cross the ASCII/non-ASCII boundary. In a caseless match, both characters must either be ASCII or non-ASCII. The option can be changed within a pattern by the (*CASELESS_RESTRICT) or (?r) option settings. .sp PCRE2_EXTRA_ESCAPED_CR_IS_LF .sp There are some legacy applications where the escape sequence \er in a pattern is expected to match a newline. If this option is set, \er in a pattern is converted to \en so that it matches a LF (linefeed) instead of a CR (carriage return) character. 
The option does not affect a literal CR in the pattern, nor does it affect CR specified as an explicit code point such as \ex{0D}. .sp PCRE2_EXTRA_MATCH_LINE .sp This option is provided for use by the \fB-x\fP option of \fBpcre2grep\fP. It causes the pattern only to match complete lines. This is achieved by automatically inserting the code for "^(?:" at the start of the compiled pattern and ")$" at the end. Thus, when PCRE2_MULTILINE is set, the matched line may be in the middle of the subject string. This option can be used with PCRE2_LITERAL. .sp PCRE2_EXTRA_MATCH_WORD .sp This option is provided for use by the \fB-w\fP option of \fBpcre2grep\fP. It causes the pattern only to match strings that have a word boundary at the start and the end. This is achieved by automatically inserting the code for "\eb(?:" at the start of the compiled pattern and ")\eb" at the end. The option may be used with PCRE2_LITERAL. However, it is ignored if PCRE2_EXTRA_MATCH_LINE is also set. .sp PCRE2_EXTRA_NO_BS0 .sp If this option is set (note that its final character is the digit 0) it locks out the use of the sequence \e0 unless at least one more octal digit follows. .sp PCRE2_EXTRA_PYTHON_OCTAL .sp If this option is set, PCRE2 follows Python's rules for interpreting octal escape sequences. The rules for handling sequences such as \e14, which could be an octal number or a back reference, are different. Details are given in the .\" HREF \fBpcre2pattern\fP .\" documentation. .sp PCRE2_EXTRA_NEVER_CALLOUT .sp If this option is set, PCRE2 treats callouts in the pattern as a syntax error, returning PCRE2_ERROR_CALLOUT_CALLER_DISABLED. This is useful if the application knows that a callout will not be provided to \fBpcre2_match()\fP, so that callouts in the pattern are not silently ignored. .sp PCRE2_EXTRA_TURKISH_CASING .sp This option alters case-equivalence of the 'i' letters to follow the alphabet used by Turkish and Azeri languages.
The option can be changed within a pattern by the (*TURKISH_CASING) start-of-pattern setting. Either the UTF or UCP options must be set. In the 8-bit library, UTF must be set. This option cannot be combined with PCRE2_EXTRA_CASELESS_RESTRICT. . . .\" HTML .SH "JUST-IN-TIME (JIT) COMPILATION" .rs .sp .nf .B int pcre2_jit_compile(pcre2_code *\fIcode\fP, uint32_t \fIoptions\fP); .sp .B int pcre2_jit_match(const pcre2_code *\fIcode\fP, PCRE2_SPTR \fIsubject\fP, .B " PCRE2_SIZE \fIlength\fP, PCRE2_SIZE \fIstartoffset\fP," .B " uint32_t \fIoptions\fP, pcre2_match_data *\fImatch_data\fP," .B " pcre2_match_context *\fImcontext\fP);" .sp .B void pcre2_jit_free_unused_memory(pcre2_general_context *\fIgcontext\fP); .sp .B pcre2_jit_stack *pcre2_jit_stack_create(size_t \fIstartsize\fP, .B " size_t \fImaxsize\fP, pcre2_general_context *\fIgcontext\fP);" .sp .B void pcre2_jit_stack_assign(pcre2_match_context *\fImcontext\fP, .B " pcre2_jit_callback \fIcallback_function\fP, void *\fIcallback_data\fP);" .sp .B void pcre2_jit_stack_free(pcre2_jit_stack *\fIjit_stack\fP); .fi .P These functions provide support for JIT compilation, which, if the just-in-time compiler is available, further processes a compiled pattern into machine code that executes much faster than the \fBpcre2_match()\fP interpretive matching function. Full details are given in the .\" HREF \fBpcre2jit\fP .\" documentation. .P JIT compilation is a heavyweight optimization. It can take some time for patterns to be analyzed, and for one-off matches and simple patterns the benefit of faster execution might be offset by a much slower compilation time. Most (but not all) patterns can be optimized by the JIT compiler. . . 
.\" HTML .SH "LOCALE SUPPORT" .rs .sp .nf .B const uint8_t *pcre2_maketables(pcre2_general_context *\fIgcontext\fP); .sp .B void pcre2_maketables_free(pcre2_general_context *\fIgcontext\fP, .B " const uint8_t *\fItables\fP);" .fi .P PCRE2 handles caseless matching, and determines whether characters are letters, digits, or whatever, by reference to a set of tables, indexed by character code point. However, this applies only to characters whose code points are less than 256. By default, higher-valued code points never match escapes such as \ew or \ed. .P When PCRE2 is built with Unicode support (the default), certain Unicode character properties can be tested with \ep and \eP, or, alternatively, the PCRE2_UCP option can be set when a pattern is compiled; this causes \ew and friends to use Unicode property support instead of the built-in tables. PCRE2_UCP also causes upper/lower casing operations on characters with code points greater than 127 to use Unicode properties. These effects apply even when PCRE2_UTF is not set. There are, however, some PCRE2_EXTRA options (see above) that can be used to modify or suppress them. .P The use of locales with Unicode is discouraged. If you are handling characters with code points greater than 127, you should either use Unicode support, or use locales, but not try to mix the two. .P PCRE2 contains a built-in set of character tables that are used by default. These are sufficient for many applications. Normally, the internal tables recognize only ASCII characters. However, when PCRE2 is built, it is possible to cause the internal tables to be rebuilt in the default "C" locale of the local system, which may cause them to be different. .P The built-in tables can be overridden by tables supplied by the application that calls PCRE2. These may be created in a different locale from the default. As more and more applications change to using Unicode, the need for this locale support is expected to die away. 
.P External tables are built by calling the \fBpcre2_maketables()\fP function, in the relevant locale. The only argument to this function is a general context, which can be used to pass a custom memory allocator. If the argument is NULL, the system \fBmalloc()\fP is used. The result can be passed to \fBpcre2_compile()\fP as often as necessary, by creating a compile context and calling \fBpcre2_set_character_tables()\fP to set the tables pointer therein. .P For example, to build and use tables that are appropriate for the French locale (where accented characters with values greater than 127 are treated as letters), the following code could be used: .sp setlocale(LC_CTYPE, "fr_FR"); tables = pcre2_maketables(NULL); ccontext = pcre2_compile_context_create(NULL); pcre2_set_character_tables(ccontext, tables); re = pcre2_compile(..., ccontext); .sp The locale name "fr_FR" is used on Linux and other Unix-like systems; if you are using Windows, the name for the French locale is "french". .P The pointer that is passed (via the compile context) to \fBpcre2_compile()\fP is saved with the compiled pattern, and the same tables are used by the matching functions. Thus, for any single pattern, compilation and matching both happen in the same locale, but different patterns can be processed in different locales. .P It is the caller's responsibility to ensure that the memory containing the tables remains available while they are still in use. When they are no longer needed, you can discard them using \fBpcre2_maketables_free()\fP, which should pass as its first parameter the same general context that was used to create the tables. . . .SS "Saving locale tables" .rs .sp The tables described above are just a sequence of binary bytes, which makes them independent of hardware characteristics such as endianness or whether the processor is 32-bit or 64-bit.
A copy of the result of \fBpcre2_maketables()\fP can therefore be saved in a file or elsewhere and re-used later, even in a different program or on another computer. The size of the tables (number of bytes) must be obtained by calling \fBpcre2_config()\fP with the PCRE2_CONFIG_TABLES_LENGTH option because \fBpcre2_maketables()\fP does not return this value. Note that the \fBpcre2_dftables\fP program, which is part of the PCRE2 build system, can be used stand-alone to create a file that contains a set of binary tables. See the .\" HTML .\" \fBpcre2build\fP .\" documentation for details. . . .\" HTML .SH "INFORMATION ABOUT A COMPILED PATTERN" .rs .sp .nf .B int pcre2_pattern_info(const pcre2_code *\fIcode\fP, uint32_t \fIwhat\fP, void *\fIwhere\fP); .fi .P The \fBpcre2_pattern_info()\fP function returns general information about a compiled pattern. For information about callouts, see the .\" HTML .\" next section. .\" The first argument for \fBpcre2_pattern_info()\fP is a pointer to the compiled pattern. The second argument specifies which piece of information is required, and the third argument is a pointer to a variable to receive the data. If the third argument is NULL, the first argument is ignored, and the function returns the size in bytes of the variable that is required for the information requested. Otherwise, the yield of the function is zero for success, or one of the following negative numbers: .sp PCRE2_ERROR_NULL the argument \fIcode\fP was NULL PCRE2_ERROR_BADMAGIC the "magic number" was not found PCRE2_ERROR_BADOPTION the value of \fIwhat\fP was invalid PCRE2_ERROR_UNSET the requested field is not set .sp The "magic number" is placed at the start of each compiled pattern as a simple check against passing an arbitrary memory pointer.
Here is a typical call of \fBpcre2_pattern_info()\fP, to obtain the length of the compiled pattern: .sp int rc; size_t length; rc = pcre2_pattern_info( re, /* result of pcre2_compile() */ PCRE2_INFO_SIZE, /* what is required */ &length); /* where to put the data */ .sp The possible values for the second argument are defined in \fBpcre2.h\fP, and are as follows: .sp PCRE2_INFO_ALLOPTIONS PCRE2_INFO_ARGOPTIONS PCRE2_INFO_EXTRAOPTIONS .sp Return copies of the pattern's options. The third argument should point to a \fBuint32_t\fP variable. PCRE2_INFO_ARGOPTIONS returns exactly the options that were passed to \fBpcre2_compile()\fP, whereas PCRE2_INFO_ALLOPTIONS returns the compile options as modified by any top-level (*XXX) option settings such as (*UTF) at the start of the pattern itself. PCRE2_INFO_EXTRAOPTIONS returns the extra options that were set in the compile context by calling the pcre2_set_compile_extra_options() function. .P For example, if the pattern /(*UTF)abc/ is compiled with the PCRE2_EXTENDED option, the result for PCRE2_INFO_ALLOPTIONS is PCRE2_EXTENDED and PCRE2_UTF. Option settings such as (?i) that can change within a pattern do not affect the result of PCRE2_INFO_ALLOPTIONS, even if they appear right at the start of the pattern. (This was different in some earlier releases.) 
.P A pattern compiled without PCRE2_ANCHORED is automatically anchored by PCRE2 if the first significant item in every top-level branch is one of the following: .sp ^ unless PCRE2_MULTILINE is set \eA always \eG always .* sometimes - see below .sp When .* is the first significant item, anchoring is possible only when all the following are true: .sp .* is not in an atomic group .\" JOIN .* is not in a capture group that is the subject of a backreference PCRE2_DOTALL is in force for .* Neither (*PRUNE) nor (*SKIP) appears in the pattern PCRE2_NO_DOTSTAR_ANCHOR is not set Dotstar anchoring has not been disabled with PCRE2_DOTSTAR_ANCHOR_OFF .sp For patterns that are auto-anchored, the PCRE2_ANCHORED bit is set in the options returned for PCRE2_INFO_ALLOPTIONS. .sp PCRE2_INFO_BACKREFMAX .sp Return the number of the highest backreference in the pattern. The third argument should point to a \fBuint32_t\fP variable. Named capture groups acquire numbers as well as names, and these count towards the highest backreference. Backreferences such as \e4 or \eg{12} match the captured characters of the given group, but in addition, the check that a capture group is set in a conditional group such as (?(3)a|b) is also a backreference. Zero is returned if there are no backreferences. .sp PCRE2_INFO_BSR .sp The output is a uint32_t integer whose value indicates what character sequences the \eR escape sequence matches. A value of PCRE2_BSR_UNICODE means that \eR matches any Unicode line ending sequence; a value of PCRE2_BSR_ANYCRLF means that \eR matches only CR, LF, or CRLF. .sp PCRE2_INFO_CAPTURECOUNT .sp Return the highest capture group number in the pattern. In patterns where (?| is not used, this is also the total number of capture groups. The third argument should point to a \fBuint32_t\fP variable. .sp PCRE2_INFO_DEPTHLIMIT .sp If the pattern set a backtracking depth limit by including an item of the form (*LIMIT_DEPTH=nnnn) at the start, the value is returned. 
The third argument should point to a uint32_t integer. If no such value has been set, the call to \fBpcre2_pattern_info()\fP returns the error PCRE2_ERROR_UNSET. Note that this limit will only be used during matching if it is less than the limit set or defaulted by the caller of the match function. .sp PCRE2_INFO_FIRSTBITMAP .sp In the absence of a single first code unit for a non-anchored pattern, \fBpcre2_compile()\fP may construct a 256-bit table that defines a fixed set of values for the first code unit in any match. For example, a pattern that starts with [abc] results in a table with three bits set. When code unit values greater than 255 are supported, the flag bit for 255 means "any code unit of value 255 or above". If such a table was constructed, a pointer to it is returned. Otherwise NULL is returned. The third argument should point to a \fBconst uint8_t *\fP variable. .sp PCRE2_INFO_FIRSTCODETYPE .sp Return information about the first code unit of any matched string, for a non-anchored pattern. The third argument should point to a \fBuint32_t\fP variable. If there is a fixed first value, for example, the letter "c" from a pattern such as (cat|cow|coyote), 1 is returned, and the value can be retrieved using PCRE2_INFO_FIRSTCODEUNIT. If there is no fixed first value, but it is known that a match can occur only at the start of the subject or following a newline in the subject, 2 is returned. Otherwise, and for anchored patterns, 0 is returned. .sp PCRE2_INFO_FIRSTCODEUNIT .sp Return the value of the first code unit of any matched string for a pattern where PCRE2_INFO_FIRSTCODETYPE returns 1; otherwise return 0. The third argument should point to a \fBuint32_t\fP variable. In the 8-bit library, the value is always less than 256. In the 16-bit library the value can be up to 0xffff. In the 32-bit library in UTF-32 mode the value can be up to 0x10ffff, and up to 0xffffffff when not using UTF-32 mode. 
.sp PCRE2_INFO_FRAMESIZE .sp Return the size (in bytes) of the data frames that are used to remember backtracking positions when the pattern is processed by \fBpcre2_match()\fP without the use of JIT. The third argument should point to a \fBsize_t\fP variable. The frame size depends on the number of capturing parentheses in the pattern. Each additional capture group adds two PCRE2_SIZE variables. .sp PCRE2_INFO_HASBACKSLASHC .sp Return 1 if the pattern contains any instances of \eC, otherwise 0. The third argument should point to a \fBuint32_t\fP variable. .sp PCRE2_INFO_HASCRORLF .sp Return 1 if the pattern contains any explicit matches for CR or LF characters, otherwise 0. The third argument should point to a \fBuint32_t\fP variable. An explicit match is either a literal CR or LF character, or \er or \en or one of the equivalent hexadecimal or octal escape sequences. .sp PCRE2_INFO_HEAPLIMIT .sp If the pattern set a heap memory limit by including an item of the form (*LIMIT_HEAP=nnnn) at the start, the value is returned. The third argument should point to a uint32_t integer. If no such value has been set, the call to \fBpcre2_pattern_info()\fP returns the error PCRE2_ERROR_UNSET. Note that this limit will only be used during matching if it is less than the limit set or defaulted by the caller of the match function. .sp PCRE2_INFO_JCHANGED .sp Return 1 if the (?J) or (?-J) option setting is used in the pattern, otherwise 0. The third argument should point to a \fBuint32_t\fP variable. (?J) and (?-J) set and unset the local PCRE2_DUPNAMES option, respectively. .sp PCRE2_INFO_JITSIZE .sp If the compiled pattern was successfully processed by \fBpcre2_jit_compile()\fP, return the size of the JIT compiled code, otherwise return zero. The third argument should point to a \fBsize_t\fP variable. .sp PCRE2_INFO_LASTCODETYPE .sp Returns 1 if there is a rightmost literal code unit that must exist in any matched string, other than at its start. 
The third argument should point to a \fBuint32_t\fP variable. If there is no such value, 0 is returned. When 1 is returned, the code unit value itself can be retrieved using PCRE2_INFO_LASTCODEUNIT. For anchored patterns, a last literal value is recorded only if it follows something of variable length. For example, for the pattern /^a\ed+z\ed+/ the returned value is 1 (with "z" returned from PCRE2_INFO_LASTCODEUNIT), but for /^a\edz\ed/ the returned value is 0. .sp PCRE2_INFO_LASTCODEUNIT .sp Return the value of the rightmost literal code unit that must exist in any matched string, other than at its start, for a pattern where PCRE2_INFO_LASTCODETYPE returns 1. Otherwise, return 0. The third argument should point to a \fBuint32_t\fP variable. .sp PCRE2_INFO_MATCHEMPTY .sp Return 1 if the pattern might match an empty string, otherwise 0. The third argument should point to a \fBuint32_t\fP variable. When a pattern contains recursive subroutine calls it is not always possible to determine whether or not it can match an empty string. PCRE2 takes a cautious approach and returns 1 in such cases. .sp PCRE2_INFO_MATCHLIMIT .sp If the pattern set a match limit by including an item of the form (*LIMIT_MATCH=nnnn) at the start, the value is returned. The third argument should point to a uint32_t integer. If no such value has been set, the call to \fBpcre2_pattern_info()\fP returns the error PCRE2_ERROR_UNSET. Note that this limit will only be used during matching if it is less than the limit set or defaulted by the caller of the match function. .sp PCRE2_INFO_MAXLOOKBEHIND .sp A lookbehind assertion moves back a certain number of characters (not code units) when it starts to process each of its branches. This request returns the largest of these backward moves. The third argument should point to a uint32_t integer. The simple assertions \eb and \eB require a one-character lookbehind and cause PCRE2_INFO_MAXLOOKBEHIND to return 1 in the absence of anything longer. 
\eA also registers a one-character lookbehind, though it does not actually inspect the previous character. .P Note that this information is useful for multi-segment matching only if the pattern contains no nested lookbehinds. For example, the pattern (?<=a(?<=ba)c) returns a maximum lookbehind of 2, but when it is processed, the first lookbehind moves back by two characters, matches one character, then the nested lookbehind also moves back by two characters. This puts the matching point three characters earlier than it was at the start. PCRE2_INFO_MAXLOOKBEHIND is really only useful as a debugging tool. See the .\" HREF \fBpcre2partial\fP .\" documentation for a discussion of multi-segment matching. .sp PCRE2_INFO_MINLENGTH .sp If a minimum length for matching subject strings was computed, its value is returned. Otherwise the returned value is 0. This value is not computed when PCRE2_NO_START_OPTIMIZE is set. The value is a number of characters, which in UTF mode may be different from the number of code units. The third argument should point to a \fBuint32_t\fP variable. The value is a lower bound to the length of any matching string. There may not be any strings of that length that do actually match, but every string that does match is at least that long. .sp PCRE2_INFO_NAMECOUNT PCRE2_INFO_NAMEENTRYSIZE PCRE2_INFO_NAMETABLE .sp PCRE2 supports the use of named as well as numbered capturing parentheses. The names are just an additional way of identifying the parentheses, which still acquire numbers. Several convenience functions such as \fBpcre2_substring_get_byname()\fP are provided for extracting captured substrings by name. It is also possible to extract the data directly, by first converting the name to a number in order to access the correct pointers in the output vector (described with \fBpcre2_match()\fP below). To do the conversion, you need to use the name-to-number map, which is described by these three values. 
.P The map consists of a number of fixed-size entries. PCRE2_INFO_NAMECOUNT gives the number of entries, and PCRE2_INFO_NAMEENTRYSIZE gives the size of each entry in code units; both of these return a \fBuint32_t\fP value. The entry size depends on the length of the longest name. .P PCRE2_INFO_NAMETABLE returns a pointer to the first entry of the table. This is a PCRE2_SPTR pointer to a block of code units. In the 8-bit library, the first two bytes of each entry are the number of the capturing parenthesis, most significant byte first. In the 16-bit library, the pointer points to 16-bit code units, the first of which contains the parenthesis number. In the 32-bit library, the pointer points to 32-bit code units, the first of which contains the parenthesis number. The rest of the entry is the corresponding name, zero terminated. .P The names are in alphabetical order. If (?| is used to create multiple capture groups with the same number, as described in the .\" HTML .\" section on duplicate group numbers .\" in the .\" HREF \fBpcre2pattern\fP .\" page, the groups may be given the same name, but there is only one entry in the table. Different names for groups of the same number are not permitted. .P Duplicate names for capture groups with different numbers are permitted, but only if PCRE2_DUPNAMES is set. They appear in the table in the order in which they were found in the pattern. In the absence of (?| this is the order of increasing number; when (?| is used this is not necessarily the case because later capture groups may have lower numbers. .P As a simple example of the name/number table, consider the following pattern after compilation by the 8-bit library (assume PCRE2_EXTENDED is set, so white space - including newlines - is ignored): .sp .\" JOIN (?<date> (?<year>(\ed\ed)?\ed\ed) - (?<month>\ed\ed) - (?<day>\ed\ed) ) .sp There are four named capture groups, so the table has four entries, and each entry in the table is eight bytes long.
The table is as follows, with non-printing bytes shown in hexadecimal, and undefined bytes shown as ??: .sp 00 01 d a t e 00 ?? 00 05 d a y 00 ?? ?? 00 04 m o n t h 00 00 02 y e a r 00 ?? .sp When writing code to extract data from named capture groups using the name-to-number map, remember that the length of the entries is likely to be different for each compiled pattern. .sp PCRE2_INFO_NEWLINE .sp The output is one of the following \fBuint32_t\fP values: .sp PCRE2_NEWLINE_CR Carriage return (CR) PCRE2_NEWLINE_LF Linefeed (LF) PCRE2_NEWLINE_CRLF Carriage return, linefeed (CRLF) PCRE2_NEWLINE_ANY Any Unicode line ending PCRE2_NEWLINE_ANYCRLF Any of CR, LF, or CRLF PCRE2_NEWLINE_NUL The NUL character (binary zero) .sp This identifies the character sequence that will be recognized as meaning "newline" while matching. .sp PCRE2_INFO_SIZE .sp Return the size of the compiled pattern in bytes (for all three libraries). The third argument should point to a \fBsize_t\fP variable. This value includes the size of the general data block that precedes the code units of the compiled pattern itself. The value that is used when \fBpcre2_compile()\fP is getting memory in which to place the compiled pattern may be slightly larger than the value returned by this option, because there are cases where the code that calculates the size has to over-estimate. Processing a pattern with the JIT compiler does not alter the value returned by this option. . . .\" HTML .SH "INFORMATION ABOUT A PATTERN'S CALLOUTS" .rs .sp .nf .B int pcre2_callout_enumerate(const pcre2_code *\fIcode\fP, .B " int (*\fIcallback\fP)(pcre2_callout_enumerate_block *, void *)," .B " void *\fIuser_data\fP);" .fi .sp A script language that supports the use of string arguments in callouts might like to scan all the callouts in a pattern before running the match. This can be done by calling \fBpcre2_callout_enumerate()\fP.
The first argument is a pointer to a compiled pattern, the second points to a callback function, and the third is arbitrary user data. The callback function is called for every callout in the pattern in the order in which they appear. Its first argument is a pointer to a callout enumeration block, and its second argument is the \fIuser_data\fP value that was passed to \fBpcre2_callout_enumerate()\fP. The contents of the callout enumeration block are described in the .\" HREF \fBpcre2callout\fP .\" documentation, which also gives further details about callouts. . . .SH "SERIALIZATION AND PRECOMPILING" .rs .sp It is possible to save compiled patterns on disc or elsewhere, and reload them later, subject to a number of restrictions. The host on which the patterns are reloaded must be running the same version of PCRE2, with the same code unit width, and must also have the same endianness, pointer width, and PCRE2_SIZE type. Before compiled patterns can be saved, they must be converted to a "serialized" form, which in the case of PCRE2 is really just a bytecode dump. The functions whose names begin with \fBpcre2_serialize_\fP are used for converting to and from the serialized form. They are described in the .\" HREF \fBpcre2serialize\fP .\" documentation. Note that PCRE2 serialization does not convert compiled patterns to an abstract format like Java or .NET serialization. . . .\" HTML .SH "THE MATCH DATA BLOCK" .rs .sp .nf .B pcre2_match_data *pcre2_match_data_create(uint32_t \fIovecsize\fP, .B " pcre2_general_context *\fIgcontext\fP);" .sp .B pcre2_match_data *pcre2_match_data_create_from_pattern( .B " const pcre2_code *\fIcode\fP, pcre2_general_context *\fIgcontext\fP);" .sp .B void pcre2_match_data_free(pcre2_match_data *\fImatch_data\fP); .fi .P Information about a successful or unsuccessful match is placed in a match data block, which is an opaque structure that is accessed by function calls. 
In particular, the match data block contains a vector of offsets into the subject string that define the matched parts of the subject. This is known as the \fIovector\fP. .P Before calling \fBpcre2_match()\fP, \fBpcre2_dfa_match()\fP, or \fBpcre2_jit_match()\fP you must create a match data block by calling one of the creation functions above. For \fBpcre2_match_data_create()\fP, the first argument is the number of pairs of offsets in the \fIovector\fP. .P When using \fBpcre2_match()\fP, one pair of offsets is required to identify the string that matched the whole pattern, with an additional pair for each captured substring. For example, a value of 4 creates enough space to record the matched portion of the subject plus three captured substrings. .P When using \fBpcre2_dfa_match()\fP there may be multiple matched substrings of different lengths at the same point in the subject. The ovector should be made large enough to hold as many as are expected. .P A minimum of 1 pair is imposed by \fBpcre2_match_data_create()\fP, so it is always possible to return the overall matched string in the case of \fBpcre2_match()\fP or the longest match in the case of \fBpcre2_dfa_match()\fP. The maximum number of pairs is 65535; if the first argument of \fBpcre2_match_data_create()\fP is greater than this, 65535 is used. .P The second argument of \fBpcre2_match_data_create()\fP is a pointer to a general context, which can specify custom memory management for obtaining the memory for the match data block. If you are not using custom memory management, pass NULL, which causes \fBmalloc()\fP to be used. .P For \fBpcre2_match_data_create_from_pattern()\fP, the first argument is a pointer to a compiled pattern. The ovector is created to be exactly the right size to hold all the substrings a pattern might capture when matched using \fBpcre2_match()\fP. You should not use this call when matching with \fBpcre2_dfa_match()\fP.
The second argument is again a pointer to a general context, but in this case if NULL is passed, the memory is obtained using the same allocator that was used for the compiled pattern (custom or default). .P A match data block can be used many times, with the same or different compiled patterns. You can extract information from a match data block after a match operation has finished, using functions that are described in the sections on .\" HTML .\" matched strings .\" and .\" HTML .\" other match data .\" below. .P When a call of \fBpcre2_match()\fP fails, valid data is available in the match block only when the error is PCRE2_ERROR_NOMATCH, PCRE2_ERROR_PARTIAL, or one of the error codes for an invalid UTF string. Exactly what is available depends on the error, and is detailed below. .P When one of the matching functions is called, pointers to the compiled pattern and the subject string are set in the match data block so that they can be referenced by the extraction functions after a successful match. After running a match, you must not free a compiled pattern or a subject string until after all operations on the match data block (for that match) have taken place, unless, in the case of the subject string, you have used the PCRE2_COPY_MATCHED_SUBJECT option, which is described in the section entitled "Option bits for \fBpcre2_match()\fP" .\" HTML .\" below. .\" .P When a match data block itself is no longer needed, it should be freed by calling \fBpcre2_match_data_free()\fP. If this function is called with a NULL argument, it returns immediately, without doing anything. . . .SH "MEMORY USE FOR MATCH DATA BLOCKS" .rs .sp .nf .B PCRE2_SIZE pcre2_get_match_data_size(pcre2_match_data *\fImatch_data\fP); .sp .B PCRE2_SIZE pcre2_get_match_data_heapframes_size( .B " pcre2_match_data *\fImatch_data\fP);" .fi .P The size of a match data block depends on the size of the ovector that it contains. 
The function \fBpcre2_get_match_data_size()\fP returns the size, in bytes, of the block that is its argument. .P When \fBpcre2_match()\fP runs interpretively (that is, without using JIT), it makes use of a vector of data frames for remembering backtracking positions. The size of each individual frame depends on the number of capturing parentheses in the pattern and can be obtained by calling \fBpcre2_pattern_info()\fP with the PCRE2_INFO_FRAMESIZE option (see the section entitled "Information about a compiled pattern" .\" HTML .\" above). .\" .P Heap memory is used for the frames vector; if the initial memory block turns out to be too small during matching, it is automatically expanded. When \fBpcre2_match()\fP returns, the memory is not freed, but remains attached to the match data block, for use by any subsequent matches that use the same block. It is automatically freed when the match data block itself is freed. .P You can find the current size of the frames vector that a match data block owns by calling \fBpcre2_get_match_data_heapframes_size()\fP. For a newly created match data block the size will be zero. Some types of match may require a lot of frames and thus a large vector; applications that run in environments where memory is constrained can check this and free the match data block if the heap frames vector has become too big. . . .SH "MATCHING A PATTERN: THE TRADITIONAL FUNCTION" .rs .sp .nf .B int pcre2_match(const pcre2_code *\fIcode\fP, PCRE2_SPTR \fIsubject\fP, .B " PCRE2_SIZE \fIlength\fP, PCRE2_SIZE \fIstartoffset\fP," .B " uint32_t \fIoptions\fP, pcre2_match_data *\fImatch_data\fP," .B " pcre2_match_context *\fImcontext\fP);" .fi .P The function \fBpcre2_match()\fP is called to match a subject string against a compiled pattern, which is passed in the \fIcode\fP argument. 
You can call \fBpcre2_match()\fP with the same \fIcode\fP argument as many times as you like, in order to find multiple matches in the subject string or to match different subject strings with the same pattern. .P This function is the main matching facility of the library, and it operates in a Perl-like manner. For specialist use there is also an alternative matching function, which is described .\" HTML .\" below .\" in the section about the \fBpcre2_dfa_match()\fP function. .P Here is an example of a simple call to \fBpcre2_match()\fP: .sp pcre2_match_data *md = pcre2_match_data_create(4, NULL); int rc = pcre2_match( re, /* result of pcre2_compile() */ "some string", /* the subject string */ 11, /* the length of the subject string */ 0, /* start at offset 0 in the subject */ 0, /* default options */ md, /* the match data block */ NULL); /* a match context; NULL means use defaults */ .sp If the subject string is zero-terminated, the length can be given as PCRE2_ZERO_TERMINATED. A match context must be provided if certain less common matching parameters are to be changed. For details, see the section on .\" HTML .\" the match context .\" above. . . .SS "The string to be matched by \fBpcre2_match()\fP" .rs .sp The subject string is passed to \fBpcre2_match()\fP as a pointer in \fIsubject\fP, a length in \fIlength\fP, and a starting offset in \fIstartoffset\fP. The length and offset are in code units, not characters. That is, they are in bytes for the 8-bit library, 16-bit code units for the 16-bit library, and 32-bit code units for the 32-bit library, whether or not UTF processing is enabled. As a special case, if \fIsubject\fP is NULL and \fIlength\fP is zero, the subject is assumed to be an empty string. If \fIlength\fP is non-zero, an error occurs if \fIsubject\fP is NULL. .P If \fIstartoffset\fP is greater than the length of the subject, \fBpcre2_match()\fP returns PCRE2_ERROR_BADOFFSET. 
When the starting offset is zero, the search for a match starts at the beginning of the subject, and this is by far the most common case. In UTF-8 or UTF-16 mode, the starting offset must point to the start of a character, or to the end of the subject (in UTF-32 mode, one code unit equals one character, so all offsets are valid). Like the pattern string, the subject may contain binary zeros. .P A non-zero starting offset is useful when searching for another match in the same subject by calling \fBpcre2_match()\fP again after a previous success. Setting \fIstartoffset\fP differs from passing over a shortened string and setting PCRE2_NOTBOL in the case of a pattern that begins with any kind of lookbehind. For example, consider the pattern .sp \eBiss\eB .sp which finds occurrences of "iss" in the middle of words. (\eB matches only if the current position in the subject is not a word boundary.) When applied to the string "Mississippi" the first call to \fBpcre2_match()\fP finds the first occurrence. If \fBpcre2_match()\fP is called again with just the remainder of the subject, namely "issippi", it does not match, because \eB is always false at the start of the subject, which is deemed to be a word boundary. However, if \fBpcre2_match()\fP is passed the entire string again, but with \fIstartoffset\fP set to 4, it finds the second occurrence of "iss" because it is able to look behind the starting point to discover that it is preceded by a letter. .P Finding all the matches in a subject is tricky when the pattern can match an empty string. PCRE2 includes a helper API to assist with this; see the section entitled "Iterating over all matches" .\" HTML .\" below .\" for details. .P If a non-zero starting offset is passed when the pattern is anchored, a single attempt to match at the given offset is made. This can only succeed if the pattern does not require the match to be at the start of the subject. 
In other words, the anchoring must be the result of setting the PCRE2_ANCHORED option or the use of .* with PCRE2_DOTALL, not by starting the pattern with ^ or \eA. . . .\" HTML .SS "Option bits for \fBpcre2_match()\fP" .rs .sp The unused bits of the \fIoptions\fP argument for \fBpcre2_match()\fP must be zero. The only bits that may be set are PCRE2_ANCHORED, PCRE2_COPY_MATCHED_SUBJECT, PCRE2_DISABLE_RECURSELOOP_CHECK, PCRE2_ENDANCHORED, PCRE2_NOTBOL, PCRE2_NOTEOL, PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART, PCRE2_NO_JIT, PCRE2_NO_UTF_CHECK, PCRE2_PARTIAL_HARD, and PCRE2_PARTIAL_SOFT. Their action is described below. .P Setting PCRE2_ANCHORED or PCRE2_ENDANCHORED at match time is not supported by the just-in-time (JIT) compiler. If it is set, JIT matching is disabled and the interpretive code in \fBpcre2_match()\fP is run. PCRE2_DISABLE_RECURSELOOP_CHECK is ignored by JIT, but apart from PCRE2_NO_JIT (obviously), the remaining options are supported for JIT matching. .sp PCRE2_ANCHORED .sp The PCRE2_ANCHORED option limits \fBpcre2_match()\fP to matching at the first matching position. If a pattern was compiled with PCRE2_ANCHORED, or turned out to be anchored by virtue of its contents, it cannot be made unanchored at matching time. Note that setting the option at match time disables JIT matching. .sp PCRE2_COPY_MATCHED_SUBJECT .sp By default, a pointer to the subject is remembered in the match data block so that, after a successful match, it can be referenced by the substring extraction functions. This means that the subject's memory must not be freed until all such operations are complete. For some applications where the lifetime of the subject string is not guaranteed, it may be necessary to make a copy of the subject string, but it is wasteful to do this unless the match is successful. After a successful match, if PCRE2_COPY_MATCHED_SUBJECT is set, the subject is copied and the new pointer is remembered in the match data block instead of the original subject pointer. 
The memory allocator that was used for the match block itself is used. The copy is automatically freed when \fBpcre2_match_data_free()\fP is called to free the match data block. It is also automatically freed if the match data block is re-used for another match operation. .sp PCRE2_DISABLE_RECURSELOOP_CHECK .sp This option is relevant only to \fBpcre2_match()\fP for interpretive matching. It is ignored when JIT is used, and is forbidden for \fBpcre2_dfa_match()\fP. .P The use of recursion in patterns can lead to infinite loops. In the interpretive matcher these would be eventually caught by the match or heap limits, but this could take a long time and/or use a lot of memory if the limits are large. There is therefore a check at the start of each recursion. If the same group is still active from a previous call, and the current subject pointer is the same as it was at the start of that group, and the furthest inspected character of the subject has not changed, an error is generated. .P There are rare cases of matches that would complete, but nevertheless trigger this error. This option disables the check. It is provided mainly for testing when comparing JIT and interpretive behaviour. .sp PCRE2_ENDANCHORED .sp If the PCRE2_ENDANCHORED option is set, any string that \fBpcre2_match()\fP matches must be right at the end of the subject string. Note that setting the option at match time disables JIT matching. .sp PCRE2_NOTBOL .sp This option specifies that the first character of the subject string is not the beginning of a line, so the circumflex metacharacter should not match before it. Setting this without having set PCRE2_MULTILINE at compile time causes circumflex never to match. This option affects only the behaviour of the circumflex metacharacter. It does not affect \eA.
.sp PCRE2_NOTEOL .sp This option specifies that the end of the subject string is not the end of a line, so the dollar metacharacter should not match it nor (except in multiline mode) a newline immediately before it. Setting this without having set PCRE2_MULTILINE at compile time causes dollar never to match. This option affects only the behaviour of the dollar metacharacter. It does not affect \eZ or \ez. .sp PCRE2_NOTEMPTY .sp An empty string is not considered to be a valid match if this option is set. If there are alternatives in the pattern, they are tried. If all the alternatives match the empty string, the entire match fails. For example, if the pattern .sp a?b? .sp is applied to a string not beginning with "a" or "b", it matches an empty string at the start of the subject. With PCRE2_NOTEMPTY set, this match is not valid, so \fBpcre2_match()\fP searches further into the string for occurrences of "a" or "b". .sp PCRE2_NOTEMPTY_ATSTART .sp This is like PCRE2_NOTEMPTY, except that it locks out an empty string match only at the first matching position, that is, at the start of the subject plus the starting offset. An empty string match later in the subject is permitted. If the pattern is anchored, such a match can occur only if the pattern contains \eK. .sp PCRE2_NO_JIT .sp By default, if a pattern has been successfully processed by \fBpcre2_jit_compile()\fP, JIT is automatically used when \fBpcre2_match()\fP is called with options that JIT supports. Setting PCRE2_NO_JIT disables the use of JIT; it forces matching to be done by the interpreter. .sp PCRE2_NO_UTF_CHECK .sp When PCRE2_UTF is set at compile time, the validity of the subject as a UTF string is checked unless PCRE2_NO_UTF_CHECK is passed to \fBpcre2_match()\fP or PCRE2_MATCH_INVALID_UTF was passed to \fBpcre2_compile()\fP. The latter special case is discussed in detail in the .\" HREF \fBpcre2unicode\fP .\" documentation. 
.P In the default case, if a non-zero starting offset is given, the check is applied only to that part of the subject that could be inspected during matching, and there is a check that the starting offset points to the first code unit of a character or to the end of the subject. If there are no lookbehind assertions in the pattern, the check starts at the starting offset. Otherwise, it starts at the length of the longest lookbehind before the starting offset, or at the start of the subject if there are not that many characters before the starting offset. Note that the sequences \eb and \eB are one-character lookbehinds. .P The check is carried out before any other processing takes place, and a negative error code is returned if the check fails. There are several UTF error codes for each code unit width, corresponding to different problems with the code unit sequence. There are discussions about the validity of .\" HTML .\" UTF-8 strings, .\" .\" HTML .\" UTF-16 strings, .\" and .\" HTML .\" UTF-32 strings .\" in the .\" HREF \fBpcre2unicode\fP .\" documentation. .P If you know that your subject is valid, and you want to skip this check for performance reasons, you can set the PCRE2_NO_UTF_CHECK option when calling \fBpcre2_match()\fP. You might want to do this for the second and subsequent calls to \fBpcre2_match()\fP if you are making repeated calls to find multiple matches in the same subject string. .P \fBWarning:\fP Unless PCRE2_MATCH_INVALID_UTF was set at compile time, when PCRE2_NO_UTF_CHECK is set at match time the effect of passing an invalid string as a subject, or an invalid value of \fIstartoffset\fP, is undefined. Your program may crash or loop indefinitely or give wrong results. .sp PCRE2_PARTIAL_HARD PCRE2_PARTIAL_SOFT .sp These options turn on the partial matching feature. A partial match occurs if the end of the subject string is reached successfully, but there are not enough subject characters to complete the match. 
In addition, either at least one character must have been inspected or the pattern must contain a lookbehind, or the pattern must be one that could match an empty string. .P If this situation arises when PCRE2_PARTIAL_SOFT (but not PCRE2_PARTIAL_HARD) is set, matching continues by testing any remaining alternatives. Only if no complete match can be found is PCRE2_ERROR_PARTIAL returned instead of PCRE2_ERROR_NOMATCH. In other words, PCRE2_PARTIAL_SOFT specifies that the caller is prepared to handle a partial match, but only if no complete match can be found. .P If PCRE2_PARTIAL_HARD is set, it overrides PCRE2_PARTIAL_SOFT. In this case, if a partial match is found, \fBpcre2_match()\fP immediately returns PCRE2_ERROR_PARTIAL, without considering any other alternatives. In other words, when PCRE2_PARTIAL_HARD is set, a partial match is considered to be more important than an alternative complete match. .P There is a more detailed discussion of partial and multi-segment matching, with examples, in the .\" HREF \fBpcre2partial\fP .\" documentation. . . . .SH "NEWLINE HANDLING WHEN MATCHING" .rs .sp When PCRE2 is built, a default newline convention is set; this is usually the standard convention for the operating system. The default can be overridden in a .\" HTML .\" compile context .\" by calling \fBpcre2_set_newline()\fP. It can also be overridden by starting a pattern string with, for example, (*CRLF), as described in the .\" HTML .\" section on newline conventions .\" in the .\" HREF \fBpcre2pattern\fP .\" page. During matching, the newline choice affects the behaviour of the dot, circumflex, and dollar metacharacters. It may also alter the way the match starting position is advanced after a match failure for an unanchored pattern. 
.P When PCRE2_NEWLINE_CRLF, PCRE2_NEWLINE_ANYCRLF, or PCRE2_NEWLINE_ANY is set as the newline convention, and a match attempt for an unanchored pattern fails when the current starting position is at a CRLF sequence, and the pattern contains no explicit matches for CR or LF characters, the match position is advanced by two characters instead of one, in other words, to after the CRLF. .P The above rule is a compromise that makes the most common cases work as expected. For example, if the pattern is .+A (and the PCRE2_DOTALL option is not set), it does not match the string "\er\enA" because, after failing at the start, it skips both the CR and the LF before retrying. However, the pattern [\er\en]A does match that string, because it contains an explicit CR or LF reference, and so advances only by one character after the first failure. .P An explicit match for CR or LF is either a literal appearance of one of those characters in the pattern, or one of the \er or \en or equivalent octal or hexadecimal escape sequences. Implicit matches such as [^X] do not count, nor does \es, even though it includes CR and LF in the characters that it matches. .P Notwithstanding the above, anomalous effects may still occur when CRLF is a valid newline sequence and explicit \er or \en escapes appear in the pattern. . . .\" HTML .SH "HOW PCRE2_MATCH() RETURNS A STRING AND CAPTURED SUBSTRINGS" .rs .sp .nf .B uint32_t pcre2_get_ovector_count(pcre2_match_data *\fImatch_data\fP); .sp .B PCRE2_SIZE *pcre2_get_ovector_pointer(pcre2_match_data *\fImatch_data\fP); .fi .P In general, a pattern matches a certain portion of the subject, and in addition, further substrings from the subject may be picked out by parenthesized parts of the pattern. Following the usage in Jeffrey Friedl's book, this is called "capturing" in what follows, and the phrase "capture group" (Perl terminology) is used for a fragment of a pattern that picks out a substring.
PCRE2 supports several other kinds of parenthesized group that do not cause substrings to be captured. The \fBpcre2_pattern_info()\fP function can be used to find out how many capture groups there are in a compiled pattern. .P You can use auxiliary functions for accessing captured substrings .\" HTML .\" by number .\" or .\" HTML .\" by name, .\" as described in sections below. .P Alternatively, you can make direct use of the vector of PCRE2_SIZE values, called the \fBovector\fP, which contains the offsets of captured strings. It is part of the .\" HTML .\" match data block. .\" The function \fBpcre2_get_ovector_pointer()\fP returns the address of the ovector, and \fBpcre2_get_ovector_count()\fP returns the number of pairs of values it contains. .P Within the ovector, the first in each pair of values is set to the offset of the first code unit of a substring, and the second is set to the offset of the first code unit after the end of a substring. These values are always code unit offsets, not character offsets. That is, they are byte offsets in the 8-bit library, 16-bit offsets in the 16-bit library, and 32-bit offsets in the 32-bit library. .P After a partial match (error return PCRE2_ERROR_PARTIAL), only the first pair of offsets (that is, \fIovector[0]\fP and \fIovector[1]\fP) are set. They identify the part of the subject that was partially matched. See the .\" HREF \fBpcre2partial\fP .\" documentation for details of partial matching. .P After a fully successful match, the first pair of offsets identifies the portion of the subject string that was matched by the entire pattern. The next pair is used for the first captured substring, and so on. The value returned by \fBpcre2_match()\fP is one more than the highest numbered pair that has been set. For example, if two substrings have been captured, the returned value is 3. 
If there are no captured substrings, the return value from a successful match is 1, indicating that just the first pair of offsets has been set. .P If a pattern uses the \eK escape sequence within a positive lookahead assertion, the reported start of a successful match can be greater than the end of the match. For example, if the pattern (?=ab\eK) is matched against "ab", the start and end offset values for the match are 2 and 0. .P If a capture group is matched repeatedly within a single match operation, it is the last portion of the subject that it matched that is returned. .P If the ovector is too small to hold all the captured substring offsets, as much as possible is filled in, and the function returns a value of zero. If captured substrings are not of interest, \fBpcre2_match()\fP may be called with a match data block whose ovector is of minimum length (that is, one pair). .P It is possible for capture group number \fIn+1\fP to match some part of the subject when group \fIn\fP has not been used at all. For example, if the string "abc" is matched against the pattern (a|(z))(bc) the return from the function is 4, and groups 1 and 3 are matched, but 2 is not. When this happens, both values in the offset pairs corresponding to unused groups are set to PCRE2_UNSET. .P Offset values that correspond to unused groups at the end of the expression are also set to PCRE2_UNSET. For example, if the string "abc" is matched against the pattern (abc)(x(yz)?)? groups 2 and 3 are not matched. The return from the function is 2, because the highest used capture group number is 1. The offsets for the second and third capture groups (assuming the vector is large enough, of course) are set to PCRE2_UNSET. .P Elements in the ovector that do not correspond to capturing parentheses in the pattern are never changed. That is, if a pattern contains \fIn\fP capturing parentheses, no more than \fIovector[0]\fP to \fIovector[2n+1]\fP are set by \fBpcre2_match()\fP. 
The other elements retain whatever values they previously had. After a failed match attempt, the contents of the ovector are unchanged. . . .\" HTML .SH "OTHER INFORMATION ABOUT A MATCH" .rs .sp .nf .B PCRE2_SPTR pcre2_get_mark(pcre2_match_data *\fImatch_data\fP); .sp .B PCRE2_SIZE pcre2_get_startchar(pcre2_match_data *\fImatch_data\fP); .fi .P As well as the offsets in the ovector, other information about a match is retained in the match data block and can be retrieved by the above functions in appropriate circumstances. If they are called at other times, the result is undefined. .P After a successful match, a partial match (PCRE2_ERROR_PARTIAL), or a failure to match (PCRE2_ERROR_NOMATCH), a mark name may be available. The function \fBpcre2_get_mark()\fP can be called to access this name, which can be specified in the pattern by any of the backtracking control verbs, not just (*MARK). The same function applies to all the verbs. It returns a pointer to the zero-terminated name, which is within the compiled pattern. If no name is available, NULL is returned. The length of the name (excluding the terminating zero) is stored in the code unit that precedes the name. You should use this length instead of relying on the terminating zero if the name might contain a binary zero. .P After a successful match, the name that is returned is the last mark name encountered on the matching path through the pattern. Instances of backtracking verbs without names do not count. Thus, for example, if the matching path contains (*MARK:A)(*PRUNE), the name "A" is returned. After a "no match" or a partial match, the last encountered name is returned. For example, consider this pattern: .sp ^(*MARK:A)((*MARK:B)a|b)c .sp When it matches "bc", the returned name is A. The B mark is "seen" in the first branch of the group, but it is not on the matching path. On the other hand, when this pattern fails to match "bx", the returned name is B. 
.P \fBWarning:\fP By default, certain start-of-match optimizations are used to give a fast "no match" result in some situations. For example, if the anchoring is removed from the pattern above, there is an initial check for the presence of "c" in the subject before running the matching engine. This check fails for "bx", causing a match failure without seeing any marks. You can disable the start-of-match optimizations by setting the PCRE2_NO_START_OPTIMIZE option for \fBpcre2_compile()\fP or by starting the pattern with (*NO_START_OPT). .P After a successful match, a partial match, or one of the invalid UTF errors (for example, PCRE2_ERROR_UTF8_ERR5), \fBpcre2_get_startchar()\fP can be called. After a successful or partial match it returns the code unit offset of the character at which the match started. For a non-partial match, this can be different to the value of \fIovector[0]\fP if the pattern contains the \eK escape sequence. After a partial match, however, this value is always the same as \fIovector[0]\fP because \eK does not affect the result of a partial match. .P After a UTF check failure, \fBpcre2_get_startchar()\fP can be used to obtain the code unit offset of the invalid UTF character. Details are given in the .\" HREF \fBpcre2unicode\fP .\" page. . . .\" HTML .SH "ERROR RETURNS FROM \fBpcre2_match()\fP" .rs .sp If \fBpcre2_match()\fP fails, it returns a negative number. This can be converted to a text string by calling the \fBpcre2_get_error_message()\fP function (see "Obtaining a textual error message" .\" HTML .\" below). .\" Negative error codes are also returned by other functions, and are documented with them. The codes are given names in the header file. If UTF checking is in force and an invalid UTF subject string is detected, one of a number of UTF-specific negative error codes is returned. Details are given in the .\" HREF \fBpcre2unicode\fP .\" page. 
The following are the other errors that may be returned by \fBpcre2_match()\fP: .sp PCRE2_ERROR_NOMATCH .sp The subject string did not match the pattern. .sp PCRE2_ERROR_PARTIAL .sp The subject string did not match, but it did match partially. See the .\" HREF \fBpcre2partial\fP .\" documentation for details of partial matching. .sp PCRE2_ERROR_BADMAGIC .sp PCRE2 stores a 4-byte "magic number" at the start of the compiled code, to catch the case when it is passed a junk pointer. This is the error that is returned when the magic number is not present. .sp PCRE2_ERROR_BADMODE .sp This error is given when a compiled pattern is passed to a function in a library of a different code unit width, for example, a pattern compiled by the 8-bit library is passed to a 16-bit or 32-bit library function. .sp PCRE2_ERROR_BADOFFSET .sp The value of \fIstartoffset\fP was greater than the length of the subject. .sp PCRE2_ERROR_BADOPTION .sp An unrecognized bit was set in the \fIoptions\fP argument. .sp PCRE2_ERROR_BADUTFOFFSET .sp The UTF code unit sequence that was passed as a subject was checked and found to be valid (the PCRE2_NO_UTF_CHECK option was not set), but the value of \fIstartoffset\fP did not point to the beginning of a UTF character or the end of the subject. .sp PCRE2_ERROR_CALLOUT .sp This error is never generated by \fBpcre2_match()\fP itself. It is provided for use by callout functions that want to cause \fBpcre2_match()\fP or \fBpcre2_callout_enumerate()\fP to return a distinctive error code. See the .\" HREF \fBpcre2callout\fP .\" documentation for details. .sp PCRE2_ERROR_DEPTHLIMIT .sp The nested backtracking depth limit was reached. .sp PCRE2_ERROR_HEAPLIMIT .sp The heap limit was reached. .sp PCRE2_ERROR_INTERNAL .sp An unexpected internal error has occurred. This error could be caused by a bug in PCRE2 or by overwriting of the compiled pattern. 
.sp PCRE2_ERROR_JIT_STACKLIMIT .sp This error is returned when a pattern that was successfully studied using JIT is being matched, but the memory available for the just-in-time processing stack is not large enough. See the .\" HREF \fBpcre2jit\fP .\" documentation for more details. .sp PCRE2_ERROR_MATCHLIMIT .sp The backtracking match limit was reached. .sp PCRE2_ERROR_NOMEMORY .sp Heap memory is used to remember backtracking points. This error is given when the memory allocation function (default or custom) fails. Note that a different error, PCRE2_ERROR_HEAPLIMIT, is given if the amount of memory needed exceeds the heap limit. PCRE2_ERROR_NOMEMORY is also returned if PCRE2_COPY_MATCHED_SUBJECT is set and memory allocation fails. .sp PCRE2_ERROR_NULL .sp Either the \fIcode\fP, \fIsubject\fP, or \fImatch_data\fP argument was passed as NULL. .sp PCRE2_ERROR_RECURSELOOP .sp This error is returned when \fBpcre2_match()\fP detects a recursion loop within the pattern. Specifically, it means that either the whole pattern or a capture group has been called recursively for the second time at the same position in the subject string. Some simple patterns that might do this are detected and faulted at compile time, but more complicated cases, in particular mutual recursions between two different groups, cannot be detected until matching is attempted. . . .\" HTML .SH "OBTAINING A TEXTUAL ERROR MESSAGE" .rs .sp .nf .B int pcre2_get_error_message(int \fIerrorcode\fP, PCRE2_UCHAR *\fIbuffer\fP, .B " PCRE2_SIZE \fIbufflen\fP);" .fi .P A text message for an error code from any PCRE2 function (compile, match, or auxiliary) can be obtained by calling \fBpcre2_get_error_message()\fP. The code is passed as the first argument, with the remaining two arguments specifying a code unit buffer and its length in code units, into which the text message is placed. The message is returned in code units of the appropriate width for the library that is being used. 
.P The returned message is terminated with a trailing zero, and the function returns the number of code units used, excluding the trailing zero. If the error number is unknown, the negative error code PCRE2_ERROR_BADDATA is returned. If the buffer is too small, the message is truncated (but still with a trailing zero), and the negative error code PCRE2_ERROR_NOMEMORY is returned. None of the messages is very long; a buffer size of 120 code units is ample. . . .\" HTML .SH "ITERATING OVER ALL MATCHES" .rs .sp .nf .B int pcre2_next_match(pcre2_match_data *\fImatch_data\fP, .B " PCRE2_SIZE *\fIpstart_offset\fP, uint32_t *\fIpoptions\fP);" .fi .P A common task for applications is to implement "global" matching behaviour, for example, replacing all matches in the subject; splitting the subject on all matches; or simply counting the number of matches. The \fBpcre2_next_match()\fP function helps with this task by providing the appropriate parameters for the next match attempt (available since PCRE2 10.47). .P First, a match attempt should be made using one of the matching functions (\fBpcre2_match()\fP, \fBpcre2_dfa_match()\fP, or \fBpcre2_jit_match()\fP). Then, \fBpcre2_next_match()\fP can be called, providing the same \fImatch_data\fP parameter. .P It returns 0 ("false") if there is no need to make a further match attempt, or 1 ("true") if another match should be attempted. Returning 1 does not imply that there is another match, only that another match should be attempted (which may return PCRE2_ERROR_NOMATCH). .P The *\fIpstart_offset\fP and *\fIpoptions\fP are set if the function returns 1. The *\fIpstart_offset\fP should be passed to the next match attempt directly, and the *\fIpoptions\fP should be passed to the next match attempt by combining with the application's match options using OR. .P There is some code that demonstrates how to do this in the .\" HREF \fBpcre2demo\fP .\" sample program. 
The general pattern is: .sp .nf uint32_t app_options = ...; uint32_t global_options = 0; PCRE2_SIZE start_offset = 0; while (1) { int rc = pcre2_match(re, subject, subject_len, start_offset, app_options | global_options, match_data, match_context); \& if (rc == PCRE2_ERROR_NOMATCH) break; /* no match, and no more attempts */ if (rc < 0) { ... exit } \& ...handle the match \& if (!pcre2_next_match(match_data, &start_offset, &global_options)) break; /* no more attempts */ } .fi .P The guarantees provided by \fBpcre2_next_match()\fP are that the start_offset will advance, so the loop will definitely terminate. The conditions which ensure this are that either: (a) pcre2_next_match() returns 0 (false); or (b) the returned *\fIpstart_offset\fP is strictly greater than the previous start_offset; or (c) if the previous match was a successful match of the empty string then the returned *\fIpstart_offset\fP is equal to the previous ovector[1], and *\fIpoptions\fP will be set to PCRE2_NOTEMPTY_ATSTART to prevent another empty match from being returned. .P A loop implemented as shown above will always terminate, unless there is a bug in PCRE2. As a measure of "defensive programming", applications are encouraged to add an assertion or check to break their loop if it does not make progress (and report the issue as a bug). .P If an application does not use the flag PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK, then each match is "well-behaved" and satisfies: .sp start_offset <= ovector[0] <= ovector[1]. .sp In this case, the matches found by pcre2_match() with pcre2_next_match() will be sorted, non-overlapping (possibly touching), and with no duplicates. .P Otherwise, if PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK is used, then the guarantees are considerably weaker. We do not guarantee that the matches will always advance: only that the start_offset will. 
The matches found by pcre2_match() with pcre2_next_match() will be a finite sequence (as pcre2_next_match() ensures that start_offset advances, so the search will terminate). The matches can however be overlapping, can contain duplicates, and (in truly pathological examples) may not even be sorted by ovector[0]. Additionally, each match itself can end before it starts (ovector[1] < ovector[0]). We recommend that applications do not set PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK. . . .\" HTML .SH "EXTRACTING CAPTURED SUBSTRINGS BY NUMBER" .rs .sp .nf .B int pcre2_substring_length_bynumber(pcre2_match_data *\fImatch_data\fP, .B " uint32_t \fInumber\fP, PCRE2_SIZE *\fIlength\fP);" .sp .B int pcre2_substring_copy_bynumber(pcre2_match_data *\fImatch_data\fP, .B " uint32_t \fInumber\fP, PCRE2_UCHAR *\fIbuffer\fP," .B " PCRE2_SIZE *\fIbufflen\fP);" .sp .B int pcre2_substring_get_bynumber(pcre2_match_data *\fImatch_data\fP, .B " uint32_t \fInumber\fP, PCRE2_UCHAR **\fIbufferptr\fP," .B " PCRE2_SIZE *\fIbufflen\fP);" .sp .B void pcre2_substring_free(PCRE2_UCHAR *\fIbuffer\fP); .fi .P Captured substrings can be accessed directly by using the ovector as described .\" HTML .\" above. .\" For convenience, auxiliary functions are provided for extracting captured substrings as new, separate, zero-terminated strings. A substring that contains a binary zero is correctly extracted and has a further zero added on the end, but the result is not, of course, a C string. .P The functions in this section identify substrings by number. The number zero refers to the entire matched substring, with higher numbers referring to substrings captured by parenthesized groups. After a partial match, only substring zero is available. An attempt to extract any other substring gives the error PCRE2_ERROR_PARTIAL. The next section describes similar functions for extracting captured substrings by name. 
.P If a pattern uses the \eK escape sequence within a positive lookahead assertion, the reported start of a successful match can be greater than the end of the match. For example, if the pattern (?=ab\eK) is matched against "ab", the start and end offset values for the match are 2 and 0. In this situation, calling these functions with a zero substring number extracts a zero-length empty string. .P You can find the length in code units of a captured substring without extracting it by calling \fBpcre2_substring_length_bynumber()\fP. The first argument is a pointer to the match data block, the second is the group number, and the third is a pointer to a variable into which the length is placed. If you just want to know whether or not the substring has been captured, you can pass the third argument as NULL. .P The \fBpcre2_substring_copy_bynumber()\fP function copies a captured substring into a supplied buffer, whereas \fBpcre2_substring_get_bynumber()\fP copies it into new memory, obtained using the same memory allocation function that was used for the match data block. The first two arguments of these functions are a pointer to the match data block and a capture group number. .P The final arguments of \fBpcre2_substring_copy_bynumber()\fP are a pointer to the buffer and a pointer to a variable that contains its length in code units. This is updated to contain the actual number of code units used for the extracted substring, excluding the terminating zero. .P For \fBpcre2_substring_get_bynumber()\fP the third and fourth arguments point to variables that are updated with a pointer to the new memory and the number of code units that comprise the substring, again excluding the terminating zero. When the substring is no longer needed, the memory should be freed by calling \fBpcre2_substring_free()\fP. .P The return value from all these functions is zero for success, or a negative error code. If the pattern match failed, the match failure code is returned. 
If a substring number greater than zero is used after a partial match, PCRE2_ERROR_PARTIAL is returned. Other possible error codes are: .sp PCRE2_ERROR_NOMEMORY .sp The buffer was too small for \fBpcre2_substring_copy_bynumber()\fP, or the attempt to get memory failed for \fBpcre2_substring_get_bynumber()\fP. .sp PCRE2_ERROR_NOSUBSTRING .sp There is no substring with that number in the pattern, that is, the number is greater than the number of capturing parentheses. .sp PCRE2_ERROR_UNAVAILABLE .sp The substring number, though not greater than the number of captures in the pattern, is greater than the number of slots in the ovector, so the substring could not be captured. .sp PCRE2_ERROR_UNSET .sp The substring did not participate in the match. For example, if the pattern is (abc)|(def) and the subject is "def", and the ovector contains at least two capturing slots, substring number 1 is unset. . . .SH "EXTRACTING A LIST OF ALL CAPTURED SUBSTRINGS" .rs .sp .nf .B int pcre2_substring_list_get(pcre2_match_data *\fImatch_data\fP, .B " PCRE2_UCHAR ***\fIlistptr\fP, PCRE2_SIZE **\fIlengthsptr\fP);" .sp .B void pcre2_substring_list_free(PCRE2_UCHAR **\fIlist\fP); .fi .P The \fBpcre2_substring_list_get()\fP function extracts all available substrings and builds a list of pointers to them. It also (optionally) builds a second list that contains their lengths (in code units), excluding a terminating zero that is added to each of them. All this is done in a single block of memory that is obtained using the same memory allocation function that was used to get the match data block. .P This function must be called only after a successful match. If called after a partial match, the error code PCRE2_ERROR_PARTIAL is returned. .P The address of the memory block is returned via \fIlistptr\fP, which is also the start of the list of string pointers. The end of the list is marked by a NULL pointer. The address of the list of lengths is returned via \fIlengthsptr\fP. 
If your strings do not contain binary zeros and you do not therefore need the lengths, you may supply NULL as the \fIlengthsptr\fP argument to disable the creation of a list of lengths. The yield of the function is zero if all went well, or PCRE2_ERROR_NOMEMORY if the memory block could not be obtained. When the list is no longer needed, it should be freed by calling \fBpcre2_substring_list_free()\fP. .P If this function encounters a substring that is unset, which can happen when capture group number \fIn+1\fP matches some part of the subject, but group \fIn\fP has not been used at all, it returns an empty string. This can be distinguished from a genuine zero-length substring by inspecting the appropriate offset in the ovector, which contains PCRE2_UNSET for unset substrings, or by calling \fBpcre2_substring_length_bynumber()\fP. . . .\" HTML .SH "EXTRACTING CAPTURED SUBSTRINGS BY NAME" .rs .sp .nf .B int pcre2_substring_number_from_name(const pcre2_code *\fIcode\fP, .B " PCRE2_SPTR \fIname\fP);" .sp .B int pcre2_substring_length_byname(pcre2_match_data *\fImatch_data\fP, .B " PCRE2_SPTR \fIname\fP, PCRE2_SIZE *\fIlength\fP);" .sp .B int pcre2_substring_copy_byname(pcre2_match_data *\fImatch_data\fP, .B " PCRE2_SPTR \fIname\fP, PCRE2_UCHAR *\fIbuffer\fP, PCRE2_SIZE *\fIbufflen\fP);" .sp .B int pcre2_substring_get_byname(pcre2_match_data *\fImatch_data\fP, .B " PCRE2_SPTR \fIname\fP, PCRE2_UCHAR **\fIbufferptr\fP, PCRE2_SIZE *\fIbufflen\fP);" .sp .B void pcre2_substring_free(PCRE2_UCHAR *\fIbuffer\fP); .fi .P To extract a substring by name, you first have to find the associated number. For example, for this pattern: .sp (a+)b(?<xxx>\ed+)... .sp the number of the capture group called "xxx" is 2. If the name is known to be unique (PCRE2_DUPNAMES was not set), you can find the number from the name by calling \fBpcre2_substring_number_from_name()\fP. The first argument is the compiled pattern, and the second is the name.
The yield of the function is the group number, PCRE2_ERROR_NOSUBSTRING if there is no group with that name, or PCRE2_ERROR_NOUNIQUESUBSTRING if there is more than one group with that name. Given the number, you can extract the substring directly from the ovector, or use one of the "bynumber" functions described above. .P For convenience, there are also "byname" functions that correspond to the "bynumber" functions, the only difference being that the second argument is a name instead of a number. If PCRE2_DUPNAMES is set and there are duplicate names, these functions scan all the groups with the given name, and return the captured substring from the first named group that is set. .P If there are no groups with the given name, PCRE2_ERROR_NOSUBSTRING is returned. If all groups with the name have numbers that are greater than the number of slots in the ovector, PCRE2_ERROR_UNAVAILABLE is returned. If there is at least one group with a slot in the ovector, but no group is found to be set, PCRE2_ERROR_UNSET is returned. .P \fBWarning:\fP If the pattern uses the (?| feature to set up multiple capture groups with the same number, as described in the .\" HTML .\" section on duplicate group numbers .\" in the .\" HREF \fBpcre2pattern\fP .\" page, you cannot use names to distinguish the different capture groups, because names are not included in the compiled code. The matching process uses only numbers. For this reason, the use of different names for groups with the same number causes an error at compile time. . . 
.\" HTML .SH "CREATING A NEW STRING WITH SUBSTITUTIONS" .rs .sp .nf .B int pcre2_substitute(const pcre2_code *\fIcode\fP, PCRE2_SPTR \fIsubject\fP, .B " PCRE2_SIZE \fIlength\fP, PCRE2_SIZE \fIstartoffset\fP," .B " uint32_t \fIoptions\fP, pcre2_match_data *\fImatch_data\fP," .B " pcre2_match_context *\fImcontext\fP, PCRE2_SPTR \fIreplacement\fP," .B " PCRE2_SIZE \fIrlength\fP, PCRE2_UCHAR *\fIoutputbuffer\fP," .B " PCRE2_SIZE *\fIoutlengthptr\fP);" .fi .P This function optionally calls \fBpcre2_match()\fP and then makes a copy of the subject string in \fIoutputbuffer\fP, replacing parts that were matched with the \fIreplacement\fP string, whose length is supplied in \fIrlength\fP, which can be given as PCRE2_ZERO_TERMINATED for a zero-terminated string. As a special case, if \fIreplacement\fP is NULL and \fIrlength\fP is zero, the replacement is assumed to be an empty string. If \fIrlength\fP is non-zero, an error occurs if \fIreplacement\fP is NULL. .P There is an option (see PCRE2_SUBSTITUTE_REPLACEMENT_ONLY below) to return just the replacement string(s). The default action is to perform just one replacement if the pattern matches, but there is an option that requests multiple replacements (see PCRE2_SUBSTITUTE_GLOBAL below). .P If successful, \fBpcre2_substitute()\fP returns the number of substitutions that were carried out. This may be zero if no match was found, and is never greater than one unless PCRE2_SUBSTITUTE_GLOBAL is set. A negative value is returned if an error is detected. .P Matches in which a \eK item in a lookahead in the pattern causes the match to end before it starts are not supported, and give rise to an error return. For global replacements, matches in which \eK in a lookbehind causes the match to start earlier than the point that was reached in the previous iteration are also not supported. (These cases are only possible if the pattern was compiled with the backwards-compatibility option PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK.)
.P The first seven arguments of \fBpcre2_substitute()\fP are the same as for \fBpcre2_match()\fP, except that the partial matching options are not permitted, and \fImatch_data\fP may be passed as NULL, in which case a match data block is obtained and freed within this function, using memory management functions from the match context, if provided, or else those that were used to allocate memory for the compiled code. .P If \fImatch_data\fP is not NULL and PCRE2_SUBSTITUTE_MATCHED is not set, the provided block is used for all calls to \fBpcre2_match()\fP, and its contents afterwards are the result of the final call made internally by \fBpcre2_substitute()\fP to the matching function. For global changes, this will always be a no-match error. The contents of the ovector within the match data block may or may not have been changed. .P As well as the usual options for \fBpcre2_match()\fP, a number of additional options can be set in the \fIoptions\fP argument of \fBpcre2_substitute()\fP. One such option is PCRE2_SUBSTITUTE_MATCHED. When this is set, an external \fImatch_data\fP block must be provided, and it must have already been used for an external call to \fBpcre2_match()\fP (or \fBpcre2_jit_match()\fP) with the same pattern, subject pointer, effective subject length, start offset, and match option arguments (substitute-specific options can be added to the \fIoptions\fP argument). If any of these parameters is changed, \fBpcre2_substitute()\fP returns an error. The data in the \fImatch_data\fP block (return code, offset vector) is used for the first substitution instead of calling \fBpcre2_match()\fP from within \fBpcre2_substitute()\fP. This allows an application to check for a match before choosing to substitute, without having to repeat the match. 
.P If the contents of the subject buffer are mutated in between \fBpcre2_match()\fP and a call to \fBpcre2_substitute()\fP with PCRE2_SUBSTITUTE_MATCHED, the behaviour is unsafe; in particular, in this case, PCRE2 is unable to ensure that the offsets in the ovector point to the start of characters (with UTF-encoded input). .P The contents of the externally supplied match data block are not changed when PCRE2_SUBSTITUTE_MATCHED is set, and so the match block is permitted for use in another call using PCRE2_SUBSTITUTE_MATCHED. If PCRE2_SUBSTITUTE_GLOBAL is also set, \fBpcre2_match()\fP is called after the first substitution to check for further matches, but this is done using an internally obtained match data block, thus always leaving the external block unchanged. .P The \fIcode\fP argument is not used for matching before the first substitution when PCRE2_SUBSTITUTE_MATCHED is set, but it must be provided, even when PCRE2_SUBSTITUTE_GLOBAL is not set, because it contains information such as the UTF setting and the number of capturing parentheses in the pattern. .P When using PCRE2_SUBSTITUTE_MATCHED, you should not modify the subject string in between the prior call to \fBpcre2_match()\fP and \fBpcre2_substitute()\fP, as the substitution assumes that the passed-in ovector is compatible with the subject string. Although PCRE2 does verify that the subject is a pointer to the same buffer, it cannot in general verify whether the contents of the buffer have changed. For example, if the subject buffer is mutated from one valid UTF-8 string to another valid string, of the same length in code units, the ovector offsets are no longer guaranteed to point to the start of a character. Beware that with PCRE2_SUBSTITUTE_MATCHED in UTF mode, the subject string is not re-scanned for UTF validity when \fBpcre2_substitute()\fP first uses it. .P The default action of \fBpcre2_substitute()\fP is to return a copy of the subject string with matched substrings replaced.
However, if PCRE2_SUBSTITUTE_REPLACEMENT_ONLY is set, only the replacement substrings are returned. In the global case, multiple replacements are concatenated in the output buffer. Substitution callouts (see .\" HTML .\" below) .\" can be used to separate them if necessary. .P Partial matching is supported, with limitations: if matching succeeds but with a partial match, then pcre2_substitute returns PCRE2_ERROR_PARTIAL. When partial-matching (either of PCRE2_PARTIAL_HARD or PCRE2_PARTIAL_SOFT is passed), then PCRE2_SUBSTITUTE_REPLACEMENT_ONLY must also be set, or else PCRE2_ERROR_BADOPTION is returned. Similarly, certain replacement items ($' and $_) cause PCRE2_ERROR_PARTIALSUBS to be returned when partial-matching, even if a complete match is found. .P The \fIoutlengthptr\fP argument of \fBpcre2_substitute()\fP must point to a variable that contains the length, in code units, of the output buffer. If the function is successful, the value is updated to contain the length in code units of the new string, excluding the trailing zero that is automatically added. .P If the function is not successful, the value set via \fIoutlengthptr\fP depends on the type of error. For syntax errors in the replacement string, the value is the offset in the replacement string where the error was detected. For other errors, the value is PCRE2_UNSET by default. This includes the case of the output buffer being too small, unless PCRE2_SUBSTITUTE_OVERFLOW_LENGTH is set. .P PCRE2_SUBSTITUTE_OVERFLOW_LENGTH changes what happens when the output buffer is too small. The default action is to return PCRE2_ERROR_NOMEMORY immediately. If this option is set, however, \fBpcre2_substitute()\fP continues to go through the motions of matching and substituting (without, of course, writing anything) in order to compute the size of buffer that is needed, which will include the extra space for the terminating NUL. 
This value is passed back via the \fIoutlengthptr\fP variable, with the result of the function still being PCRE2_ERROR_NOMEMORY. .P Passing a buffer size of zero is a permitted way of finding out how much memory is needed for a given substitution. However, this does mean that the entire operation is carried out twice. Depending on the application, it may be more efficient to allocate a large buffer and free the excess afterwards, instead of using PCRE2_SUBSTITUTE_OVERFLOW_LENGTH. .P The replacement string, which is interpreted as a UTF string in UTF mode, is checked for UTF validity unless PCRE2_NO_UTF_CHECK is set. An invalid UTF replacement string causes an immediate return with the relevant UTF error code. .P If PCRE2_SUBSTITUTE_LITERAL is set, the replacement string is not interpreted in any way. By default, however, a dollar character is an escape character that can specify the insertion of characters from capture groups and names from (*MARK) or other control verbs in the pattern. Dollar is the only escape character (backslash is treated as literal). The following forms are recognized: .sp $$ insert a dollar character $n or ${n} insert the contents of group \fIn\fP $0 or $& insert the entire matched substring $` insert the substring that precedes the match $' insert the substring that follows the match $_ insert the entire input string $+ insert the highest-numbered capture group which matched $*MARK or ${*MARK} insert a control verb name .sp Either a group number or a group name can be given for \fIn\fP, for example $2 or $NAME. Curly brackets are required only if the following character would be interpreted as part of the number or name. The number may be zero to include the entire matched string. For example, if the pattern a(b)c is matched with "=abc=" and the replacement string "+$1$0$1+", the result is "=+babcb+=".
.P The JavaScript form $<name>, where the angle brackets are part of the syntax, is also recognized for group names, but not for group numbers or *MARK. .P $*MARK inserts the name from the last encountered backtracking control verb on the matching path that has a name. (*MARK) must always include a name, but the other verbs need not. For example, in the case of (*MARK:A)(*PRUNE) the name inserted is "A", but for (*MARK:A)(*PRUNE:B) the relevant name is "B". This facility can be used to perform simple simultaneous substitutions, as this \fBpcre2test\fP example shows: .sp /(*MARK:pear)apple|(*MARK:orange)lemon/g,replace=${*MARK} apple lemon 2: pear orange .sp PCRE2_SUBSTITUTE_GLOBAL causes the function to iterate over the subject string, replacing every matching substring. If this option is not set, only the first matching substring is replaced. The search for matches takes place in the original subject string (that is, previous replacements do not affect it). Iteration is implemented by advancing the \fIstartoffset\fP value for each search, which is always passed the entire subject string. If an offset limit is set in the match context, searching stops when that limit is reached. .P Because global substitutions apply the pattern repeatedly to the subject string, and always iterate over non-overlapping matches, the substitutions done by \fBpcre2_substitute()\fP do not match and substitute text inside the replacement strings themselves (no recursive/iterative substitution). However, applications can easily implement other alternative replacement strategies, such as iteratively replacing, then matching and replacing on the result. The replacement loop inside \fBpcre2_substitute()\fP is simple and can be emulated in client code by allocating a buffer, searching for matches in a loop, and calling \fBpcre2_substitute()\fP with PCRE2_SUBSTITUTE_REPLACEMENT_ONLY and PCRE2_SUBSTITUTE_MATCHED, and without PCRE2_SUBSTITUTE_GLOBAL.
.P You can restrict the effect of a global substitution to a portion of the subject string by setting either or both of \fIstartoffset\fP and an offset limit. Here is a \fBpcre2test\fP example: .sp /B/g,replace=!,use_offset_limit ABC ABC ABC ABC\e=offset=3,offset_limit=12 2: ABC A!C A!C ABC .sp When continuing with global substitutions after matching a substring with zero length, an attempt to find a non-empty match at the same offset is performed. If this is not successful, the offset is advanced by one character except when CRLF is a valid newline sequence and the next two characters are CR, LF. In this case, the offset is advanced by two characters. .P PCRE2_SUBSTITUTE_UNKNOWN_UNSET causes references to capture groups that do not appear in the pattern to be treated as unset groups. This option should be used with care, because it means that a typo in a group name or number no longer causes the PCRE2_ERROR_NOSUBSTRING error. .P PCRE2_SUBSTITUTE_UNSET_EMPTY causes unset capture groups (including unknown groups when PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set) to be treated as empty strings when inserted as described above. If this option is not set, an attempt to insert an unset group causes the PCRE2_ERROR_UNSET error. This option does not influence the extended substitution syntax described below. .P PCRE2_SUBSTITUTE_EXTENDED causes extra processing to be applied to the replacement string. Without this option, only the dollar character is special, and only the group insertion forms listed above are valid. When PCRE2_SUBSTITUTE_EXTENDED is set, several things change: .P Firstly, backslash in a replacement string is interpreted as an escape character. The usual forms such as \ex{ddd} can be used to specify particular character codes, and backslash followed by any non-alphanumeric character quotes that character. Extended quoting can be coded using \eQ...\eE, exactly as in pattern strings. 
The escapes \eb and \ev are interpreted as the characters backspace and vertical tab, respectively. .P The interpretation of backslash followed by one or more digits is the same as in a pattern, which in Perl has some ambiguities. Details are given in the .\" HREF \fBpcre2pattern\fP .\" page. .P The Python form \eg<n>, where the angle brackets are part of the syntax and \fIn\fP is either a group name or number, is recognized as an alternative way of inserting the contents of a group, for example \eg<3>. .P There are also four escape sequences for forcing the case of inserted letters. Case forcing applies to all inserted characters, including those from capture groups and letters within \eQ...\eE quoted sequences. The insertion mechanism has three states: no case forcing, force upper case, and force lower case. The escape sequences change the current state: \eU and \eL change to upper or lower case forcing, respectively, and \eE (when not terminating a \eQ quoted sequence) reverts to no case forcing. The sequences \eu and \el force the next character (if it is a letter) to upper or lower case, respectively, and then the state automatically reverts to no case forcing. .P However, if \eu is immediately followed by \eL or \el is immediately followed by \eU, the next character's case is forced by the first escape sequence, and subsequent characters by the second. This provides a "title casing" facility that can be applied to group captures. For example, if group 1 has captured "heLLo", the replacement string "\eu\eL$1" becomes "Hello". .P If either PCRE2_UTF or PCRE2_UCP was set when the pattern was compiled, Unicode properties are used for case forcing characters whose code points are greater than 127. However, only simple case folding, as determined by the Unicode file \fBCaseFolding.txt\fP is supported.
PCRE2 does not support language-specific special casing rules such as using different lower case Greek sigmas in the middle and ends of words (as defined in the Unicode file \fBSpecialCasing.txt\fP). .P Note that case forcing sequences such as \eU...\eE do not nest. For example, the result of processing "\eUaa\eLBB\eEcc\eE" is "AAbbcc"; the final \eE has no effect. Note also that the PCRE2_ALT_BSUX and PCRE2_EXTRA_ALT_BSUX options do not apply to replacement strings. .P The final effect of setting PCRE2_SUBSTITUTE_EXTENDED is to add more flexibility to capture group substitution. The syntax is similar to that used by Bash: .sp ${n:-string} ${n:+string1:string2} .sp As in the simple case, \fIn\fP may be a group number or a name. The first form specifies a default value. If group \fIn\fP is set, its value is inserted; if not, the string is expanded and the result inserted. The second form specifies strings that are expanded and inserted when group \fIn\fP is set or unset, respectively. The first form is just a convenient shorthand for .sp ${n:+${n}:string} .sp Backslash can be used to escape colons and closing curly brackets in the replacement strings. A change of the case forcing state within a replacement string remains in force afterwards, as shown in this \fBpcre2test\fP example: .sp /(some)?(body)/substitute_extended,replace=${1:+\eU:\eL}HeLLo body 1: hello somebody 1: HELLO .sp The PCRE2_SUBSTITUTE_UNSET_EMPTY option does not affect these extended substitutions. However, PCRE2_SUBSTITUTE_UNKNOWN_UNSET does cause unknown groups in the extended syntax forms to be treated as unset. .P If PCRE2_SUBSTITUTE_LITERAL is set, PCRE2_SUBSTITUTE_UNKNOWN_UNSET, PCRE2_SUBSTITUTE_UNSET_EMPTY, and PCRE2_SUBSTITUTE_EXTENDED are irrelevant and are ignored. . . .SS "Substitution errors" .rs .sp In the event of an error, \fBpcre2_substitute()\fP returns a negative error code. 
Except for PCRE2_ERROR_NOMATCH (which is never returned), errors from \fBpcre2_match()\fP are passed straight back. .P PCRE2_ERROR_NOSUBSTRING is returned for a non-existent substring insertion, unless PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set. .P PCRE2_ERROR_UNSET is returned for an unset substring insertion (including an unknown substring when PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set) when the simple (non-extended) syntax is used and PCRE2_SUBSTITUTE_UNSET_EMPTY is not set. .P PCRE2_ERROR_NOMEMORY is returned if the output buffer is not big enough. If the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option is set, the size of buffer that is needed is returned via \fIoutlengthptr\fP. Note that this does not happen by default. .P PCRE2_ERROR_NULL is returned if PCRE2_SUBSTITUTE_MATCHED is set but the \fImatch_data\fP argument is NULL or if the \fIsubject\fP or \fIreplacement\fP arguments are NULL. For backward compatibility reasons an exception is made for the \fIreplacement\fP argument if the \fIrlength\fP argument is also 0. .P PCRE2_ERROR_BADREPLACEMENT is used for miscellaneous syntax errors in the replacement string, with more particular errors being PCRE2_ERROR_BADREPESCAPE (invalid escape sequence), PCRE2_ERROR_REPMISSINGBRACE (closing curly bracket not found), PCRE2_ERROR_BADSUBSTITUTION (syntax error in extended group substitution), and PCRE2_ERROR_BADSUBSPATTERN (the pattern match ended before it started or the match started earlier than the current position in the subject, which can happen if \eK is used in a lookaround assertion). .P As for all PCRE2 errors, a text message that describes the error can be obtained by calling the \fBpcre2_get_error_message()\fP function (see "Obtaining a textual error message" .\" HTML .\" above). .\" . . 
.\" HTML .SS "Substitution callouts" .rs .sp .nf .B int pcre2_set_substitute_callout(pcre2_match_context *\fImcontext\fP, .B " int (*\fIcallout_function\fP)(pcre2_substitute_callout_block *, void *)," .B " void *\fIcallout_data\fP);" .fi .sp The \fBpcre2_set_substitute_callout()\fP function can be used to specify a callout function for \fBpcre2_substitute()\fP. This information is passed in a match context. The callout function is called after each substitution has been processed, but it can cause the replacement not to happen. .P The callout function is not called for simulated substitutions that happen as a result of the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option. In this mode, when substitution processing exceeds the buffer space provided by the caller, processing continues by counting code units. The simulation is unable to populate the callout block, and so the simulation is pessimistic about the required buffer size. Whichever is larger of accepted or rejected substitution is reported as the required size. Therefore, the returned buffer length may be an overestimate (without a substitution callout, it is normally an exact measurement). .P The first argument of the callout function is a pointer to a substitute callout block structure, which contains the following fields, not necessarily in this order: .sp uint32_t \fIversion\fP; uint32_t \fIsubscount\fP; PCRE2_SPTR \fIinput\fP; PCRE2_SPTR \fIoutput\fP; PCRE2_SIZE \fI*ovector\fP; uint32_t \fIoveccount\fP; PCRE2_SIZE \fIoutput_offsets[2]\fP; .sp The \fIversion\fP field contains the version number of the block format. The current version is 0. The version number will increase in future if more fields are added, but the intention is never to remove any of the existing fields. .P The \fIsubscount\fP field is the number of the current match. It is 1 for the first callout, 2 for the second, and so on. The \fIinput\fP and \fIoutput\fP pointers are copies of the values passed to \fBpcre2_substitute()\fP. 
.P The \fIovector\fP field points to the ovector, which contains the result of the most recent match. The \fIoveccount\fP field contains the number of pairs that are set in the ovector, and is always greater than zero. .P The \fIoutput_offsets\fP vector contains the offsets of the replacement in the output string. This has already been processed for dollar and (if requested) backslash substitutions as described above. .P The second argument of the callout function is the value passed as \fIcallout_data\fP when the function was registered. The value returned by the callout function is interpreted as follows: .P If the value is zero, the replacement is accepted, and, if PCRE2_SUBSTITUTE_GLOBAL is set, processing continues with a search for the next match. If the value is not zero, the current replacement is not accepted. If the value is greater than zero, processing continues when PCRE2_SUBSTITUTE_GLOBAL is set. Otherwise (the value is less than zero or PCRE2_SUBSTITUTE_GLOBAL is not set), the rest of the input is copied to the output and the call to \fBpcre2_substitute()\fP exits, returning the number of matches so far. . . .SS "Substitution case callouts" .rs .sp .nf .B int pcre2_set_substitute_case_callout(pcre2_match_context *\fImcontext\fP, .B " PCRE2_SIZE (*\fIcallout_function\fP)(PCRE2_SPTR, PCRE2_SIZE," .B " PCRE2_UCHAR *, PCRE2_SIZE," .B " int, void *)," .B " void *\fIcallout_data\fP);" .fi .sp The \fBpcre2_set_substitute_case_callout()\fP function can be used to specify a callout function for \fBpcre2_substitute()\fP to use when performing case transformations. This does not affect any case insensitivity behaviour when performing a match, but only the user-visible transformations performed when processing a substitution such as: .sp pcre2_substitute(..., "\e\eU$1", ...) .P The default case transformations applied by PCRE2 are reasonably complete, and, in UTF or UCP mode, perform the simple locale-invariant case transformations as specified by Unicode. 
This is suitable for the internal (invisible) case-equivalence procedures used during pattern matching, but an application may wish to use more sophisticated locale-aware processing for the user-visible substitution transformations. .P One example implementation of the \fIcallout_function\fP using the ICU library would be: .sp .nf PCRE2_SIZE icu_case_callout( PCRE2_SPTR input, PCRE2_SIZE input_len, PCRE2_UCHAR *output, PCRE2_SIZE output_cap, int to_case, void *data_ptr) { UErrorCode err = U_ZERO_ERROR; int32_t r = to_case == PCRE2_SUBSTITUTE_CASE_LOWER ? u_strToLower(output, output_cap, input, input_len, NULL, &err) : to_case == PCRE2_SUBSTITUTE_CASE_UPPER ? u_strToUpper(output, output_cap, input, input_len, NULL, &err) : u_strToTitle(output, output_cap, input, input_len, &first_char_only, NULL, &err); if (U_FAILURE(err)) return (~(PCRE2_SIZE)0); return r; } .fi .P The first and second arguments of the case callout function are the Unicode string to transform. .P The third and fourth arguments are the output buffer and its capacity. .P The fifth is one of the constants PCRE2_SUBSTITUTE_CASE_LOWER, PCRE2_SUBSTITUTE_CASE_UPPER, or PCRE2_SUBSTITUTE_CASE_TITLE_FIRST. PCRE2_SUBSTITUTE_CASE_LOWER and PCRE2_SUBSTITUTE_CASE_UPPER are passed to the callout to indicate that the case of the entire callout input should be case-transformed. PCRE2_SUBSTITUTE_CASE_TITLE_FIRST is passed to indicate that only the first character or glyph should be transformed to Unicode titlecase and the rest to Unicode lowercase (note that titlecasing sometimes uses Unicode properties to titlecase each word in a string; but PCRE2 is requesting that only the single leading character is to be titlecased). .P The sixth argument is the \fIcallout_data\fP supplied to \fBpcre2_set_substitute_case_callout()\fP. .P The resulting string in the destination buffer may be larger or smaller than the input, if the casing rules merge or split characters. 
The return value is the length required for the output string. If a buffer of sufficient size was provided to the callout, then the result must be written to the buffer and the number of code units returned. If the result does not fit in the provided buffer, then the required capacity must be returned and PCRE2 will not make use of the output buffer. PCRE2 provides input and output buffers which overlap, so the callout must support this by suitable internal buffering. .P Alternatively, if the callout wishes to indicate an error, then it may return (~(PCRE2_SIZE)0). In this case pcre2_substitute() will immediately fail with error PCRE2_ERROR_REPLACECASE. .P When a case callout is combined with the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option, there are situations when pcre2_substitute() will return an underestimate of the required buffer size. If you call pcre2_substitute() once with PCRE2_SUBSTITUTE_OVERFLOW_LENGTH, and the input buffer is too small for the replacement string to be constructed, then instead of calling the case callout, pcre2_substitute() will make an estimate of the required buffer size. The second call should also pass PCRE2_SUBSTITUTE_OVERFLOW_LENGTH, because that second call is not guaranteed to succeed either, if the case callout requires more buffer space than expected. The caller must make repeated attempts in a loop. . . .SH "DUPLICATE CAPTURE GROUP NAMES" .rs .sp .nf .B int pcre2_substring_nametable_scan(const pcre2_code *\fIcode\fP, .B " PCRE2_SPTR \fIname\fP, PCRE2_SPTR *\fIfirst\fP, PCRE2_SPTR *\fIlast\fP);" .fi .P When a pattern is compiled with the PCRE2_DUPNAMES option, names for capture groups are not required to be unique. Duplicate names are always allowed for groups with the same number, created by using the (?| feature. Indeed, if such groups are named, they are required to use the same names. .P Normally, patterns that use duplicate names are such that in any one match, only one of each set of identically-named groups participates. 
An example is shown in the .\" HREF \fBpcre2pattern\fP .\" documentation. .P When duplicates are present, \fBpcre2_substring_copy_byname()\fP and \fBpcre2_substring_get_byname()\fP return the first substring corresponding to the given name that is set. Only if none are set is PCRE2_ERROR_UNSET returned. The \fBpcre2_substring_number_from_name()\fP function returns the error PCRE2_ERROR_NOUNIQUESUBSTRING when there are duplicate names. .P If you want to get full details of all captured substrings for a given name, you must use the \fBpcre2_substring_nametable_scan()\fP function. The first argument is the compiled pattern, and the second is the name. If the third and fourth arguments are NULL, the function returns a group number for a unique name, or PCRE2_ERROR_NOUNIQUESUBSTRING otherwise. .P When the third and fourth arguments are not NULL, they must be pointers to variables that are updated by the function. After it has run, they point to the first and last entries in the name-to-number table for the given name, and the function returns the length of each entry in code units. In both cases, PCRE2_ERROR_NOSUBSTRING is returned if there are no entries for the given name. .P The format of the name table is described .\" HTML .\" above .\" in the section entitled \fIInformation about a pattern\fP. Given all the relevant entries for the name, you can extract each of their numbers, and hence the captured data. . . .SH "FINDING ALL POSSIBLE MATCHES AT ONE POSITION" .rs .sp The traditional matching function uses a similar algorithm to Perl, which stops when it finds the first match at a given point in the subject. If you want to find all possible matches, or the longest possible match at a given position, consider using the alternative matching function (see below) instead. If you cannot use the alternative function, you can kludge it up by making use of the callout facility, which is described in the .\" HREF \fBpcre2callout\fP .\" documentation.
.P What you have to do is to insert a callout right at the end of the pattern. When your callout function is called, extract and save the current matched substring. Then return 1, which forces \fBpcre2_match()\fP to backtrack and try other alternatives. Ultimately, when it runs out of matches, \fBpcre2_match()\fP will yield PCRE2_ERROR_NOMATCH. . . .\" HTML .SH "MATCHING A PATTERN: THE ALTERNATIVE FUNCTION" .rs .sp .nf .B int pcre2_dfa_match(const pcre2_code *\fIcode\fP, PCRE2_SPTR \fIsubject\fP, .B " PCRE2_SIZE \fIlength\fP, PCRE2_SIZE \fIstartoffset\fP," .B " uint32_t \fIoptions\fP, pcre2_match_data *\fImatch_data\fP," .B " pcre2_match_context *\fImcontext\fP," .B " int *\fIworkspace\fP, PCRE2_SIZE \fIwscount\fP);" .fi .P The function \fBpcre2_dfa_match()\fP is called to match a subject string against a compiled pattern, using a matching algorithm that scans the subject string just once (not counting lookaround assertions), and does not backtrack (except when processing lookaround assertions). This has different characteristics to the normal algorithm, and is not compatible with Perl. Some of the features of PCRE2 patterns are not supported. Nevertheless, there are times when this kind of matching can be useful. For a discussion of the two matching algorithms, and a list of features that \fBpcre2_dfa_match()\fP does not support, see the .\" HREF \fBpcre2matching\fP .\" documentation. .P The arguments for the \fBpcre2_dfa_match()\fP function are the same as for \fBpcre2_match()\fP, plus two extras. The ovector within the match data block is used in a different way, and this is described below. The other common arguments are used in the same way as for \fBpcre2_match()\fP, so their description is not repeated here. .P The two additional arguments provide workspace for the function. The workspace vector should contain at least 20 elements. It is used for keeping track of multiple paths through the pattern tree. 
More workspace is needed for patterns and subjects where there are a lot of potential matches. .P Here is an example of a simple call to \fBpcre2_dfa_match()\fP: .sp int wspace[20]; pcre2_match_data *md = pcre2_match_data_create(4, NULL); int rc = pcre2_dfa_match( re, /* result of pcre2_compile() */ "some string", /* the subject string */ 11, /* the length of the subject string */ 0, /* start at offset 0 in the subject */ 0, /* default options */ md, /* the match data block */ NULL, /* a match context; NULL means use defaults */ wspace, /* working space vector */ 20); /* number of elements (NOT size in bytes) */ . .SS "Option bits for \fBpcre2_dfa_match()\fP" .rs .sp The unused bits of the \fIoptions\fP argument for \fBpcre2_dfa_match()\fP must be zero. The only bits that may be set are PCRE2_ANCHORED, PCRE2_COPY_MATCHED_SUBJECT, PCRE2_ENDANCHORED, PCRE2_NOTBOL, PCRE2_NOTEOL, PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART, PCRE2_NO_UTF_CHECK, PCRE2_PARTIAL_HARD, PCRE2_PARTIAL_SOFT, PCRE2_DFA_SHORTEST, and PCRE2_DFA_RESTART. All but the last four of these are exactly the same as for \fBpcre2_match()\fP, so their description is not repeated here. .sp PCRE2_PARTIAL_HARD PCRE2_PARTIAL_SOFT .sp These have the same general effect as they do for \fBpcre2_match()\fP, but the details are slightly different. When PCRE2_PARTIAL_HARD is set for \fBpcre2_dfa_match()\fP, it returns PCRE2_ERROR_PARTIAL if the end of the subject is reached and there is still at least one matching possibility that requires additional characters. This happens even if some complete matches have already been found. When PCRE2_PARTIAL_SOFT is set, the return code PCRE2_ERROR_NOMATCH is converted into PCRE2_ERROR_PARTIAL if the end of the subject is reached, there have been no complete matches, but there is still at least one matching possibility. The portion of the string that was inspected when the longest partial match was found is set as the first matching string in both cases. 
There is a more detailed discussion of partial and multi-segment matching, with examples, in the .\" HREF \fBpcre2partial\fP .\" documentation. .sp PCRE2_DFA_SHORTEST .sp Setting the PCRE2_DFA_SHORTEST option causes the matching algorithm to stop as soon as it has found one match. Because of the way the alternative algorithm works, this is necessarily the shortest possible match at the first possible matching point in the subject string. .sp PCRE2_DFA_RESTART .sp When \fBpcre2_dfa_match()\fP returns a partial match, it is possible to call it again, with additional subject characters, and have it continue with the same match. The PCRE2_DFA_RESTART option requests this action; when it is set, the \fIworkspace\fP and \fIwscount\fP options must reference the same vector as before because data about the match so far is left in them after a partial match. There is more discussion of this facility in the .\" HREF \fBpcre2partial\fP .\" documentation. . . .SS "Successful returns from \fBpcre2_dfa_match()\fP" .rs .sp When \fBpcre2_dfa_match()\fP succeeds, it may have matched more than one substring in the subject. Note, however, that all the matches from one run of the function start at the same point in the subject. The shorter matches are all initial substrings of the longer matches. For example, if the pattern .sp <.*> .sp is matched against the string .sp This is <something> <something else> <something further> no more .sp the three matched strings are .sp <something> <something else> <something further> <something> <something else> <something> .sp On success, the yield of the function is a number greater than zero, which is the number of matched substrings. The offsets of the substrings are returned in the ovector, and can be extracted by number in the same way as for \fBpcre2_match()\fP, but the numbers bear no relation to any capture groups that may exist in the pattern, because DFA matching does not support capturing. .P Calls to the convenience functions that extract substrings by name return the error PCRE2_ERROR_DFA_UFUNC (unsupported function) if used after a DFA match.
The convenience functions that extract substrings by number never return PCRE2_ERROR_NOSUBSTRING. .P The matched strings are stored in the ovector in reverse order of length; that is, the longest matching string is first. If there were too many matches to fit into the ovector, the yield of the function is zero, and the vector is filled with the longest matches. .P NOTE: PCRE2's "auto-possessification" optimization usually applies to character repeats at the end of a pattern (as well as internally). For example, the pattern "a\ed+" is compiled as if it were "a\ed++". For DFA matching, this means that only one possible match is found. If you really do want multiple matches in such cases, either use an ungreedy repeat such as "a\ed+?" or set the PCRE2_NO_AUTO_POSSESS option when compiling. . . .SS "Error returns from \fBpcre2_dfa_match()\fP" .rs .sp The \fBpcre2_dfa_match()\fP function returns a negative number when it fails. Many of the errors are the same as for \fBpcre2_match()\fP, as described .\" HTML .\" above. .\" There are in addition the following errors that are specific to \fBpcre2_dfa_match()\fP: .sp PCRE2_ERROR_DFA_UITEM .sp This return is given if \fBpcre2_dfa_match()\fP encounters an item in the pattern that it does not support, for instance, the use of \eC in a UTF mode or a backreference. .sp PCRE2_ERROR_DFA_UCOND .sp This return is given if \fBpcre2_dfa_match()\fP encounters a condition item that uses a backreference for the condition, or a test for recursion in a specific capture group. These are not supported. .sp PCRE2_ERROR_DFA_UINVALID_UTF .sp This return is given if \fBpcre2_dfa_match()\fP is called for a pattern that was compiled with PCRE2_MATCH_INVALID_UTF. This is not supported for DFA matching. .sp PCRE2_ERROR_DFA_WSSIZE .sp This return is given if \fBpcre2_dfa_match()\fP runs out of space in the \fIworkspace\fP vector. 
.sp PCRE2_ERROR_DFA_RECURSE .sp When a recursion or subroutine call is processed, the matching function calls itself recursively, using private memory for the ovector and \fIworkspace\fP. This error is given if the internal ovector is not large enough. This should be extremely rare, as a vector of size 1000 is used. .sp PCRE2_ERROR_DFA_BADRESTART .sp When \fBpcre2_dfa_match()\fP is called with the \fBPCRE2_DFA_RESTART\fP option, some plausibility checks are made on the contents of the workspace, which should contain data about the previous partial match. If any of these checks fail, this error is given. . . .SH "SEE ALSO" .rs .sp \fBpcre2build\fP(3), \fBpcre2callout\fP(3), \fBpcre2demo\fP(3), \fBpcre2matching\fP(3), \fBpcre2partial\fP(3), \fBpcre2posix\fP(3), \fBpcre2sample\fP(3), \fBpcre2unicode\fP(3). . . .SH AUTHOR .rs .sp .nf Philip Hazel Retired from University Computing Service Cambridge, England. .fi . . .SH REVISION .rs .sp .nf Last updated: 29 October 2025 Copyright (c) 1997-2024 University of Cambridge. .fi ================================================ FILE: doc/pcre2build.3 ================================================ .TH PCRE2BUILD 3 "17 October 2025" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) . . .SH "BUILDING PCRE2" .rs .sp PCRE2 is distributed with a \fBconfigure\fP script that can be used to build the library in Unix-like environments using the Autotools applications. Also in the distribution are files to support building using \fBCMake\fP instead of \fBconfigure\fP. The text file .\" HTML .\" \fBREADME\fP .\" contains general information about building with Autotools (some of which is repeated below), and also has some comments about building on various operating systems. The files in the \fBvms\fP directory support building under OpenVMS.
There is a lot more information about building PCRE2 without using Autotools (including information about using \fBCMake\fP and building "by hand") in the text file called .\" HTML .\" \fBNON-AUTOTOOLS-BUILD\fP. .\" You should consult this file as well as the .\" HTML .\" \fBREADME\fP .\" file if you are building in a non-Unix-like environment. . . .SH "PCRE2 BUILD-TIME OPTIONS" .rs .sp The rest of this document describes the optional features of PCRE2 that can be selected when the library is compiled. It assumes use of the \fBconfigure\fP script, where the optional features are selected or deselected by providing options to \fBconfigure\fP before running the \fBmake\fP command. However, the same options can be selected in both Unix-like and non-Unix-like environments if you are using \fBCMake\fP instead of \fBconfigure\fP to build PCRE2. .P If you are not using Autotools or \fBCMake\fP, option selection can be done by editing the \fBconfig.h\fP file, or by passing parameter settings to the compiler, as described in .\" HTML .\" \fBNON-AUTOTOOLS-BUILD\fP. .\" .P The complete list of options for \fBconfigure\fP (which includes the standard ones such as the selection of the installation directory) can be obtained by running .sp ./configure --help .sp The following sections include descriptions of "on/off" options whose names begin with --enable or --disable. Because of the way that \fBconfigure\fP works, --enable and --disable always come in pairs, so the complementary option always exists as well, but as it specifies the default, it is not described. Options that specify values have names that start with --with. At the end of a \fBconfigure\fP run, a summary of the configuration is output. . . .SH "BUILDING 8-BIT, 16-BIT AND 32-BIT LIBRARIES" .rs .sp By default, a library called \fBlibpcre2-8\fP is built, containing functions that take string arguments contained in arrays of bytes, interpreted either as single-byte characters, or UTF-8 strings. 
You can also build two other libraries, called \fBlibpcre2-16\fP and \fBlibpcre2-32\fP, which process strings that are contained in arrays of 16-bit and 32-bit code units, respectively. These can be interpreted either as single-unit characters or UTF-16/UTF-32 strings. To build these additional libraries, add one or both of the following to the \fBconfigure\fP command: .sp --enable-pcre2-16 --enable-pcre2-32 .sp If you do not want the 8-bit library, add .sp --disable-pcre2-8 .sp as well. At least one of the three libraries must be built. Note that the POSIX wrapper is for the 8-bit library only, and that \fBpcre2grep\fP is an 8-bit program. Neither of these is built if you select only the 16-bit or 32-bit libraries. . . .SH "BUILDING SHARED AND STATIC LIBRARIES" .rs .sp The Autotools PCRE2 building process uses \fBlibtool\fP to build both shared and static libraries by default. You can suppress an unwanted library by adding one of .sp --disable-shared --disable-static .sp to the \fBconfigure\fP command. Setting --disable-shared ensures that PCRE2 libraries are built as static libraries. The binaries that are then created as part of the build process (for example, \fBpcre2test\fP and \fBpcre2grep\fP) are linked statically with one or more PCRE2 libraries, but may also be dynamically linked with other libraries such as \fBlibc\fP. If you want these binaries to be fully statically linked, you can set LDFLAGS like this: .sp LDFLAGS=--static ./configure --disable-shared .sp Note the two hyphens in --static. Of course, this works only if static versions of all the relevant libraries are available for linking. . . .SH "UNICODE AND UTF SUPPORT" .rs .sp By default, PCRE2 is built with support for Unicode and UTF character strings. To build it without Unicode support, add .sp --disable-unicode .sp to the \fBconfigure\fP command. This setting applies to all three libraries.
It is not possible to build one library with Unicode support and another without in the same configuration. .P Of itself, Unicode support does not make PCRE2 treat strings as UTF-8, UTF-16 or UTF-32. To do that, applications that use the library can set the PCRE2_UTF option when they call \fBpcre2_compile()\fP to compile a pattern. Alternatively, patterns may be started with (*UTF) unless the application has locked this out by setting PCRE2_NEVER_UTF. .P UTF support allows the libraries to process character code points up to 0x10ffff in the strings that they handle. Unicode support also gives access to the Unicode properties of characters, using pattern escapes such as \eP, \ep, and \eX. Only the general category properties such as \fILu\fP and \fINd\fP, script names, and some bi-directional and binary properties are supported. Details are given in the .\" HREF \fBpcre2pattern\fP .\" documentation. .P Pattern escapes such as \ed and \ew do not by default make use of Unicode properties. The application can request that they do by setting the PCRE2_UCP option. Unless the application has set PCRE2_NEVER_UCP, a pattern may also request this by starting with (*UCP). . . .SH "DISABLING THE USE OF \eC" .rs .sp The \eC escape sequence, which matches a single code unit, even in a UTF mode, can cause unpredictable behaviour because it may leave the current matching point in the middle of a multi-code-unit character. The application can lock it out by setting the PCRE2_NEVER_BACKSLASH_C option when calling \fBpcre2_compile()\fP. There is also a build-time option .sp --enable-never-backslash-C .sp (note the upper case C) which locks out the use of \eC entirely. . . .SH "JUST-IN-TIME COMPILER SUPPORT" .rs .sp Just-in-time (JIT) compiler support is included in the build by specifying .sp --enable-jit .sp This support is available only for certain hardware architectures. If this option is set for an unsupported architecture, a building error occurs. 
If in doubt, use .sp --enable-jit=auto .sp which enables JIT only if the current hardware is supported. You can check if JIT is enabled in the configuration summary that is output at the end of a \fBconfigure\fP run. If you are enabling JIT under SELinux you may also want to add .sp --enable-jit-sealloc .sp which enables the use of an execmem allocator in JIT that is compatible with SELinux. This has no effect if JIT is not enabled. See the .\" HREF \fBpcre2jit\fP .\" documentation for a discussion of JIT usage. When JIT support is enabled, \fBpcre2grep\fP automatically makes use of it, unless you add .sp --disable-pcre2grep-jit .sp to the \fBconfigure\fP command. . . .SH "NEWLINE RECOGNITION" .rs .sp By default, PCRE2 interprets the linefeed (LF) character as indicating the end of a line. This is the normal newline character on Unix-like systems. You can compile PCRE2 to use carriage return (CR) instead, by adding .sp --enable-newline-is-cr .sp to the \fBconfigure\fP command. There is also an --enable-newline-is-lf option, which explicitly specifies linefeed as the newline character. .P Alternatively, you can specify that line endings are to be indicated by the two-character sequence CRLF (CR immediately followed by LF). If you want this, add .sp --enable-newline-is-crlf .sp to the \fBconfigure\fP command. There is a fourth option, specified by .sp --enable-newline-is-anycrlf .sp which causes PCRE2 to recognize any of the three sequences CR, LF, or CRLF as indicating a line ending. A fifth option, specified by .sp --enable-newline-is-any .sp causes PCRE2 to recognize any Unicode newline sequence. The Unicode newline sequences are the three just mentioned, plus the single characters VT (vertical tab, U+000B), FF (form feed, U+000C), NEL (next line, U+0085), LS (line separator, U+2028), and PS (paragraph separator, U+2029). The final option is .sp --enable-newline-is-nul .sp which causes NUL (binary zero) to be set as the default line-ending character. 
.P Whatever default line ending convention is selected when PCRE2 is built can be overridden by applications that use the library. At build time it is recommended to use the standard for your operating system. . . .SH "WHAT \eR MATCHES" .rs .sp By default, the sequence \eR in a pattern matches any Unicode newline sequence, independently of what has been selected as the line ending sequence. If you specify .sp --enable-bsr-anycrlf .sp the default is changed so that \eR matches only CR, LF, or CRLF. Whatever is selected when PCRE2 is built can be overridden by applications that use the library. . . .SH "HANDLING VERY LARGE PATTERNS" .rs .sp Within a compiled pattern, offset values are used to point from one part to another (for example, from an opening parenthesis to an alternation metacharacter). By default, in the 8-bit and 16-bit libraries, two-byte values are used for these offsets, leading to a maximum size for a compiled pattern of around 64 thousand code units. This is sufficient to handle all but the most gigantic patterns. Nevertheless, some people do want to process truly enormous patterns, so it is possible to compile PCRE2 to use three-byte or four-byte offsets by adding a setting such as .sp --with-link-size=3 .sp to the \fBconfigure\fP command. The value given must be 2, 3, or 4. For the 16-bit library, a value of 3 is rounded up to 4. In these libraries, using longer offsets slows down the operation of PCRE2 because it has to load additional data when handling them. For the 32-bit library the value is always 4 and cannot be overridden; the value of --with-link-size is ignored. . . .SH "LIMITING PCRE2 RESOURCE USAGE" .rs .sp The \fBpcre2_match()\fP function increments a counter each time it goes round its main loop. Putting a limit on this counter controls the amount of computing resource used by a single call to \fBpcre2_match()\fP. The limit can be changed at run time, as described in the .\" HREF \fBpcre2api\fP .\" documentation. 
The default is 10 million, but this can be changed by adding a setting such as .sp --with-match-limit=500000 .sp to the \fBconfigure\fP command. This setting also applies to the \fBpcre2_dfa_match()\fP matching function, and to JIT matching (though the counting is done differently). .P The \fBpcre2_match()\fP function uses heap memory to record backtracking points. The more nested backtracking points there are (that is, the deeper the search tree), the more memory is needed. There is an upper limit, specified in kibibytes (units of 1024 bytes). This limit can be changed at run time, as described in the .\" HREF \fBpcre2api\fP .\" documentation. The default limit (in effect unlimited) is 20 million. You can change this by a setting such as .sp --with-heap-limit=500 .sp which limits the amount of heap to 500 KiB. This limit applies only to interpretive matching in \fBpcre2_match()\fP and \fBpcre2_dfa_match()\fP, which may also use the heap for internal workspace when processing complicated patterns. This limit does not apply when JIT (which has its own memory arrangements) is used. .P You can also explicitly limit the depth of nested backtracking in the \fBpcre2_match()\fP interpreter. This limit defaults to the value that is set for --with-match-limit. You can set a lower default limit by adding, for example, .sp --with-match-limit-depth=10000 .sp to the \fBconfigure\fP command. This value can be overridden at run time. This depth limit indirectly limits the amount of heap memory that is used, but because the size of each backtracking "frame" depends on the number of capturing parentheses in a pattern, the amount of heap that is used before the limit is reached varies from pattern to pattern. This limit was more useful in versions before 10.30, where function recursion was used for backtracking. .P As well as applying to \fBpcre2_match()\fP, the depth limit also controls the depth of recursive function calls in \fBpcre2_dfa_match()\fP. 
These are used for lookaround assertions, atomic groups, and recursion within patterns. The limit does not apply to JIT matching. . . .SH "LIMITING VARIABLE-LENGTH LOOKBEHIND ASSERTIONS" .rs .sp Lookbehind assertions in which one or more branches can match a variable number of characters are supported only if there is a maximum matching length for each top-level branch. There is a limit to this maximum that defaults to 255 characters. You can alter this default by a setting such as .sp --with-max-varlookbehind=100 .sp The limit can be changed at runtime by calling \fBpcre2_set_max_varlookbehind()\fP. Lookbehind assertions in which every branch matches a fixed number of characters (not necessarily all the same) are not constrained by this limit. . . .\" HTML .SH "CREATING CHARACTER TABLES AT BUILD TIME" .rs .sp PCRE2 uses fixed tables for processing characters whose code points are less than 256. By default, PCRE2 is built with a set of tables that are distributed in the file \fIsrc/pcre2_chartables.c.dist\fP. These tables are for ASCII codes only. If you add .sp --enable-rebuild-chartables .sp to the \fBconfigure\fP command, the distributed tables are no longer used. Instead, a program called \fBpcre2_dftables\fP is compiled and run. This outputs the source for a new set of tables, created in the default locale of your C run-time system. This method of replacing the tables does not work if you are cross compiling, because \fBpcre2_dftables\fP needs to be run on the local host and therefore not compiled with the cross compiler. .P If you need to create alternative tables when cross compiling, you will have to do so "by hand". There may also be other reasons for creating tables manually.
To cause \fBpcre2_dftables\fP to be built on the local host, run a normal compiling command, and then run the program with the output file as its argument, for example: .sp cc src/pcre2_dftables.c -o pcre2_dftables ./pcre2_dftables src/pcre2_chartables.c .sp This builds the tables in the default locale of the local host. If you want to specify a locale, you must use the -L option: .sp LC_ALL=fr_FR ./pcre2_dftables -L src/pcre2_chartables.c .sp You can also specify -b (with or without -L). This causes the tables to be written in binary instead of as source code. A set of binary tables can be loaded into memory by an application and passed to \fBpcre2_compile()\fP in the same way as tables created by calling \fBpcre2_maketables()\fP. The tables are just a string of bytes, independent of hardware characteristics such as endianness. This means they can be bundled with an application that runs in different environments, to ensure consistent behaviour. . . .SH "USING EBCDIC CODE" .rs .sp PCRE2 assumes by default that it will run in an environment where the character code is ASCII or Unicode, which is a superset of ASCII. This is the case for most computer operating systems. PCRE2 can, however, be compiled to run in an 8-bit EBCDIC environment by adding .sp --enable-ebcdic --disable-unicode .sp to the \fBconfigure\fP command. You should only use it if you know that you are in an EBCDIC environment (for example, an IBM mainframe operating system). .P This setting implies --enable-rebuild-chartables, in order to ensure that you have the correct default character tables for your system's codepage. There is an exception when you set --enable-ebcdic-ignoring-compiler (see below), which allows using a default set of EBCDIC 1047 character tables rather than forcing use of --enable-rebuild-chartables. .P It is not supported to enable both EBCDIC input and either ASCII or UTF-8/16/32 in the same build of the library. 
When PCRE2 is built with EBCDIC support, it always operates in EBCDIC, and consequently --enable-unicode and --enable-ebcdic are mutually exclusive. .P The EBCDIC character that corresponds to an ASCII LF is assumed to have the value 0x15 by default. However, in some EBCDIC environments, 0x25 is used. In such an environment you should use .sp --enable-ebcdic-nl25 .sp (which implies --enable-ebcdic). The EBCDIC character for CR has the same value as in ASCII, namely, 0x0d. Whichever of 0x15 and 0x25 is \fInot\fP chosen as LF is made to correspond to the Unicode NEL character (which, in Unicode, is 0x85). .P The options that select newline behaviour, such as --enable-newline-is-cr, and equivalent run-time options, refer to these character values in an EBCDIC environment. .P On systems requiring an EBCDIC build of PCRE2, the compiler should be set to use the correct codepage, so that C character literals such as 'z' use the correct numeric value for whichever EBCDIC codepage is in use. (PCRE2 cannot support multiple EBCDIC codepages dynamically.) However, if this is not possible, then you can use .sp --enable-ebcdic-ignoring-compiler .sp in order to disregard the compiler's codepage, and force PCRE2 to use numeric constants corresponding to the EBCDIC 1047 codepage instead. This can be used to build (or test) EBCDIC support on an ASCII/UTF-8 system such as Linux. . . .SH "PCRE2GREP SUPPORT FOR EXTERNAL SCRIPTS" .rs .sp By default \fBpcre2grep\fP supports the use of callouts with string arguments within the patterns it is matching. There are two kinds: one that generates output using local code, and another that calls an external program or script. If --disable-pcre2grep-callout-fork is added to the \fBconfigure\fP command, only the first kind of callout is supported; if --disable-pcre2grep-callout is used, all callouts are completely ignored. For more details of \fBpcre2grep\fP callouts, see the .\" HREF \fBpcre2grep\fP .\" documentation. . .
.SH "PCRE2GREP OPTIONS FOR COMPRESSED FILE SUPPORT" .rs .sp By default, \fBpcre2grep\fP reads all files as plain text. You can build it so that it recognizes files whose names end in \fB.gz\fP or \fB.bz2\fP, and reads them with \fBlibz\fP or \fBlibbz2\fP, respectively, by adding one or both of .sp --enable-pcre2grep-libz --enable-pcre2grep-libbz2 .sp to the \fBconfigure\fP command. These options naturally require that the relevant libraries are installed on your system. Configuration will fail if they are not. . . .SH "PCRE2GREP BUFFER SIZE" .rs .sp \fBpcre2grep\fP uses an internal buffer to hold a "window" on the file it is scanning, in order to be able to output "before" and "after" lines when it finds a match. The default starting size of the buffer is 20KiB. The buffer itself is three times this size, but because of the way it is used for holding "before" lines, the longest line that is guaranteed to be processable is the notional buffer size. If a longer line is encountered, \fBpcre2grep\fP automatically expands the buffer, up to a specified maximum size, whose default is 1MiB or the starting size, whichever is the larger. You can change the default parameter values by adding, for example, .sp --with-pcre2grep-bufsize=51200 --with-pcre2grep-max-bufsize=2097152 .sp to the \fBconfigure\fP command. The caller of \fBpcre2grep\fP can override these values by using --buffer-size and --max-buffer-size on the command line. . . .SH "PCRE2TEST OPTION FOR LIBREADLINE SUPPORT" .rs .sp If you add one of .sp --enable-pcre2test-libreadline --enable-pcre2test-libedit .sp to the \fBconfigure\fP command, \fBpcre2test\fP is linked with the \fBlibreadline\fP or \fBlibedit\fP library, respectively, and when its input is from a terminal, it reads it using the \fBreadline()\fP function. This provides line-editing and history facilities. Note that \fBlibreadline\fP is GPL-licensed, so if you distribute a binary of \fBpcre2test\fP linked in this way, there may be licensing issues.
These can be avoided by linking instead with \fBlibedit\fP, which has a BSD licence. .P Setting --enable-pcre2test-libreadline causes the \fB-lreadline\fP option to be added to the \fBpcre2test\fP build. In many operating environments with a system-installed readline library this is sufficient. However, in some environments (e.g. if an unmodified distribution version of readline is in use), some extra configuration may be necessary. The INSTALL file for \fBlibreadline\fP says this: .sp "Readline uses the termcap functions, but does not link with the termcap or curses library itself, allowing applications which link with readline the to choose an appropriate library." .sp If your environment has not been set up so that an appropriate library is automatically included, you may need to add something like .sp LIBS="-lncurses" .sp immediately before the \fBconfigure\fP command. . . .SH "INCLUDING DEBUGGING CODE" .rs .sp If you add .sp --enable-debug .sp to the \fBconfigure\fP command, additional debugging code is included in the build. This feature is intended for use by the PCRE2 maintainers. . . .SH "DEBUGGING WITH VALGRIND SUPPORT" .rs .sp If you add .sp --enable-valgrind .sp to the \fBconfigure\fP command, PCRE2 will use valgrind annotations to mark certain memory regions as unaddressable. This allows it to detect invalid memory accesses, and is mostly useful for debugging PCRE2 itself. . . .SH "CODE COVERAGE REPORTING" .rs .sp If your C compiler is gcc, you can build a version of PCRE2 that can generate a code coverage report for its test suite. To enable this, you must install \fBlcov\fP version 1.6 or above. Then specify .sp --enable-coverage .sp to the \fBconfigure\fP command and build PCRE2 in the usual way. .P Note that using \fBccache\fP (a caching C compiler) is incompatible with code coverage reporting. 
If you have configured \fBccache\fP to run automatically on your system, you must set the environment variable .sp CCACHE_DISABLE=1 .sp before running \fBmake\fP to build PCRE2, so that \fBccache\fP is not used. .P When --enable-coverage is used, the following additional targets are added to the \fIMakefile\fP: .sp make coverage .sp This creates a fresh coverage report for the PCRE2 test suite. It is equivalent to running "make coverage-reset", "make coverage-baseline", "make check", and then "make coverage-report". .sp make coverage-reset .sp This zeroes the coverage counters, but does nothing else. .sp make coverage-baseline .sp This captures baseline coverage information. .sp make coverage-report .sp This creates the coverage report. .sp make coverage-clean-report .sp This removes the generated coverage report without cleaning the coverage data itself. .sp make coverage-clean-data .sp This removes the captured coverage data without removing the coverage files created at compile time (*.gcno). .sp make coverage-clean .sp This cleans all coverage data including the generated coverage report. For more information about code coverage, see the \fBgcov\fP and \fBlcov\fP documentation. . . .SH "DISABLING THE Z AND T FORMATTING MODIFIERS" .rs .sp The C99 standard defines formatting modifiers z and t for size_t and ptrdiff_t values, respectively. By default, PCRE2 uses these modifiers in environments other than old versions of Microsoft Visual Studio when __STDC_VERSION__ is defined and has a value greater than or equal to 199901L (indicating support for C99). However, there is at least one environment that claims to be C99 but does not support these modifiers. If .sp --disable-percent-zt .sp is specified, no use is made of the z or t modifiers. Instead of %td or %zu, a suitable format is used depending on the size of long for the platform. . .
.SH "SUPPORT FOR FUZZERS" .rs .sp There is a special option for use by people who want to run fuzzing tests on PCRE2: .sp --enable-fuzz-support .sp At present this applies only to the 8-bit library. If set, it causes an extra library called libpcre2-fuzzsupport.a to be built, but not installed. This contains a single function called LLVMFuzzerTestOneInput() whose arguments are a pointer to a string and the length of the string. When called, this function tries to compile the string as a pattern, and if that succeeds, to match it. This is done both with no options and with some random options bits that are generated from the string. .P Setting --enable-fuzz-support also causes a binary called \fBpcre2fuzzcheck\fP to be created. This is normally run under valgrind or used when PCRE2 is compiled with address sanitizing enabled. It calls the fuzzing function and outputs information about what it is doing. The input strings are specified by arguments: if an argument starts with "=" the rest of it is a literal input string. Otherwise, it is assumed to be a file name, and the contents of the file are the test string. . . .SH "OBSOLETE OPTION" .rs .sp In versions of PCRE2 prior to 10.30, there were two ways of handling backtracking in the \fBpcre2_match()\fP function. The default was to use the system stack, but if .sp --disable-stack-for-recursion .sp was set, memory on the heap was used. From release 10.30 onwards this has changed (the stack is no longer used) and this option now does nothing except give a warning. . .SH "SEE ALSO" .rs .sp \fBpcre2api\fP(3), \fBpcre2-config\fP(3). . . .SH AUTHOR .rs .sp .nf Philip Hazel Retired from University Computing Service Cambridge, England. .fi . . .SH REVISION .rs .sp .nf Last updated: 17 October 2025 Copyright (c) 1997-2024 University of Cambridge. 
.fi ================================================ FILE: doc/pcre2callout.3 ================================================ .TH PCRE2CALLOUT 3 "26 February 2025" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS .rs .sp .B #include <pcre2.h> .PP .nf .B int (*pcre2_callout)(pcre2_callout_block *, void *); .sp .B int pcre2_callout_enumerate(const pcre2_code *\fIcode\fP, .B " int (*\fIcallback\fP)(pcre2_callout_enumerate_block *, void *)," .B " void *\fIuser_data\fP);" .fi . .SH DESCRIPTION .rs .sp PCRE2 provides a feature called "callout", which is a means of temporarily passing control to the caller of PCRE2 in the middle of pattern matching. The caller of PCRE2 provides an external function by putting its entry point in a match context (see \fBpcre2_set_callout()\fP in the .\" HREF \fBpcre2api\fP .\" documentation). .P When using the \fBpcre2_substitute()\fP function, an additional callout feature is available. This does a callout after each change to the subject string and is described in the .\" HREF \fBpcre2api\fP .\" documentation; the rest of this document is concerned with callouts during pattern matching. .P Within a regular expression, (?C) indicates a point at which the external function is to be called. Different callout points can be identified by putting a number less than 256 after the letter C. The default value is zero. Alternatively, the argument may be a delimited string. The starting delimiter must be one of ` ' " ^ % # $ { and the ending delimiter is the same as the start, except for {, where the ending delimiter is }. If the ending delimiter is needed within the string, it must be doubled. For example, this pattern has two callout points: .sp (?C1)abc(?C"some ""arbitrary"" text")def .sp If the PCRE2_AUTO_CALLOUT option bit is set when a pattern is compiled, PCRE2 automatically inserts callouts, all with number 255, before each item in the pattern except for immediately before or after an explicit callout.
For example, if PCRE2_AUTO_CALLOUT is used with the pattern .sp A(?C3)B .sp it is processed as if it were .sp (?C255)A(?C3)B(?C255) .sp Here is a more complicated example: .sp A(\ed{2}|--) .sp With PCRE2_AUTO_CALLOUT, this pattern is processed as if it were .sp (?C255)A(?C255)((?C255)\ed{2}(?C255)|(?C255)-(?C255)-(?C255))(?C255) .sp Notice that there is a callout before and after each parenthesis and alternation bar. If the pattern contains a conditional group whose condition is an assertion, an automatic callout is inserted immediately before the condition. Such a callout may also be inserted explicitly, for example: .sp (?(?C9)(?=a)ab|de) (?(?C%text%)(?!=d)ab|de) .sp This applies only to assertion conditions (because they are themselves independent groups). .P Callouts can be useful for tracking the progress of pattern matching. The .\" HREF \fBpcre2test\fP .\" program has a pattern qualifier (/auto_callout) that sets automatic callouts. When any callouts are present, the output from \fBpcre2test\fP indicates how the pattern is being matched. This is useful information when you are trying to optimize the performance of a particular pattern. . . .SH "MISSING CALLOUTS" .rs .sp You should be aware that, because of optimizations in the way PCRE2 compiles and matches patterns, callouts sometimes do not happen exactly as you might expect. . . .SS "Auto-possessification" .rs .sp At compile time, PCRE2 "auto-possessifies" repeated items when it knows that what follows cannot be part of the repeat. For example, a+[bc] is compiled as if it were a++[bc]. The \fBpcre2test\fP output when this pattern is compiled with PCRE2_ANCHORED and PCRE2_AUTO_CALLOUT and then applied to the string "aaaa" is: .sp --->aaaa +0 ^ a+ +2 ^ ^ [bc] No match .sp This indicates that when matching [bc] fails, there is no backtracking into a+ (because it is being treated as a++) and therefore the callouts that would be taken for the backtracks do not occur. 
You can disable the auto-possessify feature by passing PCRE2_NO_AUTO_POSSESS to \fBpcre2_compile()\fP, or starting the pattern with (*NO_AUTO_POSSESS). In this case, the output changes to this: .sp --->aaaa +0 ^ a+ +2 ^ ^ [bc] +2 ^ ^ [bc] +2 ^ ^ [bc] +2 ^^ [bc] No match .sp This time, when matching [bc] fails, the matcher backtracks into a+ and tries again, repeatedly, until a+ itself fails. . . .SS "Automatic .* anchoring" .rs .sp By default, an optimization is applied when .* is the first significant item in a pattern. If PCRE2_DOTALL is set, so that the dot can match any character, the pattern is automatically anchored. If PCRE2_DOTALL is not set, a match can start only after an internal newline or at the beginning of the subject, and \fBpcre2_compile()\fP remembers this. If a pattern has more than one top-level branch, automatic anchoring occurs if all branches are anchorable. .P This optimization is disabled, however, if .* is in an atomic group or if there is a backreference to the capture group in which it appears. It is also disabled if the pattern contains (*PRUNE) or (*SKIP). However, the presence of callouts does not affect it. .P For example, if the pattern .*\ed is compiled with PCRE2_AUTO_CALLOUT and applied to the string "aa", the \fBpcre2test\fP output is: .sp --->aa +0 ^ .* +2 ^ ^ \ed +2 ^^ \ed +2 ^ \ed No match .sp This shows that all match attempts start at the beginning of the subject. In other words, the pattern is anchored. You can disable this optimization by passing PCRE2_NO_DOTSTAR_ANCHOR to \fBpcre2_compile()\fP, or starting the pattern with (*NO_DOTSTAR_ANCHOR). In this case, the output changes to: .sp --->aa +0 ^ .* +2 ^ ^ \ed +2 ^^ \ed +2 ^ \ed +0 ^ .* +2 ^^ \ed +2 ^ \ed No match .sp This shows more match attempts, starting at the second subject character. Another optimization, described in the next section, means that there is no subsequent attempt to match with an empty subject. . . 
.SS "Other optimizations" .rs .sp Other optimizations that provide fast "no match" results also affect callouts. For example, if the pattern is .sp ab(?C4)cd .sp PCRE2 knows that any matching string must contain the letter "d". If the subject string is "abyz", the lack of "d" means that matching doesn't ever start, and the callout is never reached. However, with "abyd", though the result is still no match, the callout is obeyed. .P For most patterns PCRE2 also knows the minimum length of a matching string, and will immediately give a "no match" return without actually running a match if the subject is not long enough, or, for unanchored patterns, if it has been scanned far enough. .P You can disable these optimizations by passing the PCRE2_NO_START_OPTIMIZE option to \fBpcre2_compile()\fP, or by starting the pattern with (*NO_START_OPT). This slows down the matching process, but does ensure that callouts such as the example above are obeyed. . . .\" HTML .SH "THE CALLOUT INTERFACE" .rs .sp During matching, when PCRE2 reaches a callout point, if an external function is provided in the match context, it is called. This applies to both normal, DFA, and JIT matching. The first argument to the callout function is a pointer to a \fBpcre2_callout\fP block. The second argument is the void * callout data that was supplied when the callout was set up by calling \fBpcre2_set_callout()\fP (see the .\" HREF \fBpcre2api\fP .\" documentation). 
The callout block structure contains the following fields, not necessarily in this order: .sp uint32_t \fIversion\fP; uint32_t \fIcallout_number\fP; uint32_t \fIcapture_top\fP; uint32_t \fIcapture_last\fP; uint32_t \fIcallout_flags\fP; PCRE2_SIZE *\fIoffset_vector\fP; PCRE2_SPTR \fImark\fP; PCRE2_SPTR \fIsubject\fP; PCRE2_SIZE \fIsubject_length\fP; PCRE2_SIZE \fIstart_match\fP; PCRE2_SIZE \fIcurrent_position\fP; PCRE2_SIZE \fIpattern_position\fP; PCRE2_SIZE \fInext_item_length\fP; PCRE2_SIZE \fIcallout_string_offset\fP; PCRE2_SIZE \fIcallout_string_length\fP; PCRE2_SPTR \fIcallout_string\fP; .sp The \fIversion\fP field contains the version number of the block format. The current version is 2; the three callout string fields were added for version 1, and the \fIcallout_flags\fP field for version 2. If you are writing an application that might use an earlier release of PCRE2, you should check the version number before accessing any of these fields. The version number will increase in future if more fields are added, but the intention is never to remove any of the existing fields. . . .SS "Fields for numerical callouts" .rs .sp For a numerical callout, \fIcallout_string\fP is NULL, and \fIcallout_number\fP contains the number of the callout, in the range 0-255. This is the number that follows (?C for callouts that are part of the pattern; it is 255 for automatically generated callouts. . . .SS "Fields for string callouts" .rs .sp For callouts with string arguments, \fIcallout_number\fP is always zero, and \fIcallout_string\fP points to the string that is contained within the compiled pattern. Its length is given by \fIcallout_string_length\fP. Duplicated ending delimiters that were present in the original pattern string have been turned into single characters, but there is no other processing of the callout string argument. An additional code unit containing binary zero is present after the string, but is not included in the length.
The delimiter that was used to start the string is also stored within the pattern, immediately before the string itself. You can access this delimiter as \fIcallout_string\fP[-1] if you need it. .P The \fIcallout_string_offset\fP field is the code unit offset to the start of the callout argument string within the original pattern string. This is provided for the benefit of applications such as script languages that might need to report errors in the callout string within the pattern. . . .SS "Fields for all callouts" .rs .sp The remaining fields in the callout block are the same for both kinds of callout. .P The \fIoffset_vector\fP field is a pointer to a vector of capturing offsets (the "ovector"). You may read the elements in this vector, but you must not change any of them. .P For calls to \fBpcre2_match()\fP, the \fIoffset_vector\fP field is not (since release 10.30) a pointer to the actual ovector that was passed to the matching function in the match data block. Instead it points to an internal ovector of a size large enough to hold all possible captured substrings in the pattern. Note that whenever a recursion or subroutine call within a pattern completes, the capturing state is reset to what it was before. .P The \fIcapture_last\fP field contains the number of the most recently captured substring, and the \fIcapture_top\fP field contains one more than the number of the highest numbered captured substring so far. If no substrings have yet been captured, the value of \fIcapture_last\fP is 0 and the value of \fIcapture_top\fP is 1. The values of these fields do not always differ by one; for example, when the callout in the pattern ((a)(b))(?C2) is taken, \fIcapture_last\fP is 1 but \fIcapture_top\fP is 4. .P The contents of ovector[2] to ovector[<capture_top>*2-1] can be inspected in order to extract substrings that have been matched so far, in the same way as extracting substrings after a match has completed.
The values in ovector[0] and ovector[1] are always PCRE2_UNSET because the match is by definition not complete. Substrings that have not been captured but whose numbers are less than \fIcapture_top\fP also have both of their ovector slots set to PCRE2_UNSET. .P For DFA matching, the \fIoffset_vector\fP field points to the ovector that was passed to the matching function in the match data block for callouts at the top level, but to an internal ovector during the processing of pattern recursions, lookarounds, and atomic groups. However, these ovectors hold no useful information because \fBpcre2_dfa_match()\fP does not support substring capturing. The value of \fIcapture_top\fP is always 1 and the value of \fIcapture_last\fP is always 0 for DFA matching. .P The \fIsubject\fP and \fIsubject_length\fP fields contain copies of the values that were passed to the matching function. .P The \fIstart_match\fP field normally contains the offset within the subject at which the current match attempt started. However, if the escape sequence \eK has been encountered, this value is changed to reflect the modified starting point. If the pattern is not anchored, the callout function may be called several times from the same point in the pattern for different starting points in the subject. .P The \fIcurrent_position\fP field contains the offset within the subject of the current match pointer. .P The \fIpattern_position\fP field contains the offset in the pattern string to the next item to be matched. .P The \fInext_item_length\fP field contains the length of the next item to be processed in the pattern string. When the callout is at the end of the pattern, the length is zero. When the callout precedes an opening parenthesis, the length includes meta characters that follow the parenthesis. For example, in a callout before an assertion such as (?=ab) the length is 3. 
For an alternation bar or a closing parenthesis, the length is one, unless a closing parenthesis is followed by a quantifier, in which case its length is included. (This changed in release 10.23. In earlier releases, before an opening parenthesis the length was that of the entire group, and before an alternation bar or a closing parenthesis the length was zero.) .P The \fIpattern_position\fP and \fInext_item_length\fP fields are intended to help in distinguishing between different automatic callouts, which all have the same callout number. However, they are set for all callouts, and are used by \fBpcre2test\fP to show the next item to be matched when displaying callout information. .P In callouts from \fBpcre2_match()\fP the \fImark\fP field contains a pointer to the zero-terminated name of the most recently passed (*MARK), (*PRUNE), or (*THEN) item in the match, or NULL if no such items have been passed. Instances of (*PRUNE) or (*THEN) without a name do not obliterate a previous (*MARK). In callouts from the DFA matching function this field always contains NULL. .P The \fIcallout_flags\fP field is always zero in callouts from \fBpcre2_dfa_match()\fP or when JIT is being used. When \fBpcre2_match()\fP without JIT is used, the following bits may be set: .sp PCRE2_CALLOUT_STARTMATCH .sp This is set for the first callout after the start of matching for each new starting position in the subject. .sp PCRE2_CALLOUT_BACKTRACK .sp This is set if there has been a matching backtrack since the previous callout, or since the start of matching if this is the first callout from a \fBpcre2_match()\fP run. .P Both bits are set when a backtrack has caused a "bumpalong" to a new starting position in the subject. Output from \fBpcre2test\fP does not indicate the presence of these bits unless the \fBcallout_extra\fP modifier is set. 
.P The information in the \fBcallout_flags\fP field is provided so that applications can track and tell their users how matching with backtracking is done. This can be useful when trying to optimize patterns, or just to understand how PCRE2 works. There is no support in \fBpcre2_dfa_match()\fP because there is no backtracking in DFA matching, and there is no support in JIT because JIT is all about maximizing matching performance. In both these cases the \fBcallout_flags\fP field is always zero. . . .SH "RETURN VALUES FROM CALLOUTS" .rs .sp The external callout function returns an integer to PCRE2. If the value is zero, matching proceeds as normal. If the value is greater than zero, matching fails at the current point, but the testing of other matching possibilities goes ahead, just as if a lookahead assertion had failed. If the value is less than zero, the match is abandoned, and the matching function returns the negative value. .P Negative values should normally be chosen from the set of PCRE2_ERROR_xxx values. In particular, PCRE2_ERROR_NOMATCH forces a standard "no match" failure. The error number PCRE2_ERROR_CALLOUT is reserved for use by callout functions; it will never be used by PCRE2 itself. . . .SH "CALLOUT ENUMERATION" .rs .sp .nf .B int pcre2_callout_enumerate(const pcre2_code *\fIcode\fP, .B " int (*\fIcallback\fP)(pcre2_callout_enumerate_block *, void *)," .B " void *\fIuser_data\fP);" .fi .sp A script language that supports the use of string arguments in callouts might like to scan all the callouts in a pattern before running the match. This can be done by calling \fBpcre2_callout_enumerate()\fP. The first argument is a pointer to a compiled pattern, the second points to a callback function, and the third is arbitrary user data. The callback function is called for every callout in the pattern in the order in which they appear.
Its first argument is a pointer to a callout enumeration block, and its second argument is the \fIuser_data\fP value that was passed to \fBpcre2_callout_enumerate()\fP. The data block contains the following fields: .sp \fIversion\fP Block version number \fIpattern_position\fP Offset to next item in pattern \fInext_item_length\fP Length of next item in pattern \fIcallout_number\fP Number for numbered callouts \fIcallout_string_offset\fP Offset to string within pattern \fIcallout_string_length\fP Length of callout string \fIcallout_string\fP Points to callout string or is NULL .sp The version number is currently 0. It will increase if new fields are ever added to the block. The remaining fields are the same as their namesakes in the \fBpcre2_callout\fP block that is used for callouts during matching, as described .\" HTML .\" above. .\" .P Note that the value of \fIpattern_position\fP is unique for each callout. However, if a callout occurs inside a group that is quantified with a non-zero minimum or a fixed maximum, the group is replicated inside the compiled pattern. For example, a pattern such as /(a){2}/ is compiled as if it were /(a)(a)/. This means that the callout will be enumerated more than once, but with the same value for \fIpattern_position\fP in each case. .P The callback function should normally return zero. If it returns a non-zero value, scanning the pattern stops, and that value is returned from \fBpcre2_callout_enumerate()\fP. . . .SH AUTHOR .rs .sp .nf Philip Hazel Retired from University Computing Service Cambridge, England. .fi . . .SH REVISION .rs .sp .nf Last updated: 26 February 2025 Copyright (c) 1997-2024 University of Cambridge. 
.fi ================================================ FILE: doc/pcre2compat.3 ================================================ .TH PCRE2COMPAT 3 "02 June 2025" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH "DIFFERENCES BETWEEN PCRE2 AND PERL" .rs .sp This document describes some of the known differences in the ways that PCRE2 and Perl handle regular expressions. The differences described here are with respect to Perl version 5.38.0, but as both Perl and PCRE2 are continually changing, the information may at times be out of date. .P 1. When PCRE2_DOTALL (equivalent to Perl's /s qualifier) is not set, the behaviour of the '.' metacharacter differs from Perl. In PCRE2, '.' matches the next character unless it is the start of a newline sequence. This means that, if the newline setting is CR, CRLF, or NUL, '.' will match the code point LF (0x0A) in ASCII/Unicode environments, and NL (either 0x15 or 0x25) when using EBCDIC. In Perl, '.' appears never to match LF, even when 0x0A is not a newline indicator. .P 2. PCRE2 has only a subset of Perl's Unicode support. Details of what it does have are given in the .\" HREF \fBpcre2unicode\fP .\" page. .P 3. Like Perl, PCRE2 allows repeat quantifiers on parenthesized assertions, but they do not mean what you might think. For example, (?!a){3} does not assert that the next three characters are not "a". It just asserts that the next character is not "a" three times (in principle; PCRE2 optimizes this to run the assertion just once). Perl allows some repeat quantifiers on other assertions, for example, \eb* , but these do not seem to have any use. PCRE2 does not allow any kind of quantifier on non-lookaround assertions. .P 4. If a braced quantifier such as {1,2} appears where there is nothing to repeat (for example, at the start of a branch), PCRE2 raises an error whereas Perl treats the quantifier characters as literal. 
When a braced quantifier (...){min,max} has min > max, Perl treats it as an item which fails to match any portion of the subject (as no number of repetitions can meet the condition), and additionally issues a warning when in warning mode. PCRE2 has no warning features, so it gives an error in this case. .P 5. Capture groups that occur inside negative lookaround assertions are counted, but their entries in the offsets vector are set only when a negative assertion is a condition that has a matching branch (that is, the condition is false). Perl may set such capture groups in other circumstances. .P 6. The following Perl escape sequences are not supported: \eF, \el, \eL, \eu, \eU, and \eN when followed by a character name. \eN on its own, matching a non-newline character, and \eN{U+dd..}, matching a Unicode code point, are supported. The escapes that modify the case of following letters are implemented by Perl's general string-handling and are not part of its pattern matching engine. If any of these are encountered by PCRE2, an error is generated by default. However, if either of the PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX options is set, \eU and \eu are interpreted as ECMAScript interprets them. .P 7. The Perl escape sequences \ep, \eP, and \eX are supported only if PCRE2 is built with Unicode support (the default). The properties that can be tested with \ep and \eP are limited to the general category properties such as Lu and Nd, the derived properties Any and Lc (synonym L&), script names such as Greek or Han, Bidi_Class, Bidi_Control, and a few binary properties. Both PCRE2 and Perl support the Cs (surrogate) property, but in PCRE2 its use is limited. See the .\" HREF \fBpcre2pattern\fP .\" documentation for details. The long synonyms for property names that Perl supports (such as \ep{Letter}) are not supported by PCRE2, nor is it permitted to prefix any of these properties with "Is". .P 8. PCRE2 supports the \eQ...\eE escape for quoting substrings. 
Characters in between are treated as literals. However, this is slightly different from Perl in that $ and @ are also handled as literals inside the quotes. In Perl, they cause variable interpolation (PCRE2 does not have variables). Also, Perl does "double-quotish backslash interpolation" on any backslashes between \eQ and \eE which, its documentation says, "may lead to confusing results". PCRE2 treats a backslash between \eQ and \eE just like any other character. Note the following examples: .sp Pattern PCRE2 matches Perl matches .sp .\" JOIN \eQabc$xyz\eE abc$xyz abc followed by the contents of $xyz \eQabc\e$xyz\eE abc\e$xyz abc\e$xyz \eQabc\eE\e$\eQxyz\eE abc$xyz abc$xyz \eQA\eB\eE A\eB A\eB \eQ\e\eE \e \e\eE .sp The \eQ...\eE sequence is recognized both inside and outside character classes by both PCRE2 and Perl. Another difference from Perl is that any appearance of \eQ or \eE inside what might otherwise be a quantifier causes PCRE2 not to recognize the sequence as a quantifier. Perl recognizes a quantifier if (redundantly) either of the numbers is inside \eQ...\eE, but not if the separating comma is. When not recognized as a quantifier a sequence such as {\eQ1\eE,2} is treated as the literal string "{1,2}". .P 9. Fairly obviously, PCRE2 does not support the (?{code}) and (??{code}) constructions. However, PCRE2 does have a "callout" feature, which allows an external function to be called during pattern matching. See the .\" HREF \fBpcre2callout\fP .\" documentation for details. .P 10. Subroutine calls (whether recursive or not) were treated as atomic groups up to PCRE2 release 10.23, but from release 10.30 this changed, and backtracking into subroutine calls is now supported, as in Perl. .P 11. In PCRE2, if any of the backtracking control verbs are used in a group that is called as a subroutine (whether or not recursively), their effect is confined to that group; it does not extend to the surrounding pattern. This is not always the case in Perl. 
In particular, if (*THEN) is present in a group that is called as a subroutine, its action is limited to that group, even if the group does not contain any | characters. Note that such groups are processed as anchored at the point where they are tested. PCRE2 also confines all control verbs within atomic assertions, again including (*THEN) in assertions with only one branch. .P 12. If a pattern contains more than one backtracking control verb, the first one that is backtracked onto acts. For example, in the pattern A(*COMMIT)B(*PRUNE)C a failure in B triggers (*COMMIT), but a failure in C triggers (*PRUNE). Perl's behaviour is more complex; in many cases it is the same as PCRE2, but there are cases where it differs. .P 13. There are some differences that are concerned with the settings of captured strings when part of a pattern is repeated. For example, matching "aba" against the pattern /^(a(b)?)+$/ in Perl leaves $2 unset, but in PCRE2 it is set to "b". .P 14. PCRE2's handling of duplicate capture group numbers and names is not as general as Perl's. This is a consequence of the fact that PCRE2 works internally just with numbers, using an external table to translate between numbers and names. In particular, a pattern such as (?|(?<a>A)|(?<b>B)), where the two capture groups have the same number but different names, is not supported, and causes an error at compile time. If it were allowed, it would not be possible to distinguish which group matched, because both names map to capture group number 1. To avoid this confusing situation, an error is given at compile time. .P 15. Perl used to recognize comments in some places that PCRE2 does not, for example, between the ( and ? at the start of a group. If the /x modifier is set, Perl allowed white space between ( and ? though the latest Perls give an error (for a while it was just deprecated). There may still be some cases where Perl behaves differently. .P 16.
Perl, when in warning mode, gives warnings for character classes such as [A-\ed] or [a-[:digit:]]. It then treats the hyphens as literals. PCRE2 has no warning features, so it gives an error in these cases because they are almost certainly user mistakes. .P 17. In PCRE2, until release 10.45, the upper/lower case character properties Lu and Ll were not affected when case-independent matching was specified. Perl has changed in this respect, and PCRE2 has now changed to match. When caseless matching is in force, Lu, Ll, and Lt (title case) are all treated as Lc (cased letter). .P 18. From release 5.32.0, Perl locks out the use of \eK in lookaround assertions. From release 10.38 PCRE2 does the same by default. However, there is an option for re-enabling the previous behaviour. When this option is set, \eK is acted on when it occurs in positive assertions, but is ignored in negative assertions. .P 19. PCRE2 provides some extensions to the Perl regular expression facilities. Perl 5.10 included new features that were not in earlier versions of Perl, some of which (such as named parentheses) were in PCRE2 for some time before. This list is with respect to Perl 5.38: .sp (a) If PCRE2_DOLLAR_ENDONLY is set and PCRE2_MULTILINE is not set, the $ meta-character matches only at the very end of the string. .sp (b) A backslash followed by a letter with no special meaning is faulted. (Perl can be made to issue a warning.) .sp (c) If PCRE2_UNGREEDY is set, the greediness of the repetition quantifiers is inverted, that is, by default they are not greedy, but if followed by a question mark they are. .sp (d) PCRE2_ANCHORED can be used at matching time to force a pattern to be tried only at the first matching position in the subject string. .sp (e) The PCRE2_NOTBOL, PCRE2_NOTEOL, PCRE2_NOTEMPTY and PCRE2_NOTEMPTY_ATSTART options have no Perl equivalents. .sp (f) The \eR escape sequence can be restricted to match only CR, LF, or CRLF by the PCRE2_BSR_ANYCRLF option. 
.sp (g) The callout facility is PCRE2-specific. Perl supports codeblocks and variable interpolation, but not general hooks on every match. .sp (h) The partial matching facility is PCRE2-specific. .sp (i) The alternative matching function (\fBpcre2_dfa_match()\fP) matches in a different way and is not Perl-compatible. .sp (j) PCRE2 recognizes some special sequences such as (*CR) or (*NO_JIT) at the start of a pattern. These set overall options that cannot be changed within the pattern. .sp (k) PCRE2 supports non-atomic positive lookaround assertions. This is an extension to the lookaround facilities. The default, Perl-compatible lookarounds are atomic. .sp (l) There are three syntactical items in patterns that can refer to a capturing group by number: back references such as \eg{2}, subroutine calls such as (?3), and condition references such as (?(4)...). PCRE2 supports relative group numbers such as +2 and -4 in all three cases. Perl supports both plus and minus for subroutine calls, but only minus for back references, and no relative numbering at all for conditions. .sp (m) The scan substring assertion (syntax (*scs:(n)...)) is a PCRE2 extension that is not available in Perl. .P 20. Perl has different limits than PCRE2. See the .\" HREF \fBpcre2limits\fP .\" documentation for details. In release 5.10, Perl changed from recursion to iteration, keeping the intermediate matches on the heap, which is ~10% slower but does not run into any stack-overflow limit. PCRE2 made a similar change at release 10.30, and also has many build-time and run-time customizable limits. .P 21. Unlike Perl, PCRE2 doesn't have character set modifiers and, in particular, has no way to set characters by context as Perl's "/d" does. A regular expression using PCRE2_UTF and PCRE2_UCP will use similar rules to Perl's "/u"; something closer to "/a" could be selected by adding other PCRE2_EXTRA_ASCII* options on top. .P 22.
Some recursive patterns that Perl diagnoses as infinite recursions can be handled by PCRE2, either by the interpreter or the JIT. An example is /(?:|(?0)abcd)(?(R)|\ez)/, which matches a sequence of any number of repeated "abcd" substrings at the end of the subject. .P 23. Both PCRE2 and Perl error when \ex{ escapes are invalid, but Perl tries to recover and prints a warning if the problem was that an invalid hexadecimal digit was found. Since PCRE2 doesn't have warnings it returns an error instead. Additionally, Perl accepts \ex{} and generates NUL unlike PCRE2. .P 24. From release 10.45, PCRE2 gives an error if \ex is not followed by a hexadecimal digit or a curly bracket. It used to interpret this as the NUL character. Perl still generates NUL, but warns when in warning mode in most cases. . . .SH AUTHOR .rs .sp .nf Philip Hazel Retired from University Computing Service Cambridge, England. .fi . . .SH REVISION .rs .sp .nf Last updated: 02 June 2025 Copyright (c) 1997-2024 University of Cambridge. .fi ================================================ FILE: doc/pcre2convert.3 ================================================ .TH PCRE2CONVERT 3 "14 November 2023" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH "EXPERIMENTAL PATTERN CONVERSION FUNCTIONS" .rs .sp This document describes a set of functions that can be used to convert "foreign" patterns into PCRE2 regular expressions. This facility is currently experimental, and may be changed in future releases. Two kinds of pattern, globs and POSIX patterns, are supported. . . 
.SH "THE CONVERT CONTEXT" .rs .sp .nf .B pcre2_convert_context *pcre2_convert_context_create( .B " pcre2_general_context *\fIgcontext\fP);" .sp .B pcre2_convert_context *pcre2_convert_context_copy( .B " pcre2_convert_context *\fIcvcontext\fP);" .sp .B void pcre2_convert_context_free(pcre2_convert_context *\fIcvcontext\fP); .sp .B int pcre2_set_glob_escape(pcre2_convert_context *\fIcvcontext\fP, .B " uint32_t \fIescape_char\fP);" .sp .B int pcre2_set_glob_separator(pcre2_convert_context *\fIcvcontext\fP, .B " uint32_t \fIseparator_char\fP);" .fi .sp A convert context is used to hold parameters that affect the way that pattern conversion works. Like all PCRE2 contexts, you need to use a context only if you want to override the defaults. There are the usual create, copy, and free functions. If custom memory management functions are set in a general context that is passed to \fBpcre2_convert_context_create()\fP, they are used for all memory management within the conversion functions. .P There are only two parameters in the convert context at present. Both apply only to glob conversions. The escape character defaults to grave accent under Windows, otherwise backslash. It can be set to zero, meaning no escape character, or to any punctuation character with a code point less than 256. The separator character defaults to backslash under Windows, otherwise forward slash. It can be set to forward slash, backslash, or dot. .P The two setting functions return zero on success, or PCRE2_ERROR_BADDATA if their second argument is invalid. . . 
.SH "THE CONVERSION FUNCTION" .rs .sp .nf .B int pcre2_pattern_convert(PCRE2_SPTR \fIpattern\fP, PCRE2_SIZE \fIlength\fP, .B " uint32_t \fIoptions\fP, PCRE2_UCHAR **\fIbuffer\fP," .B " PCRE2_SIZE *\fIblength\fP, pcre2_convert_context *\fIcvcontext\fP);" .sp .B void pcre2_converted_pattern_free(PCRE2_UCHAR *\fIconverted_pattern\fP); .fi .sp The first two arguments of \fBpcre2_pattern_convert()\fP define the foreign pattern that is to be converted. The length may be given as PCRE2_ZERO_TERMINATED. The \fBoptions\fP argument defines how the pattern is to be processed. If the input is UTF, the PCRE2_CONVERT_UTF option should be set. PCRE2_CONVERT_NO_UTF_CHECK may also be set if you are sure the input is valid. One or more of the glob options, or one of the following POSIX options must be set to define the type of conversion that is required: .sp PCRE2_CONVERT_GLOB PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR PCRE2_CONVERT_GLOB_NO_STARSTAR PCRE2_CONVERT_POSIX_BASIC PCRE2_CONVERT_POSIX_EXTENDED .sp Details of the conversions are given below. The \fBbuffer\fP and \fBblength\fP arguments define how the output is handled: .P If \fBbuffer\fP is NULL, the function just returns the length of the converted pattern via \fBblength\fP. This is one less than the length of buffer needed, because a terminating zero is always added to the output. .P If \fBbuffer\fP points to a NULL pointer, an output buffer is obtained using the allocator in the context or \fBmalloc()\fP if no context is supplied. A pointer to this buffer is placed in the variable to which \fBbuffer\fP points. When no longer needed the output buffer must be freed by calling \fBpcre2_converted_pattern_free()\fP. If this function is called with a NULL argument, it returns immediately without doing anything. .P If \fBbuffer\fP points to a non-NULL pointer, \fBblength\fP must be set to the actual length of the buffer provided (in code units). 
.P In all cases, after successful conversion, the variable pointed to by \fBblength\fP is updated to the length actually used (in code units), excluding the terminating zero that is always added. .P If an error occurs, the length (via \fBblength\fP) is set to the offset within the input pattern where the error was detected. Only gross syntax errors are caught; there are plenty of errors that will get passed on for \fBpcre2_compile()\fP to discover. .P The return from \fBpcre2_pattern_convert()\fP is zero on success or a non-zero PCRE2 error code. Note that PCRE2 error codes may be positive or negative: \fBpcre2_compile()\fP uses mostly positive codes and \fBpcre2_match()\fP negative ones; \fBpcre2_pattern_convert()\fP uses existing codes of both kinds. A textual error message can be obtained by calling \fBpcre2_get_error_message()\fP. . . .SH "CONVERTING GLOBS" .rs .sp Globs are used to match file names, and consequently have the concept of a "path separator", which defaults to backslash under Windows and forward slash otherwise. If PCRE2_CONVERT_GLOB is set, the wildcards * and ? are not permitted to match separator characters, but the double-star (**) feature (which does match separators) is supported. .P PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR matches globs with wildcards allowed to match separator characters. PCRE2_CONVERT_GLOB_NO_STARSTAR matches globs with the double-star feature disabled. These options may be given together. . . .SH "CONVERTING POSIX PATTERNS" .rs .sp POSIX defines two kinds of regular expression pattern: basic and extended. These can be processed by setting PCRE2_CONVERT_POSIX_BASIC or PCRE2_CONVERT_POSIX_EXTENDED, respectively. .P In POSIX patterns, backslash is not special in a character class. Unmatched closing parentheses are treated as literals. .P In basic patterns, ? + | {} and () must be escaped to be recognized as metacharacters outside a character class. If the first character in the pattern is * it is treated as a literal.
^ is a metacharacter only at the start of a branch. .P In extended patterns, a backslash not in a character class always makes the next character literal, whatever it is. There are no backreferences. .P Note: POSIX mandates that the longest possible match at the first matching position must be found. This is not what \fBpcre2_match()\fP does; it yields the first match that is found. An application can use \fBpcre2_dfa_match()\fP to find the longest match, but that does not support backreferences (but then neither do POSIX extended patterns). . . .SH AUTHOR .rs .sp .nf Philip Hazel Retired from University Computing Service Cambridge, England. .fi . . .SH REVISION .rs .sp .nf Last updated: 14 November 2023 Copyright (c) 1997-2018 University of Cambridge. .fi ================================================ FILE: doc/pcre2demo.3 ================================================ .TH PCRE2DEMO 3 "24 March 2025" "PCRE2 10.48-DEV" .\"AUTOMATICALLY GENERATED BY UpdateAlways - do not EDIT! .SH NAME PCRE2DEMO - A demonstration C program for PCRE2 .SH "SOURCE CODE" .rs .sp .\" Start example. .de EX . do ds mF \\n[.fam] . nr mE \\n(.f . nf . nh . do fam C . ft CW .. . . .\" End example. .de EE . do fam \\*(mF . ft \\n(mE . fi . hy \\n(HY .. . .RS -7 .EX /************************************************* * PCRE2 DEMONSTRATION PROGRAM * *************************************************/ /* This is a demonstration program to illustrate a straightforward way of using the PCRE2 regular expression library from a C program. See the pcre2sample documentation for a short discussion ("man pcre2sample" if you have the PCRE2 man pages installed). PCRE2 is a revised API for the library, and is incompatible with the original PCRE API. There are actually three libraries, each supporting a different code unit width. This demonstration program uses the 8-bit library. 
The default is to process each code unit as a separate character, but if the pattern begins with "(*UTF)", both it and the subject are treated as UTF-8 strings, where characters may occupy multiple code units. In Unix-like environments, if PCRE2 is installed in your standard system libraries, you should be able to compile this program using this command: cc -Wall pcre2demo.c -lpcre2-8 -o pcre2demo If PCRE2 is not installed in a standard place, it is likely to be installed with support for the pkg-config mechanism. If you have pkg-config, you can compile this program using this command: cc -Wall pcre2demo.c `pkg-config --cflags --libs libpcre2-8` -o pcre2demo If you do not have pkg-config, you may have to use something like this: cc -Wall pcre2demo.c -I/usr/local/include -L/usr/local/lib \e -R/usr/local/lib -lpcre2-8 -o pcre2demo Replace "/usr/local/include" and "/usr/local/lib" with wherever the include and library files for PCRE2 are installed on your system. Only some operating systems (Solaris is one) use the -R option. Building under Windows: If you want to statically link this program against a non-dll .a file, you must define PCRE2_STATIC before including pcre2.h, so in this environment, uncomment the following line. */ /* #define PCRE2_STATIC */ /* The PCRE2_CODE_UNIT_WIDTH macro must be defined before including pcre2.h. For a program that uses only one code unit width, setting it to 8, 16, or 32 makes it possible to use generic function names such as pcre2_compile(). Note that just changing 8 to 16 (for example) is not sufficient to convert this program to process 16-bit characters. Even in a fully 16-bit environment, where string-handling functions such as strcmp() and printf() work with 16-bit characters, the code for handling the table of named substrings will still need to be modified. */ #define PCRE2_CODE_UNIT_WIDTH 8 #include <stdio.h> #include <string.h> #include <pcre2.h> /************************************************************************** * Here is the program.
The API includes the concept of "contexts" for * * setting up unusual interface requirements for compiling and matching, * * such as custom memory managers and non-standard newline definitions. * * This program does not do any of this, so it makes no use of contexts, * * always passing NULL where a context could be given. * **************************************************************************/ int main(int argc, char **argv) { pcre2_code *re; PCRE2_SPTR pattern; /* PCRE2_SPTR is a pointer to unsigned code units of */ PCRE2_SPTR subject; /* the appropriate width (in this case, 8 bits). */ PCRE2_SPTR name_table; int errornumber; int find_all, caseless_match; int i; int rc; uint32_t namecount; uint32_t name_entry_size; PCRE2_SIZE erroroffset; PCRE2_SIZE *ovector; PCRE2_SIZE ovector_last[2]; PCRE2_SIZE subject_length; pcre2_match_data *match_data; /************************************************************************** * First, sort out the command line. Options: * * - "-g" to request repeated matching to find all occurrences, * * like Perl's /g option. We set the variable find_all to a non-zero * * value if the -g option is present. * * - "-i" to request caseless matching, like Perl's /i option. We set the * * variable caseless_match to PCRE2_CASELESS if the -i option is * * present. * **************************************************************************/ find_all = 0; caseless_match = 0; for (i = 1; i < argc; i++) { if (strcmp(argv[i], "-g") == 0) find_all = 1; else if (strcmp(argv[i], "-i") == 0) caseless_match = PCRE2_CASELESS; else if (argv[i][0] == '-') { printf("Unrecognised option %s\en", argv[i]); return 1; } else break; } /* After the options, we require exactly two arguments, which are the pattern, and the subject string. 
*/ if (argc - i != 2) { printf("Exactly two arguments required: a regex and a subject string\en"); return 1; } /* Pattern and subject are char arguments, so they can be straightforwardly cast to PCRE2_SPTR because we are working in 8-bit code units. The subject length is cast to PCRE2_SIZE for completeness, though PCRE2_SIZE is in fact defined to be size_t. */ pattern = (PCRE2_SPTR)argv[i]; subject = (PCRE2_SPTR)argv[i+1]; subject_length = (PCRE2_SIZE)strlen((char *)subject); /************************************************************************* * Now we are going to compile the regular expression pattern, and handle * * any errors that are detected. * *************************************************************************/ re = pcre2_compile( pattern, /* the pattern */ PCRE2_ZERO_TERMINATED, /* indicates pattern is zero-terminated */ caseless_match, /* possibly enable caseless */ &errornumber, /* for error number */ &erroroffset, /* for error offset */ NULL); /* use default compile context */ /* Compilation failed: print the error message and exit. */ if (re == NULL) { PCRE2_UCHAR buffer[256]; pcre2_get_error_message(errornumber, buffer, sizeof(buffer)); printf("PCRE2 compilation failed at offset %d: %s\en", (int)erroroffset, buffer); return 1; } /************************************************************************* * If the compilation succeeded, we call PCRE2 again, in order to do a * * pattern match against the subject string. This does just ONE match. If * * further matching is needed, it will be done below. Before running the * * match we must set up a match_data block for holding the result. Using * * pcre2_match_data_create_from_pattern() ensures that the block is * * exactly the right size for the number of capturing parentheses in the * * pattern. 
If you need to know the actual size of a match_data block as * * a number of bytes, you can find it like this: * * * * PCRE2_SIZE match_data_size = pcre2_get_match_data_size(match_data); * *************************************************************************/ match_data = pcre2_match_data_create_from_pattern(re, NULL); /* Now run the match. */ rc = pcre2_match( re, /* the compiled pattern */ subject, /* the subject string */ subject_length, /* the length of the subject */ 0, /* start at offset 0 in the subject */ 0, /* default options */ match_data, /* block for storing the result */ NULL); /* use default match context */ /* Matching failed: handle error cases */ if (rc < 0) { switch(rc) { case PCRE2_ERROR_NOMATCH: printf("No match\en"); break; /* Handle other special cases if you like */ default: printf("Matching error %d\en", rc); break; } pcre2_match_data_free(match_data); /* Release memory used for the match */ pcre2_code_free(re); /* data and the compiled pattern. */ return 1; } /* Match succeeded. Get a pointer to the output vector, where string offsets are stored. */ ovector = pcre2_get_ovector_pointer(match_data); printf("Match succeeded at offset %d\en", (int)ovector[0]); /************************************************************************* * We have found the first match within the subject string. If the output * * vector wasn't big enough, say so. Then output any substrings that were * * captured. * *************************************************************************/ /* The output vector wasn't big enough. This should not happen, because we used pcre2_match_data_create_from_pattern() above. */ if (rc == 0) printf("ovector was not big enough for all the captured substrings\en"); /* Since release 10.38 PCRE2 has locked out the use of \eK in lookaround assertions. This is the recommended behaviour. However, the option PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK allows applications to re-enable the old behaviour. 
If that is set, it is possible to run patterns such as /(?=.\eK)/ that use \eK in an assertion to set the start of a match later than its end. In this demonstration program, we show how to detect this case, although it cannot arise because the option is never set. */ if (ovector[0] > ovector[1]) { printf("\e\eK was used in an assertion to set the match start after its end.\en" "From end to start the match was: %.*s\en", (int)(ovector[0] - ovector[1]), (char *)(subject + ovector[1])); printf("Run abandoned\en"); pcre2_match_data_free(match_data); pcre2_code_free(re); return 1; } /* Show substrings stored in the output vector by number. Obviously, in a real application you might want to do things other than print them. */ for (i = 0; i < rc; i++) { PCRE2_SPTR substring_start = subject + ovector[2*i]; PCRE2_SIZE substring_length = ovector[2*i+1] - ovector[2*i]; printf("%2d: %.*s\en", i, (int)substring_length, (char *)substring_start); } /************************************************************************** * That concludes the basic part of this demonstration program. We have * * compiled a pattern, and performed a single match. The code that follows * * shows first how to access named substrings, and then how to code for * * repeated matches on the same subject. * **************************************************************************/ /* See if there are any named substrings, and if so, show them by name. First we have to extract the count of named parentheses from the pattern. */ (void)pcre2_pattern_info( re, /* the compiled pattern */ PCRE2_INFO_NAMECOUNT, /* get the number of named substrings */ &namecount); /* where to put the answer */ if (namecount == 0) printf("No named substrings\en"); else { PCRE2_SPTR tabptr; printf("Named substrings\en"); /* Before we can access the substrings, we must extract the table for translating names to numbers, and the size of each entry in the table. 
*/ (void)pcre2_pattern_info( re, /* the compiled pattern */ PCRE2_INFO_NAMETABLE, /* address of the table */ &name_table); /* where to put the answer */ (void)pcre2_pattern_info( re, /* the compiled pattern */ PCRE2_INFO_NAMEENTRYSIZE, /* size of each entry in the table */ &name_entry_size); /* where to put the answer */ /* Now we can scan the table and, for each entry, print the number, the name, and the substring itself. In the 8-bit library the number is held in two bytes, most significant first. */ tabptr = name_table; for (i = 0; i < namecount; i++) { int n = (tabptr[0] << 8) | tabptr[1]; printf("(%d) %*s: %.*s\en", n, name_entry_size - 3, tabptr + 2, (int)(ovector[2*n+1] - ovector[2*n]), subject + ovector[2*n]); tabptr += name_entry_size; } } /************************************************************************* * If the "-g" option was given on the command line, we want to continue * * to search for additional matches in the subject string, in a similar * * way to the /g option in Perl. This turns out to be trickier than you * * might think because of the possibility of matching an empty string. * * * * To help with this task, PCRE2 provides the pcre2_next_match() helper. * *************************************************************************/ if (!find_all) /* Check for -g */ { pcre2_match_data_free(match_data); /* Release the memory that was used */ pcre2_code_free(re); /* for the match data and the pattern. */ return 0; /* Exit the program. */ } /* Loop for second and subsequent matches */ ovector_last[0] = ovector[0]; ovector_last[1] = ovector[1]; for (;;) { PCRE2_SIZE start_offset; uint32_t options; /* After each successful match, we use pcre2_next_match() to obtain the match parameters for subsequent match attempts. 
*/ if (!pcre2_next_match(match_data, &start_offset, &options)) break; /* Run the next matching operation */ rc = pcre2_match( re, /* the compiled pattern */ subject, /* the subject string */ subject_length, /* the length of the subject */ start_offset, /* starting offset in the subject */ options, /* options */ match_data, /* block for storing the result */ NULL); /* use default match context */ /* If this match attempt fails, exit the loop for subsequent matches. */ if (rc == PCRE2_ERROR_NOMATCH) break; /* Other matching errors are not recoverable. */ if (rc < 0) { printf("Matching error %d\en", rc); pcre2_match_data_free(match_data); pcre2_code_free(re); return 1; } /* This demonstration program depends on pcre2_next_match() to ensure that the loop for second and subsequent matches does not run forever. However, it would be robust practice for a production application to verify this. The following block of code shows how to do this. This error case is not reachable unless there is a bug in PCRE2. Because this program does not set the PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK option, the logic is simple. We verify that either ovector[1] has advanced, or that we have an empty match touching the end of a previous non-empty match. See the API documentation for guidance if your application uses PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK and searches for multiple matches. */ if (!(ovector[1] > ovector_last[1] || (ovector[1] == ovector[0] && ovector_last[1] > ovector_last[0] && ovector[1] == ovector_last[1]))) { printf("\e\eK was used in an assertion to yield non-advancing matches.\en"); printf("Run abandoned\en"); pcre2_match_data_free(match_data); pcre2_code_free(re); return 1; } ovector_last[0] = ovector[0]; ovector_last[1] = ovector[1]; /* Match succeeded. */ printf("\enMatch succeeded again at offset %d\en", (int)ovector[0]); /* The match succeeded, but the output vector wasn't big enough. This should not happen. 
*/ if (rc == 0) printf("ovector was not big enough for all the captured substrings\en"); /* We guard against patterns such as /(?=.\eK)/ that use \eK in an assertion to set the start of a match later than its end. As explained above, this case should not occur because this demonstration program does not set the PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK option, however, we do include code showing how to detect it. */ if (ovector[0] > ovector[1]) { printf("\e\eK was used in an assertion to set the match start after its end.\en" "From end to start the match was: %.*s\en", (int)(ovector[0] - ovector[1]), (char *)(subject + ovector[1])); printf("Run abandoned\en"); pcre2_match_data_free(match_data); pcre2_code_free(re); return 1; } /* As before, show substrings stored in the output vector by number, and then also any named substrings. */ for (i = 0; i < rc; i++) { PCRE2_SPTR substring_start = subject + ovector[2*i]; size_t substring_length = ovector[2*i+1] - ovector[2*i]; printf("%2d: %.*s\en", i, (int)substring_length, (char *)substring_start); } if (namecount == 0) printf("No named substrings\en"); else { PCRE2_SPTR tabptr = name_table; printf("Named substrings\en"); for (i = 0; i < namecount; i++) { int n = (tabptr[0] << 8) | tabptr[1]; printf("(%d) %*s: %.*s\en", n, name_entry_size - 3, tabptr + 2, (int)(ovector[2*n+1] - ovector[2*n]), subject + ovector[2*n]); tabptr += name_entry_size; } } } /* End of loop to find second and subsequent matches */ printf("\en"); pcre2_match_data_free(match_data); pcre2_code_free(re); return 0; } /* End of pcre2demo.c */ .EE ================================================ FILE: doc/pcre2grep.1 ================================================ .TH PCRE2GREP 1 "24 January 2025" "PCRE2 10.48-DEV" .SH NAME pcre2grep - a grep with Perl-compatible regular expressions. .SH SYNOPSIS .B pcre2grep [options] [long options] [pattern] [path1 path2 ...] . 
.SH DESCRIPTION .rs .sp \fBpcre2grep\fP searches files for character patterns, in the same way as other grep commands do, but it uses the PCRE2 regular expression library to support patterns that are compatible with the regular expressions of Perl 5. See .\" HREF \fBpcre2syntax\fP(3) .\" for a quick-reference summary of pattern syntax, or .\" HREF \fBpcre2pattern\fP(3) .\" for a full description of the syntax and semantics of the regular expressions that PCRE2 supports. .P Patterns, whether supplied on the command line or in a separate file, are given without delimiters. For example: .sp pcre2grep Thursday /etc/motd .sp If you attempt to use delimiters (for example, by surrounding a pattern with slashes, as is common in Perl scripts), they are interpreted as part of the pattern. Quotes can of course be used to delimit patterns on the command line because they are interpreted by the shell, and indeed quotes are required if a pattern contains white space or shell metacharacters. .P The first argument that follows any option settings is treated as the single pattern to be matched when neither \fB-e\fP nor \fB-f\fP is present. Conversely, when one or both of these options are used to specify patterns, all arguments are treated as path names. At least one of \fB-e\fP, \fB-f\fP, or an argument pattern must be provided. .P If no files are specified, \fBpcre2grep\fP reads the standard input. The standard input can also be referenced by a name consisting of a single hyphen. For example: .sp pcre2grep some-pattern file1 - file3 .sp By default, input files are searched line by line, so pattern assertions about the beginning and end of a subject string (^, $, \eA, \eZ, and \ez) match at the beginning and end of each line. When a line matches a pattern, it is copied to the standard output, and if there is more than one file, the file name is output at the start of each line, followed by a colon. However, there are options that can change how \fBpcre2grep\fP behaves. 
For example, the \fB-M\fP option makes it possible to search for strings that span line boundaries. What defines a line boundary is controlled by the \fB-N\fP (\fB--newline\fP) option. The \fB-h\fP and \fB-H\fP options control whether or not file names are shown, and the \fB-Z\fP option changes the file name terminator to a zero byte. .P The amount of memory used for buffering files that are being scanned is controlled by parameters that can be set by the \fB--buffer-size\fP and \fB--max-buffer-size\fP options. The first of these sets the size of buffer that is obtained at the start of processing. If an input file contains very long lines, a larger buffer may be needed; this is handled by automatically extending the buffer, up to the limit specified by \fB--max-buffer-size\fP. The default values for these parameters can be set when \fBpcre2grep\fP is built; if nothing is specified, the defaults are set to 20KiB and 1MiB respectively. An error occurs if a line is too long and the buffer can no longer be expanded. .P The block of memory that is actually used is three times the "buffer size", to allow for buffering "before" and "after" lines. If the buffer size is too small, fewer than requested "before" and "after" lines may be output. .P When matching with a multiline pattern, the size of the buffer must be at least half of the maximum match expected or the pattern might fail to match. .P Patterns can be no longer than 8KiB or BUFSIZ bytes, whichever is the greater. BUFSIZ is defined in \fB<stdio.h>\fP. When there is more than one pattern (specified by the use of \fB-e\fP and/or \fB-f\fP), each pattern is applied to each line in the order in which they are defined, except that all the \fB-e\fP patterns are tried before the \fB-f\fP patterns. .P By default, as soon as one pattern matches a line, no further patterns are considered.
However, if \fB--colour\fP (or \fB--color\fP) is used to colour the matching substrings, or if \fB--only-matching\fP, \fB--file-offsets\fP, \fB--line-offsets\fP, or \fB--output\fP is used to output only the part of the line that matched (either shown literally, or as an offset), the behaviour is different. In this situation, all the patterns are applied to the line. If there is more than one match, the one that begins nearest to the start of the subject is processed; if there is more than one match at that position, the one with the longest matching substring is processed; if the matching substrings are equal in length, the first match found is processed. .P Scanning with all the patterns resumes immediately following the match, so that later matches on the same line can be found. Note, however, that an overlapping match that starts in the middle of another match will not be processed. .P The above behaviour was changed at release 10.41 to be more compatible with GNU grep. In earlier releases, \fBpcre2grep\fP did not recognize matches from later patterns that were earlier in the subject. .P Patterns that can match an empty string are accepted, but empty string matches are never recognized. An example is the pattern "(super)?(man)?", in which all components are optional. This pattern finds all occurrences of both "super" and "man"; the output differs from matching with "super|man" when only the matching substrings are being shown. .P If the \fBLC_ALL\fP or \fBLC_CTYPE\fP environment variable is set, \fBpcre2grep\fP uses the value to set a locale when calling the PCRE2 library. The \fB--locale\fP option can be used to override this. . . .SH "SUPPORT FOR COMPRESSED FILES" .rs .sp Compile-time options for \fBpcre2grep\fP can set it up to use \fBlibz\fP or \fBlibbz2\fP for reading compressed files whose names end in \fB.gz\fP or \fB.bz2\fP, respectively.
You can find out whether your \fBpcre2grep\fP binary has support for one or both of these file types by running it with the \fB--help\fP option. If the appropriate support is not present, all files are treated as plain text. The standard input is always so treated. If a file with a \fB.gz\fP or \fB.bz2\fP extension is not in fact compressed, it is read as a plain text file. When input is from a compressed .gz or .bz2 file, the \fB--line-buffered\fP option is ignored. . . .SH "BINARY FILES" .rs .sp By default, a file that contains a binary zero byte within the first 1024 bytes is identified as a binary file, and is processed specially. However, if the newline type is specified as NUL, that is, the line terminator is a binary zero, the test for a binary file is not applied. See the \fB--binary-files\fP option for a means of changing the way binary files are handled. . . .SH "BINARY ZEROS IN PATTERNS" .rs .sp Patterns passed from the command line are strings that are terminated by a binary zero, so cannot contain internal zeros. However, patterns that are read from a file via the \fB-f\fP option may contain binary zeros. . . .SH OPTIONS .rs .sp The order in which some of the options appear can affect the output. For example, both the \fB-H\fP and \fB-l\fP options affect the printing of file names. Whichever comes later in the command line will be the one that takes effect. Similarly, except where noted below, if an option is given twice, the later setting is used. Numerical values for options may be followed by K or M, to signify multiplication by 1024 or 1024*1024 respectively. .TP 10 \fB--\fP This terminates the list of options. It is useful if the next item on the command line starts with a hyphen but is not an option. This allows for the processing of patterns and file names that start with hyphens. .TP \fB-A\fP \fInumber\fP, \fB--after-context=\fP\fInumber\fP Output up to \fInumber\fP lines of context after each matching line. 
Fewer lines are output if the next match or the end of the file is reached, or if the processing buffer size has been set too small. If file names and/or line numbers are being output, a hyphen separator is used instead of a colon for the context lines (the \fB-Z\fP option can be used to change the file name terminator to a zero byte). A line containing "--" is output between each group of lines, unless they are in fact contiguous in the input file. The value of \fInumber\fP is expected to be relatively small. When \fB-c\fP is used, \fB-A\fP is ignored. .TP \fB-a\fP, \fB--text\fP Treat binary files as text. This is equivalent to \fB--binary-files\fP=\fItext\fP. .TP \fB--allow-lookaround-bsk\fP PCRE2 now forbids the use of \eK in lookarounds by default, in line with Perl. This option causes \fBpcre2grep\fP to set the PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK option, which enables this somewhat dangerous usage. .TP \fB-B\fP \fInumber\fP, \fB--before-context=\fP\fInumber\fP Output up to \fInumber\fP lines of context before each matching line. Fewer lines are output if the previous match or the start of the file is within \fInumber\fP lines, or if the processing buffer size has been set too small. If file names and/or line numbers are being output, a hyphen separator is used instead of a colon for the context lines (the \fB-Z\fP option can be used to change the file name terminator to a zero byte). A line containing "--" is output between each group of lines, unless they are in fact contiguous in the input file. The value of \fInumber\fP is expected to be relatively small. When \fB-c\fP is used, \fB-B\fP is ignored. .TP \fB--binary-files=\fP\fIword\fP Specify how binary files are to be processed. If the word is "binary" (the default), pattern matching is performed on binary files, but the only output is "Binary file matches" when a match succeeds. 
If the word is "text", which is equivalent to the \fB-a\fP or \fB--text\fP option, binary files are processed in the same way as any other file. In this case, when a match succeeds, the output may be binary garbage, which can have nasty effects if sent to a terminal. If the word is "without-match", which is equivalent to the \fB-I\fP option, binary files are not processed at all; they are assumed not to be of interest and are skipped without causing any output or affecting the return code. .TP \fB--buffer-size=\fP\fInumber\fP Set the parameter that controls how much memory is obtained at the start of processing for buffering files that are being scanned. See also \fB--max-buffer-size\fP below. .TP \fB-C\fP \fInumber\fP, \fB--context=\fP\fInumber\fP Output \fInumber\fP lines of context both before and after each matching line. This is equivalent to setting both \fB-A\fP and \fB-B\fP to the same value. .TP \fB-c\fP, \fB--count\fP Do not output lines from the files that are being scanned; instead output the number of lines that would have been shown, either because they matched, or, if \fB-v\fP is set, because they failed to match. By default, this count is exactly the same as the number of lines that would have been output, but if the \fB-M\fP (multiline) option is used (without \fB-v\fP), there may be more suppressed lines than the count (that is, the number of matches). .sp If no lines are selected, the number zero is output. If several files are being scanned, a count is output for each of them and the \fB-t\fP option can be used to cause a total to be output at the end. However, if the \fB--files-with-matches\fP option is also used, only those files whose counts are greater than zero are listed. When \fB-c\fP is used, the \fB-A\fP, \fB-B\fP, and \fB-C\fP options are ignored. .TP \fB--colour\fP, \fB--color\fP If this option is given without any data, it is equivalent to "--colour=auto". 
If data is required, it must be given in the same shell item, separated by an equals sign. .TP \fB--colour=\fP\fIvalue\fP, \fB--color=\fP\fIvalue\fP This option specifies under what circumstances the parts of a line that matched a pattern should be coloured in the output. It is ignored if \fB--file-offsets\fP, \fB--line-offsets\fP, or \fB--output\fP is set. By default, output is not coloured. The value for the \fB--colour\fP option (which is optional, see above) may be "never", "always", or "auto". In the last case, colouring happens only if the standard output is connected to a terminal. More resources are used when colouring is enabled, because \fBpcre2grep\fP has to search for all possible matches in a line, not just one, in order to colour them all. .sp The colour that is used can be specified by setting one of the environment variables PCRE2GREP_COLOUR, PCRE2GREP_COLOR, PCREGREP_COLOUR, or PCREGREP_COLOR, which are checked in that order. If none of these are set, \fBpcre2grep\fP looks for GREP_COLORS or GREP_COLOR (in that order). The value of the variable should be a string of two numbers, separated by a semicolon, except in the case of GREP_COLORS, which must start with "ms=" or "mt=" followed by two semicolon-separated colours, terminated by the end of the string or by a colon. If GREP_COLORS does not start with "ms=" or "mt=" it is ignored, and GREP_COLOR is checked. .sp If the string obtained from one of the above variables contains any characters other than semicolon or digits, the setting is ignored and the default colour is used. The string is copied directly into the control string for setting colour on a terminal, so it is your responsibility to ensure that the values make sense. If no relevant environment variable is set, the default is "1;31", which gives red. .TP \fB-D\fP \fIaction\fP, \fB--devices=\fP\fIaction\fP If an input path is not a regular file or a directory, "action" specifies how it is to be processed.
Valid values are "read" (the default) or "skip" (silently skip the path). .TP \fB-d\fP \fIaction\fP, \fB--directories=\fP\fIaction\fP If an input path is a directory, "action" specifies how it is to be processed. Valid values are "read" (the default in non-Windows environments, for compatibility with GNU grep), "recurse" (equivalent to the \fB-r\fP option), or "skip" (silently skip the path, the default in Windows environments). In the "read" case, directories are read as if they were ordinary files. In some operating systems the effect of reading a directory like this is an immediate end-of-file; in others it may provoke an error. .TP \fB--depth-limit\fP=\fInumber\fP See \fB--match-limit\fP below. .TP \fB-E\fP, \fB--case-restrict\fP When case distinctions are being ignored in Unicode mode, two ASCII letters (K and S) will by default match Unicode characters U+212A (Kelvin sign) and U+017F (long S) respectively, as well as their lower case ASCII counterparts. When this option is set, case equivalences are restricted such that no ASCII character matches a non-ASCII character, and vice versa. .TP \fB-e\fP \fIpattern\fP, \fB--regex=\fP\fIpattern\fP, \fB--regexp=\fP\fIpattern\fP Specify a pattern to be matched. This option can be used multiple times in order to specify several patterns. It can also be used as a way of specifying a single pattern that starts with a hyphen. When \fB-e\fP is used, no argument pattern is taken from the command line; all arguments are treated as file names. There is no limit to the number of patterns. They are applied to each line in the order in which they are defined. .sp If \fB-f\fP is used with \fB-e\fP, the command line patterns are matched first, followed by the patterns from the file(s), independent of the order in which these options are specified. .TP \fB--exclude\fP=\fIpattern\fP Files (but not directories) whose names match the pattern are skipped without being processed. 
This applies to all files, whether listed on the command line, obtained from \fB--file-list\fP, or by scanning a directory. The pattern is a PCRE2 regular expression, and is matched against the final component of the file name, not the entire path. The \fB-F\fP, \fB-w\fP, and \fB-x\fP options do not apply to this pattern. The option may be given any number of times in order to specify multiple patterns. If a file name matches both an \fB--include\fP and an \fB--exclude\fP pattern, it is excluded. There is no short form for this option. .TP \fB--exclude-from=\fP\fIfilename\fP Treat each non-empty line of the file as the data for an \fB--exclude\fP option. What constitutes a newline when reading the file is the operating system's default. The \fB--newline\fP option has no effect on this option. This option may be given more than once in order to specify a number of files to read. .TP \fB--exclude-dir\fP=\fIpattern\fP Directories whose names match the pattern are skipped without being processed, whatever the setting of the \fB--recursive\fP option. This applies to all directories, whether listed on the command line, obtained from \fB--file-list\fP, or by scanning a parent directory. The pattern is a PCRE2 regular expression, and is matched against the final component of the directory name, not the entire path. The \fB-F\fP, \fB-w\fP, and \fB-x\fP options do not apply to this pattern. The option may be given any number of times in order to specify more than one pattern. If a directory matches both \fB--include-dir\fP and \fB--exclude-dir\fP, it is excluded. There is no short form for this option. .TP \fB-F\fP, \fB--fixed-strings\fP Interpret each data-matching pattern as a list of fixed strings, separated by newlines, instead of as a regular expression. What constitutes a newline for this purpose is controlled by the \fB--newline\fP option. The \fB-w\fP (match as a word) and \fB-x\fP (match whole line) options can be used with \fB-F\fP. 
They apply to each of the fixed strings. A line is selected if any of the fixed strings are found in it (subject to \fB-w\fP or \fB-x\fP, if present). This option applies only to the patterns that are matched against the contents of files; it does not apply to patterns specified by any of the \fB--include\fP or \fB--exclude\fP options. .TP \fB-f\fP \fIfilename\fP, \fB--file=\fP\fIfilename\fP Read patterns from the file, one per line. As is the case with patterns on the command line, no delimiters should be used. What constitutes a newline when reading the file is the operating system's default interpretation of \en. The \fB--newline\fP option has no effect on this option. Trailing white space is removed from each line, and blank lines are ignored unless the \fB--posix-pattern-file\fP option is also provided. An empty file contains no patterns and therefore matches nothing. Patterns read from a file in this way may contain binary zeros, which are treated as ordinary character literals. .sp If this option is given more than once, all the specified files are read. A data line is output if any of the patterns match it. A file name can be given as "-" to refer to the standard input. When \fB-f\fP is used, patterns specified on the command line using \fB-e\fP may also be present; they are matched before the file's patterns. However, no pattern is taken from the command line; all arguments are treated as the names of paths to be searched. .TP \fB--file-list\fP=\fIfilename\fP Read a list of files and/or directories that are to be scanned from the given file, one per line. What constitutes a newline when reading the file is the operating system's default. Trailing white space is removed from each line, and blank lines are ignored. These paths are processed before any that are listed on the command line. The file name can be given as "-" to refer to the standard input. If \fB--file\fP and \fB--file-list\fP are both specified as "-", patterns are read first. 
This is useful only when the standard input is a terminal, from which further lines (the list of files) can be read after an end-of-file indication. If this option is given more than once, all the specified files are read. .TP \fB--file-offsets\fP Instead of showing lines or parts of lines that match, show each match as an offset from the start of the file and a length, separated by a comma. In this mode, \fB--colour\fP has no effect, and no context is shown. That is, the \fB-A\fP, \fB-B\fP, and \fB-C\fP options are ignored. If there is more than one match in a line, each of them is shown separately. This option is mutually exclusive with \fB--output\fP, \fB--line-offsets\fP, and \fB--only-matching\fP. .TP \fB--group-separator\fP=\fItext\fP Output this text string instead of two hyphens between groups of lines when \fB-A\fP, \fB-B\fP, or \fB-C\fP is in use. See also \fB--no-group-separator\fP. .TP \fB-H\fP, \fB--with-filename\fP Force the inclusion of the file name at the start of output lines when searching a single file. The file name is not normally shown in this case. By default, for matching lines, the file name is followed by a colon; for context lines, a hyphen separator is used. The \fB-Z\fP option can be used to change the terminator to a zero byte. If a line number is also being output, it follows the file name. When the \fB-M\fP option causes a pattern to match more than one line, only the first is preceded by the file name. This option overrides any previous \fB-h\fP, \fB-l\fP, or \fB-L\fP options. .TP \fB-h\fP, \fB--no-filename\fP Suppress the output of file names when searching multiple files. File names are normally shown when multiple files are searched. By default, for matching lines, the file name is followed by a colon; for context lines, a hyphen separator is used. The \fB-Z\fP option can be used to change the terminator to a zero byte. If a line number is also being output, it follows the file name.
This option overrides any previous \fB-H\fP, \fB-L\fP, or \fB-l\fP options. .TP \fB--heap-limit\fP=\fInumber\fP See \fB--match-limit\fP below. .TP \fB--help\fP Output a help message, giving brief details of the command options and file type support, and then exit. Anything else on the command line is ignored. .TP \fB-I\fP Ignore binary files. This is equivalent to \fB--binary-files\fP=\fIwithout-match\fP. .TP \fB-i\fP, \fB--ignore-case\fP Ignore upper/lower case distinctions when pattern matching. This applies when matching path names for inclusion or exclusion as well as when matching lines in files. .TP \fB--include\fP=\fIpattern\fP If any \fB--include\fP patterns are specified, the only files that are processed are those whose names match one of the patterns and do not match an \fB--exclude\fP pattern. This option does not affect directories, but it applies to all files, whether listed on the command line, obtained from \fB--file-list\fP, or by scanning a directory. The pattern is a PCRE2 regular expression, and is matched against the final component of the file name, not the entire path. The \fB-F\fP, \fB-w\fP, and \fB-x\fP options do not apply to this pattern. The option may be given any number of times. If a file name matches both an \fB--include\fP and an \fB--exclude\fP pattern, it is excluded. There is no short form for this option. .TP \fB--include-from=\fP\fIfilename\fP Treat each non-empty line of the file as the data for an \fB--include\fP option. What constitutes a newline for this purpose is the operating system's default. The \fB--newline\fP option has no effect on this option. This option may be given any number of times; all the files are read. .TP \fB--include-dir\fP=\fIpattern\fP If any \fB--include-dir\fP patterns are specified, the only directories that are processed are those whose names match one of the patterns and do not match an \fB--exclude-dir\fP pattern. 
This applies to all directories, whether listed on the command line, obtained from \fB--file-list\fP, or by scanning a parent directory. The pattern is a PCRE2 regular expression, and is matched against the final component of the directory name, not the entire path. The \fB-F\fP, \fB-w\fP, and \fB-x\fP options do not apply to this pattern. The option may be given any number of times. If a directory matches both \fB--include-dir\fP and \fB--exclude-dir\fP, it is excluded. There is no short form for this option. .TP \fB-L\fP, \fB--files-without-match\fP Instead of outputting lines from the files, just output the names of the files that do not contain any lines that would have been output. Each file name is output once, on a separate line by default, but if the \fB-Z\fP option is set, they are separated by zero bytes instead of newlines. This option overrides any previous \fB-H\fP, \fB-h\fP, or \fB-l\fP options. .TP \fB-l\fP, \fB--files-with-matches\fP Instead of outputting lines from the files, just output the names of the files containing lines that would have been output. Each file name is output once, on a separate line, but if the \fB-Z\fP option is set, they are separated by zero bytes instead of newlines. Searching normally stops as soon as a matching line is found in a file. However, if the \fB-c\fP (count) option is also used, matching continues in order to obtain the correct count, and those files that have at least one match are listed along with their counts. Using this option with \fB-c\fP is a way of suppressing the listing of files with no matches that occurs with \fB-c\fP on its own. This option overrides any previous \fB-H\fP, \fB-h\fP, or \fB-L\fP options. .TP \fB--label\fP=\fIname\fP This option supplies a name to be used for the standard input when file names are being output. If not supplied, "(standard input)" is used. There is no short form for this option. 
.TP \fB--line-buffered\fP When this option is given, non-compressed input is read and processed line by line, and the output is flushed after each write. By default, input is read in large chunks, unless \fBpcre2grep\fP can determine that it is reading from a terminal, which is currently possible only in Unix-like environments or Windows. Output to terminal is normally automatically flushed by the operating system. This option can be useful when the input or output is attached to a pipe and you do not want \fBpcre2grep\fP to buffer up large amounts of data. However, its use will affect performance, and the \fB-M\fP (multiline) option ceases to work. When input is from a compressed .gz or .bz2 file, \fB--line-buffered\fP is ignored. .TP \fB--line-offsets\fP Instead of showing lines or parts of lines that match, show each match as a line number, the offset from the start of the line, and a length. The line number is terminated by a colon (as usual; see the \fB-n\fP option), and the offset and length are separated by a comma. In this mode, \fB--colour\fP has no effect, and no context is shown. That is, the \fB-A\fP, \fB-B\fP, and \fB-C\fP options are ignored. If there is more than one match in a line, each of them is shown separately. This option is mutually exclusive with \fB--output\fP, \fB--file-offsets\fP, and \fB--only-matching\fP. .TP \fB--locale\fP=\fIlocale-name\fP This option specifies a locale to be used for pattern matching. It overrides the value in the \fBLC_ALL\fP or \fBLC_CTYPE\fP environment variables. If no locale is specified, the PCRE2 library's default (usually the "C" locale) is used. There is no short form for this option. .TP \fB-M\fP, \fB--multiline\fP Allow patterns to match more than one line. When this option is set, the PCRE2 library is called in "multiline" mode, and a match is allowed to continue past the end of the initial line and onto one or more subsequent lines. 
.sp Patterns used with \fB-M\fP may usefully contain literal newline characters and internal occurrences of ^ and $ characters, because in multiline mode these can match at internal newlines. Because \fBpcre2grep\fP is scanning multiple lines, the \eZ and \ez assertions match only at the end of the last line in the file. The \eA assertion matches at the start of the first line of a match. This can be any line in the file; it is not anchored to the first line. .sp The output for a successful match may consist of more than one line. The first line is the line in which the match started, and the last line is the line in which the match ended. If the matched string ends with a newline sequence, the output ends at the end of that line. If \fB-v\fP is set, none of the lines in a multi-line match are output. Once a match has been handled, scanning restarts at the beginning of the line after the one in which the match ended. .sp The newline sequence that separates multiple lines must be matched as part of the pattern. For example, to find the phrase "regular expression" in a file where "regular" might be at the end of a line and "expression" at the start of the next line, you could use this command: .sp pcre2grep -M 'regular\es+expression' .sp The \es escape sequence matches any white space character, including newlines, and is followed by + so as to match trailing white space on the first line as well as possibly handling a two-character newline sequence. .sp There is a limit to the number of lines that can be matched, imposed by the way that \fBpcre2grep\fP buffers the input file as it scans it. With a sufficiently large processing buffer, this should not be a problem. .sp The \fB-M\fP option does not work when input is read line by line (see \fB--line-buffered\fP.) .TP \fB-m\fP \fInumber\fP, \fB--max-count\fP=\fInumber\fP Stop processing after finding \fInumber\fP matching lines, or non-matching lines if \fB-v\fP is also set. 
Any trailing context lines are output after the final match. In multiline mode, each multiline match counts as just one line for this purpose. If this limit is reached when reading the standard input from a regular file, the file is left positioned just after the last matching line. If \fB-c\fP is also set, the count that is output is never greater than \fInumber\fP. This option has no effect if used with \fB-L\fP, \fB-l\fP, or \fB-q\fP, or when just checking for a match in a binary file. .TP \fB--match-limit\fP=\fInumber\fP Processing some regular expression patterns may take a very long time to search for all possible matching strings. Others may require a very large amount of memory. There are three options that set resource limits for matching. .sp The \fB--match-limit\fP option provides a means of limiting computing resource usage when processing patterns that are not going to match, but which have a very large number of possibilities in their search trees. The classic example is a pattern that uses nested unlimited repeats. Internally, PCRE2 has a counter that is incremented each time around its main processing loop. If the value set by \fB--match-limit\fP is reached, an error occurs. .sp The \fB--heap-limit\fP option specifies, as a number of kibibytes (units of 1024 bytes), the maximum amount of heap memory that may be used for matching. .sp The \fB--depth-limit\fP option limits the depth of nested backtracking points, which indirectly limits the amount of memory that is used. The amount of memory needed for each backtracking point depends on the number of capturing parentheses in the pattern, so the amount of memory that is used before this limit acts varies from pattern to pattern. This limit is of use only if it is set smaller than \fB--match-limit\fP. .sp There are no short forms for these options. The default limits can be set when the PCRE2 library is compiled; if they are not specified, the defaults are very large and so effectively unlimited. 
.TP \fB--max-buffer-size\fP=\fInumber\fP This limits the expansion of the processing buffer, whose initial size can be set by \fB--buffer-size\fP. The maximum buffer size is silently forced to be no smaller than the starting buffer size. .TP \fB-N\fP \fInewline-type\fP, \fB--newline\fP=\fInewline-type\fP Six different conventions for indicating the ends of lines in scanned files are supported. For example: .sp pcre2grep -N CRLF 'some pattern' .sp The newline type may be specified in upper, lower, or mixed case. If the newline type is NUL, lines are separated by binary zero characters. The other types are the single-character sequences CR (carriage return) and LF (linefeed), the two-character sequence CRLF, an "anycrlf" type, which recognizes any of the preceding three types, and an "any" type, for which any Unicode line ending sequence is assumed to end a line. The Unicode sequences are the three just mentioned, plus VT (vertical tab, U+000B), FF (form feed, U+000C), NEL (next line, U+0085), LS (line separator, U+2028), and PS (paragraph separator, U+2029). .sp When the PCRE2 library is built, a default line-ending sequence is specified. This is normally the standard sequence for the operating system. Unless otherwise specified by this option, \fBpcre2grep\fP uses the library's default. .sp This option makes it possible to use \fBpcre2grep\fP to scan files that have come from other environments without having to modify their line endings. If the data that is being scanned does not agree with the convention set by this option, \fBpcre2grep\fP may behave in strange ways. Note that this option does not apply to files specified by the \fB-f\fP, \fB--exclude-from\fP, or \fB--include-from\fP options, which are expected to use the operating system's standard newline sequence. .TP \fB-n\fP, \fB--line-number\fP Precede each output line by its line number in the file, followed by a colon for matching lines or a hyphen for context lines. 
If the file name is also being output, it precedes the line number. When the \fB-M\fP option causes a pattern to match more than one line, only the first is preceded by its line number. This option is forced if \fB--line-offsets\fP is used. .TP \fB--no-group-separator\fP Do not output a separator between groups of lines when \fB-A\fP, \fB-B\fP, or \fB-C\fP is in use. The default is to output a line containing two hyphens. See also \fB--group-separator\fP. .TP \fB--no-jit\fP If the PCRE2 library is built with support for just-in-time compiling (which speeds up matching), \fBpcre2grep\fP automatically makes use of this, unless it was explicitly disabled at build time. This option can be used to disable the use of JIT at run time. It is provided for testing and working around problems. It should never be needed in normal use. .TP \fB-O\fP \fItext\fP, \fB--output\fP=\fItext\fP When there is a match, instead of outputting the line that matched, output just the text specified in this option, followed by an operating-system standard newline. In this mode, \fB--colour\fP has no effect, and no context is shown. That is, the \fB-A\fP, \fB-B\fP, and \fB-C\fP options are ignored. The \fB--newline\fP option has no effect on this option, which is mutually exclusive with \fB--only-matching\fP, \fB--file-offsets\fP, and \fB--line-offsets\fP. However, like \fB--only-matching\fP, if there is more than one match in a line, each of them causes a line of output. .sp Escape sequences starting with a dollar character may be used to insert the contents of the matched part of the line and/or captured substrings into the text. .sp $<digits> or ${<digits>} is replaced by the captured substring of the given decimal number; $& (or the legacy $0) substitutes the whole match. If the number is greater than the number of capturing substrings, or if the capture is unset, the replacement is empty.
.sp $a is replaced by bell; $b by backspace; $e by escape; $f by form feed; $n by newline; $r by carriage return; $t by tab; $v by vertical tab. .sp $o<digits> or $o{<digits>} is replaced by the character whose code point is the given octal number. In the first form, up to three octal digits are processed. When more digits are needed in Unicode mode to specify a wide character, the second form must be used. .sp $x<digits> or $x{<digits>} is replaced by the character represented by the given hexadecimal number. In the first form, up to two hexadecimal digits are processed. When more digits are needed in Unicode mode to specify a wide character, the second form must be used. .sp Any other character is substituted by itself. In particular, $$ is replaced by a single dollar. .TP \fB-o\fP, \fB--only-matching\fP Show only the part of the line that matched a pattern instead of the whole line. In this mode, no context is shown. That is, the \fB-A\fP, \fB-B\fP, and \fB-C\fP options are ignored. If there is more than one match in a line, each of them is shown separately, on a separate line of output. If \fB-o\fP is combined with \fB-v\fP (invert the sense of the match to find non-matching lines), no output is generated, but the return code is set appropriately. If the matched portion of the line is empty, nothing is output unless the file name or line number are being printed, in which case they are shown on an otherwise empty line. This option is mutually exclusive with \fB--output\fP, \fB--file-offsets\fP and \fB--line-offsets\fP. .TP \fB-o\fP\fInumber\fP, \fB--only-matching\fP=\fInumber\fP Show only the part of the line that matched the capturing parentheses of the given number. Up to 50 capturing parentheses are supported by default. This limit can be changed via the \fB--om-capture\fP option. A pattern may contain any number of capturing parentheses, but only those whose number is within the limit can be accessed by \fB-o\fP.
An error occurs if the number specified by \fB-o\fP is greater than the limit. .sp -o0 is the same as \fB-o\fP without a number. Because these options can be given without an argument (see above), if an argument is present, it must be given in the same shell item, for example, -o3 or --only-matching=2. The comments given for the non-argument case above also apply to this option. If the specified capturing parentheses do not exist in the pattern, or were not set in the match, nothing is output unless the file name or line number are being output. .sp If this option is given multiple times, multiple substrings are output for each match, in the order the options are given, and all on one line. For example, -o3 -o1 -o3 causes the substrings matched by capturing parentheses 3 and 1 and then 3 again to be output. By default, there is no separator (but see the next but one option). .TP \fB--om-capture\fP=\fInumber\fP Set the number of capturing parentheses that can be accessed by \fB-o\fP. The default is 50. .TP \fB--om-separator\fP=\fItext\fP Specify a separating string for multiple occurrences of \fB-o\fP. The default is an empty string. Separating strings are never coloured. .TP \fB-P\fP, \fB--no-ucp\fP Starting from release 10.43, when UTF/Unicode mode is specified with \fB-u\fP or \fB-U\fP, the PCRE2_UCP option is used by default. This means that the POSIX classes in patterns match more than just ASCII characters. For example, [:digit:] matches any Unicode decimal digit. The \fB--no-ucp\fP option suppresses PCRE2_UCP, thus restricting the POSIX classes to ASCII characters, as was the case in earlier releases. Note that there are now more fine-grained option settings within patterns that affect individual classes. For example, when in UCP mode, the sequence (?aP) restricts [:word:] to ASCII letters, while allowing \ew to match Unicode letters and digits. 
.TP \fB--posix-pattern-file\fP When patterns are provided with the \fB-f\fP option, do not trim trailing spaces or ignore empty lines, in the same way as other grep tools. To keep the behaviour consistent with older versions, if the pattern read was terminated with CRLF (as character literals) then both characters won't be included as part of it, so if you really need to have a pattern ending in '\er', use an escape sequence or provide it by a different method. .TP \fB-q\fP, \fB--quiet\fP Work quietly, that is, display nothing except error messages. The exit status indicates whether or not any matches were found. .TP \fB-r\fP, \fB--recursive\fP If any given path is a directory, recursively scan the files it contains, taking note of any \fB--include\fP and \fB--exclude\fP settings. By default, a directory is read as a normal file; in some operating systems this gives an immediate end-of-file. This option is a shorthand for setting the \fB-d\fP option to "recurse". .TP \fB--recursion-limit\fP=\fInumber\fP This is an obsolete synonym for \fB--depth-limit\fP. See \fB--match-limit\fP above for details. .TP \fB-s\fP, \fB--no-messages\fP Suppress error messages about non-existent or unreadable files. Such files are quietly skipped. However, the return code is still 2, even if matches were found in other files. .TP \fB-t\fP, \fB--total-count\fP This option is useful when scanning more than one file. If used on its own, \fB-t\fP suppresses all output except for a grand total number of matching lines (or non-matching lines if \fB-v\fP is used) in all the files. If \fB-t\fP is used with \fB-c\fP, a grand total is output except when the previous output is just one line. In other words, it is not output when just one file's count is listed. If file names are being output, the grand total is preceded by "TOTAL:". Otherwise, it appears as just another number.
The \fB-t\fP option is ignored when used with \fB-L\fP (list files without matches), because the grand total would always be zero. .TP \fB-u\fP, \fB--utf\fP Operate in UTF/Unicode mode. This option is available only if PCRE2 has been compiled with UTF-8 support. All patterns (including those for any \fB--exclude\fP and \fB--include\fP options) and all lines that are scanned must be valid strings of UTF-8 characters. If an invalid UTF-8 string is encountered, an error occurs. .TP \fB-U\fP, \fB--utf-allow-invalid\fP As \fB--utf\fP, but in addition subject lines may contain invalid UTF-8 code unit sequences. These can never form part of any pattern match. Patterns themselves, however, must still be valid UTF-8 strings. This facility allows valid UTF-8 strings to be sought within arbitrary byte sequences in executable or other binary files. For more details about matching in non-valid UTF-8 strings, see the .\" HREF \fBpcre2unicode\fP(3) .\" documentation. .TP \fB-V\fP, \fB--version\fP Write the version numbers of \fBpcre2grep\fP and the PCRE2 library to the standard output and then exit. Anything else on the command line is ignored. .TP \fB-v\fP, \fB--invert-match\fP Invert the sense of the match, so that lines which do \fInot\fP match any of the patterns are the ones that are found. When this option is set, options such as \fB--only-matching\fP and \fB--output\fP, which specify parts of a match that are to be output, are ignored. .TP \fB-w\fP, \fB--word-regex\fP, \fB--word-regexp\fP Force the patterns only to match "words". That is, there must be a word boundary at the start and end of each matched string. This is equivalent to having "\eb(?:" at the start of each pattern, and ")\eb" at the end. This option applies only to the patterns that are matched against the contents of files; it does not apply to patterns specified by any of the \fB--include\fP or \fB--exclude\fP options. 
.TP \fB-x\fP, \fB--line-regex\fP, \fB--line-regexp\fP Force the patterns to start matching only at the beginnings of lines, and in addition, require them to match entire lines. In multiline mode the match may be more than one line. This is equivalent to having "^(?:" at the start of each pattern and ")$" at the end. This option applies only to the patterns that are matched against the contents of files; it does not apply to patterns specified by any of the \fB--include\fP or \fB--exclude\fP options. .TP \fB-Z\fP, \fB--null\fP Terminate file names in the regular output with a zero byte (the NUL character) instead of what would normally appear. This is useful when file names contain unusual characters such as colons, hyphens, or even newlines. The option does not apply to file names in error messages. . . .SH "ENVIRONMENT VARIABLES" .rs .sp The environment variables \fBLC_ALL\fP and \fBLC_CTYPE\fP are examined, in that order, for a locale. The first one that is set is used. This can be overridden by the \fB--locale\fP option. If no locale is set, the PCRE2 library's default (usually the "C" locale) is used. . . .SH "NEWLINES" .rs .sp The \fB-N\fP (\fB--newline\fP) option allows \fBpcre2grep\fP to scan files with newline conventions that differ from the default. This option affects only the way scanned files are processed. It does not affect the interpretation of files specified by the \fB-f\fP, \fB--file-list\fP, \fB--exclude-from\fP, or \fB--include-from\fP options. .P Any parts of the scanned input files that are written to the standard output are copied with whatever newline sequences they have in the input. However, if the final line of a file is output, and it does not end with a newline sequence, a newline sequence is added. If the newline setting is CR, LF, CRLF or NUL, that line ending is output; for the other settings (ANYCRLF or ANY) a single NL is used.
.P The newline setting does not affect the way in which \fBpcre2grep\fP writes newlines in informational messages to the standard output and error streams. Under Windows, the standard output is set to be binary, so that "\er\en" at the ends of output lines that are copied from the input is not converted to "\er\er\en" by the C I/O library. This means that any messages written to the standard output must end with "\er\en". For all other operating systems, and for all messages to the standard error stream, "\en" is used. . . .SH "OPTIONS COMPATIBILITY WITH GNU GREP" .rs .sp Many of the short and long forms of \fBpcre2grep\fP's options are the same as in the GNU \fBgrep\fP program. Any long option of the form \fB--xxx-regexp\fP (GNU terminology) is also available as \fB--xxx-regex\fP (PCRE2 terminology). However, the \fB--case-restrict\fP, \fB--depth-limit\fP, \fB-E\fP, \fB--file-list\fP, \fB--file-offsets\fP, \fB--heap-limit\fP, \fB--include-dir\fP, \fB--line-offsets\fP, \fB--locale\fP, \fB--match-limit\fP, \fB-M\fP, \fB--multiline\fP, \fB-N\fP, \fB--newline\fP, \fB--no-ucp\fP, \fB--om-separator\fP, \fB--output\fP, \fB-P\fP, \fB-u\fP, \fB--utf\fP, \fB-U\fP, and \fB--utf-allow-invalid\fP options are specific to \fBpcre2grep\fP, as is the use of the \fB--only-matching\fP option with a capturing parentheses number. .P Although most of the common options work the same way, a few are different in \fBpcre2grep\fP. For example, the \fB--include\fP option's argument is a glob for GNU \fBgrep\fP, but in \fBpcre2grep\fP it is a regular expression to which the \fB-i\fP option applies. If both the \fB-c\fP and \fB-l\fP options are given, GNU grep lists only file names, without counts, but \fBpcre2grep\fP gives the counts as well. . . .SH "OPTIONS WITH DATA" .rs .sp There are four different ways in which an option with data can be specified. If a short form option is used, the data may follow immediately, or (with one exception) in the next command line item. 
For example: .sp -f/some/file -f /some/file .sp The exception is the \fB-o\fP option, which may appear with or without data. Because of this, if data is present, it must follow immediately in the same item, for example -o3. .P If a long form option is used, the data may appear in the same command line item, separated by an equals character, or (with two exceptions) it may appear in the next command line item. For example: .sp --file=/some/file --file /some/file .sp Note, however, that if you want to supply a file name beginning with ~ as data in a shell command, and have the shell expand ~ to a home directory, you must separate the file name from the option, because the shell does not treat ~ specially unless it is at the start of an item. .P The exceptions to the above are the \fB--colour\fP (or \fB--color\fP) and \fB--only-matching\fP options, for which the data is optional. If one of these options does have data, it must be given in the first form, using an equals character. Otherwise \fBpcre2grep\fP will assume that it has no data. . . .SH "USING PCRE2'S CALLOUT FACILITY" .rs .sp \fBpcre2grep\fP has, by default, support for calling external programs or scripts or echoing specific strings during matching by making use of PCRE2's callout facility. However, this support can be completely or partially disabled when \fBpcre2grep\fP is built. You can find out whether your binary has support for callouts by running it with the \fB--help\fP option. If callout support is completely disabled, callouts in patterns are forbidden by \fBpcre2grep\fP. If the facility is partially disabled, calling external programs is not supported, and callouts that request it are ignored. .P A callout in a PCRE2 pattern is of the form (?C<arg>) where the argument is either a number or a quoted string (see the .\" HREF \fBpcre2callout\fP .\" documentation for details). Numbered callouts are ignored by \fBpcre2grep\fP; only callouts with string arguments are useful. . .
.SS "Echoing a specific string" .rs .sp Starting the callout string with a pipe character invokes an echoing facility that avoids calling an external program or script. This facility is always available, provided that callouts were not completely disabled when \fBpcre2grep\fP was built. The rest of the callout string is processed as a zero-terminated string, which means it should not contain any internal binary zeros. It is written to the output, having first been passed through the same escape processing as text from the \fB--output\fP (\fB-O\fP) option (see above). However, $0 or $& cannot be used to insert a matched substring because the match is still in progress. Instead, the single character '0' is inserted. Any syntax errors in the string (for example, a dollar not followed by another character) cause the callout to be ignored. No terminator is added to the output string, so if you want a newline, you must include it explicitly using the escape $n. For example: .sp pcre2grep '(.)(..(.))(?C"|[$1] [$2] [$3]$n")' .sp Matching continues normally after the string is output. If you want to see only the callout output but not any output from an actual match, you should end the pattern with (*FAIL). . . .SS "Calling external programs or scripts" .rs .sp This facility can be independently disabled when \fBpcre2grep\fP is built. It is supported for Windows, where a call to \fB_spawnvp()\fP is used, for VMS, where \fBlib$spawn()\fP is used, and for any Unix-like environment where \fBfork()\fP and \fBexecv()\fP are available. .P If the callout string does not start with a pipe (vertical bar) character, it is parsed into a list of substrings separated by pipe characters. The first substring must be an executable name, with the following substrings specifying arguments: .sp executable_name|arg1|arg2|... .sp Any substring (including the executable name) may contain escape sequences started by a dollar character.
These are the same as for the \fB--output\fP (\fB-O\fP) option documented above, except that $0 or $& cannot insert the matched string because the match is still in progress. Instead, the character \&'0' is inserted. If you need a literal dollar or pipe character in any substring, use $$ or $| respectively. Here is an example: .sp echo -e "abcde\en12345" | pcre2grep \e '(?x)(.)(..(.)) (?C"/bin/echo|Arg1: [$1] [$2] [$3]|Arg2: $|${1}$| ($4)")()' - .sp Output: .sp Arg1: [a] [bcd] [d] Arg2: |a| () abcde Arg1: [1] [234] [4] Arg2: |1| () 12345 .sp The parameters for the system call that is used to run the program or script are zero-terminated strings. This means that binary zero characters in the callout argument will cause premature termination of their substrings, and therefore should not be present. Any syntax errors in the string (for example, a dollar not followed by another character) cause the callout to be ignored. If running the program fails for any reason (including the non-existence of the executable), a local matching failure occurs and the matcher backtracks in the normal way. . . .SH "MATCHING ERRORS" .rs .sp It is possible to supply a regular expression that takes a very long time to fail to match certain lines. Such patterns normally involve nested indefinite repeats, for example: (a+)*\ed when matched against a line of a's with no final digit. The PCRE2 matching function has a resource limit that causes it to abort in these circumstances. If this happens, \fBpcre2grep\fP outputs an error message and the line that caused the problem to the standard error stream. If there are more than 20 such errors, \fBpcre2grep\fP gives up. .P The \fB--match-limit\fP option of \fBpcre2grep\fP can be used to set the overall resource limit. There are also other limits that affect the amount of memory used during matching; see the discussion of \fB--heap-limit\fP and \fB--depth-limit\fP above. . .
.SH DIAGNOSTICS .rs .sp Exit status is 0 if any matches were found, 1 if no matches were found, and 2 for syntax errors, overlong lines, non-existent or inaccessible files (even if matches were found in other files) or too many matching errors. Using the \fB-s\fP option to suppress error messages about inaccessible files does not affect the return code. .P When run under VMS, the return code is placed in the symbol PCRE2GREP_RC because VMS does not distinguish between exit(0) and exit(1). . . .SH "SEE ALSO" .rs .sp \fBpcre2pattern\fP(3), \fBpcre2syntax\fP(3), \fBpcre2callout\fP(3), \fBpcre2unicode\fP(3). . . .SH AUTHOR .rs .sp .nf Philip Hazel Retired from University Computing Service Cambridge, England. .fi . . .SH REVISION .rs .sp .nf Last updated: 24 January 2025 Copyright (c) 1997-2023 University of Cambridge. .fi ================================================ FILE: doc/pcre2grep.txt ================================================ PCRE2GREP(1) General Commands Manual PCRE2GREP(1) NAME pcre2grep - a grep with Perl-compatible regular expressions. SYNOPSIS pcre2grep [options] [long options] [pattern] [path1 path2 ...] DESCRIPTION pcre2grep searches files for character patterns, in the same way as other grep commands do, but it uses the PCRE2 regular expression li- brary to support patterns that are compatible with the regular expres- sions of Perl 5. See pcre2syntax(3) for a quick-reference summary of pattern syntax, or pcre2pattern(3) for a full description of the syntax and semantics of the regular expressions that PCRE2 supports. Patterns, whether supplied on the command line or in a separate file, are given without delimiters. For example: pcre2grep Thursday /etc/motd If you attempt to use delimiters (for example, by surrounding a pattern with slashes, as is common in Perl scripts), they are interpreted as part of the pattern. 
Quotes can of course be used to delimit patterns on the command line because they are interpreted by the shell, and in- deed quotes are required if a pattern contains white space or shell metacharacters. The first argument that follows any option settings is treated as the single pattern to be matched when neither -e nor -f is present. Con- versely, when one or both of these options are used to specify pat- terns, all arguments are treated as path names. At least one of -e, -f, or an argument pattern must be provided. If no files are specified, pcre2grep reads the standard input. The standard input can also be referenced by a name consisting of a single hyphen. For example: pcre2grep some-pattern file1 - file3 By default, input files are searched line by line, so pattern asser- tions about the beginning and end of a subject string (^, $, \A, \Z, and \z) match at the beginning and end of each line. When a line matches a pattern, it is copied to the standard output, and if there is more than one file, the file name is output at the start of each line, followed by a colon. However, there are options that can change how pcre2grep behaves. For example, the -M option makes it possible to search for strings that span line boundaries. What defines a line boundary is controlled by the -N (--newline) option. The -h and -H op- tions control whether or not file names are shown, and the -Z option changes the file name terminator to a zero byte. The amount of memory used for buffering files that are being scanned is controlled by parameters that can be set by the --buffer-size and --max-buffer-size options. The first of these sets the size of buffer that is obtained at the start of processing. If an input file contains very long lines, a larger buffer may be needed; this is handled by au- tomatically extending the buffer, up to the limit specified by --max- buffer-size. 
The default values for these parameters can be set when pcre2grep is built; if nothing is specified, the defaults are set to 20KiB and 1MiB respectively. An error occurs if a line is too long and the buffer can no longer be expanded. The block of memory that is actually used is three times the "buffer size", to allow for buffering "before" and "after" lines. If the buffer size is too small, fewer than requested "before" and "after" lines may be output. When matching with a multiline pattern, the size of the buffer must be at least half of the maximum match expected or the pattern might fail to match. Patterns can be no longer than 8KiB or BUFSIZ bytes, whichever is the greater. BUFSIZ is defined in <stdio.h>. When there is more than one pattern (specified by the use of -e and/or -f), each pattern is applied to each line in the order in which they are defined, except that all the -e patterns are tried before the -f patterns. By default, as soon as one pattern matches a line, no further patterns are considered. However, if --colour (or --color) is used to colour the matching substrings, or if --only-matching, --file-offsets, --line-off- sets, or --output is used to output only the part of the line that matched (either shown literally, or as an offset), the behaviour is different. In this situation, all the patterns are applied to the line. If there is more than one match, the one that begins nearest to the start of the subject is processed; if there is more than one match at that position, the one with the longest matching substring is processed; if the matching substrings are equal, the first match found is processed. Scanning with all the patterns resumes immediately following the match, so that later matches on the same line can be found. Note, however, that an overlapping match that starts in the middle of another match will not be processed. The above behaviour was changed at release 10.41 to be more compatible with GNU grep.
In earlier releases, pcre2grep did not recognize matches from later patterns that were earlier in the subject. Patterns that can match an empty string are accepted, but empty string matches are never recognized. An example is the pattern "(su- per)?(man)?", in which all components are optional. This pattern finds all occurrences of both "super" and "man"; the output differs from matching with "super|man" when only the matching substrings are being shown. If the LC_ALL or LC_CTYPE environment variable is set, pcre2grep uses the value to set a locale when calling the PCRE2 library. The --locale option can be used to override this. SUPPORT FOR COMPRESSED FILES Compile-time options for pcre2grep can set it up to use libz or libbz2 for reading compressed files whose names end in .gz or .bz2, respec- tively. You can find out whether your pcre2grep binary has support for one or both of these file types by running it with the --help option. If the appropriate support is not present, all files are treated as plain text. The standard input is always so treated. If a file with a .gz or .bz2 extension is not in fact compressed, it is read as a plain text file. When input is from a compressed .gz or .bz2 file, the --line-buffered option is ignored. BINARY FILES By default, a file that contains a binary zero byte within the first 1024 bytes is identified as a binary file, and is processed specially. However, if the newline type is specified as NUL, that is, the line terminator is a binary zero, the test for a binary file is not applied. See the --binary-files option for a means of changing the way binary files are handled. BINARY ZEROS IN PATTERNS Patterns passed from the command line are strings that are terminated by a binary zero, so cannot contain internal zeros. However, patterns that are read from a file via the -f option may contain binary zeros. OPTIONS The order in which some of the options appear can affect the output. 
For example, both the -H and -l options affect the printing of file names. Whichever comes later in the command line will be the one that takes effect. Similarly, except where noted below, if an option is given twice, the later setting is used. Numerical values for options may be followed by K or M, to signify multiplication by 1024 or 1024*1024 respectively. -- This terminates the list of options. It is useful if the next item on the command line starts with a hyphen but is not an option. This allows for the processing of patterns and file names that start with hyphens. -A number, --after-context=number Output up to number lines of context after each matching line. Fewer lines are output if the next match or the end of the file is reached, or if the processing buffer size has been set too small. If file names and/or line numbers are be- ing output, a hyphen separator is used instead of a colon for the context lines (the -Z option can be used to change the file name terminator to a zero byte). A line containing "--" is output between each group of lines, unless they are in fact contiguous in the input file. The value of number is ex- pected to be relatively small. When -c is used, -A is ig- nored. -a, --text Treat binary files as text. This is equivalent to --binary- files=text. --allow-lookaround-bsk PCRE2 now forbids the use of \K in lookarounds by default, in line with Perl. This option causes pcre2grep to set the PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK option, which enables this somewhat dangerous usage. -B number, --before-context=number Output up to number lines of context before each matching line. Fewer lines are output if the previous match or the start of the file is within number lines, or if the process- ing buffer size has been set too small. If file names and/or line numbers are being output, a hyphen separator is used in- stead of a colon for the context lines (the -Z option can be used to change the file name terminator to a zero byte). 
A line containing "--" is output between each group of lines, unless they are in fact contiguous in the input file. The value of number is expected to be relatively small. When -c is used, -B is ignored. --binary-files=word Specify how binary files are to be processed. If the word is "binary" (the default), pattern matching is performed on bi- nary files, but the only output is "Binary file matches" when a match succeeds. If the word is "text", which is equivalent to the -a or --text option, binary files are processed in the same way as any other file. In this case, when a match succeeds, the output may be binary garbage, which can have nasty effects if sent to a terminal. If the word is "without-match", which is equivalent to the -I op- tion, binary files are not processed at all; they are assumed not to be of interest and are skipped without causing any output or affecting the return code. --buffer-size=number Set the parameter that controls how much memory is obtained at the start of processing for buffering files that are being scanned. See also --max-buffer-size below. -C number, --context=number Output number lines of context both before and after each matching line. This is equivalent to setting both -A and -B to the same value. -c, --count Do not output lines from the files that are being scanned; instead output the number of lines that would have been shown, either because they matched, or, if -v is set, because they failed to match. By default, this count is exactly the same as the number of lines that would have been output, but if the -M (multiline) option is used (without -v), there may be more suppressed lines than the count (that is, the number of matches). If no lines are selected, the number zero is output. If sev- eral files are being scanned, a count is output for each of them and the -t option can be used to cause a total to be output at the end. 
However, if the --files-with-matches op- tion is also used, only those files whose counts are greater than zero are listed. When -c is used, the -A, -B, and -C op- tions are ignored. --colour, --color If this option is given without any data, it is equivalent to "--colour=auto". If data is required, it must be given in the same shell item, separated by an equals sign. --colour=value, --color=value This option specifies under what circumstances the parts of a line that matched a pattern should be coloured in the output. It is ignored if --file-offsets, --line-offsets, or --output is set. By default, output is not coloured. The value for the --colour option (which is optional, see above) may be "never", "always", or "auto". In the latter case, colouring happens only if the standard output is connected to a termi- nal. More resources are used when colouring is enabled, be- cause pcre2grep has to search for all possible matches in a line, not just one, in order to colour them all. The colour that is used can be specified by setting one of the environment variables PCRE2GREP_COLOUR, PCRE2GREP_COLOR, PCREGREP_COLOUR, or PCREGREP_COLOR, which are checked in that order. If none of these are set, pcre2grep looks for GREP_COLORS or GREP_COLOR (in that order). The value of the variable should be a string of two numbers, separated by a semicolon, except in the case of GREP_COLORS, which must start with "ms=" or "mt=" followed by two semicolon-separated colours, terminated by the end of the string or by a colon. If GREP_COLORS does not start with "ms=" or "mt=" it is ig- nored, and GREP_COLOR is checked. If the string obtained from one of the above variables con- tains any characters other than semicolon or digits, the set- ting is ignored and the default colour is used. The string is copied directly into the control string for setting colour on a terminal, so it is your responsibility to ensure that the values make sense. 
If no relevant environment variable is set, the default is "1;31", which gives red. -D action, --devices=action If an input path is not a regular file or a directory, "ac- tion" specifies how it is to be processed. Valid values are "read" (the default) or "skip" (silently skip the path). -d action, --directories=action If an input path is a directory, "action" specifies how it is to be processed. Valid values are "read" (the default in non-Windows environments, for compatibility with GNU grep), "recurse" (equivalent to the -r option), or "skip" (silently skip the path, the default in Windows environments). In the "read" case, directories are read as if they were ordinary files. In some operating systems the effect of reading a di- rectory like this is an immediate end-of-file; in others it may provoke an error. --depth-limit=number See --match-limit below. -E, --case-restrict When case distinctions are being ignored in Unicode mode, two ASCII letters (K and S) will by default match Unicode charac- ters U+212A (Kelvin sign) and U+017F (long S) respectively, as well as their lower case ASCII counterparts. When this op- tion is set, case equivalences are restricted such that no ASCII character matches a non-ASCII character, and vice versa. -e pattern, --regex=pattern, --regexp=pattern Specify a pattern to be matched. This option can be used mul- tiple times in order to specify several patterns. It can also be used as a way of specifying a single pattern that starts with a hyphen. When -e is used, no argument pattern is taken from the command line; all arguments are treated as file names. There is no limit to the number of patterns. They are applied to each line in the order in which they are defined. If -f is used with -e, the command line patterns are matched first, followed by the patterns from the file(s), independent of the order in which these options are specified. 
--exclude=pattern Files (but not directories) whose names match the pattern are skipped without being processed. This applies to all files, whether listed on the command line, obtained from --file- list, or by scanning a directory. The pattern is a PCRE2 reg- ular expression, and is matched against the final component of the file name, not the entire path. The -F, -w, and -x op- tions do not apply to this pattern. The option may be given any number of times in order to specify multiple patterns. If a file name matches both an --include and an --exclude pat- tern, it is excluded. There is no short form for this option. --exclude-from=filename Treat each non-empty line of the file as the data for an --exclude option. What constitutes a newline when reading the file is the operating system's default. The --newline option has no effect on this option. This option may be given more than once in order to specify a number of files to read. --exclude-dir=pattern Directories whose names match the pattern are skipped without being processed, whatever the setting of the --recursive op- tion. This applies to all directories, whether listed on the command line, obtained from --file-list, or by scanning a parent directory. The pattern is a PCRE2 regular expression, and is matched against the final component of the directory name, not the entire path. The -F, -w, and -x options do not apply to this pattern. The option may be given any number of times in order to specify more than one pattern. If a direc- tory matches both --include-dir and --exclude-dir, it is ex- cluded. There is no short form for this option. -F, --fixed-strings Interpret each data-matching pattern as a list of fixed strings, separated by newlines, instead of as a regular ex- pression. What constitutes a newline for this purpose is con- trolled by the --newline option. The -w (match as a word) and -x (match whole line) options can be used with -F. They ap- ply to each of the fixed strings. 
A line is selected if any of the fixed strings are found in it (subject to -w or -x, if present). This option applies only to the patterns that are matched against the contents of files; it does not apply to patterns specified by any of the --include or --exclude op- tions. -f filename, --file=filename Read patterns from the file, one per line. As is the case with patterns on the command line, no delimiters should be used. What constitutes a newline when reading the file is the operating system's default interpretation of \n. The --new- line option has no effect on this option. Trailing white space is removed from each line, and blank lines are ignored unless the --posix-pattern-file option is also provided. An empty file contains no patterns and therefore matches noth- ing. Patterns read from a file in this way may contain binary zeros, which are treated as ordinary character literals. If this option is given more than once, all the specified files are read. A data line is output if any of the patterns match it. A file name can be given as "-" to refer to the standard input. When -f is used, patterns specified on the command line using -e may also be present; they are matched before the file's patterns. However, no pattern is taken from the command line; all arguments are treated as the names of paths to be searched. --file-list=filename Read a list of files and/or directories that are to be scanned from the given file, one per line. What constitutes a newline when reading the file is the operating system's de- fault. Trailing white space is removed from each line, and blank lines are ignored. These paths are processed before any that are listed on the command line. The file name can be given as "-" to refer to the standard input. If --file and --file-list are both specified as "-", patterns are read first. This is useful only when the standard input is a ter- minal, from which further lines (the list of files) can be read after an end-of-file indication. 
If this option is given more than once, all the specified files are read. --file-offsets Instead of showing lines or parts of lines that match, show each match as an offset from the start of the file and a length, separated by a comma. In this mode, --colour has no effect, and no context is shown. That is, the -A, -B, and -C options are ignored. If there is more than one match in a line, each of them is shown separately. This option is mutu- ally exclusive with --output, --line-offsets, and --only- matching. --group-separator=text Output this text string instead of two hyphens between groups of lines when -A, -B, or -C is in use. See also --no-group- separator. -H, --with-filename Force the inclusion of the file name at the start of output lines when searching a single file. The file name is not nor- mally shown in this case. By default, for matching lines, the file name is followed by a colon; for context lines, a hyphen separator is used. The -Z option can be used to change the terminator to a zero byte. If a line number is also being output, it follows the file name. When the -M option causes a pattern to match more than one line, only the first is pre- ceded by the file name. This option overrides any previous -h, -l, or -L options. -h, --no-filename Suppress the output file names when searching multiple files. File names are normally shown when multiple files are searched. By default, for matching lines, the file name is followed by a colon; for context lines, a hyphen separator is used. The -Z option can be used to change the terminator to a zero byte. If a line number is also being output, it follows the file name. This option overrides any previous -H, -L, or -l options. --heap-limit=number See --match-limit below. --help Output a help message, giving brief details of the command options and file type support, and then exit. Anything else on the command line is ignored. -I Ignore binary files. This is equivalent to --binary- files=without-match. 
-i, --ignore-case Ignore upper/lower case distinctions when pattern matching. This applies when matching path names for inclusion or exclu- sion as well as when matching lines in files. --include=pattern If any --include patterns are specified, the only files that are processed are those whose names match one of the patterns and do not match an --exclude pattern. This option does not affect directories, but it applies to all files, whether listed on the command line, obtained from --file-list, or by scanning a directory. The pattern is a PCRE2 regular expres- sion, and is matched against the final component of the file name, not the entire path. The -F, -w, and -x options do not apply to this pattern. The option may be given any number of times. If a file name matches both an --include and an --ex- clude pattern, it is excluded. There is no short form for this option. --include-from=filename Treat each non-empty line of the file as the data for an --include option. What constitutes a newline for this purpose is the operating system's default. The --newline option has no effect on this option. This option may be given any number of times; all the files are read. --include-dir=pattern If any --include-dir patterns are specified, the only direc- tories that are processed are those whose names match one of the patterns and do not match an --exclude-dir pattern. This applies to all directories, whether listed on the command line, obtained from --file-list, or by scanning a parent di- rectory. The pattern is a PCRE2 regular expression, and is matched against the final component of the directory name, not the entire path. The -F, -w, and -x options do not apply to this pattern. The option may be given any number of times. If a directory matches both --include-dir and --exclude-dir, it is excluded. There is no short form for this option. 
-L, --files-without-match Instead of outputting lines from the files, just output the names of the files that do not contain any lines that would have been output. Each file name is output once, on a sepa- rate line by default, but if the -Z option is set, they are separated by zero bytes instead of newlines. This option overrides any previous -H, -h, or -l options. -l, --files-with-matches Instead of outputting lines from the files, just output the names of the files containing lines that would have been out- put. Each file name is output once, on a separate line, but if the -Z option is set, they are separated by zero bytes in- stead of newlines. Searching normally stops as soon as a matching line is found in a file. However, if the -c (count) option is also used, matching continues in order to obtain the correct count, and those files that have at least one match are listed along with their counts. Using this option with -c is a way of suppressing the listing of files with no matches that occurs with -c on its own. This option overrides any previous -H, -h, or -L options. --label=name This option supplies a name to be used for the standard input when file names are being output. If not supplied, "(standard input)" is used. There is no short form for this option. --line-buffered When this option is given, non-compressed input is read and processed line by line, and the output is flushed after each write. By default, input is read in large chunks, unless pcre2grep can determine that it is reading from a terminal, which is currently possible only in Unix-like environments or Windows. Output to terminal is normally automatically flushed by the operating system. This option can be useful when the input or output is attached to a pipe and you do not want pcre2grep to buffer up large amounts of data. However, its use will affect performance, and the -M (multiline) option ceases to work. When input is from a compressed .gz or .bz2 file, --line-buffered is ignored. 
--line-offsets Instead of showing lines or parts of lines that match, show each match as a line number, the offset from the start of the line, and a length. The line number is terminated by a colon (as usual; see the -n option), and the offset and length are separated by a comma. In this mode, --colour has no effect, and no context is shown. That is, the -A, -B, and -C options are ignored. If there is more than one match in a line, each of them is shown separately. This option is mutually exclu- sive with --output, --file-offsets, and --only-matching. --locale=locale-name This option specifies a locale to be used for pattern match- ing. It overrides the value in the LC_ALL or LC_CTYPE envi- ronment variables. If no locale is specified, the PCRE2 li- brary's default (usually the "C" locale) is used. There is no short form for this option. -M, --multiline Allow patterns to match more than one line. When this option is set, the PCRE2 library is called in "multiline" mode, and a match is allowed to continue past the end of the initial line and onto one or more subsequent lines. Patterns used with -M may usefully contain literal newline characters and internal occurrences of ^ and $ characters, because in multiline mode these can match at internal new- lines. Because pcre2grep is scanning multiple lines, the \Z and \z assertions match only at the end of the last line in the file. The \A assertion matches at the start of the first line of a match. This can be any line in the file; it is not anchored to the first line. The output for a successful match may consist of more than one line. The first line is the line in which the match started, and the last line is the line in which the match ended. If the matched string ends with a newline sequence, the output ends at the end of that line. If -v is set, none of the lines in a multi-line match are output. Once a match has been handled, scanning restarts at the beginning of the line after the one in which the match ended. 
The newline sequence that separates multiple lines must be matched as part of the pattern. For example, to find the phrase "regular expression" in a file where "regular" might be at the end of a line and "expression" at the start of the next line, you could use this command: pcre2grep -M 'regular\s+expression' The \s escape sequence matches any white space character, in- cluding newlines, and is followed by + so as to match trail- ing white space on the first line as well as possibly han- dling a two-character newline sequence. There is a limit to the number of lines that can be matched, imposed by the way that pcre2grep buffers the input file as it scans it. With a sufficiently large processing buffer, this should not be a problem. The -M option does not work when input is read line by line (see --line-buffered.) -m number, --max-count=number Stop processing after finding number matching lines, or non- matching lines if -v is also set. Any trailing context lines are output after the final match. In multiline mode, each multiline match counts as just one line for this purpose. If this limit is reached when reading the standard input from a regular file, the file is left positioned just after the last matching line. If -c is also set, the count that is output is never greater than number. This option has no effect if used with -L, -l, or -q, or when just checking for a match in a binary file. --match-limit=number Processing some regular expression patterns may take a very long time to search for all possible matching strings. Others may require a very large amount of memory. There are three options that set resource limits for matching. The --match-limit option provides a means of limiting comput- ing resource usage when processing patterns that are not go- ing to match, but which have a very large number of possibil- ities in their search trees. The classic example is a pattern that uses nested unlimited repeats. 
Internally, PCRE2 has a counter that is incremented each time around its main pro- cessing loop. If the value set by --match-limit is reached, an error occurs. The --heap-limit option specifies, as a number of kibibytes (units of 1024 bytes), the maximum amount of heap memory that may be used for matching. The --depth-limit option limits the depth of nested back- tracking points, which indirectly limits the amount of memory that is used. The amount of memory needed for each backtrack- ing point depends on the number of capturing parentheses in the pattern, so the amount of memory that is used before this limit acts varies from pattern to pattern. This limit is of use only if it is set smaller than --match-limit. There are no short forms for these options. The default lim- its can be set when the PCRE2 library is compiled; if they are not specified, the defaults are very large and so effec- tively unlimited. --max-buffer-size=number This limits the expansion of the processing buffer, whose initial size can be set by --buffer-size. The maximum buffer size is silently forced to be no smaller than the starting buffer size. -N newline-type, --newline=newline-type Six different conventions for indicating the ends of lines in scanned files are supported. For example: pcre2grep -N CRLF 'some pattern' The newline type may be specified in upper, lower, or mixed case. If the newline type is NUL, lines are separated by bi- nary zero characters. The other types are the single-charac- ter sequences CR (carriage return) and LF (linefeed), the two-character sequence CRLF, an "anycrlf" type, which recog- nizes any of the preceding three types, and an "any" type, for which any Unicode line ending sequence is assumed to end a line. The Unicode sequences are the three just mentioned, plus VT (vertical tab, U+000B), FF (form feed, U+000C), NEL (next line, U+0085), LS (line separator, U+2028), and PS (paragraph separator, U+2029). 
When the PCRE2 library is built, a default line-ending se- quence is specified. This is normally the standard sequence for the operating system. Unless otherwise specified by this option, pcre2grep uses the library's default. This option makes it possible to use pcre2grep to scan files that have come from other environments without having to mod- ify their line endings. If the data that is being scanned does not agree with the convention set by this option, pcre2grep may behave in strange ways. Note that this option does not apply to files specified by the -f, --exclude-from, or --include-from options, which are expected to use the op- erating system's standard newline sequence. -n, --line-number Precede each output line by its line number in the file, fol- lowed by a colon for matching lines or a hyphen for context lines. If the file name is also being output, it precedes the line number. When the -M option causes a pattern to match more than one line, only the first is preceded by its line number. This option is forced if --line-offsets is used. --no-group-separator Do not output a separator between groups of lines when -A, -B, or -C is in use. The default is to output a line contain- ing two hyphens. See also --group-separator. --no-jit If the PCRE2 library is built with support for just-in-time compiling (which speeds up matching), pcre2grep automatically makes use of this, unless it was explicitly disabled at build time. This option can be used to disable the use of JIT at run time. It is provided for testing and working around prob- lems. It should never be needed in normal use. -O text, --output=text When there is a match, instead of outputting the line that matched, output just the text specified in this option, fol- lowed by an operating-system standard newline. In this mode, --colour has no effect, and no context is shown. That is, the -A, -B, and -C options are ignored. 
The --newline option has no effect on this option, which is mutually exclusive with --only-matching, --file-offsets, and --line-offsets. However, like --only-matching, if there is more than one match in a line, each of them causes a line of output. Escape sequences starting with a dollar character may be used to insert the contents of the matched part of the line and/or captured substrings into the text. $<digits> or ${<digits>} is replaced by the captured substring of the given decimal number; $& (or the legacy $0) substitutes the whole match. If the number is greater than the number of capturing substrings, or if the capture is unset, the replacement is empty. $a is replaced by bell; $b by backspace; $e by escape; $f by form feed; $n by newline; $r by carriage return; $t by tab; $v by vertical tab. $o<digits> or $o{<digits>} is replaced by the character whose code point is the given octal number. In the first form, up to three octal digits are processed. When more digits are needed in Unicode mode to specify a wide character, the second form must be used. $x<digits> or $x{<digits>} is replaced by the character represented by the given hexadecimal number. In the first form, up to two hexadecimal digits are processed. When more digits are needed in Unicode mode to specify a wide character, the second form must be used. Any other character is substituted by itself. In particular, $$ is replaced by a single dollar. -o, --only-matching Show only the part of the line that matched a pattern instead of the whole line. In this mode, no context is shown. That is, the -A, -B, and -C options are ignored. If there is more than one match in a line, each of them is shown separately, on a separate line of output. If -o is combined with -v (invert the sense of the match to find non-matching lines), no output is generated, but the return code is set appropriately.
If the matched portion of the line is empty, nothing is output unless the file name or line number are being printed, in which case they are shown on an otherwise empty line. This option is mutually exclusive with --output, --file-offsets and --line-offsets. -onumber, --only-matching=number Show only the part of the line that matched the capturing parentheses of the given number. Up to 50 capturing parenthe- ses are supported by default. This limit can be changed via the --om-capture option. A pattern may contain any number of capturing parentheses, but only those whose number is within the limit can be accessed by -o. An error occurs if the num- ber specified by -o is greater than the limit. -o0 is the same as -o without a number. Because these options can be given without an argument (see above), if an argument is present, it must be given in the same shell item, for ex- ample, -o3 or --only-matching=2. The comments given for the non-argument case above also apply to this option. If the specified capturing parentheses do not exist in the pattern, or were not set in the match, nothing is output unless the file name or line number are being output. If this option is given multiple times, multiple substrings are output for each match, in the order the options are given, and all on one line. For example, -o3 -o1 -o3 causes the substrings matched by capturing parentheses 3 and 1 and then 3 again to be output. By default, there is no separator (but see the next but one option). --om-capture=number Set the number of capturing parentheses that can be accessed by -o. The default is 50. --om-separator=text Specify a separating string for multiple occurrences of -o. The default is an empty string. Separating strings are never coloured. -P, --no-ucp Starting from release 10.43, when UTF/Unicode mode is speci- fied with -u or -U, the PCRE2_UCP option is used by default. This means that the POSIX classes in patterns match more than just ASCII characters. 
For example, [:digit:] matches any Unicode decimal digit. The --no-ucp option suppresses PCRE2_UCP, thus restricting the POSIX classes to ASCII characters, as was the case in earlier releases. Note that there are now more fine-grained option settings within patterns that affect individual classes. For example, when in UCP mode, the sequence (?aP) restricts [:word:] to ASCII letters, while allowing \w to match Unicode letters and digits. --posix-pattern-file When patterns are provided with the -f option, do not trim trailing spaces or ignore empty lines, in a similar way to other grep tools. To keep the behaviour consistent with older versions, if the pattern read was terminated with CRLF (as character literals) then both characters won't be included as part of it, so if you really need to have a pattern ending in '\r', use an escape sequence or provide it by a different method. -q, --quiet Work quietly, that is, display nothing except error messages. The exit status indicates whether or not any matches were found. -r, --recursive If any given path is a directory, recursively scan the files it contains, taking note of any --include and --exclude settings. By default, a directory is read as a normal file; in some operating systems this gives an immediate end-of-file. This option is a shorthand for setting the -d option to "recurse". --recursion-limit=number This is an obsolete synonym for --depth-limit. See --match-limit above for details. -s, --no-messages Suppress error messages about non-existent or unreadable files. Such files are quietly skipped. However, the return code is still 2, even if matches were found in other files. -t, --total-count This option is useful when scanning more than one file. If used on its own, -t suppresses all output except for a grand total number of matching lines (or non-matching lines if -v is used) in all the files. If -t is used with -c, a grand total is output except when the previous output is just one line.
In other words, it is not output when just one file's count is listed. If file names are being output, the grand total is preceded by "TOTAL:". Otherwise, it appears as just another number. The -t option is ignored when used with -L (list files without matches), because the grand total would always be zero. -u, --utf Operate in UTF/Unicode mode. This option is available only if PCRE2 has been compiled with UTF-8 support. All patterns (in- cluding those for any --exclude and --include options) and all lines that are scanned must be valid strings of UTF-8 characters. If an invalid UTF-8 string is encountered, an er- ror occurs. -U, --utf-allow-invalid As --utf, but in addition subject lines may contain invalid UTF-8 code unit sequences. These can never form part of any pattern match. Patterns themselves, however, must still be valid UTF-8 strings. This facility allows valid UTF-8 strings to be sought within arbitrary byte sequences in executable or other binary files. For more details about matching in non- valid UTF-8 strings, see the pcre2unicode(3) documentation. -V, --version Write the version numbers of pcre2grep and the PCRE2 library to the standard output and then exit. Anything else on the command line is ignored. -v, --invert-match Invert the sense of the match, so that lines which do not match any of the patterns are the ones that are found. When this option is set, options such as --only-matching and --output, which specify parts of a match that are to be out- put, are ignored. -w, --word-regex, --word-regexp Force the patterns only to match "words". That is, there must be a word boundary at the start and end of each matched string. This is equivalent to having "\b(?:" at the start of each pattern, and ")\b" at the end. This option applies only to the patterns that are matched against the contents of files; it does not apply to patterns specified by any of the --include or --exclude options. 
-x, --line-regex, --line-regexp Force the patterns to start matching only at the beginnings of lines, and in addition, require them to match entire lines. In multiline mode the match may be more than one line. This is equivalent to having "^(?:" at the start of each pat- tern and ")$" at the end. This option applies only to the patterns that are matched against the contents of files; it does not apply to patterns specified by any of the --include or --exclude options. -Z, --null Terminate file names in the regular output with a zero byte (the NUL character) instead of what would normally appear. This is useful when file names contain unusual characters such as colons, hyphens, or even newlines. The option does not apply to file names in error messages. ENVIRONMENT VARIABLES The environment variables LC_ALL and LC_CTYPE are examined, in that or- der, for a locale. The first one that is set is used. This can be over- ridden by the --locale option. If no locale is set, the PCRE2 library's default (usually the "C" locale) is used. NEWLINES The -N (--newline) option allows pcre2grep to scan files with newline conventions that differ from the default. This option affects only the way scanned files are processed. It does not affect the interpretation of files specified by the -f, --file-list, --exclude-from, or --in- clude-from options. Any parts of the scanned input files that are written to the standard output are copied with whatever newline sequences they have in the in- put. However, if the final line of a file is output, and it does not end with a newline sequence, a newline sequence is added. If the new- line setting is CR, LF, CRLF or NUL, that line ending is output; for the other settings (ANYCRLF or ANY) a single NL is used. The newline setting does not affect the way in which pcre2grep writes newlines in informational messages to the standard output and error streams.
Under Windows, the standard output is set to be binary, so that "\r\n" at the ends of output lines that are copied from the input is not converted to "\r\r\n" by the C I/O library. This means that any messages written to the standard output must end with "\r\n". For all other operating systems, and for all messages to the standard error stream, "\n" is used. OPTIONS COMPATIBILITY WITH GNU GREP Many of the short and long forms of pcre2grep's options are the same as in the GNU grep program. Any long option of the form --xxx-regexp (GNU terminology) is also available as --xxx-regex (PCRE2 terminology). However, the --case-restrict, --depth-limit, -E, --file-list, --file- offsets, --heap-limit, --include-dir, --line-offsets, --locale, --match-limit, -M, --multiline, -N, --newline, --no-ucp, --om-separa- tor, --output, -P, -u, --utf, -U, and --utf-allow-invalid options are specific to pcre2grep, as is the use of the --only-matching option with a capturing parentheses number. Although most of the common options work the same way, a few are dif- ferent in pcre2grep. For example, the --include option's argument is a glob for GNU grep, but in pcre2grep it is a regular expression to which the -i option applies. If both the -c and -l options are given, GNU grep lists only file names, without counts, but pcre2grep gives the counts as well. OPTIONS WITH DATA There are four different ways in which an option with data can be spec- ified. If a short form option is used, the data may follow immedi- ately, or (with one exception) in the next command line item. For exam- ple: -f/some/file -f /some/file The exception is the -o option, which may appear with or without data. Because of this, if data is present, it must follow immediately in the same item, for example -o3. If a long form option is used, the data may appear in the same command line item, separated by an equals character, or (with two exceptions) it may appear in the next command line item. 
For example: --file=/some/file --file /some/file Note, however, that if you want to supply a file name beginning with ~ as data in a shell command, and have the shell expand ~ to a home di- rectory, you must separate the file name from the option, because the shell does not treat ~ specially unless it is at the start of an item. The exceptions to the above are the --colour (or --color) and --only- matching options, for which the data is optional. If one of these op- tions does have data, it must be given in the first form, using an equals character. Otherwise pcre2grep will assume that it has no data. USING PCRE2'S CALLOUT FACILITY pcre2grep has, by default, support for calling external programs or scripts or echoing specific strings during matching by making use of PCRE2's callout facility. However, this support can be completely or partially disabled when pcre2grep is built. You can find out whether your binary has support for callouts by running it with the --help op- tion. If callout support is completely disabled, callouts in patterns are forbidden by pcre2grep. If the facility is partially disabled, calling external programs is not supported, and callouts that request it are ignored. A callout in a PCRE2 pattern is of the form (?C<arg>) where the argu- ment is either a number or a quoted string (see the pcre2callout docu- mentation for details). Numbered callouts are ignored by pcre2grep; only callouts with string arguments are useful. Echoing a specific string Starting the callout string with a pipe character invokes an echoing facility that avoids calling an external program or script. This facil- ity is always available, provided that callouts were not completely disabled when pcre2grep was built. The rest of the callout string is processed as a zero-terminated string, which means it should not con- tain any internal binary zeros.
It is written to the output, having first been passed through the same escape processing as text from the --output (-O) option (see above). However, $0 or $& cannot be used to insert a matched substring because the match is still in progress. In- stead, the single character '0' is inserted. Any syntax errors in the string (for example, a dollar not followed by another character) causes the callout to be ignored. No terminator is added to the output string, so if you want a newline, you must include it explicitly using the es- cape $n. For example: pcre2grep '(.)(..(.))(?C"|[$1] [$2] [$3]$n")' Matching continues normally after the string is output. If you want to see only the callout output but not any output from an actual match, you should end the pattern with (*FAIL). Calling external programs or scripts This facility can be independently disabled when pcre2grep is built. It is supported for Windows, where a call to _spawnvp() is used, for VMS, where lib$spawn() is used, and for any Unix-like environment where fork() and execv() are available. If the callout string does not start with a pipe (vertical bar) charac- ter, it is parsed into a list of substrings separated by pipe charac- ters. The first substring must be an executable name, with the follow- ing substrings specifying arguments: executable_name|arg1|arg2|... Any substring (including the executable name) may contain escape se- quences started by a dollar character. These are the same as for the --output (-O) option documented above, except that $0 or $& cannot in- sert the matched string because the match is still in progress. In- stead, the character '0' is inserted. If you need a literal dollar or pipe character in any substring, use $$ or $| respectively. 
Here is an example: echo -e "abcde\n12345" | pcre2grep \ '(?x)(.)(..(.)) (?C"/bin/echo|Arg1: [$1] [$2] [$3]|Arg2: $|${1}$| ($4)")()' - Output: Arg1: [a] [bcd] [d] Arg2: |a| () abcde Arg1: [1] [234] [4] Arg2: |1| () 12345 The parameters for the system call that is used to run the program or script are zero-terminated strings. This means that binary zero charac- ters in the callout argument will cause premature termination of their substrings, and therefore should not be present. Any syntax errors in the string (for example, a dollar not followed by another character) causes the callout to be ignored. If running the program fails for any reason (including the non-existence of the executable), a local match- ing failure occurs and the matcher backtracks in the normal way. MATCHING ERRORS It is possible to supply a regular expression that takes a very long time to fail to match certain lines. Such patterns normally involve nested indefinite repeats, for example: (a+)*\d when matched against a line of a's with no final digit. The PCRE2 matching function has a re- source limit that causes it to abort in these circumstances. If this happens, pcre2grep outputs an error message and the line that caused the problem to the standard error stream. If there are more than 20 such errors, pcre2grep gives up. The --match-limit option of pcre2grep can be used to set the overall resource limit. There are also other limits that affect the amount of memory used during matching; see the discussion of --heap-limit and --depth-limit above. DIAGNOSTICS Exit status is 0 if any matches were found, 1 if no matches were found, and 2 for syntax errors, overlong lines, non-existent or inaccessible files (even if matches were found in other files) or too many matching errors. Using the -s option to suppress error messages about inaccessi- ble files does not affect the return code. 
When run under VMS, the return code is placed in the symbol PCRE2GREP_RC because VMS does not distinguish between exit(0) and exit(1). SEE ALSO pcre2pattern(3), pcre2syntax(3), pcre2callout(3), pcre2unicode(3). AUTHOR Philip Hazel Retired from University Computing Service Cambridge, England. REVISION Last updated: 24 January 2025 Copyright (c) 1997-2023 University of Cambridge. PCRE2 10.48-DEV 24 January 2025 PCRE2GREP(1) ================================================ FILE: doc/pcre2jit.3 ================================================ .TH PCRE2JIT 3 "22 August 2024" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH "PCRE2 JUST-IN-TIME COMPILER SUPPORT" .rs .sp Just-in-time compiling is a heavyweight optimization that can greatly speed up pattern matching. However, it comes at the cost of extra processing before the match is performed, so it is of most benefit when the same pattern is going to be matched many times. This does not necessarily mean many calls of a matching function; if the pattern is not anchored, matching attempts may take place many times at various positions in the subject, even for a single call. Therefore, if the subject string is very long, it may still pay to use JIT even for one-off matches. JIT support is available for all of the 8-bit, 16-bit and 32-bit PCRE2 libraries. .P JIT support applies only to the traditional Perl-compatible matching function. It does not apply when the DFA matching function is being used. The code for JIT support was written by Zoltan Herczeg. . . .SH "AVAILABILITY OF JIT SUPPORT" .rs .sp JIT support is an optional feature of PCRE2. The "configure" option --enable-jit (or equivalent CMake option) must be set when PCRE2 is built if you want to use JIT. 
The support is limited to the following hardware platforms: .sp ARM 32-bit (v7, and Thumb2) ARM 64-bit IBM s390x 64 bit Intel x86 32-bit and 64-bit LoongArch 64 bit MIPS 32-bit and 64-bit Power PC 32-bit and 64-bit RISC-V 32-bit and 64-bit .sp If --enable-jit is set on an unsupported platform, compilation fails. .P A client program can tell if JIT support has been compiled by calling \fBpcre2_config()\fP with the PCRE2_CONFIG_JIT option. The result is one if PCRE2 was built with JIT support, and zero otherwise. However, having the JIT code available does not guarantee that it will be used for any particular match. One reason for this is that there are a number of options and pattern items that are .\" HTML .\" not supported by JIT .\" (see below). Another reason is that in some environments JIT is unable to get executable memory in which to build its compiled code. The only guarantee from \fBpcre2_config()\fP is that if it returns zero, JIT will definitely \fInot\fP be used. .P As of release 10.45 there is a more informative way to test for JIT support. If \fBpcre2_jit_compile()\fP is called with the single option PCRE2_JIT_TEST_ALLOC it returns zero if JIT is available and has a working allocator. Otherwise it returns PCRE2_ERROR_NOMEMORY if JIT is available but cannot allocate executable memory, or PCRE2_ERROR_JIT_UNSUPPORTED if JIT support is not compiled. The code argument is ignored, so it can be a NULL value. .P A simple program does not need to check availability in order to use JIT when possible. The API is implemented in a way that falls back to the interpretive code if JIT is not available or cannot be used for a given match. For programs that need the best possible performance, there is a .\" HTML .\" "fast path" .\" API that is JIT-specific. . . .SH "SIMPLE USE OF JIT" .rs .sp To make use of the JIT support in the simplest way, all you have to do is to call \fBpcre2_jit_compile()\fP after successfully compiling a pattern with \fBpcre2_compile()\fP.
This function has two arguments: the first is the compiled pattern pointer that was returned by \fBpcre2_compile()\fP, and the second is zero or more of the following option bits: PCRE2_JIT_COMPLETE, PCRE2_JIT_PARTIAL_HARD, or PCRE2_JIT_PARTIAL_SOFT. .P If JIT support is not available, a call to \fBpcre2_jit_compile()\fP does nothing and returns PCRE2_ERROR_JIT_BADOPTION. Otherwise, the compiled pattern is passed to the JIT compiler, which turns it into machine code that executes much faster than the normal interpretive code, but yields exactly the same results. The returned value from \fBpcre2_jit_compile()\fP is zero on success, or a negative error code. .P There is a limit to the size of pattern that JIT supports, imposed by the size of machine stack that it uses. The exact rules are not documented because they may change at any time, in particular, when new optimizations are introduced. If a pattern is too big, a call to \fBpcre2_jit_compile()\fP returns PCRE2_ERROR_NOMEMORY. .P PCRE2_JIT_COMPLETE requests the JIT compiler to generate code for complete matches. If you want to run partial matches using the PCRE2_PARTIAL_HARD or PCRE2_PARTIAL_SOFT options of \fBpcre2_match()\fP, you should set one or both of the other options as well as, or instead of PCRE2_JIT_COMPLETE. The JIT compiler generates different optimized code for each of the three modes (normal, soft partial, hard partial). When \fBpcre2_match()\fP is called, the appropriate code is run if it is available. Otherwise, the pattern is matched using interpretive code. .P You can call \fBpcre2_jit_compile()\fP multiple times for the same compiled pattern. It does nothing if it has previously compiled code for any of the option bits. For example, you can call it once with PCRE2_JIT_COMPLETE and (perhaps later, when you find you need partial matching) again with PCRE2_JIT_COMPLETE and PCRE2_JIT_PARTIAL_HARD. This time it will ignore PCRE2_JIT_COMPLETE and just compile code for partial matching. 
If \fBpcre2_jit_compile()\fP is called with no option bits set, it immediately returns zero. This is an alternative way of testing whether JIT support has been compiled. .P At present, it is not possible to free JIT compiled code except when the entire compiled pattern is freed by calling \fBpcre2_code_free()\fP. .P In some circumstances you may need to call additional functions. These are described in the section entitled .\" HTML .\" "Controlling the JIT stack" .\" below. .P There are some \fBpcre2_match()\fP options that are not supported by JIT, and there are also some pattern items that JIT cannot handle. Details are given .\" HTML .\" below. .\" In both cases, matching automatically falls back to the interpretive code. If you want to know whether JIT was actually used for a particular match, you should arrange for a JIT callback function to be set up as described in the section entitled .\" HTML .\" "Controlling the JIT stack" .\" below, even if you do not need to supply a non-default JIT stack. Such a callback function is called whenever JIT code is about to be obeyed. If the match-time options are not right for JIT execution, the callback function is not obeyed. .P If the JIT compiler finds an unsupported item, no JIT data is generated. You can find out if JIT compilation was successful for a compiled pattern by calling \fBpcre2_pattern_info()\fP with the PCRE2_INFO_JITSIZE option. A non-zero result means that JIT compilation was successful. A result of 0 means that JIT support is not available, or the pattern was not processed by \fBpcre2_jit_compile()\fP, or the JIT compiler was not able to handle the pattern. Successful JIT compilation does not, however, guarantee the use of JIT at match time because there are some match time options that are not supported by JIT. . . .SH "MATCHING SUBJECTS CONTAINING INVALID UTF" .rs .sp When a pattern is compiled with the PCRE2_UTF option, subject strings are normally expected to be a valid sequence of UTF code units. 
By default, this is checked at the start of matching and an error is generated if invalid UTF is detected. The PCRE2_NO_UTF_CHECK option can be passed to \fBpcre2_match()\fP to skip the check (for improved performance) if you are sure that a subject string is valid. If this option is used with an invalid string, the result is undefined. The calling program may crash or loop or otherwise misbehave. .P However, a way of running matches on strings that may contain invalid UTF sequences is available. Calling \fBpcre2_compile()\fP with the PCRE2_MATCH_INVALID_UTF option has two effects: it tells the interpreter in \fBpcre2_match()\fP to support invalid UTF, and, if \fBpcre2_jit_compile()\fP is subsequently called, the compiled JIT code also supports invalid UTF. Details of how this support works, in both the JIT and the interpretive cases, is given in the .\" HREF \fBpcre2unicode\fP .\" documentation. .P There is also an obsolete option for \fBpcre2_jit_compile()\fP called PCRE2_JIT_INVALID_UTF, which currently exists only for backward compatibility. It is superseded by the \fBpcre2_compile()\fP option PCRE2_MATCH_INVALID_UTF and should no longer be used. It may be removed in future. . . .\" HTML .SH "UNSUPPORTED OPTIONS AND PATTERN ITEMS" .rs .sp The \fBpcre2_match()\fP options that are supported for JIT matching are PCRE2_COPY_MATCHED_SUBJECT, PCRE2_NOTBOL, PCRE2_NOTEOL, PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART, PCRE2_NO_UTF_CHECK, PCRE2_PARTIAL_HARD, and PCRE2_PARTIAL_SOFT. The PCRE2_ANCHORED and PCRE2_ENDANCHORED options are not supported at match time. .P If the PCRE2_NO_JIT option is passed to \fBpcre2_match()\fP it disables the use of JIT, forcing matching by the interpreter code. .P The only unsupported pattern items are \eC (match a single data unit) when running in a UTF mode, and a callout immediately before an assertion condition in a conditional group. . . 
.SH "RETURN VALUES FROM JIT MATCHING" .rs .sp When a pattern is matched using JIT, the return values are the same as those given by the interpretive \fBpcre2_match()\fP code, with the addition of one new error code: PCRE2_ERROR_JIT_STACKLIMIT. This means that the memory used for the JIT stack was insufficient. See .\" HTML .\" "Controlling the JIT stack" .\" below for a discussion of JIT stack usage. .P The error code PCRE2_ERROR_MATCHLIMIT is returned by the JIT code if searching a very large pattern tree goes on for too long, as it is in the same circumstance when JIT is not used, but the details of exactly what is counted are not the same. The PCRE2_ERROR_DEPTHLIMIT error code is never returned when JIT matching is used. . . .\" HTML .SH "CONTROLLING THE JIT STACK" .rs .sp When the compiled JIT code runs, it needs a block of memory to use as a stack. By default, it uses 32KiB on the machine stack. However, some large or complicated patterns need more than this. The error PCRE2_ERROR_JIT_STACKLIMIT is given when there is not enough stack. Three functions are provided for managing blocks of memory for use as JIT stacks. There is further discussion about the use of JIT stacks in the section entitled .\" HTML .\" "JIT stack FAQ" .\" below. .P The \fBpcre2_jit_stack_create()\fP function creates a JIT stack. Its arguments are a starting size, a maximum size, and a general context (for memory allocation functions, or NULL for standard memory allocation). It returns a pointer to an opaque structure of type \fBpcre2_jit_stack\fP, or NULL if there is an error. The \fBpcre2_jit_stack_free()\fP function is used to free a stack that is no longer needed. If its argument is NULL, this function returns immediately, without doing anything. (For the technically minded: the address space is allocated by mmap or VirtualAlloc.) A maximum stack size of 512KiB to 1MiB should be more than enough for any pattern. 
.P The \fBpcre2_jit_stack_assign()\fP function specifies which stack JIT code should use. Its arguments are as follows: .sp pcre2_match_context *mcontext pcre2_jit_callback callback void *data .sp The first argument is a pointer to a match context. When this is subsequently passed to a matching function, its information determines which JIT stack is used. If this argument is NULL, the function returns immediately, without doing anything. There are three cases for the values of the other two arguments: .sp (1) If \fIcallback\fP is NULL and \fIdata\fP is NULL, an internal 32KiB block on the machine stack is used. This is the default when a match context is created. .sp (2) If \fIcallback\fP is NULL and \fIdata\fP is not NULL, \fIdata\fP must be a pointer to a valid JIT stack, the result of calling \fBpcre2_jit_stack_create()\fP. .sp (3) If \fIcallback\fP is not NULL, it must point to a function that is called with \fIdata\fP as an argument at the start of matching, in order to set up a JIT stack. If the return from the callback function is NULL, the internal 32KiB stack is used; otherwise the return value must be a valid JIT stack, the result of calling \fBpcre2_jit_stack_create()\fP. .sp A callback function is obeyed whenever JIT code is about to be run; it is not obeyed when \fBpcre2_match()\fP is called with options that are incompatible for JIT matching. A callback function can therefore be used to determine whether a match operation was executed by JIT or by the interpreter. .P You may safely use the same JIT stack for more than one pattern (either by assigning directly or by callback), as long as the patterns are matched sequentially in the same thread. Currently, the only way to set up non-sequential matches in one thread is to use callouts: if a callout function starts another match, that match must use a different JIT stack to the one used for currently suspended match(es).
.P In a multithread application, if you do not specify a JIT stack, or if you assign or pass back NULL from a callback, that is thread-safe, because each thread has its own machine stack. However, if you assign or pass back a non-NULL JIT stack, this must be a different stack for each thread so that the application is thread-safe. .P Strictly speaking, even more is allowed. You can assign the same non-NULL stack to a match context that is used by any number of patterns, as long as they are not used for matching by multiple threads at the same time. For example, you could use the same stack in all compiled patterns, with a global mutex in the callback to wait until the stack is available for use. However, this is an inefficient solution, and not recommended. .P This is a suggestion for how a multithreaded program that needs to set up non-default JIT stacks might operate: .sp During thread initialization thread_local_var = pcre2_jit_stack_create(...) .sp During thread exit pcre2_jit_stack_free(thread_local_var) .sp Use a one-line callback function return thread_local_var .sp All the functions described in this section do nothing if JIT is not available. . . .\" HTML .SH "JIT STACK FAQ" .rs .sp (1) Why do we need JIT stacks? .sp PCRE2 (and JIT) is a recursive, depth-first engine, so it needs a stack where the local data of the current node is pushed before checking its child nodes. Allocating real machine stack on some platforms is difficult. For example, the stack chain needs to be updated every time if we extend the stack on PowerPC. Although it is possible, its updating time overhead decreases performance. So we do the recursion in memory. .P (2) Why don't we simply allocate blocks of memory with \fBmalloc()\fP? .sp Modern operating systems have a nice feature: they can reserve an address space instead of allocating memory. 
We can safely allocate memory pages inside this address space, so the stack could grow without moving memory data (this is important because of pointers). Thus we can allocate 1MiB address space, and use only a single memory page (usually 4KiB) if that is enough. However, we can still grow up to 1MiB anytime if needed. .P (3) Who "owns" a JIT stack? .sp The owner of the stack is the user program, not the JIT studied pattern or anything else. The user program must ensure that if a stack is being used by \fBpcre2_match()\fP, (that is, it is assigned to a match context that is passed to the pattern currently running), that stack must not be used by any other threads (to avoid overwriting the same memory area). The best practice for multithreaded programs is to allocate a stack for each thread, and return this stack through the JIT callback function. .P (4) When should a JIT stack be freed? .sp You can free a JIT stack at any time, as long as it will not be used by \fBpcre2_match()\fP again. When you assign the stack to a match context, only a pointer is set. There is no reference counting or any other magic. You can free compiled patterns, contexts, and stacks in any order, anytime. Just \fIdo not\fP call \fBpcre2_match()\fP with a match context pointing to an already freed stack, as that will cause SEGFAULT. (Also, do not free a stack currently used by \fBpcre2_match()\fP in another thread). You can also replace the stack in a context at any time when it is not in use. You should free the previous stack before assigning a replacement. .P (5) Should I allocate/free a stack every time before/after calling \fBpcre2_match()\fP? .sp No, because this is too costly in terms of resources. However, you could implement some clever idea which releases the stack if it is not used in, let's say, two minutes. The JIT callback can help to achieve this without keeping a list of patterns. .P (6) OK, the stack is for long term memory allocation.
But what happens if a pattern causes stack overflow with a stack of 1MiB? Is that 1MiB kept until the stack is freed? .sp Especially on embedded systems, it might be a good idea to release memory sometimes without freeing the stack. There is no API for this at the moment. Probably a function call which returns with the currently allocated memory for any stack and another which allows releasing memory (shrinking the stack) would be a good idea if someone needs this. .P (7) This is too much of a headache. Isn't there any better solution for JIT stack handling? .sp No, thanks to Windows. If POSIX threads were used everywhere, we could throw out this complicated API. . . .SH "FREEING JIT SPECULATIVE MEMORY" .rs .sp .nf .B void pcre2_jit_free_unused_memory(pcre2_general_context *\fIgcontext\fP); .fi .P The JIT executable allocator does not free all memory when it is possible. It expects new allocations, and keeps some free memory around to improve allocation speed. However, in low memory conditions, it might be better to free all possible memory. You can cause this to happen by calling pcre2_jit_free_unused_memory(). Its argument is a general context, for custom memory management, or NULL for standard memory management. . . .SH "EXAMPLE CODE" .rs .sp This is a single-threaded example that specifies a JIT stack without using a callback. A real program should include error checking after all the function calls. 
.sp int rc; pcre2_code *re; pcre2_match_data *match_data; pcre2_match_context *mcontext; pcre2_jit_stack *jit_stack; .sp re = pcre2_compile(pattern, PCRE2_ZERO_TERMINATED, 0, &errornumber, &erroffset, NULL); rc = pcre2_jit_compile(re, PCRE2_JIT_COMPLETE); mcontext = pcre2_match_context_create(NULL); jit_stack = pcre2_jit_stack_create(32*1024, 512*1024, NULL); pcre2_jit_stack_assign(mcontext, NULL, jit_stack); match_data = pcre2_match_data_create(re, 10); rc = pcre2_match(re, subject, length, 0, 0, match_data, mcontext); /* Process result */ .sp pcre2_code_free(re); pcre2_match_data_free(match_data); pcre2_match_context_free(mcontext); pcre2_jit_stack_free(jit_stack); .sp . . .\" HTML .SH "JIT FAST PATH API" .rs .sp Because the API described above falls back to interpreted matching when JIT is not available, it is convenient for programs that are written for general use in many environments. However, calling JIT via \fBpcre2_match()\fP does have a performance impact. Programs that are written for use where JIT is known to be available, and which need the best possible performance, can instead use a "fast path" API to call JIT matching directly instead of calling \fBpcre2_match()\fP (obviously only for patterns that have been successfully processed by \fBpcre2_jit_compile()\fP). .P The fast path function is called \fBpcre2_jit_match()\fP, and it takes exactly the same arguments as \fBpcre2_match()\fP. However, the subject string must be specified with a length; PCRE2_ZERO_TERMINATED is not supported. Unsupported option bits (for example, PCRE2_ANCHORED and PCRE2_ENDANCHORED) are ignored, as is the PCRE2_NO_JIT option. The return values are also the same as for \fBpcre2_match()\fP, plus PCRE2_ERROR_JIT_BADOPTION if a matching mode (partial or complete) is requested that was not compiled. .P When you call \fBpcre2_match()\fP, as well as testing for invalid options, a number of other sanity checks are performed on the arguments. 
For example, if the subject pointer is NULL but the length is non-zero, an immediate error is given. Also, unless PCRE2_NO_UTF_CHECK is set, a UTF subject string is tested for validity. In the interests of speed, these checks do not happen on the JIT fast path. If invalid UTF data is passed when PCRE2_MATCH_INVALID_UTF was not set for \fBpcre2_compile()\fP, the result is undefined. The program may crash or loop or give wrong results. In the absence of PCRE2_MATCH_INVALID_UTF you should call \fBpcre2_jit_match()\fP in UTF mode only if you are sure the subject is valid. .P Bypassing the sanity checks and the \fBpcre2_match()\fP wrapping can give speedups of more than 10%. . . .SH "SEE ALSO" .rs .sp \fBpcre2api\fP(3), \fBpcre2unicode\fP(3) . . .SH AUTHOR .rs .sp .nf Philip Hazel (FAQ by Zoltan Herczeg) Retired from University Computing Service Cambridge, England. .fi . . .SH REVISION .rs .sp .nf Last updated: 22 August 2024 Copyright (c) 1997-2024 University of Cambridge. .fi ================================================ FILE: doc/pcre2limits.3 ================================================ .TH PCRE2LIMITS 3 "03 September 2025" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH "SIZE AND OTHER LIMITATIONS" .rs .sp There are some size limitations in PCRE2 but it is hoped that they will never in practice be relevant. .P The maximum size of a compiled pattern is approximately 64 thousand code units for the 8-bit and 16-bit libraries if PCRE2 is compiled with the default internal linkage size, which is 2 bytes for these libraries. If you want to process regular expressions that are truly enormous, you can compile PCRE2 with an internal linkage size of 3 or 4 (when building the 16-bit library, 3 is rounded up to 4). See the \fBREADME\fP file in the source distribution and the .\" HREF \fBpcre2build\fP .\" documentation for details. In these cases the limit is substantially larger. However, the speed of execution is slower. 
In the 32-bit library, the internal linkage size is always 4. .P The maximum length of a source pattern string is essentially unlimited; it is the largest number a PCRE2_SIZE variable can hold. However, the program that calls \fBpcre2_compile()\fP can specify a smaller limit. .P The maximum length (in code units) of a subject string is one less than the largest number a PCRE2_SIZE variable can hold. PCRE2_SIZE is an unsigned integer type, usually defined as size_t. Its maximum value (that is ~(PCRE2_SIZE)0) is reserved as a special indicator for zero-terminated strings and unset offsets. .P All values in repeating quantifiers must be less than 65536. .P There are two different limits that apply to branches of lookbehind assertions. If every branch in such an assertion matches a fixed number of characters, the maximum length of any branch is 65535 characters. If any branch matches a variable number of characters, then the maximum matching length for every branch is limited. The default limit is set at compile time, defaulting to 255, but can be changed by the calling program. .P There is no limit to the number of parenthesized groups, but there can be no more than 65535 capture groups, and there is a limit to the depth of nesting of parenthesized subpatterns of all kinds. This is imposed in order to limit the amount of system stack used at compile time. The default limit can be specified when PCRE2 is built; if not, the default is set to 250. An application can change this limit by calling pcre2_set_parens_nest_limit() to set the limit in a compile context. .P The maximum length of the name for a named capture group as well as the number of such groups is configurable at build time. The maximum length for the name defaults to .\" DEFINE MAX_NAME_SIZE 128 code units, and the maximum number of such groups to .\" DEFINE MAX_NAME_COUNT 10000. 
.P The maximum length of a name in a (*MARK), (*PRUNE), (*SKIP), or (*THEN) verb is 255 code units for the 8-bit library and 65535 code units for the 16-bit and 32-bit libraries. .P The maximum length of a string argument to a callout is the largest number a 32-bit unsigned integer can hold. .P The maximum amount of heap memory used for matching is controlled by the heap limit, which can be set in a pattern or in a match context. The default is a very large number, effectively unlimited. . . .SH AUTHOR .rs .sp .nf Philip Hazel Retired from University Computing Service Cambridge, England. .fi . . .SH REVISION .rs .sp .nf Last updated: 03 September 2025 Copyright (c) 1997-2023 University of Cambridge. .fi ================================================ FILE: doc/pcre2matching.3 ================================================ .TH PCRE2MATCHING 3 "22 February 2025" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH "PCRE2 MATCHING ALGORITHMS" .rs .sp This document describes the two different algorithms that are available in PCRE2 for matching a compiled regular expression against a given subject string. The "standard" algorithm is the one provided by the \fBpcre2_match()\fP function. This works in the same way as Perl's matching function, and provides a Perl-compatible matching operation. The just-in-time (JIT) optimization that is described in the .\" HREF \fBpcre2jit\fP .\" documentation is compatible with this function. .P An alternative algorithm is provided by the \fBpcre2_dfa_match()\fP function; it operates in a different way, and is not Perl-compatible. This alternative has advantages and disadvantages compared with the standard algorithm, and these are described below. .P When there is only one possible way in which a given subject string can match a pattern, the two algorithms give the same answer. A difference arises, however, when there are multiple possibilities. 
For example, if the anchored pattern .sp ^<.*> .sp is matched against the string .sp <something> <something else> <something further> .sp there are three possible answers. The standard algorithm finds only one of them, whereas the alternative algorithm finds all three. . . .SH "REGULAR EXPRESSIONS AS TREES" .rs .sp The set of strings that are matched by a regular expression can be represented as a tree structure. An unlimited repetition in the pattern makes the tree of infinite size, but it is still a tree. Matching the pattern to a given subject string (from a given starting point) can be thought of as a search of the tree. There are two ways to search a tree: depth-first and breadth-first, and these correspond to the two matching algorithms provided by PCRE2. . . .SH "THE STANDARD MATCHING ALGORITHM" .rs .sp In the terminology of Jeffrey Friedl's book "Mastering Regular Expressions", the standard algorithm is an "NFA algorithm". It conducts a depth-first search of the pattern tree. That is, it proceeds along a single path through the tree, checking that the subject matches what is required. When there is a mismatch, the algorithm tries any alternatives at the current point, and if they all fail, it backs up to the previous branch point in the tree, and tries the next alternative branch at that level. This often involves backing up (moving to the left) in the subject string as well. The order in which repetition branches are tried is controlled by the greedy or ungreedy nature of the quantifier. .P If a leaf node is reached, a matching string has been found, and at that point the algorithm stops. Thus, if there is more than one possible match, this algorithm returns the first one that it finds. Whether this is the shortest, the longest, or some intermediate length depends on the way the alternations and the greedy or ungreedy repetition quantifiers are specified in the pattern. 
.P Because it ends up with a single path through the tree, it is relatively straightforward for this algorithm to keep track of the substrings that are matched by portions of the pattern in parentheses. This provides support for capturing parentheses and backreferences. . . .SH "THE ALTERNATIVE MATCHING ALGORITHM" .rs .sp This algorithm conducts a breadth-first search of the tree. Starting from the first matching point in the subject, it scans the subject string from left to right, once, character by character, and as it does this, it remembers all the paths through the tree that represent valid matches. In Friedl's terminology, this is a kind of "DFA algorithm", though it is not implemented as a traditional finite state machine (it keeps multiple states active simultaneously). .P Although the general principle of this matching algorithm is that it scans the subject string only once, without backtracking, there is one exception: when a lookaround assertion is encountered, the characters following or preceding the current point have to be independently inspected. .P The scan continues until either the end of the subject is reached, or there are no more unterminated paths. At this point, terminated paths represent the different matching possibilities (if there are none, the match has failed). Thus, if there is more than one possible match, this algorithm finds all of them, and in particular, it finds the longest. The matches are returned in the output vector in decreasing order of length. There is an option to stop the algorithm after the first match (which is necessarily the shortest) is found. .P Note that the size of vector needed to contain all the results depends on the number of simultaneous matches, not on the number of capturing parentheses in the pattern. Using \fBpcre2_match_data_create_from_pattern()\fP to create the match data block is therefore not advisable when doing DFA matching. 
.P Note also that all the matches that are found start at the same point in the subject. If the pattern .sp cat(er(pillar)?)? .sp is matched against the string "the caterpillar catchment", the result is the three strings "caterpillar", "cater", and "cat" that start at the fifth character of the subject. The algorithm does not automatically move on to find matches that start at later positions. .P PCRE2's "auto-possessification" optimization usually applies to character repeats at the end of a pattern (as well as internally). For example, the pattern "a\ed+" is compiled as if it were "a\ed++" because there is no point even considering the possibility of backtracking into the repeated digits. For DFA matching, this means that only one possible match is found. If you really do want multiple matches in such cases, either use an ungreedy repeat ("a\ed+?") or set the PCRE2_NO_AUTO_POSSESS option when compiling. .P There are a number of features of PCRE2 regular expressions that are not supported or behave differently in the alternative matching function. Those that are not supported cause an error if encountered. .P 1. Because the algorithm finds all possible matches, the greedy or ungreedy nature of repetition quantifiers is not relevant (though it may affect auto-possessification, as just described). During matching, greedy and ungreedy quantifiers are treated in exactly the same way. However, possessive quantifiers can make a difference when what follows could also match what is quantified, for example in a pattern like this: .sp ^a++\ew! .sp This pattern matches "aaab!" but not "aaa!", which would be matched by a non-possessive quantifier. Similarly, if an atomic group is present, it is matched as if it were a standalone pattern at the current point, and the longest match is then "locked in" for the rest of the overall pattern. .P 2. 
When dealing with multiple paths through the tree simultaneously, it is not straightforward to keep track of captured substrings for the different matching possibilities, and PCRE2's implementation of this algorithm does not attempt to do this. This means that no captured substrings are available. .P 3. Because no substrings are captured, a number of related features are not available: .sp (a) Backreferences; .sp (b) Conditional expressions that use a backreference as the condition or test for a specific group recursion; .sp (c) Script runs; .sp (d) Scan substring assertions. .P 4. Because many paths through the tree may be active, the \eK escape sequence, which resets the start of the match when encountered (but may be on some paths and not on others), is not supported. .P 5. Callouts are supported, but the value of the \fIcapture_top\fP field is always 1, and the value of the \fIcapture_last\fP field is always 0. .P 6. The \eC escape sequence, which (in the standard algorithm) always matches a single code unit, even in a UTF mode, is not supported in UTF modes because the alternative algorithm moves through the subject string one character (not code unit) at a time, for all active paths through the tree. .P 7. Except for (*FAIL), the backtracking control verbs such as (*PRUNE) are not supported. (*FAIL) is supported, and behaves like a failing negative assertion. .P 8. The PCRE2_MATCH_INVALID_UTF option for \fBpcre2_compile()\fP is not supported by \fBpcre2_dfa_match()\fP. . . .SH "ADVANTAGES OF THE ALTERNATIVE ALGORITHM" .rs .sp The main advantage of the alternative algorithm is that all possible matches (at a single point in the subject) are automatically found, and in particular, the longest match is found. To find more than one match at the same point using the standard algorithm, you have to do kludgy things with callouts. .P Partial matching is possible with this algorithm, though it has some limitations. 
The .\" HREF \fBpcre2partial\fP .\" documentation gives details of partial matching and discusses multi-segment matching. . . .SH "DISADVANTAGES OF THE ALTERNATIVE ALGORITHM" .rs .sp The alternative algorithm suffers from a number of disadvantages: .P 1. It is substantially slower than the standard algorithm. This is partly because it has to search for all possible matches, but is also because it is less susceptible to optimization. .P 2. Capturing parentheses and other features such as backreferences that rely on them are not supported. .P 3. Matching within invalid UTF strings is not supported. .P 4. Although atomic groups are supported, their use does not provide the performance advantage that it does for the standard algorithm. .P 5. JIT optimization is not supported. . . .SH AUTHOR .rs .sp .nf Philip Hazel Retired from University Computing Service Cambridge, England. .fi . . .SH REVISION .rs .sp .nf Last updated: 22 February 2025 Copyright (c) 1997-2024 University of Cambridge. .fi ================================================ FILE: doc/pcre2partial.3 ================================================ .TH PCRE2PARTIAL 3 "27 November 2024" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH "PARTIAL MATCHING IN PCRE2" .rs .sp In normal use of PCRE2, if there is a match up to the end of a subject string, but more characters are needed to match the entire pattern, PCRE2_ERROR_NOMATCH is returned, just like any other failing match. There are circumstances where it might be helpful to distinguish this "partial match" case. .P One example is an application where the subject string is very long, and not all available at once. The requirement here is to be able to do the matching segment by segment, but special action is needed when a matched substring spans the boundary between two segments. .P Another example is checking a user input string as it is typed, to ensure that it conforms to a required format. 
Invalid characters can be immediately diagnosed and rejected, giving instant feedback. .P Partial matching is a PCRE2-specific feature; it is not Perl-compatible. It is requested by setting one of the PCRE2_PARTIAL_HARD or PCRE2_PARTIAL_SOFT options when calling a matching function. The difference between the two options is whether or not a partial match is preferred to an alternative complete match, though the details differ between the two types of matching function. If both options are set, PCRE2_PARTIAL_HARD takes precedence. .P If you want to use partial matching with just-in-time optimized code, as well as setting a partial match option for the matching function, you must also call \fBpcre2_jit_compile()\fP with one or both of these options: .sp PCRE2_JIT_PARTIAL_HARD PCRE2_JIT_PARTIAL_SOFT .sp PCRE2_JIT_COMPLETE should also be set if you are going to run non-partial matches on the same pattern. Separate code is compiled for each mode. If the appropriate JIT mode has not been compiled, interpretive matching code is used. .P Setting a partial matching option disables two of PCRE2's standard optimization hints. PCRE2 remembers the last literal code unit in a pattern, and abandons matching immediately if it is not present in the subject string. This optimization cannot be used for a subject string that might match only partially. PCRE2 also remembers a minimum length of a matching string, and does not bother to run the matching function on shorter strings. This optimization is also disabled for partial matching. . . .SH "REQUIREMENTS FOR A PARTIAL MATCH" .rs .sp A possible partial match occurs during matching when the end of the subject string is reached successfully, but either more characters are needed to complete the match, or the addition of more characters might change what is matched. .P Example 1: if the pattern is /abc/ and the subject is "ab", more characters are definitely needed to complete a match. 
In this case both hard and soft matching options yield a partial match. .P Example 2: if the pattern is /ab+/ and the subject is "ab", a complete match can be found, but the addition of more characters might change what is matched. In this case, only PCRE2_PARTIAL_HARD returns a partial match; PCRE2_PARTIAL_SOFT returns the complete match. .P On reaching the end of the subject, when PCRE2_PARTIAL_HARD is set, if the next pattern item is \ez, \eZ, \eb, \eB, or $ there is always a partial match. Otherwise, for both options, the next pattern item must be one that inspects a character, and at least one of the following must be true: .P (1) At least one character has already been inspected. An inspected character need not form part of the final matched string; lookbehind assertions and the \eK escape sequence provide ways of inspecting characters before the start of a matched string. .P (2) The pattern contains one or more lookbehind assertions. This condition exists in case there is a lookbehind that inspects characters before the start of the match. .P (3) There is a special case when the whole pattern can match an empty string. When the starting point is at the end of the subject, the empty string match is a possibility, and if PCRE2_PARTIAL_SOFT is set and neither of the above conditions is true, it is returned. However, because adding more characters might result in a non-empty match, PCRE2_PARTIAL_HARD returns a partial match, which in this case means "there is going to be a match at this point, but until some more characters are added, we do not know if it will be an empty string or something longer". . . . .SH "PARTIAL MATCHING USING pcre2_match()" .rs .sp When a partial matching option is set, the result of calling \fBpcre2_match()\fP can be one of the following: .TP 2 \fBA successful match\fP A complete match has been found, starting and ending within this subject. .TP \fBPCRE2_ERROR_NOMATCH\fP No match can start anywhere in this subject. 
.TP \fBPCRE2_ERROR_PARTIAL\fP Adding more characters may result in a complete match that uses one or more characters from the end of this subject. .P When a partial match is returned, the first two elements in the ovector point to the portion of the subject that was matched, but the values in the rest of the ovector are undefined. The appearance of \eK in the pattern has no effect for a partial match. Consider this pattern: .sp /abc\eK123/ .sp If it is matched against "456abc123xyz" the result is a complete match, and the ovector defines the matched string as "123", because \eK resets the "start of match" point. However, if a partial match is requested and the subject string is "456abc12", a partial match is found for the string "abc12", because all these characters are needed for a subsequent re-match with additional characters. .P If there is more than one partial match, the first one that was found provides the data that is returned. Consider this pattern: .sp /123\ew+X|dogY/ .sp If this is matched against the subject string "abc123dog", both alternatives fail to match, but the end of the subject is reached during matching, so PCRE2_ERROR_PARTIAL is returned. The offsets are set to 3 and 9, identifying "123dog" as the first partial match. (In this example, there are two partial matches, because "dog" on its own partially matches the second alternative.) . . .SS "How a partial match is processed by pcre2_match()" .rs .sp What happens when a partial match is identified depends on which of the two partial matching options is set. .P If PCRE2_PARTIAL_HARD is set, PCRE2_ERROR_PARTIAL is returned as soon as a partial match is found, without continuing to search for possible complete matches. This option is "hard" because it prefers an earlier partial match over a later complete match. 
For this reason, the assumption is made that the end of the supplied subject string is not the true end of the available data, which is why \ez, \eZ, \eb, \eB, and $ always give a partial match. .P If PCRE2_PARTIAL_SOFT is set, the partial match is remembered, but matching continues as normal, and other alternatives in the pattern are tried. If no complete match can be found, PCRE2_ERROR_PARTIAL is returned instead of PCRE2_ERROR_NOMATCH. This option is "soft" because it prefers a complete match over a partial match. All the various matching items in a pattern behave as if the subject string is potentially complete; \ez, \eZ, and $ match at the end of the subject, as normal, and for \eb and \eB the end of the subject is treated as a non-alphanumeric. .P The difference between the two partial matching options can be illustrated by a pattern such as: .sp /dog(sbody)?/ .sp This matches either "dog" or "dogsbody", greedily (that is, it prefers the longer string if possible). If it is matched against the string "dog" with PCRE2_PARTIAL_SOFT, it yields a complete match for "dog". However, if PCRE2_PARTIAL_HARD is set, the result is PCRE2_ERROR_PARTIAL. On the other hand, if the pattern is made ungreedy the result is different: .sp /dog(sbody)??/ .sp In this case the result is always a complete match because that is found first, and matching never continues after finding a complete match. It might be easier to follow this explanation by thinking of the two patterns like this: .sp /dog(sbody)?/ is the same as /dogsbody|dog/ /dog(sbody)??/ is the same as /dog|dogsbody/ .sp The second pattern will never match "dogsbody", because it will always find the shorter match first. . . .SS "Example of partial matching using pcre2test" .rs .sp The \fBpcre2test\fP data modifiers \fBpartial_hard\fP (or \fBph\fP) and \fBpartial_soft\fP (or \fBps\fP) set PCRE2_PARTIAL_HARD and PCRE2_PARTIAL_SOFT, respectively, when calling \fBpcre2_match()\fP. 
Here is a run of \fBpcre2test\fP using a pattern that matches the whole subject in the form of a date: .sp re> /^\ed?\ed(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\ed\ed$/ data> 25dec3\e=ph Partial match: 25dec3 data> 3ju\e=ph Partial match: 3ju data> 3juj\e=ph No match .sp This example gives the same results for both hard and soft partial matching options. Here is an example where there is a difference: .sp re> /^\ed?\ed(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\ed\ed$/ data> 25jun04\e=ps 0: 25jun04 1: jun data> 25jun04\e=ph Partial match: 25jun04 .sp With PCRE2_PARTIAL_SOFT, the subject is matched completely. For PCRE2_PARTIAL_HARD, however, the subject is assumed not to be complete, so there is only a partial match. . . . .SH "MULTI-SEGMENT MATCHING WITH pcre2_match()" .rs .sp PCRE was not originally designed with multi-segment matching in mind. However, over time, features (including partial matching) that make multi-segment matching possible have been added. A very long string can be searched segment by segment by calling \fBpcre2_match()\fP repeatedly, with the aim of achieving the same results that would happen if the entire string was available for searching all the time. Normally, the strings that are being sought are much shorter than each individual segment, and are in the middle of very long strings, so the pattern is normally not anchored. .P Special logic must be implemented to handle a matched substring that spans a segment boundary. PCRE2_PARTIAL_HARD should be used, because it returns a partial match at the end of a segment whenever there is the possibility of changing the match by adding more characters. The PCRE2_NOTBOL option should also be set for all but the first segment. .P When a partial match occurs, the next segment must be added to the current subject and the match re-run, using the \fIstartoffset\fP argument of \fBpcre2_match()\fP to begin at the point where the partial match started. 
For example: .sp re> /\ed?\ed(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\ed\ed/ data> ...the date is 23ja\e=ph Partial match: 23ja data> ...the date is 23jan19 and on that day...\e=offset=15 0: 23jan19 1: jan .sp Note the use of the \fBoffset\fP modifier to start the new match where the partial match was found. In this example, the next segment was added to the one in which the partial match was found. This is the most straightforward approach, typically using a memory buffer that is twice the size of each segment. After a partial match, the first half of the buffer is discarded, the second half is moved to the start of the buffer, and a new segment is added before repeating the match as in the example above. After a no match, the entire buffer can be discarded. .P If there are memory constraints, you may want to discard text that precedes a partial match before adding the next segment. Unfortunately, this is not at present straightforward. In cases such as the above, where the pattern does not contain any lookbehinds, it is sufficient to retain only the partially matched substring. However, if the pattern contains a lookbehind assertion, characters that precede the start of the partial match may have been inspected during the matching process. When \fBpcre2test\fP displays a partial match, it indicates these characters with '<' if the \fBallusedtext\fP modifier is set: .sp re> "(?<=123)abc" data> xx123ab\e=ph,allusedtext Partial match: 123ab <<< .sp However, the \fBallusedtext\fP modifier is not available for JIT matching, because JIT matching does not record the first (or last) consulted characters. For this reason, this information is not available via the API. It is therefore not possible in general to obtain the exact number of characters that must be retained in order to get the right match result. If you cannot retain the entire segment, you must find some heuristic way of choosing. 
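.P If you know the approximate length of the matching substrings, you can use that to decide how much text to retain. The only lookbehind information that is currently available via the API is the length of the longest individual lookbehind in a pattern, but this can be misleading if there are nested lookbehinds. The value returned by calling \fBpcre2_pattern_info()\fP with the PCRE2_INFO_MAXLOOKBEHIND option is the maximum number of characters (not code units) that any individual lookbehind moves back when it is processed. A pattern such as "(?<=(?<!b)a)" has a maximum lookbehind value of one, but inspects two characters before its starting point. .P In a non-UTF or a 32-bit case, moving back is just a subtraction, but in UTF-8 or UTF-16 you have to count characters while moving back through the code units. . . .SH "PARTIAL MATCHING USING pcre2_dfa_match()" .rs .sp The DFA function moves along the subject string character by character, without backtracking, searching for all possible matches simultaneously. If the end of the subject is reached before the end of the pattern, there is the possibility of a partial match. .P When PCRE2_PARTIAL_SOFT is set, PCRE2_ERROR_PARTIAL is returned only if there have been no complete matches. Otherwise, the complete matches are returned. If PCRE2_PARTIAL_HARD is set, a partial match takes precedence over any complete matches. The portion of the string that was matched when the longest partial match was found is set as the first matching string. .P Because the DFA function always searches for all possible matches, and there is no difference between greedy and ungreedy repetition, its behaviour is different from that of \fBpcre2_match()\fP. Consider the string "dog" matched against this ungreedy pattern: .sp /dog(sbody)??/ .sp Whereas the standard function stops as soon as it finds the complete match for "dog", the DFA function also finds the partial match for "dogsbody", and so returns that when PCRE2_PARTIAL_HARD is set. . . .SH "MULTI-SEGMENT MATCHING WITH pcre2_dfa_match()" .rs .sp When a partial match has been found using the DFA matching function, it is possible to continue the match by providing additional subject data and calling the function again with the same compiled regular expression, this time setting the PCRE2_DFA_RESTART option. You must pass the same working space as before, because this is where details of the previous partial match are stored. You can set the PCRE2_PARTIAL_SOFT or PCRE2_PARTIAL_HARD options with PCRE2_DFA_RESTART to continue partial matching over multiple segments. Here is an example using \fBpcre2test\fP: .sp re> /^\ed?\ed(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\ed\ed$/ data> 23ja\e=dfa,ps Partial match: 23ja data> n05\e=dfa,dfa_restart 0: n05 .sp The first call has "23ja" as the subject, and requests partial matching; the second call has "n05" as the subject for the continued (restarted) match. Notice that when the match is complete, only the last part is shown; PCRE2 does not retain the previously partially-matched string. It is up to the calling program to do that if it needs to. This means that, for an unanchored pattern, if a continued match fails, it is not possible to try again at a new starting point. All this facility is capable of doing is continuing with the previous match attempt. For example, consider this pattern: .sp 1234|3789 .sp If the first part of the subject is "ABC123", a partial match of the first alternative is found at offset 3. There is no partial match for the second alternative, because such a match does not start at the same point in the subject string. Attempting to continue with the string "7890" does not yield a match because only those alternatives that match at one point in the subject are remembered. Depending on the application, this may or may not be what you want. .P If you do want to allow for starting again at the next character, one way of doing it is to retain some or all of the segment and try a new complete match, as described for \fBpcre2_match()\fP above. 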
Another possibility is to work with two buffers. If a partial match at offset \fIn\fP in the first buffer is followed by "no match" when PCRE2_DFA_RESTART is used on the second buffer, you can then try a new match starting at offset \fIn+1\fP in the first buffer. . . .SH AUTHOR .rs .sp .nf Philip Hazel Retired from University Computing Service Cambridge, England. .fi . . .SH REVISION .rs .sp .nf Last updated: 27 November 2024 Copyright (c) 1997-2019 University of Cambridge. .fi ================================================ FILE: doc/pcre2pattern.3 ================================================ .TH PCRE2PATTERN 3 "03 September 2025" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH "PCRE2 REGULAR EXPRESSION DETAILS" .rs .sp The syntax and semantics of the regular expressions that are supported by PCRE2 are described in detail below. There is a quick-reference syntax summary in the .\" HREF \fBpcre2syntax\fP .\" page. PCRE2 tries to match Perl syntax and semantics as closely as it can. PCRE2 also supports some alternative regular expression syntax that does not conflict with the Perl syntax in order to provide some compatibility with regular expressions in Python, .NET, and Oniguruma. There are in addition some options that enable alternative syntax and semantics that are not the same as in Perl. .P Perl's regular expressions are described in its own documentation, and regular expressions in general are covered in a number of books, some of which have copious examples. Jeffrey Friedl's "Mastering Regular Expressions", published by O'Reilly, covers regular expressions in great detail. This description of PCRE2's regular expressions is intended as reference material. .P This document discusses the regular expression patterns that are supported by PCRE2 when its main matching function, \fBpcre2_match()\fP, is used. 
PCRE2 also has an alternative matching function, \fBpcre2_dfa_match()\fP, which matches using a different algorithm that is not Perl-compatible. Some of the features discussed below are not available when DFA matching is used. The advantages and disadvantages of the alternative function, and how it differs from the normal function, are discussed in the .\" HREF \fBpcre2matching\fP .\" page. . . .SH "EBCDIC CHARACTER CODES" .rs .sp Most computers use ASCII or Unicode for encoding characters, and PCRE2 assumes this by default. However, it can be compiled to run in an environment that uses the EBCDIC code, which is the case for some IBM mainframe operating systems. In the sections below, character code values are ASCII or Unicode; in an EBCDIC environment these characters may have different code values, and there are no code points greater than 255. Differences in behaviour when PCRE2 is running in an EBCDIC environment are described in the section .\" HTML .\" "EBCDIC environments" .\" below, which you can ignore unless you really are in an EBCDIC environment. . . .SH "SPECIAL START-OF-PATTERN ITEMS" .rs .sp A number of options that can be passed to \fBpcre2_compile()\fP can also be set by special items at the start of a pattern. These are not Perl-compatible, but are provided to make these options accessible to pattern writers who are not able to change the program that processes the pattern. Any number of these items may appear, but they must all be together right at the start of the pattern string, and the letters must be in upper case. . . .SS "UTF support" .rs .sp In the 8-bit and 16-bit PCRE2 libraries, characters may be coded either as single code units, or as multiple UTF-8 or UTF-16 code units. UTF-32 can be specified for the 32-bit library, in which case it constrains the character values to valid Unicode code points. To process UTF strings, PCRE2 must be built to include Unicode support (which is the default). 
When using UTF strings you must either call the compiling function with one or both of the PCRE2_UTF or PCRE2_MATCH_INVALID_UTF options, or the pattern must start with the special sequence (*UTF), which is equivalent to setting the PCRE2_UTF option. How setting a UTF mode affects pattern matching is mentioned in several places below. There is also a summary of features in the .\" HREF \fBpcre2unicode\fP .\" page. .P Some applications that allow their users to supply patterns may wish to restrict them to non-UTF data for security reasons. If the PCRE2_NEVER_UTF option is passed to \fBpcre2_compile()\fP, (*UTF) is not allowed, and its appearance in a pattern causes an error. . . .SS "Unicode property support" .rs .sp Another special sequence that may appear at the start of a pattern is (*UCP). This has the same effect as setting the PCRE2_UCP option: it causes sequences such as \ed and \ew to use Unicode properties to determine character types, instead of recognizing only characters with codes less than 256 via a lookup table. It also causes upper/lower casing operations to use Unicode properties for characters with code points greater than 127, even when UTF is not set. These behaviours can be changed within the pattern; see the section entitled .\" HTML .\" "Internal Option Setting" .\" below. .P Some applications that allow their users to supply patterns may wish to restrict them for security reasons. If the PCRE2_NEVER_UCP option is passed to \fBpcre2_compile()\fP, (*UCP) is not allowed, and its appearance in a pattern causes an error. . . .SS "Locking out empty string matching" .rs .sp Starting a pattern with (*NOTEMPTY) or (*NOTEMPTY_ATSTART) has the same effect as passing the PCRE2_NOTEMPTY or PCRE2_NOTEMPTY_ATSTART option to whichever matching function is subsequently called to match the pattern. These options lock out the matching of empty strings, either entirely, or only at the start of the subject. . . 
.SS "Disabling auto-possessification" .rs .sp If a pattern starts with (*NO_AUTO_POSSESS), it has the same effect as setting the PCRE2_NO_AUTO_POSSESS option, or calling \fBpcre2_set_optimize()\fP with a PCRE2_AUTO_POSSESS_OFF directive. This stops PCRE2 from making quantifiers possessive when what follows cannot match the repeated item. For example, by default a+b is treated as a++b. For more details, see the .\" HREF \fBpcre2api\fP .\" documentation. . . .SS "Disabling start-up optimizations" .rs .sp If a pattern starts with (*NO_START_OPT), it has the same effect as setting the PCRE2_NO_START_OPTIMIZE option, or calling \fBpcre2_set_optimize()\fP with a PCRE2_START_OPTIMIZE_OFF directive. This disables several optimizations for quickly reaching "no match" results. For more details, see the .\" HREF \fBpcre2api\fP .\" documentation. . . .SS "Disabling automatic anchoring" .rs .sp If a pattern starts with (*NO_DOTSTAR_ANCHOR), it has the same effect as setting the PCRE2_NO_DOTSTAR_ANCHOR option, or calling \fBpcre2_set_optimize()\fP with a PCRE2_DOTSTAR_ANCHOR_OFF directive. This disables optimizations that apply to patterns whose top-level branches all start with .* (match any number of arbitrary characters). For more details, see the .\" HREF \fBpcre2api\fP .\" documentation. . . .SS "Disabling JIT compilation" .rs .sp If a pattern that starts with (*NO_JIT) is successfully compiled, an attempt by the application to apply the JIT optimization by calling \fBpcre2_jit_compile()\fP is ignored. . . .SS "Setting match resource limits" .rs .sp The \fBpcre2_match()\fP function contains a counter that is incremented every time it goes round its main loop. The caller of \fBpcre2_match()\fP can set a limit on this counter, which therefore limits the amount of computing resource used for a match. 
The maximum depth of nested backtracking can also be limited; this indirectly restricts the amount of heap memory that is used, but there is also an explicit memory limit that can be set. .P These facilities are provided to catch runaway matches that are provoked by patterns with huge matching trees. A common example is a pattern with nested unlimited repeats applied to a long string that does not match. When one of these limits is reached, \fBpcre2_match()\fP gives an error return. The limits can also be set by items at the start of the pattern of the form .sp (*LIMIT_HEAP=d) (*LIMIT_MATCH=d) (*LIMIT_DEPTH=d) .sp where d is any number of decimal digits. However, the value of the setting must be less than the value set (or defaulted) by the caller of \fBpcre2_match()\fP for it to have any effect. In other words, the pattern writer can lower the limits set by the programmer, but not raise them. If there is more than one setting of one of these limits, the lower value is used. The heap limit is specified in kibibytes (units of 1024 bytes). .P Prior to release 10.30, LIMIT_DEPTH was called LIMIT_RECURSION. This name is still recognized for backwards compatibility. .P The heap limit applies only when the \fBpcre2_match()\fP or \fBpcre2_dfa_match()\fP interpreters are used for matching. It does not apply to JIT. The match limit is used (but in a different way) when JIT is being used, or when \fBpcre2_dfa_match()\fP is called, to limit computing resource usage by those matching functions. The depth limit is ignored by JIT but is relevant for DFA matching, which uses function recursion for recursions within the pattern and for lookaround assertions and atomic groups. In this case, the depth limit controls the depth of such recursion. . . 
.\" HTML .SS "Newline conventions" .rs .sp PCRE2 supports six different conventions for indicating line breaks in strings: a single CR (carriage return) character, a single LF (linefeed) character, the two-character sequence CRLF, any of the three preceding, any Unicode newline sequence, or the NUL character (binary zero). The .\" HREF \fBpcre2api\fP .\" page has .\" HTML .\" further discussion .\" about newlines, and shows how to set the newline convention when calling \fBpcre2_compile()\fP. .P It is also possible to specify a newline convention by starting a pattern string with one of the following sequences: .sp (*CR) carriage return (*LF) linefeed (*CRLF) carriage return, followed by linefeed (*ANYCRLF) any of the three above (*ANY) all Unicode newline sequences (*NUL) the NUL character (binary zero) .sp These override the default and the options given to the compiling function. For example, on a Unix system where LF is the default newline sequence, the pattern .sp (*CR)a.b .sp changes the convention to CR. That pattern matches "a\enb" because LF is no longer a newline. If more than one of these settings is present, the last one is used. .P The newline convention affects where the circumflex and dollar assertions are true. It also affects the interpretation of the dot metacharacter when PCRE2_DOTALL is not set, and the behaviour of \eN when not followed by an opening brace. However, it does not affect what the \eR escape sequence matches. By default, this is any Unicode newline sequence, for Perl compatibility. However, this can be changed; see the next section and the description of \eR in the section entitled .\" HTML .\" "Newline sequences" .\" below. A change of \eR setting can be combined with a change of newline convention. . . .SS "Specifying what \eR matches" .rs .sp It is possible to restrict \eR to match only CR, LF, or CRLF (instead of the complete set of Unicode line endings) by setting the option PCRE2_BSR_ANYCRLF at compile time. 
This effect can also be achieved by starting a pattern with (*BSR_ANYCRLF). For completeness, (*BSR_UNICODE) is also recognized, corresponding to PCRE2_BSR_UNICODE. . . .SH "CHARACTERS AND METACHARACTERS" .rs .sp A regular expression is a pattern that is matched against a subject string from left to right. Most characters stand for themselves in a pattern, and match the corresponding characters in the subject. As a trivial example, the pattern .sp The quick brown fox .sp matches a portion of a subject string that is identical to itself. When caseless matching is specified (the PCRE2_CASELESS option or (?i) within the pattern), letters are matched independently of case. Note that there are two ASCII characters, K and S, that, in addition to their lower case ASCII equivalents, are case-equivalent with Unicode U+212A (Kelvin sign) and U+017F (long S) respectively when either PCRE2_UTF or PCRE2_UCP is set, unless the PCRE2_EXTRA_CASELESS_RESTRICT option is in force (either passed to \fBpcre2_compile()\fP or set by (*CASELESS_RESTRICT) or (?r) within the pattern). If the PCRE2_EXTRA_TURKISH_CASING option is in force (either passed to \fBpcre2_compile()\fP or set by (*TURKISH_CASING) within the pattern), then the 'i' letters are matched according to Turkish and Azeri languages. .P The power of regular expressions comes from the ability to include wild cards, character classes, alternatives, and repetitions in the pattern. These are encoded in the pattern by the use of \fImetacharacters\fP, which do not stand for themselves but instead are interpreted in some special way. .P There are two different sets of metacharacters: those that are recognized anywhere in the pattern except within square brackets, and those that are recognized within square brackets. Outside square brackets, the metacharacters are as follows: .sp \e general escape character with several uses ^ assert start of string (or line, in multiline mode) $ assert end of string (or line, in multiline mode) . 
match any character except newline (by default) [ start character class definition | start of alternative branch ( start group or control verb ) end group or control verb * 0 or more quantifier + 1 or more quantifier; also "possessive quantifier" ? 0 or 1 quantifier; also quantifier minimizer { potential start of min/max quantifier .sp Brace characters { and } are also used to enclose data for constructions such as \eg{2} or \ek{name}. In almost all uses of braces, space and/or horizontal tab characters that follow { or precede } are allowed and are ignored. In the case of quantifiers, they may also appear before or after the comma. The exception to this is \eu{...} which is an ECMAScript compatibility feature that is recognized only when the PCRE2_EXTRA_ALT_BSUX option is set. ECMAScript does not ignore such white space; it causes the item to be interpreted as literal. .P Part of a pattern that is in square brackets is called a "character class". In a character class the only metacharacters are: .sp \e general escape character ^ negate the class, but only if the first character - indicates character range [ POSIX character class (if followed by POSIX syntax) ] terminates the character class .sp If a pattern is compiled with the PCRE2_EXTENDED option, most white space in the pattern, other than in a character class, within a \eQ...\eE sequence, or between a # outside a character class and the next newline, inclusive, is ignored. An escaping backslash can be used to include a white space or a # character as part of the pattern. If the PCRE2_EXTENDED_MORE option is set, the same applies, but in addition unescaped space and horizontal tab characters are ignored inside a character class. Note: only these two characters are ignored, not the full set of pattern white space characters that are ignored outside a character class. Option settings can be changed within a pattern; see the section entitled .\" HTML .\" "Internal Option Setting" .\" below. 
.P The following sections describe the use of each of the metacharacters. . . .SH BACKSLASH .rs .sp The backslash character has several uses. Firstly, if it is followed by a character that is not a digit or a letter, it takes away any special meaning that character may have. This use of backslash as an escape character applies both inside and outside character classes. .P For example, if you want to match a * character, you must write \e* in the pattern. This escaping action applies whether or not the following character would otherwise be interpreted as a metacharacter, so it is always safe to precede a non-alphanumeric with backslash to specify that it stands for itself. In particular, if you want to match a backslash, you write \e\e. .P Only ASCII digits and letters have any special meaning after a backslash. All other characters (in particular, those whose code points are greater than 127) are treated as literals. .P If you want to treat all characters in a sequence as literals, you can do so by putting them between \eQ and \eE. Note that this includes white space even when the PCRE2_EXTENDED option is set so that most other white space is ignored. The behaviour is different from Perl in that $ and @ are handled as literals in \eQ...\eE sequences in PCRE2, whereas in Perl, $ and @ cause variable interpolation. Also, Perl does "double-quotish backslash interpolation" on any backslashes between \eQ and \eE which, its documentation says, "may lead to confusing results". PCRE2 treats a backslash between \eQ and \eE just like any other character. Note the following examples: .sp Pattern PCRE2 matches Perl matches .sp .\" JOIN \eQabc$xyz\eE abc$xyz abc followed by the contents of $xyz \eQabc\e$xyz\eE abc\e$xyz abc\e$xyz \eQabc\eE\e$\eQxyz\eE abc$xyz abc$xyz \eQA\eB\eE A\eB A\eB \eQ\e\eE \e \e\eE .sp The \eQ...\eE sequence is recognized both inside and outside character classes. An isolated \eE that is not preceded by \eQ is ignored. 
If \eQ is not followed by \eE later in the pattern, the literal interpretation continues to the end of the pattern (that is, \eE is assumed at the end). If the isolated \eQ is inside a character class, this causes an error, because the character class is then not terminated by a closing square bracket. .P Another difference from Perl is that any appearance of \eQ or \eE inside what might otherwise be a quantifier causes PCRE2 not to recognize the sequence as a quantifier. Perl recognizes a quantifier if (redundantly) either of the numbers is inside \eQ...\eE, but not if the separating comma is. When not recognized as a quantifier a sequence such as {\eQ1\eE,2} is treated as the literal string "{1,2}". . . .\" HTML .SS "Non-printing characters" .rs .sp A second use of backslash provides a way of encoding non-printing characters in patterns in a visible manner. There is no restriction on the appearance of non-printing characters in a pattern, but when a pattern is being prepared by text editing, it is often easier to use one of the following escape sequences instead of the binary character it represents. In an ASCII or Unicode environment, these escapes are as follows: .sp \ea alarm, that is, the BEL character (hex 07) \ecx "control-x", where x is a non-control ASCII character \ee escape (hex 1B) \ef form feed (hex 0C) \en linefeed (hex 0A) \er carriage return (hex 0D) (but see below) \et tab (hex 09) \e0dd character with octal code 0dd \eddd character with octal code ddd, or back reference \eo{ddd..} character with octal code ddd.. \exhh character with hex code hh \ex{hhh..} character with hex code hhh.. \eN{U+hhh..} character with Unicode hex code point hhh.. .sp A description of how back references work is given .\" HTML .\" later, .\" following the discussion of .\" HTML .\" parenthesized groups. .\" .P By default, after \ex that is not followed by {, one or two hexadecimal digits are read (letters can be in upper or lower case). 
If the character that follows \ex is neither { nor a hexadecimal digit, an error occurs. This is different from Perl's default behaviour, which generates a NUL character, but is in line with the behaviour of Perl's 'strict' mode in re. .P Any number of hexadecimal digits may appear between \ex{ and }. If a character other than a hexadecimal digit appears between \ex{ and }, or if there is no terminating }, an error occurs. .P Characters whose code points are less than 256 can be defined by either of the two syntaxes for \ex or by an octal sequence. There is no difference in the way they are handled. For example, \exdc is exactly the same as \ex{dc} or \e334. However, using the braced versions does make such sequences easier to read. .P Support is available for some ECMAScript (aka JavaScript) escape sequences via two compile-time options. If PCRE2_ALT_BSUX is set, the sequence \ex followed by { is not recognized. Only if \ex is followed by two hexadecimal digits is it recognized as a character escape. Otherwise it is interpreted as a literal "x" character. In this mode, support for code points greater than 255 is provided by \eu, which must be followed by four hexadecimal digits; otherwise it is interpreted as a literal "u" character. .P PCRE2_EXTRA_ALT_BSUX has the same effect as PCRE2_ALT_BSUX and, in addition, \eu{hhh..} is recognized as the character specified by hexadecimal code point. There may be any number of hexadecimal digits, but unlike other places that also use curly brackets, spaces are not allowed and would result in the string being interpreted as a literal. This syntax is from ECMAScript 6. .P The \eN{U+hhh..} escape sequence is recognized only when PCRE2 is operating in UTF mode. Perl also uses \eN{name} to specify characters by Unicode name; PCRE2 does not support this. Note that when \eN is not followed by an opening brace (curly bracket) it has an entirely different meaning, matching any character that is not a newline.
.P There are some legacy applications where the escape sequence \er is expected to match a newline. If the PCRE2_EXTRA_ESCAPED_CR_IS_LF option is set, \er in a pattern is converted to \en so that it matches a LF (linefeed) instead of a CR (carriage return) character. .P An error occurs if \ec is not followed by a character whose ASCII code point is in the range 32 to 126. The precise effect of \ecx is as follows: if x is a lower case letter, it is converted to upper case. Then bit 6 of the character (hex 40) is inverted. Thus \ecA to \ecZ become hex 01 to hex 1A (A is 41, Z is 5A), but \ec{ becomes hex 3B ({ is 7B), and \ec; becomes hex 7B (; is 3B). If the code unit following \ec has a code point less than 32 or greater than 126, a compile-time error occurs. .P For differences in the way some escapes behave in EBCDIC environments, see section .\" HTML .\" "EBCDIC environments" .\" below. . . .SS "Octal escapes and back references" .rs .sp The escape \eo must be followed by a sequence of octal digits, enclosed in braces. An error occurs if this is not the case. This escape provides a way of specifying character code points as octal numbers greater than 0777, and it also allows octal numbers and backreferences to be unambiguously distinguished. .P If braces are not used, after \e0 up to two further octal digits are read. However, if the PCRE2_EXTRA_NO_BS0 option is set, at least one more octal digit must follow \e0 (use \e00 to generate a NUL character). Make sure you supply two digits after the initial zero if the pattern character that follows is itself an octal digit. .P Inside a character class, when a backslash is followed by any octal digit, up to three octal digits are read to generate a code point. Any subsequent digits stand for themselves. The sequences \e8 and \e9 are treated as the literal characters "8" and "9". 
.P Outside a character class, Perl's handling of a backslash followed by a digit other than 0 is complicated by ambiguity, and Perl has changed over time, causing PCRE2 also to change. From PCRE2 release 10.45 there is an option called PCRE2_EXTRA_PYTHON_OCTAL that causes PCRE2 to use Python's unambiguous rules. The next two subsections describe the two sets of rules. .P For greater clarity and unambiguity, it is best to avoid following \e by a digit greater than zero. Instead, use \eo{...} or \ex{...} to specify numerical character code points, and \eg{...} to specify backreferences. . . .SS "Perl rules for non-class backslash 1-9" .rs .sp All the digits that follow the backslash are read as a decimal number. If the number is less than 10, begins with the digit 8 or 9, or if there are at least that many previous capture groups in the expression, the entire sequence is taken as a back reference. Otherwise, up to three octal digits are read to form a character code. For example: .sp \e040 is another way of writing an ASCII space .\" JOIN \e40 is the same, provided there are fewer than 40 previous capture groups \e7 is always a backreference .\" JOIN \e11 might be a backreference, or another way of writing a tab \e011 is always a tab \e0113 is a tab followed by the character "3" .\" JOIN \e113 might be a backreference, otherwise the character with octal code 113 .\" JOIN \e377 might be a backreference, otherwise the value 255 (decimal) \e81 is always a backreference .sp Note that octal values of 100 or greater that are specified using this syntax must not be introduced by a leading zero, because no more than three octal digits are ever read. . . .SS "Python rules for non-class backslash 1-9" .rs .sp If there are at least three octal digits after the backslash, exactly three are read as an octal code point number, but the value must be no greater than \e377, even in modes where higher code point values are supported. Any subsequent digits stand for themselves. 
If there are fewer than three octal digits, the sequence is taken as a decimal back reference. Thus, for example, \e12 is always a back reference, independent of how many captures there are in the pattern. An error is generated for a reference to a non-existent capturing group. . . .SS "Constraints on character values" .rs .sp Characters that are specified using octal or hexadecimal numbers are limited to certain values, as follows: .sp 8-bit non-UTF mode no greater than 0xff 16-bit non-UTF mode no greater than 0xffff 32-bit non-UTF mode no greater than 0xffffffff All UTF modes no greater than 0x10ffff and a valid code point .sp Invalid Unicode code points are all those in the range 0xd800 to 0xdfff (the so-called "surrogate" code points). The check for these can be disabled by the caller of \fBpcre2_compile()\fP by setting the option PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES. However, this is possible only in UTF-8 and UTF-32 modes, because these values are not representable in UTF-16. . . .SS "Escape sequences in character classes" .rs .sp All the sequences that define a single character value can be used both inside and outside character classes. In addition, inside a character class, \eb is interpreted as the backspace character (hex 08). .P When not followed by an opening brace, \eN is not allowed in a character class. \eB, \eR, and \eX are not special inside a character class. Like other unrecognized alphabetic escape sequences, they cause an error. Outside a character class, these sequences have different meanings. . . .SS "Unsupported escape sequences" .rs .sp In Perl, the sequences \eF, \el, \eL, \eu, and \eU are recognized by its string handler and used to modify the case of following characters. By default, PCRE2 does not support these escape sequences in patterns. However, if either of the PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX options is set, \eU matches a "U" character, and \eu can be used to define a character by code point, as described above. . . 
.SS "Absolute and relative backreferences" .rs .sp The sequence \eg followed by a signed or unsigned number, optionally enclosed in braces, is an absolute or relative backreference. A named backreference can be coded as \eg{name}. Backreferences are discussed .\" HTML .\" later, .\" following the discussion of .\" HTML .\" parenthesized groups. .\" . . .SS "Absolute and relative subroutine calls" .rs .sp For compatibility with Oniguruma, the non-Perl syntax \eg followed by a name or a number enclosed either in angle brackets or single quotes, is an alternative syntax for referencing a capture group as a subroutine. Details are discussed .\" HTML .\" later. .\" Note that \eg{...} (Perl syntax) and \eg<...> (Oniguruma syntax) are \fInot\fP synonymous. The former is a backreference; the latter is a .\" HTML .\" subroutine .\" call. . . .\" HTML .SS "Generic character types" .rs .sp Another use of backslash is for specifying generic character types: .sp \ed any decimal digit \eD any character that is not a decimal digit \eh any horizontal white space character \eH any character that is not a horizontal white space character \eN any character that is not a newline \es any white space character \eS any character that is not a white space character \ev any vertical white space character \eV any character that is not a vertical white space character \ew any "word" character \eW any "non-word" character .sp The \eN escape sequence has the same meaning as .\" HTML .\" the "." metacharacter .\" when PCRE2_DOTALL is not set, but setting PCRE2_DOTALL does not change the meaning of \eN. Note that when \eN is followed by an opening brace it has a different meaning. See the section entitled .\" HTML .\" "Non-printing characters" .\" above for details. Perl also uses \eN{name} to specify characters by Unicode name; PCRE2 does not support this. .P Each pair of lower and upper case escape sequences partitions the complete set of characters into two disjoint sets. 
Any given character matches one, and only one, of each pair. The sequences can appear both inside and outside character classes. They each match one character of the appropriate type. If the current matching point is at the end of the subject string, all of them fail, because there is no character to match. .P The default \es characters are HT (9), LF (10), VT (11), FF (12), CR (13), and space (32), which are defined as white space in the "C" locale. This list may vary if locale-specific matching is taking place. For example, in some locales the "non-breaking space" character (\exA0) is recognized as white space, and in others the VT character is not. .P A "word" character is an underscore or any character that is a letter or digit. By default, the definition of letters and digits is controlled by PCRE2's low-valued character tables, and may vary if locale-specific matching is taking place (see .\" HTML .\" "Locale support" .\" in the .\" HREF \fBpcre2api\fP .\" page). For example, in a French locale such as "fr_FR" in Unix-like systems, or "french" in Windows, some character codes greater than 127 are used for accented letters, and these are then matched by \ew. The use of locales with Unicode is discouraged. .P By default, characters whose code points are greater than 127 never match \ed, \es, or \ew, and always match \eD, \eS, and \eW, although this may be different for characters in the range 128-255 when locale-specific matching is happening. These escape sequences retain their original meanings from before Unicode support was available, mainly for efficiency reasons. 
If the PCRE2_UCP option is set, the behaviour is changed so that Unicode properties are used to determine character types, as follows: .sp \ed any character that matches \ep{Nd} (decimal digit) \es any character that matches \ep{Z} or \eh or \ev \ew any character that matches \ep{L}, \ep{N}, \ep{Mn}, or \ep{Pc} .sp The addition of \ep{Mn} (non-spacing mark) and the replacement of an explicit test for underscore with a test for \ep{Pc} (connector punctuation) happened in PCRE2 release 10.43. This brings PCRE2 into line with Perl. .P The upper case escapes match the inverse sets of characters. Note that \ed matches only decimal digits, whereas \ew matches any Unicode digit, as well as other character categories. Note also that PCRE2_UCP affects \eb and \eB because they are defined in terms of \ew and \eW. Matching these sequences is noticeably slower when PCRE2_UCP is set. .P The effect of PCRE2_UCP on any one of these escape sequences can be negated by the options PCRE2_EXTRA_ASCII_BSD, PCRE2_EXTRA_ASCII_BSS, and PCRE2_EXTRA_ASCII_BSW, respectively. These options can be set and reset within a pattern by means of an internal option setting .\" HTML .\" (see below). .\" .P The sequences \eh, \eH, \ev, and \eV, in contrast to the other sequences, which match only ASCII characters by default, always match a specific list of code points, whether or not PCRE2_UCP is set. 
The horizontal space characters are: .sp U+0009 Horizontal tab (HT) U+0020 Space U+00A0 Non-break space U+1680 Ogham space mark U+180E Mongolian vowel separator U+2000 En quad U+2001 Em quad U+2002 En space U+2003 Em space U+2004 Three-per-em space U+2005 Four-per-em space U+2006 Six-per-em space U+2007 Figure space U+2008 Punctuation space U+2009 Thin space U+200A Hair space U+202F Narrow no-break space U+205F Medium mathematical space U+3000 Ideographic space .sp The vertical space characters are: .sp U+000A Linefeed (LF) U+000B Vertical tab (VT) U+000C Form feed (FF) U+000D Carriage return (CR) U+0085 Next line (NEL) U+2028 Line separator U+2029 Paragraph separator .sp In 8-bit, non-UTF-8 mode, only the characters with code points less than 256 are relevant. . . .\" HTML .SS "Newline sequences" .rs .sp Outside a character class, by default, the escape sequence \eR matches any Unicode newline sequence. In 8-bit non-UTF-8 mode \eR is equivalent to the following: .sp (?>\er\en|\en|\ex0b|\ef|\er|\ex85) .sp This is an example of an "atomic group", details of which are given .\" HTML .\" below. .\" This particular group matches either the two-character sequence CR followed by LF, or one of the single characters LF (linefeed, U+000A), VT (vertical tab, U+000B), FF (form feed, U+000C), CR (carriage return, U+000D), or NEL (next line, U+0085). Because this is an atomic group, the two-character sequence is treated as a single unit that cannot be split. .P In other modes, two additional characters whose code points are greater than 255 are added: LS (line separator, U+2028) and PS (paragraph separator, U+2029). Unicode support is not needed for these characters to be recognized. .P It is possible to restrict \eR to match only CR, LF, or CRLF (instead of the complete set of Unicode line endings) by setting the option PCRE2_BSR_ANYCRLF at compile time. (BSR is an abbreviation for "backslash R".) 
This can be made the default when PCRE2 is built; if this is the case, the other behaviour can be requested via the PCRE2_BSR_UNICODE option. It is also possible to specify these settings by starting a pattern string with one of the following sequences: .sp (*BSR_ANYCRLF) CR, LF, or CRLF only (*BSR_UNICODE) any Unicode newline sequence .sp These override the default and the options given to the compiling function. Note that these special settings, which are not Perl-compatible, are recognized only at the very start of a pattern, and that they must be in upper case. If more than one of them is present, the last one is used. They can be combined with a change of newline convention; for example, a pattern can start with: .sp (*ANY)(*BSR_ANYCRLF) .sp They can also be combined with the (*UTF) or (*UCP) special sequences. Inside a character class, \eR is treated as an unrecognized escape sequence, and causes an error. . . .\" HTML .SS Unicode character properties .rs .sp When PCRE2 is built with Unicode support (the default), three additional escape sequences that match characters with specific properties are available. They can be used in any mode, though in 8-bit and 16-bit non-UTF modes these sequences are of course limited to testing characters whose code points are less than U+0100 or U+10000, respectively. In 32-bit non-UTF mode, code points greater than 0x10ffff (the Unicode limit) may be encountered. These are all treated as being in the Unknown script and with an unassigned type. .P Matching characters by Unicode property is not fast, because PCRE2 has to do a multistage table lookup in order to find a character's property. That is why the traditional escape sequences such as \ed and \ew do not use Unicode properties in PCRE2 by default, though you can make them do so by setting the PCRE2_UCP option or by starting the pattern with (*UCP). 
.P The extra escape sequences that provide property support are: .sp \ep{\fIxx\fP} a character with the \fIxx\fP property \eP{\fIxx\fP} a character without the \fIxx\fP property \eX a Unicode extended grapheme cluster .sp For compatibility with Perl, negation can be specified by including a circumflex between the opening brace and the property. For example, \ep{^Lu} is the same as \eP{Lu}. .P In accordance with Unicode's "loose matching" rules, ASCII white space characters, hyphens, and underscores are ignored in the properties represented by \fIxx\fP above. As well as the space character, ASCII white space can be tab, linefeed, vertical tab, formfeed, or carriage return. .P Some properties are specified as a name only; others as a name and a value, separated by a colon or an equals sign. The names and values consist of ASCII letters and digits (with one Perl-specific exception, see below). They are not case sensitive. Note, however, that the escapes themselves, \ep and \eP, \fIare\fP case sensitive. There are abbreviations for many names. The following examples are all equivalent: .sp \ep{bidiclass=al} \ep{BC=al} \ep{ Bidi_Class : AL } \ep{ Bi-di class = Al } \eP{ ^ Bi-di class = Al } .sp There is support for Unicode script names, Unicode general category properties, "Any", which matches any character (including newline), Bidi_Class, a number of binary (yes/no) properties, and some special PCRE2 properties (described .\" HTML .\" below). .\" Certain other Perl properties such as "InMusicalSymbols" are not supported by PCRE2. Note that \eP{Any} does not match any characters, so always causes a match failure. . . . .SS "Script properties for \ep and \eP" .rs .sp There are three different syntax forms for matching a script. Each Unicode character has a basic script and, optionally, a list of other scripts ("Script Extensions") with which it is commonly used. 
Using the Adlam script as an example, \ep{sc:Adlam} matches characters whose basic script is Adlam, whereas \ep{scx:Adlam} matches, in addition, characters that have Adlam in their extensions list. The full names "script" and "script extensions" for the property types are recognized and, as for all property specifications, an equals sign is an alternative to the colon. If a script name is given without a property type, for example, \ep{Adlam}, it is treated as \ep{scx:Adlam}. Perl changed to this interpretation at release 5.26 and PCRE2 changed at release 10.40. .P Unassigned characters (and in non-UTF 32-bit mode, characters with code points greater than 0x10FFFF) are assigned the "Unknown" script. Others that are not part of an identified script are lumped together as "Common". The current list of recognized script names and their 4-character abbreviations can be obtained by running this command: .sp pcre2test -LS .sp . . . .SS "The general category property for \ep and \eP" .rs .sp Each character has exactly one Unicode general category property, specified by a two-letter abbreviation. If only one letter is specified with \ep or \eP, it includes all the general category properties that start with that letter. 
In this case, in the absence of negation, the curly brackets in the escape sequence are optional; these two examples have the same effect: .sp \ep{L} \epL .sp The following general category property codes are supported: .sp C Other Cc Control Cf Format Cn Unassigned Co Private use Cs Surrogate .sp L Letter Lc Cased letter Ll Lower case letter Lm Modifier letter Lo Other letter Lt Title case letter Lu Upper case letter .sp M Mark Mc Spacing mark Me Enclosing mark Mn Non-spacing mark .sp N Number Nd Decimal number Nl Letter number No Other number .sp P Punctuation Pc Connector punctuation Pd Dash punctuation Pe Close punctuation Pf Final punctuation Pi Initial punctuation Po Other punctuation Ps Open punctuation .sp S Symbol Sc Currency symbol Sk Modifier symbol Sm Mathematical symbol So Other symbol .sp Z Separator Zl Line separator Zp Paragraph separator Zs Space separator .sp Perl originally used the name L& for the Lc property. This is still supported by Perl, but discouraged. PCRE2 also still supports it. This property matches any character that has the Lu, Ll, or Lt property, in other words, any letter that is not classified as a modifier or "other". From release 10.45 of PCRE2 the properties Lu, Ll, and Lt are all treated as Lc when case-independent matching is set by the PCRE2_CASELESS option or (?i) within the pattern. The other properties are not affected by caseless matching. .P The Cs (Surrogate) property applies only to characters whose code points are in the range U+D800 to U+DFFF. These characters are no different to any other character when PCRE2 is not in UTF mode (using the 16-bit or 32-bit library). However, they are not valid in Unicode strings and so cannot be tested by PCRE2 in UTF mode, unless UTF validity checking has been turned off (see the discussion of PCRE2_NO_UTF_CHECK in the .\" HREF \fBpcre2api\fP .\" page). 
.P The long synonyms for property names that Perl supports (such as \ep{Letter}) are not supported by PCRE2, nor is it permitted to prefix any of these properties with "Is". .P No character that is in the Unicode table has the Cn (unassigned) property. Instead, this property is assumed for any code point that is not in the Unicode table. . . .SS "Binary (yes/no) properties for \ep and \eP" .rs .sp Unicode defines a number of binary properties, that is, properties whose only values are true or false. You can obtain a list of those that are recognized by \ep and \eP, along with their abbreviations, by running this command: .sp pcre2test -LP .sp . . .SS "The Bidi_Class property for \ep and \eP" .rs .sp \ep{Bidi_Class:<class>} matches a character with the given class \ep{BC:<class>} matches a character with the given class .sp The recognized classes are: .sp AL Arabic letter AN Arabic number B paragraph separator BN boundary neutral CS common separator EN European number ES European separator ET European terminator FSI first strong isolate L left-to-right LRE left-to-right embedding LRI left-to-right isolate LRO left-to-right override NSM non-spacing mark ON other neutral PDF pop directional format PDI pop directional isolate R right-to-left RLE right-to-left embedding RLI right-to-left isolate RLO right-to-left override S segment separator WS white space .sp As in all property specifications, an equals sign may be used instead of a colon and the class names are case-insensitive. Only the short names listed above are recognized; PCRE2 does not at present support any long alternatives. . . .SS Extended grapheme clusters .rs .sp The \eX escape matches any number of Unicode characters that form an "extended grapheme cluster", and treats the sequence as an atomic group .\" HTML .\" (see below).
.\" Unicode supports various kinds of composite character by giving each character a grapheme breaking property, and having rules that use these properties to define the boundaries of extended grapheme clusters. The rules are defined in Unicode Standard Annex 29, "Unicode Text Segmentation". Unicode 11.0.0 abandoned the use of some previous properties that had been used for emojis. Instead it introduced various emoji-specific properties. PCRE2 uses only the Extended Pictographic property. .P \eX always matches at least one character. Then it decides whether to add additional characters according to the following rules for ending a cluster: .P 1. End at the end of the subject string. .P 2. Do not end between CR and LF; otherwise end after any control character. .P 3. Do not break Hangul (a Korean script) syllable sequences. Hangul characters are of five types: L, V, T, LV, and LVT. An L character may be followed by an L, V, LV, or LVT character; an LV or V character may be followed by a V or T character; an LVT or T character may be followed only by a T character. .P 4. Do not end before extending characters or spacing marks or the zero-width joiner (ZWJ) character. Characters with the "mark" property always have the "extend" grapheme breaking property. .P 5. Do not end after prepend characters. .P 6. Do not end within emoji modifier sequences or emoji ZWJ (zero-width joiner) sequences. An emoji ZWJ sequence consists of a character with the Extended_Pictographic property, optionally followed by one or more characters with the Extend property, followed by the ZWJ character, followed by another Extended_Pictographic character. .P 7. Do not break within emoji flag sequences. That is, do not break between regional indicator (RI) characters if there are an odd number of RI characters before the break point. .P 8. Otherwise, end the cluster. . . 
.\" HTML .SS PCRE2's additional properties .rs .sp As well as the standard Unicode properties described above, PCRE2 supports four more that make it possible to convert traditional escape sequences such as \ew and \es to use Unicode properties. PCRE2 uses these non-standard, non-Perl properties internally when PCRE2_UCP is set. However, they may also be used explicitly. These properties are: .sp Xan Any alphanumeric character Xps Any POSIX space character Xsp Any Perl space character Xwd Any Perl "word" character .sp Xan matches characters that have either the L (letter) or the N (number) property. Xps matches the characters tab, linefeed, vertical tab, form feed, or carriage return, and any other character that has the Z (separator) property (this includes the space character). Xsp is the same as Xps; in PCRE1 it used to exclude vertical tab, for Perl compatibility, but Perl changed. Xwd matches the same characters as Xan, plus those that match Mn (non-spacing mark) or Pc (connector punctuation, which includes underscore). .P There is another non-standard property, Xuc, which matches any character that can be represented by a Universal Character Name in C++ and other programming languages. These are the characters $, @, ` (grave accent), and all characters with Unicode code points greater than or equal to U+00A0, except for the surrogates U+D800 to U+DFFF. Note that most base (ASCII) characters are excluded. (Universal Character Names are of the form \euHHHH or \eUHHHHHHHH where H is a hexadecimal digit. Note that the Xuc property does not match these sequences but the characters that they represent.) . . .\" HTML .SS "Resetting the match start" .rs .sp In normal use, the escape sequence \eK causes any previously matched characters not to be included in the final matched sequence that is returned. For example, the pattern: .sp foo\eKbar .sp matches "foobar", but reports that it has matched "bar". \eK does not interact with anchoring in any way. 
The pattern: .sp ^foo\eKbar .sp matches only when the subject begins with "foobar" (in single line mode), though it again reports the matched string as "bar". This feature is similar to a lookbehind assertion .\" HTML .\" (described below), .\" but the part of the pattern that precedes \eK is not constrained to match a limited number of characters, as is required for a lookbehind assertion. The use of \eK does not interfere with the setting of .\" HTML .\" captured substrings. .\" For example, when the pattern .sp (foo)\eKbar .sp matches "foobar", the first substring is still set to "foo". .P From version 5.32.0 Perl forbids the use of \eK in lookaround assertions. From release 10.38 PCRE2 also forbids this by default. However, the PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK option can be used when calling \fBpcre2_compile()\fP to re-enable the previous behaviour. When this option is set, \eK is acted upon when it occurs inside positive assertions, but is ignored in negative assertions. Note that when a pattern such as (?=ab\eK) matches, the reported start of the match can be greater than the end of the match. Using \eK in a lookbehind assertion at the start of a pattern can also lead to odd effects. For example, consider this pattern: .sp (?<=\eKfoo)bar .sp If the subject is "foobar", a call to \fBpcre2_match()\fP with a starting offset of 3 succeeds and reports the matching string as "foobar", that is, the start of the reported match is earlier than where the match started. . . .\" HTML .SS "Simple assertions" .rs .sp The final use of backslash is for certain simple assertions. An assertion specifies a condition that has to be met at a particular point in a match, without consuming any characters from the subject string. The use of groups for more complicated assertions is described .\" HTML .\" below. 
.\" The backslashed assertions are: .sp \eb matches at a word boundary \eB matches when not at a word boundary \eA matches at the start of the subject \eZ matches at the end of the subject also matches before a newline at the end of the subject \ez matches only at the end of the subject \eG matches at the first matching position in the subject .sp Inside a character class, \eb has a different meaning; it matches the backspace character. If any other of these assertions appears in a character class, an "invalid escape sequence" error is generated. .P A word boundary is a position in the subject string where the current character and the previous character do not both match \ew or \eW (i.e. one matches \ew and the other matches \eW), or the start or end of the string if the first or last character matches \ew, respectively. When PCRE2 is built with Unicode support, the meanings of \ew and \eW can be changed by setting the PCRE2_UCP option. When this is done, it also affects \eb and \eB. Neither PCRE2 nor Perl has a separate "start of word" or "end of word" metasequence. However, whatever follows \eb normally determines which it is. For example, the fragment \eba matches "a" at the start of a word. .P The \eA, \eZ, and \ez assertions differ from the traditional circumflex and dollar (described in the next section) in that they only ever match at the very start and end of the subject string, whatever options are set. Thus, they are independent of multiline mode. These three assertions are not affected by the PCRE2_NOTBOL or PCRE2_NOTEOL options, which affect only the behaviour of the circumflex and dollar metacharacters. However, if the \fIstartoffset\fP argument of \fBpcre2_match()\fP is non-zero, indicating that matching is to start at a point other than the beginning of the subject, \eA can never match. The difference between \eZ and \ez is that \eZ matches before a newline at the end of the string as well as at the very end, whereas \ez matches only at the end. 
.P The \eG assertion is true only when the current matching position is at the start point of the matching process, as specified by the \fIstartoffset\fP argument of \fBpcre2_match()\fP. It differs from \eA when the value of \fIstartoffset\fP is non-zero. By calling \fBpcre2_match()\fP multiple times with appropriate arguments, you can mimic Perl's /g option, and it is in this kind of implementation where \eG can be useful. .P Note, however, that PCRE2's implementation of \eG, being true at the starting character of the matching process, is subtly different from Perl's, which defines it as true at the end of the previous match. In Perl, these can be different when the previously matched string was empty. Because PCRE2 does just one match at a time, it cannot reproduce this behaviour. .P If all the alternatives of a pattern begin with \eG, the expression is anchored to the starting match position, and the "anchored" flag is set in the compiled regular expression. . . .SH "CIRCUMFLEX AND DOLLAR" .rs .sp The circumflex and dollar metacharacters are zero-width assertions. That is, they test for a particular condition being true without consuming any characters from the subject string. These two metacharacters are concerned with matching the starts and ends of lines. If the newline convention is set so that only the two-character sequence CRLF is recognized as a newline, isolated CR and LF characters are treated as ordinary data characters, and are not recognized as newlines. .P Outside a character class, in the default matching mode, the circumflex character is an assertion that is true only if the current matching point is at the start of the subject string. If the \fIstartoffset\fP argument of \fBpcre2_match()\fP is non-zero, or if PCRE2_NOTBOL is set, circumflex can never match if the PCRE2_MULTILINE option is unset. Inside a character class, circumflex has an entirely different meaning .\" HTML .\" (see below). 
.\" .P Circumflex need not be the first character of the pattern if a number of alternatives are involved, but it should be the first thing in each alternative in which it appears if the pattern is ever to match that branch. If all possible alternatives start with a circumflex, that is, if the pattern is constrained to match only at the start of the subject, it is said to be an "anchored" pattern. (There are also other constructs that can cause a pattern to be anchored.) .P The dollar character is an assertion that is true only if the current matching point is at the end of the subject string, or immediately before a newline at the end of the string (by default), unless PCRE2_NOTEOL is set. Note, however, that it does not actually match the newline. Dollar need not be the last character of the pattern if a number of alternatives are involved, but it should be the last item in any branch in which it appears. Dollar has no special meaning in a character class. .P The meaning of dollar can be changed so that it matches only at the very end of the string, by setting the PCRE2_DOLLAR_ENDONLY option at compile time. This does not affect the \eZ assertion. .P The meanings of the circumflex and dollar metacharacters are changed if the PCRE2_MULTILINE option is set. When this is the case, a dollar character matches before any newlines in the string, as well as at the very end, and a circumflex matches immediately after internal newlines as well as at the start of the subject string. It does not match after a newline that ends the string, for compatibility with Perl. However, this can be changed by setting the PCRE2_ALT_CIRCUMFLEX option. .P For example, the pattern /^abc$/ matches the subject string "def\enabc" (where \en represents a newline) in multiline mode, but not otherwise. 
Consequently, patterns that are anchored in single line mode because all branches start with ^ are not anchored in multiline mode, and a match for circumflex is possible when the \fIstartoffset\fP argument of \fBpcre2_match()\fP is non-zero. The PCRE2_DOLLAR_ENDONLY option is ignored if PCRE2_MULTILINE is set. .P When the newline convention (see .\" HTML .\" "Newline conventions" .\" below) recognizes the two-character sequence CRLF as a newline, this is preferred, even if the single characters CR and LF are also recognized as newlines. For example, if the newline convention is "any", a multiline mode circumflex matches before "xyz" in the string "abc\er\enxyz" rather than after CR, even though CR on its own is a valid newline. (It also matches at the very start of the string, of course.) .P Note that the sequences \eA, \eZ, and \ez can be used to match the start and end of the subject in both modes, and if all branches of a pattern start with \eA it is always anchored, whether or not PCRE2_MULTILINE is set. . . .\" HTML .SH "FULL STOP (PERIOD, DOT) AND \eN" .rs .sp Outside a character class, a dot in the pattern matches any one character in the subject string except (by default) a character that signifies the end of a line. One or more characters may be specified as line terminators (see .\" HTML .\" "Newline conventions" .\" above). .P Dot never matches a single line-ending character. When the two-character sequence CRLF is the only line ending, dot does not match CR if it is immediately followed by LF, but otherwise it matches all characters (including isolated CRs and LFs). When ANYCRLF is selected for line endings, no occurrences of CR or LF match dot. When all Unicode line endings are being recognized, dot does not match CR or LF or any of the other line ending characters. .P The behaviour of dot with regard to newlines can be changed. If the PCRE2_DOTALL option is set, a dot matches any one character, without exception.
If the two-character sequence CRLF is present in the subject string, it takes two dots to match it. .P The handling of dot is entirely independent of the handling of circumflex and dollar, the only relationship being that they both involve newlines. Dot has no special meaning in a character class. .P The escape sequence \eN when not followed by an opening brace behaves like a dot, except that it is not affected by the PCRE2_DOTALL option. In other words, it matches any character except one that signifies the end of a line. .P When \eN is followed by an opening brace it has a different meaning. See the section entitled .\" HTML .\" "Non-printing characters" .\" above for details. Perl also uses \eN{name} to specify characters by Unicode name; PCRE2 does not support this. . . .SH "MATCHING A SINGLE CODE UNIT" .rs .sp Outside a character class, the escape sequence \eC matches any one code unit, whether or not a UTF mode is set. In the 8-bit library, one code unit is one byte; in the 16-bit library it is a 16-bit unit; in the 32-bit library it is a 32-bit unit. Unlike a dot, \eC always matches line-ending characters. The feature is provided in Perl in order to match individual bytes in UTF-8 mode, but it is unclear how it can usefully be used. .P Because \eC breaks up characters into individual code units, matching one unit with \eC in UTF-8 or UTF-16 mode means that the rest of the string may start with a malformed UTF character. This has undefined results, because PCRE2 assumes that it is matching character by character in a valid UTF string (by default it checks the subject string's validity at the start of processing unless the PCRE2_NO_UTF_CHECK or PCRE2_MATCH_INVALID_UTF option is used). .P An application can lock out the use of \eC by setting the PCRE2_NEVER_BACKSLASH_C option when compiling a pattern. It is also possible to build PCRE2 with the use of \eC permanently disabled. 
.P PCRE2 does not allow \eC to appear in lookbehind assertions .\" HTML .\" (described below) .\" in UTF-8 or UTF-16 modes, because this would make it impossible to calculate the length of the lookbehind. Neither the alternative matching function \fBpcre2_dfa_match()\fP nor the JIT optimizer supports \eC in these UTF modes. The former gives a match-time error; the latter fails to optimize and so the match is always run using the interpreter. .P In the 32-bit library, however, \eC is always supported (when not explicitly locked out) because it always matches a single code unit, whether or not UTF-32 is specified. .P In general, the \eC escape sequence is best avoided. However, one way of using it that avoids the problem of malformed UTF-8 or UTF-16 characters is to use a lookahead to check the length of the next character, as in this pattern, which could be used with a UTF-8 string (ignore white space and line breaks): .sp (?| (?=[\ex00-\ex7f])(\eC) | (?=[\ex80-\ex{7ff}])(\eC)(\eC) | (?=[\ex{800}-\ex{ffff}])(\eC)(\eC)(\eC) | (?=[\ex{10000}-\ex{1fffff}])(\eC)(\eC)(\eC)(\eC)) .sp In this example, a group that starts with (?| resets the capturing parentheses numbers in each alternative (see .\" HTML .\" "Duplicate Group Numbers" .\" below). The assertions at the start of each branch check the next UTF-8 character for values whose encoding uses 1, 2, 3, or 4 bytes, respectively. The character's individual bytes are then captured by the appropriate number of \eC groups. . . .\" HTML .SH "SQUARE BRACKETS AND CHARACTER CLASSES" .rs .sp An opening square bracket introduces a character class, terminated by a closing square bracket. A closing square bracket on its own is not special by default. If a closing square bracket is required as a member of the class, it should be the first data character in the class (after an initial circumflex, if present) or escaped with a backslash. This means that, by default, an empty class cannot be defined.
However, if the PCRE2_ALLOW_EMPTY_CLASS option is set, a closing square bracket at the start does end the (empty) class. .P A character class matches a single character in the subject. A matched character must be in the set of characters defined by the class, unless the first character in the class definition is a circumflex, in which case the subject character must not be in the set defined by the class. If a circumflex is actually required as a member of the class, ensure it is not the first character, or escape it with a backslash. .P For example, the character class [aeiou] matches any lower case English vowel, whereas [^aeiou] matches all other characters. Note that a circumflex is just a convenient notation for specifying the characters that are in the class by enumerating those that are not. A class that starts with a circumflex is not an assertion; it still consumes a character from the subject string, and therefore it fails to match if the current pointer is at the end of the string. .P Characters in a class may be specified by their code points using \eo, \ex, or \eN{U+hh..} in the usual way. When caseless matching is set, any letters in a class represent both their upper case and lower case versions, so for example, a caseless [aeiou] matches "A" as well as "a", and a caseless [^aeiou] does not match "A", whereas a caseful version would. Note that there are two ASCII characters, K and S, that, in addition to their lower case ASCII equivalents, are case-equivalent with Unicode U+212A (Kelvin sign) and U+017F (long S) respectively when either PCRE2_UTF or PCRE2_UCP is set. If you do not want these ASCII/non-ASCII case equivalences, you can suppress them by setting PCRE2_EXTRA_CASELESS_RESTRICT, either as an option in a compile context, or by including (*CASELESS_RESTRICT) or (?r) within a pattern. 
.P Characters that might indicate line breaks are never treated in any special way when matching character classes, whatever line-ending sequence is in use, and whatever setting of the PCRE2_DOTALL and PCRE2_MULTILINE options is used. A class such as [^a] always matches one of these characters. .P The generic character type escape sequences \ed, \eD, \eh, \eH, \ep, \eP, \es, \eS, \ev, \eV, \ew, and \eW may appear in a character class, and add the characters that they match to the class. For example, [\edABCDEF] matches any hexadecimal digit. In UTF modes, the PCRE2_UCP option affects the meanings of \ed, \es, \ew and their upper case partners, just as it does when they appear outside a character class, as described in the section entitled .\" HTML .\" "Generic character types" .\" above. The escape sequence \eb has a different meaning inside a character class; it matches the backspace character. The sequences \eB, \eR, and \eX are not special inside a character class. Like any other unrecognized escape sequences, they cause an error. The same is true for \eN when not followed by an opening brace. .P The minus (hyphen) character can be used to specify a range of characters in a character class. For example, [d-m] matches any letter between d and m, inclusive. If a minus character is required in a class, it must be escaped with a backslash or appear in a position where it cannot be interpreted as indicating a range, typically as the first or last character in the class, or immediately after a range. For example, [b-d-z] matches letters in the range b to d, a hyphen character, or z. .P There is some special treatment for alphabetic ranges in EBCDIC environments; see the section .\" HTML .\" "EBCDIC environments" .\" below. .P Perl treats a hyphen as a literal if it appears before or after a POSIX class (see below) or before or after a character type escape such as \ed or \eH. 
However, unless the hyphen is the last character in the class, Perl outputs a warning in its warning mode, as this is most likely a user error. As PCRE2 has no facility for warning, an error is given in these cases. .P It is not possible to have the literal character "]" as the end character of a range. A pattern such as [W-]46] is interpreted as a class of two characters ("W" and "-") followed by a literal string "46]", so it would match "W46]" or "-46]". However, if the "]" is escaped with a backslash it is interpreted as the end of a range, so [W-\e]46] is interpreted as a class containing a range and two other characters. The octal or hexadecimal representation of "]" can also be used to end a range. .P Ranges normally include all code points between the start and end characters, inclusive. They can also be used for code points specified numerically, for example [\e000-\e037]. Ranges can include any characters that are valid for the current mode. In any UTF mode, the so-called "surrogate" characters (those whose code points lie between 0xd800 and 0xdfff inclusive) may not be specified explicitly by default (the PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES option disables this check). However, ranges such as [\ex{d7ff}-\ex{e000}], which include the surrogates, are always permitted. .P If a range that includes letters is used when caseless matching is set, it matches the letters in either case. For example, [W-c] is equivalent to [][\e\e^_`wxyzabc], matched caselessly, and in a non-UTF mode, if character tables for a French locale are in use, [\exc8-\excb] matches accented E characters in both cases. .P A circumflex can conveniently be used with the upper case character types to specify a more restricted set of characters than the matching lower case type. For example, the class [^\eW_] matches any letter or digit, but not underscore, whereas [\ew] includes underscore. A positive character class should be read as "something OR something OR ..." 
and a negative class as "NOT something AND NOT something AND NOT ...". .P The metacharacters that are recognized in character classes are backslash, hyphen (when it can be interpreted as specifying a range), circumflex (only at the start), and the terminating closing square bracket. An opening square bracket is also special when it can be interpreted as introducing a POSIX class (see .\" HTML .\" "Posix character classes" .\" below), or a special compatibility feature (see .\" HTML .\" "Compatibility feature for word boundaries" .\" below). Escaping any non-alphanumeric character in a class turns it into a literal, whether or not it would otherwise be a metacharacter. . . .SH "PERL EXTENDED CHARACTER CLASSES" .rs .sp From release 10.45 PCRE2 supports Perl's (?[...]) extended character class syntax. This can be used to perform set operations such as intersection on character classes. .P The syntax permitted within (?[...]) is quite different to ordinary character classes. Inside the extended class, there is an expression syntax consisting of "atoms", operators, and ordinary parentheses "()" used for grouping. Such classes always have the Perl /xx modifier (PCRE2 option PCRE2_EXTENDED_MORE) turned on within them. This means that literal space and tab characters are ignored everywhere in the class. .P The allowed atoms are individual characters specified by escape sequences such as \en or \ex{123}, character types such as \ed, POSIX classes such as [:alpha:], and nested ordinary (non-extended) character classes. For example, in (?[\ed & [...]]) the nested class [...] follows the usual rules for ordinary character classes, in which parentheses are not metacharacters, and character literals and ranges are permitted. .P Character literals and ranges may not appear outside a nested ordinary character class because they are not atoms in the extended syntax.
The extended syntax does not introduce any additional escape sequences, so (?[\ey]) is an unknown escape, as it would be in [\ey]. .P In the extended syntax, ^ does not negate a class (except within an ordinary class nested inside an extended class); it is instead a binary operator. .P The binary operators are "&" (intersection), "|" or "+" (union), "-" (subtraction) and "^" (symmetric difference). These are left-associative and "&" has higher (tighter) precedence, while the others have equal lower precedence. The one prefix unary operator is "!" (complement), with highest precedence. . . .SH "UTS#18 EXTENDED CHARACTER CLASSES" .rs .sp The PCRE2_ALT_EXTENDED_CLASS option enables an alternative to Perl's (?[...]) syntax, allowing instead extended class behaviour inside ordinary [...] character classes. This altered syntax for [...] classes is loosely described by the Unicode standard UTS#18. The PCRE2_ALT_EXTENDED_CLASS option does not prevent use of (?[...]) classes; it just changes the meaning of all [...] classes that are not nested inside a Perl (?[...]) class. .P Firstly, in ordinary Perl [...] syntax, an expression such as "[a[]" is a character class with two literal characters "a" and "[", but in UTS#18 extended classes the "[" character becomes an additional metacharacter within classes, denoting the start of a nested class, so a literal "[" must be escaped as "\e[". .P Secondly, within the UTS#18 extended syntax, there are operators "||", "&&", "--" and "~~" which denote character class union, intersection, subtraction, and symmetric difference respectively. In standard Perl syntax, these would simply be needlessly-repeated literals (except for "--" which could be the start or end of a range). In UTS#18 extended classes these operators can be used in constructs such as [\ep{L}--[QW]] for "Unicode letters, other than Q and W". 
A literal "-" at the start or end of a range must be escaped, so while "[--1]" in Perl syntax is the range from hyphen to "1", it must be escaped as "[\e--1]" in UTS#18 extended classes. .P Unlike Perl's (?[...]) extended classes, the PCRE2_EXTENDED_MORE option to ignore space and tab characters is not automatically enabled for UTS#18 extended classes, but it is honoured if set. .P Extended UTS#18 classes can be nested, and nested classes are themselves extended classes (unlike Perl, where nested classes must be simple classes). For example, [\ep{L}&&[\ep{Thai}||\ep{Greek}]] matches any letter that is in the Thai or Greek scripts. Note that this means that no special grouping characters (such as the parentheses used in Perl's (?[...]) class syntax) are needed. .P Individual class items (literal characters, literal ranges, properties such as \ed or \ep{...}, and nested classes) can be combined by juxtaposition or by an operator. Juxtaposition is the implicit union operator, and binds more tightly than any explicit operator. Thus a sequence of literals and/or ranges behaves as if it is enclosed in square brackets. For example, [A-Z0-9&&[^E8]] is the same as [[A-Z0-9]&&[^E8]], which matches any upper case alphanumeric character except "E" or "8". .P Precedence between the explicit operators is not defined, so mixing operators is a syntax error. For example, [A&&B--C] is an error, but [A&&[B--C]] is valid. .P This is an emerging syntax which is being adopted gradually across the regex ecosystem: for example JavaScript adopted the "/v" flag in ECMAScript 2024; Python's "re" module reserves the syntax for future use with a FutureWarning for unescaped use of "[" as a literal within character classes. Due to UTS#18 providing insufficient guidance, engines interpret the syntax differently. 
Rust's "regex" crate and Python's "regex" PyPI module both implement UTS#18 extended classes, but with slight incompatibilities ([A||B&&C] is parsed as [A||[B&&C]] in Python's "regex" but as [[A||B]&&C] in Rust's "regex"). .P PCRE2's syntax adds syntax restrictions similar to ECMAScript's /v flag, so that all the UTS#18 extended classes accepted as valid by PCRE2 have the property that they are interpreted either with the same behaviour, or as invalid, by all other major engines. Please file an issue if you are aware of cross-engine differences in behaviour between PCRE2 and another major engine. . . .\" HTML .SH "POSIX CHARACTER CLASSES" .rs .sp Perl supports the POSIX notation for character classes. This uses names enclosed by [: and :] within the enclosing square brackets. PCRE2 also supports this notation, in both ordinary and extended classes. For example, .sp [01[:alpha:]%] .sp matches "0", "1", any alphabetic character, or "%". The supported class names are: .sp alnum letters and digits alpha letters ascii character codes 0 - 127 blank space or tab only cntrl control characters digit decimal digits (same as \ed) graph printing characters, excluding space lower lower case letters print printing characters, including space punct printing characters, excluding letters and digits and space space white space (the same as \es from PCRE2 8.34) upper upper case letters word "word" characters (same as \ew) xdigit hexadecimal digits .sp The default "space" characters are HT (9), LF (10), VT (11), FF (12), CR (13), and space (32). If locale-specific matching is taking place, the list of space characters may be different; there may be fewer or more of them. "Space" and \es match the same set of characters, as do "word" and \ew. .P The name "word" is a Perl extension, and "blank" is a GNU extension from Perl 5.8. Another Perl extension is negation, which is indicated by a ^ character after the colon. For example, .sp [12[:^digit:]] .sp matches "1", "2", or any non-digit.
PCRE2 (and Perl) also recognize the POSIX syntax [.ch.] and [=ch=] where "ch" is a "collating element", but these are not supported, and an error is given if they are encountered. .P By default, characters with values greater than 127 do not match any of the POSIX character classes, although this may be different for characters in the range 128-255 when locale-specific matching is happening. However, in UCP mode, unless certain options are set (see below), some of the classes are changed so that Unicode character properties are used. This is achieved by replacing POSIX classes with other sequences, as follows: .sp [:alnum:] becomes \ep{Xan} [:alpha:] becomes \ep{L} [:blank:] becomes \eh [:cntrl:] becomes \ep{Cc} [:digit:] becomes \ep{Nd} [:lower:] becomes \ep{Ll} [:space:] becomes \ep{Xps} [:upper:] becomes \ep{Lu} [:word:] becomes \ep{Xwd} .sp Negated versions, such as [:^alpha:] use \eP instead of \ep. Four other POSIX classes are handled specially in UCP mode: .TP 10 [:graph:] This matches characters that have glyphs that mark the page when printed. In Unicode property terms, it matches all characters with the L, M, N, P, S, or Cf properties, except for: .sp U+061C Arabic Letter Mark U+180E Mongolian Vowel Separator U+2066 - U+2069 Various "isolate"s .sp .TP 10 [:print:] This matches the same characters as [:graph:] plus space characters that are not controls, that is, characters with the Zs property. .TP 10 [:punct:] This matches all characters that have the Unicode P (punctuation) property, plus those characters with code points less than 256 that have the S (Symbol) property. .TP 10 [:xdigit:] In addition to the ASCII hexadecimal digits, this also matches the "fullwidth" versions of those characters, whose Unicode code points start at U+FF10. This is a change that was made in PCRE2 release 10.43 for Perl compatibility. .P The other POSIX classes are unchanged by PCRE2_UCP, and match only characters with code points less than 256. 
.P There are two options that can be used to restrict the POSIX classes to ASCII characters when PCRE2_UCP is set. The option PCRE2_EXTRA_ASCII_DIGIT affects just [:digit:] and [:xdigit:]. Within a pattern, this can be set and unset by (?aT) and (?-aT). The PCRE2_EXTRA_ASCII_POSIX option disables UCP processing for all POSIX classes, including [:digit:] and [:xdigit:]. Within a pattern, (?aP) and (?-aP) set and unset both these options for consistency. . . .\" HTML .SH "COMPATIBILITY FEATURE FOR WORD BOUNDARIES" .rs .sp In the POSIX.2 compliant library that was included in 4.4BSD Unix, the ugly syntax [[:<:]] and [[:>:]] is used for matching "start of word" and "end of word". PCRE2 treats these items as follows: .sp [[:<:]] is converted to \eb(?=\ew) [[:>:]] is converted to \eb(?<=\ew) .sp Only these exact character sequences are recognized. A sequence such as [a[:<:]b] provokes an error for an unrecognized POSIX class name. This support is not compatible with Perl. It is provided to help migrations from other environments, and is best not used in any new patterns. Note that \eb matches at the start and the end of a word (see .\" HTML .\" "Simple assertions" .\" above), and in a Perl-style pattern the preceding or following character normally shows which is wanted, without the need for the assertions that are used above in order to give exactly the POSIX behaviour. Note also that the PCRE2_UCP option changes the meaning of \ew (and therefore \eb) by default, so it also affects these POSIX sequences. . . .SH "VERTICAL BAR" .rs .sp Vertical bar characters are used to separate alternative patterns. For example, the pattern .sp gilbert|sullivan .sp matches either "gilbert" or "sullivan". Any number of alternatives may appear, and an empty alternative is permitted (matching the empty string). The matching process tries each alternative in turn, from left to right, and the first one that succeeds is used.
If the alternatives are within a group .\" HTML .\" (defined below), .\" "succeeds" means matching the rest of the main pattern as well as the alternative in the group. . . .\" HTML .SH "INTERNAL OPTION SETTING" .rs .sp The settings of several options can be changed within a pattern by a sequence of letters enclosed between "(?" and ")". The following are Perl-compatible, and are described in detail in the .\" HREF \fBpcre2api\fP .\" documentation. The option letters are: .sp i for PCRE2_CASELESS m for PCRE2_MULTILINE n for PCRE2_NO_AUTO_CAPTURE s for PCRE2_DOTALL x for PCRE2_EXTENDED xx for PCRE2_EXTENDED_MORE .sp For example, (?im) sets caseless, multiline matching. It is also possible to unset these options by preceding the relevant letters with a hyphen, for example (?-im). The two "extended" options are not independent; unsetting either one cancels the effects of both of them. .P A combined setting and unsetting such as (?im-sx), which sets PCRE2_CASELESS and PCRE2_MULTILINE while unsetting PCRE2_DOTALL and PCRE2_EXTENDED, is also permitted. Only one hyphen may appear in the options string. If a letter appears both before and after the hyphen, the option is unset. An empty options setting "(?)" is allowed. Needless to say, it has no effect. .P If the first character following (? is a circumflex, it causes all of the above options to be unset. Letters may follow the circumflex to cause some options to be re-instated, but a hyphen may not appear. .P Some PCRE2-specific options can be changed by the same mechanism using these pairs or individual letters: .sp aD for PCRE2_EXTRA_ASCII_BSD aS for PCRE2_EXTRA_ASCII_BSS aW for PCRE2_EXTRA_ASCII_BSW aP for PCRE2_EXTRA_ASCII_POSIX and PCRE2_EXTRA_ASCII_DIGIT aT for PCRE2_EXTRA_ASCII_DIGIT r for PCRE2_EXTRA_CASELESS_RESTRICT J for PCRE2_DUPNAMES U for PCRE2_UNGREEDY .sp However, except for 'r', these are not unset by (?^), which is equivalent to (?-imnrsx). 
If 'a' is not followed by any of the upper case letters shown above, it sets (or unsets) all the ASCII options. .P PCRE2_EXTRA_ASCII_DIGIT has no additional effect when PCRE2_EXTRA_ASCII_POSIX is set, but including it in (?aP) means that (?-aP) suppresses all ASCII restrictions for POSIX classes. .P When one of these option changes occurs at top level (that is, not inside group parentheses), the change applies until a subsequent change, or the end of the pattern. An option change within a group (see below for a description of groups) affects only that part of the group that follows it. At the end of the group these options are reset to the state they were before the group. For example, .sp (a(?i)b)c .sp matches abc and aBc and no other strings (assuming PCRE2_CASELESS is not set externally). Any changes made in one alternative do carry on into subsequent branches within the same group. For example, .sp (a(?i)b|c) .sp matches "ab", "aB", "c", and "C", even though when matching "C" the first branch is abandoned before the option setting. This is because the effects of option settings happen at compile time. There would be some very weird behaviour otherwise. .P As a convenient shorthand, if any option settings are required at the start of a non-capturing group (see the next section), the option letters may appear between the "?" and the ":". Thus the two patterns .sp (?i:saturday|sunday) (?:(?i)saturday|sunday) .sp match exactly the same set of strings. .P \fBNote:\fP There are other PCRE2-specific options, applying to the whole pattern, which can be set by the application when the compiling function is called. In addition, the pattern can contain special leading sequences such as (*CRLF) to override what the application has set or what has been defaulted. Details are given in the section entitled .\" HTML .\" "Newline sequences" .\" above. 
There are also the (*UTF) and (*UCP) leading sequences that can be used to set UTF and Unicode property modes; they are equivalent to setting the PCRE2_UTF and PCRE2_UCP options, respectively. However, the application can set the PCRE2_NEVER_UTF or PCRE2_NEVER_UCP options, which lock out the use of the (*UTF) and (*UCP) sequences. . . .\" HTML .SH GROUPS .rs .sp Groups are delimited by parentheses (round brackets), which can be nested. Turning part of a pattern into a group does two things: .sp 1. It localizes a set of alternatives. For example, the pattern .sp cat(aract|erpillar|) .sp matches "cataract", "caterpillar", or "cat". Without the parentheses, it would match "cataract", "erpillar" or an empty string. .sp 2. It creates a "capture group". This means that, when the whole pattern matches, the portion of the subject string that matched the group is passed back to the caller, separately from the portion that matched the whole pattern. (This applies only to the traditional matching function; the DFA matching function does not support capturing.) .P Opening parentheses are counted from left to right (starting from 1) to obtain numbers for capture groups. For example, if the string "the red king" is matched against the pattern .sp the ((red|white) (king|queen)) .sp the captured substrings are "red king", "red", and "king", and are numbered 1, 2, and 3, respectively. .P The fact that plain parentheses fulfil two functions is not always helpful. There are often times when grouping is required without capturing. If an opening parenthesis is followed by a question mark and a colon, the group does not do any capturing, and is not counted when computing the number of any subsequent capture groups. For example, if the string "the white queen" is matched against the pattern .sp the ((?:red|white) (king|queen)) .sp the captured substrings are "white queen" and "queen", and are numbered 1 and 2. The maximum number of capture groups is 65535. 
.P As a convenient shorthand, if any option settings are required at the start of a non-capturing group, the option letters may appear between the "?" and the ":". Thus the two patterns .sp (?i:saturday|sunday) (?:(?i)saturday|sunday) .sp match exactly the same set of strings. Because alternative branches are tried from left to right, and options are not reset until the end of the group is reached, an option setting in one branch does affect subsequent branches, so the above patterns match "SUNDAY" as well as "Saturday". . . .\" HTML .SH "DUPLICATE GROUP NUMBERS" .rs .sp Perl 5.10 introduced a feature whereby each alternative in a group uses the same numbers for its capturing parentheses. Such a group starts with (?| and is itself a non-capturing group. For example, consider this pattern: .sp (?|(Sat)ur|(Sun))day .sp Because the two alternatives are inside a (?| group, both sets of capturing parentheses are numbered one. Thus, when the pattern matches, you can look at captured substring number one, whichever alternative matched. This construct is useful when you want to capture part, but not all, of one of a number of alternatives. Inside a (?| group, parentheses are numbered as usual, but the number is reset at the start of each branch. The numbers of any capturing parentheses that follow the whole group start after the highest number used in any branch. The following example is taken from the Perl documentation. The numbers underneath show in which buffer the captured content will be stored. .sp # before ---------------branch-reset----------- after / ( a ) (?| x ( y ) z | (p (q) r) | (t) u (v) ) ( z ) /x # 1 2 2 3 2 3 4 .sp A backreference to a capture group uses the most recent value that is set for the group. The following pattern matches "abcabc" or "defdef": .sp /(?|(abc)|(def))\e1/ .sp In contrast, a subroutine call to a capture group always refers to the first one in the pattern with the given number. 
The following pattern matches "abcabc" or "defabc": .sp /(?|(abc)|(def))(?1)/ .sp A relative reference such as (?-1) is no different: it is just a convenient way of computing an absolute group number. .P If a .\" HTML .\" condition test .\" for a group's having matched refers to a non-unique number, the test is true if any group with that number has matched. .P An alternative approach to using this "branch reset" feature is to use duplicate named groups, as described in the next section. . . .SH "NAMED CAPTURE GROUPS" .rs .sp Identifying capture groups by number is simple, but it can be very hard to keep track of the numbers in complicated patterns. Furthermore, if an expression is modified, the numbers may change. To help with this difficulty, PCRE2 supports the naming of capture groups. This feature was not added to Perl until release 5.10. Python had the feature earlier, and PCRE1 introduced it at release 4.0, using the Python syntax. PCRE2 supports both the Perl and the Python syntax. .P In PCRE2, a capture group can be named in one of three ways: (?<name>...) or (?'name'...) as in Perl, or (?P<name>...) as in Python. Names may be up to .\" DEFINE MAX_NAME_SIZE 128 code units long. When PCRE2_UTF is not set, they may contain only ASCII alphanumeric characters and underscores, but must start with a non-digit. When PCRE2_UTF is set, the syntax of group names is extended to allow any Unicode letter or Unicode decimal digit. In other words, group names must match one of these patterns: .sp ^[_A-Za-z][_A-Za-z0-9]*\ez when PCRE2_UTF is not set ^[_\ep{L}][_\ep{L}\ep{Nd}]*\ez when PCRE2_UTF is set .sp References to capture groups from other parts of the pattern, such as .\" HTML .\" backreferences, .\" .\" HTML .\" recursion, .\" and .\" HTML .\" conditions, .\" can all be made by name as well as by number. .P Named capture groups are allocated numbers as well as names, exactly as if the names were not present.
In both PCRE2 and Perl, capture groups are primarily identified by numbers; any names are just aliases for these numbers. The PCRE2 API provides function calls for extracting the complete name-to-number translation table from a compiled pattern, as well as convenience functions for extracting captured substrings by name. .P \fBWarning:\fP When more than one capture group has the same number, as described in the previous section, a name given to one of them applies to all of them. Perl allows identically numbered groups to have different names. Consider this pattern, where there are two capture groups, both numbered 1: .sp (?|(?<AA>aa)|(?<BB>bb)) .sp Perl allows this, with both names AA and BB as aliases of group 1. Thus, after a successful match, both names yield the same value (either "aa" or "bb"). .P In an attempt to reduce confusion, PCRE2 does not allow the same group number to be associated with more than one name. The example above provokes a compile-time error. However, there is still scope for confusion. Consider this pattern: .sp (?|(?<AA>aa)|(bb)) .sp Although the second group number 1 is not explicitly named, the name AA is still an alias for any group 1. Whether the pattern matches "aa" or "bb", a reference by name to group AA yields the matched string. .P By default, a name must be unique within a pattern, except that duplicate names are permitted for groups with the same number, for example: .sp (?|(?<AA>aa)|(?<AA>bb)) .sp The duplicate name constraint can be disabled by setting the PCRE2_DUPNAMES option at compile time, or by the use of (?J) within the pattern, as described in the section entitled .\" HTML .\" "Internal Option Setting" .\" above. .P Duplicate names can be useful for patterns where only one instance of the named capture group can match. Suppose you want to match the name of a weekday, either as a 3-letter abbreviation or as the full name, and in both cases you want to extract the abbreviation.
This pattern (ignoring the line breaks) does the job: .sp (?J) (?<DN>Mon|Fri|Sun)(?:day)?| (?<DN>Tue)(?:sday)?| (?<DN>Wed)(?:nesday)?| (?<DN>Thu)(?:rsday)?| (?<DN>Sat)(?:urday)? .sp There are five capture groups, but only one is ever set after a match. The convenience functions for extracting the data by name return the substring for the first (and in this example, the only) group of that name that matched. This saves searching to find which numbered group it was. (An alternative way of solving this problem is to use a "branch reset" group, as described in the previous section.) .P If you make a backreference to a non-unique named group from elsewhere in the pattern, the groups to which the name refers are checked in the order in which they appear in the overall pattern. The first one that is set is used for the reference. For example, this pattern matches both "foofoo" and "barbar" but not "foobar" or "barfoo": .sp (?J)(?:(?<n>foo)|(?<n>bar))\ek<n> .sp .P If you make a subroutine call to a non-unique named group, the one that corresponds to the first occurrence of the name is used. In the absence of duplicate numbers this is the one with the lowest number. .P If you use a named reference in a condition test (see the .\" .\" HTML .\" section about conditions .\" below), either to check whether a capture group has matched, or to check for recursion, all groups with the same name are tested. If the condition is true for any one of them, the overall condition is true. This is the same behaviour as testing by number. For further details of the interfaces for handling named capture groups, see the .\" HREF \fBpcre2api\fP .\" documentation. . . 
.SH REPETITION .rs .sp Repetition is specified by quantifiers, which may follow any one of these items: .sp a literal data character the dot metacharacter the \eC escape sequence the \eR escape sequence the \eX escape sequence any escape sequence that matches a single character a character class a backreference a parenthesized group (including lookaround assertions) a subroutine call (recursive or otherwise) .sp If a quantifier does not follow a repeatable item, an error occurs. The general repetition quantifier specifies a minimum and maximum number of permitted matches by giving two numbers in curly brackets (braces), separated by a comma. The numbers must be less than 65536, and the first must be less than or equal to the second. For example, .sp z{2,4} .sp matches "zz", "zzz", or "zzzz". A closing brace on its own is not a special character. If the second number is omitted, but the comma is present, there is no upper limit; if the second number and the comma are both omitted, the quantifier specifies an exact number of required matches. Thus .sp [aeiou]{3,} .sp matches at least 3 successive vowels, but may match many more, whereas .sp \ed{8} .sp matches exactly 8 digits. If the first number is omitted, the lower limit is taken as zero; in this case the upper limit must be present. .sp X{,4} is interpreted as X{0,4} .sp This is a change in behaviour that happened in Perl 5.34.0 and PCRE2 10.43. In earlier versions such a sequence was not interpreted as a quantifier. Other regular expression engines may behave either way. .P If the characters that follow an opening brace do not match the syntax of a quantifier, the brace is taken as a literal character. In particular, this means that {,} is a literal string of three characters. .P Note that not every opening brace is potentially the start of a quantifier because braces are used in other items such as \eN{U+345} or \ek{name}. .P In UTF modes, quantifiers apply to characters rather than to individual code units. 
Thus, for example, \ex{100}{2} matches two characters, each of which is represented by a two-byte sequence in a UTF-8 string. Similarly, \eX{3} matches three Unicode extended grapheme clusters, each of which may be several code units long (and they may be of different lengths). .P The quantifier {0} is permitted, causing the expression to behave as if the previous item and the quantifier were not present. This may be useful for capture groups that are referenced as .\" HTML .\" subroutines .\" from elsewhere in the pattern (but see also the section entitled .\" HTML .\" "Defining capture groups for use by reference only" .\" below). Except for parenthesized groups, items that have a {0} quantifier are omitted from the compiled pattern. .P For convenience, the three most common quantifiers have single-character abbreviations: .sp * is equivalent to {0,} + is equivalent to {1,} ? is equivalent to {0,1} .sp It is possible to construct infinite loops by following a group that can match no characters with a quantifier that has no upper limit, for example: .sp (a?)* .sp Earlier versions of Perl and PCRE1 used to give an error at compile time for such patterns. However, because there are cases where this can be useful, such patterns are now accepted, but whenever an iteration of such a group matches no characters, matching moves on to the next item in the pattern instead of repeatedly matching an empty string. This does not prevent backtracking into any of the iterations if a subsequent item fails to match. .P By default, quantifiers are "greedy", that is, they match as much as possible (up to the maximum number of permitted repetitions), without causing the rest of the pattern to fail. The classic example of where this gives problems is in trying to match comments in C programs. These appear between /* and */ and within the comment, individual * and / characters may appear. 
An attempt to match C comments by applying the pattern .sp /\e*.*\e*/ .sp to the string .sp /* first comment */ not comment /* second comment */ .sp fails, because it matches the entire string owing to the greediness of the .* item. However, if a quantifier is followed by a question mark, it ceases to be greedy, and instead matches the minimum number of times possible, so the pattern .sp /\e*.*?\e*/ .sp does the right thing with C comments. The meaning of the various quantifiers is not otherwise changed, just the preferred number of matches. Do not confuse this use of question mark with its use as a quantifier in its own right. Because it has two uses, it can sometimes appear doubled, as in .sp \ed??\ed .sp which matches one digit by preference, but can match two if that is the only way the rest of the pattern matches. .P If the PCRE2_UNGREEDY option is set (an option that is not available in Perl), the quantifiers are not greedy by default, but individual ones can be made greedy by following them with a question mark. In other words, it inverts the default behaviour. .P When a parenthesized group is quantified with a minimum repeat count that is greater than 1 or with a limited maximum, more memory is required for the compiled pattern, in proportion to the size of the minimum or maximum. .P If a pattern starts with .* or .{0,} and the PCRE2_DOTALL option (equivalent to Perl's /s) is set, thus allowing the dot to match newlines, the pattern is implicitly anchored, because whatever follows will be tried against every character position in the subject string, so there is no point in retrying the overall match at any position after the first. PCRE2 normally treats such a pattern as though it were preceded by \eA. .P In cases where it is known that the subject string contains no newlines, it is worth setting PCRE2_DOTALL in order to obtain this optimization, or alternatively, using ^ to indicate anchoring explicitly. 
.P However, there are some cases where the optimization cannot be used. When .* is inside capturing parentheses that are the subject of a backreference elsewhere in the pattern, a match at the start may fail where a later one succeeds. Consider, for example: .sp (.*)abc\e1 .sp If the subject is "xyz123abc123" the match point is the fourth character. For this reason, such a pattern is not implicitly anchored. .P Another case where implicit anchoring is not applied is when the leading .* is inside an atomic group. Once again, a match at the start may fail where a later one succeeds. Consider this pattern: .sp (?>.*?a)b .sp It matches "ab" in the subject "aab". The use of the backtracking control verbs (*PRUNE) and (*SKIP) also disables this optimization. To disable the optimization explicitly, either pass the compile option PCRE2_NO_DOTSTAR_ANCHOR, or call \fBpcre2_set_optimize()\fP with a PCRE2_DOTSTAR_ANCHOR_OFF directive. .P When a capture group is repeated, the value captured is the substring that matched the final iteration. For example, after .sp (tweedle[dume]{3}\es*)+ .sp has matched "tweedledum tweedledee" the value of the captured substring is "tweedledee". However, if there are nested capture groups, the corresponding captured values may have been set in previous iterations. For example, after .sp (a|(b))+ .sp matches "aba" the value of the second captured substring is "b". . . .\" HTML .SH "ATOMIC GROUPING AND POSSESSIVE QUANTIFIERS" .rs .sp With both maximizing ("greedy") and minimizing ("ungreedy" or "lazy") repetition, failure of what follows normally causes the repeated item to be re-evaluated to see if a different number of repeats allows the rest of the pattern to match. Sometimes it is useful to prevent this, either to change the nature of the match, or to cause it to fail earlier than it otherwise might, when the author of the pattern knows there is no point in carrying on.
.P Consider, for example, the pattern \ed+foo when applied to the subject line .sp 123456bar .sp After matching all 6 digits and then failing to match "foo", the normal action of the matcher is to try again with only 5 digits matching the \ed+ item, and then with 4, and so on, before ultimately failing. "Atomic grouping" (a term taken from Jeffrey Friedl's book) provides the means for specifying that once a group has matched, it is not to be re-evaluated in this way. .P If we use atomic grouping for the previous example, the matcher gives up immediately on failing to match "foo" the first time. The notation is a kind of special parenthesis, starting with (?> as in this example: .sp (?>\ed+)foo .sp Perl 5.28 introduced an experimental alphabetic form starting with (* which may be easier to remember: .sp (*atomic:\ed+)foo .sp This kind of parenthesized group "locks up" the part of the pattern it contains once it has matched, and a failure further into the pattern is prevented from backtracking into it. Backtracking past it to previous items, however, works as normal. .P An alternative description is that a group of this type matches exactly the string of characters that an identical standalone pattern would match, if anchored at the current point in the subject string. .P Atomic groups are not capture groups. Simple cases such as the above example can be thought of as a maximizing repeat that must swallow everything it can. So, while both \ed+ and \ed+? are prepared to adjust the number of digits they match in order to make the rest of the pattern match, (?>\ed+) can only match an entire sequence of digits. .P Atomic groups in general can of course contain arbitrarily complicated expressions, and can be nested. However, when the contents of an atomic group is just a single repeated item, as in the example above, a simpler notation, called a "possessive quantifier" can be used. This consists of an additional + character following a quantifier. 
Using this notation, the previous example can be rewritten as .sp \ed++foo .sp Note that a possessive quantifier can be used with an entire group, for example: .sp (abc|xyz){2,3}+ .sp Possessive quantifiers are always greedy; the setting of the PCRE2_UNGREEDY option is ignored. They are a convenient notation for the simpler forms of atomic group. However, there is no difference in the meaning of a possessive quantifier and the equivalent atomic group, though there may be a performance difference; possessive quantifiers should be slightly faster. .P The possessive quantifier syntax is an extension to the Perl 5.8 syntax. Jeffrey Friedl originated the idea (and the name) in the first edition of his book. Mike McCloskey liked it, so implemented it when he built Sun's Java package, and PCRE1 copied it from there. It found its way into Perl at release 5.10. .P PCRE2 has an optimization that automatically "possessifies" certain simple pattern constructs. For example, the sequence A+B is treated as A++B because there is no point in backtracking into a sequence of A's when B must follow. This feature can be disabled by the PCRE2_NO_AUTO_POSSESS option, by calling \fBpcre2_set_optimize()\fP with a PCRE2_AUTO_POSSESS_OFF directive, or by starting the pattern with (*NO_AUTO_POSSESS). .P When a pattern contains an unlimited repeat inside a group that can itself be repeated an unlimited number of times, the use of an atomic group is the only way to avoid some failing matches taking a very long time indeed. The pattern .sp (\eD+|<\ed+>)*[!?] .sp matches an unlimited number of substrings that either consist of non-digits, or digits enclosed in <>, followed by either ! or ?. When it matches, it runs quickly. However, if it is applied to .sp aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa .sp it takes a long time before reporting failure. 
This is because the string can be divided between the internal \eD+ repeat and the external * repeat in a large number of ways, and all have to be tried. (The example uses [!?] rather than a single character at the end, because both PCRE2 and Perl have an optimization that allows for fast failure when a single character is used. They remember the last single character that is required for a match, and fail early if it is not present in the string.) If the pattern is changed so that it uses an atomic group, like this: .sp ((?>\eD+)|<\ed+>)*[!?] .sp sequences of non-digits cannot be broken, and failure happens quickly. . . .\" HTML .SH "BACKREFERENCES" .rs .sp Outside a character class, a backslash followed by a digit greater than 0 (and possibly further digits) is a backreference to a capture group earlier (that is, to its left) in the pattern, provided there have been that many previous capture groups. .P However, if the decimal number following the backslash is less than 8, it is always taken as a backreference, and causes an error only if there are not that many capture groups in the entire pattern. In other words, the group that is referenced need not be to the left of the reference for numbers less than 8. A "forward backreference" of this type can make sense when a repetition is involved and the group to the right has participated in an earlier iteration. .P It is not possible to have a numerical "forward backreference" to a group whose number is 8 or more using this syntax because a sequence such as \e50 is interpreted as a character defined in octal. See the subsection entitled "Non-printing characters" .\" HTML .\" above .\" for further details of the handling of digits following a backslash. Other forms of backreferencing do not suffer from this restriction. In particular, there is no problem when named capture groups are used (see below). 
.P Another way of avoiding the ambiguity inherent in the use of digits following a backslash is to use the \eg escape sequence. This escape must be followed by a signed or unsigned number, optionally enclosed in braces. These examples are all identical: .sp (ring), \e1 (ring), \eg1 (ring), \eg{1} .sp An unsigned number specifies an absolute reference without the ambiguity that is present in the older syntax. It is also useful when literal digits follow the reference. A signed number is a relative reference. Consider this example: .sp (abc(def)ghi)\eg{-1} .sp The sequence \eg{-1} is a reference to the capture group whose number is one less than the number of the next group to be started, so in this example (where the next group would be numbered 3) it is equivalent to \e2, and \eg{-2} would be equivalent to \e1. Note that if this construct is inside a capture group, that group is included in the count, so in this example \eg{-2} also refers to group 1: .sp (A)(\eg{-2}B) .sp The use of relative references can be helpful in long patterns, and also in patterns that are created by joining together fragments that contain references within themselves. .P The sequence \eg{+1} is a reference to the next capture group that is started after this item, and \eg{+2} refers to the one after that, and so on. This kind of forward reference can be useful in patterns that repeat. Perl does not support the use of + in this way. .P A backreference matches whatever actually most recently matched the capture group in the current subject string, rather than anything at all that matches the group (see .\" HTML .\" "Groups as subroutines" .\" below for a way of doing that). So the pattern .sp (sens|respons)e and \e1ibility .sp matches "sense and sensibility" and "response and responsibility", but not "sense and responsibility". If caseful matching is in force at the time of the backreference, the case of letters is relevant.
For example, .sp ((?i)rah)\es+\e1 .sp matches "rah rah" and "RAH RAH", but not "RAH rah", even though the original capture group is matched caselessly. .P There are several different ways of writing backreferences to named capture groups. The .NET syntax is \ek{name}, the Python syntax is (?P=name), and the original Perl syntax is \ek<name> or \ek'name'. All of these are now supported by both Perl and PCRE2. Perl 5.10's unified backreference syntax, in which \eg can be used for both numeric and named references, is also supported by PCRE2. We could rewrite the above example in any of the following ways: .sp (?<p1>(?i)rah)\es+\ek<p1> (?'p1'(?i)rah)\es+\ek{p1} (?P<p1>(?i)rah)\es+(?P=p1) (?<p1>(?i)rah)\es+\eg{p1} .sp A capture group that is referenced by name may appear in the pattern before or after the reference. .P There may be more than one backreference to the same group. If a group has not actually been used in a particular match, backreferences to it always fail by default. For example, the pattern .sp (a|(bc))\e2 .sp always fails if it starts to match "a" rather than "bc". However, if the PCRE2_MATCH_UNSET_BACKREF option is set at compile time, a backreference to an unset value matches an empty string. .P Because there may be many capture groups in a pattern, all digits following a backslash are taken as part of a potential backreference number. If the pattern continues with a digit character, some delimiter must be used to terminate the backreference. If the PCRE2_EXTENDED or PCRE2_EXTENDED_MORE option is set, this can be white space. Otherwise, the \eg{} syntax or an empty comment (see .\" HTML .\" "Comments" .\" below) can be used. . . .SS "Recursive backreferences" .rs .sp A backreference that occurs inside the group to which it refers fails when the group is first used, so, for example, (a\e1) never matches. However, such references can be useful inside repeated groups. For example, the pattern .sp (a|b\e1)+ .sp matches any number of "a"s and also "aba", "ababbaa" etc.
At each iteration of the group, the backreference matches the character string corresponding to the previous iteration. In order for this to work, the pattern must be such that the first iteration does not need to match the backreference. This can be done using alternation, as in the example above, or by a quantifier with a minimum of zero. .P For versions of PCRE2 less than 10.25, backreferences of this type used to cause the group that they reference to be treated as an .\" HTML .\" atomic group. .\" This restriction no longer applies, and backtracking into such groups can occur as normal. . . .\" HTML .SH ASSERTIONS .rs .sp An assertion is a test that does not consume any characters. The test must succeed for the match to continue. The simple assertions coded as \eb, \eB, \eA, \eG, \eZ, \ez, ^ and $ are described .\" HTML .\" above. .\" .P More complicated assertions are coded as parenthesized groups. If matching such a group succeeds, matching continues after it, but with the matching position in the subject string reset to what it was before the assertion was processed. .P A special kind of assertion, called a "scan substring" assertion, matches a subpattern against a previously captured substring. This is described in the section entitled .\" HTML .\" "Scan substring assertions" .\" below. It is a PCRE2 extension, not compatible with Perl. .P The other group-based assertions are of two kinds: those that look ahead of the current position in the subject string, and those that look behind it, and in each case an assertion may be positive (must match for the assertion to be true) or negative (must not match for the assertion to be true). .P The Perl-compatible lookaround assertions are atomic. If an assertion is true, but there is a subsequent matching failure, there is no backtracking into the assertion. However, there are some cases where non-atomic assertions can be useful.
PCRE2 has some support for these, described in the section entitled .\" HTML .\" "Non-atomic assertions" .\" below, but they are not Perl-compatible. .P A lookaround assertion may appear as the condition in a .\" HTML .\" conditional group .\" (see below). In this case, the result of matching the assertion determines which branch of the condition is followed. .P Assertion groups are not capture groups. If an assertion contains capture groups within it, these are counted for the purposes of numbering the capture groups in the whole pattern. Within each branch of an assertion, locally captured substrings may be referenced in the usual way. For example, a sequence such as (.)\eg{-1} can be used to check that two adjacent characters are the same. .P When a branch within an assertion fails to match, any substrings that were captured are discarded (as happens with any pattern branch that fails to match). A negative assertion is true only when all its branches fail to match; this means that no captured substrings are ever retained after a successful negative assertion. When an assertion contains a matching branch, what happens depends on the type of assertion. .P For a positive assertion, internally captured substrings in the successful branch are retained, and matching continues with the next pattern item after the assertion. For a negative assertion, a matching branch means that the assertion is not true. If such an assertion is being used as a condition in a .\" HTML .\" conditional group .\" (see below), captured substrings are retained, because matching continues with the "no" branch of the condition. For other failing negative assertions, control passes to the previous backtracking point, thus discarding any captured strings within the assertion. .P Most assertion groups may be repeated; though it makes no sense to assert the same thing several times, the side effect of capturing in positive assertions may occasionally be useful. 
However, an assertion that forms the condition for a conditional group may not be quantified. PCRE2 used to restrict the repetition of assertions, but from release 10.35 the only restriction is that an unlimited maximum repetition is changed to be one more than the minimum. For example, {3,} is treated as {3,4}. . . .SS "Alphabetic assertion names" .rs .sp Traditionally, symbolic sequences such as (?= and (?<= have been used to specify lookaround assertions. Perl 5.28 introduced some experimental alphabetic alternatives which might be easier to remember. They all start with (* instead of (? and must be written using lower case letters. PCRE2 supports the following synonyms: .sp (*positive_lookahead: or (*pla: is the same as (?= (*negative_lookahead: or (*nla: is the same as (?! (*positive_lookbehind: or (*plb: is the same as (?<= (*negative_lookbehind: or (*nlb: is the same as (? .SS "Lookbehind assertions" .rs .sp Lookbehind assertions start with (?<= for positive assertions and (? .\" (see above) .\" can be used instead of a lookbehind assertion at the start of a pattern to get round the length limit restriction. .P In UTF-8 and UTF-16 modes, PCRE2 does not allow the \eC escape (which matches a single code unit even in a UTF mode) to appear in lookbehind assertions, because it makes it impossible to calculate the length of the lookbehind. The \eX and \eR escapes, which can match different numbers of code units, are never permitted in lookbehinds. .P .\" HTML .\" "Subroutine" .\" calls (see below) such as (?2) or (?&X) are permitted in lookbehinds, as long as the called capture group matches a limited-length string. However, .\" HTML .\" recursion, .\" that is, a "subroutine" call into a group that is already active, is not supported. .P PCRE2 supports backreferences in lookbehinds, but only if certain conditions are met. 
The PCRE2_MATCH_UNSET_BACKREF option must not be set, there must be no use of (?| in the pattern (it creates duplicate group numbers), and if the backreference is by name, the name must be unique. Of course, the referenced group must itself match a limited length substring. The following pattern matches words containing at least two characters that begin and end with the same character: .sp \eb(\ew)\ew++(?<=\e1) .P Possessive quantifiers can be used in conjunction with lookbehind assertions to specify efficient matching at the end of subject strings. Consider a simple pattern such as .sp abcd$ .sp when applied to a long string that does not match. Because matching proceeds from left to right, PCRE2 will look for each "a" in the subject and then see if what follows matches the rest of the pattern. If the pattern is specified as .sp ^.*abcd$ .sp the initial .* matches the entire string at first, but when this fails (because there is no following "a"), it backtracks to match all but the last character, then all but the last two characters, and so on. Once again the search for "a" covers the entire string, from right to left, so we are no better off. However, if the pattern is written as .sp ^.*+(?<=abcd) .sp there can be no backtracking for the .*+ item because of the possessive quantifier; it can match only the entire string. The subsequent lookbehind assertion does a single test on the last four characters. If it fails, the match fails immediately. For long strings, this approach makes a significant difference to the processing time. . . .SS "Using multiple assertions" .rs .sp Several assertions (of any sort) may occur in succession. For example, .sp (?<=\ed{3})(? .SH "NON-ATOMIC ASSERTIONS" .rs .sp Traditional lookaround assertions are atomic. That is, if an assertion is true, but there is a subsequent matching failure, there is no backtracking into the assertion. However, there are some cases where non-atomic positive assertions can be useful. 
PCRE2 provides these using the following syntax: .sp (*non_atomic_positive_lookahead: or (*napla: or (?* (*non_atomic_positive_lookbehind: or (*naplb: or (?<* .sp Consider the problem of finding the right-most word in a string that also appears earlier in the string, that is, it must appear at least twice in total. This pattern returns the required result as captured substring 1: .sp ^(?x)(*napla: .* \eb(\ew++)) (?> .*? \eb\e1\eb ){2} .sp For a subject such as "word1 word2 word3 word2 word3 word4" the result is "word3". How does it work? At the start, ^(?x) anchors the pattern and sets the "x" option, which causes white space (introduced for readability) to be ignored. Inside the assertion, the greedy .* at first consumes the entire string, but then has to backtrack until the rest of the assertion can match a word, which is captured by group 1. In other words, when the assertion first succeeds, it captures the right-most word in the string. .P The current matching point is then reset to the start of the subject, and the rest of the pattern match checks for two occurrences of the captured word, using an ungreedy .*? to scan from the left. If this succeeds, we are done, but if the last word in the string does not occur twice, this part of the pattern fails. If a traditional atomic lookahead (?= or (*pla: had been used, the assertion could not be re-entered, and the whole match would fail. The pattern would succeed only if the very last word in the subject was found twice. .P Using a non-atomic lookahead, however, means that when the last word does not occur twice in the string, the lookahead can backtrack and find the second-last word, and so on, until either the match succeeds, or all words have been tested. .P Two conditions must be met for a non-atomic assertion to be useful: the contents of one or more capturing groups must change after a backtrack into the assertion, and there must be a backreference to a changed group later in the pattern. 
If this is not the case, the rest of the pattern match fails exactly as before because nothing has changed, so using a non-atomic assertion just wastes resources. .P There is one exception to backtracking into a non-atomic assertion. If an (*ACCEPT) control verb is triggered, the assertion succeeds atomically. That is, a subsequent match failure cannot backtrack into the assertion. .P Non-atomic assertions are not supported by the alternative matching function \fBpcre2_dfa_match()\fP. They are supported by JIT, but only if they do not contain any control verbs such as (*ACCEPT). (This may change in future). Note that assertions that appear as conditions for .\" HTML .\" conditional groups .\" (see below) must be atomic. . . .\" HTML .SH "SCAN SUBSTRING ASSERTIONS" .rs .sp A special kind of assertion, not compatible with Perl, makes it possible to check the contents of a captured substring by matching it with a subpattern. Because this involves capturing, this feature is not supported by \fBpcre2_dfa_match()\fP. .P A scan substring assertion starts with the sequence (*scan_substring: or (*scs: which is followed by a list of substring numbers (absolute or relative) and/or substring names enclosed in single quotes or angle brackets, all within parentheses. The rest of the item is the subpattern that is applied to the substring, as shown in these examples: .sp (*scan_substring:(1)...) (*scs:(-2)...) (*scs:('AB')...) (*scs:(1,'AB',-2)...) .sp The list of groups is checked in the order they are given, and it is the contents of the first one that is found to be set that are scanned. When PCRE2_DUPNAMES is set and there are ambiguous group names, all groups with the same name are checked in numerical order. A scan substring assertion fails if none of the groups it references have been set. .P The pattern match on the substring is always anchored, that is, it must match from the start of the substring. There is no "bumpalong" if it does not match at the start. 
The end of the subject is temporarily reset to be the end of the substring, so \eZ, \ez, and $ will match there. However, the start of the subject is \fInot\fP reset. This means that ^ matches only if the substring is actually at the start of the main subject, but it also means that lookbehind assertions into what precedes the substring are possible. .P Here is a very simple example: find a word that contains the rare (in English) sequence of letters "rh" not at the start: .sp \eb(\ew++)(*scs:(1).+rh) .sp The first group captures a word which is then scanned by the second group. This example does not actually need this heavyweight feature; the same match can be achieved with: .sp \eb\ew+?rh\ew*\eb .sp When things are more complicated, however, scanning a captured substring can be a useful way to describe the required match. For example, there is a rather complicated pattern in the PCRE2 test data that checks an entire subject string for a palindrome, that is, the sequence of letters is the same in both directions. Suppose you want to search for individual words of two or more characters such as "level" that are palindromes: .sp (\eb\ew{2,}+\eb)(*scs:(1)...palindrome-matching-pattern...) .sp Within a substring scanning subpattern, references to other groups work as normal. Capturing groups may appear, and will retain their values during ongoing matching if the assertion succeeds. . . .SH "SCRIPT RUNS" .rs .sp In concept, a script run is a sequence of characters that are all from the same Unicode script such as Latin or Greek. However, because some scripts are commonly used together, and because some diacritical and other marks are used with multiple scripts, it is not that simple. There is a full description of the rules that PCRE2 uses in the section entitled .\" HTML .\" "Script Runs" .\" in the .\" HREF \fBpcre2unicode\fP .\" documentation.
.P If part of a pattern is enclosed between (*script_run: or (*sr: and a closing parenthesis, it fails if the sequence of characters that it matches are not a script run. After a failure, normal backtracking occurs. Script runs can be used to detect spoofing attacks using characters that look the same, but are from different scripts. The string "paypal.com" is an infamous example, where the letters could be a mixture of Latin and Cyrillic. This pattern ensures that the matched characters in a sequence of non-spaces that follow white space are a script run: .sp \es+(*sr:\eS+) .sp To be sure that they are all from the Latin script (for example), a lookahead can be used: .sp \es+(?=\ep{Latin})(*sr:\eS+) .sp This works as long as the first character is expected to be a character in that script, and not (for example) punctuation, which is allowed with any script. If this is not the case, a more creative lookahead is needed. For example, if digits, underscore, and dots are permitted at the start: .sp \es+(?=[0-9_.]*\ep{Latin})(*sr:\eS+) .sp .P In many cases, backtracking into a script run pattern fragment is not desirable. The script run can employ an atomic group to prevent this. Because this is a common requirement, a shorthand notation is provided by (*atomic_script_run: or (*asr: .sp (*asr:...) is the same as (*sr:(?>...)) .sp Note that the atomic group is inside the script run. Putting it outside would not prevent backtracking into the script run pattern. .P Support for script runs is not available if PCRE2 is compiled without Unicode support. A compile-time error is given if any of the above constructs is encountered. Script runs are not supported by the alternate matching function, \fBpcre2_dfa_match()\fP because they use the same mechanism as capturing parentheses. 
.P \fBWarning:\fP The (*ACCEPT) control verb .\" HTML .\" (see below) .\" should not be used within a script run group, because it causes an immediate exit from the group, bypassing the script run checking. . . .\" HTML .SH "CONDITIONAL GROUPS" .rs .sp It is possible to cause the matching process to obey a pattern fragment conditionally or to choose between two alternative fragments, depending on the result of an assertion, or whether a specific capture group has already been matched. The two possible forms of conditional group are: .sp (?(condition)yes-pattern) (?(condition)yes-pattern|no-pattern) .sp If the condition is satisfied, the yes-pattern is used; otherwise the no-pattern (if present) is used. An absent no-pattern is equivalent to an empty string (it always matches). If there are more than two alternatives in the group, a compile-time error occurs. Each of the two alternatives may itself contain nested groups of any form, including conditional groups; the restriction to two alternatives applies only at the level of the condition itself. This pattern fragment is an example where the alternatives are complex: .sp (?(1) (A|B|C) | (D | (?(2)E|F) | E) ) .sp .P There are five kinds of condition: references to capture groups, references to recursion, two pseudo-conditions called DEFINE and VERSION, and assertions. . . .SS "Checking for a used capture group by number" .rs .sp If the text between the parentheses consists of a sequence of digits, the condition is true if a capture group of that number has previously matched. If there is more than one capture group with the same number (see the earlier .\" .\" HTML .\" section about duplicate group numbers), .\" the condition is true if any of them have matched. An alternative notation, which is a PCRE2 extension, not supported by Perl, is to precede the digits with a plus or minus sign. In this case, the group number is relative rather than absolute. 
The most recently opened capture group (which could be enclosing this condition) can be referenced by (?(-1), the next most recent by (?(-2), and so on. Inside loops it can also make sense to refer to subsequent groups. The next capture group to be opened can be referenced as (?(+1), and so on. The value zero in any of these forms is not used; it provokes a compile-time error. .P Consider the following pattern, which contains non-significant white space to make it more readable (assume the PCRE2_EXTENDED option) and to divide it into three parts for ease of discussion: .sp ( \e( )? [^()]+ (?(1) \e) ) .sp The first part matches an optional opening parenthesis, and if that character is present, sets it as the first captured substring. The second part matches one or more characters that are not parentheses. The third part is a conditional group that tests whether or not the first capture group matched. If it did, that is, if subject started with an opening parenthesis, the condition is true, and so the yes-pattern is executed and a closing parenthesis is required. Otherwise, since no-pattern is not present, the conditional group matches nothing. In other words, this pattern matches a sequence of non-parentheses, optionally enclosed in parentheses. .P If you were embedding this pattern in a larger one, you could use a relative reference: .sp ...other stuff... ( \e( )? [^()]+ (?(-1) \e) ) ... .sp This makes the fragment independent of the parentheses in the larger pattern. . . .SS "Checking for a used capture group by name" .rs .sp Perl uses the syntax (?(<name>)...) or (?('name')...) to test for a used capture group by name. For compatibility with earlier versions of PCRE1, which had this facility before Perl, the syntax (?(name)...) is also recognized. Note, however, that undelimited names consisting of the letter R followed by digits are ambiguous (see the following section). Rewriting the above example to use a named group gives this: .sp (?<OPEN> \e( )?
[^()]+ (?(<OPEN>) \e) ) .sp If the name used in a condition of this kind is a duplicate, the test is applied to all groups of the same name, and is true if any one of them has matched. . . .SS "Checking for pattern recursion" .rs .sp "Recursion" in this sense refers to any subroutine-like call from one part of the pattern to another, whether or not it is actually recursive. See the sections entitled .\" HTML .\" "Recursive patterns" .\" and .\" HTML .\" "Groups as subroutines" .\" below for details of recursion and subroutine calls. .P If a condition is the string (R), and there is no capture group with the name R, the condition is true if matching is currently in a recursion or subroutine call to the whole pattern or any capture group. If digits follow the letter R, and there is no group with that name, the condition is true if the most recent call is into a group with the given number, which must exist somewhere in the overall pattern. This is a contrived example that is equivalent to a+b: .sp ((?(R1)a+|(?1)b)) .sp However, in both cases, if there is a capture group with a matching name, the condition tests for its being set, as described in the section above, instead of testing for recursion. For example, creating a group with the name R1 by adding (?<R1>) to the above pattern completely changes its meaning. .P If a name preceded by ampersand follows the letter R, for example: .sp (?(R&name)...) .sp the condition is true if the most recent recursion is into a group of that name (which must exist within the pattern). .P This condition does not check the entire recursion stack. It tests only the current level. If the name used in a condition of this kind is a duplicate, the test is applied to all groups of the same name, and is true if any one of them is the most recent recursion. .P At "top level", all these recursion test conditions are false. . .
.\" HTML .SS "Defining capture groups for use by reference only" .rs .sp If the condition is the string (DEFINE), the condition is always false, even if there is a group with the name DEFINE. In this case, there may be only one alternative in the rest of the conditional group. It is always skipped if control reaches this point in the pattern; the idea of DEFINE is that it can be used to define subroutines that can be referenced from elsewhere. (The use of .\" HTML .\" subroutines .\" is described below.) For example, a pattern to match an IPv4 address such as "192.168.23.245" could be written like this (ignore white space and line breaks): .sp (?(DEFINE) (?<byte> 2[0-4]\ed | 25[0-5] | 1\ed\ed | [1-9]?\ed) ) \eb (?&byte) (\e.(?&byte)){3} \eb .sp The first part of the pattern is a DEFINE group inside which another group named "byte" is defined. This matches an individual component of an IPv4 address (a number less than 256). When matching takes place, this part of the pattern is skipped because DEFINE acts like a false condition. The rest of the pattern uses references to the named group to match the four dot-separated components of an IPv4 address, insisting on a word boundary at each end. . . .SS "Checking the PCRE2 version" .rs .sp Programs that link with a PCRE2 library can check the version by calling \fBpcre2_config()\fP with appropriate arguments. Users of applications that do not have access to the underlying code cannot do this. A special "condition" called VERSION exists to allow such users to discover which version of PCRE2 they are dealing with by using this condition to match a string such as "yesno". VERSION must be followed either by "=" or ">=" and a version number. For example: .sp (?(VERSION>=10.4)yes|no) .sp This pattern matches "yes" if the PCRE2 version is greater than or equal to 10.4, or "no" otherwise. The fractional part of the version number could be omitted. . .
.SS "Assertion conditions" .rs .sp If the condition is not in any of the above formats, it must be a parenthesized assertion. This may be a positive or negative lookahead or lookbehind assertion. However, it must be a traditional atomic assertion, not one of the .\" HTML .\" non-atomic assertions. .\" .P Consider this pattern, again containing non-significant white space, and with the two alternatives on the second line: .sp (?(?=[^a-z]*[a-z]) \ed{2}-[a-z]{3}-\ed{2} | \ed{2}-\ed{2}-\ed{2} ) .sp The condition is a positive lookahead assertion that matches an optional sequence of non-letters followed by a letter. In other words, it tests for the presence of at least one letter in the subject. If a letter is found, the subject is matched against the first alternative; otherwise it is matched against the second. This pattern matches strings in one of the two forms dd-aaa-dd or dd-dd-dd, where aaa are letters and dd are digits. .P When an assertion that is a condition contains capture groups, any capturing that occurs in a matching branch is retained afterwards, for both positive and negative assertions, because matching always continues after the assertion, whether it succeeds or fails. (Compare non-conditional assertions, for which captures are retained only for positive assertions that succeed.) . . .\" HTML .SH COMMENTS .rs .sp There are two ways of including comments in patterns that are processed by PCRE2. In both cases, the start of the comment must not be in a character class, nor in the middle of any other sequence of related characters such as (?: or a group name or number or a Unicode property name. The characters that make up a comment play no part in the pattern matching. .P The sequence (?# marks the start of a comment that continues up to the next closing parenthesis. Nested parentheses are not permitted. 
If the PCRE2_EXTENDED or PCRE2_EXTENDED_MORE option is set, an unescaped # character also introduces a comment, which in this case continues to immediately after the next newline character or character sequence in the pattern. Which characters are interpreted as newlines is controlled by an option passed to the compiling function or by a special sequence at the start of the pattern, as described in the section entitled .\" HTML .\" "Newline conventions" .\" above. Note that the end of this type of comment is a literal newline sequence in the pattern; escape sequences that happen to represent a newline do not count. For example, consider this pattern when PCRE2_EXTENDED is set, and the default newline convention (a single linefeed character) is in force: .sp abc #comment \en still comment .sp On encountering the # character, \fBpcre2_compile()\fP skips along, looking for a newline in the pattern. The sequence \en is still literal at this stage, so it does not terminate the comment. Only an actual character with the code value 0x0a (the default newline) does so. . . .\" HTML .SH "RECURSIVE PATTERNS" .rs .sp Consider the problem of matching a string in parentheses, allowing for unlimited nested parentheses. Without the use of recursion, the best that can be done is to use a pattern that matches up to some fixed depth of nesting. It is not possible to handle an arbitrary nesting depth. .P For some time, Perl has provided a facility that allows regular expressions to recurse (amongst other things). It does this by interpolating Perl code in the expression at run time, and the code can refer to the expression itself. A Perl pattern using code interpolation to solve the parentheses problem can be created like this: .sp $re = qr{\e( (?: (?>[^()]+) | (?p{$re}) )* \e)}x; .sp The (?p{...}) item interpolates Perl code at run time, and in this case refers recursively to the pattern in which it appears. .P Obviously, PCRE2 cannot support the interpolation of Perl code. 
Instead, it supports special syntax for recursion of the entire pattern, and also for individual capture group recursion. After its introduction in PCRE1 and Python, this kind of recursion was subsequently introduced into Perl at release 5.10. .P A special item that consists of (? followed by a number greater than zero and a closing parenthesis is a recursive subroutine call of the capture group of the given number, provided that it occurs inside that group. (If not, it is a .\" HTML .\" non-recursive subroutine .\" call, which is described in the next section.) The special item (?R) or (?0) is a recursive call of the entire regular expression. .P This PCRE2 pattern solves the nested parentheses problem (assume the PCRE2_EXTENDED option is set so that white space is ignored): .sp \e( ( [^()]++ | (?R) )* \e) .sp First it matches an opening parenthesis. Then it matches any number of substrings which can either be a sequence of non-parentheses, or a recursive match of the pattern itself (that is, a correctly parenthesized substring). Finally there is a closing parenthesis. Note the use of a possessive quantifier to avoid backtracking into sequences of non-parentheses. .P If this were part of a larger pattern, you would not want to recurse the entire pattern, so instead you could use this: .sp ( \e( ( [^()]++ | (?1) )* \e) ) .sp We have put the pattern into parentheses, and caused the recursion to refer to them instead of the whole pattern. .P In a larger pattern, keeping track of parenthesis numbers can be tricky. This is made easier by the use of relative references. Instead of (?1) in the pattern above you can write (?-2) to refer to the second most recently opened parentheses preceding the recursion. In other words, a negative number counts capturing parentheses leftwards from the point at which it is encountered. 
.P Be aware, however, that if .\" HTML .\" duplicate capture group numbers .\" are in use, relative references refer to the earliest group with the appropriate number. Consider, for example: .sp (?|(a)|(b)) (c) (?-2) .sp The first two capture groups (a) and (b) are both numbered 1, and group (c) is number 2. When the reference (?-2) is encountered, the second most recently opened parentheses has the number 1, but it is the first such group (the (a) group) to which the recursion refers. This would be the same if an absolute reference (?1) was used. In other words, relative references are just a shorthand for computing a group number. .P It is also possible to refer to subsequent capture groups, by writing references such as (?+2). However, these cannot be recursive because the reference is not inside the parentheses that are referenced. They are always .\" HTML .\" non-recursive subroutine .\" calls, as described in the next section. .P An alternative approach is to use named parentheses. The Perl syntax for this is (?&name); PCRE1's earlier syntax (?P>name) is also supported. We could rewrite the above example as follows: .sp (?<pn> \e( ( [^()]++ | (?&pn) )* \e) ) .sp If there is more than one group with the same name, the earliest one is used. .P The example pattern that we have been looking at contains nested unlimited repeats, and so the use of a possessive quantifier for matching strings of non-parentheses is important when applying the pattern to strings that do not match. For example, when this pattern is applied to .sp (aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa() .sp it yields "no match" quickly. However, if a possessive quantifier is not used, the match runs for a very long time indeed because there are so many different ways the + and * repeats can carve up the subject, and all have to be tested before failure can be reported. .P At the end of a match, the values of capturing parentheses are those from the outermost level.
If you want to obtain intermediate values, a callout function can be used (see below and the .\" HREF \fBpcre2callout\fP .\" documentation). If the pattern above is matched against .sp (ab(cd)ef) .sp the value for the inner capturing parentheses (numbered 2) is "ef", which is the last value taken on at the top level. If a capture group is not matched at the top level, its final captured value is unset, even if it was (temporarily) set at a deeper level during the matching process. .P Do not confuse the (?R) item with the condition (R), which tests for recursion. Consider this pattern, which matches text in angle brackets, allowing for arbitrary nesting. Only digits are allowed in nested brackets (that is, when recursing), whereas any characters are permitted at the outer level. .sp < (?: (?(R) \ed++ | [^<>]*+) | (?R)) * > .sp In this pattern, (?(R) is the start of a conditional group, with two different alternatives for the recursive and non-recursive cases. The (?R) item is the actual recursive call. . . .\" HTML .SS "Differences in recursion processing between PCRE2 and Perl" .rs .sp Some former differences between PCRE2 and Perl no longer exist. .P Before release 10.30, recursion processing in PCRE2 differed from Perl in that a recursive subroutine call was always treated as an atomic group. That is, once it had matched some of the subject string, it was never re-entered, even if it contained untried alternatives and there was a subsequent matching failure. (Historical note: PCRE implemented recursion before Perl did.) .P Starting with release 10.30, recursive subroutine calls are no longer treated as atomic. That is, they can be re-entered to try unused alternatives if there is a matching failure later in the pattern. This is now compatible with the way Perl works. If you want a subroutine call to be atomic, you must explicitly enclose it in an atomic group. .P Supporting backtracking into recursions simplifies certain types of recursive pattern. 
For example, this pattern matches palindromic strings: .sp ^((.)(?1)\e2|.?)$ .sp The second branch in the group matches a single central character in the palindrome when there are an odd number of characters, or nothing when there are an even number of characters, but in order to work it has to be able to try the second case when the rest of the pattern match fails. If you want to match typical palindromic phrases, the pattern has to ignore all non-word characters, which can be done like this: .sp ^\eW*+((.)\eW*+(?1)\eW*+\e2|\eW*+.?)\eW*+$ .sp If run with the PCRE2_CASELESS option, this pattern matches phrases such as "A man, a plan, a canal: Panama!". Note the use of the possessive quantifier *+ to avoid backtracking into sequences of non-word characters. Without this, PCRE2 takes a great deal longer (ten times or more) to match typical phrases, and Perl takes so long that you think it has gone into a loop. .P Another way in which PCRE2 and Perl used to differ in their recursion processing is in the handling of captured values. Formerly in Perl, when a group was called recursively or as a subroutine (see the next section), it had no access to any values that were captured outside the recursion, whereas in PCRE2 these values can be referenced. Consider this pattern: .sp ^(.)(\e1|a(?2)) .sp This pattern matches "bab". The first capturing parentheses match "b", then in the second group, when the backreference \e1 fails to match "b", the second alternative matches "a" and then recurses. In the recursion, \e1 does now match "b" and so the whole match succeeds. This match used to fail in Perl, but in later versions (I tried 5.024) it now works. . . .\" HTML .SS "Groups as subroutines" .rs .sp If the syntax for a recursive group call (either by number or by name) is used outside the parentheses to which it refers, it operates a bit like a subroutine in a programming language. 
More accurately, PCRE2 treats the referenced group as an independent subpattern which it tries to match at the current matching position. The called group may be defined before or after the reference. A numbered reference can be absolute or relative, as in these examples: .sp (...(absolute)...)...(?2)... (...(relative)...)...(?-1)... (...(?+1)...(relative)... .sp An earlier example pointed out that the pattern .sp (sens|respons)e and \e1ibility .sp matches "sense and sensibility" and "response and responsibility", but not "sense and responsibility". If instead the pattern .sp (sens|respons)e and (?1)ibility .sp is used, it does match "sense and responsibility" as well as the other two strings. Another example is given in the discussion of DEFINE above. .P Like recursions, subroutine calls used to be treated as atomic, but this changed at PCRE2 release 10.30, so backtracking into subroutine calls can now occur. However, any capturing parentheses that are set during the subroutine call revert to their previous values afterwards. .P Processing options such as case-independence are fixed when a group is defined, so if it is used as a subroutine, such options cannot be changed for different calls. For example, consider this pattern: .sp (abc)(?i:(?-1)) .sp It matches "abcabc". It does not match "abcABC" because the change of processing option does not affect the called group. .P The behaviour of .\" HTML .\" backtracking control verbs .\" in groups when called as subroutines is described in the section entitled .\" HTML .\" "Backtracking verbs in subroutines" .\" below. . . .SS "Recursion and subroutines with returned capture groups" .rs .sp Since PCRE2 10.47, recursion and subroutine calls may also specify a list of capture groups to return. This is a PCRE2 syntax extension not supported by Perl. 
The pattern matching recurses into the referenced expression as described above; however, when the recursion returns to the calling expression the subgroups captured during the recursion can be retained when the calling expression's context is restored. .P When used as a subroutine, this allows the subroutine's capture groups to be used as return values. .P Only the specific capture groups listed by the caller will be retained, using the following syntax: .sp (?R(grouplist)) recurse whole pattern, returning capture groups (?n(grouplist)) ) (?+n(grouplist)) ) (?-n(grouplist)) ) call subroutine, returning capture groups (?&name(grouplist)) ) (?P>name(grouplist)) ) .P The list of capture groups "grouplist" is a comma-separated list of (absolute or relative) group numbers, and group names enclosed in single quotes or angle brackets. .P Here is an example which first uses the DEFINE condition to create a re-usable routine for matching a weekday, then calls that subroutine and retains the groups it captures for use later: .sp (?x: # ignore whitespace for clarity # Define the routine "weekendday" which matches Saturday or # Sunday, and returns the Sat/Sun prefix as \ek<short>. (?(DEFINE) (?<weekendday> (?|(?<short>Sat)urday|(?<short>Sun)day) ) ) # Call the routine. Matches "Saturday,Sat" or "Sunday,Sun". (?&weekendday(<short>)),\ek<short> ) .P This feature is not available using the Oniguruma syntax \eg<...> or \eg'...' below. . . .\" HTML .SS "Oniguruma subroutine syntax" .rs .sp For compatibility with Oniguruma, the non-Perl syntax \eg followed by a name or a number enclosed either in angle brackets or single quotes, is an alternative syntax for calling a group as a subroutine, possibly recursively. Here are two of the examples used above, rewritten using this syntax: .sp (?<pn> \e( ( (?>[^()]+) | \eg<pn> )* \e) ) (sens|respons)e and \eg'1'ibility .sp PCRE2 supports an extension to Oniguruma: if a number is preceded by a plus or a minus sign it is taken as a relative reference.
For example: .sp (abc)(?i:\eg<-1>) .sp Note that \eg{...} (Perl syntax) and \eg<...> (Oniguruma syntax) are \fInot\fP synonymous. The former is a backreference; the latter is a subroutine call. . . .SH CALLOUTS .rs .sp Perl has a feature whereby using the sequence (?{...}) causes arbitrary Perl code to be obeyed in the middle of matching a regular expression. This makes it possible, amongst other things, to extract different substrings that match the same pair of parentheses when there is a repetition. .P PCRE2 provides a similar feature, but of course it cannot obey arbitrary Perl code. The feature is called "callout". The caller of PCRE2 provides an external function by putting its entry point in a match context using the function \fBpcre2_set_callout()\fP, and then passing that context to \fBpcre2_match()\fP or \fBpcre2_dfa_match()\fP. If no match context is passed, or if the callout entry point is set to NULL, callout points will be passed over silently during matching. To disallow callouts in the pattern syntax, you may use the PCRE2_EXTRA_NEVER_CALLOUT option. .P Within a regular expression, (?C) indicates a point at which the external function is to be called. There are two kinds of callout: those with a numerical argument and those with a string argument. (?C) on its own with no argument is treated as (?C0). A numerical argument allows the application to distinguish between different callouts. String arguments were added for release 10.20 to make it possible for script languages that use PCRE2 to embed short scripts within patterns in a similar way to Perl. .P During matching, when PCRE2 reaches a callout point, the external function is called. It is provided with the number or string argument of the callout, the position in the pattern, and one item of data that is also set in the match block. The callout function may cause matching to proceed, to backtrack, or to fail. 
.P By default, PCRE2 implements a number of optimizations at matching time, and one side-effect is that sometimes callouts are skipped. If you need all possible callouts to happen, you need to set options that disable the relevant optimizations. More details, including a complete description of the programming interface to the callout function, are given in the .\" HREF \fBpcre2callout\fP .\" documentation. . . .SS "Callouts with numerical arguments" .rs .sp If you just want to have a means of identifying different callout points, put a number less than 256 after the letter C. For example, this pattern has two callout points: .sp (?C1)abc(?C2)def .sp If the PCRE2_AUTO_CALLOUT flag is passed to \fBpcre2_compile()\fP, numerical callouts are automatically installed before each item in the pattern. They are all numbered 255. If there is a conditional group in the pattern whose condition is an assertion, an additional callout is inserted just before the condition. An explicit callout may also be set at this position, as in this example: .sp (?(?C9)(?=a)abc|def) .sp Note that this applies only to assertion conditions, not to other types of condition. . . .SS "Callouts with string arguments" .rs .sp A delimited string may be used instead of a number as a callout argument. The starting delimiter must be one of ` ' " ^ % # $ { and the ending delimiter is the same as the start, except for {, where the ending delimiter is }. If the ending delimiter is needed within the string, it must be doubled. For example: .sp (?C'ab ''c'' d')xyz(?C{any text})pqr .sp The doubling is removed before the string is passed to the callout function. . . .\" HTML .SH "BACKTRACKING CONTROL" .rs .sp There are a number of special "Backtracking Control Verbs" (to use Perl's terminology) that modify the behaviour of backtracking during matching. They are generally of the form (*VERB) or (*VERB:NAME). 
Some verbs take either form, and may behave differently depending on whether or not a name argument is present. The names are not required to be unique within the pattern. .P By default, for compatibility with Perl, a name is any sequence of characters that does not include a closing parenthesis. The name is not processed in any way, and it is not possible to include a closing parenthesis in the name. This can be changed by setting the PCRE2_ALT_VERBNAMES option, but the result is no longer Perl-compatible. .P When PCRE2_ALT_VERBNAMES is set, backslash processing is applied to verb names and only an unescaped closing parenthesis terminates the name. However, the only backslash items that are permitted are \eQ, \eE, and sequences such as \ex{100} that define character code points. Character type escapes such as \ed are faulted. .P A closing parenthesis can be included in a name either as \e) or between \eQ and \eE. In addition to backslash processing, if the PCRE2_EXTENDED or PCRE2_EXTENDED_MORE option is also set, unescaped white space in verb names is skipped, and #-comments are recognized, exactly as in the rest of the pattern. PCRE2_EXTENDED and PCRE2_EXTENDED_MORE do not affect verb names unless PCRE2_ALT_VERBNAMES is also set. .P The maximum length of a name is 255 in the 8-bit library and 65535 in the 16-bit and 32-bit libraries. If the name is empty, that is, if the closing parenthesis immediately follows the colon, the effect is as if the colon were not there. Any number of these verbs may occur in a pattern. Except for (*ACCEPT), they may not be quantified. .P Since these verbs are specifically related to backtracking, most of them can be used only when the pattern is to be matched using the traditional matching function or JIT, because they use backtracking algorithms. With the exception of (*FAIL), which behaves like a failing negative assertion, the backtracking control verbs cause an error if encountered by the DFA matching function. 
.P The behaviour of these verbs in .\" HTML .\" repeated groups, .\" .\" HTML .\" assertions, .\" and in .\" HTML .\" capture groups called as subroutines .\" (whether or not recursively) is documented below. . . .\" HTML .SS "Optimizations that affect backtracking verbs" .rs .sp PCRE2 contains some optimizations that are used to speed up matching by running some checks at the start of each match attempt. For example, it may know the minimum length of matching subject, or that a particular character must be present. When one of these optimizations bypasses the running of a match, any included backtracking verbs will not, of course, be processed. You can suppress the start-of-match optimizations by setting the PCRE2_NO_START_OPTIMIZE option when calling \fBpcre2_compile()\fP, by calling \fBpcre2_set_optimize()\fP with a PCRE2_START_OPTIMIZE_OFF directive, or by starting the pattern with (*NO_START_OPT). There is more discussion of this option in the section entitled .\" HTML .\" "Compiling a pattern" .\" in the .\" HREF \fBpcre2api\fP .\" documentation. .P Experiments with Perl suggest that it too has similar optimizations, and like PCRE2, turning them off can change the result of a match. . . .\" HTML .SS "Verbs that act immediately" .rs .sp The following verbs act as soon as they are encountered. .sp (*ACCEPT) or (*ACCEPT:NAME) .sp This verb causes the match to end successfully, skipping the remainder of the pattern. However, when it is inside a capture group that is called as a subroutine, only that group is ended successfully. Matching then continues at the outer level. If (*ACCEPT) is triggered in a positive assertion, the assertion succeeds; in a negative assertion, the assertion fails. .P If (*ACCEPT) is inside capturing parentheses, the data so far is captured. For example: .sp A((?:A|B(*ACCEPT)|C)D) .sp This matches "AB", "AAD", or "ACD"; when it matches "AB", "B" is captured by the outer parentheses.
.P (*ACCEPT) is the only backtracking verb that is allowed to be quantified because an ungreedy quantification with a minimum of zero acts only when a backtrack happens. Consider, for example, .sp (A(*ACCEPT)??B)C .sp where A, B, and C may be complex expressions. After matching "A", the matcher processes "BC"; if that fails, causing a backtrack, (*ACCEPT) is triggered and the match succeeds. In both cases, all but C is captured. Whereas (*COMMIT) (see below) means "fail on backtrack", a repeated (*ACCEPT) of this type means "succeed on backtrack". .P \fBWarning:\fP (*ACCEPT) should not be used within a script run group, because it causes an immediate exit from the group, bypassing the script run checking. .sp (*FAIL) or (*FAIL:NAME) .sp This verb causes a matching failure, forcing backtracking to occur. It may be abbreviated to (*F). It is equivalent to (?!) but easier to read. The Perl documentation notes that it is probably useful only when combined with (?{}) or (??{}). Those are, of course, Perl features that are not present in PCRE2. The nearest equivalent is the callout feature, as for example in this pattern: .sp a+(?C)(*FAIL) .sp A match with the string "aaaa" always fails, but the callout is taken before each backtrack happens (in this example, 10 times). .P (*ACCEPT:NAME) and (*FAIL:NAME) behave the same as (*MARK:NAME)(*ACCEPT) and (*MARK:NAME)(*FAIL), respectively, that is, a (*MARK) is recorded just before the verb acts. . . .SS "Recording which path was taken" .rs .sp There is one verb whose main purpose is to track how a match was arrived at, though it also has a secondary use in conjunction with advancing the match starting point (see (*SKIP) below). .sp (*MARK:NAME) or (*:NAME) .sp A name is always required with this verb. For all the other backtracking control verbs, a NAME argument is optional. 
.P When a match succeeds, the name of the last-encountered mark name on the matching path is passed back to the caller as described in the section entitled .\" HTML .\" "Other information about the match" .\" in the .\" HREF \fBpcre2api\fP .\" documentation. This applies to all instances of (*MARK) and other verbs, including those inside assertions and atomic groups. However, there are differences in those cases when (*MARK) is used in conjunction with (*SKIP) as described below. .P The mark name that was last encountered on the matching path is passed back. A verb without a NAME argument is ignored for this purpose. Here is an example of \fBpcre2test\fP output, where the "mark" modifier requests the retrieval and outputting of (*MARK) data: .sp re> /X(*MARK:A)Y|X(*MARK:B)Z/mark data> XY 0: XY MK: A XZ 0: XZ MK: B .sp The (*MARK) name is tagged with "MK:" in this output, and in this example it indicates which of the two alternatives matched. This is a more efficient way of obtaining this information than putting each alternative in its own capturing parentheses. .P If a verb with a name is encountered in a positive assertion that is true, the name is recorded and passed back if it is the last-encountered. This does not happen for negative assertions or failing positive assertions. .P After a partial match or a failed match, the last encountered name in the entire match process is returned. For example: .sp re> /X(*MARK:A)Y|X(*MARK:B)Z/mark data> XP No match, mark = B .sp Note that in this unanchored example the mark is retained from the match attempt that started at the letter "X" in the subject. Subsequent match attempts starting at "P" and then with an empty string do not get as far as the (*MARK) item, but nevertheless do not reset it. 
.P If you are interested in (*MARK) values after failed matches, you should probably either set the PCRE2_NO_START_OPTIMIZE option or call \fBpcre2_set_optimize()\fP with a PCRE2_START_OPTIMIZE_OFF directive .\" HTML .\" (see above) .\" to ensure that the match is always attempted. . . .SS "Verbs that act after backtracking" .rs .sp The following verbs do nothing when they are encountered. Matching continues with what follows, but if there is a subsequent match failure, causing a backtrack to the verb, a failure is forced. That is, backtracking cannot pass to the left of the verb. However, when one of these verbs appears inside an atomic group or in an atomic lookaround assertion that is true, its effect is confined to that group, because once the group has been matched, there is never any backtracking into it. Backtracking from beyond an atomic assertion or group ignores the entire group, and seeks a preceding backtracking point. .P These verbs differ in exactly what kind of failure occurs when backtracking reaches them. The behaviour described below is what happens when the verb is not in a subroutine or an assertion. Subsequent sections cover these special cases. .sp (*COMMIT) or (*COMMIT:NAME) .sp This verb causes the whole match to fail outright if there is a later matching failure that causes backtracking to reach it. Even if the pattern is unanchored, no further attempts to find a match by advancing the starting point take place. If (*COMMIT) is the only backtracking verb that is encountered, once it has been passed \fBpcre2_match()\fP is committed to finding a match at the current starting point, or not at all. For example: .sp a+(*COMMIT)b .sp This matches "xxaab" but not "aacaab". It can be thought of as a kind of dynamic anchor, or "I've started, so I must finish." .P The behaviour of (*COMMIT:NAME) is not the same as (*MARK:NAME)(*COMMIT). It is like (*MARK:NAME) in that the name is remembered for passing back to the caller. 
However, (*SKIP:NAME) searches only for names that are set with (*MARK), ignoring those set by any of the other backtracking verbs. .P If there is more than one backtracking verb in a pattern, a different one that follows (*COMMIT) may be triggered first, so merely passing (*COMMIT) during a match does not always guarantee that a match must be at this starting point. .P Note that (*COMMIT) at the start of a pattern is not the same as an anchor, unless PCRE2's start-of-match optimizations are turned off, as shown in this output from \fBpcre2test\fP: .sp re> /(*COMMIT)abc/ data> xyzabc 0: abc data> re> /(*COMMIT)abc/no_start_optimize data> xyzabc No match .sp For the first pattern, PCRE2 knows that any match must start with "a", so the optimization skips along the subject to "a" before applying the pattern to the first set of data. The match attempt then succeeds. The second pattern disables the optimization that skips along to the first character. The pattern is now applied starting at "x", and so the (*COMMIT) causes the match to fail without trying any other starting points. .sp (*PRUNE) or (*PRUNE:NAME) .sp This verb causes the match to fail at the current starting position in the subject if there is a later matching failure that causes backtracking to reach it. If the pattern is unanchored, the normal "bumpalong" advance to the next starting character then happens. Backtracking can occur as usual to the left of (*PRUNE), before it is reached, or when matching to the right of (*PRUNE), but if there is no match to the right, backtracking cannot cross (*PRUNE). In simple cases, the use of (*PRUNE) is just an alternative to an atomic group or possessive quantifier, but there are some uses of (*PRUNE) that cannot be expressed in any other way. In an anchored pattern (*PRUNE) has the same effect as (*COMMIT). .P The behaviour of (*PRUNE:NAME) is not the same as (*MARK:NAME)(*PRUNE). It is like (*MARK:NAME) in that the name is remembered for passing back to the caller. 
However, (*SKIP:NAME) searches only for names set with (*MARK), ignoring those set by other backtracking verbs. .sp (*SKIP) .sp This verb, when given without a name, is like (*PRUNE), except that if the pattern is unanchored, the "bumpalong" advance is not to the next character, but to the position in the subject where (*SKIP) was encountered. (*SKIP) signifies that whatever text was matched leading up to it cannot be part of a successful match if there is a later mismatch. Consider: .sp a+(*SKIP)b .sp If the subject is "aaaac...", after the first match attempt fails (starting at the first character in the string), the starting point skips on to start the next attempt at "c". Note that a possessive quantifier does not have the same effect as this example; although it would suppress backtracking during the first match attempt, the second attempt would start at the second character instead of skipping on to "c". .P If (*SKIP) is used to specify a new starting position that is the same as the starting position of the current match, or (by being inside a lookbehind) earlier, the position specified by (*SKIP) is ignored, and instead the normal "bumpalong" occurs. .sp (*SKIP:NAME) .sp When (*SKIP) has an associated name, its behaviour is modified. When such a (*SKIP) is triggered, the previous path through the pattern is searched for the most recent (*MARK) that has the same name. If one is found, the "bumpalong" advance is to the subject position that corresponds to that (*MARK) instead of to where (*SKIP) was encountered. If no (*MARK) with a matching name is found, the (*SKIP) is ignored. .P The search for a (*MARK) name uses the normal backtracking mechanism, which means that it does not see (*MARK) settings that are inside atomic groups or assertions, because they are never re-entered by backtracking. 
Compare the following \fBpcre2test\fP examples: .sp re> /a(?>(*MARK:X))(*SKIP:X)(*F)|(.)/ data: abc 0: a 1: a data: re> /a(?:(*MARK:X))(*SKIP:X)(*F)|(.)/ data: abc 0: b 1: b .sp In the first example, the (*MARK) setting is in an atomic group, so it is not seen when (*SKIP:X) triggers, causing the (*SKIP) to be ignored. This allows the second branch of the pattern to be tried at the first character position. In the second example, the (*MARK) setting is not in an atomic group. This allows (*SKIP:X) to find the (*MARK) when it backtracks, and this causes a new matching attempt to start at the second character. This time, the (*MARK) is never seen because "a" does not match "b", so the matcher immediately jumps to the second branch of the pattern. .P Note that (*SKIP:NAME) searches only for names set by (*MARK:NAME). It ignores names that are set by other backtracking verbs. .sp (*THEN) or (*THEN:NAME) .sp This verb causes a skip to the next innermost alternative when backtracking reaches it. That is, it cancels any further backtracking within the current alternative. Its name comes from the observation that it can be used for a pattern-based if-then-else block: .sp ( COND1 (*THEN) FOO | COND2 (*THEN) BAR | COND3 (*THEN) BAZ ) ... .sp If the COND1 pattern matches, FOO is tried (and possibly further items after the end of the group if FOO succeeds); on failure, the matcher skips to the second alternative and tries COND2, without backtracking into COND1. If that succeeds and BAR fails, COND3 is tried. If subsequently BAZ fails, there are no more alternatives, so there is a backtrack to whatever came before the entire group. If (*THEN) is not inside an alternation, it acts like (*PRUNE). .P The behaviour of (*THEN:NAME) is not the same as (*MARK:NAME)(*THEN). It is like (*MARK:NAME) in that the name is remembered for passing back to the caller. However, (*SKIP:NAME) searches only for names set with (*MARK), ignoring those set by other backtracking verbs. 
.P A group that does not contain a | character is just a part of the enclosing alternative; it is not a nested alternation with only one alternative. The effect of (*THEN) extends beyond such a group to the enclosing alternative. Consider this pattern, where A, B, etc. are complex pattern fragments that do not contain any | characters at this level: .sp A (B(*THEN)C) | D .sp If A and B are matched, but there is a failure in C, matching does not backtrack into A; instead it moves to the next alternative, that is, D. However, if the group containing (*THEN) is given an alternative, it behaves differently: .sp A (B(*THEN)C | (*FAIL)) | D .sp The effect of (*THEN) is now confined to the inner group. After a failure in C, matching moves to (*FAIL), which causes the whole group to fail because there are no more alternatives to try. In this case, matching does backtrack into A. .P Note that a conditional group is not considered as having two alternatives, because only one is ever used. In other words, the | character in a conditional group has a different meaning. Ignoring white space, consider: .sp ^.*? (?(?=a) a | b(*THEN)c ) .sp If the subject is "ba", this pattern does not match. Because .*? is ungreedy, it initially matches zero characters. The condition (?=a) then fails, the character "b" is matched, but "c" is not. At this point, matching does not backtrack to .*? as might perhaps be expected from the presence of the | character. The conditional group is part of the single alternative that comprises the whole pattern, and so the match fails. (If there was a backtrack into .*?, allowing it to match "b", the match would succeed.) .P The verbs just described provide four different "strengths" of control when subsequent matching fails. (*THEN) is the weakest, carrying on the match at the next alternative. (*PRUNE) comes next, failing the match at the current starting position, but allowing an advance to the next character (for an unanchored pattern). 
(*SKIP) is similar, except that the advance may be more than one character. (*COMMIT) is the strongest, causing the entire match to fail. . . .SS "More than one backtracking verb" .rs .sp If more than one backtracking verb is present in a pattern, the one that is backtracked onto first acts. For example, consider this pattern, where A, B, etc. are complex pattern fragments: .sp (A(*COMMIT)B(*THEN)C|ABD) .sp If A matches but B fails, the backtrack to (*COMMIT) causes the entire match to fail. However, if A and B match, but C fails, the backtrack to (*THEN) causes the next alternative (ABD) to be tried. This behaviour is consistent, but is not always the same as Perl's. It means that if two or more backtracking verbs appear in succession, all but the last of them has no effect. Consider this example: .sp ...(*COMMIT)(*PRUNE)... .sp If there is a matching failure to the right, backtracking onto (*PRUNE) causes it to be triggered, and its action is taken. There can never be a backtrack onto (*COMMIT). . . .\" HTML .SS "Backtracking verbs in repeated groups" .rs .sp PCRE2 sometimes differs from Perl in its handling of backtracking verbs in repeated groups. For example, consider: .sp /(a(*COMMIT)b)+ac/ .sp If the subject is "abac", Perl matches unless its optimizations are disabled, but PCRE2 always fails because the (*COMMIT) in the second repeat of the group acts. . . .\" HTML .SS "Backtracking verbs in assertions" .rs .sp (*FAIL) in any assertion has its normal effect: it forces an immediate backtrack. The behaviour of the other backtracking verbs depends on whether or not the assertion is standalone or acting as the condition in a conditional group. .P (*ACCEPT) in a standalone positive assertion causes the assertion to succeed without any further processing; captured strings and a mark name (if set) are retained. 
In a standalone negative assertion, (*ACCEPT) causes the assertion to fail without any further processing; captured substrings and any mark name are discarded. .P If the assertion is a condition, (*ACCEPT) causes the condition to be true for a positive assertion and false for a negative one; captured substrings are retained in both cases. .P The remaining verbs act only when a later failure causes a backtrack to reach them. This means that, for the Perl-compatible assertions, their effect is confined to the assertion, because Perl lookaround assertions are atomic. A backtrack that occurs after such an assertion is complete does not jump back into the assertion. Note in particular that a (*MARK) name that is set in an assertion is not "seen" by an instance of (*SKIP:NAME) later in the pattern. .P PCRE2 now supports non-atomic positive assertions and also "scan substring" assertions, as described in the sections entitled .\" HTML .\" "Non-atomic assertions" .\" and .\" HTML .\" "Scan substring assertions" .\" above. These assertions must be standalone (not used as conditions). They are not Perl-compatible. For these assertions, a later backtrack does jump back into the assertion, and therefore verbs such as (*COMMIT) can be triggered by backtracks from later in the pattern. .P The effect of (*THEN) is not allowed to escape beyond an assertion. If there are no more branches to try, (*THEN) causes a positive assertion to be false, and a negative assertion to be true. This behaviour differs from Perl when the assertion has only one branch. .P The other backtracking verbs are not treated specially if they appear in a standalone positive assertion. In a conditional positive assertion, backtracking (from within the assertion) into (*COMMIT), (*SKIP), or (*PRUNE) causes the condition to be false. 
However, for both standalone and conditional negative assertions, backtracking into (*COMMIT), (*SKIP), or (*PRUNE) causes the assertion to be true, without considering any further alternative branches. . . .\" HTML .SS "Backtracking verbs in subroutines" .rs .sp These behaviours occur whether or not the group is called recursively. .P (*ACCEPT) in a group called as a subroutine causes the subroutine match to succeed without any further processing. Matching then continues after the subroutine call. Perl documents this behaviour. Perl's treatment of the other verbs in subroutines is different in some cases. .P (*FAIL) in a group called as a subroutine has its normal effect: it forces an immediate backtrack. .P (*COMMIT), (*SKIP), and (*PRUNE) cause the subroutine match to fail when triggered by being backtracked to in a group called as a subroutine. There is then a backtrack at the outer level. .P (*THEN), when triggered, skips to the next alternative in the innermost enclosing group that has alternatives (its normal behaviour). However, if there is no such group within the subroutine's group, the subroutine match fails and there is a backtrack at the outer level. . . .\" HTML .SH "EBCDIC ENVIRONMENTS" .rs .sp Differences in the way PCRE2 behaves when it is running in an EBCDIC environment are covered in this section. . . .SS "Escape sequences" .rs .sp When PCRE2 is compiled in EBCDIC mode, \eN{U+hhh..} is not supported. \ea, \ee, \ef, \en, \er, and \et generate the appropriate EBCDIC code values. The \ec escape is processed as specified for Perl in the \fBperlebcdic\fP document. The only characters that are allowed after \ec are A-Z, a-z, or one of @, [, \e, ], ^, _, or ?. Any other character provokes a compile-time error. The sequence \ec@ encodes character code 0; after \ec the letters (in either case) encode characters 1-26 (hex 01 to hex 1A); [, \e, ], ^, and _ encode characters 27-31 (hex 1B to hex 1F), and \ec? becomes either 255 (hex FF) or 95 (hex 5F).
.P Thus, apart from \ec?, these escapes generate the same character code values as they do in an ASCII or Unicode environment, though the meanings of the values mostly differ. For example, \ecG always generates code value 7, which is BEL in ASCII but DEL in EBCDIC. .P The sequence \ec? generates DEL (127, hex 7F) in an ASCII environment, but because 127 is not a control character in EBCDIC, Perl makes it generate the APC character. Unfortunately, there are several variants of EBCDIC. In most of them the APC character has the value 255 (hex FF), but in the one Perl calls POSIX-BC its value is 95 (hex 5F). If certain other characters have POSIX-BC values, PCRE2 makes \ec? generate 95; otherwise it generates 255. . . .SS "Character classes" .rs .sp In character classes there is a special case in EBCDIC environments for ranges whose end points are both specified as literal letters in the same case. For compatibility with Perl, EBCDIC code points within the range that are not letters are omitted. For example, [h-k] matches only four characters, even though the EBCDIC codes for h and k are 0x88 and 0x92, a range of 11 code points. However, if the range is specified numerically, for example, [\ex88-\ex92] or [h-\ex92], all code points are included. . . .SH "SEE ALSO" .rs .sp \fBpcre2api\fP(3), \fBpcre2callout\fP(3), \fBpcre2matching\fP(3), \fBpcre2syntax\fP(3), \fBpcre2\fP(3). . . .SH AUTHOR .rs .sp .nf Philip Hazel Retired from University Computing Service Cambridge, England. .fi . . .SH REVISION .rs .sp .nf Last updated: 03 September 2025 Copyright (c) 1997-2024 University of Cambridge. .fi ================================================ FILE: doc/pcre2perform.3 ================================================ .TH PCRE2PERFORM 3 "06 December 2022" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH "PCRE2 PERFORMANCE" .rs .sp Two aspects of performance are discussed below: memory usage and processing time. 
The way you express your pattern as a regular expression can affect both of them. . .SH "COMPILED PATTERN MEMORY USAGE" .rs .sp Patterns are compiled by PCRE2 into a reasonably efficient interpretive code, so that most simple patterns do not use much memory for storing the compiled version. However, there is one case where the memory usage of a compiled pattern can be unexpectedly large. If a parenthesized group has a quantifier with a minimum greater than 1 and/or a limited maximum, the whole group is repeated in the compiled code. For example, the pattern .sp (abc|def){2,4} .sp is compiled as if it were .sp (abc|def)(abc|def)((abc|def)(abc|def)?)? .sp (Technical aside: It is done this way so that backtrack points within each of the repetitions can be independently maintained.) .P For regular expressions whose quantifiers use only small numbers, this is not usually a problem. However, if the numbers are large, and particularly if such repetitions are nested, the memory usage can become an embarrassment. For example, the very simple pattern .sp ((ab){1,1000}c){1,3} .sp uses over 50KiB when compiled using the 8-bit library. When PCRE2 is compiled with its default internal pointer size of two bytes, the size limit on a compiled pattern is 65535 code units in the 8-bit and 16-bit libraries, and this is reached with the above pattern if the outer repetition is increased from 3 to 4. PCRE2 can be compiled to use larger internal pointers and thus handle larger compiled patterns, but it is better to try to rewrite your pattern to use less memory if you can. .P One way of reducing the memory usage for such patterns is to make use of PCRE2's .\" HTML .\" "subroutine" .\" facility. Re-writing the above pattern as .sp ((ab)(?2){0,999}c)(?1){0,2} .sp reduces the memory requirements to around 16KiB, and indeed it remains under 20KiB even with the outer repetition increased to 100. 
However, this kind of pattern is not always exactly equivalent, because any captures within subroutine calls are lost when the subroutine completes. If this is not a problem, this kind of rewriting will allow you to process patterns that PCRE2 cannot otherwise handle. The matching performance of the two different versions of the pattern is roughly the same. (This applies from release 10.30 - things were different in earlier releases.) . . .SH "STACK AND HEAP USAGE AT RUN TIME" .rs .sp From release 10.30, the interpretive (non-JIT) version of \fBpcre2_match()\fP uses very little system stack at run time. In earlier releases recursive function calls could use a great deal of stack, and this could cause problems, but this usage has been eliminated. Backtracking positions are now explicitly remembered in memory frames controlled by the code. .P The size of each frame depends on the size of pointer variables and the number of capturing parenthesized groups in the pattern being matched. On a 64-bit system the frame size for a pattern with no captures is 128 bytes. For each capturing group the size increases by 16 bytes. .P Until release 10.41, an initial 20KiB frames vector was allocated on the system stack, but this still caused some issues for multi-thread applications where each thread has a very small stack. From release 10.41 backtracking memory frames are always held in heap memory. An initial heap allocation is obtained the first time any match data block is passed to \fBpcre2_match()\fP. This is remembered with the match data block and re-used if that block is used for another match. It is freed when the match data block itself is freed. .P The size of the initial block is the larger of 20KiB or ten times the pattern's frame size, unless the heap limit is less than this, in which case the heap limit is used. If the initial block proves to be too small during matching, it is replaced by a larger block, subject to the heap limit.
The heap limit is checked only when a new block is to be allocated. Reducing the heap limit between calls to \fBpcre2_match()\fP with the same match data block does not affect the saved block. .P In contrast to \fBpcre2_match()\fP, \fBpcre2_dfa_match()\fP does use recursive function calls, but only for processing atomic groups, lookaround assertions, and recursion within the pattern. The original version of the code used to allocate quite large internal workspace vectors on the stack, which caused some problems for some patterns in environments with small stacks. From release 10.32 the code for \fBpcre2_dfa_match()\fP has been re-factored to use heap memory when necessary for internal workspace when recursing, though recursive function calls are still used. .P The "match depth" parameter can be used to limit the depth of function recursion, and the "match heap" parameter to limit heap memory in \fBpcre2_dfa_match()\fP. . . .SH "PROCESSING TIME" .rs .sp Certain items in regular expression patterns are processed more efficiently than others. It is more efficient to use a character class like [aeiou] than a set of single-character alternatives such as (a|e|i|o|u). In general, the simplest construction that provides the required behaviour is usually the most efficient. Jeffrey Friedl's book contains a lot of useful general discussion about optimizing regular expressions for efficient performance. This document contains a few observations about PCRE2. .P Using Unicode character properties (the \ep, \eP, and \eX escapes) is slow, because PCRE2 has to use a multi-stage table lookup whenever it needs a character's property. If you can find an alternative pattern that does not use character properties, it will probably be faster. .P By default, the escape sequences \eb, \ed, \es, and \ew, and the POSIX character classes such as [:alpha:] do not use Unicode properties, partly for backwards compatibility, and partly for performance reasons. 
However, you can set the PCRE2_UCP option or start the pattern with (*UCP) if you want Unicode character properties to be used. This can double the matching time for items such as \ed, when matched with \fBpcre2_match()\fP; the performance loss is less with a DFA matching function, and in both cases there is not much difference for \eb. .P When a pattern begins with .* not in atomic parentheses, nor in parentheses that are the subject of a backreference, and the PCRE2_DOTALL option is set, the pattern is implicitly anchored by PCRE2, since it can match only at the start of a subject string. If the pattern has multiple top-level branches, they must all be anchorable. The optimization can be disabled by the PCRE2_NO_DOTSTAR_ANCHOR option, and is automatically disabled if the pattern contains (*PRUNE) or (*SKIP). .P If PCRE2_DOTALL is not set, PCRE2 cannot make this optimization, because the dot metacharacter does not then match a newline, and if the subject string contains newlines, the pattern may match from the character immediately following one of them instead of from the very start. For example, the pattern .sp .*second .sp matches the subject "first\enand second" (where \en stands for a newline character), with the match starting at the seventh character. In order to do this, PCRE2 has to retry the match starting after every newline in the subject. .P If you are using such a pattern with subject strings that do not contain newlines, the best performance is obtained by setting PCRE2_DOTALL, or starting the pattern with ^.* or ^.*? to indicate explicit anchoring. That saves PCRE2 from having to scan along the subject looking for a newline to restart at. .P Beware of patterns that contain nested indefinite repeats. These can take a long time to run when applied to a string that does not match. Consider the pattern fragment .sp ^(a+)* .sp This can match "aaaa" in 16 different ways, and this number increases very rapidly as the string gets longer. 
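(The * repeat can match 0, 1, 2, 3, or 4 times, and for each of those cases other than 0 or 4, the + repeats can match different numbers of times.) When the remainder of the pattern is such that the entire match is going to fail, PCRE2 has in principle to try every possible variation, and this can take an extremely long time, even for relatively short strings. .P An optimization catches some of the more simple cases such as .sp (a+)*b .sp where a literal character follows. Before embarking on the standard matching procedure, PCRE2 checks that there is a "b" later in the subject string, and if there is not, it fails the match immediately. However, when there is no following literal this optimization cannot be used. You can see the difference by comparing the behaviour of .sp (a+)*\ed .sp with the pattern above. The pattern above gives a failure almost instantly when applied to a whole line of "a" characters, whereas the version that ends with \ed takes an appreciable time with strings longer than about 20 characters. .P In many cases, the solution to this kind of performance issue is to use an atomic group or a possessive quantifier. This can often reduce memory requirements as well. As another example, consider this pattern: .sp ([^<]|<(?!inet))+ .sp It matches from wherever it starts until it encounters "<inet" or the end of the data, and is the kind of pattern that might be used when processing an XML file. Each iteration of the outer parentheses matches either one character that is not "<" or a "<" that is not followed by "inet". However, each time a parenthesis is processed, a backtracking position is passed, so this formulation uses a memory frame for each matched character. For a long string, a lot of memory is required. Consider now this rewritten pattern, which matches exactly the same strings: .sp ([^<]++|<(?!inet))+ .sp This runs much faster, because sequences of characters that do not contain "<" are "swallowed" in one item inside the parentheses, and a possessive quantifier is used to stop any backtracking into the runs of non-"<" characters. This version also uses a lot less memory because entry to a new set of parentheses happens only when a "<" character that is not followed by "inet" is encountered (and we assume this is relatively rare). .P This example shows that one way of optimizing performance when matching long subject strings is to write repeated parenthesized subpatterns to match more than one character whenever possible. . . .SS "SETTING RESOURCE LIMITS" .rs .sp You can set limits on the amount of processing that takes place when matching, and on the amount of heap memory that is used. The default values of the limits are very large, and unlikely ever to operate. They can be changed when PCRE2 is built, and they can also be set when \fBpcre2_match()\fP or \fBpcre2_dfa_match()\fP is called. For details of these interfaces, see the .\" HREF \fBpcre2build\fP .\" documentation and the section entitled .\" HTML .\" "The match context" .\" in the .\" HREF \fBpcre2api\fP .\" documentation. .P The \fBpcre2test\fP test program has a modifier called "find_limits" which, if applied to a subject line, causes it to find the smallest limits that allow a pattern to match. This is done by repeatedly matching with different limits. . . .SH AUTHOR .rs .sp .nf Philip Hazel Retired from University Computing Service Cambridge, England. .fi . . .SH REVISION .rs .sp .nf Last updated: 06 December 2022 Copyright (c) 1997-2022 University of Cambridge.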
.fi ================================================ FILE: doc/pcre2posix.3 ================================================ .TH PCRE2POSIX 3 "27 November 2024" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH "SYNOPSIS" .rs .sp .B #include <pcre2posix.h> .PP .nf .B int pcre2_regcomp(regex_t *\fIpreg\fP, const char *\fIpattern\fP, .B " int \fIcflags\fP);" .sp .B int pcre2_regexec(const regex_t *\fIpreg\fP, const char *\fIstring\fP, .B " size_t \fInmatch\fP, regmatch_t \fIpmatch\fP[], int \fIeflags\fP);" .sp .B "size_t pcre2_regerror(int \fIerrcode\fP, const regex_t *\fIpreg\fP," .B " char *\fIerrbuf\fP, size_t \fIerrbuf_size\fP);" .sp .B void pcre2_regfree(regex_t *\fIpreg\fP); .fi . .SH DESCRIPTION .rs .sp This set of functions provides a POSIX-style API for the PCRE2 regular expression 8-bit library. There are no POSIX-style wrappers for PCRE2's 16-bit and 32-bit libraries. See the .\" HREF \fBpcre2api\fP .\" documentation for a description of PCRE2's native API, which contains much additional functionality. .P \fBIMPORTANT NOTE\fP: The functions described here are NOT thread-safe, and should not be used in multi-threaded applications. They are also limited to processing subjects that are not bigger than 2GB. Use the native API instead. .P These functions are wrapper functions that ultimately call the PCRE2 native API. Their prototypes are defined in the \fBpcre2posix.h\fP header file, and they all have unique names starting with \fBpcre2_\fP. However, the \fBpcre2posix.h\fP header also contains macro definitions that convert the standard POSIX names such as \fBregcomp()\fP into \fBpcre2_regcomp()\fP etc. This means that a program can use the usual POSIX names without running the risk of accidentally linking with POSIX functions from a different library. .P On Unix-like systems the PCRE2 POSIX library is called \fBlibpcre2-posix\fP, so can be accessed by adding \fB-lpcre2-posix\fP to the command for linking an application.
Because the POSIX functions call the native ones, it is also necessary to add \fB-lpcre2-8\fP. .P On Windows systems, if you are linking to a DLL version of the library, it is recommended that \fBPCRE2POSIX_SHARED\fP is defined before including the \fBpcre2posix.h\fP header, as it will allow for a more efficient way to invoke the functions by adding the \fB__declspec(dllimport)\fP decorator. .P Although they were not defined as prototypes in \fBpcre2posix.h\fP, releases 10.33 to 10.36 of the library contained functions with the POSIX names \fBregcomp()\fP etc. These simply passed their arguments to the PCRE2 functions. These functions were provided for backwards compatibility with earlier versions of PCRE2, which had only POSIX names. However, this has proved troublesome in situations where a program links with several libraries, some of which use PCRE2's POSIX interface while others use the real POSIX functions. For this reason, the POSIX names have been removed since release 10.37. .P Calling the header file \fBpcre2posix.h\fP avoids any conflict with other POSIX libraries. It can, of course, be renamed or aliased as \fBregex.h\fP, which is the "correct" name, if there is no clash. It provides two structure types, \fIregex_t\fP for compiled internal forms, and \fIregmatch_t\fP for returning captured substrings. It also defines some constants whose names start with "REG_"; these are used for setting options and identifying error codes. . . .SH "USING THE POSIX FUNCTIONS" .rs .sp Note that these functions are just POSIX-style wrappers for PCRE2's native API. They do not give POSIX regular expression behaviour, and they are not thread-safe or even POSIX compatible. .P Those POSIX option bits that can reasonably be mapped to PCRE2 native options have been implemented. In addition, the option REG_EXTENDED is defined with the value zero. 
This has no effect, but since programs that are written to the POSIX interface often use it, this makes it easier to slot in PCRE2 as a replacement library. Other POSIX options are not even defined. .P There are also some options that are not defined by POSIX. These have been added at the request of users who want to make use of certain PCRE2-specific features via the POSIX calling interface or to add BSD or GNU functionality. .P When PCRE2 is called via these functions, it is only the API that is POSIX-like in style. The syntax and semantics of the regular expressions themselves are still those of Perl, subject to the setting of various PCRE2 options, as described below. "POSIX-like in style" means that the API approximates to the POSIX definition; it is not fully POSIX-compatible, and in multi-unit encoding domains it is probably even less compatible. .P The descriptions below use the actual names of the functions, but, as described above, the standard POSIX names (without the \fBpcre2_\fP prefix) may also be used. . . .SH "COMPILING A PATTERN" .rs .sp The function \fBpcre2_regcomp()\fP is called to compile a pattern into an internal form. By default, the pattern is a C string terminated by a binary zero (but see REG_PEND below). The \fIpreg\fP argument is a pointer to a \fBregex_t\fP structure that is used as a base for storing information about the compiled regular expression. It is also used for input when REG_PEND is set. The \fBregex_t\fP structure used by \fBpcre2_regcomp()\fP is defined in \fBpcre2posix.h\fP and is not the same as the structure used by other libraries that provide POSIX-style matching. .P The argument \fIcflags\fP is either zero, or contains one or more of the bits defined by the following macros: .sp REG_DOTALL .sp The PCRE2_DOTALL option is set when the regular expression is passed for compilation to the native function. Note that REG_DOTALL is not part of the POSIX standard. 
.sp REG_ICASE .sp The PCRE2_CASELESS option is set when the regular expression is passed for compilation to the native function. .sp REG_NEWLINE .sp The PCRE2_MULTILINE option is set when the regular expression is passed for compilation to the native function. Note that this does \fInot\fP mimic the defined POSIX behaviour for REG_NEWLINE (see the following section). .sp REG_NOSPEC .sp The PCRE2_LITERAL option is set when the regular expression is passed for compilation to the native function. This disables all meta characters in the pattern, causing it to be treated as a literal string. The only other options that are allowed with REG_NOSPEC are REG_ICASE, REG_NOSUB, REG_PEND, and REG_UTF. Note that REG_NOSPEC is not part of the POSIX standard. .sp REG_NOSUB .sp When a pattern that is compiled with this flag is passed to \fBpcre2_regexec()\fP for matching, the \fInmatch\fP and \fIpmatch\fP arguments are ignored, and no captured strings are returned. Versions of the PCRE2 library prior to 10.22 used to set the PCRE2_NO_AUTO_CAPTURE compile option, but this no longer happens because it disables the use of backreferences. .sp REG_PEND .sp If this option is set, the \fBre_endp\fP field in the \fIpreg\fP structure (which has the type const char *) must be set to point to the character beyond the end of the pattern before calling \fBpcre2_regcomp()\fP. The pattern itself may now contain binary zeros, which are treated as data characters. Without REG_PEND, a binary zero terminates the pattern and the \fBre_endp\fP field is ignored. This is a GNU extension to the POSIX standard and should be used with caution in software intended to be portable to other systems. .sp REG_UCP .sp The PCRE2_UCP option is set when the regular expression is passed for compilation to the native function. This causes PCRE2 to use Unicode properties when matching \ed, \ew, etc., instead of just recognizing ASCII values. Note that REG_UCP is not part of the POSIX standard.
.sp REG_UNGREEDY .sp The PCRE2_UNGREEDY option is set when the regular expression is passed for compilation to the native function. Note that REG_UNGREEDY is not part of the POSIX standard. .sp REG_UTF .sp The PCRE2_UTF option is set when the regular expression is passed for compilation to the native function. This causes the pattern itself and all data strings used for matching it to be treated as UTF-8 strings. Note that REG_UTF is not part of the POSIX standard. .P In the absence of these flags, no options are passed to the native function. This means that the regex is compiled with PCRE2 default semantics. In particular, the way it handles newline characters in the subject string is the Perl way, not the POSIX way. Note that setting PCRE2_MULTILINE has only \fIsome\fP of the effects specified for REG_NEWLINE. It does not affect the way newlines are matched by the dot metacharacter (they are not) or by a negative class such as [^a] (they are). .P The yield of \fBpcre2_regcomp()\fP is zero on success, and non-zero otherwise. The \fIpreg\fP structure is filled in on success, and one other member of the structure (as well as \fIre_endp\fP) is public: \fIre_nsub\fP contains the number of capturing subpatterns in the regular expression. Various error codes are defined in the header file. .P NOTE: If the yield of \fBpcre2_regcomp()\fP is non-zero, you must not attempt to use the contents of the \fIpreg\fP structure. If, for example, you pass it to \fBpcre2_regexec()\fP, the result is undefined and your program is likely to crash. . . .SH "MATCHING NEWLINE CHARACTERS" .rs .sp This area is not simple, because POSIX and Perl take different views of things. It is not possible to get PCRE2 to obey POSIX semantics, but then PCRE2 was never intended to be a POSIX engine. The following table lists the different possibilities for matching newline characters in Perl and PCRE2: .sp Default Change with .sp . 
matches newline no PCRE2_DOTALL newline matches [^a] yes not changeable $ matches \en at end yes PCRE2_DOLLAR_ENDONLY $ matches \en in middle no PCRE2_MULTILINE ^ matches \en in middle no PCRE2_MULTILINE .sp This is the equivalent table for a POSIX-compatible pattern matcher: .sp Default Change with .sp . matches newline yes REG_NEWLINE newline matches [^a] yes REG_NEWLINE $ matches \en at end no REG_NEWLINE $ matches \en in middle no REG_NEWLINE ^ matches \en in middle no REG_NEWLINE .sp This behaviour is not what happens when PCRE2 is called via its POSIX API. By default, PCRE2's behaviour is the same as Perl's, except that there is no equivalent for PCRE2_DOLLAR_ENDONLY in Perl. In both PCRE2 and Perl, there is no way to stop newline from matching [^a]. .P Default POSIX newline handling can be obtained by setting PCRE2_DOTALL and PCRE2_DOLLAR_ENDONLY when calling \fBpcre2_compile()\fP directly, but there is no way to make PCRE2 behave exactly as for the REG_NEWLINE action. When using the POSIX API, passing REG_NEWLINE to PCRE2's \fBpcre2_regcomp()\fP function causes PCRE2_MULTILINE to be passed to \fBpcre2_compile()\fP, and REG_DOTALL passes PCRE2_DOTALL. There is no way to pass PCRE2_DOLLAR_ENDONLY. . . .SH "MATCHING A PATTERN" .rs .sp The function \fBpcre2_regexec()\fP is called to match a compiled pattern \fIpreg\fP against a given \fIstring\fP, which is by default terminated by a zero byte (but see REG_STARTEND below), subject to the options in \fIeflags\fP. These can be: .sp REG_NOTBOL .sp The PCRE2_NOTBOL option is set when calling the underlying PCRE2 matching function. .sp REG_NOTEMPTY .sp The PCRE2_NOTEMPTY option is set when calling the underlying PCRE2 matching function. Note that REG_NOTEMPTY is not part of the POSIX standard. However, setting this option can give more POSIX-like behaviour in some situations. .sp REG_NOTEOL .sp The PCRE2_NOTEOL option is set when calling the underlying PCRE2 matching function. 
.sp REG_STARTEND .sp When this option is set, the subject string starts at \fIstring\fP + \fIpmatch[0].rm_so\fP and ends at \fIstring\fP + \fIpmatch[0].rm_eo\fP, which should point to the first character beyond the string. There may be binary zeros within the subject string, and indeed, using REG_STARTEND is the only way to pass a subject string that contains a binary zero. .P Whatever the value of \fIpmatch[0].rm_so\fP, the offsets of the matched string and any captured substrings are still given relative to the start of \fIstring\fP itself. (Before PCRE2 release 10.30 these were given relative to \fIstring\fP + \fIpmatch[0].rm_so\fP, but this differs from other implementations.) .P This is a BSD extension, compatible with but not specified by IEEE Standard 1003.2 (POSIX.2), and should be used with caution in software intended to be portable to other systems. Note that a non-zero \fIrm_so\fP does not imply REG_NOTBOL; REG_STARTEND affects only the location and length of the string, not how it is matched. Setting REG_STARTEND and passing \fIpmatch\fP as NULL are mutually exclusive; the error REG_INVARG is returned. .P If the pattern was compiled with the REG_NOSUB flag, no data about any matched strings is returned. The \fInmatch\fP and \fIpmatch\fP arguments of \fBpcre2_regexec()\fP are ignored (except possibly as input for REG_STARTEND). .P The value of \fInmatch\fP may be zero, and the value \fIpmatch\fP may be NULL (unless REG_STARTEND is set); in both these cases no data about any matched strings is returned. .P Otherwise, the portion of the string that was matched, and also any captured substrings, are returned via the \fIpmatch\fP argument, which points to an array of \fInmatch\fP structures of type \fIregmatch_t\fP, containing the members \fIrm_so\fP and \fIrm_eo\fP. These contain the byte offset to the first character of each substring and the offset to the first character after the end of each substring, respectively. 
The 0th element of the vector relates to the entire portion of \fIstring\fP that was matched; subsequent elements relate to the capturing subpatterns of the regular expression. Unused entries in the array have both structure members set to -1. .P \fIregmatch_t\fP as well as the \fIregoff_t\fP typedef it uses are defined in \fBpcre2posix.h\fP and are not warranted to have the same size or layout as other similarly named types from other libraries that provide POSIX-style matching. .P A successful match yields a zero return; various error codes are defined in the header file, of which REG_NOMATCH is the "expected" failure code. . . .SH "ERROR MESSAGES" .rs .sp The \fBpcre2_regerror()\fP function maps a non-zero errorcode from either \fBpcre2_regcomp()\fP or \fBpcre2_regexec()\fP to a printable message. If \fIpreg\fP is not NULL, the error should have arisen from the use of that structure. A message terminated by a binary zero is placed in \fIerrbuf\fP. If the buffer is too short, only the first \fIerrbuf_size\fP - 1 characters of the error message are used. The yield of the function is the size of buffer needed to hold the whole message, including the terminating zero. This value is greater than \fIerrbuf_size\fP if the message was truncated. . . .SH MEMORY USAGE .rs .sp Compiling a regular expression causes memory to be allocated and associated with the \fIpreg\fP structure. The function \fBpcre2_regfree()\fP frees all such memory, after which \fIpreg\fP may no longer be used as a compiled expression. . . .SH AUTHOR .rs .sp .nf Philip Hazel Retired from University Computing Service Cambridge, England. .fi . . .SH REVISION .rs .sp .nf Last updated: 27 November 2024 Copyright (c) 1997-2024 University of Cambridge. 
.fi ================================================ FILE: doc/pcre2sample.3 ================================================ .TH PCRE2SAMPLE 3 "28 February 2025" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH "PCRE2 SAMPLE PROGRAM" .rs .sp A simple, complete demonstration program to get you started with using PCRE2 is supplied in the file \fIpcre2demo.c\fP in the \fBsrc\fP directory in the PCRE2 distribution. A listing of this program is given in the .\" HREF \fBpcre2demo\fP .\" documentation. If you do not have a copy of the PCRE2 distribution, you can save this listing to recreate the contents of \fIpcre2demo.c\fP. .P The demonstration program compiles the regular expression that is its first argument, and matches it against the subject string in its second argument. No PCRE2 options are set, and default character tables are used. If matching succeeds, the program outputs the portion of the subject that matched, together with the contents of any captured substrings. .P If the -g option is given on the command line, the program then goes on to check for further matches of the same regular expression in the same subject string. The logic is a little bit tricky because of the possibility of matching an empty string. Comments in the code explain what is going on. .P The code in \fBpcre2demo.c\fP is an 8-bit program that uses the PCRE2 8-bit library. It handles strings and characters that are stored in 8-bit code units. By default, one character corresponds to one code unit, but if the pattern starts with "(*UTF)", both it and the subject are treated as UTF-8 strings, where characters may occupy multiple code units. 
.P If PCRE2 is installed in the standard include and library directories for your operating system, you should be able to compile the demonstration program using a command like this: .sp cc -o pcre2demo pcre2demo.c -lpcre2-8 .sp If PCRE2 is installed elsewhere, you may need to add additional options to the command line. For example, on a Unix-like system that has PCRE2 installed in \fI/usr/local\fP, you can compile the demonstration program using a command like this: .sp .\" JOINSH cc -o pcre2demo -I/usr/local/include pcre2demo.c \e -L/usr/local/lib -lpcre2-8 .sp Once you have built the demonstration program, you can run simple tests like this: .sp ./pcre2demo 'cat|dog' 'the cat sat on the mat' ./pcre2demo -g 'cat|dog' 'the dog sat on the cat' ./pcre2demo -i 'cat' 'the dog sat on the CAT' .sp Note that there is a much more comprehensive test program, called .\" HREF \fBpcre2test\fP, .\" which supports many more facilities for testing regular expressions using all three PCRE2 libraries (8-bit, 16-bit, and 32-bit, though not all three need be installed). The .\" HREF \fBpcre2demo\fP .\" program is provided as a relatively simple coding example. .P If you try to run .\" HREF \fBpcre2demo\fP .\" when PCRE2 is not installed in the standard library directory, you may get an error like this on some operating systems (e.g. Solaris): .sp ld.so.1: pcre2demo: fatal: libpcre2-8.so.0: open failed: No such file or directory .sp This is caused by the way shared library support works on those systems. You need to add .sp -R/usr/local/lib .sp (for example) to the compile command to get round this problem. . . .SH AUTHOR .rs .sp .nf Philip Hazel Retired from University Computing Service Cambridge, England. .fi . . .SH REVISION .rs .sp .nf Last updated: 28 February 2025 Copyright (c) 1997-2016 University of Cambridge. 
.fi ================================================ FILE: doc/pcre2serialize.3 ================================================ .TH PCRE2SERIALIZE 3 "19 January 2024" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH "SAVING AND RE-USING PRECOMPILED PCRE2 PATTERNS" .rs .sp .nf .B int32_t pcre2_serialize_decode(pcre2_code **\fIcodes\fP, .B " int32_t \fInumber_of_codes\fP, const uint8_t *\fIbytes\fP," .B " pcre2_general_context *\fIgcontext\fP);" .sp .B int32_t pcre2_serialize_encode(const pcre2_code **\fIcodes\fP, .B " int32_t \fInumber_of_codes\fP, uint8_t **\fIserialized_bytes\fP," .B " PCRE2_SIZE *\fIserialized_size\fP, pcre2_general_context *\fIgcontext\fP);" .sp .B void pcre2_serialize_free(uint8_t *\fIbytes\fP); .sp .B int32_t pcre2_serialize_get_number_of_codes(const uint8_t *\fIbytes\fP); .fi .sp If you are running an application that uses a large number of regular expression patterns, it may be useful to store them in a precompiled form instead of having to compile them every time the application is run. However, if you are using the just-in-time optimization feature, it is not possible to save and reload the JIT data, because it is position-dependent. The host on which the patterns are reloaded must be running the same version of PCRE2, with the same code unit width, and must also have the same endianness, pointer width and PCRE2_SIZE type. For example, patterns compiled on a 32-bit system using PCRE2's 16-bit library cannot be reloaded on a 64-bit system, nor can they be reloaded using the 8-bit library. .P Note that "serialization" in PCRE2 does not convert compiled patterns to an abstract format like Java or .NET serialization. The serialized output is really just a bytecode dump, which is why it can only be reloaded in the same environment as the one that created it. Hence the restrictions mentioned above. 
Applications that are not statically linked with a fixed version of PCRE2 must be prepared to recompile patterns from their sources, in order to be immune to PCRE2 upgrades. . . .SH "SECURITY CONCERNS" .rs .sp The facility for saving and restoring compiled patterns is intended for use within individual applications. As such, the data supplied to \fBpcre2_serialize_decode()\fP is expected to be trusted data, not data from arbitrary external sources. There is only some simple consistency checking, not complete validation of what is being re-loaded. Corrupted data may cause undefined results. For example, if the length field of a pattern in the serialized data is corrupted, the deserializing code may read beyond the end of the byte stream that is passed to it. . . .SH "SAVING COMPILED PATTERNS" .rs .sp Before compiled patterns can be saved they must be serialized, which in PCRE2 means converting the pattern to a stream of bytes. A single byte stream may contain any number of compiled patterns, but they must all use the same character tables. A single copy of the tables is included in the byte stream (its size is 1088 bytes). For more details of character tables, see the .\" HTML .\" section on locale support .\" in the .\" HREF \fBpcre2api\fP .\" documentation. .P The function \fBpcre2_serialize_encode()\fP creates a serialized byte stream from a list of compiled patterns. Its first two arguments specify the list, being a pointer to a vector of pointers to compiled patterns, and the length of the vector. The third and fourth arguments point to variables which are set to point to the created byte stream and its length, respectively. The final argument is a pointer to a general context, which can be used to specify custom memory management functions. If this argument is NULL, \fBmalloc()\fP is used to obtain memory for the byte stream. 
The yield of the function is the number of serialized patterns, or one of the following negative error codes: .sp PCRE2_ERROR_BADDATA the number of patterns is zero or less PCRE2_ERROR_BADMAGIC mismatch of id bytes in one of the patterns PCRE2_ERROR_NOMEMORY memory allocation failed PCRE2_ERROR_MIXEDTABLES the patterns do not all use the same tables PCRE2_ERROR_NULL the 1st, 3rd, or 4th argument is NULL .sp PCRE2_ERROR_BADMAGIC means either that a pattern's code has been corrupted, or that a slot in the vector does not point to a compiled pattern. .P Once a set of patterns has been serialized you can save the data in any appropriate manner. Here is sample code that compiles two patterns and writes them to a file. It assumes that the variable \fIfd\fP refers to a file that is open for output. The error checking that should be present in a real application has been omitted for simplicity. .sp int errorcode; uint8_t *bytes; PCRE2_SIZE erroroffset; PCRE2_SIZE bytescount; pcre2_code *list_of_codes[2]; list_of_codes[0] = pcre2_compile("first pattern", PCRE2_ZERO_TERMINATED, 0, &errorcode, &erroroffset, NULL); list_of_codes[1] = pcre2_compile("second pattern", PCRE2_ZERO_TERMINATED, 0, &errorcode, &erroroffset, NULL); errorcode = pcre2_serialize_encode(list_of_codes, 2, &bytes, &bytescount, NULL); errorcode = fwrite(bytes, 1, bytescount, fd); .sp Note that the serialized data is binary data that may contain any of the 256 possible byte values. On systems that make a distinction between binary and non-binary data, be sure that the file is opened for binary output. .P Serializing a set of patterns leaves the original data untouched, so they can still be used for matching. Their memory must eventually be freed in the usual way by calling \fBpcre2_code_free()\fP. When you have finished with the byte stream, it too must be freed by calling \fBpcre2_serialize_free()\fP. If this function is called with a NULL argument, it returns immediately without doing anything. . . 
.SH "RE-USING PRECOMPILED PATTERNS" .rs .sp In order to re-use a set of saved patterns you must first make the serialized byte stream available in main memory (for example, by reading from a file). The management of this memory block is up to the application. You can use the \fBpcre2_serialize_get_number_of_codes()\fP function to find out how many compiled patterns are in the serialized data without actually decoding the patterns: .sp uint8_t *bytes = <serialized data>; int32_t number_of_codes = pcre2_serialize_get_number_of_codes(bytes); .sp The \fBpcre2_serialize_decode()\fP function reads a byte stream and recreates the compiled patterns in new memory blocks, setting pointers to them in a vector. The first two arguments are a pointer to a suitable vector and its length, and the third argument points to a byte stream. The final argument is a pointer to a general context, which can be used to specify custom memory management functions for the decoded patterns. If this argument is NULL, \fBmalloc()\fP and \fBfree()\fP are used. After deserialization, the byte stream is no longer needed and can be discarded. .sp pcre2_code *list_of_codes[2]; uint8_t *bytes = <serialized data>; int32_t number_of_codes = pcre2_serialize_decode(list_of_codes, 2, bytes, NULL); .sp If the vector is not large enough for all the patterns in the byte stream, it is filled with those that fit, and the remainder are ignored. The yield of the function is the number of decoded patterns, or one of the following negative error codes: .sp PCRE2_ERROR_BADDATA second argument is zero or less PCRE2_ERROR_BADMAGIC mismatch of id bytes in the data PCRE2_ERROR_BADMODE mismatch of code unit size or PCRE2 version PCRE2_ERROR_BADSERIALIZEDDATA other sanity check failure PCRE2_ERROR_NOMEMORY memory allocation failed PCRE2_ERROR_NULL first or third argument is NULL .sp PCRE2_ERROR_BADMAGIC may mean that the data is corrupt, or that it was compiled on a system with different endianness.
.P Decoded patterns can be used for matching in the usual way, and must be freed by calling \fBpcre2_code_free()\fP. However, be aware that there is a potential race issue if you are using multiple patterns that were decoded from a single byte stream in a multithreaded application. A single copy of the character tables is used by all the decoded patterns and a reference count is used to arrange for its memory to be automatically freed when the last pattern is freed, but there is no locking on this reference count. Therefore, if you want to call \fBpcre2_code_free()\fP for these patterns in different threads, you must arrange your own locking, and ensure that \fBpcre2_code_free()\fP cannot be called by two threads at the same time. .P If a pattern was processed by \fBpcre2_jit_compile()\fP before being serialized, the JIT data is discarded and so is no longer available after a save/restore cycle. You can, however, process a restored pattern with \fBpcre2_jit_compile()\fP if you wish. . . . .SH AUTHOR .rs .sp .nf Philip Hazel Retired from University Computing Service Cambridge, England. .fi . . .SH REVISION .rs .sp .nf Last updated: 19 January 2024 Copyright (c) 1997-2018 University of Cambridge. .fi ================================================ FILE: doc/pcre2syntax.3 ================================================ .TH PCRE2SYNTAX 3 "14 October 2025" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH "PCRE2 REGULAR EXPRESSION SYNTAX SUMMARY" .rs .sp The full syntax and semantics of the regular expression patterns that are supported by PCRE2 are described in the .\" HREF \fBpcre2pattern\fP .\" documentation. This document contains a quick-reference summary of the pattern syntax followed by the syntax of replacement strings in substitution function. The full description of the latter is in the .\" HREF \fBpcre2api\fP .\" documentation. . 
.SH "QUOTING" .rs .sp \ex where x is non-alphanumeric is a literal x \eQ...\eE treat enclosed characters as literal .sp Note that white space inside \eQ...\eE is always treated as literal, even if PCRE2_EXTENDED is set, causing most other white space to be ignored. Note also that PCRE2's handling of \eQ...\eE has some differences from Perl's. See the .\" HREF \fBpcre2pattern\fP .\" documentation for details. . . .SH "BRACED ITEMS" .rs .sp With one exception, wherever brace characters { and } are required to enclose data for constructions such as \eg{2} or \ek{name}, space and/or horizontal tab characters that follow { or precede } are allowed and are ignored. In the case of quantifiers, they may also appear before or after the comma. The exception is \eu{...} which is not Perl-compatible and is recognized only when PCRE2_EXTRA_ALT_BSUX is set. This is an ECMAScript compatibility feature, and follows ECMAScript's behaviour. . . .SH "ESCAPED CHARACTERS" .rs .sp This table applies to ASCII and Unicode environments. An unrecognized escape sequence causes an error. .sp \ea alarm, that is, the BEL character (hex 07) \ecx "control-x", where x is a non-control ASCII character \ee escape (hex 1B) \ef form feed (hex 0C) \en newline (hex 0A) \er carriage return (hex 0D) \et tab (hex 09) \e0dd character with octal code 0dd \eddd character with octal code ddd, or backreference \eo{ddd..} character with octal code ddd.. \eN{U+hh..} character with Unicode code point hh.. (Unicode mode only) \exhh character with hex code hh \ex{hh..} character with hex code hh.. .sp \eN{U+hh..} is synonymous with \ex{hh..} but is not supported in environments that use EBCDIC code (mainly IBM mainframes). Note that \eN not followed by an opening curly bracket has a different meaning (see below). 
.P If PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set ("ALT_BSUX mode"), the following are also recognized: .sp \eU the character "U" \euhhhh character with hex code hhhh \eu{hh..} character with hex code hh.. but only for EXTRA_ALT_BSUX .sp When \ex is not followed by {, one or two hexadecimal digits are read, but in ALT_BSUX mode \ex must be followed by two hexadecimal digits to be recognized as a hexadecimal escape; otherwise it matches a literal "x". Likewise, if \eu (in ALT_BSUX mode) is not followed by four hexadecimal digits or (in EXTRA_ALT_BSUX mode) a sequence of hex digits in curly brackets, it matches a literal "u". .P Note that \e0dd is always an octal code. The treatment of backslash followed by a non-zero digit is complicated; for details see the section .\" HTML .\" "Non-printing characters" .\" in the .\" HREF \fBpcre2pattern\fP .\" documentation, where details of escape processing in EBCDIC environments are also given. . . .SH "CHARACTER TYPES" .rs .sp . any character except newline; in dotall mode, any character whatsoever \eC one code unit, even in UTF mode (best avoided) \ed a decimal digit \eD a character that is not a decimal digit \eh a horizontal white space character \eH a character that is not a horizontal white space character \eN a character that is not a newline \ep{\fIxx\fP} a character with the \fIxx\fP property \eP{\fIxx\fP} a character without the \fIxx\fP property \eR a newline sequence \es a white space character \eS a character that is not a white space character \ev a vertical white space character \eV a character that is not a vertical white space character \ew a "word" character \eW a "non-word" character \eX a Unicode extended grapheme cluster .sp \eC is dangerous because it may leave the current matching point in the middle of a UTF-8 or UTF-16 character. The application can lock out the use of \eC by setting the PCRE2_NEVER_BACKSLASH_C option. It is also possible to build PCRE2 with the use of \eC permanently disabled. 
.P By default, \ed, \es, and \ew match only ASCII characters, even in UTF-8 mode or in the 16-bit and 32-bit libraries. However, if locale-specific matching is happening, \es and \ew may also match characters with code points in the range 128-255. If the PCRE2_UCP option is set, the behaviour of these escape sequences is changed to use Unicode properties and they match many more characters, but there are some option settings that can restrict individual sequences to matching only ASCII characters. .P Property descriptions in \ep and \eP are matched caselessly; hyphens, underscores, and ASCII white space characters are ignored, in accordance with Unicode's "loose matching" rules. For example, \ep{Bidi_Class=al} is the same as \ep{ bidi class = AL }. . . .SH "GENERAL CATEGORY PROPERTIES FOR \ep and \eP" .rs .sp C Other Cc Control Cf Format Cn Unassigned Co Private use Cs Surrogate .sp L Letter Lc Cased letter, the union of Ll, Lu, and Lt L& Synonym of Lc Ll Lower case letter Lm Modifier letter Lo Other letter Lt Title case letter Lu Upper case letter .sp M Mark Mc Spacing mark Me Enclosing mark Mn Non-spacing mark .sp N Number Nd Decimal number Nl Letter number No Other number .sp P Punctuation Pc Connector punctuation Pd Dash punctuation Pe Close punctuation Pf Final punctuation Pi Initial punctuation Po Other punctuation Ps Open punctuation .sp S Symbol Sc Currency symbol Sk Modifier symbol Sm Mathematical symbol So Other symbol .sp Z Separator Zl Line separator Zp Paragraph separator Zs Space separator .sp From release 10.45, when caseless matching is set, Ll, Lu, and Lt are all equivalent to Lc. . . 
.SH "PCRE2 SPECIAL CATEGORY PROPERTIES FOR \ep and \eP" .rs .sp Xan Alphanumeric: union of properties L and N Xps POSIX space: property Z or tab, NL, VT, FF, CR Xsp Perl space: property Z or tab, NL, VT, FF, CR Xuc Universally-named character: one that can be represented by a Universal Character Name Xwd Perl word: property Xan or underscore .sp Perl and POSIX space are now the same. Perl added VT to its space character set at release 5.18. . . .SH "BINARY PROPERTIES FOR \ep AND \eP" .rs .sp Unicode defines a number of binary properties, that is, properties whose only values are true or false. You can obtain a list of those that are recognized by \ep and \eP, along with their abbreviations, by running this command: .sp pcre2test -LP . . . .SH "SCRIPT MATCHING WITH \ep AND \eP" .rs .sp Many script names and their 4-letter abbreviations are recognized in \ep{sc:...} or \ep{scx:...} items, or on their own with \ep (and also \eP of course). You can obtain a list of these scripts by running this command: .sp pcre2test -LS . . . .SH "THE BIDI_CLASS PROPERTY FOR \ep AND \eP" .rs .sp \ep{Bidi_Class:<class>} matches a character with the given class \ep{BC:<class>} matches a character with the given class .sp The recognized classes are: .sp AL Arabic letter AN Arabic number B paragraph separator BN boundary neutral CS common separator EN European number ES European separator ET European terminator FSI first strong isolate L left-to-right LRE left-to-right embedding LRI left-to-right isolate LRO left-to-right override NSM non-spacing mark ON other neutral PDF pop directional format PDI pop directional isolate R right-to-left RLE right-to-left embedding RLI right-to-left isolate RLO right-to-left override S segment separator WS white space . . .SH "CHARACTER CLASSES" .rs .sp [...] positive character class [^...]
negative character class [x-y] range (can be used for hex characters) [[:xxx:]] positive POSIX named set [[:^xxx:]] negative POSIX named set .sp alnum alphanumeric alpha alphabetic ascii 0-127 blank space or tab cntrl control character digit decimal digit graph printing, excluding space lower lower case letter print printing, including space punct printing, excluding alphanumeric space white space upper upper case letter word same as \ew xdigit hexadecimal digit .sp In PCRE2, POSIX character set names recognize only ASCII characters by default, but some of them use Unicode properties if PCRE2_UCP is set. You can use \eQ...\eE inside a character class. .P When PCRE2_ALT_EXTENDED_CLASS is set, UTS#18 extended character classes may be used, allowing nested character classes, combined using set operators. .sp [x&&[^y]] UTS#18 extended character class .sp x||y set union (OR) x&&y set intersection (AND) x--y set difference (AND NOT) x~~y set symmetric difference (XOR) .sp . . .SH "PERL EXTENDED CHARACTER CLASSES" .rs .sp (?[...]) Perl extended character class (?[\ep{Thai} & \ep{Nd}]) operators; white space ignored (?[(x - y) & z]) parentheses for grouping .sp (?[ [^3] & \ep{Nd} ]) [...] is a nested ordinary class (?[ [:alpha:] - [z] ]) POSIX set is allowed outside [...] (?[ \ed - [3] ]) backslash-escaped set is allowed outside [...] (?[ !\en & [:ascii:] ]) backslash-escaped character is allowed outside [...] all other characters or ranges must be enclosed in [...] .sp x|y, x+y set union (OR) x&y set intersection (AND) x-y set difference (AND NOT) x^y set symmetric difference (XOR) !x set complement (NOT) .sp Inside a Perl extended character class, [...] switches mode to be interpreted as an ordinary character class. Outside of a nested [...], the only items permitted are backslash-escapes, POSIX sets, operators, and parentheses. 
Inside a nested ordinary class, ^ has its usual meaning (inverts the class when used as the first character); outside of a nested class, ^ is the XOR operator. . . .SH "QUANTIFIERS" .rs .sp ? 0 or 1, greedy ?+ 0 or 1, possessive ?? 0 or 1, lazy * 0 or more, greedy *+ 0 or more, possessive *? 0 or more, lazy + 1 or more, greedy ++ 1 or more, possessive +? 1 or more, lazy {n} exactly n {n,m} at least n, no more than m, greedy {n,m}+ at least n, no more than m, possessive {n,m}? at least n, no more than m, lazy {n,} n or more, greedy {n,}+ n or more, possessive {n,}? n or more, lazy {,m} zero up to m, greedy {,m}+ zero up to m, possessive {,m}? zero up to m, lazy . . .SH "ANCHORS AND SIMPLE ASSERTIONS" .rs .sp \eb word boundary \eB not a word boundary ^ start of subject also after an internal newline in multiline mode (after any newline if PCRE2_ALT_CIRCUMFLEX is set) \eA start of subject $ end of subject also before newline at end of subject also before internal newline in multiline mode \eZ end of subject also before newline at end of subject \ez end of subject \eG first matching position in subject . . .SH "REPORTED MATCH POINT SETTING" .rs .sp \eK set reported start of match .sp From release 10.38 \eK is not permitted by default in lookaround assertions, for compatibility with Perl. However, if the PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK option is set, the previous behaviour is re-enabled. When this option is set, \eK is honoured in positive assertions, but ignored in negative ones. . . .SH "ALTERNATION" .rs .sp expr|expr|expr... . . .SH "CAPTURING" .rs .sp (...) capture group (?<name>...) named capture group (Perl) (?'name'...) named capture group (Perl) (?P<name>...) named capture group (Python) (?:...) non-capture group (?|...) non-capture group; reset group numbers for capture groups in each alternative .sp In non-UTF modes, names may contain underscores and ASCII letters and digits; in UTF modes, any Unicode letters and Unicode decimal digits are permitted.
In both cases, a name must not start with a digit. . . .SH "ATOMIC GROUPS" .rs .sp (?>...) atomic non-capture group (*atomic:...) atomic non-capture group . . .SH "COMMENT" .rs .sp (?#....) comment (not nestable) . . .SH "OPTION SETTING" .rs Changes of these options within a group are automatically cancelled at the end of the group. .sp (?a) all ASCII options (?aD) restrict \ed to ASCII in UCP mode (?aS) restrict \es to ASCII in UCP mode (?aW) restrict \ew to ASCII in UCP mode (?aP) restrict all POSIX classes to ASCII in UCP mode (?aT) restrict POSIX digit classes to ASCII in UCP mode (?i) caseless (?J) allow duplicate named groups (?m) multiline (?n) no auto capture (?r) restrict caseless to either ASCII or non-ASCII (?s) single line (dotall) (?U) default ungreedy (lazy) (?x) ignore white space except in classes or \eQ...\eE (?xx) as (?x) but also ignore space and tab in classes (?-...) unset the given option(s) (?^) unset imnrsx options .sp (?aP) implies (?aT) as well, though this has no additional effect. However, it means that (?-aP) also implies (?-aT) and disables all ASCII restrictions for POSIX classes. .P Unsetting x or xx unsets both. Several options may be set at once, and a mixture of setting and unsetting such as (?i-x) is allowed, but there may be only one hyphen. Setting (but no unsetting) is allowed after (?^ for example (?^in). An option setting may appear at the start of a non-capture group, for example (?i:...). .P The following are recognized only at the very start of a pattern or after one of the newline or \eR sequences or options with similar syntax. More than one of them may appear. For the first three, d is a decimal number. 
.sp (*LIMIT_DEPTH=d) set the backtracking limit to d (*LIMIT_HEAP=d) set the heap size limit to d * 1024 bytes (*LIMIT_MATCH=d) set the match limit to d (*CASELESS_RESTRICT) set PCRE2_EXTRA_CASELESS_RESTRICT when matching (*NOTEMPTY) set PCRE2_NOTEMPTY when matching (*NOTEMPTY_ATSTART) set PCRE2_NOTEMPTY_ATSTART when matching (*NO_AUTO_POSSESS) no auto-possessification (PCRE2_NO_AUTO_POSSESS) (*NO_DOTSTAR_ANCHOR) no .* anchoring (PCRE2_NO_DOTSTAR_ANCHOR) (*NO_JIT) disable JIT optimization (*NO_START_OPT) no start-match optimization (PCRE2_NO_START_OPTIMIZE) (*TURKISH_CASING) set PCRE2_EXTRA_TURKISH_CASING when matching (*UTF) set appropriate UTF mode for the library in use (*UCP) set PCRE2_UCP (use Unicode properties for \ed etc) .sp Note that LIMIT_DEPTH, LIMIT_HEAP, and LIMIT_MATCH can only reduce the value of the limits set by the caller of \fBpcre2_match()\fP or \fBpcre2_dfa_match()\fP, not increase them. LIMIT_RECURSION is an obsolete synonym for LIMIT_DEPTH. The application can lock out the use of (*UTF) and (*UCP) by setting the PCRE2_NEVER_UTF or PCRE2_NEVER_UCP options, respectively, at compile time. . . .SH "NEWLINE CONVENTION" .rs .sp These are recognized only at the very start of the pattern or after option settings with a similar syntax. .sp (*CR) carriage return only (*LF) linefeed only (*CRLF) carriage return followed by linefeed (*ANYCRLF) all three of the above (*ANY) any Unicode newline sequence (*NUL) the NUL character (binary zero) . . .SH "WHAT \eR MATCHES" .rs .sp These are recognized only at the very start of the pattern or after option setting with a similar syntax. .sp (*BSR_ANYCRLF) CR, LF, or CRLF (*BSR_UNICODE) any Unicode newline sequence . . .SH "LOOKAHEAD AND LOOKBEHIND ASSERTIONS" .rs .sp (?=...) ) (*pla:...) ) positive lookahead (*positive_lookahead:...) ) .sp (?!...) ) (*nla:...) ) negative lookahead (*negative_lookahead:...) ) .sp (?<=...) ) (*plb:...) ) positive lookbehind (*positive_lookbehind:...) ) .sp (? name 'name' name . . 
.SH "SCRIPT RUNS" .rs .sp (*script_run:...) ) script run, can be backtracked into (*sr:...) ) .sp (*atomic_script_run:...) ) atomic script run (*asr:...) ) . . .SH "BACKREFERENCES" .rs .sp \en reference by number (can be ambiguous) \egn reference by number \eg{n} reference by number \eg+n relative reference by number (PCRE2 extension) \eg-n relative reference by number \eg{+n} relative reference by number (PCRE2 extension) \eg{-n} relative reference by number \ek<name> reference by name (Perl) \ek'name' reference by name (Perl) \eg{name} reference by name (Perl) \ek{name} reference by name (.NET) (?P=name) reference by name (Python) . . .SH "SUBROUTINE REFERENCES (POSSIBLY RECURSIVE)" .rs .sp (?R) recurse whole pattern (?n) call subroutine by absolute number (?+n) call subroutine by relative number (?-n) call subroutine by relative number (?&name) call subroutine by name (Perl) (?P>name) call subroutine by name (Python) \eg<name> call subroutine by name (Oniguruma) \eg'name' call subroutine by name (Oniguruma) \eg<n> call subroutine by absolute number (Oniguruma) \eg'n' call subroutine by absolute number (Oniguruma) \eg<+n> call subroutine by relative number (PCRE2 extension) \eg'+n' call subroutine by relative number (PCRE2 extension) \eg<-n> call subroutine by relative number (PCRE2 extension) \eg'-n' call subroutine by relative number (PCRE2 extension) .sp The variants using parentheses (?...) may also specify a list of capture groups to return, which shall be retained in the calling subexpression if set during the recursion (this feature is not supported by Perl).
.sp (?R(grouplist)) recurse whole pattern, returning capture groups (PCRE2 extension) (?n(grouplist)) ) (?+n(grouplist)) ) call subroutine, returning capture groups (?-n(grouplist)) ) (PCRE2 extension) (?&name(grouplist)) ) (?P>name(grouplist)) ) .sp The comma-separated list "grouplist" uses the same syntax as (*scan_substring:(grouplist)...), and may identify groups in any of the following ways: .sp n absolute reference +n relative reference -n relative reference <name> name 'name' name . . .SH "CONDITIONAL PATTERNS" .rs .sp (?(condition)yes-pattern) (?(condition)yes-pattern|no-pattern) .sp (?(n) absolute reference condition (?(+n) relative reference condition (PCRE2 extension) (?(-n) relative reference condition (PCRE2 extension) (?(<name>) named reference condition (Perl) (?('name') named reference condition (Perl) (?(name) named reference condition (PCRE2, deprecated) (?(R) overall recursion condition (?(Rn) specific numbered group recursion condition (?(R&name) specific named group recursion condition (?(DEFINE) define groups for reference (?(VERSION[>]=n[.m]) test PCRE2 version (?(assert) assertion condition .sp Note the ambiguity of (?(R) and (?(Rn) which might be named reference conditions or recursion tests. Such a condition is interpreted as a reference condition if the relevant named group exists. .sp The parts within brackets for the VERSION conditional syntax can be omitted. The fractional part of the version number defaults to 0 in that case. . .SH "BACKTRACKING CONTROL" .rs .sp All backtracking control verbs may be in the form (*VERB:NAME). For (*MARK) the name is mandatory, for the others it is optional. (*SKIP) changes its behaviour if :NAME is present. The others just set a name for passing back to the caller, but this is not a name that (*SKIP) can see.
The following act immediately they are reached: .sp (*ACCEPT) force successful match (*FAIL) force backtrack; synonym (*F) (*MARK:NAME) set name to be passed back; synonym (*:NAME) .sp The following act only when a subsequent match failure causes a backtrack to reach them. They all force a match failure, but they differ in what happens afterwards. Those that advance the start-of-match point do so only if the pattern is not anchored. .sp (*COMMIT) overall failure, no advance of starting point (*PRUNE) advance to next starting character (*SKIP) advance to current matching position (*SKIP:NAME) advance to position corresponding to an earlier (*MARK:NAME); if not found, the (*SKIP) is ignored (*THEN) local failure, backtrack to next alternation .sp The effect of one of these verbs in a group called as a subroutine is confined to the subroutine call. . . .SH "CALLOUTS" .rs .sp (?C) callout (assumed number 0) (?Cn) callout with numerical data n (?C"text") callout with string data .sp The allowed string delimiters are ` ' " ^ % # $ (which are the same for the start and the end), and the starting delimiter { matched with the ending delimiter }. To encode the ending delimiter within the string, double it. . . .SH "REPLACEMENT STRINGS" .rs .sp If the PCRE2_SUBSTITUTE_LITERAL option is set, a replacement string for \fBpcre2_substitute()\fP is not interpreted. Otherwise, by default, the only special character is the dollar character in one of the following forms: .sp $$ insert a dollar character $n or ${n} insert the contents of group \fIn\fP $<name> insert the contents of named group $0 or $& insert the entire matched substring $` insert the substring that precedes the match $' insert the substring that follows the match $_ insert the entire input string $+ insert the highest-numbered capture group which matched $*MARK or ${*MARK} insert a control verb name .sp For ${n}, n can be a name or a number. If PCRE2_SUBSTITUTE_EXTENDED is set, there is additional interpretation: .P 1.
Backslash is an escape character, and the forms described in "ESCAPED CHARACTERS" above are recognized. Also: .sp \eQ...\eE can be used to suppress interpretation \el force the next character to lower case \eu force the next character to upper case \eL force subsequent characters to lower case \eU force subsequent characters to upper case \eu\eL force next character to upper case, then all lower \el\eU force next character to lower case, then all upper \eE end \eL or \eU case forcing \eb backspace character (note: as in character class in pattern) \ev vertical tab character (note: not the same as in a pattern) .sp 2. The Python form \eg<n>, where the angle brackets are part of the syntax and \fIn\fP is either a group name or a number, is recognized as an alternative way of inserting the contents of a group, for example \eg<3>. .P 3. Capture substitution supports the following additional forms: .sp ${n:-string} default for unset group ${n:+string1:string2} values for set/unset group .sp The substitution strings themselves are expanded. Backslash can be used to escape colons and closing curly brackets. . . .SH "SEE ALSO" .rs .sp \fBpcre2pattern\fP(3), \fBpcre2api\fP(3), \fBpcre2callout\fP(3), \fBpcre2matching\fP(3), \fBpcre2\fP(3). . . .SH AUTHOR .rs .sp .nf Philip Hazel Retired from University Computing Service Cambridge, England. .fi . . .SH REVISION .rs .sp .nf Last updated: 14 October 2025 Copyright (c) 1997-2024 University of Cambridge. .fi ================================================ FILE: doc/pcre2test.1 ================================================ .TH PCRE2TEST 1 "12 October 2025" "PCRE2 10.48-DEV" .SH NAME pcre2test - a program for testing Perl-compatible regular expressions. .SH SYNOPSIS .rs .sp .B pcre2test "[options] [input file [output file]]" .sp \fBpcre2test\fP is a test program for the PCRE2 regular expression libraries, but it can also be used for experimenting with regular expressions.
This document describes the features of the test program; for details of the regular expressions themselves, see the .\" HREF \fBpcre2pattern\fP .\" documentation. For details of the PCRE2 library function calls and their options, see the .\" HREF \fBpcre2api\fP .\" documentation. .P The input for \fBpcre2test\fP is a sequence of regular expression patterns and subject strings to be matched. There are also command lines for setting defaults and controlling some special actions. The output shows the result of each match attempt. Modifiers on external or internal command lines, the patterns, and the subject lines specify PCRE2 function options, control how the subject is processed, and what output is produced. .P There are many obscure modifiers, some of which are specifically designed for use in conjunction with the test script and data files that are distributed as part of PCRE2. All the modifiers are documented here, some without much justification, but many of them are unlikely to be of use except when testing the libraries. . . .SH "PCRE2's 8-BIT, 16-BIT AND 32-BIT LIBRARIES" .rs .sp Different versions of the PCRE2 library can be built to support character strings that are encoded in 8-bit, 16-bit, or 32-bit code units. One, two, or all three of these libraries may be simultaneously installed. The \fBpcre2test\fP program can be used to test all the libraries. However, its own input and output are always in 8-bit format. When testing the 16-bit or 32-bit libraries, patterns and subject strings are converted to 16-bit or 32-bit format before being passed to the library functions. Results are converted back to 8-bit code units for output. .P In the rest of this document, the names of library functions and structures are given in generic form, for example, \fBpcre2_compile()\fP. The actual names used in the libraries have a suffix _8, _16, or _32, as appropriate. . . 
.\" HTML .SH "INPUT ENCODING" .rs .sp Input to \fBpcre2test\fP is processed line by line, either by calling the C library's \fBfgets()\fP function, or via the \fBlibreadline\fP or \fBlibedit\fP library. In some Windows environments character 26 (hex 1A) causes an immediate end of file, and no further data is read, so this character should be avoided unless you really want that action. .P The input is processed using C's string functions, so must not contain binary zeros, even though in Unix-like environments, \fBfgets()\fP treats any bytes other than newline as data characters. An error is generated if a binary zero is encountered. By default subject lines are processed for backslash escapes, which makes it possible to include any data value in strings that are passed to the library for matching. For patterns, there is a facility for specifying some or all of the 8-bit input characters as hexadecimal pairs, which makes it possible to include binary zeros. . . .SS "Input for the 16-bit and 32-bit libraries" .rs .sp When testing the 16-bit or 32-bit libraries, there is a need to be able to generate character code points greater than 255 in the strings that are passed to the library. For subject lines and some patterns, backslash escapes can be used. In addition, when the \fButf\fP modifier (see .\" HTML .\" "Setting compilation options" .\" below) is set, the pattern and any following subject lines are interpreted as UTF-8 strings and translated to UTF-16 or UTF-32 as appropriate. .P For non-UTF testing of wide characters, the \fButf8_input\fP modifier can be used. This is mutually exclusive with \fButf\fP, and is allowed only in 16-bit or 32-bit mode. It causes the pattern and following subject lines to be treated as UTF-8 according to the original definition (RFC 2279), which allows for character values up to 0x7fffffff. Each character is placed in one 16-bit or 32-bit code unit (in the 16-bit case, values greater than 0xffff cause an error to occur). 
.P UTF-8 (in its original definition) is not capable of encoding values greater than 0x7fffffff, but such values can be handled by the 32-bit library. When testing this library in non-UTF mode with \fButf8_input\fP set, if any character is preceded by the byte 0xff (which is an invalid byte in UTF-8) 0x80000000 is added to the character's value. For subject strings, using an escape sequence is preferable. . . .SH "COMMAND LINE OPTIONS" .rs .TP 10 \fB-8\fP If the 8-bit library has been built, this option causes it to be used (this is the default). If the 8-bit library has not been built, this option causes an error. .TP 10 \fB-16\fP If the 16-bit library has been built, this option causes it to be used. If the 8-bit library has not been built, this is the default. If the 16-bit library has not been built, this option causes an error. .TP 10 \fB-32\fP If the 32-bit library has been built, this option causes it to be used. If no other library has been built, this is the default. If the 32-bit library has not been built, this option causes an error. .TP 10 \fB-ac\fP Behave as if each pattern has the \fBauto_callout\fP modifier, that is, insert automatic callouts into every pattern that is compiled. .TP 10 \fB-AC\fP As for \fB-ac\fP, but in addition behave as if each subject line has the \fBcallout_extra\fP modifier, that is, show additional information from callouts. .TP 10 \fB-b\fP Behave as if each pattern has the \fBfullbincode\fP modifier; the full internal binary form of the pattern is output after compilation. .TP 10 \fB-C\fP Output the version number of the PCRE2 library, and all available information about the optional features that are included, and then exit with zero exit code. All other options are ignored. If both -C and -LM are present, whichever is first is recognized. .TP 10 \fB-C\fP \fIoption\fP Output information about a specific build-time option, then exit. This functionality is intended for use in scripts such as \fBRunTest\fP. 
The following options output the value and set the exit code as indicated: .sp linksize the configured internal link size (2, 3, or 4) exit code is set to the link size newline the default newline setting: CR, LF, CRLF, ANYCRLF, ANY, or NUL exit code is always 0 bsr the default setting for what \eR matches: ANYCRLF or ANY exit code is always 0 .sp The following options output 1 for true or 0 for false, and set the exit code to the same value: .sp backslash-C \eC is supported (not locked out) ebcdic compiled for an EBCDIC environment ebcdic-io if PCRE2 is compiled for EBCDIC, whether pcre2test's input and output is EBCDIC or ASCII ebcdic-nl25 if PCRE2 is compiled for EBCDIC, whether NL (= LF) is 0x25 (otherwise it is 0x15, the default) jit just-in-time support is available pcre2-16 the 16-bit library was built pcre2-32 the 32-bit library was built pcre2-8 the 8-bit library was built unicode Unicode support is available .sp Note that the availability of JIT support in the library does not guarantee that it can actually be used because in some environments it is unable to allocate executable memory. The option "jitusable" gives more detailed information. It returns one of the following values: .sp 0 JIT is available and usable 1 JIT is available but cannot allocate executable memory 2 JIT is not available 3 Unexpected return from test call to \fBpcre2_jit_compile()\fP .sp If an unknown option is given, an error message is output; the exit code is 0. .TP 10 \fB--colo[u]r[=]\fP By default, the output is coloured if the output file is a terminal (\fBauto\fP). Force or suppress output of ANSI colour escapes with \fBalways\fP and \fBnever\fP respectively. .TP 10 \fB-d\fP Behave as if each pattern has the \fBdebug\fP modifier; the internal form and information about the compiled pattern is output after compilation; \fB-d\fP is equivalent to \fB-b -i\fP. 
.TP 10 \fB-dfa\fP Behave as if each subject line has the \fBdfa\fP modifier; matching is done using the \fBpcre2_dfa_match()\fP function instead of the default \fBpcre2_match()\fP. .TP 10 \fB-E\fP Run in "preprocess only" mode (similar to "gcc -E"). The "#if ... #endif" commands are processed, and all other lines are printed verbatim. .TP 10 \fB-error\fP \fInumber[,number,...]\fP Call \fBpcre2_get_error_message()\fP for each of the error numbers in the comma-separated list, display the resulting messages on the standard output, then exit with zero exit code. The numbers may be positive or negative. This is a convenience facility for PCRE2 maintainers. .TP 10 \fB-help\fP Output a brief summary of these options and then exit. .TP 10 \fB-i\fP Behave as if each pattern has the \fBinfo\fP modifier; information about the compiled pattern is given after compilation. .TP 10 \fB-jit\fP Behave as if each pattern line has the \fBjit\fP modifier; after successful compilation, each pattern is passed to the just-in-time compiler, if available. .TP 10 \fB-jitfast\fP Behave as if each pattern line has the \fBjitfast\fP modifier; after successful compilation, each pattern is passed to the just-in-time compiler, if available, and each subject line is passed directly to the JIT matcher via its "fast path". .TP 10 \fB-jitverify\fP Behave as if each pattern line has the \fBjitverify\fP modifier; after successful compilation, each pattern is passed to the just-in-time compiler, if available, and the use of JIT for matching is verified. .TP 10 \fB-LM\fP List modifiers: write a list of available pattern and subject modifiers to the standard output, then exit with zero exit code. All other options are ignored. If both -C and any -Lx options are present, whichever is first is recognized. .TP 10 \fB-LP\fP List properties: write a list of recognized Unicode properties to the standard output, then exit with zero exit code. All other options are ignored. 
If both -C and any -Lx options are present, whichever is first is recognized. .TP 10 \fB-LS\fP List scripts: write a list of recognized Unicode script names to the standard output, then exit with zero exit code. All other options are ignored. If both -C and any -Lx options are present, whichever is first is recognized. .TP 10 \fB-malloc\fP Exercise malloc() failures, by first counting the number of calls made to malloc during pattern compilation and matching, then re-running the compilation and matching that many times, exercising a failure of each malloc() call. .TP 10 \fB-pattern\fP \fImodifier-list\fP Behave as if each pattern line contains the given modifiers. .TP 10 \fB-q\fP Do not output the version number of \fBpcre2test\fP at the start of execution. .TP 10 \fB-S\fP \fIsize\fP On Unix-like systems, set the size of the run-time stack to \fIsize\fP mebibytes (units of 1024*1024 bytes). .TP 10 \fB-subject\fP \fImodifier-list\fP Behave as if each subject line contains the given modifiers. .TP 10 \fB-t\fP Run each compile and match many times with a timer, and output the resulting times per compile or match. When JIT is used, separate times are given for the initial compile and the JIT compile. You can control the number of iterations that are used for timing by following \fB-t\fP with a number (as a separate item on the command line). For example, "-t 1000" iterates 1000 times. The default is to iterate 500,000 times. .TP 10 \fB-tm\fP This is like \fB-t\fP except that it times only the matching phase, not the compile phase. .TP 10 \fB-T\fP \fB-TM\fP These behave like \fB-t\fP and \fB-tm\fP, but in addition, at the end of a run, the total times for all compiles and matches are output. .TP 10 \fB-unittest\fP Run a fixed set of additional tests of the PCRE2 API which are not driven by the test input files, and then exit. .TP 10 \fB-version\fP Output the PCRE2 version number and then exit. . . 
.SH "DESCRIPTION" .rs .sp If \fBpcre2test\fP is given two filename arguments, it reads from the first and writes to the second. If the first name is "-", input is taken from the standard input. If \fBpcre2test\fP is given only one argument, it reads from that file and writes to stdout. Otherwise, it reads from stdin and writes to stdout. .P When \fBpcre2test\fP is built, a configuration option can specify that it should be linked with the \fBlibreadline\fP or \fBlibedit\fP library. When this is done, if the input is from a terminal, it is read using the \fBreadline()\fP function. This provides line-editing and history facilities. The output from the \fB-help\fP option states whether or not \fBreadline()\fP will be used. .P The program handles any number of tests, each of which consists of a set of input lines. Each set starts with a regular expression pattern, followed by any number of subject lines to be matched against that pattern. In between sets of test data, command lines that begin with # may appear. This file format, with some restrictions, can also be processed by the \fBperltest.sh\fP script that is distributed with PCRE2 as a means of checking that the behaviour of PCRE2 and Perl is the same. For a specification of \fBperltest.sh\fP, see the comments near its beginning. See also the #perltest command below. .P When the input is a terminal, \fBpcre2test\fP prompts for each line of input, using "re>" to prompt for regular expression patterns, and "data>" to prompt for subject lines. Command lines starting with # can be entered only in response to the "re>" prompt. .P Each subject line is matched separately and independently. If you want to do multi-line matches, you have to use the \en escape sequence (or \er or \er\en, etc., depending on the newline setting) in a single line of input to encode the newline sequences. There is no limit on the length of subject lines; the input buffer is automatically extended if it is too small. 
There are replication features that make it possible to generate long repetitive pattern or subject lines without having to supply them explicitly. .P An empty line or the end of the file signals the end of the subject lines for a test, at which point a new pattern or command line is expected if there is still input to be read. . . .SH "COMMAND LINES" .rs .sp In between sets of test data, a line that begins with # is interpreted as a command line. If the first character is followed by white space or an exclamation mark, the line is treated as a comment, and ignored. Otherwise, the following commands are recognized: .sp #forbid_utf .sp Subsequent patterns automatically have the PCRE2_NEVER_UTF and PCRE2_NEVER_UCP options set, which locks out the use of the PCRE2_UTF and PCRE2_UCP options and the use of (*UTF) and (*UCP) at the start of patterns. This command also forces an error if a subsequent pattern contains any occurrences of \eP, \ep, or \eX, which are still supported when PCRE2_UTF is not set, but which require Unicode property support to be included in the library. .P This is a trigger guard that is used in test files to ensure that UTF or Unicode property tests are not accidentally added to files that are used when Unicode support is not included in the library. Setting PCRE2_NEVER_UTF and PCRE2_NEVER_UCP as a default can also be obtained by the use of \fB#pattern\fP; the difference is that \fB#forbid_utf\fP cannot be unset, and the automatic options are not displayed in pattern information, to avoid cluttering up test output. .sp #load .sp This command is used to load a set of precompiled patterns from a file, as described in the section entitled "Saving and restoring compiled patterns" .\" HTML .\" below. .\" .sp #loadtables .sp This command is used to load a set of binary character tables that can be accessed by the tables=3 qualifier. Such tables can be created by the \fBpcre2_dftables\fP program with the -b option. 
.sp #newline_default [] .sp When PCRE2 is built, a default newline convention can be specified. This determines which characters and/or character pairs are recognized as indicating a newline in a pattern or subject string. The default can be overridden when a pattern is compiled. The standard test files contain tests of various newline conventions, but the majority of the tests expect a single linefeed to be recognized as a newline by default. Without special action the tests would fail when PCRE2 is compiled with either CR or CRLF as the default newline. .P The #newline_default command specifies a list of newline types that are acceptable as the default. The types must be one of CR, LF, CRLF, ANYCRLF, ANY, or NUL (in upper or lower case), for example: .sp #newline_default LF Any anyCRLF .sp If the default newline is in the list, this command has no effect. Otherwise, except when testing the POSIX API, a \fBnewline\fP modifier that specifies the first newline convention in the list (LF in the above example) is added to any pattern that does not already have a \fBnewline\fP modifier. If the newline list is empty, the feature is turned off. This command is present in a number of the standard test input files. .P When the POSIX API is being tested there is no way to override the default newline convention, though it is possible to set the newline convention from within the pattern. A warning is given if the \fBposix\fP or \fBposix_nosub\fP modifier is used when \fB#newline_default\fP would set a default for the non-POSIX API. .sp #pattern .sp This command sets a default modifier list that applies to all subsequent patterns. Modifiers on a pattern can change these settings. .sp #perltest .sp This line is used in test files that can also be processed by \fBperltest.sh\fP to confirm that Perl gives the same results as PCRE2. Subsequent tests are checked for the use of \fBpcre2test\fP features that are incompatible with the \fBperltest.sh\fP script. 
.P Patterns must use '/' as their delimiter, and only certain modifiers are supported. Comment lines, #pattern commands, and #subject commands that set or unset "mark" are recognized and acted on. The #perltest, #forbid_utf, and #newline_default commands, which are needed in the relevant pcre2test files, are silently ignored. All other command lines are ignored, but give a warning message. The \fB#perltest\fP command helps detect tests that are accidentally put in the wrong file or use the wrong delimiter. For more details of the \fBperltest.sh\fP script see the comments it contains. .sp #pop [] #popcopy [] .sp These commands are used to manipulate the stack of compiled patterns, as described in the section entitled "Saving and restoring compiled patterns" .\" HTML .\" below. .\" .sp #save .sp This command is used to save a set of compiled patterns to a file, as described in the section entitled "Saving and restoring compiled patterns" .\" HTML .\" below. .\" .sp #subject .sp This command sets a default modifier list that applies to all subsequent subject lines. Modifiers on a subject line can change these settings. .sp #if CONDITION ... #endif .sp If CONDITION is true, then the command is printed, and its contents are processed as normal, including printing the commandlines to the output. If CONDITION is false, then all lines between the "#if" and "#endif" are skipped and not printed. The CONDITION can be any of the conditions which are tested by the "-C" commandline option and which set pcre2test's exit code to a boolean value. The CONDITION may also be preceded by "!". . . .SH "MODIFIER SYNTAX" .rs .sp Modifier lists are used with both pattern and subject lines. Items in a list are separated by commas followed by optional white space. Trailing white space in a modifier list is ignored. Some modifiers may be given for both patterns and subject lines, whereas others are valid only for one or the other. 
Each modifier has a long name, for example "anchored", and some of them must be followed by an equals sign and a value, for example, "offset=12". Values cannot contain comma characters, but may contain spaces. Modifiers that do not take values may be preceded by a minus sign to turn off a previous setting. .P A few of the more common modifiers can also be specified as single letters, for example "i" for "caseless". In documentation, following the Perl convention, these are written with a slash ("the /i modifier") for clarity. Abbreviated modifiers must all be concatenated in the first item of a modifier list. If the first item is not recognized as a long modifier name, it is interpreted as a sequence of these abbreviations. For example: .sp /abc/ig,newline=cr,jit=3 .sp This is a pattern line whose modifier list starts with two one-letter modifiers (/i and /g). The lower-case abbreviated modifiers are the same as used in Perl. . . .SH "PATTERN SYNTAX" .rs .sp A pattern line must start with one of the following characters (common symbols, excluding pattern meta-characters): .sp / ! " ' ` - = _ : ; , % & @ ~ .sp This is interpreted as the pattern's delimiter. A regular expression may be continued over several input lines, in which case the newline characters are included within it. It is possible to include the delimiter as a literal within the pattern by escaping it with a backslash, for example .sp /abc\e/def/ .sp If you do this, the escape and the delimiter form part of the pattern, but since the delimiters are all non-alphanumeric, the inclusion of the backslash does not affect the pattern's interpretation. Note, however, that this trick does not work within \eQ...\eE literal bracketing because the backslash will itself be interpreted as a literal. If the terminating delimiter is immediately followed by a backslash, for example, .sp /abc/\e .sp a backslash is added to the end of the pattern. 
This is done to provide a way of testing the error condition that arises if a pattern finishes with a backslash, because .sp /abc\e/ .sp is interpreted as the first line of a pattern that starts with "abc/", causing pcre2test to read the next line as a continuation of the regular expression. .P A pattern can be followed by a modifier list (details below). . . .SH "SUBJECT LINE SYNTAX" .rs .sp Before each subject line is passed to \fBpcre2_match()\fP, \fBpcre2_dfa_match()\fP, or \fBpcre2_jit_match()\fP, leading and trailing white space is removed, and the line is scanned for backslash escapes, unless the \fBsubject_literal\fP modifier was set for the pattern. The following provide a means of encoding non-printing characters in a visible way: .sp \ea alarm (BEL, \ex07) \eb backspace (\ex08) \ee escape (\ex1b) \ef form feed (\ex0c) \en newline (\ex0a) \eN{U+hh...} unicode character (any number of hex digits) \er carriage return (\ex0d) \et tab (\ex09) \ev vertical tab (\ex0b) \eddd octal number (up to 3 octal digits); represent a single code point unless larger than 255 with the 8-bit library \eo{dd...} octal number (any number of octal digits) representing a character in UTF mode or a code point \exhh hexadecimal byte (up to 2 hex digits) \ex{hh...} hexadecimal number (up to 8 hex digits) representing a character in UTF mode or a code point .sp Invoking \eN{U+hh...} or \ex{hh...} doesn't require the use of the \fButf\fP modifier on the pattern. It is always recognized. There may be any number of hexadecimal digits inside the braces; invalid values provoke error messages but when using \eN{U+hh...} with some invalid unicode characters they will be accepted with a warning instead. .P Note that even in UTF-8 mode, \exhh (and depending on how large, \eddd) describe one byte rather than one character; this makes it possible to construct invalid UTF-8 sequences for testing purposes. 
On the other hand, \ex{hh...} is interpreted as a UTF-8 character in UTF-8 mode, only generating more than one byte if the value is greater than 127. To avoid the ambiguity it is preferred to use \eN{U+hh...} when describing characters. When testing the 8-bit library not in UTF-8 mode, \ex{hh} generates one byte for values that could fit on it, and causes an error for greater values. .P When testing the 16-bit library, not in UTF-16 mode, all 4-digit \ex{hhhh} values are accepted. This makes it possible to construct invalid UTF-16 sequences for testing purposes. .P When testing the 32-bit library, not in UTF-32 mode, all 4 to 8-digit \ex{...} values are accepted. This makes it possible to construct invalid UTF-32 sequences for testing purposes. .P There is a special backslash sequence that specifies replication of one or more characters: .sp \e[]{} .sp This makes it possible to test long strings without having to provide them as part of the file. For example: .sp \e[abc]{4} .sp is converted to "abcabcabcabc". This feature does not support nesting. To include a closing square bracket in the characters, code it as \ex5D. .P A backslash followed by an equals sign marks the end of the subject string and the start of a modifier list. For example: .sp abc\e=notbol,notempty .sp If the subject string is empty and \e= is followed by white space, the line is treated as a comment line, and is not used for matching. For example: .sp \e= This is a comment. abc\e= This is an invalid modifier list. .sp A backslash followed by any other non-alphanumeric character just escapes that character. A backslash followed by anything else causes an error. However, if the very last character in the line is a backslash (and there is no modifier list), it is ignored. This gives a way of passing an empty line as data, since a real empty line terminates the data input. 
.P If the \fBsubject_literal\fP modifier is set for a pattern, all subject lines that follow are treated as literals, with no special treatment of backslashes. No replication is possible, and any subject modifiers must be set as defaults by a \fB#subject\fP command. . . .SH "PATTERN MODIFIERS" .rs .sp There are several types of modifier that can appear in pattern lines. Except where noted below, they may also be used in \fB#pattern\fP commands. A pattern's modifier list can add to or override default modifiers that were set by a previous \fB#pattern\fP command. . . .\" HTML .SS "Setting compilation options" .rs .sp The following modifiers set options for \fBpcre2_compile()\fP. Most of them set bits in the options argument of that function, but those whose names start with PCRE2_EXTRA are additional options that are set in the compile context. Some of these options have single-letter abbreviations. There is special handling for /x: if a second x is present, PCRE2_EXTENDED is converted into PCRE2_EXTENDED_MORE as in Perl. A third appearance adds PCRE2_EXTENDED as well, though this makes no difference to the way \fBpcre2_compile()\fP behaves. See .\" HREF \fBpcre2api\fP .\" for a description of the effects of these options. 
.sp allow_empty_class set PCRE2_ALLOW_EMPTY_CLASS allow_lookaround_bsk set PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK allow_surrogate_escapes set PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES alt_bsux set PCRE2_ALT_BSUX alt_circumflex set PCRE2_ALT_CIRCUMFLEX alt_extended_class set PCRE2_ALT_EXTENDED_CLASS alt_verbnames set PCRE2_ALT_VERBNAMES anchored set PCRE2_ANCHORED /a ascii_all set all ASCII options ascii_bsd set PCRE2_EXTRA_ASCII_BSD ascii_bss set PCRE2_EXTRA_ASCII_BSS ascii_bsw set PCRE2_EXTRA_ASCII_BSW ascii_digit set PCRE2_EXTRA_ASCII_DIGIT ascii_posix set PCRE2_EXTRA_ASCII_POSIX auto_callout set PCRE2_AUTO_CALLOUT bad_escape_is_literal set PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL /i caseless set PCRE2_CASELESS /r caseless_restrict set PCRE2_EXTRA_CASELESS_RESTRICT dollar_endonly set PCRE2_DOLLAR_ENDONLY /s dotall set PCRE2_DOTALL dupnames set PCRE2_DUPNAMES endanchored set PCRE2_ENDANCHORED escaped_cr_is_lf set PCRE2_EXTRA_ESCAPED_CR_IS_LF /x extended set PCRE2_EXTENDED /xx extended_more set PCRE2_EXTENDED_MORE extra_alt_bsux set PCRE2_EXTRA_ALT_BSUX firstline set PCRE2_FIRSTLINE literal set PCRE2_LITERAL match_line set PCRE2_EXTRA_MATCH_LINE match_invalid_utf set PCRE2_MATCH_INVALID_UTF match_unset_backref set PCRE2_MATCH_UNSET_BACKREF match_word set PCRE2_EXTRA_MATCH_WORD /m multiline set PCRE2_MULTILINE never_backslash_c set PCRE2_NEVER_BACKSLASH_C never_callout set PCRE2_EXTRA_NEVER_CALLOUT never_ucp set PCRE2_NEVER_UCP never_utf set PCRE2_NEVER_UTF /n no_auto_capture set PCRE2_NO_AUTO_CAPTURE no_auto_possess set PCRE2_NO_AUTO_POSSESS no_bs0 set PCRE2_EXTRA_NO_BS0 no_dotstar_anchor set PCRE2_NO_DOTSTAR_ANCHOR no_start_optimize set PCRE2_NO_START_OPTIMIZE no_utf_check set PCRE2_NO_UTF_CHECK python_octal set PCRE2_EXTRA_PYTHON_OCTAL turkish_casing set PCRE2_EXTRA_TURKISH_CASING ucp set PCRE2_UCP ungreedy set PCRE2_UNGREEDY use_offset_limit set PCRE2_USE_OFFSET_LIMIT utf set PCRE2_UTF .sp As well as turning on the PCRE2_UTF option, the \fButf\fP modifier causes all non-printing 
characters in output strings to be printed using the \ex{hh...} notation. Otherwise, those less than 0x100 are output in hex without the curly brackets. Setting \fButf\fP in 16-bit or 32-bit mode also causes pattern and subject strings to be translated to UTF-16 or UTF-32, respectively, before being passed to library functions. .sp The following modifiers enable or disable performance optimizations by calling \fBpcre2_set_optimize()\fP before invoking the regex compiler. .sp optimization_full enable all optional optimizations optimization_none disable all optional optimizations auto_possess auto-possessify variable quantifiers auto_possess_off don't auto-possessify variable quantifiers dotstar_anchor anchor patterns starting with .* dotstar_anchor_off don't anchor patterns starting with .* start_optimize enable pre-scan of subject string start_optimize_off disable pre-scan of subject string .sp See the .\" HREF \fBpcre2_set_optimize\fP .\" documentation for details on these optimizations. . . .\" HTML .SS "Setting compilation controls" .rs .sp The following modifiers affect the compilation process or request information about the pattern. There are single-letter abbreviations for some that are heavily used in the test files. 
.sp /B bincode show binary code without lengths bsr=[anycrlf|unicode] specify \eR handling callout_info show callout information convert= request foreign pattern conversion convert_glob_escape=c set glob escape character convert_glob_separator=c set glob separator character convert_length set convert buffer length debug same as info,fullbincode expand expand repetition syntax in pattern framesize show matching frame size fullbincode show binary code with lengths /I info show info about compiled pattern hex unquoted characters are hexadecimal jit[=] use JIT jitfast use JIT fast path jitverify verify JIT use locale= use this locale max_pattern_compiled ) set maximum compiled pattern _length= ) length (bytes) max_pattern_length= set maximum pattern length (code units) max_varlookbehind= set maximum variable lookbehind length memory show memory used newline= set newline type null_context compile with a NULL context null_pattern pass pattern as NULL parens_nest_limit= set maximum parentheses depth posix use the POSIX API posix_nosub use the POSIX API with REG_NOSUB push push compiled pattern onto the stack pushcopy push a copy onto the stack pushtablescopy push a copy with tables onto the stack stackguard= test the stackguard feature subject_literal treat all subject lines as literal tables=[0|1|2|3] select internal tables use_length do not zero-terminate the pattern utf8_input treat input as UTF-8 .sp The effects of these modifiers are described in the following sections. . . .SS "Newline and \eR handling" .rs .sp The \fBbsr\fP modifier specifies what \eR in a pattern should match. If it is set to "anycrlf", \eR matches CR, LF, or CRLF only. If it is set to "unicode", \eR matches any Unicode newline sequence. The default can be specified when PCRE2 is built; if it is not, the default is set to Unicode. .P The \fBnewline\fP modifier specifies which characters are to be interpreted as newlines, both in the pattern and in subject lines. 
The type must be one of CR, LF, CRLF, ANYCRLF, ANY, or NUL (in upper or lower case). . . .SS "Information about a pattern" .rs .sp The \fBdebug\fP modifier is a shorthand for \fBinfo,fullbincode\fP, requesting all available information. .P The \fBbincode\fP modifier causes a representation of the compiled code to be output after compilation. This information does not contain length and offset values, which ensures that the same output is generated for different internal link sizes and different code unit widths. By using \fBbincode\fP, the same regression tests can be used in different environments. .P The \fBfullbincode\fP modifier, by contrast, \fIdoes\fP include length and offset values. This is used in a few special tests that run only for specific code unit widths and link sizes, and is also useful for one-off tests. .P The \fBinfo\fP modifier requests information about the compiled pattern (whether it is anchored, has a fixed first character, and so on). The information is obtained from the \fBpcre2_pattern_info()\fP function. Here are some typical examples: .sp re> /(?i)(^a|^b)/m,info Capture group count = 1 Compile options: multiline Overall options: caseless multiline First code unit at start or follows newline Subject length lower bound = 1 .sp re> /(?i)abc/info Capture group count = 0 Compile options: Overall options: caseless First code unit = 'a' (caseless) Last code unit = 'c' (caseless) Subject length lower bound = 3 .sp "Compile options" are those specified by modifiers; "overall options" have added options that are taken or deduced from the pattern. If both sets of options are the same, just a single "options" line is output; if there are no options, the line is omitted. "First code unit" is where any match must start; if there is more than one they are listed as "starting code units". "Last code unit" is the last literal code unit that must be present in any match. This is not necessarily the last character. 
These lines are omitted if no starting or ending code units are recorded. The subject length line is omitted when \fBno_start_optimize\fP is set because the minimum length is not calculated when it can never be used. .P The \fBframesize\fP modifier shows the size, in bytes, of each storage frame used by \fBpcre2_match()\fP for handling backtracking. The size depends on the number of capturing parentheses in the pattern. A vector of these frames is used at matching time; its overall size is shown when the \fBheapframes_size\fP subject modifier is set. .P The \fBcallout_info\fP modifier requests information about all the callouts in the pattern. A list of them is output at the end of any other information that is requested. For each callout, either its number or string is given, followed by the item that follows it in the pattern. . . .SS "Passing a NULL context" .rs .sp Normally, \fBpcre2test\fP passes a context block to \fBpcre2_compile()\fP. If the \fBnull_context\fP modifier is set, however, NULL is passed. This is for testing that \fBpcre2_compile()\fP behaves correctly in this case (it uses default values). . . .SS "Passing a NULL pattern" .rs .sp The \fBnull_pattern\fP modifier is for testing the behaviour of \fBpcre2_compile()\fP when the pattern argument is NULL. The length value passed is the default PCRE2_ZERO_TERMINATED unless \fBuse_length\fP is set. Any length other than zero causes an error. . . .SS "Specifying pattern characters in hexadecimal" .rs .sp The \fBhex\fP modifier specifies that the characters of the pattern, except for substrings enclosed in single or double quotes, are to be interpreted as pairs of hexadecimal digits. This feature is provided as a way of creating patterns that contain binary zeros and other non-printing characters. White space is permitted between pairs of digits. For example, this pattern contains three characters: .sp /ab 32 59/hex .sp Parts of such a pattern are taken literally if quoted.
This pattern contains nine characters, only two of which are specified in hexadecimal: .sp /ab "literal" 32/hex .sp Either single or double quotes may be used. There is no way of including the delimiter within a substring. The \fBhex\fP and \fBexpand\fP modifiers are mutually exclusive. . . .SS "Specifying the pattern's length" .rs .sp By default, patterns are passed to the compiling functions as zero-terminated strings but can be passed by length instead of being zero-terminated. The \fBuse_length\fP modifier causes this to happen. Using a length happens automatically (whether or not \fBuse_length\fP is set) when \fBhex\fP is set, because patterns specified in hexadecimal may contain binary zeros. .P If \fBhex\fP or \fBuse_length\fP is used with the POSIX wrapper API (see .\" HTML .\" "Using the POSIX wrapper API" .\" below), the REG_PEND extension is used to pass the pattern's length. . . .SS "Specifying a maximum for variable lookbehinds" .rs .sp Variable lookbehind assertions are supported only if, for each one, there is a maximum length (in characters) that it can match. There is a limit on this, whose default can be set at build time, with an ultimate default of 255. The \fBmax_varlookbehind\fP modifier uses the \fBpcre2_set_max_varlookbehind()\fP function to change the limit. Lookbehinds whose branches each match a fixed length are limited to 65535 characters per branch. . . .SS "Specifying wide characters in 16-bit and 32-bit modes" .rs .sp In 16-bit and 32-bit modes, all input is automatically treated as UTF-8 and translated to UTF-16 or UTF-32 when the \fButf\fP modifier is set. For testing the 16-bit and 32-bit libraries in non-UTF mode, the \fButf8_input\fP modifier can be used. It is mutually exclusive with \fButf\fP. Input lines are interpreted as UTF-8 as a means of specifying wide characters. More details are given in .\" HTML .\" "Input encoding" .\" above. . . 
.SS "Generating long repetitive patterns" .rs .sp Some tests use long patterns that are very repetitive. Instead of creating a very long input line for such a pattern, you can use a special repetition feature, similar to the one described for subject lines above. If the \fBexpand\fP modifier is present on a pattern, parts of the pattern that have the form .sp \e[<characters>]{<count>} .sp are expanded before the pattern is passed to \fBpcre2_compile()\fP. For example, \e[AB]{6000} is expanded to "ABAB..." 6000 times. This construction cannot be nested. An initial "\e[" sequence is recognized only if "]{" followed by decimal digits and "}" is found later in the pattern. If not, the characters remain in the pattern unaltered. The \fBexpand\fP and \fBhex\fP modifiers are mutually exclusive. .P If part of an expanded pattern looks like an expansion, but is really part of the actual pattern, unwanted expansion can be avoided by giving two values in the quantifier. For example, \e[AB]{6000,6000} is not recognized as an expansion item. .P If the \fBinfo\fP modifier is set on an expanded pattern, the result of the expansion is included in the information that is output. . . .SS "JIT compilation" .rs .sp Just-in-time (JIT) compiling is a heavyweight optimization that can greatly speed up pattern matching. See the .\" HREF \fBpcre2jit\fP .\" documentation for details. JIT compiling happens, optionally, after a pattern has been successfully compiled into an internal form. The JIT compiler converts this to optimized machine code. It needs to know whether the match-time options PCRE2_PARTIAL_HARD and PCRE2_PARTIAL_SOFT are going to be used, because different code is generated for the different cases. See the \fBpartial\fP modifier in "Subject Modifiers" .\" HTML .\" below .\" for details of how these options are specified for each match attempt. .P JIT compilation is requested by the \fBjit\fP pattern modifier, which may optionally be followed by an equals sign and a number in the range 0 to 7.
The three bits that make up the number specify which of the three JIT operating modes are to be compiled: .sp 1 compile JIT code for non-partial matching 2 compile JIT code for soft partial matching 4 compile JIT code for hard partial matching .sp The possible values for the \fBjit\fP modifier are therefore: .sp 0 disable JIT 1 normal matching only 2 soft partial matching only 3 normal and soft partial matching 4 hard partial matching only 5 normal and hard partial matching 6 soft and hard partial matching only 7 all three modes .sp If no number is given, 7 is assumed. The phrase "partial matching" means a call to \fBpcre2_match()\fP with either the PCRE2_PARTIAL_SOFT or the PCRE2_PARTIAL_HARD option set. Note that such a call may return a complete match; the options enable the possibility of a partial match, but do not require it. Note also that if you request JIT compilation only for partial matching (for example, jit=2) but do not set the \fBpartial\fP modifier on a subject line, that match will not use JIT code because none was compiled for non-partial matching. .P If JIT compilation is successful, the compiled JIT code will automatically be used when an appropriate type of match is run, except when incompatible run-time options are specified. For more details, see the .\" HREF \fBpcre2jit\fP .\" documentation. See also the \fBjitstack\fP modifier below for a way of setting the size of the JIT stack. .P If the \fBjitfast\fP modifier is specified, matching is done using the JIT "fast path" interface, \fBpcre2_jit_match()\fP, which skips some of the sanity checks that are done by \fBpcre2_match()\fP, and of course does not work when JIT is not supported. If \fBjitfast\fP is specified without \fBjit\fP, jit=7 is assumed. .P If the \fBjitverify\fP modifier is specified, information about the compiled pattern shows whether JIT compilation was or was not successful. If \fBjitverify\fP is specified without \fBjit\fP, jit=7 is assumed.
If JIT compilation is successful when \fBjitverify\fP is set, the text "(JIT)" is added to the first output line after a match or non match when JIT-compiled code was actually used in the match. . . .SS "Setting a locale" .rs .sp The \fBlocale\fP modifier must specify the name of a locale, for example: .sp /pattern/locale=fr_FR .sp The given locale is set, \fBpcre2_maketables()\fP is called to build a set of character tables for the locale, and this is then passed to \fBpcre2_compile()\fP when compiling the regular expression. The same tables are used when matching the following subject lines. The \fBlocale\fP modifier applies only to the pattern on which it appears, but can be given in a \fB#pattern\fP command if a default is needed. Setting a locale and alternate character tables are mutually exclusive. . . .SS "Showing pattern memory" .rs .sp The \fBmemory\fP modifier causes the size in bytes of the memory used to hold the compiled pattern to be output. This does not include the size of the \fBpcre2_code\fP block; it is just the actual compiled data. If the pattern is subsequently passed to the JIT compiler, the size of the JIT compiled code is also output. Here is an example: .sp re> /a(b)c/jit,memory Memory allocation (code space): 21 Memory allocation (JIT code): 1910 .sp . . .SS "Limiting nested parentheses" .rs .sp The \fBparens_nest_limit\fP modifier sets a limit on the depth of nested parentheses in a pattern. Breaching the limit causes a compilation error. The default for the library is set when PCRE2 is built, but \fBpcre2test\fP sets its own default of 220, which is required for running the standard test suite. . . .SS "Limiting the pattern length" .rs .sp The \fBmax_pattern_length\fP modifier sets a limit, in code units, to the length of pattern that \fBpcre2_compile()\fP will accept. Breaching the limit causes a compilation error. The default is the largest number a PCRE2_SIZE variable can hold (essentially unlimited). . . 
.SS "Limiting the size of a compiled pattern" .rs .sp The \fBmax_pattern_compiled_length\fP modifier sets a limit, in bytes, to the amount of memory used by a compiled pattern. Breaching the limit causes a compilation error. The default is the largest number a PCRE2_SIZE variable can hold (essentially unlimited). . . .\" HTML .SS "Using the POSIX wrapper API" .rs .sp The \fBposix\fP and \fBposix_nosub\fP modifiers cause \fBpcre2test\fP to call PCRE2 via the POSIX wrapper API rather than its native API. When \fBposix_nosub\fP is used, the POSIX option REG_NOSUB is passed to \fBregcomp()\fP. The POSIX wrapper supports only the 8-bit library. Note that it does not imply POSIX matching semantics; for more detail see the .\" HREF \fBpcre2posix\fP .\" documentation. The following pattern modifiers set options for the \fBregcomp()\fP function: .sp caseless REG_ICASE multiline REG_NEWLINE dotall REG_DOTALL ) ungreedy REG_UNGREEDY ) These options are not part of ucp REG_UCP ) the POSIX standard utf REG_UTF8 ) .sp The \fBregerror_buffsize\fP modifier specifies a size for the error buffer that is passed to \fBregerror()\fP in the event of a compilation error. For example: .sp /abc/posix,regerror_buffsize=20 .sp This provides a means of testing the behaviour of \fBregerror()\fP when the buffer is too small for the error message. If this modifier has not been set, a large buffer is used. .P The \fBaftertext\fP and \fBallaftertext\fP subject modifiers work as described below. All other modifiers are either ignored, with a warning message, or cause an error. .P The pattern is passed to \fBregcomp()\fP as a zero-terminated string by default, but if the \fBuse_length\fP or \fBhex\fP modifiers are set, the REG_PEND extension is used to pass it by length. . . 
.SS "Testing the stack guard feature" .rs .sp The \fBstackguard\fP modifier is used to test the use of \fBpcre2_set_compile_recursion_guard()\fP, a function that is provided to enable stack availability to be checked during compilation (see the .\" HREF \fBpcre2api\fP .\" documentation for details). If the number specified by the modifier is greater than zero, \fBpcre2_set_compile_recursion_guard()\fP is called to set up callback from \fBpcre2_compile()\fP to a local function. The argument it receives is the current nesting parenthesis depth; if this is greater than the value given by the modifier, non-zero is returned, causing the compilation to be aborted. . . .SS "Using alternative character tables" .rs .sp The value specified for the \fBtables\fP modifier must be one of the digits 0, 1, 2, or 3. It causes a specific set of built-in character tables to be passed to \fBpcre2_compile()\fP. This is used in the PCRE2 tests to check behaviour with different character tables. The digit specifies the tables as follows: .sp 0 do not pass any special character tables 1 the default ASCII tables, as distributed in pcre2_chartables.c.dist 2 a set of tables defining ISO 8859 characters 3 a set of tables loaded by the #loadtables command .sp In tables 2, some characters whose codes are greater than 128 are identified as letters, digits, spaces, etc. Tables 3 can be used only after a \fB#loadtables\fP command has loaded them from a binary file. Setting alternate character tables and a locale are mutually exclusive. . . .SS "Setting certain match controls" .rs .sp The following modifiers are really subject modifiers, and are described under "Subject Modifiers" below. However, they may be included in a pattern's modifier list, in which case they are applied to every subject line that is processed with that pattern. These modifiers do not affect the compilation process. 
.sp aftertext show text after match allaftertext show text after captures allcaptures show all captures allvector show the entire ovector allusedtext show all consulted text altglobal alternative global matching /g global global matching heapframes_size show match data heapframes size jitstack= set size of JIT stack mark show mark values null_substitute_match_data substitute with NULL match data replace= specify a replacement string startchar show starting character when relevant substitute_callout use substitution callouts substitute_case_callout use substitution case callouts substitute_extended use PCRE2_SUBSTITUTE_EXTENDED substitute_literal use PCRE2_SUBSTITUTE_LITERAL substitute_matched use PCRE2_SUBSTITUTE_MATCHED substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH substitute_replacement_only use PCRE2_SUBSTITUTE_REPLACEMENT_ONLY substitute_skip= skip substitution substitute_stop= skip substitution and following substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET substitute_unset_empty use PCRE2_SUBSTITUTE_UNSET_EMPTY .sp These modifiers may not appear in a \fB#pattern\fP command. If you want them as defaults, set them in a \fB#subject\fP command. . . .SS "Specifying literal subject lines" .rs .sp If the \fBsubject_literal\fP modifier is present on a pattern, all the subject lines that it matches are taken as literal strings, with no interpretation of backslashes. It is not possible to set subject modifiers on such lines, but any that are set as defaults by a \fB#subject\fP command are recognized. . . .SS "Saving a compiled pattern" .rs .sp When a pattern with the \fBpush\fP modifier is successfully compiled, it is pushed onto a stack of compiled patterns, and \fBpcre2test\fP expects the next line to contain a new pattern (or a command) instead of a subject line. This facility is used when saving compiled patterns to a file, as described in the section entitled "Saving and restoring compiled patterns" .\" HTML .\" below. 
.\" If \fBpushcopy\fP is used instead of \fBpush\fP, a copy of the compiled pattern is stacked, leaving the original as current, ready to match the following input lines. This provides a way of testing the \fBpcre2_code_copy()\fP function. .\" The \fBpush\fP and \fBpushcopy\fP modifiers are incompatible with modifiers such as \fBglobal\fP that act at match time. Any that are specified are ignored (for the stacked copy), with a warning message, except for \fBreplace\fP, which causes an error. Note that \fBjitverify\fP, which is allowed, does not carry through to any subsequent matching that uses a stacked pattern. . . .SS "Testing foreign pattern conversion" .rs .sp The experimental foreign pattern conversion functions in PCRE2 can be tested by setting the \fBconvert\fP modifier. Its argument is a colon-separated list of options, which set the equivalent option for the \fBpcre2_pattern_convert()\fP function: .sp glob PCRE2_CONVERT_GLOB glob_no_starstar PCRE2_CONVERT_GLOB_NO_STARSTAR glob_no_wild_separator PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR posix_basic PCRE2_CONVERT_POSIX_BASIC posix_extended PCRE2_CONVERT_POSIX_EXTENDED unset Unset all options .sp The "unset" value is useful for turning off a default that has been set by a \fB#pattern\fP command. When one of these options is set, the input pattern is passed to \fBpcre2_pattern_convert()\fP. If the conversion is successful, the result is reflected in the output and then passed to \fBpcre2_compile()\fP. The normal \fButf\fP and \fBno_utf_check\fP options, if set, cause the PCRE2_CONVERT_UTF and PCRE2_CONVERT_NO_UTF_CHECK options to be passed to \fBpcre2_pattern_convert()\fP. .P By default, the conversion function is allowed to allocate a buffer for its output. However, if the \fBconvert_length\fP modifier is set to a value greater than zero, \fBpcre2test\fP passes a buffer of the given length. This makes it possible to test the length check.
.P The \fBconvert_glob_escape\fP and \fBconvert_glob_separator\fP modifiers can be used to specify the escape and separator characters for glob processing, overriding the defaults, which are operating-system dependent. . . .\" HTML .SH "SUBJECT MODIFIERS" .rs .sp The modifiers that can appear in subject lines and the \fB#subject\fP command are of two types. . . .SS "Setting match options" .rs .sp The following modifiers set options for \fBpcre2_match()\fP or \fBpcre2_dfa_match()\fP. See .\" HREF \fBpcre2api\fP .\" for a description of their effects. .sp anchored set PCRE2_ANCHORED copy_matched_subject set PCRE2_COPY_MATCHED_SUBJECT endanchored set PCRE2_ENDANCHORED dfa_restart set PCRE2_DFA_RESTART dfa_shortest set PCRE2_DFA_SHORTEST disable_recurseloop_check set PCRE2_DISABLE_RECURSELOOP_CHECK no_jit set PCRE2_NO_JIT no_utf_check set PCRE2_NO_UTF_CHECK notbol set PCRE2_NOTBOL notempty set PCRE2_NOTEMPTY notempty_atstart set PCRE2_NOTEMPTY_ATSTART noteol set PCRE2_NOTEOL partial_hard (or ph) set PCRE2_PARTIAL_HARD partial_soft (or ps) set PCRE2_PARTIAL_SOFT .sp The partial matching modifiers are provided with abbreviations because they appear frequently in tests. .P If the \fBposix\fP or \fBposix_nosub\fP modifier was present on the pattern, causing the POSIX wrapper API to be used, the only option-setting modifiers that have any effect are \fBnotbol\fP, \fBnotempty\fP, and \fBnoteol\fP, causing REG_NOTBOL, REG_NOTEMPTY, and REG_NOTEOL, respectively, to be passed to \fBregexec()\fP. The other modifiers are ignored, with a warning message. .P There is one additional modifier that can be used with the POSIX wrapper. It is ignored (with a warning) if used for non-POSIX matching. .sp posix_startend=[:] .sp This causes the subject string to be passed to \fBregexec()\fP using the REG_STARTEND option, which uses offsets to specify which part of the string is searched. If only one number is given, the end offset is passed as the end of the subject string. 
For more detail of REG_STARTEND, see the .\" HREF \fBpcre2posix\fP .\" documentation. If the subject string contains binary zeros (coded as escapes such as \ex{00} because \fBpcre2test\fP does not support actual binary zeros in its input), you must use \fBposix_startend\fP to specify its length. . . .SS "Setting match controls" .rs .sp The following modifiers affect the matching process or request additional information. Some of them may also be specified on a pattern line (see above), in which case they apply to every subject line that is matched against that pattern, but can be overridden by modifiers on the subject. .sp aftertext show text after match allaftertext show text after captures allcaptures show all captures allusedtext show all consulted text (non-JIT only) allvector show the entire ovector altglobal alternative global matching callout_capture show captures at callout time callout_data= set a value to pass via callouts callout_error=[:] control callout error callout_extra show extra callout information callout_fail=[:] control callout failure callout_no_where do not show position of a callout callout_none do not supply a callout function copy= copy captured substring depth_limit= set a depth limit dfa use \fBpcre2_dfa_match()\fP find_limits find heap, match and depth limits find_limits_noheap find match and depth limits get= extract captured substring getall extract all captured substrings /g global global matching heapframes_size show match data heapframes size heap_limit= set a limit on heap memory (Kbytes) jitstack= set size of JIT stack mark show mark values match_limit= set a match limit memory show heap memory usage null_context match with a NULL context null_replacement substitute with NULL replacement null_subject match with NULL subject null_substitute_match_data substitute with NULL match data offset= set starting offset offset_limit= set offset limit ovector= set size of output vector recursion_limit= obsolete synonym for depth_limit 
replace= specify a replacement string startchar show startchar when relevant startoffset= same as offset= substitute_callout use substitution callouts substitute_case_callout use substitution case callouts substitute_extended use PCRE2_SUBSTITUTE_EXTENDED substitute_literal use PCRE2_SUBSTITUTE_LITERAL substitute_matched use PCRE2_SUBSTITUTE_MATCHED substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH substitute_replacement_only use PCRE2_SUBSTITUTE_REPLACEMENT_ONLY substitute_skip= skip substitution number n substitute_stop= skip substitution number n and greater substitute_subject= specify a different subject for substitution substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET substitute_unset_empty use PCRE2_SUBSTITUTE_UNSET_EMPTY zero_terminate pass the subject as zero-terminated .sp The effects of these modifiers are described in the following sections. When matching via the POSIX wrapper API, the \fBaftertext\fP, \fBallaftertext\fP, and \fBovector\fP subject modifiers work as described below. All other modifiers are either ignored, with a warning message, or cause an error. . . .SS "Showing more text" .rs .sp The \fBaftertext\fP modifier requests that as well as outputting the part of the subject string that matched the entire pattern, \fBpcre2test\fP should in addition output the remainder of the subject string. This is useful for tests where the subject contains multiple copies of the same substring. The \fBallaftertext\fP modifier requests the same action for captured substrings as well as the main matched substring. In each case the remainder is output on the following line with a plus character following the capture number. .P The \fBallusedtext\fP modifier requests that all the text that was consulted during a successful pattern match by the interpreter should be shown, for both full and partial matches. This feature is not supported for JIT matching, and if requested with JIT it is ignored (with a warning message). 
Setting this modifier affects the output if there is a lookbehind at the start of a match, or, for a complete match, a lookahead at the end, or if \eK is used in the pattern. Characters that precede or follow the start and end of the actual match are indicated in the output by '<' or '>' characters underneath them. Here is an example: .sp re> /(?<=pqr)abc(?=xyz)/ data> 123pqrabcxyz456\e=allusedtext 0: pqrabcxyz <<< >>> data> 123pqrabcxy\e=ph,allusedtext Partial match: pqrabcxy <<< .sp The first, complete match shows that the matched string is "abc", with the preceding and following strings "pqr" and "xyz" having been consulted during the match (when processing the assertions). The partial match can indicate only the preceding string. .P The \fBstartchar\fP modifier requests that the starting character for the match be indicated, if it is different to the start of the matched string. The only time when this occurs is when \eK has been processed as part of the match. In this situation, the output for the matched string is displayed from the starting character instead of from the match point, with circumflex characters under the earlier characters. For example: .sp re> /abc\eKxyz/ data> abcxyz\e=startchar 0: abcxyz ^^^ .sp Unlike \fBallusedtext\fP, the \fBstartchar\fP modifier can be used with JIT. However, these two modifiers are mutually exclusive. . . .SS "Showing the value of all capture groups" .rs .sp The \fBallcaptures\fP modifier requests that the values of all potential captured parentheses be output after a match. By default, only those up to the highest one actually used in the match are output (corresponding to the return code from \fBpcre2_match()\fP). Groups that did not take part in the match are output as "<unset>". This modifier is not relevant for DFA matching (which does no capturing) and does not apply when \fBreplace\fP is specified; it is ignored, with a warning message, if present. . .
.SS "Showing the entire ovector, for all outcomes" .rs .sp The \fBallvector\fP modifier requests that the entire ovector be shown, whatever the outcome of the match. Compare \fBallcaptures\fP, which shows only up to the maximum number of capture groups for the pattern, and then only for a successful complete non-DFA match. This modifier, which acts after any match result, and also for DFA matching, provides a means of checking that there are no unexpected modifications to ovector fields. Before each match attempt, the ovector is filled with a special value, and if this is found in both elements of a capturing pair, "<unchanged>" is output. After a successful match, this applies to all groups after the maximum capture group for the pattern. In other cases it applies to the entire ovector. After a partial match, the first two elements are the only ones that should be set. After a DFA match, the amount of ovector that is used depends on the number of matches that were found. . . .SS "Testing pattern callouts" .rs .sp A callout function is supplied when \fBpcre2test\fP calls the library matching functions, unless \fBcallout_none\fP is specified. Its behaviour can be controlled by various modifiers listed above whose names begin with \fBcallout_\fP. Details are given in the section entitled "Callouts" .\" HTML .\" below. .\" Testing callouts from \fBpcre2_substitute()\fP is described separately in "Testing the substitution function" .\" HTML .\" below. .\" . . .SS "Finding all matches in a string" .rs .sp Searching for all possible matches within a subject can be requested by the \fBglobal\fP or \fBaltglobal\fP modifier. After finding a match, the matching function is called again to search the remainder of the subject.
The difference between \fBglobal\fP and \fBaltglobal\fP is that the former uses the \fIstart_offset\fP argument to \fBpcre2_match()\fP or \fBpcre2_dfa_match()\fP to start searching at a new point within the entire string (which is what Perl does), whereas the latter passes over a shortened subject. This makes a difference to the matching process if the pattern begins with a lookbehind assertion (including \eb or \eB). .P If an empty string is matched, the next match is done with the PCRE2_NOTEMPTY_ATSTART flag set, in order to search for another, non-empty, match at the same point in the subject. This imitates the way Perl handles such cases when using the \fB/g\fP modifier or the \fBsplit()\fP function. . . .SS "Testing substring extraction functions" .rs .sp The \fBcopy\fP and \fBget\fP modifiers can be used to test the \fBpcre2_substring_copy_xxx()\fP and \fBpcre2_substring_get_xxx()\fP functions. They can be given more than once, and each can specify a capture group name or number, for example: .sp abcd\e=copy=1,copy=3,get=G1 .sp If the \fB#subject\fP command is used to set default copy and/or get lists, these can be unset by specifying a negative number to cancel all numbered groups and an empty name to cancel all named groups. .P The \fBgetall\fP modifier tests \fBpcre2_substring_list_get()\fP, which extracts all captured substrings. .P If the subject line is successfully matched, the substrings extracted by the convenience functions are output with C, G, or L after the string number instead of a colon. This is in addition to the normal full list. The string length (that is, the return from the extraction function) is given in parentheses after each substring, followed by the name when the extraction was by name. . . 
.\" HTML .SS "Testing the substitution function" .rs .sp If the \fBreplace\fP modifier is set, the \fBpcre2_substitute()\fP function is called instead of one of the matching functions (or after one call of \fBpcre2_match()\fP in the case of PCRE2_SUBSTITUTE_MATCHED). Note that replacement strings cannot contain commas, because a comma signifies the end of a modifier. This is not thought to be an issue in a test program. .P Specifying a completely empty replacement string disables this modifier. However, it is possible to specify an empty replacement by providing a buffer length, as described below, for an otherwise empty replacement. .P Unlike subject strings, \fBpcre2test\fP does not process replacement strings for escape sequences. In UTF mode, a replacement string is checked to see if it is a valid UTF-8 string. If so, it is correctly converted to a UTF string of the appropriate code unit width. If it is not a valid UTF-8 string, the individual code units are copied directly. This provides a means of passing an invalid UTF-8 string for testing purposes. .P The following modifiers set options (in addition to the normal match options) for \fBpcre2_substitute()\fP: .sp global PCRE2_SUBSTITUTE_GLOBAL substitute_extended PCRE2_SUBSTITUTE_EXTENDED substitute_literal PCRE2_SUBSTITUTE_LITERAL substitute_matched PCRE2_SUBSTITUTE_MATCHED substitute_overflow_length PCRE2_SUBSTITUTE_OVERFLOW_LENGTH substitute_replacement_only PCRE2_SUBSTITUTE_REPLACEMENT_ONLY substitute_unknown_unset PCRE2_SUBSTITUTE_UNKNOWN_UNSET substitute_unset_empty PCRE2_SUBSTITUTE_UNSET_EMPTY .sp See the .\" HREF \fBpcre2api\fP .\" documentation for details of these options. .P After a successful substitution, the modified string is output, preceded by the number of replacements. This may be zero if there were no matches.
Here is a simple example of a substitution test: .sp /abc/replace=xxx =abc=abc= 1: =xxx=abc= =abc=abc=\e=global 2: =xxx=xxx= .sp Subject and replacement strings should be kept relatively short (fewer than 256 characters) for substitution tests, as fixed-size buffers are used. To make it easy to test for buffer overflow, if the replacement string starts with a number in square brackets, that number is passed to \fBpcre2_substitute()\fP as the size of the output buffer, with the replacement string starting at the next character. Here is an example that tests the edge case: .sp /abc/ 123abc123\e=replace=[10]XYZ 1: 123XYZ123 123abc123\e=replace=[9]XYZ Failed: error -48: no more memory .sp The default action of \fBpcre2_substitute()\fP is to return PCRE2_ERROR_NOMEMORY when the output buffer is too small. However, if the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option is set (by using the \fBsubstitute_overflow_length\fP modifier), \fBpcre2_substitute()\fP continues to go through the motions of matching and substituting (but not doing any callouts), in order to compute the size of buffer that is required. When this happens, \fBpcre2test\fP shows the required buffer length (which includes space for the trailing zero) as part of the error message. For example: .sp /abc/substitute_overflow_length 123abc123\e=replace=[9]XYZ Failed: error -48: no more memory: 10 code units are needed .sp A replacement string is ignored with POSIX and DFA matching. Specifying partial matching provokes an error return ("bad option value") from \fBpcre2_substitute()\fP. .sp The \fBsubstitute_subject\fP modifier may be used to test the use of the PCRE2 API, in which a client calls \fBpcre2_match()\fP followed by \fBpcre2_substitute()\fP with PCRE2_SUBSTITUTE_MATCHED, but the client performs an unexpected and unsupported modification of the subject buffer in-place, in between the match and substitution. . . 
.SS "Testing substitute callouts" .rs .sp If the \fBsubstitute_callout\fP modifier is set, a substitution callout function is set up. The \fBnull_context\fP modifier must not be set, because the address of the callout function is passed in a match context. When the callout function is called (after each substitution), details of the input and output strings are output. For example: .sp /abc/g,replace=<$0>,substitute_callout abcdefabcpqr 1(1) Old 0 3 "abc" New 0 5 "<abc>" 2(1) Old 6 9 "abc" New 8 13 "<abc>" 2: <abc>def<abc>pqr .sp The first number on each callout line is the count of matches. The parenthesized number is the number of pairs that are set in the ovector (that is, one more than the number of capturing groups that were set). Then are listed the offsets of the old substring, its contents, and the same for the replacement. .P By default, the substitution callout function returns zero, which accepts the replacement and causes matching to continue if /g was used. Two further modifiers can be used to test other return values. If \fBsubstitute_skip\fP is set to a value greater than zero the callout function returns +1 for the match of that number, and similarly \fBsubstitute_stop\fP returns -1. These cause the replacement to be rejected, and -1 causes no further matching to take place. If either of them is set, \fBsubstitute_callout\fP is assumed. For example: .sp /abc/g,replace=<$0>,substitute_skip=1 abcdefabcpqr 1(1) Old 0 3 "abc" New 0 5 "<abc> SKIPPED" 2(1) Old 6 9 "abc" New 6 11 "<abc>" 2: abcdef<abc>pqr abcdefabcpqr\e=substitute_stop=1 1(1) Old 0 3 "abc" New 0 5 "<abc> STOPPED" 1: abcdefabcpqr .sp If both are set for the same number, stop takes precedence. Only a single skip or stop is supported, which is sufficient for testing that the feature works. . . .SS "Testing substitute case callouts" .rs .sp If the \fBsubstitute_case_callout\fP modifier is set, a substitution case callout function is set up. The callout function is called for each substituted chunk which is to be case-transformed.
.P The callout function passed is a fixed function with implementation for certain behaviours: inputs which shrink when case-transformed; inputs which grow; inputs with distinct upper/lower/titlecase forms. The characters which are not special-cased for testing purposes are left unmodified, as if they are caseless characters. . . .SS "Setting the JIT stack size" .rs .sp The \fBjitstack\fP modifier provides a way of setting the maximum stack size that is used by the just-in-time optimization code. It is ignored if JIT optimization is not being used. The value is a number of kibibytes (units of 1024 bytes). Setting zero reverts to the default of 32KiB. Providing a stack that is larger than the default is necessary only for very complicated patterns. If \fBjitstack\fP is set non-zero on a subject line it overrides any value that was set on the pattern. . . .SS "Setting heap, match, and depth limits" .rs .sp The \fBheap_limit\fP, \fBmatch_limit\fP, and \fBdepth_limit\fP modifiers set the appropriate limits in the match context. These values are ignored when the \fBfind_limits\fP or \fBfind_limits_noheap\fP modifier is specified. . . .SS "Finding minimum limits" .rs .sp If the \fBfind_limits\fP modifier is present on a subject line, \fBpcre2test\fP calls the relevant matching function several times, setting different values in the match context via \fBpcre2_set_heap_limit()\fP, \fBpcre2_set_match_limit()\fP, or \fBpcre2_set_depth_limit()\fP until it finds the smallest value for each parameter that allows the match to complete without a "limit exceeded" error. The match itself may succeed or fail. An alternative modifier, \fBfind_limits_noheap\fP, omits the heap limit. This is used in the standard tests, because the minimum heap limit varies between systems. If JIT is being used, only the match limit is relevant, and the other two are automatically omitted. .P When using this modifier, the pattern should not contain any limit settings such as (*LIMIT_MATCH=...) 
within it. If such a setting is present and is lower than the minimum matching value, the minimum value cannot be found because \fBpcre2_set_match_limit()\fP etc. are only able to reduce the value of an in-pattern limit; they cannot increase it. .P For non-DFA matching, the minimum \fIdepth_limit\fP number is a measure of how much nested backtracking happens (that is, how deeply the pattern's tree is searched). In the case of DFA matching, \fIdepth_limit\fP controls the depth of recursive calls of the internal function that is used for handling pattern recursion, lookaround assertions, and atomic groups. .P For non-DFA matching, the \fImatch_limit\fP number is a measure of the amount of backtracking that takes place, and learning the minimum value can be instructive. For most simple matches, the number is quite small, but for patterns with very large numbers of matching possibilities, it can become large very quickly with increasing length of subject string. In the case of DFA matching, \fImatch_limit\fP controls the total number of calls, both recursive and non-recursive, to the internal matching function, thus controlling the overall amount of computing resource that is used. .P For both kinds of matching, the \fIheap_limit\fP number, which is in kibibytes (units of 1024 bytes), limits the amount of heap memory used for matching. . . .SS "Showing MARK names" .rs .sp .P The \fBmark\fP modifier causes the names from backtracking control verbs that are returned from calls to \fBpcre2_match()\fP to be displayed. If a mark is returned for a match, non-match, or partial match, \fBpcre2test\fP shows it. For a match, it is on a line by itself, tagged with "MK:". Otherwise, it is added to the non-match message. . . .SS "Showing memory usage" .rs .sp The \fBmemory\fP modifier causes \fBpcre2test\fP to log the sizes of all heap memory allocation and freeing calls that occur during a call to \fBpcre2_match()\fP or \fBpcre2_dfa_match()\fP. 
In the latter case, heap memory is used only when a match requires more internal workspace than the default allocation on the stack, so in many cases there will be no output. No heap memory is allocated during matching with JIT. For this modifier to work, the \fBnull_context\fP modifier must not be set on both the pattern and the subject, though it can be set on one or the other. . . .SS "Showing the heap frame overall vector size" .rs .sp The \fBheapframes_size\fP modifier is relevant for matches using \fBpcre2_match()\fP without JIT. After a match has run (whether successful or not) the size, in bytes, of the allocated heap frames vector that is left attached to the match data block is shown. If the matching action involved several calls to \fBpcre2_match()\fP (for example, global matching or for timing) only the final value is shown. .P This modifier is ignored, with a warning, for POSIX or DFA matching. JIT matching does not use the heap frames vector, so the size is always zero, unless there was a previous non-JIT match. Note that specifying a size of zero for the output vector (see below) causes \fBpcre2test\fP to free its match data block (and associated heap frames vector) and allocate a new one. . . .SS "Setting a starting offset" .rs .sp The \fBoffset\fP modifier sets an offset in the subject string at which matching starts. Its value is a number of code units, not characters. . . .SS "Setting an offset limit" .rs .sp The \fBoffset_limit\fP modifier sets a limit for unanchored matches. If a match cannot be found starting at or before this offset in the subject, a "no match" return is given. The data value is a number of code units, not characters. When this modifier is used, the \fBuse_offset_limit\fP modifier must have been set for the pattern; if not, an error is generated. . .
.SS "Setting the size of the output vector" .rs .sp The \fBovector\fP modifier applies only to the subject line in which it appears, though of course it can also be used to set a default in a \fB#subject\fP command. It specifies the number of pairs of offsets that are available for storing matching information. The default is 15. .P A value of zero is useful when testing the POSIX API because it causes \fBregexec()\fP to be called with a NULL capture vector. When not testing the POSIX API, a value of zero is used to cause \fBpcre2_match_data_create_from_pattern()\fP to be called, in order to create a new match block of exactly the right size for the pattern. (It is not possible to create a match block with a zero-length ovector; there is always at least one pair of offsets.) The old match data block is freed. . . .SS "Passing the subject as zero-terminated" .rs .sp By default, the subject string is passed to a native API matching function with its correct length. In order to test the facility for passing a zero-terminated string, the \fBzero_terminate\fP modifier is provided. It causes the length to be passed as PCRE2_ZERO_TERMINATED. When matching via the POSIX interface, this modifier is ignored, with a warning. .P When testing \fBpcre2_substitute()\fP, this modifier also has the effect of passing the replacement string as zero-terminated. . . .SS "Passing a NULL context, subject, or replacement" .rs .sp Normally, \fBpcre2test\fP passes a context block to \fBpcre2_match()\fP, \fBpcre2_dfa_match()\fP, \fBpcre2_jit_match()\fP or \fBpcre2_substitute()\fP. If the \fBnull_context\fP modifier is set, however, NULL is passed. This is for testing that the matching and substitution functions behave correctly in this case (they use default values). This modifier cannot be used with the \fBfind_limits\fP, \fBfind_limits_noheap\fP, or \fBsubstitute_callout\fP modifiers. 
.P Similarly, for testing purposes, if the \fBnull_subject\fP or \fBnull_replacement\fP modifier is set, the subject or replacement string pointers are passed as NULL, respectively, to the relevant functions. . . .SH "THE ALTERNATIVE MATCHING FUNCTION" .rs .sp By default, \fBpcre2test\fP uses the standard PCRE2 matching function, \fBpcre2_match()\fP to match each subject line. PCRE2 also supports an alternative matching function, \fBpcre2_dfa_match()\fP, which operates in a different way, and has some restrictions. The differences between the two functions are described in the .\" HREF \fBpcre2matching\fP .\" documentation. .P If the \fBdfa\fP modifier is set, the alternative matching function is used. This function finds all possible matches at a given point in the subject. If, however, the \fBdfa_shortest\fP modifier is set, processing stops after the first match is found. This is always the shortest possible match. . . .SH "DEFAULT OUTPUT FROM pcre2test" .rs .sp This section describes the output when the normal matching function, \fBpcre2_match()\fP, is being used. .P When a match succeeds, \fBpcre2test\fP outputs the list of captured substrings, starting with number 0 for the string that matched the whole pattern. Otherwise, it outputs "No match" when the return is PCRE2_ERROR_NOMATCH, or "Partial match:" followed by the partially matching substring when the return is PCRE2_ERROR_PARTIAL. (Note that this is the entire substring that was inspected during the partial match; it may include characters before the actual match start if a lookbehind assertion, \eK, \eb, or \eB was involved.) .P For any other return, \fBpcre2test\fP outputs the PCRE2 negative error number and a short descriptive phrase. If the error is a failed UTF string check, the code unit offset of the start of the failing character is also output. Here is an example of an interactive \fBpcre2test\fP run. 
.sp $ pcre2test PCRE2 version 10.22 2016-07-29 .sp re> /^abc(\ed+)/ data> abc123 0: abc123 1: 123 data> xyz No match .sp Unset capturing substrings that are not followed by one that is set are not shown by \fBpcre2test\fP unless the \fBallcaptures\fP modifier is specified. In the following example, there are two capturing substrings, but when the first data line is matched, the second, unset substring is not shown. An "internal" unset substring is shown as "<unset>", as for the second data line. .sp re> /(a)|(b)/ data> a 0: a 1: a data> b 0: b 1: <unset> 2: b .sp If the strings contain any non-printing characters, they are output as \exhh escapes if the value is less than 256 and UTF mode is not set. Otherwise they are output as \ex{hh...} escapes. See below for the definition of non-printing characters. If the \fBaftertext\fP modifier is set, the output for substring 0 is followed by the rest of the subject string, identified by "0+" like this: .sp re> /cat/aftertext data> cataract 0: cat 0+ aract .sp If global matching is requested, the results of successive matching attempts are output in sequence, like this: .sp re> /\eBi(\ew\ew)/g data> Mississippi 0: iss 1: ss 0: iss 1: ss 0: ipp 1: pp .sp "No match" is output only if the first match attempt fails. Here is an example of a failure message (the offset 4 that is specified by the \fBoffset\fP modifier is past the end of the subject string): .sp re> /xyz/ data> xyz\e=offset=4 Error -24 (bad offset value) .P Note that whereas patterns can be continued over several lines (a plain ">" prompt is used for continuations), subject lines may not. However newlines can be included in a subject by means of the \en escape (or \er, \er\en, etc., depending on the newline sequence setting). . . .
.SH "OUTPUT FROM THE ALTERNATIVE MATCHING FUNCTION" .rs .sp When the alternative matching function, \fBpcre2_dfa_match()\fP, is used, the output consists of a list of all the matches that start at the first point in the subject where there is at least one match. For example: .sp re> /(tang|tangerine|tan)/ data> yellow tangerine\e=dfa 0: tangerine 1: tang 2: tan .sp Using the normal matching function on this data finds only "tang". The longest matching string is always given first (and numbered zero). After a PCRE2_ERROR_PARTIAL return, the output is "Partial match:", followed by the partially matching substring. Note that this is the entire substring that was inspected during the partial match; it may include characters before the actual match start if a lookbehind assertion, \eb, or \eB was involved. (\eK is not supported for DFA matching.) .P If global matching is requested, the search for further matches resumes at the end of the longest match. For example: .sp re> /(tang|tangerine|tan)/g data> yellow tangerine and tangy sultana\e=dfa 0: tangerine 1: tang 2: tan 0: tang 1: tan 0: tan .sp The alternative matching function does not support substring capture, so the modifiers that are concerned with captured substrings are not relevant. . . .SH "RESTARTING AFTER A PARTIAL MATCH" .rs .sp When the alternative matching function has given the PCRE2_ERROR_PARTIAL return, indicating that the subject partially matched the pattern, you can restart the match with additional subject data by means of the \fBdfa_restart\fP modifier. For example: .sp re> /^\ed?\ed(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\ed\ed$/ data> 23ja\e=ps,dfa Partial match: 23ja data> n05\e=dfa,dfa_restart 0: n05 .sp For further information about partial matching, see the .\" HREF \fBpcre2partial\fP .\" documentation. . . .\" HTML .SH CALLOUTS .rs .sp If the pattern contains any callout requests, \fBpcre2test\fP's callout function is called during matching unless \fBcallout_none\fP is specified. 
This works with both matching functions, and with JIT, though there are some differences in behaviour. The output for callouts with numerical arguments and those with string arguments is slightly different. . . .SS "Callouts with numerical arguments" .rs .sp By default, the callout function displays the callout number, the start and current positions in the subject text at the callout time, and the next pattern item to be tested. For example: .sp --->pqrabcdef 0 ^ ^ \ed .sp This output indicates that callout number 0 occurred for a match attempt starting at the fourth character of the subject string, when the pointer was at the seventh character, and when the next pattern item was \ed. Just one circumflex is output if the start and current positions are the same, or if the current position precedes the start position, which can happen if the callout is in a lookbehind assertion. .P Callouts numbered 255 are assumed to be automatic callouts, inserted as a result of the \fBauto_callout\fP pattern modifier. In this case, instead of showing the callout number, the offset in the pattern, preceded by a plus, is output. For example: .sp re> /\ed?[A-E]\e*/auto_callout data> E* --->E* +0 ^ \ed? +3 ^ [A-E] +8 ^^ \e* +10 ^ ^ 0: E* .sp If a pattern contains (*MARK) items, an additional line is output whenever a change of latest mark is passed to the callout function. For example: .sp re> /a(*MARK:X)bc/auto_callout data> abc --->abc +0 ^ a +1 ^^ (*MARK:X) +10 ^^ b Latest Mark: X +11 ^ ^ c +12 ^ ^ 0: abc .sp The mark changes between matching "a" and "b", but stays the same for the rest of the match, so nothing more is output. If, as a result of backtracking, the mark reverts to being unset, the text "<unset>" is output. . .
.SS "Callouts with string arguments" .rs .sp The output for a callout with a string argument is similar, except that instead of outputting a callout number before the position indicators, the callout string and its offset in the pattern string are output before the reflection of the subject string, and the subject string is reflected for each callout. For example: .sp re> /^ab(?C'first')cd(?C"second")ef/ data> abcdefg Callout (7): 'first' --->abcdefg ^ ^ c Callout (20): "second" --->abcdefg ^ ^ e 0: abcdef .sp . . .SS "Callout modifiers" .rs .sp The callout function in \fBpcre2test\fP returns zero (carry on matching) by default, but you can use a \fBcallout_fail\fP modifier in a subject line to change this and other parameters of the callout (see below). .P If the \fBcallout_capture\fP modifier is set, the current captured groups are output when a callout occurs. This is useful only for non-DFA matching, as \fBpcre2_dfa_match()\fP does not support capturing, so no captures are ever shown. .P The normal callout output, showing the callout number or pattern offset (as described above) is suppressed if the \fBcallout_no_where\fP modifier is set. .P When using the interpretive matching function \fBpcre2_match()\fP without JIT, setting the \fBcallout_extra\fP modifier causes additional output from \fBpcre2test\fP's callout function to be generated. For the first callout in a match attempt at a new starting position in the subject, "New match attempt" is output. If there has been a backtrack since the last callout (or start of matching if this is the first callout), "Backtrack" is output, followed by "No other matching paths" if the backtrack ended the previous match attempt. 
For example: .sp re> /(a+)b/auto_callout,no_start_optimize,no_auto_possess data> aac\e=callout_extra New match attempt --->aac +0 ^ ( +1 ^ a+ +3 ^ ^ ) +4 ^ ^ b Backtrack --->aac +3 ^^ ) +4 ^^ b Backtrack No other matching paths New match attempt --->aac +0 ^ ( +1 ^ a+ +3 ^^ ) +4 ^^ b Backtrack No other matching paths New match attempt --->aac +0 ^ ( +1 ^ a+ Backtrack No other matching paths New match attempt --->aac +0 ^ ( +1 ^ a+ No match .sp Notice that various optimizations must be turned off if you want all possible matching paths to be scanned. If \fBno_start_optimize\fP is not used, there is an immediate "no match", without any callouts, because the starting optimization fails to find "b" in the subject, which it knows must be present for any match. If \fBno_auto_possess\fP is not used, the "a+" item is turned into "a++", which reduces the number of backtracks. .P The \fBcallout_extra\fP modifier has no effect if used with the DFA matching function, or with JIT. . . .SS "Return values from callouts" .rs .sp The default return from the callout function is zero, which allows matching to continue. The \fBcallout_fail\fP modifier can be given one or two numbers. If there is only one number, 1 is returned instead of 0 (causing matching to backtrack) when a callout of that number is reached. If two numbers (<n>:<m>) are given, 1 is returned when callout <n> is reached and there have been at least <m> callouts. The \fBcallout_error\fP modifier is similar, except that PCRE2_ERROR_CALLOUT is returned, causing the entire matching process to be aborted. If both these modifiers are set for the same callout number, \fBcallout_error\fP takes precedence. Note that callouts with string arguments are always given the number zero. .P The \fBcallout_data\fP modifier can be given an unsigned or a negative number. This is set as the "user data" that is passed to the matching function, and passed back when the callout function is invoked.
Any value other than zero is used as a return from \fBpcre2test\fP's callout function. .P Inserting callouts can be helpful when using \fBpcre2test\fP to check complicated regular expressions. For further information about callouts, see the .\" HREF \fBpcre2callout\fP .\" documentation. . . . .SH "NON-PRINTING CHARACTERS" .rs .sp When \fBpcre2test\fP is outputting text in the compiled version of a pattern, bytes other than 32-126 are always treated as non-printing characters and are therefore shown as hex escapes. .P When \fBpcre2test\fP is outputting text that is a matched part of a subject string, it behaves in the same way, unless a different locale has been set for the pattern (using the \fBlocale\fP modifier). In this case, the \fBisprint()\fP function is used to distinguish printing and non-printing characters. . . . .\" HTML .SH "SAVING AND RESTORING COMPILED PATTERNS" .rs .sp It is possible to save compiled patterns on disc or elsewhere, and reload them later, subject to a number of restrictions. JIT data cannot be saved. The host on which the patterns are reloaded must be running the same version of PCRE2, with the same code unit width, and must also have the same endianness, pointer width and PCRE2_SIZE type. Before compiled patterns can be saved they must be serialized, that is, converted to a stream of bytes. A single byte stream may contain any number of compiled patterns, but they must all use the same character tables. A single copy of the tables is included in the byte stream (its size is 1088 bytes). .P The functions whose names begin with \fBpcre2_serialize_\fP are used for serializing and de-serializing. They are described in the .\" HREF \fBpcre2serialize\fP .\" documentation. In this section we describe the features of \fBpcre2test\fP that can be used to test these functions. .P Note that "serialization" in PCRE2 does not convert compiled patterns to an abstract format like Java or .NET. It just makes a reloadable byte code stream. 
Hence the restrictions on reloading mentioned above. .P In \fBpcre2test\fP, when a pattern with \fBpush\fP modifier is successfully compiled, it is pushed onto a stack of compiled patterns, and \fBpcre2test\fP expects the next line to contain a new pattern (or command) instead of a subject line. By contrast, the \fBpushcopy\fP modifier causes a copy of the compiled pattern to be stacked, leaving the original available for immediate matching. By using \fBpush\fP and/or \fBpushcopy\fP, a number of patterns can be compiled and retained. These modifiers are incompatible with \fBposix\fP, and control modifiers that act at match time are ignored (with a message) for the stacked patterns. The \fBjitverify\fP modifier applies only at compile time. .P The command .sp #save <filename> .sp causes all the stacked patterns to be serialized and the result written to the named file. Afterwards, all the stacked patterns are freed. The command .sp #load <filename> .sp reads the data in the file, and then arranges for it to be de-serialized, with the resulting compiled patterns added to the pattern stack. The pattern on the top of the stack can be retrieved by the #pop command, which must be followed by lines of subjects that are to be matched with the pattern, terminated as usual by an empty line or end of file. This command may be followed by a modifier list containing only .\" HTML .\" control modifiers .\" that act after a pattern has been compiled. In particular, \fBhex\fP, \fBposix\fP, \fBposix_nosub\fP, \fBpush\fP, and \fBpushcopy\fP are not allowed, nor are any .\" HTML .\" option-setting modifiers. .\" The JIT modifiers are, however, permitted. Here is an example that saves and reloads two patterns. .sp /abc/push /xyz/push #save tempfile #load tempfile #pop info xyz .sp #pop jit,bincode abc .sp If \fBjitverify\fP is used with #pop, it does not automatically imply \fBjit\fP, which is different behaviour from when it is used on a pattern.
.P The #popcopy command is analogous to the \fBpushcopy\fP modifier in that it makes current a copy of the topmost stack pattern, leaving the original still on the stack. . . . .SH "SEE ALSO" .rs .sp \fBpcre2\fP(3), \fBpcre2api\fP(3), \fBpcre2callout\fP(3), \fBpcre2jit\fP(3), \fBpcre2matching\fP(3), \fBpcre2partial\fP(3), \fBpcre2pattern\fP(3), \fBpcre2serialize\fP(3). . . .SH AUTHOR .rs .sp .nf Philip Hazel Retired from University Computing Service Cambridge, England. .fi . . .SH REVISION .rs .sp .nf Last updated: 12 October 2025 Copyright (c) 1997-2024 University of Cambridge. .fi ================================================ FILE: doc/pcre2test.txt ================================================ PCRE2TEST(1) General Commands Manual PCRE2TEST(1) NAME pcre2test - a program for testing Perl-compatible regular expressions. SYNOPSIS pcre2test [options] [input file [output file]] pcre2test is a test program for the PCRE2 regular expression libraries, but it can also be used for experimenting with regular expressions. This document describes the features of the test program; for details of the regular expressions themselves, see the pcre2pattern documenta- tion. For details of the PCRE2 library function calls and their op- tions, see the pcre2api documentation. The input for pcre2test is a sequence of regular expression patterns and subject strings to be matched. There are also command lines for setting defaults and controlling some special actions. The output shows the result of each match attempt. Modifiers on external or internal command lines, the patterns, and the subject lines specify PCRE2 func- tion options, control how the subject is processed, and what output is produced. There are many obscure modifiers, some of which are specifically de- signed for use in conjunction with the test script and data files that are distributed as part of PCRE2.
All the modifiers are documented here, some without much justification, but many of them are unlikely to be of use except when testing the libraries. PCRE2's 8-BIT, 16-BIT AND 32-BIT LIBRARIES Different versions of the PCRE2 library can be built to support charac- ter strings that are encoded in 8-bit, 16-bit, or 32-bit code units. One, two, or all three of these libraries may be simultaneously in- stalled. The pcre2test program can be used to test all the libraries. However, its own input and output are always in 8-bit format. When testing the 16-bit or 32-bit libraries, patterns and subject strings are converted to 16-bit or 32-bit format before being passed to the li- brary functions. Results are converted back to 8-bit code units for output. In the rest of this document, the names of library functions and struc- tures are given in generic form, for example, pcre2_compile(). The ac- tual names used in the libraries have a suffix _8, _16, or _32, as ap- propriate. INPUT ENCODING Input to pcre2test is processed line by line, either by calling the C library's fgets() function, or via the libreadline or libedit library. In some Windows environments character 26 (hex 1A) causes an immediate end of file, and no further data is read, so this character should be avoided unless you really want that action. The input is processed using C's string functions, so must not contain binary zeros, even though in Unix-like environments, fgets() treats any bytes other than newline as data characters. An error is generated if a binary zero is encountered. By default subject lines are processed for backslash escapes, which makes it possible to include any data value in strings that are passed to the library for matching. For patterns, there is a facility for specifying some or all of the 8-bit input char- acters as hexadecimal pairs, which makes it possible to include binary zeros. 
Input for the 16-bit and 32-bit libraries When testing the 16-bit or 32-bit libraries, there is a need to be able to generate character code points greater than 255 in the strings that are passed to the library. For subject lines and some patterns, back- slash escapes can be used. In addition, when the utf modifier (see "Setting compilation options" below) is set, the pattern and any fol- lowing subject lines are interpreted as UTF-8 strings and translated to UTF-16 or UTF-32 as appropriate. For non-UTF testing of wide characters, the utf8_input modifier can be used. This is mutually exclusive with utf, and is allowed only in 16-bit or 32-bit mode. It causes the pattern and following subject lines to be treated as UTF-8 according to the original definition (RFC 2279), which allows for character values up to 0x7fffffff. Each charac- ter is placed in one 16-bit or 32-bit code unit (in the 16-bit case, values greater than 0xffff cause an error to occur). UTF-8 (in its original definition) is not capable of encoding values greater than 0x7fffffff, but such values can be handled by the 32-bit library. When testing this library in non-UTF mode with utf8_input set, if any character is preceded by the byte 0xff (which is an invalid byte in UTF-8) 0x80000000 is added to the character's value. For subject strings, using an escape sequence is preferable. COMMAND LINE OPTIONS -8 If the 8-bit library has been built, this option causes it to be used (this is the default). If the 8-bit library has not been built, this option causes an error. -16 If the 16-bit library has been built, this option causes it to be used. If the 8-bit library has not been built, this is the default. If the 16-bit library has not been built, this option causes an error. -32 If the 32-bit library has been built, this option causes it to be used. If no other library has been built, this is the default. If the 32-bit library has not been built, this op- tion causes an error. 
-ac Behave as if each pattern has the auto_callout modifier, that is, insert automatic callouts into every pattern that is com- piled. -AC As for -ac, but in addition behave as if each subject line has the callout_extra modifier, that is, show additional in- formation from callouts. -b Behave as if each pattern has the fullbincode modifier; the full internal binary form of the pattern is output after com- pilation. -C Output the version number of the PCRE2 library, and all available information about the optional features that are included, and then exit with zero exit code. All other op- tions are ignored. If both -C and -LM are present, whichever is first is recognized. -C option Output information about a specific build-time option, then exit. This functionality is intended for use in scripts such as RunTest. The following options output the value and set the exit code as indicated: linksize the configured internal link size (2, 3, or 4) exit code is set to the link size newline the default newline setting: CR, LF, CRLF, ANYCRLF, ANY, or NUL exit code is always 0 bsr the default setting for what \R matches: ANYCRLF or ANY exit code is always 0 The following options output 1 for true or 0 for false, and set the exit code to the same value: backslash-C \C is supported (not locked out) ebcdic compiled for an EBCDIC environment ebcdic-io if PCRE2 is compiled for EBCDIC, whether pcre2test's input and output is EBCDIC or ASCII ebcdic-nl25 if PCRE2 is compiled for EBCDIC, whether NL (= LF) is 0x25 (otherwise it is 0x15, the default) jit just-in-time support is available pcre2-16 the 16-bit library was built pcre2-32 the 32-bit library was built pcre2-8 the 8-bit library was built unicode Unicode support is available Note that the availability of JIT support in the library does not guarantee that it can actually be used because in some environments it is unable to allocate executable memory. The option "jitusable" gives more detailed information. 
It returns one of the following values: 0 JIT is available and usable 1 JIT is available but cannot allocate executable memory 2 JIT is not available 3 Unexpected return from test call to pcre2_jit_compile() If an unknown option is given, an error message is output; the exit code is 0. --colo[u]r[=<when>] By default, the output is coloured if the output file is a terminal (auto). Force or suppress output of ANSI colour escapes with always and never respectively. -d Behave as if each pattern has the debug modifier; the internal form and information about the compiled pattern is output after compilation; -d is equivalent to -b -i. -dfa Behave as if each subject line has the dfa modifier; matching is done using the pcre2_dfa_match() function instead of the default pcre2_match(). -E Run in "preprocess only" mode (similar to "gcc -E"). The "#if ... #endif" commands are processed, and all other lines are printed verbatim. -error number[,number,...] Call pcre2_get_error_message() for each of the error numbers in the comma-separated list, display the resulting messages on the standard output, then exit with zero exit code. The numbers may be positive or negative. This is a convenience facility for PCRE2 maintainers. -help Output a brief summary of these options and then exit. -i Behave as if each pattern has the info modifier; information about the compiled pattern is given after compilation. -jit Behave as if each pattern line has the jit modifier; after successful compilation, each pattern is passed to the just-in-time compiler, if available. -jitfast Behave as if each pattern line has the jitfast modifier; after successful compilation, each pattern is passed to the just-in-time compiler, if available, and each subject line is passed directly to the JIT matcher via its "fast path". 
-jitverify Behave as if each pattern line has the jitverify modifier; after successful compilation, each pattern is passed to the just-in-time compiler, if available, and the use of JIT for matching is verified. -LM List modifiers: write a list of available pattern and subject modifiers to the standard output, then exit with zero exit code. All other options are ignored. If both -C and any -Lx options are present, whichever is first is recognized. -LP List properties: write a list of recognized Unicode properties to the standard output, then exit with zero exit code. All other options are ignored. If both -C and any -Lx options are present, whichever is first is recognized. -LS List scripts: write a list of recognized Unicode script names to the standard output, then exit with zero exit code. All other options are ignored. If both -C and any -Lx options are present, whichever is first is recognized. -malloc Exercise malloc() failures, by first counting the number of calls made to malloc during pattern compilation and matching, then re-running the compilation and matching that many times, exercising a failure of each malloc() call. -pattern modifier-list Behave as if each pattern line contains the given modifiers. -q Do not output the version number of pcre2test at the start of execution. -S size On Unix-like systems, set the size of the run-time stack to size mebibytes (units of 1024*1024 bytes). -subject modifier-list Behave as if each subject line contains the given modifiers. -t Run each compile and match many times with a timer, and output the resulting times per compile or match. When JIT is used, separate times are given for the initial compile and the JIT compile. You can control the number of iterations that are used for timing by following -t with a number (as a separate item on the command line). For example, "-t 1000" iterates 1000 times. The default is to iterate 500,000 times. 
-tm This is like -t except that it times only the matching phase, not the compile phase. -T -TM These behave like -t and -tm, but in addition, at the end of a run, the total times for all compiles and matches are output. -unittest Run a fixed set of additional tests of the PCRE2 API which are not driven by the test input files, and then exit. -version Output the PCRE2 version number and then exit. DESCRIPTION If pcre2test is given two filename arguments, it reads from the first and writes to the second. If the first name is "-", input is taken from the standard input. If pcre2test is given only one argument, it reads from that file and writes to stdout. Otherwise, it reads from stdin and writes to stdout. When pcre2test is built, a configuration option can specify that it should be linked with the libreadline or libedit library. When this is done, if the input is from a terminal, it is read using the readline() function. This provides line-editing and history facilities. The output from the -help option states whether or not readline() will be used. The program handles any number of tests, each of which consists of a set of input lines. Each set starts with a regular expression pattern, followed by any number of subject lines to be matched against that pattern. In between sets of test data, command lines that begin with # may appear. This file format, with some restrictions, can also be processed by the perltest.sh script that is distributed with PCRE2 as a means of checking that the behaviour of PCRE2 and Perl is the same. For a specification of perltest.sh, see the comments near its beginning. See also the #perltest command below. When the input is a terminal, pcre2test prompts for each line of input, using "re>" to prompt for regular expression patterns, and "data>" to prompt for subject lines. Command lines starting with # can be entered only in response to the "re>" prompt. Each subject line is matched separately and independently. 
If you want to do multi-line matches, you have to use the \n escape sequence (or \r or \r\n, etc., depending on the newline setting) in a single line of input to encode the newline sequences. There is no limit on the length of subject lines; the input buffer is automatically extended if it is too small. There are replication features that make it possible to generate long repetitive pattern or subject lines without having to supply them explicitly. An empty line or the end of the file signals the end of the subject lines for a test, at which point a new pattern or command line is expected if there is still input to be read. COMMAND LINES In between sets of test data, a line that begins with # is interpreted as a command line. If the first character is followed by white space or an exclamation mark, the line is treated as a comment, and ignored. Otherwise, the following commands are recognized: #forbid_utf Subsequent patterns automatically have the PCRE2_NEVER_UTF and PCRE2_NEVER_UCP options set, which locks out the use of the PCRE2_UTF and PCRE2_UCP options and the use of (*UTF) and (*UCP) at the start of patterns. This command also forces an error if a subsequent pattern contains any occurrences of \P, \p, or \X, which are still supported when PCRE2_UTF is not set, but which require Unicode property support to be included in the library. This is a trigger guard that is used in test files to ensure that UTF or Unicode property tests are not accidentally added to files that are used when Unicode support is not included in the library. Setting PCRE2_NEVER_UTF and PCRE2_NEVER_UCP as a default can also be obtained by the use of #pattern; the difference is that #forbid_utf cannot be unset, and the automatic options are not displayed in pattern information, to avoid cluttering up test output. #load <filename> This command is used to load a set of precompiled patterns from a file, as described in the section entitled "Saving and restoring compiled patterns" below. 
#loadtables This command is used to load a set of binary character tables that can be accessed by the tables=3 qualifier. Such tables can be created by the pcre2_dftables program with the -b option. #newline_default [] When PCRE2 is built, a default newline convention can be specified. This determines which characters and/or character pairs are recognized as indicating a newline in a pattern or subject string. The default can be overridden when a pattern is compiled. The standard test files con- tain tests of various newline conventions, but the majority of the tests expect a single linefeed to be recognized as a newline by de- fault. Without special action the tests would fail when PCRE2 is com- piled with either CR or CRLF as the default newline. The #newline_default command specifies a list of newline types that are acceptable as the default. The types must be one of CR, LF, CRLF, ANY- CRLF, ANY, or NUL (in upper or lower case), for example: #newline_default LF Any anyCRLF If the default newline is in the list, this command has no effect. Oth- erwise, except when testing the POSIX API, a newline modifier that specifies the first newline convention in the list (LF in the above ex- ample) is added to any pattern that does not already have a newline modifier. If the newline list is empty, the feature is turned off. This command is present in a number of the standard test input files. When the POSIX API is being tested there is no way to override the de- fault newline convention, though it is possible to set the newline con- vention from within the pattern. A warning is given if the posix or posix_nosub modifier is used when #newline_default would set a default for the non-POSIX API. #pattern This command sets a default modifier list that applies to all subse- quent patterns. Modifiers on a pattern can change these settings. #perltest This line is used in test files that can also be processed by perl- test.sh to confirm that Perl gives the same results as PCRE2. 
Subse- quent tests are checked for the use of pcre2test features that are in- compatible with the perltest.sh script. Patterns must use '/' as their delimiter, and only certain modifiers are supported. Comment lines, #pattern commands, and #subject commands that set or unset "mark" are recognized and acted on. The #perltest, #forbid_utf, and #newline_default commands, which are needed in the relevant pcre2test files, are silently ignored. All other command lines are ignored, but give a warning message. The #perltest command helps detect tests that are accidentally put in the wrong file or use the wrong delimiter. For more details of the perltest.sh script see the comments it contains. #pop [] #popcopy [] These commands are used to manipulate the stack of compiled patterns, as described in the section entitled "Saving and restoring compiled patterns" below. #save This command is used to save a set of compiled patterns to a file, as described in the section entitled "Saving and restoring compiled pat- terns" below. #subject This command sets a default modifier list that applies to all subse- quent subject lines. Modifiers on a subject line can change these set- tings. #if CONDITION ... #endif If CONDITION is true, then the command is printed, and its contents are processed as normal, including printing the commandlines to the output. If CONDITION is false, then all lines between the "#if" and "#endif" are skipped and not printed. The CONDITION can be any of the conditions which are tested by the "-C" commandline option and which set pcre2test's exit code to a boolean value. The CONDITION may also be preceded by "!". MODIFIER SYNTAX Modifier lists are used with both pattern and subject lines. Items in a list are separated by commas followed by optional white space. Trailing white space in a modifier list is ignored. Some modifiers may be given for both patterns and subject lines, whereas others are valid only for one or the other. 
Each modifier has a long name, for example "an- chored", and some of them must be followed by an equals sign and a value, for example, "offset=12". Values cannot contain comma charac- ters, but may contain spaces. Modifiers that do not take values may be preceded by a minus sign to turn off a previous setting. A few of the more common modifiers can also be specified as single let- ters, for example "i" for "caseless". In documentation, following the Perl convention, these are written with a slash ("the /i modifier") for clarity. Abbreviated modifiers must all be concatenated in the first item of a modifier list. If the first item is not recognized as a long modifier name, it is interpreted as a sequence of these abbreviations. For example: /abc/ig,newline=cr,jit=3 This is a pattern line whose modifier list starts with two one-letter modifiers (/i and /g). The lower-case abbreviated modifiers are the same as used in Perl. PATTERN SYNTAX A pattern line must start with one of the following characters (common symbols, excluding pattern meta-characters): / ! " ' ` - = _ : ; , % & @ ~ This is interpreted as the pattern's delimiter. A regular expression may be continued over several input lines, in which case the newline characters are included within it. It is possible to include the delim- iter as a literal within the pattern by escaping it with a backslash, for example /abc\/def/ If you do this, the escape and the delimiter form part of the pattern, but since the delimiters are all non-alphanumeric, the inclusion of the backslash does not affect the pattern's interpretation. Note, however, that this trick does not work within \Q...\E literal bracketing because the backslash will itself be interpreted as a literal. If the terminat- ing delimiter is immediately followed by a backslash, for example, /abc/\ a backslash is added to the end of the pattern. 
This is done to provide a way of testing the error condition that arises if a pattern finishes with a backslash, because /abc\/ is interpreted as the first line of a pattern that starts with "abc/", causing pcre2test to read the next line as a continuation of the regu- lar expression. A pattern can be followed by a modifier list (details below). SUBJECT LINE SYNTAX Before each subject line is passed to pcre2_match(), pcre2_dfa_match(), or pcre2_jit_match(), leading and trailing white space is removed, and the line is scanned for backslash escapes, unless the subject_literal modifier was set for the pattern. The following provide a means of en- coding non-printing characters in a visible way: \a alarm (BEL, \x07) \b backspace (\x08) \e escape (\x1b) \f form feed (\x0c) \n newline (\x0a) \N{U+hh...} unicode character (any number of hex digits) \r carriage return (\x0d) \t tab (\x09) \v vertical tab (\x0b) \ddd octal number (up to 3 octal digits); represents a single code point unless larger than 255 with the 8-bit li- brary \o{dd...} octal number (any number of octal digits) representing a character in UTF mode or a code point \xhh hexadecimal byte (up to 2 hex digits) \x{hh...} hexadecimal number (up to 8 hex digits) representing a character in UTF mode or a code point Invoking \N{U+hh...} or \x{hh...} doesn't require the use of the utf modifier on the pattern. It is always recognized. There may be any num- ber of hexadecimal digits inside the braces; invalid values provoke er- ror messages but when using \N{U+hh...} with some invalid unicode char- acters they will be accepted with a warning instead. Note that even in UTF-8 mode, \xhh (and depending on how large, \ddd) describe one byte rather than one character; this makes it possible to construct invalid UTF-8 sequences for testing purposes. On the other hand, \x{hh...} is interpreted as a UTF-8 character in UTF-8 mode, only generating more than one byte if the value is greater than 127.
To avoid the ambiguity it is preferred to use \N{U+hh...} when describing characters. When testing the 8-bit library not in UTF-8 mode, \x{hh} generates one byte for values that fit in one byte, and causes an error for greater values. When testing the 16-bit library, not in UTF-16 mode, all 4-digit \x{hhhh} values are accepted. This makes it possible to construct in- valid UTF-16 sequences for testing purposes. When testing the 32-bit library, not in UTF-32 mode, all 4 to 8-digit \x{...} values are accepted. This makes it possible to construct in- valid UTF-32 sequences for testing purposes. There is a special backslash sequence that specifies replication of one or more characters: \[<characters>]{<count>} This makes it possible to test long strings without having to provide them as part of the file. For example: \[abc]{4} is converted to "abcabcabcabc". This feature does not support nesting. To include a closing square bracket in the characters, code it as \x5D. A backslash followed by an equals sign marks the end of the subject string and the start of a modifier list. For example: abc\=notbol,notempty If the subject string is empty and \= is followed by white space, the line is treated as a comment line, and is not used for matching. For example: \= This is a comment. abc\= This is an invalid modifier list. A backslash followed by any other non-alphanumeric character just es- capes that character. A backslash followed by anything else causes an error. However, if the very last character in the line is a backslash (and there is no modifier list), it is ignored. This gives a way of passing an empty line as data, since a real empty line terminates the data input. If the subject_literal modifier is set for a pattern, all subject lines that follow are treated as literals, with no special treatment of back- slashes. No replication is possible, and any subject modifiers must be set as defaults by a #subject command.
PATTERN MODIFIERS There are several types of modifier that can appear in pattern lines. Except where noted below, they may also be used in #pattern commands. A pattern's modifier list can add to or override default modifiers that were set by a previous #pattern command. Setting compilation options The following modifiers set options for pcre2_compile(). Most of them set bits in the options argument of that function, but those whose names start with PCRE2_EXTRA are additional options that are set in the compile context. Some of these options have single-letter abbrevia- tions. There is special handling for /x: if a second x is present, PCRE2_EXTENDED is converted into PCRE2_EXTENDED_MORE as in Perl. A third appearance adds PCRE2_EXTENDED as well, though this makes no dif- ference to the way pcre2_compile() behaves. See pcre2api for a descrip- tion of the effects of these options. allow_empty_class set PCRE2_ALLOW_EMPTY_CLASS allow_lookaround_bsk set PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK allow_surrogate_escapes set PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES alt_bsux set PCRE2_ALT_BSUX alt_circumflex set PCRE2_ALT_CIRCUMFLEX alt_extended_class set PCRE2_ALT_EXTENDED_CLASS alt_verbnames set PCRE2_ALT_VERBNAMES anchored set PCRE2_ANCHORED /a ascii_all set all ASCII options ascii_bsd set PCRE2_EXTRA_ASCII_BSD ascii_bss set PCRE2_EXTRA_ASCII_BSS ascii_bsw set PCRE2_EXTRA_ASCII_BSW ascii_digit set PCRE2_EXTRA_ASCII_DIGIT ascii_posix set PCRE2_EXTRA_ASCII_POSIX auto_callout set PCRE2_AUTO_CALLOUT bad_escape_is_literal set PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL /i caseless set PCRE2_CASELESS /r caseless_restrict set PCRE2_EXTRA_CASELESS_RESTRICT dollar_endonly set PCRE2_DOLLAR_ENDONLY /s dotall set PCRE2_DOTALL dupnames set PCRE2_DUPNAMES endanchored set PCRE2_ENDANCHORED escaped_cr_is_lf set PCRE2_EXTRA_ESCAPED_CR_IS_LF /x extended set PCRE2_EXTENDED /xx extended_more set PCRE2_EXTENDED_MORE extra_alt_bsux set PCRE2_EXTRA_ALT_BSUX firstline set PCRE2_FIRSTLINE literal set PCRE2_LITERAL 
match_line set PCRE2_EXTRA_MATCH_LINE match_invalid_utf set PCRE2_MATCH_INVALID_UTF match_unset_backref set PCRE2_MATCH_UNSET_BACKREF match_word set PCRE2_EXTRA_MATCH_WORD /m multiline set PCRE2_MULTILINE never_backslash_c set PCRE2_NEVER_BACKSLASH_C never_callout set PCRE2_EXTRA_NEVER_CALLOUT never_ucp set PCRE2_NEVER_UCP never_utf set PCRE2_NEVER_UTF /n no_auto_capture set PCRE2_NO_AUTO_CAPTURE no_auto_possess set PCRE2_NO_AUTO_POSSESS no_bs0 set PCRE2_EXTRA_NO_BS0 no_dotstar_anchor set PCRE2_NO_DOTSTAR_ANCHOR no_start_optimize set PCRE2_NO_START_OPTIMIZE no_utf_check set PCRE2_NO_UTF_CHECK python_octal set PCRE2_EXTRA_PYTHON_OCTAL turkish_casing set PCRE2_EXTRA_TURKISH_CASING ucp set PCRE2_UCP ungreedy set PCRE2_UNGREEDY use_offset_limit set PCRE2_USE_OFFSET_LIMIT utf set PCRE2_UTF As well as turning on the PCRE2_UTF option, the utf modifier causes all non-printing characters in output strings to be printed using the \x{hh...} notation. Otherwise, those less than 0x100 are output in hex without the curly brackets. Setting utf in 16-bit or 32-bit mode also causes pattern and subject strings to be translated to UTF-16 or UTF-32, respectively, before being passed to library functions. The following modifiers enable or disable performance optimizations by calling pcre2_set_optimize() before invoking the regex compiler. optimization_full enable all optional optimizations optimization_none disable all optional optimizations auto_possess auto-possessify variable quantifiers auto_possess_off don't auto-possessify variable quantifiers dotstar_anchor anchor patterns starting with .* dotstar_anchor_off don't anchor patterns starting with .* start_optimize enable pre-scan of subject string start_optimize_off disable pre-scan of subject string See the pcre2_set_optimize documentation for details on these optimiza- tions. Setting compilation controls The following modifiers affect the compilation process or request in- formation about the pattern. 
There are single-letter abbreviations for some that are heavily used in the test files. /B bincode show binary code without lengths bsr=[anycrlf|unicode] specify \R handling callout_info show callout information convert= request foreign pattern conversion convert_glob_escape=c set glob escape character convert_glob_separator=c set glob separator character convert_length set convert buffer length debug same as info,fullbincode expand expand repetition syntax in pattern framesize show matching frame size fullbincode show binary code with lengths /I info show info about compiled pattern hex unquoted characters are hexadecimal jit[=] use JIT jitfast use JIT fast path jitverify verify JIT use locale= use this locale max_pattern_compiled ) set maximum compiled pattern _length= ) length (bytes) max_pattern_length= set maximum pattern length (code units) max_varlookbehind= set maximum variable lookbehind length memory show memory used newline= set newline type null_context compile with a NULL context null_pattern pass pattern as NULL parens_nest_limit= set maximum parentheses depth posix use the POSIX API posix_nosub use the POSIX API with REG_NOSUB push push compiled pattern onto the stack pushcopy push a copy onto the stack pushtablescopy push a copy with tables onto the stack stackguard= test the stackguard feature subject_literal treat all subject lines as literal tables=[0|1|2|3] select internal tables use_length do not zero-terminate the pattern utf8_input treat input as UTF-8 The effects of these modifiers are described in the following sections. Newline and \R handling The bsr modifier specifies what \R in a pattern should match. If it is set to "anycrlf", \R matches CR, LF, or CRLF only. If it is set to "unicode", \R matches any Unicode newline sequence. The default can be specified when PCRE2 is built; if it is not, the default is set to Uni- code. 
The newline modifier specifies which characters are to be interpreted as newlines, both in the pattern and in subject lines. The type must be one of CR, LF, CRLF, ANYCRLF, ANY, or NUL (in upper or lower case). Information about a pattern The debug modifier is a shorthand for info,fullbincode, requesting all available information. The bincode modifier causes a representation of the compiled code to be output after compilation. This information does not contain length and offset values, which ensures that the same output is generated for dif- ferent internal link sizes and different code unit widths. By using bincode, the same regression tests can be used in different environ- ments. The fullbincode modifier, by contrast, does include length and offset values. This is used in a few special tests that run only for specific code unit widths and link sizes, and is also useful for one-off tests. The info modifier requests information about the compiled pattern (whether it is anchored, has a fixed first character, and so on). The information is obtained from the pcre2_pattern_info() function. Here are some typical examples: re> /(?i)(^a|^b)/m,info Capture group count = 1 Compile options: multiline Overall options: caseless multiline First code unit at start or follows newline Subject length lower bound = 1 re> /(?i)abc/info Capture group count = 0 Compile options: Overall options: caseless First code unit = 'a' (caseless) Last code unit = 'c' (caseless) Subject length lower bound = 3 "Compile options" are those specified by modifiers; "overall options" have added options that are taken or deduced from the pattern. If both sets of options are the same, just a single "options" line is output; if there are no options, the line is omitted. "First code unit" is where any match must start; if there is more than one they are listed as "starting code units". "Last code unit" is the last literal code unit that must be present in any match. 
This is not necessarily the last character. These lines are omitted if no starting or ending code units are recorded. The subject length line is omitted when no_start_optimize is set because the minimum length is not calculated when it can never be used. The framesize modifier shows the size, in bytes, of each storage frame used by pcre2_match() for handling backtracking. The size depends on the number of capturing parentheses in the pattern. A vector of these frames is used at matching time; its overall size is shown when the heapframes_size subject modifier is set. The callout_info modifier requests information about all the callouts in the pattern. A list of them is output at the end of any other infor- mation that is requested. For each callout, either its number or string is given, followed by the item that follows it in the pattern. Passing a NULL context Normally, pcre2test passes a context block to pcre2_compile(). If the null_context modifier is set, however, NULL is passed. This is for testing that pcre2_compile() behaves correctly in this case (it uses default values). Passing a NULL pattern The null_pattern modifier is for testing the behaviour of pcre2_com- pile() when the pattern argument is NULL. The length value passed is the default PCRE2_ZERO_TERMINATED unless use_length is set. Any length other than zero causes an error. Specifying pattern characters in hexadecimal The hex modifier specifies that the characters of the pattern, except for substrings enclosed in single or double quotes, are to be inter- preted as pairs of hexadecimal digits. This feature is provided as a way of creating patterns that contain binary zeros and other non-print- ing characters. White space is permitted between pairs of digits. For example, this pattern contains three characters: /ab 32 59/hex Parts of such a pattern are taken literally if quoted.
This pattern contains nine characters, only two of which are specified in hexadeci- mal: /ab "literal" 32/hex Either single or double quotes may be used. There is no way of includ- ing the delimiter within a substring. The hex and expand modifiers are mutually exclusive. Specifying the pattern's length By default, patterns are passed to the compiling functions as zero-ter- minated strings but can be passed by length instead of being zero-ter- minated. The use_length modifier causes this to happen. Using a length happens automatically (whether or not use_length is set) when hex is set, because patterns specified in hexadecimal may contain binary ze- ros. If hex or use_length is used with the POSIX wrapper API (see "Using the POSIX wrapper API" below), the REG_PEND extension is used to pass the pattern's length. Specifying a maximum for variable lookbehinds Variable lookbehind assertions are supported only if, for each one, there is a maximum length (in characters) that it can match. There is a limit on this, whose default can be set at build time, with an ultimate default of 255. The max_varlookbehind modifier uses the pcre2_set_max_varlookbehind() function to change the limit. Lookbehinds whose branches each match a fixed length are limited to 65535 charac- ters per branch. Specifying wide characters in 16-bit and 32-bit modes In 16-bit and 32-bit modes, all input is automatically treated as UTF-8 and translated to UTF-16 or UTF-32 when the utf modifier is set. For testing the 16-bit and 32-bit libraries in non-UTF mode, the utf8_input modifier can be used. It is mutually exclusive with utf. Input lines are interpreted as UTF-8 as a means of specifying wide characters. More details are given in "Input encoding" above. Generating long repetitive patterns Some tests use long patterns that are very repetitive. Instead of cre- ating a very long input line for such a pattern, you can use a special repetition feature, similar to the one described for subject lines above. 
If the expand modifier is present on a pattern, parts of the pattern that have the form \[<characters>]{<count>} are expanded before the pattern is passed to pcre2_compile(). For exam- ple, \[AB]{6000} is expanded to "ABAB..." 6000 times. This construction cannot be nested. An initial "\[" sequence is recognized only if "]{" followed by decimal digits and "}" is found later in the pattern. If not, the characters remain in the pattern unaltered. The expand and hex modifiers are mutually exclusive. If part of an expanded pattern looks like an expansion, but is really part of the actual pattern, unwanted expansion can be avoided by giving two values in the quantifier. For example, \[AB]{6000,6000} is not rec- ognized as an expansion item. If the info modifier is set on an expanded pattern, the result of the expansion is included in the information that is output. JIT compilation Just-in-time (JIT) compiling is a heavyweight optimization that can greatly speed up pattern matching. See the pcre2jit documentation for details. JIT compiling happens, optionally, after a pattern has been successfully compiled into an internal form. The JIT compiler converts this to optimized machine code. It needs to know whether the match-time options PCRE2_PARTIAL_HARD and PCRE2_PARTIAL_SOFT are going to be used, because different code is generated for the different cases. See the partial modifier in "Subject Modifiers" below for details of how these options are specified for each match attempt. JIT compilation is requested by the jit pattern modifier, which may op- tionally be followed by an equals sign and a number in the range 0 to 7.
The three bits that make up the number specify which of the three JIT operating modes are to be compiled: 1 compile JIT code for non-partial matching 2 compile JIT code for soft partial matching 4 compile JIT code for hard partial matching The possible values for the jit modifier are therefore: 0 disable JIT 1 normal matching only 2 soft partial matching only 3 normal and soft partial matching 4 hard partial matching only 6 soft and hard partial matching only 7 all three modes If no number is given, 7 is assumed. The phrase "partial matching" means a call to pcre2_match() with either the PCRE2_PARTIAL_SOFT or the PCRE2_PARTIAL_HARD option set. Note that such a call may return a com- plete match; the options enable the possibility of a partial match, but do not require it. Note also that if you request JIT compilation only for partial matching (for example, jit=2) but do not set the partial modifier on a subject line, that match will not use JIT code because none was compiled for non-partial matching. If JIT compilation is successful, the compiled JIT code will automati- cally be used when an appropriate type of match is run, except when in- compatible run-time options are specified. For more details, see the pcre2jit documentation. See also the jitstack modifier below for a way of setting the size of the JIT stack. If the jitfast modifier is specified, matching is done using the JIT "fast path" interface, pcre2_jit_match(), which skips some of the san- ity checks that are done by pcre2_match(), and of course does not work when JIT is not supported. If jitfast is specified without jit, jit=7 is assumed. If the jitverify modifier is specified, information about the compiled pattern shows whether JIT compilation was or was not successful. If jitverify is specified without jit, jit=7 is assumed. 
If JIT compila- tion is successful when jitverify is set, the text "(JIT)" is added to the first output line after a match or non match when JIT-compiled code was actually used in the match. Setting a locale The locale modifier must specify the name of a locale, for example: /pattern/locale=fr_FR The given locale is set, pcre2_maketables() is called to build a set of character tables for the locale, and this is then passed to pcre2_com- pile() when compiling the regular expression. The same tables are used when matching the following subject lines. The locale modifier applies only to the pattern on which it appears, but can be given in a #pattern command if a default is needed. Setting a locale and alternate charac- ter tables are mutually exclusive. Showing pattern memory The memory modifier causes the size in bytes of the memory used to hold the compiled pattern to be output. This does not include the size of the pcre2_code block; it is just the actual compiled data. If the pat- tern is subsequently passed to the JIT compiler, the size of the JIT compiled code is also output. Here is an example: re> /a(b)c/jit,memory Memory allocation (code space): 21 Memory allocation (JIT code): 1910 Limiting nested parentheses The parens_nest_limit modifier sets a limit on the depth of nested parentheses in a pattern. Breaching the limit causes a compilation er- ror. The default for the library is set when PCRE2 is built, but pcre2test sets its own default of 220, which is required for running the standard test suite. Limiting the pattern length The max_pattern_length modifier sets a limit, in code units, to the length of pattern that pcre2_compile() will accept. Breaching the limit causes a compilation error. The default is the largest number a PCRE2_SIZE variable can hold (essentially unlimited). Limiting the size of a compiled pattern The max_pattern_compiled_length modifier sets a limit, in bytes, to the amount of memory used by a compiled pattern. 
Breaching the limit causes a compilation error. The default is the largest number a PCRE2_SIZE variable can hold (essentially unlimited). Using the POSIX wrapper API The posix and posix_nosub modifiers cause pcre2test to call PCRE2 via the POSIX wrapper API rather than its native API. When posix_nosub is used, the POSIX option REG_NOSUB is passed to regcomp(). The POSIX wrapper supports only the 8-bit library. Note that it does not imply POSIX matching semantics; for more detail see the pcre2posix documenta- tion. The following pattern modifiers set options for the regcomp() function: caseless REG_ICASE multiline REG_NEWLINE dotall REG_DOTALL ) ungreedy REG_UNGREEDY ) These options are not part of ucp REG_UCP ) the POSIX standard utf REG_UTF8 ) The regerror_buffsize modifier specifies a size for the error buffer that is passed to regerror() in the event of a compilation error. For example: /abc/posix,regerror_buffsize=20 This provides a means of testing the behaviour of regerror() when the buffer is too small for the error message. If this modifier has not been set, a large buffer is used. The aftertext and allaftertext subject modifiers work as described be- low. All other modifiers are either ignored, with a warning message, or cause an error. The pattern is passed to regcomp() as a zero-terminated string by de- fault, but if the use_length or hex modifiers are set, the REG_PEND ex- tension is used to pass it by length. Testing the stack guard feature The stackguard modifier is used to test the use of pcre2_set_com- pile_recursion_guard(), a function that is provided to enable stack availability to be checked during compilation (see the pcre2api docu- mentation for details). If the number specified by the modifier is greater than zero, pcre2_set_compile_recursion_guard() is called to set up a callback from pcre2_compile() to a local function.
The argument it receives is the current nesting parenthesis depth; if this is greater than the value given by the modifier, non-zero is returned, causing the compilation to be aborted. Using alternative character tables The value specified for the tables modifier must be one of the digits 0, 1, 2, or 3. It causes a specific set of built-in character tables to be passed to pcre2_compile(). This is used in the PCRE2 tests to check behaviour with different character tables. The digit specifies the ta- bles as follows: 0 do not pass any special character tables 1 the default ASCII tables, as distributed in pcre2_chartables.c.dist 2 a set of tables defining ISO 8859 characters 3 a set of tables loaded by the #loadtables command In tables 2, some characters whose codes are greater than 128 are iden- tified as letters, digits, spaces, etc. Tables 3 can be used only after a #loadtables command has loaded them from a binary file. Setting al- ternate character tables and a locale are mutually exclusive. Setting certain match controls The following modifiers are really subject modifiers, and are described under "Subject Modifiers" below. However, they may be included in a pattern's modifier list, in which case they are applied to every sub- ject line that is processed with that pattern. These modifiers do not affect the compilation process. 
aftertext show text after match allaftertext show text after captures allcaptures show all captures allvector show the entire ovector allusedtext show all consulted text altglobal alternative global matching /g global global matching heapframes_size show match data heapframes size jitstack= set size of JIT stack mark show mark values null_substitute_match_data substitute with NULL match data replace= specify a replacement string startchar show starting character when relevant substitute_callout use substitution callouts substitute_case_callout use substitution case callouts substitute_extended use PCRE2_SUBSTITUTE_EXTENDED substitute_literal use PCRE2_SUBSTITUTE_LITERAL substitute_matched use PCRE2_SUBSTITUTE_MATCHED substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH substitute_replacement_only use PCRE2_SUBSTITUTE_REPLACEMENT_ONLY substitute_skip= skip substitution substitute_stop= skip substitution and following substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET substitute_unset_empty use PCRE2_SUBSTITUTE_UNSET_EMPTY These modifiers may not appear in a #pattern command. If you want them as defaults, set them in a #subject command. Specifying literal subject lines If the subject_literal modifier is present on a pattern, all the sub- ject lines that it matches are taken as literal strings, with no inter- pretation of backslashes. It is not possible to set subject modifiers on such lines, but any that are set as defaults by a #subject command are recognized. Saving a compiled pattern When a pattern with the push modifier is successfully compiled, it is pushed onto a stack of compiled patterns, and pcre2test expects the next line to contain a new pattern (or a command) instead of a subject line. This facility is used when saving compiled patterns to a file, as described in the section entitled "Saving and restoring compiled pat- terns" below. 
If pushcopy is used instead of push, a copy of the com- piled pattern is stacked, leaving the original as current, ready to match the following input lines. This provides a way of testing the pcre2_code_copy() function. The push and pushcopy modifiers are in- compatible with compilation modifiers such as global that act at match time. Any that are specified are ignored (for the stacked copy), with a warning message, except for replace, which causes an error. Note that jitverify, which is allowed, does not carry through to any subsequent matching that uses a stacked pattern. Testing foreign pattern conversion The experimental foreign pattern conversion functions in PCRE2 can be tested by setting the convert modifier. Its argument is a colon-sepa- rated list of options, which set the equivalent option for the pcre2_pattern_convert() function: glob PCRE2_CONVERT_GLOB glob_no_starstar PCRE2_CONVERT_GLOB_NO_STARSTAR glob_no_wild_separator PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR posix_basic PCRE2_CONVERT_POSIX_BASIC posix_extended PCRE2_CONVERT_POSIX_EXTENDED unset Unset all options The "unset" value is useful for turning off a default that has been set by a #pattern command. When one of these options is set, the input pat- tern is passed to pcre2_pattern_convert(). If the conversion is suc- cessful, the result is reflected in the output and then passed to pcre2_compile(). The normal utf and no_utf_check options, if set, cause the PCRE2_CONVERT_UTF and PCRE2_CONVERT_NO_UTF_CHECK options to be passed to pcre2_pattern_convert(). By default, the conversion function is allowed to allocate a buffer for its output. However, if the convert_length modifier is set to a value greater than zero, pcre2test passes a buffer of the given length. This makes it possible to test the length check. 
The convert_glob_escape and convert_glob_separator modifiers can be used to specify the escape and separator characters for glob process- ing, overriding the defaults, which are operating-system dependent. SUBJECT MODIFIERS The modifiers that can appear in subject lines and the #subject command are of two types. Setting match options The following modifiers set options for pcre2_match() or pcre2_dfa_match(). See pcre2api for a description of their effects. anchored set PCRE2_ANCHORED copy_matched_subject set PCRE2_COPY_MATCHED_SUBJECT endanchored set PCRE2_ENDANCHORED dfa_restart set PCRE2_DFA_RESTART dfa_shortest set PCRE2_DFA_SHORTEST disable_recurseloop_check set PCRE2_DISABLE_RECURSELOOP_CHECK no_jit set PCRE2_NO_JIT no_utf_check set PCRE2_NO_UTF_CHECK notbol set PCRE2_NOTBOL notempty set PCRE2_NOTEMPTY notempty_atstart set PCRE2_NOTEMPTY_ATSTART noteol set PCRE2_NOTEOL partial_hard (or ph) set PCRE2_PARTIAL_HARD partial_soft (or ps) set PCRE2_PARTIAL_SOFT The partial matching modifiers are provided with abbreviations because they appear frequently in tests. If the posix or posix_nosub modifier was present on the pattern, caus- ing the POSIX wrapper API to be used, the only option-setting modifiers that have any effect are notbol, notempty, and noteol, causing REG_NOT- BOL, REG_NOTEMPTY, and REG_NOTEOL, respectively, to be passed to regexec(). The other modifiers are ignored, with a warning message. There is one additional modifier that can be used with the POSIX wrap- per. It is ignored (with a warning) if used for non-POSIX matching. posix_startend=<n>[:<m>] This causes the subject string to be passed to regexec() using the REG_STARTEND option, which uses offsets to specify which part of the string is searched. If only one number is given, the end offset is passed as the end of the subject string. For more detail of REG_STAR- TEND, see the pcre2posix documentation.
If the subject string contains binary zeros (coded as escapes such as \x{00} because pcre2test does not support actual binary zeros in its input), you must use posix_star- tend to specify its length. Setting match controls The following modifiers affect the matching process or request addi- tional information. Some of them may also be specified on a pattern line (see above), in which case they apply to every subject line that is matched against that pattern, but can be overridden by modifiers on the subject. aftertext show text after match allaftertext show text after captures allcaptures show all captures allusedtext show all consulted text (non-JIT only) allvector show the entire ovector altglobal alternative global matching callout_capture show captures at callout time callout_data= set a value to pass via callouts callout_error=[:] control callout error callout_extra show extra callout information callout_fail=[:] control callout failure callout_no_where do not show position of a callout callout_none do not supply a callout function copy= copy captured substring depth_limit= set a depth limit dfa use pcre2_dfa_match() find_limits find heap, match and depth limits find_limits_noheap find match and depth limits get= extract captured substring getall extract all captured substrings /g global global matching heapframes_size show match data heapframes size heap_limit= set a limit on heap memory (Kbytes) jitstack= set size of JIT stack mark show mark values match_limit= set a match limit memory show heap memory usage null_context match with a NULL context null_replacement substitute with NULL replacement null_subject match with NULL subject null_substitute_match_data substitute with NULL match data offset= set starting offset offset_limit= set offset limit ovector= set size of output vector recursion_limit= obsolete synonym for depth_limit replace= specify a replacement string startchar show startchar when relevant startoffset= same as offset= substitute_callout use 
substitution callouts substitute_case_callout use substitution case callouts substitute_extended use PCRE2_SUBSTITUTE_EXTENDED substitute_literal use PCRE2_SUBSTITUTE_LITERAL substitute_matched use PCRE2_SUBSTITUTE_MATCHED substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH substitute_replacement_only use PCRE2_SUBSTITUTE_REPLACEMENT_ONLY substitute_skip= skip substitution number n substitute_stop= skip substitution number n and greater substitute_subject= specify a different subject for sub- stitution substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET substitute_unset_empty use PCRE2_SUBSTITUTE_UNSET_EMPTY zero_terminate pass the subject as zero-terminated The effects of these modifiers are described in the following sections. When matching via the POSIX wrapper API, the aftertext, allaftertext, and ovector subject modifiers work as described below. All other modi- fiers are either ignored, with a warning message, or cause an error. Showing more text The aftertext modifier requests that as well as outputting the part of the subject string that matched the entire pattern, pcre2test should in addition output the remainder of the subject string. This is useful for tests where the subject contains multiple copies of the same substring. The allaftertext modifier requests the same action for captured sub- strings as well as the main matched substring. In each case the remain- der is output on the following line with a plus character following the capture number. The allusedtext modifier requests that all the text that was consulted during a successful pattern match by the interpreter should be shown, for both full and partial matches. This feature is not supported for JIT matching, and if requested with JIT it is ignored (with a warning message). Setting this modifier affects the output if there is a look- behind at the start of a match, or, for a complete match, a lookahead at the end, or if \K is used in the pattern. 
Characters that precede or follow the start and end of the actual match are indicated in the out- put by '<' or '>' characters underneath them. Here is an example: re> /(?<=pqr)abc(?=xyz)/ data> 123pqrabcxyz456\=allusedtext 0: pqrabcxyz <<< >>> data> 123pqrabcxy\=ph,allusedtext Partial match: pqrabcxy <<< The first, complete match shows that the matched string is "abc", with the preceding and following strings "pqr" and "xyz" having been con- sulted during the match (when processing the assertions). The partial match can indicate only the preceding string. The startchar modifier requests that the starting character for the match be indicated, if it is different to the start of the matched string. The only time when this occurs is when \K has been processed as part of the match. In this situation, the output for the matched string is displayed from the starting character instead of from the match point, with circumflex characters under the earlier characters. For ex- ample: re> /abc\Kxyz/ data> abcxyz\=startchar 0: abcxyz ^^^ Unlike allusedtext, the startchar modifier can be used with JIT. How- ever, these two modifiers are mutually exclusive. Showing the value of all capture groups The allcaptures modifier requests that the values of all potential cap- tured parentheses be output after a match. By default, only those up to the highest one actually used in the match are output (corresponding to the return code from pcre2_match()). Groups that did not take part in the match are output as "". This modifier is not relevant for DFA matching (which does no capturing) and does not apply when replace is specified; it is ignored, with a warning message, if present. Showing the entire ovector, for all outcomes The allvector modifier requests that the entire ovector be shown, what- ever the outcome of the match. Compare allcaptures, which shows only up to the maximum number of capture groups for the pattern, and then only for a successful complete non-DFA match. 
This modifier, which acts af- ter any match result, and also for DFA matching, provides a means of checking that there are no unexpected modifications to ovector fields. Before each match attempt, the ovector is filled with a special value, and if this is found in both elements of a capturing pair, "<unchanged>" is output. After a successful match, this applies to all groups after the maximum capture group for the pattern. In other cases it applies to the entire ovector. After a partial match, the first two elements are the only ones that should be set. After a DFA match, the amount of ovector that is used depends on the number of matches that were found. Testing pattern callouts A callout function is supplied when pcre2test calls the library match- ing functions, unless callout_none is specified. Its behaviour can be controlled by various modifiers listed above whose names begin with callout_. Details are given in the section entitled "Callouts" below. Testing callouts from pcre2_substitute() is described separately in "Testing the substitution function" below. Finding all matches in a string Searching for all possible matches within a subject can be requested by the global or altglobal modifier. After finding a match, the matching function is called again to search the remainder of the subject. The difference between global and altglobal is that the former uses the start_offset argument to pcre2_match() or pcre2_dfa_match() to start searching at a new point within the entire string (which is what Perl does), whereas the latter passes over a shortened subject. This makes a difference to the matching process if the pattern begins with a lookbe- hind assertion (including \b or \B). If an empty string is matched, the next match is done with the PCRE2_NOTEMPTY_ATSTART flag set, in order to search for another, non- empty, match at the same point in the subject. This imitates the way Perl handles such cases when using the /g modifier or the split() func- tion.
Testing substring extraction functions The copy and get modifiers can be used to test the pcre2_sub- string_copy_xxx() and pcre2_substring_get_xxx() functions. They can be given more than once, and each can specify a capture group name or num- ber, for example: abcd\=copy=1,copy=3,get=G1 If the #subject command is used to set default copy and/or get lists, these can be unset by specifying a negative number to cancel all num- bered groups and an empty name to cancel all named groups. The getall modifier tests pcre2_substring_list_get(), which extracts all captured substrings. If the subject line is successfully matched, the substrings extracted by the convenience functions are output with C, G, or L after the string number instead of a colon. This is in addition to the normal full list. The string length (that is, the return from the extraction function) is given in parentheses after each substring, followed by the name when the extraction was by name. Testing the substitution function If the replace modifier is set, the pcre2_substitute() function is called instead of one of the matching functions (or after one call of pcre2_match() in the case of PCRE2_SUBSTITUTE_MATCHED). Note that re- placement strings cannot contain commas, because a comma signifies the end of a modifier. This is not thought to be an issue in a test pro- gram. Specifying a completely empty replacement string disables this modi- fier. However, it is possible to specify an empty replacement by pro- viding a buffer length, as described below, for an otherwise empty re- placement. Unlike subject strings, pcre2test does not process replacement strings for escape sequences. In UTF mode, a replacement string is checked to see if it is a valid UTF-8 string. If so, it is correctly converted to a UTF string of the appropriate code unit width. If it is not a valid UTF-8 string, the individual code units are copied directly. This pro- vides a means of passing an invalid UTF-8 string for testing purposes. 
The following modifiers set options (in addition to the normal match options) for pcre2_substitute(): global PCRE2_SUBSTITUTE_GLOBAL substitute_extended PCRE2_SUBSTITUTE_EXTENDED substitute_literal PCRE2_SUBSTITUTE_LITERAL substitute_matched PCRE2_SUBSTITUTE_MATCHED substitute_overflow_length PCRE2_SUBSTITUTE_OVERFLOW_LENGTH substitute_replacement_only PCRE2_SUBSTITUTE_REPLACEMENT_ONLY substitute_unknown_unset PCRE2_SUBSTITUTE_UNKNOWN_UNSET substitute_unset_empty PCRE2_SUBSTITUTE_UNSET_EMPTY See the pcre2api documentation for details of these options. After a successful substitution, the modified string is output, pre- ceded by the number of replacements. This may be zero if there were no matches. Here is a simple example of a substitution test: /abc/replace=xxx =abc=abc= 1: =xxx=abc= =abc=abc=\=global 2: =xxx=xxx= Subject and replacement strings should be kept relatively short (fewer than 256 characters) for substitution tests, as fixed-size buffers are used. To make it easy to test for buffer overflow, if the replacement string starts with a number in square brackets, that number is passed to pcre2_substitute() as the size of the output buffer, with the re- placement string starting at the next character. Here is an example that tests the edge case: /abc/ 123abc123\=replace=[10]XYZ 1: 123XYZ123 123abc123\=replace=[9]XYZ Failed: error -48: no more memory The default action of pcre2_substitute() is to return PCRE2_ER- ROR_NOMEMORY when the output buffer is too small. However, if the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option is set (by using the substi- tute_overflow_length modifier), pcre2_substitute() continues to go through the motions of matching and substituting (but not doing any callouts), in order to compute the size of buffer that is required. When this happens, pcre2test shows the required buffer length (which includes space for the trailing zero) as part of the error message.
For example: /abc/substitute_overflow_length 123abc123\=replace=[9]XYZ Failed: error -48: no more memory: 10 code units are needed A replacement string is ignored with POSIX and DFA matching. Specifying partial matching provokes an error return ("bad option value") from pcre2_substitute(). The substitute_subject modifier may be used to test the use of the PCRE2 API, in which a client calls pcre2_match() followed by pcre2_sub- stitute() with PCRE2_SUBSTITUTE_MATCHED, but the client performs an un- expected and unsupported modification of the subject buffer in-place, in between the match and substitution. Testing substitute callouts If the substitute_callout modifier is set, a substitution callout func- tion is set up. The null_context modifier must not be set, because the address of the callout function is passed in a match context. When the callout function is called (after each substitution), details of the input and output strings are output. For example: /abc/g,replace=<$0>,substitute_callout abcdefabcpqr 1(1) Old 0 3 "abc" New 0 5 "" 2(1) Old 6 9 "abc" New 8 13 "" 2: defpqr The first number on each callout line is the count of matches. The parenthesized number is the number of pairs that are set in the ovector (that is, one more than the number of capturing groups that were set). Then are listed the offsets of the old substring, its contents, and the same for the replacement. By default, the substitution callout function returns zero, which ac- cepts the replacement and causes matching to continue if /g was used. Two further modifiers can be used to test other return values. If sub- stitute_skip is set to a value greater than zero the callout function returns +1 for the match of that number, and similarly substitute_stop returns -1. These cause the replacement to be rejected, and -1 causes no further matching to take place. If either of them are set, substi- tute_callout is assumed. 
For example: /abc/g,replace=<$0>,substitute_skip=1 abcdefabcpqr 1(1) Old 0 3 "abc" New 0 5 " SKIPPED" 2(1) Old 6 9 "abc" New 6 11 "" 2: abcdefpqr abcdefabcpqr\=substitute_stop=1 1(1) Old 0 3 "abc" New 0 5 " STOPPED" 1: abcdefabcpqr If both are set for the same number, stop takes precedence. Only a sin- gle skip or stop is supported, which is sufficient for testing that the feature works. Testing substitute case callouts If the substitute_case_callout modifier is set, a substitution case callout function is set up. The callout function is called for each substituted chunk which is to be case-transformed. The callout function passed is a fixed function with implementation for certain behaviours: inputs which shrink when case-transformed; inputs which grow; inputs with distinct upper/lower/titlecase forms. The char- acters which are not special-cased for testing purposes are left unmod- ified, as if they are caseless characters. Setting the JIT stack size The jitstack modifier provides a way of setting the maximum stack size that is used by the just-in-time optimization code. It is ignored if JIT optimization is not being used. The value is a number of kibibytes (units of 1024 bytes). Setting zero reverts to the default of 32KiB. Providing a stack that is larger than the default is necessary only for very complicated patterns. If jitstack is set non-zero on a subject line it overrides any value that was set on the pattern. Setting heap, match, and depth limits The heap_limit, match_limit, and depth_limit modifiers set the appro- priate limits in the match context. These values are ignored when the find_limits or find_limits_noheap modifier is specified. 
Finding minimum limits If the find_limits modifier is present on a subject line, pcre2test calls the relevant matching function several times, setting different values in the match context via pcre2_set_heap_limit(), pcre2_set_match_limit(), or pcre2_set_depth_limit() until it finds the smallest value for each parameter that allows the match to complete without a "limit exceeded" error. The match itself may succeed or fail. An alternative modifier, find_limits_noheap, omits the heap limit. This is used in the standard tests, because the minimum heap limit varies between systems. If JIT is being used, only the match limit is rele- vant, and the other two are automatically omitted. When using this modifier, the pattern should not contain any limit set- tings such as (*LIMIT_MATCH=...) within it. If such a setting is present and is lower than the minimum matching value, the minimum value cannot be found because pcre2_set_match_limit() etc. are only able to reduce the value of an in-pattern limit; they cannot increase it. For non-DFA matching, the minimum depth_limit number is a measure of how much nested backtracking happens (that is, how deeply the pattern's tree is searched). In the case of DFA matching, depth_limit controls the depth of recursive calls of the internal function that is used for handling pattern recursion, lookaround assertions, and atomic groups. For non-DFA matching, the match_limit number is a measure of the amount of backtracking that takes place, and learning the minimum value can be instructive. For most simple matches, the number is quite small, but for patterns with very large numbers of matching possibilities, it can become large very quickly with increasing length of subject string. In the case of DFA matching, match_limit controls the total number of calls, both recursive and non-recursive, to the internal matching func- tion, thus controlling the overall amount of computing resource that is used. 
For both kinds of matching, the heap_limit number, which is in kibibytes (units of 1024 bytes), limits the amount of heap memory used for matching. Showing MARK names The mark modifier causes the names from backtracking control verbs that are returned from calls to pcre2_match() to be displayed. If a mark is returned for a match, non-match, or partial match, pcre2test shows it. For a match, it is on a line by itself, tagged with "MK:". Otherwise, it is added to the non-match message. Showing memory usage The memory modifier causes pcre2test to log the sizes of all heap mem- ory allocation and freeing calls that occur during a call to pcre2_match() or pcre2_dfa_match(). In the latter case, heap memory is used only when a match requires more internal workspace than the de- fault allocation on the stack, so in many cases there will be no out- put. No heap memory is allocated during matching with JIT. For this modifier to work, the null_context modifier must not be set on both the pattern and the subject, though it can be set on one or the other. Showing the heap frame overall vector size The heapframes_size modifier is relevant for matches using pcre2_match() without JIT. After a match has run (whether successful or not) the size, in bytes, of the allocated heap frames vector that is left attached to the match data block is shown. If the matching action involved several calls to pcre2_match() (for example, global matching or for timing) only the final value is shown. This modifier is ignored, with a warning, for POSIX or DFA matching. JIT matching does not use the heap frames vector, so the size is always zero, unless there was a previous non-JIT match. Note that specifying a size of zero for the output vector (see below) causes pcre2test to free its match data block (and associated heap frames vector) and allocate a new one. Setting a starting offset The offset modifier sets an offset in the subject string at which matching starts.
Its value is a number of code units, not characters. Setting an offset limit The offset_limit modifier sets a limit for unanchored matches. If a match cannot be found starting at or before this offset in the subject, a "no match" return is given. The data value is a number of code units, not characters. When this modifier is used, the use_offset_limit modi- fier must have been set for the pattern; if not, an error is generated. Setting the size of the output vector The ovector modifier applies only to the subject line in which it ap- pears, though of course it can also be used to set a default in a #sub- ject command. It specifies the number of pairs of offsets that are available for storing matching information. The default is 15. A value of zero is useful when testing the POSIX API because it causes regexec() to be called with a NULL capture vector. When not testing the POSIX API, a value of zero is used to cause pcre2_match_data_cre- ate_from_pattern() to be called, in order to create a new match block of exactly the right size for the pattern. (It is not possible to cre- ate a match block with a zero-length ovector; there is always at least one pair of offsets.) The old match data block is freed. Passing the subject as zero-terminated By default, the subject string is passed to a native API matching func- tion with its correct length. In order to test the facility for passing a zero-terminated string, the zero_terminate modifier is provided. It causes the length to be passed as PCRE2_ZERO_TERMINATED. When matching via the POSIX interface, this modifier is ignored, with a warning. When testing pcre2_substitute(), this modifier also has the effect of passing the replacement string as zero-terminated. Passing a NULL context, subject, or replacement Normally, pcre2test passes a context block to pcre2_match(), pcre2_dfa_match(), pcre2_jit_match() or pcre2_substitute(). If the null_context modifier is set, however, NULL is passed. 
This is for testing that the matching and substitution functions behave correctly in this case (they use default values). This modifier cannot be used with the find_limits, find_limits_noheap, or substitute_callout modi- fiers. Similarly, for testing purposes, if the null_subject or null_replace- ment modifier is set, the subject or replacement string pointers are passed as NULL, respectively, to the relevant functions. THE ALTERNATIVE MATCHING FUNCTION By default, pcre2test uses the standard PCRE2 matching function, pcre2_match() to match each subject line. PCRE2 also supports an alter- native matching function, pcre2_dfa_match(), which operates in a dif- ferent way, and has some restrictions. The differences between the two functions are described in the pcre2matching documentation. If the dfa modifier is set, the alternative matching function is used. This function finds all possible matches at a given point in the sub- ject. If, however, the dfa_shortest modifier is set, processing stops after the first match is found. This is always the shortest possible match. DEFAULT OUTPUT FROM pcre2test This section describes the output when the normal matching function, pcre2_match(), is being used. When a match succeeds, pcre2test outputs the list of captured sub- strings, starting with number 0 for the string that matched the whole pattern. Otherwise, it outputs "No match" when the return is PCRE2_ER- ROR_NOMATCH, or "Partial match:" followed by the partially matching substring when the return is PCRE2_ERROR_PARTIAL. (Note that this is the entire substring that was inspected during the partial match; it may include characters before the actual match start if a lookbehind assertion, \K, \b, or \B was involved.) For any other return, pcre2test outputs the PCRE2 negative error number and a short descriptive phrase. If the error is a failed UTF string check, the code unit offset of the start of the failing character is also output. 
Here is an example of an interactive pcre2test run. $ pcre2test PCRE2 version 10.22 2016-07-29 re> /^abc(\d+)/ data> abc123 0: abc123 1: 123 data> xyz No match Unset capturing substrings that are not followed by one that is set are not shown by pcre2test unless the allcaptures modifier is specified. In the following example, there are two capturing substrings, but when the first data line is matched, the second, unset substring is not shown. An "internal" unset substring is shown as "", as for the second data line. re> /(a)|(b)/ data> a 0: a 1: a data> b 0: b 1: 2: b If the strings contain any non-printing characters, they are output as \xhh escapes if the value is less than 256 and UTF mode is not set. Otherwise they are output as \x{hh...} escapes. See below for the defi- nition of non-printing characters. If the aftertext modifier is set, the output for substring 0 is followed by the rest of the subject string, identified by "0+" like this: re> /cat/aftertext data> cataract 0: cat 0+ aract If global matching is requested, the results of successive matching at- tempts are output in sequence, like this: re> /\Bi(\w\w)/g data> Mississippi 0: iss 1: ss 0: iss 1: ss 0: ipp 1: pp "No match" is output only if the first match attempt fails. Here is an example of a failure message (the offset 4 that is specified by the offset modifier is past the end of the subject string): re> /xyz/ data> xyz\=offset=4 Error -24 (bad offset value) Note that whereas patterns can be continued over several lines (a plain ">" prompt is used for continuations), subject lines may not. However newlines can be included in a subject by means of the \n escape (or \r, \r\n, etc., depending on the newline sequence setting). OUTPUT FROM THE ALTERNATIVE MATCHING FUNCTION When the alternative matching function, pcre2_dfa_match(), is used, the output consists of a list of all the matches that start at the first point in the subject where there is at least one match. 
For example: re> /(tang|tangerine|tan)/ data> yellow tangerine\=dfa 0: tangerine 1: tang 2: tan Using the normal matching function on this data finds only "tang". The longest matching string is always given first (and numbered zero). Af- ter a PCRE2_ERROR_PARTIAL return, the output is "Partial match:", fol- lowed by the partially matching substring. Note that this is the entire substring that was inspected during the partial match; it may include characters before the actual match start if a lookbehind assertion, \b, or \B was involved. (\K is not supported for DFA matching.) If global matching is requested, the search for further matches resumes at the end of the longest match. For example: re> /(tang|tangerine|tan)/g data> yellow tangerine and tangy sultana\=dfa 0: tangerine 1: tang 2: tan 0: tang 1: tan 0: tan The alternative matching function does not support substring capture, so the modifiers that are concerned with captured substrings are not relevant. RESTARTING AFTER A PARTIAL MATCH When the alternative matching function has given the PCRE2_ERROR_PAR- TIAL return, indicating that the subject partially matched the pattern, you can restart the match with additional subject data by means of the dfa_restart modifier. For example: re> /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/ data> 23ja\=ps,dfa Partial match: 23ja data> n05\=dfa,dfa_restart 0: n05 For further information about partial matching, see the pcre2partial documentation. CALLOUTS If the pattern contains any callout requests, pcre2test's callout func- tion is called during matching unless callout_none is specified. This works with both matching functions, and with JIT, though there are some differences in behaviour. The output for callouts with numerical argu- ments and those with string arguments is slightly different. 
Callouts with numerical arguments By default, the callout function displays the callout number, the start and current positions in the subject text at the callout time, and the next pattern item to be tested. For example: --->pqrabcdef 0 ^ ^ \d This output indicates that callout number 0 occurred for a match at- tempt starting at the fourth character of the subject string, when the pointer was at the seventh character, and when the next pattern item was \d. Just one circumflex is output if the start and current posi- tions are the same, or if the current position precedes the start posi- tion, which can happen if the callout is in a lookbehind assertion. Callouts numbered 255 are assumed to be automatic callouts, inserted as a result of the auto_callout pattern modifier. In this case, instead of showing the callout number, the offset in the pattern, preceded by a plus, is output. For example: re> /\d?[A-E]\*/auto_callout data> E* --->E* +0 ^ \d? +3 ^ [A-E] +8 ^^ \* +10 ^ ^ 0: E* If a pattern contains (*MARK) items, an additional line is output when- ever a change of latest mark is passed to the callout function. For ex- ample: re> /a(*MARK:X)bc/auto_callout data> abc --->abc +0 ^ a +1 ^^ (*MARK:X) +10 ^^ b Latest Mark: X +11 ^ ^ c +12 ^ ^ 0: abc The mark changes between matching "a" and "b", but stays the same for the rest of the match, so nothing more is output. If, as a result of backtracking, the mark reverts to being unset, the text "" is output. Callouts with string arguments The output for a callout with a string argument is similar, except that instead of outputting a callout number before the position indicators, the callout string and its offset in the pattern string are output be- fore the reflection of the subject string, and the subject string is reflected for each callout. 
For example: re> /^ab(?C'first')cd(?C"second")ef/ data> abcdefg Callout (7): 'first' --->abcdefg ^ ^ c Callout (20): "second" --->abcdefg ^ ^ e 0: abcdef Callout modifiers The callout function in pcre2test returns zero (carry on matching) by default, but you can use a callout_fail modifier in a subject line to change this and other parameters of the callout (see below). If the callout_capture modifier is set, the current captured groups are output when a callout occurs. This is useful only for non-DFA matching, as pcre2_dfa_match() does not support capturing, so no captures are ever shown. The normal callout output, showing the callout number or pattern offset (as described above) is suppressed if the callout_no_where modifier is set. When using the interpretive matching function pcre2_match() without JIT, setting the callout_extra modifier causes additional output from pcre2test's callout function to be generated. For the first callout in a match attempt at a new starting position in the subject, "New match attempt" is output. If there has been a backtrack since the last call- out (or start of matching if this is the first callout), "Backtrack" is output, followed by "No other matching paths" if the backtrack ended the previous match attempt. For example: re> /(a+)b/auto_callout,no_start_optimize,no_auto_possess data> aac\=callout_extra New match attempt --->aac +0 ^ ( +1 ^ a+ +3 ^ ^ ) +4 ^ ^ b Backtrack --->aac +3 ^^ ) +4 ^^ b Backtrack No other matching paths New match attempt --->aac +0 ^ ( +1 ^ a+ +3 ^^ ) +4 ^^ b Backtrack No other matching paths New match attempt --->aac +0 ^ ( +1 ^ a+ Backtrack No other matching paths New match attempt --->aac +0 ^ ( +1 ^ a+ No match Notice that various optimizations must be turned off if you want all possible matching paths to be scanned. 
If no_start_optimize is not used, there is an immediate "no match", without any callouts, because the starting optimization fails to find "b" in the subject, which it knows must be present for any match. If no_auto_possess is not used, the "a+" item is turned into "a++", which reduces the number of backtracks. The callout_extra modifier has no effect if used with the DFA matching function, or with JIT. Return values from callouts The default return from the callout function is zero, which allows matching to continue. The callout_fail modifier can be given one or two numbers. If there is only one number, 1 is returned instead of 0 (causing matching to backtrack) when a callout of that number is reached. If two numbers (<n>:<m>) are given, 1 is returned when callout <n> is reached and there have been at least <m> callouts. The callout_error modifier is similar, except that PCRE2_ERROR_CALLOUT is returned, causing the entire matching process to be aborted. If both these modifiers are set for the same callout number, callout_error takes precedence. Note that callouts with string arguments are always given the number zero. The callout_data modifier can be given an unsigned or a negative number. This is set as the "user data" that is passed to the matching function, and passed back when the callout function is invoked. Any value other than zero is used as a return from pcre2test's callout function. Inserting callouts can be helpful when using pcre2test to check complicated regular expressions. For further information about callouts, see the pcre2callout documentation. NON-PRINTING CHARACTERS When pcre2test is outputting text in the compiled version of a pattern, bytes other than 32-126 are always treated as non-printing characters and are therefore shown as hex escapes. When pcre2test is outputting text that is a matched part of a subject string, it behaves in the same way, unless a different locale has been set for the pattern (using the locale modifier).
In this case, the is- print() function is used to distinguish printing and non-printing char- acters. SAVING AND RESTORING COMPILED PATTERNS It is possible to save compiled patterns on disc or elsewhere, and re- load them later, subject to a number of restrictions. JIT data cannot be saved. The host on which the patterns are reloaded must be running the same version of PCRE2, with the same code unit width, and must also have the same endianness, pointer width and PCRE2_SIZE type. Before compiled patterns can be saved they must be serialized, that is, con- verted to a stream of bytes. A single byte stream may contain any num- ber of compiled patterns, but they must all use the same character ta- bles. A single copy of the tables is included in the byte stream (its size is 1088 bytes). The functions whose names begin with pcre2_serialize_ are used for se- rializing and de-serializing. They are described in the pcre2serialize documentation. In this section we describe the features of pcre2test that can be used to test these functions. Note that "serialization" in PCRE2 does not convert compiled patterns to an abstract format like Java or .NET. It just makes a reloadable byte code stream. Hence the restrictions on reloading mentioned above. In pcre2test, when a pattern with push modifier is successfully com- piled, it is pushed onto a stack of compiled patterns, and pcre2test expects the next line to contain a new pattern (or command) instead of a subject line. By contrast, the pushcopy modifier causes a copy of the compiled pattern to be stacked, leaving the original available for im- mediate matching. By using push and/or pushcopy, a number of patterns can be compiled and retained. These modifiers are incompatible with posix, and control modifiers that act at match time are ignored (with a message) for the stacked patterns. The jitverify modifier applies only at compile time. 
The command #save <filename> causes all the stacked patterns to be serialized and the result written to the named file. Afterwards, all the stacked patterns are freed. The command #load <filename> reads the data in the file, and then arranges for it to be de-serialized, with the resulting compiled patterns added to the pattern stack. The pattern on the top of the stack can be retrieved by the #pop command, which must be followed by lines of subjects that are to be matched with the pattern, terminated as usual by an empty line or end of file. This command may be followed by a modifier list containing only control modifiers that act after a pattern has been compiled. In particular, hex, posix, posix_nosub, push, and pushcopy are not allowed, nor are any option-setting modifiers. The JIT modifiers are, however, permitted. Here is an example that saves and reloads two patterns. /abc/push /xyz/push #save tempfile #load tempfile #pop info xyz #pop jit,bincode abc If jitverify is used with #pop, it does not automatically imply jit, which is different behaviour from when it is used on a pattern. The #popcopy command is analogous to the pushcopy modifier in that it makes current a copy of the topmost stack pattern, leaving the original still on the stack. SEE ALSO pcre2(3), pcre2api(3), pcre2callout(3), pcre2jit(3), pcre2matching(3), pcre2partial(3), pcre2pattern(3), pcre2serialize(3). AUTHOR Philip Hazel Retired from University Computing Service Cambridge, England. REVISION Last updated: 12 October 2025 Copyright (c) 1997-2024 University of Cambridge.
PCRE2 10.48-DEV 12 October 2025 PCRE2TEST(1) ================================================ FILE: doc/pcre2unicode.3 ================================================ .TH PCRE2UNICODE 3 "27 November 2024" "PCRE2 10.48-DEV" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH "UNICODE AND UTF SUPPORT" .rs .sp PCRE2 is normally built with Unicode support, though if you do not need it, you can build it without, in which case the library will be smaller. With Unicode support, PCRE2 has knowledge of Unicode character properties and can process strings of text in UTF-8, UTF-16, and UTF-32 format (depending on the code unit width), but this is not the default. Unless specifically requested, PCRE2 treats each code unit in a string as one character. .P There are two ways of telling PCRE2 to switch to UTF mode, where characters may consist of more than one code unit and the range of values is constrained. The program can call .\" HREF \fBpcre2_compile()\fP .\" with the PCRE2_UTF option, or the pattern may start with the sequence (*UTF). However, the latter facility can be locked out by the PCRE2_NEVER_UTF option. That is, the programmer can prevent the supplier of the pattern from switching to UTF mode. .P Note that the PCRE2_MATCH_INVALID_UTF option (see .\" HTML .\" below) .\" forces PCRE2_UTF to be set. .P In UTF mode, both the pattern and any subject strings that are matched against it are treated as UTF strings instead of strings of individual one-code-unit characters. There are also some other changes to the way characters are handled, as documented below. . . .SH "UNICODE PROPERTY SUPPORT" .rs .sp When PCRE2 is built with Unicode support, the escape sequences \ep{..}, \eP{..}, and \eX can be used. This is not dependent on the PCRE2_UTF setting. The Unicode properties that can be tested are a subset of those that Perl supports. 
Currently they are limited to the general category properties such as Lu for an upper case letter or Nd for a decimal number, the derived properties Any and Lc (synonym L&), the Unicode script names such as Arabic or Han, Bidi_Class, Bidi_Control, and a few binary properties. .P The full lists are given in the .\" HREF \fBpcre2pattern\fP .\" and .\" HREF \fBpcre2syntax\fP .\" documentation. In general, only the short names for properties are supported. For example, \ep{L} matches a letter. Its longer synonym, \ep{Letter}, is not supported. Furthermore, in Perl, many properties may optionally be prefixed by "Is", for compatibility with Perl 5.6. PCRE2 does not support this. . . .SH "WIDE CHARACTERS AND UTF MODES" .rs .sp Code points less than 256 can be specified in patterns by either braced or unbraced hexadecimal escape sequences (for example, \ex{b3} or \exb3). Larger values have to use braced sequences. Unbraced octal code points up to \e777 are also recognized; larger ones can be coded using \eo{...}. .P The escape sequence \eN{U+<hex digits>} is recognized as another way of specifying a Unicode character by code point in a UTF mode. It is not allowed in non-UTF mode. .P In UTF mode, repeat quantifiers apply to complete UTF characters, not to individual code units. .P In UTF mode, the dot metacharacter matches one UTF character instead of a single code unit. .P In UTF mode, capture group names are not restricted to ASCII, and may contain any Unicode letters and decimal digits, as well as underscore. .P The escape sequence \eC can be used to match a single code unit in UTF mode, but its use can lead to some strange effects because it breaks up multi-unit characters (see the description of \eC in the .\" HREF \fBpcre2pattern\fP .\" documentation). For this reason, there is a build-time option that disables support for \eC completely. There is also a less draconian compile-time option for locking out the use of \eC when a pattern is compiled.
.P The use of \eC is not supported by the alternative matching function \fBpcre2_dfa_match()\fP when in UTF-8 or UTF-16 mode, that is, when a character may consist of more than one code unit. The use of \eC in these modes provokes a match-time error. Also, the JIT optimization does not support \eC in these modes. If JIT optimization is requested for a UTF-8 or UTF-16 pattern that contains \eC, it will not succeed, and so when \fBpcre2_match()\fP is called, the matching will be carried out by the interpretive function. .P The character escapes \eb, \eB, \ed, \eD, \es, \eS, \ew, and \eW correctly test characters of any code value, but, by default, the characters that PCRE2 recognizes as digits, spaces, or word characters remain the same set as in non-UTF mode, all with code points less than 256. This remains true even when PCRE2 is built to include Unicode support, because to do otherwise would slow down matching in many common cases. Note that this also applies to \eb and \eB, because they are defined in terms of \ew and \eW. If you want to test for a wider sense of, say, "digit", you can use explicit Unicode property tests such as \ep{Nd}. Alternatively, if you set the PCRE2_UCP option, the way that the character escapes work is changed so that Unicode properties are used to determine which characters match, though there are some options that suppress this for individual escapes. For details see the section on .\" HTML .\" generic character types .\" in the .\" HREF \fBpcre2pattern\fP .\" documentation. .P Like the escapes, characters that match the POSIX named character classes are all low-valued characters unless the PCRE2_UCP option is set, but there is an option to override this. .P In contrast to the character escapes and character classes, the special horizontal and vertical white space escapes (\eh, \eH, \ev, and \eV) do match all the appropriate Unicode characters, whether or not PCRE2_UCP is set. . . 
.SH "UNICODE CASE-EQUIVALENCE" .rs .sp If either PCRE2_UTF or PCRE2_UCP is set, upper/lower case processing makes use of Unicode properties except for characters whose code points are less than 128 and that have at most two case-equivalent values. For these, a direct table lookup is used for speed. A few Unicode characters such as Greek sigma have more than two code points that are case-equivalent, and these are treated specially. Setting PCRE2_UCP without PCRE2_UTF allows Unicode-style case processing for non-UTF character encodings such as UCS-2. .P There are two ASCII characters (S and K) that, in addition to their ASCII lower case equivalents, have a non-ASCII one as well (long S and Kelvin sign). Recognition of these non-ASCII characters as case-equivalent to their ASCII counterparts can be disabled by setting the PCRE2_EXTRA_CASELESS_RESTRICT option. When this is set, all characters in a case equivalence must either be ASCII or non-ASCII; there can be no mixing. .sp Without PCRE2_EXTRA_CASELESS_RESTRICT: 'k' = 'K' = U+212A (Kelvin sign) 's' = 'S' = U+017F (long S) With PCRE2_EXTRA_CASELESS_RESTRICT: 'k' = 'K' U+212A (Kelvin sign) only case-equivalent to itself 's' = 'S' U+017F (long S) only case-equivalent to itself .P One language family, Turkish and Azeri, has its own case-insensitivity rules, which can be selected by setting PCRE2_EXTRA_TURKISH_CASING. This alters the behaviour of the 'i', 'I', U+0130 (capital I with dot above), and U+0131 (small dotless i) characters. .sp Without PCRE2_EXTRA_TURKISH_CASING: 'i' = 'I' U+0130 (capital I with dot above) only case-equivalent to itself U+0131 (small dotless i) only case-equivalent to itself With PCRE2_EXTRA_TURKISH_CASING: 'i' = U+0130 (capital I with dot above) U+0131 (small dotless i) = 'I' .P It is not allowed to specify both PCRE2_EXTRA_CASELESS_RESTRICT and PCRE2_EXTRA_TURKISH_CASING together. 
.P From release 10.45 the Unicode letter properties Lu (upper case), Ll (lower case), and Lt (title case) are all treated as Lc (cased letter) when caseless matching is set by the PCRE2_CASELESS option or (?i) within the pattern. . . .\" HTML .SH "SCRIPT RUNS" .rs .sp The pattern constructs (*script_run:...) and (*atomic_script_run:...), with synonyms (*sr:...) and (*asr:...), verify that the string matched within the parentheses is a script run. In concept, a script run is a sequence of characters that are all from the same Unicode script. However, because some scripts are commonly used together, and because some diacritical and other marks are used with multiple scripts, it is not that simple. .P Every Unicode character has a Script property, mostly with a value corresponding to the name of a script, such as Latin, Greek, or Cyrillic. There are also three special values: .P "Unknown" is used for code points that have not been assigned, and also for the surrogate code points. In the PCRE2 32-bit library, characters whose code points are greater than the Unicode maximum (U+10FFFF), which are accessible only in non-UTF mode, are assigned the Unknown script. .P "Common" is used for characters that are used with many scripts. These include punctuation, emoji, mathematical, musical, and currency symbols, and the ASCII digits 0 to 9. .P "Inherited" is used for characters such as diacritical marks that modify a previous character. These are considered to take on the script of the character that they modify. .P Some Inherited characters are used with many scripts, but many of them are only normally used with a small number of scripts. For example, U+102E0 (Coptic Epact thousands mark) is used only with Arabic and Coptic. In order to make it possible to check this, a Unicode property called Script Extension exists. Its value is a list of scripts that apply to the character. 
For the majority of characters, the list contains just one script, the same one as the Script property. However, for characters such as U+102E0 more than one Script is listed. There are also some Common characters that have a single, non-Common script in their Script Extension list. .P The next section describes the basic rules for deciding whether a given string of characters is a script run. Note, however, that there are some special cases involving the Chinese Han script, and an additional constraint for decimal digits. These are covered in subsequent sections. . . .SS "Basic script run rules" .rs .sp A string that is less than two characters long is a script run. This is the only case in which an Unknown character can be part of a script run. Longer strings are checked using only the Script Extensions property, not the basic Script property. .P If a character's Script Extension property is the single value "Inherited", it is always accepted as part of a script run. This is also true for the property "Common", subject to the checking of decimal digits described below. All the remaining characters in a script run must have at least one script in common in their Script Extension lists. In set-theoretic terminology, the intersection of all the sets of scripts must not be empty. .P A simple example is an Internet name such as "google.com". The letters are all in the Latin script, and the dot is Common, so this string is a script run. However, the Cyrillic letter "o" looks exactly the same as the Latin "o"; a string that looks the same, but with Cyrillic "o"s is not a script run. .P More interesting examples involve characters with more than one script in their Script Extension. Consider the following characters: .sp U+060C Arabic comma U+06D4 Arabic full stop .sp The first has the Script Extension list Arabic, Hanifi Rohingya, Syriac, and Thaana; the second has just Arabic and Hanifi Rohingya. 
Both of them could appear in script runs of either Arabic or Hanifi Rohingya. The first could also appear in Syriac or Thaana script runs, but the second could not. . . .SS "The Chinese Han script" .rs .sp The Chinese Han script is commonly used in conjunction with other scripts for writing certain languages. Japanese uses the Hiragana and Katakana scripts together with Han; Korean uses Hangul and Han; Taiwanese Mandarin uses Bopomofo and Han. These three combinations are treated as special cases when checking script runs and are, in effect, "virtual scripts". Thus, a script run may contain a mixture of Hiragana, Katakana, and Han, or a mixture of Hangul and Han, or a mixture of Bopomofo and Han, but not, for example, a mixture of Hangul and Bopomofo and Han. PCRE2 (like Perl) follows Unicode's Technical Standard 39 ("Unicode Security Mechanisms", http://unicode.org/reports/tr39/) in allowing such mixtures. . . .SS "Decimal digits" .rs .sp Unicode contains many sets of 10 decimal digits in different scripts, and some scripts (including the Common script) contain more than one set. Some of these decimal digits are visually indistinguishable from the common ASCII digits. In addition to the script checking described above, if a script run contains any decimal digits, they must all come from the same set of 10 adjacent characters. . . .SH "VALIDITY OF UTF STRINGS" .rs .sp When the PCRE2_UTF option is set, the strings passed as patterns and subjects are (by default) checked for validity on entry to the relevant functions. If an invalid UTF string is passed, a negative error code is returned. The code unit offset to the offending character can be extracted from the match data block by calling \fBpcre2_get_startchar()\fP, which is used for this purpose after a UTF error.
.P In some situations, you may already know that your strings are valid, and therefore want to skip these checks in order to improve performance, for example in the case of a long subject string that is being scanned repeatedly. If you set the PCRE2_NO_UTF_CHECK option at compile time or at match time, PCRE2 assumes that the pattern or subject it is given (respectively) contains only valid UTF code unit sequences. .P If you pass an invalid UTF string when PCRE2_NO_UTF_CHECK is set, the result is undefined and your program may crash or loop indefinitely or give incorrect results. There is, however, one mode of matching that can handle invalid UTF subject strings. This is enabled by passing PCRE2_MATCH_INVALID_UTF to \fBpcre2_compile()\fP and is discussed below in the next section. The rest of this section covers the case when PCRE2_MATCH_INVALID_UTF is not set. .P Passing PCRE2_NO_UTF_CHECK to \fBpcre2_compile()\fP just disables the UTF check for the pattern; it does not also apply to subject strings. If you want to disable the check for a subject string you must pass this same option to \fBpcre2_match()\fP or \fBpcre2_dfa_match()\fP. .P UTF-16 and UTF-32 strings can indicate their endianness by a special code known as a byte-order mark (BOM). The PCRE2 functions do not handle this, expecting strings to be in host byte order. .P Unless PCRE2_NO_UTF_CHECK is set, a UTF string is checked before any other processing takes place. In the case of \fBpcre2_match()\fP and \fBpcre2_dfa_match()\fP calls with a non-zero starting offset, the check is applied only to that part of the subject that could be inspected during matching, and there is a check that the starting offset points to the first code unit of a character or to the end of the subject. If there are no lookbehind assertions in the pattern, the check starts at the starting offset.
Otherwise, it starts at the length of the longest lookbehind before the starting offset, or at the start of the subject if there are not that many characters before the starting offset. Note that the sequences \eb and \eB are one-character lookbehinds. .P In addition to checking the format of the string, there is a check to ensure that all code points lie in the range U+0 to U+10FFFF, excluding the surrogate area. The so-called "non-character" code points are not excluded because Unicode corrigendum #9 makes it clear that they should not be. .P Characters in the "Surrogate Area" of Unicode are reserved for use by UTF-16, where they are used in pairs to encode code points with values greater than 0xFFFF. The code points that are encoded by UTF-16 pairs are available independently in the UTF-8 and UTF-32 encodings. (In other words, the whole surrogate thing is a fudge for UTF-16 which unfortunately messes up UTF-8 and UTF-32.) .P Setting PCRE2_NO_UTF_CHECK at compile time does not disable the error that is given if an escape sequence for an invalid Unicode code point is encountered in the pattern. If you want to allow escape sequences such as \ex{d800} (a surrogate code point) you can set the PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES extra option. However, this is possible only in UTF-8 and UTF-32 modes, because these values are not representable in UTF-16. . . .\" HTML .SS "Errors in UTF-8 strings" .rs .sp The following negative error codes are given for invalid UTF-8 strings: .sp PCRE2_ERROR_UTF8_ERR1 PCRE2_ERROR_UTF8_ERR2 PCRE2_ERROR_UTF8_ERR3 PCRE2_ERROR_UTF8_ERR4 PCRE2_ERROR_UTF8_ERR5 .sp The string ends with a truncated UTF-8 character; the code specifies how many bytes are missing (1 to 5). Although RFC 3629 restricts UTF-8 characters to be no longer than 4 bytes, the encoding scheme (originally defined by RFC 2279) allows for up to 6 bytes, and this is checked first; hence the possibility of 4 or 5 missing bytes. 
.sp PCRE2_ERROR_UTF8_ERR6 PCRE2_ERROR_UTF8_ERR7 PCRE2_ERROR_UTF8_ERR8 PCRE2_ERROR_UTF8_ERR9 PCRE2_ERROR_UTF8_ERR10 .sp The two most significant bits of the 2nd, 3rd, 4th, 5th, or 6th byte of the character do not have the binary value 0b10 (that is, either the most significant bit is 0, or the next bit is 1). .sp PCRE2_ERROR_UTF8_ERR11 PCRE2_ERROR_UTF8_ERR12 .sp A character that is valid by the RFC 2279 rules is either 5 or 6 bytes long; these code points are excluded by RFC 3629. .sp PCRE2_ERROR_UTF8_ERR13 .sp A 4-byte character has a value greater than 0x10ffff; these code points are excluded by RFC 3629. .sp PCRE2_ERROR_UTF8_ERR14 .sp A 3-byte character has a value in the range 0xd800 to 0xdfff; this range of code points are reserved by RFC 3629 for use with UTF-16, and so are excluded from UTF-8. .sp PCRE2_ERROR_UTF8_ERR15 PCRE2_ERROR_UTF8_ERR16 PCRE2_ERROR_UTF8_ERR17 PCRE2_ERROR_UTF8_ERR18 PCRE2_ERROR_UTF8_ERR19 .sp A 2-, 3-, 4-, 5-, or 6-byte character is "overlong", that is, it codes for a value that can be represented by fewer bytes, which is invalid. For example, the two bytes 0xc0, 0xae give the value 0x2e, whose correct coding uses just one byte. .sp PCRE2_ERROR_UTF8_ERR20 .sp The two most significant bits of the first byte of a character have the binary value 0b10 (that is, the most significant bit is 1 and the second is 0). Such a byte can only validly occur as the second or subsequent byte of a multi-byte character. .sp PCRE2_ERROR_UTF8_ERR21 .sp The first byte of a character has the value 0xfe or 0xff. These values can never occur in a valid UTF-8 string. . . .\" HTML .SS "Errors in UTF-16 strings" .rs .sp The following negative error codes are given for invalid UTF-16 strings: .sp PCRE2_ERROR_UTF16_ERR1 Missing low surrogate at end of string PCRE2_ERROR_UTF16_ERR2 Invalid low surrogate follows high surrogate PCRE2_ERROR_UTF16_ERR3 Isolated low surrogate .sp . . 
.\" HTML .SS "Errors in UTF-32 strings" .rs .sp The following negative error codes are given for invalid UTF-32 strings: .sp PCRE2_ERROR_UTF32_ERR1 Surrogate character (0xd800 to 0xdfff) PCRE2_ERROR_UTF32_ERR2 Code point is greater than 0x10ffff .sp . . .\" HTML .SH "MATCHING IN INVALID UTF STRINGS" .rs .sp You can run pattern matches on subject strings that may contain invalid UTF sequences if you call \fBpcre2_compile()\fP with the PCRE2_MATCH_INVALID_UTF option. This is supported by \fBpcre2_match()\fP, including JIT matching, but not by \fBpcre2_dfa_match()\fP. When PCRE2_MATCH_INVALID_UTF is set, it forces PCRE2_UTF to be set as well. Note, however, that the pattern itself must be a valid UTF string. .P If you do not set PCRE2_MATCH_INVALID_UTF when calling \fBpcre2_compile\fP, and you are not certain that your subject strings are valid UTF sequences, you should not make use of the JIT "fast path" function \fBpcre2_jit_match()\fP because it bypasses sanity checks, including the one for UTF validity. An invalid string may cause undefined behaviour, including looping, crashing, or giving the wrong answer. .P Setting PCRE2_MATCH_INVALID_UTF does not affect what \fBpcre2_compile()\fP generates, but if \fBpcre2_jit_compile()\fP is subsequently called, it does generate different code. If JIT is not used, the option affects the behaviour of the interpretive code in \fBpcre2_match()\fP. When PCRE2_MATCH_INVALID_UTF is set at compile time, PCRE2_NO_UTF_CHECK is ignored at match time. .P In this mode, an invalid code unit sequence in the subject never matches any pattern item. It does not match dot, it does not match \ep{Any}, it does not even match negative items such as [^X]. A lookbehind assertion fails if it encounters an invalid sequence while moving the current point backwards. In other words, an invalid UTF code unit sequence acts as a barrier which no match can cross. 
.P You can also think of this as the subject being split up into fragments of valid UTF, delimited internally by invalid code unit sequences. The pattern is matched fragment by fragment. The result of a successful match, however, is given as code unit offsets in the entire subject string in the usual way. There are a few points to consider: .P The internal boundaries are not interpreted as the beginnings or ends of lines and so do not match circumflex or dollar characters in the pattern. .P If \fBpcre2_match()\fP is called with an offset that points to an invalid UTF-sequence, that sequence is skipped, and the match starts at the next valid UTF character, or the end of the subject. .P At internal fragment boundaries, \eb and \eB behave in the same way as at the beginning and end of the subject. For example, a sequence such as \ebWORD\eb would match an instance of WORD that is surrounded by invalid UTF code units. .P Using PCRE2_MATCH_INVALID_UTF, an application can run matches on arbitrary data, knowing that any matched strings that are returned are valid UTF. This can be useful when searching for UTF text in executable or other binary files. .P Note, however, that the 16-bit and 32-bit PCRE2 libraries process strings as sequences of uint16_t or uint32_t code points. They cannot find valid UTF sequences within an arbitrary string of bytes unless such sequences are suitably aligned. . . .SH AUTHOR .rs .sp .nf Philip Hazel Retired from University Computing Service Cambridge, England. .fi . . .SH REVISION .rs .sp .nf Last updated: 27 November 2024 Copyright (c) 1997-2024 University of Cambridge. 
.fi ================================================ FILE: libpcre2-16.pc.in ================================================ # Package Information for pkg-config prefix=@prefix@ exec_prefix=@exec_prefix@ libdir=@libdir@ includedir=@includedir@ Name: libpcre2-16 Description: PCRE2 - Perl compatible regular expressions C library (2nd API) with 16 bit character support Version: @PACKAGE_VERSION@ License: BSD-3-Clause WITH PCRE2-exception Libs: -L${libdir} -lpcre2-16@LIB_POSTFIX@ Libs.private: @PTHREAD_CFLAGS@ @PTHREAD_LIBS@ Cflags: -I${includedir} @PCRE2_STATIC_CFLAG@ ================================================ FILE: libpcre2-32.pc.in ================================================ # Package Information for pkg-config prefix=@prefix@ exec_prefix=@exec_prefix@ libdir=@libdir@ includedir=@includedir@ Name: libpcre2-32 Description: PCRE2 - Perl compatible regular expressions C library (2nd API) with 32 bit character support Version: @PACKAGE_VERSION@ License: BSD-3-Clause WITH PCRE2-exception Libs: -L${libdir} -lpcre2-32@LIB_POSTFIX@ Libs.private: @PTHREAD_CFLAGS@ @PTHREAD_LIBS@ Cflags: -I${includedir} @PCRE2_STATIC_CFLAG@ ================================================ FILE: libpcre2-8.pc.in ================================================ # Package Information for pkg-config prefix=@prefix@ exec_prefix=@exec_prefix@ libdir=@libdir@ includedir=@includedir@ Name: libpcre2-8 Description: PCRE2 - Perl compatible regular expressions C library (2nd API) with 8 bit character support Version: @PACKAGE_VERSION@ License: BSD-3-Clause WITH PCRE2-exception Libs: -L${libdir} -lpcre2-8@LIB_POSTFIX@ Libs.private: @PTHREAD_CFLAGS@ @PTHREAD_LIBS@ Cflags: -I${includedir} @PCRE2_STATIC_CFLAG@ ================================================ FILE: libpcre2-posix.pc.in ================================================ # Package Information for pkg-config prefix=@prefix@ exec_prefix=@exec_prefix@ libdir=@libdir@ includedir=@includedir@ Name: libpcre2-posix Description: Posix 
compatible interface to libpcre2-8 Version: @PACKAGE_VERSION@ License: BSD-3-Clause WITH PCRE2-exception Libs: -L${libdir} -lpcre2-posix@LIB_POSTFIX@ Cflags: -I${includedir} @PCRE2POSIX_CFLAG@ Requires.private: libpcre2-8 ================================================ FILE: m4/ax_pthread.m4 ================================================ # =========================================================================== # https://www.gnu.org/software/autoconf-archive/ax_pthread.html # =========================================================================== # # SYNOPSIS # # AX_PTHREAD([ACTION-IF-FOUND[, ACTION-IF-NOT-FOUND]]) # # DESCRIPTION # # This macro figures out how to build C programs using POSIX threads. It # sets the PTHREAD_LIBS output variable to the threads library and linker # flags, and the PTHREAD_CFLAGS output variable to any special C compiler # flags that are needed. (The user can also force certain compiler # flags/libs to be tested by setting these environment variables.) # # Also sets PTHREAD_CC and PTHREAD_CXX to any special C compiler that is # needed for multi-threaded programs (defaults to the value of CC # respectively CXX otherwise). (This is necessary on e.g. AIX to use the # special cc_r/CC_r compiler alias.) # # NOTE: You are assumed to not only compile your program with these flags, # but also to link with them as well. For example, you might link with # $PTHREAD_CC $CFLAGS $PTHREAD_CFLAGS $LDFLAGS ... $PTHREAD_LIBS $LIBS # $PTHREAD_CXX $CXXFLAGS $PTHREAD_CFLAGS $LDFLAGS ... $PTHREAD_LIBS $LIBS # # If you are only building threaded programs, you may wish to use these # variables in your default LIBS, CFLAGS, and CC: # # LIBS="$PTHREAD_LIBS $LIBS" # CFLAGS="$CFLAGS $PTHREAD_CFLAGS" # CXXFLAGS="$CXXFLAGS $PTHREAD_CFLAGS" # CC="$PTHREAD_CC" # CXX="$PTHREAD_CXX" # # In addition, if the PTHREAD_CREATE_JOINABLE thread-attribute constant # has a nonstandard name, this macro defines PTHREAD_CREATE_JOINABLE to # that name (e.g. 
PTHREAD_CREATE_UNDETACHED on AIX). # # Also HAVE_PTHREAD_PRIO_INHERIT is defined if pthread is found and the # PTHREAD_PRIO_INHERIT symbol is defined when compiling with # PTHREAD_CFLAGS. # # ACTION-IF-FOUND is a list of shell commands to run if a threads library # is found, and ACTION-IF-NOT-FOUND is a list of commands to run if it # is not found. If ACTION-IF-FOUND is not specified, the default action # will define HAVE_PTHREAD. # # Please let the authors know if this macro fails on any platform, or if # you have any other suggestions or comments. This macro was based on work # by SGJ on autoconf scripts for FFTW (http://www.fftw.org/) (with help # from M. Frigo), as well as ac_pthread and hb_pthread macros posted by # Alejandro Forero Cuervo to the autoconf macro repository. We are also # grateful for the helpful feedback of numerous users. # # Updated for Autoconf 2.68 by Daniel Richard G. # # LICENSE # # Copyright (c) 2008 Steven G. Johnson # Copyright (c) 2011 Daniel Richard G. # Copyright (c) 2019 Marc Stevens # # This program is free software: you can redistribute it and/or modify it # under the terms of the GNU General Public License as published by the # Free Software Foundation, either version 3 of the License, or (at your # option) any later version. # # This program is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General # Public License for more details. # # You should have received a copy of the GNU General Public License along # with this program. If not, see <https://www.gnu.org/licenses/>. # # As a special exception, the respective Autoconf Macro's copyright owner # gives unlimited permission to copy, distribute and modify the configure # scripts that are the output of Autoconf when processing the Macro.
You # need not follow the terms of the GNU General Public License when using # or distributing such scripts, even though portions of the text of the # Macro appear in them. The GNU General Public License (GPL) does govern # all other use of the material that constitutes the Autoconf Macro. # # This special exception to the GPL applies to versions of the Autoconf # Macro released by the Autoconf Archive. When you make and distribute a # modified version of the Autoconf Macro, you may extend this special # exception to the GPL to apply to your modified version as well. #serial 31 AU_ALIAS([ACX_PTHREAD], [AX_PTHREAD]) AC_DEFUN([AX_PTHREAD], [ AC_REQUIRE([AC_CANONICAL_HOST]) AC_REQUIRE([AC_PROG_CC]) AC_REQUIRE([AC_PROG_SED]) AC_LANG_PUSH([C]) ax_pthread_ok=no # We used to check for pthread.h first, but this fails if pthread.h # requires special compiler flags (e.g. on Tru64 or Sequent). # It gets checked for in the link test anyway. # First of all, check if the user has set any of the PTHREAD_LIBS, # etcetera environment variables, and if threads linking works using # them: if test "x$PTHREAD_CFLAGS$PTHREAD_LIBS" != "x"; then ax_pthread_save_CC="$CC" ax_pthread_save_CFLAGS="$CFLAGS" ax_pthread_save_LIBS="$LIBS" AS_IF([test "x$PTHREAD_CC" != "x"], [CC="$PTHREAD_CC"]) AS_IF([test "x$PTHREAD_CXX" != "x"], [CXX="$PTHREAD_CXX"]) CFLAGS="$CFLAGS $PTHREAD_CFLAGS" LIBS="$PTHREAD_LIBS $LIBS" AC_MSG_CHECKING([for pthread_join using $CC $PTHREAD_CFLAGS $PTHREAD_LIBS]) AC_LINK_IFELSE([AC_LANG_CALL([], [pthread_join])], [ax_pthread_ok=yes]) AC_MSG_RESULT([$ax_pthread_ok]) if test "x$ax_pthread_ok" = "xno"; then PTHREAD_LIBS="" PTHREAD_CFLAGS="" fi CC="$ax_pthread_save_CC" CFLAGS="$ax_pthread_save_CFLAGS" LIBS="$ax_pthread_save_LIBS" fi # We must check for the threads library under a number of different # names; the ordering is very important because some systems # (e.g. DEC) have both -lpthread and -lpthreads, where one of the # libraries is broken (non-POSIX). 
# Create a list of thread flags to try. Items with a "," contain both # C compiler flags (before ",") and linker flags (after ","). Other items # starting with a "-" are C compiler flags, and remaining items are # library names, except for "none" which indicates that we try without # any flags at all, and "pthread-config" which is a program returning # the flags for the Pth emulation library. ax_pthread_flags="pthreads none -Kthread -pthread -pthreads -mthreads pthread --thread-safe -mt pthread-config" # The ordering *is* (sometimes) important. Some notes on the # individual items follow: # pthreads: AIX (must check this before -lpthread) # none: in case threads are in libc; should be tried before -Kthread and # other compiler flags to prevent continual compiler warnings # -Kthread: Sequent (threads in libc, but -Kthread needed for pthread.h) # -pthread: Linux/gcc (kernel threads), BSD/gcc (userland threads), Tru64 # (Note: HP C rejects this with "bad form for `-t' option") # -pthreads: Solaris/gcc (Note: HP C also rejects) # -mt: Sun Workshop C (may only link SunOS threads [-lthread], but it # doesn't hurt to check since this sometimes defines pthreads and # -D_REENTRANT too), HP C (must be checked before -lpthread, which # is present but should not be used directly; and before -mthreads, # because the compiler interprets this as "-mt" + "-hreads") # -mthreads: Mingw32/gcc, Lynx/gcc # pthread: Linux, etcetera # --thread-safe: KAI C++ # pthread-config: use pthread-config program (for GNU Pth library) case $host_os in freebsd*) # -kthread: FreeBSD kernel threads (preferred to -pthread since SMP-able) # lthread: LinuxThreads port on FreeBSD (also preferred to -pthread) ax_pthread_flags="-kthread lthread $ax_pthread_flags" ;; hpux*) # From the cc(1) man page: "[-mt] Sets various -D flags to enable # multi-threading and also sets -lpthread." 
ax_pthread_flags="-mt -pthread pthread $ax_pthread_flags" ;; openedition*) # IBM z/OS requires a feature-test macro to be defined in order to # enable POSIX threads at all, so give the user a hint if this is # not set. (We don't define these ourselves, as they can affect # other portions of the system API in unpredictable ways.) AC_EGREP_CPP([AX_PTHREAD_ZOS_MISSING], [ # if !defined(_OPEN_THREADS) && !defined(_UNIX03_THREADS) AX_PTHREAD_ZOS_MISSING # endif ], [AC_MSG_WARN([IBM z/OS requires -D_OPEN_THREADS or -D_UNIX03_THREADS to enable pthreads support.])]) ;; solaris*) # On Solaris (at least, for some versions), libc contains stubbed # (non-functional) versions of the pthreads routines, so link-based # tests will erroneously succeed. (N.B.: The stubs are missing # pthread_cleanup_push, or rather a function called by this macro, # so we could check for that, but who knows whether they'll stub # that too in a future libc.) So we'll check first for the # standard Solaris way of linking pthreads (-mt -lpthread). ax_pthread_flags="-mt,-lpthread pthread $ax_pthread_flags" ;; esac # Are we compiling with Clang? AC_CACHE_CHECK([whether $CC is Clang], [ax_cv_PTHREAD_CLANG], [ax_cv_PTHREAD_CLANG=no # Note that Autoconf sets GCC=yes for Clang as well as GCC if test "x$GCC" = "xyes"; then AC_EGREP_CPP([AX_PTHREAD_CC_IS_CLANG], [/* Note: Clang 2.7 lacks __clang_[a-z]+__ */ # if defined(__clang__) && defined(__llvm__) AX_PTHREAD_CC_IS_CLANG # endif ], [ax_cv_PTHREAD_CLANG=yes]) fi ]) ax_pthread_clang="$ax_cv_PTHREAD_CLANG" # GCC generally uses -pthread, or -pthreads on some platforms (e.g. SPARC) # Note that for GCC and Clang -pthread generally implies -lpthread, # except when -nostdlib is passed. 
# This is problematic using libtool to build C++ shared libraries with pthread: # [1] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=25460 # [2] https://bugzilla.redhat.com/show_bug.cgi?id=661333 # [3] https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=468555 # To solve this, first try -pthread together with -lpthread for GCC AS_IF([test "x$GCC" = "xyes"], [ax_pthread_flags="-pthread,-lpthread -pthread -pthreads $ax_pthread_flags"]) # Clang takes -pthread (never supported any other flag), but we'll try with -lpthread first AS_IF([test "x$ax_pthread_clang" = "xyes"], [ax_pthread_flags="-pthread,-lpthread -pthread"]) # The presence of a feature test macro requesting re-entrant function # definitions is, on some systems, a strong hint that pthreads support is # correctly enabled case $host_os in darwin* | hpux* | linux* | osf* | solaris*) ax_pthread_check_macro="_REENTRANT" ;; aix*) ax_pthread_check_macro="_THREAD_SAFE" ;; *) ax_pthread_check_macro="--" ;; esac AS_IF([test "x$ax_pthread_check_macro" = "x--"], [ax_pthread_check_cond=0], [ax_pthread_check_cond="!defined($ax_pthread_check_macro)"]) if test "x$ax_pthread_ok" = "xno"; then for ax_pthread_try_flag in $ax_pthread_flags; do case $ax_pthread_try_flag in none) AC_MSG_CHECKING([whether pthreads work without any flags]) ;; *,*) PTHREAD_CFLAGS=`echo $ax_pthread_try_flag | sed "s/^\(.*\),\(.*\)$/\1/"` PTHREAD_LIBS=`echo $ax_pthread_try_flag | sed "s/^\(.*\),\(.*\)$/\2/"` AC_MSG_CHECKING([whether pthreads work with "$PTHREAD_CFLAGS" and "$PTHREAD_LIBS"]) ;; -*) AC_MSG_CHECKING([whether pthreads work with $ax_pthread_try_flag]) PTHREAD_CFLAGS="$ax_pthread_try_flag" ;; pthread-config) AC_CHECK_PROG([ax_pthread_config], [pthread-config], [yes], [no]) AS_IF([test "x$ax_pthread_config" = "xno"], [continue]) PTHREAD_CFLAGS="`pthread-config --cflags`" PTHREAD_LIBS="`pthread-config --ldflags` `pthread-config --libs`" ;; *) AC_MSG_CHECKING([for the pthreads library -l$ax_pthread_try_flag]) PTHREAD_LIBS="-l$ax_pthread_try_flag" 
;; esac ax_pthread_save_CFLAGS="$CFLAGS" ax_pthread_save_LIBS="$LIBS" CFLAGS="$CFLAGS $PTHREAD_CFLAGS" LIBS="$PTHREAD_LIBS $LIBS" # Check for various functions. We must include pthread.h, # since some functions may be macros. (On the Sequent, we # need a special flag -Kthread to make this header compile.) # We check for pthread_join because it is in -lpthread on IRIX # while pthread_create is in libc. We check for pthread_attr_init # due to DEC craziness with -lpthreads. We check for # pthread_cleanup_push because it is one of the few pthread # functions on Solaris that doesn't have a non-functional libc stub. # We try pthread_create on general principles. AC_LINK_IFELSE([AC_LANG_PROGRAM([#include # if $ax_pthread_check_cond # error "$ax_pthread_check_macro must be defined" # endif static void *some_global = NULL; static void routine(void *a) { /* To avoid any unused-parameter or unused-but-set-parameter warning. */ some_global = a; } static void *start_routine(void *a) { return a; }], [pthread_t th; pthread_attr_t attr; pthread_create(&th, 0, start_routine, 0); pthread_join(th, 0); pthread_attr_init(&attr); pthread_cleanup_push(routine, 0); pthread_cleanup_pop(0) /* ; */])], [ax_pthread_ok=yes], []) CFLAGS="$ax_pthread_save_CFLAGS" LIBS="$ax_pthread_save_LIBS" AC_MSG_RESULT([$ax_pthread_ok]) AS_IF([test "x$ax_pthread_ok" = "xyes"], [break]) PTHREAD_LIBS="" PTHREAD_CFLAGS="" done fi # Clang needs special handling, because older versions handle the -pthread # option in a rather... idiosyncratic way if test "x$ax_pthread_clang" = "xyes"; then # Clang takes -pthread; it has never supported any other flag # (Note 1: This will need to be revisited if a system that Clang # supports has POSIX threads in a separate library. This tends not # to be the way of modern systems, but it's conceivable.) # (Note 2: On some systems, notably Darwin, -pthread is not needed # to get POSIX threads support; the API is always present and # active. 
We could reasonably leave PTHREAD_CFLAGS empty. But # -pthread does define _REENTRANT, and while the Darwin headers # ignore this macro, third-party headers might not.) # However, older versions of Clang make a point of warning the user # that, in an invocation where only linking and no compilation is # taking place, the -pthread option has no effect ("argument unused # during compilation"). They expect -pthread to be passed in only # when source code is being compiled. # # Problem is, this is at odds with the way Automake and most other # C build frameworks function, which is that the same flags used in # compilation (CFLAGS) are also used in linking. Many systems # supported by AX_PTHREAD require exactly this for POSIX threads # support, and in fact it is often not straightforward to specify a # flag that is used only in the compilation phase and not in # linking. Such a scenario is extremely rare in practice. # # Even though use of the -pthread flag in linking would only print # a warning, this can be a nuisance for well-run software projects # that build with -Werror. So if the active version of Clang has # this misfeature, we search for an option to squash it. 
AC_CACHE_CHECK([whether Clang needs flag to prevent "argument unused" warning when linking with -pthread], [ax_cv_PTHREAD_CLANG_NO_WARN_FLAG], [ax_cv_PTHREAD_CLANG_NO_WARN_FLAG=unknown # Create an alternate version of $ac_link that compiles and # links in two steps (.c -> .o, .o -> exe) instead of one # (.c -> exe), because the warning occurs only in the second # step ax_pthread_save_ac_link="$ac_link" ax_pthread_sed='s/conftest\.\$ac_ext/conftest.$ac_objext/g' ax_pthread_link_step=`AS_ECHO(["$ac_link"]) | sed "$ax_pthread_sed"` ax_pthread_2step_ac_link="($ac_compile) && (echo ==== >&5) && ($ax_pthread_link_step)" ax_pthread_save_CFLAGS="$CFLAGS" for ax_pthread_try in '' -Qunused-arguments -Wno-unused-command-line-argument unknown; do AS_IF([test "x$ax_pthread_try" = "xunknown"], [break]) CFLAGS="-Werror -Wunknown-warning-option $ax_pthread_try -pthread $ax_pthread_save_CFLAGS" ac_link="$ax_pthread_save_ac_link" AC_LINK_IFELSE([AC_LANG_SOURCE([[int main(void){return 0;}]])], [ac_link="$ax_pthread_2step_ac_link" AC_LINK_IFELSE([AC_LANG_SOURCE([[int main(void){return 0;}]])], [break]) ]) done ac_link="$ax_pthread_save_ac_link" CFLAGS="$ax_pthread_save_CFLAGS" AS_IF([test "x$ax_pthread_try" = "x"], [ax_pthread_try=no]) ax_cv_PTHREAD_CLANG_NO_WARN_FLAG="$ax_pthread_try" ]) case "$ax_cv_PTHREAD_CLANG_NO_WARN_FLAG" in no | unknown) ;; *) PTHREAD_CFLAGS="$ax_cv_PTHREAD_CLANG_NO_WARN_FLAG $PTHREAD_CFLAGS" ;; esac fi # $ax_pthread_clang = yes # Various other checks: if test "x$ax_pthread_ok" = "xyes"; then ax_pthread_save_CFLAGS="$CFLAGS" ax_pthread_save_LIBS="$LIBS" CFLAGS="$CFLAGS $PTHREAD_CFLAGS" LIBS="$PTHREAD_LIBS $LIBS" # Detect AIX lossage: JOINABLE attribute is called UNDETACHED. 
AC_CACHE_CHECK([for joinable pthread attribute], [ax_cv_PTHREAD_JOINABLE_ATTR], [ax_cv_PTHREAD_JOINABLE_ATTR=unknown for ax_pthread_attr in PTHREAD_CREATE_JOINABLE PTHREAD_CREATE_UNDETACHED; do AC_LINK_IFELSE([AC_LANG_PROGRAM([#include ], [int attr = $ax_pthread_attr; return attr /* ; */])], [ax_cv_PTHREAD_JOINABLE_ATTR=$ax_pthread_attr; break], []) done ]) AS_IF([test "x$ax_cv_PTHREAD_JOINABLE_ATTR" != "xunknown" && \ test "x$ax_cv_PTHREAD_JOINABLE_ATTR" != "xPTHREAD_CREATE_JOINABLE" && \ test "x$ax_pthread_joinable_attr_defined" != "xyes"], [AC_DEFINE_UNQUOTED([PTHREAD_CREATE_JOINABLE], [$ax_cv_PTHREAD_JOINABLE_ATTR], [Define to necessary symbol if this constant uses a non-standard name on your system.]) ax_pthread_joinable_attr_defined=yes ]) AC_CACHE_CHECK([whether more special flags are required for pthreads], [ax_cv_PTHREAD_SPECIAL_FLAGS], [ax_cv_PTHREAD_SPECIAL_FLAGS=no case $host_os in solaris*) ax_cv_PTHREAD_SPECIAL_FLAGS="-D_POSIX_PTHREAD_SEMANTICS" ;; esac ]) AS_IF([test "x$ax_cv_PTHREAD_SPECIAL_FLAGS" != "xno" && \ test "x$ax_pthread_special_flags_added" != "xyes"], [PTHREAD_CFLAGS="$ax_cv_PTHREAD_SPECIAL_FLAGS $PTHREAD_CFLAGS" ax_pthread_special_flags_added=yes]) AC_CACHE_CHECK([for PTHREAD_PRIO_INHERIT], [ax_cv_PTHREAD_PRIO_INHERIT], [AC_LINK_IFELSE([AC_LANG_PROGRAM([[#include ]], [[int i = PTHREAD_PRIO_INHERIT; return i;]])], [ax_cv_PTHREAD_PRIO_INHERIT=yes], [ax_cv_PTHREAD_PRIO_INHERIT=no]) ]) AS_IF([test "x$ax_cv_PTHREAD_PRIO_INHERIT" = "xyes" && \ test "x$ax_pthread_prio_inherit_defined" != "xyes"], [AC_DEFINE([HAVE_PTHREAD_PRIO_INHERIT], [1], [Have PTHREAD_PRIO_INHERIT.]) ax_pthread_prio_inherit_defined=yes ]) CFLAGS="$ax_pthread_save_CFLAGS" LIBS="$ax_pthread_save_LIBS" # More AIX lossage: compile with *_r variant if test "x$GCC" != "xyes"; then case $host_os in aix*) AS_CASE(["x/$CC"], [x*/c89|x*/c89_128|x*/c99|x*/c99_128|x*/cc|x*/cc128|x*/xlc|x*/xlc_v6|x*/xlc128|x*/xlc128_v6], [#handle absolute path differently from PATH based program lookup 
AS_CASE(["x$CC"], [x/*], [ AS_IF([AS_EXECUTABLE_P([${CC}_r])],[PTHREAD_CC="${CC}_r"]) AS_IF([test "x${CXX}" != "x"], [AS_IF([AS_EXECUTABLE_P([${CXX}_r])],[PTHREAD_CXX="${CXX}_r"])]) ], [ AC_CHECK_PROGS([PTHREAD_CC],[${CC}_r],[$CC]) AS_IF([test "x${CXX}" != "x"], [AC_CHECK_PROGS([PTHREAD_CXX],[${CXX}_r],[$CXX])]) ] ) ]) ;; esac fi fi test -n "$PTHREAD_CC" || PTHREAD_CC="$CC" test -n "$PTHREAD_CXX" || PTHREAD_CXX="$CXX" AC_SUBST([PTHREAD_LIBS]) AC_SUBST([PTHREAD_CFLAGS]) AC_SUBST([PTHREAD_CC]) AC_SUBST([PTHREAD_CXX]) # Finally, execute ACTION-IF-FOUND/ACTION-IF-NOT-FOUND: if test "x$ax_pthread_ok" = "xyes"; then ifelse([$1],,[AC_DEFINE([HAVE_PTHREAD],[1],[Define if you have POSIX threads libraries and header files.])],[$1]) : else ax_pthread_ok=no $2 fi AC_LANG_POP ])dnl AX_PTHREAD ================================================ FILE: m4/pcre2_check_vscript.m4 ================================================ # =========================================================================== # https://www.gnu.org/software/autoconf-archive/ax_check_vscript.html # =========================================================================== # # Our pcre2_check_vscript.m4 is derived from the upstream ax_check_vscript.m4, # with several modifications. # # The original upstream file requires the following notice: # # LICENSE # # Copyright (c) 2014 Kevin Cernekee # # Copying and distribution of this file, with or without modification, are # permitted in any medium without royalty provided the copyright notice # and this notice are preserved. This file is offered as-is, without any # warranty. #serial 2.99 PCRE2 # _PCRE2_CHECK_VSCRIPT_SHLIB(flag, map-contents, action-if-link-succeeds) # Build a shared library with the given linker flag and map file contents. # This properly tests version script support by building a shared library # rather than an executable, avoiding issues with executable-specific symbols # (e.g. FreeBSD's crt1.o symbols, Solaris linker symbols in values-Xc.o). 
# Uses libtool for portability across different platforms. AC_DEFUN([_PCRE2_CHECK_VSCRIPT_SHLIB], [ rm -rf conftest.vscript mkdir conftest.vscript cat > conftest.vscript/conftest.c <<_ACEOF int hidethis(void) { return 0; } int exposethis(void) { return hidethis(); } _ACEOF echo "$2" > conftest.vscript/conftest.map _pcre2_abs_top_builddir="$ac_pwd" _pcre2_vscript_libtool="$SHELL $_pcre2_abs_top_builddir/libtool" _pcre2_vscript_cc="$CC" _pcre2_vscript_compile_flags="$CFLAGS $CPPFLAGS" _pcre2_vscript_ld_flags="$CFLAGS $LDFLAGS" _pcre2_vscript_script_flag="$1" export _pcre2_vscript_libtool export _pcre2_vscript_cc export _pcre2_vscript_compile_flags export _pcre2_vscript_ld_flags export _pcre2_vscript_script_flag AS_IF([(cd conftest.vscript && \ $_pcre2_vscript_libtool --tag=CC --mode=compile $_pcre2_vscript_cc $_pcre2_vscript_compile_flags -c -o conftest.lo conftest.c && \ $_pcre2_vscript_libtool --tag=CC --mode=link $_pcre2_vscript_cc $_pcre2_vscript_ld_flags -o libconftest.la conftest.lo -rpath /usr/lib -Wl,$_pcre2_vscript_script_flag,conftest.map) >&AS_MESSAGE_LOG_FD 2>&1], [$3]) unset _pcre2_vscript_libtool unset _pcre2_vscript_cc unset _pcre2_vscript_compile_flags unset _pcre2_vscript_ld_flags unset _pcre2_vscript_script_flag rm -rf conftest.vscript ]) dnl _PCRE2_CHECK_VSCRIPT_SHLIB AC_DEFUN([PCRE2_CHECK_VSCRIPT], [ AC_ARG_ENABLE([symvers], AS_HELP_STRING([--disable-symvers], [disable library symbol versioning [default=auto]]), [want_symvers=$enableval], [want_symvers=yes] ) AS_IF([test x$want_symvers = xyes], [ dnl First, test --version-script and -M with a simple wildcard. 
AC_CACHE_CHECK([linker version script flag], pcre2_cv_check_vscript_flag, [ pcre2_cv_check_vscript_flag=unsupported _PCRE2_CHECK_VSCRIPT_SHLIB([--version-script], [PCRE2_10.00 { global: exposethis; local: *; };], [pcre2_cv_check_vscript_flag=--version-script]) AS_IF([test x$pcre2_cv_check_vscript_flag = xunsupported], [ _PCRE2_CHECK_VSCRIPT_SHLIB([-M], [PCRE2_10.00 { global: exposethis; local: *; };], [pcre2_cv_check_vscript_flag=-M]) ]) dnl The linker may interpret -M (no argument) as "produce a load map." dnl If "-M conftest.map" doesn't fail when conftest.map contains dnl obvious syntax errors, assume this is the case. AS_IF([test x$pcre2_cv_check_vscript_flag != xunsupported], [ _PCRE2_CHECK_VSCRIPT_SHLIB([$pcre2_cv_check_vscript_flag], [PCRE2_10.00 { global: exposethis; local: *; }; {], [pcre2_cv_check_vscript_flag=unsupported]) ]) ]) AS_IF([test x$pcre2_cv_check_vscript_flag != xunsupported], [ dnl Test without wildcard - for detecting Solaris, which requires the dnl wildcard (or else a much more complex and brittle configuration). AC_CACHE_CHECK([if version scripts work without wildcard], pcre2_cv_check_vscript_no_star, [ pcre2_cv_check_vscript_no_star=no _PCRE2_CHECK_VSCRIPT_SHLIB([$pcre2_cv_check_vscript_flag], [PCRE2_10.00 { global: exposethis; local: hidethis; };], [pcre2_cv_check_vscript_no_star=yes]) ]) pcre2_check_vscript_flag=$pcre2_cv_check_vscript_flag pcre2_check_vscript_no_star=$pcre2_cv_check_vscript_no_star ], [ pcre2_check_vscript_flag= pcre2_check_vscript_no_star=no ]) ], [ AC_MSG_CHECKING([linker version script flag]) AC_MSG_RESULT([disabled]) pcre2_check_vscript_flag= pcre2_check_vscript_no_star=no ]) ]) dnl PCRE2_CHECK_VSCRIPT ================================================ FILE: m4/pcre2_visibility.m4 ================================================ # visibility.m4 serial 4 (gettext-0.18.2) dnl Copyright (C) 2005, 2008, 2010-2011 Free Software Foundation, Inc. 
dnl This file is free software; the Free Software Foundation dnl gives unlimited permission to copy and/or distribute it, dnl with or without modifications, as long as this notice is preserved. dnl Originally From Bruno Haible. dnl Tests whether the compiler supports the command-line option dnl -fvisibility=hidden and the function attribute dnl __attribute__((__visibility__("default"))). dnl dnl Set the variable VISIBILITY_CFLAGS. dnl Defines and sets the variable HAVE_VISIBILITY. dnl Defines and sets the variable WORKING_WERROR. dnl Modified to fit with PCRE build environment by Cristian Rodríguez. dnl Adjusted for PCRE2 by PH. dnl Refactored to work with non GCC (but compatible) compilers. AC_DEFUN([PCRE2_VISIBILITY], [ AC_REQUIRE([AC_PROG_CC]) VISIBILITY_CFLAGS= HAVE_VISIBILITY=0 dnl First, check whether -Werror can be added to the command line, or dnl whether it leads to an error because of some other option that the dnl user has put into $CC $CFLAGS $CPPFLAGS. AC_MSG_CHECKING([whether the -Werror option is usable]) AC_CACHE_VAL([pcre2_cv_cc_vis_werror], [ pcre2_save_CFLAGS="$CFLAGS" CFLAGS="$CFLAGS -Werror" pcre2_cv_cc_vis_werror=no AC_COMPILE_IFELSE( [AC_LANG_PROGRAM([[]], [[]])], [ AC_COMPILE_IFELSE( [AC_LANG_PROGRAM([[]], [[ #warning e ]])], [], [pcre2_cv_cc_vis_werror=yes] ) ], []) CFLAGS="$pcre2_save_CFLAGS"]) AC_MSG_RESULT([$pcre2_cv_cc_vis_werror]) if test -n "$pcre2_cv_cc_vis_werror" && test $pcre2_cv_cc_vis_werror = yes then WORKING_WERROR=1 else WORKING_WERROR=0 fi if test $pcre2_cv_cc_vis_werror = yes; then dnl Now check whether GCC compatible visibility declarations are supported. 
AC_MSG_CHECKING([for GCC compatible visibility declarations]) AC_CACHE_VAL([pcre2_cv_cc_visibility], [ pcre2_save_CFLAGS="$CFLAGS" CFLAGS="$CFLAGS -Werror -fvisibility=hidden" dnl We use the option -Werror and a function dummyfunc, because on some dnl platforms (Cygwin 1.7) the use of -fvisibility triggers a warning dnl "visibility attribute not supported in this configuration; ignored" dnl at the first function definition in every compilation unit, and we dnl don't want to use the option in this case. AC_COMPILE_IFELSE( [AC_LANG_PROGRAM( [[extern __attribute__((__visibility__("hidden"))) int hiddenfunc (void); extern __attribute__((__visibility__("default"))) int exportedfunc (void); void dummyfunc (void) {} ]], [[]])], [pcre2_cv_cc_visibility=yes], [pcre2_cv_cc_visibility=no]) CFLAGS="$pcre2_save_CFLAGS"]) AC_MSG_RESULT([$pcre2_cv_cc_visibility]) fi if test -n "$pcre2_cv_cc_visibility" && test $pcre2_cv_cc_visibility = yes then VISIBILITY_CFLAGS="-fvisibility=hidden" HAVE_VISIBILITY=1 AC_DEFINE(PCRE2_EXPORT, [__attribute__ ((visibility ("default")))], [Define to the annotation for making a symbol visible.]) else AC_DEFINE(PCRE2_EXPORT, [], [Define to the annotation for making a symbol visible.]) fi AC_SUBST([VISIBILITY_CFLAGS]) AC_SUBST([HAVE_VISIBILITY]) AC_DEFINE_UNQUOTED([HAVE_VISIBILITY], [$HAVE_VISIBILITY], [Define to 1 if the compiler supports GCC compatible visibility declarations.]) ]) ================================================ FILE: m4/pcre2_zos.m4 ================================================ dnl Tests whether the compiler requires an additional flag in order to fail on dnl undefined headers. dnl The concept of setting this commandline flag was learned from patches and dnl mailing list discussions of the gnulib and gawk projects (credit to dnl Bruno Haible). 
AC_DEFUN([PCRE2_ZOS_FIXES], [ AC_CACHE_CHECK([for OS/390 (z/OS)], [pcre2_cv_os390], [if test "`uname`" = "OS/390"; then pcre2_cv_os390=yes else pcre2_cv_os390=no fi]) if test "$pcre2_cv_os390" = "yes"; then AC_CACHE_CHECK([whether the compiler supports -qhaltonmsg=CCN3296], [pcre2_cv_xlc_qhaltonmsg_support], [save_CFLAGS="$CFLAGS" CFLAGS="$CFLAGS -qhaltonmsg=CCN3296" AC_COMPILE_IFELSE([AC_LANG_PROGRAM([])], [pcre2_cv_xlc_qhaltonmsg_support=yes], [pcre2_cv_xlc_qhaltonmsg_support=no]) CFLAGS="$save_CFLAGS" ]) AC_CACHE_CHECK([whether non-existent headers fail the compile], [pcre2_cv_xlc_nonexistent_fatal], [AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[#include ]])], [pcre2_cv_xlc_nonexistent_fatal=no], [pcre2_cv_xlc_nonexistent_fatal=yes]) ]) if test "$pcre2_cv_xlc_nonexistent_fatal" = "no" && test "$pcre2_cv_xlc_qhaltonmsg_support" = "yes"; then AC_CACHE_CHECK([whether -qhaltonmsg=CCN3296 fixes the non-existent-header issue], [pcre2_cv_xlc_qhaltonmsg_fixes], [save_CFLAGS="$CFLAGS" CFLAGS="$CFLAGS -qhaltonmsg=CCN3296" AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[#include ]])], [pcre2_cv_xlc_qhaltonmsg_fixes=no], [pcre2_cv_xlc_qhaltonmsg_fixes=yes]) CFLAGS="$save_CFLAGS" ]) if test "$pcre2_cv_xlc_qhaltonmsg_fixes" = "no"; then AC_MSG_ERROR([-qhaltonmsg=CCN3296 not effective on non-existent headers]) fi CFLAGS="$CFLAGS -qhaltonmsg=CCN3296" fi fi ]) ================================================ FILE: maint/.gitignore ================================================ ucptest utf8 utf8.* pcre2_ucp.h pcre2_ucptables_inc.h pcre2_ucd.c testinput testinput11 testoutput !build-interface ================================================ FILE: maint/132html ================================================ #! 
/usr/bin/perl -w # Script to turn PCRE2 man pages into HTML # Subroutine to handle font changes and other escapes sub do_line { my($s) = $_[0]; $s =~ s/ $s =~ s/>/>/g; $s =~ s"\\fI(.*?)\\f[RP]"$1"g; $s =~ s"\\fB(.*?)\\f[RP]"$1"g; $s =~ s"\\e"\\"g; $s =~ s/(?<=Copyright )\(c\)/©/g; $s =~ s/\\&//g; # Deal with the \& 0-width space $s; } # Subroutine to ensure not in a paragraph sub end_para { if ($inpara) { print TEMP "
\n" if ($inpre); print TEMP "

\n"; } $inpara = $inpre = 0; $wrotetext = 0; } # Subroutine to start a new paragraph sub new_para { &end_para(); print TEMP "

\n"; $inpara = 1; } # Main program $innf = 0; $inpara = 0; $inpre = 0; $wrotetext = 0; $toc = 0; $header = 1; $ref = 1; while ($#ARGV >= 0 && $ARGV[0] =~ /^-/) { $toc = 1 if $ARGV[0] eq "-toc"; $header = 0 if $ARGV[0] eq "-noheader"; shift; } # Initial output to STDOUT if ($header) { print < $ARGV[0] specification

$ARGV[0] man page

Return to the PCRE2 index page.

This page is part of the PCRE2 HTML documentation. It was generated automatically from the original man page. If there is any nonsense in it, please consult the man page, in case the conversion went wrong.
End } print "

    \n" if ($toc); open(TEMP, ">/tmp/$$") || die "Can't open /tmp/$$ for output\n"; while () { # Handle lines beginning with a dot if (/^\./) { # Some of the PCRE2 man pages used to contain instances of .br. However, # they should have all been removed because they cause trouble in some # (other) automated systems that translate man pages to HTML. Complain if # we find .br or .in (another macro that is deprecated). if (/^\.br/ || /^\.in/) { print STDERR "\n*** Deprecated macro encountered - rewrite needed\n"; print STDERR "*** $_\n"; die "*** Processing abandoned\n"; } # Instead of .br, relevant "literal" sections are enclosed in .nf/.fi. elsif (/^\.nf/) { $innf = 1; } elsif (/^\.fi/) { $innf = 0; } # Handling .sp is subtle. If it is inside a literal section, do nothing if # the next line is a non literal text line; similarly, if not inside a # literal section, do nothing if a literal follows, unless we are inside # a .nf/.fi section or about to enter one. The point being that the
        # and 
    that delimit literal sections will do the spacing. Always skip # if no previous output. elsif (/^\.sp/) { if ($wrotetext) { $_ = ; if ($inpre) { print TEMP "\n" if (/^[\s.]/); } else { print TEMP "
    \n
    \n" if ($innf || /^\.nf/ || !/^[\s.]/); } redo; # Now process the lookahead line we just read } } elsif (/^\.TP/ || /^\.PP/ || /^\.P/) { &new_para(); } elsif (/^\.SH\s*("?)(.*)\1/) { # Ignore the NAME section if ($2 =~ /^NAME\b/) { ; next; } &end_para(); my($title) = &do_line($2); if ($toc) { printf("
  • $title\n", $ref, $ref); printf TEMP ("

    $title

    \n", $ref); $ref++; } else { print TEMP "

    \n$title\n

    \n"; } } elsif (/^\.SS\s*("?)(.*)\1/) { &end_para(); my($title) = &do_line($2); print TEMP "

    \n$title\n

    \n"; } elsif (/^\.B\s*(.*)/) { &new_para() if (!$inpara); $_ = &do_line($1); s/"(.*?)"/$1/g; print TEMP "$_\n"; $wrotetext = 1; } elsif (/^\.I\s*(.*)/) { &new_para() if (!$inpara); $_ = &do_line($1); s/"(.*?)"/$1/g; print TEMP "$_\n"; $wrotetext = 1; } # Remove the "AUTOMATICALLY GENERATED" warning from pcre2demo.3 elsif (/^\.\\"AUTOMATICALLY GENERATED/) { next; } # A comment that starts "HREF" takes the next line as a name that # is turned into a hyperlink, using the text given, which might be # in a special font. If it ends in () or (digits) or punctuation, they # aren't part of the link. elsif (/^\.\\"\s*HREF/) { $_=; chomp; $_ = &do_line($_); $_ =~ s/\s+$//; $_ =~ /^(?:<.>)?([^<(]+)(?:\(\))?(?:<\/.>)?(?:\(\d+\))?[.,;:]?$/; print TEMP "$_\n"; } # A comment that starts "HTML" inserts literal HTML elsif (/^\.\\"\s*HTML\s*(.*)/) { print TEMP $1; } # A comment that starts < inserts that HTML at the end of the # *next* input line - so as not to get a newline between them. elsif (/^\.\\"\s*(<.*>)/) { my($markup) = $1; $_=; chomp; $_ = &do_line($_); $_ =~ s/\s+$//; print TEMP "$_$markup\n"; } # A comment that starts JOIN joins the next two lines together, with one # space between them. Then that line is processed. This is used in some # displays where two lines are needed for the "man" version. JOINSH works # the same, except that it assumes this is a shell command, so removes # continuation backslashes. elsif (/^\.\\"\s*JOIN(SH)?/) { my($one,$two); $one = ; $two = ; $one =~ s/\s*\\e\s*$// if (defined($1)); chomp($one); $two =~ s/^\s+//; $_ = "$one $two"; redo; # Process the joined lines } # .EX/.EE are used in the pcre2demo page to bracket the entire program, # which is unmodified except for turning backslash into "\e". elsif (/^\.EX\s*$/) { print TEMP "
    \n";
          # Copy the bracketed program text through to the output until the
          # closing .EE line. The man-page "\e" escape is turned back into a
          # plain backslash, and the characters that are special in HTML are
          # replaced by entities. "&" has to be handled first so that the
          # entities generated for "<" and ">" are not escaped a second time.
          # Input is read from STDIN; $ARGV[0] is used only as the page title.
          while (<STDIN>)
            {
            last if /^\.EE\s*$/;
            s/\\e/\\/g;
            s/&/&amp;/g;
            s/</&lt;/g;
            s/>/&gt;/g;
            print TEMP;
            }
          }
    
        # Ignore anything not recognized
    
        next;
        }
    
      # Line does not begin with a dot. A blank (empty or whitespace-only) line
      # closes the current paragraph, but only if some text has been written
      # since the last paragraph break; nothing is output for the blank line
      # itself. Because end_para() clears $inpara, the next text line opens a
      # new paragraph through the new_para() call below.

      if (/^\s*$/)
        {
        &end_para() if ($wrotetext);
        next;
        }

      # Convert font changes and other escapes with do_line(), and make sure
      # that a paragraph is open before the line is output. Indented lines are
      # marked as literal by the code that follows.

      $_ = &do_line($_);
      &new_para() if (!$inpara);
    
      # Indented lines are literal text. Open a <pre> section at the first
      # indented line and close it again at the first non-indented one;
      # $inpre records whether such a section is currently open. The tags
      # must be present in these strings, otherwise the state changes but no
      # preformatted markup ever reaches the output.

      if (/^\s/)
        {
        if (!$inpre)
          {
          print TEMP "<pre>\n";
          $inpre = 1;
          }
        }
      elsif ($inpre)
        {
        print TEMP "</pre>\n";
        $inpre = 0;
        }

      # Add <br>
    to the end of a non-literal line if we are within .nf/.fi $_ .= "
    \n" if (!$inpre && $innf); print TEMP; $wrotetext = 1; } # The TOC, if present, will have been written - terminate it print "
\n" if ($toc); # Copy the remainder to the standard output close(TEMP); open(TEMP, "/tmp/$$") || die "Can't open /tmp/$$ for input\n"; print while (); if ($header) { print < Return to the PCRE2 index page.

End } close(TEMP); unlink("/tmp/$$"); # End ================================================ FILE: maint/CheckMan ================================================ #! /usr/bin/perl # A script to scan PCRE2's man pages to check for typos in the control # sequences. I use only a small set of the available repertoire, so it is # straightforward to check that nothing else has slipped in by mistake. This # script should be called in the doc directory. $yield = 0; while (scalar(@ARGV) > 0) { $line = 0; $file = shift @ARGV; open (IN, $file) || die "Failed to open $file\n"; while () { $count = 0; $line++; if (/^\s*$/) { printf "Empty line $line of $file\n"; $yield = 1; } elsif (/^\./) { if (!/^\.\s*$| ^\.B\s+\S| ^\.TH\s\S| ^\.SH\s\S| ^\.SS\s\S| ^\.TP(?:\s?\d+)?\s*$| ^\.br\s*$| ^\.rs\s*$| ^\.sp\s*$| ^\.nf\s*$| ^\.fi\s*$| ^\.P\s*$| ^\.PP\s*$| ^\.\\"(?:\ HREF)?\s*$| ^\.\\"\sDEFINE\s\w+$| ^\.\\"\sHTML\s\s*$| ^\.\\"\sHTML\s<\/a>\s*$| ^\.\\"\s<\/a>\s*$| ^\.\\"\sJOINSH\s*$| ^\.\\"\sJOIN\s*$/x ) { printf "Bad control line $line of $file\n"; $yield = 1; } } elsif (/\\[^ef&]|\\f[^IBP]/) { printf "Bad backslash in line $line of $file\n"; $yield = 1; } while (/\\f[BI]/g) { $count++; } while (/\\fP/g) { $count--; } if ($count != 0) { printf "Mismatching formatting in line $line of $file\n"; $yield = 1; } } close(IN); } exit $yield; # End ================================================ FILE: maint/CheckTxt ================================================ #! /usr/bin/perl # This is a script for checking whether a file contains any carriage return # characters, and whether it is valid UTF-8. use Encode; # This subroutine does the work for one file. 
$yield = 0;
$ascii = 0; # bool
$crlf = 0; # bool

# Check one file. The argument is the file name. Problems are reported on
# standard output and recorded by setting the global $yield to 1. The globals
# $ascii and $crlf (set from the command line) select the optional checks.

sub checktxt
{
my($file) = $_[0];

# Slurp the whole file as raw bytes from the IN handle.

open(IN, "<:raw", "$file") || die "Can't open $file for input";
$bin = do { local $/ = undef; <IN> };
close(IN);

# Strict decoding: FB_CROAK makes decode() die on malformed input, which the
# eval turns into a "Bad UTF-8" report.

my $data;
eval
  {
  $data = Encode::decode("UTF-8", $bin, Encode::FB_CROAK);
  1; # return true
  }
or do
  {
  printf "Bad UTF-8 in $file\n";
  $yield = 1;
  return;
  };

# Carriage returns are an error unless -crlf was given.

if (!$crlf && index($data, "\r") != -1)
  {
  printf "CR in $file\n";
  $yield = 1;
  }

# With -ascii, anything outside \x01-\x7e is an error.

if ($ascii && $data =~ /[^\x01-\x7e]/)
  {
  printf "Non-ASCII in $file\n";
  $yield = 1;
  }
}

# This is the main program

$, = ""; # Output field separator

for ($i = 0; $i < @ARGV; $i++)
  {
  if ($ARGV[$i] eq "-ascii")
    {
    $ascii = 1;
    }
  elsif ($ARGV[$i] eq "-crlf")
    {
    $crlf = 1;
    }
  else
    {
    checktxt($ARGV[$i]);
    }
  }

exit $yield;

# End

================================================
FILE: maint/CleanTxt
================================================
#! /usr/bin/perl -w

# Script to take the output of nroff -man and remove all the backspacing and
# the page footers and the screen commands etc so that it is more usefully
# readable online. In fact, in the latest nroff, intermediate footers don't
# seem to be generated any more.

$blankcount = 0;
$lastwascut = 0;
$firstheader = 1;

# Input on STDIN; output to STDOUT.

while (<STDIN>)
  {
  s/\x1b\[\d+m//g; # Remove screen controls "ESC [ number m"
  s/.\x8//g; # Remove "char, backspace"

  # Handle header lines. Retain only the first one we encounter, but remove
  # the blank line that follows. Any others (e.g. at end of document) and the
  # following blank line are dropped.

  if (/^PCRE(\w*)\(([13])\)\s+PCRE\1\(\2\)$/)
    {
    if ($firstheader)
      {
      $firstheader = 0;
      print;
      $lastprinted = $_;
      $lastwascut = 0;
      }
    $_=<STDIN>; # Remove a blank that follows
    next;
    }

  # Count runs of empty lines

  if (/^\s*$/)
    {
    $blankcount++;
    $lastwascut = 0;
    next;
    }

  # If a chunk of lines has been cut out (page footer) and the next line
  # has a different indentation, put back one blank line.

  if ($lastwascut && $blankcount < 1 && defined($lastprinted))
    {
    ($a) = $lastprinted =~ /^(\s*)/;
    ($b) = $_ =~ /^(\s*)/;
    $blankcount++ if ($a ne $b);
    }

  # We get here only when we have a non-blank line in hand. If it was preceded
  # by 3 or more blank lines, read the next 3 lines and see if they are blank.
  # If so, remove all 7 lines, and remember that we have just done a cut.

  if ($blankcount >= 3)
    {
    for ($i = 0; $i < 3; $i++)
      {
      $next[$i] = <STDIN>;
      $next[$i] = "" if !defined $next[$i];
      $next[$i] =~ s/\x1b\[\d+m//g; # Remove screen controls "ESC [ number m"
      $next[$i] =~ s/.\x8//g; # Remove "char, backspace"
      }

    # Cut out chunks of the form <3 blanks><non-blank><3 blanks>

    if ($next[0] =~ /^\s*$/ &&
        $next[1] =~ /^\s*$/ &&
        $next[2] =~ /^\s*$/)
      {
      $blankcount -= 3;
      $lastwascut = 1;
      }

    # Otherwise output the saved blanks, the current, and the next three
    # lines. Remember the last printed line.

    else
      {
      for ($i = 0; $i < $blankcount; $i++)
        {
        print "\n";
        }
      print;
      for ($i = 0; $i < 3; $i++)
        {
        $next[$i] =~ s/.\x8//g;
        print $next[$i];
        $lastprinted = $_;
        }
      $lastwascut = 0;
      $blankcount = 0;
      }
    }

  # This non-blank line is not preceded by 3 or more blank lines. Output
  # any blanks there are, and the line. Remember it. Force two blank lines
  # before headings.

  else
    {
    $blankcount = 2 if /^\S/ && !/^Last updated/ && !/^Copyright/ &&
      defined($lastprinted);
    for ($i = 0; $i < $blankcount; $i++)
      {
      print "\n";
      }
    print;
    $lastprinted = $_;
    $lastwascut = 0;
    $blankcount = 0;
    }
  }

# End

================================================
FILE: maint/Detrail
================================================
#! /usr/bin/perl

# This is a script for removing trailing whitespace from lines in files that
# are listed on the command line.

# This subroutine does the work for one file.
# Strip trailing whitespace from every line of the named file. The file is
# read completely from the IN handle, and is rewritten only when at least one
# line was actually changed.

sub detrail
{
my($file) = $_[0];
my($changed) = 0;

open(IN, "<", "$file") || die "Can't open $file for input";
@lines = <IN>;
close(IN);

foreach (@lines)
  {
  if (/\s+\n$/)
    {
    s/\s+\n$/\n/;
    $changed = 1;
    }
  }

if ($changed)
  {
  open(OUT, ">", "$file") || die "Can't open $file for output";
  print OUT @lines;
  close(OUT);
  }
}

# This is the main program

$, = ""; # Output field separator

for ($i = 0; $i < @ARGV; $i++)
  {
  &detrail($ARGV[$i]);
  }

# End

================================================
FILE: maint/FetchUcd.sh
================================================
#! /bin/sh

# Small helper script to fetch the Unicode Character Database files

VER=17.0.0

# Work in the directory that holds this script. Stop if we cannot get there,
# because the next step is a recursive delete relative to the current
# directory.
cd "$(dirname "$0")" || exit 1
pwd

rm -rf Unicode.tables/
mkdir Unicode.tables

# fetch_file URL NAME
# Download URL into Unicode.tables/NAME. On failure a message is printed and
# any partial file is removed; the script then carries on with the next file.
fetch_file() {
  url="$1"
  i="$2"

  echo "=== Downloading $i ==="

  # Download each file with curl and place into the Unicode.tables folder
  # Reject the download if there is an HTTP error
  if ! curl --fail -o "Unicode.tables/$i" -L "$url"; then
    echo "Error downloading $i"
    rm -f "Unicode.tables/$i"
  fi
}

for i in BidiMirroring.txt \
         CaseFolding.txt \
         DerivedCoreProperties.txt \
         PropertyAliases.txt \
         PropertyValueAliases.txt \
         PropList.txt \
         ScriptExtensions.txt \
         Scripts.txt \
         UnicodeData.txt \
         ; do
  fetch_file "https://www.unicode.org/Public/$VER/ucd/$i" "$i"
done

for i in DerivedBidiClass.txt \
         DerivedGeneralCategory.txt \
         ; do
  fetch_file "https://www.unicode.org/Public/$VER/ucd/extracted/$i" "$i"
done

for i in GraphemeBreakProperty.txt \
         ; do
  fetch_file "https://www.unicode.org/Public/$VER/ucd/auxiliary/$i" "$i"
done

for i in emoji-data.txt \
         ; do
  fetch_file "https://www.unicode.org/Public/$VER/ucd/emoji/$i" "$i"
done

================================================
FILE: maint/FilterCoverage.py
================================================
#! /usr/bin/env python3

# Script which is a simple LCOV filter: removes DA/BRDA entries for lines marked in-source with
# "LCOV_EXCL_LINE" or "LCOV_EXCL_START"/"LCOV_EXCL_STOP".
# # Usage: python3 FilterCoverage.py coverage-lcov.info > coverage-lcov.filtered.info import sys import re def scan_exclusions(srcpath): """Return a set of line numbers to exclude for this source file.""" with open(srcpath, "r", encoding="utf-8") as fh: text = fh.readlines() excl = set() in_block = False for i, line in enumerate(text, start=1): if "LCOV_EXCL_LINE" in line: excl.add(i) if "LCOV_EXCL_START" in line: in_block = True excl.add(i) continue if "LCOV_EXCL_STOP" in line: excl.add(i) in_block = False continue if in_block: excl.add(i) # If line matches '^\s*#(if|endif|else|elif)\b', exclude it. # For some reason, Clang likes to output coverage for these lines, # even though they have no executable code. if re.match(r'^\s*#(if|endif|else|elif|ifdef|ifndef)\b', line): excl.add(i) # Similarly, Clang seems to generate DA entries for "}" lines inside # switch statements. Exclude these too. if re.match(r'^\s*}\s*(/\*.*?\*/)?$', line): excl.add(i) return excl DA_RE = re.compile(r'^\s*DA:(\d+),(\d+)(,.*)?\s*$') LF_RE = re.compile(r'^\s*LF:(\d+)\s*$') LH_RE = re.compile(r'^\s*LH:(\d+)\s*$') BRDA_RE = re.compile(r'^\s*BRDA:(\d+),([e\d]+),(.*),([-\d]+)\s*$') BRF_RE = re.compile(r'^\s*BRF:(\d+)\s*$') BRH_RE = re.compile(r'^\s*BRH:(\d+)\s*$') FN_RE = re.compile(r'^\s*FN:(\d+),([^,\s]*)\s*$') FNDA_RE = re.compile(r'^\s*FNDA:(\d+),([^,\s]*)\s*$') FNF_RE = re.compile(r'^\s*FNF:(\d+)\s*$') FNH_RE = re.compile(r'^\s*FNH:(\d+)\s*$') def process_block(block_lines): """Return processed block lines with excluded DA/BRDA removed and LF/LH fixed.""" if not block_lines: return block_lines # get SF path from first 'SF:' line (should be first) first = block_lines[0] assert first.lstrip().startswith('SF:') sf_path = first.split(':', 1)[1].strip() exclusions = scan_exclusions(sf_path) new_lines = [] da_orig_found = 0 da_orig_hit = 0 da_new_found = 0 da_new_hit = 0 brda_orig_found = 0 brda_orig_hit = 0 brda_new_found = 0 brda_new_hit = 0 fnda_orig_found = 0 fnda_orig_hit = 0 
fnda_new_found = 0 fnda_new_hit = 0 fn_exclusions = set() # Pass 1: identify FN exclusions for line in block_lines: m_fn = FN_RE.match(line) assert (m_fn is not None) == line.lstrip().startswith('FN:') if m_fn: fn_line = int(m_fn.group(1)) fn_name = m_fn.group(2) if fn_line in exclusions: fn_exclusions.add(fn_name) # Pass 2: filter DA, BRDA, FN/FNDA; copy others verbatim for line in block_lines: m_da = DA_RE.match(line) assert (m_da is not None) == line.lstrip().startswith('DA:') if m_da: line_num = int(m_da.group(1)) execution_count = int(m_da.group(2)) da_orig_found += 1 if execution_count > 0: da_orig_hit += 1 if line_num in exclusions: # drop this DA line continue da_new_found += 1 if execution_count > 0: da_new_hit += 1 new_lines.append(line) continue m_brda = BRDA_RE.match(line) assert (m_brda is not None) == line.lstrip().startswith('BRDA:') if m_brda: brda_orig_found += 1 taken = m_brda.group(4) if taken != '-' and int(taken) > 0: brda_orig_hit += 1 if int(m_brda.group(1)) in exclusions: # drop this BRDA line continue brda_new_found += 1 if taken != '-' and int(taken) > 0: brda_new_hit += 1 new_lines.append(line) continue m_fnda = FNDA_RE.match(line) assert (m_fnda is not None) == line.lstrip().startswith('FNDA:') if m_fnda: fnda_orig_found += 1 fn_name = m_fnda.group(2) count = int(m_fnda.group(1)) if count > 0: fnda_orig_hit += 1 if fn_name in fn_exclusions: # drop this FNDA line continue fnda_new_found += 1 if count > 0: fnda_new_hit += 1 new_lines.append(line) continue m_fn = FN_RE.match(line) assert (m_fn is not None) == line.lstrip().startswith('FN:') if m_fn: fn_line = int(m_fn.group(1)) fn_name = m_fn.group(2) if fn_name in fn_exclusions: # drop this FN line continue new_lines.append(line) continue # other lines: append unchanged new_lines.append(line) # Pass 3: fix LF/LH, BRF/BRH, FNF/FNH # Mutate new_lines. If we find any LF/LH lines, check they have the expected original values. # If so, replace with new values. If not, print a warning. 
for i, line in enumerate(new_lines): # LF m_lf = LF_RE.match(line) assert (m_lf is not None) == line.lstrip().startswith('LF:') if m_lf: # preserve leading whitespace exactly leading = re.match(r'^(\s*)', line).group(1) # replace with recomputed value (number of DA entries remaining) new_lines[i] = f"{leading}LF:{da_new_found}\n" # warn if original disagreed (useful for debugging) try: lf_orig = int(m_lf.group(1)) if lf_orig != da_orig_found: print(f"warning: original LF ({lf_orig}) != counted DA entries ({da_orig_found}) for {sf_path}", file=sys.stderr) except Exception: pass continue # LH m_lh = LH_RE.match(line) assert (m_lh is not None) == line.lstrip().startswith('LH:') if m_lh: leading = re.match(r'^(\s*)', line).group(1) new_lines[i] = f"{leading}LH:{da_new_hit}\n" try: lh_orig = int(m_lh.group(1)) if lh_orig != da_orig_hit: print(f"warning: original LH ({lh_orig}) != counted DA hits ({da_orig_hit}) for {sf_path}", file=sys.stderr) except Exception: pass continue # BRF m_brf = BRF_RE.match(line) assert (m_brf is not None) == line.lstrip().startswith('BRF:') if m_brf: leading = re.match(r'^(\s*)', line).group(1) # replace with recomputed branch-found (if you computed brda_new_found above) new_lines[i] = f"{leading}BRF:{brda_new_found}\n" try: brf_orig = int(m_brf.group(1)) if brf_orig != brda_orig_found: print(f"warning: original BRF ({brf_orig}) != counted BRDA entries ({brda_orig_found}) for {sf_path}", file=sys.stderr) except Exception: pass continue # BRH m_brh = BRH_RE.match(line) assert (m_brh is not None) == line.lstrip().startswith('BRH:') if m_brh: leading = re.match(r'^(\s*)', line).group(1) new_lines[i] = f"{leading}BRH:{brda_new_hit}\n" try: brh_orig = int(m_brh.group(1)) if brh_orig != brda_orig_hit: print(f"warning: original BRH ({brh_orig}) != counted BRDA hits ({brda_orig_hit}) for {sf_path}", file=sys.stderr) except Exception: pass continue # FNF m_fnf = FNF_RE.match(line) assert (m_fnf is not None) == line.lstrip().startswith('FNF:') if 
m_fnf: leading = re.match(r'^(\s*)', line).group(1) new_lines[i] = f"{leading}FNF:{fnda_new_found}\n" try: fnf_orig = int(m_fnf.group(1)) if fnf_orig != fnda_orig_found: print(f"warning: original FNF ({fnf_orig}) != counted FNDA entries ({fnda_orig_found}) for {sf_path}", file=sys.stderr) except Exception: pass continue # FNH m_fnh = FNH_RE.match(line) assert (m_fnh is not None) == line.lstrip().startswith('FNH:') if m_fnh: leading = re.match(r'^(\s*)', line).group(1) new_lines[i] = f"{leading}FNH:{fnda_new_hit}\n" try: fnh_orig = int(m_fnh.group(1)) if fnh_orig != fnda_orig_hit: print(f"warning: original FNH ({fnh_orig}) != counted FNDA hits ({fnda_orig_hit}) for {sf_path}", file=sys.stderr) except Exception: pass continue return new_lines def filter_lcov(in_fh, out_fh): lines = in_fh.readlines() i = 0 out_lines = [] while i < len(lines): line = lines[i] if line.lstrip().startswith('SF:'): # buffer block until end_of_record block = [] while i < len(lines): block.append(lines[i]) if lines[i].strip() == 'end_of_record': i += 1 break i += 1 processed = process_block(block) out_lines.extend(processed) else: out_lines.append(line) i += 1 out_fh.writelines(out_lines) if __name__ == "__main__": if len(sys.argv) > 3: print("Usage: python3 FilterCoverage.py [infile [outfile]]", file=sys.stderr) sys.exit(1) if len(sys.argv) > 2: with open(sys.argv[2], "w", encoding="utf-8") as out_fh: with open(sys.argv[1], "r", encoding="utf-8") as in_fh: filter_lcov(in_fh, out_fh) elif len(sys.argv) > 1: with open(sys.argv[1], "r", encoding="utf-8") as fh: filter_lcov(fh, sys.stdout) else: filter_lcov(sys.stdin, sys.stdout) ================================================ FILE: maint/GenerateCommon.py ================================================ # PCRE2 UNICODE PROPERTY SUPPORT # ------------------------------ # This file is a Python module containing common lists and functions for the # GenerateXXX scripts that create various.c and .h files from Unicode data # files. 
# It was created as part of a re-organization of these scripts in
# December 2021.

import re

# ---------------------------------------------------------------------------
#                             DATA LISTS
# ---------------------------------------------------------------------------

# BIDI classes in the DerivedBidiClass.txt file, short and long identifiers.
# The list is flat: each short identifier is immediately followed by its long
# identifier.

bidi_classes = [
    'AL', 'Arabic_Letter',
    'AN', 'Arabic_Number',
    'B', 'Paragraph_Separator',
    'BN', 'Boundary_Neutral',
    'CS', 'Common_Separator',
    'EN', 'European_Number',
    'ES', 'European_Separator',
    'ET', 'European_Terminator',
    'FSI', 'First_Strong_Isolate',
    'L', 'Left_To_Right',
    'LRE', 'Left_To_Right_Embedding',
    'LRI', 'Left_To_Right_Isolate',
    'LRO', 'Left_To_Right_Override',
    'NSM', 'Nonspacing_Mark',
    'ON', 'Other_Neutral',
    'PDF', 'Pop_Directional_Format',
    'PDI', 'Pop_Directional_Isolate',
    'R', 'Right_To_Left',
    'RLE', 'Right_To_Left_Embedding',
    'RLI', 'Right_To_Left_Isolate',
    'RLO', 'Right_To_Left_Override',
    'S', 'Segment_Separator',
    'WS', 'White_Space'
]

# Particular category property names, with comments. NOTE: If ever this list
# is changed, the table called "catposstab" in the pcre2_auto_possess.c file
# must be edited to keep in step.
# Flat list of pairs: two-letter category code, then its description.
category_names = [
    'Cc', 'Control',
    'Cf', 'Format',
    'Cn', 'Unassigned',
    'Co', 'Private use',
    'Cs', 'Surrogate',
    'Ll', 'Lower case letter',
    'Lm', 'Modifier letter',
    'Lo', 'Other letter',
    'Lt', 'Title case letter',
    'Lu', 'Upper case letter',
    'Mc', 'Spacing mark',
    'Me', 'Enclosing mark',
    'Mn', 'Non-spacing mark',
    'Nd', 'Decimal number',
    'Nl', 'Letter number',
    'No', 'Other number',
    'Pc', 'Connector punctuation',
    'Pd', 'Dash punctuation',
    'Pe', 'Close punctuation',
    'Pf', 'Final punctuation',
    'Pi', 'Initial punctuation',
    'Po', 'Other punctuation',
    'Ps', 'Open punctuation',
    'Sc', 'Currency symbol',
    'Sk', 'Modifier symbol',
    'Sm', 'Mathematical symbol',
    'So', 'Other symbol',
    'Zl', 'Line separator',
    'Zp', 'Paragraph separator',
    'Zs', 'Space separator'
]

# The Extended_Pictographic property is not found in the file where all the
# others are (GraphemeBreakProperty.txt). It comes from the emoji-data.txt
# file, but we list it here so that the name has the correct index value.
# The list is flat: each property name is followed by a string holding its
# index value and, for some entries, a description.

break_properties = [
    'CR', ' 0',
    'LF', ' 1',
    'Control', ' 2',
    'Extend', ' 3',
    'Prepend', ' 4',
    'SpacingMark', ' 5',
    'L', ' 6 Hangul syllable type L',
    'V', ' 7 Hangul syllable type V',
    'T', ' 8 Hangul syllable type T',
    'LV', ' 9 Hangul syllable type LV',
    'LVT', '10 Hangul syllable type LVT',
    'Regional_Indicator', '11',
    'Other', '12',
    'ZWJ', '13',
    'Extended_Pictographic', '14'
]

# List of files from which the names of Boolean properties are obtained, along
# with a list of regex patterns for properties to be ignored, and a list of
# extra pattern names to add.

bool_propsfiles = ['PropList.txt', 'DerivedCoreProperties.txt', 'emoji-data.txt']
bool_propsignore = [r'^Other_', r'^Hyphen$']
bool_propsextras = ['ASCII', 'Bidi_Mirrored']

# ---------------------------------------------------------------------------
#                   GET BOOLEAN PROPERTY NAMES
# ---------------------------------------------------------------------------

# Get a list of Boolean property names from a number of files.
def getbpropslist():
    """Return the sorted list of Boolean property names.

    Each file named in bool_propsfiles is read from Unicode.tables/. After
    '#' comments are removed, the second ';'-separated field of a line is
    taken as a property name. Names matching any pattern in bool_propsignore
    are skipped, duplicates are dropped, and the names in bool_propsextras are
    appended before sorting. The process exits with status 1 if a file cannot
    be opened.
    """
    # This function runs while the module is still being loaded, before the
    # module-level "import sys" further down has executed, so bring sys into
    # scope here for the error path.
    import sys

    bplist = []
    bplast = ""

    for filename in bool_propsfiles:
        try:
            file = open('Unicode.tables/' + filename, 'r')
        except IOError:
            print(f"** Couldn't open {'Unicode.tables/' + filename}\n")
            sys.exit(1)

        with file:
            for line in file:
                line = re.sub(r'#.*', '', line)
                data = list(map(str.strip, line.split(';')))
                # Skip lines without a second field and runs of the same name
                if len(data) <= 1 or data[1] == bplast:
                    continue
                bplast = data[1]
                for pat in bool_propsignore:
                    if re.match(pat, bplast) is not None:
                        break
                else:
                    # No ignore pattern matched
                    if bplast not in bplist:
                        bplist.append(bplast)

    bplist.extend(bool_propsextras)
    bplist.sort()
    return bplist

bool_properties = getbpropslist()
bool_props_list_item_size = (len(bool_properties) + 31) // 32

# ---------------------------------------------------------------------------
#              COLLECTING PROPERTY NAMES AND ALIASES
# ---------------------------------------------------------------------------

script_names = ['Unknown']
abbreviations = {}

def collect_property_names():
    """Fill script_names and abbreviations from the Unicode data files.

    Script names are appended to script_names in the order they first appear
    in Scripts.txt. abbreviations maps a long name to a tuple of its other
    names: script values come from the "sc" lines of PropertyValueAliases.txt,
    Boolean properties from PropertyAliases.txt.
    """
    global script_names
    global abbreviations

    names_re = re.compile(r'^[0-9A-F]{4,6}(?:\.\.[0-9A-F]{4,6})? +; ([A-Za-z_]+) #')

    last_script_name = ""
    with open("Unicode.tables/Scripts.txt") as f:
        for line in f:
            match_obj = names_re.match(line)
            if match_obj is None or match_obj.group(1) == last_script_name:
                continue
            last_script_name = match_obj.group(1)
            script_names.append(last_script_name)

    # Sometimes there is a comment in the line,
    # so splitting around semicolons is not enough
    value_alias_re = re.compile(r' *([A-Za-z_]+) *; *([A-Za-z_]+) *; *([A-Za-z_]+)(?: *; *([A-Za-z_ ]+))?')

    with open("Unicode.tables/PropertyValueAliases.txt") as f:
        for line in f:
            match_obj = value_alias_re.match(line)
            if match_obj is None:
                continue

            if match_obj.group(1) == "sc":
                if match_obj.group(2) == match_obj.group(3):
                    # Short and long names are the same: no abbreviation
                    abbreviations[match_obj.group(3)] = ()
                elif match_obj.group(4) is None:
                    abbreviations[match_obj.group(3)] = (match_obj.group(2),)
                else:
                    abbreviations[match_obj.group(3)] = (match_obj.group(2), match_obj.group(4))

    # We can also collect Boolean property abbreviations into the same dictionary
    bin_alias_re = re.compile(r' *([A-Za-z_]+) *; *([A-Za-z_]+)(?: *; *([A-Za-z_]+))?')

    with open("Unicode.tables/PropertyAliases.txt") as f:
        for line in f:
            match_obj = bin_alias_re.match(line)
            if match_obj is None:
                continue

            if match_obj.group(2) != match_obj.group(1) and match_obj.group(2) in bool_properties:
                if match_obj.group(3) is None:
                    abbreviations[match_obj.group(2)] = (match_obj.group(1),)
                else:
                    abbreviations[match_obj.group(2)] = (match_obj.group(1), match_obj.group(3))

collect_property_names()

# ---------------------------------------------------------------------------
#                      REORDERING SCRIPT NAMES
# ---------------------------------------------------------------------------

script_abbrevs = []

def reorder_scripts():
    """Build script_abbrevs and move extended scripts to the front.

    script_abbrevs gets one entry per script: its first abbreviation, or the
    name itself when it has none. Both script_names and script_abbrevs are
    then reordered so that scripts whose abbreviation appears in
    ScriptExtensions.txt come first, relative order otherwise preserved.
    """
    global script_names
    global script_abbrevs
    global abbreviations

    for name in script_names:
        abbrevs = abbreviations[name]
        script_abbrevs.append(name if len(abbrevs) == 0 else abbrevs[0])

    extended_script_abbrevs = set()
    with open("Unicode.tables/ScriptExtensions.txt") as f:
        names_re = re.compile(r'^[0-9A-F]{4,6}(?:\.\.[0-9A-F]{4,6})? +; ([A-Za-z_ ]+[A-Za-z]) +#')

        for line in f:
            match_obj = names_re.match(line)
            if match_obj is None:
                continue
            for name in match_obj.group(1).split(" "):
                extended_script_abbrevs.add(name)

    new_script_names = []
    new_script_abbrevs = []

    # First the scripts that occur in script extensions...
    for idx, abbrev in enumerate(script_abbrevs):
        if abbrev in extended_script_abbrevs:
            new_script_names.append(script_names[idx])
            new_script_abbrevs.append(abbrev)

    # ...then all the others
    for idx, abbrev in enumerate(script_abbrevs):
        if abbrev not in extended_script_abbrevs:
            new_script_names.append(script_names[idx])
            new_script_abbrevs.append(abbrev)

    script_names = new_script_names
    script_abbrevs = new_script_abbrevs

reorder_scripts()
script_list_item_size = (script_names.index('Unknown') + 31) // 32

# ---------------------------------------------------------------------------
#                         DERIVED LISTS
# ---------------------------------------------------------------------------

# Create general character property names from the first letters of the
# particular categories.

gcn_set = set(category_names[i][0] for i in range(0, len(category_names), 2))
general_category_names = list(gcn_set)
general_category_names.sort()

# ---------------------------------------------------------------------------
#                           FUNCTIONS
# ---------------------------------------------------------------------------

import sys

# Open an output file, using the command's argument or a default. Write common
# preliminary header information.
def open_output(default): if len(sys.argv) > 2: print('** Too many arguments: just give a file name') sys.exit(1) if len(sys.argv) == 2: output_name = sys.argv[1] else: output_name = default try: file = open(output_name, "w") except IOError: print("** Couldn't open %s" % output_name) sys.exit(1) script_name = sys.argv[0] i = script_name.rfind('/') if i >= 0: script_name = script_name[i+1:] file.write("""\ /************************************************* * Perl-Compatible Regular Expressions * *************************************************/ /* PCRE is a library of functions to support regular expressions whose syntax and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel Original API code Copyright (c) 1997-2012 University of Cambridge New API code Copyright (c) 2016-2022 University of Cambridge This module is auto-generated from Unicode data files. DO NOT EDIT MANUALLY! """) file.write("Instead, modify the maint/%s script and run it to generate\n" "a new version of this code.\n\n" % script_name) file.write("""\ ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of the University of Cambridge nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ----------------------------------------------------------------------------- */ \n\n""") return file # End of UcpCommon.py ================================================ FILE: maint/GenerateTest.py ================================================ #! /usr/bin/env python3 # PCRE2 UNICODE PROPERTY SUPPORT # ------------------------------ # # This file auto-generates Unicode property tests and their expected output. # It is recommended to re-run this generator after the Unicode files are # updated. The names of the generated files are `testinput` and `testoutput` # and should be copied over to replace either test26 or test27 files. 
import re import sys from GenerateCommon import \ script_names, \ script_abbrevs def write_both(text): input_file.write(text) output_file.write(text) def to_string_char(ch_idx): if ch_idx < 128: if ch_idx < 16: return "\\x{0%x}" % ch_idx if ch_idx >= 32: return chr(ch_idx) return "\\x{%x}" % ch_idx try: input_file = open("testinput", "w") output_file = open("testoutput", "w") except IOError: print("** Couldn't create output files") sys.exit(1) write_both("# These tests were generated by maint/GenerateTest.py using PCRE2's UCP\n"); write_both("# data, do not edit unless that data has changed and they are reflecting\n"); write_both("# a previous version.\n\n"); # --------------------------------------------------------------------------- # UNICODE SCRIPT EXTENSION TESTS # --------------------------------------------------------------------------- def gen_script_tests(): script_data = [None] * len(script_names) char_data = [None] * 0x110000 property_re = re.compile(r"^([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))? 
+; ([A-Za-z_ ]+[A-Za-z]) +#") prev_name = "" script_idx = -1 with open("Unicode.tables/Scripts.txt") as f: version_pat = r"^# Scripts-(\d+\.\d+\.\d+)\.txt$" v = re.match(version_pat, f.readline()) unicode_version = v.group(1) write_both("# Unicode Script Extension tests for version " + unicode_version + "\n\n") write_both("#perltest\n\n") for line in f: match_obj = property_re.match(line) if match_obj == None: continue name = match_obj.group(3) if name != prev_name: script_idx = script_names.index(name) prev_name = name low = int(match_obj.group(1), 16) high = low char_data[low] = name if match_obj.group(2) != None: high = int(match_obj.group(2), 16) for idx in range(low + 1, high + 1): char_data[idx] = name if script_data[script_idx] == None: script_data[script_idx] = [low, None, None, None, None] script_data[script_idx][1] = high extended_script_indicies = {} with open("Unicode.tables/ScriptExtensions.txt") as f: for line in f: match_obj = property_re.match(line) if match_obj == None: continue low = int(match_obj.group(1), 16) high = low if match_obj.group(2) != None: high = int(match_obj.group(2), 16) for abbrev in match_obj.group(3).split(" "): if abbrev not in extended_script_indicies: idx = script_abbrevs.index(abbrev) extended_script_indicies[abbrev] = idx rec = script_data[idx] rec[2] = low rec[3] = high else: idx = extended_script_indicies[abbrev] rec = script_data[idx] if rec[2] > low: rec[2] = low if rec[3] < high: rec[3] = high if rec[4] == None: name = script_names[idx] for idx in range(low, high + 1): if char_data[idx] != name: rec[4] = idx break long_property_name = False for idx, rec in enumerate(script_data): script_name = script_names[idx] if script_name == "Unknown": continue script_abbrev = script_abbrevs[idx] write_both("# Base script check\n") write_both("/^\\p{sc=%s}/utf\n" % script_name) write_both(" %s\n" % to_string_char(rec[0])) output_file.write(" 0: %s\n" % to_string_char(rec[0])) write_both("\n") write_both("/^\\p{Script=%s}/utf\n" % 
script_abbrev) write_both(" %s\n" % to_string_char(rec[1])) output_file.write(" 0: %s\n" % to_string_char(rec[1])) write_both("\n") if rec[2] != None: property_name = "scx" if long_property_name: property_name = "Script_Extensions" write_both("# Script extension check\n") write_both("/^\\p{%s}/utf\n" % script_name) write_both(" %s\n" % to_string_char(rec[2])) output_file.write(" 0: %s\n" % to_string_char(rec[2])) write_both("\n") write_both("/^\\p{%s=%s}/utf\n" % (property_name, script_abbrev)) write_both(" %s\n" % to_string_char(rec[3])) output_file.write(" 0: %s\n" % to_string_char(rec[3])) write_both("\n") long_property_name = not long_property_name if rec[4] != None: write_both("# Script extension only character\n") write_both("/^\\p{%s}/utf\n" % script_name) write_both(" %s\n" % to_string_char(rec[4])) output_file.write(" 0: %s\n" % to_string_char(rec[4])) write_both("\n") write_both("/^\\p{sc=%s}/utf\n" % script_name) write_both(" %s\n" % to_string_char(rec[4])) output_file.write("No match\n") write_both("\n") else: print("External character has not found for %s" % script_name) high = rec[1] if rec[3] != None and rec[3] > rec[1]: high = rec[3] write_both("# Character not in script\n") write_both("/^\\p{%s}/utf\n" % script_name) write_both(" %s\n" % to_string_char(high + 1)) output_file.write("No match\n") write_both("\n") gen_script_tests() write_both("# End of test\n") ================================================ FILE: maint/GenerateUcd.py ================================================ #! /usr/bin/env python3 # PCRE2 UNICODE PROPERTY SUPPORT # ------------------------------ # # This script generates the pcre2_ucd.c file from Unicode data files. This is # the compressed Unicode property data used by PCRE2. The script was created in # December 2021 as part of the Unicode data generation refactoring. 
It is # basically a re-working of the MultiStage2.py script that was submitted to the # PCRE project by Peter Kankowski in 2008 as part of a previous upgrading of # Unicode property support. A number of extensions have since been added. The # main difference in the 2021 upgrade (apart from comments and layout) is that # the data tables (e.g. list of script names) are now listed in or generated by # a separate Python module that is shared with the other Generate scripts. # # This script must be run in the "maint" directory. It requires the following # Unicode data tables: BidiMirroring.txt, CaseFolding.txt, # DerivedBidiClass.txt, DerivedCoreProperties.txt, DerivedGeneralCategory.txt, # GraphemeBreakProperty.txt, PropList.txt, PropertyAliases.txt, # PropertyValueAliases.txt, ScriptExtensions.txt, Scripts.txt, and # emoji-data.txt. These must be in the Unicode.tables subdirectory. # # The emoji-data.txt file is found in the "emoji" subdirectory even though it # is technically part of a different (but coordinated) standard as shown # in files associated with Unicode Technical Standard #51 ("Unicode Emoji"), # for example: # # http://unicode.org/Public/emoji/13.0/ReadMe.txt # # DerivedBidiClass.txt and DerivedGeneralCategory.txt are in the "extracted" # subdirectory of the Unicode database (UCD) on the Unicode web site; # GraphemeBreakProperty.txt is in the "auxiliary" subdirectory. The other files # are in the top-level UCD directory. # # ----------------------------------------------------------------------------- # Minor modifications made to the original script: # Added #! line at start # Removed tabs # Made it work with Python 2.4 by rewriting two statements that needed 2.5 # Consequent code tidy # Adjusted data file names to take from the Unicode.tables directory # Adjusted global table names by prefixing _pcre_. # Commented out stuff relating to the casefolding table, which isn't used; # removed completely in 2012. 
# Corrected size calculation # Add #ifndef SUPPORT_UCP to use dummy tables when no UCP support is needed. # Update for PCRE2: name changes, and SUPPORT_UCP is abolished. # # Major modifications made to the original script: # Added code to add a grapheme break property field to records. # # Added code to search for sets of more than two characters that must match # each other caselessly. A new table is output containing these sets, and # offsets into the table are added to the main output records. This new # code scans CaseFolding.txt instead of UnicodeData.txt, which is no longer # used. # # Update for Python3: # . Processed with 2to3, but that didn't fix everything # . Changed string.strip to str.strip # . Added encoding='utf-8' to the open() call # . Inserted 'int' before blocksize/ELEMS_PER_LINE because an int is # required and the result of the division is a float # # Added code to scan the emoji-data.txt file to find the Extended Pictographic # property, which is used by PCRE2 as a grapheme breaking property. This was # done when updating to Unicode 11.0.0 (July 2018). # # Added code to add a Script Extensions field to records. This has increased # their size from 8 to 12 bytes, only 10 of which are currently used. # # Added code to add a bidi class field to records by scanning the # DerivedBidiClass.txt and PropList.txt files. This uses one of the two spare # bytes, so now 11 out of 12 are in use. # # 01-March-2010: Updated list of scripts for Unicode 5.2.0 # 30-April-2011: Updated list of scripts for Unicode 6.0.0 # July-2012: Updated list of scripts for Unicode 6.1.0 # 20-August-2012: Added scan of GraphemeBreakProperty.txt and added a new # field in the record to hold the value. Luckily, the # structure had a hole in it, so the resulting table is # not much bigger than before. # 18-September-2012: Added code for multiple caseless sets. This uses the # final hole in the structure. 
# 30-September-2012: Added RegionalIndicator break property from Unicode 6.2.0 # 13-May-2014: Updated for PCRE2 # 03-June-2014: Updated for Python 3 # 20-June-2014: Updated for Unicode 7.0.0 # 12-August-2014: Updated to put Unicode version into the file # 19-June-2015: Updated for Unicode 8.0.0 # 02-July-2017: Updated for Unicode 10.0.0 # 03-July-2018: Updated for Unicode 11.0.0 # 07-July-2018: Added code to scan emoji-data.txt for the Extended # Pictographic property. # 01-October-2018: Added the 'Unknown' script name # 03-October-2018: Added new field for Script Extensions # 27-July-2019: Updated for Unicode 12.1.0 # 10-March-2020: Updated for Unicode 13.0.0 # PCRE2-10.39: Updated for Unicode 14.0.0 # 05-December-2021: Added code to scan DerivedBidiClass.txt for bidi class, # and also PropList.txt for the Bidi_Control property # 19-December-2021: Reworked script extensions lists to be bit maps instead # of zero-terminated lists of script numbers. # ---------------------------------------------------------------------------- # # Changes to the refactored script: # # 26-December-2021: Refactoring completed # 10-January-2022: Addition of general Boolean property support # 12-January-2022: Merge scriptx and bidiclass fields # 14-January-2022: Enlarge Boolean property offset to 12 bits # 28-January-2023: Remove ASCII "other case" from non-ASCII character that # are present in caseless sets. # # ---------------------------------------------------------------------------- # # # The main tables generated by this script are used by macros defined in # pcre2_internal.h. They look up Unicode character properties using short # sequences of code that contains no branches, which makes for greater speed. # # Conceptually, there is a table of records (of type ucd_record), one for each # Unicode character. 
Each record contains the script number, script extension # value, character type, grapheme break type, offset to caseless matching set, # offset to the character's other case, the bidi class, and offset to bitmap of # Boolean properties. # # A real table covering all Unicode characters would be far too big. It can be # efficiently compressed by observing that many characters have the same # record, and many blocks of characters (taking 128 characters in a block) have # the same set of records as other blocks. This leads to a 2-stage lookup # process. # # This script constructs seven tables. The ucd_caseless_sets table contains # lists of characters that all match each other caselessly. Each list is # in order, and is terminated by NOTACHAR (0xffffffff), which is larger than # any valid character. The first list is empty; this is used for characters # that are not part of any list. # # The ucd_digit_sets table contains the code points of the '9' characters in # each set of 10 decimal digits in Unicode. This is used to ensure that digits # in script runs all come from the same set. The first element in the vector # contains the number of subsequent elements, which are in ascending order. # # Scripts are partitioned into two groups. Scripts that appear in at least one # character's script extension list come first, followed by "Unknown" and then # all the rest. This sorting is done automatically in the GenerateCommon.py # script. A script's number is its index in the script_names list. # # The ucd_script_sets table contains bitmaps that represent lists of scripts # for Script Extensions properties. Each bitmap consists of a fixed number of # unsigned 32-bit numbers, enough to allocate a bit for every script that is # used in any character's extension list, that is, enough for every script # whose number is less than ucp_Unknown. A character's script extension value # in its ucd record is an offset into the ucd_script_sets vector. 
The first # bitmap has no bits set; characters that have no script extensions have zero # as their script extensions value so that they use this map. # # The ucd_boolprop_sets table contains bitmaps that represent lists of Boolean # properties. Each bitmap consists of a fixed number of unsigned 32-bit # numbers, enough to allocate a bit for each supported Boolean property. # # The ucd_records table contains one instance of every unique character record # that is required. The ucd_stage1 table is indexed by a character's block # number, which is the character's code point divided by 128, since 128 is the # size of each block. The result of a lookup in ucd_stage1 is a "virtual" block # number. # # The ucd_stage2 table is a table of "virtual" blocks; each block is indexed by # the offset of a character within its own block, and the result is the index # number of the required record in the ucd_records vector. # # The following examples are correct for the Unicode 14.0.0 database. Future # updates may change the actual lookup values. # # Example: lowercase "a" (U+0061) is in block 0 # lookup 0 in stage1 table yields 0 # lookup 97 (0x61) in the first table in stage2 yields 35 # record 35 is { 0, 5, 12, 0, -32, 18432, 44 } # 0 = ucp_Latin => Latin script # 5 = ucp_Ll => Lower case letter # 12 = ucp_gbOther => Grapheme break property "Other" # 0 => Not part of a caseless set # -32 (-0x20) => Other case is U+0041 # 18432 = 0x4800 => Combined Bidi class + script extension values # 44 => Offset to Boolean properties # # The top 5 bits of the sixth field are the Bidi class, with the rest being the # script extension value, giving: # # 9 = ucp_bidiL => Bidi class left-to-right # 0 => No special script extension property # # Almost all lowercase latin characters resolve to the same record. One or two # are different because they are part of a multi-character caseless set (for # example, k, K and the Kelvin symbol are such a set). 
# # Example: hiragana letter A (U+3042) is in block 96 (0x60) # lookup 96 in stage1 table yields 93 # lookup 66 (0x42) in table 93 in stage2 yields 819 # record 819 is { 20, 7, 12, 0, 0, 18432, 82 } # 20 = ucp_Hiragana => Hiragana script # 7 = ucp_Lo => Other letter # 12 = ucp_gbOther => Grapheme break property "Other" # 0 => Not part of a caseless set # 0 => No other case # 18432 = 0x4800 => Combined Bidi class + script extension values # 82 => Offset to Boolean properties # # The top 5 bits of the sixth field are the Bidi class, with the rest being the # script extension value, giving: # # 9 = ucp_bidiL => Bidi class left-to-right # 0 => No special script extension property # # Example: vedic tone karshana (U+1CD0) is in block 57 (0x39) # lookup 57 in stage1 table yields 55 # lookup 80 (0x50) in table 55 in stage2 yields 621 # record 621 is { 84, 12, 3, 0, 0, 26762, 96 } # 84 = ucp_Inherited => Script inherited from predecessor # 12 = ucp_Mn => Non-spacing mark # 3 = ucp_gbExtend => Grapheme break property "Extend" # 0 => Not part of a caseless set # 0 => No other case # 26762 = 0x688A => Combined Bidi class + script extension values # 96 => Offset to Boolean properties # # The top 5 bits of the sixth field are the Bidi class, with the rest being the # script extension value, giving: # # 13 = ucp_bidiNSM => Bidi class non-spacing mark # 138 => Script Extension list offset = 138 # # At offset 138 in the ucd_script_sets vector we find a bitmap with bits 1, 8, # 18, and 47 set. This means that this character is expected to be used with # any of those scripts, which are Bengali, Devanagari, Kannada, and Grantha. # # Philip Hazel, last updated 14 January 2022. 
##############################################################################

# Import standard modules

import re
import string
import sys

# Import common data lists and functions

from GenerateCommon import \
  bidi_classes, \
  bool_properties, \
  bool_propsfiles, \
  bool_props_list_item_size, \
  break_properties, \
  category_names, \
  general_category_names, \
  script_abbrevs, \
  script_list_item_size, \
  script_names, \
  open_output

# Some general parameters

MAX_LIST = 8 # keep on sync with the value in pcre2_auto_possess.c
MAX_UNICODE = 0x110000
NOTACHAR = 0xffffffff

# NOTE(review): this copy of the script had its line structure and runs of
# whitespace collapsed. String literals below are reproduced exactly as they
# appear in this copy; if the upstream literals use wider padding, compare
# them before regenerating pcre2_ucd.c.


# ---------------------------------------------------------------------------
# DEFINE FUNCTIONS
# ---------------------------------------------------------------------------

# Parse a line of Scripts.txt, GraphemeBreakProperty.txt or
# DerivedGeneralCategory.txt. Returns a function that maps a split data line
# to the position of its second field within the given list of names.

def make_get_names(enum):
  return lambda chardata: enum.index(chardata[1])


# Parse a line of DerivedBidiClass.txt. Values longer than three characters
# are looked up in the list of long names, the others in the list of short
# names. Both lists are created by the main code before any table is read.

def get_bidi(chardata):
  if len(chardata[1]) > 3:
    return bidi_classes_long.index(chardata[1])
  else:
    return bidi_classes_short.index(chardata[1])


# Parse a line of CaseFolding.txt. Only entries with status 'C' or 'S' are
# used; the result is the distance from the character to its folded form.
# Returning None makes read_table() skip the line.

def get_other_case(chardata):
  if chardata[1] == 'C' or chardata[1] == 'S':
    return int(chardata[2], 16) - int(chardata[0], 16)
  return None


# Parse a line of ScriptExtensions.txt. The space-separated abbreviations are
# turned into a tuple of script numbers, which is added to script_lists if it
# is not already there. The result is the offset of that entry, in units of
# script_list_item_size.

def get_script_extension(chardata):
  script_extension = tuple(script_abbrevs.index(abbrev) for abbrev in chardata[1].split(' '))

  try:
    index = script_lists.index(script_extension)
  except ValueError:
    index = len(script_lists)
    script_lists.append(script_extension)

  return index * script_list_item_size


# Read a whole table in memory, setting/checking the Unicode version.
#
#   file_name      path of the form "<directory>/<base>.txt"
#   get_value      called with the list of stripped, ';'-separated fields of
#                  each data line; returning None skips the line
#   default_value  value for code points that no line mentions
#
# Returns a list of MAX_UNICODE values. The first line of the file must be
# "# <base>-<version>.txt"; the first version seen is remembered in the global
# unicode_version and a warning is written to stderr for files that disagree.

def read_table(file_name, get_value, default_value):
  global unicode_version

  name_match = re.match(r'^[^/]+/([^.]+)\.txt$', file_name)
  file_base = name_match.group(1)
  version_pat = r"^# " + re.escape(file_base) + r"-(\d+\.\d+\.\d+)\.txt$"

  # The "with" block closes the file even if a line cannot be parsed.
  with open(file_name, 'r', encoding='utf-8') as file:
    version_match = re.match(version_pat, file.readline())
    version = version_match.group(1)
    if unicode_version == "":
      unicode_version = version
    elif unicode_version != version:
      # The file name used to be passed as a separate print() argument, so the
      # %s was never substituted.
      print("WARNING: Unicode version differs in %s" % file_name, file=sys.stderr)

    table = [default_value] * MAX_UNICODE
    for line in file:
      # In DerivedBidiClass.txt the "@missing" comment lines carry data, so
      # strip only their prefix before comments are removed.
      if file_base == 'DerivedBidiClass':
        line = re.sub(r'# @missing: ', '', line)

      line = re.sub(r'#.*', '', line)
      chardata = list(map(str.strip, line.split(';')))
      if len(chardata) <= 1:
        continue

      value = get_value(chardata)
      if value is None:
        continue

      m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', chardata[0])
      char = int(m.group(1), 16)
      if m.group(3) is None:
        last = char
      else:
        last = int(m.group(3), 16)

      for i in range(char, last + 1):
        if file_base == 'CaseFolding' and table[i] != default_value:
          print("WARNING: multiple rules for other_case[0x{:X}]".format(i))
        table[i] = value

  return table


# Get the smallest possible C language type for the values in a table. The
# result is a (type name, size in bytes) pair; unsigned types are tried first.
# Raises OverflowError if the values do not fit into 32 bits.

def get_type_size(table):
  type_size = [("uint8_t", 1), ("uint16_t", 2), ("uint32_t", 4),
    ("signed char", 1), ("int16_t", 2), ("int32_t", 4)]
  limits = [(0, 255), (0, 65535), (0, 4294967295), (-128, 127),
    (-32768, 32767), (-2147483648, 2147483647)]

  minval = min(table)
  maxval = max(table)
  for num, (minlimit, maxlimit) in enumerate(limits):
    if minlimit <= minval and maxval <= maxlimit:
      return type_size[num]
  raise OverflowError("Too large to fit into C types")


# Get the total size, in bytes, of a list of tables

def get_tables_size(*tables):
  total_size = 0
  for table in tables:
    _, size = get_type_size(table)
    total_size += size * len(table)
  return total_size


# Compress a table into the two stages. Identical blocks of block_size values
# are stored only once in stage 2; stage 1 holds, for each block of the input,
# the number of the stage 2 block that contains its values.

def compress_table(table, block_size):
  blocks = {} # Dictionary for finding identical blocks
  stage1 = [] # Stage 1 table contains block numbers (indices into stage 2 table)
  stage2 = [] # Stage 2 table contains the blocks with property values
  table = tuple(table)
  for i in range(0, len(table), block_size):
    block = table[i:i+block_size]
    start = blocks.get(block)
    if start is None:
      # Allocate a new block. Integer division keeps block numbers as ints;
      # "/" produced floats here.
      start = len(stage2) // block_size
      stage2 += block
      blocks[block] = start
    stage1.append(start)
  return stage1, stage2


# Output a table as a C array to the global output file f. Without a block
# size the values are written 16 per line, each line followed by a comment
# giving the code point it starts at. With a block size each block is preceded
# by a comment giving its number.

def write_table(table, table_name, block_size = None):
  type_name, size = get_type_size(table)
  ELEMS_PER_LINE = 16

  s = "const %s %s[] = { /* %d bytes" % (type_name, table_name, size * len(table))
  if block_size:
    s += ", block = %d" % block_size
  f.write(s + " */\n")
  table = tuple(table)
  if block_size is None:
    fmt = "%3d," * ELEMS_PER_LINE + " /* U+%04X */\n"
    mult = MAX_UNICODE / len(table)
    for i in range(0, len(table), ELEMS_PER_LINE):
      f.write(fmt % (table[i:i+ELEMS_PER_LINE] + (int(i * mult),)))
  else:
    el = min(block_size, ELEMS_PER_LINE)
    fmt = "%3d," * el + "\n"
    if block_size > ELEMS_PER_LINE:
      fmt = fmt * (block_size // ELEMS_PER_LINE)
    for i in range(0, len(table), block_size):
      # The block number is an exact integer; "//" avoids a float round trip.
      f.write(("\n/* block %d */\n" + fmt) % ((i // block_size,) + table[i:i+block_size]))
  f.write("};\n\n")


# Extract the unique combinations of properties into records. Returns a list
# holding the record number for each position of the input tables, and a
# dictionary that maps each unique tuple of values to its record number.

def combine_tables(*tables):
  records = {}
  index = []
  for t in zip(*tables):
    i = records.get(t)
    if i is None:
      i = records[t] = len(records)
    index.append(i)
  return index, records


# Create a record struct. Returns the size of one record in bytes, including
# alignment padding, and the text of a C structure definition whose fields use
# the smallest type that holds each column. The text ends by closing a C
# comment that the caller is expected to have opened.

def get_record_size_struct(records):
  size = 0
  structure = 'typedef struct {\n'
  for i in range(len(records[0])):
    record_slice = [record[i] for record in records]
    slice_type, slice_size = get_type_size(record_slice)

    # add padding: round up to the next multiple of slice_size
    size = (size + slice_size - 1) & -slice_size
    size += slice_size
    structure += '%s property_%d;\n' % (slice_type, i)

  # round up to the first item of the next structure in array
  record_slice = [record[0] for record in records]
  slice_type, slice_size = get_type_size(record_slice)
  size = (size + slice_size - 1) & -slice_size

  structure += '} ucd_record;\n*/\n'
  return size, structure


# Write records to the global output file f, in record number order, each
# followed by a comment giving its number.

def write_records(records, record_size):
  f.write('const ucd_record PRIV(ucd_records)[] = { ' + \
    '/* %d bytes, record size %d */\n' % (len(records) * record_size, record_size))

  ordered = sorted(records.items(), key = lambda x: x[1])

  for i, record in enumerate(ordered):
    f.write((' {' + '%6d, ' * len(record[0]) + '}, /* %3d */\n') % (record[0] + (i,)))
  f.write('};\n\n')


# Write a bit set. Each item of the list is a collection of bit numbers; it is
# written as item_size 32-bit words. The closing brace of the C array is
# written as well, but not its opening line.

def write_bitsets(list, item_size):
  for d in list:
    bitwords = [0] * item_size
    for idx in d:
      bitwords[idx // 32] |= 1 << (idx & 31)
    s = " "
    for x in bitwords:
      f.write("%s" % s)
      s = ", "
      f.write("0x%08xu" % x)
    f.write(",\n")
  f.write("};\n\n")


# ---------------------------------------------------------------------------
# This bit of code must have been useful when the original script was being
# developed. Retain it just in case it is ever needed again.

# def test_record_size():
#   tests = [ \
#     ( [(3,), (6,), (6,), (1,)], 1 ), \
#     ( [(300,), (600,), (600,), (100,)], 2 ), \
#     ( [(25, 3), (6, 6), (34, 6), (68, 1)], 2 ), \
#     ( [(300, 3), (6, 6), (340, 6), (690, 1)], 4 ), \
#     ( [(3, 300), (6, 6), (6, 340), (1, 690)], 4 ), \
#     ( [(300, 300), (6, 6), (6, 340), (1, 690)], 4 ), \
#     ( [(3, 100000), (6, 6), (6, 123456), (1, 690)], 8 ), \
#     ( [(100000, 300), (6, 6), (123456, 6), (1, 690)], 8 ), \
#   ]
#   for test in tests:
#     size, struct = get_record_size_struct(test[0])
#     assert(size == test[1])
# test_record_size()
# ---------------------------------------------------------------------------


# ---------------------------------------------------------------------------
# MAIN CODE FOR CREATING TABLES
# ---------------------------------------------------------------------------

unicode_version = ""

# Some of the tables imported from GenerateCommon.py have alternate comment
# strings for use by GenerateUcpHeader. The comments are not wanted here, so
# remove them.
bidi_classes_short = bidi_classes[::2]
bidi_classes_long = bidi_classes[1::2]
break_properties = break_properties[::2]
category_names = category_names[::2]

# Create the various tables from Unicode data files

script = read_table('Unicode.tables/Scripts.txt', make_get_names(script_names), script_names.index('Unknown'))
category = read_table('Unicode.tables/DerivedGeneralCategory.txt', make_get_names(category_names), category_names.index('Cn'))
break_props = read_table('Unicode.tables/GraphemeBreakProperty.txt', make_get_names(break_properties), break_properties.index('Other'))
other_case = read_table('Unicode.tables/CaseFolding.txt', get_other_case, 0)
bidi_class = read_table('Unicode.tables/DerivedBidiClass.txt', get_bidi, bidi_classes_short.index('L'))

# The grapheme breaking rules were changed for Unicode 11.0.0 (June 2018). Now
# we need to find the Extended_Pictographic property for emoji characters. This
# can be set as an additional grapheme break property, because the default for
# all the emojis is "other". We scan the emoji-data.txt file and modify the
# break-props table.

gb_other = break_properties.index('Other')
gb_extended_pictographic = break_properties.index('Extended_Pictographic')

with open('Unicode.tables/emoji-data.txt', 'r', encoding='utf-8') as file:
  for line in file:
    line = re.sub(r'#.*', '', line)
    chardata = list(map(str.strip, line.split(';')))
    if len(chardata) <= 1:
      continue
    if chardata[1] != "Extended_Pictographic":
      continue

    m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', chardata[0])
    char = int(m.group(1), 16)
    if m.group(3) is None:
      last = char
    else:
      last = int(m.group(3), 16)

    for i in range(char, last + 1):
      if break_props[i] != gb_other:
        # The values used to be passed as separate print() arguments, so the
        # %x and %s were never substituted.
        print("WARNING: Emoji 0x%x has break property %s, not 'Other'" %
          (i, break_properties[break_props[i]]), file=sys.stderr)
      break_props[i] = gb_extended_pictographic

# Handle script extensions. The get_script_extension() function maintains a
# list of unique bitmaps representing lists of scripts, returning the offset
# in that list. Initialize the list with an empty set, which is used for
# characters that have no script extensions.

script_lists = [[]]
scriptx_bidi_class = read_table('Unicode.tables/ScriptExtensions.txt', get_script_extension, 0)

# Merge the bidi class into the bits above the script extension offset.
for idx in range(len(scriptx_bidi_class)):
  scriptx_bidi_class[idx] |= bidi_class[idx] << 11
bidi_class = None

# Find the Boolean properties of each character. This next bit of magic creates
# a list of empty lists. Using [[]] * MAX_UNICODE gives a list of references to
# the *same* list, which is not what we want.

bprops = [[] for _ in range(MAX_UNICODE)]

# Collect the properties from the various files. Properties that are not in
# bool_properties are ignored.

for filename in bool_propsfiles:
  try:
    # Read as UTF-8, as read_table() does, instead of the locale's encoding.
    file = open('Unicode.tables/' + filename, 'r', encoding='utf-8')
  except IOError:
    print(f"** Couldn't open {'Unicode.tables/' + filename}\n")
    sys.exit(1)

  with file:
    for line in file:
      line = re.sub(r'#.*', '', line)
      data = list(map(str.strip, line.split(';')))
      if len(data) <= 1:
        continue

      try:
        ix = bool_properties.index(data[1])
      except ValueError:
        continue

      m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', data[0])
      char = int(m.group(1), 16)
      if m.group(3) is None:
        last = char
      else:
        last = int(m.group(3), 16)

      for i in range(char, last + 1):
        bprops[i].append(ix)

# The ASCII property isn't listed in any files, but it is easy enough to add
# it manually.

ix = bool_properties.index("ASCII")
for i in range(128):
  bprops[i].append(ix)

# The Bidi_Mirrored property isn't listed in any property files. We have to
# deduce it from the file that lists the mirrored characters.
ix = bool_properties.index("Bidi_Mirrored")

try:
  # Read as UTF-8, as read_table() does, instead of the locale's encoding.
  file = open('Unicode.tables/BidiMirroring.txt', 'r', encoding='utf-8')
except IOError:
  print(f"** Couldn't open {'Unicode.tables/BidiMirroring.txt'}\n")
  sys.exit(1)

with file:
  for line in file:
    line = re.sub(r'#.*', '', line)
    data = list(map(str.strip, line.split(';')))
    if len(data) <= 1:
      continue
    c = int(data[0], 16)
    bprops[c].append(ix)

# Scan each character's boolean property list and create a list of unique
# lists, at the same time setting the index in that list for each property in
# the bool_props vector. Two lists count as the same when they hold the same
# set of properties, so the lists already stored are indexed by that set
# instead of being compared one by one for every code point.

bool_props = [0] * MAX_UNICODE
bool_props_lists = [[]]
bool_props_index = {frozenset(): 0}

for c in range(MAX_UNICODE):
  key = frozenset(bprops[c])
  i = bool_props_index.get(key)
  if i is None:
    i = len(bool_props_lists)
    bool_props_lists.append(bprops[c])
    bool_props_index[key] = i
  bool_props[c] = i * bool_props_list_item_size

# This block of code was added by PH in September 2012. It scans the other_case
# table to find sets of more than two characters that must all match each other
# caselessly. Later in this script a table of these sets is written out.
# However, we have to do this work here in order to compute the offsets in the
# table that are inserted into the main table.

# The CaseFolding.txt file lists pairs, but the common logic for reading data
# sets only one value, so first we go through the table and set "return"
# offsets for those that are not already set.

for c in range(MAX_UNICODE):
  if other_case[c] != 0 and other_case[c + other_case[c]] == 0:
    other_case[c + other_case[c]] = -other_case[c]

# Now scan again and create equivalence sets.

caseless_sets = []

for c in range(MAX_UNICODE):
  o = c + other_case[c]

  # Trigger when this character's other case does not point back here. We
  # now have three characters that are case-equivalent.

  if other_case[o] != -other_case[c]:
    t = o + other_case[o]

    # Scan the existing sets to see if any of the three characters are already
    # part of a set. If so, unite the existing set with the new set.

    appended = 0
    for s in caseless_sets:
      found = 0
      for x in s:
        if x == c or x == o or x == t:
          found = 1

      # Add new characters to an existing set

      if found:
        found = 0
        for y in [c, o, t]:
          for x in s:
            if x == y:
              found = 1
          if not found:
            s.append(y)
            # NOTE(review): the nesting of this assignment could not be
            # recovered from this copy; it is assumed to belong with the
            # append above - confirm against upstream.
            appended = 1

    # If we have not added to an existing set, create a new one.

    if not appended:
      caseless_sets.append([c, o, t])

# End of loop looking for caseless sets.

# Now scan the sets and set appropriate offsets for the characters. Offset 0
# is left for characters that are in no set, and each set is followed by one
# extra slot.

caseless_offsets = [0] * MAX_UNICODE

offset = 1
for s in caseless_sets:
  for x in s:
    caseless_offsets[x] = offset
  offset += len(s) + 1

# End of block of code for creating offsets for caseless matching sets.

# Scan the caseless sets, and for any non-ASCII character that has an ASCII
# character as its "base" other case, remove the other case. This makes it
# easier to handle those characters when the PCRE2 option for not mixing ASCII
# and non-ASCII is enabled. In principle one should perhaps scan for a
# non-ASCII alternative, but in practice these don't exist.

for s in caseless_sets:
  for x in s:
    if x > 127 and x + other_case[x] < 128:
      other_case[x] = 0

# Append a couple of extra caseless sets (unreferenced by the record objects)
# to hold the optional Turkish case equivalences.

turkish_dotted_i_index = offset
caseless_sets.append([0x69, 0x0130])
caseless_sets.append([0x49, 0x0131])

# Combine all the tables

table, records = combine_tables(script, category, break_props,
  caseless_offsets, other_case, scriptx_bidi_class, bool_props)

# Find the record size and create a string definition of the structure for
# outputting as a comment.
record_size, record_struct = get_record_size_struct(list(records.keys()))

# Try each candidate block size (32 up to 512) for the two-stage table and
# remember the one giving the smallest total: the records themselves plus
# whatever get_tables_size() reports for the two stages.

min_size = sys.maxsize
for block_size in (32, 64, 128, 256, 512):
  stage1, stage2 = compress_table(table, block_size)
  size = len(records) * record_size + get_tables_size(stage1, stage2)
  #print("/* block size {:3d} => {:5d} bytes */".format(block_size, size))
  if size >= min_size:
    continue
  min_size = size
  min_stage1, min_stage2 = stage1, stage2
  min_block_size = block_size


# ---------------------------------------------------------------------------
#                   MAIN CODE FOR WRITING THE OUTPUT FILE
# ---------------------------------------------------------------------------

# Open the output file (no return on failure). This call also writes standard
# header boilerplate.

f = open_output("pcre2_ucd.c")

# Output this file's heading text

f.write("""\
/* This file contains tables of Unicode properties that are extracted from
Unicode data files. See the comments at the start of maint/GenerateUcd.py for
details.

As well as being part of the PCRE2 library, this file is #included by the
pcre2test program, which redefines the PRIV macro to change table names from
_pcre2_xxx to xxxx, thereby avoiding name clashes with the library. At present,
just one of these tables is actually needed. When compiling the library, some
headers are needed. */

#ifndef PCRE2_PCRE2TEST
#include "pcre2_internal.h"
#endif /* PCRE2_PCRE2TEST */

/* The tables herein are needed only when UCP support is built, and in PCRE2
that happens automatically with UTF support. This module should not be
referenced otherwise, so it should not matter whether it is compiled or not.
However a comment was received about space saving - maybe the guy linked all
the modules rather than using a library - so we include a condition to cut out
the tables when not needed. But don't leave a totally empty module because some
compilers barf at that. Instead, just supply some small dummy tables. */

#ifndef SUPPORT_UNICODE
const ucd_record PRIV(ucd_records)[] = {{0,0,0,0,0,0,0}};
const uint16_t PRIV(ucd_stage1)[] = {0};
const uint16_t PRIV(ucd_stage2)[] = {0};
const uint32_t PRIV(ucd_caseless_sets)[] = {0};
const uint32_t PRIV(ucd_nocase_ranges)[] = {0};
const uint32_t PRIV(ucd_nocase_ranges_size) = 0;
#else
\n""")

# --- Output some variable heading stuff ---

f.write("/* Total size: %d bytes, block size: %d. */\n\n" % (min_size, min_block_size))
f.write('const char *PRIV(unicode_version) = "{}";\n\n'.format(unicode_version))

# The C comment opened here is left open on purpose; it is followed directly
# by record_struct (presumably that text closes it - see
# get_record_size_struct).

f.write("""\
/* When recompiling tables with a new Unicode version, please check the types
in this structure definition with those in pcre2_internal.h (the actual field
names will be different).
\n""")

f.write(record_struct)

f.write("""
/* If the 32-bit library is run in non-32-bit mode, character values greater
than 0x10ffff may be encountered. For these we set up a special record. */

#if PCRE2_CODE_UNIT_WIDTH == 32
const ucd_record PRIV(dummy_ucd_record)[] = {{
 ucp_Unknown, /* script */
 ucp_Cn, /* type unassigned */
 ucp_gbOther, /* grapheme break property */
 0, /* case set */
 0, /* other case */
 0 | (ucp_bidiL << UCD_BIDICLASS_SHIFT), /* script extension and bidi class */
 0, /* bool properties offset */
 }};
#endif
\n""")

# --- Output the table of caseless character sets ---

f.write("""\
/* This table contains lists of characters that are caseless sets of
more than one character. Each list is terminated by NOTACHAR. */

const uint32_t PRIV(ucd_caseless_sets)[] = {
 NOTACHAR,
""")

# One output line per set: its members in ascending order, then NOTACHAR.
for members in caseless_sets:
  f.write(''.join(' 0x%04x,' % cp for cp in sorted(members)))
  f.write(' NOTACHAR,\n')
f.write('};\n\n')

# --- Output the indices of the Turkish caseless character sets ---

f.write("""\
/* This is the index, within ucd_caseless_sets, of the additional Turkish
case-equivalences. The dotted I ones are this offset; the dotless I are +3
from here. */

const uint32_t PRIV(ucd_turkish_dotted_i_caseset) = %d;

""" % (turkish_dotted_i_index))

# --- Other tables are not needed by pcre2test ---

f.write("""\
/* When #included in pcre2test, we don't need the table of digit sets, nor the
the large main UCD tables. */

#ifndef PCRE2_PCRE2TEST
\n""")

# --- Output the nocase sets ---

f.write("""\
/* This table contains character ranges, where the characters in the range have
no other case. Both start and end values are excluded from the range. */

const uint32_t PRIV(ucd_nocase_ranges)[] = {
""")

range_start = 0
size = 0
total = 0
# Only gaps of more than eight characters are written out as a range.
expected_size = 8
# These two characters gain casing in Turkish, so they always end a range.
turkish_cased = (0x0130, 0x0131)

for c in range(1, MAX_UNICODE):
  if other_case[c] == 0 and c not in turkish_cased:
    continue
  # c has an other case: close the caseless run that precedes it.
  if c - range_start > expected_size:
    range_size = c - range_start - 1
    f.write(' 0x%04x, 0x%04x, /* %d */\n' % (range_start, c, range_size))
    total += range_size
    size += 2
  range_start = c

# The else case is unlikely
if other_case[MAX_UNICODE - 1] == 0 and MAX_UNICODE - range_start > expected_size:
  range_size = MAX_UNICODE - range_start - 1
  f.write(' 0x%04x, 0x%04x, /* %d */\n' % (range_start, MAX_UNICODE, range_size))
  total += range_size
  size += 2

f.write(' 0xffffffff, 0xffffffff /* terminator */\n};\n\n')

f.write('/* Total: %d characters. */\nconst uint32_t PRIV(ucd_nocase_ranges_size) = %d;\n\n' % (total, size))

# --- Read Scripts.txt again for the sets of 10 digits.
--- digitsets = [] file = open('Unicode.tables/Scripts.txt', 'r', encoding='utf-8') for line in file: m = re.match(r'([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)\s+;\s+\S+\s+#\s+Nd\s+', line) if m is None: continue first = int(m.group(1),16) last = int(m.group(2),16) if ((last - first + 1) % 10) != 0: f.write("ERROR: %04x..%04x does not contain a multiple of 10 characters" % (first, last), file=sys.stderr) while first < last: digitsets.append(first + 9) first += 10 file.close() digitsets.sort() f.write("""\ /* This table lists the code points for the '9' characters in each set of decimal digits. It is used to ensure that all the digits in a script run come from the same set. */ const uint32_t PRIV(ucd_digit_sets)[] = { """) f.write(" %d, /* Number of subsequent values */" % len(digitsets)) count = 8 for d in digitsets: if count == 8: f.write("\n ") count = 0 f.write(" 0x%05x," % d) count += 1 f.write("\n};\n\n") f.write("""\ /* This vector is a list of script bitsets for the Script Extension property. The number of 32-bit words in each bitset is #defined in pcre2_ucp.h as ucd_script_sets_item_size. */ const uint32_t PRIV(ucd_script_sets)[] = { """) write_bitsets(script_lists, script_list_item_size) f.write("""\ /* This vector is a list of bitsets for Boolean properties. The number of 32_bit words in each bitset is #defined as ucd_boolprop_sets_item_size in pcre2_ucp.h. */ const uint32_t PRIV(ucd_boolprop_sets)[] = { """) write_bitsets(bool_props_lists, bool_props_list_item_size) # Output the main UCD tables. f.write("""\ /* These are the main two-stage UCD tables. The fields in each record are: script (8 bits), character type (8 bits), grapheme break property (8 bits), offset to multichar other cases or zero (8 bits), offset to other case or zero (32 bits, signed), bidi class (5 bits) and script extension (11 bits) packed into a 16-bit field, and offset in binary properties table (16 bits). 
*/ \n""") write_records(records, record_size) write_table(min_stage1, 'PRIV(ucd_stage1)') write_table(min_stage2, 'PRIV(ucd_stage2)', min_block_size) f.write("#if UCD_BLOCK_SIZE != %d\n" % min_block_size) f.write("""\ #error Please correct UCD_BLOCK_SIZE in pcre2_internal.h #endif #endif /* SUPPORT_UNICODE */ #endif /* PCRE2_PCRE2TEST */ /* End of pcre2_ucd.c */ """) f.close() # End ================================================ FILE: maint/GenerateUcpHeader.py ================================================ #! /usr/bin/env python3 # PCRE2 UNICODE PROPERTY SUPPORT # ------------------------------ # This script generates the pcre2_ucp.h file from Unicode data files. This # header uses enumerations to give names to Unicode property types and script # names. # This script was created in December 2021 as part of the Unicode data # generation refactoring. # Import common data lists and functions from GenerateCommon import \ bidi_classes, \ bool_properties, \ bool_props_list_item_size, \ break_properties, \ category_names, \ general_category_names, \ script_list_item_size, \ script_names, \ open_output # Open the output file (no return on failure). This call also writes standard # header boilerplate. f = open_output("pcre2_ucp.h") # Output this file's heading text f.write("""\ #ifndef PCRE2_UCP_H_IDEMPOTENT_GUARD #define PCRE2_UCP_H_IDEMPOTENT_GUARD /* This file contains definitions of the Unicode property values that are returned by the UCD access macros and used throughout PCRE2. IMPORTANT: The specific values of the first two enums (general and particular character categories) are assumed by the table called catposstab in the file pcre2_auto_possess.c. They are unlikely to change, but should be checked after an update. */ \n""") f.write("/* These are the general character categories. */\n\nenum {\n") for i in general_category_names: f.write(" ucp_%s,\n" % i) f.write("};\n\n") f.write("/* These are the particular character categories. 
*/\n\nenum {\n") for i in range(0, len(category_names), 2): f.write(" ucp_%s, /* %s */\n" % (category_names[i], category_names[i+1])) f.write("};\n\n") f.write("/* These are Boolean properties. */\n\nenum {\n") for i in bool_properties: f.write(" ucp_%s,\n" % i) f.write(" /* This must be last */\n") f.write(" ucp_Bprop_Count\n};\n\n") f.write("/* Size of entries in ucd_boolprop_sets[] */\n\n") f.write("#define ucd_boolprop_sets_item_size %d\n\n" % bool_props_list_item_size) f.write("/* These are the bidi class values. */\n\nenum {\n") for i in range(0, len(bidi_classes), 2): sp = ' ' * (4 - len(bidi_classes[i])) f.write(" ucp_bidi%s,%s /* %s */\n" % (bidi_classes[i], sp, bidi_classes[i+1])) f.write("};\n\n") f.write("/* These are grapheme break properties. The Extended Pictographic " "property\ncomes from the emoji-data.txt file. */\n\nenum {\n") for i in range(0, len(break_properties), 2): sp = ' ' * (21 - len(break_properties[i])) f.write(" ucp_gb%s,%s /* %s */\n" % (break_properties[i], sp, break_properties[i+1])) f.write("};\n\n") f.write("/* These are the script identifications. */\n\nenum {\n /* Scripts which has characters in other scripts. */\n") for i in script_names: if i == "Unknown": f.write("\n /* Scripts which has no characters in other scripts. */\n") f.write(" ucp_%s,\n" % i) f.write("\n") f.write(" /* This must be last */\n") f.write(" ucp_Script_Count\n};\n\n") f.write("/* Size of entries in ucd_script_sets[] */\n\n") f.write("#define ucd_script_sets_item_size %d\n\n" % script_list_item_size) f.write("#endif /* PCRE2_UCP_H_IDEMPOTENT_GUARD */\n\n") f.write("/* End of pcre2_ucp.h */\n") f.close() # End ================================================ FILE: maint/GenerateUcpTables.py ================================================ #! /usr/bin/env python3 # PCRE2 UNICODE PROPERTY SUPPORT # ------------------------------ # This script generates the pcre2_ucptables_inc.h file, which contains tables for # recognizing Unicode property names. 
It is #included by pcre2_tables.c. In # order to reduce the number of relocations when loading the PCRE2 library, the # names are held as a single large string, with offsets in the table. This is # tedious to maintain by hand. Therefore, a script is used to generate the # table. # This script was created in December 2021 based on the previous GenerateUtt # script, whose output had to be manually edited into pcre2_tables.c. Here is # the history of the original script: # ----------------------------------------------------------------------------- # Modified by PH 17-March-2009 to generate the more verbose form that works # for UTF-support in EBCDIC as well as ASCII environments. # Modified by PH 01-March-2010 to add new scripts for Unicode 5.2.0. # Modified by PH 04-May-2010 to add new "X.." special categories. # Modified by PH 30-April-2011 to add new scripts for Unicode 6.0.0 # Modified by ChPe 30-September-2012 to add this note; no other changes were # necessary for Unicode 6.2.0 support. # Modified by PH 26-February-2013 to add the Xuc special category. # Comment modified by PH 13-May-2014 to update to PCRE2 file names. # Script updated to Python 3 by running it through the 2to3 converter. # Added script names for Unicode 7.0.0, 20-June-2014. # Added script names for Unicode 8.0.0, 19-June-2015. # Added script names for Unicode 10.0.0, 02-July-2017. # Added script names for Unicode 11.0.0, 03-July-2018. # Added 'Unknown' script, 01-October-2018. # Added script names for Unicode 12.1.0, 27-July-2019. # Added script names for Unicode 13.0.0, 10-March-2020. # Added Script names for Unicode 14.0.0, PCRE2-10.39 # Added support for bidi class and bidi control, 06-December-2021 # This also involved lower casing strings and removing underscores, in # accordance with Unicode's "loose matching" rules, which Perl observes.
# Changed default script type from PT_SC to PT_SCX, 18-December-2021
# -----------------------------------------------------------------------------
#
# Note subsequent changes here:
#
# 27-December-2021: Added support for 4-letter script abbreviations.
# 10-January-2022: Further updates for Boolean property support
# -----------------------------------------------------------------------------


# Import common data lists and functions

from GenerateCommon import (
  abbreviations,
  bool_properties,
  bidi_classes,
  category_names,
  general_category_names,
  script_names,
  open_output,
)

# Open the output file (no return on failure). This call also writes standard
# header boilerplate.

f = open_output("pcre2_ucptables_inc.h")

# The list in bidi_classes contains just the Unicode classes such as AN, LRE,
# etc., along with comments. We need to add "bidi" in front of each value, in
# order to create names that don't clash with other types of property. The
# class names sit at the even positions; the odd positions are the comments.

bidi_class_names = ["bidi" + cls for cls in bidi_classes[::2]]

# Remove the comments from other lists that contain them.

category_names = category_names[::2]


def stdname(x):
  """Return the standardized form of a name: lower case, no underscores."""
  lowered = x.lower()
  return lowered.replace('_', '')


def stdnames(x):
  """Return a new list holding stdname() of every item of x, in order."""
  return [stdname(item) for item in x]


std_category_names = stdnames(category_names)
std_general_category_names = stdnames(general_category_names)
std_bidi_class_names = stdnames(bidi_class_names)
std_bool_properties = stdnames(bool_properties)

# Create the table, starting with the Unicode script, category and bidi class
# names. We keep both the standardized name and the original, because the
# latter is used for the ucp_xx names. NOTE: for the script abbreviations, we
# still use the full original names.
utt_table = [] scx_end = script_names.index('Unknown') for idx, name in enumerate(script_names): pt_type = 'PT_SCX' if idx < scx_end else 'PT_SC' utt_table.append((stdname(name), name, pt_type)) for abbrev in abbreviations[name]: utt_table.append((stdname(abbrev), name, pt_type)) # Add the remaining property lists utt_table += list(zip(std_category_names, category_names, ['PT_PC'] * len(category_names))) utt_table += list(zip(std_general_category_names, general_category_names, ['PT_GC'] * len(general_category_names))) utt_table += list(zip(std_bidi_class_names, bidi_class_names, ['PT_BIDICL'] * len(bidi_class_names))) for name in bool_properties: utt_table.append((stdname(name), name, 'PT_BOOL')) if name in abbreviations: for abbrev in abbreviations[name]: utt_table.append((stdname(abbrev), name, 'PT_BOOL')) # Now add specials and synonyms. Note both the standardized and capitalized # forms are needed. utt_table.append(('any', 'Any', 'PT_ANY')) utt_table.append(('l&', 'L&', 'PT_LAMP')) utt_table.append(('lc', 'LC', 'PT_LAMP')) utt_table.append(('xan', 'Xan', 'PT_ALNUM')) utt_table.append(('xps', 'Xps', 'PT_PXSPACE')) utt_table.append(('xsp', 'Xsp', 'PT_SPACE')) utt_table.append(('xuc', 'Xuc', 'PT_UCNC')) utt_table.append(('xwd', 'Xwd', 'PT_WORD')) # Remove duplicates from the table and then sort it. utt_table = list(set(utt_table)) utt_table.sort() # Output file-specific heading f.write("""\ #ifdef SUPPORT_UNICODE /* The PRIV(utt)[] table below translates Unicode property names into type and code values. It is searched by binary chop, so must be in collating sequence of name. Originally, the table contained pointers to the name strings in the first field of each entry. However, that leads to a large number of relocations when a shared library is dynamically loaded. A significant reduction is made by putting all the names into a single, large string and using offsets instead. 
All letters are lower cased, and underscores are removed, in accordance with the "loose matching" rules that Unicode advises and Perl uses. */ \n""") # We have to use STR_ macros to define the strings so that it all works in # UTF-8 mode on EBCDIC platforms. for utt in utt_table: f.write('#define STRING_%s0' % (utt[0].replace('&', '_AMPERSAND'))) for c in utt[0]: if c == '&': f.write(' STR_AMPERSAND') else: f.write(' STR_%s' % c); f.write(' "\\0"\n') # Output the long string of concatenated names f.write('\nconst char PRIV(utt_names)[] =\n') last = '' for utt in utt_table: if utt == utt_table[-1]: last = ';' f.write(' STRING_%s0%s\n' % (utt[0].replace('&', '_AMPERSAND'), last)) # Output the property type table f.write('\nconst ucp_type_table PRIV(utt)[] = {\n') offset = 0 last = ',' for utt in utt_table: if utt[2] in ('PT_ANY', 'PT_LAMP', 'PT_ALNUM', 'PT_PXSPACE', 'PT_SPACE', 'PT_UCNC', 'PT_WORD'): value = '0' else: value = 'ucp_' + utt[1] if utt == utt_table[-1]: last = '' f.write(' { %3d, %s, %s }%s\n' % (offset, utt[2], value, last)) offset += len(utt[0]) + 1 f.write('};\n\n') # Ending text f.write("""\ const size_t PRIV(utt_size) = sizeof(PRIV(utt)) / sizeof(ucp_type_table); #endif /* SUPPORT_UNICODE */ /* End of pcre2_ucptables_inc.h */ """) f.close # End ================================================ FILE: maint/LintMan ================================================ #!/usr/bin/perl use warnings; use strict; use Getopt::Long; use vars qw /$opt_verbose/; # A script to scan PCRE2's man pages to check for values that might need to # be updated to match the code. # # It updates numerical values after \" DEFINE or errors if name is # not found. 
my $file;
my %defs;

# Collect every "#define NAME <number>" found in the generic config header.
foreach $file ("../src/config.h.generic") {
  open(my $include, "<", $file) or die "Failed to open include $file\n";
  # The loop must read from the file handle; a bare "while ()" never reads a
  # line and never terminates.
  while (<$include>) {
    next unless /^#define ([[:upper:]_\d]+)\s+(\d+)/a;
    $defs{$1} = $2;
  }
  close($include);
}

GetOptions("verbose");

# Each remaining argument is a man page, which is updated in place.
while (scalar(@ARGV) > 0) {
  $file = shift @ARGV;
  open my $fh, "+<", $file or die "Failed to open $file\n";
  my @lines = <$fh>;
  my $updated = 0;
  foreach my $index (0 .. $#lines) {
    # A line of the form  .\" DEFINE NAME  announces that the next line
    # starts with the value of NAME.
    if ($lines[$index] =~ /^\.\\"\sDEFINE\s([[:upper:]_\d]+)$/a) {
      # $l is both the index of the following line and the 1-based number of
      # the DEFINE line used in the diagnostics.
      my $l = $index + 1;
      die "Invalid DEFINE line $l of $file\n" unless defined $lines[$l];
      my $key = $1;
      die "Bad DEFINE key $key line $l of $file\n" unless exists $defs{$key};
      my $value = $defs{$key};
      if ($lines[$l] !~ /^$value\b/) {
        # Count and report a change only when a leading number really was
        # replaced. Previously "Updated" was printed regardless, and a failed
        # substitution added an empty string to $updated.
        if ($lines[$l] =~ s/^\d+/$value/a) {
          $updated++;
          print "Updated $key in $file to $value\n" if $opt_verbose;
        }
      }
    }
  }
  # Rewrite the file only if something changed.
  if ($updated > 0) {
    seek($fh, 0, 0);
    print $fh @lines;
    truncate($fh, tell($fh));
  }
  close($fh);
}

================================================
FILE: maint/ManyConfigTests
================================================
#! /bin/sh

# This is a script for the use of PCRE2 maintainers. It configures and builds
# PCRE2 with a variety of configuration options, and in each case runs the
# tests to ensure that all goes well. Every possible combination would take far
# too long, so we use a representative sample. This script should be run in the
# PCRE2 source directory.

# While debugging, it is sometimes useful to be able to cut out some of the
# tests, in order to run those that are giving errors.
# The following options do this:
#
#   -noasan          skip the test that uses -fsanitize=address
#   -nousan          skip the test that uses -fsanitize=undefined
#   -nodebug         skip the test that uses --enable-debug
#   -nojit           skip all JIT tests
#   -nojitmain       skip non-valgrind JIT tests
#   -nojitvalgrind   skip JIT tests with valgrind
#   -nomain          skip all the main (non-JIT) set of tests
#   -nomainvalgrind  skip the main (non-JIT) valgrind tests
#   -notmp           skip the tests in a temporary directory
#   -notmpjit        skip the JIT test in a temporary directory
#   -noebcdic        skip the EBCDIC tests
#   -noebcdicjit     skip the EBCDIC test that uses --enable-jit
#   -novalgrind      skip all the valgrind tests

# Alternatively, if any of those names are given with '+' instead of '-no',
# only those groups named with '+' are run (e.g. +jit). If -dummy is given,
# no tests are actually run - this provides a means of testing the selectors.

# The -v option causes a call to 'pcre2test -C' to happen for each
# configuration.

useasan=1
useusan=1
usedebug=1
usejit=1
usejitvalgrind=1
usemain=1
usemainvalgrind=1
usetmp=1
usetmpjit=1
useebcdic=1
useebcdicjit=1
usevalgrind=1

dummy=0
seenplus=0
verbose=0

while [ $# -gt 0 ] ; do
  # The first '+' option switches every group off, so that only the groups
  # named with '+' end up selected.
  case $1 in
    +*) if [ $seenplus -eq 0 ]; then
          useasan=0
          useusan=0
          usedebug=0
          usejit=0
          usejitvalgrind=0
          usemain=0
          usemainvalgrind=0
          usetmp=0
          usetmpjit=0
          useebcdic=0
          useebcdicjit=0
          usevalgrind=0
          seenplus=1
        fi;;
  esac

  case $1 in
    -dummy)          dummy=1;;
    -v)              verbose=1;;
    -noasan)         useasan=0;;
    -nousan)         useusan=0;;
    -nodebug)        usedebug=0;;
    -nojit)          usejit=0; usejitvalgrind=0; usetmpjit=0; useebcdicjit=0;;
    -nojitmain)      usejit=0;;
    -nojitvalgrind)  usejitvalgrind=0;;
    -nomain)         usemain=0; usemainvalgrind=0;;
    -nomainvalgrind) usemainvalgrind=0;;
    -notmp)          usetmp=0; usetmpjit=0;;
    -notmpjit)       usetmpjit=0;;
    -noebcdic)       useebcdic=0; useebcdicjit=0;;
    -noebcdicjit)    useebcdicjit=0;;
    -novalgrind)     usevalgrind=0;;
    +asan)           useasan=1;;
    +usan)           useusan=1;;
    +debug)          usedebug=1;;
    +jit)            usejit=1; usejitvalgrind=1; usetmpjit=1; useebcdicjit=1;;
    +jitmain)        usejit=1;;
    # The valgrind section only runs when usevalgrind is set, and the first
    # '+' option clears it. So the two valgrind subgroup selectors have to set
    # it again; without that they selected no tests at all.
    +jitvalgrind)    usejitvalgrind=1; usevalgrind=1;;
    +main)           usemain=1; usemainvalgrind=1;;
    +mainvalgrind)   usemainvalgrind=1; usevalgrind=1;;
    +tmp)            usetmp=1;;
    +tmpjit)         usetmpjit=1;;
    +ebcdic)         useebcdic=1;;
    +ebcdicjit)      useebcdicjit=1;;
    +valgrind)       usevalgrind=1; usejitvalgrind=1; usemainvalgrind=1;;
    *)               echo "Unknown option '$1'"; exit 1;;
  esac
  shift
done

# With both valgrind subgroups off there is nothing for valgrind to run.
if [ $usejitvalgrind -eq 0 -a $usemainvalgrind -eq 0 ] ; then
  usevalgrind=0
fi

# This is in case the caller has set aliases (as I do - PH)

unset cp ls mv rm

# This is a temporary directory for testing out-of-line builds

tmp=/tmp/pcre2testing

# Don't bother with compiler optimization for most tests; it just slows down
# compilation a lot (and running the tests themselves is quick). However, one
# special test turns optimization on, because it can provoke some compiler
# warnings.

CFLAGS="-g"
OFLAGS="-O0"
CC="${CC:=cc}"
ISGCC=0

# If the compiler is GCC, add a lot of warning switches. The compiler is asked
# to preprocess a snippet that yields "GCC=yes" only when __GNUC__ is defined
# and __clang__ is not.

CC_VER_OUTPUT=`printf '#if defined(__GNUC__) && !defined(__clang__)\nGCC=yes\n#endif\n' | $CC -E -`
if [ $? -eq 0 ] && (echo "$CC_VER_OUTPUT" | grep GCC=yes) >/dev/null; then
  echo "Treating $CC as GCC"
  ISGCC=1
  CFLAGS="$CFLAGS -Wall"
  CFLAGS="$CFLAGS -Wextra"
  CFLAGS="$CFLAGS -Wno-overlength-strings"
  CFLAGS="$CFLAGS -Wpointer-arith"
  CFLAGS="$CFLAGS -Wwrite-strings"
  CFLAGS="$CFLAGS -Wundef"
  CFLAGS="$CFLAGS -Wshadow"
  CFLAGS="$CFLAGS -Wmissing-field-initializers"
  CFLAGS="$CFLAGS -Wunused-parameter"
  CFLAGS="$CFLAGS -Wformat"
  CFLAGS="$CFLAGS -Wbad-function-cast"
  CFLAGS="$CFLAGS -Wmissing-declarations"
  CFLAGS="$CFLAGS -Wnested-externs"
  CFLAGS="$CFLAGS -pedantic"
  CFLAGS="$CFLAGS -Wuninitialized"
  CFLAGS="$CFLAGS -Wmaybe-uninitialized"
  CFLAGS="$CFLAGS -Wmissing-prototypes"
  CFLAGS="$CFLAGS -Wstrict-prototypes"
  CFLAGS="$CFLAGS -Warray-bounds"
  CFLAGS="$CFLAGS -Wformat-overflow=2"
  CFLAGS="$CFLAGS -Wformat-truncation=1"
  CFLAGS="$CFLAGS -Wdeclaration-after-statement"
fi

# This function runs a single test with the set of configuration options that
# are in $opts. The source directory must be set in srcdir.
# The function must be defined as "runtest()" not "function runtest()" in
# order to run on Solaris.
#
# Besides $opts and $srcdir, the function reads $dummy, $verbose, $valgrind,
# $jrvalgrind, $malloc, $withvalgrind, $withmalloc, $withebcdic, $testtotal,
# $CC and $CFLAGS, and it increments $testcount. Any failure ends the whole
# script with exit status 1.

runtest()
  {
  rm -f $srcdir/pcre2test $srcdir/pcre2grep $srcdir/pcre2_jit_test $srcdir/pcre2posix_test
  testcount=`expr $testcount + 1`

  if [ "$opts" = "" ] ; then
    echo "[$testcount/$testtotal] Configuring with: default settings"
  else
    echo "[$testcount/$testtotal] Configuring with:"
    echo " $opts"
  fi

  # With -dummy only the selection and the counter are exercised.
  if [ $dummy -eq 1 ]; then return; fi

  CC="$CC" CFLAGS="$CFLAGS" \
    $srcdir/configure $opts >/dev/null 2>teststderrM

  if [ $? -ne 0 ]; then
    echo " "
    echo "******** Error while configuring ********"
    cat teststderrM
    exit 1
  fi

# There is an infelicity in the Autotools world (as of October 2015) which
# causes the message
#
# ar: `u' modifier ignored since `D' is the default (see `U')
#
# to be output while linking. This triggers an unwanted error report from this
# script, because it expects no stderr output while making. To get round this
# we filter the stderr output through sed, removing all occurrences of the
# above lines. Just for paranoia, check that sed is available before doing
# this.

  echo "Making"
  make -j >/dev/null 2>teststderrM
  makeRC=$?
  if command -v sed >/dev/null 2>&1 ; then
    sed "/\`u' modifier ignored since \`D' is the default/ d" \
      teststderrM > teststderrMM
    mv -f teststderrMM teststderrM
  fi
  if [ $makeRC -ne 0 -o -s teststderrM ]; then
    echo " "
    echo "******** Errors or warnings while making ********"
    echo " "
    cat teststderrM
    exit 1
  fi

  if [ $verbose -eq 1 ]; then
    ./pcre2test -C
  fi

  # The exit status of each query is kept; zero is treated below as "not
  # available".
  ./pcre2test -C jit >/dev/null
  jit=$?
  ./pcre2test -C pcre2-8 >/dev/null
  pcre2_8=$?

  echo "Running PCRE2 library tests $withvalgrind $withmalloc"
  $srcdir/RunTest $valgrind $malloc >teststdoutM 2>teststderrM

  if [ $? -ne 0 -o -s teststderrM ]; then
    echo " "
    echo "**** Test failed ****"
    if [ -s teststderrM ] ; then
      cat teststderrM
    else
      cat teststdoutM
    fi
    exit 1
  fi

  if [ $pcre2_8 -eq 0 ]; then
    echo "Skipping pcre2grep and pcre2posix tests: 8-bit library not compiled"
  elif [ "x$withebcdic" != x ]; then
    echo "Skipping pcre2grep and pcre2posix tests: tests not supported on EBCDIC"
  else
    echo "Running pcre2grep tests $withvalgrind"
    $srcdir/RunGrepTest $valgrind >teststdoutM 2>teststderrM
    if [ $? -ne 0 -o -s teststderrM ]; then
      echo " "
      echo "**** Test failed ****"
      cat teststderrM
      cat teststdoutM
      exit 1
    fi
    echo "Running pcre2posix test $withvalgrind"
    $valgrind ./pcre2posix_test >teststdoutM 2>teststderrM
    if [ $? -ne 0 ]; then
      echo " "
      echo "**** Test failed ****"
      # Show what the test produced, as the other failure branches do.
      cat teststderrM
      cat teststdoutM
      exit 1
    fi
  fi

  if [ "$jit" -eq 0 ]; then
    echo "Skipping JIT regression tests: JIT is not enabled"
  elif [ "x$withebcdic" != x ]; then
    echo "Skipping JIT regression tests: tests not supported on EBCDIC"
  else
    echo "Running JIT regression tests $withvalgrind"
    $jrvalgrind ./pcre2_jit_test >teststdoutM 2>teststderrM
    if [ $? -ne 0 -o -s teststderrM ]; then
      echo " "
      echo "**** Test failed ****"
      cat teststderrM
      cat teststdoutM
      exit 1
    fi
  fi
  }

# Update the total count whenever a new test is added; it is used to show
# progress as each test is run. The valgrind tests run only when $usevalgrind
# is set, so they are counted only in that case; otherwise the total is too
# large after -novalgrind, +main or +jit.

testtotal=`expr 17 \* $usemain + \
  1 \* $usemain \* $usedebug + \
  1 \* $usetmp + 1 \* $usetmpjit + \
  1 \* $ISGCC \* $usemain + \
  1 \* $ISGCC \* $usemain \* $useasan + \
  1 \* $ISGCC \* $usemain \* $useusan + \
  13 \* $usejit + \
  2 \* $useebcdic + \
  1 \* $useebcdicjit + \
  2 \* $usevalgrind \* $usemainvalgrind + \
  2 \* $usevalgrind \* $usejitvalgrind`

testcount=0

if [ $testtotal -eq 0 ] ; then
  echo "** No tests selected"
  exit 1
fi

valgrind=
jrvalgrind=
withvalgrind=
malloc=
withmalloc=
srcdir=.
export srcdir

if [ $usejit -ne 0 ]; then
  enable_jit=--enable-jit
else
  enable_jit=
fi

# If gcc is in use, run a maximally configured test with -O2, because that can
# throw up warnings that are not detected with -O0.
# Then run a second test with -fsanitize=address, which also may throw up new
# warnings as well as checking things at runtime. Finally, run another test
# using -fsanitize=undefined -std=gnu99 to check for runtime actions that are
# not well defined.

if [ $ISGCC -ne 0 ] && [ $usemain -ne 0 ]; then
  # All three builds use the same configure options; only CFLAGS changes.
  opts="--disable-shared $enable_jit --enable-pcre2-16 --enable-pcre2-32"
  SAVECFLAGS="$CFLAGS"

  echo "---------- Maximally configured test with -O2 ----------"
  CFLAGS="-O2 $SAVECFLAGS"
  echo "CFLAGS=$CFLAGS"
  runtest

  if [ $useasan -ne 0 ]; then
    echo "---------- Maximally configured test with -fsanitize=address ----------"
    # Following a kernel change, sanitize address doesn't work unless the extra
    # PIE options are also set.
    CFLAGS="$OFLAGS $SAVECFLAGS -no-pie -fno-PIE -fsanitize=address"
    echo "CFLAGS=$CFLAGS"
    runtest
  fi

  # This also seems to be the case for sanitize undefined.
  if [ $useusan -ne 0 ]; then
    echo "------- Maximally configured test with -fsanitize=undefined -fno-sanitize=alignment -std=gnu99 -------"
    CFLAGS="$OFLAGS $SAVECFLAGS -no-pie -fno-PIE -fsanitize=undefined -fno-sanitize=alignment -std=gnu99"
    echo "CFLAGS=$CFLAGS"
    runtest
  fi

  CFLAGS="$SAVECFLAGS"
fi

# This set of tests builds PCRE2 and runs the tests with a variety of configure
# options, in the current (source) directory. The empty configuration builds
# with all the default settings. As well as testing that these options work, we
# use --disable-shared or --disable-static except for the default test (which
# builds both) to save a bit of time by building only one version of the
# library for the subsequent tests.
echo "---------- CFLAGS for the remaining tests ----------" CFLAGS="$OFLAGS $CFLAGS" echo "CFLAGS=$CFLAGS" if [ $usemain -ne 0 ]; then if [ $usedebug -ne 0 ]; then echo "---------- Maximally configured test with --enable-debug ----------" opts="--disable-shared $enable_jit --enable-pcre2-16 --enable-pcre2-32 --enable-debug" runtest fi echo "---------- Non-JIT tests in the current directory ----------" for opts in \ "" \ "--disable-static" \ "--disable-shared" \ "--disable-unicode --disable-shared --enable-never-backslash-C" \ "--with-link-size=3 --disable-shared --disable-pcre2grep-callout" \ "--disable-unicode --enable-rebuild-chartables --disable-shared" \ "--disable-unicode --enable-newline-is-any --disable-shared" \ "--disable-unicode --enable-newline-is-cr --disable-shared" \ "--disable-unicode --enable-newline-is-crlf --disable-shared" \ "--disable-unicode --enable-newline-is-anycrlf --enable-bsr-anycrlf --disable-shared" \ "--enable-newline-is-any --disable-static" \ "--disable-unicode --enable-pcre2-16 --enable-debug" \ "--enable-pcre2-16 --disable-shared" \ "--disable-unicode --enable-pcre2-32" \ "--enable-pcre2-32 --disable-shared" \ "--disable-unicode --enable-pcre2-32 --enable-pcre2-16 --disable-shared" \ "--disable-unicode --enable-pcre2-32 --enable-pcre2-16 --disable-pcre2-8 --disable-shared" do runtest done fi # Now run the JIT tests unless disabled if [ $usejit -ne 0 ]; then echo "---------- JIT tests in the current directory ----------" for opts in \ "--disable-unicode --enable-jit --disable-shared" \ "--enable-jit --disable-shared" \ "--enable-jit --with-link-size=3 --disable-shared" \ "--enable-jit --enable-pcre2-16 --disable-shared" \ "--disable-unicode --enable-jit --enable-pcre2-16 --disable-pcre2-8 --disable-shared" \ "--enable-jit --enable-pcre2-16 --disable-pcre2-8 --disable-shared" \ "--enable-jit --enable-pcre2-16 --with-link-size=3 --disable-shared" \ "--enable-jit --enable-pcre2-16 --with-link-size=4 --disable-shared" \ "--enable-jit 
--enable-pcre2-32 --disable-shared" \ "--disable-unicode --enable-jit --enable-pcre2-32 --disable-pcre2-8 --disable-shared" \ "--enable-jit --enable-pcre2-32 --disable-pcre2-8 --disable-shared" \ "--enable-jit --enable-pcre2-32 --with-link-size=4 --disable-shared" \ "--enable-jit --enable-pcre2-32 --enable-pcre2-16 --disable-pcre2-8 --enable-newline-is-anycrlf --enable-bsr-anycrlf --disable-shared" do runtest done fi # Now run some tests with EBCDIC enabled if [ $useebcdic -ne 0 -o $useebcdicjit -ne 0 ]; then echo "---------- EBCDIC tests in the current directory ----------" withebcdic="with EBCDIC" if [ $useebcdic -ne 0 ]; then for opts in \ "--disable-unicode --enable-ebcdic --enable-ebcdic-ignoring-compiler" \ "--disable-unicode --enable-ebcdic --enable-ebcdic-ignoring-compiler --enable-ebcdic-nl25" do runtest done fi if [ $useebcdicjit -ne 0 ]; then for opts in \ "--disable-unicode --enable-jit --enable-ebcdic --enable-ebcdic-ignoring-compiler" do runtest done fi fi withebcdic= # Now re-run some of the tests under valgrind. if [ $usevalgrind -ne 0 ]; then echo "---------- Tests in the current directory using valgrind ----------" valgrind=valgrind withvalgrind="with valgrind" malloc=-malloc withmalloc="with -malloc" if [ $usemainvalgrind -ne 0 ]; then for opts in \ "--disable-shared" \ "--with-link-size=3 --enable-pcre2-16 --enable-pcre2-32 --disable-shared" do opts="--enable-valgrind $opts" runtest # Only need to run the first test with -malloc. malloc= withmalloc= done fi malloc=-malloc withmalloc="with -malloc" if [ $usejitvalgrind -ne 0 ]; then jrvalgrind="valgrind --tool=memcheck -q --smc-check=all-non-file --suppressions=$srcdir/testdata/valgrind-jit.supp" for opts in \ "--enable-jit --disable-shared" \ "--enable-jit --enable-pcre2-16 --enable-pcre2-32" do opts="--enable-valgrind $opts" runtest # Only need to run the first test with -malloc. 
malloc= withmalloc= done fi fi valgrind= jrvalgrind= withvalgrind= malloc= withmalloc= # Clean up the distribution and then do at least one build and test in a # directory other than the source directory. It doesn't work unless the # source directory is cleaned up first. if [ -f Makefile ]; then echo "Running 'make distclean'" make distclean >/dev/null 2>&1 if [ $? -ne 0 ]; then echo "** 'make distclean' failed" exit 1 fi fi echo "---------- End of tests in the source directory ----------" echo "Removing teststdoutM and teststderrM" rm -rf teststdoutM teststderrM if [ $usetmp -ne 0 -o $usetmpjit -ne 0 ]; then srcdir=`pwd` export srcdir if [ ! -e $tmp ]; then mkdir $tmp fi if [ ! -d $tmp ]; then echo "** Failed to create $tmp or it is not a directory" exit 1 fi cd $tmp if [ $? -ne 0 ]; then echo "** Failed to cd to $tmp" exit 1 fi if [ $usetmp -ne 0 ]; then echo "---------- Tests in the $tmp directory ----------" for opts in \ "--disable-shared" do runtest done fi if [ $usetmpjit -ne 0 ]; then echo "---------- JIT tests in the $tmp directory ----------" for opts in \ "--enable-jit --disable-shared" do runtest done fi echo "Removing $tmp" rm -rf $tmp fi echo "---------- All done ----------" # End ================================================ FILE: maint/README ================================================ MAINTENANCE README FOR PCRE2 ============================ The files in the "maint" directory of the PCRE2 source contain data, scripts, and programs that are used for the maintenance of PCRE2, but which do not form part of the PCRE2 distribution tarballs. This document describes these files and also contains some notes for maintainers. Its contents are: Files in the maint directory Updating to a new Unicode release Preparing for a PCRE2 release Updating version info for libtool Long-term ideas (wish list) For a description of the way PCRE2 works, see the file called HACKING in the top directory. 
Files in the maint directory ============================ 132html A Perl script to convert man pages to HTML (.1 and .3 files "two" HTML), used by UpdateAlways. CheckMan A Perl script to validate the syntax in PCRE2 man pages, used by UpdateAlways. CleanTxt A Perl script to clean up the nroff output in PCRE2 man pages, used by UpdateAlways. Detrail A Perl script to remove trailing whitespace from PCRE2 files, used by UpdateAlways. GenerateCommon.py A Python module containing data and functions that are used by the other Generate scripts. GenerateTest.py A Python script that generates input and expected output test data for tests 26 or 27, which tests certain aspects of Unicode property support. GenerateUcd.py A Python script that generates the file pcre2_ucd.c from GenerateCommon.py and Unicode data files, which are themselves downloaded from the Unicode web site. The generated file contains the tables for a 2-stage lookup of Unicode properties, along with some auxiliary tables. The script starts with a long comment that gives details of the tables it constructs. GenerateUcpHeader.py A Python script that generates the file pcre2_ucp.h from GenerateCommon.py and Unicode data files. The generated file defines constants for various Unicode property values. GenerateUcpTables.py A Python script that generates the file pcre2_ucptables_inc.h from GenerateCommon.py and Unicode data files. The generated file contains tables for looking up Unicode property names. FetchUcd.sh A shell script to download the UCD data from the Unicode website into the Unicode.tables directory. FilterCoverage.py A small helper used by the RunCoverage script. LintMan A Perl script to check and update magic numbers in the documentation that correspond to configurable settings in the codebase. manifest-* Data files used to verify the contents of the distribution tarball and `make install` file lists. 
ManyConfigTests A shell script that runs "configure, make, test" a number of times with different configuration settings. UpdateAlways A shell script to ensure that all auto-generated outputs are ready for release. It should be run often (by CI on each commit) to ensure that the repository is in a clean and consistent state. UpdateDates.py UpdateRelease.py Python scripts to be run less frequently than UpdateAlways. These should only be needed immediately before a release, when finalising the repository. UpdateDates.py checks in the last-updated times on documentation pages. UpdateRelease.py is needed after any change to the version number in configure.ac. pcre2_chartables.c.non-standard This is a set of character tables that came from a Windows system. It has characters greater than 128 that are set as spaces, amongst other things. I kept it so that it can be used for testing from time to time. README This file. RunCoverage A script used to generate the coverage report using Clang. It is called by the GitHub CI actions, and can also be run by a developer locally. RunManifestTest RunManifestTest.ps1 Scripts to generate and verify a list of files against an expected 'manifest' detailing what the directory should contain. Unicode.tables The files in this directory were downloaded from the Unicode web site. They contain information about Unicode characters and scripts, and are used by the Generate scripts. There is also UnicodeData.txt, which is no longer used by any script but is kept because it is occasionally useful for manually looking up the details of certain characters. However, note that character names in this file such as "Arabic sign sanah" do NOT mean that the character is in a particular script (in this case, Arabic). Scripts.txt and ScriptExtensions.txt are where to look for script information. ucptest.c A program for testing the Unicode property macros that do lookups in the pcre2_ucd.c data, mainly useful after rebuilding the Unicode property tables.
Compile and run this in the "maint" directory (see comments at its head). This program can also be used to find characters with specific properties and to list which properties are supported. ucptestdata A directory containing four files, testinput{1,2} and testoutput{1,2}, for use in conjunction with the ucptest program. utf8.c A short, freestanding C program for converting a Unicode code point into a sequence of bytes in the UTF-8 encoding, and vice versa. If its argument is a hex number such as 0x1234, it outputs a list of the equivalent UTF-8 bytes. If its argument is a sequence of concatenated UTF-8 bytes (e.g. 12e188b4) it treats them as a UTF-8 string and outputs the equivalent code points in hex. See comments at its head for details. Updating to a new Unicode release ================================= When there is a new release of Unicode, the files in Unicode.tables must be refreshed from the Unicode web site, which can be done with the script FetchUcd.sh. Once that is done, the four Python scripts that generate files from the Unicode data can be run from within the "maint" directory. Note that the format used for those files is not stable, and therefore changes to the scripts might be needed to support new versions. Note: Previously, it was necessary to update lists of scripts and their abbreviations by hand before running the Python scripts. This is no longer necessary because the scripts have been upgraded to extract this information themselves. Also, there used to be explicit lists of scripts in two of the man pages. This is no longer the case; the pcre2test program can now output a list of supported scripts, and the command to do so is part of the documentation. 
You can give an output file name as an argument to the following scripts, but by default: GenerateUcd.py creates pcre2_ucd.c ) GenerateUcpHeader.py creates pcre2_ucp.h ) in the current directory GenerateUcpTables.py creates pcre2_ucptables_inc.h ) These files can be compared against the existing versions in the src directory to check on any changes before replacing the old files, but you can also generate directly into the final location by running: ./GenerateUcd.py ../src/pcre2_ucd.c ./GenerateUcpHeader.py ../src/pcre2_ucp.h ./GenerateUcpTables.py ../src/pcre2_ucptables_inc.h Once the .c and .h files are in the ../src directory, the ucptest program can be compiled and used to check that the new tables work properly. The data files in ucptestdata are set up to check a number of test characters. See the comments at the start of ucptest.c. Depending on the type of changes, adding tests for new scripts, properties or characters to the files in ucptestdata is recommended. Make sure to regenerate and validate the output files afterwards. Finally, you should run the GenerateTest.py script to regenerate new versions of the input and expected output from a series of Unicode property tests that are automatically generated from the Unicode data files. By default, the files are written to testinput and testoutput in the current directory, but they should be moved to replace the files inside the main testdata directory that are being used for tests 27 or 26.
In summary: ``` ./GenerateUcd.py ../src/pcre2_ucd.c ./GenerateUcpHeader.py ../src/pcre2_ucp.h ./GenerateUcpTables.py ../src/pcre2_ucptables_inc.h ./GenerateTest.py mv testinput ../testdata/testinput27 mv testoutput ../testdata/testoutput27 ...compile ucptest.c for i in 1 2; do ./ucptest < ucptestdata/testinput$i > testoutput$i diff -U3 testoutput$i ucptestdata/testoutput$i done ``` Preparing for a PCRE2 release ============================= This section contains a checklist of things that I (NCW) do before building a new release. * First of all, make sure that the main branch is in good condition. - Basically, test it. The CI jobs should all be passing. This ensures that pcre2tests are passing, that the build is warning-free, and that all our platforms are running correctly. The CI jobs should be running the Perl tests (which assert that `testdata/testinput1` tests give the same results using Perl's regex engine). The ManyConfigTests exercise a variety of build options and combinations. The Autoconf and CMake builds must pass. - If new build options have been added, ensure that they are added to the CMake files as well as to the Autoconf files. - Run perltest.sh on the test data for tests 1 and 4. The output should match the PCRE2 test output, apart from the version identification at the start of each test. Sometimes there are other differences in test 4 if PCRE2 and Perl are using different Unicode releases. The other tests are not Perl-compatible (they use various PCRE2-specific features or options). The maint/RunPerlTest shell script can be used to do this testing in Unix-like environment. - Check the external testing tools. CodeQL & Clang Static Analyzer report their results to the GitHub "Security" dashboard. Coverity has its own external dashboard, as does OSS-Fuzz. Since we have these tools, we should at least confirm they haven't flagged anything. - Documentation: check the documentation is ready; and LICENCE, NON-AUTOTOOLS-BUILD, and README. 
Many of these won't need changing, but over the long term things do change. * Ensure the AUTHORS file is up-to-date with any new contributors since the last release. I use this simple command: ```sh git log $GIT_TAG..HEAD --format='RealAuthor: %aN <%aE>%n%w(80,4,4)%b' | \ grep -E '^RealAuthor: .*|Co-authored-by:' | \ sed -E -e 's/RealAuthor: |.*Co-authored-by:\s*//' | \ sort -u ``` * Ensure the ChangeLog and NEWS files are updated with everything that you want to announce in the new release. This command helps dump the Git commits: ```sh git log --reverse -p -U10 --invert-grep --grep='#noupdate' $GIT_TAG..HEAD \ -- ':!doc/*.txt' ':!doc/html' ``` * Update the library version numbers in configure.ac according to the rules given below. * Add the new library version to the src/libpcre2-*.sym.in files (even if no new symbols have been added since the last release). * Push all these changes to main. * Take a branch off main, named "release/pcre2-10.XX-RC1" or "release/pcre2-10.XX". All releases should come from main. The final release isn't branched off from the RC branch; the RC branch is a "throwaway" release which can be pruned from the linear history of the trunk of PCRE2's tree. * In the new branch, remove the "-DEV" suffix from the version number and set the release date in configure.ac. Update the release date in the ChangeLog and NEWS files. ``` vim configure.ac vim NEWS vim ChangeLog git add -u configure.ac NEWS ChangeLog git commit -m"Update version number and release date" ``` * Perform updates of the automatically-generated files. ``` ./autogen.sh && ./configure rm src/config.h.generic src/pcre2.h.generic make src/config.h.generic src/pcre2.h.generic git clean -idx . git add -u src/config.h.generic src/pcre2.h.generic git commit -m"Automatic update of .generic files" maint/UpdateRelease.py maint/UpdateDates.py maint/UpdateAlways git add -u git commit -m"Automatic update of doc files #noupdate" ``` * Commit the Autoconf files to the branch.
This is required so that users can check out the Git tag, and receive the same contents as the tarball users. ``` ./autogen.sh git add -f Makefile.in aclocal.m4 ar-lib compile config.guess config.sub \ configure depcomp install-sh ltmain.sh m4/libtool.m4 m4/ltoptions.m4 \ m4/ltsugar.m4 m4/ltversion.m4 m4/lt~obsolete.m4 missing src/config.h.in \ test-driver git commit -m"Commit autogen.sh output" ``` * Now, wait for the CI job to build the tarball. We can't do this locally: we want to be releasing a tarball which is signed by GitHub. The GitHub signature says, "Yes, the developer did not tamper with this tarball, we certify that it was derived solely from the contents of the Git repository at this commit hash". * Create the tag locally. We can't do this via the GitHub UI: it has no way to create signed tags (since my GPG key lives on a Yubikey). ``` git config user.signingkey 'FB63B406!' git tag -s pcre2-10.XX -m"Release 10.XX" git tag -v pcre2-10.XX ``` * Download the tarballs from the CI artifacts. Sign them using the GPG key. ``` KEYID=FB63B406 for i in pcre2-10.XX.{zip,tar.gz,tar.bz2}; do gpg --output $i.sig --detach-sig --default-key $KEYID $i gpg --verify $i.sig done ``` * In the GitHub UI, create a "release" from the tag (which must have been already pushed). Add the tarballs and GPG signatures. * Announce the release on the mailing list. * Bump the version number on main to the next release, plus -DEV. * After issuing a final release, merge the release tag back to main with: ``` git merge -s ours release/pcre2-10.XX ``` Do not do this for -RC releases, which are not included in the linear history of the PCRE2 development trunk. We want users with forks of PCRE2 to be able to update from one release to the next by simply doing a `git merge` in their fork. If the release tag is not merged back to main, then users will see unnecessary Git conflicts when trying to fast-forward from one release to the next.
Updating version info for libtool ================================= This set of rules for updating library version information came from a web page whose URL I have forgotten. The version information consists of three parts: (current, revision, age). 1. Start with version information of 0:0:0 for each libtool library. 2. Update the version information only immediately before a public release of your software. More frequent updates are unnecessary, and only guarantee that the current interface number gets larger faster. 3. If the library source code has changed at all since the last update, then increment revision; c:r:a becomes c:r+1:a. 4. If any interfaces have been added, removed, or changed since the last update, increment current, and set revision to 0. 5. If any interfaces have been added since the last public release, then increment age. 6. If any interfaces have been removed or changed since the last public release, then set age to 0. The following explanation may help in understanding the above rules a bit better. Consider that there are three possible kinds of reaction from users to changes in a shared library: 1. Programs using the previous version may use the new version as a drop-in replacement, and programs using the new version can also work with the previous one. In other words, no recompiling nor relinking is needed. In this case, increment revision only, don't touch current or age. 2. Programs using the previous version may use the new version as a drop-in replacement, but programs using the new version may use APIs not present in the previous one. In other words, a program linking against the new version may fail if linked against the old version at run time. In this case, set revision to 0, increment current and age. 3. Programs may need to be changed, recompiled, relinked in order to use the new version. Increment current, set revision and age to 0. 
Future ideas (wish list) ======================== This section records a list of ideas so that they do not get forgotten. They vary enormously in their usefulness and potential for implementation. Some are very sensible; some are rather wacky. Some have been on this list for many years. . Optimization There are always ideas for new optimizations so as to speed up pattern matching. Most of them try to save work by recognizing a non-match without having to scan all the possibilities. These are some that I've recorded: * /((A{0,5}){0,5}){0,5}(something complex)/ on a non-matching string is very slow, though Perl is fast. Can we speed up somehow? Convert to {0,125}? OTOH, this is pathological - the user could easily fix it. * Turn ={4} into ==== ? (for speed). I once did an experiment, and it seems to have little effect, and maybe makes things worse. * "Ends with literal string" - note that a single character doesn't gain much over the existing "required code unit" feature that just remembers one code unit. * Remember an initial string rather than just 1 code unit. * A required code unit from alternatives - not just the last unit, but an earlier one if common to all alternatives. * Friedl contains other ideas. * The code does not set initial code unit flags for Unicode property types such as \p; I don't know how much benefit there would be for, for example, setting the bits for 0-9 and all values >= xC0 (in 8-bit mode) when a pattern starts with \p{N}. . Perl and PCRE2 sometimes differ in the settings of capturing subpatterns inside repeats. One example of the difference is the matching of /(main(O)?)+/ against mainOmain, where PCRE2 leaves $2 set. In Perl, it's unset. Changing this in PCRE2 will be very hard because I think it needs much more state to be remembered. . A feature to suspend a match via a callout was once requested. . An option to convert results into character offsets and character lengths. . 
A (non-Unix) user wanted pcregrep options to (a) list a file name just once, preceded by a blank line, instead of adding it to every matched line, and (b) support --outputfile=name. . Define a union for the results from pcre2_pattern_info(). . Provide a "random access to the subject" facility so that the way in which it is stored is independent of PCRE2. For efficiency, it probably isn't possible to switch this dynamically. It would have to be specified when PCRE2 was compiled. PCRE2 would then call a function every time it wanted a character. . pcre2grep: add -rs for a sorted recurse. Having to store file names and sort them will of course slow it down. . Someone suggested --disable-callout to save code space when callouts are never wanted. This seems rather marginal. . A user suggested a parameter to limit the length of string matched, for example if the parameter is N, the current match should fail if the matched substring exceeds N. This could apply to both match functions. The value could be a new field in the match context. Compare the offset_limit feature, which limits where a match must start. . Write a function that generates random matching strings for a compiled pattern. . Pcre2grep: an option to specify the output line separator, either as a string or select from a fixed list. This is not straightforward, because at the moment it outputs whatever is in the input file. . Improve the code for duplicate checking in pcre2_dfa_match(). An incomplete, non-thread-safe patch showed that this can help performance for patterns where there are many alternatives. However, a simple thread-safe implementation that I tried made things worse in many simple cases, so this is not an obviously good thing. . PCRE2 cannot at present distinguish between subpatterns with different names, but the same number (created by the use of ?|). In order to do so, a way of remembering *which* subpattern numbered n matched is needed. 
(*MARK) can perhaps be used as a way round this problem. However, note that Perl does not distinguish: like PCRE2, a name is just an alias for a number in Perl. . Implement something like (?(R2+)... to check outer recursions. . If Perl ever supports the POSIX notation [[.something.]] PCRE2 should try to follow. . A user wanted a way of ignoring all Unicode "mark" characters so that, for example "a" followed by an accent would, together, match "a". This can only be done clumsily at present by using a lookahead such as /(?=a)\X/, which works for "combining" characters. . Perl supports [\N{x}-\N{y}] as a Unicode range, even in EBCDIC. PCRE2 supports \N{U+dd..} everywhere, but not in EBCDIC. . Unicode stuff from Perl: \b{gcb} or \b{g} grapheme cluster boundary \b{sb} sentence boundary \b{wb} word boundary See Unicode TR 29. The last two are very much aimed at natural language. . Allow a callout to specify a number of characters to skip. This can be done compatibly via an extra callout field. . Allow callouts to return *PRUNE, *COMMIT, *THEN, *SKIP, with and without continuing (that is, with and without an implied *FAIL). A new option, PCRE2_CALLOUT_EXTENDED say, would be needed. This is unlikely ever to be implemented by JIT, so this could be an option for pcre2_match(). . A limit on substitutions: a user suggested somehow finding a way of making match_limit apply to the whole operation instead of each match separately. . Some #defines could be replaced with enums to improve robustness. . There was a request for an option for pcre2_match() to return the longest match. This would mean searching for all possible matches, of course. . A neater way of handling recursion file names in pcre2grep, e.g. a single buffer that can grow. See also GitHub issue #2 (recursion looping via symlinks). . A user suggested that before/after parameters in pcre2grep could have negative values, to list lines near to the matched line, but not necessarily the line itself. 
For example, --before-context=-1 would list the line *after* each matched line, without showing the matched line. The problem here is what to do with matches that are close together. Maybe a simpler way would be a flag to disable showing matched lines, only valid with either -A or -B? . There was a suggestion for a pcre2grep colour default, or possibly a more general PCRE2GREP_OPT, but only for some options - not file names or patterns. . Breaking loops that match an empty string: perhaps find a way of continuing if *something* has changed, but this might mean remembering additional data. "Something" could be a capture value, but then a list of previous values would be needed to avoid a cycle of changes. . If a function could be written to find 3-character (or other length) fixed strings, at least one of which must be present for a match, efficient pre-searching of large datasets could be implemented. . If pcre2grep had --first-line (match only in the first line) it could be efficiently used to find files "starting with xxx". What about --last-line? There was also the suggestion of an option for pcre2grep to scan only the start of a file. I am not keen - this is the job of "head". . A user requested a means of determining whether a failed match was failed by the start-of-match optimizations, or by running the match engine. Easy enough to define a bit in the match data, but all three matchers would need work. . Would inlining "simple" recursions provide a useful performance boost for the interpreters? JIT already does some of this, but it may not be worth it for the interpreters. . Redesign handling of class/nclass/xclass because the compile code logic is currently very contorted and obscure. Also there was a request for a way of re-defining \w (and therefore \W, \b, and \B). An in-pattern sequence such as (?w=[...]) was suggested. Easiest way would be simply to inline the class, with lookarounds for \b and \B. 
Ideally the setting should last till the end of the group, which means remembering all previous settings; maybe a fixed amount of stack would do - how deep would anyone want to nest these things? . A user suggested something like --with-build-info to set a build information string that could be retrieved by pcre2_config(). However, there's no facility for a length limit in pcre2_config(), and what would be the encoding? . Quantified groups with a fixed count currently operate by replicating the group in the compiled bytecode. This may not really matter in these days of gigabyte memory, but perhaps another implementation might be considered. Needs coordination between the interpreters and JIT. . The POSIX interface is no longer POSIX compatible, because regoff_t is still defined as an int. . The POSIX interface is not thread safe because it modifies a pcre2_match inside its regex_t while doing matching. A thread safe version that uses a thread local object has been proposed but it will require that the code requires at least C11 compatibility. . See also any suggestions in the GitHub issues. Philip Hazel Email local part: Philip.Hazel Email domain: gmail.com Last updated: 22 August 2024 ================================================ FILE: maint/RunCoverage ================================================ #! /bin/sh # Script to run tests with coverage and filter the results. # # We assume that the source has been configured and built with LLVM's source-based # coverage instrumentation. # # Must be run in the build directory. set -e clang_report=0 nomalloc=0 while [ $# -gt 0 ] ; do case $1 in nomalloc|-nomalloc) nomalloc=1;; clang|-clang|clang-report|-clang-report) clang_report=1;; *) echo "Unknown option or test selector '$1'"; exit 1;; esac shift done LLVM_VER=`clang --version | head -n1 | grep -Eo '[0-9]+\.[0-9]+\.[0-9]+' | cut -d. 
-f1`
echo "(Using LLVM version $LLVM_VER)"
echo ""

# Start from a clean slate: delete raw profiles, merged profile data and LCOV
# files left in the current directory by a previous run.
rm -f coverage-*.profraw coverage.profdata coverage-lcov.info coverage-lcov.filtered.info
# When there is no "testdata" directory here, raw profiles in the parent
# directory are deleted too.
# NOTE(review): presumably this detects a build directory nested inside the
# source tree, where some tests write their profiles one level up - confirm.
[ -d testdata ] || rm -f ../coverage-*.profraw
rm -rf coverage-html

# Run the whole CTest suite serially (-j1). LLVM_PROFILE_FILE tells the
# instrumented binaries where to write their raw profiles; "%m" is a pattern
# expanded by the LLVM profiling runtime (see the Clang coverage docs).
echo "== Running all tests with CTest =="
LLVM_PROFILE_FILE="coverage-%m.profraw" ctest -j1 --output-on-failure
echo ""

# Unless "nomalloc" was given on the command line, run RunTest once more with
# its -malloc option so that this mode also contributes coverage data.
if [ "$nomalloc" -eq 0 ]; then
  echo "== Re-running pcre2test with -malloc =="
  LLVM_PROFILE_FILE="coverage-%m.profraw" srcdir=.. pcre2test=./pcre2test ../RunTest -malloc
  echo ""
fi

# Merge the profiles gathered
echo "== Merging coverage data =="
# PROF_FILES deliberately holds unexpanded glob patterns; it is left unquoted
# on the llvm-profdata command line so that the shell expands them there.
PROF_FILES="coverage-*.profraw"
[ -d testdata ] || PROF_FILES="$PROF_FILES ../coverage-*.profraw"
llvm-profdata-$LLVM_VER merge -sparse $PROF_FILES -o coverage.profdata
echo ""

if [ "$clang_report" -eq 1 ]; then
  # "clang" mode: let llvm-cov render the HTML report directly into
  # coverage-html, covering all four test executables.
  echo "== Generating Clang coverage report =="
  llvm-cov-$LLVM_VER show \
    -format=html \
    -show-line-counts-or-regions -show-branches=percent \
    -instr-profile=coverage.profdata \
    ./pcre2test -object ./pcre2grep -object ./pcre2posix_test -object ./pcre2_jit_test \
    -sources ../src/ ./ \
    -output-dir=coverage-html
  echo ""
else
  # Output LCOV-compatible output, for downstream tools
  echo "== Generating LCOV report =="
  llvm-cov-$LLVM_VER export \
    -format=lcov \
    -instr-profile=coverage.profdata \
    ./pcre2test -object ./pcre2grep -object ./pcre2posix_test -object ./pcre2_jit_test \
    -sources ../src/ ./ \
    > ./coverage-lcov.info
  echo ""

  # Filter out lines marked with "LCOV_EXCL_LINE" or "LCOV_EXCL_START"/"LCOV_EXCL_STOP"
  # The filtered data is written to a temporary file and then moved over the
  # unfiltered file, so later steps only ever see the filtered version.
  echo "== Filtering LCOV report =="
  python3 ../maint/FilterCoverage.py ./coverage-lcov.info > ./coverage-lcov.filtered.info
  mv ./coverage-lcov.filtered.info ./coverage-lcov.info
  echo ""

  # Use genhtml to generate an HTML report from the LCOV data
  echo "== Generating HTML report =="
  mkdir -p coverage-html
  genhtml \
    --highlight --branch-coverage --legend --title "PCRE2 code coverage report" --num-spaces 2 \
    -o coverage-html ./coverage-lcov.info
  echo ""
fi
================================================
FILE: maint/RunManifestTest
================================================
#! /bin/sh

# Script to test a directory listing. We use this to verify that the list of
# files installed by "make install" or "cmake --install" matches what we expect.
#
# Arguments:
#   $1  directory whose contents are listed
#   $2  manifest file holding the expected listing
#   $3  optional CMake build type (default "release"), substituted into the
#       name of the pcre2-targets-<type>.cmake entry of the manifest
#
# Exit status is 0 when the listing matches the manifest and 1 otherwise. On a
# mismatch the <manifest>.actual and <manifest>.expected files are left in the
# current directory for inspection.

set -e

# Ensure stable ordering of `sort` output
LANG=C
LC_ALL=C
export LANG LC_ALL

# Two separate tests joined with || are used instead of "-o", which POSIX
# marks obsolescent and which can be misparsed for unusual operands.
if [ -z "$1" ] || [ -z "$2" ] ; then
  echo "Usage: $0 <input dir> <expected manifest> [<build type>]" >&2
  exit 1
fi

input_dir="$1"
expected_manifest="$2"
build_type="${3:-release}"

# Quote the manifest path so that names containing spaces survive basename.
manifest_base=`basename "$expected_manifest"`
actual_file=$manifest_base.actual
expected_file=$manifest_base.expected

sed=sed
# Helper for Solaris
if [ -f /usr/bin/gsed ] ; then
  sed=/usr/bin/gsed
fi

# Produce one "ls -l -d -n" line per installed path, squeeze runs of spaces,
# and keep only field 1 (the mode string) and fields 9 onwards (the name and,
# for symlinks, the target).
find "$input_dir" -print | \
  sort | \
  xargs -n1 -- ls -l -d -n | \
  $sed -E -e 's/ {2,}/ /g' | \
  cut -d' ' -f '1,9-' \
  > "$actual_file"

# The CMake install is a bit annoying now. Its installed files are actually
# dependent on the build type. So, if the build type is not "release", we need
# to modify the expected manifest to match the actual one.
$sed -E -e "s/pcre2-targets-release\.cmake/pcre2-targets-$build_type.cmake/" \
  "$expected_manifest" \
  > "$expected_file"

if ! diff -u "$expected_file" "$actual_file"; then
  echo "Installed files differ from expected"
  echo "===Actual==="
  cat "$actual_file"
  echo "===End==="
  exit 1
fi

echo "Installed files match expected"
rm -f "$actual_file" "$expected_file"

================================================
FILE: maint/RunManifestTest.ps1
================================================
# Script to test a directory listing. We use this to verify that the list of
# files installed by "make install" or "cmake --install" matches what we expect.
# Parameters:
#   -inputDir      directory whose contents are listed
#   -manifestName  manifest file holding the expected listing
param (
    [Parameter(Mandatory=$true)]
    [string]$inputDir,

    [Parameter(Mandatory=$true)]
    [string]$manifestName
)

if ((-not $inputDir) -or (-not $manifestName)) {
    throw "Usage: .\RunManifestTest.ps1 <inputDir> <manifestName>"
}

# The actual listing is written to a file in the current directory that is
# named after the manifest.
$base = [System.IO.Path]::GetFileName($manifestName)

# List every installed item as "<first five mode characters> <relative path>".
# Sorting on the hex dump of the UTF-8 bytes of the full path gives an
# ordering that does not depend on the current culture.
$installedFiles = Get-ChildItem -Recurse -Force -Path $inputDir |
    Sort-Object {[System.BitConverter]::ToString([system.Text.Encoding]::UTF8.GetBytes($_.FullName))} |
    ForEach-Object { $_.Mode.Substring(0,5) + " " + ($_.FullName | Resolve-Path -Relative) }

# Normalise line endings to LF before writing the listing out.
$null = New-Item -Force $base -Value (($installedFiles | Out-String) -replace "`r`n", "`n")

$expectedFiles = Get-Content -Path $manifestName -Raw
$actualFiles = Get-Content -Path $base -Raw

# -cne is the case-sensitive comparison; plain -ne ignores case for strings
# and would accept a listing that differs from the manifest only in case.
if ($expectedFiles -cne $actualFiles) {
    Write-Host "===Actual==="
    Write-Host $actualFiles
    Write-Host "===End==="
    throw "Installed files differ from expected"
}

Write-Host "Installed files match expected"
Remove-Item -Path $base -Force

================================================
FILE: maint/RunPerlTest
================================================
#! /bin/sh

# Script to run the Perl-compatible PCRE2 tests through Perl. For testing
# with different versions of Perl, if the first argument is "-perl" then the
# second is taken as the Perl command to use, and both are then removed.
#
# The argument can be the number of the specific Perl compatible test to run
# (ex: "1", "4", "26" or "27"), otherwise it runs all tests and returns at
# exit, the test number with an incorrect output or the test number plus 32
# if it failed to run completely. It returns with 0 on success.

# This script should be run with the main PCRE2 directory current.
if [ "$1" = "-perl" ]; then
  PERL="$2"
  ARGS="$1 $PERL"
  shift 2
else
  PERL=perl
  ARGS=""
fi

# Keep the optional test selector in a variable of its own, because "$1" has
# a different meaning inside the helper functions below.
SELECTED="$1"

RC=0

# selected NUM
# Succeeds when no test number was given, or when the given number is NUM.
selected() {
  [ -z "$SELECTED" ] || [ "$SELECTED" = "$1" ]
}

# banner TITLE NUM
# Prints the heading that introduces one test.
banner() {
  echo "-----------------------------------------------------------------"
  echo "Perl test: $1 (PCRE2 test $2)"
}

# run_test NUM [OPTION]
# Runs testdata/testinputNUM through perltest.sh (with OPTION, if given) and
# compares the output, minus its first line, with testdata/testoutputNUM.
# Sets RC to NUM when the output differs, or to NUM+32 when perltest.sh fails.
run_test() {
  if ./perltest.sh $ARGS $2 "testdata/testinput$1" testtry; then
    tail -n +2 testtry > testtry2
    diff -u "testdata/testoutput$1" testtry2 || RC=$1
  else
    RC=$(($1 + 32))
  fi
  /bin/rm -f testtry testtry2
  echo ""
}

# run_unicode_test NUM
# As run_test, but the test is skipped unless the Unicode version named near
# the top of testdata/testinputNUM equals the one Perl reports (held in $P).
run_unicode_test() {
  banner "Unicode property tests" "$1"
  U=$(head -5 "testdata/testinput$1" | $PERL -ne 'print "$1\n" if /tests for version ([\d.]+)$/')
  if [ "$U" != "$P" ]; then
    echo "SKIPPED: Perl uses Unicode $P but version $U was expected"
  else
    run_test "$1"
  fi
}

if selected 1; then
  banner "main functionality" 1
  run_test 1
fi

if selected 4; then
  banner "UTF-8 and Unicode property features" 4
  run_test 4 -utf8
fi

# Unicode version supported by the Perl in use, needed by tests 26 and 27.
P=$($PERL -MUnicode::UCD -e 'print Unicode::UCD::UnicodeVersion, "\n"')

if selected 26; then
  run_unicode_test 26
fi

if selected 27; then
  run_unicode_test 27
fi

exit
$RC # End

================================================
FILE: maint/RunSymbolTest
================================================
#! /bin/sh

# Script to test that all the symbols of a shared object are as expected.
#
# Arguments:
#   $1  directory containing the built libpcre2-* shared objects
#   $2  directory containing the manifest-libpcre2-*.so files
#
# Exit status is 0 when every library matches its manifest and 1 at the first
# mismatch, in which case the <manifest>.actual and <manifest>.expected files
# are left in the current directory for inspection.

set -e

# Ensure stable ordering of `sort` output
LANG=C
LC_ALL=C
export LANG LC_ALL

# Two separate tests joined with || are used instead of "-o", which POSIX
# marks obsolescent and which can be misparsed for unusual operands.
if [ -z "$1" ] || [ -z "$2" ] ; then
  echo "Usage: $0 <input dir> <manifest dir>" >&2
  exit 1
fi

input_dir="$1"
manifest_dir="$2"

sed=sed
grep=grep
# Helpers for Solaris
if [ -f /usr/bin/gsed ] ; then
  sed=/usr/bin/gsed
fi
if [ -f /usr/bin/ggrep ] ; then
  grep=/usr/bin/ggrep
fi

# The operating system name drives every platform decision below, so it is
# looked up once rather than at each test.
os=`uname -s`

nm="nm -B -D"
if [ "$os" = "Linux" ]; then
  nm="$nm --with-symbol-versions"
elif [ "$os" = "FreeBSD" ]; then
  # Use llvm-nm to get symbol version information
  nm="llvm-nm -B -D"
elif [ "$os" = "SunOS" ]; then
  # Highly annoyingly, Solaris' nm doesn't show symbol versions, so here is a
  # laborious way to reformat the output of elfdump to be as we require.
  #nm="nm -p -h -D -g"
  nm=emulated_nm
  emulated_nm() {
    # Grab the versions from the version table, and convert into a sed script.
    VERSUB=`elfdump -v "$1" | \
      $grep -E '^\s*\[[0-9]+\]' | \
      $sed -E -e 's/\s+/,/g' | \
      tr -d '\[\]' | \
      cut -d',' -f 2,3 | \
      $sed -E -e 's/([0-9]+),(.*)/s\/@@\1\/@@\2\/;/'`

    # Then grab the symbols and heavily reformat the output to match nm.
    elfdump -T SHT_DYNSYM "$1" | \
      $grep -E '^\s*\[[0-9]+\]' | \
      $sed -E -e 's/\s+/ /g' | \
      cut -d' ' -f 3,8,9,10 | \
      $sed -E -e 's/^0x//;s/([^ ]*) ([^ ]*) ([^ ]*) ([^ ]*)/\1 \3 \4@@\2/;' | \
      $sed -E -e 's/^([^ ]*) .text /\1 T /' | \
      $sed -E -e 's/^([^ ]*) ([UA])(NDEF|BS) /\1 \2 /' | \
      $sed -E -e "$VERSUB"
  }
elif [ "$os" = "Darwin" ]; then
  nm="nm -B -g"
fi

# On Darwin the "@version" suffixes are stripped from the expected manifest
# (see the loop below), the libraries are .dylib files, and so_mangling
# removes an underscore in front of each symbol name.
supports_versions=1
so_ext=so
so_mangling() { cat; }
if [ "$os" = "Darwin" ]; then
  supports_versions=0
  so_ext=dylib
  so_mangling()
  {
    $sed -E -e 's/_([_0-9a-zA-Z]+)$/\1/g'
  }
fi

for so_name in "libpcre2-8" "libpcre2-16" "libpcre2-32" "libpcre2-posix"; do
  expected_file="$manifest_dir/manifest-$so_name.so"
  so_file="$input_dir/$so_name.$so_ext"
  # Quote the path so that directories containing spaces survive basename.
  base=`basename "$expected_file"`

  # List the symbols, drop the address column, and discard undefined/weak
  # entries, absolute PCRE2_* symbols and linker-generated symbols before
  # sorting.
  $nm "$so_file" | \
    $sed -E -e 's/^[0-9a-fA-F]* *//g' | \
    $grep -E -v '^[Uw] ' | \
    $grep -E -v '^A PCRE2_' | \
    $grep -E -v ' (_init|_fini)($|@)' | \
    $grep -E -v ' (__bss_start|_end|_DYNAMIC|_GLOBAL_OFFSET_TABLE_|_PROCEDURE_LINKAGE_TABLE_|_edata|_etext)($|@)' | \
    so_mangling | \
    sort \
    > "$base.actual"

  if [ $supports_versions -eq 0 ]; then
    $sed -E -e 's/@.*$//' "$expected_file" \
      > "$base.expected"
  else
    cp "$expected_file" "$base.expected"
  fi

  if ! diff -u "$base.expected" "$base.actual"; then
    echo "Shared object contents for $so_file differ from expected"
    echo "===Actual==="
    cat "$base.actual"
    echo "===End==="
    exit 1
  fi

  echo "Shared object contents for $so_file match expected"
  rm -f "$base.expected" "$base.actual"
done

================================================
FILE: maint/RunSymbolTest.ps1
================================================
# Script to test that all the symbols of a DLL are as expected.
param (
    # Directory searched for the pcre2-*.dll files.
    [Parameter(Mandatory=$true)]
    [string]$inputDir,

    # Directory searched for the manifest-libpcre2-*.so expected listings.
    [Parameter(Mandatory=$true)]
    [string]$manifestDir
)

if ((-not $inputDir) -or (-not $manifestDir)) {
    throw "Usage: .\RunSymbolTest.ps1 <inputDir> <manifestDir>"
}

# Get path to dumpbin using vswhere. The lookup does not depend on the DLL
# being checked, so it is done once instead of on every loop iteration.
$vswhere = "C:\Program Files (x86)\Microsoft Visual Studio\Installer\vswhere.exe"
$dumpbin = & $vswhere -latest -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -find VC\Tools\MSVC\*\bin\Hostx64\x64\dumpbin.exe | Select-Object -First 1

# Fail with a clear message rather than trying to invoke an empty command.
if (-not $dumpbin) {
    throw "dumpbin.exe was not found via $vswhere"
}

$dllNames = @("pcre2-8", "pcre2-16", "pcre2-32", "pcre2-posix")

foreach ($dllName in $dllNames) {
    $expectedFile = Join-Path $manifestDir ("manifest-lib$dllName.so")
    $dllFile = Join-Path $inputDir ("$dllName.dll")
    $base = [System.IO.Path]::GetFileName($expectedFile)

    # Turn each export-table row printed by dumpbin into a "T <name>" line,
    # then sort the result.
    $actualSymbols = & $dumpbin /exports $dllFile |
        ForEach-Object {
            if ($_ -match '^\s*\d+\s+[0-9A-Fa-f]+\s+[0-9A-Fa-f]+\s+(\S+)') {
                "T $($matches[1])"
            }
        } |
        Where-Object { $_ -match '^T ' } |
        Sort-Object

    $actualOutput = ($actualSymbols -join "`n") + "`n"
    # Written to the current directory; removed only when the check passes.
    $null = New-Item -Force $base -Value $actualOutput

    # Remove the "@@..." suffix from every manifest entry before comparing.
    $expectedSymbols = (Get-Content -Path $expectedFile -Raw).TrimEnd("`n") |
        ForEach-Object { $_ -replace '@@.*', '' }
    $expectedOutput = ($expectedSymbols -join "`n") + "`n"

    if ($expectedOutput -ne $actualOutput) {
        Write-Host "Shared object contents for $dllFile differ from expected"
        Write-Host "===Actual==="
        Write-Host $actualOutput
        Write-Host "===End==="
        throw "Symbol test failed"
    } else {
        Write-Host "Shared object contents for $dllFile match expected"
    }

    Remove-Item -Path $base -Force
}

================================================ FILE: maint/Unicode.tables/BidiMirroring.txt ================================================ # BidiMirroring-17.0.0.txt # Date: 2025-08-01 # © 2025 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
# For terms of use and license, see https://www.unicode.org/terms_of_use.html # # Unicode Character Database # For documentation, see https://www.unicode.org/reports/tr44/ # # Bidi_Mirroring_Glyph Property # # This file is an informative contributory data file in the # Unicode Character Database. # # This data file lists characters that have the Bidi_Mirrored=Yes property # value, for which there is another Unicode character that typically has a glyph # that is the mirror image of the original character's glyph. # # The repertoire covered by the file is Unicode 17.0.0. # # The file contains a list of lines with mappings from one code point # to another one for character-based mirroring. # Note that for "real" mirroring, a rendering engine needs to select # appropriate alternative glyphs, and that many Unicode characters do not # have a mirror-image Unicode character. # # Each mapping line contains two fields, separated by a semicolon (';'). # Each of the two fields contains a code point represented as a # variable-length hexadecimal value with 4 to 6 digits. # A comment indicates where the characters are "BEST FIT" mirroring. # # Code points for which Bidi_Mirrored=Yes, but for which no appropriate # characters exist with mirrored glyphs, are # listed as comments at the end of the file. # # Formally, the default value of the Bidi_Mirroring_Glyph property # for each code point is <none>, unless a mapping to # some other character is specified in this data file. When a code # point has the default value for the Bidi_Mirroring_Glyph property, # that means that no other character exists whose glyph is suitable # for character-based mirroring. # # For information on bidi mirroring, see UAX #9: Unicode Bidirectional Algorithm, # at https://www.unicode.org/reports/tr9/ # # This file was originally created by Markus Scherer.
# Extended for Unicode 3.2, 4.0, 4.1, 5.0, 5.1, 5.2, and 6.0 by Ken Whistler, # and for subsequent versions by Ken Whistler, Laurentiu Iancu, Roozbeh Pournader, # and Robin Leroy. # # Historical and Compatibility Information: # # The OpenType Mirroring Pairs List (OMPL) is frozen to match the # Unicode 5.1 version of the Bidi_Mirroring_Glyph property (2008). # See https://www.microsoft.com/typography/otspec/ompl.txt # # The Unicode 6.1 version of the Bidi_Mirroring_Glyph property (2011) # added one mirroring pair: 27CB <--> 27CD. # # The Unicode 11.0 version of the Bidi_Mirroring_Glyph property (2018) # underwent a substantial revision, to formally recognize all of the # exact mirroring pairs and "BEST FIT" mirroring pairs that had been # added after the freezing of the OMPL list. As a result, starting # with Unicode 11.0, the bmg mapping values more accurately reflect # the current status of glyphs for Bidi_Mirrored characters in # the Unicode Standard, but this listing now extends significantly # beyond the frozen OMPL list. Implementers should be aware of this # intentional distinction. 
# # ############################################################ # # Property: Bidi_Mirroring_Glyph # # @missing: 0000..10FFFF; <none> 0028; 0029 # LEFT PARENTHESIS 0029; 0028 # RIGHT PARENTHESIS 003C; 003E # LESS-THAN SIGN 003E; 003C # GREATER-THAN SIGN 005B; 005D # LEFT SQUARE BRACKET 005D; 005B # RIGHT SQUARE BRACKET 007B; 007D # LEFT CURLY BRACKET 007D; 007B # RIGHT CURLY BRACKET 00AB; 00BB # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK 00BB; 00AB # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK 0F3A; 0F3B # TIBETAN MARK GUG RTAGS GYON 0F3B; 0F3A # TIBETAN MARK GUG RTAGS GYAS 0F3C; 0F3D # TIBETAN MARK ANG KHANG GYON 0F3D; 0F3C # TIBETAN MARK ANG KHANG GYAS 169B; 169C # OGHAM FEATHER MARK 169C; 169B # OGHAM REVERSED FEATHER MARK 2039; 203A # SINGLE LEFT-POINTING ANGLE QUOTATION MARK 203A; 2039 # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK 2045; 2046 # LEFT SQUARE BRACKET WITH QUILL 2046; 2045 # RIGHT SQUARE BRACKET WITH QUILL 207D; 207E # SUPERSCRIPT LEFT PARENTHESIS 207E; 207D # SUPERSCRIPT RIGHT PARENTHESIS 208D; 208E # SUBSCRIPT LEFT PARENTHESIS 208E; 208D # SUBSCRIPT RIGHT PARENTHESIS 2208; 220B # ELEMENT OF 2209; 220C # [BEST FIT] NOT AN ELEMENT OF 220A; 220D # SMALL ELEMENT OF 220B; 2208 # CONTAINS AS MEMBER 220C; 2209 # [BEST FIT] DOES NOT CONTAIN AS MEMBER 220D; 220A # SMALL CONTAINS AS MEMBER 2215; 29F5 # DIVISION SLASH 221F; 2BFE # RIGHT ANGLE 2220; 29A3 # ANGLE 2221; 299B # MEASURED ANGLE 2222; 29A0 # SPHERICAL ANGLE 2224; 2AEE # DOES NOT DIVIDE 223C; 223D # TILDE OPERATOR 223D; 223C # REVERSED TILDE 2243; 22CD # ASYMPTOTICALLY EQUAL TO 2245; 224C # APPROXIMATELY EQUAL TO 224C; 2245 # ALL EQUAL TO 2252; 2253 # APPROXIMATELY EQUAL TO OR THE IMAGE OF 2253; 2252 # IMAGE OF OR APPROXIMATELY EQUAL TO 2254; 2255 # COLON EQUALS 2255; 2254 # EQUALS COLON 2264; 2265 # LESS-THAN OR EQUAL TO 2265; 2264 # GREATER-THAN OR EQUAL TO 2266; 2267 # LESS-THAN OVER EQUAL TO 2267; 2266 # GREATER-THAN OVER EQUAL TO 2268; 2269 # [BEST FIT] LESS-THAN BUT NOT EQUAL TO 2269; 2268 # [BEST
FIT] GREATER-THAN BUT NOT EQUAL TO 226A; 226B # MUCH LESS-THAN 226B; 226A # MUCH GREATER-THAN 226E; 226F # [BEST FIT] NOT LESS-THAN 226F; 226E # [BEST FIT] NOT GREATER-THAN 2270; 2271 # [BEST FIT] NEITHER LESS-THAN NOR EQUAL TO 2271; 2270 # [BEST FIT] NEITHER GREATER-THAN NOR EQUAL TO 2272; 2273 # [BEST FIT] LESS-THAN OR EQUIVALENT TO 2273; 2272 # [BEST FIT] GREATER-THAN OR EQUIVALENT TO 2274; 2275 # [BEST FIT] NEITHER LESS-THAN NOR EQUIVALENT TO 2275; 2274 # [BEST FIT] NEITHER GREATER-THAN NOR EQUIVALENT TO 2276; 2277 # LESS-THAN OR GREATER-THAN 2277; 2276 # GREATER-THAN OR LESS-THAN 2278; 2279 # [BEST FIT] NEITHER LESS-THAN NOR GREATER-THAN 2279; 2278 # [BEST FIT] NEITHER GREATER-THAN NOR LESS-THAN 227A; 227B # PRECEDES 227B; 227A # SUCCEEDS 227C; 227D # PRECEDES OR EQUAL TO 227D; 227C # SUCCEEDS OR EQUAL TO 227E; 227F # [BEST FIT] PRECEDES OR EQUIVALENT TO 227F; 227E # [BEST FIT] SUCCEEDS OR EQUIVALENT TO 2280; 2281 # [BEST FIT] DOES NOT PRECEDE 2281; 2280 # [BEST FIT] DOES NOT SUCCEED 2282; 2283 # SUBSET OF 2283; 2282 # SUPERSET OF 2284; 2285 # [BEST FIT] NOT A SUBSET OF 2285; 2284 # [BEST FIT] NOT A SUPERSET OF 2286; 2287 # SUBSET OF OR EQUAL TO 2287; 2286 # SUPERSET OF OR EQUAL TO 2288; 2289 # [BEST FIT] NEITHER A SUBSET OF NOR EQUAL TO 2289; 2288 # [BEST FIT] NEITHER A SUPERSET OF NOR EQUAL TO 228A; 228B # [BEST FIT] SUBSET OF WITH NOT EQUAL TO 228B; 228A # [BEST FIT] SUPERSET OF WITH NOT EQUAL TO 228F; 2290 # SQUARE IMAGE OF 2290; 228F # SQUARE ORIGINAL OF 2291; 2292 # SQUARE IMAGE OF OR EQUAL TO 2292; 2291 # SQUARE ORIGINAL OF OR EQUAL TO 2298; 29B8 # CIRCLED DIVISION SLASH 22A2; 22A3 # RIGHT TACK 22A3; 22A2 # LEFT TACK 22A6; 2ADE # ASSERTION 22A8; 2AE4 # TRUE 22A9; 2AE3 # FORCES 22AB; 2AE5 # DOUBLE VERTICAL BAR DOUBLE RIGHT TURNSTILE 22B0; 22B1 # PRECEDES UNDER RELATION 22B1; 22B0 # SUCCEEDS UNDER RELATION 22B2; 22B3 # NORMAL SUBGROUP OF 22B3; 22B2 # CONTAINS AS NORMAL SUBGROUP 22B4; 22B5 # NORMAL SUBGROUP OF OR EQUAL TO 22B5; 22B4 # CONTAINS AS NORMAL 
SUBGROUP OR EQUAL TO 22B6; 22B7 # ORIGINAL OF 22B7; 22B6 # IMAGE OF 22B8; 27DC # MULTIMAP 22C9; 22CA # LEFT NORMAL FACTOR SEMIDIRECT PRODUCT 22CA; 22C9 # RIGHT NORMAL FACTOR SEMIDIRECT PRODUCT 22CB; 22CC # LEFT SEMIDIRECT PRODUCT 22CC; 22CB # RIGHT SEMIDIRECT PRODUCT 22CD; 2243 # REVERSED TILDE EQUALS 22D0; 22D1 # DOUBLE SUBSET 22D1; 22D0 # DOUBLE SUPERSET 22D6; 22D7 # LESS-THAN WITH DOT 22D7; 22D6 # GREATER-THAN WITH DOT 22D8; 22D9 # VERY MUCH LESS-THAN 22D9; 22D8 # VERY MUCH GREATER-THAN 22DA; 22DB # LESS-THAN EQUAL TO OR GREATER-THAN 22DB; 22DA # GREATER-THAN EQUAL TO OR LESS-THAN 22DC; 22DD # EQUAL TO OR LESS-THAN 22DD; 22DC # EQUAL TO OR GREATER-THAN 22DE; 22DF # EQUAL TO OR PRECEDES 22DF; 22DE # EQUAL TO OR SUCCEEDS 22E0; 22E1 # [BEST FIT] DOES NOT PRECEDE OR EQUAL 22E1; 22E0 # [BEST FIT] DOES NOT SUCCEED OR EQUAL 22E2; 22E3 # [BEST FIT] NOT SQUARE IMAGE OF OR EQUAL TO 22E3; 22E2 # [BEST FIT] NOT SQUARE ORIGINAL OF OR EQUAL TO 22E4; 22E5 # [BEST FIT] SQUARE IMAGE OF OR NOT EQUAL TO 22E5; 22E4 # [BEST FIT] SQUARE ORIGINAL OF OR NOT EQUAL TO 22E6; 22E7 # [BEST FIT] LESS-THAN BUT NOT EQUIVALENT TO 22E7; 22E6 # [BEST FIT] GREATER-THAN BUT NOT EQUIVALENT TO 22E8; 22E9 # [BEST FIT] PRECEDES BUT NOT EQUIVALENT TO 22E9; 22E8 # [BEST FIT] SUCCEEDS BUT NOT EQUIVALENT TO 22EA; 22EB # [BEST FIT] NOT NORMAL SUBGROUP OF 22EB; 22EA # [BEST FIT] DOES NOT CONTAIN AS NORMAL SUBGROUP 22EC; 22ED # [BEST FIT] NOT NORMAL SUBGROUP OF OR EQUAL TO 22ED; 22EC # [BEST FIT] DOES NOT CONTAIN AS NORMAL SUBGROUP OR EQUAL 22F0; 22F1 # UP RIGHT DIAGONAL ELLIPSIS 22F1; 22F0 # DOWN RIGHT DIAGONAL ELLIPSIS 22F2; 22FA # ELEMENT OF WITH LONG HORIZONTAL STROKE 22F3; 22FB # ELEMENT OF WITH VERTICAL BAR AT END OF HORIZONTAL STROKE 22F4; 22FC # SMALL ELEMENT OF WITH VERTICAL BAR AT END OF HORIZONTAL STROKE 22F6; 22FD # ELEMENT OF WITH OVERBAR 22F7; 22FE # SMALL ELEMENT OF WITH OVERBAR 22FA; 22F2 # CONTAINS WITH LONG HORIZONTAL STROKE 22FB; 22F3 # CONTAINS WITH VERTICAL BAR AT END OF HORIZONTAL STROKE 
22FC; 22F4 # SMALL CONTAINS WITH VERTICAL BAR AT END OF HORIZONTAL STROKE 22FD; 22F6 # CONTAINS WITH OVERBAR 22FE; 22F7 # SMALL CONTAINS WITH OVERBAR 2308; 2309 # LEFT CEILING 2309; 2308 # RIGHT CEILING 230A; 230B # LEFT FLOOR 230B; 230A # RIGHT FLOOR 2329; 232A # LEFT-POINTING ANGLE BRACKET 232A; 2329 # RIGHT-POINTING ANGLE BRACKET 2768; 2769 # MEDIUM LEFT PARENTHESIS ORNAMENT 2769; 2768 # MEDIUM RIGHT PARENTHESIS ORNAMENT 276A; 276B # MEDIUM FLATTENED LEFT PARENTHESIS ORNAMENT 276B; 276A # MEDIUM FLATTENED RIGHT PARENTHESIS ORNAMENT 276C; 276D # MEDIUM LEFT-POINTING ANGLE BRACKET ORNAMENT 276D; 276C # MEDIUM RIGHT-POINTING ANGLE BRACKET ORNAMENT 276E; 276F # HEAVY LEFT-POINTING ANGLE QUOTATION MARK ORNAMENT 276F; 276E # HEAVY RIGHT-POINTING ANGLE QUOTATION MARK ORNAMENT 2770; 2771 # HEAVY LEFT-POINTING ANGLE BRACKET ORNAMENT 2771; 2770 # HEAVY RIGHT-POINTING ANGLE BRACKET ORNAMENT 2772; 2773 # LIGHT LEFT TORTOISE SHELL BRACKET ORNAMENT 2773; 2772 # LIGHT RIGHT TORTOISE SHELL BRACKET ORNAMENT 2774; 2775 # MEDIUM LEFT CURLY BRACKET ORNAMENT 2775; 2774 # MEDIUM RIGHT CURLY BRACKET ORNAMENT 27C3; 27C4 # OPEN SUBSET 27C4; 27C3 # OPEN SUPERSET 27C5; 27C6 # LEFT S-SHAPED BAG DELIMITER 27C6; 27C5 # RIGHT S-SHAPED BAG DELIMITER 27C8; 27C9 # REVERSE SOLIDUS PRECEDING SUBSET 27C9; 27C8 # SUPERSET PRECEDING SOLIDUS 27CB; 27CD # MATHEMATICAL RISING DIAGONAL 27CD; 27CB # MATHEMATICAL FALLING DIAGONAL 27D5; 27D6 # LEFT OUTER JOIN 27D6; 27D5 # RIGHT OUTER JOIN 27DC; 22B8 # LEFT MULTIMAP 27DD; 27DE # LONG RIGHT TACK 27DE; 27DD # LONG LEFT TACK 27E2; 27E3 # WHITE CONCAVE-SIDED DIAMOND WITH LEFTWARDS TICK 27E3; 27E2 # WHITE CONCAVE-SIDED DIAMOND WITH RIGHTWARDS TICK 27E4; 27E5 # WHITE SQUARE WITH LEFTWARDS TICK 27E5; 27E4 # WHITE SQUARE WITH RIGHTWARDS TICK 27E6; 27E7 # MATHEMATICAL LEFT WHITE SQUARE BRACKET 27E7; 27E6 # MATHEMATICAL RIGHT WHITE SQUARE BRACKET 27E8; 27E9 # MATHEMATICAL LEFT ANGLE BRACKET 27E9; 27E8 # MATHEMATICAL RIGHT ANGLE BRACKET 27EA; 27EB # MATHEMATICAL LEFT 
DOUBLE ANGLE BRACKET 27EB; 27EA # MATHEMATICAL RIGHT DOUBLE ANGLE BRACKET 27EC; 27ED # MATHEMATICAL LEFT WHITE TORTOISE SHELL BRACKET 27ED; 27EC # MATHEMATICAL RIGHT WHITE TORTOISE SHELL BRACKET 27EE; 27EF # MATHEMATICAL LEFT FLATTENED PARENTHESIS 27EF; 27EE # MATHEMATICAL RIGHT FLATTENED PARENTHESIS 2983; 2984 # LEFT WHITE CURLY BRACKET 2984; 2983 # RIGHT WHITE CURLY BRACKET 2985; 2986 # LEFT WHITE PARENTHESIS 2986; 2985 # RIGHT WHITE PARENTHESIS 2987; 2988 # Z NOTATION LEFT IMAGE BRACKET 2988; 2987 # Z NOTATION RIGHT IMAGE BRACKET 2989; 298A # Z NOTATION LEFT BINDING BRACKET 298A; 2989 # Z NOTATION RIGHT BINDING BRACKET 298B; 298C # LEFT SQUARE BRACKET WITH UNDERBAR 298C; 298B # RIGHT SQUARE BRACKET WITH UNDERBAR 298D; 2990 # LEFT SQUARE BRACKET WITH TICK IN TOP CORNER 298E; 298F # RIGHT SQUARE BRACKET WITH TICK IN BOTTOM CORNER 298F; 298E # LEFT SQUARE BRACKET WITH TICK IN BOTTOM CORNER 2990; 298D # RIGHT SQUARE BRACKET WITH TICK IN TOP CORNER 2991; 2992 # LEFT ANGLE BRACKET WITH DOT 2992; 2991 # RIGHT ANGLE BRACKET WITH DOT 2993; 2994 # LEFT ARC LESS-THAN BRACKET 2994; 2993 # RIGHT ARC GREATER-THAN BRACKET 2995; 2996 # DOUBLE LEFT ARC GREATER-THAN BRACKET 2996; 2995 # DOUBLE RIGHT ARC LESS-THAN BRACKET 2997; 2998 # LEFT BLACK TORTOISE SHELL BRACKET 2998; 2997 # RIGHT BLACK TORTOISE SHELL BRACKET 299B; 2221 # MEASURED ANGLE OPENING LEFT 29A0; 2222 # SPHERICAL ANGLE OPENING LEFT 29A3; 2220 # REVERSED ANGLE 29A4; 29A5 # ANGLE WITH UNDERBAR 29A5; 29A4 # REVERSED ANGLE WITH UNDERBAR 29A8; 29A9 # MEASURED ANGLE WITH OPEN ARM ENDING IN ARROW POINTING UP AND RIGHT 29A9; 29A8 # MEASURED ANGLE WITH OPEN ARM ENDING IN ARROW POINTING UP AND LEFT 29AA; 29AB # MEASURED ANGLE WITH OPEN ARM ENDING IN ARROW POINTING DOWN AND RIGHT 29AB; 29AA # MEASURED ANGLE WITH OPEN ARM ENDING IN ARROW POINTING DOWN AND LEFT 29AC; 29AD # MEASURED ANGLE WITH OPEN ARM ENDING IN ARROW POINTING RIGHT AND UP 29AD; 29AC # MEASURED ANGLE WITH OPEN ARM ENDING IN ARROW POINTING LEFT AND UP 29AE; 29AF 
# MEASURED ANGLE WITH OPEN ARM ENDING IN ARROW POINTING RIGHT AND DOWN 29AF; 29AE # MEASURED ANGLE WITH OPEN ARM ENDING IN ARROW POINTING LEFT AND DOWN 29B8; 2298 # CIRCLED REVERSE SOLIDUS 29C0; 29C1 # CIRCLED LESS-THAN 29C1; 29C0 # CIRCLED GREATER-THAN 29C4; 29C5 # SQUARED RISING DIAGONAL SLASH 29C5; 29C4 # SQUARED FALLING DIAGONAL SLASH 29CF; 29D0 # LEFT TRIANGLE BESIDE VERTICAL BAR 29D0; 29CF # VERTICAL BAR BESIDE RIGHT TRIANGLE 29D1; 29D2 # BOWTIE WITH LEFT HALF BLACK 29D2; 29D1 # BOWTIE WITH RIGHT HALF BLACK 29D4; 29D5 # TIMES WITH LEFT HALF BLACK 29D5; 29D4 # TIMES WITH RIGHT HALF BLACK 29D8; 29D9 # LEFT WIGGLY FENCE 29D9; 29D8 # RIGHT WIGGLY FENCE 29DA; 29DB # LEFT DOUBLE WIGGLY FENCE 29DB; 29DA # RIGHT DOUBLE WIGGLY FENCE 29E8; 29E9 # DOWN-POINTING TRIANGLE WITH LEFT HALF BLACK 29E9; 29E8 # DOWN-POINTING TRIANGLE WITH RIGHT HALF BLACK 29F5; 2215 # REVERSE SOLIDUS OPERATOR 29F8; 29F9 # BIG SOLIDUS 29F9; 29F8 # BIG REVERSE SOLIDUS 29FC; 29FD # LEFT-POINTING CURVED ANGLE BRACKET 29FD; 29FC # RIGHT-POINTING CURVED ANGLE BRACKET 2A2B; 2A2C # MINUS SIGN WITH FALLING DOTS 2A2C; 2A2B # MINUS SIGN WITH RISING DOTS 2A2D; 2A2E # PLUS SIGN IN LEFT HALF CIRCLE 2A2E; 2A2D # PLUS SIGN IN RIGHT HALF CIRCLE 2A34; 2A35 # MULTIPLICATION SIGN IN LEFT HALF CIRCLE 2A35; 2A34 # MULTIPLICATION SIGN IN RIGHT HALF CIRCLE 2A3C; 2A3D # INTERIOR PRODUCT 2A3D; 2A3C # RIGHTHAND INTERIOR PRODUCT 2A64; 2A65 # Z NOTATION DOMAIN ANTIRESTRICTION 2A65; 2A64 # Z NOTATION RANGE ANTIRESTRICTION 2A79; 2A7A # LESS-THAN WITH CIRCLE INSIDE 2A7A; 2A79 # GREATER-THAN WITH CIRCLE INSIDE 2A7B; 2A7C # [BEST FIT] LESS-THAN WITH QUESTION MARK ABOVE 2A7C; 2A7B # [BEST FIT] GREATER-THAN WITH QUESTION MARK ABOVE 2A7D; 2A7E # LESS-THAN OR SLANTED EQUAL TO 2A7E; 2A7D # GREATER-THAN OR SLANTED EQUAL TO 2A7F; 2A80 # LESS-THAN OR SLANTED EQUAL TO WITH DOT INSIDE 2A80; 2A7F # GREATER-THAN OR SLANTED EQUAL TO WITH DOT INSIDE 2A81; 2A82 # LESS-THAN OR SLANTED EQUAL TO WITH DOT ABOVE 2A82; 2A81 # GREATER-THAN OR 
SLANTED EQUAL TO WITH DOT ABOVE 2A83; 2A84 # LESS-THAN OR SLANTED EQUAL TO WITH DOT ABOVE RIGHT 2A84; 2A83 # GREATER-THAN OR SLANTED EQUAL TO WITH DOT ABOVE LEFT 2A85; 2A86 # [BEST FIT] LESS-THAN OR APPROXIMATE 2A86; 2A85 # [BEST FIT] GREATER-THAN OR APPROXIMATE 2A87; 2A88 # [BEST FIT] LESS-THAN AND SINGLE-LINE NOT EQUAL TO 2A88; 2A87 # [BEST FIT] GREATER-THAN AND SINGLE-LINE NOT EQUAL TO 2A89; 2A8A # [BEST FIT] LESS-THAN AND NOT APPROXIMATE 2A8A; 2A89 # [BEST FIT] GREATER-THAN AND NOT APPROXIMATE 2A8B; 2A8C # LESS-THAN ABOVE DOUBLE-LINE EQUAL ABOVE GREATER-THAN 2A8C; 2A8B # GREATER-THAN ABOVE DOUBLE-LINE EQUAL ABOVE LESS-THAN 2A8D; 2A8E # [BEST FIT] LESS-THAN ABOVE SIMILAR OR EQUAL 2A8E; 2A8D # [BEST FIT] GREATER-THAN ABOVE SIMILAR OR EQUAL 2A8F; 2A90 # [BEST FIT] LESS-THAN ABOVE SIMILAR ABOVE GREATER-THAN 2A90; 2A8F # [BEST FIT] GREATER-THAN ABOVE SIMILAR ABOVE LESS-THAN 2A91; 2A92 # LESS-THAN ABOVE GREATER-THAN ABOVE DOUBLE-LINE EQUAL 2A92; 2A91 # GREATER-THAN ABOVE LESS-THAN ABOVE DOUBLE-LINE EQUAL 2A93; 2A94 # LESS-THAN ABOVE SLANTED EQUAL ABOVE GREATER-THAN ABOVE SLANTED EQUAL 2A94; 2A93 # GREATER-THAN ABOVE SLANTED EQUAL ABOVE LESS-THAN ABOVE SLANTED EQUAL 2A95; 2A96 # SLANTED EQUAL TO OR LESS-THAN 2A96; 2A95 # SLANTED EQUAL TO OR GREATER-THAN 2A97; 2A98 # SLANTED EQUAL TO OR LESS-THAN WITH DOT INSIDE 2A98; 2A97 # SLANTED EQUAL TO OR GREATER-THAN WITH DOT INSIDE 2A99; 2A9A # DOUBLE-LINE EQUAL TO OR LESS-THAN 2A9A; 2A99 # DOUBLE-LINE EQUAL TO OR GREATER-THAN 2A9B; 2A9C # DOUBLE-LINE SLANTED EQUAL TO OR LESS-THAN 2A9C; 2A9B # DOUBLE-LINE SLANTED EQUAL TO OR GREATER-THAN 2A9D; 2A9E # [BEST FIT] SIMILAR OR LESS-THAN 2A9E; 2A9D # [BEST FIT] SIMILAR OR GREATER-THAN 2A9F; 2AA0 # [BEST FIT] SIMILAR ABOVE LESS-THAN ABOVE EQUALS SIGN 2AA0; 2A9F # [BEST FIT] SIMILAR ABOVE GREATER-THAN ABOVE EQUALS SIGN 2AA1; 2AA2 # DOUBLE NESTED LESS-THAN 2AA2; 2AA1 # DOUBLE NESTED GREATER-THAN 2AA6; 2AA7 # LESS-THAN CLOSED BY CURVE 2AA7; 2AA6 # GREATER-THAN CLOSED BY CURVE 2AA8; 2AA9 
# LESS-THAN CLOSED BY CURVE ABOVE SLANTED EQUAL 2AA9; 2AA8 # GREATER-THAN CLOSED BY CURVE ABOVE SLANTED EQUAL 2AAA; 2AAB # SMALLER THAN 2AAB; 2AAA # LARGER THAN 2AAC; 2AAD # SMALLER THAN OR EQUAL TO 2AAD; 2AAC # LARGER THAN OR EQUAL TO 2AAF; 2AB0 # PRECEDES ABOVE SINGLE-LINE EQUALS SIGN 2AB0; 2AAF # SUCCEEDS ABOVE SINGLE-LINE EQUALS SIGN 2AB1; 2AB2 # [BEST FIT] PRECEDES ABOVE SINGLE-LINE NOT EQUAL TO 2AB2; 2AB1 # [BEST FIT] SUCCEEDS ABOVE SINGLE-LINE NOT EQUAL TO 2AB3; 2AB4 # PRECEDES ABOVE EQUALS SIGN 2AB4; 2AB3 # SUCCEEDS ABOVE EQUALS SIGN 2AB5; 2AB6 # [BEST FIT] PRECEDES ABOVE NOT EQUAL TO 2AB6; 2AB5 # [BEST FIT] SUCCEEDS ABOVE NOT EQUAL TO 2AB7; 2AB8 # [BEST FIT] PRECEDES ABOVE ALMOST EQUAL TO 2AB8; 2AB7 # [BEST FIT] SUCCEEDS ABOVE ALMOST EQUAL TO 2AB9; 2ABA # [BEST FIT] PRECEDES ABOVE NOT ALMOST EQUAL TO 2ABA; 2AB9 # [BEST FIT] SUCCEEDS ABOVE NOT ALMOST EQUAL TO 2ABB; 2ABC # DOUBLE PRECEDES 2ABC; 2ABB # DOUBLE SUCCEEDS 2ABD; 2ABE # SUBSET WITH DOT 2ABE; 2ABD # SUPERSET WITH DOT 2ABF; 2AC0 # SUBSET WITH PLUS SIGN BELOW 2AC0; 2ABF # SUPERSET WITH PLUS SIGN BELOW 2AC1; 2AC2 # SUBSET WITH MULTIPLICATION SIGN BELOW 2AC2; 2AC1 # SUPERSET WITH MULTIPLICATION SIGN BELOW 2AC3; 2AC4 # SUBSET OF OR EQUAL TO WITH DOT ABOVE 2AC4; 2AC3 # SUPERSET OF OR EQUAL TO WITH DOT ABOVE 2AC5; 2AC6 # SUBSET OF ABOVE EQUALS SIGN 2AC6; 2AC5 # SUPERSET OF ABOVE EQUALS SIGN 2AC7; 2AC8 # [BEST FIT] SUBSET OF ABOVE TILDE OPERATOR 2AC8; 2AC7 # [BEST FIT] SUPERSET OF ABOVE TILDE OPERATOR 2AC9; 2ACA # [BEST FIT] SUBSET OF ABOVE ALMOST EQUAL TO 2ACA; 2AC9 # [BEST FIT] SUPERSET OF ABOVE ALMOST EQUAL TO 2ACB; 2ACC # [BEST FIT] SUBSET OF ABOVE NOT EQUAL TO 2ACC; 2ACB # [BEST FIT] SUPERSET OF ABOVE NOT EQUAL TO 2ACD; 2ACE # SQUARE LEFT OPEN BOX OPERATOR 2ACE; 2ACD # SQUARE RIGHT OPEN BOX OPERATOR 2ACF; 2AD0 # CLOSED SUBSET 2AD0; 2ACF # CLOSED SUPERSET 2AD1; 2AD2 # CLOSED SUBSET OR EQUAL TO 2AD2; 2AD1 # CLOSED SUPERSET OR EQUAL TO 2AD3; 2AD4 # SUBSET ABOVE SUPERSET 2AD4; 2AD3 # SUPERSET ABOVE SUBSET 
2AD5; 2AD6 # SUBSET ABOVE SUBSET 2AD6; 2AD5 # SUPERSET ABOVE SUPERSET 2ADE; 22A6 # SHORT LEFT TACK 2AE3; 22A9 # DOUBLE VERTICAL BAR LEFT TURNSTILE 2AE4; 22A8 # VERTICAL BAR DOUBLE LEFT TURNSTILE 2AE5; 22AB # DOUBLE VERTICAL BAR DOUBLE LEFT TURNSTILE 2AEC; 2AED # DOUBLE STROKE NOT SIGN 2AED; 2AEC # REVERSED DOUBLE STROKE NOT SIGN 2AEE; 2224 # DOES NOT DIVIDE WITH REVERSED NEGATION SLASH 2AF7; 2AF8 # TRIPLE NESTED LESS-THAN 2AF8; 2AF7 # TRIPLE NESTED GREATER-THAN 2AF9; 2AFA # DOUBLE-LINE SLANTED LESS-THAN OR EQUAL TO 2AFA; 2AF9 # DOUBLE-LINE SLANTED GREATER-THAN OR EQUAL TO 2BFE; 221F # REVERSED RIGHT ANGLE 2E02; 2E03 # LEFT SUBSTITUTION BRACKET 2E03; 2E02 # RIGHT SUBSTITUTION BRACKET 2E04; 2E05 # LEFT DOTTED SUBSTITUTION BRACKET 2E05; 2E04 # RIGHT DOTTED SUBSTITUTION BRACKET 2E09; 2E0A # LEFT TRANSPOSITION BRACKET 2E0A; 2E09 # RIGHT TRANSPOSITION BRACKET 2E0C; 2E0D # LEFT RAISED OMISSION BRACKET 2E0D; 2E0C # RIGHT RAISED OMISSION BRACKET 2E1C; 2E1D # LEFT LOW PARAPHRASE BRACKET 2E1D; 2E1C # RIGHT LOW PARAPHRASE BRACKET 2E20; 2E21 # LEFT VERTICAL BAR WITH QUILL 2E21; 2E20 # RIGHT VERTICAL BAR WITH QUILL 2E22; 2E23 # TOP LEFT HALF BRACKET 2E23; 2E22 # TOP RIGHT HALF BRACKET 2E24; 2E25 # BOTTOM LEFT HALF BRACKET 2E25; 2E24 # BOTTOM RIGHT HALF BRACKET 2E26; 2E27 # LEFT SIDEWAYS U BRACKET 2E27; 2E26 # RIGHT SIDEWAYS U BRACKET 2E28; 2E29 # LEFT DOUBLE PARENTHESIS 2E29; 2E28 # RIGHT DOUBLE PARENTHESIS 2E55; 2E56 # LEFT SQUARE BRACKET WITH STROKE 2E56; 2E55 # RIGHT SQUARE BRACKET WITH STROKE 2E57; 2E58 # LEFT SQUARE BRACKET WITH DOUBLE STROKE 2E58; 2E57 # RIGHT SQUARE BRACKET WITH DOUBLE STROKE 2E59; 2E5A # TOP HALF LEFT PARENTHESIS 2E5A; 2E59 # TOP HALF RIGHT PARENTHESIS 2E5B; 2E5C # BOTTOM HALF LEFT PARENTHESIS 2E5C; 2E5B # BOTTOM HALF RIGHT PARENTHESIS 3008; 3009 # LEFT ANGLE BRACKET 3009; 3008 # RIGHT ANGLE BRACKET 300A; 300B # LEFT DOUBLE ANGLE BRACKET 300B; 300A # RIGHT DOUBLE ANGLE BRACKET 300C; 300D # [BEST FIT] LEFT CORNER BRACKET 300D; 300C # [BEST FIT] RIGHT 
CORNER BRACKET 300E; 300F # [BEST FIT] LEFT WHITE CORNER BRACKET 300F; 300E # [BEST FIT] RIGHT WHITE CORNER BRACKET 3010; 3011 # LEFT BLACK LENTICULAR BRACKET 3011; 3010 # RIGHT BLACK LENTICULAR BRACKET 3014; 3015 # LEFT TORTOISE SHELL BRACKET 3015; 3014 # RIGHT TORTOISE SHELL BRACKET 3016; 3017 # LEFT WHITE LENTICULAR BRACKET 3017; 3016 # RIGHT WHITE LENTICULAR BRACKET 3018; 3019 # LEFT WHITE TORTOISE SHELL BRACKET 3019; 3018 # RIGHT WHITE TORTOISE SHELL BRACKET 301A; 301B # LEFT WHITE SQUARE BRACKET 301B; 301A # RIGHT WHITE SQUARE BRACKET FE59; FE5A # SMALL LEFT PARENTHESIS FE5A; FE59 # SMALL RIGHT PARENTHESIS FE5B; FE5C # SMALL LEFT CURLY BRACKET FE5C; FE5B # SMALL RIGHT CURLY BRACKET FE5D; FE5E # SMALL LEFT TORTOISE SHELL BRACKET FE5E; FE5D # SMALL RIGHT TORTOISE SHELL BRACKET FE64; FE65 # SMALL LESS-THAN SIGN FE65; FE64 # SMALL GREATER-THAN SIGN FF08; FF09 # FULLWIDTH LEFT PARENTHESIS FF09; FF08 # FULLWIDTH RIGHT PARENTHESIS FF1C; FF1E # FULLWIDTH LESS-THAN SIGN FF1E; FF1C # FULLWIDTH GREATER-THAN SIGN FF3B; FF3D # FULLWIDTH LEFT SQUARE BRACKET FF3D; FF3B # FULLWIDTH RIGHT SQUARE BRACKET FF5B; FF5D # FULLWIDTH LEFT CURLY BRACKET FF5D; FF5B # FULLWIDTH RIGHT CURLY BRACKET FF5F; FF60 # FULLWIDTH LEFT WHITE PARENTHESIS FF60; FF5F # FULLWIDTH RIGHT WHITE PARENTHESIS FF62; FF63 # [BEST FIT] HALFWIDTH LEFT CORNER BRACKET FF63; FF62 # [BEST FIT] HALFWIDTH RIGHT CORNER BRACKET # The following characters have no appropriate mirroring character. # For these characters it is up to the rendering system # to provide mirrored glyphs. 
# 2140; DOUBLE-STRUCK N-ARY SUMMATION # 2201; COMPLEMENT # 2202; PARTIAL DIFFERENTIAL # 2203; THERE EXISTS # 2204; THERE DOES NOT EXIST # 2211; N-ARY SUMMATION # 2216; SET MINUS # 221A; SQUARE ROOT # 221B; CUBE ROOT # 221C; FOURTH ROOT # 221D; PROPORTIONAL TO # 2226; NOT PARALLEL TO # 222B; INTEGRAL # 222C; DOUBLE INTEGRAL # 222D; TRIPLE INTEGRAL # 222E; CONTOUR INTEGRAL # 222F; SURFACE INTEGRAL # 2230; VOLUME INTEGRAL # 2231; CLOCKWISE INTEGRAL # 2232; CLOCKWISE CONTOUR INTEGRAL # 2233; ANTICLOCKWISE CONTOUR INTEGRAL # 2239; EXCESS # 223B; HOMOTHETIC # 223E; INVERTED LAZY S # 223F; SINE WAVE # 2240; WREATH PRODUCT # 2241; NOT TILDE # 2242; MINUS TILDE # 2244; NOT ASYMPTOTICALLY EQUAL TO # 2246; APPROXIMATELY BUT NOT ACTUALLY EQUAL TO # 2247; NEITHER APPROXIMATELY NOR ACTUALLY EQUAL TO # 2248; ALMOST EQUAL TO # 2249; NOT ALMOST EQUAL TO # 224A; ALMOST EQUAL OR EQUAL TO # 224B; TRIPLE TILDE # 225F; QUESTIONED EQUAL TO # 2260; NOT EQUAL TO # 2262; NOT IDENTICAL TO # 226D; NOT EQUIVALENT TO # 228C; MULTISET # 22A7; MODELS # 22AA; TRIPLE VERTICAL BAR RIGHT TURNSTILE # 22AC; DOES NOT PROVE # 22AD; NOT TRUE # 22AE; DOES NOT FORCE # 22AF; NEGATED DOUBLE VERTICAL BAR DOUBLE RIGHT TURNSTILE # 22BE; RIGHT ANGLE WITH ARC # 22BF; RIGHT TRIANGLE # 22F5; ELEMENT OF WITH DOT ABOVE # 22F8; ELEMENT OF WITH UNDERBAR # 22F9; ELEMENT OF WITH TWO HORIZONTAL STROKES # 22FF; Z NOTATION BAG MEMBERSHIP # 2320; TOP HALF INTEGRAL # 2321; BOTTOM HALF INTEGRAL # 27C0; THREE DIMENSIONAL ANGLE # 27CC; LONG DIVISION # 27D3; LOWER RIGHT CORNER WITH DOT # 27D4; UPPER LEFT CORNER WITH DOT # 299C; RIGHT ANGLE VARIANT WITH SQUARE # 299D; MEASURED RIGHT ANGLE WITH DOT # 299E; ANGLE WITH S INSIDE # 299F; ACUTE ANGLE # 29A2; TURNED ANGLE # 29A6; OBLIQUE ANGLE OPENING UP # 29A7; OBLIQUE ANGLE OPENING DOWN # 29C2; CIRCLE WITH SMALL CIRCLE TO THE RIGHT # 29C3; CIRCLE WITH TWO HORIZONTAL STROKES TO THE RIGHT # 29C9; TWO JOINED SQUARES # 29CE; RIGHT TRIANGLE ABOVE LEFT TRIANGLE # 29DC; INCOMPLETE INFINITY # 
29E1; INCREASES AS # 29E3; EQUALS SIGN AND SLANTED PARALLEL # 29E4; EQUALS SIGN AND SLANTED PARALLEL WITH TILDE ABOVE # 29E5; IDENTICAL TO AND SLANTED PARALLEL # 29F4; RULE-DELAYED # 29F6; SOLIDUS WITH OVERBAR # 29F7; REVERSE SOLIDUS WITH HORIZONTAL STROKE # 2A0A; MODULO TWO SUM # 2A0B; SUMMATION WITH INTEGRAL # 2A0C; QUADRUPLE INTEGRAL OPERATOR # 2A0D; FINITE PART INTEGRAL # 2A0E; INTEGRAL WITH DOUBLE STROKE # 2A0F; INTEGRAL AVERAGE WITH SLASH # 2A10; CIRCULATION FUNCTION # 2A11; ANTICLOCKWISE INTEGRATION # 2A12; LINE INTEGRATION WITH RECTANGULAR PATH AROUND POLE # 2A13; LINE INTEGRATION WITH SEMICIRCULAR PATH AROUND POLE # 2A14; LINE INTEGRATION NOT INCLUDING THE POLE # 2A15; INTEGRAL AROUND A POINT OPERATOR # 2A16; QUATERNION INTEGRAL OPERATOR # 2A17; INTEGRAL WITH LEFTWARDS ARROW WITH HOOK # 2A18; INTEGRAL WITH TIMES SIGN # 2A19; INTEGRAL WITH INTERSECTION # 2A1A; INTEGRAL WITH UNION # 2A1B; INTEGRAL WITH OVERBAR # 2A1C; INTEGRAL WITH UNDERBAR # 2A1E; LARGE LEFT TRIANGLE OPERATOR # 2A1F; Z NOTATION SCHEMA COMPOSITION # 2A20; Z NOTATION SCHEMA PIPING # 2A21; Z NOTATION SCHEMA PROJECTION # 2A24; PLUS SIGN WITH TILDE ABOVE # 2A26; PLUS SIGN WITH TILDE BELOW # 2A29; MINUS SIGN WITH COMMA ABOVE # 2A3E; Z NOTATION RELATIONAL COMPOSITION # 2A57; SLOPING LARGE OR # 2A58; SLOPING LARGE AND # 2A6A; TILDE OPERATOR WITH DOT ABOVE # 2A6B; TILDE OPERATOR WITH RISING DOTS # 2A6C; SIMILAR MINUS SIMILAR # 2A6D; CONGRUENT WITH DOT ABOVE # 2A6F; ALMOST EQUAL TO WITH CIRCUMFLEX ACCENT # 2A70; APPROXIMATELY EQUAL OR EQUAL TO # 2A73; EQUALS SIGN ABOVE TILDE OPERATOR # 2A74; DOUBLE COLON EQUAL # 2AA3; DOUBLE NESTED LESS-THAN WITH UNDERBAR # 2ADC; FORKING # 2AE2; VERTICAL BAR TRIPLE RIGHT TURNSTILE # 2AE6; LONG DASH FROM LEFT MEMBER OF DOUBLE VERTICAL # 2AF3; PARALLEL WITH TILDE OPERATOR # 2AFB; TRIPLE SOLIDUS BINARY RELATION # 2AFD; DOUBLE SOLIDUS OPERATOR # 1D6DB; MATHEMATICAL BOLD PARTIAL DIFFERENTIAL # 1D715; MATHEMATICAL ITALIC PARTIAL DIFFERENTIAL # 1D74F; MATHEMATICAL BOLD 
ITALIC PARTIAL DIFFERENTIAL # 1D789; MATHEMATICAL SANS-SERIF BOLD PARTIAL DIFFERENTIAL # 1D7C3; MATHEMATICAL SANS-SERIF BOLD ITALIC PARTIAL DIFFERENTIAL # EOF ================================================ FILE: maint/Unicode.tables/CaseFolding.txt ================================================ # CaseFolding-17.0.0.txt # Date: 2025-07-30, 23:54:36 GMT # © 2025 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use and license, see https://www.unicode.org/terms_of_use.html # # Unicode Character Database # For documentation, see https://www.unicode.org/reports/tr44/ # # Case Folding Properties # # This file is a supplement to the UnicodeData file. # It provides a case folding mapping generated from the Unicode Character Database. # If all characters are mapped according to the full mapping below, then # case differences (according to UnicodeData.txt and SpecialCasing.txt) # are eliminated. # # The data supports both implementations that require simple case foldings # (where string lengths don't change), and implementations that allow full case folding # (where string lengths may grow). Note that where they can be supported, the # full case foldings are superior: for example, they allow "FUSS" and "Fuß" to match. # # All code points not listed in this file map to themselves. # # NOTE: case folding does not preserve normalization formats! # # For information on case folding, including how to have case folding # preserve normalization formats, see the # "Conformance" / "Default Case Algorithms" section of the core specification. 
# # ================================================================================ # Format # ================================================================================ # The entries in this file are in the following machine-readable format: # # <code>; <status>; <mapping>; # <name> # # The status field is: # C: common case folding, common mappings shared by both simple and full mappings. # F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces. # S: simple case folding, mappings to single characters where different from F. # T: special case for uppercase I and dotted uppercase I # - For non-Turkic languages, this mapping is normally not used. # - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters. # Note that the Turkic mappings do not maintain canonical equivalence without additional processing. # See the discussions of case mapping in the Unicode Standard for more information. # # Usage: # A. To do a simple case folding, use the mappings with status C + S. # B. To do a full case folding, use the mappings with status C + F. # # The mappings with status T can be used or omitted depending on the desired case-folding # behavior. (The default option is to exclude them.) # # ================================================================= # Property: Case_Folding # All code points not explicitly listed for Case_Folding # have the value C for the status field, and the code point itself for the mapping field.
# ================================================================= 0041; C; 0061; # LATIN CAPITAL LETTER A 0042; C; 0062; # LATIN CAPITAL LETTER B 0043; C; 0063; # LATIN CAPITAL LETTER C 0044; C; 0064; # LATIN CAPITAL LETTER D 0045; C; 0065; # LATIN CAPITAL LETTER E 0046; C; 0066; # LATIN CAPITAL LETTER F 0047; C; 0067; # LATIN CAPITAL LETTER G 0048; C; 0068; # LATIN CAPITAL LETTER H 0049; C; 0069; # LATIN CAPITAL LETTER I 0049; T; 0131; # LATIN CAPITAL LETTER I 004A; C; 006A; # LATIN CAPITAL LETTER J 004B; C; 006B; # LATIN CAPITAL LETTER K 004C; C; 006C; # LATIN CAPITAL LETTER L 004D; C; 006D; # LATIN CAPITAL LETTER M 004E; C; 006E; # LATIN CAPITAL LETTER N 004F; C; 006F; # LATIN CAPITAL LETTER O 0050; C; 0070; # LATIN CAPITAL LETTER P 0051; C; 0071; # LATIN CAPITAL LETTER Q 0052; C; 0072; # LATIN CAPITAL LETTER R 0053; C; 0073; # LATIN CAPITAL LETTER S 0054; C; 0074; # LATIN CAPITAL LETTER T 0055; C; 0075; # LATIN CAPITAL LETTER U 0056; C; 0076; # LATIN CAPITAL LETTER V 0057; C; 0077; # LATIN CAPITAL LETTER W 0058; C; 0078; # LATIN CAPITAL LETTER X 0059; C; 0079; # LATIN CAPITAL LETTER Y 005A; C; 007A; # LATIN CAPITAL LETTER Z 00B5; C; 03BC; # MICRO SIGN 00C0; C; 00E0; # LATIN CAPITAL LETTER A WITH GRAVE 00C1; C; 00E1; # LATIN CAPITAL LETTER A WITH ACUTE 00C2; C; 00E2; # LATIN CAPITAL LETTER A WITH CIRCUMFLEX 00C3; C; 00E3; # LATIN CAPITAL LETTER A WITH TILDE 00C4; C; 00E4; # LATIN CAPITAL LETTER A WITH DIAERESIS 00C5; C; 00E5; # LATIN CAPITAL LETTER A WITH RING ABOVE 00C6; C; 00E6; # LATIN CAPITAL LETTER AE 00C7; C; 00E7; # LATIN CAPITAL LETTER C WITH CEDILLA 00C8; C; 00E8; # LATIN CAPITAL LETTER E WITH GRAVE 00C9; C; 00E9; # LATIN CAPITAL LETTER E WITH ACUTE 00CA; C; 00EA; # LATIN CAPITAL LETTER E WITH CIRCUMFLEX 00CB; C; 00EB; # LATIN CAPITAL LETTER E WITH DIAERESIS 00CC; C; 00EC; # LATIN CAPITAL LETTER I WITH GRAVE 00CD; C; 00ED; # LATIN CAPITAL LETTER I WITH ACUTE 00CE; C; 00EE; # LATIN CAPITAL LETTER I WITH CIRCUMFLEX 00CF; C; 00EF; # LATIN CAPITAL LETTER 
I WITH DIAERESIS 00D0; C; 00F0; # LATIN CAPITAL LETTER ETH 00D1; C; 00F1; # LATIN CAPITAL LETTER N WITH TILDE 00D2; C; 00F2; # LATIN CAPITAL LETTER O WITH GRAVE 00D3; C; 00F3; # LATIN CAPITAL LETTER O WITH ACUTE 00D4; C; 00F4; # LATIN CAPITAL LETTER O WITH CIRCUMFLEX 00D5; C; 00F5; # LATIN CAPITAL LETTER O WITH TILDE 00D6; C; 00F6; # LATIN CAPITAL LETTER O WITH DIAERESIS 00D8; C; 00F8; # LATIN CAPITAL LETTER O WITH STROKE 00D9; C; 00F9; # LATIN CAPITAL LETTER U WITH GRAVE 00DA; C; 00FA; # LATIN CAPITAL LETTER U WITH ACUTE 00DB; C; 00FB; # LATIN CAPITAL LETTER U WITH CIRCUMFLEX 00DC; C; 00FC; # LATIN CAPITAL LETTER U WITH DIAERESIS 00DD; C; 00FD; # LATIN CAPITAL LETTER Y WITH ACUTE 00DE; C; 00FE; # LATIN CAPITAL LETTER THORN 00DF; F; 0073 0073; # LATIN SMALL LETTER SHARP S 0100; C; 0101; # LATIN CAPITAL LETTER A WITH MACRON 0102; C; 0103; # LATIN CAPITAL LETTER A WITH BREVE 0104; C; 0105; # LATIN CAPITAL LETTER A WITH OGONEK 0106; C; 0107; # LATIN CAPITAL LETTER C WITH ACUTE 0108; C; 0109; # LATIN CAPITAL LETTER C WITH CIRCUMFLEX 010A; C; 010B; # LATIN CAPITAL LETTER C WITH DOT ABOVE 010C; C; 010D; # LATIN CAPITAL LETTER C WITH CARON 010E; C; 010F; # LATIN CAPITAL LETTER D WITH CARON 0110; C; 0111; # LATIN CAPITAL LETTER D WITH STROKE 0112; C; 0113; # LATIN CAPITAL LETTER E WITH MACRON 0114; C; 0115; # LATIN CAPITAL LETTER E WITH BREVE 0116; C; 0117; # LATIN CAPITAL LETTER E WITH DOT ABOVE 0118; C; 0119; # LATIN CAPITAL LETTER E WITH OGONEK 011A; C; 011B; # LATIN CAPITAL LETTER E WITH CARON 011C; C; 011D; # LATIN CAPITAL LETTER G WITH CIRCUMFLEX 011E; C; 011F; # LATIN CAPITAL LETTER G WITH BREVE 0120; C; 0121; # LATIN CAPITAL LETTER G WITH DOT ABOVE 0122; C; 0123; # LATIN CAPITAL LETTER G WITH CEDILLA 0124; C; 0125; # LATIN CAPITAL LETTER H WITH CIRCUMFLEX 0126; C; 0127; # LATIN CAPITAL LETTER H WITH STROKE 0128; C; 0129; # LATIN CAPITAL LETTER I WITH TILDE 012A; C; 012B; # LATIN CAPITAL LETTER I WITH MACRON 012C; C; 012D; # LATIN CAPITAL LETTER I WITH BREVE 012E; 
C; 012F; # LATIN CAPITAL LETTER I WITH OGONEK 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE 0132; C; 0133; # LATIN CAPITAL LIGATURE IJ 0134; C; 0135; # LATIN CAPITAL LETTER J WITH CIRCUMFLEX 0136; C; 0137; # LATIN CAPITAL LETTER K WITH CEDILLA 0139; C; 013A; # LATIN CAPITAL LETTER L WITH ACUTE 013B; C; 013C; # LATIN CAPITAL LETTER L WITH CEDILLA 013D; C; 013E; # LATIN CAPITAL LETTER L WITH CARON 013F; C; 0140; # LATIN CAPITAL LETTER L WITH MIDDLE DOT 0141; C; 0142; # LATIN CAPITAL LETTER L WITH STROKE 0143; C; 0144; # LATIN CAPITAL LETTER N WITH ACUTE 0145; C; 0146; # LATIN CAPITAL LETTER N WITH CEDILLA 0147; C; 0148; # LATIN CAPITAL LETTER N WITH CARON 0149; F; 02BC 006E; # LATIN SMALL LETTER N PRECEDED BY APOSTROPHE 014A; C; 014B; # LATIN CAPITAL LETTER ENG 014C; C; 014D; # LATIN CAPITAL LETTER O WITH MACRON 014E; C; 014F; # LATIN CAPITAL LETTER O WITH BREVE 0150; C; 0151; # LATIN CAPITAL LETTER O WITH DOUBLE ACUTE 0152; C; 0153; # LATIN CAPITAL LIGATURE OE 0154; C; 0155; # LATIN CAPITAL LETTER R WITH ACUTE 0156; C; 0157; # LATIN CAPITAL LETTER R WITH CEDILLA 0158; C; 0159; # LATIN CAPITAL LETTER R WITH CARON 015A; C; 015B; # LATIN CAPITAL LETTER S WITH ACUTE 015C; C; 015D; # LATIN CAPITAL LETTER S WITH CIRCUMFLEX 015E; C; 015F; # LATIN CAPITAL LETTER S WITH CEDILLA 0160; C; 0161; # LATIN CAPITAL LETTER S WITH CARON 0162; C; 0163; # LATIN CAPITAL LETTER T WITH CEDILLA 0164; C; 0165; # LATIN CAPITAL LETTER T WITH CARON 0166; C; 0167; # LATIN CAPITAL LETTER T WITH STROKE 0168; C; 0169; # LATIN CAPITAL LETTER U WITH TILDE 016A; C; 016B; # LATIN CAPITAL LETTER U WITH MACRON 016C; C; 016D; # LATIN CAPITAL LETTER U WITH BREVE 016E; C; 016F; # LATIN CAPITAL LETTER U WITH RING ABOVE 0170; C; 0171; # LATIN CAPITAL LETTER U WITH DOUBLE ACUTE 0172; C; 0173; # LATIN CAPITAL LETTER U WITH OGONEK 0174; C; 0175; # LATIN CAPITAL LETTER W WITH CIRCUMFLEX 0176; C; 0177; # LATIN CAPITAL LETTER Y WITH CIRCUMFLEX 
0178; C; 00FF; # LATIN CAPITAL LETTER Y WITH DIAERESIS 0179; C; 017A; # LATIN CAPITAL LETTER Z WITH ACUTE 017B; C; 017C; # LATIN CAPITAL LETTER Z WITH DOT ABOVE 017D; C; 017E; # LATIN CAPITAL LETTER Z WITH CARON 017F; C; 0073; # LATIN SMALL LETTER LONG S 0181; C; 0253; # LATIN CAPITAL LETTER B WITH HOOK 0182; C; 0183; # LATIN CAPITAL LETTER B WITH TOPBAR 0184; C; 0185; # LATIN CAPITAL LETTER TONE SIX 0186; C; 0254; # LATIN CAPITAL LETTER OPEN O 0187; C; 0188; # LATIN CAPITAL LETTER C WITH HOOK 0189; C; 0256; # LATIN CAPITAL LETTER AFRICAN D 018A; C; 0257; # LATIN CAPITAL LETTER D WITH HOOK 018B; C; 018C; # LATIN CAPITAL LETTER D WITH TOPBAR 018E; C; 01DD; # LATIN CAPITAL LETTER REVERSED E 018F; C; 0259; # LATIN CAPITAL LETTER SCHWA 0190; C; 025B; # LATIN CAPITAL LETTER OPEN E 0191; C; 0192; # LATIN CAPITAL LETTER F WITH HOOK 0193; C; 0260; # LATIN CAPITAL LETTER G WITH HOOK 0194; C; 0263; # LATIN CAPITAL LETTER GAMMA 0196; C; 0269; # LATIN CAPITAL LETTER IOTA 0197; C; 0268; # LATIN CAPITAL LETTER I WITH STROKE 0198; C; 0199; # LATIN CAPITAL LETTER K WITH HOOK 019C; C; 026F; # LATIN CAPITAL LETTER TURNED M 019D; C; 0272; # LATIN CAPITAL LETTER N WITH LEFT HOOK 019F; C; 0275; # LATIN CAPITAL LETTER O WITH MIDDLE TILDE 01A0; C; 01A1; # LATIN CAPITAL LETTER O WITH HORN 01A2; C; 01A3; # LATIN CAPITAL LETTER OI 01A4; C; 01A5; # LATIN CAPITAL LETTER P WITH HOOK 01A6; C; 0280; # LATIN LETTER YR 01A7; C; 01A8; # LATIN CAPITAL LETTER TONE TWO 01A9; C; 0283; # LATIN CAPITAL LETTER ESH 01AC; C; 01AD; # LATIN CAPITAL LETTER T WITH HOOK 01AE; C; 0288; # LATIN CAPITAL LETTER T WITH RETROFLEX HOOK 01AF; C; 01B0; # LATIN CAPITAL LETTER U WITH HORN 01B1; C; 028A; # LATIN CAPITAL LETTER UPSILON 01B2; C; 028B; # LATIN CAPITAL LETTER V WITH HOOK 01B3; C; 01B4; # LATIN CAPITAL LETTER Y WITH HOOK 01B5; C; 01B6; # LATIN CAPITAL LETTER Z WITH STROKE 01B7; C; 0292; # LATIN CAPITAL LETTER EZH 01B8; C; 01B9; # LATIN CAPITAL LETTER EZH REVERSED 01BC; C; 01BD; # LATIN CAPITAL LETTER TONE FIVE 
01C4; C; 01C6; # LATIN CAPITAL LETTER DZ WITH CARON 01C5; C; 01C6; # LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON 01C7; C; 01C9; # LATIN CAPITAL LETTER LJ 01C8; C; 01C9; # LATIN CAPITAL LETTER L WITH SMALL LETTER J 01CA; C; 01CC; # LATIN CAPITAL LETTER NJ 01CB; C; 01CC; # LATIN CAPITAL LETTER N WITH SMALL LETTER J 01CD; C; 01CE; # LATIN CAPITAL LETTER A WITH CARON 01CF; C; 01D0; # LATIN CAPITAL LETTER I WITH CARON 01D1; C; 01D2; # LATIN CAPITAL LETTER O WITH CARON 01D3; C; 01D4; # LATIN CAPITAL LETTER U WITH CARON 01D5; C; 01D6; # LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON 01D7; C; 01D8; # LATIN CAPITAL LETTER U WITH DIAERESIS AND ACUTE 01D9; C; 01DA; # LATIN CAPITAL LETTER U WITH DIAERESIS AND CARON 01DB; C; 01DC; # LATIN CAPITAL LETTER U WITH DIAERESIS AND GRAVE 01DE; C; 01DF; # LATIN CAPITAL LETTER A WITH DIAERESIS AND MACRON 01E0; C; 01E1; # LATIN CAPITAL LETTER A WITH DOT ABOVE AND MACRON 01E2; C; 01E3; # LATIN CAPITAL LETTER AE WITH MACRON 01E4; C; 01E5; # LATIN CAPITAL LETTER G WITH STROKE 01E6; C; 01E7; # LATIN CAPITAL LETTER G WITH CARON 01E8; C; 01E9; # LATIN CAPITAL LETTER K WITH CARON 01EA; C; 01EB; # LATIN CAPITAL LETTER O WITH OGONEK 01EC; C; 01ED; # LATIN CAPITAL LETTER O WITH OGONEK AND MACRON 01EE; C; 01EF; # LATIN CAPITAL LETTER EZH WITH CARON 01F0; F; 006A 030C; # LATIN SMALL LETTER J WITH CARON 01F1; C; 01F3; # LATIN CAPITAL LETTER DZ 01F2; C; 01F3; # LATIN CAPITAL LETTER D WITH SMALL LETTER Z 01F4; C; 01F5; # LATIN CAPITAL LETTER G WITH ACUTE 01F6; C; 0195; # LATIN CAPITAL LETTER HWAIR 01F7; C; 01BF; # LATIN CAPITAL LETTER WYNN 01F8; C; 01F9; # LATIN CAPITAL LETTER N WITH GRAVE 01FA; C; 01FB; # LATIN CAPITAL LETTER A WITH RING ABOVE AND ACUTE 01FC; C; 01FD; # LATIN CAPITAL LETTER AE WITH ACUTE 01FE; C; 01FF; # LATIN CAPITAL LETTER O WITH STROKE AND ACUTE 0200; C; 0201; # LATIN CAPITAL LETTER A WITH DOUBLE GRAVE 0202; C; 0203; # LATIN CAPITAL LETTER A WITH INVERTED BREVE 0204; C; 0205; # LATIN CAPITAL LETTER E WITH DOUBLE GRAVE 
0206; C; 0207; # LATIN CAPITAL LETTER E WITH INVERTED BREVE 0208; C; 0209; # LATIN CAPITAL LETTER I WITH DOUBLE GRAVE 020A; C; 020B; # LATIN CAPITAL LETTER I WITH INVERTED BREVE 020C; C; 020D; # LATIN CAPITAL LETTER O WITH DOUBLE GRAVE 020E; C; 020F; # LATIN CAPITAL LETTER O WITH INVERTED BREVE 0210; C; 0211; # LATIN CAPITAL LETTER R WITH DOUBLE GRAVE 0212; C; 0213; # LATIN CAPITAL LETTER R WITH INVERTED BREVE 0214; C; 0215; # LATIN CAPITAL LETTER U WITH DOUBLE GRAVE 0216; C; 0217; # LATIN CAPITAL LETTER U WITH INVERTED BREVE 0218; C; 0219; # LATIN CAPITAL LETTER S WITH COMMA BELOW 021A; C; 021B; # LATIN CAPITAL LETTER T WITH COMMA BELOW 021C; C; 021D; # LATIN CAPITAL LETTER YOGH 021E; C; 021F; # LATIN CAPITAL LETTER H WITH CARON 0220; C; 019E; # LATIN CAPITAL LETTER N WITH LONG RIGHT LEG 0222; C; 0223; # LATIN CAPITAL LETTER OU 0224; C; 0225; # LATIN CAPITAL LETTER Z WITH HOOK 0226; C; 0227; # LATIN CAPITAL LETTER A WITH DOT ABOVE 0228; C; 0229; # LATIN CAPITAL LETTER E WITH CEDILLA 022A; C; 022B; # LATIN CAPITAL LETTER O WITH DIAERESIS AND MACRON 022C; C; 022D; # LATIN CAPITAL LETTER O WITH TILDE AND MACRON 022E; C; 022F; # LATIN CAPITAL LETTER O WITH DOT ABOVE 0230; C; 0231; # LATIN CAPITAL LETTER O WITH DOT ABOVE AND MACRON 0232; C; 0233; # LATIN CAPITAL LETTER Y WITH MACRON 023A; C; 2C65; # LATIN CAPITAL LETTER A WITH STROKE 023B; C; 023C; # LATIN CAPITAL LETTER C WITH STROKE 023D; C; 019A; # LATIN CAPITAL LETTER L WITH BAR 023E; C; 2C66; # LATIN CAPITAL LETTER T WITH DIAGONAL STROKE 0241; C; 0242; # LATIN CAPITAL LETTER GLOTTAL STOP 0243; C; 0180; # LATIN CAPITAL LETTER B WITH STROKE 0244; C; 0289; # LATIN CAPITAL LETTER U BAR 0245; C; 028C; # LATIN CAPITAL LETTER TURNED V 0246; C; 0247; # LATIN CAPITAL LETTER E WITH STROKE 0248; C; 0249; # LATIN CAPITAL LETTER J WITH STROKE 024A; C; 024B; # LATIN CAPITAL LETTER SMALL Q WITH HOOK TAIL 024C; C; 024D; # LATIN CAPITAL LETTER R WITH STROKE 024E; C; 024F; # LATIN CAPITAL LETTER Y WITH STROKE 0345; C; 03B9; # 
COMBINING GREEK YPOGEGRAMMENI 0370; C; 0371; # GREEK CAPITAL LETTER HETA 0372; C; 0373; # GREEK CAPITAL LETTER ARCHAIC SAMPI 0376; C; 0377; # GREEK CAPITAL LETTER PAMPHYLIAN DIGAMMA 037F; C; 03F3; # GREEK CAPITAL LETTER YOT 0386; C; 03AC; # GREEK CAPITAL LETTER ALPHA WITH TONOS 0388; C; 03AD; # GREEK CAPITAL LETTER EPSILON WITH TONOS 0389; C; 03AE; # GREEK CAPITAL LETTER ETA WITH TONOS 038A; C; 03AF; # GREEK CAPITAL LETTER IOTA WITH TONOS 038C; C; 03CC; # GREEK CAPITAL LETTER OMICRON WITH TONOS 038E; C; 03CD; # GREEK CAPITAL LETTER UPSILON WITH TONOS 038F; C; 03CE; # GREEK CAPITAL LETTER OMEGA WITH TONOS 0390; F; 03B9 0308 0301; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS 0391; C; 03B1; # GREEK CAPITAL LETTER ALPHA 0392; C; 03B2; # GREEK CAPITAL LETTER BETA 0393; C; 03B3; # GREEK CAPITAL LETTER GAMMA 0394; C; 03B4; # GREEK CAPITAL LETTER DELTA 0395; C; 03B5; # GREEK CAPITAL LETTER EPSILON 0396; C; 03B6; # GREEK CAPITAL LETTER ZETA 0397; C; 03B7; # GREEK CAPITAL LETTER ETA 0398; C; 03B8; # GREEK CAPITAL LETTER THETA 0399; C; 03B9; # GREEK CAPITAL LETTER IOTA 039A; C; 03BA; # GREEK CAPITAL LETTER KAPPA 039B; C; 03BB; # GREEK CAPITAL LETTER LAMDA 039C; C; 03BC; # GREEK CAPITAL LETTER MU 039D; C; 03BD; # GREEK CAPITAL LETTER NU 039E; C; 03BE; # GREEK CAPITAL LETTER XI 039F; C; 03BF; # GREEK CAPITAL LETTER OMICRON 03A0; C; 03C0; # GREEK CAPITAL LETTER PI 03A1; C; 03C1; # GREEK CAPITAL LETTER RHO 03A3; C; 03C3; # GREEK CAPITAL LETTER SIGMA 03A4; C; 03C4; # GREEK CAPITAL LETTER TAU 03A5; C; 03C5; # GREEK CAPITAL LETTER UPSILON 03A6; C; 03C6; # GREEK CAPITAL LETTER PHI 03A7; C; 03C7; # GREEK CAPITAL LETTER CHI 03A8; C; 03C8; # GREEK CAPITAL LETTER PSI 03A9; C; 03C9; # GREEK CAPITAL LETTER OMEGA 03AA; C; 03CA; # GREEK CAPITAL LETTER IOTA WITH DIALYTIKA 03AB; C; 03CB; # GREEK CAPITAL LETTER UPSILON WITH DIALYTIKA 03B0; F; 03C5 0308 0301; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS 03C2; C; 03C3; # GREEK SMALL LETTER FINAL SIGMA 03CF; C; 03D7; # GREEK CAPITAL 
KAI SYMBOL 03D0; C; 03B2; # GREEK BETA SYMBOL 03D1; C; 03B8; # GREEK THETA SYMBOL 03D5; C; 03C6; # GREEK PHI SYMBOL 03D6; C; 03C0; # GREEK PI SYMBOL 03D8; C; 03D9; # GREEK LETTER ARCHAIC KOPPA 03DA; C; 03DB; # GREEK LETTER STIGMA 03DC; C; 03DD; # GREEK LETTER DIGAMMA 03DE; C; 03DF; # GREEK LETTER KOPPA 03E0; C; 03E1; # GREEK LETTER SAMPI 03E2; C; 03E3; # COPTIC CAPITAL LETTER SHEI 03E4; C; 03E5; # COPTIC CAPITAL LETTER FEI 03E6; C; 03E7; # COPTIC CAPITAL LETTER KHEI 03E8; C; 03E9; # COPTIC CAPITAL LETTER HORI 03EA; C; 03EB; # COPTIC CAPITAL LETTER GANGIA 03EC; C; 03ED; # COPTIC CAPITAL LETTER SHIMA 03EE; C; 03EF; # COPTIC CAPITAL LETTER DEI 03F0; C; 03BA; # GREEK KAPPA SYMBOL 03F1; C; 03C1; # GREEK RHO SYMBOL 03F4; C; 03B8; # GREEK CAPITAL THETA SYMBOL 03F5; C; 03B5; # GREEK LUNATE EPSILON SYMBOL 03F7; C; 03F8; # GREEK CAPITAL LETTER SHO 03F9; C; 03F2; # GREEK CAPITAL LUNATE SIGMA SYMBOL 03FA; C; 03FB; # GREEK CAPITAL LETTER SAN 03FD; C; 037B; # GREEK CAPITAL REVERSED LUNATE SIGMA SYMBOL 03FE; C; 037C; # GREEK CAPITAL DOTTED LUNATE SIGMA SYMBOL 03FF; C; 037D; # GREEK CAPITAL REVERSED DOTTED LUNATE SIGMA SYMBOL 0400; C; 0450; # CYRILLIC CAPITAL LETTER IE WITH GRAVE 0401; C; 0451; # CYRILLIC CAPITAL LETTER IO 0402; C; 0452; # CYRILLIC CAPITAL LETTER DJE 0403; C; 0453; # CYRILLIC CAPITAL LETTER GJE 0404; C; 0454; # CYRILLIC CAPITAL LETTER UKRAINIAN IE 0405; C; 0455; # CYRILLIC CAPITAL LETTER DZE 0406; C; 0456; # CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I 0407; C; 0457; # CYRILLIC CAPITAL LETTER YI 0408; C; 0458; # CYRILLIC CAPITAL LETTER JE 0409; C; 0459; # CYRILLIC CAPITAL LETTER LJE 040A; C; 045A; # CYRILLIC CAPITAL LETTER NJE 040B; C; 045B; # CYRILLIC CAPITAL LETTER TSHE 040C; C; 045C; # CYRILLIC CAPITAL LETTER KJE 040D; C; 045D; # CYRILLIC CAPITAL LETTER I WITH GRAVE 040E; C; 045E; # CYRILLIC CAPITAL LETTER SHORT U 040F; C; 045F; # CYRILLIC CAPITAL LETTER DZHE 0410; C; 0430; # CYRILLIC CAPITAL LETTER A 0411; C; 0431; # CYRILLIC CAPITAL LETTER BE 0412; C; 
0432; # CYRILLIC CAPITAL LETTER VE 0413; C; 0433; # CYRILLIC CAPITAL LETTER GHE 0414; C; 0434; # CYRILLIC CAPITAL LETTER DE 0415; C; 0435; # CYRILLIC CAPITAL LETTER IE 0416; C; 0436; # CYRILLIC CAPITAL LETTER ZHE 0417; C; 0437; # CYRILLIC CAPITAL LETTER ZE 0418; C; 0438; # CYRILLIC CAPITAL LETTER I 0419; C; 0439; # CYRILLIC CAPITAL LETTER SHORT I 041A; C; 043A; # CYRILLIC CAPITAL LETTER KA 041B; C; 043B; # CYRILLIC CAPITAL LETTER EL 041C; C; 043C; # CYRILLIC CAPITAL LETTER EM 041D; C; 043D; # CYRILLIC CAPITAL LETTER EN 041E; C; 043E; # CYRILLIC CAPITAL LETTER O 041F; C; 043F; # CYRILLIC CAPITAL LETTER PE 0420; C; 0440; # CYRILLIC CAPITAL LETTER ER 0421; C; 0441; # CYRILLIC CAPITAL LETTER ES 0422; C; 0442; # CYRILLIC CAPITAL LETTER TE 0423; C; 0443; # CYRILLIC CAPITAL LETTER U 0424; C; 0444; # CYRILLIC CAPITAL LETTER EF 0425; C; 0445; # CYRILLIC CAPITAL LETTER HA 0426; C; 0446; # CYRILLIC CAPITAL LETTER TSE 0427; C; 0447; # CYRILLIC CAPITAL LETTER CHE 0428; C; 0448; # CYRILLIC CAPITAL LETTER SHA 0429; C; 0449; # CYRILLIC CAPITAL LETTER SHCHA 042A; C; 044A; # CYRILLIC CAPITAL LETTER HARD SIGN 042B; C; 044B; # CYRILLIC CAPITAL LETTER YERU 042C; C; 044C; # CYRILLIC CAPITAL LETTER SOFT SIGN 042D; C; 044D; # CYRILLIC CAPITAL LETTER E 042E; C; 044E; # CYRILLIC CAPITAL LETTER YU 042F; C; 044F; # CYRILLIC CAPITAL LETTER YA 0460; C; 0461; # CYRILLIC CAPITAL LETTER OMEGA 0462; C; 0463; # CYRILLIC CAPITAL LETTER YAT 0464; C; 0465; # CYRILLIC CAPITAL LETTER IOTIFIED E 0466; C; 0467; # CYRILLIC CAPITAL LETTER LITTLE YUS 0468; C; 0469; # CYRILLIC CAPITAL LETTER IOTIFIED LITTLE YUS 046A; C; 046B; # CYRILLIC CAPITAL LETTER BIG YUS 046C; C; 046D; # CYRILLIC CAPITAL LETTER IOTIFIED BIG YUS 046E; C; 046F; # CYRILLIC CAPITAL LETTER KSI 0470; C; 0471; # CYRILLIC CAPITAL LETTER PSI 0472; C; 0473; # CYRILLIC CAPITAL LETTER FITA 0474; C; 0475; # CYRILLIC CAPITAL LETTER IZHITSA 0476; C; 0477; # CYRILLIC CAPITAL LETTER IZHITSA WITH DOUBLE GRAVE ACCENT 0478; C; 0479; # CYRILLIC CAPITAL LETTER 
UK 047A; C; 047B; # CYRILLIC CAPITAL LETTER ROUND OMEGA 047C; C; 047D; # CYRILLIC CAPITAL LETTER OMEGA WITH TITLO 047E; C; 047F; # CYRILLIC CAPITAL LETTER OT 0480; C; 0481; # CYRILLIC CAPITAL LETTER KOPPA 048A; C; 048B; # CYRILLIC CAPITAL LETTER SHORT I WITH TAIL 048C; C; 048D; # CYRILLIC CAPITAL LETTER SEMISOFT SIGN 048E; C; 048F; # CYRILLIC CAPITAL LETTER ER WITH TICK 0490; C; 0491; # CYRILLIC CAPITAL LETTER GHE WITH UPTURN 0492; C; 0493; # CYRILLIC CAPITAL LETTER GHE WITH STROKE 0494; C; 0495; # CYRILLIC CAPITAL LETTER GHE WITH MIDDLE HOOK 0496; C; 0497; # CYRILLIC CAPITAL LETTER ZHE WITH DESCENDER 0498; C; 0499; # CYRILLIC CAPITAL LETTER ZE WITH DESCENDER 049A; C; 049B; # CYRILLIC CAPITAL LETTER KA WITH DESCENDER 049C; C; 049D; # CYRILLIC CAPITAL LETTER KA WITH VERTICAL STROKE 049E; C; 049F; # CYRILLIC CAPITAL LETTER KA WITH STROKE 04A0; C; 04A1; # CYRILLIC CAPITAL LETTER BASHKIR KA 04A2; C; 04A3; # CYRILLIC CAPITAL LETTER EN WITH DESCENDER 04A4; C; 04A5; # CYRILLIC CAPITAL LIGATURE EN GHE 04A6; C; 04A7; # CYRILLIC CAPITAL LETTER PE WITH MIDDLE HOOK 04A8; C; 04A9; # CYRILLIC CAPITAL LETTER ABKHASIAN HA 04AA; C; 04AB; # CYRILLIC CAPITAL LETTER ES WITH DESCENDER 04AC; C; 04AD; # CYRILLIC CAPITAL LETTER TE WITH DESCENDER 04AE; C; 04AF; # CYRILLIC CAPITAL LETTER STRAIGHT U 04B0; C; 04B1; # CYRILLIC CAPITAL LETTER STRAIGHT U WITH STROKE 04B2; C; 04B3; # CYRILLIC CAPITAL LETTER HA WITH DESCENDER 04B4; C; 04B5; # CYRILLIC CAPITAL LIGATURE TE TSE 04B6; C; 04B7; # CYRILLIC CAPITAL LETTER CHE WITH DESCENDER 04B8; C; 04B9; # CYRILLIC CAPITAL LETTER CHE WITH VERTICAL STROKE 04BA; C; 04BB; # CYRILLIC CAPITAL LETTER SHHA 04BC; C; 04BD; # CYRILLIC CAPITAL LETTER ABKHASIAN CHE 04BE; C; 04BF; # CYRILLIC CAPITAL LETTER ABKHASIAN CHE WITH DESCENDER 04C0; C; 04CF; # CYRILLIC LETTER PALOCHKA 04C1; C; 04C2; # CYRILLIC CAPITAL LETTER ZHE WITH BREVE 04C3; C; 04C4; # CYRILLIC CAPITAL LETTER KA WITH HOOK 04C5; C; 04C6; # CYRILLIC CAPITAL LETTER EL WITH TAIL 04C7; C; 04C8; # CYRILLIC 
CAPITAL LETTER EN WITH HOOK 04C9; C; 04CA; # CYRILLIC CAPITAL LETTER EN WITH TAIL 04CB; C; 04CC; # CYRILLIC CAPITAL LETTER KHAKASSIAN CHE 04CD; C; 04CE; # CYRILLIC CAPITAL LETTER EM WITH TAIL 04D0; C; 04D1; # CYRILLIC CAPITAL LETTER A WITH BREVE 04D2; C; 04D3; # CYRILLIC CAPITAL LETTER A WITH DIAERESIS 04D4; C; 04D5; # CYRILLIC CAPITAL LIGATURE A IE 04D6; C; 04D7; # CYRILLIC CAPITAL LETTER IE WITH BREVE 04D8; C; 04D9; # CYRILLIC CAPITAL LETTER SCHWA 04DA; C; 04DB; # CYRILLIC CAPITAL LETTER SCHWA WITH DIAERESIS 04DC; C; 04DD; # CYRILLIC CAPITAL LETTER ZHE WITH DIAERESIS 04DE; C; 04DF; # CYRILLIC CAPITAL LETTER ZE WITH DIAERESIS 04E0; C; 04E1; # CYRILLIC CAPITAL LETTER ABKHASIAN DZE 04E2; C; 04E3; # CYRILLIC CAPITAL LETTER I WITH MACRON 04E4; C; 04E5; # CYRILLIC CAPITAL LETTER I WITH DIAERESIS 04E6; C; 04E7; # CYRILLIC CAPITAL LETTER O WITH DIAERESIS 04E8; C; 04E9; # CYRILLIC CAPITAL LETTER BARRED O 04EA; C; 04EB; # CYRILLIC CAPITAL LETTER BARRED O WITH DIAERESIS 04EC; C; 04ED; # CYRILLIC CAPITAL LETTER E WITH DIAERESIS 04EE; C; 04EF; # CYRILLIC CAPITAL LETTER U WITH MACRON 04F0; C; 04F1; # CYRILLIC CAPITAL LETTER U WITH DIAERESIS 04F2; C; 04F3; # CYRILLIC CAPITAL LETTER U WITH DOUBLE ACUTE 04F4; C; 04F5; # CYRILLIC CAPITAL LETTER CHE WITH DIAERESIS 04F6; C; 04F7; # CYRILLIC CAPITAL LETTER GHE WITH DESCENDER 04F8; C; 04F9; # CYRILLIC CAPITAL LETTER YERU WITH DIAERESIS 04FA; C; 04FB; # CYRILLIC CAPITAL LETTER GHE WITH STROKE AND HOOK 04FC; C; 04FD; # CYRILLIC CAPITAL LETTER HA WITH HOOK 04FE; C; 04FF; # CYRILLIC CAPITAL LETTER HA WITH STROKE 0500; C; 0501; # CYRILLIC CAPITAL LETTER KOMI DE 0502; C; 0503; # CYRILLIC CAPITAL LETTER KOMI DJE 0504; C; 0505; # CYRILLIC CAPITAL LETTER KOMI ZJE 0506; C; 0507; # CYRILLIC CAPITAL LETTER KOMI DZJE 0508; C; 0509; # CYRILLIC CAPITAL LETTER KOMI LJE 050A; C; 050B; # CYRILLIC CAPITAL LETTER KOMI NJE 050C; C; 050D; # CYRILLIC CAPITAL LETTER KOMI SJE 050E; C; 050F; # CYRILLIC CAPITAL LETTER KOMI TJE 0510; C; 0511; # CYRILLIC CAPITAL 
LETTER REVERSED ZE 0512; C; 0513; # CYRILLIC CAPITAL LETTER EL WITH HOOK 0514; C; 0515; # CYRILLIC CAPITAL LETTER LHA 0516; C; 0517; # CYRILLIC CAPITAL LETTER RHA 0518; C; 0519; # CYRILLIC CAPITAL LETTER YAE 051A; C; 051B; # CYRILLIC CAPITAL LETTER QA 051C; C; 051D; # CYRILLIC CAPITAL LETTER WE 051E; C; 051F; # CYRILLIC CAPITAL LETTER ALEUT KA 0520; C; 0521; # CYRILLIC CAPITAL LETTER EL WITH MIDDLE HOOK 0522; C; 0523; # CYRILLIC CAPITAL LETTER EN WITH MIDDLE HOOK 0524; C; 0525; # CYRILLIC CAPITAL LETTER PE WITH DESCENDER 0526; C; 0527; # CYRILLIC CAPITAL LETTER SHHA WITH DESCENDER 0528; C; 0529; # CYRILLIC CAPITAL LETTER EN WITH LEFT HOOK 052A; C; 052B; # CYRILLIC CAPITAL LETTER DZZHE 052C; C; 052D; # CYRILLIC CAPITAL LETTER DCHE 052E; C; 052F; # CYRILLIC CAPITAL LETTER EL WITH DESCENDER 0531; C; 0561; # ARMENIAN CAPITAL LETTER AYB 0532; C; 0562; # ARMENIAN CAPITAL LETTER BEN 0533; C; 0563; # ARMENIAN CAPITAL LETTER GIM 0534; C; 0564; # ARMENIAN CAPITAL LETTER DA 0535; C; 0565; # ARMENIAN CAPITAL LETTER ECH 0536; C; 0566; # ARMENIAN CAPITAL LETTER ZA 0537; C; 0567; # ARMENIAN CAPITAL LETTER EH 0538; C; 0568; # ARMENIAN CAPITAL LETTER ET 0539; C; 0569; # ARMENIAN CAPITAL LETTER TO 053A; C; 056A; # ARMENIAN CAPITAL LETTER ZHE 053B; C; 056B; # ARMENIAN CAPITAL LETTER INI 053C; C; 056C; # ARMENIAN CAPITAL LETTER LIWN 053D; C; 056D; # ARMENIAN CAPITAL LETTER XEH 053E; C; 056E; # ARMENIAN CAPITAL LETTER CA 053F; C; 056F; # ARMENIAN CAPITAL LETTER KEN 0540; C; 0570; # ARMENIAN CAPITAL LETTER HO 0541; C; 0571; # ARMENIAN CAPITAL LETTER JA 0542; C; 0572; # ARMENIAN CAPITAL LETTER GHAD 0543; C; 0573; # ARMENIAN CAPITAL LETTER CHEH 0544; C; 0574; # ARMENIAN CAPITAL LETTER MEN 0545; C; 0575; # ARMENIAN CAPITAL LETTER YI 0546; C; 0576; # ARMENIAN CAPITAL LETTER NOW 0547; C; 0577; # ARMENIAN CAPITAL LETTER SHA 0548; C; 0578; # ARMENIAN CAPITAL LETTER VO 0549; C; 0579; # ARMENIAN CAPITAL LETTER CHA 054A; C; 057A; # ARMENIAN CAPITAL LETTER PEH 054B; C; 057B; # ARMENIAN CAPITAL 
LETTER JHEH 054C; C; 057C; # ARMENIAN CAPITAL LETTER RA 054D; C; 057D; # ARMENIAN CAPITAL LETTER SEH 054E; C; 057E; # ARMENIAN CAPITAL LETTER VEW 054F; C; 057F; # ARMENIAN CAPITAL LETTER TIWN 0550; C; 0580; # ARMENIAN CAPITAL LETTER REH 0551; C; 0581; # ARMENIAN CAPITAL LETTER CO 0552; C; 0582; # ARMENIAN CAPITAL LETTER YIWN 0553; C; 0583; # ARMENIAN CAPITAL LETTER PIWR 0554; C; 0584; # ARMENIAN CAPITAL LETTER KEH 0555; C; 0585; # ARMENIAN CAPITAL LETTER OH 0556; C; 0586; # ARMENIAN CAPITAL LETTER FEH 0587; F; 0565 0582; # ARMENIAN SMALL LIGATURE ECH YIWN 10A0; C; 2D00; # GEORGIAN CAPITAL LETTER AN 10A1; C; 2D01; # GEORGIAN CAPITAL LETTER BAN 10A2; C; 2D02; # GEORGIAN CAPITAL LETTER GAN 10A3; C; 2D03; # GEORGIAN CAPITAL LETTER DON 10A4; C; 2D04; # GEORGIAN CAPITAL LETTER EN 10A5; C; 2D05; # GEORGIAN CAPITAL LETTER VIN 10A6; C; 2D06; # GEORGIAN CAPITAL LETTER ZEN 10A7; C; 2D07; # GEORGIAN CAPITAL LETTER TAN 10A8; C; 2D08; # GEORGIAN CAPITAL LETTER IN 10A9; C; 2D09; # GEORGIAN CAPITAL LETTER KAN 10AA; C; 2D0A; # GEORGIAN CAPITAL LETTER LAS 10AB; C; 2D0B; # GEORGIAN CAPITAL LETTER MAN 10AC; C; 2D0C; # GEORGIAN CAPITAL LETTER NAR 10AD; C; 2D0D; # GEORGIAN CAPITAL LETTER ON 10AE; C; 2D0E; # GEORGIAN CAPITAL LETTER PAR 10AF; C; 2D0F; # GEORGIAN CAPITAL LETTER ZHAR 10B0; C; 2D10; # GEORGIAN CAPITAL LETTER RAE 10B1; C; 2D11; # GEORGIAN CAPITAL LETTER SAN 10B2; C; 2D12; # GEORGIAN CAPITAL LETTER TAR 10B3; C; 2D13; # GEORGIAN CAPITAL LETTER UN 10B4; C; 2D14; # GEORGIAN CAPITAL LETTER PHAR 10B5; C; 2D15; # GEORGIAN CAPITAL LETTER KHAR 10B6; C; 2D16; # GEORGIAN CAPITAL LETTER GHAN 10B7; C; 2D17; # GEORGIAN CAPITAL LETTER QAR 10B8; C; 2D18; # GEORGIAN CAPITAL LETTER SHIN 10B9; C; 2D19; # GEORGIAN CAPITAL LETTER CHIN 10BA; C; 2D1A; # GEORGIAN CAPITAL LETTER CAN 10BB; C; 2D1B; # GEORGIAN CAPITAL LETTER JIL 10BC; C; 2D1C; # GEORGIAN CAPITAL LETTER CIL 10BD; C; 2D1D; # GEORGIAN CAPITAL LETTER CHAR 10BE; C; 2D1E; # GEORGIAN CAPITAL LETTER XAN 10BF; C; 2D1F; # GEORGIAN CAPITAL LETTER 
JHAN 10C0; C; 2D20; # GEORGIAN CAPITAL LETTER HAE 10C1; C; 2D21; # GEORGIAN CAPITAL LETTER HE 10C2; C; 2D22; # GEORGIAN CAPITAL LETTER HIE 10C3; C; 2D23; # GEORGIAN CAPITAL LETTER WE 10C4; C; 2D24; # GEORGIAN CAPITAL LETTER HAR 10C5; C; 2D25; # GEORGIAN CAPITAL LETTER HOE 10C7; C; 2D27; # GEORGIAN CAPITAL LETTER YN 10CD; C; 2D2D; # GEORGIAN CAPITAL LETTER AEN 13F8; C; 13F0; # CHEROKEE SMALL LETTER YE 13F9; C; 13F1; # CHEROKEE SMALL LETTER YI 13FA; C; 13F2; # CHEROKEE SMALL LETTER YO 13FB; C; 13F3; # CHEROKEE SMALL LETTER YU 13FC; C; 13F4; # CHEROKEE SMALL LETTER YV 13FD; C; 13F5; # CHEROKEE SMALL LETTER MV 1C80; C; 0432; # CYRILLIC SMALL LETTER ROUNDED VE 1C81; C; 0434; # CYRILLIC SMALL LETTER LONG-LEGGED DE 1C82; C; 043E; # CYRILLIC SMALL LETTER NARROW O 1C83; C; 0441; # CYRILLIC SMALL LETTER WIDE ES 1C84; C; 0442; # CYRILLIC SMALL LETTER TALL TE 1C85; C; 0442; # CYRILLIC SMALL LETTER THREE-LEGGED TE 1C86; C; 044A; # CYRILLIC SMALL LETTER TALL HARD SIGN 1C87; C; 0463; # CYRILLIC SMALL LETTER TALL YAT 1C88; C; A64B; # CYRILLIC SMALL LETTER UNBLENDED UK 1C89; C; 1C8A; # CYRILLIC CAPITAL LETTER TJE 1C90; C; 10D0; # GEORGIAN MTAVRULI CAPITAL LETTER AN 1C91; C; 10D1; # GEORGIAN MTAVRULI CAPITAL LETTER BAN 1C92; C; 10D2; # GEORGIAN MTAVRULI CAPITAL LETTER GAN 1C93; C; 10D3; # GEORGIAN MTAVRULI CAPITAL LETTER DON 1C94; C; 10D4; # GEORGIAN MTAVRULI CAPITAL LETTER EN 1C95; C; 10D5; # GEORGIAN MTAVRULI CAPITAL LETTER VIN 1C96; C; 10D6; # GEORGIAN MTAVRULI CAPITAL LETTER ZEN 1C97; C; 10D7; # GEORGIAN MTAVRULI CAPITAL LETTER TAN 1C98; C; 10D8; # GEORGIAN MTAVRULI CAPITAL LETTER IN 1C99; C; 10D9; # GEORGIAN MTAVRULI CAPITAL LETTER KAN 1C9A; C; 10DA; # GEORGIAN MTAVRULI CAPITAL LETTER LAS 1C9B; C; 10DB; # GEORGIAN MTAVRULI CAPITAL LETTER MAN 1C9C; C; 10DC; # GEORGIAN MTAVRULI CAPITAL LETTER NAR 1C9D; C; 10DD; # GEORGIAN MTAVRULI CAPITAL LETTER ON 1C9E; C; 10DE; # GEORGIAN MTAVRULI CAPITAL LETTER PAR 1C9F; C; 10DF; # GEORGIAN MTAVRULI CAPITAL LETTER ZHAR 1CA0; C; 10E0; # 
GEORGIAN MTAVRULI CAPITAL LETTER RAE 1CA1; C; 10E1; # GEORGIAN MTAVRULI CAPITAL LETTER SAN 1CA2; C; 10E2; # GEORGIAN MTAVRULI CAPITAL LETTER TAR 1CA3; C; 10E3; # GEORGIAN MTAVRULI CAPITAL LETTER UN 1CA4; C; 10E4; # GEORGIAN MTAVRULI CAPITAL LETTER PHAR 1CA5; C; 10E5; # GEORGIAN MTAVRULI CAPITAL LETTER KHAR 1CA6; C; 10E6; # GEORGIAN MTAVRULI CAPITAL LETTER GHAN 1CA7; C; 10E7; # GEORGIAN MTAVRULI CAPITAL LETTER QAR 1CA8; C; 10E8; # GEORGIAN MTAVRULI CAPITAL LETTER SHIN 1CA9; C; 10E9; # GEORGIAN MTAVRULI CAPITAL LETTER CHIN 1CAA; C; 10EA; # GEORGIAN MTAVRULI CAPITAL LETTER CAN 1CAB; C; 10EB; # GEORGIAN MTAVRULI CAPITAL LETTER JIL 1CAC; C; 10EC; # GEORGIAN MTAVRULI CAPITAL LETTER CIL 1CAD; C; 10ED; # GEORGIAN MTAVRULI CAPITAL LETTER CHAR 1CAE; C; 10EE; # GEORGIAN MTAVRULI CAPITAL LETTER XAN 1CAF; C; 10EF; # GEORGIAN MTAVRULI CAPITAL LETTER JHAN 1CB0; C; 10F0; # GEORGIAN MTAVRULI CAPITAL LETTER HAE 1CB1; C; 10F1; # GEORGIAN MTAVRULI CAPITAL LETTER HE 1CB2; C; 10F2; # GEORGIAN MTAVRULI CAPITAL LETTER HIE 1CB3; C; 10F3; # GEORGIAN MTAVRULI CAPITAL LETTER WE 1CB4; C; 10F4; # GEORGIAN MTAVRULI CAPITAL LETTER HAR 1CB5; C; 10F5; # GEORGIAN MTAVRULI CAPITAL LETTER HOE 1CB6; C; 10F6; # GEORGIAN MTAVRULI CAPITAL LETTER FI 1CB7; C; 10F7; # GEORGIAN MTAVRULI CAPITAL LETTER YN 1CB8; C; 10F8; # GEORGIAN MTAVRULI CAPITAL LETTER ELIFI 1CB9; C; 10F9; # GEORGIAN MTAVRULI CAPITAL LETTER TURNED GAN 1CBA; C; 10FA; # GEORGIAN MTAVRULI CAPITAL LETTER AIN 1CBD; C; 10FD; # GEORGIAN MTAVRULI CAPITAL LETTER AEN 1CBE; C; 10FE; # GEORGIAN MTAVRULI CAPITAL LETTER HARD SIGN 1CBF; C; 10FF; # GEORGIAN MTAVRULI CAPITAL LETTER LABIAL SIGN 1E00; C; 1E01; # LATIN CAPITAL LETTER A WITH RING BELOW 1E02; C; 1E03; # LATIN CAPITAL LETTER B WITH DOT ABOVE 1E04; C; 1E05; # LATIN CAPITAL LETTER B WITH DOT BELOW 1E06; C; 1E07; # LATIN CAPITAL LETTER B WITH LINE BELOW 1E08; C; 1E09; # LATIN CAPITAL LETTER C WITH CEDILLA AND ACUTE 1E0A; C; 1E0B; # LATIN CAPITAL LETTER D WITH DOT ABOVE 1E0C; C; 1E0D; # LATIN CAPITAL 
LETTER D WITH DOT BELOW 1E0E; C; 1E0F; # LATIN CAPITAL LETTER D WITH LINE BELOW 1E10; C; 1E11; # LATIN CAPITAL LETTER D WITH CEDILLA 1E12; C; 1E13; # LATIN CAPITAL LETTER D WITH CIRCUMFLEX BELOW 1E14; C; 1E15; # LATIN CAPITAL LETTER E WITH MACRON AND GRAVE 1E16; C; 1E17; # LATIN CAPITAL LETTER E WITH MACRON AND ACUTE 1E18; C; 1E19; # LATIN CAPITAL LETTER E WITH CIRCUMFLEX BELOW 1E1A; C; 1E1B; # LATIN CAPITAL LETTER E WITH TILDE BELOW 1E1C; C; 1E1D; # LATIN CAPITAL LETTER E WITH CEDILLA AND BREVE 1E1E; C; 1E1F; # LATIN CAPITAL LETTER F WITH DOT ABOVE 1E20; C; 1E21; # LATIN CAPITAL LETTER G WITH MACRON 1E22; C; 1E23; # LATIN CAPITAL LETTER H WITH DOT ABOVE 1E24; C; 1E25; # LATIN CAPITAL LETTER H WITH DOT BELOW 1E26; C; 1E27; # LATIN CAPITAL LETTER H WITH DIAERESIS 1E28; C; 1E29; # LATIN CAPITAL LETTER H WITH CEDILLA 1E2A; C; 1E2B; # LATIN CAPITAL LETTER H WITH BREVE BELOW 1E2C; C; 1E2D; # LATIN CAPITAL LETTER I WITH TILDE BELOW 1E2E; C; 1E2F; # LATIN CAPITAL LETTER I WITH DIAERESIS AND ACUTE 1E30; C; 1E31; # LATIN CAPITAL LETTER K WITH ACUTE 1E32; C; 1E33; # LATIN CAPITAL LETTER K WITH DOT BELOW 1E34; C; 1E35; # LATIN CAPITAL LETTER K WITH LINE BELOW 1E36; C; 1E37; # LATIN CAPITAL LETTER L WITH DOT BELOW 1E38; C; 1E39; # LATIN CAPITAL LETTER L WITH DOT BELOW AND MACRON 1E3A; C; 1E3B; # LATIN CAPITAL LETTER L WITH LINE BELOW 1E3C; C; 1E3D; # LATIN CAPITAL LETTER L WITH CIRCUMFLEX BELOW 1E3E; C; 1E3F; # LATIN CAPITAL LETTER M WITH ACUTE 1E40; C; 1E41; # LATIN CAPITAL LETTER M WITH DOT ABOVE 1E42; C; 1E43; # LATIN CAPITAL LETTER M WITH DOT BELOW 1E44; C; 1E45; # LATIN CAPITAL LETTER N WITH DOT ABOVE 1E46; C; 1E47; # LATIN CAPITAL LETTER N WITH DOT BELOW 1E48; C; 1E49; # LATIN CAPITAL LETTER N WITH LINE BELOW 1E4A; C; 1E4B; # LATIN CAPITAL LETTER N WITH CIRCUMFLEX BELOW 1E4C; C; 1E4D; # LATIN CAPITAL LETTER O WITH TILDE AND ACUTE 1E4E; C; 1E4F; # LATIN CAPITAL LETTER O WITH TILDE AND DIAERESIS 1E50; C; 1E51; # LATIN CAPITAL LETTER O WITH MACRON AND GRAVE 1E52; C; 1E53; # 
LATIN CAPITAL LETTER O WITH MACRON AND ACUTE 1E54; C; 1E55; # LATIN CAPITAL LETTER P WITH ACUTE 1E56; C; 1E57; # LATIN CAPITAL LETTER P WITH DOT ABOVE 1E58; C; 1E59; # LATIN CAPITAL LETTER R WITH DOT ABOVE 1E5A; C; 1E5B; # LATIN CAPITAL LETTER R WITH DOT BELOW 1E5C; C; 1E5D; # LATIN CAPITAL LETTER R WITH DOT BELOW AND MACRON 1E5E; C; 1E5F; # LATIN CAPITAL LETTER R WITH LINE BELOW 1E60; C; 1E61; # LATIN CAPITAL LETTER S WITH DOT ABOVE 1E62; C; 1E63; # LATIN CAPITAL LETTER S WITH DOT BELOW 1E64; C; 1E65; # LATIN CAPITAL LETTER S WITH ACUTE AND DOT ABOVE 1E66; C; 1E67; # LATIN CAPITAL LETTER S WITH CARON AND DOT ABOVE 1E68; C; 1E69; # LATIN CAPITAL LETTER S WITH DOT BELOW AND DOT ABOVE 1E6A; C; 1E6B; # LATIN CAPITAL LETTER T WITH DOT ABOVE 1E6C; C; 1E6D; # LATIN CAPITAL LETTER T WITH DOT BELOW 1E6E; C; 1E6F; # LATIN CAPITAL LETTER T WITH LINE BELOW 1E70; C; 1E71; # LATIN CAPITAL LETTER T WITH CIRCUMFLEX BELOW 1E72; C; 1E73; # LATIN CAPITAL LETTER U WITH DIAERESIS BELOW 1E74; C; 1E75; # LATIN CAPITAL LETTER U WITH TILDE BELOW 1E76; C; 1E77; # LATIN CAPITAL LETTER U WITH CIRCUMFLEX BELOW 1E78; C; 1E79; # LATIN CAPITAL LETTER U WITH TILDE AND ACUTE 1E7A; C; 1E7B; # LATIN CAPITAL LETTER U WITH MACRON AND DIAERESIS 1E7C; C; 1E7D; # LATIN CAPITAL LETTER V WITH TILDE 1E7E; C; 1E7F; # LATIN CAPITAL LETTER V WITH DOT BELOW 1E80; C; 1E81; # LATIN CAPITAL LETTER W WITH GRAVE 1E82; C; 1E83; # LATIN CAPITAL LETTER W WITH ACUTE 1E84; C; 1E85; # LATIN CAPITAL LETTER W WITH DIAERESIS 1E86; C; 1E87; # LATIN CAPITAL LETTER W WITH DOT ABOVE 1E88; C; 1E89; # LATIN CAPITAL LETTER W WITH DOT BELOW 1E8A; C; 1E8B; # LATIN CAPITAL LETTER X WITH DOT ABOVE 1E8C; C; 1E8D; # LATIN CAPITAL LETTER X WITH DIAERESIS 1E8E; C; 1E8F; # LATIN CAPITAL LETTER Y WITH DOT ABOVE 1E90; C; 1E91; # LATIN CAPITAL LETTER Z WITH CIRCUMFLEX 1E92; C; 1E93; # LATIN CAPITAL LETTER Z WITH DOT BELOW 1E94; C; 1E95; # LATIN CAPITAL LETTER Z WITH LINE BELOW 1E96; F; 0068 0331; # LATIN SMALL LETTER H WITH LINE BELOW 1E97; F; 
0074 0308; # LATIN SMALL LETTER T WITH DIAERESIS 1E98; F; 0077 030A; # LATIN SMALL LETTER W WITH RING ABOVE 1E99; F; 0079 030A; # LATIN SMALL LETTER Y WITH RING ABOVE 1E9A; F; 0061 02BE; # LATIN SMALL LETTER A WITH RIGHT HALF RING 1E9B; C; 1E61; # LATIN SMALL LETTER LONG S WITH DOT ABOVE 1E9E; F; 0073 0073; # LATIN CAPITAL LETTER SHARP S 1E9E; S; 00DF; # LATIN CAPITAL LETTER SHARP S 1EA0; C; 1EA1; # LATIN CAPITAL LETTER A WITH DOT BELOW 1EA2; C; 1EA3; # LATIN CAPITAL LETTER A WITH HOOK ABOVE 1EA4; C; 1EA5; # LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND ACUTE 1EA6; C; 1EA7; # LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND GRAVE 1EA8; C; 1EA9; # LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND HOOK ABOVE 1EAA; C; 1EAB; # LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND TILDE 1EAC; C; 1EAD; # LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND DOT BELOW 1EAE; C; 1EAF; # LATIN CAPITAL LETTER A WITH BREVE AND ACUTE 1EB0; C; 1EB1; # LATIN CAPITAL LETTER A WITH BREVE AND GRAVE 1EB2; C; 1EB3; # LATIN CAPITAL LETTER A WITH BREVE AND HOOK ABOVE 1EB4; C; 1EB5; # LATIN CAPITAL LETTER A WITH BREVE AND TILDE 1EB6; C; 1EB7; # LATIN CAPITAL LETTER A WITH BREVE AND DOT BELOW 1EB8; C; 1EB9; # LATIN CAPITAL LETTER E WITH DOT BELOW 1EBA; C; 1EBB; # LATIN CAPITAL LETTER E WITH HOOK ABOVE 1EBC; C; 1EBD; # LATIN CAPITAL LETTER E WITH TILDE 1EBE; C; 1EBF; # LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND ACUTE 1EC0; C; 1EC1; # LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND GRAVE 1EC2; C; 1EC3; # LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND HOOK ABOVE 1EC4; C; 1EC5; # LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND TILDE 1EC6; C; 1EC7; # LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND DOT BELOW 1EC8; C; 1EC9; # LATIN CAPITAL LETTER I WITH HOOK ABOVE 1ECA; C; 1ECB; # LATIN CAPITAL LETTER I WITH DOT BELOW 1ECC; C; 1ECD; # LATIN CAPITAL LETTER O WITH DOT BELOW 1ECE; C; 1ECF; # LATIN CAPITAL LETTER O WITH HOOK ABOVE 1ED0; C; 1ED1; # LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND ACUTE 1ED2; C; 1ED3; # LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND 
GRAVE 1ED4; C; 1ED5; # LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND HOOK ABOVE 1ED6; C; 1ED7; # LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND TILDE 1ED8; C; 1ED9; # LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND DOT BELOW 1EDA; C; 1EDB; # LATIN CAPITAL LETTER O WITH HORN AND ACUTE 1EDC; C; 1EDD; # LATIN CAPITAL LETTER O WITH HORN AND GRAVE 1EDE; C; 1EDF; # LATIN CAPITAL LETTER O WITH HORN AND HOOK ABOVE 1EE0; C; 1EE1; # LATIN CAPITAL LETTER O WITH HORN AND TILDE 1EE2; C; 1EE3; # LATIN CAPITAL LETTER O WITH HORN AND DOT BELOW 1EE4; C; 1EE5; # LATIN CAPITAL LETTER U WITH DOT BELOW 1EE6; C; 1EE7; # LATIN CAPITAL LETTER U WITH HOOK ABOVE 1EE8; C; 1EE9; # LATIN CAPITAL LETTER U WITH HORN AND ACUTE 1EEA; C; 1EEB; # LATIN CAPITAL LETTER U WITH HORN AND GRAVE 1EEC; C; 1EED; # LATIN CAPITAL LETTER U WITH HORN AND HOOK ABOVE 1EEE; C; 1EEF; # LATIN CAPITAL LETTER U WITH HORN AND TILDE 1EF0; C; 1EF1; # LATIN CAPITAL LETTER U WITH HORN AND DOT BELOW 1EF2; C; 1EF3; # LATIN CAPITAL LETTER Y WITH GRAVE 1EF4; C; 1EF5; # LATIN CAPITAL LETTER Y WITH DOT BELOW 1EF6; C; 1EF7; # LATIN CAPITAL LETTER Y WITH HOOK ABOVE 1EF8; C; 1EF9; # LATIN CAPITAL LETTER Y WITH TILDE 1EFA; C; 1EFB; # LATIN CAPITAL LETTER MIDDLE-WELSH LL 1EFC; C; 1EFD; # LATIN CAPITAL LETTER MIDDLE-WELSH V 1EFE; C; 1EFF; # LATIN CAPITAL LETTER Y WITH LOOP 1F08; C; 1F00; # GREEK CAPITAL LETTER ALPHA WITH PSILI 1F09; C; 1F01; # GREEK CAPITAL LETTER ALPHA WITH DASIA 1F0A; C; 1F02; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND VARIA 1F0B; C; 1F03; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND VARIA 1F0C; C; 1F04; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND OXIA 1F0D; C; 1F05; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND OXIA 1F0E; C; 1F06; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND PERISPOMENI 1F0F; C; 1F07; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI 1F18; C; 1F10; # GREEK CAPITAL LETTER EPSILON WITH PSILI 1F19; C; 1F11; # GREEK CAPITAL LETTER EPSILON WITH DASIA 1F1A; C; 1F12; # GREEK CAPITAL LETTER EPSILON WITH PSILI AND 
VARIA 1F1B; C; 1F13; # GREEK CAPITAL LETTER EPSILON WITH DASIA AND VARIA 1F1C; C; 1F14; # GREEK CAPITAL LETTER EPSILON WITH PSILI AND OXIA 1F1D; C; 1F15; # GREEK CAPITAL LETTER EPSILON WITH DASIA AND OXIA 1F28; C; 1F20; # GREEK CAPITAL LETTER ETA WITH PSILI 1F29; C; 1F21; # GREEK CAPITAL LETTER ETA WITH DASIA 1F2A; C; 1F22; # GREEK CAPITAL LETTER ETA WITH PSILI AND VARIA 1F2B; C; 1F23; # GREEK CAPITAL LETTER ETA WITH DASIA AND VARIA 1F2C; C; 1F24; # GREEK CAPITAL LETTER ETA WITH PSILI AND OXIA 1F2D; C; 1F25; # GREEK CAPITAL LETTER ETA WITH DASIA AND OXIA 1F2E; C; 1F26; # GREEK CAPITAL LETTER ETA WITH PSILI AND PERISPOMENI 1F2F; C; 1F27; # GREEK CAPITAL LETTER ETA WITH DASIA AND PERISPOMENI 1F38; C; 1F30; # GREEK CAPITAL LETTER IOTA WITH PSILI 1F39; C; 1F31; # GREEK CAPITAL LETTER IOTA WITH DASIA 1F3A; C; 1F32; # GREEK CAPITAL LETTER IOTA WITH PSILI AND VARIA 1F3B; C; 1F33; # GREEK CAPITAL LETTER IOTA WITH DASIA AND VARIA 1F3C; C; 1F34; # GREEK CAPITAL LETTER IOTA WITH PSILI AND OXIA 1F3D; C; 1F35; # GREEK CAPITAL LETTER IOTA WITH DASIA AND OXIA 1F3E; C; 1F36; # GREEK CAPITAL LETTER IOTA WITH PSILI AND PERISPOMENI 1F3F; C; 1F37; # GREEK CAPITAL LETTER IOTA WITH DASIA AND PERISPOMENI 1F48; C; 1F40; # GREEK CAPITAL LETTER OMICRON WITH PSILI 1F49; C; 1F41; # GREEK CAPITAL LETTER OMICRON WITH DASIA 1F4A; C; 1F42; # GREEK CAPITAL LETTER OMICRON WITH PSILI AND VARIA 1F4B; C; 1F43; # GREEK CAPITAL LETTER OMICRON WITH DASIA AND VARIA 1F4C; C; 1F44; # GREEK CAPITAL LETTER OMICRON WITH PSILI AND OXIA 1F4D; C; 1F45; # GREEK CAPITAL LETTER OMICRON WITH DASIA AND OXIA 1F50; F; 03C5 0313; # GREEK SMALL LETTER UPSILON WITH PSILI 1F52; F; 03C5 0313 0300; # GREEK SMALL LETTER UPSILON WITH PSILI AND VARIA 1F54; F; 03C5 0313 0301; # GREEK SMALL LETTER UPSILON WITH PSILI AND OXIA 1F56; F; 03C5 0313 0342; # GREEK SMALL LETTER UPSILON WITH PSILI AND PERISPOMENI 1F59; C; 1F51; # GREEK CAPITAL LETTER UPSILON WITH DASIA 1F5B; C; 1F53; # GREEK CAPITAL LETTER UPSILON WITH DASIA AND VARIA 
1F5D; C; 1F55; # GREEK CAPITAL LETTER UPSILON WITH DASIA AND OXIA 1F5F; C; 1F57; # GREEK CAPITAL LETTER UPSILON WITH DASIA AND PERISPOMENI 1F68; C; 1F60; # GREEK CAPITAL LETTER OMEGA WITH PSILI 1F69; C; 1F61; # GREEK CAPITAL LETTER OMEGA WITH DASIA 1F6A; C; 1F62; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND VARIA 1F6B; C; 1F63; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND VARIA 1F6C; C; 1F64; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND OXIA 1F6D; C; 1F65; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND OXIA 1F6E; C; 1F66; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND PERISPOMENI 1F6F; C; 1F67; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND PERISPOMENI 1F80; F; 1F00 03B9; # GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI 1F81; F; 1F01 03B9; # GREEK SMALL LETTER ALPHA WITH DASIA AND YPOGEGRAMMENI 1F82; F; 1F02 03B9; # GREEK SMALL LETTER ALPHA WITH PSILI AND VARIA AND YPOGEGRAMMENI 1F83; F; 1F03 03B9; # GREEK SMALL LETTER ALPHA WITH DASIA AND VARIA AND YPOGEGRAMMENI 1F84; F; 1F04 03B9; # GREEK SMALL LETTER ALPHA WITH PSILI AND OXIA AND YPOGEGRAMMENI 1F85; F; 1F05 03B9; # GREEK SMALL LETTER ALPHA WITH DASIA AND OXIA AND YPOGEGRAMMENI 1F86; F; 1F06 03B9; # GREEK SMALL LETTER ALPHA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI 1F87; F; 1F07 03B9; # GREEK SMALL LETTER ALPHA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI 1F88; F; 1F00 03B9; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND PROSGEGRAMMENI 1F88; S; 1F80; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND PROSGEGRAMMENI 1F89; F; 1F01 03B9; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PROSGEGRAMMENI 1F89; S; 1F81; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PROSGEGRAMMENI 1F8A; F; 1F02 03B9; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND VARIA AND PROSGEGRAMMENI 1F8A; S; 1F82; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND VARIA AND PROSGEGRAMMENI 1F8B; F; 1F03 03B9; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND VARIA AND PROSGEGRAMMENI 1F8B; S; 1F83; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND VARIA AND PROSGEGRAMMENI 1F8C; F; 1F04 03B9; 
# GREEK CAPITAL LETTER ALPHA WITH PSILI AND OXIA AND PROSGEGRAMMENI 1F8C; S; 1F84; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND OXIA AND PROSGEGRAMMENI 1F8D; F; 1F05 03B9; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND OXIA AND PROSGEGRAMMENI 1F8D; S; 1F85; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND OXIA AND PROSGEGRAMMENI 1F8E; F; 1F06 03B9; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI 1F8E; S; 1F86; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI 1F8F; F; 1F07 03B9; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI 1F8F; S; 1F87; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI 1F90; F; 1F20 03B9; # GREEK SMALL LETTER ETA WITH PSILI AND YPOGEGRAMMENI 1F91; F; 1F21 03B9; # GREEK SMALL LETTER ETA WITH DASIA AND YPOGEGRAMMENI 1F92; F; 1F22 03B9; # GREEK SMALL LETTER ETA WITH PSILI AND VARIA AND YPOGEGRAMMENI 1F93; F; 1F23 03B9; # GREEK SMALL LETTER ETA WITH DASIA AND VARIA AND YPOGEGRAMMENI 1F94; F; 1F24 03B9; # GREEK SMALL LETTER ETA WITH PSILI AND OXIA AND YPOGEGRAMMENI 1F95; F; 1F25 03B9; # GREEK SMALL LETTER ETA WITH DASIA AND OXIA AND YPOGEGRAMMENI 1F96; F; 1F26 03B9; # GREEK SMALL LETTER ETA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI 1F97; F; 1F27 03B9; # GREEK SMALL LETTER ETA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI 1F98; F; 1F20 03B9; # GREEK CAPITAL LETTER ETA WITH PSILI AND PROSGEGRAMMENI 1F98; S; 1F90; # GREEK CAPITAL LETTER ETA WITH PSILI AND PROSGEGRAMMENI 1F99; F; 1F21 03B9; # GREEK CAPITAL LETTER ETA WITH DASIA AND PROSGEGRAMMENI 1F99; S; 1F91; # GREEK CAPITAL LETTER ETA WITH DASIA AND PROSGEGRAMMENI 1F9A; F; 1F22 03B9; # GREEK CAPITAL LETTER ETA WITH PSILI AND VARIA AND PROSGEGRAMMENI 1F9A; S; 1F92; # GREEK CAPITAL LETTER ETA WITH PSILI AND VARIA AND PROSGEGRAMMENI 1F9B; F; 1F23 03B9; # GREEK CAPITAL LETTER ETA WITH DASIA AND VARIA AND PROSGEGRAMMENI 1F9B; S; 1F93; # GREEK CAPITAL LETTER ETA WITH DASIA AND VARIA AND PROSGEGRAMMENI 1F9C; F; 
1F24 03B9; # GREEK CAPITAL LETTER ETA WITH PSILI AND OXIA AND PROSGEGRAMMENI 1F9C; S; 1F94; # GREEK CAPITAL LETTER ETA WITH PSILI AND OXIA AND PROSGEGRAMMENI 1F9D; F; 1F25 03B9; # GREEK CAPITAL LETTER ETA WITH DASIA AND OXIA AND PROSGEGRAMMENI 1F9D; S; 1F95; # GREEK CAPITAL LETTER ETA WITH DASIA AND OXIA AND PROSGEGRAMMENI 1F9E; F; 1F26 03B9; # GREEK CAPITAL LETTER ETA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI 1F9E; S; 1F96; # GREEK CAPITAL LETTER ETA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI 1F9F; F; 1F27 03B9; # GREEK CAPITAL LETTER ETA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI 1F9F; S; 1F97; # GREEK CAPITAL LETTER ETA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI 1FA0; F; 1F60 03B9; # GREEK SMALL LETTER OMEGA WITH PSILI AND YPOGEGRAMMENI 1FA1; F; 1F61 03B9; # GREEK SMALL LETTER OMEGA WITH DASIA AND YPOGEGRAMMENI 1FA2; F; 1F62 03B9; # GREEK SMALL LETTER OMEGA WITH PSILI AND VARIA AND YPOGEGRAMMENI 1FA3; F; 1F63 03B9; # GREEK SMALL LETTER OMEGA WITH DASIA AND VARIA AND YPOGEGRAMMENI 1FA4; F; 1F64 03B9; # GREEK SMALL LETTER OMEGA WITH PSILI AND OXIA AND YPOGEGRAMMENI 1FA5; F; 1F65 03B9; # GREEK SMALL LETTER OMEGA WITH DASIA AND OXIA AND YPOGEGRAMMENI 1FA6; F; 1F66 03B9; # GREEK SMALL LETTER OMEGA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI 1FA7; F; 1F67 03B9; # GREEK SMALL LETTER OMEGA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI 1FA8; F; 1F60 03B9; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND PROSGEGRAMMENI 1FA8; S; 1FA0; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND PROSGEGRAMMENI 1FA9; F; 1F61 03B9; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND PROSGEGRAMMENI 1FA9; S; 1FA1; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND PROSGEGRAMMENI 1FAA; F; 1F62 03B9; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND VARIA AND PROSGEGRAMMENI 1FAA; S; 1FA2; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND VARIA AND PROSGEGRAMMENI 1FAB; F; 1F63 03B9; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND VARIA AND PROSGEGRAMMENI 1FAB; S; 1FA3; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND VARIA 
AND PROSGEGRAMMENI 1FAC; F; 1F64 03B9; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND OXIA AND PROSGEGRAMMENI 1FAC; S; 1FA4; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND OXIA AND PROSGEGRAMMENI 1FAD; F; 1F65 03B9; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND OXIA AND PROSGEGRAMMENI 1FAD; S; 1FA5; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND OXIA AND PROSGEGRAMMENI 1FAE; F; 1F66 03B9; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI 1FAE; S; 1FA6; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI 1FAF; F; 1F67 03B9; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI 1FAF; S; 1FA7; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI 1FB2; F; 1F70 03B9; # GREEK SMALL LETTER ALPHA WITH VARIA AND YPOGEGRAMMENI 1FB3; F; 03B1 03B9; # GREEK SMALL LETTER ALPHA WITH YPOGEGRAMMENI 1FB4; F; 03AC 03B9; # GREEK SMALL LETTER ALPHA WITH OXIA AND YPOGEGRAMMENI 1FB6; F; 03B1 0342; # GREEK SMALL LETTER ALPHA WITH PERISPOMENI 1FB7; F; 03B1 0342 03B9; # GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI 1FB8; C; 1FB0; # GREEK CAPITAL LETTER ALPHA WITH VRACHY 1FB9; C; 1FB1; # GREEK CAPITAL LETTER ALPHA WITH MACRON 1FBA; C; 1F70; # GREEK CAPITAL LETTER ALPHA WITH VARIA 1FBB; C; 1F71; # GREEK CAPITAL LETTER ALPHA WITH OXIA 1FBC; F; 03B1 03B9; # GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI 1FBC; S; 1FB3; # GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI 1FBE; C; 03B9; # GREEK PROSGEGRAMMENI 1FC2; F; 1F74 03B9; # GREEK SMALL LETTER ETA WITH VARIA AND YPOGEGRAMMENI 1FC3; F; 03B7 03B9; # GREEK SMALL LETTER ETA WITH YPOGEGRAMMENI 1FC4; F; 03AE 03B9; # GREEK SMALL LETTER ETA WITH OXIA AND YPOGEGRAMMENI 1FC6; F; 03B7 0342; # GREEK SMALL LETTER ETA WITH PERISPOMENI 1FC7; F; 03B7 0342 03B9; # GREEK SMALL LETTER ETA WITH PERISPOMENI AND YPOGEGRAMMENI 1FC8; C; 1F72; # GREEK CAPITAL LETTER EPSILON WITH VARIA 1FC9; C; 1F73; # GREEK CAPITAL LETTER EPSILON WITH OXIA 1FCA; C; 1F74; # GREEK CAPITAL LETTER 
ETA WITH VARIA 1FCB; C; 1F75; # GREEK CAPITAL LETTER ETA WITH OXIA 1FCC; F; 03B7 03B9; # GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI 1FCC; S; 1FC3; # GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI 1FD2; F; 03B9 0308 0300; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND VARIA 1FD3; F; 03B9 0308 0301; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA 1FD3; S; 0390; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA 1FD6; F; 03B9 0342; # GREEK SMALL LETTER IOTA WITH PERISPOMENI 1FD7; F; 03B9 0308 0342; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND PERISPOMENI 1FD8; C; 1FD0; # GREEK CAPITAL LETTER IOTA WITH VRACHY 1FD9; C; 1FD1; # GREEK CAPITAL LETTER IOTA WITH MACRON 1FDA; C; 1F76; # GREEK CAPITAL LETTER IOTA WITH VARIA 1FDB; C; 1F77; # GREEK CAPITAL LETTER IOTA WITH OXIA 1FE2; F; 03C5 0308 0300; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND VARIA 1FE3; F; 03C5 0308 0301; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA 1FE3; S; 03B0; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA 1FE4; F; 03C1 0313; # GREEK SMALL LETTER RHO WITH PSILI 1FE6; F; 03C5 0342; # GREEK SMALL LETTER UPSILON WITH PERISPOMENI 1FE7; F; 03C5 0308 0342; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND PERISPOMENI 1FE8; C; 1FE0; # GREEK CAPITAL LETTER UPSILON WITH VRACHY 1FE9; C; 1FE1; # GREEK CAPITAL LETTER UPSILON WITH MACRON 1FEA; C; 1F7A; # GREEK CAPITAL LETTER UPSILON WITH VARIA 1FEB; C; 1F7B; # GREEK CAPITAL LETTER UPSILON WITH OXIA 1FEC; C; 1FE5; # GREEK CAPITAL LETTER RHO WITH DASIA 1FF2; F; 1F7C 03B9; # GREEK SMALL LETTER OMEGA WITH VARIA AND YPOGEGRAMMENI 1FF3; F; 03C9 03B9; # GREEK SMALL LETTER OMEGA WITH YPOGEGRAMMENI 1FF4; F; 03CE 03B9; # GREEK SMALL LETTER OMEGA WITH OXIA AND YPOGEGRAMMENI 1FF6; F; 03C9 0342; # GREEK SMALL LETTER OMEGA WITH PERISPOMENI 1FF7; F; 03C9 0342 03B9; # GREEK SMALL LETTER OMEGA WITH PERISPOMENI AND YPOGEGRAMMENI 1FF8; C; 1F78; # GREEK CAPITAL LETTER OMICRON WITH VARIA 1FF9; C; 1F79; # GREEK CAPITAL LETTER OMICRON WITH OXIA 1FFA; C; 1F7C; # GREEK CAPITAL 
LETTER OMEGA WITH VARIA 1FFB; C; 1F7D; # GREEK CAPITAL LETTER OMEGA WITH OXIA 1FFC; F; 03C9 03B9; # GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI 1FFC; S; 1FF3; # GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI 2126; C; 03C9; # OHM SIGN 212A; C; 006B; # KELVIN SIGN 212B; C; 00E5; # ANGSTROM SIGN 2132; C; 214E; # TURNED CAPITAL F 2160; C; 2170; # ROMAN NUMERAL ONE 2161; C; 2171; # ROMAN NUMERAL TWO 2162; C; 2172; # ROMAN NUMERAL THREE 2163; C; 2173; # ROMAN NUMERAL FOUR 2164; C; 2174; # ROMAN NUMERAL FIVE 2165; C; 2175; # ROMAN NUMERAL SIX 2166; C; 2176; # ROMAN NUMERAL SEVEN 2167; C; 2177; # ROMAN NUMERAL EIGHT 2168; C; 2178; # ROMAN NUMERAL NINE 2169; C; 2179; # ROMAN NUMERAL TEN 216A; C; 217A; # ROMAN NUMERAL ELEVEN 216B; C; 217B; # ROMAN NUMERAL TWELVE 216C; C; 217C; # ROMAN NUMERAL FIFTY 216D; C; 217D; # ROMAN NUMERAL ONE HUNDRED 216E; C; 217E; # ROMAN NUMERAL FIVE HUNDRED 216F; C; 217F; # ROMAN NUMERAL ONE THOUSAND 2183; C; 2184; # ROMAN NUMERAL REVERSED ONE HUNDRED 24B6; C; 24D0; # CIRCLED LATIN CAPITAL LETTER A 24B7; C; 24D1; # CIRCLED LATIN CAPITAL LETTER B 24B8; C; 24D2; # CIRCLED LATIN CAPITAL LETTER C 24B9; C; 24D3; # CIRCLED LATIN CAPITAL LETTER D 24BA; C; 24D4; # CIRCLED LATIN CAPITAL LETTER E 24BB; C; 24D5; # CIRCLED LATIN CAPITAL LETTER F 24BC; C; 24D6; # CIRCLED LATIN CAPITAL LETTER G 24BD; C; 24D7; # CIRCLED LATIN CAPITAL LETTER H 24BE; C; 24D8; # CIRCLED LATIN CAPITAL LETTER I 24BF; C; 24D9; # CIRCLED LATIN CAPITAL LETTER J 24C0; C; 24DA; # CIRCLED LATIN CAPITAL LETTER K 24C1; C; 24DB; # CIRCLED LATIN CAPITAL LETTER L 24C2; C; 24DC; # CIRCLED LATIN CAPITAL LETTER M 24C3; C; 24DD; # CIRCLED LATIN CAPITAL LETTER N 24C4; C; 24DE; # CIRCLED LATIN CAPITAL LETTER O 24C5; C; 24DF; # CIRCLED LATIN CAPITAL LETTER P 24C6; C; 24E0; # CIRCLED LATIN CAPITAL LETTER Q 24C7; C; 24E1; # CIRCLED LATIN CAPITAL LETTER R 24C8; C; 24E2; # CIRCLED LATIN CAPITAL LETTER S 24C9; C; 24E3; # CIRCLED LATIN CAPITAL LETTER T 24CA; C; 24E4; # CIRCLED LATIN CAPITAL LETTER U 24CB; 
C; 24E5; # CIRCLED LATIN CAPITAL LETTER V 24CC; C; 24E6; # CIRCLED LATIN CAPITAL LETTER W 24CD; C; 24E7; # CIRCLED LATIN CAPITAL LETTER X 24CE; C; 24E8; # CIRCLED LATIN CAPITAL LETTER Y 24CF; C; 24E9; # CIRCLED LATIN CAPITAL LETTER Z 2C00; C; 2C30; # GLAGOLITIC CAPITAL LETTER AZU 2C01; C; 2C31; # GLAGOLITIC CAPITAL LETTER BUKY 2C02; C; 2C32; # GLAGOLITIC CAPITAL LETTER VEDE 2C03; C; 2C33; # GLAGOLITIC CAPITAL LETTER GLAGOLI 2C04; C; 2C34; # GLAGOLITIC CAPITAL LETTER DOBRO 2C05; C; 2C35; # GLAGOLITIC CAPITAL LETTER YESTU 2C06; C; 2C36; # GLAGOLITIC CAPITAL LETTER ZHIVETE 2C07; C; 2C37; # GLAGOLITIC CAPITAL LETTER DZELO 2C08; C; 2C38; # GLAGOLITIC CAPITAL LETTER ZEMLJA 2C09; C; 2C39; # GLAGOLITIC CAPITAL LETTER IZHE 2C0A; C; 2C3A; # GLAGOLITIC CAPITAL LETTER INITIAL IZHE 2C0B; C; 2C3B; # GLAGOLITIC CAPITAL LETTER I 2C0C; C; 2C3C; # GLAGOLITIC CAPITAL LETTER DJERVI 2C0D; C; 2C3D; # GLAGOLITIC CAPITAL LETTER KAKO 2C0E; C; 2C3E; # GLAGOLITIC CAPITAL LETTER LJUDIJE 2C0F; C; 2C3F; # GLAGOLITIC CAPITAL LETTER MYSLITE 2C10; C; 2C40; # GLAGOLITIC CAPITAL LETTER NASHI 2C11; C; 2C41; # GLAGOLITIC CAPITAL LETTER ONU 2C12; C; 2C42; # GLAGOLITIC CAPITAL LETTER POKOJI 2C13; C; 2C43; # GLAGOLITIC CAPITAL LETTER RITSI 2C14; C; 2C44; # GLAGOLITIC CAPITAL LETTER SLOVO 2C15; C; 2C45; # GLAGOLITIC CAPITAL LETTER TVRIDO 2C16; C; 2C46; # GLAGOLITIC CAPITAL LETTER UKU 2C17; C; 2C47; # GLAGOLITIC CAPITAL LETTER FRITU 2C18; C; 2C48; # GLAGOLITIC CAPITAL LETTER HERU 2C19; C; 2C49; # GLAGOLITIC CAPITAL LETTER OTU 2C1A; C; 2C4A; # GLAGOLITIC CAPITAL LETTER PE 2C1B; C; 2C4B; # GLAGOLITIC CAPITAL LETTER SHTA 2C1C; C; 2C4C; # GLAGOLITIC CAPITAL LETTER TSI 2C1D; C; 2C4D; # GLAGOLITIC CAPITAL LETTER CHRIVI 2C1E; C; 2C4E; # GLAGOLITIC CAPITAL LETTER SHA 2C1F; C; 2C4F; # GLAGOLITIC CAPITAL LETTER YERU 2C20; C; 2C50; # GLAGOLITIC CAPITAL LETTER YERI 2C21; C; 2C51; # GLAGOLITIC CAPITAL LETTER YATI 2C22; C; 2C52; # GLAGOLITIC CAPITAL LETTER SPIDERY HA 2C23; C; 2C53; # GLAGOLITIC CAPITAL LETTER YU 2C24; 
C; 2C54; # GLAGOLITIC CAPITAL LETTER SMALL YUS 2C25; C; 2C55; # GLAGOLITIC CAPITAL LETTER SMALL YUS WITH TAIL 2C26; C; 2C56; # GLAGOLITIC CAPITAL LETTER YO 2C27; C; 2C57; # GLAGOLITIC CAPITAL LETTER IOTATED SMALL YUS 2C28; C; 2C58; # GLAGOLITIC CAPITAL LETTER BIG YUS 2C29; C; 2C59; # GLAGOLITIC CAPITAL LETTER IOTATED BIG YUS 2C2A; C; 2C5A; # GLAGOLITIC CAPITAL LETTER FITA 2C2B; C; 2C5B; # GLAGOLITIC CAPITAL LETTER IZHITSA 2C2C; C; 2C5C; # GLAGOLITIC CAPITAL LETTER SHTAPIC 2C2D; C; 2C5D; # GLAGOLITIC CAPITAL LETTER TROKUTASTI A 2C2E; C; 2C5E; # GLAGOLITIC CAPITAL LETTER LATINATE MYSLITE 2C2F; C; 2C5F; # GLAGOLITIC CAPITAL LETTER CAUDATE CHRIVI 2C60; C; 2C61; # LATIN CAPITAL LETTER L WITH DOUBLE BAR 2C62; C; 026B; # LATIN CAPITAL LETTER L WITH MIDDLE TILDE 2C63; C; 1D7D; # LATIN CAPITAL LETTER P WITH STROKE 2C64; C; 027D; # LATIN CAPITAL LETTER R WITH TAIL 2C67; C; 2C68; # LATIN CAPITAL LETTER H WITH DESCENDER 2C69; C; 2C6A; # LATIN CAPITAL LETTER K WITH DESCENDER 2C6B; C; 2C6C; # LATIN CAPITAL LETTER Z WITH DESCENDER 2C6D; C; 0251; # LATIN CAPITAL LETTER ALPHA 2C6E; C; 0271; # LATIN CAPITAL LETTER M WITH HOOK 2C6F; C; 0250; # LATIN CAPITAL LETTER TURNED A 2C70; C; 0252; # LATIN CAPITAL LETTER TURNED ALPHA 2C72; C; 2C73; # LATIN CAPITAL LETTER W WITH HOOK 2C75; C; 2C76; # LATIN CAPITAL LETTER HALF H 2C7E; C; 023F; # LATIN CAPITAL LETTER S WITH SWASH TAIL 2C7F; C; 0240; # LATIN CAPITAL LETTER Z WITH SWASH TAIL 2C80; C; 2C81; # COPTIC CAPITAL LETTER ALFA 2C82; C; 2C83; # COPTIC CAPITAL LETTER VIDA 2C84; C; 2C85; # COPTIC CAPITAL LETTER GAMMA 2C86; C; 2C87; # COPTIC CAPITAL LETTER DALDA 2C88; C; 2C89; # COPTIC CAPITAL LETTER EIE 2C8A; C; 2C8B; # COPTIC CAPITAL LETTER SOU 2C8C; C; 2C8D; # COPTIC CAPITAL LETTER ZATA 2C8E; C; 2C8F; # COPTIC CAPITAL LETTER HATE 2C90; C; 2C91; # COPTIC CAPITAL LETTER THETHE 2C92; C; 2C93; # COPTIC CAPITAL LETTER IAUDA 2C94; C; 2C95; # COPTIC CAPITAL LETTER KAPA 2C96; C; 2C97; # COPTIC CAPITAL LETTER LAULA 2C98; C; 2C99; # COPTIC CAPITAL 
LETTER MI 2C9A; C; 2C9B; # COPTIC CAPITAL LETTER NI 2C9C; C; 2C9D; # COPTIC CAPITAL LETTER KSI 2C9E; C; 2C9F; # COPTIC CAPITAL LETTER O 2CA0; C; 2CA1; # COPTIC CAPITAL LETTER PI 2CA2; C; 2CA3; # COPTIC CAPITAL LETTER RO 2CA4; C; 2CA5; # COPTIC CAPITAL LETTER SIMA 2CA6; C; 2CA7; # COPTIC CAPITAL LETTER TAU 2CA8; C; 2CA9; # COPTIC CAPITAL LETTER UA 2CAA; C; 2CAB; # COPTIC CAPITAL LETTER FI 2CAC; C; 2CAD; # COPTIC CAPITAL LETTER KHI 2CAE; C; 2CAF; # COPTIC CAPITAL LETTER PSI 2CB0; C; 2CB1; # COPTIC CAPITAL LETTER OOU 2CB2; C; 2CB3; # COPTIC CAPITAL LETTER DIALECT-P ALEF 2CB4; C; 2CB5; # COPTIC CAPITAL LETTER OLD COPTIC AIN 2CB6; C; 2CB7; # COPTIC CAPITAL LETTER CRYPTOGRAMMIC EIE 2CB8; C; 2CB9; # COPTIC CAPITAL LETTER DIALECT-P KAPA 2CBA; C; 2CBB; # COPTIC CAPITAL LETTER DIALECT-P NI 2CBC; C; 2CBD; # COPTIC CAPITAL LETTER CRYPTOGRAMMIC NI 2CBE; C; 2CBF; # COPTIC CAPITAL LETTER OLD COPTIC OOU 2CC0; C; 2CC1; # COPTIC CAPITAL LETTER SAMPI 2CC2; C; 2CC3; # COPTIC CAPITAL LETTER CROSSED SHEI 2CC4; C; 2CC5; # COPTIC CAPITAL LETTER OLD COPTIC SHEI 2CC6; C; 2CC7; # COPTIC CAPITAL LETTER OLD COPTIC ESH 2CC8; C; 2CC9; # COPTIC CAPITAL LETTER AKHMIMIC KHEI 2CCA; C; 2CCB; # COPTIC CAPITAL LETTER DIALECT-P HORI 2CCC; C; 2CCD; # COPTIC CAPITAL LETTER OLD COPTIC HORI 2CCE; C; 2CCF; # COPTIC CAPITAL LETTER OLD COPTIC HA 2CD0; C; 2CD1; # COPTIC CAPITAL LETTER L-SHAPED HA 2CD2; C; 2CD3; # COPTIC CAPITAL LETTER OLD COPTIC HEI 2CD4; C; 2CD5; # COPTIC CAPITAL LETTER OLD COPTIC HAT 2CD6; C; 2CD7; # COPTIC CAPITAL LETTER OLD COPTIC GANGIA 2CD8; C; 2CD9; # COPTIC CAPITAL LETTER OLD COPTIC DJA 2CDA; C; 2CDB; # COPTIC CAPITAL LETTER OLD COPTIC SHIMA 2CDC; C; 2CDD; # COPTIC CAPITAL LETTER OLD NUBIAN SHIMA 2CDE; C; 2CDF; # COPTIC CAPITAL LETTER OLD NUBIAN NGI 2CE0; C; 2CE1; # COPTIC CAPITAL LETTER OLD NUBIAN NYI 2CE2; C; 2CE3; # COPTIC CAPITAL LETTER OLD NUBIAN WAU 2CEB; C; 2CEC; # COPTIC CAPITAL LETTER CRYPTOGRAMMIC SHEI 2CED; C; 2CEE; # COPTIC CAPITAL LETTER CRYPTOGRAMMIC GANGIA 2CF2; C; 2CF3; 
# COPTIC CAPITAL LETTER BOHAIRIC KHEI A640; C; A641; # CYRILLIC CAPITAL LETTER ZEMLYA A642; C; A643; # CYRILLIC CAPITAL LETTER DZELO A644; C; A645; # CYRILLIC CAPITAL LETTER REVERSED DZE A646; C; A647; # CYRILLIC CAPITAL LETTER IOTA A648; C; A649; # CYRILLIC CAPITAL LETTER DJERV A64A; C; A64B; # CYRILLIC CAPITAL LETTER MONOGRAPH UK A64C; C; A64D; # CYRILLIC CAPITAL LETTER BROAD OMEGA A64E; C; A64F; # CYRILLIC CAPITAL LETTER NEUTRAL YER A650; C; A651; # CYRILLIC CAPITAL LETTER YERU WITH BACK YER A652; C; A653; # CYRILLIC CAPITAL LETTER IOTIFIED YAT A654; C; A655; # CYRILLIC CAPITAL LETTER REVERSED YU A656; C; A657; # CYRILLIC CAPITAL LETTER IOTIFIED A A658; C; A659; # CYRILLIC CAPITAL LETTER CLOSED LITTLE YUS A65A; C; A65B; # CYRILLIC CAPITAL LETTER BLENDED YUS A65C; C; A65D; # CYRILLIC CAPITAL LETTER IOTIFIED CLOSED LITTLE YUS A65E; C; A65F; # CYRILLIC CAPITAL LETTER YN A660; C; A661; # CYRILLIC CAPITAL LETTER REVERSED TSE A662; C; A663; # CYRILLIC CAPITAL LETTER SOFT DE A664; C; A665; # CYRILLIC CAPITAL LETTER SOFT EL A666; C; A667; # CYRILLIC CAPITAL LETTER SOFT EM A668; C; A669; # CYRILLIC CAPITAL LETTER MONOCULAR O A66A; C; A66B; # CYRILLIC CAPITAL LETTER BINOCULAR O A66C; C; A66D; # CYRILLIC CAPITAL LETTER DOUBLE MONOCULAR O A680; C; A681; # CYRILLIC CAPITAL LETTER DWE A682; C; A683; # CYRILLIC CAPITAL LETTER DZWE A684; C; A685; # CYRILLIC CAPITAL LETTER ZHWE A686; C; A687; # CYRILLIC CAPITAL LETTER CCHE A688; C; A689; # CYRILLIC CAPITAL LETTER DZZE A68A; C; A68B; # CYRILLIC CAPITAL LETTER TE WITH MIDDLE HOOK A68C; C; A68D; # CYRILLIC CAPITAL LETTER TWE A68E; C; A68F; # CYRILLIC CAPITAL LETTER TSWE A690; C; A691; # CYRILLIC CAPITAL LETTER TSSE A692; C; A693; # CYRILLIC CAPITAL LETTER TCHE A694; C; A695; # CYRILLIC CAPITAL LETTER HWE A696; C; A697; # CYRILLIC CAPITAL LETTER SHWE A698; C; A699; # CYRILLIC CAPITAL LETTER DOUBLE O A69A; C; A69B; # CYRILLIC CAPITAL LETTER CROSSED O A722; C; A723; # LATIN CAPITAL LETTER EGYPTOLOGICAL ALEF A724; C; A725; # LATIN 
CAPITAL LETTER EGYPTOLOGICAL AIN A726; C; A727; # LATIN CAPITAL LETTER HENG A728; C; A729; # LATIN CAPITAL LETTER TZ A72A; C; A72B; # LATIN CAPITAL LETTER TRESILLO A72C; C; A72D; # LATIN CAPITAL LETTER CUATRILLO A72E; C; A72F; # LATIN CAPITAL LETTER CUATRILLO WITH COMMA A732; C; A733; # LATIN CAPITAL LETTER AA A734; C; A735; # LATIN CAPITAL LETTER AO A736; C; A737; # LATIN CAPITAL LETTER AU A738; C; A739; # LATIN CAPITAL LETTER AV A73A; C; A73B; # LATIN CAPITAL LETTER AV WITH HORIZONTAL BAR A73C; C; A73D; # LATIN CAPITAL LETTER AY A73E; C; A73F; # LATIN CAPITAL LETTER REVERSED C WITH DOT A740; C; A741; # LATIN CAPITAL LETTER K WITH STROKE A742; C; A743; # LATIN CAPITAL LETTER K WITH DIAGONAL STROKE A744; C; A745; # LATIN CAPITAL LETTER K WITH STROKE AND DIAGONAL STROKE A746; C; A747; # LATIN CAPITAL LETTER BROKEN L A748; C; A749; # LATIN CAPITAL LETTER L WITH HIGH STROKE A74A; C; A74B; # LATIN CAPITAL LETTER O WITH LONG STROKE OVERLAY A74C; C; A74D; # LATIN CAPITAL LETTER O WITH LOOP A74E; C; A74F; # LATIN CAPITAL LETTER OO A750; C; A751; # LATIN CAPITAL LETTER P WITH STROKE THROUGH DESCENDER A752; C; A753; # LATIN CAPITAL LETTER P WITH FLOURISH A754; C; A755; # LATIN CAPITAL LETTER P WITH SQUIRREL TAIL A756; C; A757; # LATIN CAPITAL LETTER Q WITH STROKE THROUGH DESCENDER A758; C; A759; # LATIN CAPITAL LETTER Q WITH DIAGONAL STROKE A75A; C; A75B; # LATIN CAPITAL LETTER R ROTUNDA A75C; C; A75D; # LATIN CAPITAL LETTER RUM ROTUNDA A75E; C; A75F; # LATIN CAPITAL LETTER V WITH DIAGONAL STROKE A760; C; A761; # LATIN CAPITAL LETTER VY A762; C; A763; # LATIN CAPITAL LETTER VISIGOTHIC Z A764; C; A765; # LATIN CAPITAL LETTER THORN WITH STROKE A766; C; A767; # LATIN CAPITAL LETTER THORN WITH STROKE THROUGH DESCENDER A768; C; A769; # LATIN CAPITAL LETTER VEND A76A; C; A76B; # LATIN CAPITAL LETTER ET A76C; C; A76D; # LATIN CAPITAL LETTER IS A76E; C; A76F; # LATIN CAPITAL LETTER CON A779; C; A77A; # LATIN CAPITAL LETTER INSULAR D A77B; C; A77C; # LATIN CAPITAL LETTER INSULAR F 
A77D; C; 1D79; # LATIN CAPITAL LETTER INSULAR G A77E; C; A77F; # LATIN CAPITAL LETTER TURNED INSULAR G A780; C; A781; # LATIN CAPITAL LETTER TURNED L A782; C; A783; # LATIN CAPITAL LETTER INSULAR R A784; C; A785; # LATIN CAPITAL LETTER INSULAR S A786; C; A787; # LATIN CAPITAL LETTER INSULAR T A78B; C; A78C; # LATIN CAPITAL LETTER SALTILLO A78D; C; 0265; # LATIN CAPITAL LETTER TURNED H A790; C; A791; # LATIN CAPITAL LETTER N WITH DESCENDER A792; C; A793; # LATIN CAPITAL LETTER C WITH BAR A796; C; A797; # LATIN CAPITAL LETTER B WITH FLOURISH A798; C; A799; # LATIN CAPITAL LETTER F WITH STROKE A79A; C; A79B; # LATIN CAPITAL LETTER VOLAPUK AE A79C; C; A79D; # LATIN CAPITAL LETTER VOLAPUK OE A79E; C; A79F; # LATIN CAPITAL LETTER VOLAPUK UE A7A0; C; A7A1; # LATIN CAPITAL LETTER G WITH OBLIQUE STROKE A7A2; C; A7A3; # LATIN CAPITAL LETTER K WITH OBLIQUE STROKE A7A4; C; A7A5; # LATIN CAPITAL LETTER N WITH OBLIQUE STROKE A7A6; C; A7A7; # LATIN CAPITAL LETTER R WITH OBLIQUE STROKE A7A8; C; A7A9; # LATIN CAPITAL LETTER S WITH OBLIQUE STROKE A7AA; C; 0266; # LATIN CAPITAL LETTER H WITH HOOK A7AB; C; 025C; # LATIN CAPITAL LETTER REVERSED OPEN E A7AC; C; 0261; # LATIN CAPITAL LETTER SCRIPT G A7AD; C; 026C; # LATIN CAPITAL LETTER L WITH BELT A7AE; C; 026A; # LATIN CAPITAL LETTER SMALL CAPITAL I A7B0; C; 029E; # LATIN CAPITAL LETTER TURNED K A7B1; C; 0287; # LATIN CAPITAL LETTER TURNED T A7B2; C; 029D; # LATIN CAPITAL LETTER J WITH CROSSED-TAIL A7B3; C; AB53; # LATIN CAPITAL LETTER CHI A7B4; C; A7B5; # LATIN CAPITAL LETTER BETA A7B6; C; A7B7; # LATIN CAPITAL LETTER OMEGA A7B8; C; A7B9; # LATIN CAPITAL LETTER U WITH STROKE A7BA; C; A7BB; # LATIN CAPITAL LETTER GLOTTAL A A7BC; C; A7BD; # LATIN CAPITAL LETTER GLOTTAL I A7BE; C; A7BF; # LATIN CAPITAL LETTER GLOTTAL U A7C0; C; A7C1; # LATIN CAPITAL LETTER OLD POLISH O A7C2; C; A7C3; # LATIN CAPITAL LETTER ANGLICANA W A7C4; C; A794; # LATIN CAPITAL LETTER C WITH PALATAL HOOK A7C5; C; 0282; # LATIN CAPITAL LETTER S WITH HOOK A7C6; C; 
1D8E; # LATIN CAPITAL LETTER Z WITH PALATAL HOOK A7C7; C; A7C8; # LATIN CAPITAL LETTER D WITH SHORT STROKE OVERLAY A7C9; C; A7CA; # LATIN CAPITAL LETTER S WITH SHORT STROKE OVERLAY A7CB; C; 0264; # LATIN CAPITAL LETTER RAMS HORN A7CC; C; A7CD; # LATIN CAPITAL LETTER S WITH DIAGONAL STROKE A7CE; C; A7CF; # LATIN CAPITAL LETTER PHARYNGEAL VOICED FRICATIVE A7D0; C; A7D1; # LATIN CAPITAL LETTER CLOSED INSULAR G A7D2; C; A7D3; # LATIN CAPITAL LETTER DOUBLE THORN A7D4; C; A7D5; # LATIN CAPITAL LETTER DOUBLE WYNN A7D6; C; A7D7; # LATIN CAPITAL LETTER MIDDLE SCOTS S A7D8; C; A7D9; # LATIN CAPITAL LETTER SIGMOID S A7DA; C; A7DB; # LATIN CAPITAL LETTER LAMBDA A7DC; C; 019B; # LATIN CAPITAL LETTER LAMBDA WITH STROKE A7F5; C; A7F6; # LATIN CAPITAL LETTER REVERSED HALF H AB70; C; 13A0; # CHEROKEE SMALL LETTER A AB71; C; 13A1; # CHEROKEE SMALL LETTER E AB72; C; 13A2; # CHEROKEE SMALL LETTER I AB73; C; 13A3; # CHEROKEE SMALL LETTER O AB74; C; 13A4; # CHEROKEE SMALL LETTER U AB75; C; 13A5; # CHEROKEE SMALL LETTER V AB76; C; 13A6; # CHEROKEE SMALL LETTER GA AB77; C; 13A7; # CHEROKEE SMALL LETTER KA AB78; C; 13A8; # CHEROKEE SMALL LETTER GE AB79; C; 13A9; # CHEROKEE SMALL LETTER GI AB7A; C; 13AA; # CHEROKEE SMALL LETTER GO AB7B; C; 13AB; # CHEROKEE SMALL LETTER GU AB7C; C; 13AC; # CHEROKEE SMALL LETTER GV AB7D; C; 13AD; # CHEROKEE SMALL LETTER HA AB7E; C; 13AE; # CHEROKEE SMALL LETTER HE AB7F; C; 13AF; # CHEROKEE SMALL LETTER HI AB80; C; 13B0; # CHEROKEE SMALL LETTER HO AB81; C; 13B1; # CHEROKEE SMALL LETTER HU AB82; C; 13B2; # CHEROKEE SMALL LETTER HV AB83; C; 13B3; # CHEROKEE SMALL LETTER LA AB84; C; 13B4; # CHEROKEE SMALL LETTER LE AB85; C; 13B5; # CHEROKEE SMALL LETTER LI AB86; C; 13B6; # CHEROKEE SMALL LETTER LO AB87; C; 13B7; # CHEROKEE SMALL LETTER LU AB88; C; 13B8; # CHEROKEE SMALL LETTER LV AB89; C; 13B9; # CHEROKEE SMALL LETTER MA AB8A; C; 13BA; # CHEROKEE SMALL LETTER ME AB8B; C; 13BB; # CHEROKEE SMALL LETTER MI AB8C; C; 13BC; # CHEROKEE SMALL LETTER MO AB8D; C; 13BD; # 
CHEROKEE SMALL LETTER MU AB8E; C; 13BE; # CHEROKEE SMALL LETTER NA AB8F; C; 13BF; # CHEROKEE SMALL LETTER HNA AB90; C; 13C0; # CHEROKEE SMALL LETTER NAH AB91; C; 13C1; # CHEROKEE SMALL LETTER NE AB92; C; 13C2; # CHEROKEE SMALL LETTER NI AB93; C; 13C3; # CHEROKEE SMALL LETTER NO AB94; C; 13C4; # CHEROKEE SMALL LETTER NU AB95; C; 13C5; # CHEROKEE SMALL LETTER NV AB96; C; 13C6; # CHEROKEE SMALL LETTER QUA AB97; C; 13C7; # CHEROKEE SMALL LETTER QUE AB98; C; 13C8; # CHEROKEE SMALL LETTER QUI AB99; C; 13C9; # CHEROKEE SMALL LETTER QUO AB9A; C; 13CA; # CHEROKEE SMALL LETTER QUU AB9B; C; 13CB; # CHEROKEE SMALL LETTER QUV AB9C; C; 13CC; # CHEROKEE SMALL LETTER SA AB9D; C; 13CD; # CHEROKEE SMALL LETTER S AB9E; C; 13CE; # CHEROKEE SMALL LETTER SE AB9F; C; 13CF; # CHEROKEE SMALL LETTER SI ABA0; C; 13D0; # CHEROKEE SMALL LETTER SO ABA1; C; 13D1; # CHEROKEE SMALL LETTER SU ABA2; C; 13D2; # CHEROKEE SMALL LETTER SV ABA3; C; 13D3; # CHEROKEE SMALL LETTER DA ABA4; C; 13D4; # CHEROKEE SMALL LETTER TA ABA5; C; 13D5; # CHEROKEE SMALL LETTER DE ABA6; C; 13D6; # CHEROKEE SMALL LETTER TE ABA7; C; 13D7; # CHEROKEE SMALL LETTER DI ABA8; C; 13D8; # CHEROKEE SMALL LETTER TI ABA9; C; 13D9; # CHEROKEE SMALL LETTER DO ABAA; C; 13DA; # CHEROKEE SMALL LETTER DU ABAB; C; 13DB; # CHEROKEE SMALL LETTER DV ABAC; C; 13DC; # CHEROKEE SMALL LETTER DLA ABAD; C; 13DD; # CHEROKEE SMALL LETTER TLA ABAE; C; 13DE; # CHEROKEE SMALL LETTER TLE ABAF; C; 13DF; # CHEROKEE SMALL LETTER TLI ABB0; C; 13E0; # CHEROKEE SMALL LETTER TLO ABB1; C; 13E1; # CHEROKEE SMALL LETTER TLU ABB2; C; 13E2; # CHEROKEE SMALL LETTER TLV ABB3; C; 13E3; # CHEROKEE SMALL LETTER TSA ABB4; C; 13E4; # CHEROKEE SMALL LETTER TSE ABB5; C; 13E5; # CHEROKEE SMALL LETTER TSI ABB6; C; 13E6; # CHEROKEE SMALL LETTER TSO ABB7; C; 13E7; # CHEROKEE SMALL LETTER TSU ABB8; C; 13E8; # CHEROKEE SMALL LETTER TSV ABB9; C; 13E9; # CHEROKEE SMALL LETTER WA ABBA; C; 13EA; # CHEROKEE SMALL LETTER WE ABBB; C; 13EB; # CHEROKEE SMALL LETTER WI ABBC; C; 13EC; # 
CHEROKEE SMALL LETTER WO ABBD; C; 13ED; # CHEROKEE SMALL LETTER WU ABBE; C; 13EE; # CHEROKEE SMALL LETTER WV ABBF; C; 13EF; # CHEROKEE SMALL LETTER YA FB00; F; 0066 0066; # LATIN SMALL LIGATURE FF FB01; F; 0066 0069; # LATIN SMALL LIGATURE FI FB02; F; 0066 006C; # LATIN SMALL LIGATURE FL FB03; F; 0066 0066 0069; # LATIN SMALL LIGATURE FFI FB04; F; 0066 0066 006C; # LATIN SMALL LIGATURE FFL FB05; F; 0073 0074; # LATIN SMALL LIGATURE LONG S T FB05; S; FB06; # LATIN SMALL LIGATURE LONG S T FB06; F; 0073 0074; # LATIN SMALL LIGATURE ST FB13; F; 0574 0576; # ARMENIAN SMALL LIGATURE MEN NOW FB14; F; 0574 0565; # ARMENIAN SMALL LIGATURE MEN ECH FB15; F; 0574 056B; # ARMENIAN SMALL LIGATURE MEN INI FB16; F; 057E 0576; # ARMENIAN SMALL LIGATURE VEW NOW FB17; F; 0574 056D; # ARMENIAN SMALL LIGATURE MEN XEH FF21; C; FF41; # FULLWIDTH LATIN CAPITAL LETTER A FF22; C; FF42; # FULLWIDTH LATIN CAPITAL LETTER B FF23; C; FF43; # FULLWIDTH LATIN CAPITAL LETTER C FF24; C; FF44; # FULLWIDTH LATIN CAPITAL LETTER D FF25; C; FF45; # FULLWIDTH LATIN CAPITAL LETTER E FF26; C; FF46; # FULLWIDTH LATIN CAPITAL LETTER F FF27; C; FF47; # FULLWIDTH LATIN CAPITAL LETTER G FF28; C; FF48; # FULLWIDTH LATIN CAPITAL LETTER H FF29; C; FF49; # FULLWIDTH LATIN CAPITAL LETTER I FF2A; C; FF4A; # FULLWIDTH LATIN CAPITAL LETTER J FF2B; C; FF4B; # FULLWIDTH LATIN CAPITAL LETTER K FF2C; C; FF4C; # FULLWIDTH LATIN CAPITAL LETTER L FF2D; C; FF4D; # FULLWIDTH LATIN CAPITAL LETTER M FF2E; C; FF4E; # FULLWIDTH LATIN CAPITAL LETTER N FF2F; C; FF4F; # FULLWIDTH LATIN CAPITAL LETTER O FF30; C; FF50; # FULLWIDTH LATIN CAPITAL LETTER P FF31; C; FF51; # FULLWIDTH LATIN CAPITAL LETTER Q FF32; C; FF52; # FULLWIDTH LATIN CAPITAL LETTER R FF33; C; FF53; # FULLWIDTH LATIN CAPITAL LETTER S FF34; C; FF54; # FULLWIDTH LATIN CAPITAL LETTER T FF35; C; FF55; # FULLWIDTH LATIN CAPITAL LETTER U FF36; C; FF56; # FULLWIDTH LATIN CAPITAL LETTER V FF37; C; FF57; # FULLWIDTH LATIN CAPITAL LETTER W FF38; C; FF58; # FULLWIDTH LATIN CAPITAL 
LETTER X FF39; C; FF59; # FULLWIDTH LATIN CAPITAL LETTER Y FF3A; C; FF5A; # FULLWIDTH LATIN CAPITAL LETTER Z 10400; C; 10428; # DESERET CAPITAL LETTER LONG I 10401; C; 10429; # DESERET CAPITAL LETTER LONG E 10402; C; 1042A; # DESERET CAPITAL LETTER LONG A 10403; C; 1042B; # DESERET CAPITAL LETTER LONG AH 10404; C; 1042C; # DESERET CAPITAL LETTER LONG O 10405; C; 1042D; # DESERET CAPITAL LETTER LONG OO 10406; C; 1042E; # DESERET CAPITAL LETTER SHORT I 10407; C; 1042F; # DESERET CAPITAL LETTER SHORT E 10408; C; 10430; # DESERET CAPITAL LETTER SHORT A 10409; C; 10431; # DESERET CAPITAL LETTER SHORT AH 1040A; C; 10432; # DESERET CAPITAL LETTER SHORT O 1040B; C; 10433; # DESERET CAPITAL LETTER SHORT OO 1040C; C; 10434; # DESERET CAPITAL LETTER AY 1040D; C; 10435; # DESERET CAPITAL LETTER OW 1040E; C; 10436; # DESERET CAPITAL LETTER WU 1040F; C; 10437; # DESERET CAPITAL LETTER YEE 10410; C; 10438; # DESERET CAPITAL LETTER H 10411; C; 10439; # DESERET CAPITAL LETTER PEE 10412; C; 1043A; # DESERET CAPITAL LETTER BEE 10413; C; 1043B; # DESERET CAPITAL LETTER TEE 10414; C; 1043C; # DESERET CAPITAL LETTER DEE 10415; C; 1043D; # DESERET CAPITAL LETTER CHEE 10416; C; 1043E; # DESERET CAPITAL LETTER JEE 10417; C; 1043F; # DESERET CAPITAL LETTER KAY 10418; C; 10440; # DESERET CAPITAL LETTER GAY 10419; C; 10441; # DESERET CAPITAL LETTER EF 1041A; C; 10442; # DESERET CAPITAL LETTER VEE 1041B; C; 10443; # DESERET CAPITAL LETTER ETH 1041C; C; 10444; # DESERET CAPITAL LETTER THEE 1041D; C; 10445; # DESERET CAPITAL LETTER ES 1041E; C; 10446; # DESERET CAPITAL LETTER ZEE 1041F; C; 10447; # DESERET CAPITAL LETTER ESH 10420; C; 10448; # DESERET CAPITAL LETTER ZHEE 10421; C; 10449; # DESERET CAPITAL LETTER ER 10422; C; 1044A; # DESERET CAPITAL LETTER EL 10423; C; 1044B; # DESERET CAPITAL LETTER EM 10424; C; 1044C; # DESERET CAPITAL LETTER EN 10425; C; 1044D; # DESERET CAPITAL LETTER ENG 10426; C; 1044E; # DESERET CAPITAL LETTER OI 10427; C; 1044F; # DESERET CAPITAL LETTER EW 104B0; C; 
104D8; # OSAGE CAPITAL LETTER A 104B1; C; 104D9; # OSAGE CAPITAL LETTER AI 104B2; C; 104DA; # OSAGE CAPITAL LETTER AIN 104B3; C; 104DB; # OSAGE CAPITAL LETTER AH 104B4; C; 104DC; # OSAGE CAPITAL LETTER BRA 104B5; C; 104DD; # OSAGE CAPITAL LETTER CHA 104B6; C; 104DE; # OSAGE CAPITAL LETTER EHCHA 104B7; C; 104DF; # OSAGE CAPITAL LETTER E 104B8; C; 104E0; # OSAGE CAPITAL LETTER EIN 104B9; C; 104E1; # OSAGE CAPITAL LETTER HA 104BA; C; 104E2; # OSAGE CAPITAL LETTER HYA 104BB; C; 104E3; # OSAGE CAPITAL LETTER I 104BC; C; 104E4; # OSAGE CAPITAL LETTER KA 104BD; C; 104E5; # OSAGE CAPITAL LETTER EHKA 104BE; C; 104E6; # OSAGE CAPITAL LETTER KYA 104BF; C; 104E7; # OSAGE CAPITAL LETTER LA 104C0; C; 104E8; # OSAGE CAPITAL LETTER MA 104C1; C; 104E9; # OSAGE CAPITAL LETTER NA 104C2; C; 104EA; # OSAGE CAPITAL LETTER O 104C3; C; 104EB; # OSAGE CAPITAL LETTER OIN 104C4; C; 104EC; # OSAGE CAPITAL LETTER PA 104C5; C; 104ED; # OSAGE CAPITAL LETTER EHPA 104C6; C; 104EE; # OSAGE CAPITAL LETTER SA 104C7; C; 104EF; # OSAGE CAPITAL LETTER SHA 104C8; C; 104F0; # OSAGE CAPITAL LETTER TA 104C9; C; 104F1; # OSAGE CAPITAL LETTER EHTA 104CA; C; 104F2; # OSAGE CAPITAL LETTER TSA 104CB; C; 104F3; # OSAGE CAPITAL LETTER EHTSA 104CC; C; 104F4; # OSAGE CAPITAL LETTER TSHA 104CD; C; 104F5; # OSAGE CAPITAL LETTER DHA 104CE; C; 104F6; # OSAGE CAPITAL LETTER U 104CF; C; 104F7; # OSAGE CAPITAL LETTER WA 104D0; C; 104F8; # OSAGE CAPITAL LETTER KHA 104D1; C; 104F9; # OSAGE CAPITAL LETTER GHA 104D2; C; 104FA; # OSAGE CAPITAL LETTER ZA 104D3; C; 104FB; # OSAGE CAPITAL LETTER ZHA 10570; C; 10597; # VITHKUQI CAPITAL LETTER A 10571; C; 10598; # VITHKUQI CAPITAL LETTER BBE 10572; C; 10599; # VITHKUQI CAPITAL LETTER BE 10573; C; 1059A; # VITHKUQI CAPITAL LETTER CE 10574; C; 1059B; # VITHKUQI CAPITAL LETTER CHE 10575; C; 1059C; # VITHKUQI CAPITAL LETTER DE 10576; C; 1059D; # VITHKUQI CAPITAL LETTER DHE 10577; C; 1059E; # VITHKUQI CAPITAL LETTER EI 10578; C; 1059F; # VITHKUQI CAPITAL LETTER E 10579; C; 105A0; # 
VITHKUQI CAPITAL LETTER FE 1057A; C; 105A1; # VITHKUQI CAPITAL LETTER GA 1057C; C; 105A3; # VITHKUQI CAPITAL LETTER HA 1057D; C; 105A4; # VITHKUQI CAPITAL LETTER HHA 1057E; C; 105A5; # VITHKUQI CAPITAL LETTER I 1057F; C; 105A6; # VITHKUQI CAPITAL LETTER IJE 10580; C; 105A7; # VITHKUQI CAPITAL LETTER JE 10581; C; 105A8; # VITHKUQI CAPITAL LETTER KA 10582; C; 105A9; # VITHKUQI CAPITAL LETTER LA 10583; C; 105AA; # VITHKUQI CAPITAL LETTER LLA 10584; C; 105AB; # VITHKUQI CAPITAL LETTER ME 10585; C; 105AC; # VITHKUQI CAPITAL LETTER NE 10586; C; 105AD; # VITHKUQI CAPITAL LETTER NJE 10587; C; 105AE; # VITHKUQI CAPITAL LETTER O 10588; C; 105AF; # VITHKUQI CAPITAL LETTER PE 10589; C; 105B0; # VITHKUQI CAPITAL LETTER QA 1058A; C; 105B1; # VITHKUQI CAPITAL LETTER RE 1058C; C; 105B3; # VITHKUQI CAPITAL LETTER SE 1058D; C; 105B4; # VITHKUQI CAPITAL LETTER SHE 1058E; C; 105B5; # VITHKUQI CAPITAL LETTER TE 1058F; C; 105B6; # VITHKUQI CAPITAL LETTER THE 10590; C; 105B7; # VITHKUQI CAPITAL LETTER U 10591; C; 105B8; # VITHKUQI CAPITAL LETTER VE 10592; C; 105B9; # VITHKUQI CAPITAL LETTER XE 10594; C; 105BB; # VITHKUQI CAPITAL LETTER Y 10595; C; 105BC; # VITHKUQI CAPITAL LETTER ZE 10C80; C; 10CC0; # OLD HUNGARIAN CAPITAL LETTER A 10C81; C; 10CC1; # OLD HUNGARIAN CAPITAL LETTER AA 10C82; C; 10CC2; # OLD HUNGARIAN CAPITAL LETTER EB 10C83; C; 10CC3; # OLD HUNGARIAN CAPITAL LETTER AMB 10C84; C; 10CC4; # OLD HUNGARIAN CAPITAL LETTER EC 10C85; C; 10CC5; # OLD HUNGARIAN CAPITAL LETTER ENC 10C86; C; 10CC6; # OLD HUNGARIAN CAPITAL LETTER ECS 10C87; C; 10CC7; # OLD HUNGARIAN CAPITAL LETTER ED 10C88; C; 10CC8; # OLD HUNGARIAN CAPITAL LETTER AND 10C89; C; 10CC9; # OLD HUNGARIAN CAPITAL LETTER E 10C8A; C; 10CCA; # OLD HUNGARIAN CAPITAL LETTER CLOSE E 10C8B; C; 10CCB; # OLD HUNGARIAN CAPITAL LETTER EE 10C8C; C; 10CCC; # OLD HUNGARIAN CAPITAL LETTER EF 10C8D; C; 10CCD; # OLD HUNGARIAN CAPITAL LETTER EG 10C8E; C; 10CCE; # OLD HUNGARIAN CAPITAL LETTER EGY 10C8F; C; 10CCF; # OLD HUNGARIAN CAPITAL LETTER 
EH 10C90; C; 10CD0; # OLD HUNGARIAN CAPITAL LETTER I 10C91; C; 10CD1; # OLD HUNGARIAN CAPITAL LETTER II 10C92; C; 10CD2; # OLD HUNGARIAN CAPITAL LETTER EJ 10C93; C; 10CD3; # OLD HUNGARIAN CAPITAL LETTER EK 10C94; C; 10CD4; # OLD HUNGARIAN CAPITAL LETTER AK 10C95; C; 10CD5; # OLD HUNGARIAN CAPITAL LETTER UNK 10C96; C; 10CD6; # OLD HUNGARIAN CAPITAL LETTER EL 10C97; C; 10CD7; # OLD HUNGARIAN CAPITAL LETTER ELY 10C98; C; 10CD8; # OLD HUNGARIAN CAPITAL LETTER EM 10C99; C; 10CD9; # OLD HUNGARIAN CAPITAL LETTER EN 10C9A; C; 10CDA; # OLD HUNGARIAN CAPITAL LETTER ENY 10C9B; C; 10CDB; # OLD HUNGARIAN CAPITAL LETTER O 10C9C; C; 10CDC; # OLD HUNGARIAN CAPITAL LETTER OO 10C9D; C; 10CDD; # OLD HUNGARIAN CAPITAL LETTER NIKOLSBURG OE 10C9E; C; 10CDE; # OLD HUNGARIAN CAPITAL LETTER RUDIMENTA OE 10C9F; C; 10CDF; # OLD HUNGARIAN CAPITAL LETTER OEE 10CA0; C; 10CE0; # OLD HUNGARIAN CAPITAL LETTER EP 10CA1; C; 10CE1; # OLD HUNGARIAN CAPITAL LETTER EMP 10CA2; C; 10CE2; # OLD HUNGARIAN CAPITAL LETTER ER 10CA3; C; 10CE3; # OLD HUNGARIAN CAPITAL LETTER SHORT ER 10CA4; C; 10CE4; # OLD HUNGARIAN CAPITAL LETTER ES 10CA5; C; 10CE5; # OLD HUNGARIAN CAPITAL LETTER ESZ 10CA6; C; 10CE6; # OLD HUNGARIAN CAPITAL LETTER ET 10CA7; C; 10CE7; # OLD HUNGARIAN CAPITAL LETTER ENT 10CA8; C; 10CE8; # OLD HUNGARIAN CAPITAL LETTER ETY 10CA9; C; 10CE9; # OLD HUNGARIAN CAPITAL LETTER ECH 10CAA; C; 10CEA; # OLD HUNGARIAN CAPITAL LETTER U 10CAB; C; 10CEB; # OLD HUNGARIAN CAPITAL LETTER UU 10CAC; C; 10CEC; # OLD HUNGARIAN CAPITAL LETTER NIKOLSBURG UE 10CAD; C; 10CED; # OLD HUNGARIAN CAPITAL LETTER RUDIMENTA UE 10CAE; C; 10CEE; # OLD HUNGARIAN CAPITAL LETTER EV 10CAF; C; 10CEF; # OLD HUNGARIAN CAPITAL LETTER EZ 10CB0; C; 10CF0; # OLD HUNGARIAN CAPITAL LETTER EZS 10CB1; C; 10CF1; # OLD HUNGARIAN CAPITAL LETTER ENT-SHAPED SIGN 10CB2; C; 10CF2; # OLD HUNGARIAN CAPITAL LETTER US 10D50; C; 10D70; # GARAY CAPITAL LETTER A 10D51; C; 10D71; # GARAY CAPITAL LETTER CA 10D52; C; 10D72; # GARAY CAPITAL LETTER MA 10D53; C; 
10D73; # GARAY CAPITAL LETTER KA 10D54; C; 10D74; # GARAY CAPITAL LETTER BA 10D55; C; 10D75; # GARAY CAPITAL LETTER JA 10D56; C; 10D76; # GARAY CAPITAL LETTER SA 10D57; C; 10D77; # GARAY CAPITAL LETTER WA 10D58; C; 10D78; # GARAY CAPITAL LETTER LA 10D59; C; 10D79; # GARAY CAPITAL LETTER GA 10D5A; C; 10D7A; # GARAY CAPITAL LETTER DA 10D5B; C; 10D7B; # GARAY CAPITAL LETTER XA 10D5C; C; 10D7C; # GARAY CAPITAL LETTER YA 10D5D; C; 10D7D; # GARAY CAPITAL LETTER TA 10D5E; C; 10D7E; # GARAY CAPITAL LETTER RA 10D5F; C; 10D7F; # GARAY CAPITAL LETTER NYA 10D60; C; 10D80; # GARAY CAPITAL LETTER FA 10D61; C; 10D81; # GARAY CAPITAL LETTER NA 10D62; C; 10D82; # GARAY CAPITAL LETTER PA 10D63; C; 10D83; # GARAY CAPITAL LETTER HA 10D64; C; 10D84; # GARAY CAPITAL LETTER OLD KA 10D65; C; 10D85; # GARAY CAPITAL LETTER OLD NA 118A0; C; 118C0; # WARANG CITI CAPITAL LETTER NGAA 118A1; C; 118C1; # WARANG CITI CAPITAL LETTER A 118A2; C; 118C2; # WARANG CITI CAPITAL LETTER WI 118A3; C; 118C3; # WARANG CITI CAPITAL LETTER YU 118A4; C; 118C4; # WARANG CITI CAPITAL LETTER YA 118A5; C; 118C5; # WARANG CITI CAPITAL LETTER YO 118A6; C; 118C6; # WARANG CITI CAPITAL LETTER II 118A7; C; 118C7; # WARANG CITI CAPITAL LETTER UU 118A8; C; 118C8; # WARANG CITI CAPITAL LETTER E 118A9; C; 118C9; # WARANG CITI CAPITAL LETTER O 118AA; C; 118CA; # WARANG CITI CAPITAL LETTER ANG 118AB; C; 118CB; # WARANG CITI CAPITAL LETTER GA 118AC; C; 118CC; # WARANG CITI CAPITAL LETTER KO 118AD; C; 118CD; # WARANG CITI CAPITAL LETTER ENY 118AE; C; 118CE; # WARANG CITI CAPITAL LETTER YUJ 118AF; C; 118CF; # WARANG CITI CAPITAL LETTER UC 118B0; C; 118D0; # WARANG CITI CAPITAL LETTER ENN 118B1; C; 118D1; # WARANG CITI CAPITAL LETTER ODD 118B2; C; 118D2; # WARANG CITI CAPITAL LETTER TTE 118B3; C; 118D3; # WARANG CITI CAPITAL LETTER NUNG 118B4; C; 118D4; # WARANG CITI CAPITAL LETTER DA 118B5; C; 118D5; # WARANG CITI CAPITAL LETTER AT 118B6; C; 118D6; # WARANG CITI CAPITAL LETTER AM 118B7; C; 118D7; # WARANG CITI CAPITAL LETTER BU 
118B8; C; 118D8; # WARANG CITI CAPITAL LETTER PU 118B9; C; 118D9; # WARANG CITI CAPITAL LETTER HIYO 118BA; C; 118DA; # WARANG CITI CAPITAL LETTER HOLO 118BB; C; 118DB; # WARANG CITI CAPITAL LETTER HORR 118BC; C; 118DC; # WARANG CITI CAPITAL LETTER HAR 118BD; C; 118DD; # WARANG CITI CAPITAL LETTER SSUU 118BE; C; 118DE; # WARANG CITI CAPITAL LETTER SII 118BF; C; 118DF; # WARANG CITI CAPITAL LETTER VIYO 16E40; C; 16E60; # MEDEFAIDRIN CAPITAL LETTER M 16E41; C; 16E61; # MEDEFAIDRIN CAPITAL LETTER S 16E42; C; 16E62; # MEDEFAIDRIN CAPITAL LETTER V 16E43; C; 16E63; # MEDEFAIDRIN CAPITAL LETTER W 16E44; C; 16E64; # MEDEFAIDRIN CAPITAL LETTER ATIU 16E45; C; 16E65; # MEDEFAIDRIN CAPITAL LETTER Z 16E46; C; 16E66; # MEDEFAIDRIN CAPITAL LETTER KP 16E47; C; 16E67; # MEDEFAIDRIN CAPITAL LETTER P 16E48; C; 16E68; # MEDEFAIDRIN CAPITAL LETTER T 16E49; C; 16E69; # MEDEFAIDRIN CAPITAL LETTER G 16E4A; C; 16E6A; # MEDEFAIDRIN CAPITAL LETTER F 16E4B; C; 16E6B; # MEDEFAIDRIN CAPITAL LETTER I 16E4C; C; 16E6C; # MEDEFAIDRIN CAPITAL LETTER K 16E4D; C; 16E6D; # MEDEFAIDRIN CAPITAL LETTER A 16E4E; C; 16E6E; # MEDEFAIDRIN CAPITAL LETTER J 16E4F; C; 16E6F; # MEDEFAIDRIN CAPITAL LETTER E 16E50; C; 16E70; # MEDEFAIDRIN CAPITAL LETTER B 16E51; C; 16E71; # MEDEFAIDRIN CAPITAL LETTER C 16E52; C; 16E72; # MEDEFAIDRIN CAPITAL LETTER U 16E53; C; 16E73; # MEDEFAIDRIN CAPITAL LETTER YU 16E54; C; 16E74; # MEDEFAIDRIN CAPITAL LETTER L 16E55; C; 16E75; # MEDEFAIDRIN CAPITAL LETTER Q 16E56; C; 16E76; # MEDEFAIDRIN CAPITAL LETTER HP 16E57; C; 16E77; # MEDEFAIDRIN CAPITAL LETTER NY 16E58; C; 16E78; # MEDEFAIDRIN CAPITAL LETTER X 16E59; C; 16E79; # MEDEFAIDRIN CAPITAL LETTER D 16E5A; C; 16E7A; # MEDEFAIDRIN CAPITAL LETTER OE 16E5B; C; 16E7B; # MEDEFAIDRIN CAPITAL LETTER N 16E5C; C; 16E7C; # MEDEFAIDRIN CAPITAL LETTER R 16E5D; C; 16E7D; # MEDEFAIDRIN CAPITAL LETTER O 16E5E; C; 16E7E; # MEDEFAIDRIN CAPITAL LETTER AI 16E5F; C; 16E7F; # MEDEFAIDRIN CAPITAL LETTER Y 16EA0; C; 16EBB; # BERIA ERFE CAPITAL LETTER ARKAB 
16EA1; C; 16EBC; # BERIA ERFE CAPITAL LETTER BASIGNA 16EA2; C; 16EBD; # BERIA ERFE CAPITAL LETTER DARBAI 16EA3; C; 16EBE; # BERIA ERFE CAPITAL LETTER EH 16EA4; C; 16EBF; # BERIA ERFE CAPITAL LETTER FITKO 16EA5; C; 16EC0; # BERIA ERFE CAPITAL LETTER GOWAY 16EA6; C; 16EC1; # BERIA ERFE CAPITAL LETTER HIRDEABO 16EA7; C; 16EC2; # BERIA ERFE CAPITAL LETTER I 16EA8; C; 16EC3; # BERIA ERFE CAPITAL LETTER DJAI 16EA9; C; 16EC4; # BERIA ERFE CAPITAL LETTER KOBO 16EAA; C; 16EC5; # BERIA ERFE CAPITAL LETTER LAKKO 16EAB; C; 16EC6; # BERIA ERFE CAPITAL LETTER MERI 16EAC; C; 16EC7; # BERIA ERFE CAPITAL LETTER NINI 16EAD; C; 16EC8; # BERIA ERFE CAPITAL LETTER GNA 16EAE; C; 16EC9; # BERIA ERFE CAPITAL LETTER NGAY 16EAF; C; 16ECA; # BERIA ERFE CAPITAL LETTER OI 16EB0; C; 16ECB; # BERIA ERFE CAPITAL LETTER PI 16EB1; C; 16ECC; # BERIA ERFE CAPITAL LETTER ERIGO 16EB2; C; 16ECD; # BERIA ERFE CAPITAL LETTER ERIGO TAMURA 16EB3; C; 16ECE; # BERIA ERFE CAPITAL LETTER SERI 16EB4; C; 16ECF; # BERIA ERFE CAPITAL LETTER SHEP 16EB5; C; 16ED0; # BERIA ERFE CAPITAL LETTER TATASOUE 16EB6; C; 16ED1; # BERIA ERFE CAPITAL LETTER UI 16EB7; C; 16ED2; # BERIA ERFE CAPITAL LETTER WASSE 16EB8; C; 16ED3; # BERIA ERFE CAPITAL LETTER AY 1E900; C; 1E922; # ADLAM CAPITAL LETTER ALIF 1E901; C; 1E923; # ADLAM CAPITAL LETTER DAALI 1E902; C; 1E924; # ADLAM CAPITAL LETTER LAAM 1E903; C; 1E925; # ADLAM CAPITAL LETTER MIIM 1E904; C; 1E926; # ADLAM CAPITAL LETTER BA 1E905; C; 1E927; # ADLAM CAPITAL LETTER SINNYIIYHE 1E906; C; 1E928; # ADLAM CAPITAL LETTER PE 1E907; C; 1E929; # ADLAM CAPITAL LETTER BHE 1E908; C; 1E92A; # ADLAM CAPITAL LETTER RA 1E909; C; 1E92B; # ADLAM CAPITAL LETTER E 1E90A; C; 1E92C; # ADLAM CAPITAL LETTER FA 1E90B; C; 1E92D; # ADLAM CAPITAL LETTER I 1E90C; C; 1E92E; # ADLAM CAPITAL LETTER O 1E90D; C; 1E92F; # ADLAM CAPITAL LETTER DHA 1E90E; C; 1E930; # ADLAM CAPITAL LETTER YHE 1E90F; C; 1E931; # ADLAM CAPITAL LETTER WAW 1E910; C; 1E932; # ADLAM CAPITAL LETTER NUN 1E911; C; 1E933; # ADLAM CAPITAL 
LETTER KAF 1E912; C; 1E934; # ADLAM CAPITAL LETTER YA 1E913; C; 1E935; # ADLAM CAPITAL LETTER U 1E914; C; 1E936; # ADLAM CAPITAL LETTER JIIM 1E915; C; 1E937; # ADLAM CAPITAL LETTER CHI 1E916; C; 1E938; # ADLAM CAPITAL LETTER HA 1E917; C; 1E939; # ADLAM CAPITAL LETTER QAAF 1E918; C; 1E93A; # ADLAM CAPITAL LETTER GA 1E919; C; 1E93B; # ADLAM CAPITAL LETTER NYA 1E91A; C; 1E93C; # ADLAM CAPITAL LETTER TU 1E91B; C; 1E93D; # ADLAM CAPITAL LETTER NHA 1E91C; C; 1E93E; # ADLAM CAPITAL LETTER VA 1E91D; C; 1E93F; # ADLAM CAPITAL LETTER KHA 1E91E; C; 1E940; # ADLAM CAPITAL LETTER GBE 1E91F; C; 1E941; # ADLAM CAPITAL LETTER ZAL 1E920; C; 1E942; # ADLAM CAPITAL LETTER KPO 1E921; C; 1E943; # ADLAM CAPITAL LETTER SHA # # EOF ================================================ FILE: maint/Unicode.tables/DerivedBidiClass.txt ================================================ # DerivedBidiClass-17.0.0.txt # Date: 2025-07-24, 00:12:44 GMT # © 2025 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use and license, see https://www.unicode.org/terms_of_use.html # # Unicode Character Database # For documentation, see https://www.unicode.org/reports/tr44/ # ================================================ # Bidi Class (listing UnicodeData.txt, field 4: see UAX #44: https://www.unicode.org/reports/tr44/) # Unlike other properties, unassigned code points in blocks # reserved for right-to-left scripts are given either values R or AL, # and unassigned code points in the Currency Symbols block are given the value ET. # For details see the @missing lines below. # # The unassigned code points that default to BN have one of the following properties: # Default_Ignorable_Code_Point # Noncharacter_Code_Point # # For all other cases: # All code points not explicitly listed for Bidi_Class # have the value Left_To_Right (L). 
# @missing: 0000..10FFFF; Left_To_Right # 0590..05FF Hebrew # @missing: 0590..05FF; Right_To_Left # 0600..06FF Arabic # 0700..074F Syriac # 0750..077F Arabic_Supplement # 0780..07BF Thaana # @missing: 0600..07BF; Arabic_Letter # 07C0..07FF NKo # 0800..083F Samaritan # 0840..085F Mandaic # @missing: 07C0..085F; Right_To_Left # 0860..086F Syriac_Supplement # 0870..089F Arabic_Extended_B # 08A0..08FF Arabic_Extended_A # @missing: 0860..08FF; Arabic_Letter # 20A0..20CF Currency_Symbols # @missing: 20A0..20CF; European_Terminator # FB00..FB4F Alphabetic_Presentation_Forms (partial) # @missing: FB1D..FB4F; Right_To_Left # FB50..FDFF Arabic_Presentation_Forms_A (partial) # @missing: FB50..FDCF; Arabic_Letter # FB50..FDFF Arabic_Presentation_Forms_A (partial) # @missing: FDF0..FDFF; Arabic_Letter # FE70..FEFF Arabic_Presentation_Forms_B # @missing: FE70..FEFF; Arabic_Letter # 10800..1083F Cypriot_Syllabary # 10840..1085F Imperial_Aramaic # 10860..1087F Palmyrene # 10880..108AF Nabataean # 108E0..108FF Hatran # 10900..1091F Phoenician # 10920..1093F Lydian # 10940..1095F Sidetic # 10980..1099F Meroitic_Hieroglyphs # 109A0..109FF Meroitic_Cursive # 10A00..10A5F Kharoshthi # 10A60..10A7F Old_South_Arabian # 10A80..10A9F Old_North_Arabian # 10AC0..10AFF Manichaean # 10B00..10B3F Avestan # 10B40..10B5F Inscriptional_Parthian # 10B60..10B7F Inscriptional_Pahlavi # 10B80..10BAF Psalter_Pahlavi # 10C00..10C4F Old_Turkic # 10C80..10CFF Old_Hungarian # @missing: 10800..10CFF; Right_To_Left # 10D00..10D3F Hanifi_Rohingya # @missing: 10D00..10D3F; Arabic_Letter # 10D40..10D8F Garay # 10E60..10E7F Rumi_Numeral_Symbols # 10E80..10EBF Yezidi # @missing: 10D40..10EBF; Right_To_Left # 10EC0..10EFF Arabic_Extended_C # @missing: 10EC0..10EFF; Arabic_Letter # 10F00..10F2F Old_Sogdian # @missing: 10F00..10F2F; Right_To_Left # 10F30..10F6F Sogdian # @missing: 10F30..10F6F; Arabic_Letter # 10F70..10FAF Old_Uyghur # 10FB0..10FDF Chorasmian # 10FE0..10FFF Elymaic # @missing: 10F70..10FFF; 
Right_To_Left # 1E800..1E8DF Mende_Kikakui # 1E900..1E95F Adlam # @missing: 1E800..1EC6F; Right_To_Left # 1EC70..1ECBF Indic_Siyaq_Numbers # @missing: 1EC70..1ECBF; Arabic_Letter # @missing: 1ECC0..1ECFF; Right_To_Left # 1ED00..1ED4F Ottoman_Siyaq_Numbers # @missing: 1ED00..1ED4F; Arabic_Letter # @missing: 1ED50..1EDFF; Right_To_Left # 1EE00..1EEFF Arabic_Mathematical_Alphabetic_Symbols # @missing: 1EE00..1EEFF; Arabic_Letter # @missing: 1EF00..1EFFF; Right_To_Left # ================================================ # Bidi_Class=Left_To_Right 0041..005A ; L # L& [26] LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z 0061..007A ; L # L& [26] LATIN SMALL LETTER A..LATIN SMALL LETTER Z 00AA ; L # Lo FEMININE ORDINAL INDICATOR 00B5 ; L # L& MICRO SIGN 00BA ; L # Lo MASCULINE ORDINAL INDICATOR 00C0..00D6 ; L # L& [23] LATIN CAPITAL LETTER A WITH GRAVE..LATIN CAPITAL LETTER O WITH DIAERESIS 00D8..00F6 ; L # L& [31] LATIN CAPITAL LETTER O WITH STROKE..LATIN SMALL LETTER O WITH DIAERESIS 00F8..01BA ; L # L& [195] LATIN SMALL LETTER O WITH STROKE..LATIN SMALL LETTER EZH WITH TAIL 01BB ; L # Lo LATIN LETTER TWO WITH STROKE 01BC..01BF ; L # L& [4] LATIN CAPITAL LETTER TONE FIVE..LATIN LETTER WYNN 01C0..01C3 ; L # Lo [4] LATIN LETTER DENTAL CLICK..LATIN LETTER RETROFLEX CLICK 01C4..0293 ; L # L& [208] LATIN CAPITAL LETTER DZ WITH CARON..LATIN SMALL LETTER EZH WITH CURL 0294..0295 ; L # Lo [2] LATIN LETTER GLOTTAL STOP..LATIN LETTER PHARYNGEAL VOICED FRICATIVE 0296..02AF ; L # L& [26] LATIN LETTER INVERTED GLOTTAL STOP..LATIN SMALL LETTER TURNED H WITH FISHHOOK AND TAIL 02B0..02B8 ; L # Lm [9] MODIFIER LETTER SMALL H..MODIFIER LETTER SMALL Y 02BB..02C1 ; L # Lm [7] MODIFIER LETTER TURNED COMMA..MODIFIER LETTER REVERSED GLOTTAL STOP 02D0..02D1 ; L # Lm [2] MODIFIER LETTER TRIANGULAR COLON..MODIFIER LETTER HALF TRIANGULAR COLON 02E0..02E4 ; L # Lm [5] MODIFIER LETTER SMALL GAMMA..MODIFIER LETTER SMALL REVERSED GLOTTAL STOP 02EE ; L # Lm MODIFIER LETTER DOUBLE APOSTROPHE 0370..0373 ; 
L # L& [4] GREEK CAPITAL LETTER HETA..GREEK SMALL LETTER ARCHAIC SAMPI 0376..0377 ; L # L& [2] GREEK CAPITAL LETTER PAMPHYLIAN DIGAMMA..GREEK SMALL LETTER PAMPHYLIAN DIGAMMA 037A ; L # Lm GREEK YPOGEGRAMMENI 037B..037D ; L # L& [3] GREEK SMALL REVERSED LUNATE SIGMA SYMBOL..GREEK SMALL REVERSED DOTTED LUNATE SIGMA SYMBOL 037F ; L # L& GREEK CAPITAL LETTER YOT 0386 ; L # L& GREEK CAPITAL LETTER ALPHA WITH TONOS 0388..038A ; L # L& [3] GREEK CAPITAL LETTER EPSILON WITH TONOS..GREEK CAPITAL LETTER IOTA WITH TONOS 038C ; L # L& GREEK CAPITAL LETTER OMICRON WITH TONOS 038E..03A1 ; L # L& [20] GREEK CAPITAL LETTER UPSILON WITH TONOS..GREEK CAPITAL LETTER RHO 03A3..03F5 ; L # L& [83] GREEK CAPITAL LETTER SIGMA..GREEK LUNATE EPSILON SYMBOL 03F7..0481 ; L # L& [139] GREEK CAPITAL LETTER SHO..CYRILLIC SMALL LETTER KOPPA 0482 ; L # So CYRILLIC THOUSANDS SIGN 048A..052F ; L # L& [166] CYRILLIC CAPITAL LETTER SHORT I WITH TAIL..CYRILLIC SMALL LETTER EL WITH DESCENDER 0531..0556 ; L # L& [38] ARMENIAN CAPITAL LETTER AYB..ARMENIAN CAPITAL LETTER FEH 0559 ; L # Lm ARMENIAN MODIFIER LETTER LEFT HALF RING 055A..055F ; L # Po [6] ARMENIAN APOSTROPHE..ARMENIAN ABBREVIATION MARK 0560..0588 ; L # L& [41] ARMENIAN SMALL LETTER TURNED AYB..ARMENIAN SMALL LETTER YI WITH STROKE 0589 ; L # Po ARMENIAN FULL STOP 0903 ; L # Mc DEVANAGARI SIGN VISARGA 0904..0939 ; L # Lo [54] DEVANAGARI LETTER SHORT A..DEVANAGARI LETTER HA 093B ; L # Mc DEVANAGARI VOWEL SIGN OOE 093D ; L # Lo DEVANAGARI SIGN AVAGRAHA 093E..0940 ; L # Mc [3] DEVANAGARI VOWEL SIGN AA..DEVANAGARI VOWEL SIGN II 0949..094C ; L # Mc [4] DEVANAGARI VOWEL SIGN CANDRA O..DEVANAGARI VOWEL SIGN AU 094E..094F ; L # Mc [2] DEVANAGARI VOWEL SIGN PRISHTHAMATRA E..DEVANAGARI VOWEL SIGN AW 0950 ; L # Lo DEVANAGARI OM 0958..0961 ; L # Lo [10] DEVANAGARI LETTER QA..DEVANAGARI LETTER VOCALIC LL 0964..0965 ; L # Po [2] DEVANAGARI DANDA..DEVANAGARI DOUBLE DANDA 0966..096F ; L # Nd [10] DEVANAGARI DIGIT ZERO..DEVANAGARI DIGIT NINE 0970 ; L # Po 
DEVANAGARI ABBREVIATION SIGN 0971 ; L # Lm DEVANAGARI SIGN HIGH SPACING DOT 0972..0980 ; L # Lo [15] DEVANAGARI LETTER CANDRA A..BENGALI ANJI 0982..0983 ; L # Mc [2] BENGALI SIGN ANUSVARA..BENGALI SIGN VISARGA 0985..098C ; L # Lo [8] BENGALI LETTER A..BENGALI LETTER VOCALIC L 098F..0990 ; L # Lo [2] BENGALI LETTER E..BENGALI LETTER AI 0993..09A8 ; L # Lo [22] BENGALI LETTER O..BENGALI LETTER NA 09AA..09B0 ; L # Lo [7] BENGALI LETTER PA..BENGALI LETTER RA 09B2 ; L # Lo BENGALI LETTER LA 09B6..09B9 ; L # Lo [4] BENGALI LETTER SHA..BENGALI LETTER HA 09BD ; L # Lo BENGALI SIGN AVAGRAHA 09BE..09C0 ; L # Mc [3] BENGALI VOWEL SIGN AA..BENGALI VOWEL SIGN II 09C7..09C8 ; L # Mc [2] BENGALI VOWEL SIGN E..BENGALI VOWEL SIGN AI 09CB..09CC ; L # Mc [2] BENGALI VOWEL SIGN O..BENGALI VOWEL SIGN AU 09CE ; L # Lo BENGALI LETTER KHANDA TA 09D7 ; L # Mc BENGALI AU LENGTH MARK 09DC..09DD ; L # Lo [2] BENGALI LETTER RRA..BENGALI LETTER RHA 09DF..09E1 ; L # Lo [3] BENGALI LETTER YYA..BENGALI LETTER VOCALIC LL 09E6..09EF ; L # Nd [10] BENGALI DIGIT ZERO..BENGALI DIGIT NINE 09F0..09F1 ; L # Lo [2] BENGALI LETTER RA WITH MIDDLE DIAGONAL..BENGALI LETTER RA WITH LOWER DIAGONAL 09F4..09F9 ; L # No [6] BENGALI CURRENCY NUMERATOR ONE..BENGALI CURRENCY DENOMINATOR SIXTEEN 09FA ; L # So BENGALI ISSHAR 09FC ; L # Lo BENGALI LETTER VEDIC ANUSVARA 09FD ; L # Po BENGALI ABBREVIATION SIGN 0A03 ; L # Mc GURMUKHI SIGN VISARGA 0A05..0A0A ; L # Lo [6] GURMUKHI LETTER A..GURMUKHI LETTER UU 0A0F..0A10 ; L # Lo [2] GURMUKHI LETTER EE..GURMUKHI LETTER AI 0A13..0A28 ; L # Lo [22] GURMUKHI LETTER OO..GURMUKHI LETTER NA 0A2A..0A30 ; L # Lo [7] GURMUKHI LETTER PA..GURMUKHI LETTER RA 0A32..0A33 ; L # Lo [2] GURMUKHI LETTER LA..GURMUKHI LETTER LLA 0A35..0A36 ; L # Lo [2] GURMUKHI LETTER VA..GURMUKHI LETTER SHA 0A38..0A39 ; L # Lo [2] GURMUKHI LETTER SA..GURMUKHI LETTER HA 0A3E..0A40 ; L # Mc [3] GURMUKHI VOWEL SIGN AA..GURMUKHI VOWEL SIGN II 0A59..0A5C ; L # Lo [4] GURMUKHI LETTER KHHA..GURMUKHI LETTER RRA 0A5E ; L 
# Lo GURMUKHI LETTER FA 0A66..0A6F ; L # Nd [10] GURMUKHI DIGIT ZERO..GURMUKHI DIGIT NINE 0A72..0A74 ; L # Lo [3] GURMUKHI IRI..GURMUKHI EK ONKAR 0A76 ; L # Po GURMUKHI ABBREVIATION SIGN 0A83 ; L # Mc GUJARATI SIGN VISARGA 0A85..0A8D ; L # Lo [9] GUJARATI LETTER A..GUJARATI VOWEL CANDRA E 0A8F..0A91 ; L # Lo [3] GUJARATI LETTER E..GUJARATI VOWEL CANDRA O 0A93..0AA8 ; L # Lo [22] GUJARATI LETTER O..GUJARATI LETTER NA 0AAA..0AB0 ; L # Lo [7] GUJARATI LETTER PA..GUJARATI LETTER RA 0AB2..0AB3 ; L # Lo [2] GUJARATI LETTER LA..GUJARATI LETTER LLA 0AB5..0AB9 ; L # Lo [5] GUJARATI LETTER VA..GUJARATI LETTER HA 0ABD ; L # Lo GUJARATI SIGN AVAGRAHA 0ABE..0AC0 ; L # Mc [3] GUJARATI VOWEL SIGN AA..GUJARATI VOWEL SIGN II 0AC9 ; L # Mc GUJARATI VOWEL SIGN CANDRA O 0ACB..0ACC ; L # Mc [2] GUJARATI VOWEL SIGN O..GUJARATI VOWEL SIGN AU 0AD0 ; L # Lo GUJARATI OM 0AE0..0AE1 ; L # Lo [2] GUJARATI LETTER VOCALIC RR..GUJARATI LETTER VOCALIC LL 0AE6..0AEF ; L # Nd [10] GUJARATI DIGIT ZERO..GUJARATI DIGIT NINE 0AF0 ; L # Po GUJARATI ABBREVIATION SIGN 0AF9 ; L # Lo GUJARATI LETTER ZHA 0B02..0B03 ; L # Mc [2] ORIYA SIGN ANUSVARA..ORIYA SIGN VISARGA 0B05..0B0C ; L # Lo [8] ORIYA LETTER A..ORIYA LETTER VOCALIC L 0B0F..0B10 ; L # Lo [2] ORIYA LETTER E..ORIYA LETTER AI 0B13..0B28 ; L # Lo [22] ORIYA LETTER O..ORIYA LETTER NA 0B2A..0B30 ; L # Lo [7] ORIYA LETTER PA..ORIYA LETTER RA 0B32..0B33 ; L # Lo [2] ORIYA LETTER LA..ORIYA LETTER LLA 0B35..0B39 ; L # Lo [5] ORIYA LETTER VA..ORIYA LETTER HA 0B3D ; L # Lo ORIYA SIGN AVAGRAHA 0B3E ; L # Mc ORIYA VOWEL SIGN AA 0B40 ; L # Mc ORIYA VOWEL SIGN II 0B47..0B48 ; L # Mc [2] ORIYA VOWEL SIGN E..ORIYA VOWEL SIGN AI 0B4B..0B4C ; L # Mc [2] ORIYA VOWEL SIGN O..ORIYA VOWEL SIGN AU 0B57 ; L # Mc ORIYA AU LENGTH MARK 0B5C..0B5D ; L # Lo [2] ORIYA LETTER RRA..ORIYA LETTER RHA 0B5F..0B61 ; L # Lo [3] ORIYA LETTER YYA..ORIYA LETTER VOCALIC LL 0B66..0B6F ; L # Nd [10] ORIYA DIGIT ZERO..ORIYA DIGIT NINE 0B70 ; L # So ORIYA ISSHAR 0B71 ; L # Lo ORIYA LETTER WA 
0B72..0B77 ; L # No [6] ORIYA FRACTION ONE QUARTER..ORIYA FRACTION THREE SIXTEENTHS 0B83 ; L # Lo TAMIL SIGN VISARGA 0B85..0B8A ; L # Lo [6] TAMIL LETTER A..TAMIL LETTER UU 0B8E..0B90 ; L # Lo [3] TAMIL LETTER E..TAMIL LETTER AI 0B92..0B95 ; L # Lo [4] TAMIL LETTER O..TAMIL LETTER KA 0B99..0B9A ; L # Lo [2] TAMIL LETTER NGA..TAMIL LETTER CA 0B9C ; L # Lo TAMIL LETTER JA 0B9E..0B9F ; L # Lo [2] TAMIL LETTER NYA..TAMIL LETTER TTA 0BA3..0BA4 ; L # Lo [2] TAMIL LETTER NNA..TAMIL LETTER TA 0BA8..0BAA ; L # Lo [3] TAMIL LETTER NA..TAMIL LETTER PA 0BAE..0BB9 ; L # Lo [12] TAMIL LETTER MA..TAMIL LETTER HA 0BBE..0BBF ; L # Mc [2] TAMIL VOWEL SIGN AA..TAMIL VOWEL SIGN I 0BC1..0BC2 ; L # Mc [2] TAMIL VOWEL SIGN U..TAMIL VOWEL SIGN UU 0BC6..0BC8 ; L # Mc [3] TAMIL VOWEL SIGN E..TAMIL VOWEL SIGN AI 0BCA..0BCC ; L # Mc [3] TAMIL VOWEL SIGN O..TAMIL VOWEL SIGN AU 0BD0 ; L # Lo TAMIL OM 0BD7 ; L # Mc TAMIL AU LENGTH MARK 0BE6..0BEF ; L # Nd [10] TAMIL DIGIT ZERO..TAMIL DIGIT NINE 0BF0..0BF2 ; L # No [3] TAMIL NUMBER TEN..TAMIL NUMBER ONE THOUSAND 0C01..0C03 ; L # Mc [3] TELUGU SIGN CANDRABINDU..TELUGU SIGN VISARGA 0C05..0C0C ; L # Lo [8] TELUGU LETTER A..TELUGU LETTER VOCALIC L 0C0E..0C10 ; L # Lo [3] TELUGU LETTER E..TELUGU LETTER AI 0C12..0C28 ; L # Lo [23] TELUGU LETTER O..TELUGU LETTER NA 0C2A..0C39 ; L # Lo [16] TELUGU LETTER PA..TELUGU LETTER HA 0C3D ; L # Lo TELUGU SIGN AVAGRAHA 0C41..0C44 ; L # Mc [4] TELUGU VOWEL SIGN U..TELUGU VOWEL SIGN VOCALIC RR 0C58..0C5A ; L # Lo [3] TELUGU LETTER TSA..TELUGU LETTER RRRA 0C5C..0C5D ; L # Lo [2] TELUGU ARCHAIC SHRII..TELUGU LETTER NAKAARA POLLU 0C60..0C61 ; L # Lo [2] TELUGU LETTER VOCALIC RR..TELUGU LETTER VOCALIC LL 0C66..0C6F ; L # Nd [10] TELUGU DIGIT ZERO..TELUGU DIGIT NINE 0C77 ; L # Po TELUGU SIGN SIDDHAM 0C7F ; L # So TELUGU SIGN TUUMU 0C80 ; L # Lo KANNADA SIGN SPACING CANDRABINDU 0C82..0C83 ; L # Mc [2] KANNADA SIGN ANUSVARA..KANNADA SIGN VISARGA 0C84 ; L # Po KANNADA SIGN SIDDHAM 0C85..0C8C ; L # Lo [8] KANNADA LETTER 
A..KANNADA LETTER VOCALIC L 0C8E..0C90 ; L # Lo [3] KANNADA LETTER E..KANNADA LETTER AI 0C92..0CA8 ; L # Lo [23] KANNADA LETTER O..KANNADA LETTER NA 0CAA..0CB3 ; L # Lo [10] KANNADA LETTER PA..KANNADA LETTER LLA 0CB5..0CB9 ; L # Lo [5] KANNADA LETTER VA..KANNADA LETTER HA 0CBD ; L # Lo KANNADA SIGN AVAGRAHA 0CBE ; L # Mc KANNADA VOWEL SIGN AA 0CBF ; L # Mn KANNADA VOWEL SIGN I 0CC0..0CC4 ; L # Mc [5] KANNADA VOWEL SIGN II..KANNADA VOWEL SIGN VOCALIC RR 0CC6 ; L # Mn KANNADA VOWEL SIGN E 0CC7..0CC8 ; L # Mc [2] KANNADA VOWEL SIGN EE..KANNADA VOWEL SIGN AI 0CCA..0CCB ; L # Mc [2] KANNADA VOWEL SIGN O..KANNADA VOWEL SIGN OO 0CD5..0CD6 ; L # Mc [2] KANNADA LENGTH MARK..KANNADA AI LENGTH MARK 0CDC..0CDE ; L # Lo [3] KANNADA ARCHAIC SHRII..KANNADA LETTER FA 0CE0..0CE1 ; L # Lo [2] KANNADA LETTER VOCALIC RR..KANNADA LETTER VOCALIC LL 0CE6..0CEF ; L # Nd [10] KANNADA DIGIT ZERO..KANNADA DIGIT NINE 0CF1..0CF2 ; L # Lo [2] KANNADA SIGN JIHVAMULIYA..KANNADA SIGN UPADHMANIYA 0CF3 ; L # Mc KANNADA SIGN COMBINING ANUSVARA ABOVE RIGHT 0D02..0D03 ; L # Mc [2] MALAYALAM SIGN ANUSVARA..MALAYALAM SIGN VISARGA 0D04..0D0C ; L # Lo [9] MALAYALAM LETTER VEDIC ANUSVARA..MALAYALAM LETTER VOCALIC L 0D0E..0D10 ; L # Lo [3] MALAYALAM LETTER E..MALAYALAM LETTER AI 0D12..0D3A ; L # Lo [41] MALAYALAM LETTER O..MALAYALAM LETTER TTTA 0D3D ; L # Lo MALAYALAM SIGN AVAGRAHA 0D3E..0D40 ; L # Mc [3] MALAYALAM VOWEL SIGN AA..MALAYALAM VOWEL SIGN II 0D46..0D48 ; L # Mc [3] MALAYALAM VOWEL SIGN E..MALAYALAM VOWEL SIGN AI 0D4A..0D4C ; L # Mc [3] MALAYALAM VOWEL SIGN O..MALAYALAM VOWEL SIGN AU 0D4E ; L # Lo MALAYALAM LETTER DOT REPH 0D4F ; L # So MALAYALAM SIGN PARA 0D54..0D56 ; L # Lo [3] MALAYALAM LETTER CHILLU M..MALAYALAM LETTER CHILLU LLL 0D57 ; L # Mc MALAYALAM AU LENGTH MARK 0D58..0D5E ; L # No [7] MALAYALAM FRACTION ONE ONE-HUNDRED-AND-SIXTIETH..MALAYALAM FRACTION ONE FIFTH 0D5F..0D61 ; L # Lo [3] MALAYALAM LETTER ARCHAIC II..MALAYALAM LETTER VOCALIC LL 0D66..0D6F ; L # Nd [10] MALAYALAM DIGIT 
ZERO..MALAYALAM DIGIT NINE 0D70..0D78 ; L # No [9] MALAYALAM NUMBER TEN..MALAYALAM FRACTION THREE SIXTEENTHS 0D79 ; L # So MALAYALAM DATE MARK 0D7A..0D7F ; L # Lo [6] MALAYALAM LETTER CHILLU NN..MALAYALAM LETTER CHILLU K 0D82..0D83 ; L # Mc [2] SINHALA SIGN ANUSVARAYA..SINHALA SIGN VISARGAYA 0D85..0D96 ; L # Lo [18] SINHALA LETTER AYANNA..SINHALA LETTER AUYANNA 0D9A..0DB1 ; L # Lo [24] SINHALA LETTER ALPAPRAANA KAYANNA..SINHALA LETTER DANTAJA NAYANNA 0DB3..0DBB ; L # Lo [9] SINHALA LETTER SANYAKA DAYANNA..SINHALA LETTER RAYANNA 0DBD ; L # Lo SINHALA LETTER DANTAJA LAYANNA 0DC0..0DC6 ; L # Lo [7] SINHALA LETTER VAYANNA..SINHALA LETTER FAYANNA 0DCF..0DD1 ; L # Mc [3] SINHALA VOWEL SIGN AELA-PILLA..SINHALA VOWEL SIGN DIGA AEDA-PILLA 0DD8..0DDF ; L # Mc [8] SINHALA VOWEL SIGN GAETTA-PILLA..SINHALA VOWEL SIGN GAYANUKITTA 0DE6..0DEF ; L # Nd [10] SINHALA LITH DIGIT ZERO..SINHALA LITH DIGIT NINE 0DF2..0DF3 ; L # Mc [2] SINHALA VOWEL SIGN DIGA GAETTA-PILLA..SINHALA VOWEL SIGN DIGA GAYANUKITTA 0DF4 ; L # Po SINHALA PUNCTUATION KUNDDALIYA 0E01..0E30 ; L # Lo [48] THAI CHARACTER KO KAI..THAI CHARACTER SARA A 0E32..0E33 ; L # Lo [2] THAI CHARACTER SARA AA..THAI CHARACTER SARA AM 0E40..0E45 ; L # Lo [6] THAI CHARACTER SARA E..THAI CHARACTER LAKKHANGYAO 0E46 ; L # Lm THAI CHARACTER MAIYAMOK 0E4F ; L # Po THAI CHARACTER FONGMAN 0E50..0E59 ; L # Nd [10] THAI DIGIT ZERO..THAI DIGIT NINE 0E5A..0E5B ; L # Po [2] THAI CHARACTER ANGKHANKHU..THAI CHARACTER KHOMUT 0E81..0E82 ; L # Lo [2] LAO LETTER KO..LAO LETTER KHO SUNG 0E84 ; L # Lo LAO LETTER KHO TAM 0E86..0E8A ; L # Lo [5] LAO LETTER PALI GHA..LAO LETTER SO TAM 0E8C..0EA3 ; L # Lo [24] LAO LETTER PALI JHA..LAO LETTER LO LING 0EA5 ; L # Lo LAO LETTER LO LOOT 0EA7..0EB0 ; L # Lo [10] LAO LETTER WO..LAO VOWEL SIGN A 0EB2..0EB3 ; L # Lo [2] LAO VOWEL SIGN AA..LAO VOWEL SIGN AM 0EBD ; L # Lo LAO SEMIVOWEL SIGN NYO 0EC0..0EC4 ; L # Lo [5] LAO VOWEL SIGN E..LAO VOWEL SIGN AI 0EC6 ; L # Lm LAO KO LA 0ED0..0ED9 ; L # Nd [10] LAO DIGIT 
ZERO..LAO DIGIT NINE 0EDC..0EDF ; L # Lo [4] LAO HO NO..LAO LETTER KHMU NYO 0F00 ; L # Lo TIBETAN SYLLABLE OM 0F01..0F03 ; L # So [3] TIBETAN MARK GTER YIG MGO TRUNCATED A..TIBETAN MARK GTER YIG MGO -UM GTER TSHEG MA 0F04..0F12 ; L # Po [15] TIBETAN MARK INITIAL YIG MGO MDUN MA..TIBETAN MARK RGYA GRAM SHAD 0F13 ; L # So TIBETAN MARK CARET -DZUD RTAGS ME LONG CAN 0F14 ; L # Po TIBETAN MARK GTER TSHEG 0F15..0F17 ; L # So [3] TIBETAN LOGOTYPE SIGN CHAD RTAGS..TIBETAN ASTROLOGICAL SIGN SGRA GCAN -CHAR RTAGS 0F1A..0F1F ; L # So [6] TIBETAN SIGN RDEL DKAR GCIG..TIBETAN SIGN RDEL DKAR RDEL NAG 0F20..0F29 ; L # Nd [10] TIBETAN DIGIT ZERO..TIBETAN DIGIT NINE 0F2A..0F33 ; L # No [10] TIBETAN DIGIT HALF ONE..TIBETAN DIGIT HALF ZERO 0F34 ; L # So TIBETAN MARK BSDUS RTAGS 0F36 ; L # So TIBETAN MARK CARET -DZUD RTAGS BZHI MIG CAN 0F38 ; L # So TIBETAN MARK CHE MGO 0F3E..0F3F ; L # Mc [2] TIBETAN SIGN YAR TSHES..TIBETAN SIGN MAR TSHES 0F40..0F47 ; L # Lo [8] TIBETAN LETTER KA..TIBETAN LETTER JA 0F49..0F6C ; L # Lo [36] TIBETAN LETTER NYA..TIBETAN LETTER RRA 0F7F ; L # Mc TIBETAN SIGN RNAM BCAD 0F85 ; L # Po TIBETAN MARK PALUTA 0F88..0F8C ; L # Lo [5] TIBETAN SIGN LCE TSA CAN..TIBETAN SIGN INVERTED MCHU CAN 0FBE..0FC5 ; L # So [8] TIBETAN KU RU KHA..TIBETAN SYMBOL RDO RJE 0FC7..0FCC ; L # So [6] TIBETAN SYMBOL RDO RJE RGYA GRAM..TIBETAN SYMBOL NOR BU BZHI -KHYIL 0FCE..0FCF ; L # So [2] TIBETAN SIGN RDEL NAG RDEL DKAR..TIBETAN SIGN RDEL NAG GSUM 0FD0..0FD4 ; L # Po [5] TIBETAN MARK BSKA- SHOG GI MGO RGYAN..TIBETAN MARK CLOSING BRDA RNYING YIG MGO SGAB MA 0FD5..0FD8 ; L # So [4] RIGHT-FACING SVASTI SIGN..LEFT-FACING SVASTI SIGN WITH DOTS 0FD9..0FDA ; L # Po [2] TIBETAN MARK LEADING MCHAN RTAGS..TIBETAN MARK TRAILING MCHAN RTAGS 1000..102A ; L # Lo [43] MYANMAR LETTER KA..MYANMAR LETTER AU 102B..102C ; L # Mc [2] MYANMAR VOWEL SIGN TALL AA..MYANMAR VOWEL SIGN AA 1031 ; L # Mc MYANMAR VOWEL SIGN E 1038 ; L # Mc MYANMAR SIGN VISARGA 103B..103C ; L # Mc [2] MYANMAR CONSONANT SIGN MEDIAL 
YA..MYANMAR CONSONANT SIGN MEDIAL RA 103F ; L # Lo MYANMAR LETTER GREAT SA 1040..1049 ; L # Nd [10] MYANMAR DIGIT ZERO..MYANMAR DIGIT NINE 104A..104F ; L # Po [6] MYANMAR SIGN LITTLE SECTION..MYANMAR SYMBOL GENITIVE 1050..1055 ; L # Lo [6] MYANMAR LETTER SHA..MYANMAR LETTER VOCALIC LL 1056..1057 ; L # Mc [2] MYANMAR VOWEL SIGN VOCALIC R..MYANMAR VOWEL SIGN VOCALIC RR 105A..105D ; L # Lo [4] MYANMAR LETTER MON NGA..MYANMAR LETTER MON BBE 1061 ; L # Lo MYANMAR LETTER SGAW KAREN SHA 1062..1064 ; L # Mc [3] MYANMAR VOWEL SIGN SGAW KAREN EU..MYANMAR TONE MARK SGAW KAREN KE PHO 1065..1066 ; L # Lo [2] MYANMAR LETTER WESTERN PWO KAREN THA..MYANMAR LETTER WESTERN PWO KAREN PWA 1067..106D ; L # Mc [7] MYANMAR VOWEL SIGN WESTERN PWO KAREN EU..MYANMAR SIGN WESTERN PWO KAREN TONE-5 106E..1070 ; L # Lo [3] MYANMAR LETTER EASTERN PWO KAREN NNA..MYANMAR LETTER EASTERN PWO KAREN GHWA 1075..1081 ; L # Lo [13] MYANMAR LETTER SHAN KA..MYANMAR LETTER SHAN HA 1083..1084 ; L # Mc [2] MYANMAR VOWEL SIGN SHAN AA..MYANMAR VOWEL SIGN SHAN E 1087..108C ; L # Mc [6] MYANMAR SIGN SHAN TONE-2..MYANMAR SIGN SHAN COUNCIL TONE-3 108E ; L # Lo MYANMAR LETTER RUMAI PALAUNG FA 108F ; L # Mc MYANMAR SIGN RUMAI PALAUNG TONE-5 1090..1099 ; L # Nd [10] MYANMAR SHAN DIGIT ZERO..MYANMAR SHAN DIGIT NINE 109A..109C ; L # Mc [3] MYANMAR SIGN KHAMTI TONE-1..MYANMAR VOWEL SIGN AITON A 109E..109F ; L # So [2] MYANMAR SYMBOL SHAN ONE..MYANMAR SYMBOL SHAN EXCLAMATION 10A0..10C5 ; L # L& [38] GEORGIAN CAPITAL LETTER AN..GEORGIAN CAPITAL LETTER HOE 10C7 ; L # L& GEORGIAN CAPITAL LETTER YN 10CD ; L # L& GEORGIAN CAPITAL LETTER AEN 10D0..10FA ; L # L& [43] GEORGIAN LETTER AN..GEORGIAN LETTER AIN 10FB ; L # Po GEORGIAN PARAGRAPH SEPARATOR 10FC ; L # Lm MODIFIER LETTER GEORGIAN NAR 10FD..10FF ; L # L& [3] GEORGIAN LETTER AEN..GEORGIAN LETTER LABIAL SIGN 1100..1248 ; L # Lo [329] HANGUL CHOSEONG KIYEOK..ETHIOPIC SYLLABLE QWA 124A..124D ; L # Lo [4] ETHIOPIC SYLLABLE QWI..ETHIOPIC SYLLABLE QWE 1250..1256 ; L # Lo [7] 
ETHIOPIC SYLLABLE QHA..ETHIOPIC SYLLABLE QHO 1258 ; L # Lo ETHIOPIC SYLLABLE QHWA 125A..125D ; L # Lo [4] ETHIOPIC SYLLABLE QHWI..ETHIOPIC SYLLABLE QHWE 1260..1288 ; L # Lo [41] ETHIOPIC SYLLABLE BA..ETHIOPIC SYLLABLE XWA 128A..128D ; L # Lo [4] ETHIOPIC SYLLABLE XWI..ETHIOPIC SYLLABLE XWE 1290..12B0 ; L # Lo [33] ETHIOPIC SYLLABLE NA..ETHIOPIC SYLLABLE KWA 12B2..12B5 ; L # Lo [4] ETHIOPIC SYLLABLE KWI..ETHIOPIC SYLLABLE KWE 12B8..12BE ; L # Lo [7] ETHIOPIC SYLLABLE KXA..ETHIOPIC SYLLABLE KXO 12C0 ; L # Lo ETHIOPIC SYLLABLE KXWA 12C2..12C5 ; L # Lo [4] ETHIOPIC SYLLABLE KXWI..ETHIOPIC SYLLABLE KXWE 12C8..12D6 ; L # Lo [15] ETHIOPIC SYLLABLE WA..ETHIOPIC SYLLABLE PHARYNGEAL O 12D8..1310 ; L # Lo [57] ETHIOPIC SYLLABLE ZA..ETHIOPIC SYLLABLE GWA 1312..1315 ; L # Lo [4] ETHIOPIC SYLLABLE GWI..ETHIOPIC SYLLABLE GWE 1318..135A ; L # Lo [67] ETHIOPIC SYLLABLE GGA..ETHIOPIC SYLLABLE FYA 1360..1368 ; L # Po [9] ETHIOPIC SECTION MARK..ETHIOPIC PARAGRAPH SEPARATOR 1369..137C ; L # No [20] ETHIOPIC DIGIT ONE..ETHIOPIC NUMBER TEN THOUSAND 1380..138F ; L # Lo [16] ETHIOPIC SYLLABLE SEBATBEIT MWA..ETHIOPIC SYLLABLE PWE 13A0..13F5 ; L # L& [86] CHEROKEE LETTER A..CHEROKEE LETTER MV 13F8..13FD ; L # L& [6] CHEROKEE SMALL LETTER YE..CHEROKEE SMALL LETTER MV 1401..166C ; L # Lo [620] CANADIAN SYLLABICS E..CANADIAN SYLLABICS CARRIER TTSA 166D ; L # So CANADIAN SYLLABICS CHI SIGN 166E ; L # Po CANADIAN SYLLABICS FULL STOP 166F..167F ; L # Lo [17] CANADIAN SYLLABICS QAI..CANADIAN SYLLABICS BLACKFOOT W 1681..169A ; L # Lo [26] OGHAM LETTER BEITH..OGHAM LETTER PEITH 16A0..16EA ; L # Lo [75] RUNIC LETTER FEHU FEOH FE F..RUNIC LETTER X 16EB..16ED ; L # Po [3] RUNIC SINGLE PUNCTUATION..RUNIC CROSS PUNCTUATION 16EE..16F0 ; L # Nl [3] RUNIC ARLAUG SYMBOL..RUNIC BELGTHOR SYMBOL 16F1..16F8 ; L # Lo [8] RUNIC LETTER K..RUNIC LETTER FRANKS CASKET AESC 1700..1711 ; L # Lo [18] TAGALOG LETTER A..TAGALOG LETTER HA 1715 ; L # Mc TAGALOG SIGN PAMUDPOD 171F..1731 ; L # Lo [19] TAGALOG LETTER ARCHAIC 
RA..HANUNOO LETTER HA 1734 ; L # Mc HANUNOO SIGN PAMUDPOD 1735..1736 ; L # Po [2] PHILIPPINE SINGLE PUNCTUATION..PHILIPPINE DOUBLE PUNCTUATION 1740..1751 ; L # Lo [18] BUHID LETTER A..BUHID LETTER HA 1760..176C ; L # Lo [13] TAGBANWA LETTER A..TAGBANWA LETTER YA 176E..1770 ; L # Lo [3] TAGBANWA LETTER LA..TAGBANWA LETTER SA 1780..17B3 ; L # Lo [52] KHMER LETTER KA..KHMER INDEPENDENT VOWEL QAU 17B6 ; L # Mc KHMER VOWEL SIGN AA 17BE..17C5 ; L # Mc [8] KHMER VOWEL SIGN OE..KHMER VOWEL SIGN AU 17C7..17C8 ; L # Mc [2] KHMER SIGN REAHMUK..KHMER SIGN YUUKALEAPINTU 17D4..17D6 ; L # Po [3] KHMER SIGN KHAN..KHMER SIGN CAMNUC PII KUUH 17D7 ; L # Lm KHMER SIGN LEK TOO 17D8..17DA ; L # Po [3] KHMER SIGN BEYYAL..KHMER SIGN KOOMUUT 17DC ; L # Lo KHMER SIGN AVAKRAHASANYA 17E0..17E9 ; L # Nd [10] KHMER DIGIT ZERO..KHMER DIGIT NINE 1810..1819 ; L # Nd [10] MONGOLIAN DIGIT ZERO..MONGOLIAN DIGIT NINE 1820..1842 ; L # Lo [35] MONGOLIAN LETTER A..MONGOLIAN LETTER CHI 1843 ; L # Lm MONGOLIAN LETTER TODO LONG VOWEL SIGN 1844..1878 ; L # Lo [53] MONGOLIAN LETTER TODO E..MONGOLIAN LETTER CHA WITH TWO DOTS 1880..1884 ; L # Lo [5] MONGOLIAN LETTER ALI GALI ANUSVARA ONE..MONGOLIAN LETTER ALI GALI INVERTED UBADAMA 1887..18A8 ; L # Lo [34] MONGOLIAN LETTER ALI GALI A..MONGOLIAN LETTER MANCHU ALI GALI BHA 18AA ; L # Lo MONGOLIAN LETTER MANCHU ALI GALI LHA 18B0..18F5 ; L # Lo [70] CANADIAN SYLLABICS OY..CANADIAN SYLLABICS CARRIER DENTAL S 1900..191E ; L # Lo [31] LIMBU VOWEL-CARRIER LETTER..LIMBU LETTER TRA 1923..1926 ; L # Mc [4] LIMBU VOWEL SIGN EE..LIMBU VOWEL SIGN AU 1929..192B ; L # Mc [3] LIMBU SUBJOINED LETTER YA..LIMBU SUBJOINED LETTER WA 1930..1931 ; L # Mc [2] LIMBU SMALL LETTER KA..LIMBU SMALL LETTER NGA 1933..1938 ; L # Mc [6] LIMBU SMALL LETTER TA..LIMBU SMALL LETTER LA 1946..194F ; L # Nd [10] LIMBU DIGIT ZERO..LIMBU DIGIT NINE 1950..196D ; L # Lo [30] TAI LE LETTER KA..TAI LE LETTER AI 1970..1974 ; L # Lo [5] TAI LE LETTER TONE-2..TAI LE LETTER TONE-6 1980..19AB ; L # Lo [44] NEW 
TAI LUE LETTER HIGH QA..NEW TAI LUE LETTER LOW SUA 19B0..19C9 ; L # Lo [26] NEW TAI LUE VOWEL SIGN VOWEL SHORTENER..NEW TAI LUE TONE MARK-2 19D0..19D9 ; L # Nd [10] NEW TAI LUE DIGIT ZERO..NEW TAI LUE DIGIT NINE 19DA ; L # No NEW TAI LUE THAM DIGIT ONE 1A00..1A16 ; L # Lo [23] BUGINESE LETTER KA..BUGINESE LETTER HA 1A19..1A1A ; L # Mc [2] BUGINESE VOWEL SIGN E..BUGINESE VOWEL SIGN O 1A1E..1A1F ; L # Po [2] BUGINESE PALLAWA..BUGINESE END OF SECTION 1A20..1A54 ; L # Lo [53] TAI THAM LETTER HIGH KA..TAI THAM LETTER GREAT SA 1A55 ; L # Mc TAI THAM CONSONANT SIGN MEDIAL RA 1A57 ; L # Mc TAI THAM CONSONANT SIGN LA TANG LAI 1A61 ; L # Mc TAI THAM VOWEL SIGN A 1A63..1A64 ; L # Mc [2] TAI THAM VOWEL SIGN AA..TAI THAM VOWEL SIGN TALL AA 1A6D..1A72 ; L # Mc [6] TAI THAM VOWEL SIGN OY..TAI THAM VOWEL SIGN THAM AI 1A80..1A89 ; L # Nd [10] TAI THAM HORA DIGIT ZERO..TAI THAM HORA DIGIT NINE 1A90..1A99 ; L # Nd [10] TAI THAM THAM DIGIT ZERO..TAI THAM THAM DIGIT NINE 1AA0..1AA6 ; L # Po [7] TAI THAM SIGN WIANG..TAI THAM SIGN REVERSED ROTATED RANA 1AA7 ; L # Lm TAI THAM SIGN MAI YAMOK 1AA8..1AAD ; L # Po [6] TAI THAM SIGN KAAN..TAI THAM SIGN CAANG 1B04 ; L # Mc BALINESE SIGN BISAH 1B05..1B33 ; L # Lo [47] BALINESE LETTER AKARA..BALINESE LETTER HA 1B35 ; L # Mc BALINESE VOWEL SIGN TEDUNG 1B3B ; L # Mc BALINESE VOWEL SIGN RA REPA TEDUNG 1B3D..1B41 ; L # Mc [5] BALINESE VOWEL SIGN LA LENGA TEDUNG..BALINESE VOWEL SIGN TALING REPA TEDUNG 1B43..1B44 ; L # Mc [2] BALINESE VOWEL SIGN PEPET TEDUNG..BALINESE ADEG ADEG 1B45..1B4C ; L # Lo [8] BALINESE LETTER KAF SASAK..BALINESE LETTER ARCHAIC JNYA 1B4E..1B4F ; L # Po [2] BALINESE INVERTED CARIK SIKI..BALINESE INVERTED CARIK PAREREN 1B50..1B59 ; L # Nd [10] BALINESE DIGIT ZERO..BALINESE DIGIT NINE 1B5A..1B60 ; L # Po [7] BALINESE PANTI..BALINESE PAMENENG 1B61..1B6A ; L # So [10] BALINESE MUSICAL SYMBOL DONG..BALINESE MUSICAL SYMBOL DANG GEDE 1B74..1B7C ; L # So [9] BALINESE MUSICAL SYMBOL RIGHT-HAND OPEN DUG..BALINESE MUSICAL SYMBOL LEFT-HAND 
OPEN PING 1B7D..1B7F ; L # Po [3] BALINESE PANTI LANTANG..BALINESE PANTI BAWAK 1B82 ; L # Mc SUNDANESE SIGN PANGWISAD 1B83..1BA0 ; L # Lo [30] SUNDANESE LETTER A..SUNDANESE LETTER HA 1BA1 ; L # Mc SUNDANESE CONSONANT SIGN PAMINGKAL 1BA6..1BA7 ; L # Mc [2] SUNDANESE VOWEL SIGN PANAELAENG..SUNDANESE VOWEL SIGN PANOLONG 1BAA ; L # Mc SUNDANESE SIGN PAMAAEH 1BAE..1BAF ; L # Lo [2] SUNDANESE LETTER KHA..SUNDANESE LETTER SYA 1BB0..1BB9 ; L # Nd [10] SUNDANESE DIGIT ZERO..SUNDANESE DIGIT NINE 1BBA..1BE5 ; L # Lo [44] SUNDANESE AVAGRAHA..BATAK LETTER U 1BE7 ; L # Mc BATAK VOWEL SIGN E 1BEA..1BEC ; L # Mc [3] BATAK VOWEL SIGN I..BATAK VOWEL SIGN O 1BEE ; L # Mc BATAK VOWEL SIGN U 1BF2..1BF3 ; L # Mc [2] BATAK PANGOLAT..BATAK PANONGONAN 1BFC..1BFF ; L # Po [4] BATAK SYMBOL BINDU NA METEK..BATAK SYMBOL BINDU PANGOLAT 1C00..1C23 ; L # Lo [36] LEPCHA LETTER KA..LEPCHA LETTER A 1C24..1C2B ; L # Mc [8] LEPCHA SUBJOINED LETTER YA..LEPCHA VOWEL SIGN UU 1C34..1C35 ; L # Mc [2] LEPCHA CONSONANT SIGN NYIN-DO..LEPCHA CONSONANT SIGN KANG 1C3B..1C3F ; L # Po [5] LEPCHA PUNCTUATION TA-ROL..LEPCHA PUNCTUATION TSHOOK 1C40..1C49 ; L # Nd [10] LEPCHA DIGIT ZERO..LEPCHA DIGIT NINE 1C4D..1C4F ; L # Lo [3] LEPCHA LETTER TTA..LEPCHA LETTER DDA 1C50..1C59 ; L # Nd [10] OL CHIKI DIGIT ZERO..OL CHIKI DIGIT NINE 1C5A..1C77 ; L # Lo [30] OL CHIKI LETTER LA..OL CHIKI LETTER OH 1C78..1C7D ; L # Lm [6] OL CHIKI MU TTUDDAG..OL CHIKI AHAD 1C7E..1C7F ; L # Po [2] OL CHIKI PUNCTUATION MUCAAD..OL CHIKI PUNCTUATION DOUBLE MUCAAD 1C80..1C8A ; L # L& [11] CYRILLIC SMALL LETTER ROUNDED VE..CYRILLIC SMALL LETTER TJE 1C90..1CBA ; L # L& [43] GEORGIAN MTAVRULI CAPITAL LETTER AN..GEORGIAN MTAVRULI CAPITAL LETTER AIN 1CBD..1CBF ; L # L& [3] GEORGIAN MTAVRULI CAPITAL LETTER AEN..GEORGIAN MTAVRULI CAPITAL LETTER LABIAL SIGN 1CC0..1CC7 ; L # Po [8] SUNDANESE PUNCTUATION BINDU SURYA..SUNDANESE PUNCTUATION BINDU BA SATANGA 1CD3 ; L # Po VEDIC SIGN NIHSHVASA 1CE1 ; L # Mc VEDIC TONE ATHARVAVEDIC INDEPENDENT SVARITA 
1CE9..1CEC ; L # Lo [4] VEDIC SIGN ANUSVARA ANTARGOMUKHA..VEDIC SIGN ANUSVARA VAMAGOMUKHA WITH TAIL 1CEE..1CF3 ; L # Lo [6] VEDIC SIGN HEXIFORM LONG ANUSVARA..VEDIC SIGN ROTATED ARDHAVISARGA 1CF5..1CF6 ; L # Lo [2] VEDIC SIGN JIHVAMULIYA..VEDIC SIGN UPADHMANIYA 1CF7 ; L # Mc VEDIC SIGN ATIKRAMA 1CFA ; L # Lo VEDIC SIGN DOUBLE ANUSVARA ANTARGOMUKHA 1D00..1D2B ; L # L& [44] LATIN LETTER SMALL CAPITAL A..CYRILLIC LETTER SMALL CAPITAL EL 1D2C..1D6A ; L # Lm [63] MODIFIER LETTER CAPITAL A..GREEK SUBSCRIPT SMALL LETTER CHI 1D6B..1D77 ; L # L& [13] LATIN SMALL LETTER UE..LATIN SMALL LETTER TURNED G 1D78 ; L # Lm MODIFIER LETTER CYRILLIC EN 1D79..1D9A ; L # L& [34] LATIN SMALL LETTER INSULAR G..LATIN SMALL LETTER EZH WITH RETROFLEX HOOK 1D9B..1DBF ; L # Lm [37] MODIFIER LETTER SMALL TURNED ALPHA..MODIFIER LETTER SMALL THETA 1E00..1F15 ; L # L& [278] LATIN CAPITAL LETTER A WITH RING BELOW..GREEK SMALL LETTER EPSILON WITH DASIA AND OXIA 1F18..1F1D ; L # L& [6] GREEK CAPITAL LETTER EPSILON WITH PSILI..GREEK CAPITAL LETTER EPSILON WITH DASIA AND OXIA 1F20..1F45 ; L # L& [38] GREEK SMALL LETTER ETA WITH PSILI..GREEK SMALL LETTER OMICRON WITH DASIA AND OXIA 1F48..1F4D ; L # L& [6] GREEK CAPITAL LETTER OMICRON WITH PSILI..GREEK CAPITAL LETTER OMICRON WITH DASIA AND OXIA 1F50..1F57 ; L # L& [8] GREEK SMALL LETTER UPSILON WITH PSILI..GREEK SMALL LETTER UPSILON WITH DASIA AND PERISPOMENI 1F59 ; L # L& GREEK CAPITAL LETTER UPSILON WITH DASIA 1F5B ; L # L& GREEK CAPITAL LETTER UPSILON WITH DASIA AND VARIA 1F5D ; L # L& GREEK CAPITAL LETTER UPSILON WITH DASIA AND OXIA 1F5F..1F7D ; L # L& [31] GREEK CAPITAL LETTER UPSILON WITH DASIA AND PERISPOMENI..GREEK SMALL LETTER OMEGA WITH OXIA 1F80..1FB4 ; L # L& [53] GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI..GREEK SMALL LETTER ALPHA WITH OXIA AND YPOGEGRAMMENI 1FB6..1FBC ; L # L& [7] GREEK SMALL LETTER ALPHA WITH PERISPOMENI..GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI 1FBE ; L # L& GREEK PROSGEGRAMMENI 1FC2..1FC4 ; L # L& [3] 
GREEK SMALL LETTER ETA WITH VARIA AND YPOGEGRAMMENI..GREEK SMALL LETTER ETA WITH OXIA AND YPOGEGRAMMENI 1FC6..1FCC ; L # L& [7] GREEK SMALL LETTER ETA WITH PERISPOMENI..GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI 1FD0..1FD3 ; L # L& [4] GREEK SMALL LETTER IOTA WITH VRACHY..GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA 1FD6..1FDB ; L # L& [6] GREEK SMALL LETTER IOTA WITH PERISPOMENI..GREEK CAPITAL LETTER IOTA WITH OXIA 1FE0..1FEC ; L # L& [13] GREEK SMALL LETTER UPSILON WITH VRACHY..GREEK CAPITAL LETTER RHO WITH DASIA 1FF2..1FF4 ; L # L& [3] GREEK SMALL LETTER OMEGA WITH VARIA AND YPOGEGRAMMENI..GREEK SMALL LETTER OMEGA WITH OXIA AND YPOGEGRAMMENI 1FF6..1FFC ; L # L& [7] GREEK SMALL LETTER OMEGA WITH PERISPOMENI..GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI 200E ; L # Cf LEFT-TO-RIGHT MARK 2071 ; L # Lm SUPERSCRIPT LATIN SMALL LETTER I 207F ; L # Lm SUPERSCRIPT LATIN SMALL LETTER N 2090..209C ; L # Lm [13] LATIN SUBSCRIPT SMALL LETTER A..LATIN SUBSCRIPT SMALL LETTER T 2102 ; L # L& DOUBLE-STRUCK CAPITAL C 2107 ; L # L& EULER CONSTANT 210A..2113 ; L # L& [10] SCRIPT SMALL G..SCRIPT SMALL L 2115 ; L # L& DOUBLE-STRUCK CAPITAL N 2119..211D ; L # L& [5] DOUBLE-STRUCK CAPITAL P..DOUBLE-STRUCK CAPITAL R 2124 ; L # L& DOUBLE-STRUCK CAPITAL Z 2126 ; L # L& OHM SIGN 2128 ; L # L& BLACK-LETTER CAPITAL Z 212A..212D ; L # L& [4] KELVIN SIGN..BLACK-LETTER CAPITAL C 212F..2134 ; L # L& [6] SCRIPT SMALL E..SCRIPT SMALL O 2135..2138 ; L # Lo [4] ALEF SYMBOL..DALET SYMBOL 2139 ; L # L& INFORMATION SOURCE 213C..213F ; L # L& [4] DOUBLE-STRUCK SMALL PI..DOUBLE-STRUCK CAPITAL PI 2145..2149 ; L # L& [5] DOUBLE-STRUCK ITALIC CAPITAL D..DOUBLE-STRUCK ITALIC SMALL J 214E ; L # L& TURNED SMALL F 214F ; L # So SYMBOL FOR SAMARITAN SOURCE 2160..2182 ; L # Nl [35] ROMAN NUMERAL ONE..ROMAN NUMERAL TEN THOUSAND 2183..2184 ; L # L& [2] ROMAN NUMERAL REVERSED ONE HUNDRED..LATIN SMALL LETTER REVERSED C 2185..2188 ; L # Nl [4] ROMAN NUMERAL SIX LATE FORM..ROMAN NUMERAL ONE HUNDRED THOUSAND 
2336..237A ; L # So [69] APL FUNCTIONAL SYMBOL I-BEAM..APL FUNCTIONAL SYMBOL ALPHA 2395 ; L # So APL FUNCTIONAL SYMBOL QUAD 249C..24E9 ; L # So [78] PARENTHESIZED LATIN SMALL LETTER A..CIRCLED LATIN SMALL LETTER Z 26AC ; L # So MEDIUM SMALL WHITE CIRCLE 2800..28FF ; L # So [256] BRAILLE PATTERN BLANK..BRAILLE PATTERN DOTS-12345678 2C00..2C7B ; L # L& [124] GLAGOLITIC CAPITAL LETTER AZU..LATIN LETTER SMALL CAPITAL TURNED E 2C7C..2C7D ; L # Lm [2] LATIN SUBSCRIPT SMALL LETTER J..MODIFIER LETTER CAPITAL V 2C7E..2CE4 ; L # L& [103] LATIN CAPITAL LETTER S WITH SWASH TAIL..COPTIC SYMBOL KAI 2CEB..2CEE ; L # L& [4] COPTIC CAPITAL LETTER CRYPTOGRAMMIC SHEI..COPTIC SMALL LETTER CRYPTOGRAMMIC GANGIA 2CF2..2CF3 ; L # L& [2] COPTIC CAPITAL LETTER BOHAIRIC KHEI..COPTIC SMALL LETTER BOHAIRIC KHEI 2D00..2D25 ; L # L& [38] GEORGIAN SMALL LETTER AN..GEORGIAN SMALL LETTER HOE 2D27 ; L # L& GEORGIAN SMALL LETTER YN 2D2D ; L # L& GEORGIAN SMALL LETTER AEN 2D30..2D67 ; L # Lo [56] TIFINAGH LETTER YA..TIFINAGH LETTER YO 2D6F ; L # Lm TIFINAGH MODIFIER LETTER LABIALIZATION MARK 2D70 ; L # Po TIFINAGH SEPARATOR MARK 2D80..2D96 ; L # Lo [23] ETHIOPIC SYLLABLE LOA..ETHIOPIC SYLLABLE GGWE 2DA0..2DA6 ; L # Lo [7] ETHIOPIC SYLLABLE SSA..ETHIOPIC SYLLABLE SSO 2DA8..2DAE ; L # Lo [7] ETHIOPIC SYLLABLE CCA..ETHIOPIC SYLLABLE CCO 2DB0..2DB6 ; L # Lo [7] ETHIOPIC SYLLABLE ZZA..ETHIOPIC SYLLABLE ZZO 2DB8..2DBE ; L # Lo [7] ETHIOPIC SYLLABLE CCHA..ETHIOPIC SYLLABLE CCHO 2DC0..2DC6 ; L # Lo [7] ETHIOPIC SYLLABLE QYA..ETHIOPIC SYLLABLE QYO 2DC8..2DCE ; L # Lo [7] ETHIOPIC SYLLABLE KYA..ETHIOPIC SYLLABLE KYO 2DD0..2DD6 ; L # Lo [7] ETHIOPIC SYLLABLE XYA..ETHIOPIC SYLLABLE XYO 2DD8..2DDE ; L # Lo [7] ETHIOPIC SYLLABLE GYA..ETHIOPIC SYLLABLE GYO 3005 ; L # Lm IDEOGRAPHIC ITERATION MARK 3006 ; L # Lo IDEOGRAPHIC CLOSING MARK 3007 ; L # Nl IDEOGRAPHIC NUMBER ZERO 3021..3029 ; L # Nl [9] HANGZHOU NUMERAL ONE..HANGZHOU NUMERAL NINE 302E..302F ; L # Mc [2] HANGUL SINGLE DOT TONE MARK..HANGUL DOUBLE DOT TONE 
MARK 3031..3035 ; L # Lm [5] VERTICAL KANA REPEAT MARK..VERTICAL KANA REPEAT MARK LOWER HALF 3038..303A ; L # Nl [3] HANGZHOU NUMERAL TEN..HANGZHOU NUMERAL THIRTY 303B ; L # Lm VERTICAL IDEOGRAPHIC ITERATION MARK 303C ; L # Lo MASU MARK 3041..3096 ; L # Lo [86] HIRAGANA LETTER SMALL A..HIRAGANA LETTER SMALL KE 309D..309E ; L # Lm [2] HIRAGANA ITERATION MARK..HIRAGANA VOICED ITERATION MARK 309F ; L # Lo HIRAGANA DIGRAPH YORI 30A1..30FA ; L # Lo [90] KATAKANA LETTER SMALL A..KATAKANA LETTER VO 30FC..30FE ; L # Lm [3] KATAKANA-HIRAGANA PROLONGED SOUND MARK..KATAKANA VOICED ITERATION MARK 30FF ; L # Lo KATAKANA DIGRAPH KOTO 3105..312F ; L # Lo [43] BOPOMOFO LETTER B..BOPOMOFO LETTER NN 3131..318E ; L # Lo [94] HANGUL LETTER KIYEOK..HANGUL LETTER ARAEAE 3190..3191 ; L # So [2] IDEOGRAPHIC ANNOTATION LINKING MARK..IDEOGRAPHIC ANNOTATION REVERSE MARK 3192..3195 ; L # No [4] IDEOGRAPHIC ANNOTATION ONE MARK..IDEOGRAPHIC ANNOTATION FOUR MARK 3196..319F ; L # So [10] IDEOGRAPHIC ANNOTATION TOP MARK..IDEOGRAPHIC ANNOTATION MAN MARK 31A0..31BF ; L # Lo [32] BOPOMOFO LETTER BU..BOPOMOFO LETTER AH 31F0..31FF ; L # Lo [16] KATAKANA LETTER SMALL KU..KATAKANA LETTER SMALL RO 3200..321C ; L # So [29] PARENTHESIZED HANGUL KIYEOK..PARENTHESIZED HANGUL CIEUC U 3220..3229 ; L # No [10] PARENTHESIZED IDEOGRAPH ONE..PARENTHESIZED IDEOGRAPH TEN 322A..3247 ; L # So [30] PARENTHESIZED IDEOGRAPH MOON..CIRCLED IDEOGRAPH KOTO 3248..324F ; L # No [8] CIRCLED NUMBER TEN ON BLACK SQUARE..CIRCLED NUMBER EIGHTY ON BLACK SQUARE 3260..327B ; L # So [28] CIRCLED HANGUL KIYEOK..CIRCLED HANGUL HIEUH A 327F ; L # So KOREAN STANDARD SYMBOL 3280..3289 ; L # No [10] CIRCLED IDEOGRAPH ONE..CIRCLED IDEOGRAPH TEN 328A..32B0 ; L # So [39] CIRCLED IDEOGRAPH MOON..CIRCLED IDEOGRAPH NIGHT 32C0..32CB ; L # So [12] IDEOGRAPHIC TELEGRAPH SYMBOL FOR JANUARY..IDEOGRAPHIC TELEGRAPH SYMBOL FOR DECEMBER 32D0..3376 ; L # So [167] CIRCLED KATAKANA A..SQUARE PC 337B..33DD ; L # So [99] SQUARE ERA NAME HEISEI..SQUARE WB 
33E0..33FE ; L # So [31] IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY ONE..IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY THIRTY-ONE 3400..4DBF ; L # Lo [6592] CJK UNIFIED IDEOGRAPH-3400..CJK UNIFIED IDEOGRAPH-4DBF 4E00..A014 ; L # Lo [21013] CJK UNIFIED IDEOGRAPH-4E00..YI SYLLABLE E A015 ; L # Lm YI SYLLABLE WU A016..A48C ; L # Lo [1143] YI SYLLABLE BIT..YI SYLLABLE YYR A4D0..A4F7 ; L # Lo [40] LISU LETTER BA..LISU LETTER OE A4F8..A4FD ; L # Lm [6] LISU LETTER TONE MYA TI..LISU LETTER TONE MYA JEU A4FE..A4FF ; L # Po [2] LISU PUNCTUATION COMMA..LISU PUNCTUATION FULL STOP A500..A60B ; L # Lo [268] VAI SYLLABLE EE..VAI SYLLABLE NG A60C ; L # Lm VAI SYLLABLE LENGTHENER A610..A61F ; L # Lo [16] VAI SYLLABLE NDOLE FA..VAI SYMBOL JONG A620..A629 ; L # Nd [10] VAI DIGIT ZERO..VAI DIGIT NINE A62A..A62B ; L # Lo [2] VAI SYLLABLE NDOLE MA..VAI SYLLABLE NDOLE DO A640..A66D ; L # L& [46] CYRILLIC CAPITAL LETTER ZEMLYA..CYRILLIC SMALL LETTER DOUBLE MONOCULAR O A66E ; L # Lo CYRILLIC LETTER MULTIOCULAR O A680..A69B ; L # L& [28] CYRILLIC CAPITAL LETTER DWE..CYRILLIC SMALL LETTER CROSSED O A69C..A69D ; L # Lm [2] MODIFIER LETTER CYRILLIC HARD SIGN..MODIFIER LETTER CYRILLIC SOFT SIGN A6A0..A6E5 ; L # Lo [70] BAMUM LETTER A..BAMUM LETTER KI A6E6..A6EF ; L # Nl [10] BAMUM LETTER MO..BAMUM LETTER KOGHOM A6F2..A6F7 ; L # Po [6] BAMUM NJAEMLI..BAMUM QUESTION MARK A722..A76F ; L # L& [78] LATIN CAPITAL LETTER EGYPTOLOGICAL ALEF..LATIN SMALL LETTER CON A770 ; L # Lm MODIFIER LETTER US A771..A787 ; L # L& [23] LATIN SMALL LETTER DUM..LATIN SMALL LETTER INSULAR T A789..A78A ; L # Sk [2] MODIFIER LETTER COLON..MODIFIER LETTER SHORT EQUALS SIGN A78B..A78E ; L # L& [4] LATIN CAPITAL LETTER SALTILLO..LATIN SMALL LETTER L WITH RETROFLEX HOOK AND BELT A78F ; L # Lo LATIN LETTER SINOLOGICAL DOT A790..A7DC ; L # L& [77] LATIN CAPITAL LETTER N WITH DESCENDER..LATIN CAPITAL LETTER LAMBDA WITH STROKE A7F1..A7F4 ; L # Lm [4] MODIFIER LETTER CAPITAL S..MODIFIER LETTER CAPITAL Q A7F5..A7F6 ; L # L& [2] LATIN CAPITAL 
LETTER REVERSED HALF H..LATIN SMALL LETTER REVERSED HALF H A7F7 ; L # Lo LATIN EPIGRAPHIC LETTER SIDEWAYS I A7F8..A7F9 ; L # Lm [2] MODIFIER LETTER CAPITAL H WITH STROKE..MODIFIER LETTER SMALL LIGATURE OE A7FA ; L # L& LATIN LETTER SMALL CAPITAL TURNED M A7FB..A801 ; L # Lo [7] LATIN EPIGRAPHIC LETTER REVERSED F..SYLOTI NAGRI LETTER I A803..A805 ; L # Lo [3] SYLOTI NAGRI LETTER U..SYLOTI NAGRI LETTER O A807..A80A ; L # Lo [4] SYLOTI NAGRI LETTER KO..SYLOTI NAGRI LETTER GHO A80C..A822 ; L # Lo [23] SYLOTI NAGRI LETTER CO..SYLOTI NAGRI LETTER HO A823..A824 ; L # Mc [2] SYLOTI NAGRI VOWEL SIGN A..SYLOTI NAGRI VOWEL SIGN I A827 ; L # Mc SYLOTI NAGRI VOWEL SIGN OO A830..A835 ; L # No [6] NORTH INDIC FRACTION ONE QUARTER..NORTH INDIC FRACTION THREE SIXTEENTHS A836..A837 ; L # So [2] NORTH INDIC QUARTER MARK..NORTH INDIC PLACEHOLDER MARK A840..A873 ; L # Lo [52] PHAGS-PA LETTER KA..PHAGS-PA LETTER CANDRABINDU A880..A881 ; L # Mc [2] SAURASHTRA SIGN ANUSVARA..SAURASHTRA SIGN VISARGA A882..A8B3 ; L # Lo [50] SAURASHTRA LETTER A..SAURASHTRA LETTER LLA A8B4..A8C3 ; L # Mc [16] SAURASHTRA CONSONANT SIGN HAARU..SAURASHTRA VOWEL SIGN AU A8CE..A8CF ; L # Po [2] SAURASHTRA DANDA..SAURASHTRA DOUBLE DANDA A8D0..A8D9 ; L # Nd [10] SAURASHTRA DIGIT ZERO..SAURASHTRA DIGIT NINE A8F2..A8F7 ; L # Lo [6] DEVANAGARI SIGN SPACING CANDRABINDU..DEVANAGARI SIGN CANDRABINDU AVAGRAHA A8F8..A8FA ; L # Po [3] DEVANAGARI SIGN PUSHPIKA..DEVANAGARI CARET A8FB ; L # Lo DEVANAGARI HEADSTROKE A8FC ; L # Po DEVANAGARI SIGN SIDDHAM A8FD..A8FE ; L # Lo [2] DEVANAGARI JAIN OM..DEVANAGARI LETTER AY A900..A909 ; L # Nd [10] KAYAH LI DIGIT ZERO..KAYAH LI DIGIT NINE A90A..A925 ; L # Lo [28] KAYAH LI LETTER KA..KAYAH LI LETTER OO A92E..A92F ; L # Po [2] KAYAH LI SIGN CWI..KAYAH LI SIGN SHYA A930..A946 ; L # Lo [23] REJANG LETTER KA..REJANG LETTER A A952..A953 ; L # Mc [2] REJANG CONSONANT SIGN H..REJANG VIRAMA A95F ; L # Po REJANG SECTION MARK A960..A97C ; L # Lo [29] HANGUL CHOSEONG TIKEUT-MIEUM..HANGUL CHOSEONG 
SSANGYEORINHIEUH A983 ; L # Mc JAVANESE SIGN WIGNYAN A984..A9B2 ; L # Lo [47] JAVANESE LETTER A..JAVANESE LETTER HA A9B4..A9B5 ; L # Mc [2] JAVANESE VOWEL SIGN TARUNG..JAVANESE VOWEL SIGN TOLONG A9BA..A9BB ; L # Mc [2] JAVANESE VOWEL SIGN TALING..JAVANESE VOWEL SIGN DIRGA MURE A9BE..A9C0 ; L # Mc [3] JAVANESE CONSONANT SIGN PENGKAL..JAVANESE PANGKON A9C1..A9CD ; L # Po [13] JAVANESE LEFT RERENGGAN..JAVANESE TURNED PADA PISELEH A9CF ; L # Lm JAVANESE PANGRANGKEP A9D0..A9D9 ; L # Nd [10] JAVANESE DIGIT ZERO..JAVANESE DIGIT NINE A9DE..A9DF ; L # Po [2] JAVANESE PADA TIRTA TUMETES..JAVANESE PADA ISEN-ISEN A9E0..A9E4 ; L # Lo [5] MYANMAR LETTER SHAN GHA..MYANMAR LETTER SHAN BHA A9E6 ; L # Lm MYANMAR MODIFIER LETTER SHAN REDUPLICATION A9E7..A9EF ; L # Lo [9] MYANMAR LETTER TAI LAING NYA..MYANMAR LETTER TAI LAING NNA A9F0..A9F9 ; L # Nd [10] MYANMAR TAI LAING DIGIT ZERO..MYANMAR TAI LAING DIGIT NINE A9FA..A9FE ; L # Lo [5] MYANMAR LETTER TAI LAING LLA..MYANMAR LETTER TAI LAING BHA AA00..AA28 ; L # Lo [41] CHAM LETTER A..CHAM LETTER HA AA2F..AA30 ; L # Mc [2] CHAM VOWEL SIGN O..CHAM VOWEL SIGN AI AA33..AA34 ; L # Mc [2] CHAM CONSONANT SIGN YA..CHAM CONSONANT SIGN RA AA40..AA42 ; L # Lo [3] CHAM LETTER FINAL K..CHAM LETTER FINAL NG AA44..AA4B ; L # Lo [8] CHAM LETTER FINAL CH..CHAM LETTER FINAL SS AA4D ; L # Mc CHAM CONSONANT SIGN FINAL H AA50..AA59 ; L # Nd [10] CHAM DIGIT ZERO..CHAM DIGIT NINE AA5C..AA5F ; L # Po [4] CHAM PUNCTUATION SPIRAL..CHAM PUNCTUATION TRIPLE DANDA AA60..AA6F ; L # Lo [16] MYANMAR LETTER KHAMTI GA..MYANMAR LETTER KHAMTI FA AA70 ; L # Lm MYANMAR MODIFIER LETTER KHAMTI REDUPLICATION AA71..AA76 ; L # Lo [6] MYANMAR LETTER KHAMTI XA..MYANMAR LOGOGRAM KHAMTI HM AA77..AA79 ; L # So [3] MYANMAR SYMBOL AITON EXCLAMATION..MYANMAR SYMBOL AITON TWO AA7A ; L # Lo MYANMAR LETTER AITON RA AA7B ; L # Mc MYANMAR SIGN PAO KAREN TONE AA7D ; L # Mc MYANMAR SIGN TAI LAING TONE-5 AA7E..AAAF ; L # Lo [50] MYANMAR LETTER SHWE PALAUNG CHA..TAI VIET LETTER HIGH O AAB1 ; L # 
Lo TAI VIET VOWEL AA AAB5..AAB6 ; L # Lo [2] TAI VIET VOWEL E..TAI VIET VOWEL O AAB9..AABD ; L # Lo [5] TAI VIET VOWEL UEA..TAI VIET VOWEL AN AAC0 ; L # Lo TAI VIET TONE MAI NUENG AAC2 ; L # Lo TAI VIET TONE MAI SONG AADB..AADC ; L # Lo [2] TAI VIET SYMBOL KON..TAI VIET SYMBOL NUENG AADD ; L # Lm TAI VIET SYMBOL SAM AADE..AADF ; L # Po [2] TAI VIET SYMBOL HO HOI..TAI VIET SYMBOL KOI KOI AAE0..AAEA ; L # Lo [11] MEETEI MAYEK LETTER E..MEETEI MAYEK LETTER SSA AAEB ; L # Mc MEETEI MAYEK VOWEL SIGN II AAEE..AAEF ; L # Mc [2] MEETEI MAYEK VOWEL SIGN AU..MEETEI MAYEK VOWEL SIGN AAU AAF0..AAF1 ; L # Po [2] MEETEI MAYEK CHEIKHAN..MEETEI MAYEK AHANG KHUDAM AAF2 ; L # Lo MEETEI MAYEK ANJI AAF3..AAF4 ; L # Lm [2] MEETEI MAYEK SYLLABLE REPETITION MARK..MEETEI MAYEK WORD REPETITION MARK AAF5 ; L # Mc MEETEI MAYEK VOWEL SIGN VISARGA AB01..AB06 ; L # Lo [6] ETHIOPIC SYLLABLE TTHU..ETHIOPIC SYLLABLE TTHO AB09..AB0E ; L # Lo [6] ETHIOPIC SYLLABLE DDHU..ETHIOPIC SYLLABLE DDHO AB11..AB16 ; L # Lo [6] ETHIOPIC SYLLABLE DZU..ETHIOPIC SYLLABLE DZO AB20..AB26 ; L # Lo [7] ETHIOPIC SYLLABLE CCHHA..ETHIOPIC SYLLABLE CCHHO AB28..AB2E ; L # Lo [7] ETHIOPIC SYLLABLE BBA..ETHIOPIC SYLLABLE BBO AB30..AB5A ; L # L& [43] LATIN SMALL LETTER BARRED ALPHA..LATIN SMALL LETTER Y WITH SHORT RIGHT LEG AB5B ; L # Sk MODIFIER BREVE WITH INVERTED BREVE AB5C..AB5F ; L # Lm [4] MODIFIER LETTER SMALL HENG..MODIFIER LETTER SMALL U WITH LEFT HOOK AB60..AB68 ; L # L& [9] LATIN SMALL LETTER SAKHA YAT..LATIN SMALL LETTER TURNED R WITH MIDDLE TILDE AB69 ; L # Lm MODIFIER LETTER SMALL TURNED W AB70..ABBF ; L # L& [80] CHEROKEE SMALL LETTER A..CHEROKEE SMALL LETTER YA ABC0..ABE2 ; L # Lo [35] MEETEI MAYEK LETTER KOK..MEETEI MAYEK LETTER I LONSUM ABE3..ABE4 ; L # Mc [2] MEETEI MAYEK VOWEL SIGN ONAP..MEETEI MAYEK VOWEL SIGN INAP ABE6..ABE7 ; L # Mc [2] MEETEI MAYEK VOWEL SIGN YENAP..MEETEI MAYEK VOWEL SIGN SOUNAP ABE9..ABEA ; L # Mc [2] MEETEI MAYEK VOWEL SIGN CHEINAP..MEETEI MAYEK VOWEL SIGN NUNG ABEB ; L # Po MEETEI 
MAYEK CHEIKHEI ABEC ; L # Mc MEETEI MAYEK LUM IYEK ABF0..ABF9 ; L # Nd [10] MEETEI MAYEK DIGIT ZERO..MEETEI MAYEK DIGIT NINE AC00..D7A3 ; L # Lo [11172] HANGUL SYLLABLE GA..HANGUL SYLLABLE HIH D7B0..D7C6 ; L # Lo [23] HANGUL JUNGSEONG O-YEO..HANGUL JUNGSEONG ARAEA-E D7CB..D7FB ; L # Lo [49] HANGUL JONGSEONG NIEUN-RIEUL..HANGUL JONGSEONG PHIEUPH-THIEUTH E000..F8FF ; L # Co [6400] .. F900..FA6D ; L # Lo [366] CJK COMPATIBILITY IDEOGRAPH-F900..CJK COMPATIBILITY IDEOGRAPH-FA6D FA70..FAD9 ; L # Lo [106] CJK COMPATIBILITY IDEOGRAPH-FA70..CJK COMPATIBILITY IDEOGRAPH-FAD9 FB00..FB06 ; L # L& [7] LATIN SMALL LIGATURE FF..LATIN SMALL LIGATURE ST FB13..FB17 ; L # L& [5] ARMENIAN SMALL LIGATURE MEN NOW..ARMENIAN SMALL LIGATURE MEN XEH FF21..FF3A ; L # L& [26] FULLWIDTH LATIN CAPITAL LETTER A..FULLWIDTH LATIN CAPITAL LETTER Z FF41..FF5A ; L # L& [26] FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH LATIN SMALL LETTER Z FF66..FF6F ; L # Lo [10] HALFWIDTH KATAKANA LETTER WO..HALFWIDTH KATAKANA LETTER SMALL TU FF70 ; L # Lm HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK FF71..FF9D ; L # Lo [45] HALFWIDTH KATAKANA LETTER A..HALFWIDTH KATAKANA LETTER N FF9E..FF9F ; L # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK FFA0..FFBE ; L # Lo [31] HALFWIDTH HANGUL FILLER..HALFWIDTH HANGUL LETTER HIEUH FFC2..FFC7 ; L # Lo [6] HALFWIDTH HANGUL LETTER A..HALFWIDTH HANGUL LETTER E FFCA..FFCF ; L # Lo [6] HALFWIDTH HANGUL LETTER YEO..HALFWIDTH HANGUL LETTER OE FFD2..FFD7 ; L # Lo [6] HALFWIDTH HANGUL LETTER YO..HALFWIDTH HANGUL LETTER YU FFDA..FFDC ; L # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL LETTER I 10000..1000B ; L # Lo [12] LINEAR B SYLLABLE B008 A..LINEAR B SYLLABLE B046 JE 1000D..10026 ; L # Lo [26] LINEAR B SYLLABLE B036 JO..LINEAR B SYLLABLE B032 QO 10028..1003A ; L # Lo [19] LINEAR B SYLLABLE B060 RA..LINEAR B SYLLABLE B042 WO 1003C..1003D ; L # Lo [2] LINEAR B SYLLABLE B017 ZA..LINEAR B SYLLABLE B074 ZE 1003F..1004D ; L # Lo [15] LINEAR B 
SYLLABLE B020 ZO..LINEAR B SYLLABLE B091 TWO 10050..1005D ; L # Lo [14] LINEAR B SYMBOL B018..LINEAR B SYMBOL B089 10080..100FA ; L # Lo [123] LINEAR B IDEOGRAM B100 MAN..LINEAR B IDEOGRAM VESSEL B305 10100 ; L # Po AEGEAN WORD SEPARATOR LINE 10102 ; L # Po AEGEAN CHECK MARK 10107..10133 ; L # No [45] AEGEAN NUMBER ONE..AEGEAN NUMBER NINETY THOUSAND 10137..1013F ; L # So [9] AEGEAN WEIGHT BASE UNIT..AEGEAN MEASURE THIRD SUBUNIT 1018D..1018E ; L # So [2] GREEK INDICTION SIGN..NOMISMA SIGN 101D0..101FC ; L # So [45] PHAISTOS DISC SIGN PEDESTRIAN..PHAISTOS DISC SIGN WAVY BAND 10280..1029C ; L # Lo [29] LYCIAN LETTER A..LYCIAN LETTER X 102A0..102D0 ; L # Lo [49] CARIAN LETTER A..CARIAN LETTER UUU3 10300..1031F ; L # Lo [32] OLD ITALIC LETTER A..OLD ITALIC LETTER ESS 10320..10323 ; L # No [4] OLD ITALIC NUMERAL ONE..OLD ITALIC NUMERAL FIFTY 1032D..10340 ; L # Lo [20] OLD ITALIC LETTER YE..GOTHIC LETTER PAIRTHRA 10341 ; L # Nl GOTHIC LETTER NINETY 10342..10349 ; L # Lo [8] GOTHIC LETTER RAIDA..GOTHIC LETTER OTHAL 1034A ; L # Nl GOTHIC LETTER NINE HUNDRED 10350..10375 ; L # Lo [38] OLD PERMIC LETTER AN..OLD PERMIC LETTER IA 10380..1039D ; L # Lo [30] UGARITIC LETTER ALPA..UGARITIC LETTER SSU 1039F ; L # Po UGARITIC WORD DIVIDER 103A0..103C3 ; L # Lo [36] OLD PERSIAN SIGN A..OLD PERSIAN SIGN HA 103C8..103CF ; L # Lo [8] OLD PERSIAN SIGN AURAMAZDAA..OLD PERSIAN SIGN BUUMISH 103D0 ; L # Po OLD PERSIAN WORD DIVIDER 103D1..103D5 ; L # Nl [5] OLD PERSIAN NUMBER ONE..OLD PERSIAN NUMBER HUNDRED 10400..1044F ; L # L& [80] DESERET CAPITAL LETTER LONG I..DESERET SMALL LETTER EW 10450..1049D ; L # Lo [78] SHAVIAN LETTER PEEP..OSMANYA LETTER OO 104A0..104A9 ; L # Nd [10] OSMANYA DIGIT ZERO..OSMANYA DIGIT NINE 104B0..104D3 ; L # L& [36] OSAGE CAPITAL LETTER A..OSAGE CAPITAL LETTER ZHA 104D8..104FB ; L # L& [36] OSAGE SMALL LETTER A..OSAGE SMALL LETTER ZHA 10500..10527 ; L # Lo [40] ELBASAN LETTER A..ELBASAN LETTER KHE 10530..10563 ; L # Lo [52] CAUCASIAN ALBANIAN LETTER ALT..CAUCASIAN 
ALBANIAN LETTER KIW 1056F ; L # Po CAUCASIAN ALBANIAN CITATION MARK 10570..1057A ; L # L& [11] VITHKUQI CAPITAL LETTER A..VITHKUQI CAPITAL LETTER GA 1057C..1058A ; L # L& [15] VITHKUQI CAPITAL LETTER HA..VITHKUQI CAPITAL LETTER RE 1058C..10592 ; L # L& [7] VITHKUQI CAPITAL LETTER SE..VITHKUQI CAPITAL LETTER XE 10594..10595 ; L # L& [2] VITHKUQI CAPITAL LETTER Y..VITHKUQI CAPITAL LETTER ZE 10597..105A1 ; L # L& [11] VITHKUQI SMALL LETTER A..VITHKUQI SMALL LETTER GA 105A3..105B1 ; L # L& [15] VITHKUQI SMALL LETTER HA..VITHKUQI SMALL LETTER RE 105B3..105B9 ; L # L& [7] VITHKUQI SMALL LETTER SE..VITHKUQI SMALL LETTER XE 105BB..105BC ; L # L& [2] VITHKUQI SMALL LETTER Y..VITHKUQI SMALL LETTER ZE 105C0..105F3 ; L # Lo [52] TODHRI LETTER A..TODHRI LETTER OO 10600..10736 ; L # Lo [311] LINEAR A SIGN AB001..LINEAR A SIGN A664 10740..10755 ; L # Lo [22] LINEAR A SIGN A701 A..LINEAR A SIGN A732 JE 10760..10767 ; L # Lo [8] LINEAR A SIGN A800..LINEAR A SIGN A807 10780..10785 ; L # Lm [6] MODIFIER LETTER SMALL CAPITAL AA..MODIFIER LETTER SMALL B WITH HOOK 10787..107B0 ; L # Lm [42] MODIFIER LETTER SMALL DZ DIGRAPH..MODIFIER LETTER SMALL V WITH RIGHT HOOK 107B2..107BA ; L # Lm [9] MODIFIER LETTER SMALL CAPITAL Y..MODIFIER LETTER SMALL S WITH CURL 11000 ; L # Mc BRAHMI SIGN CANDRABINDU 11002 ; L # Mc BRAHMI SIGN VISARGA 11003..11037 ; L # Lo [53] BRAHMI SIGN JIHVAMULIYA..BRAHMI LETTER OLD TAMIL NNNA 11047..1104D ; L # Po [7] BRAHMI DANDA..BRAHMI PUNCTUATION LOTUS 11066..1106F ; L # Nd [10] BRAHMI DIGIT ZERO..BRAHMI DIGIT NINE 11071..11072 ; L # Lo [2] BRAHMI LETTER OLD TAMIL SHORT E..BRAHMI LETTER OLD TAMIL SHORT O 11075 ; L # Lo BRAHMI LETTER OLD TAMIL LLA 11082 ; L # Mc KAITHI SIGN VISARGA 11083..110AF ; L # Lo [45] KAITHI LETTER A..KAITHI LETTER HA 110B0..110B2 ; L # Mc [3] KAITHI VOWEL SIGN AA..KAITHI VOWEL SIGN II 110B7..110B8 ; L # Mc [2] KAITHI VOWEL SIGN O..KAITHI VOWEL SIGN AU 110BB..110BC ; L # Po [2] KAITHI ABBREVIATION SIGN..KAITHI ENUMERATION SIGN 110BD ; L # Cf 
KAITHI NUMBER SIGN 110BE..110C1 ; L # Po [4] KAITHI SECTION MARK..KAITHI DOUBLE DANDA 110CD ; L # Cf KAITHI NUMBER SIGN ABOVE 110D0..110E8 ; L # Lo [25] SORA SOMPENG LETTER SAH..SORA SOMPENG LETTER MAE 110F0..110F9 ; L # Nd [10] SORA SOMPENG DIGIT ZERO..SORA SOMPENG DIGIT NINE 11103..11126 ; L # Lo [36] CHAKMA LETTER AA..CHAKMA LETTER HAA 1112C ; L # Mc CHAKMA VOWEL SIGN E 11136..1113F ; L # Nd [10] CHAKMA DIGIT ZERO..CHAKMA DIGIT NINE 11140..11143 ; L # Po [4] CHAKMA SECTION MARK..CHAKMA QUESTION MARK 11144 ; L # Lo CHAKMA LETTER LHAA 11145..11146 ; L # Mc [2] CHAKMA VOWEL SIGN AA..CHAKMA VOWEL SIGN EI 11147 ; L # Lo CHAKMA LETTER VAA 11150..11172 ; L # Lo [35] MAHAJANI LETTER A..MAHAJANI LETTER RRA 11174..11175 ; L # Po [2] MAHAJANI ABBREVIATION SIGN..MAHAJANI SECTION MARK 11176 ; L # Lo MAHAJANI LIGATURE SHRI 11182 ; L # Mc SHARADA SIGN VISARGA 11183..111B2 ; L # Lo [48] SHARADA LETTER A..SHARADA LETTER HA 111B3..111B5 ; L # Mc [3] SHARADA VOWEL SIGN AA..SHARADA VOWEL SIGN II 111BF..111C0 ; L # Mc [2] SHARADA VOWEL SIGN AU..SHARADA SIGN VIRAMA 111C1..111C4 ; L # Lo [4] SHARADA SIGN AVAGRAHA..SHARADA OM 111C5..111C8 ; L # Po [4] SHARADA DANDA..SHARADA SEPARATOR 111CD ; L # Po SHARADA SUTRA MARK 111CE ; L # Mc SHARADA VOWEL SIGN PRISHTHAMATRA E 111D0..111D9 ; L # Nd [10] SHARADA DIGIT ZERO..SHARADA DIGIT NINE 111DA ; L # Lo SHARADA EKAM 111DB ; L # Po SHARADA SIGN SIDDHAM 111DC ; L # Lo SHARADA HEADSTROKE 111DD..111DF ; L # Po [3] SHARADA CONTINUATION SIGN..SHARADA SECTION MARK-2 111E1..111F4 ; L # No [20] SINHALA ARCHAIC DIGIT ONE..SINHALA ARCHAIC NUMBER ONE THOUSAND 11200..11211 ; L # Lo [18] KHOJKI LETTER A..KHOJKI LETTER JJA 11213..1122B ; L # Lo [25] KHOJKI LETTER NYA..KHOJKI LETTER LLA 1122C..1122E ; L # Mc [3] KHOJKI VOWEL SIGN AA..KHOJKI VOWEL SIGN II 11232..11233 ; L # Mc [2] KHOJKI VOWEL SIGN O..KHOJKI VOWEL SIGN AU 11235 ; L # Mc KHOJKI SIGN VIRAMA 11238..1123D ; L # Po [6] KHOJKI DANDA..KHOJKI ABBREVIATION SIGN 1123F..11240 ; L # Lo [2] KHOJKI LETTER 
QA..KHOJKI LETTER SHORT I 11280..11286 ; L # Lo [7] MULTANI LETTER A..MULTANI LETTER GA 11288 ; L # Lo MULTANI LETTER GHA 1128A..1128D ; L # Lo [4] MULTANI LETTER CA..MULTANI LETTER JJA 1128F..1129D ; L # Lo [15] MULTANI LETTER NYA..MULTANI LETTER BA 1129F..112A8 ; L # Lo [10] MULTANI LETTER BHA..MULTANI LETTER RHA 112A9 ; L # Po MULTANI SECTION MARK 112B0..112DE ; L # Lo [47] KHUDAWADI LETTER A..KHUDAWADI LETTER HA 112E0..112E2 ; L # Mc [3] KHUDAWADI VOWEL SIGN AA..KHUDAWADI VOWEL SIGN II 112F0..112F9 ; L # Nd [10] KHUDAWADI DIGIT ZERO..KHUDAWADI DIGIT NINE 11302..11303 ; L # Mc [2] GRANTHA SIGN ANUSVARA..GRANTHA SIGN VISARGA 11305..1130C ; L # Lo [8] GRANTHA LETTER A..GRANTHA LETTER VOCALIC L 1130F..11310 ; L # Lo [2] GRANTHA LETTER EE..GRANTHA LETTER AI 11313..11328 ; L # Lo [22] GRANTHA LETTER OO..GRANTHA LETTER NA 1132A..11330 ; L # Lo [7] GRANTHA LETTER PA..GRANTHA LETTER RA 11332..11333 ; L # Lo [2] GRANTHA LETTER LA..GRANTHA LETTER LLA 11335..11339 ; L # Lo [5] GRANTHA LETTER VA..GRANTHA LETTER HA 1133D ; L # Lo GRANTHA SIGN AVAGRAHA 1133E..1133F ; L # Mc [2] GRANTHA VOWEL SIGN AA..GRANTHA VOWEL SIGN I 11341..11344 ; L # Mc [4] GRANTHA VOWEL SIGN U..GRANTHA VOWEL SIGN VOCALIC RR 11347..11348 ; L # Mc [2] GRANTHA VOWEL SIGN EE..GRANTHA VOWEL SIGN AI 1134B..1134D ; L # Mc [3] GRANTHA VOWEL SIGN OO..GRANTHA SIGN VIRAMA 11350 ; L # Lo GRANTHA OM 11357 ; L # Mc GRANTHA AU LENGTH MARK 1135D..11361 ; L # Lo [5] GRANTHA SIGN PLUTA..GRANTHA LETTER VOCALIC LL 11362..11363 ; L # Mc [2] GRANTHA VOWEL SIGN VOCALIC L..GRANTHA VOWEL SIGN VOCALIC LL 11380..11389 ; L # Lo [10] TULU-TIGALARI LETTER A..TULU-TIGALARI LETTER VOCALIC LL 1138B ; L # Lo TULU-TIGALARI LETTER EE 1138E ; L # Lo TULU-TIGALARI LETTER AI 11390..113B5 ; L # Lo [38] TULU-TIGALARI LETTER OO..TULU-TIGALARI LETTER LLLA 113B7 ; L # Lo TULU-TIGALARI SIGN AVAGRAHA 113B8..113BA ; L # Mc [3] TULU-TIGALARI VOWEL SIGN AA..TULU-TIGALARI VOWEL SIGN II 113C2 ; L # Mc TULU-TIGALARI VOWEL SIGN EE 113C5 ; L # Mc 
TULU-TIGALARI VOWEL SIGN AI 113C7..113CA ; L # Mc [4] TULU-TIGALARI VOWEL SIGN OO..TULU-TIGALARI SIGN CANDRA ANUNASIKA 113CC..113CD ; L # Mc [2] TULU-TIGALARI SIGN ANUSVARA..TULU-TIGALARI SIGN VISARGA 113CF ; L # Mc TULU-TIGALARI SIGN LOOPED VIRAMA 113D1 ; L # Lo TULU-TIGALARI REPHA 113D3 ; L # Lo TULU-TIGALARI SIGN PLUTA 113D4..113D5 ; L # Po [2] TULU-TIGALARI DANDA..TULU-TIGALARI DOUBLE DANDA 113D7..113D8 ; L # Po [2] TULU-TIGALARI SIGN OM PUSHPIKA..TULU-TIGALARI SIGN SHRII PUSHPIKA 11400..11434 ; L # Lo [53] NEWA LETTER A..NEWA LETTER HA 11435..11437 ; L # Mc [3] NEWA VOWEL SIGN AA..NEWA VOWEL SIGN II 11440..11441 ; L # Mc [2] NEWA VOWEL SIGN O..NEWA VOWEL SIGN AU 11445 ; L # Mc NEWA SIGN VISARGA 11447..1144A ; L # Lo [4] NEWA SIGN AVAGRAHA..NEWA SIDDHI 1144B..1144F ; L # Po [5] NEWA DANDA..NEWA ABBREVIATION SIGN 11450..11459 ; L # Nd [10] NEWA DIGIT ZERO..NEWA DIGIT NINE 1145A..1145B ; L # Po [2] NEWA DOUBLE COMMA..NEWA PLACEHOLDER MARK 1145D ; L # Po NEWA INSERTION SIGN 1145F..11461 ; L # Lo [3] NEWA LETTER VEDIC ANUSVARA..NEWA SIGN UPADHMANIYA 11480..114AF ; L # Lo [48] TIRHUTA ANJI..TIRHUTA LETTER HA 114B0..114B2 ; L # Mc [3] TIRHUTA VOWEL SIGN AA..TIRHUTA VOWEL SIGN II 114B9 ; L # Mc TIRHUTA VOWEL SIGN E 114BB..114BE ; L # Mc [4] TIRHUTA VOWEL SIGN AI..TIRHUTA VOWEL SIGN AU 114C1 ; L # Mc TIRHUTA SIGN VISARGA 114C4..114C5 ; L # Lo [2] TIRHUTA SIGN AVAGRAHA..TIRHUTA GVANG 114C6 ; L # Po TIRHUTA ABBREVIATION SIGN 114C7 ; L # Lo TIRHUTA OM 114D0..114D9 ; L # Nd [10] TIRHUTA DIGIT ZERO..TIRHUTA DIGIT NINE 11580..115AE ; L # Lo [47] SIDDHAM LETTER A..SIDDHAM LETTER HA 115AF..115B1 ; L # Mc [3] SIDDHAM VOWEL SIGN AA..SIDDHAM VOWEL SIGN II 115B8..115BB ; L # Mc [4] SIDDHAM VOWEL SIGN E..SIDDHAM VOWEL SIGN AU 115BE ; L # Mc SIDDHAM SIGN VISARGA 115C1..115D7 ; L # Po [23] SIDDHAM SIGN SIDDHAM..SIDDHAM SECTION MARK WITH CIRCLES AND FOUR ENCLOSURES 115D8..115DB ; L # Lo [4] SIDDHAM LETTER THREE-CIRCLE ALTERNATE I..SIDDHAM LETTER ALTERNATE U 11600..1162F ; L # Lo [48] 
MODI LETTER A..MODI LETTER LLA 11630..11632 ; L # Mc [3] MODI VOWEL SIGN AA..MODI VOWEL SIGN II 1163B..1163C ; L # Mc [2] MODI VOWEL SIGN O..MODI VOWEL SIGN AU 1163E ; L # Mc MODI SIGN VISARGA 11641..11643 ; L # Po [3] MODI DANDA..MODI ABBREVIATION SIGN 11644 ; L # Lo MODI SIGN HUVA 11650..11659 ; L # Nd [10] MODI DIGIT ZERO..MODI DIGIT NINE 11680..116AA ; L # Lo [43] TAKRI LETTER A..TAKRI LETTER RRA 116AC ; L # Mc TAKRI SIGN VISARGA 116AE..116AF ; L # Mc [2] TAKRI VOWEL SIGN I..TAKRI VOWEL SIGN II 116B6 ; L # Mc TAKRI SIGN VIRAMA 116B8 ; L # Lo TAKRI LETTER ARCHAIC KHA 116B9 ; L # Po TAKRI ABBREVIATION SIGN 116C0..116C9 ; L # Nd [10] TAKRI DIGIT ZERO..TAKRI DIGIT NINE 116D0..116E3 ; L # Nd [20] MYANMAR PAO DIGIT ZERO..MYANMAR EASTERN PWO KAREN DIGIT NINE 11700..1171A ; L # Lo [27] AHOM LETTER KA..AHOM LETTER ALTERNATE BA 1171E ; L # Mc AHOM CONSONANT SIGN MEDIAL RA 11720..11721 ; L # Mc [2] AHOM VOWEL SIGN A..AHOM VOWEL SIGN AA 11726 ; L # Mc AHOM VOWEL SIGN E 11730..11739 ; L # Nd [10] AHOM DIGIT ZERO..AHOM DIGIT NINE 1173A..1173B ; L # No [2] AHOM NUMBER TEN..AHOM NUMBER TWENTY 1173C..1173E ; L # Po [3] AHOM SIGN SMALL SECTION..AHOM SIGN RULAI 1173F ; L # So AHOM SYMBOL VI 11740..11746 ; L # Lo [7] AHOM LETTER CA..AHOM LETTER LLA 11800..1182B ; L # Lo [44] DOGRA LETTER A..DOGRA LETTER RRA 1182C..1182E ; L # Mc [3] DOGRA VOWEL SIGN AA..DOGRA VOWEL SIGN II 11838 ; L # Mc DOGRA SIGN VISARGA 1183B ; L # Po DOGRA ABBREVIATION SIGN 118A0..118DF ; L # L& [64] WARANG CITI CAPITAL LETTER NGAA..WARANG CITI SMALL LETTER VIYO 118E0..118E9 ; L # Nd [10] WARANG CITI DIGIT ZERO..WARANG CITI DIGIT NINE 118EA..118F2 ; L # No [9] WARANG CITI NUMBER TEN..WARANG CITI NUMBER NINETY 118FF..11906 ; L # Lo [8] WARANG CITI OM..DIVES AKURU LETTER E 11909 ; L # Lo DIVES AKURU LETTER O 1190C..11913 ; L # Lo [8] DIVES AKURU LETTER KA..DIVES AKURU LETTER JA 11915..11916 ; L # Lo [2] DIVES AKURU LETTER NYA..DIVES AKURU LETTER TTA 11918..1192F ; L # Lo [24] DIVES AKURU LETTER DDA..DIVES AKURU 
LETTER ZA 11930..11935 ; L # Mc [6] DIVES AKURU VOWEL SIGN AA..DIVES AKURU VOWEL SIGN E 11937..11938 ; L # Mc [2] DIVES AKURU VOWEL SIGN AI..DIVES AKURU VOWEL SIGN O 1193D ; L # Mc DIVES AKURU SIGN HALANTA 1193F ; L # Lo DIVES AKURU PREFIXED NASAL SIGN 11940 ; L # Mc DIVES AKURU MEDIAL YA 11941 ; L # Lo DIVES AKURU INITIAL RA 11942 ; L # Mc DIVES AKURU MEDIAL RA 11944..11946 ; L # Po [3] DIVES AKURU DOUBLE DANDA..DIVES AKURU END OF TEXT MARK 11950..11959 ; L # Nd [10] DIVES AKURU DIGIT ZERO..DIVES AKURU DIGIT NINE 119A0..119A7 ; L # Lo [8] NANDINAGARI LETTER A..NANDINAGARI LETTER VOCALIC RR 119AA..119D0 ; L # Lo [39] NANDINAGARI LETTER E..NANDINAGARI LETTER RRA 119D1..119D3 ; L # Mc [3] NANDINAGARI VOWEL SIGN AA..NANDINAGARI VOWEL SIGN II 119DC..119DF ; L # Mc [4] NANDINAGARI VOWEL SIGN O..NANDINAGARI SIGN VISARGA 119E1 ; L # Lo NANDINAGARI SIGN AVAGRAHA 119E2 ; L # Po NANDINAGARI SIGN SIDDHAM 119E3 ; L # Lo NANDINAGARI HEADSTROKE 119E4 ; L # Mc NANDINAGARI VOWEL SIGN PRISHTHAMATRA E 11A00 ; L # Lo ZANABAZAR SQUARE LETTER A 11A07..11A08 ; L # Mn [2] ZANABAZAR SQUARE VOWEL SIGN AI..ZANABAZAR SQUARE VOWEL SIGN AU 11A0B..11A32 ; L # Lo [40] ZANABAZAR SQUARE LETTER KA..ZANABAZAR SQUARE LETTER KSSA 11A39 ; L # Mc ZANABAZAR SQUARE SIGN VISARGA 11A3A ; L # Lo ZANABAZAR SQUARE CLUSTER-INITIAL LETTER RA 11A3F..11A46 ; L # Po [8] ZANABAZAR SQUARE INITIAL HEAD MARK..ZANABAZAR SQUARE CLOSING DOUBLE-LINED HEAD MARK 11A50 ; L # Lo SOYOMBO LETTER A 11A57..11A58 ; L # Mc [2] SOYOMBO VOWEL SIGN AI..SOYOMBO VOWEL SIGN AU 11A5C..11A89 ; L # Lo [46] SOYOMBO LETTER KA..SOYOMBO CLUSTER-INITIAL LETTER SA 11A97 ; L # Mc SOYOMBO SIGN VISARGA 11A9A..11A9C ; L # Po [3] SOYOMBO MARK TSHEG..SOYOMBO MARK DOUBLE SHAD 11A9D ; L # Lo SOYOMBO MARK PLUTA 11A9E..11AA2 ; L # Po [5] SOYOMBO HEAD MARK WITH MOON AND SUN AND TRIPLE FLAME..SOYOMBO TERMINAL MARK-2 11AB0..11AF8 ; L # Lo [73] CANADIAN SYLLABICS NATTILIK HI..PAU CIN HAU GLOTTAL STOP FINAL 11B00..11B09 ; L # Po [10] DEVANAGARI HEAD 
MARK..DEVANAGARI SIGN MINDU 11B61 ; L # Mc SHARADA VOWEL SIGN OOE 11B65 ; L # Mc SHARADA VOWEL SIGN SHORT O 11B67 ; L # Mc SHARADA VOWEL SIGN CANDRA O 11BC0..11BE0 ; L # Lo [33] SUNUWAR LETTER DEVI..SUNUWAR LETTER KLOKO 11BE1 ; L # Po SUNUWAR SIGN PVO 11BF0..11BF9 ; L # Nd [10] SUNUWAR DIGIT ZERO..SUNUWAR DIGIT NINE 11C00..11C08 ; L # Lo [9] BHAIKSUKI LETTER A..BHAIKSUKI LETTER VOCALIC L 11C0A..11C2E ; L # Lo [37] BHAIKSUKI LETTER E..BHAIKSUKI LETTER HA 11C2F ; L # Mc BHAIKSUKI VOWEL SIGN AA 11C3E ; L # Mc BHAIKSUKI SIGN VISARGA 11C3F ; L # Mn BHAIKSUKI SIGN VIRAMA 11C40 ; L # Lo BHAIKSUKI SIGN AVAGRAHA 11C41..11C45 ; L # Po [5] BHAIKSUKI DANDA..BHAIKSUKI GAP FILLER-2 11C50..11C59 ; L # Nd [10] BHAIKSUKI DIGIT ZERO..BHAIKSUKI DIGIT NINE 11C5A..11C6C ; L # No [19] BHAIKSUKI NUMBER ONE..BHAIKSUKI HUNDREDS UNIT MARK 11C70..11C71 ; L # Po [2] MARCHEN HEAD MARK..MARCHEN MARK SHAD 11C72..11C8F ; L # Lo [30] MARCHEN LETTER KA..MARCHEN LETTER A 11CA9 ; L # Mc MARCHEN SUBJOINED LETTER YA 11CB1 ; L # Mc MARCHEN VOWEL SIGN I 11CB4 ; L # Mc MARCHEN VOWEL SIGN O 11D00..11D06 ; L # Lo [7] MASARAM GONDI LETTER A..MASARAM GONDI LETTER E 11D08..11D09 ; L # Lo [2] MASARAM GONDI LETTER AI..MASARAM GONDI LETTER O 11D0B..11D30 ; L # Lo [38] MASARAM GONDI LETTER AU..MASARAM GONDI LETTER TRA 11D46 ; L # Lo MASARAM GONDI REPHA 11D50..11D59 ; L # Nd [10] MASARAM GONDI DIGIT ZERO..MASARAM GONDI DIGIT NINE 11D60..11D65 ; L # Lo [6] GUNJALA GONDI LETTER A..GUNJALA GONDI LETTER UU 11D67..11D68 ; L # Lo [2] GUNJALA GONDI LETTER EE..GUNJALA GONDI LETTER AI 11D6A..11D89 ; L # Lo [32] GUNJALA GONDI LETTER OO..GUNJALA GONDI LETTER SA 11D8A..11D8E ; L # Mc [5] GUNJALA GONDI VOWEL SIGN AA..GUNJALA GONDI VOWEL SIGN UU 11D93..11D94 ; L # Mc [2] GUNJALA GONDI VOWEL SIGN OO..GUNJALA GONDI VOWEL SIGN AU 11D96 ; L # Mc GUNJALA GONDI SIGN VISARGA 11D98 ; L # Lo GUNJALA GONDI OM 11DA0..11DA9 ; L # Nd [10] GUNJALA GONDI DIGIT ZERO..GUNJALA GONDI DIGIT NINE 11DB0..11DD8 ; L # Lo [41] TOLONG SIKI LETTER 
I..TOLONG SIKI LETTER RRH 11DD9 ; L # Lm TOLONG SIKI SIGN SELA 11DDA..11DDB ; L # Lo [2] TOLONG SIKI SIGN HECAKA..TOLONG SIKI UNGGA 11DE0..11DE9 ; L # Nd [10] TOLONG SIKI DIGIT ZERO..TOLONG SIKI DIGIT NINE 11EE0..11EF2 ; L # Lo [19] MAKASAR LETTER KA..MAKASAR ANGKA 11EF5..11EF6 ; L # Mc [2] MAKASAR VOWEL SIGN E..MAKASAR VOWEL SIGN O 11EF7..11EF8 ; L # Po [2] MAKASAR PASSIMBANG..MAKASAR END OF SECTION 11F02 ; L # Lo KAWI SIGN REPHA 11F03 ; L # Mc KAWI SIGN VISARGA 11F04..11F10 ; L # Lo [13] KAWI LETTER A..KAWI LETTER O 11F12..11F33 ; L # Lo [34] KAWI LETTER KA..KAWI LETTER JNYA 11F34..11F35 ; L # Mc [2] KAWI VOWEL SIGN AA..KAWI VOWEL SIGN ALTERNATE AA 11F3E..11F3F ; L # Mc [2] KAWI VOWEL SIGN E..KAWI VOWEL SIGN AI 11F41 ; L # Mc KAWI SIGN KILLER 11F43..11F4F ; L # Po [13] KAWI DANDA..KAWI PUNCTUATION CLOSING SPIRAL 11F50..11F59 ; L # Nd [10] KAWI DIGIT ZERO..KAWI DIGIT NINE 11FB0 ; L # Lo LISU LETTER YHA 11FC0..11FD4 ; L # No [21] TAMIL FRACTION ONE THREE-HUNDRED-AND-TWENTIETH..TAMIL FRACTION DOWNSCALING FACTOR KIIZH 11FFF ; L # Po TAMIL PUNCTUATION END OF TEXT 12000..12399 ; L # Lo [922] CUNEIFORM SIGN A..CUNEIFORM SIGN U U 12400..1246E ; L # Nl [111] CUNEIFORM NUMERIC SIGN TWO ASH..CUNEIFORM NUMERIC SIGN NINE U VARIANT FORM 12470..12474 ; L # Po [5] CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER..CUNEIFORM PUNCTUATION SIGN DIAGONAL QUADCOLON 12480..12543 ; L # Lo [196] CUNEIFORM SIGN AB TIMES NUN TENU..CUNEIFORM SIGN ZU5 TIMES THREE DISH TENU 12F90..12FF0 ; L # Lo [97] CYPRO-MINOAN SIGN CM001..CYPRO-MINOAN SIGN CM114 12FF1..12FF2 ; L # Po [2] CYPRO-MINOAN SIGN CM301..CYPRO-MINOAN SIGN CM302 13000..1342F ; L # Lo [1072] EGYPTIAN HIEROGLYPH A001..EGYPTIAN HIEROGLYPH V011D 13430..1343F ; L # Cf [16] EGYPTIAN HIEROGLYPH VERTICAL JOINER..EGYPTIAN HIEROGLYPH END WALLED ENCLOSURE 13441..13446 ; L # Lo [6] EGYPTIAN HIEROGLYPH FULL BLANK..EGYPTIAN HIEROGLYPH WIDE LOST SIGN 13460..143FA ; L # Lo [3995] EGYPTIAN HIEROGLYPH-13460..EGYPTIAN HIEROGLYPH-143FA 14400..14646 
; L # Lo [583] ANATOLIAN HIEROGLYPH A001..ANATOLIAN HIEROGLYPH A530 16100..1611D ; L # Lo [30] GURUNG KHEMA LETTER A..GURUNG KHEMA LETTER SA 1612A..1612C ; L # Mc [3] GURUNG KHEMA CONSONANT SIGN MEDIAL YA..GURUNG KHEMA CONSONANT SIGN MEDIAL HA 16130..16139 ; L # Nd [10] GURUNG KHEMA DIGIT ZERO..GURUNG KHEMA DIGIT NINE 16800..16A38 ; L # Lo [569] BAMUM LETTER PHASE-A NGKUE MFON..BAMUM LETTER PHASE-F VUEQ 16A40..16A5E ; L # Lo [31] MRO LETTER TA..MRO LETTER TEK 16A60..16A69 ; L # Nd [10] MRO DIGIT ZERO..MRO DIGIT NINE 16A6E..16A6F ; L # Po [2] MRO DANDA..MRO DOUBLE DANDA 16A70..16ABE ; L # Lo [79] TANGSA LETTER OZ..TANGSA LETTER ZA 16AC0..16AC9 ; L # Nd [10] TANGSA DIGIT ZERO..TANGSA DIGIT NINE 16AD0..16AED ; L # Lo [30] BASSA VAH LETTER ENNI..BASSA VAH LETTER I 16AF5 ; L # Po BASSA VAH FULL STOP 16B00..16B2F ; L # Lo [48] PAHAWH HMONG VOWEL KEEB..PAHAWH HMONG CONSONANT CAU 16B37..16B3B ; L # Po [5] PAHAWH HMONG SIGN VOS THOM..PAHAWH HMONG SIGN VOS FEEM 16B3C..16B3F ; L # So [4] PAHAWH HMONG SIGN XYEEM NTXIV..PAHAWH HMONG SIGN XYEEM FAIB 16B40..16B43 ; L # Lm [4] PAHAWH HMONG SIGN VOS SEEV..PAHAWH HMONG SIGN IB YAM 16B44 ; L # Po PAHAWH HMONG SIGN XAUS 16B45 ; L # So PAHAWH HMONG SIGN CIM TSOV ROG 16B50..16B59 ; L # Nd [10] PAHAWH HMONG DIGIT ZERO..PAHAWH HMONG DIGIT NINE 16B5B..16B61 ; L # No [7] PAHAWH HMONG NUMBER TENS..PAHAWH HMONG NUMBER TRILLIONS 16B63..16B77 ; L # Lo [21] PAHAWH HMONG SIGN VOS LUB..PAHAWH HMONG SIGN CIM NRES TOS 16B7D..16B8F ; L # Lo [19] PAHAWH HMONG CLAN SIGN TSHEEJ..PAHAWH HMONG CLAN SIGN VWJ 16D40..16D42 ; L # Lm [3] KIRAT RAI SIGN ANUSVARA..KIRAT RAI SIGN VISARGA 16D43..16D6A ; L # Lo [40] KIRAT RAI LETTER A..KIRAT RAI VOWEL SIGN AU 16D6B..16D6C ; L # Lm [2] KIRAT RAI SIGN VIRAMA..KIRAT RAI SIGN SAAT 16D6D..16D6F ; L # Po [3] KIRAT RAI SIGN YUPI..KIRAT RAI DOUBLE DANDA 16D70..16D79 ; L # Nd [10] KIRAT RAI DIGIT ZERO..KIRAT RAI DIGIT NINE 16E40..16E7F ; L # L& [64] MEDEFAIDRIN CAPITAL LETTER M..MEDEFAIDRIN SMALL LETTER Y 16E80..16E96 ; L # 
No [23] MEDEFAIDRIN DIGIT ZERO..MEDEFAIDRIN DIGIT THREE ALTERNATE FORM 16E97..16E9A ; L # Po [4] MEDEFAIDRIN COMMA..MEDEFAIDRIN EXCLAMATION OH 16EA0..16EB8 ; L # L& [25] BERIA ERFE CAPITAL LETTER ARKAB..BERIA ERFE CAPITAL LETTER AY 16EBB..16ED3 ; L # L& [25] BERIA ERFE SMALL LETTER ARKAB..BERIA ERFE SMALL LETTER AY 16F00..16F4A ; L # Lo [75] MIAO LETTER PA..MIAO LETTER RTE 16F50 ; L # Lo MIAO LETTER NASALIZATION 16F51..16F87 ; L # Mc [55] MIAO SIGN ASPIRATION..MIAO VOWEL SIGN UI 16F93..16F9F ; L # Lm [13] MIAO LETTER TONE-2..MIAO LETTER REFORMED TONE-8 16FE0..16FE1 ; L # Lm [2] TANGUT ITERATION MARK..NUSHU ITERATION MARK 16FE3 ; L # Lm OLD CHINESE ITERATION MARK 16FF0..16FF1 ; L # Mc [2] VIETNAMESE ALTERNATE READING MARK CA..VIETNAMESE ALTERNATE READING MARK NHAY 16FF2..16FF3 ; L # Lm [2] CHINESE SMALL SIMPLIFIED ER..CHINESE SMALL TRADITIONAL ER 16FF4..16FF6 ; L # Nl [3] YANGQIN SIGN SLOW ONE BEAT..YANGQIN SIGN SLOW TWO BEATS 17000..18CD5 ; L # Lo [7382] TANGUT IDEOGRAPH-17000..KHITAN SMALL SCRIPT CHARACTER-18CD5 18CFF..18D1E ; L # Lo [32] KHITAN SMALL SCRIPT CHARACTER-18CFF..TANGUT IDEOGRAPH-18D1E 18D80..18DF2 ; L # Lo [115] TANGUT COMPONENT-769..TANGUT COMPONENT-883 1AFF0..1AFF3 ; L # Lm [4] KATAKANA LETTER MINNAN TONE-2..KATAKANA LETTER MINNAN TONE-5 1AFF5..1AFFB ; L # Lm [7] KATAKANA LETTER MINNAN TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-5 1AFFD..1AFFE ; L # Lm [2] KATAKANA LETTER MINNAN NASALIZED TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-8 1B000..1B122 ; L # Lo [291] KATAKANA LETTER ARCHAIC E..KATAKANA LETTER ARCHAIC WU 1B132 ; L # Lo HIRAGANA LETTER SMALL KO 1B150..1B152 ; L # Lo [3] HIRAGANA LETTER SMALL WI..HIRAGANA LETTER SMALL WO 1B155 ; L # Lo KATAKANA LETTER SMALL KO 1B164..1B167 ; L # Lo [4] KATAKANA LETTER SMALL WI..KATAKANA LETTER SMALL N 1B170..1B2FB ; L # Lo [396] NUSHU CHARACTER-1B170..NUSHU CHARACTER-1B2FB 1BC00..1BC6A ; L # Lo [107] DUPLOYAN LETTER H..DUPLOYAN LETTER VOCALIC M 1BC70..1BC7C ; L # Lo [13] DUPLOYAN AFFIX LEFT HORIZONTAL 
SECANT..DUPLOYAN AFFIX ATTACHED TANGENT HOOK 1BC80..1BC88 ; L # Lo [9] DUPLOYAN AFFIX HIGH ACUTE..DUPLOYAN AFFIX HIGH VERTICAL 1BC90..1BC99 ; L # Lo [10] DUPLOYAN AFFIX LOW ACUTE..DUPLOYAN AFFIX LOW ARROW 1BC9C ; L # So DUPLOYAN SIGN O WITH CROSS 1BC9F ; L # Po DUPLOYAN PUNCTUATION CHINOOK FULL STOP 1CCD6..1CCEF ; L # So [26] OUTLINED LATIN CAPITAL LETTER A..OUTLINED LATIN CAPITAL LETTER Z 1CF50..1CFC3 ; L # So [116] ZNAMENNY NEUME KRYUK..ZNAMENNY NEUME PAUK 1D000..1D0F5 ; L # So [246] BYZANTINE MUSICAL SYMBOL PSILI..BYZANTINE MUSICAL SYMBOL GORGON NEO KATO 1D100..1D126 ; L # So [39] MUSICAL SYMBOL SINGLE BARLINE..MUSICAL SYMBOL DRUM CLEF-2 1D129..1D164 ; L # So [60] MUSICAL SYMBOL MULTIPLE MEASURE REST..MUSICAL SYMBOL ONE HUNDRED TWENTY-EIGHTH NOTE 1D165..1D166 ; L # Mc [2] MUSICAL SYMBOL COMBINING STEM..MUSICAL SYMBOL COMBINING SPRECHGESANG STEM 1D16A..1D16C ; L # So [3] MUSICAL SYMBOL FINGERED TREMOLO-1..MUSICAL SYMBOL FINGERED TREMOLO-3 1D16D..1D172 ; L # Mc [6] MUSICAL SYMBOL COMBINING AUGMENTATION DOT..MUSICAL SYMBOL COMBINING FLAG-5 1D183..1D184 ; L # So [2] MUSICAL SYMBOL ARPEGGIATO UP..MUSICAL SYMBOL ARPEGGIATO DOWN 1D18C..1D1A9 ; L # So [30] MUSICAL SYMBOL RINFORZANDO..MUSICAL SYMBOL DEGREE SLASH 1D1AE..1D1E8 ; L # So [59] MUSICAL SYMBOL PEDAL MARK..MUSICAL SYMBOL KIEVAN FLAT SIGN 1D2C0..1D2D3 ; L # No [20] KAKTOVIK NUMERAL ZERO..KAKTOVIK NUMERAL NINETEEN 1D2E0..1D2F3 ; L # No [20] MAYAN NUMERAL ZERO..MAYAN NUMERAL NINETEEN 1D360..1D378 ; L # No [25] COUNTING ROD UNIT DIGIT ONE..TALLY MARK FIVE 1D400..1D454 ; L # L& [85] MATHEMATICAL BOLD CAPITAL A..MATHEMATICAL ITALIC SMALL G 1D456..1D49C ; L # L& [71] MATHEMATICAL ITALIC SMALL I..MATHEMATICAL SCRIPT CAPITAL A 1D49E..1D49F ; L # L& [2] MATHEMATICAL SCRIPT CAPITAL C..MATHEMATICAL SCRIPT CAPITAL D 1D4A2 ; L # L& MATHEMATICAL SCRIPT CAPITAL G 1D4A5..1D4A6 ; L # L& [2] MATHEMATICAL SCRIPT CAPITAL J..MATHEMATICAL SCRIPT CAPITAL K 1D4A9..1D4AC ; L # L& [4] MATHEMATICAL SCRIPT CAPITAL N..MATHEMATICAL SCRIPT 
CAPITAL Q 1D4AE..1D4B9 ; L # L& [12] MATHEMATICAL SCRIPT CAPITAL S..MATHEMATICAL SCRIPT SMALL D 1D4BB ; L # L& MATHEMATICAL SCRIPT SMALL F 1D4BD..1D4C3 ; L # L& [7] MATHEMATICAL SCRIPT SMALL H..MATHEMATICAL SCRIPT SMALL N 1D4C5..1D505 ; L # L& [65] MATHEMATICAL SCRIPT SMALL P..MATHEMATICAL FRAKTUR CAPITAL B 1D507..1D50A ; L # L& [4] MATHEMATICAL FRAKTUR CAPITAL D..MATHEMATICAL FRAKTUR CAPITAL G 1D50D..1D514 ; L # L& [8] MATHEMATICAL FRAKTUR CAPITAL J..MATHEMATICAL FRAKTUR CAPITAL Q 1D516..1D51C ; L # L& [7] MATHEMATICAL FRAKTUR CAPITAL S..MATHEMATICAL FRAKTUR CAPITAL Y 1D51E..1D539 ; L # L& [28] MATHEMATICAL FRAKTUR SMALL A..MATHEMATICAL DOUBLE-STRUCK CAPITAL B 1D53B..1D53E ; L # L& [4] MATHEMATICAL DOUBLE-STRUCK CAPITAL D..MATHEMATICAL DOUBLE-STRUCK CAPITAL G 1D540..1D544 ; L # L& [5] MATHEMATICAL DOUBLE-STRUCK CAPITAL I..MATHEMATICAL DOUBLE-STRUCK CAPITAL M 1D546 ; L # L& MATHEMATICAL DOUBLE-STRUCK CAPITAL O 1D54A..1D550 ; L # L& [7] MATHEMATICAL DOUBLE-STRUCK CAPITAL S..MATHEMATICAL DOUBLE-STRUCK CAPITAL Y 1D552..1D6A5 ; L # L& [340] MATHEMATICAL DOUBLE-STRUCK SMALL A..MATHEMATICAL ITALIC SMALL DOTLESS J 1D6A8..1D6C0 ; L # L& [25] MATHEMATICAL BOLD CAPITAL ALPHA..MATHEMATICAL BOLD CAPITAL OMEGA 1D6C2..1D6DA ; L # L& [25] MATHEMATICAL BOLD SMALL ALPHA..MATHEMATICAL BOLD SMALL OMEGA 1D6DC..1D6FA ; L # L& [31] MATHEMATICAL BOLD EPSILON SYMBOL..MATHEMATICAL ITALIC CAPITAL OMEGA 1D6FC..1D714 ; L # L& [25] MATHEMATICAL ITALIC SMALL ALPHA..MATHEMATICAL ITALIC SMALL OMEGA 1D716..1D734 ; L # L& [31] MATHEMATICAL ITALIC EPSILON SYMBOL..MATHEMATICAL BOLD ITALIC CAPITAL OMEGA 1D736..1D74E ; L # L& [25] MATHEMATICAL BOLD ITALIC SMALL ALPHA..MATHEMATICAL BOLD ITALIC SMALL OMEGA 1D750..1D76E ; L # L& [31] MATHEMATICAL BOLD ITALIC EPSILON SYMBOL..MATHEMATICAL SANS-SERIF BOLD CAPITAL OMEGA 1D770..1D788 ; L # L& [25] MATHEMATICAL SANS-SERIF BOLD SMALL ALPHA..MATHEMATICAL SANS-SERIF BOLD SMALL OMEGA 1D78A..1D7A8 ; L # L& [31] MATHEMATICAL SANS-SERIF BOLD EPSILON 
SYMBOL..MATHEMATICAL SANS-SERIF BOLD ITALIC CAPITAL OMEGA 1D7AA..1D7C2 ; L # L& [25] MATHEMATICAL SANS-SERIF BOLD ITALIC SMALL ALPHA..MATHEMATICAL SANS-SERIF BOLD ITALIC SMALL OMEGA 1D7C4..1D7CB ; L # L& [8] MATHEMATICAL SANS-SERIF BOLD ITALIC EPSILON SYMBOL..MATHEMATICAL BOLD SMALL DIGAMMA 1D800..1D9FF ; L # So [512] SIGNWRITING HAND-FIST INDEX..SIGNWRITING HEAD 1DA37..1DA3A ; L # So [4] SIGNWRITING AIR BLOW SMALL ROTATIONS..SIGNWRITING BREATH EXHALE 1DA6D..1DA74 ; L # So [8] SIGNWRITING SHOULDER HIP SPINE..SIGNWRITING TORSO-FLOORPLANE TWISTING 1DA76..1DA83 ; L # So [14] SIGNWRITING LIMB COMBINATION..SIGNWRITING LOCATION DEPTH 1DA85..1DA86 ; L # So [2] SIGNWRITING LOCATION TORSO..SIGNWRITING LOCATION LIMBS DIGITS 1DA87..1DA8B ; L # Po [5] SIGNWRITING COMMA..SIGNWRITING PARENTHESIS 1DF00..1DF09 ; L # L& [10] LATIN SMALL LETTER FENG DIGRAPH WITH TRILL..LATIN SMALL LETTER T WITH HOOK AND RETROFLEX HOOK 1DF0A ; L # Lo LATIN LETTER RETROFLEX CLICK WITH RETROFLEX HOOK 1DF0B..1DF1E ; L # L& [20] LATIN SMALL LETTER ESH WITH DOUBLE BAR..LATIN SMALL LETTER S WITH CURL 1DF25..1DF2A ; L # L& [6] LATIN SMALL LETTER D WITH MID-HEIGHT LEFT HOOK..LATIN SMALL LETTER T WITH MID-HEIGHT LEFT HOOK 1E030..1E06D ; L # Lm [62] MODIFIER LETTER CYRILLIC SMALL A..MODIFIER LETTER CYRILLIC SMALL STRAIGHT U WITH STROKE 1E100..1E12C ; L # Lo [45] NYIAKENG PUACHUE HMONG LETTER MA..NYIAKENG PUACHUE HMONG LETTER W 1E137..1E13D ; L # Lm [7] NYIAKENG PUACHUE HMONG SIGN FOR PERSON..NYIAKENG PUACHUE HMONG SYLLABLE LENGTHENER 1E140..1E149 ; L # Nd [10] NYIAKENG PUACHUE HMONG DIGIT ZERO..NYIAKENG PUACHUE HMONG DIGIT NINE 1E14E ; L # Lo NYIAKENG PUACHUE HMONG LOGOGRAM NYAJ 1E14F ; L # So NYIAKENG PUACHUE HMONG CIRCLED CA 1E290..1E2AD ; L # Lo [30] TOTO LETTER PA..TOTO LETTER A 1E2C0..1E2EB ; L # Lo [44] WANCHO LETTER AA..WANCHO LETTER YIH 1E2F0..1E2F9 ; L # Nd [10] WANCHO DIGIT ZERO..WANCHO DIGIT NINE 1E4D0..1E4EA ; L # Lo [27] NAG MUNDARI LETTER O..NAG MUNDARI LETTER ELL 1E4EB ; L # Lm NAG MUNDARI SIGN 
OJOD 1E4F0..1E4F9 ; L # Nd [10] NAG MUNDARI DIGIT ZERO..NAG MUNDARI DIGIT NINE 1E5D0..1E5ED ; L # Lo [30] OL ONAL LETTER O..OL ONAL LETTER EG 1E5F0 ; L # Lo OL ONAL SIGN HODDOND 1E5F1..1E5FA ; L # Nd [10] OL ONAL DIGIT ZERO..OL ONAL DIGIT NINE 1E5FF ; L # Po OL ONAL ABBREVIATION SIGN 1E6C0..1E6DE ; L # Lo [31] TAI YO LETTER LOW KO..TAI YO LETTER HIGH KVO 1E6E0..1E6E2 ; L # Lo [3] TAI YO LETTER AA..TAI YO LETTER UE 1E6E4..1E6E5 ; L # Lo [2] TAI YO LETTER U..TAI YO LETTER AE 1E6E7..1E6ED ; L # Lo [7] TAI YO LETTER O..TAI YO LETTER AUE 1E6F0..1E6F4 ; L # Lo [5] TAI YO LETTER AN..TAI YO LETTER AP 1E6FE ; L # Lo TAI YO SYMBOL MUEANG 1E6FF ; L # Lm TAI YO XAM LAI 1E7E0..1E7E6 ; L # Lo [7] ETHIOPIC SYLLABLE HHYA..ETHIOPIC SYLLABLE HHYO 1E7E8..1E7EB ; L # Lo [4] ETHIOPIC SYLLABLE GURAGE HHWA..ETHIOPIC SYLLABLE HHWE 1E7ED..1E7EE ; L # Lo [2] ETHIOPIC SYLLABLE GURAGE MWI..ETHIOPIC SYLLABLE GURAGE MWEE 1E7F0..1E7FE ; L # Lo [15] ETHIOPIC SYLLABLE GURAGE QWI..ETHIOPIC SYLLABLE GURAGE PWEE 1F110..1F12E ; L # So [31] PARENTHESIZED LATIN CAPITAL LETTER A..CIRCLED WZ 1F130..1F169 ; L # So [58] SQUARED LATIN CAPITAL LETTER A..NEGATIVE CIRCLED LATIN CAPITAL LETTER Z 1F170..1F1AC ; L # So [61] NEGATIVE SQUARED LATIN CAPITAL LETTER A..SQUARED VOD 1F1E6..1F202 ; L # So [29] REGIONAL INDICATOR SYMBOL LETTER A..SQUARED KATAKANA SA 1F210..1F23B ; L # So [44] SQUARED CJK UNIFIED IDEOGRAPH-624B..SQUARED CJK UNIFIED IDEOGRAPH-914D 1F240..1F248 ; L # So [9] TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-672C..TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-6557 1F250..1F251 ; L # So [2] CIRCLED IDEOGRAPH ADVANTAGE..CIRCLED IDEOGRAPH ACCEPT 20000..2A6DF ; L # Lo [42720] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6DF 2A700..2B81D ; L # Lo [4382] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B81D 2B820..2CEAD ; L # Lo [5774] CJK UNIFIED IDEOGRAPH-2B820..CJK UNIFIED IDEOGRAPH-2CEAD 2CEB0..2EBE0 ; L # Lo [7473] CJK UNIFIED IDEOGRAPH-2CEB0..CJK UNIFIED IDEOGRAPH-2EBE0 2EBF0..2EE5D ; 
L # Lo [622] CJK UNIFIED IDEOGRAPH-2EBF0..CJK UNIFIED IDEOGRAPH-2EE5D 2F800..2FA1D ; L # Lo [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D 30000..3134A ; L # Lo [4939] CJK UNIFIED IDEOGRAPH-30000..CJK UNIFIED IDEOGRAPH-3134A 31350..33479 ; L # Lo [8490] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-33479 F0000..FFFFD ; L # Co [65534] .. 100000..10FFFD; L # Co [65534] .. # The above property value applies to 810615 code points not listed here. # Total code points: 1095407 # ================================================ # Bidi_Class=Right_To_Left 05BE ; R # Pd HEBREW PUNCTUATION MAQAF 05C0 ; R # Po HEBREW PUNCTUATION PASEQ 05C3 ; R # Po HEBREW PUNCTUATION SOF PASUQ 05C6 ; R # Po HEBREW PUNCTUATION NUN HAFUKHA 05D0..05EA ; R # Lo [27] HEBREW LETTER ALEF..HEBREW LETTER TAV 05EF..05F2 ; R # Lo [4] HEBREW YOD TRIANGLE..HEBREW LIGATURE YIDDISH DOUBLE YOD 05F3..05F4 ; R # Po [2] HEBREW PUNCTUATION GERESH..HEBREW PUNCTUATION GERSHAYIM 07C0..07C9 ; R # Nd [10] NKO DIGIT ZERO..NKO DIGIT NINE 07CA..07EA ; R # Lo [33] NKO LETTER A..NKO LETTER JONA RA 07F4..07F5 ; R # Lm [2] NKO HIGH TONE APOSTROPHE..NKO LOW TONE APOSTROPHE 07FA ; R # Lm NKO LAJANYALAN 07FE..07FF ; R # Sc [2] NKO DOROME SIGN..NKO TAMAN SIGN 0800..0815 ; R # Lo [22] SAMARITAN LETTER ALAF..SAMARITAN LETTER TAAF 081A ; R # Lm SAMARITAN MODIFIER LETTER EPENTHETIC YUT 0824 ; R # Lm SAMARITAN MODIFIER LETTER SHORT A 0828 ; R # Lm SAMARITAN MODIFIER LETTER I 0830..083E ; R # Po [15] SAMARITAN PUNCTUATION NEQUDAA..SAMARITAN PUNCTUATION ANNAAU 0840..0858 ; R # Lo [25] MANDAIC LETTER HALQA..MANDAIC LETTER AIN 085E ; R # Po MANDAIC PUNCTUATION 200F ; R # Cf RIGHT-TO-LEFT MARK FB1D ; R # Lo HEBREW LETTER YOD WITH HIRIQ FB1F..FB28 ; R # Lo [10] HEBREW LIGATURE YIDDISH YOD YOD PATAH..HEBREW LETTER WIDE TAV FB2A..FB36 ; R # Lo [13] HEBREW LETTER SHIN WITH SHIN DOT..HEBREW LETTER ZAYIN WITH DAGESH FB38..FB3C ; R # Lo [5] HEBREW LETTER TET WITH DAGESH..HEBREW LETTER LAMED WITH DAGESH FB3E ; R 
# Lo HEBREW LETTER MEM WITH DAGESH FB40..FB41 ; R # Lo [2] HEBREW LETTER NUN WITH DAGESH..HEBREW LETTER SAMEKH WITH DAGESH FB43..FB44 ; R # Lo [2] HEBREW LETTER FINAL PE WITH DAGESH..HEBREW LETTER PE WITH DAGESH FB46..FB4F ; R # Lo [10] HEBREW LETTER TSADI WITH DAGESH..HEBREW LIGATURE ALEF LAMED 10800..10805 ; R # Lo [6] CYPRIOT SYLLABLE A..CYPRIOT SYLLABLE JA 10808 ; R # Lo CYPRIOT SYLLABLE JO 1080A..10835 ; R # Lo [44] CYPRIOT SYLLABLE KA..CYPRIOT SYLLABLE WO 10837..10838 ; R # Lo [2] CYPRIOT SYLLABLE XA..CYPRIOT SYLLABLE XE 1083C ; R # Lo CYPRIOT SYLLABLE ZA 1083F..10855 ; R # Lo [23] CYPRIOT SYLLABLE ZO..IMPERIAL ARAMAIC LETTER TAW 10857 ; R # Po IMPERIAL ARAMAIC SECTION SIGN 10858..1085F ; R # No [8] IMPERIAL ARAMAIC NUMBER ONE..IMPERIAL ARAMAIC NUMBER TEN THOUSAND 10860..10876 ; R # Lo [23] PALMYRENE LETTER ALEPH..PALMYRENE LETTER TAW 10877..10878 ; R # So [2] PALMYRENE LEFT-POINTING FLEURON..PALMYRENE RIGHT-POINTING FLEURON 10879..1087F ; R # No [7] PALMYRENE NUMBER ONE..PALMYRENE NUMBER TWENTY 10880..1089E ; R # Lo [31] NABATAEAN LETTER FINAL ALEPH..NABATAEAN LETTER TAW 108A7..108AF ; R # No [9] NABATAEAN NUMBER ONE..NABATAEAN NUMBER ONE HUNDRED 108E0..108F2 ; R # Lo [19] HATRAN LETTER ALEPH..HATRAN LETTER QOPH 108F4..108F5 ; R # Lo [2] HATRAN LETTER SHIN..HATRAN LETTER TAW 108FB..108FF ; R # No [5] HATRAN NUMBER ONE..HATRAN NUMBER ONE HUNDRED 10900..10915 ; R # Lo [22] PHOENICIAN LETTER ALF..PHOENICIAN LETTER TAU 10916..1091B ; R # No [6] PHOENICIAN NUMBER ONE..PHOENICIAN NUMBER THREE 10920..10939 ; R # Lo [26] LYDIAN LETTER A..LYDIAN LETTER C 1093F ; R # Po LYDIAN TRIANGULAR MARK 10940..10959 ; R # Lo [26] SIDETIC LETTER N01..SIDETIC LETTER N26 10980..109B7 ; R # Lo [56] MEROITIC HIEROGLYPHIC LETTER A..MEROITIC CURSIVE LETTER DA 109BC..109BD ; R # No [2] MEROITIC CURSIVE FRACTION ELEVEN TWELFTHS..MEROITIC CURSIVE FRACTION ONE HALF 109BE..109BF ; R # Lo [2] MEROITIC CURSIVE LOGOGRAM RMT..MEROITIC CURSIVE LOGOGRAM IMN 109C0..109CF ; R # No [16] MEROITIC 
CURSIVE NUMBER ONE..MEROITIC CURSIVE NUMBER SEVENTY 109D2..109FF ; R # No [46] MEROITIC CURSIVE NUMBER ONE HUNDRED..MEROITIC CURSIVE FRACTION TEN TWELFTHS 10A00 ; R # Lo KHAROSHTHI LETTER A 10A10..10A13 ; R # Lo [4] KHAROSHTHI LETTER KA..KHAROSHTHI LETTER GHA 10A15..10A17 ; R # Lo [3] KHAROSHTHI LETTER CA..KHAROSHTHI LETTER JA 10A19..10A35 ; R # Lo [29] KHAROSHTHI LETTER NYA..KHAROSHTHI LETTER VHA 10A40..10A48 ; R # No [9] KHAROSHTHI DIGIT ONE..KHAROSHTHI FRACTION ONE HALF 10A50..10A58 ; R # Po [9] KHAROSHTHI PUNCTUATION DOT..KHAROSHTHI PUNCTUATION LINES 10A60..10A7C ; R # Lo [29] OLD SOUTH ARABIAN LETTER HE..OLD SOUTH ARABIAN LETTER THETH 10A7D..10A7E ; R # No [2] OLD SOUTH ARABIAN NUMBER ONE..OLD SOUTH ARABIAN NUMBER FIFTY 10A7F ; R # Po OLD SOUTH ARABIAN NUMERIC INDICATOR 10A80..10A9C ; R # Lo [29] OLD NORTH ARABIAN LETTER HEH..OLD NORTH ARABIAN LETTER ZAH 10A9D..10A9F ; R # No [3] OLD NORTH ARABIAN NUMBER ONE..OLD NORTH ARABIAN NUMBER TWENTY 10AC0..10AC7 ; R # Lo [8] MANICHAEAN LETTER ALEPH..MANICHAEAN LETTER WAW 10AC8 ; R # So MANICHAEAN SIGN UD 10AC9..10AE4 ; R # Lo [28] MANICHAEAN LETTER ZAYIN..MANICHAEAN LETTER TAW 10AEB..10AEF ; R # No [5] MANICHAEAN NUMBER ONE..MANICHAEAN NUMBER ONE HUNDRED 10AF0..10AF6 ; R # Po [7] MANICHAEAN PUNCTUATION STAR..MANICHAEAN PUNCTUATION LINE FILLER 10B00..10B35 ; R # Lo [54] AVESTAN LETTER A..AVESTAN LETTER HE 10B40..10B55 ; R # Lo [22] INSCRIPTIONAL PARTHIAN LETTER ALEPH..INSCRIPTIONAL PARTHIAN LETTER TAW 10B58..10B5F ; R # No [8] INSCRIPTIONAL PARTHIAN NUMBER ONE..INSCRIPTIONAL PARTHIAN NUMBER ONE THOUSAND 10B60..10B72 ; R # Lo [19] INSCRIPTIONAL PAHLAVI LETTER ALEPH..INSCRIPTIONAL PAHLAVI LETTER TAW 10B78..10B7F ; R # No [8] INSCRIPTIONAL PAHLAVI NUMBER ONE..INSCRIPTIONAL PAHLAVI NUMBER ONE THOUSAND 10B80..10B91 ; R # Lo [18] PSALTER PAHLAVI LETTER ALEPH..PSALTER PAHLAVI LETTER TAW 10B99..10B9C ; R # Po [4] PSALTER PAHLAVI SECTION MARK..PSALTER PAHLAVI FOUR DOTS WITH DOT 10BA9..10BAF ; R # No [7] PSALTER PAHLAVI NUMBER 
ONE..PSALTER PAHLAVI NUMBER ONE HUNDRED 10C00..10C48 ; R # Lo [73] OLD TURKIC LETTER ORKHON A..OLD TURKIC LETTER ORKHON BASH 10C80..10CB2 ; R # L& [51] OLD HUNGARIAN CAPITAL LETTER A..OLD HUNGARIAN CAPITAL LETTER US 10CC0..10CF2 ; R # L& [51] OLD HUNGARIAN SMALL LETTER A..OLD HUNGARIAN SMALL LETTER US 10CFA..10CFF ; R # No [6] OLD HUNGARIAN NUMBER ONE..OLD HUNGARIAN NUMBER ONE THOUSAND 10D4A..10D4D ; R # Lo [4] GARAY VOWEL SIGN A..GARAY VOWEL SIGN EE 10D4E ; R # Lm GARAY VOWEL LENGTH MARK 10D4F ; R # Lo GARAY SUKUN 10D50..10D65 ; R # L& [22] GARAY CAPITAL LETTER A..GARAY CAPITAL LETTER OLD NA 10D6F ; R # Lm GARAY REDUPLICATION MARK 10D70..10D85 ; R # L& [22] GARAY SMALL LETTER A..GARAY SMALL LETTER OLD NA 10D8E..10D8F ; R # Sm [2] GARAY PLUS SIGN..GARAY MINUS SIGN 10E80..10EA9 ; R # Lo [42] YEZIDI LETTER ELIF..YEZIDI LETTER ET 10EAD ; R # Pd YEZIDI HYPHENATION MARK 10EB0..10EB1 ; R # Lo [2] YEZIDI LETTER LAM WITH DOT ABOVE..YEZIDI LETTER YOT WITH CIRCUMFLEX ABOVE 10F00..10F1C ; R # Lo [29] OLD SOGDIAN LETTER ALEPH..OLD SOGDIAN LETTER FINAL TAW WITH VERTICAL TAIL 10F1D..10F26 ; R # No [10] OLD SOGDIAN NUMBER ONE..OLD SOGDIAN FRACTION ONE HALF 10F27 ; R # Lo OLD SOGDIAN LIGATURE AYIN-DALETH 10F70..10F81 ; R # Lo [18] OLD UYGHUR LETTER ALEPH..OLD UYGHUR LETTER LESH 10F86..10F89 ; R # Po [4] OLD UYGHUR PUNCTUATION BAR..OLD UYGHUR PUNCTUATION FOUR DOTS 10FB0..10FC4 ; R # Lo [21] CHORASMIAN LETTER ALEPH..CHORASMIAN LETTER TAW 10FC5..10FCB ; R # No [7] CHORASMIAN NUMBER ONE..CHORASMIAN NUMBER ONE HUNDRED 10FE0..10FF6 ; R # Lo [23] ELYMAIC LETTER ALEPH..ELYMAIC LIGATURE ZAYIN-YODH 1E800..1E8C4 ; R # Lo [197] MENDE KIKAKUI SYLLABLE M001 KI..MENDE KIKAKUI SYLLABLE M060 NYON 1E8C7..1E8CF ; R # No [9] MENDE KIKAKUI DIGIT ONE..MENDE KIKAKUI DIGIT NINE 1E900..1E943 ; R # L& [68] ADLAM CAPITAL LETTER ALIF..ADLAM SMALL LETTER SHA 1E94B ; R # Lm ADLAM NASALIZATION MARK 1E950..1E959 ; R # Nd [10] ADLAM DIGIT ZERO..ADLAM DIGIT NINE 1E95E..1E95F ; R # Po [2] ADLAM INITIAL EXCLAMATION 
MARK..ADLAM INITIAL QUESTION MARK # The above property value applies to 2061 code points not listed here. # Total code points: 3631 # ================================================ # Bidi_Class=European_Number 0030..0039 ; EN # Nd [10] DIGIT ZERO..DIGIT NINE 00B2..00B3 ; EN # No [2] SUPERSCRIPT TWO..SUPERSCRIPT THREE 00B9 ; EN # No SUPERSCRIPT ONE 06F0..06F9 ; EN # Nd [10] EXTENDED ARABIC-INDIC DIGIT ZERO..EXTENDED ARABIC-INDIC DIGIT NINE 2070 ; EN # No SUPERSCRIPT ZERO 2074..2079 ; EN # No [6] SUPERSCRIPT FOUR..SUPERSCRIPT NINE 2080..2089 ; EN # No [10] SUBSCRIPT ZERO..SUBSCRIPT NINE 2488..249B ; EN # No [20] DIGIT ONE FULL STOP..NUMBER TWENTY FULL STOP FF10..FF19 ; EN # Nd [10] FULLWIDTH DIGIT ZERO..FULLWIDTH DIGIT NINE 102E1..102FB ; EN # No [27] COPTIC EPACT DIGIT ONE..COPTIC EPACT NUMBER NINE HUNDRED 1CCF0..1CCF9 ; EN # Nd [10] OUTLINED DIGIT ZERO..OUTLINED DIGIT NINE 1D7CE..1D7FF ; EN # Nd [50] MATHEMATICAL BOLD DIGIT ZERO..MATHEMATICAL MONOSPACE DIGIT NINE 1F100..1F10A ; EN # No [11] DIGIT ZERO FULL STOP..DIGIT NINE COMMA 1FBF0..1FBF9 ; EN # Nd [10] SEGMENTED DIGIT ZERO..SEGMENTED DIGIT NINE # Total code points: 178 # ================================================ # Bidi_Class=European_Separator 002B ; ES # Sm PLUS SIGN 002D ; ES # Pd HYPHEN-MINUS 207A..207B ; ES # Sm [2] SUPERSCRIPT PLUS SIGN..SUPERSCRIPT MINUS 208A..208B ; ES # Sm [2] SUBSCRIPT PLUS SIGN..SUBSCRIPT MINUS 2212 ; ES # Sm MINUS SIGN FB29 ; ES # Sm HEBREW LETTER ALTERNATIVE PLUS SIGN FE62 ; ES # Sm SMALL PLUS SIGN FE63 ; ES # Pd SMALL HYPHEN-MINUS FF0B ; ES # Sm FULLWIDTH PLUS SIGN FF0D ; ES # Pd FULLWIDTH HYPHEN-MINUS # Total code points: 12 # ================================================ # Bidi_Class=European_Terminator 0023 ; ET # Po NUMBER SIGN 0024 ; ET # Sc DOLLAR SIGN 0025 ; ET # Po PERCENT SIGN 00A2..00A5 ; ET # Sc [4] CENT SIGN..YEN SIGN 00B0 ; ET # So DEGREE SIGN 00B1 ; ET # Sm PLUS-MINUS SIGN 058F ; ET # Sc ARMENIAN DRAM SIGN 0609..060A ; ET # Po [2] ARABIC-INDIC PER MILLE 
SIGN..ARABIC-INDIC PER TEN THOUSAND SIGN 066A ; ET # Po ARABIC PERCENT SIGN 09F2..09F3 ; ET # Sc [2] BENGALI RUPEE MARK..BENGALI RUPEE SIGN 09FB ; ET # Sc BENGALI GANDA MARK 0AF1 ; ET # Sc GUJARATI RUPEE SIGN 0BF9 ; ET # Sc TAMIL RUPEE SIGN 0E3F ; ET # Sc THAI CURRENCY SYMBOL BAHT 17DB ; ET # Sc KHMER CURRENCY SYMBOL RIEL 2030..2034 ; ET # Po [5] PER MILLE SIGN..TRIPLE PRIME 20A0..20C1 ; ET # Sc [34] EURO-CURRENCY SIGN..SAUDI RIYAL SIGN 212E ; ET # So ESTIMATED SYMBOL 2213 ; ET # Sm MINUS-OR-PLUS SIGN A838 ; ET # Sc NORTH INDIC RUPEE MARK A839 ; ET # So NORTH INDIC QUANTITY MARK FE5F ; ET # Po SMALL NUMBER SIGN FE69 ; ET # Sc SMALL DOLLAR SIGN FE6A ; ET # Po SMALL PERCENT SIGN FF03 ; ET # Po FULLWIDTH NUMBER SIGN FF04 ; ET # Sc FULLWIDTH DOLLAR SIGN FF05 ; ET # Po FULLWIDTH PERCENT SIGN FFE0..FFE1 ; ET # Sc [2] FULLWIDTH CENT SIGN..FULLWIDTH POUND SIGN FFE5..FFE6 ; ET # Sc [2] FULLWIDTH YEN SIGN..FULLWIDTH WON SIGN 11FDD..11FE0 ; ET # Sc [4] TAMIL SIGN KAACU..TAMIL SIGN VARAAKAN 1E2FF ; ET # Sc WANCHO NGUN SIGN # The above property value applies to 14 code points not listed here. 
# Total code points: 92 # ================================================ # Bidi_Class=Arabic_Number 0600..0605 ; AN # Cf [6] ARABIC NUMBER SIGN..ARABIC NUMBER MARK ABOVE 0660..0669 ; AN # Nd [10] ARABIC-INDIC DIGIT ZERO..ARABIC-INDIC DIGIT NINE 066B..066C ; AN # Po [2] ARABIC DECIMAL SEPARATOR..ARABIC THOUSANDS SEPARATOR 06DD ; AN # Cf ARABIC END OF AYAH 0890..0891 ; AN # Cf [2] ARABIC POUND MARK ABOVE..ARABIC PIASTRE MARK ABOVE 08E2 ; AN # Cf ARABIC DISPUTED END OF AYAH 10D30..10D39 ; AN # Nd [10] HANIFI ROHINGYA DIGIT ZERO..HANIFI ROHINGYA DIGIT NINE 10D40..10D49 ; AN # Nd [10] GARAY DIGIT ZERO..GARAY DIGIT NINE 10E60..10E7E ; AN # No [31] RUMI DIGIT ONE..RUMI FRACTION TWO THIRDS # Total code points: 73 # ================================================ # Bidi_Class=Common_Separator 002C ; CS # Po COMMA 002E..002F ; CS # Po [2] FULL STOP..SOLIDUS 003A ; CS # Po COLON 00A0 ; CS # Zs NO-BREAK SPACE 060C ; CS # Po ARABIC COMMA 202F ; CS # Zs NARROW NO-BREAK SPACE 2044 ; CS # Sm FRACTION SLASH FE50 ; CS # Po SMALL COMMA FE52 ; CS # Po SMALL FULL STOP FE55 ; CS # Po SMALL COLON FF0C ; CS # Po FULLWIDTH COMMA FF0E..FF0F ; CS # Po [2] FULLWIDTH FULL STOP..FULLWIDTH SOLIDUS FF1A ; CS # Po FULLWIDTH COLON # Total code points: 15 # ================================================ # Bidi_Class=Paragraph_Separator 000A ; B # Cc 000D ; B # Cc 001C..001E ; B # Cc [3] .. 
0085 ; B # Cc 2029 ; B # Zp PARAGRAPH SEPARATOR # Total code points: 7 # ================================================ # Bidi_Class=Segment_Separator 0009 ; S # Cc 000B ; S # Cc 001F ; S # Cc # Total code points: 3 # ================================================ # Bidi_Class=White_Space 000C ; WS # Cc 0020 ; WS # Zs SPACE 1680 ; WS # Zs OGHAM SPACE MARK 2000..200A ; WS # Zs [11] EN QUAD..HAIR SPACE 2028 ; WS # Zl LINE SEPARATOR 205F ; WS # Zs MEDIUM MATHEMATICAL SPACE 3000 ; WS # Zs IDEOGRAPHIC SPACE # Total code points: 17 # ================================================ # Bidi_Class=Other_Neutral 0021..0022 ; ON # Po [2] EXCLAMATION MARK..QUOTATION MARK 0026..0027 ; ON # Po [2] AMPERSAND..APOSTROPHE 0028 ; ON # Ps LEFT PARENTHESIS 0029 ; ON # Pe RIGHT PARENTHESIS 002A ; ON # Po ASTERISK 003B ; ON # Po SEMICOLON 003C..003E ; ON # Sm [3] LESS-THAN SIGN..GREATER-THAN SIGN 003F..0040 ; ON # Po [2] QUESTION MARK..COMMERCIAL AT 005B ; ON # Ps LEFT SQUARE BRACKET 005C ; ON # Po REVERSE SOLIDUS 005D ; ON # Pe RIGHT SQUARE BRACKET 005E ; ON # Sk CIRCUMFLEX ACCENT 005F ; ON # Pc LOW LINE 0060 ; ON # Sk GRAVE ACCENT 007B ; ON # Ps LEFT CURLY BRACKET 007C ; ON # Sm VERTICAL LINE 007D ; ON # Pe RIGHT CURLY BRACKET 007E ; ON # Sm TILDE 00A1 ; ON # Po INVERTED EXCLAMATION MARK 00A6 ; ON # So BROKEN BAR 00A7 ; ON # Po SECTION SIGN 00A8 ; ON # Sk DIAERESIS 00A9 ; ON # So COPYRIGHT SIGN 00AB ; ON # Pi LEFT-POINTING DOUBLE ANGLE QUOTATION MARK 00AC ; ON # Sm NOT SIGN 00AE ; ON # So REGISTERED SIGN 00AF ; ON # Sk MACRON 00B4 ; ON # Sk ACUTE ACCENT 00B6..00B7 ; ON # Po [2] PILCROW SIGN..MIDDLE DOT 00B8 ; ON # Sk CEDILLA 00BB ; ON # Pf RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK 00BC..00BE ; ON # No [3] VULGAR FRACTION ONE QUARTER..VULGAR FRACTION THREE QUARTERS 00BF ; ON # Po INVERTED QUESTION MARK 00D7 ; ON # Sm MULTIPLICATION SIGN 00F7 ; ON # Sm DIVISION SIGN 02B9..02BA ; ON # Lm [2] MODIFIER LETTER PRIME..MODIFIER LETTER DOUBLE PRIME 02C2..02C5 ; ON # Sk [4] MODIFIER LETTER 
LEFT ARROWHEAD..MODIFIER LETTER DOWN ARROWHEAD 02C6..02CF ; ON # Lm [10] MODIFIER LETTER CIRCUMFLEX ACCENT..MODIFIER LETTER LOW ACUTE ACCENT 02D2..02DF ; ON # Sk [14] MODIFIER LETTER CENTRED RIGHT HALF RING..MODIFIER LETTER CROSS ACCENT 02E5..02EB ; ON # Sk [7] MODIFIER LETTER EXTRA-HIGH TONE BAR..MODIFIER LETTER YANG DEPARTING TONE MARK 02EC ; ON # Lm MODIFIER LETTER VOICING 02ED ; ON # Sk MODIFIER LETTER UNASPIRATED 02EF..02FF ; ON # Sk [17] MODIFIER LETTER LOW DOWN ARROWHEAD..MODIFIER LETTER LOW LEFT ARROW 0374 ; ON # Lm GREEK NUMERAL SIGN 0375 ; ON # Sk GREEK LOWER NUMERAL SIGN 037E ; ON # Po GREEK QUESTION MARK 0384..0385 ; ON # Sk [2] GREEK TONOS..GREEK DIALYTIKA TONOS 0387 ; ON # Po GREEK ANO TELEIA 03F6 ; ON # Sm GREEK REVERSED LUNATE EPSILON SYMBOL 058A ; ON # Pd ARMENIAN HYPHEN 058D..058E ; ON # So [2] RIGHT-FACING ARMENIAN ETERNITY SIGN..LEFT-FACING ARMENIAN ETERNITY SIGN 0606..0607 ; ON # Sm [2] ARABIC-INDIC CUBE ROOT..ARABIC-INDIC FOURTH ROOT 060E..060F ; ON # So [2] ARABIC POETIC VERSE SIGN..ARABIC SIGN MISRA 06DE ; ON # So ARABIC START OF RUB EL HIZB 06E9 ; ON # So ARABIC PLACE OF SAJDAH 07F6 ; ON # So NKO SYMBOL OO DENNEN 07F7..07F9 ; ON # Po [3] NKO SYMBOL GBAKURUNEN..NKO EXCLAMATION MARK 0BF3..0BF8 ; ON # So [6] TAMIL DAY SIGN..TAMIL AS ABOVE SIGN 0BFA ; ON # So TAMIL NUMBER SIGN 0C78..0C7E ; ON # No [7] TELUGU FRACTION DIGIT ZERO FOR ODD POWERS OF FOUR..TELUGU FRACTION DIGIT THREE FOR EVEN POWERS OF FOUR 0F3A ; ON # Ps TIBETAN MARK GUG RTAGS GYON 0F3B ; ON # Pe TIBETAN MARK GUG RTAGS GYAS 0F3C ; ON # Ps TIBETAN MARK ANG KHANG GYON 0F3D ; ON # Pe TIBETAN MARK ANG KHANG GYAS 1390..1399 ; ON # So [10] ETHIOPIC TONAL MARK YIZET..ETHIOPIC TONAL MARK KURT 1400 ; ON # Pd CANADIAN SYLLABICS HYPHEN 169B ; ON # Ps OGHAM FEATHER MARK 169C ; ON # Pe OGHAM REVERSED FEATHER MARK 17F0..17F9 ; ON # No [10] KHMER SYMBOL LEK ATTAK SON..KHMER SYMBOL LEK ATTAK PRAM-BUON 1800..1805 ; ON # Po [6] MONGOLIAN BIRGA..MONGOLIAN FOUR DOTS 1806 ; ON # Pd MONGOLIAN TODO SOFT 
HYPHEN 1807..180A ; ON # Po [4] MONGOLIAN SIBE SYLLABLE BOUNDARY MARKER..MONGOLIAN NIRUGU 1940 ; ON # So LIMBU SIGN LOO 1944..1945 ; ON # Po [2] LIMBU EXCLAMATION MARK..LIMBU QUESTION MARK 19DE..19FF ; ON # So [34] NEW TAI LUE SIGN LAE..KHMER SYMBOL DAP-PRAM ROC 1FBD ; ON # Sk GREEK KORONIS 1FBF..1FC1 ; ON # Sk [3] GREEK PSILI..GREEK DIALYTIKA AND PERISPOMENI 1FCD..1FCF ; ON # Sk [3] GREEK PSILI AND VARIA..GREEK PSILI AND PERISPOMENI 1FDD..1FDF ; ON # Sk [3] GREEK DASIA AND VARIA..GREEK DASIA AND PERISPOMENI 1FED..1FEF ; ON # Sk [3] GREEK DIALYTIKA AND VARIA..GREEK VARIA 1FFD..1FFE ; ON # Sk [2] GREEK OXIA..GREEK DASIA 2010..2015 ; ON # Pd [6] HYPHEN..HORIZONTAL BAR 2016..2017 ; ON # Po [2] DOUBLE VERTICAL LINE..DOUBLE LOW LINE 2018 ; ON # Pi LEFT SINGLE QUOTATION MARK 2019 ; ON # Pf RIGHT SINGLE QUOTATION MARK 201A ; ON # Ps SINGLE LOW-9 QUOTATION MARK 201B..201C ; ON # Pi [2] SINGLE HIGH-REVERSED-9 QUOTATION MARK..LEFT DOUBLE QUOTATION MARK 201D ; ON # Pf RIGHT DOUBLE QUOTATION MARK 201E ; ON # Ps DOUBLE LOW-9 QUOTATION MARK 201F ; ON # Pi DOUBLE HIGH-REVERSED-9 QUOTATION MARK 2020..2027 ; ON # Po [8] DAGGER..HYPHENATION POINT 2035..2038 ; ON # Po [4] REVERSED PRIME..CARET 2039 ; ON # Pi SINGLE LEFT-POINTING ANGLE QUOTATION MARK 203A ; ON # Pf SINGLE RIGHT-POINTING ANGLE QUOTATION MARK 203B..203E ; ON # Po [4] REFERENCE MARK..OVERLINE 203F..2040 ; ON # Pc [2] UNDERTIE..CHARACTER TIE 2041..2043 ; ON # Po [3] CARET INSERTION POINT..HYPHEN BULLET 2045 ; ON # Ps LEFT SQUARE BRACKET WITH QUILL 2046 ; ON # Pe RIGHT SQUARE BRACKET WITH QUILL 2047..2051 ; ON # Po [11] DOUBLE QUESTION MARK..TWO ASTERISKS ALIGNED VERTICALLY 2052 ; ON # Sm COMMERCIAL MINUS SIGN 2053 ; ON # Po SWUNG DASH 2054 ; ON # Pc INVERTED UNDERTIE 2055..205E ; ON # Po [10] FLOWER PUNCTUATION MARK..VERTICAL FOUR DOTS 207C ; ON # Sm SUPERSCRIPT EQUALS SIGN 207D ; ON # Ps SUPERSCRIPT LEFT PARENTHESIS 207E ; ON # Pe SUPERSCRIPT RIGHT PARENTHESIS 208C ; ON # Sm SUBSCRIPT EQUALS SIGN 208D ; ON # Ps SUBSCRIPT 
LEFT PARENTHESIS 208E ; ON # Pe SUBSCRIPT RIGHT PARENTHESIS 2100..2101 ; ON # So [2] ACCOUNT OF..ADDRESSED TO THE SUBJECT 2103..2106 ; ON # So [4] DEGREE CELSIUS..CADA UNA 2108..2109 ; ON # So [2] SCRUPLE..DEGREE FAHRENHEIT 2114 ; ON # So L B BAR SYMBOL 2116..2117 ; ON # So [2] NUMERO SIGN..SOUND RECORDING COPYRIGHT 2118 ; ON # Sm SCRIPT CAPITAL P 211E..2123 ; ON # So [6] PRESCRIPTION TAKE..VERSICLE 2125 ; ON # So OUNCE SIGN 2127 ; ON # So INVERTED OHM SIGN 2129 ; ON # So TURNED GREEK SMALL LETTER IOTA 213A..213B ; ON # So [2] ROTATED CAPITAL Q..FACSIMILE SIGN 2140..2144 ; ON # Sm [5] DOUBLE-STRUCK N-ARY SUMMATION..TURNED SANS-SERIF CAPITAL Y 214A ; ON # So PROPERTY LINE 214B ; ON # Sm TURNED AMPERSAND 214C..214D ; ON # So [2] PER SIGN..AKTIESELSKAB 2150..215F ; ON # No [16] VULGAR FRACTION ONE SEVENTH..FRACTION NUMERATOR ONE 2189 ; ON # No VULGAR FRACTION ZERO THIRDS 218A..218B ; ON # So [2] TURNED DIGIT TWO..TURNED DIGIT THREE 2190..2194 ; ON # Sm [5] LEFTWARDS ARROW..LEFT RIGHT ARROW 2195..2199 ; ON # So [5] UP DOWN ARROW..SOUTH WEST ARROW 219A..219B ; ON # Sm [2] LEFTWARDS ARROW WITH STROKE..RIGHTWARDS ARROW WITH STROKE 219C..219F ; ON # So [4] LEFTWARDS WAVE ARROW..UPWARDS TWO HEADED ARROW 21A0 ; ON # Sm RIGHTWARDS TWO HEADED ARROW 21A1..21A2 ; ON # So [2] DOWNWARDS TWO HEADED ARROW..LEFTWARDS ARROW WITH TAIL 21A3 ; ON # Sm RIGHTWARDS ARROW WITH TAIL 21A4..21A5 ; ON # So [2] LEFTWARDS ARROW FROM BAR..UPWARDS ARROW FROM BAR 21A6 ; ON # Sm RIGHTWARDS ARROW FROM BAR 21A7..21AD ; ON # So [7] DOWNWARDS ARROW FROM BAR..LEFT RIGHT WAVE ARROW 21AE ; ON # Sm LEFT RIGHT ARROW WITH STROKE 21AF..21CD ; ON # So [31] DOWNWARDS ZIGZAG ARROW..LEFTWARDS DOUBLE ARROW WITH STROKE 21CE..21CF ; ON # Sm [2] LEFT RIGHT DOUBLE ARROW WITH STROKE..RIGHTWARDS DOUBLE ARROW WITH STROKE 21D0..21D1 ; ON # So [2] LEFTWARDS DOUBLE ARROW..UPWARDS DOUBLE ARROW 21D2 ; ON # Sm RIGHTWARDS DOUBLE ARROW 21D3 ; ON # So DOWNWARDS DOUBLE ARROW 21D4 ; ON # Sm LEFT RIGHT DOUBLE ARROW 21D5..21F3 ; ON # So 
[31] UP DOWN DOUBLE ARROW..UP DOWN WHITE ARROW 21F4..2211 ; ON # Sm [30] RIGHT ARROW WITH SMALL CIRCLE..N-ARY SUMMATION 2214..22FF ; ON # Sm [236] DOT PLUS..Z NOTATION BAG MEMBERSHIP 2300..2307 ; ON # So [8] DIAMETER SIGN..WAVY LINE 2308 ; ON # Ps LEFT CEILING 2309 ; ON # Pe RIGHT CEILING 230A ; ON # Ps LEFT FLOOR 230B ; ON # Pe RIGHT FLOOR 230C..231F ; ON # So [20] BOTTOM RIGHT CROP..BOTTOM RIGHT CORNER 2320..2321 ; ON # Sm [2] TOP HALF INTEGRAL..BOTTOM HALF INTEGRAL 2322..2328 ; ON # So [7] FROWN..KEYBOARD 2329 ; ON # Ps LEFT-POINTING ANGLE BRACKET 232A ; ON # Pe RIGHT-POINTING ANGLE BRACKET 232B..2335 ; ON # So [11] ERASE TO THE LEFT..COUNTERSINK 237B ; ON # So NOT CHECK MARK 237C ; ON # Sm RIGHT ANGLE WITH DOWNWARDS ZIGZAG ARROW 237D..2394 ; ON # So [24] SHOULDERED OPEN BOX..SOFTWARE-FUNCTION SYMBOL 2396..239A ; ON # So [5] DECIMAL SEPARATOR KEY SYMBOL..CLEAR SCREEN SYMBOL 239B..23B3 ; ON # Sm [25] LEFT PARENTHESIS UPPER HOOK..SUMMATION BOTTOM 23B4..23DB ; ON # So [40] TOP SQUARE BRACKET..FUSE 23DC..23E1 ; ON # Sm [6] TOP PARENTHESIS..BOTTOM TORTOISE SHELL BRACKET 23E2..2429 ; ON # So [72] WHITE TRAPEZIUM..SYMBOL FOR DELETE MEDIUM SHADE FORM 2440..244A ; ON # So [11] OCR HOOK..OCR DOUBLE BACKSLASH 2460..2487 ; ON # No [40] CIRCLED DIGIT ONE..PARENTHESIZED NUMBER TWENTY 24EA..24FF ; ON # No [22] CIRCLED DIGIT ZERO..NEGATIVE CIRCLED DIGIT ZERO 2500..25B6 ; ON # So [183] BOX DRAWINGS LIGHT HORIZONTAL..BLACK RIGHT-POINTING TRIANGLE 25B7 ; ON # Sm WHITE RIGHT-POINTING TRIANGLE 25B8..25C0 ; ON # So [9] BLACK RIGHT-POINTING SMALL TRIANGLE..BLACK LEFT-POINTING TRIANGLE 25C1 ; ON # Sm WHITE LEFT-POINTING TRIANGLE 25C2..25F7 ; ON # So [54] BLACK LEFT-POINTING SMALL TRIANGLE..WHITE CIRCLE WITH UPPER RIGHT QUADRANT 25F8..25FF ; ON # Sm [8] UPPER LEFT TRIANGLE..LOWER RIGHT TRIANGLE 2600..266E ; ON # So [111] BLACK SUN WITH RAYS..MUSIC NATURAL SIGN 266F ; ON # Sm MUSIC SHARP SIGN 2670..26AB ; ON # So [60] WEST SYRIAC CROSS..MEDIUM BLACK CIRCLE 26AD..2767 ; ON # So [187] 
MARRIAGE SYMBOL..ROTATED FLORAL HEART BULLET 2768 ; ON # Ps MEDIUM LEFT PARENTHESIS ORNAMENT 2769 ; ON # Pe MEDIUM RIGHT PARENTHESIS ORNAMENT 276A ; ON # Ps MEDIUM FLATTENED LEFT PARENTHESIS ORNAMENT 276B ; ON # Pe MEDIUM FLATTENED RIGHT PARENTHESIS ORNAMENT 276C ; ON # Ps MEDIUM LEFT-POINTING ANGLE BRACKET ORNAMENT 276D ; ON # Pe MEDIUM RIGHT-POINTING ANGLE BRACKET ORNAMENT 276E ; ON # Ps HEAVY LEFT-POINTING ANGLE QUOTATION MARK ORNAMENT 276F ; ON # Pe HEAVY RIGHT-POINTING ANGLE QUOTATION MARK ORNAMENT 2770 ; ON # Ps HEAVY LEFT-POINTING ANGLE BRACKET ORNAMENT 2771 ; ON # Pe HEAVY RIGHT-POINTING ANGLE BRACKET ORNAMENT 2772 ; ON # Ps LIGHT LEFT TORTOISE SHELL BRACKET ORNAMENT 2773 ; ON # Pe LIGHT RIGHT TORTOISE SHELL BRACKET ORNAMENT 2774 ; ON # Ps MEDIUM LEFT CURLY BRACKET ORNAMENT 2775 ; ON # Pe MEDIUM RIGHT CURLY BRACKET ORNAMENT 2776..2793 ; ON # No [30] DINGBAT NEGATIVE CIRCLED DIGIT ONE..DINGBAT NEGATIVE CIRCLED SANS-SERIF NUMBER TEN 2794..27BF ; ON # So [44] HEAVY WIDE-HEADED RIGHTWARDS ARROW..DOUBLE CURLY LOOP 27C0..27C4 ; ON # Sm [5] THREE DIMENSIONAL ANGLE..OPEN SUPERSET 27C5 ; ON # Ps LEFT S-SHAPED BAG DELIMITER 27C6 ; ON # Pe RIGHT S-SHAPED BAG DELIMITER 27C7..27E5 ; ON # Sm [31] OR WITH DOT INSIDE..WHITE SQUARE WITH RIGHTWARDS TICK 27E6 ; ON # Ps MATHEMATICAL LEFT WHITE SQUARE BRACKET 27E7 ; ON # Pe MATHEMATICAL RIGHT WHITE SQUARE BRACKET 27E8 ; ON # Ps MATHEMATICAL LEFT ANGLE BRACKET 27E9 ; ON # Pe MATHEMATICAL RIGHT ANGLE BRACKET 27EA ; ON # Ps MATHEMATICAL LEFT DOUBLE ANGLE BRACKET 27EB ; ON # Pe MATHEMATICAL RIGHT DOUBLE ANGLE BRACKET 27EC ; ON # Ps MATHEMATICAL LEFT WHITE TORTOISE SHELL BRACKET 27ED ; ON # Pe MATHEMATICAL RIGHT WHITE TORTOISE SHELL BRACKET 27EE ; ON # Ps MATHEMATICAL LEFT FLATTENED PARENTHESIS 27EF ; ON # Pe MATHEMATICAL RIGHT FLATTENED PARENTHESIS 27F0..27FF ; ON # Sm [16] UPWARDS QUADRUPLE ARROW..LONG RIGHTWARDS SQUIGGLE ARROW 2900..2982 ; ON # Sm [131] RIGHTWARDS TWO-HEADED ARROW WITH VERTICAL STROKE..Z NOTATION TYPE COLON 2983 
; ON # Ps LEFT WHITE CURLY BRACKET 2984 ; ON # Pe RIGHT WHITE CURLY BRACKET 2985 ; ON # Ps LEFT WHITE PARENTHESIS 2986 ; ON # Pe RIGHT WHITE PARENTHESIS 2987 ; ON # Ps Z NOTATION LEFT IMAGE BRACKET 2988 ; ON # Pe Z NOTATION RIGHT IMAGE BRACKET 2989 ; ON # Ps Z NOTATION LEFT BINDING BRACKET 298A ; ON # Pe Z NOTATION RIGHT BINDING BRACKET 298B ; ON # Ps LEFT SQUARE BRACKET WITH UNDERBAR 298C ; ON # Pe RIGHT SQUARE BRACKET WITH UNDERBAR 298D ; ON # Ps LEFT SQUARE BRACKET WITH TICK IN TOP CORNER 298E ; ON # Pe RIGHT SQUARE BRACKET WITH TICK IN BOTTOM CORNER 298F ; ON # Ps LEFT SQUARE BRACKET WITH TICK IN BOTTOM CORNER 2990 ; ON # Pe RIGHT SQUARE BRACKET WITH TICK IN TOP CORNER 2991 ; ON # Ps LEFT ANGLE BRACKET WITH DOT 2992 ; ON # Pe RIGHT ANGLE BRACKET WITH DOT 2993 ; ON # Ps LEFT ARC LESS-THAN BRACKET 2994 ; ON # Pe RIGHT ARC GREATER-THAN BRACKET 2995 ; ON # Ps DOUBLE LEFT ARC GREATER-THAN BRACKET 2996 ; ON # Pe DOUBLE RIGHT ARC LESS-THAN BRACKET 2997 ; ON # Ps LEFT BLACK TORTOISE SHELL BRACKET 2998 ; ON # Pe RIGHT BLACK TORTOISE SHELL BRACKET 2999..29D7 ; ON # Sm [63] DOTTED FENCE..BLACK HOURGLASS 29D8 ; ON # Ps LEFT WIGGLY FENCE 29D9 ; ON # Pe RIGHT WIGGLY FENCE 29DA ; ON # Ps LEFT DOUBLE WIGGLY FENCE 29DB ; ON # Pe RIGHT DOUBLE WIGGLY FENCE 29DC..29FB ; ON # Sm [32] INCOMPLETE INFINITY..TRIPLE PLUS 29FC ; ON # Ps LEFT-POINTING CURVED ANGLE BRACKET 29FD ; ON # Pe RIGHT-POINTING CURVED ANGLE BRACKET 29FE..2AFF ; ON # Sm [258] TINY..N-ARY WHITE VERTICAL BAR 2B00..2B2F ; ON # So [48] NORTH EAST WHITE ARROW..WHITE VERTICAL ELLIPSE 2B30..2B44 ; ON # Sm [21] LEFT ARROW WITH SMALL CIRCLE..RIGHTWARDS ARROW THROUGH SUPERSET 2B45..2B46 ; ON # So [2] LEFTWARDS QUADRUPLE ARROW..RIGHTWARDS QUADRUPLE ARROW 2B47..2B4C ; ON # Sm [6] REVERSE TILDE OPERATOR ABOVE RIGHTWARDS ARROW..RIGHTWARDS ARROW ABOVE REVERSE TILDE OPERATOR 2B4D..2B73 ; ON # So [39] DOWNWARDS TRIANGLE-HEADED ZIGZAG ARROW..DOWNWARDS TRIANGLE-HEADED ARROW TO BAR 2B76..2BFF ; ON # So [138] NORTH WEST TRIANGLE-HEADED 
ARROW TO BAR..HELLSCHREIBER PAUSE SYMBOL 2CE5..2CEA ; ON # So [6] COPTIC SYMBOL MI RO..COPTIC SYMBOL SHIMA SIMA 2CF9..2CFC ; ON # Po [4] COPTIC OLD NUBIAN FULL STOP..COPTIC OLD NUBIAN VERSE DIVIDER 2CFD ; ON # No COPTIC FRACTION ONE HALF 2CFE..2CFF ; ON # Po [2] COPTIC FULL STOP..COPTIC MORPHOLOGICAL DIVIDER 2E00..2E01 ; ON # Po [2] RIGHT ANGLE SUBSTITUTION MARKER..RIGHT ANGLE DOTTED SUBSTITUTION MARKER 2E02 ; ON # Pi LEFT SUBSTITUTION BRACKET 2E03 ; ON # Pf RIGHT SUBSTITUTION BRACKET 2E04 ; ON # Pi LEFT DOTTED SUBSTITUTION BRACKET 2E05 ; ON # Pf RIGHT DOTTED SUBSTITUTION BRACKET 2E06..2E08 ; ON # Po [3] RAISED INTERPOLATION MARKER..DOTTED TRANSPOSITION MARKER 2E09 ; ON # Pi LEFT TRANSPOSITION BRACKET 2E0A ; ON # Pf RIGHT TRANSPOSITION BRACKET 2E0B ; ON # Po RAISED SQUARE 2E0C ; ON # Pi LEFT RAISED OMISSION BRACKET 2E0D ; ON # Pf RIGHT RAISED OMISSION BRACKET 2E0E..2E16 ; ON # Po [9] EDITORIAL CORONIS..DOTTED RIGHT-POINTING ANGLE 2E17 ; ON # Pd DOUBLE OBLIQUE HYPHEN 2E18..2E19 ; ON # Po [2] INVERTED INTERROBANG..PALM BRANCH 2E1A ; ON # Pd HYPHEN WITH DIAERESIS 2E1B ; ON # Po TILDE WITH RING ABOVE 2E1C ; ON # Pi LEFT LOW PARAPHRASE BRACKET 2E1D ; ON # Pf RIGHT LOW PARAPHRASE BRACKET 2E1E..2E1F ; ON # Po [2] TILDE WITH DOT ABOVE..TILDE WITH DOT BELOW 2E20 ; ON # Pi LEFT VERTICAL BAR WITH QUILL 2E21 ; ON # Pf RIGHT VERTICAL BAR WITH QUILL 2E22 ; ON # Ps TOP LEFT HALF BRACKET 2E23 ; ON # Pe TOP RIGHT HALF BRACKET 2E24 ; ON # Ps BOTTOM LEFT HALF BRACKET 2E25 ; ON # Pe BOTTOM RIGHT HALF BRACKET 2E26 ; ON # Ps LEFT SIDEWAYS U BRACKET 2E27 ; ON # Pe RIGHT SIDEWAYS U BRACKET 2E28 ; ON # Ps LEFT DOUBLE PARENTHESIS 2E29 ; ON # Pe RIGHT DOUBLE PARENTHESIS 2E2A..2E2E ; ON # Po [5] TWO DOTS OVER ONE DOT PUNCTUATION..REVERSED QUESTION MARK 2E2F ; ON # Lm VERTICAL TILDE 2E30..2E39 ; ON # Po [10] RING POINT..TOP HALF SECTION SIGN 2E3A..2E3B ; ON # Pd [2] TWO-EM DASH..THREE-EM DASH 2E3C..2E3F ; ON # Po [4] STENOGRAPHIC FULL STOP..CAPITULUM 2E40 ; ON # Pd DOUBLE HYPHEN 2E41 ; ON # Po 
REVERSED COMMA 2E42 ; ON # Ps DOUBLE LOW-REVERSED-9 QUOTATION MARK 2E43..2E4F ; ON # Po [13] DASH WITH LEFT UPTURN..CORNISH VERSE DIVIDER 2E50..2E51 ; ON # So [2] CROSS PATTY WITH RIGHT CROSSBAR..CROSS PATTY WITH LEFT CROSSBAR 2E52..2E54 ; ON # Po [3] TIRONIAN SIGN CAPITAL ET..MEDIEVAL QUESTION MARK 2E55 ; ON # Ps LEFT SQUARE BRACKET WITH STROKE 2E56 ; ON # Pe RIGHT SQUARE BRACKET WITH STROKE 2E57 ; ON # Ps LEFT SQUARE BRACKET WITH DOUBLE STROKE 2E58 ; ON # Pe RIGHT SQUARE BRACKET WITH DOUBLE STROKE 2E59 ; ON # Ps TOP HALF LEFT PARENTHESIS 2E5A ; ON # Pe TOP HALF RIGHT PARENTHESIS 2E5B ; ON # Ps BOTTOM HALF LEFT PARENTHESIS 2E5C ; ON # Pe BOTTOM HALF RIGHT PARENTHESIS 2E5D ; ON # Pd OBLIQUE HYPHEN 2E80..2E99 ; ON # So [26] CJK RADICAL REPEAT..CJK RADICAL RAP 2E9B..2EF3 ; ON # So [89] CJK RADICAL CHOKE..CJK RADICAL C-SIMPLIFIED TURTLE 2F00..2FD5 ; ON # So [214] KANGXI RADICAL ONE..KANGXI RADICAL FLUTE 2FF0..2FFF ; ON # So [16] IDEOGRAPHIC DESCRIPTION CHARACTER LEFT TO RIGHT..IDEOGRAPHIC DESCRIPTION CHARACTER ROTATION 3001..3003 ; ON # Po [3] IDEOGRAPHIC COMMA..DITTO MARK 3004 ; ON # So JAPANESE INDUSTRIAL STANDARD SYMBOL 3008 ; ON # Ps LEFT ANGLE BRACKET 3009 ; ON # Pe RIGHT ANGLE BRACKET 300A ; ON # Ps LEFT DOUBLE ANGLE BRACKET 300B ; ON # Pe RIGHT DOUBLE ANGLE BRACKET 300C ; ON # Ps LEFT CORNER BRACKET 300D ; ON # Pe RIGHT CORNER BRACKET 300E ; ON # Ps LEFT WHITE CORNER BRACKET 300F ; ON # Pe RIGHT WHITE CORNER BRACKET 3010 ; ON # Ps LEFT BLACK LENTICULAR BRACKET 3011 ; ON # Pe RIGHT BLACK LENTICULAR BRACKET 3012..3013 ; ON # So [2] POSTAL MARK..GETA MARK 3014 ; ON # Ps LEFT TORTOISE SHELL BRACKET 3015 ; ON # Pe RIGHT TORTOISE SHELL BRACKET 3016 ; ON # Ps LEFT WHITE LENTICULAR BRACKET 3017 ; ON # Pe RIGHT WHITE LENTICULAR BRACKET 3018 ; ON # Ps LEFT WHITE TORTOISE SHELL BRACKET 3019 ; ON # Pe RIGHT WHITE TORTOISE SHELL BRACKET 301A ; ON # Ps LEFT WHITE SQUARE BRACKET 301B ; ON # Pe RIGHT WHITE SQUARE BRACKET 301C ; ON # Pd WAVE DASH 301D ; ON # Ps REVERSED DOUBLE 
PRIME QUOTATION MARK 301E..301F ; ON # Pe [2] DOUBLE PRIME QUOTATION MARK..LOW DOUBLE PRIME QUOTATION MARK 3020 ; ON # So POSTAL MARK FACE 3030 ; ON # Pd WAVY DASH 3036..3037 ; ON # So [2] CIRCLED POSTAL MARK..IDEOGRAPHIC TELEGRAPH LINE FEED SEPARATOR SYMBOL 303D ; ON # Po PART ALTERNATION MARK 303E..303F ; ON # So [2] IDEOGRAPHIC VARIATION INDICATOR..IDEOGRAPHIC HALF FILL SPACE 309B..309C ; ON # Sk [2] KATAKANA-HIRAGANA VOICED SOUND MARK..KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK 30A0 ; ON # Pd KATAKANA-HIRAGANA DOUBLE HYPHEN 30FB ; ON # Po KATAKANA MIDDLE DOT 31C0..31E5 ; ON # So [38] CJK STROKE T..CJK STROKE SZP 31EF ; ON # So IDEOGRAPHIC DESCRIPTION CHARACTER SUBTRACTION 321D..321E ; ON # So [2] PARENTHESIZED KOREAN CHARACTER OJEON..PARENTHESIZED KOREAN CHARACTER O HU 3250 ; ON # So PARTNERSHIP SIGN 3251..325F ; ON # No [15] CIRCLED NUMBER TWENTY ONE..CIRCLED NUMBER THIRTY FIVE 327C..327E ; ON # So [3] CIRCLED KOREAN CHARACTER CHAMKO..CIRCLED HANGUL IEUNG U 32B1..32BF ; ON # No [15] CIRCLED NUMBER THIRTY SIX..CIRCLED NUMBER FIFTY 32CC..32CF ; ON # So [4] SQUARE HG..LIMITED LIABILITY SIGN 3377..337A ; ON # So [4] SQUARE DM..SQUARE IU 33DE..33DF ; ON # So [2] SQUARE V OVER M..SQUARE A OVER M 33FF ; ON # So SQUARE GAL 4DC0..4DFF ; ON # So [64] HEXAGRAM FOR THE CREATIVE HEAVEN..HEXAGRAM FOR BEFORE COMPLETION A490..A4C6 ; ON # So [55] YI RADICAL QOT..YI RADICAL KE A60D..A60F ; ON # Po [3] VAI COMMA..VAI QUESTION MARK A673 ; ON # Po SLAVONIC ASTERISK A67E ; ON # Po CYRILLIC KAVYKA A67F ; ON # Lm CYRILLIC PAYEROK A700..A716 ; ON # Sk [23] MODIFIER LETTER CHINESE TONE YIN PING..MODIFIER LETTER EXTRA-LOW LEFT-STEM TONE BAR A717..A71F ; ON # Lm [9] MODIFIER LETTER DOT VERTICAL BAR..MODIFIER LETTER LOW INVERTED EXCLAMATION MARK A720..A721 ; ON # Sk [2] MODIFIER LETTER STRESS AND HIGH TONE..MODIFIER LETTER STRESS AND LOW TONE A788 ; ON # Lm MODIFIER LETTER LOW CIRCUMFLEX ACCENT A828..A82B ; ON # So [4] SYLOTI NAGRI POETRY MARK-1..SYLOTI NAGRI POETRY MARK-4 A874..A877 ; ON # 
Po [4] PHAGS-PA SINGLE HEAD MARK..PHAGS-PA MARK DOUBLE SHAD AB6A..AB6B ; ON # Sk [2] MODIFIER LETTER LEFT TACK..MODIFIER LETTER RIGHT TACK FBC3..FBD2 ; ON # So [16] ARABIC LIGATURE JALLA WA-ALAA..ARABIC LIGATURE ALAYHI AR-RAHMAH FD3E ; ON # Pe ORNATE LEFT PARENTHESIS FD3F ; ON # Ps ORNATE RIGHT PARENTHESIS FD40..FD4F ; ON # So [16] ARABIC LIGATURE RAHIMAHU ALLAAH..ARABIC LIGATURE RAHIMAHUM ALLAAH FD90..FD91 ; ON # So [2] ARABIC LIGATURE RAHMATU ALLAAHI ALAYH..ARABIC LIGATURE RAHMATU ALLAAHI ALAYHAA FDC8..FDCF ; ON # So [8] ARABIC LIGATURE RAHIMAHU ALLAAH TAAALAA..ARABIC LIGATURE SALAAMUHU ALAYNAA FDFD..FDFF ; ON # So [3] ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHEEM..ARABIC LIGATURE AZZA WA JALL FE10..FE16 ; ON # Po [7] PRESENTATION FORM FOR VERTICAL COMMA..PRESENTATION FORM FOR VERTICAL QUESTION MARK FE17 ; ON # Ps PRESENTATION FORM FOR VERTICAL LEFT WHITE LENTICULAR BRACKET FE18 ; ON # Pe PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRAKCET FE19 ; ON # Po PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS FE30 ; ON # Po PRESENTATION FORM FOR VERTICAL TWO DOT LEADER FE31..FE32 ; ON # Pd [2] PRESENTATION FORM FOR VERTICAL EM DASH..PRESENTATION FORM FOR VERTICAL EN DASH FE33..FE34 ; ON # Pc [2] PRESENTATION FORM FOR VERTICAL LOW LINE..PRESENTATION FORM FOR VERTICAL WAVY LOW LINE FE35 ; ON # Ps PRESENTATION FORM FOR VERTICAL LEFT PARENTHESIS FE36 ; ON # Pe PRESENTATION FORM FOR VERTICAL RIGHT PARENTHESIS FE37 ; ON # Ps PRESENTATION FORM FOR VERTICAL LEFT CURLY BRACKET FE38 ; ON # Pe PRESENTATION FORM FOR VERTICAL RIGHT CURLY BRACKET FE39 ; ON # Ps PRESENTATION FORM FOR VERTICAL LEFT TORTOISE SHELL BRACKET FE3A ; ON # Pe PRESENTATION FORM FOR VERTICAL RIGHT TORTOISE SHELL BRACKET FE3B ; ON # Ps PRESENTATION FORM FOR VERTICAL LEFT BLACK LENTICULAR BRACKET FE3C ; ON # Pe PRESENTATION FORM FOR VERTICAL RIGHT BLACK LENTICULAR BRACKET FE3D ; ON # Ps PRESENTATION FORM FOR VERTICAL LEFT DOUBLE ANGLE BRACKET FE3E ; ON # Pe PRESENTATION FORM FOR VERTICAL RIGHT DOUBLE 
ANGLE BRACKET FE3F ; ON # Ps PRESENTATION FORM FOR VERTICAL LEFT ANGLE BRACKET FE40 ; ON # Pe PRESENTATION FORM FOR VERTICAL RIGHT ANGLE BRACKET FE41 ; ON # Ps PRESENTATION FORM FOR VERTICAL LEFT CORNER BRACKET FE42 ; ON # Pe PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET FE43 ; ON # Ps PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET FE44 ; ON # Pe PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET FE45..FE46 ; ON # Po [2] SESAME DOT..WHITE SESAME DOT FE47 ; ON # Ps PRESENTATION FORM FOR VERTICAL LEFT SQUARE BRACKET FE48 ; ON # Pe PRESENTATION FORM FOR VERTICAL RIGHT SQUARE BRACKET FE49..FE4C ; ON # Po [4] DASHED OVERLINE..DOUBLE WAVY OVERLINE FE4D..FE4F ; ON # Pc [3] DASHED LOW LINE..WAVY LOW LINE FE51 ; ON # Po SMALL IDEOGRAPHIC COMMA FE54 ; ON # Po SMALL SEMICOLON FE56..FE57 ; ON # Po [2] SMALL QUESTION MARK..SMALL EXCLAMATION MARK FE58 ; ON # Pd SMALL EM DASH FE59 ; ON # Ps SMALL LEFT PARENTHESIS FE5A ; ON # Pe SMALL RIGHT PARENTHESIS FE5B ; ON # Ps SMALL LEFT CURLY BRACKET FE5C ; ON # Pe SMALL RIGHT CURLY BRACKET FE5D ; ON # Ps SMALL LEFT TORTOISE SHELL BRACKET FE5E ; ON # Pe SMALL RIGHT TORTOISE SHELL BRACKET FE60..FE61 ; ON # Po [2] SMALL AMPERSAND..SMALL ASTERISK FE64..FE66 ; ON # Sm [3] SMALL LESS-THAN SIGN..SMALL EQUALS SIGN FE68 ; ON # Po SMALL REVERSE SOLIDUS FE6B ; ON # Po SMALL COMMERCIAL AT FF01..FF02 ; ON # Po [2] FULLWIDTH EXCLAMATION MARK..FULLWIDTH QUOTATION MARK FF06..FF07 ; ON # Po [2] FULLWIDTH AMPERSAND..FULLWIDTH APOSTROPHE FF08 ; ON # Ps FULLWIDTH LEFT PARENTHESIS FF09 ; ON # Pe FULLWIDTH RIGHT PARENTHESIS FF0A ; ON # Po FULLWIDTH ASTERISK FF1B ; ON # Po FULLWIDTH SEMICOLON FF1C..FF1E ; ON # Sm [3] FULLWIDTH LESS-THAN SIGN..FULLWIDTH GREATER-THAN SIGN FF1F..FF20 ; ON # Po [2] FULLWIDTH QUESTION MARK..FULLWIDTH COMMERCIAL AT FF3B ; ON # Ps FULLWIDTH LEFT SQUARE BRACKET FF3C ; ON # Po FULLWIDTH REVERSE SOLIDUS FF3D ; ON # Pe FULLWIDTH RIGHT SQUARE BRACKET FF3E ; ON # Sk FULLWIDTH CIRCUMFLEX ACCENT FF3F ; ON # Pc FULLWIDTH 
LOW LINE FF40 ; ON # Sk FULLWIDTH GRAVE ACCENT FF5B ; ON # Ps FULLWIDTH LEFT CURLY BRACKET FF5C ; ON # Sm FULLWIDTH VERTICAL LINE FF5D ; ON # Pe FULLWIDTH RIGHT CURLY BRACKET FF5E ; ON # Sm FULLWIDTH TILDE FF5F ; ON # Ps FULLWIDTH LEFT WHITE PARENTHESIS FF60 ; ON # Pe FULLWIDTH RIGHT WHITE PARENTHESIS FF61 ; ON # Po HALFWIDTH IDEOGRAPHIC FULL STOP FF62 ; ON # Ps HALFWIDTH LEFT CORNER BRACKET FF63 ; ON # Pe HALFWIDTH RIGHT CORNER BRACKET FF64..FF65 ; ON # Po [2] HALFWIDTH IDEOGRAPHIC COMMA..HALFWIDTH KATAKANA MIDDLE DOT FFE2 ; ON # Sm FULLWIDTH NOT SIGN FFE3 ; ON # Sk FULLWIDTH MACRON FFE4 ; ON # So FULLWIDTH BROKEN BAR FFE8 ; ON # So HALFWIDTH FORMS LIGHT VERTICAL FFE9..FFEC ; ON # Sm [4] HALFWIDTH LEFTWARDS ARROW..HALFWIDTH DOWNWARDS ARROW FFED..FFEE ; ON # So [2] HALFWIDTH BLACK SQUARE..HALFWIDTH WHITE CIRCLE FFF9..FFFB ; ON # Cf [3] INTERLINEAR ANNOTATION ANCHOR..INTERLINEAR ANNOTATION TERMINATOR FFFC..FFFD ; ON # So [2] OBJECT REPLACEMENT CHARACTER..REPLACEMENT CHARACTER 10101 ; ON # Po AEGEAN WORD SEPARATOR DOT 10140..10174 ; ON # Nl [53] GREEK ACROPHONIC ATTIC ONE QUARTER..GREEK ACROPHONIC STRATIAN FIFTY MNAS 10175..10178 ; ON # No [4] GREEK ONE HALF SIGN..GREEK THREE QUARTERS SIGN 10179..10189 ; ON # So [17] GREEK YEAR SIGN..GREEK TRYBLION BASE SIGN 1018A..1018B ; ON # No [2] GREEK ZERO SIGN..GREEK ONE QUARTER SIGN 1018C ; ON # So GREEK SINUSOID SIGN 10190..1019C ; ON # So [13] ROMAN SEXTANS SIGN..ASCIA SYMBOL 101A0 ; ON # So GREEK SYMBOL TAU RHO 1091F ; ON # Po PHOENICIAN WORD SEPARATOR 10B39..10B3F ; ON # Po [7] AVESTAN ABBREVIATION MARK..LARGE ONE RING OVER TWO RINGS PUNCTUATION 10D6E ; ON # Pd GARAY HYPHEN 10ED0 ; ON # Po ARABIC BIBLICAL END OF VERSE 10ED1..10ED8 ; ON # So [8] ARABIC LIGATURE ALAYHAA AS-SALAATU WAS-SALAAM..ARABIC LIGATURE NAWWARA ALLAAHU MARQADAH 11052..11065 ; ON # No [20] BRAHMI NUMBER ONE..BRAHMI NUMBER ONE THOUSAND 11660..1166C ; ON # Po [13] MONGOLIAN BIRGA WITH ORNAMENT..MONGOLIAN TURNED SWIRL BIRGA WITH DOUBLE ORNAMENT 
11FD5..11FDC ; ON # So [8] TAMIL SIGN NEL..TAMIL SIGN MUKKURUNI 11FE1..11FF1 ; ON # So [17] TAMIL SIGN PAARAM..TAMIL SIGN VAKAIYARAA 16FE2 ; ON # Po OLD CHINESE HOOK MARK 1CC00..1CCD5 ; ON # So [214] UP-POINTING GO-KART..LOWER RIGHT QUADRANT STANDING KNIGHT 1CCFA..1CCFC ; ON # So [3] SNAKE SYMBOL..NOSE SYMBOL 1CD00..1CEB3 ; ON # So [436] BLOCK OCTANT-3..BLACK RIGHT TRIANGLE CARET 1CEBA..1CED0 ; ON # So [23] FRAGILE SYMBOL..LEUKOTHEA 1CEE0..1CEEF ; ON # So [16] GEOMANTIC FIGURE POPULUS..GEOMANTIC FIGURE VIA 1CEF0 ; ON # Sm MEDIUM SMALL WHITE CIRCLE WITH HORIZONTAL BAR 1D1E9..1D1EA ; ON # So [2] MUSICAL SYMBOL SORI..MUSICAL SYMBOL KORON 1D200..1D241 ; ON # So [66] GREEK VOCAL NOTATION SYMBOL-1..GREEK INSTRUMENTAL NOTATION SYMBOL-54 1D245 ; ON # So GREEK MUSICAL LEIMMA 1D300..1D356 ; ON # So [87] MONOGRAM FOR EARTH..TETRAGRAM FOR FOSTERING 1D6C1 ; ON # Sm MATHEMATICAL BOLD NABLA 1D6DB ; ON # Sm MATHEMATICAL BOLD PARTIAL DIFFERENTIAL 1D6FB ; ON # Sm MATHEMATICAL ITALIC NABLA 1D715 ; ON # Sm MATHEMATICAL ITALIC PARTIAL DIFFERENTIAL 1D735 ; ON # Sm MATHEMATICAL BOLD ITALIC NABLA 1D74F ; ON # Sm MATHEMATICAL BOLD ITALIC PARTIAL DIFFERENTIAL 1D76F ; ON # Sm MATHEMATICAL SANS-SERIF BOLD NABLA 1D789 ; ON # Sm MATHEMATICAL SANS-SERIF BOLD PARTIAL DIFFERENTIAL 1D7A9 ; ON # Sm MATHEMATICAL SANS-SERIF BOLD ITALIC NABLA 1D7C3 ; ON # Sm MATHEMATICAL SANS-SERIF BOLD ITALIC PARTIAL DIFFERENTIAL 1EEF0..1EEF1 ; ON # Sm [2] ARABIC MATHEMATICAL OPERATOR MEEM WITH HAH WITH TATWEEL..ARABIC MATHEMATICAL OPERATOR HAH WITH DAL 1F000..1F02B ; ON # So [44] MAHJONG TILE EAST WIND..MAHJONG TILE BACK 1F030..1F093 ; ON # So [100] DOMINO TILE HORIZONTAL BACK..DOMINO TILE VERTICAL-06-06 1F0A0..1F0AE ; ON # So [15] PLAYING CARD BACK..PLAYING CARD KING OF SPADES 1F0B1..1F0BF ; ON # So [15] PLAYING CARD ACE OF HEARTS..PLAYING CARD RED JOKER 1F0C1..1F0CF ; ON # So [15] PLAYING CARD ACE OF DIAMONDS..PLAYING CARD BLACK JOKER 1F0D1..1F0F5 ; ON # So [37] PLAYING CARD ACE OF CLUBS..PLAYING CARD TRUMP-21 
1F10B..1F10C ; ON # No [2] DINGBAT CIRCLED SANS-SERIF DIGIT ZERO..DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT ZERO 1F10D..1F10F ; ON # So [3] CIRCLED ZERO WITH SLASH..CIRCLED DOLLAR SIGN WITH OVERLAID BACKSLASH 1F12F ; ON # So COPYLEFT SYMBOL 1F16A..1F16F ; ON # So [6] RAISED MC SIGN..CIRCLED HUMAN FIGURE 1F1AD ; ON # So MASK WORK SYMBOL 1F260..1F265 ; ON # So [6] ROUNDED SYMBOL FOR FU..ROUNDED SYMBOL FOR CAI 1F300..1F3FA ; ON # So [251] CYCLONE..AMPHORA 1F3FB..1F3FF ; ON # Sk [5] EMOJI MODIFIER FITZPATRICK TYPE-1-2..EMOJI MODIFIER FITZPATRICK TYPE-6 1F400..1F6D8 ; ON # So [729] RAT..LANDSLIDE 1F6DC..1F6EC ; ON # So [17] WIRELESS..AIRPLANE ARRIVING 1F6F0..1F6FC ; ON # So [13] SATELLITE..ROLLER SKATE 1F700..1F7D9 ; ON # So [218] ALCHEMICAL SYMBOL FOR QUINTESSENCE..NINE POINTED WHITE STAR 1F7E0..1F7EB ; ON # So [12] LARGE ORANGE CIRCLE..LARGE BROWN SQUARE 1F7F0 ; ON # So HEAVY EQUALS SIGN 1F800..1F80B ; ON # So [12] LEFTWARDS ARROW WITH SMALL TRIANGLE ARROWHEAD..DOWNWARDS ARROW WITH LARGE TRIANGLE ARROWHEAD 1F810..1F847 ; ON # So [56] LEFTWARDS ARROW WITH SMALL EQUILATERAL ARROWHEAD..DOWNWARDS HEAVY ARROW 1F850..1F859 ; ON # So [10] LEFTWARDS SANS-SERIF ARROW..UP DOWN SANS-SERIF ARROW 1F860..1F887 ; ON # So [40] WIDE-HEADED LEFTWARDS LIGHT BARB ARROW..WIDE-HEADED SOUTH WEST VERY HEAVY BARB ARROW 1F890..1F8AD ; ON # So [30] LEFTWARDS TRIANGLE ARROWHEAD..WHITE ARROW SHAFT WIDTH TWO THIRDS 1F8B0..1F8BB ; ON # So [12] ARROW POINTING UPWARDS THEN NORTH WEST..SOUTH WEST ARROW FROM BAR 1F8C0..1F8C1 ; ON # So [2] LEFTWARDS ARROW FROM DOWNWARDS ARROW..RIGHTWARDS ARROW FROM DOWNWARDS ARROW 1F8D0..1F8D8 ; ON # Sm [9] LONG RIGHTWARDS ARROW OVER LONG LEFTWARDS ARROW..LONG LEFT RIGHT ARROW WITH DEPENDENT LOBE 1F900..1FA57 ; ON # So [344] CIRCLED CROSS FORMEE WITH FOUR DOTS..BLACK CHESS ALFIL 1FA60..1FA6D ; ON # So [14] XIANGQI RED GENERAL..XIANGQI BLACK SOLDIER 1FA70..1FA7C ; ON # So [13] BALLET SHOES..CRUTCH 1FA80..1FA8A ; ON # So [11] YO-YO..TROMBONE 1FA8E..1FAC6 ; ON # So [57] 
TREASURE CHEST..FINGERPRINT 1FAC8 ; ON # So HAIRY CREATURE 1FACD..1FADC ; ON # So [16] ORCA..ROOT VEGETABLE 1FADF..1FAEA ; ON # So [12] SPLATTER..DISTORTED FACE 1FAEF..1FAF8 ; ON # So [10] FIGHT CLOUD..RIGHTWARDS PUSHING HAND 1FB00..1FB92 ; ON # So [147] BLOCK SEXTANT-1..UPPER HALF INVERSE MEDIUM SHADE AND LOWER HALF BLOCK 1FB94..1FBEF ; ON # So [92] LEFT HALF INVERSE MEDIUM SHADE AND RIGHT HALF BLOCK..TOP LEFT JUSTIFIED LOWER RIGHT QUARTER BLACK CIRCLE 1FBFA ; ON # So ALARM BELL SYMBOL # Total code points: 6854 # ================================================ # Bidi_Class=Boundary_Neutral 0000..0008 ; BN # Cc [9] .. 000E..001B ; BN # Cc [14] .. 007F..0084 ; BN # Cc [6] .. 0086..009F ; BN # Cc [26] .. 00AD ; BN # Cf SOFT HYPHEN 180E ; BN # Cf MONGOLIAN VOWEL SEPARATOR 200B..200D ; BN # Cf [3] ZERO WIDTH SPACE..ZERO WIDTH JOINER 2060..2064 ; BN # Cf [5] WORD JOINER..INVISIBLE PLUS 2065 ; BN # Cn 206A..206F ; BN # Cf [6] INHIBIT SYMMETRIC SWAPPING..NOMINAL DIGIT SHAPES FDD0..FDEF ; BN # Cn [32] .. FEFF ; BN # Cf ZERO WIDTH NO-BREAK SPACE FFF0..FFF8 ; BN # Cn [9] .. FFFE..FFFF ; BN # Cn [2] .. 1BCA0..1BCA3 ; BN # Cf [4] SHORTHAND FORMAT LETTER OVERLAP..SHORTHAND FORMAT UP STEP 1D173..1D17A ; BN # Cf [8] MUSICAL SYMBOL BEGIN BEAM..MUSICAL SYMBOL END PHRASE 1FFFE..1FFFF ; BN # Cn [2] .. 2FFFE..2FFFF ; BN # Cn [2] .. 3FFFE..3FFFF ; BN # Cn [2] .. 4FFFE..4FFFF ; BN # Cn [2] .. 5FFFE..5FFFF ; BN # Cn [2] .. 6FFFE..6FFFF ; BN # Cn [2] .. 7FFFE..7FFFF ; BN # Cn [2] .. 8FFFE..8FFFF ; BN # Cn [2] .. 9FFFE..9FFFF ; BN # Cn [2] .. AFFFE..AFFFF ; BN # Cn [2] .. BFFFE..BFFFF ; BN # Cn [2] .. CFFFE..CFFFF ; BN # Cn [2] .. DFFFE..E0000 ; BN # Cn [3] .. E0001 ; BN # Cf LANGUAGE TAG E0002..E001F ; BN # Cn [30] .. E0020..E007F ; BN # Cf [96] TAG SPACE..CANCEL TAG E0080..E00FF ; BN # Cn [128] .. E01F0..E0FFF ; BN # Cn [3600] .. EFFFE..EFFFF ; BN # Cn [2] .. FFFFE..FFFFF ; BN # Cn [2] .. 10FFFE..10FFFF; BN # Cn [2] .. 
# Total code points: 4016 # ================================================ # Bidi_Class=Nonspacing_Mark 0300..036F ; NSM # Mn [112] COMBINING GRAVE ACCENT..COMBINING LATIN SMALL LETTER X 0483..0487 ; NSM # Mn [5] COMBINING CYRILLIC TITLO..COMBINING CYRILLIC POKRYTIE 0488..0489 ; NSM # Me [2] COMBINING CYRILLIC HUNDRED THOUSANDS SIGN..COMBINING CYRILLIC MILLIONS SIGN 0591..05BD ; NSM # Mn [45] HEBREW ACCENT ETNAHTA..HEBREW POINT METEG 05BF ; NSM # Mn HEBREW POINT RAFE 05C1..05C2 ; NSM # Mn [2] HEBREW POINT SHIN DOT..HEBREW POINT SIN DOT 05C4..05C5 ; NSM # Mn [2] HEBREW MARK UPPER DOT..HEBREW MARK LOWER DOT 05C7 ; NSM # Mn HEBREW POINT QAMATS QATAN 0610..061A ; NSM # Mn [11] ARABIC SIGN SALLALLAHOU ALAYHE WASSALLAM..ARABIC SMALL KASRA 064B..065F ; NSM # Mn [21] ARABIC FATHATAN..ARABIC WAVY HAMZA BELOW 0670 ; NSM # Mn ARABIC LETTER SUPERSCRIPT ALEF 06D6..06DC ; NSM # Mn [7] ARABIC SMALL HIGH LIGATURE SAD WITH LAM WITH ALEF MAKSURA..ARABIC SMALL HIGH SEEN 06DF..06E4 ; NSM # Mn [6] ARABIC SMALL HIGH ROUNDED ZERO..ARABIC SMALL HIGH MADDA 06E7..06E8 ; NSM # Mn [2] ARABIC SMALL HIGH YEH..ARABIC SMALL HIGH NOON 06EA..06ED ; NSM # Mn [4] ARABIC EMPTY CENTRE LOW STOP..ARABIC SMALL LOW MEEM 0711 ; NSM # Mn SYRIAC LETTER SUPERSCRIPT ALAPH 0730..074A ; NSM # Mn [27] SYRIAC PTHAHA ABOVE..SYRIAC BARREKH 07A6..07B0 ; NSM # Mn [11] THAANA ABAFILI..THAANA SUKUN 07EB..07F3 ; NSM # Mn [9] NKO COMBINING SHORT HIGH TONE..NKO COMBINING DOUBLE DOT ABOVE 07FD ; NSM # Mn NKO DANTAYALAN 0816..0819 ; NSM # Mn [4] SAMARITAN MARK IN..SAMARITAN MARK DAGESH 081B..0823 ; NSM # Mn [9] SAMARITAN MARK EPENTHETIC YUT..SAMARITAN VOWEL SIGN A 0825..0827 ; NSM # Mn [3] SAMARITAN VOWEL SIGN SHORT A..SAMARITAN VOWEL SIGN U 0829..082D ; NSM # Mn [5] SAMARITAN VOWEL SIGN LONG I..SAMARITAN MARK NEQUDAA 0859..085B ; NSM # Mn [3] MANDAIC AFFRICATION MARK..MANDAIC GEMINATION MARK 0897..089F ; NSM # Mn [9] ARABIC PEPET..ARABIC HALF MADDA OVER MADDA 08CA..08E1 ; NSM # Mn [24] ARABIC SMALL HIGH FARSI YEH..ARABIC 
SMALL HIGH SIGN SAFHA 08E3..0902 ; NSM # Mn [32] ARABIC TURNED DAMMA BELOW..DEVANAGARI SIGN ANUSVARA 093A ; NSM # Mn DEVANAGARI VOWEL SIGN OE 093C ; NSM # Mn DEVANAGARI SIGN NUKTA 0941..0948 ; NSM # Mn [8] DEVANAGARI VOWEL SIGN U..DEVANAGARI VOWEL SIGN AI 094D ; NSM # Mn DEVANAGARI SIGN VIRAMA 0951..0957 ; NSM # Mn [7] DEVANAGARI STRESS SIGN UDATTA..DEVANAGARI VOWEL SIGN UUE 0962..0963 ; NSM # Mn [2] DEVANAGARI VOWEL SIGN VOCALIC L..DEVANAGARI VOWEL SIGN VOCALIC LL 0981 ; NSM # Mn BENGALI SIGN CANDRABINDU 09BC ; NSM # Mn BENGALI SIGN NUKTA 09C1..09C4 ; NSM # Mn [4] BENGALI VOWEL SIGN U..BENGALI VOWEL SIGN VOCALIC RR 09CD ; NSM # Mn BENGALI SIGN VIRAMA 09E2..09E3 ; NSM # Mn [2] BENGALI VOWEL SIGN VOCALIC L..BENGALI VOWEL SIGN VOCALIC LL 09FE ; NSM # Mn BENGALI SANDHI MARK 0A01..0A02 ; NSM # Mn [2] GURMUKHI SIGN ADAK BINDI..GURMUKHI SIGN BINDI 0A3C ; NSM # Mn GURMUKHI SIGN NUKTA 0A41..0A42 ; NSM # Mn [2] GURMUKHI VOWEL SIGN U..GURMUKHI VOWEL SIGN UU 0A47..0A48 ; NSM # Mn [2] GURMUKHI VOWEL SIGN EE..GURMUKHI VOWEL SIGN AI 0A4B..0A4D ; NSM # Mn [3] GURMUKHI VOWEL SIGN OO..GURMUKHI SIGN VIRAMA 0A51 ; NSM # Mn GURMUKHI SIGN UDAAT 0A70..0A71 ; NSM # Mn [2] GURMUKHI TIPPI..GURMUKHI ADDAK 0A75 ; NSM # Mn GURMUKHI SIGN YAKASH 0A81..0A82 ; NSM # Mn [2] GUJARATI SIGN CANDRABINDU..GUJARATI SIGN ANUSVARA 0ABC ; NSM # Mn GUJARATI SIGN NUKTA 0AC1..0AC5 ; NSM # Mn [5] GUJARATI VOWEL SIGN U..GUJARATI VOWEL SIGN CANDRA E 0AC7..0AC8 ; NSM # Mn [2] GUJARATI VOWEL SIGN E..GUJARATI VOWEL SIGN AI 0ACD ; NSM # Mn GUJARATI SIGN VIRAMA 0AE2..0AE3 ; NSM # Mn [2] GUJARATI VOWEL SIGN VOCALIC L..GUJARATI VOWEL SIGN VOCALIC LL 0AFA..0AFF ; NSM # Mn [6] GUJARATI SIGN SUKUN..GUJARATI SIGN TWO-CIRCLE NUKTA ABOVE 0B01 ; NSM # Mn ORIYA SIGN CANDRABINDU 0B3C ; NSM # Mn ORIYA SIGN NUKTA 0B3F ; NSM # Mn ORIYA VOWEL SIGN I 0B41..0B44 ; NSM # Mn [4] ORIYA VOWEL SIGN U..ORIYA VOWEL SIGN VOCALIC RR 0B4D ; NSM # Mn ORIYA SIGN VIRAMA 0B55..0B56 ; NSM # Mn [2] ORIYA SIGN OVERLINE..ORIYA AI LENGTH MARK 
0B62..0B63 ; NSM # Mn [2] ORIYA VOWEL SIGN VOCALIC L..ORIYA VOWEL SIGN VOCALIC LL 0B82 ; NSM # Mn TAMIL SIGN ANUSVARA 0BC0 ; NSM # Mn TAMIL VOWEL SIGN II 0BCD ; NSM # Mn TAMIL SIGN VIRAMA 0C00 ; NSM # Mn TELUGU SIGN COMBINING CANDRABINDU ABOVE 0C04 ; NSM # Mn TELUGU SIGN COMBINING ANUSVARA ABOVE 0C3C ; NSM # Mn TELUGU SIGN NUKTA 0C3E..0C40 ; NSM # Mn [3] TELUGU VOWEL SIGN AA..TELUGU VOWEL SIGN II 0C46..0C48 ; NSM # Mn [3] TELUGU VOWEL SIGN E..TELUGU VOWEL SIGN AI 0C4A..0C4D ; NSM # Mn [4] TELUGU VOWEL SIGN O..TELUGU SIGN VIRAMA 0C55..0C56 ; NSM # Mn [2] TELUGU LENGTH MARK..TELUGU AI LENGTH MARK 0C62..0C63 ; NSM # Mn [2] TELUGU VOWEL SIGN VOCALIC L..TELUGU VOWEL SIGN VOCALIC LL 0C81 ; NSM # Mn KANNADA SIGN CANDRABINDU 0CBC ; NSM # Mn KANNADA SIGN NUKTA 0CCC..0CCD ; NSM # Mn [2] KANNADA VOWEL SIGN AU..KANNADA SIGN VIRAMA 0CE2..0CE3 ; NSM # Mn [2] KANNADA VOWEL SIGN VOCALIC L..KANNADA VOWEL SIGN VOCALIC LL 0D00..0D01 ; NSM # Mn [2] MALAYALAM SIGN COMBINING ANUSVARA ABOVE..MALAYALAM SIGN CANDRABINDU 0D3B..0D3C ; NSM # Mn [2] MALAYALAM SIGN VERTICAL BAR VIRAMA..MALAYALAM SIGN CIRCULAR VIRAMA 0D41..0D44 ; NSM # Mn [4] MALAYALAM VOWEL SIGN U..MALAYALAM VOWEL SIGN VOCALIC RR 0D4D ; NSM # Mn MALAYALAM SIGN VIRAMA 0D62..0D63 ; NSM # Mn [2] MALAYALAM VOWEL SIGN VOCALIC L..MALAYALAM VOWEL SIGN VOCALIC LL 0D81 ; NSM # Mn SINHALA SIGN CANDRABINDU 0DCA ; NSM # Mn SINHALA SIGN AL-LAKUNA 0DD2..0DD4 ; NSM # Mn [3] SINHALA VOWEL SIGN KETTI IS-PILLA..SINHALA VOWEL SIGN KETTI PAA-PILLA 0DD6 ; NSM # Mn SINHALA VOWEL SIGN DIGA PAA-PILLA 0E31 ; NSM # Mn THAI CHARACTER MAI HAN-AKAT 0E34..0E3A ; NSM # Mn [7] THAI CHARACTER SARA I..THAI CHARACTER PHINTHU 0E47..0E4E ; NSM # Mn [8] THAI CHARACTER MAITAIKHU..THAI CHARACTER YAMAKKAN 0EB1 ; NSM # Mn LAO VOWEL SIGN MAI KAN 0EB4..0EBC ; NSM # Mn [9] LAO VOWEL SIGN I..LAO SEMIVOWEL SIGN LO 0EC8..0ECE ; NSM # Mn [7] LAO TONE MAI EK..LAO YAMAKKAN 0F18..0F19 ; NSM # Mn [2] TIBETAN ASTROLOGICAL SIGN -KHYUD PA..TIBETAN ASTROLOGICAL SIGN SDONG TSHUGS 0F35 
; NSM # Mn TIBETAN MARK NGAS BZUNG NYI ZLA 0F37 ; NSM # Mn TIBETAN MARK NGAS BZUNG SGOR RTAGS 0F39 ; NSM # Mn TIBETAN MARK TSA -PHRU 0F71..0F7E ; NSM # Mn [14] TIBETAN VOWEL SIGN AA..TIBETAN SIGN RJES SU NGA RO 0F80..0F84 ; NSM # Mn [5] TIBETAN VOWEL SIGN REVERSED I..TIBETAN MARK HALANTA 0F86..0F87 ; NSM # Mn [2] TIBETAN SIGN LCI RTAGS..TIBETAN SIGN YANG RTAGS 0F8D..0F97 ; NSM # Mn [11] TIBETAN SUBJOINED SIGN LCE TSA CAN..TIBETAN SUBJOINED LETTER JA 0F99..0FBC ; NSM # Mn [36] TIBETAN SUBJOINED LETTER NYA..TIBETAN SUBJOINED LETTER FIXED-FORM RA 0FC6 ; NSM # Mn TIBETAN SYMBOL PADMA GDAN 102D..1030 ; NSM # Mn [4] MYANMAR VOWEL SIGN I..MYANMAR VOWEL SIGN UU 1032..1037 ; NSM # Mn [6] MYANMAR VOWEL SIGN AI..MYANMAR SIGN DOT BELOW 1039..103A ; NSM # Mn [2] MYANMAR SIGN VIRAMA..MYANMAR SIGN ASAT 103D..103E ; NSM # Mn [2] MYANMAR CONSONANT SIGN MEDIAL WA..MYANMAR CONSONANT SIGN MEDIAL HA 1058..1059 ; NSM # Mn [2] MYANMAR VOWEL SIGN VOCALIC L..MYANMAR VOWEL SIGN VOCALIC LL 105E..1060 ; NSM # Mn [3] MYANMAR CONSONANT SIGN MON MEDIAL NA..MYANMAR CONSONANT SIGN MON MEDIAL LA 1071..1074 ; NSM # Mn [4] MYANMAR VOWEL SIGN GEBA KAREN I..MYANMAR VOWEL SIGN KAYAH EE 1082 ; NSM # Mn MYANMAR CONSONANT SIGN SHAN MEDIAL WA 1085..1086 ; NSM # Mn [2] MYANMAR VOWEL SIGN SHAN E ABOVE..MYANMAR VOWEL SIGN SHAN FINAL Y 108D ; NSM # Mn MYANMAR SIGN SHAN COUNCIL EMPHATIC TONE 109D ; NSM # Mn MYANMAR VOWEL SIGN AITON AI 135D..135F ; NSM # Mn [3] ETHIOPIC COMBINING GEMINATION AND VOWEL LENGTH MARK..ETHIOPIC COMBINING GEMINATION MARK 1712..1714 ; NSM # Mn [3] TAGALOG VOWEL SIGN I..TAGALOG SIGN VIRAMA 1732..1733 ; NSM # Mn [2] HANUNOO VOWEL SIGN I..HANUNOO VOWEL SIGN U 1752..1753 ; NSM # Mn [2] BUHID VOWEL SIGN I..BUHID VOWEL SIGN U 1772..1773 ; NSM # Mn [2] TAGBANWA VOWEL SIGN I..TAGBANWA VOWEL SIGN U 17B4..17B5 ; NSM # Mn [2] KHMER VOWEL INHERENT AQ..KHMER VOWEL INHERENT AA 17B7..17BD ; NSM # Mn [7] KHMER VOWEL SIGN I..KHMER VOWEL SIGN UA 17C6 ; NSM # Mn KHMER SIGN NIKAHIT 17C9..17D3 ; NSM # Mn 
[11] KHMER SIGN MUUSIKATOAN..KHMER SIGN BATHAMASAT 17DD ; NSM # Mn KHMER SIGN ATTHACAN 180B..180D ; NSM # Mn [3] MONGOLIAN FREE VARIATION SELECTOR ONE..MONGOLIAN FREE VARIATION SELECTOR THREE 180F ; NSM # Mn MONGOLIAN FREE VARIATION SELECTOR FOUR 1885..1886 ; NSM # Mn [2] MONGOLIAN LETTER ALI GALI BALUDA..MONGOLIAN LETTER ALI GALI THREE BALUDA 18A9 ; NSM # Mn MONGOLIAN LETTER ALI GALI DAGALGA 1920..1922 ; NSM # Mn [3] LIMBU VOWEL SIGN A..LIMBU VOWEL SIGN U 1927..1928 ; NSM # Mn [2] LIMBU VOWEL SIGN E..LIMBU VOWEL SIGN O 1932 ; NSM # Mn LIMBU SMALL LETTER ANUSVARA 1939..193B ; NSM # Mn [3] LIMBU SIGN MUKPHRENG..LIMBU SIGN SA-I 1A17..1A18 ; NSM # Mn [2] BUGINESE VOWEL SIGN I..BUGINESE VOWEL SIGN U 1A1B ; NSM # Mn BUGINESE VOWEL SIGN AE 1A56 ; NSM # Mn TAI THAM CONSONANT SIGN MEDIAL LA 1A58..1A5E ; NSM # Mn [7] TAI THAM SIGN MAI KANG LAI..TAI THAM CONSONANT SIGN SA 1A60 ; NSM # Mn TAI THAM SIGN SAKOT 1A62 ; NSM # Mn TAI THAM VOWEL SIGN MAI SAT 1A65..1A6C ; NSM # Mn [8] TAI THAM VOWEL SIGN I..TAI THAM VOWEL SIGN OA BELOW 1A73..1A7C ; NSM # Mn [10] TAI THAM VOWEL SIGN OA ABOVE..TAI THAM SIGN KHUEN-LUE KARAN 1A7F ; NSM # Mn TAI THAM COMBINING CRYPTOGRAMMIC DOT 1AB0..1ABD ; NSM # Mn [14] COMBINING DOUBLED CIRCUMFLEX ACCENT..COMBINING PARENTHESES BELOW 1ABE ; NSM # Me COMBINING PARENTHESES OVERLAY 1ABF..1ADD ; NSM # Mn [31] COMBINING LATIN SMALL LETTER W BELOW..COMBINING DOT-AND-RING BELOW 1AE0..1AEB ; NSM # Mn [12] COMBINING LEFT TACK ABOVE..COMBINING DOUBLE RIGHTWARDS ARROW ABOVE 1B00..1B03 ; NSM # Mn [4] BALINESE SIGN ULU RICEM..BALINESE SIGN SURANG 1B34 ; NSM # Mn BALINESE SIGN REREKAN 1B36..1B3A ; NSM # Mn [5] BALINESE VOWEL SIGN ULU..BALINESE VOWEL SIGN RA REPA 1B3C ; NSM # Mn BALINESE VOWEL SIGN LA LENGA 1B42 ; NSM # Mn BALINESE VOWEL SIGN PEPET 1B6B..1B73 ; NSM # Mn [9] BALINESE MUSICAL SYMBOL COMBINING TEGEH..BALINESE MUSICAL SYMBOL COMBINING GONG 1B80..1B81 ; NSM # Mn [2] SUNDANESE SIGN PANYECEK..SUNDANESE SIGN PANGLAYAR 1BA2..1BA5 ; NSM # Mn [4] SUNDANESE 
CONSONANT SIGN PANYAKRA..SUNDANESE VOWEL SIGN PANYUKU 1BA8..1BA9 ; NSM # Mn [2] SUNDANESE VOWEL SIGN PAMEPET..SUNDANESE VOWEL SIGN PANEULEUNG 1BAB..1BAD ; NSM # Mn [3] SUNDANESE SIGN VIRAMA..SUNDANESE CONSONANT SIGN PASANGAN WA 1BE6 ; NSM # Mn BATAK SIGN TOMPI 1BE8..1BE9 ; NSM # Mn [2] BATAK VOWEL SIGN PAKPAK E..BATAK VOWEL SIGN EE 1BED ; NSM # Mn BATAK VOWEL SIGN KARO O 1BEF..1BF1 ; NSM # Mn [3] BATAK VOWEL SIGN U FOR SIMALUNGUN SA..BATAK CONSONANT SIGN H 1C2C..1C33 ; NSM # Mn [8] LEPCHA VOWEL SIGN E..LEPCHA CONSONANT SIGN T 1C36..1C37 ; NSM # Mn [2] LEPCHA SIGN RAN..LEPCHA SIGN NUKTA 1CD0..1CD2 ; NSM # Mn [3] VEDIC TONE KARSHANA..VEDIC TONE PRENKHA 1CD4..1CE0 ; NSM # Mn [13] VEDIC SIGN YAJURVEDIC MIDLINE SVARITA..VEDIC TONE RIGVEDIC KASHMIRI INDEPENDENT SVARITA 1CE2..1CE8 ; NSM # Mn [7] VEDIC SIGN VISARGA SVARITA..VEDIC SIGN VISARGA ANUDATTA WITH TAIL 1CED ; NSM # Mn VEDIC SIGN TIRYAK 1CF4 ; NSM # Mn VEDIC TONE CANDRA ABOVE 1CF8..1CF9 ; NSM # Mn [2] VEDIC TONE RING ABOVE..VEDIC TONE DOUBLE RING ABOVE 1DC0..1DFF ; NSM # Mn [64] COMBINING DOTTED GRAVE ACCENT..COMBINING RIGHT ARROWHEAD AND DOWN ARROWHEAD BELOW 20D0..20DC ; NSM # Mn [13] COMBINING LEFT HARPOON ABOVE..COMBINING FOUR DOTS ABOVE 20DD..20E0 ; NSM # Me [4] COMBINING ENCLOSING CIRCLE..COMBINING ENCLOSING CIRCLE BACKSLASH 20E1 ; NSM # Mn COMBINING LEFT RIGHT ARROW ABOVE 20E2..20E4 ; NSM # Me [3] COMBINING ENCLOSING SCREEN..COMBINING ENCLOSING UPWARD POINTING TRIANGLE 20E5..20F0 ; NSM # Mn [12] COMBINING REVERSE SOLIDUS OVERLAY..COMBINING ASTERISK ABOVE 2CEF..2CF1 ; NSM # Mn [3] COPTIC COMBINING NI ABOVE..COPTIC COMBINING SPIRITUS LENIS 2D7F ; NSM # Mn TIFINAGH CONSONANT JOINER 2DE0..2DFF ; NSM # Mn [32] COMBINING CYRILLIC LETTER BE..COMBINING CYRILLIC LETTER IOTIFIED BIG YUS 302A..302D ; NSM # Mn [4] IDEOGRAPHIC LEVEL TONE MARK..IDEOGRAPHIC ENTERING TONE MARK 3099..309A ; NSM # Mn [2] COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK..COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK A66F ; NSM # Mn COMBINING 
CYRILLIC VZMET A670..A672 ; NSM # Me [3] COMBINING CYRILLIC TEN MILLIONS SIGN..COMBINING CYRILLIC THOUSAND MILLIONS SIGN A674..A67D ; NSM # Mn [10] COMBINING CYRILLIC LETTER UKRAINIAN IE..COMBINING CYRILLIC PAYEROK A69E..A69F ; NSM # Mn [2] COMBINING CYRILLIC LETTER EF..COMBINING CYRILLIC LETTER IOTIFIED E A6F0..A6F1 ; NSM # Mn [2] BAMUM COMBINING MARK KOQNDON..BAMUM COMBINING MARK TUKWENTIS A802 ; NSM # Mn SYLOTI NAGRI SIGN DVISVARA A806 ; NSM # Mn SYLOTI NAGRI SIGN HASANTA A80B ; NSM # Mn SYLOTI NAGRI SIGN ANUSVARA A825..A826 ; NSM # Mn [2] SYLOTI NAGRI VOWEL SIGN U..SYLOTI NAGRI VOWEL SIGN E A82C ; NSM # Mn SYLOTI NAGRI SIGN ALTERNATE HASANTA A8C4..A8C5 ; NSM # Mn [2] SAURASHTRA SIGN VIRAMA..SAURASHTRA SIGN CANDRABINDU A8E0..A8F1 ; NSM # Mn [18] COMBINING DEVANAGARI DIGIT ZERO..COMBINING DEVANAGARI SIGN AVAGRAHA A8FF ; NSM # Mn DEVANAGARI VOWEL SIGN AY A926..A92D ; NSM # Mn [8] KAYAH LI VOWEL UE..KAYAH LI TONE CALYA PLOPHU A947..A951 ; NSM # Mn [11] REJANG VOWEL SIGN I..REJANG CONSONANT SIGN R A980..A982 ; NSM # Mn [3] JAVANESE SIGN PANYANGGA..JAVANESE SIGN LAYAR A9B3 ; NSM # Mn JAVANESE SIGN CECAK TELU A9B6..A9B9 ; NSM # Mn [4] JAVANESE VOWEL SIGN WULU..JAVANESE VOWEL SIGN SUKU MENDUT A9BC..A9BD ; NSM # Mn [2] JAVANESE VOWEL SIGN PEPET..JAVANESE CONSONANT SIGN KERET A9E5 ; NSM # Mn MYANMAR SIGN SHAN SAW AA29..AA2E ; NSM # Mn [6] CHAM VOWEL SIGN AA..CHAM VOWEL SIGN OE AA31..AA32 ; NSM # Mn [2] CHAM VOWEL SIGN AU..CHAM VOWEL SIGN UE AA35..AA36 ; NSM # Mn [2] CHAM CONSONANT SIGN LA..CHAM CONSONANT SIGN WA AA43 ; NSM # Mn CHAM CONSONANT SIGN FINAL NG AA4C ; NSM # Mn CHAM CONSONANT SIGN FINAL M AA7C ; NSM # Mn MYANMAR SIGN TAI LAING TONE-2 AAB0 ; NSM # Mn TAI VIET MAI KANG AAB2..AAB4 ; NSM # Mn [3] TAI VIET VOWEL I..TAI VIET VOWEL U AAB7..AAB8 ; NSM # Mn [2] TAI VIET MAI KHIT..TAI VIET VOWEL IA AABE..AABF ; NSM # Mn [2] TAI VIET VOWEL AM..TAI VIET TONE MAI EK AAC1 ; NSM # Mn TAI VIET TONE MAI THO AAEC..AAED ; NSM # Mn [2] MEETEI MAYEK VOWEL SIGN UU..MEETEI MAYEK 
VOWEL SIGN AAI AAF6 ; NSM # Mn MEETEI MAYEK VIRAMA ABE5 ; NSM # Mn MEETEI MAYEK VOWEL SIGN ANAP ABE8 ; NSM # Mn MEETEI MAYEK VOWEL SIGN UNAP ABED ; NSM # Mn MEETEI MAYEK APUN IYEK FB1E ; NSM # Mn HEBREW POINT JUDEO-SPANISH VARIKA FE00..FE0F ; NSM # Mn [16] VARIATION SELECTOR-1..VARIATION SELECTOR-16 FE20..FE2F ; NSM # Mn [16] COMBINING LIGATURE LEFT HALF..COMBINING CYRILLIC TITLO RIGHT HALF 101FD ; NSM # Mn PHAISTOS DISC SIGN COMBINING OBLIQUE STROKE 102E0 ; NSM # Mn COPTIC EPACT THOUSANDS MARK 10376..1037A ; NSM # Mn [5] COMBINING OLD PERMIC LETTER AN..COMBINING OLD PERMIC LETTER SII 10A01..10A03 ; NSM # Mn [3] KHAROSHTHI VOWEL SIGN I..KHAROSHTHI VOWEL SIGN VOCALIC R 10A05..10A06 ; NSM # Mn [2] KHAROSHTHI VOWEL SIGN E..KHAROSHTHI VOWEL SIGN O 10A0C..10A0F ; NSM # Mn [4] KHAROSHTHI VOWEL LENGTH MARK..KHAROSHTHI SIGN VISARGA 10A38..10A3A ; NSM # Mn [3] KHAROSHTHI SIGN BAR ABOVE..KHAROSHTHI SIGN DOT BELOW 10A3F ; NSM # Mn KHAROSHTHI VIRAMA 10AE5..10AE6 ; NSM # Mn [2] MANICHAEAN ABBREVIATION MARK ABOVE..MANICHAEAN ABBREVIATION MARK BELOW 10D24..10D27 ; NSM # Mn [4] HANIFI ROHINGYA SIGN HARBAHAY..HANIFI ROHINGYA SIGN TASSI 10D69..10D6D ; NSM # Mn [5] GARAY VOWEL SIGN E..GARAY CONSONANT NASALIZATION MARK 10EAB..10EAC ; NSM # Mn [2] YEZIDI COMBINING HAMZA MARK..YEZIDI COMBINING MADDA MARK 10EFA..10EFF ; NSM # Mn [6] ARABIC DOUBLE VERTICAL BAR BELOW..ARABIC SMALL LOW WORD MADDA 10F46..10F50 ; NSM # Mn [11] SOGDIAN COMBINING DOT BELOW..SOGDIAN COMBINING STROKE BELOW 10F82..10F85 ; NSM # Mn [4] OLD UYGHUR COMBINING DOT ABOVE..OLD UYGHUR COMBINING TWO DOTS BELOW 11001 ; NSM # Mn BRAHMI SIGN ANUSVARA 11038..11046 ; NSM # Mn [15] BRAHMI VOWEL SIGN AA..BRAHMI VIRAMA 11070 ; NSM # Mn BRAHMI SIGN OLD TAMIL VIRAMA 11073..11074 ; NSM # Mn [2] BRAHMI VOWEL SIGN OLD TAMIL SHORT E..BRAHMI VOWEL SIGN OLD TAMIL SHORT O 1107F..11081 ; NSM # Mn [3] BRAHMI NUMBER JOINER..KAITHI SIGN ANUSVARA 110B3..110B6 ; NSM # Mn [4] KAITHI VOWEL SIGN U..KAITHI VOWEL SIGN AI 110B9..110BA ; NSM # Mn [2] 
KAITHI SIGN VIRAMA..KAITHI SIGN NUKTA 110C2 ; NSM # Mn KAITHI VOWEL SIGN VOCALIC R 11100..11102 ; NSM # Mn [3] CHAKMA SIGN CANDRABINDU..CHAKMA SIGN VISARGA 11127..1112B ; NSM # Mn [5] CHAKMA VOWEL SIGN A..CHAKMA VOWEL SIGN UU 1112D..11134 ; NSM # Mn [8] CHAKMA VOWEL SIGN AI..CHAKMA MAAYYAA 11173 ; NSM # Mn MAHAJANI SIGN NUKTA 11180..11181 ; NSM # Mn [2] SHARADA SIGN CANDRABINDU..SHARADA SIGN ANUSVARA 111B6..111BE ; NSM # Mn [9] SHARADA VOWEL SIGN U..SHARADA VOWEL SIGN O 111C9..111CC ; NSM # Mn [4] SHARADA SANDHI MARK..SHARADA EXTRA SHORT VOWEL MARK 111CF ; NSM # Mn SHARADA SIGN INVERTED CANDRABINDU 1122F..11231 ; NSM # Mn [3] KHOJKI VOWEL SIGN U..KHOJKI VOWEL SIGN AI 11234 ; NSM # Mn KHOJKI SIGN ANUSVARA 11236..11237 ; NSM # Mn [2] KHOJKI SIGN NUKTA..KHOJKI SIGN SHADDA 1123E ; NSM # Mn KHOJKI SIGN SUKUN 11241 ; NSM # Mn KHOJKI VOWEL SIGN VOCALIC R 112DF ; NSM # Mn KHUDAWADI SIGN ANUSVARA 112E3..112EA ; NSM # Mn [8] KHUDAWADI VOWEL SIGN U..KHUDAWADI SIGN VIRAMA 11300..11301 ; NSM # Mn [2] GRANTHA SIGN COMBINING ANUSVARA ABOVE..GRANTHA SIGN CANDRABINDU 1133B..1133C ; NSM # Mn [2] COMBINING BINDU BELOW..GRANTHA SIGN NUKTA 11340 ; NSM # Mn GRANTHA VOWEL SIGN II 11366..1136C ; NSM # Mn [7] COMBINING GRANTHA DIGIT ZERO..COMBINING GRANTHA DIGIT SIX 11370..11374 ; NSM # Mn [5] COMBINING GRANTHA LETTER A..COMBINING GRANTHA LETTER PA 113BB..113C0 ; NSM # Mn [6] TULU-TIGALARI VOWEL SIGN U..TULU-TIGALARI VOWEL SIGN VOCALIC LL 113CE ; NSM # Mn TULU-TIGALARI SIGN VIRAMA 113D0 ; NSM # Mn TULU-TIGALARI CONJOINER 113D2 ; NSM # Mn TULU-TIGALARI GEMINATION MARK 113E1..113E2 ; NSM # Mn [2] TULU-TIGALARI VEDIC TONE SVARITA..TULU-TIGALARI VEDIC TONE ANUDATTA 11438..1143F ; NSM # Mn [8] NEWA VOWEL SIGN U..NEWA VOWEL SIGN AI 11442..11444 ; NSM # Mn [3] NEWA SIGN VIRAMA..NEWA SIGN ANUSVARA 11446 ; NSM # Mn NEWA SIGN NUKTA 1145E ; NSM # Mn NEWA SANDHI MARK 114B3..114B8 ; NSM # Mn [6] TIRHUTA VOWEL SIGN U..TIRHUTA VOWEL SIGN VOCALIC LL 114BA ; NSM # Mn TIRHUTA VOWEL SIGN SHORT E 114BF..114C0 
; NSM # Mn [2] TIRHUTA SIGN CANDRABINDU..TIRHUTA SIGN ANUSVARA 114C2..114C3 ; NSM # Mn [2] TIRHUTA SIGN VIRAMA..TIRHUTA SIGN NUKTA 115B2..115B5 ; NSM # Mn [4] SIDDHAM VOWEL SIGN U..SIDDHAM VOWEL SIGN VOCALIC RR 115BC..115BD ; NSM # Mn [2] SIDDHAM SIGN CANDRABINDU..SIDDHAM SIGN ANUSVARA 115BF..115C0 ; NSM # Mn [2] SIDDHAM SIGN VIRAMA..SIDDHAM SIGN NUKTA 115DC..115DD ; NSM # Mn [2] SIDDHAM VOWEL SIGN ALTERNATE U..SIDDHAM VOWEL SIGN ALTERNATE UU 11633..1163A ; NSM # Mn [8] MODI VOWEL SIGN U..MODI VOWEL SIGN AI 1163D ; NSM # Mn MODI SIGN ANUSVARA 1163F..11640 ; NSM # Mn [2] MODI SIGN VIRAMA..MODI SIGN ARDHACANDRA 116AB ; NSM # Mn TAKRI SIGN ANUSVARA 116AD ; NSM # Mn TAKRI VOWEL SIGN AA 116B0..116B5 ; NSM # Mn [6] TAKRI VOWEL SIGN U..TAKRI VOWEL SIGN AU 116B7 ; NSM # Mn TAKRI SIGN NUKTA 1171D ; NSM # Mn AHOM CONSONANT SIGN MEDIAL LA 1171F ; NSM # Mn AHOM CONSONANT SIGN MEDIAL LIGATING RA 11722..11725 ; NSM # Mn [4] AHOM VOWEL SIGN I..AHOM VOWEL SIGN UU 11727..1172B ; NSM # Mn [5] AHOM VOWEL SIGN AW..AHOM SIGN KILLER 1182F..11837 ; NSM # Mn [9] DOGRA VOWEL SIGN U..DOGRA SIGN ANUSVARA 11839..1183A ; NSM # Mn [2] DOGRA SIGN VIRAMA..DOGRA SIGN NUKTA 1193B..1193C ; NSM # Mn [2] DIVES AKURU SIGN ANUSVARA..DIVES AKURU SIGN CANDRABINDU 1193E ; NSM # Mn DIVES AKURU VIRAMA 11943 ; NSM # Mn DIVES AKURU SIGN NUKTA 119D4..119D7 ; NSM # Mn [4] NANDINAGARI VOWEL SIGN U..NANDINAGARI VOWEL SIGN VOCALIC RR 119DA..119DB ; NSM # Mn [2] NANDINAGARI VOWEL SIGN E..NANDINAGARI VOWEL SIGN AI 119E0 ; NSM # Mn NANDINAGARI SIGN VIRAMA 11A01..11A06 ; NSM # Mn [6] ZANABAZAR SQUARE VOWEL SIGN I..ZANABAZAR SQUARE VOWEL SIGN O 11A09..11A0A ; NSM # Mn [2] ZANABAZAR SQUARE VOWEL SIGN REVERSED I..ZANABAZAR SQUARE VOWEL LENGTH MARK 11A33..11A38 ; NSM # Mn [6] ZANABAZAR SQUARE FINAL CONSONANT MARK..ZANABAZAR SQUARE SIGN ANUSVARA 11A3B..11A3E ; NSM # Mn [4] ZANABAZAR SQUARE CLUSTER-FINAL LETTER YA..ZANABAZAR SQUARE CLUSTER-FINAL LETTER VA 11A47 ; NSM # Mn ZANABAZAR SQUARE SUBJOINER 11A51..11A56 ; NSM # Mn 
[6] SOYOMBO VOWEL SIGN I..SOYOMBO VOWEL SIGN OE 11A59..11A5B ; NSM # Mn [3] SOYOMBO VOWEL SIGN VOCALIC R..SOYOMBO VOWEL LENGTH MARK 11A8A..11A96 ; NSM # Mn [13] SOYOMBO FINAL CONSONANT SIGN G..SOYOMBO SIGN ANUSVARA 11A98..11A99 ; NSM # Mn [2] SOYOMBO GEMINATION MARK..SOYOMBO SUBJOINER 11B60 ; NSM # Mn SHARADA VOWEL SIGN OE 11B62..11B64 ; NSM # Mn [3] SHARADA VOWEL SIGN UE..SHARADA VOWEL SIGN SHORT E 11B66 ; NSM # Mn SHARADA VOWEL SIGN CANDRA E 11C30..11C36 ; NSM # Mn [7] BHAIKSUKI VOWEL SIGN I..BHAIKSUKI VOWEL SIGN VOCALIC L 11C38..11C3D ; NSM # Mn [6] BHAIKSUKI VOWEL SIGN E..BHAIKSUKI SIGN ANUSVARA 11C92..11CA7 ; NSM # Mn [22] MARCHEN SUBJOINED LETTER KA..MARCHEN SUBJOINED LETTER ZA 11CAA..11CB0 ; NSM # Mn [7] MARCHEN SUBJOINED LETTER RA..MARCHEN VOWEL SIGN AA 11CB2..11CB3 ; NSM # Mn [2] MARCHEN VOWEL SIGN U..MARCHEN VOWEL SIGN E 11CB5..11CB6 ; NSM # Mn [2] MARCHEN SIGN ANUSVARA..MARCHEN SIGN CANDRABINDU 11D31..11D36 ; NSM # Mn [6] MASARAM GONDI VOWEL SIGN AA..MASARAM GONDI VOWEL SIGN VOCALIC R 11D3A ; NSM # Mn MASARAM GONDI VOWEL SIGN E 11D3C..11D3D ; NSM # Mn [2] MASARAM GONDI VOWEL SIGN AI..MASARAM GONDI VOWEL SIGN O 11D3F..11D45 ; NSM # Mn [7] MASARAM GONDI VOWEL SIGN AU..MASARAM GONDI VIRAMA 11D47 ; NSM # Mn MASARAM GONDI RA-KARA 11D90..11D91 ; NSM # Mn [2] GUNJALA GONDI VOWEL SIGN EE..GUNJALA GONDI VOWEL SIGN AI 11D95 ; NSM # Mn GUNJALA GONDI SIGN ANUSVARA 11D97 ; NSM # Mn GUNJALA GONDI VIRAMA 11EF3..11EF4 ; NSM # Mn [2] MAKASAR VOWEL SIGN I..MAKASAR VOWEL SIGN U 11F00..11F01 ; NSM # Mn [2] KAWI SIGN CANDRABINDU..KAWI SIGN ANUSVARA 11F36..11F3A ; NSM # Mn [5] KAWI VOWEL SIGN I..KAWI VOWEL SIGN VOCALIC R 11F40 ; NSM # Mn KAWI VOWEL SIGN EU 11F42 ; NSM # Mn KAWI CONJOINER 11F5A ; NSM # Mn KAWI SIGN NUKTA 13440 ; NSM # Mn EGYPTIAN HIEROGLYPH MIRROR HORIZONTALLY 13447..13455 ; NSM # Mn [15] EGYPTIAN HIEROGLYPH MODIFIER DAMAGED AT TOP START..EGYPTIAN HIEROGLYPH MODIFIER DAMAGED 1611E..16129 ; NSM # Mn [12] GURUNG KHEMA VOWEL SIGN AA..GURUNG KHEMA VOWEL LENGTH 
MARK 1612D..1612F ; NSM # Mn [3] GURUNG KHEMA SIGN ANUSVARA..GURUNG KHEMA SIGN THOLHOMA 16AF0..16AF4 ; NSM # Mn [5] BASSA VAH COMBINING HIGH TONE..BASSA VAH COMBINING HIGH-LOW TONE 16B30..16B36 ; NSM # Mn [7] PAHAWH HMONG MARK CIM TUB..PAHAWH HMONG MARK CIM TAUM 16F4F ; NSM # Mn MIAO SIGN CONSONANT MODIFIER BAR 16F8F..16F92 ; NSM # Mn [4] MIAO TONE RIGHT..MIAO TONE BELOW 16FE4 ; NSM # Mn KHITAN SMALL SCRIPT FILLER 1BC9D..1BC9E ; NSM # Mn [2] DUPLOYAN THICK LETTER SELECTOR..DUPLOYAN DOUBLE MARK 1CF00..1CF2D ; NSM # Mn [46] ZNAMENNY COMBINING MARK GORAZDO NIZKO S KRYZHEM ON LEFT..ZNAMENNY COMBINING MARK KRYZH ON LEFT 1CF30..1CF46 ; NSM # Mn [23] ZNAMENNY COMBINING TONAL RANGE MARK MRACHNO..ZNAMENNY PRIZNAK MODIFIER ROG 1D167..1D169 ; NSM # Mn [3] MUSICAL SYMBOL COMBINING TREMOLO-1..MUSICAL SYMBOL COMBINING TREMOLO-3 1D17B..1D182 ; NSM # Mn [8] MUSICAL SYMBOL COMBINING ACCENT..MUSICAL SYMBOL COMBINING LOURE 1D185..1D18B ; NSM # Mn [7] MUSICAL SYMBOL COMBINING DOIT..MUSICAL SYMBOL COMBINING TRIPLE TONGUE 1D1AA..1D1AD ; NSM # Mn [4] MUSICAL SYMBOL COMBINING DOWN BOW..MUSICAL SYMBOL COMBINING SNAP PIZZICATO 1D242..1D244 ; NSM # Mn [3] COMBINING GREEK MUSICAL TRISEME..COMBINING GREEK MUSICAL PENTASEME 1DA00..1DA36 ; NSM # Mn [55] SIGNWRITING HEAD RIM..SIGNWRITING AIR SUCKING IN 1DA3B..1DA6C ; NSM # Mn [50] SIGNWRITING MOUTH CLOSED NEUTRAL..SIGNWRITING EXCITEMENT 1DA75 ; NSM # Mn SIGNWRITING UPPER BODY TILTING FROM HIP JOINTS 1DA84 ; NSM # Mn SIGNWRITING LOCATION HEAD NECK 1DA9B..1DA9F ; NSM # Mn [5] SIGNWRITING FILL MODIFIER-2..SIGNWRITING FILL MODIFIER-6 1DAA1..1DAAF ; NSM # Mn [15] SIGNWRITING ROTATION MODIFIER-2..SIGNWRITING ROTATION MODIFIER-16 1E000..1E006 ; NSM # Mn [7] COMBINING GLAGOLITIC LETTER AZU..COMBINING GLAGOLITIC LETTER ZHIVETE 1E008..1E018 ; NSM # Mn [17] COMBINING GLAGOLITIC LETTER ZEMLJA..COMBINING GLAGOLITIC LETTER HERU 1E01B..1E021 ; NSM # Mn [7] COMBINING GLAGOLITIC LETTER SHTA..COMBINING GLAGOLITIC LETTER YATI 1E023..1E024 ; NSM # Mn [2] COMBINING 
GLAGOLITIC LETTER YU..COMBINING GLAGOLITIC LETTER SMALL YUS 1E026..1E02A ; NSM # Mn [5] COMBINING GLAGOLITIC LETTER YO..COMBINING GLAGOLITIC LETTER FITA 1E08F ; NSM # Mn COMBINING CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I 1E130..1E136 ; NSM # Mn [7] NYIAKENG PUACHUE HMONG TONE-B..NYIAKENG PUACHUE HMONG TONE-D 1E2AE ; NSM # Mn TOTO SIGN RISING TONE 1E2EC..1E2EF ; NSM # Mn [4] WANCHO TONE TUP..WANCHO TONE KOINI 1E4EC..1E4EF ; NSM # Mn [4] NAG MUNDARI SIGN MUHOR..NAG MUNDARI SIGN SUTUH 1E5EE..1E5EF ; NSM # Mn [2] OL ONAL SIGN MU..OL ONAL SIGN IKIR 1E6E3 ; NSM # Mn TAI YO SIGN UE 1E6E6 ; NSM # Mn TAI YO SIGN AU 1E6EE..1E6EF ; NSM # Mn [2] TAI YO SIGN AY..TAI YO SIGN ANG 1E6F5 ; NSM # Mn TAI YO SIGN OM 1E8D0..1E8D6 ; NSM # Mn [7] MENDE KIKAKUI COMBINING NUMBER TEENS..MENDE KIKAKUI COMBINING NUMBER MILLIONS 1E944..1E94A ; NSM # Mn [7] ADLAM ALIF LENGTHENER..ADLAM NUKTA E0100..E01EF ; NSM # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256 # Total code points: 2067 # ================================================ # Bidi_Class=Arabic_Letter 0608 ; AL # Sm ARABIC RAY 060B ; AL # Sc AFGHANI SIGN 060D ; AL # Po ARABIC DATE SEPARATOR 061B ; AL # Po ARABIC SEMICOLON 061C ; AL # Cf ARABIC LETTER MARK 061D..061F ; AL # Po [3] ARABIC END OF TEXT MARK..ARABIC QUESTION MARK 0620..063F ; AL # Lo [32] ARABIC LETTER KASHMIRI YEH..ARABIC LETTER FARSI YEH WITH THREE DOTS ABOVE 0640 ; AL # Lm ARABIC TATWEEL 0641..064A ; AL # Lo [10] ARABIC LETTER FEH..ARABIC LETTER YEH 066D ; AL # Po ARABIC FIVE POINTED STAR 066E..066F ; AL # Lo [2] ARABIC LETTER DOTLESS BEH..ARABIC LETTER DOTLESS QAF 0671..06D3 ; AL # Lo [99] ARABIC LETTER ALEF WASLA..ARABIC LETTER YEH BARREE WITH HAMZA ABOVE 06D4 ; AL # Po ARABIC FULL STOP 06D5 ; AL # Lo ARABIC LETTER AE 06E5..06E6 ; AL # Lm [2] ARABIC SMALL WAW..ARABIC SMALL YEH 06EE..06EF ; AL # Lo [2] ARABIC LETTER DAL WITH INVERTED V..ARABIC LETTER REH WITH INVERTED V 06FA..06FC ; AL # Lo [3] ARABIC LETTER SHEEN WITH DOT BELOW..ARABIC LETTER GHAIN WITH 
DOT BELOW 06FD..06FE ; AL # So [2] ARABIC SIGN SINDHI AMPERSAND..ARABIC SIGN SINDHI POSTPOSITION MEN 06FF ; AL # Lo ARABIC LETTER HEH WITH INVERTED V 0700..070D ; AL # Po [14] SYRIAC END OF PARAGRAPH..SYRIAC HARKLEAN ASTERISCUS 070F ; AL # Cf SYRIAC ABBREVIATION MARK 0710 ; AL # Lo SYRIAC LETTER ALAPH 0712..072F ; AL # Lo [30] SYRIAC LETTER BETH..SYRIAC LETTER PERSIAN DHALATH 074D..07A5 ; AL # Lo [89] SYRIAC LETTER SOGDIAN ZHAIN..THAANA LETTER WAAVU 07B1 ; AL # Lo THAANA LETTER NAA 0860..086A ; AL # Lo [11] SYRIAC LETTER MALAYALAM NGA..SYRIAC LETTER MALAYALAM SSA 0870..0887 ; AL # Lo [24] ARABIC LETTER ALEF WITH ATTACHED FATHA..ARABIC BASELINE ROUND DOT 0888 ; AL # Sk ARABIC RAISED ROUND DOT 0889..088F ; AL # Lo [7] ARABIC LETTER NOON WITH INVERTED SMALL V..ARABIC LETTER NOON WITH RING ABOVE 08A0..08C8 ; AL # Lo [41] ARABIC LETTER BEH WITH SMALL V BELOW..ARABIC LETTER GRAF 08C9 ; AL # Lm ARABIC SMALL FARSI YEH FB50..FBB1 ; AL # Lo [98] ARABIC LETTER ALEF WASLA ISOLATED FORM..ARABIC LETTER YEH BARREE WITH HAMZA ABOVE FINAL FORM FBB2..FBC2 ; AL # Sk [17] ARABIC SYMBOL DOT ABOVE..ARABIC SYMBOL WASLA ABOVE FBD3..FD3D ; AL # Lo [363] ARABIC LETTER NG ISOLATED FORM..ARABIC LIGATURE ALEF WITH FATHATAN ISOLATED FORM FD50..FD8F ; AL # Lo [64] ARABIC LIGATURE TEH WITH JEEM WITH MEEM INITIAL FORM..ARABIC LIGATURE MEEM WITH KHAH WITH MEEM INITIAL FORM FD92..FDC7 ; AL # Lo [54] ARABIC LIGATURE MEEM WITH JEEM WITH KHAH INITIAL FORM..ARABIC LIGATURE NOON WITH JEEM WITH YEH FINAL FORM FDF0..FDFB ; AL # Lo [12] ARABIC LIGATURE SALLA USED AS KORANIC STOP SIGN ISOLATED FORM..ARABIC LIGATURE JALLAJALALOUHOU FDFC ; AL # Sc RIAL SIGN FE70..FE74 ; AL # Lo [5] ARABIC FATHATAN ISOLATED FORM..ARABIC KASRATAN ISOLATED FORM FE76..FEFC ; AL # Lo [135] ARABIC FATHA ISOLATED FORM..ARABIC LIGATURE LAM WITH ALEF FINAL FORM 10D00..10D23 ; AL # Lo [36] HANIFI ROHINGYA LETTER A..HANIFI ROHINGYA MARK NA KHONNA 10EC2..10EC4 ; AL # Lo [3] ARABIC LETTER DAL WITH TWO DOTS VERTICALLY BELOW..ARABIC LETTER 
KAF WITH TWO DOTS VERTICALLY BELOW 10EC5 ; AL # Lm ARABIC SMALL YEH BARREE WITH TWO DOTS BELOW 10EC6..10EC7 ; AL # Lo [2] ARABIC LETTER THIN NOON..ARABIC LETTER YEH WITH FOUR DOTS BELOW 10F30..10F45 ; AL # Lo [22] SOGDIAN LETTER ALEPH..SOGDIAN INDEPENDENT SHIN 10F51..10F54 ; AL # No [4] SOGDIAN NUMBER ONE..SOGDIAN NUMBER ONE HUNDRED 10F55..10F59 ; AL # Po [5] SOGDIAN PUNCTUATION TWO VERTICAL BARS..SOGDIAN PUNCTUATION HALF CIRCLE WITH DOT 1EC71..1ECAB ; AL # No [59] INDIC SIYAQ NUMBER ONE..INDIC SIYAQ NUMBER PREFIXED NINE 1ECAC ; AL # So INDIC SIYAQ PLACEHOLDER 1ECAD..1ECAF ; AL # No [3] INDIC SIYAQ FRACTION ONE QUARTER..INDIC SIYAQ FRACTION THREE QUARTERS 1ECB0 ; AL # Sc INDIC SIYAQ RUPEE MARK 1ECB1..1ECB4 ; AL # No [4] INDIC SIYAQ NUMBER ALTERNATE ONE..INDIC SIYAQ ALTERNATE LAKH MARK 1ED01..1ED2D ; AL # No [45] OTTOMAN SIYAQ NUMBER ONE..OTTOMAN SIYAQ NUMBER NINETY THOUSAND 1ED2E ; AL # So OTTOMAN SIYAQ MARRATAN 1ED2F..1ED3D ; AL # No [15] OTTOMAN SIYAQ ALTERNATE NUMBER TWO..OTTOMAN SIYAQ FRACTION ONE SIXTH 1EE00..1EE03 ; AL # Lo [4] ARABIC MATHEMATICAL ALEF..ARABIC MATHEMATICAL DAL 1EE05..1EE1F ; AL # Lo [27] ARABIC MATHEMATICAL WAW..ARABIC MATHEMATICAL DOTLESS QAF 1EE21..1EE22 ; AL # Lo [2] ARABIC MATHEMATICAL INITIAL BEH..ARABIC MATHEMATICAL INITIAL JEEM 1EE24 ; AL # Lo ARABIC MATHEMATICAL INITIAL HEH 1EE27 ; AL # Lo ARABIC MATHEMATICAL INITIAL HAH 1EE29..1EE32 ; AL # Lo [10] ARABIC MATHEMATICAL INITIAL YEH..ARABIC MATHEMATICAL INITIAL QAF 1EE34..1EE37 ; AL # Lo [4] ARABIC MATHEMATICAL INITIAL SHEEN..ARABIC MATHEMATICAL INITIAL KHAH 1EE39 ; AL # Lo ARABIC MATHEMATICAL INITIAL DAD 1EE3B ; AL # Lo ARABIC MATHEMATICAL INITIAL GHAIN 1EE42 ; AL # Lo ARABIC MATHEMATICAL TAILED JEEM 1EE47 ; AL # Lo ARABIC MATHEMATICAL TAILED HAH 1EE49 ; AL # Lo ARABIC MATHEMATICAL TAILED YEH 1EE4B ; AL # Lo ARABIC MATHEMATICAL TAILED LAM 1EE4D..1EE4F ; AL # Lo [3] ARABIC MATHEMATICAL TAILED NOON..ARABIC MATHEMATICAL TAILED AIN 1EE51..1EE52 ; AL # Lo [2] ARABIC MATHEMATICAL TAILED 
SAD..ARABIC MATHEMATICAL TAILED QAF 1EE54 ; AL # Lo ARABIC MATHEMATICAL TAILED SHEEN 1EE57 ; AL # Lo ARABIC MATHEMATICAL TAILED KHAH 1EE59 ; AL # Lo ARABIC MATHEMATICAL TAILED DAD 1EE5B ; AL # Lo ARABIC MATHEMATICAL TAILED GHAIN 1EE5D ; AL # Lo ARABIC MATHEMATICAL TAILED DOTLESS NOON 1EE5F ; AL # Lo ARABIC MATHEMATICAL TAILED DOTLESS QAF 1EE61..1EE62 ; AL # Lo [2] ARABIC MATHEMATICAL STRETCHED BEH..ARABIC MATHEMATICAL STRETCHED JEEM 1EE64 ; AL # Lo ARABIC MATHEMATICAL STRETCHED HEH 1EE67..1EE6A ; AL # Lo [4] ARABIC MATHEMATICAL STRETCHED HAH..ARABIC MATHEMATICAL STRETCHED KAF 1EE6C..1EE72 ; AL # Lo [7] ARABIC MATHEMATICAL STRETCHED MEEM..ARABIC MATHEMATICAL STRETCHED QAF 1EE74..1EE77 ; AL # Lo [4] ARABIC MATHEMATICAL STRETCHED SHEEN..ARABIC MATHEMATICAL STRETCHED KHAH 1EE79..1EE7C ; AL # Lo [4] ARABIC MATHEMATICAL STRETCHED DAD..ARABIC MATHEMATICAL STRETCHED DOTLESS BEH 1EE7E ; AL # Lo ARABIC MATHEMATICAL STRETCHED DOTLESS FEH 1EE80..1EE89 ; AL # Lo [10] ARABIC MATHEMATICAL LOOPED ALEF..ARABIC MATHEMATICAL LOOPED YEH 1EE8B..1EE9B ; AL # Lo [17] ARABIC MATHEMATICAL LOOPED LAM..ARABIC MATHEMATICAL LOOPED GHAIN 1EEA1..1EEA3 ; AL # Lo [3] ARABIC MATHEMATICAL DOUBLE-STRUCK BEH..ARABIC MATHEMATICAL DOUBLE-STRUCK DAL 1EEA5..1EEA9 ; AL # Lo [5] ARABIC MATHEMATICAL DOUBLE-STRUCK WAW..ARABIC MATHEMATICAL DOUBLE-STRUCK YEH 1EEAB..1EEBB ; AL # Lo [17] ARABIC MATHEMATICAL DOUBLE-STRUCK LAM..ARABIC MATHEMATICAL DOUBLE-STRUCK GHAIN # The above property value applies to 253 code points not listed here. 
# Total code points: 1731 # ================================================ # Bidi_Class=Left_To_Right_Override 202D ; LRO # Cf LEFT-TO-RIGHT OVERRIDE # Total code points: 1 # ================================================ # Bidi_Class=Right_To_Left_Override 202E ; RLO # Cf RIGHT-TO-LEFT OVERRIDE # Total code points: 1 # ================================================ # Bidi_Class=Left_To_Right_Embedding 202A ; LRE # Cf LEFT-TO-RIGHT EMBEDDING # Total code points: 1 # ================================================ # Bidi_Class=Right_To_Left_Embedding 202B ; RLE # Cf RIGHT-TO-LEFT EMBEDDING # Total code points: 1 # ================================================ # Bidi_Class=Pop_Directional_Format 202C ; PDF # Cf POP DIRECTIONAL FORMATTING # Total code points: 1 # ================================================ # Bidi_Class=Left_To_Right_Isolate 2066 ; LRI # Cf LEFT-TO-RIGHT ISOLATE # Total code points: 1 # ================================================ # Bidi_Class=Right_To_Left_Isolate 2067 ; RLI # Cf RIGHT-TO-LEFT ISOLATE # Total code points: 1 # ================================================ # Bidi_Class=First_Strong_Isolate 2068 ; FSI # Cf FIRST STRONG ISOLATE # Total code points: 1 # ================================================ # Bidi_Class=Pop_Directional_Isolate 2069 ; PDI # Cf POP DIRECTIONAL ISOLATE # Total code points: 1 # EOF ================================================ FILE: maint/Unicode.tables/DerivedCoreProperties.txt ================================================ # DerivedCoreProperties-17.0.0.txt # Date: 2025-07-30, 23:55:08 GMT # © 2025 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. 
# For terms of use and license, see https://www.unicode.org/terms_of_use.html # # Unicode Character Database # For documentation, see https://www.unicode.org/reports/tr44/ # ================================================ # Derived Property: Math # Generated from: Sm + Other_Math 002B ; Math # Sm PLUS SIGN 003C..003E ; Math # Sm [3] LESS-THAN SIGN..GREATER-THAN SIGN 005E ; Math # Sk CIRCUMFLEX ACCENT 007C ; Math # Sm VERTICAL LINE 007E ; Math # Sm TILDE 00AC ; Math # Sm NOT SIGN 00B1 ; Math # Sm PLUS-MINUS SIGN 00D7 ; Math # Sm MULTIPLICATION SIGN 00F7 ; Math # Sm DIVISION SIGN 03D0..03D2 ; Math # L& [3] GREEK BETA SYMBOL..GREEK UPSILON WITH HOOK SYMBOL 03D5 ; Math # L& GREEK PHI SYMBOL 03F0..03F1 ; Math # L& [2] GREEK KAPPA SYMBOL..GREEK RHO SYMBOL 03F4..03F5 ; Math # L& [2] GREEK CAPITAL THETA SYMBOL..GREEK LUNATE EPSILON SYMBOL 03F6 ; Math # Sm GREEK REVERSED LUNATE EPSILON SYMBOL 0606..0608 ; Math # Sm [3] ARABIC-INDIC CUBE ROOT..ARABIC RAY 2016 ; Math # Po DOUBLE VERTICAL LINE 2032..2034 ; Math # Po [3] PRIME..TRIPLE PRIME 2040 ; Math # Pc CHARACTER TIE 2044 ; Math # Sm FRACTION SLASH 2052 ; Math # Sm COMMERCIAL MINUS SIGN 2061..2064 ; Math # Cf [4] FUNCTION APPLICATION..INVISIBLE PLUS 207A..207C ; Math # Sm [3] SUPERSCRIPT PLUS SIGN..SUPERSCRIPT EQUALS SIGN 207D ; Math # Ps SUPERSCRIPT LEFT PARENTHESIS 207E ; Math # Pe SUPERSCRIPT RIGHT PARENTHESIS 208A..208C ; Math # Sm [3] SUBSCRIPT PLUS SIGN..SUBSCRIPT EQUALS SIGN 208D ; Math # Ps SUBSCRIPT LEFT PARENTHESIS 208E ; Math # Pe SUBSCRIPT RIGHT PARENTHESIS 20D0..20DC ; Math # Mn [13] COMBINING LEFT HARPOON ABOVE..COMBINING FOUR DOTS ABOVE 20E1 ; Math # Mn COMBINING LEFT RIGHT ARROW ABOVE 20E5..20E6 ; Math # Mn [2] COMBINING REVERSE SOLIDUS OVERLAY..COMBINING DOUBLE VERTICAL STROKE OVERLAY 20EB..20EF ; Math # Mn [5] COMBINING LONG DOUBLE SOLIDUS OVERLAY..COMBINING RIGHT ARROW BELOW 2102 ; Math # L& DOUBLE-STRUCK CAPITAL C 2107 ; Math # L& EULER CONSTANT 210A..2113 ; Math # L& [10] SCRIPT SMALL G..SCRIPT SMALL L 
2115 ; Math # L& DOUBLE-STRUCK CAPITAL N 2118 ; Math # Sm SCRIPT CAPITAL P 2119..211D ; Math # L& [5] DOUBLE-STRUCK CAPITAL P..DOUBLE-STRUCK CAPITAL R 2124 ; Math # L& DOUBLE-STRUCK CAPITAL Z 2128 ; Math # L& BLACK-LETTER CAPITAL Z 2129 ; Math # So TURNED GREEK SMALL LETTER IOTA 212C..212D ; Math # L& [2] SCRIPT CAPITAL B..BLACK-LETTER CAPITAL C 212F..2131 ; Math # L& [3] SCRIPT SMALL E..SCRIPT CAPITAL F 2133..2134 ; Math # L& [2] SCRIPT CAPITAL M..SCRIPT SMALL O 2135..2138 ; Math # Lo [4] ALEF SYMBOL..DALET SYMBOL 213C..213F ; Math # L& [4] DOUBLE-STRUCK SMALL PI..DOUBLE-STRUCK CAPITAL PI 2140..2144 ; Math # Sm [5] DOUBLE-STRUCK N-ARY SUMMATION..TURNED SANS-SERIF CAPITAL Y 2145..2149 ; Math # L& [5] DOUBLE-STRUCK ITALIC CAPITAL D..DOUBLE-STRUCK ITALIC SMALL J 214B ; Math # Sm TURNED AMPERSAND 2190..2194 ; Math # Sm [5] LEFTWARDS ARROW..LEFT RIGHT ARROW 2195..2199 ; Math # So [5] UP DOWN ARROW..SOUTH WEST ARROW 219A..219B ; Math # Sm [2] LEFTWARDS ARROW WITH STROKE..RIGHTWARDS ARROW WITH STROKE 219C..219F ; Math # So [4] LEFTWARDS WAVE ARROW..UPWARDS TWO HEADED ARROW 21A0 ; Math # Sm RIGHTWARDS TWO HEADED ARROW 21A1..21A2 ; Math # So [2] DOWNWARDS TWO HEADED ARROW..LEFTWARDS ARROW WITH TAIL 21A3 ; Math # Sm RIGHTWARDS ARROW WITH TAIL 21A4..21A5 ; Math # So [2] LEFTWARDS ARROW FROM BAR..UPWARDS ARROW FROM BAR 21A6 ; Math # Sm RIGHTWARDS ARROW FROM BAR 21A7 ; Math # So DOWNWARDS ARROW FROM BAR 21A9..21AD ; Math # So [5] LEFTWARDS ARROW WITH HOOK..LEFT RIGHT WAVE ARROW 21AE ; Math # Sm LEFT RIGHT ARROW WITH STROKE 21B0..21B1 ; Math # So [2] UPWARDS ARROW WITH TIP LEFTWARDS..UPWARDS ARROW WITH TIP RIGHTWARDS 21B6..21B7 ; Math # So [2] ANTICLOCKWISE TOP SEMICIRCLE ARROW..CLOCKWISE TOP SEMICIRCLE ARROW 21BC..21CD ; Math # So [18] LEFTWARDS HARPOON WITH BARB UPWARDS..LEFTWARDS DOUBLE ARROW WITH STROKE 21CE..21CF ; Math # Sm [2] LEFT RIGHT DOUBLE ARROW WITH STROKE..RIGHTWARDS DOUBLE ARROW WITH STROKE 21D0..21D1 ; Math # So [2] LEFTWARDS DOUBLE ARROW..UPWARDS DOUBLE ARROW 
21D2 ; Math # Sm RIGHTWARDS DOUBLE ARROW 21D3 ; Math # So DOWNWARDS DOUBLE ARROW 21D4 ; Math # Sm LEFT RIGHT DOUBLE ARROW 21D5..21DB ; Math # So [7] UP DOWN DOUBLE ARROW..RIGHTWARDS TRIPLE ARROW 21DD ; Math # So RIGHTWARDS SQUIGGLE ARROW 21E4..21E5 ; Math # So [2] LEFTWARDS ARROW TO BAR..RIGHTWARDS ARROW TO BAR 21F4..22FF ; Math # Sm [268] RIGHT ARROW WITH SMALL CIRCLE..Z NOTATION BAG MEMBERSHIP 2308 ; Math # Ps LEFT CEILING 2309 ; Math # Pe RIGHT CEILING 230A ; Math # Ps LEFT FLOOR 230B ; Math # Pe RIGHT FLOOR 2320..2321 ; Math # Sm [2] TOP HALF INTEGRAL..BOTTOM HALF INTEGRAL 237C ; Math # Sm RIGHT ANGLE WITH DOWNWARDS ZIGZAG ARROW 239B..23B3 ; Math # Sm [25] LEFT PARENTHESIS UPPER HOOK..SUMMATION BOTTOM 23B4..23B5 ; Math # So [2] TOP SQUARE BRACKET..BOTTOM SQUARE BRACKET 23B7 ; Math # So RADICAL SYMBOL BOTTOM 23D0 ; Math # So VERTICAL LINE EXTENSION 23DC..23E1 ; Math # Sm [6] TOP PARENTHESIS..BOTTOM TORTOISE SHELL BRACKET 23E2 ; Math # So WHITE TRAPEZIUM 25A0..25A1 ; Math # So [2] BLACK SQUARE..WHITE SQUARE 25AE..25B6 ; Math # So [9] BLACK VERTICAL RECTANGLE..BLACK RIGHT-POINTING TRIANGLE 25B7 ; Math # Sm WHITE RIGHT-POINTING TRIANGLE 25BC..25C0 ; Math # So [5] BLACK DOWN-POINTING TRIANGLE..BLACK LEFT-POINTING TRIANGLE 25C1 ; Math # Sm WHITE LEFT-POINTING TRIANGLE 25C6..25C7 ; Math # So [2] BLACK DIAMOND..WHITE DIAMOND 25CA..25CB ; Math # So [2] LOZENGE..WHITE CIRCLE 25CF..25D3 ; Math # So [5] BLACK CIRCLE..CIRCLE WITH UPPER HALF BLACK 25E2 ; Math # So BLACK LOWER RIGHT TRIANGLE 25E4 ; Math # So BLACK UPPER LEFT TRIANGLE 25E7..25EC ; Math # So [6] SQUARE WITH LEFT HALF BLACK..WHITE UP-POINTING TRIANGLE WITH DOT 25F8..25FF ; Math # Sm [8] UPPER LEFT TRIANGLE..LOWER RIGHT TRIANGLE 2605..2606 ; Math # So [2] BLACK STAR..WHITE STAR 2640 ; Math # So FEMALE SIGN 2642 ; Math # So MALE SIGN 2660..2663 ; Math # So [4] BLACK SPADE SUIT..BLACK CLUB SUIT 266D..266E ; Math # So [2] MUSIC FLAT SIGN..MUSIC NATURAL SIGN 266F ; Math # Sm MUSIC SHARP SIGN 27C0..27C4 ; Math # Sm 
[5] THREE DIMENSIONAL ANGLE..OPEN SUPERSET 27C5 ; Math # Ps LEFT S-SHAPED BAG DELIMITER 27C6 ; Math # Pe RIGHT S-SHAPED BAG DELIMITER 27C7..27E5 ; Math # Sm [31] OR WITH DOT INSIDE..WHITE SQUARE WITH RIGHTWARDS TICK 27E6 ; Math # Ps MATHEMATICAL LEFT WHITE SQUARE BRACKET 27E7 ; Math # Pe MATHEMATICAL RIGHT WHITE SQUARE BRACKET 27E8 ; Math # Ps MATHEMATICAL LEFT ANGLE BRACKET 27E9 ; Math # Pe MATHEMATICAL RIGHT ANGLE BRACKET 27EA ; Math # Ps MATHEMATICAL LEFT DOUBLE ANGLE BRACKET 27EB ; Math # Pe MATHEMATICAL RIGHT DOUBLE ANGLE BRACKET 27EC ; Math # Ps MATHEMATICAL LEFT WHITE TORTOISE SHELL BRACKET 27ED ; Math # Pe MATHEMATICAL RIGHT WHITE TORTOISE SHELL BRACKET 27EE ; Math # Ps MATHEMATICAL LEFT FLATTENED PARENTHESIS 27EF ; Math # Pe MATHEMATICAL RIGHT FLATTENED PARENTHESIS 27F0..27FF ; Math # Sm [16] UPWARDS QUADRUPLE ARROW..LONG RIGHTWARDS SQUIGGLE ARROW 2900..2982 ; Math # Sm [131] RIGHTWARDS TWO-HEADED ARROW WITH VERTICAL STROKE..Z NOTATION TYPE COLON 2983 ; Math # Ps LEFT WHITE CURLY BRACKET 2984 ; Math # Pe RIGHT WHITE CURLY BRACKET 2985 ; Math # Ps LEFT WHITE PARENTHESIS 2986 ; Math # Pe RIGHT WHITE PARENTHESIS 2987 ; Math # Ps Z NOTATION LEFT IMAGE BRACKET 2988 ; Math # Pe Z NOTATION RIGHT IMAGE BRACKET 2989 ; Math # Ps Z NOTATION LEFT BINDING BRACKET 298A ; Math # Pe Z NOTATION RIGHT BINDING BRACKET 298B ; Math # Ps LEFT SQUARE BRACKET WITH UNDERBAR 298C ; Math # Pe RIGHT SQUARE BRACKET WITH UNDERBAR 298D ; Math # Ps LEFT SQUARE BRACKET WITH TICK IN TOP CORNER 298E ; Math # Pe RIGHT SQUARE BRACKET WITH TICK IN BOTTOM CORNER 298F ; Math # Ps LEFT SQUARE BRACKET WITH TICK IN BOTTOM CORNER 2990 ; Math # Pe RIGHT SQUARE BRACKET WITH TICK IN TOP CORNER 2991 ; Math # Ps LEFT ANGLE BRACKET WITH DOT 2992 ; Math # Pe RIGHT ANGLE BRACKET WITH DOT 2993 ; Math # Ps LEFT ARC LESS-THAN BRACKET 2994 ; Math # Pe RIGHT ARC GREATER-THAN BRACKET 2995 ; Math # Ps DOUBLE LEFT ARC GREATER-THAN BRACKET 2996 ; Math # Pe DOUBLE RIGHT ARC LESS-THAN BRACKET 2997 ; Math # Ps LEFT 
BLACK TORTOISE SHELL BRACKET 2998 ; Math # Pe RIGHT BLACK TORTOISE SHELL BRACKET 2999..29D7 ; Math # Sm [63] DOTTED FENCE..BLACK HOURGLASS 29D8 ; Math # Ps LEFT WIGGLY FENCE 29D9 ; Math # Pe RIGHT WIGGLY FENCE 29DA ; Math # Ps LEFT DOUBLE WIGGLY FENCE 29DB ; Math # Pe RIGHT DOUBLE WIGGLY FENCE 29DC..29FB ; Math # Sm [32] INCOMPLETE INFINITY..TRIPLE PLUS 29FC ; Math # Ps LEFT-POINTING CURVED ANGLE BRACKET 29FD ; Math # Pe RIGHT-POINTING CURVED ANGLE BRACKET 29FE..2AFF ; Math # Sm [258] TINY..N-ARY WHITE VERTICAL BAR 2B30..2B44 ; Math # Sm [21] LEFT ARROW WITH SMALL CIRCLE..RIGHTWARDS ARROW THROUGH SUPERSET 2B47..2B4C ; Math # Sm [6] REVERSE TILDE OPERATOR ABOVE RIGHTWARDS ARROW..RIGHTWARDS ARROW ABOVE REVERSE TILDE OPERATOR FB29 ; Math # Sm HEBREW LETTER ALTERNATIVE PLUS SIGN FE61 ; Math # Po SMALL ASTERISK FE62 ; Math # Sm SMALL PLUS SIGN FE63 ; Math # Pd SMALL HYPHEN-MINUS FE64..FE66 ; Math # Sm [3] SMALL LESS-THAN SIGN..SMALL EQUALS SIGN FE68 ; Math # Po SMALL REVERSE SOLIDUS FF0B ; Math # Sm FULLWIDTH PLUS SIGN FF1C..FF1E ; Math # Sm [3] FULLWIDTH LESS-THAN SIGN..FULLWIDTH GREATER-THAN SIGN FF3C ; Math # Po FULLWIDTH REVERSE SOLIDUS FF3E ; Math # Sk FULLWIDTH CIRCUMFLEX ACCENT FF5C ; Math # Sm FULLWIDTH VERTICAL LINE FF5E ; Math # Sm FULLWIDTH TILDE FFE2 ; Math # Sm FULLWIDTH NOT SIGN FFE9..FFEC ; Math # Sm [4] HALFWIDTH LEFTWARDS ARROW..HALFWIDTH DOWNWARDS ARROW 10D8E..10D8F ; Math # Sm [2] GARAY PLUS SIGN..GARAY MINUS SIGN 1CEF0 ; Math # Sm MEDIUM SMALL WHITE CIRCLE WITH HORIZONTAL BAR 1D400..1D454 ; Math # L& [85] MATHEMATICAL BOLD CAPITAL A..MATHEMATICAL ITALIC SMALL G 1D456..1D49C ; Math # L& [71] MATHEMATICAL ITALIC SMALL I..MATHEMATICAL SCRIPT CAPITAL A 1D49E..1D49F ; Math # L& [2] MATHEMATICAL SCRIPT CAPITAL C..MATHEMATICAL SCRIPT CAPITAL D 1D4A2 ; Math # L& MATHEMATICAL SCRIPT CAPITAL G 1D4A5..1D4A6 ; Math # L& [2] MATHEMATICAL SCRIPT CAPITAL J..MATHEMATICAL SCRIPT CAPITAL K 1D4A9..1D4AC ; Math # L& [4] MATHEMATICAL SCRIPT CAPITAL N..MATHEMATICAL SCRIPT 
CAPITAL Q 1D4AE..1D4B9 ; Math # L& [12] MATHEMATICAL SCRIPT CAPITAL S..MATHEMATICAL SCRIPT SMALL D 1D4BB ; Math # L& MATHEMATICAL SCRIPT SMALL F 1D4BD..1D4C3 ; Math # L& [7] MATHEMATICAL SCRIPT SMALL H..MATHEMATICAL SCRIPT SMALL N 1D4C5..1D505 ; Math # L& [65] MATHEMATICAL SCRIPT SMALL P..MATHEMATICAL FRAKTUR CAPITAL B 1D507..1D50A ; Math # L& [4] MATHEMATICAL FRAKTUR CAPITAL D..MATHEMATICAL FRAKTUR CAPITAL G 1D50D..1D514 ; Math # L& [8] MATHEMATICAL FRAKTUR CAPITAL J..MATHEMATICAL FRAKTUR CAPITAL Q 1D516..1D51C ; Math # L& [7] MATHEMATICAL FRAKTUR CAPITAL S..MATHEMATICAL FRAKTUR CAPITAL Y 1D51E..1D539 ; Math # L& [28] MATHEMATICAL FRAKTUR SMALL A..MATHEMATICAL DOUBLE-STRUCK CAPITAL B 1D53B..1D53E ; Math # L& [4] MATHEMATICAL DOUBLE-STRUCK CAPITAL D..MATHEMATICAL DOUBLE-STRUCK CAPITAL G 1D540..1D544 ; Math # L& [5] MATHEMATICAL DOUBLE-STRUCK CAPITAL I..MATHEMATICAL DOUBLE-STRUCK CAPITAL M 1D546 ; Math # L& MATHEMATICAL DOUBLE-STRUCK CAPITAL O 1D54A..1D550 ; Math # L& [7] MATHEMATICAL DOUBLE-STRUCK CAPITAL S..MATHEMATICAL DOUBLE-STRUCK CAPITAL Y 1D552..1D6A5 ; Math # L& [340] MATHEMATICAL DOUBLE-STRUCK SMALL A..MATHEMATICAL ITALIC SMALL DOTLESS J 1D6A8..1D6C0 ; Math # L& [25] MATHEMATICAL BOLD CAPITAL ALPHA..MATHEMATICAL BOLD CAPITAL OMEGA 1D6C1 ; Math # Sm MATHEMATICAL BOLD NABLA 1D6C2..1D6DA ; Math # L& [25] MATHEMATICAL BOLD SMALL ALPHA..MATHEMATICAL BOLD SMALL OMEGA 1D6DB ; Math # Sm MATHEMATICAL BOLD PARTIAL DIFFERENTIAL 1D6DC..1D6FA ; Math # L& [31] MATHEMATICAL BOLD EPSILON SYMBOL..MATHEMATICAL ITALIC CAPITAL OMEGA 1D6FB ; Math # Sm MATHEMATICAL ITALIC NABLA 1D6FC..1D714 ; Math # L& [25] MATHEMATICAL ITALIC SMALL ALPHA..MATHEMATICAL ITALIC SMALL OMEGA 1D715 ; Math # Sm MATHEMATICAL ITALIC PARTIAL DIFFERENTIAL 1D716..1D734 ; Math # L& [31] MATHEMATICAL ITALIC EPSILON SYMBOL..MATHEMATICAL BOLD ITALIC CAPITAL OMEGA 1D735 ; Math # Sm MATHEMATICAL BOLD ITALIC NABLA 1D736..1D74E ; Math # L& [25] MATHEMATICAL BOLD ITALIC SMALL ALPHA..MATHEMATICAL BOLD ITALIC SMALL 
OMEGA 1D74F ; Math # Sm MATHEMATICAL BOLD ITALIC PARTIAL DIFFERENTIAL 1D750..1D76E ; Math # L& [31] MATHEMATICAL BOLD ITALIC EPSILON SYMBOL..MATHEMATICAL SANS-SERIF BOLD CAPITAL OMEGA 1D76F ; Math # Sm MATHEMATICAL SANS-SERIF BOLD NABLA 1D770..1D788 ; Math # L& [25] MATHEMATICAL SANS-SERIF BOLD SMALL ALPHA..MATHEMATICAL SANS-SERIF BOLD SMALL OMEGA 1D789 ; Math # Sm MATHEMATICAL SANS-SERIF BOLD PARTIAL DIFFERENTIAL 1D78A..1D7A8 ; Math # L& [31] MATHEMATICAL SANS-SERIF BOLD EPSILON SYMBOL..MATHEMATICAL SANS-SERIF BOLD ITALIC CAPITAL OMEGA 1D7A9 ; Math # Sm MATHEMATICAL SANS-SERIF BOLD ITALIC NABLA 1D7AA..1D7C2 ; Math # L& [25] MATHEMATICAL SANS-SERIF BOLD ITALIC SMALL ALPHA..MATHEMATICAL SANS-SERIF BOLD ITALIC SMALL OMEGA 1D7C3 ; Math # Sm MATHEMATICAL SANS-SERIF BOLD ITALIC PARTIAL DIFFERENTIAL 1D7C4..1D7CB ; Math # L& [8] MATHEMATICAL SANS-SERIF BOLD ITALIC EPSILON SYMBOL..MATHEMATICAL BOLD SMALL DIGAMMA 1D7CE..1D7FF ; Math # Nd [50] MATHEMATICAL BOLD DIGIT ZERO..MATHEMATICAL MONOSPACE DIGIT NINE 1EE00..1EE03 ; Math # Lo [4] ARABIC MATHEMATICAL ALEF..ARABIC MATHEMATICAL DAL 1EE05..1EE1F ; Math # Lo [27] ARABIC MATHEMATICAL WAW..ARABIC MATHEMATICAL DOTLESS QAF 1EE21..1EE22 ; Math # Lo [2] ARABIC MATHEMATICAL INITIAL BEH..ARABIC MATHEMATICAL INITIAL JEEM 1EE24 ; Math # Lo ARABIC MATHEMATICAL INITIAL HEH 1EE27 ; Math # Lo ARABIC MATHEMATICAL INITIAL HAH 1EE29..1EE32 ; Math # Lo [10] ARABIC MATHEMATICAL INITIAL YEH..ARABIC MATHEMATICAL INITIAL QAF 1EE34..1EE37 ; Math # Lo [4] ARABIC MATHEMATICAL INITIAL SHEEN..ARABIC MATHEMATICAL INITIAL KHAH 1EE39 ; Math # Lo ARABIC MATHEMATICAL INITIAL DAD 1EE3B ; Math # Lo ARABIC MATHEMATICAL INITIAL GHAIN 1EE42 ; Math # Lo ARABIC MATHEMATICAL TAILED JEEM 1EE47 ; Math # Lo ARABIC MATHEMATICAL TAILED HAH 1EE49 ; Math # Lo ARABIC MATHEMATICAL TAILED YEH 1EE4B ; Math # Lo ARABIC MATHEMATICAL TAILED LAM 1EE4D..1EE4F ; Math # Lo [3] ARABIC MATHEMATICAL TAILED NOON..ARABIC MATHEMATICAL TAILED AIN 1EE51..1EE52 ; Math # Lo [2] ARABIC 
MATHEMATICAL TAILED SAD..ARABIC MATHEMATICAL TAILED QAF 1EE54 ; Math # Lo ARABIC MATHEMATICAL TAILED SHEEN 1EE57 ; Math # Lo ARABIC MATHEMATICAL TAILED KHAH 1EE59 ; Math # Lo ARABIC MATHEMATICAL TAILED DAD 1EE5B ; Math # Lo ARABIC MATHEMATICAL TAILED GHAIN 1EE5D ; Math # Lo ARABIC MATHEMATICAL TAILED DOTLESS NOON 1EE5F ; Math # Lo ARABIC MATHEMATICAL TAILED DOTLESS QAF 1EE61..1EE62 ; Math # Lo [2] ARABIC MATHEMATICAL STRETCHED BEH..ARABIC MATHEMATICAL STRETCHED JEEM 1EE64 ; Math # Lo ARABIC MATHEMATICAL STRETCHED HEH 1EE67..1EE6A ; Math # Lo [4] ARABIC MATHEMATICAL STRETCHED HAH..ARABIC MATHEMATICAL STRETCHED KAF 1EE6C..1EE72 ; Math # Lo [7] ARABIC MATHEMATICAL STRETCHED MEEM..ARABIC MATHEMATICAL STRETCHED QAF 1EE74..1EE77 ; Math # Lo [4] ARABIC MATHEMATICAL STRETCHED SHEEN..ARABIC MATHEMATICAL STRETCHED KHAH 1EE79..1EE7C ; Math # Lo [4] ARABIC MATHEMATICAL STRETCHED DAD..ARABIC MATHEMATICAL STRETCHED DOTLESS BEH 1EE7E ; Math # Lo ARABIC MATHEMATICAL STRETCHED DOTLESS FEH 1EE80..1EE89 ; Math # Lo [10] ARABIC MATHEMATICAL LOOPED ALEF..ARABIC MATHEMATICAL LOOPED YEH 1EE8B..1EE9B ; Math # Lo [17] ARABIC MATHEMATICAL LOOPED LAM..ARABIC MATHEMATICAL LOOPED GHAIN 1EEA1..1EEA3 ; Math # Lo [3] ARABIC MATHEMATICAL DOUBLE-STRUCK BEH..ARABIC MATHEMATICAL DOUBLE-STRUCK DAL 1EEA5..1EEA9 ; Math # Lo [5] ARABIC MATHEMATICAL DOUBLE-STRUCK WAW..ARABIC MATHEMATICAL DOUBLE-STRUCK YEH 1EEAB..1EEBB ; Math # Lo [17] ARABIC MATHEMATICAL DOUBLE-STRUCK LAM..ARABIC MATHEMATICAL DOUBLE-STRUCK GHAIN 1EEF0..1EEF1 ; Math # Sm [2] ARABIC MATHEMATICAL OPERATOR MEEM WITH HAH WITH TATWEEL..ARABIC MATHEMATICAL OPERATOR HAH WITH DAL 1F8D0..1F8D8 ; Math # Sm [9] LONG RIGHTWARDS ARROW OVER LONG LEFTWARDS ARROW..LONG LEFT RIGHT ARROW WITH DEPENDENT LOBE # Total code points: 2322 # ================================================ # Derived Property: Alphabetic # Generated from: Uppercase + Lowercase + Lt + Lm + Lo + Nl + Other_Alphabetic 0041..005A ; Alphabetic # L& [26] LATIN CAPITAL LETTER A..LATIN 
CAPITAL LETTER Z 0061..007A ; Alphabetic # L& [26] LATIN SMALL LETTER A..LATIN SMALL LETTER Z 00AA ; Alphabetic # Lo FEMININE ORDINAL INDICATOR 00B5 ; Alphabetic # L& MICRO SIGN 00BA ; Alphabetic # Lo MASCULINE ORDINAL INDICATOR 00C0..00D6 ; Alphabetic # L& [23] LATIN CAPITAL LETTER A WITH GRAVE..LATIN CAPITAL LETTER O WITH DIAERESIS 00D8..00F6 ; Alphabetic # L& [31] LATIN CAPITAL LETTER O WITH STROKE..LATIN SMALL LETTER O WITH DIAERESIS 00F8..01BA ; Alphabetic # L& [195] LATIN SMALL LETTER O WITH STROKE..LATIN SMALL LETTER EZH WITH TAIL 01BB ; Alphabetic # Lo LATIN LETTER TWO WITH STROKE 01BC..01BF ; Alphabetic # L& [4] LATIN CAPITAL LETTER TONE FIVE..LATIN LETTER WYNN 01C0..01C3 ; Alphabetic # Lo [4] LATIN LETTER DENTAL CLICK..LATIN LETTER RETROFLEX CLICK 01C4..0293 ; Alphabetic # L& [208] LATIN CAPITAL LETTER DZ WITH CARON..LATIN SMALL LETTER EZH WITH CURL 0294..0295 ; Alphabetic # Lo [2] LATIN LETTER GLOTTAL STOP..LATIN LETTER PHARYNGEAL VOICED FRICATIVE 0296..02AF ; Alphabetic # L& [26] LATIN LETTER INVERTED GLOTTAL STOP..LATIN SMALL LETTER TURNED H WITH FISHHOOK AND TAIL 02B0..02C1 ; Alphabetic # Lm [18] MODIFIER LETTER SMALL H..MODIFIER LETTER REVERSED GLOTTAL STOP 02C6..02D1 ; Alphabetic # Lm [12] MODIFIER LETTER CIRCUMFLEX ACCENT..MODIFIER LETTER HALF TRIANGULAR COLON 02E0..02E4 ; Alphabetic # Lm [5] MODIFIER LETTER SMALL GAMMA..MODIFIER LETTER SMALL REVERSED GLOTTAL STOP 02EC ; Alphabetic # Lm MODIFIER LETTER VOICING 02EE ; Alphabetic # Lm MODIFIER LETTER DOUBLE APOSTROPHE 0345 ; Alphabetic # Mn COMBINING GREEK YPOGEGRAMMENI 0363..036F ; Alphabetic # Mn [13] COMBINING LATIN SMALL LETTER A..COMBINING LATIN SMALL LETTER X 0370..0373 ; Alphabetic # L& [4] GREEK CAPITAL LETTER HETA..GREEK SMALL LETTER ARCHAIC SAMPI 0374 ; Alphabetic # Lm GREEK NUMERAL SIGN 0376..0377 ; Alphabetic # L& [2] GREEK CAPITAL LETTER PAMPHYLIAN DIGAMMA..GREEK SMALL LETTER PAMPHYLIAN DIGAMMA 037A ; Alphabetic # Lm GREEK YPOGEGRAMMENI 037B..037D ; Alphabetic # L& [3] GREEK SMALL 
REVERSED LUNATE SIGMA SYMBOL..GREEK SMALL REVERSED DOTTED LUNATE SIGMA SYMBOL 037F ; Alphabetic # L& GREEK CAPITAL LETTER YOT 0386 ; Alphabetic # L& GREEK CAPITAL LETTER ALPHA WITH TONOS 0388..038A ; Alphabetic # L& [3] GREEK CAPITAL LETTER EPSILON WITH TONOS..GREEK CAPITAL LETTER IOTA WITH TONOS 038C ; Alphabetic # L& GREEK CAPITAL LETTER OMICRON WITH TONOS 038E..03A1 ; Alphabetic # L& [20] GREEK CAPITAL LETTER UPSILON WITH TONOS..GREEK CAPITAL LETTER RHO 03A3..03F5 ; Alphabetic # L& [83] GREEK CAPITAL LETTER SIGMA..GREEK LUNATE EPSILON SYMBOL 03F7..0481 ; Alphabetic # L& [139] GREEK CAPITAL LETTER SHO..CYRILLIC SMALL LETTER KOPPA 048A..052F ; Alphabetic # L& [166] CYRILLIC CAPITAL LETTER SHORT I WITH TAIL..CYRILLIC SMALL LETTER EL WITH DESCENDER 0531..0556 ; Alphabetic # L& [38] ARMENIAN CAPITAL LETTER AYB..ARMENIAN CAPITAL LETTER FEH 0559 ; Alphabetic # Lm ARMENIAN MODIFIER LETTER LEFT HALF RING 0560..0588 ; Alphabetic # L& [41] ARMENIAN SMALL LETTER TURNED AYB..ARMENIAN SMALL LETTER YI WITH STROKE 05B0..05BD ; Alphabetic # Mn [14] HEBREW POINT SHEVA..HEBREW POINT METEG 05BF ; Alphabetic # Mn HEBREW POINT RAFE 05C1..05C2 ; Alphabetic # Mn [2] HEBREW POINT SHIN DOT..HEBREW POINT SIN DOT 05C4..05C5 ; Alphabetic # Mn [2] HEBREW MARK UPPER DOT..HEBREW MARK LOWER DOT 05C7 ; Alphabetic # Mn HEBREW POINT QAMATS QATAN 05D0..05EA ; Alphabetic # Lo [27] HEBREW LETTER ALEF..HEBREW LETTER TAV 05EF..05F2 ; Alphabetic # Lo [4] HEBREW YOD TRIANGLE..HEBREW LIGATURE YIDDISH DOUBLE YOD 0610..061A ; Alphabetic # Mn [11] ARABIC SIGN SALLALLAHOU ALAYHE WASSALLAM..ARABIC SMALL KASRA 0620..063F ; Alphabetic # Lo [32] ARABIC LETTER KASHMIRI YEH..ARABIC LETTER FARSI YEH WITH THREE DOTS ABOVE 0640 ; Alphabetic # Lm ARABIC TATWEEL 0641..064A ; Alphabetic # Lo [10] ARABIC LETTER FEH..ARABIC LETTER YEH 064B..0657 ; Alphabetic # Mn [13] ARABIC FATHATAN..ARABIC INVERTED DAMMA 0659..065F ; Alphabetic # Mn [7] ARABIC ZWARAKAY..ARABIC WAVY HAMZA BELOW 066E..066F ; Alphabetic # Lo [2] ARABIC 
LETTER DOTLESS BEH..ARABIC LETTER DOTLESS QAF 0670 ; Alphabetic # Mn ARABIC LETTER SUPERSCRIPT ALEF 0671..06D3 ; Alphabetic # Lo [99] ARABIC LETTER ALEF WASLA..ARABIC LETTER YEH BARREE WITH HAMZA ABOVE 06D5 ; Alphabetic # Lo ARABIC LETTER AE 06D6..06DC ; Alphabetic # Mn [7] ARABIC SMALL HIGH LIGATURE SAD WITH LAM WITH ALEF MAKSURA..ARABIC SMALL HIGH SEEN 06E1..06E4 ; Alphabetic # Mn [4] ARABIC SMALL HIGH DOTLESS HEAD OF KHAH..ARABIC SMALL HIGH MADDA 06E5..06E6 ; Alphabetic # Lm [2] ARABIC SMALL WAW..ARABIC SMALL YEH 06E7..06E8 ; Alphabetic # Mn [2] ARABIC SMALL HIGH YEH..ARABIC SMALL HIGH NOON 06ED ; Alphabetic # Mn ARABIC SMALL LOW MEEM 06EE..06EF ; Alphabetic # Lo [2] ARABIC LETTER DAL WITH INVERTED V..ARABIC LETTER REH WITH INVERTED V 06FA..06FC ; Alphabetic # Lo [3] ARABIC LETTER SHEEN WITH DOT BELOW..ARABIC LETTER GHAIN WITH DOT BELOW 06FF ; Alphabetic # Lo ARABIC LETTER HEH WITH INVERTED V 0710 ; Alphabetic # Lo SYRIAC LETTER ALAPH 0711 ; Alphabetic # Mn SYRIAC LETTER SUPERSCRIPT ALAPH 0712..072F ; Alphabetic # Lo [30] SYRIAC LETTER BETH..SYRIAC LETTER PERSIAN DHALATH 0730..073F ; Alphabetic # Mn [16] SYRIAC PTHAHA ABOVE..SYRIAC RWAHA 074D..07A5 ; Alphabetic # Lo [89] SYRIAC LETTER SOGDIAN ZHAIN..THAANA LETTER WAAVU 07A6..07B0 ; Alphabetic # Mn [11] THAANA ABAFILI..THAANA SUKUN 07B1 ; Alphabetic # Lo THAANA LETTER NAA 07CA..07EA ; Alphabetic # Lo [33] NKO LETTER A..NKO LETTER JONA RA 07F4..07F5 ; Alphabetic # Lm [2] NKO HIGH TONE APOSTROPHE..NKO LOW TONE APOSTROPHE 07FA ; Alphabetic # Lm NKO LAJANYALAN 0800..0815 ; Alphabetic # Lo [22] SAMARITAN LETTER ALAF..SAMARITAN LETTER TAAF 0816..0817 ; Alphabetic # Mn [2] SAMARITAN MARK IN..SAMARITAN MARK IN-ALAF 081A ; Alphabetic # Lm SAMARITAN MODIFIER LETTER EPENTHETIC YUT 081B..0823 ; Alphabetic # Mn [9] SAMARITAN MARK EPENTHETIC YUT..SAMARITAN VOWEL SIGN A 0824 ; Alphabetic # Lm SAMARITAN MODIFIER LETTER SHORT A 0825..0827 ; Alphabetic # Mn [3] SAMARITAN VOWEL SIGN SHORT A..SAMARITAN VOWEL SIGN U 0828 ; Alphabetic 
# Lm SAMARITAN MODIFIER LETTER I 0829..082C ; Alphabetic # Mn [4] SAMARITAN VOWEL SIGN LONG I..SAMARITAN VOWEL SIGN SUKUN 0840..0858 ; Alphabetic # Lo [25] MANDAIC LETTER HALQA..MANDAIC LETTER AIN 0860..086A ; Alphabetic # Lo [11] SYRIAC LETTER MALAYALAM NGA..SYRIAC LETTER MALAYALAM SSA 0870..0887 ; Alphabetic # Lo [24] ARABIC LETTER ALEF WITH ATTACHED FATHA..ARABIC BASELINE ROUND DOT 0889..088F ; Alphabetic # Lo [7] ARABIC LETTER NOON WITH INVERTED SMALL V..ARABIC LETTER NOON WITH RING ABOVE 0897 ; Alphabetic # Mn ARABIC PEPET 08A0..08C8 ; Alphabetic # Lo [41] ARABIC LETTER BEH WITH SMALL V BELOW..ARABIC LETTER GRAF 08C9 ; Alphabetic # Lm ARABIC SMALL FARSI YEH 08D4..08DF ; Alphabetic # Mn [12] ARABIC SMALL HIGH WORD AR-RUB..ARABIC SMALL HIGH WORD WAQFA 08E3..08E9 ; Alphabetic # Mn [7] ARABIC TURNED DAMMA BELOW..ARABIC CURLY KASRATAN 08F0..0902 ; Alphabetic # Mn [19] ARABIC OPEN FATHATAN..DEVANAGARI SIGN ANUSVARA 0903 ; Alphabetic # Mc DEVANAGARI SIGN VISARGA 0904..0939 ; Alphabetic # Lo [54] DEVANAGARI LETTER SHORT A..DEVANAGARI LETTER HA 093A ; Alphabetic # Mn DEVANAGARI VOWEL SIGN OE 093B ; Alphabetic # Mc DEVANAGARI VOWEL SIGN OOE 093D ; Alphabetic # Lo DEVANAGARI SIGN AVAGRAHA 093E..0940 ; Alphabetic # Mc [3] DEVANAGARI VOWEL SIGN AA..DEVANAGARI VOWEL SIGN II 0941..0948 ; Alphabetic # Mn [8] DEVANAGARI VOWEL SIGN U..DEVANAGARI VOWEL SIGN AI 0949..094C ; Alphabetic # Mc [4] DEVANAGARI VOWEL SIGN CANDRA O..DEVANAGARI VOWEL SIGN AU 094E..094F ; Alphabetic # Mc [2] DEVANAGARI VOWEL SIGN PRISHTHAMATRA E..DEVANAGARI VOWEL SIGN AW 0950 ; Alphabetic # Lo DEVANAGARI OM 0955..0957 ; Alphabetic # Mn [3] DEVANAGARI VOWEL SIGN CANDRA LONG E..DEVANAGARI VOWEL SIGN UUE 0958..0961 ; Alphabetic # Lo [10] DEVANAGARI LETTER QA..DEVANAGARI LETTER VOCALIC LL 0962..0963 ; Alphabetic # Mn [2] DEVANAGARI VOWEL SIGN VOCALIC L..DEVANAGARI VOWEL SIGN VOCALIC LL 0971 ; Alphabetic # Lm DEVANAGARI SIGN HIGH SPACING DOT 0972..0980 ; Alphabetic # Lo [15] DEVANAGARI LETTER CANDRA A..BENGALI 
ANJI 0981 ; Alphabetic # Mn BENGALI SIGN CANDRABINDU 0982..0983 ; Alphabetic # Mc [2] BENGALI SIGN ANUSVARA..BENGALI SIGN VISARGA 0985..098C ; Alphabetic # Lo [8] BENGALI LETTER A..BENGALI LETTER VOCALIC L 098F..0990 ; Alphabetic # Lo [2] BENGALI LETTER E..BENGALI LETTER AI 0993..09A8 ; Alphabetic # Lo [22] BENGALI LETTER O..BENGALI LETTER NA 09AA..09B0 ; Alphabetic # Lo [7] BENGALI LETTER PA..BENGALI LETTER RA 09B2 ; Alphabetic # Lo BENGALI LETTER LA 09B6..09B9 ; Alphabetic # Lo [4] BENGALI LETTER SHA..BENGALI LETTER HA 09BD ; Alphabetic # Lo BENGALI SIGN AVAGRAHA 09BE..09C0 ; Alphabetic # Mc [3] BENGALI VOWEL SIGN AA..BENGALI VOWEL SIGN II 09C1..09C4 ; Alphabetic # Mn [4] BENGALI VOWEL SIGN U..BENGALI VOWEL SIGN VOCALIC RR 09C7..09C8 ; Alphabetic # Mc [2] BENGALI VOWEL SIGN E..BENGALI VOWEL SIGN AI 09CB..09CC ; Alphabetic # Mc [2] BENGALI VOWEL SIGN O..BENGALI VOWEL SIGN AU 09CE ; Alphabetic # Lo BENGALI LETTER KHANDA TA 09D7 ; Alphabetic # Mc BENGALI AU LENGTH MARK 09DC..09DD ; Alphabetic # Lo [2] BENGALI LETTER RRA..BENGALI LETTER RHA 09DF..09E1 ; Alphabetic # Lo [3] BENGALI LETTER YYA..BENGALI LETTER VOCALIC LL 09E2..09E3 ; Alphabetic # Mn [2] BENGALI VOWEL SIGN VOCALIC L..BENGALI VOWEL SIGN VOCALIC LL 09F0..09F1 ; Alphabetic # Lo [2] BENGALI LETTER RA WITH MIDDLE DIAGONAL..BENGALI LETTER RA WITH LOWER DIAGONAL 09FC ; Alphabetic # Lo BENGALI LETTER VEDIC ANUSVARA 0A01..0A02 ; Alphabetic # Mn [2] GURMUKHI SIGN ADAK BINDI..GURMUKHI SIGN BINDI 0A03 ; Alphabetic # Mc GURMUKHI SIGN VISARGA 0A05..0A0A ; Alphabetic # Lo [6] GURMUKHI LETTER A..GURMUKHI LETTER UU 0A0F..0A10 ; Alphabetic # Lo [2] GURMUKHI LETTER EE..GURMUKHI LETTER AI 0A13..0A28 ; Alphabetic # Lo [22] GURMUKHI LETTER OO..GURMUKHI LETTER NA 0A2A..0A30 ; Alphabetic # Lo [7] GURMUKHI LETTER PA..GURMUKHI LETTER RA 0A32..0A33 ; Alphabetic # Lo [2] GURMUKHI LETTER LA..GURMUKHI LETTER LLA 0A35..0A36 ; Alphabetic # Lo [2] GURMUKHI LETTER VA..GURMUKHI LETTER SHA 0A38..0A39 ; Alphabetic # Lo [2] GURMUKHI LETTER 
SA..GURMUKHI LETTER HA 0A3E..0A40 ; Alphabetic # Mc [3] GURMUKHI VOWEL SIGN AA..GURMUKHI VOWEL SIGN II 0A41..0A42 ; Alphabetic # Mn [2] GURMUKHI VOWEL SIGN U..GURMUKHI VOWEL SIGN UU 0A47..0A48 ; Alphabetic # Mn [2] GURMUKHI VOWEL SIGN EE..GURMUKHI VOWEL SIGN AI 0A4B..0A4C ; Alphabetic # Mn [2] GURMUKHI VOWEL SIGN OO..GURMUKHI VOWEL SIGN AU 0A51 ; Alphabetic # Mn GURMUKHI SIGN UDAAT 0A59..0A5C ; Alphabetic # Lo [4] GURMUKHI LETTER KHHA..GURMUKHI LETTER RRA 0A5E ; Alphabetic # Lo GURMUKHI LETTER FA 0A70..0A71 ; Alphabetic # Mn [2] GURMUKHI TIPPI..GURMUKHI ADDAK 0A72..0A74 ; Alphabetic # Lo [3] GURMUKHI IRI..GURMUKHI EK ONKAR 0A75 ; Alphabetic # Mn GURMUKHI SIGN YAKASH 0A81..0A82 ; Alphabetic # Mn [2] GUJARATI SIGN CANDRABINDU..GUJARATI SIGN ANUSVARA 0A83 ; Alphabetic # Mc GUJARATI SIGN VISARGA 0A85..0A8D ; Alphabetic # Lo [9] GUJARATI LETTER A..GUJARATI VOWEL CANDRA E 0A8F..0A91 ; Alphabetic # Lo [3] GUJARATI LETTER E..GUJARATI VOWEL CANDRA O 0A93..0AA8 ; Alphabetic # Lo [22] GUJARATI LETTER O..GUJARATI LETTER NA 0AAA..0AB0 ; Alphabetic # Lo [7] GUJARATI LETTER PA..GUJARATI LETTER RA 0AB2..0AB3 ; Alphabetic # Lo [2] GUJARATI LETTER LA..GUJARATI LETTER LLA 0AB5..0AB9 ; Alphabetic # Lo [5] GUJARATI LETTER VA..GUJARATI LETTER HA 0ABD ; Alphabetic # Lo GUJARATI SIGN AVAGRAHA 0ABE..0AC0 ; Alphabetic # Mc [3] GUJARATI VOWEL SIGN AA..GUJARATI VOWEL SIGN II 0AC1..0AC5 ; Alphabetic # Mn [5] GUJARATI VOWEL SIGN U..GUJARATI VOWEL SIGN CANDRA E 0AC7..0AC8 ; Alphabetic # Mn [2] GUJARATI VOWEL SIGN E..GUJARATI VOWEL SIGN AI 0AC9 ; Alphabetic # Mc GUJARATI VOWEL SIGN CANDRA O 0ACB..0ACC ; Alphabetic # Mc [2] GUJARATI VOWEL SIGN O..GUJARATI VOWEL SIGN AU 0AD0 ; Alphabetic # Lo GUJARATI OM 0AE0..0AE1 ; Alphabetic # Lo [2] GUJARATI LETTER VOCALIC RR..GUJARATI LETTER VOCALIC LL 0AE2..0AE3 ; Alphabetic # Mn [2] GUJARATI VOWEL SIGN VOCALIC L..GUJARATI VOWEL SIGN VOCALIC LL 0AF9 ; Alphabetic # Lo GUJARATI LETTER ZHA 0AFA..0AFC ; Alphabetic # Mn [3] GUJARATI SIGN SUKUN..GUJARATI SIGN 
MADDAH 0B01 ; Alphabetic # Mn ORIYA SIGN CANDRABINDU 0B02..0B03 ; Alphabetic # Mc [2] ORIYA SIGN ANUSVARA..ORIYA SIGN VISARGA 0B05..0B0C ; Alphabetic # Lo [8] ORIYA LETTER A..ORIYA LETTER VOCALIC L 0B0F..0B10 ; Alphabetic # Lo [2] ORIYA LETTER E..ORIYA LETTER AI 0B13..0B28 ; Alphabetic # Lo [22] ORIYA LETTER O..ORIYA LETTER NA 0B2A..0B30 ; Alphabetic # Lo [7] ORIYA LETTER PA..ORIYA LETTER RA 0B32..0B33 ; Alphabetic # Lo [2] ORIYA LETTER LA..ORIYA LETTER LLA 0B35..0B39 ; Alphabetic # Lo [5] ORIYA LETTER VA..ORIYA LETTER HA 0B3D ; Alphabetic # Lo ORIYA SIGN AVAGRAHA 0B3E ; Alphabetic # Mc ORIYA VOWEL SIGN AA 0B3F ; Alphabetic # Mn ORIYA VOWEL SIGN I 0B40 ; Alphabetic # Mc ORIYA VOWEL SIGN II 0B41..0B44 ; Alphabetic # Mn [4] ORIYA VOWEL SIGN U..ORIYA VOWEL SIGN VOCALIC RR 0B47..0B48 ; Alphabetic # Mc [2] ORIYA VOWEL SIGN E..ORIYA VOWEL SIGN AI 0B4B..0B4C ; Alphabetic # Mc [2] ORIYA VOWEL SIGN O..ORIYA VOWEL SIGN AU 0B56 ; Alphabetic # Mn ORIYA AI LENGTH MARK 0B57 ; Alphabetic # Mc ORIYA AU LENGTH MARK 0B5C..0B5D ; Alphabetic # Lo [2] ORIYA LETTER RRA..ORIYA LETTER RHA 0B5F..0B61 ; Alphabetic # Lo [3] ORIYA LETTER YYA..ORIYA LETTER VOCALIC LL 0B62..0B63 ; Alphabetic # Mn [2] ORIYA VOWEL SIGN VOCALIC L..ORIYA VOWEL SIGN VOCALIC LL 0B71 ; Alphabetic # Lo ORIYA LETTER WA 0B82 ; Alphabetic # Mn TAMIL SIGN ANUSVARA 0B83 ; Alphabetic # Lo TAMIL SIGN VISARGA 0B85..0B8A ; Alphabetic # Lo [6] TAMIL LETTER A..TAMIL LETTER UU 0B8E..0B90 ; Alphabetic # Lo [3] TAMIL LETTER E..TAMIL LETTER AI 0B92..0B95 ; Alphabetic # Lo [4] TAMIL LETTER O..TAMIL LETTER KA 0B99..0B9A ; Alphabetic # Lo [2] TAMIL LETTER NGA..TAMIL LETTER CA 0B9C ; Alphabetic # Lo TAMIL LETTER JA 0B9E..0B9F ; Alphabetic # Lo [2] TAMIL LETTER NYA..TAMIL LETTER TTA 0BA3..0BA4 ; Alphabetic # Lo [2] TAMIL LETTER NNA..TAMIL LETTER TA 0BA8..0BAA ; Alphabetic # Lo [3] TAMIL LETTER NA..TAMIL LETTER PA 0BAE..0BB9 ; Alphabetic # Lo [12] TAMIL LETTER MA..TAMIL LETTER HA 0BBE..0BBF ; Alphabetic # Mc [2] TAMIL VOWEL SIGN AA..TAMIL 
VOWEL SIGN I 0BC0 ; Alphabetic # Mn TAMIL VOWEL SIGN II 0BC1..0BC2 ; Alphabetic # Mc [2] TAMIL VOWEL SIGN U..TAMIL VOWEL SIGN UU 0BC6..0BC8 ; Alphabetic # Mc [3] TAMIL VOWEL SIGN E..TAMIL VOWEL SIGN AI 0BCA..0BCC ; Alphabetic # Mc [3] TAMIL VOWEL SIGN O..TAMIL VOWEL SIGN AU 0BD0 ; Alphabetic # Lo TAMIL OM 0BD7 ; Alphabetic # Mc TAMIL AU LENGTH MARK 0C00 ; Alphabetic # Mn TELUGU SIGN COMBINING CANDRABINDU ABOVE 0C01..0C03 ; Alphabetic # Mc [3] TELUGU SIGN CANDRABINDU..TELUGU SIGN VISARGA 0C04 ; Alphabetic # Mn TELUGU SIGN COMBINING ANUSVARA ABOVE 0C05..0C0C ; Alphabetic # Lo [8] TELUGU LETTER A..TELUGU LETTER VOCALIC L 0C0E..0C10 ; Alphabetic # Lo [3] TELUGU LETTER E..TELUGU LETTER AI 0C12..0C28 ; Alphabetic # Lo [23] TELUGU LETTER O..TELUGU LETTER NA 0C2A..0C39 ; Alphabetic # Lo [16] TELUGU LETTER PA..TELUGU LETTER HA 0C3D ; Alphabetic # Lo TELUGU SIGN AVAGRAHA 0C3E..0C40 ; Alphabetic # Mn [3] TELUGU VOWEL SIGN AA..TELUGU VOWEL SIGN II 0C41..0C44 ; Alphabetic # Mc [4] TELUGU VOWEL SIGN U..TELUGU VOWEL SIGN VOCALIC RR 0C46..0C48 ; Alphabetic # Mn [3] TELUGU VOWEL SIGN E..TELUGU VOWEL SIGN AI 0C4A..0C4C ; Alphabetic # Mn [3] TELUGU VOWEL SIGN O..TELUGU VOWEL SIGN AU 0C55..0C56 ; Alphabetic # Mn [2] TELUGU LENGTH MARK..TELUGU AI LENGTH MARK 0C58..0C5A ; Alphabetic # Lo [3] TELUGU LETTER TSA..TELUGU LETTER RRRA 0C5C..0C5D ; Alphabetic # Lo [2] TELUGU ARCHAIC SHRII..TELUGU LETTER NAKAARA POLLU 0C60..0C61 ; Alphabetic # Lo [2] TELUGU LETTER VOCALIC RR..TELUGU LETTER VOCALIC LL 0C62..0C63 ; Alphabetic # Mn [2] TELUGU VOWEL SIGN VOCALIC L..TELUGU VOWEL SIGN VOCALIC LL 0C80 ; Alphabetic # Lo KANNADA SIGN SPACING CANDRABINDU 0C81 ; Alphabetic # Mn KANNADA SIGN CANDRABINDU 0C82..0C83 ; Alphabetic # Mc [2] KANNADA SIGN ANUSVARA..KANNADA SIGN VISARGA 0C85..0C8C ; Alphabetic # Lo [8] KANNADA LETTER A..KANNADA LETTER VOCALIC L 0C8E..0C90 ; Alphabetic # Lo [3] KANNADA LETTER E..KANNADA LETTER AI 0C92..0CA8 ; Alphabetic # Lo [23] KANNADA LETTER O..KANNADA LETTER NA 0CAA..0CB3 ; 
Alphabetic # Lo [10] KANNADA LETTER PA..KANNADA LETTER LLA 0CB5..0CB9 ; Alphabetic # Lo [5] KANNADA LETTER VA..KANNADA LETTER HA 0CBD ; Alphabetic # Lo KANNADA SIGN AVAGRAHA 0CBE ; Alphabetic # Mc KANNADA VOWEL SIGN AA 0CBF ; Alphabetic # Mn KANNADA VOWEL SIGN I 0CC0..0CC4 ; Alphabetic # Mc [5] KANNADA VOWEL SIGN II..KANNADA VOWEL SIGN VOCALIC RR 0CC6 ; Alphabetic # Mn KANNADA VOWEL SIGN E 0CC7..0CC8 ; Alphabetic # Mc [2] KANNADA VOWEL SIGN EE..KANNADA VOWEL SIGN AI 0CCA..0CCB ; Alphabetic # Mc [2] KANNADA VOWEL SIGN O..KANNADA VOWEL SIGN OO 0CCC ; Alphabetic # Mn KANNADA VOWEL SIGN AU 0CD5..0CD6 ; Alphabetic # Mc [2] KANNADA LENGTH MARK..KANNADA AI LENGTH MARK 0CDC..0CDE ; Alphabetic # Lo [3] KANNADA ARCHAIC SHRII..KANNADA LETTER FA 0CE0..0CE1 ; Alphabetic # Lo [2] KANNADA LETTER VOCALIC RR..KANNADA LETTER VOCALIC LL 0CE2..0CE3 ; Alphabetic # Mn [2] KANNADA VOWEL SIGN VOCALIC L..KANNADA VOWEL SIGN VOCALIC LL 0CF1..0CF2 ; Alphabetic # Lo [2] KANNADA SIGN JIHVAMULIYA..KANNADA SIGN UPADHMANIYA 0CF3 ; Alphabetic # Mc KANNADA SIGN COMBINING ANUSVARA ABOVE RIGHT 0D00..0D01 ; Alphabetic # Mn [2] MALAYALAM SIGN COMBINING ANUSVARA ABOVE..MALAYALAM SIGN CANDRABINDU 0D02..0D03 ; Alphabetic # Mc [2] MALAYALAM SIGN ANUSVARA..MALAYALAM SIGN VISARGA 0D04..0D0C ; Alphabetic # Lo [9] MALAYALAM LETTER VEDIC ANUSVARA..MALAYALAM LETTER VOCALIC L 0D0E..0D10 ; Alphabetic # Lo [3] MALAYALAM LETTER E..MALAYALAM LETTER AI 0D12..0D3A ; Alphabetic # Lo [41] MALAYALAM LETTER O..MALAYALAM LETTER TTTA 0D3D ; Alphabetic # Lo MALAYALAM SIGN AVAGRAHA 0D3E..0D40 ; Alphabetic # Mc [3] MALAYALAM VOWEL SIGN AA..MALAYALAM VOWEL SIGN II 0D41..0D44 ; Alphabetic # Mn [4] MALAYALAM VOWEL SIGN U..MALAYALAM VOWEL SIGN VOCALIC RR 0D46..0D48 ; Alphabetic # Mc [3] MALAYALAM VOWEL SIGN E..MALAYALAM VOWEL SIGN AI 0D4A..0D4C ; Alphabetic # Mc [3] MALAYALAM VOWEL SIGN O..MALAYALAM VOWEL SIGN AU 0D4E ; Alphabetic # Lo MALAYALAM LETTER DOT REPH 0D54..0D56 ; Alphabetic # Lo [3] MALAYALAM LETTER CHILLU M..MALAYALAM 
LETTER CHILLU LLL 0D57 ; Alphabetic # Mc MALAYALAM AU LENGTH MARK 0D5F..0D61 ; Alphabetic # Lo [3] MALAYALAM LETTER ARCHAIC II..MALAYALAM LETTER VOCALIC LL 0D62..0D63 ; Alphabetic # Mn [2] MALAYALAM VOWEL SIGN VOCALIC L..MALAYALAM VOWEL SIGN VOCALIC LL 0D7A..0D7F ; Alphabetic # Lo [6] MALAYALAM LETTER CHILLU NN..MALAYALAM LETTER CHILLU K 0D81 ; Alphabetic # Mn SINHALA SIGN CANDRABINDU 0D82..0D83 ; Alphabetic # Mc [2] SINHALA SIGN ANUSVARAYA..SINHALA SIGN VISARGAYA 0D85..0D96 ; Alphabetic # Lo [18] SINHALA LETTER AYANNA..SINHALA LETTER AUYANNA 0D9A..0DB1 ; Alphabetic # Lo [24] SINHALA LETTER ALPAPRAANA KAYANNA..SINHALA LETTER DANTAJA NAYANNA 0DB3..0DBB ; Alphabetic # Lo [9] SINHALA LETTER SANYAKA DAYANNA..SINHALA LETTER RAYANNA 0DBD ; Alphabetic # Lo SINHALA LETTER DANTAJA LAYANNA 0DC0..0DC6 ; Alphabetic # Lo [7] SINHALA LETTER VAYANNA..SINHALA LETTER FAYANNA 0DCF..0DD1 ; Alphabetic # Mc [3] SINHALA VOWEL SIGN AELA-PILLA..SINHALA VOWEL SIGN DIGA AEDA-PILLA 0DD2..0DD4 ; Alphabetic # Mn [3] SINHALA VOWEL SIGN KETTI IS-PILLA..SINHALA VOWEL SIGN KETTI PAA-PILLA 0DD6 ; Alphabetic # Mn SINHALA VOWEL SIGN DIGA PAA-PILLA 0DD8..0DDF ; Alphabetic # Mc [8] SINHALA VOWEL SIGN GAETTA-PILLA..SINHALA VOWEL SIGN GAYANUKITTA 0DF2..0DF3 ; Alphabetic # Mc [2] SINHALA VOWEL SIGN DIGA GAETTA-PILLA..SINHALA VOWEL SIGN DIGA GAYANUKITTA 0E01..0E30 ; Alphabetic # Lo [48] THAI CHARACTER KO KAI..THAI CHARACTER SARA A 0E31 ; Alphabetic # Mn THAI CHARACTER MAI HAN-AKAT 0E32..0E33 ; Alphabetic # Lo [2] THAI CHARACTER SARA AA..THAI CHARACTER SARA AM 0E34..0E3A ; Alphabetic # Mn [7] THAI CHARACTER SARA I..THAI CHARACTER PHINTHU 0E40..0E45 ; Alphabetic # Lo [6] THAI CHARACTER SARA E..THAI CHARACTER LAKKHANGYAO 0E46 ; Alphabetic # Lm THAI CHARACTER MAIYAMOK 0E4D ; Alphabetic # Mn THAI CHARACTER NIKHAHIT 0E81..0E82 ; Alphabetic # Lo [2] LAO LETTER KO..LAO LETTER KHO SUNG 0E84 ; Alphabetic # Lo LAO LETTER KHO TAM 0E86..0E8A ; Alphabetic # Lo [5] LAO LETTER PALI GHA..LAO LETTER SO TAM 0E8C..0EA3 ; 
Alphabetic # Lo [24] LAO LETTER PALI JHA..LAO LETTER LO LING 0EA5 ; Alphabetic # Lo LAO LETTER LO LOOT 0EA7..0EB0 ; Alphabetic # Lo [10] LAO LETTER WO..LAO VOWEL SIGN A 0EB1 ; Alphabetic # Mn LAO VOWEL SIGN MAI KAN 0EB2..0EB3 ; Alphabetic # Lo [2] LAO VOWEL SIGN AA..LAO VOWEL SIGN AM 0EB4..0EB9 ; Alphabetic # Mn [6] LAO VOWEL SIGN I..LAO VOWEL SIGN UU 0EBB..0EBC ; Alphabetic # Mn [2] LAO VOWEL SIGN MAI KON..LAO SEMIVOWEL SIGN LO 0EBD ; Alphabetic # Lo LAO SEMIVOWEL SIGN NYO 0EC0..0EC4 ; Alphabetic # Lo [5] LAO VOWEL SIGN E..LAO VOWEL SIGN AI 0EC6 ; Alphabetic # Lm LAO KO LA 0ECD ; Alphabetic # Mn LAO NIGGAHITA 0EDC..0EDF ; Alphabetic # Lo [4] LAO HO NO..LAO LETTER KHMU NYO 0F00 ; Alphabetic # Lo TIBETAN SYLLABLE OM 0F40..0F47 ; Alphabetic # Lo [8] TIBETAN LETTER KA..TIBETAN LETTER JA 0F49..0F6C ; Alphabetic # Lo [36] TIBETAN LETTER NYA..TIBETAN LETTER RRA 0F71..0F7E ; Alphabetic # Mn [14] TIBETAN VOWEL SIGN AA..TIBETAN SIGN RJES SU NGA RO 0F7F ; Alphabetic # Mc TIBETAN SIGN RNAM BCAD 0F80..0F83 ; Alphabetic # Mn [4] TIBETAN VOWEL SIGN REVERSED I..TIBETAN SIGN SNA LDAN 0F88..0F8C ; Alphabetic # Lo [5] TIBETAN SIGN LCE TSA CAN..TIBETAN SIGN INVERTED MCHU CAN 0F8D..0F97 ; Alphabetic # Mn [11] TIBETAN SUBJOINED SIGN LCE TSA CAN..TIBETAN SUBJOINED LETTER JA 0F99..0FBC ; Alphabetic # Mn [36] TIBETAN SUBJOINED LETTER NYA..TIBETAN SUBJOINED LETTER FIXED-FORM RA 1000..102A ; Alphabetic # Lo [43] MYANMAR LETTER KA..MYANMAR LETTER AU 102B..102C ; Alphabetic # Mc [2] MYANMAR VOWEL SIGN TALL AA..MYANMAR VOWEL SIGN AA 102D..1030 ; Alphabetic # Mn [4] MYANMAR VOWEL SIGN I..MYANMAR VOWEL SIGN UU 1031 ; Alphabetic # Mc MYANMAR VOWEL SIGN E 1032..1036 ; Alphabetic # Mn [5] MYANMAR VOWEL SIGN AI..MYANMAR SIGN ANUSVARA 1038 ; Alphabetic # Mc MYANMAR SIGN VISARGA 103B..103C ; Alphabetic # Mc [2] MYANMAR CONSONANT SIGN MEDIAL YA..MYANMAR CONSONANT SIGN MEDIAL RA 103D..103E ; Alphabetic # Mn [2] MYANMAR CONSONANT SIGN MEDIAL WA..MYANMAR CONSONANT SIGN MEDIAL HA 103F ; Alphabetic # Lo 
MYANMAR LETTER GREAT SA 1050..1055 ; Alphabetic # Lo [6] MYANMAR LETTER SHA..MYANMAR LETTER VOCALIC LL 1056..1057 ; Alphabetic # Mc [2] MYANMAR VOWEL SIGN VOCALIC R..MYANMAR VOWEL SIGN VOCALIC RR 1058..1059 ; Alphabetic # Mn [2] MYANMAR VOWEL SIGN VOCALIC L..MYANMAR VOWEL SIGN VOCALIC LL 105A..105D ; Alphabetic # Lo [4] MYANMAR LETTER MON NGA..MYANMAR LETTER MON BBE 105E..1060 ; Alphabetic # Mn [3] MYANMAR CONSONANT SIGN MON MEDIAL NA..MYANMAR CONSONANT SIGN MON MEDIAL LA 1061 ; Alphabetic # Lo MYANMAR LETTER SGAW KAREN SHA 1062..1064 ; Alphabetic # Mc [3] MYANMAR VOWEL SIGN SGAW KAREN EU..MYANMAR TONE MARK SGAW KAREN KE PHO 1065..1066 ; Alphabetic # Lo [2] MYANMAR LETTER WESTERN PWO KAREN THA..MYANMAR LETTER WESTERN PWO KAREN PWA 1067..106D ; Alphabetic # Mc [7] MYANMAR VOWEL SIGN WESTERN PWO KAREN EU..MYANMAR SIGN WESTERN PWO KAREN TONE-5 106E..1070 ; Alphabetic # Lo [3] MYANMAR LETTER EASTERN PWO KAREN NNA..MYANMAR LETTER EASTERN PWO KAREN GHWA 1071..1074 ; Alphabetic # Mn [4] MYANMAR VOWEL SIGN GEBA KAREN I..MYANMAR VOWEL SIGN KAYAH EE 1075..1081 ; Alphabetic # Lo [13] MYANMAR LETTER SHAN KA..MYANMAR LETTER SHAN HA 1082 ; Alphabetic # Mn MYANMAR CONSONANT SIGN SHAN MEDIAL WA 1083..1084 ; Alphabetic # Mc [2] MYANMAR VOWEL SIGN SHAN AA..MYANMAR VOWEL SIGN SHAN E 1085..1086 ; Alphabetic # Mn [2] MYANMAR VOWEL SIGN SHAN E ABOVE..MYANMAR VOWEL SIGN SHAN FINAL Y 1087..108C ; Alphabetic # Mc [6] MYANMAR SIGN SHAN TONE-2..MYANMAR SIGN SHAN COUNCIL TONE-3 108D ; Alphabetic # Mn MYANMAR SIGN SHAN COUNCIL EMPHATIC TONE 108E ; Alphabetic # Lo MYANMAR LETTER RUMAI PALAUNG FA 108F ; Alphabetic # Mc MYANMAR SIGN RUMAI PALAUNG TONE-5 109A..109C ; Alphabetic # Mc [3] MYANMAR SIGN KHAMTI TONE-1..MYANMAR VOWEL SIGN AITON A 109D ; Alphabetic # Mn MYANMAR VOWEL SIGN AITON AI 10A0..10C5 ; Alphabetic # L& [38] GEORGIAN CAPITAL LETTER AN..GEORGIAN CAPITAL LETTER HOE 10C7 ; Alphabetic # L& GEORGIAN CAPITAL LETTER YN 10CD ; Alphabetic # L& GEORGIAN CAPITAL LETTER AEN 10D0..10FA ; 
Alphabetic # L& [43] GEORGIAN LETTER AN..GEORGIAN LETTER AIN 10FC ; Alphabetic # Lm MODIFIER LETTER GEORGIAN NAR 10FD..10FF ; Alphabetic # L& [3] GEORGIAN LETTER AEN..GEORGIAN LETTER LABIAL SIGN 1100..1248 ; Alphabetic # Lo [329] HANGUL CHOSEONG KIYEOK..ETHIOPIC SYLLABLE QWA 124A..124D ; Alphabetic # Lo [4] ETHIOPIC SYLLABLE QWI..ETHIOPIC SYLLABLE QWE 1250..1256 ; Alphabetic # Lo [7] ETHIOPIC SYLLABLE QHA..ETHIOPIC SYLLABLE QHO 1258 ; Alphabetic # Lo ETHIOPIC SYLLABLE QHWA 125A..125D ; Alphabetic # Lo [4] ETHIOPIC SYLLABLE QHWI..ETHIOPIC SYLLABLE QHWE 1260..1288 ; Alphabetic # Lo [41] ETHIOPIC SYLLABLE BA..ETHIOPIC SYLLABLE XWA 128A..128D ; Alphabetic # Lo [4] ETHIOPIC SYLLABLE XWI..ETHIOPIC SYLLABLE XWE 1290..12B0 ; Alphabetic # Lo [33] ETHIOPIC SYLLABLE NA..ETHIOPIC SYLLABLE KWA 12B2..12B5 ; Alphabetic # Lo [4] ETHIOPIC SYLLABLE KWI..ETHIOPIC SYLLABLE KWE 12B8..12BE ; Alphabetic # Lo [7] ETHIOPIC SYLLABLE KXA..ETHIOPIC SYLLABLE KXO 12C0 ; Alphabetic # Lo ETHIOPIC SYLLABLE KXWA 12C2..12C5 ; Alphabetic # Lo [4] ETHIOPIC SYLLABLE KXWI..ETHIOPIC SYLLABLE KXWE 12C8..12D6 ; Alphabetic # Lo [15] ETHIOPIC SYLLABLE WA..ETHIOPIC SYLLABLE PHARYNGEAL O 12D8..1310 ; Alphabetic # Lo [57] ETHIOPIC SYLLABLE ZA..ETHIOPIC SYLLABLE GWA 1312..1315 ; Alphabetic # Lo [4] ETHIOPIC SYLLABLE GWI..ETHIOPIC SYLLABLE GWE 1318..135A ; Alphabetic # Lo [67] ETHIOPIC SYLLABLE GGA..ETHIOPIC SYLLABLE FYA 1380..138F ; Alphabetic # Lo [16] ETHIOPIC SYLLABLE SEBATBEIT MWA..ETHIOPIC SYLLABLE PWE 13A0..13F5 ; Alphabetic # L& [86] CHEROKEE LETTER A..CHEROKEE LETTER MV 13F8..13FD ; Alphabetic # L& [6] CHEROKEE SMALL LETTER YE..CHEROKEE SMALL LETTER MV 1401..166C ; Alphabetic # Lo [620] CANADIAN SYLLABICS E..CANADIAN SYLLABICS CARRIER TTSA 166F..167F ; Alphabetic # Lo [17] CANADIAN SYLLABICS QAI..CANADIAN SYLLABICS BLACKFOOT W 1681..169A ; Alphabetic # Lo [26] OGHAM LETTER BEITH..OGHAM LETTER PEITH 16A0..16EA ; Alphabetic # Lo [75] RUNIC LETTER FEHU FEOH FE F..RUNIC LETTER X 16EE..16F0 ; Alphabetic # Nl 
[3] RUNIC ARLAUG SYMBOL..RUNIC BELGTHOR SYMBOL 16F1..16F8 ; Alphabetic # Lo [8] RUNIC LETTER K..RUNIC LETTER FRANKS CASKET AESC 1700..1711 ; Alphabetic # Lo [18] TAGALOG LETTER A..TAGALOG LETTER HA 1712..1713 ; Alphabetic # Mn [2] TAGALOG VOWEL SIGN I..TAGALOG VOWEL SIGN U 171F..1731 ; Alphabetic # Lo [19] TAGALOG LETTER ARCHAIC RA..HANUNOO LETTER HA 1732..1733 ; Alphabetic # Mn [2] HANUNOO VOWEL SIGN I..HANUNOO VOWEL SIGN U 1740..1751 ; Alphabetic # Lo [18] BUHID LETTER A..BUHID LETTER HA 1752..1753 ; Alphabetic # Mn [2] BUHID VOWEL SIGN I..BUHID VOWEL SIGN U 1760..176C ; Alphabetic # Lo [13] TAGBANWA LETTER A..TAGBANWA LETTER YA 176E..1770 ; Alphabetic # Lo [3] TAGBANWA LETTER LA..TAGBANWA LETTER SA 1772..1773 ; Alphabetic # Mn [2] TAGBANWA VOWEL SIGN I..TAGBANWA VOWEL SIGN U 1780..17B3 ; Alphabetic # Lo [52] KHMER LETTER KA..KHMER INDEPENDENT VOWEL QAU 17B6 ; Alphabetic # Mc KHMER VOWEL SIGN AA 17B7..17BD ; Alphabetic # Mn [7] KHMER VOWEL SIGN I..KHMER VOWEL SIGN UA 17BE..17C5 ; Alphabetic # Mc [8] KHMER VOWEL SIGN OE..KHMER VOWEL SIGN AU 17C6 ; Alphabetic # Mn KHMER SIGN NIKAHIT 17C7..17C8 ; Alphabetic # Mc [2] KHMER SIGN REAHMUK..KHMER SIGN YUUKALEAPINTU 17D7 ; Alphabetic # Lm KHMER SIGN LEK TOO 17DC ; Alphabetic # Lo KHMER SIGN AVAKRAHASANYA 1820..1842 ; Alphabetic # Lo [35] MONGOLIAN LETTER A..MONGOLIAN LETTER CHI 1843 ; Alphabetic # Lm MONGOLIAN LETTER TODO LONG VOWEL SIGN 1844..1878 ; Alphabetic # Lo [53] MONGOLIAN LETTER TODO E..MONGOLIAN LETTER CHA WITH TWO DOTS 1880..1884 ; Alphabetic # Lo [5] MONGOLIAN LETTER ALI GALI ANUSVARA ONE..MONGOLIAN LETTER ALI GALI INVERTED UBADAMA 1885..1886 ; Alphabetic # Mn [2] MONGOLIAN LETTER ALI GALI BALUDA..MONGOLIAN LETTER ALI GALI THREE BALUDA 1887..18A8 ; Alphabetic # Lo [34] MONGOLIAN LETTER ALI GALI A..MONGOLIAN LETTER MANCHU ALI GALI BHA 18A9 ; Alphabetic # Mn MONGOLIAN LETTER ALI GALI DAGALGA 18AA ; Alphabetic # Lo MONGOLIAN LETTER MANCHU ALI GALI LHA 18B0..18F5 ; Alphabetic # Lo [70] CANADIAN SYLLABICS 
OY..CANADIAN SYLLABICS CARRIER DENTAL S 1900..191E ; Alphabetic # Lo [31] LIMBU VOWEL-CARRIER LETTER..LIMBU LETTER TRA 1920..1922 ; Alphabetic # Mn [3] LIMBU VOWEL SIGN A..LIMBU VOWEL SIGN U 1923..1926 ; Alphabetic # Mc [4] LIMBU VOWEL SIGN EE..LIMBU VOWEL SIGN AU 1927..1928 ; Alphabetic # Mn [2] LIMBU VOWEL SIGN E..LIMBU VOWEL SIGN O 1929..192B ; Alphabetic # Mc [3] LIMBU SUBJOINED LETTER YA..LIMBU SUBJOINED LETTER WA 1930..1931 ; Alphabetic # Mc [2] LIMBU SMALL LETTER KA..LIMBU SMALL LETTER NGA 1932 ; Alphabetic # Mn LIMBU SMALL LETTER ANUSVARA 1933..1938 ; Alphabetic # Mc [6] LIMBU SMALL LETTER TA..LIMBU SMALL LETTER LA 1950..196D ; Alphabetic # Lo [30] TAI LE LETTER KA..TAI LE LETTER AI 1970..1974 ; Alphabetic # Lo [5] TAI LE LETTER TONE-2..TAI LE LETTER TONE-6 1980..19AB ; Alphabetic # Lo [44] NEW TAI LUE LETTER HIGH QA..NEW TAI LUE LETTER LOW SUA 19B0..19C9 ; Alphabetic # Lo [26] NEW TAI LUE VOWEL SIGN VOWEL SHORTENER..NEW TAI LUE TONE MARK-2 1A00..1A16 ; Alphabetic # Lo [23] BUGINESE LETTER KA..BUGINESE LETTER HA 1A17..1A18 ; Alphabetic # Mn [2] BUGINESE VOWEL SIGN I..BUGINESE VOWEL SIGN U 1A19..1A1A ; Alphabetic # Mc [2] BUGINESE VOWEL SIGN E..BUGINESE VOWEL SIGN O 1A1B ; Alphabetic # Mn BUGINESE VOWEL SIGN AE 1A20..1A54 ; Alphabetic # Lo [53] TAI THAM LETTER HIGH KA..TAI THAM LETTER GREAT SA 1A55 ; Alphabetic # Mc TAI THAM CONSONANT SIGN MEDIAL RA 1A56 ; Alphabetic # Mn TAI THAM CONSONANT SIGN MEDIAL LA 1A57 ; Alphabetic # Mc TAI THAM CONSONANT SIGN LA TANG LAI 1A58..1A5E ; Alphabetic # Mn [7] TAI THAM SIGN MAI KANG LAI..TAI THAM CONSONANT SIGN SA 1A61 ; Alphabetic # Mc TAI THAM VOWEL SIGN A 1A62 ; Alphabetic # Mn TAI THAM VOWEL SIGN MAI SAT 1A63..1A64 ; Alphabetic # Mc [2] TAI THAM VOWEL SIGN AA..TAI THAM VOWEL SIGN TALL AA 1A65..1A6C ; Alphabetic # Mn [8] TAI THAM VOWEL SIGN I..TAI THAM VOWEL SIGN OA BELOW 1A6D..1A72 ; Alphabetic # Mc [6] TAI THAM VOWEL SIGN OY..TAI THAM VOWEL SIGN THAM AI 1A73..1A74 ; Alphabetic # Mn [2] TAI THAM VOWEL SIGN OA 
ABOVE..TAI THAM SIGN MAI KANG 1AA7 ; Alphabetic # Lm TAI THAM SIGN MAI YAMOK 1ABF..1AC0 ; Alphabetic # Mn [2] COMBINING LATIN SMALL LETTER W BELOW..COMBINING LATIN SMALL LETTER TURNED W BELOW 1ACC..1ACE ; Alphabetic # Mn [3] COMBINING LATIN SMALL LETTER INSULAR G..COMBINING LATIN SMALL LETTER INSULAR T 1B00..1B03 ; Alphabetic # Mn [4] BALINESE SIGN ULU RICEM..BALINESE SIGN SURANG 1B04 ; Alphabetic # Mc BALINESE SIGN BISAH 1B05..1B33 ; Alphabetic # Lo [47] BALINESE LETTER AKARA..BALINESE LETTER HA 1B35 ; Alphabetic # Mc BALINESE VOWEL SIGN TEDUNG 1B36..1B3A ; Alphabetic # Mn [5] BALINESE VOWEL SIGN ULU..BALINESE VOWEL SIGN RA REPA 1B3B ; Alphabetic # Mc BALINESE VOWEL SIGN RA REPA TEDUNG 1B3C ; Alphabetic # Mn BALINESE VOWEL SIGN LA LENGA 1B3D..1B41 ; Alphabetic # Mc [5] BALINESE VOWEL SIGN LA LENGA TEDUNG..BALINESE VOWEL SIGN TALING REPA TEDUNG 1B42 ; Alphabetic # Mn BALINESE VOWEL SIGN PEPET 1B43 ; Alphabetic # Mc BALINESE VOWEL SIGN PEPET TEDUNG 1B45..1B4C ; Alphabetic # Lo [8] BALINESE LETTER KAF SASAK..BALINESE LETTER ARCHAIC JNYA 1B80..1B81 ; Alphabetic # Mn [2] SUNDANESE SIGN PANYECEK..SUNDANESE SIGN PANGLAYAR 1B82 ; Alphabetic # Mc SUNDANESE SIGN PANGWISAD 1B83..1BA0 ; Alphabetic # Lo [30] SUNDANESE LETTER A..SUNDANESE LETTER HA 1BA1 ; Alphabetic # Mc SUNDANESE CONSONANT SIGN PAMINGKAL 1BA2..1BA5 ; Alphabetic # Mn [4] SUNDANESE CONSONANT SIGN PANYAKRA..SUNDANESE VOWEL SIGN PANYUKU 1BA6..1BA7 ; Alphabetic # Mc [2] SUNDANESE VOWEL SIGN PANAELAENG..SUNDANESE VOWEL SIGN PANOLONG 1BA8..1BA9 ; Alphabetic # Mn [2] SUNDANESE VOWEL SIGN PAMEPET..SUNDANESE VOWEL SIGN PANEULEUNG 1BAC..1BAD ; Alphabetic # Mn [2] SUNDANESE CONSONANT SIGN PASANGAN MA..SUNDANESE CONSONANT SIGN PASANGAN WA 1BAE..1BAF ; Alphabetic # Lo [2] SUNDANESE LETTER KHA..SUNDANESE LETTER SYA 1BBA..1BE5 ; Alphabetic # Lo [44] SUNDANESE AVAGRAHA..BATAK LETTER U 1BE7 ; Alphabetic # Mc BATAK VOWEL SIGN E 1BE8..1BE9 ; Alphabetic # Mn [2] BATAK VOWEL SIGN PAKPAK E..BATAK VOWEL SIGN EE 1BEA..1BEC ; 
Alphabetic # Mc [3] BATAK VOWEL SIGN I..BATAK VOWEL SIGN O 1BED ; Alphabetic # Mn BATAK VOWEL SIGN KARO O 1BEE ; Alphabetic # Mc BATAK VOWEL SIGN U 1BEF..1BF1 ; Alphabetic # Mn [3] BATAK VOWEL SIGN U FOR SIMALUNGUN SA..BATAK CONSONANT SIGN H 1C00..1C23 ; Alphabetic # Lo [36] LEPCHA LETTER KA..LEPCHA LETTER A 1C24..1C2B ; Alphabetic # Mc [8] LEPCHA SUBJOINED LETTER YA..LEPCHA VOWEL SIGN UU 1C2C..1C33 ; Alphabetic # Mn [8] LEPCHA VOWEL SIGN E..LEPCHA CONSONANT SIGN T 1C34..1C35 ; Alphabetic # Mc [2] LEPCHA CONSONANT SIGN NYIN-DO..LEPCHA CONSONANT SIGN KANG 1C36 ; Alphabetic # Mn LEPCHA SIGN RAN 1C4D..1C4F ; Alphabetic # Lo [3] LEPCHA LETTER TTA..LEPCHA LETTER DDA 1C5A..1C77 ; Alphabetic # Lo [30] OL CHIKI LETTER LA..OL CHIKI LETTER OH 1C78..1C7D ; Alphabetic # Lm [6] OL CHIKI MU TTUDDAG..OL CHIKI AHAD 1C80..1C8A ; Alphabetic # L& [11] CYRILLIC SMALL LETTER ROUNDED VE..CYRILLIC SMALL LETTER TJE 1C90..1CBA ; Alphabetic # L& [43] GEORGIAN MTAVRULI CAPITAL LETTER AN..GEORGIAN MTAVRULI CAPITAL LETTER AIN 1CBD..1CBF ; Alphabetic # L& [3] GEORGIAN MTAVRULI CAPITAL LETTER AEN..GEORGIAN MTAVRULI CAPITAL LETTER LABIAL SIGN 1CE9..1CEC ; Alphabetic # Lo [4] VEDIC SIGN ANUSVARA ANTARGOMUKHA..VEDIC SIGN ANUSVARA VAMAGOMUKHA WITH TAIL 1CEE..1CF3 ; Alphabetic # Lo [6] VEDIC SIGN HEXIFORM LONG ANUSVARA..VEDIC SIGN ROTATED ARDHAVISARGA 1CF5..1CF6 ; Alphabetic # Lo [2] VEDIC SIGN JIHVAMULIYA..VEDIC SIGN UPADHMANIYA 1CFA ; Alphabetic # Lo VEDIC SIGN DOUBLE ANUSVARA ANTARGOMUKHA 1D00..1D2B ; Alphabetic # L& [44] LATIN LETTER SMALL CAPITAL A..CYRILLIC LETTER SMALL CAPITAL EL 1D2C..1D6A ; Alphabetic # Lm [63] MODIFIER LETTER CAPITAL A..GREEK SUBSCRIPT SMALL LETTER CHI 1D6B..1D77 ; Alphabetic # L& [13] LATIN SMALL LETTER UE..LATIN SMALL LETTER TURNED G 1D78 ; Alphabetic # Lm MODIFIER LETTER CYRILLIC EN 1D79..1D9A ; Alphabetic # L& [34] LATIN SMALL LETTER INSULAR G..LATIN SMALL LETTER EZH WITH RETROFLEX HOOK 1D9B..1DBF ; Alphabetic # Lm [37] MODIFIER LETTER SMALL TURNED ALPHA..MODIFIER 
LETTER SMALL THETA 1DD3..1DF4 ; Alphabetic # Mn [34] COMBINING LATIN SMALL LETTER FLATTENED OPEN A ABOVE..COMBINING LATIN SMALL LETTER U WITH DIAERESIS 1E00..1F15 ; Alphabetic # L& [278] LATIN CAPITAL LETTER A WITH RING BELOW..GREEK SMALL LETTER EPSILON WITH DASIA AND OXIA 1F18..1F1D ; Alphabetic # L& [6] GREEK CAPITAL LETTER EPSILON WITH PSILI..GREEK CAPITAL LETTER EPSILON WITH DASIA AND OXIA 1F20..1F45 ; Alphabetic # L& [38] GREEK SMALL LETTER ETA WITH PSILI..GREEK SMALL LETTER OMICRON WITH DASIA AND OXIA 1F48..1F4D ; Alphabetic # L& [6] GREEK CAPITAL LETTER OMICRON WITH PSILI..GREEK CAPITAL LETTER OMICRON WITH DASIA AND OXIA 1F50..1F57 ; Alphabetic # L& [8] GREEK SMALL LETTER UPSILON WITH PSILI..GREEK SMALL LETTER UPSILON WITH DASIA AND PERISPOMENI 1F59 ; Alphabetic # L& GREEK CAPITAL LETTER UPSILON WITH DASIA 1F5B ; Alphabetic # L& GREEK CAPITAL LETTER UPSILON WITH DASIA AND VARIA 1F5D ; Alphabetic # L& GREEK CAPITAL LETTER UPSILON WITH DASIA AND OXIA 1F5F..1F7D ; Alphabetic # L& [31] GREEK CAPITAL LETTER UPSILON WITH DASIA AND PERISPOMENI..GREEK SMALL LETTER OMEGA WITH OXIA 1F80..1FB4 ; Alphabetic # L& [53] GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI..GREEK SMALL LETTER ALPHA WITH OXIA AND YPOGEGRAMMENI 1FB6..1FBC ; Alphabetic # L& [7] GREEK SMALL LETTER ALPHA WITH PERISPOMENI..GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI 1FBE ; Alphabetic # L& GREEK PROSGEGRAMMENI 1FC2..1FC4 ; Alphabetic # L& [3] GREEK SMALL LETTER ETA WITH VARIA AND YPOGEGRAMMENI..GREEK SMALL LETTER ETA WITH OXIA AND YPOGEGRAMMENI 1FC6..1FCC ; Alphabetic # L& [7] GREEK SMALL LETTER ETA WITH PERISPOMENI..GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI 1FD0..1FD3 ; Alphabetic # L& [4] GREEK SMALL LETTER IOTA WITH VRACHY..GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA 1FD6..1FDB ; Alphabetic # L& [6] GREEK SMALL LETTER IOTA WITH PERISPOMENI..GREEK CAPITAL LETTER IOTA WITH OXIA 1FE0..1FEC ; Alphabetic # L& [13] GREEK SMALL LETTER UPSILON WITH VRACHY..GREEK CAPITAL LETTER RHO WITH DASIA 
1FF2..1FF4 ; Alphabetic # L& [3] GREEK SMALL LETTER OMEGA WITH VARIA AND YPOGEGRAMMENI..GREEK SMALL LETTER OMEGA WITH OXIA AND YPOGEGRAMMENI 1FF6..1FFC ; Alphabetic # L& [7] GREEK SMALL LETTER OMEGA WITH PERISPOMENI..GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI 2071 ; Alphabetic # Lm SUPERSCRIPT LATIN SMALL LETTER I 207F ; Alphabetic # Lm SUPERSCRIPT LATIN SMALL LETTER N 2090..209C ; Alphabetic # Lm [13] LATIN SUBSCRIPT SMALL LETTER A..LATIN SUBSCRIPT SMALL LETTER T 2102 ; Alphabetic # L& DOUBLE-STRUCK CAPITAL C 2107 ; Alphabetic # L& EULER CONSTANT 210A..2113 ; Alphabetic # L& [10] SCRIPT SMALL G..SCRIPT SMALL L 2115 ; Alphabetic # L& DOUBLE-STRUCK CAPITAL N 2119..211D ; Alphabetic # L& [5] DOUBLE-STRUCK CAPITAL P..DOUBLE-STRUCK CAPITAL R 2124 ; Alphabetic # L& DOUBLE-STRUCK CAPITAL Z 2126 ; Alphabetic # L& OHM SIGN 2128 ; Alphabetic # L& BLACK-LETTER CAPITAL Z 212A..212D ; Alphabetic # L& [4] KELVIN SIGN..BLACK-LETTER CAPITAL C 212F..2134 ; Alphabetic # L& [6] SCRIPT SMALL E..SCRIPT SMALL O 2135..2138 ; Alphabetic # Lo [4] ALEF SYMBOL..DALET SYMBOL 2139 ; Alphabetic # L& INFORMATION SOURCE 213C..213F ; Alphabetic # L& [4] DOUBLE-STRUCK SMALL PI..DOUBLE-STRUCK CAPITAL PI 2145..2149 ; Alphabetic # L& [5] DOUBLE-STRUCK ITALIC CAPITAL D..DOUBLE-STRUCK ITALIC SMALL J 214E ; Alphabetic # L& TURNED SMALL F 2160..2182 ; Alphabetic # Nl [35] ROMAN NUMERAL ONE..ROMAN NUMERAL TEN THOUSAND 2183..2184 ; Alphabetic # L& [2] ROMAN NUMERAL REVERSED ONE HUNDRED..LATIN SMALL LETTER REVERSED C 2185..2188 ; Alphabetic # Nl [4] ROMAN NUMERAL SIX LATE FORM..ROMAN NUMERAL ONE HUNDRED THOUSAND 24B6..24E9 ; Alphabetic # So [52] CIRCLED LATIN CAPITAL LETTER A..CIRCLED LATIN SMALL LETTER Z 2C00..2C7B ; Alphabetic # L& [124] GLAGOLITIC CAPITAL LETTER AZU..LATIN LETTER SMALL CAPITAL TURNED E 2C7C..2C7D ; Alphabetic # Lm [2] LATIN SUBSCRIPT SMALL LETTER J..MODIFIER LETTER CAPITAL V 2C7E..2CE4 ; Alphabetic # L& [103] LATIN CAPITAL LETTER S WITH SWASH TAIL..COPTIC SYMBOL KAI 2CEB..2CEE ; 
Alphabetic # L& [4] COPTIC CAPITAL LETTER CRYPTOGRAMMIC SHEI..COPTIC SMALL LETTER CRYPTOGRAMMIC GANGIA 2CF2..2CF3 ; Alphabetic # L& [2] COPTIC CAPITAL LETTER BOHAIRIC KHEI..COPTIC SMALL LETTER BOHAIRIC KHEI 2D00..2D25 ; Alphabetic # L& [38] GEORGIAN SMALL LETTER AN..GEORGIAN SMALL LETTER HOE 2D27 ; Alphabetic # L& GEORGIAN SMALL LETTER YN 2D2D ; Alphabetic # L& GEORGIAN SMALL LETTER AEN 2D30..2D67 ; Alphabetic # Lo [56] TIFINAGH LETTER YA..TIFINAGH LETTER YO 2D6F ; Alphabetic # Lm TIFINAGH MODIFIER LETTER LABIALIZATION MARK 2D80..2D96 ; Alphabetic # Lo [23] ETHIOPIC SYLLABLE LOA..ETHIOPIC SYLLABLE GGWE 2DA0..2DA6 ; Alphabetic # Lo [7] ETHIOPIC SYLLABLE SSA..ETHIOPIC SYLLABLE SSO 2DA8..2DAE ; Alphabetic # Lo [7] ETHIOPIC SYLLABLE CCA..ETHIOPIC SYLLABLE CCO 2DB0..2DB6 ; Alphabetic # Lo [7] ETHIOPIC SYLLABLE ZZA..ETHIOPIC SYLLABLE ZZO 2DB8..2DBE ; Alphabetic # Lo [7] ETHIOPIC SYLLABLE CCHA..ETHIOPIC SYLLABLE CCHO 2DC0..2DC6 ; Alphabetic # Lo [7] ETHIOPIC SYLLABLE QYA..ETHIOPIC SYLLABLE QYO 2DC8..2DCE ; Alphabetic # Lo [7] ETHIOPIC SYLLABLE KYA..ETHIOPIC SYLLABLE KYO 2DD0..2DD6 ; Alphabetic # Lo [7] ETHIOPIC SYLLABLE XYA..ETHIOPIC SYLLABLE XYO 2DD8..2DDE ; Alphabetic # Lo [7] ETHIOPIC SYLLABLE GYA..ETHIOPIC SYLLABLE GYO 2DE0..2DFF ; Alphabetic # Mn [32] COMBINING CYRILLIC LETTER BE..COMBINING CYRILLIC LETTER IOTIFIED BIG YUS 2E2F ; Alphabetic # Lm VERTICAL TILDE 3005 ; Alphabetic # Lm IDEOGRAPHIC ITERATION MARK 3006 ; Alphabetic # Lo IDEOGRAPHIC CLOSING MARK 3007 ; Alphabetic # Nl IDEOGRAPHIC NUMBER ZERO 3021..3029 ; Alphabetic # Nl [9] HANGZHOU NUMERAL ONE..HANGZHOU NUMERAL NINE 3031..3035 ; Alphabetic # Lm [5] VERTICAL KANA REPEAT MARK..VERTICAL KANA REPEAT MARK LOWER HALF 3038..303A ; Alphabetic # Nl [3] HANGZHOU NUMERAL TEN..HANGZHOU NUMERAL THIRTY 303B ; Alphabetic # Lm VERTICAL IDEOGRAPHIC ITERATION MARK 303C ; Alphabetic # Lo MASU MARK 3041..3096 ; Alphabetic # Lo [86] HIRAGANA LETTER SMALL A..HIRAGANA LETTER SMALL KE 309D..309E ; Alphabetic # Lm [2] HIRAGANA 
ITERATION MARK..HIRAGANA VOICED ITERATION MARK 309F ; Alphabetic # Lo HIRAGANA DIGRAPH YORI 30A1..30FA ; Alphabetic # Lo [90] KATAKANA LETTER SMALL A..KATAKANA LETTER VO 30FC..30FE ; Alphabetic # Lm [3] KATAKANA-HIRAGANA PROLONGED SOUND MARK..KATAKANA VOICED ITERATION MARK 30FF ; Alphabetic # Lo KATAKANA DIGRAPH KOTO 3105..312F ; Alphabetic # Lo [43] BOPOMOFO LETTER B..BOPOMOFO LETTER NN 3131..318E ; Alphabetic # Lo [94] HANGUL LETTER KIYEOK..HANGUL LETTER ARAEAE 31A0..31BF ; Alphabetic # Lo [32] BOPOMOFO LETTER BU..BOPOMOFO LETTER AH 31F0..31FF ; Alphabetic # Lo [16] KATAKANA LETTER SMALL KU..KATAKANA LETTER SMALL RO 3400..4DBF ; Alphabetic # Lo [6592] CJK UNIFIED IDEOGRAPH-3400..CJK UNIFIED IDEOGRAPH-4DBF 4E00..A014 ; Alphabetic # Lo [21013] CJK UNIFIED IDEOGRAPH-4E00..YI SYLLABLE E A015 ; Alphabetic # Lm YI SYLLABLE WU A016..A48C ; Alphabetic # Lo [1143] YI SYLLABLE BIT..YI SYLLABLE YYR A4D0..A4F7 ; Alphabetic # Lo [40] LISU LETTER BA..LISU LETTER OE A4F8..A4FD ; Alphabetic # Lm [6] LISU LETTER TONE MYA TI..LISU LETTER TONE MYA JEU A500..A60B ; Alphabetic # Lo [268] VAI SYLLABLE EE..VAI SYLLABLE NG A60C ; Alphabetic # Lm VAI SYLLABLE LENGTHENER A610..A61F ; Alphabetic # Lo [16] VAI SYLLABLE NDOLE FA..VAI SYMBOL JONG A62A..A62B ; Alphabetic # Lo [2] VAI SYLLABLE NDOLE MA..VAI SYLLABLE NDOLE DO A640..A66D ; Alphabetic # L& [46] CYRILLIC CAPITAL LETTER ZEMLYA..CYRILLIC SMALL LETTER DOUBLE MONOCULAR O A66E ; Alphabetic # Lo CYRILLIC LETTER MULTIOCULAR O A674..A67B ; Alphabetic # Mn [8] COMBINING CYRILLIC LETTER UKRAINIAN IE..COMBINING CYRILLIC LETTER OMEGA A67F ; Alphabetic # Lm CYRILLIC PAYEROK A680..A69B ; Alphabetic # L& [28] CYRILLIC CAPITAL LETTER DWE..CYRILLIC SMALL LETTER CROSSED O A69C..A69D ; Alphabetic # Lm [2] MODIFIER LETTER CYRILLIC HARD SIGN..MODIFIER LETTER CYRILLIC SOFT SIGN A69E..A69F ; Alphabetic # Mn [2] COMBINING CYRILLIC LETTER EF..COMBINING CYRILLIC LETTER IOTIFIED E A6A0..A6E5 ; Alphabetic # Lo [70] BAMUM LETTER A..BAMUM LETTER KI A6E6..A6EF ; 
Alphabetic # Nl [10] BAMUM LETTER MO..BAMUM LETTER KOGHOM A717..A71F ; Alphabetic # Lm [9] MODIFIER LETTER DOT VERTICAL BAR..MODIFIER LETTER LOW INVERTED EXCLAMATION MARK A722..A76F ; Alphabetic # L& [78] LATIN CAPITAL LETTER EGYPTOLOGICAL ALEF..LATIN SMALL LETTER CON A770 ; Alphabetic # Lm MODIFIER LETTER US A771..A787 ; Alphabetic # L& [23] LATIN SMALL LETTER DUM..LATIN SMALL LETTER INSULAR T A788 ; Alphabetic # Lm MODIFIER LETTER LOW CIRCUMFLEX ACCENT A78B..A78E ; Alphabetic # L& [4] LATIN CAPITAL LETTER SALTILLO..LATIN SMALL LETTER L WITH RETROFLEX HOOK AND BELT A78F ; Alphabetic # Lo LATIN LETTER SINOLOGICAL DOT A790..A7DC ; Alphabetic # L& [77] LATIN CAPITAL LETTER N WITH DESCENDER..LATIN CAPITAL LETTER LAMBDA WITH STROKE A7F1..A7F4 ; Alphabetic # Lm [4] MODIFIER LETTER CAPITAL S..MODIFIER LETTER CAPITAL Q A7F5..A7F6 ; Alphabetic # L& [2] LATIN CAPITAL LETTER REVERSED HALF H..LATIN SMALL LETTER REVERSED HALF H A7F7 ; Alphabetic # Lo LATIN EPIGRAPHIC LETTER SIDEWAYS I A7F8..A7F9 ; Alphabetic # Lm [2] MODIFIER LETTER CAPITAL H WITH STROKE..MODIFIER LETTER SMALL LIGATURE OE A7FA ; Alphabetic # L& LATIN LETTER SMALL CAPITAL TURNED M A7FB..A801 ; Alphabetic # Lo [7] LATIN EPIGRAPHIC LETTER REVERSED F..SYLOTI NAGRI LETTER I A802 ; Alphabetic # Mn SYLOTI NAGRI SIGN DVISVARA A803..A805 ; Alphabetic # Lo [3] SYLOTI NAGRI LETTER U..SYLOTI NAGRI LETTER O A807..A80A ; Alphabetic # Lo [4] SYLOTI NAGRI LETTER KO..SYLOTI NAGRI LETTER GHO A80B ; Alphabetic # Mn SYLOTI NAGRI SIGN ANUSVARA A80C..A822 ; Alphabetic # Lo [23] SYLOTI NAGRI LETTER CO..SYLOTI NAGRI LETTER HO A823..A824 ; Alphabetic # Mc [2] SYLOTI NAGRI VOWEL SIGN A..SYLOTI NAGRI VOWEL SIGN I A825..A826 ; Alphabetic # Mn [2] SYLOTI NAGRI VOWEL SIGN U..SYLOTI NAGRI VOWEL SIGN E A827 ; Alphabetic # Mc SYLOTI NAGRI VOWEL SIGN OO A840..A873 ; Alphabetic # Lo [52] PHAGS-PA LETTER KA..PHAGS-PA LETTER CANDRABINDU A880..A881 ; Alphabetic # Mc [2] SAURASHTRA SIGN ANUSVARA..SAURASHTRA SIGN VISARGA A882..A8B3 ; Alphabetic # Lo 
[50] SAURASHTRA LETTER A..SAURASHTRA LETTER LLA A8B4..A8C3 ; Alphabetic # Mc [16] SAURASHTRA CONSONANT SIGN HAARU..SAURASHTRA VOWEL SIGN AU A8C5 ; Alphabetic # Mn SAURASHTRA SIGN CANDRABINDU A8F2..A8F7 ; Alphabetic # Lo [6] DEVANAGARI SIGN SPACING CANDRABINDU..DEVANAGARI SIGN CANDRABINDU AVAGRAHA A8FB ; Alphabetic # Lo DEVANAGARI HEADSTROKE A8FD..A8FE ; Alphabetic # Lo [2] DEVANAGARI JAIN OM..DEVANAGARI LETTER AY A8FF ; Alphabetic # Mn DEVANAGARI VOWEL SIGN AY A90A..A925 ; Alphabetic # Lo [28] KAYAH LI LETTER KA..KAYAH LI LETTER OO A926..A92A ; Alphabetic # Mn [5] KAYAH LI VOWEL UE..KAYAH LI VOWEL O A930..A946 ; Alphabetic # Lo [23] REJANG LETTER KA..REJANG LETTER A A947..A951 ; Alphabetic # Mn [11] REJANG VOWEL SIGN I..REJANG CONSONANT SIGN R A952 ; Alphabetic # Mc REJANG CONSONANT SIGN H A960..A97C ; Alphabetic # Lo [29] HANGUL CHOSEONG TIKEUT-MIEUM..HANGUL CHOSEONG SSANGYEORINHIEUH A980..A982 ; Alphabetic # Mn [3] JAVANESE SIGN PANYANGGA..JAVANESE SIGN LAYAR A983 ; Alphabetic # Mc JAVANESE SIGN WIGNYAN A984..A9B2 ; Alphabetic # Lo [47] JAVANESE LETTER A..JAVANESE LETTER HA A9B4..A9B5 ; Alphabetic # Mc [2] JAVANESE VOWEL SIGN TARUNG..JAVANESE VOWEL SIGN TOLONG A9B6..A9B9 ; Alphabetic # Mn [4] JAVANESE VOWEL SIGN WULU..JAVANESE VOWEL SIGN SUKU MENDUT A9BA..A9BB ; Alphabetic # Mc [2] JAVANESE VOWEL SIGN TALING..JAVANESE VOWEL SIGN DIRGA MURE A9BC..A9BD ; Alphabetic # Mn [2] JAVANESE VOWEL SIGN PEPET..JAVANESE CONSONANT SIGN KERET A9BE..A9BF ; Alphabetic # Mc [2] JAVANESE CONSONANT SIGN PENGKAL..JAVANESE CONSONANT SIGN CAKRA A9CF ; Alphabetic # Lm JAVANESE PANGRANGKEP A9E0..A9E4 ; Alphabetic # Lo [5] MYANMAR LETTER SHAN GHA..MYANMAR LETTER SHAN BHA A9E5 ; Alphabetic # Mn MYANMAR SIGN SHAN SAW A9E6 ; Alphabetic # Lm MYANMAR MODIFIER LETTER SHAN REDUPLICATION A9E7..A9EF ; Alphabetic # Lo [9] MYANMAR LETTER TAI LAING NYA..MYANMAR LETTER TAI LAING NNA A9FA..A9FE ; Alphabetic # Lo [5] MYANMAR LETTER TAI LAING LLA..MYANMAR LETTER TAI LAING BHA AA00..AA28 ; Alphabetic # Lo 
[41] CHAM LETTER A..CHAM LETTER HA AA29..AA2E ; Alphabetic # Mn [6] CHAM VOWEL SIGN AA..CHAM VOWEL SIGN OE AA2F..AA30 ; Alphabetic # Mc [2] CHAM VOWEL SIGN O..CHAM VOWEL SIGN AI AA31..AA32 ; Alphabetic # Mn [2] CHAM VOWEL SIGN AU..CHAM VOWEL SIGN UE AA33..AA34 ; Alphabetic # Mc [2] CHAM CONSONANT SIGN YA..CHAM CONSONANT SIGN RA AA35..AA36 ; Alphabetic # Mn [2] CHAM CONSONANT SIGN LA..CHAM CONSONANT SIGN WA AA40..AA42 ; Alphabetic # Lo [3] CHAM LETTER FINAL K..CHAM LETTER FINAL NG AA43 ; Alphabetic # Mn CHAM CONSONANT SIGN FINAL NG AA44..AA4B ; Alphabetic # Lo [8] CHAM LETTER FINAL CH..CHAM LETTER FINAL SS AA4C ; Alphabetic # Mn CHAM CONSONANT SIGN FINAL M AA4D ; Alphabetic # Mc CHAM CONSONANT SIGN FINAL H AA60..AA6F ; Alphabetic # Lo [16] MYANMAR LETTER KHAMTI GA..MYANMAR LETTER KHAMTI FA AA70 ; Alphabetic # Lm MYANMAR MODIFIER LETTER KHAMTI REDUPLICATION AA71..AA76 ; Alphabetic # Lo [6] MYANMAR LETTER KHAMTI XA..MYANMAR LOGOGRAM KHAMTI HM AA7A ; Alphabetic # Lo MYANMAR LETTER AITON RA AA7B ; Alphabetic # Mc MYANMAR SIGN PAO KAREN TONE AA7C ; Alphabetic # Mn MYANMAR SIGN TAI LAING TONE-2 AA7D ; Alphabetic # Mc MYANMAR SIGN TAI LAING TONE-5 AA7E..AAAF ; Alphabetic # Lo [50] MYANMAR LETTER SHWE PALAUNG CHA..TAI VIET LETTER HIGH O AAB0 ; Alphabetic # Mn TAI VIET MAI KANG AAB1 ; Alphabetic # Lo TAI VIET VOWEL AA AAB2..AAB4 ; Alphabetic # Mn [3] TAI VIET VOWEL I..TAI VIET VOWEL U AAB5..AAB6 ; Alphabetic # Lo [2] TAI VIET VOWEL E..TAI VIET VOWEL O AAB7..AAB8 ; Alphabetic # Mn [2] TAI VIET MAI KHIT..TAI VIET VOWEL IA AAB9..AABD ; Alphabetic # Lo [5] TAI VIET VOWEL UEA..TAI VIET VOWEL AN AABE ; Alphabetic # Mn TAI VIET VOWEL AM AAC0 ; Alphabetic # Lo TAI VIET TONE MAI NUENG AAC2 ; Alphabetic # Lo TAI VIET TONE MAI SONG AADB..AADC ; Alphabetic # Lo [2] TAI VIET SYMBOL KON..TAI VIET SYMBOL NUENG AADD ; Alphabetic # Lm TAI VIET SYMBOL SAM AAE0..AAEA ; Alphabetic # Lo [11] MEETEI MAYEK LETTER E..MEETEI MAYEK LETTER SSA AAEB ; Alphabetic # Mc MEETEI MAYEK VOWEL SIGN II 
AAEC..AAED ; Alphabetic # Mn [2] MEETEI MAYEK VOWEL SIGN UU..MEETEI MAYEK VOWEL SIGN AAI AAEE..AAEF ; Alphabetic # Mc [2] MEETEI MAYEK VOWEL SIGN AU..MEETEI MAYEK VOWEL SIGN AAU AAF2 ; Alphabetic # Lo MEETEI MAYEK ANJI AAF3..AAF4 ; Alphabetic # Lm [2] MEETEI MAYEK SYLLABLE REPETITION MARK..MEETEI MAYEK WORD REPETITION MARK AAF5 ; Alphabetic # Mc MEETEI MAYEK VOWEL SIGN VISARGA AB01..AB06 ; Alphabetic # Lo [6] ETHIOPIC SYLLABLE TTHU..ETHIOPIC SYLLABLE TTHO AB09..AB0E ; Alphabetic # Lo [6] ETHIOPIC SYLLABLE DDHU..ETHIOPIC SYLLABLE DDHO AB11..AB16 ; Alphabetic # Lo [6] ETHIOPIC SYLLABLE DZU..ETHIOPIC SYLLABLE DZO AB20..AB26 ; Alphabetic # Lo [7] ETHIOPIC SYLLABLE CCHHA..ETHIOPIC SYLLABLE CCHHO AB28..AB2E ; Alphabetic # Lo [7] ETHIOPIC SYLLABLE BBA..ETHIOPIC SYLLABLE BBO AB30..AB5A ; Alphabetic # L& [43] LATIN SMALL LETTER BARRED ALPHA..LATIN SMALL LETTER Y WITH SHORT RIGHT LEG AB5C..AB5F ; Alphabetic # Lm [4] MODIFIER LETTER SMALL HENG..MODIFIER LETTER SMALL U WITH LEFT HOOK AB60..AB68 ; Alphabetic # L& [9] LATIN SMALL LETTER SAKHA YAT..LATIN SMALL LETTER TURNED R WITH MIDDLE TILDE AB69 ; Alphabetic # Lm MODIFIER LETTER SMALL TURNED W AB70..ABBF ; Alphabetic # L& [80] CHEROKEE SMALL LETTER A..CHEROKEE SMALL LETTER YA ABC0..ABE2 ; Alphabetic # Lo [35] MEETEI MAYEK LETTER KOK..MEETEI MAYEK LETTER I LONSUM ABE3..ABE4 ; Alphabetic # Mc [2] MEETEI MAYEK VOWEL SIGN ONAP..MEETEI MAYEK VOWEL SIGN INAP ABE5 ; Alphabetic # Mn MEETEI MAYEK VOWEL SIGN ANAP ABE6..ABE7 ; Alphabetic # Mc [2] MEETEI MAYEK VOWEL SIGN YENAP..MEETEI MAYEK VOWEL SIGN SOUNAP ABE8 ; Alphabetic # Mn MEETEI MAYEK VOWEL SIGN UNAP ABE9..ABEA ; Alphabetic # Mc [2] MEETEI MAYEK VOWEL SIGN CHEINAP..MEETEI MAYEK VOWEL SIGN NUNG AC00..D7A3 ; Alphabetic # Lo [11172] HANGUL SYLLABLE GA..HANGUL SYLLABLE HIH D7B0..D7C6 ; Alphabetic # Lo [23] HANGUL JUNGSEONG O-YEO..HANGUL JUNGSEONG ARAEA-E D7CB..D7FB ; Alphabetic # Lo [49] HANGUL JONGSEONG NIEUN-RIEUL..HANGUL JONGSEONG PHIEUPH-THIEUTH F900..FA6D ; Alphabetic # Lo [366] 
CJK COMPATIBILITY IDEOGRAPH-F900..CJK COMPATIBILITY IDEOGRAPH-FA6D FA70..FAD9 ; Alphabetic # Lo [106] CJK COMPATIBILITY IDEOGRAPH-FA70..CJK COMPATIBILITY IDEOGRAPH-FAD9 FB00..FB06 ; Alphabetic # L& [7] LATIN SMALL LIGATURE FF..LATIN SMALL LIGATURE ST FB13..FB17 ; Alphabetic # L& [5] ARMENIAN SMALL LIGATURE MEN NOW..ARMENIAN SMALL LIGATURE MEN XEH FB1D ; Alphabetic # Lo HEBREW LETTER YOD WITH HIRIQ FB1E ; Alphabetic # Mn HEBREW POINT JUDEO-SPANISH VARIKA FB1F..FB28 ; Alphabetic # Lo [10] HEBREW LIGATURE YIDDISH YOD YOD PATAH..HEBREW LETTER WIDE TAV FB2A..FB36 ; Alphabetic # Lo [13] HEBREW LETTER SHIN WITH SHIN DOT..HEBREW LETTER ZAYIN WITH DAGESH FB38..FB3C ; Alphabetic # Lo [5] HEBREW LETTER TET WITH DAGESH..HEBREW LETTER LAMED WITH DAGESH FB3E ; Alphabetic # Lo HEBREW LETTER MEM WITH DAGESH FB40..FB41 ; Alphabetic # Lo [2] HEBREW LETTER NUN WITH DAGESH..HEBREW LETTER SAMEKH WITH DAGESH FB43..FB44 ; Alphabetic # Lo [2] HEBREW LETTER FINAL PE WITH DAGESH..HEBREW LETTER PE WITH DAGESH FB46..FBB1 ; Alphabetic # Lo [108] HEBREW LETTER TSADI WITH DAGESH..ARABIC LETTER YEH BARREE WITH HAMZA ABOVE FINAL FORM FBD3..FD3D ; Alphabetic # Lo [363] ARABIC LETTER NG ISOLATED FORM..ARABIC LIGATURE ALEF WITH FATHATAN ISOLATED FORM FD50..FD8F ; Alphabetic # Lo [64] ARABIC LIGATURE TEH WITH JEEM WITH MEEM INITIAL FORM..ARABIC LIGATURE MEEM WITH KHAH WITH MEEM INITIAL FORM FD92..FDC7 ; Alphabetic # Lo [54] ARABIC LIGATURE MEEM WITH JEEM WITH KHAH INITIAL FORM..ARABIC LIGATURE NOON WITH JEEM WITH YEH FINAL FORM FDF0..FDFB ; Alphabetic # Lo [12] ARABIC LIGATURE SALLA USED AS KORANIC STOP SIGN ISOLATED FORM..ARABIC LIGATURE JALLAJALALOUHOU FE70..FE74 ; Alphabetic # Lo [5] ARABIC FATHATAN ISOLATED FORM..ARABIC KASRATAN ISOLATED FORM FE76..FEFC ; Alphabetic # Lo [135] ARABIC FATHA ISOLATED FORM..ARABIC LIGATURE LAM WITH ALEF FINAL FORM FF21..FF3A ; Alphabetic # L& [26] FULLWIDTH LATIN CAPITAL LETTER A..FULLWIDTH LATIN CAPITAL LETTER Z FF41..FF5A ; Alphabetic # L& [26] FULLWIDTH LATIN 
SMALL LETTER A..FULLWIDTH LATIN SMALL LETTER Z FF66..FF6F ; Alphabetic # Lo [10] HALFWIDTH KATAKANA LETTER WO..HALFWIDTH KATAKANA LETTER SMALL TU FF70 ; Alphabetic # Lm HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK FF71..FF9D ; Alphabetic # Lo [45] HALFWIDTH KATAKANA LETTER A..HALFWIDTH KATAKANA LETTER N FF9E..FF9F ; Alphabetic # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK FFA0..FFBE ; Alphabetic # Lo [31] HALFWIDTH HANGUL FILLER..HALFWIDTH HANGUL LETTER HIEUH FFC2..FFC7 ; Alphabetic # Lo [6] HALFWIDTH HANGUL LETTER A..HALFWIDTH HANGUL LETTER E FFCA..FFCF ; Alphabetic # Lo [6] HALFWIDTH HANGUL LETTER YEO..HALFWIDTH HANGUL LETTER OE FFD2..FFD7 ; Alphabetic # Lo [6] HALFWIDTH HANGUL LETTER YO..HALFWIDTH HANGUL LETTER YU FFDA..FFDC ; Alphabetic # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL LETTER I 10000..1000B ; Alphabetic # Lo [12] LINEAR B SYLLABLE B008 A..LINEAR B SYLLABLE B046 JE 1000D..10026 ; Alphabetic # Lo [26] LINEAR B SYLLABLE B036 JO..LINEAR B SYLLABLE B032 QO 10028..1003A ; Alphabetic # Lo [19] LINEAR B SYLLABLE B060 RA..LINEAR B SYLLABLE B042 WO 1003C..1003D ; Alphabetic # Lo [2] LINEAR B SYLLABLE B017 ZA..LINEAR B SYLLABLE B074 ZE 1003F..1004D ; Alphabetic # Lo [15] LINEAR B SYLLABLE B020 ZO..LINEAR B SYLLABLE B091 TWO 10050..1005D ; Alphabetic # Lo [14] LINEAR B SYMBOL B018..LINEAR B SYMBOL B089 10080..100FA ; Alphabetic # Lo [123] LINEAR B IDEOGRAM B100 MAN..LINEAR B IDEOGRAM VESSEL B305 10140..10174 ; Alphabetic # Nl [53] GREEK ACROPHONIC ATTIC ONE QUARTER..GREEK ACROPHONIC STRATIAN FIFTY MNAS 10280..1029C ; Alphabetic # Lo [29] LYCIAN LETTER A..LYCIAN LETTER X 102A0..102D0 ; Alphabetic # Lo [49] CARIAN LETTER A..CARIAN LETTER UUU3 10300..1031F ; Alphabetic # Lo [32] OLD ITALIC LETTER A..OLD ITALIC LETTER ESS 1032D..10340 ; Alphabetic # Lo [20] OLD ITALIC LETTER YE..GOTHIC LETTER PAIRTHRA 10341 ; Alphabetic # Nl GOTHIC LETTER NINETY 10342..10349 ; Alphabetic # Lo [8] GOTHIC LETTER RAIDA..GOTHIC 
LETTER OTHAL 1034A ; Alphabetic # Nl GOTHIC LETTER NINE HUNDRED 10350..10375 ; Alphabetic # Lo [38] OLD PERMIC LETTER AN..OLD PERMIC LETTER IA 10376..1037A ; Alphabetic # Mn [5] COMBINING OLD PERMIC LETTER AN..COMBINING OLD PERMIC LETTER SII 10380..1039D ; Alphabetic # Lo [30] UGARITIC LETTER ALPA..UGARITIC LETTER SSU 103A0..103C3 ; Alphabetic # Lo [36] OLD PERSIAN SIGN A..OLD PERSIAN SIGN HA 103C8..103CF ; Alphabetic # Lo [8] OLD PERSIAN SIGN AURAMAZDAA..OLD PERSIAN SIGN BUUMISH 103D1..103D5 ; Alphabetic # Nl [5] OLD PERSIAN NUMBER ONE..OLD PERSIAN NUMBER HUNDRED 10400..1044F ; Alphabetic # L& [80] DESERET CAPITAL LETTER LONG I..DESERET SMALL LETTER EW 10450..1049D ; Alphabetic # Lo [78] SHAVIAN LETTER PEEP..OSMANYA LETTER OO 104B0..104D3 ; Alphabetic # L& [36] OSAGE CAPITAL LETTER A..OSAGE CAPITAL LETTER ZHA 104D8..104FB ; Alphabetic # L& [36] OSAGE SMALL LETTER A..OSAGE SMALL LETTER ZHA 10500..10527 ; Alphabetic # Lo [40] ELBASAN LETTER A..ELBASAN LETTER KHE 10530..10563 ; Alphabetic # Lo [52] CAUCASIAN ALBANIAN LETTER ALT..CAUCASIAN ALBANIAN LETTER KIW 10570..1057A ; Alphabetic # L& [11] VITHKUQI CAPITAL LETTER A..VITHKUQI CAPITAL LETTER GA 1057C..1058A ; Alphabetic # L& [15] VITHKUQI CAPITAL LETTER HA..VITHKUQI CAPITAL LETTER RE 1058C..10592 ; Alphabetic # L& [7] VITHKUQI CAPITAL LETTER SE..VITHKUQI CAPITAL LETTER XE 10594..10595 ; Alphabetic # L& [2] VITHKUQI CAPITAL LETTER Y..VITHKUQI CAPITAL LETTER ZE 10597..105A1 ; Alphabetic # L& [11] VITHKUQI SMALL LETTER A..VITHKUQI SMALL LETTER GA 105A3..105B1 ; Alphabetic # L& [15] VITHKUQI SMALL LETTER HA..VITHKUQI SMALL LETTER RE 105B3..105B9 ; Alphabetic # L& [7] VITHKUQI SMALL LETTER SE..VITHKUQI SMALL LETTER XE 105BB..105BC ; Alphabetic # L& [2] VITHKUQI SMALL LETTER Y..VITHKUQI SMALL LETTER ZE 105C0..105F3 ; Alphabetic # Lo [52] TODHRI LETTER A..TODHRI LETTER OO 10600..10736 ; Alphabetic # Lo [311] LINEAR A SIGN AB001..LINEAR A SIGN A664 10740..10755 ; Alphabetic # Lo [22] LINEAR A SIGN A701 A..LINEAR A SIGN 
A732 JE 10760..10767 ; Alphabetic # Lo [8] LINEAR A SIGN A800..LINEAR A SIGN A807 10780..10785 ; Alphabetic # Lm [6] MODIFIER LETTER SMALL CAPITAL AA..MODIFIER LETTER SMALL B WITH HOOK 10787..107B0 ; Alphabetic # Lm [42] MODIFIER LETTER SMALL DZ DIGRAPH..MODIFIER LETTER SMALL V WITH RIGHT HOOK 107B2..107BA ; Alphabetic # Lm [9] MODIFIER LETTER SMALL CAPITAL Y..MODIFIER LETTER SMALL S WITH CURL 10800..10805 ; Alphabetic # Lo [6] CYPRIOT SYLLABLE A..CYPRIOT SYLLABLE JA 10808 ; Alphabetic # Lo CYPRIOT SYLLABLE JO 1080A..10835 ; Alphabetic # Lo [44] CYPRIOT SYLLABLE KA..CYPRIOT SYLLABLE WO 10837..10838 ; Alphabetic # Lo [2] CYPRIOT SYLLABLE XA..CYPRIOT SYLLABLE XE 1083C ; Alphabetic # Lo CYPRIOT SYLLABLE ZA 1083F..10855 ; Alphabetic # Lo [23] CYPRIOT SYLLABLE ZO..IMPERIAL ARAMAIC LETTER TAW 10860..10876 ; Alphabetic # Lo [23] PALMYRENE LETTER ALEPH..PALMYRENE LETTER TAW 10880..1089E ; Alphabetic # Lo [31] NABATAEAN LETTER FINAL ALEPH..NABATAEAN LETTER TAW 108E0..108F2 ; Alphabetic # Lo [19] HATRAN LETTER ALEPH..HATRAN LETTER QOPH 108F4..108F5 ; Alphabetic # Lo [2] HATRAN LETTER SHIN..HATRAN LETTER TAW 10900..10915 ; Alphabetic # Lo [22] PHOENICIAN LETTER ALF..PHOENICIAN LETTER TAU 10920..10939 ; Alphabetic # Lo [26] LYDIAN LETTER A..LYDIAN LETTER C 10940..10959 ; Alphabetic # Lo [26] SIDETIC LETTER N01..SIDETIC LETTER N26 10980..109B7 ; Alphabetic # Lo [56] MEROITIC HIEROGLYPHIC LETTER A..MEROITIC CURSIVE LETTER DA 109BE..109BF ; Alphabetic # Lo [2] MEROITIC CURSIVE LOGOGRAM RMT..MEROITIC CURSIVE LOGOGRAM IMN 10A00 ; Alphabetic # Lo KHAROSHTHI LETTER A 10A01..10A03 ; Alphabetic # Mn [3] KHAROSHTHI VOWEL SIGN I..KHAROSHTHI VOWEL SIGN VOCALIC R 10A05..10A06 ; Alphabetic # Mn [2] KHAROSHTHI VOWEL SIGN E..KHAROSHTHI VOWEL SIGN O 10A0C..10A0F ; Alphabetic # Mn [4] KHAROSHTHI VOWEL LENGTH MARK..KHAROSHTHI SIGN VISARGA 10A10..10A13 ; Alphabetic # Lo [4] KHAROSHTHI LETTER KA..KHAROSHTHI LETTER GHA 10A15..10A17 ; Alphabetic # Lo [3] KHAROSHTHI LETTER CA..KHAROSHTHI LETTER JA 
10A19..10A35 ; Alphabetic # Lo [29] KHAROSHTHI LETTER NYA..KHAROSHTHI LETTER VHA 10A60..10A7C ; Alphabetic # Lo [29] OLD SOUTH ARABIAN LETTER HE..OLD SOUTH ARABIAN LETTER THETH 10A80..10A9C ; Alphabetic # Lo [29] OLD NORTH ARABIAN LETTER HEH..OLD NORTH ARABIAN LETTER ZAH 10AC0..10AC7 ; Alphabetic # Lo [8] MANICHAEAN LETTER ALEPH..MANICHAEAN LETTER WAW 10AC9..10AE4 ; Alphabetic # Lo [28] MANICHAEAN LETTER ZAYIN..MANICHAEAN LETTER TAW 10B00..10B35 ; Alphabetic # Lo [54] AVESTAN LETTER A..AVESTAN LETTER HE 10B40..10B55 ; Alphabetic # Lo [22] INSCRIPTIONAL PARTHIAN LETTER ALEPH..INSCRIPTIONAL PARTHIAN LETTER TAW 10B60..10B72 ; Alphabetic # Lo [19] INSCRIPTIONAL PAHLAVI LETTER ALEPH..INSCRIPTIONAL PAHLAVI LETTER TAW 10B80..10B91 ; Alphabetic # Lo [18] PSALTER PAHLAVI LETTER ALEPH..PSALTER PAHLAVI LETTER TAW 10C00..10C48 ; Alphabetic # Lo [73] OLD TURKIC LETTER ORKHON A..OLD TURKIC LETTER ORKHON BASH 10C80..10CB2 ; Alphabetic # L& [51] OLD HUNGARIAN CAPITAL LETTER A..OLD HUNGARIAN CAPITAL LETTER US 10CC0..10CF2 ; Alphabetic # L& [51] OLD HUNGARIAN SMALL LETTER A..OLD HUNGARIAN SMALL LETTER US 10D00..10D23 ; Alphabetic # Lo [36] HANIFI ROHINGYA LETTER A..HANIFI ROHINGYA MARK NA KHONNA 10D24..10D27 ; Alphabetic # Mn [4] HANIFI ROHINGYA SIGN HARBAHAY..HANIFI ROHINGYA SIGN TASSI 10D4A..10D4D ; Alphabetic # Lo [4] GARAY VOWEL SIGN A..GARAY VOWEL SIGN EE 10D4E ; Alphabetic # Lm GARAY VOWEL LENGTH MARK 10D4F ; Alphabetic # Lo GARAY SUKUN 10D50..10D65 ; Alphabetic # L& [22] GARAY CAPITAL LETTER A..GARAY CAPITAL LETTER OLD NA 10D69 ; Alphabetic # Mn GARAY VOWEL SIGN E 10D6F ; Alphabetic # Lm GARAY REDUPLICATION MARK 10D70..10D85 ; Alphabetic # L& [22] GARAY SMALL LETTER A..GARAY SMALL LETTER OLD NA 10E80..10EA9 ; Alphabetic # Lo [42] YEZIDI LETTER ELIF..YEZIDI LETTER ET 10EAB..10EAC ; Alphabetic # Mn [2] YEZIDI COMBINING HAMZA MARK..YEZIDI COMBINING MADDA MARK 10EB0..10EB1 ; Alphabetic # Lo [2] YEZIDI LETTER LAM WITH DOT ABOVE..YEZIDI LETTER YOT WITH CIRCUMFLEX ABOVE 10EC2..10EC4 
; Alphabetic # Lo [3] ARABIC LETTER DAL WITH TWO DOTS VERTICALLY BELOW..ARABIC LETTER KAF WITH TWO DOTS VERTICALLY BELOW 10EC5 ; Alphabetic # Lm ARABIC SMALL YEH BARREE WITH TWO DOTS BELOW 10EC6..10EC7 ; Alphabetic # Lo [2] ARABIC LETTER THIN NOON..ARABIC LETTER YEH WITH FOUR DOTS BELOW 10EFA..10EFC ; Alphabetic # Mn [3] ARABIC DOUBLE VERTICAL BAR BELOW..ARABIC COMBINING ALEF OVERLAY 10F00..10F1C ; Alphabetic # Lo [29] OLD SOGDIAN LETTER ALEPH..OLD SOGDIAN LETTER FINAL TAW WITH VERTICAL TAIL 10F27 ; Alphabetic # Lo OLD SOGDIAN LIGATURE AYIN-DALETH 10F30..10F45 ; Alphabetic # Lo [22] SOGDIAN LETTER ALEPH..SOGDIAN INDEPENDENT SHIN 10F70..10F81 ; Alphabetic # Lo [18] OLD UYGHUR LETTER ALEPH..OLD UYGHUR LETTER LESH 10FB0..10FC4 ; Alphabetic # Lo [21] CHORASMIAN LETTER ALEPH..CHORASMIAN LETTER TAW 10FE0..10FF6 ; Alphabetic # Lo [23] ELYMAIC LETTER ALEPH..ELYMAIC LIGATURE ZAYIN-YODH 11000 ; Alphabetic # Mc BRAHMI SIGN CANDRABINDU 11001 ; Alphabetic # Mn BRAHMI SIGN ANUSVARA 11002 ; Alphabetic # Mc BRAHMI SIGN VISARGA 11003..11037 ; Alphabetic # Lo [53] BRAHMI SIGN JIHVAMULIYA..BRAHMI LETTER OLD TAMIL NNNA 11038..11045 ; Alphabetic # Mn [14] BRAHMI VOWEL SIGN AA..BRAHMI VOWEL SIGN AU 11071..11072 ; Alphabetic # Lo [2] BRAHMI LETTER OLD TAMIL SHORT E..BRAHMI LETTER OLD TAMIL SHORT O 11073..11074 ; Alphabetic # Mn [2] BRAHMI VOWEL SIGN OLD TAMIL SHORT E..BRAHMI VOWEL SIGN OLD TAMIL SHORT O 11075 ; Alphabetic # Lo BRAHMI LETTER OLD TAMIL LLA 11080..11081 ; Alphabetic # Mn [2] KAITHI SIGN CANDRABINDU..KAITHI SIGN ANUSVARA 11082 ; Alphabetic # Mc KAITHI SIGN VISARGA 11083..110AF ; Alphabetic # Lo [45] KAITHI LETTER A..KAITHI LETTER HA 110B0..110B2 ; Alphabetic # Mc [3] KAITHI VOWEL SIGN AA..KAITHI VOWEL SIGN II 110B3..110B6 ; Alphabetic # Mn [4] KAITHI VOWEL SIGN U..KAITHI VOWEL SIGN AI 110B7..110B8 ; Alphabetic # Mc [2] KAITHI VOWEL SIGN O..KAITHI VOWEL SIGN AU 110C2 ; Alphabetic # Mn KAITHI VOWEL SIGN VOCALIC R 110D0..110E8 ; Alphabetic # Lo [25] SORA SOMPENG LETTER 
SAH..SORA SOMPENG LETTER MAE 11100..11102 ; Alphabetic # Mn [3] CHAKMA SIGN CANDRABINDU..CHAKMA SIGN VISARGA 11103..11126 ; Alphabetic # Lo [36] CHAKMA LETTER AA..CHAKMA LETTER HAA 11127..1112B ; Alphabetic # Mn [5] CHAKMA VOWEL SIGN A..CHAKMA VOWEL SIGN UU 1112C ; Alphabetic # Mc CHAKMA VOWEL SIGN E 1112D..11132 ; Alphabetic # Mn [6] CHAKMA VOWEL SIGN AI..CHAKMA AU MARK 11144 ; Alphabetic # Lo CHAKMA LETTER LHAA 11145..11146 ; Alphabetic # Mc [2] CHAKMA VOWEL SIGN AA..CHAKMA VOWEL SIGN EI 11147 ; Alphabetic # Lo CHAKMA LETTER VAA 11150..11172 ; Alphabetic # Lo [35] MAHAJANI LETTER A..MAHAJANI LETTER RRA 11176 ; Alphabetic # Lo MAHAJANI LIGATURE SHRI 11180..11181 ; Alphabetic # Mn [2] SHARADA SIGN CANDRABINDU..SHARADA SIGN ANUSVARA 11182 ; Alphabetic # Mc SHARADA SIGN VISARGA 11183..111B2 ; Alphabetic # Lo [48] SHARADA LETTER A..SHARADA LETTER HA 111B3..111B5 ; Alphabetic # Mc [3] SHARADA VOWEL SIGN AA..SHARADA VOWEL SIGN II 111B6..111BE ; Alphabetic # Mn [9] SHARADA VOWEL SIGN U..SHARADA VOWEL SIGN O 111BF ; Alphabetic # Mc SHARADA VOWEL SIGN AU 111C1..111C4 ; Alphabetic # Lo [4] SHARADA SIGN AVAGRAHA..SHARADA OM 111CE ; Alphabetic # Mc SHARADA VOWEL SIGN PRISHTHAMATRA E 111CF ; Alphabetic # Mn SHARADA SIGN INVERTED CANDRABINDU 111DA ; Alphabetic # Lo SHARADA EKAM 111DC ; Alphabetic # Lo SHARADA HEADSTROKE 11200..11211 ; Alphabetic # Lo [18] KHOJKI LETTER A..KHOJKI LETTER JJA 11213..1122B ; Alphabetic # Lo [25] KHOJKI LETTER NYA..KHOJKI LETTER LLA 1122C..1122E ; Alphabetic # Mc [3] KHOJKI VOWEL SIGN AA..KHOJKI VOWEL SIGN II 1122F..11231 ; Alphabetic # Mn [3] KHOJKI VOWEL SIGN U..KHOJKI VOWEL SIGN AI 11232..11233 ; Alphabetic # Mc [2] KHOJKI VOWEL SIGN O..KHOJKI VOWEL SIGN AU 11234 ; Alphabetic # Mn KHOJKI SIGN ANUSVARA 11237 ; Alphabetic # Mn KHOJKI SIGN SHADDA 1123E ; Alphabetic # Mn KHOJKI SIGN SUKUN 1123F..11240 ; Alphabetic # Lo [2] KHOJKI LETTER QA..KHOJKI LETTER SHORT I 11241 ; Alphabetic # Mn KHOJKI VOWEL SIGN VOCALIC R 11280..11286 ; Alphabetic # Lo [7] 
MULTANI LETTER A..MULTANI LETTER GA 11288 ; Alphabetic # Lo MULTANI LETTER GHA 1128A..1128D ; Alphabetic # Lo [4] MULTANI LETTER CA..MULTANI LETTER JJA 1128F..1129D ; Alphabetic # Lo [15] MULTANI LETTER NYA..MULTANI LETTER BA 1129F..112A8 ; Alphabetic # Lo [10] MULTANI LETTER BHA..MULTANI LETTER RHA 112B0..112DE ; Alphabetic # Lo [47] KHUDAWADI LETTER A..KHUDAWADI LETTER HA 112DF ; Alphabetic # Mn KHUDAWADI SIGN ANUSVARA 112E0..112E2 ; Alphabetic # Mc [3] KHUDAWADI VOWEL SIGN AA..KHUDAWADI VOWEL SIGN II 112E3..112E8 ; Alphabetic # Mn [6] KHUDAWADI VOWEL SIGN U..KHUDAWADI VOWEL SIGN AU 11300..11301 ; Alphabetic # Mn [2] GRANTHA SIGN COMBINING ANUSVARA ABOVE..GRANTHA SIGN CANDRABINDU 11302..11303 ; Alphabetic # Mc [2] GRANTHA SIGN ANUSVARA..GRANTHA SIGN VISARGA 11305..1130C ; Alphabetic # Lo [8] GRANTHA LETTER A..GRANTHA LETTER VOCALIC L 1130F..11310 ; Alphabetic # Lo [2] GRANTHA LETTER EE..GRANTHA LETTER AI 11313..11328 ; Alphabetic # Lo [22] GRANTHA LETTER OO..GRANTHA LETTER NA 1132A..11330 ; Alphabetic # Lo [7] GRANTHA LETTER PA..GRANTHA LETTER RA 11332..11333 ; Alphabetic # Lo [2] GRANTHA LETTER LA..GRANTHA LETTER LLA 11335..11339 ; Alphabetic # Lo [5] GRANTHA LETTER VA..GRANTHA LETTER HA 1133D ; Alphabetic # Lo GRANTHA SIGN AVAGRAHA 1133E..1133F ; Alphabetic # Mc [2] GRANTHA VOWEL SIGN AA..GRANTHA VOWEL SIGN I 11340 ; Alphabetic # Mn GRANTHA VOWEL SIGN II 11341..11344 ; Alphabetic # Mc [4] GRANTHA VOWEL SIGN U..GRANTHA VOWEL SIGN VOCALIC RR 11347..11348 ; Alphabetic # Mc [2] GRANTHA VOWEL SIGN EE..GRANTHA VOWEL SIGN AI 1134B..1134C ; Alphabetic # Mc [2] GRANTHA VOWEL SIGN OO..GRANTHA VOWEL SIGN AU 11350 ; Alphabetic # Lo GRANTHA OM 11357 ; Alphabetic # Mc GRANTHA AU LENGTH MARK 1135D..11361 ; Alphabetic # Lo [5] GRANTHA SIGN PLUTA..GRANTHA LETTER VOCALIC LL 11362..11363 ; Alphabetic # Mc [2] GRANTHA VOWEL SIGN VOCALIC L..GRANTHA VOWEL SIGN VOCALIC LL 11380..11389 ; Alphabetic # Lo [10] TULU-TIGALARI LETTER A..TULU-TIGALARI LETTER VOCALIC LL 1138B ; Alphabetic # 
Lo TULU-TIGALARI LETTER EE 1138E ; Alphabetic # Lo TULU-TIGALARI LETTER AI 11390..113B5 ; Alphabetic # Lo [38] TULU-TIGALARI LETTER OO..TULU-TIGALARI LETTER LLLA 113B7 ; Alphabetic # Lo TULU-TIGALARI SIGN AVAGRAHA 113B8..113BA ; Alphabetic # Mc [3] TULU-TIGALARI VOWEL SIGN AA..TULU-TIGALARI VOWEL SIGN II 113BB..113C0 ; Alphabetic # Mn [6] TULU-TIGALARI VOWEL SIGN U..TULU-TIGALARI VOWEL SIGN VOCALIC LL 113C2 ; Alphabetic # Mc TULU-TIGALARI VOWEL SIGN EE 113C5 ; Alphabetic # Mc TULU-TIGALARI VOWEL SIGN AI 113C7..113CA ; Alphabetic # Mc [4] TULU-TIGALARI VOWEL SIGN OO..TULU-TIGALARI SIGN CANDRA ANUNASIKA 113CC..113CD ; Alphabetic # Mc [2] TULU-TIGALARI SIGN ANUSVARA..TULU-TIGALARI SIGN VISARGA 113D1 ; Alphabetic # Lo TULU-TIGALARI REPHA 113D3 ; Alphabetic # Lo TULU-TIGALARI SIGN PLUTA 11400..11434 ; Alphabetic # Lo [53] NEWA LETTER A..NEWA LETTER HA 11435..11437 ; Alphabetic # Mc [3] NEWA VOWEL SIGN AA..NEWA VOWEL SIGN II 11438..1143F ; Alphabetic # Mn [8] NEWA VOWEL SIGN U..NEWA VOWEL SIGN AI 11440..11441 ; Alphabetic # Mc [2] NEWA VOWEL SIGN O..NEWA VOWEL SIGN AU 11443..11444 ; Alphabetic # Mn [2] NEWA SIGN CANDRABINDU..NEWA SIGN ANUSVARA 11445 ; Alphabetic # Mc NEWA SIGN VISARGA 11447..1144A ; Alphabetic # Lo [4] NEWA SIGN AVAGRAHA..NEWA SIDDHI 1145F..11461 ; Alphabetic # Lo [3] NEWA LETTER VEDIC ANUSVARA..NEWA SIGN UPADHMANIYA 11480..114AF ; Alphabetic # Lo [48] TIRHUTA ANJI..TIRHUTA LETTER HA 114B0..114B2 ; Alphabetic # Mc [3] TIRHUTA VOWEL SIGN AA..TIRHUTA VOWEL SIGN II 114B3..114B8 ; Alphabetic # Mn [6] TIRHUTA VOWEL SIGN U..TIRHUTA VOWEL SIGN VOCALIC LL 114B9 ; Alphabetic # Mc TIRHUTA VOWEL SIGN E 114BA ; Alphabetic # Mn TIRHUTA VOWEL SIGN SHORT E 114BB..114BE ; Alphabetic # Mc [4] TIRHUTA VOWEL SIGN AI..TIRHUTA VOWEL SIGN AU 114BF..114C0 ; Alphabetic # Mn [2] TIRHUTA SIGN CANDRABINDU..TIRHUTA SIGN ANUSVARA 114C1 ; Alphabetic # Mc TIRHUTA SIGN VISARGA 114C4..114C5 ; Alphabetic # Lo [2] TIRHUTA SIGN AVAGRAHA..TIRHUTA GVANG 114C7 ; Alphabetic # Lo TIRHUTA OM 
11580..115AE ; Alphabetic # Lo [47] SIDDHAM LETTER A..SIDDHAM LETTER HA 115AF..115B1 ; Alphabetic # Mc [3] SIDDHAM VOWEL SIGN AA..SIDDHAM VOWEL SIGN II 115B2..115B5 ; Alphabetic # Mn [4] SIDDHAM VOWEL SIGN U..SIDDHAM VOWEL SIGN VOCALIC RR 115B8..115BB ; Alphabetic # Mc [4] SIDDHAM VOWEL SIGN E..SIDDHAM VOWEL SIGN AU 115BC..115BD ; Alphabetic # Mn [2] SIDDHAM SIGN CANDRABINDU..SIDDHAM SIGN ANUSVARA 115BE ; Alphabetic # Mc SIDDHAM SIGN VISARGA 115D8..115DB ; Alphabetic # Lo [4] SIDDHAM LETTER THREE-CIRCLE ALTERNATE I..SIDDHAM LETTER ALTERNATE U 115DC..115DD ; Alphabetic # Mn [2] SIDDHAM VOWEL SIGN ALTERNATE U..SIDDHAM VOWEL SIGN ALTERNATE UU 11600..1162F ; Alphabetic # Lo [48] MODI LETTER A..MODI LETTER LLA 11630..11632 ; Alphabetic # Mc [3] MODI VOWEL SIGN AA..MODI VOWEL SIGN II 11633..1163A ; Alphabetic # Mn [8] MODI VOWEL SIGN U..MODI VOWEL SIGN AI 1163B..1163C ; Alphabetic # Mc [2] MODI VOWEL SIGN O..MODI VOWEL SIGN AU 1163D ; Alphabetic # Mn MODI SIGN ANUSVARA 1163E ; Alphabetic # Mc MODI SIGN VISARGA 11640 ; Alphabetic # Mn MODI SIGN ARDHACANDRA 11644 ; Alphabetic # Lo MODI SIGN HUVA 11680..116AA ; Alphabetic # Lo [43] TAKRI LETTER A..TAKRI LETTER RRA 116AB ; Alphabetic # Mn TAKRI SIGN ANUSVARA 116AC ; Alphabetic # Mc TAKRI SIGN VISARGA 116AD ; Alphabetic # Mn TAKRI VOWEL SIGN AA 116AE..116AF ; Alphabetic # Mc [2] TAKRI VOWEL SIGN I..TAKRI VOWEL SIGN II 116B0..116B5 ; Alphabetic # Mn [6] TAKRI VOWEL SIGN U..TAKRI VOWEL SIGN AU 116B8 ; Alphabetic # Lo TAKRI LETTER ARCHAIC KHA 11700..1171A ; Alphabetic # Lo [27] AHOM LETTER KA..AHOM LETTER ALTERNATE BA 1171D ; Alphabetic # Mn AHOM CONSONANT SIGN MEDIAL LA 1171E ; Alphabetic # Mc AHOM CONSONANT SIGN MEDIAL RA 1171F ; Alphabetic # Mn AHOM CONSONANT SIGN MEDIAL LIGATING RA 11720..11721 ; Alphabetic # Mc [2] AHOM VOWEL SIGN A..AHOM VOWEL SIGN AA 11722..11725 ; Alphabetic # Mn [4] AHOM VOWEL SIGN I..AHOM VOWEL SIGN UU 11726 ; Alphabetic # Mc AHOM VOWEL SIGN E 11727..1172A ; Alphabetic # Mn [4] AHOM VOWEL SIGN AW..AHOM 
VOWEL SIGN AM 11740..11746 ; Alphabetic # Lo [7] AHOM LETTER CA..AHOM LETTER LLA 11800..1182B ; Alphabetic # Lo [44] DOGRA LETTER A..DOGRA LETTER RRA 1182C..1182E ; Alphabetic # Mc [3] DOGRA VOWEL SIGN AA..DOGRA VOWEL SIGN II 1182F..11837 ; Alphabetic # Mn [9] DOGRA VOWEL SIGN U..DOGRA SIGN ANUSVARA 11838 ; Alphabetic # Mc DOGRA SIGN VISARGA 118A0..118DF ; Alphabetic # L& [64] WARANG CITI CAPITAL LETTER NGAA..WARANG CITI SMALL LETTER VIYO 118FF..11906 ; Alphabetic # Lo [8] WARANG CITI OM..DIVES AKURU LETTER E 11909 ; Alphabetic # Lo DIVES AKURU LETTER O 1190C..11913 ; Alphabetic # Lo [8] DIVES AKURU LETTER KA..DIVES AKURU LETTER JA 11915..11916 ; Alphabetic # Lo [2] DIVES AKURU LETTER NYA..DIVES AKURU LETTER TTA 11918..1192F ; Alphabetic # Lo [24] DIVES AKURU LETTER DDA..DIVES AKURU LETTER ZA 11930..11935 ; Alphabetic # Mc [6] DIVES AKURU VOWEL SIGN AA..DIVES AKURU VOWEL SIGN E 11937..11938 ; Alphabetic # Mc [2] DIVES AKURU VOWEL SIGN AI..DIVES AKURU VOWEL SIGN O 1193B..1193C ; Alphabetic # Mn [2] DIVES AKURU SIGN ANUSVARA..DIVES AKURU SIGN CANDRABINDU 1193F ; Alphabetic # Lo DIVES AKURU PREFIXED NASAL SIGN 11940 ; Alphabetic # Mc DIVES AKURU MEDIAL YA 11941 ; Alphabetic # Lo DIVES AKURU INITIAL RA 11942 ; Alphabetic # Mc DIVES AKURU MEDIAL RA 119A0..119A7 ; Alphabetic # Lo [8] NANDINAGARI LETTER A..NANDINAGARI LETTER VOCALIC RR 119AA..119D0 ; Alphabetic # Lo [39] NANDINAGARI LETTER E..NANDINAGARI LETTER RRA 119D1..119D3 ; Alphabetic # Mc [3] NANDINAGARI VOWEL SIGN AA..NANDINAGARI VOWEL SIGN II 119D4..119D7 ; Alphabetic # Mn [4] NANDINAGARI VOWEL SIGN U..NANDINAGARI VOWEL SIGN VOCALIC RR 119DA..119DB ; Alphabetic # Mn [2] NANDINAGARI VOWEL SIGN E..NANDINAGARI VOWEL SIGN AI 119DC..119DF ; Alphabetic # Mc [4] NANDINAGARI VOWEL SIGN O..NANDINAGARI SIGN VISARGA 119E1 ; Alphabetic # Lo NANDINAGARI SIGN AVAGRAHA 119E3 ; Alphabetic # Lo NANDINAGARI HEADSTROKE 119E4 ; Alphabetic # Mc NANDINAGARI VOWEL SIGN PRISHTHAMATRA E 11A00 ; Alphabetic # Lo ZANABAZAR SQUARE LETTER A 
11A01..11A0A ; Alphabetic # Mn [10] ZANABAZAR SQUARE VOWEL SIGN I..ZANABAZAR SQUARE VOWEL LENGTH MARK 11A0B..11A32 ; Alphabetic # Lo [40] ZANABAZAR SQUARE LETTER KA..ZANABAZAR SQUARE LETTER KSSA 11A35..11A38 ; Alphabetic # Mn [4] ZANABAZAR SQUARE SIGN CANDRABINDU..ZANABAZAR SQUARE SIGN ANUSVARA 11A39 ; Alphabetic # Mc ZANABAZAR SQUARE SIGN VISARGA 11A3A ; Alphabetic # Lo ZANABAZAR SQUARE CLUSTER-INITIAL LETTER RA 11A3B..11A3E ; Alphabetic # Mn [4] ZANABAZAR SQUARE CLUSTER-FINAL LETTER YA..ZANABAZAR SQUARE CLUSTER-FINAL LETTER VA 11A50 ; Alphabetic # Lo SOYOMBO LETTER A 11A51..11A56 ; Alphabetic # Mn [6] SOYOMBO VOWEL SIGN I..SOYOMBO VOWEL SIGN OE 11A57..11A58 ; Alphabetic # Mc [2] SOYOMBO VOWEL SIGN AI..SOYOMBO VOWEL SIGN AU 11A59..11A5B ; Alphabetic # Mn [3] SOYOMBO VOWEL SIGN VOCALIC R..SOYOMBO VOWEL LENGTH MARK 11A5C..11A89 ; Alphabetic # Lo [46] SOYOMBO LETTER KA..SOYOMBO CLUSTER-INITIAL LETTER SA 11A8A..11A96 ; Alphabetic # Mn [13] SOYOMBO FINAL CONSONANT SIGN G..SOYOMBO SIGN ANUSVARA 11A97 ; Alphabetic # Mc SOYOMBO SIGN VISARGA 11A9D ; Alphabetic # Lo SOYOMBO MARK PLUTA 11AB0..11AF8 ; Alphabetic # Lo [73] CANADIAN SYLLABICS NATTILIK HI..PAU CIN HAU GLOTTAL STOP FINAL 11B60 ; Alphabetic # Mn SHARADA VOWEL SIGN OE 11B61 ; Alphabetic # Mc SHARADA VOWEL SIGN OOE 11B62..11B64 ; Alphabetic # Mn [3] SHARADA VOWEL SIGN UE..SHARADA VOWEL SIGN SHORT E 11B65 ; Alphabetic # Mc SHARADA VOWEL SIGN SHORT O 11B66 ; Alphabetic # Mn SHARADA VOWEL SIGN CANDRA E 11B67 ; Alphabetic # Mc SHARADA VOWEL SIGN CANDRA O 11BC0..11BE0 ; Alphabetic # Lo [33] SUNUWAR LETTER DEVI..SUNUWAR LETTER KLOKO 11C00..11C08 ; Alphabetic # Lo [9] BHAIKSUKI LETTER A..BHAIKSUKI LETTER VOCALIC L 11C0A..11C2E ; Alphabetic # Lo [37] BHAIKSUKI LETTER E..BHAIKSUKI LETTER HA 11C2F ; Alphabetic # Mc BHAIKSUKI VOWEL SIGN AA 11C30..11C36 ; Alphabetic # Mn [7] BHAIKSUKI VOWEL SIGN I..BHAIKSUKI VOWEL SIGN VOCALIC L 11C38..11C3D ; Alphabetic # Mn [6] BHAIKSUKI VOWEL SIGN E..BHAIKSUKI SIGN ANUSVARA 11C3E ; 
Alphabetic # Mc BHAIKSUKI SIGN VISARGA 11C40 ; Alphabetic # Lo BHAIKSUKI SIGN AVAGRAHA 11C72..11C8F ; Alphabetic # Lo [30] MARCHEN LETTER KA..MARCHEN LETTER A 11C92..11CA7 ; Alphabetic # Mn [22] MARCHEN SUBJOINED LETTER KA..MARCHEN SUBJOINED LETTER ZA 11CA9 ; Alphabetic # Mc MARCHEN SUBJOINED LETTER YA 11CAA..11CB0 ; Alphabetic # Mn [7] MARCHEN SUBJOINED LETTER RA..MARCHEN VOWEL SIGN AA 11CB1 ; Alphabetic # Mc MARCHEN VOWEL SIGN I 11CB2..11CB3 ; Alphabetic # Mn [2] MARCHEN VOWEL SIGN U..MARCHEN VOWEL SIGN E 11CB4 ; Alphabetic # Mc MARCHEN VOWEL SIGN O 11CB5..11CB6 ; Alphabetic # Mn [2] MARCHEN SIGN ANUSVARA..MARCHEN SIGN CANDRABINDU 11D00..11D06 ; Alphabetic # Lo [7] MASARAM GONDI LETTER A..MASARAM GONDI LETTER E 11D08..11D09 ; Alphabetic # Lo [2] MASARAM GONDI LETTER AI..MASARAM GONDI LETTER O 11D0B..11D30 ; Alphabetic # Lo [38] MASARAM GONDI LETTER AU..MASARAM GONDI LETTER TRA 11D31..11D36 ; Alphabetic # Mn [6] MASARAM GONDI VOWEL SIGN AA..MASARAM GONDI VOWEL SIGN VOCALIC R 11D3A ; Alphabetic # Mn MASARAM GONDI VOWEL SIGN E 11D3C..11D3D ; Alphabetic # Mn [2] MASARAM GONDI VOWEL SIGN AI..MASARAM GONDI VOWEL SIGN O 11D3F..11D41 ; Alphabetic # Mn [3] MASARAM GONDI VOWEL SIGN AU..MASARAM GONDI SIGN VISARGA 11D43 ; Alphabetic # Mn MASARAM GONDI SIGN CANDRA 11D46 ; Alphabetic # Lo MASARAM GONDI REPHA 11D47 ; Alphabetic # Mn MASARAM GONDI RA-KARA 11D60..11D65 ; Alphabetic # Lo [6] GUNJALA GONDI LETTER A..GUNJALA GONDI LETTER UU 11D67..11D68 ; Alphabetic # Lo [2] GUNJALA GONDI LETTER EE..GUNJALA GONDI LETTER AI 11D6A..11D89 ; Alphabetic # Lo [32] GUNJALA GONDI LETTER OO..GUNJALA GONDI LETTER SA 11D8A..11D8E ; Alphabetic # Mc [5] GUNJALA GONDI VOWEL SIGN AA..GUNJALA GONDI VOWEL SIGN UU 11D90..11D91 ; Alphabetic # Mn [2] GUNJALA GONDI VOWEL SIGN EE..GUNJALA GONDI VOWEL SIGN AI 11D93..11D94 ; Alphabetic # Mc [2] GUNJALA GONDI VOWEL SIGN OO..GUNJALA GONDI VOWEL SIGN AU 11D95 ; Alphabetic # Mn GUNJALA GONDI SIGN ANUSVARA 11D96 ; Alphabetic # Mc GUNJALA GONDI SIGN VISARGA 
11D98 ; Alphabetic # Lo GUNJALA GONDI OM 11DB0..11DD8 ; Alphabetic # Lo [41] TOLONG SIKI LETTER I..TOLONG SIKI LETTER RRH 11DD9 ; Alphabetic # Lm TOLONG SIKI SIGN SELA 11DDA..11DDB ; Alphabetic # Lo [2] TOLONG SIKI SIGN HECAKA..TOLONG SIKI UNGGA 11EE0..11EF2 ; Alphabetic # Lo [19] MAKASAR LETTER KA..MAKASAR ANGKA 11EF3..11EF4 ; Alphabetic # Mn [2] MAKASAR VOWEL SIGN I..MAKASAR VOWEL SIGN U 11EF5..11EF6 ; Alphabetic # Mc [2] MAKASAR VOWEL SIGN E..MAKASAR VOWEL SIGN O 11F00..11F01 ; Alphabetic # Mn [2] KAWI SIGN CANDRABINDU..KAWI SIGN ANUSVARA 11F02 ; Alphabetic # Lo KAWI SIGN REPHA 11F03 ; Alphabetic # Mc KAWI SIGN VISARGA 11F04..11F10 ; Alphabetic # Lo [13] KAWI LETTER A..KAWI LETTER O 11F12..11F33 ; Alphabetic # Lo [34] KAWI LETTER KA..KAWI LETTER JNYA 11F34..11F35 ; Alphabetic # Mc [2] KAWI VOWEL SIGN AA..KAWI VOWEL SIGN ALTERNATE AA 11F36..11F3A ; Alphabetic # Mn [5] KAWI VOWEL SIGN I..KAWI VOWEL SIGN VOCALIC R 11F3E..11F3F ; Alphabetic # Mc [2] KAWI VOWEL SIGN E..KAWI VOWEL SIGN AI 11F40 ; Alphabetic # Mn KAWI VOWEL SIGN EU 11FB0 ; Alphabetic # Lo LISU LETTER YHA 12000..12399 ; Alphabetic # Lo [922] CUNEIFORM SIGN A..CUNEIFORM SIGN U U 12400..1246E ; Alphabetic # Nl [111] CUNEIFORM NUMERIC SIGN TWO ASH..CUNEIFORM NUMERIC SIGN NINE U VARIANT FORM 12480..12543 ; Alphabetic # Lo [196] CUNEIFORM SIGN AB TIMES NUN TENU..CUNEIFORM SIGN ZU5 TIMES THREE DISH TENU 12F90..12FF0 ; Alphabetic # Lo [97] CYPRO-MINOAN SIGN CM001..CYPRO-MINOAN SIGN CM114 13000..1342F ; Alphabetic # Lo [1072] EGYPTIAN HIEROGLYPH A001..EGYPTIAN HIEROGLYPH V011D 13441..13446 ; Alphabetic # Lo [6] EGYPTIAN HIEROGLYPH FULL BLANK..EGYPTIAN HIEROGLYPH WIDE LOST SIGN 13460..143FA ; Alphabetic # Lo [3995] EGYPTIAN HIEROGLYPH-13460..EGYPTIAN HIEROGLYPH-143FA 14400..14646 ; Alphabetic # Lo [583] ANATOLIAN HIEROGLYPH A001..ANATOLIAN HIEROGLYPH A530 16100..1611D ; Alphabetic # Lo [30] GURUNG KHEMA LETTER A..GURUNG KHEMA LETTER SA 1611E..16129 ; Alphabetic # Mn [12] GURUNG KHEMA VOWEL SIGN AA..GURUNG KHEMA 
VOWEL LENGTH MARK 1612A..1612C ; Alphabetic # Mc [3] GURUNG KHEMA CONSONANT SIGN MEDIAL YA..GURUNG KHEMA CONSONANT SIGN MEDIAL HA 1612D..1612E ; Alphabetic # Mn [2] GURUNG KHEMA SIGN ANUSVARA..GURUNG KHEMA CONSONANT SIGN MEDIAL RA 16800..16A38 ; Alphabetic # Lo [569] BAMUM LETTER PHASE-A NGKUE MFON..BAMUM LETTER PHASE-F VUEQ 16A40..16A5E ; Alphabetic # Lo [31] MRO LETTER TA..MRO LETTER TEK 16A70..16ABE ; Alphabetic # Lo [79] TANGSA LETTER OZ..TANGSA LETTER ZA 16AD0..16AED ; Alphabetic # Lo [30] BASSA VAH LETTER ENNI..BASSA VAH LETTER I 16B00..16B2F ; Alphabetic # Lo [48] PAHAWH HMONG VOWEL KEEB..PAHAWH HMONG CONSONANT CAU 16B40..16B43 ; Alphabetic # Lm [4] PAHAWH HMONG SIGN VOS SEEV..PAHAWH HMONG SIGN IB YAM 16B63..16B77 ; Alphabetic # Lo [21] PAHAWH HMONG SIGN VOS LUB..PAHAWH HMONG SIGN CIM NRES TOS 16B7D..16B8F ; Alphabetic # Lo [19] PAHAWH HMONG CLAN SIGN TSHEEJ..PAHAWH HMONG CLAN SIGN VWJ 16D40..16D42 ; Alphabetic # Lm [3] KIRAT RAI SIGN ANUSVARA..KIRAT RAI SIGN VISARGA 16D43..16D6A ; Alphabetic # Lo [40] KIRAT RAI LETTER A..KIRAT RAI VOWEL SIGN AU 16D6B..16D6C ; Alphabetic # Lm [2] KIRAT RAI SIGN VIRAMA..KIRAT RAI SIGN SAAT 16E40..16E7F ; Alphabetic # L& [64] MEDEFAIDRIN CAPITAL LETTER M..MEDEFAIDRIN SMALL LETTER Y 16EA0..16EB8 ; Alphabetic # L& [25] BERIA ERFE CAPITAL LETTER ARKAB..BERIA ERFE CAPITAL LETTER AY 16EBB..16ED3 ; Alphabetic # L& [25] BERIA ERFE SMALL LETTER ARKAB..BERIA ERFE SMALL LETTER AY 16F00..16F4A ; Alphabetic # Lo [75] MIAO LETTER PA..MIAO LETTER RTE 16F4F ; Alphabetic # Mn MIAO SIGN CONSONANT MODIFIER BAR 16F50 ; Alphabetic # Lo MIAO LETTER NASALIZATION 16F51..16F87 ; Alphabetic # Mc [55] MIAO SIGN ASPIRATION..MIAO VOWEL SIGN UI 16F8F..16F92 ; Alphabetic # Mn [4] MIAO TONE RIGHT..MIAO TONE BELOW 16F93..16F9F ; Alphabetic # Lm [13] MIAO LETTER TONE-2..MIAO LETTER REFORMED TONE-8 16FE0..16FE1 ; Alphabetic # Lm [2] TANGUT ITERATION MARK..NUSHU ITERATION MARK 16FE3 ; Alphabetic # Lm OLD CHINESE ITERATION MARK 16FF0..16FF1 ; Alphabetic # Mc [2] 
VIETNAMESE ALTERNATE READING MARK CA..VIETNAMESE ALTERNATE READING MARK NHAY 16FF2..16FF3 ; Alphabetic # Lm [2] CHINESE SMALL SIMPLIFIED ER..CHINESE SMALL TRADITIONAL ER 16FF4..16FF6 ; Alphabetic # Nl [3] YANGQIN SIGN SLOW ONE BEAT..YANGQIN SIGN SLOW TWO BEATS 17000..18CD5 ; Alphabetic # Lo [7382] TANGUT IDEOGRAPH-17000..KHITAN SMALL SCRIPT CHARACTER-18CD5 18CFF..18D1E ; Alphabetic # Lo [32] KHITAN SMALL SCRIPT CHARACTER-18CFF..TANGUT IDEOGRAPH-18D1E 18D80..18DF2 ; Alphabetic # Lo [115] TANGUT COMPONENT-769..TANGUT COMPONENT-883 1AFF0..1AFF3 ; Alphabetic # Lm [4] KATAKANA LETTER MINNAN TONE-2..KATAKANA LETTER MINNAN TONE-5 1AFF5..1AFFB ; Alphabetic # Lm [7] KATAKANA LETTER MINNAN TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-5 1AFFD..1AFFE ; Alphabetic # Lm [2] KATAKANA LETTER MINNAN NASALIZED TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-8 1B000..1B122 ; Alphabetic # Lo [291] KATAKANA LETTER ARCHAIC E..KATAKANA LETTER ARCHAIC WU 1B132 ; Alphabetic # Lo HIRAGANA LETTER SMALL KO 1B150..1B152 ; Alphabetic # Lo [3] HIRAGANA LETTER SMALL WI..HIRAGANA LETTER SMALL WO 1B155 ; Alphabetic # Lo KATAKANA LETTER SMALL KO 1B164..1B167 ; Alphabetic # Lo [4] KATAKANA LETTER SMALL WI..KATAKANA LETTER SMALL N 1B170..1B2FB ; Alphabetic # Lo [396] NUSHU CHARACTER-1B170..NUSHU CHARACTER-1B2FB 1BC00..1BC6A ; Alphabetic # Lo [107] DUPLOYAN LETTER H..DUPLOYAN LETTER VOCALIC M 1BC70..1BC7C ; Alphabetic # Lo [13] DUPLOYAN AFFIX LEFT HORIZONTAL SECANT..DUPLOYAN AFFIX ATTACHED TANGENT HOOK 1BC80..1BC88 ; Alphabetic # Lo [9] DUPLOYAN AFFIX HIGH ACUTE..DUPLOYAN AFFIX HIGH VERTICAL 1BC90..1BC99 ; Alphabetic # Lo [10] DUPLOYAN AFFIX LOW ACUTE..DUPLOYAN AFFIX LOW ARROW 1BC9E ; Alphabetic # Mn DUPLOYAN DOUBLE MARK 1D400..1D454 ; Alphabetic # L& [85] MATHEMATICAL BOLD CAPITAL A..MATHEMATICAL ITALIC SMALL G 1D456..1D49C ; Alphabetic # L& [71] MATHEMATICAL ITALIC SMALL I..MATHEMATICAL SCRIPT CAPITAL A 1D49E..1D49F ; Alphabetic # L& [2] MATHEMATICAL SCRIPT CAPITAL C..MATHEMATICAL SCRIPT CAPITAL D 
1D4A2 ; Alphabetic # L& MATHEMATICAL SCRIPT CAPITAL G 1D4A5..1D4A6 ; Alphabetic # L& [2] MATHEMATICAL SCRIPT CAPITAL J..MATHEMATICAL SCRIPT CAPITAL K 1D4A9..1D4AC ; Alphabetic # L& [4] MATHEMATICAL SCRIPT CAPITAL N..MATHEMATICAL SCRIPT CAPITAL Q 1D4AE..1D4B9 ; Alphabetic # L& [12] MATHEMATICAL SCRIPT CAPITAL S..MATHEMATICAL SCRIPT SMALL D 1D4BB ; Alphabetic # L& MATHEMATICAL SCRIPT SMALL F 1D4BD..1D4C3 ; Alphabetic # L& [7] MATHEMATICAL SCRIPT SMALL H..MATHEMATICAL SCRIPT SMALL N 1D4C5..1D505 ; Alphabetic # L& [65] MATHEMATICAL SCRIPT SMALL P..MATHEMATICAL FRAKTUR CAPITAL B 1D507..1D50A ; Alphabetic # L& [4] MATHEMATICAL FRAKTUR CAPITAL D..MATHEMATICAL FRAKTUR CAPITAL G 1D50D..1D514 ; Alphabetic # L& [8] MATHEMATICAL FRAKTUR CAPITAL J..MATHEMATICAL FRAKTUR CAPITAL Q 1D516..1D51C ; Alphabetic # L& [7] MATHEMATICAL FRAKTUR CAPITAL S..MATHEMATICAL FRAKTUR CAPITAL Y 1D51E..1D539 ; Alphabetic # L& [28] MATHEMATICAL FRAKTUR SMALL A..MATHEMATICAL DOUBLE-STRUCK CAPITAL B 1D53B..1D53E ; Alphabetic # L& [4] MATHEMATICAL DOUBLE-STRUCK CAPITAL D..MATHEMATICAL DOUBLE-STRUCK CAPITAL G 1D540..1D544 ; Alphabetic # L& [5] MATHEMATICAL DOUBLE-STRUCK CAPITAL I..MATHEMATICAL DOUBLE-STRUCK CAPITAL M 1D546 ; Alphabetic # L& MATHEMATICAL DOUBLE-STRUCK CAPITAL O 1D54A..1D550 ; Alphabetic # L& [7] MATHEMATICAL DOUBLE-STRUCK CAPITAL S..MATHEMATICAL DOUBLE-STRUCK CAPITAL Y 1D552..1D6A5 ; Alphabetic # L& [340] MATHEMATICAL DOUBLE-STRUCK SMALL A..MATHEMATICAL ITALIC SMALL DOTLESS J 1D6A8..1D6C0 ; Alphabetic # L& [25] MATHEMATICAL BOLD CAPITAL ALPHA..MATHEMATICAL BOLD CAPITAL OMEGA 1D6C2..1D6DA ; Alphabetic # L& [25] MATHEMATICAL BOLD SMALL ALPHA..MATHEMATICAL BOLD SMALL OMEGA 1D6DC..1D6FA ; Alphabetic # L& [31] MATHEMATICAL BOLD EPSILON SYMBOL..MATHEMATICAL ITALIC CAPITAL OMEGA 1D6FC..1D714 ; Alphabetic # L& [25] MATHEMATICAL ITALIC SMALL ALPHA..MATHEMATICAL ITALIC SMALL OMEGA 1D716..1D734 ; Alphabetic # L& [31] MATHEMATICAL ITALIC EPSILON SYMBOL..MATHEMATICAL BOLD ITALIC CAPITAL OMEGA 
1D736..1D74E ; Alphabetic # L& [25] MATHEMATICAL BOLD ITALIC SMALL ALPHA..MATHEMATICAL BOLD ITALIC SMALL OMEGA 1D750..1D76E ; Alphabetic # L& [31] MATHEMATICAL BOLD ITALIC EPSILON SYMBOL..MATHEMATICAL SANS-SERIF BOLD CAPITAL OMEGA 1D770..1D788 ; Alphabetic # L& [25] MATHEMATICAL SANS-SERIF BOLD SMALL ALPHA..MATHEMATICAL SANS-SERIF BOLD SMALL OMEGA 1D78A..1D7A8 ; Alphabetic # L& [31] MATHEMATICAL SANS-SERIF BOLD EPSILON SYMBOL..MATHEMATICAL SANS-SERIF BOLD ITALIC CAPITAL OMEGA 1D7AA..1D7C2 ; Alphabetic # L& [25] MATHEMATICAL SANS-SERIF BOLD ITALIC SMALL ALPHA..MATHEMATICAL SANS-SERIF BOLD ITALIC SMALL OMEGA 1D7C4..1D7CB ; Alphabetic # L& [8] MATHEMATICAL SANS-SERIF BOLD ITALIC EPSILON SYMBOL..MATHEMATICAL BOLD SMALL DIGAMMA 1DF00..1DF09 ; Alphabetic # L& [10] LATIN SMALL LETTER FENG DIGRAPH WITH TRILL..LATIN SMALL LETTER T WITH HOOK AND RETROFLEX HOOK 1DF0A ; Alphabetic # Lo LATIN LETTER RETROFLEX CLICK WITH RETROFLEX HOOK 1DF0B..1DF1E ; Alphabetic # L& [20] LATIN SMALL LETTER ESH WITH DOUBLE BAR..LATIN SMALL LETTER S WITH CURL 1DF25..1DF2A ; Alphabetic # L& [6] LATIN SMALL LETTER D WITH MID-HEIGHT LEFT HOOK..LATIN SMALL LETTER T WITH MID-HEIGHT LEFT HOOK 1E000..1E006 ; Alphabetic # Mn [7] COMBINING GLAGOLITIC LETTER AZU..COMBINING GLAGOLITIC LETTER ZHIVETE 1E008..1E018 ; Alphabetic # Mn [17] COMBINING GLAGOLITIC LETTER ZEMLJA..COMBINING GLAGOLITIC LETTER HERU 1E01B..1E021 ; Alphabetic # Mn [7] COMBINING GLAGOLITIC LETTER SHTA..COMBINING GLAGOLITIC LETTER YATI 1E023..1E024 ; Alphabetic # Mn [2] COMBINING GLAGOLITIC LETTER YU..COMBINING GLAGOLITIC LETTER SMALL YUS 1E026..1E02A ; Alphabetic # Mn [5] COMBINING GLAGOLITIC LETTER YO..COMBINING GLAGOLITIC LETTER FITA 1E030..1E06D ; Alphabetic # Lm [62] MODIFIER LETTER CYRILLIC SMALL A..MODIFIER LETTER CYRILLIC SMALL STRAIGHT U WITH STROKE 1E08F ; Alphabetic # Mn COMBINING CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I 1E100..1E12C ; Alphabetic # Lo [45] NYIAKENG PUACHUE HMONG LETTER MA..NYIAKENG PUACHUE HMONG LETTER W 
1E137..1E13D ; Alphabetic # Lm [7] NYIAKENG PUACHUE HMONG SIGN FOR PERSON..NYIAKENG PUACHUE HMONG SYLLABLE LENGTHENER 1E14E ; Alphabetic # Lo NYIAKENG PUACHUE HMONG LOGOGRAM NYAJ 1E290..1E2AD ; Alphabetic # Lo [30] TOTO LETTER PA..TOTO LETTER A 1E2C0..1E2EB ; Alphabetic # Lo [44] WANCHO LETTER AA..WANCHO LETTER YIH 1E4D0..1E4EA ; Alphabetic # Lo [27] NAG MUNDARI LETTER O..NAG MUNDARI LETTER ELL 1E4EB ; Alphabetic # Lm NAG MUNDARI SIGN OJOD 1E5D0..1E5ED ; Alphabetic # Lo [30] OL ONAL LETTER O..OL ONAL LETTER EG 1E5F0 ; Alphabetic # Lo OL ONAL SIGN HODDOND 1E6C0..1E6DE ; Alphabetic # Lo [31] TAI YO LETTER LOW KO..TAI YO LETTER HIGH KVO 1E6E0..1E6E2 ; Alphabetic # Lo [3] TAI YO LETTER AA..TAI YO LETTER UE 1E6E3 ; Alphabetic # Mn TAI YO SIGN UE 1E6E4..1E6E5 ; Alphabetic # Lo [2] TAI YO LETTER U..TAI YO LETTER AE 1E6E6 ; Alphabetic # Mn TAI YO SIGN AU 1E6E7..1E6ED ; Alphabetic # Lo [7] TAI YO LETTER O..TAI YO LETTER AUE 1E6EE..1E6EF ; Alphabetic # Mn [2] TAI YO SIGN AY..TAI YO SIGN ANG 1E6F0..1E6F4 ; Alphabetic # Lo [5] TAI YO LETTER AN..TAI YO LETTER AP 1E6F5 ; Alphabetic # Mn TAI YO SIGN OM 1E6FE ; Alphabetic # Lo TAI YO SYMBOL MUEANG 1E6FF ; Alphabetic # Lm TAI YO XAM LAI 1E7E0..1E7E6 ; Alphabetic # Lo [7] ETHIOPIC SYLLABLE HHYA..ETHIOPIC SYLLABLE HHYO 1E7E8..1E7EB ; Alphabetic # Lo [4] ETHIOPIC SYLLABLE GURAGE HHWA..ETHIOPIC SYLLABLE HHWE 1E7ED..1E7EE ; Alphabetic # Lo [2] ETHIOPIC SYLLABLE GURAGE MWI..ETHIOPIC SYLLABLE GURAGE MWEE 1E7F0..1E7FE ; Alphabetic # Lo [15] ETHIOPIC SYLLABLE GURAGE QWI..ETHIOPIC SYLLABLE GURAGE PWEE 1E800..1E8C4 ; Alphabetic # Lo [197] MENDE KIKAKUI SYLLABLE M001 KI..MENDE KIKAKUI SYLLABLE M060 NYON 1E900..1E943 ; Alphabetic # L& [68] ADLAM CAPITAL LETTER ALIF..ADLAM SMALL LETTER SHA 1E947 ; Alphabetic # Mn ADLAM HAMZA 1E94B ; Alphabetic # Lm ADLAM NASALIZATION MARK 1EE00..1EE03 ; Alphabetic # Lo [4] ARABIC MATHEMATICAL ALEF..ARABIC MATHEMATICAL DAL 1EE05..1EE1F ; Alphabetic # Lo [27] ARABIC MATHEMATICAL WAW..ARABIC MATHEMATICAL DOTLESS 
QAF 1EE21..1EE22 ; Alphabetic # Lo [2] ARABIC MATHEMATICAL INITIAL BEH..ARABIC MATHEMATICAL INITIAL JEEM 1EE24 ; Alphabetic # Lo ARABIC MATHEMATICAL INITIAL HEH 1EE27 ; Alphabetic # Lo ARABIC MATHEMATICAL INITIAL HAH 1EE29..1EE32 ; Alphabetic # Lo [10] ARABIC MATHEMATICAL INITIAL YEH..ARABIC MATHEMATICAL INITIAL QAF 1EE34..1EE37 ; Alphabetic # Lo [4] ARABIC MATHEMATICAL INITIAL SHEEN..ARABIC MATHEMATICAL INITIAL KHAH 1EE39 ; Alphabetic # Lo ARABIC MATHEMATICAL INITIAL DAD 1EE3B ; Alphabetic # Lo ARABIC MATHEMATICAL INITIAL GHAIN 1EE42 ; Alphabetic # Lo ARABIC MATHEMATICAL TAILED JEEM 1EE47 ; Alphabetic # Lo ARABIC MATHEMATICAL TAILED HAH 1EE49 ; Alphabetic # Lo ARABIC MATHEMATICAL TAILED YEH 1EE4B ; Alphabetic # Lo ARABIC MATHEMATICAL TAILED LAM 1EE4D..1EE4F ; Alphabetic # Lo [3] ARABIC MATHEMATICAL TAILED NOON..ARABIC MATHEMATICAL TAILED AIN 1EE51..1EE52 ; Alphabetic # Lo [2] ARABIC MATHEMATICAL TAILED SAD..ARABIC MATHEMATICAL TAILED QAF 1EE54 ; Alphabetic # Lo ARABIC MATHEMATICAL TAILED SHEEN 1EE57 ; Alphabetic # Lo ARABIC MATHEMATICAL TAILED KHAH 1EE59 ; Alphabetic # Lo ARABIC MATHEMATICAL TAILED DAD 1EE5B ; Alphabetic # Lo ARABIC MATHEMATICAL TAILED GHAIN 1EE5D ; Alphabetic # Lo ARABIC MATHEMATICAL TAILED DOTLESS NOON 1EE5F ; Alphabetic # Lo ARABIC MATHEMATICAL TAILED DOTLESS QAF 1EE61..1EE62 ; Alphabetic # Lo [2] ARABIC MATHEMATICAL STRETCHED BEH..ARABIC MATHEMATICAL STRETCHED JEEM 1EE64 ; Alphabetic # Lo ARABIC MATHEMATICAL STRETCHED HEH 1EE67..1EE6A ; Alphabetic # Lo [4] ARABIC MATHEMATICAL STRETCHED HAH..ARABIC MATHEMATICAL STRETCHED KAF 1EE6C..1EE72 ; Alphabetic # Lo [7] ARABIC MATHEMATICAL STRETCHED MEEM..ARABIC MATHEMATICAL STRETCHED QAF 1EE74..1EE77 ; Alphabetic # Lo [4] ARABIC MATHEMATICAL STRETCHED SHEEN..ARABIC MATHEMATICAL STRETCHED KHAH 1EE79..1EE7C ; Alphabetic # Lo [4] ARABIC MATHEMATICAL STRETCHED DAD..ARABIC MATHEMATICAL STRETCHED DOTLESS BEH 1EE7E ; Alphabetic # Lo ARABIC MATHEMATICAL STRETCHED DOTLESS FEH 1EE80..1EE89 ; Alphabetic # Lo [10] 
ARABIC MATHEMATICAL LOOPED ALEF..ARABIC MATHEMATICAL LOOPED YEH 1EE8B..1EE9B ; Alphabetic # Lo [17] ARABIC MATHEMATICAL LOOPED LAM..ARABIC MATHEMATICAL LOOPED GHAIN 1EEA1..1EEA3 ; Alphabetic # Lo [3] ARABIC MATHEMATICAL DOUBLE-STRUCK BEH..ARABIC MATHEMATICAL DOUBLE-STRUCK DAL 1EEA5..1EEA9 ; Alphabetic # Lo [5] ARABIC MATHEMATICAL DOUBLE-STRUCK WAW..ARABIC MATHEMATICAL DOUBLE-STRUCK YEH 1EEAB..1EEBB ; Alphabetic # Lo [17] ARABIC MATHEMATICAL DOUBLE-STRUCK LAM..ARABIC MATHEMATICAL DOUBLE-STRUCK GHAIN 1F130..1F149 ; Alphabetic # So [26] SQUARED LATIN CAPITAL LETTER A..SQUARED LATIN CAPITAL LETTER Z 1F150..1F169 ; Alphabetic # So [26] NEGATIVE CIRCLED LATIN CAPITAL LETTER A..NEGATIVE CIRCLED LATIN CAPITAL LETTER Z 1F170..1F189 ; Alphabetic # So [26] NEGATIVE SQUARED LATIN CAPITAL LETTER A..NEGATIVE SQUARED LATIN CAPITAL LETTER Z 20000..2A6DF ; Alphabetic # Lo [42720] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6DF 2A700..2B81D ; Alphabetic # Lo [4382] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B81D 2B820..2CEAD ; Alphabetic # Lo [5774] CJK UNIFIED IDEOGRAPH-2B820..CJK UNIFIED IDEOGRAPH-2CEAD 2CEB0..2EBE0 ; Alphabetic # Lo [7473] CJK UNIFIED IDEOGRAPH-2CEB0..CJK UNIFIED IDEOGRAPH-2EBE0 2EBF0..2EE5D ; Alphabetic # Lo [622] CJK UNIFIED IDEOGRAPH-2EBF0..CJK UNIFIED IDEOGRAPH-2EE5D 2F800..2FA1D ; Alphabetic # Lo [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D 30000..3134A ; Alphabetic # Lo [4939] CJK UNIFIED IDEOGRAPH-30000..CJK UNIFIED IDEOGRAPH-3134A 31350..33479 ; Alphabetic # Lo [8490] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-33479 # Total code points: 147421 # ================================================ # Derived Property: Lowercase # Generated from: Ll + Other_Lowercase 0061..007A ; Lowercase # L& [26] LATIN SMALL LETTER A..LATIN SMALL LETTER Z 00AA ; Lowercase # Lo FEMININE ORDINAL INDICATOR 00B5 ; Lowercase # L& MICRO SIGN 00BA ; Lowercase # Lo MASCULINE ORDINAL INDICATOR 00DF..00F6 ; Lowercase # L& [24] 
LATIN SMALL LETTER SHARP S..LATIN SMALL LETTER O WITH DIAERESIS 00F8..00FF ; Lowercase # L& [8] LATIN SMALL LETTER O WITH STROKE..LATIN SMALL LETTER Y WITH DIAERESIS 0101 ; Lowercase # L& LATIN SMALL LETTER A WITH MACRON 0103 ; Lowercase # L& LATIN SMALL LETTER A WITH BREVE 0105 ; Lowercase # L& LATIN SMALL LETTER A WITH OGONEK 0107 ; Lowercase # L& LATIN SMALL LETTER C WITH ACUTE 0109 ; Lowercase # L& LATIN SMALL LETTER C WITH CIRCUMFLEX 010B ; Lowercase # L& LATIN SMALL LETTER C WITH DOT ABOVE 010D ; Lowercase # L& LATIN SMALL LETTER C WITH CARON 010F ; Lowercase # L& LATIN SMALL LETTER D WITH CARON 0111 ; Lowercase # L& LATIN SMALL LETTER D WITH STROKE 0113 ; Lowercase # L& LATIN SMALL LETTER E WITH MACRON 0115 ; Lowercase # L& LATIN SMALL LETTER E WITH BREVE 0117 ; Lowercase # L& LATIN SMALL LETTER E WITH DOT ABOVE 0119 ; Lowercase # L& LATIN SMALL LETTER E WITH OGONEK 011B ; Lowercase # L& LATIN SMALL LETTER E WITH CARON 011D ; Lowercase # L& LATIN SMALL LETTER G WITH CIRCUMFLEX 011F ; Lowercase # L& LATIN SMALL LETTER G WITH BREVE 0121 ; Lowercase # L& LATIN SMALL LETTER G WITH DOT ABOVE 0123 ; Lowercase # L& LATIN SMALL LETTER G WITH CEDILLA 0125 ; Lowercase # L& LATIN SMALL LETTER H WITH CIRCUMFLEX 0127 ; Lowercase # L& LATIN SMALL LETTER H WITH STROKE 0129 ; Lowercase # L& LATIN SMALL LETTER I WITH TILDE 012B ; Lowercase # L& LATIN SMALL LETTER I WITH MACRON 012D ; Lowercase # L& LATIN SMALL LETTER I WITH BREVE 012F ; Lowercase # L& LATIN SMALL LETTER I WITH OGONEK 0131 ; Lowercase # L& LATIN SMALL LETTER DOTLESS I 0133 ; Lowercase # L& LATIN SMALL LIGATURE IJ 0135 ; Lowercase # L& LATIN SMALL LETTER J WITH CIRCUMFLEX 0137..0138 ; Lowercase # L& [2] LATIN SMALL LETTER K WITH CEDILLA..LATIN SMALL LETTER KRA 013A ; Lowercase # L& LATIN SMALL LETTER L WITH ACUTE 013C ; Lowercase # L& LATIN SMALL LETTER L WITH CEDILLA 013E ; Lowercase # L& LATIN SMALL LETTER L WITH CARON 0140 ; Lowercase # L& LATIN SMALL LETTER L WITH MIDDLE DOT 0142 ; Lowercase # L& LATIN 
SMALL LETTER L WITH STROKE 0144 ; Lowercase # L& LATIN SMALL LETTER N WITH ACUTE 0146 ; Lowercase # L& LATIN SMALL LETTER N WITH CEDILLA 0148..0149 ; Lowercase # L& [2] LATIN SMALL LETTER N WITH CARON..LATIN SMALL LETTER N PRECEDED BY APOSTROPHE 014B ; Lowercase # L& LATIN SMALL LETTER ENG 014D ; Lowercase # L& LATIN SMALL LETTER O WITH MACRON 014F ; Lowercase # L& LATIN SMALL LETTER O WITH BREVE 0151 ; Lowercase # L& LATIN SMALL LETTER O WITH DOUBLE ACUTE 0153 ; Lowercase # L& LATIN SMALL LIGATURE OE 0155 ; Lowercase # L& LATIN SMALL LETTER R WITH ACUTE 0157 ; Lowercase # L& LATIN SMALL LETTER R WITH CEDILLA 0159 ; Lowercase # L& LATIN SMALL LETTER R WITH CARON 015B ; Lowercase # L& LATIN SMALL LETTER S WITH ACUTE 015D ; Lowercase # L& LATIN SMALL LETTER S WITH CIRCUMFLEX 015F ; Lowercase # L& LATIN SMALL LETTER S WITH CEDILLA 0161 ; Lowercase # L& LATIN SMALL LETTER S WITH CARON 0163 ; Lowercase # L& LATIN SMALL LETTER T WITH CEDILLA 0165 ; Lowercase # L& LATIN SMALL LETTER T WITH CARON 0167 ; Lowercase # L& LATIN SMALL LETTER T WITH STROKE 0169 ; Lowercase # L& LATIN SMALL LETTER U WITH TILDE 016B ; Lowercase # L& LATIN SMALL LETTER U WITH MACRON 016D ; Lowercase # L& LATIN SMALL LETTER U WITH BREVE 016F ; Lowercase # L& LATIN SMALL LETTER U WITH RING ABOVE 0171 ; Lowercase # L& LATIN SMALL LETTER U WITH DOUBLE ACUTE 0173 ; Lowercase # L& LATIN SMALL LETTER U WITH OGONEK 0175 ; Lowercase # L& LATIN SMALL LETTER W WITH CIRCUMFLEX 0177 ; Lowercase # L& LATIN SMALL LETTER Y WITH CIRCUMFLEX 017A ; Lowercase # L& LATIN SMALL LETTER Z WITH ACUTE 017C ; Lowercase # L& LATIN SMALL LETTER Z WITH DOT ABOVE 017E..0180 ; Lowercase # L& [3] LATIN SMALL LETTER Z WITH CARON..LATIN SMALL LETTER B WITH STROKE 0183 ; Lowercase # L& LATIN SMALL LETTER B WITH TOPBAR 0185 ; Lowercase # L& LATIN SMALL LETTER TONE SIX 0188 ; Lowercase # L& LATIN SMALL LETTER C WITH HOOK 018C..018D ; Lowercase # L& [2] LATIN SMALL LETTER D WITH TOPBAR..LATIN SMALL LETTER TURNED DELTA 0192 ; Lowercase # 
L& LATIN SMALL LETTER F WITH HOOK 0195 ; Lowercase # L& LATIN SMALL LETTER HV 0199..019B ; Lowercase # L& [3] LATIN SMALL LETTER K WITH HOOK..LATIN SMALL LETTER LAMBDA WITH STROKE 019E ; Lowercase # L& LATIN SMALL LETTER N WITH LONG RIGHT LEG 01A1 ; Lowercase # L& LATIN SMALL LETTER O WITH HORN 01A3 ; Lowercase # L& LATIN SMALL LETTER OI 01A5 ; Lowercase # L& LATIN SMALL LETTER P WITH HOOK 01A8 ; Lowercase # L& LATIN SMALL LETTER TONE TWO 01AA..01AB ; Lowercase # L& [2] LATIN LETTER REVERSED ESH LOOP..LATIN SMALL LETTER T WITH PALATAL HOOK 01AD ; Lowercase # L& LATIN SMALL LETTER T WITH HOOK 01B0 ; Lowercase # L& LATIN SMALL LETTER U WITH HORN 01B4 ; Lowercase # L& LATIN SMALL LETTER Y WITH HOOK 01B6 ; Lowercase # L& LATIN SMALL LETTER Z WITH STROKE 01B9..01BA ; Lowercase # L& [2] LATIN SMALL LETTER EZH REVERSED..LATIN SMALL LETTER EZH WITH TAIL 01BD..01BF ; Lowercase # L& [3] LATIN SMALL LETTER TONE FIVE..LATIN LETTER WYNN 01C6 ; Lowercase # L& LATIN SMALL LETTER DZ WITH CARON 01C9 ; Lowercase # L& LATIN SMALL LETTER LJ 01CC ; Lowercase # L& LATIN SMALL LETTER NJ 01CE ; Lowercase # L& LATIN SMALL LETTER A WITH CARON 01D0 ; Lowercase # L& LATIN SMALL LETTER I WITH CARON 01D2 ; Lowercase # L& LATIN SMALL LETTER O WITH CARON 01D4 ; Lowercase # L& LATIN SMALL LETTER U WITH CARON 01D6 ; Lowercase # L& LATIN SMALL LETTER U WITH DIAERESIS AND MACRON 01D8 ; Lowercase # L& LATIN SMALL LETTER U WITH DIAERESIS AND ACUTE 01DA ; Lowercase # L& LATIN SMALL LETTER U WITH DIAERESIS AND CARON 01DC..01DD ; Lowercase # L& [2] LATIN SMALL LETTER U WITH DIAERESIS AND GRAVE..LATIN SMALL LETTER TURNED E 01DF ; Lowercase # L& LATIN SMALL LETTER A WITH DIAERESIS AND MACRON 01E1 ; Lowercase # L& LATIN SMALL LETTER A WITH DOT ABOVE AND MACRON 01E3 ; Lowercase # L& LATIN SMALL LETTER AE WITH MACRON 01E5 ; Lowercase # L& LATIN SMALL LETTER G WITH STROKE 01E7 ; Lowercase # L& LATIN SMALL LETTER G WITH CARON 01E9 ; Lowercase # L& LATIN SMALL LETTER K WITH CARON 01EB ; Lowercase # L& LATIN SMALL 
LETTER O WITH OGONEK 01ED ; Lowercase # L& LATIN SMALL LETTER O WITH OGONEK AND MACRON 01EF..01F0 ; Lowercase # L& [2] LATIN SMALL LETTER EZH WITH CARON..LATIN SMALL LETTER J WITH CARON 01F3 ; Lowercase # L& LATIN SMALL LETTER DZ 01F5 ; Lowercase # L& LATIN SMALL LETTER G WITH ACUTE 01F9 ; Lowercase # L& LATIN SMALL LETTER N WITH GRAVE 01FB ; Lowercase # L& LATIN SMALL LETTER A WITH RING ABOVE AND ACUTE 01FD ; Lowercase # L& LATIN SMALL LETTER AE WITH ACUTE 01FF ; Lowercase # L& LATIN SMALL LETTER O WITH STROKE AND ACUTE 0201 ; Lowercase # L& LATIN SMALL LETTER A WITH DOUBLE GRAVE 0203 ; Lowercase # L& LATIN SMALL LETTER A WITH INVERTED BREVE 0205 ; Lowercase # L& LATIN SMALL LETTER E WITH DOUBLE GRAVE 0207 ; Lowercase # L& LATIN SMALL LETTER E WITH INVERTED BREVE 0209 ; Lowercase # L& LATIN SMALL LETTER I WITH DOUBLE GRAVE 020B ; Lowercase # L& LATIN SMALL LETTER I WITH INVERTED BREVE 020D ; Lowercase # L& LATIN SMALL LETTER O WITH DOUBLE GRAVE 020F ; Lowercase # L& LATIN SMALL LETTER O WITH INVERTED BREVE 0211 ; Lowercase # L& LATIN SMALL LETTER R WITH DOUBLE GRAVE 0213 ; Lowercase # L& LATIN SMALL LETTER R WITH INVERTED BREVE 0215 ; Lowercase # L& LATIN SMALL LETTER U WITH DOUBLE GRAVE 0217 ; Lowercase # L& LATIN SMALL LETTER U WITH INVERTED BREVE 0219 ; Lowercase # L& LATIN SMALL LETTER S WITH COMMA BELOW 021B ; Lowercase # L& LATIN SMALL LETTER T WITH COMMA BELOW 021D ; Lowercase # L& LATIN SMALL LETTER YOGH 021F ; Lowercase # L& LATIN SMALL LETTER H WITH CARON 0221 ; Lowercase # L& LATIN SMALL LETTER D WITH CURL 0223 ; Lowercase # L& LATIN SMALL LETTER OU 0225 ; Lowercase # L& LATIN SMALL LETTER Z WITH HOOK 0227 ; Lowercase # L& LATIN SMALL LETTER A WITH DOT ABOVE 0229 ; Lowercase # L& LATIN SMALL LETTER E WITH CEDILLA 022B ; Lowercase # L& LATIN SMALL LETTER O WITH DIAERESIS AND MACRON 022D ; Lowercase # L& LATIN SMALL LETTER O WITH TILDE AND MACRON 022F ; Lowercase # L& LATIN SMALL LETTER O WITH DOT ABOVE 0231 ; Lowercase # L& LATIN SMALL LETTER O WITH DOT 
ABOVE AND MACRON 0233..0239 ; Lowercase # L& [7] LATIN SMALL LETTER Y WITH MACRON..LATIN SMALL LETTER QP DIGRAPH 023C ; Lowercase # L& LATIN SMALL LETTER C WITH STROKE 023F..0240 ; Lowercase # L& [2] LATIN SMALL LETTER S WITH SWASH TAIL..LATIN SMALL LETTER Z WITH SWASH TAIL 0242 ; Lowercase # L& LATIN SMALL LETTER GLOTTAL STOP 0247 ; Lowercase # L& LATIN SMALL LETTER E WITH STROKE 0249 ; Lowercase # L& LATIN SMALL LETTER J WITH STROKE 024B ; Lowercase # L& LATIN SMALL LETTER Q WITH HOOK TAIL 024D ; Lowercase # L& LATIN SMALL LETTER R WITH STROKE 024F..0293 ; Lowercase # L& [69] LATIN SMALL LETTER Y WITH STROKE..LATIN SMALL LETTER EZH WITH CURL 0296..02AF ; Lowercase # L& [26] LATIN LETTER INVERTED GLOTTAL STOP..LATIN SMALL LETTER TURNED H WITH FISHHOOK AND TAIL 02B0..02B8 ; Lowercase # Lm [9] MODIFIER LETTER SMALL H..MODIFIER LETTER SMALL Y 02C0..02C1 ; Lowercase # Lm [2] MODIFIER LETTER GLOTTAL STOP..MODIFIER LETTER REVERSED GLOTTAL STOP 02E0..02E4 ; Lowercase # Lm [5] MODIFIER LETTER SMALL GAMMA..MODIFIER LETTER SMALL REVERSED GLOTTAL STOP 0345 ; Lowercase # Mn COMBINING GREEK YPOGEGRAMMENI 0371 ; Lowercase # L& GREEK SMALL LETTER HETA 0373 ; Lowercase # L& GREEK SMALL LETTER ARCHAIC SAMPI 0377 ; Lowercase # L& GREEK SMALL LETTER PAMPHYLIAN DIGAMMA 037A ; Lowercase # Lm GREEK YPOGEGRAMMENI 037B..037D ; Lowercase # L& [3] GREEK SMALL REVERSED LUNATE SIGMA SYMBOL..GREEK SMALL REVERSED DOTTED LUNATE SIGMA SYMBOL 0390 ; Lowercase # L& GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS 03AC..03CE ; Lowercase # L& [35] GREEK SMALL LETTER ALPHA WITH TONOS..GREEK SMALL LETTER OMEGA WITH TONOS 03D0..03D1 ; Lowercase # L& [2] GREEK BETA SYMBOL..GREEK THETA SYMBOL 03D5..03D7 ; Lowercase # L& [3] GREEK PHI SYMBOL..GREEK KAI SYMBOL 03D9 ; Lowercase # L& GREEK SMALL LETTER ARCHAIC KOPPA 03DB ; Lowercase # L& GREEK SMALL LETTER STIGMA 03DD ; Lowercase # L& GREEK SMALL LETTER DIGAMMA 03DF ; Lowercase # L& GREEK SMALL LETTER KOPPA 03E1 ; Lowercase # L& GREEK SMALL LETTER SAMPI 03E3 
; Lowercase # L& COPTIC SMALL LETTER SHEI 03E5 ; Lowercase # L& COPTIC SMALL LETTER FEI 03E7 ; Lowercase # L& COPTIC SMALL LETTER KHEI 03E9 ; Lowercase # L& COPTIC SMALL LETTER HORI 03EB ; Lowercase # L& COPTIC SMALL LETTER GANGIA 03ED ; Lowercase # L& COPTIC SMALL LETTER SHIMA 03EF..03F3 ; Lowercase # L& [5] COPTIC SMALL LETTER DEI..GREEK LETTER YOT 03F5 ; Lowercase # L& GREEK LUNATE EPSILON SYMBOL 03F8 ; Lowercase # L& GREEK SMALL LETTER SHO 03FB..03FC ; Lowercase # L& [2] GREEK SMALL LETTER SAN..GREEK RHO WITH STROKE SYMBOL 0430..045F ; Lowercase # L& [48] CYRILLIC SMALL LETTER A..CYRILLIC SMALL LETTER DZHE 0461 ; Lowercase # L& CYRILLIC SMALL LETTER OMEGA 0463 ; Lowercase # L& CYRILLIC SMALL LETTER YAT 0465 ; Lowercase # L& CYRILLIC SMALL LETTER IOTIFIED E 0467 ; Lowercase # L& CYRILLIC SMALL LETTER LITTLE YUS 0469 ; Lowercase # L& CYRILLIC SMALL LETTER IOTIFIED LITTLE YUS 046B ; Lowercase # L& CYRILLIC SMALL LETTER BIG YUS 046D ; Lowercase # L& CYRILLIC SMALL LETTER IOTIFIED BIG YUS 046F ; Lowercase # L& CYRILLIC SMALL LETTER KSI 0471 ; Lowercase # L& CYRILLIC SMALL LETTER PSI 0473 ; Lowercase # L& CYRILLIC SMALL LETTER FITA 0475 ; Lowercase # L& CYRILLIC SMALL LETTER IZHITSA 0477 ; Lowercase # L& CYRILLIC SMALL LETTER IZHITSA WITH DOUBLE GRAVE ACCENT 0479 ; Lowercase # L& CYRILLIC SMALL LETTER UK 047B ; Lowercase # L& CYRILLIC SMALL LETTER ROUND OMEGA 047D ; Lowercase # L& CYRILLIC SMALL LETTER OMEGA WITH TITLO 047F ; Lowercase # L& CYRILLIC SMALL LETTER OT 0481 ; Lowercase # L& CYRILLIC SMALL LETTER KOPPA 048B ; Lowercase # L& CYRILLIC SMALL LETTER SHORT I WITH TAIL 048D ; Lowercase # L& CYRILLIC SMALL LETTER SEMISOFT SIGN 048F ; Lowercase # L& CYRILLIC SMALL LETTER ER WITH TICK 0491 ; Lowercase # L& CYRILLIC SMALL LETTER GHE WITH UPTURN 0493 ; Lowercase # L& CYRILLIC SMALL LETTER GHE WITH STROKE 0495 ; Lowercase # L& CYRILLIC SMALL LETTER GHE WITH MIDDLE HOOK 0497 ; Lowercase # L& CYRILLIC SMALL LETTER ZHE WITH DESCENDER 0499 ; Lowercase # L& CYRILLIC SMALL 
LETTER ZE WITH DESCENDER 049B ; Lowercase # L& CYRILLIC SMALL LETTER KA WITH DESCENDER 049D ; Lowercase # L& CYRILLIC SMALL LETTER KA WITH VERTICAL STROKE 049F ; Lowercase # L& CYRILLIC SMALL LETTER KA WITH STROKE 04A1 ; Lowercase # L& CYRILLIC SMALL LETTER BASHKIR KA 04A3 ; Lowercase # L& CYRILLIC SMALL LETTER EN WITH DESCENDER 04A5 ; Lowercase # L& CYRILLIC SMALL LIGATURE EN GHE 04A7 ; Lowercase # L& CYRILLIC SMALL LETTER PE WITH MIDDLE HOOK 04A9 ; Lowercase # L& CYRILLIC SMALL LETTER ABKHASIAN HA 04AB ; Lowercase # L& CYRILLIC SMALL LETTER ES WITH DESCENDER 04AD ; Lowercase # L& CYRILLIC SMALL LETTER TE WITH DESCENDER 04AF ; Lowercase # L& CYRILLIC SMALL LETTER STRAIGHT U 04B1 ; Lowercase # L& CYRILLIC SMALL LETTER STRAIGHT U WITH STROKE 04B3 ; Lowercase # L& CYRILLIC SMALL LETTER HA WITH DESCENDER 04B5 ; Lowercase # L& CYRILLIC SMALL LIGATURE TE TSE 04B7 ; Lowercase # L& CYRILLIC SMALL LETTER CHE WITH DESCENDER 04B9 ; Lowercase # L& CYRILLIC SMALL LETTER CHE WITH VERTICAL STROKE 04BB ; Lowercase # L& CYRILLIC SMALL LETTER SHHA 04BD ; Lowercase # L& CYRILLIC SMALL LETTER ABKHASIAN CHE 04BF ; Lowercase # L& CYRILLIC SMALL LETTER ABKHASIAN CHE WITH DESCENDER 04C2 ; Lowercase # L& CYRILLIC SMALL LETTER ZHE WITH BREVE 04C4 ; Lowercase # L& CYRILLIC SMALL LETTER KA WITH HOOK 04C6 ; Lowercase # L& CYRILLIC SMALL LETTER EL WITH TAIL 04C8 ; Lowercase # L& CYRILLIC SMALL LETTER EN WITH HOOK 04CA ; Lowercase # L& CYRILLIC SMALL LETTER EN WITH TAIL 04CC ; Lowercase # L& CYRILLIC SMALL LETTER KHAKASSIAN CHE 04CE..04CF ; Lowercase # L& [2] CYRILLIC SMALL LETTER EM WITH TAIL..CYRILLIC SMALL LETTER PALOCHKA 04D1 ; Lowercase # L& CYRILLIC SMALL LETTER A WITH BREVE 04D3 ; Lowercase # L& CYRILLIC SMALL LETTER A WITH DIAERESIS 04D5 ; Lowercase # L& CYRILLIC SMALL LIGATURE A IE 04D7 ; Lowercase # L& CYRILLIC SMALL LETTER IE WITH BREVE 04D9 ; Lowercase # L& CYRILLIC SMALL LETTER SCHWA 04DB ; Lowercase # L& CYRILLIC SMALL LETTER SCHWA WITH DIAERESIS 04DD ; Lowercase # L& CYRILLIC 
SMALL LETTER ZHE WITH DIAERESIS 04DF ; Lowercase # L& CYRILLIC SMALL LETTER ZE WITH DIAERESIS 04E1 ; Lowercase # L& CYRILLIC SMALL LETTER ABKHASIAN DZE 04E3 ; Lowercase # L& CYRILLIC SMALL LETTER I WITH MACRON 04E5 ; Lowercase # L& CYRILLIC SMALL LETTER I WITH DIAERESIS 04E7 ; Lowercase # L& CYRILLIC SMALL LETTER O WITH DIAERESIS 04E9 ; Lowercase # L& CYRILLIC SMALL LETTER BARRED O 04EB ; Lowercase # L& CYRILLIC SMALL LETTER BARRED O WITH DIAERESIS 04ED ; Lowercase # L& CYRILLIC SMALL LETTER E WITH DIAERESIS 04EF ; Lowercase # L& CYRILLIC SMALL LETTER U WITH MACRON 04F1 ; Lowercase # L& CYRILLIC SMALL LETTER U WITH DIAERESIS 04F3 ; Lowercase # L& CYRILLIC SMALL LETTER U WITH DOUBLE ACUTE 04F5 ; Lowercase # L& CYRILLIC SMALL LETTER CHE WITH DIAERESIS 04F7 ; Lowercase # L& CYRILLIC SMALL LETTER GHE WITH DESCENDER 04F9 ; Lowercase # L& CYRILLIC SMALL LETTER YERU WITH DIAERESIS 04FB ; Lowercase # L& CYRILLIC SMALL LETTER GHE WITH STROKE AND HOOK 04FD ; Lowercase # L& CYRILLIC SMALL LETTER HA WITH HOOK 04FF ; Lowercase # L& CYRILLIC SMALL LETTER HA WITH STROKE 0501 ; Lowercase # L& CYRILLIC SMALL LETTER KOMI DE 0503 ; Lowercase # L& CYRILLIC SMALL LETTER KOMI DJE 0505 ; Lowercase # L& CYRILLIC SMALL LETTER KOMI ZJE 0507 ; Lowercase # L& CYRILLIC SMALL LETTER KOMI DZJE 0509 ; Lowercase # L& CYRILLIC SMALL LETTER KOMI LJE 050B ; Lowercase # L& CYRILLIC SMALL LETTER KOMI NJE 050D ; Lowercase # L& CYRILLIC SMALL LETTER KOMI SJE 050F ; Lowercase # L& CYRILLIC SMALL LETTER KOMI TJE 0511 ; Lowercase # L& CYRILLIC SMALL LETTER REVERSED ZE 0513 ; Lowercase # L& CYRILLIC SMALL LETTER EL WITH HOOK 0515 ; Lowercase # L& CYRILLIC SMALL LETTER LHA 0517 ; Lowercase # L& CYRILLIC SMALL LETTER RHA 0519 ; Lowercase # L& CYRILLIC SMALL LETTER YAE 051B ; Lowercase # L& CYRILLIC SMALL LETTER QA 051D ; Lowercase # L& CYRILLIC SMALL LETTER WE 051F ; Lowercase # L& CYRILLIC SMALL LETTER ALEUT KA 0521 ; Lowercase # L& CYRILLIC SMALL LETTER EL WITH MIDDLE HOOK 0523 ; Lowercase # L& CYRILLIC 
SMALL LETTER EN WITH MIDDLE HOOK 0525 ; Lowercase # L& CYRILLIC SMALL LETTER PE WITH DESCENDER 0527 ; Lowercase # L& CYRILLIC SMALL LETTER SHHA WITH DESCENDER 0529 ; Lowercase # L& CYRILLIC SMALL LETTER EN WITH LEFT HOOK 052B ; Lowercase # L& CYRILLIC SMALL LETTER DZZHE 052D ; Lowercase # L& CYRILLIC SMALL LETTER DCHE 052F ; Lowercase # L& CYRILLIC SMALL LETTER EL WITH DESCENDER 0560..0588 ; Lowercase # L& [41] ARMENIAN SMALL LETTER TURNED AYB..ARMENIAN SMALL LETTER YI WITH STROKE 10D0..10FA ; Lowercase # L& [43] GEORGIAN LETTER AN..GEORGIAN LETTER AIN 10FC ; Lowercase # Lm MODIFIER LETTER GEORGIAN NAR 10FD..10FF ; Lowercase # L& [3] GEORGIAN LETTER AEN..GEORGIAN LETTER LABIAL SIGN 13F8..13FD ; Lowercase # L& [6] CHEROKEE SMALL LETTER YE..CHEROKEE SMALL LETTER MV 1C80..1C88 ; Lowercase # L& [9] CYRILLIC SMALL LETTER ROUNDED VE..CYRILLIC SMALL LETTER UNBLENDED UK 1C8A ; Lowercase # L& CYRILLIC SMALL LETTER TJE 1D00..1D2B ; Lowercase # L& [44] LATIN LETTER SMALL CAPITAL A..CYRILLIC LETTER SMALL CAPITAL EL 1D2C..1D6A ; Lowercase # Lm [63] MODIFIER LETTER CAPITAL A..GREEK SUBSCRIPT SMALL LETTER CHI 1D6B..1D77 ; Lowercase # L& [13] LATIN SMALL LETTER UE..LATIN SMALL LETTER TURNED G 1D78 ; Lowercase # Lm MODIFIER LETTER CYRILLIC EN 1D79..1D9A ; Lowercase # L& [34] LATIN SMALL LETTER INSULAR G..LATIN SMALL LETTER EZH WITH RETROFLEX HOOK 1D9B..1DBF ; Lowercase # Lm [37] MODIFIER LETTER SMALL TURNED ALPHA..MODIFIER LETTER SMALL THETA 1E01 ; Lowercase # L& LATIN SMALL LETTER A WITH RING BELOW 1E03 ; Lowercase # L& LATIN SMALL LETTER B WITH DOT ABOVE 1E05 ; Lowercase # L& LATIN SMALL LETTER B WITH DOT BELOW 1E07 ; Lowercase # L& LATIN SMALL LETTER B WITH LINE BELOW 1E09 ; Lowercase # L& LATIN SMALL LETTER C WITH CEDILLA AND ACUTE 1E0B ; Lowercase # L& LATIN SMALL LETTER D WITH DOT ABOVE 1E0D ; Lowercase # L& LATIN SMALL LETTER D WITH DOT BELOW 1E0F ; Lowercase # L& LATIN SMALL LETTER D WITH LINE BELOW 1E11 ; Lowercase # L& LATIN SMALL LETTER D WITH CEDILLA 1E13 ; Lowercase # 
L& LATIN SMALL LETTER D WITH CIRCUMFLEX BELOW 1E15 ; Lowercase # L& LATIN SMALL LETTER E WITH MACRON AND GRAVE 1E17 ; Lowercase # L& LATIN SMALL LETTER E WITH MACRON AND ACUTE 1E19 ; Lowercase # L& LATIN SMALL LETTER E WITH CIRCUMFLEX BELOW 1E1B ; Lowercase # L& LATIN SMALL LETTER E WITH TILDE BELOW 1E1D ; Lowercase # L& LATIN SMALL LETTER E WITH CEDILLA AND BREVE 1E1F ; Lowercase # L& LATIN SMALL LETTER F WITH DOT ABOVE 1E21 ; Lowercase # L& LATIN SMALL LETTER G WITH MACRON 1E23 ; Lowercase # L& LATIN SMALL LETTER H WITH DOT ABOVE 1E25 ; Lowercase # L& LATIN SMALL LETTER H WITH DOT BELOW 1E27 ; Lowercase # L& LATIN SMALL LETTER H WITH DIAERESIS 1E29 ; Lowercase # L& LATIN SMALL LETTER H WITH CEDILLA 1E2B ; Lowercase # L& LATIN SMALL LETTER H WITH BREVE BELOW 1E2D ; Lowercase # L& LATIN SMALL LETTER I WITH TILDE BELOW 1E2F ; Lowercase # L& LATIN SMALL LETTER I WITH DIAERESIS AND ACUTE 1E31 ; Lowercase # L& LATIN SMALL LETTER K WITH ACUTE 1E33 ; Lowercase # L& LATIN SMALL LETTER K WITH DOT BELOW 1E35 ; Lowercase # L& LATIN SMALL LETTER K WITH LINE BELOW 1E37 ; Lowercase # L& LATIN SMALL LETTER L WITH DOT BELOW 1E39 ; Lowercase # L& LATIN SMALL LETTER L WITH DOT BELOW AND MACRON 1E3B ; Lowercase # L& LATIN SMALL LETTER L WITH LINE BELOW 1E3D ; Lowercase # L& LATIN SMALL LETTER L WITH CIRCUMFLEX BELOW 1E3F ; Lowercase # L& LATIN SMALL LETTER M WITH ACUTE 1E41 ; Lowercase # L& LATIN SMALL LETTER M WITH DOT ABOVE 1E43 ; Lowercase # L& LATIN SMALL LETTER M WITH DOT BELOW 1E45 ; Lowercase # L& LATIN SMALL LETTER N WITH DOT ABOVE 1E47 ; Lowercase # L& LATIN SMALL LETTER N WITH DOT BELOW 1E49 ; Lowercase # L& LATIN SMALL LETTER N WITH LINE BELOW 1E4B ; Lowercase # L& LATIN SMALL LETTER N WITH CIRCUMFLEX BELOW 1E4D ; Lowercase # L& LATIN SMALL LETTER O WITH TILDE AND ACUTE 1E4F ; Lowercase # L& LATIN SMALL LETTER O WITH TILDE AND DIAERESIS 1E51 ; Lowercase # L& LATIN SMALL LETTER O WITH MACRON AND GRAVE 1E53 ; Lowercase # L& LATIN SMALL LETTER O WITH MACRON AND ACUTE 1E55 ; 
Lowercase # L& LATIN SMALL LETTER P WITH ACUTE 1E57 ; Lowercase # L& LATIN SMALL LETTER P WITH DOT ABOVE 1E59 ; Lowercase # L& LATIN SMALL LETTER R WITH DOT ABOVE 1E5B ; Lowercase # L& LATIN SMALL LETTER R WITH DOT BELOW 1E5D ; Lowercase # L& LATIN SMALL LETTER R WITH DOT BELOW AND MACRON 1E5F ; Lowercase # L& LATIN SMALL LETTER R WITH LINE BELOW 1E61 ; Lowercase # L& LATIN SMALL LETTER S WITH DOT ABOVE 1E63 ; Lowercase # L& LATIN SMALL LETTER S WITH DOT BELOW 1E65 ; Lowercase # L& LATIN SMALL LETTER S WITH ACUTE AND DOT ABOVE 1E67 ; Lowercase # L& LATIN SMALL LETTER S WITH CARON AND DOT ABOVE 1E69 ; Lowercase # L& LATIN SMALL LETTER S WITH DOT BELOW AND DOT ABOVE 1E6B ; Lowercase # L& LATIN SMALL LETTER T WITH DOT ABOVE 1E6D ; Lowercase # L& LATIN SMALL LETTER T WITH DOT BELOW 1E6F ; Lowercase # L& LATIN SMALL LETTER T WITH LINE BELOW 1E71 ; Lowercase # L& LATIN SMALL LETTER T WITH CIRCUMFLEX BELOW 1E73 ; Lowercase # L& LATIN SMALL LETTER U WITH DIAERESIS BELOW 1E75 ; Lowercase # L& LATIN SMALL LETTER U WITH TILDE BELOW 1E77 ; Lowercase # L& LATIN SMALL LETTER U WITH CIRCUMFLEX BELOW 1E79 ; Lowercase # L& LATIN SMALL LETTER U WITH TILDE AND ACUTE 1E7B ; Lowercase # L& LATIN SMALL LETTER U WITH MACRON AND DIAERESIS 1E7D ; Lowercase # L& LATIN SMALL LETTER V WITH TILDE 1E7F ; Lowercase # L& LATIN SMALL LETTER V WITH DOT BELOW 1E81 ; Lowercase # L& LATIN SMALL LETTER W WITH GRAVE 1E83 ; Lowercase # L& LATIN SMALL LETTER W WITH ACUTE 1E85 ; Lowercase # L& LATIN SMALL LETTER W WITH DIAERESIS 1E87 ; Lowercase # L& LATIN SMALL LETTER W WITH DOT ABOVE 1E89 ; Lowercase # L& LATIN SMALL LETTER W WITH DOT BELOW 1E8B ; Lowercase # L& LATIN SMALL LETTER X WITH DOT ABOVE 1E8D ; Lowercase # L& LATIN SMALL LETTER X WITH DIAERESIS 1E8F ; Lowercase # L& LATIN SMALL LETTER Y WITH DOT ABOVE 1E91 ; Lowercase # L& LATIN SMALL LETTER Z WITH CIRCUMFLEX 1E93 ; Lowercase # L& LATIN SMALL LETTER Z WITH DOT BELOW 1E95..1E9D ; Lowercase # L& [9] LATIN SMALL LETTER Z WITH LINE BELOW..LATIN 
SMALL LETTER LONG S WITH HIGH STROKE 1E9F ; Lowercase # L& LATIN SMALL LETTER DELTA 1EA1 ; Lowercase # L& LATIN SMALL LETTER A WITH DOT BELOW 1EA3 ; Lowercase # L& LATIN SMALL LETTER A WITH HOOK ABOVE 1EA5 ; Lowercase # L& LATIN SMALL LETTER A WITH CIRCUMFLEX AND ACUTE 1EA7 ; Lowercase # L& LATIN SMALL LETTER A WITH CIRCUMFLEX AND GRAVE 1EA9 ; Lowercase # L& LATIN SMALL LETTER A WITH CIRCUMFLEX AND HOOK ABOVE 1EAB ; Lowercase # L& LATIN SMALL LETTER A WITH CIRCUMFLEX AND TILDE 1EAD ; Lowercase # L& LATIN SMALL LETTER A WITH CIRCUMFLEX AND DOT BELOW 1EAF ; Lowercase # L& LATIN SMALL LETTER A WITH BREVE AND ACUTE 1EB1 ; Lowercase # L& LATIN SMALL LETTER A WITH BREVE AND GRAVE 1EB3 ; Lowercase # L& LATIN SMALL LETTER A WITH BREVE AND HOOK ABOVE 1EB5 ; Lowercase # L& LATIN SMALL LETTER A WITH BREVE AND TILDE 1EB7 ; Lowercase # L& LATIN SMALL LETTER A WITH BREVE AND DOT BELOW 1EB9 ; Lowercase # L& LATIN SMALL LETTER E WITH DOT BELOW 1EBB ; Lowercase # L& LATIN SMALL LETTER E WITH HOOK ABOVE 1EBD ; Lowercase # L& LATIN SMALL LETTER E WITH TILDE 1EBF ; Lowercase # L& LATIN SMALL LETTER E WITH CIRCUMFLEX AND ACUTE 1EC1 ; Lowercase # L& LATIN SMALL LETTER E WITH CIRCUMFLEX AND GRAVE 1EC3 ; Lowercase # L& LATIN SMALL LETTER E WITH CIRCUMFLEX AND HOOK ABOVE 1EC5 ; Lowercase # L& LATIN SMALL LETTER E WITH CIRCUMFLEX AND TILDE 1EC7 ; Lowercase # L& LATIN SMALL LETTER E WITH CIRCUMFLEX AND DOT BELOW 1EC9 ; Lowercase # L& LATIN SMALL LETTER I WITH HOOK ABOVE 1ECB ; Lowercase # L& LATIN SMALL LETTER I WITH DOT BELOW 1ECD ; Lowercase # L& LATIN SMALL LETTER O WITH DOT BELOW 1ECF ; Lowercase # L& LATIN SMALL LETTER O WITH HOOK ABOVE 1ED1 ; Lowercase # L& LATIN SMALL LETTER O WITH CIRCUMFLEX AND ACUTE 1ED3 ; Lowercase # L& LATIN SMALL LETTER O WITH CIRCUMFLEX AND GRAVE 1ED5 ; Lowercase # L& LATIN SMALL LETTER O WITH CIRCUMFLEX AND HOOK ABOVE 1ED7 ; Lowercase # L& LATIN SMALL LETTER O WITH CIRCUMFLEX AND TILDE 1ED9 ; Lowercase # L& LATIN SMALL LETTER O WITH CIRCUMFLEX AND DOT BELOW 
1EDB ; Lowercase # L& LATIN SMALL LETTER O WITH HORN AND ACUTE 1EDD ; Lowercase # L& LATIN SMALL LETTER O WITH HORN AND GRAVE 1EDF ; Lowercase # L& LATIN SMALL LETTER O WITH HORN AND HOOK ABOVE 1EE1 ; Lowercase # L& LATIN SMALL LETTER O WITH HORN AND TILDE 1EE3 ; Lowercase # L& LATIN SMALL LETTER O WITH HORN AND DOT BELOW 1EE5 ; Lowercase # L& LATIN SMALL LETTER U WITH DOT BELOW 1EE7 ; Lowercase # L& LATIN SMALL LETTER U WITH HOOK ABOVE 1EE9 ; Lowercase # L& LATIN SMALL LETTER U WITH HORN AND ACUTE 1EEB ; Lowercase # L& LATIN SMALL LETTER U WITH HORN AND GRAVE 1EED ; Lowercase # L& LATIN SMALL LETTER U WITH HORN AND HOOK ABOVE 1EEF ; Lowercase # L& LATIN SMALL LETTER U WITH HORN AND TILDE 1EF1 ; Lowercase # L& LATIN SMALL LETTER U WITH HORN AND DOT BELOW 1EF3 ; Lowercase # L& LATIN SMALL LETTER Y WITH GRAVE 1EF5 ; Lowercase # L& LATIN SMALL LETTER Y WITH DOT BELOW 1EF7 ; Lowercase # L& LATIN SMALL LETTER Y WITH HOOK ABOVE 1EF9 ; Lowercase # L& LATIN SMALL LETTER Y WITH TILDE 1EFB ; Lowercase # L& LATIN SMALL LETTER MIDDLE-WELSH LL 1EFD ; Lowercase # L& LATIN SMALL LETTER MIDDLE-WELSH V 1EFF..1F07 ; Lowercase # L& [9] LATIN SMALL LETTER Y WITH LOOP..GREEK SMALL LETTER ALPHA WITH DASIA AND PERISPOMENI 1F10..1F15 ; Lowercase # L& [6] GREEK SMALL LETTER EPSILON WITH PSILI..GREEK SMALL LETTER EPSILON WITH DASIA AND OXIA 1F20..1F27 ; Lowercase # L& [8] GREEK SMALL LETTER ETA WITH PSILI..GREEK SMALL LETTER ETA WITH DASIA AND PERISPOMENI 1F30..1F37 ; Lowercase # L& [8] GREEK SMALL LETTER IOTA WITH PSILI..GREEK SMALL LETTER IOTA WITH DASIA AND PERISPOMENI 1F40..1F45 ; Lowercase # L& [6] GREEK SMALL LETTER OMICRON WITH PSILI..GREEK SMALL LETTER OMICRON WITH DASIA AND OXIA 1F50..1F57 ; Lowercase # L& [8] GREEK SMALL LETTER UPSILON WITH PSILI..GREEK SMALL LETTER UPSILON WITH DASIA AND PERISPOMENI 1F60..1F67 ; Lowercase # L& [8] GREEK SMALL LETTER OMEGA WITH PSILI..GREEK SMALL LETTER OMEGA WITH DASIA AND PERISPOMENI 1F70..1F7D ; Lowercase # L& [14] GREEK SMALL LETTER ALPHA WITH 
VARIA..GREEK SMALL LETTER OMEGA WITH OXIA 1F80..1F87 ; Lowercase # L& [8] GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI..GREEK SMALL LETTER ALPHA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI 1F90..1F97 ; Lowercase # L& [8] GREEK SMALL LETTER ETA WITH PSILI AND YPOGEGRAMMENI..GREEK SMALL LETTER ETA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI 1FA0..1FA7 ; Lowercase # L& [8] GREEK SMALL LETTER OMEGA WITH PSILI AND YPOGEGRAMMENI..GREEK SMALL LETTER OMEGA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI 1FB0..1FB4 ; Lowercase # L& [5] GREEK SMALL LETTER ALPHA WITH VRACHY..GREEK SMALL LETTER ALPHA WITH OXIA AND YPOGEGRAMMENI 1FB6..1FB7 ; Lowercase # L& [2] GREEK SMALL LETTER ALPHA WITH PERISPOMENI..GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI 1FBE ; Lowercase # L& GREEK PROSGEGRAMMENI 1FC2..1FC4 ; Lowercase # L& [3] GREEK SMALL LETTER ETA WITH VARIA AND YPOGEGRAMMENI..GREEK SMALL LETTER ETA WITH OXIA AND YPOGEGRAMMENI 1FC6..1FC7 ; Lowercase # L& [2] GREEK SMALL LETTER ETA WITH PERISPOMENI..GREEK SMALL LETTER ETA WITH PERISPOMENI AND YPOGEGRAMMENI 1FD0..1FD3 ; Lowercase # L& [4] GREEK SMALL LETTER IOTA WITH VRACHY..GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA 1FD6..1FD7 ; Lowercase # L& [2] GREEK SMALL LETTER IOTA WITH PERISPOMENI..GREEK SMALL LETTER IOTA WITH DIALYTIKA AND PERISPOMENI 1FE0..1FE7 ; Lowercase # L& [8] GREEK SMALL LETTER UPSILON WITH VRACHY..GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND PERISPOMENI 1FF2..1FF4 ; Lowercase # L& [3] GREEK SMALL LETTER OMEGA WITH VARIA AND YPOGEGRAMMENI..GREEK SMALL LETTER OMEGA WITH OXIA AND YPOGEGRAMMENI 1FF6..1FF7 ; Lowercase # L& [2] GREEK SMALL LETTER OMEGA WITH PERISPOMENI..GREEK SMALL LETTER OMEGA WITH PERISPOMENI AND YPOGEGRAMMENI 2071 ; Lowercase # Lm SUPERSCRIPT LATIN SMALL LETTER I 207F ; Lowercase # Lm SUPERSCRIPT LATIN SMALL LETTER N 2090..209C ; Lowercase # Lm [13] LATIN SUBSCRIPT SMALL LETTER A..LATIN SUBSCRIPT SMALL LETTER T 210A ; Lowercase # L& SCRIPT SMALL G 210E..210F ; Lowercase # L& 
[2] PLANCK CONSTANT..PLANCK CONSTANT OVER TWO PI 2113 ; Lowercase # L& SCRIPT SMALL L 212F ; Lowercase # L& SCRIPT SMALL E 2134 ; Lowercase # L& SCRIPT SMALL O 2139 ; Lowercase # L& INFORMATION SOURCE 213C..213D ; Lowercase # L& [2] DOUBLE-STRUCK SMALL PI..DOUBLE-STRUCK SMALL GAMMA 2146..2149 ; Lowercase # L& [4] DOUBLE-STRUCK ITALIC SMALL D..DOUBLE-STRUCK ITALIC SMALL J 214E ; Lowercase # L& TURNED SMALL F 2170..217F ; Lowercase # Nl [16] SMALL ROMAN NUMERAL ONE..SMALL ROMAN NUMERAL ONE THOUSAND 2184 ; Lowercase # L& LATIN SMALL LETTER REVERSED C 24D0..24E9 ; Lowercase # So [26] CIRCLED LATIN SMALL LETTER A..CIRCLED LATIN SMALL LETTER Z 2C30..2C5F ; Lowercase # L& [48] GLAGOLITIC SMALL LETTER AZU..GLAGOLITIC SMALL LETTER CAUDATE CHRIVI 2C61 ; Lowercase # L& LATIN SMALL LETTER L WITH DOUBLE BAR 2C65..2C66 ; Lowercase # L& [2] LATIN SMALL LETTER A WITH STROKE..LATIN SMALL LETTER T WITH DIAGONAL STROKE 2C68 ; Lowercase # L& LATIN SMALL LETTER H WITH DESCENDER 2C6A ; Lowercase # L& LATIN SMALL LETTER K WITH DESCENDER 2C6C ; Lowercase # L& LATIN SMALL LETTER Z WITH DESCENDER 2C71 ; Lowercase # L& LATIN SMALL LETTER V WITH RIGHT HOOK 2C73..2C74 ; Lowercase # L& [2] LATIN SMALL LETTER W WITH HOOK..LATIN SMALL LETTER V WITH CURL 2C76..2C7B ; Lowercase # L& [6] LATIN SMALL LETTER HALF H..LATIN LETTER SMALL CAPITAL TURNED E 2C7C..2C7D ; Lowercase # Lm [2] LATIN SUBSCRIPT SMALL LETTER J..MODIFIER LETTER CAPITAL V 2C81 ; Lowercase # L& COPTIC SMALL LETTER ALFA 2C83 ; Lowercase # L& COPTIC SMALL LETTER VIDA 2C85 ; Lowercase # L& COPTIC SMALL LETTER GAMMA 2C87 ; Lowercase # L& COPTIC SMALL LETTER DALDA 2C89 ; Lowercase # L& COPTIC SMALL LETTER EIE 2C8B ; Lowercase # L& COPTIC SMALL LETTER SOU 2C8D ; Lowercase # L& COPTIC SMALL LETTER ZATA 2C8F ; Lowercase # L& COPTIC SMALL LETTER HATE 2C91 ; Lowercase # L& COPTIC SMALL LETTER THETHE 2C93 ; Lowercase # L& COPTIC SMALL LETTER IAUDA 2C95 ; Lowercase # L& COPTIC SMALL LETTER KAPA 2C97 ; Lowercase # L& COPTIC SMALL LETTER LAULA 2C99 
; Lowercase # L& COPTIC SMALL LETTER MI 2C9B ; Lowercase # L& COPTIC SMALL LETTER NI 2C9D ; Lowercase # L& COPTIC SMALL LETTER KSI 2C9F ; Lowercase # L& COPTIC SMALL LETTER O 2CA1 ; Lowercase # L& COPTIC SMALL LETTER PI 2CA3 ; Lowercase # L& COPTIC SMALL LETTER RO 2CA5 ; Lowercase # L& COPTIC SMALL LETTER SIMA 2CA7 ; Lowercase # L& COPTIC SMALL LETTER TAU 2CA9 ; Lowercase # L& COPTIC SMALL LETTER UA 2CAB ; Lowercase # L& COPTIC SMALL LETTER FI 2CAD ; Lowercase # L& COPTIC SMALL LETTER KHI 2CAF ; Lowercase # L& COPTIC SMALL LETTER PSI 2CB1 ; Lowercase # L& COPTIC SMALL LETTER OOU 2CB3 ; Lowercase # L& COPTIC SMALL LETTER DIALECT-P ALEF 2CB5 ; Lowercase # L& COPTIC SMALL LETTER OLD COPTIC AIN 2CB7 ; Lowercase # L& COPTIC SMALL LETTER CRYPTOGRAMMIC EIE 2CB9 ; Lowercase # L& COPTIC SMALL LETTER DIALECT-P KAPA 2CBB ; Lowercase # L& COPTIC SMALL LETTER DIALECT-P NI 2CBD ; Lowercase # L& COPTIC SMALL LETTER CRYPTOGRAMMIC NI 2CBF ; Lowercase # L& COPTIC SMALL LETTER OLD COPTIC OOU 2CC1 ; Lowercase # L& COPTIC SMALL LETTER SAMPI 2CC3 ; Lowercase # L& COPTIC SMALL LETTER CROSSED SHEI 2CC5 ; Lowercase # L& COPTIC SMALL LETTER OLD COPTIC SHEI 2CC7 ; Lowercase # L& COPTIC SMALL LETTER OLD COPTIC ESH 2CC9 ; Lowercase # L& COPTIC SMALL LETTER AKHMIMIC KHEI 2CCB ; Lowercase # L& COPTIC SMALL LETTER DIALECT-P HORI 2CCD ; Lowercase # L& COPTIC SMALL LETTER OLD COPTIC HORI 2CCF ; Lowercase # L& COPTIC SMALL LETTER OLD COPTIC HA 2CD1 ; Lowercase # L& COPTIC SMALL LETTER L-SHAPED HA 2CD3 ; Lowercase # L& COPTIC SMALL LETTER OLD COPTIC HEI 2CD5 ; Lowercase # L& COPTIC SMALL LETTER OLD COPTIC HAT 2CD7 ; Lowercase # L& COPTIC SMALL LETTER OLD COPTIC GANGIA 2CD9 ; Lowercase # L& COPTIC SMALL LETTER OLD COPTIC DJA 2CDB ; Lowercase # L& COPTIC SMALL LETTER OLD COPTIC SHIMA 2CDD ; Lowercase # L& COPTIC SMALL LETTER OLD NUBIAN SHIMA 2CDF ; Lowercase # L& COPTIC SMALL LETTER OLD NUBIAN NGI 2CE1 ; Lowercase # L& COPTIC SMALL LETTER OLD NUBIAN NYI 2CE3..2CE4 ; Lowercase # L& [2] COPTIC SMALL 
LETTER OLD NUBIAN WAU..COPTIC SYMBOL KAI 2CEC ; Lowercase # L& COPTIC SMALL LETTER CRYPTOGRAMMIC SHEI 2CEE ; Lowercase # L& COPTIC SMALL LETTER CRYPTOGRAMMIC GANGIA 2CF3 ; Lowercase # L& COPTIC SMALL LETTER BOHAIRIC KHEI 2D00..2D25 ; Lowercase # L& [38] GEORGIAN SMALL LETTER AN..GEORGIAN SMALL LETTER HOE 2D27 ; Lowercase # L& GEORGIAN SMALL LETTER YN 2D2D ; Lowercase # L& GEORGIAN SMALL LETTER AEN A641 ; Lowercase # L& CYRILLIC SMALL LETTER ZEMLYA A643 ; Lowercase # L& CYRILLIC SMALL LETTER DZELO A645 ; Lowercase # L& CYRILLIC SMALL LETTER REVERSED DZE A647 ; Lowercase # L& CYRILLIC SMALL LETTER IOTA A649 ; Lowercase # L& CYRILLIC SMALL LETTER DJERV A64B ; Lowercase # L& CYRILLIC SMALL LETTER MONOGRAPH UK A64D ; Lowercase # L& CYRILLIC SMALL LETTER BROAD OMEGA A64F ; Lowercase # L& CYRILLIC SMALL LETTER NEUTRAL YER A651 ; Lowercase # L& CYRILLIC SMALL LETTER YERU WITH BACK YER A653 ; Lowercase # L& CYRILLIC SMALL LETTER IOTIFIED YAT A655 ; Lowercase # L& CYRILLIC SMALL LETTER REVERSED YU A657 ; Lowercase # L& CYRILLIC SMALL LETTER IOTIFIED A A659 ; Lowercase # L& CYRILLIC SMALL LETTER CLOSED LITTLE YUS A65B ; Lowercase # L& CYRILLIC SMALL LETTER BLENDED YUS A65D ; Lowercase # L& CYRILLIC SMALL LETTER IOTIFIED CLOSED LITTLE YUS A65F ; Lowercase # L& CYRILLIC SMALL LETTER YN A661 ; Lowercase # L& CYRILLIC SMALL LETTER REVERSED TSE A663 ; Lowercase # L& CYRILLIC SMALL LETTER SOFT DE A665 ; Lowercase # L& CYRILLIC SMALL LETTER SOFT EL A667 ; Lowercase # L& CYRILLIC SMALL LETTER SOFT EM A669 ; Lowercase # L& CYRILLIC SMALL LETTER MONOCULAR O A66B ; Lowercase # L& CYRILLIC SMALL LETTER BINOCULAR O A66D ; Lowercase # L& CYRILLIC SMALL LETTER DOUBLE MONOCULAR O A681 ; Lowercase # L& CYRILLIC SMALL LETTER DWE A683 ; Lowercase # L& CYRILLIC SMALL LETTER DZWE A685 ; Lowercase # L& CYRILLIC SMALL LETTER ZHWE A687 ; Lowercase # L& CYRILLIC SMALL LETTER CCHE A689 ; Lowercase # L& CYRILLIC SMALL LETTER DZZE A68B ; Lowercase # L& CYRILLIC SMALL LETTER TE WITH MIDDLE HOOK A68D ; 
Lowercase # L& CYRILLIC SMALL LETTER TWE A68F ; Lowercase # L& CYRILLIC SMALL LETTER TSWE A691 ; Lowercase # L& CYRILLIC SMALL LETTER TSSE A693 ; Lowercase # L& CYRILLIC SMALL LETTER TCHE A695 ; Lowercase # L& CYRILLIC SMALL LETTER HWE A697 ; Lowercase # L& CYRILLIC SMALL LETTER SHWE A699 ; Lowercase # L& CYRILLIC SMALL LETTER DOUBLE O A69B ; Lowercase # L& CYRILLIC SMALL LETTER CROSSED O A69C..A69D ; Lowercase # Lm [2] MODIFIER LETTER CYRILLIC HARD SIGN..MODIFIER LETTER CYRILLIC SOFT SIGN A723 ; Lowercase # L& LATIN SMALL LETTER EGYPTOLOGICAL ALEF A725 ; Lowercase # L& LATIN SMALL LETTER EGYPTOLOGICAL AIN A727 ; Lowercase # L& LATIN SMALL LETTER HENG A729 ; Lowercase # L& LATIN SMALL LETTER TZ A72B ; Lowercase # L& LATIN SMALL LETTER TRESILLO A72D ; Lowercase # L& LATIN SMALL LETTER CUATRILLO A72F..A731 ; Lowercase # L& [3] LATIN SMALL LETTER CUATRILLO WITH COMMA..LATIN LETTER SMALL CAPITAL S A733 ; Lowercase # L& LATIN SMALL LETTER AA A735 ; Lowercase # L& LATIN SMALL LETTER AO A737 ; Lowercase # L& LATIN SMALL LETTER AU A739 ; Lowercase # L& LATIN SMALL LETTER AV A73B ; Lowercase # L& LATIN SMALL LETTER AV WITH HORIZONTAL BAR A73D ; Lowercase # L& LATIN SMALL LETTER AY A73F ; Lowercase # L& LATIN SMALL LETTER REVERSED C WITH DOT A741 ; Lowercase # L& LATIN SMALL LETTER K WITH STROKE A743 ; Lowercase # L& LATIN SMALL LETTER K WITH DIAGONAL STROKE A745 ; Lowercase # L& LATIN SMALL LETTER K WITH STROKE AND DIAGONAL STROKE A747 ; Lowercase # L& LATIN SMALL LETTER BROKEN L A749 ; Lowercase # L& LATIN SMALL LETTER L WITH HIGH STROKE A74B ; Lowercase # L& LATIN SMALL LETTER O WITH LONG STROKE OVERLAY A74D ; Lowercase # L& LATIN SMALL LETTER O WITH LOOP A74F ; Lowercase # L& LATIN SMALL LETTER OO A751 ; Lowercase # L& LATIN SMALL LETTER P WITH STROKE THROUGH DESCENDER A753 ; Lowercase # L& LATIN SMALL LETTER P WITH FLOURISH A755 ; Lowercase # L& LATIN SMALL LETTER P WITH SQUIRREL TAIL A757 ; Lowercase # L& LATIN SMALL LETTER Q WITH STROKE THROUGH DESCENDER A759 ; 
Lowercase # L& LATIN SMALL LETTER Q WITH DIAGONAL STROKE A75B ; Lowercase # L& LATIN SMALL LETTER R ROTUNDA A75D ; Lowercase # L& LATIN SMALL LETTER RUM ROTUNDA A75F ; Lowercase # L& LATIN SMALL LETTER V WITH DIAGONAL STROKE A761 ; Lowercase # L& LATIN SMALL LETTER VY A763 ; Lowercase # L& LATIN SMALL LETTER VISIGOTHIC Z A765 ; Lowercase # L& LATIN SMALL LETTER THORN WITH STROKE A767 ; Lowercase # L& LATIN SMALL LETTER THORN WITH STROKE THROUGH DESCENDER A769 ; Lowercase # L& LATIN SMALL LETTER VEND A76B ; Lowercase # L& LATIN SMALL LETTER ET A76D ; Lowercase # L& LATIN SMALL LETTER IS A76F ; Lowercase # L& LATIN SMALL LETTER CON A770 ; Lowercase # Lm MODIFIER LETTER US A771..A778 ; Lowercase # L& [8] LATIN SMALL LETTER DUM..LATIN SMALL LETTER UM A77A ; Lowercase # L& LATIN SMALL LETTER INSULAR D A77C ; Lowercase # L& LATIN SMALL LETTER INSULAR F A77F ; Lowercase # L& LATIN SMALL LETTER TURNED INSULAR G A781 ; Lowercase # L& LATIN SMALL LETTER TURNED L A783 ; Lowercase # L& LATIN SMALL LETTER INSULAR R A785 ; Lowercase # L& LATIN SMALL LETTER INSULAR S A787 ; Lowercase # L& LATIN SMALL LETTER INSULAR T A78C ; Lowercase # L& LATIN SMALL LETTER SALTILLO A78E ; Lowercase # L& LATIN SMALL LETTER L WITH RETROFLEX HOOK AND BELT A791 ; Lowercase # L& LATIN SMALL LETTER N WITH DESCENDER A793..A795 ; Lowercase # L& [3] LATIN SMALL LETTER C WITH BAR..LATIN SMALL LETTER H WITH PALATAL HOOK A797 ; Lowercase # L& LATIN SMALL LETTER B WITH FLOURISH A799 ; Lowercase # L& LATIN SMALL LETTER F WITH STROKE A79B ; Lowercase # L& LATIN SMALL LETTER VOLAPUK AE A79D ; Lowercase # L& LATIN SMALL LETTER VOLAPUK OE A79F ; Lowercase # L& LATIN SMALL LETTER VOLAPUK UE A7A1 ; Lowercase # L& LATIN SMALL LETTER G WITH OBLIQUE STROKE A7A3 ; Lowercase # L& LATIN SMALL LETTER K WITH OBLIQUE STROKE A7A5 ; Lowercase # L& LATIN SMALL LETTER N WITH OBLIQUE STROKE A7A7 ; Lowercase # L& LATIN SMALL LETTER R WITH OBLIQUE STROKE A7A9 ; Lowercase # L& LATIN SMALL LETTER S WITH OBLIQUE STROKE A7AF ; 
Lowercase # L& LATIN LETTER SMALL CAPITAL Q A7B5 ; Lowercase # L& LATIN SMALL LETTER BETA A7B7 ; Lowercase # L& LATIN SMALL LETTER OMEGA A7B9 ; Lowercase # L& LATIN SMALL LETTER U WITH STROKE A7BB ; Lowercase # L& LATIN SMALL LETTER GLOTTAL A A7BD ; Lowercase # L& LATIN SMALL LETTER GLOTTAL I A7BF ; Lowercase # L& LATIN SMALL LETTER GLOTTAL U A7C1 ; Lowercase # L& LATIN SMALL LETTER OLD POLISH O A7C3 ; Lowercase # L& LATIN SMALL LETTER ANGLICANA W A7C8 ; Lowercase # L& LATIN SMALL LETTER D WITH SHORT STROKE OVERLAY A7CA ; Lowercase # L& LATIN SMALL LETTER S WITH SHORT STROKE OVERLAY A7CD ; Lowercase # L& LATIN SMALL LETTER S WITH DIAGONAL STROKE A7CF ; Lowercase # L& LATIN SMALL LETTER PHARYNGEAL VOICED FRICATIVE A7D1 ; Lowercase # L& LATIN SMALL LETTER CLOSED INSULAR G A7D3 ; Lowercase # L& LATIN SMALL LETTER DOUBLE THORN A7D5 ; Lowercase # L& LATIN SMALL LETTER DOUBLE WYNN A7D7 ; Lowercase # L& LATIN SMALL LETTER MIDDLE SCOTS S A7D9 ; Lowercase # L& LATIN SMALL LETTER SIGMOID S A7DB ; Lowercase # L& LATIN SMALL LETTER LAMBDA A7F1..A7F4 ; Lowercase # Lm [4] MODIFIER LETTER CAPITAL S..MODIFIER LETTER CAPITAL Q A7F6 ; Lowercase # L& LATIN SMALL LETTER REVERSED HALF H A7F8..A7F9 ; Lowercase # Lm [2] MODIFIER LETTER CAPITAL H WITH STROKE..MODIFIER LETTER SMALL LIGATURE OE A7FA ; Lowercase # L& LATIN LETTER SMALL CAPITAL TURNED M AB30..AB5A ; Lowercase # L& [43] LATIN SMALL LETTER BARRED ALPHA..LATIN SMALL LETTER Y WITH SHORT RIGHT LEG AB5C..AB5F ; Lowercase # Lm [4] MODIFIER LETTER SMALL HENG..MODIFIER LETTER SMALL U WITH LEFT HOOK AB60..AB68 ; Lowercase # L& [9] LATIN SMALL LETTER SAKHA YAT..LATIN SMALL LETTER TURNED R WITH MIDDLE TILDE AB69 ; Lowercase # Lm MODIFIER LETTER SMALL TURNED W AB70..ABBF ; Lowercase # L& [80] CHEROKEE SMALL LETTER A..CHEROKEE SMALL LETTER YA FB00..FB06 ; Lowercase # L& [7] LATIN SMALL LIGATURE FF..LATIN SMALL LIGATURE ST FB13..FB17 ; Lowercase # L& [5] ARMENIAN SMALL LIGATURE MEN NOW..ARMENIAN SMALL LIGATURE MEN XEH FF41..FF5A ; Lowercase 
# L& [26] FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH LATIN SMALL LETTER Z 10428..1044F ; Lowercase # L& [40] DESERET SMALL LETTER LONG I..DESERET SMALL LETTER EW 104D8..104FB ; Lowercase # L& [36] OSAGE SMALL LETTER A..OSAGE SMALL LETTER ZHA 10597..105A1 ; Lowercase # L& [11] VITHKUQI SMALL LETTER A..VITHKUQI SMALL LETTER GA 105A3..105B1 ; Lowercase # L& [15] VITHKUQI SMALL LETTER HA..VITHKUQI SMALL LETTER RE 105B3..105B9 ; Lowercase # L& [7] VITHKUQI SMALL LETTER SE..VITHKUQI SMALL LETTER XE 105BB..105BC ; Lowercase # L& [2] VITHKUQI SMALL LETTER Y..VITHKUQI SMALL LETTER ZE 10780 ; Lowercase # Lm MODIFIER LETTER SMALL CAPITAL AA 10783..10785 ; Lowercase # Lm [3] MODIFIER LETTER SMALL AE..MODIFIER LETTER SMALL B WITH HOOK 10787..107B0 ; Lowercase # Lm [42] MODIFIER LETTER SMALL DZ DIGRAPH..MODIFIER LETTER SMALL V WITH RIGHT HOOK 107B2..107BA ; Lowercase # Lm [9] MODIFIER LETTER SMALL CAPITAL Y..MODIFIER LETTER SMALL S WITH CURL 10CC0..10CF2 ; Lowercase # L& [51] OLD HUNGARIAN SMALL LETTER A..OLD HUNGARIAN SMALL LETTER US 10D70..10D85 ; Lowercase # L& [22] GARAY SMALL LETTER A..GARAY SMALL LETTER OLD NA 118C0..118DF ; Lowercase # L& [32] WARANG CITI SMALL LETTER NGAA..WARANG CITI SMALL LETTER VIYO 16E60..16E7F ; Lowercase # L& [32] MEDEFAIDRIN SMALL LETTER M..MEDEFAIDRIN SMALL LETTER Y 16EBB..16ED3 ; Lowercase # L& [25] BERIA ERFE SMALL LETTER ARKAB..BERIA ERFE SMALL LETTER AY 1D41A..1D433 ; Lowercase # L& [26] MATHEMATICAL BOLD SMALL A..MATHEMATICAL BOLD SMALL Z 1D44E..1D454 ; Lowercase # L& [7] MATHEMATICAL ITALIC SMALL A..MATHEMATICAL ITALIC SMALL G 1D456..1D467 ; Lowercase # L& [18] MATHEMATICAL ITALIC SMALL I..MATHEMATICAL ITALIC SMALL Z 1D482..1D49B ; Lowercase # L& [26] MATHEMATICAL BOLD ITALIC SMALL A..MATHEMATICAL BOLD ITALIC SMALL Z 1D4B6..1D4B9 ; Lowercase # L& [4] MATHEMATICAL SCRIPT SMALL A..MATHEMATICAL SCRIPT SMALL D 1D4BB ; Lowercase # L& MATHEMATICAL SCRIPT SMALL F 1D4BD..1D4C3 ; Lowercase # L& [7] MATHEMATICAL SCRIPT SMALL H..MATHEMATICAL SCRIPT 
SMALL N 1D4C5..1D4CF ; Lowercase # L& [11] MATHEMATICAL SCRIPT SMALL P..MATHEMATICAL SCRIPT SMALL Z 1D4EA..1D503 ; Lowercase # L& [26] MATHEMATICAL BOLD SCRIPT SMALL A..MATHEMATICAL BOLD SCRIPT SMALL Z 1D51E..1D537 ; Lowercase # L& [26] MATHEMATICAL FRAKTUR SMALL A..MATHEMATICAL FRAKTUR SMALL Z 1D552..1D56B ; Lowercase # L& [26] MATHEMATICAL DOUBLE-STRUCK SMALL A..MATHEMATICAL DOUBLE-STRUCK SMALL Z 1D586..1D59F ; Lowercase # L& [26] MATHEMATICAL BOLD FRAKTUR SMALL A..MATHEMATICAL BOLD FRAKTUR SMALL Z 1D5BA..1D5D3 ; Lowercase # L& [26] MATHEMATICAL SANS-SERIF SMALL A..MATHEMATICAL SANS-SERIF SMALL Z 1D5EE..1D607 ; Lowercase # L& [26] MATHEMATICAL SANS-SERIF BOLD SMALL A..MATHEMATICAL SANS-SERIF BOLD SMALL Z 1D622..1D63B ; Lowercase # L& [26] MATHEMATICAL SANS-SERIF ITALIC SMALL A..MATHEMATICAL SANS-SERIF ITALIC SMALL Z 1D656..1D66F ; Lowercase # L& [26] MATHEMATICAL SANS-SERIF BOLD ITALIC SMALL A..MATHEMATICAL SANS-SERIF BOLD ITALIC SMALL Z 1D68A..1D6A5 ; Lowercase # L& [28] MATHEMATICAL MONOSPACE SMALL A..MATHEMATICAL ITALIC SMALL DOTLESS J 1D6C2..1D6DA ; Lowercase # L& [25] MATHEMATICAL BOLD SMALL ALPHA..MATHEMATICAL BOLD SMALL OMEGA 1D6DC..1D6E1 ; Lowercase # L& [6] MATHEMATICAL BOLD EPSILON SYMBOL..MATHEMATICAL BOLD PI SYMBOL 1D6FC..1D714 ; Lowercase # L& [25] MATHEMATICAL ITALIC SMALL ALPHA..MATHEMATICAL ITALIC SMALL OMEGA 1D716..1D71B ; Lowercase # L& [6] MATHEMATICAL ITALIC EPSILON SYMBOL..MATHEMATICAL ITALIC PI SYMBOL 1D736..1D74E ; Lowercase # L& [25] MATHEMATICAL BOLD ITALIC SMALL ALPHA..MATHEMATICAL BOLD ITALIC SMALL OMEGA 1D750..1D755 ; Lowercase # L& [6] MATHEMATICAL BOLD ITALIC EPSILON SYMBOL..MATHEMATICAL BOLD ITALIC PI SYMBOL 1D770..1D788 ; Lowercase # L& [25] MATHEMATICAL SANS-SERIF BOLD SMALL ALPHA..MATHEMATICAL SANS-SERIF BOLD SMALL OMEGA 1D78A..1D78F ; Lowercase # L& [6] MATHEMATICAL SANS-SERIF BOLD EPSILON SYMBOL..MATHEMATICAL SANS-SERIF BOLD PI SYMBOL 1D7AA..1D7C2 ; Lowercase # L& [25] MATHEMATICAL SANS-SERIF BOLD ITALIC SMALL 
ALPHA..MATHEMATICAL SANS-SERIF BOLD ITALIC SMALL OMEGA 1D7C4..1D7C9 ; Lowercase # L& [6] MATHEMATICAL SANS-SERIF BOLD ITALIC EPSILON SYMBOL..MATHEMATICAL SANS-SERIF BOLD ITALIC PI SYMBOL 1D7CB ; Lowercase # L& MATHEMATICAL BOLD SMALL DIGAMMA 1DF00..1DF09 ; Lowercase # L& [10] LATIN SMALL LETTER FENG DIGRAPH WITH TRILL..LATIN SMALL LETTER T WITH HOOK AND RETROFLEX HOOK 1DF0B..1DF1E ; Lowercase # L& [20] LATIN SMALL LETTER ESH WITH DOUBLE BAR..LATIN SMALL LETTER S WITH CURL 1DF25..1DF2A ; Lowercase # L& [6] LATIN SMALL LETTER D WITH MID-HEIGHT LEFT HOOK..LATIN SMALL LETTER T WITH MID-HEIGHT LEFT HOOK 1E030..1E06D ; Lowercase # Lm [62] MODIFIER LETTER CYRILLIC SMALL A..MODIFIER LETTER CYRILLIC SMALL STRAIGHT U WITH STROKE 1E922..1E943 ; Lowercase # L& [34] ADLAM SMALL LETTER ALIF..ADLAM SMALL LETTER SHA # Total code points: 2595 # ================================================ # Derived Property: Uppercase # Generated from: Lu + Other_Uppercase 0041..005A ; Uppercase # L& [26] LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z 00C0..00D6 ; Uppercase # L& [23] LATIN CAPITAL LETTER A WITH GRAVE..LATIN CAPITAL LETTER O WITH DIAERESIS 00D8..00DE ; Uppercase # L& [7] LATIN CAPITAL LETTER O WITH STROKE..LATIN CAPITAL LETTER THORN 0100 ; Uppercase # L& LATIN CAPITAL LETTER A WITH MACRON 0102 ; Uppercase # L& LATIN CAPITAL LETTER A WITH BREVE 0104 ; Uppercase # L& LATIN CAPITAL LETTER A WITH OGONEK 0106 ; Uppercase # L& LATIN CAPITAL LETTER C WITH ACUTE 0108 ; Uppercase # L& LATIN CAPITAL LETTER C WITH CIRCUMFLEX 010A ; Uppercase # L& LATIN CAPITAL LETTER C WITH DOT ABOVE 010C ; Uppercase # L& LATIN CAPITAL LETTER C WITH CARON 010E ; Uppercase # L& LATIN CAPITAL LETTER D WITH CARON 0110 ; Uppercase # L& LATIN CAPITAL LETTER D WITH STROKE 0112 ; Uppercase # L& LATIN CAPITAL LETTER E WITH MACRON 0114 ; Uppercase # L& LATIN CAPITAL LETTER E WITH BREVE 0116 ; Uppercase # L& LATIN CAPITAL LETTER E WITH DOT ABOVE 0118 ; Uppercase # L& LATIN CAPITAL LETTER E WITH OGONEK 011A ; 
Uppercase # L& LATIN CAPITAL LETTER E WITH CARON 011C ; Uppercase # L& LATIN CAPITAL LETTER G WITH CIRCUMFLEX 011E ; Uppercase # L& LATIN CAPITAL LETTER G WITH BREVE 0120 ; Uppercase # L& LATIN CAPITAL LETTER G WITH DOT ABOVE 0122 ; Uppercase # L& LATIN CAPITAL LETTER G WITH CEDILLA 0124 ; Uppercase # L& LATIN CAPITAL LETTER H WITH CIRCUMFLEX 0126 ; Uppercase # L& LATIN CAPITAL LETTER H WITH STROKE 0128 ; Uppercase # L& LATIN CAPITAL LETTER I WITH TILDE 012A ; Uppercase # L& LATIN CAPITAL LETTER I WITH MACRON 012C ; Uppercase # L& LATIN CAPITAL LETTER I WITH BREVE 012E ; Uppercase # L& LATIN CAPITAL LETTER I WITH OGONEK 0130 ; Uppercase # L& LATIN CAPITAL LETTER I WITH DOT ABOVE 0132 ; Uppercase # L& LATIN CAPITAL LIGATURE IJ 0134 ; Uppercase # L& LATIN CAPITAL LETTER J WITH CIRCUMFLEX 0136 ; Uppercase # L& LATIN CAPITAL LETTER K WITH CEDILLA 0139 ; Uppercase # L& LATIN CAPITAL LETTER L WITH ACUTE 013B ; Uppercase # L& LATIN CAPITAL LETTER L WITH CEDILLA 013D ; Uppercase # L& LATIN CAPITAL LETTER L WITH CARON 013F ; Uppercase # L& LATIN CAPITAL LETTER L WITH MIDDLE DOT 0141 ; Uppercase # L& LATIN CAPITAL LETTER L WITH STROKE 0143 ; Uppercase # L& LATIN CAPITAL LETTER N WITH ACUTE 0145 ; Uppercase # L& LATIN CAPITAL LETTER N WITH CEDILLA 0147 ; Uppercase # L& LATIN CAPITAL LETTER N WITH CARON 014A ; Uppercase # L& LATIN CAPITAL LETTER ENG 014C ; Uppercase # L& LATIN CAPITAL LETTER O WITH MACRON 014E ; Uppercase # L& LATIN CAPITAL LETTER O WITH BREVE 0150 ; Uppercase # L& LATIN CAPITAL LETTER O WITH DOUBLE ACUTE 0152 ; Uppercase # L& LATIN CAPITAL LIGATURE OE 0154 ; Uppercase # L& LATIN CAPITAL LETTER R WITH ACUTE 0156 ; Uppercase # L& LATIN CAPITAL LETTER R WITH CEDILLA 0158 ; Uppercase # L& LATIN CAPITAL LETTER R WITH CARON 015A ; Uppercase # L& LATIN CAPITAL LETTER S WITH ACUTE 015C ; Uppercase # L& LATIN CAPITAL LETTER S WITH CIRCUMFLEX 015E ; Uppercase # L& LATIN CAPITAL LETTER S WITH CEDILLA 0160 ; Uppercase # L& LATIN CAPITAL LETTER S WITH CARON 0162 ; 
Uppercase # L& LATIN CAPITAL LETTER T WITH CEDILLA 0164 ; Uppercase # L& LATIN CAPITAL LETTER T WITH CARON 0166 ; Uppercase # L& LATIN CAPITAL LETTER T WITH STROKE 0168 ; Uppercase # L& LATIN CAPITAL LETTER U WITH TILDE 016A ; Uppercase # L& LATIN CAPITAL LETTER U WITH MACRON 016C ; Uppercase # L& LATIN CAPITAL LETTER U WITH BREVE 016E ; Uppercase # L& LATIN CAPITAL LETTER U WITH RING ABOVE 0170 ; Uppercase # L& LATIN CAPITAL LETTER U WITH DOUBLE ACUTE 0172 ; Uppercase # L& LATIN CAPITAL LETTER U WITH OGONEK 0174 ; Uppercase # L& LATIN CAPITAL LETTER W WITH CIRCUMFLEX 0176 ; Uppercase # L& LATIN CAPITAL LETTER Y WITH CIRCUMFLEX 0178..0179 ; Uppercase # L& [2] LATIN CAPITAL LETTER Y WITH DIAERESIS..LATIN CAPITAL LETTER Z WITH ACUTE 017B ; Uppercase # L& LATIN CAPITAL LETTER Z WITH DOT ABOVE 017D ; Uppercase # L& LATIN CAPITAL LETTER Z WITH CARON 0181..0182 ; Uppercase # L& [2] LATIN CAPITAL LETTER B WITH HOOK..LATIN CAPITAL LETTER B WITH TOPBAR 0184 ; Uppercase # L& LATIN CAPITAL LETTER TONE SIX 0186..0187 ; Uppercase # L& [2] LATIN CAPITAL LETTER OPEN O..LATIN CAPITAL LETTER C WITH HOOK 0189..018B ; Uppercase # L& [3] LATIN CAPITAL LETTER AFRICAN D..LATIN CAPITAL LETTER D WITH TOPBAR 018E..0191 ; Uppercase # L& [4] LATIN CAPITAL LETTER REVERSED E..LATIN CAPITAL LETTER F WITH HOOK 0193..0194 ; Uppercase # L& [2] LATIN CAPITAL LETTER G WITH HOOK..LATIN CAPITAL LETTER GAMMA 0196..0198 ; Uppercase # L& [3] LATIN CAPITAL LETTER IOTA..LATIN CAPITAL LETTER K WITH HOOK 019C..019D ; Uppercase # L& [2] LATIN CAPITAL LETTER TURNED M..LATIN CAPITAL LETTER N WITH LEFT HOOK 019F..01A0 ; Uppercase # L& [2] LATIN CAPITAL LETTER O WITH MIDDLE TILDE..LATIN CAPITAL LETTER O WITH HORN 01A2 ; Uppercase # L& LATIN CAPITAL LETTER OI 01A4 ; Uppercase # L& LATIN CAPITAL LETTER P WITH HOOK 01A6..01A7 ; Uppercase # L& [2] LATIN LETTER YR..LATIN CAPITAL LETTER TONE TWO 01A9 ; Uppercase # L& LATIN CAPITAL LETTER ESH 01AC ; Uppercase # L& LATIN CAPITAL LETTER T WITH HOOK 01AE..01AF ; Uppercase 
# L& [2] LATIN CAPITAL LETTER T WITH RETROFLEX HOOK..LATIN CAPITAL LETTER U WITH HORN 01B1..01B3 ; Uppercase # L& [3] LATIN CAPITAL LETTER UPSILON..LATIN CAPITAL LETTER Y WITH HOOK 01B5 ; Uppercase # L& LATIN CAPITAL LETTER Z WITH STROKE 01B7..01B8 ; Uppercase # L& [2] LATIN CAPITAL LETTER EZH..LATIN CAPITAL LETTER EZH REVERSED 01BC ; Uppercase # L& LATIN CAPITAL LETTER TONE FIVE 01C4 ; Uppercase # L& LATIN CAPITAL LETTER DZ WITH CARON 01C7 ; Uppercase # L& LATIN CAPITAL LETTER LJ 01CA ; Uppercase # L& LATIN CAPITAL LETTER NJ 01CD ; Uppercase # L& LATIN CAPITAL LETTER A WITH CARON 01CF ; Uppercase # L& LATIN CAPITAL LETTER I WITH CARON 01D1 ; Uppercase # L& LATIN CAPITAL LETTER O WITH CARON 01D3 ; Uppercase # L& LATIN CAPITAL LETTER U WITH CARON 01D5 ; Uppercase # L& LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON 01D7 ; Uppercase # L& LATIN CAPITAL LETTER U WITH DIAERESIS AND ACUTE 01D9 ; Uppercase # L& LATIN CAPITAL LETTER U WITH DIAERESIS AND CARON 01DB ; Uppercase # L& LATIN CAPITAL LETTER U WITH DIAERESIS AND GRAVE 01DE ; Uppercase # L& LATIN CAPITAL LETTER A WITH DIAERESIS AND MACRON 01E0 ; Uppercase # L& LATIN CAPITAL LETTER A WITH DOT ABOVE AND MACRON 01E2 ; Uppercase # L& LATIN CAPITAL LETTER AE WITH MACRON 01E4 ; Uppercase # L& LATIN CAPITAL LETTER G WITH STROKE 01E6 ; Uppercase # L& LATIN CAPITAL LETTER G WITH CARON 01E8 ; Uppercase # L& LATIN CAPITAL LETTER K WITH CARON 01EA ; Uppercase # L& LATIN CAPITAL LETTER O WITH OGONEK 01EC ; Uppercase # L& LATIN CAPITAL LETTER O WITH OGONEK AND MACRON 01EE ; Uppercase # L& LATIN CAPITAL LETTER EZH WITH CARON 01F1 ; Uppercase # L& LATIN CAPITAL LETTER DZ 01F4 ; Uppercase # L& LATIN CAPITAL LETTER G WITH ACUTE 01F6..01F8 ; Uppercase # L& [3] LATIN CAPITAL LETTER HWAIR..LATIN CAPITAL LETTER N WITH GRAVE 01FA ; Uppercase # L& LATIN CAPITAL LETTER A WITH RING ABOVE AND ACUTE 01FC ; Uppercase # L& LATIN CAPITAL LETTER AE WITH ACUTE 01FE ; Uppercase # L& LATIN CAPITAL LETTER O WITH STROKE AND ACUTE 0200 ; Uppercase # 
L& LATIN CAPITAL LETTER A WITH DOUBLE GRAVE 0202 ; Uppercase # L& LATIN CAPITAL LETTER A WITH INVERTED BREVE 0204 ; Uppercase # L& LATIN CAPITAL LETTER E WITH DOUBLE GRAVE 0206 ; Uppercase # L& LATIN CAPITAL LETTER E WITH INVERTED BREVE 0208 ; Uppercase # L& LATIN CAPITAL LETTER I WITH DOUBLE GRAVE 020A ; Uppercase # L& LATIN CAPITAL LETTER I WITH INVERTED BREVE 020C ; Uppercase # L& LATIN CAPITAL LETTER O WITH DOUBLE GRAVE 020E ; Uppercase # L& LATIN CAPITAL LETTER O WITH INVERTED BREVE 0210 ; Uppercase # L& LATIN CAPITAL LETTER R WITH DOUBLE GRAVE 0212 ; Uppercase # L& LATIN CAPITAL LETTER R WITH INVERTED BREVE 0214 ; Uppercase # L& LATIN CAPITAL LETTER U WITH DOUBLE GRAVE 0216 ; Uppercase # L& LATIN CAPITAL LETTER U WITH INVERTED BREVE 0218 ; Uppercase # L& LATIN CAPITAL LETTER S WITH COMMA BELOW 021A ; Uppercase # L& LATIN CAPITAL LETTER T WITH COMMA BELOW 021C ; Uppercase # L& LATIN CAPITAL LETTER YOGH 021E ; Uppercase # L& LATIN CAPITAL LETTER H WITH CARON 0220 ; Uppercase # L& LATIN CAPITAL LETTER N WITH LONG RIGHT LEG 0222 ; Uppercase # L& LATIN CAPITAL LETTER OU 0224 ; Uppercase # L& LATIN CAPITAL LETTER Z WITH HOOK 0226 ; Uppercase # L& LATIN CAPITAL LETTER A WITH DOT ABOVE 0228 ; Uppercase # L& LATIN CAPITAL LETTER E WITH CEDILLA 022A ; Uppercase # L& LATIN CAPITAL LETTER O WITH DIAERESIS AND MACRON 022C ; Uppercase # L& LATIN CAPITAL LETTER O WITH TILDE AND MACRON 022E ; Uppercase # L& LATIN CAPITAL LETTER O WITH DOT ABOVE 0230 ; Uppercase # L& LATIN CAPITAL LETTER O WITH DOT ABOVE AND MACRON 0232 ; Uppercase # L& LATIN CAPITAL LETTER Y WITH MACRON 023A..023B ; Uppercase # L& [2] LATIN CAPITAL LETTER A WITH STROKE..LATIN CAPITAL LETTER C WITH STROKE 023D..023E ; Uppercase # L& [2] LATIN CAPITAL LETTER L WITH BAR..LATIN CAPITAL LETTER T WITH DIAGONAL STROKE 0241 ; Uppercase # L& LATIN CAPITAL LETTER GLOTTAL STOP 0243..0246 ; Uppercase # L& [4] LATIN CAPITAL LETTER B WITH STROKE..LATIN CAPITAL LETTER E WITH STROKE 0248 ; Uppercase # L& LATIN CAPITAL 
LETTER J WITH STROKE 024A ; Uppercase # L& LATIN CAPITAL LETTER SMALL Q WITH HOOK TAIL 024C ; Uppercase # L& LATIN CAPITAL LETTER R WITH STROKE 024E ; Uppercase # L& LATIN CAPITAL LETTER Y WITH STROKE 0370 ; Uppercase # L& GREEK CAPITAL LETTER HETA 0372 ; Uppercase # L& GREEK CAPITAL LETTER ARCHAIC SAMPI 0376 ; Uppercase # L& GREEK CAPITAL LETTER PAMPHYLIAN DIGAMMA 037F ; Uppercase # L& GREEK CAPITAL LETTER YOT 0386 ; Uppercase # L& GREEK CAPITAL LETTER ALPHA WITH TONOS 0388..038A ; Uppercase # L& [3] GREEK CAPITAL LETTER EPSILON WITH TONOS..GREEK CAPITAL LETTER IOTA WITH TONOS 038C ; Uppercase # L& GREEK CAPITAL LETTER OMICRON WITH TONOS 038E..038F ; Uppercase # L& [2] GREEK CAPITAL LETTER UPSILON WITH TONOS..GREEK CAPITAL LETTER OMEGA WITH TONOS 0391..03A1 ; Uppercase # L& [17] GREEK CAPITAL LETTER ALPHA..GREEK CAPITAL LETTER RHO 03A3..03AB ; Uppercase # L& [9] GREEK CAPITAL LETTER SIGMA..GREEK CAPITAL LETTER UPSILON WITH DIALYTIKA 03CF ; Uppercase # L& GREEK CAPITAL KAI SYMBOL 03D2..03D4 ; Uppercase # L& [3] GREEK UPSILON WITH HOOK SYMBOL..GREEK UPSILON WITH DIAERESIS AND HOOK SYMBOL 03D8 ; Uppercase # L& GREEK LETTER ARCHAIC KOPPA 03DA ; Uppercase # L& GREEK LETTER STIGMA 03DC ; Uppercase # L& GREEK LETTER DIGAMMA 03DE ; Uppercase # L& GREEK LETTER KOPPA 03E0 ; Uppercase # L& GREEK LETTER SAMPI 03E2 ; Uppercase # L& COPTIC CAPITAL LETTER SHEI 03E4 ; Uppercase # L& COPTIC CAPITAL LETTER FEI 03E6 ; Uppercase # L& COPTIC CAPITAL LETTER KHEI 03E8 ; Uppercase # L& COPTIC CAPITAL LETTER HORI 03EA ; Uppercase # L& COPTIC CAPITAL LETTER GANGIA 03EC ; Uppercase # L& COPTIC CAPITAL LETTER SHIMA 03EE ; Uppercase # L& COPTIC CAPITAL LETTER DEI 03F4 ; Uppercase # L& GREEK CAPITAL THETA SYMBOL 03F7 ; Uppercase # L& GREEK CAPITAL LETTER SHO 03F9..03FA ; Uppercase # L& [2] GREEK CAPITAL LUNATE SIGMA SYMBOL..GREEK CAPITAL LETTER SAN 03FD..042F ; Uppercase # L& [51] GREEK CAPITAL REVERSED LUNATE SIGMA SYMBOL..CYRILLIC CAPITAL LETTER YA 0460 ; Uppercase # L& CYRILLIC CAPITAL 
LETTER OMEGA 0462 ; Uppercase # L& CYRILLIC CAPITAL LETTER YAT 0464 ; Uppercase # L& CYRILLIC CAPITAL LETTER IOTIFIED E 0466 ; Uppercase # L& CYRILLIC CAPITAL LETTER LITTLE YUS 0468 ; Uppercase # L& CYRILLIC CAPITAL LETTER IOTIFIED LITTLE YUS 046A ; Uppercase # L& CYRILLIC CAPITAL LETTER BIG YUS 046C ; Uppercase # L& CYRILLIC CAPITAL LETTER IOTIFIED BIG YUS 046E ; Uppercase # L& CYRILLIC CAPITAL LETTER KSI 0470 ; Uppercase # L& CYRILLIC CAPITAL LETTER PSI 0472 ; Uppercase # L& CYRILLIC CAPITAL LETTER FITA 0474 ; Uppercase # L& CYRILLIC CAPITAL LETTER IZHITSA 0476 ; Uppercase # L& CYRILLIC CAPITAL LETTER IZHITSA WITH DOUBLE GRAVE ACCENT 0478 ; Uppercase # L& CYRILLIC CAPITAL LETTER UK 047A ; Uppercase # L& CYRILLIC CAPITAL LETTER ROUND OMEGA 047C ; Uppercase # L& CYRILLIC CAPITAL LETTER OMEGA WITH TITLO 047E ; Uppercase # L& CYRILLIC CAPITAL LETTER OT 0480 ; Uppercase # L& CYRILLIC CAPITAL LETTER KOPPA 048A ; Uppercase # L& CYRILLIC CAPITAL LETTER SHORT I WITH TAIL 048C ; Uppercase # L& CYRILLIC CAPITAL LETTER SEMISOFT SIGN 048E ; Uppercase # L& CYRILLIC CAPITAL LETTER ER WITH TICK 0490 ; Uppercase # L& CYRILLIC CAPITAL LETTER GHE WITH UPTURN 0492 ; Uppercase # L& CYRILLIC CAPITAL LETTER GHE WITH STROKE 0494 ; Uppercase # L& CYRILLIC CAPITAL LETTER GHE WITH MIDDLE HOOK 0496 ; Uppercase # L& CYRILLIC CAPITAL LETTER ZHE WITH DESCENDER 0498 ; Uppercase # L& CYRILLIC CAPITAL LETTER ZE WITH DESCENDER 049A ; Uppercase # L& CYRILLIC CAPITAL LETTER KA WITH DESCENDER 049C ; Uppercase # L& CYRILLIC CAPITAL LETTER KA WITH VERTICAL STROKE 049E ; Uppercase # L& CYRILLIC CAPITAL LETTER KA WITH STROKE 04A0 ; Uppercase # L& CYRILLIC CAPITAL LETTER BASHKIR KA 04A2 ; Uppercase # L& CYRILLIC CAPITAL LETTER EN WITH DESCENDER 04A4 ; Uppercase # L& CYRILLIC CAPITAL LIGATURE EN GHE 04A6 ; Uppercase # L& CYRILLIC CAPITAL LETTER PE WITH MIDDLE HOOK 04A8 ; Uppercase # L& CYRILLIC CAPITAL LETTER ABKHASIAN HA 04AA ; Uppercase # L& CYRILLIC CAPITAL LETTER ES WITH DESCENDER 04AC ; Uppercase # L& 
CYRILLIC CAPITAL LETTER TE WITH DESCENDER 04AE ; Uppercase # L& CYRILLIC CAPITAL LETTER STRAIGHT U 04B0 ; Uppercase # L& CYRILLIC CAPITAL LETTER STRAIGHT U WITH STROKE 04B2 ; Uppercase # L& CYRILLIC CAPITAL LETTER HA WITH DESCENDER 04B4 ; Uppercase # L& CYRILLIC CAPITAL LIGATURE TE TSE 04B6 ; Uppercase # L& CYRILLIC CAPITAL LETTER CHE WITH DESCENDER 04B8 ; Uppercase # L& CYRILLIC CAPITAL LETTER CHE WITH VERTICAL STROKE 04BA ; Uppercase # L& CYRILLIC CAPITAL LETTER SHHA 04BC ; Uppercase # L& CYRILLIC CAPITAL LETTER ABKHASIAN CHE 04BE ; Uppercase # L& CYRILLIC CAPITAL LETTER ABKHASIAN CHE WITH DESCENDER 04C0..04C1 ; Uppercase # L& [2] CYRILLIC LETTER PALOCHKA..CYRILLIC CAPITAL LETTER ZHE WITH BREVE 04C3 ; Uppercase # L& CYRILLIC CAPITAL LETTER KA WITH HOOK 04C5 ; Uppercase # L& CYRILLIC CAPITAL LETTER EL WITH TAIL 04C7 ; Uppercase # L& CYRILLIC CAPITAL LETTER EN WITH HOOK 04C9 ; Uppercase # L& CYRILLIC CAPITAL LETTER EN WITH TAIL 04CB ; Uppercase # L& CYRILLIC CAPITAL LETTER KHAKASSIAN CHE 04CD ; Uppercase # L& CYRILLIC CAPITAL LETTER EM WITH TAIL 04D0 ; Uppercase # L& CYRILLIC CAPITAL LETTER A WITH BREVE 04D2 ; Uppercase # L& CYRILLIC CAPITAL LETTER A WITH DIAERESIS 04D4 ; Uppercase # L& CYRILLIC CAPITAL LIGATURE A IE 04D6 ; Uppercase # L& CYRILLIC CAPITAL LETTER IE WITH BREVE 04D8 ; Uppercase # L& CYRILLIC CAPITAL LETTER SCHWA 04DA ; Uppercase # L& CYRILLIC CAPITAL LETTER SCHWA WITH DIAERESIS 04DC ; Uppercase # L& CYRILLIC CAPITAL LETTER ZHE WITH DIAERESIS 04DE ; Uppercase # L& CYRILLIC CAPITAL LETTER ZE WITH DIAERESIS 04E0 ; Uppercase # L& CYRILLIC CAPITAL LETTER ABKHASIAN DZE 04E2 ; Uppercase # L& CYRILLIC CAPITAL LETTER I WITH MACRON 04E4 ; Uppercase # L& CYRILLIC CAPITAL LETTER I WITH DIAERESIS 04E6 ; Uppercase # L& CYRILLIC CAPITAL LETTER O WITH DIAERESIS 04E8 ; Uppercase # L& CYRILLIC CAPITAL LETTER BARRED O 04EA ; Uppercase # L& CYRILLIC CAPITAL LETTER BARRED O WITH DIAERESIS 04EC ; Uppercase # L& CYRILLIC CAPITAL LETTER E WITH DIAERESIS 04EE ; Uppercase # 
L& CYRILLIC CAPITAL LETTER U WITH MACRON 04F0 ; Uppercase # L& CYRILLIC CAPITAL LETTER U WITH DIAERESIS 04F2 ; Uppercase # L& CYRILLIC CAPITAL LETTER U WITH DOUBLE ACUTE 04F4 ; Uppercase # L& CYRILLIC CAPITAL LETTER CHE WITH DIAERESIS 04F6 ; Uppercase # L& CYRILLIC CAPITAL LETTER GHE WITH DESCENDER 04F8 ; Uppercase # L& CYRILLIC CAPITAL LETTER YERU WITH DIAERESIS 04FA ; Uppercase # L& CYRILLIC CAPITAL LETTER GHE WITH STROKE AND HOOK 04FC ; Uppercase # L& CYRILLIC CAPITAL LETTER HA WITH HOOK 04FE ; Uppercase # L& CYRILLIC CAPITAL LETTER HA WITH STROKE 0500 ; Uppercase # L& CYRILLIC CAPITAL LETTER KOMI DE 0502 ; Uppercase # L& CYRILLIC CAPITAL LETTER KOMI DJE 0504 ; Uppercase # L& CYRILLIC CAPITAL LETTER KOMI ZJE 0506 ; Uppercase # L& CYRILLIC CAPITAL LETTER KOMI DZJE 0508 ; Uppercase # L& CYRILLIC CAPITAL LETTER KOMI LJE 050A ; Uppercase # L& CYRILLIC CAPITAL LETTER KOMI NJE 050C ; Uppercase # L& CYRILLIC CAPITAL LETTER KOMI SJE 050E ; Uppercase # L& CYRILLIC CAPITAL LETTER KOMI TJE 0510 ; Uppercase # L& CYRILLIC CAPITAL LETTER REVERSED ZE 0512 ; Uppercase # L& CYRILLIC CAPITAL LETTER EL WITH HOOK 0514 ; Uppercase # L& CYRILLIC CAPITAL LETTER LHA 0516 ; Uppercase # L& CYRILLIC CAPITAL LETTER RHA 0518 ; Uppercase # L& CYRILLIC CAPITAL LETTER YAE 051A ; Uppercase # L& CYRILLIC CAPITAL LETTER QA 051C ; Uppercase # L& CYRILLIC CAPITAL LETTER WE 051E ; Uppercase # L& CYRILLIC CAPITAL LETTER ALEUT KA 0520 ; Uppercase # L& CYRILLIC CAPITAL LETTER EL WITH MIDDLE HOOK 0522 ; Uppercase # L& CYRILLIC CAPITAL LETTER EN WITH MIDDLE HOOK 0524 ; Uppercase # L& CYRILLIC CAPITAL LETTER PE WITH DESCENDER 0526 ; Uppercase # L& CYRILLIC CAPITAL LETTER SHHA WITH DESCENDER 0528 ; Uppercase # L& CYRILLIC CAPITAL LETTER EN WITH LEFT HOOK 052A ; Uppercase # L& CYRILLIC CAPITAL LETTER DZZHE 052C ; Uppercase # L& CYRILLIC CAPITAL LETTER DCHE 052E ; Uppercase # L& CYRILLIC CAPITAL LETTER EL WITH DESCENDER 0531..0556 ; Uppercase # L& [38] ARMENIAN CAPITAL LETTER AYB..ARMENIAN CAPITAL LETTER FEH 
10A0..10C5 ; Uppercase # L& [38] GEORGIAN CAPITAL LETTER AN..GEORGIAN CAPITAL LETTER HOE 10C7 ; Uppercase # L& GEORGIAN CAPITAL LETTER YN 10CD ; Uppercase # L& GEORGIAN CAPITAL LETTER AEN 13A0..13F5 ; Uppercase # L& [86] CHEROKEE LETTER A..CHEROKEE LETTER MV 1C89 ; Uppercase # L& CYRILLIC CAPITAL LETTER TJE 1C90..1CBA ; Uppercase # L& [43] GEORGIAN MTAVRULI CAPITAL LETTER AN..GEORGIAN MTAVRULI CAPITAL LETTER AIN 1CBD..1CBF ; Uppercase # L& [3] GEORGIAN MTAVRULI CAPITAL LETTER AEN..GEORGIAN MTAVRULI CAPITAL LETTER LABIAL SIGN 1E00 ; Uppercase # L& LATIN CAPITAL LETTER A WITH RING BELOW 1E02 ; Uppercase # L& LATIN CAPITAL LETTER B WITH DOT ABOVE 1E04 ; Uppercase # L& LATIN CAPITAL LETTER B WITH DOT BELOW 1E06 ; Uppercase # L& LATIN CAPITAL LETTER B WITH LINE BELOW 1E08 ; Uppercase # L& LATIN CAPITAL LETTER C WITH CEDILLA AND ACUTE 1E0A ; Uppercase # L& LATIN CAPITAL LETTER D WITH DOT ABOVE 1E0C ; Uppercase # L& LATIN CAPITAL LETTER D WITH DOT BELOW 1E0E ; Uppercase # L& LATIN CAPITAL LETTER D WITH LINE BELOW 1E10 ; Uppercase # L& LATIN CAPITAL LETTER D WITH CEDILLA 1E12 ; Uppercase # L& LATIN CAPITAL LETTER D WITH CIRCUMFLEX BELOW 1E14 ; Uppercase # L& LATIN CAPITAL LETTER E WITH MACRON AND GRAVE 1E16 ; Uppercase # L& LATIN CAPITAL LETTER E WITH MACRON AND ACUTE 1E18 ; Uppercase # L& LATIN CAPITAL LETTER E WITH CIRCUMFLEX BELOW 1E1A ; Uppercase # L& LATIN CAPITAL LETTER E WITH TILDE BELOW 1E1C ; Uppercase # L& LATIN CAPITAL LETTER E WITH CEDILLA AND BREVE 1E1E ; Uppercase # L& LATIN CAPITAL LETTER F WITH DOT ABOVE 1E20 ; Uppercase # L& LATIN CAPITAL LETTER G WITH MACRON 1E22 ; Uppercase # L& LATIN CAPITAL LETTER H WITH DOT ABOVE 1E24 ; Uppercase # L& LATIN CAPITAL LETTER H WITH DOT BELOW 1E26 ; Uppercase # L& LATIN CAPITAL LETTER H WITH DIAERESIS 1E28 ; Uppercase # L& LATIN CAPITAL LETTER H WITH CEDILLA 1E2A ; Uppercase # L& LATIN CAPITAL LETTER H WITH BREVE BELOW 1E2C ; Uppercase # L& LATIN CAPITAL LETTER I WITH TILDE BELOW 1E2E ; Uppercase # L& LATIN CAPITAL LETTER 
I WITH DIAERESIS AND ACUTE 1E30 ; Uppercase # L& LATIN CAPITAL LETTER K WITH ACUTE 1E32 ; Uppercase # L& LATIN CAPITAL LETTER K WITH DOT BELOW 1E34 ; Uppercase # L& LATIN CAPITAL LETTER K WITH LINE BELOW 1E36 ; Uppercase # L& LATIN CAPITAL LETTER L WITH DOT BELOW 1E38 ; Uppercase # L& LATIN CAPITAL LETTER L WITH DOT BELOW AND MACRON 1E3A ; Uppercase # L& LATIN CAPITAL LETTER L WITH LINE BELOW 1E3C ; Uppercase # L& LATIN CAPITAL LETTER L WITH CIRCUMFLEX BELOW 1E3E ; Uppercase # L& LATIN CAPITAL LETTER M WITH ACUTE 1E40 ; Uppercase # L& LATIN CAPITAL LETTER M WITH DOT ABOVE 1E42 ; Uppercase # L& LATIN CAPITAL LETTER M WITH DOT BELOW 1E44 ; Uppercase # L& LATIN CAPITAL LETTER N WITH DOT ABOVE 1E46 ; Uppercase # L& LATIN CAPITAL LETTER N WITH DOT BELOW 1E48 ; Uppercase # L& LATIN CAPITAL LETTER N WITH LINE BELOW 1E4A ; Uppercase # L& LATIN CAPITAL LETTER N WITH CIRCUMFLEX BELOW 1E4C ; Uppercase # L& LATIN CAPITAL LETTER O WITH TILDE AND ACUTE 1E4E ; Uppercase # L& LATIN CAPITAL LETTER O WITH TILDE AND DIAERESIS 1E50 ; Uppercase # L& LATIN CAPITAL LETTER O WITH MACRON AND GRAVE 1E52 ; Uppercase # L& LATIN CAPITAL LETTER O WITH MACRON AND ACUTE 1E54 ; Uppercase # L& LATIN CAPITAL LETTER P WITH ACUTE 1E56 ; Uppercase # L& LATIN CAPITAL LETTER P WITH DOT ABOVE 1E58 ; Uppercase # L& LATIN CAPITAL LETTER R WITH DOT ABOVE 1E5A ; Uppercase # L& LATIN CAPITAL LETTER R WITH DOT BELOW 1E5C ; Uppercase # L& LATIN CAPITAL LETTER R WITH DOT BELOW AND MACRON 1E5E ; Uppercase # L& LATIN CAPITAL LETTER R WITH LINE BELOW 1E60 ; Uppercase # L& LATIN CAPITAL LETTER S WITH DOT ABOVE 1E62 ; Uppercase # L& LATIN CAPITAL LETTER S WITH DOT BELOW 1E64 ; Uppercase # L& LATIN CAPITAL LETTER S WITH ACUTE AND DOT ABOVE 1E66 ; Uppercase # L& LATIN CAPITAL LETTER S WITH CARON AND DOT ABOVE 1E68 ; Uppercase # L& LATIN CAPITAL LETTER S WITH DOT BELOW AND DOT ABOVE 1E6A ; Uppercase # L& LATIN CAPITAL LETTER T WITH DOT ABOVE 1E6C ; Uppercase # L& LATIN CAPITAL LETTER T WITH DOT BELOW 1E6E ; Uppercase # 
L& LATIN CAPITAL LETTER T WITH LINE BELOW 1E70 ; Uppercase # L& LATIN CAPITAL LETTER T WITH CIRCUMFLEX BELOW 1E72 ; Uppercase # L& LATIN CAPITAL LETTER U WITH DIAERESIS BELOW 1E74 ; Uppercase # L& LATIN CAPITAL LETTER U WITH TILDE BELOW 1E76 ; Uppercase # L& LATIN CAPITAL LETTER U WITH CIRCUMFLEX BELOW 1E78 ; Uppercase # L& LATIN CAPITAL LETTER U WITH TILDE AND ACUTE 1E7A ; Uppercase # L& LATIN CAPITAL LETTER U WITH MACRON AND DIAERESIS 1E7C ; Uppercase # L& LATIN CAPITAL LETTER V WITH TILDE 1E7E ; Uppercase # L& LATIN CAPITAL LETTER V WITH DOT BELOW 1E80 ; Uppercase # L& LATIN CAPITAL LETTER W WITH GRAVE 1E82 ; Uppercase # L& LATIN CAPITAL LETTER W WITH ACUTE 1E84 ; Uppercase # L& LATIN CAPITAL LETTER W WITH DIAERESIS 1E86 ; Uppercase # L& LATIN CAPITAL LETTER W WITH DOT ABOVE 1E88 ; Uppercase # L& LATIN CAPITAL LETTER W WITH DOT BELOW 1E8A ; Uppercase # L& LATIN CAPITAL LETTER X WITH DOT ABOVE 1E8C ; Uppercase # L& LATIN CAPITAL LETTER X WITH DIAERESIS 1E8E ; Uppercase # L& LATIN CAPITAL LETTER Y WITH DOT ABOVE 1E90 ; Uppercase # L& LATIN CAPITAL LETTER Z WITH CIRCUMFLEX 1E92 ; Uppercase # L& LATIN CAPITAL LETTER Z WITH DOT BELOW 1E94 ; Uppercase # L& LATIN CAPITAL LETTER Z WITH LINE BELOW 1E9E ; Uppercase # L& LATIN CAPITAL LETTER SHARP S 1EA0 ; Uppercase # L& LATIN CAPITAL LETTER A WITH DOT BELOW 1EA2 ; Uppercase # L& LATIN CAPITAL LETTER A WITH HOOK ABOVE 1EA4 ; Uppercase # L& LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND ACUTE 1EA6 ; Uppercase # L& LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND GRAVE 1EA8 ; Uppercase # L& LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND HOOK ABOVE 1EAA ; Uppercase # L& LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND TILDE 1EAC ; Uppercase # L& LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND DOT BELOW 1EAE ; Uppercase # L& LATIN CAPITAL LETTER A WITH BREVE AND ACUTE 1EB0 ; Uppercase # L& LATIN CAPITAL LETTER A WITH BREVE AND GRAVE 1EB2 ; Uppercase # L& LATIN CAPITAL LETTER A WITH BREVE AND HOOK ABOVE 1EB4 ; Uppercase # L& LATIN CAPITAL LETTER A WITH 
BREVE AND TILDE 1EB6 ; Uppercase # L& LATIN CAPITAL LETTER A WITH BREVE AND DOT BELOW 1EB8 ; Uppercase # L& LATIN CAPITAL LETTER E WITH DOT BELOW 1EBA ; Uppercase # L& LATIN CAPITAL LETTER E WITH HOOK ABOVE 1EBC ; Uppercase # L& LATIN CAPITAL LETTER E WITH TILDE 1EBE ; Uppercase # L& LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND ACUTE 1EC0 ; Uppercase # L& LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND GRAVE 1EC2 ; Uppercase # L& LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND HOOK ABOVE 1EC4 ; Uppercase # L& LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND TILDE 1EC6 ; Uppercase # L& LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND DOT BELOW 1EC8 ; Uppercase # L& LATIN CAPITAL LETTER I WITH HOOK ABOVE 1ECA ; Uppercase # L& LATIN CAPITAL LETTER I WITH DOT BELOW 1ECC ; Uppercase # L& LATIN CAPITAL LETTER O WITH DOT BELOW 1ECE ; Uppercase # L& LATIN CAPITAL LETTER O WITH HOOK ABOVE 1ED0 ; Uppercase # L& LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND ACUTE 1ED2 ; Uppercase # L& LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND GRAVE 1ED4 ; Uppercase # L& LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND HOOK ABOVE 1ED6 ; Uppercase # L& LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND TILDE 1ED8 ; Uppercase # L& LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND DOT BELOW 1EDA ; Uppercase # L& LATIN CAPITAL LETTER O WITH HORN AND ACUTE 1EDC ; Uppercase # L& LATIN CAPITAL LETTER O WITH HORN AND GRAVE 1EDE ; Uppercase # L& LATIN CAPITAL LETTER O WITH HORN AND HOOK ABOVE 1EE0 ; Uppercase # L& LATIN CAPITAL LETTER O WITH HORN AND TILDE 1EE2 ; Uppercase # L& LATIN CAPITAL LETTER O WITH HORN AND DOT BELOW 1EE4 ; Uppercase # L& LATIN CAPITAL LETTER U WITH DOT BELOW 1EE6 ; Uppercase # L& LATIN CAPITAL LETTER U WITH HOOK ABOVE 1EE8 ; Uppercase # L& LATIN CAPITAL LETTER U WITH HORN AND ACUTE 1EEA ; Uppercase # L& LATIN CAPITAL LETTER U WITH HORN AND GRAVE 1EEC ; Uppercase # L& LATIN CAPITAL LETTER U WITH HORN AND HOOK ABOVE 1EEE ; Uppercase # L& LATIN CAPITAL LETTER U WITH HORN AND TILDE 1EF0 ; Uppercase # L& LATIN CAPITAL LETTER U 
WITH HORN AND DOT BELOW 1EF2 ; Uppercase # L& LATIN CAPITAL LETTER Y WITH GRAVE 1EF4 ; Uppercase # L& LATIN CAPITAL LETTER Y WITH DOT BELOW 1EF6 ; Uppercase # L& LATIN CAPITAL LETTER Y WITH HOOK ABOVE 1EF8 ; Uppercase # L& LATIN CAPITAL LETTER Y WITH TILDE 1EFA ; Uppercase # L& LATIN CAPITAL LETTER MIDDLE-WELSH LL 1EFC ; Uppercase # L& LATIN CAPITAL LETTER MIDDLE-WELSH V 1EFE ; Uppercase # L& LATIN CAPITAL LETTER Y WITH LOOP 1F08..1F0F ; Uppercase # L& [8] GREEK CAPITAL LETTER ALPHA WITH PSILI..GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI 1F18..1F1D ; Uppercase # L& [6] GREEK CAPITAL LETTER EPSILON WITH PSILI..GREEK CAPITAL LETTER EPSILON WITH DASIA AND OXIA 1F28..1F2F ; Uppercase # L& [8] GREEK CAPITAL LETTER ETA WITH PSILI..GREEK CAPITAL LETTER ETA WITH DASIA AND PERISPOMENI 1F38..1F3F ; Uppercase # L& [8] GREEK CAPITAL LETTER IOTA WITH PSILI..GREEK CAPITAL LETTER IOTA WITH DASIA AND PERISPOMENI 1F48..1F4D ; Uppercase # L& [6] GREEK CAPITAL LETTER OMICRON WITH PSILI..GREEK CAPITAL LETTER OMICRON WITH DASIA AND OXIA 1F59 ; Uppercase # L& GREEK CAPITAL LETTER UPSILON WITH DASIA 1F5B ; Uppercase # L& GREEK CAPITAL LETTER UPSILON WITH DASIA AND VARIA 1F5D ; Uppercase # L& GREEK CAPITAL LETTER UPSILON WITH DASIA AND OXIA 1F5F ; Uppercase # L& GREEK CAPITAL LETTER UPSILON WITH DASIA AND PERISPOMENI 1F68..1F6F ; Uppercase # L& [8] GREEK CAPITAL LETTER OMEGA WITH PSILI..GREEK CAPITAL LETTER OMEGA WITH DASIA AND PERISPOMENI 1FB8..1FBB ; Uppercase # L& [4] GREEK CAPITAL LETTER ALPHA WITH VRACHY..GREEK CAPITAL LETTER ALPHA WITH OXIA 1FC8..1FCB ; Uppercase # L& [4] GREEK CAPITAL LETTER EPSILON WITH VARIA..GREEK CAPITAL LETTER ETA WITH OXIA 1FD8..1FDB ; Uppercase # L& [4] GREEK CAPITAL LETTER IOTA WITH VRACHY..GREEK CAPITAL LETTER IOTA WITH OXIA 1FE8..1FEC ; Uppercase # L& [5] GREEK CAPITAL LETTER UPSILON WITH VRACHY..GREEK CAPITAL LETTER RHO WITH DASIA 1FF8..1FFB ; Uppercase # L& [4] GREEK CAPITAL LETTER OMICRON WITH VARIA..GREEK CAPITAL LETTER OMEGA WITH OXIA 2102 
; Uppercase # L& DOUBLE-STRUCK CAPITAL C 2107 ; Uppercase # L& EULER CONSTANT 210B..210D ; Uppercase # L& [3] SCRIPT CAPITAL H..DOUBLE-STRUCK CAPITAL H 2110..2112 ; Uppercase # L& [3] SCRIPT CAPITAL I..SCRIPT CAPITAL L 2115 ; Uppercase # L& DOUBLE-STRUCK CAPITAL N 2119..211D ; Uppercase # L& [5] DOUBLE-STRUCK CAPITAL P..DOUBLE-STRUCK CAPITAL R 2124 ; Uppercase # L& DOUBLE-STRUCK CAPITAL Z 2126 ; Uppercase # L& OHM SIGN 2128 ; Uppercase # L& BLACK-LETTER CAPITAL Z 212A..212D ; Uppercase # L& [4] KELVIN SIGN..BLACK-LETTER CAPITAL C 2130..2133 ; Uppercase # L& [4] SCRIPT CAPITAL E..SCRIPT CAPITAL M 213E..213F ; Uppercase # L& [2] DOUBLE-STRUCK CAPITAL GAMMA..DOUBLE-STRUCK CAPITAL PI 2145 ; Uppercase # L& DOUBLE-STRUCK ITALIC CAPITAL D 2160..216F ; Uppercase # Nl [16] ROMAN NUMERAL ONE..ROMAN NUMERAL ONE THOUSAND 2183 ; Uppercase # L& ROMAN NUMERAL REVERSED ONE HUNDRED 24B6..24CF ; Uppercase # So [26] CIRCLED LATIN CAPITAL LETTER A..CIRCLED LATIN CAPITAL LETTER Z 2C00..2C2F ; Uppercase # L& [48] GLAGOLITIC CAPITAL LETTER AZU..GLAGOLITIC CAPITAL LETTER CAUDATE CHRIVI 2C60 ; Uppercase # L& LATIN CAPITAL LETTER L WITH DOUBLE BAR 2C62..2C64 ; Uppercase # L& [3] LATIN CAPITAL LETTER L WITH MIDDLE TILDE..LATIN CAPITAL LETTER R WITH TAIL 2C67 ; Uppercase # L& LATIN CAPITAL LETTER H WITH DESCENDER 2C69 ; Uppercase # L& LATIN CAPITAL LETTER K WITH DESCENDER 2C6B ; Uppercase # L& LATIN CAPITAL LETTER Z WITH DESCENDER 2C6D..2C70 ; Uppercase # L& [4] LATIN CAPITAL LETTER ALPHA..LATIN CAPITAL LETTER TURNED ALPHA 2C72 ; Uppercase # L& LATIN CAPITAL LETTER W WITH HOOK 2C75 ; Uppercase # L& LATIN CAPITAL LETTER HALF H 2C7E..2C80 ; Uppercase # L& [3] LATIN CAPITAL LETTER S WITH SWASH TAIL..COPTIC CAPITAL LETTER ALFA 2C82 ; Uppercase # L& COPTIC CAPITAL LETTER VIDA 2C84 ; Uppercase # L& COPTIC CAPITAL LETTER GAMMA 2C86 ; Uppercase # L& COPTIC CAPITAL LETTER DALDA 2C88 ; Uppercase # L& COPTIC CAPITAL LETTER EIE 2C8A ; Uppercase # L& COPTIC CAPITAL LETTER SOU 2C8C ; Uppercase # L& COPTIC 
CAPITAL LETTER ZATA 2C8E ; Uppercase # L& COPTIC CAPITAL LETTER HATE 2C90 ; Uppercase # L& COPTIC CAPITAL LETTER THETHE 2C92 ; Uppercase # L& COPTIC CAPITAL LETTER IAUDA 2C94 ; Uppercase # L& COPTIC CAPITAL LETTER KAPA 2C96 ; Uppercase # L& COPTIC CAPITAL LETTER LAULA 2C98 ; Uppercase # L& COPTIC CAPITAL LETTER MI 2C9A ; Uppercase # L& COPTIC CAPITAL LETTER NI 2C9C ; Uppercase # L& COPTIC CAPITAL LETTER KSI 2C9E ; Uppercase # L& COPTIC CAPITAL LETTER O 2CA0 ; Uppercase # L& COPTIC CAPITAL LETTER PI 2CA2 ; Uppercase # L& COPTIC CAPITAL LETTER RO 2CA4 ; Uppercase # L& COPTIC CAPITAL LETTER SIMA 2CA6 ; Uppercase # L& COPTIC CAPITAL LETTER TAU 2CA8 ; Uppercase # L& COPTIC CAPITAL LETTER UA 2CAA ; Uppercase # L& COPTIC CAPITAL LETTER FI 2CAC ; Uppercase # L& COPTIC CAPITAL LETTER KHI 2CAE ; Uppercase # L& COPTIC CAPITAL LETTER PSI 2CB0 ; Uppercase # L& COPTIC CAPITAL LETTER OOU 2CB2 ; Uppercase # L& COPTIC CAPITAL LETTER DIALECT-P ALEF 2CB4 ; Uppercase # L& COPTIC CAPITAL LETTER OLD COPTIC AIN 2CB6 ; Uppercase # L& COPTIC CAPITAL LETTER CRYPTOGRAMMIC EIE 2CB8 ; Uppercase # L& COPTIC CAPITAL LETTER DIALECT-P KAPA 2CBA ; Uppercase # L& COPTIC CAPITAL LETTER DIALECT-P NI 2CBC ; Uppercase # L& COPTIC CAPITAL LETTER CRYPTOGRAMMIC NI 2CBE ; Uppercase # L& COPTIC CAPITAL LETTER OLD COPTIC OOU 2CC0 ; Uppercase # L& COPTIC CAPITAL LETTER SAMPI 2CC2 ; Uppercase # L& COPTIC CAPITAL LETTER CROSSED SHEI 2CC4 ; Uppercase # L& COPTIC CAPITAL LETTER OLD COPTIC SHEI 2CC6 ; Uppercase # L& COPTIC CAPITAL LETTER OLD COPTIC ESH 2CC8 ; Uppercase # L& COPTIC CAPITAL LETTER AKHMIMIC KHEI 2CCA ; Uppercase # L& COPTIC CAPITAL LETTER DIALECT-P HORI 2CCC ; Uppercase # L& COPTIC CAPITAL LETTER OLD COPTIC HORI 2CCE ; Uppercase # L& COPTIC CAPITAL LETTER OLD COPTIC HA 2CD0 ; Uppercase # L& COPTIC CAPITAL LETTER L-SHAPED HA 2CD2 ; Uppercase # L& COPTIC CAPITAL LETTER OLD COPTIC HEI 2CD4 ; Uppercase # L& COPTIC CAPITAL LETTER OLD COPTIC HAT 2CD6 ; Uppercase # L& COPTIC CAPITAL LETTER OLD COPTIC GANGIA 
2CD8 ; Uppercase # L& COPTIC CAPITAL LETTER OLD COPTIC DJA 2CDA ; Uppercase # L& COPTIC CAPITAL LETTER OLD COPTIC SHIMA 2CDC ; Uppercase # L& COPTIC CAPITAL LETTER OLD NUBIAN SHIMA 2CDE ; Uppercase # L& COPTIC CAPITAL LETTER OLD NUBIAN NGI 2CE0 ; Uppercase # L& COPTIC CAPITAL LETTER OLD NUBIAN NYI 2CE2 ; Uppercase # L& COPTIC CAPITAL LETTER OLD NUBIAN WAU 2CEB ; Uppercase # L& COPTIC CAPITAL LETTER CRYPTOGRAMMIC SHEI 2CED ; Uppercase # L& COPTIC CAPITAL LETTER CRYPTOGRAMMIC GANGIA 2CF2 ; Uppercase # L& COPTIC CAPITAL LETTER BOHAIRIC KHEI A640 ; Uppercase # L& CYRILLIC CAPITAL LETTER ZEMLYA A642 ; Uppercase # L& CYRILLIC CAPITAL LETTER DZELO A644 ; Uppercase # L& CYRILLIC CAPITAL LETTER REVERSED DZE A646 ; Uppercase # L& CYRILLIC CAPITAL LETTER IOTA A648 ; Uppercase # L& CYRILLIC CAPITAL LETTER DJERV A64A ; Uppercase # L& CYRILLIC CAPITAL LETTER MONOGRAPH UK A64C ; Uppercase # L& CYRILLIC CAPITAL LETTER BROAD OMEGA A64E ; Uppercase # L& CYRILLIC CAPITAL LETTER NEUTRAL YER A650 ; Uppercase # L& CYRILLIC CAPITAL LETTER YERU WITH BACK YER A652 ; Uppercase # L& CYRILLIC CAPITAL LETTER IOTIFIED YAT A654 ; Uppercase # L& CYRILLIC CAPITAL LETTER REVERSED YU A656 ; Uppercase # L& CYRILLIC CAPITAL LETTER IOTIFIED A A658 ; Uppercase # L& CYRILLIC CAPITAL LETTER CLOSED LITTLE YUS A65A ; Uppercase # L& CYRILLIC CAPITAL LETTER BLENDED YUS A65C ; Uppercase # L& CYRILLIC CAPITAL LETTER IOTIFIED CLOSED LITTLE YUS A65E ; Uppercase # L& CYRILLIC CAPITAL LETTER YN A660 ; Uppercase # L& CYRILLIC CAPITAL LETTER REVERSED TSE A662 ; Uppercase # L& CYRILLIC CAPITAL LETTER SOFT DE A664 ; Uppercase # L& CYRILLIC CAPITAL LETTER SOFT EL A666 ; Uppercase # L& CYRILLIC CAPITAL LETTER SOFT EM A668 ; Uppercase # L& CYRILLIC CAPITAL LETTER MONOCULAR O A66A ; Uppercase # L& CYRILLIC CAPITAL LETTER BINOCULAR O A66C ; Uppercase # L& CYRILLIC CAPITAL LETTER DOUBLE MONOCULAR O A680 ; Uppercase # L& CYRILLIC CAPITAL LETTER DWE A682 ; Uppercase # L& CYRILLIC CAPITAL LETTER DZWE A684 ; Uppercase # L& 
CYRILLIC CAPITAL LETTER ZHWE A686 ; Uppercase # L& CYRILLIC CAPITAL LETTER CCHE A688 ; Uppercase # L& CYRILLIC CAPITAL LETTER DZZE A68A ; Uppercase # L& CYRILLIC CAPITAL LETTER TE WITH MIDDLE HOOK A68C ; Uppercase # L& CYRILLIC CAPITAL LETTER TWE A68E ; Uppercase # L& CYRILLIC CAPITAL LETTER TSWE A690 ; Uppercase # L& CYRILLIC CAPITAL LETTER TSSE A692 ; Uppercase # L& CYRILLIC CAPITAL LETTER TCHE A694 ; Uppercase # L& CYRILLIC CAPITAL LETTER HWE A696 ; Uppercase # L& CYRILLIC CAPITAL LETTER SHWE A698 ; Uppercase # L& CYRILLIC CAPITAL LETTER DOUBLE O A69A ; Uppercase # L& CYRILLIC CAPITAL LETTER CROSSED O A722 ; Uppercase # L& LATIN CAPITAL LETTER EGYPTOLOGICAL ALEF A724 ; Uppercase # L& LATIN CAPITAL LETTER EGYPTOLOGICAL AIN A726 ; Uppercase # L& LATIN CAPITAL LETTER HENG A728 ; Uppercase # L& LATIN CAPITAL LETTER TZ A72A ; Uppercase # L& LATIN CAPITAL LETTER TRESILLO A72C ; Uppercase # L& LATIN CAPITAL LETTER CUATRILLO A72E ; Uppercase # L& LATIN CAPITAL LETTER CUATRILLO WITH COMMA A732 ; Uppercase # L& LATIN CAPITAL LETTER AA A734 ; Uppercase # L& LATIN CAPITAL LETTER AO A736 ; Uppercase # L& LATIN CAPITAL LETTER AU A738 ; Uppercase # L& LATIN CAPITAL LETTER AV A73A ; Uppercase # L& LATIN CAPITAL LETTER AV WITH HORIZONTAL BAR A73C ; Uppercase # L& LATIN CAPITAL LETTER AY A73E ; Uppercase # L& LATIN CAPITAL LETTER REVERSED C WITH DOT A740 ; Uppercase # L& LATIN CAPITAL LETTER K WITH STROKE A742 ; Uppercase # L& LATIN CAPITAL LETTER K WITH DIAGONAL STROKE A744 ; Uppercase # L& LATIN CAPITAL LETTER K WITH STROKE AND DIAGONAL STROKE A746 ; Uppercase # L& LATIN CAPITAL LETTER BROKEN L A748 ; Uppercase # L& LATIN CAPITAL LETTER L WITH HIGH STROKE A74A ; Uppercase # L& LATIN CAPITAL LETTER O WITH LONG STROKE OVERLAY A74C ; Uppercase # L& LATIN CAPITAL LETTER O WITH LOOP A74E ; Uppercase # L& LATIN CAPITAL LETTER OO A750 ; Uppercase # L& LATIN CAPITAL LETTER P WITH STROKE THROUGH DESCENDER A752 ; Uppercase # L& LATIN CAPITAL LETTER P WITH FLOURISH A754 ; Uppercase # L& 
LATIN CAPITAL LETTER P WITH SQUIRREL TAIL A756 ; Uppercase # L& LATIN CAPITAL LETTER Q WITH STROKE THROUGH DESCENDER A758 ; Uppercase # L& LATIN CAPITAL LETTER Q WITH DIAGONAL STROKE A75A ; Uppercase # L& LATIN CAPITAL LETTER R ROTUNDA A75C ; Uppercase # L& LATIN CAPITAL LETTER RUM ROTUNDA A75E ; Uppercase # L& LATIN CAPITAL LETTER V WITH DIAGONAL STROKE A760 ; Uppercase # L& LATIN CAPITAL LETTER VY A762 ; Uppercase # L& LATIN CAPITAL LETTER VISIGOTHIC Z A764 ; Uppercase # L& LATIN CAPITAL LETTER THORN WITH STROKE A766 ; Uppercase # L& LATIN CAPITAL LETTER THORN WITH STROKE THROUGH DESCENDER A768 ; Uppercase # L& LATIN CAPITAL LETTER VEND A76A ; Uppercase # L& LATIN CAPITAL LETTER ET A76C ; Uppercase # L& LATIN CAPITAL LETTER IS A76E ; Uppercase # L& LATIN CAPITAL LETTER CON A779 ; Uppercase # L& LATIN CAPITAL LETTER INSULAR D A77B ; Uppercase # L& LATIN CAPITAL LETTER INSULAR F A77D..A77E ; Uppercase # L& [2] LATIN CAPITAL LETTER INSULAR G..LATIN CAPITAL LETTER TURNED INSULAR G A780 ; Uppercase # L& LATIN CAPITAL LETTER TURNED L A782 ; Uppercase # L& LATIN CAPITAL LETTER INSULAR R A784 ; Uppercase # L& LATIN CAPITAL LETTER INSULAR S A786 ; Uppercase # L& LATIN CAPITAL LETTER INSULAR T A78B ; Uppercase # L& LATIN CAPITAL LETTER SALTILLO A78D ; Uppercase # L& LATIN CAPITAL LETTER TURNED H A790 ; Uppercase # L& LATIN CAPITAL LETTER N WITH DESCENDER A792 ; Uppercase # L& LATIN CAPITAL LETTER C WITH BAR A796 ; Uppercase # L& LATIN CAPITAL LETTER B WITH FLOURISH A798 ; Uppercase # L& LATIN CAPITAL LETTER F WITH STROKE A79A ; Uppercase # L& LATIN CAPITAL LETTER VOLAPUK AE A79C ; Uppercase # L& LATIN CAPITAL LETTER VOLAPUK OE A79E ; Uppercase # L& LATIN CAPITAL LETTER VOLAPUK UE A7A0 ; Uppercase # L& LATIN CAPITAL LETTER G WITH OBLIQUE STROKE A7A2 ; Uppercase # L& LATIN CAPITAL LETTER K WITH OBLIQUE STROKE A7A4 ; Uppercase # L& LATIN CAPITAL LETTER N WITH OBLIQUE STROKE A7A6 ; Uppercase # L& LATIN CAPITAL LETTER R WITH OBLIQUE STROKE A7A8 ; Uppercase # L& LATIN CAPITAL 
LETTER S WITH OBLIQUE STROKE A7AA..A7AE ; Uppercase # L& [5] LATIN CAPITAL LETTER H WITH HOOK..LATIN CAPITAL LETTER SMALL CAPITAL I A7B0..A7B4 ; Uppercase # L& [5] LATIN CAPITAL LETTER TURNED K..LATIN CAPITAL LETTER BETA A7B6 ; Uppercase # L& LATIN CAPITAL LETTER OMEGA A7B8 ; Uppercase # L& LATIN CAPITAL LETTER U WITH STROKE A7BA ; Uppercase # L& LATIN CAPITAL LETTER GLOTTAL A A7BC ; Uppercase # L& LATIN CAPITAL LETTER GLOTTAL I A7BE ; Uppercase # L& LATIN CAPITAL LETTER GLOTTAL U A7C0 ; Uppercase # L& LATIN CAPITAL LETTER OLD POLISH O A7C2 ; Uppercase # L& LATIN CAPITAL LETTER ANGLICANA W A7C4..A7C7 ; Uppercase # L& [4] LATIN CAPITAL LETTER C WITH PALATAL HOOK..LATIN CAPITAL LETTER D WITH SHORT STROKE OVERLAY A7C9 ; Uppercase # L& LATIN CAPITAL LETTER S WITH SHORT STROKE OVERLAY A7CB..A7CC ; Uppercase # L& [2] LATIN CAPITAL LETTER RAMS HORN..LATIN CAPITAL LETTER S WITH DIAGONAL STROKE A7CE ; Uppercase # L& LATIN CAPITAL LETTER PHARYNGEAL VOICED FRICATIVE A7D0 ; Uppercase # L& LATIN CAPITAL LETTER CLOSED INSULAR G A7D2 ; Uppercase # L& LATIN CAPITAL LETTER DOUBLE THORN A7D4 ; Uppercase # L& LATIN CAPITAL LETTER DOUBLE WYNN A7D6 ; Uppercase # L& LATIN CAPITAL LETTER MIDDLE SCOTS S A7D8 ; Uppercase # L& LATIN CAPITAL LETTER SIGMOID S A7DA ; Uppercase # L& LATIN CAPITAL LETTER LAMBDA A7DC ; Uppercase # L& LATIN CAPITAL LETTER LAMBDA WITH STROKE A7F5 ; Uppercase # L& LATIN CAPITAL LETTER REVERSED HALF H FF21..FF3A ; Uppercase # L& [26] FULLWIDTH LATIN CAPITAL LETTER A..FULLWIDTH LATIN CAPITAL LETTER Z 10400..10427 ; Uppercase # L& [40] DESERET CAPITAL LETTER LONG I..DESERET CAPITAL LETTER EW 104B0..104D3 ; Uppercase # L& [36] OSAGE CAPITAL LETTER A..OSAGE CAPITAL LETTER ZHA 10570..1057A ; Uppercase # L& [11] VITHKUQI CAPITAL LETTER A..VITHKUQI CAPITAL LETTER GA 1057C..1058A ; Uppercase # L& [15] VITHKUQI CAPITAL LETTER HA..VITHKUQI CAPITAL LETTER RE 1058C..10592 ; Uppercase # L& [7] VITHKUQI CAPITAL LETTER SE..VITHKUQI CAPITAL LETTER XE 10594..10595 ; Uppercase # L& 
[2] VITHKUQI CAPITAL LETTER Y..VITHKUQI CAPITAL LETTER ZE 10C80..10CB2 ; Uppercase # L& [51] OLD HUNGARIAN CAPITAL LETTER A..OLD HUNGARIAN CAPITAL LETTER US 10D50..10D65 ; Uppercase # L& [22] GARAY CAPITAL LETTER A..GARAY CAPITAL LETTER OLD NA 118A0..118BF ; Uppercase # L& [32] WARANG CITI CAPITAL LETTER NGAA..WARANG CITI CAPITAL LETTER VIYO 16E40..16E5F ; Uppercase # L& [32] MEDEFAIDRIN CAPITAL LETTER M..MEDEFAIDRIN CAPITAL LETTER Y 16EA0..16EB8 ; Uppercase # L& [25] BERIA ERFE CAPITAL LETTER ARKAB..BERIA ERFE CAPITAL LETTER AY 1D400..1D419 ; Uppercase # L& [26] MATHEMATICAL BOLD CAPITAL A..MATHEMATICAL BOLD CAPITAL Z 1D434..1D44D ; Uppercase # L& [26] MATHEMATICAL ITALIC CAPITAL A..MATHEMATICAL ITALIC CAPITAL Z 1D468..1D481 ; Uppercase # L& [26] MATHEMATICAL BOLD ITALIC CAPITAL A..MATHEMATICAL BOLD ITALIC CAPITAL Z 1D49C ; Uppercase # L& MATHEMATICAL SCRIPT CAPITAL A 1D49E..1D49F ; Uppercase # L& [2] MATHEMATICAL SCRIPT CAPITAL C..MATHEMATICAL SCRIPT CAPITAL D 1D4A2 ; Uppercase # L& MATHEMATICAL SCRIPT CAPITAL G 1D4A5..1D4A6 ; Uppercase # L& [2] MATHEMATICAL SCRIPT CAPITAL J..MATHEMATICAL SCRIPT CAPITAL K 1D4A9..1D4AC ; Uppercase # L& [4] MATHEMATICAL SCRIPT CAPITAL N..MATHEMATICAL SCRIPT CAPITAL Q 1D4AE..1D4B5 ; Uppercase # L& [8] MATHEMATICAL SCRIPT CAPITAL S..MATHEMATICAL SCRIPT CAPITAL Z 1D4D0..1D4E9 ; Uppercase # L& [26] MATHEMATICAL BOLD SCRIPT CAPITAL A..MATHEMATICAL BOLD SCRIPT CAPITAL Z 1D504..1D505 ; Uppercase # L& [2] MATHEMATICAL FRAKTUR CAPITAL A..MATHEMATICAL FRAKTUR CAPITAL B 1D507..1D50A ; Uppercase # L& [4] MATHEMATICAL FRAKTUR CAPITAL D..MATHEMATICAL FRAKTUR CAPITAL G 1D50D..1D514 ; Uppercase # L& [8] MATHEMATICAL FRAKTUR CAPITAL J..MATHEMATICAL FRAKTUR CAPITAL Q 1D516..1D51C ; Uppercase # L& [7] MATHEMATICAL FRAKTUR CAPITAL S..MATHEMATICAL FRAKTUR CAPITAL Y 1D538..1D539 ; Uppercase # L& [2] MATHEMATICAL DOUBLE-STRUCK CAPITAL A..MATHEMATICAL DOUBLE-STRUCK CAPITAL B 1D53B..1D53E ; Uppercase # L& [4] MATHEMATICAL DOUBLE-STRUCK CAPITAL 
D..MATHEMATICAL DOUBLE-STRUCK CAPITAL G 1D540..1D544 ; Uppercase # L& [5] MATHEMATICAL DOUBLE-STRUCK CAPITAL I..MATHEMATICAL DOUBLE-STRUCK CAPITAL M 1D546 ; Uppercase # L& MATHEMATICAL DOUBLE-STRUCK CAPITAL O 1D54A..1D550 ; Uppercase # L& [7] MATHEMATICAL DOUBLE-STRUCK CAPITAL S..MATHEMATICAL DOUBLE-STRUCK CAPITAL Y 1D56C..1D585 ; Uppercase # L& [26] MATHEMATICAL BOLD FRAKTUR CAPITAL A..MATHEMATICAL BOLD FRAKTUR CAPITAL Z 1D5A0..1D5B9 ; Uppercase # L& [26] MATHEMATICAL SANS-SERIF CAPITAL A..MATHEMATICAL SANS-SERIF CAPITAL Z 1D5D4..1D5ED ; Uppercase # L& [26] MATHEMATICAL SANS-SERIF BOLD CAPITAL A..MATHEMATICAL SANS-SERIF BOLD CAPITAL Z 1D608..1D621 ; Uppercase # L& [26] MATHEMATICAL SANS-SERIF ITALIC CAPITAL A..MATHEMATICAL SANS-SERIF ITALIC CAPITAL Z 1D63C..1D655 ; Uppercase # L& [26] MATHEMATICAL SANS-SERIF BOLD ITALIC CAPITAL A..MATHEMATICAL SANS-SERIF BOLD ITALIC CAPITAL Z 1D670..1D689 ; Uppercase # L& [26] MATHEMATICAL MONOSPACE CAPITAL A..MATHEMATICAL MONOSPACE CAPITAL Z 1D6A8..1D6C0 ; Uppercase # L& [25] MATHEMATICAL BOLD CAPITAL ALPHA..MATHEMATICAL BOLD CAPITAL OMEGA 1D6E2..1D6FA ; Uppercase # L& [25] MATHEMATICAL ITALIC CAPITAL ALPHA..MATHEMATICAL ITALIC CAPITAL OMEGA 1D71C..1D734 ; Uppercase # L& [25] MATHEMATICAL BOLD ITALIC CAPITAL ALPHA..MATHEMATICAL BOLD ITALIC CAPITAL OMEGA 1D756..1D76E ; Uppercase # L& [25] MATHEMATICAL SANS-SERIF BOLD CAPITAL ALPHA..MATHEMATICAL SANS-SERIF BOLD CAPITAL OMEGA 1D790..1D7A8 ; Uppercase # L& [25] MATHEMATICAL SANS-SERIF BOLD ITALIC CAPITAL ALPHA..MATHEMATICAL SANS-SERIF BOLD ITALIC CAPITAL OMEGA 1D7CA ; Uppercase # L& MATHEMATICAL BOLD CAPITAL DIGAMMA 1E900..1E921 ; Uppercase # L& [34] ADLAM CAPITAL LETTER ALIF..ADLAM CAPITAL LETTER SHA 1F130..1F149 ; Uppercase # So [26] SQUARED LATIN CAPITAL LETTER A..SQUARED LATIN CAPITAL LETTER Z 1F150..1F169 ; Uppercase # So [26] NEGATIVE CIRCLED LATIN CAPITAL LETTER A..NEGATIVE CIRCLED LATIN CAPITAL LETTER Z 1F170..1F189 ; Uppercase # So [26] NEGATIVE SQUARED LATIN CAPITAL LETTER 
A..NEGATIVE SQUARED LATIN CAPITAL LETTER Z # Total code points: 2006 # ================================================ # Derived Property: Cased (Cased) # As defined by Unicode Standard Definition D135 # C has the Lowercase or Uppercase property or has a General_Category value of Titlecase_Letter. 0041..005A ; Cased # L& [26] LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z 0061..007A ; Cased # L& [26] LATIN SMALL LETTER A..LATIN SMALL LETTER Z 00AA ; Cased # Lo FEMININE ORDINAL INDICATOR 00B5 ; Cased # L& MICRO SIGN 00BA ; Cased # Lo MASCULINE ORDINAL INDICATOR 00C0..00D6 ; Cased # L& [23] LATIN CAPITAL LETTER A WITH GRAVE..LATIN CAPITAL LETTER O WITH DIAERESIS 00D8..00F6 ; Cased # L& [31] LATIN CAPITAL LETTER O WITH STROKE..LATIN SMALL LETTER O WITH DIAERESIS 00F8..01BA ; Cased # L& [195] LATIN SMALL LETTER O WITH STROKE..LATIN SMALL LETTER EZH WITH TAIL 01BC..01BF ; Cased # L& [4] LATIN CAPITAL LETTER TONE FIVE..LATIN LETTER WYNN 01C4..0293 ; Cased # L& [208] LATIN CAPITAL LETTER DZ WITH CARON..LATIN SMALL LETTER EZH WITH CURL 0296..02AF ; Cased # L& [26] LATIN LETTER INVERTED GLOTTAL STOP..LATIN SMALL LETTER TURNED H WITH FISHHOOK AND TAIL 02B0..02B8 ; Cased # Lm [9] MODIFIER LETTER SMALL H..MODIFIER LETTER SMALL Y 02C0..02C1 ; Cased # Lm [2] MODIFIER LETTER GLOTTAL STOP..MODIFIER LETTER REVERSED GLOTTAL STOP 02E0..02E4 ; Cased # Lm [5] MODIFIER LETTER SMALL GAMMA..MODIFIER LETTER SMALL REVERSED GLOTTAL STOP 0345 ; Cased # Mn COMBINING GREEK YPOGEGRAMMENI 0370..0373 ; Cased # L& [4] GREEK CAPITAL LETTER HETA..GREEK SMALL LETTER ARCHAIC SAMPI 0376..0377 ; Cased # L& [2] GREEK CAPITAL LETTER PAMPHYLIAN DIGAMMA..GREEK SMALL LETTER PAMPHYLIAN DIGAMMA 037A ; Cased # Lm GREEK YPOGEGRAMMENI 037B..037D ; Cased # L& [3] GREEK SMALL REVERSED LUNATE SIGMA SYMBOL..GREEK SMALL REVERSED DOTTED LUNATE SIGMA SYMBOL 037F ; Cased # L& GREEK CAPITAL LETTER YOT 0386 ; Cased # L& GREEK CAPITAL LETTER ALPHA WITH TONOS 0388..038A ; Cased # L& [3] GREEK CAPITAL LETTER EPSILON WITH 
TONOS..GREEK CAPITAL LETTER IOTA WITH TONOS 038C ; Cased # L& GREEK CAPITAL LETTER OMICRON WITH TONOS 038E..03A1 ; Cased # L& [20] GREEK CAPITAL LETTER UPSILON WITH TONOS..GREEK CAPITAL LETTER RHO 03A3..03F5 ; Cased # L& [83] GREEK CAPITAL LETTER SIGMA..GREEK LUNATE EPSILON SYMBOL 03F7..0481 ; Cased # L& [139] GREEK CAPITAL LETTER SHO..CYRILLIC SMALL LETTER KOPPA 048A..052F ; Cased # L& [166] CYRILLIC CAPITAL LETTER SHORT I WITH TAIL..CYRILLIC SMALL LETTER EL WITH DESCENDER 0531..0556 ; Cased # L& [38] ARMENIAN CAPITAL LETTER AYB..ARMENIAN CAPITAL LETTER FEH 0560..0588 ; Cased # L& [41] ARMENIAN SMALL LETTER TURNED AYB..ARMENIAN SMALL LETTER YI WITH STROKE 10A0..10C5 ; Cased # L& [38] GEORGIAN CAPITAL LETTER AN..GEORGIAN CAPITAL LETTER HOE 10C7 ; Cased # L& GEORGIAN CAPITAL LETTER YN 10CD ; Cased # L& GEORGIAN CAPITAL LETTER AEN 10D0..10FA ; Cased # L& [43] GEORGIAN LETTER AN..GEORGIAN LETTER AIN 10FC ; Cased # Lm MODIFIER LETTER GEORGIAN NAR 10FD..10FF ; Cased # L& [3] GEORGIAN LETTER AEN..GEORGIAN LETTER LABIAL SIGN 13A0..13F5 ; Cased # L& [86] CHEROKEE LETTER A..CHEROKEE LETTER MV 13F8..13FD ; Cased # L& [6] CHEROKEE SMALL LETTER YE..CHEROKEE SMALL LETTER MV 1C80..1C8A ; Cased # L& [11] CYRILLIC SMALL LETTER ROUNDED VE..CYRILLIC SMALL LETTER TJE 1C90..1CBA ; Cased # L& [43] GEORGIAN MTAVRULI CAPITAL LETTER AN..GEORGIAN MTAVRULI CAPITAL LETTER AIN 1CBD..1CBF ; Cased # L& [3] GEORGIAN MTAVRULI CAPITAL LETTER AEN..GEORGIAN MTAVRULI CAPITAL LETTER LABIAL SIGN 1D00..1D2B ; Cased # L& [44] LATIN LETTER SMALL CAPITAL A..CYRILLIC LETTER SMALL CAPITAL EL 1D2C..1D6A ; Cased # Lm [63] MODIFIER LETTER CAPITAL A..GREEK SUBSCRIPT SMALL LETTER CHI 1D6B..1D77 ; Cased # L& [13] LATIN SMALL LETTER UE..LATIN SMALL LETTER TURNED G 1D78 ; Cased # Lm MODIFIER LETTER CYRILLIC EN 1D79..1D9A ; Cased # L& [34] LATIN SMALL LETTER INSULAR G..LATIN SMALL LETTER EZH WITH RETROFLEX HOOK 1D9B..1DBF ; Cased # Lm [37] MODIFIER LETTER SMALL TURNED ALPHA..MODIFIER LETTER SMALL THETA 1E00..1F15 ; 
Cased # L& [278] LATIN CAPITAL LETTER A WITH RING BELOW..GREEK SMALL LETTER EPSILON WITH DASIA AND OXIA 1F18..1F1D ; Cased # L& [6] GREEK CAPITAL LETTER EPSILON WITH PSILI..GREEK CAPITAL LETTER EPSILON WITH DASIA AND OXIA 1F20..1F45 ; Cased # L& [38] GREEK SMALL LETTER ETA WITH PSILI..GREEK SMALL LETTER OMICRON WITH DASIA AND OXIA 1F48..1F4D ; Cased # L& [6] GREEK CAPITAL LETTER OMICRON WITH PSILI..GREEK CAPITAL LETTER OMICRON WITH DASIA AND OXIA 1F50..1F57 ; Cased # L& [8] GREEK SMALL LETTER UPSILON WITH PSILI..GREEK SMALL LETTER UPSILON WITH DASIA AND PERISPOMENI 1F59 ; Cased # L& GREEK CAPITAL LETTER UPSILON WITH DASIA 1F5B ; Cased # L& GREEK CAPITAL LETTER UPSILON WITH DASIA AND VARIA 1F5D ; Cased # L& GREEK CAPITAL LETTER UPSILON WITH DASIA AND OXIA 1F5F..1F7D ; Cased # L& [31] GREEK CAPITAL LETTER UPSILON WITH DASIA AND PERISPOMENI..GREEK SMALL LETTER OMEGA WITH OXIA 1F80..1FB4 ; Cased # L& [53] GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI..GREEK SMALL LETTER ALPHA WITH OXIA AND YPOGEGRAMMENI 1FB6..1FBC ; Cased # L& [7] GREEK SMALL LETTER ALPHA WITH PERISPOMENI..GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI 1FBE ; Cased # L& GREEK PROSGEGRAMMENI 1FC2..1FC4 ; Cased # L& [3] GREEK SMALL LETTER ETA WITH VARIA AND YPOGEGRAMMENI..GREEK SMALL LETTER ETA WITH OXIA AND YPOGEGRAMMENI 1FC6..1FCC ; Cased # L& [7] GREEK SMALL LETTER ETA WITH PERISPOMENI..GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI 1FD0..1FD3 ; Cased # L& [4] GREEK SMALL LETTER IOTA WITH VRACHY..GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA 1FD6..1FDB ; Cased # L& [6] GREEK SMALL LETTER IOTA WITH PERISPOMENI..GREEK CAPITAL LETTER IOTA WITH OXIA 1FE0..1FEC ; Cased # L& [13] GREEK SMALL LETTER UPSILON WITH VRACHY..GREEK CAPITAL LETTER RHO WITH DASIA 1FF2..1FF4 ; Cased # L& [3] GREEK SMALL LETTER OMEGA WITH VARIA AND YPOGEGRAMMENI..GREEK SMALL LETTER OMEGA WITH OXIA AND YPOGEGRAMMENI 1FF6..1FFC ; Cased # L& [7] GREEK SMALL LETTER OMEGA WITH PERISPOMENI..GREEK CAPITAL LETTER OMEGA WITH 
PROSGEGRAMMENI 2071 ; Cased # Lm SUPERSCRIPT LATIN SMALL LETTER I 207F ; Cased # Lm SUPERSCRIPT LATIN SMALL LETTER N 2090..209C ; Cased # Lm [13] LATIN SUBSCRIPT SMALL LETTER A..LATIN SUBSCRIPT SMALL LETTER T 2102 ; Cased # L& DOUBLE-STRUCK CAPITAL C 2107 ; Cased # L& EULER CONSTANT 210A..2113 ; Cased # L& [10] SCRIPT SMALL G..SCRIPT SMALL L 2115 ; Cased # L& DOUBLE-STRUCK CAPITAL N 2119..211D ; Cased # L& [5] DOUBLE-STRUCK CAPITAL P..DOUBLE-STRUCK CAPITAL R 2124 ; Cased # L& DOUBLE-STRUCK CAPITAL Z 2126 ; Cased # L& OHM SIGN 2128 ; Cased # L& BLACK-LETTER CAPITAL Z 212A..212D ; Cased # L& [4] KELVIN SIGN..BLACK-LETTER CAPITAL C 212F..2134 ; Cased # L& [6] SCRIPT SMALL E..SCRIPT SMALL O 2139 ; Cased # L& INFORMATION SOURCE 213C..213F ; Cased # L& [4] DOUBLE-STRUCK SMALL PI..DOUBLE-STRUCK CAPITAL PI 2145..2149 ; Cased # L& [5] DOUBLE-STRUCK ITALIC CAPITAL D..DOUBLE-STRUCK ITALIC SMALL J 214E ; Cased # L& TURNED SMALL F 2160..217F ; Cased # Nl [32] ROMAN NUMERAL ONE..SMALL ROMAN NUMERAL ONE THOUSAND 2183..2184 ; Cased # L& [2] ROMAN NUMERAL REVERSED ONE HUNDRED..LATIN SMALL LETTER REVERSED C 24B6..24E9 ; Cased # So [52] CIRCLED LATIN CAPITAL LETTER A..CIRCLED LATIN SMALL LETTER Z 2C00..2C7B ; Cased # L& [124] GLAGOLITIC CAPITAL LETTER AZU..LATIN LETTER SMALL CAPITAL TURNED E 2C7C..2C7D ; Cased # Lm [2] LATIN SUBSCRIPT SMALL LETTER J..MODIFIER LETTER CAPITAL V 2C7E..2CE4 ; Cased # L& [103] LATIN CAPITAL LETTER S WITH SWASH TAIL..COPTIC SYMBOL KAI 2CEB..2CEE ; Cased # L& [4] COPTIC CAPITAL LETTER CRYPTOGRAMMIC SHEI..COPTIC SMALL LETTER CRYPTOGRAMMIC GANGIA 2CF2..2CF3 ; Cased # L& [2] COPTIC CAPITAL LETTER BOHAIRIC KHEI..COPTIC SMALL LETTER BOHAIRIC KHEI 2D00..2D25 ; Cased # L& [38] GEORGIAN SMALL LETTER AN..GEORGIAN SMALL LETTER HOE 2D27 ; Cased # L& GEORGIAN SMALL LETTER YN 2D2D ; Cased # L& GEORGIAN SMALL LETTER AEN A640..A66D ; Cased # L& [46] CYRILLIC CAPITAL LETTER ZEMLYA..CYRILLIC SMALL LETTER DOUBLE MONOCULAR O A680..A69B ; Cased # L& [28] CYRILLIC CAPITAL 
LETTER DWE..CYRILLIC SMALL LETTER CROSSED O A69C..A69D ; Cased # Lm [2] MODIFIER LETTER CYRILLIC HARD SIGN..MODIFIER LETTER CYRILLIC SOFT SIGN A722..A76F ; Cased # L& [78] LATIN CAPITAL LETTER EGYPTOLOGICAL ALEF..LATIN SMALL LETTER CON A770 ; Cased # Lm MODIFIER LETTER US A771..A787 ; Cased # L& [23] LATIN SMALL LETTER DUM..LATIN SMALL LETTER INSULAR T A78B..A78E ; Cased # L& [4] LATIN CAPITAL LETTER SALTILLO..LATIN SMALL LETTER L WITH RETROFLEX HOOK AND BELT A790..A7DC ; Cased # L& [77] LATIN CAPITAL LETTER N WITH DESCENDER..LATIN CAPITAL LETTER LAMBDA WITH STROKE A7F1..A7F4 ; Cased # Lm [4] MODIFIER LETTER CAPITAL S..MODIFIER LETTER CAPITAL Q A7F5..A7F6 ; Cased # L& [2] LATIN CAPITAL LETTER REVERSED HALF H..LATIN SMALL LETTER REVERSED HALF H A7F8..A7F9 ; Cased # Lm [2] MODIFIER LETTER CAPITAL H WITH STROKE..MODIFIER LETTER SMALL LIGATURE OE A7FA ; Cased # L& LATIN LETTER SMALL CAPITAL TURNED M AB30..AB5A ; Cased # L& [43] LATIN SMALL LETTER BARRED ALPHA..LATIN SMALL LETTER Y WITH SHORT RIGHT LEG AB5C..AB5F ; Cased # Lm [4] MODIFIER LETTER SMALL HENG..MODIFIER LETTER SMALL U WITH LEFT HOOK AB60..AB68 ; Cased # L& [9] LATIN SMALL LETTER SAKHA YAT..LATIN SMALL LETTER TURNED R WITH MIDDLE TILDE AB69 ; Cased # Lm MODIFIER LETTER SMALL TURNED W AB70..ABBF ; Cased # L& [80] CHEROKEE SMALL LETTER A..CHEROKEE SMALL LETTER YA FB00..FB06 ; Cased # L& [7] LATIN SMALL LIGATURE FF..LATIN SMALL LIGATURE ST FB13..FB17 ; Cased # L& [5] ARMENIAN SMALL LIGATURE MEN NOW..ARMENIAN SMALL LIGATURE MEN XEH FF21..FF3A ; Cased # L& [26] FULLWIDTH LATIN CAPITAL LETTER A..FULLWIDTH LATIN CAPITAL LETTER Z FF41..FF5A ; Cased # L& [26] FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH LATIN SMALL LETTER Z 10400..1044F ; Cased # L& [80] DESERET CAPITAL LETTER LONG I..DESERET SMALL LETTER EW 104B0..104D3 ; Cased # L& [36] OSAGE CAPITAL LETTER A..OSAGE CAPITAL LETTER ZHA 104D8..104FB ; Cased # L& [36] OSAGE SMALL LETTER A..OSAGE SMALL LETTER ZHA 10570..1057A ; Cased # L& [11] VITHKUQI CAPITAL LETTER 
A..VITHKUQI CAPITAL LETTER GA 1057C..1058A ; Cased # L& [15] VITHKUQI CAPITAL LETTER HA..VITHKUQI CAPITAL LETTER RE 1058C..10592 ; Cased # L& [7] VITHKUQI CAPITAL LETTER SE..VITHKUQI CAPITAL LETTER XE 10594..10595 ; Cased # L& [2] VITHKUQI CAPITAL LETTER Y..VITHKUQI CAPITAL LETTER ZE 10597..105A1 ; Cased # L& [11] VITHKUQI SMALL LETTER A..VITHKUQI SMALL LETTER GA 105A3..105B1 ; Cased # L& [15] VITHKUQI SMALL LETTER HA..VITHKUQI SMALL LETTER RE 105B3..105B9 ; Cased # L& [7] VITHKUQI SMALL LETTER SE..VITHKUQI SMALL LETTER XE 105BB..105BC ; Cased # L& [2] VITHKUQI SMALL LETTER Y..VITHKUQI SMALL LETTER ZE 10780 ; Cased # Lm MODIFIER LETTER SMALL CAPITAL AA 10783..10785 ; Cased # Lm [3] MODIFIER LETTER SMALL AE..MODIFIER LETTER SMALL B WITH HOOK 10787..107B0 ; Cased # Lm [42] MODIFIER LETTER SMALL DZ DIGRAPH..MODIFIER LETTER SMALL V WITH RIGHT HOOK 107B2..107BA ; Cased # Lm [9] MODIFIER LETTER SMALL CAPITAL Y..MODIFIER LETTER SMALL S WITH CURL 10C80..10CB2 ; Cased # L& [51] OLD HUNGARIAN CAPITAL LETTER A..OLD HUNGARIAN CAPITAL LETTER US 10CC0..10CF2 ; Cased # L& [51] OLD HUNGARIAN SMALL LETTER A..OLD HUNGARIAN SMALL LETTER US 10D50..10D65 ; Cased # L& [22] GARAY CAPITAL LETTER A..GARAY CAPITAL LETTER OLD NA 10D70..10D85 ; Cased # L& [22] GARAY SMALL LETTER A..GARAY SMALL LETTER OLD NA 118A0..118DF ; Cased # L& [64] WARANG CITI CAPITAL LETTER NGAA..WARANG CITI SMALL LETTER VIYO 16E40..16E7F ; Cased # L& [64] MEDEFAIDRIN CAPITAL LETTER M..MEDEFAIDRIN SMALL LETTER Y 16EA0..16EB8 ; Cased # L& [25] BERIA ERFE CAPITAL LETTER ARKAB..BERIA ERFE CAPITAL LETTER AY 16EBB..16ED3 ; Cased # L& [25] BERIA ERFE SMALL LETTER ARKAB..BERIA ERFE SMALL LETTER AY 1D400..1D454 ; Cased # L& [85] MATHEMATICAL BOLD CAPITAL A..MATHEMATICAL ITALIC SMALL G 1D456..1D49C ; Cased # L& [71] MATHEMATICAL ITALIC SMALL I..MATHEMATICAL SCRIPT CAPITAL A 1D49E..1D49F ; Cased # L& [2] MATHEMATICAL SCRIPT CAPITAL C..MATHEMATICAL SCRIPT CAPITAL D 1D4A2 ; Cased # L& MATHEMATICAL SCRIPT CAPITAL G 1D4A5..1D4A6 ; 
Cased # L& [2] MATHEMATICAL SCRIPT CAPITAL J..MATHEMATICAL SCRIPT CAPITAL K 1D4A9..1D4AC ; Cased # L& [4] MATHEMATICAL SCRIPT CAPITAL N..MATHEMATICAL SCRIPT CAPITAL Q 1D4AE..1D4B9 ; Cased # L& [12] MATHEMATICAL SCRIPT CAPITAL S..MATHEMATICAL SCRIPT SMALL D 1D4BB ; Cased # L& MATHEMATICAL SCRIPT SMALL F 1D4BD..1D4C3 ; Cased # L& [7] MATHEMATICAL SCRIPT SMALL H..MATHEMATICAL SCRIPT SMALL N 1D4C5..1D505 ; Cased # L& [65] MATHEMATICAL SCRIPT SMALL P..MATHEMATICAL FRAKTUR CAPITAL B 1D507..1D50A ; Cased # L& [4] MATHEMATICAL FRAKTUR CAPITAL D..MATHEMATICAL FRAKTUR CAPITAL G 1D50D..1D514 ; Cased # L& [8] MATHEMATICAL FRAKTUR CAPITAL J..MATHEMATICAL FRAKTUR CAPITAL Q 1D516..1D51C ; Cased # L& [7] MATHEMATICAL FRAKTUR CAPITAL S..MATHEMATICAL FRAKTUR CAPITAL Y 1D51E..1D539 ; Cased # L& [28] MATHEMATICAL FRAKTUR SMALL A..MATHEMATICAL DOUBLE-STRUCK CAPITAL B 1D53B..1D53E ; Cased # L& [4] MATHEMATICAL DOUBLE-STRUCK CAPITAL D..MATHEMATICAL DOUBLE-STRUCK CAPITAL G 1D540..1D544 ; Cased # L& [5] MATHEMATICAL DOUBLE-STRUCK CAPITAL I..MATHEMATICAL DOUBLE-STRUCK CAPITAL M 1D546 ; Cased # L& MATHEMATICAL DOUBLE-STRUCK CAPITAL O 1D54A..1D550 ; Cased # L& [7] MATHEMATICAL DOUBLE-STRUCK CAPITAL S..MATHEMATICAL DOUBLE-STRUCK CAPITAL Y 1D552..1D6A5 ; Cased # L& [340] MATHEMATICAL DOUBLE-STRUCK SMALL A..MATHEMATICAL ITALIC SMALL DOTLESS J 1D6A8..1D6C0 ; Cased # L& [25] MATHEMATICAL BOLD CAPITAL ALPHA..MATHEMATICAL BOLD CAPITAL OMEGA 1D6C2..1D6DA ; Cased # L& [25] MATHEMATICAL BOLD SMALL ALPHA..MATHEMATICAL BOLD SMALL OMEGA 1D6DC..1D6FA ; Cased # L& [31] MATHEMATICAL BOLD EPSILON SYMBOL..MATHEMATICAL ITALIC CAPITAL OMEGA 1D6FC..1D714 ; Cased # L& [25] MATHEMATICAL ITALIC SMALL ALPHA..MATHEMATICAL ITALIC SMALL OMEGA 1D716..1D734 ; Cased # L& [31] MATHEMATICAL ITALIC EPSILON SYMBOL..MATHEMATICAL BOLD ITALIC CAPITAL OMEGA 1D736..1D74E ; Cased # L& [25] MATHEMATICAL BOLD ITALIC SMALL ALPHA..MATHEMATICAL BOLD ITALIC SMALL OMEGA 1D750..1D76E ; Cased # L& [31] MATHEMATICAL BOLD ITALIC EPSILON 
SYMBOL..MATHEMATICAL SANS-SERIF BOLD CAPITAL OMEGA 1D770..1D788 ; Cased # L& [25] MATHEMATICAL SANS-SERIF BOLD SMALL ALPHA..MATHEMATICAL SANS-SERIF BOLD SMALL OMEGA 1D78A..1D7A8 ; Cased # L& [31] MATHEMATICAL SANS-SERIF BOLD EPSILON SYMBOL..MATHEMATICAL SANS-SERIF BOLD ITALIC CAPITAL OMEGA 1D7AA..1D7C2 ; Cased # L& [25] MATHEMATICAL SANS-SERIF BOLD ITALIC SMALL ALPHA..MATHEMATICAL SANS-SERIF BOLD ITALIC SMALL OMEGA 1D7C4..1D7CB ; Cased # L& [8] MATHEMATICAL SANS-SERIF BOLD ITALIC EPSILON SYMBOL..MATHEMATICAL BOLD SMALL DIGAMMA 1DF00..1DF09 ; Cased # L& [10] LATIN SMALL LETTER FENG DIGRAPH WITH TRILL..LATIN SMALL LETTER T WITH HOOK AND RETROFLEX HOOK 1DF0B..1DF1E ; Cased # L& [20] LATIN SMALL LETTER ESH WITH DOUBLE BAR..LATIN SMALL LETTER S WITH CURL 1DF25..1DF2A ; Cased # L& [6] LATIN SMALL LETTER D WITH MID-HEIGHT LEFT HOOK..LATIN SMALL LETTER T WITH MID-HEIGHT LEFT HOOK 1E030..1E06D ; Cased # Lm [62] MODIFIER LETTER CYRILLIC SMALL A..MODIFIER LETTER CYRILLIC SMALL STRAIGHT U WITH STROKE 1E900..1E943 ; Cased # L& [68] ADLAM CAPITAL LETTER ALIF..ADLAM SMALL LETTER SHA 1F130..1F149 ; Cased # So [26] SQUARED LATIN CAPITAL LETTER A..SQUARED LATIN CAPITAL LETTER Z 1F150..1F169 ; Cased # So [26] NEGATIVE CIRCLED LATIN CAPITAL LETTER A..NEGATIVE CIRCLED LATIN CAPITAL LETTER Z 1F170..1F189 ; Cased # So [26] NEGATIVE SQUARED LATIN CAPITAL LETTER A..NEGATIVE SQUARED LATIN CAPITAL LETTER Z # Total code points: 4632 # ================================================ # Derived Property: Case_Ignorable (CI) # As defined by Unicode Standard Definition D136 # C is defined to be case-ignorable if # Word_Break(C) = MidLetter or MidNumLet or Single_Quote, or # General_Category(C) = Nonspacing_Mark (Mn), Enclosing_Mark (Me), Format (Cf), Modifier_Letter (Lm), or Modifier_Symbol (Sk). 
0027 ; Case_Ignorable # Po APOSTROPHE 002E ; Case_Ignorable # Po FULL STOP 003A ; Case_Ignorable # Po COLON 005E ; Case_Ignorable # Sk CIRCUMFLEX ACCENT 0060 ; Case_Ignorable # Sk GRAVE ACCENT 00A8 ; Case_Ignorable # Sk DIAERESIS 00AD ; Case_Ignorable # Cf SOFT HYPHEN 00AF ; Case_Ignorable # Sk MACRON 00B4 ; Case_Ignorable # Sk ACUTE ACCENT 00B7 ; Case_Ignorable # Po MIDDLE DOT 00B8 ; Case_Ignorable # Sk CEDILLA 02B0..02C1 ; Case_Ignorable # Lm [18] MODIFIER LETTER SMALL H..MODIFIER LETTER REVERSED GLOTTAL STOP 02C2..02C5 ; Case_Ignorable # Sk [4] MODIFIER LETTER LEFT ARROWHEAD..MODIFIER LETTER DOWN ARROWHEAD 02C6..02D1 ; Case_Ignorable # Lm [12] MODIFIER LETTER CIRCUMFLEX ACCENT..MODIFIER LETTER HALF TRIANGULAR COLON 02D2..02DF ; Case_Ignorable # Sk [14] MODIFIER LETTER CENTRED RIGHT HALF RING..MODIFIER LETTER CROSS ACCENT 02E0..02E4 ; Case_Ignorable # Lm [5] MODIFIER LETTER SMALL GAMMA..MODIFIER LETTER SMALL REVERSED GLOTTAL STOP 02E5..02EB ; Case_Ignorable # Sk [7] MODIFIER LETTER EXTRA-HIGH TONE BAR..MODIFIER LETTER YANG DEPARTING TONE MARK 02EC ; Case_Ignorable # Lm MODIFIER LETTER VOICING 02ED ; Case_Ignorable # Sk MODIFIER LETTER UNASPIRATED 02EE ; Case_Ignorable # Lm MODIFIER LETTER DOUBLE APOSTROPHE 02EF..02FF ; Case_Ignorable # Sk [17] MODIFIER LETTER LOW DOWN ARROWHEAD..MODIFIER LETTER LOW LEFT ARROW 0300..036F ; Case_Ignorable # Mn [112] COMBINING GRAVE ACCENT..COMBINING LATIN SMALL LETTER X 0374 ; Case_Ignorable # Lm GREEK NUMERAL SIGN 0375 ; Case_Ignorable # Sk GREEK LOWER NUMERAL SIGN 037A ; Case_Ignorable # Lm GREEK YPOGEGRAMMENI 0384..0385 ; Case_Ignorable # Sk [2] GREEK TONOS..GREEK DIALYTIKA TONOS 0387 ; Case_Ignorable # Po GREEK ANO TELEIA 0483..0487 ; Case_Ignorable # Mn [5] COMBINING CYRILLIC TITLO..COMBINING CYRILLIC POKRYTIE 0488..0489 ; Case_Ignorable # Me [2] COMBINING CYRILLIC HUNDRED THOUSANDS SIGN..COMBINING CYRILLIC MILLIONS SIGN 0559 ; Case_Ignorable # Lm ARMENIAN MODIFIER LETTER LEFT HALF RING 055F ; Case_Ignorable # Po ARMENIAN 
ABBREVIATION MARK 0591..05BD ; Case_Ignorable # Mn [45] HEBREW ACCENT ETNAHTA..HEBREW POINT METEG 05BF ; Case_Ignorable # Mn HEBREW POINT RAFE 05C1..05C2 ; Case_Ignorable # Mn [2] HEBREW POINT SHIN DOT..HEBREW POINT SIN DOT 05C4..05C5 ; Case_Ignorable # Mn [2] HEBREW MARK UPPER DOT..HEBREW MARK LOWER DOT 05C7 ; Case_Ignorable # Mn HEBREW POINT QAMATS QATAN 05F4 ; Case_Ignorable # Po HEBREW PUNCTUATION GERSHAYIM 0600..0605 ; Case_Ignorable # Cf [6] ARABIC NUMBER SIGN..ARABIC NUMBER MARK ABOVE 0610..061A ; Case_Ignorable # Mn [11] ARABIC SIGN SALLALLAHOU ALAYHE WASSALLAM..ARABIC SMALL KASRA 061C ; Case_Ignorable # Cf ARABIC LETTER MARK 0640 ; Case_Ignorable # Lm ARABIC TATWEEL 064B..065F ; Case_Ignorable # Mn [21] ARABIC FATHATAN..ARABIC WAVY HAMZA BELOW 0670 ; Case_Ignorable # Mn ARABIC LETTER SUPERSCRIPT ALEF 06D6..06DC ; Case_Ignorable # Mn [7] ARABIC SMALL HIGH LIGATURE SAD WITH LAM WITH ALEF MAKSURA..ARABIC SMALL HIGH SEEN 06DD ; Case_Ignorable # Cf ARABIC END OF AYAH 06DF..06E4 ; Case_Ignorable # Mn [6] ARABIC SMALL HIGH ROUNDED ZERO..ARABIC SMALL HIGH MADDA 06E5..06E6 ; Case_Ignorable # Lm [2] ARABIC SMALL WAW..ARABIC SMALL YEH 06E7..06E8 ; Case_Ignorable # Mn [2] ARABIC SMALL HIGH YEH..ARABIC SMALL HIGH NOON 06EA..06ED ; Case_Ignorable # Mn [4] ARABIC EMPTY CENTRE LOW STOP..ARABIC SMALL LOW MEEM 070F ; Case_Ignorable # Cf SYRIAC ABBREVIATION MARK 0711 ; Case_Ignorable # Mn SYRIAC LETTER SUPERSCRIPT ALAPH 0730..074A ; Case_Ignorable # Mn [27] SYRIAC PTHAHA ABOVE..SYRIAC BARREKH 07A6..07B0 ; Case_Ignorable # Mn [11] THAANA ABAFILI..THAANA SUKUN 07EB..07F3 ; Case_Ignorable # Mn [9] NKO COMBINING SHORT HIGH TONE..NKO COMBINING DOUBLE DOT ABOVE 07F4..07F5 ; Case_Ignorable # Lm [2] NKO HIGH TONE APOSTROPHE..NKO LOW TONE APOSTROPHE 07FA ; Case_Ignorable # Lm NKO LAJANYALAN 07FD ; Case_Ignorable # Mn NKO DANTAYALAN 0816..0819 ; Case_Ignorable # Mn [4] SAMARITAN MARK IN..SAMARITAN MARK DAGESH 081A ; Case_Ignorable # Lm SAMARITAN MODIFIER LETTER EPENTHETIC YUT 
081B..0823 ; Case_Ignorable # Mn [9] SAMARITAN MARK EPENTHETIC YUT..SAMARITAN VOWEL SIGN A 0824 ; Case_Ignorable # Lm SAMARITAN MODIFIER LETTER SHORT A 0825..0827 ; Case_Ignorable # Mn [3] SAMARITAN VOWEL SIGN SHORT A..SAMARITAN VOWEL SIGN U 0828 ; Case_Ignorable # Lm SAMARITAN MODIFIER LETTER I 0829..082D ; Case_Ignorable # Mn [5] SAMARITAN VOWEL SIGN LONG I..SAMARITAN MARK NEQUDAA 0859..085B ; Case_Ignorable # Mn [3] MANDAIC AFFRICATION MARK..MANDAIC GEMINATION MARK 0888 ; Case_Ignorable # Sk ARABIC RAISED ROUND DOT 0890..0891 ; Case_Ignorable # Cf [2] ARABIC POUND MARK ABOVE..ARABIC PIASTRE MARK ABOVE 0897..089F ; Case_Ignorable # Mn [9] ARABIC PEPET..ARABIC HALF MADDA OVER MADDA 08C9 ; Case_Ignorable # Lm ARABIC SMALL FARSI YEH 08CA..08E1 ; Case_Ignorable # Mn [24] ARABIC SMALL HIGH FARSI YEH..ARABIC SMALL HIGH SIGN SAFHA 08E2 ; Case_Ignorable # Cf ARABIC DISPUTED END OF AYAH 08E3..0902 ; Case_Ignorable # Mn [32] ARABIC TURNED DAMMA BELOW..DEVANAGARI SIGN ANUSVARA 093A ; Case_Ignorable # Mn DEVANAGARI VOWEL SIGN OE 093C ; Case_Ignorable # Mn DEVANAGARI SIGN NUKTA 0941..0948 ; Case_Ignorable # Mn [8] DEVANAGARI VOWEL SIGN U..DEVANAGARI VOWEL SIGN AI 094D ; Case_Ignorable # Mn DEVANAGARI SIGN VIRAMA 0951..0957 ; Case_Ignorable # Mn [7] DEVANAGARI STRESS SIGN UDATTA..DEVANAGARI VOWEL SIGN UUE 0962..0963 ; Case_Ignorable # Mn [2] DEVANAGARI VOWEL SIGN VOCALIC L..DEVANAGARI VOWEL SIGN VOCALIC LL 0971 ; Case_Ignorable # Lm DEVANAGARI SIGN HIGH SPACING DOT 0981 ; Case_Ignorable # Mn BENGALI SIGN CANDRABINDU 09BC ; Case_Ignorable # Mn BENGALI SIGN NUKTA 09C1..09C4 ; Case_Ignorable # Mn [4] BENGALI VOWEL SIGN U..BENGALI VOWEL SIGN VOCALIC RR 09CD ; Case_Ignorable # Mn BENGALI SIGN VIRAMA 09E2..09E3 ; Case_Ignorable # Mn [2] BENGALI VOWEL SIGN VOCALIC L..BENGALI VOWEL SIGN VOCALIC LL 09FE ; Case_Ignorable # Mn BENGALI SANDHI MARK 0A01..0A02 ; Case_Ignorable # Mn [2] GURMUKHI SIGN ADAK BINDI..GURMUKHI SIGN BINDI 0A3C ; Case_Ignorable # Mn GURMUKHI SIGN NUKTA 0A41..0A42 ; 
Case_Ignorable # Mn [2] GURMUKHI VOWEL SIGN U..GURMUKHI VOWEL SIGN UU 0A47..0A48 ; Case_Ignorable # Mn [2] GURMUKHI VOWEL SIGN EE..GURMUKHI VOWEL SIGN AI 0A4B..0A4D ; Case_Ignorable # Mn [3] GURMUKHI VOWEL SIGN OO..GURMUKHI SIGN VIRAMA 0A51 ; Case_Ignorable # Mn GURMUKHI SIGN UDAAT 0A70..0A71 ; Case_Ignorable # Mn [2] GURMUKHI TIPPI..GURMUKHI ADDAK 0A75 ; Case_Ignorable # Mn GURMUKHI SIGN YAKASH 0A81..0A82 ; Case_Ignorable # Mn [2] GUJARATI SIGN CANDRABINDU..GUJARATI SIGN ANUSVARA 0ABC ; Case_Ignorable # Mn GUJARATI SIGN NUKTA 0AC1..0AC5 ; Case_Ignorable # Mn [5] GUJARATI VOWEL SIGN U..GUJARATI VOWEL SIGN CANDRA E 0AC7..0AC8 ; Case_Ignorable # Mn [2] GUJARATI VOWEL SIGN E..GUJARATI VOWEL SIGN AI 0ACD ; Case_Ignorable # Mn GUJARATI SIGN VIRAMA 0AE2..0AE3 ; Case_Ignorable # Mn [2] GUJARATI VOWEL SIGN VOCALIC L..GUJARATI VOWEL SIGN VOCALIC LL 0AFA..0AFF ; Case_Ignorable # Mn [6] GUJARATI SIGN SUKUN..GUJARATI SIGN TWO-CIRCLE NUKTA ABOVE 0B01 ; Case_Ignorable # Mn ORIYA SIGN CANDRABINDU 0B3C ; Case_Ignorable # Mn ORIYA SIGN NUKTA 0B3F ; Case_Ignorable # Mn ORIYA VOWEL SIGN I 0B41..0B44 ; Case_Ignorable # Mn [4] ORIYA VOWEL SIGN U..ORIYA VOWEL SIGN VOCALIC RR 0B4D ; Case_Ignorable # Mn ORIYA SIGN VIRAMA 0B55..0B56 ; Case_Ignorable # Mn [2] ORIYA SIGN OVERLINE..ORIYA AI LENGTH MARK 0B62..0B63 ; Case_Ignorable # Mn [2] ORIYA VOWEL SIGN VOCALIC L..ORIYA VOWEL SIGN VOCALIC LL 0B82 ; Case_Ignorable # Mn TAMIL SIGN ANUSVARA 0BC0 ; Case_Ignorable # Mn TAMIL VOWEL SIGN II 0BCD ; Case_Ignorable # Mn TAMIL SIGN VIRAMA 0C00 ; Case_Ignorable # Mn TELUGU SIGN COMBINING CANDRABINDU ABOVE 0C04 ; Case_Ignorable # Mn TELUGU SIGN COMBINING ANUSVARA ABOVE 0C3C ; Case_Ignorable # Mn TELUGU SIGN NUKTA 0C3E..0C40 ; Case_Ignorable # Mn [3] TELUGU VOWEL SIGN AA..TELUGU VOWEL SIGN II 0C46..0C48 ; Case_Ignorable # Mn [3] TELUGU VOWEL SIGN E..TELUGU VOWEL SIGN AI 0C4A..0C4D ; Case_Ignorable # Mn [4] TELUGU VOWEL SIGN O..TELUGU SIGN VIRAMA 0C55..0C56 ; Case_Ignorable # Mn [2] TELUGU LENGTH 
MARK..TELUGU AI LENGTH MARK 0C62..0C63 ; Case_Ignorable # Mn [2] TELUGU VOWEL SIGN VOCALIC L..TELUGU VOWEL SIGN VOCALIC LL 0C81 ; Case_Ignorable # Mn KANNADA SIGN CANDRABINDU 0CBC ; Case_Ignorable # Mn KANNADA SIGN NUKTA 0CBF ; Case_Ignorable # Mn KANNADA VOWEL SIGN I 0CC6 ; Case_Ignorable # Mn KANNADA VOWEL SIGN E 0CCC..0CCD ; Case_Ignorable # Mn [2] KANNADA VOWEL SIGN AU..KANNADA SIGN VIRAMA 0CE2..0CE3 ; Case_Ignorable # Mn [2] KANNADA VOWEL SIGN VOCALIC L..KANNADA VOWEL SIGN VOCALIC LL 0D00..0D01 ; Case_Ignorable # Mn [2] MALAYALAM SIGN COMBINING ANUSVARA ABOVE..MALAYALAM SIGN CANDRABINDU 0D3B..0D3C ; Case_Ignorable # Mn [2] MALAYALAM SIGN VERTICAL BAR VIRAMA..MALAYALAM SIGN CIRCULAR VIRAMA 0D41..0D44 ; Case_Ignorable # Mn [4] MALAYALAM VOWEL SIGN U..MALAYALAM VOWEL SIGN VOCALIC RR 0D4D ; Case_Ignorable # Mn MALAYALAM SIGN VIRAMA 0D62..0D63 ; Case_Ignorable # Mn [2] MALAYALAM VOWEL SIGN VOCALIC L..MALAYALAM VOWEL SIGN VOCALIC LL 0D81 ; Case_Ignorable # Mn SINHALA SIGN CANDRABINDU 0DCA ; Case_Ignorable # Mn SINHALA SIGN AL-LAKUNA 0DD2..0DD4 ; Case_Ignorable # Mn [3] SINHALA VOWEL SIGN KETTI IS-PILLA..SINHALA VOWEL SIGN KETTI PAA-PILLA 0DD6 ; Case_Ignorable # Mn SINHALA VOWEL SIGN DIGA PAA-PILLA 0E31 ; Case_Ignorable # Mn THAI CHARACTER MAI HAN-AKAT 0E34..0E3A ; Case_Ignorable # Mn [7] THAI CHARACTER SARA I..THAI CHARACTER PHINTHU 0E46 ; Case_Ignorable # Lm THAI CHARACTER MAIYAMOK 0E47..0E4E ; Case_Ignorable # Mn [8] THAI CHARACTER MAITAIKHU..THAI CHARACTER YAMAKKAN 0EB1 ; Case_Ignorable # Mn LAO VOWEL SIGN MAI KAN 0EB4..0EBC ; Case_Ignorable # Mn [9] LAO VOWEL SIGN I..LAO SEMIVOWEL SIGN LO 0EC6 ; Case_Ignorable # Lm LAO KO LA 0EC8..0ECE ; Case_Ignorable # Mn [7] LAO TONE MAI EK..LAO YAMAKKAN 0F18..0F19 ; Case_Ignorable # Mn [2] TIBETAN ASTROLOGICAL SIGN -KHYUD PA..TIBETAN ASTROLOGICAL SIGN SDONG TSHUGS 0F35 ; Case_Ignorable # Mn TIBETAN MARK NGAS BZUNG NYI ZLA 0F37 ; Case_Ignorable # Mn TIBETAN MARK NGAS BZUNG SGOR RTAGS 0F39 ; Case_Ignorable # Mn TIBETAN MARK 
TSA -PHRU 0F71..0F7E ; Case_Ignorable # Mn [14] TIBETAN VOWEL SIGN AA..TIBETAN SIGN RJES SU NGA RO 0F80..0F84 ; Case_Ignorable # Mn [5] TIBETAN VOWEL SIGN REVERSED I..TIBETAN MARK HALANTA 0F86..0F87 ; Case_Ignorable # Mn [2] TIBETAN SIGN LCI RTAGS..TIBETAN SIGN YANG RTAGS 0F8D..0F97 ; Case_Ignorable # Mn [11] TIBETAN SUBJOINED SIGN LCE TSA CAN..TIBETAN SUBJOINED LETTER JA 0F99..0FBC ; Case_Ignorable # Mn [36] TIBETAN SUBJOINED LETTER NYA..TIBETAN SUBJOINED LETTER FIXED-FORM RA 0FC6 ; Case_Ignorable # Mn TIBETAN SYMBOL PADMA GDAN 102D..1030 ; Case_Ignorable # Mn [4] MYANMAR VOWEL SIGN I..MYANMAR VOWEL SIGN UU 1032..1037 ; Case_Ignorable # Mn [6] MYANMAR VOWEL SIGN AI..MYANMAR SIGN DOT BELOW 1039..103A ; Case_Ignorable # Mn [2] MYANMAR SIGN VIRAMA..MYANMAR SIGN ASAT 103D..103E ; Case_Ignorable # Mn [2] MYANMAR CONSONANT SIGN MEDIAL WA..MYANMAR CONSONANT SIGN MEDIAL HA 1058..1059 ; Case_Ignorable # Mn [2] MYANMAR VOWEL SIGN VOCALIC L..MYANMAR VOWEL SIGN VOCALIC LL 105E..1060 ; Case_Ignorable # Mn [3] MYANMAR CONSONANT SIGN MON MEDIAL NA..MYANMAR CONSONANT SIGN MON MEDIAL LA 1071..1074 ; Case_Ignorable # Mn [4] MYANMAR VOWEL SIGN GEBA KAREN I..MYANMAR VOWEL SIGN KAYAH EE 1082 ; Case_Ignorable # Mn MYANMAR CONSONANT SIGN SHAN MEDIAL WA 1085..1086 ; Case_Ignorable # Mn [2] MYANMAR VOWEL SIGN SHAN E ABOVE..MYANMAR VOWEL SIGN SHAN FINAL Y 108D ; Case_Ignorable # Mn MYANMAR SIGN SHAN COUNCIL EMPHATIC TONE 109D ; Case_Ignorable # Mn MYANMAR VOWEL SIGN AITON AI 10FC ; Case_Ignorable # Lm MODIFIER LETTER GEORGIAN NAR 135D..135F ; Case_Ignorable # Mn [3] ETHIOPIC COMBINING GEMINATION AND VOWEL LENGTH MARK..ETHIOPIC COMBINING GEMINATION MARK 1712..1714 ; Case_Ignorable # Mn [3] TAGALOG VOWEL SIGN I..TAGALOG SIGN VIRAMA 1732..1733 ; Case_Ignorable # Mn [2] HANUNOO VOWEL SIGN I..HANUNOO VOWEL SIGN U 1752..1753 ; Case_Ignorable # Mn [2] BUHID VOWEL SIGN I..BUHID VOWEL SIGN U 1772..1773 ; Case_Ignorable # Mn [2] TAGBANWA VOWEL SIGN I..TAGBANWA VOWEL SIGN U 17B4..17B5 ; 
Case_Ignorable # Mn [2] KHMER VOWEL INHERENT AQ..KHMER VOWEL INHERENT AA 17B7..17BD ; Case_Ignorable # Mn [7] KHMER VOWEL SIGN I..KHMER VOWEL SIGN UA 17C6 ; Case_Ignorable # Mn KHMER SIGN NIKAHIT 17C9..17D3 ; Case_Ignorable # Mn [11] KHMER SIGN MUUSIKATOAN..KHMER SIGN BATHAMASAT 17D7 ; Case_Ignorable # Lm KHMER SIGN LEK TOO 17DD ; Case_Ignorable # Mn KHMER SIGN ATTHACAN 180B..180D ; Case_Ignorable # Mn [3] MONGOLIAN FREE VARIATION SELECTOR ONE..MONGOLIAN FREE VARIATION SELECTOR THREE 180E ; Case_Ignorable # Cf MONGOLIAN VOWEL SEPARATOR 180F ; Case_Ignorable # Mn MONGOLIAN FREE VARIATION SELECTOR FOUR 1843 ; Case_Ignorable # Lm MONGOLIAN LETTER TODO LONG VOWEL SIGN 1885..1886 ; Case_Ignorable # Mn [2] MONGOLIAN LETTER ALI GALI BALUDA..MONGOLIAN LETTER ALI GALI THREE BALUDA 18A9 ; Case_Ignorable # Mn MONGOLIAN LETTER ALI GALI DAGALGA 1920..1922 ; Case_Ignorable # Mn [3] LIMBU VOWEL SIGN A..LIMBU VOWEL SIGN U 1927..1928 ; Case_Ignorable # Mn [2] LIMBU VOWEL SIGN E..LIMBU VOWEL SIGN O 1932 ; Case_Ignorable # Mn LIMBU SMALL LETTER ANUSVARA 1939..193B ; Case_Ignorable # Mn [3] LIMBU SIGN MUKPHRENG..LIMBU SIGN SA-I 1A17..1A18 ; Case_Ignorable # Mn [2] BUGINESE VOWEL SIGN I..BUGINESE VOWEL SIGN U 1A1B ; Case_Ignorable # Mn BUGINESE VOWEL SIGN AE 1A56 ; Case_Ignorable # Mn TAI THAM CONSONANT SIGN MEDIAL LA 1A58..1A5E ; Case_Ignorable # Mn [7] TAI THAM SIGN MAI KANG LAI..TAI THAM CONSONANT SIGN SA 1A60 ; Case_Ignorable # Mn TAI THAM SIGN SAKOT 1A62 ; Case_Ignorable # Mn TAI THAM VOWEL SIGN MAI SAT 1A65..1A6C ; Case_Ignorable # Mn [8] TAI THAM VOWEL SIGN I..TAI THAM VOWEL SIGN OA BELOW 1A73..1A7C ; Case_Ignorable # Mn [10] TAI THAM VOWEL SIGN OA ABOVE..TAI THAM SIGN KHUEN-LUE KARAN 1A7F ; Case_Ignorable # Mn TAI THAM COMBINING CRYPTOGRAMMIC DOT 1AA7 ; Case_Ignorable # Lm TAI THAM SIGN MAI YAMOK 1AB0..1ABD ; Case_Ignorable # Mn [14] COMBINING DOUBLED CIRCUMFLEX ACCENT..COMBINING PARENTHESES BELOW 1ABE ; Case_Ignorable # Me COMBINING PARENTHESES OVERLAY 1ABF..1ADD ; 
Case_Ignorable # Mn [31] COMBINING LATIN SMALL LETTER W BELOW..COMBINING DOT-AND-RING BELOW 1AE0..1AEB ; Case_Ignorable # Mn [12] COMBINING LEFT TACK ABOVE..COMBINING DOUBLE RIGHTWARDS ARROW ABOVE 1B00..1B03 ; Case_Ignorable # Mn [4] BALINESE SIGN ULU RICEM..BALINESE SIGN SURANG 1B34 ; Case_Ignorable # Mn BALINESE SIGN REREKAN 1B36..1B3A ; Case_Ignorable # Mn [5] BALINESE VOWEL SIGN ULU..BALINESE VOWEL SIGN RA REPA 1B3C ; Case_Ignorable # Mn BALINESE VOWEL SIGN LA LENGA 1B42 ; Case_Ignorable # Mn BALINESE VOWEL SIGN PEPET 1B6B..1B73 ; Case_Ignorable # Mn [9] BALINESE MUSICAL SYMBOL COMBINING TEGEH..BALINESE MUSICAL SYMBOL COMBINING GONG 1B80..1B81 ; Case_Ignorable # Mn [2] SUNDANESE SIGN PANYECEK..SUNDANESE SIGN PANGLAYAR 1BA2..1BA5 ; Case_Ignorable # Mn [4] SUNDANESE CONSONANT SIGN PANYAKRA..SUNDANESE VOWEL SIGN PANYUKU 1BA8..1BA9 ; Case_Ignorable # Mn [2] SUNDANESE VOWEL SIGN PAMEPET..SUNDANESE VOWEL SIGN PANEULEUNG 1BAB..1BAD ; Case_Ignorable # Mn [3] SUNDANESE SIGN VIRAMA..SUNDANESE CONSONANT SIGN PASANGAN WA 1BE6 ; Case_Ignorable # Mn BATAK SIGN TOMPI 1BE8..1BE9 ; Case_Ignorable # Mn [2] BATAK VOWEL SIGN PAKPAK E..BATAK VOWEL SIGN EE 1BED ; Case_Ignorable # Mn BATAK VOWEL SIGN KARO O 1BEF..1BF1 ; Case_Ignorable # Mn [3] BATAK VOWEL SIGN U FOR SIMALUNGUN SA..BATAK CONSONANT SIGN H 1C2C..1C33 ; Case_Ignorable # Mn [8] LEPCHA VOWEL SIGN E..LEPCHA CONSONANT SIGN T 1C36..1C37 ; Case_Ignorable # Mn [2] LEPCHA SIGN RAN..LEPCHA SIGN NUKTA 1C78..1C7D ; Case_Ignorable # Lm [6] OL CHIKI MU TTUDDAG..OL CHIKI AHAD 1CD0..1CD2 ; Case_Ignorable # Mn [3] VEDIC TONE KARSHANA..VEDIC TONE PRENKHA 1CD4..1CE0 ; Case_Ignorable # Mn [13] VEDIC SIGN YAJURVEDIC MIDLINE SVARITA..VEDIC TONE RIGVEDIC KASHMIRI INDEPENDENT SVARITA 1CE2..1CE8 ; Case_Ignorable # Mn [7] VEDIC SIGN VISARGA SVARITA..VEDIC SIGN VISARGA ANUDATTA WITH TAIL 1CED ; Case_Ignorable # Mn VEDIC SIGN TIRYAK 1CF4 ; Case_Ignorable # Mn VEDIC TONE CANDRA ABOVE 1CF8..1CF9 ; Case_Ignorable # Mn [2] VEDIC TONE RING ABOVE..VEDIC 
TONE DOUBLE RING ABOVE 1D2C..1D6A ; Case_Ignorable # Lm [63] MODIFIER LETTER CAPITAL A..GREEK SUBSCRIPT SMALL LETTER CHI 1D78 ; Case_Ignorable # Lm MODIFIER LETTER CYRILLIC EN 1D9B..1DBF ; Case_Ignorable # Lm [37] MODIFIER LETTER SMALL TURNED ALPHA..MODIFIER LETTER SMALL THETA 1DC0..1DFF ; Case_Ignorable # Mn [64] COMBINING DOTTED GRAVE ACCENT..COMBINING RIGHT ARROWHEAD AND DOWN ARROWHEAD BELOW 1FBD ; Case_Ignorable # Sk GREEK KORONIS 1FBF..1FC1 ; Case_Ignorable # Sk [3] GREEK PSILI..GREEK DIALYTIKA AND PERISPOMENI 1FCD..1FCF ; Case_Ignorable # Sk [3] GREEK PSILI AND VARIA..GREEK PSILI AND PERISPOMENI 1FDD..1FDF ; Case_Ignorable # Sk [3] GREEK DASIA AND VARIA..GREEK DASIA AND PERISPOMENI 1FED..1FEF ; Case_Ignorable # Sk [3] GREEK DIALYTIKA AND VARIA..GREEK VARIA 1FFD..1FFE ; Case_Ignorable # Sk [2] GREEK OXIA..GREEK DASIA 200B..200F ; Case_Ignorable # Cf [5] ZERO WIDTH SPACE..RIGHT-TO-LEFT MARK 2018 ; Case_Ignorable # Pi LEFT SINGLE QUOTATION MARK 2019 ; Case_Ignorable # Pf RIGHT SINGLE QUOTATION MARK 2024 ; Case_Ignorable # Po ONE DOT LEADER 2027 ; Case_Ignorable # Po HYPHENATION POINT 202A..202E ; Case_Ignorable # Cf [5] LEFT-TO-RIGHT EMBEDDING..RIGHT-TO-LEFT OVERRIDE 2060..2064 ; Case_Ignorable # Cf [5] WORD JOINER..INVISIBLE PLUS 2066..206F ; Case_Ignorable # Cf [10] LEFT-TO-RIGHT ISOLATE..NOMINAL DIGIT SHAPES 2071 ; Case_Ignorable # Lm SUPERSCRIPT LATIN SMALL LETTER I 207F ; Case_Ignorable # Lm SUPERSCRIPT LATIN SMALL LETTER N 2090..209C ; Case_Ignorable # Lm [13] LATIN SUBSCRIPT SMALL LETTER A..LATIN SUBSCRIPT SMALL LETTER T 20D0..20DC ; Case_Ignorable # Mn [13] COMBINING LEFT HARPOON ABOVE..COMBINING FOUR DOTS ABOVE 20DD..20E0 ; Case_Ignorable # Me [4] COMBINING ENCLOSING CIRCLE..COMBINING ENCLOSING CIRCLE BACKSLASH 20E1 ; Case_Ignorable # Mn COMBINING LEFT RIGHT ARROW ABOVE 20E2..20E4 ; Case_Ignorable # Me [3] COMBINING ENCLOSING SCREEN..COMBINING ENCLOSING UPWARD POINTING TRIANGLE 20E5..20F0 ; Case_Ignorable # Mn [12] COMBINING REVERSE SOLIDUS 
OVERLAY..COMBINING ASTERISK ABOVE 2C7C..2C7D ; Case_Ignorable # Lm [2] LATIN SUBSCRIPT SMALL LETTER J..MODIFIER LETTER CAPITAL V 2CEF..2CF1 ; Case_Ignorable # Mn [3] COPTIC COMBINING NI ABOVE..COPTIC COMBINING SPIRITUS LENIS 2D6F ; Case_Ignorable # Lm TIFINAGH MODIFIER LETTER LABIALIZATION MARK 2D7F ; Case_Ignorable # Mn TIFINAGH CONSONANT JOINER 2DE0..2DFF ; Case_Ignorable # Mn [32] COMBINING CYRILLIC LETTER BE..COMBINING CYRILLIC LETTER IOTIFIED BIG YUS 2E2F ; Case_Ignorable # Lm VERTICAL TILDE 3005 ; Case_Ignorable # Lm IDEOGRAPHIC ITERATION MARK 302A..302D ; Case_Ignorable # Mn [4] IDEOGRAPHIC LEVEL TONE MARK..IDEOGRAPHIC ENTERING TONE MARK 3031..3035 ; Case_Ignorable # Lm [5] VERTICAL KANA REPEAT MARK..VERTICAL KANA REPEAT MARK LOWER HALF 303B ; Case_Ignorable # Lm VERTICAL IDEOGRAPHIC ITERATION MARK 3099..309A ; Case_Ignorable # Mn [2] COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK..COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK 309B..309C ; Case_Ignorable # Sk [2] KATAKANA-HIRAGANA VOICED SOUND MARK..KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK 309D..309E ; Case_Ignorable # Lm [2] HIRAGANA ITERATION MARK..HIRAGANA VOICED ITERATION MARK 30FC..30FE ; Case_Ignorable # Lm [3] KATAKANA-HIRAGANA PROLONGED SOUND MARK..KATAKANA VOICED ITERATION MARK A015 ; Case_Ignorable # Lm YI SYLLABLE WU A4F8..A4FD ; Case_Ignorable # Lm [6] LISU LETTER TONE MYA TI..LISU LETTER TONE MYA JEU A60C ; Case_Ignorable # Lm VAI SYLLABLE LENGTHENER A66F ; Case_Ignorable # Mn COMBINING CYRILLIC VZMET A670..A672 ; Case_Ignorable # Me [3] COMBINING CYRILLIC TEN MILLIONS SIGN..COMBINING CYRILLIC THOUSAND MILLIONS SIGN A674..A67D ; Case_Ignorable # Mn [10] COMBINING CYRILLIC LETTER UKRAINIAN IE..COMBINING CYRILLIC PAYEROK A67F ; Case_Ignorable # Lm CYRILLIC PAYEROK A69C..A69D ; Case_Ignorable # Lm [2] MODIFIER LETTER CYRILLIC HARD SIGN..MODIFIER LETTER CYRILLIC SOFT SIGN A69E..A69F ; Case_Ignorable # Mn [2] COMBINING CYRILLIC LETTER EF..COMBINING CYRILLIC LETTER IOTIFIED E A6F0..A6F1 ; 
Case_Ignorable # Mn [2] BAMUM COMBINING MARK KOQNDON..BAMUM COMBINING MARK TUKWENTIS A700..A716 ; Case_Ignorable # Sk [23] MODIFIER LETTER CHINESE TONE YIN PING..MODIFIER LETTER EXTRA-LOW LEFT-STEM TONE BAR A717..A71F ; Case_Ignorable # Lm [9] MODIFIER LETTER DOT VERTICAL BAR..MODIFIER LETTER LOW INVERTED EXCLAMATION MARK A720..A721 ; Case_Ignorable # Sk [2] MODIFIER LETTER STRESS AND HIGH TONE..MODIFIER LETTER STRESS AND LOW TONE A770 ; Case_Ignorable # Lm MODIFIER LETTER US A788 ; Case_Ignorable # Lm MODIFIER LETTER LOW CIRCUMFLEX ACCENT A789..A78A ; Case_Ignorable # Sk [2] MODIFIER LETTER COLON..MODIFIER LETTER SHORT EQUALS SIGN A7F1..A7F4 ; Case_Ignorable # Lm [4] MODIFIER LETTER CAPITAL S..MODIFIER LETTER CAPITAL Q A7F8..A7F9 ; Case_Ignorable # Lm [2] MODIFIER LETTER CAPITAL H WITH STROKE..MODIFIER LETTER SMALL LIGATURE OE A802 ; Case_Ignorable # Mn SYLOTI NAGRI SIGN DVISVARA A806 ; Case_Ignorable # Mn SYLOTI NAGRI SIGN HASANTA A80B ; Case_Ignorable # Mn SYLOTI NAGRI SIGN ANUSVARA A825..A826 ; Case_Ignorable # Mn [2] SYLOTI NAGRI VOWEL SIGN U..SYLOTI NAGRI VOWEL SIGN E A82C ; Case_Ignorable # Mn SYLOTI NAGRI SIGN ALTERNATE HASANTA A8C4..A8C5 ; Case_Ignorable # Mn [2] SAURASHTRA SIGN VIRAMA..SAURASHTRA SIGN CANDRABINDU A8E0..A8F1 ; Case_Ignorable # Mn [18] COMBINING DEVANAGARI DIGIT ZERO..COMBINING DEVANAGARI SIGN AVAGRAHA A8FF ; Case_Ignorable # Mn DEVANAGARI VOWEL SIGN AY A926..A92D ; Case_Ignorable # Mn [8] KAYAH LI VOWEL UE..KAYAH LI TONE CALYA PLOPHU A947..A951 ; Case_Ignorable # Mn [11] REJANG VOWEL SIGN I..REJANG CONSONANT SIGN R A980..A982 ; Case_Ignorable # Mn [3] JAVANESE SIGN PANYANGGA..JAVANESE SIGN LAYAR A9B3 ; Case_Ignorable # Mn JAVANESE SIGN CECAK TELU A9B6..A9B9 ; Case_Ignorable # Mn [4] JAVANESE VOWEL SIGN WULU..JAVANESE VOWEL SIGN SUKU MENDUT A9BC..A9BD ; Case_Ignorable # Mn [2] JAVANESE VOWEL SIGN PEPET..JAVANESE CONSONANT SIGN KERET A9CF ; Case_Ignorable # Lm JAVANESE PANGRANGKEP A9E5 ; Case_Ignorable # Mn MYANMAR SIGN SHAN SAW A9E6 ; 
Case_Ignorable # Lm MYANMAR MODIFIER LETTER SHAN REDUPLICATION AA29..AA2E ; Case_Ignorable # Mn [6] CHAM VOWEL SIGN AA..CHAM VOWEL SIGN OE AA31..AA32 ; Case_Ignorable # Mn [2] CHAM VOWEL SIGN AU..CHAM VOWEL SIGN UE AA35..AA36 ; Case_Ignorable # Mn [2] CHAM CONSONANT SIGN LA..CHAM CONSONANT SIGN WA AA43 ; Case_Ignorable # Mn CHAM CONSONANT SIGN FINAL NG AA4C ; Case_Ignorable # Mn CHAM CONSONANT SIGN FINAL M AA70 ; Case_Ignorable # Lm MYANMAR MODIFIER LETTER KHAMTI REDUPLICATION AA7C ; Case_Ignorable # Mn MYANMAR SIGN TAI LAING TONE-2 AAB0 ; Case_Ignorable # Mn TAI VIET MAI KANG AAB2..AAB4 ; Case_Ignorable # Mn [3] TAI VIET VOWEL I..TAI VIET VOWEL U AAB7..AAB8 ; Case_Ignorable # Mn [2] TAI VIET MAI KHIT..TAI VIET VOWEL IA AABE..AABF ; Case_Ignorable # Mn [2] TAI VIET VOWEL AM..TAI VIET TONE MAI EK AAC1 ; Case_Ignorable # Mn TAI VIET TONE MAI THO AADD ; Case_Ignorable # Lm TAI VIET SYMBOL SAM AAEC..AAED ; Case_Ignorable # Mn [2] MEETEI MAYEK VOWEL SIGN UU..MEETEI MAYEK VOWEL SIGN AAI AAF3..AAF4 ; Case_Ignorable # Lm [2] MEETEI MAYEK SYLLABLE REPETITION MARK..MEETEI MAYEK WORD REPETITION MARK AAF6 ; Case_Ignorable # Mn MEETEI MAYEK VIRAMA AB5B ; Case_Ignorable # Sk MODIFIER BREVE WITH INVERTED BREVE AB5C..AB5F ; Case_Ignorable # Lm [4] MODIFIER LETTER SMALL HENG..MODIFIER LETTER SMALL U WITH LEFT HOOK AB69 ; Case_Ignorable # Lm MODIFIER LETTER SMALL TURNED W AB6A..AB6B ; Case_Ignorable # Sk [2] MODIFIER LETTER LEFT TACK..MODIFIER LETTER RIGHT TACK ABE5 ; Case_Ignorable # Mn MEETEI MAYEK VOWEL SIGN ANAP ABE8 ; Case_Ignorable # Mn MEETEI MAYEK VOWEL SIGN UNAP ABED ; Case_Ignorable # Mn MEETEI MAYEK APUN IYEK FB1E ; Case_Ignorable # Mn HEBREW POINT JUDEO-SPANISH VARIKA FBB2..FBC2 ; Case_Ignorable # Sk [17] ARABIC SYMBOL DOT ABOVE..ARABIC SYMBOL WASLA ABOVE FE00..FE0F ; Case_Ignorable # Mn [16] VARIATION SELECTOR-1..VARIATION SELECTOR-16 FE13 ; Case_Ignorable # Po PRESENTATION FORM FOR VERTICAL COLON FE20..FE2F ; Case_Ignorable # Mn [16] COMBINING LIGATURE LEFT 
HALF..COMBINING CYRILLIC TITLO RIGHT HALF FE52 ; Case_Ignorable # Po SMALL FULL STOP FE55 ; Case_Ignorable # Po SMALL COLON FEFF ; Case_Ignorable # Cf ZERO WIDTH NO-BREAK SPACE FF07 ; Case_Ignorable # Po FULLWIDTH APOSTROPHE FF0E ; Case_Ignorable # Po FULLWIDTH FULL STOP FF1A ; Case_Ignorable # Po FULLWIDTH COLON FF3E ; Case_Ignorable # Sk FULLWIDTH CIRCUMFLEX ACCENT FF40 ; Case_Ignorable # Sk FULLWIDTH GRAVE ACCENT FF70 ; Case_Ignorable # Lm HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK FF9E..FF9F ; Case_Ignorable # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK FFE3 ; Case_Ignorable # Sk FULLWIDTH MACRON FFF9..FFFB ; Case_Ignorable # Cf [3] INTERLINEAR ANNOTATION ANCHOR..INTERLINEAR ANNOTATION TERMINATOR 101FD ; Case_Ignorable # Mn PHAISTOS DISC SIGN COMBINING OBLIQUE STROKE 102E0 ; Case_Ignorable # Mn COPTIC EPACT THOUSANDS MARK 10376..1037A ; Case_Ignorable # Mn [5] COMBINING OLD PERMIC LETTER AN..COMBINING OLD PERMIC LETTER SII 10780..10785 ; Case_Ignorable # Lm [6] MODIFIER LETTER SMALL CAPITAL AA..MODIFIER LETTER SMALL B WITH HOOK 10787..107B0 ; Case_Ignorable # Lm [42] MODIFIER LETTER SMALL DZ DIGRAPH..MODIFIER LETTER SMALL V WITH RIGHT HOOK 107B2..107BA ; Case_Ignorable # Lm [9] MODIFIER LETTER SMALL CAPITAL Y..MODIFIER LETTER SMALL S WITH CURL 10A01..10A03 ; Case_Ignorable # Mn [3] KHAROSHTHI VOWEL SIGN I..KHAROSHTHI VOWEL SIGN VOCALIC R 10A05..10A06 ; Case_Ignorable # Mn [2] KHAROSHTHI VOWEL SIGN E..KHAROSHTHI VOWEL SIGN O 10A0C..10A0F ; Case_Ignorable # Mn [4] KHAROSHTHI VOWEL LENGTH MARK..KHAROSHTHI SIGN VISARGA 10A38..10A3A ; Case_Ignorable # Mn [3] KHAROSHTHI SIGN BAR ABOVE..KHAROSHTHI SIGN DOT BELOW 10A3F ; Case_Ignorable # Mn KHAROSHTHI VIRAMA 10AE5..10AE6 ; Case_Ignorable # Mn [2] MANICHAEAN ABBREVIATION MARK ABOVE..MANICHAEAN ABBREVIATION MARK BELOW 10D24..10D27 ; Case_Ignorable # Mn [4] HANIFI ROHINGYA SIGN HARBAHAY..HANIFI ROHINGYA SIGN TASSI 10D4E ; Case_Ignorable # Lm GARAY VOWEL LENGTH MARK 
10D69..10D6D ; Case_Ignorable # Mn [5] GARAY VOWEL SIGN E..GARAY CONSONANT NASALIZATION MARK 10D6F ; Case_Ignorable # Lm GARAY REDUPLICATION MARK 10EAB..10EAC ; Case_Ignorable # Mn [2] YEZIDI COMBINING HAMZA MARK..YEZIDI COMBINING MADDA MARK 10EC5 ; Case_Ignorable # Lm ARABIC SMALL YEH BARREE WITH TWO DOTS BELOW 10EFA..10EFF ; Case_Ignorable # Mn [6] ARABIC DOUBLE VERTICAL BAR BELOW..ARABIC SMALL LOW WORD MADDA 10F46..10F50 ; Case_Ignorable # Mn [11] SOGDIAN COMBINING DOT BELOW..SOGDIAN COMBINING STROKE BELOW 10F82..10F85 ; Case_Ignorable # Mn [4] OLD UYGHUR COMBINING DOT ABOVE..OLD UYGHUR COMBINING TWO DOTS BELOW 11001 ; Case_Ignorable # Mn BRAHMI SIGN ANUSVARA 11038..11046 ; Case_Ignorable # Mn [15] BRAHMI VOWEL SIGN AA..BRAHMI VIRAMA 11070 ; Case_Ignorable # Mn BRAHMI SIGN OLD TAMIL VIRAMA 11073..11074 ; Case_Ignorable # Mn [2] BRAHMI VOWEL SIGN OLD TAMIL SHORT E..BRAHMI VOWEL SIGN OLD TAMIL SHORT O 1107F..11081 ; Case_Ignorable # Mn [3] BRAHMI NUMBER JOINER..KAITHI SIGN ANUSVARA 110B3..110B6 ; Case_Ignorable # Mn [4] KAITHI VOWEL SIGN U..KAITHI VOWEL SIGN AI 110B9..110BA ; Case_Ignorable # Mn [2] KAITHI SIGN VIRAMA..KAITHI SIGN NUKTA 110BD ; Case_Ignorable # Cf KAITHI NUMBER SIGN 110C2 ; Case_Ignorable # Mn KAITHI VOWEL SIGN VOCALIC R 110CD ; Case_Ignorable # Cf KAITHI NUMBER SIGN ABOVE 11100..11102 ; Case_Ignorable # Mn [3] CHAKMA SIGN CANDRABINDU..CHAKMA SIGN VISARGA 11127..1112B ; Case_Ignorable # Mn [5] CHAKMA VOWEL SIGN A..CHAKMA VOWEL SIGN UU 1112D..11134 ; Case_Ignorable # Mn [8] CHAKMA VOWEL SIGN AI..CHAKMA MAAYYAA 11173 ; Case_Ignorable # Mn MAHAJANI SIGN NUKTA 11180..11181 ; Case_Ignorable # Mn [2] SHARADA SIGN CANDRABINDU..SHARADA SIGN ANUSVARA 111B6..111BE ; Case_Ignorable # Mn [9] SHARADA VOWEL SIGN U..SHARADA VOWEL SIGN O 111C9..111CC ; Case_Ignorable # Mn [4] SHARADA SANDHI MARK..SHARADA EXTRA SHORT VOWEL MARK 111CF ; Case_Ignorable # Mn SHARADA SIGN INVERTED CANDRABINDU 1122F..11231 ; Case_Ignorable # Mn [3] KHOJKI VOWEL SIGN U..KHOJKI VOWEL 
SIGN AI 11234 ; Case_Ignorable # Mn KHOJKI SIGN ANUSVARA 11236..11237 ; Case_Ignorable # Mn [2] KHOJKI SIGN NUKTA..KHOJKI SIGN SHADDA 1123E ; Case_Ignorable # Mn KHOJKI SIGN SUKUN 11241 ; Case_Ignorable # Mn KHOJKI VOWEL SIGN VOCALIC R 112DF ; Case_Ignorable # Mn KHUDAWADI SIGN ANUSVARA 112E3..112EA ; Case_Ignorable # Mn [8] KHUDAWADI VOWEL SIGN U..KHUDAWADI SIGN VIRAMA 11300..11301 ; Case_Ignorable # Mn [2] GRANTHA SIGN COMBINING ANUSVARA ABOVE..GRANTHA SIGN CANDRABINDU 1133B..1133C ; Case_Ignorable # Mn [2] COMBINING BINDU BELOW..GRANTHA SIGN NUKTA 11340 ; Case_Ignorable # Mn GRANTHA VOWEL SIGN II 11366..1136C ; Case_Ignorable # Mn [7] COMBINING GRANTHA DIGIT ZERO..COMBINING GRANTHA DIGIT SIX 11370..11374 ; Case_Ignorable # Mn [5] COMBINING GRANTHA LETTER A..COMBINING GRANTHA LETTER PA 113BB..113C0 ; Case_Ignorable # Mn [6] TULU-TIGALARI VOWEL SIGN U..TULU-TIGALARI VOWEL SIGN VOCALIC LL 113CE ; Case_Ignorable # Mn TULU-TIGALARI SIGN VIRAMA 113D0 ; Case_Ignorable # Mn TULU-TIGALARI CONJOINER 113D2 ; Case_Ignorable # Mn TULU-TIGALARI GEMINATION MARK 113E1..113E2 ; Case_Ignorable # Mn [2] TULU-TIGALARI VEDIC TONE SVARITA..TULU-TIGALARI VEDIC TONE ANUDATTA 11438..1143F ; Case_Ignorable # Mn [8] NEWA VOWEL SIGN U..NEWA VOWEL SIGN AI 11442..11444 ; Case_Ignorable # Mn [3] NEWA SIGN VIRAMA..NEWA SIGN ANUSVARA 11446 ; Case_Ignorable # Mn NEWA SIGN NUKTA 1145E ; Case_Ignorable # Mn NEWA SANDHI MARK 114B3..114B8 ; Case_Ignorable # Mn [6] TIRHUTA VOWEL SIGN U..TIRHUTA VOWEL SIGN VOCALIC LL 114BA ; Case_Ignorable # Mn TIRHUTA VOWEL SIGN SHORT E 114BF..114C0 ; Case_Ignorable # Mn [2] TIRHUTA SIGN CANDRABINDU..TIRHUTA SIGN ANUSVARA 114C2..114C3 ; Case_Ignorable # Mn [2] TIRHUTA SIGN VIRAMA..TIRHUTA SIGN NUKTA 115B2..115B5 ; Case_Ignorable # Mn [4] SIDDHAM VOWEL SIGN U..SIDDHAM VOWEL SIGN VOCALIC RR 115BC..115BD ; Case_Ignorable # Mn [2] SIDDHAM SIGN CANDRABINDU..SIDDHAM SIGN ANUSVARA 115BF..115C0 ; Case_Ignorable # Mn [2] SIDDHAM SIGN VIRAMA..SIDDHAM SIGN NUKTA 115DC..115DD ; 
Case_Ignorable # Mn [2] SIDDHAM VOWEL SIGN ALTERNATE U..SIDDHAM VOWEL SIGN ALTERNATE UU 11633..1163A ; Case_Ignorable # Mn [8] MODI VOWEL SIGN U..MODI VOWEL SIGN AI 1163D ; Case_Ignorable # Mn MODI SIGN ANUSVARA 1163F..11640 ; Case_Ignorable # Mn [2] MODI SIGN VIRAMA..MODI SIGN ARDHACANDRA 116AB ; Case_Ignorable # Mn TAKRI SIGN ANUSVARA 116AD ; Case_Ignorable # Mn TAKRI VOWEL SIGN AA 116B0..116B5 ; Case_Ignorable # Mn [6] TAKRI VOWEL SIGN U..TAKRI VOWEL SIGN AU 116B7 ; Case_Ignorable # Mn TAKRI SIGN NUKTA 1171D ; Case_Ignorable # Mn AHOM CONSONANT SIGN MEDIAL LA 1171F ; Case_Ignorable # Mn AHOM CONSONANT SIGN MEDIAL LIGATING RA 11722..11725 ; Case_Ignorable # Mn [4] AHOM VOWEL SIGN I..AHOM VOWEL SIGN UU 11727..1172B ; Case_Ignorable # Mn [5] AHOM VOWEL SIGN AW..AHOM SIGN KILLER 1182F..11837 ; Case_Ignorable # Mn [9] DOGRA VOWEL SIGN U..DOGRA SIGN ANUSVARA 11839..1183A ; Case_Ignorable # Mn [2] DOGRA SIGN VIRAMA..DOGRA SIGN NUKTA 1193B..1193C ; Case_Ignorable # Mn [2] DIVES AKURU SIGN ANUSVARA..DIVES AKURU SIGN CANDRABINDU 1193E ; Case_Ignorable # Mn DIVES AKURU VIRAMA 11943 ; Case_Ignorable # Mn DIVES AKURU SIGN NUKTA 119D4..119D7 ; Case_Ignorable # Mn [4] NANDINAGARI VOWEL SIGN U..NANDINAGARI VOWEL SIGN VOCALIC RR 119DA..119DB ; Case_Ignorable # Mn [2] NANDINAGARI VOWEL SIGN E..NANDINAGARI VOWEL SIGN AI 119E0 ; Case_Ignorable # Mn NANDINAGARI SIGN VIRAMA 11A01..11A0A ; Case_Ignorable # Mn [10] ZANABAZAR SQUARE VOWEL SIGN I..ZANABAZAR SQUARE VOWEL LENGTH MARK 11A33..11A38 ; Case_Ignorable # Mn [6] ZANABAZAR SQUARE FINAL CONSONANT MARK..ZANABAZAR SQUARE SIGN ANUSVARA 11A3B..11A3E ; Case_Ignorable # Mn [4] ZANABAZAR SQUARE CLUSTER-FINAL LETTER YA..ZANABAZAR SQUARE CLUSTER-FINAL LETTER VA 11A47 ; Case_Ignorable # Mn ZANABAZAR SQUARE SUBJOINER 11A51..11A56 ; Case_Ignorable # Mn [6] SOYOMBO VOWEL SIGN I..SOYOMBO VOWEL SIGN OE 11A59..11A5B ; Case_Ignorable # Mn [3] SOYOMBO VOWEL SIGN VOCALIC R..SOYOMBO VOWEL LENGTH MARK 11A8A..11A96 ; Case_Ignorable # Mn [13] SOYOMBO 
FINAL CONSONANT SIGN G..SOYOMBO SIGN ANUSVARA 11A98..11A99 ; Case_Ignorable # Mn [2] SOYOMBO GEMINATION MARK..SOYOMBO SUBJOINER 11B60 ; Case_Ignorable # Mn SHARADA VOWEL SIGN OE 11B62..11B64 ; Case_Ignorable # Mn [3] SHARADA VOWEL SIGN UE..SHARADA VOWEL SIGN SHORT E 11B66 ; Case_Ignorable # Mn SHARADA VOWEL SIGN CANDRA E 11C30..11C36 ; Case_Ignorable # Mn [7] BHAIKSUKI VOWEL SIGN I..BHAIKSUKI VOWEL SIGN VOCALIC L 11C38..11C3D ; Case_Ignorable # Mn [6] BHAIKSUKI VOWEL SIGN E..BHAIKSUKI SIGN ANUSVARA 11C3F ; Case_Ignorable # Mn BHAIKSUKI SIGN VIRAMA 11C92..11CA7 ; Case_Ignorable # Mn [22] MARCHEN SUBJOINED LETTER KA..MARCHEN SUBJOINED LETTER ZA 11CAA..11CB0 ; Case_Ignorable # Mn [7] MARCHEN SUBJOINED LETTER RA..MARCHEN VOWEL SIGN AA 11CB2..11CB3 ; Case_Ignorable # Mn [2] MARCHEN VOWEL SIGN U..MARCHEN VOWEL SIGN E 11CB5..11CB6 ; Case_Ignorable # Mn [2] MARCHEN SIGN ANUSVARA..MARCHEN SIGN CANDRABINDU 11D31..11D36 ; Case_Ignorable # Mn [6] MASARAM GONDI VOWEL SIGN AA..MASARAM GONDI VOWEL SIGN VOCALIC R 11D3A ; Case_Ignorable # Mn MASARAM GONDI VOWEL SIGN E 11D3C..11D3D ; Case_Ignorable # Mn [2] MASARAM GONDI VOWEL SIGN AI..MASARAM GONDI VOWEL SIGN O 11D3F..11D45 ; Case_Ignorable # Mn [7] MASARAM GONDI VOWEL SIGN AU..MASARAM GONDI VIRAMA 11D47 ; Case_Ignorable # Mn MASARAM GONDI RA-KARA 11D90..11D91 ; Case_Ignorable # Mn [2] GUNJALA GONDI VOWEL SIGN EE..GUNJALA GONDI VOWEL SIGN AI 11D95 ; Case_Ignorable # Mn GUNJALA GONDI SIGN ANUSVARA 11D97 ; Case_Ignorable # Mn GUNJALA GONDI VIRAMA 11DD9 ; Case_Ignorable # Lm TOLONG SIKI SIGN SELA 11EF3..11EF4 ; Case_Ignorable # Mn [2] MAKASAR VOWEL SIGN I..MAKASAR VOWEL SIGN U 11F00..11F01 ; Case_Ignorable # Mn [2] KAWI SIGN CANDRABINDU..KAWI SIGN ANUSVARA 11F36..11F3A ; Case_Ignorable # Mn [5] KAWI VOWEL SIGN I..KAWI VOWEL SIGN VOCALIC R 11F40 ; Case_Ignorable # Mn KAWI VOWEL SIGN EU 11F42 ; Case_Ignorable # Mn KAWI CONJOINER 11F5A ; Case_Ignorable # Mn KAWI SIGN NUKTA 13430..1343F ; Case_Ignorable # Cf [16] EGYPTIAN HIEROGLYPH 
VERTICAL JOINER..EGYPTIAN HIEROGLYPH END WALLED ENCLOSURE 13440 ; Case_Ignorable # Mn EGYPTIAN HIEROGLYPH MIRROR HORIZONTALLY 13447..13455 ; Case_Ignorable # Mn [15] EGYPTIAN HIEROGLYPH MODIFIER DAMAGED AT TOP START..EGYPTIAN HIEROGLYPH MODIFIER DAMAGED 1611E..16129 ; Case_Ignorable # Mn [12] GURUNG KHEMA VOWEL SIGN AA..GURUNG KHEMA VOWEL LENGTH MARK 1612D..1612F ; Case_Ignorable # Mn [3] GURUNG KHEMA SIGN ANUSVARA..GURUNG KHEMA SIGN THOLHOMA 16AF0..16AF4 ; Case_Ignorable # Mn [5] BASSA VAH COMBINING HIGH TONE..BASSA VAH COMBINING HIGH-LOW TONE 16B30..16B36 ; Case_Ignorable # Mn [7] PAHAWH HMONG MARK CIM TUB..PAHAWH HMONG MARK CIM TAUM 16B40..16B43 ; Case_Ignorable # Lm [4] PAHAWH HMONG SIGN VOS SEEV..PAHAWH HMONG SIGN IB YAM 16D40..16D42 ; Case_Ignorable # Lm [3] KIRAT RAI SIGN ANUSVARA..KIRAT RAI SIGN VISARGA 16D6B..16D6C ; Case_Ignorable # Lm [2] KIRAT RAI SIGN VIRAMA..KIRAT RAI SIGN SAAT 16F4F ; Case_Ignorable # Mn MIAO SIGN CONSONANT MODIFIER BAR 16F8F..16F92 ; Case_Ignorable # Mn [4] MIAO TONE RIGHT..MIAO TONE BELOW 16F93..16F9F ; Case_Ignorable # Lm [13] MIAO LETTER TONE-2..MIAO LETTER REFORMED TONE-8 16FE0..16FE1 ; Case_Ignorable # Lm [2] TANGUT ITERATION MARK..NUSHU ITERATION MARK 16FE3 ; Case_Ignorable # Lm OLD CHINESE ITERATION MARK 16FE4 ; Case_Ignorable # Mn KHITAN SMALL SCRIPT FILLER 16FF2..16FF3 ; Case_Ignorable # Lm [2] CHINESE SMALL SIMPLIFIED ER..CHINESE SMALL TRADITIONAL ER 1AFF0..1AFF3 ; Case_Ignorable # Lm [4] KATAKANA LETTER MINNAN TONE-2..KATAKANA LETTER MINNAN TONE-5 1AFF5..1AFFB ; Case_Ignorable # Lm [7] KATAKANA LETTER MINNAN TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-5 1AFFD..1AFFE ; Case_Ignorable # Lm [2] KATAKANA LETTER MINNAN NASALIZED TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-8 1BC9D..1BC9E ; Case_Ignorable # Mn [2] DUPLOYAN THICK LETTER SELECTOR..DUPLOYAN DOUBLE MARK 1BCA0..1BCA3 ; Case_Ignorable # Cf [4] SHORTHAND FORMAT LETTER OVERLAP..SHORTHAND FORMAT UP STEP 1CF00..1CF2D ; Case_Ignorable # Mn [46] ZNAMENNY COMBINING MARK 
GORAZDO NIZKO S KRYZHEM ON LEFT..ZNAMENNY COMBINING MARK KRYZH ON LEFT 1CF30..1CF46 ; Case_Ignorable # Mn [23] ZNAMENNY COMBINING TONAL RANGE MARK MRACHNO..ZNAMENNY PRIZNAK MODIFIER ROG 1D167..1D169 ; Case_Ignorable # Mn [3] MUSICAL SYMBOL COMBINING TREMOLO-1..MUSICAL SYMBOL COMBINING TREMOLO-3 1D173..1D17A ; Case_Ignorable # Cf [8] MUSICAL SYMBOL BEGIN BEAM..MUSICAL SYMBOL END PHRASE 1D17B..1D182 ; Case_Ignorable # Mn [8] MUSICAL SYMBOL COMBINING ACCENT..MUSICAL SYMBOL COMBINING LOURE 1D185..1D18B ; Case_Ignorable # Mn [7] MUSICAL SYMBOL COMBINING DOIT..MUSICAL SYMBOL COMBINING TRIPLE TONGUE 1D1AA..1D1AD ; Case_Ignorable # Mn [4] MUSICAL SYMBOL COMBINING DOWN BOW..MUSICAL SYMBOL COMBINING SNAP PIZZICATO 1D242..1D244 ; Case_Ignorable # Mn [3] COMBINING GREEK MUSICAL TRISEME..COMBINING GREEK MUSICAL PENTASEME 1DA00..1DA36 ; Case_Ignorable # Mn [55] SIGNWRITING HEAD RIM..SIGNWRITING AIR SUCKING IN 1DA3B..1DA6C ; Case_Ignorable # Mn [50] SIGNWRITING MOUTH CLOSED NEUTRAL..SIGNWRITING EXCITEMENT 1DA75 ; Case_Ignorable # Mn SIGNWRITING UPPER BODY TILTING FROM HIP JOINTS 1DA84 ; Case_Ignorable # Mn SIGNWRITING LOCATION HEAD NECK 1DA9B..1DA9F ; Case_Ignorable # Mn [5] SIGNWRITING FILL MODIFIER-2..SIGNWRITING FILL MODIFIER-6 1DAA1..1DAAF ; Case_Ignorable # Mn [15] SIGNWRITING ROTATION MODIFIER-2..SIGNWRITING ROTATION MODIFIER-16 1E000..1E006 ; Case_Ignorable # Mn [7] COMBINING GLAGOLITIC LETTER AZU..COMBINING GLAGOLITIC LETTER ZHIVETE 1E008..1E018 ; Case_Ignorable # Mn [17] COMBINING GLAGOLITIC LETTER ZEMLJA..COMBINING GLAGOLITIC LETTER HERU 1E01B..1E021 ; Case_Ignorable # Mn [7] COMBINING GLAGOLITIC LETTER SHTA..COMBINING GLAGOLITIC LETTER YATI 1E023..1E024 ; Case_Ignorable # Mn [2] COMBINING GLAGOLITIC LETTER YU..COMBINING GLAGOLITIC LETTER SMALL YUS 1E026..1E02A ; Case_Ignorable # Mn [5] COMBINING GLAGOLITIC LETTER YO..COMBINING GLAGOLITIC LETTER FITA 1E030..1E06D ; Case_Ignorable # Lm [62] MODIFIER LETTER CYRILLIC SMALL A..MODIFIER LETTER CYRILLIC SMALL STRAIGHT U WITH 
STROKE 1E08F ; Case_Ignorable # Mn COMBINING CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I 1E130..1E136 ; Case_Ignorable # Mn [7] NYIAKENG PUACHUE HMONG TONE-B..NYIAKENG PUACHUE HMONG TONE-D 1E137..1E13D ; Case_Ignorable # Lm [7] NYIAKENG PUACHUE HMONG SIGN FOR PERSON..NYIAKENG PUACHUE HMONG SYLLABLE LENGTHENER 1E2AE ; Case_Ignorable # Mn TOTO SIGN RISING TONE 1E2EC..1E2EF ; Case_Ignorable # Mn [4] WANCHO TONE TUP..WANCHO TONE KOINI 1E4EB ; Case_Ignorable # Lm NAG MUNDARI SIGN OJOD 1E4EC..1E4EF ; Case_Ignorable # Mn [4] NAG MUNDARI SIGN MUHOR..NAG MUNDARI SIGN SUTUH 1E5EE..1E5EF ; Case_Ignorable # Mn [2] OL ONAL SIGN MU..OL ONAL SIGN IKIR 1E6E3 ; Case_Ignorable # Mn TAI YO SIGN UE 1E6E6 ; Case_Ignorable # Mn TAI YO SIGN AU 1E6EE..1E6EF ; Case_Ignorable # Mn [2] TAI YO SIGN AY..TAI YO SIGN ANG 1E6F5 ; Case_Ignorable # Mn TAI YO SIGN OM 1E6FF ; Case_Ignorable # Lm TAI YO XAM LAI 1E8D0..1E8D6 ; Case_Ignorable # Mn [7] MENDE KIKAKUI COMBINING NUMBER TEENS..MENDE KIKAKUI COMBINING NUMBER MILLIONS 1E944..1E94A ; Case_Ignorable # Mn [7] ADLAM ALIF LENGTHENER..ADLAM NUKTA 1E94B ; Case_Ignorable # Lm ADLAM NASALIZATION MARK 1F3FB..1F3FF ; Case_Ignorable # Sk [5] EMOJI MODIFIER FITZPATRICK TYPE-1-2..EMOJI MODIFIER FITZPATRICK TYPE-6 E0001 ; Case_Ignorable # Cf LANGUAGE TAG E0020..E007F ; Case_Ignorable # Cf [96] TAG SPACE..CANCEL TAG E0100..E01EF ; Case_Ignorable # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256 # Total code points: 2794 # ================================================ # Derived Property: Changes_When_Lowercased (CWL) # Characters whose normalized forms are not stable under a toLowercase mapping. # For more information, see the definition of "isLowercase(X)" # in the "Conformance" / "Default Case Algorithms" section of the core specification. 
# Changes_When_Lowercased(X) is true when toLowercase(toNFD(X)) != toNFD(X) 0041..005A ; Changes_When_Lowercased # L& [26] LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z 00C0..00D6 ; Changes_When_Lowercased # L& [23] LATIN CAPITAL LETTER A WITH GRAVE..LATIN CAPITAL LETTER O WITH DIAERESIS 00D8..00DE ; Changes_When_Lowercased # L& [7] LATIN CAPITAL LETTER O WITH STROKE..LATIN CAPITAL LETTER THORN 0100 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER A WITH MACRON 0102 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER A WITH BREVE 0104 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER A WITH OGONEK 0106 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER C WITH ACUTE 0108 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER C WITH CIRCUMFLEX 010A ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER C WITH DOT ABOVE 010C ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER C WITH CARON 010E ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER D WITH CARON 0110 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER D WITH STROKE 0112 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER E WITH MACRON 0114 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER E WITH BREVE 0116 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER E WITH DOT ABOVE 0118 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER E WITH OGONEK 011A ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER E WITH CARON 011C ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER G WITH CIRCUMFLEX 011E ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER G WITH BREVE 0120 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER G WITH DOT ABOVE 0122 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER G WITH CEDILLA 0124 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER H WITH CIRCUMFLEX 0126 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER H WITH STROKE 0128 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER I WITH TILDE 012A ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER I WITH MACRON 012C ; 
Changes_When_Lowercased # L& LATIN CAPITAL LETTER I WITH BREVE 012E ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER I WITH OGONEK 0130 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER I WITH DOT ABOVE 0132 ; Changes_When_Lowercased # L& LATIN CAPITAL LIGATURE IJ 0134 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER J WITH CIRCUMFLEX 0136 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER K WITH CEDILLA 0139 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER L WITH ACUTE 013B ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER L WITH CEDILLA 013D ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER L WITH CARON 013F ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER L WITH MIDDLE DOT 0141 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER L WITH STROKE 0143 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER N WITH ACUTE 0145 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER N WITH CEDILLA 0147 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER N WITH CARON 014A ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER ENG 014C ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER O WITH MACRON 014E ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER O WITH BREVE 0150 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER O WITH DOUBLE ACUTE 0152 ; Changes_When_Lowercased # L& LATIN CAPITAL LIGATURE OE 0154 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER R WITH ACUTE 0156 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER R WITH CEDILLA 0158 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER R WITH CARON 015A ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER S WITH ACUTE 015C ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER S WITH CIRCUMFLEX 015E ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER S WITH CEDILLA 0160 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER S WITH CARON 0162 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER T WITH CEDILLA 0164 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER T WITH CARON 0166 ; Changes_When_Lowercased 
# L& LATIN CAPITAL LETTER T WITH STROKE 0168 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER U WITH TILDE 016A ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER U WITH MACRON 016C ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER U WITH BREVE 016E ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER U WITH RING ABOVE 0170 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER U WITH DOUBLE ACUTE 0172 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER U WITH OGONEK 0174 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER W WITH CIRCUMFLEX 0176 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER Y WITH CIRCUMFLEX 0178..0179 ; Changes_When_Lowercased # L& [2] LATIN CAPITAL LETTER Y WITH DIAERESIS..LATIN CAPITAL LETTER Z WITH ACUTE 017B ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER Z WITH DOT ABOVE 017D ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER Z WITH CARON 0181..0182 ; Changes_When_Lowercased # L& [2] LATIN CAPITAL LETTER B WITH HOOK..LATIN CAPITAL LETTER B WITH TOPBAR 0184 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER TONE SIX 0186..0187 ; Changes_When_Lowercased # L& [2] LATIN CAPITAL LETTER OPEN O..LATIN CAPITAL LETTER C WITH HOOK 0189..018B ; Changes_When_Lowercased # L& [3] LATIN CAPITAL LETTER AFRICAN D..LATIN CAPITAL LETTER D WITH TOPBAR 018E..0191 ; Changes_When_Lowercased # L& [4] LATIN CAPITAL LETTER REVERSED E..LATIN CAPITAL LETTER F WITH HOOK 0193..0194 ; Changes_When_Lowercased # L& [2] LATIN CAPITAL LETTER G WITH HOOK..LATIN CAPITAL LETTER GAMMA 0196..0198 ; Changes_When_Lowercased # L& [3] LATIN CAPITAL LETTER IOTA..LATIN CAPITAL LETTER K WITH HOOK 019C..019D ; Changes_When_Lowercased # L& [2] LATIN CAPITAL LETTER TURNED M..LATIN CAPITAL LETTER N WITH LEFT HOOK 019F..01A0 ; Changes_When_Lowercased # L& [2] LATIN CAPITAL LETTER O WITH MIDDLE TILDE..LATIN CAPITAL LETTER O WITH HORN 01A2 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER OI 01A4 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER P WITH HOOK 01A6..01A7 ; 
Changes_When_Lowercased # L& [2] LATIN LETTER YR..LATIN CAPITAL LETTER TONE TWO 01A9 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER ESH 01AC ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER T WITH HOOK 01AE..01AF ; Changes_When_Lowercased # L& [2] LATIN CAPITAL LETTER T WITH RETROFLEX HOOK..LATIN CAPITAL LETTER U WITH HORN 01B1..01B3 ; Changes_When_Lowercased # L& [3] LATIN CAPITAL LETTER UPSILON..LATIN CAPITAL LETTER Y WITH HOOK 01B5 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER Z WITH STROKE 01B7..01B8 ; Changes_When_Lowercased # L& [2] LATIN CAPITAL LETTER EZH..LATIN CAPITAL LETTER EZH REVERSED 01BC ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER TONE FIVE 01C4..01C5 ; Changes_When_Lowercased # L& [2] LATIN CAPITAL LETTER DZ WITH CARON..LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON 01C7..01C8 ; Changes_When_Lowercased # L& [2] LATIN CAPITAL LETTER LJ..LATIN CAPITAL LETTER L WITH SMALL LETTER J 01CA..01CB ; Changes_When_Lowercased # L& [2] LATIN CAPITAL LETTER NJ..LATIN CAPITAL LETTER N WITH SMALL LETTER J 01CD ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER A WITH CARON 01CF ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER I WITH CARON 01D1 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER O WITH CARON 01D3 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER U WITH CARON 01D5 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON 01D7 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER U WITH DIAERESIS AND ACUTE 01D9 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER U WITH DIAERESIS AND CARON 01DB ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER U WITH DIAERESIS AND GRAVE 01DE ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER A WITH DIAERESIS AND MACRON 01E0 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER A WITH DOT ABOVE AND MACRON 01E2 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER AE WITH MACRON 01E4 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER G WITH STROKE 01E6 ; 
Changes_When_Lowercased # L& LATIN CAPITAL LETTER G WITH CARON 01E8 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER K WITH CARON 01EA ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER O WITH OGONEK 01EC ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER O WITH OGONEK AND MACRON 01EE ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER EZH WITH CARON 01F1..01F2 ; Changes_When_Lowercased # L& [2] LATIN CAPITAL LETTER DZ..LATIN CAPITAL LETTER D WITH SMALL LETTER Z 01F4 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER G WITH ACUTE 01F6..01F8 ; Changes_When_Lowercased # L& [3] LATIN CAPITAL LETTER HWAIR..LATIN CAPITAL LETTER N WITH GRAVE 01FA ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER A WITH RING ABOVE AND ACUTE 01FC ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER AE WITH ACUTE 01FE ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER O WITH STROKE AND ACUTE 0200 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER A WITH DOUBLE GRAVE 0202 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER A WITH INVERTED BREVE 0204 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER E WITH DOUBLE GRAVE 0206 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER E WITH INVERTED BREVE 0208 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER I WITH DOUBLE GRAVE 020A ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER I WITH INVERTED BREVE 020C ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER O WITH DOUBLE GRAVE 020E ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER O WITH INVERTED BREVE 0210 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER R WITH DOUBLE GRAVE 0212 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER R WITH INVERTED BREVE 0214 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER U WITH DOUBLE GRAVE 0216 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER U WITH INVERTED BREVE 0218 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER S WITH COMMA BELOW 021A ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER T WITH COMMA BELOW 021C ; 
Changes_When_Lowercased # L& LATIN CAPITAL LETTER YOGH 021E ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER H WITH CARON 0220 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER N WITH LONG RIGHT LEG 0222 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER OU 0224 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER Z WITH HOOK 0226 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER A WITH DOT ABOVE 0228 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER E WITH CEDILLA 022A ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER O WITH DIAERESIS AND MACRON 022C ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER O WITH TILDE AND MACRON 022E ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER O WITH DOT ABOVE 0230 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER O WITH DOT ABOVE AND MACRON 0232 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER Y WITH MACRON 023A..023B ; Changes_When_Lowercased # L& [2] LATIN CAPITAL LETTER A WITH STROKE..LATIN CAPITAL LETTER C WITH STROKE 023D..023E ; Changes_When_Lowercased # L& [2] LATIN CAPITAL LETTER L WITH BAR..LATIN CAPITAL LETTER T WITH DIAGONAL STROKE 0241 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER GLOTTAL STOP 0243..0246 ; Changes_When_Lowercased # L& [4] LATIN CAPITAL LETTER B WITH STROKE..LATIN CAPITAL LETTER E WITH STROKE 0248 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER J WITH STROKE 024A ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER SMALL Q WITH HOOK TAIL 024C ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER R WITH STROKE 024E ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER Y WITH STROKE 0370 ; Changes_When_Lowercased # L& GREEK CAPITAL LETTER HETA 0372 ; Changes_When_Lowercased # L& GREEK CAPITAL LETTER ARCHAIC SAMPI 0376 ; Changes_When_Lowercased # L& GREEK CAPITAL LETTER PAMPHYLIAN DIGAMMA 037F ; Changes_When_Lowercased # L& GREEK CAPITAL LETTER YOT 0386 ; Changes_When_Lowercased # L& GREEK CAPITAL LETTER ALPHA WITH TONOS 0388..038A ; Changes_When_Lowercased # L& [3] GREEK CAPITAL 
LETTER EPSILON WITH TONOS..GREEK CAPITAL LETTER IOTA WITH TONOS 038C ; Changes_When_Lowercased # L& GREEK CAPITAL LETTER OMICRON WITH TONOS 038E..038F ; Changes_When_Lowercased # L& [2] GREEK CAPITAL LETTER UPSILON WITH TONOS..GREEK CAPITAL LETTER OMEGA WITH TONOS 0391..03A1 ; Changes_When_Lowercased # L& [17] GREEK CAPITAL LETTER ALPHA..GREEK CAPITAL LETTER RHO 03A3..03AB ; Changes_When_Lowercased # L& [9] GREEK CAPITAL LETTER SIGMA..GREEK CAPITAL LETTER UPSILON WITH DIALYTIKA 03CF ; Changes_When_Lowercased # L& GREEK CAPITAL KAI SYMBOL 03D8 ; Changes_When_Lowercased # L& GREEK LETTER ARCHAIC KOPPA 03DA ; Changes_When_Lowercased # L& GREEK LETTER STIGMA 03DC ; Changes_When_Lowercased # L& GREEK LETTER DIGAMMA 03DE ; Changes_When_Lowercased # L& GREEK LETTER KOPPA 03E0 ; Changes_When_Lowercased # L& GREEK LETTER SAMPI 03E2 ; Changes_When_Lowercased # L& COPTIC CAPITAL LETTER SHEI 03E4 ; Changes_When_Lowercased # L& COPTIC CAPITAL LETTER FEI 03E6 ; Changes_When_Lowercased # L& COPTIC CAPITAL LETTER KHEI 03E8 ; Changes_When_Lowercased # L& COPTIC CAPITAL LETTER HORI 03EA ; Changes_When_Lowercased # L& COPTIC CAPITAL LETTER GANGIA 03EC ; Changes_When_Lowercased # L& COPTIC CAPITAL LETTER SHIMA 03EE ; Changes_When_Lowercased # L& COPTIC CAPITAL LETTER DEI 03F4 ; Changes_When_Lowercased # L& GREEK CAPITAL THETA SYMBOL 03F7 ; Changes_When_Lowercased # L& GREEK CAPITAL LETTER SHO 03F9..03FA ; Changes_When_Lowercased # L& [2] GREEK CAPITAL LUNATE SIGMA SYMBOL..GREEK CAPITAL LETTER SAN 03FD..042F ; Changes_When_Lowercased # L& [51] GREEK CAPITAL REVERSED LUNATE SIGMA SYMBOL..CYRILLIC CAPITAL LETTER YA 0460 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER OMEGA 0462 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER YAT 0464 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER IOTIFIED E 0466 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER LITTLE YUS 0468 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER IOTIFIED LITTLE YUS 046A ; 
Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER BIG YUS 046C ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER IOTIFIED BIG YUS 046E ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER KSI 0470 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER PSI 0472 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER FITA 0474 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER IZHITSA 0476 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER IZHITSA WITH DOUBLE GRAVE ACCENT 0478 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER UK 047A ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER ROUND OMEGA 047C ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER OMEGA WITH TITLO 047E ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER OT 0480 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER KOPPA 048A ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER SHORT I WITH TAIL 048C ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER SEMISOFT SIGN 048E ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER ER WITH TICK 0490 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER GHE WITH UPTURN 0492 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER GHE WITH STROKE 0494 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER GHE WITH MIDDLE HOOK 0496 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER ZHE WITH DESCENDER 0498 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER ZE WITH DESCENDER 049A ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER KA WITH DESCENDER 049C ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER KA WITH VERTICAL STROKE 049E ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER KA WITH STROKE 04A0 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER BASHKIR KA 04A2 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER EN WITH DESCENDER 04A4 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LIGATURE EN GHE 04A6 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER PE WITH MIDDLE HOOK 04A8 ; 
Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER ABKHASIAN HA 04AA ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER ES WITH DESCENDER 04AC ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER TE WITH DESCENDER 04AE ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER STRAIGHT U 04B0 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER STRAIGHT U WITH STROKE 04B2 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER HA WITH DESCENDER 04B4 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LIGATURE TE TSE 04B6 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER CHE WITH DESCENDER 04B8 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER CHE WITH VERTICAL STROKE 04BA ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER SHHA 04BC ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER ABKHASIAN CHE 04BE ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER ABKHASIAN CHE WITH DESCENDER 04C0..04C1 ; Changes_When_Lowercased # L& [2] CYRILLIC LETTER PALOCHKA..CYRILLIC CAPITAL LETTER ZHE WITH BREVE 04C3 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER KA WITH HOOK 04C5 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER EL WITH TAIL 04C7 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER EN WITH HOOK 04C9 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER EN WITH TAIL 04CB ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER KHAKASSIAN CHE 04CD ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER EM WITH TAIL 04D0 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER A WITH BREVE 04D2 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER A WITH DIAERESIS 04D4 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LIGATURE A IE 04D6 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER IE WITH BREVE 04D8 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER SCHWA 04DA ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER SCHWA WITH DIAERESIS 04DC ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER ZHE WITH DIAERESIS 04DE ; 
Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER ZE WITH DIAERESIS 04E0 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER ABKHASIAN DZE 04E2 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER I WITH MACRON 04E4 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER I WITH DIAERESIS 04E6 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER O WITH DIAERESIS 04E8 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER BARRED O 04EA ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER BARRED O WITH DIAERESIS 04EC ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER E WITH DIAERESIS 04EE ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER U WITH MACRON 04F0 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER U WITH DIAERESIS 04F2 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER U WITH DOUBLE ACUTE 04F4 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER CHE WITH DIAERESIS 04F6 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER GHE WITH DESCENDER 04F8 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER YERU WITH DIAERESIS 04FA ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER GHE WITH STROKE AND HOOK 04FC ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER HA WITH HOOK 04FE ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER HA WITH STROKE 0500 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER KOMI DE 0502 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER KOMI DJE 0504 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER KOMI ZJE 0506 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER KOMI DZJE 0508 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER KOMI LJE 050A ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER KOMI NJE 050C ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER KOMI SJE 050E ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER KOMI TJE 0510 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER REVERSED ZE 0512 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER EL WITH 
HOOK 0514 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER LHA 0516 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER RHA 0518 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER YAE 051A ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER QA 051C ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER WE 051E ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER ALEUT KA 0520 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER EL WITH MIDDLE HOOK 0522 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER EN WITH MIDDLE HOOK 0524 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER PE WITH DESCENDER 0526 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER SHHA WITH DESCENDER 0528 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER EN WITH LEFT HOOK 052A ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER DZZHE 052C ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER DCHE 052E ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER EL WITH DESCENDER 0531..0556 ; Changes_When_Lowercased # L& [38] ARMENIAN CAPITAL LETTER AYB..ARMENIAN CAPITAL LETTER FEH 10A0..10C5 ; Changes_When_Lowercased # L& [38] GEORGIAN CAPITAL LETTER AN..GEORGIAN CAPITAL LETTER HOE 10C7 ; Changes_When_Lowercased # L& GEORGIAN CAPITAL LETTER YN 10CD ; Changes_When_Lowercased # L& GEORGIAN CAPITAL LETTER AEN 13A0..13F5 ; Changes_When_Lowercased # L& [86] CHEROKEE LETTER A..CHEROKEE LETTER MV 1C89 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER TJE 1C90..1CBA ; Changes_When_Lowercased # L& [43] GEORGIAN MTAVRULI CAPITAL LETTER AN..GEORGIAN MTAVRULI CAPITAL LETTER AIN 1CBD..1CBF ; Changes_When_Lowercased # L& [3] GEORGIAN MTAVRULI CAPITAL LETTER AEN..GEORGIAN MTAVRULI CAPITAL LETTER LABIAL SIGN 1E00 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER A WITH RING BELOW 1E02 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER B WITH DOT ABOVE 1E04 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER B WITH DOT BELOW 1E06 ; Changes_When_Lowercased # L& LATIN 
CAPITAL LETTER B WITH LINE BELOW 1E08 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER C WITH CEDILLA AND ACUTE 1E0A ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER D WITH DOT ABOVE 1E0C ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER D WITH DOT BELOW 1E0E ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER D WITH LINE BELOW 1E10 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER D WITH CEDILLA 1E12 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER D WITH CIRCUMFLEX BELOW 1E14 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER E WITH MACRON AND GRAVE 1E16 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER E WITH MACRON AND ACUTE 1E18 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER E WITH CIRCUMFLEX BELOW 1E1A ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER E WITH TILDE BELOW 1E1C ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER E WITH CEDILLA AND BREVE 1E1E ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER F WITH DOT ABOVE 1E20 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER G WITH MACRON 1E22 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER H WITH DOT ABOVE 1E24 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER H WITH DOT BELOW 1E26 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER H WITH DIAERESIS 1E28 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER H WITH CEDILLA 1E2A ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER H WITH BREVE BELOW 1E2C ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER I WITH TILDE BELOW 1E2E ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER I WITH DIAERESIS AND ACUTE 1E30 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER K WITH ACUTE 1E32 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER K WITH DOT BELOW 1E34 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER K WITH LINE BELOW 1E36 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER L WITH DOT BELOW 1E38 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER L WITH DOT BELOW AND MACRON 1E3A ; Changes_When_Lowercased # L& LATIN CAPITAL 
LETTER L WITH LINE BELOW 1E3C ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER L WITH CIRCUMFLEX BELOW 1E3E ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER M WITH ACUTE 1E40 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER M WITH DOT ABOVE 1E42 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER M WITH DOT BELOW 1E44 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER N WITH DOT ABOVE 1E46 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER N WITH DOT BELOW 1E48 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER N WITH LINE BELOW 1E4A ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER N WITH CIRCUMFLEX BELOW 1E4C ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER O WITH TILDE AND ACUTE 1E4E ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER O WITH TILDE AND DIAERESIS 1E50 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER O WITH MACRON AND GRAVE 1E52 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER O WITH MACRON AND ACUTE 1E54 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER P WITH ACUTE 1E56 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER P WITH DOT ABOVE 1E58 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER R WITH DOT ABOVE 1E5A ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER R WITH DOT BELOW 1E5C ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER R WITH DOT BELOW AND MACRON 1E5E ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER R WITH LINE BELOW 1E60 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER S WITH DOT ABOVE 1E62 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER S WITH DOT BELOW 1E64 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER S WITH ACUTE AND DOT ABOVE 1E66 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER S WITH CARON AND DOT ABOVE 1E68 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER S WITH DOT BELOW AND DOT ABOVE 1E6A ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER T WITH DOT ABOVE 1E6C ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER T WITH DOT BELOW 1E6E ; Changes_When_Lowercased # L& LATIN 
CAPITAL LETTER T WITH LINE BELOW 1E70 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER T WITH CIRCUMFLEX BELOW 1E72 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER U WITH DIAERESIS BELOW 1E74 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER U WITH TILDE BELOW 1E76 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER U WITH CIRCUMFLEX BELOW 1E78 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER U WITH TILDE AND ACUTE 1E7A ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER U WITH MACRON AND DIAERESIS 1E7C ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER V WITH TILDE 1E7E ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER V WITH DOT BELOW 1E80 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER W WITH GRAVE 1E82 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER W WITH ACUTE 1E84 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER W WITH DIAERESIS 1E86 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER W WITH DOT ABOVE 1E88 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER W WITH DOT BELOW 1E8A ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER X WITH DOT ABOVE 1E8C ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER X WITH DIAERESIS 1E8E ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER Y WITH DOT ABOVE 1E90 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER Z WITH CIRCUMFLEX 1E92 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER Z WITH DOT BELOW 1E94 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER Z WITH LINE BELOW 1E9E ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER SHARP S 1EA0 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER A WITH DOT BELOW 1EA2 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER A WITH HOOK ABOVE 1EA4 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND ACUTE 1EA6 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND GRAVE 1EA8 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND HOOK ABOVE 1EAA ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER 
A WITH CIRCUMFLEX AND TILDE 1EAC ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND DOT BELOW 1EAE ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER A WITH BREVE AND ACUTE 1EB0 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER A WITH BREVE AND GRAVE 1EB2 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER A WITH BREVE AND HOOK ABOVE 1EB4 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER A WITH BREVE AND TILDE 1EB6 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER A WITH BREVE AND DOT BELOW 1EB8 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER E WITH DOT BELOW 1EBA ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER E WITH HOOK ABOVE 1EBC ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER E WITH TILDE 1EBE ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND ACUTE 1EC0 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND GRAVE 1EC2 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND HOOK ABOVE 1EC4 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND TILDE 1EC6 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND DOT BELOW 1EC8 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER I WITH HOOK ABOVE 1ECA ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER I WITH DOT BELOW 1ECC ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER O WITH DOT BELOW 1ECE ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER O WITH HOOK ABOVE 1ED0 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND ACUTE 1ED2 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND GRAVE 1ED4 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND HOOK ABOVE 1ED6 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND TILDE 1ED8 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND DOT BELOW 1EDA ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER O WITH HORN AND ACUTE 1EDC ; 
Changes_When_Lowercased # L& LATIN CAPITAL LETTER O WITH HORN AND GRAVE 1EDE ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER O WITH HORN AND HOOK ABOVE 1EE0 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER O WITH HORN AND TILDE 1EE2 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER O WITH HORN AND DOT BELOW 1EE4 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER U WITH DOT BELOW 1EE6 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER U WITH HOOK ABOVE 1EE8 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER U WITH HORN AND ACUTE 1EEA ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER U WITH HORN AND GRAVE 1EEC ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER U WITH HORN AND HOOK ABOVE 1EEE ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER U WITH HORN AND TILDE 1EF0 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER U WITH HORN AND DOT BELOW 1EF2 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER Y WITH GRAVE 1EF4 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER Y WITH DOT BELOW 1EF6 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER Y WITH HOOK ABOVE 1EF8 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER Y WITH TILDE 1EFA ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER MIDDLE-WELSH LL 1EFC ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER MIDDLE-WELSH V 1EFE ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER Y WITH LOOP 1F08..1F0F ; Changes_When_Lowercased # L& [8] GREEK CAPITAL LETTER ALPHA WITH PSILI..GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI 1F18..1F1D ; Changes_When_Lowercased # L& [6] GREEK CAPITAL LETTER EPSILON WITH PSILI..GREEK CAPITAL LETTER EPSILON WITH DASIA AND OXIA 1F28..1F2F ; Changes_When_Lowercased # L& [8] GREEK CAPITAL LETTER ETA WITH PSILI..GREEK CAPITAL LETTER ETA WITH DASIA AND PERISPOMENI 1F38..1F3F ; Changes_When_Lowercased # L& [8] GREEK CAPITAL LETTER IOTA WITH PSILI..GREEK CAPITAL LETTER IOTA WITH DASIA AND PERISPOMENI 1F48..1F4D ; Changes_When_Lowercased # L& [6] GREEK CAPITAL LETTER OMICRON 
WITH PSILI..GREEK CAPITAL LETTER OMICRON WITH DASIA AND OXIA 1F59 ; Changes_When_Lowercased # L& GREEK CAPITAL LETTER UPSILON WITH DASIA 1F5B ; Changes_When_Lowercased # L& GREEK CAPITAL LETTER UPSILON WITH DASIA AND VARIA 1F5D ; Changes_When_Lowercased # L& GREEK CAPITAL LETTER UPSILON WITH DASIA AND OXIA 1F5F ; Changes_When_Lowercased # L& GREEK CAPITAL LETTER UPSILON WITH DASIA AND PERISPOMENI 1F68..1F6F ; Changes_When_Lowercased # L& [8] GREEK CAPITAL LETTER OMEGA WITH PSILI..GREEK CAPITAL LETTER OMEGA WITH DASIA AND PERISPOMENI 1F88..1F8F ; Changes_When_Lowercased # L& [8] GREEK CAPITAL LETTER ALPHA WITH PSILI AND PROSGEGRAMMENI..GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI 1F98..1F9F ; Changes_When_Lowercased # L& [8] GREEK CAPITAL LETTER ETA WITH PSILI AND PROSGEGRAMMENI..GREEK CAPITAL LETTER ETA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI 1FA8..1FAF ; Changes_When_Lowercased # L& [8] GREEK CAPITAL LETTER OMEGA WITH PSILI AND PROSGEGRAMMENI..GREEK CAPITAL LETTER OMEGA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI 1FB8..1FBC ; Changes_When_Lowercased # L& [5] GREEK CAPITAL LETTER ALPHA WITH VRACHY..GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI 1FC8..1FCC ; Changes_When_Lowercased # L& [5] GREEK CAPITAL LETTER EPSILON WITH VARIA..GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI 1FD8..1FDB ; Changes_When_Lowercased # L& [4] GREEK CAPITAL LETTER IOTA WITH VRACHY..GREEK CAPITAL LETTER IOTA WITH OXIA 1FE8..1FEC ; Changes_When_Lowercased # L& [5] GREEK CAPITAL LETTER UPSILON WITH VRACHY..GREEK CAPITAL LETTER RHO WITH DASIA 1FF8..1FFC ; Changes_When_Lowercased # L& [5] GREEK CAPITAL LETTER OMICRON WITH VARIA..GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI 2126 ; Changes_When_Lowercased # L& OHM SIGN 212A..212B ; Changes_When_Lowercased # L& [2] KELVIN SIGN..ANGSTROM SIGN 2132 ; Changes_When_Lowercased # L& TURNED CAPITAL F 2160..216F ; Changes_When_Lowercased # Nl [16] ROMAN NUMERAL ONE..ROMAN NUMERAL ONE THOUSAND 2183 ; 
Changes_When_Lowercased # L& ROMAN NUMERAL REVERSED ONE HUNDRED 24B6..24CF ; Changes_When_Lowercased # So [26] CIRCLED LATIN CAPITAL LETTER A..CIRCLED LATIN CAPITAL LETTER Z 2C00..2C2F ; Changes_When_Lowercased # L& [48] GLAGOLITIC CAPITAL LETTER AZU..GLAGOLITIC CAPITAL LETTER CAUDATE CHRIVI 2C60 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER L WITH DOUBLE BAR 2C62..2C64 ; Changes_When_Lowercased # L& [3] LATIN CAPITAL LETTER L WITH MIDDLE TILDE..LATIN CAPITAL LETTER R WITH TAIL 2C67 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER H WITH DESCENDER 2C69 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER K WITH DESCENDER 2C6B ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER Z WITH DESCENDER 2C6D..2C70 ; Changes_When_Lowercased # L& [4] LATIN CAPITAL LETTER ALPHA..LATIN CAPITAL LETTER TURNED ALPHA 2C72 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER W WITH HOOK 2C75 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER HALF H 2C7E..2C80 ; Changes_When_Lowercased # L& [3] LATIN CAPITAL LETTER S WITH SWASH TAIL..COPTIC CAPITAL LETTER ALFA 2C82 ; Changes_When_Lowercased # L& COPTIC CAPITAL LETTER VIDA 2C84 ; Changes_When_Lowercased # L& COPTIC CAPITAL LETTER GAMMA 2C86 ; Changes_When_Lowercased # L& COPTIC CAPITAL LETTER DALDA 2C88 ; Changes_When_Lowercased # L& COPTIC CAPITAL LETTER EIE 2C8A ; Changes_When_Lowercased # L& COPTIC CAPITAL LETTER SOU 2C8C ; Changes_When_Lowercased # L& COPTIC CAPITAL LETTER ZATA 2C8E ; Changes_When_Lowercased # L& COPTIC CAPITAL LETTER HATE 2C90 ; Changes_When_Lowercased # L& COPTIC CAPITAL LETTER THETHE 2C92 ; Changes_When_Lowercased # L& COPTIC CAPITAL LETTER IAUDA 2C94 ; Changes_When_Lowercased # L& COPTIC CAPITAL LETTER KAPA 2C96 ; Changes_When_Lowercased # L& COPTIC CAPITAL LETTER LAULA 2C98 ; Changes_When_Lowercased # L& COPTIC CAPITAL LETTER MI 2C9A ; Changes_When_Lowercased # L& COPTIC CAPITAL LETTER NI 2C9C ; Changes_When_Lowercased # L& COPTIC CAPITAL LETTER KSI 2C9E ; Changes_When_Lowercased # L& COPTIC CAPITAL 
LETTER O 2CA0 ; Changes_When_Lowercased # L& COPTIC CAPITAL LETTER PI 2CA2 ; Changes_When_Lowercased # L& COPTIC CAPITAL LETTER RO 2CA4 ; Changes_When_Lowercased # L& COPTIC CAPITAL LETTER SIMA 2CA6 ; Changes_When_Lowercased # L& COPTIC CAPITAL LETTER TAU 2CA8 ; Changes_When_Lowercased # L& COPTIC CAPITAL LETTER UA 2CAA ; Changes_When_Lowercased # L& COPTIC CAPITAL LETTER FI 2CAC ; Changes_When_Lowercased # L& COPTIC CAPITAL LETTER KHI 2CAE ; Changes_When_Lowercased # L& COPTIC CAPITAL LETTER PSI 2CB0 ; Changes_When_Lowercased # L& COPTIC CAPITAL LETTER OOU 2CB2 ; Changes_When_Lowercased # L& COPTIC CAPITAL LETTER DIALECT-P ALEF 2CB4 ; Changes_When_Lowercased # L& COPTIC CAPITAL LETTER OLD COPTIC AIN 2CB6 ; Changes_When_Lowercased # L& COPTIC CAPITAL LETTER CRYPTOGRAMMIC EIE 2CB8 ; Changes_When_Lowercased # L& COPTIC CAPITAL LETTER DIALECT-P KAPA 2CBA ; Changes_When_Lowercased # L& COPTIC CAPITAL LETTER DIALECT-P NI 2CBC ; Changes_When_Lowercased # L& COPTIC CAPITAL LETTER CRYPTOGRAMMIC NI 2CBE ; Changes_When_Lowercased # L& COPTIC CAPITAL LETTER OLD COPTIC OOU 2CC0 ; Changes_When_Lowercased # L& COPTIC CAPITAL LETTER SAMPI 2CC2 ; Changes_When_Lowercased # L& COPTIC CAPITAL LETTER CROSSED SHEI 2CC4 ; Changes_When_Lowercased # L& COPTIC CAPITAL LETTER OLD COPTIC SHEI 2CC6 ; Changes_When_Lowercased # L& COPTIC CAPITAL LETTER OLD COPTIC ESH 2CC8 ; Changes_When_Lowercased # L& COPTIC CAPITAL LETTER AKHMIMIC KHEI 2CCA ; Changes_When_Lowercased # L& COPTIC CAPITAL LETTER DIALECT-P HORI 2CCC ; Changes_When_Lowercased # L& COPTIC CAPITAL LETTER OLD COPTIC HORI 2CCE ; Changes_When_Lowercased # L& COPTIC CAPITAL LETTER OLD COPTIC HA 2CD0 ; Changes_When_Lowercased # L& COPTIC CAPITAL LETTER L-SHAPED HA 2CD2 ; Changes_When_Lowercased # L& COPTIC CAPITAL LETTER OLD COPTIC HEI 2CD4 ; Changes_When_Lowercased # L& COPTIC CAPITAL LETTER OLD COPTIC HAT 2CD6 ; Changes_When_Lowercased # L& COPTIC CAPITAL LETTER OLD COPTIC GANGIA 2CD8 ; Changes_When_Lowercased # L& COPTIC CAPITAL 
LETTER OLD COPTIC DJA 2CDA ; Changes_When_Lowercased # L& COPTIC CAPITAL LETTER OLD COPTIC SHIMA 2CDC ; Changes_When_Lowercased # L& COPTIC CAPITAL LETTER OLD NUBIAN SHIMA 2CDE ; Changes_When_Lowercased # L& COPTIC CAPITAL LETTER OLD NUBIAN NGI 2CE0 ; Changes_When_Lowercased # L& COPTIC CAPITAL LETTER OLD NUBIAN NYI 2CE2 ; Changes_When_Lowercased # L& COPTIC CAPITAL LETTER OLD NUBIAN WAU 2CEB ; Changes_When_Lowercased # L& COPTIC CAPITAL LETTER CRYPTOGRAMMIC SHEI 2CED ; Changes_When_Lowercased # L& COPTIC CAPITAL LETTER CRYPTOGRAMMIC GANGIA 2CF2 ; Changes_When_Lowercased # L& COPTIC CAPITAL LETTER BOHAIRIC KHEI A640 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER ZEMLYA A642 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER DZELO A644 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER REVERSED DZE A646 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER IOTA A648 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER DJERV A64A ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER MONOGRAPH UK A64C ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER BROAD OMEGA A64E ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER NEUTRAL YER A650 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER YERU WITH BACK YER A652 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER IOTIFIED YAT A654 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER REVERSED YU A656 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER IOTIFIED A A658 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER CLOSED LITTLE YUS A65A ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER BLENDED YUS A65C ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER IOTIFIED CLOSED LITTLE YUS A65E ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER YN A660 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER REVERSED TSE A662 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER SOFT DE A664 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER SOFT EL A666 ; 
Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER SOFT EM A668 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER MONOCULAR O A66A ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER BINOCULAR O A66C ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER DOUBLE MONOCULAR O A680 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER DWE A682 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER DZWE A684 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER ZHWE A686 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER CCHE A688 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER DZZE A68A ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER TE WITH MIDDLE HOOK A68C ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER TWE A68E ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER TSWE A690 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER TSSE A692 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER TCHE A694 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER HWE A696 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER SHWE A698 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER DOUBLE O A69A ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER CROSSED O A722 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER EGYPTOLOGICAL ALEF A724 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER EGYPTOLOGICAL AIN A726 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER HENG A728 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER TZ A72A ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER TRESILLO A72C ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER CUATRILLO A72E ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER CUATRILLO WITH COMMA A732 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER AA A734 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER AO A736 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER AU A738 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER AV A73A ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER AV 
WITH HORIZONTAL BAR A73C ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER AY A73E ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER REVERSED C WITH DOT A740 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER K WITH STROKE A742 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER K WITH DIAGONAL STROKE A744 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER K WITH STROKE AND DIAGONAL STROKE A746 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER BROKEN L A748 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER L WITH HIGH STROKE A74A ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER O WITH LONG STROKE OVERLAY A74C ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER O WITH LOOP A74E ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER OO A750 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER P WITH STROKE THROUGH DESCENDER A752 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER P WITH FLOURISH A754 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER P WITH SQUIRREL TAIL A756 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER Q WITH STROKE THROUGH DESCENDER A758 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER Q WITH DIAGONAL STROKE A75A ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER R ROTUNDA A75C ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER RUM ROTUNDA A75E ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER V WITH DIAGONAL STROKE A760 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER VY A762 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER VISIGOTHIC Z A764 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER THORN WITH STROKE A766 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER THORN WITH STROKE THROUGH DESCENDER A768 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER VEND A76A ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER ET A76C ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER IS A76E ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER CON A779 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER INSULAR D A77B ; 
Changes_When_Lowercased # L& LATIN CAPITAL LETTER INSULAR F A77D..A77E ; Changes_When_Lowercased # L& [2] LATIN CAPITAL LETTER INSULAR G..LATIN CAPITAL LETTER TURNED INSULAR G A780 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER TURNED L A782 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER INSULAR R A784 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER INSULAR S A786 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER INSULAR T A78B ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER SALTILLO A78D ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER TURNED H A790 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER N WITH DESCENDER A792 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER C WITH BAR A796 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER B WITH FLOURISH A798 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER F WITH STROKE A79A ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER VOLAPUK AE A79C ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER VOLAPUK OE A79E ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER VOLAPUK UE A7A0 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER G WITH OBLIQUE STROKE A7A2 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER K WITH OBLIQUE STROKE A7A4 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER N WITH OBLIQUE STROKE A7A6 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER R WITH OBLIQUE STROKE A7A8 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER S WITH OBLIQUE STROKE A7AA..A7AE ; Changes_When_Lowercased # L& [5] LATIN CAPITAL LETTER H WITH HOOK..LATIN CAPITAL LETTER SMALL CAPITAL I A7B0..A7B4 ; Changes_When_Lowercased # L& [5] LATIN CAPITAL LETTER TURNED K..LATIN CAPITAL LETTER BETA A7B6 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER OMEGA A7B8 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER U WITH STROKE A7BA ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER GLOTTAL A A7BC ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER GLOTTAL I A7BE ; Changes_When_Lowercased # L& LATIN CAPITAL 
LETTER GLOTTAL U A7C0 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER OLD POLISH O A7C2 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER ANGLICANA W A7C4..A7C7 ; Changes_When_Lowercased # L& [4] LATIN CAPITAL LETTER C WITH PALATAL HOOK..LATIN CAPITAL LETTER D WITH SHORT STROKE OVERLAY A7C9 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER S WITH SHORT STROKE OVERLAY A7CB..A7CC ; Changes_When_Lowercased # L& [2] LATIN CAPITAL LETTER RAMS HORN..LATIN CAPITAL LETTER S WITH DIAGONAL STROKE A7CE ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER PHARYNGEAL VOICED FRICATIVE A7D0 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER CLOSED INSULAR G A7D2 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER DOUBLE THORN A7D4 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER DOUBLE WYNN A7D6 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER MIDDLE SCOTS S A7D8 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER SIGMOID S A7DA ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER LAMBDA A7DC ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER LAMBDA WITH STROKE A7F5 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER REVERSED HALF H FF21..FF3A ; Changes_When_Lowercased # L& [26] FULLWIDTH LATIN CAPITAL LETTER A..FULLWIDTH LATIN CAPITAL LETTER Z 10400..10427 ; Changes_When_Lowercased # L& [40] DESERET CAPITAL LETTER LONG I..DESERET CAPITAL LETTER EW 104B0..104D3 ; Changes_When_Lowercased # L& [36] OSAGE CAPITAL LETTER A..OSAGE CAPITAL LETTER ZHA 10570..1057A ; Changes_When_Lowercased # L& [11] VITHKUQI CAPITAL LETTER A..VITHKUQI CAPITAL LETTER GA 1057C..1058A ; Changes_When_Lowercased # L& [15] VITHKUQI CAPITAL LETTER HA..VITHKUQI CAPITAL LETTER RE 1058C..10592 ; Changes_When_Lowercased # L& [7] VITHKUQI CAPITAL LETTER SE..VITHKUQI CAPITAL LETTER XE 10594..10595 ; Changes_When_Lowercased # L& [2] VITHKUQI CAPITAL LETTER Y..VITHKUQI CAPITAL LETTER ZE 10C80..10CB2 ; Changes_When_Lowercased # L& [51] OLD HUNGARIAN CAPITAL LETTER A..OLD HUNGARIAN CAPITAL LETTER US 
10D50..10D65 ; Changes_When_Lowercased # L& [22] GARAY CAPITAL LETTER A..GARAY CAPITAL LETTER OLD NA 118A0..118BF ; Changes_When_Lowercased # L& [32] WARANG CITI CAPITAL LETTER NGAA..WARANG CITI CAPITAL LETTER VIYO 16E40..16E5F ; Changes_When_Lowercased # L& [32] MEDEFAIDRIN CAPITAL LETTER M..MEDEFAIDRIN CAPITAL LETTER Y 16EA0..16EB8 ; Changes_When_Lowercased # L& [25] BERIA ERFE CAPITAL LETTER ARKAB..BERIA ERFE CAPITAL LETTER AY 1E900..1E921 ; Changes_When_Lowercased # L& [34] ADLAM CAPITAL LETTER ALIF..ADLAM CAPITAL LETTER SHA # Total code points: 1488 # ================================================ # Derived Property: Changes_When_Uppercased (CWU) # Characters whose normalized forms are not stable under a toUppercase mapping. # For more information, see the definition of "isUppercase(X)" # in the "Conformance" / "Default Case Algorithms" section of the core specification. # Changes_When_Uppercased(X) is true when toUppercase(toNFD(X)) != toNFD(X) 0061..007A ; Changes_When_Uppercased # L& [26] LATIN SMALL LETTER A..LATIN SMALL LETTER Z 00B5 ; Changes_When_Uppercased # L& MICRO SIGN 00DF..00F6 ; Changes_When_Uppercased # L& [24] LATIN SMALL LETTER SHARP S..LATIN SMALL LETTER O WITH DIAERESIS 00F8..00FF ; Changes_When_Uppercased # L& [8] LATIN SMALL LETTER O WITH STROKE..LATIN SMALL LETTER Y WITH DIAERESIS 0101 ; Changes_When_Uppercased # L& LATIN SMALL LETTER A WITH MACRON 0103 ; Changes_When_Uppercased # L& LATIN SMALL LETTER A WITH BREVE 0105 ; Changes_When_Uppercased # L& LATIN SMALL LETTER A WITH OGONEK 0107 ; Changes_When_Uppercased # L& LATIN SMALL LETTER C WITH ACUTE 0109 ; Changes_When_Uppercased # L& LATIN SMALL LETTER C WITH CIRCUMFLEX 010B ; Changes_When_Uppercased # L& LATIN SMALL LETTER C WITH DOT ABOVE 010D ; Changes_When_Uppercased # L& LATIN SMALL LETTER C WITH CARON 010F ; Changes_When_Uppercased # L& LATIN SMALL LETTER D WITH CARON 0111 ; Changes_When_Uppercased # L& LATIN SMALL LETTER D WITH STROKE 0113 ; Changes_When_Uppercased # L& LATIN 
SMALL LETTER E WITH MACRON 0115 ; Changes_When_Uppercased # L& LATIN SMALL LETTER E WITH BREVE 0117 ; Changes_When_Uppercased # L& LATIN SMALL LETTER E WITH DOT ABOVE 0119 ; Changes_When_Uppercased # L& LATIN SMALL LETTER E WITH OGONEK 011B ; Changes_When_Uppercased # L& LATIN SMALL LETTER E WITH CARON 011D ; Changes_When_Uppercased # L& LATIN SMALL LETTER G WITH CIRCUMFLEX 011F ; Changes_When_Uppercased # L& LATIN SMALL LETTER G WITH BREVE 0121 ; Changes_When_Uppercased # L& LATIN SMALL LETTER G WITH DOT ABOVE 0123 ; Changes_When_Uppercased # L& LATIN SMALL LETTER G WITH CEDILLA 0125 ; Changes_When_Uppercased # L& LATIN SMALL LETTER H WITH CIRCUMFLEX 0127 ; Changes_When_Uppercased # L& LATIN SMALL LETTER H WITH STROKE 0129 ; Changes_When_Uppercased # L& LATIN SMALL LETTER I WITH TILDE 012B ; Changes_When_Uppercased # L& LATIN SMALL LETTER I WITH MACRON 012D ; Changes_When_Uppercased # L& LATIN SMALL LETTER I WITH BREVE 012F ; Changes_When_Uppercased # L& LATIN SMALL LETTER I WITH OGONEK 0131 ; Changes_When_Uppercased # L& LATIN SMALL LETTER DOTLESS I 0133 ; Changes_When_Uppercased # L& LATIN SMALL LIGATURE IJ 0135 ; Changes_When_Uppercased # L& LATIN SMALL LETTER J WITH CIRCUMFLEX 0137 ; Changes_When_Uppercased # L& LATIN SMALL LETTER K WITH CEDILLA 013A ; Changes_When_Uppercased # L& LATIN SMALL LETTER L WITH ACUTE 013C ; Changes_When_Uppercased # L& LATIN SMALL LETTER L WITH CEDILLA 013E ; Changes_When_Uppercased # L& LATIN SMALL LETTER L WITH CARON 0140 ; Changes_When_Uppercased # L& LATIN SMALL LETTER L WITH MIDDLE DOT 0142 ; Changes_When_Uppercased # L& LATIN SMALL LETTER L WITH STROKE 0144 ; Changes_When_Uppercased # L& LATIN SMALL LETTER N WITH ACUTE 0146 ; Changes_When_Uppercased # L& LATIN SMALL LETTER N WITH CEDILLA 0148..0149 ; Changes_When_Uppercased # L& [2] LATIN SMALL LETTER N WITH CARON..LATIN SMALL LETTER N PRECEDED BY APOSTROPHE 014B ; Changes_When_Uppercased # L& LATIN SMALL LETTER ENG 014D ; Changes_When_Uppercased # L& LATIN SMALL LETTER O 
WITH MACRON 014F ; Changes_When_Uppercased # L& LATIN SMALL LETTER O WITH BREVE 0151 ; Changes_When_Uppercased # L& LATIN SMALL LETTER O WITH DOUBLE ACUTE 0153 ; Changes_When_Uppercased # L& LATIN SMALL LIGATURE OE 0155 ; Changes_When_Uppercased # L& LATIN SMALL LETTER R WITH ACUTE 0157 ; Changes_When_Uppercased # L& LATIN SMALL LETTER R WITH CEDILLA 0159 ; Changes_When_Uppercased # L& LATIN SMALL LETTER R WITH CARON 015B ; Changes_When_Uppercased # L& LATIN SMALL LETTER S WITH ACUTE 015D ; Changes_When_Uppercased # L& LATIN SMALL LETTER S WITH CIRCUMFLEX 015F ; Changes_When_Uppercased # L& LATIN SMALL LETTER S WITH CEDILLA 0161 ; Changes_When_Uppercased # L& LATIN SMALL LETTER S WITH CARON 0163 ; Changes_When_Uppercased # L& LATIN SMALL LETTER T WITH CEDILLA 0165 ; Changes_When_Uppercased # L& LATIN SMALL LETTER T WITH CARON 0167 ; Changes_When_Uppercased # L& LATIN SMALL LETTER T WITH STROKE 0169 ; Changes_When_Uppercased # L& LATIN SMALL LETTER U WITH TILDE 016B ; Changes_When_Uppercased # L& LATIN SMALL LETTER U WITH MACRON 016D ; Changes_When_Uppercased # L& LATIN SMALL LETTER U WITH BREVE 016F ; Changes_When_Uppercased # L& LATIN SMALL LETTER U WITH RING ABOVE 0171 ; Changes_When_Uppercased # L& LATIN SMALL LETTER U WITH DOUBLE ACUTE 0173 ; Changes_When_Uppercased # L& LATIN SMALL LETTER U WITH OGONEK 0175 ; Changes_When_Uppercased # L& LATIN SMALL LETTER W WITH CIRCUMFLEX 0177 ; Changes_When_Uppercased # L& LATIN SMALL LETTER Y WITH CIRCUMFLEX 017A ; Changes_When_Uppercased # L& LATIN SMALL LETTER Z WITH ACUTE 017C ; Changes_When_Uppercased # L& LATIN SMALL LETTER Z WITH DOT ABOVE 017E..0180 ; Changes_When_Uppercased # L& [3] LATIN SMALL LETTER Z WITH CARON..LATIN SMALL LETTER B WITH STROKE 0183 ; Changes_When_Uppercased # L& LATIN SMALL LETTER B WITH TOPBAR 0185 ; Changes_When_Uppercased # L& LATIN SMALL LETTER TONE SIX 0188 ; Changes_When_Uppercased # L& LATIN SMALL LETTER C WITH HOOK 018C ; Changes_When_Uppercased # L& LATIN SMALL LETTER D WITH TOPBAR 
0192 ; Changes_When_Uppercased # L& LATIN SMALL LETTER F WITH HOOK 0195 ; Changes_When_Uppercased # L& LATIN SMALL LETTER HV 0199..019B ; Changes_When_Uppercased # L& [3] LATIN SMALL LETTER K WITH HOOK..LATIN SMALL LETTER LAMBDA WITH STROKE 019E ; Changes_When_Uppercased # L& LATIN SMALL LETTER N WITH LONG RIGHT LEG 01A1 ; Changes_When_Uppercased # L& LATIN SMALL LETTER O WITH HORN 01A3 ; Changes_When_Uppercased # L& LATIN SMALL LETTER OI 01A5 ; Changes_When_Uppercased # L& LATIN SMALL LETTER P WITH HOOK 01A8 ; Changes_When_Uppercased # L& LATIN SMALL LETTER TONE TWO 01AD ; Changes_When_Uppercased # L& LATIN SMALL LETTER T WITH HOOK 01B0 ; Changes_When_Uppercased # L& LATIN SMALL LETTER U WITH HORN 01B4 ; Changes_When_Uppercased # L& LATIN SMALL LETTER Y WITH HOOK 01B6 ; Changes_When_Uppercased # L& LATIN SMALL LETTER Z WITH STROKE 01B9 ; Changes_When_Uppercased # L& LATIN SMALL LETTER EZH REVERSED 01BD ; Changes_When_Uppercased # L& LATIN SMALL LETTER TONE FIVE 01BF ; Changes_When_Uppercased # L& LATIN LETTER WYNN 01C5..01C6 ; Changes_When_Uppercased # L& [2] LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON..LATIN SMALL LETTER DZ WITH CARON 01C8..01C9 ; Changes_When_Uppercased # L& [2] LATIN CAPITAL LETTER L WITH SMALL LETTER J..LATIN SMALL LETTER LJ 01CB..01CC ; Changes_When_Uppercased # L& [2] LATIN CAPITAL LETTER N WITH SMALL LETTER J..LATIN SMALL LETTER NJ 01CE ; Changes_When_Uppercased # L& LATIN SMALL LETTER A WITH CARON 01D0 ; Changes_When_Uppercased # L& LATIN SMALL LETTER I WITH CARON 01D2 ; Changes_When_Uppercased # L& LATIN SMALL LETTER O WITH CARON 01D4 ; Changes_When_Uppercased # L& LATIN SMALL LETTER U WITH CARON 01D6 ; Changes_When_Uppercased # L& LATIN SMALL LETTER U WITH DIAERESIS AND MACRON 01D8 ; Changes_When_Uppercased # L& LATIN SMALL LETTER U WITH DIAERESIS AND ACUTE 01DA ; Changes_When_Uppercased # L& LATIN SMALL LETTER U WITH DIAERESIS AND CARON 01DC..01DD ; Changes_When_Uppercased # L& [2] LATIN SMALL LETTER U WITH DIAERESIS AND 
GRAVE..LATIN SMALL LETTER TURNED E 01DF ; Changes_When_Uppercased # L& LATIN SMALL LETTER A WITH DIAERESIS AND MACRON 01E1 ; Changes_When_Uppercased # L& LATIN SMALL LETTER A WITH DOT ABOVE AND MACRON 01E3 ; Changes_When_Uppercased # L& LATIN SMALL LETTER AE WITH MACRON 01E5 ; Changes_When_Uppercased # L& LATIN SMALL LETTER G WITH STROKE 01E7 ; Changes_When_Uppercased # L& LATIN SMALL LETTER G WITH CARON 01E9 ; Changes_When_Uppercased # L& LATIN SMALL LETTER K WITH CARON 01EB ; Changes_When_Uppercased # L& LATIN SMALL LETTER O WITH OGONEK 01ED ; Changes_When_Uppercased # L& LATIN SMALL LETTER O WITH OGONEK AND MACRON 01EF..01F0 ; Changes_When_Uppercased # L& [2] LATIN SMALL LETTER EZH WITH CARON..LATIN SMALL LETTER J WITH CARON 01F2..01F3 ; Changes_When_Uppercased # L& [2] LATIN CAPITAL LETTER D WITH SMALL LETTER Z..LATIN SMALL LETTER DZ 01F5 ; Changes_When_Uppercased # L& LATIN SMALL LETTER G WITH ACUTE 01F9 ; Changes_When_Uppercased # L& LATIN SMALL LETTER N WITH GRAVE 01FB ; Changes_When_Uppercased # L& LATIN SMALL LETTER A WITH RING ABOVE AND ACUTE 01FD ; Changes_When_Uppercased # L& LATIN SMALL LETTER AE WITH ACUTE 01FF ; Changes_When_Uppercased # L& LATIN SMALL LETTER O WITH STROKE AND ACUTE 0201 ; Changes_When_Uppercased # L& LATIN SMALL LETTER A WITH DOUBLE GRAVE 0203 ; Changes_When_Uppercased # L& LATIN SMALL LETTER A WITH INVERTED BREVE 0205 ; Changes_When_Uppercased # L& LATIN SMALL LETTER E WITH DOUBLE GRAVE 0207 ; Changes_When_Uppercased # L& LATIN SMALL LETTER E WITH INVERTED BREVE 0209 ; Changes_When_Uppercased # L& LATIN SMALL LETTER I WITH DOUBLE GRAVE 020B ; Changes_When_Uppercased # L& LATIN SMALL LETTER I WITH INVERTED BREVE 020D ; Changes_When_Uppercased # L& LATIN SMALL LETTER O WITH DOUBLE GRAVE 020F ; Changes_When_Uppercased # L& LATIN SMALL LETTER O WITH INVERTED BREVE 0211 ; Changes_When_Uppercased # L& LATIN SMALL LETTER R WITH DOUBLE GRAVE 0213 ; Changes_When_Uppercased # L& LATIN SMALL LETTER R WITH INVERTED BREVE 0215 ; 
Changes_When_Uppercased # L& LATIN SMALL LETTER U WITH DOUBLE GRAVE 0217 ; Changes_When_Uppercased # L& LATIN SMALL LETTER U WITH INVERTED BREVE 0219 ; Changes_When_Uppercased # L& LATIN SMALL LETTER S WITH COMMA BELOW 021B ; Changes_When_Uppercased # L& LATIN SMALL LETTER T WITH COMMA BELOW 021D ; Changes_When_Uppercased # L& LATIN SMALL LETTER YOGH 021F ; Changes_When_Uppercased # L& LATIN SMALL LETTER H WITH CARON 0223 ; Changes_When_Uppercased # L& LATIN SMALL LETTER OU 0225 ; Changes_When_Uppercased # L& LATIN SMALL LETTER Z WITH HOOK 0227 ; Changes_When_Uppercased # L& LATIN SMALL LETTER A WITH DOT ABOVE 0229 ; Changes_When_Uppercased # L& LATIN SMALL LETTER E WITH CEDILLA 022B ; Changes_When_Uppercased # L& LATIN SMALL LETTER O WITH DIAERESIS AND MACRON 022D ; Changes_When_Uppercased # L& LATIN SMALL LETTER O WITH TILDE AND MACRON 022F ; Changes_When_Uppercased # L& LATIN SMALL LETTER O WITH DOT ABOVE 0231 ; Changes_When_Uppercased # L& LATIN SMALL LETTER O WITH DOT ABOVE AND MACRON 0233 ; Changes_When_Uppercased # L& LATIN SMALL LETTER Y WITH MACRON 023C ; Changes_When_Uppercased # L& LATIN SMALL LETTER C WITH STROKE 023F..0240 ; Changes_When_Uppercased # L& [2] LATIN SMALL LETTER S WITH SWASH TAIL..LATIN SMALL LETTER Z WITH SWASH TAIL 0242 ; Changes_When_Uppercased # L& LATIN SMALL LETTER GLOTTAL STOP 0247 ; Changes_When_Uppercased # L& LATIN SMALL LETTER E WITH STROKE 0249 ; Changes_When_Uppercased # L& LATIN SMALL LETTER J WITH STROKE 024B ; Changes_When_Uppercased # L& LATIN SMALL LETTER Q WITH HOOK TAIL 024D ; Changes_When_Uppercased # L& LATIN SMALL LETTER R WITH STROKE 024F..0254 ; Changes_When_Uppercased # L& [6] LATIN SMALL LETTER Y WITH STROKE..LATIN SMALL LETTER OPEN O 0256..0257 ; Changes_When_Uppercased # L& [2] LATIN SMALL LETTER D WITH TAIL..LATIN SMALL LETTER D WITH HOOK 0259 ; Changes_When_Uppercased # L& LATIN SMALL LETTER SCHWA 025B..025C ; Changes_When_Uppercased # L& [2] LATIN SMALL LETTER OPEN E..LATIN SMALL LETTER REVERSED OPEN E 
0260..0261 ; Changes_When_Uppercased # L& [2] LATIN SMALL LETTER G WITH HOOK..LATIN SMALL LETTER SCRIPT G 0263..0266 ; Changes_When_Uppercased # L& [4] LATIN SMALL LETTER GAMMA..LATIN SMALL LETTER H WITH HOOK 0268..026C ; Changes_When_Uppercased # L& [5] LATIN SMALL LETTER I WITH STROKE..LATIN SMALL LETTER L WITH BELT 026F ; Changes_When_Uppercased # L& LATIN SMALL LETTER TURNED M 0271..0272 ; Changes_When_Uppercased # L& [2] LATIN SMALL LETTER M WITH HOOK..LATIN SMALL LETTER N WITH LEFT HOOK 0275 ; Changes_When_Uppercased # L& LATIN SMALL LETTER BARRED O 027D ; Changes_When_Uppercased # L& LATIN SMALL LETTER R WITH TAIL 0280 ; Changes_When_Uppercased # L& LATIN LETTER SMALL CAPITAL R 0282..0283 ; Changes_When_Uppercased # L& [2] LATIN SMALL LETTER S WITH HOOK..LATIN SMALL LETTER ESH 0287..028C ; Changes_When_Uppercased # L& [6] LATIN SMALL LETTER TURNED T..LATIN SMALL LETTER TURNED V 0292 ; Changes_When_Uppercased # L& LATIN SMALL LETTER EZH 029D..029E ; Changes_When_Uppercased # L& [2] LATIN SMALL LETTER J WITH CROSSED-TAIL..LATIN SMALL LETTER TURNED K 0345 ; Changes_When_Uppercased # Mn COMBINING GREEK YPOGEGRAMMENI 0371 ; Changes_When_Uppercased # L& GREEK SMALL LETTER HETA 0373 ; Changes_When_Uppercased # L& GREEK SMALL LETTER ARCHAIC SAMPI 0377 ; Changes_When_Uppercased # L& GREEK SMALL LETTER PAMPHYLIAN DIGAMMA 037B..037D ; Changes_When_Uppercased # L& [3] GREEK SMALL REVERSED LUNATE SIGMA SYMBOL..GREEK SMALL REVERSED DOTTED LUNATE SIGMA SYMBOL 0390 ; Changes_When_Uppercased # L& GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS 03AC..03CE ; Changes_When_Uppercased # L& [35] GREEK SMALL LETTER ALPHA WITH TONOS..GREEK SMALL LETTER OMEGA WITH TONOS 03D0..03D1 ; Changes_When_Uppercased # L& [2] GREEK BETA SYMBOL..GREEK THETA SYMBOL 03D5..03D7 ; Changes_When_Uppercased # L& [3] GREEK PHI SYMBOL..GREEK KAI SYMBOL 03D9 ; Changes_When_Uppercased # L& GREEK SMALL LETTER ARCHAIC KOPPA 03DB ; Changes_When_Uppercased # L& GREEK SMALL LETTER STIGMA 03DD ; 
Changes_When_Uppercased # L& GREEK SMALL LETTER DIGAMMA 03DF ; Changes_When_Uppercased # L& GREEK SMALL LETTER KOPPA 03E1 ; Changes_When_Uppercased # L& GREEK SMALL LETTER SAMPI 03E3 ; Changes_When_Uppercased # L& COPTIC SMALL LETTER SHEI 03E5 ; Changes_When_Uppercased # L& COPTIC SMALL LETTER FEI 03E7 ; Changes_When_Uppercased # L& COPTIC SMALL LETTER KHEI 03E9 ; Changes_When_Uppercased # L& COPTIC SMALL LETTER HORI 03EB ; Changes_When_Uppercased # L& COPTIC SMALL LETTER GANGIA 03ED ; Changes_When_Uppercased # L& COPTIC SMALL LETTER SHIMA 03EF..03F3 ; Changes_When_Uppercased # L& [5] COPTIC SMALL LETTER DEI..GREEK LETTER YOT 03F5 ; Changes_When_Uppercased # L& GREEK LUNATE EPSILON SYMBOL 03F8 ; Changes_When_Uppercased # L& GREEK SMALL LETTER SHO 03FB ; Changes_When_Uppercased # L& GREEK SMALL LETTER SAN 0430..045F ; Changes_When_Uppercased # L& [48] CYRILLIC SMALL LETTER A..CYRILLIC SMALL LETTER DZHE 0461 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER OMEGA 0463 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER YAT 0465 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER IOTIFIED E 0467 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER LITTLE YUS 0469 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER IOTIFIED LITTLE YUS 046B ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER BIG YUS 046D ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER IOTIFIED BIG YUS 046F ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER KSI 0471 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER PSI 0473 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER FITA 0475 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER IZHITSA 0477 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER IZHITSA WITH DOUBLE GRAVE ACCENT 0479 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER UK 047B ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER ROUND OMEGA 047D ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER OMEGA WITH TITLO 047F ; Changes_When_Uppercased # L& CYRILLIC SMALL 
LETTER OT 0481 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER KOPPA 048B ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER SHORT I WITH TAIL 048D ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER SEMISOFT SIGN 048F ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER ER WITH TICK 0491 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER GHE WITH UPTURN 0493 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER GHE WITH STROKE 0495 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER GHE WITH MIDDLE HOOK 0497 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER ZHE WITH DESCENDER 0499 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER ZE WITH DESCENDER 049B ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER KA WITH DESCENDER 049D ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER KA WITH VERTICAL STROKE 049F ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER KA WITH STROKE 04A1 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER BASHKIR KA 04A3 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER EN WITH DESCENDER 04A5 ; Changes_When_Uppercased # L& CYRILLIC SMALL LIGATURE EN GHE 04A7 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER PE WITH MIDDLE HOOK 04A9 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER ABKHASIAN HA 04AB ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER ES WITH DESCENDER 04AD ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER TE WITH DESCENDER 04AF ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER STRAIGHT U 04B1 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER STRAIGHT U WITH STROKE 04B3 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER HA WITH DESCENDER 04B5 ; Changes_When_Uppercased # L& CYRILLIC SMALL LIGATURE TE TSE 04B7 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER CHE WITH DESCENDER 04B9 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER CHE WITH VERTICAL STROKE 04BB ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER SHHA 04BD ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER 
ABKHASIAN CHE 04BF ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER ABKHASIAN CHE WITH DESCENDER 04C2 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER ZHE WITH BREVE 04C4 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER KA WITH HOOK 04C6 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER EL WITH TAIL 04C8 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER EN WITH HOOK 04CA ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER EN WITH TAIL 04CC ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER KHAKASSIAN CHE 04CE..04CF ; Changes_When_Uppercased # L& [2] CYRILLIC SMALL LETTER EM WITH TAIL..CYRILLIC SMALL LETTER PALOCHKA 04D1 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER A WITH BREVE 04D3 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER A WITH DIAERESIS 04D5 ; Changes_When_Uppercased # L& CYRILLIC SMALL LIGATURE A IE 04D7 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER IE WITH BREVE 04D9 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER SCHWA 04DB ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER SCHWA WITH DIAERESIS 04DD ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER ZHE WITH DIAERESIS 04DF ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER ZE WITH DIAERESIS 04E1 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER ABKHASIAN DZE 04E3 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER I WITH MACRON 04E5 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER I WITH DIAERESIS 04E7 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER O WITH DIAERESIS 04E9 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER BARRED O 04EB ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER BARRED O WITH DIAERESIS 04ED ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER E WITH DIAERESIS 04EF ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER U WITH MACRON 04F1 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER U WITH DIAERESIS 04F3 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER U WITH DOUBLE ACUTE 04F5 ; Changes_When_Uppercased # 
L& CYRILLIC SMALL LETTER CHE WITH DIAERESIS 04F7 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER GHE WITH DESCENDER 04F9 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER YERU WITH DIAERESIS 04FB ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER GHE WITH STROKE AND HOOK 04FD ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER HA WITH HOOK 04FF ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER HA WITH STROKE 0501 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER KOMI DE 0503 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER KOMI DJE 0505 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER KOMI ZJE 0507 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER KOMI DZJE 0509 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER KOMI LJE 050B ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER KOMI NJE 050D ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER KOMI SJE 050F ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER KOMI TJE 0511 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER REVERSED ZE 0513 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER EL WITH HOOK 0515 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER LHA 0517 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER RHA 0519 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER YAE 051B ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER QA 051D ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER WE 051F ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER ALEUT KA 0521 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER EL WITH MIDDLE HOOK 0523 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER EN WITH MIDDLE HOOK 0525 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER PE WITH DESCENDER 0527 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER SHHA WITH DESCENDER 0529 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER EN WITH LEFT HOOK 052B ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER DZZHE 052D ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER DCHE 052F ; 
Changes_When_Uppercased # L& CYRILLIC SMALL LETTER EL WITH DESCENDER 0561..0587 ; Changes_When_Uppercased # L& [39] ARMENIAN SMALL LETTER AYB..ARMENIAN SMALL LIGATURE ECH YIWN 10D0..10FA ; Changes_When_Uppercased # L& [43] GEORGIAN LETTER AN..GEORGIAN LETTER AIN 10FD..10FF ; Changes_When_Uppercased # L& [3] GEORGIAN LETTER AEN..GEORGIAN LETTER LABIAL SIGN 13F8..13FD ; Changes_When_Uppercased # L& [6] CHEROKEE SMALL LETTER YE..CHEROKEE SMALL LETTER MV 1C80..1C88 ; Changes_When_Uppercased # L& [9] CYRILLIC SMALL LETTER ROUNDED VE..CYRILLIC SMALL LETTER UNBLENDED UK 1C8A ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER TJE 1D79 ; Changes_When_Uppercased # L& LATIN SMALL LETTER INSULAR G 1D7D ; Changes_When_Uppercased # L& LATIN SMALL LETTER P WITH STROKE 1D8E ; Changes_When_Uppercased # L& LATIN SMALL LETTER Z WITH PALATAL HOOK 1E01 ; Changes_When_Uppercased # L& LATIN SMALL LETTER A WITH RING BELOW 1E03 ; Changes_When_Uppercased # L& LATIN SMALL LETTER B WITH DOT ABOVE 1E05 ; Changes_When_Uppercased # L& LATIN SMALL LETTER B WITH DOT BELOW 1E07 ; Changes_When_Uppercased # L& LATIN SMALL LETTER B WITH LINE BELOW 1E09 ; Changes_When_Uppercased # L& LATIN SMALL LETTER C WITH CEDILLA AND ACUTE 1E0B ; Changes_When_Uppercased # L& LATIN SMALL LETTER D WITH DOT ABOVE 1E0D ; Changes_When_Uppercased # L& LATIN SMALL LETTER D WITH DOT BELOW 1E0F ; Changes_When_Uppercased # L& LATIN SMALL LETTER D WITH LINE BELOW 1E11 ; Changes_When_Uppercased # L& LATIN SMALL LETTER D WITH CEDILLA 1E13 ; Changes_When_Uppercased # L& LATIN SMALL LETTER D WITH CIRCUMFLEX BELOW 1E15 ; Changes_When_Uppercased # L& LATIN SMALL LETTER E WITH MACRON AND GRAVE 1E17 ; Changes_When_Uppercased # L& LATIN SMALL LETTER E WITH MACRON AND ACUTE 1E19 ; Changes_When_Uppercased # L& LATIN SMALL LETTER E WITH CIRCUMFLEX BELOW 1E1B ; Changes_When_Uppercased # L& LATIN SMALL LETTER E WITH TILDE BELOW 1E1D ; Changes_When_Uppercased # L& LATIN SMALL LETTER E WITH CEDILLA AND BREVE 1E1F ; Changes_When_Uppercased 
# L& LATIN SMALL LETTER F WITH DOT ABOVE 1E21 ; Changes_When_Uppercased # L& LATIN SMALL LETTER G WITH MACRON 1E23 ; Changes_When_Uppercased # L& LATIN SMALL LETTER H WITH DOT ABOVE 1E25 ; Changes_When_Uppercased # L& LATIN SMALL LETTER H WITH DOT BELOW 1E27 ; Changes_When_Uppercased # L& LATIN SMALL LETTER H WITH DIAERESIS 1E29 ; Changes_When_Uppercased # L& LATIN SMALL LETTER H WITH CEDILLA 1E2B ; Changes_When_Uppercased # L& LATIN SMALL LETTER H WITH BREVE BELOW 1E2D ; Changes_When_Uppercased # L& LATIN SMALL LETTER I WITH TILDE BELOW 1E2F ; Changes_When_Uppercased # L& LATIN SMALL LETTER I WITH DIAERESIS AND ACUTE 1E31 ; Changes_When_Uppercased # L& LATIN SMALL LETTER K WITH ACUTE 1E33 ; Changes_When_Uppercased # L& LATIN SMALL LETTER K WITH DOT BELOW 1E35 ; Changes_When_Uppercased # L& LATIN SMALL LETTER K WITH LINE BELOW 1E37 ; Changes_When_Uppercased # L& LATIN SMALL LETTER L WITH DOT BELOW 1E39 ; Changes_When_Uppercased # L& LATIN SMALL LETTER L WITH DOT BELOW AND MACRON 1E3B ; Changes_When_Uppercased # L& LATIN SMALL LETTER L WITH LINE BELOW 1E3D ; Changes_When_Uppercased # L& LATIN SMALL LETTER L WITH CIRCUMFLEX BELOW 1E3F ; Changes_When_Uppercased # L& LATIN SMALL LETTER M WITH ACUTE 1E41 ; Changes_When_Uppercased # L& LATIN SMALL LETTER M WITH DOT ABOVE 1E43 ; Changes_When_Uppercased # L& LATIN SMALL LETTER M WITH DOT BELOW 1E45 ; Changes_When_Uppercased # L& LATIN SMALL LETTER N WITH DOT ABOVE 1E47 ; Changes_When_Uppercased # L& LATIN SMALL LETTER N WITH DOT BELOW 1E49 ; Changes_When_Uppercased # L& LATIN SMALL LETTER N WITH LINE BELOW 1E4B ; Changes_When_Uppercased # L& LATIN SMALL LETTER N WITH CIRCUMFLEX BELOW 1E4D ; Changes_When_Uppercased # L& LATIN SMALL LETTER O WITH TILDE AND ACUTE 1E4F ; Changes_When_Uppercased # L& LATIN SMALL LETTER O WITH TILDE AND DIAERESIS 1E51 ; Changes_When_Uppercased # L& LATIN SMALL LETTER O WITH MACRON AND GRAVE 1E53 ; Changes_When_Uppercased # L& LATIN SMALL LETTER O WITH MACRON AND ACUTE 1E55 ; 
Changes_When_Uppercased # L& LATIN SMALL LETTER P WITH ACUTE 1E57 ; Changes_When_Uppercased # L& LATIN SMALL LETTER P WITH DOT ABOVE 1E59 ; Changes_When_Uppercased # L& LATIN SMALL LETTER R WITH DOT ABOVE 1E5B ; Changes_When_Uppercased # L& LATIN SMALL LETTER R WITH DOT BELOW 1E5D ; Changes_When_Uppercased # L& LATIN SMALL LETTER R WITH DOT BELOW AND MACRON 1E5F ; Changes_When_Uppercased # L& LATIN SMALL LETTER R WITH LINE BELOW 1E61 ; Changes_When_Uppercased # L& LATIN SMALL LETTER S WITH DOT ABOVE 1E63 ; Changes_When_Uppercased # L& LATIN SMALL LETTER S WITH DOT BELOW 1E65 ; Changes_When_Uppercased # L& LATIN SMALL LETTER S WITH ACUTE AND DOT ABOVE 1E67 ; Changes_When_Uppercased # L& LATIN SMALL LETTER S WITH CARON AND DOT ABOVE 1E69 ; Changes_When_Uppercased # L& LATIN SMALL LETTER S WITH DOT BELOW AND DOT ABOVE 1E6B ; Changes_When_Uppercased # L& LATIN SMALL LETTER T WITH DOT ABOVE 1E6D ; Changes_When_Uppercased # L& LATIN SMALL LETTER T WITH DOT BELOW 1E6F ; Changes_When_Uppercased # L& LATIN SMALL LETTER T WITH LINE BELOW 1E71 ; Changes_When_Uppercased # L& LATIN SMALL LETTER T WITH CIRCUMFLEX BELOW 1E73 ; Changes_When_Uppercased # L& LATIN SMALL LETTER U WITH DIAERESIS BELOW 1E75 ; Changes_When_Uppercased # L& LATIN SMALL LETTER U WITH TILDE BELOW 1E77 ; Changes_When_Uppercased # L& LATIN SMALL LETTER U WITH CIRCUMFLEX BELOW 1E79 ; Changes_When_Uppercased # L& LATIN SMALL LETTER U WITH TILDE AND ACUTE 1E7B ; Changes_When_Uppercased # L& LATIN SMALL LETTER U WITH MACRON AND DIAERESIS 1E7D ; Changes_When_Uppercased # L& LATIN SMALL LETTER V WITH TILDE 1E7F ; Changes_When_Uppercased # L& LATIN SMALL LETTER V WITH DOT BELOW 1E81 ; Changes_When_Uppercased # L& LATIN SMALL LETTER W WITH GRAVE 1E83 ; Changes_When_Uppercased # L& LATIN SMALL LETTER W WITH ACUTE 1E85 ; Changes_When_Uppercased # L& LATIN SMALL LETTER W WITH DIAERESIS 1E87 ; Changes_When_Uppercased # L& LATIN SMALL LETTER W WITH DOT ABOVE 1E89 ; Changes_When_Uppercased # L& LATIN SMALL LETTER W WITH 
DOT BELOW 1E8B ; Changes_When_Uppercased # L& LATIN SMALL LETTER X WITH DOT ABOVE 1E8D ; Changes_When_Uppercased # L& LATIN SMALL LETTER X WITH DIAERESIS 1E8F ; Changes_When_Uppercased # L& LATIN SMALL LETTER Y WITH DOT ABOVE 1E91 ; Changes_When_Uppercased # L& LATIN SMALL LETTER Z WITH CIRCUMFLEX 1E93 ; Changes_When_Uppercased # L& LATIN SMALL LETTER Z WITH DOT BELOW 1E95..1E9B ; Changes_When_Uppercased # L& [7] LATIN SMALL LETTER Z WITH LINE BELOW..LATIN SMALL LETTER LONG S WITH DOT ABOVE 1EA1 ; Changes_When_Uppercased # L& LATIN SMALL LETTER A WITH DOT BELOW 1EA3 ; Changes_When_Uppercased # L& LATIN SMALL LETTER A WITH HOOK ABOVE 1EA5 ; Changes_When_Uppercased # L& LATIN SMALL LETTER A WITH CIRCUMFLEX AND ACUTE 1EA7 ; Changes_When_Uppercased # L& LATIN SMALL LETTER A WITH CIRCUMFLEX AND GRAVE 1EA9 ; Changes_When_Uppercased # L& LATIN SMALL LETTER A WITH CIRCUMFLEX AND HOOK ABOVE 1EAB ; Changes_When_Uppercased # L& LATIN SMALL LETTER A WITH CIRCUMFLEX AND TILDE 1EAD ; Changes_When_Uppercased # L& LATIN SMALL LETTER A WITH CIRCUMFLEX AND DOT BELOW 1EAF ; Changes_When_Uppercased # L& LATIN SMALL LETTER A WITH BREVE AND ACUTE 1EB1 ; Changes_When_Uppercased # L& LATIN SMALL LETTER A WITH BREVE AND GRAVE 1EB3 ; Changes_When_Uppercased # L& LATIN SMALL LETTER A WITH BREVE AND HOOK ABOVE 1EB5 ; Changes_When_Uppercased # L& LATIN SMALL LETTER A WITH BREVE AND TILDE 1EB7 ; Changes_When_Uppercased # L& LATIN SMALL LETTER A WITH BREVE AND DOT BELOW 1EB9 ; Changes_When_Uppercased # L& LATIN SMALL LETTER E WITH DOT BELOW 1EBB ; Changes_When_Uppercased # L& LATIN SMALL LETTER E WITH HOOK ABOVE 1EBD ; Changes_When_Uppercased # L& LATIN SMALL LETTER E WITH TILDE 1EBF ; Changes_When_Uppercased # L& LATIN SMALL LETTER E WITH CIRCUMFLEX AND ACUTE 1EC1 ; Changes_When_Uppercased # L& LATIN SMALL LETTER E WITH CIRCUMFLEX AND GRAVE 1EC3 ; Changes_When_Uppercased # L& LATIN SMALL LETTER E WITH CIRCUMFLEX AND HOOK ABOVE 1EC5 ; Changes_When_Uppercased # L& LATIN SMALL LETTER E WITH 
CIRCUMFLEX AND TILDE 1EC7 ; Changes_When_Uppercased # L& LATIN SMALL LETTER E WITH CIRCUMFLEX AND DOT BELOW 1EC9 ; Changes_When_Uppercased # L& LATIN SMALL LETTER I WITH HOOK ABOVE 1ECB ; Changes_When_Uppercased # L& LATIN SMALL LETTER I WITH DOT BELOW 1ECD ; Changes_When_Uppercased # L& LATIN SMALL LETTER O WITH DOT BELOW 1ECF ; Changes_When_Uppercased # L& LATIN SMALL LETTER O WITH HOOK ABOVE 1ED1 ; Changes_When_Uppercased # L& LATIN SMALL LETTER O WITH CIRCUMFLEX AND ACUTE 1ED3 ; Changes_When_Uppercased # L& LATIN SMALL LETTER O WITH CIRCUMFLEX AND GRAVE 1ED5 ; Changes_When_Uppercased # L& LATIN SMALL LETTER O WITH CIRCUMFLEX AND HOOK ABOVE 1ED7 ; Changes_When_Uppercased # L& LATIN SMALL LETTER O WITH CIRCUMFLEX AND TILDE 1ED9 ; Changes_When_Uppercased # L& LATIN SMALL LETTER O WITH CIRCUMFLEX AND DOT BELOW 1EDB ; Changes_When_Uppercased # L& LATIN SMALL LETTER O WITH HORN AND ACUTE 1EDD ; Changes_When_Uppercased # L& LATIN SMALL LETTER O WITH HORN AND GRAVE 1EDF ; Changes_When_Uppercased # L& LATIN SMALL LETTER O WITH HORN AND HOOK ABOVE 1EE1 ; Changes_When_Uppercased # L& LATIN SMALL LETTER O WITH HORN AND TILDE 1EE3 ; Changes_When_Uppercased # L& LATIN SMALL LETTER O WITH HORN AND DOT BELOW 1EE5 ; Changes_When_Uppercased # L& LATIN SMALL LETTER U WITH DOT BELOW 1EE7 ; Changes_When_Uppercased # L& LATIN SMALL LETTER U WITH HOOK ABOVE 1EE9 ; Changes_When_Uppercased # L& LATIN SMALL LETTER U WITH HORN AND ACUTE 1EEB ; Changes_When_Uppercased # L& LATIN SMALL LETTER U WITH HORN AND GRAVE 1EED ; Changes_When_Uppercased # L& LATIN SMALL LETTER U WITH HORN AND HOOK ABOVE 1EEF ; Changes_When_Uppercased # L& LATIN SMALL LETTER U WITH HORN AND TILDE 1EF1 ; Changes_When_Uppercased # L& LATIN SMALL LETTER U WITH HORN AND DOT BELOW 1EF3 ; Changes_When_Uppercased # L& LATIN SMALL LETTER Y WITH GRAVE 1EF5 ; Changes_When_Uppercased # L& LATIN SMALL LETTER Y WITH DOT BELOW 1EF7 ; Changes_When_Uppercased # L& LATIN SMALL LETTER Y WITH HOOK ABOVE 1EF9 ; Changes_When_Uppercased 
# L& LATIN SMALL LETTER Y WITH TILDE 1EFB ; Changes_When_Uppercased # L& LATIN SMALL LETTER MIDDLE-WELSH LL 1EFD ; Changes_When_Uppercased # L& LATIN SMALL LETTER MIDDLE-WELSH V 1EFF..1F07 ; Changes_When_Uppercased # L& [9] LATIN SMALL LETTER Y WITH LOOP..GREEK SMALL LETTER ALPHA WITH DASIA AND PERISPOMENI 1F10..1F15 ; Changes_When_Uppercased # L& [6] GREEK SMALL LETTER EPSILON WITH PSILI..GREEK SMALL LETTER EPSILON WITH DASIA AND OXIA 1F20..1F27 ; Changes_When_Uppercased # L& [8] GREEK SMALL LETTER ETA WITH PSILI..GREEK SMALL LETTER ETA WITH DASIA AND PERISPOMENI 1F30..1F37 ; Changes_When_Uppercased # L& [8] GREEK SMALL LETTER IOTA WITH PSILI..GREEK SMALL LETTER IOTA WITH DASIA AND PERISPOMENI 1F40..1F45 ; Changes_When_Uppercased # L& [6] GREEK SMALL LETTER OMICRON WITH PSILI..GREEK SMALL LETTER OMICRON WITH DASIA AND OXIA 1F50..1F57 ; Changes_When_Uppercased # L& [8] GREEK SMALL LETTER UPSILON WITH PSILI..GREEK SMALL LETTER UPSILON WITH DASIA AND PERISPOMENI 1F60..1F67 ; Changes_When_Uppercased # L& [8] GREEK SMALL LETTER OMEGA WITH PSILI..GREEK SMALL LETTER OMEGA WITH DASIA AND PERISPOMENI 1F70..1F7D ; Changes_When_Uppercased # L& [14] GREEK SMALL LETTER ALPHA WITH VARIA..GREEK SMALL LETTER OMEGA WITH OXIA 1F80..1FB4 ; Changes_When_Uppercased # L& [53] GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI..GREEK SMALL LETTER ALPHA WITH OXIA AND YPOGEGRAMMENI 1FB6..1FB7 ; Changes_When_Uppercased # L& [2] GREEK SMALL LETTER ALPHA WITH PERISPOMENI..GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI 1FBC ; Changes_When_Uppercased # L& GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI 1FBE ; Changes_When_Uppercased # L& GREEK PROSGEGRAMMENI 1FC2..1FC4 ; Changes_When_Uppercased # L& [3] GREEK SMALL LETTER ETA WITH VARIA AND YPOGEGRAMMENI..GREEK SMALL LETTER ETA WITH OXIA AND YPOGEGRAMMENI 1FC6..1FC7 ; Changes_When_Uppercased # L& [2] GREEK SMALL LETTER ETA WITH PERISPOMENI..GREEK SMALL LETTER ETA WITH PERISPOMENI AND YPOGEGRAMMENI 1FCC ; Changes_When_Uppercased 
# L& GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI 1FD0..1FD3 ; Changes_When_Uppercased # L& [4] GREEK SMALL LETTER IOTA WITH VRACHY..GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA 1FD6..1FD7 ; Changes_When_Uppercased # L& [2] GREEK SMALL LETTER IOTA WITH PERISPOMENI..GREEK SMALL LETTER IOTA WITH DIALYTIKA AND PERISPOMENI 1FE0..1FE7 ; Changes_When_Uppercased # L& [8] GREEK SMALL LETTER UPSILON WITH VRACHY..GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND PERISPOMENI 1FF2..1FF4 ; Changes_When_Uppercased # L& [3] GREEK SMALL LETTER OMEGA WITH VARIA AND YPOGEGRAMMENI..GREEK SMALL LETTER OMEGA WITH OXIA AND YPOGEGRAMMENI 1FF6..1FF7 ; Changes_When_Uppercased # L& [2] GREEK SMALL LETTER OMEGA WITH PERISPOMENI..GREEK SMALL LETTER OMEGA WITH PERISPOMENI AND YPOGEGRAMMENI 1FFC ; Changes_When_Uppercased # L& GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI 214E ; Changes_When_Uppercased # L& TURNED SMALL F 2170..217F ; Changes_When_Uppercased # Nl [16] SMALL ROMAN NUMERAL ONE..SMALL ROMAN NUMERAL ONE THOUSAND 2184 ; Changes_When_Uppercased # L& LATIN SMALL LETTER REVERSED C 24D0..24E9 ; Changes_When_Uppercased # So [26] CIRCLED LATIN SMALL LETTER A..CIRCLED LATIN SMALL LETTER Z 2C30..2C5F ; Changes_When_Uppercased # L& [48] GLAGOLITIC SMALL LETTER AZU..GLAGOLITIC SMALL LETTER CAUDATE CHRIVI 2C61 ; Changes_When_Uppercased # L& LATIN SMALL LETTER L WITH DOUBLE BAR 2C65..2C66 ; Changes_When_Uppercased # L& [2] LATIN SMALL LETTER A WITH STROKE..LATIN SMALL LETTER T WITH DIAGONAL STROKE 2C68 ; Changes_When_Uppercased # L& LATIN SMALL LETTER H WITH DESCENDER 2C6A ; Changes_When_Uppercased # L& LATIN SMALL LETTER K WITH DESCENDER 2C6C ; Changes_When_Uppercased # L& LATIN SMALL LETTER Z WITH DESCENDER 2C73 ; Changes_When_Uppercased # L& LATIN SMALL LETTER W WITH HOOK 2C76 ; Changes_When_Uppercased # L& LATIN SMALL LETTER HALF H 2C81 ; Changes_When_Uppercased # L& COPTIC SMALL LETTER ALFA 2C83 ; Changes_When_Uppercased # L& COPTIC SMALL LETTER VIDA 2C85 ; Changes_When_Uppercased # L& 
COPTIC SMALL LETTER GAMMA 2C87 ; Changes_When_Uppercased # L& COPTIC SMALL LETTER DALDA 2C89 ; Changes_When_Uppercased # L& COPTIC SMALL LETTER EIE 2C8B ; Changes_When_Uppercased # L& COPTIC SMALL LETTER SOU 2C8D ; Changes_When_Uppercased # L& COPTIC SMALL LETTER ZATA 2C8F ; Changes_When_Uppercased # L& COPTIC SMALL LETTER HATE 2C91 ; Changes_When_Uppercased # L& COPTIC SMALL LETTER THETHE 2C93 ; Changes_When_Uppercased # L& COPTIC SMALL LETTER IAUDA 2C95 ; Changes_When_Uppercased # L& COPTIC SMALL LETTER KAPA 2C97 ; Changes_When_Uppercased # L& COPTIC SMALL LETTER LAULA 2C99 ; Changes_When_Uppercased # L& COPTIC SMALL LETTER MI 2C9B ; Changes_When_Uppercased # L& COPTIC SMALL LETTER NI 2C9D ; Changes_When_Uppercased # L& COPTIC SMALL LETTER KSI 2C9F ; Changes_When_Uppercased # L& COPTIC SMALL LETTER O 2CA1 ; Changes_When_Uppercased # L& COPTIC SMALL LETTER PI 2CA3 ; Changes_When_Uppercased # L& COPTIC SMALL LETTER RO 2CA5 ; Changes_When_Uppercased # L& COPTIC SMALL LETTER SIMA 2CA7 ; Changes_When_Uppercased # L& COPTIC SMALL LETTER TAU 2CA9 ; Changes_When_Uppercased # L& COPTIC SMALL LETTER UA 2CAB ; Changes_When_Uppercased # L& COPTIC SMALL LETTER FI 2CAD ; Changes_When_Uppercased # L& COPTIC SMALL LETTER KHI 2CAF ; Changes_When_Uppercased # L& COPTIC SMALL LETTER PSI 2CB1 ; Changes_When_Uppercased # L& COPTIC SMALL LETTER OOU 2CB3 ; Changes_When_Uppercased # L& COPTIC SMALL LETTER DIALECT-P ALEF 2CB5 ; Changes_When_Uppercased # L& COPTIC SMALL LETTER OLD COPTIC AIN 2CB7 ; Changes_When_Uppercased # L& COPTIC SMALL LETTER CRYPTOGRAMMIC EIE 2CB9 ; Changes_When_Uppercased # L& COPTIC SMALL LETTER DIALECT-P KAPA 2CBB ; Changes_When_Uppercased # L& COPTIC SMALL LETTER DIALECT-P NI 2CBD ; Changes_When_Uppercased # L& COPTIC SMALL LETTER CRYPTOGRAMMIC NI 2CBF ; Changes_When_Uppercased # L& COPTIC SMALL LETTER OLD COPTIC OOU 2CC1 ; Changes_When_Uppercased # L& COPTIC SMALL LETTER SAMPI 2CC3 ; Changes_When_Uppercased # L& COPTIC SMALL LETTER CROSSED SHEI 2CC5 ; 
Changes_When_Uppercased # L& COPTIC SMALL LETTER OLD COPTIC SHEI 2CC7 ; Changes_When_Uppercased # L& COPTIC SMALL LETTER OLD COPTIC ESH 2CC9 ; Changes_When_Uppercased # L& COPTIC SMALL LETTER AKHMIMIC KHEI 2CCB ; Changes_When_Uppercased # L& COPTIC SMALL LETTER DIALECT-P HORI 2CCD ; Changes_When_Uppercased # L& COPTIC SMALL LETTER OLD COPTIC HORI 2CCF ; Changes_When_Uppercased # L& COPTIC SMALL LETTER OLD COPTIC HA 2CD1 ; Changes_When_Uppercased # L& COPTIC SMALL LETTER L-SHAPED HA 2CD3 ; Changes_When_Uppercased # L& COPTIC SMALL LETTER OLD COPTIC HEI 2CD5 ; Changes_When_Uppercased # L& COPTIC SMALL LETTER OLD COPTIC HAT 2CD7 ; Changes_When_Uppercased # L& COPTIC SMALL LETTER OLD COPTIC GANGIA 2CD9 ; Changes_When_Uppercased # L& COPTIC SMALL LETTER OLD COPTIC DJA 2CDB ; Changes_When_Uppercased # L& COPTIC SMALL LETTER OLD COPTIC SHIMA 2CDD ; Changes_When_Uppercased # L& COPTIC SMALL LETTER OLD NUBIAN SHIMA 2CDF ; Changes_When_Uppercased # L& COPTIC SMALL LETTER OLD NUBIAN NGI 2CE1 ; Changes_When_Uppercased # L& COPTIC SMALL LETTER OLD NUBIAN NYI 2CE3 ; Changes_When_Uppercased # L& COPTIC SMALL LETTER OLD NUBIAN WAU 2CEC ; Changes_When_Uppercased # L& COPTIC SMALL LETTER CRYPTOGRAMMIC SHEI 2CEE ; Changes_When_Uppercased # L& COPTIC SMALL LETTER CRYPTOGRAMMIC GANGIA 2CF3 ; Changes_When_Uppercased # L& COPTIC SMALL LETTER BOHAIRIC KHEI 2D00..2D25 ; Changes_When_Uppercased # L& [38] GEORGIAN SMALL LETTER AN..GEORGIAN SMALL LETTER HOE 2D27 ; Changes_When_Uppercased # L& GEORGIAN SMALL LETTER YN 2D2D ; Changes_When_Uppercased # L& GEORGIAN SMALL LETTER AEN A641 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER ZEMLYA A643 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER DZELO A645 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER REVERSED DZE A647 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER IOTA A649 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER DJERV A64B ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER MONOGRAPH UK A64D ; 
Changes_When_Uppercased # L& CYRILLIC SMALL LETTER BROAD OMEGA A64F ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER NEUTRAL YER A651 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER YERU WITH BACK YER A653 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER IOTIFIED YAT A655 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER REVERSED YU A657 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER IOTIFIED A A659 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER CLOSED LITTLE YUS A65B ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER BLENDED YUS A65D ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER IOTIFIED CLOSED LITTLE YUS A65F ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER YN A661 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER REVERSED TSE A663 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER SOFT DE A665 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER SOFT EL A667 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER SOFT EM A669 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER MONOCULAR O A66B ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER BINOCULAR O A66D ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER DOUBLE MONOCULAR O A681 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER DWE A683 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER DZWE A685 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER ZHWE A687 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER CCHE A689 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER DZZE A68B ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER TE WITH MIDDLE HOOK A68D ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER TWE A68F ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER TSWE A691 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER TSSE A693 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER TCHE A695 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER HWE A697 ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER SHWE A699 ; Changes_When_Uppercased # 
L& CYRILLIC SMALL LETTER DOUBLE O A69B ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER CROSSED O A723 ; Changes_When_Uppercased # L& LATIN SMALL LETTER EGYPTOLOGICAL ALEF A725 ; Changes_When_Uppercased # L& LATIN SMALL LETTER EGYPTOLOGICAL AIN A727 ; Changes_When_Uppercased # L& LATIN SMALL LETTER HENG A729 ; Changes_When_Uppercased # L& LATIN SMALL LETTER TZ A72B ; Changes_When_Uppercased # L& LATIN SMALL LETTER TRESILLO A72D ; Changes_When_Uppercased # L& LATIN SMALL LETTER CUATRILLO A72F ; Changes_When_Uppercased # L& LATIN SMALL LETTER CUATRILLO WITH COMMA A733 ; Changes_When_Uppercased # L& LATIN SMALL LETTER AA A735 ; Changes_When_Uppercased # L& LATIN SMALL LETTER AO A737 ; Changes_When_Uppercased # L& LATIN SMALL LETTER AU A739 ; Changes_When_Uppercased # L& LATIN SMALL LETTER AV A73B ; Changes_When_Uppercased # L& LATIN SMALL LETTER AV WITH HORIZONTAL BAR A73D ; Changes_When_Uppercased # L& LATIN SMALL LETTER AY A73F ; Changes_When_Uppercased # L& LATIN SMALL LETTER REVERSED C WITH DOT A741 ; Changes_When_Uppercased # L& LATIN SMALL LETTER K WITH STROKE A743 ; Changes_When_Uppercased # L& LATIN SMALL LETTER K WITH DIAGONAL STROKE A745 ; Changes_When_Uppercased # L& LATIN SMALL LETTER K WITH STROKE AND DIAGONAL STROKE A747 ; Changes_When_Uppercased # L& LATIN SMALL LETTER BROKEN L A749 ; Changes_When_Uppercased # L& LATIN SMALL LETTER L WITH HIGH STROKE A74B ; Changes_When_Uppercased # L& LATIN SMALL LETTER O WITH LONG STROKE OVERLAY A74D ; Changes_When_Uppercased # L& LATIN SMALL LETTER O WITH LOOP A74F ; Changes_When_Uppercased # L& LATIN SMALL LETTER OO A751 ; Changes_When_Uppercased # L& LATIN SMALL LETTER P WITH STROKE THROUGH DESCENDER A753 ; Changes_When_Uppercased # L& LATIN SMALL LETTER P WITH FLOURISH A755 ; Changes_When_Uppercased # L& LATIN SMALL LETTER P WITH SQUIRREL TAIL A757 ; Changes_When_Uppercased # L& LATIN SMALL LETTER Q WITH STROKE THROUGH DESCENDER A759 ; Changes_When_Uppercased # L& LATIN SMALL LETTER Q WITH DIAGONAL STROKE A75B 
; Changes_When_Uppercased # L& LATIN SMALL LETTER R ROTUNDA A75D ; Changes_When_Uppercased # L& LATIN SMALL LETTER RUM ROTUNDA A75F ; Changes_When_Uppercased # L& LATIN SMALL LETTER V WITH DIAGONAL STROKE A761 ; Changes_When_Uppercased # L& LATIN SMALL LETTER VY A763 ; Changes_When_Uppercased # L& LATIN SMALL LETTER VISIGOTHIC Z A765 ; Changes_When_Uppercased # L& LATIN SMALL LETTER THORN WITH STROKE A767 ; Changes_When_Uppercased # L& LATIN SMALL LETTER THORN WITH STROKE THROUGH DESCENDER A769 ; Changes_When_Uppercased # L& LATIN SMALL LETTER VEND A76B ; Changes_When_Uppercased # L& LATIN SMALL LETTER ET A76D ; Changes_When_Uppercased # L& LATIN SMALL LETTER IS A76F ; Changes_When_Uppercased # L& LATIN SMALL LETTER CON A77A ; Changes_When_Uppercased # L& LATIN SMALL LETTER INSULAR D A77C ; Changes_When_Uppercased # L& LATIN SMALL LETTER INSULAR F A77F ; Changes_When_Uppercased # L& LATIN SMALL LETTER TURNED INSULAR G A781 ; Changes_When_Uppercased # L& LATIN SMALL LETTER TURNED L A783 ; Changes_When_Uppercased # L& LATIN SMALL LETTER INSULAR R A785 ; Changes_When_Uppercased # L& LATIN SMALL LETTER INSULAR S A787 ; Changes_When_Uppercased # L& LATIN SMALL LETTER INSULAR T A78C ; Changes_When_Uppercased # L& LATIN SMALL LETTER SALTILLO A791 ; Changes_When_Uppercased # L& LATIN SMALL LETTER N WITH DESCENDER A793..A794 ; Changes_When_Uppercased # L& [2] LATIN SMALL LETTER C WITH BAR..LATIN SMALL LETTER C WITH PALATAL HOOK A797 ; Changes_When_Uppercased # L& LATIN SMALL LETTER B WITH FLOURISH A799 ; Changes_When_Uppercased # L& LATIN SMALL LETTER F WITH STROKE A79B ; Changes_When_Uppercased # L& LATIN SMALL LETTER VOLAPUK AE A79D ; Changes_When_Uppercased # L& LATIN SMALL LETTER VOLAPUK OE A79F ; Changes_When_Uppercased # L& LATIN SMALL LETTER VOLAPUK UE A7A1 ; Changes_When_Uppercased # L& LATIN SMALL LETTER G WITH OBLIQUE STROKE A7A3 ; Changes_When_Uppercased # L& LATIN SMALL LETTER K WITH OBLIQUE STROKE A7A5 ; Changes_When_Uppercased # L& LATIN SMALL LETTER N WITH 
OBLIQUE STROKE A7A7 ; Changes_When_Uppercased # L& LATIN SMALL LETTER R WITH OBLIQUE STROKE A7A9 ; Changes_When_Uppercased # L& LATIN SMALL LETTER S WITH OBLIQUE STROKE A7B5 ; Changes_When_Uppercased # L& LATIN SMALL LETTER BETA A7B7 ; Changes_When_Uppercased # L& LATIN SMALL LETTER OMEGA A7B9 ; Changes_When_Uppercased # L& LATIN SMALL LETTER U WITH STROKE A7BB ; Changes_When_Uppercased # L& LATIN SMALL LETTER GLOTTAL A A7BD ; Changes_When_Uppercased # L& LATIN SMALL LETTER GLOTTAL I A7BF ; Changes_When_Uppercased # L& LATIN SMALL LETTER GLOTTAL U A7C1 ; Changes_When_Uppercased # L& LATIN SMALL LETTER OLD POLISH O A7C3 ; Changes_When_Uppercased # L& LATIN SMALL LETTER ANGLICANA W A7C8 ; Changes_When_Uppercased # L& LATIN SMALL LETTER D WITH SHORT STROKE OVERLAY A7CA ; Changes_When_Uppercased # L& LATIN SMALL LETTER S WITH SHORT STROKE OVERLAY A7CD ; Changes_When_Uppercased # L& LATIN SMALL LETTER S WITH DIAGONAL STROKE A7CF ; Changes_When_Uppercased # L& LATIN SMALL LETTER PHARYNGEAL VOICED FRICATIVE A7D1 ; Changes_When_Uppercased # L& LATIN SMALL LETTER CLOSED INSULAR G A7D3 ; Changes_When_Uppercased # L& LATIN SMALL LETTER DOUBLE THORN A7D5 ; Changes_When_Uppercased # L& LATIN SMALL LETTER DOUBLE WYNN A7D7 ; Changes_When_Uppercased # L& LATIN SMALL LETTER MIDDLE SCOTS S A7D9 ; Changes_When_Uppercased # L& LATIN SMALL LETTER SIGMOID S A7DB ; Changes_When_Uppercased # L& LATIN SMALL LETTER LAMBDA A7F6 ; Changes_When_Uppercased # L& LATIN SMALL LETTER REVERSED HALF H AB53 ; Changes_When_Uppercased # L& LATIN SMALL LETTER CHI AB70..ABBF ; Changes_When_Uppercased # L& [80] CHEROKEE SMALL LETTER A..CHEROKEE SMALL LETTER YA FB00..FB06 ; Changes_When_Uppercased # L& [7] LATIN SMALL LIGATURE FF..LATIN SMALL LIGATURE ST FB13..FB17 ; Changes_When_Uppercased # L& [5] ARMENIAN SMALL LIGATURE MEN NOW..ARMENIAN SMALL LIGATURE MEN XEH FF41..FF5A ; Changes_When_Uppercased # L& [26] FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH LATIN SMALL LETTER Z 10428..1044F ; 
Changes_When_Uppercased # L& [40] DESERET SMALL LETTER LONG I..DESERET SMALL LETTER EW 104D8..104FB ; Changes_When_Uppercased # L& [36] OSAGE SMALL LETTER A..OSAGE SMALL LETTER ZHA 10597..105A1 ; Changes_When_Uppercased # L& [11] VITHKUQI SMALL LETTER A..VITHKUQI SMALL LETTER GA 105A3..105B1 ; Changes_When_Uppercased # L& [15] VITHKUQI SMALL LETTER HA..VITHKUQI SMALL LETTER RE 105B3..105B9 ; Changes_When_Uppercased # L& [7] VITHKUQI SMALL LETTER SE..VITHKUQI SMALL LETTER XE 105BB..105BC ; Changes_When_Uppercased # L& [2] VITHKUQI SMALL LETTER Y..VITHKUQI SMALL LETTER ZE 10CC0..10CF2 ; Changes_When_Uppercased # L& [51] OLD HUNGARIAN SMALL LETTER A..OLD HUNGARIAN SMALL LETTER US 10D70..10D85 ; Changes_When_Uppercased # L& [22] GARAY SMALL LETTER A..GARAY SMALL LETTER OLD NA 118C0..118DF ; Changes_When_Uppercased # L& [32] WARANG CITI SMALL LETTER NGAA..WARANG CITI SMALL LETTER VIYO 16E60..16E7F ; Changes_When_Uppercased # L& [32] MEDEFAIDRIN SMALL LETTER M..MEDEFAIDRIN SMALL LETTER Y 16EBB..16ED3 ; Changes_When_Uppercased # L& [25] BERIA ERFE SMALL LETTER ARKAB..BERIA ERFE SMALL LETTER AY 1E922..1E943 ; Changes_When_Uppercased # L& [34] ADLAM SMALL LETTER ALIF..ADLAM SMALL LETTER SHA # Total code points: 1580 # ================================================ # Derived Property: Changes_When_Titlecased (CWT) # Characters whose normalized forms are not stable under a toTitlecase mapping. # For more information, see the definition of "isTitlecase(X)" # in the "Conformance" / "Default Case Algorithms" section of the core specification. 
# Changes_When_Titlecased(X) is true when toTitlecase(toNFD(X)) != toNFD(X) 0061..007A ; Changes_When_Titlecased # L& [26] LATIN SMALL LETTER A..LATIN SMALL LETTER Z 00B5 ; Changes_When_Titlecased # L& MICRO SIGN 00DF..00F6 ; Changes_When_Titlecased # L& [24] LATIN SMALL LETTER SHARP S..LATIN SMALL LETTER O WITH DIAERESIS 00F8..00FF ; Changes_When_Titlecased # L& [8] LATIN SMALL LETTER O WITH STROKE..LATIN SMALL LETTER Y WITH DIAERESIS 0101 ; Changes_When_Titlecased # L& LATIN SMALL LETTER A WITH MACRON 0103 ; Changes_When_Titlecased # L& LATIN SMALL LETTER A WITH BREVE 0105 ; Changes_When_Titlecased # L& LATIN SMALL LETTER A WITH OGONEK 0107 ; Changes_When_Titlecased # L& LATIN SMALL LETTER C WITH ACUTE 0109 ; Changes_When_Titlecased # L& LATIN SMALL LETTER C WITH CIRCUMFLEX 010B ; Changes_When_Titlecased # L& LATIN SMALL LETTER C WITH DOT ABOVE 010D ; Changes_When_Titlecased # L& LATIN SMALL LETTER C WITH CARON 010F ; Changes_When_Titlecased # L& LATIN SMALL LETTER D WITH CARON 0111 ; Changes_When_Titlecased # L& LATIN SMALL LETTER D WITH STROKE 0113 ; Changes_When_Titlecased # L& LATIN SMALL LETTER E WITH MACRON 0115 ; Changes_When_Titlecased # L& LATIN SMALL LETTER E WITH BREVE 0117 ; Changes_When_Titlecased # L& LATIN SMALL LETTER E WITH DOT ABOVE 0119 ; Changes_When_Titlecased # L& LATIN SMALL LETTER E WITH OGONEK 011B ; Changes_When_Titlecased # L& LATIN SMALL LETTER E WITH CARON 011D ; Changes_When_Titlecased # L& LATIN SMALL LETTER G WITH CIRCUMFLEX 011F ; Changes_When_Titlecased # L& LATIN SMALL LETTER G WITH BREVE 0121 ; Changes_When_Titlecased # L& LATIN SMALL LETTER G WITH DOT ABOVE 0123 ; Changes_When_Titlecased # L& LATIN SMALL LETTER G WITH CEDILLA 0125 ; Changes_When_Titlecased # L& LATIN SMALL LETTER H WITH CIRCUMFLEX 0127 ; Changes_When_Titlecased # L& LATIN SMALL LETTER H WITH STROKE 0129 ; Changes_When_Titlecased # L& LATIN SMALL LETTER I WITH TILDE 012B ; Changes_When_Titlecased # L& LATIN SMALL LETTER I WITH MACRON 012D ; 
Changes_When_Titlecased # L& LATIN SMALL LETTER I WITH BREVE 012F ; Changes_When_Titlecased # L& LATIN SMALL LETTER I WITH OGONEK 0131 ; Changes_When_Titlecased # L& LATIN SMALL LETTER DOTLESS I 0133 ; Changes_When_Titlecased # L& LATIN SMALL LIGATURE IJ 0135 ; Changes_When_Titlecased # L& LATIN SMALL LETTER J WITH CIRCUMFLEX 0137 ; Changes_When_Titlecased # L& LATIN SMALL LETTER K WITH CEDILLA 013A ; Changes_When_Titlecased # L& LATIN SMALL LETTER L WITH ACUTE 013C ; Changes_When_Titlecased # L& LATIN SMALL LETTER L WITH CEDILLA 013E ; Changes_When_Titlecased # L& LATIN SMALL LETTER L WITH CARON 0140 ; Changes_When_Titlecased # L& LATIN SMALL LETTER L WITH MIDDLE DOT 0142 ; Changes_When_Titlecased # L& LATIN SMALL LETTER L WITH STROKE 0144 ; Changes_When_Titlecased # L& LATIN SMALL LETTER N WITH ACUTE 0146 ; Changes_When_Titlecased # L& LATIN SMALL LETTER N WITH CEDILLA 0148..0149 ; Changes_When_Titlecased # L& [2] LATIN SMALL LETTER N WITH CARON..LATIN SMALL LETTER N PRECEDED BY APOSTROPHE 014B ; Changes_When_Titlecased # L& LATIN SMALL LETTER ENG 014D ; Changes_When_Titlecased # L& LATIN SMALL LETTER O WITH MACRON 014F ; Changes_When_Titlecased # L& LATIN SMALL LETTER O WITH BREVE 0151 ; Changes_When_Titlecased # L& LATIN SMALL LETTER O WITH DOUBLE ACUTE 0153 ; Changes_When_Titlecased # L& LATIN SMALL LIGATURE OE 0155 ; Changes_When_Titlecased # L& LATIN SMALL LETTER R WITH ACUTE 0157 ; Changes_When_Titlecased # L& LATIN SMALL LETTER R WITH CEDILLA 0159 ; Changes_When_Titlecased # L& LATIN SMALL LETTER R WITH CARON 015B ; Changes_When_Titlecased # L& LATIN SMALL LETTER S WITH ACUTE 015D ; Changes_When_Titlecased # L& LATIN SMALL LETTER S WITH CIRCUMFLEX 015F ; Changes_When_Titlecased # L& LATIN SMALL LETTER S WITH CEDILLA 0161 ; Changes_When_Titlecased # L& LATIN SMALL LETTER S WITH CARON 0163 ; Changes_When_Titlecased # L& LATIN SMALL LETTER T WITH CEDILLA 0165 ; Changes_When_Titlecased # L& LATIN SMALL LETTER T WITH CARON 0167 ; Changes_When_Titlecased # L& 
LATIN SMALL LETTER T WITH STROKE 0169 ; Changes_When_Titlecased # L& LATIN SMALL LETTER U WITH TILDE 016B ; Changes_When_Titlecased # L& LATIN SMALL LETTER U WITH MACRON 016D ; Changes_When_Titlecased # L& LATIN SMALL LETTER U WITH BREVE 016F ; Changes_When_Titlecased # L& LATIN SMALL LETTER U WITH RING ABOVE 0171 ; Changes_When_Titlecased # L& LATIN SMALL LETTER U WITH DOUBLE ACUTE 0173 ; Changes_When_Titlecased # L& LATIN SMALL LETTER U WITH OGONEK 0175 ; Changes_When_Titlecased # L& LATIN SMALL LETTER W WITH CIRCUMFLEX 0177 ; Changes_When_Titlecased # L& LATIN SMALL LETTER Y WITH CIRCUMFLEX 017A ; Changes_When_Titlecased # L& LATIN SMALL LETTER Z WITH ACUTE 017C ; Changes_When_Titlecased # L& LATIN SMALL LETTER Z WITH DOT ABOVE 017E..0180 ; Changes_When_Titlecased # L& [3] LATIN SMALL LETTER Z WITH CARON..LATIN SMALL LETTER B WITH STROKE 0183 ; Changes_When_Titlecased # L& LATIN SMALL LETTER B WITH TOPBAR 0185 ; Changes_When_Titlecased # L& LATIN SMALL LETTER TONE SIX 0188 ; Changes_When_Titlecased # L& LATIN SMALL LETTER C WITH HOOK 018C ; Changes_When_Titlecased # L& LATIN SMALL LETTER D WITH TOPBAR 0192 ; Changes_When_Titlecased # L& LATIN SMALL LETTER F WITH HOOK 0195 ; Changes_When_Titlecased # L& LATIN SMALL LETTER HV 0199..019B ; Changes_When_Titlecased # L& [3] LATIN SMALL LETTER K WITH HOOK..LATIN SMALL LETTER LAMBDA WITH STROKE 019E ; Changes_When_Titlecased # L& LATIN SMALL LETTER N WITH LONG RIGHT LEG 01A1 ; Changes_When_Titlecased # L& LATIN SMALL LETTER O WITH HORN 01A3 ; Changes_When_Titlecased # L& LATIN SMALL LETTER OI 01A5 ; Changes_When_Titlecased # L& LATIN SMALL LETTER P WITH HOOK 01A8 ; Changes_When_Titlecased # L& LATIN SMALL LETTER TONE TWO 01AD ; Changes_When_Titlecased # L& LATIN SMALL LETTER T WITH HOOK 01B0 ; Changes_When_Titlecased # L& LATIN SMALL LETTER U WITH HORN 01B4 ; Changes_When_Titlecased # L& LATIN SMALL LETTER Y WITH HOOK 01B6 ; Changes_When_Titlecased # L& LATIN SMALL LETTER Z WITH STROKE 01B9 ; Changes_When_Titlecased # 
L& LATIN SMALL LETTER EZH REVERSED 01BD ; Changes_When_Titlecased # L& LATIN SMALL LETTER TONE FIVE 01BF ; Changes_When_Titlecased # L& LATIN LETTER WYNN 01C4 ; Changes_When_Titlecased # L& LATIN CAPITAL LETTER DZ WITH CARON 01C6..01C7 ; Changes_When_Titlecased # L& [2] LATIN SMALL LETTER DZ WITH CARON..LATIN CAPITAL LETTER LJ 01C9..01CA ; Changes_When_Titlecased # L& [2] LATIN SMALL LETTER LJ..LATIN CAPITAL LETTER NJ 01CC ; Changes_When_Titlecased # L& LATIN SMALL LETTER NJ 01CE ; Changes_When_Titlecased # L& LATIN SMALL LETTER A WITH CARON 01D0 ; Changes_When_Titlecased # L& LATIN SMALL LETTER I WITH CARON 01D2 ; Changes_When_Titlecased # L& LATIN SMALL LETTER O WITH CARON 01D4 ; Changes_When_Titlecased # L& LATIN SMALL LETTER U WITH CARON 01D6 ; Changes_When_Titlecased # L& LATIN SMALL LETTER U WITH DIAERESIS AND MACRON 01D8 ; Changes_When_Titlecased # L& LATIN SMALL LETTER U WITH DIAERESIS AND ACUTE 01DA ; Changes_When_Titlecased # L& LATIN SMALL LETTER U WITH DIAERESIS AND CARON 01DC..01DD ; Changes_When_Titlecased # L& [2] LATIN SMALL LETTER U WITH DIAERESIS AND GRAVE..LATIN SMALL LETTER TURNED E 01DF ; Changes_When_Titlecased # L& LATIN SMALL LETTER A WITH DIAERESIS AND MACRON 01E1 ; Changes_When_Titlecased # L& LATIN SMALL LETTER A WITH DOT ABOVE AND MACRON 01E3 ; Changes_When_Titlecased # L& LATIN SMALL LETTER AE WITH MACRON 01E5 ; Changes_When_Titlecased # L& LATIN SMALL LETTER G WITH STROKE 01E7 ; Changes_When_Titlecased # L& LATIN SMALL LETTER G WITH CARON 01E9 ; Changes_When_Titlecased # L& LATIN SMALL LETTER K WITH CARON 01EB ; Changes_When_Titlecased # L& LATIN SMALL LETTER O WITH OGONEK 01ED ; Changes_When_Titlecased # L& LATIN SMALL LETTER O WITH OGONEK AND MACRON 01EF..01F1 ; Changes_When_Titlecased # L& [3] LATIN SMALL LETTER EZH WITH CARON..LATIN CAPITAL LETTER DZ 01F3 ; Changes_When_Titlecased # L& LATIN SMALL LETTER DZ 01F5 ; Changes_When_Titlecased # L& LATIN SMALL LETTER G WITH ACUTE 01F9 ; Changes_When_Titlecased # L& LATIN SMALL LETTER N 
WITH GRAVE 01FB ; Changes_When_Titlecased # L& LATIN SMALL LETTER A WITH RING ABOVE AND ACUTE 01FD ; Changes_When_Titlecased # L& LATIN SMALL LETTER AE WITH ACUTE 01FF ; Changes_When_Titlecased # L& LATIN SMALL LETTER O WITH STROKE AND ACUTE 0201 ; Changes_When_Titlecased # L& LATIN SMALL LETTER A WITH DOUBLE GRAVE 0203 ; Changes_When_Titlecased # L& LATIN SMALL LETTER A WITH INVERTED BREVE 0205 ; Changes_When_Titlecased # L& LATIN SMALL LETTER E WITH DOUBLE GRAVE 0207 ; Changes_When_Titlecased # L& LATIN SMALL LETTER E WITH INVERTED BREVE 0209 ; Changes_When_Titlecased # L& LATIN SMALL LETTER I WITH DOUBLE GRAVE 020B ; Changes_When_Titlecased # L& LATIN SMALL LETTER I WITH INVERTED BREVE 020D ; Changes_When_Titlecased # L& LATIN SMALL LETTER O WITH DOUBLE GRAVE 020F ; Changes_When_Titlecased # L& LATIN SMALL LETTER O WITH INVERTED BREVE 0211 ; Changes_When_Titlecased # L& LATIN SMALL LETTER R WITH DOUBLE GRAVE 0213 ; Changes_When_Titlecased # L& LATIN SMALL LETTER R WITH INVERTED BREVE 0215 ; Changes_When_Titlecased # L& LATIN SMALL LETTER U WITH DOUBLE GRAVE 0217 ; Changes_When_Titlecased # L& LATIN SMALL LETTER U WITH INVERTED BREVE 0219 ; Changes_When_Titlecased # L& LATIN SMALL LETTER S WITH COMMA BELOW 021B ; Changes_When_Titlecased # L& LATIN SMALL LETTER T WITH COMMA BELOW 021D ; Changes_When_Titlecased # L& LATIN SMALL LETTER YOGH 021F ; Changes_When_Titlecased # L& LATIN SMALL LETTER H WITH CARON 0223 ; Changes_When_Titlecased # L& LATIN SMALL LETTER OU 0225 ; Changes_When_Titlecased # L& LATIN SMALL LETTER Z WITH HOOK 0227 ; Changes_When_Titlecased # L& LATIN SMALL LETTER A WITH DOT ABOVE 0229 ; Changes_When_Titlecased # L& LATIN SMALL LETTER E WITH CEDILLA 022B ; Changes_When_Titlecased # L& LATIN SMALL LETTER O WITH DIAERESIS AND MACRON 022D ; Changes_When_Titlecased # L& LATIN SMALL LETTER O WITH TILDE AND MACRON 022F ; Changes_When_Titlecased # L& LATIN SMALL LETTER O WITH DOT ABOVE 0231 ; Changes_When_Titlecased # L& LATIN SMALL LETTER O WITH DOT 
ABOVE AND MACRON 0233 ; Changes_When_Titlecased # L& LATIN SMALL LETTER Y WITH MACRON 023C ; Changes_When_Titlecased # L& LATIN SMALL LETTER C WITH STROKE 023F..0240 ; Changes_When_Titlecased # L& [2] LATIN SMALL LETTER S WITH SWASH TAIL..LATIN SMALL LETTER Z WITH SWASH TAIL 0242 ; Changes_When_Titlecased # L& LATIN SMALL LETTER GLOTTAL STOP 0247 ; Changes_When_Titlecased # L& LATIN SMALL LETTER E WITH STROKE 0249 ; Changes_When_Titlecased # L& LATIN SMALL LETTER J WITH STROKE 024B ; Changes_When_Titlecased # L& LATIN SMALL LETTER Q WITH HOOK TAIL 024D ; Changes_When_Titlecased # L& LATIN SMALL LETTER R WITH STROKE 024F..0254 ; Changes_When_Titlecased # L& [6] LATIN SMALL LETTER Y WITH STROKE..LATIN SMALL LETTER OPEN O 0256..0257 ; Changes_When_Titlecased # L& [2] LATIN SMALL LETTER D WITH TAIL..LATIN SMALL LETTER D WITH HOOK 0259 ; Changes_When_Titlecased # L& LATIN SMALL LETTER SCHWA 025B..025C ; Changes_When_Titlecased # L& [2] LATIN SMALL LETTER OPEN E..LATIN SMALL LETTER REVERSED OPEN E 0260..0261 ; Changes_When_Titlecased # L& [2] LATIN SMALL LETTER G WITH HOOK..LATIN SMALL LETTER SCRIPT G 0263..0266 ; Changes_When_Titlecased # L& [4] LATIN SMALL LETTER GAMMA..LATIN SMALL LETTER H WITH HOOK 0268..026C ; Changes_When_Titlecased # L& [5] LATIN SMALL LETTER I WITH STROKE..LATIN SMALL LETTER L WITH BELT 026F ; Changes_When_Titlecased # L& LATIN SMALL LETTER TURNED M 0271..0272 ; Changes_When_Titlecased # L& [2] LATIN SMALL LETTER M WITH HOOK..LATIN SMALL LETTER N WITH LEFT HOOK 0275 ; Changes_When_Titlecased # L& LATIN SMALL LETTER BARRED O 027D ; Changes_When_Titlecased # L& LATIN SMALL LETTER R WITH TAIL 0280 ; Changes_When_Titlecased # L& LATIN LETTER SMALL CAPITAL R 0282..0283 ; Changes_When_Titlecased # L& [2] LATIN SMALL LETTER S WITH HOOK..LATIN SMALL LETTER ESH 0287..028C ; Changes_When_Titlecased # L& [6] LATIN SMALL LETTER TURNED T..LATIN SMALL LETTER TURNED V 0292 ; Changes_When_Titlecased # L& LATIN SMALL LETTER EZH 029D..029E ; 
Changes_When_Titlecased # L& [2] LATIN SMALL LETTER J WITH CROSSED-TAIL..LATIN SMALL LETTER TURNED K 0345 ; Changes_When_Titlecased # Mn COMBINING GREEK YPOGEGRAMMENI 0371 ; Changes_When_Titlecased # L& GREEK SMALL LETTER HETA 0373 ; Changes_When_Titlecased # L& GREEK SMALL LETTER ARCHAIC SAMPI 0377 ; Changes_When_Titlecased # L& GREEK SMALL LETTER PAMPHYLIAN DIGAMMA 037B..037D ; Changes_When_Titlecased # L& [3] GREEK SMALL REVERSED LUNATE SIGMA SYMBOL..GREEK SMALL REVERSED DOTTED LUNATE SIGMA SYMBOL 0390 ; Changes_When_Titlecased # L& GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS 03AC..03CE ; Changes_When_Titlecased # L& [35] GREEK SMALL LETTER ALPHA WITH TONOS..GREEK SMALL LETTER OMEGA WITH TONOS 03D0..03D1 ; Changes_When_Titlecased # L& [2] GREEK BETA SYMBOL..GREEK THETA SYMBOL 03D5..03D7 ; Changes_When_Titlecased # L& [3] GREEK PHI SYMBOL..GREEK KAI SYMBOL 03D9 ; Changes_When_Titlecased # L& GREEK SMALL LETTER ARCHAIC KOPPA 03DB ; Changes_When_Titlecased # L& GREEK SMALL LETTER STIGMA 03DD ; Changes_When_Titlecased # L& GREEK SMALL LETTER DIGAMMA 03DF ; Changes_When_Titlecased # L& GREEK SMALL LETTER KOPPA 03E1 ; Changes_When_Titlecased # L& GREEK SMALL LETTER SAMPI 03E3 ; Changes_When_Titlecased # L& COPTIC SMALL LETTER SHEI 03E5 ; Changes_When_Titlecased # L& COPTIC SMALL LETTER FEI 03E7 ; Changes_When_Titlecased # L& COPTIC SMALL LETTER KHEI 03E9 ; Changes_When_Titlecased # L& COPTIC SMALL LETTER HORI 03EB ; Changes_When_Titlecased # L& COPTIC SMALL LETTER GANGIA 03ED ; Changes_When_Titlecased # L& COPTIC SMALL LETTER SHIMA 03EF..03F3 ; Changes_When_Titlecased # L& [5] COPTIC SMALL LETTER DEI..GREEK LETTER YOT 03F5 ; Changes_When_Titlecased # L& GREEK LUNATE EPSILON SYMBOL 03F8 ; Changes_When_Titlecased # L& GREEK SMALL LETTER SHO 03FB ; Changes_When_Titlecased # L& GREEK SMALL LETTER SAN 0430..045F ; Changes_When_Titlecased # L& [48] CYRILLIC SMALL LETTER A..CYRILLIC SMALL LETTER DZHE 0461 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER OMEGA 0463 ; 
Changes_When_Titlecased # L& CYRILLIC SMALL LETTER YAT 0465 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER IOTIFIED E 0467 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER LITTLE YUS 0469 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER IOTIFIED LITTLE YUS 046B ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER BIG YUS 046D ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER IOTIFIED BIG YUS 046F ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER KSI 0471 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER PSI 0473 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER FITA 0475 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER IZHITSA 0477 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER IZHITSA WITH DOUBLE GRAVE ACCENT 0479 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER UK 047B ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER ROUND OMEGA 047D ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER OMEGA WITH TITLO 047F ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER OT 0481 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER KOPPA 048B ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER SHORT I WITH TAIL 048D ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER SEMISOFT SIGN 048F ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER ER WITH TICK 0491 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER GHE WITH UPTURN 0493 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER GHE WITH STROKE 0495 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER GHE WITH MIDDLE HOOK 0497 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER ZHE WITH DESCENDER 0499 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER ZE WITH DESCENDER 049B ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER KA WITH DESCENDER 049D ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER KA WITH VERTICAL STROKE 049F ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER KA WITH STROKE 04A1 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER BASHKIR KA 04A3 ; 
Changes_When_Titlecased # L& CYRILLIC SMALL LETTER EN WITH DESCENDER 04A5 ; Changes_When_Titlecased # L& CYRILLIC SMALL LIGATURE EN GHE 04A7 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER PE WITH MIDDLE HOOK 04A9 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER ABKHASIAN HA 04AB ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER ES WITH DESCENDER 04AD ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER TE WITH DESCENDER 04AF ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER STRAIGHT U 04B1 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER STRAIGHT U WITH STROKE 04B3 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER HA WITH DESCENDER 04B5 ; Changes_When_Titlecased # L& CYRILLIC SMALL LIGATURE TE TSE 04B7 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER CHE WITH DESCENDER 04B9 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER CHE WITH VERTICAL STROKE 04BB ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER SHHA 04BD ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER ABKHASIAN CHE 04BF ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER ABKHASIAN CHE WITH DESCENDER 04C2 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER ZHE WITH BREVE 04C4 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER KA WITH HOOK 04C6 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER EL WITH TAIL 04C8 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER EN WITH HOOK 04CA ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER EN WITH TAIL 04CC ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER KHAKASSIAN CHE 04CE..04CF ; Changes_When_Titlecased # L& [2] CYRILLIC SMALL LETTER EM WITH TAIL..CYRILLIC SMALL LETTER PALOCHKA 04D1 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER A WITH BREVE 04D3 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER A WITH DIAERESIS 04D5 ; Changes_When_Titlecased # L& CYRILLIC SMALL LIGATURE A IE 04D7 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER IE WITH BREVE 04D9 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER 
SCHWA 04DB ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER SCHWA WITH DIAERESIS 04DD ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER ZHE WITH DIAERESIS 04DF ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER ZE WITH DIAERESIS 04E1 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER ABKHASIAN DZE 04E3 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER I WITH MACRON 04E5 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER I WITH DIAERESIS 04E7 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER O WITH DIAERESIS 04E9 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER BARRED O 04EB ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER BARRED O WITH DIAERESIS 04ED ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER E WITH DIAERESIS 04EF ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER U WITH MACRON 04F1 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER U WITH DIAERESIS 04F3 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER U WITH DOUBLE ACUTE 04F5 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER CHE WITH DIAERESIS 04F7 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER GHE WITH DESCENDER 04F9 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER YERU WITH DIAERESIS 04FB ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER GHE WITH STROKE AND HOOK 04FD ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER HA WITH HOOK 04FF ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER HA WITH STROKE 0501 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER KOMI DE 0503 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER KOMI DJE 0505 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER KOMI ZJE 0507 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER KOMI DZJE 0509 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER KOMI LJE 050B ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER KOMI NJE 050D ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER KOMI SJE 050F ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER KOMI TJE 0511 ; 
Changes_When_Titlecased # L& CYRILLIC SMALL LETTER REVERSED ZE 0513 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER EL WITH HOOK 0515 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER LHA 0517 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER RHA 0519 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER YAE 051B ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER QA 051D ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER WE 051F ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER ALEUT KA 0521 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER EL WITH MIDDLE HOOK 0523 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER EN WITH MIDDLE HOOK 0525 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER PE WITH DESCENDER 0527 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER SHHA WITH DESCENDER 0529 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER EN WITH LEFT HOOK 052B ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER DZZHE 052D ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER DCHE 052F ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER EL WITH DESCENDER 0561..0587 ; Changes_When_Titlecased # L& [39] ARMENIAN SMALL LETTER AYB..ARMENIAN SMALL LIGATURE ECH YIWN 13F8..13FD ; Changes_When_Titlecased # L& [6] CHEROKEE SMALL LETTER YE..CHEROKEE SMALL LETTER MV 1C80..1C88 ; Changes_When_Titlecased # L& [9] CYRILLIC SMALL LETTER ROUNDED VE..CYRILLIC SMALL LETTER UNBLENDED UK 1C8A ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER TJE 1D79 ; Changes_When_Titlecased # L& LATIN SMALL LETTER INSULAR G 1D7D ; Changes_When_Titlecased # L& LATIN SMALL LETTER P WITH STROKE 1D8E ; Changes_When_Titlecased # L& LATIN SMALL LETTER Z WITH PALATAL HOOK 1E01 ; Changes_When_Titlecased # L& LATIN SMALL LETTER A WITH RING BELOW 1E03 ; Changes_When_Titlecased # L& LATIN SMALL LETTER B WITH DOT ABOVE 1E05 ; Changes_When_Titlecased # L& LATIN SMALL LETTER B WITH DOT BELOW 1E07 ; Changes_When_Titlecased # L& LATIN SMALL LETTER B WITH LINE BELOW 1E09 ; 
Changes_When_Titlecased # L& LATIN SMALL LETTER C WITH CEDILLA AND ACUTE 1E0B ; Changes_When_Titlecased # L& LATIN SMALL LETTER D WITH DOT ABOVE 1E0D ; Changes_When_Titlecased # L& LATIN SMALL LETTER D WITH DOT BELOW 1E0F ; Changes_When_Titlecased # L& LATIN SMALL LETTER D WITH LINE BELOW 1E11 ; Changes_When_Titlecased # L& LATIN SMALL LETTER D WITH CEDILLA 1E13 ; Changes_When_Titlecased # L& LATIN SMALL LETTER D WITH CIRCUMFLEX BELOW 1E15 ; Changes_When_Titlecased # L& LATIN SMALL LETTER E WITH MACRON AND GRAVE 1E17 ; Changes_When_Titlecased # L& LATIN SMALL LETTER E WITH MACRON AND ACUTE 1E19 ; Changes_When_Titlecased # L& LATIN SMALL LETTER E WITH CIRCUMFLEX BELOW 1E1B ; Changes_When_Titlecased # L& LATIN SMALL LETTER E WITH TILDE BELOW 1E1D ; Changes_When_Titlecased # L& LATIN SMALL LETTER E WITH CEDILLA AND BREVE 1E1F ; Changes_When_Titlecased # L& LATIN SMALL LETTER F WITH DOT ABOVE 1E21 ; Changes_When_Titlecased # L& LATIN SMALL LETTER G WITH MACRON 1E23 ; Changes_When_Titlecased # L& LATIN SMALL LETTER H WITH DOT ABOVE 1E25 ; Changes_When_Titlecased # L& LATIN SMALL LETTER H WITH DOT BELOW 1E27 ; Changes_When_Titlecased # L& LATIN SMALL LETTER H WITH DIAERESIS 1E29 ; Changes_When_Titlecased # L& LATIN SMALL LETTER H WITH CEDILLA 1E2B ; Changes_When_Titlecased # L& LATIN SMALL LETTER H WITH BREVE BELOW 1E2D ; Changes_When_Titlecased # L& LATIN SMALL LETTER I WITH TILDE BELOW 1E2F ; Changes_When_Titlecased # L& LATIN SMALL LETTER I WITH DIAERESIS AND ACUTE 1E31 ; Changes_When_Titlecased # L& LATIN SMALL LETTER K WITH ACUTE 1E33 ; Changes_When_Titlecased # L& LATIN SMALL LETTER K WITH DOT BELOW 1E35 ; Changes_When_Titlecased # L& LATIN SMALL LETTER K WITH LINE BELOW 1E37 ; Changes_When_Titlecased # L& LATIN SMALL LETTER L WITH DOT BELOW 1E39 ; Changes_When_Titlecased # L& LATIN SMALL LETTER L WITH DOT BELOW AND MACRON 1E3B ; Changes_When_Titlecased # L& LATIN SMALL LETTER L WITH LINE BELOW 1E3D ; Changes_When_Titlecased # L& LATIN SMALL LETTER L WITH 
CIRCUMFLEX BELOW 1E3F ; Changes_When_Titlecased # L& LATIN SMALL LETTER M WITH ACUTE 1E41 ; Changes_When_Titlecased # L& LATIN SMALL LETTER M WITH DOT ABOVE 1E43 ; Changes_When_Titlecased # L& LATIN SMALL LETTER M WITH DOT BELOW 1E45 ; Changes_When_Titlecased # L& LATIN SMALL LETTER N WITH DOT ABOVE 1E47 ; Changes_When_Titlecased # L& LATIN SMALL LETTER N WITH DOT BELOW 1E49 ; Changes_When_Titlecased # L& LATIN SMALL LETTER N WITH LINE BELOW 1E4B ; Changes_When_Titlecased # L& LATIN SMALL LETTER N WITH CIRCUMFLEX BELOW 1E4D ; Changes_When_Titlecased # L& LATIN SMALL LETTER O WITH TILDE AND ACUTE 1E4F ; Changes_When_Titlecased # L& LATIN SMALL LETTER O WITH TILDE AND DIAERESIS 1E51 ; Changes_When_Titlecased # L& LATIN SMALL LETTER O WITH MACRON AND GRAVE 1E53 ; Changes_When_Titlecased # L& LATIN SMALL LETTER O WITH MACRON AND ACUTE 1E55 ; Changes_When_Titlecased # L& LATIN SMALL LETTER P WITH ACUTE 1E57 ; Changes_When_Titlecased # L& LATIN SMALL LETTER P WITH DOT ABOVE 1E59 ; Changes_When_Titlecased # L& LATIN SMALL LETTER R WITH DOT ABOVE 1E5B ; Changes_When_Titlecased # L& LATIN SMALL LETTER R WITH DOT BELOW 1E5D ; Changes_When_Titlecased # L& LATIN SMALL LETTER R WITH DOT BELOW AND MACRON 1E5F ; Changes_When_Titlecased # L& LATIN SMALL LETTER R WITH LINE BELOW 1E61 ; Changes_When_Titlecased # L& LATIN SMALL LETTER S WITH DOT ABOVE 1E63 ; Changes_When_Titlecased # L& LATIN SMALL LETTER S WITH DOT BELOW 1E65 ; Changes_When_Titlecased # L& LATIN SMALL LETTER S WITH ACUTE AND DOT ABOVE 1E67 ; Changes_When_Titlecased # L& LATIN SMALL LETTER S WITH CARON AND DOT ABOVE 1E69 ; Changes_When_Titlecased # L& LATIN SMALL LETTER S WITH DOT BELOW AND DOT ABOVE 1E6B ; Changes_When_Titlecased # L& LATIN SMALL LETTER T WITH DOT ABOVE 1E6D ; Changes_When_Titlecased # L& LATIN SMALL LETTER T WITH DOT BELOW 1E6F ; Changes_When_Titlecased # L& LATIN SMALL LETTER T WITH LINE BELOW 1E71 ; Changes_When_Titlecased # L& LATIN SMALL LETTER T WITH CIRCUMFLEX BELOW 1E73 ; 
Changes_When_Titlecased # L& LATIN SMALL LETTER U WITH DIAERESIS BELOW 1E75 ; Changes_When_Titlecased # L& LATIN SMALL LETTER U WITH TILDE BELOW 1E77 ; Changes_When_Titlecased # L& LATIN SMALL LETTER U WITH CIRCUMFLEX BELOW 1E79 ; Changes_When_Titlecased # L& LATIN SMALL LETTER U WITH TILDE AND ACUTE 1E7B ; Changes_When_Titlecased # L& LATIN SMALL LETTER U WITH MACRON AND DIAERESIS 1E7D ; Changes_When_Titlecased # L& LATIN SMALL LETTER V WITH TILDE 1E7F ; Changes_When_Titlecased # L& LATIN SMALL LETTER V WITH DOT BELOW 1E81 ; Changes_When_Titlecased # L& LATIN SMALL LETTER W WITH GRAVE 1E83 ; Changes_When_Titlecased # L& LATIN SMALL LETTER W WITH ACUTE 1E85 ; Changes_When_Titlecased # L& LATIN SMALL LETTER W WITH DIAERESIS 1E87 ; Changes_When_Titlecased # L& LATIN SMALL LETTER W WITH DOT ABOVE 1E89 ; Changes_When_Titlecased # L& LATIN SMALL LETTER W WITH DOT BELOW 1E8B ; Changes_When_Titlecased # L& LATIN SMALL LETTER X WITH DOT ABOVE 1E8D ; Changes_When_Titlecased # L& LATIN SMALL LETTER X WITH DIAERESIS 1E8F ; Changes_When_Titlecased # L& LATIN SMALL LETTER Y WITH DOT ABOVE 1E91 ; Changes_When_Titlecased # L& LATIN SMALL LETTER Z WITH CIRCUMFLEX 1E93 ; Changes_When_Titlecased # L& LATIN SMALL LETTER Z WITH DOT BELOW 1E95..1E9B ; Changes_When_Titlecased # L& [7] LATIN SMALL LETTER Z WITH LINE BELOW..LATIN SMALL LETTER LONG S WITH DOT ABOVE 1EA1 ; Changes_When_Titlecased # L& LATIN SMALL LETTER A WITH DOT BELOW 1EA3 ; Changes_When_Titlecased # L& LATIN SMALL LETTER A WITH HOOK ABOVE 1EA5 ; Changes_When_Titlecased # L& LATIN SMALL LETTER A WITH CIRCUMFLEX AND ACUTE 1EA7 ; Changes_When_Titlecased # L& LATIN SMALL LETTER A WITH CIRCUMFLEX AND GRAVE 1EA9 ; Changes_When_Titlecased # L& LATIN SMALL LETTER A WITH CIRCUMFLEX AND HOOK ABOVE 1EAB ; Changes_When_Titlecased # L& LATIN SMALL LETTER A WITH CIRCUMFLEX AND TILDE 1EAD ; Changes_When_Titlecased # L& LATIN SMALL LETTER A WITH CIRCUMFLEX AND DOT BELOW 1EAF ; Changes_When_Titlecased # L& LATIN SMALL LETTER A WITH BREVE 
AND ACUTE 1EB1 ; Changes_When_Titlecased # L& LATIN SMALL LETTER A WITH BREVE AND GRAVE 1EB3 ; Changes_When_Titlecased # L& LATIN SMALL LETTER A WITH BREVE AND HOOK ABOVE 1EB5 ; Changes_When_Titlecased # L& LATIN SMALL LETTER A WITH BREVE AND TILDE 1EB7 ; Changes_When_Titlecased # L& LATIN SMALL LETTER A WITH BREVE AND DOT BELOW 1EB9 ; Changes_When_Titlecased # L& LATIN SMALL LETTER E WITH DOT BELOW 1EBB ; Changes_When_Titlecased # L& LATIN SMALL LETTER E WITH HOOK ABOVE 1EBD ; Changes_When_Titlecased # L& LATIN SMALL LETTER E WITH TILDE 1EBF ; Changes_When_Titlecased # L& LATIN SMALL LETTER E WITH CIRCUMFLEX AND ACUTE 1EC1 ; Changes_When_Titlecased # L& LATIN SMALL LETTER E WITH CIRCUMFLEX AND GRAVE 1EC3 ; Changes_When_Titlecased # L& LATIN SMALL LETTER E WITH CIRCUMFLEX AND HOOK ABOVE 1EC5 ; Changes_When_Titlecased # L& LATIN SMALL LETTER E WITH CIRCUMFLEX AND TILDE 1EC7 ; Changes_When_Titlecased # L& LATIN SMALL LETTER E WITH CIRCUMFLEX AND DOT BELOW 1EC9 ; Changes_When_Titlecased # L& LATIN SMALL LETTER I WITH HOOK ABOVE 1ECB ; Changes_When_Titlecased # L& LATIN SMALL LETTER I WITH DOT BELOW 1ECD ; Changes_When_Titlecased # L& LATIN SMALL LETTER O WITH DOT BELOW 1ECF ; Changes_When_Titlecased # L& LATIN SMALL LETTER O WITH HOOK ABOVE 1ED1 ; Changes_When_Titlecased # L& LATIN SMALL LETTER O WITH CIRCUMFLEX AND ACUTE 1ED3 ; Changes_When_Titlecased # L& LATIN SMALL LETTER O WITH CIRCUMFLEX AND GRAVE 1ED5 ; Changes_When_Titlecased # L& LATIN SMALL LETTER O WITH CIRCUMFLEX AND HOOK ABOVE 1ED7 ; Changes_When_Titlecased # L& LATIN SMALL LETTER O WITH CIRCUMFLEX AND TILDE 1ED9 ; Changes_When_Titlecased # L& LATIN SMALL LETTER O WITH CIRCUMFLEX AND DOT BELOW 1EDB ; Changes_When_Titlecased # L& LATIN SMALL LETTER O WITH HORN AND ACUTE 1EDD ; Changes_When_Titlecased # L& LATIN SMALL LETTER O WITH HORN AND GRAVE 1EDF ; Changes_When_Titlecased # L& LATIN SMALL LETTER O WITH HORN AND HOOK ABOVE 1EE1 ; Changes_When_Titlecased # L& LATIN SMALL LETTER O WITH HORN AND TILDE 1EE3 
; Changes_When_Titlecased # L& LATIN SMALL LETTER O WITH HORN AND DOT BELOW 1EE5 ; Changes_When_Titlecased # L& LATIN SMALL LETTER U WITH DOT BELOW 1EE7 ; Changes_When_Titlecased # L& LATIN SMALL LETTER U WITH HOOK ABOVE 1EE9 ; Changes_When_Titlecased # L& LATIN SMALL LETTER U WITH HORN AND ACUTE 1EEB ; Changes_When_Titlecased # L& LATIN SMALL LETTER U WITH HORN AND GRAVE 1EED ; Changes_When_Titlecased # L& LATIN SMALL LETTER U WITH HORN AND HOOK ABOVE 1EEF ; Changes_When_Titlecased # L& LATIN SMALL LETTER U WITH HORN AND TILDE 1EF1 ; Changes_When_Titlecased # L& LATIN SMALL LETTER U WITH HORN AND DOT BELOW 1EF3 ; Changes_When_Titlecased # L& LATIN SMALL LETTER Y WITH GRAVE 1EF5 ; Changes_When_Titlecased # L& LATIN SMALL LETTER Y WITH DOT BELOW 1EF7 ; Changes_When_Titlecased # L& LATIN SMALL LETTER Y WITH HOOK ABOVE 1EF9 ; Changes_When_Titlecased # L& LATIN SMALL LETTER Y WITH TILDE 1EFB ; Changes_When_Titlecased # L& LATIN SMALL LETTER MIDDLE-WELSH LL 1EFD ; Changes_When_Titlecased # L& LATIN SMALL LETTER MIDDLE-WELSH V 1EFF..1F07 ; Changes_When_Titlecased # L& [9] LATIN SMALL LETTER Y WITH LOOP..GREEK SMALL LETTER ALPHA WITH DASIA AND PERISPOMENI 1F10..1F15 ; Changes_When_Titlecased # L& [6] GREEK SMALL LETTER EPSILON WITH PSILI..GREEK SMALL LETTER EPSILON WITH DASIA AND OXIA 1F20..1F27 ; Changes_When_Titlecased # L& [8] GREEK SMALL LETTER ETA WITH PSILI..GREEK SMALL LETTER ETA WITH DASIA AND PERISPOMENI 1F30..1F37 ; Changes_When_Titlecased # L& [8] GREEK SMALL LETTER IOTA WITH PSILI..GREEK SMALL LETTER IOTA WITH DASIA AND PERISPOMENI 1F40..1F45 ; Changes_When_Titlecased # L& [6] GREEK SMALL LETTER OMICRON WITH PSILI..GREEK SMALL LETTER OMICRON WITH DASIA AND OXIA 1F50..1F57 ; Changes_When_Titlecased # L& [8] GREEK SMALL LETTER UPSILON WITH PSILI..GREEK SMALL LETTER UPSILON WITH DASIA AND PERISPOMENI 1F60..1F67 ; Changes_When_Titlecased # L& [8] GREEK SMALL LETTER OMEGA WITH PSILI..GREEK SMALL LETTER OMEGA WITH DASIA AND PERISPOMENI 1F70..1F7D ; 
Changes_When_Titlecased # L& [14] GREEK SMALL LETTER ALPHA WITH VARIA..GREEK SMALL LETTER OMEGA WITH OXIA 1F80..1F87 ; Changes_When_Titlecased # L& [8] GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI..GREEK SMALL LETTER ALPHA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI 1F90..1F97 ; Changes_When_Titlecased # L& [8] GREEK SMALL LETTER ETA WITH PSILI AND YPOGEGRAMMENI..GREEK SMALL LETTER ETA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI 1FA0..1FA7 ; Changes_When_Titlecased # L& [8] GREEK SMALL LETTER OMEGA WITH PSILI AND YPOGEGRAMMENI..GREEK SMALL LETTER OMEGA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI 1FB0..1FB4 ; Changes_When_Titlecased # L& [5] GREEK SMALL LETTER ALPHA WITH VRACHY..GREEK SMALL LETTER ALPHA WITH OXIA AND YPOGEGRAMMENI 1FB6..1FB7 ; Changes_When_Titlecased # L& [2] GREEK SMALL LETTER ALPHA WITH PERISPOMENI..GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI 1FBE ; Changes_When_Titlecased # L& GREEK PROSGEGRAMMENI 1FC2..1FC4 ; Changes_When_Titlecased # L& [3] GREEK SMALL LETTER ETA WITH VARIA AND YPOGEGRAMMENI..GREEK SMALL LETTER ETA WITH OXIA AND YPOGEGRAMMENI 1FC6..1FC7 ; Changes_When_Titlecased # L& [2] GREEK SMALL LETTER ETA WITH PERISPOMENI..GREEK SMALL LETTER ETA WITH PERISPOMENI AND YPOGEGRAMMENI 1FD0..1FD3 ; Changes_When_Titlecased # L& [4] GREEK SMALL LETTER IOTA WITH VRACHY..GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA 1FD6..1FD7 ; Changes_When_Titlecased # L& [2] GREEK SMALL LETTER IOTA WITH PERISPOMENI..GREEK SMALL LETTER IOTA WITH DIALYTIKA AND PERISPOMENI 1FE0..1FE7 ; Changes_When_Titlecased # L& [8] GREEK SMALL LETTER UPSILON WITH VRACHY..GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND PERISPOMENI 1FF2..1FF4 ; Changes_When_Titlecased # L& [3] GREEK SMALL LETTER OMEGA WITH VARIA AND YPOGEGRAMMENI..GREEK SMALL LETTER OMEGA WITH OXIA AND YPOGEGRAMMENI 1FF6..1FF7 ; Changes_When_Titlecased # L& [2] GREEK SMALL LETTER OMEGA WITH PERISPOMENI..GREEK SMALL LETTER OMEGA WITH PERISPOMENI AND YPOGEGRAMMENI 214E ; 
Changes_When_Titlecased # L& TURNED SMALL F 2170..217F ; Changes_When_Titlecased # Nl [16] SMALL ROMAN NUMERAL ONE..SMALL ROMAN NUMERAL ONE THOUSAND 2184 ; Changes_When_Titlecased # L& LATIN SMALL LETTER REVERSED C 24D0..24E9 ; Changes_When_Titlecased # So [26] CIRCLED LATIN SMALL LETTER A..CIRCLED LATIN SMALL LETTER Z 2C30..2C5F ; Changes_When_Titlecased # L& [48] GLAGOLITIC SMALL LETTER AZU..GLAGOLITIC SMALL LETTER CAUDATE CHRIVI 2C61 ; Changes_When_Titlecased # L& LATIN SMALL LETTER L WITH DOUBLE BAR 2C65..2C66 ; Changes_When_Titlecased # L& [2] LATIN SMALL LETTER A WITH STROKE..LATIN SMALL LETTER T WITH DIAGONAL STROKE 2C68 ; Changes_When_Titlecased # L& LATIN SMALL LETTER H WITH DESCENDER 2C6A ; Changes_When_Titlecased # L& LATIN SMALL LETTER K WITH DESCENDER 2C6C ; Changes_When_Titlecased # L& LATIN SMALL LETTER Z WITH DESCENDER 2C73 ; Changes_When_Titlecased # L& LATIN SMALL LETTER W WITH HOOK 2C76 ; Changes_When_Titlecased # L& LATIN SMALL LETTER HALF H 2C81 ; Changes_When_Titlecased # L& COPTIC SMALL LETTER ALFA 2C83 ; Changes_When_Titlecased # L& COPTIC SMALL LETTER VIDA 2C85 ; Changes_When_Titlecased # L& COPTIC SMALL LETTER GAMMA 2C87 ; Changes_When_Titlecased # L& COPTIC SMALL LETTER DALDA 2C89 ; Changes_When_Titlecased # L& COPTIC SMALL LETTER EIE 2C8B ; Changes_When_Titlecased # L& COPTIC SMALL LETTER SOU 2C8D ; Changes_When_Titlecased # L& COPTIC SMALL LETTER ZATA 2C8F ; Changes_When_Titlecased # L& COPTIC SMALL LETTER HATE 2C91 ; Changes_When_Titlecased # L& COPTIC SMALL LETTER THETHE 2C93 ; Changes_When_Titlecased # L& COPTIC SMALL LETTER IAUDA 2C95 ; Changes_When_Titlecased # L& COPTIC SMALL LETTER KAPA 2C97 ; Changes_When_Titlecased # L& COPTIC SMALL LETTER LAULA 2C99 ; Changes_When_Titlecased # L& COPTIC SMALL LETTER MI 2C9B ; Changes_When_Titlecased # L& COPTIC SMALL LETTER NI 2C9D ; Changes_When_Titlecased # L& COPTIC SMALL LETTER KSI 2C9F ; Changes_When_Titlecased # L& COPTIC SMALL LETTER O 2CA1 ; Changes_When_Titlecased # L& COPTIC SMALL 
LETTER PI 2CA3 ; Changes_When_Titlecased # L& COPTIC SMALL LETTER RO 2CA5 ; Changes_When_Titlecased # L& COPTIC SMALL LETTER SIMA 2CA7 ; Changes_When_Titlecased # L& COPTIC SMALL LETTER TAU 2CA9 ; Changes_When_Titlecased # L& COPTIC SMALL LETTER UA 2CAB ; Changes_When_Titlecased # L& COPTIC SMALL LETTER FI 2CAD ; Changes_When_Titlecased # L& COPTIC SMALL LETTER KHI 2CAF ; Changes_When_Titlecased # L& COPTIC SMALL LETTER PSI 2CB1 ; Changes_When_Titlecased # L& COPTIC SMALL LETTER OOU 2CB3 ; Changes_When_Titlecased # L& COPTIC SMALL LETTER DIALECT-P ALEF 2CB5 ; Changes_When_Titlecased # L& COPTIC SMALL LETTER OLD COPTIC AIN 2CB7 ; Changes_When_Titlecased # L& COPTIC SMALL LETTER CRYPTOGRAMMIC EIE 2CB9 ; Changes_When_Titlecased # L& COPTIC SMALL LETTER DIALECT-P KAPA 2CBB ; Changes_When_Titlecased # L& COPTIC SMALL LETTER DIALECT-P NI 2CBD ; Changes_When_Titlecased # L& COPTIC SMALL LETTER CRYPTOGRAMMIC NI 2CBF ; Changes_When_Titlecased # L& COPTIC SMALL LETTER OLD COPTIC OOU 2CC1 ; Changes_When_Titlecased # L& COPTIC SMALL LETTER SAMPI 2CC3 ; Changes_When_Titlecased # L& COPTIC SMALL LETTER CROSSED SHEI 2CC5 ; Changes_When_Titlecased # L& COPTIC SMALL LETTER OLD COPTIC SHEI 2CC7 ; Changes_When_Titlecased # L& COPTIC SMALL LETTER OLD COPTIC ESH 2CC9 ; Changes_When_Titlecased # L& COPTIC SMALL LETTER AKHMIMIC KHEI 2CCB ; Changes_When_Titlecased # L& COPTIC SMALL LETTER DIALECT-P HORI 2CCD ; Changes_When_Titlecased # L& COPTIC SMALL LETTER OLD COPTIC HORI 2CCF ; Changes_When_Titlecased # L& COPTIC SMALL LETTER OLD COPTIC HA 2CD1 ; Changes_When_Titlecased # L& COPTIC SMALL LETTER L-SHAPED HA 2CD3 ; Changes_When_Titlecased # L& COPTIC SMALL LETTER OLD COPTIC HEI 2CD5 ; Changes_When_Titlecased # L& COPTIC SMALL LETTER OLD COPTIC HAT 2CD7 ; Changes_When_Titlecased # L& COPTIC SMALL LETTER OLD COPTIC GANGIA 2CD9 ; Changes_When_Titlecased # L& COPTIC SMALL LETTER OLD COPTIC DJA 2CDB ; Changes_When_Titlecased # L& COPTIC SMALL LETTER OLD COPTIC SHIMA 2CDD ; 
Changes_When_Titlecased # L& COPTIC SMALL LETTER OLD NUBIAN SHIMA 2CDF ; Changes_When_Titlecased # L& COPTIC SMALL LETTER OLD NUBIAN NGI 2CE1 ; Changes_When_Titlecased # L& COPTIC SMALL LETTER OLD NUBIAN NYI 2CE3 ; Changes_When_Titlecased # L& COPTIC SMALL LETTER OLD NUBIAN WAU 2CEC ; Changes_When_Titlecased # L& COPTIC SMALL LETTER CRYPTOGRAMMIC SHEI 2CEE ; Changes_When_Titlecased # L& COPTIC SMALL LETTER CRYPTOGRAMMIC GANGIA 2CF3 ; Changes_When_Titlecased # L& COPTIC SMALL LETTER BOHAIRIC KHEI 2D00..2D25 ; Changes_When_Titlecased # L& [38] GEORGIAN SMALL LETTER AN..GEORGIAN SMALL LETTER HOE 2D27 ; Changes_When_Titlecased # L& GEORGIAN SMALL LETTER YN 2D2D ; Changes_When_Titlecased # L& GEORGIAN SMALL LETTER AEN A641 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER ZEMLYA A643 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER DZELO A645 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER REVERSED DZE A647 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER IOTA A649 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER DJERV A64B ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER MONOGRAPH UK A64D ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER BROAD OMEGA A64F ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER NEUTRAL YER A651 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER YERU WITH BACK YER A653 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER IOTIFIED YAT A655 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER REVERSED YU A657 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER IOTIFIED A A659 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER CLOSED LITTLE YUS A65B ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER BLENDED YUS A65D ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER IOTIFIED CLOSED LITTLE YUS A65F ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER YN A661 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER REVERSED TSE A663 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER SOFT DE A665 ; 
Changes_When_Titlecased # L& CYRILLIC SMALL LETTER SOFT EL A667 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER SOFT EM A669 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER MONOCULAR O A66B ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER BINOCULAR O A66D ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER DOUBLE MONOCULAR O A681 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER DWE A683 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER DZWE A685 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER ZHWE A687 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER CCHE A689 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER DZZE A68B ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER TE WITH MIDDLE HOOK A68D ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER TWE A68F ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER TSWE A691 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER TSSE A693 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER TCHE A695 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER HWE A697 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER SHWE A699 ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER DOUBLE O A69B ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER CROSSED O A723 ; Changes_When_Titlecased # L& LATIN SMALL LETTER EGYPTOLOGICAL ALEF A725 ; Changes_When_Titlecased # L& LATIN SMALL LETTER EGYPTOLOGICAL AIN A727 ; Changes_When_Titlecased # L& LATIN SMALL LETTER HENG A729 ; Changes_When_Titlecased # L& LATIN SMALL LETTER TZ A72B ; Changes_When_Titlecased # L& LATIN SMALL LETTER TRESILLO A72D ; Changes_When_Titlecased # L& LATIN SMALL LETTER CUATRILLO A72F ; Changes_When_Titlecased # L& LATIN SMALL LETTER CUATRILLO WITH COMMA A733 ; Changes_When_Titlecased # L& LATIN SMALL LETTER AA A735 ; Changes_When_Titlecased # L& LATIN SMALL LETTER AO A737 ; Changes_When_Titlecased # L& LATIN SMALL LETTER AU A739 ; Changes_When_Titlecased # L& LATIN SMALL LETTER AV A73B ; Changes_When_Titlecased # L& LATIN SMALL 
LETTER AV WITH HORIZONTAL BAR A73D ; Changes_When_Titlecased # L& LATIN SMALL LETTER AY A73F ; Changes_When_Titlecased # L& LATIN SMALL LETTER REVERSED C WITH DOT A741 ; Changes_When_Titlecased # L& LATIN SMALL LETTER K WITH STROKE A743 ; Changes_When_Titlecased # L& LATIN SMALL LETTER K WITH DIAGONAL STROKE A745 ; Changes_When_Titlecased # L& LATIN SMALL LETTER K WITH STROKE AND DIAGONAL STROKE A747 ; Changes_When_Titlecased # L& LATIN SMALL LETTER BROKEN L A749 ; Changes_When_Titlecased # L& LATIN SMALL LETTER L WITH HIGH STROKE A74B ; Changes_When_Titlecased # L& LATIN SMALL LETTER O WITH LONG STROKE OVERLAY A74D ; Changes_When_Titlecased # L& LATIN SMALL LETTER O WITH LOOP A74F ; Changes_When_Titlecased # L& LATIN SMALL LETTER OO A751 ; Changes_When_Titlecased # L& LATIN SMALL LETTER P WITH STROKE THROUGH DESCENDER A753 ; Changes_When_Titlecased # L& LATIN SMALL LETTER P WITH FLOURISH A755 ; Changes_When_Titlecased # L& LATIN SMALL LETTER P WITH SQUIRREL TAIL A757 ; Changes_When_Titlecased # L& LATIN SMALL LETTER Q WITH STROKE THROUGH DESCENDER A759 ; Changes_When_Titlecased # L& LATIN SMALL LETTER Q WITH DIAGONAL STROKE A75B ; Changes_When_Titlecased # L& LATIN SMALL LETTER R ROTUNDA A75D ; Changes_When_Titlecased # L& LATIN SMALL LETTER RUM ROTUNDA A75F ; Changes_When_Titlecased # L& LATIN SMALL LETTER V WITH DIAGONAL STROKE A761 ; Changes_When_Titlecased # L& LATIN SMALL LETTER VY A763 ; Changes_When_Titlecased # L& LATIN SMALL LETTER VISIGOTHIC Z A765 ; Changes_When_Titlecased # L& LATIN SMALL LETTER THORN WITH STROKE A767 ; Changes_When_Titlecased # L& LATIN SMALL LETTER THORN WITH STROKE THROUGH DESCENDER A769 ; Changes_When_Titlecased # L& LATIN SMALL LETTER VEND A76B ; Changes_When_Titlecased # L& LATIN SMALL LETTER ET A76D ; Changes_When_Titlecased # L& LATIN SMALL LETTER IS A76F ; Changes_When_Titlecased # L& LATIN SMALL LETTER CON A77A ; Changes_When_Titlecased # L& LATIN SMALL LETTER INSULAR D A77C ; Changes_When_Titlecased # L& LATIN SMALL LETTER 
INSULAR F A77F ; Changes_When_Titlecased # L& LATIN SMALL LETTER TURNED INSULAR G A781 ; Changes_When_Titlecased # L& LATIN SMALL LETTER TURNED L A783 ; Changes_When_Titlecased # L& LATIN SMALL LETTER INSULAR R A785 ; Changes_When_Titlecased # L& LATIN SMALL LETTER INSULAR S A787 ; Changes_When_Titlecased # L& LATIN SMALL LETTER INSULAR T A78C ; Changes_When_Titlecased # L& LATIN SMALL LETTER SALTILLO A791 ; Changes_When_Titlecased # L& LATIN SMALL LETTER N WITH DESCENDER A793..A794 ; Changes_When_Titlecased # L& [2] LATIN SMALL LETTER C WITH BAR..LATIN SMALL LETTER C WITH PALATAL HOOK A797 ; Changes_When_Titlecased # L& LATIN SMALL LETTER B WITH FLOURISH A799 ; Changes_When_Titlecased # L& LATIN SMALL LETTER F WITH STROKE A79B ; Changes_When_Titlecased # L& LATIN SMALL LETTER VOLAPUK AE A79D ; Changes_When_Titlecased # L& LATIN SMALL LETTER VOLAPUK OE A79F ; Changes_When_Titlecased # L& LATIN SMALL LETTER VOLAPUK UE A7A1 ; Changes_When_Titlecased # L& LATIN SMALL LETTER G WITH OBLIQUE STROKE A7A3 ; Changes_When_Titlecased # L& LATIN SMALL LETTER K WITH OBLIQUE STROKE A7A5 ; Changes_When_Titlecased # L& LATIN SMALL LETTER N WITH OBLIQUE STROKE A7A7 ; Changes_When_Titlecased # L& LATIN SMALL LETTER R WITH OBLIQUE STROKE A7A9 ; Changes_When_Titlecased # L& LATIN SMALL LETTER S WITH OBLIQUE STROKE A7B5 ; Changes_When_Titlecased # L& LATIN SMALL LETTER BETA A7B7 ; Changes_When_Titlecased # L& LATIN SMALL LETTER OMEGA A7B9 ; Changes_When_Titlecased # L& LATIN SMALL LETTER U WITH STROKE A7BB ; Changes_When_Titlecased # L& LATIN SMALL LETTER GLOTTAL A A7BD ; Changes_When_Titlecased # L& LATIN SMALL LETTER GLOTTAL I A7BF ; Changes_When_Titlecased # L& LATIN SMALL LETTER GLOTTAL U A7C1 ; Changes_When_Titlecased # L& LATIN SMALL LETTER OLD POLISH O A7C3 ; Changes_When_Titlecased # L& LATIN SMALL LETTER ANGLICANA W A7C8 ; Changes_When_Titlecased # L& LATIN SMALL LETTER D WITH SHORT STROKE OVERLAY A7CA ; Changes_When_Titlecased # L& LATIN SMALL LETTER S WITH SHORT STROKE 
OVERLAY A7CD ; Changes_When_Titlecased # L& LATIN SMALL LETTER S WITH DIAGONAL STROKE A7CF ; Changes_When_Titlecased # L& LATIN SMALL LETTER PHARYNGEAL VOICED FRICATIVE A7D1 ; Changes_When_Titlecased # L& LATIN SMALL LETTER CLOSED INSULAR G A7D3 ; Changes_When_Titlecased # L& LATIN SMALL LETTER DOUBLE THORN A7D5 ; Changes_When_Titlecased # L& LATIN SMALL LETTER DOUBLE WYNN A7D7 ; Changes_When_Titlecased # L& LATIN SMALL LETTER MIDDLE SCOTS S A7D9 ; Changes_When_Titlecased # L& LATIN SMALL LETTER SIGMOID S A7DB ; Changes_When_Titlecased # L& LATIN SMALL LETTER LAMBDA A7F6 ; Changes_When_Titlecased # L& LATIN SMALL LETTER REVERSED HALF H AB53 ; Changes_When_Titlecased # L& LATIN SMALL LETTER CHI AB70..ABBF ; Changes_When_Titlecased # L& [80] CHEROKEE SMALL LETTER A..CHEROKEE SMALL LETTER YA FB00..FB06 ; Changes_When_Titlecased # L& [7] LATIN SMALL LIGATURE FF..LATIN SMALL LIGATURE ST FB13..FB17 ; Changes_When_Titlecased # L& [5] ARMENIAN SMALL LIGATURE MEN NOW..ARMENIAN SMALL LIGATURE MEN XEH FF41..FF5A ; Changes_When_Titlecased # L& [26] FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH LATIN SMALL LETTER Z 10428..1044F ; Changes_When_Titlecased # L& [40] DESERET SMALL LETTER LONG I..DESERET SMALL LETTER EW 104D8..104FB ; Changes_When_Titlecased # L& [36] OSAGE SMALL LETTER A..OSAGE SMALL LETTER ZHA 10597..105A1 ; Changes_When_Titlecased # L& [11] VITHKUQI SMALL LETTER A..VITHKUQI SMALL LETTER GA 105A3..105B1 ; Changes_When_Titlecased # L& [15] VITHKUQI SMALL LETTER HA..VITHKUQI SMALL LETTER RE 105B3..105B9 ; Changes_When_Titlecased # L& [7] VITHKUQI SMALL LETTER SE..VITHKUQI SMALL LETTER XE 105BB..105BC ; Changes_When_Titlecased # L& [2] VITHKUQI SMALL LETTER Y..VITHKUQI SMALL LETTER ZE 10CC0..10CF2 ; Changes_When_Titlecased # L& [51] OLD HUNGARIAN SMALL LETTER A..OLD HUNGARIAN SMALL LETTER US 10D70..10D85 ; Changes_When_Titlecased # L& [22] GARAY SMALL LETTER A..GARAY SMALL LETTER OLD NA 118C0..118DF ; Changes_When_Titlecased # L& [32] WARANG CITI SMALL LETTER 
NGAA..WARANG CITI SMALL LETTER VIYO 16E60..16E7F ; Changes_When_Titlecased # L& [32] MEDEFAIDRIN SMALL LETTER M..MEDEFAIDRIN SMALL LETTER Y 16EBB..16ED3 ; Changes_When_Titlecased # L& [25] BERIA ERFE SMALL LETTER ARKAB..BERIA ERFE SMALL LETTER AY 1E922..1E943 ; Changes_When_Titlecased # L& [34] ADLAM SMALL LETTER ALIF..ADLAM SMALL LETTER SHA # Total code points: 1507 # ================================================ # Derived Property: Changes_When_Casefolded (CWCF) # Characters whose normalized forms are not stable under case folding. # For more information, see the definition of "isCasefolded(X)" # in the "Conformance" / "Default Case Algorithms" section of the core specification. # Changes_When_Casefolded(X) is true when toCasefold(toNFD(X)) != toNFD(X) 0041..005A ; Changes_When_Casefolded # L& [26] LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z 00B5 ; Changes_When_Casefolded # L& MICRO SIGN 00C0..00D6 ; Changes_When_Casefolded # L& [23] LATIN CAPITAL LETTER A WITH GRAVE..LATIN CAPITAL LETTER O WITH DIAERESIS 00D8..00DF ; Changes_When_Casefolded # L& [8] LATIN CAPITAL LETTER O WITH STROKE..LATIN SMALL LETTER SHARP S 0100 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER A WITH MACRON 0102 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER A WITH BREVE 0104 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER A WITH OGONEK 0106 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER C WITH ACUTE 0108 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER C WITH CIRCUMFLEX 010A ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER C WITH DOT ABOVE 010C ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER C WITH CARON 010E ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER D WITH CARON 0110 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER D WITH STROKE 0112 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER E WITH MACRON 0114 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER E WITH BREVE 0116 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER E WITH DOT ABOVE 0118 
; Changes_When_Casefolded # L& LATIN CAPITAL LETTER E WITH OGONEK 011A ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER E WITH CARON 011C ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER G WITH CIRCUMFLEX 011E ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER G WITH BREVE 0120 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER G WITH DOT ABOVE 0122 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER G WITH CEDILLA 0124 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER H WITH CIRCUMFLEX 0126 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER H WITH STROKE 0128 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER I WITH TILDE 012A ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER I WITH MACRON 012C ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER I WITH BREVE 012E ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER I WITH OGONEK 0130 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER I WITH DOT ABOVE 0132 ; Changes_When_Casefolded # L& LATIN CAPITAL LIGATURE IJ 0134 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER J WITH CIRCUMFLEX 0136 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER K WITH CEDILLA 0139 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER L WITH ACUTE 013B ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER L WITH CEDILLA 013D ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER L WITH CARON 013F ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER L WITH MIDDLE DOT 0141 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER L WITH STROKE 0143 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER N WITH ACUTE 0145 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER N WITH CEDILLA 0147 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER N WITH CARON 0149..014A ; Changes_When_Casefolded # L& [2] LATIN SMALL LETTER N PRECEDED BY APOSTROPHE..LATIN CAPITAL LETTER ENG 014C ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER O WITH MACRON 014E ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER O WITH BREVE 0150 ; Changes_When_Casefolded # 
L& LATIN CAPITAL LETTER O WITH DOUBLE ACUTE 0152 ; Changes_When_Casefolded # L& LATIN CAPITAL LIGATURE OE 0154 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER R WITH ACUTE 0156 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER R WITH CEDILLA 0158 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER R WITH CARON 015A ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER S WITH ACUTE 015C ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER S WITH CIRCUMFLEX 015E ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER S WITH CEDILLA 0160 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER S WITH CARON 0162 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER T WITH CEDILLA 0164 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER T WITH CARON 0166 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER T WITH STROKE 0168 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER U WITH TILDE 016A ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER U WITH MACRON 016C ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER U WITH BREVE 016E ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER U WITH RING ABOVE 0170 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER U WITH DOUBLE ACUTE 0172 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER U WITH OGONEK 0174 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER W WITH CIRCUMFLEX 0176 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER Y WITH CIRCUMFLEX 0178..0179 ; Changes_When_Casefolded # L& [2] LATIN CAPITAL LETTER Y WITH DIAERESIS..LATIN CAPITAL LETTER Z WITH ACUTE 017B ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER Z WITH DOT ABOVE 017D ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER Z WITH CARON 017F ; Changes_When_Casefolded # L& LATIN SMALL LETTER LONG S 0181..0182 ; Changes_When_Casefolded # L& [2] LATIN CAPITAL LETTER B WITH HOOK..LATIN CAPITAL LETTER B WITH TOPBAR 0184 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER TONE SIX 0186..0187 ; Changes_When_Casefolded # L& [2] LATIN CAPITAL LETTER OPEN O..LATIN CAPITAL 
LETTER C WITH HOOK 0189..018B ; Changes_When_Casefolded # L& [3] LATIN CAPITAL LETTER AFRICAN D..LATIN CAPITAL LETTER D WITH TOPBAR 018E..0191 ; Changes_When_Casefolded # L& [4] LATIN CAPITAL LETTER REVERSED E..LATIN CAPITAL LETTER F WITH HOOK 0193..0194 ; Changes_When_Casefolded # L& [2] LATIN CAPITAL LETTER G WITH HOOK..LATIN CAPITAL LETTER GAMMA 0196..0198 ; Changes_When_Casefolded # L& [3] LATIN CAPITAL LETTER IOTA..LATIN CAPITAL LETTER K WITH HOOK 019C..019D ; Changes_When_Casefolded # L& [2] LATIN CAPITAL LETTER TURNED M..LATIN CAPITAL LETTER N WITH LEFT HOOK 019F..01A0 ; Changes_When_Casefolded # L& [2] LATIN CAPITAL LETTER O WITH MIDDLE TILDE..LATIN CAPITAL LETTER O WITH HORN 01A2 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER OI 01A4 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER P WITH HOOK 01A6..01A7 ; Changes_When_Casefolded # L& [2] LATIN LETTER YR..LATIN CAPITAL LETTER TONE TWO 01A9 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER ESH 01AC ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER T WITH HOOK 01AE..01AF ; Changes_When_Casefolded # L& [2] LATIN CAPITAL LETTER T WITH RETROFLEX HOOK..LATIN CAPITAL LETTER U WITH HORN 01B1..01B3 ; Changes_When_Casefolded # L& [3] LATIN CAPITAL LETTER UPSILON..LATIN CAPITAL LETTER Y WITH HOOK 01B5 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER Z WITH STROKE 01B7..01B8 ; Changes_When_Casefolded # L& [2] LATIN CAPITAL LETTER EZH..LATIN CAPITAL LETTER EZH REVERSED 01BC ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER TONE FIVE 01C4..01C5 ; Changes_When_Casefolded # L& [2] LATIN CAPITAL LETTER DZ WITH CARON..LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON 01C7..01C8 ; Changes_When_Casefolded # L& [2] LATIN CAPITAL LETTER LJ..LATIN CAPITAL LETTER L WITH SMALL LETTER J 01CA..01CB ; Changes_When_Casefolded # L& [2] LATIN CAPITAL LETTER NJ..LATIN CAPITAL LETTER N WITH SMALL LETTER J 01CD ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER A WITH CARON 01CF ; Changes_When_Casefolded # L& LATIN 
CAPITAL LETTER I WITH CARON 01D1 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER O WITH CARON 01D3 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER U WITH CARON 01D5 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON 01D7 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER U WITH DIAERESIS AND ACUTE 01D9 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER U WITH DIAERESIS AND CARON 01DB ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER U WITH DIAERESIS AND GRAVE 01DE ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER A WITH DIAERESIS AND MACRON 01E0 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER A WITH DOT ABOVE AND MACRON 01E2 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER AE WITH MACRON 01E4 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER G WITH STROKE 01E6 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER G WITH CARON 01E8 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER K WITH CARON 01EA ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER O WITH OGONEK 01EC ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER O WITH OGONEK AND MACRON 01EE ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER EZH WITH CARON 01F1..01F2 ; Changes_When_Casefolded # L& [2] LATIN CAPITAL LETTER DZ..LATIN CAPITAL LETTER D WITH SMALL LETTER Z 01F4 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER G WITH ACUTE 01F6..01F8 ; Changes_When_Casefolded # L& [3] LATIN CAPITAL LETTER HWAIR..LATIN CAPITAL LETTER N WITH GRAVE 01FA ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER A WITH RING ABOVE AND ACUTE 01FC ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER AE WITH ACUTE 01FE ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER O WITH STROKE AND ACUTE 0200 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER A WITH DOUBLE GRAVE 0202 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER A WITH INVERTED BREVE 0204 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER E WITH DOUBLE GRAVE 0206 ; Changes_When_Casefolded # L& LATIN CAPITAL 
LETTER E WITH INVERTED BREVE 0208 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER I WITH DOUBLE GRAVE 020A ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER I WITH INVERTED BREVE 020C ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER O WITH DOUBLE GRAVE 020E ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER O WITH INVERTED BREVE 0210 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER R WITH DOUBLE GRAVE 0212 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER R WITH INVERTED BREVE 0214 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER U WITH DOUBLE GRAVE 0216 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER U WITH INVERTED BREVE 0218 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER S WITH COMMA BELOW 021A ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER T WITH COMMA BELOW 021C ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER YOGH 021E ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER H WITH CARON 0220 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER N WITH LONG RIGHT LEG 0222 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER OU 0224 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER Z WITH HOOK 0226 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER A WITH DOT ABOVE 0228 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER E WITH CEDILLA 022A ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER O WITH DIAERESIS AND MACRON 022C ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER O WITH TILDE AND MACRON 022E ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER O WITH DOT ABOVE 0230 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER O WITH DOT ABOVE AND MACRON 0232 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER Y WITH MACRON 023A..023B ; Changes_When_Casefolded # L& [2] LATIN CAPITAL LETTER A WITH STROKE..LATIN CAPITAL LETTER C WITH STROKE 023D..023E ; Changes_When_Casefolded # L& [2] LATIN CAPITAL LETTER L WITH BAR..LATIN CAPITAL LETTER T WITH DIAGONAL STROKE 0241 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER GLOTTAL STOP 
0243..0246 ; Changes_When_Casefolded # L& [4] LATIN CAPITAL LETTER B WITH STROKE..LATIN CAPITAL LETTER E WITH STROKE 0248 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER J WITH STROKE 024A ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER SMALL Q WITH HOOK TAIL 024C ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER R WITH STROKE 024E ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER Y WITH STROKE 0345 ; Changes_When_Casefolded # Mn COMBINING GREEK YPOGEGRAMMENI 0370 ; Changes_When_Casefolded # L& GREEK CAPITAL LETTER HETA 0372 ; Changes_When_Casefolded # L& GREEK CAPITAL LETTER ARCHAIC SAMPI 0376 ; Changes_When_Casefolded # L& GREEK CAPITAL LETTER PAMPHYLIAN DIGAMMA 037F ; Changes_When_Casefolded # L& GREEK CAPITAL LETTER YOT 0386 ; Changes_When_Casefolded # L& GREEK CAPITAL LETTER ALPHA WITH TONOS 0388..038A ; Changes_When_Casefolded # L& [3] GREEK CAPITAL LETTER EPSILON WITH TONOS..GREEK CAPITAL LETTER IOTA WITH TONOS 038C ; Changes_When_Casefolded # L& GREEK CAPITAL LETTER OMICRON WITH TONOS 038E..038F ; Changes_When_Casefolded # L& [2] GREEK CAPITAL LETTER UPSILON WITH TONOS..GREEK CAPITAL LETTER OMEGA WITH TONOS 0391..03A1 ; Changes_When_Casefolded # L& [17] GREEK CAPITAL LETTER ALPHA..GREEK CAPITAL LETTER RHO 03A3..03AB ; Changes_When_Casefolded # L& [9] GREEK CAPITAL LETTER SIGMA..GREEK CAPITAL LETTER UPSILON WITH DIALYTIKA 03C2 ; Changes_When_Casefolded # L& GREEK SMALL LETTER FINAL SIGMA 03CF..03D1 ; Changes_When_Casefolded # L& [3] GREEK CAPITAL KAI SYMBOL..GREEK THETA SYMBOL 03D5..03D6 ; Changes_When_Casefolded # L& [2] GREEK PHI SYMBOL..GREEK PI SYMBOL 03D8 ; Changes_When_Casefolded # L& GREEK LETTER ARCHAIC KOPPA 03DA ; Changes_When_Casefolded # L& GREEK LETTER STIGMA 03DC ; Changes_When_Casefolded # L& GREEK LETTER DIGAMMA 03DE ; Changes_When_Casefolded # L& GREEK LETTER KOPPA 03E0 ; Changes_When_Casefolded # L& GREEK LETTER SAMPI 03E2 ; Changes_When_Casefolded # L& COPTIC CAPITAL LETTER SHEI 03E4 ; Changes_When_Casefolded # L& COPTIC 
CAPITAL LETTER FEI 03E6 ; Changes_When_Casefolded # L& COPTIC CAPITAL LETTER KHEI 03E8 ; Changes_When_Casefolded # L& COPTIC CAPITAL LETTER HORI 03EA ; Changes_When_Casefolded # L& COPTIC CAPITAL LETTER GANGIA 03EC ; Changes_When_Casefolded # L& COPTIC CAPITAL LETTER SHIMA 03EE ; Changes_When_Casefolded # L& COPTIC CAPITAL LETTER DEI 03F0..03F1 ; Changes_When_Casefolded # L& [2] GREEK KAPPA SYMBOL..GREEK RHO SYMBOL 03F4..03F5 ; Changes_When_Casefolded # L& [2] GREEK CAPITAL THETA SYMBOL..GREEK LUNATE EPSILON SYMBOL 03F7 ; Changes_When_Casefolded # L& GREEK CAPITAL LETTER SHO 03F9..03FA ; Changes_When_Casefolded # L& [2] GREEK CAPITAL LUNATE SIGMA SYMBOL..GREEK CAPITAL LETTER SAN 03FD..042F ; Changes_When_Casefolded # L& [51] GREEK CAPITAL REVERSED LUNATE SIGMA SYMBOL..CYRILLIC CAPITAL LETTER YA 0460 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER OMEGA 0462 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER YAT 0464 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER IOTIFIED E 0466 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER LITTLE YUS 0468 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER IOTIFIED LITTLE YUS 046A ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER BIG YUS 046C ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER IOTIFIED BIG YUS 046E ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER KSI 0470 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER PSI 0472 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER FITA 0474 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER IZHITSA 0476 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER IZHITSA WITH DOUBLE GRAVE ACCENT 0478 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER UK 047A ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER ROUND OMEGA 047C ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER OMEGA WITH TITLO 047E ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER OT 0480 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER KOPPA 
048A ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER SHORT I WITH TAIL 048C ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER SEMISOFT SIGN 048E ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER ER WITH TICK 0490 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER GHE WITH UPTURN 0492 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER GHE WITH STROKE 0494 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER GHE WITH MIDDLE HOOK 0496 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER ZHE WITH DESCENDER 0498 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER ZE WITH DESCENDER 049A ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER KA WITH DESCENDER 049C ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER KA WITH VERTICAL STROKE 049E ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER KA WITH STROKE 04A0 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER BASHKIR KA 04A2 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER EN WITH DESCENDER 04A4 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LIGATURE EN GHE 04A6 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER PE WITH MIDDLE HOOK 04A8 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER ABKHASIAN HA 04AA ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER ES WITH DESCENDER 04AC ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER TE WITH DESCENDER 04AE ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER STRAIGHT U 04B0 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER STRAIGHT U WITH STROKE 04B2 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER HA WITH DESCENDER 04B4 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LIGATURE TE TSE 04B6 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER CHE WITH DESCENDER 04B8 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER CHE WITH VERTICAL STROKE 04BA ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER SHHA 04BC ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER ABKHASIAN CHE 04BE ; 
Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER ABKHASIAN CHE WITH DESCENDER 04C0..04C1 ; Changes_When_Casefolded # L& [2] CYRILLIC LETTER PALOCHKA..CYRILLIC CAPITAL LETTER ZHE WITH BREVE 04C3 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER KA WITH HOOK 04C5 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER EL WITH TAIL 04C7 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER EN WITH HOOK 04C9 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER EN WITH TAIL 04CB ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER KHAKASSIAN CHE 04CD ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER EM WITH TAIL 04D0 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER A WITH BREVE 04D2 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER A WITH DIAERESIS 04D4 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LIGATURE A IE 04D6 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER IE WITH BREVE 04D8 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER SCHWA 04DA ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER SCHWA WITH DIAERESIS 04DC ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER ZHE WITH DIAERESIS 04DE ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER ZE WITH DIAERESIS 04E0 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER ABKHASIAN DZE 04E2 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER I WITH MACRON 04E4 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER I WITH DIAERESIS 04E6 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER O WITH DIAERESIS 04E8 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER BARRED O 04EA ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER BARRED O WITH DIAERESIS 04EC ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER E WITH DIAERESIS 04EE ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER U WITH MACRON 04F0 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER U WITH DIAERESIS 04F2 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER U WITH DOUBLE ACUTE 04F4 ; 
Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER CHE WITH DIAERESIS 04F6 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER GHE WITH DESCENDER 04F8 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER YERU WITH DIAERESIS 04FA ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER GHE WITH STROKE AND HOOK 04FC ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER HA WITH HOOK 04FE ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER HA WITH STROKE 0500 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER KOMI DE 0502 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER KOMI DJE 0504 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER KOMI ZJE 0506 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER KOMI DZJE 0508 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER KOMI LJE 050A ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER KOMI NJE 050C ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER KOMI SJE 050E ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER KOMI TJE 0510 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER REVERSED ZE 0512 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER EL WITH HOOK 0514 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER LHA 0516 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER RHA 0518 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER YAE 051A ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER QA 051C ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER WE 051E ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER ALEUT KA 0520 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER EL WITH MIDDLE HOOK 0522 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER EN WITH MIDDLE HOOK 0524 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER PE WITH DESCENDER 0526 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER SHHA WITH DESCENDER 0528 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER EN WITH LEFT HOOK 052A ; Changes_When_Casefolded # L& CYRILLIC CAPITAL 
LETTER DZZHE 052C ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER DCHE 052E ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER EL WITH DESCENDER 0531..0556 ; Changes_When_Casefolded # L& [38] ARMENIAN CAPITAL LETTER AYB..ARMENIAN CAPITAL LETTER FEH 0587 ; Changes_When_Casefolded # L& ARMENIAN SMALL LIGATURE ECH YIWN 10A0..10C5 ; Changes_When_Casefolded # L& [38] GEORGIAN CAPITAL LETTER AN..GEORGIAN CAPITAL LETTER HOE 10C7 ; Changes_When_Casefolded # L& GEORGIAN CAPITAL LETTER YN 10CD ; Changes_When_Casefolded # L& GEORGIAN CAPITAL LETTER AEN 13F8..13FD ; Changes_When_Casefolded # L& [6] CHEROKEE SMALL LETTER YE..CHEROKEE SMALL LETTER MV 1C80..1C89 ; Changes_When_Casefolded # L& [10] CYRILLIC SMALL LETTER ROUNDED VE..CYRILLIC CAPITAL LETTER TJE 1C90..1CBA ; Changes_When_Casefolded # L& [43] GEORGIAN MTAVRULI CAPITAL LETTER AN..GEORGIAN MTAVRULI CAPITAL LETTER AIN 1CBD..1CBF ; Changes_When_Casefolded # L& [3] GEORGIAN MTAVRULI CAPITAL LETTER AEN..GEORGIAN MTAVRULI CAPITAL LETTER LABIAL SIGN 1E00 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER A WITH RING BELOW 1E02 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER B WITH DOT ABOVE 1E04 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER B WITH DOT BELOW 1E06 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER B WITH LINE BELOW 1E08 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER C WITH CEDILLA AND ACUTE 1E0A ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER D WITH DOT ABOVE 1E0C ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER D WITH DOT BELOW 1E0E ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER D WITH LINE BELOW 1E10 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER D WITH CEDILLA 1E12 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER D WITH CIRCUMFLEX BELOW 1E14 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER E WITH MACRON AND GRAVE 1E16 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER E WITH MACRON AND ACUTE 1E18 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER E WITH 
CIRCUMFLEX BELOW 1E1A ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER E WITH TILDE BELOW 1E1C ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER E WITH CEDILLA AND BREVE 1E1E ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER F WITH DOT ABOVE 1E20 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER G WITH MACRON 1E22 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER H WITH DOT ABOVE 1E24 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER H WITH DOT BELOW 1E26 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER H WITH DIAERESIS 1E28 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER H WITH CEDILLA 1E2A ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER H WITH BREVE BELOW 1E2C ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER I WITH TILDE BELOW 1E2E ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER I WITH DIAERESIS AND ACUTE 1E30 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER K WITH ACUTE 1E32 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER K WITH DOT BELOW 1E34 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER K WITH LINE BELOW 1E36 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER L WITH DOT BELOW 1E38 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER L WITH DOT BELOW AND MACRON 1E3A ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER L WITH LINE BELOW 1E3C ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER L WITH CIRCUMFLEX BELOW 1E3E ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER M WITH ACUTE 1E40 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER M WITH DOT ABOVE 1E42 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER M WITH DOT BELOW 1E44 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER N WITH DOT ABOVE 1E46 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER N WITH DOT BELOW 1E48 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER N WITH LINE BELOW 1E4A ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER N WITH CIRCUMFLEX BELOW 1E4C ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER O WITH TILDE AND ACUTE 1E4E ; 
Changes_When_Casefolded # L& LATIN CAPITAL LETTER O WITH TILDE AND DIAERESIS 1E50 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER O WITH MACRON AND GRAVE 1E52 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER O WITH MACRON AND ACUTE 1E54 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER P WITH ACUTE 1E56 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER P WITH DOT ABOVE 1E58 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER R WITH DOT ABOVE 1E5A ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER R WITH DOT BELOW 1E5C ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER R WITH DOT BELOW AND MACRON 1E5E ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER R WITH LINE BELOW 1E60 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER S WITH DOT ABOVE 1E62 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER S WITH DOT BELOW 1E64 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER S WITH ACUTE AND DOT ABOVE 1E66 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER S WITH CARON AND DOT ABOVE 1E68 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER S WITH DOT BELOW AND DOT ABOVE 1E6A ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER T WITH DOT ABOVE 1E6C ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER T WITH DOT BELOW 1E6E ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER T WITH LINE BELOW 1E70 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER T WITH CIRCUMFLEX BELOW 1E72 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER U WITH DIAERESIS BELOW 1E74 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER U WITH TILDE BELOW 1E76 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER U WITH CIRCUMFLEX BELOW 1E78 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER U WITH TILDE AND ACUTE 1E7A ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER U WITH MACRON AND DIAERESIS 1E7C ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER V WITH TILDE 1E7E ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER V WITH DOT BELOW 1E80 ; Changes_When_Casefolded # L& LATIN CAPITAL 
LETTER W WITH GRAVE 1E82 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER W WITH ACUTE 1E84 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER W WITH DIAERESIS 1E86 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER W WITH DOT ABOVE 1E88 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER W WITH DOT BELOW 1E8A ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER X WITH DOT ABOVE 1E8C ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER X WITH DIAERESIS 1E8E ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER Y WITH DOT ABOVE 1E90 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER Z WITH CIRCUMFLEX 1E92 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER Z WITH DOT BELOW 1E94 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER Z WITH LINE BELOW 1E9A..1E9B ; Changes_When_Casefolded # L& [2] LATIN SMALL LETTER A WITH RIGHT HALF RING..LATIN SMALL LETTER LONG S WITH DOT ABOVE 1E9E ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER SHARP S 1EA0 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER A WITH DOT BELOW 1EA2 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER A WITH HOOK ABOVE 1EA4 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND ACUTE 1EA6 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND GRAVE 1EA8 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND HOOK ABOVE 1EAA ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND TILDE 1EAC ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND DOT BELOW 1EAE ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER A WITH BREVE AND ACUTE 1EB0 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER A WITH BREVE AND GRAVE 1EB2 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER A WITH BREVE AND HOOK ABOVE 1EB4 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER A WITH BREVE AND TILDE 1EB6 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER A WITH BREVE AND DOT BELOW 1EB8 ; Changes_When_Casefolded # L& LATIN CAPITAL 
LETTER E WITH DOT BELOW 1EBA ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER E WITH HOOK ABOVE 1EBC ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER E WITH TILDE 1EBE ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND ACUTE 1EC0 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND GRAVE 1EC2 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND HOOK ABOVE 1EC4 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND TILDE 1EC6 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND DOT BELOW 1EC8 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER I WITH HOOK ABOVE 1ECA ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER I WITH DOT BELOW 1ECC ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER O WITH DOT BELOW 1ECE ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER O WITH HOOK ABOVE 1ED0 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND ACUTE 1ED2 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND GRAVE 1ED4 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND HOOK ABOVE 1ED6 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND TILDE 1ED8 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND DOT BELOW 1EDA ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER O WITH HORN AND ACUTE 1EDC ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER O WITH HORN AND GRAVE 1EDE ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER O WITH HORN AND HOOK ABOVE 1EE0 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER O WITH HORN AND TILDE 1EE2 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER O WITH HORN AND DOT BELOW 1EE4 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER U WITH DOT BELOW 1EE6 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER U WITH HOOK ABOVE 1EE8 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER U WITH HORN AND ACUTE 1EEA ; Changes_When_Casefolded # 
L& LATIN CAPITAL LETTER U WITH HORN AND GRAVE 1EEC ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER U WITH HORN AND HOOK ABOVE 1EEE ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER U WITH HORN AND TILDE 1EF0 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER U WITH HORN AND DOT BELOW 1EF2 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER Y WITH GRAVE 1EF4 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER Y WITH DOT BELOW 1EF6 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER Y WITH HOOK ABOVE 1EF8 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER Y WITH TILDE 1EFA ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER MIDDLE-WELSH LL 1EFC ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER MIDDLE-WELSH V 1EFE ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER Y WITH LOOP 1F08..1F0F ; Changes_When_Casefolded # L& [8] GREEK CAPITAL LETTER ALPHA WITH PSILI..GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI 1F18..1F1D ; Changes_When_Casefolded # L& [6] GREEK CAPITAL LETTER EPSILON WITH PSILI..GREEK CAPITAL LETTER EPSILON WITH DASIA AND OXIA 1F28..1F2F ; Changes_When_Casefolded # L& [8] GREEK CAPITAL LETTER ETA WITH PSILI..GREEK CAPITAL LETTER ETA WITH DASIA AND PERISPOMENI 1F38..1F3F ; Changes_When_Casefolded # L& [8] GREEK CAPITAL LETTER IOTA WITH PSILI..GREEK CAPITAL LETTER IOTA WITH DASIA AND PERISPOMENI 1F48..1F4D ; Changes_When_Casefolded # L& [6] GREEK CAPITAL LETTER OMICRON WITH PSILI..GREEK CAPITAL LETTER OMICRON WITH DASIA AND OXIA 1F59 ; Changes_When_Casefolded # L& GREEK CAPITAL LETTER UPSILON WITH DASIA 1F5B ; Changes_When_Casefolded # L& GREEK CAPITAL LETTER UPSILON WITH DASIA AND VARIA 1F5D ; Changes_When_Casefolded # L& GREEK CAPITAL LETTER UPSILON WITH DASIA AND OXIA 1F5F ; Changes_When_Casefolded # L& GREEK CAPITAL LETTER UPSILON WITH DASIA AND PERISPOMENI 1F68..1F6F ; Changes_When_Casefolded # L& [8] GREEK CAPITAL LETTER OMEGA WITH PSILI..GREEK CAPITAL LETTER OMEGA WITH DASIA AND PERISPOMENI 1F80..1FAF ; Changes_When_Casefolded # L& 
[48] GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI..GREEK CAPITAL LETTER OMEGA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI 1FB2..1FB4 ; Changes_When_Casefolded # L& [3] GREEK SMALL LETTER ALPHA WITH VARIA AND YPOGEGRAMMENI..GREEK SMALL LETTER ALPHA WITH OXIA AND YPOGEGRAMMENI 1FB7..1FBC ; Changes_When_Casefolded # L& [6] GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI..GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI 1FC2..1FC4 ; Changes_When_Casefolded # L& [3] GREEK SMALL LETTER ETA WITH VARIA AND YPOGEGRAMMENI..GREEK SMALL LETTER ETA WITH OXIA AND YPOGEGRAMMENI 1FC7..1FCC ; Changes_When_Casefolded # L& [6] GREEK SMALL LETTER ETA WITH PERISPOMENI AND YPOGEGRAMMENI..GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI 1FD8..1FDB ; Changes_When_Casefolded # L& [4] GREEK CAPITAL LETTER IOTA WITH VRACHY..GREEK CAPITAL LETTER IOTA WITH OXIA 1FE8..1FEC ; Changes_When_Casefolded # L& [5] GREEK CAPITAL LETTER UPSILON WITH VRACHY..GREEK CAPITAL LETTER RHO WITH DASIA 1FF2..1FF4 ; Changes_When_Casefolded # L& [3] GREEK SMALL LETTER OMEGA WITH VARIA AND YPOGEGRAMMENI..GREEK SMALL LETTER OMEGA WITH OXIA AND YPOGEGRAMMENI 1FF7..1FFC ; Changes_When_Casefolded # L& [6] GREEK SMALL LETTER OMEGA WITH PERISPOMENI AND YPOGEGRAMMENI..GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI 2126 ; Changes_When_Casefolded # L& OHM SIGN 212A..212B ; Changes_When_Casefolded # L& [2] KELVIN SIGN..ANGSTROM SIGN 2132 ; Changes_When_Casefolded # L& TURNED CAPITAL F 2160..216F ; Changes_When_Casefolded # Nl [16] ROMAN NUMERAL ONE..ROMAN NUMERAL ONE THOUSAND 2183 ; Changes_When_Casefolded # L& ROMAN NUMERAL REVERSED ONE HUNDRED 24B6..24CF ; Changes_When_Casefolded # So [26] CIRCLED LATIN CAPITAL LETTER A..CIRCLED LATIN CAPITAL LETTER Z 2C00..2C2F ; Changes_When_Casefolded # L& [48] GLAGOLITIC CAPITAL LETTER AZU..GLAGOLITIC CAPITAL LETTER CAUDATE CHRIVI 2C60 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER L WITH DOUBLE BAR 2C62..2C64 ; Changes_When_Casefolded # L& [3] LATIN CAPITAL LETTER 
L WITH MIDDLE TILDE..LATIN CAPITAL LETTER R WITH TAIL 2C67 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER H WITH DESCENDER 2C69 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER K WITH DESCENDER 2C6B ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER Z WITH DESCENDER 2C6D..2C70 ; Changes_When_Casefolded # L& [4] LATIN CAPITAL LETTER ALPHA..LATIN CAPITAL LETTER TURNED ALPHA 2C72 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER W WITH HOOK 2C75 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER HALF H 2C7E..2C80 ; Changes_When_Casefolded # L& [3] LATIN CAPITAL LETTER S WITH SWASH TAIL..COPTIC CAPITAL LETTER ALFA 2C82 ; Changes_When_Casefolded # L& COPTIC CAPITAL LETTER VIDA 2C84 ; Changes_When_Casefolded # L& COPTIC CAPITAL LETTER GAMMA 2C86 ; Changes_When_Casefolded # L& COPTIC CAPITAL LETTER DALDA 2C88 ; Changes_When_Casefolded # L& COPTIC CAPITAL LETTER EIE 2C8A ; Changes_When_Casefolded # L& COPTIC CAPITAL LETTER SOU 2C8C ; Changes_When_Casefolded # L& COPTIC CAPITAL LETTER ZATA 2C8E ; Changes_When_Casefolded # L& COPTIC CAPITAL LETTER HATE 2C90 ; Changes_When_Casefolded # L& COPTIC CAPITAL LETTER THETHE 2C92 ; Changes_When_Casefolded # L& COPTIC CAPITAL LETTER IAUDA 2C94 ; Changes_When_Casefolded # L& COPTIC CAPITAL LETTER KAPA 2C96 ; Changes_When_Casefolded # L& COPTIC CAPITAL LETTER LAULA 2C98 ; Changes_When_Casefolded # L& COPTIC CAPITAL LETTER MI 2C9A ; Changes_When_Casefolded # L& COPTIC CAPITAL LETTER NI 2C9C ; Changes_When_Casefolded # L& COPTIC CAPITAL LETTER KSI 2C9E ; Changes_When_Casefolded # L& COPTIC CAPITAL LETTER O 2CA0 ; Changes_When_Casefolded # L& COPTIC CAPITAL LETTER PI 2CA2 ; Changes_When_Casefolded # L& COPTIC CAPITAL LETTER RO 2CA4 ; Changes_When_Casefolded # L& COPTIC CAPITAL LETTER SIMA 2CA6 ; Changes_When_Casefolded # L& COPTIC CAPITAL LETTER TAU 2CA8 ; Changes_When_Casefolded # L& COPTIC CAPITAL LETTER UA 2CAA ; Changes_When_Casefolded # L& COPTIC CAPITAL LETTER FI 2CAC ; Changes_When_Casefolded # L& COPTIC CAPITAL LETTER 
KHI 2CAE ; Changes_When_Casefolded # L& COPTIC CAPITAL LETTER PSI 2CB0 ; Changes_When_Casefolded # L& COPTIC CAPITAL LETTER OOU 2CB2 ; Changes_When_Casefolded # L& COPTIC CAPITAL LETTER DIALECT-P ALEF 2CB4 ; Changes_When_Casefolded # L& COPTIC CAPITAL LETTER OLD COPTIC AIN 2CB6 ; Changes_When_Casefolded # L& COPTIC CAPITAL LETTER CRYPTOGRAMMIC EIE 2CB8 ; Changes_When_Casefolded # L& COPTIC CAPITAL LETTER DIALECT-P KAPA 2CBA ; Changes_When_Casefolded # L& COPTIC CAPITAL LETTER DIALECT-P NI 2CBC ; Changes_When_Casefolded # L& COPTIC CAPITAL LETTER CRYPTOGRAMMIC NI 2CBE ; Changes_When_Casefolded # L& COPTIC CAPITAL LETTER OLD COPTIC OOU 2CC0 ; Changes_When_Casefolded # L& COPTIC CAPITAL LETTER SAMPI 2CC2 ; Changes_When_Casefolded # L& COPTIC CAPITAL LETTER CROSSED SHEI 2CC4 ; Changes_When_Casefolded # L& COPTIC CAPITAL LETTER OLD COPTIC SHEI 2CC6 ; Changes_When_Casefolded # L& COPTIC CAPITAL LETTER OLD COPTIC ESH 2CC8 ; Changes_When_Casefolded # L& COPTIC CAPITAL LETTER AKHMIMIC KHEI 2CCA ; Changes_When_Casefolded # L& COPTIC CAPITAL LETTER DIALECT-P HORI 2CCC ; Changes_When_Casefolded # L& COPTIC CAPITAL LETTER OLD COPTIC HORI 2CCE ; Changes_When_Casefolded # L& COPTIC CAPITAL LETTER OLD COPTIC HA 2CD0 ; Changes_When_Casefolded # L& COPTIC CAPITAL LETTER L-SHAPED HA 2CD2 ; Changes_When_Casefolded # L& COPTIC CAPITAL LETTER OLD COPTIC HEI 2CD4 ; Changes_When_Casefolded # L& COPTIC CAPITAL LETTER OLD COPTIC HAT 2CD6 ; Changes_When_Casefolded # L& COPTIC CAPITAL LETTER OLD COPTIC GANGIA 2CD8 ; Changes_When_Casefolded # L& COPTIC CAPITAL LETTER OLD COPTIC DJA 2CDA ; Changes_When_Casefolded # L& COPTIC CAPITAL LETTER OLD COPTIC SHIMA 2CDC ; Changes_When_Casefolded # L& COPTIC CAPITAL LETTER OLD NUBIAN SHIMA 2CDE ; Changes_When_Casefolded # L& COPTIC CAPITAL LETTER OLD NUBIAN NGI 2CE0 ; Changes_When_Casefolded # L& COPTIC CAPITAL LETTER OLD NUBIAN NYI 2CE2 ; Changes_When_Casefolded # L& COPTIC CAPITAL LETTER OLD NUBIAN WAU 2CEB ; Changes_When_Casefolded # L& COPTIC CAPITAL 
LETTER CRYPTOGRAMMIC SHEI 2CED ; Changes_When_Casefolded # L& COPTIC CAPITAL LETTER CRYPTOGRAMMIC GANGIA 2CF2 ; Changes_When_Casefolded # L& COPTIC CAPITAL LETTER BOHAIRIC KHEI A640 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER ZEMLYA A642 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER DZELO A644 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER REVERSED DZE A646 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER IOTA A648 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER DJERV A64A ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER MONOGRAPH UK A64C ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER BROAD OMEGA A64E ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER NEUTRAL YER A650 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER YERU WITH BACK YER A652 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER IOTIFIED YAT A654 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER REVERSED YU A656 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER IOTIFIED A A658 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER CLOSED LITTLE YUS A65A ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER BLENDED YUS A65C ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER IOTIFIED CLOSED LITTLE YUS A65E ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER YN A660 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER REVERSED TSE A662 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER SOFT DE A664 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER SOFT EL A666 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER SOFT EM A668 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER MONOCULAR O A66A ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER BINOCULAR O A66C ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER DOUBLE MONOCULAR O A680 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER DWE A682 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER DZWE A684 ; Changes_When_Casefolded # L& CYRILLIC 
CAPITAL LETTER ZHWE A686 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER CCHE A688 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER DZZE A68A ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER TE WITH MIDDLE HOOK A68C ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER TWE A68E ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER TSWE A690 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER TSSE A692 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER TCHE A694 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER HWE A696 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER SHWE A698 ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER DOUBLE O A69A ; Changes_When_Casefolded # L& CYRILLIC CAPITAL LETTER CROSSED O A722 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER EGYPTOLOGICAL ALEF A724 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER EGYPTOLOGICAL AIN A726 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER HENG A728 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER TZ A72A ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER TRESILLO A72C ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER CUATRILLO A72E ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER CUATRILLO WITH COMMA A732 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER AA A734 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER AO A736 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER AU A738 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER AV A73A ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER AV WITH HORIZONTAL BAR A73C ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER AY A73E ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER REVERSED C WITH DOT A740 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER K WITH STROKE A742 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER K WITH DIAGONAL STROKE A744 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER K WITH STROKE AND DIAGONAL STROKE A746 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER 
BROKEN L A748 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER L WITH HIGH STROKE A74A ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER O WITH LONG STROKE OVERLAY A74C ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER O WITH LOOP A74E ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER OO A750 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER P WITH STROKE THROUGH DESCENDER A752 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER P WITH FLOURISH A754 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER P WITH SQUIRREL TAIL A756 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER Q WITH STROKE THROUGH DESCENDER A758 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER Q WITH DIAGONAL STROKE A75A ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER R ROTUNDA A75C ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER RUM ROTUNDA A75E ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER V WITH DIAGONAL STROKE A760 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER VY A762 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER VISIGOTHIC Z A764 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER THORN WITH STROKE A766 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER THORN WITH STROKE THROUGH DESCENDER A768 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER VEND A76A ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER ET A76C ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER IS A76E ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER CON A779 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER INSULAR D A77B ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER INSULAR F A77D..A77E ; Changes_When_Casefolded # L& [2] LATIN CAPITAL LETTER INSULAR G..LATIN CAPITAL LETTER TURNED INSULAR G A780 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER TURNED L A782 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER INSULAR R A784 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER INSULAR S A786 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER INSULAR T A78B ; 
Changes_When_Casefolded # L& LATIN CAPITAL LETTER SALTILLO A78D ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER TURNED H A790 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER N WITH DESCENDER A792 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER C WITH BAR A796 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER B WITH FLOURISH A798 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER F WITH STROKE A79A ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER VOLAPUK AE A79C ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER VOLAPUK OE A79E ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER VOLAPUK UE A7A0 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER G WITH OBLIQUE STROKE A7A2 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER K WITH OBLIQUE STROKE A7A4 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER N WITH OBLIQUE STROKE A7A6 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER R WITH OBLIQUE STROKE A7A8 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER S WITH OBLIQUE STROKE A7AA..A7AE ; Changes_When_Casefolded # L& [5] LATIN CAPITAL LETTER H WITH HOOK..LATIN CAPITAL LETTER SMALL CAPITAL I A7B0..A7B4 ; Changes_When_Casefolded # L& [5] LATIN CAPITAL LETTER TURNED K..LATIN CAPITAL LETTER BETA A7B6 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER OMEGA A7B8 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER U WITH STROKE A7BA ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER GLOTTAL A A7BC ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER GLOTTAL I A7BE ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER GLOTTAL U A7C0 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER OLD POLISH O A7C2 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER ANGLICANA W A7C4..A7C7 ; Changes_When_Casefolded # L& [4] LATIN CAPITAL LETTER C WITH PALATAL HOOK..LATIN CAPITAL LETTER D WITH SHORT STROKE OVERLAY A7C9 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER S WITH SHORT STROKE OVERLAY A7CB..A7CC ; Changes_When_Casefolded # L& [2] LATIN CAPITAL LETTER RAMS 
HORN..LATIN CAPITAL LETTER S WITH DIAGONAL STROKE A7CE ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER PHARYNGEAL VOICED FRICATIVE A7D0 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER CLOSED INSULAR G A7D2 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER DOUBLE THORN A7D4 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER DOUBLE WYNN A7D6 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER MIDDLE SCOTS S A7D8 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER SIGMOID S A7DA ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER LAMBDA A7DC ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER LAMBDA WITH STROKE A7F5 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER REVERSED HALF H AB70..ABBF ; Changes_When_Casefolded # L& [80] CHEROKEE SMALL LETTER A..CHEROKEE SMALL LETTER YA FB00..FB06 ; Changes_When_Casefolded # L& [7] LATIN SMALL LIGATURE FF..LATIN SMALL LIGATURE ST FB13..FB17 ; Changes_When_Casefolded # L& [5] ARMENIAN SMALL LIGATURE MEN NOW..ARMENIAN SMALL LIGATURE MEN XEH FF21..FF3A ; Changes_When_Casefolded # L& [26] FULLWIDTH LATIN CAPITAL LETTER A..FULLWIDTH LATIN CAPITAL LETTER Z 10400..10427 ; Changes_When_Casefolded # L& [40] DESERET CAPITAL LETTER LONG I..DESERET CAPITAL LETTER EW 104B0..104D3 ; Changes_When_Casefolded # L& [36] OSAGE CAPITAL LETTER A..OSAGE CAPITAL LETTER ZHA 10570..1057A ; Changes_When_Casefolded # L& [11] VITHKUQI CAPITAL LETTER A..VITHKUQI CAPITAL LETTER GA 1057C..1058A ; Changes_When_Casefolded # L& [15] VITHKUQI CAPITAL LETTER HA..VITHKUQI CAPITAL LETTER RE 1058C..10592 ; Changes_When_Casefolded # L& [7] VITHKUQI CAPITAL LETTER SE..VITHKUQI CAPITAL LETTER XE 10594..10595 ; Changes_When_Casefolded # L& [2] VITHKUQI CAPITAL LETTER Y..VITHKUQI CAPITAL LETTER ZE 10C80..10CB2 ; Changes_When_Casefolded # L& [51] OLD HUNGARIAN CAPITAL LETTER A..OLD HUNGARIAN CAPITAL LETTER US 10D50..10D65 ; Changes_When_Casefolded # L& [22] GARAY CAPITAL LETTER A..GARAY CAPITAL LETTER OLD NA 118A0..118BF ; Changes_When_Casefolded # L& [32] 
WARANG CITI CAPITAL LETTER NGAA..WARANG CITI CAPITAL LETTER VIYO 16E40..16E5F ; Changes_When_Casefolded # L& [32] MEDEFAIDRIN CAPITAL LETTER M..MEDEFAIDRIN CAPITAL LETTER Y 16EA0..16EB8 ; Changes_When_Casefolded # L& [25] BERIA ERFE CAPITAL LETTER ARKAB..BERIA ERFE CAPITAL LETTER AY 1E900..1E921 ; Changes_When_Casefolded # L& [34] ADLAM CAPITAL LETTER ALIF..ADLAM CAPITAL LETTER SHA # Total code points: 1561 # ================================================ # Derived Property: Changes_When_Casemapped (CWCM) # Characters whose normalized forms are not stable under case mapping. # For more information, see the definition of "isCased(X)" # in the "Conformance" / "Default Case Algorithms" section of the core specification. # Changes_When_Casemapped(X) is true when CWL(X), or CWT(X), or CWU(X) 0041..005A ; Changes_When_Casemapped # L& [26] LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z 0061..007A ; Changes_When_Casemapped # L& [26] LATIN SMALL LETTER A..LATIN SMALL LETTER Z 00B5 ; Changes_When_Casemapped # L& MICRO SIGN 00C0..00D6 ; Changes_When_Casemapped # L& [23] LATIN CAPITAL LETTER A WITH GRAVE..LATIN CAPITAL LETTER O WITH DIAERESIS 00D8..00F6 ; Changes_When_Casemapped # L& [31] LATIN CAPITAL LETTER O WITH STROKE..LATIN SMALL LETTER O WITH DIAERESIS 00F8..0137 ; Changes_When_Casemapped # L& [64] LATIN SMALL LETTER O WITH STROKE..LATIN SMALL LETTER K WITH CEDILLA 0139..018C ; Changes_When_Casemapped # L& [84] LATIN CAPITAL LETTER L WITH ACUTE..LATIN SMALL LETTER D WITH TOPBAR 018E..01A9 ; Changes_When_Casemapped # L& [28] LATIN CAPITAL LETTER REVERSED E..LATIN CAPITAL LETTER ESH 01AC..01B9 ; Changes_When_Casemapped # L& [14] LATIN CAPITAL LETTER T WITH HOOK..LATIN SMALL LETTER EZH REVERSED 01BC..01BD ; Changes_When_Casemapped # L& [2] LATIN CAPITAL LETTER TONE FIVE..LATIN SMALL LETTER TONE FIVE 01BF ; Changes_When_Casemapped # L& LATIN LETTER WYNN 01C4..0220 ; Changes_When_Casemapped # L& [93] LATIN CAPITAL LETTER DZ WITH CARON..LATIN CAPITAL LETTER N WITH LONG 
RIGHT LEG 0222..0233 ; Changes_When_Casemapped # L& [18] LATIN CAPITAL LETTER OU..LATIN SMALL LETTER Y WITH MACRON 023A..0254 ; Changes_When_Casemapped # L& [27] LATIN CAPITAL LETTER A WITH STROKE..LATIN SMALL LETTER OPEN O 0256..0257 ; Changes_When_Casemapped # L& [2] LATIN SMALL LETTER D WITH TAIL..LATIN SMALL LETTER D WITH HOOK 0259 ; Changes_When_Casemapped # L& LATIN SMALL LETTER SCHWA 025B..025C ; Changes_When_Casemapped # L& [2] LATIN SMALL LETTER OPEN E..LATIN SMALL LETTER REVERSED OPEN E 0260..0261 ; Changes_When_Casemapped # L& [2] LATIN SMALL LETTER G WITH HOOK..LATIN SMALL LETTER SCRIPT G 0263..0266 ; Changes_When_Casemapped # L& [4] LATIN SMALL LETTER GAMMA..LATIN SMALL LETTER H WITH HOOK 0268..026C ; Changes_When_Casemapped # L& [5] LATIN SMALL LETTER I WITH STROKE..LATIN SMALL LETTER L WITH BELT 026F ; Changes_When_Casemapped # L& LATIN SMALL LETTER TURNED M 0271..0272 ; Changes_When_Casemapped # L& [2] LATIN SMALL LETTER M WITH HOOK..LATIN SMALL LETTER N WITH LEFT HOOK 0275 ; Changes_When_Casemapped # L& LATIN SMALL LETTER BARRED O 027D ; Changes_When_Casemapped # L& LATIN SMALL LETTER R WITH TAIL 0280 ; Changes_When_Casemapped # L& LATIN LETTER SMALL CAPITAL R 0282..0283 ; Changes_When_Casemapped # L& [2] LATIN SMALL LETTER S WITH HOOK..LATIN SMALL LETTER ESH 0287..028C ; Changes_When_Casemapped # L& [6] LATIN SMALL LETTER TURNED T..LATIN SMALL LETTER TURNED V 0292 ; Changes_When_Casemapped # L& LATIN SMALL LETTER EZH 029D..029E ; Changes_When_Casemapped # L& [2] LATIN SMALL LETTER J WITH CROSSED-TAIL..LATIN SMALL LETTER TURNED K 0345 ; Changes_When_Casemapped # Mn COMBINING GREEK YPOGEGRAMMENI 0370..0373 ; Changes_When_Casemapped # L& [4] GREEK CAPITAL LETTER HETA..GREEK SMALL LETTER ARCHAIC SAMPI 0376..0377 ; Changes_When_Casemapped # L& [2] GREEK CAPITAL LETTER PAMPHYLIAN DIGAMMA..GREEK SMALL LETTER PAMPHYLIAN DIGAMMA 037B..037D ; Changes_When_Casemapped # L& [3] GREEK SMALL REVERSED LUNATE SIGMA SYMBOL..GREEK SMALL REVERSED DOTTED LUNATE SIGMA 
SYMBOL 037F ; Changes_When_Casemapped # L& GREEK CAPITAL LETTER YOT 0386 ; Changes_When_Casemapped # L& GREEK CAPITAL LETTER ALPHA WITH TONOS 0388..038A ; Changes_When_Casemapped # L& [3] GREEK CAPITAL LETTER EPSILON WITH TONOS..GREEK CAPITAL LETTER IOTA WITH TONOS 038C ; Changes_When_Casemapped # L& GREEK CAPITAL LETTER OMICRON WITH TONOS 038E..03A1 ; Changes_When_Casemapped # L& [20] GREEK CAPITAL LETTER UPSILON WITH TONOS..GREEK CAPITAL LETTER RHO 03A3..03D1 ; Changes_When_Casemapped # L& [47] GREEK CAPITAL LETTER SIGMA..GREEK THETA SYMBOL 03D5..03F5 ; Changes_When_Casemapped # L& [33] GREEK PHI SYMBOL..GREEK LUNATE EPSILON SYMBOL 03F7..03FB ; Changes_When_Casemapped # L& [5] GREEK CAPITAL LETTER SHO..GREEK SMALL LETTER SAN 03FD..0481 ; Changes_When_Casemapped # L& [133] GREEK CAPITAL REVERSED LUNATE SIGMA SYMBOL..CYRILLIC SMALL LETTER KOPPA 048A..052F ; Changes_When_Casemapped # L& [166] CYRILLIC CAPITAL LETTER SHORT I WITH TAIL..CYRILLIC SMALL LETTER EL WITH DESCENDER 0531..0556 ; Changes_When_Casemapped # L& [38] ARMENIAN CAPITAL LETTER AYB..ARMENIAN CAPITAL LETTER FEH 0561..0587 ; Changes_When_Casemapped # L& [39] ARMENIAN SMALL LETTER AYB..ARMENIAN SMALL LIGATURE ECH YIWN 10A0..10C5 ; Changes_When_Casemapped # L& [38] GEORGIAN CAPITAL LETTER AN..GEORGIAN CAPITAL LETTER HOE 10C7 ; Changes_When_Casemapped # L& GEORGIAN CAPITAL LETTER YN 10CD ; Changes_When_Casemapped # L& GEORGIAN CAPITAL LETTER AEN 10D0..10FA ; Changes_When_Casemapped # L& [43] GEORGIAN LETTER AN..GEORGIAN LETTER AIN 10FD..10FF ; Changes_When_Casemapped # L& [3] GEORGIAN LETTER AEN..GEORGIAN LETTER LABIAL SIGN 13A0..13F5 ; Changes_When_Casemapped # L& [86] CHEROKEE LETTER A..CHEROKEE LETTER MV 13F8..13FD ; Changes_When_Casemapped # L& [6] CHEROKEE SMALL LETTER YE..CHEROKEE SMALL LETTER MV 1C80..1C8A ; Changes_When_Casemapped # L& [11] CYRILLIC SMALL LETTER ROUNDED VE..CYRILLIC SMALL LETTER TJE 1C90..1CBA ; Changes_When_Casemapped # L& [43] GEORGIAN MTAVRULI CAPITAL LETTER AN..GEORGIAN 
MTAVRULI CAPITAL LETTER AIN 1CBD..1CBF ; Changes_When_Casemapped # L& [3] GEORGIAN MTAVRULI CAPITAL LETTER AEN..GEORGIAN MTAVRULI CAPITAL LETTER LABIAL SIGN 1D79 ; Changes_When_Casemapped # L& LATIN SMALL LETTER INSULAR G 1D7D ; Changes_When_Casemapped # L& LATIN SMALL LETTER P WITH STROKE 1D8E ; Changes_When_Casemapped # L& LATIN SMALL LETTER Z WITH PALATAL HOOK 1E00..1E9B ; Changes_When_Casemapped # L& [156] LATIN CAPITAL LETTER A WITH RING BELOW..LATIN SMALL LETTER LONG S WITH DOT ABOVE 1E9E ; Changes_When_Casemapped # L& LATIN CAPITAL LETTER SHARP S 1EA0..1F15 ; Changes_When_Casemapped # L& [118] LATIN CAPITAL LETTER A WITH DOT BELOW..GREEK SMALL LETTER EPSILON WITH DASIA AND OXIA 1F18..1F1D ; Changes_When_Casemapped # L& [6] GREEK CAPITAL LETTER EPSILON WITH PSILI..GREEK CAPITAL LETTER EPSILON WITH DASIA AND OXIA 1F20..1F45 ; Changes_When_Casemapped # L& [38] GREEK SMALL LETTER ETA WITH PSILI..GREEK SMALL LETTER OMICRON WITH DASIA AND OXIA 1F48..1F4D ; Changes_When_Casemapped # L& [6] GREEK CAPITAL LETTER OMICRON WITH PSILI..GREEK CAPITAL LETTER OMICRON WITH DASIA AND OXIA 1F50..1F57 ; Changes_When_Casemapped # L& [8] GREEK SMALL LETTER UPSILON WITH PSILI..GREEK SMALL LETTER UPSILON WITH DASIA AND PERISPOMENI 1F59 ; Changes_When_Casemapped # L& GREEK CAPITAL LETTER UPSILON WITH DASIA 1F5B ; Changes_When_Casemapped # L& GREEK CAPITAL LETTER UPSILON WITH DASIA AND VARIA 1F5D ; Changes_When_Casemapped # L& GREEK CAPITAL LETTER UPSILON WITH DASIA AND OXIA 1F5F..1F7D ; Changes_When_Casemapped # L& [31] GREEK CAPITAL LETTER UPSILON WITH DASIA AND PERISPOMENI..GREEK SMALL LETTER OMEGA WITH OXIA 1F80..1FB4 ; Changes_When_Casemapped # L& [53] GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI..GREEK SMALL LETTER ALPHA WITH OXIA AND YPOGEGRAMMENI 1FB6..1FBC ; Changes_When_Casemapped # L& [7] GREEK SMALL LETTER ALPHA WITH PERISPOMENI..GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI 1FBE ; Changes_When_Casemapped # L& GREEK PROSGEGRAMMENI 1FC2..1FC4 ; 
Changes_When_Casemapped # L& [3] GREEK SMALL LETTER ETA WITH VARIA AND YPOGEGRAMMENI..GREEK SMALL LETTER ETA WITH OXIA AND YPOGEGRAMMENI 1FC6..1FCC ; Changes_When_Casemapped # L& [7] GREEK SMALL LETTER ETA WITH PERISPOMENI..GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI 1FD0..1FD3 ; Changes_When_Casemapped # L& [4] GREEK SMALL LETTER IOTA WITH VRACHY..GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA 1FD6..1FDB ; Changes_When_Casemapped # L& [6] GREEK SMALL LETTER IOTA WITH PERISPOMENI..GREEK CAPITAL LETTER IOTA WITH OXIA 1FE0..1FEC ; Changes_When_Casemapped # L& [13] GREEK SMALL LETTER UPSILON WITH VRACHY..GREEK CAPITAL LETTER RHO WITH DASIA 1FF2..1FF4 ; Changes_When_Casemapped # L& [3] GREEK SMALL LETTER OMEGA WITH VARIA AND YPOGEGRAMMENI..GREEK SMALL LETTER OMEGA WITH OXIA AND YPOGEGRAMMENI 1FF6..1FFC ; Changes_When_Casemapped # L& [7] GREEK SMALL LETTER OMEGA WITH PERISPOMENI..GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI 2126 ; Changes_When_Casemapped # L& OHM SIGN 212A..212B ; Changes_When_Casemapped # L& [2] KELVIN SIGN..ANGSTROM SIGN 2132 ; Changes_When_Casemapped # L& TURNED CAPITAL F 214E ; Changes_When_Casemapped # L& TURNED SMALL F 2160..217F ; Changes_When_Casemapped # Nl [32] ROMAN NUMERAL ONE..SMALL ROMAN NUMERAL ONE THOUSAND 2183..2184 ; Changes_When_Casemapped # L& [2] ROMAN NUMERAL REVERSED ONE HUNDRED..LATIN SMALL LETTER REVERSED C 24B6..24E9 ; Changes_When_Casemapped # So [52] CIRCLED LATIN CAPITAL LETTER A..CIRCLED LATIN SMALL LETTER Z 2C00..2C70 ; Changes_When_Casemapped # L& [113] GLAGOLITIC CAPITAL LETTER AZU..LATIN CAPITAL LETTER TURNED ALPHA 2C72..2C73 ; Changes_When_Casemapped # L& [2] LATIN CAPITAL LETTER W WITH HOOK..LATIN SMALL LETTER W WITH HOOK 2C75..2C76 ; Changes_When_Casemapped # L& [2] LATIN CAPITAL LETTER HALF H..LATIN SMALL LETTER HALF H 2C7E..2CE3 ; Changes_When_Casemapped # L& [102] LATIN CAPITAL LETTER S WITH SWASH TAIL..COPTIC SMALL LETTER OLD NUBIAN WAU 2CEB..2CEE ; Changes_When_Casemapped # L& [4] COPTIC CAPITAL LETTER 
CRYPTOGRAMMIC SHEI..COPTIC SMALL LETTER CRYPTOGRAMMIC GANGIA 2CF2..2CF3 ; Changes_When_Casemapped # L& [2] COPTIC CAPITAL LETTER BOHAIRIC KHEI..COPTIC SMALL LETTER BOHAIRIC KHEI 2D00..2D25 ; Changes_When_Casemapped # L& [38] GEORGIAN SMALL LETTER AN..GEORGIAN SMALL LETTER HOE 2D27 ; Changes_When_Casemapped # L& GEORGIAN SMALL LETTER YN 2D2D ; Changes_When_Casemapped # L& GEORGIAN SMALL LETTER AEN A640..A66D ; Changes_When_Casemapped # L& [46] CYRILLIC CAPITAL LETTER ZEMLYA..CYRILLIC SMALL LETTER DOUBLE MONOCULAR O A680..A69B ; Changes_When_Casemapped # L& [28] CYRILLIC CAPITAL LETTER DWE..CYRILLIC SMALL LETTER CROSSED O A722..A72F ; Changes_When_Casemapped # L& [14] LATIN CAPITAL LETTER EGYPTOLOGICAL ALEF..LATIN SMALL LETTER CUATRILLO WITH COMMA A732..A76F ; Changes_When_Casemapped # L& [62] LATIN CAPITAL LETTER AA..LATIN SMALL LETTER CON A779..A787 ; Changes_When_Casemapped # L& [15] LATIN CAPITAL LETTER INSULAR D..LATIN SMALL LETTER INSULAR T A78B..A78D ; Changes_When_Casemapped # L& [3] LATIN CAPITAL LETTER SALTILLO..LATIN CAPITAL LETTER TURNED H A790..A794 ; Changes_When_Casemapped # L& [5] LATIN CAPITAL LETTER N WITH DESCENDER..LATIN SMALL LETTER C WITH PALATAL HOOK A796..A7AE ; Changes_When_Casemapped # L& [25] LATIN CAPITAL LETTER B WITH FLOURISH..LATIN CAPITAL LETTER SMALL CAPITAL I A7B0..A7DC ; Changes_When_Casemapped # L& [45] LATIN CAPITAL LETTER TURNED K..LATIN CAPITAL LETTER LAMBDA WITH STROKE A7F5..A7F6 ; Changes_When_Casemapped # L& [2] LATIN CAPITAL LETTER REVERSED HALF H..LATIN SMALL LETTER REVERSED HALF H AB53 ; Changes_When_Casemapped # L& LATIN SMALL LETTER CHI AB70..ABBF ; Changes_When_Casemapped # L& [80] CHEROKEE SMALL LETTER A..CHEROKEE SMALL LETTER YA FB00..FB06 ; Changes_When_Casemapped # L& [7] LATIN SMALL LIGATURE FF..LATIN SMALL LIGATURE ST FB13..FB17 ; Changes_When_Casemapped # L& [5] ARMENIAN SMALL LIGATURE MEN NOW..ARMENIAN SMALL LIGATURE MEN XEH FF21..FF3A ; Changes_When_Casemapped # L& [26] FULLWIDTH LATIN CAPITAL LETTER 
A..FULLWIDTH LATIN CAPITAL LETTER Z FF41..FF5A ; Changes_When_Casemapped # L& [26] FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH LATIN SMALL LETTER Z 10400..1044F ; Changes_When_Casemapped # L& [80] DESERET CAPITAL LETTER LONG I..DESERET SMALL LETTER EW 104B0..104D3 ; Changes_When_Casemapped # L& [36] OSAGE CAPITAL LETTER A..OSAGE CAPITAL LETTER ZHA 104D8..104FB ; Changes_When_Casemapped # L& [36] OSAGE SMALL LETTER A..OSAGE SMALL LETTER ZHA 10570..1057A ; Changes_When_Casemapped # L& [11] VITHKUQI CAPITAL LETTER A..VITHKUQI CAPITAL LETTER GA 1057C..1058A ; Changes_When_Casemapped # L& [15] VITHKUQI CAPITAL LETTER HA..VITHKUQI CAPITAL LETTER RE 1058C..10592 ; Changes_When_Casemapped # L& [7] VITHKUQI CAPITAL LETTER SE..VITHKUQI CAPITAL LETTER XE 10594..10595 ; Changes_When_Casemapped # L& [2] VITHKUQI CAPITAL LETTER Y..VITHKUQI CAPITAL LETTER ZE 10597..105A1 ; Changes_When_Casemapped # L& [11] VITHKUQI SMALL LETTER A..VITHKUQI SMALL LETTER GA 105A3..105B1 ; Changes_When_Casemapped # L& [15] VITHKUQI SMALL LETTER HA..VITHKUQI SMALL LETTER RE 105B3..105B9 ; Changes_When_Casemapped # L& [7] VITHKUQI SMALL LETTER SE..VITHKUQI SMALL LETTER XE 105BB..105BC ; Changes_When_Casemapped # L& [2] VITHKUQI SMALL LETTER Y..VITHKUQI SMALL LETTER ZE 10C80..10CB2 ; Changes_When_Casemapped # L& [51] OLD HUNGARIAN CAPITAL LETTER A..OLD HUNGARIAN CAPITAL LETTER US 10CC0..10CF2 ; Changes_When_Casemapped # L& [51] OLD HUNGARIAN SMALL LETTER A..OLD HUNGARIAN SMALL LETTER US 10D50..10D65 ; Changes_When_Casemapped # L& [22] GARAY CAPITAL LETTER A..GARAY CAPITAL LETTER OLD NA 10D70..10D85 ; Changes_When_Casemapped # L& [22] GARAY SMALL LETTER A..GARAY SMALL LETTER OLD NA 118A0..118DF ; Changes_When_Casemapped # L& [64] WARANG CITI CAPITAL LETTER NGAA..WARANG CITI SMALL LETTER VIYO 16E40..16E7F ; Changes_When_Casemapped # L& [64] MEDEFAIDRIN CAPITAL LETTER M..MEDEFAIDRIN SMALL LETTER Y 16EA0..16EB8 ; Changes_When_Casemapped # L& [25] BERIA ERFE CAPITAL LETTER ARKAB..BERIA ERFE CAPITAL LETTER AY 
16EBB..16ED3 ; Changes_When_Casemapped # L& [25] BERIA ERFE SMALL LETTER ARKAB..BERIA ERFE SMALL LETTER AY 1E900..1E943 ; Changes_When_Casemapped # L& [68] ADLAM CAPITAL LETTER ALIF..ADLAM SMALL LETTER SHA # Total code points: 3037 # ================================================ # Derived Property: ID_Start # Characters that can start an identifier. # Generated from: # Lu + Ll + Lt + Lm + Lo + Nl # + Other_ID_Start # - Pattern_Syntax # - Pattern_White_Space # NOTE: See UAX #31 for more information 0041..005A ; ID_Start # L& [26] LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z 0061..007A ; ID_Start # L& [26] LATIN SMALL LETTER A..LATIN SMALL LETTER Z 00AA ; ID_Start # Lo FEMININE ORDINAL INDICATOR 00B5 ; ID_Start # L& MICRO SIGN 00BA ; ID_Start # Lo MASCULINE ORDINAL INDICATOR 00C0..00D6 ; ID_Start # L& [23] LATIN CAPITAL LETTER A WITH GRAVE..LATIN CAPITAL LETTER O WITH DIAERESIS 00D8..00F6 ; ID_Start # L& [31] LATIN CAPITAL LETTER O WITH STROKE..LATIN SMALL LETTER O WITH DIAERESIS 00F8..01BA ; ID_Start # L& [195] LATIN SMALL LETTER O WITH STROKE..LATIN SMALL LETTER EZH WITH TAIL 01BB ; ID_Start # Lo LATIN LETTER TWO WITH STROKE 01BC..01BF ; ID_Start # L& [4] LATIN CAPITAL LETTER TONE FIVE..LATIN LETTER WYNN 01C0..01C3 ; ID_Start # Lo [4] LATIN LETTER DENTAL CLICK..LATIN LETTER RETROFLEX CLICK 01C4..0293 ; ID_Start # L& [208] LATIN CAPITAL LETTER DZ WITH CARON..LATIN SMALL LETTER EZH WITH CURL 0294..0295 ; ID_Start # Lo [2] LATIN LETTER GLOTTAL STOP..LATIN LETTER PHARYNGEAL VOICED FRICATIVE 0296..02AF ; ID_Start # L& [26] LATIN LETTER INVERTED GLOTTAL STOP..LATIN SMALL LETTER TURNED H WITH FISHHOOK AND TAIL 02B0..02C1 ; ID_Start # Lm [18] MODIFIER LETTER SMALL H..MODIFIER LETTER REVERSED GLOTTAL STOP 02C6..02D1 ; ID_Start # Lm [12] MODIFIER LETTER CIRCUMFLEX ACCENT..MODIFIER LETTER HALF TRIANGULAR COLON 02E0..02E4 ; ID_Start # Lm [5] MODIFIER LETTER SMALL GAMMA..MODIFIER LETTER SMALL REVERSED GLOTTAL STOP 02EC ; ID_Start # Lm MODIFIER LETTER VOICING 02EE ; 
ID_Start # Lm MODIFIER LETTER DOUBLE APOSTROPHE 0370..0373 ; ID_Start # L& [4] GREEK CAPITAL LETTER HETA..GREEK SMALL LETTER ARCHAIC SAMPI 0374 ; ID_Start # Lm GREEK NUMERAL SIGN 0376..0377 ; ID_Start # L& [2] GREEK CAPITAL LETTER PAMPHYLIAN DIGAMMA..GREEK SMALL LETTER PAMPHYLIAN DIGAMMA 037A ; ID_Start # Lm GREEK YPOGEGRAMMENI 037B..037D ; ID_Start # L& [3] GREEK SMALL REVERSED LUNATE SIGMA SYMBOL..GREEK SMALL REVERSED DOTTED LUNATE SIGMA SYMBOL 037F ; ID_Start # L& GREEK CAPITAL LETTER YOT 0386 ; ID_Start # L& GREEK CAPITAL LETTER ALPHA WITH TONOS 0388..038A ; ID_Start # L& [3] GREEK CAPITAL LETTER EPSILON WITH TONOS..GREEK CAPITAL LETTER IOTA WITH TONOS 038C ; ID_Start # L& GREEK CAPITAL LETTER OMICRON WITH TONOS 038E..03A1 ; ID_Start # L& [20] GREEK CAPITAL LETTER UPSILON WITH TONOS..GREEK CAPITAL LETTER RHO 03A3..03F5 ; ID_Start # L& [83] GREEK CAPITAL LETTER SIGMA..GREEK LUNATE EPSILON SYMBOL 03F7..0481 ; ID_Start # L& [139] GREEK CAPITAL LETTER SHO..CYRILLIC SMALL LETTER KOPPA 048A..052F ; ID_Start # L& [166] CYRILLIC CAPITAL LETTER SHORT I WITH TAIL..CYRILLIC SMALL LETTER EL WITH DESCENDER 0531..0556 ; ID_Start # L& [38] ARMENIAN CAPITAL LETTER AYB..ARMENIAN CAPITAL LETTER FEH 0559 ; ID_Start # Lm ARMENIAN MODIFIER LETTER LEFT HALF RING 0560..0588 ; ID_Start # L& [41] ARMENIAN SMALL LETTER TURNED AYB..ARMENIAN SMALL LETTER YI WITH STROKE 05D0..05EA ; ID_Start # Lo [27] HEBREW LETTER ALEF..HEBREW LETTER TAV 05EF..05F2 ; ID_Start # Lo [4] HEBREW YOD TRIANGLE..HEBREW LIGATURE YIDDISH DOUBLE YOD 0620..063F ; ID_Start # Lo [32] ARABIC LETTER KASHMIRI YEH..ARABIC LETTER FARSI YEH WITH THREE DOTS ABOVE 0640 ; ID_Start # Lm ARABIC TATWEEL 0641..064A ; ID_Start # Lo [10] ARABIC LETTER FEH..ARABIC LETTER YEH 066E..066F ; ID_Start # Lo [2] ARABIC LETTER DOTLESS BEH..ARABIC LETTER DOTLESS QAF 0671..06D3 ; ID_Start # Lo [99] ARABIC LETTER ALEF WASLA..ARABIC LETTER YEH BARREE WITH HAMZA ABOVE 06D5 ; ID_Start # Lo ARABIC LETTER AE 06E5..06E6 ; ID_Start # Lm [2] ARABIC 
SMALL WAW..ARABIC SMALL YEH 06EE..06EF ; ID_Start # Lo [2] ARABIC LETTER DAL WITH INVERTED V..ARABIC LETTER REH WITH INVERTED V 06FA..06FC ; ID_Start # Lo [3] ARABIC LETTER SHEEN WITH DOT BELOW..ARABIC LETTER GHAIN WITH DOT BELOW 06FF ; ID_Start # Lo ARABIC LETTER HEH WITH INVERTED V 0710 ; ID_Start # Lo SYRIAC LETTER ALAPH 0712..072F ; ID_Start # Lo [30] SYRIAC LETTER BETH..SYRIAC LETTER PERSIAN DHALATH 074D..07A5 ; ID_Start # Lo [89] SYRIAC LETTER SOGDIAN ZHAIN..THAANA LETTER WAAVU 07B1 ; ID_Start # Lo THAANA LETTER NAA 07CA..07EA ; ID_Start # Lo [33] NKO LETTER A..NKO LETTER JONA RA 07F4..07F5 ; ID_Start # Lm [2] NKO HIGH TONE APOSTROPHE..NKO LOW TONE APOSTROPHE 07FA ; ID_Start # Lm NKO LAJANYALAN 0800..0815 ; ID_Start # Lo [22] SAMARITAN LETTER ALAF..SAMARITAN LETTER TAAF 081A ; ID_Start # Lm SAMARITAN MODIFIER LETTER EPENTHETIC YUT 0824 ; ID_Start # Lm SAMARITAN MODIFIER LETTER SHORT A 0828 ; ID_Start # Lm SAMARITAN MODIFIER LETTER I 0840..0858 ; ID_Start # Lo [25] MANDAIC LETTER HALQA..MANDAIC LETTER AIN 0860..086A ; ID_Start # Lo [11] SYRIAC LETTER MALAYALAM NGA..SYRIAC LETTER MALAYALAM SSA 0870..0887 ; ID_Start # Lo [24] ARABIC LETTER ALEF WITH ATTACHED FATHA..ARABIC BASELINE ROUND DOT 0889..088F ; ID_Start # Lo [7] ARABIC LETTER NOON WITH INVERTED SMALL V..ARABIC LETTER NOON WITH RING ABOVE 08A0..08C8 ; ID_Start # Lo [41] ARABIC LETTER BEH WITH SMALL V BELOW..ARABIC LETTER GRAF 08C9 ; ID_Start # Lm ARABIC SMALL FARSI YEH 0904..0939 ; ID_Start # Lo [54] DEVANAGARI LETTER SHORT A..DEVANAGARI LETTER HA 093D ; ID_Start # Lo DEVANAGARI SIGN AVAGRAHA 0950 ; ID_Start # Lo DEVANAGARI OM 0958..0961 ; ID_Start # Lo [10] DEVANAGARI LETTER QA..DEVANAGARI LETTER VOCALIC LL 0971 ; ID_Start # Lm DEVANAGARI SIGN HIGH SPACING DOT 0972..0980 ; ID_Start # Lo [15] DEVANAGARI LETTER CANDRA A..BENGALI ANJI 0985..098C ; ID_Start # Lo [8] BENGALI LETTER A..BENGALI LETTER VOCALIC L 098F..0990 ; ID_Start # Lo [2] BENGALI LETTER E..BENGALI LETTER AI 0993..09A8 ; ID_Start # Lo [22] 
BENGALI LETTER O..BENGALI LETTER NA 09AA..09B0 ; ID_Start # Lo [7] BENGALI LETTER PA..BENGALI LETTER RA 09B2 ; ID_Start # Lo BENGALI LETTER LA 09B6..09B9 ; ID_Start # Lo [4] BENGALI LETTER SHA..BENGALI LETTER HA 09BD ; ID_Start # Lo BENGALI SIGN AVAGRAHA 09CE ; ID_Start # Lo BENGALI LETTER KHANDA TA 09DC..09DD ; ID_Start # Lo [2] BENGALI LETTER RRA..BENGALI LETTER RHA 09DF..09E1 ; ID_Start # Lo [3] BENGALI LETTER YYA..BENGALI LETTER VOCALIC LL 09F0..09F1 ; ID_Start # Lo [2] BENGALI LETTER RA WITH MIDDLE DIAGONAL..BENGALI LETTER RA WITH LOWER DIAGONAL 09FC ; ID_Start # Lo BENGALI LETTER VEDIC ANUSVARA 0A05..0A0A ; ID_Start # Lo [6] GURMUKHI LETTER A..GURMUKHI LETTER UU 0A0F..0A10 ; ID_Start # Lo [2] GURMUKHI LETTER EE..GURMUKHI LETTER AI 0A13..0A28 ; ID_Start # Lo [22] GURMUKHI LETTER OO..GURMUKHI LETTER NA 0A2A..0A30 ; ID_Start # Lo [7] GURMUKHI LETTER PA..GURMUKHI LETTER RA 0A32..0A33 ; ID_Start # Lo [2] GURMUKHI LETTER LA..GURMUKHI LETTER LLA 0A35..0A36 ; ID_Start # Lo [2] GURMUKHI LETTER VA..GURMUKHI LETTER SHA 0A38..0A39 ; ID_Start # Lo [2] GURMUKHI LETTER SA..GURMUKHI LETTER HA 0A59..0A5C ; ID_Start # Lo [4] GURMUKHI LETTER KHHA..GURMUKHI LETTER RRA 0A5E ; ID_Start # Lo GURMUKHI LETTER FA 0A72..0A74 ; ID_Start # Lo [3] GURMUKHI IRI..GURMUKHI EK ONKAR 0A85..0A8D ; ID_Start # Lo [9] GUJARATI LETTER A..GUJARATI VOWEL CANDRA E 0A8F..0A91 ; ID_Start # Lo [3] GUJARATI LETTER E..GUJARATI VOWEL CANDRA O 0A93..0AA8 ; ID_Start # Lo [22] GUJARATI LETTER O..GUJARATI LETTER NA 0AAA..0AB0 ; ID_Start # Lo [7] GUJARATI LETTER PA..GUJARATI LETTER RA 0AB2..0AB3 ; ID_Start # Lo [2] GUJARATI LETTER LA..GUJARATI LETTER LLA 0AB5..0AB9 ; ID_Start # Lo [5] GUJARATI LETTER VA..GUJARATI LETTER HA 0ABD ; ID_Start # Lo GUJARATI SIGN AVAGRAHA 0AD0 ; ID_Start # Lo GUJARATI OM 0AE0..0AE1 ; ID_Start # Lo [2] GUJARATI LETTER VOCALIC RR..GUJARATI LETTER VOCALIC LL 0AF9 ; ID_Start # Lo GUJARATI LETTER ZHA 0B05..0B0C ; ID_Start # Lo [8] ORIYA LETTER A..ORIYA LETTER VOCALIC L 0B0F..0B10 ; 
ID_Start # Lo [2] ORIYA LETTER E..ORIYA LETTER AI 0B13..0B28 ; ID_Start # Lo [22] ORIYA LETTER O..ORIYA LETTER NA 0B2A..0B30 ; ID_Start # Lo [7] ORIYA LETTER PA..ORIYA LETTER RA 0B32..0B33 ; ID_Start # Lo [2] ORIYA LETTER LA..ORIYA LETTER LLA 0B35..0B39 ; ID_Start # Lo [5] ORIYA LETTER VA..ORIYA LETTER HA 0B3D ; ID_Start # Lo ORIYA SIGN AVAGRAHA 0B5C..0B5D ; ID_Start # Lo [2] ORIYA LETTER RRA..ORIYA LETTER RHA 0B5F..0B61 ; ID_Start # Lo [3] ORIYA LETTER YYA..ORIYA LETTER VOCALIC LL 0B71 ; ID_Start # Lo ORIYA LETTER WA 0B83 ; ID_Start # Lo TAMIL SIGN VISARGA 0B85..0B8A ; ID_Start # Lo [6] TAMIL LETTER A..TAMIL LETTER UU 0B8E..0B90 ; ID_Start # Lo [3] TAMIL LETTER E..TAMIL LETTER AI 0B92..0B95 ; ID_Start # Lo [4] TAMIL LETTER O..TAMIL LETTER KA 0B99..0B9A ; ID_Start # Lo [2] TAMIL LETTER NGA..TAMIL LETTER CA 0B9C ; ID_Start # Lo TAMIL LETTER JA 0B9E..0B9F ; ID_Start # Lo [2] TAMIL LETTER NYA..TAMIL LETTER TTA 0BA3..0BA4 ; ID_Start # Lo [2] TAMIL LETTER NNA..TAMIL LETTER TA 0BA8..0BAA ; ID_Start # Lo [3] TAMIL LETTER NA..TAMIL LETTER PA 0BAE..0BB9 ; ID_Start # Lo [12] TAMIL LETTER MA..TAMIL LETTER HA 0BD0 ; ID_Start # Lo TAMIL OM 0C05..0C0C ; ID_Start # Lo [8] TELUGU LETTER A..TELUGU LETTER VOCALIC L 0C0E..0C10 ; ID_Start # Lo [3] TELUGU LETTER E..TELUGU LETTER AI 0C12..0C28 ; ID_Start # Lo [23] TELUGU LETTER O..TELUGU LETTER NA 0C2A..0C39 ; ID_Start # Lo [16] TELUGU LETTER PA..TELUGU LETTER HA 0C3D ; ID_Start # Lo TELUGU SIGN AVAGRAHA 0C58..0C5A ; ID_Start # Lo [3] TELUGU LETTER TSA..TELUGU LETTER RRRA 0C5C..0C5D ; ID_Start # Lo [2] TELUGU ARCHAIC SHRII..TELUGU LETTER NAKAARA POLLU 0C60..0C61 ; ID_Start # Lo [2] TELUGU LETTER VOCALIC RR..TELUGU LETTER VOCALIC LL 0C80 ; ID_Start # Lo KANNADA SIGN SPACING CANDRABINDU 0C85..0C8C ; ID_Start # Lo [8] KANNADA LETTER A..KANNADA LETTER VOCALIC L 0C8E..0C90 ; ID_Start # Lo [3] KANNADA LETTER E..KANNADA LETTER AI 0C92..0CA8 ; ID_Start # Lo [23] KANNADA LETTER O..KANNADA LETTER NA 0CAA..0CB3 ; ID_Start # Lo [10] KANNADA LETTER 
PA..KANNADA LETTER LLA 0CB5..0CB9 ; ID_Start # Lo [5] KANNADA LETTER VA..KANNADA LETTER HA 0CBD ; ID_Start # Lo KANNADA SIGN AVAGRAHA 0CDC..0CDE ; ID_Start # Lo [3] KANNADA ARCHAIC SHRII..KANNADA LETTER FA 0CE0..0CE1 ; ID_Start # Lo [2] KANNADA LETTER VOCALIC RR..KANNADA LETTER VOCALIC LL 0CF1..0CF2 ; ID_Start # Lo [2] KANNADA SIGN JIHVAMULIYA..KANNADA SIGN UPADHMANIYA 0D04..0D0C ; ID_Start # Lo [9] MALAYALAM LETTER VEDIC ANUSVARA..MALAYALAM LETTER VOCALIC L 0D0E..0D10 ; ID_Start # Lo [3] MALAYALAM LETTER E..MALAYALAM LETTER AI 0D12..0D3A ; ID_Start # Lo [41] MALAYALAM LETTER O..MALAYALAM LETTER TTTA 0D3D ; ID_Start # Lo MALAYALAM SIGN AVAGRAHA 0D4E ; ID_Start # Lo MALAYALAM LETTER DOT REPH 0D54..0D56 ; ID_Start # Lo [3] MALAYALAM LETTER CHILLU M..MALAYALAM LETTER CHILLU LLL 0D5F..0D61 ; ID_Start # Lo [3] MALAYALAM LETTER ARCHAIC II..MALAYALAM LETTER VOCALIC LL 0D7A..0D7F ; ID_Start # Lo [6] MALAYALAM LETTER CHILLU NN..MALAYALAM LETTER CHILLU K 0D85..0D96 ; ID_Start # Lo [18] SINHALA LETTER AYANNA..SINHALA LETTER AUYANNA 0D9A..0DB1 ; ID_Start # Lo [24] SINHALA LETTER ALPAPRAANA KAYANNA..SINHALA LETTER DANTAJA NAYANNA 0DB3..0DBB ; ID_Start # Lo [9] SINHALA LETTER SANYAKA DAYANNA..SINHALA LETTER RAYANNA 0DBD ; ID_Start # Lo SINHALA LETTER DANTAJA LAYANNA 0DC0..0DC6 ; ID_Start # Lo [7] SINHALA LETTER VAYANNA..SINHALA LETTER FAYANNA 0E01..0E30 ; ID_Start # Lo [48] THAI CHARACTER KO KAI..THAI CHARACTER SARA A 0E32..0E33 ; ID_Start # Lo [2] THAI CHARACTER SARA AA..THAI CHARACTER SARA AM 0E40..0E45 ; ID_Start # Lo [6] THAI CHARACTER SARA E..THAI CHARACTER LAKKHANGYAO 0E46 ; ID_Start # Lm THAI CHARACTER MAIYAMOK 0E81..0E82 ; ID_Start # Lo [2] LAO LETTER KO..LAO LETTER KHO SUNG 0E84 ; ID_Start # Lo LAO LETTER KHO TAM 0E86..0E8A ; ID_Start # Lo [5] LAO LETTER PALI GHA..LAO LETTER SO TAM 0E8C..0EA3 ; ID_Start # Lo [24] LAO LETTER PALI JHA..LAO LETTER LO LING 0EA5 ; ID_Start # Lo LAO LETTER LO LOOT 0EA7..0EB0 ; ID_Start # Lo [10] LAO LETTER WO..LAO VOWEL SIGN A 0EB2..0EB3 ; 
ID_Start # Lo [2] LAO VOWEL SIGN AA..LAO VOWEL SIGN AM 0EBD ; ID_Start # Lo LAO SEMIVOWEL SIGN NYO 0EC0..0EC4 ; ID_Start # Lo [5] LAO VOWEL SIGN E..LAO VOWEL SIGN AI 0EC6 ; ID_Start # Lm LAO KO LA 0EDC..0EDF ; ID_Start # Lo [4] LAO HO NO..LAO LETTER KHMU NYO 0F00 ; ID_Start # Lo TIBETAN SYLLABLE OM 0F40..0F47 ; ID_Start # Lo [8] TIBETAN LETTER KA..TIBETAN LETTER JA 0F49..0F6C ; ID_Start # Lo [36] TIBETAN LETTER NYA..TIBETAN LETTER RRA 0F88..0F8C ; ID_Start # Lo [5] TIBETAN SIGN LCE TSA CAN..TIBETAN SIGN INVERTED MCHU CAN 1000..102A ; ID_Start # Lo [43] MYANMAR LETTER KA..MYANMAR LETTER AU 103F ; ID_Start # Lo MYANMAR LETTER GREAT SA 1050..1055 ; ID_Start # Lo [6] MYANMAR LETTER SHA..MYANMAR LETTER VOCALIC LL 105A..105D ; ID_Start # Lo [4] MYANMAR LETTER MON NGA..MYANMAR LETTER MON BBE 1061 ; ID_Start # Lo MYANMAR LETTER SGAW KAREN SHA 1065..1066 ; ID_Start # Lo [2] MYANMAR LETTER WESTERN PWO KAREN THA..MYANMAR LETTER WESTERN PWO KAREN PWA 106E..1070 ; ID_Start # Lo [3] MYANMAR LETTER EASTERN PWO KAREN NNA..MYANMAR LETTER EASTERN PWO KAREN GHWA 1075..1081 ; ID_Start # Lo [13] MYANMAR LETTER SHAN KA..MYANMAR LETTER SHAN HA 108E ; ID_Start # Lo MYANMAR LETTER RUMAI PALAUNG FA 10A0..10C5 ; ID_Start # L& [38] GEORGIAN CAPITAL LETTER AN..GEORGIAN CAPITAL LETTER HOE 10C7 ; ID_Start # L& GEORGIAN CAPITAL LETTER YN 10CD ; ID_Start # L& GEORGIAN CAPITAL LETTER AEN 10D0..10FA ; ID_Start # L& [43] GEORGIAN LETTER AN..GEORGIAN LETTER AIN 10FC ; ID_Start # Lm MODIFIER LETTER GEORGIAN NAR 10FD..10FF ; ID_Start # L& [3] GEORGIAN LETTER AEN..GEORGIAN LETTER LABIAL SIGN 1100..1248 ; ID_Start # Lo [329] HANGUL CHOSEONG KIYEOK..ETHIOPIC SYLLABLE QWA 124A..124D ; ID_Start # Lo [4] ETHIOPIC SYLLABLE QWI..ETHIOPIC SYLLABLE QWE 1250..1256 ; ID_Start # Lo [7] ETHIOPIC SYLLABLE QHA..ETHIOPIC SYLLABLE QHO 1258 ; ID_Start # Lo ETHIOPIC SYLLABLE QHWA 125A..125D ; ID_Start # Lo [4] ETHIOPIC SYLLABLE QHWI..ETHIOPIC SYLLABLE QHWE 1260..1288 ; ID_Start # Lo [41] ETHIOPIC SYLLABLE BA..ETHIOPIC 
SYLLABLE XWA 128A..128D ; ID_Start # Lo [4] ETHIOPIC SYLLABLE XWI..ETHIOPIC SYLLABLE XWE 1290..12B0 ; ID_Start # Lo [33] ETHIOPIC SYLLABLE NA..ETHIOPIC SYLLABLE KWA 12B2..12B5 ; ID_Start # Lo [4] ETHIOPIC SYLLABLE KWI..ETHIOPIC SYLLABLE KWE 12B8..12BE ; ID_Start # Lo [7] ETHIOPIC SYLLABLE KXA..ETHIOPIC SYLLABLE KXO 12C0 ; ID_Start # Lo ETHIOPIC SYLLABLE KXWA 12C2..12C5 ; ID_Start # Lo [4] ETHIOPIC SYLLABLE KXWI..ETHIOPIC SYLLABLE KXWE 12C8..12D6 ; ID_Start # Lo [15] ETHIOPIC SYLLABLE WA..ETHIOPIC SYLLABLE PHARYNGEAL O 12D8..1310 ; ID_Start # Lo [57] ETHIOPIC SYLLABLE ZA..ETHIOPIC SYLLABLE GWA 1312..1315 ; ID_Start # Lo [4] ETHIOPIC SYLLABLE GWI..ETHIOPIC SYLLABLE GWE 1318..135A ; ID_Start # Lo [67] ETHIOPIC SYLLABLE GGA..ETHIOPIC SYLLABLE FYA 1380..138F ; ID_Start # Lo [16] ETHIOPIC SYLLABLE SEBATBEIT MWA..ETHIOPIC SYLLABLE PWE 13A0..13F5 ; ID_Start # L& [86] CHEROKEE LETTER A..CHEROKEE LETTER MV 13F8..13FD ; ID_Start # L& [6] CHEROKEE SMALL LETTER YE..CHEROKEE SMALL LETTER MV 1401..166C ; ID_Start # Lo [620] CANADIAN SYLLABICS E..CANADIAN SYLLABICS CARRIER TTSA 166F..167F ; ID_Start # Lo [17] CANADIAN SYLLABICS QAI..CANADIAN SYLLABICS BLACKFOOT W 1681..169A ; ID_Start # Lo [26] OGHAM LETTER BEITH..OGHAM LETTER PEITH 16A0..16EA ; ID_Start # Lo [75] RUNIC LETTER FEHU FEOH FE F..RUNIC LETTER X 16EE..16F0 ; ID_Start # Nl [3] RUNIC ARLAUG SYMBOL..RUNIC BELGTHOR SYMBOL 16F1..16F8 ; ID_Start # Lo [8] RUNIC LETTER K..RUNIC LETTER FRANKS CASKET AESC 1700..1711 ; ID_Start # Lo [18] TAGALOG LETTER A..TAGALOG LETTER HA 171F..1731 ; ID_Start # Lo [19] TAGALOG LETTER ARCHAIC RA..HANUNOO LETTER HA 1740..1751 ; ID_Start # Lo [18] BUHID LETTER A..BUHID LETTER HA 1760..176C ; ID_Start # Lo [13] TAGBANWA LETTER A..TAGBANWA LETTER YA 176E..1770 ; ID_Start # Lo [3] TAGBANWA LETTER LA..TAGBANWA LETTER SA 1780..17B3 ; ID_Start # Lo [52] KHMER LETTER KA..KHMER INDEPENDENT VOWEL QAU 17D7 ; ID_Start # Lm KHMER SIGN LEK TOO 17DC ; ID_Start # Lo KHMER SIGN AVAKRAHASANYA 1820..1842 ; ID_Start 
# Lo [35] MONGOLIAN LETTER A..MONGOLIAN LETTER CHI 1843 ; ID_Start # Lm MONGOLIAN LETTER TODO LONG VOWEL SIGN 1844..1878 ; ID_Start # Lo [53] MONGOLIAN LETTER TODO E..MONGOLIAN LETTER CHA WITH TWO DOTS 1880..1884 ; ID_Start # Lo [5] MONGOLIAN LETTER ALI GALI ANUSVARA ONE..MONGOLIAN LETTER ALI GALI INVERTED UBADAMA 1885..1886 ; ID_Start # Mn [2] MONGOLIAN LETTER ALI GALI BALUDA..MONGOLIAN LETTER ALI GALI THREE BALUDA 1887..18A8 ; ID_Start # Lo [34] MONGOLIAN LETTER ALI GALI A..MONGOLIAN LETTER MANCHU ALI GALI BHA 18AA ; ID_Start # Lo MONGOLIAN LETTER MANCHU ALI GALI LHA 18B0..18F5 ; ID_Start # Lo [70] CANADIAN SYLLABICS OY..CANADIAN SYLLABICS CARRIER DENTAL S 1900..191E ; ID_Start # Lo [31] LIMBU VOWEL-CARRIER LETTER..LIMBU LETTER TRA 1950..196D ; ID_Start # Lo [30] TAI LE LETTER KA..TAI LE LETTER AI 1970..1974 ; ID_Start # Lo [5] TAI LE LETTER TONE-2..TAI LE LETTER TONE-6 1980..19AB ; ID_Start # Lo [44] NEW TAI LUE LETTER HIGH QA..NEW TAI LUE LETTER LOW SUA 19B0..19C9 ; ID_Start # Lo [26] NEW TAI LUE VOWEL SIGN VOWEL SHORTENER..NEW TAI LUE TONE MARK-2 1A00..1A16 ; ID_Start # Lo [23] BUGINESE LETTER KA..BUGINESE LETTER HA 1A20..1A54 ; ID_Start # Lo [53] TAI THAM LETTER HIGH KA..TAI THAM LETTER GREAT SA 1AA7 ; ID_Start # Lm TAI THAM SIGN MAI YAMOK 1B05..1B33 ; ID_Start # Lo [47] BALINESE LETTER AKARA..BALINESE LETTER HA 1B45..1B4C ; ID_Start # Lo [8] BALINESE LETTER KAF SASAK..BALINESE LETTER ARCHAIC JNYA 1B83..1BA0 ; ID_Start # Lo [30] SUNDANESE LETTER A..SUNDANESE LETTER HA 1BAE..1BAF ; ID_Start # Lo [2] SUNDANESE LETTER KHA..SUNDANESE LETTER SYA 1BBA..1BE5 ; ID_Start # Lo [44] SUNDANESE AVAGRAHA..BATAK LETTER U 1C00..1C23 ; ID_Start # Lo [36] LEPCHA LETTER KA..LEPCHA LETTER A 1C4D..1C4F ; ID_Start # Lo [3] LEPCHA LETTER TTA..LEPCHA LETTER DDA 1C5A..1C77 ; ID_Start # Lo [30] OL CHIKI LETTER LA..OL CHIKI LETTER OH 1C78..1C7D ; ID_Start # Lm [6] OL CHIKI MU TTUDDAG..OL CHIKI AHAD 1C80..1C8A ; ID_Start # L& [11] CYRILLIC SMALL LETTER ROUNDED VE..CYRILLIC SMALL LETTER 
TJE 1C90..1CBA ; ID_Start # L& [43] GEORGIAN MTAVRULI CAPITAL LETTER AN..GEORGIAN MTAVRULI CAPITAL LETTER AIN 1CBD..1CBF ; ID_Start # L& [3] GEORGIAN MTAVRULI CAPITAL LETTER AEN..GEORGIAN MTAVRULI CAPITAL LETTER LABIAL SIGN 1CE9..1CEC ; ID_Start # Lo [4] VEDIC SIGN ANUSVARA ANTARGOMUKHA..VEDIC SIGN ANUSVARA VAMAGOMUKHA WITH TAIL 1CEE..1CF3 ; ID_Start # Lo [6] VEDIC SIGN HEXIFORM LONG ANUSVARA..VEDIC SIGN ROTATED ARDHAVISARGA 1CF5..1CF6 ; ID_Start # Lo [2] VEDIC SIGN JIHVAMULIYA..VEDIC SIGN UPADHMANIYA 1CFA ; ID_Start # Lo VEDIC SIGN DOUBLE ANUSVARA ANTARGOMUKHA 1D00..1D2B ; ID_Start # L& [44] LATIN LETTER SMALL CAPITAL A..CYRILLIC LETTER SMALL CAPITAL EL 1D2C..1D6A ; ID_Start # Lm [63] MODIFIER LETTER CAPITAL A..GREEK SUBSCRIPT SMALL LETTER CHI 1D6B..1D77 ; ID_Start # L& [13] LATIN SMALL LETTER UE..LATIN SMALL LETTER TURNED G 1D78 ; ID_Start # Lm MODIFIER LETTER CYRILLIC EN 1D79..1D9A ; ID_Start # L& [34] LATIN SMALL LETTER INSULAR G..LATIN SMALL LETTER EZH WITH RETROFLEX HOOK 1D9B..1DBF ; ID_Start # Lm [37] MODIFIER LETTER SMALL TURNED ALPHA..MODIFIER LETTER SMALL THETA 1E00..1F15 ; ID_Start # L& [278] LATIN CAPITAL LETTER A WITH RING BELOW..GREEK SMALL LETTER EPSILON WITH DASIA AND OXIA 1F18..1F1D ; ID_Start # L& [6] GREEK CAPITAL LETTER EPSILON WITH PSILI..GREEK CAPITAL LETTER EPSILON WITH DASIA AND OXIA 1F20..1F45 ; ID_Start # L& [38] GREEK SMALL LETTER ETA WITH PSILI..GREEK SMALL LETTER OMICRON WITH DASIA AND OXIA 1F48..1F4D ; ID_Start # L& [6] GREEK CAPITAL LETTER OMICRON WITH PSILI..GREEK CAPITAL LETTER OMICRON WITH DASIA AND OXIA 1F50..1F57 ; ID_Start # L& [8] GREEK SMALL LETTER UPSILON WITH PSILI..GREEK SMALL LETTER UPSILON WITH DASIA AND PERISPOMENI 1F59 ; ID_Start # L& GREEK CAPITAL LETTER UPSILON WITH DASIA 1F5B ; ID_Start # L& GREEK CAPITAL LETTER UPSILON WITH DASIA AND VARIA 1F5D ; ID_Start # L& GREEK CAPITAL LETTER UPSILON WITH DASIA AND OXIA 1F5F..1F7D ; ID_Start # L& [31] GREEK CAPITAL LETTER UPSILON WITH DASIA AND PERISPOMENI..GREEK SMALL LETTER 
OMEGA WITH OXIA 1F80..1FB4 ; ID_Start # L& [53] GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI..GREEK SMALL LETTER ALPHA WITH OXIA AND YPOGEGRAMMENI 1FB6..1FBC ; ID_Start # L& [7] GREEK SMALL LETTER ALPHA WITH PERISPOMENI..GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI 1FBE ; ID_Start # L& GREEK PROSGEGRAMMENI 1FC2..1FC4 ; ID_Start # L& [3] GREEK SMALL LETTER ETA WITH VARIA AND YPOGEGRAMMENI..GREEK SMALL LETTER ETA WITH OXIA AND YPOGEGRAMMENI 1FC6..1FCC ; ID_Start # L& [7] GREEK SMALL LETTER ETA WITH PERISPOMENI..GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI 1FD0..1FD3 ; ID_Start # L& [4] GREEK SMALL LETTER IOTA WITH VRACHY..GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA 1FD6..1FDB ; ID_Start # L& [6] GREEK SMALL LETTER IOTA WITH PERISPOMENI..GREEK CAPITAL LETTER IOTA WITH OXIA 1FE0..1FEC ; ID_Start # L& [13] GREEK SMALL LETTER UPSILON WITH VRACHY..GREEK CAPITAL LETTER RHO WITH DASIA 1FF2..1FF4 ; ID_Start # L& [3] GREEK SMALL LETTER OMEGA WITH VARIA AND YPOGEGRAMMENI..GREEK SMALL LETTER OMEGA WITH OXIA AND YPOGEGRAMMENI 1FF6..1FFC ; ID_Start # L& [7] GREEK SMALL LETTER OMEGA WITH PERISPOMENI..GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI 2071 ; ID_Start # Lm SUPERSCRIPT LATIN SMALL LETTER I 207F ; ID_Start # Lm SUPERSCRIPT LATIN SMALL LETTER N 2090..209C ; ID_Start # Lm [13] LATIN SUBSCRIPT SMALL LETTER A..LATIN SUBSCRIPT SMALL LETTER T 2102 ; ID_Start # L& DOUBLE-STRUCK CAPITAL C 2107 ; ID_Start # L& EULER CONSTANT 210A..2113 ; ID_Start # L& [10] SCRIPT SMALL G..SCRIPT SMALL L 2115 ; ID_Start # L& DOUBLE-STRUCK CAPITAL N 2118 ; ID_Start # Sm SCRIPT CAPITAL P 2119..211D ; ID_Start # L& [5] DOUBLE-STRUCK CAPITAL P..DOUBLE-STRUCK CAPITAL R 2124 ; ID_Start # L& DOUBLE-STRUCK CAPITAL Z 2126 ; ID_Start # L& OHM SIGN 2128 ; ID_Start # L& BLACK-LETTER CAPITAL Z 212A..212D ; ID_Start # L& [4] KELVIN SIGN..BLACK-LETTER CAPITAL C 212E ; ID_Start # So ESTIMATED SYMBOL 212F..2134 ; ID_Start # L& [6] SCRIPT SMALL E..SCRIPT SMALL O 2135..2138 ; ID_Start # Lo [4] ALEF 
SYMBOL..DALET SYMBOL 2139 ; ID_Start # L& INFORMATION SOURCE 213C..213F ; ID_Start # L& [4] DOUBLE-STRUCK SMALL PI..DOUBLE-STRUCK CAPITAL PI 2145..2149 ; ID_Start # L& [5] DOUBLE-STRUCK ITALIC CAPITAL D..DOUBLE-STRUCK ITALIC SMALL J 214E ; ID_Start # L& TURNED SMALL F 2160..2182 ; ID_Start # Nl [35] ROMAN NUMERAL ONE..ROMAN NUMERAL TEN THOUSAND 2183..2184 ; ID_Start # L& [2] ROMAN NUMERAL REVERSED ONE HUNDRED..LATIN SMALL LETTER REVERSED C 2185..2188 ; ID_Start # Nl [4] ROMAN NUMERAL SIX LATE FORM..ROMAN NUMERAL ONE HUNDRED THOUSAND 2C00..2C7B ; ID_Start # L& [124] GLAGOLITIC CAPITAL LETTER AZU..LATIN LETTER SMALL CAPITAL TURNED E 2C7C..2C7D ; ID_Start # Lm [2] LATIN SUBSCRIPT SMALL LETTER J..MODIFIER LETTER CAPITAL V 2C7E..2CE4 ; ID_Start # L& [103] LATIN CAPITAL LETTER S WITH SWASH TAIL..COPTIC SYMBOL KAI 2CEB..2CEE ; ID_Start # L& [4] COPTIC CAPITAL LETTER CRYPTOGRAMMIC SHEI..COPTIC SMALL LETTER CRYPTOGRAMMIC GANGIA 2CF2..2CF3 ; ID_Start # L& [2] COPTIC CAPITAL LETTER BOHAIRIC KHEI..COPTIC SMALL LETTER BOHAIRIC KHEI 2D00..2D25 ; ID_Start # L& [38] GEORGIAN SMALL LETTER AN..GEORGIAN SMALL LETTER HOE 2D27 ; ID_Start # L& GEORGIAN SMALL LETTER YN 2D2D ; ID_Start # L& GEORGIAN SMALL LETTER AEN 2D30..2D67 ; ID_Start # Lo [56] TIFINAGH LETTER YA..TIFINAGH LETTER YO 2D6F ; ID_Start # Lm TIFINAGH MODIFIER LETTER LABIALIZATION MARK 2D80..2D96 ; ID_Start # Lo [23] ETHIOPIC SYLLABLE LOA..ETHIOPIC SYLLABLE GGWE 2DA0..2DA6 ; ID_Start # Lo [7] ETHIOPIC SYLLABLE SSA..ETHIOPIC SYLLABLE SSO 2DA8..2DAE ; ID_Start # Lo [7] ETHIOPIC SYLLABLE CCA..ETHIOPIC SYLLABLE CCO 2DB0..2DB6 ; ID_Start # Lo [7] ETHIOPIC SYLLABLE ZZA..ETHIOPIC SYLLABLE ZZO 2DB8..2DBE ; ID_Start # Lo [7] ETHIOPIC SYLLABLE CCHA..ETHIOPIC SYLLABLE CCHO 2DC0..2DC6 ; ID_Start # Lo [7] ETHIOPIC SYLLABLE QYA..ETHIOPIC SYLLABLE QYO 2DC8..2DCE ; ID_Start # Lo [7] ETHIOPIC SYLLABLE KYA..ETHIOPIC SYLLABLE KYO 2DD0..2DD6 ; ID_Start # Lo [7] ETHIOPIC SYLLABLE XYA..ETHIOPIC SYLLABLE XYO 2DD8..2DDE ; ID_Start # Lo [7] ETHIOPIC 
SYLLABLE GYA..ETHIOPIC SYLLABLE GYO 3005 ; ID_Start # Lm IDEOGRAPHIC ITERATION MARK 3006 ; ID_Start # Lo IDEOGRAPHIC CLOSING MARK 3007 ; ID_Start # Nl IDEOGRAPHIC NUMBER ZERO 3021..3029 ; ID_Start # Nl [9] HANGZHOU NUMERAL ONE..HANGZHOU NUMERAL NINE 3031..3035 ; ID_Start # Lm [5] VERTICAL KANA REPEAT MARK..VERTICAL KANA REPEAT MARK LOWER HALF 3038..303A ; ID_Start # Nl [3] HANGZHOU NUMERAL TEN..HANGZHOU NUMERAL THIRTY 303B ; ID_Start # Lm VERTICAL IDEOGRAPHIC ITERATION MARK 303C ; ID_Start # Lo MASU MARK 3041..3096 ; ID_Start # Lo [86] HIRAGANA LETTER SMALL A..HIRAGANA LETTER SMALL KE 309B..309C ; ID_Start # Sk [2] KATAKANA-HIRAGANA VOICED SOUND MARK..KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK 309D..309E ; ID_Start # Lm [2] HIRAGANA ITERATION MARK..HIRAGANA VOICED ITERATION MARK 309F ; ID_Start # Lo HIRAGANA DIGRAPH YORI 30A1..30FA ; ID_Start # Lo [90] KATAKANA LETTER SMALL A..KATAKANA LETTER VO 30FC..30FE ; ID_Start # Lm [3] KATAKANA-HIRAGANA PROLONGED SOUND MARK..KATAKANA VOICED ITERATION MARK 30FF ; ID_Start # Lo KATAKANA DIGRAPH KOTO 3105..312F ; ID_Start # Lo [43] BOPOMOFO LETTER B..BOPOMOFO LETTER NN 3131..318E ; ID_Start # Lo [94] HANGUL LETTER KIYEOK..HANGUL LETTER ARAEAE 31A0..31BF ; ID_Start # Lo [32] BOPOMOFO LETTER BU..BOPOMOFO LETTER AH 31F0..31FF ; ID_Start # Lo [16] KATAKANA LETTER SMALL KU..KATAKANA LETTER SMALL RO 3400..4DBF ; ID_Start # Lo [6592] CJK UNIFIED IDEOGRAPH-3400..CJK UNIFIED IDEOGRAPH-4DBF 4E00..A014 ; ID_Start # Lo [21013] CJK UNIFIED IDEOGRAPH-4E00..YI SYLLABLE E A015 ; ID_Start # Lm YI SYLLABLE WU A016..A48C ; ID_Start # Lo [1143] YI SYLLABLE BIT..YI SYLLABLE YYR A4D0..A4F7 ; ID_Start # Lo [40] LISU LETTER BA..LISU LETTER OE A4F8..A4FD ; ID_Start # Lm [6] LISU LETTER TONE MYA TI..LISU LETTER TONE MYA JEU A500..A60B ; ID_Start # Lo [268] VAI SYLLABLE EE..VAI SYLLABLE NG A60C ; ID_Start # Lm VAI SYLLABLE LENGTHENER A610..A61F ; ID_Start # Lo [16] VAI SYLLABLE NDOLE FA..VAI SYMBOL JONG A62A..A62B ; ID_Start # Lo [2] VAI SYLLABLE NDOLE 
MA..VAI SYLLABLE NDOLE DO A640..A66D ; ID_Start # L& [46] CYRILLIC CAPITAL LETTER ZEMLYA..CYRILLIC SMALL LETTER DOUBLE MONOCULAR O A66E ; ID_Start # Lo CYRILLIC LETTER MULTIOCULAR O A67F ; ID_Start # Lm CYRILLIC PAYEROK A680..A69B ; ID_Start # L& [28] CYRILLIC CAPITAL LETTER DWE..CYRILLIC SMALL LETTER CROSSED O A69C..A69D ; ID_Start # Lm [2] MODIFIER LETTER CYRILLIC HARD SIGN..MODIFIER LETTER CYRILLIC SOFT SIGN A6A0..A6E5 ; ID_Start # Lo [70] BAMUM LETTER A..BAMUM LETTER KI A6E6..A6EF ; ID_Start # Nl [10] BAMUM LETTER MO..BAMUM LETTER KOGHOM A717..A71F ; ID_Start # Lm [9] MODIFIER LETTER DOT VERTICAL BAR..MODIFIER LETTER LOW INVERTED EXCLAMATION MARK A722..A76F ; ID_Start # L& [78] LATIN CAPITAL LETTER EGYPTOLOGICAL ALEF..LATIN SMALL LETTER CON A770 ; ID_Start # Lm MODIFIER LETTER US A771..A787 ; ID_Start # L& [23] LATIN SMALL LETTER DUM..LATIN SMALL LETTER INSULAR T A788 ; ID_Start # Lm MODIFIER LETTER LOW CIRCUMFLEX ACCENT A78B..A78E ; ID_Start # L& [4] LATIN CAPITAL LETTER SALTILLO..LATIN SMALL LETTER L WITH RETROFLEX HOOK AND BELT A78F ; ID_Start # Lo LATIN LETTER SINOLOGICAL DOT A790..A7DC ; ID_Start # L& [77] LATIN CAPITAL LETTER N WITH DESCENDER..LATIN CAPITAL LETTER LAMBDA WITH STROKE A7F1..A7F4 ; ID_Start # Lm [4] MODIFIER LETTER CAPITAL S..MODIFIER LETTER CAPITAL Q A7F5..A7F6 ; ID_Start # L& [2] LATIN CAPITAL LETTER REVERSED HALF H..LATIN SMALL LETTER REVERSED HALF H A7F7 ; ID_Start # Lo LATIN EPIGRAPHIC LETTER SIDEWAYS I A7F8..A7F9 ; ID_Start # Lm [2] MODIFIER LETTER CAPITAL H WITH STROKE..MODIFIER LETTER SMALL LIGATURE OE A7FA ; ID_Start # L& LATIN LETTER SMALL CAPITAL TURNED M A7FB..A801 ; ID_Start # Lo [7] LATIN EPIGRAPHIC LETTER REVERSED F..SYLOTI NAGRI LETTER I A803..A805 ; ID_Start # Lo [3] SYLOTI NAGRI LETTER U..SYLOTI NAGRI LETTER O A807..A80A ; ID_Start # Lo [4] SYLOTI NAGRI LETTER KO..SYLOTI NAGRI LETTER GHO A80C..A822 ; ID_Start # Lo [23] SYLOTI NAGRI LETTER CO..SYLOTI NAGRI LETTER HO A840..A873 ; ID_Start # Lo [52] PHAGS-PA LETTER 
KA..PHAGS-PA LETTER CANDRABINDU A882..A8B3 ; ID_Start # Lo [50] SAURASHTRA LETTER A..SAURASHTRA LETTER LLA A8F2..A8F7 ; ID_Start # Lo [6] DEVANAGARI SIGN SPACING CANDRABINDU..DEVANAGARI SIGN CANDRABINDU AVAGRAHA A8FB ; ID_Start # Lo DEVANAGARI HEADSTROKE A8FD..A8FE ; ID_Start # Lo [2] DEVANAGARI JAIN OM..DEVANAGARI LETTER AY A90A..A925 ; ID_Start # Lo [28] KAYAH LI LETTER KA..KAYAH LI LETTER OO A930..A946 ; ID_Start # Lo [23] REJANG LETTER KA..REJANG LETTER A A960..A97C ; ID_Start # Lo [29] HANGUL CHOSEONG TIKEUT-MIEUM..HANGUL CHOSEONG SSANGYEORINHIEUH A984..A9B2 ; ID_Start # Lo [47] JAVANESE LETTER A..JAVANESE LETTER HA A9CF ; ID_Start # Lm JAVANESE PANGRANGKEP A9E0..A9E4 ; ID_Start # Lo [5] MYANMAR LETTER SHAN GHA..MYANMAR LETTER SHAN BHA A9E6 ; ID_Start # Lm MYANMAR MODIFIER LETTER SHAN REDUPLICATION A9E7..A9EF ; ID_Start # Lo [9] MYANMAR LETTER TAI LAING NYA..MYANMAR LETTER TAI LAING NNA A9FA..A9FE ; ID_Start # Lo [5] MYANMAR LETTER TAI LAING LLA..MYANMAR LETTER TAI LAING BHA AA00..AA28 ; ID_Start # Lo [41] CHAM LETTER A..CHAM LETTER HA AA40..AA42 ; ID_Start # Lo [3] CHAM LETTER FINAL K..CHAM LETTER FINAL NG AA44..AA4B ; ID_Start # Lo [8] CHAM LETTER FINAL CH..CHAM LETTER FINAL SS AA60..AA6F ; ID_Start # Lo [16] MYANMAR LETTER KHAMTI GA..MYANMAR LETTER KHAMTI FA AA70 ; ID_Start # Lm MYANMAR MODIFIER LETTER KHAMTI REDUPLICATION AA71..AA76 ; ID_Start # Lo [6] MYANMAR LETTER KHAMTI XA..MYANMAR LOGOGRAM KHAMTI HM AA7A ; ID_Start # Lo MYANMAR LETTER AITON RA AA7E..AAAF ; ID_Start # Lo [50] MYANMAR LETTER SHWE PALAUNG CHA..TAI VIET LETTER HIGH O AAB1 ; ID_Start # Lo TAI VIET VOWEL AA AAB5..AAB6 ; ID_Start # Lo [2] TAI VIET VOWEL E..TAI VIET VOWEL O AAB9..AABD ; ID_Start # Lo [5] TAI VIET VOWEL UEA..TAI VIET VOWEL AN AAC0 ; ID_Start # Lo TAI VIET TONE MAI NUENG AAC2 ; ID_Start # Lo TAI VIET TONE MAI SONG AADB..AADC ; ID_Start # Lo [2] TAI VIET SYMBOL KON..TAI VIET SYMBOL NUENG AADD ; ID_Start # Lm TAI VIET SYMBOL SAM AAE0..AAEA ; ID_Start # Lo [11] MEETEI MAYEK LETTER 
E..MEETEI MAYEK LETTER SSA AAF2 ; ID_Start # Lo MEETEI MAYEK ANJI AAF3..AAF4 ; ID_Start # Lm [2] MEETEI MAYEK SYLLABLE REPETITION MARK..MEETEI MAYEK WORD REPETITION MARK AB01..AB06 ; ID_Start # Lo [6] ETHIOPIC SYLLABLE TTHU..ETHIOPIC SYLLABLE TTHO AB09..AB0E ; ID_Start # Lo [6] ETHIOPIC SYLLABLE DDHU..ETHIOPIC SYLLABLE DDHO AB11..AB16 ; ID_Start # Lo [6] ETHIOPIC SYLLABLE DZU..ETHIOPIC SYLLABLE DZO AB20..AB26 ; ID_Start # Lo [7] ETHIOPIC SYLLABLE CCHHA..ETHIOPIC SYLLABLE CCHHO AB28..AB2E ; ID_Start # Lo [7] ETHIOPIC SYLLABLE BBA..ETHIOPIC SYLLABLE BBO AB30..AB5A ; ID_Start # L& [43] LATIN SMALL LETTER BARRED ALPHA..LATIN SMALL LETTER Y WITH SHORT RIGHT LEG AB5C..AB5F ; ID_Start # Lm [4] MODIFIER LETTER SMALL HENG..MODIFIER LETTER SMALL U WITH LEFT HOOK AB60..AB68 ; ID_Start # L& [9] LATIN SMALL LETTER SAKHA YAT..LATIN SMALL LETTER TURNED R WITH MIDDLE TILDE AB69 ; ID_Start # Lm MODIFIER LETTER SMALL TURNED W AB70..ABBF ; ID_Start # L& [80] CHEROKEE SMALL LETTER A..CHEROKEE SMALL LETTER YA ABC0..ABE2 ; ID_Start # Lo [35] MEETEI MAYEK LETTER KOK..MEETEI MAYEK LETTER I LONSUM AC00..D7A3 ; ID_Start # Lo [11172] HANGUL SYLLABLE GA..HANGUL SYLLABLE HIH D7B0..D7C6 ; ID_Start # Lo [23] HANGUL JUNGSEONG O-YEO..HANGUL JUNGSEONG ARAEA-E D7CB..D7FB ; ID_Start # Lo [49] HANGUL JONGSEONG NIEUN-RIEUL..HANGUL JONGSEONG PHIEUPH-THIEUTH F900..FA6D ; ID_Start # Lo [366] CJK COMPATIBILITY IDEOGRAPH-F900..CJK COMPATIBILITY IDEOGRAPH-FA6D FA70..FAD9 ; ID_Start # Lo [106] CJK COMPATIBILITY IDEOGRAPH-FA70..CJK COMPATIBILITY IDEOGRAPH-FAD9 FB00..FB06 ; ID_Start # L& [7] LATIN SMALL LIGATURE FF..LATIN SMALL LIGATURE ST FB13..FB17 ; ID_Start # L& [5] ARMENIAN SMALL LIGATURE MEN NOW..ARMENIAN SMALL LIGATURE MEN XEH FB1D ; ID_Start # Lo HEBREW LETTER YOD WITH HIRIQ FB1F..FB28 ; ID_Start # Lo [10] HEBREW LIGATURE YIDDISH YOD YOD PATAH..HEBREW LETTER WIDE TAV FB2A..FB36 ; ID_Start # Lo [13] HEBREW LETTER SHIN WITH SHIN DOT..HEBREW LETTER ZAYIN WITH DAGESH FB38..FB3C ; ID_Start # Lo [5] HEBREW 
LETTER TET WITH DAGESH..HEBREW LETTER LAMED WITH DAGESH FB3E ; ID_Start # Lo HEBREW LETTER MEM WITH DAGESH FB40..FB41 ; ID_Start # Lo [2] HEBREW LETTER NUN WITH DAGESH..HEBREW LETTER SAMEKH WITH DAGESH FB43..FB44 ; ID_Start # Lo [2] HEBREW LETTER FINAL PE WITH DAGESH..HEBREW LETTER PE WITH DAGESH FB46..FBB1 ; ID_Start # Lo [108] HEBREW LETTER TSADI WITH DAGESH..ARABIC LETTER YEH BARREE WITH HAMZA ABOVE FINAL FORM FBD3..FD3D ; ID_Start # Lo [363] ARABIC LETTER NG ISOLATED FORM..ARABIC LIGATURE ALEF WITH FATHATAN ISOLATED FORM FD50..FD8F ; ID_Start # Lo [64] ARABIC LIGATURE TEH WITH JEEM WITH MEEM INITIAL FORM..ARABIC LIGATURE MEEM WITH KHAH WITH MEEM INITIAL FORM FD92..FDC7 ; ID_Start # Lo [54] ARABIC LIGATURE MEEM WITH JEEM WITH KHAH INITIAL FORM..ARABIC LIGATURE NOON WITH JEEM WITH YEH FINAL FORM FDF0..FDFB ; ID_Start # Lo [12] ARABIC LIGATURE SALLA USED AS KORANIC STOP SIGN ISOLATED FORM..ARABIC LIGATURE JALLAJALALOUHOU FE70..FE74 ; ID_Start # Lo [5] ARABIC FATHATAN ISOLATED FORM..ARABIC KASRATAN ISOLATED FORM FE76..FEFC ; ID_Start # Lo [135] ARABIC FATHA ISOLATED FORM..ARABIC LIGATURE LAM WITH ALEF FINAL FORM FF21..FF3A ; ID_Start # L& [26] FULLWIDTH LATIN CAPITAL LETTER A..FULLWIDTH LATIN CAPITAL LETTER Z FF41..FF5A ; ID_Start # L& [26] FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH LATIN SMALL LETTER Z FF66..FF6F ; ID_Start # Lo [10] HALFWIDTH KATAKANA LETTER WO..HALFWIDTH KATAKANA LETTER SMALL TU FF70 ; ID_Start # Lm HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK FF71..FF9D ; ID_Start # Lo [45] HALFWIDTH KATAKANA LETTER A..HALFWIDTH KATAKANA LETTER N FF9E..FF9F ; ID_Start # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK FFA0..FFBE ; ID_Start # Lo [31] HALFWIDTH HANGUL FILLER..HALFWIDTH HANGUL LETTER HIEUH FFC2..FFC7 ; ID_Start # Lo [6] HALFWIDTH HANGUL LETTER A..HALFWIDTH HANGUL LETTER E FFCA..FFCF ; ID_Start # Lo [6] HALFWIDTH HANGUL LETTER YEO..HALFWIDTH HANGUL LETTER OE FFD2..FFD7 ; ID_Start # Lo [6] HALFWIDTH HANGUL 
LETTER YO..HALFWIDTH HANGUL LETTER YU FFDA..FFDC ; ID_Start # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL LETTER I 10000..1000B ; ID_Start # Lo [12] LINEAR B SYLLABLE B008 A..LINEAR B SYLLABLE B046 JE 1000D..10026 ; ID_Start # Lo [26] LINEAR B SYLLABLE B036 JO..LINEAR B SYLLABLE B032 QO 10028..1003A ; ID_Start # Lo [19] LINEAR B SYLLABLE B060 RA..LINEAR B SYLLABLE B042 WO 1003C..1003D ; ID_Start # Lo [2] LINEAR B SYLLABLE B017 ZA..LINEAR B SYLLABLE B074 ZE 1003F..1004D ; ID_Start # Lo [15] LINEAR B SYLLABLE B020 ZO..LINEAR B SYLLABLE B091 TWO 10050..1005D ; ID_Start # Lo [14] LINEAR B SYMBOL B018..LINEAR B SYMBOL B089 10080..100FA ; ID_Start # Lo [123] LINEAR B IDEOGRAM B100 MAN..LINEAR B IDEOGRAM VESSEL B305 10140..10174 ; ID_Start # Nl [53] GREEK ACROPHONIC ATTIC ONE QUARTER..GREEK ACROPHONIC STRATIAN FIFTY MNAS 10280..1029C ; ID_Start # Lo [29] LYCIAN LETTER A..LYCIAN LETTER X 102A0..102D0 ; ID_Start # Lo [49] CARIAN LETTER A..CARIAN LETTER UUU3 10300..1031F ; ID_Start # Lo [32] OLD ITALIC LETTER A..OLD ITALIC LETTER ESS 1032D..10340 ; ID_Start # Lo [20] OLD ITALIC LETTER YE..GOTHIC LETTER PAIRTHRA 10341 ; ID_Start # Nl GOTHIC LETTER NINETY 10342..10349 ; ID_Start # Lo [8] GOTHIC LETTER RAIDA..GOTHIC LETTER OTHAL 1034A ; ID_Start # Nl GOTHIC LETTER NINE HUNDRED 10350..10375 ; ID_Start # Lo [38] OLD PERMIC LETTER AN..OLD PERMIC LETTER IA 10380..1039D ; ID_Start # Lo [30] UGARITIC LETTER ALPA..UGARITIC LETTER SSU 103A0..103C3 ; ID_Start # Lo [36] OLD PERSIAN SIGN A..OLD PERSIAN SIGN HA 103C8..103CF ; ID_Start # Lo [8] OLD PERSIAN SIGN AURAMAZDAA..OLD PERSIAN SIGN BUUMISH 103D1..103D5 ; ID_Start # Nl [5] OLD PERSIAN NUMBER ONE..OLD PERSIAN NUMBER HUNDRED 10400..1044F ; ID_Start # L& [80] DESERET CAPITAL LETTER LONG I..DESERET SMALL LETTER EW 10450..1049D ; ID_Start # Lo [78] SHAVIAN LETTER PEEP..OSMANYA LETTER OO 104B0..104D3 ; ID_Start # L& [36] OSAGE CAPITAL LETTER A..OSAGE CAPITAL LETTER ZHA 104D8..104FB ; ID_Start # L& [36] OSAGE SMALL LETTER A..OSAGE 
SMALL LETTER ZHA 10500..10527 ; ID_Start # Lo [40] ELBASAN LETTER A..ELBASAN LETTER KHE 10530..10563 ; ID_Start # Lo [52] CAUCASIAN ALBANIAN LETTER ALT..CAUCASIAN ALBANIAN LETTER KIW 10570..1057A ; ID_Start # L& [11] VITHKUQI CAPITAL LETTER A..VITHKUQI CAPITAL LETTER GA 1057C..1058A ; ID_Start # L& [15] VITHKUQI CAPITAL LETTER HA..VITHKUQI CAPITAL LETTER RE 1058C..10592 ; ID_Start # L& [7] VITHKUQI CAPITAL LETTER SE..VITHKUQI CAPITAL LETTER XE 10594..10595 ; ID_Start # L& [2] VITHKUQI CAPITAL LETTER Y..VITHKUQI CAPITAL LETTER ZE 10597..105A1 ; ID_Start # L& [11] VITHKUQI SMALL LETTER A..VITHKUQI SMALL LETTER GA 105A3..105B1 ; ID_Start # L& [15] VITHKUQI SMALL LETTER HA..VITHKUQI SMALL LETTER RE 105B3..105B9 ; ID_Start # L& [7] VITHKUQI SMALL LETTER SE..VITHKUQI SMALL LETTER XE 105BB..105BC ; ID_Start # L& [2] VITHKUQI SMALL LETTER Y..VITHKUQI SMALL LETTER ZE 105C0..105F3 ; ID_Start # Lo [52] TODHRI LETTER A..TODHRI LETTER OO 10600..10736 ; ID_Start # Lo [311] LINEAR A SIGN AB001..LINEAR A SIGN A664 10740..10755 ; ID_Start # Lo [22] LINEAR A SIGN A701 A..LINEAR A SIGN A732 JE 10760..10767 ; ID_Start # Lo [8] LINEAR A SIGN A800..LINEAR A SIGN A807 10780..10785 ; ID_Start # Lm [6] MODIFIER LETTER SMALL CAPITAL AA..MODIFIER LETTER SMALL B WITH HOOK 10787..107B0 ; ID_Start # Lm [42] MODIFIER LETTER SMALL DZ DIGRAPH..MODIFIER LETTER SMALL V WITH RIGHT HOOK 107B2..107BA ; ID_Start # Lm [9] MODIFIER LETTER SMALL CAPITAL Y..MODIFIER LETTER SMALL S WITH CURL 10800..10805 ; ID_Start # Lo [6] CYPRIOT SYLLABLE A..CYPRIOT SYLLABLE JA 10808 ; ID_Start # Lo CYPRIOT SYLLABLE JO 1080A..10835 ; ID_Start # Lo [44] CYPRIOT SYLLABLE KA..CYPRIOT SYLLABLE WO 10837..10838 ; ID_Start # Lo [2] CYPRIOT SYLLABLE XA..CYPRIOT SYLLABLE XE 1083C ; ID_Start # Lo CYPRIOT SYLLABLE ZA 1083F..10855 ; ID_Start # Lo [23] CYPRIOT SYLLABLE ZO..IMPERIAL ARAMAIC LETTER TAW 10860..10876 ; ID_Start # Lo [23] PALMYRENE LETTER ALEPH..PALMYRENE LETTER TAW 10880..1089E ; ID_Start # Lo [31] NABATAEAN LETTER FINAL 
ALEPH..NABATAEAN LETTER TAW 108E0..108F2 ; ID_Start # Lo [19] HATRAN LETTER ALEPH..HATRAN LETTER QOPH 108F4..108F5 ; ID_Start # Lo [2] HATRAN LETTER SHIN..HATRAN LETTER TAW 10900..10915 ; ID_Start # Lo [22] PHOENICIAN LETTER ALF..PHOENICIAN LETTER TAU 10920..10939 ; ID_Start # Lo [26] LYDIAN LETTER A..LYDIAN LETTER C 10940..10959 ; ID_Start # Lo [26] SIDETIC LETTER N01..SIDETIC LETTER N26 10980..109B7 ; ID_Start # Lo [56] MEROITIC HIEROGLYPHIC LETTER A..MEROITIC CURSIVE LETTER DA 109BE..109BF ; ID_Start # Lo [2] MEROITIC CURSIVE LOGOGRAM RMT..MEROITIC CURSIVE LOGOGRAM IMN 10A00 ; ID_Start # Lo KHAROSHTHI LETTER A 10A10..10A13 ; ID_Start # Lo [4] KHAROSHTHI LETTER KA..KHAROSHTHI LETTER GHA 10A15..10A17 ; ID_Start # Lo [3] KHAROSHTHI LETTER CA..KHAROSHTHI LETTER JA 10A19..10A35 ; ID_Start # Lo [29] KHAROSHTHI LETTER NYA..KHAROSHTHI LETTER VHA 10A60..10A7C ; ID_Start # Lo [29] OLD SOUTH ARABIAN LETTER HE..OLD SOUTH ARABIAN LETTER THETH 10A80..10A9C ; ID_Start # Lo [29] OLD NORTH ARABIAN LETTER HEH..OLD NORTH ARABIAN LETTER ZAH 10AC0..10AC7 ; ID_Start # Lo [8] MANICHAEAN LETTER ALEPH..MANICHAEAN LETTER WAW 10AC9..10AE4 ; ID_Start # Lo [28] MANICHAEAN LETTER ZAYIN..MANICHAEAN LETTER TAW 10B00..10B35 ; ID_Start # Lo [54] AVESTAN LETTER A..AVESTAN LETTER HE 10B40..10B55 ; ID_Start # Lo [22] INSCRIPTIONAL PARTHIAN LETTER ALEPH..INSCRIPTIONAL PARTHIAN LETTER TAW 10B60..10B72 ; ID_Start # Lo [19] INSCRIPTIONAL PAHLAVI LETTER ALEPH..INSCRIPTIONAL PAHLAVI LETTER TAW 10B80..10B91 ; ID_Start # Lo [18] PSALTER PAHLAVI LETTER ALEPH..PSALTER PAHLAVI LETTER TAW 10C00..10C48 ; ID_Start # Lo [73] OLD TURKIC LETTER ORKHON A..OLD TURKIC LETTER ORKHON BASH 10C80..10CB2 ; ID_Start # L& [51] OLD HUNGARIAN CAPITAL LETTER A..OLD HUNGARIAN CAPITAL LETTER US 10CC0..10CF2 ; ID_Start # L& [51] OLD HUNGARIAN SMALL LETTER A..OLD HUNGARIAN SMALL LETTER US 10D00..10D23 ; ID_Start # Lo [36] HANIFI ROHINGYA LETTER A..HANIFI ROHINGYA MARK NA KHONNA 10D4A..10D4D ; ID_Start # Lo [4] GARAY VOWEL SIGN 
A..GARAY VOWEL SIGN EE 10D4E ; ID_Start # Lm GARAY VOWEL LENGTH MARK 10D4F ; ID_Start # Lo GARAY SUKUN 10D50..10D65 ; ID_Start # L& [22] GARAY CAPITAL LETTER A..GARAY CAPITAL LETTER OLD NA 10D6F ; ID_Start # Lm GARAY REDUPLICATION MARK 10D70..10D85 ; ID_Start # L& [22] GARAY SMALL LETTER A..GARAY SMALL LETTER OLD NA 10E80..10EA9 ; ID_Start # Lo [42] YEZIDI LETTER ELIF..YEZIDI LETTER ET 10EB0..10EB1 ; ID_Start # Lo [2] YEZIDI LETTER LAM WITH DOT ABOVE..YEZIDI LETTER YOT WITH CIRCUMFLEX ABOVE 10EC2..10EC4 ; ID_Start # Lo [3] ARABIC LETTER DAL WITH TWO DOTS VERTICALLY BELOW..ARABIC LETTER KAF WITH TWO DOTS VERTICALLY BELOW 10EC5 ; ID_Start # Lm ARABIC SMALL YEH BARREE WITH TWO DOTS BELOW 10EC6..10EC7 ; ID_Start # Lo [2] ARABIC LETTER THIN NOON..ARABIC LETTER YEH WITH FOUR DOTS BELOW 10F00..10F1C ; ID_Start # Lo [29] OLD SOGDIAN LETTER ALEPH..OLD SOGDIAN LETTER FINAL TAW WITH VERTICAL TAIL 10F27 ; ID_Start # Lo OLD SOGDIAN LIGATURE AYIN-DALETH 10F30..10F45 ; ID_Start # Lo [22] SOGDIAN LETTER ALEPH..SOGDIAN INDEPENDENT SHIN 10F70..10F81 ; ID_Start # Lo [18] OLD UYGHUR LETTER ALEPH..OLD UYGHUR LETTER LESH 10FB0..10FC4 ; ID_Start # Lo [21] CHORASMIAN LETTER ALEPH..CHORASMIAN LETTER TAW 10FE0..10FF6 ; ID_Start # Lo [23] ELYMAIC LETTER ALEPH..ELYMAIC LIGATURE ZAYIN-YODH 11003..11037 ; ID_Start # Lo [53] BRAHMI SIGN JIHVAMULIYA..BRAHMI LETTER OLD TAMIL NNNA 11071..11072 ; ID_Start # Lo [2] BRAHMI LETTER OLD TAMIL SHORT E..BRAHMI LETTER OLD TAMIL SHORT O 11075 ; ID_Start # Lo BRAHMI LETTER OLD TAMIL LLA 11083..110AF ; ID_Start # Lo [45] KAITHI LETTER A..KAITHI LETTER HA 110D0..110E8 ; ID_Start # Lo [25] SORA SOMPENG LETTER SAH..SORA SOMPENG LETTER MAE 11103..11126 ; ID_Start # Lo [36] CHAKMA LETTER AA..CHAKMA LETTER HAA 11144 ; ID_Start # Lo CHAKMA LETTER LHAA 11147 ; ID_Start # Lo CHAKMA LETTER VAA 11150..11172 ; ID_Start # Lo [35] MAHAJANI LETTER A..MAHAJANI LETTER RRA 11176 ; ID_Start # Lo MAHAJANI LIGATURE SHRI 11183..111B2 ; ID_Start # Lo [48] SHARADA LETTER A..SHARADA 
LETTER HA 111C1..111C4 ; ID_Start # Lo [4] SHARADA SIGN AVAGRAHA..SHARADA OM 111DA ; ID_Start # Lo SHARADA EKAM 111DC ; ID_Start # Lo SHARADA HEADSTROKE 11200..11211 ; ID_Start # Lo [18] KHOJKI LETTER A..KHOJKI LETTER JJA 11213..1122B ; ID_Start # Lo [25] KHOJKI LETTER NYA..KHOJKI LETTER LLA 1123F..11240 ; ID_Start # Lo [2] KHOJKI LETTER QA..KHOJKI LETTER SHORT I 11280..11286 ; ID_Start # Lo [7] MULTANI LETTER A..MULTANI LETTER GA 11288 ; ID_Start # Lo MULTANI LETTER GHA 1128A..1128D ; ID_Start # Lo [4] MULTANI LETTER CA..MULTANI LETTER JJA 1128F..1129D ; ID_Start # Lo [15] MULTANI LETTER NYA..MULTANI LETTER BA 1129F..112A8 ; ID_Start # Lo [10] MULTANI LETTER BHA..MULTANI LETTER RHA 112B0..112DE ; ID_Start # Lo [47] KHUDAWADI LETTER A..KHUDAWADI LETTER HA 11305..1130C ; ID_Start # Lo [8] GRANTHA LETTER A..GRANTHA LETTER VOCALIC L 1130F..11310 ; ID_Start # Lo [2] GRANTHA LETTER EE..GRANTHA LETTER AI 11313..11328 ; ID_Start # Lo [22] GRANTHA LETTER OO..GRANTHA LETTER NA 1132A..11330 ; ID_Start # Lo [7] GRANTHA LETTER PA..GRANTHA LETTER RA 11332..11333 ; ID_Start # Lo [2] GRANTHA LETTER LA..GRANTHA LETTER LLA 11335..11339 ; ID_Start # Lo [5] GRANTHA LETTER VA..GRANTHA LETTER HA 1133D ; ID_Start # Lo GRANTHA SIGN AVAGRAHA 11350 ; ID_Start # Lo GRANTHA OM 1135D..11361 ; ID_Start # Lo [5] GRANTHA SIGN PLUTA..GRANTHA LETTER VOCALIC LL 11380..11389 ; ID_Start # Lo [10] TULU-TIGALARI LETTER A..TULU-TIGALARI LETTER VOCALIC LL 1138B ; ID_Start # Lo TULU-TIGALARI LETTER EE 1138E ; ID_Start # Lo TULU-TIGALARI LETTER AI 11390..113B5 ; ID_Start # Lo [38] TULU-TIGALARI LETTER OO..TULU-TIGALARI LETTER LLLA 113B7 ; ID_Start # Lo TULU-TIGALARI SIGN AVAGRAHA 113D1 ; ID_Start # Lo TULU-TIGALARI REPHA 113D3 ; ID_Start # Lo TULU-TIGALARI SIGN PLUTA 11400..11434 ; ID_Start # Lo [53] NEWA LETTER A..NEWA LETTER HA 11447..1144A ; ID_Start # Lo [4] NEWA SIGN AVAGRAHA..NEWA SIDDHI 1145F..11461 ; ID_Start # Lo [3] NEWA LETTER VEDIC ANUSVARA..NEWA SIGN UPADHMANIYA 11480..114AF ; ID_Start # Lo 
[48] TIRHUTA ANJI..TIRHUTA LETTER HA 114C4..114C5 ; ID_Start # Lo [2] TIRHUTA SIGN AVAGRAHA..TIRHUTA GVANG 114C7 ; ID_Start # Lo TIRHUTA OM 11580..115AE ; ID_Start # Lo [47] SIDDHAM LETTER A..SIDDHAM LETTER HA 115D8..115DB ; ID_Start # Lo [4] SIDDHAM LETTER THREE-CIRCLE ALTERNATE I..SIDDHAM LETTER ALTERNATE U 11600..1162F ; ID_Start # Lo [48] MODI LETTER A..MODI LETTER LLA 11644 ; ID_Start # Lo MODI SIGN HUVA 11680..116AA ; ID_Start # Lo [43] TAKRI LETTER A..TAKRI LETTER RRA 116B8 ; ID_Start # Lo TAKRI LETTER ARCHAIC KHA 11700..1171A ; ID_Start # Lo [27] AHOM LETTER KA..AHOM LETTER ALTERNATE BA 11740..11746 ; ID_Start # Lo [7] AHOM LETTER CA..AHOM LETTER LLA 11800..1182B ; ID_Start # Lo [44] DOGRA LETTER A..DOGRA LETTER RRA 118A0..118DF ; ID_Start # L& [64] WARANG CITI CAPITAL LETTER NGAA..WARANG CITI SMALL LETTER VIYO 118FF..11906 ; ID_Start # Lo [8] WARANG CITI OM..DIVES AKURU LETTER E 11909 ; ID_Start # Lo DIVES AKURU LETTER O 1190C..11913 ; ID_Start # Lo [8] DIVES AKURU LETTER KA..DIVES AKURU LETTER JA 11915..11916 ; ID_Start # Lo [2] DIVES AKURU LETTER NYA..DIVES AKURU LETTER TTA 11918..1192F ; ID_Start # Lo [24] DIVES AKURU LETTER DDA..DIVES AKURU LETTER ZA 1193F ; ID_Start # Lo DIVES AKURU PREFIXED NASAL SIGN 11941 ; ID_Start # Lo DIVES AKURU INITIAL RA 119A0..119A7 ; ID_Start # Lo [8] NANDINAGARI LETTER A..NANDINAGARI LETTER VOCALIC RR 119AA..119D0 ; ID_Start # Lo [39] NANDINAGARI LETTER E..NANDINAGARI LETTER RRA 119E1 ; ID_Start # Lo NANDINAGARI SIGN AVAGRAHA 119E3 ; ID_Start # Lo NANDINAGARI HEADSTROKE 11A00 ; ID_Start # Lo ZANABAZAR SQUARE LETTER A 11A0B..11A32 ; ID_Start # Lo [40] ZANABAZAR SQUARE LETTER KA..ZANABAZAR SQUARE LETTER KSSA 11A3A ; ID_Start # Lo ZANABAZAR SQUARE CLUSTER-INITIAL LETTER RA 11A50 ; ID_Start # Lo SOYOMBO LETTER A 11A5C..11A89 ; ID_Start # Lo [46] SOYOMBO LETTER KA..SOYOMBO CLUSTER-INITIAL LETTER SA 11A9D ; ID_Start # Lo SOYOMBO MARK PLUTA 11AB0..11AF8 ; ID_Start # Lo [73] CANADIAN SYLLABICS NATTILIK HI..PAU CIN HAU GLOTTAL STOP 
FINAL 11BC0..11BE0 ; ID_Start # Lo [33] SUNUWAR LETTER DEVI..SUNUWAR LETTER KLOKO 11C00..11C08 ; ID_Start # Lo [9] BHAIKSUKI LETTER A..BHAIKSUKI LETTER VOCALIC L 11C0A..11C2E ; ID_Start # Lo [37] BHAIKSUKI LETTER E..BHAIKSUKI LETTER HA 11C40 ; ID_Start # Lo BHAIKSUKI SIGN AVAGRAHA 11C72..11C8F ; ID_Start # Lo [30] MARCHEN LETTER KA..MARCHEN LETTER A 11D00..11D06 ; ID_Start # Lo [7] MASARAM GONDI LETTER A..MASARAM GONDI LETTER E 11D08..11D09 ; ID_Start # Lo [2] MASARAM GONDI LETTER AI..MASARAM GONDI LETTER O 11D0B..11D30 ; ID_Start # Lo [38] MASARAM GONDI LETTER AU..MASARAM GONDI LETTER TRA 11D46 ; ID_Start # Lo MASARAM GONDI REPHA 11D60..11D65 ; ID_Start # Lo [6] GUNJALA GONDI LETTER A..GUNJALA GONDI LETTER UU 11D67..11D68 ; ID_Start # Lo [2] GUNJALA GONDI LETTER EE..GUNJALA GONDI LETTER AI 11D6A..11D89 ; ID_Start # Lo [32] GUNJALA GONDI LETTER OO..GUNJALA GONDI LETTER SA 11D98 ; ID_Start # Lo GUNJALA GONDI OM 11DB0..11DD8 ; ID_Start # Lo [41] TOLONG SIKI LETTER I..TOLONG SIKI LETTER RRH 11DD9 ; ID_Start # Lm TOLONG SIKI SIGN SELA 11DDA..11DDB ; ID_Start # Lo [2] TOLONG SIKI SIGN HECAKA..TOLONG SIKI UNGGA 11EE0..11EF2 ; ID_Start # Lo [19] MAKASAR LETTER KA..MAKASAR ANGKA 11F02 ; ID_Start # Lo KAWI SIGN REPHA 11F04..11F10 ; ID_Start # Lo [13] KAWI LETTER A..KAWI LETTER O 11F12..11F33 ; ID_Start # Lo [34] KAWI LETTER KA..KAWI LETTER JNYA 11FB0 ; ID_Start # Lo LISU LETTER YHA 12000..12399 ; ID_Start # Lo [922] CUNEIFORM SIGN A..CUNEIFORM SIGN U U 12400..1246E ; ID_Start # Nl [111] CUNEIFORM NUMERIC SIGN TWO ASH..CUNEIFORM NUMERIC SIGN NINE U VARIANT FORM 12480..12543 ; ID_Start # Lo [196] CUNEIFORM SIGN AB TIMES NUN TENU..CUNEIFORM SIGN ZU5 TIMES THREE DISH TENU 12F90..12FF0 ; ID_Start # Lo [97] CYPRO-MINOAN SIGN CM001..CYPRO-MINOAN SIGN CM114 13000..1342F ; ID_Start # Lo [1072] EGYPTIAN HIEROGLYPH A001..EGYPTIAN HIEROGLYPH V011D 13441..13446 ; ID_Start # Lo [6] EGYPTIAN HIEROGLYPH FULL BLANK..EGYPTIAN HIEROGLYPH WIDE LOST SIGN 13460..143FA ; ID_Start # Lo [3995] 
EGYPTIAN HIEROGLYPH-13460..EGYPTIAN HIEROGLYPH-143FA 14400..14646 ; ID_Start # Lo [583] ANATOLIAN HIEROGLYPH A001..ANATOLIAN HIEROGLYPH A530 16100..1611D ; ID_Start # Lo [30] GURUNG KHEMA LETTER A..GURUNG KHEMA LETTER SA 16800..16A38 ; ID_Start # Lo [569] BAMUM LETTER PHASE-A NGKUE MFON..BAMUM LETTER PHASE-F VUEQ 16A40..16A5E ; ID_Start # Lo [31] MRO LETTER TA..MRO LETTER TEK 16A70..16ABE ; ID_Start # Lo [79] TANGSA LETTER OZ..TANGSA LETTER ZA 16AD0..16AED ; ID_Start # Lo [30] BASSA VAH LETTER ENNI..BASSA VAH LETTER I 16B00..16B2F ; ID_Start # Lo [48] PAHAWH HMONG VOWEL KEEB..PAHAWH HMONG CONSONANT CAU 16B40..16B43 ; ID_Start # Lm [4] PAHAWH HMONG SIGN VOS SEEV..PAHAWH HMONG SIGN IB YAM 16B63..16B77 ; ID_Start # Lo [21] PAHAWH HMONG SIGN VOS LUB..PAHAWH HMONG SIGN CIM NRES TOS 16B7D..16B8F ; ID_Start # Lo [19] PAHAWH HMONG CLAN SIGN TSHEEJ..PAHAWH HMONG CLAN SIGN VWJ 16D40..16D42 ; ID_Start # Lm [3] KIRAT RAI SIGN ANUSVARA..KIRAT RAI SIGN VISARGA 16D43..16D6A ; ID_Start # Lo [40] KIRAT RAI LETTER A..KIRAT RAI VOWEL SIGN AU 16D6B..16D6C ; ID_Start # Lm [2] KIRAT RAI SIGN VIRAMA..KIRAT RAI SIGN SAAT 16E40..16E7F ; ID_Start # L& [64] MEDEFAIDRIN CAPITAL LETTER M..MEDEFAIDRIN SMALL LETTER Y 16EA0..16EB8 ; ID_Start # L& [25] BERIA ERFE CAPITAL LETTER ARKAB..BERIA ERFE CAPITAL LETTER AY 16EBB..16ED3 ; ID_Start # L& [25] BERIA ERFE SMALL LETTER ARKAB..BERIA ERFE SMALL LETTER AY 16F00..16F4A ; ID_Start # Lo [75] MIAO LETTER PA..MIAO LETTER RTE 16F50 ; ID_Start # Lo MIAO LETTER NASALIZATION 16F93..16F9F ; ID_Start # Lm [13] MIAO LETTER TONE-2..MIAO LETTER REFORMED TONE-8 16FE0..16FE1 ; ID_Start # Lm [2] TANGUT ITERATION MARK..NUSHU ITERATION MARK 16FE3 ; ID_Start # Lm OLD CHINESE ITERATION MARK 16FF2..16FF3 ; ID_Start # Lm [2] CHINESE SMALL SIMPLIFIED ER..CHINESE SMALL TRADITIONAL ER 16FF4..16FF6 ; ID_Start # Nl [3] YANGQIN SIGN SLOW ONE BEAT..YANGQIN SIGN SLOW TWO BEATS 17000..18CD5 ; ID_Start # Lo [7382] TANGUT IDEOGRAPH-17000..KHITAN SMALL SCRIPT CHARACTER-18CD5 
18CFF..18D1E ; ID_Start # Lo [32] KHITAN SMALL SCRIPT CHARACTER-18CFF..TANGUT IDEOGRAPH-18D1E 18D80..18DF2 ; ID_Start # Lo [115] TANGUT COMPONENT-769..TANGUT COMPONENT-883 1AFF0..1AFF3 ; ID_Start # Lm [4] KATAKANA LETTER MINNAN TONE-2..KATAKANA LETTER MINNAN TONE-5 1AFF5..1AFFB ; ID_Start # Lm [7] KATAKANA LETTER MINNAN TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-5 1AFFD..1AFFE ; ID_Start # Lm [2] KATAKANA LETTER MINNAN NASALIZED TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-8 1B000..1B122 ; ID_Start # Lo [291] KATAKANA LETTER ARCHAIC E..KATAKANA LETTER ARCHAIC WU 1B132 ; ID_Start # Lo HIRAGANA LETTER SMALL KO 1B150..1B152 ; ID_Start # Lo [3] HIRAGANA LETTER SMALL WI..HIRAGANA LETTER SMALL WO 1B155 ; ID_Start # Lo KATAKANA LETTER SMALL KO 1B164..1B167 ; ID_Start # Lo [4] KATAKANA LETTER SMALL WI..KATAKANA LETTER SMALL N 1B170..1B2FB ; ID_Start # Lo [396] NUSHU CHARACTER-1B170..NUSHU CHARACTER-1B2FB 1BC00..1BC6A ; ID_Start # Lo [107] DUPLOYAN LETTER H..DUPLOYAN LETTER VOCALIC M 1BC70..1BC7C ; ID_Start # Lo [13] DUPLOYAN AFFIX LEFT HORIZONTAL SECANT..DUPLOYAN AFFIX ATTACHED TANGENT HOOK 1BC80..1BC88 ; ID_Start # Lo [9] DUPLOYAN AFFIX HIGH ACUTE..DUPLOYAN AFFIX HIGH VERTICAL 1BC90..1BC99 ; ID_Start # Lo [10] DUPLOYAN AFFIX LOW ACUTE..DUPLOYAN AFFIX LOW ARROW 1D400..1D454 ; ID_Start # L& [85] MATHEMATICAL BOLD CAPITAL A..MATHEMATICAL ITALIC SMALL G 1D456..1D49C ; ID_Start # L& [71] MATHEMATICAL ITALIC SMALL I..MATHEMATICAL SCRIPT CAPITAL A 1D49E..1D49F ; ID_Start # L& [2] MATHEMATICAL SCRIPT CAPITAL C..MATHEMATICAL SCRIPT CAPITAL D 1D4A2 ; ID_Start # L& MATHEMATICAL SCRIPT CAPITAL G 1D4A5..1D4A6 ; ID_Start # L& [2] MATHEMATICAL SCRIPT CAPITAL J..MATHEMATICAL SCRIPT CAPITAL K 1D4A9..1D4AC ; ID_Start # L& [4] MATHEMATICAL SCRIPT CAPITAL N..MATHEMATICAL SCRIPT CAPITAL Q 1D4AE..1D4B9 ; ID_Start # L& [12] MATHEMATICAL SCRIPT CAPITAL S..MATHEMATICAL SCRIPT SMALL D 1D4BB ; ID_Start # L& MATHEMATICAL SCRIPT SMALL F 1D4BD..1D4C3 ; ID_Start # L& [7] MATHEMATICAL SCRIPT SMALL 
H..MATHEMATICAL SCRIPT SMALL N 1D4C5..1D505 ; ID_Start # L& [65] MATHEMATICAL SCRIPT SMALL P..MATHEMATICAL FRAKTUR CAPITAL B 1D507..1D50A ; ID_Start # L& [4] MATHEMATICAL FRAKTUR CAPITAL D..MATHEMATICAL FRAKTUR CAPITAL G 1D50D..1D514 ; ID_Start # L& [8] MATHEMATICAL FRAKTUR CAPITAL J..MATHEMATICAL FRAKTUR CAPITAL Q 1D516..1D51C ; ID_Start # L& [7] MATHEMATICAL FRAKTUR CAPITAL S..MATHEMATICAL FRAKTUR CAPITAL Y 1D51E..1D539 ; ID_Start # L& [28] MATHEMATICAL FRAKTUR SMALL A..MATHEMATICAL DOUBLE-STRUCK CAPITAL B 1D53B..1D53E ; ID_Start # L& [4] MATHEMATICAL DOUBLE-STRUCK CAPITAL D..MATHEMATICAL DOUBLE-STRUCK CAPITAL G 1D540..1D544 ; ID_Start # L& [5] MATHEMATICAL DOUBLE-STRUCK CAPITAL I..MATHEMATICAL DOUBLE-STRUCK CAPITAL M 1D546 ; ID_Start # L& MATHEMATICAL DOUBLE-STRUCK CAPITAL O 1D54A..1D550 ; ID_Start # L& [7] MATHEMATICAL DOUBLE-STRUCK CAPITAL S..MATHEMATICAL DOUBLE-STRUCK CAPITAL Y 1D552..1D6A5 ; ID_Start # L& [340] MATHEMATICAL DOUBLE-STRUCK SMALL A..MATHEMATICAL ITALIC SMALL DOTLESS J 1D6A8..1D6C0 ; ID_Start # L& [25] MATHEMATICAL BOLD CAPITAL ALPHA..MATHEMATICAL BOLD CAPITAL OMEGA 1D6C2..1D6DA ; ID_Start # L& [25] MATHEMATICAL BOLD SMALL ALPHA..MATHEMATICAL BOLD SMALL OMEGA 1D6DC..1D6FA ; ID_Start # L& [31] MATHEMATICAL BOLD EPSILON SYMBOL..MATHEMATICAL ITALIC CAPITAL OMEGA 1D6FC..1D714 ; ID_Start # L& [25] MATHEMATICAL ITALIC SMALL ALPHA..MATHEMATICAL ITALIC SMALL OMEGA 1D716..1D734 ; ID_Start # L& [31] MATHEMATICAL ITALIC EPSILON SYMBOL..MATHEMATICAL BOLD ITALIC CAPITAL OMEGA 1D736..1D74E ; ID_Start # L& [25] MATHEMATICAL BOLD ITALIC SMALL ALPHA..MATHEMATICAL BOLD ITALIC SMALL OMEGA 1D750..1D76E ; ID_Start # L& [31] MATHEMATICAL BOLD ITALIC EPSILON SYMBOL..MATHEMATICAL SANS-SERIF BOLD CAPITAL OMEGA 1D770..1D788 ; ID_Start # L& [25] MATHEMATICAL SANS-SERIF BOLD SMALL ALPHA..MATHEMATICAL SANS-SERIF BOLD SMALL OMEGA 1D78A..1D7A8 ; ID_Start # L& [31] MATHEMATICAL SANS-SERIF BOLD EPSILON SYMBOL..MATHEMATICAL SANS-SERIF BOLD ITALIC CAPITAL OMEGA 1D7AA..1D7C2 ; 
ID_Start # L& [25] MATHEMATICAL SANS-SERIF BOLD ITALIC SMALL ALPHA..MATHEMATICAL SANS-SERIF BOLD ITALIC SMALL OMEGA 1D7C4..1D7CB ; ID_Start # L& [8] MATHEMATICAL SANS-SERIF BOLD ITALIC EPSILON SYMBOL..MATHEMATICAL BOLD SMALL DIGAMMA 1DF00..1DF09 ; ID_Start # L& [10] LATIN SMALL LETTER FENG DIGRAPH WITH TRILL..LATIN SMALL LETTER T WITH HOOK AND RETROFLEX HOOK 1DF0A ; ID_Start # Lo LATIN LETTER RETROFLEX CLICK WITH RETROFLEX HOOK 1DF0B..1DF1E ; ID_Start # L& [20] LATIN SMALL LETTER ESH WITH DOUBLE BAR..LATIN SMALL LETTER S WITH CURL 1DF25..1DF2A ; ID_Start # L& [6] LATIN SMALL LETTER D WITH MID-HEIGHT LEFT HOOK..LATIN SMALL LETTER T WITH MID-HEIGHT LEFT HOOK 1E030..1E06D ; ID_Start # Lm [62] MODIFIER LETTER CYRILLIC SMALL A..MODIFIER LETTER CYRILLIC SMALL STRAIGHT U WITH STROKE 1E100..1E12C ; ID_Start # Lo [45] NYIAKENG PUACHUE HMONG LETTER MA..NYIAKENG PUACHUE HMONG LETTER W 1E137..1E13D ; ID_Start # Lm [7] NYIAKENG PUACHUE HMONG SIGN FOR PERSON..NYIAKENG PUACHUE HMONG SYLLABLE LENGTHENER 1E14E ; ID_Start # Lo NYIAKENG PUACHUE HMONG LOGOGRAM NYAJ 1E290..1E2AD ; ID_Start # Lo [30] TOTO LETTER PA..TOTO LETTER A 1E2C0..1E2EB ; ID_Start # Lo [44] WANCHO LETTER AA..WANCHO LETTER YIH 1E4D0..1E4EA ; ID_Start # Lo [27] NAG MUNDARI LETTER O..NAG MUNDARI LETTER ELL 1E4EB ; ID_Start # Lm NAG MUNDARI SIGN OJOD 1E5D0..1E5ED ; ID_Start # Lo [30] OL ONAL LETTER O..OL ONAL LETTER EG 1E5F0 ; ID_Start # Lo OL ONAL SIGN HODDOND 1E6C0..1E6DE ; ID_Start # Lo [31] TAI YO LETTER LOW KO..TAI YO LETTER HIGH KVO 1E6E0..1E6E2 ; ID_Start # Lo [3] TAI YO LETTER AA..TAI YO LETTER UE 1E6E4..1E6E5 ; ID_Start # Lo [2] TAI YO LETTER U..TAI YO LETTER AE 1E6E7..1E6ED ; ID_Start # Lo [7] TAI YO LETTER O..TAI YO LETTER AUE 1E6F0..1E6F4 ; ID_Start # Lo [5] TAI YO LETTER AN..TAI YO LETTER AP 1E6FE ; ID_Start # Lo TAI YO SYMBOL MUEANG 1E6FF ; ID_Start # Lm TAI YO XAM LAI 1E7E0..1E7E6 ; ID_Start # Lo [7] ETHIOPIC SYLLABLE HHYA..ETHIOPIC SYLLABLE HHYO 1E7E8..1E7EB ; ID_Start # Lo [4] ETHIOPIC SYLLABLE GURAGE 
HHWA..ETHIOPIC SYLLABLE HHWE 1E7ED..1E7EE ; ID_Start # Lo [2] ETHIOPIC SYLLABLE GURAGE MWI..ETHIOPIC SYLLABLE GURAGE MWEE 1E7F0..1E7FE ; ID_Start # Lo [15] ETHIOPIC SYLLABLE GURAGE QWI..ETHIOPIC SYLLABLE GURAGE PWEE 1E800..1E8C4 ; ID_Start # Lo [197] MENDE KIKAKUI SYLLABLE M001 KI..MENDE KIKAKUI SYLLABLE M060 NYON 1E900..1E943 ; ID_Start # L& [68] ADLAM CAPITAL LETTER ALIF..ADLAM SMALL LETTER SHA 1E94B ; ID_Start # Lm ADLAM NASALIZATION MARK 1EE00..1EE03 ; ID_Start # Lo [4] ARABIC MATHEMATICAL ALEF..ARABIC MATHEMATICAL DAL 1EE05..1EE1F ; ID_Start # Lo [27] ARABIC MATHEMATICAL WAW..ARABIC MATHEMATICAL DOTLESS QAF 1EE21..1EE22 ; ID_Start # Lo [2] ARABIC MATHEMATICAL INITIAL BEH..ARABIC MATHEMATICAL INITIAL JEEM 1EE24 ; ID_Start # Lo ARABIC MATHEMATICAL INITIAL HEH 1EE27 ; ID_Start # Lo ARABIC MATHEMATICAL INITIAL HAH 1EE29..1EE32 ; ID_Start # Lo [10] ARABIC MATHEMATICAL INITIAL YEH..ARABIC MATHEMATICAL INITIAL QAF 1EE34..1EE37 ; ID_Start # Lo [4] ARABIC MATHEMATICAL INITIAL SHEEN..ARABIC MATHEMATICAL INITIAL KHAH 1EE39 ; ID_Start # Lo ARABIC MATHEMATICAL INITIAL DAD 1EE3B ; ID_Start # Lo ARABIC MATHEMATICAL INITIAL GHAIN 1EE42 ; ID_Start # Lo ARABIC MATHEMATICAL TAILED JEEM 1EE47 ; ID_Start # Lo ARABIC MATHEMATICAL TAILED HAH 1EE49 ; ID_Start # Lo ARABIC MATHEMATICAL TAILED YEH 1EE4B ; ID_Start # Lo ARABIC MATHEMATICAL TAILED LAM 1EE4D..1EE4F ; ID_Start # Lo [3] ARABIC MATHEMATICAL TAILED NOON..ARABIC MATHEMATICAL TAILED AIN 1EE51..1EE52 ; ID_Start # Lo [2] ARABIC MATHEMATICAL TAILED SAD..ARABIC MATHEMATICAL TAILED QAF 1EE54 ; ID_Start # Lo ARABIC MATHEMATICAL TAILED SHEEN 1EE57 ; ID_Start # Lo ARABIC MATHEMATICAL TAILED KHAH 1EE59 ; ID_Start # Lo ARABIC MATHEMATICAL TAILED DAD 1EE5B ; ID_Start # Lo ARABIC MATHEMATICAL TAILED GHAIN 1EE5D ; ID_Start # Lo ARABIC MATHEMATICAL TAILED DOTLESS NOON 1EE5F ; ID_Start # Lo ARABIC MATHEMATICAL TAILED DOTLESS QAF 1EE61..1EE62 ; ID_Start # Lo [2] ARABIC MATHEMATICAL STRETCHED BEH..ARABIC MATHEMATICAL STRETCHED JEEM 1EE64 ; 
ID_Start # Lo ARABIC MATHEMATICAL STRETCHED HEH 1EE67..1EE6A ; ID_Start # Lo [4] ARABIC MATHEMATICAL STRETCHED HAH..ARABIC MATHEMATICAL STRETCHED KAF 1EE6C..1EE72 ; ID_Start # Lo [7] ARABIC MATHEMATICAL STRETCHED MEEM..ARABIC MATHEMATICAL STRETCHED QAF 1EE74..1EE77 ; ID_Start # Lo [4] ARABIC MATHEMATICAL STRETCHED SHEEN..ARABIC MATHEMATICAL STRETCHED KHAH 1EE79..1EE7C ; ID_Start # Lo [4] ARABIC MATHEMATICAL STRETCHED DAD..ARABIC MATHEMATICAL STRETCHED DOTLESS BEH 1EE7E ; ID_Start # Lo ARABIC MATHEMATICAL STRETCHED DOTLESS FEH 1EE80..1EE89 ; ID_Start # Lo [10] ARABIC MATHEMATICAL LOOPED ALEF..ARABIC MATHEMATICAL LOOPED YEH 1EE8B..1EE9B ; ID_Start # Lo [17] ARABIC MATHEMATICAL LOOPED LAM..ARABIC MATHEMATICAL LOOPED GHAIN 1EEA1..1EEA3 ; ID_Start # Lo [3] ARABIC MATHEMATICAL DOUBLE-STRUCK BEH..ARABIC MATHEMATICAL DOUBLE-STRUCK DAL 1EEA5..1EEA9 ; ID_Start # Lo [5] ARABIC MATHEMATICAL DOUBLE-STRUCK WAW..ARABIC MATHEMATICAL DOUBLE-STRUCK YEH 1EEAB..1EEBB ; ID_Start # Lo [17] ARABIC MATHEMATICAL DOUBLE-STRUCK LAM..ARABIC MATHEMATICAL DOUBLE-STRUCK GHAIN 20000..2A6DF ; ID_Start # Lo [42720] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6DF 2A700..2B81D ; ID_Start # Lo [4382] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B81D 2B820..2CEAD ; ID_Start # Lo [5774] CJK UNIFIED IDEOGRAPH-2B820..CJK UNIFIED IDEOGRAPH-2CEAD 2CEB0..2EBE0 ; ID_Start # Lo [7473] CJK UNIFIED IDEOGRAPH-2CEB0..CJK UNIFIED IDEOGRAPH-2EBE0 2EBF0..2EE5D ; ID_Start # Lo [622] CJK UNIFIED IDEOGRAPH-2EBF0..CJK UNIFIED IDEOGRAPH-2EE5D 2F800..2FA1D ; ID_Start # Lo [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D 30000..3134A ; ID_Start # Lo [4939] CJK UNIFIED IDEOGRAPH-30000..CJK UNIFIED IDEOGRAPH-3134A 31350..33479 ; ID_Start # Lo [8490] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-33479 # Total code points: 145916 # ================================================ # Derived Property: ID_Continue # Characters that can continue an identifier. 
# Generated from: # ID_Start # + Mn + Mc + Nd + Pc # + Other_ID_Continue # - Pattern_Syntax # - Pattern_White_Space # NOTE: See UAX #31 for more information 0030..0039 ; ID_Continue # Nd [10] DIGIT ZERO..DIGIT NINE 0041..005A ; ID_Continue # L& [26] LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z 005F ; ID_Continue # Pc LOW LINE 0061..007A ; ID_Continue # L& [26] LATIN SMALL LETTER A..LATIN SMALL LETTER Z 00AA ; ID_Continue # Lo FEMININE ORDINAL INDICATOR 00B5 ; ID_Continue # L& MICRO SIGN 00B7 ; ID_Continue # Po MIDDLE DOT 00BA ; ID_Continue # Lo MASCULINE ORDINAL INDICATOR 00C0..00D6 ; ID_Continue # L& [23] LATIN CAPITAL LETTER A WITH GRAVE..LATIN CAPITAL LETTER O WITH DIAERESIS 00D8..00F6 ; ID_Continue # L& [31] LATIN CAPITAL LETTER O WITH STROKE..LATIN SMALL LETTER O WITH DIAERESIS 00F8..01BA ; ID_Continue # L& [195] LATIN SMALL LETTER O WITH STROKE..LATIN SMALL LETTER EZH WITH TAIL 01BB ; ID_Continue # Lo LATIN LETTER TWO WITH STROKE 01BC..01BF ; ID_Continue # L& [4] LATIN CAPITAL LETTER TONE FIVE..LATIN LETTER WYNN 01C0..01C3 ; ID_Continue # Lo [4] LATIN LETTER DENTAL CLICK..LATIN LETTER RETROFLEX CLICK 01C4..0293 ; ID_Continue # L& [208] LATIN CAPITAL LETTER DZ WITH CARON..LATIN SMALL LETTER EZH WITH CURL 0294..0295 ; ID_Continue # Lo [2] LATIN LETTER GLOTTAL STOP..LATIN LETTER PHARYNGEAL VOICED FRICATIVE 0296..02AF ; ID_Continue # L& [26] LATIN LETTER INVERTED GLOTTAL STOP..LATIN SMALL LETTER TURNED H WITH FISHHOOK AND TAIL 02B0..02C1 ; ID_Continue # Lm [18] MODIFIER LETTER SMALL H..MODIFIER LETTER REVERSED GLOTTAL STOP 02C6..02D1 ; ID_Continue # Lm [12] MODIFIER LETTER CIRCUMFLEX ACCENT..MODIFIER LETTER HALF TRIANGULAR COLON 02E0..02E4 ; ID_Continue # Lm [5] MODIFIER LETTER SMALL GAMMA..MODIFIER LETTER SMALL REVERSED GLOTTAL STOP 02EC ; ID_Continue # Lm MODIFIER LETTER VOICING 02EE ; ID_Continue # Lm MODIFIER LETTER DOUBLE APOSTROPHE 0300..036F ; ID_Continue # Mn [112] COMBINING GRAVE ACCENT..COMBINING LATIN SMALL LETTER X 0370..0373 ; ID_Continue # L& [4] 
GREEK CAPITAL LETTER HETA..GREEK SMALL LETTER ARCHAIC SAMPI 0374 ; ID_Continue # Lm GREEK NUMERAL SIGN 0376..0377 ; ID_Continue # L& [2] GREEK CAPITAL LETTER PAMPHYLIAN DIGAMMA..GREEK SMALL LETTER PAMPHYLIAN DIGAMMA 037A ; ID_Continue # Lm GREEK YPOGEGRAMMENI 037B..037D ; ID_Continue # L& [3] GREEK SMALL REVERSED LUNATE SIGMA SYMBOL..GREEK SMALL REVERSED DOTTED LUNATE SIGMA SYMBOL 037F ; ID_Continue # L& GREEK CAPITAL LETTER YOT 0386 ; ID_Continue # L& GREEK CAPITAL LETTER ALPHA WITH TONOS 0387 ; ID_Continue # Po GREEK ANO TELEIA 0388..038A ; ID_Continue # L& [3] GREEK CAPITAL LETTER EPSILON WITH TONOS..GREEK CAPITAL LETTER IOTA WITH TONOS 038C ; ID_Continue # L& GREEK CAPITAL LETTER OMICRON WITH TONOS 038E..03A1 ; ID_Continue # L& [20] GREEK CAPITAL LETTER UPSILON WITH TONOS..GREEK CAPITAL LETTER RHO 03A3..03F5 ; ID_Continue # L& [83] GREEK CAPITAL LETTER SIGMA..GREEK LUNATE EPSILON SYMBOL 03F7..0481 ; ID_Continue # L& [139] GREEK CAPITAL LETTER SHO..CYRILLIC SMALL LETTER KOPPA 0483..0487 ; ID_Continue # Mn [5] COMBINING CYRILLIC TITLO..COMBINING CYRILLIC POKRYTIE 048A..052F ; ID_Continue # L& [166] CYRILLIC CAPITAL LETTER SHORT I WITH TAIL..CYRILLIC SMALL LETTER EL WITH DESCENDER 0531..0556 ; ID_Continue # L& [38] ARMENIAN CAPITAL LETTER AYB..ARMENIAN CAPITAL LETTER FEH 0559 ; ID_Continue # Lm ARMENIAN MODIFIER LETTER LEFT HALF RING 0560..0588 ; ID_Continue # L& [41] ARMENIAN SMALL LETTER TURNED AYB..ARMENIAN SMALL LETTER YI WITH STROKE 0591..05BD ; ID_Continue # Mn [45] HEBREW ACCENT ETNAHTA..HEBREW POINT METEG 05BF ; ID_Continue # Mn HEBREW POINT RAFE 05C1..05C2 ; ID_Continue # Mn [2] HEBREW POINT SHIN DOT..HEBREW POINT SIN DOT 05C4..05C5 ; ID_Continue # Mn [2] HEBREW MARK UPPER DOT..HEBREW MARK LOWER DOT 05C7 ; ID_Continue # Mn HEBREW POINT QAMATS QATAN 05D0..05EA ; ID_Continue # Lo [27] HEBREW LETTER ALEF..HEBREW LETTER TAV 05EF..05F2 ; ID_Continue # Lo [4] HEBREW YOD TRIANGLE..HEBREW LIGATURE YIDDISH DOUBLE YOD 0610..061A ; ID_Continue # Mn [11] ARABIC SIGN 
SALLALLAHOU ALAYHE WASSALLAM..ARABIC SMALL KASRA 0620..063F ; ID_Continue # Lo [32] ARABIC LETTER KASHMIRI YEH..ARABIC LETTER FARSI YEH WITH THREE DOTS ABOVE 0640 ; ID_Continue # Lm ARABIC TATWEEL 0641..064A ; ID_Continue # Lo [10] ARABIC LETTER FEH..ARABIC LETTER YEH 064B..065F ; ID_Continue # Mn [21] ARABIC FATHATAN..ARABIC WAVY HAMZA BELOW 0660..0669 ; ID_Continue # Nd [10] ARABIC-INDIC DIGIT ZERO..ARABIC-INDIC DIGIT NINE 066E..066F ; ID_Continue # Lo [2] ARABIC LETTER DOTLESS BEH..ARABIC LETTER DOTLESS QAF 0670 ; ID_Continue # Mn ARABIC LETTER SUPERSCRIPT ALEF 0671..06D3 ; ID_Continue # Lo [99] ARABIC LETTER ALEF WASLA..ARABIC LETTER YEH BARREE WITH HAMZA ABOVE 06D5 ; ID_Continue # Lo ARABIC LETTER AE 06D6..06DC ; ID_Continue # Mn [7] ARABIC SMALL HIGH LIGATURE SAD WITH LAM WITH ALEF MAKSURA..ARABIC SMALL HIGH SEEN 06DF..06E4 ; ID_Continue # Mn [6] ARABIC SMALL HIGH ROUNDED ZERO..ARABIC SMALL HIGH MADDA 06E5..06E6 ; ID_Continue # Lm [2] ARABIC SMALL WAW..ARABIC SMALL YEH 06E7..06E8 ; ID_Continue # Mn [2] ARABIC SMALL HIGH YEH..ARABIC SMALL HIGH NOON 06EA..06ED ; ID_Continue # Mn [4] ARABIC EMPTY CENTRE LOW STOP..ARABIC SMALL LOW MEEM 06EE..06EF ; ID_Continue # Lo [2] ARABIC LETTER DAL WITH INVERTED V..ARABIC LETTER REH WITH INVERTED V 06F0..06F9 ; ID_Continue # Nd [10] EXTENDED ARABIC-INDIC DIGIT ZERO..EXTENDED ARABIC-INDIC DIGIT NINE 06FA..06FC ; ID_Continue # Lo [3] ARABIC LETTER SHEEN WITH DOT BELOW..ARABIC LETTER GHAIN WITH DOT BELOW 06FF ; ID_Continue # Lo ARABIC LETTER HEH WITH INVERTED V 0710 ; ID_Continue # Lo SYRIAC LETTER ALAPH 0711 ; ID_Continue # Mn SYRIAC LETTER SUPERSCRIPT ALAPH 0712..072F ; ID_Continue # Lo [30] SYRIAC LETTER BETH..SYRIAC LETTER PERSIAN DHALATH 0730..074A ; ID_Continue # Mn [27] SYRIAC PTHAHA ABOVE..SYRIAC BARREKH 074D..07A5 ; ID_Continue # Lo [89] SYRIAC LETTER SOGDIAN ZHAIN..THAANA LETTER WAAVU 07A6..07B0 ; ID_Continue # Mn [11] THAANA ABAFILI..THAANA SUKUN 07B1 ; ID_Continue # Lo THAANA LETTER NAA 07C0..07C9 ; ID_Continue # Nd 
[10] NKO DIGIT ZERO..NKO DIGIT NINE 07CA..07EA ; ID_Continue # Lo [33] NKO LETTER A..NKO LETTER JONA RA 07EB..07F3 ; ID_Continue # Mn [9] NKO COMBINING SHORT HIGH TONE..NKO COMBINING DOUBLE DOT ABOVE 07F4..07F5 ; ID_Continue # Lm [2] NKO HIGH TONE APOSTROPHE..NKO LOW TONE APOSTROPHE 07FA ; ID_Continue # Lm NKO LAJANYALAN 07FD ; ID_Continue # Mn NKO DANTAYALAN 0800..0815 ; ID_Continue # Lo [22] SAMARITAN LETTER ALAF..SAMARITAN LETTER TAAF 0816..0819 ; ID_Continue # Mn [4] SAMARITAN MARK IN..SAMARITAN MARK DAGESH 081A ; ID_Continue # Lm SAMARITAN MODIFIER LETTER EPENTHETIC YUT 081B..0823 ; ID_Continue # Mn [9] SAMARITAN MARK EPENTHETIC YUT..SAMARITAN VOWEL SIGN A 0824 ; ID_Continue # Lm SAMARITAN MODIFIER LETTER SHORT A 0825..0827 ; ID_Continue # Mn [3] SAMARITAN VOWEL SIGN SHORT A..SAMARITAN VOWEL SIGN U 0828 ; ID_Continue # Lm SAMARITAN MODIFIER LETTER I 0829..082D ; ID_Continue # Mn [5] SAMARITAN VOWEL SIGN LONG I..SAMARITAN MARK NEQUDAA 0840..0858 ; ID_Continue # Lo [25] MANDAIC LETTER HALQA..MANDAIC LETTER AIN 0859..085B ; ID_Continue # Mn [3] MANDAIC AFFRICATION MARK..MANDAIC GEMINATION MARK 0860..086A ; ID_Continue # Lo [11] SYRIAC LETTER MALAYALAM NGA..SYRIAC LETTER MALAYALAM SSA 0870..0887 ; ID_Continue # Lo [24] ARABIC LETTER ALEF WITH ATTACHED FATHA..ARABIC BASELINE ROUND DOT 0889..088F ; ID_Continue # Lo [7] ARABIC LETTER NOON WITH INVERTED SMALL V..ARABIC LETTER NOON WITH RING ABOVE 0897..089F ; ID_Continue # Mn [9] ARABIC PEPET..ARABIC HALF MADDA OVER MADDA 08A0..08C8 ; ID_Continue # Lo [41] ARABIC LETTER BEH WITH SMALL V BELOW..ARABIC LETTER GRAF 08C9 ; ID_Continue # Lm ARABIC SMALL FARSI YEH 08CA..08E1 ; ID_Continue # Mn [24] ARABIC SMALL HIGH FARSI YEH..ARABIC SMALL HIGH SIGN SAFHA 08E3..0902 ; ID_Continue # Mn [32] ARABIC TURNED DAMMA BELOW..DEVANAGARI SIGN ANUSVARA 0903 ; ID_Continue # Mc DEVANAGARI SIGN VISARGA 0904..0939 ; ID_Continue # Lo [54] DEVANAGARI LETTER SHORT A..DEVANAGARI LETTER HA 093A ; ID_Continue # Mn DEVANAGARI VOWEL SIGN OE 093B ; 
ID_Continue # Mc DEVANAGARI VOWEL SIGN OOE 093C ; ID_Continue # Mn DEVANAGARI SIGN NUKTA 093D ; ID_Continue # Lo DEVANAGARI SIGN AVAGRAHA 093E..0940 ; ID_Continue # Mc [3] DEVANAGARI VOWEL SIGN AA..DEVANAGARI VOWEL SIGN II 0941..0948 ; ID_Continue # Mn [8] DEVANAGARI VOWEL SIGN U..DEVANAGARI VOWEL SIGN AI 0949..094C ; ID_Continue # Mc [4] DEVANAGARI VOWEL SIGN CANDRA O..DEVANAGARI VOWEL SIGN AU 094D ; ID_Continue # Mn DEVANAGARI SIGN VIRAMA 094E..094F ; ID_Continue # Mc [2] DEVANAGARI VOWEL SIGN PRISHTHAMATRA E..DEVANAGARI VOWEL SIGN AW 0950 ; ID_Continue # Lo DEVANAGARI OM 0951..0957 ; ID_Continue # Mn [7] DEVANAGARI STRESS SIGN UDATTA..DEVANAGARI VOWEL SIGN UUE 0958..0961 ; ID_Continue # Lo [10] DEVANAGARI LETTER QA..DEVANAGARI LETTER VOCALIC LL 0962..0963 ; ID_Continue # Mn [2] DEVANAGARI VOWEL SIGN VOCALIC L..DEVANAGARI VOWEL SIGN VOCALIC LL 0966..096F ; ID_Continue # Nd [10] DEVANAGARI DIGIT ZERO..DEVANAGARI DIGIT NINE 0971 ; ID_Continue # Lm DEVANAGARI SIGN HIGH SPACING DOT 0972..0980 ; ID_Continue # Lo [15] DEVANAGARI LETTER CANDRA A..BENGALI ANJI 0981 ; ID_Continue # Mn BENGALI SIGN CANDRABINDU 0982..0983 ; ID_Continue # Mc [2] BENGALI SIGN ANUSVARA..BENGALI SIGN VISARGA 0985..098C ; ID_Continue # Lo [8] BENGALI LETTER A..BENGALI LETTER VOCALIC L 098F..0990 ; ID_Continue # Lo [2] BENGALI LETTER E..BENGALI LETTER AI 0993..09A8 ; ID_Continue # Lo [22] BENGALI LETTER O..BENGALI LETTER NA 09AA..09B0 ; ID_Continue # Lo [7] BENGALI LETTER PA..BENGALI LETTER RA 09B2 ; ID_Continue # Lo BENGALI LETTER LA 09B6..09B9 ; ID_Continue # Lo [4] BENGALI LETTER SHA..BENGALI LETTER HA 09BC ; ID_Continue # Mn BENGALI SIGN NUKTA 09BD ; ID_Continue # Lo BENGALI SIGN AVAGRAHA 09BE..09C0 ; ID_Continue # Mc [3] BENGALI VOWEL SIGN AA..BENGALI VOWEL SIGN II 09C1..09C4 ; ID_Continue # Mn [4] BENGALI VOWEL SIGN U..BENGALI VOWEL SIGN VOCALIC RR 09C7..09C8 ; ID_Continue # Mc [2] BENGALI VOWEL SIGN E..BENGALI VOWEL SIGN AI 09CB..09CC ; ID_Continue # Mc [2] BENGALI VOWEL SIGN O..BENGALI 
VOWEL SIGN AU 09CD ; ID_Continue # Mn BENGALI SIGN VIRAMA 09CE ; ID_Continue # Lo BENGALI LETTER KHANDA TA 09D7 ; ID_Continue # Mc BENGALI AU LENGTH MARK 09DC..09DD ; ID_Continue # Lo [2] BENGALI LETTER RRA..BENGALI LETTER RHA 09DF..09E1 ; ID_Continue # Lo [3] BENGALI LETTER YYA..BENGALI LETTER VOCALIC LL 09E2..09E3 ; ID_Continue # Mn [2] BENGALI VOWEL SIGN VOCALIC L..BENGALI VOWEL SIGN VOCALIC LL 09E6..09EF ; ID_Continue # Nd [10] BENGALI DIGIT ZERO..BENGALI DIGIT NINE 09F0..09F1 ; ID_Continue # Lo [2] BENGALI LETTER RA WITH MIDDLE DIAGONAL..BENGALI LETTER RA WITH LOWER DIAGONAL 09FC ; ID_Continue # Lo BENGALI LETTER VEDIC ANUSVARA 09FE ; ID_Continue # Mn BENGALI SANDHI MARK 0A01..0A02 ; ID_Continue # Mn [2] GURMUKHI SIGN ADAK BINDI..GURMUKHI SIGN BINDI 0A03 ; ID_Continue # Mc GURMUKHI SIGN VISARGA 0A05..0A0A ; ID_Continue # Lo [6] GURMUKHI LETTER A..GURMUKHI LETTER UU 0A0F..0A10 ; ID_Continue # Lo [2] GURMUKHI LETTER EE..GURMUKHI LETTER AI 0A13..0A28 ; ID_Continue # Lo [22] GURMUKHI LETTER OO..GURMUKHI LETTER NA 0A2A..0A30 ; ID_Continue # Lo [7] GURMUKHI LETTER PA..GURMUKHI LETTER RA 0A32..0A33 ; ID_Continue # Lo [2] GURMUKHI LETTER LA..GURMUKHI LETTER LLA 0A35..0A36 ; ID_Continue # Lo [2] GURMUKHI LETTER VA..GURMUKHI LETTER SHA 0A38..0A39 ; ID_Continue # Lo [2] GURMUKHI LETTER SA..GURMUKHI LETTER HA 0A3C ; ID_Continue # Mn GURMUKHI SIGN NUKTA 0A3E..0A40 ; ID_Continue # Mc [3] GURMUKHI VOWEL SIGN AA..GURMUKHI VOWEL SIGN II 0A41..0A42 ; ID_Continue # Mn [2] GURMUKHI VOWEL SIGN U..GURMUKHI VOWEL SIGN UU 0A47..0A48 ; ID_Continue # Mn [2] GURMUKHI VOWEL SIGN EE..GURMUKHI VOWEL SIGN AI 0A4B..0A4D ; ID_Continue # Mn [3] GURMUKHI VOWEL SIGN OO..GURMUKHI SIGN VIRAMA 0A51 ; ID_Continue # Mn GURMUKHI SIGN UDAAT 0A59..0A5C ; ID_Continue # Lo [4] GURMUKHI LETTER KHHA..GURMUKHI LETTER RRA 0A5E ; ID_Continue # Lo GURMUKHI LETTER FA 0A66..0A6F ; ID_Continue # Nd [10] GURMUKHI DIGIT ZERO..GURMUKHI DIGIT NINE 0A70..0A71 ; ID_Continue # Mn [2] GURMUKHI TIPPI..GURMUKHI ADDAK 
0A72..0A74 ; ID_Continue # Lo [3] GURMUKHI IRI..GURMUKHI EK ONKAR 0A75 ; ID_Continue # Mn GURMUKHI SIGN YAKASH 0A81..0A82 ; ID_Continue # Mn [2] GUJARATI SIGN CANDRABINDU..GUJARATI SIGN ANUSVARA 0A83 ; ID_Continue # Mc GUJARATI SIGN VISARGA 0A85..0A8D ; ID_Continue # Lo [9] GUJARATI LETTER A..GUJARATI VOWEL CANDRA E 0A8F..0A91 ; ID_Continue # Lo [3] GUJARATI LETTER E..GUJARATI VOWEL CANDRA O 0A93..0AA8 ; ID_Continue # Lo [22] GUJARATI LETTER O..GUJARATI LETTER NA 0AAA..0AB0 ; ID_Continue # Lo [7] GUJARATI LETTER PA..GUJARATI LETTER RA 0AB2..0AB3 ; ID_Continue # Lo [2] GUJARATI LETTER LA..GUJARATI LETTER LLA 0AB5..0AB9 ; ID_Continue # Lo [5] GUJARATI LETTER VA..GUJARATI LETTER HA 0ABC ; ID_Continue # Mn GUJARATI SIGN NUKTA 0ABD ; ID_Continue # Lo GUJARATI SIGN AVAGRAHA 0ABE..0AC0 ; ID_Continue # Mc [3] GUJARATI VOWEL SIGN AA..GUJARATI VOWEL SIGN II 0AC1..0AC5 ; ID_Continue # Mn [5] GUJARATI VOWEL SIGN U..GUJARATI VOWEL SIGN CANDRA E 0AC7..0AC8 ; ID_Continue # Mn [2] GUJARATI VOWEL SIGN E..GUJARATI VOWEL SIGN AI 0AC9 ; ID_Continue # Mc GUJARATI VOWEL SIGN CANDRA O 0ACB..0ACC ; ID_Continue # Mc [2] GUJARATI VOWEL SIGN O..GUJARATI VOWEL SIGN AU 0ACD ; ID_Continue # Mn GUJARATI SIGN VIRAMA 0AD0 ; ID_Continue # Lo GUJARATI OM 0AE0..0AE1 ; ID_Continue # Lo [2] GUJARATI LETTER VOCALIC RR..GUJARATI LETTER VOCALIC LL 0AE2..0AE3 ; ID_Continue # Mn [2] GUJARATI VOWEL SIGN VOCALIC L..GUJARATI VOWEL SIGN VOCALIC LL 0AE6..0AEF ; ID_Continue # Nd [10] GUJARATI DIGIT ZERO..GUJARATI DIGIT NINE 0AF9 ; ID_Continue # Lo GUJARATI LETTER ZHA 0AFA..0AFF ; ID_Continue # Mn [6] GUJARATI SIGN SUKUN..GUJARATI SIGN TWO-CIRCLE NUKTA ABOVE 0B01 ; ID_Continue # Mn ORIYA SIGN CANDRABINDU 0B02..0B03 ; ID_Continue # Mc [2] ORIYA SIGN ANUSVARA..ORIYA SIGN VISARGA 0B05..0B0C ; ID_Continue # Lo [8] ORIYA LETTER A..ORIYA LETTER VOCALIC L 0B0F..0B10 ; ID_Continue # Lo [2] ORIYA LETTER E..ORIYA LETTER AI 0B13..0B28 ; ID_Continue # Lo [22] ORIYA LETTER O..ORIYA LETTER NA 0B2A..0B30 ; ID_Continue # Lo [7] 
ORIYA LETTER PA..ORIYA LETTER RA 0B32..0B33 ; ID_Continue # Lo [2] ORIYA LETTER LA..ORIYA LETTER LLA 0B35..0B39 ; ID_Continue # Lo [5] ORIYA LETTER VA..ORIYA LETTER HA 0B3C ; ID_Continue # Mn ORIYA SIGN NUKTA 0B3D ; ID_Continue # Lo ORIYA SIGN AVAGRAHA 0B3E ; ID_Continue # Mc ORIYA VOWEL SIGN AA 0B3F ; ID_Continue # Mn ORIYA VOWEL SIGN I 0B40 ; ID_Continue # Mc ORIYA VOWEL SIGN II 0B41..0B44 ; ID_Continue # Mn [4] ORIYA VOWEL SIGN U..ORIYA VOWEL SIGN VOCALIC RR 0B47..0B48 ; ID_Continue # Mc [2] ORIYA VOWEL SIGN E..ORIYA VOWEL SIGN AI 0B4B..0B4C ; ID_Continue # Mc [2] ORIYA VOWEL SIGN O..ORIYA VOWEL SIGN AU 0B4D ; ID_Continue # Mn ORIYA SIGN VIRAMA 0B55..0B56 ; ID_Continue # Mn [2] ORIYA SIGN OVERLINE..ORIYA AI LENGTH MARK 0B57 ; ID_Continue # Mc ORIYA AU LENGTH MARK 0B5C..0B5D ; ID_Continue # Lo [2] ORIYA LETTER RRA..ORIYA LETTER RHA 0B5F..0B61 ; ID_Continue # Lo [3] ORIYA LETTER YYA..ORIYA LETTER VOCALIC LL 0B62..0B63 ; ID_Continue # Mn [2] ORIYA VOWEL SIGN VOCALIC L..ORIYA VOWEL SIGN VOCALIC LL 0B66..0B6F ; ID_Continue # Nd [10] ORIYA DIGIT ZERO..ORIYA DIGIT NINE 0B71 ; ID_Continue # Lo ORIYA LETTER WA 0B82 ; ID_Continue # Mn TAMIL SIGN ANUSVARA 0B83 ; ID_Continue # Lo TAMIL SIGN VISARGA 0B85..0B8A ; ID_Continue # Lo [6] TAMIL LETTER A..TAMIL LETTER UU 0B8E..0B90 ; ID_Continue # Lo [3] TAMIL LETTER E..TAMIL LETTER AI 0B92..0B95 ; ID_Continue # Lo [4] TAMIL LETTER O..TAMIL LETTER KA 0B99..0B9A ; ID_Continue # Lo [2] TAMIL LETTER NGA..TAMIL LETTER CA 0B9C ; ID_Continue # Lo TAMIL LETTER JA 0B9E..0B9F ; ID_Continue # Lo [2] TAMIL LETTER NYA..TAMIL LETTER TTA 0BA3..0BA4 ; ID_Continue # Lo [2] TAMIL LETTER NNA..TAMIL LETTER TA 0BA8..0BAA ; ID_Continue # Lo [3] TAMIL LETTER NA..TAMIL LETTER PA 0BAE..0BB9 ; ID_Continue # Lo [12] TAMIL LETTER MA..TAMIL LETTER HA 0BBE..0BBF ; ID_Continue # Mc [2] TAMIL VOWEL SIGN AA..TAMIL VOWEL SIGN I 0BC0 ; ID_Continue # Mn TAMIL VOWEL SIGN II 0BC1..0BC2 ; ID_Continue # Mc [2] TAMIL VOWEL SIGN U..TAMIL VOWEL SIGN UU 0BC6..0BC8 ; 
ID_Continue # Mc [3] TAMIL VOWEL SIGN E..TAMIL VOWEL SIGN AI 0BCA..0BCC ; ID_Continue # Mc [3] TAMIL VOWEL SIGN O..TAMIL VOWEL SIGN AU 0BCD ; ID_Continue # Mn TAMIL SIGN VIRAMA 0BD0 ; ID_Continue # Lo TAMIL OM 0BD7 ; ID_Continue # Mc TAMIL AU LENGTH MARK 0BE6..0BEF ; ID_Continue # Nd [10] TAMIL DIGIT ZERO..TAMIL DIGIT NINE 0C00 ; ID_Continue # Mn TELUGU SIGN COMBINING CANDRABINDU ABOVE 0C01..0C03 ; ID_Continue # Mc [3] TELUGU SIGN CANDRABINDU..TELUGU SIGN VISARGA 0C04 ; ID_Continue # Mn TELUGU SIGN COMBINING ANUSVARA ABOVE 0C05..0C0C ; ID_Continue # Lo [8] TELUGU LETTER A..TELUGU LETTER VOCALIC L 0C0E..0C10 ; ID_Continue # Lo [3] TELUGU LETTER E..TELUGU LETTER AI 0C12..0C28 ; ID_Continue # Lo [23] TELUGU LETTER O..TELUGU LETTER NA 0C2A..0C39 ; ID_Continue # Lo [16] TELUGU LETTER PA..TELUGU LETTER HA 0C3C ; ID_Continue # Mn TELUGU SIGN NUKTA 0C3D ; ID_Continue # Lo TELUGU SIGN AVAGRAHA 0C3E..0C40 ; ID_Continue # Mn [3] TELUGU VOWEL SIGN AA..TELUGU VOWEL SIGN II 0C41..0C44 ; ID_Continue # Mc [4] TELUGU VOWEL SIGN U..TELUGU VOWEL SIGN VOCALIC RR 0C46..0C48 ; ID_Continue # Mn [3] TELUGU VOWEL SIGN E..TELUGU VOWEL SIGN AI 0C4A..0C4D ; ID_Continue # Mn [4] TELUGU VOWEL SIGN O..TELUGU SIGN VIRAMA 0C55..0C56 ; ID_Continue # Mn [2] TELUGU LENGTH MARK..TELUGU AI LENGTH MARK 0C58..0C5A ; ID_Continue # Lo [3] TELUGU LETTER TSA..TELUGU LETTER RRRA 0C5C..0C5D ; ID_Continue # Lo [2] TELUGU ARCHAIC SHRII..TELUGU LETTER NAKAARA POLLU 0C60..0C61 ; ID_Continue # Lo [2] TELUGU LETTER VOCALIC RR..TELUGU LETTER VOCALIC LL 0C62..0C63 ; ID_Continue # Mn [2] TELUGU VOWEL SIGN VOCALIC L..TELUGU VOWEL SIGN VOCALIC LL 0C66..0C6F ; ID_Continue # Nd [10] TELUGU DIGIT ZERO..TELUGU DIGIT NINE 0C80 ; ID_Continue # Lo KANNADA SIGN SPACING CANDRABINDU 0C81 ; ID_Continue # Mn KANNADA SIGN CANDRABINDU 0C82..0C83 ; ID_Continue # Mc [2] KANNADA SIGN ANUSVARA..KANNADA SIGN VISARGA 0C85..0C8C ; ID_Continue # Lo [8] KANNADA LETTER A..KANNADA LETTER VOCALIC L 0C8E..0C90 ; ID_Continue # Lo [3] KANNADA LETTER 
E..KANNADA LETTER AI 0C92..0CA8 ; ID_Continue # Lo [23] KANNADA LETTER O..KANNADA LETTER NA 0CAA..0CB3 ; ID_Continue # Lo [10] KANNADA LETTER PA..KANNADA LETTER LLA 0CB5..0CB9 ; ID_Continue # Lo [5] KANNADA LETTER VA..KANNADA LETTER HA 0CBC ; ID_Continue # Mn KANNADA SIGN NUKTA 0CBD ; ID_Continue # Lo KANNADA SIGN AVAGRAHA 0CBE ; ID_Continue # Mc KANNADA VOWEL SIGN AA 0CBF ; ID_Continue # Mn KANNADA VOWEL SIGN I 0CC0..0CC4 ; ID_Continue # Mc [5] KANNADA VOWEL SIGN II..KANNADA VOWEL SIGN VOCALIC RR 0CC6 ; ID_Continue # Mn KANNADA VOWEL SIGN E 0CC7..0CC8 ; ID_Continue # Mc [2] KANNADA VOWEL SIGN EE..KANNADA VOWEL SIGN AI 0CCA..0CCB ; ID_Continue # Mc [2] KANNADA VOWEL SIGN O..KANNADA VOWEL SIGN OO 0CCC..0CCD ; ID_Continue # Mn [2] KANNADA VOWEL SIGN AU..KANNADA SIGN VIRAMA 0CD5..0CD6 ; ID_Continue # Mc [2] KANNADA LENGTH MARK..KANNADA AI LENGTH MARK 0CDC..0CDE ; ID_Continue # Lo [3] KANNADA ARCHAIC SHRII..KANNADA LETTER FA 0CE0..0CE1 ; ID_Continue # Lo [2] KANNADA LETTER VOCALIC RR..KANNADA LETTER VOCALIC LL 0CE2..0CE3 ; ID_Continue # Mn [2] KANNADA VOWEL SIGN VOCALIC L..KANNADA VOWEL SIGN VOCALIC LL 0CE6..0CEF ; ID_Continue # Nd [10] KANNADA DIGIT ZERO..KANNADA DIGIT NINE 0CF1..0CF2 ; ID_Continue # Lo [2] KANNADA SIGN JIHVAMULIYA..KANNADA SIGN UPADHMANIYA 0CF3 ; ID_Continue # Mc KANNADA SIGN COMBINING ANUSVARA ABOVE RIGHT 0D00..0D01 ; ID_Continue # Mn [2] MALAYALAM SIGN COMBINING ANUSVARA ABOVE..MALAYALAM SIGN CANDRABINDU 0D02..0D03 ; ID_Continue # Mc [2] MALAYALAM SIGN ANUSVARA..MALAYALAM SIGN VISARGA 0D04..0D0C ; ID_Continue # Lo [9] MALAYALAM LETTER VEDIC ANUSVARA..MALAYALAM LETTER VOCALIC L 0D0E..0D10 ; ID_Continue # Lo [3] MALAYALAM LETTER E..MALAYALAM LETTER AI 0D12..0D3A ; ID_Continue # Lo [41] MALAYALAM LETTER O..MALAYALAM LETTER TTTA 0D3B..0D3C ; ID_Continue # Mn [2] MALAYALAM SIGN VERTICAL BAR VIRAMA..MALAYALAM SIGN CIRCULAR VIRAMA 0D3D ; ID_Continue # Lo MALAYALAM SIGN AVAGRAHA 0D3E..0D40 ; ID_Continue # Mc [3] MALAYALAM VOWEL SIGN AA..MALAYALAM VOWEL 
SIGN II 0D41..0D44 ; ID_Continue # Mn [4] MALAYALAM VOWEL SIGN U..MALAYALAM VOWEL SIGN VOCALIC RR 0D46..0D48 ; ID_Continue # Mc [3] MALAYALAM VOWEL SIGN E..MALAYALAM VOWEL SIGN AI 0D4A..0D4C ; ID_Continue # Mc [3] MALAYALAM VOWEL SIGN O..MALAYALAM VOWEL SIGN AU 0D4D ; ID_Continue # Mn MALAYALAM SIGN VIRAMA 0D4E ; ID_Continue # Lo MALAYALAM LETTER DOT REPH 0D54..0D56 ; ID_Continue # Lo [3] MALAYALAM LETTER CHILLU M..MALAYALAM LETTER CHILLU LLL 0D57 ; ID_Continue # Mc MALAYALAM AU LENGTH MARK 0D5F..0D61 ; ID_Continue # Lo [3] MALAYALAM LETTER ARCHAIC II..MALAYALAM LETTER VOCALIC LL 0D62..0D63 ; ID_Continue # Mn [2] MALAYALAM VOWEL SIGN VOCALIC L..MALAYALAM VOWEL SIGN VOCALIC LL 0D66..0D6F ; ID_Continue # Nd [10] MALAYALAM DIGIT ZERO..MALAYALAM DIGIT NINE 0D7A..0D7F ; ID_Continue # Lo [6] MALAYALAM LETTER CHILLU NN..MALAYALAM LETTER CHILLU K 0D81 ; ID_Continue # Mn SINHALA SIGN CANDRABINDU 0D82..0D83 ; ID_Continue # Mc [2] SINHALA SIGN ANUSVARAYA..SINHALA SIGN VISARGAYA 0D85..0D96 ; ID_Continue # Lo [18] SINHALA LETTER AYANNA..SINHALA LETTER AUYANNA 0D9A..0DB1 ; ID_Continue # Lo [24] SINHALA LETTER ALPAPRAANA KAYANNA..SINHALA LETTER DANTAJA NAYANNA 0DB3..0DBB ; ID_Continue # Lo [9] SINHALA LETTER SANYAKA DAYANNA..SINHALA LETTER RAYANNA 0DBD ; ID_Continue # Lo SINHALA LETTER DANTAJA LAYANNA 0DC0..0DC6 ; ID_Continue # Lo [7] SINHALA LETTER VAYANNA..SINHALA LETTER FAYANNA 0DCA ; ID_Continue # Mn SINHALA SIGN AL-LAKUNA 0DCF..0DD1 ; ID_Continue # Mc [3] SINHALA VOWEL SIGN AELA-PILLA..SINHALA VOWEL SIGN DIGA AEDA-PILLA 0DD2..0DD4 ; ID_Continue # Mn [3] SINHALA VOWEL SIGN KETTI IS-PILLA..SINHALA VOWEL SIGN KETTI PAA-PILLA 0DD6 ; ID_Continue # Mn SINHALA VOWEL SIGN DIGA PAA-PILLA 0DD8..0DDF ; ID_Continue # Mc [8] SINHALA VOWEL SIGN GAETTA-PILLA..SINHALA VOWEL SIGN GAYANUKITTA 0DE6..0DEF ; ID_Continue # Nd [10] SINHALA LITH DIGIT ZERO..SINHALA LITH DIGIT NINE 0DF2..0DF3 ; ID_Continue # Mc [2] SINHALA VOWEL SIGN DIGA GAETTA-PILLA..SINHALA VOWEL SIGN DIGA GAYANUKITTA 0E01..0E30 
; ID_Continue # Lo [48] THAI CHARACTER KO KAI..THAI CHARACTER SARA A 0E31 ; ID_Continue # Mn THAI CHARACTER MAI HAN-AKAT 0E32..0E33 ; ID_Continue # Lo [2] THAI CHARACTER SARA AA..THAI CHARACTER SARA AM 0E34..0E3A ; ID_Continue # Mn [7] THAI CHARACTER SARA I..THAI CHARACTER PHINTHU 0E40..0E45 ; ID_Continue # Lo [6] THAI CHARACTER SARA E..THAI CHARACTER LAKKHANGYAO 0E46 ; ID_Continue # Lm THAI CHARACTER MAIYAMOK 0E47..0E4E ; ID_Continue # Mn [8] THAI CHARACTER MAITAIKHU..THAI CHARACTER YAMAKKAN 0E50..0E59 ; ID_Continue # Nd [10] THAI DIGIT ZERO..THAI DIGIT NINE 0E81..0E82 ; ID_Continue # Lo [2] LAO LETTER KO..LAO LETTER KHO SUNG 0E84 ; ID_Continue # Lo LAO LETTER KHO TAM 0E86..0E8A ; ID_Continue # Lo [5] LAO LETTER PALI GHA..LAO LETTER SO TAM 0E8C..0EA3 ; ID_Continue # Lo [24] LAO LETTER PALI JHA..LAO LETTER LO LING 0EA5 ; ID_Continue # Lo LAO LETTER LO LOOT 0EA7..0EB0 ; ID_Continue # Lo [10] LAO LETTER WO..LAO VOWEL SIGN A 0EB1 ; ID_Continue # Mn LAO VOWEL SIGN MAI KAN 0EB2..0EB3 ; ID_Continue # Lo [2] LAO VOWEL SIGN AA..LAO VOWEL SIGN AM 0EB4..0EBC ; ID_Continue # Mn [9] LAO VOWEL SIGN I..LAO SEMIVOWEL SIGN LO 0EBD ; ID_Continue # Lo LAO SEMIVOWEL SIGN NYO 0EC0..0EC4 ; ID_Continue # Lo [5] LAO VOWEL SIGN E..LAO VOWEL SIGN AI 0EC6 ; ID_Continue # Lm LAO KO LA 0EC8..0ECE ; ID_Continue # Mn [7] LAO TONE MAI EK..LAO YAMAKKAN 0ED0..0ED9 ; ID_Continue # Nd [10] LAO DIGIT ZERO..LAO DIGIT NINE 0EDC..0EDF ; ID_Continue # Lo [4] LAO HO NO..LAO LETTER KHMU NYO 0F00 ; ID_Continue # Lo TIBETAN SYLLABLE OM 0F18..0F19 ; ID_Continue # Mn [2] TIBETAN ASTROLOGICAL SIGN -KHYUD PA..TIBETAN ASTROLOGICAL SIGN SDONG TSHUGS 0F20..0F29 ; ID_Continue # Nd [10] TIBETAN DIGIT ZERO..TIBETAN DIGIT NINE 0F35 ; ID_Continue # Mn TIBETAN MARK NGAS BZUNG NYI ZLA 0F37 ; ID_Continue # Mn TIBETAN MARK NGAS BZUNG SGOR RTAGS 0F39 ; ID_Continue # Mn TIBETAN MARK TSA -PHRU 0F3E..0F3F ; ID_Continue # Mc [2] TIBETAN SIGN YAR TSHES..TIBETAN SIGN MAR TSHES 0F40..0F47 ; ID_Continue # Lo [8] TIBETAN LETTER 
KA..TIBETAN LETTER JA 0F49..0F6C ; ID_Continue # Lo [36] TIBETAN LETTER NYA..TIBETAN LETTER RRA 0F71..0F7E ; ID_Continue # Mn [14] TIBETAN VOWEL SIGN AA..TIBETAN SIGN RJES SU NGA RO 0F7F ; ID_Continue # Mc TIBETAN SIGN RNAM BCAD 0F80..0F84 ; ID_Continue # Mn [5] TIBETAN VOWEL SIGN REVERSED I..TIBETAN MARK HALANTA 0F86..0F87 ; ID_Continue # Mn [2] TIBETAN SIGN LCI RTAGS..TIBETAN SIGN YANG RTAGS 0F88..0F8C ; ID_Continue # Lo [5] TIBETAN SIGN LCE TSA CAN..TIBETAN SIGN INVERTED MCHU CAN 0F8D..0F97 ; ID_Continue # Mn [11] TIBETAN SUBJOINED SIGN LCE TSA CAN..TIBETAN SUBJOINED LETTER JA 0F99..0FBC ; ID_Continue # Mn [36] TIBETAN SUBJOINED LETTER NYA..TIBETAN SUBJOINED LETTER FIXED-FORM RA 0FC6 ; ID_Continue # Mn TIBETAN SYMBOL PADMA GDAN 1000..102A ; ID_Continue # Lo [43] MYANMAR LETTER KA..MYANMAR LETTER AU 102B..102C ; ID_Continue # Mc [2] MYANMAR VOWEL SIGN TALL AA..MYANMAR VOWEL SIGN AA 102D..1030 ; ID_Continue # Mn [4] MYANMAR VOWEL SIGN I..MYANMAR VOWEL SIGN UU 1031 ; ID_Continue # Mc MYANMAR VOWEL SIGN E 1032..1037 ; ID_Continue # Mn [6] MYANMAR VOWEL SIGN AI..MYANMAR SIGN DOT BELOW 1038 ; ID_Continue # Mc MYANMAR SIGN VISARGA 1039..103A ; ID_Continue # Mn [2] MYANMAR SIGN VIRAMA..MYANMAR SIGN ASAT 103B..103C ; ID_Continue # Mc [2] MYANMAR CONSONANT SIGN MEDIAL YA..MYANMAR CONSONANT SIGN MEDIAL RA 103D..103E ; ID_Continue # Mn [2] MYANMAR CONSONANT SIGN MEDIAL WA..MYANMAR CONSONANT SIGN MEDIAL HA 103F ; ID_Continue # Lo MYANMAR LETTER GREAT SA 1040..1049 ; ID_Continue # Nd [10] MYANMAR DIGIT ZERO..MYANMAR DIGIT NINE 1050..1055 ; ID_Continue # Lo [6] MYANMAR LETTER SHA..MYANMAR LETTER VOCALIC LL 1056..1057 ; ID_Continue # Mc [2] MYANMAR VOWEL SIGN VOCALIC R..MYANMAR VOWEL SIGN VOCALIC RR 1058..1059 ; ID_Continue # Mn [2] MYANMAR VOWEL SIGN VOCALIC L..MYANMAR VOWEL SIGN VOCALIC LL 105A..105D ; ID_Continue # Lo [4] MYANMAR LETTER MON NGA..MYANMAR LETTER MON BBE 105E..1060 ; ID_Continue # Mn [3] MYANMAR CONSONANT SIGN MON MEDIAL NA..MYANMAR CONSONANT SIGN MON MEDIAL LA 
1061 ; ID_Continue # Lo MYANMAR LETTER SGAW KAREN SHA 1062..1064 ; ID_Continue # Mc [3] MYANMAR VOWEL SIGN SGAW KAREN EU..MYANMAR TONE MARK SGAW KAREN KE PHO 1065..1066 ; ID_Continue # Lo [2] MYANMAR LETTER WESTERN PWO KAREN THA..MYANMAR LETTER WESTERN PWO KAREN PWA 1067..106D ; ID_Continue # Mc [7] MYANMAR VOWEL SIGN WESTERN PWO KAREN EU..MYANMAR SIGN WESTERN PWO KAREN TONE-5 106E..1070 ; ID_Continue # Lo [3] MYANMAR LETTER EASTERN PWO KAREN NNA..MYANMAR LETTER EASTERN PWO KAREN GHWA 1071..1074 ; ID_Continue # Mn [4] MYANMAR VOWEL SIGN GEBA KAREN I..MYANMAR VOWEL SIGN KAYAH EE 1075..1081 ; ID_Continue # Lo [13] MYANMAR LETTER SHAN KA..MYANMAR LETTER SHAN HA 1082 ; ID_Continue # Mn MYANMAR CONSONANT SIGN SHAN MEDIAL WA 1083..1084 ; ID_Continue # Mc [2] MYANMAR VOWEL SIGN SHAN AA..MYANMAR VOWEL SIGN SHAN E 1085..1086 ; ID_Continue # Mn [2] MYANMAR VOWEL SIGN SHAN E ABOVE..MYANMAR VOWEL SIGN SHAN FINAL Y 1087..108C ; ID_Continue # Mc [6] MYANMAR SIGN SHAN TONE-2..MYANMAR SIGN SHAN COUNCIL TONE-3 108D ; ID_Continue # Mn MYANMAR SIGN SHAN COUNCIL EMPHATIC TONE 108E ; ID_Continue # Lo MYANMAR LETTER RUMAI PALAUNG FA 108F ; ID_Continue # Mc MYANMAR SIGN RUMAI PALAUNG TONE-5 1090..1099 ; ID_Continue # Nd [10] MYANMAR SHAN DIGIT ZERO..MYANMAR SHAN DIGIT NINE 109A..109C ; ID_Continue # Mc [3] MYANMAR SIGN KHAMTI TONE-1..MYANMAR VOWEL SIGN AITON A 109D ; ID_Continue # Mn MYANMAR VOWEL SIGN AITON AI 10A0..10C5 ; ID_Continue # L& [38] GEORGIAN CAPITAL LETTER AN..GEORGIAN CAPITAL LETTER HOE 10C7 ; ID_Continue # L& GEORGIAN CAPITAL LETTER YN 10CD ; ID_Continue # L& GEORGIAN CAPITAL LETTER AEN 10D0..10FA ; ID_Continue # L& [43] GEORGIAN LETTER AN..GEORGIAN LETTER AIN 10FC ; ID_Continue # Lm MODIFIER LETTER GEORGIAN NAR 10FD..10FF ; ID_Continue # L& [3] GEORGIAN LETTER AEN..GEORGIAN LETTER LABIAL SIGN 1100..1248 ; ID_Continue # Lo [329] HANGUL CHOSEONG KIYEOK..ETHIOPIC SYLLABLE QWA 124A..124D ; ID_Continue # Lo [4] ETHIOPIC SYLLABLE QWI..ETHIOPIC SYLLABLE QWE 1250..1256 ; 
ID_Continue # Lo [7] ETHIOPIC SYLLABLE QHA..ETHIOPIC SYLLABLE QHO 1258 ; ID_Continue # Lo ETHIOPIC SYLLABLE QHWA 125A..125D ; ID_Continue # Lo [4] ETHIOPIC SYLLABLE QHWI..ETHIOPIC SYLLABLE QHWE 1260..1288 ; ID_Continue # Lo [41] ETHIOPIC SYLLABLE BA..ETHIOPIC SYLLABLE XWA 128A..128D ; ID_Continue # Lo [4] ETHIOPIC SYLLABLE XWI..ETHIOPIC SYLLABLE XWE 1290..12B0 ; ID_Continue # Lo [33] ETHIOPIC SYLLABLE NA..ETHIOPIC SYLLABLE KWA 12B2..12B5 ; ID_Continue # Lo [4] ETHIOPIC SYLLABLE KWI..ETHIOPIC SYLLABLE KWE 12B8..12BE ; ID_Continue # Lo [7] ETHIOPIC SYLLABLE KXA..ETHIOPIC SYLLABLE KXO 12C0 ; ID_Continue # Lo ETHIOPIC SYLLABLE KXWA 12C2..12C5 ; ID_Continue # Lo [4] ETHIOPIC SYLLABLE KXWI..ETHIOPIC SYLLABLE KXWE 12C8..12D6 ; ID_Continue # Lo [15] ETHIOPIC SYLLABLE WA..ETHIOPIC SYLLABLE PHARYNGEAL O 12D8..1310 ; ID_Continue # Lo [57] ETHIOPIC SYLLABLE ZA..ETHIOPIC SYLLABLE GWA 1312..1315 ; ID_Continue # Lo [4] ETHIOPIC SYLLABLE GWI..ETHIOPIC SYLLABLE GWE 1318..135A ; ID_Continue # Lo [67] ETHIOPIC SYLLABLE GGA..ETHIOPIC SYLLABLE FYA 135D..135F ; ID_Continue # Mn [3] ETHIOPIC COMBINING GEMINATION AND VOWEL LENGTH MARK..ETHIOPIC COMBINING GEMINATION MARK 1369..1371 ; ID_Continue # No [9] ETHIOPIC DIGIT ONE..ETHIOPIC DIGIT NINE 1380..138F ; ID_Continue # Lo [16] ETHIOPIC SYLLABLE SEBATBEIT MWA..ETHIOPIC SYLLABLE PWE 13A0..13F5 ; ID_Continue # L& [86] CHEROKEE LETTER A..CHEROKEE LETTER MV 13F8..13FD ; ID_Continue # L& [6] CHEROKEE SMALL LETTER YE..CHEROKEE SMALL LETTER MV 1401..166C ; ID_Continue # Lo [620] CANADIAN SYLLABICS E..CANADIAN SYLLABICS CARRIER TTSA 166F..167F ; ID_Continue # Lo [17] CANADIAN SYLLABICS QAI..CANADIAN SYLLABICS BLACKFOOT W 1681..169A ; ID_Continue # Lo [26] OGHAM LETTER BEITH..OGHAM LETTER PEITH 16A0..16EA ; ID_Continue # Lo [75] RUNIC LETTER FEHU FEOH FE F..RUNIC LETTER X 16EE..16F0 ; ID_Continue # Nl [3] RUNIC ARLAUG SYMBOL..RUNIC BELGTHOR SYMBOL 16F1..16F8 ; ID_Continue # Lo [8] RUNIC LETTER K..RUNIC LETTER FRANKS CASKET AESC 1700..1711 ; 
ID_Continue # Lo [18] TAGALOG LETTER A..TAGALOG LETTER HA 1712..1714 ; ID_Continue # Mn [3] TAGALOG VOWEL SIGN I..TAGALOG SIGN VIRAMA 1715 ; ID_Continue # Mc TAGALOG SIGN PAMUDPOD 171F..1731 ; ID_Continue # Lo [19] TAGALOG LETTER ARCHAIC RA..HANUNOO LETTER HA 1732..1733 ; ID_Continue # Mn [2] HANUNOO VOWEL SIGN I..HANUNOO VOWEL SIGN U 1734 ; ID_Continue # Mc HANUNOO SIGN PAMUDPOD 1740..1751 ; ID_Continue # Lo [18] BUHID LETTER A..BUHID LETTER HA 1752..1753 ; ID_Continue # Mn [2] BUHID VOWEL SIGN I..BUHID VOWEL SIGN U 1760..176C ; ID_Continue # Lo [13] TAGBANWA LETTER A..TAGBANWA LETTER YA 176E..1770 ; ID_Continue # Lo [3] TAGBANWA LETTER LA..TAGBANWA LETTER SA 1772..1773 ; ID_Continue # Mn [2] TAGBANWA VOWEL SIGN I..TAGBANWA VOWEL SIGN U 1780..17B3 ; ID_Continue # Lo [52] KHMER LETTER KA..KHMER INDEPENDENT VOWEL QAU 17B4..17B5 ; ID_Continue # Mn [2] KHMER VOWEL INHERENT AQ..KHMER VOWEL INHERENT AA 17B6 ; ID_Continue # Mc KHMER VOWEL SIGN AA 17B7..17BD ; ID_Continue # Mn [7] KHMER VOWEL SIGN I..KHMER VOWEL SIGN UA 17BE..17C5 ; ID_Continue # Mc [8] KHMER VOWEL SIGN OE..KHMER VOWEL SIGN AU 17C6 ; ID_Continue # Mn KHMER SIGN NIKAHIT 17C7..17C8 ; ID_Continue # Mc [2] KHMER SIGN REAHMUK..KHMER SIGN YUUKALEAPINTU 17C9..17D3 ; ID_Continue # Mn [11] KHMER SIGN MUUSIKATOAN..KHMER SIGN BATHAMASAT 17D7 ; ID_Continue # Lm KHMER SIGN LEK TOO 17DC ; ID_Continue # Lo KHMER SIGN AVAKRAHASANYA 17DD ; ID_Continue # Mn KHMER SIGN ATTHACAN 17E0..17E9 ; ID_Continue # Nd [10] KHMER DIGIT ZERO..KHMER DIGIT NINE 180B..180D ; ID_Continue # Mn [3] MONGOLIAN FREE VARIATION SELECTOR ONE..MONGOLIAN FREE VARIATION SELECTOR THREE 180F ; ID_Continue # Mn MONGOLIAN FREE VARIATION SELECTOR FOUR 1810..1819 ; ID_Continue # Nd [10] MONGOLIAN DIGIT ZERO..MONGOLIAN DIGIT NINE 1820..1842 ; ID_Continue # Lo [35] MONGOLIAN LETTER A..MONGOLIAN LETTER CHI 1843 ; ID_Continue # Lm MONGOLIAN LETTER TODO LONG VOWEL SIGN 1844..1878 ; ID_Continue # Lo [53] MONGOLIAN LETTER TODO E..MONGOLIAN LETTER CHA WITH TWO DOTS 
1880..1884 ; ID_Continue # Lo [5] MONGOLIAN LETTER ALI GALI ANUSVARA ONE..MONGOLIAN LETTER ALI GALI INVERTED UBADAMA 1885..1886 ; ID_Continue # Mn [2] MONGOLIAN LETTER ALI GALI BALUDA..MONGOLIAN LETTER ALI GALI THREE BALUDA 1887..18A8 ; ID_Continue # Lo [34] MONGOLIAN LETTER ALI GALI A..MONGOLIAN LETTER MANCHU ALI GALI BHA 18A9 ; ID_Continue # Mn MONGOLIAN LETTER ALI GALI DAGALGA 18AA ; ID_Continue # Lo MONGOLIAN LETTER MANCHU ALI GALI LHA 18B0..18F5 ; ID_Continue # Lo [70] CANADIAN SYLLABICS OY..CANADIAN SYLLABICS CARRIER DENTAL S 1900..191E ; ID_Continue # Lo [31] LIMBU VOWEL-CARRIER LETTER..LIMBU LETTER TRA 1920..1922 ; ID_Continue # Mn [3] LIMBU VOWEL SIGN A..LIMBU VOWEL SIGN U 1923..1926 ; ID_Continue # Mc [4] LIMBU VOWEL SIGN EE..LIMBU VOWEL SIGN AU 1927..1928 ; ID_Continue # Mn [2] LIMBU VOWEL SIGN E..LIMBU VOWEL SIGN O 1929..192B ; ID_Continue # Mc [3] LIMBU SUBJOINED LETTER YA..LIMBU SUBJOINED LETTER WA 1930..1931 ; ID_Continue # Mc [2] LIMBU SMALL LETTER KA..LIMBU SMALL LETTER NGA 1932 ; ID_Continue # Mn LIMBU SMALL LETTER ANUSVARA 1933..1938 ; ID_Continue # Mc [6] LIMBU SMALL LETTER TA..LIMBU SMALL LETTER LA 1939..193B ; ID_Continue # Mn [3] LIMBU SIGN MUKPHRENG..LIMBU SIGN SA-I 1946..194F ; ID_Continue # Nd [10] LIMBU DIGIT ZERO..LIMBU DIGIT NINE 1950..196D ; ID_Continue # Lo [30] TAI LE LETTER KA..TAI LE LETTER AI 1970..1974 ; ID_Continue # Lo [5] TAI LE LETTER TONE-2..TAI LE LETTER TONE-6 1980..19AB ; ID_Continue # Lo [44] NEW TAI LUE LETTER HIGH QA..NEW TAI LUE LETTER LOW SUA 19B0..19C9 ; ID_Continue # Lo [26] NEW TAI LUE VOWEL SIGN VOWEL SHORTENER..NEW TAI LUE TONE MARK-2 19D0..19D9 ; ID_Continue # Nd [10] NEW TAI LUE DIGIT ZERO..NEW TAI LUE DIGIT NINE 19DA ; ID_Continue # No NEW TAI LUE THAM DIGIT ONE 1A00..1A16 ; ID_Continue # Lo [23] BUGINESE LETTER KA..BUGINESE LETTER HA 1A17..1A18 ; ID_Continue # Mn [2] BUGINESE VOWEL SIGN I..BUGINESE VOWEL SIGN U 1A19..1A1A ; ID_Continue # Mc [2] BUGINESE VOWEL SIGN E..BUGINESE VOWEL SIGN O 1A1B ; ID_Continue 
# Mn BUGINESE VOWEL SIGN AE 1A20..1A54 ; ID_Continue # Lo [53] TAI THAM LETTER HIGH KA..TAI THAM LETTER GREAT SA 1A55 ; ID_Continue # Mc TAI THAM CONSONANT SIGN MEDIAL RA 1A56 ; ID_Continue # Mn TAI THAM CONSONANT SIGN MEDIAL LA 1A57 ; ID_Continue # Mc TAI THAM CONSONANT SIGN LA TANG LAI 1A58..1A5E ; ID_Continue # Mn [7] TAI THAM SIGN MAI KANG LAI..TAI THAM CONSONANT SIGN SA 1A60 ; ID_Continue # Mn TAI THAM SIGN SAKOT 1A61 ; ID_Continue # Mc TAI THAM VOWEL SIGN A 1A62 ; ID_Continue # Mn TAI THAM VOWEL SIGN MAI SAT 1A63..1A64 ; ID_Continue # Mc [2] TAI THAM VOWEL SIGN AA..TAI THAM VOWEL SIGN TALL AA 1A65..1A6C ; ID_Continue # Mn [8] TAI THAM VOWEL SIGN I..TAI THAM VOWEL SIGN OA BELOW 1A6D..1A72 ; ID_Continue # Mc [6] TAI THAM VOWEL SIGN OY..TAI THAM VOWEL SIGN THAM AI 1A73..1A7C ; ID_Continue # Mn [10] TAI THAM VOWEL SIGN OA ABOVE..TAI THAM SIGN KHUEN-LUE KARAN 1A7F ; ID_Continue # Mn TAI THAM COMBINING CRYPTOGRAMMIC DOT 1A80..1A89 ; ID_Continue # Nd [10] TAI THAM HORA DIGIT ZERO..TAI THAM HORA DIGIT NINE 1A90..1A99 ; ID_Continue # Nd [10] TAI THAM THAM DIGIT ZERO..TAI THAM THAM DIGIT NINE 1AA7 ; ID_Continue # Lm TAI THAM SIGN MAI YAMOK 1AB0..1ABD ; ID_Continue # Mn [14] COMBINING DOUBLED CIRCUMFLEX ACCENT..COMBINING PARENTHESES BELOW 1ABF..1ADD ; ID_Continue # Mn [31] COMBINING LATIN SMALL LETTER W BELOW..COMBINING DOT-AND-RING BELOW 1AE0..1AEB ; ID_Continue # Mn [12] COMBINING LEFT TACK ABOVE..COMBINING DOUBLE RIGHTWARDS ARROW ABOVE 1B00..1B03 ; ID_Continue # Mn [4] BALINESE SIGN ULU RICEM..BALINESE SIGN SURANG 1B04 ; ID_Continue # Mc BALINESE SIGN BISAH 1B05..1B33 ; ID_Continue # Lo [47] BALINESE LETTER AKARA..BALINESE LETTER HA 1B34 ; ID_Continue # Mn BALINESE SIGN REREKAN 1B35 ; ID_Continue # Mc BALINESE VOWEL SIGN TEDUNG 1B36..1B3A ; ID_Continue # Mn [5] BALINESE VOWEL SIGN ULU..BALINESE VOWEL SIGN RA REPA 1B3B ; ID_Continue # Mc BALINESE VOWEL SIGN RA REPA TEDUNG 1B3C ; ID_Continue # Mn BALINESE VOWEL SIGN LA LENGA 1B3D..1B41 ; ID_Continue # Mc [5] BALINESE 
VOWEL SIGN LA LENGA TEDUNG..BALINESE VOWEL SIGN TALING REPA TEDUNG 1B42 ; ID_Continue # Mn BALINESE VOWEL SIGN PEPET 1B43..1B44 ; ID_Continue # Mc [2] BALINESE VOWEL SIGN PEPET TEDUNG..BALINESE ADEG ADEG 1B45..1B4C ; ID_Continue # Lo [8] BALINESE LETTER KAF SASAK..BALINESE LETTER ARCHAIC JNYA 1B50..1B59 ; ID_Continue # Nd [10] BALINESE DIGIT ZERO..BALINESE DIGIT NINE 1B6B..1B73 ; ID_Continue # Mn [9] BALINESE MUSICAL SYMBOL COMBINING TEGEH..BALINESE MUSICAL SYMBOL COMBINING GONG 1B80..1B81 ; ID_Continue # Mn [2] SUNDANESE SIGN PANYECEK..SUNDANESE SIGN PANGLAYAR 1B82 ; ID_Continue # Mc SUNDANESE SIGN PANGWISAD 1B83..1BA0 ; ID_Continue # Lo [30] SUNDANESE LETTER A..SUNDANESE LETTER HA 1BA1 ; ID_Continue # Mc SUNDANESE CONSONANT SIGN PAMINGKAL 1BA2..1BA5 ; ID_Continue # Mn [4] SUNDANESE CONSONANT SIGN PANYAKRA..SUNDANESE VOWEL SIGN PANYUKU 1BA6..1BA7 ; ID_Continue # Mc [2] SUNDANESE VOWEL SIGN PANAELAENG..SUNDANESE VOWEL SIGN PANOLONG 1BA8..1BA9 ; ID_Continue # Mn [2] SUNDANESE VOWEL SIGN PAMEPET..SUNDANESE VOWEL SIGN PANEULEUNG 1BAA ; ID_Continue # Mc SUNDANESE SIGN PAMAAEH 1BAB..1BAD ; ID_Continue # Mn [3] SUNDANESE SIGN VIRAMA..SUNDANESE CONSONANT SIGN PASANGAN WA 1BAE..1BAF ; ID_Continue # Lo [2] SUNDANESE LETTER KHA..SUNDANESE LETTER SYA 1BB0..1BB9 ; ID_Continue # Nd [10] SUNDANESE DIGIT ZERO..SUNDANESE DIGIT NINE 1BBA..1BE5 ; ID_Continue # Lo [44] SUNDANESE AVAGRAHA..BATAK LETTER U 1BE6 ; ID_Continue # Mn BATAK SIGN TOMPI 1BE7 ; ID_Continue # Mc BATAK VOWEL SIGN E 1BE8..1BE9 ; ID_Continue # Mn [2] BATAK VOWEL SIGN PAKPAK E..BATAK VOWEL SIGN EE 1BEA..1BEC ; ID_Continue # Mc [3] BATAK VOWEL SIGN I..BATAK VOWEL SIGN O 1BED ; ID_Continue # Mn BATAK VOWEL SIGN KARO O 1BEE ; ID_Continue # Mc BATAK VOWEL SIGN U 1BEF..1BF1 ; ID_Continue # Mn [3] BATAK VOWEL SIGN U FOR SIMALUNGUN SA..BATAK CONSONANT SIGN H 1BF2..1BF3 ; ID_Continue # Mc [2] BATAK PANGOLAT..BATAK PANONGONAN 1C00..1C23 ; ID_Continue # Lo [36] LEPCHA LETTER KA..LEPCHA LETTER A 1C24..1C2B ; ID_Continue # Mc 
[8] LEPCHA SUBJOINED LETTER YA..LEPCHA VOWEL SIGN UU 1C2C..1C33 ; ID_Continue # Mn [8] LEPCHA VOWEL SIGN E..LEPCHA CONSONANT SIGN T 1C34..1C35 ; ID_Continue # Mc [2] LEPCHA CONSONANT SIGN NYIN-DO..LEPCHA CONSONANT SIGN KANG 1C36..1C37 ; ID_Continue # Mn [2] LEPCHA SIGN RAN..LEPCHA SIGN NUKTA 1C40..1C49 ; ID_Continue # Nd [10] LEPCHA DIGIT ZERO..LEPCHA DIGIT NINE 1C4D..1C4F ; ID_Continue # Lo [3] LEPCHA LETTER TTA..LEPCHA LETTER DDA 1C50..1C59 ; ID_Continue # Nd [10] OL CHIKI DIGIT ZERO..OL CHIKI DIGIT NINE 1C5A..1C77 ; ID_Continue # Lo [30] OL CHIKI LETTER LA..OL CHIKI LETTER OH 1C78..1C7D ; ID_Continue # Lm [6] OL CHIKI MU TTUDDAG..OL CHIKI AHAD 1C80..1C8A ; ID_Continue # L& [11] CYRILLIC SMALL LETTER ROUNDED VE..CYRILLIC SMALL LETTER TJE 1C90..1CBA ; ID_Continue # L& [43] GEORGIAN MTAVRULI CAPITAL LETTER AN..GEORGIAN MTAVRULI CAPITAL LETTER AIN 1CBD..1CBF ; ID_Continue # L& [3] GEORGIAN MTAVRULI CAPITAL LETTER AEN..GEORGIAN MTAVRULI CAPITAL LETTER LABIAL SIGN 1CD0..1CD2 ; ID_Continue # Mn [3] VEDIC TONE KARSHANA..VEDIC TONE PRENKHA 1CD4..1CE0 ; ID_Continue # Mn [13] VEDIC SIGN YAJURVEDIC MIDLINE SVARITA..VEDIC TONE RIGVEDIC KASHMIRI INDEPENDENT SVARITA 1CE1 ; ID_Continue # Mc VEDIC TONE ATHARVAVEDIC INDEPENDENT SVARITA 1CE2..1CE8 ; ID_Continue # Mn [7] VEDIC SIGN VISARGA SVARITA..VEDIC SIGN VISARGA ANUDATTA WITH TAIL 1CE9..1CEC ; ID_Continue # Lo [4] VEDIC SIGN ANUSVARA ANTARGOMUKHA..VEDIC SIGN ANUSVARA VAMAGOMUKHA WITH TAIL 1CED ; ID_Continue # Mn VEDIC SIGN TIRYAK 1CEE..1CF3 ; ID_Continue # Lo [6] VEDIC SIGN HEXIFORM LONG ANUSVARA..VEDIC SIGN ROTATED ARDHAVISARGA 1CF4 ; ID_Continue # Mn VEDIC TONE CANDRA ABOVE 1CF5..1CF6 ; ID_Continue # Lo [2] VEDIC SIGN JIHVAMULIYA..VEDIC SIGN UPADHMANIYA 1CF7 ; ID_Continue # Mc VEDIC SIGN ATIKRAMA 1CF8..1CF9 ; ID_Continue # Mn [2] VEDIC TONE RING ABOVE..VEDIC TONE DOUBLE RING ABOVE 1CFA ; ID_Continue # Lo VEDIC SIGN DOUBLE ANUSVARA ANTARGOMUKHA 1D00..1D2B ; ID_Continue # L& [44] LATIN LETTER SMALL CAPITAL A..CYRILLIC LETTER 
SMALL CAPITAL EL 1D2C..1D6A ; ID_Continue # Lm [63] MODIFIER LETTER CAPITAL A..GREEK SUBSCRIPT SMALL LETTER CHI 1D6B..1D77 ; ID_Continue # L& [13] LATIN SMALL LETTER UE..LATIN SMALL LETTER TURNED G 1D78 ; ID_Continue # Lm MODIFIER LETTER CYRILLIC EN 1D79..1D9A ; ID_Continue # L& [34] LATIN SMALL LETTER INSULAR G..LATIN SMALL LETTER EZH WITH RETROFLEX HOOK 1D9B..1DBF ; ID_Continue # Lm [37] MODIFIER LETTER SMALL TURNED ALPHA..MODIFIER LETTER SMALL THETA 1DC0..1DFF ; ID_Continue # Mn [64] COMBINING DOTTED GRAVE ACCENT..COMBINING RIGHT ARROWHEAD AND DOWN ARROWHEAD BELOW 1E00..1F15 ; ID_Continue # L& [278] LATIN CAPITAL LETTER A WITH RING BELOW..GREEK SMALL LETTER EPSILON WITH DASIA AND OXIA 1F18..1F1D ; ID_Continue # L& [6] GREEK CAPITAL LETTER EPSILON WITH PSILI..GREEK CAPITAL LETTER EPSILON WITH DASIA AND OXIA 1F20..1F45 ; ID_Continue # L& [38] GREEK SMALL LETTER ETA WITH PSILI..GREEK SMALL LETTER OMICRON WITH DASIA AND OXIA 1F48..1F4D ; ID_Continue # L& [6] GREEK CAPITAL LETTER OMICRON WITH PSILI..GREEK CAPITAL LETTER OMICRON WITH DASIA AND OXIA 1F50..1F57 ; ID_Continue # L& [8] GREEK SMALL LETTER UPSILON WITH PSILI..GREEK SMALL LETTER UPSILON WITH DASIA AND PERISPOMENI 1F59 ; ID_Continue # L& GREEK CAPITAL LETTER UPSILON WITH DASIA 1F5B ; ID_Continue # L& GREEK CAPITAL LETTER UPSILON WITH DASIA AND VARIA 1F5D ; ID_Continue # L& GREEK CAPITAL LETTER UPSILON WITH DASIA AND OXIA 1F5F..1F7D ; ID_Continue # L& [31] GREEK CAPITAL LETTER UPSILON WITH DASIA AND PERISPOMENI..GREEK SMALL LETTER OMEGA WITH OXIA 1F80..1FB4 ; ID_Continue # L& [53] GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI..GREEK SMALL LETTER ALPHA WITH OXIA AND YPOGEGRAMMENI 1FB6..1FBC ; ID_Continue # L& [7] GREEK SMALL LETTER ALPHA WITH PERISPOMENI..GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI 1FBE ; ID_Continue # L& GREEK PROSGEGRAMMENI 1FC2..1FC4 ; ID_Continue # L& [3] GREEK SMALL LETTER ETA WITH VARIA AND YPOGEGRAMMENI..GREEK SMALL LETTER ETA WITH OXIA AND YPOGEGRAMMENI 1FC6..1FCC ; 
ID_Continue # L& [7] GREEK SMALL LETTER ETA WITH PERISPOMENI..GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI 1FD0..1FD3 ; ID_Continue # L& [4] GREEK SMALL LETTER IOTA WITH VRACHY..GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA 1FD6..1FDB ; ID_Continue # L& [6] GREEK SMALL LETTER IOTA WITH PERISPOMENI..GREEK CAPITAL LETTER IOTA WITH OXIA 1FE0..1FEC ; ID_Continue # L& [13] GREEK SMALL LETTER UPSILON WITH VRACHY..GREEK CAPITAL LETTER RHO WITH DASIA 1FF2..1FF4 ; ID_Continue # L& [3] GREEK SMALL LETTER OMEGA WITH VARIA AND YPOGEGRAMMENI..GREEK SMALL LETTER OMEGA WITH OXIA AND YPOGEGRAMMENI 1FF6..1FFC ; ID_Continue # L& [7] GREEK SMALL LETTER OMEGA WITH PERISPOMENI..GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI 200C..200D ; ID_Continue # Cf [2] ZERO WIDTH NON-JOINER..ZERO WIDTH JOINER 203F..2040 ; ID_Continue # Pc [2] UNDERTIE..CHARACTER TIE 2054 ; ID_Continue # Pc INVERTED UNDERTIE 2071 ; ID_Continue # Lm SUPERSCRIPT LATIN SMALL LETTER I 207F ; ID_Continue # Lm SUPERSCRIPT LATIN SMALL LETTER N 2090..209C ; ID_Continue # Lm [13] LATIN SUBSCRIPT SMALL LETTER A..LATIN SUBSCRIPT SMALL LETTER T 20D0..20DC ; ID_Continue # Mn [13] COMBINING LEFT HARPOON ABOVE..COMBINING FOUR DOTS ABOVE 20E1 ; ID_Continue # Mn COMBINING LEFT RIGHT ARROW ABOVE 20E5..20F0 ; ID_Continue # Mn [12] COMBINING REVERSE SOLIDUS OVERLAY..COMBINING ASTERISK ABOVE 2102 ; ID_Continue # L& DOUBLE-STRUCK CAPITAL C 2107 ; ID_Continue # L& EULER CONSTANT 210A..2113 ; ID_Continue # L& [10] SCRIPT SMALL G..SCRIPT SMALL L 2115 ; ID_Continue # L& DOUBLE-STRUCK CAPITAL N 2118 ; ID_Continue # Sm SCRIPT CAPITAL P 2119..211D ; ID_Continue # L& [5] DOUBLE-STRUCK CAPITAL P..DOUBLE-STRUCK CAPITAL R 2124 ; ID_Continue # L& DOUBLE-STRUCK CAPITAL Z 2126 ; ID_Continue # L& OHM SIGN 2128 ; ID_Continue # L& BLACK-LETTER CAPITAL Z 212A..212D ; ID_Continue # L& [4] KELVIN SIGN..BLACK-LETTER CAPITAL C 212E ; ID_Continue # So ESTIMATED SYMBOL 212F..2134 ; ID_Continue # L& [6] SCRIPT SMALL E..SCRIPT SMALL O 2135..2138 ; ID_Continue 
# Lo [4] ALEF SYMBOL..DALET SYMBOL 2139 ; ID_Continue # L& INFORMATION SOURCE 213C..213F ; ID_Continue # L& [4] DOUBLE-STRUCK SMALL PI..DOUBLE-STRUCK CAPITAL PI 2145..2149 ; ID_Continue # L& [5] DOUBLE-STRUCK ITALIC CAPITAL D..DOUBLE-STRUCK ITALIC SMALL J 214E ; ID_Continue # L& TURNED SMALL F 2160..2182 ; ID_Continue # Nl [35] ROMAN NUMERAL ONE..ROMAN NUMERAL TEN THOUSAND 2183..2184 ; ID_Continue # L& [2] ROMAN NUMERAL REVERSED ONE HUNDRED..LATIN SMALL LETTER REVERSED C 2185..2188 ; ID_Continue # Nl [4] ROMAN NUMERAL SIX LATE FORM..ROMAN NUMERAL ONE HUNDRED THOUSAND 2C00..2C7B ; ID_Continue # L& [124] GLAGOLITIC CAPITAL LETTER AZU..LATIN LETTER SMALL CAPITAL TURNED E 2C7C..2C7D ; ID_Continue # Lm [2] LATIN SUBSCRIPT SMALL LETTER J..MODIFIER LETTER CAPITAL V 2C7E..2CE4 ; ID_Continue # L& [103] LATIN CAPITAL LETTER S WITH SWASH TAIL..COPTIC SYMBOL KAI 2CEB..2CEE ; ID_Continue # L& [4] COPTIC CAPITAL LETTER CRYPTOGRAMMIC SHEI..COPTIC SMALL LETTER CRYPTOGRAMMIC GANGIA 2CEF..2CF1 ; ID_Continue # Mn [3] COPTIC COMBINING NI ABOVE..COPTIC COMBINING SPIRITUS LENIS 2CF2..2CF3 ; ID_Continue # L& [2] COPTIC CAPITAL LETTER BOHAIRIC KHEI..COPTIC SMALL LETTER BOHAIRIC KHEI 2D00..2D25 ; ID_Continue # L& [38] GEORGIAN SMALL LETTER AN..GEORGIAN SMALL LETTER HOE 2D27 ; ID_Continue # L& GEORGIAN SMALL LETTER YN 2D2D ; ID_Continue # L& GEORGIAN SMALL LETTER AEN 2D30..2D67 ; ID_Continue # Lo [56] TIFINAGH LETTER YA..TIFINAGH LETTER YO 2D6F ; ID_Continue # Lm TIFINAGH MODIFIER LETTER LABIALIZATION MARK 2D7F ; ID_Continue # Mn TIFINAGH CONSONANT JOINER 2D80..2D96 ; ID_Continue # Lo [23] ETHIOPIC SYLLABLE LOA..ETHIOPIC SYLLABLE GGWE 2DA0..2DA6 ; ID_Continue # Lo [7] ETHIOPIC SYLLABLE SSA..ETHIOPIC SYLLABLE SSO 2DA8..2DAE ; ID_Continue # Lo [7] ETHIOPIC SYLLABLE CCA..ETHIOPIC SYLLABLE CCO 2DB0..2DB6 ; ID_Continue # Lo [7] ETHIOPIC SYLLABLE ZZA..ETHIOPIC SYLLABLE ZZO 2DB8..2DBE ; ID_Continue # Lo [7] ETHIOPIC SYLLABLE CCHA..ETHIOPIC SYLLABLE CCHO 2DC0..2DC6 ; ID_Continue # Lo [7] ETHIOPIC 
SYLLABLE QYA..ETHIOPIC SYLLABLE QYO 2DC8..2DCE ; ID_Continue # Lo [7] ETHIOPIC SYLLABLE KYA..ETHIOPIC SYLLABLE KYO 2DD0..2DD6 ; ID_Continue # Lo [7] ETHIOPIC SYLLABLE XYA..ETHIOPIC SYLLABLE XYO 2DD8..2DDE ; ID_Continue # Lo [7] ETHIOPIC SYLLABLE GYA..ETHIOPIC SYLLABLE GYO 2DE0..2DFF ; ID_Continue # Mn [32] COMBINING CYRILLIC LETTER BE..COMBINING CYRILLIC LETTER IOTIFIED BIG YUS 3005 ; ID_Continue # Lm IDEOGRAPHIC ITERATION MARK 3006 ; ID_Continue # Lo IDEOGRAPHIC CLOSING MARK 3007 ; ID_Continue # Nl IDEOGRAPHIC NUMBER ZERO 3021..3029 ; ID_Continue # Nl [9] HANGZHOU NUMERAL ONE..HANGZHOU NUMERAL NINE 302A..302D ; ID_Continue # Mn [4] IDEOGRAPHIC LEVEL TONE MARK..IDEOGRAPHIC ENTERING TONE MARK 302E..302F ; ID_Continue # Mc [2] HANGUL SINGLE DOT TONE MARK..HANGUL DOUBLE DOT TONE MARK 3031..3035 ; ID_Continue # Lm [5] VERTICAL KANA REPEAT MARK..VERTICAL KANA REPEAT MARK LOWER HALF 3038..303A ; ID_Continue # Nl [3] HANGZHOU NUMERAL TEN..HANGZHOU NUMERAL THIRTY 303B ; ID_Continue # Lm VERTICAL IDEOGRAPHIC ITERATION MARK 303C ; ID_Continue # Lo MASU MARK 3041..3096 ; ID_Continue # Lo [86] HIRAGANA LETTER SMALL A..HIRAGANA LETTER SMALL KE 3099..309A ; ID_Continue # Mn [2] COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK..COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK 309B..309C ; ID_Continue # Sk [2] KATAKANA-HIRAGANA VOICED SOUND MARK..KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK 309D..309E ; ID_Continue # Lm [2] HIRAGANA ITERATION MARK..HIRAGANA VOICED ITERATION MARK 309F ; ID_Continue # Lo HIRAGANA DIGRAPH YORI 30A1..30FA ; ID_Continue # Lo [90] KATAKANA LETTER SMALL A..KATAKANA LETTER VO 30FB ; ID_Continue # Po KATAKANA MIDDLE DOT 30FC..30FE ; ID_Continue # Lm [3] KATAKANA-HIRAGANA PROLONGED SOUND MARK..KATAKANA VOICED ITERATION MARK 30FF ; ID_Continue # Lo KATAKANA DIGRAPH KOTO 3105..312F ; ID_Continue # Lo [43] BOPOMOFO LETTER B..BOPOMOFO LETTER NN 3131..318E ; ID_Continue # Lo [94] HANGUL LETTER KIYEOK..HANGUL LETTER ARAEAE 31A0..31BF ; ID_Continue # Lo [32] BOPOMOFO 
LETTER BU..BOPOMOFO LETTER AH 31F0..31FF ; ID_Continue # Lo [16] KATAKANA LETTER SMALL KU..KATAKANA LETTER SMALL RO 3400..4DBF ; ID_Continue # Lo [6592] CJK UNIFIED IDEOGRAPH-3400..CJK UNIFIED IDEOGRAPH-4DBF 4E00..A014 ; ID_Continue # Lo [21013] CJK UNIFIED IDEOGRAPH-4E00..YI SYLLABLE E A015 ; ID_Continue # Lm YI SYLLABLE WU A016..A48C ; ID_Continue # Lo [1143] YI SYLLABLE BIT..YI SYLLABLE YYR A4D0..A4F7 ; ID_Continue # Lo [40] LISU LETTER BA..LISU LETTER OE A4F8..A4FD ; ID_Continue # Lm [6] LISU LETTER TONE MYA TI..LISU LETTER TONE MYA JEU A500..A60B ; ID_Continue # Lo [268] VAI SYLLABLE EE..VAI SYLLABLE NG A60C ; ID_Continue # Lm VAI SYLLABLE LENGTHENER A610..A61F ; ID_Continue # Lo [16] VAI SYLLABLE NDOLE FA..VAI SYMBOL JONG A620..A629 ; ID_Continue # Nd [10] VAI DIGIT ZERO..VAI DIGIT NINE A62A..A62B ; ID_Continue # Lo [2] VAI SYLLABLE NDOLE MA..VAI SYLLABLE NDOLE DO A640..A66D ; ID_Continue # L& [46] CYRILLIC CAPITAL LETTER ZEMLYA..CYRILLIC SMALL LETTER DOUBLE MONOCULAR O A66E ; ID_Continue # Lo CYRILLIC LETTER MULTIOCULAR O A66F ; ID_Continue # Mn COMBINING CYRILLIC VZMET A674..A67D ; ID_Continue # Mn [10] COMBINING CYRILLIC LETTER UKRAINIAN IE..COMBINING CYRILLIC PAYEROK A67F ; ID_Continue # Lm CYRILLIC PAYEROK A680..A69B ; ID_Continue # L& [28] CYRILLIC CAPITAL LETTER DWE..CYRILLIC SMALL LETTER CROSSED O A69C..A69D ; ID_Continue # Lm [2] MODIFIER LETTER CYRILLIC HARD SIGN..MODIFIER LETTER CYRILLIC SOFT SIGN A69E..A69F ; ID_Continue # Mn [2] COMBINING CYRILLIC LETTER EF..COMBINING CYRILLIC LETTER IOTIFIED E A6A0..A6E5 ; ID_Continue # Lo [70] BAMUM LETTER A..BAMUM LETTER KI A6E6..A6EF ; ID_Continue # Nl [10] BAMUM LETTER MO..BAMUM LETTER KOGHOM A6F0..A6F1 ; ID_Continue # Mn [2] BAMUM COMBINING MARK KOQNDON..BAMUM COMBINING MARK TUKWENTIS A717..A71F ; ID_Continue # Lm [9] MODIFIER LETTER DOT VERTICAL BAR..MODIFIER LETTER LOW INVERTED EXCLAMATION MARK A722..A76F ; ID_Continue # L& [78] LATIN CAPITAL LETTER EGYPTOLOGICAL ALEF..LATIN SMALL LETTER CON A770 ; 
ID_Continue # Lm MODIFIER LETTER US A771..A787 ; ID_Continue # L& [23] LATIN SMALL LETTER DUM..LATIN SMALL LETTER INSULAR T A788 ; ID_Continue # Lm MODIFIER LETTER LOW CIRCUMFLEX ACCENT A78B..A78E ; ID_Continue # L& [4] LATIN CAPITAL LETTER SALTILLO..LATIN SMALL LETTER L WITH RETROFLEX HOOK AND BELT A78F ; ID_Continue # Lo LATIN LETTER SINOLOGICAL DOT A790..A7DC ; ID_Continue # L& [77] LATIN CAPITAL LETTER N WITH DESCENDER..LATIN CAPITAL LETTER LAMBDA WITH STROKE A7F1..A7F4 ; ID_Continue # Lm [4] MODIFIER LETTER CAPITAL S..MODIFIER LETTER CAPITAL Q A7F5..A7F6 ; ID_Continue # L& [2] LATIN CAPITAL LETTER REVERSED HALF H..LATIN SMALL LETTER REVERSED HALF H A7F7 ; ID_Continue # Lo LATIN EPIGRAPHIC LETTER SIDEWAYS I A7F8..A7F9 ; ID_Continue # Lm [2] MODIFIER LETTER CAPITAL H WITH STROKE..MODIFIER LETTER SMALL LIGATURE OE A7FA ; ID_Continue # L& LATIN LETTER SMALL CAPITAL TURNED M A7FB..A801 ; ID_Continue # Lo [7] LATIN EPIGRAPHIC LETTER REVERSED F..SYLOTI NAGRI LETTER I A802 ; ID_Continue # Mn SYLOTI NAGRI SIGN DVISVARA A803..A805 ; ID_Continue # Lo [3] SYLOTI NAGRI LETTER U..SYLOTI NAGRI LETTER O A806 ; ID_Continue # Mn SYLOTI NAGRI SIGN HASANTA A807..A80A ; ID_Continue # Lo [4] SYLOTI NAGRI LETTER KO..SYLOTI NAGRI LETTER GHO A80B ; ID_Continue # Mn SYLOTI NAGRI SIGN ANUSVARA A80C..A822 ; ID_Continue # Lo [23] SYLOTI NAGRI LETTER CO..SYLOTI NAGRI LETTER HO A823..A824 ; ID_Continue # Mc [2] SYLOTI NAGRI VOWEL SIGN A..SYLOTI NAGRI VOWEL SIGN I A825..A826 ; ID_Continue # Mn [2] SYLOTI NAGRI VOWEL SIGN U..SYLOTI NAGRI VOWEL SIGN E A827 ; ID_Continue # Mc SYLOTI NAGRI VOWEL SIGN OO A82C ; ID_Continue # Mn SYLOTI NAGRI SIGN ALTERNATE HASANTA A840..A873 ; ID_Continue # Lo [52] PHAGS-PA LETTER KA..PHAGS-PA LETTER CANDRABINDU A880..A881 ; ID_Continue # Mc [2] SAURASHTRA SIGN ANUSVARA..SAURASHTRA SIGN VISARGA A882..A8B3 ; ID_Continue # Lo [50] SAURASHTRA LETTER A..SAURASHTRA LETTER LLA A8B4..A8C3 ; ID_Continue # Mc [16] SAURASHTRA CONSONANT SIGN HAARU..SAURASHTRA VOWEL SIGN AU 
A8C4..A8C5 ; ID_Continue # Mn [2] SAURASHTRA SIGN VIRAMA..SAURASHTRA SIGN CANDRABINDU A8D0..A8D9 ; ID_Continue # Nd [10] SAURASHTRA DIGIT ZERO..SAURASHTRA DIGIT NINE A8E0..A8F1 ; ID_Continue # Mn [18] COMBINING DEVANAGARI DIGIT ZERO..COMBINING DEVANAGARI SIGN AVAGRAHA A8F2..A8F7 ; ID_Continue # Lo [6] DEVANAGARI SIGN SPACING CANDRABINDU..DEVANAGARI SIGN CANDRABINDU AVAGRAHA A8FB ; ID_Continue # Lo DEVANAGARI HEADSTROKE A8FD..A8FE ; ID_Continue # Lo [2] DEVANAGARI JAIN OM..DEVANAGARI LETTER AY A8FF ; ID_Continue # Mn DEVANAGARI VOWEL SIGN AY A900..A909 ; ID_Continue # Nd [10] KAYAH LI DIGIT ZERO..KAYAH LI DIGIT NINE A90A..A925 ; ID_Continue # Lo [28] KAYAH LI LETTER KA..KAYAH LI LETTER OO A926..A92D ; ID_Continue # Mn [8] KAYAH LI VOWEL UE..KAYAH LI TONE CALYA PLOPHU A930..A946 ; ID_Continue # Lo [23] REJANG LETTER KA..REJANG LETTER A A947..A951 ; ID_Continue # Mn [11] REJANG VOWEL SIGN I..REJANG CONSONANT SIGN R A952..A953 ; ID_Continue # Mc [2] REJANG CONSONANT SIGN H..REJANG VIRAMA A960..A97C ; ID_Continue # Lo [29] HANGUL CHOSEONG TIKEUT-MIEUM..HANGUL CHOSEONG SSANGYEORINHIEUH A980..A982 ; ID_Continue # Mn [3] JAVANESE SIGN PANYANGGA..JAVANESE SIGN LAYAR A983 ; ID_Continue # Mc JAVANESE SIGN WIGNYAN A984..A9B2 ; ID_Continue # Lo [47] JAVANESE LETTER A..JAVANESE LETTER HA A9B3 ; ID_Continue # Mn JAVANESE SIGN CECAK TELU A9B4..A9B5 ; ID_Continue # Mc [2] JAVANESE VOWEL SIGN TARUNG..JAVANESE VOWEL SIGN TOLONG A9B6..A9B9 ; ID_Continue # Mn [4] JAVANESE VOWEL SIGN WULU..JAVANESE VOWEL SIGN SUKU MENDUT A9BA..A9BB ; ID_Continue # Mc [2] JAVANESE VOWEL SIGN TALING..JAVANESE VOWEL SIGN DIRGA MURE A9BC..A9BD ; ID_Continue # Mn [2] JAVANESE VOWEL SIGN PEPET..JAVANESE CONSONANT SIGN KERET A9BE..A9C0 ; ID_Continue # Mc [3] JAVANESE CONSONANT SIGN PENGKAL..JAVANESE PANGKON A9CF ; ID_Continue # Lm JAVANESE PANGRANGKEP A9D0..A9D9 ; ID_Continue # Nd [10] JAVANESE DIGIT ZERO..JAVANESE DIGIT NINE A9E0..A9E4 ; ID_Continue # Lo [5] MYANMAR LETTER SHAN GHA..MYANMAR LETTER SHAN BHA 
A9E5 ; ID_Continue # Mn MYANMAR SIGN SHAN SAW A9E6 ; ID_Continue # Lm MYANMAR MODIFIER LETTER SHAN REDUPLICATION A9E7..A9EF ; ID_Continue # Lo [9] MYANMAR LETTER TAI LAING NYA..MYANMAR LETTER TAI LAING NNA A9F0..A9F9 ; ID_Continue # Nd [10] MYANMAR TAI LAING DIGIT ZERO..MYANMAR TAI LAING DIGIT NINE A9FA..A9FE ; ID_Continue # Lo [5] MYANMAR LETTER TAI LAING LLA..MYANMAR LETTER TAI LAING BHA AA00..AA28 ; ID_Continue # Lo [41] CHAM LETTER A..CHAM LETTER HA AA29..AA2E ; ID_Continue # Mn [6] CHAM VOWEL SIGN AA..CHAM VOWEL SIGN OE AA2F..AA30 ; ID_Continue # Mc [2] CHAM VOWEL SIGN O..CHAM VOWEL SIGN AI AA31..AA32 ; ID_Continue # Mn [2] CHAM VOWEL SIGN AU..CHAM VOWEL SIGN UE AA33..AA34 ; ID_Continue # Mc [2] CHAM CONSONANT SIGN YA..CHAM CONSONANT SIGN RA AA35..AA36 ; ID_Continue # Mn [2] CHAM CONSONANT SIGN LA..CHAM CONSONANT SIGN WA AA40..AA42 ; ID_Continue # Lo [3] CHAM LETTER FINAL K..CHAM LETTER FINAL NG AA43 ; ID_Continue # Mn CHAM CONSONANT SIGN FINAL NG AA44..AA4B ; ID_Continue # Lo [8] CHAM LETTER FINAL CH..CHAM LETTER FINAL SS AA4C ; ID_Continue # Mn CHAM CONSONANT SIGN FINAL M AA4D ; ID_Continue # Mc CHAM CONSONANT SIGN FINAL H AA50..AA59 ; ID_Continue # Nd [10] CHAM DIGIT ZERO..CHAM DIGIT NINE AA60..AA6F ; ID_Continue # Lo [16] MYANMAR LETTER KHAMTI GA..MYANMAR LETTER KHAMTI FA AA70 ; ID_Continue # Lm MYANMAR MODIFIER LETTER KHAMTI REDUPLICATION AA71..AA76 ; ID_Continue # Lo [6] MYANMAR LETTER KHAMTI XA..MYANMAR LOGOGRAM KHAMTI HM AA7A ; ID_Continue # Lo MYANMAR LETTER AITON RA AA7B ; ID_Continue # Mc MYANMAR SIGN PAO KAREN TONE AA7C ; ID_Continue # Mn MYANMAR SIGN TAI LAING TONE-2 AA7D ; ID_Continue # Mc MYANMAR SIGN TAI LAING TONE-5 AA7E..AAAF ; ID_Continue # Lo [50] MYANMAR LETTER SHWE PALAUNG CHA..TAI VIET LETTER HIGH O AAB0 ; ID_Continue # Mn TAI VIET MAI KANG AAB1 ; ID_Continue # Lo TAI VIET VOWEL AA AAB2..AAB4 ; ID_Continue # Mn [3] TAI VIET VOWEL I..TAI VIET VOWEL U AAB5..AAB6 ; ID_Continue # Lo [2] TAI VIET VOWEL E..TAI VIET VOWEL O AAB7..AAB8 ; 
ID_Continue # Mn [2] TAI VIET MAI KHIT..TAI VIET VOWEL IA AAB9..AABD ; ID_Continue # Lo [5] TAI VIET VOWEL UEA..TAI VIET VOWEL AN AABE..AABF ; ID_Continue # Mn [2] TAI VIET VOWEL AM..TAI VIET TONE MAI EK AAC0 ; ID_Continue # Lo TAI VIET TONE MAI NUENG AAC1 ; ID_Continue # Mn TAI VIET TONE MAI THO AAC2 ; ID_Continue # Lo TAI VIET TONE MAI SONG AADB..AADC ; ID_Continue # Lo [2] TAI VIET SYMBOL KON..TAI VIET SYMBOL NUENG AADD ; ID_Continue # Lm TAI VIET SYMBOL SAM AAE0..AAEA ; ID_Continue # Lo [11] MEETEI MAYEK LETTER E..MEETEI MAYEK LETTER SSA AAEB ; ID_Continue # Mc MEETEI MAYEK VOWEL SIGN II AAEC..AAED ; ID_Continue # Mn [2] MEETEI MAYEK VOWEL SIGN UU..MEETEI MAYEK VOWEL SIGN AAI AAEE..AAEF ; ID_Continue # Mc [2] MEETEI MAYEK VOWEL SIGN AU..MEETEI MAYEK VOWEL SIGN AAU AAF2 ; ID_Continue # Lo MEETEI MAYEK ANJI AAF3..AAF4 ; ID_Continue # Lm [2] MEETEI MAYEK SYLLABLE REPETITION MARK..MEETEI MAYEK WORD REPETITION MARK AAF5 ; ID_Continue # Mc MEETEI MAYEK VOWEL SIGN VISARGA AAF6 ; ID_Continue # Mn MEETEI MAYEK VIRAMA AB01..AB06 ; ID_Continue # Lo [6] ETHIOPIC SYLLABLE TTHU..ETHIOPIC SYLLABLE TTHO AB09..AB0E ; ID_Continue # Lo [6] ETHIOPIC SYLLABLE DDHU..ETHIOPIC SYLLABLE DDHO AB11..AB16 ; ID_Continue # Lo [6] ETHIOPIC SYLLABLE DZU..ETHIOPIC SYLLABLE DZO AB20..AB26 ; ID_Continue # Lo [7] ETHIOPIC SYLLABLE CCHHA..ETHIOPIC SYLLABLE CCHHO AB28..AB2E ; ID_Continue # Lo [7] ETHIOPIC SYLLABLE BBA..ETHIOPIC SYLLABLE BBO AB30..AB5A ; ID_Continue # L& [43] LATIN SMALL LETTER BARRED ALPHA..LATIN SMALL LETTER Y WITH SHORT RIGHT LEG AB5C..AB5F ; ID_Continue # Lm [4] MODIFIER LETTER SMALL HENG..MODIFIER LETTER SMALL U WITH LEFT HOOK AB60..AB68 ; ID_Continue # L& [9] LATIN SMALL LETTER SAKHA YAT..LATIN SMALL LETTER TURNED R WITH MIDDLE TILDE AB69 ; ID_Continue # Lm MODIFIER LETTER SMALL TURNED W AB70..ABBF ; ID_Continue # L& [80] CHEROKEE SMALL LETTER A..CHEROKEE SMALL LETTER YA ABC0..ABE2 ; ID_Continue # Lo [35] MEETEI MAYEK LETTER KOK..MEETEI MAYEK LETTER I LONSUM ABE3..ABE4 ; 
ID_Continue # Mc [2] MEETEI MAYEK VOWEL SIGN ONAP..MEETEI MAYEK VOWEL SIGN INAP ABE5 ; ID_Continue # Mn MEETEI MAYEK VOWEL SIGN ANAP ABE6..ABE7 ; ID_Continue # Mc [2] MEETEI MAYEK VOWEL SIGN YENAP..MEETEI MAYEK VOWEL SIGN SOUNAP ABE8 ; ID_Continue # Mn MEETEI MAYEK VOWEL SIGN UNAP ABE9..ABEA ; ID_Continue # Mc [2] MEETEI MAYEK VOWEL SIGN CHEINAP..MEETEI MAYEK VOWEL SIGN NUNG ABEC ; ID_Continue # Mc MEETEI MAYEK LUM IYEK ABED ; ID_Continue # Mn MEETEI MAYEK APUN IYEK ABF0..ABF9 ; ID_Continue # Nd [10] MEETEI MAYEK DIGIT ZERO..MEETEI MAYEK DIGIT NINE AC00..D7A3 ; ID_Continue # Lo [11172] HANGUL SYLLABLE GA..HANGUL SYLLABLE HIH D7B0..D7C6 ; ID_Continue # Lo [23] HANGUL JUNGSEONG O-YEO..HANGUL JUNGSEONG ARAEA-E D7CB..D7FB ; ID_Continue # Lo [49] HANGUL JONGSEONG NIEUN-RIEUL..HANGUL JONGSEONG PHIEUPH-THIEUTH F900..FA6D ; ID_Continue # Lo [366] CJK COMPATIBILITY IDEOGRAPH-F900..CJK COMPATIBILITY IDEOGRAPH-FA6D FA70..FAD9 ; ID_Continue # Lo [106] CJK COMPATIBILITY IDEOGRAPH-FA70..CJK COMPATIBILITY IDEOGRAPH-FAD9 FB00..FB06 ; ID_Continue # L& [7] LATIN SMALL LIGATURE FF..LATIN SMALL LIGATURE ST FB13..FB17 ; ID_Continue # L& [5] ARMENIAN SMALL LIGATURE MEN NOW..ARMENIAN SMALL LIGATURE MEN XEH FB1D ; ID_Continue # Lo HEBREW LETTER YOD WITH HIRIQ FB1E ; ID_Continue # Mn HEBREW POINT JUDEO-SPANISH VARIKA FB1F..FB28 ; ID_Continue # Lo [10] HEBREW LIGATURE YIDDISH YOD YOD PATAH..HEBREW LETTER WIDE TAV FB2A..FB36 ; ID_Continue # Lo [13] HEBREW LETTER SHIN WITH SHIN DOT..HEBREW LETTER ZAYIN WITH DAGESH FB38..FB3C ; ID_Continue # Lo [5] HEBREW LETTER TET WITH DAGESH..HEBREW LETTER LAMED WITH DAGESH FB3E ; ID_Continue # Lo HEBREW LETTER MEM WITH DAGESH FB40..FB41 ; ID_Continue # Lo [2] HEBREW LETTER NUN WITH DAGESH..HEBREW LETTER SAMEKH WITH DAGESH FB43..FB44 ; ID_Continue # Lo [2] HEBREW LETTER FINAL PE WITH DAGESH..HEBREW LETTER PE WITH DAGESH FB46..FBB1 ; ID_Continue # Lo [108] HEBREW LETTER TSADI WITH DAGESH..ARABIC LETTER YEH BARREE WITH HAMZA ABOVE FINAL FORM FBD3..FD3D ; 
ID_Continue # Lo [363] ARABIC LETTER NG ISOLATED FORM..ARABIC LIGATURE ALEF WITH FATHATAN ISOLATED FORM FD50..FD8F ; ID_Continue # Lo [64] ARABIC LIGATURE TEH WITH JEEM WITH MEEM INITIAL FORM..ARABIC LIGATURE MEEM WITH KHAH WITH MEEM INITIAL FORM FD92..FDC7 ; ID_Continue # Lo [54] ARABIC LIGATURE MEEM WITH JEEM WITH KHAH INITIAL FORM..ARABIC LIGATURE NOON WITH JEEM WITH YEH FINAL FORM FDF0..FDFB ; ID_Continue # Lo [12] ARABIC LIGATURE SALLA USED AS KORANIC STOP SIGN ISOLATED FORM..ARABIC LIGATURE JALLAJALALOUHOU FE00..FE0F ; ID_Continue # Mn [16] VARIATION SELECTOR-1..VARIATION SELECTOR-16 FE20..FE2F ; ID_Continue # Mn [16] COMBINING LIGATURE LEFT HALF..COMBINING CYRILLIC TITLO RIGHT HALF FE33..FE34 ; ID_Continue # Pc [2] PRESENTATION FORM FOR VERTICAL LOW LINE..PRESENTATION FORM FOR VERTICAL WAVY LOW LINE FE4D..FE4F ; ID_Continue # Pc [3] DASHED LOW LINE..WAVY LOW LINE FE70..FE74 ; ID_Continue # Lo [5] ARABIC FATHATAN ISOLATED FORM..ARABIC KASRATAN ISOLATED FORM FE76..FEFC ; ID_Continue # Lo [135] ARABIC FATHA ISOLATED FORM..ARABIC LIGATURE LAM WITH ALEF FINAL FORM FF10..FF19 ; ID_Continue # Nd [10] FULLWIDTH DIGIT ZERO..FULLWIDTH DIGIT NINE FF21..FF3A ; ID_Continue # L& [26] FULLWIDTH LATIN CAPITAL LETTER A..FULLWIDTH LATIN CAPITAL LETTER Z FF3F ; ID_Continue # Pc FULLWIDTH LOW LINE FF41..FF5A ; ID_Continue # L& [26] FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH LATIN SMALL LETTER Z FF65 ; ID_Continue # Po HALFWIDTH KATAKANA MIDDLE DOT FF66..FF6F ; ID_Continue # Lo [10] HALFWIDTH KATAKANA LETTER WO..HALFWIDTH KATAKANA LETTER SMALL TU FF70 ; ID_Continue # Lm HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK FF71..FF9D ; ID_Continue # Lo [45] HALFWIDTH KATAKANA LETTER A..HALFWIDTH KATAKANA LETTER N FF9E..FF9F ; ID_Continue # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK FFA0..FFBE ; ID_Continue # Lo [31] HALFWIDTH HANGUL FILLER..HALFWIDTH HANGUL LETTER HIEUH FFC2..FFC7 ; ID_Continue # Lo [6] HALFWIDTH HANGUL LETTER A..HALFWIDTH 
HANGUL LETTER E FFCA..FFCF ; ID_Continue # Lo [6] HALFWIDTH HANGUL LETTER YEO..HALFWIDTH HANGUL LETTER OE FFD2..FFD7 ; ID_Continue # Lo [6] HALFWIDTH HANGUL LETTER YO..HALFWIDTH HANGUL LETTER YU FFDA..FFDC ; ID_Continue # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL LETTER I 10000..1000B ; ID_Continue # Lo [12] LINEAR B SYLLABLE B008 A..LINEAR B SYLLABLE B046 JE 1000D..10026 ; ID_Continue # Lo [26] LINEAR B SYLLABLE B036 JO..LINEAR B SYLLABLE B032 QO 10028..1003A ; ID_Continue # Lo [19] LINEAR B SYLLABLE B060 RA..LINEAR B SYLLABLE B042 WO 1003C..1003D ; ID_Continue # Lo [2] LINEAR B SYLLABLE B017 ZA..LINEAR B SYLLABLE B074 ZE 1003F..1004D ; ID_Continue # Lo [15] LINEAR B SYLLABLE B020 ZO..LINEAR B SYLLABLE B091 TWO 10050..1005D ; ID_Continue # Lo [14] LINEAR B SYMBOL B018..LINEAR B SYMBOL B089 10080..100FA ; ID_Continue # Lo [123] LINEAR B IDEOGRAM B100 MAN..LINEAR B IDEOGRAM VESSEL B305 10140..10174 ; ID_Continue # Nl [53] GREEK ACROPHONIC ATTIC ONE QUARTER..GREEK ACROPHONIC STRATIAN FIFTY MNAS 101FD ; ID_Continue # Mn PHAISTOS DISC SIGN COMBINING OBLIQUE STROKE 10280..1029C ; ID_Continue # Lo [29] LYCIAN LETTER A..LYCIAN LETTER X 102A0..102D0 ; ID_Continue # Lo [49] CARIAN LETTER A..CARIAN LETTER UUU3 102E0 ; ID_Continue # Mn COPTIC EPACT THOUSANDS MARK 10300..1031F ; ID_Continue # Lo [32] OLD ITALIC LETTER A..OLD ITALIC LETTER ESS 1032D..10340 ; ID_Continue # Lo [20] OLD ITALIC LETTER YE..GOTHIC LETTER PAIRTHRA 10341 ; ID_Continue # Nl GOTHIC LETTER NINETY 10342..10349 ; ID_Continue # Lo [8] GOTHIC LETTER RAIDA..GOTHIC LETTER OTHAL 1034A ; ID_Continue # Nl GOTHIC LETTER NINE HUNDRED 10350..10375 ; ID_Continue # Lo [38] OLD PERMIC LETTER AN..OLD PERMIC LETTER IA 10376..1037A ; ID_Continue # Mn [5] COMBINING OLD PERMIC LETTER AN..COMBINING OLD PERMIC LETTER SII 10380..1039D ; ID_Continue # Lo [30] UGARITIC LETTER ALPA..UGARITIC LETTER SSU 103A0..103C3 ; ID_Continue # Lo [36] OLD PERSIAN SIGN A..OLD PERSIAN SIGN HA 103C8..103CF ; ID_Continue # Lo [8] OLD 
PERSIAN SIGN AURAMAZDAA..OLD PERSIAN SIGN BUUMISH 103D1..103D5 ; ID_Continue # Nl [5] OLD PERSIAN NUMBER ONE..OLD PERSIAN NUMBER HUNDRED 10400..1044F ; ID_Continue # L& [80] DESERET CAPITAL LETTER LONG I..DESERET SMALL LETTER EW 10450..1049D ; ID_Continue # Lo [78] SHAVIAN LETTER PEEP..OSMANYA LETTER OO 104A0..104A9 ; ID_Continue # Nd [10] OSMANYA DIGIT ZERO..OSMANYA DIGIT NINE 104B0..104D3 ; ID_Continue # L& [36] OSAGE CAPITAL LETTER A..OSAGE CAPITAL LETTER ZHA 104D8..104FB ; ID_Continue # L& [36] OSAGE SMALL LETTER A..OSAGE SMALL LETTER ZHA 10500..10527 ; ID_Continue # Lo [40] ELBASAN LETTER A..ELBASAN LETTER KHE 10530..10563 ; ID_Continue # Lo [52] CAUCASIAN ALBANIAN LETTER ALT..CAUCASIAN ALBANIAN LETTER KIW 10570..1057A ; ID_Continue # L& [11] VITHKUQI CAPITAL LETTER A..VITHKUQI CAPITAL LETTER GA 1057C..1058A ; ID_Continue # L& [15] VITHKUQI CAPITAL LETTER HA..VITHKUQI CAPITAL LETTER RE 1058C..10592 ; ID_Continue # L& [7] VITHKUQI CAPITAL LETTER SE..VITHKUQI CAPITAL LETTER XE 10594..10595 ; ID_Continue # L& [2] VITHKUQI CAPITAL LETTER Y..VITHKUQI CAPITAL LETTER ZE 10597..105A1 ; ID_Continue # L& [11] VITHKUQI SMALL LETTER A..VITHKUQI SMALL LETTER GA 105A3..105B1 ; ID_Continue # L& [15] VITHKUQI SMALL LETTER HA..VITHKUQI SMALL LETTER RE 105B3..105B9 ; ID_Continue # L& [7] VITHKUQI SMALL LETTER SE..VITHKUQI SMALL LETTER XE 105BB..105BC ; ID_Continue # L& [2] VITHKUQI SMALL LETTER Y..VITHKUQI SMALL LETTER ZE 105C0..105F3 ; ID_Continue # Lo [52] TODHRI LETTER A..TODHRI LETTER OO 10600..10736 ; ID_Continue # Lo [311] LINEAR A SIGN AB001..LINEAR A SIGN A664 10740..10755 ; ID_Continue # Lo [22] LINEAR A SIGN A701 A..LINEAR A SIGN A732 JE 10760..10767 ; ID_Continue # Lo [8] LINEAR A SIGN A800..LINEAR A SIGN A807 10780..10785 ; ID_Continue # Lm [6] MODIFIER LETTER SMALL CAPITAL AA..MODIFIER LETTER SMALL B WITH HOOK 10787..107B0 ; ID_Continue # Lm [42] MODIFIER LETTER SMALL DZ DIGRAPH..MODIFIER LETTER SMALL V WITH RIGHT HOOK 107B2..107BA ; ID_Continue # Lm [9] MODIFIER 
LETTER SMALL CAPITAL Y..MODIFIER LETTER SMALL S WITH CURL 10800..10805 ; ID_Continue # Lo [6] CYPRIOT SYLLABLE A..CYPRIOT SYLLABLE JA 10808 ; ID_Continue # Lo CYPRIOT SYLLABLE JO 1080A..10835 ; ID_Continue # Lo [44] CYPRIOT SYLLABLE KA..CYPRIOT SYLLABLE WO 10837..10838 ; ID_Continue # Lo [2] CYPRIOT SYLLABLE XA..CYPRIOT SYLLABLE XE 1083C ; ID_Continue # Lo CYPRIOT SYLLABLE ZA 1083F..10855 ; ID_Continue # Lo [23] CYPRIOT SYLLABLE ZO..IMPERIAL ARAMAIC LETTER TAW 10860..10876 ; ID_Continue # Lo [23] PALMYRENE LETTER ALEPH..PALMYRENE LETTER TAW 10880..1089E ; ID_Continue # Lo [31] NABATAEAN LETTER FINAL ALEPH..NABATAEAN LETTER TAW 108E0..108F2 ; ID_Continue # Lo [19] HATRAN LETTER ALEPH..HATRAN LETTER QOPH 108F4..108F5 ; ID_Continue # Lo [2] HATRAN LETTER SHIN..HATRAN LETTER TAW 10900..10915 ; ID_Continue # Lo [22] PHOENICIAN LETTER ALF..PHOENICIAN LETTER TAU 10920..10939 ; ID_Continue # Lo [26] LYDIAN LETTER A..LYDIAN LETTER C 10940..10959 ; ID_Continue # Lo [26] SIDETIC LETTER N01..SIDETIC LETTER N26 10980..109B7 ; ID_Continue # Lo [56] MEROITIC HIEROGLYPHIC LETTER A..MEROITIC CURSIVE LETTER DA 109BE..109BF ; ID_Continue # Lo [2] MEROITIC CURSIVE LOGOGRAM RMT..MEROITIC CURSIVE LOGOGRAM IMN 10A00 ; ID_Continue # Lo KHAROSHTHI LETTER A 10A01..10A03 ; ID_Continue # Mn [3] KHAROSHTHI VOWEL SIGN I..KHAROSHTHI VOWEL SIGN VOCALIC R 10A05..10A06 ; ID_Continue # Mn [2] KHAROSHTHI VOWEL SIGN E..KHAROSHTHI VOWEL SIGN O 10A0C..10A0F ; ID_Continue # Mn [4] KHAROSHTHI VOWEL LENGTH MARK..KHAROSHTHI SIGN VISARGA 10A10..10A13 ; ID_Continue # Lo [4] KHAROSHTHI LETTER KA..KHAROSHTHI LETTER GHA 10A15..10A17 ; ID_Continue # Lo [3] KHAROSHTHI LETTER CA..KHAROSHTHI LETTER JA 10A19..10A35 ; ID_Continue # Lo [29] KHAROSHTHI LETTER NYA..KHAROSHTHI LETTER VHA 10A38..10A3A ; ID_Continue # Mn [3] KHAROSHTHI SIGN BAR ABOVE..KHAROSHTHI SIGN DOT BELOW 10A3F ; ID_Continue # Mn KHAROSHTHI VIRAMA 10A60..10A7C ; ID_Continue # Lo [29] OLD SOUTH ARABIAN LETTER HE..OLD SOUTH ARABIAN LETTER THETH 
10A80..10A9C ; ID_Continue # Lo [29] OLD NORTH ARABIAN LETTER HEH..OLD NORTH ARABIAN LETTER ZAH 10AC0..10AC7 ; ID_Continue # Lo [8] MANICHAEAN LETTER ALEPH..MANICHAEAN LETTER WAW 10AC9..10AE4 ; ID_Continue # Lo [28] MANICHAEAN LETTER ZAYIN..MANICHAEAN LETTER TAW 10AE5..10AE6 ; ID_Continue # Mn [2] MANICHAEAN ABBREVIATION MARK ABOVE..MANICHAEAN ABBREVIATION MARK BELOW 10B00..10B35 ; ID_Continue # Lo [54] AVESTAN LETTER A..AVESTAN LETTER HE 10B40..10B55 ; ID_Continue # Lo [22] INSCRIPTIONAL PARTHIAN LETTER ALEPH..INSCRIPTIONAL PARTHIAN LETTER TAW 10B60..10B72 ; ID_Continue # Lo [19] INSCRIPTIONAL PAHLAVI LETTER ALEPH..INSCRIPTIONAL PAHLAVI LETTER TAW 10B80..10B91 ; ID_Continue # Lo [18] PSALTER PAHLAVI LETTER ALEPH..PSALTER PAHLAVI LETTER TAW 10C00..10C48 ; ID_Continue # Lo [73] OLD TURKIC LETTER ORKHON A..OLD TURKIC LETTER ORKHON BASH 10C80..10CB2 ; ID_Continue # L& [51] OLD HUNGARIAN CAPITAL LETTER A..OLD HUNGARIAN CAPITAL LETTER US 10CC0..10CF2 ; ID_Continue # L& [51] OLD HUNGARIAN SMALL LETTER A..OLD HUNGARIAN SMALL LETTER US 10D00..10D23 ; ID_Continue # Lo [36] HANIFI ROHINGYA LETTER A..HANIFI ROHINGYA MARK NA KHONNA 10D24..10D27 ; ID_Continue # Mn [4] HANIFI ROHINGYA SIGN HARBAHAY..HANIFI ROHINGYA SIGN TASSI 10D30..10D39 ; ID_Continue # Nd [10] HANIFI ROHINGYA DIGIT ZERO..HANIFI ROHINGYA DIGIT NINE 10D40..10D49 ; ID_Continue # Nd [10] GARAY DIGIT ZERO..GARAY DIGIT NINE 10D4A..10D4D ; ID_Continue # Lo [4] GARAY VOWEL SIGN A..GARAY VOWEL SIGN EE 10D4E ; ID_Continue # Lm GARAY VOWEL LENGTH MARK 10D4F ; ID_Continue # Lo GARAY SUKUN 10D50..10D65 ; ID_Continue # L& [22] GARAY CAPITAL LETTER A..GARAY CAPITAL LETTER OLD NA 10D69..10D6D ; ID_Continue # Mn [5] GARAY VOWEL SIGN E..GARAY CONSONANT NASALIZATION MARK 10D6F ; ID_Continue # Lm GARAY REDUPLICATION MARK 10D70..10D85 ; ID_Continue # L& [22] GARAY SMALL LETTER A..GARAY SMALL LETTER OLD NA 10E80..10EA9 ; ID_Continue # Lo [42] YEZIDI LETTER ELIF..YEZIDI LETTER ET 10EAB..10EAC ; ID_Continue # Mn [2] YEZIDI COMBINING 
HAMZA MARK..YEZIDI COMBINING MADDA MARK 10EB0..10EB1 ; ID_Continue # Lo [2] YEZIDI LETTER LAM WITH DOT ABOVE..YEZIDI LETTER YOT WITH CIRCUMFLEX ABOVE 10EC2..10EC4 ; ID_Continue # Lo [3] ARABIC LETTER DAL WITH TWO DOTS VERTICALLY BELOW..ARABIC LETTER KAF WITH TWO DOTS VERTICALLY BELOW 10EC5 ; ID_Continue # Lm ARABIC SMALL YEH BARREE WITH TWO DOTS BELOW 10EC6..10EC7 ; ID_Continue # Lo [2] ARABIC LETTER THIN NOON..ARABIC LETTER YEH WITH FOUR DOTS BELOW 10EFA..10EFF ; ID_Continue # Mn [6] ARABIC DOUBLE VERTICAL BAR BELOW..ARABIC SMALL LOW WORD MADDA 10F00..10F1C ; ID_Continue # Lo [29] OLD SOGDIAN LETTER ALEPH..OLD SOGDIAN LETTER FINAL TAW WITH VERTICAL TAIL 10F27 ; ID_Continue # Lo OLD SOGDIAN LIGATURE AYIN-DALETH 10F30..10F45 ; ID_Continue # Lo [22] SOGDIAN LETTER ALEPH..SOGDIAN INDEPENDENT SHIN 10F46..10F50 ; ID_Continue # Mn [11] SOGDIAN COMBINING DOT BELOW..SOGDIAN COMBINING STROKE BELOW 10F70..10F81 ; ID_Continue # Lo [18] OLD UYGHUR LETTER ALEPH..OLD UYGHUR LETTER LESH 10F82..10F85 ; ID_Continue # Mn [4] OLD UYGHUR COMBINING DOT ABOVE..OLD UYGHUR COMBINING TWO DOTS BELOW 10FB0..10FC4 ; ID_Continue # Lo [21] CHORASMIAN LETTER ALEPH..CHORASMIAN LETTER TAW 10FE0..10FF6 ; ID_Continue # Lo [23] ELYMAIC LETTER ALEPH..ELYMAIC LIGATURE ZAYIN-YODH 11000 ; ID_Continue # Mc BRAHMI SIGN CANDRABINDU 11001 ; ID_Continue # Mn BRAHMI SIGN ANUSVARA 11002 ; ID_Continue # Mc BRAHMI SIGN VISARGA 11003..11037 ; ID_Continue # Lo [53] BRAHMI SIGN JIHVAMULIYA..BRAHMI LETTER OLD TAMIL NNNA 11038..11046 ; ID_Continue # Mn [15] BRAHMI VOWEL SIGN AA..BRAHMI VIRAMA 11066..1106F ; ID_Continue # Nd [10] BRAHMI DIGIT ZERO..BRAHMI DIGIT NINE 11070 ; ID_Continue # Mn BRAHMI SIGN OLD TAMIL VIRAMA 11071..11072 ; ID_Continue # Lo [2] BRAHMI LETTER OLD TAMIL SHORT E..BRAHMI LETTER OLD TAMIL SHORT O 11073..11074 ; ID_Continue # Mn [2] BRAHMI VOWEL SIGN OLD TAMIL SHORT E..BRAHMI VOWEL SIGN OLD TAMIL SHORT O 11075 ; ID_Continue # Lo BRAHMI LETTER OLD TAMIL LLA 1107F..11081 ; ID_Continue # Mn [3] BRAHMI 
NUMBER JOINER..KAITHI SIGN ANUSVARA 11082 ; ID_Continue # Mc KAITHI SIGN VISARGA 11083..110AF ; ID_Continue # Lo [45] KAITHI LETTER A..KAITHI LETTER HA 110B0..110B2 ; ID_Continue # Mc [3] KAITHI VOWEL SIGN AA..KAITHI VOWEL SIGN II 110B3..110B6 ; ID_Continue # Mn [4] KAITHI VOWEL SIGN U..KAITHI VOWEL SIGN AI 110B7..110B8 ; ID_Continue # Mc [2] KAITHI VOWEL SIGN O..KAITHI VOWEL SIGN AU 110B9..110BA ; ID_Continue # Mn [2] KAITHI SIGN VIRAMA..KAITHI SIGN NUKTA 110C2 ; ID_Continue # Mn KAITHI VOWEL SIGN VOCALIC R 110D0..110E8 ; ID_Continue # Lo [25] SORA SOMPENG LETTER SAH..SORA SOMPENG LETTER MAE 110F0..110F9 ; ID_Continue # Nd [10] SORA SOMPENG DIGIT ZERO..SORA SOMPENG DIGIT NINE 11100..11102 ; ID_Continue # Mn [3] CHAKMA SIGN CANDRABINDU..CHAKMA SIGN VISARGA 11103..11126 ; ID_Continue # Lo [36] CHAKMA LETTER AA..CHAKMA LETTER HAA 11127..1112B ; ID_Continue # Mn [5] CHAKMA VOWEL SIGN A..CHAKMA VOWEL SIGN UU 1112C ; ID_Continue # Mc CHAKMA VOWEL SIGN E 1112D..11134 ; ID_Continue # Mn [8] CHAKMA VOWEL SIGN AI..CHAKMA MAAYYAA 11136..1113F ; ID_Continue # Nd [10] CHAKMA DIGIT ZERO..CHAKMA DIGIT NINE 11144 ; ID_Continue # Lo CHAKMA LETTER LHAA 11145..11146 ; ID_Continue # Mc [2] CHAKMA VOWEL SIGN AA..CHAKMA VOWEL SIGN EI 11147 ; ID_Continue # Lo CHAKMA LETTER VAA 11150..11172 ; ID_Continue # Lo [35] MAHAJANI LETTER A..MAHAJANI LETTER RRA 11173 ; ID_Continue # Mn MAHAJANI SIGN NUKTA 11176 ; ID_Continue # Lo MAHAJANI LIGATURE SHRI 11180..11181 ; ID_Continue # Mn [2] SHARADA SIGN CANDRABINDU..SHARADA SIGN ANUSVARA 11182 ; ID_Continue # Mc SHARADA SIGN VISARGA 11183..111B2 ; ID_Continue # Lo [48] SHARADA LETTER A..SHARADA LETTER HA 111B3..111B5 ; ID_Continue # Mc [3] SHARADA VOWEL SIGN AA..SHARADA VOWEL SIGN II 111B6..111BE ; ID_Continue # Mn [9] SHARADA VOWEL SIGN U..SHARADA VOWEL SIGN O 111BF..111C0 ; ID_Continue # Mc [2] SHARADA VOWEL SIGN AU..SHARADA SIGN VIRAMA 111C1..111C4 ; ID_Continue # Lo [4] SHARADA SIGN AVAGRAHA..SHARADA OM 111C9..111CC ; ID_Continue # Mn [4] 
SHARADA SANDHI MARK..SHARADA EXTRA SHORT VOWEL MARK 111CE ; ID_Continue # Mc SHARADA VOWEL SIGN PRISHTHAMATRA E 111CF ; ID_Continue # Mn SHARADA SIGN INVERTED CANDRABINDU 111D0..111D9 ; ID_Continue # Nd [10] SHARADA DIGIT ZERO..SHARADA DIGIT NINE 111DA ; ID_Continue # Lo SHARADA EKAM 111DC ; ID_Continue # Lo SHARADA HEADSTROKE 11200..11211 ; ID_Continue # Lo [18] KHOJKI LETTER A..KHOJKI LETTER JJA 11213..1122B ; ID_Continue # Lo [25] KHOJKI LETTER NYA..KHOJKI LETTER LLA 1122C..1122E ; ID_Continue # Mc [3] KHOJKI VOWEL SIGN AA..KHOJKI VOWEL SIGN II 1122F..11231 ; ID_Continue # Mn [3] KHOJKI VOWEL SIGN U..KHOJKI VOWEL SIGN AI 11232..11233 ; ID_Continue # Mc [2] KHOJKI VOWEL SIGN O..KHOJKI VOWEL SIGN AU 11234 ; ID_Continue # Mn KHOJKI SIGN ANUSVARA 11235 ; ID_Continue # Mc KHOJKI SIGN VIRAMA 11236..11237 ; ID_Continue # Mn [2] KHOJKI SIGN NUKTA..KHOJKI SIGN SHADDA 1123E ; ID_Continue # Mn KHOJKI SIGN SUKUN 1123F..11240 ; ID_Continue # Lo [2] KHOJKI LETTER QA..KHOJKI LETTER SHORT I 11241 ; ID_Continue # Mn KHOJKI VOWEL SIGN VOCALIC R 11280..11286 ; ID_Continue # Lo [7] MULTANI LETTER A..MULTANI LETTER GA 11288 ; ID_Continue # Lo MULTANI LETTER GHA 1128A..1128D ; ID_Continue # Lo [4] MULTANI LETTER CA..MULTANI LETTER JJA 1128F..1129D ; ID_Continue # Lo [15] MULTANI LETTER NYA..MULTANI LETTER BA 1129F..112A8 ; ID_Continue # Lo [10] MULTANI LETTER BHA..MULTANI LETTER RHA 112B0..112DE ; ID_Continue # Lo [47] KHUDAWADI LETTER A..KHUDAWADI LETTER HA 112DF ; ID_Continue # Mn KHUDAWADI SIGN ANUSVARA 112E0..112E2 ; ID_Continue # Mc [3] KHUDAWADI VOWEL SIGN AA..KHUDAWADI VOWEL SIGN II 112E3..112EA ; ID_Continue # Mn [8] KHUDAWADI VOWEL SIGN U..KHUDAWADI SIGN VIRAMA 112F0..112F9 ; ID_Continue # Nd [10] KHUDAWADI DIGIT ZERO..KHUDAWADI DIGIT NINE 11300..11301 ; ID_Continue # Mn [2] GRANTHA SIGN COMBINING ANUSVARA ABOVE..GRANTHA SIGN CANDRABINDU 11302..11303 ; ID_Continue # Mc [2] GRANTHA SIGN ANUSVARA..GRANTHA SIGN VISARGA 11305..1130C ; ID_Continue # Lo [8] GRANTHA LETTER 
A..GRANTHA LETTER VOCALIC L 1130F..11310 ; ID_Continue # Lo [2] GRANTHA LETTER EE..GRANTHA LETTER AI 11313..11328 ; ID_Continue # Lo [22] GRANTHA LETTER OO..GRANTHA LETTER NA 1132A..11330 ; ID_Continue # Lo [7] GRANTHA LETTER PA..GRANTHA LETTER RA 11332..11333 ; ID_Continue # Lo [2] GRANTHA LETTER LA..GRANTHA LETTER LLA 11335..11339 ; ID_Continue # Lo [5] GRANTHA LETTER VA..GRANTHA LETTER HA 1133B..1133C ; ID_Continue # Mn [2] COMBINING BINDU BELOW..GRANTHA SIGN NUKTA 1133D ; ID_Continue # Lo GRANTHA SIGN AVAGRAHA 1133E..1133F ; ID_Continue # Mc [2] GRANTHA VOWEL SIGN AA..GRANTHA VOWEL SIGN I 11340 ; ID_Continue # Mn GRANTHA VOWEL SIGN II 11341..11344 ; ID_Continue # Mc [4] GRANTHA VOWEL SIGN U..GRANTHA VOWEL SIGN VOCALIC RR 11347..11348 ; ID_Continue # Mc [2] GRANTHA VOWEL SIGN EE..GRANTHA VOWEL SIGN AI 1134B..1134D ; ID_Continue # Mc [3] GRANTHA VOWEL SIGN OO..GRANTHA SIGN VIRAMA 11350 ; ID_Continue # Lo GRANTHA OM 11357 ; ID_Continue # Mc GRANTHA AU LENGTH MARK 1135D..11361 ; ID_Continue # Lo [5] GRANTHA SIGN PLUTA..GRANTHA LETTER VOCALIC LL 11362..11363 ; ID_Continue # Mc [2] GRANTHA VOWEL SIGN VOCALIC L..GRANTHA VOWEL SIGN VOCALIC LL 11366..1136C ; ID_Continue # Mn [7] COMBINING GRANTHA DIGIT ZERO..COMBINING GRANTHA DIGIT SIX 11370..11374 ; ID_Continue # Mn [5] COMBINING GRANTHA LETTER A..COMBINING GRANTHA LETTER PA 11380..11389 ; ID_Continue # Lo [10] TULU-TIGALARI LETTER A..TULU-TIGALARI LETTER VOCALIC LL 1138B ; ID_Continue # Lo TULU-TIGALARI LETTER EE 1138E ; ID_Continue # Lo TULU-TIGALARI LETTER AI 11390..113B5 ; ID_Continue # Lo [38] TULU-TIGALARI LETTER OO..TULU-TIGALARI LETTER LLLA 113B7 ; ID_Continue # Lo TULU-TIGALARI SIGN AVAGRAHA 113B8..113BA ; ID_Continue # Mc [3] TULU-TIGALARI VOWEL SIGN AA..TULU-TIGALARI VOWEL SIGN II 113BB..113C0 ; ID_Continue # Mn [6] TULU-TIGALARI VOWEL SIGN U..TULU-TIGALARI VOWEL SIGN VOCALIC LL 113C2 ; ID_Continue # Mc TULU-TIGALARI VOWEL SIGN EE 113C5 ; ID_Continue # Mc TULU-TIGALARI VOWEL SIGN AI 113C7..113CA ; 
ID_Continue # Mc [4] TULU-TIGALARI VOWEL SIGN OO..TULU-TIGALARI SIGN CANDRA ANUNASIKA 113CC..113CD ; ID_Continue # Mc [2] TULU-TIGALARI SIGN ANUSVARA..TULU-TIGALARI SIGN VISARGA 113CE ; ID_Continue # Mn TULU-TIGALARI SIGN VIRAMA 113CF ; ID_Continue # Mc TULU-TIGALARI SIGN LOOPED VIRAMA 113D0 ; ID_Continue # Mn TULU-TIGALARI CONJOINER 113D1 ; ID_Continue # Lo TULU-TIGALARI REPHA 113D2 ; ID_Continue # Mn TULU-TIGALARI GEMINATION MARK 113D3 ; ID_Continue # Lo TULU-TIGALARI SIGN PLUTA 113E1..113E2 ; ID_Continue # Mn [2] TULU-TIGALARI VEDIC TONE SVARITA..TULU-TIGALARI VEDIC TONE ANUDATTA 11400..11434 ; ID_Continue # Lo [53] NEWA LETTER A..NEWA LETTER HA 11435..11437 ; ID_Continue # Mc [3] NEWA VOWEL SIGN AA..NEWA VOWEL SIGN II 11438..1143F ; ID_Continue # Mn [8] NEWA VOWEL SIGN U..NEWA VOWEL SIGN AI 11440..11441 ; ID_Continue # Mc [2] NEWA VOWEL SIGN O..NEWA VOWEL SIGN AU 11442..11444 ; ID_Continue # Mn [3] NEWA SIGN VIRAMA..NEWA SIGN ANUSVARA 11445 ; ID_Continue # Mc NEWA SIGN VISARGA 11446 ; ID_Continue # Mn NEWA SIGN NUKTA 11447..1144A ; ID_Continue # Lo [4] NEWA SIGN AVAGRAHA..NEWA SIDDHI 11450..11459 ; ID_Continue # Nd [10] NEWA DIGIT ZERO..NEWA DIGIT NINE 1145E ; ID_Continue # Mn NEWA SANDHI MARK 1145F..11461 ; ID_Continue # Lo [3] NEWA LETTER VEDIC ANUSVARA..NEWA SIGN UPADHMANIYA 11480..114AF ; ID_Continue # Lo [48] TIRHUTA ANJI..TIRHUTA LETTER HA 114B0..114B2 ; ID_Continue # Mc [3] TIRHUTA VOWEL SIGN AA..TIRHUTA VOWEL SIGN II 114B3..114B8 ; ID_Continue # Mn [6] TIRHUTA VOWEL SIGN U..TIRHUTA VOWEL SIGN VOCALIC LL 114B9 ; ID_Continue # Mc TIRHUTA VOWEL SIGN E 114BA ; ID_Continue # Mn TIRHUTA VOWEL SIGN SHORT E 114BB..114BE ; ID_Continue # Mc [4] TIRHUTA VOWEL SIGN AI..TIRHUTA VOWEL SIGN AU 114BF..114C0 ; ID_Continue # Mn [2] TIRHUTA SIGN CANDRABINDU..TIRHUTA SIGN ANUSVARA 114C1 ; ID_Continue # Mc TIRHUTA SIGN VISARGA 114C2..114C3 ; ID_Continue # Mn [2] TIRHUTA SIGN VIRAMA..TIRHUTA SIGN NUKTA 114C4..114C5 ; ID_Continue # Lo [2] TIRHUTA SIGN AVAGRAHA..TIRHUTA GVANG 
114C7 ; ID_Continue # Lo TIRHUTA OM 114D0..114D9 ; ID_Continue # Nd [10] TIRHUTA DIGIT ZERO..TIRHUTA DIGIT NINE 11580..115AE ; ID_Continue # Lo [47] SIDDHAM LETTER A..SIDDHAM LETTER HA 115AF..115B1 ; ID_Continue # Mc [3] SIDDHAM VOWEL SIGN AA..SIDDHAM VOWEL SIGN II 115B2..115B5 ; ID_Continue # Mn [4] SIDDHAM VOWEL SIGN U..SIDDHAM VOWEL SIGN VOCALIC RR 115B8..115BB ; ID_Continue # Mc [4] SIDDHAM VOWEL SIGN E..SIDDHAM VOWEL SIGN AU 115BC..115BD ; ID_Continue # Mn [2] SIDDHAM SIGN CANDRABINDU..SIDDHAM SIGN ANUSVARA 115BE ; ID_Continue # Mc SIDDHAM SIGN VISARGA 115BF..115C0 ; ID_Continue # Mn [2] SIDDHAM SIGN VIRAMA..SIDDHAM SIGN NUKTA 115D8..115DB ; ID_Continue # Lo [4] SIDDHAM LETTER THREE-CIRCLE ALTERNATE I..SIDDHAM LETTER ALTERNATE U 115DC..115DD ; ID_Continue # Mn [2] SIDDHAM VOWEL SIGN ALTERNATE U..SIDDHAM VOWEL SIGN ALTERNATE UU 11600..1162F ; ID_Continue # Lo [48] MODI LETTER A..MODI LETTER LLA 11630..11632 ; ID_Continue # Mc [3] MODI VOWEL SIGN AA..MODI VOWEL SIGN II 11633..1163A ; ID_Continue # Mn [8] MODI VOWEL SIGN U..MODI VOWEL SIGN AI 1163B..1163C ; ID_Continue # Mc [2] MODI VOWEL SIGN O..MODI VOWEL SIGN AU 1163D ; ID_Continue # Mn MODI SIGN ANUSVARA 1163E ; ID_Continue # Mc MODI SIGN VISARGA 1163F..11640 ; ID_Continue # Mn [2] MODI SIGN VIRAMA..MODI SIGN ARDHACANDRA 11644 ; ID_Continue # Lo MODI SIGN HUVA 11650..11659 ; ID_Continue # Nd [10] MODI DIGIT ZERO..MODI DIGIT NINE 11680..116AA ; ID_Continue # Lo [43] TAKRI LETTER A..TAKRI LETTER RRA 116AB ; ID_Continue # Mn TAKRI SIGN ANUSVARA 116AC ; ID_Continue # Mc TAKRI SIGN VISARGA 116AD ; ID_Continue # Mn TAKRI VOWEL SIGN AA 116AE..116AF ; ID_Continue # Mc [2] TAKRI VOWEL SIGN I..TAKRI VOWEL SIGN II 116B0..116B5 ; ID_Continue # Mn [6] TAKRI VOWEL SIGN U..TAKRI VOWEL SIGN AU 116B6 ; ID_Continue # Mc TAKRI SIGN VIRAMA 116B7 ; ID_Continue # Mn TAKRI SIGN NUKTA 116B8 ; ID_Continue # Lo TAKRI LETTER ARCHAIC KHA 116C0..116C9 ; ID_Continue # Nd [10] TAKRI DIGIT ZERO..TAKRI DIGIT NINE 116D0..116E3 ; ID_Continue # 
Nd [20] MYANMAR PAO DIGIT ZERO..MYANMAR EASTERN PWO KAREN DIGIT NINE 11700..1171A ; ID_Continue # Lo [27] AHOM LETTER KA..AHOM LETTER ALTERNATE BA 1171D ; ID_Continue # Mn AHOM CONSONANT SIGN MEDIAL LA 1171E ; ID_Continue # Mc AHOM CONSONANT SIGN MEDIAL RA 1171F ; ID_Continue # Mn AHOM CONSONANT SIGN MEDIAL LIGATING RA 11720..11721 ; ID_Continue # Mc [2] AHOM VOWEL SIGN A..AHOM VOWEL SIGN AA 11722..11725 ; ID_Continue # Mn [4] AHOM VOWEL SIGN I..AHOM VOWEL SIGN UU 11726 ; ID_Continue # Mc AHOM VOWEL SIGN E 11727..1172B ; ID_Continue # Mn [5] AHOM VOWEL SIGN AW..AHOM SIGN KILLER 11730..11739 ; ID_Continue # Nd [10] AHOM DIGIT ZERO..AHOM DIGIT NINE 11740..11746 ; ID_Continue # Lo [7] AHOM LETTER CA..AHOM LETTER LLA 11800..1182B ; ID_Continue # Lo [44] DOGRA LETTER A..DOGRA LETTER RRA 1182C..1182E ; ID_Continue # Mc [3] DOGRA VOWEL SIGN AA..DOGRA VOWEL SIGN II 1182F..11837 ; ID_Continue # Mn [9] DOGRA VOWEL SIGN U..DOGRA SIGN ANUSVARA 11838 ; ID_Continue # Mc DOGRA SIGN VISARGA 11839..1183A ; ID_Continue # Mn [2] DOGRA SIGN VIRAMA..DOGRA SIGN NUKTA 118A0..118DF ; ID_Continue # L& [64] WARANG CITI CAPITAL LETTER NGAA..WARANG CITI SMALL LETTER VIYO 118E0..118E9 ; ID_Continue # Nd [10] WARANG CITI DIGIT ZERO..WARANG CITI DIGIT NINE 118FF..11906 ; ID_Continue # Lo [8] WARANG CITI OM..DIVES AKURU LETTER E 11909 ; ID_Continue # Lo DIVES AKURU LETTER O 1190C..11913 ; ID_Continue # Lo [8] DIVES AKURU LETTER KA..DIVES AKURU LETTER JA 11915..11916 ; ID_Continue # Lo [2] DIVES AKURU LETTER NYA..DIVES AKURU LETTER TTA 11918..1192F ; ID_Continue # Lo [24] DIVES AKURU LETTER DDA..DIVES AKURU LETTER ZA 11930..11935 ; ID_Continue # Mc [6] DIVES AKURU VOWEL SIGN AA..DIVES AKURU VOWEL SIGN E 11937..11938 ; ID_Continue # Mc [2] DIVES AKURU VOWEL SIGN AI..DIVES AKURU VOWEL SIGN O 1193B..1193C ; ID_Continue # Mn [2] DIVES AKURU SIGN ANUSVARA..DIVES AKURU SIGN CANDRABINDU 1193D ; ID_Continue # Mc DIVES AKURU SIGN HALANTA 1193E ; ID_Continue # Mn DIVES AKURU VIRAMA 1193F ; ID_Continue # Lo 
DIVES AKURU PREFIXED NASAL SIGN 11940 ; ID_Continue # Mc DIVES AKURU MEDIAL YA 11941 ; ID_Continue # Lo DIVES AKURU INITIAL RA 11942 ; ID_Continue # Mc DIVES AKURU MEDIAL RA 11943 ; ID_Continue # Mn DIVES AKURU SIGN NUKTA 11950..11959 ; ID_Continue # Nd [10] DIVES AKURU DIGIT ZERO..DIVES AKURU DIGIT NINE 119A0..119A7 ; ID_Continue # Lo [8] NANDINAGARI LETTER A..NANDINAGARI LETTER VOCALIC RR 119AA..119D0 ; ID_Continue # Lo [39] NANDINAGARI LETTER E..NANDINAGARI LETTER RRA 119D1..119D3 ; ID_Continue # Mc [3] NANDINAGARI VOWEL SIGN AA..NANDINAGARI VOWEL SIGN II 119D4..119D7 ; ID_Continue # Mn [4] NANDINAGARI VOWEL SIGN U..NANDINAGARI VOWEL SIGN VOCALIC RR 119DA..119DB ; ID_Continue # Mn [2] NANDINAGARI VOWEL SIGN E..NANDINAGARI VOWEL SIGN AI 119DC..119DF ; ID_Continue # Mc [4] NANDINAGARI VOWEL SIGN O..NANDINAGARI SIGN VISARGA 119E0 ; ID_Continue # Mn NANDINAGARI SIGN VIRAMA 119E1 ; ID_Continue # Lo NANDINAGARI SIGN AVAGRAHA 119E3 ; ID_Continue # Lo NANDINAGARI HEADSTROKE 119E4 ; ID_Continue # Mc NANDINAGARI VOWEL SIGN PRISHTHAMATRA E 11A00 ; ID_Continue # Lo ZANABAZAR SQUARE LETTER A 11A01..11A0A ; ID_Continue # Mn [10] ZANABAZAR SQUARE VOWEL SIGN I..ZANABAZAR SQUARE VOWEL LENGTH MARK 11A0B..11A32 ; ID_Continue # Lo [40] ZANABAZAR SQUARE LETTER KA..ZANABAZAR SQUARE LETTER KSSA 11A33..11A38 ; ID_Continue # Mn [6] ZANABAZAR SQUARE FINAL CONSONANT MARK..ZANABAZAR SQUARE SIGN ANUSVARA 11A39 ; ID_Continue # Mc ZANABAZAR SQUARE SIGN VISARGA 11A3A ; ID_Continue # Lo ZANABAZAR SQUARE CLUSTER-INITIAL LETTER RA 11A3B..11A3E ; ID_Continue # Mn [4] ZANABAZAR SQUARE CLUSTER-FINAL LETTER YA..ZANABAZAR SQUARE CLUSTER-FINAL LETTER VA 11A47 ; ID_Continue # Mn ZANABAZAR SQUARE SUBJOINER 11A50 ; ID_Continue # Lo SOYOMBO LETTER A 11A51..11A56 ; ID_Continue # Mn [6] SOYOMBO VOWEL SIGN I..SOYOMBO VOWEL SIGN OE 11A57..11A58 ; ID_Continue # Mc [2] SOYOMBO VOWEL SIGN AI..SOYOMBO VOWEL SIGN AU 11A59..11A5B ; ID_Continue # Mn [3] SOYOMBO VOWEL SIGN VOCALIC R..SOYOMBO VOWEL LENGTH MARK 
11A5C..11A89 ; ID_Continue # Lo [46] SOYOMBO LETTER KA..SOYOMBO CLUSTER-INITIAL LETTER SA 11A8A..11A96 ; ID_Continue # Mn [13] SOYOMBO FINAL CONSONANT SIGN G..SOYOMBO SIGN ANUSVARA 11A97 ; ID_Continue # Mc SOYOMBO SIGN VISARGA 11A98..11A99 ; ID_Continue # Mn [2] SOYOMBO GEMINATION MARK..SOYOMBO SUBJOINER 11A9D ; ID_Continue # Lo SOYOMBO MARK PLUTA 11AB0..11AF8 ; ID_Continue # Lo [73] CANADIAN SYLLABICS NATTILIK HI..PAU CIN HAU GLOTTAL STOP FINAL 11B60 ; ID_Continue # Mn SHARADA VOWEL SIGN OE 11B61 ; ID_Continue # Mc SHARADA VOWEL SIGN OOE 11B62..11B64 ; ID_Continue # Mn [3] SHARADA VOWEL SIGN UE..SHARADA VOWEL SIGN SHORT E 11B65 ; ID_Continue # Mc SHARADA VOWEL SIGN SHORT O 11B66 ; ID_Continue # Mn SHARADA VOWEL SIGN CANDRA E 11B67 ; ID_Continue # Mc SHARADA VOWEL SIGN CANDRA O 11BC0..11BE0 ; ID_Continue # Lo [33] SUNUWAR LETTER DEVI..SUNUWAR LETTER KLOKO 11BF0..11BF9 ; ID_Continue # Nd [10] SUNUWAR DIGIT ZERO..SUNUWAR DIGIT NINE 11C00..11C08 ; ID_Continue # Lo [9] BHAIKSUKI LETTER A..BHAIKSUKI LETTER VOCALIC L 11C0A..11C2E ; ID_Continue # Lo [37] BHAIKSUKI LETTER E..BHAIKSUKI LETTER HA 11C2F ; ID_Continue # Mc BHAIKSUKI VOWEL SIGN AA 11C30..11C36 ; ID_Continue # Mn [7] BHAIKSUKI VOWEL SIGN I..BHAIKSUKI VOWEL SIGN VOCALIC L 11C38..11C3D ; ID_Continue # Mn [6] BHAIKSUKI VOWEL SIGN E..BHAIKSUKI SIGN ANUSVARA 11C3E ; ID_Continue # Mc BHAIKSUKI SIGN VISARGA 11C3F ; ID_Continue # Mn BHAIKSUKI SIGN VIRAMA 11C40 ; ID_Continue # Lo BHAIKSUKI SIGN AVAGRAHA 11C50..11C59 ; ID_Continue # Nd [10] BHAIKSUKI DIGIT ZERO..BHAIKSUKI DIGIT NINE 11C72..11C8F ; ID_Continue # Lo [30] MARCHEN LETTER KA..MARCHEN LETTER A 11C92..11CA7 ; ID_Continue # Mn [22] MARCHEN SUBJOINED LETTER KA..MARCHEN SUBJOINED LETTER ZA 11CA9 ; ID_Continue # Mc MARCHEN SUBJOINED LETTER YA 11CAA..11CB0 ; ID_Continue # Mn [7] MARCHEN SUBJOINED LETTER RA..MARCHEN VOWEL SIGN AA 11CB1 ; ID_Continue # Mc MARCHEN VOWEL SIGN I 11CB2..11CB3 ; ID_Continue # Mn [2] MARCHEN VOWEL SIGN U..MARCHEN VOWEL SIGN E 11CB4 ; 
ID_Continue # Mc MARCHEN VOWEL SIGN O 11CB5..11CB6 ; ID_Continue # Mn [2] MARCHEN SIGN ANUSVARA..MARCHEN SIGN CANDRABINDU 11D00..11D06 ; ID_Continue # Lo [7] MASARAM GONDI LETTER A..MASARAM GONDI LETTER E 11D08..11D09 ; ID_Continue # Lo [2] MASARAM GONDI LETTER AI..MASARAM GONDI LETTER O 11D0B..11D30 ; ID_Continue # Lo [38] MASARAM GONDI LETTER AU..MASARAM GONDI LETTER TRA 11D31..11D36 ; ID_Continue # Mn [6] MASARAM GONDI VOWEL SIGN AA..MASARAM GONDI VOWEL SIGN VOCALIC R 11D3A ; ID_Continue # Mn MASARAM GONDI VOWEL SIGN E 11D3C..11D3D ; ID_Continue # Mn [2] MASARAM GONDI VOWEL SIGN AI..MASARAM GONDI VOWEL SIGN O 11D3F..11D45 ; ID_Continue # Mn [7] MASARAM GONDI VOWEL SIGN AU..MASARAM GONDI VIRAMA 11D46 ; ID_Continue # Lo MASARAM GONDI REPHA 11D47 ; ID_Continue # Mn MASARAM GONDI RA-KARA 11D50..11D59 ; ID_Continue # Nd [10] MASARAM GONDI DIGIT ZERO..MASARAM GONDI DIGIT NINE 11D60..11D65 ; ID_Continue # Lo [6] GUNJALA GONDI LETTER A..GUNJALA GONDI LETTER UU 11D67..11D68 ; ID_Continue # Lo [2] GUNJALA GONDI LETTER EE..GUNJALA GONDI LETTER AI 11D6A..11D89 ; ID_Continue # Lo [32] GUNJALA GONDI LETTER OO..GUNJALA GONDI LETTER SA 11D8A..11D8E ; ID_Continue # Mc [5] GUNJALA GONDI VOWEL SIGN AA..GUNJALA GONDI VOWEL SIGN UU 11D90..11D91 ; ID_Continue # Mn [2] GUNJALA GONDI VOWEL SIGN EE..GUNJALA GONDI VOWEL SIGN AI 11D93..11D94 ; ID_Continue # Mc [2] GUNJALA GONDI VOWEL SIGN OO..GUNJALA GONDI VOWEL SIGN AU 11D95 ; ID_Continue # Mn GUNJALA GONDI SIGN ANUSVARA 11D96 ; ID_Continue # Mc GUNJALA GONDI SIGN VISARGA 11D97 ; ID_Continue # Mn GUNJALA GONDI VIRAMA 11D98 ; ID_Continue # Lo GUNJALA GONDI OM 11DA0..11DA9 ; ID_Continue # Nd [10] GUNJALA GONDI DIGIT ZERO..GUNJALA GONDI DIGIT NINE 11DB0..11DD8 ; ID_Continue # Lo [41] TOLONG SIKI LETTER I..TOLONG SIKI LETTER RRH 11DD9 ; ID_Continue # Lm TOLONG SIKI SIGN SELA 11DDA..11DDB ; ID_Continue # Lo [2] TOLONG SIKI SIGN HECAKA..TOLONG SIKI UNGGA 11DE0..11DE9 ; ID_Continue # Nd [10] TOLONG SIKI DIGIT ZERO..TOLONG SIKI DIGIT NINE 
11EE0..11EF2 ; ID_Continue # Lo [19] MAKASAR LETTER KA..MAKASAR ANGKA 11EF3..11EF4 ; ID_Continue # Mn [2] MAKASAR VOWEL SIGN I..MAKASAR VOWEL SIGN U 11EF5..11EF6 ; ID_Continue # Mc [2] MAKASAR VOWEL SIGN E..MAKASAR VOWEL SIGN O 11F00..11F01 ; ID_Continue # Mn [2] KAWI SIGN CANDRABINDU..KAWI SIGN ANUSVARA 11F02 ; ID_Continue # Lo KAWI SIGN REPHA 11F03 ; ID_Continue # Mc KAWI SIGN VISARGA 11F04..11F10 ; ID_Continue # Lo [13] KAWI LETTER A..KAWI LETTER O 11F12..11F33 ; ID_Continue # Lo [34] KAWI LETTER KA..KAWI LETTER JNYA 11F34..11F35 ; ID_Continue # Mc [2] KAWI VOWEL SIGN AA..KAWI VOWEL SIGN ALTERNATE AA 11F36..11F3A ; ID_Continue # Mn [5] KAWI VOWEL SIGN I..KAWI VOWEL SIGN VOCALIC R 11F3E..11F3F ; ID_Continue # Mc [2] KAWI VOWEL SIGN E..KAWI VOWEL SIGN AI 11F40 ; ID_Continue # Mn KAWI VOWEL SIGN EU 11F41 ; ID_Continue # Mc KAWI SIGN KILLER 11F42 ; ID_Continue # Mn KAWI CONJOINER 11F50..11F59 ; ID_Continue # Nd [10] KAWI DIGIT ZERO..KAWI DIGIT NINE 11F5A ; ID_Continue # Mn KAWI SIGN NUKTA 11FB0 ; ID_Continue # Lo LISU LETTER YHA 12000..12399 ; ID_Continue # Lo [922] CUNEIFORM SIGN A..CUNEIFORM SIGN U U 12400..1246E ; ID_Continue # Nl [111] CUNEIFORM NUMERIC SIGN TWO ASH..CUNEIFORM NUMERIC SIGN NINE U VARIANT FORM 12480..12543 ; ID_Continue # Lo [196] CUNEIFORM SIGN AB TIMES NUN TENU..CUNEIFORM SIGN ZU5 TIMES THREE DISH TENU 12F90..12FF0 ; ID_Continue # Lo [97] CYPRO-MINOAN SIGN CM001..CYPRO-MINOAN SIGN CM114 13000..1342F ; ID_Continue # Lo [1072] EGYPTIAN HIEROGLYPH A001..EGYPTIAN HIEROGLYPH V011D 13440 ; ID_Continue # Mn EGYPTIAN HIEROGLYPH MIRROR HORIZONTALLY 13441..13446 ; ID_Continue # Lo [6] EGYPTIAN HIEROGLYPH FULL BLANK..EGYPTIAN HIEROGLYPH WIDE LOST SIGN 13447..13455 ; ID_Continue # Mn [15] EGYPTIAN HIEROGLYPH MODIFIER DAMAGED AT TOP START..EGYPTIAN HIEROGLYPH MODIFIER DAMAGED 13460..143FA ; ID_Continue # Lo [3995] EGYPTIAN HIEROGLYPH-13460..EGYPTIAN HIEROGLYPH-143FA 14400..14646 ; ID_Continue # Lo [583] ANATOLIAN HIEROGLYPH A001..ANATOLIAN HIEROGLYPH A530 
16100..1611D ; ID_Continue # Lo [30] GURUNG KHEMA LETTER A..GURUNG KHEMA LETTER SA 1611E..16129 ; ID_Continue # Mn [12] GURUNG KHEMA VOWEL SIGN AA..GURUNG KHEMA VOWEL LENGTH MARK 1612A..1612C ; ID_Continue # Mc [3] GURUNG KHEMA CONSONANT SIGN MEDIAL YA..GURUNG KHEMA CONSONANT SIGN MEDIAL HA 1612D..1612F ; ID_Continue # Mn [3] GURUNG KHEMA SIGN ANUSVARA..GURUNG KHEMA SIGN THOLHOMA 16130..16139 ; ID_Continue # Nd [10] GURUNG KHEMA DIGIT ZERO..GURUNG KHEMA DIGIT NINE 16800..16A38 ; ID_Continue # Lo [569] BAMUM LETTER PHASE-A NGKUE MFON..BAMUM LETTER PHASE-F VUEQ 16A40..16A5E ; ID_Continue # Lo [31] MRO LETTER TA..MRO LETTER TEK 16A60..16A69 ; ID_Continue # Nd [10] MRO DIGIT ZERO..MRO DIGIT NINE 16A70..16ABE ; ID_Continue # Lo [79] TANGSA LETTER OZ..TANGSA LETTER ZA 16AC0..16AC9 ; ID_Continue # Nd [10] TANGSA DIGIT ZERO..TANGSA DIGIT NINE 16AD0..16AED ; ID_Continue # Lo [30] BASSA VAH LETTER ENNI..BASSA VAH LETTER I 16AF0..16AF4 ; ID_Continue # Mn [5] BASSA VAH COMBINING HIGH TONE..BASSA VAH COMBINING HIGH-LOW TONE 16B00..16B2F ; ID_Continue # Lo [48] PAHAWH HMONG VOWEL KEEB..PAHAWH HMONG CONSONANT CAU 16B30..16B36 ; ID_Continue # Mn [7] PAHAWH HMONG MARK CIM TUB..PAHAWH HMONG MARK CIM TAUM 16B40..16B43 ; ID_Continue # Lm [4] PAHAWH HMONG SIGN VOS SEEV..PAHAWH HMONG SIGN IB YAM 16B50..16B59 ; ID_Continue # Nd [10] PAHAWH HMONG DIGIT ZERO..PAHAWH HMONG DIGIT NINE 16B63..16B77 ; ID_Continue # Lo [21] PAHAWH HMONG SIGN VOS LUB..PAHAWH HMONG SIGN CIM NRES TOS 16B7D..16B8F ; ID_Continue # Lo [19] PAHAWH HMONG CLAN SIGN TSHEEJ..PAHAWH HMONG CLAN SIGN VWJ 16D40..16D42 ; ID_Continue # Lm [3] KIRAT RAI SIGN ANUSVARA..KIRAT RAI SIGN VISARGA 16D43..16D6A ; ID_Continue # Lo [40] KIRAT RAI LETTER A..KIRAT RAI VOWEL SIGN AU 16D6B..16D6C ; ID_Continue # Lm [2] KIRAT RAI SIGN VIRAMA..KIRAT RAI SIGN SAAT 16D70..16D79 ; ID_Continue # Nd [10] KIRAT RAI DIGIT ZERO..KIRAT RAI DIGIT NINE 16E40..16E7F ; ID_Continue # L& [64] MEDEFAIDRIN CAPITAL LETTER M..MEDEFAIDRIN SMALL LETTER Y 
16EA0..16EB8 ; ID_Continue # L& [25] BERIA ERFE CAPITAL LETTER ARKAB..BERIA ERFE CAPITAL LETTER AY 16EBB..16ED3 ; ID_Continue # L& [25] BERIA ERFE SMALL LETTER ARKAB..BERIA ERFE SMALL LETTER AY 16F00..16F4A ; ID_Continue # Lo [75] MIAO LETTER PA..MIAO LETTER RTE 16F4F ; ID_Continue # Mn MIAO SIGN CONSONANT MODIFIER BAR 16F50 ; ID_Continue # Lo MIAO LETTER NASALIZATION 16F51..16F87 ; ID_Continue # Mc [55] MIAO SIGN ASPIRATION..MIAO VOWEL SIGN UI 16F8F..16F92 ; ID_Continue # Mn [4] MIAO TONE RIGHT..MIAO TONE BELOW 16F93..16F9F ; ID_Continue # Lm [13] MIAO LETTER TONE-2..MIAO LETTER REFORMED TONE-8 16FE0..16FE1 ; ID_Continue # Lm [2] TANGUT ITERATION MARK..NUSHU ITERATION MARK 16FE3 ; ID_Continue # Lm OLD CHINESE ITERATION MARK 16FE4 ; ID_Continue # Mn KHITAN SMALL SCRIPT FILLER 16FF0..16FF1 ; ID_Continue # Mc [2] VIETNAMESE ALTERNATE READING MARK CA..VIETNAMESE ALTERNATE READING MARK NHAY 16FF2..16FF3 ; ID_Continue # Lm [2] CHINESE SMALL SIMPLIFIED ER..CHINESE SMALL TRADITIONAL ER 16FF4..16FF6 ; ID_Continue # Nl [3] YANGQIN SIGN SLOW ONE BEAT..YANGQIN SIGN SLOW TWO BEATS 17000..18CD5 ; ID_Continue # Lo [7382] TANGUT IDEOGRAPH-17000..KHITAN SMALL SCRIPT CHARACTER-18CD5 18CFF..18D1E ; ID_Continue # Lo [32] KHITAN SMALL SCRIPT CHARACTER-18CFF..TANGUT IDEOGRAPH-18D1E 18D80..18DF2 ; ID_Continue # Lo [115] TANGUT COMPONENT-769..TANGUT COMPONENT-883 1AFF0..1AFF3 ; ID_Continue # Lm [4] KATAKANA LETTER MINNAN TONE-2..KATAKANA LETTER MINNAN TONE-5 1AFF5..1AFFB ; ID_Continue # Lm [7] KATAKANA LETTER MINNAN TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-5 1AFFD..1AFFE ; ID_Continue # Lm [2] KATAKANA LETTER MINNAN NASALIZED TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-8 1B000..1B122 ; ID_Continue # Lo [291] KATAKANA LETTER ARCHAIC E..KATAKANA LETTER ARCHAIC WU 1B132 ; ID_Continue # Lo HIRAGANA LETTER SMALL KO 1B150..1B152 ; ID_Continue # Lo [3] HIRAGANA LETTER SMALL WI..HIRAGANA LETTER SMALL WO 1B155 ; ID_Continue # Lo KATAKANA LETTER SMALL KO 1B164..1B167 ; ID_Continue # Lo [4] 
KATAKANA LETTER SMALL WI..KATAKANA LETTER SMALL N 1B170..1B2FB ; ID_Continue # Lo [396] NUSHU CHARACTER-1B170..NUSHU CHARACTER-1B2FB 1BC00..1BC6A ; ID_Continue # Lo [107] DUPLOYAN LETTER H..DUPLOYAN LETTER VOCALIC M 1BC70..1BC7C ; ID_Continue # Lo [13] DUPLOYAN AFFIX LEFT HORIZONTAL SECANT..DUPLOYAN AFFIX ATTACHED TANGENT HOOK 1BC80..1BC88 ; ID_Continue # Lo [9] DUPLOYAN AFFIX HIGH ACUTE..DUPLOYAN AFFIX HIGH VERTICAL 1BC90..1BC99 ; ID_Continue # Lo [10] DUPLOYAN AFFIX LOW ACUTE..DUPLOYAN AFFIX LOW ARROW 1BC9D..1BC9E ; ID_Continue # Mn [2] DUPLOYAN THICK LETTER SELECTOR..DUPLOYAN DOUBLE MARK 1CCF0..1CCF9 ; ID_Continue # Nd [10] OUTLINED DIGIT ZERO..OUTLINED DIGIT NINE 1CF00..1CF2D ; ID_Continue # Mn [46] ZNAMENNY COMBINING MARK GORAZDO NIZKO S KRYZHEM ON LEFT..ZNAMENNY COMBINING MARK KRYZH ON LEFT 1CF30..1CF46 ; ID_Continue # Mn [23] ZNAMENNY COMBINING TONAL RANGE MARK MRACHNO..ZNAMENNY PRIZNAK MODIFIER ROG 1D165..1D166 ; ID_Continue # Mc [2] MUSICAL SYMBOL COMBINING STEM..MUSICAL SYMBOL COMBINING SPRECHGESANG STEM 1D167..1D169 ; ID_Continue # Mn [3] MUSICAL SYMBOL COMBINING TREMOLO-1..MUSICAL SYMBOL COMBINING TREMOLO-3 1D16D..1D172 ; ID_Continue # Mc [6] MUSICAL SYMBOL COMBINING AUGMENTATION DOT..MUSICAL SYMBOL COMBINING FLAG-5 1D17B..1D182 ; ID_Continue # Mn [8] MUSICAL SYMBOL COMBINING ACCENT..MUSICAL SYMBOL COMBINING LOURE 1D185..1D18B ; ID_Continue # Mn [7] MUSICAL SYMBOL COMBINING DOIT..MUSICAL SYMBOL COMBINING TRIPLE TONGUE 1D1AA..1D1AD ; ID_Continue # Mn [4] MUSICAL SYMBOL COMBINING DOWN BOW..MUSICAL SYMBOL COMBINING SNAP PIZZICATO 1D242..1D244 ; ID_Continue # Mn [3] COMBINING GREEK MUSICAL TRISEME..COMBINING GREEK MUSICAL PENTASEME 1D400..1D454 ; ID_Continue # L& [85] MATHEMATICAL BOLD CAPITAL A..MATHEMATICAL ITALIC SMALL G 1D456..1D49C ; ID_Continue # L& [71] MATHEMATICAL ITALIC SMALL I..MATHEMATICAL SCRIPT CAPITAL A 1D49E..1D49F ; ID_Continue # L& [2] MATHEMATICAL SCRIPT CAPITAL C..MATHEMATICAL SCRIPT CAPITAL D 1D4A2 ; ID_Continue # L& MATHEMATICAL SCRIPT 
CAPITAL G 1D4A5..1D4A6 ; ID_Continue # L& [2] MATHEMATICAL SCRIPT CAPITAL J..MATHEMATICAL SCRIPT CAPITAL K 1D4A9..1D4AC ; ID_Continue # L& [4] MATHEMATICAL SCRIPT CAPITAL N..MATHEMATICAL SCRIPT CAPITAL Q 1D4AE..1D4B9 ; ID_Continue # L& [12] MATHEMATICAL SCRIPT CAPITAL S..MATHEMATICAL SCRIPT SMALL D 1D4BB ; ID_Continue # L& MATHEMATICAL SCRIPT SMALL F 1D4BD..1D4C3 ; ID_Continue # L& [7] MATHEMATICAL SCRIPT SMALL H..MATHEMATICAL SCRIPT SMALL N 1D4C5..1D505 ; ID_Continue # L& [65] MATHEMATICAL SCRIPT SMALL P..MATHEMATICAL FRAKTUR CAPITAL B 1D507..1D50A ; ID_Continue # L& [4] MATHEMATICAL FRAKTUR CAPITAL D..MATHEMATICAL FRAKTUR CAPITAL G 1D50D..1D514 ; ID_Continue # L& [8] MATHEMATICAL FRAKTUR CAPITAL J..MATHEMATICAL FRAKTUR CAPITAL Q 1D516..1D51C ; ID_Continue # L& [7] MATHEMATICAL FRAKTUR CAPITAL S..MATHEMATICAL FRAKTUR CAPITAL Y 1D51E..1D539 ; ID_Continue # L& [28] MATHEMATICAL FRAKTUR SMALL A..MATHEMATICAL DOUBLE-STRUCK CAPITAL B 1D53B..1D53E ; ID_Continue # L& [4] MATHEMATICAL DOUBLE-STRUCK CAPITAL D..MATHEMATICAL DOUBLE-STRUCK CAPITAL G 1D540..1D544 ; ID_Continue # L& [5] MATHEMATICAL DOUBLE-STRUCK CAPITAL I..MATHEMATICAL DOUBLE-STRUCK CAPITAL M 1D546 ; ID_Continue # L& MATHEMATICAL DOUBLE-STRUCK CAPITAL O 1D54A..1D550 ; ID_Continue # L& [7] MATHEMATICAL DOUBLE-STRUCK CAPITAL S..MATHEMATICAL DOUBLE-STRUCK CAPITAL Y 1D552..1D6A5 ; ID_Continue # L& [340] MATHEMATICAL DOUBLE-STRUCK SMALL A..MATHEMATICAL ITALIC SMALL DOTLESS J 1D6A8..1D6C0 ; ID_Continue # L& [25] MATHEMATICAL BOLD CAPITAL ALPHA..MATHEMATICAL BOLD CAPITAL OMEGA 1D6C2..1D6DA ; ID_Continue # L& [25] MATHEMATICAL BOLD SMALL ALPHA..MATHEMATICAL BOLD SMALL OMEGA 1D6DC..1D6FA ; ID_Continue # L& [31] MATHEMATICAL BOLD EPSILON SYMBOL..MATHEMATICAL ITALIC CAPITAL OMEGA 1D6FC..1D714 ; ID_Continue # L& [25] MATHEMATICAL ITALIC SMALL ALPHA..MATHEMATICAL ITALIC SMALL OMEGA 1D716..1D734 ; ID_Continue # L& [31] MATHEMATICAL ITALIC EPSILON SYMBOL..MATHEMATICAL BOLD ITALIC CAPITAL OMEGA 1D736..1D74E ; ID_Continue # L& 
[25] MATHEMATICAL BOLD ITALIC SMALL ALPHA..MATHEMATICAL BOLD ITALIC SMALL OMEGA 1D750..1D76E ; ID_Continue # L& [31] MATHEMATICAL BOLD ITALIC EPSILON SYMBOL..MATHEMATICAL SANS-SERIF BOLD CAPITAL OMEGA 1D770..1D788 ; ID_Continue # L& [25] MATHEMATICAL SANS-SERIF BOLD SMALL ALPHA..MATHEMATICAL SANS-SERIF BOLD SMALL OMEGA 1D78A..1D7A8 ; ID_Continue # L& [31] MATHEMATICAL SANS-SERIF BOLD EPSILON SYMBOL..MATHEMATICAL SANS-SERIF BOLD ITALIC CAPITAL OMEGA 1D7AA..1D7C2 ; ID_Continue # L& [25] MATHEMATICAL SANS-SERIF BOLD ITALIC SMALL ALPHA..MATHEMATICAL SANS-SERIF BOLD ITALIC SMALL OMEGA 1D7C4..1D7CB ; ID_Continue # L& [8] MATHEMATICAL SANS-SERIF BOLD ITALIC EPSILON SYMBOL..MATHEMATICAL BOLD SMALL DIGAMMA 1D7CE..1D7FF ; ID_Continue # Nd [50] MATHEMATICAL BOLD DIGIT ZERO..MATHEMATICAL MONOSPACE DIGIT NINE 1DA00..1DA36 ; ID_Continue # Mn [55] SIGNWRITING HEAD RIM..SIGNWRITING AIR SUCKING IN 1DA3B..1DA6C ; ID_Continue # Mn [50] SIGNWRITING MOUTH CLOSED NEUTRAL..SIGNWRITING EXCITEMENT 1DA75 ; ID_Continue # Mn SIGNWRITING UPPER BODY TILTING FROM HIP JOINTS 1DA84 ; ID_Continue # Mn SIGNWRITING LOCATION HEAD NECK 1DA9B..1DA9F ; ID_Continue # Mn [5] SIGNWRITING FILL MODIFIER-2..SIGNWRITING FILL MODIFIER-6 1DAA1..1DAAF ; ID_Continue # Mn [15] SIGNWRITING ROTATION MODIFIER-2..SIGNWRITING ROTATION MODIFIER-16 1DF00..1DF09 ; ID_Continue # L& [10] LATIN SMALL LETTER FENG DIGRAPH WITH TRILL..LATIN SMALL LETTER T WITH HOOK AND RETROFLEX HOOK 1DF0A ; ID_Continue # Lo LATIN LETTER RETROFLEX CLICK WITH RETROFLEX HOOK 1DF0B..1DF1E ; ID_Continue # L& [20] LATIN SMALL LETTER ESH WITH DOUBLE BAR..LATIN SMALL LETTER S WITH CURL 1DF25..1DF2A ; ID_Continue # L& [6] LATIN SMALL LETTER D WITH MID-HEIGHT LEFT HOOK..LATIN SMALL LETTER T WITH MID-HEIGHT LEFT HOOK 1E000..1E006 ; ID_Continue # Mn [7] COMBINING GLAGOLITIC LETTER AZU..COMBINING GLAGOLITIC LETTER ZHIVETE 1E008..1E018 ; ID_Continue # Mn [17] COMBINING GLAGOLITIC LETTER ZEMLJA..COMBINING GLAGOLITIC LETTER HERU 1E01B..1E021 ; ID_Continue # Mn 
[7] COMBINING GLAGOLITIC LETTER SHTA..COMBINING GLAGOLITIC LETTER YATI 1E023..1E024 ; ID_Continue # Mn [2] COMBINING GLAGOLITIC LETTER YU..COMBINING GLAGOLITIC LETTER SMALL YUS 1E026..1E02A ; ID_Continue # Mn [5] COMBINING GLAGOLITIC LETTER YO..COMBINING GLAGOLITIC LETTER FITA 1E030..1E06D ; ID_Continue # Lm [62] MODIFIER LETTER CYRILLIC SMALL A..MODIFIER LETTER CYRILLIC SMALL STRAIGHT U WITH STROKE 1E08F ; ID_Continue # Mn COMBINING CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I 1E100..1E12C ; ID_Continue # Lo [45] NYIAKENG PUACHUE HMONG LETTER MA..NYIAKENG PUACHUE HMONG LETTER W 1E130..1E136 ; ID_Continue # Mn [7] NYIAKENG PUACHUE HMONG TONE-B..NYIAKENG PUACHUE HMONG TONE-D 1E137..1E13D ; ID_Continue # Lm [7] NYIAKENG PUACHUE HMONG SIGN FOR PERSON..NYIAKENG PUACHUE HMONG SYLLABLE LENGTHENER 1E140..1E149 ; ID_Continue # Nd [10] NYIAKENG PUACHUE HMONG DIGIT ZERO..NYIAKENG PUACHUE HMONG DIGIT NINE 1E14E ; ID_Continue # Lo NYIAKENG PUACHUE HMONG LOGOGRAM NYAJ 1E290..1E2AD ; ID_Continue # Lo [30] TOTO LETTER PA..TOTO LETTER A 1E2AE ; ID_Continue # Mn TOTO SIGN RISING TONE 1E2C0..1E2EB ; ID_Continue # Lo [44] WANCHO LETTER AA..WANCHO LETTER YIH 1E2EC..1E2EF ; ID_Continue # Mn [4] WANCHO TONE TUP..WANCHO TONE KOINI 1E2F0..1E2F9 ; ID_Continue # Nd [10] WANCHO DIGIT ZERO..WANCHO DIGIT NINE 1E4D0..1E4EA ; ID_Continue # Lo [27] NAG MUNDARI LETTER O..NAG MUNDARI LETTER ELL 1E4EB ; ID_Continue # Lm NAG MUNDARI SIGN OJOD 1E4EC..1E4EF ; ID_Continue # Mn [4] NAG MUNDARI SIGN MUHOR..NAG MUNDARI SIGN SUTUH 1E4F0..1E4F9 ; ID_Continue # Nd [10] NAG MUNDARI DIGIT ZERO..NAG MUNDARI DIGIT NINE 1E5D0..1E5ED ; ID_Continue # Lo [30] OL ONAL LETTER O..OL ONAL LETTER EG 1E5EE..1E5EF ; ID_Continue # Mn [2] OL ONAL SIGN MU..OL ONAL SIGN IKIR 1E5F0 ; ID_Continue # Lo OL ONAL SIGN HODDOND 1E5F1..1E5FA ; ID_Continue # Nd [10] OL ONAL DIGIT ZERO..OL ONAL DIGIT NINE 1E6C0..1E6DE ; ID_Continue # Lo [31] TAI YO LETTER LOW KO..TAI YO LETTER HIGH KVO 1E6E0..1E6E2 ; ID_Continue # Lo [3] TAI YO LETTER 
AA..TAI YO LETTER UE 1E6E3 ; ID_Continue # Mn TAI YO SIGN UE 1E6E4..1E6E5 ; ID_Continue # Lo [2] TAI YO LETTER U..TAI YO LETTER AE 1E6E6 ; ID_Continue # Mn TAI YO SIGN AU 1E6E7..1E6ED ; ID_Continue # Lo [7] TAI YO LETTER O..TAI YO LETTER AUE 1E6EE..1E6EF ; ID_Continue # Mn [2] TAI YO SIGN AY..TAI YO SIGN ANG 1E6F0..1E6F4 ; ID_Continue # Lo [5] TAI YO LETTER AN..TAI YO LETTER AP 1E6F5 ; ID_Continue # Mn TAI YO SIGN OM 1E6FE ; ID_Continue # Lo TAI YO SYMBOL MUEANG 1E6FF ; ID_Continue # Lm TAI YO XAM LAI 1E7E0..1E7E6 ; ID_Continue # Lo [7] ETHIOPIC SYLLABLE HHYA..ETHIOPIC SYLLABLE HHYO 1E7E8..1E7EB ; ID_Continue # Lo [4] ETHIOPIC SYLLABLE GURAGE HHWA..ETHIOPIC SYLLABLE HHWE 1E7ED..1E7EE ; ID_Continue # Lo [2] ETHIOPIC SYLLABLE GURAGE MWI..ETHIOPIC SYLLABLE GURAGE MWEE 1E7F0..1E7FE ; ID_Continue # Lo [15] ETHIOPIC SYLLABLE GURAGE QWI..ETHIOPIC SYLLABLE GURAGE PWEE 1E800..1E8C4 ; ID_Continue # Lo [197] MENDE KIKAKUI SYLLABLE M001 KI..MENDE KIKAKUI SYLLABLE M060 NYON 1E8D0..1E8D6 ; ID_Continue # Mn [7] MENDE KIKAKUI COMBINING NUMBER TEENS..MENDE KIKAKUI COMBINING NUMBER MILLIONS 1E900..1E943 ; ID_Continue # L& [68] ADLAM CAPITAL LETTER ALIF..ADLAM SMALL LETTER SHA 1E944..1E94A ; ID_Continue # Mn [7] ADLAM ALIF LENGTHENER..ADLAM NUKTA 1E94B ; ID_Continue # Lm ADLAM NASALIZATION MARK 1E950..1E959 ; ID_Continue # Nd [10] ADLAM DIGIT ZERO..ADLAM DIGIT NINE 1EE00..1EE03 ; ID_Continue # Lo [4] ARABIC MATHEMATICAL ALEF..ARABIC MATHEMATICAL DAL 1EE05..1EE1F ; ID_Continue # Lo [27] ARABIC MATHEMATICAL WAW..ARABIC MATHEMATICAL DOTLESS QAF 1EE21..1EE22 ; ID_Continue # Lo [2] ARABIC MATHEMATICAL INITIAL BEH..ARABIC MATHEMATICAL INITIAL JEEM 1EE24 ; ID_Continue # Lo ARABIC MATHEMATICAL INITIAL HEH 1EE27 ; ID_Continue # Lo ARABIC MATHEMATICAL INITIAL HAH 1EE29..1EE32 ; ID_Continue # Lo [10] ARABIC MATHEMATICAL INITIAL YEH..ARABIC MATHEMATICAL INITIAL QAF 1EE34..1EE37 ; ID_Continue # Lo [4] ARABIC MATHEMATICAL INITIAL SHEEN..ARABIC MATHEMATICAL INITIAL KHAH 1EE39 ; ID_Continue # Lo 
ARABIC MATHEMATICAL INITIAL DAD 1EE3B ; ID_Continue # Lo ARABIC MATHEMATICAL INITIAL GHAIN 1EE42 ; ID_Continue # Lo ARABIC MATHEMATICAL TAILED JEEM 1EE47 ; ID_Continue # Lo ARABIC MATHEMATICAL TAILED HAH 1EE49 ; ID_Continue # Lo ARABIC MATHEMATICAL TAILED YEH 1EE4B ; ID_Continue # Lo ARABIC MATHEMATICAL TAILED LAM 1EE4D..1EE4F ; ID_Continue # Lo [3] ARABIC MATHEMATICAL TAILED NOON..ARABIC MATHEMATICAL TAILED AIN 1EE51..1EE52 ; ID_Continue # Lo [2] ARABIC MATHEMATICAL TAILED SAD..ARABIC MATHEMATICAL TAILED QAF 1EE54 ; ID_Continue # Lo ARABIC MATHEMATICAL TAILED SHEEN 1EE57 ; ID_Continue # Lo ARABIC MATHEMATICAL TAILED KHAH 1EE59 ; ID_Continue # Lo ARABIC MATHEMATICAL TAILED DAD 1EE5B ; ID_Continue # Lo ARABIC MATHEMATICAL TAILED GHAIN 1EE5D ; ID_Continue # Lo ARABIC MATHEMATICAL TAILED DOTLESS NOON 1EE5F ; ID_Continue # Lo ARABIC MATHEMATICAL TAILED DOTLESS QAF 1EE61..1EE62 ; ID_Continue # Lo [2] ARABIC MATHEMATICAL STRETCHED BEH..ARABIC MATHEMATICAL STRETCHED JEEM 1EE64 ; ID_Continue # Lo ARABIC MATHEMATICAL STRETCHED HEH 1EE67..1EE6A ; ID_Continue # Lo [4] ARABIC MATHEMATICAL STRETCHED HAH..ARABIC MATHEMATICAL STRETCHED KAF 1EE6C..1EE72 ; ID_Continue # Lo [7] ARABIC MATHEMATICAL STRETCHED MEEM..ARABIC MATHEMATICAL STRETCHED QAF 1EE74..1EE77 ; ID_Continue # Lo [4] ARABIC MATHEMATICAL STRETCHED SHEEN..ARABIC MATHEMATICAL STRETCHED KHAH 1EE79..1EE7C ; ID_Continue # Lo [4] ARABIC MATHEMATICAL STRETCHED DAD..ARABIC MATHEMATICAL STRETCHED DOTLESS BEH 1EE7E ; ID_Continue # Lo ARABIC MATHEMATICAL STRETCHED DOTLESS FEH 1EE80..1EE89 ; ID_Continue # Lo [10] ARABIC MATHEMATICAL LOOPED ALEF..ARABIC MATHEMATICAL LOOPED YEH 1EE8B..1EE9B ; ID_Continue # Lo [17] ARABIC MATHEMATICAL LOOPED LAM..ARABIC MATHEMATICAL LOOPED GHAIN 1EEA1..1EEA3 ; ID_Continue # Lo [3] ARABIC MATHEMATICAL DOUBLE-STRUCK BEH..ARABIC MATHEMATICAL DOUBLE-STRUCK DAL 1EEA5..1EEA9 ; ID_Continue # Lo [5] ARABIC MATHEMATICAL DOUBLE-STRUCK WAW..ARABIC MATHEMATICAL DOUBLE-STRUCK YEH 1EEAB..1EEBB ; ID_Continue # Lo 
[17] ARABIC MATHEMATICAL DOUBLE-STRUCK LAM..ARABIC MATHEMATICAL DOUBLE-STRUCK GHAIN 1FBF0..1FBF9 ; ID_Continue # Nd [10] SEGMENTED DIGIT ZERO..SEGMENTED DIGIT NINE 20000..2A6DF ; ID_Continue # Lo [42720] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6DF 2A700..2B81D ; ID_Continue # Lo [4382] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B81D 2B820..2CEAD ; ID_Continue # Lo [5774] CJK UNIFIED IDEOGRAPH-2B820..CJK UNIFIED IDEOGRAPH-2CEAD 2CEB0..2EBE0 ; ID_Continue # Lo [7473] CJK UNIFIED IDEOGRAPH-2CEB0..CJK UNIFIED IDEOGRAPH-2EBE0 2EBF0..2EE5D ; ID_Continue # Lo [622] CJK UNIFIED IDEOGRAPH-2EBF0..CJK UNIFIED IDEOGRAPH-2EE5D 2F800..2FA1D ; ID_Continue # Lo [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D 30000..3134A ; ID_Continue # Lo [4939] CJK UNIFIED IDEOGRAPH-30000..CJK UNIFIED IDEOGRAPH-3134A 31350..33479 ; ID_Continue # Lo [8490] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-33479 E0100..E01EF ; ID_Continue # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256 # Total code points: 149240 # ================================================ # Derived Property: XID_Start # ID_Start modified for closure under NFKx # Modified as described in UAX #15 # NOTE: Does NOT remove the non-NFKx characters. 
# Merely ensures that if isIdentifer(string) then isIdentifier(NFKx(string)) # NOTE: See UAX #31 for more information 0041..005A ; XID_Start # L& [26] LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z 0061..007A ; XID_Start # L& [26] LATIN SMALL LETTER A..LATIN SMALL LETTER Z 00AA ; XID_Start # Lo FEMININE ORDINAL INDICATOR 00B5 ; XID_Start # L& MICRO SIGN 00BA ; XID_Start # Lo MASCULINE ORDINAL INDICATOR 00C0..00D6 ; XID_Start # L& [23] LATIN CAPITAL LETTER A WITH GRAVE..LATIN CAPITAL LETTER O WITH DIAERESIS 00D8..00F6 ; XID_Start # L& [31] LATIN CAPITAL LETTER O WITH STROKE..LATIN SMALL LETTER O WITH DIAERESIS 00F8..01BA ; XID_Start # L& [195] LATIN SMALL LETTER O WITH STROKE..LATIN SMALL LETTER EZH WITH TAIL 01BB ; XID_Start # Lo LATIN LETTER TWO WITH STROKE 01BC..01BF ; XID_Start # L& [4] LATIN CAPITAL LETTER TONE FIVE..LATIN LETTER WYNN 01C0..01C3 ; XID_Start # Lo [4] LATIN LETTER DENTAL CLICK..LATIN LETTER RETROFLEX CLICK 01C4..0293 ; XID_Start # L& [208] LATIN CAPITAL LETTER DZ WITH CARON..LATIN SMALL LETTER EZH WITH CURL 0294..0295 ; XID_Start # Lo [2] LATIN LETTER GLOTTAL STOP..LATIN LETTER PHARYNGEAL VOICED FRICATIVE 0296..02AF ; XID_Start # L& [26] LATIN LETTER INVERTED GLOTTAL STOP..LATIN SMALL LETTER TURNED H WITH FISHHOOK AND TAIL 02B0..02C1 ; XID_Start # Lm [18] MODIFIER LETTER SMALL H..MODIFIER LETTER REVERSED GLOTTAL STOP 02C6..02D1 ; XID_Start # Lm [12] MODIFIER LETTER CIRCUMFLEX ACCENT..MODIFIER LETTER HALF TRIANGULAR COLON 02E0..02E4 ; XID_Start # Lm [5] MODIFIER LETTER SMALL GAMMA..MODIFIER LETTER SMALL REVERSED GLOTTAL STOP 02EC ; XID_Start # Lm MODIFIER LETTER VOICING 02EE ; XID_Start # Lm MODIFIER LETTER DOUBLE APOSTROPHE 0370..0373 ; XID_Start # L& [4] GREEK CAPITAL LETTER HETA..GREEK SMALL LETTER ARCHAIC SAMPI 0374 ; XID_Start # Lm GREEK NUMERAL SIGN 0376..0377 ; XID_Start # L& [2] GREEK CAPITAL LETTER PAMPHYLIAN DIGAMMA..GREEK SMALL LETTER PAMPHYLIAN DIGAMMA 037B..037D ; XID_Start # L& [3] GREEK SMALL REVERSED LUNATE SIGMA SYMBOL..GREEK 
SMALL REVERSED DOTTED LUNATE SIGMA SYMBOL 037F ; XID_Start # L& GREEK CAPITAL LETTER YOT 0386 ; XID_Start # L& GREEK CAPITAL LETTER ALPHA WITH TONOS 0388..038A ; XID_Start # L& [3] GREEK CAPITAL LETTER EPSILON WITH TONOS..GREEK CAPITAL LETTER IOTA WITH TONOS 038C ; XID_Start # L& GREEK CAPITAL LETTER OMICRON WITH TONOS 038E..03A1 ; XID_Start # L& [20] GREEK CAPITAL LETTER UPSILON WITH TONOS..GREEK CAPITAL LETTER RHO 03A3..03F5 ; XID_Start # L& [83] GREEK CAPITAL LETTER SIGMA..GREEK LUNATE EPSILON SYMBOL 03F7..0481 ; XID_Start # L& [139] GREEK CAPITAL LETTER SHO..CYRILLIC SMALL LETTER KOPPA 048A..052F ; XID_Start # L& [166] CYRILLIC CAPITAL LETTER SHORT I WITH TAIL..CYRILLIC SMALL LETTER EL WITH DESCENDER 0531..0556 ; XID_Start # L& [38] ARMENIAN CAPITAL LETTER AYB..ARMENIAN CAPITAL LETTER FEH 0559 ; XID_Start # Lm ARMENIAN MODIFIER LETTER LEFT HALF RING 0560..0588 ; XID_Start # L& [41] ARMENIAN SMALL LETTER TURNED AYB..ARMENIAN SMALL LETTER YI WITH STROKE 05D0..05EA ; XID_Start # Lo [27] HEBREW LETTER ALEF..HEBREW LETTER TAV 05EF..05F2 ; XID_Start # Lo [4] HEBREW YOD TRIANGLE..HEBREW LIGATURE YIDDISH DOUBLE YOD 0620..063F ; XID_Start # Lo [32] ARABIC LETTER KASHMIRI YEH..ARABIC LETTER FARSI YEH WITH THREE DOTS ABOVE 0640 ; XID_Start # Lm ARABIC TATWEEL 0641..064A ; XID_Start # Lo [10] ARABIC LETTER FEH..ARABIC LETTER YEH 066E..066F ; XID_Start # Lo [2] ARABIC LETTER DOTLESS BEH..ARABIC LETTER DOTLESS QAF 0671..06D3 ; XID_Start # Lo [99] ARABIC LETTER ALEF WASLA..ARABIC LETTER YEH BARREE WITH HAMZA ABOVE 06D5 ; XID_Start # Lo ARABIC LETTER AE 06E5..06E6 ; XID_Start # Lm [2] ARABIC SMALL WAW..ARABIC SMALL YEH 06EE..06EF ; XID_Start # Lo [2] ARABIC LETTER DAL WITH INVERTED V..ARABIC LETTER REH WITH INVERTED V 06FA..06FC ; XID_Start # Lo [3] ARABIC LETTER SHEEN WITH DOT BELOW..ARABIC LETTER GHAIN WITH DOT BELOW 06FF ; XID_Start # Lo ARABIC LETTER HEH WITH INVERTED V 0710 ; XID_Start # Lo SYRIAC LETTER ALAPH 0712..072F ; XID_Start # Lo [30] SYRIAC LETTER BETH..SYRIAC 
LETTER PERSIAN DHALATH 074D..07A5 ; XID_Start # Lo [89] SYRIAC LETTER SOGDIAN ZHAIN..THAANA LETTER WAAVU 07B1 ; XID_Start # Lo THAANA LETTER NAA 07CA..07EA ; XID_Start # Lo [33] NKO LETTER A..NKO LETTER JONA RA 07F4..07F5 ; XID_Start # Lm [2] NKO HIGH TONE APOSTROPHE..NKO LOW TONE APOSTROPHE 07FA ; XID_Start # Lm NKO LAJANYALAN 0800..0815 ; XID_Start # Lo [22] SAMARITAN LETTER ALAF..SAMARITAN LETTER TAAF 081A ; XID_Start # Lm SAMARITAN MODIFIER LETTER EPENTHETIC YUT 0824 ; XID_Start # Lm SAMARITAN MODIFIER LETTER SHORT A 0828 ; XID_Start # Lm SAMARITAN MODIFIER LETTER I 0840..0858 ; XID_Start # Lo [25] MANDAIC LETTER HALQA..MANDAIC LETTER AIN 0860..086A ; XID_Start # Lo [11] SYRIAC LETTER MALAYALAM NGA..SYRIAC LETTER MALAYALAM SSA 0870..0887 ; XID_Start # Lo [24] ARABIC LETTER ALEF WITH ATTACHED FATHA..ARABIC BASELINE ROUND DOT 0889..088F ; XID_Start # Lo [7] ARABIC LETTER NOON WITH INVERTED SMALL V..ARABIC LETTER NOON WITH RING ABOVE 08A0..08C8 ; XID_Start # Lo [41] ARABIC LETTER BEH WITH SMALL V BELOW..ARABIC LETTER GRAF 08C9 ; XID_Start # Lm ARABIC SMALL FARSI YEH 0904..0939 ; XID_Start # Lo [54] DEVANAGARI LETTER SHORT A..DEVANAGARI LETTER HA 093D ; XID_Start # Lo DEVANAGARI SIGN AVAGRAHA 0950 ; XID_Start # Lo DEVANAGARI OM 0958..0961 ; XID_Start # Lo [10] DEVANAGARI LETTER QA..DEVANAGARI LETTER VOCALIC LL 0971 ; XID_Start # Lm DEVANAGARI SIGN HIGH SPACING DOT 0972..0980 ; XID_Start # Lo [15] DEVANAGARI LETTER CANDRA A..BENGALI ANJI 0985..098C ; XID_Start # Lo [8] BENGALI LETTER A..BENGALI LETTER VOCALIC L 098F..0990 ; XID_Start # Lo [2] BENGALI LETTER E..BENGALI LETTER AI 0993..09A8 ; XID_Start # Lo [22] BENGALI LETTER O..BENGALI LETTER NA 09AA..09B0 ; XID_Start # Lo [7] BENGALI LETTER PA..BENGALI LETTER RA 09B2 ; XID_Start # Lo BENGALI LETTER LA 09B6..09B9 ; XID_Start # Lo [4] BENGALI LETTER SHA..BENGALI LETTER HA 09BD ; XID_Start # Lo BENGALI SIGN AVAGRAHA 09CE ; XID_Start # Lo BENGALI LETTER KHANDA TA 09DC..09DD ; XID_Start # Lo [2] BENGALI LETTER 
RRA..BENGALI LETTER RHA 09DF..09E1 ; XID_Start # Lo [3] BENGALI LETTER YYA..BENGALI LETTER VOCALIC LL 09F0..09F1 ; XID_Start # Lo [2] BENGALI LETTER RA WITH MIDDLE DIAGONAL..BENGALI LETTER RA WITH LOWER DIAGONAL 09FC ; XID_Start # Lo BENGALI LETTER VEDIC ANUSVARA 0A05..0A0A ; XID_Start # Lo [6] GURMUKHI LETTER A..GURMUKHI LETTER UU 0A0F..0A10 ; XID_Start # Lo [2] GURMUKHI LETTER EE..GURMUKHI LETTER AI 0A13..0A28 ; XID_Start # Lo [22] GURMUKHI LETTER OO..GURMUKHI LETTER NA 0A2A..0A30 ; XID_Start # Lo [7] GURMUKHI LETTER PA..GURMUKHI LETTER RA 0A32..0A33 ; XID_Start # Lo [2] GURMUKHI LETTER LA..GURMUKHI LETTER LLA 0A35..0A36 ; XID_Start # Lo [2] GURMUKHI LETTER VA..GURMUKHI LETTER SHA 0A38..0A39 ; XID_Start # Lo [2] GURMUKHI LETTER SA..GURMUKHI LETTER HA 0A59..0A5C ; XID_Start # Lo [4] GURMUKHI LETTER KHHA..GURMUKHI LETTER RRA 0A5E ; XID_Start # Lo GURMUKHI LETTER FA 0A72..0A74 ; XID_Start # Lo [3] GURMUKHI IRI..GURMUKHI EK ONKAR 0A85..0A8D ; XID_Start # Lo [9] GUJARATI LETTER A..GUJARATI VOWEL CANDRA E 0A8F..0A91 ; XID_Start # Lo [3] GUJARATI LETTER E..GUJARATI VOWEL CANDRA O 0A93..0AA8 ; XID_Start # Lo [22] GUJARATI LETTER O..GUJARATI LETTER NA 0AAA..0AB0 ; XID_Start # Lo [7] GUJARATI LETTER PA..GUJARATI LETTER RA 0AB2..0AB3 ; XID_Start # Lo [2] GUJARATI LETTER LA..GUJARATI LETTER LLA 0AB5..0AB9 ; XID_Start # Lo [5] GUJARATI LETTER VA..GUJARATI LETTER HA 0ABD ; XID_Start # Lo GUJARATI SIGN AVAGRAHA 0AD0 ; XID_Start # Lo GUJARATI OM 0AE0..0AE1 ; XID_Start # Lo [2] GUJARATI LETTER VOCALIC RR..GUJARATI LETTER VOCALIC LL 0AF9 ; XID_Start # Lo GUJARATI LETTER ZHA 0B05..0B0C ; XID_Start # Lo [8] ORIYA LETTER A..ORIYA LETTER VOCALIC L 0B0F..0B10 ; XID_Start # Lo [2] ORIYA LETTER E..ORIYA LETTER AI 0B13..0B28 ; XID_Start # Lo [22] ORIYA LETTER O..ORIYA LETTER NA 0B2A..0B30 ; XID_Start # Lo [7] ORIYA LETTER PA..ORIYA LETTER RA 0B32..0B33 ; XID_Start # Lo [2] ORIYA LETTER LA..ORIYA LETTER LLA 0B35..0B39 ; XID_Start # Lo [5] ORIYA LETTER VA..ORIYA LETTER HA 0B3D ; XID_Start # 
Lo ORIYA SIGN AVAGRAHA 0B5C..0B5D ; XID_Start # Lo [2] ORIYA LETTER RRA..ORIYA LETTER RHA 0B5F..0B61 ; XID_Start # Lo [3] ORIYA LETTER YYA..ORIYA LETTER VOCALIC LL 0B71 ; XID_Start # Lo ORIYA LETTER WA 0B83 ; XID_Start # Lo TAMIL SIGN VISARGA 0B85..0B8A ; XID_Start # Lo [6] TAMIL LETTER A..TAMIL LETTER UU 0B8E..0B90 ; XID_Start # Lo [3] TAMIL LETTER E..TAMIL LETTER AI 0B92..0B95 ; XID_Start # Lo [4] TAMIL LETTER O..TAMIL LETTER KA 0B99..0B9A ; XID_Start # Lo [2] TAMIL LETTER NGA..TAMIL LETTER CA 0B9C ; XID_Start # Lo TAMIL LETTER JA 0B9E..0B9F ; XID_Start # Lo [2] TAMIL LETTER NYA..TAMIL LETTER TTA 0BA3..0BA4 ; XID_Start # Lo [2] TAMIL LETTER NNA..TAMIL LETTER TA 0BA8..0BAA ; XID_Start # Lo [3] TAMIL LETTER NA..TAMIL LETTER PA 0BAE..0BB9 ; XID_Start # Lo [12] TAMIL LETTER MA..TAMIL LETTER HA 0BD0 ; XID_Start # Lo TAMIL OM 0C05..0C0C ; XID_Start # Lo [8] TELUGU LETTER A..TELUGU LETTER VOCALIC L 0C0E..0C10 ; XID_Start # Lo [3] TELUGU LETTER E..TELUGU LETTER AI 0C12..0C28 ; XID_Start # Lo [23] TELUGU LETTER O..TELUGU LETTER NA 0C2A..0C39 ; XID_Start # Lo [16] TELUGU LETTER PA..TELUGU LETTER HA 0C3D ; XID_Start # Lo TELUGU SIGN AVAGRAHA 0C58..0C5A ; XID_Start # Lo [3] TELUGU LETTER TSA..TELUGU LETTER RRRA 0C5C..0C5D ; XID_Start # Lo [2] TELUGU ARCHAIC SHRII..TELUGU LETTER NAKAARA POLLU 0C60..0C61 ; XID_Start # Lo [2] TELUGU LETTER VOCALIC RR..TELUGU LETTER VOCALIC LL 0C80 ; XID_Start # Lo KANNADA SIGN SPACING CANDRABINDU 0C85..0C8C ; XID_Start # Lo [8] KANNADA LETTER A..KANNADA LETTER VOCALIC L 0C8E..0C90 ; XID_Start # Lo [3] KANNADA LETTER E..KANNADA LETTER AI 0C92..0CA8 ; XID_Start # Lo [23] KANNADA LETTER O..KANNADA LETTER NA 0CAA..0CB3 ; XID_Start # Lo [10] KANNADA LETTER PA..KANNADA LETTER LLA 0CB5..0CB9 ; XID_Start # Lo [5] KANNADA LETTER VA..KANNADA LETTER HA 0CBD ; XID_Start # Lo KANNADA SIGN AVAGRAHA 0CDC..0CDE ; XID_Start # Lo [3] KANNADA ARCHAIC SHRII..KANNADA LETTER FA 0CE0..0CE1 ; XID_Start # Lo [2] KANNADA LETTER VOCALIC RR..KANNADA LETTER VOCALIC LL 
0CF1..0CF2 ; XID_Start # Lo [2] KANNADA SIGN JIHVAMULIYA..KANNADA SIGN UPADHMANIYA 0D04..0D0C ; XID_Start # Lo [9] MALAYALAM LETTER VEDIC ANUSVARA..MALAYALAM LETTER VOCALIC L 0D0E..0D10 ; XID_Start # Lo [3] MALAYALAM LETTER E..MALAYALAM LETTER AI 0D12..0D3A ; XID_Start # Lo [41] MALAYALAM LETTER O..MALAYALAM LETTER TTTA 0D3D ; XID_Start # Lo MALAYALAM SIGN AVAGRAHA 0D4E ; XID_Start # Lo MALAYALAM LETTER DOT REPH 0D54..0D56 ; XID_Start # Lo [3] MALAYALAM LETTER CHILLU M..MALAYALAM LETTER CHILLU LLL 0D5F..0D61 ; XID_Start # Lo [3] MALAYALAM LETTER ARCHAIC II..MALAYALAM LETTER VOCALIC LL 0D7A..0D7F ; XID_Start # Lo [6] MALAYALAM LETTER CHILLU NN..MALAYALAM LETTER CHILLU K 0D85..0D96 ; XID_Start # Lo [18] SINHALA LETTER AYANNA..SINHALA LETTER AUYANNA 0D9A..0DB1 ; XID_Start # Lo [24] SINHALA LETTER ALPAPRAANA KAYANNA..SINHALA LETTER DANTAJA NAYANNA 0DB3..0DBB ; XID_Start # Lo [9] SINHALA LETTER SANYAKA DAYANNA..SINHALA LETTER RAYANNA 0DBD ; XID_Start # Lo SINHALA LETTER DANTAJA LAYANNA 0DC0..0DC6 ; XID_Start # Lo [7] SINHALA LETTER VAYANNA..SINHALA LETTER FAYANNA 0E01..0E30 ; XID_Start # Lo [48] THAI CHARACTER KO KAI..THAI CHARACTER SARA A 0E32 ; XID_Start # Lo THAI CHARACTER SARA AA 0E40..0E45 ; XID_Start # Lo [6] THAI CHARACTER SARA E..THAI CHARACTER LAKKHANGYAO 0E46 ; XID_Start # Lm THAI CHARACTER MAIYAMOK 0E81..0E82 ; XID_Start # Lo [2] LAO LETTER KO..LAO LETTER KHO SUNG 0E84 ; XID_Start # Lo LAO LETTER KHO TAM 0E86..0E8A ; XID_Start # Lo [5] LAO LETTER PALI GHA..LAO LETTER SO TAM 0E8C..0EA3 ; XID_Start # Lo [24] LAO LETTER PALI JHA..LAO LETTER LO LING 0EA5 ; XID_Start # Lo LAO LETTER LO LOOT 0EA7..0EB0 ; XID_Start # Lo [10] LAO LETTER WO..LAO VOWEL SIGN A 0EB2 ; XID_Start # Lo LAO VOWEL SIGN AA 0EBD ; XID_Start # Lo LAO SEMIVOWEL SIGN NYO 0EC0..0EC4 ; XID_Start # Lo [5] LAO VOWEL SIGN E..LAO VOWEL SIGN AI 0EC6 ; XID_Start # Lm LAO KO LA 0EDC..0EDF ; XID_Start # Lo [4] LAO HO NO..LAO LETTER KHMU NYO 0F00 ; XID_Start # Lo TIBETAN SYLLABLE OM 0F40..0F47 ; XID_Start # 
Lo [8] TIBETAN LETTER KA..TIBETAN LETTER JA 0F49..0F6C ; XID_Start # Lo [36] TIBETAN LETTER NYA..TIBETAN LETTER RRA 0F88..0F8C ; XID_Start # Lo [5] TIBETAN SIGN LCE TSA CAN..TIBETAN SIGN INVERTED MCHU CAN 1000..102A ; XID_Start # Lo [43] MYANMAR LETTER KA..MYANMAR LETTER AU 103F ; XID_Start # Lo MYANMAR LETTER GREAT SA 1050..1055 ; XID_Start # Lo [6] MYANMAR LETTER SHA..MYANMAR LETTER VOCALIC LL 105A..105D ; XID_Start # Lo [4] MYANMAR LETTER MON NGA..MYANMAR LETTER MON BBE 1061 ; XID_Start # Lo MYANMAR LETTER SGAW KAREN SHA 1065..1066 ; XID_Start # Lo [2] MYANMAR LETTER WESTERN PWO KAREN THA..MYANMAR LETTER WESTERN PWO KAREN PWA 106E..1070 ; XID_Start # Lo [3] MYANMAR LETTER EASTERN PWO KAREN NNA..MYANMAR LETTER EASTERN PWO KAREN GHWA 1075..1081 ; XID_Start # Lo [13] MYANMAR LETTER SHAN KA..MYANMAR LETTER SHAN HA 108E ; XID_Start # Lo MYANMAR LETTER RUMAI PALAUNG FA 10A0..10C5 ; XID_Start # L& [38] GEORGIAN CAPITAL LETTER AN..GEORGIAN CAPITAL LETTER HOE 10C7 ; XID_Start # L& GEORGIAN CAPITAL LETTER YN 10CD ; XID_Start # L& GEORGIAN CAPITAL LETTER AEN 10D0..10FA ; XID_Start # L& [43] GEORGIAN LETTER AN..GEORGIAN LETTER AIN 10FC ; XID_Start # Lm MODIFIER LETTER GEORGIAN NAR 10FD..10FF ; XID_Start # L& [3] GEORGIAN LETTER AEN..GEORGIAN LETTER LABIAL SIGN 1100..1248 ; XID_Start # Lo [329] HANGUL CHOSEONG KIYEOK..ETHIOPIC SYLLABLE QWA 124A..124D ; XID_Start # Lo [4] ETHIOPIC SYLLABLE QWI..ETHIOPIC SYLLABLE QWE 1250..1256 ; XID_Start # Lo [7] ETHIOPIC SYLLABLE QHA..ETHIOPIC SYLLABLE QHO 1258 ; XID_Start # Lo ETHIOPIC SYLLABLE QHWA 125A..125D ; XID_Start # Lo [4] ETHIOPIC SYLLABLE QHWI..ETHIOPIC SYLLABLE QHWE 1260..1288 ; XID_Start # Lo [41] ETHIOPIC SYLLABLE BA..ETHIOPIC SYLLABLE XWA 128A..128D ; XID_Start # Lo [4] ETHIOPIC SYLLABLE XWI..ETHIOPIC SYLLABLE XWE 1290..12B0 ; XID_Start # Lo [33] ETHIOPIC SYLLABLE NA..ETHIOPIC SYLLABLE KWA 12B2..12B5 ; XID_Start # Lo [4] ETHIOPIC SYLLABLE KWI..ETHIOPIC SYLLABLE KWE 12B8..12BE ; XID_Start # Lo [7] ETHIOPIC SYLLABLE 
KXA..ETHIOPIC SYLLABLE KXO 12C0 ; XID_Start # Lo ETHIOPIC SYLLABLE KXWA 12C2..12C5 ; XID_Start # Lo [4] ETHIOPIC SYLLABLE KXWI..ETHIOPIC SYLLABLE KXWE 12C8..12D6 ; XID_Start # Lo [15] ETHIOPIC SYLLABLE WA..ETHIOPIC SYLLABLE PHARYNGEAL O 12D8..1310 ; XID_Start # Lo [57] ETHIOPIC SYLLABLE ZA..ETHIOPIC SYLLABLE GWA 1312..1315 ; XID_Start # Lo [4] ETHIOPIC SYLLABLE GWI..ETHIOPIC SYLLABLE GWE 1318..135A ; XID_Start # Lo [67] ETHIOPIC SYLLABLE GGA..ETHIOPIC SYLLABLE FYA 1380..138F ; XID_Start # Lo [16] ETHIOPIC SYLLABLE SEBATBEIT MWA..ETHIOPIC SYLLABLE PWE 13A0..13F5 ; XID_Start # L& [86] CHEROKEE LETTER A..CHEROKEE LETTER MV 13F8..13FD ; XID_Start # L& [6] CHEROKEE SMALL LETTER YE..CHEROKEE SMALL LETTER MV 1401..166C ; XID_Start # Lo [620] CANADIAN SYLLABICS E..CANADIAN SYLLABICS CARRIER TTSA 166F..167F ; XID_Start # Lo [17] CANADIAN SYLLABICS QAI..CANADIAN SYLLABICS BLACKFOOT W 1681..169A ; XID_Start # Lo [26] OGHAM LETTER BEITH..OGHAM LETTER PEITH 16A0..16EA ; XID_Start # Lo [75] RUNIC LETTER FEHU FEOH FE F..RUNIC LETTER X 16EE..16F0 ; XID_Start # Nl [3] RUNIC ARLAUG SYMBOL..RUNIC BELGTHOR SYMBOL 16F1..16F8 ; XID_Start # Lo [8] RUNIC LETTER K..RUNIC LETTER FRANKS CASKET AESC 1700..1711 ; XID_Start # Lo [18] TAGALOG LETTER A..TAGALOG LETTER HA 171F..1731 ; XID_Start # Lo [19] TAGALOG LETTER ARCHAIC RA..HANUNOO LETTER HA 1740..1751 ; XID_Start # Lo [18] BUHID LETTER A..BUHID LETTER HA 1760..176C ; XID_Start # Lo [13] TAGBANWA LETTER A..TAGBANWA LETTER YA 176E..1770 ; XID_Start # Lo [3] TAGBANWA LETTER LA..TAGBANWA LETTER SA 1780..17B3 ; XID_Start # Lo [52] KHMER LETTER KA..KHMER INDEPENDENT VOWEL QAU 17D7 ; XID_Start # Lm KHMER SIGN LEK TOO 17DC ; XID_Start # Lo KHMER SIGN AVAKRAHASANYA 1820..1842 ; XID_Start # Lo [35] MONGOLIAN LETTER A..MONGOLIAN LETTER CHI 1843 ; XID_Start # Lm MONGOLIAN LETTER TODO LONG VOWEL SIGN 1844..1878 ; XID_Start # Lo [53] MONGOLIAN LETTER TODO E..MONGOLIAN LETTER CHA WITH TWO DOTS 1880..1884 ; XID_Start # Lo [5] MONGOLIAN LETTER ALI GALI 
ANUSVARA ONE..MONGOLIAN LETTER ALI GALI INVERTED UBADAMA 1885..1886 ; XID_Start # Mn [2] MONGOLIAN LETTER ALI GALI BALUDA..MONGOLIAN LETTER ALI GALI THREE BALUDA 1887..18A8 ; XID_Start # Lo [34] MONGOLIAN LETTER ALI GALI A..MONGOLIAN LETTER MANCHU ALI GALI BHA 18AA ; XID_Start # Lo MONGOLIAN LETTER MANCHU ALI GALI LHA 18B0..18F5 ; XID_Start # Lo [70] CANADIAN SYLLABICS OY..CANADIAN SYLLABICS CARRIER DENTAL S 1900..191E ; XID_Start # Lo [31] LIMBU VOWEL-CARRIER LETTER..LIMBU LETTER TRA 1950..196D ; XID_Start # Lo [30] TAI LE LETTER KA..TAI LE LETTER AI 1970..1974 ; XID_Start # Lo [5] TAI LE LETTER TONE-2..TAI LE LETTER TONE-6 1980..19AB ; XID_Start # Lo [44] NEW TAI LUE LETTER HIGH QA..NEW TAI LUE LETTER LOW SUA 19B0..19C9 ; XID_Start # Lo [26] NEW TAI LUE VOWEL SIGN VOWEL SHORTENER..NEW TAI LUE TONE MARK-2 1A00..1A16 ; XID_Start # Lo [23] BUGINESE LETTER KA..BUGINESE LETTER HA 1A20..1A54 ; XID_Start # Lo [53] TAI THAM LETTER HIGH KA..TAI THAM LETTER GREAT SA 1AA7 ; XID_Start # Lm TAI THAM SIGN MAI YAMOK 1B05..1B33 ; XID_Start # Lo [47] BALINESE LETTER AKARA..BALINESE LETTER HA 1B45..1B4C ; XID_Start # Lo [8] BALINESE LETTER KAF SASAK..BALINESE LETTER ARCHAIC JNYA 1B83..1BA0 ; XID_Start # Lo [30] SUNDANESE LETTER A..SUNDANESE LETTER HA 1BAE..1BAF ; XID_Start # Lo [2] SUNDANESE LETTER KHA..SUNDANESE LETTER SYA 1BBA..1BE5 ; XID_Start # Lo [44] SUNDANESE AVAGRAHA..BATAK LETTER U 1C00..1C23 ; XID_Start # Lo [36] LEPCHA LETTER KA..LEPCHA LETTER A 1C4D..1C4F ; XID_Start # Lo [3] LEPCHA LETTER TTA..LEPCHA LETTER DDA 1C5A..1C77 ; XID_Start # Lo [30] OL CHIKI LETTER LA..OL CHIKI LETTER OH 1C78..1C7D ; XID_Start # Lm [6] OL CHIKI MU TTUDDAG..OL CHIKI AHAD 1C80..1C8A ; XID_Start # L& [11] CYRILLIC SMALL LETTER ROUNDED VE..CYRILLIC SMALL LETTER TJE 1C90..1CBA ; XID_Start # L& [43] GEORGIAN MTAVRULI CAPITAL LETTER AN..GEORGIAN MTAVRULI CAPITAL LETTER AIN 1CBD..1CBF ; XID_Start # L& [3] GEORGIAN MTAVRULI CAPITAL LETTER AEN..GEORGIAN MTAVRULI CAPITAL LETTER LABIAL SIGN 1CE9..1CEC 
; XID_Start # Lo [4] VEDIC SIGN ANUSVARA ANTARGOMUKHA..VEDIC SIGN ANUSVARA VAMAGOMUKHA WITH TAIL 1CEE..1CF3 ; XID_Start # Lo [6] VEDIC SIGN HEXIFORM LONG ANUSVARA..VEDIC SIGN ROTATED ARDHAVISARGA 1CF5..1CF6 ; XID_Start # Lo [2] VEDIC SIGN JIHVAMULIYA..VEDIC SIGN UPADHMANIYA 1CFA ; XID_Start # Lo VEDIC SIGN DOUBLE ANUSVARA ANTARGOMUKHA 1D00..1D2B ; XID_Start # L& [44] LATIN LETTER SMALL CAPITAL A..CYRILLIC LETTER SMALL CAPITAL EL 1D2C..1D6A ; XID_Start # Lm [63] MODIFIER LETTER CAPITAL A..GREEK SUBSCRIPT SMALL LETTER CHI 1D6B..1D77 ; XID_Start # L& [13] LATIN SMALL LETTER UE..LATIN SMALL LETTER TURNED G 1D78 ; XID_Start # Lm MODIFIER LETTER CYRILLIC EN 1D79..1D9A ; XID_Start # L& [34] LATIN SMALL LETTER INSULAR G..LATIN SMALL LETTER EZH WITH RETROFLEX HOOK 1D9B..1DBF ; XID_Start # Lm [37] MODIFIER LETTER SMALL TURNED ALPHA..MODIFIER LETTER SMALL THETA 1E00..1F15 ; XID_Start # L& [278] LATIN CAPITAL LETTER A WITH RING BELOW..GREEK SMALL LETTER EPSILON WITH DASIA AND OXIA 1F18..1F1D ; XID_Start # L& [6] GREEK CAPITAL LETTER EPSILON WITH PSILI..GREEK CAPITAL LETTER EPSILON WITH DASIA AND OXIA 1F20..1F45 ; XID_Start # L& [38] GREEK SMALL LETTER ETA WITH PSILI..GREEK SMALL LETTER OMICRON WITH DASIA AND OXIA 1F48..1F4D ; XID_Start # L& [6] GREEK CAPITAL LETTER OMICRON WITH PSILI..GREEK CAPITAL LETTER OMICRON WITH DASIA AND OXIA 1F50..1F57 ; XID_Start # L& [8] GREEK SMALL LETTER UPSILON WITH PSILI..GREEK SMALL LETTER UPSILON WITH DASIA AND PERISPOMENI 1F59 ; XID_Start # L& GREEK CAPITAL LETTER UPSILON WITH DASIA 1F5B ; XID_Start # L& GREEK CAPITAL LETTER UPSILON WITH DASIA AND VARIA 1F5D ; XID_Start # L& GREEK CAPITAL LETTER UPSILON WITH DASIA AND OXIA 1F5F..1F7D ; XID_Start # L& [31] GREEK CAPITAL LETTER UPSILON WITH DASIA AND PERISPOMENI..GREEK SMALL LETTER OMEGA WITH OXIA 1F80..1FB4 ; XID_Start # L& [53] GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI..GREEK SMALL LETTER ALPHA WITH OXIA AND YPOGEGRAMMENI 1FB6..1FBC ; XID_Start # L& [7] GREEK SMALL LETTER ALPHA 
WITH PERISPOMENI..GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI 1FBE ; XID_Start # L& GREEK PROSGEGRAMMENI 1FC2..1FC4 ; XID_Start # L& [3] GREEK SMALL LETTER ETA WITH VARIA AND YPOGEGRAMMENI..GREEK SMALL LETTER ETA WITH OXIA AND YPOGEGRAMMENI 1FC6..1FCC ; XID_Start # L& [7] GREEK SMALL LETTER ETA WITH PERISPOMENI..GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI 1FD0..1FD3 ; XID_Start # L& [4] GREEK SMALL LETTER IOTA WITH VRACHY..GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA 1FD6..1FDB ; XID_Start # L& [6] GREEK SMALL LETTER IOTA WITH PERISPOMENI..GREEK CAPITAL LETTER IOTA WITH OXIA 1FE0..1FEC ; XID_Start # L& [13] GREEK SMALL LETTER UPSILON WITH VRACHY..GREEK CAPITAL LETTER RHO WITH DASIA 1FF2..1FF4 ; XID_Start # L& [3] GREEK SMALL LETTER OMEGA WITH VARIA AND YPOGEGRAMMENI..GREEK SMALL LETTER OMEGA WITH OXIA AND YPOGEGRAMMENI 1FF6..1FFC ; XID_Start # L& [7] GREEK SMALL LETTER OMEGA WITH PERISPOMENI..GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI 2071 ; XID_Start # Lm SUPERSCRIPT LATIN SMALL LETTER I 207F ; XID_Start # Lm SUPERSCRIPT LATIN SMALL LETTER N 2090..209C ; XID_Start # Lm [13] LATIN SUBSCRIPT SMALL LETTER A..LATIN SUBSCRIPT SMALL LETTER T 2102 ; XID_Start # L& DOUBLE-STRUCK CAPITAL C 2107 ; XID_Start # L& EULER CONSTANT 210A..2113 ; XID_Start # L& [10] SCRIPT SMALL G..SCRIPT SMALL L 2115 ; XID_Start # L& DOUBLE-STRUCK CAPITAL N 2118 ; XID_Start # Sm SCRIPT CAPITAL P 2119..211D ; XID_Start # L& [5] DOUBLE-STRUCK CAPITAL P..DOUBLE-STRUCK CAPITAL R 2124 ; XID_Start # L& DOUBLE-STRUCK CAPITAL Z 2126 ; XID_Start # L& OHM SIGN 2128 ; XID_Start # L& BLACK-LETTER CAPITAL Z 212A..212D ; XID_Start # L& [4] KELVIN SIGN..BLACK-LETTER CAPITAL C 212E ; XID_Start # So ESTIMATED SYMBOL 212F..2134 ; XID_Start # L& [6] SCRIPT SMALL E..SCRIPT SMALL O 2135..2138 ; XID_Start # Lo [4] ALEF SYMBOL..DALET SYMBOL 2139 ; XID_Start # L& INFORMATION SOURCE 213C..213F ; XID_Start # L& [4] DOUBLE-STRUCK SMALL PI..DOUBLE-STRUCK CAPITAL PI 2145..2149 ; XID_Start # L& [5] DOUBLE-STRUCK 
ITALIC CAPITAL D..DOUBLE-STRUCK ITALIC SMALL J 214E ; XID_Start # L& TURNED SMALL F 2160..2182 ; XID_Start # Nl [35] ROMAN NUMERAL ONE..ROMAN NUMERAL TEN THOUSAND 2183..2184 ; XID_Start # L& [2] ROMAN NUMERAL REVERSED ONE HUNDRED..LATIN SMALL LETTER REVERSED C 2185..2188 ; XID_Start # Nl [4] ROMAN NUMERAL SIX LATE FORM..ROMAN NUMERAL ONE HUNDRED THOUSAND 2C00..2C7B ; XID_Start # L& [124] GLAGOLITIC CAPITAL LETTER AZU..LATIN LETTER SMALL CAPITAL TURNED E 2C7C..2C7D ; XID_Start # Lm [2] LATIN SUBSCRIPT SMALL LETTER J..MODIFIER LETTER CAPITAL V 2C7E..2CE4 ; XID_Start # L& [103] LATIN CAPITAL LETTER S WITH SWASH TAIL..COPTIC SYMBOL KAI 2CEB..2CEE ; XID_Start # L& [4] COPTIC CAPITAL LETTER CRYPTOGRAMMIC SHEI..COPTIC SMALL LETTER CRYPTOGRAMMIC GANGIA 2CF2..2CF3 ; XID_Start # L& [2] COPTIC CAPITAL LETTER BOHAIRIC KHEI..COPTIC SMALL LETTER BOHAIRIC KHEI 2D00..2D25 ; XID_Start # L& [38] GEORGIAN SMALL LETTER AN..GEORGIAN SMALL LETTER HOE 2D27 ; XID_Start # L& GEORGIAN SMALL LETTER YN 2D2D ; XID_Start # L& GEORGIAN SMALL LETTER AEN 2D30..2D67 ; XID_Start # Lo [56] TIFINAGH LETTER YA..TIFINAGH LETTER YO 2D6F ; XID_Start # Lm TIFINAGH MODIFIER LETTER LABIALIZATION MARK 2D80..2D96 ; XID_Start # Lo [23] ETHIOPIC SYLLABLE LOA..ETHIOPIC SYLLABLE GGWE 2DA0..2DA6 ; XID_Start # Lo [7] ETHIOPIC SYLLABLE SSA..ETHIOPIC SYLLABLE SSO 2DA8..2DAE ; XID_Start # Lo [7] ETHIOPIC SYLLABLE CCA..ETHIOPIC SYLLABLE CCO 2DB0..2DB6 ; XID_Start # Lo [7] ETHIOPIC SYLLABLE ZZA..ETHIOPIC SYLLABLE ZZO 2DB8..2DBE ; XID_Start # Lo [7] ETHIOPIC SYLLABLE CCHA..ETHIOPIC SYLLABLE CCHO 2DC0..2DC6 ; XID_Start # Lo [7] ETHIOPIC SYLLABLE QYA..ETHIOPIC SYLLABLE QYO 2DC8..2DCE ; XID_Start # Lo [7] ETHIOPIC SYLLABLE KYA..ETHIOPIC SYLLABLE KYO 2DD0..2DD6 ; XID_Start # Lo [7] ETHIOPIC SYLLABLE XYA..ETHIOPIC SYLLABLE XYO 2DD8..2DDE ; XID_Start # Lo [7] ETHIOPIC SYLLABLE GYA..ETHIOPIC SYLLABLE GYO 3005 ; XID_Start # Lm IDEOGRAPHIC ITERATION MARK 3006 ; XID_Start # Lo IDEOGRAPHIC CLOSING MARK 3007 ; XID_Start # Nl 
IDEOGRAPHIC NUMBER ZERO 3021..3029 ; XID_Start # Nl [9] HANGZHOU NUMERAL ONE..HANGZHOU NUMERAL NINE 3031..3035 ; XID_Start # Lm [5] VERTICAL KANA REPEAT MARK..VERTICAL KANA REPEAT MARK LOWER HALF 3038..303A ; XID_Start # Nl [3] HANGZHOU NUMERAL TEN..HANGZHOU NUMERAL THIRTY 303B ; XID_Start # Lm VERTICAL IDEOGRAPHIC ITERATION MARK 303C ; XID_Start # Lo MASU MARK 3041..3096 ; XID_Start # Lo [86] HIRAGANA LETTER SMALL A..HIRAGANA LETTER SMALL KE 309D..309E ; XID_Start # Lm [2] HIRAGANA ITERATION MARK..HIRAGANA VOICED ITERATION MARK 309F ; XID_Start # Lo HIRAGANA DIGRAPH YORI 30A1..30FA ; XID_Start # Lo [90] KATAKANA LETTER SMALL A..KATAKANA LETTER VO 30FC..30FE ; XID_Start # Lm [3] KATAKANA-HIRAGANA PROLONGED SOUND MARK..KATAKANA VOICED ITERATION MARK 30FF ; XID_Start # Lo KATAKANA DIGRAPH KOTO 3105..312F ; XID_Start # Lo [43] BOPOMOFO LETTER B..BOPOMOFO LETTER NN 3131..318E ; XID_Start # Lo [94] HANGUL LETTER KIYEOK..HANGUL LETTER ARAEAE 31A0..31BF ; XID_Start # Lo [32] BOPOMOFO LETTER BU..BOPOMOFO LETTER AH 31F0..31FF ; XID_Start # Lo [16] KATAKANA LETTER SMALL KU..KATAKANA LETTER SMALL RO 3400..4DBF ; XID_Start # Lo [6592] CJK UNIFIED IDEOGRAPH-3400..CJK UNIFIED IDEOGRAPH-4DBF 4E00..A014 ; XID_Start # Lo [21013] CJK UNIFIED IDEOGRAPH-4E00..YI SYLLABLE E A015 ; XID_Start # Lm YI SYLLABLE WU A016..A48C ; XID_Start # Lo [1143] YI SYLLABLE BIT..YI SYLLABLE YYR A4D0..A4F7 ; XID_Start # Lo [40] LISU LETTER BA..LISU LETTER OE A4F8..A4FD ; XID_Start # Lm [6] LISU LETTER TONE MYA TI..LISU LETTER TONE MYA JEU A500..A60B ; XID_Start # Lo [268] VAI SYLLABLE EE..VAI SYLLABLE NG A60C ; XID_Start # Lm VAI SYLLABLE LENGTHENER A610..A61F ; XID_Start # Lo [16] VAI SYLLABLE NDOLE FA..VAI SYMBOL JONG A62A..A62B ; XID_Start # Lo [2] VAI SYLLABLE NDOLE MA..VAI SYLLABLE NDOLE DO A640..A66D ; XID_Start # L& [46] CYRILLIC CAPITAL LETTER ZEMLYA..CYRILLIC SMALL LETTER DOUBLE MONOCULAR O A66E ; XID_Start # Lo CYRILLIC LETTER MULTIOCULAR O A67F ; XID_Start # Lm CYRILLIC PAYEROK A680..A69B ; 
XID_Start # L& [28] CYRILLIC CAPITAL LETTER DWE..CYRILLIC SMALL LETTER CROSSED O A69C..A69D ; XID_Start # Lm [2] MODIFIER LETTER CYRILLIC HARD SIGN..MODIFIER LETTER CYRILLIC SOFT SIGN A6A0..A6E5 ; XID_Start # Lo [70] BAMUM LETTER A..BAMUM LETTER KI A6E6..A6EF ; XID_Start # Nl [10] BAMUM LETTER MO..BAMUM LETTER KOGHOM A717..A71F ; XID_Start # Lm [9] MODIFIER LETTER DOT VERTICAL BAR..MODIFIER LETTER LOW INVERTED EXCLAMATION MARK A722..A76F ; XID_Start # L& [78] LATIN CAPITAL LETTER EGYPTOLOGICAL ALEF..LATIN SMALL LETTER CON A770 ; XID_Start # Lm MODIFIER LETTER US A771..A787 ; XID_Start # L& [23] LATIN SMALL LETTER DUM..LATIN SMALL LETTER INSULAR T A788 ; XID_Start # Lm MODIFIER LETTER LOW CIRCUMFLEX ACCENT A78B..A78E ; XID_Start # L& [4] LATIN CAPITAL LETTER SALTILLO..LATIN SMALL LETTER L WITH RETROFLEX HOOK AND BELT A78F ; XID_Start # Lo LATIN LETTER SINOLOGICAL DOT A790..A7DC ; XID_Start # L& [77] LATIN CAPITAL LETTER N WITH DESCENDER..LATIN CAPITAL LETTER LAMBDA WITH STROKE A7F1..A7F4 ; XID_Start # Lm [4] MODIFIER LETTER CAPITAL S..MODIFIER LETTER CAPITAL Q A7F5..A7F6 ; XID_Start # L& [2] LATIN CAPITAL LETTER REVERSED HALF H..LATIN SMALL LETTER REVERSED HALF H A7F7 ; XID_Start # Lo LATIN EPIGRAPHIC LETTER SIDEWAYS I A7F8..A7F9 ; XID_Start # Lm [2] MODIFIER LETTER CAPITAL H WITH STROKE..MODIFIER LETTER SMALL LIGATURE OE A7FA ; XID_Start # L& LATIN LETTER SMALL CAPITAL TURNED M A7FB..A801 ; XID_Start # Lo [7] LATIN EPIGRAPHIC LETTER REVERSED F..SYLOTI NAGRI LETTER I A803..A805 ; XID_Start # Lo [3] SYLOTI NAGRI LETTER U..SYLOTI NAGRI LETTER O A807..A80A ; XID_Start # Lo [4] SYLOTI NAGRI LETTER KO..SYLOTI NAGRI LETTER GHO A80C..A822 ; XID_Start # Lo [23] SYLOTI NAGRI LETTER CO..SYLOTI NAGRI LETTER HO A840..A873 ; XID_Start # Lo [52] PHAGS-PA LETTER KA..PHAGS-PA LETTER CANDRABINDU A882..A8B3 ; XID_Start # Lo [50] SAURASHTRA LETTER A..SAURASHTRA LETTER LLA A8F2..A8F7 ; XID_Start # Lo [6] DEVANAGARI SIGN SPACING CANDRABINDU..DEVANAGARI SIGN CANDRABINDU AVAGRAHA A8FB ; 
XID_Start # Lo DEVANAGARI HEADSTROKE A8FD..A8FE ; XID_Start # Lo [2] DEVANAGARI JAIN OM..DEVANAGARI LETTER AY A90A..A925 ; XID_Start # Lo [28] KAYAH LI LETTER KA..KAYAH LI LETTER OO A930..A946 ; XID_Start # Lo [23] REJANG LETTER KA..REJANG LETTER A A960..A97C ; XID_Start # Lo [29] HANGUL CHOSEONG TIKEUT-MIEUM..HANGUL CHOSEONG SSANGYEORINHIEUH A984..A9B2 ; XID_Start # Lo [47] JAVANESE LETTER A..JAVANESE LETTER HA A9CF ; XID_Start # Lm JAVANESE PANGRANGKEP A9E0..A9E4 ; XID_Start # Lo [5] MYANMAR LETTER SHAN GHA..MYANMAR LETTER SHAN BHA A9E6 ; XID_Start # Lm MYANMAR MODIFIER LETTER SHAN REDUPLICATION A9E7..A9EF ; XID_Start # Lo [9] MYANMAR LETTER TAI LAING NYA..MYANMAR LETTER TAI LAING NNA A9FA..A9FE ; XID_Start # Lo [5] MYANMAR LETTER TAI LAING LLA..MYANMAR LETTER TAI LAING BHA AA00..AA28 ; XID_Start # Lo [41] CHAM LETTER A..CHAM LETTER HA AA40..AA42 ; XID_Start # Lo [3] CHAM LETTER FINAL K..CHAM LETTER FINAL NG AA44..AA4B ; XID_Start # Lo [8] CHAM LETTER FINAL CH..CHAM LETTER FINAL SS AA60..AA6F ; XID_Start # Lo [16] MYANMAR LETTER KHAMTI GA..MYANMAR LETTER KHAMTI FA AA70 ; XID_Start # Lm MYANMAR MODIFIER LETTER KHAMTI REDUPLICATION AA71..AA76 ; XID_Start # Lo [6] MYANMAR LETTER KHAMTI XA..MYANMAR LOGOGRAM KHAMTI HM AA7A ; XID_Start # Lo MYANMAR LETTER AITON RA AA7E..AAAF ; XID_Start # Lo [50] MYANMAR LETTER SHWE PALAUNG CHA..TAI VIET LETTER HIGH O AAB1 ; XID_Start # Lo TAI VIET VOWEL AA AAB5..AAB6 ; XID_Start # Lo [2] TAI VIET VOWEL E..TAI VIET VOWEL O AAB9..AABD ; XID_Start # Lo [5] TAI VIET VOWEL UEA..TAI VIET VOWEL AN AAC0 ; XID_Start # Lo TAI VIET TONE MAI NUENG AAC2 ; XID_Start # Lo TAI VIET TONE MAI SONG AADB..AADC ; XID_Start # Lo [2] TAI VIET SYMBOL KON..TAI VIET SYMBOL NUENG AADD ; XID_Start # Lm TAI VIET SYMBOL SAM AAE0..AAEA ; XID_Start # Lo [11] MEETEI MAYEK LETTER E..MEETEI MAYEK LETTER SSA AAF2 ; XID_Start # Lo MEETEI MAYEK ANJI AAF3..AAF4 ; XID_Start # Lm [2] MEETEI MAYEK SYLLABLE REPETITION MARK..MEETEI MAYEK WORD REPETITION MARK AB01..AB06 ; 
XID_Start # Lo [6] ETHIOPIC SYLLABLE TTHU..ETHIOPIC SYLLABLE TTHO AB09..AB0E ; XID_Start # Lo [6] ETHIOPIC SYLLABLE DDHU..ETHIOPIC SYLLABLE DDHO AB11..AB16 ; XID_Start # Lo [6] ETHIOPIC SYLLABLE DZU..ETHIOPIC SYLLABLE DZO AB20..AB26 ; XID_Start # Lo [7] ETHIOPIC SYLLABLE CCHHA..ETHIOPIC SYLLABLE CCHHO AB28..AB2E ; XID_Start # Lo [7] ETHIOPIC SYLLABLE BBA..ETHIOPIC SYLLABLE BBO AB30..AB5A ; XID_Start # L& [43] LATIN SMALL LETTER BARRED ALPHA..LATIN SMALL LETTER Y WITH SHORT RIGHT LEG AB5C..AB5F ; XID_Start # Lm [4] MODIFIER LETTER SMALL HENG..MODIFIER LETTER SMALL U WITH LEFT HOOK AB60..AB68 ; XID_Start # L& [9] LATIN SMALL LETTER SAKHA YAT..LATIN SMALL LETTER TURNED R WITH MIDDLE TILDE AB69 ; XID_Start # Lm MODIFIER LETTER SMALL TURNED W AB70..ABBF ; XID_Start # L& [80] CHEROKEE SMALL LETTER A..CHEROKEE SMALL LETTER YA ABC0..ABE2 ; XID_Start # Lo [35] MEETEI MAYEK LETTER KOK..MEETEI MAYEK LETTER I LONSUM AC00..D7A3 ; XID_Start # Lo [11172] HANGUL SYLLABLE GA..HANGUL SYLLABLE HIH D7B0..D7C6 ; XID_Start # Lo [23] HANGUL JUNGSEONG O-YEO..HANGUL JUNGSEONG ARAEA-E D7CB..D7FB ; XID_Start # Lo [49] HANGUL JONGSEONG NIEUN-RIEUL..HANGUL JONGSEONG PHIEUPH-THIEUTH F900..FA6D ; XID_Start # Lo [366] CJK COMPATIBILITY IDEOGRAPH-F900..CJK COMPATIBILITY IDEOGRAPH-FA6D FA70..FAD9 ; XID_Start # Lo [106] CJK COMPATIBILITY IDEOGRAPH-FA70..CJK COMPATIBILITY IDEOGRAPH-FAD9 FB00..FB06 ; XID_Start # L& [7] LATIN SMALL LIGATURE FF..LATIN SMALL LIGATURE ST FB13..FB17 ; XID_Start # L& [5] ARMENIAN SMALL LIGATURE MEN NOW..ARMENIAN SMALL LIGATURE MEN XEH FB1D ; XID_Start # Lo HEBREW LETTER YOD WITH HIRIQ FB1F..FB28 ; XID_Start # Lo [10] HEBREW LIGATURE YIDDISH YOD YOD PATAH..HEBREW LETTER WIDE TAV FB2A..FB36 ; XID_Start # Lo [13] HEBREW LETTER SHIN WITH SHIN DOT..HEBREW LETTER ZAYIN WITH DAGESH FB38..FB3C ; XID_Start # Lo [5] HEBREW LETTER TET WITH DAGESH..HEBREW LETTER LAMED WITH DAGESH FB3E ; XID_Start # Lo HEBREW LETTER MEM WITH DAGESH FB40..FB41 ; XID_Start # Lo [2] HEBREW LETTER NUN WITH 
DAGESH..HEBREW LETTER SAMEKH WITH DAGESH FB43..FB44 ; XID_Start # Lo [2] HEBREW LETTER FINAL PE WITH DAGESH..HEBREW LETTER PE WITH DAGESH FB46..FBB1 ; XID_Start # Lo [108] HEBREW LETTER TSADI WITH DAGESH..ARABIC LETTER YEH BARREE WITH HAMZA ABOVE FINAL FORM FBD3..FC5D ; XID_Start # Lo [139] ARABIC LETTER NG ISOLATED FORM..ARABIC LIGATURE ALEF MAKSURA WITH SUPERSCRIPT ALEF ISOLATED FORM FC64..FD3D ; XID_Start # Lo [218] ARABIC LIGATURE YEH WITH HAMZA ABOVE WITH REH FINAL FORM..ARABIC LIGATURE ALEF WITH FATHATAN ISOLATED FORM FD50..FD8F ; XID_Start # Lo [64] ARABIC LIGATURE TEH WITH JEEM WITH MEEM INITIAL FORM..ARABIC LIGATURE MEEM WITH KHAH WITH MEEM INITIAL FORM FD92..FDC7 ; XID_Start # Lo [54] ARABIC LIGATURE MEEM WITH JEEM WITH KHAH INITIAL FORM..ARABIC LIGATURE NOON WITH JEEM WITH YEH FINAL FORM FDF0..FDF9 ; XID_Start # Lo [10] ARABIC LIGATURE SALLA USED AS KORANIC STOP SIGN ISOLATED FORM..ARABIC LIGATURE SALLA ISOLATED FORM FE71 ; XID_Start # Lo ARABIC TATWEEL WITH FATHATAN ABOVE FE73 ; XID_Start # Lo ARABIC TAIL FRAGMENT FE77 ; XID_Start # Lo ARABIC FATHA MEDIAL FORM FE79 ; XID_Start # Lo ARABIC DAMMA MEDIAL FORM FE7B ; XID_Start # Lo ARABIC KASRA MEDIAL FORM FE7D ; XID_Start # Lo ARABIC SHADDA MEDIAL FORM FE7F..FEFC ; XID_Start # Lo [126] ARABIC SUKUN MEDIAL FORM..ARABIC LIGATURE LAM WITH ALEF FINAL FORM FF21..FF3A ; XID_Start # L& [26] FULLWIDTH LATIN CAPITAL LETTER A..FULLWIDTH LATIN CAPITAL LETTER Z FF41..FF5A ; XID_Start # L& [26] FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH LATIN SMALL LETTER Z FF66..FF6F ; XID_Start # Lo [10] HALFWIDTH KATAKANA LETTER WO..HALFWIDTH KATAKANA LETTER SMALL TU FF70 ; XID_Start # Lm HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK FF71..FF9D ; XID_Start # Lo [45] HALFWIDTH KATAKANA LETTER A..HALFWIDTH KATAKANA LETTER N FFA0..FFBE ; XID_Start # Lo [31] HALFWIDTH HANGUL FILLER..HALFWIDTH HANGUL LETTER HIEUH FFC2..FFC7 ; XID_Start # Lo [6] HALFWIDTH HANGUL LETTER A..HALFWIDTH HANGUL LETTER E FFCA..FFCF ; XID_Start # Lo [6] 
HALFWIDTH HANGUL LETTER YEO..HALFWIDTH HANGUL LETTER OE FFD2..FFD7 ; XID_Start # Lo [6] HALFWIDTH HANGUL LETTER YO..HALFWIDTH HANGUL LETTER YU FFDA..FFDC ; XID_Start # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL LETTER I 10000..1000B ; XID_Start # Lo [12] LINEAR B SYLLABLE B008 A..LINEAR B SYLLABLE B046 JE 1000D..10026 ; XID_Start # Lo [26] LINEAR B SYLLABLE B036 JO..LINEAR B SYLLABLE B032 QO 10028..1003A ; XID_Start # Lo [19] LINEAR B SYLLABLE B060 RA..LINEAR B SYLLABLE B042 WO 1003C..1003D ; XID_Start # Lo [2] LINEAR B SYLLABLE B017 ZA..LINEAR B SYLLABLE B074 ZE 1003F..1004D ; XID_Start # Lo [15] LINEAR B SYLLABLE B020 ZO..LINEAR B SYLLABLE B091 TWO 10050..1005D ; XID_Start # Lo [14] LINEAR B SYMBOL B018..LINEAR B SYMBOL B089 10080..100FA ; XID_Start # Lo [123] LINEAR B IDEOGRAM B100 MAN..LINEAR B IDEOGRAM VESSEL B305 10140..10174 ; XID_Start # Nl [53] GREEK ACROPHONIC ATTIC ONE QUARTER..GREEK ACROPHONIC STRATIAN FIFTY MNAS 10280..1029C ; XID_Start # Lo [29] LYCIAN LETTER A..LYCIAN LETTER X 102A0..102D0 ; XID_Start # Lo [49] CARIAN LETTER A..CARIAN LETTER UUU3 10300..1031F ; XID_Start # Lo [32] OLD ITALIC LETTER A..OLD ITALIC LETTER ESS 1032D..10340 ; XID_Start # Lo [20] OLD ITALIC LETTER YE..GOTHIC LETTER PAIRTHRA 10341 ; XID_Start # Nl GOTHIC LETTER NINETY 10342..10349 ; XID_Start # Lo [8] GOTHIC LETTER RAIDA..GOTHIC LETTER OTHAL 1034A ; XID_Start # Nl GOTHIC LETTER NINE HUNDRED 10350..10375 ; XID_Start # Lo [38] OLD PERMIC LETTER AN..OLD PERMIC LETTER IA 10380..1039D ; XID_Start # Lo [30] UGARITIC LETTER ALPA..UGARITIC LETTER SSU 103A0..103C3 ; XID_Start # Lo [36] OLD PERSIAN SIGN A..OLD PERSIAN SIGN HA 103C8..103CF ; XID_Start # Lo [8] OLD PERSIAN SIGN AURAMAZDAA..OLD PERSIAN SIGN BUUMISH 103D1..103D5 ; XID_Start # Nl [5] OLD PERSIAN NUMBER ONE..OLD PERSIAN NUMBER HUNDRED 10400..1044F ; XID_Start # L& [80] DESERET CAPITAL LETTER LONG I..DESERET SMALL LETTER EW 10450..1049D ; XID_Start # Lo [78] SHAVIAN LETTER PEEP..OSMANYA LETTER OO 104B0..104D3 ; 
XID_Start # L& [36] OSAGE CAPITAL LETTER A..OSAGE CAPITAL LETTER ZHA 104D8..104FB ; XID_Start # L& [36] OSAGE SMALL LETTER A..OSAGE SMALL LETTER ZHA 10500..10527 ; XID_Start # Lo [40] ELBASAN LETTER A..ELBASAN LETTER KHE 10530..10563 ; XID_Start # Lo [52] CAUCASIAN ALBANIAN LETTER ALT..CAUCASIAN ALBANIAN LETTER KIW 10570..1057A ; XID_Start # L& [11] VITHKUQI CAPITAL LETTER A..VITHKUQI CAPITAL LETTER GA 1057C..1058A ; XID_Start # L& [15] VITHKUQI CAPITAL LETTER HA..VITHKUQI CAPITAL LETTER RE 1058C..10592 ; XID_Start # L& [7] VITHKUQI CAPITAL LETTER SE..VITHKUQI CAPITAL LETTER XE 10594..10595 ; XID_Start # L& [2] VITHKUQI CAPITAL LETTER Y..VITHKUQI CAPITAL LETTER ZE 10597..105A1 ; XID_Start # L& [11] VITHKUQI SMALL LETTER A..VITHKUQI SMALL LETTER GA 105A3..105B1 ; XID_Start # L& [15] VITHKUQI SMALL LETTER HA..VITHKUQI SMALL LETTER RE 105B3..105B9 ; XID_Start # L& [7] VITHKUQI SMALL LETTER SE..VITHKUQI SMALL LETTER XE 105BB..105BC ; XID_Start # L& [2] VITHKUQI SMALL LETTER Y..VITHKUQI SMALL LETTER ZE 105C0..105F3 ; XID_Start # Lo [52] TODHRI LETTER A..TODHRI LETTER OO 10600..10736 ; XID_Start # Lo [311] LINEAR A SIGN AB001..LINEAR A SIGN A664 10740..10755 ; XID_Start # Lo [22] LINEAR A SIGN A701 A..LINEAR A SIGN A732 JE 10760..10767 ; XID_Start # Lo [8] LINEAR A SIGN A800..LINEAR A SIGN A807 10780..10785 ; XID_Start # Lm [6] MODIFIER LETTER SMALL CAPITAL AA..MODIFIER LETTER SMALL B WITH HOOK 10787..107B0 ; XID_Start # Lm [42] MODIFIER LETTER SMALL DZ DIGRAPH..MODIFIER LETTER SMALL V WITH RIGHT HOOK 107B2..107BA ; XID_Start # Lm [9] MODIFIER LETTER SMALL CAPITAL Y..MODIFIER LETTER SMALL S WITH CURL 10800..10805 ; XID_Start # Lo [6] CYPRIOT SYLLABLE A..CYPRIOT SYLLABLE JA 10808 ; XID_Start # Lo CYPRIOT SYLLABLE JO 1080A..10835 ; XID_Start # Lo [44] CYPRIOT SYLLABLE KA..CYPRIOT SYLLABLE WO 10837..10838 ; XID_Start # Lo [2] CYPRIOT SYLLABLE XA..CYPRIOT SYLLABLE XE 1083C ; XID_Start # Lo CYPRIOT SYLLABLE ZA 1083F..10855 ; XID_Start # Lo [23] CYPRIOT SYLLABLE ZO..IMPERIAL 
ARAMAIC LETTER TAW 10860..10876 ; XID_Start # Lo [23] PALMYRENE LETTER ALEPH..PALMYRENE LETTER TAW 10880..1089E ; XID_Start # Lo [31] NABATAEAN LETTER FINAL ALEPH..NABATAEAN LETTER TAW 108E0..108F2 ; XID_Start # Lo [19] HATRAN LETTER ALEPH..HATRAN LETTER QOPH 108F4..108F5 ; XID_Start # Lo [2] HATRAN LETTER SHIN..HATRAN LETTER TAW 10900..10915 ; XID_Start # Lo [22] PHOENICIAN LETTER ALF..PHOENICIAN LETTER TAU 10920..10939 ; XID_Start # Lo [26] LYDIAN LETTER A..LYDIAN LETTER C 10940..10959 ; XID_Start # Lo [26] SIDETIC LETTER N01..SIDETIC LETTER N26 10980..109B7 ; XID_Start # Lo [56] MEROITIC HIEROGLYPHIC LETTER A..MEROITIC CURSIVE LETTER DA 109BE..109BF ; XID_Start # Lo [2] MEROITIC CURSIVE LOGOGRAM RMT..MEROITIC CURSIVE LOGOGRAM IMN 10A00 ; XID_Start # Lo KHAROSHTHI LETTER A 10A10..10A13 ; XID_Start # Lo [4] KHAROSHTHI LETTER KA..KHAROSHTHI LETTER GHA 10A15..10A17 ; XID_Start # Lo [3] KHAROSHTHI LETTER CA..KHAROSHTHI LETTER JA 10A19..10A35 ; XID_Start # Lo [29] KHAROSHTHI LETTER NYA..KHAROSHTHI LETTER VHA 10A60..10A7C ; XID_Start # Lo [29] OLD SOUTH ARABIAN LETTER HE..OLD SOUTH ARABIAN LETTER THETH 10A80..10A9C ; XID_Start # Lo [29] OLD NORTH ARABIAN LETTER HEH..OLD NORTH ARABIAN LETTER ZAH 10AC0..10AC7 ; XID_Start # Lo [8] MANICHAEAN LETTER ALEPH..MANICHAEAN LETTER WAW 10AC9..10AE4 ; XID_Start # Lo [28] MANICHAEAN LETTER ZAYIN..MANICHAEAN LETTER TAW 10B00..10B35 ; XID_Start # Lo [54] AVESTAN LETTER A..AVESTAN LETTER HE 10B40..10B55 ; XID_Start # Lo [22] INSCRIPTIONAL PARTHIAN LETTER ALEPH..INSCRIPTIONAL PARTHIAN LETTER TAW 10B60..10B72 ; XID_Start # Lo [19] INSCRIPTIONAL PAHLAVI LETTER ALEPH..INSCRIPTIONAL PAHLAVI LETTER TAW 10B80..10B91 ; XID_Start # Lo [18] PSALTER PAHLAVI LETTER ALEPH..PSALTER PAHLAVI LETTER TAW 10C00..10C48 ; XID_Start # Lo [73] OLD TURKIC LETTER ORKHON A..OLD TURKIC LETTER ORKHON BASH 10C80..10CB2 ; XID_Start # L& [51] OLD HUNGARIAN CAPITAL LETTER A..OLD HUNGARIAN CAPITAL LETTER US 10CC0..10CF2 ; XID_Start # L& [51] OLD HUNGARIAN SMALL LETTER 
A..OLD HUNGARIAN SMALL LETTER US 10D00..10D23 ; XID_Start # Lo [36] HANIFI ROHINGYA LETTER A..HANIFI ROHINGYA MARK NA KHONNA 10D4A..10D4D ; XID_Start # Lo [4] GARAY VOWEL SIGN A..GARAY VOWEL SIGN EE 10D4E ; XID_Start # Lm GARAY VOWEL LENGTH MARK 10D4F ; XID_Start # Lo GARAY SUKUN 10D50..10D65 ; XID_Start # L& [22] GARAY CAPITAL LETTER A..GARAY CAPITAL LETTER OLD NA 10D6F ; XID_Start # Lm GARAY REDUPLICATION MARK 10D70..10D85 ; XID_Start # L& [22] GARAY SMALL LETTER A..GARAY SMALL LETTER OLD NA 10E80..10EA9 ; XID_Start # Lo [42] YEZIDI LETTER ELIF..YEZIDI LETTER ET 10EB0..10EB1 ; XID_Start # Lo [2] YEZIDI LETTER LAM WITH DOT ABOVE..YEZIDI LETTER YOT WITH CIRCUMFLEX ABOVE 10EC2..10EC4 ; XID_Start # Lo [3] ARABIC LETTER DAL WITH TWO DOTS VERTICALLY BELOW..ARABIC LETTER KAF WITH TWO DOTS VERTICALLY BELOW 10EC5 ; XID_Start # Lm ARABIC SMALL YEH BARREE WITH TWO DOTS BELOW 10EC6..10EC7 ; XID_Start # Lo [2] ARABIC LETTER THIN NOON..ARABIC LETTER YEH WITH FOUR DOTS BELOW 10F00..10F1C ; XID_Start # Lo [29] OLD SOGDIAN LETTER ALEPH..OLD SOGDIAN LETTER FINAL TAW WITH VERTICAL TAIL 10F27 ; XID_Start # Lo OLD SOGDIAN LIGATURE AYIN-DALETH 10F30..10F45 ; XID_Start # Lo [22] SOGDIAN LETTER ALEPH..SOGDIAN INDEPENDENT SHIN 10F70..10F81 ; XID_Start # Lo [18] OLD UYGHUR LETTER ALEPH..OLD UYGHUR LETTER LESH 10FB0..10FC4 ; XID_Start # Lo [21] CHORASMIAN LETTER ALEPH..CHORASMIAN LETTER TAW 10FE0..10FF6 ; XID_Start # Lo [23] ELYMAIC LETTER ALEPH..ELYMAIC LIGATURE ZAYIN-YODH 11003..11037 ; XID_Start # Lo [53] BRAHMI SIGN JIHVAMULIYA..BRAHMI LETTER OLD TAMIL NNNA 11071..11072 ; XID_Start # Lo [2] BRAHMI LETTER OLD TAMIL SHORT E..BRAHMI LETTER OLD TAMIL SHORT O 11075 ; XID_Start # Lo BRAHMI LETTER OLD TAMIL LLA 11083..110AF ; XID_Start # Lo [45] KAITHI LETTER A..KAITHI LETTER HA 110D0..110E8 ; XID_Start # Lo [25] SORA SOMPENG LETTER SAH..SORA SOMPENG LETTER MAE 11103..11126 ; XID_Start # Lo [36] CHAKMA LETTER AA..CHAKMA LETTER HAA 11144 ; XID_Start # Lo CHAKMA LETTER LHAA 11147 ; XID_Start # 
Lo CHAKMA LETTER VAA 11150..11172 ; XID_Start # Lo [35] MAHAJANI LETTER A..MAHAJANI LETTER RRA 11176 ; XID_Start # Lo MAHAJANI LIGATURE SHRI 11183..111B2 ; XID_Start # Lo [48] SHARADA LETTER A..SHARADA LETTER HA 111C1..111C4 ; XID_Start # Lo [4] SHARADA SIGN AVAGRAHA..SHARADA OM 111DA ; XID_Start # Lo SHARADA EKAM 111DC ; XID_Start # Lo SHARADA HEADSTROKE 11200..11211 ; XID_Start # Lo [18] KHOJKI LETTER A..KHOJKI LETTER JJA 11213..1122B ; XID_Start # Lo [25] KHOJKI LETTER NYA..KHOJKI LETTER LLA 1123F..11240 ; XID_Start # Lo [2] KHOJKI LETTER QA..KHOJKI LETTER SHORT I 11280..11286 ; XID_Start # Lo [7] MULTANI LETTER A..MULTANI LETTER GA 11288 ; XID_Start # Lo MULTANI LETTER GHA 1128A..1128D ; XID_Start # Lo [4] MULTANI LETTER CA..MULTANI LETTER JJA 1128F..1129D ; XID_Start # Lo [15] MULTANI LETTER NYA..MULTANI LETTER BA 1129F..112A8 ; XID_Start # Lo [10] MULTANI LETTER BHA..MULTANI LETTER RHA 112B0..112DE ; XID_Start # Lo [47] KHUDAWADI LETTER A..KHUDAWADI LETTER HA 11305..1130C ; XID_Start # Lo [8] GRANTHA LETTER A..GRANTHA LETTER VOCALIC L 1130F..11310 ; XID_Start # Lo [2] GRANTHA LETTER EE..GRANTHA LETTER AI 11313..11328 ; XID_Start # Lo [22] GRANTHA LETTER OO..GRANTHA LETTER NA 1132A..11330 ; XID_Start # Lo [7] GRANTHA LETTER PA..GRANTHA LETTER RA 11332..11333 ; XID_Start # Lo [2] GRANTHA LETTER LA..GRANTHA LETTER LLA 11335..11339 ; XID_Start # Lo [5] GRANTHA LETTER VA..GRANTHA LETTER HA 1133D ; XID_Start # Lo GRANTHA SIGN AVAGRAHA 11350 ; XID_Start # Lo GRANTHA OM 1135D..11361 ; XID_Start # Lo [5] GRANTHA SIGN PLUTA..GRANTHA LETTER VOCALIC LL 11380..11389 ; XID_Start # Lo [10] TULU-TIGALARI LETTER A..TULU-TIGALARI LETTER VOCALIC LL 1138B ; XID_Start # Lo TULU-TIGALARI LETTER EE 1138E ; XID_Start # Lo TULU-TIGALARI LETTER AI 11390..113B5 ; XID_Start # Lo [38] TULU-TIGALARI LETTER OO..TULU-TIGALARI LETTER LLLA 113B7 ; XID_Start # Lo TULU-TIGALARI SIGN AVAGRAHA 113D1 ; XID_Start # Lo TULU-TIGALARI REPHA 113D3 ; XID_Start # Lo TULU-TIGALARI SIGN PLUTA 11400..11434 
; XID_Start # Lo [53] NEWA LETTER A..NEWA LETTER HA 11447..1144A ; XID_Start # Lo [4] NEWA SIGN AVAGRAHA..NEWA SIDDHI 1145F..11461 ; XID_Start # Lo [3] NEWA LETTER VEDIC ANUSVARA..NEWA SIGN UPADHMANIYA 11480..114AF ; XID_Start # Lo [48] TIRHUTA ANJI..TIRHUTA LETTER HA 114C4..114C5 ; XID_Start # Lo [2] TIRHUTA SIGN AVAGRAHA..TIRHUTA GVANG 114C7 ; XID_Start # Lo TIRHUTA OM 11580..115AE ; XID_Start # Lo [47] SIDDHAM LETTER A..SIDDHAM LETTER HA 115D8..115DB ; XID_Start # Lo [4] SIDDHAM LETTER THREE-CIRCLE ALTERNATE I..SIDDHAM LETTER ALTERNATE U 11600..1162F ; XID_Start # Lo [48] MODI LETTER A..MODI LETTER LLA 11644 ; XID_Start # Lo MODI SIGN HUVA 11680..116AA ; XID_Start # Lo [43] TAKRI LETTER A..TAKRI LETTER RRA 116B8 ; XID_Start # Lo TAKRI LETTER ARCHAIC KHA 11700..1171A ; XID_Start # Lo [27] AHOM LETTER KA..AHOM LETTER ALTERNATE BA 11740..11746 ; XID_Start # Lo [7] AHOM LETTER CA..AHOM LETTER LLA 11800..1182B ; XID_Start # Lo [44] DOGRA LETTER A..DOGRA LETTER RRA 118A0..118DF ; XID_Start # L& [64] WARANG CITI CAPITAL LETTER NGAA..WARANG CITI SMALL LETTER VIYO 118FF..11906 ; XID_Start # Lo [8] WARANG CITI OM..DIVES AKURU LETTER E 11909 ; XID_Start # Lo DIVES AKURU LETTER O 1190C..11913 ; XID_Start # Lo [8] DIVES AKURU LETTER KA..DIVES AKURU LETTER JA 11915..11916 ; XID_Start # Lo [2] DIVES AKURU LETTER NYA..DIVES AKURU LETTER TTA 11918..1192F ; XID_Start # Lo [24] DIVES AKURU LETTER DDA..DIVES AKURU LETTER ZA 1193F ; XID_Start # Lo DIVES AKURU PREFIXED NASAL SIGN 11941 ; XID_Start # Lo DIVES AKURU INITIAL RA 119A0..119A7 ; XID_Start # Lo [8] NANDINAGARI LETTER A..NANDINAGARI LETTER VOCALIC RR 119AA..119D0 ; XID_Start # Lo [39] NANDINAGARI LETTER E..NANDINAGARI LETTER RRA 119E1 ; XID_Start # Lo NANDINAGARI SIGN AVAGRAHA 119E3 ; XID_Start # Lo NANDINAGARI HEADSTROKE 11A00 ; XID_Start # Lo ZANABAZAR SQUARE LETTER A 11A0B..11A32 ; XID_Start # Lo [40] ZANABAZAR SQUARE LETTER KA..ZANABAZAR SQUARE LETTER KSSA 11A3A ; XID_Start # Lo ZANABAZAR SQUARE CLUSTER-INITIAL LETTER RA 
11A50 ; XID_Start # Lo SOYOMBO LETTER A 11A5C..11A89 ; XID_Start # Lo [46] SOYOMBO LETTER KA..SOYOMBO CLUSTER-INITIAL LETTER SA 11A9D ; XID_Start # Lo SOYOMBO MARK PLUTA 11AB0..11AF8 ; XID_Start # Lo [73] CANADIAN SYLLABICS NATTILIK HI..PAU CIN HAU GLOTTAL STOP FINAL 11BC0..11BE0 ; XID_Start # Lo [33] SUNUWAR LETTER DEVI..SUNUWAR LETTER KLOKO 11C00..11C08 ; XID_Start # Lo [9] BHAIKSUKI LETTER A..BHAIKSUKI LETTER VOCALIC L 11C0A..11C2E ; XID_Start # Lo [37] BHAIKSUKI LETTER E..BHAIKSUKI LETTER HA 11C40 ; XID_Start # Lo BHAIKSUKI SIGN AVAGRAHA 11C72..11C8F ; XID_Start # Lo [30] MARCHEN LETTER KA..MARCHEN LETTER A 11D00..11D06 ; XID_Start # Lo [7] MASARAM GONDI LETTER A..MASARAM GONDI LETTER E 11D08..11D09 ; XID_Start # Lo [2] MASARAM GONDI LETTER AI..MASARAM GONDI LETTER O 11D0B..11D30 ; XID_Start # Lo [38] MASARAM GONDI LETTER AU..MASARAM GONDI LETTER TRA 11D46 ; XID_Start # Lo MASARAM GONDI REPHA 11D60..11D65 ; XID_Start # Lo [6] GUNJALA GONDI LETTER A..GUNJALA GONDI LETTER UU 11D67..11D68 ; XID_Start # Lo [2] GUNJALA GONDI LETTER EE..GUNJALA GONDI LETTER AI 11D6A..11D89 ; XID_Start # Lo [32] GUNJALA GONDI LETTER OO..GUNJALA GONDI LETTER SA 11D98 ; XID_Start # Lo GUNJALA GONDI OM 11DB0..11DD8 ; XID_Start # Lo [41] TOLONG SIKI LETTER I..TOLONG SIKI LETTER RRH 11DD9 ; XID_Start # Lm TOLONG SIKI SIGN SELA 11DDA..11DDB ; XID_Start # Lo [2] TOLONG SIKI SIGN HECAKA..TOLONG SIKI UNGGA 11EE0..11EF2 ; XID_Start # Lo [19] MAKASAR LETTER KA..MAKASAR ANGKA 11F02 ; XID_Start # Lo KAWI SIGN REPHA 11F04..11F10 ; XID_Start # Lo [13] KAWI LETTER A..KAWI LETTER O 11F12..11F33 ; XID_Start # Lo [34] KAWI LETTER KA..KAWI LETTER JNYA 11FB0 ; XID_Start # Lo LISU LETTER YHA 12000..12399 ; XID_Start # Lo [922] CUNEIFORM SIGN A..CUNEIFORM SIGN U U 12400..1246E ; XID_Start # Nl [111] CUNEIFORM NUMERIC SIGN TWO ASH..CUNEIFORM NUMERIC SIGN NINE U VARIANT FORM 12480..12543 ; XID_Start # Lo [196] CUNEIFORM SIGN AB TIMES NUN TENU..CUNEIFORM SIGN ZU5 TIMES THREE DISH TENU 12F90..12FF0 ; XID_Start 
# Lo [97] CYPRO-MINOAN SIGN CM001..CYPRO-MINOAN SIGN CM114 13000..1342F ; XID_Start # Lo [1072] EGYPTIAN HIEROGLYPH A001..EGYPTIAN HIEROGLYPH V011D 13441..13446 ; XID_Start # Lo [6] EGYPTIAN HIEROGLYPH FULL BLANK..EGYPTIAN HIEROGLYPH WIDE LOST SIGN 13460..143FA ; XID_Start # Lo [3995] EGYPTIAN HIEROGLYPH-13460..EGYPTIAN HIEROGLYPH-143FA 14400..14646 ; XID_Start # Lo [583] ANATOLIAN HIEROGLYPH A001..ANATOLIAN HIEROGLYPH A530 16100..1611D ; XID_Start # Lo [30] GURUNG KHEMA LETTER A..GURUNG KHEMA LETTER SA 16800..16A38 ; XID_Start # Lo [569] BAMUM LETTER PHASE-A NGKUE MFON..BAMUM LETTER PHASE-F VUEQ 16A40..16A5E ; XID_Start # Lo [31] MRO LETTER TA..MRO LETTER TEK 16A70..16ABE ; XID_Start # Lo [79] TANGSA LETTER OZ..TANGSA LETTER ZA 16AD0..16AED ; XID_Start # Lo [30] BASSA VAH LETTER ENNI..BASSA VAH LETTER I 16B00..16B2F ; XID_Start # Lo [48] PAHAWH HMONG VOWEL KEEB..PAHAWH HMONG CONSONANT CAU 16B40..16B43 ; XID_Start # Lm [4] PAHAWH HMONG SIGN VOS SEEV..PAHAWH HMONG SIGN IB YAM 16B63..16B77 ; XID_Start # Lo [21] PAHAWH HMONG SIGN VOS LUB..PAHAWH HMONG SIGN CIM NRES TOS 16B7D..16B8F ; XID_Start # Lo [19] PAHAWH HMONG CLAN SIGN TSHEEJ..PAHAWH HMONG CLAN SIGN VWJ 16D40..16D42 ; XID_Start # Lm [3] KIRAT RAI SIGN ANUSVARA..KIRAT RAI SIGN VISARGA 16D43..16D6A ; XID_Start # Lo [40] KIRAT RAI LETTER A..KIRAT RAI VOWEL SIGN AU 16D6B..16D6C ; XID_Start # Lm [2] KIRAT RAI SIGN VIRAMA..KIRAT RAI SIGN SAAT 16E40..16E7F ; XID_Start # L& [64] MEDEFAIDRIN CAPITAL LETTER M..MEDEFAIDRIN SMALL LETTER Y 16EA0..16EB8 ; XID_Start # L& [25] BERIA ERFE CAPITAL LETTER ARKAB..BERIA ERFE CAPITAL LETTER AY 16EBB..16ED3 ; XID_Start # L& [25] BERIA ERFE SMALL LETTER ARKAB..BERIA ERFE SMALL LETTER AY 16F00..16F4A ; XID_Start # Lo [75] MIAO LETTER PA..MIAO LETTER RTE 16F50 ; XID_Start # Lo MIAO LETTER NASALIZATION 16F93..16F9F ; XID_Start # Lm [13] MIAO LETTER TONE-2..MIAO LETTER REFORMED TONE-8 16FE0..16FE1 ; XID_Start # Lm [2] TANGUT ITERATION MARK..NUSHU ITERATION MARK 16FE3 ; XID_Start # Lm OLD 
CHINESE ITERATION MARK 16FF2..16FF3 ; XID_Start # Lm [2] CHINESE SMALL SIMPLIFIED ER..CHINESE SMALL TRADITIONAL ER 16FF4..16FF6 ; XID_Start # Nl [3] YANGQIN SIGN SLOW ONE BEAT..YANGQIN SIGN SLOW TWO BEATS 17000..18CD5 ; XID_Start # Lo [7382] TANGUT IDEOGRAPH-17000..KHITAN SMALL SCRIPT CHARACTER-18CD5 18CFF..18D1E ; XID_Start # Lo [32] KHITAN SMALL SCRIPT CHARACTER-18CFF..TANGUT IDEOGRAPH-18D1E 18D80..18DF2 ; XID_Start # Lo [115] TANGUT COMPONENT-769..TANGUT COMPONENT-883 1AFF0..1AFF3 ; XID_Start # Lm [4] KATAKANA LETTER MINNAN TONE-2..KATAKANA LETTER MINNAN TONE-5 1AFF5..1AFFB ; XID_Start # Lm [7] KATAKANA LETTER MINNAN TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-5 1AFFD..1AFFE ; XID_Start # Lm [2] KATAKANA LETTER MINNAN NASALIZED TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-8 1B000..1B122 ; XID_Start # Lo [291] KATAKANA LETTER ARCHAIC E..KATAKANA LETTER ARCHAIC WU 1B132 ; XID_Start # Lo HIRAGANA LETTER SMALL KO 1B150..1B152 ; XID_Start # Lo [3] HIRAGANA LETTER SMALL WI..HIRAGANA LETTER SMALL WO 1B155 ; XID_Start # Lo KATAKANA LETTER SMALL KO 1B164..1B167 ; XID_Start # Lo [4] KATAKANA LETTER SMALL WI..KATAKANA LETTER SMALL N 1B170..1B2FB ; XID_Start # Lo [396] NUSHU CHARACTER-1B170..NUSHU CHARACTER-1B2FB 1BC00..1BC6A ; XID_Start # Lo [107] DUPLOYAN LETTER H..DUPLOYAN LETTER VOCALIC M 1BC70..1BC7C ; XID_Start # Lo [13] DUPLOYAN AFFIX LEFT HORIZONTAL SECANT..DUPLOYAN AFFIX ATTACHED TANGENT HOOK 1BC80..1BC88 ; XID_Start # Lo [9] DUPLOYAN AFFIX HIGH ACUTE..DUPLOYAN AFFIX HIGH VERTICAL 1BC90..1BC99 ; XID_Start # Lo [10] DUPLOYAN AFFIX LOW ACUTE..DUPLOYAN AFFIX LOW ARROW 1D400..1D454 ; XID_Start # L& [85] MATHEMATICAL BOLD CAPITAL A..MATHEMATICAL ITALIC SMALL G 1D456..1D49C ; XID_Start # L& [71] MATHEMATICAL ITALIC SMALL I..MATHEMATICAL SCRIPT CAPITAL A 1D49E..1D49F ; XID_Start # L& [2] MATHEMATICAL SCRIPT CAPITAL C..MATHEMATICAL SCRIPT CAPITAL D 1D4A2 ; XID_Start # L& MATHEMATICAL SCRIPT CAPITAL G 1D4A5..1D4A6 ; XID_Start # L& [2] MATHEMATICAL SCRIPT CAPITAL 
J..MATHEMATICAL SCRIPT CAPITAL K 1D4A9..1D4AC ; XID_Start # L& [4] MATHEMATICAL SCRIPT CAPITAL N..MATHEMATICAL SCRIPT CAPITAL Q 1D4AE..1D4B9 ; XID_Start # L& [12] MATHEMATICAL SCRIPT CAPITAL S..MATHEMATICAL SCRIPT SMALL D 1D4BB ; XID_Start # L& MATHEMATICAL SCRIPT SMALL F 1D4BD..1D4C3 ; XID_Start # L& [7] MATHEMATICAL SCRIPT SMALL H..MATHEMATICAL SCRIPT SMALL N 1D4C5..1D505 ; XID_Start # L& [65] MATHEMATICAL SCRIPT SMALL P..MATHEMATICAL FRAKTUR CAPITAL B 1D507..1D50A ; XID_Start # L& [4] MATHEMATICAL FRAKTUR CAPITAL D..MATHEMATICAL FRAKTUR CAPITAL G 1D50D..1D514 ; XID_Start # L& [8] MATHEMATICAL FRAKTUR CAPITAL J..MATHEMATICAL FRAKTUR CAPITAL Q 1D516..1D51C ; XID_Start # L& [7] MATHEMATICAL FRAKTUR CAPITAL S..MATHEMATICAL FRAKTUR CAPITAL Y 1D51E..1D539 ; XID_Start # L& [28] MATHEMATICAL FRAKTUR SMALL A..MATHEMATICAL DOUBLE-STRUCK CAPITAL B 1D53B..1D53E ; XID_Start # L& [4] MATHEMATICAL DOUBLE-STRUCK CAPITAL D..MATHEMATICAL DOUBLE-STRUCK CAPITAL G 1D540..1D544 ; XID_Start # L& [5] MATHEMATICAL DOUBLE-STRUCK CAPITAL I..MATHEMATICAL DOUBLE-STRUCK CAPITAL M 1D546 ; XID_Start # L& MATHEMATICAL DOUBLE-STRUCK CAPITAL O 1D54A..1D550 ; XID_Start # L& [7] MATHEMATICAL DOUBLE-STRUCK CAPITAL S..MATHEMATICAL DOUBLE-STRUCK CAPITAL Y 1D552..1D6A5 ; XID_Start # L& [340] MATHEMATICAL DOUBLE-STRUCK SMALL A..MATHEMATICAL ITALIC SMALL DOTLESS J 1D6A8..1D6C0 ; XID_Start # L& [25] MATHEMATICAL BOLD CAPITAL ALPHA..MATHEMATICAL BOLD CAPITAL OMEGA 1D6C2..1D6DA ; XID_Start # L& [25] MATHEMATICAL BOLD SMALL ALPHA..MATHEMATICAL BOLD SMALL OMEGA 1D6DC..1D6FA ; XID_Start # L& [31] MATHEMATICAL BOLD EPSILON SYMBOL..MATHEMATICAL ITALIC CAPITAL OMEGA 1D6FC..1D714 ; XID_Start # L& [25] MATHEMATICAL ITALIC SMALL ALPHA..MATHEMATICAL ITALIC SMALL OMEGA 1D716..1D734 ; XID_Start # L& [31] MATHEMATICAL ITALIC EPSILON SYMBOL..MATHEMATICAL BOLD ITALIC CAPITAL OMEGA 1D736..1D74E ; XID_Start # L& [25] MATHEMATICAL BOLD ITALIC SMALL ALPHA..MATHEMATICAL BOLD ITALIC SMALL OMEGA 1D750..1D76E ; XID_Start # L& 
[31] MATHEMATICAL BOLD ITALIC EPSILON SYMBOL..MATHEMATICAL SANS-SERIF BOLD CAPITAL OMEGA 1D770..1D788 ; XID_Start # L& [25] MATHEMATICAL SANS-SERIF BOLD SMALL ALPHA..MATHEMATICAL SANS-SERIF BOLD SMALL OMEGA 1D78A..1D7A8 ; XID_Start # L& [31] MATHEMATICAL SANS-SERIF BOLD EPSILON SYMBOL..MATHEMATICAL SANS-SERIF BOLD ITALIC CAPITAL OMEGA 1D7AA..1D7C2 ; XID_Start # L& [25] MATHEMATICAL SANS-SERIF BOLD ITALIC SMALL ALPHA..MATHEMATICAL SANS-SERIF BOLD ITALIC SMALL OMEGA 1D7C4..1D7CB ; XID_Start # L& [8] MATHEMATICAL SANS-SERIF BOLD ITALIC EPSILON SYMBOL..MATHEMATICAL BOLD SMALL DIGAMMA 1DF00..1DF09 ; XID_Start # L& [10] LATIN SMALL LETTER FENG DIGRAPH WITH TRILL..LATIN SMALL LETTER T WITH HOOK AND RETROFLEX HOOK 1DF0A ; XID_Start # Lo LATIN LETTER RETROFLEX CLICK WITH RETROFLEX HOOK 1DF0B..1DF1E ; XID_Start # L& [20] LATIN SMALL LETTER ESH WITH DOUBLE BAR..LATIN SMALL LETTER S WITH CURL 1DF25..1DF2A ; XID_Start # L& [6] LATIN SMALL LETTER D WITH MID-HEIGHT LEFT HOOK..LATIN SMALL LETTER T WITH MID-HEIGHT LEFT HOOK 1E030..1E06D ; XID_Start # Lm [62] MODIFIER LETTER CYRILLIC SMALL A..MODIFIER LETTER CYRILLIC SMALL STRAIGHT U WITH STROKE 1E100..1E12C ; XID_Start # Lo [45] NYIAKENG PUACHUE HMONG LETTER MA..NYIAKENG PUACHUE HMONG LETTER W 1E137..1E13D ; XID_Start # Lm [7] NYIAKENG PUACHUE HMONG SIGN FOR PERSON..NYIAKENG PUACHUE HMONG SYLLABLE LENGTHENER 1E14E ; XID_Start # Lo NYIAKENG PUACHUE HMONG LOGOGRAM NYAJ 1E290..1E2AD ; XID_Start # Lo [30] TOTO LETTER PA..TOTO LETTER A 1E2C0..1E2EB ; XID_Start # Lo [44] WANCHO LETTER AA..WANCHO LETTER YIH 1E4D0..1E4EA ; XID_Start # Lo [27] NAG MUNDARI LETTER O..NAG MUNDARI LETTER ELL 1E4EB ; XID_Start # Lm NAG MUNDARI SIGN OJOD 1E5D0..1E5ED ; XID_Start # Lo [30] OL ONAL LETTER O..OL ONAL LETTER EG 1E5F0 ; XID_Start # Lo OL ONAL SIGN HODDOND 1E6C0..1E6DE ; XID_Start # Lo [31] TAI YO LETTER LOW KO..TAI YO LETTER HIGH KVO 1E6E0..1E6E2 ; XID_Start # Lo [3] TAI YO LETTER AA..TAI YO LETTER UE 1E6E4..1E6E5 ; XID_Start # Lo [2] TAI YO LETTER 
U..TAI YO LETTER AE 1E6E7..1E6ED ; XID_Start # Lo [7] TAI YO LETTER O..TAI YO LETTER AUE 1E6F0..1E6F4 ; XID_Start # Lo [5] TAI YO LETTER AN..TAI YO LETTER AP 1E6FE ; XID_Start # Lo TAI YO SYMBOL MUEANG 1E6FF ; XID_Start # Lm TAI YO XAM LAI 1E7E0..1E7E6 ; XID_Start # Lo [7] ETHIOPIC SYLLABLE HHYA..ETHIOPIC SYLLABLE HHYO 1E7E8..1E7EB ; XID_Start # Lo [4] ETHIOPIC SYLLABLE GURAGE HHWA..ETHIOPIC SYLLABLE HHWE 1E7ED..1E7EE ; XID_Start # Lo [2] ETHIOPIC SYLLABLE GURAGE MWI..ETHIOPIC SYLLABLE GURAGE MWEE 1E7F0..1E7FE ; XID_Start # Lo [15] ETHIOPIC SYLLABLE GURAGE QWI..ETHIOPIC SYLLABLE GURAGE PWEE 1E800..1E8C4 ; XID_Start # Lo [197] MENDE KIKAKUI SYLLABLE M001 KI..MENDE KIKAKUI SYLLABLE M060 NYON 1E900..1E943 ; XID_Start # L& [68] ADLAM CAPITAL LETTER ALIF..ADLAM SMALL LETTER SHA 1E94B ; XID_Start # Lm ADLAM NASALIZATION MARK 1EE00..1EE03 ; XID_Start # Lo [4] ARABIC MATHEMATICAL ALEF..ARABIC MATHEMATICAL DAL 1EE05..1EE1F ; XID_Start # Lo [27] ARABIC MATHEMATICAL WAW..ARABIC MATHEMATICAL DOTLESS QAF 1EE21..1EE22 ; XID_Start # Lo [2] ARABIC MATHEMATICAL INITIAL BEH..ARABIC MATHEMATICAL INITIAL JEEM 1EE24 ; XID_Start # Lo ARABIC MATHEMATICAL INITIAL HEH 1EE27 ; XID_Start # Lo ARABIC MATHEMATICAL INITIAL HAH 1EE29..1EE32 ; XID_Start # Lo [10] ARABIC MATHEMATICAL INITIAL YEH..ARABIC MATHEMATICAL INITIAL QAF 1EE34..1EE37 ; XID_Start # Lo [4] ARABIC MATHEMATICAL INITIAL SHEEN..ARABIC MATHEMATICAL INITIAL KHAH 1EE39 ; XID_Start # Lo ARABIC MATHEMATICAL INITIAL DAD 1EE3B ; XID_Start # Lo ARABIC MATHEMATICAL INITIAL GHAIN 1EE42 ; XID_Start # Lo ARABIC MATHEMATICAL TAILED JEEM 1EE47 ; XID_Start # Lo ARABIC MATHEMATICAL TAILED HAH 1EE49 ; XID_Start # Lo ARABIC MATHEMATICAL TAILED YEH 1EE4B ; XID_Start # Lo ARABIC MATHEMATICAL TAILED LAM 1EE4D..1EE4F ; XID_Start # Lo [3] ARABIC MATHEMATICAL TAILED NOON..ARABIC MATHEMATICAL TAILED AIN 1EE51..1EE52 ; XID_Start # Lo [2] ARABIC MATHEMATICAL TAILED SAD..ARABIC MATHEMATICAL TAILED QAF 1EE54 ; XID_Start # Lo ARABIC MATHEMATICAL TAILED SHEEN 
1EE57 ; XID_Start # Lo ARABIC MATHEMATICAL TAILED KHAH 1EE59 ; XID_Start # Lo ARABIC MATHEMATICAL TAILED DAD 1EE5B ; XID_Start # Lo ARABIC MATHEMATICAL TAILED GHAIN 1EE5D ; XID_Start # Lo ARABIC MATHEMATICAL TAILED DOTLESS NOON 1EE5F ; XID_Start # Lo ARABIC MATHEMATICAL TAILED DOTLESS QAF 1EE61..1EE62 ; XID_Start # Lo [2] ARABIC MATHEMATICAL STRETCHED BEH..ARABIC MATHEMATICAL STRETCHED JEEM 1EE64 ; XID_Start # Lo ARABIC MATHEMATICAL STRETCHED HEH 1EE67..1EE6A ; XID_Start # Lo [4] ARABIC MATHEMATICAL STRETCHED HAH..ARABIC MATHEMATICAL STRETCHED KAF 1EE6C..1EE72 ; XID_Start # Lo [7] ARABIC MATHEMATICAL STRETCHED MEEM..ARABIC MATHEMATICAL STRETCHED QAF 1EE74..1EE77 ; XID_Start # Lo [4] ARABIC MATHEMATICAL STRETCHED SHEEN..ARABIC MATHEMATICAL STRETCHED KHAH 1EE79..1EE7C ; XID_Start # Lo [4] ARABIC MATHEMATICAL STRETCHED DAD..ARABIC MATHEMATICAL STRETCHED DOTLESS BEH 1EE7E ; XID_Start # Lo ARABIC MATHEMATICAL STRETCHED DOTLESS FEH 1EE80..1EE89 ; XID_Start # Lo [10] ARABIC MATHEMATICAL LOOPED ALEF..ARABIC MATHEMATICAL LOOPED YEH 1EE8B..1EE9B ; XID_Start # Lo [17] ARABIC MATHEMATICAL LOOPED LAM..ARABIC MATHEMATICAL LOOPED GHAIN 1EEA1..1EEA3 ; XID_Start # Lo [3] ARABIC MATHEMATICAL DOUBLE-STRUCK BEH..ARABIC MATHEMATICAL DOUBLE-STRUCK DAL 1EEA5..1EEA9 ; XID_Start # Lo [5] ARABIC MATHEMATICAL DOUBLE-STRUCK WAW..ARABIC MATHEMATICAL DOUBLE-STRUCK YEH 1EEAB..1EEBB ; XID_Start # Lo [17] ARABIC MATHEMATICAL DOUBLE-STRUCK LAM..ARABIC MATHEMATICAL DOUBLE-STRUCK GHAIN 20000..2A6DF ; XID_Start # Lo [42720] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6DF 2A700..2B81D ; XID_Start # Lo [4382] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B81D 2B820..2CEAD ; XID_Start # Lo [5774] CJK UNIFIED IDEOGRAPH-2B820..CJK UNIFIED IDEOGRAPH-2CEAD 2CEB0..2EBE0 ; XID_Start # Lo [7473] CJK UNIFIED IDEOGRAPH-2CEB0..CJK UNIFIED IDEOGRAPH-2EBE0 2EBF0..2EE5D ; XID_Start # Lo [622] CJK UNIFIED IDEOGRAPH-2EBF0..CJK UNIFIED IDEOGRAPH-2EE5D 2F800..2FA1D ; XID_Start # Lo [542] CJK COMPATIBILITY 
IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D 30000..3134A ; XID_Start # Lo [4939] CJK UNIFIED IDEOGRAPH-30000..CJK UNIFIED IDEOGRAPH-3134A 31350..33479 ; XID_Start # Lo [8490] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-33479 # Total code points: 145893 # ================================================ # Derived Property: XID_Continue # Mod_ID_Continue modified for closure under NFKx # Modified as described in UAX #15 # NOTE: Does NOT remove the non-NFKx characters. # Merely ensures that if isIdentifer(string) then isIdentifier(NFKx(string)) # NOTE: See UAX #31 for more information 0030..0039 ; XID_Continue # Nd [10] DIGIT ZERO..DIGIT NINE 0041..005A ; XID_Continue # L& [26] LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z 005F ; XID_Continue # Pc LOW LINE 0061..007A ; XID_Continue # L& [26] LATIN SMALL LETTER A..LATIN SMALL LETTER Z 00AA ; XID_Continue # Lo FEMININE ORDINAL INDICATOR 00B5 ; XID_Continue # L& MICRO SIGN 00B7 ; XID_Continue # Po MIDDLE DOT 00BA ; XID_Continue # Lo MASCULINE ORDINAL INDICATOR 00C0..00D6 ; XID_Continue # L& [23] LATIN CAPITAL LETTER A WITH GRAVE..LATIN CAPITAL LETTER O WITH DIAERESIS 00D8..00F6 ; XID_Continue # L& [31] LATIN CAPITAL LETTER O WITH STROKE..LATIN SMALL LETTER O WITH DIAERESIS 00F8..01BA ; XID_Continue # L& [195] LATIN SMALL LETTER O WITH STROKE..LATIN SMALL LETTER EZH WITH TAIL 01BB ; XID_Continue # Lo LATIN LETTER TWO WITH STROKE 01BC..01BF ; XID_Continue # L& [4] LATIN CAPITAL LETTER TONE FIVE..LATIN LETTER WYNN 01C0..01C3 ; XID_Continue # Lo [4] LATIN LETTER DENTAL CLICK..LATIN LETTER RETROFLEX CLICK 01C4..0293 ; XID_Continue # L& [208] LATIN CAPITAL LETTER DZ WITH CARON..LATIN SMALL LETTER EZH WITH CURL 0294..0295 ; XID_Continue # Lo [2] LATIN LETTER GLOTTAL STOP..LATIN LETTER PHARYNGEAL VOICED FRICATIVE 0296..02AF ; XID_Continue # L& [26] LATIN LETTER INVERTED GLOTTAL STOP..LATIN SMALL LETTER TURNED H WITH FISHHOOK AND TAIL 02B0..02C1 ; XID_Continue # Lm [18] MODIFIER LETTER SMALL H..MODIFIER LETTER 
REVERSED GLOTTAL STOP 02C6..02D1 ; XID_Continue # Lm [12] MODIFIER LETTER CIRCUMFLEX ACCENT..MODIFIER LETTER HALF TRIANGULAR COLON 02E0..02E4 ; XID_Continue # Lm [5] MODIFIER LETTER SMALL GAMMA..MODIFIER LETTER SMALL REVERSED GLOTTAL STOP 02EC ; XID_Continue # Lm MODIFIER LETTER VOICING 02EE ; XID_Continue # Lm MODIFIER LETTER DOUBLE APOSTROPHE 0300..036F ; XID_Continue # Mn [112] COMBINING GRAVE ACCENT..COMBINING LATIN SMALL LETTER X 0370..0373 ; XID_Continue # L& [4] GREEK CAPITAL LETTER HETA..GREEK SMALL LETTER ARCHAIC SAMPI 0374 ; XID_Continue # Lm GREEK NUMERAL SIGN 0376..0377 ; XID_Continue # L& [2] GREEK CAPITAL LETTER PAMPHYLIAN DIGAMMA..GREEK SMALL LETTER PAMPHYLIAN DIGAMMA 037B..037D ; XID_Continue # L& [3] GREEK SMALL REVERSED LUNATE SIGMA SYMBOL..GREEK SMALL REVERSED DOTTED LUNATE SIGMA SYMBOL 037F ; XID_Continue # L& GREEK CAPITAL LETTER YOT 0386 ; XID_Continue # L& GREEK CAPITAL LETTER ALPHA WITH TONOS 0387 ; XID_Continue # Po GREEK ANO TELEIA 0388..038A ; XID_Continue # L& [3] GREEK CAPITAL LETTER EPSILON WITH TONOS..GREEK CAPITAL LETTER IOTA WITH TONOS 038C ; XID_Continue # L& GREEK CAPITAL LETTER OMICRON WITH TONOS 038E..03A1 ; XID_Continue # L& [20] GREEK CAPITAL LETTER UPSILON WITH TONOS..GREEK CAPITAL LETTER RHO 03A3..03F5 ; XID_Continue # L& [83] GREEK CAPITAL LETTER SIGMA..GREEK LUNATE EPSILON SYMBOL 03F7..0481 ; XID_Continue # L& [139] GREEK CAPITAL LETTER SHO..CYRILLIC SMALL LETTER KOPPA 0483..0487 ; XID_Continue # Mn [5] COMBINING CYRILLIC TITLO..COMBINING CYRILLIC POKRYTIE 048A..052F ; XID_Continue # L& [166] CYRILLIC CAPITAL LETTER SHORT I WITH TAIL..CYRILLIC SMALL LETTER EL WITH DESCENDER 0531..0556 ; XID_Continue # L& [38] ARMENIAN CAPITAL LETTER AYB..ARMENIAN CAPITAL LETTER FEH 0559 ; XID_Continue # Lm ARMENIAN MODIFIER LETTER LEFT HALF RING 0560..0588 ; XID_Continue # L& [41] ARMENIAN SMALL LETTER TURNED AYB..ARMENIAN SMALL LETTER YI WITH STROKE 0591..05BD ; XID_Continue # Mn [45] HEBREW ACCENT ETNAHTA..HEBREW POINT METEG 05BF ; 
XID_Continue # Mn HEBREW POINT RAFE 05C1..05C2 ; XID_Continue # Mn [2] HEBREW POINT SHIN DOT..HEBREW POINT SIN DOT 05C4..05C5 ; XID_Continue # Mn [2] HEBREW MARK UPPER DOT..HEBREW MARK LOWER DOT 05C7 ; XID_Continue # Mn HEBREW POINT QAMATS QATAN 05D0..05EA ; XID_Continue # Lo [27] HEBREW LETTER ALEF..HEBREW LETTER TAV 05EF..05F2 ; XID_Continue # Lo [4] HEBREW YOD TRIANGLE..HEBREW LIGATURE YIDDISH DOUBLE YOD 0610..061A ; XID_Continue # Mn [11] ARABIC SIGN SALLALLAHOU ALAYHE WASSALLAM..ARABIC SMALL KASRA 0620..063F ; XID_Continue # Lo [32] ARABIC LETTER KASHMIRI YEH..ARABIC LETTER FARSI YEH WITH THREE DOTS ABOVE 0640 ; XID_Continue # Lm ARABIC TATWEEL 0641..064A ; XID_Continue # Lo [10] ARABIC LETTER FEH..ARABIC LETTER YEH 064B..065F ; XID_Continue # Mn [21] ARABIC FATHATAN..ARABIC WAVY HAMZA BELOW 0660..0669 ; XID_Continue # Nd [10] ARABIC-INDIC DIGIT ZERO..ARABIC-INDIC DIGIT NINE 066E..066F ; XID_Continue # Lo [2] ARABIC LETTER DOTLESS BEH..ARABIC LETTER DOTLESS QAF 0670 ; XID_Continue # Mn ARABIC LETTER SUPERSCRIPT ALEF 0671..06D3 ; XID_Continue # Lo [99] ARABIC LETTER ALEF WASLA..ARABIC LETTER YEH BARREE WITH HAMZA ABOVE 06D5 ; XID_Continue # Lo ARABIC LETTER AE 06D6..06DC ; XID_Continue # Mn [7] ARABIC SMALL HIGH LIGATURE SAD WITH LAM WITH ALEF MAKSURA..ARABIC SMALL HIGH SEEN 06DF..06E4 ; XID_Continue # Mn [6] ARABIC SMALL HIGH ROUNDED ZERO..ARABIC SMALL HIGH MADDA 06E5..06E6 ; XID_Continue # Lm [2] ARABIC SMALL WAW..ARABIC SMALL YEH 06E7..06E8 ; XID_Continue # Mn [2] ARABIC SMALL HIGH YEH..ARABIC SMALL HIGH NOON 06EA..06ED ; XID_Continue # Mn [4] ARABIC EMPTY CENTRE LOW STOP..ARABIC SMALL LOW MEEM 06EE..06EF ; XID_Continue # Lo [2] ARABIC LETTER DAL WITH INVERTED V..ARABIC LETTER REH WITH INVERTED V 06F0..06F9 ; XID_Continue # Nd [10] EXTENDED ARABIC-INDIC DIGIT ZERO..EXTENDED ARABIC-INDIC DIGIT NINE 06FA..06FC ; XID_Continue # Lo [3] ARABIC LETTER SHEEN WITH DOT BELOW..ARABIC LETTER GHAIN WITH DOT BELOW 06FF ; XID_Continue # Lo ARABIC LETTER HEH WITH INVERTED 
V 0710 ; XID_Continue # Lo SYRIAC LETTER ALAPH 0711 ; XID_Continue # Mn SYRIAC LETTER SUPERSCRIPT ALAPH 0712..072F ; XID_Continue # Lo [30] SYRIAC LETTER BETH..SYRIAC LETTER PERSIAN DHALATH 0730..074A ; XID_Continue # Mn [27] SYRIAC PTHAHA ABOVE..SYRIAC BARREKH 074D..07A5 ; XID_Continue # Lo [89] SYRIAC LETTER SOGDIAN ZHAIN..THAANA LETTER WAAVU 07A6..07B0 ; XID_Continue # Mn [11] THAANA ABAFILI..THAANA SUKUN 07B1 ; XID_Continue # Lo THAANA LETTER NAA 07C0..07C9 ; XID_Continue # Nd [10] NKO DIGIT ZERO..NKO DIGIT NINE 07CA..07EA ; XID_Continue # Lo [33] NKO LETTER A..NKO LETTER JONA RA 07EB..07F3 ; XID_Continue # Mn [9] NKO COMBINING SHORT HIGH TONE..NKO COMBINING DOUBLE DOT ABOVE 07F4..07F5 ; XID_Continue # Lm [2] NKO HIGH TONE APOSTROPHE..NKO LOW TONE APOSTROPHE 07FA ; XID_Continue # Lm NKO LAJANYALAN 07FD ; XID_Continue # Mn NKO DANTAYALAN 0800..0815 ; XID_Continue # Lo [22] SAMARITAN LETTER ALAF..SAMARITAN LETTER TAAF 0816..0819 ; XID_Continue # Mn [4] SAMARITAN MARK IN..SAMARITAN MARK DAGESH 081A ; XID_Continue # Lm SAMARITAN MODIFIER LETTER EPENTHETIC YUT 081B..0823 ; XID_Continue # Mn [9] SAMARITAN MARK EPENTHETIC YUT..SAMARITAN VOWEL SIGN A 0824 ; XID_Continue # Lm SAMARITAN MODIFIER LETTER SHORT A 0825..0827 ; XID_Continue # Mn [3] SAMARITAN VOWEL SIGN SHORT A..SAMARITAN VOWEL SIGN U 0828 ; XID_Continue # Lm SAMARITAN MODIFIER LETTER I 0829..082D ; XID_Continue # Mn [5] SAMARITAN VOWEL SIGN LONG I..SAMARITAN MARK NEQUDAA 0840..0858 ; XID_Continue # Lo [25] MANDAIC LETTER HALQA..MANDAIC LETTER AIN 0859..085B ; XID_Continue # Mn [3] MANDAIC AFFRICATION MARK..MANDAIC GEMINATION MARK 0860..086A ; XID_Continue # Lo [11] SYRIAC LETTER MALAYALAM NGA..SYRIAC LETTER MALAYALAM SSA 0870..0887 ; XID_Continue # Lo [24] ARABIC LETTER ALEF WITH ATTACHED FATHA..ARABIC BASELINE ROUND DOT 0889..088F ; XID_Continue # Lo [7] ARABIC LETTER NOON WITH INVERTED SMALL V..ARABIC LETTER NOON WITH RING ABOVE 0897..089F ; XID_Continue # Mn [9] ARABIC PEPET..ARABIC HALF MADDA OVER MADDA 
08A0..08C8 ; XID_Continue # Lo [41] ARABIC LETTER BEH WITH SMALL V BELOW..ARABIC LETTER GRAF 08C9 ; XID_Continue # Lm ARABIC SMALL FARSI YEH 08CA..08E1 ; XID_Continue # Mn [24] ARABIC SMALL HIGH FARSI YEH..ARABIC SMALL HIGH SIGN SAFHA 08E3..0902 ; XID_Continue # Mn [32] ARABIC TURNED DAMMA BELOW..DEVANAGARI SIGN ANUSVARA 0903 ; XID_Continue # Mc DEVANAGARI SIGN VISARGA 0904..0939 ; XID_Continue # Lo [54] DEVANAGARI LETTER SHORT A..DEVANAGARI LETTER HA 093A ; XID_Continue # Mn DEVANAGARI VOWEL SIGN OE 093B ; XID_Continue # Mc DEVANAGARI VOWEL SIGN OOE 093C ; XID_Continue # Mn DEVANAGARI SIGN NUKTA 093D ; XID_Continue # Lo DEVANAGARI SIGN AVAGRAHA 093E..0940 ; XID_Continue # Mc [3] DEVANAGARI VOWEL SIGN AA..DEVANAGARI VOWEL SIGN II 0941..0948 ; XID_Continue # Mn [8] DEVANAGARI VOWEL SIGN U..DEVANAGARI VOWEL SIGN AI 0949..094C ; XID_Continue # Mc [4] DEVANAGARI VOWEL SIGN CANDRA O..DEVANAGARI VOWEL SIGN AU 094D ; XID_Continue # Mn DEVANAGARI SIGN VIRAMA 094E..094F ; XID_Continue # Mc [2] DEVANAGARI VOWEL SIGN PRISHTHAMATRA E..DEVANAGARI VOWEL SIGN AW 0950 ; XID_Continue # Lo DEVANAGARI OM 0951..0957 ; XID_Continue # Mn [7] DEVANAGARI STRESS SIGN UDATTA..DEVANAGARI VOWEL SIGN UUE 0958..0961 ; XID_Continue # Lo [10] DEVANAGARI LETTER QA..DEVANAGARI LETTER VOCALIC LL 0962..0963 ; XID_Continue # Mn [2] DEVANAGARI VOWEL SIGN VOCALIC L..DEVANAGARI VOWEL SIGN VOCALIC LL 0966..096F ; XID_Continue # Nd [10] DEVANAGARI DIGIT ZERO..DEVANAGARI DIGIT NINE 0971 ; XID_Continue # Lm DEVANAGARI SIGN HIGH SPACING DOT 0972..0980 ; XID_Continue # Lo [15] DEVANAGARI LETTER CANDRA A..BENGALI ANJI 0981 ; XID_Continue # Mn BENGALI SIGN CANDRABINDU 0982..0983 ; XID_Continue # Mc [2] BENGALI SIGN ANUSVARA..BENGALI SIGN VISARGA 0985..098C ; XID_Continue # Lo [8] BENGALI LETTER A..BENGALI LETTER VOCALIC L 098F..0990 ; XID_Continue # Lo [2] BENGALI LETTER E..BENGALI LETTER AI 0993..09A8 ; XID_Continue # Lo [22] BENGALI LETTER O..BENGALI LETTER NA 09AA..09B0 ; XID_Continue # Lo [7] BENGALI LETTER 
PA..BENGALI LETTER RA 09B2 ; XID_Continue # Lo BENGALI LETTER LA 09B6..09B9 ; XID_Continue # Lo [4] BENGALI LETTER SHA..BENGALI LETTER HA 09BC ; XID_Continue # Mn BENGALI SIGN NUKTA 09BD ; XID_Continue # Lo BENGALI SIGN AVAGRAHA 09BE..09C0 ; XID_Continue # Mc [3] BENGALI VOWEL SIGN AA..BENGALI VOWEL SIGN II 09C1..09C4 ; XID_Continue # Mn [4] BENGALI VOWEL SIGN U..BENGALI VOWEL SIGN VOCALIC RR 09C7..09C8 ; XID_Continue # Mc [2] BENGALI VOWEL SIGN E..BENGALI VOWEL SIGN AI 09CB..09CC ; XID_Continue # Mc [2] BENGALI VOWEL SIGN O..BENGALI VOWEL SIGN AU 09CD ; XID_Continue # Mn BENGALI SIGN VIRAMA 09CE ; XID_Continue # Lo BENGALI LETTER KHANDA TA 09D7 ; XID_Continue # Mc BENGALI AU LENGTH MARK 09DC..09DD ; XID_Continue # Lo [2] BENGALI LETTER RRA..BENGALI LETTER RHA 09DF..09E1 ; XID_Continue # Lo [3] BENGALI LETTER YYA..BENGALI LETTER VOCALIC LL 09E2..09E3 ; XID_Continue # Mn [2] BENGALI VOWEL SIGN VOCALIC L..BENGALI VOWEL SIGN VOCALIC LL 09E6..09EF ; XID_Continue # Nd [10] BENGALI DIGIT ZERO..BENGALI DIGIT NINE 09F0..09F1 ; XID_Continue # Lo [2] BENGALI LETTER RA WITH MIDDLE DIAGONAL..BENGALI LETTER RA WITH LOWER DIAGONAL 09FC ; XID_Continue # Lo BENGALI LETTER VEDIC ANUSVARA 09FE ; XID_Continue # Mn BENGALI SANDHI MARK 0A01..0A02 ; XID_Continue # Mn [2] GURMUKHI SIGN ADAK BINDI..GURMUKHI SIGN BINDI 0A03 ; XID_Continue # Mc GURMUKHI SIGN VISARGA 0A05..0A0A ; XID_Continue # Lo [6] GURMUKHI LETTER A..GURMUKHI LETTER UU 0A0F..0A10 ; XID_Continue # Lo [2] GURMUKHI LETTER EE..GURMUKHI LETTER AI 0A13..0A28 ; XID_Continue # Lo [22] GURMUKHI LETTER OO..GURMUKHI LETTER NA 0A2A..0A30 ; XID_Continue # Lo [7] GURMUKHI LETTER PA..GURMUKHI LETTER RA 0A32..0A33 ; XID_Continue # Lo [2] GURMUKHI LETTER LA..GURMUKHI LETTER LLA 0A35..0A36 ; XID_Continue # Lo [2] GURMUKHI LETTER VA..GURMUKHI LETTER SHA 0A38..0A39 ; XID_Continue # Lo [2] GURMUKHI LETTER SA..GURMUKHI LETTER HA 0A3C ; XID_Continue # Mn GURMUKHI SIGN NUKTA 0A3E..0A40 ; XID_Continue # Mc [3] GURMUKHI VOWEL SIGN AA..GURMUKHI 
VOWEL SIGN II 0A41..0A42 ; XID_Continue # Mn [2] GURMUKHI VOWEL SIGN U..GURMUKHI VOWEL SIGN UU 0A47..0A48 ; XID_Continue # Mn [2] GURMUKHI VOWEL SIGN EE..GURMUKHI VOWEL SIGN AI 0A4B..0A4D ; XID_Continue # Mn [3] GURMUKHI VOWEL SIGN OO..GURMUKHI SIGN VIRAMA 0A51 ; XID_Continue # Mn GURMUKHI SIGN UDAAT 0A59..0A5C ; XID_Continue # Lo [4] GURMUKHI LETTER KHHA..GURMUKHI LETTER RRA 0A5E ; XID_Continue # Lo GURMUKHI LETTER FA 0A66..0A6F ; XID_Continue # Nd [10] GURMUKHI DIGIT ZERO..GURMUKHI DIGIT NINE 0A70..0A71 ; XID_Continue # Mn [2] GURMUKHI TIPPI..GURMUKHI ADDAK 0A72..0A74 ; XID_Continue # Lo [3] GURMUKHI IRI..GURMUKHI EK ONKAR 0A75 ; XID_Continue # Mn GURMUKHI SIGN YAKASH 0A81..0A82 ; XID_Continue # Mn [2] GUJARATI SIGN CANDRABINDU..GUJARATI SIGN ANUSVARA 0A83 ; XID_Continue # Mc GUJARATI SIGN VISARGA 0A85..0A8D ; XID_Continue # Lo [9] GUJARATI LETTER A..GUJARATI VOWEL CANDRA E 0A8F..0A91 ; XID_Continue # Lo [3] GUJARATI LETTER E..GUJARATI VOWEL CANDRA O 0A93..0AA8 ; XID_Continue # Lo [22] GUJARATI LETTER O..GUJARATI LETTER NA 0AAA..0AB0 ; XID_Continue # Lo [7] GUJARATI LETTER PA..GUJARATI LETTER RA 0AB2..0AB3 ; XID_Continue # Lo [2] GUJARATI LETTER LA..GUJARATI LETTER LLA 0AB5..0AB9 ; XID_Continue # Lo [5] GUJARATI LETTER VA..GUJARATI LETTER HA 0ABC ; XID_Continue # Mn GUJARATI SIGN NUKTA 0ABD ; XID_Continue # Lo GUJARATI SIGN AVAGRAHA 0ABE..0AC0 ; XID_Continue # Mc [3] GUJARATI VOWEL SIGN AA..GUJARATI VOWEL SIGN II 0AC1..0AC5 ; XID_Continue # Mn [5] GUJARATI VOWEL SIGN U..GUJARATI VOWEL SIGN CANDRA E 0AC7..0AC8 ; XID_Continue # Mn [2] GUJARATI VOWEL SIGN E..GUJARATI VOWEL SIGN AI 0AC9 ; XID_Continue # Mc GUJARATI VOWEL SIGN CANDRA O 0ACB..0ACC ; XID_Continue # Mc [2] GUJARATI VOWEL SIGN O..GUJARATI VOWEL SIGN AU 0ACD ; XID_Continue # Mn GUJARATI SIGN VIRAMA 0AD0 ; XID_Continue # Lo GUJARATI OM 0AE0..0AE1 ; XID_Continue # Lo [2] GUJARATI LETTER VOCALIC RR..GUJARATI LETTER VOCALIC LL 0AE2..0AE3 ; XID_Continue # Mn [2] GUJARATI VOWEL SIGN VOCALIC L..GUJARATI VOWEL 
SIGN VOCALIC LL 0AE6..0AEF ; XID_Continue # Nd [10] GUJARATI DIGIT ZERO..GUJARATI DIGIT NINE 0AF9 ; XID_Continue # Lo GUJARATI LETTER ZHA 0AFA..0AFF ; XID_Continue # Mn [6] GUJARATI SIGN SUKUN..GUJARATI SIGN TWO-CIRCLE NUKTA ABOVE 0B01 ; XID_Continue # Mn ORIYA SIGN CANDRABINDU 0B02..0B03 ; XID_Continue # Mc [2] ORIYA SIGN ANUSVARA..ORIYA SIGN VISARGA 0B05..0B0C ; XID_Continue # Lo [8] ORIYA LETTER A..ORIYA LETTER VOCALIC L 0B0F..0B10 ; XID_Continue # Lo [2] ORIYA LETTER E..ORIYA LETTER AI 0B13..0B28 ; XID_Continue # Lo [22] ORIYA LETTER O..ORIYA LETTER NA 0B2A..0B30 ; XID_Continue # Lo [7] ORIYA LETTER PA..ORIYA LETTER RA 0B32..0B33 ; XID_Continue # Lo [2] ORIYA LETTER LA..ORIYA LETTER LLA 0B35..0B39 ; XID_Continue # Lo [5] ORIYA LETTER VA..ORIYA LETTER HA 0B3C ; XID_Continue # Mn ORIYA SIGN NUKTA 0B3D ; XID_Continue # Lo ORIYA SIGN AVAGRAHA 0B3E ; XID_Continue # Mc ORIYA VOWEL SIGN AA 0B3F ; XID_Continue # Mn ORIYA VOWEL SIGN I 0B40 ; XID_Continue # Mc ORIYA VOWEL SIGN II 0B41..0B44 ; XID_Continue # Mn [4] ORIYA VOWEL SIGN U..ORIYA VOWEL SIGN VOCALIC RR 0B47..0B48 ; XID_Continue # Mc [2] ORIYA VOWEL SIGN E..ORIYA VOWEL SIGN AI 0B4B..0B4C ; XID_Continue # Mc [2] ORIYA VOWEL SIGN O..ORIYA VOWEL SIGN AU 0B4D ; XID_Continue # Mn ORIYA SIGN VIRAMA 0B55..0B56 ; XID_Continue # Mn [2] ORIYA SIGN OVERLINE..ORIYA AI LENGTH MARK 0B57 ; XID_Continue # Mc ORIYA AU LENGTH MARK 0B5C..0B5D ; XID_Continue # Lo [2] ORIYA LETTER RRA..ORIYA LETTER RHA 0B5F..0B61 ; XID_Continue # Lo [3] ORIYA LETTER YYA..ORIYA LETTER VOCALIC LL 0B62..0B63 ; XID_Continue # Mn [2] ORIYA VOWEL SIGN VOCALIC L..ORIYA VOWEL SIGN VOCALIC LL 0B66..0B6F ; XID_Continue # Nd [10] ORIYA DIGIT ZERO..ORIYA DIGIT NINE 0B71 ; XID_Continue # Lo ORIYA LETTER WA 0B82 ; XID_Continue # Mn TAMIL SIGN ANUSVARA 0B83 ; XID_Continue # Lo TAMIL SIGN VISARGA 0B85..0B8A ; XID_Continue # Lo [6] TAMIL LETTER A..TAMIL LETTER UU 0B8E..0B90 ; XID_Continue # Lo [3] TAMIL LETTER E..TAMIL LETTER AI 0B92..0B95 ; XID_Continue # Lo [4] 
TAMIL LETTER O..TAMIL LETTER KA 0B99..0B9A ; XID_Continue # Lo [2] TAMIL LETTER NGA..TAMIL LETTER CA 0B9C ; XID_Continue # Lo TAMIL LETTER JA 0B9E..0B9F ; XID_Continue # Lo [2] TAMIL LETTER NYA..TAMIL LETTER TTA 0BA3..0BA4 ; XID_Continue # Lo [2] TAMIL LETTER NNA..TAMIL LETTER TA 0BA8..0BAA ; XID_Continue # Lo [3] TAMIL LETTER NA..TAMIL LETTER PA 0BAE..0BB9 ; XID_Continue # Lo [12] TAMIL LETTER MA..TAMIL LETTER HA 0BBE..0BBF ; XID_Continue # Mc [2] TAMIL VOWEL SIGN AA..TAMIL VOWEL SIGN I 0BC0 ; XID_Continue # Mn TAMIL VOWEL SIGN II 0BC1..0BC2 ; XID_Continue # Mc [2] TAMIL VOWEL SIGN U..TAMIL VOWEL SIGN UU 0BC6..0BC8 ; XID_Continue # Mc [3] TAMIL VOWEL SIGN E..TAMIL VOWEL SIGN AI 0BCA..0BCC ; XID_Continue # Mc [3] TAMIL VOWEL SIGN O..TAMIL VOWEL SIGN AU 0BCD ; XID_Continue # Mn TAMIL SIGN VIRAMA 0BD0 ; XID_Continue # Lo TAMIL OM 0BD7 ; XID_Continue # Mc TAMIL AU LENGTH MARK 0BE6..0BEF ; XID_Continue # Nd [10] TAMIL DIGIT ZERO..TAMIL DIGIT NINE 0C00 ; XID_Continue # Mn TELUGU SIGN COMBINING CANDRABINDU ABOVE 0C01..0C03 ; XID_Continue # Mc [3] TELUGU SIGN CANDRABINDU..TELUGU SIGN VISARGA 0C04 ; XID_Continue # Mn TELUGU SIGN COMBINING ANUSVARA ABOVE 0C05..0C0C ; XID_Continue # Lo [8] TELUGU LETTER A..TELUGU LETTER VOCALIC L 0C0E..0C10 ; XID_Continue # Lo [3] TELUGU LETTER E..TELUGU LETTER AI 0C12..0C28 ; XID_Continue # Lo [23] TELUGU LETTER O..TELUGU LETTER NA 0C2A..0C39 ; XID_Continue # Lo [16] TELUGU LETTER PA..TELUGU LETTER HA 0C3C ; XID_Continue # Mn TELUGU SIGN NUKTA 0C3D ; XID_Continue # Lo TELUGU SIGN AVAGRAHA 0C3E..0C40 ; XID_Continue # Mn [3] TELUGU VOWEL SIGN AA..TELUGU VOWEL SIGN II 0C41..0C44 ; XID_Continue # Mc [4] TELUGU VOWEL SIGN U..TELUGU VOWEL SIGN VOCALIC RR 0C46..0C48 ; XID_Continue # Mn [3] TELUGU VOWEL SIGN E..TELUGU VOWEL SIGN AI 0C4A..0C4D ; XID_Continue # Mn [4] TELUGU VOWEL SIGN O..TELUGU SIGN VIRAMA 0C55..0C56 ; XID_Continue # Mn [2] TELUGU LENGTH MARK..TELUGU AI LENGTH MARK 0C58..0C5A ; XID_Continue # Lo [3] TELUGU LETTER TSA..TELUGU LETTER 
RRRA 0C5C..0C5D ; XID_Continue # Lo [2] TELUGU ARCHAIC SHRII..TELUGU LETTER NAKAARA POLLU 0C60..0C61 ; XID_Continue # Lo [2] TELUGU LETTER VOCALIC RR..TELUGU LETTER VOCALIC LL 0C62..0C63 ; XID_Continue # Mn [2] TELUGU VOWEL SIGN VOCALIC L..TELUGU VOWEL SIGN VOCALIC LL 0C66..0C6F ; XID_Continue # Nd [10] TELUGU DIGIT ZERO..TELUGU DIGIT NINE 0C80 ; XID_Continue # Lo KANNADA SIGN SPACING CANDRABINDU 0C81 ; XID_Continue # Mn KANNADA SIGN CANDRABINDU 0C82..0C83 ; XID_Continue # Mc [2] KANNADA SIGN ANUSVARA..KANNADA SIGN VISARGA 0C85..0C8C ; XID_Continue # Lo [8] KANNADA LETTER A..KANNADA LETTER VOCALIC L 0C8E..0C90 ; XID_Continue # Lo [3] KANNADA LETTER E..KANNADA LETTER AI 0C92..0CA8 ; XID_Continue # Lo [23] KANNADA LETTER O..KANNADA LETTER NA 0CAA..0CB3 ; XID_Continue # Lo [10] KANNADA LETTER PA..KANNADA LETTER LLA 0CB5..0CB9 ; XID_Continue # Lo [5] KANNADA LETTER VA..KANNADA LETTER HA 0CBC ; XID_Continue # Mn KANNADA SIGN NUKTA 0CBD ; XID_Continue # Lo KANNADA SIGN AVAGRAHA 0CBE ; XID_Continue # Mc KANNADA VOWEL SIGN AA 0CBF ; XID_Continue # Mn KANNADA VOWEL SIGN I 0CC0..0CC4 ; XID_Continue # Mc [5] KANNADA VOWEL SIGN II..KANNADA VOWEL SIGN VOCALIC RR 0CC6 ; XID_Continue # Mn KANNADA VOWEL SIGN E 0CC7..0CC8 ; XID_Continue # Mc [2] KANNADA VOWEL SIGN EE..KANNADA VOWEL SIGN AI 0CCA..0CCB ; XID_Continue # Mc [2] KANNADA VOWEL SIGN O..KANNADA VOWEL SIGN OO 0CCC..0CCD ; XID_Continue # Mn [2] KANNADA VOWEL SIGN AU..KANNADA SIGN VIRAMA 0CD5..0CD6 ; XID_Continue # Mc [2] KANNADA LENGTH MARK..KANNADA AI LENGTH MARK 0CDC..0CDE ; XID_Continue # Lo [3] KANNADA ARCHAIC SHRII..KANNADA LETTER FA 0CE0..0CE1 ; XID_Continue # Lo [2] KANNADA LETTER VOCALIC RR..KANNADA LETTER VOCALIC LL 0CE2..0CE3 ; XID_Continue # Mn [2] KANNADA VOWEL SIGN VOCALIC L..KANNADA VOWEL SIGN VOCALIC LL 0CE6..0CEF ; XID_Continue # Nd [10] KANNADA DIGIT ZERO..KANNADA DIGIT NINE 0CF1..0CF2 ; XID_Continue # Lo [2] KANNADA SIGN JIHVAMULIYA..KANNADA SIGN UPADHMANIYA 0CF3 ; XID_Continue # Mc KANNADA SIGN COMBINING 
ANUSVARA ABOVE RIGHT 0D00..0D01 ; XID_Continue # Mn [2] MALAYALAM SIGN COMBINING ANUSVARA ABOVE..MALAYALAM SIGN CANDRABINDU 0D02..0D03 ; XID_Continue # Mc [2] MALAYALAM SIGN ANUSVARA..MALAYALAM SIGN VISARGA 0D04..0D0C ; XID_Continue # Lo [9] MALAYALAM LETTER VEDIC ANUSVARA..MALAYALAM LETTER VOCALIC L 0D0E..0D10 ; XID_Continue # Lo [3] MALAYALAM LETTER E..MALAYALAM LETTER AI 0D12..0D3A ; XID_Continue # Lo [41] MALAYALAM LETTER O..MALAYALAM LETTER TTTA 0D3B..0D3C ; XID_Continue # Mn [2] MALAYALAM SIGN VERTICAL BAR VIRAMA..MALAYALAM SIGN CIRCULAR VIRAMA 0D3D ; XID_Continue # Lo MALAYALAM SIGN AVAGRAHA 0D3E..0D40 ; XID_Continue # Mc [3] MALAYALAM VOWEL SIGN AA..MALAYALAM VOWEL SIGN II 0D41..0D44 ; XID_Continue # Mn [4] MALAYALAM VOWEL SIGN U..MALAYALAM VOWEL SIGN VOCALIC RR 0D46..0D48 ; XID_Continue # Mc [3] MALAYALAM VOWEL SIGN E..MALAYALAM VOWEL SIGN AI 0D4A..0D4C ; XID_Continue # Mc [3] MALAYALAM VOWEL SIGN O..MALAYALAM VOWEL SIGN AU 0D4D ; XID_Continue # Mn MALAYALAM SIGN VIRAMA 0D4E ; XID_Continue # Lo MALAYALAM LETTER DOT REPH 0D54..0D56 ; XID_Continue # Lo [3] MALAYALAM LETTER CHILLU M..MALAYALAM LETTER CHILLU LLL 0D57 ; XID_Continue # Mc MALAYALAM AU LENGTH MARK 0D5F..0D61 ; XID_Continue # Lo [3] MALAYALAM LETTER ARCHAIC II..MALAYALAM LETTER VOCALIC LL 0D62..0D63 ; XID_Continue # Mn [2] MALAYALAM VOWEL SIGN VOCALIC L..MALAYALAM VOWEL SIGN VOCALIC LL 0D66..0D6F ; XID_Continue # Nd [10] MALAYALAM DIGIT ZERO..MALAYALAM DIGIT NINE 0D7A..0D7F ; XID_Continue # Lo [6] MALAYALAM LETTER CHILLU NN..MALAYALAM LETTER CHILLU K 0D81 ; XID_Continue # Mn SINHALA SIGN CANDRABINDU 0D82..0D83 ; XID_Continue # Mc [2] SINHALA SIGN ANUSVARAYA..SINHALA SIGN VISARGAYA 0D85..0D96 ; XID_Continue # Lo [18] SINHALA LETTER AYANNA..SINHALA LETTER AUYANNA 0D9A..0DB1 ; XID_Continue # Lo [24] SINHALA LETTER ALPAPRAANA KAYANNA..SINHALA LETTER DANTAJA NAYANNA 0DB3..0DBB ; XID_Continue # Lo [9] SINHALA LETTER SANYAKA DAYANNA..SINHALA LETTER RAYANNA 0DBD ; XID_Continue # Lo SINHALA LETTER DANTAJA 
LAYANNA 0DC0..0DC6 ; XID_Continue # Lo [7] SINHALA LETTER VAYANNA..SINHALA LETTER FAYANNA 0DCA ; XID_Continue # Mn SINHALA SIGN AL-LAKUNA 0DCF..0DD1 ; XID_Continue # Mc [3] SINHALA VOWEL SIGN AELA-PILLA..SINHALA VOWEL SIGN DIGA AEDA-PILLA 0DD2..0DD4 ; XID_Continue # Mn [3] SINHALA VOWEL SIGN KETTI IS-PILLA..SINHALA VOWEL SIGN KETTI PAA-PILLA 0DD6 ; XID_Continue # Mn SINHALA VOWEL SIGN DIGA PAA-PILLA 0DD8..0DDF ; XID_Continue # Mc [8] SINHALA VOWEL SIGN GAETTA-PILLA..SINHALA VOWEL SIGN GAYANUKITTA 0DE6..0DEF ; XID_Continue # Nd [10] SINHALA LITH DIGIT ZERO..SINHALA LITH DIGIT NINE 0DF2..0DF3 ; XID_Continue # Mc [2] SINHALA VOWEL SIGN DIGA GAETTA-PILLA..SINHALA VOWEL SIGN DIGA GAYANUKITTA 0E01..0E30 ; XID_Continue # Lo [48] THAI CHARACTER KO KAI..THAI CHARACTER SARA A 0E31 ; XID_Continue # Mn THAI CHARACTER MAI HAN-AKAT 0E32..0E33 ; XID_Continue # Lo [2] THAI CHARACTER SARA AA..THAI CHARACTER SARA AM 0E34..0E3A ; XID_Continue # Mn [7] THAI CHARACTER SARA I..THAI CHARACTER PHINTHU 0E40..0E45 ; XID_Continue # Lo [6] THAI CHARACTER SARA E..THAI CHARACTER LAKKHANGYAO 0E46 ; XID_Continue # Lm THAI CHARACTER MAIYAMOK 0E47..0E4E ; XID_Continue # Mn [8] THAI CHARACTER MAITAIKHU..THAI CHARACTER YAMAKKAN 0E50..0E59 ; XID_Continue # Nd [10] THAI DIGIT ZERO..THAI DIGIT NINE 0E81..0E82 ; XID_Continue # Lo [2] LAO LETTER KO..LAO LETTER KHO SUNG 0E84 ; XID_Continue # Lo LAO LETTER KHO TAM 0E86..0E8A ; XID_Continue # Lo [5] LAO LETTER PALI GHA..LAO LETTER SO TAM 0E8C..0EA3 ; XID_Continue # Lo [24] LAO LETTER PALI JHA..LAO LETTER LO LING 0EA5 ; XID_Continue # Lo LAO LETTER LO LOOT 0EA7..0EB0 ; XID_Continue # Lo [10] LAO LETTER WO..LAO VOWEL SIGN A 0EB1 ; XID_Continue # Mn LAO VOWEL SIGN MAI KAN 0EB2..0EB3 ; XID_Continue # Lo [2] LAO VOWEL SIGN AA..LAO VOWEL SIGN AM 0EB4..0EBC ; XID_Continue # Mn [9] LAO VOWEL SIGN I..LAO SEMIVOWEL SIGN LO 0EBD ; XID_Continue # Lo LAO SEMIVOWEL SIGN NYO 0EC0..0EC4 ; XID_Continue # Lo [5] LAO VOWEL SIGN E..LAO VOWEL SIGN AI 0EC6 ; XID_Continue # Lm LAO 
KO LA 0EC8..0ECE ; XID_Continue # Mn [7] LAO TONE MAI EK..LAO YAMAKKAN 0ED0..0ED9 ; XID_Continue # Nd [10] LAO DIGIT ZERO..LAO DIGIT NINE 0EDC..0EDF ; XID_Continue # Lo [4] LAO HO NO..LAO LETTER KHMU NYO 0F00 ; XID_Continue # Lo TIBETAN SYLLABLE OM 0F18..0F19 ; XID_Continue # Mn [2] TIBETAN ASTROLOGICAL SIGN -KHYUD PA..TIBETAN ASTROLOGICAL SIGN SDONG TSHUGS 0F20..0F29 ; XID_Continue # Nd [10] TIBETAN DIGIT ZERO..TIBETAN DIGIT NINE 0F35 ; XID_Continue # Mn TIBETAN MARK NGAS BZUNG NYI ZLA 0F37 ; XID_Continue # Mn TIBETAN MARK NGAS BZUNG SGOR RTAGS 0F39 ; XID_Continue # Mn TIBETAN MARK TSA -PHRU 0F3E..0F3F ; XID_Continue # Mc [2] TIBETAN SIGN YAR TSHES..TIBETAN SIGN MAR TSHES 0F40..0F47 ; XID_Continue # Lo [8] TIBETAN LETTER KA..TIBETAN LETTER JA 0F49..0F6C ; XID_Continue # Lo [36] TIBETAN LETTER NYA..TIBETAN LETTER RRA 0F71..0F7E ; XID_Continue # Mn [14] TIBETAN VOWEL SIGN AA..TIBETAN SIGN RJES SU NGA RO 0F7F ; XID_Continue # Mc TIBETAN SIGN RNAM BCAD 0F80..0F84 ; XID_Continue # Mn [5] TIBETAN VOWEL SIGN REVERSED I..TIBETAN MARK HALANTA 0F86..0F87 ; XID_Continue # Mn [2] TIBETAN SIGN LCI RTAGS..TIBETAN SIGN YANG RTAGS 0F88..0F8C ; XID_Continue # Lo [5] TIBETAN SIGN LCE TSA CAN..TIBETAN SIGN INVERTED MCHU CAN 0F8D..0F97 ; XID_Continue # Mn [11] TIBETAN SUBJOINED SIGN LCE TSA CAN..TIBETAN SUBJOINED LETTER JA 0F99..0FBC ; XID_Continue # Mn [36] TIBETAN SUBJOINED LETTER NYA..TIBETAN SUBJOINED LETTER FIXED-FORM RA 0FC6 ; XID_Continue # Mn TIBETAN SYMBOL PADMA GDAN 1000..102A ; XID_Continue # Lo [43] MYANMAR LETTER KA..MYANMAR LETTER AU 102B..102C ; XID_Continue # Mc [2] MYANMAR VOWEL SIGN TALL AA..MYANMAR VOWEL SIGN AA 102D..1030 ; XID_Continue # Mn [4] MYANMAR VOWEL SIGN I..MYANMAR VOWEL SIGN UU 1031 ; XID_Continue # Mc MYANMAR VOWEL SIGN E 1032..1037 ; XID_Continue # Mn [6] MYANMAR VOWEL SIGN AI..MYANMAR SIGN DOT BELOW 1038 ; XID_Continue # Mc MYANMAR SIGN VISARGA 1039..103A ; XID_Continue # Mn [2] MYANMAR SIGN VIRAMA..MYANMAR SIGN ASAT 103B..103C ; XID_Continue # Mc 
[2] MYANMAR CONSONANT SIGN MEDIAL YA..MYANMAR CONSONANT SIGN MEDIAL RA 103D..103E ; XID_Continue # Mn [2] MYANMAR CONSONANT SIGN MEDIAL WA..MYANMAR CONSONANT SIGN MEDIAL HA 103F ; XID_Continue # Lo MYANMAR LETTER GREAT SA 1040..1049 ; XID_Continue # Nd [10] MYANMAR DIGIT ZERO..MYANMAR DIGIT NINE 1050..1055 ; XID_Continue # Lo [6] MYANMAR LETTER SHA..MYANMAR LETTER VOCALIC LL 1056..1057 ; XID_Continue # Mc [2] MYANMAR VOWEL SIGN VOCALIC R..MYANMAR VOWEL SIGN VOCALIC RR 1058..1059 ; XID_Continue # Mn [2] MYANMAR VOWEL SIGN VOCALIC L..MYANMAR VOWEL SIGN VOCALIC LL 105A..105D ; XID_Continue # Lo [4] MYANMAR LETTER MON NGA..MYANMAR LETTER MON BBE 105E..1060 ; XID_Continue # Mn [3] MYANMAR CONSONANT SIGN MON MEDIAL NA..MYANMAR CONSONANT SIGN MON MEDIAL LA 1061 ; XID_Continue # Lo MYANMAR LETTER SGAW KAREN SHA 1062..1064 ; XID_Continue # Mc [3] MYANMAR VOWEL SIGN SGAW KAREN EU..MYANMAR TONE MARK SGAW KAREN KE PHO 1065..1066 ; XID_Continue # Lo [2] MYANMAR LETTER WESTERN PWO KAREN THA..MYANMAR LETTER WESTERN PWO KAREN PWA 1067..106D ; XID_Continue # Mc [7] MYANMAR VOWEL SIGN WESTERN PWO KAREN EU..MYANMAR SIGN WESTERN PWO KAREN TONE-5 106E..1070 ; XID_Continue # Lo [3] MYANMAR LETTER EASTERN PWO KAREN NNA..MYANMAR LETTER EASTERN PWO KAREN GHWA 1071..1074 ; XID_Continue # Mn [4] MYANMAR VOWEL SIGN GEBA KAREN I..MYANMAR VOWEL SIGN KAYAH EE 1075..1081 ; XID_Continue # Lo [13] MYANMAR LETTER SHAN KA..MYANMAR LETTER SHAN HA 1082 ; XID_Continue # Mn MYANMAR CONSONANT SIGN SHAN MEDIAL WA 1083..1084 ; XID_Continue # Mc [2] MYANMAR VOWEL SIGN SHAN AA..MYANMAR VOWEL SIGN SHAN E 1085..1086 ; XID_Continue # Mn [2] MYANMAR VOWEL SIGN SHAN E ABOVE..MYANMAR VOWEL SIGN SHAN FINAL Y 1087..108C ; XID_Continue # Mc [6] MYANMAR SIGN SHAN TONE-2..MYANMAR SIGN SHAN COUNCIL TONE-3 108D ; XID_Continue # Mn MYANMAR SIGN SHAN COUNCIL EMPHATIC TONE 108E ; XID_Continue # Lo MYANMAR LETTER RUMAI PALAUNG FA 108F ; XID_Continue # Mc MYANMAR SIGN RUMAI PALAUNG TONE-5 1090..1099 ; XID_Continue # Nd [10] 
MYANMAR SHAN DIGIT ZERO..MYANMAR SHAN DIGIT NINE 109A..109C ; XID_Continue # Mc [3] MYANMAR SIGN KHAMTI TONE-1..MYANMAR VOWEL SIGN AITON A 109D ; XID_Continue # Mn MYANMAR VOWEL SIGN AITON AI 10A0..10C5 ; XID_Continue # L& [38] GEORGIAN CAPITAL LETTER AN..GEORGIAN CAPITAL LETTER HOE 10C7 ; XID_Continue # L& GEORGIAN CAPITAL LETTER YN 10CD ; XID_Continue # L& GEORGIAN CAPITAL LETTER AEN 10D0..10FA ; XID_Continue # L& [43] GEORGIAN LETTER AN..GEORGIAN LETTER AIN 10FC ; XID_Continue # Lm MODIFIER LETTER GEORGIAN NAR 10FD..10FF ; XID_Continue # L& [3] GEORGIAN LETTER AEN..GEORGIAN LETTER LABIAL SIGN 1100..1248 ; XID_Continue # Lo [329] HANGUL CHOSEONG KIYEOK..ETHIOPIC SYLLABLE QWA 124A..124D ; XID_Continue # Lo [4] ETHIOPIC SYLLABLE QWI..ETHIOPIC SYLLABLE QWE 1250..1256 ; XID_Continue # Lo [7] ETHIOPIC SYLLABLE QHA..ETHIOPIC SYLLABLE QHO 1258 ; XID_Continue # Lo ETHIOPIC SYLLABLE QHWA 125A..125D ; XID_Continue # Lo [4] ETHIOPIC SYLLABLE QHWI..ETHIOPIC SYLLABLE QHWE 1260..1288 ; XID_Continue # Lo [41] ETHIOPIC SYLLABLE BA..ETHIOPIC SYLLABLE XWA 128A..128D ; XID_Continue # Lo [4] ETHIOPIC SYLLABLE XWI..ETHIOPIC SYLLABLE XWE 1290..12B0 ; XID_Continue # Lo [33] ETHIOPIC SYLLABLE NA..ETHIOPIC SYLLABLE KWA 12B2..12B5 ; XID_Continue # Lo [4] ETHIOPIC SYLLABLE KWI..ETHIOPIC SYLLABLE KWE 12B8..12BE ; XID_Continue # Lo [7] ETHIOPIC SYLLABLE KXA..ETHIOPIC SYLLABLE KXO 12C0 ; XID_Continue # Lo ETHIOPIC SYLLABLE KXWA 12C2..12C5 ; XID_Continue # Lo [4] ETHIOPIC SYLLABLE KXWI..ETHIOPIC SYLLABLE KXWE 12C8..12D6 ; XID_Continue # Lo [15] ETHIOPIC SYLLABLE WA..ETHIOPIC SYLLABLE PHARYNGEAL O 12D8..1310 ; XID_Continue # Lo [57] ETHIOPIC SYLLABLE ZA..ETHIOPIC SYLLABLE GWA 1312..1315 ; XID_Continue # Lo [4] ETHIOPIC SYLLABLE GWI..ETHIOPIC SYLLABLE GWE 1318..135A ; XID_Continue # Lo [67] ETHIOPIC SYLLABLE GGA..ETHIOPIC SYLLABLE FYA 135D..135F ; XID_Continue # Mn [3] ETHIOPIC COMBINING GEMINATION AND VOWEL LENGTH MARK..ETHIOPIC COMBINING GEMINATION MARK 1369..1371 ; XID_Continue # No [9] 
ETHIOPIC DIGIT ONE..ETHIOPIC DIGIT NINE 1380..138F ; XID_Continue # Lo [16] ETHIOPIC SYLLABLE SEBATBEIT MWA..ETHIOPIC SYLLABLE PWE 13A0..13F5 ; XID_Continue # L& [86] CHEROKEE LETTER A..CHEROKEE LETTER MV 13F8..13FD ; XID_Continue # L& [6] CHEROKEE SMALL LETTER YE..CHEROKEE SMALL LETTER MV 1401..166C ; XID_Continue # Lo [620] CANADIAN SYLLABICS E..CANADIAN SYLLABICS CARRIER TTSA 166F..167F ; XID_Continue # Lo [17] CANADIAN SYLLABICS QAI..CANADIAN SYLLABICS BLACKFOOT W 1681..169A ; XID_Continue # Lo [26] OGHAM LETTER BEITH..OGHAM LETTER PEITH 16A0..16EA ; XID_Continue # Lo [75] RUNIC LETTER FEHU FEOH FE F..RUNIC LETTER X 16EE..16F0 ; XID_Continue # Nl [3] RUNIC ARLAUG SYMBOL..RUNIC BELGTHOR SYMBOL 16F1..16F8 ; XID_Continue # Lo [8] RUNIC LETTER K..RUNIC LETTER FRANKS CASKET AESC 1700..1711 ; XID_Continue # Lo [18] TAGALOG LETTER A..TAGALOG LETTER HA 1712..1714 ; XID_Continue # Mn [3] TAGALOG VOWEL SIGN I..TAGALOG SIGN VIRAMA 1715 ; XID_Continue # Mc TAGALOG SIGN PAMUDPOD 171F..1731 ; XID_Continue # Lo [19] TAGALOG LETTER ARCHAIC RA..HANUNOO LETTER HA 1732..1733 ; XID_Continue # Mn [2] HANUNOO VOWEL SIGN I..HANUNOO VOWEL SIGN U 1734 ; XID_Continue # Mc HANUNOO SIGN PAMUDPOD 1740..1751 ; XID_Continue # Lo [18] BUHID LETTER A..BUHID LETTER HA 1752..1753 ; XID_Continue # Mn [2] BUHID VOWEL SIGN I..BUHID VOWEL SIGN U 1760..176C ; XID_Continue # Lo [13] TAGBANWA LETTER A..TAGBANWA LETTER YA 176E..1770 ; XID_Continue # Lo [3] TAGBANWA LETTER LA..TAGBANWA LETTER SA 1772..1773 ; XID_Continue # Mn [2] TAGBANWA VOWEL SIGN I..TAGBANWA VOWEL SIGN U 1780..17B3 ; XID_Continue # Lo [52] KHMER LETTER KA..KHMER INDEPENDENT VOWEL QAU 17B4..17B5 ; XID_Continue # Mn [2] KHMER VOWEL INHERENT AQ..KHMER VOWEL INHERENT AA 17B6 ; XID_Continue # Mc KHMER VOWEL SIGN AA 17B7..17BD ; XID_Continue # Mn [7] KHMER VOWEL SIGN I..KHMER VOWEL SIGN UA 17BE..17C5 ; XID_Continue # Mc [8] KHMER VOWEL SIGN OE..KHMER VOWEL SIGN AU 17C6 ; XID_Continue # Mn KHMER SIGN NIKAHIT 17C7..17C8 ; XID_Continue # Mc 
[2] KHMER SIGN REAHMUK..KHMER SIGN YUUKALEAPINTU 17C9..17D3 ; XID_Continue # Mn [11] KHMER SIGN MUUSIKATOAN..KHMER SIGN BATHAMASAT 17D7 ; XID_Continue # Lm KHMER SIGN LEK TOO 17DC ; XID_Continue # Lo KHMER SIGN AVAKRAHASANYA 17DD ; XID_Continue # Mn KHMER SIGN ATTHACAN 17E0..17E9 ; XID_Continue # Nd [10] KHMER DIGIT ZERO..KHMER DIGIT NINE 180B..180D ; XID_Continue # Mn [3] MONGOLIAN FREE VARIATION SELECTOR ONE..MONGOLIAN FREE VARIATION SELECTOR THREE 180F ; XID_Continue # Mn MONGOLIAN FREE VARIATION SELECTOR FOUR 1810..1819 ; XID_Continue # Nd [10] MONGOLIAN DIGIT ZERO..MONGOLIAN DIGIT NINE 1820..1842 ; XID_Continue # Lo [35] MONGOLIAN LETTER A..MONGOLIAN LETTER CHI 1843 ; XID_Continue # Lm MONGOLIAN LETTER TODO LONG VOWEL SIGN 1844..1878 ; XID_Continue # Lo [53] MONGOLIAN LETTER TODO E..MONGOLIAN LETTER CHA WITH TWO DOTS 1880..1884 ; XID_Continue # Lo [5] MONGOLIAN LETTER ALI GALI ANUSVARA ONE..MONGOLIAN LETTER ALI GALI INVERTED UBADAMA 1885..1886 ; XID_Continue # Mn [2] MONGOLIAN LETTER ALI GALI BALUDA..MONGOLIAN LETTER ALI GALI THREE BALUDA 1887..18A8 ; XID_Continue # Lo [34] MONGOLIAN LETTER ALI GALI A..MONGOLIAN LETTER MANCHU ALI GALI BHA 18A9 ; XID_Continue # Mn MONGOLIAN LETTER ALI GALI DAGALGA 18AA ; XID_Continue # Lo MONGOLIAN LETTER MANCHU ALI GALI LHA 18B0..18F5 ; XID_Continue # Lo [70] CANADIAN SYLLABICS OY..CANADIAN SYLLABICS CARRIER DENTAL S 1900..191E ; XID_Continue # Lo [31] LIMBU VOWEL-CARRIER LETTER..LIMBU LETTER TRA 1920..1922 ; XID_Continue # Mn [3] LIMBU VOWEL SIGN A..LIMBU VOWEL SIGN U 1923..1926 ; XID_Continue # Mc [4] LIMBU VOWEL SIGN EE..LIMBU VOWEL SIGN AU 1927..1928 ; XID_Continue # Mn [2] LIMBU VOWEL SIGN E..LIMBU VOWEL SIGN O 1929..192B ; XID_Continue # Mc [3] LIMBU SUBJOINED LETTER YA..LIMBU SUBJOINED LETTER WA 1930..1931 ; XID_Continue # Mc [2] LIMBU SMALL LETTER KA..LIMBU SMALL LETTER NGA 1932 ; XID_Continue # Mn LIMBU SMALL LETTER ANUSVARA 1933..1938 ; XID_Continue # Mc [6] LIMBU SMALL LETTER TA..LIMBU SMALL LETTER LA 1939..193B ; 
XID_Continue # Mn [3] LIMBU SIGN MUKPHRENG..LIMBU SIGN SA-I 1946..194F ; XID_Continue # Nd [10] LIMBU DIGIT ZERO..LIMBU DIGIT NINE 1950..196D ; XID_Continue # Lo [30] TAI LE LETTER KA..TAI LE LETTER AI 1970..1974 ; XID_Continue # Lo [5] TAI LE LETTER TONE-2..TAI LE LETTER TONE-6 1980..19AB ; XID_Continue # Lo [44] NEW TAI LUE LETTER HIGH QA..NEW TAI LUE LETTER LOW SUA 19B0..19C9 ; XID_Continue # Lo [26] NEW TAI LUE VOWEL SIGN VOWEL SHORTENER..NEW TAI LUE TONE MARK-2 19D0..19D9 ; XID_Continue # Nd [10] NEW TAI LUE DIGIT ZERO..NEW TAI LUE DIGIT NINE 19DA ; XID_Continue # No NEW TAI LUE THAM DIGIT ONE 1A00..1A16 ; XID_Continue # Lo [23] BUGINESE LETTER KA..BUGINESE LETTER HA 1A17..1A18 ; XID_Continue # Mn [2] BUGINESE VOWEL SIGN I..BUGINESE VOWEL SIGN U 1A19..1A1A ; XID_Continue # Mc [2] BUGINESE VOWEL SIGN E..BUGINESE VOWEL SIGN O 1A1B ; XID_Continue # Mn BUGINESE VOWEL SIGN AE 1A20..1A54 ; XID_Continue # Lo [53] TAI THAM LETTER HIGH KA..TAI THAM LETTER GREAT SA 1A55 ; XID_Continue # Mc TAI THAM CONSONANT SIGN MEDIAL RA 1A56 ; XID_Continue # Mn TAI THAM CONSONANT SIGN MEDIAL LA 1A57 ; XID_Continue # Mc TAI THAM CONSONANT SIGN LA TANG LAI 1A58..1A5E ; XID_Continue # Mn [7] TAI THAM SIGN MAI KANG LAI..TAI THAM CONSONANT SIGN SA 1A60 ; XID_Continue # Mn TAI THAM SIGN SAKOT 1A61 ; XID_Continue # Mc TAI THAM VOWEL SIGN A 1A62 ; XID_Continue # Mn TAI THAM VOWEL SIGN MAI SAT 1A63..1A64 ; XID_Continue # Mc [2] TAI THAM VOWEL SIGN AA..TAI THAM VOWEL SIGN TALL AA 1A65..1A6C ; XID_Continue # Mn [8] TAI THAM VOWEL SIGN I..TAI THAM VOWEL SIGN OA BELOW 1A6D..1A72 ; XID_Continue # Mc [6] TAI THAM VOWEL SIGN OY..TAI THAM VOWEL SIGN THAM AI 1A73..1A7C ; XID_Continue # Mn [10] TAI THAM VOWEL SIGN OA ABOVE..TAI THAM SIGN KHUEN-LUE KARAN 1A7F ; XID_Continue # Mn TAI THAM COMBINING CRYPTOGRAMMIC DOT 1A80..1A89 ; XID_Continue # Nd [10] TAI THAM HORA DIGIT ZERO..TAI THAM HORA DIGIT NINE 1A90..1A99 ; XID_Continue # Nd [10] TAI THAM THAM DIGIT ZERO..TAI THAM THAM DIGIT NINE 1AA7 ; 
XID_Continue # Lm TAI THAM SIGN MAI YAMOK 1AB0..1ABD ; XID_Continue # Mn [14] COMBINING DOUBLED CIRCUMFLEX ACCENT..COMBINING PARENTHESES BELOW 1ABF..1ADD ; XID_Continue # Mn [31] COMBINING LATIN SMALL LETTER W BELOW..COMBINING DOT-AND-RING BELOW 1AE0..1AEB ; XID_Continue # Mn [12] COMBINING LEFT TACK ABOVE..COMBINING DOUBLE RIGHTWARDS ARROW ABOVE 1B00..1B03 ; XID_Continue # Mn [4] BALINESE SIGN ULU RICEM..BALINESE SIGN SURANG 1B04 ; XID_Continue # Mc BALINESE SIGN BISAH 1B05..1B33 ; XID_Continue # Lo [47] BALINESE LETTER AKARA..BALINESE LETTER HA 1B34 ; XID_Continue # Mn BALINESE SIGN REREKAN 1B35 ; XID_Continue # Mc BALINESE VOWEL SIGN TEDUNG 1B36..1B3A ; XID_Continue # Mn [5] BALINESE VOWEL SIGN ULU..BALINESE VOWEL SIGN RA REPA 1B3B ; XID_Continue # Mc BALINESE VOWEL SIGN RA REPA TEDUNG 1B3C ; XID_Continue # Mn BALINESE VOWEL SIGN LA LENGA 1B3D..1B41 ; XID_Continue # Mc [5] BALINESE VOWEL SIGN LA LENGA TEDUNG..BALINESE VOWEL SIGN TALING REPA TEDUNG 1B42 ; XID_Continue # Mn BALINESE VOWEL SIGN PEPET 1B43..1B44 ; XID_Continue # Mc [2] BALINESE VOWEL SIGN PEPET TEDUNG..BALINESE ADEG ADEG 1B45..1B4C ; XID_Continue # Lo [8] BALINESE LETTER KAF SASAK..BALINESE LETTER ARCHAIC JNYA 1B50..1B59 ; XID_Continue # Nd [10] BALINESE DIGIT ZERO..BALINESE DIGIT NINE 1B6B..1B73 ; XID_Continue # Mn [9] BALINESE MUSICAL SYMBOL COMBINING TEGEH..BALINESE MUSICAL SYMBOL COMBINING GONG 1B80..1B81 ; XID_Continue # Mn [2] SUNDANESE SIGN PANYECEK..SUNDANESE SIGN PANGLAYAR 1B82 ; XID_Continue # Mc SUNDANESE SIGN PANGWISAD 1B83..1BA0 ; XID_Continue # Lo [30] SUNDANESE LETTER A..SUNDANESE LETTER HA 1BA1 ; XID_Continue # Mc SUNDANESE CONSONANT SIGN PAMINGKAL 1BA2..1BA5 ; XID_Continue # Mn [4] SUNDANESE CONSONANT SIGN PANYAKRA..SUNDANESE VOWEL SIGN PANYUKU 1BA6..1BA7 ; XID_Continue # Mc [2] SUNDANESE VOWEL SIGN PANAELAENG..SUNDANESE VOWEL SIGN PANOLONG 1BA8..1BA9 ; XID_Continue # Mn [2] SUNDANESE VOWEL SIGN PAMEPET..SUNDANESE VOWEL SIGN PANEULEUNG 1BAA ; XID_Continue # Mc SUNDANESE SIGN PAMAAEH 
1BAB..1BAD ; XID_Continue # Mn [3] SUNDANESE SIGN VIRAMA..SUNDANESE CONSONANT SIGN PASANGAN WA 1BAE..1BAF ; XID_Continue # Lo [2] SUNDANESE LETTER KHA..SUNDANESE LETTER SYA 1BB0..1BB9 ; XID_Continue # Nd [10] SUNDANESE DIGIT ZERO..SUNDANESE DIGIT NINE 1BBA..1BE5 ; XID_Continue # Lo [44] SUNDANESE AVAGRAHA..BATAK LETTER U 1BE6 ; XID_Continue # Mn BATAK SIGN TOMPI 1BE7 ; XID_Continue # Mc BATAK VOWEL SIGN E 1BE8..1BE9 ; XID_Continue # Mn [2] BATAK VOWEL SIGN PAKPAK E..BATAK VOWEL SIGN EE 1BEA..1BEC ; XID_Continue # Mc [3] BATAK VOWEL SIGN I..BATAK VOWEL SIGN O 1BED ; XID_Continue # Mn BATAK VOWEL SIGN KARO O 1BEE ; XID_Continue # Mc BATAK VOWEL SIGN U 1BEF..1BF1 ; XID_Continue # Mn [3] BATAK VOWEL SIGN U FOR SIMALUNGUN SA..BATAK CONSONANT SIGN H 1BF2..1BF3 ; XID_Continue # Mc [2] BATAK PANGOLAT..BATAK PANONGONAN 1C00..1C23 ; XID_Continue # Lo [36] LEPCHA LETTER KA..LEPCHA LETTER A 1C24..1C2B ; XID_Continue # Mc [8] LEPCHA SUBJOINED LETTER YA..LEPCHA VOWEL SIGN UU 1C2C..1C33 ; XID_Continue # Mn [8] LEPCHA VOWEL SIGN E..LEPCHA CONSONANT SIGN T 1C34..1C35 ; XID_Continue # Mc [2] LEPCHA CONSONANT SIGN NYIN-DO..LEPCHA CONSONANT SIGN KANG 1C36..1C37 ; XID_Continue # Mn [2] LEPCHA SIGN RAN..LEPCHA SIGN NUKTA 1C40..1C49 ; XID_Continue # Nd [10] LEPCHA DIGIT ZERO..LEPCHA DIGIT NINE 1C4D..1C4F ; XID_Continue # Lo [3] LEPCHA LETTER TTA..LEPCHA LETTER DDA 1C50..1C59 ; XID_Continue # Nd [10] OL CHIKI DIGIT ZERO..OL CHIKI DIGIT NINE 1C5A..1C77 ; XID_Continue # Lo [30] OL CHIKI LETTER LA..OL CHIKI LETTER OH 1C78..1C7D ; XID_Continue # Lm [6] OL CHIKI MU TTUDDAG..OL CHIKI AHAD 1C80..1C8A ; XID_Continue # L& [11] CYRILLIC SMALL LETTER ROUNDED VE..CYRILLIC SMALL LETTER TJE 1C90..1CBA ; XID_Continue # L& [43] GEORGIAN MTAVRULI CAPITAL LETTER AN..GEORGIAN MTAVRULI CAPITAL LETTER AIN 1CBD..1CBF ; XID_Continue # L& [3] GEORGIAN MTAVRULI CAPITAL LETTER AEN..GEORGIAN MTAVRULI CAPITAL LETTER LABIAL SIGN 1CD0..1CD2 ; XID_Continue # Mn [3] VEDIC TONE KARSHANA..VEDIC TONE PRENKHA 1CD4..1CE0 ; 
XID_Continue # Mn [13] VEDIC SIGN YAJURVEDIC MIDLINE SVARITA..VEDIC TONE RIGVEDIC KASHMIRI INDEPENDENT SVARITA 1CE1 ; XID_Continue # Mc VEDIC TONE ATHARVAVEDIC INDEPENDENT SVARITA 1CE2..1CE8 ; XID_Continue # Mn [7] VEDIC SIGN VISARGA SVARITA..VEDIC SIGN VISARGA ANUDATTA WITH TAIL 1CE9..1CEC ; XID_Continue # Lo [4] VEDIC SIGN ANUSVARA ANTARGOMUKHA..VEDIC SIGN ANUSVARA VAMAGOMUKHA WITH TAIL 1CED ; XID_Continue # Mn VEDIC SIGN TIRYAK 1CEE..1CF3 ; XID_Continue # Lo [6] VEDIC SIGN HEXIFORM LONG ANUSVARA..VEDIC SIGN ROTATED ARDHAVISARGA 1CF4 ; XID_Continue # Mn VEDIC TONE CANDRA ABOVE 1CF5..1CF6 ; XID_Continue # Lo [2] VEDIC SIGN JIHVAMULIYA..VEDIC SIGN UPADHMANIYA 1CF7 ; XID_Continue # Mc VEDIC SIGN ATIKRAMA 1CF8..1CF9 ; XID_Continue # Mn [2] VEDIC TONE RING ABOVE..VEDIC TONE DOUBLE RING ABOVE 1CFA ; XID_Continue # Lo VEDIC SIGN DOUBLE ANUSVARA ANTARGOMUKHA 1D00..1D2B ; XID_Continue # L& [44] LATIN LETTER SMALL CAPITAL A..CYRILLIC LETTER SMALL CAPITAL EL 1D2C..1D6A ; XID_Continue # Lm [63] MODIFIER LETTER CAPITAL A..GREEK SUBSCRIPT SMALL LETTER CHI 1D6B..1D77 ; XID_Continue # L& [13] LATIN SMALL LETTER UE..LATIN SMALL LETTER TURNED G 1D78 ; XID_Continue # Lm MODIFIER LETTER CYRILLIC EN 1D79..1D9A ; XID_Continue # L& [34] LATIN SMALL LETTER INSULAR G..LATIN SMALL LETTER EZH WITH RETROFLEX HOOK 1D9B..1DBF ; XID_Continue # Lm [37] MODIFIER LETTER SMALL TURNED ALPHA..MODIFIER LETTER SMALL THETA 1DC0..1DFF ; XID_Continue # Mn [64] COMBINING DOTTED GRAVE ACCENT..COMBINING RIGHT ARROWHEAD AND DOWN ARROWHEAD BELOW 1E00..1F15 ; XID_Continue # L& [278] LATIN CAPITAL LETTER A WITH RING BELOW..GREEK SMALL LETTER EPSILON WITH DASIA AND OXIA 1F18..1F1D ; XID_Continue # L& [6] GREEK CAPITAL LETTER EPSILON WITH PSILI..GREEK CAPITAL LETTER EPSILON WITH DASIA AND OXIA 1F20..1F45 ; XID_Continue # L& [38] GREEK SMALL LETTER ETA WITH PSILI..GREEK SMALL LETTER OMICRON WITH DASIA AND OXIA 1F48..1F4D ; XID_Continue # L& [6] GREEK CAPITAL LETTER OMICRON WITH PSILI..GREEK CAPITAL LETTER OMICRON 
WITH DASIA AND OXIA 1F50..1F57 ; XID_Continue # L& [8] GREEK SMALL LETTER UPSILON WITH PSILI..GREEK SMALL LETTER UPSILON WITH DASIA AND PERISPOMENI 1F59 ; XID_Continue # L& GREEK CAPITAL LETTER UPSILON WITH DASIA 1F5B ; XID_Continue # L& GREEK CAPITAL LETTER UPSILON WITH DASIA AND VARIA 1F5D ; XID_Continue # L& GREEK CAPITAL LETTER UPSILON WITH DASIA AND OXIA 1F5F..1F7D ; XID_Continue # L& [31] GREEK CAPITAL LETTER UPSILON WITH DASIA AND PERISPOMENI..GREEK SMALL LETTER OMEGA WITH OXIA 1F80..1FB4 ; XID_Continue # L& [53] GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI..GREEK SMALL LETTER ALPHA WITH OXIA AND YPOGEGRAMMENI 1FB6..1FBC ; XID_Continue # L& [7] GREEK SMALL LETTER ALPHA WITH PERISPOMENI..GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI 1FBE ; XID_Continue # L& GREEK PROSGEGRAMMENI 1FC2..1FC4 ; XID_Continue # L& [3] GREEK SMALL LETTER ETA WITH VARIA AND YPOGEGRAMMENI..GREEK SMALL LETTER ETA WITH OXIA AND YPOGEGRAMMENI 1FC6..1FCC ; XID_Continue # L& [7] GREEK SMALL LETTER ETA WITH PERISPOMENI..GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI 1FD0..1FD3 ; XID_Continue # L& [4] GREEK SMALL LETTER IOTA WITH VRACHY..GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA 1FD6..1FDB ; XID_Continue # L& [6] GREEK SMALL LETTER IOTA WITH PERISPOMENI..GREEK CAPITAL LETTER IOTA WITH OXIA 1FE0..1FEC ; XID_Continue # L& [13] GREEK SMALL LETTER UPSILON WITH VRACHY..GREEK CAPITAL LETTER RHO WITH DASIA 1FF2..1FF4 ; XID_Continue # L& [3] GREEK SMALL LETTER OMEGA WITH VARIA AND YPOGEGRAMMENI..GREEK SMALL LETTER OMEGA WITH OXIA AND YPOGEGRAMMENI 1FF6..1FFC ; XID_Continue # L& [7] GREEK SMALL LETTER OMEGA WITH PERISPOMENI..GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI 200C..200D ; XID_Continue # Cf [2] ZERO WIDTH NON-JOINER..ZERO WIDTH JOINER 203F..2040 ; XID_Continue # Pc [2] UNDERTIE..CHARACTER TIE 2054 ; XID_Continue # Pc INVERTED UNDERTIE 2071 ; XID_Continue # Lm SUPERSCRIPT LATIN SMALL LETTER I 207F ; XID_Continue # Lm SUPERSCRIPT LATIN SMALL LETTER N 2090..209C ; XID_Continue # 
Lm [13] LATIN SUBSCRIPT SMALL LETTER A..LATIN SUBSCRIPT SMALL LETTER T 20D0..20DC ; XID_Continue # Mn [13] COMBINING LEFT HARPOON ABOVE..COMBINING FOUR DOTS ABOVE 20E1 ; XID_Continue # Mn COMBINING LEFT RIGHT ARROW ABOVE 20E5..20F0 ; XID_Continue # Mn [12] COMBINING REVERSE SOLIDUS OVERLAY..COMBINING ASTERISK ABOVE 2102 ; XID_Continue # L& DOUBLE-STRUCK CAPITAL C 2107 ; XID_Continue # L& EULER CONSTANT 210A..2113 ; XID_Continue # L& [10] SCRIPT SMALL G..SCRIPT SMALL L 2115 ; XID_Continue # L& DOUBLE-STRUCK CAPITAL N 2118 ; XID_Continue # Sm SCRIPT CAPITAL P 2119..211D ; XID_Continue # L& [5] DOUBLE-STRUCK CAPITAL P..DOUBLE-STRUCK CAPITAL R 2124 ; XID_Continue # L& DOUBLE-STRUCK CAPITAL Z 2126 ; XID_Continue # L& OHM SIGN 2128 ; XID_Continue # L& BLACK-LETTER CAPITAL Z 212A..212D ; XID_Continue # L& [4] KELVIN SIGN..BLACK-LETTER CAPITAL C 212E ; XID_Continue # So ESTIMATED SYMBOL 212F..2134 ; XID_Continue # L& [6] SCRIPT SMALL E..SCRIPT SMALL O 2135..2138 ; XID_Continue # Lo [4] ALEF SYMBOL..DALET SYMBOL 2139 ; XID_Continue # L& INFORMATION SOURCE 213C..213F ; XID_Continue # L& [4] DOUBLE-STRUCK SMALL PI..DOUBLE-STRUCK CAPITAL PI 2145..2149 ; XID_Continue # L& [5] DOUBLE-STRUCK ITALIC CAPITAL D..DOUBLE-STRUCK ITALIC SMALL J 214E ; XID_Continue # L& TURNED SMALL F 2160..2182 ; XID_Continue # Nl [35] ROMAN NUMERAL ONE..ROMAN NUMERAL TEN THOUSAND 2183..2184 ; XID_Continue # L& [2] ROMAN NUMERAL REVERSED ONE HUNDRED..LATIN SMALL LETTER REVERSED C 2185..2188 ; XID_Continue # Nl [4] ROMAN NUMERAL SIX LATE FORM..ROMAN NUMERAL ONE HUNDRED THOUSAND 2C00..2C7B ; XID_Continue # L& [124] GLAGOLITIC CAPITAL LETTER AZU..LATIN LETTER SMALL CAPITAL TURNED E 2C7C..2C7D ; XID_Continue # Lm [2] LATIN SUBSCRIPT SMALL LETTER J..MODIFIER LETTER CAPITAL V 2C7E..2CE4 ; XID_Continue # L& [103] LATIN CAPITAL LETTER S WITH SWASH TAIL..COPTIC SYMBOL KAI 2CEB..2CEE ; XID_Continue # L& [4] COPTIC CAPITAL LETTER CRYPTOGRAMMIC SHEI..COPTIC SMALL LETTER CRYPTOGRAMMIC GANGIA 2CEF..2CF1 ; 
XID_Continue # Mn [3] COPTIC COMBINING NI ABOVE..COPTIC COMBINING SPIRITUS LENIS 2CF2..2CF3 ; XID_Continue # L& [2] COPTIC CAPITAL LETTER BOHAIRIC KHEI..COPTIC SMALL LETTER BOHAIRIC KHEI 2D00..2D25 ; XID_Continue # L& [38] GEORGIAN SMALL LETTER AN..GEORGIAN SMALL LETTER HOE 2D27 ; XID_Continue # L& GEORGIAN SMALL LETTER YN 2D2D ; XID_Continue # L& GEORGIAN SMALL LETTER AEN 2D30..2D67 ; XID_Continue # Lo [56] TIFINAGH LETTER YA..TIFINAGH LETTER YO 2D6F ; XID_Continue # Lm TIFINAGH MODIFIER LETTER LABIALIZATION MARK 2D7F ; XID_Continue # Mn TIFINAGH CONSONANT JOINER 2D80..2D96 ; XID_Continue # Lo [23] ETHIOPIC SYLLABLE LOA..ETHIOPIC SYLLABLE GGWE 2DA0..2DA6 ; XID_Continue # Lo [7] ETHIOPIC SYLLABLE SSA..ETHIOPIC SYLLABLE SSO 2DA8..2DAE ; XID_Continue # Lo [7] ETHIOPIC SYLLABLE CCA..ETHIOPIC SYLLABLE CCO 2DB0..2DB6 ; XID_Continue # Lo [7] ETHIOPIC SYLLABLE ZZA..ETHIOPIC SYLLABLE ZZO 2DB8..2DBE ; XID_Continue # Lo [7] ETHIOPIC SYLLABLE CCHA..ETHIOPIC SYLLABLE CCHO 2DC0..2DC6 ; XID_Continue # Lo [7] ETHIOPIC SYLLABLE QYA..ETHIOPIC SYLLABLE QYO 2DC8..2DCE ; XID_Continue # Lo [7] ETHIOPIC SYLLABLE KYA..ETHIOPIC SYLLABLE KYO 2DD0..2DD6 ; XID_Continue # Lo [7] ETHIOPIC SYLLABLE XYA..ETHIOPIC SYLLABLE XYO 2DD8..2DDE ; XID_Continue # Lo [7] ETHIOPIC SYLLABLE GYA..ETHIOPIC SYLLABLE GYO 2DE0..2DFF ; XID_Continue # Mn [32] COMBINING CYRILLIC LETTER BE..COMBINING CYRILLIC LETTER IOTIFIED BIG YUS 3005 ; XID_Continue # Lm IDEOGRAPHIC ITERATION MARK 3006 ; XID_Continue # Lo IDEOGRAPHIC CLOSING MARK 3007 ; XID_Continue # Nl IDEOGRAPHIC NUMBER ZERO 3021..3029 ; XID_Continue # Nl [9] HANGZHOU NUMERAL ONE..HANGZHOU NUMERAL NINE 302A..302D ; XID_Continue # Mn [4] IDEOGRAPHIC LEVEL TONE MARK..IDEOGRAPHIC ENTERING TONE MARK 302E..302F ; XID_Continue # Mc [2] HANGUL SINGLE DOT TONE MARK..HANGUL DOUBLE DOT TONE MARK 3031..3035 ; XID_Continue # Lm [5] VERTICAL KANA REPEAT MARK..VERTICAL KANA REPEAT MARK LOWER HALF 3038..303A ; XID_Continue # Nl [3] HANGZHOU NUMERAL TEN..HANGZHOU NUMERAL 
THIRTY 303B ; XID_Continue # Lm VERTICAL IDEOGRAPHIC ITERATION MARK 303C ; XID_Continue # Lo MASU MARK 3041..3096 ; XID_Continue # Lo [86] HIRAGANA LETTER SMALL A..HIRAGANA LETTER SMALL KE 3099..309A ; XID_Continue # Mn [2] COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK..COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK 309D..309E ; XID_Continue # Lm [2] HIRAGANA ITERATION MARK..HIRAGANA VOICED ITERATION MARK 309F ; XID_Continue # Lo HIRAGANA DIGRAPH YORI 30A1..30FA ; XID_Continue # Lo [90] KATAKANA LETTER SMALL A..KATAKANA LETTER VO 30FB ; XID_Continue # Po KATAKANA MIDDLE DOT 30FC..30FE ; XID_Continue # Lm [3] KATAKANA-HIRAGANA PROLONGED SOUND MARK..KATAKANA VOICED ITERATION MARK 30FF ; XID_Continue # Lo KATAKANA DIGRAPH KOTO 3105..312F ; XID_Continue # Lo [43] BOPOMOFO LETTER B..BOPOMOFO LETTER NN 3131..318E ; XID_Continue # Lo [94] HANGUL LETTER KIYEOK..HANGUL LETTER ARAEAE 31A0..31BF ; XID_Continue # Lo [32] BOPOMOFO LETTER BU..BOPOMOFO LETTER AH 31F0..31FF ; XID_Continue # Lo [16] KATAKANA LETTER SMALL KU..KATAKANA LETTER SMALL RO 3400..4DBF ; XID_Continue # Lo [6592] CJK UNIFIED IDEOGRAPH-3400..CJK UNIFIED IDEOGRAPH-4DBF 4E00..A014 ; XID_Continue # Lo [21013] CJK UNIFIED IDEOGRAPH-4E00..YI SYLLABLE E A015 ; XID_Continue # Lm YI SYLLABLE WU A016..A48C ; XID_Continue # Lo [1143] YI SYLLABLE BIT..YI SYLLABLE YYR A4D0..A4F7 ; XID_Continue # Lo [40] LISU LETTER BA..LISU LETTER OE A4F8..A4FD ; XID_Continue # Lm [6] LISU LETTER TONE MYA TI..LISU LETTER TONE MYA JEU A500..A60B ; XID_Continue # Lo [268] VAI SYLLABLE EE..VAI SYLLABLE NG A60C ; XID_Continue # Lm VAI SYLLABLE LENGTHENER A610..A61F ; XID_Continue # Lo [16] VAI SYLLABLE NDOLE FA..VAI SYMBOL JONG A620..A629 ; XID_Continue # Nd [10] VAI DIGIT ZERO..VAI DIGIT NINE A62A..A62B ; XID_Continue # Lo [2] VAI SYLLABLE NDOLE MA..VAI SYLLABLE NDOLE DO A640..A66D ; XID_Continue # L& [46] CYRILLIC CAPITAL LETTER ZEMLYA..CYRILLIC SMALL LETTER DOUBLE MONOCULAR O A66E ; XID_Continue # Lo CYRILLIC LETTER MULTIOCULAR O A66F 
; XID_Continue # Mn COMBINING CYRILLIC VZMET A674..A67D ; XID_Continue # Mn [10] COMBINING CYRILLIC LETTER UKRAINIAN IE..COMBINING CYRILLIC PAYEROK A67F ; XID_Continue # Lm CYRILLIC PAYEROK A680..A69B ; XID_Continue # L& [28] CYRILLIC CAPITAL LETTER DWE..CYRILLIC SMALL LETTER CROSSED O A69C..A69D ; XID_Continue # Lm [2] MODIFIER LETTER CYRILLIC HARD SIGN..MODIFIER LETTER CYRILLIC SOFT SIGN A69E..A69F ; XID_Continue # Mn [2] COMBINING CYRILLIC LETTER EF..COMBINING CYRILLIC LETTER IOTIFIED E A6A0..A6E5 ; XID_Continue # Lo [70] BAMUM LETTER A..BAMUM LETTER KI A6E6..A6EF ; XID_Continue # Nl [10] BAMUM LETTER MO..BAMUM LETTER KOGHOM A6F0..A6F1 ; XID_Continue # Mn [2] BAMUM COMBINING MARK KOQNDON..BAMUM COMBINING MARK TUKWENTIS A717..A71F ; XID_Continue # Lm [9] MODIFIER LETTER DOT VERTICAL BAR..MODIFIER LETTER LOW INVERTED EXCLAMATION MARK A722..A76F ; XID_Continue # L& [78] LATIN CAPITAL LETTER EGYPTOLOGICAL ALEF..LATIN SMALL LETTER CON A770 ; XID_Continue # Lm MODIFIER LETTER US A771..A787 ; XID_Continue # L& [23] LATIN SMALL LETTER DUM..LATIN SMALL LETTER INSULAR T A788 ; XID_Continue # Lm MODIFIER LETTER LOW CIRCUMFLEX ACCENT A78B..A78E ; XID_Continue # L& [4] LATIN CAPITAL LETTER SALTILLO..LATIN SMALL LETTER L WITH RETROFLEX HOOK AND BELT A78F ; XID_Continue # Lo LATIN LETTER SINOLOGICAL DOT A790..A7DC ; XID_Continue # L& [77] LATIN CAPITAL LETTER N WITH DESCENDER..LATIN CAPITAL LETTER LAMBDA WITH STROKE A7F1..A7F4 ; XID_Continue # Lm [4] MODIFIER LETTER CAPITAL S..MODIFIER LETTER CAPITAL Q A7F5..A7F6 ; XID_Continue # L& [2] LATIN CAPITAL LETTER REVERSED HALF H..LATIN SMALL LETTER REVERSED HALF H A7F7 ; XID_Continue # Lo LATIN EPIGRAPHIC LETTER SIDEWAYS I A7F8..A7F9 ; XID_Continue # Lm [2] MODIFIER LETTER CAPITAL H WITH STROKE..MODIFIER LETTER SMALL LIGATURE OE A7FA ; XID_Continue # L& LATIN LETTER SMALL CAPITAL TURNED M A7FB..A801 ; XID_Continue # Lo [7] LATIN EPIGRAPHIC LETTER REVERSED F..SYLOTI NAGRI LETTER I A802 ; XID_Continue # Mn SYLOTI NAGRI SIGN DVISVARA 
A803..A805 ; XID_Continue # Lo [3] SYLOTI NAGRI LETTER U..SYLOTI NAGRI LETTER O A806 ; XID_Continue # Mn SYLOTI NAGRI SIGN HASANTA A807..A80A ; XID_Continue # Lo [4] SYLOTI NAGRI LETTER KO..SYLOTI NAGRI LETTER GHO A80B ; XID_Continue # Mn SYLOTI NAGRI SIGN ANUSVARA A80C..A822 ; XID_Continue # Lo [23] SYLOTI NAGRI LETTER CO..SYLOTI NAGRI LETTER HO A823..A824 ; XID_Continue # Mc [2] SYLOTI NAGRI VOWEL SIGN A..SYLOTI NAGRI VOWEL SIGN I A825..A826 ; XID_Continue # Mn [2] SYLOTI NAGRI VOWEL SIGN U..SYLOTI NAGRI VOWEL SIGN E A827 ; XID_Continue # Mc SYLOTI NAGRI VOWEL SIGN OO A82C ; XID_Continue # Mn SYLOTI NAGRI SIGN ALTERNATE HASANTA A840..A873 ; XID_Continue # Lo [52] PHAGS-PA LETTER KA..PHAGS-PA LETTER CANDRABINDU A880..A881 ; XID_Continue # Mc [2] SAURASHTRA SIGN ANUSVARA..SAURASHTRA SIGN VISARGA A882..A8B3 ; XID_Continue # Lo [50] SAURASHTRA LETTER A..SAURASHTRA LETTER LLA A8B4..A8C3 ; XID_Continue # Mc [16] SAURASHTRA CONSONANT SIGN HAARU..SAURASHTRA VOWEL SIGN AU A8C4..A8C5 ; XID_Continue # Mn [2] SAURASHTRA SIGN VIRAMA..SAURASHTRA SIGN CANDRABINDU A8D0..A8D9 ; XID_Continue # Nd [10] SAURASHTRA DIGIT ZERO..SAURASHTRA DIGIT NINE A8E0..A8F1 ; XID_Continue # Mn [18] COMBINING DEVANAGARI DIGIT ZERO..COMBINING DEVANAGARI SIGN AVAGRAHA A8F2..A8F7 ; XID_Continue # Lo [6] DEVANAGARI SIGN SPACING CANDRABINDU..DEVANAGARI SIGN CANDRABINDU AVAGRAHA A8FB ; XID_Continue # Lo DEVANAGARI HEADSTROKE A8FD..A8FE ; XID_Continue # Lo [2] DEVANAGARI JAIN OM..DEVANAGARI LETTER AY A8FF ; XID_Continue # Mn DEVANAGARI VOWEL SIGN AY A900..A909 ; XID_Continue # Nd [10] KAYAH LI DIGIT ZERO..KAYAH LI DIGIT NINE A90A..A925 ; XID_Continue # Lo [28] KAYAH LI LETTER KA..KAYAH LI LETTER OO A926..A92D ; XID_Continue # Mn [8] KAYAH LI VOWEL UE..KAYAH LI TONE CALYA PLOPHU A930..A946 ; XID_Continue # Lo [23] REJANG LETTER KA..REJANG LETTER A A947..A951 ; XID_Continue # Mn [11] REJANG VOWEL SIGN I..REJANG CONSONANT SIGN R A952..A953 ; XID_Continue # Mc [2] REJANG CONSONANT SIGN H..REJANG VIRAMA 
A960..A97C ; XID_Continue # Lo [29] HANGUL CHOSEONG TIKEUT-MIEUM..HANGUL CHOSEONG SSANGYEORINHIEUH A980..A982 ; XID_Continue # Mn [3] JAVANESE SIGN PANYANGGA..JAVANESE SIGN LAYAR A983 ; XID_Continue # Mc JAVANESE SIGN WIGNYAN A984..A9B2 ; XID_Continue # Lo [47] JAVANESE LETTER A..JAVANESE LETTER HA A9B3 ; XID_Continue # Mn JAVANESE SIGN CECAK TELU A9B4..A9B5 ; XID_Continue # Mc [2] JAVANESE VOWEL SIGN TARUNG..JAVANESE VOWEL SIGN TOLONG A9B6..A9B9 ; XID_Continue # Mn [4] JAVANESE VOWEL SIGN WULU..JAVANESE VOWEL SIGN SUKU MENDUT A9BA..A9BB ; XID_Continue # Mc [2] JAVANESE VOWEL SIGN TALING..JAVANESE VOWEL SIGN DIRGA MURE A9BC..A9BD ; XID_Continue # Mn [2] JAVANESE VOWEL SIGN PEPET..JAVANESE CONSONANT SIGN KERET A9BE..A9C0 ; XID_Continue # Mc [3] JAVANESE CONSONANT SIGN PENGKAL..JAVANESE PANGKON A9CF ; XID_Continue # Lm JAVANESE PANGRANGKEP A9D0..A9D9 ; XID_Continue # Nd [10] JAVANESE DIGIT ZERO..JAVANESE DIGIT NINE A9E0..A9E4 ; XID_Continue # Lo [5] MYANMAR LETTER SHAN GHA..MYANMAR LETTER SHAN BHA A9E5 ; XID_Continue # Mn MYANMAR SIGN SHAN SAW A9E6 ; XID_Continue # Lm MYANMAR MODIFIER LETTER SHAN REDUPLICATION A9E7..A9EF ; XID_Continue # Lo [9] MYANMAR LETTER TAI LAING NYA..MYANMAR LETTER TAI LAING NNA A9F0..A9F9 ; XID_Continue # Nd [10] MYANMAR TAI LAING DIGIT ZERO..MYANMAR TAI LAING DIGIT NINE A9FA..A9FE ; XID_Continue # Lo [5] MYANMAR LETTER TAI LAING LLA..MYANMAR LETTER TAI LAING BHA AA00..AA28 ; XID_Continue # Lo [41] CHAM LETTER A..CHAM LETTER HA AA29..AA2E ; XID_Continue # Mn [6] CHAM VOWEL SIGN AA..CHAM VOWEL SIGN OE AA2F..AA30 ; XID_Continue # Mc [2] CHAM VOWEL SIGN O..CHAM VOWEL SIGN AI AA31..AA32 ; XID_Continue # Mn [2] CHAM VOWEL SIGN AU..CHAM VOWEL SIGN UE AA33..AA34 ; XID_Continue # Mc [2] CHAM CONSONANT SIGN YA..CHAM CONSONANT SIGN RA AA35..AA36 ; XID_Continue # Mn [2] CHAM CONSONANT SIGN LA..CHAM CONSONANT SIGN WA AA40..AA42 ; XID_Continue # Lo [3] CHAM LETTER FINAL K..CHAM LETTER FINAL NG AA43 ; XID_Continue # Mn CHAM CONSONANT SIGN FINAL NG 
AA44..AA4B ; XID_Continue # Lo [8] CHAM LETTER FINAL CH..CHAM LETTER FINAL SS AA4C ; XID_Continue # Mn CHAM CONSONANT SIGN FINAL M AA4D ; XID_Continue # Mc CHAM CONSONANT SIGN FINAL H AA50..AA59 ; XID_Continue # Nd [10] CHAM DIGIT ZERO..CHAM DIGIT NINE AA60..AA6F ; XID_Continue # Lo [16] MYANMAR LETTER KHAMTI GA..MYANMAR LETTER KHAMTI FA AA70 ; XID_Continue # Lm MYANMAR MODIFIER LETTER KHAMTI REDUPLICATION AA71..AA76 ; XID_Continue # Lo [6] MYANMAR LETTER KHAMTI XA..MYANMAR LOGOGRAM KHAMTI HM AA7A ; XID_Continue # Lo MYANMAR LETTER AITON RA AA7B ; XID_Continue # Mc MYANMAR SIGN PAO KAREN TONE AA7C ; XID_Continue # Mn MYANMAR SIGN TAI LAING TONE-2 AA7D ; XID_Continue # Mc MYANMAR SIGN TAI LAING TONE-5 AA7E..AAAF ; XID_Continue # Lo [50] MYANMAR LETTER SHWE PALAUNG CHA..TAI VIET LETTER HIGH O AAB0 ; XID_Continue # Mn TAI VIET MAI KANG AAB1 ; XID_Continue # Lo TAI VIET VOWEL AA AAB2..AAB4 ; XID_Continue # Mn [3] TAI VIET VOWEL I..TAI VIET VOWEL U AAB5..AAB6 ; XID_Continue # Lo [2] TAI VIET VOWEL E..TAI VIET VOWEL O AAB7..AAB8 ; XID_Continue # Mn [2] TAI VIET MAI KHIT..TAI VIET VOWEL IA AAB9..AABD ; XID_Continue # Lo [5] TAI VIET VOWEL UEA..TAI VIET VOWEL AN AABE..AABF ; XID_Continue # Mn [2] TAI VIET VOWEL AM..TAI VIET TONE MAI EK AAC0 ; XID_Continue # Lo TAI VIET TONE MAI NUENG AAC1 ; XID_Continue # Mn TAI VIET TONE MAI THO AAC2 ; XID_Continue # Lo TAI VIET TONE MAI SONG AADB..AADC ; XID_Continue # Lo [2] TAI VIET SYMBOL KON..TAI VIET SYMBOL NUENG AADD ; XID_Continue # Lm TAI VIET SYMBOL SAM AAE0..AAEA ; XID_Continue # Lo [11] MEETEI MAYEK LETTER E..MEETEI MAYEK LETTER SSA AAEB ; XID_Continue # Mc MEETEI MAYEK VOWEL SIGN II AAEC..AAED ; XID_Continue # Mn [2] MEETEI MAYEK VOWEL SIGN UU..MEETEI MAYEK VOWEL SIGN AAI AAEE..AAEF ; XID_Continue # Mc [2] MEETEI MAYEK VOWEL SIGN AU..MEETEI MAYEK VOWEL SIGN AAU AAF2 ; XID_Continue # Lo MEETEI MAYEK ANJI AAF3..AAF4 ; XID_Continue # Lm [2] MEETEI MAYEK SYLLABLE REPETITION MARK..MEETEI MAYEK WORD REPETITION MARK AAF5 ; 
XID_Continue # Mc MEETEI MAYEK VOWEL SIGN VISARGA AAF6 ; XID_Continue # Mn MEETEI MAYEK VIRAMA AB01..AB06 ; XID_Continue # Lo [6] ETHIOPIC SYLLABLE TTHU..ETHIOPIC SYLLABLE TTHO AB09..AB0E ; XID_Continue # Lo [6] ETHIOPIC SYLLABLE DDHU..ETHIOPIC SYLLABLE DDHO AB11..AB16 ; XID_Continue # Lo [6] ETHIOPIC SYLLABLE DZU..ETHIOPIC SYLLABLE DZO AB20..AB26 ; XID_Continue # Lo [7] ETHIOPIC SYLLABLE CCHHA..ETHIOPIC SYLLABLE CCHHO AB28..AB2E ; XID_Continue # Lo [7] ETHIOPIC SYLLABLE BBA..ETHIOPIC SYLLABLE BBO AB30..AB5A ; XID_Continue # L& [43] LATIN SMALL LETTER BARRED ALPHA..LATIN SMALL LETTER Y WITH SHORT RIGHT LEG AB5C..AB5F ; XID_Continue # Lm [4] MODIFIER LETTER SMALL HENG..MODIFIER LETTER SMALL U WITH LEFT HOOK AB60..AB68 ; XID_Continue # L& [9] LATIN SMALL LETTER SAKHA YAT..LATIN SMALL LETTER TURNED R WITH MIDDLE TILDE AB69 ; XID_Continue # Lm MODIFIER LETTER SMALL TURNED W AB70..ABBF ; XID_Continue # L& [80] CHEROKEE SMALL LETTER A..CHEROKEE SMALL LETTER YA ABC0..ABE2 ; XID_Continue # Lo [35] MEETEI MAYEK LETTER KOK..MEETEI MAYEK LETTER I LONSUM ABE3..ABE4 ; XID_Continue # Mc [2] MEETEI MAYEK VOWEL SIGN ONAP..MEETEI MAYEK VOWEL SIGN INAP ABE5 ; XID_Continue # Mn MEETEI MAYEK VOWEL SIGN ANAP ABE6..ABE7 ; XID_Continue # Mc [2] MEETEI MAYEK VOWEL SIGN YENAP..MEETEI MAYEK VOWEL SIGN SOUNAP ABE8 ; XID_Continue # Mn MEETEI MAYEK VOWEL SIGN UNAP ABE9..ABEA ; XID_Continue # Mc [2] MEETEI MAYEK VOWEL SIGN CHEINAP..MEETEI MAYEK VOWEL SIGN NUNG ABEC ; XID_Continue # Mc MEETEI MAYEK LUM IYEK ABED ; XID_Continue # Mn MEETEI MAYEK APUN IYEK ABF0..ABF9 ; XID_Continue # Nd [10] MEETEI MAYEK DIGIT ZERO..MEETEI MAYEK DIGIT NINE AC00..D7A3 ; XID_Continue # Lo [11172] HANGUL SYLLABLE GA..HANGUL SYLLABLE HIH D7B0..D7C6 ; XID_Continue # Lo [23] HANGUL JUNGSEONG O-YEO..HANGUL JUNGSEONG ARAEA-E D7CB..D7FB ; XID_Continue # Lo [49] HANGUL JONGSEONG NIEUN-RIEUL..HANGUL JONGSEONG PHIEUPH-THIEUTH F900..FA6D ; XID_Continue # Lo [366] CJK COMPATIBILITY IDEOGRAPH-F900..CJK COMPATIBILITY 
IDEOGRAPH-FA6D FA70..FAD9 ; XID_Continue # Lo [106] CJK COMPATIBILITY IDEOGRAPH-FA70..CJK COMPATIBILITY IDEOGRAPH-FAD9 FB00..FB06 ; XID_Continue # L& [7] LATIN SMALL LIGATURE FF..LATIN SMALL LIGATURE ST FB13..FB17 ; XID_Continue # L& [5] ARMENIAN SMALL LIGATURE MEN NOW..ARMENIAN SMALL LIGATURE MEN XEH FB1D ; XID_Continue # Lo HEBREW LETTER YOD WITH HIRIQ FB1E ; XID_Continue # Mn HEBREW POINT JUDEO-SPANISH VARIKA FB1F..FB28 ; XID_Continue # Lo [10] HEBREW LIGATURE YIDDISH YOD YOD PATAH..HEBREW LETTER WIDE TAV FB2A..FB36 ; XID_Continue # Lo [13] HEBREW LETTER SHIN WITH SHIN DOT..HEBREW LETTER ZAYIN WITH DAGESH FB38..FB3C ; XID_Continue # Lo [5] HEBREW LETTER TET WITH DAGESH..HEBREW LETTER LAMED WITH DAGESH FB3E ; XID_Continue # Lo HEBREW LETTER MEM WITH DAGESH FB40..FB41 ; XID_Continue # Lo [2] HEBREW LETTER NUN WITH DAGESH..HEBREW LETTER SAMEKH WITH DAGESH FB43..FB44 ; XID_Continue # Lo [2] HEBREW LETTER FINAL PE WITH DAGESH..HEBREW LETTER PE WITH DAGESH FB46..FBB1 ; XID_Continue # Lo [108] HEBREW LETTER TSADI WITH DAGESH..ARABIC LETTER YEH BARREE WITH HAMZA ABOVE FINAL FORM FBD3..FC5D ; XID_Continue # Lo [139] ARABIC LETTER NG ISOLATED FORM..ARABIC LIGATURE ALEF MAKSURA WITH SUPERSCRIPT ALEF ISOLATED FORM FC64..FD3D ; XID_Continue # Lo [218] ARABIC LIGATURE YEH WITH HAMZA ABOVE WITH REH FINAL FORM..ARABIC LIGATURE ALEF WITH FATHATAN ISOLATED FORM FD50..FD8F ; XID_Continue # Lo [64] ARABIC LIGATURE TEH WITH JEEM WITH MEEM INITIAL FORM..ARABIC LIGATURE MEEM WITH KHAH WITH MEEM INITIAL FORM FD92..FDC7 ; XID_Continue # Lo [54] ARABIC LIGATURE MEEM WITH JEEM WITH KHAH INITIAL FORM..ARABIC LIGATURE NOON WITH JEEM WITH YEH FINAL FORM FDF0..FDF9 ; XID_Continue # Lo [10] ARABIC LIGATURE SALLA USED AS KORANIC STOP SIGN ISOLATED FORM..ARABIC LIGATURE SALLA ISOLATED FORM FE00..FE0F ; XID_Continue # Mn [16] VARIATION SELECTOR-1..VARIATION SELECTOR-16 FE20..FE2F ; XID_Continue # Mn [16] COMBINING LIGATURE LEFT HALF..COMBINING CYRILLIC TITLO RIGHT HALF FE33..FE34 ; XID_Continue # 
Pc [2] PRESENTATION FORM FOR VERTICAL LOW LINE..PRESENTATION FORM FOR VERTICAL WAVY LOW LINE FE4D..FE4F ; XID_Continue # Pc [3] DASHED LOW LINE..WAVY LOW LINE FE71 ; XID_Continue # Lo ARABIC TATWEEL WITH FATHATAN ABOVE FE73 ; XID_Continue # Lo ARABIC TAIL FRAGMENT FE77 ; XID_Continue # Lo ARABIC FATHA MEDIAL FORM FE79 ; XID_Continue # Lo ARABIC DAMMA MEDIAL FORM FE7B ; XID_Continue # Lo ARABIC KASRA MEDIAL FORM FE7D ; XID_Continue # Lo ARABIC SHADDA MEDIAL FORM FE7F..FEFC ; XID_Continue # Lo [126] ARABIC SUKUN MEDIAL FORM..ARABIC LIGATURE LAM WITH ALEF FINAL FORM FF10..FF19 ; XID_Continue # Nd [10] FULLWIDTH DIGIT ZERO..FULLWIDTH DIGIT NINE FF21..FF3A ; XID_Continue # L& [26] FULLWIDTH LATIN CAPITAL LETTER A..FULLWIDTH LATIN CAPITAL LETTER Z FF3F ; XID_Continue # Pc FULLWIDTH LOW LINE FF41..FF5A ; XID_Continue # L& [26] FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH LATIN SMALL LETTER Z FF65 ; XID_Continue # Po HALFWIDTH KATAKANA MIDDLE DOT FF66..FF6F ; XID_Continue # Lo [10] HALFWIDTH KATAKANA LETTER WO..HALFWIDTH KATAKANA LETTER SMALL TU FF70 ; XID_Continue # Lm HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK FF71..FF9D ; XID_Continue # Lo [45] HALFWIDTH KATAKANA LETTER A..HALFWIDTH KATAKANA LETTER N FF9E..FF9F ; XID_Continue # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK FFA0..FFBE ; XID_Continue # Lo [31] HALFWIDTH HANGUL FILLER..HALFWIDTH HANGUL LETTER HIEUH FFC2..FFC7 ; XID_Continue # Lo [6] HALFWIDTH HANGUL LETTER A..HALFWIDTH HANGUL LETTER E FFCA..FFCF ; XID_Continue # Lo [6] HALFWIDTH HANGUL LETTER YEO..HALFWIDTH HANGUL LETTER OE FFD2..FFD7 ; XID_Continue # Lo [6] HALFWIDTH HANGUL LETTER YO..HALFWIDTH HANGUL LETTER YU FFDA..FFDC ; XID_Continue # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL LETTER I 10000..1000B ; XID_Continue # Lo [12] LINEAR B SYLLABLE B008 A..LINEAR B SYLLABLE B046 JE 1000D..10026 ; XID_Continue # Lo [26] LINEAR B SYLLABLE B036 JO..LINEAR B SYLLABLE B032 QO 10028..1003A ; XID_Continue # Lo 
[19] LINEAR B SYLLABLE B060 RA..LINEAR B SYLLABLE B042 WO 1003C..1003D ; XID_Continue # Lo [2] LINEAR B SYLLABLE B017 ZA..LINEAR B SYLLABLE B074 ZE 1003F..1004D ; XID_Continue # Lo [15] LINEAR B SYLLABLE B020 ZO..LINEAR B SYLLABLE B091 TWO 10050..1005D ; XID_Continue # Lo [14] LINEAR B SYMBOL B018..LINEAR B SYMBOL B089 10080..100FA ; XID_Continue # Lo [123] LINEAR B IDEOGRAM B100 MAN..LINEAR B IDEOGRAM VESSEL B305 10140..10174 ; XID_Continue # Nl [53] GREEK ACROPHONIC ATTIC ONE QUARTER..GREEK ACROPHONIC STRATIAN FIFTY MNAS 101FD ; XID_Continue # Mn PHAISTOS DISC SIGN COMBINING OBLIQUE STROKE 10280..1029C ; XID_Continue # Lo [29] LYCIAN LETTER A..LYCIAN LETTER X 102A0..102D0 ; XID_Continue # Lo [49] CARIAN LETTER A..CARIAN LETTER UUU3 102E0 ; XID_Continue # Mn COPTIC EPACT THOUSANDS MARK 10300..1031F ; XID_Continue # Lo [32] OLD ITALIC LETTER A..OLD ITALIC LETTER ESS 1032D..10340 ; XID_Continue # Lo [20] OLD ITALIC LETTER YE..GOTHIC LETTER PAIRTHRA 10341 ; XID_Continue # Nl GOTHIC LETTER NINETY 10342..10349 ; XID_Continue # Lo [8] GOTHIC LETTER RAIDA..GOTHIC LETTER OTHAL 1034A ; XID_Continue # Nl GOTHIC LETTER NINE HUNDRED 10350..10375 ; XID_Continue # Lo [38] OLD PERMIC LETTER AN..OLD PERMIC LETTER IA 10376..1037A ; XID_Continue # Mn [5] COMBINING OLD PERMIC LETTER AN..COMBINING OLD PERMIC LETTER SII 10380..1039D ; XID_Continue # Lo [30] UGARITIC LETTER ALPA..UGARITIC LETTER SSU 103A0..103C3 ; XID_Continue # Lo [36] OLD PERSIAN SIGN A..OLD PERSIAN SIGN HA 103C8..103CF ; XID_Continue # Lo [8] OLD PERSIAN SIGN AURAMAZDAA..OLD PERSIAN SIGN BUUMISH 103D1..103D5 ; XID_Continue # Nl [5] OLD PERSIAN NUMBER ONE..OLD PERSIAN NUMBER HUNDRED 10400..1044F ; XID_Continue # L& [80] DESERET CAPITAL LETTER LONG I..DESERET SMALL LETTER EW 10450..1049D ; XID_Continue # Lo [78] SHAVIAN LETTER PEEP..OSMANYA LETTER OO 104A0..104A9 ; XID_Continue # Nd [10] OSMANYA DIGIT ZERO..OSMANYA DIGIT NINE 104B0..104D3 ; XID_Continue # L& [36] OSAGE CAPITAL LETTER A..OSAGE CAPITAL LETTER ZHA 
104D8..104FB ; XID_Continue # L& [36] OSAGE SMALL LETTER A..OSAGE SMALL LETTER ZHA 10500..10527 ; XID_Continue # Lo [40] ELBASAN LETTER A..ELBASAN LETTER KHE 10530..10563 ; XID_Continue # Lo [52] CAUCASIAN ALBANIAN LETTER ALT..CAUCASIAN ALBANIAN LETTER KIW 10570..1057A ; XID_Continue # L& [11] VITHKUQI CAPITAL LETTER A..VITHKUQI CAPITAL LETTER GA 1057C..1058A ; XID_Continue # L& [15] VITHKUQI CAPITAL LETTER HA..VITHKUQI CAPITAL LETTER RE 1058C..10592 ; XID_Continue # L& [7] VITHKUQI CAPITAL LETTER SE..VITHKUQI CAPITAL LETTER XE 10594..10595 ; XID_Continue # L& [2] VITHKUQI CAPITAL LETTER Y..VITHKUQI CAPITAL LETTER ZE 10597..105A1 ; XID_Continue # L& [11] VITHKUQI SMALL LETTER A..VITHKUQI SMALL LETTER GA 105A3..105B1 ; XID_Continue # L& [15] VITHKUQI SMALL LETTER HA..VITHKUQI SMALL LETTER RE 105B3..105B9 ; XID_Continue # L& [7] VITHKUQI SMALL LETTER SE..VITHKUQI SMALL LETTER XE 105BB..105BC ; XID_Continue # L& [2] VITHKUQI SMALL LETTER Y..VITHKUQI SMALL LETTER ZE 105C0..105F3 ; XID_Continue # Lo [52] TODHRI LETTER A..TODHRI LETTER OO 10600..10736 ; XID_Continue # Lo [311] LINEAR A SIGN AB001..LINEAR A SIGN A664 10740..10755 ; XID_Continue # Lo [22] LINEAR A SIGN A701 A..LINEAR A SIGN A732 JE 10760..10767 ; XID_Continue # Lo [8] LINEAR A SIGN A800..LINEAR A SIGN A807 10780..10785 ; XID_Continue # Lm [6] MODIFIER LETTER SMALL CAPITAL AA..MODIFIER LETTER SMALL B WITH HOOK 10787..107B0 ; XID_Continue # Lm [42] MODIFIER LETTER SMALL DZ DIGRAPH..MODIFIER LETTER SMALL V WITH RIGHT HOOK 107B2..107BA ; XID_Continue # Lm [9] MODIFIER LETTER SMALL CAPITAL Y..MODIFIER LETTER SMALL S WITH CURL 10800..10805 ; XID_Continue # Lo [6] CYPRIOT SYLLABLE A..CYPRIOT SYLLABLE JA 10808 ; XID_Continue # Lo CYPRIOT SYLLABLE JO 1080A..10835 ; XID_Continue # Lo [44] CYPRIOT SYLLABLE KA..CYPRIOT SYLLABLE WO 10837..10838 ; XID_Continue # Lo [2] CYPRIOT SYLLABLE XA..CYPRIOT SYLLABLE XE 1083C ; XID_Continue # Lo CYPRIOT SYLLABLE ZA 1083F..10855 ; XID_Continue # Lo [23] CYPRIOT SYLLABLE 
ZO..IMPERIAL ARAMAIC LETTER TAW 10860..10876 ; XID_Continue # Lo [23] PALMYRENE LETTER ALEPH..PALMYRENE LETTER TAW 10880..1089E ; XID_Continue # Lo [31] NABATAEAN LETTER FINAL ALEPH..NABATAEAN LETTER TAW 108E0..108F2 ; XID_Continue # Lo [19] HATRAN LETTER ALEPH..HATRAN LETTER QOPH 108F4..108F5 ; XID_Continue # Lo [2] HATRAN LETTER SHIN..HATRAN LETTER TAW 10900..10915 ; XID_Continue # Lo [22] PHOENICIAN LETTER ALF..PHOENICIAN LETTER TAU 10920..10939 ; XID_Continue # Lo [26] LYDIAN LETTER A..LYDIAN LETTER C 10940..10959 ; XID_Continue # Lo [26] SIDETIC LETTER N01..SIDETIC LETTER N26 10980..109B7 ; XID_Continue # Lo [56] MEROITIC HIEROGLYPHIC LETTER A..MEROITIC CURSIVE LETTER DA 109BE..109BF ; XID_Continue # Lo [2] MEROITIC CURSIVE LOGOGRAM RMT..MEROITIC CURSIVE LOGOGRAM IMN 10A00 ; XID_Continue # Lo KHAROSHTHI LETTER A 10A01..10A03 ; XID_Continue # Mn [3] KHAROSHTHI VOWEL SIGN I..KHAROSHTHI VOWEL SIGN VOCALIC R 10A05..10A06 ; XID_Continue # Mn [2] KHAROSHTHI VOWEL SIGN E..KHAROSHTHI VOWEL SIGN O 10A0C..10A0F ; XID_Continue # Mn [4] KHAROSHTHI VOWEL LENGTH MARK..KHAROSHTHI SIGN VISARGA 10A10..10A13 ; XID_Continue # Lo [4] KHAROSHTHI LETTER KA..KHAROSHTHI LETTER GHA 10A15..10A17 ; XID_Continue # Lo [3] KHAROSHTHI LETTER CA..KHAROSHTHI LETTER JA 10A19..10A35 ; XID_Continue # Lo [29] KHAROSHTHI LETTER NYA..KHAROSHTHI LETTER VHA 10A38..10A3A ; XID_Continue # Mn [3] KHAROSHTHI SIGN BAR ABOVE..KHAROSHTHI SIGN DOT BELOW 10A3F ; XID_Continue # Mn KHAROSHTHI VIRAMA 10A60..10A7C ; XID_Continue # Lo [29] OLD SOUTH ARABIAN LETTER HE..OLD SOUTH ARABIAN LETTER THETH 10A80..10A9C ; XID_Continue # Lo [29] OLD NORTH ARABIAN LETTER HEH..OLD NORTH ARABIAN LETTER ZAH 10AC0..10AC7 ; XID_Continue # Lo [8] MANICHAEAN LETTER ALEPH..MANICHAEAN LETTER WAW 10AC9..10AE4 ; XID_Continue # Lo [28] MANICHAEAN LETTER ZAYIN..MANICHAEAN LETTER TAW 10AE5..10AE6 ; XID_Continue # Mn [2] MANICHAEAN ABBREVIATION MARK ABOVE..MANICHAEAN ABBREVIATION MARK BELOW 10B00..10B35 ; XID_Continue # Lo [54] AVESTAN 
LETTER A..AVESTAN LETTER HE 10B40..10B55 ; XID_Continue # Lo [22] INSCRIPTIONAL PARTHIAN LETTER ALEPH..INSCRIPTIONAL PARTHIAN LETTER TAW 10B60..10B72 ; XID_Continue # Lo [19] INSCRIPTIONAL PAHLAVI LETTER ALEPH..INSCRIPTIONAL PAHLAVI LETTER TAW 10B80..10B91 ; XID_Continue # Lo [18] PSALTER PAHLAVI LETTER ALEPH..PSALTER PAHLAVI LETTER TAW 10C00..10C48 ; XID_Continue # Lo [73] OLD TURKIC LETTER ORKHON A..OLD TURKIC LETTER ORKHON BASH 10C80..10CB2 ; XID_Continue # L& [51] OLD HUNGARIAN CAPITAL LETTER A..OLD HUNGARIAN CAPITAL LETTER US 10CC0..10CF2 ; XID_Continue # L& [51] OLD HUNGARIAN SMALL LETTER A..OLD HUNGARIAN SMALL LETTER US 10D00..10D23 ; XID_Continue # Lo [36] HANIFI ROHINGYA LETTER A..HANIFI ROHINGYA MARK NA KHONNA 10D24..10D27 ; XID_Continue # Mn [4] HANIFI ROHINGYA SIGN HARBAHAY..HANIFI ROHINGYA SIGN TASSI 10D30..10D39 ; XID_Continue # Nd [10] HANIFI ROHINGYA DIGIT ZERO..HANIFI ROHINGYA DIGIT NINE 10D40..10D49 ; XID_Continue # Nd [10] GARAY DIGIT ZERO..GARAY DIGIT NINE 10D4A..10D4D ; XID_Continue # Lo [4] GARAY VOWEL SIGN A..GARAY VOWEL SIGN EE 10D4E ; XID_Continue # Lm GARAY VOWEL LENGTH MARK 10D4F ; XID_Continue # Lo GARAY SUKUN 10D50..10D65 ; XID_Continue # L& [22] GARAY CAPITAL LETTER A..GARAY CAPITAL LETTER OLD NA 10D69..10D6D ; XID_Continue # Mn [5] GARAY VOWEL SIGN E..GARAY CONSONANT NASALIZATION MARK 10D6F ; XID_Continue # Lm GARAY REDUPLICATION MARK 10D70..10D85 ; XID_Continue # L& [22] GARAY SMALL LETTER A..GARAY SMALL LETTER OLD NA 10E80..10EA9 ; XID_Continue # Lo [42] YEZIDI LETTER ELIF..YEZIDI LETTER ET 10EAB..10EAC ; XID_Continue # Mn [2] YEZIDI COMBINING HAMZA MARK..YEZIDI COMBINING MADDA MARK 10EB0..10EB1 ; XID_Continue # Lo [2] YEZIDI LETTER LAM WITH DOT ABOVE..YEZIDI LETTER YOT WITH CIRCUMFLEX ABOVE 10EC2..10EC4 ; XID_Continue # Lo [3] ARABIC LETTER DAL WITH TWO DOTS VERTICALLY BELOW..ARABIC LETTER KAF WITH TWO DOTS VERTICALLY BELOW 10EC5 ; XID_Continue # Lm ARABIC SMALL YEH BARREE WITH TWO DOTS BELOW 10EC6..10EC7 ; XID_Continue # Lo [2] 
ARABIC LETTER THIN NOON..ARABIC LETTER YEH WITH FOUR DOTS BELOW 10EFA..10EFF ; XID_Continue # Mn [6] ARABIC DOUBLE VERTICAL BAR BELOW..ARABIC SMALL LOW WORD MADDA 10F00..10F1C ; XID_Continue # Lo [29] OLD SOGDIAN LETTER ALEPH..OLD SOGDIAN LETTER FINAL TAW WITH VERTICAL TAIL 10F27 ; XID_Continue # Lo OLD SOGDIAN LIGATURE AYIN-DALETH 10F30..10F45 ; XID_Continue # Lo [22] SOGDIAN LETTER ALEPH..SOGDIAN INDEPENDENT SHIN 10F46..10F50 ; XID_Continue # Mn [11] SOGDIAN COMBINING DOT BELOW..SOGDIAN COMBINING STROKE BELOW 10F70..10F81 ; XID_Continue # Lo [18] OLD UYGHUR LETTER ALEPH..OLD UYGHUR LETTER LESH 10F82..10F85 ; XID_Continue # Mn [4] OLD UYGHUR COMBINING DOT ABOVE..OLD UYGHUR COMBINING TWO DOTS BELOW 10FB0..10FC4 ; XID_Continue # Lo [21] CHORASMIAN LETTER ALEPH..CHORASMIAN LETTER TAW 10FE0..10FF6 ; XID_Continue # Lo [23] ELYMAIC LETTER ALEPH..ELYMAIC LIGATURE ZAYIN-YODH 11000 ; XID_Continue # Mc BRAHMI SIGN CANDRABINDU 11001 ; XID_Continue # Mn BRAHMI SIGN ANUSVARA 11002 ; XID_Continue # Mc BRAHMI SIGN VISARGA 11003..11037 ; XID_Continue # Lo [53] BRAHMI SIGN JIHVAMULIYA..BRAHMI LETTER OLD TAMIL NNNA 11038..11046 ; XID_Continue # Mn [15] BRAHMI VOWEL SIGN AA..BRAHMI VIRAMA 11066..1106F ; XID_Continue # Nd [10] BRAHMI DIGIT ZERO..BRAHMI DIGIT NINE 11070 ; XID_Continue # Mn BRAHMI SIGN OLD TAMIL VIRAMA 11071..11072 ; XID_Continue # Lo [2] BRAHMI LETTER OLD TAMIL SHORT E..BRAHMI LETTER OLD TAMIL SHORT O 11073..11074 ; XID_Continue # Mn [2] BRAHMI VOWEL SIGN OLD TAMIL SHORT E..BRAHMI VOWEL SIGN OLD TAMIL SHORT O 11075 ; XID_Continue # Lo BRAHMI LETTER OLD TAMIL LLA 1107F..11081 ; XID_Continue # Mn [3] BRAHMI NUMBER JOINER..KAITHI SIGN ANUSVARA 11082 ; XID_Continue # Mc KAITHI SIGN VISARGA 11083..110AF ; XID_Continue # Lo [45] KAITHI LETTER A..KAITHI LETTER HA 110B0..110B2 ; XID_Continue # Mc [3] KAITHI VOWEL SIGN AA..KAITHI VOWEL SIGN II 110B3..110B6 ; XID_Continue # Mn [4] KAITHI VOWEL SIGN U..KAITHI VOWEL SIGN AI 110B7..110B8 ; XID_Continue # Mc [2] KAITHI VOWEL SIGN 
O..KAITHI VOWEL SIGN AU 110B9..110BA ; XID_Continue # Mn [2] KAITHI SIGN VIRAMA..KAITHI SIGN NUKTA 110C2 ; XID_Continue # Mn KAITHI VOWEL SIGN VOCALIC R 110D0..110E8 ; XID_Continue # Lo [25] SORA SOMPENG LETTER SAH..SORA SOMPENG LETTER MAE 110F0..110F9 ; XID_Continue # Nd [10] SORA SOMPENG DIGIT ZERO..SORA SOMPENG DIGIT NINE 11100..11102 ; XID_Continue # Mn [3] CHAKMA SIGN CANDRABINDU..CHAKMA SIGN VISARGA 11103..11126 ; XID_Continue # Lo [36] CHAKMA LETTER AA..CHAKMA LETTER HAA 11127..1112B ; XID_Continue # Mn [5] CHAKMA VOWEL SIGN A..CHAKMA VOWEL SIGN UU 1112C ; XID_Continue # Mc CHAKMA VOWEL SIGN E 1112D..11134 ; XID_Continue # Mn [8] CHAKMA VOWEL SIGN AI..CHAKMA MAAYYAA 11136..1113F ; XID_Continue # Nd [10] CHAKMA DIGIT ZERO..CHAKMA DIGIT NINE 11144 ; XID_Continue # Lo CHAKMA LETTER LHAA 11145..11146 ; XID_Continue # Mc [2] CHAKMA VOWEL SIGN AA..CHAKMA VOWEL SIGN EI 11147 ; XID_Continue # Lo CHAKMA LETTER VAA 11150..11172 ; XID_Continue # Lo [35] MAHAJANI LETTER A..MAHAJANI LETTER RRA 11173 ; XID_Continue # Mn MAHAJANI SIGN NUKTA 11176 ; XID_Continue # Lo MAHAJANI LIGATURE SHRI 11180..11181 ; XID_Continue # Mn [2] SHARADA SIGN CANDRABINDU..SHARADA SIGN ANUSVARA 11182 ; XID_Continue # Mc SHARADA SIGN VISARGA 11183..111B2 ; XID_Continue # Lo [48] SHARADA LETTER A..SHARADA LETTER HA 111B3..111B5 ; XID_Continue # Mc [3] SHARADA VOWEL SIGN AA..SHARADA VOWEL SIGN II 111B6..111BE ; XID_Continue # Mn [9] SHARADA VOWEL SIGN U..SHARADA VOWEL SIGN O 111BF..111C0 ; XID_Continue # Mc [2] SHARADA VOWEL SIGN AU..SHARADA SIGN VIRAMA 111C1..111C4 ; XID_Continue # Lo [4] SHARADA SIGN AVAGRAHA..SHARADA OM 111C9..111CC ; XID_Continue # Mn [4] SHARADA SANDHI MARK..SHARADA EXTRA SHORT VOWEL MARK 111CE ; XID_Continue # Mc SHARADA VOWEL SIGN PRISHTHAMATRA E 111CF ; XID_Continue # Mn SHARADA SIGN INVERTED CANDRABINDU 111D0..111D9 ; XID_Continue # Nd [10] SHARADA DIGIT ZERO..SHARADA DIGIT NINE 111DA ; XID_Continue # Lo SHARADA EKAM 111DC ; XID_Continue # Lo SHARADA HEADSTROKE 
11200..11211 ; XID_Continue # Lo [18] KHOJKI LETTER A..KHOJKI LETTER JJA 11213..1122B ; XID_Continue # Lo [25] KHOJKI LETTER NYA..KHOJKI LETTER LLA 1122C..1122E ; XID_Continue # Mc [3] KHOJKI VOWEL SIGN AA..KHOJKI VOWEL SIGN II 1122F..11231 ; XID_Continue # Mn [3] KHOJKI VOWEL SIGN U..KHOJKI VOWEL SIGN AI 11232..11233 ; XID_Continue # Mc [2] KHOJKI VOWEL SIGN O..KHOJKI VOWEL SIGN AU 11234 ; XID_Continue # Mn KHOJKI SIGN ANUSVARA 11235 ; XID_Continue # Mc KHOJKI SIGN VIRAMA 11236..11237 ; XID_Continue # Mn [2] KHOJKI SIGN NUKTA..KHOJKI SIGN SHADDA 1123E ; XID_Continue # Mn KHOJKI SIGN SUKUN 1123F..11240 ; XID_Continue # Lo [2] KHOJKI LETTER QA..KHOJKI LETTER SHORT I 11241 ; XID_Continue # Mn KHOJKI VOWEL SIGN VOCALIC R 11280..11286 ; XID_Continue # Lo [7] MULTANI LETTER A..MULTANI LETTER GA 11288 ; XID_Continue # Lo MULTANI LETTER GHA 1128A..1128D ; XID_Continue # Lo [4] MULTANI LETTER CA..MULTANI LETTER JJA 1128F..1129D ; XID_Continue # Lo [15] MULTANI LETTER NYA..MULTANI LETTER BA 1129F..112A8 ; XID_Continue # Lo [10] MULTANI LETTER BHA..MULTANI LETTER RHA 112B0..112DE ; XID_Continue # Lo [47] KHUDAWADI LETTER A..KHUDAWADI LETTER HA 112DF ; XID_Continue # Mn KHUDAWADI SIGN ANUSVARA 112E0..112E2 ; XID_Continue # Mc [3] KHUDAWADI VOWEL SIGN AA..KHUDAWADI VOWEL SIGN II 112E3..112EA ; XID_Continue # Mn [8] KHUDAWADI VOWEL SIGN U..KHUDAWADI SIGN VIRAMA 112F0..112F9 ; XID_Continue # Nd [10] KHUDAWADI DIGIT ZERO..KHUDAWADI DIGIT NINE 11300..11301 ; XID_Continue # Mn [2] GRANTHA SIGN COMBINING ANUSVARA ABOVE..GRANTHA SIGN CANDRABINDU 11302..11303 ; XID_Continue # Mc [2] GRANTHA SIGN ANUSVARA..GRANTHA SIGN VISARGA 11305..1130C ; XID_Continue # Lo [8] GRANTHA LETTER A..GRANTHA LETTER VOCALIC L 1130F..11310 ; XID_Continue # Lo [2] GRANTHA LETTER EE..GRANTHA LETTER AI 11313..11328 ; XID_Continue # Lo [22] GRANTHA LETTER OO..GRANTHA LETTER NA 1132A..11330 ; XID_Continue # Lo [7] GRANTHA LETTER PA..GRANTHA LETTER RA 11332..11333 ; XID_Continue # Lo [2] GRANTHA LETTER 
LA..GRANTHA LETTER LLA 11335..11339 ; XID_Continue # Lo [5] GRANTHA LETTER VA..GRANTHA LETTER HA 1133B..1133C ; XID_Continue # Mn [2] COMBINING BINDU BELOW..GRANTHA SIGN NUKTA 1133D ; XID_Continue # Lo GRANTHA SIGN AVAGRAHA 1133E..1133F ; XID_Continue # Mc [2] GRANTHA VOWEL SIGN AA..GRANTHA VOWEL SIGN I 11340 ; XID_Continue # Mn GRANTHA VOWEL SIGN II 11341..11344 ; XID_Continue # Mc [4] GRANTHA VOWEL SIGN U..GRANTHA VOWEL SIGN VOCALIC RR 11347..11348 ; XID_Continue # Mc [2] GRANTHA VOWEL SIGN EE..GRANTHA VOWEL SIGN AI 1134B..1134D ; XID_Continue # Mc [3] GRANTHA VOWEL SIGN OO..GRANTHA SIGN VIRAMA 11350 ; XID_Continue # Lo GRANTHA OM 11357 ; XID_Continue # Mc GRANTHA AU LENGTH MARK 1135D..11361 ; XID_Continue # Lo [5] GRANTHA SIGN PLUTA..GRANTHA LETTER VOCALIC LL 11362..11363 ; XID_Continue # Mc [2] GRANTHA VOWEL SIGN VOCALIC L..GRANTHA VOWEL SIGN VOCALIC LL 11366..1136C ; XID_Continue # Mn [7] COMBINING GRANTHA DIGIT ZERO..COMBINING GRANTHA DIGIT SIX 11370..11374 ; XID_Continue # Mn [5] COMBINING GRANTHA LETTER A..COMBINING GRANTHA LETTER PA 11380..11389 ; XID_Continue # Lo [10] TULU-TIGALARI LETTER A..TULU-TIGALARI LETTER VOCALIC LL 1138B ; XID_Continue # Lo TULU-TIGALARI LETTER EE 1138E ; XID_Continue # Lo TULU-TIGALARI LETTER AI 11390..113B5 ; XID_Continue # Lo [38] TULU-TIGALARI LETTER OO..TULU-TIGALARI LETTER LLLA 113B7 ; XID_Continue # Lo TULU-TIGALARI SIGN AVAGRAHA 113B8..113BA ; XID_Continue # Mc [3] TULU-TIGALARI VOWEL SIGN AA..TULU-TIGALARI VOWEL SIGN II 113BB..113C0 ; XID_Continue # Mn [6] TULU-TIGALARI VOWEL SIGN U..TULU-TIGALARI VOWEL SIGN VOCALIC LL 113C2 ; XID_Continue # Mc TULU-TIGALARI VOWEL SIGN EE 113C5 ; XID_Continue # Mc TULU-TIGALARI VOWEL SIGN AI 113C7..113CA ; XID_Continue # Mc [4] TULU-TIGALARI VOWEL SIGN OO..TULU-TIGALARI SIGN CANDRA ANUNASIKA 113CC..113CD ; XID_Continue # Mc [2] TULU-TIGALARI SIGN ANUSVARA..TULU-TIGALARI SIGN VISARGA 113CE ; XID_Continue # Mn TULU-TIGALARI SIGN VIRAMA 113CF ; XID_Continue # Mc TULU-TIGALARI SIGN LOOPED 
VIRAMA 113D0 ; XID_Continue # Mn TULU-TIGALARI CONJOINER 113D1 ; XID_Continue # Lo TULU-TIGALARI REPHA 113D2 ; XID_Continue # Mn TULU-TIGALARI GEMINATION MARK 113D3 ; XID_Continue # Lo TULU-TIGALARI SIGN PLUTA 113E1..113E2 ; XID_Continue # Mn [2] TULU-TIGALARI VEDIC TONE SVARITA..TULU-TIGALARI VEDIC TONE ANUDATTA 11400..11434 ; XID_Continue # Lo [53] NEWA LETTER A..NEWA LETTER HA 11435..11437 ; XID_Continue # Mc [3] NEWA VOWEL SIGN AA..NEWA VOWEL SIGN II 11438..1143F ; XID_Continue # Mn [8] NEWA VOWEL SIGN U..NEWA VOWEL SIGN AI 11440..11441 ; XID_Continue # Mc [2] NEWA VOWEL SIGN O..NEWA VOWEL SIGN AU 11442..11444 ; XID_Continue # Mn [3] NEWA SIGN VIRAMA..NEWA SIGN ANUSVARA 11445 ; XID_Continue # Mc NEWA SIGN VISARGA 11446 ; XID_Continue # Mn NEWA SIGN NUKTA 11447..1144A ; XID_Continue # Lo [4] NEWA SIGN AVAGRAHA..NEWA SIDDHI 11450..11459 ; XID_Continue # Nd [10] NEWA DIGIT ZERO..NEWA DIGIT NINE 1145E ; XID_Continue # Mn NEWA SANDHI MARK 1145F..11461 ; XID_Continue # Lo [3] NEWA LETTER VEDIC ANUSVARA..NEWA SIGN UPADHMANIYA 11480..114AF ; XID_Continue # Lo [48] TIRHUTA ANJI..TIRHUTA LETTER HA 114B0..114B2 ; XID_Continue # Mc [3] TIRHUTA VOWEL SIGN AA..TIRHUTA VOWEL SIGN II 114B3..114B8 ; XID_Continue # Mn [6] TIRHUTA VOWEL SIGN U..TIRHUTA VOWEL SIGN VOCALIC LL 114B9 ; XID_Continue # Mc TIRHUTA VOWEL SIGN E 114BA ; XID_Continue # Mn TIRHUTA VOWEL SIGN SHORT E 114BB..114BE ; XID_Continue # Mc [4] TIRHUTA VOWEL SIGN AI..TIRHUTA VOWEL SIGN AU 114BF..114C0 ; XID_Continue # Mn [2] TIRHUTA SIGN CANDRABINDU..TIRHUTA SIGN ANUSVARA 114C1 ; XID_Continue # Mc TIRHUTA SIGN VISARGA 114C2..114C3 ; XID_Continue # Mn [2] TIRHUTA SIGN VIRAMA..TIRHUTA SIGN NUKTA 114C4..114C5 ; XID_Continue # Lo [2] TIRHUTA SIGN AVAGRAHA..TIRHUTA GVANG 114C7 ; XID_Continue # Lo TIRHUTA OM 114D0..114D9 ; XID_Continue # Nd [10] TIRHUTA DIGIT ZERO..TIRHUTA DIGIT NINE 11580..115AE ; XID_Continue # Lo [47] SIDDHAM LETTER A..SIDDHAM LETTER HA 115AF..115B1 ; XID_Continue # Mc [3] SIDDHAM VOWEL SIGN 
AA..SIDDHAM VOWEL SIGN II 115B2..115B5 ; XID_Continue # Mn [4] SIDDHAM VOWEL SIGN U..SIDDHAM VOWEL SIGN VOCALIC RR 115B8..115BB ; XID_Continue # Mc [4] SIDDHAM VOWEL SIGN E..SIDDHAM VOWEL SIGN AU 115BC..115BD ; XID_Continue # Mn [2] SIDDHAM SIGN CANDRABINDU..SIDDHAM SIGN ANUSVARA 115BE ; XID_Continue # Mc SIDDHAM SIGN VISARGA 115BF..115C0 ; XID_Continue # Mn [2] SIDDHAM SIGN VIRAMA..SIDDHAM SIGN NUKTA 115D8..115DB ; XID_Continue # Lo [4] SIDDHAM LETTER THREE-CIRCLE ALTERNATE I..SIDDHAM LETTER ALTERNATE U 115DC..115DD ; XID_Continue # Mn [2] SIDDHAM VOWEL SIGN ALTERNATE U..SIDDHAM VOWEL SIGN ALTERNATE UU 11600..1162F ; XID_Continue # Lo [48] MODI LETTER A..MODI LETTER LLA 11630..11632 ; XID_Continue # Mc [3] MODI VOWEL SIGN AA..MODI VOWEL SIGN II 11633..1163A ; XID_Continue # Mn [8] MODI VOWEL SIGN U..MODI VOWEL SIGN AI 1163B..1163C ; XID_Continue # Mc [2] MODI VOWEL SIGN O..MODI VOWEL SIGN AU 1163D ; XID_Continue # Mn MODI SIGN ANUSVARA 1163E ; XID_Continue # Mc MODI SIGN VISARGA 1163F..11640 ; XID_Continue # Mn [2] MODI SIGN VIRAMA..MODI SIGN ARDHACANDRA 11644 ; XID_Continue # Lo MODI SIGN HUVA 11650..11659 ; XID_Continue # Nd [10] MODI DIGIT ZERO..MODI DIGIT NINE 11680..116AA ; XID_Continue # Lo [43] TAKRI LETTER A..TAKRI LETTER RRA 116AB ; XID_Continue # Mn TAKRI SIGN ANUSVARA 116AC ; XID_Continue # Mc TAKRI SIGN VISARGA 116AD ; XID_Continue # Mn TAKRI VOWEL SIGN AA 116AE..116AF ; XID_Continue # Mc [2] TAKRI VOWEL SIGN I..TAKRI VOWEL SIGN II 116B0..116B5 ; XID_Continue # Mn [6] TAKRI VOWEL SIGN U..TAKRI VOWEL SIGN AU 116B6 ; XID_Continue # Mc TAKRI SIGN VIRAMA 116B7 ; XID_Continue # Mn TAKRI SIGN NUKTA 116B8 ; XID_Continue # Lo TAKRI LETTER ARCHAIC KHA 116C0..116C9 ; XID_Continue # Nd [10] TAKRI DIGIT ZERO..TAKRI DIGIT NINE 116D0..116E3 ; XID_Continue # Nd [20] MYANMAR PAO DIGIT ZERO..MYANMAR EASTERN PWO KAREN DIGIT NINE 11700..1171A ; XID_Continue # Lo [27] AHOM LETTER KA..AHOM LETTER ALTERNATE BA 1171D ; XID_Continue # Mn AHOM CONSONANT SIGN MEDIAL LA 1171E ; 
XID_Continue # Mc AHOM CONSONANT SIGN MEDIAL RA 1171F ; XID_Continue # Mn AHOM CONSONANT SIGN MEDIAL LIGATING RA 11720..11721 ; XID_Continue # Mc [2] AHOM VOWEL SIGN A..AHOM VOWEL SIGN AA 11722..11725 ; XID_Continue # Mn [4] AHOM VOWEL SIGN I..AHOM VOWEL SIGN UU 11726 ; XID_Continue # Mc AHOM VOWEL SIGN E 11727..1172B ; XID_Continue # Mn [5] AHOM VOWEL SIGN AW..AHOM SIGN KILLER 11730..11739 ; XID_Continue # Nd [10] AHOM DIGIT ZERO..AHOM DIGIT NINE 11740..11746 ; XID_Continue # Lo [7] AHOM LETTER CA..AHOM LETTER LLA 11800..1182B ; XID_Continue # Lo [44] DOGRA LETTER A..DOGRA LETTER RRA 1182C..1182E ; XID_Continue # Mc [3] DOGRA VOWEL SIGN AA..DOGRA VOWEL SIGN II 1182F..11837 ; XID_Continue # Mn [9] DOGRA VOWEL SIGN U..DOGRA SIGN ANUSVARA 11838 ; XID_Continue # Mc DOGRA SIGN VISARGA 11839..1183A ; XID_Continue # Mn [2] DOGRA SIGN VIRAMA..DOGRA SIGN NUKTA 118A0..118DF ; XID_Continue # L& [64] WARANG CITI CAPITAL LETTER NGAA..WARANG CITI SMALL LETTER VIYO 118E0..118E9 ; XID_Continue # Nd [10] WARANG CITI DIGIT ZERO..WARANG CITI DIGIT NINE 118FF..11906 ; XID_Continue # Lo [8] WARANG CITI OM..DIVES AKURU LETTER E 11909 ; XID_Continue # Lo DIVES AKURU LETTER O 1190C..11913 ; XID_Continue # Lo [8] DIVES AKURU LETTER KA..DIVES AKURU LETTER JA 11915..11916 ; XID_Continue # Lo [2] DIVES AKURU LETTER NYA..DIVES AKURU LETTER TTA 11918..1192F ; XID_Continue # Lo [24] DIVES AKURU LETTER DDA..DIVES AKURU LETTER ZA 11930..11935 ; XID_Continue # Mc [6] DIVES AKURU VOWEL SIGN AA..DIVES AKURU VOWEL SIGN E 11937..11938 ; XID_Continue # Mc [2] DIVES AKURU VOWEL SIGN AI..DIVES AKURU VOWEL SIGN O 1193B..1193C ; XID_Continue # Mn [2] DIVES AKURU SIGN ANUSVARA..DIVES AKURU SIGN CANDRABINDU 1193D ; XID_Continue # Mc DIVES AKURU SIGN HALANTA 1193E ; XID_Continue # Mn DIVES AKURU VIRAMA 1193F ; XID_Continue # Lo DIVES AKURU PREFIXED NASAL SIGN 11940 ; XID_Continue # Mc DIVES AKURU MEDIAL YA 11941 ; XID_Continue # Lo DIVES AKURU INITIAL RA 11942 ; XID_Continue # Mc DIVES AKURU MEDIAL RA 11943 ; 
XID_Continue # Mn DIVES AKURU SIGN NUKTA 11950..11959 ; XID_Continue # Nd [10] DIVES AKURU DIGIT ZERO..DIVES AKURU DIGIT NINE 119A0..119A7 ; XID_Continue # Lo [8] NANDINAGARI LETTER A..NANDINAGARI LETTER VOCALIC RR 119AA..119D0 ; XID_Continue # Lo [39] NANDINAGARI LETTER E..NANDINAGARI LETTER RRA 119D1..119D3 ; XID_Continue # Mc [3] NANDINAGARI VOWEL SIGN AA..NANDINAGARI VOWEL SIGN II 119D4..119D7 ; XID_Continue # Mn [4] NANDINAGARI VOWEL SIGN U..NANDINAGARI VOWEL SIGN VOCALIC RR 119DA..119DB ; XID_Continue # Mn [2] NANDINAGARI VOWEL SIGN E..NANDINAGARI VOWEL SIGN AI 119DC..119DF ; XID_Continue # Mc [4] NANDINAGARI VOWEL SIGN O..NANDINAGARI SIGN VISARGA 119E0 ; XID_Continue # Mn NANDINAGARI SIGN VIRAMA 119E1 ; XID_Continue # Lo NANDINAGARI SIGN AVAGRAHA 119E3 ; XID_Continue # Lo NANDINAGARI HEADSTROKE 119E4 ; XID_Continue # Mc NANDINAGARI VOWEL SIGN PRISHTHAMATRA E 11A00 ; XID_Continue # Lo ZANABAZAR SQUARE LETTER A 11A01..11A0A ; XID_Continue # Mn [10] ZANABAZAR SQUARE VOWEL SIGN I..ZANABAZAR SQUARE VOWEL LENGTH MARK 11A0B..11A32 ; XID_Continue # Lo [40] ZANABAZAR SQUARE LETTER KA..ZANABAZAR SQUARE LETTER KSSA 11A33..11A38 ; XID_Continue # Mn [6] ZANABAZAR SQUARE FINAL CONSONANT MARK..ZANABAZAR SQUARE SIGN ANUSVARA 11A39 ; XID_Continue # Mc ZANABAZAR SQUARE SIGN VISARGA 11A3A ; XID_Continue # Lo ZANABAZAR SQUARE CLUSTER-INITIAL LETTER RA 11A3B..11A3E ; XID_Continue # Mn [4] ZANABAZAR SQUARE CLUSTER-FINAL LETTER YA..ZANABAZAR SQUARE CLUSTER-FINAL LETTER VA 11A47 ; XID_Continue # Mn ZANABAZAR SQUARE SUBJOINER 11A50 ; XID_Continue # Lo SOYOMBO LETTER A 11A51..11A56 ; XID_Continue # Mn [6] SOYOMBO VOWEL SIGN I..SOYOMBO VOWEL SIGN OE 11A57..11A58 ; XID_Continue # Mc [2] SOYOMBO VOWEL SIGN AI..SOYOMBO VOWEL SIGN AU 11A59..11A5B ; XID_Continue # Mn [3] SOYOMBO VOWEL SIGN VOCALIC R..SOYOMBO VOWEL LENGTH MARK 11A5C..11A89 ; XID_Continue # Lo [46] SOYOMBO LETTER KA..SOYOMBO CLUSTER-INITIAL LETTER SA 11A8A..11A96 ; XID_Continue # Mn [13] SOYOMBO FINAL CONSONANT SIGN 
G..SOYOMBO SIGN ANUSVARA 11A97 ; XID_Continue # Mc SOYOMBO SIGN VISARGA 11A98..11A99 ; XID_Continue # Mn [2] SOYOMBO GEMINATION MARK..SOYOMBO SUBJOINER 11A9D ; XID_Continue # Lo SOYOMBO MARK PLUTA 11AB0..11AF8 ; XID_Continue # Lo [73] CANADIAN SYLLABICS NATTILIK HI..PAU CIN HAU GLOTTAL STOP FINAL 11B60 ; XID_Continue # Mn SHARADA VOWEL SIGN OE 11B61 ; XID_Continue # Mc SHARADA VOWEL SIGN OOE 11B62..11B64 ; XID_Continue # Mn [3] SHARADA VOWEL SIGN UE..SHARADA VOWEL SIGN SHORT E 11B65 ; XID_Continue # Mc SHARADA VOWEL SIGN SHORT O 11B66 ; XID_Continue # Mn SHARADA VOWEL SIGN CANDRA E 11B67 ; XID_Continue # Mc SHARADA VOWEL SIGN CANDRA O 11BC0..11BE0 ; XID_Continue # Lo [33] SUNUWAR LETTER DEVI..SUNUWAR LETTER KLOKO 11BF0..11BF9 ; XID_Continue # Nd [10] SUNUWAR DIGIT ZERO..SUNUWAR DIGIT NINE 11C00..11C08 ; XID_Continue # Lo [9] BHAIKSUKI LETTER A..BHAIKSUKI LETTER VOCALIC L 11C0A..11C2E ; XID_Continue # Lo [37] BHAIKSUKI LETTER E..BHAIKSUKI LETTER HA 11C2F ; XID_Continue # Mc BHAIKSUKI VOWEL SIGN AA 11C30..11C36 ; XID_Continue # Mn [7] BHAIKSUKI VOWEL SIGN I..BHAIKSUKI VOWEL SIGN VOCALIC L 11C38..11C3D ; XID_Continue # Mn [6] BHAIKSUKI VOWEL SIGN E..BHAIKSUKI SIGN ANUSVARA 11C3E ; XID_Continue # Mc BHAIKSUKI SIGN VISARGA 11C3F ; XID_Continue # Mn BHAIKSUKI SIGN VIRAMA 11C40 ; XID_Continue # Lo BHAIKSUKI SIGN AVAGRAHA 11C50..11C59 ; XID_Continue # Nd [10] BHAIKSUKI DIGIT ZERO..BHAIKSUKI DIGIT NINE 11C72..11C8F ; XID_Continue # Lo [30] MARCHEN LETTER KA..MARCHEN LETTER A 11C92..11CA7 ; XID_Continue # Mn [22] MARCHEN SUBJOINED LETTER KA..MARCHEN SUBJOINED LETTER ZA 11CA9 ; XID_Continue # Mc MARCHEN SUBJOINED LETTER YA 11CAA..11CB0 ; XID_Continue # Mn [7] MARCHEN SUBJOINED LETTER RA..MARCHEN VOWEL SIGN AA 11CB1 ; XID_Continue # Mc MARCHEN VOWEL SIGN I 11CB2..11CB3 ; XID_Continue # Mn [2] MARCHEN VOWEL SIGN U..MARCHEN VOWEL SIGN E 11CB4 ; XID_Continue # Mc MARCHEN VOWEL SIGN O 11CB5..11CB6 ; XID_Continue # Mn [2] MARCHEN SIGN ANUSVARA..MARCHEN SIGN CANDRABINDU 11D00..11D06 
; XID_Continue # Lo [7] MASARAM GONDI LETTER A..MASARAM GONDI LETTER E 11D08..11D09 ; XID_Continue # Lo [2] MASARAM GONDI LETTER AI..MASARAM GONDI LETTER O 11D0B..11D30 ; XID_Continue # Lo [38] MASARAM GONDI LETTER AU..MASARAM GONDI LETTER TRA 11D31..11D36 ; XID_Continue # Mn [6] MASARAM GONDI VOWEL SIGN AA..MASARAM GONDI VOWEL SIGN VOCALIC R 11D3A ; XID_Continue # Mn MASARAM GONDI VOWEL SIGN E 11D3C..11D3D ; XID_Continue # Mn [2] MASARAM GONDI VOWEL SIGN AI..MASARAM GONDI VOWEL SIGN O 11D3F..11D45 ; XID_Continue # Mn [7] MASARAM GONDI VOWEL SIGN AU..MASARAM GONDI VIRAMA 11D46 ; XID_Continue # Lo MASARAM GONDI REPHA 11D47 ; XID_Continue # Mn MASARAM GONDI RA-KARA 11D50..11D59 ; XID_Continue # Nd [10] MASARAM GONDI DIGIT ZERO..MASARAM GONDI DIGIT NINE 11D60..11D65 ; XID_Continue # Lo [6] GUNJALA GONDI LETTER A..GUNJALA GONDI LETTER UU 11D67..11D68 ; XID_Continue # Lo [2] GUNJALA GONDI LETTER EE..GUNJALA GONDI LETTER AI 11D6A..11D89 ; XID_Continue # Lo [32] GUNJALA GONDI LETTER OO..GUNJALA GONDI LETTER SA 11D8A..11D8E ; XID_Continue # Mc [5] GUNJALA GONDI VOWEL SIGN AA..GUNJALA GONDI VOWEL SIGN UU 11D90..11D91 ; XID_Continue # Mn [2] GUNJALA GONDI VOWEL SIGN EE..GUNJALA GONDI VOWEL SIGN AI 11D93..11D94 ; XID_Continue # Mc [2] GUNJALA GONDI VOWEL SIGN OO..GUNJALA GONDI VOWEL SIGN AU 11D95 ; XID_Continue # Mn GUNJALA GONDI SIGN ANUSVARA 11D96 ; XID_Continue # Mc GUNJALA GONDI SIGN VISARGA 11D97 ; XID_Continue # Mn GUNJALA GONDI VIRAMA 11D98 ; XID_Continue # Lo GUNJALA GONDI OM 11DA0..11DA9 ; XID_Continue # Nd [10] GUNJALA GONDI DIGIT ZERO..GUNJALA GONDI DIGIT NINE 11DB0..11DD8 ; XID_Continue # Lo [41] TOLONG SIKI LETTER I..TOLONG SIKI LETTER RRH 11DD9 ; XID_Continue # Lm TOLONG SIKI SIGN SELA 11DDA..11DDB ; XID_Continue # Lo [2] TOLONG SIKI SIGN HECAKA..TOLONG SIKI UNGGA 11DE0..11DE9 ; XID_Continue # Nd [10] TOLONG SIKI DIGIT ZERO..TOLONG SIKI DIGIT NINE 11EE0..11EF2 ; XID_Continue # Lo [19] MAKASAR LETTER KA..MAKASAR ANGKA 11EF3..11EF4 ; XID_Continue # Mn [2] MAKASAR 
VOWEL SIGN I..MAKASAR VOWEL SIGN U 11EF5..11EF6 ; XID_Continue # Mc [2] MAKASAR VOWEL SIGN E..MAKASAR VOWEL SIGN O 11F00..11F01 ; XID_Continue # Mn [2] KAWI SIGN CANDRABINDU..KAWI SIGN ANUSVARA 11F02 ; XID_Continue # Lo KAWI SIGN REPHA 11F03 ; XID_Continue # Mc KAWI SIGN VISARGA 11F04..11F10 ; XID_Continue # Lo [13] KAWI LETTER A..KAWI LETTER O 11F12..11F33 ; XID_Continue # Lo [34] KAWI LETTER KA..KAWI LETTER JNYA 11F34..11F35 ; XID_Continue # Mc [2] KAWI VOWEL SIGN AA..KAWI VOWEL SIGN ALTERNATE AA 11F36..11F3A ; XID_Continue # Mn [5] KAWI VOWEL SIGN I..KAWI VOWEL SIGN VOCALIC R 11F3E..11F3F ; XID_Continue # Mc [2] KAWI VOWEL SIGN E..KAWI VOWEL SIGN AI 11F40 ; XID_Continue # Mn KAWI VOWEL SIGN EU 11F41 ; XID_Continue # Mc KAWI SIGN KILLER 11F42 ; XID_Continue # Mn KAWI CONJOINER 11F50..11F59 ; XID_Continue # Nd [10] KAWI DIGIT ZERO..KAWI DIGIT NINE 11F5A ; XID_Continue # Mn KAWI SIGN NUKTA 11FB0 ; XID_Continue # Lo LISU LETTER YHA 12000..12399 ; XID_Continue # Lo [922] CUNEIFORM SIGN A..CUNEIFORM SIGN U U 12400..1246E ; XID_Continue # Nl [111] CUNEIFORM NUMERIC SIGN TWO ASH..CUNEIFORM NUMERIC SIGN NINE U VARIANT FORM 12480..12543 ; XID_Continue # Lo [196] CUNEIFORM SIGN AB TIMES NUN TENU..CUNEIFORM SIGN ZU5 TIMES THREE DISH TENU 12F90..12FF0 ; XID_Continue # Lo [97] CYPRO-MINOAN SIGN CM001..CYPRO-MINOAN SIGN CM114 13000..1342F ; XID_Continue # Lo [1072] EGYPTIAN HIEROGLYPH A001..EGYPTIAN HIEROGLYPH V011D 13440 ; XID_Continue # Mn EGYPTIAN HIEROGLYPH MIRROR HORIZONTALLY 13441..13446 ; XID_Continue # Lo [6] EGYPTIAN HIEROGLYPH FULL BLANK..EGYPTIAN HIEROGLYPH WIDE LOST SIGN 13447..13455 ; XID_Continue # Mn [15] EGYPTIAN HIEROGLYPH MODIFIER DAMAGED AT TOP START..EGYPTIAN HIEROGLYPH MODIFIER DAMAGED 13460..143FA ; XID_Continue # Lo [3995] EGYPTIAN HIEROGLYPH-13460..EGYPTIAN HIEROGLYPH-143FA 14400..14646 ; XID_Continue # Lo [583] ANATOLIAN HIEROGLYPH A001..ANATOLIAN HIEROGLYPH A530 16100..1611D ; XID_Continue # Lo [30] GURUNG KHEMA LETTER A..GURUNG KHEMA LETTER SA 
1611E..16129 ; XID_Continue # Mn [12] GURUNG KHEMA VOWEL SIGN AA..GURUNG KHEMA VOWEL LENGTH MARK 1612A..1612C ; XID_Continue # Mc [3] GURUNG KHEMA CONSONANT SIGN MEDIAL YA..GURUNG KHEMA CONSONANT SIGN MEDIAL HA 1612D..1612F ; XID_Continue # Mn [3] GURUNG KHEMA SIGN ANUSVARA..GURUNG KHEMA SIGN THOLHOMA 16130..16139 ; XID_Continue # Nd [10] GURUNG KHEMA DIGIT ZERO..GURUNG KHEMA DIGIT NINE 16800..16A38 ; XID_Continue # Lo [569] BAMUM LETTER PHASE-A NGKUE MFON..BAMUM LETTER PHASE-F VUEQ 16A40..16A5E ; XID_Continue # Lo [31] MRO LETTER TA..MRO LETTER TEK 16A60..16A69 ; XID_Continue # Nd [10] MRO DIGIT ZERO..MRO DIGIT NINE 16A70..16ABE ; XID_Continue # Lo [79] TANGSA LETTER OZ..TANGSA LETTER ZA 16AC0..16AC9 ; XID_Continue # Nd [10] TANGSA DIGIT ZERO..TANGSA DIGIT NINE 16AD0..16AED ; XID_Continue # Lo [30] BASSA VAH LETTER ENNI..BASSA VAH LETTER I 16AF0..16AF4 ; XID_Continue # Mn [5] BASSA VAH COMBINING HIGH TONE..BASSA VAH COMBINING HIGH-LOW TONE 16B00..16B2F ; XID_Continue # Lo [48] PAHAWH HMONG VOWEL KEEB..PAHAWH HMONG CONSONANT CAU 16B30..16B36 ; XID_Continue # Mn [7] PAHAWH HMONG MARK CIM TUB..PAHAWH HMONG MARK CIM TAUM 16B40..16B43 ; XID_Continue # Lm [4] PAHAWH HMONG SIGN VOS SEEV..PAHAWH HMONG SIGN IB YAM 16B50..16B59 ; XID_Continue # Nd [10] PAHAWH HMONG DIGIT ZERO..PAHAWH HMONG DIGIT NINE 16B63..16B77 ; XID_Continue # Lo [21] PAHAWH HMONG SIGN VOS LUB..PAHAWH HMONG SIGN CIM NRES TOS 16B7D..16B8F ; XID_Continue # Lo [19] PAHAWH HMONG CLAN SIGN TSHEEJ..PAHAWH HMONG CLAN SIGN VWJ 16D40..16D42 ; XID_Continue # Lm [3] KIRAT RAI SIGN ANUSVARA..KIRAT RAI SIGN VISARGA 16D43..16D6A ; XID_Continue # Lo [40] KIRAT RAI LETTER A..KIRAT RAI VOWEL SIGN AU 16D6B..16D6C ; XID_Continue # Lm [2] KIRAT RAI SIGN VIRAMA..KIRAT RAI SIGN SAAT 16D70..16D79 ; XID_Continue # Nd [10] KIRAT RAI DIGIT ZERO..KIRAT RAI DIGIT NINE 16E40..16E7F ; XID_Continue # L& [64] MEDEFAIDRIN CAPITAL LETTER M..MEDEFAIDRIN SMALL LETTER Y 16EA0..16EB8 ; XID_Continue # L& [25] BERIA ERFE CAPITAL LETTER 
ARKAB..BERIA ERFE CAPITAL LETTER AY 16EBB..16ED3 ; XID_Continue # L& [25] BERIA ERFE SMALL LETTER ARKAB..BERIA ERFE SMALL LETTER AY 16F00..16F4A ; XID_Continue # Lo [75] MIAO LETTER PA..MIAO LETTER RTE 16F4F ; XID_Continue # Mn MIAO SIGN CONSONANT MODIFIER BAR 16F50 ; XID_Continue # Lo MIAO LETTER NASALIZATION 16F51..16F87 ; XID_Continue # Mc [55] MIAO SIGN ASPIRATION..MIAO VOWEL SIGN UI 16F8F..16F92 ; XID_Continue # Mn [4] MIAO TONE RIGHT..MIAO TONE BELOW 16F93..16F9F ; XID_Continue # Lm [13] MIAO LETTER TONE-2..MIAO LETTER REFORMED TONE-8 16FE0..16FE1 ; XID_Continue # Lm [2] TANGUT ITERATION MARK..NUSHU ITERATION MARK 16FE3 ; XID_Continue # Lm OLD CHINESE ITERATION MARK 16FE4 ; XID_Continue # Mn KHITAN SMALL SCRIPT FILLER 16FF0..16FF1 ; XID_Continue # Mc [2] VIETNAMESE ALTERNATE READING MARK CA..VIETNAMESE ALTERNATE READING MARK NHAY 16FF2..16FF3 ; XID_Continue # Lm [2] CHINESE SMALL SIMPLIFIED ER..CHINESE SMALL TRADITIONAL ER 16FF4..16FF6 ; XID_Continue # Nl [3] YANGQIN SIGN SLOW ONE BEAT..YANGQIN SIGN SLOW TWO BEATS 17000..18CD5 ; XID_Continue # Lo [7382] TANGUT IDEOGRAPH-17000..KHITAN SMALL SCRIPT CHARACTER-18CD5 18CFF..18D1E ; XID_Continue # Lo [32] KHITAN SMALL SCRIPT CHARACTER-18CFF..TANGUT IDEOGRAPH-18D1E 18D80..18DF2 ; XID_Continue # Lo [115] TANGUT COMPONENT-769..TANGUT COMPONENT-883 1AFF0..1AFF3 ; XID_Continue # Lm [4] KATAKANA LETTER MINNAN TONE-2..KATAKANA LETTER MINNAN TONE-5 1AFF5..1AFFB ; XID_Continue # Lm [7] KATAKANA LETTER MINNAN TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-5 1AFFD..1AFFE ; XID_Continue # Lm [2] KATAKANA LETTER MINNAN NASALIZED TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-8 1B000..1B122 ; XID_Continue # Lo [291] KATAKANA LETTER ARCHAIC E..KATAKANA LETTER ARCHAIC WU 1B132 ; XID_Continue # Lo HIRAGANA LETTER SMALL KO 1B150..1B152 ; XID_Continue # Lo [3] HIRAGANA LETTER SMALL WI..HIRAGANA LETTER SMALL WO 1B155 ; XID_Continue # Lo KATAKANA LETTER SMALL KO 1B164..1B167 ; XID_Continue # Lo [4] KATAKANA LETTER SMALL WI..KATAKANA 
LETTER SMALL N 1B170..1B2FB ; XID_Continue # Lo [396] NUSHU CHARACTER-1B170..NUSHU CHARACTER-1B2FB 1BC00..1BC6A ; XID_Continue # Lo [107] DUPLOYAN LETTER H..DUPLOYAN LETTER VOCALIC M 1BC70..1BC7C ; XID_Continue # Lo [13] DUPLOYAN AFFIX LEFT HORIZONTAL SECANT..DUPLOYAN AFFIX ATTACHED TANGENT HOOK 1BC80..1BC88 ; XID_Continue # Lo [9] DUPLOYAN AFFIX HIGH ACUTE..DUPLOYAN AFFIX HIGH VERTICAL 1BC90..1BC99 ; XID_Continue # Lo [10] DUPLOYAN AFFIX LOW ACUTE..DUPLOYAN AFFIX LOW ARROW 1BC9D..1BC9E ; XID_Continue # Mn [2] DUPLOYAN THICK LETTER SELECTOR..DUPLOYAN DOUBLE MARK 1CCF0..1CCF9 ; XID_Continue # Nd [10] OUTLINED DIGIT ZERO..OUTLINED DIGIT NINE 1CF00..1CF2D ; XID_Continue # Mn [46] ZNAMENNY COMBINING MARK GORAZDO NIZKO S KRYZHEM ON LEFT..ZNAMENNY COMBINING MARK KRYZH ON LEFT 1CF30..1CF46 ; XID_Continue # Mn [23] ZNAMENNY COMBINING TONAL RANGE MARK MRACHNO..ZNAMENNY PRIZNAK MODIFIER ROG 1D165..1D166 ; XID_Continue # Mc [2] MUSICAL SYMBOL COMBINING STEM..MUSICAL SYMBOL COMBINING SPRECHGESANG STEM 1D167..1D169 ; XID_Continue # Mn [3] MUSICAL SYMBOL COMBINING TREMOLO-1..MUSICAL SYMBOL COMBINING TREMOLO-3 1D16D..1D172 ; XID_Continue # Mc [6] MUSICAL SYMBOL COMBINING AUGMENTATION DOT..MUSICAL SYMBOL COMBINING FLAG-5 1D17B..1D182 ; XID_Continue # Mn [8] MUSICAL SYMBOL COMBINING ACCENT..MUSICAL SYMBOL COMBINING LOURE 1D185..1D18B ; XID_Continue # Mn [7] MUSICAL SYMBOL COMBINING DOIT..MUSICAL SYMBOL COMBINING TRIPLE TONGUE 1D1AA..1D1AD ; XID_Continue # Mn [4] MUSICAL SYMBOL COMBINING DOWN BOW..MUSICAL SYMBOL COMBINING SNAP PIZZICATO 1D242..1D244 ; XID_Continue # Mn [3] COMBINING GREEK MUSICAL TRISEME..COMBINING GREEK MUSICAL PENTASEME 1D400..1D454 ; XID_Continue # L& [85] MATHEMATICAL BOLD CAPITAL A..MATHEMATICAL ITALIC SMALL G 1D456..1D49C ; XID_Continue # L& [71] MATHEMATICAL ITALIC SMALL I..MATHEMATICAL SCRIPT CAPITAL A 1D49E..1D49F ; XID_Continue # L& [2] MATHEMATICAL SCRIPT CAPITAL C..MATHEMATICAL SCRIPT CAPITAL D 1D4A2 ; XID_Continue # L& MATHEMATICAL SCRIPT CAPITAL G 
1D4A5..1D4A6 ; XID_Continue # L& [2] MATHEMATICAL SCRIPT CAPITAL J..MATHEMATICAL SCRIPT CAPITAL K 1D4A9..1D4AC ; XID_Continue # L& [4] MATHEMATICAL SCRIPT CAPITAL N..MATHEMATICAL SCRIPT CAPITAL Q 1D4AE..1D4B9 ; XID_Continue # L& [12] MATHEMATICAL SCRIPT CAPITAL S..MATHEMATICAL SCRIPT SMALL D 1D4BB ; XID_Continue # L& MATHEMATICAL SCRIPT SMALL F 1D4BD..1D4C3 ; XID_Continue # L& [7] MATHEMATICAL SCRIPT SMALL H..MATHEMATICAL SCRIPT SMALL N 1D4C5..1D505 ; XID_Continue # L& [65] MATHEMATICAL SCRIPT SMALL P..MATHEMATICAL FRAKTUR CAPITAL B 1D507..1D50A ; XID_Continue # L& [4] MATHEMATICAL FRAKTUR CAPITAL D..MATHEMATICAL FRAKTUR CAPITAL G 1D50D..1D514 ; XID_Continue # L& [8] MATHEMATICAL FRAKTUR CAPITAL J..MATHEMATICAL FRAKTUR CAPITAL Q 1D516..1D51C ; XID_Continue # L& [7] MATHEMATICAL FRAKTUR CAPITAL S..MATHEMATICAL FRAKTUR CAPITAL Y 1D51E..1D539 ; XID_Continue # L& [28] MATHEMATICAL FRAKTUR SMALL A..MATHEMATICAL DOUBLE-STRUCK CAPITAL B 1D53B..1D53E ; XID_Continue # L& [4] MATHEMATICAL DOUBLE-STRUCK CAPITAL D..MATHEMATICAL DOUBLE-STRUCK CAPITAL G 1D540..1D544 ; XID_Continue # L& [5] MATHEMATICAL DOUBLE-STRUCK CAPITAL I..MATHEMATICAL DOUBLE-STRUCK CAPITAL M 1D546 ; XID_Continue # L& MATHEMATICAL DOUBLE-STRUCK CAPITAL O 1D54A..1D550 ; XID_Continue # L& [7] MATHEMATICAL DOUBLE-STRUCK CAPITAL S..MATHEMATICAL DOUBLE-STRUCK CAPITAL Y 1D552..1D6A5 ; XID_Continue # L& [340] MATHEMATICAL DOUBLE-STRUCK SMALL A..MATHEMATICAL ITALIC SMALL DOTLESS J 1D6A8..1D6C0 ; XID_Continue # L& [25] MATHEMATICAL BOLD CAPITAL ALPHA..MATHEMATICAL BOLD CAPITAL OMEGA 1D6C2..1D6DA ; XID_Continue # L& [25] MATHEMATICAL BOLD SMALL ALPHA..MATHEMATICAL BOLD SMALL OMEGA 1D6DC..1D6FA ; XID_Continue # L& [31] MATHEMATICAL BOLD EPSILON SYMBOL..MATHEMATICAL ITALIC CAPITAL OMEGA 1D6FC..1D714 ; XID_Continue # L& [25] MATHEMATICAL ITALIC SMALL ALPHA..MATHEMATICAL ITALIC SMALL OMEGA 1D716..1D734 ; XID_Continue # L& [31] MATHEMATICAL ITALIC EPSILON SYMBOL..MATHEMATICAL BOLD ITALIC CAPITAL OMEGA 1D736..1D74E ; 
XID_Continue # L& [25] MATHEMATICAL BOLD ITALIC SMALL ALPHA..MATHEMATICAL BOLD ITALIC SMALL OMEGA 1D750..1D76E ; XID_Continue # L& [31] MATHEMATICAL BOLD ITALIC EPSILON SYMBOL..MATHEMATICAL SANS-SERIF BOLD CAPITAL OMEGA 1D770..1D788 ; XID_Continue # L& [25] MATHEMATICAL SANS-SERIF BOLD SMALL ALPHA..MATHEMATICAL SANS-SERIF BOLD SMALL OMEGA 1D78A..1D7A8 ; XID_Continue # L& [31] MATHEMATICAL SANS-SERIF BOLD EPSILON SYMBOL..MATHEMATICAL SANS-SERIF BOLD ITALIC CAPITAL OMEGA 1D7AA..1D7C2 ; XID_Continue # L& [25] MATHEMATICAL SANS-SERIF BOLD ITALIC SMALL ALPHA..MATHEMATICAL SANS-SERIF BOLD ITALIC SMALL OMEGA 1D7C4..1D7CB ; XID_Continue # L& [8] MATHEMATICAL SANS-SERIF BOLD ITALIC EPSILON SYMBOL..MATHEMATICAL BOLD SMALL DIGAMMA 1D7CE..1D7FF ; XID_Continue # Nd [50] MATHEMATICAL BOLD DIGIT ZERO..MATHEMATICAL MONOSPACE DIGIT NINE 1DA00..1DA36 ; XID_Continue # Mn [55] SIGNWRITING HEAD RIM..SIGNWRITING AIR SUCKING IN 1DA3B..1DA6C ; XID_Continue # Mn [50] SIGNWRITING MOUTH CLOSED NEUTRAL..SIGNWRITING EXCITEMENT 1DA75 ; XID_Continue # Mn SIGNWRITING UPPER BODY TILTING FROM HIP JOINTS 1DA84 ; XID_Continue # Mn SIGNWRITING LOCATION HEAD NECK 1DA9B..1DA9F ; XID_Continue # Mn [5] SIGNWRITING FILL MODIFIER-2..SIGNWRITING FILL MODIFIER-6 1DAA1..1DAAF ; XID_Continue # Mn [15] SIGNWRITING ROTATION MODIFIER-2..SIGNWRITING ROTATION MODIFIER-16 1DF00..1DF09 ; XID_Continue # L& [10] LATIN SMALL LETTER FENG DIGRAPH WITH TRILL..LATIN SMALL LETTER T WITH HOOK AND RETROFLEX HOOK 1DF0A ; XID_Continue # Lo LATIN LETTER RETROFLEX CLICK WITH RETROFLEX HOOK 1DF0B..1DF1E ; XID_Continue # L& [20] LATIN SMALL LETTER ESH WITH DOUBLE BAR..LATIN SMALL LETTER S WITH CURL 1DF25..1DF2A ; XID_Continue # L& [6] LATIN SMALL LETTER D WITH MID-HEIGHT LEFT HOOK..LATIN SMALL LETTER T WITH MID-HEIGHT LEFT HOOK 1E000..1E006 ; XID_Continue # Mn [7] COMBINING GLAGOLITIC LETTER AZU..COMBINING GLAGOLITIC LETTER ZHIVETE 1E008..1E018 ; XID_Continue # Mn [17] COMBINING GLAGOLITIC LETTER ZEMLJA..COMBINING GLAGOLITIC LETTER 
HERU 1E01B..1E021 ; XID_Continue # Mn [7] COMBINING GLAGOLITIC LETTER SHTA..COMBINING GLAGOLITIC LETTER YATI 1E023..1E024 ; XID_Continue # Mn [2] COMBINING GLAGOLITIC LETTER YU..COMBINING GLAGOLITIC LETTER SMALL YUS 1E026..1E02A ; XID_Continue # Mn [5] COMBINING GLAGOLITIC LETTER YO..COMBINING GLAGOLITIC LETTER FITA 1E030..1E06D ; XID_Continue # Lm [62] MODIFIER LETTER CYRILLIC SMALL A..MODIFIER LETTER CYRILLIC SMALL STRAIGHT U WITH STROKE 1E08F ; XID_Continue # Mn COMBINING CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I 1E100..1E12C ; XID_Continue # Lo [45] NYIAKENG PUACHUE HMONG LETTER MA..NYIAKENG PUACHUE HMONG LETTER W 1E130..1E136 ; XID_Continue # Mn [7] NYIAKENG PUACHUE HMONG TONE-B..NYIAKENG PUACHUE HMONG TONE-D 1E137..1E13D ; XID_Continue # Lm [7] NYIAKENG PUACHUE HMONG SIGN FOR PERSON..NYIAKENG PUACHUE HMONG SYLLABLE LENGTHENER 1E140..1E149 ; XID_Continue # Nd [10] NYIAKENG PUACHUE HMONG DIGIT ZERO..NYIAKENG PUACHUE HMONG DIGIT NINE 1E14E ; XID_Continue # Lo NYIAKENG PUACHUE HMONG LOGOGRAM NYAJ 1E290..1E2AD ; XID_Continue # Lo [30] TOTO LETTER PA..TOTO LETTER A 1E2AE ; XID_Continue # Mn TOTO SIGN RISING TONE 1E2C0..1E2EB ; XID_Continue # Lo [44] WANCHO LETTER AA..WANCHO LETTER YIH 1E2EC..1E2EF ; XID_Continue # Mn [4] WANCHO TONE TUP..WANCHO TONE KOINI 1E2F0..1E2F9 ; XID_Continue # Nd [10] WANCHO DIGIT ZERO..WANCHO DIGIT NINE 1E4D0..1E4EA ; XID_Continue # Lo [27] NAG MUNDARI LETTER O..NAG MUNDARI LETTER ELL 1E4EB ; XID_Continue # Lm NAG MUNDARI SIGN OJOD 1E4EC..1E4EF ; XID_Continue # Mn [4] NAG MUNDARI SIGN MUHOR..NAG MUNDARI SIGN SUTUH 1E4F0..1E4F9 ; XID_Continue # Nd [10] NAG MUNDARI DIGIT ZERO..NAG MUNDARI DIGIT NINE 1E5D0..1E5ED ; XID_Continue # Lo [30] OL ONAL LETTER O..OL ONAL LETTER EG 1E5EE..1E5EF ; XID_Continue # Mn [2] OL ONAL SIGN MU..OL ONAL SIGN IKIR 1E5F0 ; XID_Continue # Lo OL ONAL SIGN HODDOND 1E5F1..1E5FA ; XID_Continue # Nd [10] OL ONAL DIGIT ZERO..OL ONAL DIGIT NINE 1E6C0..1E6DE ; XID_Continue # Lo [31] TAI YO LETTER LOW KO..TAI YO LETTER 
HIGH KVO 1E6E0..1E6E2 ; XID_Continue # Lo [3] TAI YO LETTER AA..TAI YO LETTER UE 1E6E3 ; XID_Continue # Mn TAI YO SIGN UE 1E6E4..1E6E5 ; XID_Continue # Lo [2] TAI YO LETTER U..TAI YO LETTER AE 1E6E6 ; XID_Continue # Mn TAI YO SIGN AU 1E6E7..1E6ED ; XID_Continue # Lo [7] TAI YO LETTER O..TAI YO LETTER AUE 1E6EE..1E6EF ; XID_Continue # Mn [2] TAI YO SIGN AY..TAI YO SIGN ANG 1E6F0..1E6F4 ; XID_Continue # Lo [5] TAI YO LETTER AN..TAI YO LETTER AP 1E6F5 ; XID_Continue # Mn TAI YO SIGN OM 1E6FE ; XID_Continue # Lo TAI YO SYMBOL MUEANG 1E6FF ; XID_Continue # Lm TAI YO XAM LAI 1E7E0..1E7E6 ; XID_Continue # Lo [7] ETHIOPIC SYLLABLE HHYA..ETHIOPIC SYLLABLE HHYO 1E7E8..1E7EB ; XID_Continue # Lo [4] ETHIOPIC SYLLABLE GURAGE HHWA..ETHIOPIC SYLLABLE HHWE 1E7ED..1E7EE ; XID_Continue # Lo [2] ETHIOPIC SYLLABLE GURAGE MWI..ETHIOPIC SYLLABLE GURAGE MWEE 1E7F0..1E7FE ; XID_Continue # Lo [15] ETHIOPIC SYLLABLE GURAGE QWI..ETHIOPIC SYLLABLE GURAGE PWEE 1E800..1E8C4 ; XID_Continue # Lo [197] MENDE KIKAKUI SYLLABLE M001 KI..MENDE KIKAKUI SYLLABLE M060 NYON 1E8D0..1E8D6 ; XID_Continue # Mn [7] MENDE KIKAKUI COMBINING NUMBER TEENS..MENDE KIKAKUI COMBINING NUMBER MILLIONS 1E900..1E943 ; XID_Continue # L& [68] ADLAM CAPITAL LETTER ALIF..ADLAM SMALL LETTER SHA 1E944..1E94A ; XID_Continue # Mn [7] ADLAM ALIF LENGTHENER..ADLAM NUKTA 1E94B ; XID_Continue # Lm ADLAM NASALIZATION MARK 1E950..1E959 ; XID_Continue # Nd [10] ADLAM DIGIT ZERO..ADLAM DIGIT NINE 1EE00..1EE03 ; XID_Continue # Lo [4] ARABIC MATHEMATICAL ALEF..ARABIC MATHEMATICAL DAL 1EE05..1EE1F ; XID_Continue # Lo [27] ARABIC MATHEMATICAL WAW..ARABIC MATHEMATICAL DOTLESS QAF 1EE21..1EE22 ; XID_Continue # Lo [2] ARABIC MATHEMATICAL INITIAL BEH..ARABIC MATHEMATICAL INITIAL JEEM 1EE24 ; XID_Continue # Lo ARABIC MATHEMATICAL INITIAL HEH 1EE27 ; XID_Continue # Lo ARABIC MATHEMATICAL INITIAL HAH 1EE29..1EE32 ; XID_Continue # Lo [10] ARABIC MATHEMATICAL INITIAL YEH..ARABIC MATHEMATICAL INITIAL QAF 1EE34..1EE37 ; XID_Continue # Lo [4] ARABIC 
MATHEMATICAL INITIAL SHEEN..ARABIC MATHEMATICAL INITIAL KHAH 1EE39 ; XID_Continue # Lo ARABIC MATHEMATICAL INITIAL DAD 1EE3B ; XID_Continue # Lo ARABIC MATHEMATICAL INITIAL GHAIN 1EE42 ; XID_Continue # Lo ARABIC MATHEMATICAL TAILED JEEM 1EE47 ; XID_Continue # Lo ARABIC MATHEMATICAL TAILED HAH 1EE49 ; XID_Continue # Lo ARABIC MATHEMATICAL TAILED YEH 1EE4B ; XID_Continue # Lo ARABIC MATHEMATICAL TAILED LAM 1EE4D..1EE4F ; XID_Continue # Lo [3] ARABIC MATHEMATICAL TAILED NOON..ARABIC MATHEMATICAL TAILED AIN 1EE51..1EE52 ; XID_Continue # Lo [2] ARABIC MATHEMATICAL TAILED SAD..ARABIC MATHEMATICAL TAILED QAF 1EE54 ; XID_Continue # Lo ARABIC MATHEMATICAL TAILED SHEEN 1EE57 ; XID_Continue # Lo ARABIC MATHEMATICAL TAILED KHAH 1EE59 ; XID_Continue # Lo ARABIC MATHEMATICAL TAILED DAD 1EE5B ; XID_Continue # Lo ARABIC MATHEMATICAL TAILED GHAIN 1EE5D ; XID_Continue # Lo ARABIC MATHEMATICAL TAILED DOTLESS NOON 1EE5F ; XID_Continue # Lo ARABIC MATHEMATICAL TAILED DOTLESS QAF 1EE61..1EE62 ; XID_Continue # Lo [2] ARABIC MATHEMATICAL STRETCHED BEH..ARABIC MATHEMATICAL STRETCHED JEEM 1EE64 ; XID_Continue # Lo ARABIC MATHEMATICAL STRETCHED HEH 1EE67..1EE6A ; XID_Continue # Lo [4] ARABIC MATHEMATICAL STRETCHED HAH..ARABIC MATHEMATICAL STRETCHED KAF 1EE6C..1EE72 ; XID_Continue # Lo [7] ARABIC MATHEMATICAL STRETCHED MEEM..ARABIC MATHEMATICAL STRETCHED QAF 1EE74..1EE77 ; XID_Continue # Lo [4] ARABIC MATHEMATICAL STRETCHED SHEEN..ARABIC MATHEMATICAL STRETCHED KHAH 1EE79..1EE7C ; XID_Continue # Lo [4] ARABIC MATHEMATICAL STRETCHED DAD..ARABIC MATHEMATICAL STRETCHED DOTLESS BEH 1EE7E ; XID_Continue # Lo ARABIC MATHEMATICAL STRETCHED DOTLESS FEH 1EE80..1EE89 ; XID_Continue # Lo [10] ARABIC MATHEMATICAL LOOPED ALEF..ARABIC MATHEMATICAL LOOPED YEH 1EE8B..1EE9B ; XID_Continue # Lo [17] ARABIC MATHEMATICAL LOOPED LAM..ARABIC MATHEMATICAL LOOPED GHAIN 1EEA1..1EEA3 ; XID_Continue # Lo [3] ARABIC MATHEMATICAL DOUBLE-STRUCK BEH..ARABIC MATHEMATICAL DOUBLE-STRUCK DAL 1EEA5..1EEA9 ; XID_Continue # Lo [5] 
ARABIC MATHEMATICAL DOUBLE-STRUCK WAW..ARABIC MATHEMATICAL DOUBLE-STRUCK YEH 1EEAB..1EEBB ; XID_Continue # Lo [17] ARABIC MATHEMATICAL DOUBLE-STRUCK LAM..ARABIC MATHEMATICAL DOUBLE-STRUCK GHAIN 1FBF0..1FBF9 ; XID_Continue # Nd [10] SEGMENTED DIGIT ZERO..SEGMENTED DIGIT NINE 20000..2A6DF ; XID_Continue # Lo [42720] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6DF 2A700..2B81D ; XID_Continue # Lo [4382] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B81D 2B820..2CEAD ; XID_Continue # Lo [5774] CJK UNIFIED IDEOGRAPH-2B820..CJK UNIFIED IDEOGRAPH-2CEAD 2CEB0..2EBE0 ; XID_Continue # Lo [7473] CJK UNIFIED IDEOGRAPH-2CEB0..CJK UNIFIED IDEOGRAPH-2EBE0 2EBF0..2EE5D ; XID_Continue # Lo [622] CJK UNIFIED IDEOGRAPH-2EBF0..CJK UNIFIED IDEOGRAPH-2EE5D 2F800..2FA1D ; XID_Continue # Lo [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D 30000..3134A ; XID_Continue # Lo [4939] CJK UNIFIED IDEOGRAPH-30000..CJK UNIFIED IDEOGRAPH-3134A 31350..33479 ; XID_Continue # Lo [8490] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-33479 E0100..E01EF ; XID_Continue # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256 # Total code points: 149221 # ================================================ # Derived Property: Default_Ignorable_Code_Point # Generated from # Other_Default_Ignorable_Code_Point # + Cf (Format characters) # + Variation_Selector # - White_Space # - FFF9..FFFB (Interlinear annotation format characters) # - 13430..13440 (Egyptian hieroglyph format characters) # - Prepended_Concatenation_Mark (Exceptional format characters that should be visible) # # There are currently no stability guarantees for DICP. However, the # values of DICP interact with the derivation of XID_Continue # and NFKC_CF, for which there are stability guarantees. 
# Maintainers of this property should note that in the # unlikely case that the DICP value changes for an existing character # which is also XID_Continue=Yes, then exceptions must be put # in place to ensure that the NFKC_CF mapping value for that # existing character does not change. 00AD ; Default_Ignorable_Code_Point # Cf SOFT HYPHEN 034F ; Default_Ignorable_Code_Point # Mn COMBINING GRAPHEME JOINER 061C ; Default_Ignorable_Code_Point # Cf ARABIC LETTER MARK 115F..1160 ; Default_Ignorable_Code_Point # Lo [2] HANGUL CHOSEONG FILLER..HANGUL JUNGSEONG FILLER 17B4..17B5 ; Default_Ignorable_Code_Point # Mn [2] KHMER VOWEL INHERENT AQ..KHMER VOWEL INHERENT AA 180B..180D ; Default_Ignorable_Code_Point # Mn [3] MONGOLIAN FREE VARIATION SELECTOR ONE..MONGOLIAN FREE VARIATION SELECTOR THREE 180E ; Default_Ignorable_Code_Point # Cf MONGOLIAN VOWEL SEPARATOR 180F ; Default_Ignorable_Code_Point # Mn MONGOLIAN FREE VARIATION SELECTOR FOUR 200B..200F ; Default_Ignorable_Code_Point # Cf [5] ZERO WIDTH SPACE..RIGHT-TO-LEFT MARK 202A..202E ; Default_Ignorable_Code_Point # Cf [5] LEFT-TO-RIGHT EMBEDDING..RIGHT-TO-LEFT OVERRIDE 2060..2064 ; Default_Ignorable_Code_Point # Cf [5] WORD JOINER..INVISIBLE PLUS 2065 ; Default_Ignorable_Code_Point # Cn 2066..206F ; Default_Ignorable_Code_Point # Cf [10] LEFT-TO-RIGHT ISOLATE..NOMINAL DIGIT SHAPES 3164 ; Default_Ignorable_Code_Point # Lo HANGUL FILLER FE00..FE0F ; Default_Ignorable_Code_Point # Mn [16] VARIATION SELECTOR-1..VARIATION SELECTOR-16 FEFF ; Default_Ignorable_Code_Point # Cf ZERO WIDTH NO-BREAK SPACE FFA0 ; Default_Ignorable_Code_Point # Lo HALFWIDTH HANGUL FILLER FFF0..FFF8 ; Default_Ignorable_Code_Point # Cn [9] .. 
1BCA0..1BCA3 ; Default_Ignorable_Code_Point # Cf [4] SHORTHAND FORMAT LETTER OVERLAP..SHORTHAND FORMAT UP STEP 1D173..1D17A ; Default_Ignorable_Code_Point # Cf [8] MUSICAL SYMBOL BEGIN BEAM..MUSICAL SYMBOL END PHRASE E0000 ; Default_Ignorable_Code_Point # Cn E0001 ; Default_Ignorable_Code_Point # Cf LANGUAGE TAG E0002..E001F ; Default_Ignorable_Code_Point # Cn [30] .. E0020..E007F ; Default_Ignorable_Code_Point # Cf [96] TAG SPACE..CANCEL TAG E0080..E00FF ; Default_Ignorable_Code_Point # Cn [128] .. E0100..E01EF ; Default_Ignorable_Code_Point # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256 E01F0..E0FFF ; Default_Ignorable_Code_Point # Cn [3600] .. # Total code points: 4174 # ================================================ # Derived Property: Grapheme_Extend # Generated from: Me + Mn + Other_Grapheme_Extend # Note: depending on an application's interpretation of Co (private use), # they may be either in Grapheme_Base, or in Grapheme_Extend, or in neither. 0300..036F ; Grapheme_Extend # Mn [112] COMBINING GRAVE ACCENT..COMBINING LATIN SMALL LETTER X 0483..0487 ; Grapheme_Extend # Mn [5] COMBINING CYRILLIC TITLO..COMBINING CYRILLIC POKRYTIE 0488..0489 ; Grapheme_Extend # Me [2] COMBINING CYRILLIC HUNDRED THOUSANDS SIGN..COMBINING CYRILLIC MILLIONS SIGN 0591..05BD ; Grapheme_Extend # Mn [45] HEBREW ACCENT ETNAHTA..HEBREW POINT METEG 05BF ; Grapheme_Extend # Mn HEBREW POINT RAFE 05C1..05C2 ; Grapheme_Extend # Mn [2] HEBREW POINT SHIN DOT..HEBREW POINT SIN DOT 05C4..05C5 ; Grapheme_Extend # Mn [2] HEBREW MARK UPPER DOT..HEBREW MARK LOWER DOT 05C7 ; Grapheme_Extend # Mn HEBREW POINT QAMATS QATAN 0610..061A ; Grapheme_Extend # Mn [11] ARABIC SIGN SALLALLAHOU ALAYHE WASSALLAM..ARABIC SMALL KASRA 064B..065F ; Grapheme_Extend # Mn [21] ARABIC FATHATAN..ARABIC WAVY HAMZA BELOW 0670 ; Grapheme_Extend # Mn ARABIC LETTER SUPERSCRIPT ALEF 06D6..06DC ; Grapheme_Extend # Mn [7] ARABIC SMALL HIGH LIGATURE SAD WITH LAM WITH ALEF MAKSURA..ARABIC SMALL HIGH SEEN 06DF..06E4 ; 
Grapheme_Extend # Mn [6] ARABIC SMALL HIGH ROUNDED ZERO..ARABIC SMALL HIGH MADDA 06E7..06E8 ; Grapheme_Extend # Mn [2] ARABIC SMALL HIGH YEH..ARABIC SMALL HIGH NOON 06EA..06ED ; Grapheme_Extend # Mn [4] ARABIC EMPTY CENTRE LOW STOP..ARABIC SMALL LOW MEEM 0711 ; Grapheme_Extend # Mn SYRIAC LETTER SUPERSCRIPT ALAPH 0730..074A ; Grapheme_Extend # Mn [27] SYRIAC PTHAHA ABOVE..SYRIAC BARREKH 07A6..07B0 ; Grapheme_Extend # Mn [11] THAANA ABAFILI..THAANA SUKUN 07EB..07F3 ; Grapheme_Extend # Mn [9] NKO COMBINING SHORT HIGH TONE..NKO COMBINING DOUBLE DOT ABOVE 07FD ; Grapheme_Extend # Mn NKO DANTAYALAN 0816..0819 ; Grapheme_Extend # Mn [4] SAMARITAN MARK IN..SAMARITAN MARK DAGESH 081B..0823 ; Grapheme_Extend # Mn [9] SAMARITAN MARK EPENTHETIC YUT..SAMARITAN VOWEL SIGN A 0825..0827 ; Grapheme_Extend # Mn [3] SAMARITAN VOWEL SIGN SHORT A..SAMARITAN VOWEL SIGN U 0829..082D ; Grapheme_Extend # Mn [5] SAMARITAN VOWEL SIGN LONG I..SAMARITAN MARK NEQUDAA 0859..085B ; Grapheme_Extend # Mn [3] MANDAIC AFFRICATION MARK..MANDAIC GEMINATION MARK 0897..089F ; Grapheme_Extend # Mn [9] ARABIC PEPET..ARABIC HALF MADDA OVER MADDA 08CA..08E1 ; Grapheme_Extend # Mn [24] ARABIC SMALL HIGH FARSI YEH..ARABIC SMALL HIGH SIGN SAFHA 08E3..0902 ; Grapheme_Extend # Mn [32] ARABIC TURNED DAMMA BELOW..DEVANAGARI SIGN ANUSVARA 093A ; Grapheme_Extend # Mn DEVANAGARI VOWEL SIGN OE 093C ; Grapheme_Extend # Mn DEVANAGARI SIGN NUKTA 0941..0948 ; Grapheme_Extend # Mn [8] DEVANAGARI VOWEL SIGN U..DEVANAGARI VOWEL SIGN AI 094D ; Grapheme_Extend # Mn DEVANAGARI SIGN VIRAMA 0951..0957 ; Grapheme_Extend # Mn [7] DEVANAGARI STRESS SIGN UDATTA..DEVANAGARI VOWEL SIGN UUE 0962..0963 ; Grapheme_Extend # Mn [2] DEVANAGARI VOWEL SIGN VOCALIC L..DEVANAGARI VOWEL SIGN VOCALIC LL 0981 ; Grapheme_Extend # Mn BENGALI SIGN CANDRABINDU 09BC ; Grapheme_Extend # Mn BENGALI SIGN NUKTA 09BE ; Grapheme_Extend # Mc BENGALI VOWEL SIGN AA 09C1..09C4 ; Grapheme_Extend # Mn [4] BENGALI VOWEL SIGN U..BENGALI VOWEL SIGN VOCALIC RR 09CD ; 
Grapheme_Extend # Mn BENGALI SIGN VIRAMA 09D7 ; Grapheme_Extend # Mc BENGALI AU LENGTH MARK 09E2..09E3 ; Grapheme_Extend # Mn [2] BENGALI VOWEL SIGN VOCALIC L..BENGALI VOWEL SIGN VOCALIC LL 09FE ; Grapheme_Extend # Mn BENGALI SANDHI MARK 0A01..0A02 ; Grapheme_Extend # Mn [2] GURMUKHI SIGN ADAK BINDI..GURMUKHI SIGN BINDI 0A3C ; Grapheme_Extend # Mn GURMUKHI SIGN NUKTA 0A41..0A42 ; Grapheme_Extend # Mn [2] GURMUKHI VOWEL SIGN U..GURMUKHI VOWEL SIGN UU 0A47..0A48 ; Grapheme_Extend # Mn [2] GURMUKHI VOWEL SIGN EE..GURMUKHI VOWEL SIGN AI 0A4B..0A4D ; Grapheme_Extend # Mn [3] GURMUKHI VOWEL SIGN OO..GURMUKHI SIGN VIRAMA 0A51 ; Grapheme_Extend # Mn GURMUKHI SIGN UDAAT 0A70..0A71 ; Grapheme_Extend # Mn [2] GURMUKHI TIPPI..GURMUKHI ADDAK 0A75 ; Grapheme_Extend # Mn GURMUKHI SIGN YAKASH 0A81..0A82 ; Grapheme_Extend # Mn [2] GUJARATI SIGN CANDRABINDU..GUJARATI SIGN ANUSVARA 0ABC ; Grapheme_Extend # Mn GUJARATI SIGN NUKTA 0AC1..0AC5 ; Grapheme_Extend # Mn [5] GUJARATI VOWEL SIGN U..GUJARATI VOWEL SIGN CANDRA E 0AC7..0AC8 ; Grapheme_Extend # Mn [2] GUJARATI VOWEL SIGN E..GUJARATI VOWEL SIGN AI 0ACD ; Grapheme_Extend # Mn GUJARATI SIGN VIRAMA 0AE2..0AE3 ; Grapheme_Extend # Mn [2] GUJARATI VOWEL SIGN VOCALIC L..GUJARATI VOWEL SIGN VOCALIC LL 0AFA..0AFF ; Grapheme_Extend # Mn [6] GUJARATI SIGN SUKUN..GUJARATI SIGN TWO-CIRCLE NUKTA ABOVE 0B01 ; Grapheme_Extend # Mn ORIYA SIGN CANDRABINDU 0B3C ; Grapheme_Extend # Mn ORIYA SIGN NUKTA 0B3E ; Grapheme_Extend # Mc ORIYA VOWEL SIGN AA 0B3F ; Grapheme_Extend # Mn ORIYA VOWEL SIGN I 0B41..0B44 ; Grapheme_Extend # Mn [4] ORIYA VOWEL SIGN U..ORIYA VOWEL SIGN VOCALIC RR 0B4D ; Grapheme_Extend # Mn ORIYA SIGN VIRAMA 0B55..0B56 ; Grapheme_Extend # Mn [2] ORIYA SIGN OVERLINE..ORIYA AI LENGTH MARK 0B57 ; Grapheme_Extend # Mc ORIYA AU LENGTH MARK 0B62..0B63 ; Grapheme_Extend # Mn [2] ORIYA VOWEL SIGN VOCALIC L..ORIYA VOWEL SIGN VOCALIC LL 0B82 ; Grapheme_Extend # Mn TAMIL SIGN ANUSVARA 0BBE ; Grapheme_Extend # Mc TAMIL VOWEL SIGN AA 0BC0 ; 
Grapheme_Extend # Mn TAMIL VOWEL SIGN II 0BCD ; Grapheme_Extend # Mn TAMIL SIGN VIRAMA 0BD7 ; Grapheme_Extend # Mc TAMIL AU LENGTH MARK 0C00 ; Grapheme_Extend # Mn TELUGU SIGN COMBINING CANDRABINDU ABOVE 0C04 ; Grapheme_Extend # Mn TELUGU SIGN COMBINING ANUSVARA ABOVE 0C3C ; Grapheme_Extend # Mn TELUGU SIGN NUKTA 0C3E..0C40 ; Grapheme_Extend # Mn [3] TELUGU VOWEL SIGN AA..TELUGU VOWEL SIGN II 0C46..0C48 ; Grapheme_Extend # Mn [3] TELUGU VOWEL SIGN E..TELUGU VOWEL SIGN AI 0C4A..0C4D ; Grapheme_Extend # Mn [4] TELUGU VOWEL SIGN O..TELUGU SIGN VIRAMA 0C55..0C56 ; Grapheme_Extend # Mn [2] TELUGU LENGTH MARK..TELUGU AI LENGTH MARK 0C62..0C63 ; Grapheme_Extend # Mn [2] TELUGU VOWEL SIGN VOCALIC L..TELUGU VOWEL SIGN VOCALIC LL 0C81 ; Grapheme_Extend # Mn KANNADA SIGN CANDRABINDU 0CBC ; Grapheme_Extend # Mn KANNADA SIGN NUKTA 0CBF ; Grapheme_Extend # Mn KANNADA VOWEL SIGN I 0CC0 ; Grapheme_Extend # Mc KANNADA VOWEL SIGN II 0CC2 ; Grapheme_Extend # Mc KANNADA VOWEL SIGN UU 0CC6 ; Grapheme_Extend # Mn KANNADA VOWEL SIGN E 0CC7..0CC8 ; Grapheme_Extend # Mc [2] KANNADA VOWEL SIGN EE..KANNADA VOWEL SIGN AI 0CCA..0CCB ; Grapheme_Extend # Mc [2] KANNADA VOWEL SIGN O..KANNADA VOWEL SIGN OO 0CCC..0CCD ; Grapheme_Extend # Mn [2] KANNADA VOWEL SIGN AU..KANNADA SIGN VIRAMA 0CD5..0CD6 ; Grapheme_Extend # Mc [2] KANNADA LENGTH MARK..KANNADA AI LENGTH MARK 0CE2..0CE3 ; Grapheme_Extend # Mn [2] KANNADA VOWEL SIGN VOCALIC L..KANNADA VOWEL SIGN VOCALIC LL 0D00..0D01 ; Grapheme_Extend # Mn [2] MALAYALAM SIGN COMBINING ANUSVARA ABOVE..MALAYALAM SIGN CANDRABINDU 0D3B..0D3C ; Grapheme_Extend # Mn [2] MALAYALAM SIGN VERTICAL BAR VIRAMA..MALAYALAM SIGN CIRCULAR VIRAMA 0D3E ; Grapheme_Extend # Mc MALAYALAM VOWEL SIGN AA 0D41..0D44 ; Grapheme_Extend # Mn [4] MALAYALAM VOWEL SIGN U..MALAYALAM VOWEL SIGN VOCALIC RR 0D4D ; Grapheme_Extend # Mn MALAYALAM SIGN VIRAMA 0D57 ; Grapheme_Extend # Mc MALAYALAM AU LENGTH MARK 0D62..0D63 ; Grapheme_Extend # Mn [2] MALAYALAM VOWEL SIGN VOCALIC L..MALAYALAM VOWEL 
SIGN VOCALIC LL 0D81 ; Grapheme_Extend # Mn SINHALA SIGN CANDRABINDU 0DCA ; Grapheme_Extend # Mn SINHALA SIGN AL-LAKUNA 0DCF ; Grapheme_Extend # Mc SINHALA VOWEL SIGN AELA-PILLA 0DD2..0DD4 ; Grapheme_Extend # Mn [3] SINHALA VOWEL SIGN KETTI IS-PILLA..SINHALA VOWEL SIGN KETTI PAA-PILLA 0DD6 ; Grapheme_Extend # Mn SINHALA VOWEL SIGN DIGA PAA-PILLA 0DDF ; Grapheme_Extend # Mc SINHALA VOWEL SIGN GAYANUKITTA 0E31 ; Grapheme_Extend # Mn THAI CHARACTER MAI HAN-AKAT 0E34..0E3A ; Grapheme_Extend # Mn [7] THAI CHARACTER SARA I..THAI CHARACTER PHINTHU 0E47..0E4E ; Grapheme_Extend # Mn [8] THAI CHARACTER MAITAIKHU..THAI CHARACTER YAMAKKAN 0EB1 ; Grapheme_Extend # Mn LAO VOWEL SIGN MAI KAN 0EB4..0EBC ; Grapheme_Extend # Mn [9] LAO VOWEL SIGN I..LAO SEMIVOWEL SIGN LO 0EC8..0ECE ; Grapheme_Extend # Mn [7] LAO TONE MAI EK..LAO YAMAKKAN 0F18..0F19 ; Grapheme_Extend # Mn [2] TIBETAN ASTROLOGICAL SIGN -KHYUD PA..TIBETAN ASTROLOGICAL SIGN SDONG TSHUGS 0F35 ; Grapheme_Extend # Mn TIBETAN MARK NGAS BZUNG NYI ZLA 0F37 ; Grapheme_Extend # Mn TIBETAN MARK NGAS BZUNG SGOR RTAGS 0F39 ; Grapheme_Extend # Mn TIBETAN MARK TSA -PHRU 0F71..0F7E ; Grapheme_Extend # Mn [14] TIBETAN VOWEL SIGN AA..TIBETAN SIGN RJES SU NGA RO 0F80..0F84 ; Grapheme_Extend # Mn [5] TIBETAN VOWEL SIGN REVERSED I..TIBETAN MARK HALANTA 0F86..0F87 ; Grapheme_Extend # Mn [2] TIBETAN SIGN LCI RTAGS..TIBETAN SIGN YANG RTAGS 0F8D..0F97 ; Grapheme_Extend # Mn [11] TIBETAN SUBJOINED SIGN LCE TSA CAN..TIBETAN SUBJOINED LETTER JA 0F99..0FBC ; Grapheme_Extend # Mn [36] TIBETAN SUBJOINED LETTER NYA..TIBETAN SUBJOINED LETTER FIXED-FORM RA 0FC6 ; Grapheme_Extend # Mn TIBETAN SYMBOL PADMA GDAN 102D..1030 ; Grapheme_Extend # Mn [4] MYANMAR VOWEL SIGN I..MYANMAR VOWEL SIGN UU 1032..1037 ; Grapheme_Extend # Mn [6] MYANMAR VOWEL SIGN AI..MYANMAR SIGN DOT BELOW 1039..103A ; Grapheme_Extend # Mn [2] MYANMAR SIGN VIRAMA..MYANMAR SIGN ASAT 103D..103E ; Grapheme_Extend # Mn [2] MYANMAR CONSONANT SIGN MEDIAL WA..MYANMAR CONSONANT SIGN MEDIAL HA 
1058..1059 ; Grapheme_Extend # Mn [2] MYANMAR VOWEL SIGN VOCALIC L..MYANMAR VOWEL SIGN VOCALIC LL 105E..1060 ; Grapheme_Extend # Mn [3] MYANMAR CONSONANT SIGN MON MEDIAL NA..MYANMAR CONSONANT SIGN MON MEDIAL LA 1071..1074 ; Grapheme_Extend # Mn [4] MYANMAR VOWEL SIGN GEBA KAREN I..MYANMAR VOWEL SIGN KAYAH EE 1082 ; Grapheme_Extend # Mn MYANMAR CONSONANT SIGN SHAN MEDIAL WA 1085..1086 ; Grapheme_Extend # Mn [2] MYANMAR VOWEL SIGN SHAN E ABOVE..MYANMAR VOWEL SIGN SHAN FINAL Y 108D ; Grapheme_Extend # Mn MYANMAR SIGN SHAN COUNCIL EMPHATIC TONE 109D ; Grapheme_Extend # Mn MYANMAR VOWEL SIGN AITON AI 135D..135F ; Grapheme_Extend # Mn [3] ETHIOPIC COMBINING GEMINATION AND VOWEL LENGTH MARK..ETHIOPIC COMBINING GEMINATION MARK 1712..1714 ; Grapheme_Extend # Mn [3] TAGALOG VOWEL SIGN I..TAGALOG SIGN VIRAMA 1715 ; Grapheme_Extend # Mc TAGALOG SIGN PAMUDPOD 1732..1733 ; Grapheme_Extend # Mn [2] HANUNOO VOWEL SIGN I..HANUNOO VOWEL SIGN U 1734 ; Grapheme_Extend # Mc HANUNOO SIGN PAMUDPOD 1752..1753 ; Grapheme_Extend # Mn [2] BUHID VOWEL SIGN I..BUHID VOWEL SIGN U 1772..1773 ; Grapheme_Extend # Mn [2] TAGBANWA VOWEL SIGN I..TAGBANWA VOWEL SIGN U 17B4..17B5 ; Grapheme_Extend # Mn [2] KHMER VOWEL INHERENT AQ..KHMER VOWEL INHERENT AA 17B7..17BD ; Grapheme_Extend # Mn [7] KHMER VOWEL SIGN I..KHMER VOWEL SIGN UA 17C6 ; Grapheme_Extend # Mn KHMER SIGN NIKAHIT 17C9..17D3 ; Grapheme_Extend # Mn [11] KHMER SIGN MUUSIKATOAN..KHMER SIGN BATHAMASAT 17DD ; Grapheme_Extend # Mn KHMER SIGN ATTHACAN 180B..180D ; Grapheme_Extend # Mn [3] MONGOLIAN FREE VARIATION SELECTOR ONE..MONGOLIAN FREE VARIATION SELECTOR THREE 180F ; Grapheme_Extend # Mn MONGOLIAN FREE VARIATION SELECTOR FOUR 1885..1886 ; Grapheme_Extend # Mn [2] MONGOLIAN LETTER ALI GALI BALUDA..MONGOLIAN LETTER ALI GALI THREE BALUDA 18A9 ; Grapheme_Extend # Mn MONGOLIAN LETTER ALI GALI DAGALGA 1920..1922 ; Grapheme_Extend # Mn [3] LIMBU VOWEL SIGN A..LIMBU VOWEL SIGN U 1927..1928 ; Grapheme_Extend # Mn [2] LIMBU VOWEL SIGN E..LIMBU VOWEL 
SIGN O 1932 ; Grapheme_Extend # Mn LIMBU SMALL LETTER ANUSVARA 1939..193B ; Grapheme_Extend # Mn [3] LIMBU SIGN MUKPHRENG..LIMBU SIGN SA-I 1A17..1A18 ; Grapheme_Extend # Mn [2] BUGINESE VOWEL SIGN I..BUGINESE VOWEL SIGN U 1A1B ; Grapheme_Extend # Mn BUGINESE VOWEL SIGN AE 1A56 ; Grapheme_Extend # Mn TAI THAM CONSONANT SIGN MEDIAL LA 1A58..1A5E ; Grapheme_Extend # Mn [7] TAI THAM SIGN MAI KANG LAI..TAI THAM CONSONANT SIGN SA 1A60 ; Grapheme_Extend # Mn TAI THAM SIGN SAKOT 1A62 ; Grapheme_Extend # Mn TAI THAM VOWEL SIGN MAI SAT 1A65..1A6C ; Grapheme_Extend # Mn [8] TAI THAM VOWEL SIGN I..TAI THAM VOWEL SIGN OA BELOW 1A73..1A7C ; Grapheme_Extend # Mn [10] TAI THAM VOWEL SIGN OA ABOVE..TAI THAM SIGN KHUEN-LUE KARAN 1A7F ; Grapheme_Extend # Mn TAI THAM COMBINING CRYPTOGRAMMIC DOT 1AB0..1ABD ; Grapheme_Extend # Mn [14] COMBINING DOUBLED CIRCUMFLEX ACCENT..COMBINING PARENTHESES BELOW 1ABE ; Grapheme_Extend # Me COMBINING PARENTHESES OVERLAY 1ABF..1ADD ; Grapheme_Extend # Mn [31] COMBINING LATIN SMALL LETTER W BELOW..COMBINING DOT-AND-RING BELOW 1AE0..1AEB ; Grapheme_Extend # Mn [12] COMBINING LEFT TACK ABOVE..COMBINING DOUBLE RIGHTWARDS ARROW ABOVE 1B00..1B03 ; Grapheme_Extend # Mn [4] BALINESE SIGN ULU RICEM..BALINESE SIGN SURANG 1B34 ; Grapheme_Extend # Mn BALINESE SIGN REREKAN 1B35 ; Grapheme_Extend # Mc BALINESE VOWEL SIGN TEDUNG 1B36..1B3A ; Grapheme_Extend # Mn [5] BALINESE VOWEL SIGN ULU..BALINESE VOWEL SIGN RA REPA 1B3B ; Grapheme_Extend # Mc BALINESE VOWEL SIGN RA REPA TEDUNG 1B3C ; Grapheme_Extend # Mn BALINESE VOWEL SIGN LA LENGA 1B3D ; Grapheme_Extend # Mc BALINESE VOWEL SIGN LA LENGA TEDUNG 1B42 ; Grapheme_Extend # Mn BALINESE VOWEL SIGN PEPET 1B43..1B44 ; Grapheme_Extend # Mc [2] BALINESE VOWEL SIGN PEPET TEDUNG..BALINESE ADEG ADEG 1B6B..1B73 ; Grapheme_Extend # Mn [9] BALINESE MUSICAL SYMBOL COMBINING TEGEH..BALINESE MUSICAL SYMBOL COMBINING GONG 1B80..1B81 ; Grapheme_Extend # Mn [2] SUNDANESE SIGN PANYECEK..SUNDANESE SIGN PANGLAYAR 1BA2..1BA5 ; 
Grapheme_Extend # Mn [4] SUNDANESE CONSONANT SIGN PANYAKRA..SUNDANESE VOWEL SIGN PANYUKU 1BA8..1BA9 ; Grapheme_Extend # Mn [2] SUNDANESE VOWEL SIGN PAMEPET..SUNDANESE VOWEL SIGN PANEULEUNG 1BAA ; Grapheme_Extend # Mc SUNDANESE SIGN PAMAAEH 1BAB..1BAD ; Grapheme_Extend # Mn [3] SUNDANESE SIGN VIRAMA..SUNDANESE CONSONANT SIGN PASANGAN WA 1BE6 ; Grapheme_Extend # Mn BATAK SIGN TOMPI 1BE8..1BE9 ; Grapheme_Extend # Mn [2] BATAK VOWEL SIGN PAKPAK E..BATAK VOWEL SIGN EE 1BED ; Grapheme_Extend # Mn BATAK VOWEL SIGN KARO O 1BEF..1BF1 ; Grapheme_Extend # Mn [3] BATAK VOWEL SIGN U FOR SIMALUNGUN SA..BATAK CONSONANT SIGN H 1BF2..1BF3 ; Grapheme_Extend # Mc [2] BATAK PANGOLAT..BATAK PANONGONAN 1C2C..1C33 ; Grapheme_Extend # Mn [8] LEPCHA VOWEL SIGN E..LEPCHA CONSONANT SIGN T 1C36..1C37 ; Grapheme_Extend # Mn [2] LEPCHA SIGN RAN..LEPCHA SIGN NUKTA 1CD0..1CD2 ; Grapheme_Extend # Mn [3] VEDIC TONE KARSHANA..VEDIC TONE PRENKHA 1CD4..1CE0 ; Grapheme_Extend # Mn [13] VEDIC SIGN YAJURVEDIC MIDLINE SVARITA..VEDIC TONE RIGVEDIC KASHMIRI INDEPENDENT SVARITA 1CE2..1CE8 ; Grapheme_Extend # Mn [7] VEDIC SIGN VISARGA SVARITA..VEDIC SIGN VISARGA ANUDATTA WITH TAIL 1CED ; Grapheme_Extend # Mn VEDIC SIGN TIRYAK 1CF4 ; Grapheme_Extend # Mn VEDIC TONE CANDRA ABOVE 1CF8..1CF9 ; Grapheme_Extend # Mn [2] VEDIC TONE RING ABOVE..VEDIC TONE DOUBLE RING ABOVE 1DC0..1DFF ; Grapheme_Extend # Mn [64] COMBINING DOTTED GRAVE ACCENT..COMBINING RIGHT ARROWHEAD AND DOWN ARROWHEAD BELOW 200C ; Grapheme_Extend # Cf ZERO WIDTH NON-JOINER 20D0..20DC ; Grapheme_Extend # Mn [13] COMBINING LEFT HARPOON ABOVE..COMBINING FOUR DOTS ABOVE 20DD..20E0 ; Grapheme_Extend # Me [4] COMBINING ENCLOSING CIRCLE..COMBINING ENCLOSING CIRCLE BACKSLASH 20E1 ; Grapheme_Extend # Mn COMBINING LEFT RIGHT ARROW ABOVE 20E2..20E4 ; Grapheme_Extend # Me [3] COMBINING ENCLOSING SCREEN..COMBINING ENCLOSING UPWARD POINTING TRIANGLE 20E5..20F0 ; Grapheme_Extend # Mn [12] COMBINING REVERSE SOLIDUS OVERLAY..COMBINING ASTERISK ABOVE 2CEF..2CF1 ; 
Grapheme_Extend # Mn [3] COPTIC COMBINING NI ABOVE..COPTIC COMBINING SPIRITUS LENIS 2D7F ; Grapheme_Extend # Mn TIFINAGH CONSONANT JOINER 2DE0..2DFF ; Grapheme_Extend # Mn [32] COMBINING CYRILLIC LETTER BE..COMBINING CYRILLIC LETTER IOTIFIED BIG YUS 302A..302D ; Grapheme_Extend # Mn [4] IDEOGRAPHIC LEVEL TONE MARK..IDEOGRAPHIC ENTERING TONE MARK 302E..302F ; Grapheme_Extend # Mc [2] HANGUL SINGLE DOT TONE MARK..HANGUL DOUBLE DOT TONE MARK 3099..309A ; Grapheme_Extend # Mn [2] COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK..COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK A66F ; Grapheme_Extend # Mn COMBINING CYRILLIC VZMET A670..A672 ; Grapheme_Extend # Me [3] COMBINING CYRILLIC TEN MILLIONS SIGN..COMBINING CYRILLIC THOUSAND MILLIONS SIGN A674..A67D ; Grapheme_Extend # Mn [10] COMBINING CYRILLIC LETTER UKRAINIAN IE..COMBINING CYRILLIC PAYEROK A69E..A69F ; Grapheme_Extend # Mn [2] COMBINING CYRILLIC LETTER EF..COMBINING CYRILLIC LETTER IOTIFIED E A6F0..A6F1 ; Grapheme_Extend # Mn [2] BAMUM COMBINING MARK KOQNDON..BAMUM COMBINING MARK TUKWENTIS A802 ; Grapheme_Extend # Mn SYLOTI NAGRI SIGN DVISVARA A806 ; Grapheme_Extend # Mn SYLOTI NAGRI SIGN HASANTA A80B ; Grapheme_Extend # Mn SYLOTI NAGRI SIGN ANUSVARA A825..A826 ; Grapheme_Extend # Mn [2] SYLOTI NAGRI VOWEL SIGN U..SYLOTI NAGRI VOWEL SIGN E A82C ; Grapheme_Extend # Mn SYLOTI NAGRI SIGN ALTERNATE HASANTA A8C4..A8C5 ; Grapheme_Extend # Mn [2] SAURASHTRA SIGN VIRAMA..SAURASHTRA SIGN CANDRABINDU A8E0..A8F1 ; Grapheme_Extend # Mn [18] COMBINING DEVANAGARI DIGIT ZERO..COMBINING DEVANAGARI SIGN AVAGRAHA A8FF ; Grapheme_Extend # Mn DEVANAGARI VOWEL SIGN AY A926..A92D ; Grapheme_Extend # Mn [8] KAYAH LI VOWEL UE..KAYAH LI TONE CALYA PLOPHU A947..A951 ; Grapheme_Extend # Mn [11] REJANG VOWEL SIGN I..REJANG CONSONANT SIGN R A953 ; Grapheme_Extend # Mc REJANG VIRAMA A980..A982 ; Grapheme_Extend # Mn [3] JAVANESE SIGN PANYANGGA..JAVANESE SIGN LAYAR A9B3 ; Grapheme_Extend # Mn JAVANESE SIGN CECAK TELU A9B6..A9B9 ; 
Grapheme_Extend # Mn [4] JAVANESE VOWEL SIGN WULU..JAVANESE VOWEL SIGN SUKU MENDUT A9BC..A9BD ; Grapheme_Extend # Mn [2] JAVANESE VOWEL SIGN PEPET..JAVANESE CONSONANT SIGN KERET A9C0 ; Grapheme_Extend # Mc JAVANESE PANGKON A9E5 ; Grapheme_Extend # Mn MYANMAR SIGN SHAN SAW AA29..AA2E ; Grapheme_Extend # Mn [6] CHAM VOWEL SIGN AA..CHAM VOWEL SIGN OE AA31..AA32 ; Grapheme_Extend # Mn [2] CHAM VOWEL SIGN AU..CHAM VOWEL SIGN UE AA35..AA36 ; Grapheme_Extend # Mn [2] CHAM CONSONANT SIGN LA..CHAM CONSONANT SIGN WA AA43 ; Grapheme_Extend # Mn CHAM CONSONANT SIGN FINAL NG AA4C ; Grapheme_Extend # Mn CHAM CONSONANT SIGN FINAL M AA7C ; Grapheme_Extend # Mn MYANMAR SIGN TAI LAING TONE-2 AAB0 ; Grapheme_Extend # Mn TAI VIET MAI KANG AAB2..AAB4 ; Grapheme_Extend # Mn [3] TAI VIET VOWEL I..TAI VIET VOWEL U AAB7..AAB8 ; Grapheme_Extend # Mn [2] TAI VIET MAI KHIT..TAI VIET VOWEL IA AABE..AABF ; Grapheme_Extend # Mn [2] TAI VIET VOWEL AM..TAI VIET TONE MAI EK AAC1 ; Grapheme_Extend # Mn TAI VIET TONE MAI THO AAEC..AAED ; Grapheme_Extend # Mn [2] MEETEI MAYEK VOWEL SIGN UU..MEETEI MAYEK VOWEL SIGN AAI AAF6 ; Grapheme_Extend # Mn MEETEI MAYEK VIRAMA ABE5 ; Grapheme_Extend # Mn MEETEI MAYEK VOWEL SIGN ANAP ABE8 ; Grapheme_Extend # Mn MEETEI MAYEK VOWEL SIGN UNAP ABED ; Grapheme_Extend # Mn MEETEI MAYEK APUN IYEK FB1E ; Grapheme_Extend # Mn HEBREW POINT JUDEO-SPANISH VARIKA FE00..FE0F ; Grapheme_Extend # Mn [16] VARIATION SELECTOR-1..VARIATION SELECTOR-16 FE20..FE2F ; Grapheme_Extend # Mn [16] COMBINING LIGATURE LEFT HALF..COMBINING CYRILLIC TITLO RIGHT HALF FF9E..FF9F ; Grapheme_Extend # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK 101FD ; Grapheme_Extend # Mn PHAISTOS DISC SIGN COMBINING OBLIQUE STROKE 102E0 ; Grapheme_Extend # Mn COPTIC EPACT THOUSANDS MARK 10376..1037A ; Grapheme_Extend # Mn [5] COMBINING OLD PERMIC LETTER AN..COMBINING OLD PERMIC LETTER SII 10A01..10A03 ; Grapheme_Extend # Mn [3] KHAROSHTHI VOWEL SIGN I..KHAROSHTHI VOWEL 
SIGN VOCALIC R 10A05..10A06 ; Grapheme_Extend # Mn [2] KHAROSHTHI VOWEL SIGN E..KHAROSHTHI VOWEL SIGN O 10A0C..10A0F ; Grapheme_Extend # Mn [4] KHAROSHTHI VOWEL LENGTH MARK..KHAROSHTHI SIGN VISARGA 10A38..10A3A ; Grapheme_Extend # Mn [3] KHAROSHTHI SIGN BAR ABOVE..KHAROSHTHI SIGN DOT BELOW 10A3F ; Grapheme_Extend # Mn KHAROSHTHI VIRAMA 10AE5..10AE6 ; Grapheme_Extend # Mn [2] MANICHAEAN ABBREVIATION MARK ABOVE..MANICHAEAN ABBREVIATION MARK BELOW 10D24..10D27 ; Grapheme_Extend # Mn [4] HANIFI ROHINGYA SIGN HARBAHAY..HANIFI ROHINGYA SIGN TASSI 10D69..10D6D ; Grapheme_Extend # Mn [5] GARAY VOWEL SIGN E..GARAY CONSONANT NASALIZATION MARK 10EAB..10EAC ; Grapheme_Extend # Mn [2] YEZIDI COMBINING HAMZA MARK..YEZIDI COMBINING MADDA MARK 10EFA..10EFF ; Grapheme_Extend # Mn [6] ARABIC DOUBLE VERTICAL BAR BELOW..ARABIC SMALL LOW WORD MADDA 10F46..10F50 ; Grapheme_Extend # Mn [11] SOGDIAN COMBINING DOT BELOW..SOGDIAN COMBINING STROKE BELOW 10F82..10F85 ; Grapheme_Extend # Mn [4] OLD UYGHUR COMBINING DOT ABOVE..OLD UYGHUR COMBINING TWO DOTS BELOW 11001 ; Grapheme_Extend # Mn BRAHMI SIGN ANUSVARA 11038..11046 ; Grapheme_Extend # Mn [15] BRAHMI VOWEL SIGN AA..BRAHMI VIRAMA 11070 ; Grapheme_Extend # Mn BRAHMI SIGN OLD TAMIL VIRAMA 11073..11074 ; Grapheme_Extend # Mn [2] BRAHMI VOWEL SIGN OLD TAMIL SHORT E..BRAHMI VOWEL SIGN OLD TAMIL SHORT O 1107F..11081 ; Grapheme_Extend # Mn [3] BRAHMI NUMBER JOINER..KAITHI SIGN ANUSVARA 110B3..110B6 ; Grapheme_Extend # Mn [4] KAITHI VOWEL SIGN U..KAITHI VOWEL SIGN AI 110B9..110BA ; Grapheme_Extend # Mn [2] KAITHI SIGN VIRAMA..KAITHI SIGN NUKTA 110C2 ; Grapheme_Extend # Mn KAITHI VOWEL SIGN VOCALIC R 11100..11102 ; Grapheme_Extend # Mn [3] CHAKMA SIGN CANDRABINDU..CHAKMA SIGN VISARGA 11127..1112B ; Grapheme_Extend # Mn [5] CHAKMA VOWEL SIGN A..CHAKMA VOWEL SIGN UU 1112D..11134 ; Grapheme_Extend # Mn [8] CHAKMA VOWEL SIGN AI..CHAKMA MAAYYAA 11173 ; Grapheme_Extend # Mn MAHAJANI SIGN NUKTA 11180..11181 ; Grapheme_Extend # Mn [2] SHARADA SIGN 
CANDRABINDU..SHARADA SIGN ANUSVARA 111B6..111BE ; Grapheme_Extend # Mn [9] SHARADA VOWEL SIGN U..SHARADA VOWEL SIGN O 111C0 ; Grapheme_Extend # Mc SHARADA SIGN VIRAMA 111C9..111CC ; Grapheme_Extend # Mn [4] SHARADA SANDHI MARK..SHARADA EXTRA SHORT VOWEL MARK 111CF ; Grapheme_Extend # Mn SHARADA SIGN INVERTED CANDRABINDU 1122F..11231 ; Grapheme_Extend # Mn [3] KHOJKI VOWEL SIGN U..KHOJKI VOWEL SIGN AI 11234 ; Grapheme_Extend # Mn KHOJKI SIGN ANUSVARA 11235 ; Grapheme_Extend # Mc KHOJKI SIGN VIRAMA 11236..11237 ; Grapheme_Extend # Mn [2] KHOJKI SIGN NUKTA..KHOJKI SIGN SHADDA 1123E ; Grapheme_Extend # Mn KHOJKI SIGN SUKUN 11241 ; Grapheme_Extend # Mn KHOJKI VOWEL SIGN VOCALIC R 112DF ; Grapheme_Extend # Mn KHUDAWADI SIGN ANUSVARA 112E3..112EA ; Grapheme_Extend # Mn [8] KHUDAWADI VOWEL SIGN U..KHUDAWADI SIGN VIRAMA 11300..11301 ; Grapheme_Extend # Mn [2] GRANTHA SIGN COMBINING ANUSVARA ABOVE..GRANTHA SIGN CANDRABINDU 1133B..1133C ; Grapheme_Extend # Mn [2] COMBINING BINDU BELOW..GRANTHA SIGN NUKTA 1133E ; Grapheme_Extend # Mc GRANTHA VOWEL SIGN AA 11340 ; Grapheme_Extend # Mn GRANTHA VOWEL SIGN II 1134D ; Grapheme_Extend # Mc GRANTHA SIGN VIRAMA 11357 ; Grapheme_Extend # Mc GRANTHA AU LENGTH MARK 11366..1136C ; Grapheme_Extend # Mn [7] COMBINING GRANTHA DIGIT ZERO..COMBINING GRANTHA DIGIT SIX 11370..11374 ; Grapheme_Extend # Mn [5] COMBINING GRANTHA LETTER A..COMBINING GRANTHA LETTER PA 113B8 ; Grapheme_Extend # Mc TULU-TIGALARI VOWEL SIGN AA 113BB..113C0 ; Grapheme_Extend # Mn [6] TULU-TIGALARI VOWEL SIGN U..TULU-TIGALARI VOWEL SIGN VOCALIC LL 113C2 ; Grapheme_Extend # Mc TULU-TIGALARI VOWEL SIGN EE 113C5 ; Grapheme_Extend # Mc TULU-TIGALARI VOWEL SIGN AI 113C7..113C9 ; Grapheme_Extend # Mc [3] TULU-TIGALARI VOWEL SIGN OO..TULU-TIGALARI AU LENGTH MARK 113CE ; Grapheme_Extend # Mn TULU-TIGALARI SIGN VIRAMA 113CF ; Grapheme_Extend # Mc TULU-TIGALARI SIGN LOOPED VIRAMA 113D0 ; Grapheme_Extend # Mn TULU-TIGALARI CONJOINER 113D2 ; Grapheme_Extend # Mn TULU-TIGALARI 
GEMINATION MARK 113E1..113E2 ; Grapheme_Extend # Mn [2] TULU-TIGALARI VEDIC TONE SVARITA..TULU-TIGALARI VEDIC TONE ANUDATTA 11438..1143F ; Grapheme_Extend # Mn [8] NEWA VOWEL SIGN U..NEWA VOWEL SIGN AI 11442..11444 ; Grapheme_Extend # Mn [3] NEWA SIGN VIRAMA..NEWA SIGN ANUSVARA 11446 ; Grapheme_Extend # Mn NEWA SIGN NUKTA 1145E ; Grapheme_Extend # Mn NEWA SANDHI MARK 114B0 ; Grapheme_Extend # Mc TIRHUTA VOWEL SIGN AA 114B3..114B8 ; Grapheme_Extend # Mn [6] TIRHUTA VOWEL SIGN U..TIRHUTA VOWEL SIGN VOCALIC LL 114BA ; Grapheme_Extend # Mn TIRHUTA VOWEL SIGN SHORT E 114BD ; Grapheme_Extend # Mc TIRHUTA VOWEL SIGN SHORT O 114BF..114C0 ; Grapheme_Extend # Mn [2] TIRHUTA SIGN CANDRABINDU..TIRHUTA SIGN ANUSVARA 114C2..114C3 ; Grapheme_Extend # Mn [2] TIRHUTA SIGN VIRAMA..TIRHUTA SIGN NUKTA 115AF ; Grapheme_Extend # Mc SIDDHAM VOWEL SIGN AA 115B2..115B5 ; Grapheme_Extend # Mn [4] SIDDHAM VOWEL SIGN U..SIDDHAM VOWEL SIGN VOCALIC RR 115BC..115BD ; Grapheme_Extend # Mn [2] SIDDHAM SIGN CANDRABINDU..SIDDHAM SIGN ANUSVARA 115BF..115C0 ; Grapheme_Extend # Mn [2] SIDDHAM SIGN VIRAMA..SIDDHAM SIGN NUKTA 115DC..115DD ; Grapheme_Extend # Mn [2] SIDDHAM VOWEL SIGN ALTERNATE U..SIDDHAM VOWEL SIGN ALTERNATE UU 11633..1163A ; Grapheme_Extend # Mn [8] MODI VOWEL SIGN U..MODI VOWEL SIGN AI 1163D ; Grapheme_Extend # Mn MODI SIGN ANUSVARA 1163F..11640 ; Grapheme_Extend # Mn [2] MODI SIGN VIRAMA..MODI SIGN ARDHACANDRA 116AB ; Grapheme_Extend # Mn TAKRI SIGN ANUSVARA 116AD ; Grapheme_Extend # Mn TAKRI VOWEL SIGN AA 116B0..116B5 ; Grapheme_Extend # Mn [6] TAKRI VOWEL SIGN U..TAKRI VOWEL SIGN AU 116B6 ; Grapheme_Extend # Mc TAKRI SIGN VIRAMA 116B7 ; Grapheme_Extend # Mn TAKRI SIGN NUKTA 1171D ; Grapheme_Extend # Mn AHOM CONSONANT SIGN MEDIAL LA 1171F ; Grapheme_Extend # Mn AHOM CONSONANT SIGN MEDIAL LIGATING RA 11722..11725 ; Grapheme_Extend # Mn [4] AHOM VOWEL SIGN I..AHOM VOWEL SIGN UU 11727..1172B ; Grapheme_Extend # Mn [5] AHOM VOWEL SIGN AW..AHOM SIGN KILLER 1182F..11837 ; Grapheme_Extend # 
Mn [9] DOGRA VOWEL SIGN U..DOGRA SIGN ANUSVARA 11839..1183A ; Grapheme_Extend # Mn [2] DOGRA SIGN VIRAMA..DOGRA SIGN NUKTA 11930 ; Grapheme_Extend # Mc DIVES AKURU VOWEL SIGN AA 1193B..1193C ; Grapheme_Extend # Mn [2] DIVES AKURU SIGN ANUSVARA..DIVES AKURU SIGN CANDRABINDU 1193D ; Grapheme_Extend # Mc DIVES AKURU SIGN HALANTA 1193E ; Grapheme_Extend # Mn DIVES AKURU VIRAMA 11943 ; Grapheme_Extend # Mn DIVES AKURU SIGN NUKTA 119D4..119D7 ; Grapheme_Extend # Mn [4] NANDINAGARI VOWEL SIGN U..NANDINAGARI VOWEL SIGN VOCALIC RR 119DA..119DB ; Grapheme_Extend # Mn [2] NANDINAGARI VOWEL SIGN E..NANDINAGARI VOWEL SIGN AI 119E0 ; Grapheme_Extend # Mn NANDINAGARI SIGN VIRAMA 11A01..11A0A ; Grapheme_Extend # Mn [10] ZANABAZAR SQUARE VOWEL SIGN I..ZANABAZAR SQUARE VOWEL LENGTH MARK 11A33..11A38 ; Grapheme_Extend # Mn [6] ZANABAZAR SQUARE FINAL CONSONANT MARK..ZANABAZAR SQUARE SIGN ANUSVARA 11A3B..11A3E ; Grapheme_Extend # Mn [4] ZANABAZAR SQUARE CLUSTER-FINAL LETTER YA..ZANABAZAR SQUARE CLUSTER-FINAL LETTER VA 11A47 ; Grapheme_Extend # Mn ZANABAZAR SQUARE SUBJOINER 11A51..11A56 ; Grapheme_Extend # Mn [6] SOYOMBO VOWEL SIGN I..SOYOMBO VOWEL SIGN OE 11A59..11A5B ; Grapheme_Extend # Mn [3] SOYOMBO VOWEL SIGN VOCALIC R..SOYOMBO VOWEL LENGTH MARK 11A8A..11A96 ; Grapheme_Extend # Mn [13] SOYOMBO FINAL CONSONANT SIGN G..SOYOMBO SIGN ANUSVARA 11A98..11A99 ; Grapheme_Extend # Mn [2] SOYOMBO GEMINATION MARK..SOYOMBO SUBJOINER 11B60 ; Grapheme_Extend # Mn SHARADA VOWEL SIGN OE 11B62..11B64 ; Grapheme_Extend # Mn [3] SHARADA VOWEL SIGN UE..SHARADA VOWEL SIGN SHORT E 11B66 ; Grapheme_Extend # Mn SHARADA VOWEL SIGN CANDRA E 11C30..11C36 ; Grapheme_Extend # Mn [7] BHAIKSUKI VOWEL SIGN I..BHAIKSUKI VOWEL SIGN VOCALIC L 11C38..11C3D ; Grapheme_Extend # Mn [6] BHAIKSUKI VOWEL SIGN E..BHAIKSUKI SIGN ANUSVARA 11C3F ; Grapheme_Extend # Mn BHAIKSUKI SIGN VIRAMA 11C92..11CA7 ; Grapheme_Extend # Mn [22] MARCHEN SUBJOINED LETTER KA..MARCHEN SUBJOINED LETTER ZA 11CAA..11CB0 ; Grapheme_Extend # Mn [7] 
MARCHEN SUBJOINED LETTER RA..MARCHEN VOWEL SIGN AA 11CB2..11CB3 ; Grapheme_Extend # Mn [2] MARCHEN VOWEL SIGN U..MARCHEN VOWEL SIGN E 11CB5..11CB6 ; Grapheme_Extend # Mn [2] MARCHEN SIGN ANUSVARA..MARCHEN SIGN CANDRABINDU 11D31..11D36 ; Grapheme_Extend # Mn [6] MASARAM GONDI VOWEL SIGN AA..MASARAM GONDI VOWEL SIGN VOCALIC R 11D3A ; Grapheme_Extend # Mn MASARAM GONDI VOWEL SIGN E 11D3C..11D3D ; Grapheme_Extend # Mn [2] MASARAM GONDI VOWEL SIGN AI..MASARAM GONDI VOWEL SIGN O 11D3F..11D45 ; Grapheme_Extend # Mn [7] MASARAM GONDI VOWEL SIGN AU..MASARAM GONDI VIRAMA 11D47 ; Grapheme_Extend # Mn MASARAM GONDI RA-KARA 11D90..11D91 ; Grapheme_Extend # Mn [2] GUNJALA GONDI VOWEL SIGN EE..GUNJALA GONDI VOWEL SIGN AI 11D95 ; Grapheme_Extend # Mn GUNJALA GONDI SIGN ANUSVARA 11D97 ; Grapheme_Extend # Mn GUNJALA GONDI VIRAMA 11EF3..11EF4 ; Grapheme_Extend # Mn [2] MAKASAR VOWEL SIGN I..MAKASAR VOWEL SIGN U 11F00..11F01 ; Grapheme_Extend # Mn [2] KAWI SIGN CANDRABINDU..KAWI SIGN ANUSVARA 11F36..11F3A ; Grapheme_Extend # Mn [5] KAWI VOWEL SIGN I..KAWI VOWEL SIGN VOCALIC R 11F40 ; Grapheme_Extend # Mn KAWI VOWEL SIGN EU 11F41 ; Grapheme_Extend # Mc KAWI SIGN KILLER 11F42 ; Grapheme_Extend # Mn KAWI CONJOINER 11F5A ; Grapheme_Extend # Mn KAWI SIGN NUKTA 13440 ; Grapheme_Extend # Mn EGYPTIAN HIEROGLYPH MIRROR HORIZONTALLY 13447..13455 ; Grapheme_Extend # Mn [15] EGYPTIAN HIEROGLYPH MODIFIER DAMAGED AT TOP START..EGYPTIAN HIEROGLYPH MODIFIER DAMAGED 1611E..16129 ; Grapheme_Extend # Mn [12] GURUNG KHEMA VOWEL SIGN AA..GURUNG KHEMA VOWEL LENGTH MARK 1612D..1612F ; Grapheme_Extend # Mn [3] GURUNG KHEMA SIGN ANUSVARA..GURUNG KHEMA SIGN THOLHOMA 16AF0..16AF4 ; Grapheme_Extend # Mn [5] BASSA VAH COMBINING HIGH TONE..BASSA VAH COMBINING HIGH-LOW TONE 16B30..16B36 ; Grapheme_Extend # Mn [7] PAHAWH HMONG MARK CIM TUB..PAHAWH HMONG MARK CIM TAUM 16F4F ; Grapheme_Extend # Mn MIAO SIGN CONSONANT MODIFIER BAR 16F8F..16F92 ; Grapheme_Extend # Mn [4] MIAO TONE RIGHT..MIAO TONE BELOW 16FE4 ; 
Grapheme_Extend # Mn KHITAN SMALL SCRIPT FILLER 16FF0..16FF1 ; Grapheme_Extend # Mc [2] VIETNAMESE ALTERNATE READING MARK CA..VIETNAMESE ALTERNATE READING MARK NHAY 1BC9D..1BC9E ; Grapheme_Extend # Mn [2] DUPLOYAN THICK LETTER SELECTOR..DUPLOYAN DOUBLE MARK 1CF00..1CF2D ; Grapheme_Extend # Mn [46] ZNAMENNY COMBINING MARK GORAZDO NIZKO S KRYZHEM ON LEFT..ZNAMENNY COMBINING MARK KRYZH ON LEFT 1CF30..1CF46 ; Grapheme_Extend # Mn [23] ZNAMENNY COMBINING TONAL RANGE MARK MRACHNO..ZNAMENNY PRIZNAK MODIFIER ROG 1D165..1D166 ; Grapheme_Extend # Mc [2] MUSICAL SYMBOL COMBINING STEM..MUSICAL SYMBOL COMBINING SPRECHGESANG STEM 1D167..1D169 ; Grapheme_Extend # Mn [3] MUSICAL SYMBOL COMBINING TREMOLO-1..MUSICAL SYMBOL COMBINING TREMOLO-3 1D16D..1D172 ; Grapheme_Extend # Mc [6] MUSICAL SYMBOL COMBINING AUGMENTATION DOT..MUSICAL SYMBOL COMBINING FLAG-5 1D17B..1D182 ; Grapheme_Extend # Mn [8] MUSICAL SYMBOL COMBINING ACCENT..MUSICAL SYMBOL COMBINING LOURE 1D185..1D18B ; Grapheme_Extend # Mn [7] MUSICAL SYMBOL COMBINING DOIT..MUSICAL SYMBOL COMBINING TRIPLE TONGUE 1D1AA..1D1AD ; Grapheme_Extend # Mn [4] MUSICAL SYMBOL COMBINING DOWN BOW..MUSICAL SYMBOL COMBINING SNAP PIZZICATO 1D242..1D244 ; Grapheme_Extend # Mn [3] COMBINING GREEK MUSICAL TRISEME..COMBINING GREEK MUSICAL PENTASEME 1DA00..1DA36 ; Grapheme_Extend # Mn [55] SIGNWRITING HEAD RIM..SIGNWRITING AIR SUCKING IN 1DA3B..1DA6C ; Grapheme_Extend # Mn [50] SIGNWRITING MOUTH CLOSED NEUTRAL..SIGNWRITING EXCITEMENT 1DA75 ; Grapheme_Extend # Mn SIGNWRITING UPPER BODY TILTING FROM HIP JOINTS 1DA84 ; Grapheme_Extend # Mn SIGNWRITING LOCATION HEAD NECK 1DA9B..1DA9F ; Grapheme_Extend # Mn [5] SIGNWRITING FILL MODIFIER-2..SIGNWRITING FILL MODIFIER-6 1DAA1..1DAAF ; Grapheme_Extend # Mn [15] SIGNWRITING ROTATION MODIFIER-2..SIGNWRITING ROTATION MODIFIER-16 1E000..1E006 ; Grapheme_Extend # Mn [7] COMBINING GLAGOLITIC LETTER AZU..COMBINING GLAGOLITIC LETTER ZHIVETE 1E008..1E018 ; Grapheme_Extend # Mn [17] COMBINING GLAGOLITIC LETTER 
ZEMLJA..COMBINING GLAGOLITIC LETTER HERU 1E01B..1E021 ; Grapheme_Extend # Mn [7] COMBINING GLAGOLITIC LETTER SHTA..COMBINING GLAGOLITIC LETTER YATI 1E023..1E024 ; Grapheme_Extend # Mn [2] COMBINING GLAGOLITIC LETTER YU..COMBINING GLAGOLITIC LETTER SMALL YUS 1E026..1E02A ; Grapheme_Extend # Mn [5] COMBINING GLAGOLITIC LETTER YO..COMBINING GLAGOLITIC LETTER FITA 1E08F ; Grapheme_Extend # Mn COMBINING CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I 1E130..1E136 ; Grapheme_Extend # Mn [7] NYIAKENG PUACHUE HMONG TONE-B..NYIAKENG PUACHUE HMONG TONE-D 1E2AE ; Grapheme_Extend # Mn TOTO SIGN RISING TONE 1E2EC..1E2EF ; Grapheme_Extend # Mn [4] WANCHO TONE TUP..WANCHO TONE KOINI 1E4EC..1E4EF ; Grapheme_Extend # Mn [4] NAG MUNDARI SIGN MUHOR..NAG MUNDARI SIGN SUTUH 1E5EE..1E5EF ; Grapheme_Extend # Mn [2] OL ONAL SIGN MU..OL ONAL SIGN IKIR 1E6E3 ; Grapheme_Extend # Mn TAI YO SIGN UE 1E6E6 ; Grapheme_Extend # Mn TAI YO SIGN AU 1E6EE..1E6EF ; Grapheme_Extend # Mn [2] TAI YO SIGN AY..TAI YO SIGN ANG 1E6F5 ; Grapheme_Extend # Mn TAI YO SIGN OM 1E8D0..1E8D6 ; Grapheme_Extend # Mn [7] MENDE KIKAKUI COMBINING NUMBER TEENS..MENDE KIKAKUI COMBINING NUMBER MILLIONS 1E944..1E94A ; Grapheme_Extend # Mn [7] ADLAM ALIF LENGTHENER..ADLAM NUKTA E0020..E007F ; Grapheme_Extend # Cf [96] TAG SPACE..CANCEL TAG E0100..E01EF ; Grapheme_Extend # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256 # Total code points: 2232 # ================================================ # Derived Property: Grapheme_Base # Generated from: [0..10FFFF] - Cc - Cf - Cs - Co - Cn - Zl - Zp - Grapheme_Extend # Note: depending on an application's interpretation of Co (private use), # they may be either in Grapheme_Base, or in Grapheme_Extend, or in neither. 
0020 ; Grapheme_Base # Zs SPACE 0021..0023 ; Grapheme_Base # Po [3] EXCLAMATION MARK..NUMBER SIGN 0024 ; Grapheme_Base # Sc DOLLAR SIGN 0025..0027 ; Grapheme_Base # Po [3] PERCENT SIGN..APOSTROPHE 0028 ; Grapheme_Base # Ps LEFT PARENTHESIS 0029 ; Grapheme_Base # Pe RIGHT PARENTHESIS 002A ; Grapheme_Base # Po ASTERISK 002B ; Grapheme_Base # Sm PLUS SIGN 002C ; Grapheme_Base # Po COMMA 002D ; Grapheme_Base # Pd HYPHEN-MINUS 002E..002F ; Grapheme_Base # Po [2] FULL STOP..SOLIDUS 0030..0039 ; Grapheme_Base # Nd [10] DIGIT ZERO..DIGIT NINE 003A..003B ; Grapheme_Base # Po [2] COLON..SEMICOLON 003C..003E ; Grapheme_Base # Sm [3] LESS-THAN SIGN..GREATER-THAN SIGN 003F..0040 ; Grapheme_Base # Po [2] QUESTION MARK..COMMERCIAL AT 0041..005A ; Grapheme_Base # L& [26] LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z 005B ; Grapheme_Base # Ps LEFT SQUARE BRACKET 005C ; Grapheme_Base # Po REVERSE SOLIDUS 005D ; Grapheme_Base # Pe RIGHT SQUARE BRACKET 005E ; Grapheme_Base # Sk CIRCUMFLEX ACCENT 005F ; Grapheme_Base # Pc LOW LINE 0060 ; Grapheme_Base # Sk GRAVE ACCENT 0061..007A ; Grapheme_Base # L& [26] LATIN SMALL LETTER A..LATIN SMALL LETTER Z 007B ; Grapheme_Base # Ps LEFT CURLY BRACKET 007C ; Grapheme_Base # Sm VERTICAL LINE 007D ; Grapheme_Base # Pe RIGHT CURLY BRACKET 007E ; Grapheme_Base # Sm TILDE 00A0 ; Grapheme_Base # Zs NO-BREAK SPACE 00A1 ; Grapheme_Base # Po INVERTED EXCLAMATION MARK 00A2..00A5 ; Grapheme_Base # Sc [4] CENT SIGN..YEN SIGN 00A6 ; Grapheme_Base # So BROKEN BAR 00A7 ; Grapheme_Base # Po SECTION SIGN 00A8 ; Grapheme_Base # Sk DIAERESIS 00A9 ; Grapheme_Base # So COPYRIGHT SIGN 00AA ; Grapheme_Base # Lo FEMININE ORDINAL INDICATOR 00AB ; Grapheme_Base # Pi LEFT-POINTING DOUBLE ANGLE QUOTATION MARK 00AC ; Grapheme_Base # Sm NOT SIGN 00AE ; Grapheme_Base # So REGISTERED SIGN 00AF ; Grapheme_Base # Sk MACRON 00B0 ; Grapheme_Base # So DEGREE SIGN 00B1 ; Grapheme_Base # Sm PLUS-MINUS SIGN 00B2..00B3 ; Grapheme_Base # No [2] SUPERSCRIPT TWO..SUPERSCRIPT THREE 00B4 ; 
Grapheme_Base # Sk ACUTE ACCENT 00B5 ; Grapheme_Base # L& MICRO SIGN 00B6..00B7 ; Grapheme_Base # Po [2] PILCROW SIGN..MIDDLE DOT 00B8 ; Grapheme_Base # Sk CEDILLA 00B9 ; Grapheme_Base # No SUPERSCRIPT ONE 00BA ; Grapheme_Base # Lo MASCULINE ORDINAL INDICATOR 00BB ; Grapheme_Base # Pf RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK 00BC..00BE ; Grapheme_Base # No [3] VULGAR FRACTION ONE QUARTER..VULGAR FRACTION THREE QUARTERS 00BF ; Grapheme_Base # Po INVERTED QUESTION MARK 00C0..00D6 ; Grapheme_Base # L& [23] LATIN CAPITAL LETTER A WITH GRAVE..LATIN CAPITAL LETTER O WITH DIAERESIS 00D7 ; Grapheme_Base # Sm MULTIPLICATION SIGN 00D8..00F6 ; Grapheme_Base # L& [31] LATIN CAPITAL LETTER O WITH STROKE..LATIN SMALL LETTER O WITH DIAERESIS 00F7 ; Grapheme_Base # Sm DIVISION SIGN 00F8..01BA ; Grapheme_Base # L& [195] LATIN SMALL LETTER O WITH STROKE..LATIN SMALL LETTER EZH WITH TAIL 01BB ; Grapheme_Base # Lo LATIN LETTER TWO WITH STROKE 01BC..01BF ; Grapheme_Base # L& [4] LATIN CAPITAL LETTER TONE FIVE..LATIN LETTER WYNN 01C0..01C3 ; Grapheme_Base # Lo [4] LATIN LETTER DENTAL CLICK..LATIN LETTER RETROFLEX CLICK 01C4..0293 ; Grapheme_Base # L& [208] LATIN CAPITAL LETTER DZ WITH CARON..LATIN SMALL LETTER EZH WITH CURL 0294..0295 ; Grapheme_Base # Lo [2] LATIN LETTER GLOTTAL STOP..LATIN LETTER PHARYNGEAL VOICED FRICATIVE 0296..02AF ; Grapheme_Base # L& [26] LATIN LETTER INVERTED GLOTTAL STOP..LATIN SMALL LETTER TURNED H WITH FISHHOOK AND TAIL 02B0..02C1 ; Grapheme_Base # Lm [18] MODIFIER LETTER SMALL H..MODIFIER LETTER REVERSED GLOTTAL STOP 02C2..02C5 ; Grapheme_Base # Sk [4] MODIFIER LETTER LEFT ARROWHEAD..MODIFIER LETTER DOWN ARROWHEAD 02C6..02D1 ; Grapheme_Base # Lm [12] MODIFIER LETTER CIRCUMFLEX ACCENT..MODIFIER LETTER HALF TRIANGULAR COLON 02D2..02DF ; Grapheme_Base # Sk [14] MODIFIER LETTER CENTRED RIGHT HALF RING..MODIFIER LETTER CROSS ACCENT 02E0..02E4 ; Grapheme_Base # Lm [5] MODIFIER LETTER SMALL GAMMA..MODIFIER LETTER SMALL REVERSED GLOTTAL STOP 02E5..02EB ; 
Grapheme_Base # Sk [7] MODIFIER LETTER EXTRA-HIGH TONE BAR..MODIFIER LETTER YANG DEPARTING TONE MARK 02EC ; Grapheme_Base # Lm MODIFIER LETTER VOICING 02ED ; Grapheme_Base # Sk MODIFIER LETTER UNASPIRATED 02EE ; Grapheme_Base # Lm MODIFIER LETTER DOUBLE APOSTROPHE 02EF..02FF ; Grapheme_Base # Sk [17] MODIFIER LETTER LOW DOWN ARROWHEAD..MODIFIER LETTER LOW LEFT ARROW 0370..0373 ; Grapheme_Base # L& [4] GREEK CAPITAL LETTER HETA..GREEK SMALL LETTER ARCHAIC SAMPI 0374 ; Grapheme_Base # Lm GREEK NUMERAL SIGN 0375 ; Grapheme_Base # Sk GREEK LOWER NUMERAL SIGN 0376..0377 ; Grapheme_Base # L& [2] GREEK CAPITAL LETTER PAMPHYLIAN DIGAMMA..GREEK SMALL LETTER PAMPHYLIAN DIGAMMA 037A ; Grapheme_Base # Lm GREEK YPOGEGRAMMENI 037B..037D ; Grapheme_Base # L& [3] GREEK SMALL REVERSED LUNATE SIGMA SYMBOL..GREEK SMALL REVERSED DOTTED LUNATE SIGMA SYMBOL 037E ; Grapheme_Base # Po GREEK QUESTION MARK 037F ; Grapheme_Base # L& GREEK CAPITAL LETTER YOT 0384..0385 ; Grapheme_Base # Sk [2] GREEK TONOS..GREEK DIALYTIKA TONOS 0386 ; Grapheme_Base # L& GREEK CAPITAL LETTER ALPHA WITH TONOS 0387 ; Grapheme_Base # Po GREEK ANO TELEIA 0388..038A ; Grapheme_Base # L& [3] GREEK CAPITAL LETTER EPSILON WITH TONOS..GREEK CAPITAL LETTER IOTA WITH TONOS 038C ; Grapheme_Base # L& GREEK CAPITAL LETTER OMICRON WITH TONOS 038E..03A1 ; Grapheme_Base # L& [20] GREEK CAPITAL LETTER UPSILON WITH TONOS..GREEK CAPITAL LETTER RHO 03A3..03F5 ; Grapheme_Base # L& [83] GREEK CAPITAL LETTER SIGMA..GREEK LUNATE EPSILON SYMBOL 03F6 ; Grapheme_Base # Sm GREEK REVERSED LUNATE EPSILON SYMBOL 03F7..0481 ; Grapheme_Base # L& [139] GREEK CAPITAL LETTER SHO..CYRILLIC SMALL LETTER KOPPA 0482 ; Grapheme_Base # So CYRILLIC THOUSANDS SIGN 048A..052F ; Grapheme_Base # L& [166] CYRILLIC CAPITAL LETTER SHORT I WITH TAIL..CYRILLIC SMALL LETTER EL WITH DESCENDER 0531..0556 ; Grapheme_Base # L& [38] ARMENIAN CAPITAL LETTER AYB..ARMENIAN CAPITAL LETTER FEH 0559 ; Grapheme_Base # Lm ARMENIAN MODIFIER LETTER LEFT HALF RING 055A..055F ; 
Grapheme_Base # Po [6] ARMENIAN APOSTROPHE..ARMENIAN ABBREVIATION MARK 0560..0588 ; Grapheme_Base # L& [41] ARMENIAN SMALL LETTER TURNED AYB..ARMENIAN SMALL LETTER YI WITH STROKE 0589 ; Grapheme_Base # Po ARMENIAN FULL STOP 058A ; Grapheme_Base # Pd ARMENIAN HYPHEN 058D..058E ; Grapheme_Base # So [2] RIGHT-FACING ARMENIAN ETERNITY SIGN..LEFT-FACING ARMENIAN ETERNITY SIGN 058F ; Grapheme_Base # Sc ARMENIAN DRAM SIGN 05BE ; Grapheme_Base # Pd HEBREW PUNCTUATION MAQAF 05C0 ; Grapheme_Base # Po HEBREW PUNCTUATION PASEQ 05C3 ; Grapheme_Base # Po HEBREW PUNCTUATION SOF PASUQ 05C6 ; Grapheme_Base # Po HEBREW PUNCTUATION NUN HAFUKHA 05D0..05EA ; Grapheme_Base # Lo [27] HEBREW LETTER ALEF..HEBREW LETTER TAV 05EF..05F2 ; Grapheme_Base # Lo [4] HEBREW YOD TRIANGLE..HEBREW LIGATURE YIDDISH DOUBLE YOD 05F3..05F4 ; Grapheme_Base # Po [2] HEBREW PUNCTUATION GERESH..HEBREW PUNCTUATION GERSHAYIM 0606..0608 ; Grapheme_Base # Sm [3] ARABIC-INDIC CUBE ROOT..ARABIC RAY 0609..060A ; Grapheme_Base # Po [2] ARABIC-INDIC PER MILLE SIGN..ARABIC-INDIC PER TEN THOUSAND SIGN 060B ; Grapheme_Base # Sc AFGHANI SIGN 060C..060D ; Grapheme_Base # Po [2] ARABIC COMMA..ARABIC DATE SEPARATOR 060E..060F ; Grapheme_Base # So [2] ARABIC POETIC VERSE SIGN..ARABIC SIGN MISRA 061B ; Grapheme_Base # Po ARABIC SEMICOLON 061D..061F ; Grapheme_Base # Po [3] ARABIC END OF TEXT MARK..ARABIC QUESTION MARK 0620..063F ; Grapheme_Base # Lo [32] ARABIC LETTER KASHMIRI YEH..ARABIC LETTER FARSI YEH WITH THREE DOTS ABOVE 0640 ; Grapheme_Base # Lm ARABIC TATWEEL 0641..064A ; Grapheme_Base # Lo [10] ARABIC LETTER FEH..ARABIC LETTER YEH 0660..0669 ; Grapheme_Base # Nd [10] ARABIC-INDIC DIGIT ZERO..ARABIC-INDIC DIGIT NINE 066A..066D ; Grapheme_Base # Po [4] ARABIC PERCENT SIGN..ARABIC FIVE POINTED STAR 066E..066F ; Grapheme_Base # Lo [2] ARABIC LETTER DOTLESS BEH..ARABIC LETTER DOTLESS QAF 0671..06D3 ; Grapheme_Base # Lo [99] ARABIC LETTER ALEF WASLA..ARABIC LETTER YEH BARREE WITH HAMZA ABOVE 06D4 ; Grapheme_Base # Po ARABIC 
FULL STOP 06D5 ; Grapheme_Base # Lo ARABIC LETTER AE 06DE ; Grapheme_Base # So ARABIC START OF RUB EL HIZB 06E5..06E6 ; Grapheme_Base # Lm [2] ARABIC SMALL WAW..ARABIC SMALL YEH 06E9 ; Grapheme_Base # So ARABIC PLACE OF SAJDAH 06EE..06EF ; Grapheme_Base # Lo [2] ARABIC LETTER DAL WITH INVERTED V..ARABIC LETTER REH WITH INVERTED V 06F0..06F9 ; Grapheme_Base # Nd [10] EXTENDED ARABIC-INDIC DIGIT ZERO..EXTENDED ARABIC-INDIC DIGIT NINE 06FA..06FC ; Grapheme_Base # Lo [3] ARABIC LETTER SHEEN WITH DOT BELOW..ARABIC LETTER GHAIN WITH DOT BELOW 06FD..06FE ; Grapheme_Base # So [2] ARABIC SIGN SINDHI AMPERSAND..ARABIC SIGN SINDHI POSTPOSITION MEN 06FF ; Grapheme_Base # Lo ARABIC LETTER HEH WITH INVERTED V 0700..070D ; Grapheme_Base # Po [14] SYRIAC END OF PARAGRAPH..SYRIAC HARKLEAN ASTERISCUS 0710 ; Grapheme_Base # Lo SYRIAC LETTER ALAPH 0712..072F ; Grapheme_Base # Lo [30] SYRIAC LETTER BETH..SYRIAC LETTER PERSIAN DHALATH 074D..07A5 ; Grapheme_Base # Lo [89] SYRIAC LETTER SOGDIAN ZHAIN..THAANA LETTER WAAVU 07B1 ; Grapheme_Base # Lo THAANA LETTER NAA 07C0..07C9 ; Grapheme_Base # Nd [10] NKO DIGIT ZERO..NKO DIGIT NINE 07CA..07EA ; Grapheme_Base # Lo [33] NKO LETTER A..NKO LETTER JONA RA 07F4..07F5 ; Grapheme_Base # Lm [2] NKO HIGH TONE APOSTROPHE..NKO LOW TONE APOSTROPHE 07F6 ; Grapheme_Base # So NKO SYMBOL OO DENNEN 07F7..07F9 ; Grapheme_Base # Po [3] NKO SYMBOL GBAKURUNEN..NKO EXCLAMATION MARK 07FA ; Grapheme_Base # Lm NKO LAJANYALAN 07FE..07FF ; Grapheme_Base # Sc [2] NKO DOROME SIGN..NKO TAMAN SIGN 0800..0815 ; Grapheme_Base # Lo [22] SAMARITAN LETTER ALAF..SAMARITAN LETTER TAAF 081A ; Grapheme_Base # Lm SAMARITAN MODIFIER LETTER EPENTHETIC YUT 0824 ; Grapheme_Base # Lm SAMARITAN MODIFIER LETTER SHORT A 0828 ; Grapheme_Base # Lm SAMARITAN MODIFIER LETTER I 0830..083E ; Grapheme_Base # Po [15] SAMARITAN PUNCTUATION NEQUDAA..SAMARITAN PUNCTUATION ANNAAU 0840..0858 ; Grapheme_Base # Lo [25] MANDAIC LETTER HALQA..MANDAIC LETTER AIN 085E ; Grapheme_Base # Po MANDAIC 
PUNCTUATION 0860..086A ; Grapheme_Base # Lo [11] SYRIAC LETTER MALAYALAM NGA..SYRIAC LETTER MALAYALAM SSA 0870..0887 ; Grapheme_Base # Lo [24] ARABIC LETTER ALEF WITH ATTACHED FATHA..ARABIC BASELINE ROUND DOT 0888 ; Grapheme_Base # Sk ARABIC RAISED ROUND DOT 0889..088F ; Grapheme_Base # Lo [7] ARABIC LETTER NOON WITH INVERTED SMALL V..ARABIC LETTER NOON WITH RING ABOVE 08A0..08C8 ; Grapheme_Base # Lo [41] ARABIC LETTER BEH WITH SMALL V BELOW..ARABIC LETTER GRAF 08C9 ; Grapheme_Base # Lm ARABIC SMALL FARSI YEH 0903 ; Grapheme_Base # Mc DEVANAGARI SIGN VISARGA 0904..0939 ; Grapheme_Base # Lo [54] DEVANAGARI LETTER SHORT A..DEVANAGARI LETTER HA 093B ; Grapheme_Base # Mc DEVANAGARI VOWEL SIGN OOE 093D ; Grapheme_Base # Lo DEVANAGARI SIGN AVAGRAHA 093E..0940 ; Grapheme_Base # Mc [3] DEVANAGARI VOWEL SIGN AA..DEVANAGARI VOWEL SIGN II 0949..094C ; Grapheme_Base # Mc [4] DEVANAGARI VOWEL SIGN CANDRA O..DEVANAGARI VOWEL SIGN AU 094E..094F ; Grapheme_Base # Mc [2] DEVANAGARI VOWEL SIGN PRISHTHAMATRA E..DEVANAGARI VOWEL SIGN AW 0950 ; Grapheme_Base # Lo DEVANAGARI OM 0958..0961 ; Grapheme_Base # Lo [10] DEVANAGARI LETTER QA..DEVANAGARI LETTER VOCALIC LL 0964..0965 ; Grapheme_Base # Po [2] DEVANAGARI DANDA..DEVANAGARI DOUBLE DANDA 0966..096F ; Grapheme_Base # Nd [10] DEVANAGARI DIGIT ZERO..DEVANAGARI DIGIT NINE 0970 ; Grapheme_Base # Po DEVANAGARI ABBREVIATION SIGN 0971 ; Grapheme_Base # Lm DEVANAGARI SIGN HIGH SPACING DOT 0972..0980 ; Grapheme_Base # Lo [15] DEVANAGARI LETTER CANDRA A..BENGALI ANJI 0982..0983 ; Grapheme_Base # Mc [2] BENGALI SIGN ANUSVARA..BENGALI SIGN VISARGA 0985..098C ; Grapheme_Base # Lo [8] BENGALI LETTER A..BENGALI LETTER VOCALIC L 098F..0990 ; Grapheme_Base # Lo [2] BENGALI LETTER E..BENGALI LETTER AI 0993..09A8 ; Grapheme_Base # Lo [22] BENGALI LETTER O..BENGALI LETTER NA 09AA..09B0 ; Grapheme_Base # Lo [7] BENGALI LETTER PA..BENGALI LETTER RA 09B2 ; Grapheme_Base # Lo BENGALI LETTER LA 09B6..09B9 ; Grapheme_Base # Lo [4] BENGALI LETTER SHA..BENGALI 
LETTER HA 09BD ; Grapheme_Base # Lo BENGALI SIGN AVAGRAHA 09BF..09C0 ; Grapheme_Base # Mc [2] BENGALI VOWEL SIGN I..BENGALI VOWEL SIGN II 09C7..09C8 ; Grapheme_Base # Mc [2] BENGALI VOWEL SIGN E..BENGALI VOWEL SIGN AI 09CB..09CC ; Grapheme_Base # Mc [2] BENGALI VOWEL SIGN O..BENGALI VOWEL SIGN AU 09CE ; Grapheme_Base # Lo BENGALI LETTER KHANDA TA 09DC..09DD ; Grapheme_Base # Lo [2] BENGALI LETTER RRA..BENGALI LETTER RHA 09DF..09E1 ; Grapheme_Base # Lo [3] BENGALI LETTER YYA..BENGALI LETTER VOCALIC LL 09E6..09EF ; Grapheme_Base # Nd [10] BENGALI DIGIT ZERO..BENGALI DIGIT NINE 09F0..09F1 ; Grapheme_Base # Lo [2] BENGALI LETTER RA WITH MIDDLE DIAGONAL..BENGALI LETTER RA WITH LOWER DIAGONAL 09F2..09F3 ; Grapheme_Base # Sc [2] BENGALI RUPEE MARK..BENGALI RUPEE SIGN 09F4..09F9 ; Grapheme_Base # No [6] BENGALI CURRENCY NUMERATOR ONE..BENGALI CURRENCY DENOMINATOR SIXTEEN 09FA ; Grapheme_Base # So BENGALI ISSHAR 09FB ; Grapheme_Base # Sc BENGALI GANDA MARK 09FC ; Grapheme_Base # Lo BENGALI LETTER VEDIC ANUSVARA 09FD ; Grapheme_Base # Po BENGALI ABBREVIATION SIGN 0A03 ; Grapheme_Base # Mc GURMUKHI SIGN VISARGA 0A05..0A0A ; Grapheme_Base # Lo [6] GURMUKHI LETTER A..GURMUKHI LETTER UU 0A0F..0A10 ; Grapheme_Base # Lo [2] GURMUKHI LETTER EE..GURMUKHI LETTER AI 0A13..0A28 ; Grapheme_Base # Lo [22] GURMUKHI LETTER OO..GURMUKHI LETTER NA 0A2A..0A30 ; Grapheme_Base # Lo [7] GURMUKHI LETTER PA..GURMUKHI LETTER RA 0A32..0A33 ; Grapheme_Base # Lo [2] GURMUKHI LETTER LA..GURMUKHI LETTER LLA 0A35..0A36 ; Grapheme_Base # Lo [2] GURMUKHI LETTER VA..GURMUKHI LETTER SHA 0A38..0A39 ; Grapheme_Base # Lo [2] GURMUKHI LETTER SA..GURMUKHI LETTER HA 0A3E..0A40 ; Grapheme_Base # Mc [3] GURMUKHI VOWEL SIGN AA..GURMUKHI VOWEL SIGN II 0A59..0A5C ; Grapheme_Base # Lo [4] GURMUKHI LETTER KHHA..GURMUKHI LETTER RRA 0A5E ; Grapheme_Base # Lo GURMUKHI LETTER FA 0A66..0A6F ; Grapheme_Base # Nd [10] GURMUKHI DIGIT ZERO..GURMUKHI DIGIT NINE 0A72..0A74 ; Grapheme_Base # Lo [3] GURMUKHI IRI..GURMUKHI EK ONKAR 
0A76 ; Grapheme_Base # Po GURMUKHI ABBREVIATION SIGN 0A83 ; Grapheme_Base # Mc GUJARATI SIGN VISARGA 0A85..0A8D ; Grapheme_Base # Lo [9] GUJARATI LETTER A..GUJARATI VOWEL CANDRA E 0A8F..0A91 ; Grapheme_Base # Lo [3] GUJARATI LETTER E..GUJARATI VOWEL CANDRA O 0A93..0AA8 ; Grapheme_Base # Lo [22] GUJARATI LETTER O..GUJARATI LETTER NA 0AAA..0AB0 ; Grapheme_Base # Lo [7] GUJARATI LETTER PA..GUJARATI LETTER RA 0AB2..0AB3 ; Grapheme_Base # Lo [2] GUJARATI LETTER LA..GUJARATI LETTER LLA 0AB5..0AB9 ; Grapheme_Base # Lo [5] GUJARATI LETTER VA..GUJARATI LETTER HA 0ABD ; Grapheme_Base # Lo GUJARATI SIGN AVAGRAHA 0ABE..0AC0 ; Grapheme_Base # Mc [3] GUJARATI VOWEL SIGN AA..GUJARATI VOWEL SIGN II 0AC9 ; Grapheme_Base # Mc GUJARATI VOWEL SIGN CANDRA O 0ACB..0ACC ; Grapheme_Base # Mc [2] GUJARATI VOWEL SIGN O..GUJARATI VOWEL SIGN AU 0AD0 ; Grapheme_Base # Lo GUJARATI OM 0AE0..0AE1 ; Grapheme_Base # Lo [2] GUJARATI LETTER VOCALIC RR..GUJARATI LETTER VOCALIC LL 0AE6..0AEF ; Grapheme_Base # Nd [10] GUJARATI DIGIT ZERO..GUJARATI DIGIT NINE 0AF0 ; Grapheme_Base # Po GUJARATI ABBREVIATION SIGN 0AF1 ; Grapheme_Base # Sc GUJARATI RUPEE SIGN 0AF9 ; Grapheme_Base # Lo GUJARATI LETTER ZHA 0B02..0B03 ; Grapheme_Base # Mc [2] ORIYA SIGN ANUSVARA..ORIYA SIGN VISARGA 0B05..0B0C ; Grapheme_Base # Lo [8] ORIYA LETTER A..ORIYA LETTER VOCALIC L 0B0F..0B10 ; Grapheme_Base # Lo [2] ORIYA LETTER E..ORIYA LETTER AI 0B13..0B28 ; Grapheme_Base # Lo [22] ORIYA LETTER O..ORIYA LETTER NA 0B2A..0B30 ; Grapheme_Base # Lo [7] ORIYA LETTER PA..ORIYA LETTER RA 0B32..0B33 ; Grapheme_Base # Lo [2] ORIYA LETTER LA..ORIYA LETTER LLA 0B35..0B39 ; Grapheme_Base # Lo [5] ORIYA LETTER VA..ORIYA LETTER HA 0B3D ; Grapheme_Base # Lo ORIYA SIGN AVAGRAHA 0B40 ; Grapheme_Base # Mc ORIYA VOWEL SIGN II 0B47..0B48 ; Grapheme_Base # Mc [2] ORIYA VOWEL SIGN E..ORIYA VOWEL SIGN AI 0B4B..0B4C ; Grapheme_Base # Mc [2] ORIYA VOWEL SIGN O..ORIYA VOWEL SIGN AU 0B5C..0B5D ; Grapheme_Base # Lo [2] ORIYA LETTER RRA..ORIYA LETTER RHA 
0B5F..0B61 ; Grapheme_Base # Lo [3] ORIYA LETTER YYA..ORIYA LETTER VOCALIC LL 0B66..0B6F ; Grapheme_Base # Nd [10] ORIYA DIGIT ZERO..ORIYA DIGIT NINE 0B70 ; Grapheme_Base # So ORIYA ISSHAR 0B71 ; Grapheme_Base # Lo ORIYA LETTER WA 0B72..0B77 ; Grapheme_Base # No [6] ORIYA FRACTION ONE QUARTER..ORIYA FRACTION THREE SIXTEENTHS 0B83 ; Grapheme_Base # Lo TAMIL SIGN VISARGA 0B85..0B8A ; Grapheme_Base # Lo [6] TAMIL LETTER A..TAMIL LETTER UU 0B8E..0B90 ; Grapheme_Base # Lo [3] TAMIL LETTER E..TAMIL LETTER AI 0B92..0B95 ; Grapheme_Base # Lo [4] TAMIL LETTER O..TAMIL LETTER KA 0B99..0B9A ; Grapheme_Base # Lo [2] TAMIL LETTER NGA..TAMIL LETTER CA 0B9C ; Grapheme_Base # Lo TAMIL LETTER JA 0B9E..0B9F ; Grapheme_Base # Lo [2] TAMIL LETTER NYA..TAMIL LETTER TTA 0BA3..0BA4 ; Grapheme_Base # Lo [2] TAMIL LETTER NNA..TAMIL LETTER TA 0BA8..0BAA ; Grapheme_Base # Lo [3] TAMIL LETTER NA..TAMIL LETTER PA 0BAE..0BB9 ; Grapheme_Base # Lo [12] TAMIL LETTER MA..TAMIL LETTER HA 0BBF ; Grapheme_Base # Mc TAMIL VOWEL SIGN I 0BC1..0BC2 ; Grapheme_Base # Mc [2] TAMIL VOWEL SIGN U..TAMIL VOWEL SIGN UU 0BC6..0BC8 ; Grapheme_Base # Mc [3] TAMIL VOWEL SIGN E..TAMIL VOWEL SIGN AI 0BCA..0BCC ; Grapheme_Base # Mc [3] TAMIL VOWEL SIGN O..TAMIL VOWEL SIGN AU 0BD0 ; Grapheme_Base # Lo TAMIL OM 0BE6..0BEF ; Grapheme_Base # Nd [10] TAMIL DIGIT ZERO..TAMIL DIGIT NINE 0BF0..0BF2 ; Grapheme_Base # No [3] TAMIL NUMBER TEN..TAMIL NUMBER ONE THOUSAND 0BF3..0BF8 ; Grapheme_Base # So [6] TAMIL DAY SIGN..TAMIL AS ABOVE SIGN 0BF9 ; Grapheme_Base # Sc TAMIL RUPEE SIGN 0BFA ; Grapheme_Base # So TAMIL NUMBER SIGN 0C01..0C03 ; Grapheme_Base # Mc [3] TELUGU SIGN CANDRABINDU..TELUGU SIGN VISARGA 0C05..0C0C ; Grapheme_Base # Lo [8] TELUGU LETTER A..TELUGU LETTER VOCALIC L 0C0E..0C10 ; Grapheme_Base # Lo [3] TELUGU LETTER E..TELUGU LETTER AI 0C12..0C28 ; Grapheme_Base # Lo [23] TELUGU LETTER O..TELUGU LETTER NA 0C2A..0C39 ; Grapheme_Base # Lo [16] TELUGU LETTER PA..TELUGU LETTER HA 0C3D ; Grapheme_Base # Lo TELUGU SIGN 
AVAGRAHA 0C41..0C44 ; Grapheme_Base # Mc [4] TELUGU VOWEL SIGN U..TELUGU VOWEL SIGN VOCALIC RR 0C58..0C5A ; Grapheme_Base # Lo [3] TELUGU LETTER TSA..TELUGU LETTER RRRA 0C5C..0C5D ; Grapheme_Base # Lo [2] TELUGU ARCHAIC SHRII..TELUGU LETTER NAKAARA POLLU 0C60..0C61 ; Grapheme_Base # Lo [2] TELUGU LETTER VOCALIC RR..TELUGU LETTER VOCALIC LL 0C66..0C6F ; Grapheme_Base # Nd [10] TELUGU DIGIT ZERO..TELUGU DIGIT NINE 0C77 ; Grapheme_Base # Po TELUGU SIGN SIDDHAM 0C78..0C7E ; Grapheme_Base # No [7] TELUGU FRACTION DIGIT ZERO FOR ODD POWERS OF FOUR..TELUGU FRACTION DIGIT THREE FOR EVEN POWERS OF FOUR 0C7F ; Grapheme_Base # So TELUGU SIGN TUUMU 0C80 ; Grapheme_Base # Lo KANNADA SIGN SPACING CANDRABINDU 0C82..0C83 ; Grapheme_Base # Mc [2] KANNADA SIGN ANUSVARA..KANNADA SIGN VISARGA 0C84 ; Grapheme_Base # Po KANNADA SIGN SIDDHAM 0C85..0C8C ; Grapheme_Base # Lo [8] KANNADA LETTER A..KANNADA LETTER VOCALIC L 0C8E..0C90 ; Grapheme_Base # Lo [3] KANNADA LETTER E..KANNADA LETTER AI 0C92..0CA8 ; Grapheme_Base # Lo [23] KANNADA LETTER O..KANNADA LETTER NA 0CAA..0CB3 ; Grapheme_Base # Lo [10] KANNADA LETTER PA..KANNADA LETTER LLA 0CB5..0CB9 ; Grapheme_Base # Lo [5] KANNADA LETTER VA..KANNADA LETTER HA 0CBD ; Grapheme_Base # Lo KANNADA SIGN AVAGRAHA 0CBE ; Grapheme_Base # Mc KANNADA VOWEL SIGN AA 0CC1 ; Grapheme_Base # Mc KANNADA VOWEL SIGN U 0CC3..0CC4 ; Grapheme_Base # Mc [2] KANNADA VOWEL SIGN VOCALIC R..KANNADA VOWEL SIGN VOCALIC RR 0CDC..0CDE ; Grapheme_Base # Lo [3] KANNADA ARCHAIC SHRII..KANNADA LETTER FA 0CE0..0CE1 ; Grapheme_Base # Lo [2] KANNADA LETTER VOCALIC RR..KANNADA LETTER VOCALIC LL 0CE6..0CEF ; Grapheme_Base # Nd [10] KANNADA DIGIT ZERO..KANNADA DIGIT NINE 0CF1..0CF2 ; Grapheme_Base # Lo [2] KANNADA SIGN JIHVAMULIYA..KANNADA SIGN UPADHMANIYA 0CF3 ; Grapheme_Base # Mc KANNADA SIGN COMBINING ANUSVARA ABOVE RIGHT 0D02..0D03 ; Grapheme_Base # Mc [2] MALAYALAM SIGN ANUSVARA..MALAYALAM SIGN VISARGA 0D04..0D0C ; Grapheme_Base # Lo [9] MALAYALAM LETTER VEDIC 
ANUSVARA..MALAYALAM LETTER VOCALIC L 0D0E..0D10 ; Grapheme_Base # Lo [3] MALAYALAM LETTER E..MALAYALAM LETTER AI 0D12..0D3A ; Grapheme_Base # Lo [41] MALAYALAM LETTER O..MALAYALAM LETTER TTTA 0D3D ; Grapheme_Base # Lo MALAYALAM SIGN AVAGRAHA 0D3F..0D40 ; Grapheme_Base # Mc [2] MALAYALAM VOWEL SIGN I..MALAYALAM VOWEL SIGN II 0D46..0D48 ; Grapheme_Base # Mc [3] MALAYALAM VOWEL SIGN E..MALAYALAM VOWEL SIGN AI 0D4A..0D4C ; Grapheme_Base # Mc [3] MALAYALAM VOWEL SIGN O..MALAYALAM VOWEL SIGN AU 0D4E ; Grapheme_Base # Lo MALAYALAM LETTER DOT REPH 0D4F ; Grapheme_Base # So MALAYALAM SIGN PARA 0D54..0D56 ; Grapheme_Base # Lo [3] MALAYALAM LETTER CHILLU M..MALAYALAM LETTER CHILLU LLL 0D58..0D5E ; Grapheme_Base # No [7] MALAYALAM FRACTION ONE ONE-HUNDRED-AND-SIXTIETH..MALAYALAM FRACTION ONE FIFTH 0D5F..0D61 ; Grapheme_Base # Lo [3] MALAYALAM LETTER ARCHAIC II..MALAYALAM LETTER VOCALIC LL 0D66..0D6F ; Grapheme_Base # Nd [10] MALAYALAM DIGIT ZERO..MALAYALAM DIGIT NINE 0D70..0D78 ; Grapheme_Base # No [9] MALAYALAM NUMBER TEN..MALAYALAM FRACTION THREE SIXTEENTHS 0D79 ; Grapheme_Base # So MALAYALAM DATE MARK 0D7A..0D7F ; Grapheme_Base # Lo [6] MALAYALAM LETTER CHILLU NN..MALAYALAM LETTER CHILLU K 0D82..0D83 ; Grapheme_Base # Mc [2] SINHALA SIGN ANUSVARAYA..SINHALA SIGN VISARGAYA 0D85..0D96 ; Grapheme_Base # Lo [18] SINHALA LETTER AYANNA..SINHALA LETTER AUYANNA 0D9A..0DB1 ; Grapheme_Base # Lo [24] SINHALA LETTER ALPAPRAANA KAYANNA..SINHALA LETTER DANTAJA NAYANNA 0DB3..0DBB ; Grapheme_Base # Lo [9] SINHALA LETTER SANYAKA DAYANNA..SINHALA LETTER RAYANNA 0DBD ; Grapheme_Base # Lo SINHALA LETTER DANTAJA LAYANNA 0DC0..0DC6 ; Grapheme_Base # Lo [7] SINHALA LETTER VAYANNA..SINHALA LETTER FAYANNA 0DD0..0DD1 ; Grapheme_Base # Mc [2] SINHALA VOWEL SIGN KETTI AEDA-PILLA..SINHALA VOWEL SIGN DIGA AEDA-PILLA 0DD8..0DDE ; Grapheme_Base # Mc [7] SINHALA VOWEL SIGN GAETTA-PILLA..SINHALA VOWEL SIGN KOMBUVA HAA GAYANUKITTA 0DE6..0DEF ; Grapheme_Base # Nd [10] SINHALA LITH DIGIT ZERO..SINHALA LITH 
DIGIT NINE 0DF2..0DF3 ; Grapheme_Base # Mc [2] SINHALA VOWEL SIGN DIGA GAETTA-PILLA..SINHALA VOWEL SIGN DIGA GAYANUKITTA 0DF4 ; Grapheme_Base # Po SINHALA PUNCTUATION KUNDDALIYA 0E01..0E30 ; Grapheme_Base # Lo [48] THAI CHARACTER KO KAI..THAI CHARACTER SARA A 0E32..0E33 ; Grapheme_Base # Lo [2] THAI CHARACTER SARA AA..THAI CHARACTER SARA AM 0E3F ; Grapheme_Base # Sc THAI CURRENCY SYMBOL BAHT 0E40..0E45 ; Grapheme_Base # Lo [6] THAI CHARACTER SARA E..THAI CHARACTER LAKKHANGYAO 0E46 ; Grapheme_Base # Lm THAI CHARACTER MAIYAMOK 0E4F ; Grapheme_Base # Po THAI CHARACTER FONGMAN 0E50..0E59 ; Grapheme_Base # Nd [10] THAI DIGIT ZERO..THAI DIGIT NINE 0E5A..0E5B ; Grapheme_Base # Po [2] THAI CHARACTER ANGKHANKHU..THAI CHARACTER KHOMUT 0E81..0E82 ; Grapheme_Base # Lo [2] LAO LETTER KO..LAO LETTER KHO SUNG 0E84 ; Grapheme_Base # Lo LAO LETTER KHO TAM 0E86..0E8A ; Grapheme_Base # Lo [5] LAO LETTER PALI GHA..LAO LETTER SO TAM 0E8C..0EA3 ; Grapheme_Base # Lo [24] LAO LETTER PALI JHA..LAO LETTER LO LING 0EA5 ; Grapheme_Base # Lo LAO LETTER LO LOOT 0EA7..0EB0 ; Grapheme_Base # Lo [10] LAO LETTER WO..LAO VOWEL SIGN A 0EB2..0EB3 ; Grapheme_Base # Lo [2] LAO VOWEL SIGN AA..LAO VOWEL SIGN AM 0EBD ; Grapheme_Base # Lo LAO SEMIVOWEL SIGN NYO 0EC0..0EC4 ; Grapheme_Base # Lo [5] LAO VOWEL SIGN E..LAO VOWEL SIGN AI 0EC6 ; Grapheme_Base # Lm LAO KO LA 0ED0..0ED9 ; Grapheme_Base # Nd [10] LAO DIGIT ZERO..LAO DIGIT NINE 0EDC..0EDF ; Grapheme_Base # Lo [4] LAO HO NO..LAO LETTER KHMU NYO 0F00 ; Grapheme_Base # Lo TIBETAN SYLLABLE OM 0F01..0F03 ; Grapheme_Base # So [3] TIBETAN MARK GTER YIG MGO TRUNCATED A..TIBETAN MARK GTER YIG MGO -UM GTER TSHEG MA 0F04..0F12 ; Grapheme_Base # Po [15] TIBETAN MARK INITIAL YIG MGO MDUN MA..TIBETAN MARK RGYA GRAM SHAD 0F13 ; Grapheme_Base # So TIBETAN MARK CARET -DZUD RTAGS ME LONG CAN 0F14 ; Grapheme_Base # Po TIBETAN MARK GTER TSHEG 0F15..0F17 ; Grapheme_Base # So [3] TIBETAN LOGOTYPE SIGN CHAD RTAGS..TIBETAN ASTROLOGICAL SIGN SGRA GCAN -CHAR RTAGS 0F1A..0F1F ; 
Grapheme_Base # So [6] TIBETAN SIGN RDEL DKAR GCIG..TIBETAN SIGN RDEL DKAR RDEL NAG 0F20..0F29 ; Grapheme_Base # Nd [10] TIBETAN DIGIT ZERO..TIBETAN DIGIT NINE 0F2A..0F33 ; Grapheme_Base # No [10] TIBETAN DIGIT HALF ONE..TIBETAN DIGIT HALF ZERO 0F34 ; Grapheme_Base # So TIBETAN MARK BSDUS RTAGS 0F36 ; Grapheme_Base # So TIBETAN MARK CARET -DZUD RTAGS BZHI MIG CAN 0F38 ; Grapheme_Base # So TIBETAN MARK CHE MGO 0F3A ; Grapheme_Base # Ps TIBETAN MARK GUG RTAGS GYON 0F3B ; Grapheme_Base # Pe TIBETAN MARK GUG RTAGS GYAS 0F3C ; Grapheme_Base # Ps TIBETAN MARK ANG KHANG GYON 0F3D ; Grapheme_Base # Pe TIBETAN MARK ANG KHANG GYAS 0F3E..0F3F ; Grapheme_Base # Mc [2] TIBETAN SIGN YAR TSHES..TIBETAN SIGN MAR TSHES 0F40..0F47 ; Grapheme_Base # Lo [8] TIBETAN LETTER KA..TIBETAN LETTER JA 0F49..0F6C ; Grapheme_Base # Lo [36] TIBETAN LETTER NYA..TIBETAN LETTER RRA 0F7F ; Grapheme_Base # Mc TIBETAN SIGN RNAM BCAD 0F85 ; Grapheme_Base # Po TIBETAN MARK PALUTA 0F88..0F8C ; Grapheme_Base # Lo [5] TIBETAN SIGN LCE TSA CAN..TIBETAN SIGN INVERTED MCHU CAN 0FBE..0FC5 ; Grapheme_Base # So [8] TIBETAN KU RU KHA..TIBETAN SYMBOL RDO RJE 0FC7..0FCC ; Grapheme_Base # So [6] TIBETAN SYMBOL RDO RJE RGYA GRAM..TIBETAN SYMBOL NOR BU BZHI -KHYIL 0FCE..0FCF ; Grapheme_Base # So [2] TIBETAN SIGN RDEL NAG RDEL DKAR..TIBETAN SIGN RDEL NAG GSUM 0FD0..0FD4 ; Grapheme_Base # Po [5] TIBETAN MARK BSKA- SHOG GI MGO RGYAN..TIBETAN MARK CLOSING BRDA RNYING YIG MGO SGAB MA 0FD5..0FD8 ; Grapheme_Base # So [4] RIGHT-FACING SVASTI SIGN..LEFT-FACING SVASTI SIGN WITH DOTS 0FD9..0FDA ; Grapheme_Base # Po [2] TIBETAN MARK LEADING MCHAN RTAGS..TIBETAN MARK TRAILING MCHAN RTAGS 1000..102A ; Grapheme_Base # Lo [43] MYANMAR LETTER KA..MYANMAR LETTER AU 102B..102C ; Grapheme_Base # Mc [2] MYANMAR VOWEL SIGN TALL AA..MYANMAR VOWEL SIGN AA 1031 ; Grapheme_Base # Mc MYANMAR VOWEL SIGN E 1038 ; Grapheme_Base # Mc MYANMAR SIGN VISARGA 103B..103C ; Grapheme_Base # Mc [2] MYANMAR CONSONANT SIGN MEDIAL YA..MYANMAR CONSONANT SIGN 
MEDIAL RA 103F ; Grapheme_Base # Lo MYANMAR LETTER GREAT SA 1040..1049 ; Grapheme_Base # Nd [10] MYANMAR DIGIT ZERO..MYANMAR DIGIT NINE 104A..104F ; Grapheme_Base # Po [6] MYANMAR SIGN LITTLE SECTION..MYANMAR SYMBOL GENITIVE 1050..1055 ; Grapheme_Base # Lo [6] MYANMAR LETTER SHA..MYANMAR LETTER VOCALIC LL 1056..1057 ; Grapheme_Base # Mc [2] MYANMAR VOWEL SIGN VOCALIC R..MYANMAR VOWEL SIGN VOCALIC RR 105A..105D ; Grapheme_Base # Lo [4] MYANMAR LETTER MON NGA..MYANMAR LETTER MON BBE 1061 ; Grapheme_Base # Lo MYANMAR LETTER SGAW KAREN SHA 1062..1064 ; Grapheme_Base # Mc [3] MYANMAR VOWEL SIGN SGAW KAREN EU..MYANMAR TONE MARK SGAW KAREN KE PHO 1065..1066 ; Grapheme_Base # Lo [2] MYANMAR LETTER WESTERN PWO KAREN THA..MYANMAR LETTER WESTERN PWO KAREN PWA 1067..106D ; Grapheme_Base # Mc [7] MYANMAR VOWEL SIGN WESTERN PWO KAREN EU..MYANMAR SIGN WESTERN PWO KAREN TONE-5 106E..1070 ; Grapheme_Base # Lo [3] MYANMAR LETTER EASTERN PWO KAREN NNA..MYANMAR LETTER EASTERN PWO KAREN GHWA 1075..1081 ; Grapheme_Base # Lo [13] MYANMAR LETTER SHAN KA..MYANMAR LETTER SHAN HA 1083..1084 ; Grapheme_Base # Mc [2] MYANMAR VOWEL SIGN SHAN AA..MYANMAR VOWEL SIGN SHAN E 1087..108C ; Grapheme_Base # Mc [6] MYANMAR SIGN SHAN TONE-2..MYANMAR SIGN SHAN COUNCIL TONE-3 108E ; Grapheme_Base # Lo MYANMAR LETTER RUMAI PALAUNG FA 108F ; Grapheme_Base # Mc MYANMAR SIGN RUMAI PALAUNG TONE-5 1090..1099 ; Grapheme_Base # Nd [10] MYANMAR SHAN DIGIT ZERO..MYANMAR SHAN DIGIT NINE 109A..109C ; Grapheme_Base # Mc [3] MYANMAR SIGN KHAMTI TONE-1..MYANMAR VOWEL SIGN AITON A 109E..109F ; Grapheme_Base # So [2] MYANMAR SYMBOL SHAN ONE..MYANMAR SYMBOL SHAN EXCLAMATION 10A0..10C5 ; Grapheme_Base # L& [38] GEORGIAN CAPITAL LETTER AN..GEORGIAN CAPITAL LETTER HOE 10C7 ; Grapheme_Base # L& GEORGIAN CAPITAL LETTER YN 10CD ; Grapheme_Base # L& GEORGIAN CAPITAL LETTER AEN 10D0..10FA ; Grapheme_Base # L& [43] GEORGIAN LETTER AN..GEORGIAN LETTER AIN 10FB ; Grapheme_Base # Po GEORGIAN PARAGRAPH SEPARATOR 10FC ; Grapheme_Base # 
Lm MODIFIER LETTER GEORGIAN NAR 10FD..10FF ; Grapheme_Base # L& [3] GEORGIAN LETTER AEN..GEORGIAN LETTER LABIAL SIGN 1100..1248 ; Grapheme_Base # Lo [329] HANGUL CHOSEONG KIYEOK..ETHIOPIC SYLLABLE QWA 124A..124D ; Grapheme_Base # Lo [4] ETHIOPIC SYLLABLE QWI..ETHIOPIC SYLLABLE QWE 1250..1256 ; Grapheme_Base # Lo [7] ETHIOPIC SYLLABLE QHA..ETHIOPIC SYLLABLE QHO 1258 ; Grapheme_Base # Lo ETHIOPIC SYLLABLE QHWA 125A..125D ; Grapheme_Base # Lo [4] ETHIOPIC SYLLABLE QHWI..ETHIOPIC SYLLABLE QHWE 1260..1288 ; Grapheme_Base # Lo [41] ETHIOPIC SYLLABLE BA..ETHIOPIC SYLLABLE XWA 128A..128D ; Grapheme_Base # Lo [4] ETHIOPIC SYLLABLE XWI..ETHIOPIC SYLLABLE XWE 1290..12B0 ; Grapheme_Base # Lo [33] ETHIOPIC SYLLABLE NA..ETHIOPIC SYLLABLE KWA 12B2..12B5 ; Grapheme_Base # Lo [4] ETHIOPIC SYLLABLE KWI..ETHIOPIC SYLLABLE KWE 12B8..12BE ; Grapheme_Base # Lo [7] ETHIOPIC SYLLABLE KXA..ETHIOPIC SYLLABLE KXO 12C0 ; Grapheme_Base # Lo ETHIOPIC SYLLABLE KXWA 12C2..12C5 ; Grapheme_Base # Lo [4] ETHIOPIC SYLLABLE KXWI..ETHIOPIC SYLLABLE KXWE 12C8..12D6 ; Grapheme_Base # Lo [15] ETHIOPIC SYLLABLE WA..ETHIOPIC SYLLABLE PHARYNGEAL O 12D8..1310 ; Grapheme_Base # Lo [57] ETHIOPIC SYLLABLE ZA..ETHIOPIC SYLLABLE GWA 1312..1315 ; Grapheme_Base # Lo [4] ETHIOPIC SYLLABLE GWI..ETHIOPIC SYLLABLE GWE 1318..135A ; Grapheme_Base # Lo [67] ETHIOPIC SYLLABLE GGA..ETHIOPIC SYLLABLE FYA 1360..1368 ; Grapheme_Base # Po [9] ETHIOPIC SECTION MARK..ETHIOPIC PARAGRAPH SEPARATOR 1369..137C ; Grapheme_Base # No [20] ETHIOPIC DIGIT ONE..ETHIOPIC NUMBER TEN THOUSAND 1380..138F ; Grapheme_Base # Lo [16] ETHIOPIC SYLLABLE SEBATBEIT MWA..ETHIOPIC SYLLABLE PWE 1390..1399 ; Grapheme_Base # So [10] ETHIOPIC TONAL MARK YIZET..ETHIOPIC TONAL MARK KURT 13A0..13F5 ; Grapheme_Base # L& [86] CHEROKEE LETTER A..CHEROKEE LETTER MV 13F8..13FD ; Grapheme_Base # L& [6] CHEROKEE SMALL LETTER YE..CHEROKEE SMALL LETTER MV 1400 ; Grapheme_Base # Pd CANADIAN SYLLABICS HYPHEN 1401..166C ; Grapheme_Base # Lo [620] CANADIAN SYLLABICS 
E..CANADIAN SYLLABICS CARRIER TTSA 166D ; Grapheme_Base # So CANADIAN SYLLABICS CHI SIGN 166E ; Grapheme_Base # Po CANADIAN SYLLABICS FULL STOP 166F..167F ; Grapheme_Base # Lo [17] CANADIAN SYLLABICS QAI..CANADIAN SYLLABICS BLACKFOOT W 1680 ; Grapheme_Base # Zs OGHAM SPACE MARK 1681..169A ; Grapheme_Base # Lo [26] OGHAM LETTER BEITH..OGHAM LETTER PEITH 169B ; Grapheme_Base # Ps OGHAM FEATHER MARK 169C ; Grapheme_Base # Pe OGHAM REVERSED FEATHER MARK 16A0..16EA ; Grapheme_Base # Lo [75] RUNIC LETTER FEHU FEOH FE F..RUNIC LETTER X 16EB..16ED ; Grapheme_Base # Po [3] RUNIC SINGLE PUNCTUATION..RUNIC CROSS PUNCTUATION 16EE..16F0 ; Grapheme_Base # Nl [3] RUNIC ARLAUG SYMBOL..RUNIC BELGTHOR SYMBOL 16F1..16F8 ; Grapheme_Base # Lo [8] RUNIC LETTER K..RUNIC LETTER FRANKS CASKET AESC 1700..1711 ; Grapheme_Base # Lo [18] TAGALOG LETTER A..TAGALOG LETTER HA 171F..1731 ; Grapheme_Base # Lo [19] TAGALOG LETTER ARCHAIC RA..HANUNOO LETTER HA 1735..1736 ; Grapheme_Base # Po [2] PHILIPPINE SINGLE PUNCTUATION..PHILIPPINE DOUBLE PUNCTUATION 1740..1751 ; Grapheme_Base # Lo [18] BUHID LETTER A..BUHID LETTER HA 1760..176C ; Grapheme_Base # Lo [13] TAGBANWA LETTER A..TAGBANWA LETTER YA 176E..1770 ; Grapheme_Base # Lo [3] TAGBANWA LETTER LA..TAGBANWA LETTER SA 1780..17B3 ; Grapheme_Base # Lo [52] KHMER LETTER KA..KHMER INDEPENDENT VOWEL QAU 17B6 ; Grapheme_Base # Mc KHMER VOWEL SIGN AA 17BE..17C5 ; Grapheme_Base # Mc [8] KHMER VOWEL SIGN OE..KHMER VOWEL SIGN AU 17C7..17C8 ; Grapheme_Base # Mc [2] KHMER SIGN REAHMUK..KHMER SIGN YUUKALEAPINTU 17D4..17D6 ; Grapheme_Base # Po [3] KHMER SIGN KHAN..KHMER SIGN CAMNUC PII KUUH 17D7 ; Grapheme_Base # Lm KHMER SIGN LEK TOO 17D8..17DA ; Grapheme_Base # Po [3] KHMER SIGN BEYYAL..KHMER SIGN KOOMUUT 17DB ; Grapheme_Base # Sc KHMER CURRENCY SYMBOL RIEL 17DC ; Grapheme_Base # Lo KHMER SIGN AVAKRAHASANYA 17E0..17E9 ; Grapheme_Base # Nd [10] KHMER DIGIT ZERO..KHMER DIGIT NINE 17F0..17F9 ; Grapheme_Base # No [10] KHMER SYMBOL LEK ATTAK SON..KHMER SYMBOL LEK 
ATTAK PRAM-BUON 1800..1805 ; Grapheme_Base # Po [6] MONGOLIAN BIRGA..MONGOLIAN FOUR DOTS 1806 ; Grapheme_Base # Pd MONGOLIAN TODO SOFT HYPHEN 1807..180A ; Grapheme_Base # Po [4] MONGOLIAN SIBE SYLLABLE BOUNDARY MARKER..MONGOLIAN NIRUGU 1810..1819 ; Grapheme_Base # Nd [10] MONGOLIAN DIGIT ZERO..MONGOLIAN DIGIT NINE 1820..1842 ; Grapheme_Base # Lo [35] MONGOLIAN LETTER A..MONGOLIAN LETTER CHI 1843 ; Grapheme_Base # Lm MONGOLIAN LETTER TODO LONG VOWEL SIGN 1844..1878 ; Grapheme_Base # Lo [53] MONGOLIAN LETTER TODO E..MONGOLIAN LETTER CHA WITH TWO DOTS 1880..1884 ; Grapheme_Base # Lo [5] MONGOLIAN LETTER ALI GALI ANUSVARA ONE..MONGOLIAN LETTER ALI GALI INVERTED UBADAMA 1887..18A8 ; Grapheme_Base # Lo [34] MONGOLIAN LETTER ALI GALI A..MONGOLIAN LETTER MANCHU ALI GALI BHA 18AA ; Grapheme_Base # Lo MONGOLIAN LETTER MANCHU ALI GALI LHA 18B0..18F5 ; Grapheme_Base # Lo [70] CANADIAN SYLLABICS OY..CANADIAN SYLLABICS CARRIER DENTAL S 1900..191E ; Grapheme_Base # Lo [31] LIMBU VOWEL-CARRIER LETTER..LIMBU LETTER TRA 1923..1926 ; Grapheme_Base # Mc [4] LIMBU VOWEL SIGN EE..LIMBU VOWEL SIGN AU 1929..192B ; Grapheme_Base # Mc [3] LIMBU SUBJOINED LETTER YA..LIMBU SUBJOINED LETTER WA 1930..1931 ; Grapheme_Base # Mc [2] LIMBU SMALL LETTER KA..LIMBU SMALL LETTER NGA 1933..1938 ; Grapheme_Base # Mc [6] LIMBU SMALL LETTER TA..LIMBU SMALL LETTER LA 1940 ; Grapheme_Base # So LIMBU SIGN LOO 1944..1945 ; Grapheme_Base # Po [2] LIMBU EXCLAMATION MARK..LIMBU QUESTION MARK 1946..194F ; Grapheme_Base # Nd [10] LIMBU DIGIT ZERO..LIMBU DIGIT NINE 1950..196D ; Grapheme_Base # Lo [30] TAI LE LETTER KA..TAI LE LETTER AI 1970..1974 ; Grapheme_Base # Lo [5] TAI LE LETTER TONE-2..TAI LE LETTER TONE-6 1980..19AB ; Grapheme_Base # Lo [44] NEW TAI LUE LETTER HIGH QA..NEW TAI LUE LETTER LOW SUA 19B0..19C9 ; Grapheme_Base # Lo [26] NEW TAI LUE VOWEL SIGN VOWEL SHORTENER..NEW TAI LUE TONE MARK-2 19D0..19D9 ; Grapheme_Base # Nd [10] NEW TAI LUE DIGIT ZERO..NEW TAI LUE DIGIT NINE 19DA ; Grapheme_Base # No NEW 
TAI LUE THAM DIGIT ONE 19DE..19FF ; Grapheme_Base # So [34] NEW TAI LUE SIGN LAE..KHMER SYMBOL DAP-PRAM ROC 1A00..1A16 ; Grapheme_Base # Lo [23] BUGINESE LETTER KA..BUGINESE LETTER HA 1A19..1A1A ; Grapheme_Base # Mc [2] BUGINESE VOWEL SIGN E..BUGINESE VOWEL SIGN O 1A1E..1A1F ; Grapheme_Base # Po [2] BUGINESE PALLAWA..BUGINESE END OF SECTION 1A20..1A54 ; Grapheme_Base # Lo [53] TAI THAM LETTER HIGH KA..TAI THAM LETTER GREAT SA 1A55 ; Grapheme_Base # Mc TAI THAM CONSONANT SIGN MEDIAL RA 1A57 ; Grapheme_Base # Mc TAI THAM CONSONANT SIGN LA TANG LAI 1A61 ; Grapheme_Base # Mc TAI THAM VOWEL SIGN A 1A63..1A64 ; Grapheme_Base # Mc [2] TAI THAM VOWEL SIGN AA..TAI THAM VOWEL SIGN TALL AA 1A6D..1A72 ; Grapheme_Base # Mc [6] TAI THAM VOWEL SIGN OY..TAI THAM VOWEL SIGN THAM AI 1A80..1A89 ; Grapheme_Base # Nd [10] TAI THAM HORA DIGIT ZERO..TAI THAM HORA DIGIT NINE 1A90..1A99 ; Grapheme_Base # Nd [10] TAI THAM THAM DIGIT ZERO..TAI THAM THAM DIGIT NINE 1AA0..1AA6 ; Grapheme_Base # Po [7] TAI THAM SIGN WIANG..TAI THAM SIGN REVERSED ROTATED RANA 1AA7 ; Grapheme_Base # Lm TAI THAM SIGN MAI YAMOK 1AA8..1AAD ; Grapheme_Base # Po [6] TAI THAM SIGN KAAN..TAI THAM SIGN CAANG 1B04 ; Grapheme_Base # Mc BALINESE SIGN BISAH 1B05..1B33 ; Grapheme_Base # Lo [47] BALINESE LETTER AKARA..BALINESE LETTER HA 1B3E..1B41 ; Grapheme_Base # Mc [4] BALINESE VOWEL SIGN TALING..BALINESE VOWEL SIGN TALING REPA TEDUNG 1B45..1B4C ; Grapheme_Base # Lo [8] BALINESE LETTER KAF SASAK..BALINESE LETTER ARCHAIC JNYA 1B4E..1B4F ; Grapheme_Base # Po [2] BALINESE INVERTED CARIK SIKI..BALINESE INVERTED CARIK PAREREN 1B50..1B59 ; Grapheme_Base # Nd [10] BALINESE DIGIT ZERO..BALINESE DIGIT NINE 1B5A..1B60 ; Grapheme_Base # Po [7] BALINESE PANTI..BALINESE PAMENENG 1B61..1B6A ; Grapheme_Base # So [10] BALINESE MUSICAL SYMBOL DONG..BALINESE MUSICAL SYMBOL DANG GEDE 1B74..1B7C ; Grapheme_Base # So [9] BALINESE MUSICAL SYMBOL RIGHT-HAND OPEN DUG..BALINESE MUSICAL SYMBOL LEFT-HAND OPEN PING 1B7D..1B7F ; Grapheme_Base # Po [3] 
BALINESE PANTI LANTANG..BALINESE PANTI BAWAK 1B82 ; Grapheme_Base # Mc SUNDANESE SIGN PANGWISAD 1B83..1BA0 ; Grapheme_Base # Lo [30] SUNDANESE LETTER A..SUNDANESE LETTER HA 1BA1 ; Grapheme_Base # Mc SUNDANESE CONSONANT SIGN PAMINGKAL 1BA6..1BA7 ; Grapheme_Base # Mc [2] SUNDANESE VOWEL SIGN PANAELAENG..SUNDANESE VOWEL SIGN PANOLONG 1BAE..1BAF ; Grapheme_Base # Lo [2] SUNDANESE LETTER KHA..SUNDANESE LETTER SYA 1BB0..1BB9 ; Grapheme_Base # Nd [10] SUNDANESE DIGIT ZERO..SUNDANESE DIGIT NINE 1BBA..1BE5 ; Grapheme_Base # Lo [44] SUNDANESE AVAGRAHA..BATAK LETTER U 1BE7 ; Grapheme_Base # Mc BATAK VOWEL SIGN E 1BEA..1BEC ; Grapheme_Base # Mc [3] BATAK VOWEL SIGN I..BATAK VOWEL SIGN O 1BEE ; Grapheme_Base # Mc BATAK VOWEL SIGN U 1BFC..1BFF ; Grapheme_Base # Po [4] BATAK SYMBOL BINDU NA METEK..BATAK SYMBOL BINDU PANGOLAT 1C00..1C23 ; Grapheme_Base # Lo [36] LEPCHA LETTER KA..LEPCHA LETTER A 1C24..1C2B ; Grapheme_Base # Mc [8] LEPCHA SUBJOINED LETTER YA..LEPCHA VOWEL SIGN UU 1C34..1C35 ; Grapheme_Base # Mc [2] LEPCHA CONSONANT SIGN NYIN-DO..LEPCHA CONSONANT SIGN KANG 1C3B..1C3F ; Grapheme_Base # Po [5] LEPCHA PUNCTUATION TA-ROL..LEPCHA PUNCTUATION TSHOOK 1C40..1C49 ; Grapheme_Base # Nd [10] LEPCHA DIGIT ZERO..LEPCHA DIGIT NINE 1C4D..1C4F ; Grapheme_Base # Lo [3] LEPCHA LETTER TTA..LEPCHA LETTER DDA 1C50..1C59 ; Grapheme_Base # Nd [10] OL CHIKI DIGIT ZERO..OL CHIKI DIGIT NINE 1C5A..1C77 ; Grapheme_Base # Lo [30] OL CHIKI LETTER LA..OL CHIKI LETTER OH 1C78..1C7D ; Grapheme_Base # Lm [6] OL CHIKI MU TTUDDAG..OL CHIKI AHAD 1C7E..1C7F ; Grapheme_Base # Po [2] OL CHIKI PUNCTUATION MUCAAD..OL CHIKI PUNCTUATION DOUBLE MUCAAD 1C80..1C8A ; Grapheme_Base # L& [11] CYRILLIC SMALL LETTER ROUNDED VE..CYRILLIC SMALL LETTER TJE 1C90..1CBA ; Grapheme_Base # L& [43] GEORGIAN MTAVRULI CAPITAL LETTER AN..GEORGIAN MTAVRULI CAPITAL LETTER AIN 1CBD..1CBF ; Grapheme_Base # L& [3] GEORGIAN MTAVRULI CAPITAL LETTER AEN..GEORGIAN MTAVRULI CAPITAL LETTER LABIAL SIGN 1CC0..1CC7 ; Grapheme_Base # Po [8] 
SUNDANESE PUNCTUATION BINDU SURYA..SUNDANESE PUNCTUATION BINDU BA SATANGA 1CD3 ; Grapheme_Base # Po VEDIC SIGN NIHSHVASA 1CE1 ; Grapheme_Base # Mc VEDIC TONE ATHARVAVEDIC INDEPENDENT SVARITA 1CE9..1CEC ; Grapheme_Base # Lo [4] VEDIC SIGN ANUSVARA ANTARGOMUKHA..VEDIC SIGN ANUSVARA VAMAGOMUKHA WITH TAIL 1CEE..1CF3 ; Grapheme_Base # Lo [6] VEDIC SIGN HEXIFORM LONG ANUSVARA..VEDIC SIGN ROTATED ARDHAVISARGA 1CF5..1CF6 ; Grapheme_Base # Lo [2] VEDIC SIGN JIHVAMULIYA..VEDIC SIGN UPADHMANIYA 1CF7 ; Grapheme_Base # Mc VEDIC SIGN ATIKRAMA 1CFA ; Grapheme_Base # Lo VEDIC SIGN DOUBLE ANUSVARA ANTARGOMUKHA 1D00..1D2B ; Grapheme_Base # L& [44] LATIN LETTER SMALL CAPITAL A..CYRILLIC LETTER SMALL CAPITAL EL 1D2C..1D6A ; Grapheme_Base # Lm [63] MODIFIER LETTER CAPITAL A..GREEK SUBSCRIPT SMALL LETTER CHI 1D6B..1D77 ; Grapheme_Base # L& [13] LATIN SMALL LETTER UE..LATIN SMALL LETTER TURNED G 1D78 ; Grapheme_Base # Lm MODIFIER LETTER CYRILLIC EN 1D79..1D9A ; Grapheme_Base # L& [34] LATIN SMALL LETTER INSULAR G..LATIN SMALL LETTER EZH WITH RETROFLEX HOOK 1D9B..1DBF ; Grapheme_Base # Lm [37] MODIFIER LETTER SMALL TURNED ALPHA..MODIFIER LETTER SMALL THETA 1E00..1F15 ; Grapheme_Base # L& [278] LATIN CAPITAL LETTER A WITH RING BELOW..GREEK SMALL LETTER EPSILON WITH DASIA AND OXIA 1F18..1F1D ; Grapheme_Base # L& [6] GREEK CAPITAL LETTER EPSILON WITH PSILI..GREEK CAPITAL LETTER EPSILON WITH DASIA AND OXIA 1F20..1F45 ; Grapheme_Base # L& [38] GREEK SMALL LETTER ETA WITH PSILI..GREEK SMALL LETTER OMICRON WITH DASIA AND OXIA 1F48..1F4D ; Grapheme_Base # L& [6] GREEK CAPITAL LETTER OMICRON WITH PSILI..GREEK CAPITAL LETTER OMICRON WITH DASIA AND OXIA 1F50..1F57 ; Grapheme_Base # L& [8] GREEK SMALL LETTER UPSILON WITH PSILI..GREEK SMALL LETTER UPSILON WITH DASIA AND PERISPOMENI 1F59 ; Grapheme_Base # L& GREEK CAPITAL LETTER UPSILON WITH DASIA 1F5B ; Grapheme_Base # L& GREEK CAPITAL LETTER UPSILON WITH DASIA AND VARIA 1F5D ; Grapheme_Base # L& GREEK CAPITAL LETTER UPSILON WITH DASIA AND OXIA 
1F5F..1F7D ; Grapheme_Base # L& [31] GREEK CAPITAL LETTER UPSILON WITH DASIA AND PERISPOMENI..GREEK SMALL LETTER OMEGA WITH OXIA 1F80..1FB4 ; Grapheme_Base # L& [53] GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI..GREEK SMALL LETTER ALPHA WITH OXIA AND YPOGEGRAMMENI 1FB6..1FBC ; Grapheme_Base # L& [7] GREEK SMALL LETTER ALPHA WITH PERISPOMENI..GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI 1FBD ; Grapheme_Base # Sk GREEK KORONIS 1FBE ; Grapheme_Base # L& GREEK PROSGEGRAMMENI 1FBF..1FC1 ; Grapheme_Base # Sk [3] GREEK PSILI..GREEK DIALYTIKA AND PERISPOMENI 1FC2..1FC4 ; Grapheme_Base # L& [3] GREEK SMALL LETTER ETA WITH VARIA AND YPOGEGRAMMENI..GREEK SMALL LETTER ETA WITH OXIA AND YPOGEGRAMMENI 1FC6..1FCC ; Grapheme_Base # L& [7] GREEK SMALL LETTER ETA WITH PERISPOMENI..GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI 1FCD..1FCF ; Grapheme_Base # Sk [3] GREEK PSILI AND VARIA..GREEK PSILI AND PERISPOMENI 1FD0..1FD3 ; Grapheme_Base # L& [4] GREEK SMALL LETTER IOTA WITH VRACHY..GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA 1FD6..1FDB ; Grapheme_Base # L& [6] GREEK SMALL LETTER IOTA WITH PERISPOMENI..GREEK CAPITAL LETTER IOTA WITH OXIA 1FDD..1FDF ; Grapheme_Base # Sk [3] GREEK DASIA AND VARIA..GREEK DASIA AND PERISPOMENI 1FE0..1FEC ; Grapheme_Base # L& [13] GREEK SMALL LETTER UPSILON WITH VRACHY..GREEK CAPITAL LETTER RHO WITH DASIA 1FED..1FEF ; Grapheme_Base # Sk [3] GREEK DIALYTIKA AND VARIA..GREEK VARIA 1FF2..1FF4 ; Grapheme_Base # L& [3] GREEK SMALL LETTER OMEGA WITH VARIA AND YPOGEGRAMMENI..GREEK SMALL LETTER OMEGA WITH OXIA AND YPOGEGRAMMENI 1FF6..1FFC ; Grapheme_Base # L& [7] GREEK SMALL LETTER OMEGA WITH PERISPOMENI..GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI 1FFD..1FFE ; Grapheme_Base # Sk [2] GREEK OXIA..GREEK DASIA 2000..200A ; Grapheme_Base # Zs [11] EN QUAD..HAIR SPACE 2010..2015 ; Grapheme_Base # Pd [6] HYPHEN..HORIZONTAL BAR 2016..2017 ; Grapheme_Base # Po [2] DOUBLE VERTICAL LINE..DOUBLE LOW LINE 2018 ; Grapheme_Base # Pi LEFT SINGLE QUOTATION MARK 
2019 ; Grapheme_Base # Pf RIGHT SINGLE QUOTATION MARK 201A ; Grapheme_Base # Ps SINGLE LOW-9 QUOTATION MARK 201B..201C ; Grapheme_Base # Pi [2] SINGLE HIGH-REVERSED-9 QUOTATION MARK..LEFT DOUBLE QUOTATION MARK 201D ; Grapheme_Base # Pf RIGHT DOUBLE QUOTATION MARK 201E ; Grapheme_Base # Ps DOUBLE LOW-9 QUOTATION MARK 201F ; Grapheme_Base # Pi DOUBLE HIGH-REVERSED-9 QUOTATION MARK 2020..2027 ; Grapheme_Base # Po [8] DAGGER..HYPHENATION POINT 202F ; Grapheme_Base # Zs NARROW NO-BREAK SPACE 2030..2038 ; Grapheme_Base # Po [9] PER MILLE SIGN..CARET 2039 ; Grapheme_Base # Pi SINGLE LEFT-POINTING ANGLE QUOTATION MARK 203A ; Grapheme_Base # Pf SINGLE RIGHT-POINTING ANGLE QUOTATION MARK 203B..203E ; Grapheme_Base # Po [4] REFERENCE MARK..OVERLINE 203F..2040 ; Grapheme_Base # Pc [2] UNDERTIE..CHARACTER TIE 2041..2043 ; Grapheme_Base # Po [3] CARET INSERTION POINT..HYPHEN BULLET 2044 ; Grapheme_Base # Sm FRACTION SLASH 2045 ; Grapheme_Base # Ps LEFT SQUARE BRACKET WITH QUILL 2046 ; Grapheme_Base # Pe RIGHT SQUARE BRACKET WITH QUILL 2047..2051 ; Grapheme_Base # Po [11] DOUBLE QUESTION MARK..TWO ASTERISKS ALIGNED VERTICALLY 2052 ; Grapheme_Base # Sm COMMERCIAL MINUS SIGN 2053 ; Grapheme_Base # Po SWUNG DASH 2054 ; Grapheme_Base # Pc INVERTED UNDERTIE 2055..205E ; Grapheme_Base # Po [10] FLOWER PUNCTUATION MARK..VERTICAL FOUR DOTS 205F ; Grapheme_Base # Zs MEDIUM MATHEMATICAL SPACE 2070 ; Grapheme_Base # No SUPERSCRIPT ZERO 2071 ; Grapheme_Base # Lm SUPERSCRIPT LATIN SMALL LETTER I 2074..2079 ; Grapheme_Base # No [6] SUPERSCRIPT FOUR..SUPERSCRIPT NINE 207A..207C ; Grapheme_Base # Sm [3] SUPERSCRIPT PLUS SIGN..SUPERSCRIPT EQUALS SIGN 207D ; Grapheme_Base # Ps SUPERSCRIPT LEFT PARENTHESIS 207E ; Grapheme_Base # Pe SUPERSCRIPT RIGHT PARENTHESIS 207F ; Grapheme_Base # Lm SUPERSCRIPT LATIN SMALL LETTER N 2080..2089 ; Grapheme_Base # No [10] SUBSCRIPT ZERO..SUBSCRIPT NINE 208A..208C ; Grapheme_Base # Sm [3] SUBSCRIPT PLUS SIGN..SUBSCRIPT EQUALS SIGN 208D ; Grapheme_Base # Ps SUBSCRIPT 
LEFT PARENTHESIS 208E ; Grapheme_Base # Pe SUBSCRIPT RIGHT PARENTHESIS 2090..209C ; Grapheme_Base # Lm [13] LATIN SUBSCRIPT SMALL LETTER A..LATIN SUBSCRIPT SMALL LETTER T 20A0..20C1 ; Grapheme_Base # Sc [34] EURO-CURRENCY SIGN..SAUDI RIYAL SIGN 2100..2101 ; Grapheme_Base # So [2] ACCOUNT OF..ADDRESSED TO THE SUBJECT 2102 ; Grapheme_Base # L& DOUBLE-STRUCK CAPITAL C 2103..2106 ; Grapheme_Base # So [4] DEGREE CELSIUS..CADA UNA 2107 ; Grapheme_Base # L& EULER CONSTANT 2108..2109 ; Grapheme_Base # So [2] SCRUPLE..DEGREE FAHRENHEIT 210A..2113 ; Grapheme_Base # L& [10] SCRIPT SMALL G..SCRIPT SMALL L 2114 ; Grapheme_Base # So L B BAR SYMBOL 2115 ; Grapheme_Base # L& DOUBLE-STRUCK CAPITAL N 2116..2117 ; Grapheme_Base # So [2] NUMERO SIGN..SOUND RECORDING COPYRIGHT 2118 ; Grapheme_Base # Sm SCRIPT CAPITAL P 2119..211D ; Grapheme_Base # L& [5] DOUBLE-STRUCK CAPITAL P..DOUBLE-STRUCK CAPITAL R 211E..2123 ; Grapheme_Base # So [6] PRESCRIPTION TAKE..VERSICLE 2124 ; Grapheme_Base # L& DOUBLE-STRUCK CAPITAL Z 2125 ; Grapheme_Base # So OUNCE SIGN 2126 ; Grapheme_Base # L& OHM SIGN 2127 ; Grapheme_Base # So INVERTED OHM SIGN 2128 ; Grapheme_Base # L& BLACK-LETTER CAPITAL Z 2129 ; Grapheme_Base # So TURNED GREEK SMALL LETTER IOTA 212A..212D ; Grapheme_Base # L& [4] KELVIN SIGN..BLACK-LETTER CAPITAL C 212E ; Grapheme_Base # So ESTIMATED SYMBOL 212F..2134 ; Grapheme_Base # L& [6] SCRIPT SMALL E..SCRIPT SMALL O 2135..2138 ; Grapheme_Base # Lo [4] ALEF SYMBOL..DALET SYMBOL 2139 ; Grapheme_Base # L& INFORMATION SOURCE 213A..213B ; Grapheme_Base # So [2] ROTATED CAPITAL Q..FACSIMILE SIGN 213C..213F ; Grapheme_Base # L& [4] DOUBLE-STRUCK SMALL PI..DOUBLE-STRUCK CAPITAL PI 2140..2144 ; Grapheme_Base # Sm [5] DOUBLE-STRUCK N-ARY SUMMATION..TURNED SANS-SERIF CAPITAL Y 2145..2149 ; Grapheme_Base # L& [5] DOUBLE-STRUCK ITALIC CAPITAL D..DOUBLE-STRUCK ITALIC SMALL J 214A ; Grapheme_Base # So PROPERTY LINE 214B ; Grapheme_Base # Sm TURNED AMPERSAND 214C..214D ; Grapheme_Base # So [2] PER 
SIGN..AKTIESELSKAB 214E ; Grapheme_Base # L& TURNED SMALL F 214F ; Grapheme_Base # So SYMBOL FOR SAMARITAN SOURCE 2150..215F ; Grapheme_Base # No [16] VULGAR FRACTION ONE SEVENTH..FRACTION NUMERATOR ONE 2160..2182 ; Grapheme_Base # Nl [35] ROMAN NUMERAL ONE..ROMAN NUMERAL TEN THOUSAND 2183..2184 ; Grapheme_Base # L& [2] ROMAN NUMERAL REVERSED ONE HUNDRED..LATIN SMALL LETTER REVERSED C 2185..2188 ; Grapheme_Base # Nl [4] ROMAN NUMERAL SIX LATE FORM..ROMAN NUMERAL ONE HUNDRED THOUSAND 2189 ; Grapheme_Base # No VULGAR FRACTION ZERO THIRDS 218A..218B ; Grapheme_Base # So [2] TURNED DIGIT TWO..TURNED DIGIT THREE 2190..2194 ; Grapheme_Base # Sm [5] LEFTWARDS ARROW..LEFT RIGHT ARROW 2195..2199 ; Grapheme_Base # So [5] UP DOWN ARROW..SOUTH WEST ARROW 219A..219B ; Grapheme_Base # Sm [2] LEFTWARDS ARROW WITH STROKE..RIGHTWARDS ARROW WITH STROKE 219C..219F ; Grapheme_Base # So [4] LEFTWARDS WAVE ARROW..UPWARDS TWO HEADED ARROW 21A0 ; Grapheme_Base # Sm RIGHTWARDS TWO HEADED ARROW 21A1..21A2 ; Grapheme_Base # So [2] DOWNWARDS TWO HEADED ARROW..LEFTWARDS ARROW WITH TAIL 21A3 ; Grapheme_Base # Sm RIGHTWARDS ARROW WITH TAIL 21A4..21A5 ; Grapheme_Base # So [2] LEFTWARDS ARROW FROM BAR..UPWARDS ARROW FROM BAR 21A6 ; Grapheme_Base # Sm RIGHTWARDS ARROW FROM BAR 21A7..21AD ; Grapheme_Base # So [7] DOWNWARDS ARROW FROM BAR..LEFT RIGHT WAVE ARROW 21AE ; Grapheme_Base # Sm LEFT RIGHT ARROW WITH STROKE 21AF..21CD ; Grapheme_Base # So [31] DOWNWARDS ZIGZAG ARROW..LEFTWARDS DOUBLE ARROW WITH STROKE 21CE..21CF ; Grapheme_Base # Sm [2] LEFT RIGHT DOUBLE ARROW WITH STROKE..RIGHTWARDS DOUBLE ARROW WITH STROKE 21D0..21D1 ; Grapheme_Base # So [2] LEFTWARDS DOUBLE ARROW..UPWARDS DOUBLE ARROW 21D2 ; Grapheme_Base # Sm RIGHTWARDS DOUBLE ARROW 21D3 ; Grapheme_Base # So DOWNWARDS DOUBLE ARROW 21D4 ; Grapheme_Base # Sm LEFT RIGHT DOUBLE ARROW 21D5..21F3 ; Grapheme_Base # So [31] UP DOWN DOUBLE ARROW..UP DOWN WHITE ARROW 21F4..22FF ; Grapheme_Base # Sm [268] RIGHT ARROW WITH SMALL CIRCLE..Z NOTATION 
BAG MEMBERSHIP 2300..2307 ; Grapheme_Base # So [8] DIAMETER SIGN..WAVY LINE 2308 ; Grapheme_Base # Ps LEFT CEILING 2309 ; Grapheme_Base # Pe RIGHT CEILING 230A ; Grapheme_Base # Ps LEFT FLOOR 230B ; Grapheme_Base # Pe RIGHT FLOOR 230C..231F ; Grapheme_Base # So [20] BOTTOM RIGHT CROP..BOTTOM RIGHT CORNER 2320..2321 ; Grapheme_Base # Sm [2] TOP HALF INTEGRAL..BOTTOM HALF INTEGRAL 2322..2328 ; Grapheme_Base # So [7] FROWN..KEYBOARD 2329 ; Grapheme_Base # Ps LEFT-POINTING ANGLE BRACKET 232A ; Grapheme_Base # Pe RIGHT-POINTING ANGLE BRACKET 232B..237B ; Grapheme_Base # So [81] ERASE TO THE LEFT..NOT CHECK MARK 237C ; Grapheme_Base # Sm RIGHT ANGLE WITH DOWNWARDS ZIGZAG ARROW 237D..239A ; Grapheme_Base # So [30] SHOULDERED OPEN BOX..CLEAR SCREEN SYMBOL 239B..23B3 ; Grapheme_Base # Sm [25] LEFT PARENTHESIS UPPER HOOK..SUMMATION BOTTOM 23B4..23DB ; Grapheme_Base # So [40] TOP SQUARE BRACKET..FUSE 23DC..23E1 ; Grapheme_Base # Sm [6] TOP PARENTHESIS..BOTTOM TORTOISE SHELL BRACKET 23E2..2429 ; Grapheme_Base # So [72] WHITE TRAPEZIUM..SYMBOL FOR DELETE MEDIUM SHADE FORM 2440..244A ; Grapheme_Base # So [11] OCR HOOK..OCR DOUBLE BACKSLASH 2460..249B ; Grapheme_Base # No [60] CIRCLED DIGIT ONE..NUMBER TWENTY FULL STOP 249C..24E9 ; Grapheme_Base # So [78] PARENTHESIZED LATIN SMALL LETTER A..CIRCLED LATIN SMALL LETTER Z 24EA..24FF ; Grapheme_Base # No [22] CIRCLED DIGIT ZERO..NEGATIVE CIRCLED DIGIT ZERO 2500..25B6 ; Grapheme_Base # So [183] BOX DRAWINGS LIGHT HORIZONTAL..BLACK RIGHT-POINTING TRIANGLE 25B7 ; Grapheme_Base # Sm WHITE RIGHT-POINTING TRIANGLE 25B8..25C0 ; Grapheme_Base # So [9] BLACK RIGHT-POINTING SMALL TRIANGLE..BLACK LEFT-POINTING TRIANGLE 25C1 ; Grapheme_Base # Sm WHITE LEFT-POINTING TRIANGLE 25C2..25F7 ; Grapheme_Base # So [54] BLACK LEFT-POINTING SMALL TRIANGLE..WHITE CIRCLE WITH UPPER RIGHT QUADRANT 25F8..25FF ; Grapheme_Base # Sm [8] UPPER LEFT TRIANGLE..LOWER RIGHT TRIANGLE 2600..266E ; Grapheme_Base # So [111] BLACK SUN WITH RAYS..MUSIC NATURAL SIGN 266F ; 
Grapheme_Base # Sm MUSIC SHARP SIGN 2670..2767 ; Grapheme_Base # So [248] WEST SYRIAC CROSS..ROTATED FLORAL HEART BULLET 2768 ; Grapheme_Base # Ps MEDIUM LEFT PARENTHESIS ORNAMENT 2769 ; Grapheme_Base # Pe MEDIUM RIGHT PARENTHESIS ORNAMENT 276A ; Grapheme_Base # Ps MEDIUM FLATTENED LEFT PARENTHESIS ORNAMENT 276B ; Grapheme_Base # Pe MEDIUM FLATTENED RIGHT PARENTHESIS ORNAMENT 276C ; Grapheme_Base # Ps MEDIUM LEFT-POINTING ANGLE BRACKET ORNAMENT 276D ; Grapheme_Base # Pe MEDIUM RIGHT-POINTING ANGLE BRACKET ORNAMENT 276E ; Grapheme_Base # Ps HEAVY LEFT-POINTING ANGLE QUOTATION MARK ORNAMENT 276F ; Grapheme_Base # Pe HEAVY RIGHT-POINTING ANGLE QUOTATION MARK ORNAMENT 2770 ; Grapheme_Base # Ps HEAVY LEFT-POINTING ANGLE BRACKET ORNAMENT 2771 ; Grapheme_Base # Pe HEAVY RIGHT-POINTING ANGLE BRACKET ORNAMENT 2772 ; Grapheme_Base # Ps LIGHT LEFT TORTOISE SHELL BRACKET ORNAMENT 2773 ; Grapheme_Base # Pe LIGHT RIGHT TORTOISE SHELL BRACKET ORNAMENT 2774 ; Grapheme_Base # Ps MEDIUM LEFT CURLY BRACKET ORNAMENT 2775 ; Grapheme_Base # Pe MEDIUM RIGHT CURLY BRACKET ORNAMENT 2776..2793 ; Grapheme_Base # No [30] DINGBAT NEGATIVE CIRCLED DIGIT ONE..DINGBAT NEGATIVE CIRCLED SANS-SERIF NUMBER TEN 2794..27BF ; Grapheme_Base # So [44] HEAVY WIDE-HEADED RIGHTWARDS ARROW..DOUBLE CURLY LOOP 27C0..27C4 ; Grapheme_Base # Sm [5] THREE DIMENSIONAL ANGLE..OPEN SUPERSET 27C5 ; Grapheme_Base # Ps LEFT S-SHAPED BAG DELIMITER 27C6 ; Grapheme_Base # Pe RIGHT S-SHAPED BAG DELIMITER 27C7..27E5 ; Grapheme_Base # Sm [31] OR WITH DOT INSIDE..WHITE SQUARE WITH RIGHTWARDS TICK 27E6 ; Grapheme_Base # Ps MATHEMATICAL LEFT WHITE SQUARE BRACKET 27E7 ; Grapheme_Base # Pe MATHEMATICAL RIGHT WHITE SQUARE BRACKET 27E8 ; Grapheme_Base # Ps MATHEMATICAL LEFT ANGLE BRACKET 27E9 ; Grapheme_Base # Pe MATHEMATICAL RIGHT ANGLE BRACKET 27EA ; Grapheme_Base # Ps MATHEMATICAL LEFT DOUBLE ANGLE BRACKET 27EB ; Grapheme_Base # Pe MATHEMATICAL RIGHT DOUBLE ANGLE BRACKET 27EC ; Grapheme_Base # Ps MATHEMATICAL LEFT WHITE TORTOISE 
SHELL BRACKET 27ED ; Grapheme_Base # Pe MATHEMATICAL RIGHT WHITE TORTOISE SHELL BRACKET 27EE ; Grapheme_Base # Ps MATHEMATICAL LEFT FLATTENED PARENTHESIS 27EF ; Grapheme_Base # Pe MATHEMATICAL RIGHT FLATTENED PARENTHESIS 27F0..27FF ; Grapheme_Base # Sm [16] UPWARDS QUADRUPLE ARROW..LONG RIGHTWARDS SQUIGGLE ARROW 2800..28FF ; Grapheme_Base # So [256] BRAILLE PATTERN BLANK..BRAILLE PATTERN DOTS-12345678 2900..2982 ; Grapheme_Base # Sm [131] RIGHTWARDS TWO-HEADED ARROW WITH VERTICAL STROKE..Z NOTATION TYPE COLON 2983 ; Grapheme_Base # Ps LEFT WHITE CURLY BRACKET 2984 ; Grapheme_Base # Pe RIGHT WHITE CURLY BRACKET 2985 ; Grapheme_Base # Ps LEFT WHITE PARENTHESIS 2986 ; Grapheme_Base # Pe RIGHT WHITE PARENTHESIS 2987 ; Grapheme_Base # Ps Z NOTATION LEFT IMAGE BRACKET 2988 ; Grapheme_Base # Pe Z NOTATION RIGHT IMAGE BRACKET 2989 ; Grapheme_Base # Ps Z NOTATION LEFT BINDING BRACKET 298A ; Grapheme_Base # Pe Z NOTATION RIGHT BINDING BRACKET 298B ; Grapheme_Base # Ps LEFT SQUARE BRACKET WITH UNDERBAR 298C ; Grapheme_Base # Pe RIGHT SQUARE BRACKET WITH UNDERBAR 298D ; Grapheme_Base # Ps LEFT SQUARE BRACKET WITH TICK IN TOP CORNER 298E ; Grapheme_Base # Pe RIGHT SQUARE BRACKET WITH TICK IN BOTTOM CORNER 298F ; Grapheme_Base # Ps LEFT SQUARE BRACKET WITH TICK IN BOTTOM CORNER 2990 ; Grapheme_Base # Pe RIGHT SQUARE BRACKET WITH TICK IN TOP CORNER 2991 ; Grapheme_Base # Ps LEFT ANGLE BRACKET WITH DOT 2992 ; Grapheme_Base # Pe RIGHT ANGLE BRACKET WITH DOT 2993 ; Grapheme_Base # Ps LEFT ARC LESS-THAN BRACKET 2994 ; Grapheme_Base # Pe RIGHT ARC GREATER-THAN BRACKET 2995 ; Grapheme_Base # Ps DOUBLE LEFT ARC GREATER-THAN BRACKET 2996 ; Grapheme_Base # Pe DOUBLE RIGHT ARC LESS-THAN BRACKET 2997 ; Grapheme_Base # Ps LEFT BLACK TORTOISE SHELL BRACKET 2998 ; Grapheme_Base # Pe RIGHT BLACK TORTOISE SHELL BRACKET 2999..29D7 ; Grapheme_Base # Sm [63] DOTTED FENCE..BLACK HOURGLASS 29D8 ; Grapheme_Base # Ps LEFT WIGGLY FENCE 29D9 ; Grapheme_Base # Pe RIGHT WIGGLY FENCE 29DA ; Grapheme_Base # 
Ps LEFT DOUBLE WIGGLY FENCE 29DB ; Grapheme_Base # Pe RIGHT DOUBLE WIGGLY FENCE 29DC..29FB ; Grapheme_Base # Sm [32] INCOMPLETE INFINITY..TRIPLE PLUS 29FC ; Grapheme_Base # Ps LEFT-POINTING CURVED ANGLE BRACKET 29FD ; Grapheme_Base # Pe RIGHT-POINTING CURVED ANGLE BRACKET 29FE..2AFF ; Grapheme_Base # Sm [258] TINY..N-ARY WHITE VERTICAL BAR 2B00..2B2F ; Grapheme_Base # So [48] NORTH EAST WHITE ARROW..WHITE VERTICAL ELLIPSE 2B30..2B44 ; Grapheme_Base # Sm [21] LEFT ARROW WITH SMALL CIRCLE..RIGHTWARDS ARROW THROUGH SUPERSET 2B45..2B46 ; Grapheme_Base # So [2] LEFTWARDS QUADRUPLE ARROW..RIGHTWARDS QUADRUPLE ARROW 2B47..2B4C ; Grapheme_Base # Sm [6] REVERSE TILDE OPERATOR ABOVE RIGHTWARDS ARROW..RIGHTWARDS ARROW ABOVE REVERSE TILDE OPERATOR 2B4D..2B73 ; Grapheme_Base # So [39] DOWNWARDS TRIANGLE-HEADED ZIGZAG ARROW..DOWNWARDS TRIANGLE-HEADED ARROW TO BAR 2B76..2BFF ; Grapheme_Base # So [138] NORTH WEST TRIANGLE-HEADED ARROW TO BAR..HELLSCHREIBER PAUSE SYMBOL 2C00..2C7B ; Grapheme_Base # L& [124] GLAGOLITIC CAPITAL LETTER AZU..LATIN LETTER SMALL CAPITAL TURNED E 2C7C..2C7D ; Grapheme_Base # Lm [2] LATIN SUBSCRIPT SMALL LETTER J..MODIFIER LETTER CAPITAL V 2C7E..2CE4 ; Grapheme_Base # L& [103] LATIN CAPITAL LETTER S WITH SWASH TAIL..COPTIC SYMBOL KAI 2CE5..2CEA ; Grapheme_Base # So [6] COPTIC SYMBOL MI RO..COPTIC SYMBOL SHIMA SIMA 2CEB..2CEE ; Grapheme_Base # L& [4] COPTIC CAPITAL LETTER CRYPTOGRAMMIC SHEI..COPTIC SMALL LETTER CRYPTOGRAMMIC GANGIA 2CF2..2CF3 ; Grapheme_Base # L& [2] COPTIC CAPITAL LETTER BOHAIRIC KHEI..COPTIC SMALL LETTER BOHAIRIC KHEI 2CF9..2CFC ; Grapheme_Base # Po [4] COPTIC OLD NUBIAN FULL STOP..COPTIC OLD NUBIAN VERSE DIVIDER 2CFD ; Grapheme_Base # No COPTIC FRACTION ONE HALF 2CFE..2CFF ; Grapheme_Base # Po [2] COPTIC FULL STOP..COPTIC MORPHOLOGICAL DIVIDER 2D00..2D25 ; Grapheme_Base # L& [38] GEORGIAN SMALL LETTER AN..GEORGIAN SMALL LETTER HOE 2D27 ; Grapheme_Base # L& GEORGIAN SMALL LETTER YN 2D2D ; Grapheme_Base # L& GEORGIAN SMALL LETTER AEN 
2D30..2D67 ; Grapheme_Base # Lo [56] TIFINAGH LETTER YA..TIFINAGH LETTER YO 2D6F ; Grapheme_Base # Lm TIFINAGH MODIFIER LETTER LABIALIZATION MARK 2D70 ; Grapheme_Base # Po TIFINAGH SEPARATOR MARK 2D80..2D96 ; Grapheme_Base # Lo [23] ETHIOPIC SYLLABLE LOA..ETHIOPIC SYLLABLE GGWE 2DA0..2DA6 ; Grapheme_Base # Lo [7] ETHIOPIC SYLLABLE SSA..ETHIOPIC SYLLABLE SSO 2DA8..2DAE ; Grapheme_Base # Lo [7] ETHIOPIC SYLLABLE CCA..ETHIOPIC SYLLABLE CCO 2DB0..2DB6 ; Grapheme_Base # Lo [7] ETHIOPIC SYLLABLE ZZA..ETHIOPIC SYLLABLE ZZO 2DB8..2DBE ; Grapheme_Base # Lo [7] ETHIOPIC SYLLABLE CCHA..ETHIOPIC SYLLABLE CCHO 2DC0..2DC6 ; Grapheme_Base # Lo [7] ETHIOPIC SYLLABLE QYA..ETHIOPIC SYLLABLE QYO 2DC8..2DCE ; Grapheme_Base # Lo [7] ETHIOPIC SYLLABLE KYA..ETHIOPIC SYLLABLE KYO 2DD0..2DD6 ; Grapheme_Base # Lo [7] ETHIOPIC SYLLABLE XYA..ETHIOPIC SYLLABLE XYO 2DD8..2DDE ; Grapheme_Base # Lo [7] ETHIOPIC SYLLABLE GYA..ETHIOPIC SYLLABLE GYO 2E00..2E01 ; Grapheme_Base # Po [2] RIGHT ANGLE SUBSTITUTION MARKER..RIGHT ANGLE DOTTED SUBSTITUTION MARKER 2E02 ; Grapheme_Base # Pi LEFT SUBSTITUTION BRACKET 2E03 ; Grapheme_Base # Pf RIGHT SUBSTITUTION BRACKET 2E04 ; Grapheme_Base # Pi LEFT DOTTED SUBSTITUTION BRACKET 2E05 ; Grapheme_Base # Pf RIGHT DOTTED SUBSTITUTION BRACKET 2E06..2E08 ; Grapheme_Base # Po [3] RAISED INTERPOLATION MARKER..DOTTED TRANSPOSITION MARKER 2E09 ; Grapheme_Base # Pi LEFT TRANSPOSITION BRACKET 2E0A ; Grapheme_Base # Pf RIGHT TRANSPOSITION BRACKET 2E0B ; Grapheme_Base # Po RAISED SQUARE 2E0C ; Grapheme_Base # Pi LEFT RAISED OMISSION BRACKET 2E0D ; Grapheme_Base # Pf RIGHT RAISED OMISSION BRACKET 2E0E..2E16 ; Grapheme_Base # Po [9] EDITORIAL CORONIS..DOTTED RIGHT-POINTING ANGLE 2E17 ; Grapheme_Base # Pd DOUBLE OBLIQUE HYPHEN 2E18..2E19 ; Grapheme_Base # Po [2] INVERTED INTERROBANG..PALM BRANCH 2E1A ; Grapheme_Base # Pd HYPHEN WITH DIAERESIS 2E1B ; Grapheme_Base # Po TILDE WITH RING ABOVE 2E1C ; Grapheme_Base # Pi LEFT LOW PARAPHRASE BRACKET 2E1D ; Grapheme_Base # Pf RIGHT LOW 
PARAPHRASE BRACKET 2E1E..2E1F ; Grapheme_Base # Po [2] TILDE WITH DOT ABOVE..TILDE WITH DOT BELOW 2E20 ; Grapheme_Base # Pi LEFT VERTICAL BAR WITH QUILL 2E21 ; Grapheme_Base # Pf RIGHT VERTICAL BAR WITH QUILL 2E22 ; Grapheme_Base # Ps TOP LEFT HALF BRACKET 2E23 ; Grapheme_Base # Pe TOP RIGHT HALF BRACKET 2E24 ; Grapheme_Base # Ps BOTTOM LEFT HALF BRACKET 2E25 ; Grapheme_Base # Pe BOTTOM RIGHT HALF BRACKET 2E26 ; Grapheme_Base # Ps LEFT SIDEWAYS U BRACKET 2E27 ; Grapheme_Base # Pe RIGHT SIDEWAYS U BRACKET 2E28 ; Grapheme_Base # Ps LEFT DOUBLE PARENTHESIS 2E29 ; Grapheme_Base # Pe RIGHT DOUBLE PARENTHESIS 2E2A..2E2E ; Grapheme_Base # Po [5] TWO DOTS OVER ONE DOT PUNCTUATION..REVERSED QUESTION MARK 2E2F ; Grapheme_Base # Lm VERTICAL TILDE 2E30..2E39 ; Grapheme_Base # Po [10] RING POINT..TOP HALF SECTION SIGN 2E3A..2E3B ; Grapheme_Base # Pd [2] TWO-EM DASH..THREE-EM DASH 2E3C..2E3F ; Grapheme_Base # Po [4] STENOGRAPHIC FULL STOP..CAPITULUM 2E40 ; Grapheme_Base # Pd DOUBLE HYPHEN 2E41 ; Grapheme_Base # Po REVERSED COMMA 2E42 ; Grapheme_Base # Ps DOUBLE LOW-REVERSED-9 QUOTATION MARK 2E43..2E4F ; Grapheme_Base # Po [13] DASH WITH LEFT UPTURN..CORNISH VERSE DIVIDER 2E50..2E51 ; Grapheme_Base # So [2] CROSS PATTY WITH RIGHT CROSSBAR..CROSS PATTY WITH LEFT CROSSBAR 2E52..2E54 ; Grapheme_Base # Po [3] TIRONIAN SIGN CAPITAL ET..MEDIEVAL QUESTION MARK 2E55 ; Grapheme_Base # Ps LEFT SQUARE BRACKET WITH STROKE 2E56 ; Grapheme_Base # Pe RIGHT SQUARE BRACKET WITH STROKE 2E57 ; Grapheme_Base # Ps LEFT SQUARE BRACKET WITH DOUBLE STROKE 2E58 ; Grapheme_Base # Pe RIGHT SQUARE BRACKET WITH DOUBLE STROKE 2E59 ; Grapheme_Base # Ps TOP HALF LEFT PARENTHESIS 2E5A ; Grapheme_Base # Pe TOP HALF RIGHT PARENTHESIS 2E5B ; Grapheme_Base # Ps BOTTOM HALF LEFT PARENTHESIS 2E5C ; Grapheme_Base # Pe BOTTOM HALF RIGHT PARENTHESIS 2E5D ; Grapheme_Base # Pd OBLIQUE HYPHEN 2E80..2E99 ; Grapheme_Base # So [26] CJK RADICAL REPEAT..CJK RADICAL RAP 2E9B..2EF3 ; Grapheme_Base # So [89] CJK RADICAL CHOKE..CJK 
RADICAL C-SIMPLIFIED TURTLE 2F00..2FD5 ; Grapheme_Base # So [214] KANGXI RADICAL ONE..KANGXI RADICAL FLUTE 2FF0..2FFF ; Grapheme_Base # So [16] IDEOGRAPHIC DESCRIPTION CHARACTER LEFT TO RIGHT..IDEOGRAPHIC DESCRIPTION CHARACTER ROTATION 3000 ; Grapheme_Base # Zs IDEOGRAPHIC SPACE 3001..3003 ; Grapheme_Base # Po [3] IDEOGRAPHIC COMMA..DITTO MARK 3004 ; Grapheme_Base # So JAPANESE INDUSTRIAL STANDARD SYMBOL 3005 ; Grapheme_Base # Lm IDEOGRAPHIC ITERATION MARK 3006 ; Grapheme_Base # Lo IDEOGRAPHIC CLOSING MARK 3007 ; Grapheme_Base # Nl IDEOGRAPHIC NUMBER ZERO 3008 ; Grapheme_Base # Ps LEFT ANGLE BRACKET 3009 ; Grapheme_Base # Pe RIGHT ANGLE BRACKET 300A ; Grapheme_Base # Ps LEFT DOUBLE ANGLE BRACKET 300B ; Grapheme_Base # Pe RIGHT DOUBLE ANGLE BRACKET 300C ; Grapheme_Base # Ps LEFT CORNER BRACKET 300D ; Grapheme_Base # Pe RIGHT CORNER BRACKET 300E ; Grapheme_Base # Ps LEFT WHITE CORNER BRACKET 300F ; Grapheme_Base # Pe RIGHT WHITE CORNER BRACKET 3010 ; Grapheme_Base # Ps LEFT BLACK LENTICULAR BRACKET 3011 ; Grapheme_Base # Pe RIGHT BLACK LENTICULAR BRACKET 3012..3013 ; Grapheme_Base # So [2] POSTAL MARK..GETA MARK 3014 ; Grapheme_Base # Ps LEFT TORTOISE SHELL BRACKET 3015 ; Grapheme_Base # Pe RIGHT TORTOISE SHELL BRACKET 3016 ; Grapheme_Base # Ps LEFT WHITE LENTICULAR BRACKET 3017 ; Grapheme_Base # Pe RIGHT WHITE LENTICULAR BRACKET 3018 ; Grapheme_Base # Ps LEFT WHITE TORTOISE SHELL BRACKET 3019 ; Grapheme_Base # Pe RIGHT WHITE TORTOISE SHELL BRACKET 301A ; Grapheme_Base # Ps LEFT WHITE SQUARE BRACKET 301B ; Grapheme_Base # Pe RIGHT WHITE SQUARE BRACKET 301C ; Grapheme_Base # Pd WAVE DASH 301D ; Grapheme_Base # Ps REVERSED DOUBLE PRIME QUOTATION MARK 301E..301F ; Grapheme_Base # Pe [2] DOUBLE PRIME QUOTATION MARK..LOW DOUBLE PRIME QUOTATION MARK 3020 ; Grapheme_Base # So POSTAL MARK FACE 3021..3029 ; Grapheme_Base # Nl [9] HANGZHOU NUMERAL ONE..HANGZHOU NUMERAL NINE 3030 ; Grapheme_Base # Pd WAVY DASH 3031..3035 ; Grapheme_Base # Lm [5] VERTICAL KANA REPEAT 
MARK..VERTICAL KANA REPEAT MARK LOWER HALF 3036..3037 ; Grapheme_Base # So [2] CIRCLED POSTAL MARK..IDEOGRAPHIC TELEGRAPH LINE FEED SEPARATOR SYMBOL 3038..303A ; Grapheme_Base # Nl [3] HANGZHOU NUMERAL TEN..HANGZHOU NUMERAL THIRTY 303B ; Grapheme_Base # Lm VERTICAL IDEOGRAPHIC ITERATION MARK 303C ; Grapheme_Base # Lo MASU MARK 303D ; Grapheme_Base # Po PART ALTERNATION MARK 303E..303F ; Grapheme_Base # So [2] IDEOGRAPHIC VARIATION INDICATOR..IDEOGRAPHIC HALF FILL SPACE 3041..3096 ; Grapheme_Base # Lo [86] HIRAGANA LETTER SMALL A..HIRAGANA LETTER SMALL KE 309B..309C ; Grapheme_Base # Sk [2] KATAKANA-HIRAGANA VOICED SOUND MARK..KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK 309D..309E ; Grapheme_Base # Lm [2] HIRAGANA ITERATION MARK..HIRAGANA VOICED ITERATION MARK 309F ; Grapheme_Base # Lo HIRAGANA DIGRAPH YORI 30A0 ; Grapheme_Base # Pd KATAKANA-HIRAGANA DOUBLE HYPHEN 30A1..30FA ; Grapheme_Base # Lo [90] KATAKANA LETTER SMALL A..KATAKANA LETTER VO 30FB ; Grapheme_Base # Po KATAKANA MIDDLE DOT 30FC..30FE ; Grapheme_Base # Lm [3] KATAKANA-HIRAGANA PROLONGED SOUND MARK..KATAKANA VOICED ITERATION MARK 30FF ; Grapheme_Base # Lo KATAKANA DIGRAPH KOTO 3105..312F ; Grapheme_Base # Lo [43] BOPOMOFO LETTER B..BOPOMOFO LETTER NN 3131..318E ; Grapheme_Base # Lo [94] HANGUL LETTER KIYEOK..HANGUL LETTER ARAEAE 3190..3191 ; Grapheme_Base # So [2] IDEOGRAPHIC ANNOTATION LINKING MARK..IDEOGRAPHIC ANNOTATION REVERSE MARK 3192..3195 ; Grapheme_Base # No [4] IDEOGRAPHIC ANNOTATION ONE MARK..IDEOGRAPHIC ANNOTATION FOUR MARK 3196..319F ; Grapheme_Base # So [10] IDEOGRAPHIC ANNOTATION TOP MARK..IDEOGRAPHIC ANNOTATION MAN MARK 31A0..31BF ; Grapheme_Base # Lo [32] BOPOMOFO LETTER BU..BOPOMOFO LETTER AH 31C0..31E5 ; Grapheme_Base # So [38] CJK STROKE T..CJK STROKE SZP 31EF ; Grapheme_Base # So IDEOGRAPHIC DESCRIPTION CHARACTER SUBTRACTION 31F0..31FF ; Grapheme_Base # Lo [16] KATAKANA LETTER SMALL KU..KATAKANA LETTER SMALL RO 3200..321E ; Grapheme_Base # So [31] PARENTHESIZED HANGUL 
KIYEOK..PARENTHESIZED KOREAN CHARACTER O HU 3220..3229 ; Grapheme_Base # No [10] PARENTHESIZED IDEOGRAPH ONE..PARENTHESIZED IDEOGRAPH TEN 322A..3247 ; Grapheme_Base # So [30] PARENTHESIZED IDEOGRAPH MOON..CIRCLED IDEOGRAPH KOTO 3248..324F ; Grapheme_Base # No [8] CIRCLED NUMBER TEN ON BLACK SQUARE..CIRCLED NUMBER EIGHTY ON BLACK SQUARE 3250 ; Grapheme_Base # So PARTNERSHIP SIGN 3251..325F ; Grapheme_Base # No [15] CIRCLED NUMBER TWENTY ONE..CIRCLED NUMBER THIRTY FIVE 3260..327F ; Grapheme_Base # So [32] CIRCLED HANGUL KIYEOK..KOREAN STANDARD SYMBOL 3280..3289 ; Grapheme_Base # No [10] CIRCLED IDEOGRAPH ONE..CIRCLED IDEOGRAPH TEN 328A..32B0 ; Grapheme_Base # So [39] CIRCLED IDEOGRAPH MOON..CIRCLED IDEOGRAPH NIGHT 32B1..32BF ; Grapheme_Base # No [15] CIRCLED NUMBER THIRTY SIX..CIRCLED NUMBER FIFTY 32C0..33FF ; Grapheme_Base # So [320] IDEOGRAPHIC TELEGRAPH SYMBOL FOR JANUARY..SQUARE GAL 3400..4DBF ; Grapheme_Base # Lo [6592] CJK UNIFIED IDEOGRAPH-3400..CJK UNIFIED IDEOGRAPH-4DBF 4DC0..4DFF ; Grapheme_Base # So [64] HEXAGRAM FOR THE CREATIVE HEAVEN..HEXAGRAM FOR BEFORE COMPLETION 4E00..A014 ; Grapheme_Base # Lo [21013] CJK UNIFIED IDEOGRAPH-4E00..YI SYLLABLE E A015 ; Grapheme_Base # Lm YI SYLLABLE WU A016..A48C ; Grapheme_Base # Lo [1143] YI SYLLABLE BIT..YI SYLLABLE YYR A490..A4C6 ; Grapheme_Base # So [55] YI RADICAL QOT..YI RADICAL KE A4D0..A4F7 ; Grapheme_Base # Lo [40] LISU LETTER BA..LISU LETTER OE A4F8..A4FD ; Grapheme_Base # Lm [6] LISU LETTER TONE MYA TI..LISU LETTER TONE MYA JEU A4FE..A4FF ; Grapheme_Base # Po [2] LISU PUNCTUATION COMMA..LISU PUNCTUATION FULL STOP A500..A60B ; Grapheme_Base # Lo [268] VAI SYLLABLE EE..VAI SYLLABLE NG A60C ; Grapheme_Base # Lm VAI SYLLABLE LENGTHENER A60D..A60F ; Grapheme_Base # Po [3] VAI COMMA..VAI QUESTION MARK A610..A61F ; Grapheme_Base # Lo [16] VAI SYLLABLE NDOLE FA..VAI SYMBOL JONG A620..A629 ; Grapheme_Base # Nd [10] VAI DIGIT ZERO..VAI DIGIT NINE A62A..A62B ; Grapheme_Base # Lo [2] VAI SYLLABLE NDOLE MA..VAI SYLLABLE 
NDOLE DO A640..A66D ; Grapheme_Base # L& [46] CYRILLIC CAPITAL LETTER ZEMLYA..CYRILLIC SMALL LETTER DOUBLE MONOCULAR O A66E ; Grapheme_Base # Lo CYRILLIC LETTER MULTIOCULAR O A673 ; Grapheme_Base # Po SLAVONIC ASTERISK A67E ; Grapheme_Base # Po CYRILLIC KAVYKA A67F ; Grapheme_Base # Lm CYRILLIC PAYEROK A680..A69B ; Grapheme_Base # L& [28] CYRILLIC CAPITAL LETTER DWE..CYRILLIC SMALL LETTER CROSSED O A69C..A69D ; Grapheme_Base # Lm [2] MODIFIER LETTER CYRILLIC HARD SIGN..MODIFIER LETTER CYRILLIC SOFT SIGN A6A0..A6E5 ; Grapheme_Base # Lo [70] BAMUM LETTER A..BAMUM LETTER KI A6E6..A6EF ; Grapheme_Base # Nl [10] BAMUM LETTER MO..BAMUM LETTER KOGHOM A6F2..A6F7 ; Grapheme_Base # Po [6] BAMUM NJAEMLI..BAMUM QUESTION MARK A700..A716 ; Grapheme_Base # Sk [23] MODIFIER LETTER CHINESE TONE YIN PING..MODIFIER LETTER EXTRA-LOW LEFT-STEM TONE BAR A717..A71F ; Grapheme_Base # Lm [9] MODIFIER LETTER DOT VERTICAL BAR..MODIFIER LETTER LOW INVERTED EXCLAMATION MARK A720..A721 ; Grapheme_Base # Sk [2] MODIFIER LETTER STRESS AND HIGH TONE..MODIFIER LETTER STRESS AND LOW TONE A722..A76F ; Grapheme_Base # L& [78] LATIN CAPITAL LETTER EGYPTOLOGICAL ALEF..LATIN SMALL LETTER CON A770 ; Grapheme_Base # Lm MODIFIER LETTER US A771..A787 ; Grapheme_Base # L& [23] LATIN SMALL LETTER DUM..LATIN SMALL LETTER INSULAR T A788 ; Grapheme_Base # Lm MODIFIER LETTER LOW CIRCUMFLEX ACCENT A789..A78A ; Grapheme_Base # Sk [2] MODIFIER LETTER COLON..MODIFIER LETTER SHORT EQUALS SIGN A78B..A78E ; Grapheme_Base # L& [4] LATIN CAPITAL LETTER SALTILLO..LATIN SMALL LETTER L WITH RETROFLEX HOOK AND BELT A78F ; Grapheme_Base # Lo LATIN LETTER SINOLOGICAL DOT A790..A7DC ; Grapheme_Base # L& [77] LATIN CAPITAL LETTER N WITH DESCENDER..LATIN CAPITAL LETTER LAMBDA WITH STROKE A7F1..A7F4 ; Grapheme_Base # Lm [4] MODIFIER LETTER CAPITAL S..MODIFIER LETTER CAPITAL Q A7F5..A7F6 ; Grapheme_Base # L& [2] LATIN CAPITAL LETTER REVERSED HALF H..LATIN SMALL LETTER REVERSED HALF H A7F7 ; Grapheme_Base # Lo LATIN EPIGRAPHIC LETTER 
SIDEWAYS I A7F8..A7F9 ; Grapheme_Base # Lm [2] MODIFIER LETTER CAPITAL H WITH STROKE..MODIFIER LETTER SMALL LIGATURE OE A7FA ; Grapheme_Base # L& LATIN LETTER SMALL CAPITAL TURNED M A7FB..A801 ; Grapheme_Base # Lo [7] LATIN EPIGRAPHIC LETTER REVERSED F..SYLOTI NAGRI LETTER I A803..A805 ; Grapheme_Base # Lo [3] SYLOTI NAGRI LETTER U..SYLOTI NAGRI LETTER O A807..A80A ; Grapheme_Base # Lo [4] SYLOTI NAGRI LETTER KO..SYLOTI NAGRI LETTER GHO A80C..A822 ; Grapheme_Base # Lo [23] SYLOTI NAGRI LETTER CO..SYLOTI NAGRI LETTER HO A823..A824 ; Grapheme_Base # Mc [2] SYLOTI NAGRI VOWEL SIGN A..SYLOTI NAGRI VOWEL SIGN I A827 ; Grapheme_Base # Mc SYLOTI NAGRI VOWEL SIGN OO A828..A82B ; Grapheme_Base # So [4] SYLOTI NAGRI POETRY MARK-1..SYLOTI NAGRI POETRY MARK-4 A830..A835 ; Grapheme_Base # No [6] NORTH INDIC FRACTION ONE QUARTER..NORTH INDIC FRACTION THREE SIXTEENTHS A836..A837 ; Grapheme_Base # So [2] NORTH INDIC QUARTER MARK..NORTH INDIC PLACEHOLDER MARK A838 ; Grapheme_Base # Sc NORTH INDIC RUPEE MARK A839 ; Grapheme_Base # So NORTH INDIC QUANTITY MARK A840..A873 ; Grapheme_Base # Lo [52] PHAGS-PA LETTER KA..PHAGS-PA LETTER CANDRABINDU A874..A877 ; Grapheme_Base # Po [4] PHAGS-PA SINGLE HEAD MARK..PHAGS-PA MARK DOUBLE SHAD A880..A881 ; Grapheme_Base # Mc [2] SAURASHTRA SIGN ANUSVARA..SAURASHTRA SIGN VISARGA A882..A8B3 ; Grapheme_Base # Lo [50] SAURASHTRA LETTER A..SAURASHTRA LETTER LLA A8B4..A8C3 ; Grapheme_Base # Mc [16] SAURASHTRA CONSONANT SIGN HAARU..SAURASHTRA VOWEL SIGN AU A8CE..A8CF ; Grapheme_Base # Po [2] SAURASHTRA DANDA..SAURASHTRA DOUBLE DANDA A8D0..A8D9 ; Grapheme_Base # Nd [10] SAURASHTRA DIGIT ZERO..SAURASHTRA DIGIT NINE A8F2..A8F7 ; Grapheme_Base # Lo [6] DEVANAGARI SIGN SPACING CANDRABINDU..DEVANAGARI SIGN CANDRABINDU AVAGRAHA A8F8..A8FA ; Grapheme_Base # Po [3] DEVANAGARI SIGN PUSHPIKA..DEVANAGARI CARET A8FB ; Grapheme_Base # Lo DEVANAGARI HEADSTROKE A8FC ; Grapheme_Base # Po DEVANAGARI SIGN SIDDHAM A8FD..A8FE ; Grapheme_Base # Lo [2] DEVANAGARI JAIN 
OM..DEVANAGARI LETTER AY A900..A909 ; Grapheme_Base # Nd [10] KAYAH LI DIGIT ZERO..KAYAH LI DIGIT NINE A90A..A925 ; Grapheme_Base # Lo [28] KAYAH LI LETTER KA..KAYAH LI LETTER OO A92E..A92F ; Grapheme_Base # Po [2] KAYAH LI SIGN CWI..KAYAH LI SIGN SHYA A930..A946 ; Grapheme_Base # Lo [23] REJANG LETTER KA..REJANG LETTER A A952 ; Grapheme_Base # Mc REJANG CONSONANT SIGN H A95F ; Grapheme_Base # Po REJANG SECTION MARK A960..A97C ; Grapheme_Base # Lo [29] HANGUL CHOSEONG TIKEUT-MIEUM..HANGUL CHOSEONG SSANGYEORINHIEUH A983 ; Grapheme_Base # Mc JAVANESE SIGN WIGNYAN A984..A9B2 ; Grapheme_Base # Lo [47] JAVANESE LETTER A..JAVANESE LETTER HA A9B4..A9B5 ; Grapheme_Base # Mc [2] JAVANESE VOWEL SIGN TARUNG..JAVANESE VOWEL SIGN TOLONG A9BA..A9BB ; Grapheme_Base # Mc [2] JAVANESE VOWEL SIGN TALING..JAVANESE VOWEL SIGN DIRGA MURE A9BE..A9BF ; Grapheme_Base # Mc [2] JAVANESE CONSONANT SIGN PENGKAL..JAVANESE CONSONANT SIGN CAKRA A9C1..A9CD ; Grapheme_Base # Po [13] JAVANESE LEFT RERENGGAN..JAVANESE TURNED PADA PISELEH A9CF ; Grapheme_Base # Lm JAVANESE PANGRANGKEP A9D0..A9D9 ; Grapheme_Base # Nd [10] JAVANESE DIGIT ZERO..JAVANESE DIGIT NINE A9DE..A9DF ; Grapheme_Base # Po [2] JAVANESE PADA TIRTA TUMETES..JAVANESE PADA ISEN-ISEN A9E0..A9E4 ; Grapheme_Base # Lo [5] MYANMAR LETTER SHAN GHA..MYANMAR LETTER SHAN BHA A9E6 ; Grapheme_Base # Lm MYANMAR MODIFIER LETTER SHAN REDUPLICATION A9E7..A9EF ; Grapheme_Base # Lo [9] MYANMAR LETTER TAI LAING NYA..MYANMAR LETTER TAI LAING NNA A9F0..A9F9 ; Grapheme_Base # Nd [10] MYANMAR TAI LAING DIGIT ZERO..MYANMAR TAI LAING DIGIT NINE A9FA..A9FE ; Grapheme_Base # Lo [5] MYANMAR LETTER TAI LAING LLA..MYANMAR LETTER TAI LAING BHA AA00..AA28 ; Grapheme_Base # Lo [41] CHAM LETTER A..CHAM LETTER HA AA2F..AA30 ; Grapheme_Base # Mc [2] CHAM VOWEL SIGN O..CHAM VOWEL SIGN AI AA33..AA34 ; Grapheme_Base # Mc [2] CHAM CONSONANT SIGN YA..CHAM CONSONANT SIGN RA AA40..AA42 ; Grapheme_Base # Lo [3] CHAM LETTER FINAL K..CHAM LETTER FINAL NG AA44..AA4B ; 
Grapheme_Base # Lo [8] CHAM LETTER FINAL CH..CHAM LETTER FINAL SS AA4D ; Grapheme_Base # Mc CHAM CONSONANT SIGN FINAL H AA50..AA59 ; Grapheme_Base # Nd [10] CHAM DIGIT ZERO..CHAM DIGIT NINE AA5C..AA5F ; Grapheme_Base # Po [4] CHAM PUNCTUATION SPIRAL..CHAM PUNCTUATION TRIPLE DANDA AA60..AA6F ; Grapheme_Base # Lo [16] MYANMAR LETTER KHAMTI GA..MYANMAR LETTER KHAMTI FA AA70 ; Grapheme_Base # Lm MYANMAR MODIFIER LETTER KHAMTI REDUPLICATION AA71..AA76 ; Grapheme_Base # Lo [6] MYANMAR LETTER KHAMTI XA..MYANMAR LOGOGRAM KHAMTI HM AA77..AA79 ; Grapheme_Base # So [3] MYANMAR SYMBOL AITON EXCLAMATION..MYANMAR SYMBOL AITON TWO AA7A ; Grapheme_Base # Lo MYANMAR LETTER AITON RA AA7B ; Grapheme_Base # Mc MYANMAR SIGN PAO KAREN TONE AA7D ; Grapheme_Base # Mc MYANMAR SIGN TAI LAING TONE-5 AA7E..AAAF ; Grapheme_Base # Lo [50] MYANMAR LETTER SHWE PALAUNG CHA..TAI VIET LETTER HIGH O AAB1 ; Grapheme_Base # Lo TAI VIET VOWEL AA AAB5..AAB6 ; Grapheme_Base # Lo [2] TAI VIET VOWEL E..TAI VIET VOWEL O AAB9..AABD ; Grapheme_Base # Lo [5] TAI VIET VOWEL UEA..TAI VIET VOWEL AN AAC0 ; Grapheme_Base # Lo TAI VIET TONE MAI NUENG AAC2 ; Grapheme_Base # Lo TAI VIET TONE MAI SONG AADB..AADC ; Grapheme_Base # Lo [2] TAI VIET SYMBOL KON..TAI VIET SYMBOL NUENG AADD ; Grapheme_Base # Lm TAI VIET SYMBOL SAM AADE..AADF ; Grapheme_Base # Po [2] TAI VIET SYMBOL HO HOI..TAI VIET SYMBOL KOI KOI AAE0..AAEA ; Grapheme_Base # Lo [11] MEETEI MAYEK LETTER E..MEETEI MAYEK LETTER SSA AAEB ; Grapheme_Base # Mc MEETEI MAYEK VOWEL SIGN II AAEE..AAEF ; Grapheme_Base # Mc [2] MEETEI MAYEK VOWEL SIGN AU..MEETEI MAYEK VOWEL SIGN AAU AAF0..AAF1 ; Grapheme_Base # Po [2] MEETEI MAYEK CHEIKHAN..MEETEI MAYEK AHANG KHUDAM AAF2 ; Grapheme_Base # Lo MEETEI MAYEK ANJI AAF3..AAF4 ; Grapheme_Base # Lm [2] MEETEI MAYEK SYLLABLE REPETITION MARK..MEETEI MAYEK WORD REPETITION MARK AAF5 ; Grapheme_Base # Mc MEETEI MAYEK VOWEL SIGN VISARGA AB01..AB06 ; Grapheme_Base # Lo [6] ETHIOPIC SYLLABLE TTHU..ETHIOPIC SYLLABLE TTHO AB09..AB0E ; 
Grapheme_Base # Lo [6] ETHIOPIC SYLLABLE DDHU..ETHIOPIC SYLLABLE DDHO AB11..AB16 ; Grapheme_Base # Lo [6] ETHIOPIC SYLLABLE DZU..ETHIOPIC SYLLABLE DZO AB20..AB26 ; Grapheme_Base # Lo [7] ETHIOPIC SYLLABLE CCHHA..ETHIOPIC SYLLABLE CCHHO AB28..AB2E ; Grapheme_Base # Lo [7] ETHIOPIC SYLLABLE BBA..ETHIOPIC SYLLABLE BBO AB30..AB5A ; Grapheme_Base # L& [43] LATIN SMALL LETTER BARRED ALPHA..LATIN SMALL LETTER Y WITH SHORT RIGHT LEG AB5B ; Grapheme_Base # Sk MODIFIER BREVE WITH INVERTED BREVE AB5C..AB5F ; Grapheme_Base # Lm [4] MODIFIER LETTER SMALL HENG..MODIFIER LETTER SMALL U WITH LEFT HOOK AB60..AB68 ; Grapheme_Base # L& [9] LATIN SMALL LETTER SAKHA YAT..LATIN SMALL LETTER TURNED R WITH MIDDLE TILDE AB69 ; Grapheme_Base # Lm MODIFIER LETTER SMALL TURNED W AB6A..AB6B ; Grapheme_Base # Sk [2] MODIFIER LETTER LEFT TACK..MODIFIER LETTER RIGHT TACK AB70..ABBF ; Grapheme_Base # L& [80] CHEROKEE SMALL LETTER A..CHEROKEE SMALL LETTER YA ABC0..ABE2 ; Grapheme_Base # Lo [35] MEETEI MAYEK LETTER KOK..MEETEI MAYEK LETTER I LONSUM ABE3..ABE4 ; Grapheme_Base # Mc [2] MEETEI MAYEK VOWEL SIGN ONAP..MEETEI MAYEK VOWEL SIGN INAP ABE6..ABE7 ; Grapheme_Base # Mc [2] MEETEI MAYEK VOWEL SIGN YENAP..MEETEI MAYEK VOWEL SIGN SOUNAP ABE9..ABEA ; Grapheme_Base # Mc [2] MEETEI MAYEK VOWEL SIGN CHEINAP..MEETEI MAYEK VOWEL SIGN NUNG ABEB ; Grapheme_Base # Po MEETEI MAYEK CHEIKHEI ABEC ; Grapheme_Base # Mc MEETEI MAYEK LUM IYEK ABF0..ABF9 ; Grapheme_Base # Nd [10] MEETEI MAYEK DIGIT ZERO..MEETEI MAYEK DIGIT NINE AC00..D7A3 ; Grapheme_Base # Lo [11172] HANGUL SYLLABLE GA..HANGUL SYLLABLE HIH D7B0..D7C6 ; Grapheme_Base # Lo [23] HANGUL JUNGSEONG O-YEO..HANGUL JUNGSEONG ARAEA-E D7CB..D7FB ; Grapheme_Base # Lo [49] HANGUL JONGSEONG NIEUN-RIEUL..HANGUL JONGSEONG PHIEUPH-THIEUTH F900..FA6D ; Grapheme_Base # Lo [366] CJK COMPATIBILITY IDEOGRAPH-F900..CJK COMPATIBILITY IDEOGRAPH-FA6D FA70..FAD9 ; Grapheme_Base # Lo [106] CJK COMPATIBILITY IDEOGRAPH-FA70..CJK COMPATIBILITY IDEOGRAPH-FAD9 FB00..FB06 ; 
Grapheme_Base # L& [7] LATIN SMALL LIGATURE FF..LATIN SMALL LIGATURE ST FB13..FB17 ; Grapheme_Base # L& [5] ARMENIAN SMALL LIGATURE MEN NOW..ARMENIAN SMALL LIGATURE MEN XEH FB1D ; Grapheme_Base # Lo HEBREW LETTER YOD WITH HIRIQ FB1F..FB28 ; Grapheme_Base # Lo [10] HEBREW LIGATURE YIDDISH YOD YOD PATAH..HEBREW LETTER WIDE TAV FB29 ; Grapheme_Base # Sm HEBREW LETTER ALTERNATIVE PLUS SIGN FB2A..FB36 ; Grapheme_Base # Lo [13] HEBREW LETTER SHIN WITH SHIN DOT..HEBREW LETTER ZAYIN WITH DAGESH FB38..FB3C ; Grapheme_Base # Lo [5] HEBREW LETTER TET WITH DAGESH..HEBREW LETTER LAMED WITH DAGESH FB3E ; Grapheme_Base # Lo HEBREW LETTER MEM WITH DAGESH FB40..FB41 ; Grapheme_Base # Lo [2] HEBREW LETTER NUN WITH DAGESH..HEBREW LETTER SAMEKH WITH DAGESH FB43..FB44 ; Grapheme_Base # Lo [2] HEBREW LETTER FINAL PE WITH DAGESH..HEBREW LETTER PE WITH DAGESH FB46..FBB1 ; Grapheme_Base # Lo [108] HEBREW LETTER TSADI WITH DAGESH..ARABIC LETTER YEH BARREE WITH HAMZA ABOVE FINAL FORM FBB2..FBC2 ; Grapheme_Base # Sk [17] ARABIC SYMBOL DOT ABOVE..ARABIC SYMBOL WASLA ABOVE FBC3..FBD2 ; Grapheme_Base # So [16] ARABIC LIGATURE JALLA WA-ALAA..ARABIC LIGATURE ALAYHI AR-RAHMAH FBD3..FD3D ; Grapheme_Base # Lo [363] ARABIC LETTER NG ISOLATED FORM..ARABIC LIGATURE ALEF WITH FATHATAN ISOLATED FORM FD3E ; Grapheme_Base # Pe ORNATE LEFT PARENTHESIS FD3F ; Grapheme_Base # Ps ORNATE RIGHT PARENTHESIS FD40..FD4F ; Grapheme_Base # So [16] ARABIC LIGATURE RAHIMAHU ALLAAH..ARABIC LIGATURE RAHIMAHUM ALLAAH FD50..FD8F ; Grapheme_Base # Lo [64] ARABIC LIGATURE TEH WITH JEEM WITH MEEM INITIAL FORM..ARABIC LIGATURE MEEM WITH KHAH WITH MEEM INITIAL FORM FD90..FD91 ; Grapheme_Base # So [2] ARABIC LIGATURE RAHMATU ALLAAHI ALAYH..ARABIC LIGATURE RAHMATU ALLAAHI ALAYHAA FD92..FDC7 ; Grapheme_Base # Lo [54] ARABIC LIGATURE MEEM WITH JEEM WITH KHAH INITIAL FORM..ARABIC LIGATURE NOON WITH JEEM WITH YEH FINAL FORM FDC8..FDCF ; Grapheme_Base # So [8] ARABIC LIGATURE RAHIMAHU ALLAAH TAAALAA..ARABIC LIGATURE SALAAMUHU ALAYNAA 
FDF0..FDFB ; Grapheme_Base # Lo [12] ARABIC LIGATURE SALLA USED AS KORANIC STOP SIGN ISOLATED FORM..ARABIC LIGATURE JALLAJALALOUHOU FDFC ; Grapheme_Base # Sc RIAL SIGN FDFD..FDFF ; Grapheme_Base # So [3] ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHEEM..ARABIC LIGATURE AZZA WA JALL FE10..FE16 ; Grapheme_Base # Po [7] PRESENTATION FORM FOR VERTICAL COMMA..PRESENTATION FORM FOR VERTICAL QUESTION MARK FE17 ; Grapheme_Base # Ps PRESENTATION FORM FOR VERTICAL LEFT WHITE LENTICULAR BRACKET FE18 ; Grapheme_Base # Pe PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRAKCET FE19 ; Grapheme_Base # Po PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS FE30 ; Grapheme_Base # Po PRESENTATION FORM FOR VERTICAL TWO DOT LEADER FE31..FE32 ; Grapheme_Base # Pd [2] PRESENTATION FORM FOR VERTICAL EM DASH..PRESENTATION FORM FOR VERTICAL EN DASH FE33..FE34 ; Grapheme_Base # Pc [2] PRESENTATION FORM FOR VERTICAL LOW LINE..PRESENTATION FORM FOR VERTICAL WAVY LOW LINE FE35 ; Grapheme_Base # Ps PRESENTATION FORM FOR VERTICAL LEFT PARENTHESIS FE36 ; Grapheme_Base # Pe PRESENTATION FORM FOR VERTICAL RIGHT PARENTHESIS FE37 ; Grapheme_Base # Ps PRESENTATION FORM FOR VERTICAL LEFT CURLY BRACKET FE38 ; Grapheme_Base # Pe PRESENTATION FORM FOR VERTICAL RIGHT CURLY BRACKET FE39 ; Grapheme_Base # Ps PRESENTATION FORM FOR VERTICAL LEFT TORTOISE SHELL BRACKET FE3A ; Grapheme_Base # Pe PRESENTATION FORM FOR VERTICAL RIGHT TORTOISE SHELL BRACKET FE3B ; Grapheme_Base # Ps PRESENTATION FORM FOR VERTICAL LEFT BLACK LENTICULAR BRACKET FE3C ; Grapheme_Base # Pe PRESENTATION FORM FOR VERTICAL RIGHT BLACK LENTICULAR BRACKET FE3D ; Grapheme_Base # Ps PRESENTATION FORM FOR VERTICAL LEFT DOUBLE ANGLE BRACKET FE3E ; Grapheme_Base # Pe PRESENTATION FORM FOR VERTICAL RIGHT DOUBLE ANGLE BRACKET FE3F ; Grapheme_Base # Ps PRESENTATION FORM FOR VERTICAL LEFT ANGLE BRACKET FE40 ; Grapheme_Base # Pe PRESENTATION FORM FOR VERTICAL RIGHT ANGLE BRACKET FE41 ; Grapheme_Base # Ps PRESENTATION FORM FOR VERTICAL LEFT CORNER 
BRACKET FE42 ; Grapheme_Base # Pe PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET FE43 ; Grapheme_Base # Ps PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET FE44 ; Grapheme_Base # Pe PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET FE45..FE46 ; Grapheme_Base # Po [2] SESAME DOT..WHITE SESAME DOT FE47 ; Grapheme_Base # Ps PRESENTATION FORM FOR VERTICAL LEFT SQUARE BRACKET FE48 ; Grapheme_Base # Pe PRESENTATION FORM FOR VERTICAL RIGHT SQUARE BRACKET FE49..FE4C ; Grapheme_Base # Po [4] DASHED OVERLINE..DOUBLE WAVY OVERLINE FE4D..FE4F ; Grapheme_Base # Pc [3] DASHED LOW LINE..WAVY LOW LINE FE50..FE52 ; Grapheme_Base # Po [3] SMALL COMMA..SMALL FULL STOP FE54..FE57 ; Grapheme_Base # Po [4] SMALL SEMICOLON..SMALL EXCLAMATION MARK FE58 ; Grapheme_Base # Pd SMALL EM DASH FE59 ; Grapheme_Base # Ps SMALL LEFT PARENTHESIS FE5A ; Grapheme_Base # Pe SMALL RIGHT PARENTHESIS FE5B ; Grapheme_Base # Ps SMALL LEFT CURLY BRACKET FE5C ; Grapheme_Base # Pe SMALL RIGHT CURLY BRACKET FE5D ; Grapheme_Base # Ps SMALL LEFT TORTOISE SHELL BRACKET FE5E ; Grapheme_Base # Pe SMALL RIGHT TORTOISE SHELL BRACKET FE5F..FE61 ; Grapheme_Base # Po [3] SMALL NUMBER SIGN..SMALL ASTERISK FE62 ; Grapheme_Base # Sm SMALL PLUS SIGN FE63 ; Grapheme_Base # Pd SMALL HYPHEN-MINUS FE64..FE66 ; Grapheme_Base # Sm [3] SMALL LESS-THAN SIGN..SMALL EQUALS SIGN FE68 ; Grapheme_Base # Po SMALL REVERSE SOLIDUS FE69 ; Grapheme_Base # Sc SMALL DOLLAR SIGN FE6A..FE6B ; Grapheme_Base # Po [2] SMALL PERCENT SIGN..SMALL COMMERCIAL AT FE70..FE74 ; Grapheme_Base # Lo [5] ARABIC FATHATAN ISOLATED FORM..ARABIC KASRATAN ISOLATED FORM FE76..FEFC ; Grapheme_Base # Lo [135] ARABIC FATHA ISOLATED FORM..ARABIC LIGATURE LAM WITH ALEF FINAL FORM FF01..FF03 ; Grapheme_Base # Po [3] FULLWIDTH EXCLAMATION MARK..FULLWIDTH NUMBER SIGN FF04 ; Grapheme_Base # Sc FULLWIDTH DOLLAR SIGN FF05..FF07 ; Grapheme_Base # Po [3] FULLWIDTH PERCENT SIGN..FULLWIDTH APOSTROPHE FF08 ; Grapheme_Base # Ps FULLWIDTH LEFT PARENTHESIS FF09 ; 
Grapheme_Base # Pe FULLWIDTH RIGHT PARENTHESIS FF0A ; Grapheme_Base # Po FULLWIDTH ASTERISK FF0B ; Grapheme_Base # Sm FULLWIDTH PLUS SIGN FF0C ; Grapheme_Base # Po FULLWIDTH COMMA FF0D ; Grapheme_Base # Pd FULLWIDTH HYPHEN-MINUS FF0E..FF0F ; Grapheme_Base # Po [2] FULLWIDTH FULL STOP..FULLWIDTH SOLIDUS FF10..FF19 ; Grapheme_Base # Nd [10] FULLWIDTH DIGIT ZERO..FULLWIDTH DIGIT NINE FF1A..FF1B ; Grapheme_Base # Po [2] FULLWIDTH COLON..FULLWIDTH SEMICOLON FF1C..FF1E ; Grapheme_Base # Sm [3] FULLWIDTH LESS-THAN SIGN..FULLWIDTH GREATER-THAN SIGN FF1F..FF20 ; Grapheme_Base # Po [2] FULLWIDTH QUESTION MARK..FULLWIDTH COMMERCIAL AT FF21..FF3A ; Grapheme_Base # L& [26] FULLWIDTH LATIN CAPITAL LETTER A..FULLWIDTH LATIN CAPITAL LETTER Z FF3B ; Grapheme_Base # Ps FULLWIDTH LEFT SQUARE BRACKET FF3C ; Grapheme_Base # Po FULLWIDTH REVERSE SOLIDUS FF3D ; Grapheme_Base # Pe FULLWIDTH RIGHT SQUARE BRACKET FF3E ; Grapheme_Base # Sk FULLWIDTH CIRCUMFLEX ACCENT FF3F ; Grapheme_Base # Pc FULLWIDTH LOW LINE FF40 ; Grapheme_Base # Sk FULLWIDTH GRAVE ACCENT FF41..FF5A ; Grapheme_Base # L& [26] FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH LATIN SMALL LETTER Z FF5B ; Grapheme_Base # Ps FULLWIDTH LEFT CURLY BRACKET FF5C ; Grapheme_Base # Sm FULLWIDTH VERTICAL LINE FF5D ; Grapheme_Base # Pe FULLWIDTH RIGHT CURLY BRACKET FF5E ; Grapheme_Base # Sm FULLWIDTH TILDE FF5F ; Grapheme_Base # Ps FULLWIDTH LEFT WHITE PARENTHESIS FF60 ; Grapheme_Base # Pe FULLWIDTH RIGHT WHITE PARENTHESIS FF61 ; Grapheme_Base # Po HALFWIDTH IDEOGRAPHIC FULL STOP FF62 ; Grapheme_Base # Ps HALFWIDTH LEFT CORNER BRACKET FF63 ; Grapheme_Base # Pe HALFWIDTH RIGHT CORNER BRACKET FF64..FF65 ; Grapheme_Base # Po [2] HALFWIDTH IDEOGRAPHIC COMMA..HALFWIDTH KATAKANA MIDDLE DOT FF66..FF6F ; Grapheme_Base # Lo [10] HALFWIDTH KATAKANA LETTER WO..HALFWIDTH KATAKANA LETTER SMALL TU FF70 ; Grapheme_Base # Lm HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK FF71..FF9D ; Grapheme_Base # Lo [45] HALFWIDTH KATAKANA LETTER A..HALFWIDTH KATAKANA 
LETTER N FFA0..FFBE ; Grapheme_Base # Lo [31] HALFWIDTH HANGUL FILLER..HALFWIDTH HANGUL LETTER HIEUH FFC2..FFC7 ; Grapheme_Base # Lo [6] HALFWIDTH HANGUL LETTER A..HALFWIDTH HANGUL LETTER E FFCA..FFCF ; Grapheme_Base # Lo [6] HALFWIDTH HANGUL LETTER YEO..HALFWIDTH HANGUL LETTER OE FFD2..FFD7 ; Grapheme_Base # Lo [6] HALFWIDTH HANGUL LETTER YO..HALFWIDTH HANGUL LETTER YU FFDA..FFDC ; Grapheme_Base # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL LETTER I FFE0..FFE1 ; Grapheme_Base # Sc [2] FULLWIDTH CENT SIGN..FULLWIDTH POUND SIGN FFE2 ; Grapheme_Base # Sm FULLWIDTH NOT SIGN FFE3 ; Grapheme_Base # Sk FULLWIDTH MACRON FFE4 ; Grapheme_Base # So FULLWIDTH BROKEN BAR FFE5..FFE6 ; Grapheme_Base # Sc [2] FULLWIDTH YEN SIGN..FULLWIDTH WON SIGN FFE8 ; Grapheme_Base # So HALFWIDTH FORMS LIGHT VERTICAL FFE9..FFEC ; Grapheme_Base # Sm [4] HALFWIDTH LEFTWARDS ARROW..HALFWIDTH DOWNWARDS ARROW FFED..FFEE ; Grapheme_Base # So [2] HALFWIDTH BLACK SQUARE..HALFWIDTH WHITE CIRCLE FFFC..FFFD ; Grapheme_Base # So [2] OBJECT REPLACEMENT CHARACTER..REPLACEMENT CHARACTER 10000..1000B ; Grapheme_Base # Lo [12] LINEAR B SYLLABLE B008 A..LINEAR B SYLLABLE B046 JE 1000D..10026 ; Grapheme_Base # Lo [26] LINEAR B SYLLABLE B036 JO..LINEAR B SYLLABLE B032 QO 10028..1003A ; Grapheme_Base # Lo [19] LINEAR B SYLLABLE B060 RA..LINEAR B SYLLABLE B042 WO 1003C..1003D ; Grapheme_Base # Lo [2] LINEAR B SYLLABLE B017 ZA..LINEAR B SYLLABLE B074 ZE 1003F..1004D ; Grapheme_Base # Lo [15] LINEAR B SYLLABLE B020 ZO..LINEAR B SYLLABLE B091 TWO 10050..1005D ; Grapheme_Base # Lo [14] LINEAR B SYMBOL B018..LINEAR B SYMBOL B089 10080..100FA ; Grapheme_Base # Lo [123] LINEAR B IDEOGRAM B100 MAN..LINEAR B IDEOGRAM VESSEL B305 10100..10102 ; Grapheme_Base # Po [3] AEGEAN WORD SEPARATOR LINE..AEGEAN CHECK MARK 10107..10133 ; Grapheme_Base # No [45] AEGEAN NUMBER ONE..AEGEAN NUMBER NINETY THOUSAND 10137..1013F ; Grapheme_Base # So [9] AEGEAN WEIGHT BASE UNIT..AEGEAN MEASURE THIRD SUBUNIT 10140..10174 ; Grapheme_Base 
# Nl [53] GREEK ACROPHONIC ATTIC ONE QUARTER..GREEK ACROPHONIC STRATIAN FIFTY MNAS 10175..10178 ; Grapheme_Base # No [4] GREEK ONE HALF SIGN..GREEK THREE QUARTERS SIGN 10179..10189 ; Grapheme_Base # So [17] GREEK YEAR SIGN..GREEK TRYBLION BASE SIGN 1018A..1018B ; Grapheme_Base # No [2] GREEK ZERO SIGN..GREEK ONE QUARTER SIGN 1018C..1018E ; Grapheme_Base # So [3] GREEK SINUSOID SIGN..NOMISMA SIGN 10190..1019C ; Grapheme_Base # So [13] ROMAN SEXTANS SIGN..ASCIA SYMBOL 101A0 ; Grapheme_Base # So GREEK SYMBOL TAU RHO 101D0..101FC ; Grapheme_Base # So [45] PHAISTOS DISC SIGN PEDESTRIAN..PHAISTOS DISC SIGN WAVY BAND 10280..1029C ; Grapheme_Base # Lo [29] LYCIAN LETTER A..LYCIAN LETTER X 102A0..102D0 ; Grapheme_Base # Lo [49] CARIAN LETTER A..CARIAN LETTER UUU3 102E1..102FB ; Grapheme_Base # No [27] COPTIC EPACT DIGIT ONE..COPTIC EPACT NUMBER NINE HUNDRED 10300..1031F ; Grapheme_Base # Lo [32] OLD ITALIC LETTER A..OLD ITALIC LETTER ESS 10320..10323 ; Grapheme_Base # No [4] OLD ITALIC NUMERAL ONE..OLD ITALIC NUMERAL FIFTY 1032D..10340 ; Grapheme_Base # Lo [20] OLD ITALIC LETTER YE..GOTHIC LETTER PAIRTHRA 10341 ; Grapheme_Base # Nl GOTHIC LETTER NINETY 10342..10349 ; Grapheme_Base # Lo [8] GOTHIC LETTER RAIDA..GOTHIC LETTER OTHAL 1034A ; Grapheme_Base # Nl GOTHIC LETTER NINE HUNDRED 10350..10375 ; Grapheme_Base # Lo [38] OLD PERMIC LETTER AN..OLD PERMIC LETTER IA 10380..1039D ; Grapheme_Base # Lo [30] UGARITIC LETTER ALPA..UGARITIC LETTER SSU 1039F ; Grapheme_Base # Po UGARITIC WORD DIVIDER 103A0..103C3 ; Grapheme_Base # Lo [36] OLD PERSIAN SIGN A..OLD PERSIAN SIGN HA 103C8..103CF ; Grapheme_Base # Lo [8] OLD PERSIAN SIGN AURAMAZDAA..OLD PERSIAN SIGN BUUMISH 103D0 ; Grapheme_Base # Po OLD PERSIAN WORD DIVIDER 103D1..103D5 ; Grapheme_Base # Nl [5] OLD PERSIAN NUMBER ONE..OLD PERSIAN NUMBER HUNDRED 10400..1044F ; Grapheme_Base # L& [80] DESERET CAPITAL LETTER LONG I..DESERET SMALL LETTER EW 10450..1049D ; Grapheme_Base # Lo [78] SHAVIAN LETTER PEEP..OSMANYA LETTER OO 
104A0..104A9 ; Grapheme_Base # Nd [10] OSMANYA DIGIT ZERO..OSMANYA DIGIT NINE 104B0..104D3 ; Grapheme_Base # L& [36] OSAGE CAPITAL LETTER A..OSAGE CAPITAL LETTER ZHA 104D8..104FB ; Grapheme_Base # L& [36] OSAGE SMALL LETTER A..OSAGE SMALL LETTER ZHA 10500..10527 ; Grapheme_Base # Lo [40] ELBASAN LETTER A..ELBASAN LETTER KHE 10530..10563 ; Grapheme_Base # Lo [52] CAUCASIAN ALBANIAN LETTER ALT..CAUCASIAN ALBANIAN LETTER KIW 1056F ; Grapheme_Base # Po CAUCASIAN ALBANIAN CITATION MARK 10570..1057A ; Grapheme_Base # L& [11] VITHKUQI CAPITAL LETTER A..VITHKUQI CAPITAL LETTER GA 1057C..1058A ; Grapheme_Base # L& [15] VITHKUQI CAPITAL LETTER HA..VITHKUQI CAPITAL LETTER RE 1058C..10592 ; Grapheme_Base # L& [7] VITHKUQI CAPITAL LETTER SE..VITHKUQI CAPITAL LETTER XE 10594..10595 ; Grapheme_Base # L& [2] VITHKUQI CAPITAL LETTER Y..VITHKUQI CAPITAL LETTER ZE 10597..105A1 ; Grapheme_Base # L& [11] VITHKUQI SMALL LETTER A..VITHKUQI SMALL LETTER GA 105A3..105B1 ; Grapheme_Base # L& [15] VITHKUQI SMALL LETTER HA..VITHKUQI SMALL LETTER RE 105B3..105B9 ; Grapheme_Base # L& [7] VITHKUQI SMALL LETTER SE..VITHKUQI SMALL LETTER XE 105BB..105BC ; Grapheme_Base # L& [2] VITHKUQI SMALL LETTER Y..VITHKUQI SMALL LETTER ZE 105C0..105F3 ; Grapheme_Base # Lo [52] TODHRI LETTER A..TODHRI LETTER OO 10600..10736 ; Grapheme_Base # Lo [311] LINEAR A SIGN AB001..LINEAR A SIGN A664 10740..10755 ; Grapheme_Base # Lo [22] LINEAR A SIGN A701 A..LINEAR A SIGN A732 JE 10760..10767 ; Grapheme_Base # Lo [8] LINEAR A SIGN A800..LINEAR A SIGN A807 10780..10785 ; Grapheme_Base # Lm [6] MODIFIER LETTER SMALL CAPITAL AA..MODIFIER LETTER SMALL B WITH HOOK 10787..107B0 ; Grapheme_Base # Lm [42] MODIFIER LETTER SMALL DZ DIGRAPH..MODIFIER LETTER SMALL V WITH RIGHT HOOK 107B2..107BA ; Grapheme_Base # Lm [9] MODIFIER LETTER SMALL CAPITAL Y..MODIFIER LETTER SMALL S WITH CURL 10800..10805 ; Grapheme_Base # Lo [6] CYPRIOT SYLLABLE A..CYPRIOT SYLLABLE JA 10808 ; Grapheme_Base # Lo CYPRIOT SYLLABLE JO 1080A..10835 ; 
Grapheme_Base # Lo [44] CYPRIOT SYLLABLE KA..CYPRIOT SYLLABLE WO 10837..10838 ; Grapheme_Base # Lo [2] CYPRIOT SYLLABLE XA..CYPRIOT SYLLABLE XE 1083C ; Grapheme_Base # Lo CYPRIOT SYLLABLE ZA 1083F..10855 ; Grapheme_Base # Lo [23] CYPRIOT SYLLABLE ZO..IMPERIAL ARAMAIC LETTER TAW 10857 ; Grapheme_Base # Po IMPERIAL ARAMAIC SECTION SIGN 10858..1085F ; Grapheme_Base # No [8] IMPERIAL ARAMAIC NUMBER ONE..IMPERIAL ARAMAIC NUMBER TEN THOUSAND 10860..10876 ; Grapheme_Base # Lo [23] PALMYRENE LETTER ALEPH..PALMYRENE LETTER TAW 10877..10878 ; Grapheme_Base # So [2] PALMYRENE LEFT-POINTING FLEURON..PALMYRENE RIGHT-POINTING FLEURON 10879..1087F ; Grapheme_Base # No [7] PALMYRENE NUMBER ONE..PALMYRENE NUMBER TWENTY 10880..1089E ; Grapheme_Base # Lo [31] NABATAEAN LETTER FINAL ALEPH..NABATAEAN LETTER TAW 108A7..108AF ; Grapheme_Base # No [9] NABATAEAN NUMBER ONE..NABATAEAN NUMBER ONE HUNDRED 108E0..108F2 ; Grapheme_Base # Lo [19] HATRAN LETTER ALEPH..HATRAN LETTER QOPH 108F4..108F5 ; Grapheme_Base # Lo [2] HATRAN LETTER SHIN..HATRAN LETTER TAW 108FB..108FF ; Grapheme_Base # No [5] HATRAN NUMBER ONE..HATRAN NUMBER ONE HUNDRED 10900..10915 ; Grapheme_Base # Lo [22] PHOENICIAN LETTER ALF..PHOENICIAN LETTER TAU 10916..1091B ; Grapheme_Base # No [6] PHOENICIAN NUMBER ONE..PHOENICIAN NUMBER THREE 1091F ; Grapheme_Base # Po PHOENICIAN WORD SEPARATOR 10920..10939 ; Grapheme_Base # Lo [26] LYDIAN LETTER A..LYDIAN LETTER C 1093F ; Grapheme_Base # Po LYDIAN TRIANGULAR MARK 10940..10959 ; Grapheme_Base # Lo [26] SIDETIC LETTER N01..SIDETIC LETTER N26 10980..109B7 ; Grapheme_Base # Lo [56] MEROITIC HIEROGLYPHIC LETTER A..MEROITIC CURSIVE LETTER DA 109BC..109BD ; Grapheme_Base # No [2] MEROITIC CURSIVE FRACTION ELEVEN TWELFTHS..MEROITIC CURSIVE FRACTION ONE HALF 109BE..109BF ; Grapheme_Base # Lo [2] MEROITIC CURSIVE LOGOGRAM RMT..MEROITIC CURSIVE LOGOGRAM IMN 109C0..109CF ; Grapheme_Base # No [16] MEROITIC CURSIVE NUMBER ONE..MEROITIC CURSIVE NUMBER SEVENTY 109D2..109FF ; Grapheme_Base # No 
[46] MEROITIC CURSIVE NUMBER ONE HUNDRED..MEROITIC CURSIVE FRACTION TEN TWELFTHS 10A00 ; Grapheme_Base # Lo KHAROSHTHI LETTER A 10A10..10A13 ; Grapheme_Base # Lo [4] KHAROSHTHI LETTER KA..KHAROSHTHI LETTER GHA 10A15..10A17 ; Grapheme_Base # Lo [3] KHAROSHTHI LETTER CA..KHAROSHTHI LETTER JA 10A19..10A35 ; Grapheme_Base # Lo [29] KHAROSHTHI LETTER NYA..KHAROSHTHI LETTER VHA 10A40..10A48 ; Grapheme_Base # No [9] KHAROSHTHI DIGIT ONE..KHAROSHTHI FRACTION ONE HALF 10A50..10A58 ; Grapheme_Base # Po [9] KHAROSHTHI PUNCTUATION DOT..KHAROSHTHI PUNCTUATION LINES 10A60..10A7C ; Grapheme_Base # Lo [29] OLD SOUTH ARABIAN LETTER HE..OLD SOUTH ARABIAN LETTER THETH 10A7D..10A7E ; Grapheme_Base # No [2] OLD SOUTH ARABIAN NUMBER ONE..OLD SOUTH ARABIAN NUMBER FIFTY 10A7F ; Grapheme_Base # Po OLD SOUTH ARABIAN NUMERIC INDICATOR 10A80..10A9C ; Grapheme_Base # Lo [29] OLD NORTH ARABIAN LETTER HEH..OLD NORTH ARABIAN LETTER ZAH 10A9D..10A9F ; Grapheme_Base # No [3] OLD NORTH ARABIAN NUMBER ONE..OLD NORTH ARABIAN NUMBER TWENTY 10AC0..10AC7 ; Grapheme_Base # Lo [8] MANICHAEAN LETTER ALEPH..MANICHAEAN LETTER WAW 10AC8 ; Grapheme_Base # So MANICHAEAN SIGN UD 10AC9..10AE4 ; Grapheme_Base # Lo [28] MANICHAEAN LETTER ZAYIN..MANICHAEAN LETTER TAW 10AEB..10AEF ; Grapheme_Base # No [5] MANICHAEAN NUMBER ONE..MANICHAEAN NUMBER ONE HUNDRED 10AF0..10AF6 ; Grapheme_Base # Po [7] MANICHAEAN PUNCTUATION STAR..MANICHAEAN PUNCTUATION LINE FILLER 10B00..10B35 ; Grapheme_Base # Lo [54] AVESTAN LETTER A..AVESTAN LETTER HE 10B39..10B3F ; Grapheme_Base # Po [7] AVESTAN ABBREVIATION MARK..LARGE ONE RING OVER TWO RINGS PUNCTUATION 10B40..10B55 ; Grapheme_Base # Lo [22] INSCRIPTIONAL PARTHIAN LETTER ALEPH..INSCRIPTIONAL PARTHIAN LETTER TAW 10B58..10B5F ; Grapheme_Base # No [8] INSCRIPTIONAL PARTHIAN NUMBER ONE..INSCRIPTIONAL PARTHIAN NUMBER ONE THOUSAND 10B60..10B72 ; Grapheme_Base # Lo [19] INSCRIPTIONAL PAHLAVI LETTER ALEPH..INSCRIPTIONAL PAHLAVI LETTER TAW 10B78..10B7F ; Grapheme_Base # No [8] INSCRIPTIONAL 
PAHLAVI NUMBER ONE..INSCRIPTIONAL PAHLAVI NUMBER ONE THOUSAND 10B80..10B91 ; Grapheme_Base # Lo [18] PSALTER PAHLAVI LETTER ALEPH..PSALTER PAHLAVI LETTER TAW 10B99..10B9C ; Grapheme_Base # Po [4] PSALTER PAHLAVI SECTION MARK..PSALTER PAHLAVI FOUR DOTS WITH DOT 10BA9..10BAF ; Grapheme_Base # No [7] PSALTER PAHLAVI NUMBER ONE..PSALTER PAHLAVI NUMBER ONE HUNDRED 10C00..10C48 ; Grapheme_Base # Lo [73] OLD TURKIC LETTER ORKHON A..OLD TURKIC LETTER ORKHON BASH 10C80..10CB2 ; Grapheme_Base # L& [51] OLD HUNGARIAN CAPITAL LETTER A..OLD HUNGARIAN CAPITAL LETTER US 10CC0..10CF2 ; Grapheme_Base # L& [51] OLD HUNGARIAN SMALL LETTER A..OLD HUNGARIAN SMALL LETTER US 10CFA..10CFF ; Grapheme_Base # No [6] OLD HUNGARIAN NUMBER ONE..OLD HUNGARIAN NUMBER ONE THOUSAND 10D00..10D23 ; Grapheme_Base # Lo [36] HANIFI ROHINGYA LETTER A..HANIFI ROHINGYA MARK NA KHONNA 10D30..10D39 ; Grapheme_Base # Nd [10] HANIFI ROHINGYA DIGIT ZERO..HANIFI ROHINGYA DIGIT NINE 10D40..10D49 ; Grapheme_Base # Nd [10] GARAY DIGIT ZERO..GARAY DIGIT NINE 10D4A..10D4D ; Grapheme_Base # Lo [4] GARAY VOWEL SIGN A..GARAY VOWEL SIGN EE 10D4E ; Grapheme_Base # Lm GARAY VOWEL LENGTH MARK 10D4F ; Grapheme_Base # Lo GARAY SUKUN 10D50..10D65 ; Grapheme_Base # L& [22] GARAY CAPITAL LETTER A..GARAY CAPITAL LETTER OLD NA 10D6E ; Grapheme_Base # Pd GARAY HYPHEN 10D6F ; Grapheme_Base # Lm GARAY REDUPLICATION MARK 10D70..10D85 ; Grapheme_Base # L& [22] GARAY SMALL LETTER A..GARAY SMALL LETTER OLD NA 10D8E..10D8F ; Grapheme_Base # Sm [2] GARAY PLUS SIGN..GARAY MINUS SIGN 10E60..10E7E ; Grapheme_Base # No [31] RUMI DIGIT ONE..RUMI FRACTION TWO THIRDS 10E80..10EA9 ; Grapheme_Base # Lo [42] YEZIDI LETTER ELIF..YEZIDI LETTER ET 10EAD ; Grapheme_Base # Pd YEZIDI HYPHENATION MARK 10EB0..10EB1 ; Grapheme_Base # Lo [2] YEZIDI LETTER LAM WITH DOT ABOVE..YEZIDI LETTER YOT WITH CIRCUMFLEX ABOVE 10EC2..10EC4 ; Grapheme_Base # Lo [3] ARABIC LETTER DAL WITH TWO DOTS VERTICALLY BELOW..ARABIC LETTER KAF WITH TWO DOTS VERTICALLY BELOW 10EC5 ; 
Grapheme_Base # Lm ARABIC SMALL YEH BARREE WITH TWO DOTS BELOW 10EC6..10EC7 ; Grapheme_Base # Lo [2] ARABIC LETTER THIN NOON..ARABIC LETTER YEH WITH FOUR DOTS BELOW 10ED0 ; Grapheme_Base # Po ARABIC BIBLICAL END OF VERSE 10ED1..10ED8 ; Grapheme_Base # So [8] ARABIC LIGATURE ALAYHAA AS-SALAATU WAS-SALAAM..ARABIC LIGATURE NAWWARA ALLAAHU MARQADAH 10F00..10F1C ; Grapheme_Base # Lo [29] OLD SOGDIAN LETTER ALEPH..OLD SOGDIAN LETTER FINAL TAW WITH VERTICAL TAIL 10F1D..10F26 ; Grapheme_Base # No [10] OLD SOGDIAN NUMBER ONE..OLD SOGDIAN FRACTION ONE HALF 10F27 ; Grapheme_Base # Lo OLD SOGDIAN LIGATURE AYIN-DALETH 10F30..10F45 ; Grapheme_Base # Lo [22] SOGDIAN LETTER ALEPH..SOGDIAN INDEPENDENT SHIN 10F51..10F54 ; Grapheme_Base # No [4] SOGDIAN NUMBER ONE..SOGDIAN NUMBER ONE HUNDRED 10F55..10F59 ; Grapheme_Base # Po [5] SOGDIAN PUNCTUATION TWO VERTICAL BARS..SOGDIAN PUNCTUATION HALF CIRCLE WITH DOT 10F70..10F81 ; Grapheme_Base # Lo [18] OLD UYGHUR LETTER ALEPH..OLD UYGHUR LETTER LESH 10F86..10F89 ; Grapheme_Base # Po [4] OLD UYGHUR PUNCTUATION BAR..OLD UYGHUR PUNCTUATION FOUR DOTS 10FB0..10FC4 ; Grapheme_Base # Lo [21] CHORASMIAN LETTER ALEPH..CHORASMIAN LETTER TAW 10FC5..10FCB ; Grapheme_Base # No [7] CHORASMIAN NUMBER ONE..CHORASMIAN NUMBER ONE HUNDRED 10FE0..10FF6 ; Grapheme_Base # Lo [23] ELYMAIC LETTER ALEPH..ELYMAIC LIGATURE ZAYIN-YODH 11000 ; Grapheme_Base # Mc BRAHMI SIGN CANDRABINDU 11002 ; Grapheme_Base # Mc BRAHMI SIGN VISARGA 11003..11037 ; Grapheme_Base # Lo [53] BRAHMI SIGN JIHVAMULIYA..BRAHMI LETTER OLD TAMIL NNNA 11047..1104D ; Grapheme_Base # Po [7] BRAHMI DANDA..BRAHMI PUNCTUATION LOTUS 11052..11065 ; Grapheme_Base # No [20] BRAHMI NUMBER ONE..BRAHMI NUMBER ONE THOUSAND 11066..1106F ; Grapheme_Base # Nd [10] BRAHMI DIGIT ZERO..BRAHMI DIGIT NINE 11071..11072 ; Grapheme_Base # Lo [2] BRAHMI LETTER OLD TAMIL SHORT E..BRAHMI LETTER OLD TAMIL SHORT O 11075 ; Grapheme_Base # Lo BRAHMI LETTER OLD TAMIL LLA 11082 ; Grapheme_Base # Mc KAITHI SIGN VISARGA 
11083..110AF ; Grapheme_Base # Lo [45] KAITHI LETTER A..KAITHI LETTER HA 110B0..110B2 ; Grapheme_Base # Mc [3] KAITHI VOWEL SIGN AA..KAITHI VOWEL SIGN II 110B7..110B8 ; Grapheme_Base # Mc [2] KAITHI VOWEL SIGN O..KAITHI VOWEL SIGN AU 110BB..110BC ; Grapheme_Base # Po [2] KAITHI ABBREVIATION SIGN..KAITHI ENUMERATION SIGN 110BE..110C1 ; Grapheme_Base # Po [4] KAITHI SECTION MARK..KAITHI DOUBLE DANDA 110D0..110E8 ; Grapheme_Base # Lo [25] SORA SOMPENG LETTER SAH..SORA SOMPENG LETTER MAE 110F0..110F9 ; Grapheme_Base # Nd [10] SORA SOMPENG DIGIT ZERO..SORA SOMPENG DIGIT NINE 11103..11126 ; Grapheme_Base # Lo [36] CHAKMA LETTER AA..CHAKMA LETTER HAA 1112C ; Grapheme_Base # Mc CHAKMA VOWEL SIGN E 11136..1113F ; Grapheme_Base # Nd [10] CHAKMA DIGIT ZERO..CHAKMA DIGIT NINE 11140..11143 ; Grapheme_Base # Po [4] CHAKMA SECTION MARK..CHAKMA QUESTION MARK 11144 ; Grapheme_Base # Lo CHAKMA LETTER LHAA 11145..11146 ; Grapheme_Base # Mc [2] CHAKMA VOWEL SIGN AA..CHAKMA VOWEL SIGN EI 11147 ; Grapheme_Base # Lo CHAKMA LETTER VAA 11150..11172 ; Grapheme_Base # Lo [35] MAHAJANI LETTER A..MAHAJANI LETTER RRA 11174..11175 ; Grapheme_Base # Po [2] MAHAJANI ABBREVIATION SIGN..MAHAJANI SECTION MARK 11176 ; Grapheme_Base # Lo MAHAJANI LIGATURE SHRI 11182 ; Grapheme_Base # Mc SHARADA SIGN VISARGA 11183..111B2 ; Grapheme_Base # Lo [48] SHARADA LETTER A..SHARADA LETTER HA 111B3..111B5 ; Grapheme_Base # Mc [3] SHARADA VOWEL SIGN AA..SHARADA VOWEL SIGN II 111BF ; Grapheme_Base # Mc SHARADA VOWEL SIGN AU 111C1..111C4 ; Grapheme_Base # Lo [4] SHARADA SIGN AVAGRAHA..SHARADA OM 111C5..111C8 ; Grapheme_Base # Po [4] SHARADA DANDA..SHARADA SEPARATOR 111CD ; Grapheme_Base # Po SHARADA SUTRA MARK 111CE ; Grapheme_Base # Mc SHARADA VOWEL SIGN PRISHTHAMATRA E 111D0..111D9 ; Grapheme_Base # Nd [10] SHARADA DIGIT ZERO..SHARADA DIGIT NINE 111DA ; Grapheme_Base # Lo SHARADA EKAM 111DB ; Grapheme_Base # Po SHARADA SIGN SIDDHAM 111DC ; Grapheme_Base # Lo SHARADA HEADSTROKE 111DD..111DF ; Grapheme_Base # Po [3] 
SHARADA CONTINUATION SIGN..SHARADA SECTION MARK-2 111E1..111F4 ; Grapheme_Base # No [20] SINHALA ARCHAIC DIGIT ONE..SINHALA ARCHAIC NUMBER ONE THOUSAND 11200..11211 ; Grapheme_Base # Lo [18] KHOJKI LETTER A..KHOJKI LETTER JJA 11213..1122B ; Grapheme_Base # Lo [25] KHOJKI LETTER NYA..KHOJKI LETTER LLA 1122C..1122E ; Grapheme_Base # Mc [3] KHOJKI VOWEL SIGN AA..KHOJKI VOWEL SIGN II 11232..11233 ; Grapheme_Base # Mc [2] KHOJKI VOWEL SIGN O..KHOJKI VOWEL SIGN AU 11238..1123D ; Grapheme_Base # Po [6] KHOJKI DANDA..KHOJKI ABBREVIATION SIGN 1123F..11240 ; Grapheme_Base # Lo [2] KHOJKI LETTER QA..KHOJKI LETTER SHORT I 11280..11286 ; Grapheme_Base # Lo [7] MULTANI LETTER A..MULTANI LETTER GA 11288 ; Grapheme_Base # Lo MULTANI LETTER GHA 1128A..1128D ; Grapheme_Base # Lo [4] MULTANI LETTER CA..MULTANI LETTER JJA 1128F..1129D ; Grapheme_Base # Lo [15] MULTANI LETTER NYA..MULTANI LETTER BA 1129F..112A8 ; Grapheme_Base # Lo [10] MULTANI LETTER BHA..MULTANI LETTER RHA 112A9 ; Grapheme_Base # Po MULTANI SECTION MARK 112B0..112DE ; Grapheme_Base # Lo [47] KHUDAWADI LETTER A..KHUDAWADI LETTER HA 112E0..112E2 ; Grapheme_Base # Mc [3] KHUDAWADI VOWEL SIGN AA..KHUDAWADI VOWEL SIGN II 112F0..112F9 ; Grapheme_Base # Nd [10] KHUDAWADI DIGIT ZERO..KHUDAWADI DIGIT NINE 11302..11303 ; Grapheme_Base # Mc [2] GRANTHA SIGN ANUSVARA..GRANTHA SIGN VISARGA 11305..1130C ; Grapheme_Base # Lo [8] GRANTHA LETTER A..GRANTHA LETTER VOCALIC L 1130F..11310 ; Grapheme_Base # Lo [2] GRANTHA LETTER EE..GRANTHA LETTER AI 11313..11328 ; Grapheme_Base # Lo [22] GRANTHA LETTER OO..GRANTHA LETTER NA 1132A..11330 ; Grapheme_Base # Lo [7] GRANTHA LETTER PA..GRANTHA LETTER RA 11332..11333 ; Grapheme_Base # Lo [2] GRANTHA LETTER LA..GRANTHA LETTER LLA 11335..11339 ; Grapheme_Base # Lo [5] GRANTHA LETTER VA..GRANTHA LETTER HA 1133D ; Grapheme_Base # Lo GRANTHA SIGN AVAGRAHA 1133F ; Grapheme_Base # Mc GRANTHA VOWEL SIGN I 11341..11344 ; Grapheme_Base # Mc [4] GRANTHA VOWEL SIGN U..GRANTHA VOWEL SIGN VOCALIC RR 
11347..11348 ; Grapheme_Base # Mc [2] GRANTHA VOWEL SIGN EE..GRANTHA VOWEL SIGN AI 1134B..1134C ; Grapheme_Base # Mc [2] GRANTHA VOWEL SIGN OO..GRANTHA VOWEL SIGN AU 11350 ; Grapheme_Base # Lo GRANTHA OM 1135D..11361 ; Grapheme_Base # Lo [5] GRANTHA SIGN PLUTA..GRANTHA LETTER VOCALIC LL 11362..11363 ; Grapheme_Base # Mc [2] GRANTHA VOWEL SIGN VOCALIC L..GRANTHA VOWEL SIGN VOCALIC LL 11380..11389 ; Grapheme_Base # Lo [10] TULU-TIGALARI LETTER A..TULU-TIGALARI LETTER VOCALIC LL 1138B ; Grapheme_Base # Lo TULU-TIGALARI LETTER EE 1138E ; Grapheme_Base # Lo TULU-TIGALARI LETTER AI 11390..113B5 ; Grapheme_Base # Lo [38] TULU-TIGALARI LETTER OO..TULU-TIGALARI LETTER LLLA 113B7 ; Grapheme_Base # Lo TULU-TIGALARI SIGN AVAGRAHA 113B9..113BA ; Grapheme_Base # Mc [2] TULU-TIGALARI VOWEL SIGN I..TULU-TIGALARI VOWEL SIGN II 113CA ; Grapheme_Base # Mc TULU-TIGALARI SIGN CANDRA ANUNASIKA 113CC..113CD ; Grapheme_Base # Mc [2] TULU-TIGALARI SIGN ANUSVARA..TULU-TIGALARI SIGN VISARGA 113D1 ; Grapheme_Base # Lo TULU-TIGALARI REPHA 113D3 ; Grapheme_Base # Lo TULU-TIGALARI SIGN PLUTA 113D4..113D5 ; Grapheme_Base # Po [2] TULU-TIGALARI DANDA..TULU-TIGALARI DOUBLE DANDA 113D7..113D8 ; Grapheme_Base # Po [2] TULU-TIGALARI SIGN OM PUSHPIKA..TULU-TIGALARI SIGN SHRII PUSHPIKA 11400..11434 ; Grapheme_Base # Lo [53] NEWA LETTER A..NEWA LETTER HA 11435..11437 ; Grapheme_Base # Mc [3] NEWA VOWEL SIGN AA..NEWA VOWEL SIGN II 11440..11441 ; Grapheme_Base # Mc [2] NEWA VOWEL SIGN O..NEWA VOWEL SIGN AU 11445 ; Grapheme_Base # Mc NEWA SIGN VISARGA 11447..1144A ; Grapheme_Base # Lo [4] NEWA SIGN AVAGRAHA..NEWA SIDDHI 1144B..1144F ; Grapheme_Base # Po [5] NEWA DANDA..NEWA ABBREVIATION SIGN 11450..11459 ; Grapheme_Base # Nd [10] NEWA DIGIT ZERO..NEWA DIGIT NINE 1145A..1145B ; Grapheme_Base # Po [2] NEWA DOUBLE COMMA..NEWA PLACEHOLDER MARK 1145D ; Grapheme_Base # Po NEWA INSERTION SIGN 1145F..11461 ; Grapheme_Base # Lo [3] NEWA LETTER VEDIC ANUSVARA..NEWA SIGN UPADHMANIYA 11480..114AF ; Grapheme_Base # Lo 
[48] TIRHUTA ANJI..TIRHUTA LETTER HA 114B1..114B2 ; Grapheme_Base # Mc [2] TIRHUTA VOWEL SIGN I..TIRHUTA VOWEL SIGN II 114B9 ; Grapheme_Base # Mc TIRHUTA VOWEL SIGN E 114BB..114BC ; Grapheme_Base # Mc [2] TIRHUTA VOWEL SIGN AI..TIRHUTA VOWEL SIGN O 114BE ; Grapheme_Base # Mc TIRHUTA VOWEL SIGN AU 114C1 ; Grapheme_Base # Mc TIRHUTA SIGN VISARGA 114C4..114C5 ; Grapheme_Base # Lo [2] TIRHUTA SIGN AVAGRAHA..TIRHUTA GVANG 114C6 ; Grapheme_Base # Po TIRHUTA ABBREVIATION SIGN 114C7 ; Grapheme_Base # Lo TIRHUTA OM 114D0..114D9 ; Grapheme_Base # Nd [10] TIRHUTA DIGIT ZERO..TIRHUTA DIGIT NINE 11580..115AE ; Grapheme_Base # Lo [47] SIDDHAM LETTER A..SIDDHAM LETTER HA 115B0..115B1 ; Grapheme_Base # Mc [2] SIDDHAM VOWEL SIGN I..SIDDHAM VOWEL SIGN II 115B8..115BB ; Grapheme_Base # Mc [4] SIDDHAM VOWEL SIGN E..SIDDHAM VOWEL SIGN AU 115BE ; Grapheme_Base # Mc SIDDHAM SIGN VISARGA 115C1..115D7 ; Grapheme_Base # Po [23] SIDDHAM SIGN SIDDHAM..SIDDHAM SECTION MARK WITH CIRCLES AND FOUR ENCLOSURES 115D8..115DB ; Grapheme_Base # Lo [4] SIDDHAM LETTER THREE-CIRCLE ALTERNATE I..SIDDHAM LETTER ALTERNATE U 11600..1162F ; Grapheme_Base # Lo [48] MODI LETTER A..MODI LETTER LLA 11630..11632 ; Grapheme_Base # Mc [3] MODI VOWEL SIGN AA..MODI VOWEL SIGN II 1163B..1163C ; Grapheme_Base # Mc [2] MODI VOWEL SIGN O..MODI VOWEL SIGN AU 1163E ; Grapheme_Base # Mc MODI SIGN VISARGA 11641..11643 ; Grapheme_Base # Po [3] MODI DANDA..MODI ABBREVIATION SIGN 11644 ; Grapheme_Base # Lo MODI SIGN HUVA 11650..11659 ; Grapheme_Base # Nd [10] MODI DIGIT ZERO..MODI DIGIT NINE 11660..1166C ; Grapheme_Base # Po [13] MONGOLIAN BIRGA WITH ORNAMENT..MONGOLIAN TURNED SWIRL BIRGA WITH DOUBLE ORNAMENT 11680..116AA ; Grapheme_Base # Lo [43] TAKRI LETTER A..TAKRI LETTER RRA 116AC ; Grapheme_Base # Mc TAKRI SIGN VISARGA 116AE..116AF ; Grapheme_Base # Mc [2] TAKRI VOWEL SIGN I..TAKRI VOWEL SIGN II 116B8 ; Grapheme_Base # Lo TAKRI LETTER ARCHAIC KHA 116B9 ; Grapheme_Base # Po TAKRI ABBREVIATION SIGN 116C0..116C9 ; 
Grapheme_Base # Nd [10] TAKRI DIGIT ZERO..TAKRI DIGIT NINE 116D0..116E3 ; Grapheme_Base # Nd [20] MYANMAR PAO DIGIT ZERO..MYANMAR EASTERN PWO KAREN DIGIT NINE 11700..1171A ; Grapheme_Base # Lo [27] AHOM LETTER KA..AHOM LETTER ALTERNATE BA 1171E ; Grapheme_Base # Mc AHOM CONSONANT SIGN MEDIAL RA 11720..11721 ; Grapheme_Base # Mc [2] AHOM VOWEL SIGN A..AHOM VOWEL SIGN AA 11726 ; Grapheme_Base # Mc AHOM VOWEL SIGN E 11730..11739 ; Grapheme_Base # Nd [10] AHOM DIGIT ZERO..AHOM DIGIT NINE 1173A..1173B ; Grapheme_Base # No [2] AHOM NUMBER TEN..AHOM NUMBER TWENTY 1173C..1173E ; Grapheme_Base # Po [3] AHOM SIGN SMALL SECTION..AHOM SIGN RULAI 1173F ; Grapheme_Base # So AHOM SYMBOL VI 11740..11746 ; Grapheme_Base # Lo [7] AHOM LETTER CA..AHOM LETTER LLA 11800..1182B ; Grapheme_Base # Lo [44] DOGRA LETTER A..DOGRA LETTER RRA 1182C..1182E ; Grapheme_Base # Mc [3] DOGRA VOWEL SIGN AA..DOGRA VOWEL SIGN II 11838 ; Grapheme_Base # Mc DOGRA SIGN VISARGA 1183B ; Grapheme_Base # Po DOGRA ABBREVIATION SIGN 118A0..118DF ; Grapheme_Base # L& [64] WARANG CITI CAPITAL LETTER NGAA..WARANG CITI SMALL LETTER VIYO 118E0..118E9 ; Grapheme_Base # Nd [10] WARANG CITI DIGIT ZERO..WARANG CITI DIGIT NINE 118EA..118F2 ; Grapheme_Base # No [9] WARANG CITI NUMBER TEN..WARANG CITI NUMBER NINETY 118FF..11906 ; Grapheme_Base # Lo [8] WARANG CITI OM..DIVES AKURU LETTER E 11909 ; Grapheme_Base # Lo DIVES AKURU LETTER O 1190C..11913 ; Grapheme_Base # Lo [8] DIVES AKURU LETTER KA..DIVES AKURU LETTER JA 11915..11916 ; Grapheme_Base # Lo [2] DIVES AKURU LETTER NYA..DIVES AKURU LETTER TTA 11918..1192F ; Grapheme_Base # Lo [24] DIVES AKURU LETTER DDA..DIVES AKURU LETTER ZA 11931..11935 ; Grapheme_Base # Mc [5] DIVES AKURU VOWEL SIGN I..DIVES AKURU VOWEL SIGN E 11937..11938 ; Grapheme_Base # Mc [2] DIVES AKURU VOWEL SIGN AI..DIVES AKURU VOWEL SIGN O 1193F ; Grapheme_Base # Lo DIVES AKURU PREFIXED NASAL SIGN 11940 ; Grapheme_Base # Mc DIVES AKURU MEDIAL YA 11941 ; Grapheme_Base # Lo DIVES AKURU INITIAL RA 11942 ; 
Grapheme_Base # Mc DIVES AKURU MEDIAL RA 11944..11946 ; Grapheme_Base # Po [3] DIVES AKURU DOUBLE DANDA..DIVES AKURU END OF TEXT MARK 11950..11959 ; Grapheme_Base # Nd [10] DIVES AKURU DIGIT ZERO..DIVES AKURU DIGIT NINE 119A0..119A7 ; Grapheme_Base # Lo [8] NANDINAGARI LETTER A..NANDINAGARI LETTER VOCALIC RR 119AA..119D0 ; Grapheme_Base # Lo [39] NANDINAGARI LETTER E..NANDINAGARI LETTER RRA 119D1..119D3 ; Grapheme_Base # Mc [3] NANDINAGARI VOWEL SIGN AA..NANDINAGARI VOWEL SIGN II 119DC..119DF ; Grapheme_Base # Mc [4] NANDINAGARI VOWEL SIGN O..NANDINAGARI SIGN VISARGA 119E1 ; Grapheme_Base # Lo NANDINAGARI SIGN AVAGRAHA 119E2 ; Grapheme_Base # Po NANDINAGARI SIGN SIDDHAM 119E3 ; Grapheme_Base # Lo NANDINAGARI HEADSTROKE 119E4 ; Grapheme_Base # Mc NANDINAGARI VOWEL SIGN PRISHTHAMATRA E 11A00 ; Grapheme_Base # Lo ZANABAZAR SQUARE LETTER A 11A0B..11A32 ; Grapheme_Base # Lo [40] ZANABAZAR SQUARE LETTER KA..ZANABAZAR SQUARE LETTER KSSA 11A39 ; Grapheme_Base # Mc ZANABAZAR SQUARE SIGN VISARGA 11A3A ; Grapheme_Base # Lo ZANABAZAR SQUARE CLUSTER-INITIAL LETTER RA 11A3F..11A46 ; Grapheme_Base # Po [8] ZANABAZAR SQUARE INITIAL HEAD MARK..ZANABAZAR SQUARE CLOSING DOUBLE-LINED HEAD MARK 11A50 ; Grapheme_Base # Lo SOYOMBO LETTER A 11A57..11A58 ; Grapheme_Base # Mc [2] SOYOMBO VOWEL SIGN AI..SOYOMBO VOWEL SIGN AU 11A5C..11A89 ; Grapheme_Base # Lo [46] SOYOMBO LETTER KA..SOYOMBO CLUSTER-INITIAL LETTER SA 11A97 ; Grapheme_Base # Mc SOYOMBO SIGN VISARGA 11A9A..11A9C ; Grapheme_Base # Po [3] SOYOMBO MARK TSHEG..SOYOMBO MARK DOUBLE SHAD 11A9D ; Grapheme_Base # Lo SOYOMBO MARK PLUTA 11A9E..11AA2 ; Grapheme_Base # Po [5] SOYOMBO HEAD MARK WITH MOON AND SUN AND TRIPLE FLAME..SOYOMBO TERMINAL MARK-2 11AB0..11AF8 ; Grapheme_Base # Lo [73] CANADIAN SYLLABICS NATTILIK HI..PAU CIN HAU GLOTTAL STOP FINAL 11B00..11B09 ; Grapheme_Base # Po [10] DEVANAGARI HEAD MARK..DEVANAGARI SIGN MINDU 11B61 ; Grapheme_Base # Mc SHARADA VOWEL SIGN OOE 11B65 ; Grapheme_Base # Mc SHARADA VOWEL SIGN SHORT O 11B67 
; Grapheme_Base # Mc SHARADA VOWEL SIGN CANDRA O 11BC0..11BE0 ; Grapheme_Base # Lo [33] SUNUWAR LETTER DEVI..SUNUWAR LETTER KLOKO 11BE1 ; Grapheme_Base # Po SUNUWAR SIGN PVO 11BF0..11BF9 ; Grapheme_Base # Nd [10] SUNUWAR DIGIT ZERO..SUNUWAR DIGIT NINE 11C00..11C08 ; Grapheme_Base # Lo [9] BHAIKSUKI LETTER A..BHAIKSUKI LETTER VOCALIC L 11C0A..11C2E ; Grapheme_Base # Lo [37] BHAIKSUKI LETTER E..BHAIKSUKI LETTER HA 11C2F ; Grapheme_Base # Mc BHAIKSUKI VOWEL SIGN AA 11C3E ; Grapheme_Base # Mc BHAIKSUKI SIGN VISARGA 11C40 ; Grapheme_Base # Lo BHAIKSUKI SIGN AVAGRAHA 11C41..11C45 ; Grapheme_Base # Po [5] BHAIKSUKI DANDA..BHAIKSUKI GAP FILLER-2 11C50..11C59 ; Grapheme_Base # Nd [10] BHAIKSUKI DIGIT ZERO..BHAIKSUKI DIGIT NINE 11C5A..11C6C ; Grapheme_Base # No [19] BHAIKSUKI NUMBER ONE..BHAIKSUKI HUNDREDS UNIT MARK 11C70..11C71 ; Grapheme_Base # Po [2] MARCHEN HEAD MARK..MARCHEN MARK SHAD 11C72..11C8F ; Grapheme_Base # Lo [30] MARCHEN LETTER KA..MARCHEN LETTER A 11CA9 ; Grapheme_Base # Mc MARCHEN SUBJOINED LETTER YA 11CB1 ; Grapheme_Base # Mc MARCHEN VOWEL SIGN I 11CB4 ; Grapheme_Base # Mc MARCHEN VOWEL SIGN O 11D00..11D06 ; Grapheme_Base # Lo [7] MASARAM GONDI LETTER A..MASARAM GONDI LETTER E 11D08..11D09 ; Grapheme_Base # Lo [2] MASARAM GONDI LETTER AI..MASARAM GONDI LETTER O 11D0B..11D30 ; Grapheme_Base # Lo [38] MASARAM GONDI LETTER AU..MASARAM GONDI LETTER TRA 11D46 ; Grapheme_Base # Lo MASARAM GONDI REPHA 11D50..11D59 ; Grapheme_Base # Nd [10] MASARAM GONDI DIGIT ZERO..MASARAM GONDI DIGIT NINE 11D60..11D65 ; Grapheme_Base # Lo [6] GUNJALA GONDI LETTER A..GUNJALA GONDI LETTER UU 11D67..11D68 ; Grapheme_Base # Lo [2] GUNJALA GONDI LETTER EE..GUNJALA GONDI LETTER AI 11D6A..11D89 ; Grapheme_Base # Lo [32] GUNJALA GONDI LETTER OO..GUNJALA GONDI LETTER SA 11D8A..11D8E ; Grapheme_Base # Mc [5] GUNJALA GONDI VOWEL SIGN AA..GUNJALA GONDI VOWEL SIGN UU 11D93..11D94 ; Grapheme_Base # Mc [2] GUNJALA GONDI VOWEL SIGN OO..GUNJALA GONDI VOWEL SIGN AU 11D96 ; Grapheme_Base # Mc 
GUNJALA GONDI SIGN VISARGA 11D98 ; Grapheme_Base # Lo GUNJALA GONDI OM 11DA0..11DA9 ; Grapheme_Base # Nd [10] GUNJALA GONDI DIGIT ZERO..GUNJALA GONDI DIGIT NINE 11DB0..11DD8 ; Grapheme_Base # Lo [41] TOLONG SIKI LETTER I..TOLONG SIKI LETTER RRH 11DD9 ; Grapheme_Base # Lm TOLONG SIKI SIGN SELA 11DDA..11DDB ; Grapheme_Base # Lo [2] TOLONG SIKI SIGN HECAKA..TOLONG SIKI UNGGA 11DE0..11DE9 ; Grapheme_Base # Nd [10] TOLONG SIKI DIGIT ZERO..TOLONG SIKI DIGIT NINE 11EE0..11EF2 ; Grapheme_Base # Lo [19] MAKASAR LETTER KA..MAKASAR ANGKA 11EF5..11EF6 ; Grapheme_Base # Mc [2] MAKASAR VOWEL SIGN E..MAKASAR VOWEL SIGN O 11EF7..11EF8 ; Grapheme_Base # Po [2] MAKASAR PASSIMBANG..MAKASAR END OF SECTION 11F02 ; Grapheme_Base # Lo KAWI SIGN REPHA 11F03 ; Grapheme_Base # Mc KAWI SIGN VISARGA 11F04..11F10 ; Grapheme_Base # Lo [13] KAWI LETTER A..KAWI LETTER O 11F12..11F33 ; Grapheme_Base # Lo [34] KAWI LETTER KA..KAWI LETTER JNYA 11F34..11F35 ; Grapheme_Base # Mc [2] KAWI VOWEL SIGN AA..KAWI VOWEL SIGN ALTERNATE AA 11F3E..11F3F ; Grapheme_Base # Mc [2] KAWI VOWEL SIGN E..KAWI VOWEL SIGN AI 11F43..11F4F ; Grapheme_Base # Po [13] KAWI DANDA..KAWI PUNCTUATION CLOSING SPIRAL 11F50..11F59 ; Grapheme_Base # Nd [10] KAWI DIGIT ZERO..KAWI DIGIT NINE 11FB0 ; Grapheme_Base # Lo LISU LETTER YHA 11FC0..11FD4 ; Grapheme_Base # No [21] TAMIL FRACTION ONE THREE-HUNDRED-AND-TWENTIETH..TAMIL FRACTION DOWNSCALING FACTOR KIIZH 11FD5..11FDC ; Grapheme_Base # So [8] TAMIL SIGN NEL..TAMIL SIGN MUKKURUNI 11FDD..11FE0 ; Grapheme_Base # Sc [4] TAMIL SIGN KAACU..TAMIL SIGN VARAAKAN 11FE1..11FF1 ; Grapheme_Base # So [17] TAMIL SIGN PAARAM..TAMIL SIGN VAKAIYARAA 11FFF ; Grapheme_Base # Po TAMIL PUNCTUATION END OF TEXT 12000..12399 ; Grapheme_Base # Lo [922] CUNEIFORM SIGN A..CUNEIFORM SIGN U U 12400..1246E ; Grapheme_Base # Nl [111] CUNEIFORM NUMERIC SIGN TWO ASH..CUNEIFORM NUMERIC SIGN NINE U VARIANT FORM 12470..12474 ; Grapheme_Base # Po [5] CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER..CUNEIFORM 
PUNCTUATION SIGN DIAGONAL QUADCOLON 12480..12543 ; Grapheme_Base # Lo [196] CUNEIFORM SIGN AB TIMES NUN TENU..CUNEIFORM SIGN ZU5 TIMES THREE DISH TENU 12F90..12FF0 ; Grapheme_Base # Lo [97] CYPRO-MINOAN SIGN CM001..CYPRO-MINOAN SIGN CM114 12FF1..12FF2 ; Grapheme_Base # Po [2] CYPRO-MINOAN SIGN CM301..CYPRO-MINOAN SIGN CM302 13000..1342F ; Grapheme_Base # Lo [1072] EGYPTIAN HIEROGLYPH A001..EGYPTIAN HIEROGLYPH V011D 13441..13446 ; Grapheme_Base # Lo [6] EGYPTIAN HIEROGLYPH FULL BLANK..EGYPTIAN HIEROGLYPH WIDE LOST SIGN 13460..143FA ; Grapheme_Base # Lo [3995] EGYPTIAN HIEROGLYPH-13460..EGYPTIAN HIEROGLYPH-143FA 14400..14646 ; Grapheme_Base # Lo [583] ANATOLIAN HIEROGLYPH A001..ANATOLIAN HIEROGLYPH A530 16100..1611D ; Grapheme_Base # Lo [30] GURUNG KHEMA LETTER A..GURUNG KHEMA LETTER SA 1612A..1612C ; Grapheme_Base # Mc [3] GURUNG KHEMA CONSONANT SIGN MEDIAL YA..GURUNG KHEMA CONSONANT SIGN MEDIAL HA 16130..16139 ; Grapheme_Base # Nd [10] GURUNG KHEMA DIGIT ZERO..GURUNG KHEMA DIGIT NINE 16800..16A38 ; Grapheme_Base # Lo [569] BAMUM LETTER PHASE-A NGKUE MFON..BAMUM LETTER PHASE-F VUEQ 16A40..16A5E ; Grapheme_Base # Lo [31] MRO LETTER TA..MRO LETTER TEK 16A60..16A69 ; Grapheme_Base # Nd [10] MRO DIGIT ZERO..MRO DIGIT NINE 16A6E..16A6F ; Grapheme_Base # Po [2] MRO DANDA..MRO DOUBLE DANDA 16A70..16ABE ; Grapheme_Base # Lo [79] TANGSA LETTER OZ..TANGSA LETTER ZA 16AC0..16AC9 ; Grapheme_Base # Nd [10] TANGSA DIGIT ZERO..TANGSA DIGIT NINE 16AD0..16AED ; Grapheme_Base # Lo [30] BASSA VAH LETTER ENNI..BASSA VAH LETTER I 16AF5 ; Grapheme_Base # Po BASSA VAH FULL STOP 16B00..16B2F ; Grapheme_Base # Lo [48] PAHAWH HMONG VOWEL KEEB..PAHAWH HMONG CONSONANT CAU 16B37..16B3B ; Grapheme_Base # Po [5] PAHAWH HMONG SIGN VOS THOM..PAHAWH HMONG SIGN VOS FEEM 16B3C..16B3F ; Grapheme_Base # So [4] PAHAWH HMONG SIGN XYEEM NTXIV..PAHAWH HMONG SIGN XYEEM FAIB 16B40..16B43 ; Grapheme_Base # Lm [4] PAHAWH HMONG SIGN VOS SEEV..PAHAWH HMONG SIGN IB YAM 16B44 ; Grapheme_Base # Po PAHAWH HMONG SIGN 
XAUS 16B45 ; Grapheme_Base # So PAHAWH HMONG SIGN CIM TSOV ROG 16B50..16B59 ; Grapheme_Base # Nd [10] PAHAWH HMONG DIGIT ZERO..PAHAWH HMONG DIGIT NINE 16B5B..16B61 ; Grapheme_Base # No [7] PAHAWH HMONG NUMBER TENS..PAHAWH HMONG NUMBER TRILLIONS 16B63..16B77 ; Grapheme_Base # Lo [21] PAHAWH HMONG SIGN VOS LUB..PAHAWH HMONG SIGN CIM NRES TOS 16B7D..16B8F ; Grapheme_Base # Lo [19] PAHAWH HMONG CLAN SIGN TSHEEJ..PAHAWH HMONG CLAN SIGN VWJ 16D40..16D42 ; Grapheme_Base # Lm [3] KIRAT RAI SIGN ANUSVARA..KIRAT RAI SIGN VISARGA 16D43..16D6A ; Grapheme_Base # Lo [40] KIRAT RAI LETTER A..KIRAT RAI VOWEL SIGN AU 16D6B..16D6C ; Grapheme_Base # Lm [2] KIRAT RAI SIGN VIRAMA..KIRAT RAI SIGN SAAT 16D6D..16D6F ; Grapheme_Base # Po [3] KIRAT RAI SIGN YUPI..KIRAT RAI DOUBLE DANDA 16D70..16D79 ; Grapheme_Base # Nd [10] KIRAT RAI DIGIT ZERO..KIRAT RAI DIGIT NINE 16E40..16E7F ; Grapheme_Base # L& [64] MEDEFAIDRIN CAPITAL LETTER M..MEDEFAIDRIN SMALL LETTER Y 16E80..16E96 ; Grapheme_Base # No [23] MEDEFAIDRIN DIGIT ZERO..MEDEFAIDRIN DIGIT THREE ALTERNATE FORM 16E97..16E9A ; Grapheme_Base # Po [4] MEDEFAIDRIN COMMA..MEDEFAIDRIN EXCLAMATION OH 16EA0..16EB8 ; Grapheme_Base # L& [25] BERIA ERFE CAPITAL LETTER ARKAB..BERIA ERFE CAPITAL LETTER AY 16EBB..16ED3 ; Grapheme_Base # L& [25] BERIA ERFE SMALL LETTER ARKAB..BERIA ERFE SMALL LETTER AY 16F00..16F4A ; Grapheme_Base # Lo [75] MIAO LETTER PA..MIAO LETTER RTE 16F50 ; Grapheme_Base # Lo MIAO LETTER NASALIZATION 16F51..16F87 ; Grapheme_Base # Mc [55] MIAO SIGN ASPIRATION..MIAO VOWEL SIGN UI 16F93..16F9F ; Grapheme_Base # Lm [13] MIAO LETTER TONE-2..MIAO LETTER REFORMED TONE-8 16FE0..16FE1 ; Grapheme_Base # Lm [2] TANGUT ITERATION MARK..NUSHU ITERATION MARK 16FE2 ; Grapheme_Base # Po OLD CHINESE HOOK MARK 16FE3 ; Grapheme_Base # Lm OLD CHINESE ITERATION MARK 16FF2..16FF3 ; Grapheme_Base # Lm [2] CHINESE SMALL SIMPLIFIED ER..CHINESE SMALL TRADITIONAL ER 16FF4..16FF6 ; Grapheme_Base # Nl [3] YANGQIN SIGN SLOW ONE BEAT..YANGQIN SIGN SLOW TWO BEATS 
17000..18CD5 ; Grapheme_Base # Lo [7382] TANGUT IDEOGRAPH-17000..KHITAN SMALL SCRIPT CHARACTER-18CD5 18CFF..18D1E ; Grapheme_Base # Lo [32] KHITAN SMALL SCRIPT CHARACTER-18CFF..TANGUT IDEOGRAPH-18D1E 18D80..18DF2 ; Grapheme_Base # Lo [115] TANGUT COMPONENT-769..TANGUT COMPONENT-883 1AFF0..1AFF3 ; Grapheme_Base # Lm [4] KATAKANA LETTER MINNAN TONE-2..KATAKANA LETTER MINNAN TONE-5 1AFF5..1AFFB ; Grapheme_Base # Lm [7] KATAKANA LETTER MINNAN TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-5 1AFFD..1AFFE ; Grapheme_Base # Lm [2] KATAKANA LETTER MINNAN NASALIZED TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-8 1B000..1B122 ; Grapheme_Base # Lo [291] KATAKANA LETTER ARCHAIC E..KATAKANA LETTER ARCHAIC WU 1B132 ; Grapheme_Base # Lo HIRAGANA LETTER SMALL KO 1B150..1B152 ; Grapheme_Base # Lo [3] HIRAGANA LETTER SMALL WI..HIRAGANA LETTER SMALL WO 1B155 ; Grapheme_Base # Lo KATAKANA LETTER SMALL KO 1B164..1B167 ; Grapheme_Base # Lo [4] KATAKANA LETTER SMALL WI..KATAKANA LETTER SMALL N 1B170..1B2FB ; Grapheme_Base # Lo [396] NUSHU CHARACTER-1B170..NUSHU CHARACTER-1B2FB 1BC00..1BC6A ; Grapheme_Base # Lo [107] DUPLOYAN LETTER H..DUPLOYAN LETTER VOCALIC M 1BC70..1BC7C ; Grapheme_Base # Lo [13] DUPLOYAN AFFIX LEFT HORIZONTAL SECANT..DUPLOYAN AFFIX ATTACHED TANGENT HOOK 1BC80..1BC88 ; Grapheme_Base # Lo [9] DUPLOYAN AFFIX HIGH ACUTE..DUPLOYAN AFFIX HIGH VERTICAL 1BC90..1BC99 ; Grapheme_Base # Lo [10] DUPLOYAN AFFIX LOW ACUTE..DUPLOYAN AFFIX LOW ARROW 1BC9C ; Grapheme_Base # So DUPLOYAN SIGN O WITH CROSS 1BC9F ; Grapheme_Base # Po DUPLOYAN PUNCTUATION CHINOOK FULL STOP 1CC00..1CCEF ; Grapheme_Base # So [240] UP-POINTING GO-KART..OUTLINED LATIN CAPITAL LETTER Z 1CCF0..1CCF9 ; Grapheme_Base # Nd [10] OUTLINED DIGIT ZERO..OUTLINED DIGIT NINE 1CCFA..1CCFC ; Grapheme_Base # So [3] SNAKE SYMBOL..NOSE SYMBOL 1CD00..1CEB3 ; Grapheme_Base # So [436] BLOCK OCTANT-3..BLACK RIGHT TRIANGLE CARET 1CEBA..1CED0 ; Grapheme_Base # So [23] FRAGILE SYMBOL..LEUKOTHEA 1CEE0..1CEEF ; Grapheme_Base # So [16] 
GEOMANTIC FIGURE POPULUS..GEOMANTIC FIGURE VIA 1CEF0 ; Grapheme_Base # Sm MEDIUM SMALL WHITE CIRCLE WITH HORIZONTAL BAR 1CF50..1CFC3 ; Grapheme_Base # So [116] ZNAMENNY NEUME KRYUK..ZNAMENNY NEUME PAUK 1D000..1D0F5 ; Grapheme_Base # So [246] BYZANTINE MUSICAL SYMBOL PSILI..BYZANTINE MUSICAL SYMBOL GORGON NEO KATO 1D100..1D126 ; Grapheme_Base # So [39] MUSICAL SYMBOL SINGLE BARLINE..MUSICAL SYMBOL DRUM CLEF-2 1D129..1D164 ; Grapheme_Base # So [60] MUSICAL SYMBOL MULTIPLE MEASURE REST..MUSICAL SYMBOL ONE HUNDRED TWENTY-EIGHTH NOTE 1D16A..1D16C ; Grapheme_Base # So [3] MUSICAL SYMBOL FINGERED TREMOLO-1..MUSICAL SYMBOL FINGERED TREMOLO-3 1D183..1D184 ; Grapheme_Base # So [2] MUSICAL SYMBOL ARPEGGIATO UP..MUSICAL SYMBOL ARPEGGIATO DOWN 1D18C..1D1A9 ; Grapheme_Base # So [30] MUSICAL SYMBOL RINFORZANDO..MUSICAL SYMBOL DEGREE SLASH 1D1AE..1D1EA ; Grapheme_Base # So [61] MUSICAL SYMBOL PEDAL MARK..MUSICAL SYMBOL KORON 1D200..1D241 ; Grapheme_Base # So [66] GREEK VOCAL NOTATION SYMBOL-1..GREEK INSTRUMENTAL NOTATION SYMBOL-54 1D245 ; Grapheme_Base # So GREEK MUSICAL LEIMMA 1D2C0..1D2D3 ; Grapheme_Base # No [20] KAKTOVIK NUMERAL ZERO..KAKTOVIK NUMERAL NINETEEN 1D2E0..1D2F3 ; Grapheme_Base # No [20] MAYAN NUMERAL ZERO..MAYAN NUMERAL NINETEEN 1D300..1D356 ; Grapheme_Base # So [87] MONOGRAM FOR EARTH..TETRAGRAM FOR FOSTERING 1D360..1D378 ; Grapheme_Base # No [25] COUNTING ROD UNIT DIGIT ONE..TALLY MARK FIVE 1D400..1D454 ; Grapheme_Base # L& [85] MATHEMATICAL BOLD CAPITAL A..MATHEMATICAL ITALIC SMALL G 1D456..1D49C ; Grapheme_Base # L& [71] MATHEMATICAL ITALIC SMALL I..MATHEMATICAL SCRIPT CAPITAL A 1D49E..1D49F ; Grapheme_Base # L& [2] MATHEMATICAL SCRIPT CAPITAL C..MATHEMATICAL SCRIPT CAPITAL D 1D4A2 ; Grapheme_Base # L& MATHEMATICAL SCRIPT CAPITAL G 1D4A5..1D4A6 ; Grapheme_Base # L& [2] MATHEMATICAL SCRIPT CAPITAL J..MATHEMATICAL SCRIPT CAPITAL K 1D4A9..1D4AC ; Grapheme_Base # L& [4] MATHEMATICAL SCRIPT CAPITAL N..MATHEMATICAL SCRIPT CAPITAL Q 1D4AE..1D4B9 ; Grapheme_Base # L& 
[12] MATHEMATICAL SCRIPT CAPITAL S..MATHEMATICAL SCRIPT SMALL D 1D4BB ; Grapheme_Base # L& MATHEMATICAL SCRIPT SMALL F 1D4BD..1D4C3 ; Grapheme_Base # L& [7] MATHEMATICAL SCRIPT SMALL H..MATHEMATICAL SCRIPT SMALL N 1D4C5..1D505 ; Grapheme_Base # L& [65] MATHEMATICAL SCRIPT SMALL P..MATHEMATICAL FRAKTUR CAPITAL B 1D507..1D50A ; Grapheme_Base # L& [4] MATHEMATICAL FRAKTUR CAPITAL D..MATHEMATICAL FRAKTUR CAPITAL G 1D50D..1D514 ; Grapheme_Base # L& [8] MATHEMATICAL FRAKTUR CAPITAL J..MATHEMATICAL FRAKTUR CAPITAL Q 1D516..1D51C ; Grapheme_Base # L& [7] MATHEMATICAL FRAKTUR CAPITAL S..MATHEMATICAL FRAKTUR CAPITAL Y 1D51E..1D539 ; Grapheme_Base # L& [28] MATHEMATICAL FRAKTUR SMALL A..MATHEMATICAL DOUBLE-STRUCK CAPITAL B 1D53B..1D53E ; Grapheme_Base # L& [4] MATHEMATICAL DOUBLE-STRUCK CAPITAL D..MATHEMATICAL DOUBLE-STRUCK CAPITAL G 1D540..1D544 ; Grapheme_Base # L& [5] MATHEMATICAL DOUBLE-STRUCK CAPITAL I..MATHEMATICAL DOUBLE-STRUCK CAPITAL M 1D546 ; Grapheme_Base # L& MATHEMATICAL DOUBLE-STRUCK CAPITAL O 1D54A..1D550 ; Grapheme_Base # L& [7] MATHEMATICAL DOUBLE-STRUCK CAPITAL S..MATHEMATICAL DOUBLE-STRUCK CAPITAL Y 1D552..1D6A5 ; Grapheme_Base # L& [340] MATHEMATICAL DOUBLE-STRUCK SMALL A..MATHEMATICAL ITALIC SMALL DOTLESS J 1D6A8..1D6C0 ; Grapheme_Base # L& [25] MATHEMATICAL BOLD CAPITAL ALPHA..MATHEMATICAL BOLD CAPITAL OMEGA 1D6C1 ; Grapheme_Base # Sm MATHEMATICAL BOLD NABLA 1D6C2..1D6DA ; Grapheme_Base # L& [25] MATHEMATICAL BOLD SMALL ALPHA..MATHEMATICAL BOLD SMALL OMEGA 1D6DB ; Grapheme_Base # Sm MATHEMATICAL BOLD PARTIAL DIFFERENTIAL 1D6DC..1D6FA ; Grapheme_Base # L& [31] MATHEMATICAL BOLD EPSILON SYMBOL..MATHEMATICAL ITALIC CAPITAL OMEGA 1D6FB ; Grapheme_Base # Sm MATHEMATICAL ITALIC NABLA 1D6FC..1D714 ; Grapheme_Base # L& [25] MATHEMATICAL ITALIC SMALL ALPHA..MATHEMATICAL ITALIC SMALL OMEGA 1D715 ; Grapheme_Base # Sm MATHEMATICAL ITALIC PARTIAL DIFFERENTIAL 1D716..1D734 ; Grapheme_Base # L& [31] MATHEMATICAL ITALIC EPSILON SYMBOL..MATHEMATICAL BOLD ITALIC CAPITAL 
OMEGA 1D735 ; Grapheme_Base # Sm MATHEMATICAL BOLD ITALIC NABLA 1D736..1D74E ; Grapheme_Base # L& [25] MATHEMATICAL BOLD ITALIC SMALL ALPHA..MATHEMATICAL BOLD ITALIC SMALL OMEGA 1D74F ; Grapheme_Base # Sm MATHEMATICAL BOLD ITALIC PARTIAL DIFFERENTIAL 1D750..1D76E ; Grapheme_Base # L& [31] MATHEMATICAL BOLD ITALIC EPSILON SYMBOL..MATHEMATICAL SANS-SERIF BOLD CAPITAL OMEGA 1D76F ; Grapheme_Base # Sm MATHEMATICAL SANS-SERIF BOLD NABLA 1D770..1D788 ; Grapheme_Base # L& [25] MATHEMATICAL SANS-SERIF BOLD SMALL ALPHA..MATHEMATICAL SANS-SERIF BOLD SMALL OMEGA 1D789 ; Grapheme_Base # Sm MATHEMATICAL SANS-SERIF BOLD PARTIAL DIFFERENTIAL 1D78A..1D7A8 ; Grapheme_Base # L& [31] MATHEMATICAL SANS-SERIF BOLD EPSILON SYMBOL..MATHEMATICAL SANS-SERIF BOLD ITALIC CAPITAL OMEGA 1D7A9 ; Grapheme_Base # Sm MATHEMATICAL SANS-SERIF BOLD ITALIC NABLA 1D7AA..1D7C2 ; Grapheme_Base # L& [25] MATHEMATICAL SANS-SERIF BOLD ITALIC SMALL ALPHA..MATHEMATICAL SANS-SERIF BOLD ITALIC SMALL OMEGA 1D7C3 ; Grapheme_Base # Sm MATHEMATICAL SANS-SERIF BOLD ITALIC PARTIAL DIFFERENTIAL 1D7C4..1D7CB ; Grapheme_Base # L& [8] MATHEMATICAL SANS-SERIF BOLD ITALIC EPSILON SYMBOL..MATHEMATICAL BOLD SMALL DIGAMMA 1D7CE..1D7FF ; Grapheme_Base # Nd [50] MATHEMATICAL BOLD DIGIT ZERO..MATHEMATICAL MONOSPACE DIGIT NINE 1D800..1D9FF ; Grapheme_Base # So [512] SIGNWRITING HAND-FIST INDEX..SIGNWRITING HEAD 1DA37..1DA3A ; Grapheme_Base # So [4] SIGNWRITING AIR BLOW SMALL ROTATIONS..SIGNWRITING BREATH EXHALE 1DA6D..1DA74 ; Grapheme_Base # So [8] SIGNWRITING SHOULDER HIP SPINE..SIGNWRITING TORSO-FLOORPLANE TWISTING 1DA76..1DA83 ; Grapheme_Base # So [14] SIGNWRITING LIMB COMBINATION..SIGNWRITING LOCATION DEPTH 1DA85..1DA86 ; Grapheme_Base # So [2] SIGNWRITING LOCATION TORSO..SIGNWRITING LOCATION LIMBS DIGITS 1DA87..1DA8B ; Grapheme_Base # Po [5] SIGNWRITING COMMA..SIGNWRITING PARENTHESIS 1DF00..1DF09 ; Grapheme_Base # L& [10] LATIN SMALL LETTER FENG DIGRAPH WITH TRILL..LATIN SMALL LETTER T WITH HOOK AND RETROFLEX HOOK 1DF0A ; 
Grapheme_Base # Lo LATIN LETTER RETROFLEX CLICK WITH RETROFLEX HOOK 1DF0B..1DF1E ; Grapheme_Base # L& [20] LATIN SMALL LETTER ESH WITH DOUBLE BAR..LATIN SMALL LETTER S WITH CURL 1DF25..1DF2A ; Grapheme_Base # L& [6] LATIN SMALL LETTER D WITH MID-HEIGHT LEFT HOOK..LATIN SMALL LETTER T WITH MID-HEIGHT LEFT HOOK 1E030..1E06D ; Grapheme_Base # Lm [62] MODIFIER LETTER CYRILLIC SMALL A..MODIFIER LETTER CYRILLIC SMALL STRAIGHT U WITH STROKE 1E100..1E12C ; Grapheme_Base # Lo [45] NYIAKENG PUACHUE HMONG LETTER MA..NYIAKENG PUACHUE HMONG LETTER W 1E137..1E13D ; Grapheme_Base # Lm [7] NYIAKENG PUACHUE HMONG SIGN FOR PERSON..NYIAKENG PUACHUE HMONG SYLLABLE LENGTHENER 1E140..1E149 ; Grapheme_Base # Nd [10] NYIAKENG PUACHUE HMONG DIGIT ZERO..NYIAKENG PUACHUE HMONG DIGIT NINE 1E14E ; Grapheme_Base # Lo NYIAKENG PUACHUE HMONG LOGOGRAM NYAJ 1E14F ; Grapheme_Base # So NYIAKENG PUACHUE HMONG CIRCLED CA 1E290..1E2AD ; Grapheme_Base # Lo [30] TOTO LETTER PA..TOTO LETTER A 1E2C0..1E2EB ; Grapheme_Base # Lo [44] WANCHO LETTER AA..WANCHO LETTER YIH 1E2F0..1E2F9 ; Grapheme_Base # Nd [10] WANCHO DIGIT ZERO..WANCHO DIGIT NINE 1E2FF ; Grapheme_Base # Sc WANCHO NGUN SIGN 1E4D0..1E4EA ; Grapheme_Base # Lo [27] NAG MUNDARI LETTER O..NAG MUNDARI LETTER ELL 1E4EB ; Grapheme_Base # Lm NAG MUNDARI SIGN OJOD 1E4F0..1E4F9 ; Grapheme_Base # Nd [10] NAG MUNDARI DIGIT ZERO..NAG MUNDARI DIGIT NINE 1E5D0..1E5ED ; Grapheme_Base # Lo [30] OL ONAL LETTER O..OL ONAL LETTER EG 1E5F0 ; Grapheme_Base # Lo OL ONAL SIGN HODDOND 1E5F1..1E5FA ; Grapheme_Base # Nd [10] OL ONAL DIGIT ZERO..OL ONAL DIGIT NINE 1E5FF ; Grapheme_Base # Po OL ONAL ABBREVIATION SIGN 1E6C0..1E6DE ; Grapheme_Base # Lo [31] TAI YO LETTER LOW KO..TAI YO LETTER HIGH KVO 1E6E0..1E6E2 ; Grapheme_Base # Lo [3] TAI YO LETTER AA..TAI YO LETTER UE 1E6E4..1E6E5 ; Grapheme_Base # Lo [2] TAI YO LETTER U..TAI YO LETTER AE 1E6E7..1E6ED ; Grapheme_Base # Lo [7] TAI YO LETTER O..TAI YO LETTER AUE 1E6F0..1E6F4 ; Grapheme_Base # Lo [5] TAI YO LETTER AN..TAI YO 
LETTER AP 1E6FE ; Grapheme_Base # Lo TAI YO SYMBOL MUEANG 1E6FF ; Grapheme_Base # Lm TAI YO XAM LAI 1E7E0..1E7E6 ; Grapheme_Base # Lo [7] ETHIOPIC SYLLABLE HHYA..ETHIOPIC SYLLABLE HHYO 1E7E8..1E7EB ; Grapheme_Base # Lo [4] ETHIOPIC SYLLABLE GURAGE HHWA..ETHIOPIC SYLLABLE HHWE 1E7ED..1E7EE ; Grapheme_Base # Lo [2] ETHIOPIC SYLLABLE GURAGE MWI..ETHIOPIC SYLLABLE GURAGE MWEE 1E7F0..1E7FE ; Grapheme_Base # Lo [15] ETHIOPIC SYLLABLE GURAGE QWI..ETHIOPIC SYLLABLE GURAGE PWEE 1E800..1E8C4 ; Grapheme_Base # Lo [197] MENDE KIKAKUI SYLLABLE M001 KI..MENDE KIKAKUI SYLLABLE M060 NYON 1E8C7..1E8CF ; Grapheme_Base # No [9] MENDE KIKAKUI DIGIT ONE..MENDE KIKAKUI DIGIT NINE 1E900..1E943 ; Grapheme_Base # L& [68] ADLAM CAPITAL LETTER ALIF..ADLAM SMALL LETTER SHA 1E94B ; Grapheme_Base # Lm ADLAM NASALIZATION MARK 1E950..1E959 ; Grapheme_Base # Nd [10] ADLAM DIGIT ZERO..ADLAM DIGIT NINE 1E95E..1E95F ; Grapheme_Base # Po [2] ADLAM INITIAL EXCLAMATION MARK..ADLAM INITIAL QUESTION MARK 1EC71..1ECAB ; Grapheme_Base # No [59] INDIC SIYAQ NUMBER ONE..INDIC SIYAQ NUMBER PREFIXED NINE 1ECAC ; Grapheme_Base # So INDIC SIYAQ PLACEHOLDER 1ECAD..1ECAF ; Grapheme_Base # No [3] INDIC SIYAQ FRACTION ONE QUARTER..INDIC SIYAQ FRACTION THREE QUARTERS 1ECB0 ; Grapheme_Base # Sc INDIC SIYAQ RUPEE MARK 1ECB1..1ECB4 ; Grapheme_Base # No [4] INDIC SIYAQ NUMBER ALTERNATE ONE..INDIC SIYAQ ALTERNATE LAKH MARK 1ED01..1ED2D ; Grapheme_Base # No [45] OTTOMAN SIYAQ NUMBER ONE..OTTOMAN SIYAQ NUMBER NINETY THOUSAND 1ED2E ; Grapheme_Base # So OTTOMAN SIYAQ MARRATAN 1ED2F..1ED3D ; Grapheme_Base # No [15] OTTOMAN SIYAQ ALTERNATE NUMBER TWO..OTTOMAN SIYAQ FRACTION ONE SIXTH 1EE00..1EE03 ; Grapheme_Base # Lo [4] ARABIC MATHEMATICAL ALEF..ARABIC MATHEMATICAL DAL 1EE05..1EE1F ; Grapheme_Base # Lo [27] ARABIC MATHEMATICAL WAW..ARABIC MATHEMATICAL DOTLESS QAF 1EE21..1EE22 ; Grapheme_Base # Lo [2] ARABIC MATHEMATICAL INITIAL BEH..ARABIC MATHEMATICAL INITIAL JEEM 1EE24 ; Grapheme_Base # Lo ARABIC MATHEMATICAL INITIAL HEH 
1EE27 ; Grapheme_Base # Lo ARABIC MATHEMATICAL INITIAL HAH 1EE29..1EE32 ; Grapheme_Base # Lo [10] ARABIC MATHEMATICAL INITIAL YEH..ARABIC MATHEMATICAL INITIAL QAF 1EE34..1EE37 ; Grapheme_Base # Lo [4] ARABIC MATHEMATICAL INITIAL SHEEN..ARABIC MATHEMATICAL INITIAL KHAH 1EE39 ; Grapheme_Base # Lo ARABIC MATHEMATICAL INITIAL DAD 1EE3B ; Grapheme_Base # Lo ARABIC MATHEMATICAL INITIAL GHAIN 1EE42 ; Grapheme_Base # Lo ARABIC MATHEMATICAL TAILED JEEM 1EE47 ; Grapheme_Base # Lo ARABIC MATHEMATICAL TAILED HAH 1EE49 ; Grapheme_Base # Lo ARABIC MATHEMATICAL TAILED YEH 1EE4B ; Grapheme_Base # Lo ARABIC MATHEMATICAL TAILED LAM 1EE4D..1EE4F ; Grapheme_Base # Lo [3] ARABIC MATHEMATICAL TAILED NOON..ARABIC MATHEMATICAL TAILED AIN 1EE51..1EE52 ; Grapheme_Base # Lo [2] ARABIC MATHEMATICAL TAILED SAD..ARABIC MATHEMATICAL TAILED QAF 1EE54 ; Grapheme_Base # Lo ARABIC MATHEMATICAL TAILED SHEEN 1EE57 ; Grapheme_Base # Lo ARABIC MATHEMATICAL TAILED KHAH 1EE59 ; Grapheme_Base # Lo ARABIC MATHEMATICAL TAILED DAD 1EE5B ; Grapheme_Base # Lo ARABIC MATHEMATICAL TAILED GHAIN 1EE5D ; Grapheme_Base # Lo ARABIC MATHEMATICAL TAILED DOTLESS NOON 1EE5F ; Grapheme_Base # Lo ARABIC MATHEMATICAL TAILED DOTLESS QAF 1EE61..1EE62 ; Grapheme_Base # Lo [2] ARABIC MATHEMATICAL STRETCHED BEH..ARABIC MATHEMATICAL STRETCHED JEEM 1EE64 ; Grapheme_Base # Lo ARABIC MATHEMATICAL STRETCHED HEH 1EE67..1EE6A ; Grapheme_Base # Lo [4] ARABIC MATHEMATICAL STRETCHED HAH..ARABIC MATHEMATICAL STRETCHED KAF 1EE6C..1EE72 ; Grapheme_Base # Lo [7] ARABIC MATHEMATICAL STRETCHED MEEM..ARABIC MATHEMATICAL STRETCHED QAF 1EE74..1EE77 ; Grapheme_Base # Lo [4] ARABIC MATHEMATICAL STRETCHED SHEEN..ARABIC MATHEMATICAL STRETCHED KHAH 1EE79..1EE7C ; Grapheme_Base # Lo [4] ARABIC MATHEMATICAL STRETCHED DAD..ARABIC MATHEMATICAL STRETCHED DOTLESS BEH 1EE7E ; Grapheme_Base # Lo ARABIC MATHEMATICAL STRETCHED DOTLESS FEH 1EE80..1EE89 ; Grapheme_Base # Lo [10] ARABIC MATHEMATICAL LOOPED ALEF..ARABIC MATHEMATICAL LOOPED YEH 1EE8B..1EE9B ; 
Grapheme_Base # Lo [17] ARABIC MATHEMATICAL LOOPED LAM..ARABIC MATHEMATICAL LOOPED GHAIN 1EEA1..1EEA3 ; Grapheme_Base # Lo [3] ARABIC MATHEMATICAL DOUBLE-STRUCK BEH..ARABIC MATHEMATICAL DOUBLE-STRUCK DAL 1EEA5..1EEA9 ; Grapheme_Base # Lo [5] ARABIC MATHEMATICAL DOUBLE-STRUCK WAW..ARABIC MATHEMATICAL DOUBLE-STRUCK YEH 1EEAB..1EEBB ; Grapheme_Base # Lo [17] ARABIC MATHEMATICAL DOUBLE-STRUCK LAM..ARABIC MATHEMATICAL DOUBLE-STRUCK GHAIN 1EEF0..1EEF1 ; Grapheme_Base # Sm [2] ARABIC MATHEMATICAL OPERATOR MEEM WITH HAH WITH TATWEEL..ARABIC MATHEMATICAL OPERATOR HAH WITH DAL 1F000..1F02B ; Grapheme_Base # So [44] MAHJONG TILE EAST WIND..MAHJONG TILE BACK 1F030..1F093 ; Grapheme_Base # So [100] DOMINO TILE HORIZONTAL BACK..DOMINO TILE VERTICAL-06-06 1F0A0..1F0AE ; Grapheme_Base # So [15] PLAYING CARD BACK..PLAYING CARD KING OF SPADES 1F0B1..1F0BF ; Grapheme_Base # So [15] PLAYING CARD ACE OF HEARTS..PLAYING CARD RED JOKER 1F0C1..1F0CF ; Grapheme_Base # So [15] PLAYING CARD ACE OF DIAMONDS..PLAYING CARD BLACK JOKER 1F0D1..1F0F5 ; Grapheme_Base # So [37] PLAYING CARD ACE OF CLUBS..PLAYING CARD TRUMP-21 1F100..1F10C ; Grapheme_Base # No [13] DIGIT ZERO FULL STOP..DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT ZERO 1F10D..1F1AD ; Grapheme_Base # So [161] CIRCLED ZERO WITH SLASH..MASK WORK SYMBOL 1F1E6..1F202 ; Grapheme_Base # So [29] REGIONAL INDICATOR SYMBOL LETTER A..SQUARED KATAKANA SA 1F210..1F23B ; Grapheme_Base # So [44] SQUARED CJK UNIFIED IDEOGRAPH-624B..SQUARED CJK UNIFIED IDEOGRAPH-914D 1F240..1F248 ; Grapheme_Base # So [9] TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-672C..TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-6557 1F250..1F251 ; Grapheme_Base # So [2] CIRCLED IDEOGRAPH ADVANTAGE..CIRCLED IDEOGRAPH ACCEPT 1F260..1F265 ; Grapheme_Base # So [6] ROUNDED SYMBOL FOR FU..ROUNDED SYMBOL FOR CAI 1F300..1F3FA ; Grapheme_Base # So [251] CYCLONE..AMPHORA 1F3FB..1F3FF ; Grapheme_Base # Sk [5] EMOJI MODIFIER FITZPATRICK TYPE-1-2..EMOJI MODIFIER FITZPATRICK TYPE-6 
1F400..1F6D8 ; Grapheme_Base # So [729] RAT..LANDSLIDE 1F6DC..1F6EC ; Grapheme_Base # So [17] WIRELESS..AIRPLANE ARRIVING 1F6F0..1F6FC ; Grapheme_Base # So [13] SATELLITE..ROLLER SKATE 1F700..1F7D9 ; Grapheme_Base # So [218] ALCHEMICAL SYMBOL FOR QUINTESSENCE..NINE POINTED WHITE STAR 1F7E0..1F7EB ; Grapheme_Base # So [12] LARGE ORANGE CIRCLE..LARGE BROWN SQUARE 1F7F0 ; Grapheme_Base # So HEAVY EQUALS SIGN 1F800..1F80B ; Grapheme_Base # So [12] LEFTWARDS ARROW WITH SMALL TRIANGLE ARROWHEAD..DOWNWARDS ARROW WITH LARGE TRIANGLE ARROWHEAD 1F810..1F847 ; Grapheme_Base # So [56] LEFTWARDS ARROW WITH SMALL EQUILATERAL ARROWHEAD..DOWNWARDS HEAVY ARROW 1F850..1F859 ; Grapheme_Base # So [10] LEFTWARDS SANS-SERIF ARROW..UP DOWN SANS-SERIF ARROW 1F860..1F887 ; Grapheme_Base # So [40] WIDE-HEADED LEFTWARDS LIGHT BARB ARROW..WIDE-HEADED SOUTH WEST VERY HEAVY BARB ARROW 1F890..1F8AD ; Grapheme_Base # So [30] LEFTWARDS TRIANGLE ARROWHEAD..WHITE ARROW SHAFT WIDTH TWO THIRDS 1F8B0..1F8BB ; Grapheme_Base # So [12] ARROW POINTING UPWARDS THEN NORTH WEST..SOUTH WEST ARROW FROM BAR 1F8C0..1F8C1 ; Grapheme_Base # So [2] LEFTWARDS ARROW FROM DOWNWARDS ARROW..RIGHTWARDS ARROW FROM DOWNWARDS ARROW 1F8D0..1F8D8 ; Grapheme_Base # Sm [9] LONG RIGHTWARDS ARROW OVER LONG LEFTWARDS ARROW..LONG LEFT RIGHT ARROW WITH DEPENDENT LOBE 1F900..1FA57 ; Grapheme_Base # So [344] CIRCLED CROSS FORMEE WITH FOUR DOTS..BLACK CHESS ALFIL 1FA60..1FA6D ; Grapheme_Base # So [14] XIANGQI RED GENERAL..XIANGQI BLACK SOLDIER 1FA70..1FA7C ; Grapheme_Base # So [13] BALLET SHOES..CRUTCH 1FA80..1FA8A ; Grapheme_Base # So [11] YO-YO..TROMBONE 1FA8E..1FAC6 ; Grapheme_Base # So [57] TREASURE CHEST..FINGERPRINT 1FAC8 ; Grapheme_Base # So HAIRY CREATURE 1FACD..1FADC ; Grapheme_Base # So [16] ORCA..ROOT VEGETABLE 1FADF..1FAEA ; Grapheme_Base # So [12] SPLATTER..DISTORTED FACE 1FAEF..1FAF8 ; Grapheme_Base # So [10] FIGHT CLOUD..RIGHTWARDS PUSHING HAND 1FB00..1FB92 ; Grapheme_Base # So [147] BLOCK SEXTANT-1..UPPER HALF INVERSE 
MEDIUM SHADE AND LOWER HALF BLOCK 1FB94..1FBEF ; Grapheme_Base # So [92] LEFT HALF INVERSE MEDIUM SHADE AND RIGHT HALF BLOCK..TOP LEFT JUSTIFIED LOWER RIGHT QUARTER BLACK CIRCLE 1FBF0..1FBF9 ; Grapheme_Base # Nd [10] SEGMENTED DIGIT ZERO..SEGMENTED DIGIT NINE 1FBFA ; Grapheme_Base # So ALARM BELL SYMBOL 20000..2A6DF ; Grapheme_Base # Lo [42720] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6DF 2A700..2B81D ; Grapheme_Base # Lo [4382] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B81D 2B820..2CEAD ; Grapheme_Base # Lo [5774] CJK UNIFIED IDEOGRAPH-2B820..CJK UNIFIED IDEOGRAPH-2CEAD 2CEB0..2EBE0 ; Grapheme_Base # Lo [7473] CJK UNIFIED IDEOGRAPH-2CEB0..CJK UNIFIED IDEOGRAPH-2EBE0 2EBF0..2EE5D ; Grapheme_Base # Lo [622] CJK UNIFIED IDEOGRAPH-2EBF0..CJK UNIFIED IDEOGRAPH-2EE5D 2F800..2FA1D ; Grapheme_Base # Lo [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D 30000..3134A ; Grapheme_Base # Lo [4939] CJK UNIFIED IDEOGRAPH-30000..CJK UNIFIED IDEOGRAPH-3134A 31350..33479 ; Grapheme_Base # Lo [8490] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-33479 # Total code points: 157494 # ================================================ # Derived Property: Grapheme_Link (deprecated) # Generated from: Canonical_Combining_Class=Virama # Use Canonical_Combining_Class=Virama directly instead 094D ; Grapheme_Link # Mn DEVANAGARI SIGN VIRAMA 09CD ; Grapheme_Link # Mn BENGALI SIGN VIRAMA 0A4D ; Grapheme_Link # Mn GURMUKHI SIGN VIRAMA 0ACD ; Grapheme_Link # Mn GUJARATI SIGN VIRAMA 0B4D ; Grapheme_Link # Mn ORIYA SIGN VIRAMA 0BCD ; Grapheme_Link # Mn TAMIL SIGN VIRAMA 0C4D ; Grapheme_Link # Mn TELUGU SIGN VIRAMA 0CCD ; Grapheme_Link # Mn KANNADA SIGN VIRAMA 0D3B..0D3C ; Grapheme_Link # Mn [2] MALAYALAM SIGN VERTICAL BAR VIRAMA..MALAYALAM SIGN CIRCULAR VIRAMA 0D4D ; Grapheme_Link # Mn MALAYALAM SIGN VIRAMA 0DCA ; Grapheme_Link # Mn SINHALA SIGN AL-LAKUNA 0E3A ; Grapheme_Link # Mn THAI CHARACTER PHINTHU 0EBA ; Grapheme_Link # Mn LAO SIGN PALI VIRAMA 
0F84 ; Grapheme_Link # Mn TIBETAN MARK HALANTA 1039..103A ; Grapheme_Link # Mn [2] MYANMAR SIGN VIRAMA..MYANMAR SIGN ASAT 1714 ; Grapheme_Link # Mn TAGALOG SIGN VIRAMA 1715 ; Grapheme_Link # Mc TAGALOG SIGN PAMUDPOD 1734 ; Grapheme_Link # Mc HANUNOO SIGN PAMUDPOD 17D2 ; Grapheme_Link # Mn KHMER SIGN COENG 1A60 ; Grapheme_Link # Mn TAI THAM SIGN SAKOT 1B44 ; Grapheme_Link # Mc BALINESE ADEG ADEG 1BAA ; Grapheme_Link # Mc SUNDANESE SIGN PAMAAEH 1BAB ; Grapheme_Link # Mn SUNDANESE SIGN VIRAMA 1BF2..1BF3 ; Grapheme_Link # Mc [2] BATAK PANGOLAT..BATAK PANONGONAN 2D7F ; Grapheme_Link # Mn TIFINAGH CONSONANT JOINER A806 ; Grapheme_Link # Mn SYLOTI NAGRI SIGN HASANTA A82C ; Grapheme_Link # Mn SYLOTI NAGRI SIGN ALTERNATE HASANTA A8C4 ; Grapheme_Link # Mn SAURASHTRA SIGN VIRAMA A953 ; Grapheme_Link # Mc REJANG VIRAMA A9C0 ; Grapheme_Link # Mc JAVANESE PANGKON AAF6 ; Grapheme_Link # Mn MEETEI MAYEK VIRAMA ABED ; Grapheme_Link # Mn MEETEI MAYEK APUN IYEK 10A3F ; Grapheme_Link # Mn KHAROSHTHI VIRAMA 11046 ; Grapheme_Link # Mn BRAHMI VIRAMA 11070 ; Grapheme_Link # Mn BRAHMI SIGN OLD TAMIL VIRAMA 1107F ; Grapheme_Link # Mn BRAHMI NUMBER JOINER 110B9 ; Grapheme_Link # Mn KAITHI SIGN VIRAMA 11133..11134 ; Grapheme_Link # Mn [2] CHAKMA VIRAMA..CHAKMA MAAYYAA 111C0 ; Grapheme_Link # Mc SHARADA SIGN VIRAMA 11235 ; Grapheme_Link # Mc KHOJKI SIGN VIRAMA 112EA ; Grapheme_Link # Mn KHUDAWADI SIGN VIRAMA 1134D ; Grapheme_Link # Mc GRANTHA SIGN VIRAMA 113CE ; Grapheme_Link # Mn TULU-TIGALARI SIGN VIRAMA 113CF ; Grapheme_Link # Mc TULU-TIGALARI SIGN LOOPED VIRAMA 113D0 ; Grapheme_Link # Mn TULU-TIGALARI CONJOINER 11442 ; Grapheme_Link # Mn NEWA SIGN VIRAMA 114C2 ; Grapheme_Link # Mn TIRHUTA SIGN VIRAMA 115BF ; Grapheme_Link # Mn SIDDHAM SIGN VIRAMA 1163F ; Grapheme_Link # Mn MODI SIGN VIRAMA 116B6 ; Grapheme_Link # Mc TAKRI SIGN VIRAMA 1172B ; Grapheme_Link # Mn AHOM SIGN KILLER 11839 ; Grapheme_Link # Mn DOGRA SIGN VIRAMA 1193D ; Grapheme_Link # Mc DIVES AKURU SIGN HALANTA 1193E ; 
Grapheme_Link # Mn DIVES AKURU VIRAMA 119E0 ; Grapheme_Link # Mn NANDINAGARI SIGN VIRAMA 11A34 ; Grapheme_Link # Mn ZANABAZAR SQUARE SIGN VIRAMA 11A47 ; Grapheme_Link # Mn ZANABAZAR SQUARE SUBJOINER 11A99 ; Grapheme_Link # Mn SOYOMBO SUBJOINER 11C3F ; Grapheme_Link # Mn BHAIKSUKI SIGN VIRAMA 11D44..11D45 ; Grapheme_Link # Mn [2] MASARAM GONDI SIGN HALANTA..MASARAM GONDI VIRAMA 11D97 ; Grapheme_Link # Mn GUNJALA GONDI VIRAMA 11F41 ; Grapheme_Link # Mc KAWI SIGN KILLER 11F42 ; Grapheme_Link # Mn KAWI CONJOINER 1612F ; Grapheme_Link # Mn GURUNG KHEMA SIGN THOLHOMA # Total code points: 69 # ================================================ # Derived Property: Indic_Conjunct_Break # Generated from the Grapheme_Cluster_Break, Indic_Syllabic_Category, # Canonical_Combining_Class, and Script properties as described in UAX #44: # https://www.unicode.org/reports/tr44/. # All code points not explicitly listed for Indic_Conjunct_Break # have the value None. # @missing: 0000..10FFFF; InCB; None # ================================================ # Indic_Conjunct_Break=Linker 094D ; InCB; Linker # Mn DEVANAGARI SIGN VIRAMA 09CD ; InCB; Linker # Mn BENGALI SIGN VIRAMA 0ACD ; InCB; Linker # Mn GUJARATI SIGN VIRAMA 0B4D ; InCB; Linker # Mn ORIYA SIGN VIRAMA 0C4D ; InCB; Linker # Mn TELUGU SIGN VIRAMA 0D4D ; InCB; Linker # Mn MALAYALAM SIGN VIRAMA 1039 ; InCB; Linker # Mn MYANMAR SIGN VIRAMA 17D2 ; InCB; Linker # Mn KHMER SIGN COENG 1A60 ; InCB; Linker # Mn TAI THAM SIGN SAKOT 1B44 ; InCB; Linker # Mc BALINESE ADEG ADEG 1BAB ; InCB; Linker # Mn SUNDANESE SIGN VIRAMA A9C0 ; InCB; Linker # Mc JAVANESE PANGKON AAF6 ; InCB; Linker # Mn MEETEI MAYEK VIRAMA 10A3F ; InCB; Linker # Mn KHAROSHTHI VIRAMA 11133 ; InCB; Linker # Mn CHAKMA VIRAMA 113D0 ; InCB; Linker # Mn TULU-TIGALARI CONJOINER 1193E ; InCB; Linker # Mn DIVES AKURU VIRAMA 11A47 ; InCB; Linker # Mn ZANABAZAR SQUARE SUBJOINER 11A99 ; InCB; Linker # Mn SOYOMBO SUBJOINER 11F42 ; InCB; Linker # Mn KAWI CONJOINER # Total code points: 
20 # ================================================ # Indic_Conjunct_Break=Consonant 0915..0939 ; InCB; Consonant # Lo [37] DEVANAGARI LETTER KA..DEVANAGARI LETTER HA 0958..095F ; InCB; Consonant # Lo [8] DEVANAGARI LETTER QA..DEVANAGARI LETTER YYA 0978..097F ; InCB; Consonant # Lo [8] DEVANAGARI LETTER MARWARI DDA..DEVANAGARI LETTER BBA 0995..09A8 ; InCB; Consonant # Lo [20] BENGALI LETTER KA..BENGALI LETTER NA 09AA..09B0 ; InCB; Consonant # Lo [7] BENGALI LETTER PA..BENGALI LETTER RA 09B2 ; InCB; Consonant # Lo BENGALI LETTER LA 09B6..09B9 ; InCB; Consonant # Lo [4] BENGALI LETTER SHA..BENGALI LETTER HA 09DC..09DD ; InCB; Consonant # Lo [2] BENGALI LETTER RRA..BENGALI LETTER RHA 09DF ; InCB; Consonant # Lo BENGALI LETTER YYA 09F0..09F1 ; InCB; Consonant # Lo [2] BENGALI LETTER RA WITH MIDDLE DIAGONAL..BENGALI LETTER RA WITH LOWER DIAGONAL 0A95..0AA8 ; InCB; Consonant # Lo [20] GUJARATI LETTER KA..GUJARATI LETTER NA 0AAA..0AB0 ; InCB; Consonant # Lo [7] GUJARATI LETTER PA..GUJARATI LETTER RA 0AB2..0AB3 ; InCB; Consonant # Lo [2] GUJARATI LETTER LA..GUJARATI LETTER LLA 0AB5..0AB9 ; InCB; Consonant # Lo [5] GUJARATI LETTER VA..GUJARATI LETTER HA 0AF9 ; InCB; Consonant # Lo GUJARATI LETTER ZHA 0B15..0B28 ; InCB; Consonant # Lo [20] ORIYA LETTER KA..ORIYA LETTER NA 0B2A..0B30 ; InCB; Consonant # Lo [7] ORIYA LETTER PA..ORIYA LETTER RA 0B32..0B33 ; InCB; Consonant # Lo [2] ORIYA LETTER LA..ORIYA LETTER LLA 0B35..0B39 ; InCB; Consonant # Lo [5] ORIYA LETTER VA..ORIYA LETTER HA 0B5C..0B5D ; InCB; Consonant # Lo [2] ORIYA LETTER RRA..ORIYA LETTER RHA 0B5F ; InCB; Consonant # Lo ORIYA LETTER YYA 0B71 ; InCB; Consonant # Lo ORIYA LETTER WA 0C15..0C28 ; InCB; Consonant # Lo [20] TELUGU LETTER KA..TELUGU LETTER NA 0C2A..0C39 ; InCB; Consonant # Lo [16] TELUGU LETTER PA..TELUGU LETTER HA 0C58..0C5A ; InCB; Consonant # Lo [3] TELUGU LETTER TSA..TELUGU LETTER RRRA 0D15..0D3A ; InCB; Consonant # Lo [38] MALAYALAM LETTER KA..MALAYALAM LETTER TTTA 1000..102A ; InCB; Consonant # 
Lo [43] MYANMAR LETTER KA..MYANMAR LETTER AU 103F ; InCB; Consonant # Lo MYANMAR LETTER GREAT SA 1050..1055 ; InCB; Consonant # Lo [6] MYANMAR LETTER SHA..MYANMAR LETTER VOCALIC LL 105A..105D ; InCB; Consonant # Lo [4] MYANMAR LETTER MON NGA..MYANMAR LETTER MON BBE 1061 ; InCB; Consonant # Lo MYANMAR LETTER SGAW KAREN SHA 1065..1066 ; InCB; Consonant # Lo [2] MYANMAR LETTER WESTERN PWO KAREN THA..MYANMAR LETTER WESTERN PWO KAREN PWA 106E..1070 ; InCB; Consonant # Lo [3] MYANMAR LETTER EASTERN PWO KAREN NNA..MYANMAR LETTER EASTERN PWO KAREN GHWA 1075..1081 ; InCB; Consonant # Lo [13] MYANMAR LETTER SHAN KA..MYANMAR LETTER SHAN HA 108E ; InCB; Consonant # Lo MYANMAR LETTER RUMAI PALAUNG FA 1780..17B3 ; InCB; Consonant # Lo [52] KHMER LETTER KA..KHMER INDEPENDENT VOWEL QAU 1A20..1A54 ; InCB; Consonant # Lo [53] TAI THAM LETTER HIGH KA..TAI THAM LETTER GREAT SA 1B0B..1B0C ; InCB; Consonant # Lo [2] BALINESE LETTER RA REPA..BALINESE LETTER RA REPA TEDUNG 1B13..1B33 ; InCB; Consonant # Lo [33] BALINESE LETTER KA..BALINESE LETTER HA 1B45..1B4C ; InCB; Consonant # Lo [8] BALINESE LETTER KAF SASAK..BALINESE LETTER ARCHAIC JNYA 1B83..1BA0 ; InCB; Consonant # Lo [30] SUNDANESE LETTER A..SUNDANESE LETTER HA 1BAE..1BAF ; InCB; Consonant # Lo [2] SUNDANESE LETTER KHA..SUNDANESE LETTER SYA 1BBB..1BBD ; InCB; Consonant # Lo [3] SUNDANESE LETTER REU..SUNDANESE LETTER BHA A989..A98B ; InCB; Consonant # Lo [3] JAVANESE LETTER PA CEREK..JAVANESE LETTER NGA LELET RASWADI A98F..A9B2 ; InCB; Consonant # Lo [36] JAVANESE LETTER KA..JAVANESE LETTER HA A9E0..A9E4 ; InCB; Consonant # Lo [5] MYANMAR LETTER SHAN GHA..MYANMAR LETTER SHAN BHA A9E7..A9EF ; InCB; Consonant # Lo [9] MYANMAR LETTER TAI LAING NYA..MYANMAR LETTER TAI LAING NNA A9FA..A9FE ; InCB; Consonant # Lo [5] MYANMAR LETTER TAI LAING LLA..MYANMAR LETTER TAI LAING BHA AA60..AA6F ; InCB; Consonant # Lo [16] MYANMAR LETTER KHAMTI GA..MYANMAR LETTER KHAMTI FA AA71..AA73 ; InCB; Consonant # Lo [3] MYANMAR LETTER KHAMTI XA..MYANMAR 
LETTER KHAMTI RA AA7A ; InCB; Consonant # Lo MYANMAR LETTER AITON RA AA7E..AA7F ; InCB; Consonant # Lo [2] MYANMAR LETTER SHWE PALAUNG CHA..MYANMAR LETTER SHWE PALAUNG SHA AAE0..AAEA ; InCB; Consonant # Lo [11] MEETEI MAYEK LETTER E..MEETEI MAYEK LETTER SSA ABC0..ABDA ; InCB; Consonant # Lo [27] MEETEI MAYEK LETTER KOK..MEETEI MAYEK LETTER BHAM 10A00 ; InCB; Consonant # Lo KHAROSHTHI LETTER A 10A10..10A13 ; InCB; Consonant # Lo [4] KHAROSHTHI LETTER KA..KHAROSHTHI LETTER GHA 10A15..10A17 ; InCB; Consonant # Lo [3] KHAROSHTHI LETTER CA..KHAROSHTHI LETTER JA 10A19..10A35 ; InCB; Consonant # Lo [29] KHAROSHTHI LETTER NYA..KHAROSHTHI LETTER VHA 11103..11126 ; InCB; Consonant # Lo [36] CHAKMA LETTER AA..CHAKMA LETTER HAA 11144 ; InCB; Consonant # Lo CHAKMA LETTER LHAA 11147 ; InCB; Consonant # Lo CHAKMA LETTER VAA 11380..11389 ; InCB; Consonant # Lo [10] TULU-TIGALARI LETTER A..TULU-TIGALARI LETTER VOCALIC LL 1138B ; InCB; Consonant # Lo TULU-TIGALARI LETTER EE 1138E ; InCB; Consonant # Lo TULU-TIGALARI LETTER AI 11390..113B5 ; InCB; Consonant # Lo [38] TULU-TIGALARI LETTER OO..TULU-TIGALARI LETTER LLLA 11900..11906 ; InCB; Consonant # Lo [7] DIVES AKURU LETTER A..DIVES AKURU LETTER E 11909 ; InCB; Consonant # Lo DIVES AKURU LETTER O 1190C..11913 ; InCB; Consonant # Lo [8] DIVES AKURU LETTER KA..DIVES AKURU LETTER JA 11915..11916 ; InCB; Consonant # Lo [2] DIVES AKURU LETTER NYA..DIVES AKURU LETTER TTA 11918..1192F ; InCB; Consonant # Lo [24] DIVES AKURU LETTER DDA..DIVES AKURU LETTER ZA 11A00 ; InCB; Consonant # Lo ZANABAZAR SQUARE LETTER A 11A0B..11A32 ; InCB; Consonant # Lo [40] ZANABAZAR SQUARE LETTER KA..ZANABAZAR SQUARE LETTER KSSA 11A50 ; InCB; Consonant # Lo SOYOMBO LETTER A 11A5C..11A83 ; InCB; Consonant # Lo [40] SOYOMBO LETTER KA..SOYOMBO LETTER KSSA 11F04..11F10 ; InCB; Consonant # Lo [13] KAWI LETTER A..KAWI LETTER O 11F12..11F33 ; InCB; Consonant # Lo [34] KAWI LETTER KA..KAWI LETTER JNYA # Total code points: 911 # 
================================================ # Indic_Conjunct_Break=Extend 0300..036F ; InCB; Extend # Mn [112] COMBINING GRAVE ACCENT..COMBINING LATIN SMALL LETTER X 0483..0487 ; InCB; Extend # Mn [5] COMBINING CYRILLIC TITLO..COMBINING CYRILLIC POKRYTIE 0488..0489 ; InCB; Extend # Me [2] COMBINING CYRILLIC HUNDRED THOUSANDS SIGN..COMBINING CYRILLIC MILLIONS SIGN 0591..05BD ; InCB; Extend # Mn [45] HEBREW ACCENT ETNAHTA..HEBREW POINT METEG 05BF ; InCB; Extend # Mn HEBREW POINT RAFE 05C1..05C2 ; InCB; Extend # Mn [2] HEBREW POINT SHIN DOT..HEBREW POINT SIN DOT 05C4..05C5 ; InCB; Extend # Mn [2] HEBREW MARK UPPER DOT..HEBREW MARK LOWER DOT 05C7 ; InCB; Extend # Mn HEBREW POINT QAMATS QATAN 0610..061A ; InCB; Extend # Mn [11] ARABIC SIGN SALLALLAHOU ALAYHE WASSALLAM..ARABIC SMALL KASRA 064B..065F ; InCB; Extend # Mn [21] ARABIC FATHATAN..ARABIC WAVY HAMZA BELOW 0670 ; InCB; Extend # Mn ARABIC LETTER SUPERSCRIPT ALEF 06D6..06DC ; InCB; Extend # Mn [7] ARABIC SMALL HIGH LIGATURE SAD WITH LAM WITH ALEF MAKSURA..ARABIC SMALL HIGH SEEN 06DF..06E4 ; InCB; Extend # Mn [6] ARABIC SMALL HIGH ROUNDED ZERO..ARABIC SMALL HIGH MADDA 06E7..06E8 ; InCB; Extend # Mn [2] ARABIC SMALL HIGH YEH..ARABIC SMALL HIGH NOON 06EA..06ED ; InCB; Extend # Mn [4] ARABIC EMPTY CENTRE LOW STOP..ARABIC SMALL LOW MEEM 0711 ; InCB; Extend # Mn SYRIAC LETTER SUPERSCRIPT ALAPH 0730..074A ; InCB; Extend # Mn [27] SYRIAC PTHAHA ABOVE..SYRIAC BARREKH 07A6..07B0 ; InCB; Extend # Mn [11] THAANA ABAFILI..THAANA SUKUN 07EB..07F3 ; InCB; Extend # Mn [9] NKO COMBINING SHORT HIGH TONE..NKO COMBINING DOUBLE DOT ABOVE 07FD ; InCB; Extend # Mn NKO DANTAYALAN 0816..0819 ; InCB; Extend # Mn [4] SAMARITAN MARK IN..SAMARITAN MARK DAGESH 081B..0823 ; InCB; Extend # Mn [9] SAMARITAN MARK EPENTHETIC YUT..SAMARITAN VOWEL SIGN A 0825..0827 ; InCB; Extend # Mn [3] SAMARITAN VOWEL SIGN SHORT A..SAMARITAN VOWEL SIGN U 0829..082D ; InCB; Extend # Mn [5] SAMARITAN VOWEL SIGN LONG I..SAMARITAN MARK NEQUDAA 0859..085B ; InCB; 
Extend # Mn [3] MANDAIC AFFRICATION MARK..MANDAIC GEMINATION MARK 0897..089F ; InCB; Extend # Mn [9] ARABIC PEPET..ARABIC HALF MADDA OVER MADDA 08CA..08E1 ; InCB; Extend # Mn [24] ARABIC SMALL HIGH FARSI YEH..ARABIC SMALL HIGH SIGN SAFHA 08E3..0902 ; InCB; Extend # Mn [32] ARABIC TURNED DAMMA BELOW..DEVANAGARI SIGN ANUSVARA 093A ; InCB; Extend # Mn DEVANAGARI VOWEL SIGN OE 093C ; InCB; Extend # Mn DEVANAGARI SIGN NUKTA 0941..0948 ; InCB; Extend # Mn [8] DEVANAGARI VOWEL SIGN U..DEVANAGARI VOWEL SIGN AI 0951..0957 ; InCB; Extend # Mn [7] DEVANAGARI STRESS SIGN UDATTA..DEVANAGARI VOWEL SIGN UUE 0962..0963 ; InCB; Extend # Mn [2] DEVANAGARI VOWEL SIGN VOCALIC L..DEVANAGARI VOWEL SIGN VOCALIC LL 0981 ; InCB; Extend # Mn BENGALI SIGN CANDRABINDU 09BC ; InCB; Extend # Mn BENGALI SIGN NUKTA 09BE ; InCB; Extend # Mc BENGALI VOWEL SIGN AA 09C1..09C4 ; InCB; Extend # Mn [4] BENGALI VOWEL SIGN U..BENGALI VOWEL SIGN VOCALIC RR 09D7 ; InCB; Extend # Mc BENGALI AU LENGTH MARK 09E2..09E3 ; InCB; Extend # Mn [2] BENGALI VOWEL SIGN VOCALIC L..BENGALI VOWEL SIGN VOCALIC LL 09FE ; InCB; Extend # Mn BENGALI SANDHI MARK 0A01..0A02 ; InCB; Extend # Mn [2] GURMUKHI SIGN ADAK BINDI..GURMUKHI SIGN BINDI 0A3C ; InCB; Extend # Mn GURMUKHI SIGN NUKTA 0A41..0A42 ; InCB; Extend # Mn [2] GURMUKHI VOWEL SIGN U..GURMUKHI VOWEL SIGN UU 0A47..0A48 ; InCB; Extend # Mn [2] GURMUKHI VOWEL SIGN EE..GURMUKHI VOWEL SIGN AI 0A4B..0A4D ; InCB; Extend # Mn [3] GURMUKHI VOWEL SIGN OO..GURMUKHI SIGN VIRAMA 0A51 ; InCB; Extend # Mn GURMUKHI SIGN UDAAT 0A70..0A71 ; InCB; Extend # Mn [2] GURMUKHI TIPPI..GURMUKHI ADDAK 0A75 ; InCB; Extend # Mn GURMUKHI SIGN YAKASH 0A81..0A82 ; InCB; Extend # Mn [2] GUJARATI SIGN CANDRABINDU..GUJARATI SIGN ANUSVARA 0ABC ; InCB; Extend # Mn GUJARATI SIGN NUKTA 0AC1..0AC5 ; InCB; Extend # Mn [5] GUJARATI VOWEL SIGN U..GUJARATI VOWEL SIGN CANDRA E 0AC7..0AC8 ; InCB; Extend # Mn [2] GUJARATI VOWEL SIGN E..GUJARATI VOWEL SIGN AI 0AE2..0AE3 ; InCB; Extend # Mn [2] GUJARATI VOWEL SIGN 
VOCALIC L..GUJARATI VOWEL SIGN VOCALIC LL 0AFA..0AFF ; InCB; Extend # Mn [6] GUJARATI SIGN SUKUN..GUJARATI SIGN TWO-CIRCLE NUKTA ABOVE 0B01 ; InCB; Extend # Mn ORIYA SIGN CANDRABINDU 0B3C ; InCB; Extend # Mn ORIYA SIGN NUKTA 0B3E ; InCB; Extend # Mc ORIYA VOWEL SIGN AA 0B3F ; InCB; Extend # Mn ORIYA VOWEL SIGN I 0B41..0B44 ; InCB; Extend # Mn [4] ORIYA VOWEL SIGN U..ORIYA VOWEL SIGN VOCALIC RR 0B55..0B56 ; InCB; Extend # Mn [2] ORIYA SIGN OVERLINE..ORIYA AI LENGTH MARK 0B57 ; InCB; Extend # Mc ORIYA AU LENGTH MARK 0B62..0B63 ; InCB; Extend # Mn [2] ORIYA VOWEL SIGN VOCALIC L..ORIYA VOWEL SIGN VOCALIC LL 0B82 ; InCB; Extend # Mn TAMIL SIGN ANUSVARA 0BBE ; InCB; Extend # Mc TAMIL VOWEL SIGN AA 0BC0 ; InCB; Extend # Mn TAMIL VOWEL SIGN II 0BCD ; InCB; Extend # Mn TAMIL SIGN VIRAMA 0BD7 ; InCB; Extend # Mc TAMIL AU LENGTH MARK 0C00 ; InCB; Extend # Mn TELUGU SIGN COMBINING CANDRABINDU ABOVE 0C04 ; InCB; Extend # Mn TELUGU SIGN COMBINING ANUSVARA ABOVE 0C3C ; InCB; Extend # Mn TELUGU SIGN NUKTA 0C3E..0C40 ; InCB; Extend # Mn [3] TELUGU VOWEL SIGN AA..TELUGU VOWEL SIGN II 0C46..0C48 ; InCB; Extend # Mn [3] TELUGU VOWEL SIGN E..TELUGU VOWEL SIGN AI 0C4A..0C4C ; InCB; Extend # Mn [3] TELUGU VOWEL SIGN O..TELUGU VOWEL SIGN AU 0C55..0C56 ; InCB; Extend # Mn [2] TELUGU LENGTH MARK..TELUGU AI LENGTH MARK 0C62..0C63 ; InCB; Extend # Mn [2] TELUGU VOWEL SIGN VOCALIC L..TELUGU VOWEL SIGN VOCALIC LL 0C81 ; InCB; Extend # Mn KANNADA SIGN CANDRABINDU 0CBC ; InCB; Extend # Mn KANNADA SIGN NUKTA 0CBF ; InCB; Extend # Mn KANNADA VOWEL SIGN I 0CC0 ; InCB; Extend # Mc KANNADA VOWEL SIGN II 0CC2 ; InCB; Extend # Mc KANNADA VOWEL SIGN UU 0CC6 ; InCB; Extend # Mn KANNADA VOWEL SIGN E 0CC7..0CC8 ; InCB; Extend # Mc [2] KANNADA VOWEL SIGN EE..KANNADA VOWEL SIGN AI 0CCA..0CCB ; InCB; Extend # Mc [2] KANNADA VOWEL SIGN O..KANNADA VOWEL SIGN OO 0CCC..0CCD ; InCB; Extend # Mn [2] KANNADA VOWEL SIGN AU..KANNADA SIGN VIRAMA 0CD5..0CD6 ; InCB; Extend # Mc [2] KANNADA LENGTH MARK..KANNADA AI LENGTH 
MARK 0CE2..0CE3 ; InCB; Extend # Mn [2] KANNADA VOWEL SIGN VOCALIC L..KANNADA VOWEL SIGN VOCALIC LL 0D00..0D01 ; InCB; Extend # Mn [2] MALAYALAM SIGN COMBINING ANUSVARA ABOVE..MALAYALAM SIGN CANDRABINDU 0D3B..0D3C ; InCB; Extend # Mn [2] MALAYALAM SIGN VERTICAL BAR VIRAMA..MALAYALAM SIGN CIRCULAR VIRAMA 0D3E ; InCB; Extend # Mc MALAYALAM VOWEL SIGN AA 0D41..0D44 ; InCB; Extend # Mn [4] MALAYALAM VOWEL SIGN U..MALAYALAM VOWEL SIGN VOCALIC RR 0D57 ; InCB; Extend # Mc MALAYALAM AU LENGTH MARK 0D62..0D63 ; InCB; Extend # Mn [2] MALAYALAM VOWEL SIGN VOCALIC L..MALAYALAM VOWEL SIGN VOCALIC LL 0D81 ; InCB; Extend # Mn SINHALA SIGN CANDRABINDU 0DCA ; InCB; Extend # Mn SINHALA SIGN AL-LAKUNA 0DCF ; InCB; Extend # Mc SINHALA VOWEL SIGN AELA-PILLA 0DD2..0DD4 ; InCB; Extend # Mn [3] SINHALA VOWEL SIGN KETTI IS-PILLA..SINHALA VOWEL SIGN KETTI PAA-PILLA 0DD6 ; InCB; Extend # Mn SINHALA VOWEL SIGN DIGA PAA-PILLA 0DDF ; InCB; Extend # Mc SINHALA VOWEL SIGN GAYANUKITTA 0E31 ; InCB; Extend # Mn THAI CHARACTER MAI HAN-AKAT 0E34..0E3A ; InCB; Extend # Mn [7] THAI CHARACTER SARA I..THAI CHARACTER PHINTHU 0E47..0E4E ; InCB; Extend # Mn [8] THAI CHARACTER MAITAIKHU..THAI CHARACTER YAMAKKAN 0EB1 ; InCB; Extend # Mn LAO VOWEL SIGN MAI KAN 0EB4..0EBC ; InCB; Extend # Mn [9] LAO VOWEL SIGN I..LAO SEMIVOWEL SIGN LO 0EC8..0ECE ; InCB; Extend # Mn [7] LAO TONE MAI EK..LAO YAMAKKAN 0F18..0F19 ; InCB; Extend # Mn [2] TIBETAN ASTROLOGICAL SIGN -KHYUD PA..TIBETAN ASTROLOGICAL SIGN SDONG TSHUGS 0F35 ; InCB; Extend # Mn TIBETAN MARK NGAS BZUNG NYI ZLA 0F37 ; InCB; Extend # Mn TIBETAN MARK NGAS BZUNG SGOR RTAGS 0F39 ; InCB; Extend # Mn TIBETAN MARK TSA -PHRU 0F71..0F7E ; InCB; Extend # Mn [14] TIBETAN VOWEL SIGN AA..TIBETAN SIGN RJES SU NGA RO 0F80..0F84 ; InCB; Extend # Mn [5] TIBETAN VOWEL SIGN REVERSED I..TIBETAN MARK HALANTA 0F86..0F87 ; InCB; Extend # Mn [2] TIBETAN SIGN LCI RTAGS..TIBETAN SIGN YANG RTAGS 0F8D..0F97 ; InCB; Extend # Mn [11] TIBETAN SUBJOINED SIGN LCE TSA CAN..TIBETAN SUBJOINED 
LETTER JA 0F99..0FBC ; InCB; Extend # Mn [36] TIBETAN SUBJOINED LETTER NYA..TIBETAN SUBJOINED LETTER FIXED-FORM RA 0FC6 ; InCB; Extend # Mn TIBETAN SYMBOL PADMA GDAN 102D..1030 ; InCB; Extend # Mn [4] MYANMAR VOWEL SIGN I..MYANMAR VOWEL SIGN UU 1032..1037 ; InCB; Extend # Mn [6] MYANMAR VOWEL SIGN AI..MYANMAR SIGN DOT BELOW 103A ; InCB; Extend # Mn MYANMAR SIGN ASAT 103D..103E ; InCB; Extend # Mn [2] MYANMAR CONSONANT SIGN MEDIAL WA..MYANMAR CONSONANT SIGN MEDIAL HA 1058..1059 ; InCB; Extend # Mn [2] MYANMAR VOWEL SIGN VOCALIC L..MYANMAR VOWEL SIGN VOCALIC LL 105E..1060 ; InCB; Extend # Mn [3] MYANMAR CONSONANT SIGN MON MEDIAL NA..MYANMAR CONSONANT SIGN MON MEDIAL LA 1071..1074 ; InCB; Extend # Mn [4] MYANMAR VOWEL SIGN GEBA KAREN I..MYANMAR VOWEL SIGN KAYAH EE 1082 ; InCB; Extend # Mn MYANMAR CONSONANT SIGN SHAN MEDIAL WA 1085..1086 ; InCB; Extend # Mn [2] MYANMAR VOWEL SIGN SHAN E ABOVE..MYANMAR VOWEL SIGN SHAN FINAL Y 108D ; InCB; Extend # Mn MYANMAR SIGN SHAN COUNCIL EMPHATIC TONE 109D ; InCB; Extend # Mn MYANMAR VOWEL SIGN AITON AI 135D..135F ; InCB; Extend # Mn [3] ETHIOPIC COMBINING GEMINATION AND VOWEL LENGTH MARK..ETHIOPIC COMBINING GEMINATION MARK 1712..1714 ; InCB; Extend # Mn [3] TAGALOG VOWEL SIGN I..TAGALOG SIGN VIRAMA 1715 ; InCB; Extend # Mc TAGALOG SIGN PAMUDPOD 1732..1733 ; InCB; Extend # Mn [2] HANUNOO VOWEL SIGN I..HANUNOO VOWEL SIGN U 1734 ; InCB; Extend # Mc HANUNOO SIGN PAMUDPOD 1752..1753 ; InCB; Extend # Mn [2] BUHID VOWEL SIGN I..BUHID VOWEL SIGN U 1772..1773 ; InCB; Extend # Mn [2] TAGBANWA VOWEL SIGN I..TAGBANWA VOWEL SIGN U 17B4..17B5 ; InCB; Extend # Mn [2] KHMER VOWEL INHERENT AQ..KHMER VOWEL INHERENT AA 17B7..17BD ; InCB; Extend # Mn [7] KHMER VOWEL SIGN I..KHMER VOWEL SIGN UA 17C6 ; InCB; Extend # Mn KHMER SIGN NIKAHIT 17C9..17D1 ; InCB; Extend # Mn [9] KHMER SIGN MUUSIKATOAN..KHMER SIGN VIRIAM 17D3 ; InCB; Extend # Mn KHMER SIGN BATHAMASAT 17DD ; InCB; Extend # Mn KHMER SIGN ATTHACAN 180B..180D ; InCB; Extend # Mn [3] MONGOLIAN 
FREE VARIATION SELECTOR ONE..MONGOLIAN FREE VARIATION SELECTOR THREE 180F ; InCB; Extend # Mn MONGOLIAN FREE VARIATION SELECTOR FOUR 1885..1886 ; InCB; Extend # Mn [2] MONGOLIAN LETTER ALI GALI BALUDA..MONGOLIAN LETTER ALI GALI THREE BALUDA 18A9 ; InCB; Extend # Mn MONGOLIAN LETTER ALI GALI DAGALGA 1920..1922 ; InCB; Extend # Mn [3] LIMBU VOWEL SIGN A..LIMBU VOWEL SIGN U 1927..1928 ; InCB; Extend # Mn [2] LIMBU VOWEL SIGN E..LIMBU VOWEL SIGN O 1932 ; InCB; Extend # Mn LIMBU SMALL LETTER ANUSVARA 1939..193B ; InCB; Extend # Mn [3] LIMBU SIGN MUKPHRENG..LIMBU SIGN SA-I 1A17..1A18 ; InCB; Extend # Mn [2] BUGINESE VOWEL SIGN I..BUGINESE VOWEL SIGN U 1A1B ; InCB; Extend # Mn BUGINESE VOWEL SIGN AE 1A56 ; InCB; Extend # Mn TAI THAM CONSONANT SIGN MEDIAL LA 1A58..1A5E ; InCB; Extend # Mn [7] TAI THAM SIGN MAI KANG LAI..TAI THAM CONSONANT SIGN SA 1A62 ; InCB; Extend # Mn TAI THAM VOWEL SIGN MAI SAT 1A65..1A6C ; InCB; Extend # Mn [8] TAI THAM VOWEL SIGN I..TAI THAM VOWEL SIGN OA BELOW 1A73..1A7C ; InCB; Extend # Mn [10] TAI THAM VOWEL SIGN OA ABOVE..TAI THAM SIGN KHUEN-LUE KARAN 1A7F ; InCB; Extend # Mn TAI THAM COMBINING CRYPTOGRAMMIC DOT 1AB0..1ABD ; InCB; Extend # Mn [14] COMBINING DOUBLED CIRCUMFLEX ACCENT..COMBINING PARENTHESES BELOW 1ABE ; InCB; Extend # Me COMBINING PARENTHESES OVERLAY 1ABF..1ADD ; InCB; Extend # Mn [31] COMBINING LATIN SMALL LETTER W BELOW..COMBINING DOT-AND-RING BELOW 1AE0..1AEB ; InCB; Extend # Mn [12] COMBINING LEFT TACK ABOVE..COMBINING DOUBLE RIGHTWARDS ARROW ABOVE 1B00..1B03 ; InCB; Extend # Mn [4] BALINESE SIGN ULU RICEM..BALINESE SIGN SURANG 1B34 ; InCB; Extend # Mn BALINESE SIGN REREKAN 1B35 ; InCB; Extend # Mc BALINESE VOWEL SIGN TEDUNG 1B36..1B3A ; InCB; Extend # Mn [5] BALINESE VOWEL SIGN ULU..BALINESE VOWEL SIGN RA REPA 1B3B ; InCB; Extend # Mc BALINESE VOWEL SIGN RA REPA TEDUNG 1B3C ; InCB; Extend # Mn BALINESE VOWEL SIGN LA LENGA 1B3D ; InCB; Extend # Mc BALINESE VOWEL SIGN LA LENGA TEDUNG 1B42 ; InCB; Extend # Mn BALINESE VOWEL SIGN 
PEPET 1B43 ; InCB; Extend # Mc BALINESE VOWEL SIGN PEPET TEDUNG 1B6B..1B73 ; InCB; Extend # Mn [9] BALINESE MUSICAL SYMBOL COMBINING TEGEH..BALINESE MUSICAL SYMBOL COMBINING GONG 1B80..1B81 ; InCB; Extend # Mn [2] SUNDANESE SIGN PANYECEK..SUNDANESE SIGN PANGLAYAR 1BA2..1BA5 ; InCB; Extend # Mn [4] SUNDANESE CONSONANT SIGN PANYAKRA..SUNDANESE VOWEL SIGN PANYUKU 1BA8..1BA9 ; InCB; Extend # Mn [2] SUNDANESE VOWEL SIGN PAMEPET..SUNDANESE VOWEL SIGN PANEULEUNG 1BAA ; InCB; Extend # Mc SUNDANESE SIGN PAMAAEH 1BAC..1BAD ; InCB; Extend # Mn [2] SUNDANESE CONSONANT SIGN PASANGAN MA..SUNDANESE CONSONANT SIGN PASANGAN WA 1BE6 ; InCB; Extend # Mn BATAK SIGN TOMPI 1BE8..1BE9 ; InCB; Extend # Mn [2] BATAK VOWEL SIGN PAKPAK E..BATAK VOWEL SIGN EE 1BED ; InCB; Extend # Mn BATAK VOWEL SIGN KARO O 1BEF..1BF1 ; InCB; Extend # Mn [3] BATAK VOWEL SIGN U FOR SIMALUNGUN SA..BATAK CONSONANT SIGN H 1BF2..1BF3 ; InCB; Extend # Mc [2] BATAK PANGOLAT..BATAK PANONGONAN 1C2C..1C33 ; InCB; Extend # Mn [8] LEPCHA VOWEL SIGN E..LEPCHA CONSONANT SIGN T 1C36..1C37 ; InCB; Extend # Mn [2] LEPCHA SIGN RAN..LEPCHA SIGN NUKTA 1CD0..1CD2 ; InCB; Extend # Mn [3] VEDIC TONE KARSHANA..VEDIC TONE PRENKHA 1CD4..1CE0 ; InCB; Extend # Mn [13] VEDIC SIGN YAJURVEDIC MIDLINE SVARITA..VEDIC TONE RIGVEDIC KASHMIRI INDEPENDENT SVARITA 1CE2..1CE8 ; InCB; Extend # Mn [7] VEDIC SIGN VISARGA SVARITA..VEDIC SIGN VISARGA ANUDATTA WITH TAIL 1CED ; InCB; Extend # Mn VEDIC SIGN TIRYAK 1CF4 ; InCB; Extend # Mn VEDIC TONE CANDRA ABOVE 1CF8..1CF9 ; InCB; Extend # Mn [2] VEDIC TONE RING ABOVE..VEDIC TONE DOUBLE RING ABOVE 1DC0..1DFF ; InCB; Extend # Mn [64] COMBINING DOTTED GRAVE ACCENT..COMBINING RIGHT ARROWHEAD AND DOWN ARROWHEAD BELOW 200D ; InCB; Extend # Cf ZERO WIDTH JOINER 20D0..20DC ; InCB; Extend # Mn [13] COMBINING LEFT HARPOON ABOVE..COMBINING FOUR DOTS ABOVE 20DD..20E0 ; InCB; Extend # Me [4] COMBINING ENCLOSING CIRCLE..COMBINING ENCLOSING CIRCLE BACKSLASH 20E1 ; InCB; Extend # Mn COMBINING LEFT RIGHT ARROW ABOVE 
20E2..20E4 ; InCB; Extend # Me [3] COMBINING ENCLOSING SCREEN..COMBINING ENCLOSING UPWARD POINTING TRIANGLE 20E5..20F0 ; InCB; Extend # Mn [12] COMBINING REVERSE SOLIDUS OVERLAY..COMBINING ASTERISK ABOVE 2CEF..2CF1 ; InCB; Extend # Mn [3] COPTIC COMBINING NI ABOVE..COPTIC COMBINING SPIRITUS LENIS 2D7F ; InCB; Extend # Mn TIFINAGH CONSONANT JOINER 2DE0..2DFF ; InCB; Extend # Mn [32] COMBINING CYRILLIC LETTER BE..COMBINING CYRILLIC LETTER IOTIFIED BIG YUS 302A..302D ; InCB; Extend # Mn [4] IDEOGRAPHIC LEVEL TONE MARK..IDEOGRAPHIC ENTERING TONE MARK 302E..302F ; InCB; Extend # Mc [2] HANGUL SINGLE DOT TONE MARK..HANGUL DOUBLE DOT TONE MARK 3099..309A ; InCB; Extend # Mn [2] COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK..COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK A66F ; InCB; Extend # Mn COMBINING CYRILLIC VZMET A670..A672 ; InCB; Extend # Me [3] COMBINING CYRILLIC TEN MILLIONS SIGN..COMBINING CYRILLIC THOUSAND MILLIONS SIGN A674..A67D ; InCB; Extend # Mn [10] COMBINING CYRILLIC LETTER UKRAINIAN IE..COMBINING CYRILLIC PAYEROK A69E..A69F ; InCB; Extend # Mn [2] COMBINING CYRILLIC LETTER EF..COMBINING CYRILLIC LETTER IOTIFIED E A6F0..A6F1 ; InCB; Extend # Mn [2] BAMUM COMBINING MARK KOQNDON..BAMUM COMBINING MARK TUKWENTIS A802 ; InCB; Extend # Mn SYLOTI NAGRI SIGN DVISVARA A806 ; InCB; Extend # Mn SYLOTI NAGRI SIGN HASANTA A80B ; InCB; Extend # Mn SYLOTI NAGRI SIGN ANUSVARA A825..A826 ; InCB; Extend # Mn [2] SYLOTI NAGRI VOWEL SIGN U..SYLOTI NAGRI VOWEL SIGN E A82C ; InCB; Extend # Mn SYLOTI NAGRI SIGN ALTERNATE HASANTA A8C4..A8C5 ; InCB; Extend # Mn [2] SAURASHTRA SIGN VIRAMA..SAURASHTRA SIGN CANDRABINDU A8E0..A8F1 ; InCB; Extend # Mn [18] COMBINING DEVANAGARI DIGIT ZERO..COMBINING DEVANAGARI SIGN AVAGRAHA A8FF ; InCB; Extend # Mn DEVANAGARI VOWEL SIGN AY A926..A92D ; InCB; Extend # Mn [8] KAYAH LI VOWEL UE..KAYAH LI TONE CALYA PLOPHU A947..A951 ; InCB; Extend # Mn [11] REJANG VOWEL SIGN I..REJANG CONSONANT SIGN R A953 ; InCB; Extend # Mc REJANG VIRAMA 
A980..A982 ; InCB; Extend # Mn [3] JAVANESE SIGN PANYANGGA..JAVANESE SIGN LAYAR A9B3 ; InCB; Extend # Mn JAVANESE SIGN CECAK TELU A9B6..A9B9 ; InCB; Extend # Mn [4] JAVANESE VOWEL SIGN WULU..JAVANESE VOWEL SIGN SUKU MENDUT A9BC..A9BD ; InCB; Extend # Mn [2] JAVANESE VOWEL SIGN PEPET..JAVANESE CONSONANT SIGN KERET A9E5 ; InCB; Extend # Mn MYANMAR SIGN SHAN SAW AA29..AA2E ; InCB; Extend # Mn [6] CHAM VOWEL SIGN AA..CHAM VOWEL SIGN OE AA31..AA32 ; InCB; Extend # Mn [2] CHAM VOWEL SIGN AU..CHAM VOWEL SIGN UE AA35..AA36 ; InCB; Extend # Mn [2] CHAM CONSONANT SIGN LA..CHAM CONSONANT SIGN WA AA43 ; InCB; Extend # Mn CHAM CONSONANT SIGN FINAL NG AA4C ; InCB; Extend # Mn CHAM CONSONANT SIGN FINAL M AA7C ; InCB; Extend # Mn MYANMAR SIGN TAI LAING TONE-2 AAB0 ; InCB; Extend # Mn TAI VIET MAI KANG AAB2..AAB4 ; InCB; Extend # Mn [3] TAI VIET VOWEL I..TAI VIET VOWEL U AAB7..AAB8 ; InCB; Extend # Mn [2] TAI VIET MAI KHIT..TAI VIET VOWEL IA AABE..AABF ; InCB; Extend # Mn [2] TAI VIET VOWEL AM..TAI VIET TONE MAI EK AAC1 ; InCB; Extend # Mn TAI VIET TONE MAI THO AAEC..AAED ; InCB; Extend # Mn [2] MEETEI MAYEK VOWEL SIGN UU..MEETEI MAYEK VOWEL SIGN AAI ABE5 ; InCB; Extend # Mn MEETEI MAYEK VOWEL SIGN ANAP ABE8 ; InCB; Extend # Mn MEETEI MAYEK VOWEL SIGN UNAP ABED ; InCB; Extend # Mn MEETEI MAYEK APUN IYEK FB1E ; InCB; Extend # Mn HEBREW POINT JUDEO-SPANISH VARIKA FE00..FE0F ; InCB; Extend # Mn [16] VARIATION SELECTOR-1..VARIATION SELECTOR-16 FE20..FE2F ; InCB; Extend # Mn [16] COMBINING LIGATURE LEFT HALF..COMBINING CYRILLIC TITLO RIGHT HALF FF9E..FF9F ; InCB; Extend # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK 101FD ; InCB; Extend # Mn PHAISTOS DISC SIGN COMBINING OBLIQUE STROKE 102E0 ; InCB; Extend # Mn COPTIC EPACT THOUSANDS MARK 10376..1037A ; InCB; Extend # Mn [5] COMBINING OLD PERMIC LETTER AN..COMBINING OLD PERMIC LETTER SII 10A01..10A03 ; InCB; Extend # Mn [3] KHAROSHTHI VOWEL SIGN I..KHAROSHTHI VOWEL SIGN VOCALIC R 10A05..10A06 ; 
InCB; Extend # Mn [2] KHAROSHTHI VOWEL SIGN E..KHAROSHTHI VOWEL SIGN O 10A0C..10A0F ; InCB; Extend # Mn [4] KHAROSHTHI VOWEL LENGTH MARK..KHAROSHTHI SIGN VISARGA 10A38..10A3A ; InCB; Extend # Mn [3] KHAROSHTHI SIGN BAR ABOVE..KHAROSHTHI SIGN DOT BELOW 10AE5..10AE6 ; InCB; Extend # Mn [2] MANICHAEAN ABBREVIATION MARK ABOVE..MANICHAEAN ABBREVIATION MARK BELOW 10D24..10D27 ; InCB; Extend # Mn [4] HANIFI ROHINGYA SIGN HARBAHAY..HANIFI ROHINGYA SIGN TASSI 10D69..10D6D ; InCB; Extend # Mn [5] GARAY VOWEL SIGN E..GARAY CONSONANT NASALIZATION MARK 10EAB..10EAC ; InCB; Extend # Mn [2] YEZIDI COMBINING HAMZA MARK..YEZIDI COMBINING MADDA MARK 10EFA..10EFF ; InCB; Extend # Mn [6] ARABIC DOUBLE VERTICAL BAR BELOW..ARABIC SMALL LOW WORD MADDA 10F46..10F50 ; InCB; Extend # Mn [11] SOGDIAN COMBINING DOT BELOW..SOGDIAN COMBINING STROKE BELOW 10F82..10F85 ; InCB; Extend # Mn [4] OLD UYGHUR COMBINING DOT ABOVE..OLD UYGHUR COMBINING TWO DOTS BELOW 11001 ; InCB; Extend # Mn BRAHMI SIGN ANUSVARA 11038..11046 ; InCB; Extend # Mn [15] BRAHMI VOWEL SIGN AA..BRAHMI VIRAMA 11070 ; InCB; Extend # Mn BRAHMI SIGN OLD TAMIL VIRAMA 11073..11074 ; InCB; Extend # Mn [2] BRAHMI VOWEL SIGN OLD TAMIL SHORT E..BRAHMI VOWEL SIGN OLD TAMIL SHORT O 1107F..11081 ; InCB; Extend # Mn [3] BRAHMI NUMBER JOINER..KAITHI SIGN ANUSVARA 110B3..110B6 ; InCB; Extend # Mn [4] KAITHI VOWEL SIGN U..KAITHI VOWEL SIGN AI 110B9..110BA ; InCB; Extend # Mn [2] KAITHI SIGN VIRAMA..KAITHI SIGN NUKTA 110C2 ; InCB; Extend # Mn KAITHI VOWEL SIGN VOCALIC R 11100..11102 ; InCB; Extend # Mn [3] CHAKMA SIGN CANDRABINDU..CHAKMA SIGN VISARGA 11127..1112B ; InCB; Extend # Mn [5] CHAKMA VOWEL SIGN A..CHAKMA VOWEL SIGN UU 1112D..11132 ; InCB; Extend # Mn [6] CHAKMA VOWEL SIGN AI..CHAKMA AU MARK 11134 ; InCB; Extend # Mn CHAKMA MAAYYAA 11173 ; InCB; Extend # Mn MAHAJANI SIGN NUKTA 11180..11181 ; InCB; Extend # Mn [2] SHARADA SIGN CANDRABINDU..SHARADA SIGN ANUSVARA 111B6..111BE ; InCB; Extend # Mn [9] SHARADA VOWEL SIGN U..SHARADA VOWEL 
SIGN O 111C0 ; InCB; Extend # Mc SHARADA SIGN VIRAMA 111C9..111CC ; InCB; Extend # Mn [4] SHARADA SANDHI MARK..SHARADA EXTRA SHORT VOWEL MARK 111CF ; InCB; Extend # Mn SHARADA SIGN INVERTED CANDRABINDU 1122F..11231 ; InCB; Extend # Mn [3] KHOJKI VOWEL SIGN U..KHOJKI VOWEL SIGN AI 11234 ; InCB; Extend # Mn KHOJKI SIGN ANUSVARA 11235 ; InCB; Extend # Mc KHOJKI SIGN VIRAMA 11236..11237 ; InCB; Extend # Mn [2] KHOJKI SIGN NUKTA..KHOJKI SIGN SHADDA 1123E ; InCB; Extend # Mn KHOJKI SIGN SUKUN 11241 ; InCB; Extend # Mn KHOJKI VOWEL SIGN VOCALIC R 112DF ; InCB; Extend # Mn KHUDAWADI SIGN ANUSVARA 112E3..112EA ; InCB; Extend # Mn [8] KHUDAWADI VOWEL SIGN U..KHUDAWADI SIGN VIRAMA 11300..11301 ; InCB; Extend # Mn [2] GRANTHA SIGN COMBINING ANUSVARA ABOVE..GRANTHA SIGN CANDRABINDU 1133B..1133C ; InCB; Extend # Mn [2] COMBINING BINDU BELOW..GRANTHA SIGN NUKTA 1133E ; InCB; Extend # Mc GRANTHA VOWEL SIGN AA 11340 ; InCB; Extend # Mn GRANTHA VOWEL SIGN II 1134D ; InCB; Extend # Mc GRANTHA SIGN VIRAMA 11357 ; InCB; Extend # Mc GRANTHA AU LENGTH MARK 11366..1136C ; InCB; Extend # Mn [7] COMBINING GRANTHA DIGIT ZERO..COMBINING GRANTHA DIGIT SIX 11370..11374 ; InCB; Extend # Mn [5] COMBINING GRANTHA LETTER A..COMBINING GRANTHA LETTER PA 113B8 ; InCB; Extend # Mc TULU-TIGALARI VOWEL SIGN AA 113BB..113C0 ; InCB; Extend # Mn [6] TULU-TIGALARI VOWEL SIGN U..TULU-TIGALARI VOWEL SIGN VOCALIC LL 113C2 ; InCB; Extend # Mc TULU-TIGALARI VOWEL SIGN EE 113C5 ; InCB; Extend # Mc TULU-TIGALARI VOWEL SIGN AI 113C7..113C9 ; InCB; Extend # Mc [3] TULU-TIGALARI VOWEL SIGN OO..TULU-TIGALARI AU LENGTH MARK 113CE ; InCB; Extend # Mn TULU-TIGALARI SIGN VIRAMA 113CF ; InCB; Extend # Mc TULU-TIGALARI SIGN LOOPED VIRAMA 113D2 ; InCB; Extend # Mn TULU-TIGALARI GEMINATION MARK 113E1..113E2 ; InCB; Extend # Mn [2] TULU-TIGALARI VEDIC TONE SVARITA..TULU-TIGALARI VEDIC TONE ANUDATTA 11438..1143F ; InCB; Extend # Mn [8] NEWA VOWEL SIGN U..NEWA VOWEL SIGN AI 11442..11444 ; InCB; Extend # Mn [3] NEWA SIGN 
VIRAMA..NEWA SIGN ANUSVARA 11446 ; InCB; Extend # Mn NEWA SIGN NUKTA 1145E ; InCB; Extend # Mn NEWA SANDHI MARK 114B0 ; InCB; Extend # Mc TIRHUTA VOWEL SIGN AA 114B3..114B8 ; InCB; Extend # Mn [6] TIRHUTA VOWEL SIGN U..TIRHUTA VOWEL SIGN VOCALIC LL 114BA ; InCB; Extend # Mn TIRHUTA VOWEL SIGN SHORT E 114BD ; InCB; Extend # Mc TIRHUTA VOWEL SIGN SHORT O 114BF..114C0 ; InCB; Extend # Mn [2] TIRHUTA SIGN CANDRABINDU..TIRHUTA SIGN ANUSVARA 114C2..114C3 ; InCB; Extend # Mn [2] TIRHUTA SIGN VIRAMA..TIRHUTA SIGN NUKTA 115AF ; InCB; Extend # Mc SIDDHAM VOWEL SIGN AA 115B2..115B5 ; InCB; Extend # Mn [4] SIDDHAM VOWEL SIGN U..SIDDHAM VOWEL SIGN VOCALIC RR 115BC..115BD ; InCB; Extend # Mn [2] SIDDHAM SIGN CANDRABINDU..SIDDHAM SIGN ANUSVARA 115BF..115C0 ; InCB; Extend # Mn [2] SIDDHAM SIGN VIRAMA..SIDDHAM SIGN NUKTA 115DC..115DD ; InCB; Extend # Mn [2] SIDDHAM VOWEL SIGN ALTERNATE U..SIDDHAM VOWEL SIGN ALTERNATE UU 11633..1163A ; InCB; Extend # Mn [8] MODI VOWEL SIGN U..MODI VOWEL SIGN AI 1163D ; InCB; Extend # Mn MODI SIGN ANUSVARA 1163F..11640 ; InCB; Extend # Mn [2] MODI SIGN VIRAMA..MODI SIGN ARDHACANDRA 116AB ; InCB; Extend # Mn TAKRI SIGN ANUSVARA 116AD ; InCB; Extend # Mn TAKRI VOWEL SIGN AA 116B0..116B5 ; InCB; Extend # Mn [6] TAKRI VOWEL SIGN U..TAKRI VOWEL SIGN AU 116B6 ; InCB; Extend # Mc TAKRI SIGN VIRAMA 116B7 ; InCB; Extend # Mn TAKRI SIGN NUKTA 1171D ; InCB; Extend # Mn AHOM CONSONANT SIGN MEDIAL LA 1171F ; InCB; Extend # Mn AHOM CONSONANT SIGN MEDIAL LIGATING RA 11722..11725 ; InCB; Extend # Mn [4] AHOM VOWEL SIGN I..AHOM VOWEL SIGN UU 11727..1172B ; InCB; Extend # Mn [5] AHOM VOWEL SIGN AW..AHOM SIGN KILLER 1182F..11837 ; InCB; Extend # Mn [9] DOGRA VOWEL SIGN U..DOGRA SIGN ANUSVARA 11839..1183A ; InCB; Extend # Mn [2] DOGRA SIGN VIRAMA..DOGRA SIGN NUKTA 11930 ; InCB; Extend # Mc DIVES AKURU VOWEL SIGN AA 1193B..1193C ; InCB; Extend # Mn [2] DIVES AKURU SIGN ANUSVARA..DIVES AKURU SIGN CANDRABINDU 1193D ; InCB; Extend # Mc DIVES AKURU SIGN HALANTA 11943 ; InCB; 
Extend # Mn DIVES AKURU SIGN NUKTA 119D4..119D7 ; InCB; Extend # Mn [4] NANDINAGARI VOWEL SIGN U..NANDINAGARI VOWEL SIGN VOCALIC RR 119DA..119DB ; InCB; Extend # Mn [2] NANDINAGARI VOWEL SIGN E..NANDINAGARI VOWEL SIGN AI 119E0 ; InCB; Extend # Mn NANDINAGARI SIGN VIRAMA 11A01..11A0A ; InCB; Extend # Mn [10] ZANABAZAR SQUARE VOWEL SIGN I..ZANABAZAR SQUARE VOWEL LENGTH MARK 11A33..11A38 ; InCB; Extend # Mn [6] ZANABAZAR SQUARE FINAL CONSONANT MARK..ZANABAZAR SQUARE SIGN ANUSVARA 11A3B..11A3E ; InCB; Extend # Mn [4] ZANABAZAR SQUARE CLUSTER-FINAL LETTER YA..ZANABAZAR SQUARE CLUSTER-FINAL LETTER VA 11A51..11A56 ; InCB; Extend # Mn [6] SOYOMBO VOWEL SIGN I..SOYOMBO VOWEL SIGN OE 11A59..11A5B ; InCB; Extend # Mn [3] SOYOMBO VOWEL SIGN VOCALIC R..SOYOMBO VOWEL LENGTH MARK 11A8A..11A96 ; InCB; Extend # Mn [13] SOYOMBO FINAL CONSONANT SIGN G..SOYOMBO SIGN ANUSVARA 11A98 ; InCB; Extend # Mn SOYOMBO GEMINATION MARK 11B60 ; InCB; Extend # Mn SHARADA VOWEL SIGN OE 11B62..11B64 ; InCB; Extend # Mn [3] SHARADA VOWEL SIGN UE..SHARADA VOWEL SIGN SHORT E 11B66 ; InCB; Extend # Mn SHARADA VOWEL SIGN CANDRA E 11C30..11C36 ; InCB; Extend # Mn [7] BHAIKSUKI VOWEL SIGN I..BHAIKSUKI VOWEL SIGN VOCALIC L 11C38..11C3D ; InCB; Extend # Mn [6] BHAIKSUKI VOWEL SIGN E..BHAIKSUKI SIGN ANUSVARA 11C3F ; InCB; Extend # Mn BHAIKSUKI SIGN VIRAMA 11C92..11CA7 ; InCB; Extend # Mn [22] MARCHEN SUBJOINED LETTER KA..MARCHEN SUBJOINED LETTER ZA 11CAA..11CB0 ; InCB; Extend # Mn [7] MARCHEN SUBJOINED LETTER RA..MARCHEN VOWEL SIGN AA 11CB2..11CB3 ; InCB; Extend # Mn [2] MARCHEN VOWEL SIGN U..MARCHEN VOWEL SIGN E 11CB5..11CB6 ; InCB; Extend # Mn [2] MARCHEN SIGN ANUSVARA..MARCHEN SIGN CANDRABINDU 11D31..11D36 ; InCB; Extend # Mn [6] MASARAM GONDI VOWEL SIGN AA..MASARAM GONDI VOWEL SIGN VOCALIC R 11D3A ; InCB; Extend # Mn MASARAM GONDI VOWEL SIGN E 11D3C..11D3D ; InCB; Extend # Mn [2] MASARAM GONDI VOWEL SIGN AI..MASARAM GONDI VOWEL SIGN O 11D3F..11D45 ; InCB; Extend # Mn [7] MASARAM GONDI VOWEL SIGN 
AU..MASARAM GONDI VIRAMA 11D47 ; InCB; Extend # Mn MASARAM GONDI RA-KARA 11D90..11D91 ; InCB; Extend # Mn [2] GUNJALA GONDI VOWEL SIGN EE..GUNJALA GONDI VOWEL SIGN AI 11D95 ; InCB; Extend # Mn GUNJALA GONDI SIGN ANUSVARA 11D97 ; InCB; Extend # Mn GUNJALA GONDI VIRAMA 11EF3..11EF4 ; InCB; Extend # Mn [2] MAKASAR VOWEL SIGN I..MAKASAR VOWEL SIGN U 11F00..11F01 ; InCB; Extend # Mn [2] KAWI SIGN CANDRABINDU..KAWI SIGN ANUSVARA 11F36..11F3A ; InCB; Extend # Mn [5] KAWI VOWEL SIGN I..KAWI VOWEL SIGN VOCALIC R 11F40 ; InCB; Extend # Mn KAWI VOWEL SIGN EU 11F41 ; InCB; Extend # Mc KAWI SIGN KILLER 11F5A ; InCB; Extend # Mn KAWI SIGN NUKTA 13440 ; InCB; Extend # Mn EGYPTIAN HIEROGLYPH MIRROR HORIZONTALLY 13447..13455 ; InCB; Extend # Mn [15] EGYPTIAN HIEROGLYPH MODIFIER DAMAGED AT TOP START..EGYPTIAN HIEROGLYPH MODIFIER DAMAGED 1611E..16129 ; InCB; Extend # Mn [12] GURUNG KHEMA VOWEL SIGN AA..GURUNG KHEMA VOWEL LENGTH MARK 1612D..1612F ; InCB; Extend # Mn [3] GURUNG KHEMA SIGN ANUSVARA..GURUNG KHEMA SIGN THOLHOMA 16AF0..16AF4 ; InCB; Extend # Mn [5] BASSA VAH COMBINING HIGH TONE..BASSA VAH COMBINING HIGH-LOW TONE 16B30..16B36 ; InCB; Extend # Mn [7] PAHAWH HMONG MARK CIM TUB..PAHAWH HMONG MARK CIM TAUM 16F4F ; InCB; Extend # Mn MIAO SIGN CONSONANT MODIFIER BAR 16F8F..16F92 ; InCB; Extend # Mn [4] MIAO TONE RIGHT..MIAO TONE BELOW 16FE4 ; InCB; Extend # Mn KHITAN SMALL SCRIPT FILLER 16FF0..16FF1 ; InCB; Extend # Mc [2] VIETNAMESE ALTERNATE READING MARK CA..VIETNAMESE ALTERNATE READING MARK NHAY 1BC9D..1BC9E ; InCB; Extend # Mn [2] DUPLOYAN THICK LETTER SELECTOR..DUPLOYAN DOUBLE MARK 1CF00..1CF2D ; InCB; Extend # Mn [46] ZNAMENNY COMBINING MARK GORAZDO NIZKO S KRYZHEM ON LEFT..ZNAMENNY COMBINING MARK KRYZH ON LEFT 1CF30..1CF46 ; InCB; Extend # Mn [23] ZNAMENNY COMBINING TONAL RANGE MARK MRACHNO..ZNAMENNY PRIZNAK MODIFIER ROG 1D165..1D166 ; InCB; Extend # Mc [2] MUSICAL SYMBOL COMBINING STEM..MUSICAL SYMBOL COMBINING SPRECHGESANG STEM 1D167..1D169 ; InCB; Extend # Mn [3] 
MUSICAL SYMBOL COMBINING TREMOLO-1..MUSICAL SYMBOL COMBINING TREMOLO-3 1D16D..1D172 ; InCB; Extend # Mc [6] MUSICAL SYMBOL COMBINING AUGMENTATION DOT..MUSICAL SYMBOL COMBINING FLAG-5 1D17B..1D182 ; InCB; Extend # Mn [8] MUSICAL SYMBOL COMBINING ACCENT..MUSICAL SYMBOL COMBINING LOURE 1D185..1D18B ; InCB; Extend # Mn [7] MUSICAL SYMBOL COMBINING DOIT..MUSICAL SYMBOL COMBINING TRIPLE TONGUE 1D1AA..1D1AD ; InCB; Extend # Mn [4] MUSICAL SYMBOL COMBINING DOWN BOW..MUSICAL SYMBOL COMBINING SNAP PIZZICATO 1D242..1D244 ; InCB; Extend # Mn [3] COMBINING GREEK MUSICAL TRISEME..COMBINING GREEK MUSICAL PENTASEME 1DA00..1DA36 ; InCB; Extend # Mn [55] SIGNWRITING HEAD RIM..SIGNWRITING AIR SUCKING IN 1DA3B..1DA6C ; InCB; Extend # Mn [50] SIGNWRITING MOUTH CLOSED NEUTRAL..SIGNWRITING EXCITEMENT 1DA75 ; InCB; Extend # Mn SIGNWRITING UPPER BODY TILTING FROM HIP JOINTS 1DA84 ; InCB; Extend # Mn SIGNWRITING LOCATION HEAD NECK 1DA9B..1DA9F ; InCB; Extend # Mn [5] SIGNWRITING FILL MODIFIER-2..SIGNWRITING FILL MODIFIER-6 1DAA1..1DAAF ; InCB; Extend # Mn [15] SIGNWRITING ROTATION MODIFIER-2..SIGNWRITING ROTATION MODIFIER-16 1E000..1E006 ; InCB; Extend # Mn [7] COMBINING GLAGOLITIC LETTER AZU..COMBINING GLAGOLITIC LETTER ZHIVETE 1E008..1E018 ; InCB; Extend # Mn [17] COMBINING GLAGOLITIC LETTER ZEMLJA..COMBINING GLAGOLITIC LETTER HERU 1E01B..1E021 ; InCB; Extend # Mn [7] COMBINING GLAGOLITIC LETTER SHTA..COMBINING GLAGOLITIC LETTER YATI 1E023..1E024 ; InCB; Extend # Mn [2] COMBINING GLAGOLITIC LETTER YU..COMBINING GLAGOLITIC LETTER SMALL YUS 1E026..1E02A ; InCB; Extend # Mn [5] COMBINING GLAGOLITIC LETTER YO..COMBINING GLAGOLITIC LETTER FITA 1E08F ; InCB; Extend # Mn COMBINING CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I 1E130..1E136 ; InCB; Extend # Mn [7] NYIAKENG PUACHUE HMONG TONE-B..NYIAKENG PUACHUE HMONG TONE-D 1E2AE ; InCB; Extend # Mn TOTO SIGN RISING TONE 1E2EC..1E2EF ; InCB; Extend # Mn [4] WANCHO TONE TUP..WANCHO TONE KOINI 1E4EC..1E4EF ; InCB; Extend # Mn [4] NAG MUNDARI SIGN 
MUHOR..NAG MUNDARI SIGN SUTUH 1E5EE..1E5EF ; InCB; Extend # Mn [2] OL ONAL SIGN MU..OL ONAL SIGN IKIR 1E6E3 ; InCB; Extend # Mn TAI YO SIGN UE 1E6E6 ; InCB; Extend # Mn TAI YO SIGN AU 1E6EE..1E6EF ; InCB; Extend # Mn [2] TAI YO SIGN AY..TAI YO SIGN ANG 1E6F5 ; InCB; Extend # Mn TAI YO SIGN OM 1E8D0..1E8D6 ; InCB; Extend # Mn [7] MENDE KIKAKUI COMBINING NUMBER TEENS..MENDE KIKAKUI COMBINING NUMBER MILLIONS 1E944..1E94A ; InCB; Extend # Mn [7] ADLAM ALIF LENGTHENER..ADLAM NUKTA 1F3FB..1F3FF ; InCB; Extend # Sk [5] EMOJI MODIFIER FITZPATRICK TYPE-1-2..EMOJI MODIFIER FITZPATRICK TYPE-6 E0020..E007F ; InCB; Extend # Cf [96] TAG SPACE..CANCEL TAG E0100..E01EF ; InCB; Extend # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256 # Total code points: 2217 # EOF ================================================ FILE: maint/Unicode.tables/DerivedGeneralCategory.txt ================================================ # DerivedGeneralCategory-17.0.0.txt # Date: 2025-07-24, 00:12:50 GMT # © 2025 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use and license, see https://www.unicode.org/terms_of_use.html # # Unicode Character Database # For documentation, see https://www.unicode.org/reports/tr44/ # ================================================ # Property: General_Category # ================================================ # General_Category=Unassigned 0378..0379 ; Cn # [2] .. 0380..0383 ; Cn # [4] .. 038B ; Cn # 038D ; Cn # 03A2 ; Cn # 0530 ; Cn # 0557..0558 ; Cn # [2] .. 058B..058C ; Cn # [2] .. 0590 ; Cn # 05C8..05CF ; Cn # [8] .. 05EB..05EE ; Cn # [4] .. 05F5..05FF ; Cn # [11] .. 070E ; Cn # 074B..074C ; Cn # [2] .. 07B2..07BF ; Cn # [14] .. 07FB..07FC ; Cn # [2] .. 082E..082F ; Cn # [2] .. 083F ; Cn # 085C..085D ; Cn # [2] .. 085F ; Cn # 086B..086F ; Cn # [5] .. 0892..0896 ; Cn # [5] .. 0984 ; Cn # 098D..098E ; Cn # [2] .. 0991..0992 ; Cn # [2] .. 
09A9 ; Cn # 09B1 ; Cn # 09B3..09B5 ; Cn # [3] .. 09BA..09BB ; Cn # [2] .. 09C5..09C6 ; Cn # [2] .. 09C9..09CA ; Cn # [2] .. 09CF..09D6 ; Cn # [8] .. 09D8..09DB ; Cn # [4] .. 09DE ; Cn # 09E4..09E5 ; Cn # [2] .. 09FF..0A00 ; Cn # [2] .. 0A04 ; Cn # 0A0B..0A0E ; Cn # [4] .. 0A11..0A12 ; Cn # [2] .. 0A29 ; Cn # 0A31 ; Cn # 0A34 ; Cn # 0A37 ; Cn # 0A3A..0A3B ; Cn # [2] .. 0A3D ; Cn # 0A43..0A46 ; Cn # [4] .. 0A49..0A4A ; Cn # [2] .. 0A4E..0A50 ; Cn # [3] .. 0A52..0A58 ; Cn # [7] .. 0A5D ; Cn # 0A5F..0A65 ; Cn # [7] .. 0A77..0A80 ; Cn # [10] .. 0A84 ; Cn # 0A8E ; Cn # 0A92 ; Cn # 0AA9 ; Cn # 0AB1 ; Cn # 0AB4 ; Cn # 0ABA..0ABB ; Cn # [2] .. 0AC6 ; Cn # 0ACA ; Cn # 0ACE..0ACF ; Cn # [2] .. 0AD1..0ADF ; Cn # [15] .. 0AE4..0AE5 ; Cn # [2] .. 0AF2..0AF8 ; Cn # [7] .. 0B00 ; Cn # 0B04 ; Cn # 0B0D..0B0E ; Cn # [2] .. 0B11..0B12 ; Cn # [2] .. 0B29 ; Cn # 0B31 ; Cn # 0B34 ; Cn # 0B3A..0B3B ; Cn # [2] .. 0B45..0B46 ; Cn # [2] .. 0B49..0B4A ; Cn # [2] .. 0B4E..0B54 ; Cn # [7] .. 0B58..0B5B ; Cn # [4] .. 0B5E ; Cn # 0B64..0B65 ; Cn # [2] .. 0B78..0B81 ; Cn # [10] .. 0B84 ; Cn # 0B8B..0B8D ; Cn # [3] .. 0B91 ; Cn # 0B96..0B98 ; Cn # [3] .. 0B9B ; Cn # 0B9D ; Cn # 0BA0..0BA2 ; Cn # [3] .. 0BA5..0BA7 ; Cn # [3] .. 0BAB..0BAD ; Cn # [3] .. 0BBA..0BBD ; Cn # [4] .. 0BC3..0BC5 ; Cn # [3] .. 0BC9 ; Cn # 0BCE..0BCF ; Cn # [2] .. 0BD1..0BD6 ; Cn # [6] .. 0BD8..0BE5 ; Cn # [14] .. 0BFB..0BFF ; Cn # [5] .. 0C0D ; Cn # 0C11 ; Cn # 0C29 ; Cn # 0C3A..0C3B ; Cn # [2] .. 0C45 ; Cn # 0C49 ; Cn # 0C4E..0C54 ; Cn # [7] .. 0C57 ; Cn # 0C5B ; Cn # 0C5E..0C5F ; Cn # [2] .. 0C64..0C65 ; Cn # [2] .. 0C70..0C76 ; Cn # [7] .. 0C8D ; Cn # 0C91 ; Cn # 0CA9 ; Cn # 0CB4 ; Cn # 0CBA..0CBB ; Cn # [2] .. 0CC5 ; Cn # 0CC9 ; Cn # 0CCE..0CD4 ; Cn # [7] .. 0CD7..0CDB ; Cn # [5] .. 0CDF ; Cn # 0CE4..0CE5 ; Cn # [2] .. 0CF0 ; Cn # 0CF4..0CFF ; Cn # [12] .. 0D0D ; Cn # 0D11 ; Cn # 0D45 ; Cn # 0D49 ; Cn # 0D50..0D53 ; Cn # [4] .. 0D64..0D65 ; Cn # [2] .. 0D80 ; Cn # 0D84 ; Cn # 0D97..0D99 ; Cn # [3] .. 
0DB2 ; Cn # 0DBC ; Cn # 0DBE..0DBF ; Cn # [2] .. 0DC7..0DC9 ; Cn # [3] .. 0DCB..0DCE ; Cn # [4] .. 0DD5 ; Cn # 0DD7 ; Cn # 0DE0..0DE5 ; Cn # [6] .. 0DF0..0DF1 ; Cn # [2] .. 0DF5..0E00 ; Cn # [12] .. 0E3B..0E3E ; Cn # [4] .. 0E5C..0E80 ; Cn # [37] .. 0E83 ; Cn # 0E85 ; Cn # 0E8B ; Cn # 0EA4 ; Cn # 0EA6 ; Cn # 0EBE..0EBF ; Cn # [2] .. 0EC5 ; Cn # 0EC7 ; Cn # 0ECF ; Cn # 0EDA..0EDB ; Cn # [2] .. 0EE0..0EFF ; Cn # [32] .. 0F48 ; Cn # 0F6D..0F70 ; Cn # [4] .. 0F98 ; Cn # 0FBD ; Cn # 0FCD ; Cn # 0FDB..0FFF ; Cn # [37] .. 10C6 ; Cn # 10C8..10CC ; Cn # [5] .. 10CE..10CF ; Cn # [2] .. 1249 ; Cn # 124E..124F ; Cn # [2] .. 1257 ; Cn # 1259 ; Cn # 125E..125F ; Cn # [2] .. 1289 ; Cn # 128E..128F ; Cn # [2] .. 12B1 ; Cn # 12B6..12B7 ; Cn # [2] .. 12BF ; Cn # 12C1 ; Cn # 12C6..12C7 ; Cn # [2] .. 12D7 ; Cn # 1311 ; Cn # 1316..1317 ; Cn # [2] .. 135B..135C ; Cn # [2] .. 137D..137F ; Cn # [3] .. 139A..139F ; Cn # [6] .. 13F6..13F7 ; Cn # [2] .. 13FE..13FF ; Cn # [2] .. 169D..169F ; Cn # [3] .. 16F9..16FF ; Cn # [7] .. 1716..171E ; Cn # [9] .. 1737..173F ; Cn # [9] .. 1754..175F ; Cn # [12] .. 176D ; Cn # 1771 ; Cn # 1774..177F ; Cn # [12] .. 17DE..17DF ; Cn # [2] .. 17EA..17EF ; Cn # [6] .. 17FA..17FF ; Cn # [6] .. 181A..181F ; Cn # [6] .. 1879..187F ; Cn # [7] .. 18AB..18AF ; Cn # [5] .. 18F6..18FF ; Cn # [10] .. 191F ; Cn # 192C..192F ; Cn # [4] .. 193C..193F ; Cn # [4] .. 1941..1943 ; Cn # [3] .. 196E..196F ; Cn # [2] .. 1975..197F ; Cn # [11] .. 19AC..19AF ; Cn # [4] .. 19CA..19CF ; Cn # [6] .. 19DB..19DD ; Cn # [3] .. 1A1C..1A1D ; Cn # [2] .. 1A5F ; Cn # 1A7D..1A7E ; Cn # [2] .. 1A8A..1A8F ; Cn # [6] .. 1A9A..1A9F ; Cn # [6] .. 1AAE..1AAF ; Cn # [2] .. 1ADE..1ADF ; Cn # [2] .. 1AEC..1AFF ; Cn # [20] .. 1B4D ; Cn # 1BF4..1BFB ; Cn # [8] .. 1C38..1C3A ; Cn # [3] .. 1C4A..1C4C ; Cn # [3] .. 1C8B..1C8F ; Cn # [5] .. 1CBB..1CBC ; Cn # [2] .. 1CC8..1CCF ; Cn # [8] .. 1CFB..1CFF ; Cn # [5] .. 1F16..1F17 ; Cn # [2] .. 1F1E..1F1F ; Cn # [2] .. 1F46..1F47 ; Cn # [2] .. 
1F4E..1F4F ; Cn # [2] .. 1F58 ; Cn # 1F5A ; Cn # 1F5C ; Cn # 1F5E ; Cn # 1F7E..1F7F ; Cn # [2] .. 1FB5 ; Cn # 1FC5 ; Cn # 1FD4..1FD5 ; Cn # [2] .. 1FDC ; Cn # 1FF0..1FF1 ; Cn # [2] .. 1FF5 ; Cn # 1FFF ; Cn # 2065 ; Cn # 2072..2073 ; Cn # [2] .. 208F ; Cn # 209D..209F ; Cn # [3] .. 20C2..20CF ; Cn # [14] .. 20F1..20FF ; Cn # [15] .. 218C..218F ; Cn # [4] .. 242A..243F ; Cn # [22] .. 244B..245F ; Cn # [21] .. 2B74..2B75 ; Cn # [2] .. 2CF4..2CF8 ; Cn # [5] .. 2D26 ; Cn # 2D28..2D2C ; Cn # [5] .. 2D2E..2D2F ; Cn # [2] .. 2D68..2D6E ; Cn # [7] .. 2D71..2D7E ; Cn # [14] .. 2D97..2D9F ; Cn # [9] .. 2DA7 ; Cn # 2DAF ; Cn # 2DB7 ; Cn # 2DBF ; Cn # 2DC7 ; Cn # 2DCF ; Cn # 2DD7 ; Cn # 2DDF ; Cn # 2E5E..2E7F ; Cn # [34] .. 2E9A ; Cn # 2EF4..2EFF ; Cn # [12] .. 2FD6..2FEF ; Cn # [26] .. 3040 ; Cn # 3097..3098 ; Cn # [2] .. 3100..3104 ; Cn # [5] .. 3130 ; Cn # 318F ; Cn # 31E6..31EE ; Cn # [9] .. 321F ; Cn # A48D..A48F ; Cn # [3] .. A4C7..A4CF ; Cn # [9] .. A62C..A63F ; Cn # [20] .. A6F8..A6FF ; Cn # [8] .. A7DD..A7F0 ; Cn # [20] .. A82D..A82F ; Cn # [3] .. A83A..A83F ; Cn # [6] .. A878..A87F ; Cn # [8] .. A8C6..A8CD ; Cn # [8] .. A8DA..A8DF ; Cn # [6] .. A954..A95E ; Cn # [11] .. A97D..A97F ; Cn # [3] .. A9CE ; Cn # A9DA..A9DD ; Cn # [4] .. A9FF ; Cn # AA37..AA3F ; Cn # [9] .. AA4E..AA4F ; Cn # [2] .. AA5A..AA5B ; Cn # [2] .. AAC3..AADA ; Cn # [24] .. AAF7..AB00 ; Cn # [10] .. AB07..AB08 ; Cn # [2] .. AB0F..AB10 ; Cn # [2] .. AB17..AB1F ; Cn # [9] .. AB27 ; Cn # AB2F ; Cn # AB6C..AB6F ; Cn # [4] .. ABEE..ABEF ; Cn # [2] .. ABFA..ABFF ; Cn # [6] .. D7A4..D7AF ; Cn # [12] .. D7C7..D7CA ; Cn # [4] .. D7FC..D7FF ; Cn # [4] .. FA6E..FA6F ; Cn # [2] .. FADA..FAFF ; Cn # [38] .. FB07..FB12 ; Cn # [12] .. FB18..FB1C ; Cn # [5] .. FB37 ; Cn # FB3D ; Cn # FB3F ; Cn # FB42 ; Cn # FB45 ; Cn # FDD0..FDEF ; Cn # [32] .. FE1A..FE1F ; Cn # [6] .. FE53 ; Cn # FE67 ; Cn # FE6C..FE6F ; Cn # [4] .. FE75 ; Cn # FEFD..FEFE ; Cn # [2] .. FF00 ; Cn # FFBF..FFC1 ; Cn # [3] .. FFC8..FFC9 ; Cn # [2] .. 
FFD0..FFD1 ; Cn # [2] .. FFD8..FFD9 ; Cn # [2] .. FFDD..FFDF ; Cn # [3] .. FFE7 ; Cn # FFEF..FFF8 ; Cn # [10] .. FFFE..FFFF ; Cn # [2] .. 1000C ; Cn # 10027 ; Cn # 1003B ; Cn # 1003E ; Cn # 1004E..1004F ; Cn # [2] .. 1005E..1007F ; Cn # [34] .. 100FB..100FF ; Cn # [5] .. 10103..10106 ; Cn # [4] .. 10134..10136 ; Cn # [3] .. 1018F ; Cn # 1019D..1019F ; Cn # [3] .. 101A1..101CF ; Cn # [47] .. 101FE..1027F ; Cn # [130] .. 1029D..1029F ; Cn # [3] .. 102D1..102DF ; Cn # [15] .. 102FC..102FF ; Cn # [4] .. 10324..1032C ; Cn # [9] .. 1034B..1034F ; Cn # [5] .. 1037B..1037F ; Cn # [5] .. 1039E ; Cn # 103C4..103C7 ; Cn # [4] .. 103D6..103FF ; Cn # [42] .. 1049E..1049F ; Cn # [2] .. 104AA..104AF ; Cn # [6] .. 104D4..104D7 ; Cn # [4] .. 104FC..104FF ; Cn # [4] .. 10528..1052F ; Cn # [8] .. 10564..1056E ; Cn # [11] .. 1057B ; Cn # 1058B ; Cn # 10593 ; Cn # 10596 ; Cn # 105A2 ; Cn # 105B2 ; Cn # 105BA ; Cn # 105BD..105BF ; Cn # [3] .. 105F4..105FF ; Cn # [12] .. 10737..1073F ; Cn # [9] .. 10756..1075F ; Cn # [10] .. 10768..1077F ; Cn # [24] .. 10786 ; Cn # 107B1 ; Cn # 107BB..107FF ; Cn # [69] .. 10806..10807 ; Cn # [2] .. 10809 ; Cn # 10836 ; Cn # 10839..1083B ; Cn # [3] .. 1083D..1083E ; Cn # [2] .. 10856 ; Cn # 1089F..108A6 ; Cn # [8] .. 108B0..108DF ; Cn # [48] .. 108F3 ; Cn # 108F6..108FA ; Cn # [5] .. 1091C..1091E ; Cn # [3] .. 1093A..1093E ; Cn # [5] .. 1095A..1097F ; Cn # [38] .. 109B8..109BB ; Cn # [4] .. 109D0..109D1 ; Cn # [2] .. 10A04 ; Cn # 10A07..10A0B ; Cn # [5] .. 10A14 ; Cn # 10A18 ; Cn # 10A36..10A37 ; Cn # [2] .. 10A3B..10A3E ; Cn # [4] .. 10A49..10A4F ; Cn # [7] .. 10A59..10A5F ; Cn # [7] .. 10AA0..10ABF ; Cn # [32] .. 10AE7..10AEA ; Cn # [4] .. 10AF7..10AFF ; Cn # [9] .. 10B36..10B38 ; Cn # [3] .. 10B56..10B57 ; Cn # [2] .. 10B73..10B77 ; Cn # [5] .. 10B92..10B98 ; Cn # [7] .. 10B9D..10BA8 ; Cn # [12] .. 10BB0..10BFF ; Cn # [80] .. 10C49..10C7F ; Cn # [55] .. 10CB3..10CBF ; Cn # [13] .. 10CF3..10CF9 ; Cn # [7] .. 10D28..10D2F ; Cn # [8] .. 
10D3A..10D3F ; Cn # [6] .. 10D66..10D68 ; Cn # [3] .. 10D86..10D8D ; Cn # [8] .. 10D90..10E5F ; Cn # [208] .. 10E7F ; Cn # 10EAA ; Cn # 10EAE..10EAF ; Cn # [2] .. 10EB2..10EC1 ; Cn # [16] .. 10EC8..10ECF ; Cn # [8] .. 10ED9..10EF9 ; Cn # [33] .. 10F28..10F2F ; Cn # [8] .. 10F5A..10F6F ; Cn # [22] .. 10F8A..10FAF ; Cn # [38] .. 10FCC..10FDF ; Cn # [20] .. 10FF7..10FFF ; Cn # [9] .. 1104E..11051 ; Cn # [4] .. 11076..1107E ; Cn # [9] .. 110C3..110CC ; Cn # [10] .. 110CE..110CF ; Cn # [2] .. 110E9..110EF ; Cn # [7] .. 110FA..110FF ; Cn # [6] .. 11135 ; Cn # 11148..1114F ; Cn # [8] .. 11177..1117F ; Cn # [9] .. 111E0 ; Cn # 111F5..111FF ; Cn # [11] .. 11212 ; Cn # 11242..1127F ; Cn # [62] .. 11287 ; Cn # 11289 ; Cn # 1128E ; Cn # 1129E ; Cn # 112AA..112AF ; Cn # [6] .. 112EB..112EF ; Cn # [5] .. 112FA..112FF ; Cn # [6] .. 11304 ; Cn # 1130D..1130E ; Cn # [2] .. 11311..11312 ; Cn # [2] .. 11329 ; Cn # 11331 ; Cn # 11334 ; Cn # 1133A ; Cn # 11345..11346 ; Cn # [2] .. 11349..1134A ; Cn # [2] .. 1134E..1134F ; Cn # [2] .. 11351..11356 ; Cn # [6] .. 11358..1135C ; Cn # [5] .. 11364..11365 ; Cn # [2] .. 1136D..1136F ; Cn # [3] .. 11375..1137F ; Cn # [11] .. 1138A ; Cn # 1138C..1138D ; Cn # [2] .. 1138F ; Cn # 113B6 ; Cn # 113C1 ; Cn # 113C3..113C4 ; Cn # [2] .. 113C6 ; Cn # 113CB ; Cn # 113D6 ; Cn # 113D9..113E0 ; Cn # [8] .. 113E3..113FF ; Cn # [29] .. 1145C ; Cn # 11462..1147F ; Cn # [30] .. 114C8..114CF ; Cn # [8] .. 114DA..1157F ; Cn # [166] .. 115B6..115B7 ; Cn # [2] .. 115DE..115FF ; Cn # [34] .. 11645..1164F ; Cn # [11] .. 1165A..1165F ; Cn # [6] .. 1166D..1167F ; Cn # [19] .. 116BA..116BF ; Cn # [6] .. 116CA..116CF ; Cn # [6] .. 116E4..116FF ; Cn # [28] .. 1171B..1171C ; Cn # [2] .. 1172C..1172F ; Cn # [4] .. 11747..117FF ; Cn # [185] .. 1183C..1189F ; Cn # [100] .. 118F3..118FE ; Cn # [12] .. 11907..11908 ; Cn # [2] .. 1190A..1190B ; Cn # [2] .. 11914 ; Cn # 11917 ; Cn # 11936 ; Cn # 11939..1193A ; Cn # [2] .. 11947..1194F ; Cn # [9] .. 1195A..1199F ; Cn # [70] .. 
119A8..119A9 ; Cn # [2] .. 119D8..119D9 ; Cn # [2] .. 119E5..119FF ; Cn # [27] .. 11A48..11A4F ; Cn # [8] .. 11AA3..11AAF ; Cn # [13] .. 11AF9..11AFF ; Cn # [7] .. 11B0A..11B5F ; Cn # [86] .. 11B68..11BBF ; Cn # [88] .. 11BE2..11BEF ; Cn # [14] .. 11BFA..11BFF ; Cn # [6] .. 11C09 ; Cn # 11C37 ; Cn # 11C46..11C4F ; Cn # [10] .. 11C6D..11C6F ; Cn # [3] .. 11C90..11C91 ; Cn # [2] .. 11CA8 ; Cn # 11CB7..11CFF ; Cn # [73] .. 11D07 ; Cn # 11D0A ; Cn # 11D37..11D39 ; Cn # [3] .. 11D3B ; Cn # 11D3E ; Cn # 11D48..11D4F ; Cn # [8] .. 11D5A..11D5F ; Cn # [6] .. 11D66 ; Cn # 11D69 ; Cn # 11D8F ; Cn # 11D92 ; Cn # 11D99..11D9F ; Cn # [7] .. 11DAA..11DAF ; Cn # [6] .. 11DDC..11DDF ; Cn # [4] .. 11DEA..11EDF ; Cn # [246] .. 11EF9..11EFF ; Cn # [7] .. 11F11 ; Cn # 11F3B..11F3D ; Cn # [3] .. 11F5B..11FAF ; Cn # [85] .. 11FB1..11FBF ; Cn # [15] .. 11FF2..11FFE ; Cn # [13] .. 1239A..123FF ; Cn # [102] .. 1246F ; Cn # 12475..1247F ; Cn # [11] .. 12544..12F8F ; Cn # [2636] .. 12FF3..12FFF ; Cn # [13] .. 13456..1345F ; Cn # [10] .. 143FB..143FF ; Cn # [5] .. 14647..160FF ; Cn # [6841] .. 1613A..167FF ; Cn # [1734] .. 16A39..16A3F ; Cn # [7] .. 16A5F ; Cn # 16A6A..16A6D ; Cn # [4] .. 16ABF ; Cn # 16ACA..16ACF ; Cn # [6] .. 16AEE..16AEF ; Cn # [2] .. 16AF6..16AFF ; Cn # [10] .. 16B46..16B4F ; Cn # [10] .. 16B5A ; Cn # 16B62 ; Cn # 16B78..16B7C ; Cn # [5] .. 16B90..16D3F ; Cn # [432] .. 16D7A..16E3F ; Cn # [198] .. 16E9B..16E9F ; Cn # [5] .. 16EB9..16EBA ; Cn # [2] .. 16ED4..16EFF ; Cn # [44] .. 16F4B..16F4E ; Cn # [4] .. 16F88..16F8E ; Cn # [7] .. 16FA0..16FDF ; Cn # [64] .. 16FE5..16FEF ; Cn # [11] .. 16FF7..16FFF ; Cn # [9] .. 18CD6..18CFE ; Cn # [41] .. 18D1F..18D7F ; Cn # [97] .. 18DF3..1AFEF ; Cn # [8701] .. 1AFF4 ; Cn # 1AFFC ; Cn # 1AFFF ; Cn # 1B123..1B131 ; Cn # [15] .. 1B133..1B14F ; Cn # [29] .. 1B153..1B154 ; Cn # [2] .. 1B156..1B163 ; Cn # [14] .. 1B168..1B16F ; Cn # [8] .. 1B2FC..1BBFF ; Cn # [2308] .. 1BC6B..1BC6F ; Cn # [5] .. 1BC7D..1BC7F ; Cn # [3] .. 
1BC89..1BC8F ; Cn # [7] .. 1BC9A..1BC9B ; Cn # [2] .. 1BCA4..1CBFF ; Cn # [3932] .. 1CCFD..1CCFF ; Cn # [3] .. 1CEB4..1CEB9 ; Cn # [6] .. 1CED1..1CEDF ; Cn # [15] .. 1CEF1..1CEFF ; Cn # [15] .. 1CF2E..1CF2F ; Cn # [2] .. 1CF47..1CF4F ; Cn # [9] .. 1CFC4..1CFFF ; Cn # [60] .. 1D0F6..1D0FF ; Cn # [10] .. 1D127..1D128 ; Cn # [2] .. 1D1EB..1D1FF ; Cn # [21] .. 1D246..1D2BF ; Cn # [122] .. 1D2D4..1D2DF ; Cn # [12] .. 1D2F4..1D2FF ; Cn # [12] .. 1D357..1D35F ; Cn # [9] .. 1D379..1D3FF ; Cn # [135] .. 1D455 ; Cn # 1D49D ; Cn # 1D4A0..1D4A1 ; Cn # [2] .. 1D4A3..1D4A4 ; Cn # [2] .. 1D4A7..1D4A8 ; Cn # [2] .. 1D4AD ; Cn # 1D4BA ; Cn # 1D4BC ; Cn # 1D4C4 ; Cn # 1D506 ; Cn # 1D50B..1D50C ; Cn # [2] .. 1D515 ; Cn # 1D51D ; Cn # 1D53A ; Cn # 1D53F ; Cn # 1D545 ; Cn # 1D547..1D549 ; Cn # [3] .. 1D551 ; Cn # 1D6A6..1D6A7 ; Cn # [2] .. 1D7CC..1D7CD ; Cn # [2] .. 1DA8C..1DA9A ; Cn # [15] .. 1DAA0 ; Cn # 1DAB0..1DEFF ; Cn # [1104] .. 1DF1F..1DF24 ; Cn # [6] .. 1DF2B..1DFFF ; Cn # [213] .. 1E007 ; Cn # 1E019..1E01A ; Cn # [2] .. 1E022 ; Cn # 1E025 ; Cn # 1E02B..1E02F ; Cn # [5] .. 1E06E..1E08E ; Cn # [33] .. 1E090..1E0FF ; Cn # [112] .. 1E12D..1E12F ; Cn # [3] .. 1E13E..1E13F ; Cn # [2] .. 1E14A..1E14D ; Cn # [4] .. 1E150..1E28F ; Cn # [320] .. 1E2AF..1E2BF ; Cn # [17] .. 1E2FA..1E2FE ; Cn # [5] .. 1E300..1E4CF ; Cn # [464] .. 1E4FA..1E5CF ; Cn # [214] .. 1E5FB..1E5FE ; Cn # [4] .. 1E600..1E6BF ; Cn # [192] .. 1E6DF ; Cn # 1E6F6..1E6FD ; Cn # [8] .. 1E700..1E7DF ; Cn # [224] .. 1E7E7 ; Cn # 1E7EC ; Cn # 1E7EF ; Cn # 1E7FF ; Cn # 1E8C5..1E8C6 ; Cn # [2] .. 1E8D7..1E8FF ; Cn # [41] .. 1E94C..1E94F ; Cn # [4] .. 1E95A..1E95D ; Cn # [4] .. 1E960..1EC70 ; Cn # [785] .. 1ECB5..1ED00 ; Cn # [76] .. 1ED3E..1EDFF ; Cn # [194] .. 1EE04 ; Cn # 1EE20 ; Cn # 1EE23 ; Cn # 1EE25..1EE26 ; Cn # [2] .. 1EE28 ; Cn # 1EE33 ; Cn # 1EE38 ; Cn # 1EE3A ; Cn # 1EE3C..1EE41 ; Cn # [6] .. 1EE43..1EE46 ; Cn # [4] .. 1EE48 ; Cn # 1EE4A ; Cn # 1EE4C ; Cn # 1EE50 ; Cn # 1EE53 ; Cn # 1EE55..1EE56 ; Cn # [2] .. 
1EE58 ; Cn # 1EE5A ; Cn # 1EE5C ; Cn # 1EE5E ; Cn # 1EE60 ; Cn # 1EE63 ; Cn # 1EE65..1EE66 ; Cn # [2] .. 1EE6B ; Cn # 1EE73 ; Cn # 1EE78 ; Cn # 1EE7D ; Cn # 1EE7F ; Cn # 1EE8A ; Cn # 1EE9C..1EEA0 ; Cn # [5] .. 1EEA4 ; Cn # 1EEAA ; Cn # 1EEBC..1EEEF ; Cn # [52] .. 1EEF2..1EFFF ; Cn # [270] .. 1F02C..1F02F ; Cn # [4] .. 1F094..1F09F ; Cn # [12] .. 1F0AF..1F0B0 ; Cn # [2] .. 1F0C0 ; Cn # 1F0D0 ; Cn # 1F0F6..1F0FF ; Cn # [10] .. 1F1AE..1F1E5 ; Cn # [56] .. 1F203..1F20F ; Cn # [13] .. 1F23C..1F23F ; Cn # [4] .. 1F249..1F24F ; Cn # [7] .. 1F252..1F25F ; Cn # [14] .. 1F266..1F2FF ; Cn # [154] .. 1F6D9..1F6DB ; Cn # [3] .. 1F6ED..1F6EF ; Cn # [3] .. 1F6FD..1F6FF ; Cn # [3] .. 1F7DA..1F7DF ; Cn # [6] .. 1F7EC..1F7EF ; Cn # [4] .. 1F7F1..1F7FF ; Cn # [15] .. 1F80C..1F80F ; Cn # [4] .. 1F848..1F84F ; Cn # [8] .. 1F85A..1F85F ; Cn # [6] .. 1F888..1F88F ; Cn # [8] .. 1F8AE..1F8AF ; Cn # [2] .. 1F8BC..1F8BF ; Cn # [4] .. 1F8C2..1F8CF ; Cn # [14] .. 1F8D9..1F8FF ; Cn # [39] .. 1FA58..1FA5F ; Cn # [8] .. 1FA6E..1FA6F ; Cn # [2] .. 1FA7D..1FA7F ; Cn # [3] .. 1FA8B..1FA8D ; Cn # [3] .. 1FAC7 ; Cn # 1FAC9..1FACC ; Cn # [4] .. 1FADD..1FADE ; Cn # [2] .. 1FAEB..1FAEE ; Cn # [4] .. 1FAF9..1FAFF ; Cn # [7] .. 1FB93 ; Cn # 1FBFB..1FFFF ; Cn # [1029] .. 2A6E0..2A6FF ; Cn # [32] .. 2B81E..2B81F ; Cn # [2] .. 2CEAE..2CEAF ; Cn # [2] .. 2EBE1..2EBEF ; Cn # [15] .. 2EE5E..2F7FF ; Cn # [2466] .. 2FA1E..2FFFF ; Cn # [1506] .. 3134B..3134F ; Cn # [5] .. 3347A..E0000 ; Cn # [707463] .. E0002..E001F ; Cn # [30] .. E0080..E00FF ; Cn # [128] .. E01F0..EFFFF ; Cn # [65040] .. FFFFE..FFFFF ; Cn # [2] .. 10FFFE..10FFFF; Cn # [2] .. 
# Total code points: 814730 # ================================================ # General_Category=Uppercase_Letter 0041..005A ; Lu # [26] LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z 00C0..00D6 ; Lu # [23] LATIN CAPITAL LETTER A WITH GRAVE..LATIN CAPITAL LETTER O WITH DIAERESIS 00D8..00DE ; Lu # [7] LATIN CAPITAL LETTER O WITH STROKE..LATIN CAPITAL LETTER THORN 0100 ; Lu # LATIN CAPITAL LETTER A WITH MACRON 0102 ; Lu # LATIN CAPITAL LETTER A WITH BREVE 0104 ; Lu # LATIN CAPITAL LETTER A WITH OGONEK 0106 ; Lu # LATIN CAPITAL LETTER C WITH ACUTE 0108 ; Lu # LATIN CAPITAL LETTER C WITH CIRCUMFLEX 010A ; Lu # LATIN CAPITAL LETTER C WITH DOT ABOVE 010C ; Lu # LATIN CAPITAL LETTER C WITH CARON 010E ; Lu # LATIN CAPITAL LETTER D WITH CARON 0110 ; Lu # LATIN CAPITAL LETTER D WITH STROKE 0112 ; Lu # LATIN CAPITAL LETTER E WITH MACRON 0114 ; Lu # LATIN CAPITAL LETTER E WITH BREVE 0116 ; Lu # LATIN CAPITAL LETTER E WITH DOT ABOVE 0118 ; Lu # LATIN CAPITAL LETTER E WITH OGONEK 011A ; Lu # LATIN CAPITAL LETTER E WITH CARON 011C ; Lu # LATIN CAPITAL LETTER G WITH CIRCUMFLEX 011E ; Lu # LATIN CAPITAL LETTER G WITH BREVE 0120 ; Lu # LATIN CAPITAL LETTER G WITH DOT ABOVE 0122 ; Lu # LATIN CAPITAL LETTER G WITH CEDILLA 0124 ; Lu # LATIN CAPITAL LETTER H WITH CIRCUMFLEX 0126 ; Lu # LATIN CAPITAL LETTER H WITH STROKE 0128 ; Lu # LATIN CAPITAL LETTER I WITH TILDE 012A ; Lu # LATIN CAPITAL LETTER I WITH MACRON 012C ; Lu # LATIN CAPITAL LETTER I WITH BREVE 012E ; Lu # LATIN CAPITAL LETTER I WITH OGONEK 0130 ; Lu # LATIN CAPITAL LETTER I WITH DOT ABOVE 0132 ; Lu # LATIN CAPITAL LIGATURE IJ 0134 ; Lu # LATIN CAPITAL LETTER J WITH CIRCUMFLEX 0136 ; Lu # LATIN CAPITAL LETTER K WITH CEDILLA 0139 ; Lu # LATIN CAPITAL LETTER L WITH ACUTE 013B ; Lu # LATIN CAPITAL LETTER L WITH CEDILLA 013D ; Lu # LATIN CAPITAL LETTER L WITH CARON 013F ; Lu # LATIN CAPITAL LETTER L WITH MIDDLE DOT 0141 ; Lu # LATIN CAPITAL LETTER L WITH STROKE 0143 ; Lu # LATIN CAPITAL LETTER N WITH ACUTE 0145 ; Lu # LATIN 
CAPITAL LETTER N WITH CEDILLA 0147 ; Lu # LATIN CAPITAL LETTER N WITH CARON 014A ; Lu # LATIN CAPITAL LETTER ENG 014C ; Lu # LATIN CAPITAL LETTER O WITH MACRON 014E ; Lu # LATIN CAPITAL LETTER O WITH BREVE 0150 ; Lu # LATIN CAPITAL LETTER O WITH DOUBLE ACUTE 0152 ; Lu # LATIN CAPITAL LIGATURE OE 0154 ; Lu # LATIN CAPITAL LETTER R WITH ACUTE 0156 ; Lu # LATIN CAPITAL LETTER R WITH CEDILLA 0158 ; Lu # LATIN CAPITAL LETTER R WITH CARON 015A ; Lu # LATIN CAPITAL LETTER S WITH ACUTE 015C ; Lu # LATIN CAPITAL LETTER S WITH CIRCUMFLEX 015E ; Lu # LATIN CAPITAL LETTER S WITH CEDILLA 0160 ; Lu # LATIN CAPITAL LETTER S WITH CARON 0162 ; Lu # LATIN CAPITAL LETTER T WITH CEDILLA 0164 ; Lu # LATIN CAPITAL LETTER T WITH CARON 0166 ; Lu # LATIN CAPITAL LETTER T WITH STROKE 0168 ; Lu # LATIN CAPITAL LETTER U WITH TILDE 016A ; Lu # LATIN CAPITAL LETTER U WITH MACRON 016C ; Lu # LATIN CAPITAL LETTER U WITH BREVE 016E ; Lu # LATIN CAPITAL LETTER U WITH RING ABOVE 0170 ; Lu # LATIN CAPITAL LETTER U WITH DOUBLE ACUTE 0172 ; Lu # LATIN CAPITAL LETTER U WITH OGONEK 0174 ; Lu # LATIN CAPITAL LETTER W WITH CIRCUMFLEX 0176 ; Lu # LATIN CAPITAL LETTER Y WITH CIRCUMFLEX 0178..0179 ; Lu # [2] LATIN CAPITAL LETTER Y WITH DIAERESIS..LATIN CAPITAL LETTER Z WITH ACUTE 017B ; Lu # LATIN CAPITAL LETTER Z WITH DOT ABOVE 017D ; Lu # LATIN CAPITAL LETTER Z WITH CARON 0181..0182 ; Lu # [2] LATIN CAPITAL LETTER B WITH HOOK..LATIN CAPITAL LETTER B WITH TOPBAR 0184 ; Lu # LATIN CAPITAL LETTER TONE SIX 0186..0187 ; Lu # [2] LATIN CAPITAL LETTER OPEN O..LATIN CAPITAL LETTER C WITH HOOK 0189..018B ; Lu # [3] LATIN CAPITAL LETTER AFRICAN D..LATIN CAPITAL LETTER D WITH TOPBAR 018E..0191 ; Lu # [4] LATIN CAPITAL LETTER REVERSED E..LATIN CAPITAL LETTER F WITH HOOK 0193..0194 ; Lu # [2] LATIN CAPITAL LETTER G WITH HOOK..LATIN CAPITAL LETTER GAMMA 0196..0198 ; Lu # [3] LATIN CAPITAL LETTER IOTA..LATIN CAPITAL LETTER K WITH HOOK 019C..019D ; Lu # [2] LATIN CAPITAL LETTER TURNED M..LATIN CAPITAL LETTER N WITH LEFT 
HOOK 019F..01A0 ; Lu # [2] LATIN CAPITAL LETTER O WITH MIDDLE TILDE..LATIN CAPITAL LETTER O WITH HORN 01A2 ; Lu # LATIN CAPITAL LETTER OI 01A4 ; Lu # LATIN CAPITAL LETTER P WITH HOOK 01A6..01A7 ; Lu # [2] LATIN LETTER YR..LATIN CAPITAL LETTER TONE TWO 01A9 ; Lu # LATIN CAPITAL LETTER ESH 01AC ; Lu # LATIN CAPITAL LETTER T WITH HOOK 01AE..01AF ; Lu # [2] LATIN CAPITAL LETTER T WITH RETROFLEX HOOK..LATIN CAPITAL LETTER U WITH HORN 01B1..01B3 ; Lu # [3] LATIN CAPITAL LETTER UPSILON..LATIN CAPITAL LETTER Y WITH HOOK 01B5 ; Lu # LATIN CAPITAL LETTER Z WITH STROKE 01B7..01B8 ; Lu # [2] LATIN CAPITAL LETTER EZH..LATIN CAPITAL LETTER EZH REVERSED 01BC ; Lu # LATIN CAPITAL LETTER TONE FIVE 01C4 ; Lu # LATIN CAPITAL LETTER DZ WITH CARON 01C7 ; Lu # LATIN CAPITAL LETTER LJ 01CA ; Lu # LATIN CAPITAL LETTER NJ 01CD ; Lu # LATIN CAPITAL LETTER A WITH CARON 01CF ; Lu # LATIN CAPITAL LETTER I WITH CARON 01D1 ; Lu # LATIN CAPITAL LETTER O WITH CARON 01D3 ; Lu # LATIN CAPITAL LETTER U WITH CARON 01D5 ; Lu # LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON 01D7 ; Lu # LATIN CAPITAL LETTER U WITH DIAERESIS AND ACUTE 01D9 ; Lu # LATIN CAPITAL LETTER U WITH DIAERESIS AND CARON 01DB ; Lu # LATIN CAPITAL LETTER U WITH DIAERESIS AND GRAVE 01DE ; Lu # LATIN CAPITAL LETTER A WITH DIAERESIS AND MACRON 01E0 ; Lu # LATIN CAPITAL LETTER A WITH DOT ABOVE AND MACRON 01E2 ; Lu # LATIN CAPITAL LETTER AE WITH MACRON 01E4 ; Lu # LATIN CAPITAL LETTER G WITH STROKE 01E6 ; Lu # LATIN CAPITAL LETTER G WITH CARON 01E8 ; Lu # LATIN CAPITAL LETTER K WITH CARON 01EA ; Lu # LATIN CAPITAL LETTER O WITH OGONEK 01EC ; Lu # LATIN CAPITAL LETTER O WITH OGONEK AND MACRON 01EE ; Lu # LATIN CAPITAL LETTER EZH WITH CARON 01F1 ; Lu # LATIN CAPITAL LETTER DZ 01F4 ; Lu # LATIN CAPITAL LETTER G WITH ACUTE 01F6..01F8 ; Lu # [3] LATIN CAPITAL LETTER HWAIR..LATIN CAPITAL LETTER N WITH GRAVE 01FA ; Lu # LATIN CAPITAL LETTER A WITH RING ABOVE AND ACUTE 01FC ; Lu # LATIN CAPITAL LETTER AE WITH ACUTE 01FE ; Lu # LATIN CAPITAL 
LETTER O WITH STROKE AND ACUTE 0200 ; Lu # LATIN CAPITAL LETTER A WITH DOUBLE GRAVE 0202 ; Lu # LATIN CAPITAL LETTER A WITH INVERTED BREVE 0204 ; Lu # LATIN CAPITAL LETTER E WITH DOUBLE GRAVE 0206 ; Lu # LATIN CAPITAL LETTER E WITH INVERTED BREVE 0208 ; Lu # LATIN CAPITAL LETTER I WITH DOUBLE GRAVE 020A ; Lu # LATIN CAPITAL LETTER I WITH INVERTED BREVE 020C ; Lu # LATIN CAPITAL LETTER O WITH DOUBLE GRAVE 020E ; Lu # LATIN CAPITAL LETTER O WITH INVERTED BREVE 0210 ; Lu # LATIN CAPITAL LETTER R WITH DOUBLE GRAVE 0212 ; Lu # LATIN CAPITAL LETTER R WITH INVERTED BREVE 0214 ; Lu # LATIN CAPITAL LETTER U WITH DOUBLE GRAVE 0216 ; Lu # LATIN CAPITAL LETTER U WITH INVERTED BREVE 0218 ; Lu # LATIN CAPITAL LETTER S WITH COMMA BELOW 021A ; Lu # LATIN CAPITAL LETTER T WITH COMMA BELOW 021C ; Lu # LATIN CAPITAL LETTER YOGH 021E ; Lu # LATIN CAPITAL LETTER H WITH CARON 0220 ; Lu # LATIN CAPITAL LETTER N WITH LONG RIGHT LEG 0222 ; Lu # LATIN CAPITAL LETTER OU 0224 ; Lu # LATIN CAPITAL LETTER Z WITH HOOK 0226 ; Lu # LATIN CAPITAL LETTER A WITH DOT ABOVE 0228 ; Lu # LATIN CAPITAL LETTER E WITH CEDILLA 022A ; Lu # LATIN CAPITAL LETTER O WITH DIAERESIS AND MACRON 022C ; Lu # LATIN CAPITAL LETTER O WITH TILDE AND MACRON 022E ; Lu # LATIN CAPITAL LETTER O WITH DOT ABOVE 0230 ; Lu # LATIN CAPITAL LETTER O WITH DOT ABOVE AND MACRON 0232 ; Lu # LATIN CAPITAL LETTER Y WITH MACRON 023A..023B ; Lu # [2] LATIN CAPITAL LETTER A WITH STROKE..LATIN CAPITAL LETTER C WITH STROKE 023D..023E ; Lu # [2] LATIN CAPITAL LETTER L WITH BAR..LATIN CAPITAL LETTER T WITH DIAGONAL STROKE 0241 ; Lu # LATIN CAPITAL LETTER GLOTTAL STOP 0243..0246 ; Lu # [4] LATIN CAPITAL LETTER B WITH STROKE..LATIN CAPITAL LETTER E WITH STROKE 0248 ; Lu # LATIN CAPITAL LETTER J WITH STROKE 024A ; Lu # LATIN CAPITAL LETTER SMALL Q WITH HOOK TAIL 024C ; Lu # LATIN CAPITAL LETTER R WITH STROKE 024E ; Lu # LATIN CAPITAL LETTER Y WITH STROKE 0370 ; Lu # GREEK CAPITAL LETTER HETA 0372 ; Lu # GREEK CAPITAL LETTER ARCHAIC SAMPI 0376 ; Lu 
# GREEK CAPITAL LETTER PAMPHYLIAN DIGAMMA 037F ; Lu # GREEK CAPITAL LETTER YOT 0386 ; Lu # GREEK CAPITAL LETTER ALPHA WITH TONOS 0388..038A ; Lu # [3] GREEK CAPITAL LETTER EPSILON WITH TONOS..GREEK CAPITAL LETTER IOTA WITH TONOS 038C ; Lu # GREEK CAPITAL LETTER OMICRON WITH TONOS 038E..038F ; Lu # [2] GREEK CAPITAL LETTER UPSILON WITH TONOS..GREEK CAPITAL LETTER OMEGA WITH TONOS 0391..03A1 ; Lu # [17] GREEK CAPITAL LETTER ALPHA..GREEK CAPITAL LETTER RHO 03A3..03AB ; Lu # [9] GREEK CAPITAL LETTER SIGMA..GREEK CAPITAL LETTER UPSILON WITH DIALYTIKA 03CF ; Lu # GREEK CAPITAL KAI SYMBOL 03D2..03D4 ; Lu # [3] GREEK UPSILON WITH HOOK SYMBOL..GREEK UPSILON WITH DIAERESIS AND HOOK SYMBOL 03D8 ; Lu # GREEK LETTER ARCHAIC KOPPA 03DA ; Lu # GREEK LETTER STIGMA 03DC ; Lu # GREEK LETTER DIGAMMA 03DE ; Lu # GREEK LETTER KOPPA 03E0 ; Lu # GREEK LETTER SAMPI 03E2 ; Lu # COPTIC CAPITAL LETTER SHEI 03E4 ; Lu # COPTIC CAPITAL LETTER FEI 03E6 ; Lu # COPTIC CAPITAL LETTER KHEI 03E8 ; Lu # COPTIC CAPITAL LETTER HORI 03EA ; Lu # COPTIC CAPITAL LETTER GANGIA 03EC ; Lu # COPTIC CAPITAL LETTER SHIMA 03EE ; Lu # COPTIC CAPITAL LETTER DEI 03F4 ; Lu # GREEK CAPITAL THETA SYMBOL 03F7 ; Lu # GREEK CAPITAL LETTER SHO 03F9..03FA ; Lu # [2] GREEK CAPITAL LUNATE SIGMA SYMBOL..GREEK CAPITAL LETTER SAN 03FD..042F ; Lu # [51] GREEK CAPITAL REVERSED LUNATE SIGMA SYMBOL..CYRILLIC CAPITAL LETTER YA 0460 ; Lu # CYRILLIC CAPITAL LETTER OMEGA 0462 ; Lu # CYRILLIC CAPITAL LETTER YAT 0464 ; Lu # CYRILLIC CAPITAL LETTER IOTIFIED E 0466 ; Lu # CYRILLIC CAPITAL LETTER LITTLE YUS 0468 ; Lu # CYRILLIC CAPITAL LETTER IOTIFIED LITTLE YUS 046A ; Lu # CYRILLIC CAPITAL LETTER BIG YUS 046C ; Lu # CYRILLIC CAPITAL LETTER IOTIFIED BIG YUS 046E ; Lu # CYRILLIC CAPITAL LETTER KSI 0470 ; Lu # CYRILLIC CAPITAL LETTER PSI 0472 ; Lu # CYRILLIC CAPITAL LETTER FITA 0474 ; Lu # CYRILLIC CAPITAL LETTER IZHITSA 0476 ; Lu # CYRILLIC CAPITAL LETTER IZHITSA WITH DOUBLE GRAVE ACCENT 0478 ; Lu # CYRILLIC CAPITAL LETTER UK 047A ; Lu # 
CYRILLIC CAPITAL LETTER ROUND OMEGA 047C ; Lu # CYRILLIC CAPITAL LETTER OMEGA WITH TITLO 047E ; Lu # CYRILLIC CAPITAL LETTER OT 0480 ; Lu # CYRILLIC CAPITAL LETTER KOPPA 048A ; Lu # CYRILLIC CAPITAL LETTER SHORT I WITH TAIL 048C ; Lu # CYRILLIC CAPITAL LETTER SEMISOFT SIGN 048E ; Lu # CYRILLIC CAPITAL LETTER ER WITH TICK 0490 ; Lu # CYRILLIC CAPITAL LETTER GHE WITH UPTURN 0492 ; Lu # CYRILLIC CAPITAL LETTER GHE WITH STROKE 0494 ; Lu # CYRILLIC CAPITAL LETTER GHE WITH MIDDLE HOOK 0496 ; Lu # CYRILLIC CAPITAL LETTER ZHE WITH DESCENDER 0498 ; Lu # CYRILLIC CAPITAL LETTER ZE WITH DESCENDER 049A ; Lu # CYRILLIC CAPITAL LETTER KA WITH DESCENDER 049C ; Lu # CYRILLIC CAPITAL LETTER KA WITH VERTICAL STROKE 049E ; Lu # CYRILLIC CAPITAL LETTER KA WITH STROKE 04A0 ; Lu # CYRILLIC CAPITAL LETTER BASHKIR KA 04A2 ; Lu # CYRILLIC CAPITAL LETTER EN WITH DESCENDER 04A4 ; Lu # CYRILLIC CAPITAL LIGATURE EN GHE 04A6 ; Lu # CYRILLIC CAPITAL LETTER PE WITH MIDDLE HOOK 04A8 ; Lu # CYRILLIC CAPITAL LETTER ABKHASIAN HA 04AA ; Lu # CYRILLIC CAPITAL LETTER ES WITH DESCENDER 04AC ; Lu # CYRILLIC CAPITAL LETTER TE WITH DESCENDER 04AE ; Lu # CYRILLIC CAPITAL LETTER STRAIGHT U 04B0 ; Lu # CYRILLIC CAPITAL LETTER STRAIGHT U WITH STROKE 04B2 ; Lu # CYRILLIC CAPITAL LETTER HA WITH DESCENDER 04B4 ; Lu # CYRILLIC CAPITAL LIGATURE TE TSE 04B6 ; Lu # CYRILLIC CAPITAL LETTER CHE WITH DESCENDER 04B8 ; Lu # CYRILLIC CAPITAL LETTER CHE WITH VERTICAL STROKE 04BA ; Lu # CYRILLIC CAPITAL LETTER SHHA 04BC ; Lu # CYRILLIC CAPITAL LETTER ABKHASIAN CHE 04BE ; Lu # CYRILLIC CAPITAL LETTER ABKHASIAN CHE WITH DESCENDER 04C0..04C1 ; Lu # [2] CYRILLIC LETTER PALOCHKA..CYRILLIC CAPITAL LETTER ZHE WITH BREVE 04C3 ; Lu # CYRILLIC CAPITAL LETTER KA WITH HOOK 04C5 ; Lu # CYRILLIC CAPITAL LETTER EL WITH TAIL 04C7 ; Lu # CYRILLIC CAPITAL LETTER EN WITH HOOK 04C9 ; Lu # CYRILLIC CAPITAL LETTER EN WITH TAIL 04CB ; Lu # CYRILLIC CAPITAL LETTER KHAKASSIAN CHE 04CD ; Lu # CYRILLIC CAPITAL LETTER EM WITH TAIL 04D0 ; Lu # CYRILLIC 
CAPITAL LETTER A WITH BREVE 04D2 ; Lu # CYRILLIC CAPITAL LETTER A WITH DIAERESIS 04D4 ; Lu # CYRILLIC CAPITAL LIGATURE A IE 04D6 ; Lu # CYRILLIC CAPITAL LETTER IE WITH BREVE 04D8 ; Lu # CYRILLIC CAPITAL LETTER SCHWA 04DA ; Lu # CYRILLIC CAPITAL LETTER SCHWA WITH DIAERESIS 04DC ; Lu # CYRILLIC CAPITAL LETTER ZHE WITH DIAERESIS 04DE ; Lu # CYRILLIC CAPITAL LETTER ZE WITH DIAERESIS 04E0 ; Lu # CYRILLIC CAPITAL LETTER ABKHASIAN DZE 04E2 ; Lu # CYRILLIC CAPITAL LETTER I WITH MACRON 04E4 ; Lu # CYRILLIC CAPITAL LETTER I WITH DIAERESIS 04E6 ; Lu # CYRILLIC CAPITAL LETTER O WITH DIAERESIS 04E8 ; Lu # CYRILLIC CAPITAL LETTER BARRED O 04EA ; Lu # CYRILLIC CAPITAL LETTER BARRED O WITH DIAERESIS 04EC ; Lu # CYRILLIC CAPITAL LETTER E WITH DIAERESIS 04EE ; Lu # CYRILLIC CAPITAL LETTER U WITH MACRON 04F0 ; Lu # CYRILLIC CAPITAL LETTER U WITH DIAERESIS 04F2 ; Lu # CYRILLIC CAPITAL LETTER U WITH DOUBLE ACUTE 04F4 ; Lu # CYRILLIC CAPITAL LETTER CHE WITH DIAERESIS 04F6 ; Lu # CYRILLIC CAPITAL LETTER GHE WITH DESCENDER 04F8 ; Lu # CYRILLIC CAPITAL LETTER YERU WITH DIAERESIS 04FA ; Lu # CYRILLIC CAPITAL LETTER GHE WITH STROKE AND HOOK 04FC ; Lu # CYRILLIC CAPITAL LETTER HA WITH HOOK 04FE ; Lu # CYRILLIC CAPITAL LETTER HA WITH STROKE 0500 ; Lu # CYRILLIC CAPITAL LETTER KOMI DE 0502 ; Lu # CYRILLIC CAPITAL LETTER KOMI DJE 0504 ; Lu # CYRILLIC CAPITAL LETTER KOMI ZJE 0506 ; Lu # CYRILLIC CAPITAL LETTER KOMI DZJE 0508 ; Lu # CYRILLIC CAPITAL LETTER KOMI LJE 050A ; Lu # CYRILLIC CAPITAL LETTER KOMI NJE 050C ; Lu # CYRILLIC CAPITAL LETTER KOMI SJE 050E ; Lu # CYRILLIC CAPITAL LETTER KOMI TJE 0510 ; Lu # CYRILLIC CAPITAL LETTER REVERSED ZE 0512 ; Lu # CYRILLIC CAPITAL LETTER EL WITH HOOK 0514 ; Lu # CYRILLIC CAPITAL LETTER LHA 0516 ; Lu # CYRILLIC CAPITAL LETTER RHA 0518 ; Lu # CYRILLIC CAPITAL LETTER YAE 051A ; Lu # CYRILLIC CAPITAL LETTER QA 051C ; Lu # CYRILLIC CAPITAL LETTER WE 051E ; Lu # CYRILLIC CAPITAL LETTER ALEUT KA 0520 ; Lu # CYRILLIC CAPITAL LETTER EL WITH MIDDLE HOOK 0522 ; Lu # 
CYRILLIC CAPITAL LETTER EN WITH MIDDLE HOOK 0524 ; Lu # CYRILLIC CAPITAL LETTER PE WITH DESCENDER 0526 ; Lu # CYRILLIC CAPITAL LETTER SHHA WITH DESCENDER 0528 ; Lu # CYRILLIC CAPITAL LETTER EN WITH LEFT HOOK 052A ; Lu # CYRILLIC CAPITAL LETTER DZZHE 052C ; Lu # CYRILLIC CAPITAL LETTER DCHE 052E ; Lu # CYRILLIC CAPITAL LETTER EL WITH DESCENDER 0531..0556 ; Lu # [38] ARMENIAN CAPITAL LETTER AYB..ARMENIAN CAPITAL LETTER FEH 10A0..10C5 ; Lu # [38] GEORGIAN CAPITAL LETTER AN..GEORGIAN CAPITAL LETTER HOE 10C7 ; Lu # GEORGIAN CAPITAL LETTER YN 10CD ; Lu # GEORGIAN CAPITAL LETTER AEN 13A0..13F5 ; Lu # [86] CHEROKEE LETTER A..CHEROKEE LETTER MV 1C89 ; Lu # CYRILLIC CAPITAL LETTER TJE 1C90..1CBA ; Lu # [43] GEORGIAN MTAVRULI CAPITAL LETTER AN..GEORGIAN MTAVRULI CAPITAL LETTER AIN 1CBD..1CBF ; Lu # [3] GEORGIAN MTAVRULI CAPITAL LETTER AEN..GEORGIAN MTAVRULI CAPITAL LETTER LABIAL SIGN 1E00 ; Lu # LATIN CAPITAL LETTER A WITH RING BELOW 1E02 ; Lu # LATIN CAPITAL LETTER B WITH DOT ABOVE 1E04 ; Lu # LATIN CAPITAL LETTER B WITH DOT BELOW 1E06 ; Lu # LATIN CAPITAL LETTER B WITH LINE BELOW 1E08 ; Lu # LATIN CAPITAL LETTER C WITH CEDILLA AND ACUTE 1E0A ; Lu # LATIN CAPITAL LETTER D WITH DOT ABOVE 1E0C ; Lu # LATIN CAPITAL LETTER D WITH DOT BELOW 1E0E ; Lu # LATIN CAPITAL LETTER D WITH LINE BELOW 1E10 ; Lu # LATIN CAPITAL LETTER D WITH CEDILLA 1E12 ; Lu # LATIN CAPITAL LETTER D WITH CIRCUMFLEX BELOW 1E14 ; Lu # LATIN CAPITAL LETTER E WITH MACRON AND GRAVE 1E16 ; Lu # LATIN CAPITAL LETTER E WITH MACRON AND ACUTE 1E18 ; Lu # LATIN CAPITAL LETTER E WITH CIRCUMFLEX BELOW 1E1A ; Lu # LATIN CAPITAL LETTER E WITH TILDE BELOW 1E1C ; Lu # LATIN CAPITAL LETTER E WITH CEDILLA AND BREVE 1E1E ; Lu # LATIN CAPITAL LETTER F WITH DOT ABOVE 1E20 ; Lu # LATIN CAPITAL LETTER G WITH MACRON 1E22 ; Lu # LATIN CAPITAL LETTER H WITH DOT ABOVE 1E24 ; Lu # LATIN CAPITAL LETTER H WITH DOT BELOW 1E26 ; Lu # LATIN CAPITAL LETTER H WITH DIAERESIS 1E28 ; Lu # LATIN CAPITAL LETTER H WITH CEDILLA 1E2A ; Lu # LATIN 
CAPITAL LETTER H WITH BREVE BELOW 1E2C ; Lu # LATIN CAPITAL LETTER I WITH TILDE BELOW 1E2E ; Lu # LATIN CAPITAL LETTER I WITH DIAERESIS AND ACUTE 1E30 ; Lu # LATIN CAPITAL LETTER K WITH ACUTE 1E32 ; Lu # LATIN CAPITAL LETTER K WITH DOT BELOW 1E34 ; Lu # LATIN CAPITAL LETTER K WITH LINE BELOW 1E36 ; Lu # LATIN CAPITAL LETTER L WITH DOT BELOW 1E38 ; Lu # LATIN CAPITAL LETTER L WITH DOT BELOW AND MACRON 1E3A ; Lu # LATIN CAPITAL LETTER L WITH LINE BELOW 1E3C ; Lu # LATIN CAPITAL LETTER L WITH CIRCUMFLEX BELOW 1E3E ; Lu # LATIN CAPITAL LETTER M WITH ACUTE 1E40 ; Lu # LATIN CAPITAL LETTER M WITH DOT ABOVE 1E42 ; Lu # LATIN CAPITAL LETTER M WITH DOT BELOW 1E44 ; Lu # LATIN CAPITAL LETTER N WITH DOT ABOVE 1E46 ; Lu # LATIN CAPITAL LETTER N WITH DOT BELOW 1E48 ; Lu # LATIN CAPITAL LETTER N WITH LINE BELOW 1E4A ; Lu # LATIN CAPITAL LETTER N WITH CIRCUMFLEX BELOW 1E4C ; Lu # LATIN CAPITAL LETTER O WITH TILDE AND ACUTE 1E4E ; Lu # LATIN CAPITAL LETTER O WITH TILDE AND DIAERESIS 1E50 ; Lu # LATIN CAPITAL LETTER O WITH MACRON AND GRAVE 1E52 ; Lu # LATIN CAPITAL LETTER O WITH MACRON AND ACUTE 1E54 ; Lu # LATIN CAPITAL LETTER P WITH ACUTE 1E56 ; Lu # LATIN CAPITAL LETTER P WITH DOT ABOVE 1E58 ; Lu # LATIN CAPITAL LETTER R WITH DOT ABOVE 1E5A ; Lu # LATIN CAPITAL LETTER R WITH DOT BELOW 1E5C ; Lu # LATIN CAPITAL LETTER R WITH DOT BELOW AND MACRON 1E5E ; Lu # LATIN CAPITAL LETTER R WITH LINE BELOW 1E60 ; Lu # LATIN CAPITAL LETTER S WITH DOT ABOVE 1E62 ; Lu # LATIN CAPITAL LETTER S WITH DOT BELOW 1E64 ; Lu # LATIN CAPITAL LETTER S WITH ACUTE AND DOT ABOVE 1E66 ; Lu # LATIN CAPITAL LETTER S WITH CARON AND DOT ABOVE 1E68 ; Lu # LATIN CAPITAL LETTER S WITH DOT BELOW AND DOT ABOVE 1E6A ; Lu # LATIN CAPITAL LETTER T WITH DOT ABOVE 1E6C ; Lu # LATIN CAPITAL LETTER T WITH DOT BELOW 1E6E ; Lu # LATIN CAPITAL LETTER T WITH LINE BELOW 1E70 ; Lu # LATIN CAPITAL LETTER T WITH CIRCUMFLEX BELOW 1E72 ; Lu # LATIN CAPITAL LETTER U WITH DIAERESIS BELOW 1E74 ; Lu # LATIN CAPITAL LETTER U WITH TILDE 
BELOW 1E76 ; Lu # LATIN CAPITAL LETTER U WITH CIRCUMFLEX BELOW 1E78 ; Lu # LATIN CAPITAL LETTER U WITH TILDE AND ACUTE 1E7A ; Lu # LATIN CAPITAL LETTER U WITH MACRON AND DIAERESIS 1E7C ; Lu # LATIN CAPITAL LETTER V WITH TILDE 1E7E ; Lu # LATIN CAPITAL LETTER V WITH DOT BELOW 1E80 ; Lu # LATIN CAPITAL LETTER W WITH GRAVE 1E82 ; Lu # LATIN CAPITAL LETTER W WITH ACUTE 1E84 ; Lu # LATIN CAPITAL LETTER W WITH DIAERESIS 1E86 ; Lu # LATIN CAPITAL LETTER W WITH DOT ABOVE 1E88 ; Lu # LATIN CAPITAL LETTER W WITH DOT BELOW 1E8A ; Lu # LATIN CAPITAL LETTER X WITH DOT ABOVE 1E8C ; Lu # LATIN CAPITAL LETTER X WITH DIAERESIS 1E8E ; Lu # LATIN CAPITAL LETTER Y WITH DOT ABOVE 1E90 ; Lu # LATIN CAPITAL LETTER Z WITH CIRCUMFLEX 1E92 ; Lu # LATIN CAPITAL LETTER Z WITH DOT BELOW 1E94 ; Lu # LATIN CAPITAL LETTER Z WITH LINE BELOW 1E9E ; Lu # LATIN CAPITAL LETTER SHARP S 1EA0 ; Lu # LATIN CAPITAL LETTER A WITH DOT BELOW 1EA2 ; Lu # LATIN CAPITAL LETTER A WITH HOOK ABOVE 1EA4 ; Lu # LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND ACUTE 1EA6 ; Lu # LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND GRAVE 1EA8 ; Lu # LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND HOOK ABOVE 1EAA ; Lu # LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND TILDE 1EAC ; Lu # LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND DOT BELOW 1EAE ; Lu # LATIN CAPITAL LETTER A WITH BREVE AND ACUTE 1EB0 ; Lu # LATIN CAPITAL LETTER A WITH BREVE AND GRAVE 1EB2 ; Lu # LATIN CAPITAL LETTER A WITH BREVE AND HOOK ABOVE 1EB4 ; Lu # LATIN CAPITAL LETTER A WITH BREVE AND TILDE 1EB6 ; Lu # LATIN CAPITAL LETTER A WITH BREVE AND DOT BELOW 1EB8 ; Lu # LATIN CAPITAL LETTER E WITH DOT BELOW 1EBA ; Lu # LATIN CAPITAL LETTER E WITH HOOK ABOVE 1EBC ; Lu # LATIN CAPITAL LETTER E WITH TILDE 1EBE ; Lu # LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND ACUTE 1EC0 ; Lu # LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND GRAVE 1EC2 ; Lu # LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND HOOK ABOVE 1EC4 ; Lu # LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND TILDE 1EC6 ; Lu # LATIN CAPITAL LETTER E 
WITH CIRCUMFLEX AND DOT BELOW 1EC8 ; Lu # LATIN CAPITAL LETTER I WITH HOOK ABOVE 1ECA ; Lu # LATIN CAPITAL LETTER I WITH DOT BELOW 1ECC ; Lu # LATIN CAPITAL LETTER O WITH DOT BELOW 1ECE ; Lu # LATIN CAPITAL LETTER O WITH HOOK ABOVE 1ED0 ; Lu # LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND ACUTE 1ED2 ; Lu # LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND GRAVE 1ED4 ; Lu # LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND HOOK ABOVE 1ED6 ; Lu # LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND TILDE 1ED8 ; Lu # LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND DOT BELOW 1EDA ; Lu # LATIN CAPITAL LETTER O WITH HORN AND ACUTE 1EDC ; Lu # LATIN CAPITAL LETTER O WITH HORN AND GRAVE 1EDE ; Lu # LATIN CAPITAL LETTER O WITH HORN AND HOOK ABOVE 1EE0 ; Lu # LATIN CAPITAL LETTER O WITH HORN AND TILDE 1EE2 ; Lu # LATIN CAPITAL LETTER O WITH HORN AND DOT BELOW 1EE4 ; Lu # LATIN CAPITAL LETTER U WITH DOT BELOW 1EE6 ; Lu # LATIN CAPITAL LETTER U WITH HOOK ABOVE 1EE8 ; Lu # LATIN CAPITAL LETTER U WITH HORN AND ACUTE 1EEA ; Lu # LATIN CAPITAL LETTER U WITH HORN AND GRAVE 1EEC ; Lu # LATIN CAPITAL LETTER U WITH HORN AND HOOK ABOVE 1EEE ; Lu # LATIN CAPITAL LETTER U WITH HORN AND TILDE 1EF0 ; Lu # LATIN CAPITAL LETTER U WITH HORN AND DOT BELOW 1EF2 ; Lu # LATIN CAPITAL LETTER Y WITH GRAVE 1EF4 ; Lu # LATIN CAPITAL LETTER Y WITH DOT BELOW 1EF6 ; Lu # LATIN CAPITAL LETTER Y WITH HOOK ABOVE 1EF8 ; Lu # LATIN CAPITAL LETTER Y WITH TILDE 1EFA ; Lu # LATIN CAPITAL LETTER MIDDLE-WELSH LL 1EFC ; Lu # LATIN CAPITAL LETTER MIDDLE-WELSH V 1EFE ; Lu # LATIN CAPITAL LETTER Y WITH LOOP 1F08..1F0F ; Lu # [8] GREEK CAPITAL LETTER ALPHA WITH PSILI..GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI 1F18..1F1D ; Lu # [6] GREEK CAPITAL LETTER EPSILON WITH PSILI..GREEK CAPITAL LETTER EPSILON WITH DASIA AND OXIA 1F28..1F2F ; Lu # [8] GREEK CAPITAL LETTER ETA WITH PSILI..GREEK CAPITAL LETTER ETA WITH DASIA AND PERISPOMENI 1F38..1F3F ; Lu # [8] GREEK CAPITAL LETTER IOTA WITH PSILI..GREEK CAPITAL LETTER IOTA WITH DASIA AND 
PERISPOMENI 1F48..1F4D ; Lu # [6] GREEK CAPITAL LETTER OMICRON WITH PSILI..GREEK CAPITAL LETTER OMICRON WITH DASIA AND OXIA 1F59 ; Lu # GREEK CAPITAL LETTER UPSILON WITH DASIA 1F5B ; Lu # GREEK CAPITAL LETTER UPSILON WITH DASIA AND VARIA 1F5D ; Lu # GREEK CAPITAL LETTER UPSILON WITH DASIA AND OXIA 1F5F ; Lu # GREEK CAPITAL LETTER UPSILON WITH DASIA AND PERISPOMENI 1F68..1F6F ; Lu # [8] GREEK CAPITAL LETTER OMEGA WITH PSILI..GREEK CAPITAL LETTER OMEGA WITH DASIA AND PERISPOMENI 1FB8..1FBB ; Lu # [4] GREEK CAPITAL LETTER ALPHA WITH VRACHY..GREEK CAPITAL LETTER ALPHA WITH OXIA 1FC8..1FCB ; Lu # [4] GREEK CAPITAL LETTER EPSILON WITH VARIA..GREEK CAPITAL LETTER ETA WITH OXIA 1FD8..1FDB ; Lu # [4] GREEK CAPITAL LETTER IOTA WITH VRACHY..GREEK CAPITAL LETTER IOTA WITH OXIA 1FE8..1FEC ; Lu # [5] GREEK CAPITAL LETTER UPSILON WITH VRACHY..GREEK CAPITAL LETTER RHO WITH DASIA 1FF8..1FFB ; Lu # [4] GREEK CAPITAL LETTER OMICRON WITH VARIA..GREEK CAPITAL LETTER OMEGA WITH OXIA 2102 ; Lu # DOUBLE-STRUCK CAPITAL C 2107 ; Lu # EULER CONSTANT 210B..210D ; Lu # [3] SCRIPT CAPITAL H..DOUBLE-STRUCK CAPITAL H 2110..2112 ; Lu # [3] SCRIPT CAPITAL I..SCRIPT CAPITAL L 2115 ; Lu # DOUBLE-STRUCK CAPITAL N 2119..211D ; Lu # [5] DOUBLE-STRUCK CAPITAL P..DOUBLE-STRUCK CAPITAL R 2124 ; Lu # DOUBLE-STRUCK CAPITAL Z 2126 ; Lu # OHM SIGN 2128 ; Lu # BLACK-LETTER CAPITAL Z 212A..212D ; Lu # [4] KELVIN SIGN..BLACK-LETTER CAPITAL C 2130..2133 ; Lu # [4] SCRIPT CAPITAL E..SCRIPT CAPITAL M 213E..213F ; Lu # [2] DOUBLE-STRUCK CAPITAL GAMMA..DOUBLE-STRUCK CAPITAL PI 2145 ; Lu # DOUBLE-STRUCK ITALIC CAPITAL D 2183 ; Lu # ROMAN NUMERAL REVERSED ONE HUNDRED 2C00..2C2F ; Lu # [48] GLAGOLITIC CAPITAL LETTER AZU..GLAGOLITIC CAPITAL LETTER CAUDATE CHRIVI 2C60 ; Lu # LATIN CAPITAL LETTER L WITH DOUBLE BAR 2C62..2C64 ; Lu # [3] LATIN CAPITAL LETTER L WITH MIDDLE TILDE..LATIN CAPITAL LETTER R WITH TAIL 2C67 ; Lu # LATIN CAPITAL LETTER H WITH DESCENDER 2C69 ; Lu # LATIN CAPITAL LETTER K WITH DESCENDER 2C6B ; Lu # 
LATIN CAPITAL LETTER Z WITH DESCENDER 2C6D..2C70 ; Lu # [4] LATIN CAPITAL LETTER ALPHA..LATIN CAPITAL LETTER TURNED ALPHA 2C72 ; Lu # LATIN CAPITAL LETTER W WITH HOOK 2C75 ; Lu # LATIN CAPITAL LETTER HALF H 2C7E..2C80 ; Lu # [3] LATIN CAPITAL LETTER S WITH SWASH TAIL..COPTIC CAPITAL LETTER ALFA 2C82 ; Lu # COPTIC CAPITAL LETTER VIDA 2C84 ; Lu # COPTIC CAPITAL LETTER GAMMA 2C86 ; Lu # COPTIC CAPITAL LETTER DALDA 2C88 ; Lu # COPTIC CAPITAL LETTER EIE 2C8A ; Lu # COPTIC CAPITAL LETTER SOU 2C8C ; Lu # COPTIC CAPITAL LETTER ZATA 2C8E ; Lu # COPTIC CAPITAL LETTER HATE 2C90 ; Lu # COPTIC CAPITAL LETTER THETHE 2C92 ; Lu # COPTIC CAPITAL LETTER IAUDA 2C94 ; Lu # COPTIC CAPITAL LETTER KAPA 2C96 ; Lu # COPTIC CAPITAL LETTER LAULA 2C98 ; Lu # COPTIC CAPITAL LETTER MI 2C9A ; Lu # COPTIC CAPITAL LETTER NI 2C9C ; Lu # COPTIC CAPITAL LETTER KSI 2C9E ; Lu # COPTIC CAPITAL LETTER O 2CA0 ; Lu # COPTIC CAPITAL LETTER PI 2CA2 ; Lu # COPTIC CAPITAL LETTER RO 2CA4 ; Lu # COPTIC CAPITAL LETTER SIMA 2CA6 ; Lu # COPTIC CAPITAL LETTER TAU 2CA8 ; Lu # COPTIC CAPITAL LETTER UA 2CAA ; Lu # COPTIC CAPITAL LETTER FI 2CAC ; Lu # COPTIC CAPITAL LETTER KHI 2CAE ; Lu # COPTIC CAPITAL LETTER PSI 2CB0 ; Lu # COPTIC CAPITAL LETTER OOU 2CB2 ; Lu # COPTIC CAPITAL LETTER DIALECT-P ALEF 2CB4 ; Lu # COPTIC CAPITAL LETTER OLD COPTIC AIN 2CB6 ; Lu # COPTIC CAPITAL LETTER CRYPTOGRAMMIC EIE 2CB8 ; Lu # COPTIC CAPITAL LETTER DIALECT-P KAPA 2CBA ; Lu # COPTIC CAPITAL LETTER DIALECT-P NI 2CBC ; Lu # COPTIC CAPITAL LETTER CRYPTOGRAMMIC NI 2CBE ; Lu # COPTIC CAPITAL LETTER OLD COPTIC OOU 2CC0 ; Lu # COPTIC CAPITAL LETTER SAMPI 2CC2 ; Lu # COPTIC CAPITAL LETTER CROSSED SHEI 2CC4 ; Lu # COPTIC CAPITAL LETTER OLD COPTIC SHEI 2CC6 ; Lu # COPTIC CAPITAL LETTER OLD COPTIC ESH 2CC8 ; Lu # COPTIC CAPITAL LETTER AKHMIMIC KHEI 2CCA ; Lu # COPTIC CAPITAL LETTER DIALECT-P HORI 2CCC ; Lu # COPTIC CAPITAL LETTER OLD COPTIC HORI 2CCE ; Lu # COPTIC CAPITAL LETTER OLD COPTIC HA 2CD0 ; Lu # COPTIC CAPITAL LETTER L-SHAPED HA 2CD2 ; Lu 
# COPTIC CAPITAL LETTER OLD COPTIC HEI 2CD4 ; Lu # COPTIC CAPITAL LETTER OLD COPTIC HAT 2CD6 ; Lu # COPTIC CAPITAL LETTER OLD COPTIC GANGIA 2CD8 ; Lu # COPTIC CAPITAL LETTER OLD COPTIC DJA 2CDA ; Lu # COPTIC CAPITAL LETTER OLD COPTIC SHIMA 2CDC ; Lu # COPTIC CAPITAL LETTER OLD NUBIAN SHIMA 2CDE ; Lu # COPTIC CAPITAL LETTER OLD NUBIAN NGI 2CE0 ; Lu # COPTIC CAPITAL LETTER OLD NUBIAN NYI 2CE2 ; Lu # COPTIC CAPITAL LETTER OLD NUBIAN WAU 2CEB ; Lu # COPTIC CAPITAL LETTER CRYPTOGRAMMIC SHEI 2CED ; Lu # COPTIC CAPITAL LETTER CRYPTOGRAMMIC GANGIA 2CF2 ; Lu # COPTIC CAPITAL LETTER BOHAIRIC KHEI A640 ; Lu # CYRILLIC CAPITAL LETTER ZEMLYA A642 ; Lu # CYRILLIC CAPITAL LETTER DZELO A644 ; Lu # CYRILLIC CAPITAL LETTER REVERSED DZE A646 ; Lu # CYRILLIC CAPITAL LETTER IOTA A648 ; Lu # CYRILLIC CAPITAL LETTER DJERV A64A ; Lu # CYRILLIC CAPITAL LETTER MONOGRAPH UK A64C ; Lu # CYRILLIC CAPITAL LETTER BROAD OMEGA A64E ; Lu # CYRILLIC CAPITAL LETTER NEUTRAL YER A650 ; Lu # CYRILLIC CAPITAL LETTER YERU WITH BACK YER A652 ; Lu # CYRILLIC CAPITAL LETTER IOTIFIED YAT A654 ; Lu # CYRILLIC CAPITAL LETTER REVERSED YU A656 ; Lu # CYRILLIC CAPITAL LETTER IOTIFIED A A658 ; Lu # CYRILLIC CAPITAL LETTER CLOSED LITTLE YUS A65A ; Lu # CYRILLIC CAPITAL LETTER BLENDED YUS A65C ; Lu # CYRILLIC CAPITAL LETTER IOTIFIED CLOSED LITTLE YUS A65E ; Lu # CYRILLIC CAPITAL LETTER YN A660 ; Lu # CYRILLIC CAPITAL LETTER REVERSED TSE A662 ; Lu # CYRILLIC CAPITAL LETTER SOFT DE A664 ; Lu # CYRILLIC CAPITAL LETTER SOFT EL A666 ; Lu # CYRILLIC CAPITAL LETTER SOFT EM A668 ; Lu # CYRILLIC CAPITAL LETTER MONOCULAR O A66A ; Lu # CYRILLIC CAPITAL LETTER BINOCULAR O A66C ; Lu # CYRILLIC CAPITAL LETTER DOUBLE MONOCULAR O A680 ; Lu # CYRILLIC CAPITAL LETTER DWE A682 ; Lu # CYRILLIC CAPITAL LETTER DZWE A684 ; Lu # CYRILLIC CAPITAL LETTER ZHWE A686 ; Lu # CYRILLIC CAPITAL LETTER CCHE A688 ; Lu # CYRILLIC CAPITAL LETTER DZZE A68A ; Lu # CYRILLIC CAPITAL LETTER TE WITH MIDDLE HOOK A68C ; Lu # CYRILLIC CAPITAL LETTER TWE A68E ; 
Lu # CYRILLIC CAPITAL LETTER TSWE A690 ; Lu # CYRILLIC CAPITAL LETTER TSSE A692 ; Lu # CYRILLIC CAPITAL LETTER TCHE A694 ; Lu # CYRILLIC CAPITAL LETTER HWE A696 ; Lu # CYRILLIC CAPITAL LETTER SHWE A698 ; Lu # CYRILLIC CAPITAL LETTER DOUBLE O A69A ; Lu # CYRILLIC CAPITAL LETTER CROSSED O A722 ; Lu # LATIN CAPITAL LETTER EGYPTOLOGICAL ALEF A724 ; Lu # LATIN CAPITAL LETTER EGYPTOLOGICAL AIN A726 ; Lu # LATIN CAPITAL LETTER HENG A728 ; Lu # LATIN CAPITAL LETTER TZ A72A ; Lu # LATIN CAPITAL LETTER TRESILLO A72C ; Lu # LATIN CAPITAL LETTER CUATRILLO A72E ; Lu # LATIN CAPITAL LETTER CUATRILLO WITH COMMA A732 ; Lu # LATIN CAPITAL LETTER AA A734 ; Lu # LATIN CAPITAL LETTER AO A736 ; Lu # LATIN CAPITAL LETTER AU A738 ; Lu # LATIN CAPITAL LETTER AV A73A ; Lu # LATIN CAPITAL LETTER AV WITH HORIZONTAL BAR A73C ; Lu # LATIN CAPITAL LETTER AY A73E ; Lu # LATIN CAPITAL LETTER REVERSED C WITH DOT A740 ; Lu # LATIN CAPITAL LETTER K WITH STROKE A742 ; Lu # LATIN CAPITAL LETTER K WITH DIAGONAL STROKE A744 ; Lu # LATIN CAPITAL LETTER K WITH STROKE AND DIAGONAL STROKE A746 ; Lu # LATIN CAPITAL LETTER BROKEN L A748 ; Lu # LATIN CAPITAL LETTER L WITH HIGH STROKE A74A ; Lu # LATIN CAPITAL LETTER O WITH LONG STROKE OVERLAY A74C ; Lu # LATIN CAPITAL LETTER O WITH LOOP A74E ; Lu # LATIN CAPITAL LETTER OO A750 ; Lu # LATIN CAPITAL LETTER P WITH STROKE THROUGH DESCENDER A752 ; Lu # LATIN CAPITAL LETTER P WITH FLOURISH A754 ; Lu # LATIN CAPITAL LETTER P WITH SQUIRREL TAIL A756 ; Lu # LATIN CAPITAL LETTER Q WITH STROKE THROUGH DESCENDER A758 ; Lu # LATIN CAPITAL LETTER Q WITH DIAGONAL STROKE A75A ; Lu # LATIN CAPITAL LETTER R ROTUNDA A75C ; Lu # LATIN CAPITAL LETTER RUM ROTUNDA A75E ; Lu # LATIN CAPITAL LETTER V WITH DIAGONAL STROKE A760 ; Lu # LATIN CAPITAL LETTER VY A762 ; Lu # LATIN CAPITAL LETTER VISIGOTHIC Z A764 ; Lu # LATIN CAPITAL LETTER THORN WITH STROKE A766 ; Lu # LATIN CAPITAL LETTER THORN WITH STROKE THROUGH DESCENDER A768 ; Lu # LATIN CAPITAL LETTER VEND A76A ; Lu # LATIN CAPITAL 
LETTER ET A76C ; Lu # LATIN CAPITAL LETTER IS A76E ; Lu # LATIN CAPITAL LETTER CON A779 ; Lu # LATIN CAPITAL LETTER INSULAR D A77B ; Lu # LATIN CAPITAL LETTER INSULAR F A77D..A77E ; Lu # [2] LATIN CAPITAL LETTER INSULAR G..LATIN CAPITAL LETTER TURNED INSULAR G A780 ; Lu # LATIN CAPITAL LETTER TURNED L A782 ; Lu # LATIN CAPITAL LETTER INSULAR R A784 ; Lu # LATIN CAPITAL LETTER INSULAR S A786 ; Lu # LATIN CAPITAL LETTER INSULAR T A78B ; Lu # LATIN CAPITAL LETTER SALTILLO A78D ; Lu # LATIN CAPITAL LETTER TURNED H A790 ; Lu # LATIN CAPITAL LETTER N WITH DESCENDER A792 ; Lu # LATIN CAPITAL LETTER C WITH BAR A796 ; Lu # LATIN CAPITAL LETTER B WITH FLOURISH A798 ; Lu # LATIN CAPITAL LETTER F WITH STROKE A79A ; Lu # LATIN CAPITAL LETTER VOLAPUK AE A79C ; Lu # LATIN CAPITAL LETTER VOLAPUK OE A79E ; Lu # LATIN CAPITAL LETTER VOLAPUK UE A7A0 ; Lu # LATIN CAPITAL LETTER G WITH OBLIQUE STROKE A7A2 ; Lu # LATIN CAPITAL LETTER K WITH OBLIQUE STROKE A7A4 ; Lu # LATIN CAPITAL LETTER N WITH OBLIQUE STROKE A7A6 ; Lu # LATIN CAPITAL LETTER R WITH OBLIQUE STROKE A7A8 ; Lu # LATIN CAPITAL LETTER S WITH OBLIQUE STROKE A7AA..A7AE ; Lu # [5] LATIN CAPITAL LETTER H WITH HOOK..LATIN CAPITAL LETTER SMALL CAPITAL I A7B0..A7B4 ; Lu # [5] LATIN CAPITAL LETTER TURNED K..LATIN CAPITAL LETTER BETA A7B6 ; Lu # LATIN CAPITAL LETTER OMEGA A7B8 ; Lu # LATIN CAPITAL LETTER U WITH STROKE A7BA ; Lu # LATIN CAPITAL LETTER GLOTTAL A A7BC ; Lu # LATIN CAPITAL LETTER GLOTTAL I A7BE ; Lu # LATIN CAPITAL LETTER GLOTTAL U A7C0 ; Lu # LATIN CAPITAL LETTER OLD POLISH O A7C2 ; Lu # LATIN CAPITAL LETTER ANGLICANA W A7C4..A7C7 ; Lu # [4] LATIN CAPITAL LETTER C WITH PALATAL HOOK..LATIN CAPITAL LETTER D WITH SHORT STROKE OVERLAY A7C9 ; Lu # LATIN CAPITAL LETTER S WITH SHORT STROKE OVERLAY A7CB..A7CC ; Lu # [2] LATIN CAPITAL LETTER RAMS HORN..LATIN CAPITAL LETTER S WITH DIAGONAL STROKE A7CE ; Lu # LATIN CAPITAL LETTER PHARYNGEAL VOICED FRICATIVE A7D0 ; Lu # LATIN CAPITAL LETTER CLOSED INSULAR G A7D2 ; Lu # LATIN CAPITAL 
LETTER DOUBLE THORN A7D4 ; Lu # LATIN CAPITAL LETTER DOUBLE WYNN A7D6 ; Lu # LATIN CAPITAL LETTER MIDDLE SCOTS S A7D8 ; Lu # LATIN CAPITAL LETTER SIGMOID S A7DA ; Lu # LATIN CAPITAL LETTER LAMBDA A7DC ; Lu # LATIN CAPITAL LETTER LAMBDA WITH STROKE A7F5 ; Lu # LATIN CAPITAL LETTER REVERSED HALF H FF21..FF3A ; Lu # [26] FULLWIDTH LATIN CAPITAL LETTER A..FULLWIDTH LATIN CAPITAL LETTER Z 10400..10427 ; Lu # [40] DESERET CAPITAL LETTER LONG I..DESERET CAPITAL LETTER EW 104B0..104D3 ; Lu # [36] OSAGE CAPITAL LETTER A..OSAGE CAPITAL LETTER ZHA 10570..1057A ; Lu # [11] VITHKUQI CAPITAL LETTER A..VITHKUQI CAPITAL LETTER GA 1057C..1058A ; Lu # [15] VITHKUQI CAPITAL LETTER HA..VITHKUQI CAPITAL LETTER RE 1058C..10592 ; Lu # [7] VITHKUQI CAPITAL LETTER SE..VITHKUQI CAPITAL LETTER XE 10594..10595 ; Lu # [2] VITHKUQI CAPITAL LETTER Y..VITHKUQI CAPITAL LETTER ZE 10C80..10CB2 ; Lu # [51] OLD HUNGARIAN CAPITAL LETTER A..OLD HUNGARIAN CAPITAL LETTER US 10D50..10D65 ; Lu # [22] GARAY CAPITAL LETTER A..GARAY CAPITAL LETTER OLD NA 118A0..118BF ; Lu # [32] WARANG CITI CAPITAL LETTER NGAA..WARANG CITI CAPITAL LETTER VIYO 16E40..16E5F ; Lu # [32] MEDEFAIDRIN CAPITAL LETTER M..MEDEFAIDRIN CAPITAL LETTER Y 16EA0..16EB8 ; Lu # [25] BERIA ERFE CAPITAL LETTER ARKAB..BERIA ERFE CAPITAL LETTER AY 1D400..1D419 ; Lu # [26] MATHEMATICAL BOLD CAPITAL A..MATHEMATICAL BOLD CAPITAL Z 1D434..1D44D ; Lu # [26] MATHEMATICAL ITALIC CAPITAL A..MATHEMATICAL ITALIC CAPITAL Z 1D468..1D481 ; Lu # [26] MATHEMATICAL BOLD ITALIC CAPITAL A..MATHEMATICAL BOLD ITALIC CAPITAL Z 1D49C ; Lu # MATHEMATICAL SCRIPT CAPITAL A 1D49E..1D49F ; Lu # [2] MATHEMATICAL SCRIPT CAPITAL C..MATHEMATICAL SCRIPT CAPITAL D 1D4A2 ; Lu # MATHEMATICAL SCRIPT CAPITAL G 1D4A5..1D4A6 ; Lu # [2] MATHEMATICAL SCRIPT CAPITAL J..MATHEMATICAL SCRIPT CAPITAL K 1D4A9..1D4AC ; Lu # [4] MATHEMATICAL SCRIPT CAPITAL N..MATHEMATICAL SCRIPT CAPITAL Q 1D4AE..1D4B5 ; Lu # [8] MATHEMATICAL SCRIPT CAPITAL S..MATHEMATICAL SCRIPT CAPITAL Z 1D4D0..1D4E9 ; Lu # 
[26] MATHEMATICAL BOLD SCRIPT CAPITAL A..MATHEMATICAL BOLD SCRIPT CAPITAL Z 1D504..1D505 ; Lu # [2] MATHEMATICAL FRAKTUR CAPITAL A..MATHEMATICAL FRAKTUR CAPITAL B 1D507..1D50A ; Lu # [4] MATHEMATICAL FRAKTUR CAPITAL D..MATHEMATICAL FRAKTUR CAPITAL G 1D50D..1D514 ; Lu # [8] MATHEMATICAL FRAKTUR CAPITAL J..MATHEMATICAL FRAKTUR CAPITAL Q 1D516..1D51C ; Lu # [7] MATHEMATICAL FRAKTUR CAPITAL S..MATHEMATICAL FRAKTUR CAPITAL Y 1D538..1D539 ; Lu # [2] MATHEMATICAL DOUBLE-STRUCK CAPITAL A..MATHEMATICAL DOUBLE-STRUCK CAPITAL B 1D53B..1D53E ; Lu # [4] MATHEMATICAL DOUBLE-STRUCK CAPITAL D..MATHEMATICAL DOUBLE-STRUCK CAPITAL G 1D540..1D544 ; Lu # [5] MATHEMATICAL DOUBLE-STRUCK CAPITAL I..MATHEMATICAL DOUBLE-STRUCK CAPITAL M 1D546 ; Lu # MATHEMATICAL DOUBLE-STRUCK CAPITAL O 1D54A..1D550 ; Lu # [7] MATHEMATICAL DOUBLE-STRUCK CAPITAL S..MATHEMATICAL DOUBLE-STRUCK CAPITAL Y 1D56C..1D585 ; Lu # [26] MATHEMATICAL BOLD FRAKTUR CAPITAL A..MATHEMATICAL BOLD FRAKTUR CAPITAL Z 1D5A0..1D5B9 ; Lu # [26] MATHEMATICAL SANS-SERIF CAPITAL A..MATHEMATICAL SANS-SERIF CAPITAL Z 1D5D4..1D5ED ; Lu # [26] MATHEMATICAL SANS-SERIF BOLD CAPITAL A..MATHEMATICAL SANS-SERIF BOLD CAPITAL Z 1D608..1D621 ; Lu # [26] MATHEMATICAL SANS-SERIF ITALIC CAPITAL A..MATHEMATICAL SANS-SERIF ITALIC CAPITAL Z 1D63C..1D655 ; Lu # [26] MATHEMATICAL SANS-SERIF BOLD ITALIC CAPITAL A..MATHEMATICAL SANS-SERIF BOLD ITALIC CAPITAL Z 1D670..1D689 ; Lu # [26] MATHEMATICAL MONOSPACE CAPITAL A..MATHEMATICAL MONOSPACE CAPITAL Z 1D6A8..1D6C0 ; Lu # [25] MATHEMATICAL BOLD CAPITAL ALPHA..MATHEMATICAL BOLD CAPITAL OMEGA 1D6E2..1D6FA ; Lu # [25] MATHEMATICAL ITALIC CAPITAL ALPHA..MATHEMATICAL ITALIC CAPITAL OMEGA 1D71C..1D734 ; Lu # [25] MATHEMATICAL BOLD ITALIC CAPITAL ALPHA..MATHEMATICAL BOLD ITALIC CAPITAL OMEGA 1D756..1D76E ; Lu # [25] MATHEMATICAL SANS-SERIF BOLD CAPITAL ALPHA..MATHEMATICAL SANS-SERIF BOLD CAPITAL OMEGA 1D790..1D7A8 ; Lu # [25] MATHEMATICAL SANS-SERIF BOLD ITALIC CAPITAL ALPHA..MATHEMATICAL SANS-SERIF BOLD ITALIC 
CAPITAL OMEGA 1D7CA ; Lu # MATHEMATICAL BOLD CAPITAL DIGAMMA 1E900..1E921 ; Lu # [34] ADLAM CAPITAL LETTER ALIF..ADLAM CAPITAL LETTER SHA # Total code points: 1886 # ================================================ # General_Category=Lowercase_Letter 0061..007A ; Ll # [26] LATIN SMALL LETTER A..LATIN SMALL LETTER Z 00B5 ; Ll # MICRO SIGN 00DF..00F6 ; Ll # [24] LATIN SMALL LETTER SHARP S..LATIN SMALL LETTER O WITH DIAERESIS 00F8..00FF ; Ll # [8] LATIN SMALL LETTER O WITH STROKE..LATIN SMALL LETTER Y WITH DIAERESIS 0101 ; Ll # LATIN SMALL LETTER A WITH MACRON 0103 ; Ll # LATIN SMALL LETTER A WITH BREVE 0105 ; Ll # LATIN SMALL LETTER A WITH OGONEK 0107 ; Ll # LATIN SMALL LETTER C WITH ACUTE 0109 ; Ll # LATIN SMALL LETTER C WITH CIRCUMFLEX 010B ; Ll # LATIN SMALL LETTER C WITH DOT ABOVE 010D ; Ll # LATIN SMALL LETTER C WITH CARON 010F ; Ll # LATIN SMALL LETTER D WITH CARON 0111 ; Ll # LATIN SMALL LETTER D WITH STROKE 0113 ; Ll # LATIN SMALL LETTER E WITH MACRON 0115 ; Ll # LATIN SMALL LETTER E WITH BREVE 0117 ; Ll # LATIN SMALL LETTER E WITH DOT ABOVE 0119 ; Ll # LATIN SMALL LETTER E WITH OGONEK 011B ; Ll # LATIN SMALL LETTER E WITH CARON 011D ; Ll # LATIN SMALL LETTER G WITH CIRCUMFLEX 011F ; Ll # LATIN SMALL LETTER G WITH BREVE 0121 ; Ll # LATIN SMALL LETTER G WITH DOT ABOVE 0123 ; Ll # LATIN SMALL LETTER G WITH CEDILLA 0125 ; Ll # LATIN SMALL LETTER H WITH CIRCUMFLEX 0127 ; Ll # LATIN SMALL LETTER H WITH STROKE 0129 ; Ll # LATIN SMALL LETTER I WITH TILDE 012B ; Ll # LATIN SMALL LETTER I WITH MACRON 012D ; Ll # LATIN SMALL LETTER I WITH BREVE 012F ; Ll # LATIN SMALL LETTER I WITH OGONEK 0131 ; Ll # LATIN SMALL LETTER DOTLESS I 0133 ; Ll # LATIN SMALL LIGATURE IJ 0135 ; Ll # LATIN SMALL LETTER J WITH CIRCUMFLEX 0137..0138 ; Ll # [2] LATIN SMALL LETTER K WITH CEDILLA..LATIN SMALL LETTER KRA 013A ; Ll # LATIN SMALL LETTER L WITH ACUTE 013C ; Ll # LATIN SMALL LETTER L WITH CEDILLA 013E ; Ll # LATIN SMALL LETTER L WITH CARON 0140 ; Ll # LATIN SMALL LETTER L WITH MIDDLE 
DOT 0142 ; Ll # LATIN SMALL LETTER L WITH STROKE 0144 ; Ll # LATIN SMALL LETTER N WITH ACUTE 0146 ; Ll # LATIN SMALL LETTER N WITH CEDILLA 0148..0149 ; Ll # [2] LATIN SMALL LETTER N WITH CARON..LATIN SMALL LETTER N PRECEDED BY APOSTROPHE 014B ; Ll # LATIN SMALL LETTER ENG 014D ; Ll # LATIN SMALL LETTER O WITH MACRON 014F ; Ll # LATIN SMALL LETTER O WITH BREVE 0151 ; Ll # LATIN SMALL LETTER O WITH DOUBLE ACUTE 0153 ; Ll # LATIN SMALL LIGATURE OE 0155 ; Ll # LATIN SMALL LETTER R WITH ACUTE 0157 ; Ll # LATIN SMALL LETTER R WITH CEDILLA 0159 ; Ll # LATIN SMALL LETTER R WITH CARON 015B ; Ll # LATIN SMALL LETTER S WITH ACUTE 015D ; Ll # LATIN SMALL LETTER S WITH CIRCUMFLEX 015F ; Ll # LATIN SMALL LETTER S WITH CEDILLA 0161 ; Ll # LATIN SMALL LETTER S WITH CARON 0163 ; Ll # LATIN SMALL LETTER T WITH CEDILLA 0165 ; Ll # LATIN SMALL LETTER T WITH CARON 0167 ; Ll # LATIN SMALL LETTER T WITH STROKE 0169 ; Ll # LATIN SMALL LETTER U WITH TILDE 016B ; Ll # LATIN SMALL LETTER U WITH MACRON 016D ; Ll # LATIN SMALL LETTER U WITH BREVE 016F ; Ll # LATIN SMALL LETTER U WITH RING ABOVE 0171 ; Ll # LATIN SMALL LETTER U WITH DOUBLE ACUTE 0173 ; Ll # LATIN SMALL LETTER U WITH OGONEK 0175 ; Ll # LATIN SMALL LETTER W WITH CIRCUMFLEX 0177 ; Ll # LATIN SMALL LETTER Y WITH CIRCUMFLEX 017A ; Ll # LATIN SMALL LETTER Z WITH ACUTE 017C ; Ll # LATIN SMALL LETTER Z WITH DOT ABOVE 017E..0180 ; Ll # [3] LATIN SMALL LETTER Z WITH CARON..LATIN SMALL LETTER B WITH STROKE 0183 ; Ll # LATIN SMALL LETTER B WITH TOPBAR 0185 ; Ll # LATIN SMALL LETTER TONE SIX 0188 ; Ll # LATIN SMALL LETTER C WITH HOOK 018C..018D ; Ll # [2] LATIN SMALL LETTER D WITH TOPBAR..LATIN SMALL LETTER TURNED DELTA 0192 ; Ll # LATIN SMALL LETTER F WITH HOOK 0195 ; Ll # LATIN SMALL LETTER HV 0199..019B ; Ll # [3] LATIN SMALL LETTER K WITH HOOK..LATIN SMALL LETTER LAMBDA WITH STROKE 019E ; Ll # LATIN SMALL LETTER N WITH LONG RIGHT LEG 01A1 ; Ll # LATIN SMALL LETTER O WITH HORN 01A3 ; Ll # LATIN SMALL LETTER OI 01A5 ; Ll # LATIN SMALL 
LETTER P WITH HOOK 01A8 ; Ll # LATIN SMALL LETTER TONE TWO 01AA..01AB ; Ll # [2] LATIN LETTER REVERSED ESH LOOP..LATIN SMALL LETTER T WITH PALATAL HOOK 01AD ; Ll # LATIN SMALL LETTER T WITH HOOK 01B0 ; Ll # LATIN SMALL LETTER U WITH HORN 01B4 ; Ll # LATIN SMALL LETTER Y WITH HOOK 01B6 ; Ll # LATIN SMALL LETTER Z WITH STROKE 01B9..01BA ; Ll # [2] LATIN SMALL LETTER EZH REVERSED..LATIN SMALL LETTER EZH WITH TAIL 01BD..01BF ; Ll # [3] LATIN SMALL LETTER TONE FIVE..LATIN LETTER WYNN 01C6 ; Ll # LATIN SMALL LETTER DZ WITH CARON 01C9 ; Ll # LATIN SMALL LETTER LJ 01CC ; Ll # LATIN SMALL LETTER NJ 01CE ; Ll # LATIN SMALL LETTER A WITH CARON 01D0 ; Ll # LATIN SMALL LETTER I WITH CARON 01D2 ; Ll # LATIN SMALL LETTER O WITH CARON 01D4 ; Ll # LATIN SMALL LETTER U WITH CARON 01D6 ; Ll # LATIN SMALL LETTER U WITH DIAERESIS AND MACRON 01D8 ; Ll # LATIN SMALL LETTER U WITH DIAERESIS AND ACUTE 01DA ; Ll # LATIN SMALL LETTER U WITH DIAERESIS AND CARON 01DC..01DD ; Ll # [2] LATIN SMALL LETTER U WITH DIAERESIS AND GRAVE..LATIN SMALL LETTER TURNED E 01DF ; Ll # LATIN SMALL LETTER A WITH DIAERESIS AND MACRON 01E1 ; Ll # LATIN SMALL LETTER A WITH DOT ABOVE AND MACRON 01E3 ; Ll # LATIN SMALL LETTER AE WITH MACRON 01E5 ; Ll # LATIN SMALL LETTER G WITH STROKE 01E7 ; Ll # LATIN SMALL LETTER G WITH CARON 01E9 ; Ll # LATIN SMALL LETTER K WITH CARON 01EB ; Ll # LATIN SMALL LETTER O WITH OGONEK 01ED ; Ll # LATIN SMALL LETTER O WITH OGONEK AND MACRON 01EF..01F0 ; Ll # [2] LATIN SMALL LETTER EZH WITH CARON..LATIN SMALL LETTER J WITH CARON 01F3 ; Ll # LATIN SMALL LETTER DZ 01F5 ; Ll # LATIN SMALL LETTER G WITH ACUTE 01F9 ; Ll # LATIN SMALL LETTER N WITH GRAVE 01FB ; Ll # LATIN SMALL LETTER A WITH RING ABOVE AND ACUTE 01FD ; Ll # LATIN SMALL LETTER AE WITH ACUTE 01FF ; Ll # LATIN SMALL LETTER O WITH STROKE AND ACUTE 0201 ; Ll # LATIN SMALL LETTER A WITH DOUBLE GRAVE 0203 ; Ll # LATIN SMALL LETTER A WITH INVERTED BREVE 0205 ; Ll # LATIN SMALL LETTER E WITH DOUBLE GRAVE 0207 ; Ll # LATIN SMALL LETTER 
E WITH INVERTED BREVE 0209 ; Ll # LATIN SMALL LETTER I WITH DOUBLE GRAVE 020B ; Ll # LATIN SMALL LETTER I WITH INVERTED BREVE 020D ; Ll # LATIN SMALL LETTER O WITH DOUBLE GRAVE 020F ; Ll # LATIN SMALL LETTER O WITH INVERTED BREVE 0211 ; Ll # LATIN SMALL LETTER R WITH DOUBLE GRAVE 0213 ; Ll # LATIN SMALL LETTER R WITH INVERTED BREVE 0215 ; Ll # LATIN SMALL LETTER U WITH DOUBLE GRAVE 0217 ; Ll # LATIN SMALL LETTER U WITH INVERTED BREVE 0219 ; Ll # LATIN SMALL LETTER S WITH COMMA BELOW 021B ; Ll # LATIN SMALL LETTER T WITH COMMA BELOW 021D ; Ll # LATIN SMALL LETTER YOGH 021F ; Ll # LATIN SMALL LETTER H WITH CARON 0221 ; Ll # LATIN SMALL LETTER D WITH CURL 0223 ; Ll # LATIN SMALL LETTER OU 0225 ; Ll # LATIN SMALL LETTER Z WITH HOOK 0227 ; Ll # LATIN SMALL LETTER A WITH DOT ABOVE 0229 ; Ll # LATIN SMALL LETTER E WITH CEDILLA 022B ; Ll # LATIN SMALL LETTER O WITH DIAERESIS AND MACRON 022D ; Ll # LATIN SMALL LETTER O WITH TILDE AND MACRON 022F ; Ll # LATIN SMALL LETTER O WITH DOT ABOVE 0231 ; Ll # LATIN SMALL LETTER O WITH DOT ABOVE AND MACRON 0233..0239 ; Ll # [7] LATIN SMALL LETTER Y WITH MACRON..LATIN SMALL LETTER QP DIGRAPH 023C ; Ll # LATIN SMALL LETTER C WITH STROKE 023F..0240 ; Ll # [2] LATIN SMALL LETTER S WITH SWASH TAIL..LATIN SMALL LETTER Z WITH SWASH TAIL 0242 ; Ll # LATIN SMALL LETTER GLOTTAL STOP 0247 ; Ll # LATIN SMALL LETTER E WITH STROKE 0249 ; Ll # LATIN SMALL LETTER J WITH STROKE 024B ; Ll # LATIN SMALL LETTER Q WITH HOOK TAIL 024D ; Ll # LATIN SMALL LETTER R WITH STROKE 024F..0293 ; Ll # [69] LATIN SMALL LETTER Y WITH STROKE..LATIN SMALL LETTER EZH WITH CURL 0296..02AF ; Ll # [26] LATIN LETTER INVERTED GLOTTAL STOP..LATIN SMALL LETTER TURNED H WITH FISHHOOK AND TAIL 0371 ; Ll # GREEK SMALL LETTER HETA 0373 ; Ll # GREEK SMALL LETTER ARCHAIC SAMPI 0377 ; Ll # GREEK SMALL LETTER PAMPHYLIAN DIGAMMA 037B..037D ; Ll # [3] GREEK SMALL REVERSED LUNATE SIGMA SYMBOL..GREEK SMALL REVERSED DOTTED LUNATE SIGMA SYMBOL 0390 ; Ll # GREEK SMALL LETTER IOTA WITH 
DIALYTIKA AND TONOS 03AC..03CE ; Ll # [35] GREEK SMALL LETTER ALPHA WITH TONOS..GREEK SMALL LETTER OMEGA WITH TONOS 03D0..03D1 ; Ll # [2] GREEK BETA SYMBOL..GREEK THETA SYMBOL 03D5..03D7 ; Ll # [3] GREEK PHI SYMBOL..GREEK KAI SYMBOL 03D9 ; Ll # GREEK SMALL LETTER ARCHAIC KOPPA 03DB ; Ll # GREEK SMALL LETTER STIGMA 03DD ; Ll # GREEK SMALL LETTER DIGAMMA 03DF ; Ll # GREEK SMALL LETTER KOPPA 03E1 ; Ll # GREEK SMALL LETTER SAMPI 03E3 ; Ll # COPTIC SMALL LETTER SHEI 03E5 ; Ll # COPTIC SMALL LETTER FEI 03E7 ; Ll # COPTIC SMALL LETTER KHEI 03E9 ; Ll # COPTIC SMALL LETTER HORI 03EB ; Ll # COPTIC SMALL LETTER GANGIA 03ED ; Ll # COPTIC SMALL LETTER SHIMA 03EF..03F3 ; Ll # [5] COPTIC SMALL LETTER DEI..GREEK LETTER YOT 03F5 ; Ll # GREEK LUNATE EPSILON SYMBOL 03F8 ; Ll # GREEK SMALL LETTER SHO 03FB..03FC ; Ll # [2] GREEK SMALL LETTER SAN..GREEK RHO WITH STROKE SYMBOL 0430..045F ; Ll # [48] CYRILLIC SMALL LETTER A..CYRILLIC SMALL LETTER DZHE 0461 ; Ll # CYRILLIC SMALL LETTER OMEGA 0463 ; Ll # CYRILLIC SMALL LETTER YAT 0465 ; Ll # CYRILLIC SMALL LETTER IOTIFIED E 0467 ; Ll # CYRILLIC SMALL LETTER LITTLE YUS 0469 ; Ll # CYRILLIC SMALL LETTER IOTIFIED LITTLE YUS 046B ; Ll # CYRILLIC SMALL LETTER BIG YUS 046D ; Ll # CYRILLIC SMALL LETTER IOTIFIED BIG YUS 046F ; Ll # CYRILLIC SMALL LETTER KSI 0471 ; Ll # CYRILLIC SMALL LETTER PSI 0473 ; Ll # CYRILLIC SMALL LETTER FITA 0475 ; Ll # CYRILLIC SMALL LETTER IZHITSA 0477 ; Ll # CYRILLIC SMALL LETTER IZHITSA WITH DOUBLE GRAVE ACCENT 0479 ; Ll # CYRILLIC SMALL LETTER UK 047B ; Ll # CYRILLIC SMALL LETTER ROUND OMEGA 047D ; Ll # CYRILLIC SMALL LETTER OMEGA WITH TITLO 047F ; Ll # CYRILLIC SMALL LETTER OT 0481 ; Ll # CYRILLIC SMALL LETTER KOPPA 048B ; Ll # CYRILLIC SMALL LETTER SHORT I WITH TAIL 048D ; Ll # CYRILLIC SMALL LETTER SEMISOFT SIGN 048F ; Ll # CYRILLIC SMALL LETTER ER WITH TICK 0491 ; Ll # CYRILLIC SMALL LETTER GHE WITH UPTURN 0493 ; Ll # CYRILLIC SMALL LETTER GHE WITH STROKE 0495 ; Ll # CYRILLIC SMALL LETTER GHE WITH MIDDLE HOOK 0497 
; Ll # CYRILLIC SMALL LETTER ZHE WITH DESCENDER 0499 ; Ll # CYRILLIC SMALL LETTER ZE WITH DESCENDER 049B ; Ll # CYRILLIC SMALL LETTER KA WITH DESCENDER 049D ; Ll # CYRILLIC SMALL LETTER KA WITH VERTICAL STROKE 049F ; Ll # CYRILLIC SMALL LETTER KA WITH STROKE 04A1 ; Ll # CYRILLIC SMALL LETTER BASHKIR KA 04A3 ; Ll # CYRILLIC SMALL LETTER EN WITH DESCENDER 04A5 ; Ll # CYRILLIC SMALL LIGATURE EN GHE 04A7 ; Ll # CYRILLIC SMALL LETTER PE WITH MIDDLE HOOK 04A9 ; Ll # CYRILLIC SMALL LETTER ABKHASIAN HA 04AB ; Ll # CYRILLIC SMALL LETTER ES WITH DESCENDER 04AD ; Ll # CYRILLIC SMALL LETTER TE WITH DESCENDER 04AF ; Ll # CYRILLIC SMALL LETTER STRAIGHT U 04B1 ; Ll # CYRILLIC SMALL LETTER STRAIGHT U WITH STROKE 04B3 ; Ll # CYRILLIC SMALL LETTER HA WITH DESCENDER 04B5 ; Ll # CYRILLIC SMALL LIGATURE TE TSE 04B7 ; Ll # CYRILLIC SMALL LETTER CHE WITH DESCENDER 04B9 ; Ll # CYRILLIC SMALL LETTER CHE WITH VERTICAL STROKE 04BB ; Ll # CYRILLIC SMALL LETTER SHHA 04BD ; Ll # CYRILLIC SMALL LETTER ABKHASIAN CHE 04BF ; Ll # CYRILLIC SMALL LETTER ABKHASIAN CHE WITH DESCENDER 04C2 ; Ll # CYRILLIC SMALL LETTER ZHE WITH BREVE 04C4 ; Ll # CYRILLIC SMALL LETTER KA WITH HOOK 04C6 ; Ll # CYRILLIC SMALL LETTER EL WITH TAIL 04C8 ; Ll # CYRILLIC SMALL LETTER EN WITH HOOK 04CA ; Ll # CYRILLIC SMALL LETTER EN WITH TAIL 04CC ; Ll # CYRILLIC SMALL LETTER KHAKASSIAN CHE 04CE..04CF ; Ll # [2] CYRILLIC SMALL LETTER EM WITH TAIL..CYRILLIC SMALL LETTER PALOCHKA 04D1 ; Ll # CYRILLIC SMALL LETTER A WITH BREVE 04D3 ; Ll # CYRILLIC SMALL LETTER A WITH DIAERESIS 04D5 ; Ll # CYRILLIC SMALL LIGATURE A IE 04D7 ; Ll # CYRILLIC SMALL LETTER IE WITH BREVE 04D9 ; Ll # CYRILLIC SMALL LETTER SCHWA 04DB ; Ll # CYRILLIC SMALL LETTER SCHWA WITH DIAERESIS 04DD ; Ll # CYRILLIC SMALL LETTER ZHE WITH DIAERESIS 04DF ; Ll # CYRILLIC SMALL LETTER ZE WITH DIAERESIS 04E1 ; Ll # CYRILLIC SMALL LETTER ABKHASIAN DZE 04E3 ; Ll # CYRILLIC SMALL LETTER I WITH MACRON 04E5 ; Ll # CYRILLIC SMALL LETTER I WITH DIAERESIS 04E7 ; Ll # CYRILLIC SMALL 
LETTER O WITH DIAERESIS 04E9 ; Ll # CYRILLIC SMALL LETTER BARRED O 04EB ; Ll # CYRILLIC SMALL LETTER BARRED O WITH DIAERESIS 04ED ; Ll # CYRILLIC SMALL LETTER E WITH DIAERESIS 04EF ; Ll # CYRILLIC SMALL LETTER U WITH MACRON 04F1 ; Ll # CYRILLIC SMALL LETTER U WITH DIAERESIS 04F3 ; Ll # CYRILLIC SMALL LETTER U WITH DOUBLE ACUTE 04F5 ; Ll # CYRILLIC SMALL LETTER CHE WITH DIAERESIS 04F7 ; Ll # CYRILLIC SMALL LETTER GHE WITH DESCENDER 04F9 ; Ll # CYRILLIC SMALL LETTER YERU WITH DIAERESIS 04FB ; Ll # CYRILLIC SMALL LETTER GHE WITH STROKE AND HOOK 04FD ; Ll # CYRILLIC SMALL LETTER HA WITH HOOK 04FF ; Ll # CYRILLIC SMALL LETTER HA WITH STROKE 0501 ; Ll # CYRILLIC SMALL LETTER KOMI DE 0503 ; Ll # CYRILLIC SMALL LETTER KOMI DJE 0505 ; Ll # CYRILLIC SMALL LETTER KOMI ZJE 0507 ; Ll # CYRILLIC SMALL LETTER KOMI DZJE 0509 ; Ll # CYRILLIC SMALL LETTER KOMI LJE 050B ; Ll # CYRILLIC SMALL LETTER KOMI NJE 050D ; Ll # CYRILLIC SMALL LETTER KOMI SJE 050F ; Ll # CYRILLIC SMALL LETTER KOMI TJE 0511 ; Ll # CYRILLIC SMALL LETTER REVERSED ZE 0513 ; Ll # CYRILLIC SMALL LETTER EL WITH HOOK 0515 ; Ll # CYRILLIC SMALL LETTER LHA 0517 ; Ll # CYRILLIC SMALL LETTER RHA 0519 ; Ll # CYRILLIC SMALL LETTER YAE 051B ; Ll # CYRILLIC SMALL LETTER QA 051D ; Ll # CYRILLIC SMALL LETTER WE 051F ; Ll # CYRILLIC SMALL LETTER ALEUT KA 0521 ; Ll # CYRILLIC SMALL LETTER EL WITH MIDDLE HOOK 0523 ; Ll # CYRILLIC SMALL LETTER EN WITH MIDDLE HOOK 0525 ; Ll # CYRILLIC SMALL LETTER PE WITH DESCENDER 0527 ; Ll # CYRILLIC SMALL LETTER SHHA WITH DESCENDER 0529 ; Ll # CYRILLIC SMALL LETTER EN WITH LEFT HOOK 052B ; Ll # CYRILLIC SMALL LETTER DZZHE 052D ; Ll # CYRILLIC SMALL LETTER DCHE 052F ; Ll # CYRILLIC SMALL LETTER EL WITH DESCENDER 0560..0588 ; Ll # [41] ARMENIAN SMALL LETTER TURNED AYB..ARMENIAN SMALL LETTER YI WITH STROKE 10D0..10FA ; Ll # [43] GEORGIAN LETTER AN..GEORGIAN LETTER AIN 10FD..10FF ; Ll # [3] GEORGIAN LETTER AEN..GEORGIAN LETTER LABIAL SIGN 13F8..13FD ; Ll # [6] CHEROKEE SMALL LETTER YE..CHEROKEE SMALL 
LETTER MV 1C80..1C88 ; Ll # [9] CYRILLIC SMALL LETTER ROUNDED VE..CYRILLIC SMALL LETTER UNBLENDED UK 1C8A ; Ll # CYRILLIC SMALL LETTER TJE 1D00..1D2B ; Ll # [44] LATIN LETTER SMALL CAPITAL A..CYRILLIC LETTER SMALL CAPITAL EL 1D6B..1D77 ; Ll # [13] LATIN SMALL LETTER UE..LATIN SMALL LETTER TURNED G 1D79..1D9A ; Ll # [34] LATIN SMALL LETTER INSULAR G..LATIN SMALL LETTER EZH WITH RETROFLEX HOOK 1E01 ; Ll # LATIN SMALL LETTER A WITH RING BELOW 1E03 ; Ll # LATIN SMALL LETTER B WITH DOT ABOVE 1E05 ; Ll # LATIN SMALL LETTER B WITH DOT BELOW 1E07 ; Ll # LATIN SMALL LETTER B WITH LINE BELOW 1E09 ; Ll # LATIN SMALL LETTER C WITH CEDILLA AND ACUTE 1E0B ; Ll # LATIN SMALL LETTER D WITH DOT ABOVE 1E0D ; Ll # LATIN SMALL LETTER D WITH DOT BELOW 1E0F ; Ll # LATIN SMALL LETTER D WITH LINE BELOW 1E11 ; Ll # LATIN SMALL LETTER D WITH CEDILLA 1E13 ; Ll # LATIN SMALL LETTER D WITH CIRCUMFLEX BELOW 1E15 ; Ll # LATIN SMALL LETTER E WITH MACRON AND GRAVE 1E17 ; Ll # LATIN SMALL LETTER E WITH MACRON AND ACUTE 1E19 ; Ll # LATIN SMALL LETTER E WITH CIRCUMFLEX BELOW 1E1B ; Ll # LATIN SMALL LETTER E WITH TILDE BELOW 1E1D ; Ll # LATIN SMALL LETTER E WITH CEDILLA AND BREVE 1E1F ; Ll # LATIN SMALL LETTER F WITH DOT ABOVE 1E21 ; Ll # LATIN SMALL LETTER G WITH MACRON 1E23 ; Ll # LATIN SMALL LETTER H WITH DOT ABOVE 1E25 ; Ll # LATIN SMALL LETTER H WITH DOT BELOW 1E27 ; Ll # LATIN SMALL LETTER H WITH DIAERESIS 1E29 ; Ll # LATIN SMALL LETTER H WITH CEDILLA 1E2B ; Ll # LATIN SMALL LETTER H WITH BREVE BELOW 1E2D ; Ll # LATIN SMALL LETTER I WITH TILDE BELOW 1E2F ; Ll # LATIN SMALL LETTER I WITH DIAERESIS AND ACUTE 1E31 ; Ll # LATIN SMALL LETTER K WITH ACUTE 1E33 ; Ll # LATIN SMALL LETTER K WITH DOT BELOW 1E35 ; Ll # LATIN SMALL LETTER K WITH LINE BELOW 1E37 ; Ll # LATIN SMALL LETTER L WITH DOT BELOW 1E39 ; Ll # LATIN SMALL LETTER L WITH DOT BELOW AND MACRON 1E3B ; Ll # LATIN SMALL LETTER L WITH LINE BELOW 1E3D ; Ll # LATIN SMALL LETTER L WITH CIRCUMFLEX BELOW 1E3F ; Ll # LATIN SMALL LETTER M WITH ACUTE 
1E41 ; Ll # LATIN SMALL LETTER M WITH DOT ABOVE 1E43 ; Ll # LATIN SMALL LETTER M WITH DOT BELOW 1E45 ; Ll # LATIN SMALL LETTER N WITH DOT ABOVE 1E47 ; Ll # LATIN SMALL LETTER N WITH DOT BELOW 1E49 ; Ll # LATIN SMALL LETTER N WITH LINE BELOW 1E4B ; Ll # LATIN SMALL LETTER N WITH CIRCUMFLEX BELOW 1E4D ; Ll # LATIN SMALL LETTER O WITH TILDE AND ACUTE 1E4F ; Ll # LATIN SMALL LETTER O WITH TILDE AND DIAERESIS 1E51 ; Ll # LATIN SMALL LETTER O WITH MACRON AND GRAVE 1E53 ; Ll # LATIN SMALL LETTER O WITH MACRON AND ACUTE 1E55 ; Ll # LATIN SMALL LETTER P WITH ACUTE 1E57 ; Ll # LATIN SMALL LETTER P WITH DOT ABOVE 1E59 ; Ll # LATIN SMALL LETTER R WITH DOT ABOVE 1E5B ; Ll # LATIN SMALL LETTER R WITH DOT BELOW 1E5D ; Ll # LATIN SMALL LETTER R WITH DOT BELOW AND MACRON 1E5F ; Ll # LATIN SMALL LETTER R WITH LINE BELOW 1E61 ; Ll # LATIN SMALL LETTER S WITH DOT ABOVE 1E63 ; Ll # LATIN SMALL LETTER S WITH DOT BELOW 1E65 ; Ll # LATIN SMALL LETTER S WITH ACUTE AND DOT ABOVE 1E67 ; Ll # LATIN SMALL LETTER S WITH CARON AND DOT ABOVE 1E69 ; Ll # LATIN SMALL LETTER S WITH DOT BELOW AND DOT ABOVE 1E6B ; Ll # LATIN SMALL LETTER T WITH DOT ABOVE 1E6D ; Ll # LATIN SMALL LETTER T WITH DOT BELOW 1E6F ; Ll # LATIN SMALL LETTER T WITH LINE BELOW 1E71 ; Ll # LATIN SMALL LETTER T WITH CIRCUMFLEX BELOW 1E73 ; Ll # LATIN SMALL LETTER U WITH DIAERESIS BELOW 1E75 ; Ll # LATIN SMALL LETTER U WITH TILDE BELOW 1E77 ; Ll # LATIN SMALL LETTER U WITH CIRCUMFLEX BELOW 1E79 ; Ll # LATIN SMALL LETTER U WITH TILDE AND ACUTE 1E7B ; Ll # LATIN SMALL LETTER U WITH MACRON AND DIAERESIS 1E7D ; Ll # LATIN SMALL LETTER V WITH TILDE 1E7F ; Ll # LATIN SMALL LETTER V WITH DOT BELOW 1E81 ; Ll # LATIN SMALL LETTER W WITH GRAVE 1E83 ; Ll # LATIN SMALL LETTER W WITH ACUTE 1E85 ; Ll # LATIN SMALL LETTER W WITH DIAERESIS 1E87 ; Ll # LATIN SMALL LETTER W WITH DOT ABOVE 1E89 ; Ll # LATIN SMALL LETTER W WITH DOT BELOW 1E8B ; Ll # LATIN SMALL LETTER X WITH DOT ABOVE 1E8D ; Ll # LATIN SMALL LETTER X WITH DIAERESIS 1E8F ; Ll # LATIN 
SMALL LETTER Y WITH DOT ABOVE 1E91 ; Ll # LATIN SMALL LETTER Z WITH CIRCUMFLEX 1E93 ; Ll # LATIN SMALL LETTER Z WITH DOT BELOW 1E95..1E9D ; Ll # [9] LATIN SMALL LETTER Z WITH LINE BELOW..LATIN SMALL LETTER LONG S WITH HIGH STROKE 1E9F ; Ll # LATIN SMALL LETTER DELTA 1EA1 ; Ll # LATIN SMALL LETTER A WITH DOT BELOW 1EA3 ; Ll # LATIN SMALL LETTER A WITH HOOK ABOVE 1EA5 ; Ll # LATIN SMALL LETTER A WITH CIRCUMFLEX AND ACUTE 1EA7 ; Ll # LATIN SMALL LETTER A WITH CIRCUMFLEX AND GRAVE 1EA9 ; Ll # LATIN SMALL LETTER A WITH CIRCUMFLEX AND HOOK ABOVE 1EAB ; Ll # LATIN SMALL LETTER A WITH CIRCUMFLEX AND TILDE 1EAD ; Ll # LATIN SMALL LETTER A WITH CIRCUMFLEX AND DOT BELOW 1EAF ; Ll # LATIN SMALL LETTER A WITH BREVE AND ACUTE 1EB1 ; Ll # LATIN SMALL LETTER A WITH BREVE AND GRAVE 1EB3 ; Ll # LATIN SMALL LETTER A WITH BREVE AND HOOK ABOVE 1EB5 ; Ll # LATIN SMALL LETTER A WITH BREVE AND TILDE 1EB7 ; Ll # LATIN SMALL LETTER A WITH BREVE AND DOT BELOW 1EB9 ; Ll # LATIN SMALL LETTER E WITH DOT BELOW 1EBB ; Ll # LATIN SMALL LETTER E WITH HOOK ABOVE 1EBD ; Ll # LATIN SMALL LETTER E WITH TILDE 1EBF ; Ll # LATIN SMALL LETTER E WITH CIRCUMFLEX AND ACUTE 1EC1 ; Ll # LATIN SMALL LETTER E WITH CIRCUMFLEX AND GRAVE 1EC3 ; Ll # LATIN SMALL LETTER E WITH CIRCUMFLEX AND HOOK ABOVE 1EC5 ; Ll # LATIN SMALL LETTER E WITH CIRCUMFLEX AND TILDE 1EC7 ; Ll # LATIN SMALL LETTER E WITH CIRCUMFLEX AND DOT BELOW 1EC9 ; Ll # LATIN SMALL LETTER I WITH HOOK ABOVE 1ECB ; Ll # LATIN SMALL LETTER I WITH DOT BELOW 1ECD ; Ll # LATIN SMALL LETTER O WITH DOT BELOW 1ECF ; Ll # LATIN SMALL LETTER O WITH HOOK ABOVE 1ED1 ; Ll # LATIN SMALL LETTER O WITH CIRCUMFLEX AND ACUTE 1ED3 ; Ll # LATIN SMALL LETTER O WITH CIRCUMFLEX AND GRAVE 1ED5 ; Ll # LATIN SMALL LETTER O WITH CIRCUMFLEX AND HOOK ABOVE 1ED7 ; Ll # LATIN SMALL LETTER O WITH CIRCUMFLEX AND TILDE 1ED9 ; Ll # LATIN SMALL LETTER O WITH CIRCUMFLEX AND DOT BELOW 1EDB ; Ll # LATIN SMALL LETTER O WITH HORN AND ACUTE 1EDD ; Ll # LATIN SMALL LETTER O WITH HORN AND GRAVE 
1EDF ; Ll # LATIN SMALL LETTER O WITH HORN AND HOOK ABOVE 1EE1 ; Ll # LATIN SMALL LETTER O WITH HORN AND TILDE 1EE3 ; Ll # LATIN SMALL LETTER O WITH HORN AND DOT BELOW 1EE5 ; Ll # LATIN SMALL LETTER U WITH DOT BELOW 1EE7 ; Ll # LATIN SMALL LETTER U WITH HOOK ABOVE 1EE9 ; Ll # LATIN SMALL LETTER U WITH HORN AND ACUTE 1EEB ; Ll # LATIN SMALL LETTER U WITH HORN AND GRAVE 1EED ; Ll # LATIN SMALL LETTER U WITH HORN AND HOOK ABOVE 1EEF ; Ll # LATIN SMALL LETTER U WITH HORN AND TILDE 1EF1 ; Ll # LATIN SMALL LETTER U WITH HORN AND DOT BELOW 1EF3 ; Ll # LATIN SMALL LETTER Y WITH GRAVE 1EF5 ; Ll # LATIN SMALL LETTER Y WITH DOT BELOW 1EF7 ; Ll # LATIN SMALL LETTER Y WITH HOOK ABOVE 1EF9 ; Ll # LATIN SMALL LETTER Y WITH TILDE 1EFB ; Ll # LATIN SMALL LETTER MIDDLE-WELSH LL 1EFD ; Ll # LATIN SMALL LETTER MIDDLE-WELSH V 1EFF..1F07 ; Ll # [9] LATIN SMALL LETTER Y WITH LOOP..GREEK SMALL LETTER ALPHA WITH DASIA AND PERISPOMENI 1F10..1F15 ; Ll # [6] GREEK SMALL LETTER EPSILON WITH PSILI..GREEK SMALL LETTER EPSILON WITH DASIA AND OXIA 1F20..1F27 ; Ll # [8] GREEK SMALL LETTER ETA WITH PSILI..GREEK SMALL LETTER ETA WITH DASIA AND PERISPOMENI 1F30..1F37 ; Ll # [8] GREEK SMALL LETTER IOTA WITH PSILI..GREEK SMALL LETTER IOTA WITH DASIA AND PERISPOMENI 1F40..1F45 ; Ll # [6] GREEK SMALL LETTER OMICRON WITH PSILI..GREEK SMALL LETTER OMICRON WITH DASIA AND OXIA 1F50..1F57 ; Ll # [8] GREEK SMALL LETTER UPSILON WITH PSILI..GREEK SMALL LETTER UPSILON WITH DASIA AND PERISPOMENI 1F60..1F67 ; Ll # [8] GREEK SMALL LETTER OMEGA WITH PSILI..GREEK SMALL LETTER OMEGA WITH DASIA AND PERISPOMENI 1F70..1F7D ; Ll # [14] GREEK SMALL LETTER ALPHA WITH VARIA..GREEK SMALL LETTER OMEGA WITH OXIA 1F80..1F87 ; Ll # [8] GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI..GREEK SMALL LETTER ALPHA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI 1F90..1F97 ; Ll # [8] GREEK SMALL LETTER ETA WITH PSILI AND YPOGEGRAMMENI..GREEK SMALL LETTER ETA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI 1FA0..1FA7 ; Ll # [8] GREEK SMALL 
LETTER OMEGA WITH PSILI AND YPOGEGRAMMENI..GREEK SMALL LETTER OMEGA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI 1FB0..1FB4 ; Ll # [5] GREEK SMALL LETTER ALPHA WITH VRACHY..GREEK SMALL LETTER ALPHA WITH OXIA AND YPOGEGRAMMENI 1FB6..1FB7 ; Ll # [2] GREEK SMALL LETTER ALPHA WITH PERISPOMENI..GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI 1FBE ; Ll # GREEK PROSGEGRAMMENI 1FC2..1FC4 ; Ll # [3] GREEK SMALL LETTER ETA WITH VARIA AND YPOGEGRAMMENI..GREEK SMALL LETTER ETA WITH OXIA AND YPOGEGRAMMENI 1FC6..1FC7 ; Ll # [2] GREEK SMALL LETTER ETA WITH PERISPOMENI..GREEK SMALL LETTER ETA WITH PERISPOMENI AND YPOGEGRAMMENI 1FD0..1FD3 ; Ll # [4] GREEK SMALL LETTER IOTA WITH VRACHY..GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA 1FD6..1FD7 ; Ll # [2] GREEK SMALL LETTER IOTA WITH PERISPOMENI..GREEK SMALL LETTER IOTA WITH DIALYTIKA AND PERISPOMENI 1FE0..1FE7 ; Ll # [8] GREEK SMALL LETTER UPSILON WITH VRACHY..GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND PERISPOMENI 1FF2..1FF4 ; Ll # [3] GREEK SMALL LETTER OMEGA WITH VARIA AND YPOGEGRAMMENI..GREEK SMALL LETTER OMEGA WITH OXIA AND YPOGEGRAMMENI 1FF6..1FF7 ; Ll # [2] GREEK SMALL LETTER OMEGA WITH PERISPOMENI..GREEK SMALL LETTER OMEGA WITH PERISPOMENI AND YPOGEGRAMMENI 210A ; Ll # SCRIPT SMALL G 210E..210F ; Ll # [2] PLANCK CONSTANT..PLANCK CONSTANT OVER TWO PI 2113 ; Ll # SCRIPT SMALL L 212F ; Ll # SCRIPT SMALL E 2134 ; Ll # SCRIPT SMALL O 2139 ; Ll # INFORMATION SOURCE 213C..213D ; Ll # [2] DOUBLE-STRUCK SMALL PI..DOUBLE-STRUCK SMALL GAMMA 2146..2149 ; Ll # [4] DOUBLE-STRUCK ITALIC SMALL D..DOUBLE-STRUCK ITALIC SMALL J 214E ; Ll # TURNED SMALL F 2184 ; Ll # LATIN SMALL LETTER REVERSED C 2C30..2C5F ; Ll # [48] GLAGOLITIC SMALL LETTER AZU..GLAGOLITIC SMALL LETTER CAUDATE CHRIVI 2C61 ; Ll # LATIN SMALL LETTER L WITH DOUBLE BAR 2C65..2C66 ; Ll # [2] LATIN SMALL LETTER A WITH STROKE..LATIN SMALL LETTER T WITH DIAGONAL STROKE 2C68 ; Ll # LATIN SMALL LETTER H WITH DESCENDER 2C6A ; Ll # LATIN SMALL LETTER K WITH DESCENDER 2C6C 
; Ll # LATIN SMALL LETTER Z WITH DESCENDER 2C71 ; Ll # LATIN SMALL LETTER V WITH RIGHT HOOK 2C73..2C74 ; Ll # [2] LATIN SMALL LETTER W WITH HOOK..LATIN SMALL LETTER V WITH CURL 2C76..2C7B ; Ll # [6] LATIN SMALL LETTER HALF H..LATIN LETTER SMALL CAPITAL TURNED E 2C81 ; Ll # COPTIC SMALL LETTER ALFA 2C83 ; Ll # COPTIC SMALL LETTER VIDA 2C85 ; Ll # COPTIC SMALL LETTER GAMMA 2C87 ; Ll # COPTIC SMALL LETTER DALDA 2C89 ; Ll # COPTIC SMALL LETTER EIE 2C8B ; Ll # COPTIC SMALL LETTER SOU 2C8D ; Ll # COPTIC SMALL LETTER ZATA 2C8F ; Ll # COPTIC SMALL LETTER HATE 2C91 ; Ll # COPTIC SMALL LETTER THETHE 2C93 ; Ll # COPTIC SMALL LETTER IAUDA 2C95 ; Ll # COPTIC SMALL LETTER KAPA 2C97 ; Ll # COPTIC SMALL LETTER LAULA 2C99 ; Ll # COPTIC SMALL LETTER MI 2C9B ; Ll # COPTIC SMALL LETTER NI 2C9D ; Ll # COPTIC SMALL LETTER KSI 2C9F ; Ll # COPTIC SMALL LETTER O 2CA1 ; Ll # COPTIC SMALL LETTER PI 2CA3 ; Ll # COPTIC SMALL LETTER RO 2CA5 ; Ll # COPTIC SMALL LETTER SIMA 2CA7 ; Ll # COPTIC SMALL LETTER TAU 2CA9 ; Ll # COPTIC SMALL LETTER UA 2CAB ; Ll # COPTIC SMALL LETTER FI 2CAD ; Ll # COPTIC SMALL LETTER KHI 2CAF ; Ll # COPTIC SMALL LETTER PSI 2CB1 ; Ll # COPTIC SMALL LETTER OOU 2CB3 ; Ll # COPTIC SMALL LETTER DIALECT-P ALEF 2CB5 ; Ll # COPTIC SMALL LETTER OLD COPTIC AIN 2CB7 ; Ll # COPTIC SMALL LETTER CRYPTOGRAMMIC EIE 2CB9 ; Ll # COPTIC SMALL LETTER DIALECT-P KAPA 2CBB ; Ll # COPTIC SMALL LETTER DIALECT-P NI 2CBD ; Ll # COPTIC SMALL LETTER CRYPTOGRAMMIC NI 2CBF ; Ll # COPTIC SMALL LETTER OLD COPTIC OOU 2CC1 ; Ll # COPTIC SMALL LETTER SAMPI 2CC3 ; Ll # COPTIC SMALL LETTER CROSSED SHEI 2CC5 ; Ll # COPTIC SMALL LETTER OLD COPTIC SHEI 2CC7 ; Ll # COPTIC SMALL LETTER OLD COPTIC ESH 2CC9 ; Ll # COPTIC SMALL LETTER AKHMIMIC KHEI 2CCB ; Ll # COPTIC SMALL LETTER DIALECT-P HORI 2CCD ; Ll # COPTIC SMALL LETTER OLD COPTIC HORI 2CCF ; Ll # COPTIC SMALL LETTER OLD COPTIC HA 2CD1 ; Ll # COPTIC SMALL LETTER L-SHAPED HA 2CD3 ; Ll # COPTIC SMALL LETTER OLD COPTIC HEI 2CD5 ; Ll # COPTIC SMALL LETTER OLD 
COPTIC HAT 2CD7 ; Ll # COPTIC SMALL LETTER OLD COPTIC GANGIA 2CD9 ; Ll # COPTIC SMALL LETTER OLD COPTIC DJA 2CDB ; Ll # COPTIC SMALL LETTER OLD COPTIC SHIMA 2CDD ; Ll # COPTIC SMALL LETTER OLD NUBIAN SHIMA 2CDF ; Ll # COPTIC SMALL LETTER OLD NUBIAN NGI 2CE1 ; Ll # COPTIC SMALL LETTER OLD NUBIAN NYI 2CE3..2CE4 ; Ll # [2] COPTIC SMALL LETTER OLD NUBIAN WAU..COPTIC SYMBOL KAI 2CEC ; Ll # COPTIC SMALL LETTER CRYPTOGRAMMIC SHEI 2CEE ; Ll # COPTIC SMALL LETTER CRYPTOGRAMMIC GANGIA 2CF3 ; Ll # COPTIC SMALL LETTER BOHAIRIC KHEI 2D00..2D25 ; Ll # [38] GEORGIAN SMALL LETTER AN..GEORGIAN SMALL LETTER HOE 2D27 ; Ll # GEORGIAN SMALL LETTER YN 2D2D ; Ll # GEORGIAN SMALL LETTER AEN A641 ; Ll # CYRILLIC SMALL LETTER ZEMLYA A643 ; Ll # CYRILLIC SMALL LETTER DZELO A645 ; Ll # CYRILLIC SMALL LETTER REVERSED DZE A647 ; Ll # CYRILLIC SMALL LETTER IOTA A649 ; Ll # CYRILLIC SMALL LETTER DJERV A64B ; Ll # CYRILLIC SMALL LETTER MONOGRAPH UK A64D ; Ll # CYRILLIC SMALL LETTER BROAD OMEGA A64F ; Ll # CYRILLIC SMALL LETTER NEUTRAL YER A651 ; Ll # CYRILLIC SMALL LETTER YERU WITH BACK YER A653 ; Ll # CYRILLIC SMALL LETTER IOTIFIED YAT A655 ; Ll # CYRILLIC SMALL LETTER REVERSED YU A657 ; Ll # CYRILLIC SMALL LETTER IOTIFIED A A659 ; Ll # CYRILLIC SMALL LETTER CLOSED LITTLE YUS A65B ; Ll # CYRILLIC SMALL LETTER BLENDED YUS A65D ; Ll # CYRILLIC SMALL LETTER IOTIFIED CLOSED LITTLE YUS A65F ; Ll # CYRILLIC SMALL LETTER YN A661 ; Ll # CYRILLIC SMALL LETTER REVERSED TSE A663 ; Ll # CYRILLIC SMALL LETTER SOFT DE A665 ; Ll # CYRILLIC SMALL LETTER SOFT EL A667 ; Ll # CYRILLIC SMALL LETTER SOFT EM A669 ; Ll # CYRILLIC SMALL LETTER MONOCULAR O A66B ; Ll # CYRILLIC SMALL LETTER BINOCULAR O A66D ; Ll # CYRILLIC SMALL LETTER DOUBLE MONOCULAR O A681 ; Ll # CYRILLIC SMALL LETTER DWE A683 ; Ll # CYRILLIC SMALL LETTER DZWE A685 ; Ll # CYRILLIC SMALL LETTER ZHWE A687 ; Ll # CYRILLIC SMALL LETTER CCHE A689 ; Ll # CYRILLIC SMALL LETTER DZZE A68B ; Ll # CYRILLIC SMALL LETTER TE WITH MIDDLE HOOK A68D ; Ll # CYRILLIC 
SMALL LETTER TWE A68F ; Ll # CYRILLIC SMALL LETTER TSWE A691 ; Ll # CYRILLIC SMALL LETTER TSSE A693 ; Ll # CYRILLIC SMALL LETTER TCHE A695 ; Ll # CYRILLIC SMALL LETTER HWE A697 ; Ll # CYRILLIC SMALL LETTER SHWE A699 ; Ll # CYRILLIC SMALL LETTER DOUBLE O A69B ; Ll # CYRILLIC SMALL LETTER CROSSED O A723 ; Ll # LATIN SMALL LETTER EGYPTOLOGICAL ALEF A725 ; Ll # LATIN SMALL LETTER EGYPTOLOGICAL AIN A727 ; Ll # LATIN SMALL LETTER HENG A729 ; Ll # LATIN SMALL LETTER TZ A72B ; Ll # LATIN SMALL LETTER TRESILLO A72D ; Ll # LATIN SMALL LETTER CUATRILLO A72F..A731 ; Ll # [3] LATIN SMALL LETTER CUATRILLO WITH COMMA..LATIN LETTER SMALL CAPITAL S A733 ; Ll # LATIN SMALL LETTER AA A735 ; Ll # LATIN SMALL LETTER AO A737 ; Ll # LATIN SMALL LETTER AU A739 ; Ll # LATIN SMALL LETTER AV A73B ; Ll # LATIN SMALL LETTER AV WITH HORIZONTAL BAR A73D ; Ll # LATIN SMALL LETTER AY A73F ; Ll # LATIN SMALL LETTER REVERSED C WITH DOT A741 ; Ll # LATIN SMALL LETTER K WITH STROKE A743 ; Ll # LATIN SMALL LETTER K WITH DIAGONAL STROKE A745 ; Ll # LATIN SMALL LETTER K WITH STROKE AND DIAGONAL STROKE A747 ; Ll # LATIN SMALL LETTER BROKEN L A749 ; Ll # LATIN SMALL LETTER L WITH HIGH STROKE A74B ; Ll # LATIN SMALL LETTER O WITH LONG STROKE OVERLAY A74D ; Ll # LATIN SMALL LETTER O WITH LOOP A74F ; Ll # LATIN SMALL LETTER OO A751 ; Ll # LATIN SMALL LETTER P WITH STROKE THROUGH DESCENDER A753 ; Ll # LATIN SMALL LETTER P WITH FLOURISH A755 ; Ll # LATIN SMALL LETTER P WITH SQUIRREL TAIL A757 ; Ll # LATIN SMALL LETTER Q WITH STROKE THROUGH DESCENDER A759 ; Ll # LATIN SMALL LETTER Q WITH DIAGONAL STROKE A75B ; Ll # LATIN SMALL LETTER R ROTUNDA A75D ; Ll # LATIN SMALL LETTER RUM ROTUNDA A75F ; Ll # LATIN SMALL LETTER V WITH DIAGONAL STROKE A761 ; Ll # LATIN SMALL LETTER VY A763 ; Ll # LATIN SMALL LETTER VISIGOTHIC Z A765 ; Ll # LATIN SMALL LETTER THORN WITH STROKE A767 ; Ll # LATIN SMALL LETTER THORN WITH STROKE THROUGH DESCENDER A769 ; Ll # LATIN SMALL LETTER VEND A76B ; Ll # LATIN SMALL LETTER ET A76D ; Ll # 
LATIN SMALL LETTER IS A76F ; Ll # LATIN SMALL LETTER CON A771..A778 ; Ll # [8] LATIN SMALL LETTER DUM..LATIN SMALL LETTER UM A77A ; Ll # LATIN SMALL LETTER INSULAR D A77C ; Ll # LATIN SMALL LETTER INSULAR F A77F ; Ll # LATIN SMALL LETTER TURNED INSULAR G A781 ; Ll # LATIN SMALL LETTER TURNED L A783 ; Ll # LATIN SMALL LETTER INSULAR R A785 ; Ll # LATIN SMALL LETTER INSULAR S A787 ; Ll # LATIN SMALL LETTER INSULAR T A78C ; Ll # LATIN SMALL LETTER SALTILLO A78E ; Ll # LATIN SMALL LETTER L WITH RETROFLEX HOOK AND BELT A791 ; Ll # LATIN SMALL LETTER N WITH DESCENDER A793..A795 ; Ll # [3] LATIN SMALL LETTER C WITH BAR..LATIN SMALL LETTER H WITH PALATAL HOOK A797 ; Ll # LATIN SMALL LETTER B WITH FLOURISH A799 ; Ll # LATIN SMALL LETTER F WITH STROKE A79B ; Ll # LATIN SMALL LETTER VOLAPUK AE A79D ; Ll # LATIN SMALL LETTER VOLAPUK OE A79F ; Ll # LATIN SMALL LETTER VOLAPUK UE A7A1 ; Ll # LATIN SMALL LETTER G WITH OBLIQUE STROKE A7A3 ; Ll # LATIN SMALL LETTER K WITH OBLIQUE STROKE A7A5 ; Ll # LATIN SMALL LETTER N WITH OBLIQUE STROKE A7A7 ; Ll # LATIN SMALL LETTER R WITH OBLIQUE STROKE A7A9 ; Ll # LATIN SMALL LETTER S WITH OBLIQUE STROKE A7AF ; Ll # LATIN LETTER SMALL CAPITAL Q A7B5 ; Ll # LATIN SMALL LETTER BETA A7B7 ; Ll # LATIN SMALL LETTER OMEGA A7B9 ; Ll # LATIN SMALL LETTER U WITH STROKE A7BB ; Ll # LATIN SMALL LETTER GLOTTAL A A7BD ; Ll # LATIN SMALL LETTER GLOTTAL I A7BF ; Ll # LATIN SMALL LETTER GLOTTAL U A7C1 ; Ll # LATIN SMALL LETTER OLD POLISH O A7C3 ; Ll # LATIN SMALL LETTER ANGLICANA W A7C8 ; Ll # LATIN SMALL LETTER D WITH SHORT STROKE OVERLAY A7CA ; Ll # LATIN SMALL LETTER S WITH SHORT STROKE OVERLAY A7CD ; Ll # LATIN SMALL LETTER S WITH DIAGONAL STROKE A7CF ; Ll # LATIN SMALL LETTER PHARYNGEAL VOICED FRICATIVE A7D1 ; Ll # LATIN SMALL LETTER CLOSED INSULAR G A7D3 ; Ll # LATIN SMALL LETTER DOUBLE THORN A7D5 ; Ll # LATIN SMALL LETTER DOUBLE WYNN A7D7 ; Ll # LATIN SMALL LETTER MIDDLE SCOTS S A7D9 ; Ll # LATIN SMALL LETTER SIGMOID S A7DB ; Ll # LATIN SMALL LETTER 
LAMBDA A7F6 ; Ll # LATIN SMALL LETTER REVERSED HALF H A7FA ; Ll # LATIN LETTER SMALL CAPITAL TURNED M AB30..AB5A ; Ll # [43] LATIN SMALL LETTER BARRED ALPHA..LATIN SMALL LETTER Y WITH SHORT RIGHT LEG AB60..AB68 ; Ll # [9] LATIN SMALL LETTER SAKHA YAT..LATIN SMALL LETTER TURNED R WITH MIDDLE TILDE AB70..ABBF ; Ll # [80] CHEROKEE SMALL LETTER A..CHEROKEE SMALL LETTER YA FB00..FB06 ; Ll # [7] LATIN SMALL LIGATURE FF..LATIN SMALL LIGATURE ST FB13..FB17 ; Ll # [5] ARMENIAN SMALL LIGATURE MEN NOW..ARMENIAN SMALL LIGATURE MEN XEH FF41..FF5A ; Ll # [26] FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH LATIN SMALL LETTER Z 10428..1044F ; Ll # [40] DESERET SMALL LETTER LONG I..DESERET SMALL LETTER EW 104D8..104FB ; Ll # [36] OSAGE SMALL LETTER A..OSAGE SMALL LETTER ZHA 10597..105A1 ; Ll # [11] VITHKUQI SMALL LETTER A..VITHKUQI SMALL LETTER GA 105A3..105B1 ; Ll # [15] VITHKUQI SMALL LETTER HA..VITHKUQI SMALL LETTER RE 105B3..105B9 ; Ll # [7] VITHKUQI SMALL LETTER SE..VITHKUQI SMALL LETTER XE 105BB..105BC ; Ll # [2] VITHKUQI SMALL LETTER Y..VITHKUQI SMALL LETTER ZE 10CC0..10CF2 ; Ll # [51] OLD HUNGARIAN SMALL LETTER A..OLD HUNGARIAN SMALL LETTER US 10D70..10D85 ; Ll # [22] GARAY SMALL LETTER A..GARAY SMALL LETTER OLD NA 118C0..118DF ; Ll # [32] WARANG CITI SMALL LETTER NGAA..WARANG CITI SMALL LETTER VIYO 16E60..16E7F ; Ll # [32] MEDEFAIDRIN SMALL LETTER M..MEDEFAIDRIN SMALL LETTER Y 16EBB..16ED3 ; Ll # [25] BERIA ERFE SMALL LETTER ARKAB..BERIA ERFE SMALL LETTER AY 1D41A..1D433 ; Ll # [26] MATHEMATICAL BOLD SMALL A..MATHEMATICAL BOLD SMALL Z 1D44E..1D454 ; Ll # [7] MATHEMATICAL ITALIC SMALL A..MATHEMATICAL ITALIC SMALL G 1D456..1D467 ; Ll # [18] MATHEMATICAL ITALIC SMALL I..MATHEMATICAL ITALIC SMALL Z 1D482..1D49B ; Ll # [26] MATHEMATICAL BOLD ITALIC SMALL A..MATHEMATICAL BOLD ITALIC SMALL Z 1D4B6..1D4B9 ; Ll # [4] MATHEMATICAL SCRIPT SMALL A..MATHEMATICAL SCRIPT SMALL D 1D4BB ; Ll # MATHEMATICAL SCRIPT SMALL F 1D4BD..1D4C3 ; Ll # [7] MATHEMATICAL SCRIPT SMALL H..MATHEMATICAL SCRIPT 
SMALL N 1D4C5..1D4CF ; Ll # [11] MATHEMATICAL SCRIPT SMALL P..MATHEMATICAL SCRIPT SMALL Z 1D4EA..1D503 ; Ll # [26] MATHEMATICAL BOLD SCRIPT SMALL A..MATHEMATICAL BOLD SCRIPT SMALL Z 1D51E..1D537 ; Ll # [26] MATHEMATICAL FRAKTUR SMALL A..MATHEMATICAL FRAKTUR SMALL Z 1D552..1D56B ; Ll # [26] MATHEMATICAL DOUBLE-STRUCK SMALL A..MATHEMATICAL DOUBLE-STRUCK SMALL Z 1D586..1D59F ; Ll # [26] MATHEMATICAL BOLD FRAKTUR SMALL A..MATHEMATICAL BOLD FRAKTUR SMALL Z 1D5BA..1D5D3 ; Ll # [26] MATHEMATICAL SANS-SERIF SMALL A..MATHEMATICAL SANS-SERIF SMALL Z 1D5EE..1D607 ; Ll # [26] MATHEMATICAL SANS-SERIF BOLD SMALL A..MATHEMATICAL SANS-SERIF BOLD SMALL Z 1D622..1D63B ; Ll # [26] MATHEMATICAL SANS-SERIF ITALIC SMALL A..MATHEMATICAL SANS-SERIF ITALIC SMALL Z 1D656..1D66F ; Ll # [26] MATHEMATICAL SANS-SERIF BOLD ITALIC SMALL A..MATHEMATICAL SANS-SERIF BOLD ITALIC SMALL Z 1D68A..1D6A5 ; Ll # [28] MATHEMATICAL MONOSPACE SMALL A..MATHEMATICAL ITALIC SMALL DOTLESS J 1D6C2..1D6DA ; Ll # [25] MATHEMATICAL BOLD SMALL ALPHA..MATHEMATICAL BOLD SMALL OMEGA 1D6DC..1D6E1 ; Ll # [6] MATHEMATICAL BOLD EPSILON SYMBOL..MATHEMATICAL BOLD PI SYMBOL 1D6FC..1D714 ; Ll # [25] MATHEMATICAL ITALIC SMALL ALPHA..MATHEMATICAL ITALIC SMALL OMEGA 1D716..1D71B ; Ll # [6] MATHEMATICAL ITALIC EPSILON SYMBOL..MATHEMATICAL ITALIC PI SYMBOL 1D736..1D74E ; Ll # [25] MATHEMATICAL BOLD ITALIC SMALL ALPHA..MATHEMATICAL BOLD ITALIC SMALL OMEGA 1D750..1D755 ; Ll # [6] MATHEMATICAL BOLD ITALIC EPSILON SYMBOL..MATHEMATICAL BOLD ITALIC PI SYMBOL 1D770..1D788 ; Ll # [25] MATHEMATICAL SANS-SERIF BOLD SMALL ALPHA..MATHEMATICAL SANS-SERIF BOLD SMALL OMEGA 1D78A..1D78F ; Ll # [6] MATHEMATICAL SANS-SERIF BOLD EPSILON SYMBOL..MATHEMATICAL SANS-SERIF BOLD PI SYMBOL 1D7AA..1D7C2 ; Ll # [25] MATHEMATICAL SANS-SERIF BOLD ITALIC SMALL ALPHA..MATHEMATICAL SANS-SERIF BOLD ITALIC SMALL OMEGA 1D7C4..1D7C9 ; Ll # [6] MATHEMATICAL SANS-SERIF BOLD ITALIC EPSILON SYMBOL..MATHEMATICAL SANS-SERIF BOLD ITALIC PI SYMBOL 1D7CB ; Ll # MATHEMATICAL BOLD 
SMALL DIGAMMA 1DF00..1DF09 ; Ll # [10] LATIN SMALL LETTER FENG DIGRAPH WITH TRILL..LATIN SMALL LETTER T WITH HOOK AND RETROFLEX HOOK 1DF0B..1DF1E ; Ll # [20] LATIN SMALL LETTER ESH WITH DOUBLE BAR..LATIN SMALL LETTER S WITH CURL 1DF25..1DF2A ; Ll # [6] LATIN SMALL LETTER D WITH MID-HEIGHT LEFT HOOK..LATIN SMALL LETTER T WITH MID-HEIGHT LEFT HOOK 1E922..1E943 ; Ll # [34] ADLAM SMALL LETTER ALIF..ADLAM SMALL LETTER SHA # Total code points: 2283 # ================================================ # General_Category=Titlecase_Letter 01C5 ; Lt # LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON 01C8 ; Lt # LATIN CAPITAL LETTER L WITH SMALL LETTER J 01CB ; Lt # LATIN CAPITAL LETTER N WITH SMALL LETTER J 01F2 ; Lt # LATIN CAPITAL LETTER D WITH SMALL LETTER Z 1F88..1F8F ; Lt # [8] GREEK CAPITAL LETTER ALPHA WITH PSILI AND PROSGEGRAMMENI..GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI 1F98..1F9F ; Lt # [8] GREEK CAPITAL LETTER ETA WITH PSILI AND PROSGEGRAMMENI..GREEK CAPITAL LETTER ETA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI 1FA8..1FAF ; Lt # [8] GREEK CAPITAL LETTER OMEGA WITH PSILI AND PROSGEGRAMMENI..GREEK CAPITAL LETTER OMEGA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI 1FBC ; Lt # GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI 1FCC ; Lt # GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI 1FFC ; Lt # GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI # Total code points: 31 # ================================================ # General_Category=Modifier_Letter 02B0..02C1 ; Lm # [18] MODIFIER LETTER SMALL H..MODIFIER LETTER REVERSED GLOTTAL STOP 02C6..02D1 ; Lm # [12] MODIFIER LETTER CIRCUMFLEX ACCENT..MODIFIER LETTER HALF TRIANGULAR COLON 02E0..02E4 ; Lm # [5] MODIFIER LETTER SMALL GAMMA..MODIFIER LETTER SMALL REVERSED GLOTTAL STOP 02EC ; Lm # MODIFIER LETTER VOICING 02EE ; Lm # MODIFIER LETTER DOUBLE APOSTROPHE 0374 ; Lm # GREEK NUMERAL SIGN 037A ; Lm # GREEK YPOGEGRAMMENI 0559 ; Lm # ARMENIAN MODIFIER LETTER LEFT HALF RING 0640 ; Lm # ARABIC 
TATWEEL 06E5..06E6 ; Lm # [2] ARABIC SMALL WAW..ARABIC SMALL YEH 07F4..07F5 ; Lm # [2] NKO HIGH TONE APOSTROPHE..NKO LOW TONE APOSTROPHE 07FA ; Lm # NKO LAJANYALAN 081A ; Lm # SAMARITAN MODIFIER LETTER EPENTHETIC YUT 0824 ; Lm # SAMARITAN MODIFIER LETTER SHORT A 0828 ; Lm # SAMARITAN MODIFIER LETTER I 08C9 ; Lm # ARABIC SMALL FARSI YEH 0971 ; Lm # DEVANAGARI SIGN HIGH SPACING DOT 0E46 ; Lm # THAI CHARACTER MAIYAMOK 0EC6 ; Lm # LAO KO LA 10FC ; Lm # MODIFIER LETTER GEORGIAN NAR 17D7 ; Lm # KHMER SIGN LEK TOO 1843 ; Lm # MONGOLIAN LETTER TODO LONG VOWEL SIGN 1AA7 ; Lm # TAI THAM SIGN MAI YAMOK 1C78..1C7D ; Lm # [6] OL CHIKI MU TTUDDAG..OL CHIKI AHAD 1D2C..1D6A ; Lm # [63] MODIFIER LETTER CAPITAL A..GREEK SUBSCRIPT SMALL LETTER CHI 1D78 ; Lm # MODIFIER LETTER CYRILLIC EN 1D9B..1DBF ; Lm # [37] MODIFIER LETTER SMALL TURNED ALPHA..MODIFIER LETTER SMALL THETA 2071 ; Lm # SUPERSCRIPT LATIN SMALL LETTER I 207F ; Lm # SUPERSCRIPT LATIN SMALL LETTER N 2090..209C ; Lm # [13] LATIN SUBSCRIPT SMALL LETTER A..LATIN SUBSCRIPT SMALL LETTER T 2C7C..2C7D ; Lm # [2] LATIN SUBSCRIPT SMALL LETTER J..MODIFIER LETTER CAPITAL V 2D6F ; Lm # TIFINAGH MODIFIER LETTER LABIALIZATION MARK 2E2F ; Lm # VERTICAL TILDE 3005 ; Lm # IDEOGRAPHIC ITERATION MARK 3031..3035 ; Lm # [5] VERTICAL KANA REPEAT MARK..VERTICAL KANA REPEAT MARK LOWER HALF 303B ; Lm # VERTICAL IDEOGRAPHIC ITERATION MARK 309D..309E ; Lm # [2] HIRAGANA ITERATION MARK..HIRAGANA VOICED ITERATION MARK 30FC..30FE ; Lm # [3] KATAKANA-HIRAGANA PROLONGED SOUND MARK..KATAKANA VOICED ITERATION MARK A015 ; Lm # YI SYLLABLE WU A4F8..A4FD ; Lm # [6] LISU LETTER TONE MYA TI..LISU LETTER TONE MYA JEU A60C ; Lm # VAI SYLLABLE LENGTHENER A67F ; Lm # CYRILLIC PAYEROK A69C..A69D ; Lm # [2] MODIFIER LETTER CYRILLIC HARD SIGN..MODIFIER LETTER CYRILLIC SOFT SIGN A717..A71F ; Lm # [9] MODIFIER LETTER DOT VERTICAL BAR..MODIFIER LETTER LOW INVERTED EXCLAMATION MARK A770 ; Lm # MODIFIER LETTER US A788 ; Lm # MODIFIER LETTER LOW CIRCUMFLEX ACCENT A7F1..A7F4 
; Lm # [4] MODIFIER LETTER CAPITAL S..MODIFIER LETTER CAPITAL Q A7F8..A7F9 ; Lm # [2] MODIFIER LETTER CAPITAL H WITH STROKE..MODIFIER LETTER SMALL LIGATURE OE A9CF ; Lm # JAVANESE PANGRANGKEP A9E6 ; Lm # MYANMAR MODIFIER LETTER SHAN REDUPLICATION AA70 ; Lm # MYANMAR MODIFIER LETTER KHAMTI REDUPLICATION AADD ; Lm # TAI VIET SYMBOL SAM AAF3..AAF4 ; Lm # [2] MEETEI MAYEK SYLLABLE REPETITION MARK..MEETEI MAYEK WORD REPETITION MARK AB5C..AB5F ; Lm # [4] MODIFIER LETTER SMALL HENG..MODIFIER LETTER SMALL U WITH LEFT HOOK AB69 ; Lm # MODIFIER LETTER SMALL TURNED W FF70 ; Lm # HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK FF9E..FF9F ; Lm # [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK 10780..10785 ; Lm # [6] MODIFIER LETTER SMALL CAPITAL AA..MODIFIER LETTER SMALL B WITH HOOK 10787..107B0 ; Lm # [42] MODIFIER LETTER SMALL DZ DIGRAPH..MODIFIER LETTER SMALL V WITH RIGHT HOOK 107B2..107BA ; Lm # [9] MODIFIER LETTER SMALL CAPITAL Y..MODIFIER LETTER SMALL S WITH CURL 10D4E ; Lm # GARAY VOWEL LENGTH MARK 10D6F ; Lm # GARAY REDUPLICATION MARK 10EC5 ; Lm # ARABIC SMALL YEH BARREE WITH TWO DOTS BELOW 11DD9 ; Lm # TOLONG SIKI SIGN SELA 16B40..16B43 ; Lm # [4] PAHAWH HMONG SIGN VOS SEEV..PAHAWH HMONG SIGN IB YAM 16D40..16D42 ; Lm # [3] KIRAT RAI SIGN ANUSVARA..KIRAT RAI SIGN VISARGA 16D6B..16D6C ; Lm # [2] KIRAT RAI SIGN VIRAMA..KIRAT RAI SIGN SAAT 16F93..16F9F ; Lm # [13] MIAO LETTER TONE-2..MIAO LETTER REFORMED TONE-8 16FE0..16FE1 ; Lm # [2] TANGUT ITERATION MARK..NUSHU ITERATION MARK 16FE3 ; Lm # OLD CHINESE ITERATION MARK 16FF2..16FF3 ; Lm # [2] CHINESE SMALL SIMPLIFIED ER..CHINESE SMALL TRADITIONAL ER 1AFF0..1AFF3 ; Lm # [4] KATAKANA LETTER MINNAN TONE-2..KATAKANA LETTER MINNAN TONE-5 1AFF5..1AFFB ; Lm # [7] KATAKANA LETTER MINNAN TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-5 1AFFD..1AFFE ; Lm # [2] KATAKANA LETTER MINNAN NASALIZED TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-8 1E030..1E06D ; Lm # [62] MODIFIER LETTER CYRILLIC SMALL 
A..MODIFIER LETTER CYRILLIC SMALL STRAIGHT U WITH STROKE 1E137..1E13D ; Lm # [7] NYIAKENG PUACHUE HMONG SIGN FOR PERSON..NYIAKENG PUACHUE HMONG SYLLABLE LENGTHENER 1E4EB ; Lm # NAG MUNDARI SIGN OJOD 1E6FF ; Lm # TAI YO XAM LAI 1E94B ; Lm # ADLAM NASALIZATION MARK # Total code points: 410 # ================================================ # General_Category=Other_Letter 00AA ; Lo # FEMININE ORDINAL INDICATOR 00BA ; Lo # MASCULINE ORDINAL INDICATOR 01BB ; Lo # LATIN LETTER TWO WITH STROKE 01C0..01C3 ; Lo # [4] LATIN LETTER DENTAL CLICK..LATIN LETTER RETROFLEX CLICK 0294..0295 ; Lo # [2] LATIN LETTER GLOTTAL STOP..LATIN LETTER PHARYNGEAL VOICED FRICATIVE 05D0..05EA ; Lo # [27] HEBREW LETTER ALEF..HEBREW LETTER TAV 05EF..05F2 ; Lo # [4] HEBREW YOD TRIANGLE..HEBREW LIGATURE YIDDISH DOUBLE YOD 0620..063F ; Lo # [32] ARABIC LETTER KASHMIRI YEH..ARABIC LETTER FARSI YEH WITH THREE DOTS ABOVE 0641..064A ; Lo # [10] ARABIC LETTER FEH..ARABIC LETTER YEH 066E..066F ; Lo # [2] ARABIC LETTER DOTLESS BEH..ARABIC LETTER DOTLESS QAF 0671..06D3 ; Lo # [99] ARABIC LETTER ALEF WASLA..ARABIC LETTER YEH BARREE WITH HAMZA ABOVE 06D5 ; Lo # ARABIC LETTER AE 06EE..06EF ; Lo # [2] ARABIC LETTER DAL WITH INVERTED V..ARABIC LETTER REH WITH INVERTED V 06FA..06FC ; Lo # [3] ARABIC LETTER SHEEN WITH DOT BELOW..ARABIC LETTER GHAIN WITH DOT BELOW 06FF ; Lo # ARABIC LETTER HEH WITH INVERTED V 0710 ; Lo # SYRIAC LETTER ALAPH 0712..072F ; Lo # [30] SYRIAC LETTER BETH..SYRIAC LETTER PERSIAN DHALATH 074D..07A5 ; Lo # [89] SYRIAC LETTER SOGDIAN ZHAIN..THAANA LETTER WAAVU 07B1 ; Lo # THAANA LETTER NAA 07CA..07EA ; Lo # [33] NKO LETTER A..NKO LETTER JONA RA 0800..0815 ; Lo # [22] SAMARITAN LETTER ALAF..SAMARITAN LETTER TAAF 0840..0858 ; Lo # [25] MANDAIC LETTER HALQA..MANDAIC LETTER AIN 0860..086A ; Lo # [11] SYRIAC LETTER MALAYALAM NGA..SYRIAC LETTER MALAYALAM SSA 0870..0887 ; Lo # [24] ARABIC LETTER ALEF WITH ATTACHED FATHA..ARABIC BASELINE ROUND DOT 0889..088F ; Lo # [7] ARABIC LETTER NOON WITH INVERTED 
SMALL V..ARABIC LETTER NOON WITH RING ABOVE 08A0..08C8 ; Lo # [41] ARABIC LETTER BEH WITH SMALL V BELOW..ARABIC LETTER GRAF 0904..0939 ; Lo # [54] DEVANAGARI LETTER SHORT A..DEVANAGARI LETTER HA 093D ; Lo # DEVANAGARI SIGN AVAGRAHA 0950 ; Lo # DEVANAGARI OM 0958..0961 ; Lo # [10] DEVANAGARI LETTER QA..DEVANAGARI LETTER VOCALIC LL 0972..0980 ; Lo # [15] DEVANAGARI LETTER CANDRA A..BENGALI ANJI 0985..098C ; Lo # [8] BENGALI LETTER A..BENGALI LETTER VOCALIC L 098F..0990 ; Lo # [2] BENGALI LETTER E..BENGALI LETTER AI 0993..09A8 ; Lo # [22] BENGALI LETTER O..BENGALI LETTER NA 09AA..09B0 ; Lo # [7] BENGALI LETTER PA..BENGALI LETTER RA 09B2 ; Lo # BENGALI LETTER LA 09B6..09B9 ; Lo # [4] BENGALI LETTER SHA..BENGALI LETTER HA 09BD ; Lo # BENGALI SIGN AVAGRAHA 09CE ; Lo # BENGALI LETTER KHANDA TA 09DC..09DD ; Lo # [2] BENGALI LETTER RRA..BENGALI LETTER RHA 09DF..09E1 ; Lo # [3] BENGALI LETTER YYA..BENGALI LETTER VOCALIC LL 09F0..09F1 ; Lo # [2] BENGALI LETTER RA WITH MIDDLE DIAGONAL..BENGALI LETTER RA WITH LOWER DIAGONAL 09FC ; Lo # BENGALI LETTER VEDIC ANUSVARA 0A05..0A0A ; Lo # [6] GURMUKHI LETTER A..GURMUKHI LETTER UU 0A0F..0A10 ; Lo # [2] GURMUKHI LETTER EE..GURMUKHI LETTER AI 0A13..0A28 ; Lo # [22] GURMUKHI LETTER OO..GURMUKHI LETTER NA 0A2A..0A30 ; Lo # [7] GURMUKHI LETTER PA..GURMUKHI LETTER RA 0A32..0A33 ; Lo # [2] GURMUKHI LETTER LA..GURMUKHI LETTER LLA 0A35..0A36 ; Lo # [2] GURMUKHI LETTER VA..GURMUKHI LETTER SHA 0A38..0A39 ; Lo # [2] GURMUKHI LETTER SA..GURMUKHI LETTER HA 0A59..0A5C ; Lo # [4] GURMUKHI LETTER KHHA..GURMUKHI LETTER RRA 0A5E ; Lo # GURMUKHI LETTER FA 0A72..0A74 ; Lo # [3] GURMUKHI IRI..GURMUKHI EK ONKAR 0A85..0A8D ; Lo # [9] GUJARATI LETTER A..GUJARATI VOWEL CANDRA E 0A8F..0A91 ; Lo # [3] GUJARATI LETTER E..GUJARATI VOWEL CANDRA O 0A93..0AA8 ; Lo # [22] GUJARATI LETTER O..GUJARATI LETTER NA 0AAA..0AB0 ; Lo # [7] GUJARATI LETTER PA..GUJARATI LETTER RA 0AB2..0AB3 ; Lo # [2] GUJARATI LETTER LA..GUJARATI LETTER LLA 0AB5..0AB9 ; Lo # [5] GUJARATI LETTER 
VA..GUJARATI LETTER HA 0ABD ; Lo # GUJARATI SIGN AVAGRAHA 0AD0 ; Lo # GUJARATI OM 0AE0..0AE1 ; Lo # [2] GUJARATI LETTER VOCALIC RR..GUJARATI LETTER VOCALIC LL 0AF9 ; Lo # GUJARATI LETTER ZHA 0B05..0B0C ; Lo # [8] ORIYA LETTER A..ORIYA LETTER VOCALIC L 0B0F..0B10 ; Lo # [2] ORIYA LETTER E..ORIYA LETTER AI 0B13..0B28 ; Lo # [22] ORIYA LETTER O..ORIYA LETTER NA 0B2A..0B30 ; Lo # [7] ORIYA LETTER PA..ORIYA LETTER RA 0B32..0B33 ; Lo # [2] ORIYA LETTER LA..ORIYA LETTER LLA 0B35..0B39 ; Lo # [5] ORIYA LETTER VA..ORIYA LETTER HA 0B3D ; Lo # ORIYA SIGN AVAGRAHA 0B5C..0B5D ; Lo # [2] ORIYA LETTER RRA..ORIYA LETTER RHA 0B5F..0B61 ; Lo # [3] ORIYA LETTER YYA..ORIYA LETTER VOCALIC LL 0B71 ; Lo # ORIYA LETTER WA 0B83 ; Lo # TAMIL SIGN VISARGA 0B85..0B8A ; Lo # [6] TAMIL LETTER A..TAMIL LETTER UU 0B8E..0B90 ; Lo # [3] TAMIL LETTER E..TAMIL LETTER AI 0B92..0B95 ; Lo # [4] TAMIL LETTER O..TAMIL LETTER KA 0B99..0B9A ; Lo # [2] TAMIL LETTER NGA..TAMIL LETTER CA 0B9C ; Lo # TAMIL LETTER JA 0B9E..0B9F ; Lo # [2] TAMIL LETTER NYA..TAMIL LETTER TTA 0BA3..0BA4 ; Lo # [2] TAMIL LETTER NNA..TAMIL LETTER TA 0BA8..0BAA ; Lo # [3] TAMIL LETTER NA..TAMIL LETTER PA 0BAE..0BB9 ; Lo # [12] TAMIL LETTER MA..TAMIL LETTER HA 0BD0 ; Lo # TAMIL OM 0C05..0C0C ; Lo # [8] TELUGU LETTER A..TELUGU LETTER VOCALIC L 0C0E..0C10 ; Lo # [3] TELUGU LETTER E..TELUGU LETTER AI 0C12..0C28 ; Lo # [23] TELUGU LETTER O..TELUGU LETTER NA 0C2A..0C39 ; Lo # [16] TELUGU LETTER PA..TELUGU LETTER HA 0C3D ; Lo # TELUGU SIGN AVAGRAHA 0C58..0C5A ; Lo # [3] TELUGU LETTER TSA..TELUGU LETTER RRRA 0C5C..0C5D ; Lo # [2] TELUGU ARCHAIC SHRII..TELUGU LETTER NAKAARA POLLU 0C60..0C61 ; Lo # [2] TELUGU LETTER VOCALIC RR..TELUGU LETTER VOCALIC LL 0C80 ; Lo # KANNADA SIGN SPACING CANDRABINDU 0C85..0C8C ; Lo # [8] KANNADA LETTER A..KANNADA LETTER VOCALIC L 0C8E..0C90 ; Lo # [3] KANNADA LETTER E..KANNADA LETTER AI 0C92..0CA8 ; Lo # [23] KANNADA LETTER O..KANNADA LETTER NA 0CAA..0CB3 ; Lo # [10] KANNADA LETTER PA..KANNADA LETTER LLA 
0CB5..0CB9 ; Lo # [5] KANNADA LETTER VA..KANNADA LETTER HA 0CBD ; Lo # KANNADA SIGN AVAGRAHA 0CDC..0CDE ; Lo # [3] KANNADA ARCHAIC SHRII..KANNADA LETTER FA 0CE0..0CE1 ; Lo # [2] KANNADA LETTER VOCALIC RR..KANNADA LETTER VOCALIC LL 0CF1..0CF2 ; Lo # [2] KANNADA SIGN JIHVAMULIYA..KANNADA SIGN UPADHMANIYA 0D04..0D0C ; Lo # [9] MALAYALAM LETTER VEDIC ANUSVARA..MALAYALAM LETTER VOCALIC L 0D0E..0D10 ; Lo # [3] MALAYALAM LETTER E..MALAYALAM LETTER AI 0D12..0D3A ; Lo # [41] MALAYALAM LETTER O..MALAYALAM LETTER TTTA 0D3D ; Lo # MALAYALAM SIGN AVAGRAHA 0D4E ; Lo # MALAYALAM LETTER DOT REPH 0D54..0D56 ; Lo # [3] MALAYALAM LETTER CHILLU M..MALAYALAM LETTER CHILLU LLL 0D5F..0D61 ; Lo # [3] MALAYALAM LETTER ARCHAIC II..MALAYALAM LETTER VOCALIC LL 0D7A..0D7F ; Lo # [6] MALAYALAM LETTER CHILLU NN..MALAYALAM LETTER CHILLU K 0D85..0D96 ; Lo # [18] SINHALA LETTER AYANNA..SINHALA LETTER AUYANNA 0D9A..0DB1 ; Lo # [24] SINHALA LETTER ALPAPRAANA KAYANNA..SINHALA LETTER DANTAJA NAYANNA 0DB3..0DBB ; Lo # [9] SINHALA LETTER SANYAKA DAYANNA..SINHALA LETTER RAYANNA 0DBD ; Lo # SINHALA LETTER DANTAJA LAYANNA 0DC0..0DC6 ; Lo # [7] SINHALA LETTER VAYANNA..SINHALA LETTER FAYANNA 0E01..0E30 ; Lo # [48] THAI CHARACTER KO KAI..THAI CHARACTER SARA A 0E32..0E33 ; Lo # [2] THAI CHARACTER SARA AA..THAI CHARACTER SARA AM 0E40..0E45 ; Lo # [6] THAI CHARACTER SARA E..THAI CHARACTER LAKKHANGYAO 0E81..0E82 ; Lo # [2] LAO LETTER KO..LAO LETTER KHO SUNG 0E84 ; Lo # LAO LETTER KHO TAM 0E86..0E8A ; Lo # [5] LAO LETTER PALI GHA..LAO LETTER SO TAM 0E8C..0EA3 ; Lo # [24] LAO LETTER PALI JHA..LAO LETTER LO LING 0EA5 ; Lo # LAO LETTER LO LOOT 0EA7..0EB0 ; Lo # [10] LAO LETTER WO..LAO VOWEL SIGN A 0EB2..0EB3 ; Lo # [2] LAO VOWEL SIGN AA..LAO VOWEL SIGN AM 0EBD ; Lo # LAO SEMIVOWEL SIGN NYO 0EC0..0EC4 ; Lo # [5] LAO VOWEL SIGN E..LAO VOWEL SIGN AI 0EDC..0EDF ; Lo # [4] LAO HO NO..LAO LETTER KHMU NYO 0F00 ; Lo # TIBETAN SYLLABLE OM 0F40..0F47 ; Lo # [8] TIBETAN LETTER KA..TIBETAN LETTER JA 0F49..0F6C ; Lo # [36] TIBETAN 
LETTER NYA..TIBETAN LETTER RRA 0F88..0F8C ; Lo # [5] TIBETAN SIGN LCE TSA CAN..TIBETAN SIGN INVERTED MCHU CAN 1000..102A ; Lo # [43] MYANMAR LETTER KA..MYANMAR LETTER AU 103F ; Lo # MYANMAR LETTER GREAT SA 1050..1055 ; Lo # [6] MYANMAR LETTER SHA..MYANMAR LETTER VOCALIC LL 105A..105D ; Lo # [4] MYANMAR LETTER MON NGA..MYANMAR LETTER MON BBE 1061 ; Lo # MYANMAR LETTER SGAW KAREN SHA 1065..1066 ; Lo # [2] MYANMAR LETTER WESTERN PWO KAREN THA..MYANMAR LETTER WESTERN PWO KAREN PWA 106E..1070 ; Lo # [3] MYANMAR LETTER EASTERN PWO KAREN NNA..MYANMAR LETTER EASTERN PWO KAREN GHWA 1075..1081 ; Lo # [13] MYANMAR LETTER SHAN KA..MYANMAR LETTER SHAN HA 108E ; Lo # MYANMAR LETTER RUMAI PALAUNG FA 1100..1248 ; Lo # [329] HANGUL CHOSEONG KIYEOK..ETHIOPIC SYLLABLE QWA 124A..124D ; Lo # [4] ETHIOPIC SYLLABLE QWI..ETHIOPIC SYLLABLE QWE 1250..1256 ; Lo # [7] ETHIOPIC SYLLABLE QHA..ETHIOPIC SYLLABLE QHO 1258 ; Lo # ETHIOPIC SYLLABLE QHWA 125A..125D ; Lo # [4] ETHIOPIC SYLLABLE QHWI..ETHIOPIC SYLLABLE QHWE 1260..1288 ; Lo # [41] ETHIOPIC SYLLABLE BA..ETHIOPIC SYLLABLE XWA 128A..128D ; Lo # [4] ETHIOPIC SYLLABLE XWI..ETHIOPIC SYLLABLE XWE 1290..12B0 ; Lo # [33] ETHIOPIC SYLLABLE NA..ETHIOPIC SYLLABLE KWA 12B2..12B5 ; Lo # [4] ETHIOPIC SYLLABLE KWI..ETHIOPIC SYLLABLE KWE 12B8..12BE ; Lo # [7] ETHIOPIC SYLLABLE KXA..ETHIOPIC SYLLABLE KXO 12C0 ; Lo # ETHIOPIC SYLLABLE KXWA 12C2..12C5 ; Lo # [4] ETHIOPIC SYLLABLE KXWI..ETHIOPIC SYLLABLE KXWE 12C8..12D6 ; Lo # [15] ETHIOPIC SYLLABLE WA..ETHIOPIC SYLLABLE PHARYNGEAL O 12D8..1310 ; Lo # [57] ETHIOPIC SYLLABLE ZA..ETHIOPIC SYLLABLE GWA 1312..1315 ; Lo # [4] ETHIOPIC SYLLABLE GWI..ETHIOPIC SYLLABLE GWE 1318..135A ; Lo # [67] ETHIOPIC SYLLABLE GGA..ETHIOPIC SYLLABLE FYA 1380..138F ; Lo # [16] ETHIOPIC SYLLABLE SEBATBEIT MWA..ETHIOPIC SYLLABLE PWE 1401..166C ; Lo # [620] CANADIAN SYLLABICS E..CANADIAN SYLLABICS CARRIER TTSA 166F..167F ; Lo # [17] CANADIAN SYLLABICS QAI..CANADIAN SYLLABICS BLACKFOOT W 1681..169A ; Lo # [26] OGHAM LETTER 
BEITH..OGHAM LETTER PEITH 16A0..16EA ; Lo # [75] RUNIC LETTER FEHU FEOH FE F..RUNIC LETTER X 16F1..16F8 ; Lo # [8] RUNIC LETTER K..RUNIC LETTER FRANKS CASKET AESC 1700..1711 ; Lo # [18] TAGALOG LETTER A..TAGALOG LETTER HA 171F..1731 ; Lo # [19] TAGALOG LETTER ARCHAIC RA..HANUNOO LETTER HA 1740..1751 ; Lo # [18] BUHID LETTER A..BUHID LETTER HA 1760..176C ; Lo # [13] TAGBANWA LETTER A..TAGBANWA LETTER YA 176E..1770 ; Lo # [3] TAGBANWA LETTER LA..TAGBANWA LETTER SA 1780..17B3 ; Lo # [52] KHMER LETTER KA..KHMER INDEPENDENT VOWEL QAU 17DC ; Lo # KHMER SIGN AVAKRAHASANYA 1820..1842 ; Lo # [35] MONGOLIAN LETTER A..MONGOLIAN LETTER CHI 1844..1878 ; Lo # [53] MONGOLIAN LETTER TODO E..MONGOLIAN LETTER CHA WITH TWO DOTS 1880..1884 ; Lo # [5] MONGOLIAN LETTER ALI GALI ANUSVARA ONE..MONGOLIAN LETTER ALI GALI INVERTED UBADAMA 1887..18A8 ; Lo # [34] MONGOLIAN LETTER ALI GALI A..MONGOLIAN LETTER MANCHU ALI GALI BHA 18AA ; Lo # MONGOLIAN LETTER MANCHU ALI GALI LHA 18B0..18F5 ; Lo # [70] CANADIAN SYLLABICS OY..CANADIAN SYLLABICS CARRIER DENTAL S 1900..191E ; Lo # [31] LIMBU VOWEL-CARRIER LETTER..LIMBU LETTER TRA 1950..196D ; Lo # [30] TAI LE LETTER KA..TAI LE LETTER AI 1970..1974 ; Lo # [5] TAI LE LETTER TONE-2..TAI LE LETTER TONE-6 1980..19AB ; Lo # [44] NEW TAI LUE LETTER HIGH QA..NEW TAI LUE LETTER LOW SUA 19B0..19C9 ; Lo # [26] NEW TAI LUE VOWEL SIGN VOWEL SHORTENER..NEW TAI LUE TONE MARK-2 1A00..1A16 ; Lo # [23] BUGINESE LETTER KA..BUGINESE LETTER HA 1A20..1A54 ; Lo # [53] TAI THAM LETTER HIGH KA..TAI THAM LETTER GREAT SA 1B05..1B33 ; Lo # [47] BALINESE LETTER AKARA..BALINESE LETTER HA 1B45..1B4C ; Lo # [8] BALINESE LETTER KAF SASAK..BALINESE LETTER ARCHAIC JNYA 1B83..1BA0 ; Lo # [30] SUNDANESE LETTER A..SUNDANESE LETTER HA 1BAE..1BAF ; Lo # [2] SUNDANESE LETTER KHA..SUNDANESE LETTER SYA 1BBA..1BE5 ; Lo # [44] SUNDANESE AVAGRAHA..BATAK LETTER U 1C00..1C23 ; Lo # [36] LEPCHA LETTER KA..LEPCHA LETTER A 1C4D..1C4F ; Lo # [3] LEPCHA LETTER TTA..LEPCHA LETTER DDA 1C5A..1C77 ; Lo # 
[30] OL CHIKI LETTER LA..OL CHIKI LETTER OH 1CE9..1CEC ; Lo # [4] VEDIC SIGN ANUSVARA ANTARGOMUKHA..VEDIC SIGN ANUSVARA VAMAGOMUKHA WITH TAIL 1CEE..1CF3 ; Lo # [6] VEDIC SIGN HEXIFORM LONG ANUSVARA..VEDIC SIGN ROTATED ARDHAVISARGA 1CF5..1CF6 ; Lo # [2] VEDIC SIGN JIHVAMULIYA..VEDIC SIGN UPADHMANIYA 1CFA ; Lo # VEDIC SIGN DOUBLE ANUSVARA ANTARGOMUKHA 2135..2138 ; Lo # [4] ALEF SYMBOL..DALET SYMBOL 2D30..2D67 ; Lo # [56] TIFINAGH LETTER YA..TIFINAGH LETTER YO 2D80..2D96 ; Lo # [23] ETHIOPIC SYLLABLE LOA..ETHIOPIC SYLLABLE GGWE 2DA0..2DA6 ; Lo # [7] ETHIOPIC SYLLABLE SSA..ETHIOPIC SYLLABLE SSO 2DA8..2DAE ; Lo # [7] ETHIOPIC SYLLABLE CCA..ETHIOPIC SYLLABLE CCO 2DB0..2DB6 ; Lo # [7] ETHIOPIC SYLLABLE ZZA..ETHIOPIC SYLLABLE ZZO 2DB8..2DBE ; Lo # [7] ETHIOPIC SYLLABLE CCHA..ETHIOPIC SYLLABLE CCHO 2DC0..2DC6 ; Lo # [7] ETHIOPIC SYLLABLE QYA..ETHIOPIC SYLLABLE QYO 2DC8..2DCE ; Lo # [7] ETHIOPIC SYLLABLE KYA..ETHIOPIC SYLLABLE KYO 2DD0..2DD6 ; Lo # [7] ETHIOPIC SYLLABLE XYA..ETHIOPIC SYLLABLE XYO 2DD8..2DDE ; Lo # [7] ETHIOPIC SYLLABLE GYA..ETHIOPIC SYLLABLE GYO 3006 ; Lo # IDEOGRAPHIC CLOSING MARK 303C ; Lo # MASU MARK 3041..3096 ; Lo # [86] HIRAGANA LETTER SMALL A..HIRAGANA LETTER SMALL KE 309F ; Lo # HIRAGANA DIGRAPH YORI 30A1..30FA ; Lo # [90] KATAKANA LETTER SMALL A..KATAKANA LETTER VO 30FF ; Lo # KATAKANA DIGRAPH KOTO 3105..312F ; Lo # [43] BOPOMOFO LETTER B..BOPOMOFO LETTER NN 3131..318E ; Lo # [94] HANGUL LETTER KIYEOK..HANGUL LETTER ARAEAE 31A0..31BF ; Lo # [32] BOPOMOFO LETTER BU..BOPOMOFO LETTER AH 31F0..31FF ; Lo # [16] KATAKANA LETTER SMALL KU..KATAKANA LETTER SMALL RO 3400..4DBF ; Lo # [6592] CJK UNIFIED IDEOGRAPH-3400..CJK UNIFIED IDEOGRAPH-4DBF 4E00..A014 ; Lo # [21013] CJK UNIFIED IDEOGRAPH-4E00..YI SYLLABLE E A016..A48C ; Lo # [1143] YI SYLLABLE BIT..YI SYLLABLE YYR A4D0..A4F7 ; Lo # [40] LISU LETTER BA..LISU LETTER OE A500..A60B ; Lo # [268] VAI SYLLABLE EE..VAI SYLLABLE NG A610..A61F ; Lo # [16] VAI SYLLABLE NDOLE FA..VAI SYMBOL JONG A62A..A62B ; Lo # [2] 
VAI SYLLABLE NDOLE MA..VAI SYLLABLE NDOLE DO A66E ; Lo # CYRILLIC LETTER MULTIOCULAR O A6A0..A6E5 ; Lo # [70] BAMUM LETTER A..BAMUM LETTER KI A78F ; Lo # LATIN LETTER SINOLOGICAL DOT A7F7 ; Lo # LATIN EPIGRAPHIC LETTER SIDEWAYS I A7FB..A801 ; Lo # [7] LATIN EPIGRAPHIC LETTER REVERSED F..SYLOTI NAGRI LETTER I A803..A805 ; Lo # [3] SYLOTI NAGRI LETTER U..SYLOTI NAGRI LETTER O A807..A80A ; Lo # [4] SYLOTI NAGRI LETTER KO..SYLOTI NAGRI LETTER GHO A80C..A822 ; Lo # [23] SYLOTI NAGRI LETTER CO..SYLOTI NAGRI LETTER HO A840..A873 ; Lo # [52] PHAGS-PA LETTER KA..PHAGS-PA LETTER CANDRABINDU A882..A8B3 ; Lo # [50] SAURASHTRA LETTER A..SAURASHTRA LETTER LLA A8F2..A8F7 ; Lo # [6] DEVANAGARI SIGN SPACING CANDRABINDU..DEVANAGARI SIGN CANDRABINDU AVAGRAHA A8FB ; Lo # DEVANAGARI HEADSTROKE A8FD..A8FE ; Lo # [2] DEVANAGARI JAIN OM..DEVANAGARI LETTER AY A90A..A925 ; Lo # [28] KAYAH LI LETTER KA..KAYAH LI LETTER OO A930..A946 ; Lo # [23] REJANG LETTER KA..REJANG LETTER A A960..A97C ; Lo # [29] HANGUL CHOSEONG TIKEUT-MIEUM..HANGUL CHOSEONG SSANGYEORINHIEUH A984..A9B2 ; Lo # [47] JAVANESE LETTER A..JAVANESE LETTER HA A9E0..A9E4 ; Lo # [5] MYANMAR LETTER SHAN GHA..MYANMAR LETTER SHAN BHA A9E7..A9EF ; Lo # [9] MYANMAR LETTER TAI LAING NYA..MYANMAR LETTER TAI LAING NNA A9FA..A9FE ; Lo # [5] MYANMAR LETTER TAI LAING LLA..MYANMAR LETTER TAI LAING BHA AA00..AA28 ; Lo # [41] CHAM LETTER A..CHAM LETTER HA AA40..AA42 ; Lo # [3] CHAM LETTER FINAL K..CHAM LETTER FINAL NG AA44..AA4B ; Lo # [8] CHAM LETTER FINAL CH..CHAM LETTER FINAL SS AA60..AA6F ; Lo # [16] MYANMAR LETTER KHAMTI GA..MYANMAR LETTER KHAMTI FA AA71..AA76 ; Lo # [6] MYANMAR LETTER KHAMTI XA..MYANMAR LOGOGRAM KHAMTI HM AA7A ; Lo # MYANMAR LETTER AITON RA AA7E..AAAF ; Lo # [50] MYANMAR LETTER SHWE PALAUNG CHA..TAI VIET LETTER HIGH O AAB1 ; Lo # TAI VIET VOWEL AA AAB5..AAB6 ; Lo # [2] TAI VIET VOWEL E..TAI VIET VOWEL O AAB9..AABD ; Lo # [5] TAI VIET VOWEL UEA..TAI VIET VOWEL AN AAC0 ; Lo # TAI VIET TONE MAI NUENG AAC2 ; Lo # TAI VIET 
TONE MAI SONG AADB..AADC ; Lo # [2] TAI VIET SYMBOL KON..TAI VIET SYMBOL NUENG AAE0..AAEA ; Lo # [11] MEETEI MAYEK LETTER E..MEETEI MAYEK LETTER SSA AAF2 ; Lo # MEETEI MAYEK ANJI AB01..AB06 ; Lo # [6] ETHIOPIC SYLLABLE TTHU..ETHIOPIC SYLLABLE TTHO AB09..AB0E ; Lo # [6] ETHIOPIC SYLLABLE DDHU..ETHIOPIC SYLLABLE DDHO AB11..AB16 ; Lo # [6] ETHIOPIC SYLLABLE DZU..ETHIOPIC SYLLABLE DZO AB20..AB26 ; Lo # [7] ETHIOPIC SYLLABLE CCHHA..ETHIOPIC SYLLABLE CCHHO AB28..AB2E ; Lo # [7] ETHIOPIC SYLLABLE BBA..ETHIOPIC SYLLABLE BBO ABC0..ABE2 ; Lo # [35] MEETEI MAYEK LETTER KOK..MEETEI MAYEK LETTER I LONSUM AC00..D7A3 ; Lo # [11172] HANGUL SYLLABLE GA..HANGUL SYLLABLE HIH D7B0..D7C6 ; Lo # [23] HANGUL JUNGSEONG O-YEO..HANGUL JUNGSEONG ARAEA-E D7CB..D7FB ; Lo # [49] HANGUL JONGSEONG NIEUN-RIEUL..HANGUL JONGSEONG PHIEUPH-THIEUTH F900..FA6D ; Lo # [366] CJK COMPATIBILITY IDEOGRAPH-F900..CJK COMPATIBILITY IDEOGRAPH-FA6D FA70..FAD9 ; Lo # [106] CJK COMPATIBILITY IDEOGRAPH-FA70..CJK COMPATIBILITY IDEOGRAPH-FAD9 FB1D ; Lo # HEBREW LETTER YOD WITH HIRIQ FB1F..FB28 ; Lo # [10] HEBREW LIGATURE YIDDISH YOD YOD PATAH..HEBREW LETTER WIDE TAV FB2A..FB36 ; Lo # [13] HEBREW LETTER SHIN WITH SHIN DOT..HEBREW LETTER ZAYIN WITH DAGESH FB38..FB3C ; Lo # [5] HEBREW LETTER TET WITH DAGESH..HEBREW LETTER LAMED WITH DAGESH FB3E ; Lo # HEBREW LETTER MEM WITH DAGESH FB40..FB41 ; Lo # [2] HEBREW LETTER NUN WITH DAGESH..HEBREW LETTER SAMEKH WITH DAGESH FB43..FB44 ; Lo # [2] HEBREW LETTER FINAL PE WITH DAGESH..HEBREW LETTER PE WITH DAGESH FB46..FBB1 ; Lo # [108] HEBREW LETTER TSADI WITH DAGESH..ARABIC LETTER YEH BARREE WITH HAMZA ABOVE FINAL FORM FBD3..FD3D ; Lo # [363] ARABIC LETTER NG ISOLATED FORM..ARABIC LIGATURE ALEF WITH FATHATAN ISOLATED FORM FD50..FD8F ; Lo # [64] ARABIC LIGATURE TEH WITH JEEM WITH MEEM INITIAL FORM..ARABIC LIGATURE MEEM WITH KHAH WITH MEEM INITIAL FORM FD92..FDC7 ; Lo # [54] ARABIC LIGATURE MEEM WITH JEEM WITH KHAH INITIAL FORM..ARABIC LIGATURE NOON WITH JEEM WITH YEH FINAL FORM 
FDF0..FDFB ; Lo # [12] ARABIC LIGATURE SALLA USED AS KORANIC STOP SIGN ISOLATED FORM..ARABIC LIGATURE JALLAJALALOUHOU FE70..FE74 ; Lo # [5] ARABIC FATHATAN ISOLATED FORM..ARABIC KASRATAN ISOLATED FORM FE76..FEFC ; Lo # [135] ARABIC FATHA ISOLATED FORM..ARABIC LIGATURE LAM WITH ALEF FINAL FORM FF66..FF6F ; Lo # [10] HALFWIDTH KATAKANA LETTER WO..HALFWIDTH KATAKANA LETTER SMALL TU FF71..FF9D ; Lo # [45] HALFWIDTH KATAKANA LETTER A..HALFWIDTH KATAKANA LETTER N FFA0..FFBE ; Lo # [31] HALFWIDTH HANGUL FILLER..HALFWIDTH HANGUL LETTER HIEUH FFC2..FFC7 ; Lo # [6] HALFWIDTH HANGUL LETTER A..HALFWIDTH HANGUL LETTER E FFCA..FFCF ; Lo # [6] HALFWIDTH HANGUL LETTER YEO..HALFWIDTH HANGUL LETTER OE FFD2..FFD7 ; Lo # [6] HALFWIDTH HANGUL LETTER YO..HALFWIDTH HANGUL LETTER YU FFDA..FFDC ; Lo # [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL LETTER I 10000..1000B ; Lo # [12] LINEAR B SYLLABLE B008 A..LINEAR B SYLLABLE B046 JE 1000D..10026 ; Lo # [26] LINEAR B SYLLABLE B036 JO..LINEAR B SYLLABLE B032 QO 10028..1003A ; Lo # [19] LINEAR B SYLLABLE B060 RA..LINEAR B SYLLABLE B042 WO 1003C..1003D ; Lo # [2] LINEAR B SYLLABLE B017 ZA..LINEAR B SYLLABLE B074 ZE 1003F..1004D ; Lo # [15] LINEAR B SYLLABLE B020 ZO..LINEAR B SYLLABLE B091 TWO 10050..1005D ; Lo # [14] LINEAR B SYMBOL B018..LINEAR B SYMBOL B089 10080..100FA ; Lo # [123] LINEAR B IDEOGRAM B100 MAN..LINEAR B IDEOGRAM VESSEL B305 10280..1029C ; Lo # [29] LYCIAN LETTER A..LYCIAN LETTER X 102A0..102D0 ; Lo # [49] CARIAN LETTER A..CARIAN LETTER UUU3 10300..1031F ; Lo # [32] OLD ITALIC LETTER A..OLD ITALIC LETTER ESS 1032D..10340 ; Lo # [20] OLD ITALIC LETTER YE..GOTHIC LETTER PAIRTHRA 10342..10349 ; Lo # [8] GOTHIC LETTER RAIDA..GOTHIC LETTER OTHAL 10350..10375 ; Lo # [38] OLD PERMIC LETTER AN..OLD PERMIC LETTER IA 10380..1039D ; Lo # [30] UGARITIC LETTER ALPA..UGARITIC LETTER SSU 103A0..103C3 ; Lo # [36] OLD PERSIAN SIGN A..OLD PERSIAN SIGN HA 103C8..103CF ; Lo # [8] OLD PERSIAN SIGN AURAMAZDAA..OLD PERSIAN SIGN BUUMISH 10450..1049D 
; Lo # [78] SHAVIAN LETTER PEEP..OSMANYA LETTER OO 10500..10527 ; Lo # [40] ELBASAN LETTER A..ELBASAN LETTER KHE 10530..10563 ; Lo # [52] CAUCASIAN ALBANIAN LETTER ALT..CAUCASIAN ALBANIAN LETTER KIW 105C0..105F3 ; Lo # [52] TODHRI LETTER A..TODHRI LETTER OO 10600..10736 ; Lo # [311] LINEAR A SIGN AB001..LINEAR A SIGN A664 10740..10755 ; Lo # [22] LINEAR A SIGN A701 A..LINEAR A SIGN A732 JE 10760..10767 ; Lo # [8] LINEAR A SIGN A800..LINEAR A SIGN A807 10800..10805 ; Lo # [6] CYPRIOT SYLLABLE A..CYPRIOT SYLLABLE JA 10808 ; Lo # CYPRIOT SYLLABLE JO 1080A..10835 ; Lo # [44] CYPRIOT SYLLABLE KA..CYPRIOT SYLLABLE WO 10837..10838 ; Lo # [2] CYPRIOT SYLLABLE XA..CYPRIOT SYLLABLE XE 1083C ; Lo # CYPRIOT SYLLABLE ZA 1083F..10855 ; Lo # [23] CYPRIOT SYLLABLE ZO..IMPERIAL ARAMAIC LETTER TAW 10860..10876 ; Lo # [23] PALMYRENE LETTER ALEPH..PALMYRENE LETTER TAW 10880..1089E ; Lo # [31] NABATAEAN LETTER FINAL ALEPH..NABATAEAN LETTER TAW 108E0..108F2 ; Lo # [19] HATRAN LETTER ALEPH..HATRAN LETTER QOPH 108F4..108F5 ; Lo # [2] HATRAN LETTER SHIN..HATRAN LETTER TAW 10900..10915 ; Lo # [22] PHOENICIAN LETTER ALF..PHOENICIAN LETTER TAU 10920..10939 ; Lo # [26] LYDIAN LETTER A..LYDIAN LETTER C 10940..10959 ; Lo # [26] SIDETIC LETTER N01..SIDETIC LETTER N26 10980..109B7 ; Lo # [56] MEROITIC HIEROGLYPHIC LETTER A..MEROITIC CURSIVE LETTER DA 109BE..109BF ; Lo # [2] MEROITIC CURSIVE LOGOGRAM RMT..MEROITIC CURSIVE LOGOGRAM IMN 10A00 ; Lo # KHAROSHTHI LETTER A 10A10..10A13 ; Lo # [4] KHAROSHTHI LETTER KA..KHAROSHTHI LETTER GHA 10A15..10A17 ; Lo # [3] KHAROSHTHI LETTER CA..KHAROSHTHI LETTER JA 10A19..10A35 ; Lo # [29] KHAROSHTHI LETTER NYA..KHAROSHTHI LETTER VHA 10A60..10A7C ; Lo # [29] OLD SOUTH ARABIAN LETTER HE..OLD SOUTH ARABIAN LETTER THETH 10A80..10A9C ; Lo # [29] OLD NORTH ARABIAN LETTER HEH..OLD NORTH ARABIAN LETTER ZAH 10AC0..10AC7 ; Lo # [8] MANICHAEAN LETTER ALEPH..MANICHAEAN LETTER WAW 10AC9..10AE4 ; Lo # [28] MANICHAEAN LETTER ZAYIN..MANICHAEAN LETTER TAW 10B00..10B35 ; Lo # [54] 
AVESTAN LETTER A..AVESTAN LETTER HE 10B40..10B55 ; Lo # [22] INSCRIPTIONAL PARTHIAN LETTER ALEPH..INSCRIPTIONAL PARTHIAN LETTER TAW 10B60..10B72 ; Lo # [19] INSCRIPTIONAL PAHLAVI LETTER ALEPH..INSCRIPTIONAL PAHLAVI LETTER TAW 10B80..10B91 ; Lo # [18] PSALTER PAHLAVI LETTER ALEPH..PSALTER PAHLAVI LETTER TAW 10C00..10C48 ; Lo # [73] OLD TURKIC LETTER ORKHON A..OLD TURKIC LETTER ORKHON BASH 10D00..10D23 ; Lo # [36] HANIFI ROHINGYA LETTER A..HANIFI ROHINGYA MARK NA KHONNA 10D4A..10D4D ; Lo # [4] GARAY VOWEL SIGN A..GARAY VOWEL SIGN EE 10D4F ; Lo # GARAY SUKUN 10E80..10EA9 ; Lo # [42] YEZIDI LETTER ELIF..YEZIDI LETTER ET 10EB0..10EB1 ; Lo # [2] YEZIDI LETTER LAM WITH DOT ABOVE..YEZIDI LETTER YOT WITH CIRCUMFLEX ABOVE 10EC2..10EC4 ; Lo # [3] ARABIC LETTER DAL WITH TWO DOTS VERTICALLY BELOW..ARABIC LETTER KAF WITH TWO DOTS VERTICALLY BELOW 10EC6..10EC7 ; Lo # [2] ARABIC LETTER THIN NOON..ARABIC LETTER YEH WITH FOUR DOTS BELOW 10F00..10F1C ; Lo # [29] OLD SOGDIAN LETTER ALEPH..OLD SOGDIAN LETTER FINAL TAW WITH VERTICAL TAIL 10F27 ; Lo # OLD SOGDIAN LIGATURE AYIN-DALETH 10F30..10F45 ; Lo # [22] SOGDIAN LETTER ALEPH..SOGDIAN INDEPENDENT SHIN 10F70..10F81 ; Lo # [18] OLD UYGHUR LETTER ALEPH..OLD UYGHUR LETTER LESH 10FB0..10FC4 ; Lo # [21] CHORASMIAN LETTER ALEPH..CHORASMIAN LETTER TAW 10FE0..10FF6 ; Lo # [23] ELYMAIC LETTER ALEPH..ELYMAIC LIGATURE ZAYIN-YODH 11003..11037 ; Lo # [53] BRAHMI SIGN JIHVAMULIYA..BRAHMI LETTER OLD TAMIL NNNA 11071..11072 ; Lo # [2] BRAHMI LETTER OLD TAMIL SHORT E..BRAHMI LETTER OLD TAMIL SHORT O 11075 ; Lo # BRAHMI LETTER OLD TAMIL LLA 11083..110AF ; Lo # [45] KAITHI LETTER A..KAITHI LETTER HA 110D0..110E8 ; Lo # [25] SORA SOMPENG LETTER SAH..SORA SOMPENG LETTER MAE 11103..11126 ; Lo # [36] CHAKMA LETTER AA..CHAKMA LETTER HAA 11144 ; Lo # CHAKMA LETTER LHAA 11147 ; Lo # CHAKMA LETTER VAA 11150..11172 ; Lo # [35] MAHAJANI LETTER A..MAHAJANI LETTER RRA 11176 ; Lo # MAHAJANI LIGATURE SHRI 11183..111B2 ; Lo # [48] SHARADA LETTER A..SHARADA LETTER HA 
111C1..111C4 ; Lo # [4] SHARADA SIGN AVAGRAHA..SHARADA OM 111DA ; Lo # SHARADA EKAM 111DC ; Lo # SHARADA HEADSTROKE 11200..11211 ; Lo # [18] KHOJKI LETTER A..KHOJKI LETTER JJA 11213..1122B ; Lo # [25] KHOJKI LETTER NYA..KHOJKI LETTER LLA 1123F..11240 ; Lo # [2] KHOJKI LETTER QA..KHOJKI LETTER SHORT I 11280..11286 ; Lo # [7] MULTANI LETTER A..MULTANI LETTER GA 11288 ; Lo # MULTANI LETTER GHA 1128A..1128D ; Lo # [4] MULTANI LETTER CA..MULTANI LETTER JJA 1128F..1129D ; Lo # [15] MULTANI LETTER NYA..MULTANI LETTER BA 1129F..112A8 ; Lo # [10] MULTANI LETTER BHA..MULTANI LETTER RHA 112B0..112DE ; Lo # [47] KHUDAWADI LETTER A..KHUDAWADI LETTER HA 11305..1130C ; Lo # [8] GRANTHA LETTER A..GRANTHA LETTER VOCALIC L 1130F..11310 ; Lo # [2] GRANTHA LETTER EE..GRANTHA LETTER AI 11313..11328 ; Lo # [22] GRANTHA LETTER OO..GRANTHA LETTER NA 1132A..11330 ; Lo # [7] GRANTHA LETTER PA..GRANTHA LETTER RA 11332..11333 ; Lo # [2] GRANTHA LETTER LA..GRANTHA LETTER LLA 11335..11339 ; Lo # [5] GRANTHA LETTER VA..GRANTHA LETTER HA 1133D ; Lo # GRANTHA SIGN AVAGRAHA 11350 ; Lo # GRANTHA OM 1135D..11361 ; Lo # [5] GRANTHA SIGN PLUTA..GRANTHA LETTER VOCALIC LL 11380..11389 ; Lo # [10] TULU-TIGALARI LETTER A..TULU-TIGALARI LETTER VOCALIC LL 1138B ; Lo # TULU-TIGALARI LETTER EE 1138E ; Lo # TULU-TIGALARI LETTER AI 11390..113B5 ; Lo # [38] TULU-TIGALARI LETTER OO..TULU-TIGALARI LETTER LLLA 113B7 ; Lo # TULU-TIGALARI SIGN AVAGRAHA 113D1 ; Lo # TULU-TIGALARI REPHA 113D3 ; Lo # TULU-TIGALARI SIGN PLUTA 11400..11434 ; Lo # [53] NEWA LETTER A..NEWA LETTER HA 11447..1144A ; Lo # [4] NEWA SIGN AVAGRAHA..NEWA SIDDHI 1145F..11461 ; Lo # [3] NEWA LETTER VEDIC ANUSVARA..NEWA SIGN UPADHMANIYA 11480..114AF ; Lo # [48] TIRHUTA ANJI..TIRHUTA LETTER HA 114C4..114C5 ; Lo # [2] TIRHUTA SIGN AVAGRAHA..TIRHUTA GVANG 114C7 ; Lo # TIRHUTA OM 11580..115AE ; Lo # [47] SIDDHAM LETTER A..SIDDHAM LETTER HA 115D8..115DB ; Lo # [4] SIDDHAM LETTER THREE-CIRCLE ALTERNATE I..SIDDHAM LETTER ALTERNATE U 11600..1162F ; Lo # [48] 
MODI LETTER A..MODI LETTER LLA 11644 ; Lo # MODI SIGN HUVA 11680..116AA ; Lo # [43] TAKRI LETTER A..TAKRI LETTER RRA 116B8 ; Lo # TAKRI LETTER ARCHAIC KHA 11700..1171A ; Lo # [27] AHOM LETTER KA..AHOM LETTER ALTERNATE BA 11740..11746 ; Lo # [7] AHOM LETTER CA..AHOM LETTER LLA 11800..1182B ; Lo # [44] DOGRA LETTER A..DOGRA LETTER RRA 118FF..11906 ; Lo # [8] WARANG CITI OM..DIVES AKURU LETTER E 11909 ; Lo # DIVES AKURU LETTER O 1190C..11913 ; Lo # [8] DIVES AKURU LETTER KA..DIVES AKURU LETTER JA 11915..11916 ; Lo # [2] DIVES AKURU LETTER NYA..DIVES AKURU LETTER TTA 11918..1192F ; Lo # [24] DIVES AKURU LETTER DDA..DIVES AKURU LETTER ZA 1193F ; Lo # DIVES AKURU PREFIXED NASAL SIGN 11941 ; Lo # DIVES AKURU INITIAL RA 119A0..119A7 ; Lo # [8] NANDINAGARI LETTER A..NANDINAGARI LETTER VOCALIC RR 119AA..119D0 ; Lo # [39] NANDINAGARI LETTER E..NANDINAGARI LETTER RRA 119E1 ; Lo # NANDINAGARI SIGN AVAGRAHA 119E3 ; Lo # NANDINAGARI HEADSTROKE 11A00 ; Lo # ZANABAZAR SQUARE LETTER A 11A0B..11A32 ; Lo # [40] ZANABAZAR SQUARE LETTER KA..ZANABAZAR SQUARE LETTER KSSA 11A3A ; Lo # ZANABAZAR SQUARE CLUSTER-INITIAL LETTER RA 11A50 ; Lo # SOYOMBO LETTER A 11A5C..11A89 ; Lo # [46] SOYOMBO LETTER KA..SOYOMBO CLUSTER-INITIAL LETTER SA 11A9D ; Lo # SOYOMBO MARK PLUTA 11AB0..11AF8 ; Lo # [73] CANADIAN SYLLABICS NATTILIK HI..PAU CIN HAU GLOTTAL STOP FINAL 11BC0..11BE0 ; Lo # [33] SUNUWAR LETTER DEVI..SUNUWAR LETTER KLOKO 11C00..11C08 ; Lo # [9] BHAIKSUKI LETTER A..BHAIKSUKI LETTER VOCALIC L 11C0A..11C2E ; Lo # [37] BHAIKSUKI LETTER E..BHAIKSUKI LETTER HA 11C40 ; Lo # BHAIKSUKI SIGN AVAGRAHA 11C72..11C8F ; Lo # [30] MARCHEN LETTER KA..MARCHEN LETTER A 11D00..11D06 ; Lo # [7] MASARAM GONDI LETTER A..MASARAM GONDI LETTER E 11D08..11D09 ; Lo # [2] MASARAM GONDI LETTER AI..MASARAM GONDI LETTER O 11D0B..11D30 ; Lo # [38] MASARAM GONDI LETTER AU..MASARAM GONDI LETTER TRA 11D46 ; Lo # MASARAM GONDI REPHA 11D60..11D65 ; Lo # [6] GUNJALA GONDI LETTER A..GUNJALA GONDI LETTER UU 11D67..11D68 ; Lo # [2] 
GUNJALA GONDI LETTER EE..GUNJALA GONDI LETTER AI 11D6A..11D89 ; Lo # [32] GUNJALA GONDI LETTER OO..GUNJALA GONDI LETTER SA 11D98 ; Lo # GUNJALA GONDI OM 11DB0..11DD8 ; Lo # [41] TOLONG SIKI LETTER I..TOLONG SIKI LETTER RRH 11DDA..11DDB ; Lo # [2] TOLONG SIKI SIGN HECAKA..TOLONG SIKI UNGGA 11EE0..11EF2 ; Lo # [19] MAKASAR LETTER KA..MAKASAR ANGKA 11F02 ; Lo # KAWI SIGN REPHA 11F04..11F10 ; Lo # [13] KAWI LETTER A..KAWI LETTER O 11F12..11F33 ; Lo # [34] KAWI LETTER KA..KAWI LETTER JNYA 11FB0 ; Lo # LISU LETTER YHA 12000..12399 ; Lo # [922] CUNEIFORM SIGN A..CUNEIFORM SIGN U U 12480..12543 ; Lo # [196] CUNEIFORM SIGN AB TIMES NUN TENU..CUNEIFORM SIGN ZU5 TIMES THREE DISH TENU 12F90..12FF0 ; Lo # [97] CYPRO-MINOAN SIGN CM001..CYPRO-MINOAN SIGN CM114 13000..1342F ; Lo # [1072] EGYPTIAN HIEROGLYPH A001..EGYPTIAN HIEROGLYPH V011D 13441..13446 ; Lo # [6] EGYPTIAN HIEROGLYPH FULL BLANK..EGYPTIAN HIEROGLYPH WIDE LOST SIGN 13460..143FA ; Lo # [3995] EGYPTIAN HIEROGLYPH-13460..EGYPTIAN HIEROGLYPH-143FA 14400..14646 ; Lo # [583] ANATOLIAN HIEROGLYPH A001..ANATOLIAN HIEROGLYPH A530 16100..1611D ; Lo # [30] GURUNG KHEMA LETTER A..GURUNG KHEMA LETTER SA 16800..16A38 ; Lo # [569] BAMUM LETTER PHASE-A NGKUE MFON..BAMUM LETTER PHASE-F VUEQ 16A40..16A5E ; Lo # [31] MRO LETTER TA..MRO LETTER TEK 16A70..16ABE ; Lo # [79] TANGSA LETTER OZ..TANGSA LETTER ZA 16AD0..16AED ; Lo # [30] BASSA VAH LETTER ENNI..BASSA VAH LETTER I 16B00..16B2F ; Lo # [48] PAHAWH HMONG VOWEL KEEB..PAHAWH HMONG CONSONANT CAU 16B63..16B77 ; Lo # [21] PAHAWH HMONG SIGN VOS LUB..PAHAWH HMONG SIGN CIM NRES TOS 16B7D..16B8F ; Lo # [19] PAHAWH HMONG CLAN SIGN TSHEEJ..PAHAWH HMONG CLAN SIGN VWJ 16D43..16D6A ; Lo # [40] KIRAT RAI LETTER A..KIRAT RAI VOWEL SIGN AU 16F00..16F4A ; Lo # [75] MIAO LETTER PA..MIAO LETTER RTE 16F50 ; Lo # MIAO LETTER NASALIZATION 17000..18CD5 ; Lo # [7382] TANGUT IDEOGRAPH-17000..KHITAN SMALL SCRIPT CHARACTER-18CD5 18CFF..18D1E ; Lo # [32] KHITAN SMALL SCRIPT CHARACTER-18CFF..TANGUT 
IDEOGRAPH-18D1E 18D80..18DF2 ; Lo # [115] TANGUT COMPONENT-769..TANGUT COMPONENT-883 1B000..1B122 ; Lo # [291] KATAKANA LETTER ARCHAIC E..KATAKANA LETTER ARCHAIC WU 1B132 ; Lo # HIRAGANA LETTER SMALL KO 1B150..1B152 ; Lo # [3] HIRAGANA LETTER SMALL WI..HIRAGANA LETTER SMALL WO 1B155 ; Lo # KATAKANA LETTER SMALL KO 1B164..1B167 ; Lo # [4] KATAKANA LETTER SMALL WI..KATAKANA LETTER SMALL N 1B170..1B2FB ; Lo # [396] NUSHU CHARACTER-1B170..NUSHU CHARACTER-1B2FB 1BC00..1BC6A ; Lo # [107] DUPLOYAN LETTER H..DUPLOYAN LETTER VOCALIC M 1BC70..1BC7C ; Lo # [13] DUPLOYAN AFFIX LEFT HORIZONTAL SECANT..DUPLOYAN AFFIX ATTACHED TANGENT HOOK 1BC80..1BC88 ; Lo # [9] DUPLOYAN AFFIX HIGH ACUTE..DUPLOYAN AFFIX HIGH VERTICAL 1BC90..1BC99 ; Lo # [10] DUPLOYAN AFFIX LOW ACUTE..DUPLOYAN AFFIX LOW ARROW 1DF0A ; Lo # LATIN LETTER RETROFLEX CLICK WITH RETROFLEX HOOK 1E100..1E12C ; Lo # [45] NYIAKENG PUACHUE HMONG LETTER MA..NYIAKENG PUACHUE HMONG LETTER W 1E14E ; Lo # NYIAKENG PUACHUE HMONG LOGOGRAM NYAJ 1E290..1E2AD ; Lo # [30] TOTO LETTER PA..TOTO LETTER A 1E2C0..1E2EB ; Lo # [44] WANCHO LETTER AA..WANCHO LETTER YIH 1E4D0..1E4EA ; Lo # [27] NAG MUNDARI LETTER O..NAG MUNDARI LETTER ELL 1E5D0..1E5ED ; Lo # [30] OL ONAL LETTER O..OL ONAL LETTER EG 1E5F0 ; Lo # OL ONAL SIGN HODDOND 1E6C0..1E6DE ; Lo # [31] TAI YO LETTER LOW KO..TAI YO LETTER HIGH KVO 1E6E0..1E6E2 ; Lo # [3] TAI YO LETTER AA..TAI YO LETTER UE 1E6E4..1E6E5 ; Lo # [2] TAI YO LETTER U..TAI YO LETTER AE 1E6E7..1E6ED ; Lo # [7] TAI YO LETTER O..TAI YO LETTER AUE 1E6F0..1E6F4 ; Lo # [5] TAI YO LETTER AN..TAI YO LETTER AP 1E6FE ; Lo # TAI YO SYMBOL MUEANG 1E7E0..1E7E6 ; Lo # [7] ETHIOPIC SYLLABLE HHYA..ETHIOPIC SYLLABLE HHYO 1E7E8..1E7EB ; Lo # [4] ETHIOPIC SYLLABLE GURAGE HHWA..ETHIOPIC SYLLABLE HHWE 1E7ED..1E7EE ; Lo # [2] ETHIOPIC SYLLABLE GURAGE MWI..ETHIOPIC SYLLABLE GURAGE MWEE 1E7F0..1E7FE ; Lo # [15] ETHIOPIC SYLLABLE GURAGE QWI..ETHIOPIC SYLLABLE GURAGE PWEE 1E800..1E8C4 ; Lo # [197] MENDE KIKAKUI SYLLABLE M001 KI..MENDE 
KIKAKUI SYLLABLE M060 NYON 1EE00..1EE03 ; Lo # [4] ARABIC MATHEMATICAL ALEF..ARABIC MATHEMATICAL DAL 1EE05..1EE1F ; Lo # [27] ARABIC MATHEMATICAL WAW..ARABIC MATHEMATICAL DOTLESS QAF 1EE21..1EE22 ; Lo # [2] ARABIC MATHEMATICAL INITIAL BEH..ARABIC MATHEMATICAL INITIAL JEEM 1EE24 ; Lo # ARABIC MATHEMATICAL INITIAL HEH 1EE27 ; Lo # ARABIC MATHEMATICAL INITIAL HAH 1EE29..1EE32 ; Lo # [10] ARABIC MATHEMATICAL INITIAL YEH..ARABIC MATHEMATICAL INITIAL QAF 1EE34..1EE37 ; Lo # [4] ARABIC MATHEMATICAL INITIAL SHEEN..ARABIC MATHEMATICAL INITIAL KHAH 1EE39 ; Lo # ARABIC MATHEMATICAL INITIAL DAD 1EE3B ; Lo # ARABIC MATHEMATICAL INITIAL GHAIN 1EE42 ; Lo # ARABIC MATHEMATICAL TAILED JEEM 1EE47 ; Lo # ARABIC MATHEMATICAL TAILED HAH 1EE49 ; Lo # ARABIC MATHEMATICAL TAILED YEH 1EE4B ; Lo # ARABIC MATHEMATICAL TAILED LAM 1EE4D..1EE4F ; Lo # [3] ARABIC MATHEMATICAL TAILED NOON..ARABIC MATHEMATICAL TAILED AIN 1EE51..1EE52 ; Lo # [2] ARABIC MATHEMATICAL TAILED SAD..ARABIC MATHEMATICAL TAILED QAF 1EE54 ; Lo # ARABIC MATHEMATICAL TAILED SHEEN 1EE57 ; Lo # ARABIC MATHEMATICAL TAILED KHAH 1EE59 ; Lo # ARABIC MATHEMATICAL TAILED DAD 1EE5B ; Lo # ARABIC MATHEMATICAL TAILED GHAIN 1EE5D ; Lo # ARABIC MATHEMATICAL TAILED DOTLESS NOON 1EE5F ; Lo # ARABIC MATHEMATICAL TAILED DOTLESS QAF 1EE61..1EE62 ; Lo # [2] ARABIC MATHEMATICAL STRETCHED BEH..ARABIC MATHEMATICAL STRETCHED JEEM 1EE64 ; Lo # ARABIC MATHEMATICAL STRETCHED HEH 1EE67..1EE6A ; Lo # [4] ARABIC MATHEMATICAL STRETCHED HAH..ARABIC MATHEMATICAL STRETCHED KAF 1EE6C..1EE72 ; Lo # [7] ARABIC MATHEMATICAL STRETCHED MEEM..ARABIC MATHEMATICAL STRETCHED QAF 1EE74..1EE77 ; Lo # [4] ARABIC MATHEMATICAL STRETCHED SHEEN..ARABIC MATHEMATICAL STRETCHED KHAH 1EE79..1EE7C ; Lo # [4] ARABIC MATHEMATICAL STRETCHED DAD..ARABIC MATHEMATICAL STRETCHED DOTLESS BEH 1EE7E ; Lo # ARABIC MATHEMATICAL STRETCHED DOTLESS FEH 1EE80..1EE89 ; Lo # [10] ARABIC MATHEMATICAL LOOPED ALEF..ARABIC MATHEMATICAL LOOPED YEH 1EE8B..1EE9B ; Lo # [17] ARABIC MATHEMATICAL LOOPED 
LAM..ARABIC MATHEMATICAL LOOPED GHAIN 1EEA1..1EEA3 ; Lo # [3] ARABIC MATHEMATICAL DOUBLE-STRUCK BEH..ARABIC MATHEMATICAL DOUBLE-STRUCK DAL 1EEA5..1EEA9 ; Lo # [5] ARABIC MATHEMATICAL DOUBLE-STRUCK WAW..ARABIC MATHEMATICAL DOUBLE-STRUCK YEH 1EEAB..1EEBB ; Lo # [17] ARABIC MATHEMATICAL DOUBLE-STRUCK LAM..ARABIC MATHEMATICAL DOUBLE-STRUCK GHAIN 20000..2A6DF ; Lo # [42720] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6DF 2A700..2B81D ; Lo # [4382] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B81D 2B820..2CEAD ; Lo # [5774] CJK UNIFIED IDEOGRAPH-2B820..CJK UNIFIED IDEOGRAPH-2CEAD 2CEB0..2EBE0 ; Lo # [7473] CJK UNIFIED IDEOGRAPH-2CEB0..CJK UNIFIED IDEOGRAPH-2EBE0 2EBF0..2EE5D ; Lo # [622] CJK UNIFIED IDEOGRAPH-2EBF0..CJK UNIFIED IDEOGRAPH-2EE5D 2F800..2FA1D ; Lo # [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D 30000..3134A ; Lo # [4939] CJK UNIFIED IDEOGRAPH-30000..CJK UNIFIED IDEOGRAPH-3134A 31350..33479 ; Lo # [8490] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-33479 # Total code points: 141062 # ================================================ # General_Category=Nonspacing_Mark 0300..036F ; Mn # [112] COMBINING GRAVE ACCENT..COMBINING LATIN SMALL LETTER X 0483..0487 ; Mn # [5] COMBINING CYRILLIC TITLO..COMBINING CYRILLIC POKRYTIE 0591..05BD ; Mn # [45] HEBREW ACCENT ETNAHTA..HEBREW POINT METEG 05BF ; Mn # HEBREW POINT RAFE 05C1..05C2 ; Mn # [2] HEBREW POINT SHIN DOT..HEBREW POINT SIN DOT 05C4..05C5 ; Mn # [2] HEBREW MARK UPPER DOT..HEBREW MARK LOWER DOT 05C7 ; Mn # HEBREW POINT QAMATS QATAN 0610..061A ; Mn # [11] ARABIC SIGN SALLALLAHOU ALAYHE WASSALLAM..ARABIC SMALL KASRA 064B..065F ; Mn # [21] ARABIC FATHATAN..ARABIC WAVY HAMZA BELOW 0670 ; Mn # ARABIC LETTER SUPERSCRIPT ALEF 06D6..06DC ; Mn # [7] ARABIC SMALL HIGH LIGATURE SAD WITH LAM WITH ALEF MAKSURA..ARABIC SMALL HIGH SEEN 06DF..06E4 ; Mn # [6] ARABIC SMALL HIGH ROUNDED ZERO..ARABIC SMALL HIGH MADDA 06E7..06E8 ; Mn # [2] ARABIC SMALL HIGH YEH..ARABIC SMALL HIGH 
NOON 06EA..06ED ; Mn # [4] ARABIC EMPTY CENTRE LOW STOP..ARABIC SMALL LOW MEEM 0711 ; Mn # SYRIAC LETTER SUPERSCRIPT ALAPH 0730..074A ; Mn # [27] SYRIAC PTHAHA ABOVE..SYRIAC BARREKH 07A6..07B0 ; Mn # [11] THAANA ABAFILI..THAANA SUKUN 07EB..07F3 ; Mn # [9] NKO COMBINING SHORT HIGH TONE..NKO COMBINING DOUBLE DOT ABOVE 07FD ; Mn # NKO DANTAYALAN 0816..0819 ; Mn # [4] SAMARITAN MARK IN..SAMARITAN MARK DAGESH 081B..0823 ; Mn # [9] SAMARITAN MARK EPENTHETIC YUT..SAMARITAN VOWEL SIGN A 0825..0827 ; Mn # [3] SAMARITAN VOWEL SIGN SHORT A..SAMARITAN VOWEL SIGN U 0829..082D ; Mn # [5] SAMARITAN VOWEL SIGN LONG I..SAMARITAN MARK NEQUDAA 0859..085B ; Mn # [3] MANDAIC AFFRICATION MARK..MANDAIC GEMINATION MARK 0897..089F ; Mn # [9] ARABIC PEPET..ARABIC HALF MADDA OVER MADDA 08CA..08E1 ; Mn # [24] ARABIC SMALL HIGH FARSI YEH..ARABIC SMALL HIGH SIGN SAFHA 08E3..0902 ; Mn # [32] ARABIC TURNED DAMMA BELOW..DEVANAGARI SIGN ANUSVARA 093A ; Mn # DEVANAGARI VOWEL SIGN OE 093C ; Mn # DEVANAGARI SIGN NUKTA 0941..0948 ; Mn # [8] DEVANAGARI VOWEL SIGN U..DEVANAGARI VOWEL SIGN AI 094D ; Mn # DEVANAGARI SIGN VIRAMA 0951..0957 ; Mn # [7] DEVANAGARI STRESS SIGN UDATTA..DEVANAGARI VOWEL SIGN UUE 0962..0963 ; Mn # [2] DEVANAGARI VOWEL SIGN VOCALIC L..DEVANAGARI VOWEL SIGN VOCALIC LL 0981 ; Mn # BENGALI SIGN CANDRABINDU 09BC ; Mn # BENGALI SIGN NUKTA 09C1..09C4 ; Mn # [4] BENGALI VOWEL SIGN U..BENGALI VOWEL SIGN VOCALIC RR 09CD ; Mn # BENGALI SIGN VIRAMA 09E2..09E3 ; Mn # [2] BENGALI VOWEL SIGN VOCALIC L..BENGALI VOWEL SIGN VOCALIC LL 09FE ; Mn # BENGALI SANDHI MARK 0A01..0A02 ; Mn # [2] GURMUKHI SIGN ADAK BINDI..GURMUKHI SIGN BINDI 0A3C ; Mn # GURMUKHI SIGN NUKTA 0A41..0A42 ; Mn # [2] GURMUKHI VOWEL SIGN U..GURMUKHI VOWEL SIGN UU 0A47..0A48 ; Mn # [2] GURMUKHI VOWEL SIGN EE..GURMUKHI VOWEL SIGN AI 0A4B..0A4D ; Mn # [3] GURMUKHI VOWEL SIGN OO..GURMUKHI SIGN VIRAMA 0A51 ; Mn # GURMUKHI SIGN UDAAT 0A70..0A71 ; Mn # [2] GURMUKHI TIPPI..GURMUKHI ADDAK 0A75 ; Mn # GURMUKHI SIGN YAKASH 0A81..0A82 ; Mn # 
[2] GUJARATI SIGN CANDRABINDU..GUJARATI SIGN ANUSVARA 0ABC ; Mn # GUJARATI SIGN NUKTA 0AC1..0AC5 ; Mn # [5] GUJARATI VOWEL SIGN U..GUJARATI VOWEL SIGN CANDRA E 0AC7..0AC8 ; Mn # [2] GUJARATI VOWEL SIGN E..GUJARATI VOWEL SIGN AI 0ACD ; Mn # GUJARATI SIGN VIRAMA 0AE2..0AE3 ; Mn # [2] GUJARATI VOWEL SIGN VOCALIC L..GUJARATI VOWEL SIGN VOCALIC LL 0AFA..0AFF ; Mn # [6] GUJARATI SIGN SUKUN..GUJARATI SIGN TWO-CIRCLE NUKTA ABOVE 0B01 ; Mn # ORIYA SIGN CANDRABINDU 0B3C ; Mn # ORIYA SIGN NUKTA 0B3F ; Mn # ORIYA VOWEL SIGN I 0B41..0B44 ; Mn # [4] ORIYA VOWEL SIGN U..ORIYA VOWEL SIGN VOCALIC RR 0B4D ; Mn # ORIYA SIGN VIRAMA 0B55..0B56 ; Mn # [2] ORIYA SIGN OVERLINE..ORIYA AI LENGTH MARK 0B62..0B63 ; Mn # [2] ORIYA VOWEL SIGN VOCALIC L..ORIYA VOWEL SIGN VOCALIC LL 0B82 ; Mn # TAMIL SIGN ANUSVARA 0BC0 ; Mn # TAMIL VOWEL SIGN II 0BCD ; Mn # TAMIL SIGN VIRAMA 0C00 ; Mn # TELUGU SIGN COMBINING CANDRABINDU ABOVE 0C04 ; Mn # TELUGU SIGN COMBINING ANUSVARA ABOVE 0C3C ; Mn # TELUGU SIGN NUKTA 0C3E..0C40 ; Mn # [3] TELUGU VOWEL SIGN AA..TELUGU VOWEL SIGN II 0C46..0C48 ; Mn # [3] TELUGU VOWEL SIGN E..TELUGU VOWEL SIGN AI 0C4A..0C4D ; Mn # [4] TELUGU VOWEL SIGN O..TELUGU SIGN VIRAMA 0C55..0C56 ; Mn # [2] TELUGU LENGTH MARK..TELUGU AI LENGTH MARK 0C62..0C63 ; Mn # [2] TELUGU VOWEL SIGN VOCALIC L..TELUGU VOWEL SIGN VOCALIC LL 0C81 ; Mn # KANNADA SIGN CANDRABINDU 0CBC ; Mn # KANNADA SIGN NUKTA 0CBF ; Mn # KANNADA VOWEL SIGN I 0CC6 ; Mn # KANNADA VOWEL SIGN E 0CCC..0CCD ; Mn # [2] KANNADA VOWEL SIGN AU..KANNADA SIGN VIRAMA 0CE2..0CE3 ; Mn # [2] KANNADA VOWEL SIGN VOCALIC L..KANNADA VOWEL SIGN VOCALIC LL 0D00..0D01 ; Mn # [2] MALAYALAM SIGN COMBINING ANUSVARA ABOVE..MALAYALAM SIGN CANDRABINDU 0D3B..0D3C ; Mn # [2] MALAYALAM SIGN VERTICAL BAR VIRAMA..MALAYALAM SIGN CIRCULAR VIRAMA 0D41..0D44 ; Mn # [4] MALAYALAM VOWEL SIGN U..MALAYALAM VOWEL SIGN VOCALIC RR 0D4D ; Mn # MALAYALAM SIGN VIRAMA 0D62..0D63 ; Mn # [2] MALAYALAM VOWEL SIGN VOCALIC L..MALAYALAM VOWEL SIGN VOCALIC LL 0D81 ; Mn # SINHALA 
SIGN CANDRABINDU 0DCA ; Mn # SINHALA SIGN AL-LAKUNA 0DD2..0DD4 ; Mn # [3] SINHALA VOWEL SIGN KETTI IS-PILLA..SINHALA VOWEL SIGN KETTI PAA-PILLA 0DD6 ; Mn # SINHALA VOWEL SIGN DIGA PAA-PILLA 0E31 ; Mn # THAI CHARACTER MAI HAN-AKAT 0E34..0E3A ; Mn # [7] THAI CHARACTER SARA I..THAI CHARACTER PHINTHU 0E47..0E4E ; Mn # [8] THAI CHARACTER MAITAIKHU..THAI CHARACTER YAMAKKAN 0EB1 ; Mn # LAO VOWEL SIGN MAI KAN 0EB4..0EBC ; Mn # [9] LAO VOWEL SIGN I..LAO SEMIVOWEL SIGN LO 0EC8..0ECE ; Mn # [7] LAO TONE MAI EK..LAO YAMAKKAN 0F18..0F19 ; Mn # [2] TIBETAN ASTROLOGICAL SIGN -KHYUD PA..TIBETAN ASTROLOGICAL SIGN SDONG TSHUGS 0F35 ; Mn # TIBETAN MARK NGAS BZUNG NYI ZLA 0F37 ; Mn # TIBETAN MARK NGAS BZUNG SGOR RTAGS 0F39 ; Mn # TIBETAN MARK TSA -PHRU 0F71..0F7E ; Mn # [14] TIBETAN VOWEL SIGN AA..TIBETAN SIGN RJES SU NGA RO 0F80..0F84 ; Mn # [5] TIBETAN VOWEL SIGN REVERSED I..TIBETAN MARK HALANTA 0F86..0F87 ; Mn # [2] TIBETAN SIGN LCI RTAGS..TIBETAN SIGN YANG RTAGS 0F8D..0F97 ; Mn # [11] TIBETAN SUBJOINED SIGN LCE TSA CAN..TIBETAN SUBJOINED LETTER JA 0F99..0FBC ; Mn # [36] TIBETAN SUBJOINED LETTER NYA..TIBETAN SUBJOINED LETTER FIXED-FORM RA 0FC6 ; Mn # TIBETAN SYMBOL PADMA GDAN 102D..1030 ; Mn # [4] MYANMAR VOWEL SIGN I..MYANMAR VOWEL SIGN UU 1032..1037 ; Mn # [6] MYANMAR VOWEL SIGN AI..MYANMAR SIGN DOT BELOW 1039..103A ; Mn # [2] MYANMAR SIGN VIRAMA..MYANMAR SIGN ASAT 103D..103E ; Mn # [2] MYANMAR CONSONANT SIGN MEDIAL WA..MYANMAR CONSONANT SIGN MEDIAL HA 1058..1059 ; Mn # [2] MYANMAR VOWEL SIGN VOCALIC L..MYANMAR VOWEL SIGN VOCALIC LL 105E..1060 ; Mn # [3] MYANMAR CONSONANT SIGN MON MEDIAL NA..MYANMAR CONSONANT SIGN MON MEDIAL LA 1071..1074 ; Mn # [4] MYANMAR VOWEL SIGN GEBA KAREN I..MYANMAR VOWEL SIGN KAYAH EE 1082 ; Mn # MYANMAR CONSONANT SIGN SHAN MEDIAL WA 1085..1086 ; Mn # [2] MYANMAR VOWEL SIGN SHAN E ABOVE..MYANMAR VOWEL SIGN SHAN FINAL Y 108D ; Mn # MYANMAR SIGN SHAN COUNCIL EMPHATIC TONE 109D ; Mn # MYANMAR VOWEL SIGN AITON AI 135D..135F ; Mn # [3] ETHIOPIC COMBINING 
GEMINATION AND VOWEL LENGTH MARK..ETHIOPIC COMBINING GEMINATION MARK 1712..1714 ; Mn # [3] TAGALOG VOWEL SIGN I..TAGALOG SIGN VIRAMA 1732..1733 ; Mn # [2] HANUNOO VOWEL SIGN I..HANUNOO VOWEL SIGN U 1752..1753 ; Mn # [2] BUHID VOWEL SIGN I..BUHID VOWEL SIGN U 1772..1773 ; Mn # [2] TAGBANWA VOWEL SIGN I..TAGBANWA VOWEL SIGN U 17B4..17B5 ; Mn # [2] KHMER VOWEL INHERENT AQ..KHMER VOWEL INHERENT AA 17B7..17BD ; Mn # [7] KHMER VOWEL SIGN I..KHMER VOWEL SIGN UA 17C6 ; Mn # KHMER SIGN NIKAHIT 17C9..17D3 ; Mn # [11] KHMER SIGN MUUSIKATOAN..KHMER SIGN BATHAMASAT 17DD ; Mn # KHMER SIGN ATTHACAN 180B..180D ; Mn # [3] MONGOLIAN FREE VARIATION SELECTOR ONE..MONGOLIAN FREE VARIATION SELECTOR THREE 180F ; Mn # MONGOLIAN FREE VARIATION SELECTOR FOUR 1885..1886 ; Mn # [2] MONGOLIAN LETTER ALI GALI BALUDA..MONGOLIAN LETTER ALI GALI THREE BALUDA 18A9 ; Mn # MONGOLIAN LETTER ALI GALI DAGALGA 1920..1922 ; Mn # [3] LIMBU VOWEL SIGN A..LIMBU VOWEL SIGN U 1927..1928 ; Mn # [2] LIMBU VOWEL SIGN E..LIMBU VOWEL SIGN O 1932 ; Mn # LIMBU SMALL LETTER ANUSVARA 1939..193B ; Mn # [3] LIMBU SIGN MUKPHRENG..LIMBU SIGN SA-I 1A17..1A18 ; Mn # [2] BUGINESE VOWEL SIGN I..BUGINESE VOWEL SIGN U 1A1B ; Mn # BUGINESE VOWEL SIGN AE 1A56 ; Mn # TAI THAM CONSONANT SIGN MEDIAL LA 1A58..1A5E ; Mn # [7] TAI THAM SIGN MAI KANG LAI..TAI THAM CONSONANT SIGN SA 1A60 ; Mn # TAI THAM SIGN SAKOT 1A62 ; Mn # TAI THAM VOWEL SIGN MAI SAT 1A65..1A6C ; Mn # [8] TAI THAM VOWEL SIGN I..TAI THAM VOWEL SIGN OA BELOW 1A73..1A7C ; Mn # [10] TAI THAM VOWEL SIGN OA ABOVE..TAI THAM SIGN KHUEN-LUE KARAN 1A7F ; Mn # TAI THAM COMBINING CRYPTOGRAMMIC DOT 1AB0..1ABD ; Mn # [14] COMBINING DOUBLED CIRCUMFLEX ACCENT..COMBINING PARENTHESES BELOW 1ABF..1ADD ; Mn # [31] COMBINING LATIN SMALL LETTER W BELOW..COMBINING DOT-AND-RING BELOW 1AE0..1AEB ; Mn # [12] COMBINING LEFT TACK ABOVE..COMBINING DOUBLE RIGHTWARDS ARROW ABOVE 1B00..1B03 ; Mn # [4] BALINESE SIGN ULU RICEM..BALINESE SIGN SURANG 1B34 ; Mn # BALINESE SIGN REREKAN 1B36..1B3A ; Mn # 
[5] BALINESE VOWEL SIGN ULU..BALINESE VOWEL SIGN RA REPA 1B3C ; Mn # BALINESE VOWEL SIGN LA LENGA 1B42 ; Mn # BALINESE VOWEL SIGN PEPET 1B6B..1B73 ; Mn # [9] BALINESE MUSICAL SYMBOL COMBINING TEGEH..BALINESE MUSICAL SYMBOL COMBINING GONG 1B80..1B81 ; Mn # [2] SUNDANESE SIGN PANYECEK..SUNDANESE SIGN PANGLAYAR 1BA2..1BA5 ; Mn # [4] SUNDANESE CONSONANT SIGN PANYAKRA..SUNDANESE VOWEL SIGN PANYUKU 1BA8..1BA9 ; Mn # [2] SUNDANESE VOWEL SIGN PAMEPET..SUNDANESE VOWEL SIGN PANEULEUNG 1BAB..1BAD ; Mn # [3] SUNDANESE SIGN VIRAMA..SUNDANESE CONSONANT SIGN PASANGAN WA 1BE6 ; Mn # BATAK SIGN TOMPI 1BE8..1BE9 ; Mn # [2] BATAK VOWEL SIGN PAKPAK E..BATAK VOWEL SIGN EE 1BED ; Mn # BATAK VOWEL SIGN KARO O 1BEF..1BF1 ; Mn # [3] BATAK VOWEL SIGN U FOR SIMALUNGUN SA..BATAK CONSONANT SIGN H 1C2C..1C33 ; Mn # [8] LEPCHA VOWEL SIGN E..LEPCHA CONSONANT SIGN T 1C36..1C37 ; Mn # [2] LEPCHA SIGN RAN..LEPCHA SIGN NUKTA 1CD0..1CD2 ; Mn # [3] VEDIC TONE KARSHANA..VEDIC TONE PRENKHA 1CD4..1CE0 ; Mn # [13] VEDIC SIGN YAJURVEDIC MIDLINE SVARITA..VEDIC TONE RIGVEDIC KASHMIRI INDEPENDENT SVARITA 1CE2..1CE8 ; Mn # [7] VEDIC SIGN VISARGA SVARITA..VEDIC SIGN VISARGA ANUDATTA WITH TAIL 1CED ; Mn # VEDIC SIGN TIRYAK 1CF4 ; Mn # VEDIC TONE CANDRA ABOVE 1CF8..1CF9 ; Mn # [2] VEDIC TONE RING ABOVE..VEDIC TONE DOUBLE RING ABOVE 1DC0..1DFF ; Mn # [64] COMBINING DOTTED GRAVE ACCENT..COMBINING RIGHT ARROWHEAD AND DOWN ARROWHEAD BELOW 20D0..20DC ; Mn # [13] COMBINING LEFT HARPOON ABOVE..COMBINING FOUR DOTS ABOVE 20E1 ; Mn # COMBINING LEFT RIGHT ARROW ABOVE 20E5..20F0 ; Mn # [12] COMBINING REVERSE SOLIDUS OVERLAY..COMBINING ASTERISK ABOVE 2CEF..2CF1 ; Mn # [3] COPTIC COMBINING NI ABOVE..COPTIC COMBINING SPIRITUS LENIS 2D7F ; Mn # TIFINAGH CONSONANT JOINER 2DE0..2DFF ; Mn # [32] COMBINING CYRILLIC LETTER BE..COMBINING CYRILLIC LETTER IOTIFIED BIG YUS 302A..302D ; Mn # [4] IDEOGRAPHIC LEVEL TONE MARK..IDEOGRAPHIC ENTERING TONE MARK 3099..309A ; Mn # [2] COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK..COMBINING 
KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK A66F ; Mn # COMBINING CYRILLIC VZMET A674..A67D ; Mn # [10] COMBINING CYRILLIC LETTER UKRAINIAN IE..COMBINING CYRILLIC PAYEROK A69E..A69F ; Mn # [2] COMBINING CYRILLIC LETTER EF..COMBINING CYRILLIC LETTER IOTIFIED E A6F0..A6F1 ; Mn # [2] BAMUM COMBINING MARK KOQNDON..BAMUM COMBINING MARK TUKWENTIS A802 ; Mn # SYLOTI NAGRI SIGN DVISVARA A806 ; Mn # SYLOTI NAGRI SIGN HASANTA A80B ; Mn # SYLOTI NAGRI SIGN ANUSVARA A825..A826 ; Mn # [2] SYLOTI NAGRI VOWEL SIGN U..SYLOTI NAGRI VOWEL SIGN E A82C ; Mn # SYLOTI NAGRI SIGN ALTERNATE HASANTA A8C4..A8C5 ; Mn # [2] SAURASHTRA SIGN VIRAMA..SAURASHTRA SIGN CANDRABINDU A8E0..A8F1 ; Mn # [18] COMBINING DEVANAGARI DIGIT ZERO..COMBINING DEVANAGARI SIGN AVAGRAHA A8FF ; Mn # DEVANAGARI VOWEL SIGN AY A926..A92D ; Mn # [8] KAYAH LI VOWEL UE..KAYAH LI TONE CALYA PLOPHU A947..A951 ; Mn # [11] REJANG VOWEL SIGN I..REJANG CONSONANT SIGN R A980..A982 ; Mn # [3] JAVANESE SIGN PANYANGGA..JAVANESE SIGN LAYAR A9B3 ; Mn # JAVANESE SIGN CECAK TELU A9B6..A9B9 ; Mn # [4] JAVANESE VOWEL SIGN WULU..JAVANESE VOWEL SIGN SUKU MENDUT A9BC..A9BD ; Mn # [2] JAVANESE VOWEL SIGN PEPET..JAVANESE CONSONANT SIGN KERET A9E5 ; Mn # MYANMAR SIGN SHAN SAW AA29..AA2E ; Mn # [6] CHAM VOWEL SIGN AA..CHAM VOWEL SIGN OE AA31..AA32 ; Mn # [2] CHAM VOWEL SIGN AU..CHAM VOWEL SIGN UE AA35..AA36 ; Mn # [2] CHAM CONSONANT SIGN LA..CHAM CONSONANT SIGN WA AA43 ; Mn # CHAM CONSONANT SIGN FINAL NG AA4C ; Mn # CHAM CONSONANT SIGN FINAL M AA7C ; Mn # MYANMAR SIGN TAI LAING TONE-2 AAB0 ; Mn # TAI VIET MAI KANG AAB2..AAB4 ; Mn # [3] TAI VIET VOWEL I..TAI VIET VOWEL U AAB7..AAB8 ; Mn # [2] TAI VIET MAI KHIT..TAI VIET VOWEL IA AABE..AABF ; Mn # [2] TAI VIET VOWEL AM..TAI VIET TONE MAI EK AAC1 ; Mn # TAI VIET TONE MAI THO AAEC..AAED ; Mn # [2] MEETEI MAYEK VOWEL SIGN UU..MEETEI MAYEK VOWEL SIGN AAI AAF6 ; Mn # MEETEI MAYEK VIRAMA ABE5 ; Mn # MEETEI MAYEK VOWEL SIGN ANAP ABE8 ; Mn # MEETEI MAYEK VOWEL SIGN UNAP ABED ; Mn # MEETEI MAYEK APUN IYEK 
FB1E ; Mn # HEBREW POINT JUDEO-SPANISH VARIKA FE00..FE0F ; Mn # [16] VARIATION SELECTOR-1..VARIATION SELECTOR-16 FE20..FE2F ; Mn # [16] COMBINING LIGATURE LEFT HALF..COMBINING CYRILLIC TITLO RIGHT HALF 101FD ; Mn # PHAISTOS DISC SIGN COMBINING OBLIQUE STROKE 102E0 ; Mn # COPTIC EPACT THOUSANDS MARK 10376..1037A ; Mn # [5] COMBINING OLD PERMIC LETTER AN..COMBINING OLD PERMIC LETTER SII 10A01..10A03 ; Mn # [3] KHAROSHTHI VOWEL SIGN I..KHAROSHTHI VOWEL SIGN VOCALIC R 10A05..10A06 ; Mn # [2] KHAROSHTHI VOWEL SIGN E..KHAROSHTHI VOWEL SIGN O 10A0C..10A0F ; Mn # [4] KHAROSHTHI VOWEL LENGTH MARK..KHAROSHTHI SIGN VISARGA 10A38..10A3A ; Mn # [3] KHAROSHTHI SIGN BAR ABOVE..KHAROSHTHI SIGN DOT BELOW 10A3F ; Mn # KHAROSHTHI VIRAMA 10AE5..10AE6 ; Mn # [2] MANICHAEAN ABBREVIATION MARK ABOVE..MANICHAEAN ABBREVIATION MARK BELOW 10D24..10D27 ; Mn # [4] HANIFI ROHINGYA SIGN HARBAHAY..HANIFI ROHINGYA SIGN TASSI 10D69..10D6D ; Mn # [5] GARAY VOWEL SIGN E..GARAY CONSONANT NASALIZATION MARK 10EAB..10EAC ; Mn # [2] YEZIDI COMBINING HAMZA MARK..YEZIDI COMBINING MADDA MARK 10EFA..10EFF ; Mn # [6] ARABIC DOUBLE VERTICAL BAR BELOW..ARABIC SMALL LOW WORD MADDA 10F46..10F50 ; Mn # [11] SOGDIAN COMBINING DOT BELOW..SOGDIAN COMBINING STROKE BELOW 10F82..10F85 ; Mn # [4] OLD UYGHUR COMBINING DOT ABOVE..OLD UYGHUR COMBINING TWO DOTS BELOW 11001 ; Mn # BRAHMI SIGN ANUSVARA 11038..11046 ; Mn # [15] BRAHMI VOWEL SIGN AA..BRAHMI VIRAMA 11070 ; Mn # BRAHMI SIGN OLD TAMIL VIRAMA 11073..11074 ; Mn # [2] BRAHMI VOWEL SIGN OLD TAMIL SHORT E..BRAHMI VOWEL SIGN OLD TAMIL SHORT O 1107F..11081 ; Mn # [3] BRAHMI NUMBER JOINER..KAITHI SIGN ANUSVARA 110B3..110B6 ; Mn # [4] KAITHI VOWEL SIGN U..KAITHI VOWEL SIGN AI 110B9..110BA ; Mn # [2] KAITHI SIGN VIRAMA..KAITHI SIGN NUKTA 110C2 ; Mn # KAITHI VOWEL SIGN VOCALIC R 11100..11102 ; Mn # [3] CHAKMA SIGN CANDRABINDU..CHAKMA SIGN VISARGA 11127..1112B ; Mn # [5] CHAKMA VOWEL SIGN A..CHAKMA VOWEL SIGN UU 1112D..11134 ; Mn # [8] CHAKMA VOWEL SIGN AI..CHAKMA MAAYYAA 11173 
; Mn # MAHAJANI SIGN NUKTA 11180..11181 ; Mn # [2] SHARADA SIGN CANDRABINDU..SHARADA SIGN ANUSVARA 111B6..111BE ; Mn # [9] SHARADA VOWEL SIGN U..SHARADA VOWEL SIGN O 111C9..111CC ; Mn # [4] SHARADA SANDHI MARK..SHARADA EXTRA SHORT VOWEL MARK 111CF ; Mn # SHARADA SIGN INVERTED CANDRABINDU 1122F..11231 ; Mn # [3] KHOJKI VOWEL SIGN U..KHOJKI VOWEL SIGN AI 11234 ; Mn # KHOJKI SIGN ANUSVARA 11236..11237 ; Mn # [2] KHOJKI SIGN NUKTA..KHOJKI SIGN SHADDA 1123E ; Mn # KHOJKI SIGN SUKUN 11241 ; Mn # KHOJKI VOWEL SIGN VOCALIC R 112DF ; Mn # KHUDAWADI SIGN ANUSVARA 112E3..112EA ; Mn # [8] KHUDAWADI VOWEL SIGN U..KHUDAWADI SIGN VIRAMA 11300..11301 ; Mn # [2] GRANTHA SIGN COMBINING ANUSVARA ABOVE..GRANTHA SIGN CANDRABINDU 1133B..1133C ; Mn # [2] COMBINING BINDU BELOW..GRANTHA SIGN NUKTA 11340 ; Mn # GRANTHA VOWEL SIGN II 11366..1136C ; Mn # [7] COMBINING GRANTHA DIGIT ZERO..COMBINING GRANTHA DIGIT SIX 11370..11374 ; Mn # [5] COMBINING GRANTHA LETTER A..COMBINING GRANTHA LETTER PA 113BB..113C0 ; Mn # [6] TULU-TIGALARI VOWEL SIGN U..TULU-TIGALARI VOWEL SIGN VOCALIC LL 113CE ; Mn # TULU-TIGALARI SIGN VIRAMA 113D0 ; Mn # TULU-TIGALARI CONJOINER 113D2 ; Mn # TULU-TIGALARI GEMINATION MARK 113E1..113E2 ; Mn # [2] TULU-TIGALARI VEDIC TONE SVARITA..TULU-TIGALARI VEDIC TONE ANUDATTA 11438..1143F ; Mn # [8] NEWA VOWEL SIGN U..NEWA VOWEL SIGN AI 11442..11444 ; Mn # [3] NEWA SIGN VIRAMA..NEWA SIGN ANUSVARA 11446 ; Mn # NEWA SIGN NUKTA 1145E ; Mn # NEWA SANDHI MARK 114B3..114B8 ; Mn # [6] TIRHUTA VOWEL SIGN U..TIRHUTA VOWEL SIGN VOCALIC LL 114BA ; Mn # TIRHUTA VOWEL SIGN SHORT E 114BF..114C0 ; Mn # [2] TIRHUTA SIGN CANDRABINDU..TIRHUTA SIGN ANUSVARA 114C2..114C3 ; Mn # [2] TIRHUTA SIGN VIRAMA..TIRHUTA SIGN NUKTA 115B2..115B5 ; Mn # [4] SIDDHAM VOWEL SIGN U..SIDDHAM VOWEL SIGN VOCALIC RR 115BC..115BD ; Mn # [2] SIDDHAM SIGN CANDRABINDU..SIDDHAM SIGN ANUSVARA 115BF..115C0 ; Mn # [2] SIDDHAM SIGN VIRAMA..SIDDHAM SIGN NUKTA 115DC..115DD ; Mn # [2] SIDDHAM VOWEL SIGN ALTERNATE U..SIDDHAM VOWEL 
SIGN ALTERNATE UU 11633..1163A ; Mn # [8] MODI VOWEL SIGN U..MODI VOWEL SIGN AI 1163D ; Mn # MODI SIGN ANUSVARA 1163F..11640 ; Mn # [2] MODI SIGN VIRAMA..MODI SIGN ARDHACANDRA 116AB ; Mn # TAKRI SIGN ANUSVARA 116AD ; Mn # TAKRI VOWEL SIGN AA 116B0..116B5 ; Mn # [6] TAKRI VOWEL SIGN U..TAKRI VOWEL SIGN AU 116B7 ; Mn # TAKRI SIGN NUKTA 1171D ; Mn # AHOM CONSONANT SIGN MEDIAL LA 1171F ; Mn # AHOM CONSONANT SIGN MEDIAL LIGATING RA 11722..11725 ; Mn # [4] AHOM VOWEL SIGN I..AHOM VOWEL SIGN UU 11727..1172B ; Mn # [5] AHOM VOWEL SIGN AW..AHOM SIGN KILLER 1182F..11837 ; Mn # [9] DOGRA VOWEL SIGN U..DOGRA SIGN ANUSVARA 11839..1183A ; Mn # [2] DOGRA SIGN VIRAMA..DOGRA SIGN NUKTA 1193B..1193C ; Mn # [2] DIVES AKURU SIGN ANUSVARA..DIVES AKURU SIGN CANDRABINDU 1193E ; Mn # DIVES AKURU VIRAMA 11943 ; Mn # DIVES AKURU SIGN NUKTA 119D4..119D7 ; Mn # [4] NANDINAGARI VOWEL SIGN U..NANDINAGARI VOWEL SIGN VOCALIC RR 119DA..119DB ; Mn # [2] NANDINAGARI VOWEL SIGN E..NANDINAGARI VOWEL SIGN AI 119E0 ; Mn # NANDINAGARI SIGN VIRAMA 11A01..11A0A ; Mn # [10] ZANABAZAR SQUARE VOWEL SIGN I..ZANABAZAR SQUARE VOWEL LENGTH MARK 11A33..11A38 ; Mn # [6] ZANABAZAR SQUARE FINAL CONSONANT MARK..ZANABAZAR SQUARE SIGN ANUSVARA 11A3B..11A3E ; Mn # [4] ZANABAZAR SQUARE CLUSTER-FINAL LETTER YA..ZANABAZAR SQUARE CLUSTER-FINAL LETTER VA 11A47 ; Mn # ZANABAZAR SQUARE SUBJOINER 11A51..11A56 ; Mn # [6] SOYOMBO VOWEL SIGN I..SOYOMBO VOWEL SIGN OE 11A59..11A5B ; Mn # [3] SOYOMBO VOWEL SIGN VOCALIC R..SOYOMBO VOWEL LENGTH MARK 11A8A..11A96 ; Mn # [13] SOYOMBO FINAL CONSONANT SIGN G..SOYOMBO SIGN ANUSVARA 11A98..11A99 ; Mn # [2] SOYOMBO GEMINATION MARK..SOYOMBO SUBJOINER 11B60 ; Mn # SHARADA VOWEL SIGN OE 11B62..11B64 ; Mn # [3] SHARADA VOWEL SIGN UE..SHARADA VOWEL SIGN SHORT E 11B66 ; Mn # SHARADA VOWEL SIGN CANDRA E 11C30..11C36 ; Mn # [7] BHAIKSUKI VOWEL SIGN I..BHAIKSUKI VOWEL SIGN VOCALIC L 11C38..11C3D ; Mn # [6] BHAIKSUKI VOWEL SIGN E..BHAIKSUKI SIGN ANUSVARA 11C3F ; Mn # BHAIKSUKI SIGN VIRAMA 11C92..11CA7 ; 
Mn # [22] MARCHEN SUBJOINED LETTER KA..MARCHEN SUBJOINED LETTER ZA 11CAA..11CB0 ; Mn # [7] MARCHEN SUBJOINED LETTER RA..MARCHEN VOWEL SIGN AA 11CB2..11CB3 ; Mn # [2] MARCHEN VOWEL SIGN U..MARCHEN VOWEL SIGN E 11CB5..11CB6 ; Mn # [2] MARCHEN SIGN ANUSVARA..MARCHEN SIGN CANDRABINDU 11D31..11D36 ; Mn # [6] MASARAM GONDI VOWEL SIGN AA..MASARAM GONDI VOWEL SIGN VOCALIC R 11D3A ; Mn # MASARAM GONDI VOWEL SIGN E 11D3C..11D3D ; Mn # [2] MASARAM GONDI VOWEL SIGN AI..MASARAM GONDI VOWEL SIGN O 11D3F..11D45 ; Mn # [7] MASARAM GONDI VOWEL SIGN AU..MASARAM GONDI VIRAMA 11D47 ; Mn # MASARAM GONDI RA-KARA 11D90..11D91 ; Mn # [2] GUNJALA GONDI VOWEL SIGN EE..GUNJALA GONDI VOWEL SIGN AI 11D95 ; Mn # GUNJALA GONDI SIGN ANUSVARA 11D97 ; Mn # GUNJALA GONDI VIRAMA 11EF3..11EF4 ; Mn # [2] MAKASAR VOWEL SIGN I..MAKASAR VOWEL SIGN U 11F00..11F01 ; Mn # [2] KAWI SIGN CANDRABINDU..KAWI SIGN ANUSVARA 11F36..11F3A ; Mn # [5] KAWI VOWEL SIGN I..KAWI VOWEL SIGN VOCALIC R 11F40 ; Mn # KAWI VOWEL SIGN EU 11F42 ; Mn # KAWI CONJOINER 11F5A ; Mn # KAWI SIGN NUKTA 13440 ; Mn # EGYPTIAN HIEROGLYPH MIRROR HORIZONTALLY 13447..13455 ; Mn # [15] EGYPTIAN HIEROGLYPH MODIFIER DAMAGED AT TOP START..EGYPTIAN HIEROGLYPH MODIFIER DAMAGED 1611E..16129 ; Mn # [12] GURUNG KHEMA VOWEL SIGN AA..GURUNG KHEMA VOWEL LENGTH MARK 1612D..1612F ; Mn # [3] GURUNG KHEMA SIGN ANUSVARA..GURUNG KHEMA SIGN THOLHOMA 16AF0..16AF4 ; Mn # [5] BASSA VAH COMBINING HIGH TONE..BASSA VAH COMBINING HIGH-LOW TONE 16B30..16B36 ; Mn # [7] PAHAWH HMONG MARK CIM TUB..PAHAWH HMONG MARK CIM TAUM 16F4F ; Mn # MIAO SIGN CONSONANT MODIFIER BAR 16F8F..16F92 ; Mn # [4] MIAO TONE RIGHT..MIAO TONE BELOW 16FE4 ; Mn # KHITAN SMALL SCRIPT FILLER 1BC9D..1BC9E ; Mn # [2] DUPLOYAN THICK LETTER SELECTOR..DUPLOYAN DOUBLE MARK 1CF00..1CF2D ; Mn # [46] ZNAMENNY COMBINING MARK GORAZDO NIZKO S KRYZHEM ON LEFT..ZNAMENNY COMBINING MARK KRYZH ON LEFT 1CF30..1CF46 ; Mn # [23] ZNAMENNY COMBINING TONAL RANGE MARK MRACHNO..ZNAMENNY PRIZNAK MODIFIER ROG 1D167..1D169 ; Mn 
# [3] MUSICAL SYMBOL COMBINING TREMOLO-1..MUSICAL SYMBOL COMBINING TREMOLO-3 1D17B..1D182 ; Mn # [8] MUSICAL SYMBOL COMBINING ACCENT..MUSICAL SYMBOL COMBINING LOURE 1D185..1D18B ; Mn # [7] MUSICAL SYMBOL COMBINING DOIT..MUSICAL SYMBOL COMBINING TRIPLE TONGUE 1D1AA..1D1AD ; Mn # [4] MUSICAL SYMBOL COMBINING DOWN BOW..MUSICAL SYMBOL COMBINING SNAP PIZZICATO 1D242..1D244 ; Mn # [3] COMBINING GREEK MUSICAL TRISEME..COMBINING GREEK MUSICAL PENTASEME 1DA00..1DA36 ; Mn # [55] SIGNWRITING HEAD RIM..SIGNWRITING AIR SUCKING IN 1DA3B..1DA6C ; Mn # [50] SIGNWRITING MOUTH CLOSED NEUTRAL..SIGNWRITING EXCITEMENT 1DA75 ; Mn # SIGNWRITING UPPER BODY TILTING FROM HIP JOINTS 1DA84 ; Mn # SIGNWRITING LOCATION HEAD NECK 1DA9B..1DA9F ; Mn # [5] SIGNWRITING FILL MODIFIER-2..SIGNWRITING FILL MODIFIER-6 1DAA1..1DAAF ; Mn # [15] SIGNWRITING ROTATION MODIFIER-2..SIGNWRITING ROTATION MODIFIER-16 1E000..1E006 ; Mn # [7] COMBINING GLAGOLITIC LETTER AZU..COMBINING GLAGOLITIC LETTER ZHIVETE 1E008..1E018 ; Mn # [17] COMBINING GLAGOLITIC LETTER ZEMLJA..COMBINING GLAGOLITIC LETTER HERU 1E01B..1E021 ; Mn # [7] COMBINING GLAGOLITIC LETTER SHTA..COMBINING GLAGOLITIC LETTER YATI 1E023..1E024 ; Mn # [2] COMBINING GLAGOLITIC LETTER YU..COMBINING GLAGOLITIC LETTER SMALL YUS 1E026..1E02A ; Mn # [5] COMBINING GLAGOLITIC LETTER YO..COMBINING GLAGOLITIC LETTER FITA 1E08F ; Mn # COMBINING CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I 1E130..1E136 ; Mn # [7] NYIAKENG PUACHUE HMONG TONE-B..NYIAKENG PUACHUE HMONG TONE-D 1E2AE ; Mn # TOTO SIGN RISING TONE 1E2EC..1E2EF ; Mn # [4] WANCHO TONE TUP..WANCHO TONE KOINI 1E4EC..1E4EF ; Mn # [4] NAG MUNDARI SIGN MUHOR..NAG MUNDARI SIGN SUTUH 1E5EE..1E5EF ; Mn # [2] OL ONAL SIGN MU..OL ONAL SIGN IKIR 1E6E3 ; Mn # TAI YO SIGN UE 1E6E6 ; Mn # TAI YO SIGN AU 1E6EE..1E6EF ; Mn # [2] TAI YO SIGN AY..TAI YO SIGN ANG 1E6F5 ; Mn # TAI YO SIGN OM 1E8D0..1E8D6 ; Mn # [7] MENDE KIKAKUI COMBINING NUMBER TEENS..MENDE KIKAKUI COMBINING NUMBER MILLIONS 1E944..1E94A ; Mn # [7] ADLAM ALIF 
LENGTHENER..ADLAM NUKTA E0100..E01EF ; Mn # [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256 # Total code points: 2059 # ================================================ # General_Category=Enclosing_Mark 0488..0489 ; Me # [2] COMBINING CYRILLIC HUNDRED THOUSANDS SIGN..COMBINING CYRILLIC MILLIONS SIGN 1ABE ; Me # COMBINING PARENTHESES OVERLAY 20DD..20E0 ; Me # [4] COMBINING ENCLOSING CIRCLE..COMBINING ENCLOSING CIRCLE BACKSLASH 20E2..20E4 ; Me # [3] COMBINING ENCLOSING SCREEN..COMBINING ENCLOSING UPWARD POINTING TRIANGLE A670..A672 ; Me # [3] COMBINING CYRILLIC TEN MILLIONS SIGN..COMBINING CYRILLIC THOUSAND MILLIONS SIGN # Total code points: 13 # ================================================ # General_Category=Spacing_Mark 0903 ; Mc # DEVANAGARI SIGN VISARGA 093B ; Mc # DEVANAGARI VOWEL SIGN OOE 093E..0940 ; Mc # [3] DEVANAGARI VOWEL SIGN AA..DEVANAGARI VOWEL SIGN II 0949..094C ; Mc # [4] DEVANAGARI VOWEL SIGN CANDRA O..DEVANAGARI VOWEL SIGN AU 094E..094F ; Mc # [2] DEVANAGARI VOWEL SIGN PRISHTHAMATRA E..DEVANAGARI VOWEL SIGN AW 0982..0983 ; Mc # [2] BENGALI SIGN ANUSVARA..BENGALI SIGN VISARGA 09BE..09C0 ; Mc # [3] BENGALI VOWEL SIGN AA..BENGALI VOWEL SIGN II 09C7..09C8 ; Mc # [2] BENGALI VOWEL SIGN E..BENGALI VOWEL SIGN AI 09CB..09CC ; Mc # [2] BENGALI VOWEL SIGN O..BENGALI VOWEL SIGN AU 09D7 ; Mc # BENGALI AU LENGTH MARK 0A03 ; Mc # GURMUKHI SIGN VISARGA 0A3E..0A40 ; Mc # [3] GURMUKHI VOWEL SIGN AA..GURMUKHI VOWEL SIGN II 0A83 ; Mc # GUJARATI SIGN VISARGA 0ABE..0AC0 ; Mc # [3] GUJARATI VOWEL SIGN AA..GUJARATI VOWEL SIGN II 0AC9 ; Mc # GUJARATI VOWEL SIGN CANDRA O 0ACB..0ACC ; Mc # [2] GUJARATI VOWEL SIGN O..GUJARATI VOWEL SIGN AU 0B02..0B03 ; Mc # [2] ORIYA SIGN ANUSVARA..ORIYA SIGN VISARGA 0B3E ; Mc # ORIYA VOWEL SIGN AA 0B40 ; Mc # ORIYA VOWEL SIGN II 0B47..0B48 ; Mc # [2] ORIYA VOWEL SIGN E..ORIYA VOWEL SIGN AI 0B4B..0B4C ; Mc # [2] ORIYA VOWEL SIGN O..ORIYA VOWEL SIGN AU 0B57 ; Mc # ORIYA AU LENGTH MARK 0BBE..0BBF ; Mc # [2] TAMIL VOWEL SIGN AA..TAMIL 
VOWEL SIGN I 0BC1..0BC2 ; Mc # [2] TAMIL VOWEL SIGN U..TAMIL VOWEL SIGN UU 0BC6..0BC8 ; Mc # [3] TAMIL VOWEL SIGN E..TAMIL VOWEL SIGN AI 0BCA..0BCC ; Mc # [3] TAMIL VOWEL SIGN O..TAMIL VOWEL SIGN AU 0BD7 ; Mc # TAMIL AU LENGTH MARK 0C01..0C03 ; Mc # [3] TELUGU SIGN CANDRABINDU..TELUGU SIGN VISARGA 0C41..0C44 ; Mc # [4] TELUGU VOWEL SIGN U..TELUGU VOWEL SIGN VOCALIC RR 0C82..0C83 ; Mc # [2] KANNADA SIGN ANUSVARA..KANNADA SIGN VISARGA 0CBE ; Mc # KANNADA VOWEL SIGN AA 0CC0..0CC4 ; Mc # [5] KANNADA VOWEL SIGN II..KANNADA VOWEL SIGN VOCALIC RR 0CC7..0CC8 ; Mc # [2] KANNADA VOWEL SIGN EE..KANNADA VOWEL SIGN AI 0CCA..0CCB ; Mc # [2] KANNADA VOWEL SIGN O..KANNADA VOWEL SIGN OO 0CD5..0CD6 ; Mc # [2] KANNADA LENGTH MARK..KANNADA AI LENGTH MARK 0CF3 ; Mc # KANNADA SIGN COMBINING ANUSVARA ABOVE RIGHT 0D02..0D03 ; Mc # [2] MALAYALAM SIGN ANUSVARA..MALAYALAM SIGN VISARGA 0D3E..0D40 ; Mc # [3] MALAYALAM VOWEL SIGN AA..MALAYALAM VOWEL SIGN II 0D46..0D48 ; Mc # [3] MALAYALAM VOWEL SIGN E..MALAYALAM VOWEL SIGN AI 0D4A..0D4C ; Mc # [3] MALAYALAM VOWEL SIGN O..MALAYALAM VOWEL SIGN AU 0D57 ; Mc # MALAYALAM AU LENGTH MARK 0D82..0D83 ; Mc # [2] SINHALA SIGN ANUSVARAYA..SINHALA SIGN VISARGAYA 0DCF..0DD1 ; Mc # [3] SINHALA VOWEL SIGN AELA-PILLA..SINHALA VOWEL SIGN DIGA AEDA-PILLA 0DD8..0DDF ; Mc # [8] SINHALA VOWEL SIGN GAETTA-PILLA..SINHALA VOWEL SIGN GAYANUKITTA 0DF2..0DF3 ; Mc # [2] SINHALA VOWEL SIGN DIGA GAETTA-PILLA..SINHALA VOWEL SIGN DIGA GAYANUKITTA 0F3E..0F3F ; Mc # [2] TIBETAN SIGN YAR TSHES..TIBETAN SIGN MAR TSHES 0F7F ; Mc # TIBETAN SIGN RNAM BCAD 102B..102C ; Mc # [2] MYANMAR VOWEL SIGN TALL AA..MYANMAR VOWEL SIGN AA 1031 ; Mc # MYANMAR VOWEL SIGN E 1038 ; Mc # MYANMAR SIGN VISARGA 103B..103C ; Mc # [2] MYANMAR CONSONANT SIGN MEDIAL YA..MYANMAR CONSONANT SIGN MEDIAL RA 1056..1057 ; Mc # [2] MYANMAR VOWEL SIGN VOCALIC R..MYANMAR VOWEL SIGN VOCALIC RR 1062..1064 ; Mc # [3] MYANMAR VOWEL SIGN SGAW KAREN EU..MYANMAR TONE MARK SGAW KAREN KE PHO 1067..106D ; Mc # [7] MYANMAR VOWEL 
SIGN WESTERN PWO KAREN EU..MYANMAR SIGN WESTERN PWO KAREN TONE-5 1083..1084 ; Mc # [2] MYANMAR VOWEL SIGN SHAN AA..MYANMAR VOWEL SIGN SHAN E 1087..108C ; Mc # [6] MYANMAR SIGN SHAN TONE-2..MYANMAR SIGN SHAN COUNCIL TONE-3 108F ; Mc # MYANMAR SIGN RUMAI PALAUNG TONE-5 109A..109C ; Mc # [3] MYANMAR SIGN KHAMTI TONE-1..MYANMAR VOWEL SIGN AITON A 1715 ; Mc # TAGALOG SIGN PAMUDPOD 1734 ; Mc # HANUNOO SIGN PAMUDPOD 17B6 ; Mc # KHMER VOWEL SIGN AA 17BE..17C5 ; Mc # [8] KHMER VOWEL SIGN OE..KHMER VOWEL SIGN AU 17C7..17C8 ; Mc # [2] KHMER SIGN REAHMUK..KHMER SIGN YUUKALEAPINTU 1923..1926 ; Mc # [4] LIMBU VOWEL SIGN EE..LIMBU VOWEL SIGN AU 1929..192B ; Mc # [3] LIMBU SUBJOINED LETTER YA..LIMBU SUBJOINED LETTER WA 1930..1931 ; Mc # [2] LIMBU SMALL LETTER KA..LIMBU SMALL LETTER NGA 1933..1938 ; Mc # [6] LIMBU SMALL LETTER TA..LIMBU SMALL LETTER LA 1A19..1A1A ; Mc # [2] BUGINESE VOWEL SIGN E..BUGINESE VOWEL SIGN O 1A55 ; Mc # TAI THAM CONSONANT SIGN MEDIAL RA 1A57 ; Mc # TAI THAM CONSONANT SIGN LA TANG LAI 1A61 ; Mc # TAI THAM VOWEL SIGN A 1A63..1A64 ; Mc # [2] TAI THAM VOWEL SIGN AA..TAI THAM VOWEL SIGN TALL AA 1A6D..1A72 ; Mc # [6] TAI THAM VOWEL SIGN OY..TAI THAM VOWEL SIGN THAM AI 1B04 ; Mc # BALINESE SIGN BISAH 1B35 ; Mc # BALINESE VOWEL SIGN TEDUNG 1B3B ; Mc # BALINESE VOWEL SIGN RA REPA TEDUNG 1B3D..1B41 ; Mc # [5] BALINESE VOWEL SIGN LA LENGA TEDUNG..BALINESE VOWEL SIGN TALING REPA TEDUNG 1B43..1B44 ; Mc # [2] BALINESE VOWEL SIGN PEPET TEDUNG..BALINESE ADEG ADEG 1B82 ; Mc # SUNDANESE SIGN PANGWISAD 1BA1 ; Mc # SUNDANESE CONSONANT SIGN PAMINGKAL 1BA6..1BA7 ; Mc # [2] SUNDANESE VOWEL SIGN PANAELAENG..SUNDANESE VOWEL SIGN PANOLONG 1BAA ; Mc # SUNDANESE SIGN PAMAAEH 1BE7 ; Mc # BATAK VOWEL SIGN E 1BEA..1BEC ; Mc # [3] BATAK VOWEL SIGN I..BATAK VOWEL SIGN O 1BEE ; Mc # BATAK VOWEL SIGN U 1BF2..1BF3 ; Mc # [2] BATAK PANGOLAT..BATAK PANONGONAN 1C24..1C2B ; Mc # [8] LEPCHA SUBJOINED LETTER YA..LEPCHA VOWEL SIGN UU 1C34..1C35 ; Mc # [2] LEPCHA CONSONANT SIGN NYIN-DO..LEPCHA 
CONSONANT SIGN KANG 1CE1 ; Mc # VEDIC TONE ATHARVAVEDIC INDEPENDENT SVARITA 1CF7 ; Mc # VEDIC SIGN ATIKRAMA 302E..302F ; Mc # [2] HANGUL SINGLE DOT TONE MARK..HANGUL DOUBLE DOT TONE MARK A823..A824 ; Mc # [2] SYLOTI NAGRI VOWEL SIGN A..SYLOTI NAGRI VOWEL SIGN I A827 ; Mc # SYLOTI NAGRI VOWEL SIGN OO A880..A881 ; Mc # [2] SAURASHTRA SIGN ANUSVARA..SAURASHTRA SIGN VISARGA A8B4..A8C3 ; Mc # [16] SAURASHTRA CONSONANT SIGN HAARU..SAURASHTRA VOWEL SIGN AU A952..A953 ; Mc # [2] REJANG CONSONANT SIGN H..REJANG VIRAMA A983 ; Mc # JAVANESE SIGN WIGNYAN A9B4..A9B5 ; Mc # [2] JAVANESE VOWEL SIGN TARUNG..JAVANESE VOWEL SIGN TOLONG A9BA..A9BB ; Mc # [2] JAVANESE VOWEL SIGN TALING..JAVANESE VOWEL SIGN DIRGA MURE A9BE..A9C0 ; Mc # [3] JAVANESE CONSONANT SIGN PENGKAL..JAVANESE PANGKON AA2F..AA30 ; Mc # [2] CHAM VOWEL SIGN O..CHAM VOWEL SIGN AI AA33..AA34 ; Mc # [2] CHAM CONSONANT SIGN YA..CHAM CONSONANT SIGN RA AA4D ; Mc # CHAM CONSONANT SIGN FINAL H AA7B ; Mc # MYANMAR SIGN PAO KAREN TONE AA7D ; Mc # MYANMAR SIGN TAI LAING TONE-5 AAEB ; Mc # MEETEI MAYEK VOWEL SIGN II AAEE..AAEF ; Mc # [2] MEETEI MAYEK VOWEL SIGN AU..MEETEI MAYEK VOWEL SIGN AAU AAF5 ; Mc # MEETEI MAYEK VOWEL SIGN VISARGA ABE3..ABE4 ; Mc # [2] MEETEI MAYEK VOWEL SIGN ONAP..MEETEI MAYEK VOWEL SIGN INAP ABE6..ABE7 ; Mc # [2] MEETEI MAYEK VOWEL SIGN YENAP..MEETEI MAYEK VOWEL SIGN SOUNAP ABE9..ABEA ; Mc # [2] MEETEI MAYEK VOWEL SIGN CHEINAP..MEETEI MAYEK VOWEL SIGN NUNG ABEC ; Mc # MEETEI MAYEK LUM IYEK 11000 ; Mc # BRAHMI SIGN CANDRABINDU 11002 ; Mc # BRAHMI SIGN VISARGA 11082 ; Mc # KAITHI SIGN VISARGA 110B0..110B2 ; Mc # [3] KAITHI VOWEL SIGN AA..KAITHI VOWEL SIGN II 110B7..110B8 ; Mc # [2] KAITHI VOWEL SIGN O..KAITHI VOWEL SIGN AU 1112C ; Mc # CHAKMA VOWEL SIGN E 11145..11146 ; Mc # [2] CHAKMA VOWEL SIGN AA..CHAKMA VOWEL SIGN EI 11182 ; Mc # SHARADA SIGN VISARGA 111B3..111B5 ; Mc # [3] SHARADA VOWEL SIGN AA..SHARADA VOWEL SIGN II 111BF..111C0 ; Mc # [2] SHARADA VOWEL SIGN AU..SHARADA SIGN VIRAMA 111CE ; Mc # 
SHARADA VOWEL SIGN PRISHTHAMATRA E 1122C..1122E ; Mc # [3] KHOJKI VOWEL SIGN AA..KHOJKI VOWEL SIGN II 11232..11233 ; Mc # [2] KHOJKI VOWEL SIGN O..KHOJKI VOWEL SIGN AU 11235 ; Mc # KHOJKI SIGN VIRAMA 112E0..112E2 ; Mc # [3] KHUDAWADI VOWEL SIGN AA..KHUDAWADI VOWEL SIGN II 11302..11303 ; Mc # [2] GRANTHA SIGN ANUSVARA..GRANTHA SIGN VISARGA 1133E..1133F ; Mc # [2] GRANTHA VOWEL SIGN AA..GRANTHA VOWEL SIGN I 11341..11344 ; Mc # [4] GRANTHA VOWEL SIGN U..GRANTHA VOWEL SIGN VOCALIC RR 11347..11348 ; Mc # [2] GRANTHA VOWEL SIGN EE..GRANTHA VOWEL SIGN AI 1134B..1134D ; Mc # [3] GRANTHA VOWEL SIGN OO..GRANTHA SIGN VIRAMA 11357 ; Mc # GRANTHA AU LENGTH MARK 11362..11363 ; Mc # [2] GRANTHA VOWEL SIGN VOCALIC L..GRANTHA VOWEL SIGN VOCALIC LL 113B8..113BA ; Mc # [3] TULU-TIGALARI VOWEL SIGN AA..TULU-TIGALARI VOWEL SIGN II 113C2 ; Mc # TULU-TIGALARI VOWEL SIGN EE 113C5 ; Mc # TULU-TIGALARI VOWEL SIGN AI 113C7..113CA ; Mc # [4] TULU-TIGALARI VOWEL SIGN OO..TULU-TIGALARI SIGN CANDRA ANUNASIKA 113CC..113CD ; Mc # [2] TULU-TIGALARI SIGN ANUSVARA..TULU-TIGALARI SIGN VISARGA 113CF ; Mc # TULU-TIGALARI SIGN LOOPED VIRAMA 11435..11437 ; Mc # [3] NEWA VOWEL SIGN AA..NEWA VOWEL SIGN II 11440..11441 ; Mc # [2] NEWA VOWEL SIGN O..NEWA VOWEL SIGN AU 11445 ; Mc # NEWA SIGN VISARGA 114B0..114B2 ; Mc # [3] TIRHUTA VOWEL SIGN AA..TIRHUTA VOWEL SIGN II 114B9 ; Mc # TIRHUTA VOWEL SIGN E 114BB..114BE ; Mc # [4] TIRHUTA VOWEL SIGN AI..TIRHUTA VOWEL SIGN AU 114C1 ; Mc # TIRHUTA SIGN VISARGA 115AF..115B1 ; Mc # [3] SIDDHAM VOWEL SIGN AA..SIDDHAM VOWEL SIGN II 115B8..115BB ; Mc # [4] SIDDHAM VOWEL SIGN E..SIDDHAM VOWEL SIGN AU 115BE ; Mc # SIDDHAM SIGN VISARGA 11630..11632 ; Mc # [3] MODI VOWEL SIGN AA..MODI VOWEL SIGN II 1163B..1163C ; Mc # [2] MODI VOWEL SIGN O..MODI VOWEL SIGN AU 1163E ; Mc # MODI SIGN VISARGA 116AC ; Mc # TAKRI SIGN VISARGA 116AE..116AF ; Mc # [2] TAKRI VOWEL SIGN I..TAKRI VOWEL SIGN II 116B6 ; Mc # TAKRI SIGN VIRAMA 1171E ; Mc # AHOM CONSONANT SIGN MEDIAL RA 11720..11721 ; Mc # 
[2] AHOM VOWEL SIGN A..AHOM VOWEL SIGN AA 11726 ; Mc # AHOM VOWEL SIGN E 1182C..1182E ; Mc # [3] DOGRA VOWEL SIGN AA..DOGRA VOWEL SIGN II 11838 ; Mc # DOGRA SIGN VISARGA 11930..11935 ; Mc # [6] DIVES AKURU VOWEL SIGN AA..DIVES AKURU VOWEL SIGN E 11937..11938 ; Mc # [2] DIVES AKURU VOWEL SIGN AI..DIVES AKURU VOWEL SIGN O 1193D ; Mc # DIVES AKURU SIGN HALANTA 11940 ; Mc # DIVES AKURU MEDIAL YA 11942 ; Mc # DIVES AKURU MEDIAL RA 119D1..119D3 ; Mc # [3] NANDINAGARI VOWEL SIGN AA..NANDINAGARI VOWEL SIGN II 119DC..119DF ; Mc # [4] NANDINAGARI VOWEL SIGN O..NANDINAGARI SIGN VISARGA 119E4 ; Mc # NANDINAGARI VOWEL SIGN PRISHTHAMATRA E 11A39 ; Mc # ZANABAZAR SQUARE SIGN VISARGA 11A57..11A58 ; Mc # [2] SOYOMBO VOWEL SIGN AI..SOYOMBO VOWEL SIGN AU 11A97 ; Mc # SOYOMBO SIGN VISARGA 11B61 ; Mc # SHARADA VOWEL SIGN OOE 11B65 ; Mc # SHARADA VOWEL SIGN SHORT O 11B67 ; Mc # SHARADA VOWEL SIGN CANDRA O 11C2F ; Mc # BHAIKSUKI VOWEL SIGN AA 11C3E ; Mc # BHAIKSUKI SIGN VISARGA 11CA9 ; Mc # MARCHEN SUBJOINED LETTER YA 11CB1 ; Mc # MARCHEN VOWEL SIGN I 11CB4 ; Mc # MARCHEN VOWEL SIGN O 11D8A..11D8E ; Mc # [5] GUNJALA GONDI VOWEL SIGN AA..GUNJALA GONDI VOWEL SIGN UU 11D93..11D94 ; Mc # [2] GUNJALA GONDI VOWEL SIGN OO..GUNJALA GONDI VOWEL SIGN AU 11D96 ; Mc # GUNJALA GONDI SIGN VISARGA 11EF5..11EF6 ; Mc # [2] MAKASAR VOWEL SIGN E..MAKASAR VOWEL SIGN O 11F03 ; Mc # KAWI SIGN VISARGA 11F34..11F35 ; Mc # [2] KAWI VOWEL SIGN AA..KAWI VOWEL SIGN ALTERNATE AA 11F3E..11F3F ; Mc # [2] KAWI VOWEL SIGN E..KAWI VOWEL SIGN AI 11F41 ; Mc # KAWI SIGN KILLER 1612A..1612C ; Mc # [3] GURUNG KHEMA CONSONANT SIGN MEDIAL YA..GURUNG KHEMA CONSONANT SIGN MEDIAL HA 16F51..16F87 ; Mc # [55] MIAO SIGN ASPIRATION..MIAO VOWEL SIGN UI 16FF0..16FF1 ; Mc # [2] VIETNAMESE ALTERNATE READING MARK CA..VIETNAMESE ALTERNATE READING MARK NHAY 1D165..1D166 ; Mc # [2] MUSICAL SYMBOL COMBINING STEM..MUSICAL SYMBOL COMBINING SPRECHGESANG STEM 1D16D..1D172 ; Mc # [6] MUSICAL SYMBOL COMBINING AUGMENTATION DOT..MUSICAL SYMBOL 
COMBINING FLAG-5 # Total code points: 471 # ================================================ # General_Category=Decimal_Number 0030..0039 ; Nd # [10] DIGIT ZERO..DIGIT NINE 0660..0669 ; Nd # [10] ARABIC-INDIC DIGIT ZERO..ARABIC-INDIC DIGIT NINE 06F0..06F9 ; Nd # [10] EXTENDED ARABIC-INDIC DIGIT ZERO..EXTENDED ARABIC-INDIC DIGIT NINE 07C0..07C9 ; Nd # [10] NKO DIGIT ZERO..NKO DIGIT NINE 0966..096F ; Nd # [10] DEVANAGARI DIGIT ZERO..DEVANAGARI DIGIT NINE 09E6..09EF ; Nd # [10] BENGALI DIGIT ZERO..BENGALI DIGIT NINE 0A66..0A6F ; Nd # [10] GURMUKHI DIGIT ZERO..GURMUKHI DIGIT NINE 0AE6..0AEF ; Nd # [10] GUJARATI DIGIT ZERO..GUJARATI DIGIT NINE 0B66..0B6F ; Nd # [10] ORIYA DIGIT ZERO..ORIYA DIGIT NINE 0BE6..0BEF ; Nd # [10] TAMIL DIGIT ZERO..TAMIL DIGIT NINE 0C66..0C6F ; Nd # [10] TELUGU DIGIT ZERO..TELUGU DIGIT NINE 0CE6..0CEF ; Nd # [10] KANNADA DIGIT ZERO..KANNADA DIGIT NINE 0D66..0D6F ; Nd # [10] MALAYALAM DIGIT ZERO..MALAYALAM DIGIT NINE 0DE6..0DEF ; Nd # [10] SINHALA LITH DIGIT ZERO..SINHALA LITH DIGIT NINE 0E50..0E59 ; Nd # [10] THAI DIGIT ZERO..THAI DIGIT NINE 0ED0..0ED9 ; Nd # [10] LAO DIGIT ZERO..LAO DIGIT NINE 0F20..0F29 ; Nd # [10] TIBETAN DIGIT ZERO..TIBETAN DIGIT NINE 1040..1049 ; Nd # [10] MYANMAR DIGIT ZERO..MYANMAR DIGIT NINE 1090..1099 ; Nd # [10] MYANMAR SHAN DIGIT ZERO..MYANMAR SHAN DIGIT NINE 17E0..17E9 ; Nd # [10] KHMER DIGIT ZERO..KHMER DIGIT NINE 1810..1819 ; Nd # [10] MONGOLIAN DIGIT ZERO..MONGOLIAN DIGIT NINE 1946..194F ; Nd # [10] LIMBU DIGIT ZERO..LIMBU DIGIT NINE 19D0..19D9 ; Nd # [10] NEW TAI LUE DIGIT ZERO..NEW TAI LUE DIGIT NINE 1A80..1A89 ; Nd # [10] TAI THAM HORA DIGIT ZERO..TAI THAM HORA DIGIT NINE 1A90..1A99 ; Nd # [10] TAI THAM THAM DIGIT ZERO..TAI THAM THAM DIGIT NINE 1B50..1B59 ; Nd # [10] BALINESE DIGIT ZERO..BALINESE DIGIT NINE 1BB0..1BB9 ; Nd # [10] SUNDANESE DIGIT ZERO..SUNDANESE DIGIT NINE 1C40..1C49 ; Nd # [10] LEPCHA DIGIT ZERO..LEPCHA DIGIT NINE 1C50..1C59 ; Nd # [10] OL CHIKI DIGIT ZERO..OL CHIKI DIGIT NINE A620..A629 ; Nd 
# [10] VAI DIGIT ZERO..VAI DIGIT NINE A8D0..A8D9 ; Nd # [10] SAURASHTRA DIGIT ZERO..SAURASHTRA DIGIT NINE A900..A909 ; Nd # [10] KAYAH LI DIGIT ZERO..KAYAH LI DIGIT NINE A9D0..A9D9 ; Nd # [10] JAVANESE DIGIT ZERO..JAVANESE DIGIT NINE A9F0..A9F9 ; Nd # [10] MYANMAR TAI LAING DIGIT ZERO..MYANMAR TAI LAING DIGIT NINE AA50..AA59 ; Nd # [10] CHAM DIGIT ZERO..CHAM DIGIT NINE ABF0..ABF9 ; Nd # [10] MEETEI MAYEK DIGIT ZERO..MEETEI MAYEK DIGIT NINE FF10..FF19 ; Nd # [10] FULLWIDTH DIGIT ZERO..FULLWIDTH DIGIT NINE 104A0..104A9 ; Nd # [10] OSMANYA DIGIT ZERO..OSMANYA DIGIT NINE 10D30..10D39 ; Nd # [10] HANIFI ROHINGYA DIGIT ZERO..HANIFI ROHINGYA DIGIT NINE 10D40..10D49 ; Nd # [10] GARAY DIGIT ZERO..GARAY DIGIT NINE 11066..1106F ; Nd # [10] BRAHMI DIGIT ZERO..BRAHMI DIGIT NINE 110F0..110F9 ; Nd # [10] SORA SOMPENG DIGIT ZERO..SORA SOMPENG DIGIT NINE 11136..1113F ; Nd # [10] CHAKMA DIGIT ZERO..CHAKMA DIGIT NINE 111D0..111D9 ; Nd # [10] SHARADA DIGIT ZERO..SHARADA DIGIT NINE 112F0..112F9 ; Nd # [10] KHUDAWADI DIGIT ZERO..KHUDAWADI DIGIT NINE 11450..11459 ; Nd # [10] NEWA DIGIT ZERO..NEWA DIGIT NINE 114D0..114D9 ; Nd # [10] TIRHUTA DIGIT ZERO..TIRHUTA DIGIT NINE 11650..11659 ; Nd # [10] MODI DIGIT ZERO..MODI DIGIT NINE 116C0..116C9 ; Nd # [10] TAKRI DIGIT ZERO..TAKRI DIGIT NINE 116D0..116E3 ; Nd # [20] MYANMAR PAO DIGIT ZERO..MYANMAR EASTERN PWO KAREN DIGIT NINE 11730..11739 ; Nd # [10] AHOM DIGIT ZERO..AHOM DIGIT NINE 118E0..118E9 ; Nd # [10] WARANG CITI DIGIT ZERO..WARANG CITI DIGIT NINE 11950..11959 ; Nd # [10] DIVES AKURU DIGIT ZERO..DIVES AKURU DIGIT NINE 11BF0..11BF9 ; Nd # [10] SUNUWAR DIGIT ZERO..SUNUWAR DIGIT NINE 11C50..11C59 ; Nd # [10] BHAIKSUKI DIGIT ZERO..BHAIKSUKI DIGIT NINE 11D50..11D59 ; Nd # [10] MASARAM GONDI DIGIT ZERO..MASARAM GONDI DIGIT NINE 11DA0..11DA9 ; Nd # [10] GUNJALA GONDI DIGIT ZERO..GUNJALA GONDI DIGIT NINE 11DE0..11DE9 ; Nd # [10] TOLONG SIKI DIGIT ZERO..TOLONG SIKI DIGIT NINE 11F50..11F59 ; Nd # [10] KAWI DIGIT ZERO..KAWI DIGIT NINE 16130..16139 
; Nd # [10] GURUNG KHEMA DIGIT ZERO..GURUNG KHEMA DIGIT NINE 16A60..16A69 ; Nd # [10] MRO DIGIT ZERO..MRO DIGIT NINE 16AC0..16AC9 ; Nd # [10] TANGSA DIGIT ZERO..TANGSA DIGIT NINE 16B50..16B59 ; Nd # [10] PAHAWH HMONG DIGIT ZERO..PAHAWH HMONG DIGIT NINE 16D70..16D79 ; Nd # [10] KIRAT RAI DIGIT ZERO..KIRAT RAI DIGIT NINE 1CCF0..1CCF9 ; Nd # [10] OUTLINED DIGIT ZERO..OUTLINED DIGIT NINE 1D7CE..1D7FF ; Nd # [50] MATHEMATICAL BOLD DIGIT ZERO..MATHEMATICAL MONOSPACE DIGIT NINE 1E140..1E149 ; Nd # [10] NYIAKENG PUACHUE HMONG DIGIT ZERO..NYIAKENG PUACHUE HMONG DIGIT NINE 1E2F0..1E2F9 ; Nd # [10] WANCHO DIGIT ZERO..WANCHO DIGIT NINE 1E4F0..1E4F9 ; Nd # [10] NAG MUNDARI DIGIT ZERO..NAG MUNDARI DIGIT NINE 1E5F1..1E5FA ; Nd # [10] OL ONAL DIGIT ZERO..OL ONAL DIGIT NINE 1E950..1E959 ; Nd # [10] ADLAM DIGIT ZERO..ADLAM DIGIT NINE 1FBF0..1FBF9 ; Nd # [10] SEGMENTED DIGIT ZERO..SEGMENTED DIGIT NINE # Total code points: 770 # ================================================ # General_Category=Letter_Number 16EE..16F0 ; Nl # [3] RUNIC ARLAUG SYMBOL..RUNIC BELGTHOR SYMBOL 2160..2182 ; Nl # [35] ROMAN NUMERAL ONE..ROMAN NUMERAL TEN THOUSAND 2185..2188 ; Nl # [4] ROMAN NUMERAL SIX LATE FORM..ROMAN NUMERAL ONE HUNDRED THOUSAND 3007 ; Nl # IDEOGRAPHIC NUMBER ZERO 3021..3029 ; Nl # [9] HANGZHOU NUMERAL ONE..HANGZHOU NUMERAL NINE 3038..303A ; Nl # [3] HANGZHOU NUMERAL TEN..HANGZHOU NUMERAL THIRTY A6E6..A6EF ; Nl # [10] BAMUM LETTER MO..BAMUM LETTER KOGHOM 10140..10174 ; Nl # [53] GREEK ACROPHONIC ATTIC ONE QUARTER..GREEK ACROPHONIC STRATIAN FIFTY MNAS 10341 ; Nl # GOTHIC LETTER NINETY 1034A ; Nl # GOTHIC LETTER NINE HUNDRED 103D1..103D5 ; Nl # [5] OLD PERSIAN NUMBER ONE..OLD PERSIAN NUMBER HUNDRED 12400..1246E ; Nl # [111] CUNEIFORM NUMERIC SIGN TWO ASH..CUNEIFORM NUMERIC SIGN NINE U VARIANT FORM 16FF4..16FF6 ; Nl # [3] YANGQIN SIGN SLOW ONE BEAT..YANGQIN SIGN SLOW TWO BEATS # Total code points: 239 # ================================================ # General_Category=Other_Number 
00B2..00B3 ; No # [2] SUPERSCRIPT TWO..SUPERSCRIPT THREE 00B9 ; No # SUPERSCRIPT ONE 00BC..00BE ; No # [3] VULGAR FRACTION ONE QUARTER..VULGAR FRACTION THREE QUARTERS 09F4..09F9 ; No # [6] BENGALI CURRENCY NUMERATOR ONE..BENGALI CURRENCY DENOMINATOR SIXTEEN 0B72..0B77 ; No # [6] ORIYA FRACTION ONE QUARTER..ORIYA FRACTION THREE SIXTEENTHS 0BF0..0BF2 ; No # [3] TAMIL NUMBER TEN..TAMIL NUMBER ONE THOUSAND 0C78..0C7E ; No # [7] TELUGU FRACTION DIGIT ZERO FOR ODD POWERS OF FOUR..TELUGU FRACTION DIGIT THREE FOR EVEN POWERS OF FOUR 0D58..0D5E ; No # [7] MALAYALAM FRACTION ONE ONE-HUNDRED-AND-SIXTIETH..MALAYALAM FRACTION ONE FIFTH 0D70..0D78 ; No # [9] MALAYALAM NUMBER TEN..MALAYALAM FRACTION THREE SIXTEENTHS 0F2A..0F33 ; No # [10] TIBETAN DIGIT HALF ONE..TIBETAN DIGIT HALF ZERO 1369..137C ; No # [20] ETHIOPIC DIGIT ONE..ETHIOPIC NUMBER TEN THOUSAND 17F0..17F9 ; No # [10] KHMER SYMBOL LEK ATTAK SON..KHMER SYMBOL LEK ATTAK PRAM-BUON 19DA ; No # NEW TAI LUE THAM DIGIT ONE 2070 ; No # SUPERSCRIPT ZERO 2074..2079 ; No # [6] SUPERSCRIPT FOUR..SUPERSCRIPT NINE 2080..2089 ; No # [10] SUBSCRIPT ZERO..SUBSCRIPT NINE 2150..215F ; No # [16] VULGAR FRACTION ONE SEVENTH..FRACTION NUMERATOR ONE 2189 ; No # VULGAR FRACTION ZERO THIRDS 2460..249B ; No # [60] CIRCLED DIGIT ONE..NUMBER TWENTY FULL STOP 24EA..24FF ; No # [22] CIRCLED DIGIT ZERO..NEGATIVE CIRCLED DIGIT ZERO 2776..2793 ; No # [30] DINGBAT NEGATIVE CIRCLED DIGIT ONE..DINGBAT NEGATIVE CIRCLED SANS-SERIF NUMBER TEN 2CFD ; No # COPTIC FRACTION ONE HALF 3192..3195 ; No # [4] IDEOGRAPHIC ANNOTATION ONE MARK..IDEOGRAPHIC ANNOTATION FOUR MARK 3220..3229 ; No # [10] PARENTHESIZED IDEOGRAPH ONE..PARENTHESIZED IDEOGRAPH TEN 3248..324F ; No # [8] CIRCLED NUMBER TEN ON BLACK SQUARE..CIRCLED NUMBER EIGHTY ON BLACK SQUARE 3251..325F ; No # [15] CIRCLED NUMBER TWENTY ONE..CIRCLED NUMBER THIRTY FIVE 3280..3289 ; No # [10] CIRCLED IDEOGRAPH ONE..CIRCLED IDEOGRAPH TEN 32B1..32BF ; No # [15] CIRCLED NUMBER THIRTY SIX..CIRCLED NUMBER FIFTY 
A830..A835 ; No # [6] NORTH INDIC FRACTION ONE QUARTER..NORTH INDIC FRACTION THREE SIXTEENTHS 10107..10133 ; No # [45] AEGEAN NUMBER ONE..AEGEAN NUMBER NINETY THOUSAND 10175..10178 ; No # [4] GREEK ONE HALF SIGN..GREEK THREE QUARTERS SIGN 1018A..1018B ; No # [2] GREEK ZERO SIGN..GREEK ONE QUARTER SIGN 102E1..102FB ; No # [27] COPTIC EPACT DIGIT ONE..COPTIC EPACT NUMBER NINE HUNDRED 10320..10323 ; No # [4] OLD ITALIC NUMERAL ONE..OLD ITALIC NUMERAL FIFTY 10858..1085F ; No # [8] IMPERIAL ARAMAIC NUMBER ONE..IMPERIAL ARAMAIC NUMBER TEN THOUSAND 10879..1087F ; No # [7] PALMYRENE NUMBER ONE..PALMYRENE NUMBER TWENTY 108A7..108AF ; No # [9] NABATAEAN NUMBER ONE..NABATAEAN NUMBER ONE HUNDRED 108FB..108FF ; No # [5] HATRAN NUMBER ONE..HATRAN NUMBER ONE HUNDRED 10916..1091B ; No # [6] PHOENICIAN NUMBER ONE..PHOENICIAN NUMBER THREE 109BC..109BD ; No # [2] MEROITIC CURSIVE FRACTION ELEVEN TWELFTHS..MEROITIC CURSIVE FRACTION ONE HALF 109C0..109CF ; No # [16] MEROITIC CURSIVE NUMBER ONE..MEROITIC CURSIVE NUMBER SEVENTY 109D2..109FF ; No # [46] MEROITIC CURSIVE NUMBER ONE HUNDRED..MEROITIC CURSIVE FRACTION TEN TWELFTHS 10A40..10A48 ; No # [9] KHAROSHTHI DIGIT ONE..KHAROSHTHI FRACTION ONE HALF 10A7D..10A7E ; No # [2] OLD SOUTH ARABIAN NUMBER ONE..OLD SOUTH ARABIAN NUMBER FIFTY 10A9D..10A9F ; No # [3] OLD NORTH ARABIAN NUMBER ONE..OLD NORTH ARABIAN NUMBER TWENTY 10AEB..10AEF ; No # [5] MANICHAEAN NUMBER ONE..MANICHAEAN NUMBER ONE HUNDRED 10B58..10B5F ; No # [8] INSCRIPTIONAL PARTHIAN NUMBER ONE..INSCRIPTIONAL PARTHIAN NUMBER ONE THOUSAND 10B78..10B7F ; No # [8] INSCRIPTIONAL PAHLAVI NUMBER ONE..INSCRIPTIONAL PAHLAVI NUMBER ONE THOUSAND 10BA9..10BAF ; No # [7] PSALTER PAHLAVI NUMBER ONE..PSALTER PAHLAVI NUMBER ONE HUNDRED 10CFA..10CFF ; No # [6] OLD HUNGARIAN NUMBER ONE..OLD HUNGARIAN NUMBER ONE THOUSAND 10E60..10E7E ; No # [31] RUMI DIGIT ONE..RUMI FRACTION TWO THIRDS 10F1D..10F26 ; No # [10] OLD SOGDIAN NUMBER ONE..OLD SOGDIAN FRACTION ONE HALF 10F51..10F54 ; No # [4] SOGDIAN 
NUMBER ONE..SOGDIAN NUMBER ONE HUNDRED 10FC5..10FCB ; No # [7] CHORASMIAN NUMBER ONE..CHORASMIAN NUMBER ONE HUNDRED 11052..11065 ; No # [20] BRAHMI NUMBER ONE..BRAHMI NUMBER ONE THOUSAND 111E1..111F4 ; No # [20] SINHALA ARCHAIC DIGIT ONE..SINHALA ARCHAIC NUMBER ONE THOUSAND 1173A..1173B ; No # [2] AHOM NUMBER TEN..AHOM NUMBER TWENTY 118EA..118F2 ; No # [9] WARANG CITI NUMBER TEN..WARANG CITI NUMBER NINETY 11C5A..11C6C ; No # [19] BHAIKSUKI NUMBER ONE..BHAIKSUKI HUNDREDS UNIT MARK 11FC0..11FD4 ; No # [21] TAMIL FRACTION ONE THREE-HUNDRED-AND-TWENTIETH..TAMIL FRACTION DOWNSCALING FACTOR KIIZH 16B5B..16B61 ; No # [7] PAHAWH HMONG NUMBER TENS..PAHAWH HMONG NUMBER TRILLIONS 16E80..16E96 ; No # [23] MEDEFAIDRIN DIGIT ZERO..MEDEFAIDRIN DIGIT THREE ALTERNATE FORM 1D2C0..1D2D3 ; No # [20] KAKTOVIK NUMERAL ZERO..KAKTOVIK NUMERAL NINETEEN 1D2E0..1D2F3 ; No # [20] MAYAN NUMERAL ZERO..MAYAN NUMERAL NINETEEN 1D360..1D378 ; No # [25] COUNTING ROD UNIT DIGIT ONE..TALLY MARK FIVE 1E8C7..1E8CF ; No # [9] MENDE KIKAKUI DIGIT ONE..MENDE KIKAKUI DIGIT NINE 1EC71..1ECAB ; No # [59] INDIC SIYAQ NUMBER ONE..INDIC SIYAQ NUMBER PREFIXED NINE 1ECAD..1ECAF ; No # [3] INDIC SIYAQ FRACTION ONE QUARTER..INDIC SIYAQ FRACTION THREE QUARTERS 1ECB1..1ECB4 ; No # [4] INDIC SIYAQ NUMBER ALTERNATE ONE..INDIC SIYAQ ALTERNATE LAKH MARK 1ED01..1ED2D ; No # [45] OTTOMAN SIYAQ NUMBER ONE..OTTOMAN SIYAQ NUMBER NINETY THOUSAND 1ED2F..1ED3D ; No # [15] OTTOMAN SIYAQ ALTERNATE NUMBER TWO..OTTOMAN SIYAQ FRACTION ONE SIXTH 1F100..1F10C ; No # [13] DIGIT ZERO FULL STOP..DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT ZERO # Total code points: 915 # ================================================ # General_Category=Space_Separator 0020 ; Zs # SPACE 00A0 ; Zs # NO-BREAK SPACE 1680 ; Zs # OGHAM SPACE MARK 2000..200A ; Zs # [11] EN QUAD..HAIR SPACE 202F ; Zs # NARROW NO-BREAK SPACE 205F ; Zs # MEDIUM MATHEMATICAL SPACE 3000 ; Zs # IDEOGRAPHIC SPACE # Total code points: 17 # ================================================ 
# General_Category=Line_Separator 2028 ; Zl # LINE SEPARATOR # Total code points: 1 # ================================================ # General_Category=Paragraph_Separator 2029 ; Zp # PARAGRAPH SEPARATOR # Total code points: 1 # ================================================ # General_Category=Control 0000..001F ; Cc # [32] .. 007F..009F ; Cc # [33] .. # Total code points: 65 # ================================================ # General_Category=Format 00AD ; Cf # SOFT HYPHEN 0600..0605 ; Cf # [6] ARABIC NUMBER SIGN..ARABIC NUMBER MARK ABOVE 061C ; Cf # ARABIC LETTER MARK 06DD ; Cf # ARABIC END OF AYAH 070F ; Cf # SYRIAC ABBREVIATION MARK 0890..0891 ; Cf # [2] ARABIC POUND MARK ABOVE..ARABIC PIASTRE MARK ABOVE 08E2 ; Cf # ARABIC DISPUTED END OF AYAH 180E ; Cf # MONGOLIAN VOWEL SEPARATOR 200B..200F ; Cf # [5] ZERO WIDTH SPACE..RIGHT-TO-LEFT MARK 202A..202E ; Cf # [5] LEFT-TO-RIGHT EMBEDDING..RIGHT-TO-LEFT OVERRIDE 2060..2064 ; Cf # [5] WORD JOINER..INVISIBLE PLUS 2066..206F ; Cf # [10] LEFT-TO-RIGHT ISOLATE..NOMINAL DIGIT SHAPES FEFF ; Cf # ZERO WIDTH NO-BREAK SPACE FFF9..FFFB ; Cf # [3] INTERLINEAR ANNOTATION ANCHOR..INTERLINEAR ANNOTATION TERMINATOR 110BD ; Cf # KAITHI NUMBER SIGN 110CD ; Cf # KAITHI NUMBER SIGN ABOVE 13430..1343F ; Cf # [16] EGYPTIAN HIEROGLYPH VERTICAL JOINER..EGYPTIAN HIEROGLYPH END WALLED ENCLOSURE 1BCA0..1BCA3 ; Cf # [4] SHORTHAND FORMAT LETTER OVERLAP..SHORTHAND FORMAT UP STEP 1D173..1D17A ; Cf # [8] MUSICAL SYMBOL BEGIN BEAM..MUSICAL SYMBOL END PHRASE E0001 ; Cf # LANGUAGE TAG E0020..E007F ; Cf # [96] TAG SPACE..CANCEL TAG # Total code points: 170 # ================================================ # General_Category=Private_Use E000..F8FF ; Co # [6400] .. F0000..FFFFD ; Co # [65534] .. 100000..10FFFD; Co # [65534] .. # Total code points: 137468 # ================================================ # General_Category=Surrogate D800..DFFF ; Cs # [2048] .. 
# Total code points: 2048 # ================================================ # General_Category=Dash_Punctuation 002D ; Pd # HYPHEN-MINUS 058A ; Pd # ARMENIAN HYPHEN 05BE ; Pd # HEBREW PUNCTUATION MAQAF 1400 ; Pd # CANADIAN SYLLABICS HYPHEN 1806 ; Pd # MONGOLIAN TODO SOFT HYPHEN 2010..2015 ; Pd # [6] HYPHEN..HORIZONTAL BAR 2E17 ; Pd # DOUBLE OBLIQUE HYPHEN 2E1A ; Pd # HYPHEN WITH DIAERESIS 2E3A..2E3B ; Pd # [2] TWO-EM DASH..THREE-EM DASH 2E40 ; Pd # DOUBLE HYPHEN 2E5D ; Pd # OBLIQUE HYPHEN 301C ; Pd # WAVE DASH 3030 ; Pd # WAVY DASH 30A0 ; Pd # KATAKANA-HIRAGANA DOUBLE HYPHEN FE31..FE32 ; Pd # [2] PRESENTATION FORM FOR VERTICAL EM DASH..PRESENTATION FORM FOR VERTICAL EN DASH FE58 ; Pd # SMALL EM DASH FE63 ; Pd # SMALL HYPHEN-MINUS FF0D ; Pd # FULLWIDTH HYPHEN-MINUS 10D6E ; Pd # GARAY HYPHEN 10EAD ; Pd # YEZIDI HYPHENATION MARK # Total code points: 27 # ================================================ # General_Category=Open_Punctuation 0028 ; Ps # LEFT PARENTHESIS 005B ; Ps # LEFT SQUARE BRACKET 007B ; Ps # LEFT CURLY BRACKET 0F3A ; Ps # TIBETAN MARK GUG RTAGS GYON 0F3C ; Ps # TIBETAN MARK ANG KHANG GYON 169B ; Ps # OGHAM FEATHER MARK 201A ; Ps # SINGLE LOW-9 QUOTATION MARK 201E ; Ps # DOUBLE LOW-9 QUOTATION MARK 2045 ; Ps # LEFT SQUARE BRACKET WITH QUILL 207D ; Ps # SUPERSCRIPT LEFT PARENTHESIS 208D ; Ps # SUBSCRIPT LEFT PARENTHESIS 2308 ; Ps # LEFT CEILING 230A ; Ps # LEFT FLOOR 2329 ; Ps # LEFT-POINTING ANGLE BRACKET 2768 ; Ps # MEDIUM LEFT PARENTHESIS ORNAMENT 276A ; Ps # MEDIUM FLATTENED LEFT PARENTHESIS ORNAMENT 276C ; Ps # MEDIUM LEFT-POINTING ANGLE BRACKET ORNAMENT 276E ; Ps # HEAVY LEFT-POINTING ANGLE QUOTATION MARK ORNAMENT 2770 ; Ps # HEAVY LEFT-POINTING ANGLE BRACKET ORNAMENT 2772 ; Ps # LIGHT LEFT TORTOISE SHELL BRACKET ORNAMENT 2774 ; Ps # MEDIUM LEFT CURLY BRACKET ORNAMENT 27C5 ; Ps # LEFT S-SHAPED BAG DELIMITER 27E6 ; Ps # MATHEMATICAL LEFT WHITE SQUARE BRACKET 27E8 ; Ps # MATHEMATICAL LEFT ANGLE BRACKET 27EA ; Ps # MATHEMATICAL LEFT DOUBLE ANGLE 
BRACKET 27EC ; Ps # MATHEMATICAL LEFT WHITE TORTOISE SHELL BRACKET 27EE ; Ps # MATHEMATICAL LEFT FLATTENED PARENTHESIS 2983 ; Ps # LEFT WHITE CURLY BRACKET 2985 ; Ps # LEFT WHITE PARENTHESIS 2987 ; Ps # Z NOTATION LEFT IMAGE BRACKET 2989 ; Ps # Z NOTATION LEFT BINDING BRACKET 298B ; Ps # LEFT SQUARE BRACKET WITH UNDERBAR 298D ; Ps # LEFT SQUARE BRACKET WITH TICK IN TOP CORNER 298F ; Ps # LEFT SQUARE BRACKET WITH TICK IN BOTTOM CORNER 2991 ; Ps # LEFT ANGLE BRACKET WITH DOT 2993 ; Ps # LEFT ARC LESS-THAN BRACKET 2995 ; Ps # DOUBLE LEFT ARC GREATER-THAN BRACKET 2997 ; Ps # LEFT BLACK TORTOISE SHELL BRACKET 29D8 ; Ps # LEFT WIGGLY FENCE 29DA ; Ps # LEFT DOUBLE WIGGLY FENCE 29FC ; Ps # LEFT-POINTING CURVED ANGLE BRACKET 2E22 ; Ps # TOP LEFT HALF BRACKET 2E24 ; Ps # BOTTOM LEFT HALF BRACKET 2E26 ; Ps # LEFT SIDEWAYS U BRACKET 2E28 ; Ps # LEFT DOUBLE PARENTHESIS 2E42 ; Ps # DOUBLE LOW-REVERSED-9 QUOTATION MARK 2E55 ; Ps # LEFT SQUARE BRACKET WITH STROKE 2E57 ; Ps # LEFT SQUARE BRACKET WITH DOUBLE STROKE 2E59 ; Ps # TOP HALF LEFT PARENTHESIS 2E5B ; Ps # BOTTOM HALF LEFT PARENTHESIS 3008 ; Ps # LEFT ANGLE BRACKET 300A ; Ps # LEFT DOUBLE ANGLE BRACKET 300C ; Ps # LEFT CORNER BRACKET 300E ; Ps # LEFT WHITE CORNER BRACKET 3010 ; Ps # LEFT BLACK LENTICULAR BRACKET 3014 ; Ps # LEFT TORTOISE SHELL BRACKET 3016 ; Ps # LEFT WHITE LENTICULAR BRACKET 3018 ; Ps # LEFT WHITE TORTOISE SHELL BRACKET 301A ; Ps # LEFT WHITE SQUARE BRACKET 301D ; Ps # REVERSED DOUBLE PRIME QUOTATION MARK FD3F ; Ps # ORNATE RIGHT PARENTHESIS FE17 ; Ps # PRESENTATION FORM FOR VERTICAL LEFT WHITE LENTICULAR BRACKET FE35 ; Ps # PRESENTATION FORM FOR VERTICAL LEFT PARENTHESIS FE37 ; Ps # PRESENTATION FORM FOR VERTICAL LEFT CURLY BRACKET FE39 ; Ps # PRESENTATION FORM FOR VERTICAL LEFT TORTOISE SHELL BRACKET FE3B ; Ps # PRESENTATION FORM FOR VERTICAL LEFT BLACK LENTICULAR BRACKET FE3D ; Ps # PRESENTATION FORM FOR VERTICAL LEFT DOUBLE ANGLE BRACKET FE3F ; Ps # PRESENTATION FORM FOR VERTICAL LEFT ANGLE BRACKET FE41 
; Ps # PRESENTATION FORM FOR VERTICAL LEFT CORNER BRACKET FE43 ; Ps # PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET FE47 ; Ps # PRESENTATION FORM FOR VERTICAL LEFT SQUARE BRACKET FE59 ; Ps # SMALL LEFT PARENTHESIS FE5B ; Ps # SMALL LEFT CURLY BRACKET FE5D ; Ps # SMALL LEFT TORTOISE SHELL BRACKET FF08 ; Ps # FULLWIDTH LEFT PARENTHESIS FF3B ; Ps # FULLWIDTH LEFT SQUARE BRACKET FF5B ; Ps # FULLWIDTH LEFT CURLY BRACKET FF5F ; Ps # FULLWIDTH LEFT WHITE PARENTHESIS FF62 ; Ps # HALFWIDTH LEFT CORNER BRACKET # Total code points: 79 # ================================================ # General_Category=Close_Punctuation 0029 ; Pe # RIGHT PARENTHESIS 005D ; Pe # RIGHT SQUARE BRACKET 007D ; Pe # RIGHT CURLY BRACKET 0F3B ; Pe # TIBETAN MARK GUG RTAGS GYAS 0F3D ; Pe # TIBETAN MARK ANG KHANG GYAS 169C ; Pe # OGHAM REVERSED FEATHER MARK 2046 ; Pe # RIGHT SQUARE BRACKET WITH QUILL 207E ; Pe # SUPERSCRIPT RIGHT PARENTHESIS 208E ; Pe # SUBSCRIPT RIGHT PARENTHESIS 2309 ; Pe # RIGHT CEILING 230B ; Pe # RIGHT FLOOR 232A ; Pe # RIGHT-POINTING ANGLE BRACKET 2769 ; Pe # MEDIUM RIGHT PARENTHESIS ORNAMENT 276B ; Pe # MEDIUM FLATTENED RIGHT PARENTHESIS ORNAMENT 276D ; Pe # MEDIUM RIGHT-POINTING ANGLE BRACKET ORNAMENT 276F ; Pe # HEAVY RIGHT-POINTING ANGLE QUOTATION MARK ORNAMENT 2771 ; Pe # HEAVY RIGHT-POINTING ANGLE BRACKET ORNAMENT 2773 ; Pe # LIGHT RIGHT TORTOISE SHELL BRACKET ORNAMENT 2775 ; Pe # MEDIUM RIGHT CURLY BRACKET ORNAMENT 27C6 ; Pe # RIGHT S-SHAPED BAG DELIMITER 27E7 ; Pe # MATHEMATICAL RIGHT WHITE SQUARE BRACKET 27E9 ; Pe # MATHEMATICAL RIGHT ANGLE BRACKET 27EB ; Pe # MATHEMATICAL RIGHT DOUBLE ANGLE BRACKET 27ED ; Pe # MATHEMATICAL RIGHT WHITE TORTOISE SHELL BRACKET 27EF ; Pe # MATHEMATICAL RIGHT FLATTENED PARENTHESIS 2984 ; Pe # RIGHT WHITE CURLY BRACKET 2986 ; Pe # RIGHT WHITE PARENTHESIS 2988 ; Pe # Z NOTATION RIGHT IMAGE BRACKET 298A ; Pe # Z NOTATION RIGHT BINDING BRACKET 298C ; Pe # RIGHT SQUARE BRACKET WITH UNDERBAR 298E ; Pe # RIGHT SQUARE BRACKET WITH TICK 
IN BOTTOM CORNER 2990 ; Pe # RIGHT SQUARE BRACKET WITH TICK IN TOP CORNER 2992 ; Pe # RIGHT ANGLE BRACKET WITH DOT 2994 ; Pe # RIGHT ARC GREATER-THAN BRACKET 2996 ; Pe # DOUBLE RIGHT ARC LESS-THAN BRACKET 2998 ; Pe # RIGHT BLACK TORTOISE SHELL BRACKET 29D9 ; Pe # RIGHT WIGGLY FENCE 29DB ; Pe # RIGHT DOUBLE WIGGLY FENCE 29FD ; Pe # RIGHT-POINTING CURVED ANGLE BRACKET 2E23 ; Pe # TOP RIGHT HALF BRACKET 2E25 ; Pe # BOTTOM RIGHT HALF BRACKET 2E27 ; Pe # RIGHT SIDEWAYS U BRACKET 2E29 ; Pe # RIGHT DOUBLE PARENTHESIS 2E56 ; Pe # RIGHT SQUARE BRACKET WITH STROKE 2E58 ; Pe # RIGHT SQUARE BRACKET WITH DOUBLE STROKE 2E5A ; Pe # TOP HALF RIGHT PARENTHESIS 2E5C ; Pe # BOTTOM HALF RIGHT PARENTHESIS 3009 ; Pe # RIGHT ANGLE BRACKET 300B ; Pe # RIGHT DOUBLE ANGLE BRACKET 300D ; Pe # RIGHT CORNER BRACKET 300F ; Pe # RIGHT WHITE CORNER BRACKET 3011 ; Pe # RIGHT BLACK LENTICULAR BRACKET 3015 ; Pe # RIGHT TORTOISE SHELL BRACKET 3017 ; Pe # RIGHT WHITE LENTICULAR BRACKET 3019 ; Pe # RIGHT WHITE TORTOISE SHELL BRACKET 301B ; Pe # RIGHT WHITE SQUARE BRACKET 301E..301F ; Pe # [2] DOUBLE PRIME QUOTATION MARK..LOW DOUBLE PRIME QUOTATION MARK FD3E ; Pe # ORNATE LEFT PARENTHESIS FE18 ; Pe # PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRAKCET FE36 ; Pe # PRESENTATION FORM FOR VERTICAL RIGHT PARENTHESIS FE38 ; Pe # PRESENTATION FORM FOR VERTICAL RIGHT CURLY BRACKET FE3A ; Pe # PRESENTATION FORM FOR VERTICAL RIGHT TORTOISE SHELL BRACKET FE3C ; Pe # PRESENTATION FORM FOR VERTICAL RIGHT BLACK LENTICULAR BRACKET FE3E ; Pe # PRESENTATION FORM FOR VERTICAL RIGHT DOUBLE ANGLE BRACKET FE40 ; Pe # PRESENTATION FORM FOR VERTICAL RIGHT ANGLE BRACKET FE42 ; Pe # PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET FE44 ; Pe # PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET FE48 ; Pe # PRESENTATION FORM FOR VERTICAL RIGHT SQUARE BRACKET FE5A ; Pe # SMALL RIGHT PARENTHESIS FE5C ; Pe # SMALL RIGHT CURLY BRACKET FE5E ; Pe # SMALL RIGHT TORTOISE SHELL BRACKET FF09 ; Pe # FULLWIDTH RIGHT 
PARENTHESIS FF3D ; Pe # FULLWIDTH RIGHT SQUARE BRACKET FF5D ; Pe # FULLWIDTH RIGHT CURLY BRACKET FF60 ; Pe # FULLWIDTH RIGHT WHITE PARENTHESIS FF63 ; Pe # HALFWIDTH RIGHT CORNER BRACKET # Total code points: 77 # ================================================ # General_Category=Connector_Punctuation 005F ; Pc # LOW LINE 203F..2040 ; Pc # [2] UNDERTIE..CHARACTER TIE 2054 ; Pc # INVERTED UNDERTIE FE33..FE34 ; Pc # [2] PRESENTATION FORM FOR VERTICAL LOW LINE..PRESENTATION FORM FOR VERTICAL WAVY LOW LINE FE4D..FE4F ; Pc # [3] DASHED LOW LINE..WAVY LOW LINE FF3F ; Pc # FULLWIDTH LOW LINE # Total code points: 10 # ================================================ # General_Category=Other_Punctuation 0021..0023 ; Po # [3] EXCLAMATION MARK..NUMBER SIGN 0025..0027 ; Po # [3] PERCENT SIGN..APOSTROPHE 002A ; Po # ASTERISK 002C ; Po # COMMA 002E..002F ; Po # [2] FULL STOP..SOLIDUS 003A..003B ; Po # [2] COLON..SEMICOLON 003F..0040 ; Po # [2] QUESTION MARK..COMMERCIAL AT 005C ; Po # REVERSE SOLIDUS 00A1 ; Po # INVERTED EXCLAMATION MARK 00A7 ; Po # SECTION SIGN 00B6..00B7 ; Po # [2] PILCROW SIGN..MIDDLE DOT 00BF ; Po # INVERTED QUESTION MARK 037E ; Po # GREEK QUESTION MARK 0387 ; Po # GREEK ANO TELEIA 055A..055F ; Po # [6] ARMENIAN APOSTROPHE..ARMENIAN ABBREVIATION MARK 0589 ; Po # ARMENIAN FULL STOP 05C0 ; Po # HEBREW PUNCTUATION PASEQ 05C3 ; Po # HEBREW PUNCTUATION SOF PASUQ 05C6 ; Po # HEBREW PUNCTUATION NUN HAFUKHA 05F3..05F4 ; Po # [2] HEBREW PUNCTUATION GERESH..HEBREW PUNCTUATION GERSHAYIM 0609..060A ; Po # [2] ARABIC-INDIC PER MILLE SIGN..ARABIC-INDIC PER TEN THOUSAND SIGN 060C..060D ; Po # [2] ARABIC COMMA..ARABIC DATE SEPARATOR 061B ; Po # ARABIC SEMICOLON 061D..061F ; Po # [3] ARABIC END OF TEXT MARK..ARABIC QUESTION MARK 066A..066D ; Po # [4] ARABIC PERCENT SIGN..ARABIC FIVE POINTED STAR 06D4 ; Po # ARABIC FULL STOP 0700..070D ; Po # [14] SYRIAC END OF PARAGRAPH..SYRIAC HARKLEAN ASTERISCUS 07F7..07F9 ; Po # [3] NKO SYMBOL GBAKURUNEN..NKO EXCLAMATION MARK 0830..083E ; 
Po # [15] SAMARITAN PUNCTUATION NEQUDAA..SAMARITAN PUNCTUATION ANNAAU 085E ; Po # MANDAIC PUNCTUATION 0964..0965 ; Po # [2] DEVANAGARI DANDA..DEVANAGARI DOUBLE DANDA 0970 ; Po # DEVANAGARI ABBREVIATION SIGN 09FD ; Po # BENGALI ABBREVIATION SIGN 0A76 ; Po # GURMUKHI ABBREVIATION SIGN 0AF0 ; Po # GUJARATI ABBREVIATION SIGN 0C77 ; Po # TELUGU SIGN SIDDHAM 0C84 ; Po # KANNADA SIGN SIDDHAM 0DF4 ; Po # SINHALA PUNCTUATION KUNDDALIYA 0E4F ; Po # THAI CHARACTER FONGMAN 0E5A..0E5B ; Po # [2] THAI CHARACTER ANGKHANKHU..THAI CHARACTER KHOMUT 0F04..0F12 ; Po # [15] TIBETAN MARK INITIAL YIG MGO MDUN MA..TIBETAN MARK RGYA GRAM SHAD 0F14 ; Po # TIBETAN MARK GTER TSHEG 0F85 ; Po # TIBETAN MARK PALUTA 0FD0..0FD4 ; Po # [5] TIBETAN MARK BSKA- SHOG GI MGO RGYAN..TIBETAN MARK CLOSING BRDA RNYING YIG MGO SGAB MA 0FD9..0FDA ; Po # [2] TIBETAN MARK LEADING MCHAN RTAGS..TIBETAN MARK TRAILING MCHAN RTAGS 104A..104F ; Po # [6] MYANMAR SIGN LITTLE SECTION..MYANMAR SYMBOL GENITIVE 10FB ; Po # GEORGIAN PARAGRAPH SEPARATOR 1360..1368 ; Po # [9] ETHIOPIC SECTION MARK..ETHIOPIC PARAGRAPH SEPARATOR 166E ; Po # CANADIAN SYLLABICS FULL STOP 16EB..16ED ; Po # [3] RUNIC SINGLE PUNCTUATION..RUNIC CROSS PUNCTUATION 1735..1736 ; Po # [2] PHILIPPINE SINGLE PUNCTUATION..PHILIPPINE DOUBLE PUNCTUATION 17D4..17D6 ; Po # [3] KHMER SIGN KHAN..KHMER SIGN CAMNUC PII KUUH 17D8..17DA ; Po # [3] KHMER SIGN BEYYAL..KHMER SIGN KOOMUUT 1800..1805 ; Po # [6] MONGOLIAN BIRGA..MONGOLIAN FOUR DOTS 1807..180A ; Po # [4] MONGOLIAN SIBE SYLLABLE BOUNDARY MARKER..MONGOLIAN NIRUGU 1944..1945 ; Po # [2] LIMBU EXCLAMATION MARK..LIMBU QUESTION MARK 1A1E..1A1F ; Po # [2] BUGINESE PALLAWA..BUGINESE END OF SECTION 1AA0..1AA6 ; Po # [7] TAI THAM SIGN WIANG..TAI THAM SIGN REVERSED ROTATED RANA 1AA8..1AAD ; Po # [6] TAI THAM SIGN KAAN..TAI THAM SIGN CAANG 1B4E..1B4F ; Po # [2] BALINESE INVERTED CARIK SIKI..BALINESE INVERTED CARIK PAREREN 1B5A..1B60 ; Po # [7] BALINESE PANTI..BALINESE PAMENENG 1B7D..1B7F ; Po # [3] BALINESE PANTI 
LANTANG..BALINESE PANTI BAWAK 1BFC..1BFF ; Po # [4] BATAK SYMBOL BINDU NA METEK..BATAK SYMBOL BINDU PANGOLAT 1C3B..1C3F ; Po # [5] LEPCHA PUNCTUATION TA-ROL..LEPCHA PUNCTUATION TSHOOK 1C7E..1C7F ; Po # [2] OL CHIKI PUNCTUATION MUCAAD..OL CHIKI PUNCTUATION DOUBLE MUCAAD 1CC0..1CC7 ; Po # [8] SUNDANESE PUNCTUATION BINDU SURYA..SUNDANESE PUNCTUATION BINDU BA SATANGA 1CD3 ; Po # VEDIC SIGN NIHSHVASA 2016..2017 ; Po # [2] DOUBLE VERTICAL LINE..DOUBLE LOW LINE 2020..2027 ; Po # [8] DAGGER..HYPHENATION POINT 2030..2038 ; Po # [9] PER MILLE SIGN..CARET 203B..203E ; Po # [4] REFERENCE MARK..OVERLINE 2041..2043 ; Po # [3] CARET INSERTION POINT..HYPHEN BULLET 2047..2051 ; Po # [11] DOUBLE QUESTION MARK..TWO ASTERISKS ALIGNED VERTICALLY 2053 ; Po # SWUNG DASH 2055..205E ; Po # [10] FLOWER PUNCTUATION MARK..VERTICAL FOUR DOTS 2CF9..2CFC ; Po # [4] COPTIC OLD NUBIAN FULL STOP..COPTIC OLD NUBIAN VERSE DIVIDER 2CFE..2CFF ; Po # [2] COPTIC FULL STOP..COPTIC MORPHOLOGICAL DIVIDER 2D70 ; Po # TIFINAGH SEPARATOR MARK 2E00..2E01 ; Po # [2] RIGHT ANGLE SUBSTITUTION MARKER..RIGHT ANGLE DOTTED SUBSTITUTION MARKER 2E06..2E08 ; Po # [3] RAISED INTERPOLATION MARKER..DOTTED TRANSPOSITION MARKER 2E0B ; Po # RAISED SQUARE 2E0E..2E16 ; Po # [9] EDITORIAL CORONIS..DOTTED RIGHT-POINTING ANGLE 2E18..2E19 ; Po # [2] INVERTED INTERROBANG..PALM BRANCH 2E1B ; Po # TILDE WITH RING ABOVE 2E1E..2E1F ; Po # [2] TILDE WITH DOT ABOVE..TILDE WITH DOT BELOW 2E2A..2E2E ; Po # [5] TWO DOTS OVER ONE DOT PUNCTUATION..REVERSED QUESTION MARK 2E30..2E39 ; Po # [10] RING POINT..TOP HALF SECTION SIGN 2E3C..2E3F ; Po # [4] STENOGRAPHIC FULL STOP..CAPITULUM 2E41 ; Po # REVERSED COMMA 2E43..2E4F ; Po # [13] DASH WITH LEFT UPTURN..CORNISH VERSE DIVIDER 2E52..2E54 ; Po # [3] TIRONIAN SIGN CAPITAL ET..MEDIEVAL QUESTION MARK 3001..3003 ; Po # [3] IDEOGRAPHIC COMMA..DITTO MARK 303D ; Po # PART ALTERNATION MARK 30FB ; Po # KATAKANA MIDDLE DOT A4FE..A4FF ; Po # [2] LISU PUNCTUATION COMMA..LISU PUNCTUATION FULL STOP A60D..A60F ; 
Po # [3] VAI COMMA..VAI QUESTION MARK A673 ; Po # SLAVONIC ASTERISK A67E ; Po # CYRILLIC KAVYKA A6F2..A6F7 ; Po # [6] BAMUM NJAEMLI..BAMUM QUESTION MARK A874..A877 ; Po # [4] PHAGS-PA SINGLE HEAD MARK..PHAGS-PA MARK DOUBLE SHAD A8CE..A8CF ; Po # [2] SAURASHTRA DANDA..SAURASHTRA DOUBLE DANDA A8F8..A8FA ; Po # [3] DEVANAGARI SIGN PUSHPIKA..DEVANAGARI CARET A8FC ; Po # DEVANAGARI SIGN SIDDHAM A92E..A92F ; Po # [2] KAYAH LI SIGN CWI..KAYAH LI SIGN SHYA A95F ; Po # REJANG SECTION MARK A9C1..A9CD ; Po # [13] JAVANESE LEFT RERENGGAN..JAVANESE TURNED PADA PISELEH A9DE..A9DF ; Po # [2] JAVANESE PADA TIRTA TUMETES..JAVANESE PADA ISEN-ISEN AA5C..AA5F ; Po # [4] CHAM PUNCTUATION SPIRAL..CHAM PUNCTUATION TRIPLE DANDA AADE..AADF ; Po # [2] TAI VIET SYMBOL HO HOI..TAI VIET SYMBOL KOI KOI AAF0..AAF1 ; Po # [2] MEETEI MAYEK CHEIKHAN..MEETEI MAYEK AHANG KHUDAM ABEB ; Po # MEETEI MAYEK CHEIKHEI FE10..FE16 ; Po # [7] PRESENTATION FORM FOR VERTICAL COMMA..PRESENTATION FORM FOR VERTICAL QUESTION MARK FE19 ; Po # PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS FE30 ; Po # PRESENTATION FORM FOR VERTICAL TWO DOT LEADER FE45..FE46 ; Po # [2] SESAME DOT..WHITE SESAME DOT FE49..FE4C ; Po # [4] DASHED OVERLINE..DOUBLE WAVY OVERLINE FE50..FE52 ; Po # [3] SMALL COMMA..SMALL FULL STOP FE54..FE57 ; Po # [4] SMALL SEMICOLON..SMALL EXCLAMATION MARK FE5F..FE61 ; Po # [3] SMALL NUMBER SIGN..SMALL ASTERISK FE68 ; Po # SMALL REVERSE SOLIDUS FE6A..FE6B ; Po # [2] SMALL PERCENT SIGN..SMALL COMMERCIAL AT FF01..FF03 ; Po # [3] FULLWIDTH EXCLAMATION MARK..FULLWIDTH NUMBER SIGN FF05..FF07 ; Po # [3] FULLWIDTH PERCENT SIGN..FULLWIDTH APOSTROPHE FF0A ; Po # FULLWIDTH ASTERISK FF0C ; Po # FULLWIDTH COMMA FF0E..FF0F ; Po # [2] FULLWIDTH FULL STOP..FULLWIDTH SOLIDUS FF1A..FF1B ; Po # [2] FULLWIDTH COLON..FULLWIDTH SEMICOLON FF1F..FF20 ; Po # [2] FULLWIDTH QUESTION MARK..FULLWIDTH COMMERCIAL AT FF3C ; Po # FULLWIDTH REVERSE SOLIDUS FF61 ; Po # HALFWIDTH IDEOGRAPHIC FULL STOP FF64..FF65 ; Po # [2] HALFWIDTH 
IDEOGRAPHIC COMMA..HALFWIDTH KATAKANA MIDDLE DOT 10100..10102 ; Po # [3] AEGEAN WORD SEPARATOR LINE..AEGEAN CHECK MARK 1039F ; Po # UGARITIC WORD DIVIDER 103D0 ; Po # OLD PERSIAN WORD DIVIDER 1056F ; Po # CAUCASIAN ALBANIAN CITATION MARK 10857 ; Po # IMPERIAL ARAMAIC SECTION SIGN 1091F ; Po # PHOENICIAN WORD SEPARATOR 1093F ; Po # LYDIAN TRIANGULAR MARK 10A50..10A58 ; Po # [9] KHAROSHTHI PUNCTUATION DOT..KHAROSHTHI PUNCTUATION LINES 10A7F ; Po # OLD SOUTH ARABIAN NUMERIC INDICATOR 10AF0..10AF6 ; Po # [7] MANICHAEAN PUNCTUATION STAR..MANICHAEAN PUNCTUATION LINE FILLER 10B39..10B3F ; Po # [7] AVESTAN ABBREVIATION MARK..LARGE ONE RING OVER TWO RINGS PUNCTUATION 10B99..10B9C ; Po # [4] PSALTER PAHLAVI SECTION MARK..PSALTER PAHLAVI FOUR DOTS WITH DOT 10ED0 ; Po # ARABIC BIBLICAL END OF VERSE 10F55..10F59 ; Po # [5] SOGDIAN PUNCTUATION TWO VERTICAL BARS..SOGDIAN PUNCTUATION HALF CIRCLE WITH DOT 10F86..10F89 ; Po # [4] OLD UYGHUR PUNCTUATION BAR..OLD UYGHUR PUNCTUATION FOUR DOTS 11047..1104D ; Po # [7] BRAHMI DANDA..BRAHMI PUNCTUATION LOTUS 110BB..110BC ; Po # [2] KAITHI ABBREVIATION SIGN..KAITHI ENUMERATION SIGN 110BE..110C1 ; Po # [4] KAITHI SECTION MARK..KAITHI DOUBLE DANDA 11140..11143 ; Po # [4] CHAKMA SECTION MARK..CHAKMA QUESTION MARK 11174..11175 ; Po # [2] MAHAJANI ABBREVIATION SIGN..MAHAJANI SECTION MARK 111C5..111C8 ; Po # [4] SHARADA DANDA..SHARADA SEPARATOR 111CD ; Po # SHARADA SUTRA MARK 111DB ; Po # SHARADA SIGN SIDDHAM 111DD..111DF ; Po # [3] SHARADA CONTINUATION SIGN..SHARADA SECTION MARK-2 11238..1123D ; Po # [6] KHOJKI DANDA..KHOJKI ABBREVIATION SIGN 112A9 ; Po # MULTANI SECTION MARK 113D4..113D5 ; Po # [2] TULU-TIGALARI DANDA..TULU-TIGALARI DOUBLE DANDA 113D7..113D8 ; Po # [2] TULU-TIGALARI SIGN OM PUSHPIKA..TULU-TIGALARI SIGN SHRII PUSHPIKA 1144B..1144F ; Po # [5] NEWA DANDA..NEWA ABBREVIATION SIGN 1145A..1145B ; Po # [2] NEWA DOUBLE COMMA..NEWA PLACEHOLDER MARK 1145D ; Po # NEWA INSERTION SIGN 114C6 ; Po # TIRHUTA ABBREVIATION SIGN 115C1..115D7 ; Po 
# [23] SIDDHAM SIGN SIDDHAM..SIDDHAM SECTION MARK WITH CIRCLES AND FOUR ENCLOSURES 11641..11643 ; Po # [3] MODI DANDA..MODI ABBREVIATION SIGN 11660..1166C ; Po # [13] MONGOLIAN BIRGA WITH ORNAMENT..MONGOLIAN TURNED SWIRL BIRGA WITH DOUBLE ORNAMENT 116B9 ; Po # TAKRI ABBREVIATION SIGN 1173C..1173E ; Po # [3] AHOM SIGN SMALL SECTION..AHOM SIGN RULAI 1183B ; Po # DOGRA ABBREVIATION SIGN 11944..11946 ; Po # [3] DIVES AKURU DOUBLE DANDA..DIVES AKURU END OF TEXT MARK 119E2 ; Po # NANDINAGARI SIGN SIDDHAM 11A3F..11A46 ; Po # [8] ZANABAZAR SQUARE INITIAL HEAD MARK..ZANABAZAR SQUARE CLOSING DOUBLE-LINED HEAD MARK 11A9A..11A9C ; Po # [3] SOYOMBO MARK TSHEG..SOYOMBO MARK DOUBLE SHAD 11A9E..11AA2 ; Po # [5] SOYOMBO HEAD MARK WITH MOON AND SUN AND TRIPLE FLAME..SOYOMBO TERMINAL MARK-2 11B00..11B09 ; Po # [10] DEVANAGARI HEAD MARK..DEVANAGARI SIGN MINDU 11BE1 ; Po # SUNUWAR SIGN PVO 11C41..11C45 ; Po # [5] BHAIKSUKI DANDA..BHAIKSUKI GAP FILLER-2 11C70..11C71 ; Po # [2] MARCHEN HEAD MARK..MARCHEN MARK SHAD 11EF7..11EF8 ; Po # [2] MAKASAR PASSIMBANG..MAKASAR END OF SECTION 11F43..11F4F ; Po # [13] KAWI DANDA..KAWI PUNCTUATION CLOSING SPIRAL 11FFF ; Po # TAMIL PUNCTUATION END OF TEXT 12470..12474 ; Po # [5] CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER..CUNEIFORM PUNCTUATION SIGN DIAGONAL QUADCOLON 12FF1..12FF2 ; Po # [2] CYPRO-MINOAN SIGN CM301..CYPRO-MINOAN SIGN CM302 16A6E..16A6F ; Po # [2] MRO DANDA..MRO DOUBLE DANDA 16AF5 ; Po # BASSA VAH FULL STOP 16B37..16B3B ; Po # [5] PAHAWH HMONG SIGN VOS THOM..PAHAWH HMONG SIGN VOS FEEM 16B44 ; Po # PAHAWH HMONG SIGN XAUS 16D6D..16D6F ; Po # [3] KIRAT RAI SIGN YUPI..KIRAT RAI DOUBLE DANDA 16E97..16E9A ; Po # [4] MEDEFAIDRIN COMMA..MEDEFAIDRIN EXCLAMATION OH 16FE2 ; Po # OLD CHINESE HOOK MARK 1BC9F ; Po # DUPLOYAN PUNCTUATION CHINOOK FULL STOP 1DA87..1DA8B ; Po # [5] SIGNWRITING COMMA..SIGNWRITING PARENTHESIS 1E5FF ; Po # OL ONAL ABBREVIATION SIGN 1E95E..1E95F ; Po # [2] ADLAM INITIAL EXCLAMATION MARK..ADLAM INITIAL QUESTION MARK # 
Total code points: 641 # ================================================ # General_Category=Math_Symbol 002B ; Sm # PLUS SIGN 003C..003E ; Sm # [3] LESS-THAN SIGN..GREATER-THAN SIGN 007C ; Sm # VERTICAL LINE 007E ; Sm # TILDE 00AC ; Sm # NOT SIGN 00B1 ; Sm # PLUS-MINUS SIGN 00D7 ; Sm # MULTIPLICATION SIGN 00F7 ; Sm # DIVISION SIGN 03F6 ; Sm # GREEK REVERSED LUNATE EPSILON SYMBOL 0606..0608 ; Sm # [3] ARABIC-INDIC CUBE ROOT..ARABIC RAY 2044 ; Sm # FRACTION SLASH 2052 ; Sm # COMMERCIAL MINUS SIGN 207A..207C ; Sm # [3] SUPERSCRIPT PLUS SIGN..SUPERSCRIPT EQUALS SIGN 208A..208C ; Sm # [3] SUBSCRIPT PLUS SIGN..SUBSCRIPT EQUALS SIGN 2118 ; Sm # SCRIPT CAPITAL P 2140..2144 ; Sm # [5] DOUBLE-STRUCK N-ARY SUMMATION..TURNED SANS-SERIF CAPITAL Y 214B ; Sm # TURNED AMPERSAND 2190..2194 ; Sm # [5] LEFTWARDS ARROW..LEFT RIGHT ARROW 219A..219B ; Sm # [2] LEFTWARDS ARROW WITH STROKE..RIGHTWARDS ARROW WITH STROKE 21A0 ; Sm # RIGHTWARDS TWO HEADED ARROW 21A3 ; Sm # RIGHTWARDS ARROW WITH TAIL 21A6 ; Sm # RIGHTWARDS ARROW FROM BAR 21AE ; Sm # LEFT RIGHT ARROW WITH STROKE 21CE..21CF ; Sm # [2] LEFT RIGHT DOUBLE ARROW WITH STROKE..RIGHTWARDS DOUBLE ARROW WITH STROKE 21D2 ; Sm # RIGHTWARDS DOUBLE ARROW 21D4 ; Sm # LEFT RIGHT DOUBLE ARROW 21F4..22FF ; Sm # [268] RIGHT ARROW WITH SMALL CIRCLE..Z NOTATION BAG MEMBERSHIP 2320..2321 ; Sm # [2] TOP HALF INTEGRAL..BOTTOM HALF INTEGRAL 237C ; Sm # RIGHT ANGLE WITH DOWNWARDS ZIGZAG ARROW 239B..23B3 ; Sm # [25] LEFT PARENTHESIS UPPER HOOK..SUMMATION BOTTOM 23DC..23E1 ; Sm # [6] TOP PARENTHESIS..BOTTOM TORTOISE SHELL BRACKET 25B7 ; Sm # WHITE RIGHT-POINTING TRIANGLE 25C1 ; Sm # WHITE LEFT-POINTING TRIANGLE 25F8..25FF ; Sm # [8] UPPER LEFT TRIANGLE..LOWER RIGHT TRIANGLE 266F ; Sm # MUSIC SHARP SIGN 27C0..27C4 ; Sm # [5] THREE DIMENSIONAL ANGLE..OPEN SUPERSET 27C7..27E5 ; Sm # [31] OR WITH DOT INSIDE..WHITE SQUARE WITH RIGHTWARDS TICK 27F0..27FF ; Sm # [16] UPWARDS QUADRUPLE ARROW..LONG RIGHTWARDS SQUIGGLE ARROW 2900..2982 ; Sm # [131] RIGHTWARDS 
TWO-HEADED ARROW WITH VERTICAL STROKE..Z NOTATION TYPE COLON 2999..29D7 ; Sm # [63] DOTTED FENCE..BLACK HOURGLASS 29DC..29FB ; Sm # [32] INCOMPLETE INFINITY..TRIPLE PLUS 29FE..2AFF ; Sm # [258] TINY..N-ARY WHITE VERTICAL BAR 2B30..2B44 ; Sm # [21] LEFT ARROW WITH SMALL CIRCLE..RIGHTWARDS ARROW THROUGH SUPERSET 2B47..2B4C ; Sm # [6] REVERSE TILDE OPERATOR ABOVE RIGHTWARDS ARROW..RIGHTWARDS ARROW ABOVE REVERSE TILDE OPERATOR FB29 ; Sm # HEBREW LETTER ALTERNATIVE PLUS SIGN FE62 ; Sm # SMALL PLUS SIGN FE64..FE66 ; Sm # [3] SMALL LESS-THAN SIGN..SMALL EQUALS SIGN FF0B ; Sm # FULLWIDTH PLUS SIGN FF1C..FF1E ; Sm # [3] FULLWIDTH LESS-THAN SIGN..FULLWIDTH GREATER-THAN SIGN FF5C ; Sm # FULLWIDTH VERTICAL LINE FF5E ; Sm # FULLWIDTH TILDE FFE2 ; Sm # FULLWIDTH NOT SIGN FFE9..FFEC ; Sm # [4] HALFWIDTH LEFTWARDS ARROW..HALFWIDTH DOWNWARDS ARROW 10D8E..10D8F ; Sm # [2] GARAY PLUS SIGN..GARAY MINUS SIGN 1CEF0 ; Sm # MEDIUM SMALL WHITE CIRCLE WITH HORIZONTAL BAR 1D6C1 ; Sm # MATHEMATICAL BOLD NABLA 1D6DB ; Sm # MATHEMATICAL BOLD PARTIAL DIFFERENTIAL 1D6FB ; Sm # MATHEMATICAL ITALIC NABLA 1D715 ; Sm # MATHEMATICAL ITALIC PARTIAL DIFFERENTIAL 1D735 ; Sm # MATHEMATICAL BOLD ITALIC NABLA 1D74F ; Sm # MATHEMATICAL BOLD ITALIC PARTIAL DIFFERENTIAL 1D76F ; Sm # MATHEMATICAL SANS-SERIF BOLD NABLA 1D789 ; Sm # MATHEMATICAL SANS-SERIF BOLD PARTIAL DIFFERENTIAL 1D7A9 ; Sm # MATHEMATICAL SANS-SERIF BOLD ITALIC NABLA 1D7C3 ; Sm # MATHEMATICAL SANS-SERIF BOLD ITALIC PARTIAL DIFFERENTIAL 1EEF0..1EEF1 ; Sm # [2] ARABIC MATHEMATICAL OPERATOR MEEM WITH HAH WITH TATWEEL..ARABIC MATHEMATICAL OPERATOR HAH WITH DAL 1F8D0..1F8D8 ; Sm # [9] LONG RIGHTWARDS ARROW OVER LONG LEFTWARDS ARROW..LONG LEFT RIGHT ARROW WITH DEPENDENT LOBE # Total code points: 960 # ================================================ # General_Category=Currency_Symbol 0024 ; Sc # DOLLAR SIGN 00A2..00A5 ; Sc # [4] CENT SIGN..YEN SIGN 058F ; Sc # ARMENIAN DRAM SIGN 060B ; Sc # AFGHANI SIGN 07FE..07FF ; Sc # [2] NKO DOROME SIGN..NKO 
TAMAN SIGN 09F2..09F3 ; Sc # [2] BENGALI RUPEE MARK..BENGALI RUPEE SIGN 09FB ; Sc # BENGALI GANDA MARK 0AF1 ; Sc # GUJARATI RUPEE SIGN 0BF9 ; Sc # TAMIL RUPEE SIGN 0E3F ; Sc # THAI CURRENCY SYMBOL BAHT 17DB ; Sc # KHMER CURRENCY SYMBOL RIEL 20A0..20C1 ; Sc # [34] EURO-CURRENCY SIGN..SAUDI RIYAL SIGN A838 ; Sc # NORTH INDIC RUPEE MARK FDFC ; Sc # RIAL SIGN FE69 ; Sc # SMALL DOLLAR SIGN FF04 ; Sc # FULLWIDTH DOLLAR SIGN FFE0..FFE1 ; Sc # [2] FULLWIDTH CENT SIGN..FULLWIDTH POUND SIGN FFE5..FFE6 ; Sc # [2] FULLWIDTH YEN SIGN..FULLWIDTH WON SIGN 11FDD..11FE0 ; Sc # [4] TAMIL SIGN KAACU..TAMIL SIGN VARAAKAN 1E2FF ; Sc # WANCHO NGUN SIGN 1ECB0 ; Sc # INDIC SIYAQ RUPEE MARK # Total code points: 64 # ================================================ # General_Category=Modifier_Symbol 005E ; Sk # CIRCUMFLEX ACCENT 0060 ; Sk # GRAVE ACCENT 00A8 ; Sk # DIAERESIS 00AF ; Sk # MACRON 00B4 ; Sk # ACUTE ACCENT 00B8 ; Sk # CEDILLA 02C2..02C5 ; Sk # [4] MODIFIER LETTER LEFT ARROWHEAD..MODIFIER LETTER DOWN ARROWHEAD 02D2..02DF ; Sk # [14] MODIFIER LETTER CENTRED RIGHT HALF RING..MODIFIER LETTER CROSS ACCENT 02E5..02EB ; Sk # [7] MODIFIER LETTER EXTRA-HIGH TONE BAR..MODIFIER LETTER YANG DEPARTING TONE MARK 02ED ; Sk # MODIFIER LETTER UNASPIRATED 02EF..02FF ; Sk # [17] MODIFIER LETTER LOW DOWN ARROWHEAD..MODIFIER LETTER LOW LEFT ARROW 0375 ; Sk # GREEK LOWER NUMERAL SIGN 0384..0385 ; Sk # [2] GREEK TONOS..GREEK DIALYTIKA TONOS 0888 ; Sk # ARABIC RAISED ROUND DOT 1FBD ; Sk # GREEK KORONIS 1FBF..1FC1 ; Sk # [3] GREEK PSILI..GREEK DIALYTIKA AND PERISPOMENI 1FCD..1FCF ; Sk # [3] GREEK PSILI AND VARIA..GREEK PSILI AND PERISPOMENI 1FDD..1FDF ; Sk # [3] GREEK DASIA AND VARIA..GREEK DASIA AND PERISPOMENI 1FED..1FEF ; Sk # [3] GREEK DIALYTIKA AND VARIA..GREEK VARIA 1FFD..1FFE ; Sk # [2] GREEK OXIA..GREEK DASIA 309B..309C ; Sk # [2] KATAKANA-HIRAGANA VOICED SOUND MARK..KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK A700..A716 ; Sk # [23] MODIFIER LETTER CHINESE TONE YIN PING..MODIFIER LETTER EXTRA-LOW 
LEFT-STEM TONE BAR A720..A721 ; Sk # [2] MODIFIER LETTER STRESS AND HIGH TONE..MODIFIER LETTER STRESS AND LOW TONE A789..A78A ; Sk # [2] MODIFIER LETTER COLON..MODIFIER LETTER SHORT EQUALS SIGN AB5B ; Sk # MODIFIER BREVE WITH INVERTED BREVE AB6A..AB6B ; Sk # [2] MODIFIER LETTER LEFT TACK..MODIFIER LETTER RIGHT TACK FBB2..FBC2 ; Sk # [17] ARABIC SYMBOL DOT ABOVE..ARABIC SYMBOL WASLA ABOVE FF3E ; Sk # FULLWIDTH CIRCUMFLEX ACCENT FF40 ; Sk # FULLWIDTH GRAVE ACCENT FFE3 ; Sk # FULLWIDTH MACRON 1F3FB..1F3FF ; Sk # [5] EMOJI MODIFIER FITZPATRICK TYPE-1-2..EMOJI MODIFIER FITZPATRICK TYPE-6 # Total code points: 125 # ================================================ # General_Category=Other_Symbol 00A6 ; So # BROKEN BAR 00A9 ; So # COPYRIGHT SIGN 00AE ; So # REGISTERED SIGN 00B0 ; So # DEGREE SIGN 0482 ; So # CYRILLIC THOUSANDS SIGN 058D..058E ; So # [2] RIGHT-FACING ARMENIAN ETERNITY SIGN..LEFT-FACING ARMENIAN ETERNITY SIGN 060E..060F ; So # [2] ARABIC POETIC VERSE SIGN..ARABIC SIGN MISRA 06DE ; So # ARABIC START OF RUB EL HIZB 06E9 ; So # ARABIC PLACE OF SAJDAH 06FD..06FE ; So # [2] ARABIC SIGN SINDHI AMPERSAND..ARABIC SIGN SINDHI POSTPOSITION MEN 07F6 ; So # NKO SYMBOL OO DENNEN 09FA ; So # BENGALI ISSHAR 0B70 ; So # ORIYA ISSHAR 0BF3..0BF8 ; So # [6] TAMIL DAY SIGN..TAMIL AS ABOVE SIGN 0BFA ; So # TAMIL NUMBER SIGN 0C7F ; So # TELUGU SIGN TUUMU 0D4F ; So # MALAYALAM SIGN PARA 0D79 ; So # MALAYALAM DATE MARK 0F01..0F03 ; So # [3] TIBETAN MARK GTER YIG MGO TRUNCATED A..TIBETAN MARK GTER YIG MGO -UM GTER TSHEG MA 0F13 ; So # TIBETAN MARK CARET -DZUD RTAGS ME LONG CAN 0F15..0F17 ; So # [3] TIBETAN LOGOTYPE SIGN CHAD RTAGS..TIBETAN ASTROLOGICAL SIGN SGRA GCAN -CHAR RTAGS 0F1A..0F1F ; So # [6] TIBETAN SIGN RDEL DKAR GCIG..TIBETAN SIGN RDEL DKAR RDEL NAG 0F34 ; So # TIBETAN MARK BSDUS RTAGS 0F36 ; So # TIBETAN MARK CARET -DZUD RTAGS BZHI MIG CAN 0F38 ; So # TIBETAN MARK CHE MGO 0FBE..0FC5 ; So # [8] TIBETAN KU RU KHA..TIBETAN SYMBOL RDO RJE 0FC7..0FCC ; So # [6] TIBETAN SYMBOL 
RDO RJE RGYA GRAM..TIBETAN SYMBOL NOR BU BZHI -KHYIL 0FCE..0FCF ; So # [2] TIBETAN SIGN RDEL NAG RDEL DKAR..TIBETAN SIGN RDEL NAG GSUM 0FD5..0FD8 ; So # [4] RIGHT-FACING SVASTI SIGN..LEFT-FACING SVASTI SIGN WITH DOTS 109E..109F ; So # [2] MYANMAR SYMBOL SHAN ONE..MYANMAR SYMBOL SHAN EXCLAMATION 1390..1399 ; So # [10] ETHIOPIC TONAL MARK YIZET..ETHIOPIC TONAL MARK KURT 166D ; So # CANADIAN SYLLABICS CHI SIGN 1940 ; So # LIMBU SIGN LOO 19DE..19FF ; So # [34] NEW TAI LUE SIGN LAE..KHMER SYMBOL DAP-PRAM ROC 1B61..1B6A ; So # [10] BALINESE MUSICAL SYMBOL DONG..BALINESE MUSICAL SYMBOL DANG GEDE 1B74..1B7C ; So # [9] BALINESE MUSICAL SYMBOL RIGHT-HAND OPEN DUG..BALINESE MUSICAL SYMBOL LEFT-HAND OPEN PING 2100..2101 ; So # [2] ACCOUNT OF..ADDRESSED TO THE SUBJECT 2103..2106 ; So # [4] DEGREE CELSIUS..CADA UNA 2108..2109 ; So # [2] SCRUPLE..DEGREE FAHRENHEIT 2114 ; So # L B BAR SYMBOL 2116..2117 ; So # [2] NUMERO SIGN..SOUND RECORDING COPYRIGHT 211E..2123 ; So # [6] PRESCRIPTION TAKE..VERSICLE 2125 ; So # OUNCE SIGN 2127 ; So # INVERTED OHM SIGN 2129 ; So # TURNED GREEK SMALL LETTER IOTA 212E ; So # ESTIMATED SYMBOL 213A..213B ; So # [2] ROTATED CAPITAL Q..FACSIMILE SIGN 214A ; So # PROPERTY LINE 214C..214D ; So # [2] PER SIGN..AKTIESELSKAB 214F ; So # SYMBOL FOR SAMARITAN SOURCE 218A..218B ; So # [2] TURNED DIGIT TWO..TURNED DIGIT THREE 2195..2199 ; So # [5] UP DOWN ARROW..SOUTH WEST ARROW 219C..219F ; So # [4] LEFTWARDS WAVE ARROW..UPWARDS TWO HEADED ARROW 21A1..21A2 ; So # [2] DOWNWARDS TWO HEADED ARROW..LEFTWARDS ARROW WITH TAIL 21A4..21A5 ; So # [2] LEFTWARDS ARROW FROM BAR..UPWARDS ARROW FROM BAR 21A7..21AD ; So # [7] DOWNWARDS ARROW FROM BAR..LEFT RIGHT WAVE ARROW 21AF..21CD ; So # [31] DOWNWARDS ZIGZAG ARROW..LEFTWARDS DOUBLE ARROW WITH STROKE 21D0..21D1 ; So # [2] LEFTWARDS DOUBLE ARROW..UPWARDS DOUBLE ARROW 21D3 ; So # DOWNWARDS DOUBLE ARROW 21D5..21F3 ; So # [31] UP DOWN DOUBLE ARROW..UP DOWN WHITE ARROW 2300..2307 ; So # [8] DIAMETER SIGN..WAVY LINE 230C..231F ; 
So # [20] BOTTOM RIGHT CROP..BOTTOM RIGHT CORNER 2322..2328 ; So # [7] FROWN..KEYBOARD 232B..237B ; So # [81] ERASE TO THE LEFT..NOT CHECK MARK 237D..239A ; So # [30] SHOULDERED OPEN BOX..CLEAR SCREEN SYMBOL 23B4..23DB ; So # [40] TOP SQUARE BRACKET..FUSE 23E2..2429 ; So # [72] WHITE TRAPEZIUM..SYMBOL FOR DELETE MEDIUM SHADE FORM 2440..244A ; So # [11] OCR HOOK..OCR DOUBLE BACKSLASH 249C..24E9 ; So # [78] PARENTHESIZED LATIN SMALL LETTER A..CIRCLED LATIN SMALL LETTER Z 2500..25B6 ; So # [183] BOX DRAWINGS LIGHT HORIZONTAL..BLACK RIGHT-POINTING TRIANGLE 25B8..25C0 ; So # [9] BLACK RIGHT-POINTING SMALL TRIANGLE..BLACK LEFT-POINTING TRIANGLE 25C2..25F7 ; So # [54] BLACK LEFT-POINTING SMALL TRIANGLE..WHITE CIRCLE WITH UPPER RIGHT QUADRANT 2600..266E ; So # [111] BLACK SUN WITH RAYS..MUSIC NATURAL SIGN 2670..2767 ; So # [248] WEST SYRIAC CROSS..ROTATED FLORAL HEART BULLET 2794..27BF ; So # [44] HEAVY WIDE-HEADED RIGHTWARDS ARROW..DOUBLE CURLY LOOP 2800..28FF ; So # [256] BRAILLE PATTERN BLANK..BRAILLE PATTERN DOTS-12345678 2B00..2B2F ; So # [48] NORTH EAST WHITE ARROW..WHITE VERTICAL ELLIPSE 2B45..2B46 ; So # [2] LEFTWARDS QUADRUPLE ARROW..RIGHTWARDS QUADRUPLE ARROW 2B4D..2B73 ; So # [39] DOWNWARDS TRIANGLE-HEADED ZIGZAG ARROW..DOWNWARDS TRIANGLE-HEADED ARROW TO BAR 2B76..2BFF ; So # [138] NORTH WEST TRIANGLE-HEADED ARROW TO BAR..HELLSCHREIBER PAUSE SYMBOL 2CE5..2CEA ; So # [6] COPTIC SYMBOL MI RO..COPTIC SYMBOL SHIMA SIMA 2E50..2E51 ; So # [2] CROSS PATTY WITH RIGHT CROSSBAR..CROSS PATTY WITH LEFT CROSSBAR 2E80..2E99 ; So # [26] CJK RADICAL REPEAT..CJK RADICAL RAP 2E9B..2EF3 ; So # [89] CJK RADICAL CHOKE..CJK RADICAL C-SIMPLIFIED TURTLE 2F00..2FD5 ; So # [214] KANGXI RADICAL ONE..KANGXI RADICAL FLUTE 2FF0..2FFF ; So # [16] IDEOGRAPHIC DESCRIPTION CHARACTER LEFT TO RIGHT..IDEOGRAPHIC DESCRIPTION CHARACTER ROTATION 3004 ; So # JAPANESE INDUSTRIAL STANDARD SYMBOL 3012..3013 ; So # [2] POSTAL MARK..GETA MARK 3020 ; So # POSTAL MARK FACE 3036..3037 ; So # [2] CIRCLED POSTAL 
MARK..IDEOGRAPHIC TELEGRAPH LINE FEED SEPARATOR SYMBOL 303E..303F ; So # [2] IDEOGRAPHIC VARIATION INDICATOR..IDEOGRAPHIC HALF FILL SPACE 3190..3191 ; So # [2] IDEOGRAPHIC ANNOTATION LINKING MARK..IDEOGRAPHIC ANNOTATION REVERSE MARK 3196..319F ; So # [10] IDEOGRAPHIC ANNOTATION TOP MARK..IDEOGRAPHIC ANNOTATION MAN MARK 31C0..31E5 ; So # [38] CJK STROKE T..CJK STROKE SZP 31EF ; So # IDEOGRAPHIC DESCRIPTION CHARACTER SUBTRACTION 3200..321E ; So # [31] PARENTHESIZED HANGUL KIYEOK..PARENTHESIZED KOREAN CHARACTER O HU 322A..3247 ; So # [30] PARENTHESIZED IDEOGRAPH MOON..CIRCLED IDEOGRAPH KOTO 3250 ; So # PARTNERSHIP SIGN 3260..327F ; So # [32] CIRCLED HANGUL KIYEOK..KOREAN STANDARD SYMBOL 328A..32B0 ; So # [39] CIRCLED IDEOGRAPH MOON..CIRCLED IDEOGRAPH NIGHT 32C0..33FF ; So # [320] IDEOGRAPHIC TELEGRAPH SYMBOL FOR JANUARY..SQUARE GAL 4DC0..4DFF ; So # [64] HEXAGRAM FOR THE CREATIVE HEAVEN..HEXAGRAM FOR BEFORE COMPLETION A490..A4C6 ; So # [55] YI RADICAL QOT..YI RADICAL KE A828..A82B ; So # [4] SYLOTI NAGRI POETRY MARK-1..SYLOTI NAGRI POETRY MARK-4 A836..A837 ; So # [2] NORTH INDIC QUARTER MARK..NORTH INDIC PLACEHOLDER MARK A839 ; So # NORTH INDIC QUANTITY MARK AA77..AA79 ; So # [3] MYANMAR SYMBOL AITON EXCLAMATION..MYANMAR SYMBOL AITON TWO FBC3..FBD2 ; So # [16] ARABIC LIGATURE JALLA WA-ALAA..ARABIC LIGATURE ALAYHI AR-RAHMAH FD40..FD4F ; So # [16] ARABIC LIGATURE RAHIMAHU ALLAAH..ARABIC LIGATURE RAHIMAHUM ALLAAH FD90..FD91 ; So # [2] ARABIC LIGATURE RAHMATU ALLAAHI ALAYH..ARABIC LIGATURE RAHMATU ALLAAHI ALAYHAA FDC8..FDCF ; So # [8] ARABIC LIGATURE RAHIMAHU ALLAAH TAAALAA..ARABIC LIGATURE SALAAMUHU ALAYNAA FDFD..FDFF ; So # [3] ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHEEM..ARABIC LIGATURE AZZA WA JALL FFE4 ; So # FULLWIDTH BROKEN BAR FFE8 ; So # HALFWIDTH FORMS LIGHT VERTICAL FFED..FFEE ; So # [2] HALFWIDTH BLACK SQUARE..HALFWIDTH WHITE CIRCLE FFFC..FFFD ; So # [2] OBJECT REPLACEMENT CHARACTER..REPLACEMENT CHARACTER 10137..1013F ; So # [9] AEGEAN WEIGHT BASE 
UNIT..AEGEAN MEASURE THIRD SUBUNIT 10179..10189 ; So # [17] GREEK YEAR SIGN..GREEK TRYBLION BASE SIGN 1018C..1018E ; So # [3] GREEK SINUSOID SIGN..NOMISMA SIGN 10190..1019C ; So # [13] ROMAN SEXTANS SIGN..ASCIA SYMBOL 101A0 ; So # GREEK SYMBOL TAU RHO 101D0..101FC ; So # [45] PHAISTOS DISC SIGN PEDESTRIAN..PHAISTOS DISC SIGN WAVY BAND 10877..10878 ; So # [2] PALMYRENE LEFT-POINTING FLEURON..PALMYRENE RIGHT-POINTING FLEURON 10AC8 ; So # MANICHAEAN SIGN UD 10ED1..10ED8 ; So # [8] ARABIC LIGATURE ALAYHAA AS-SALAATU WAS-SALAAM..ARABIC LIGATURE NAWWARA ALLAAHU MARQADAH 1173F ; So # AHOM SYMBOL VI 11FD5..11FDC ; So # [8] TAMIL SIGN NEL..TAMIL SIGN MUKKURUNI 11FE1..11FF1 ; So # [17] TAMIL SIGN PAARAM..TAMIL SIGN VAKAIYARAA 16B3C..16B3F ; So # [4] PAHAWH HMONG SIGN XYEEM NTXIV..PAHAWH HMONG SIGN XYEEM FAIB 16B45 ; So # PAHAWH HMONG SIGN CIM TSOV ROG 1BC9C ; So # DUPLOYAN SIGN O WITH CROSS 1CC00..1CCEF ; So # [240] UP-POINTING GO-KART..OUTLINED LATIN CAPITAL LETTER Z 1CCFA..1CCFC ; So # [3] SNAKE SYMBOL..NOSE SYMBOL 1CD00..1CEB3 ; So # [436] BLOCK OCTANT-3..BLACK RIGHT TRIANGLE CARET 1CEBA..1CED0 ; So # [23] FRAGILE SYMBOL..LEUKOTHEA 1CEE0..1CEEF ; So # [16] GEOMANTIC FIGURE POPULUS..GEOMANTIC FIGURE VIA 1CF50..1CFC3 ; So # [116] ZNAMENNY NEUME KRYUK..ZNAMENNY NEUME PAUK 1D000..1D0F5 ; So # [246] BYZANTINE MUSICAL SYMBOL PSILI..BYZANTINE MUSICAL SYMBOL GORGON NEO KATO 1D100..1D126 ; So # [39] MUSICAL SYMBOL SINGLE BARLINE..MUSICAL SYMBOL DRUM CLEF-2 1D129..1D164 ; So # [60] MUSICAL SYMBOL MULTIPLE MEASURE REST..MUSICAL SYMBOL ONE HUNDRED TWENTY-EIGHTH NOTE 1D16A..1D16C ; So # [3] MUSICAL SYMBOL FINGERED TREMOLO-1..MUSICAL SYMBOL FINGERED TREMOLO-3 1D183..1D184 ; So # [2] MUSICAL SYMBOL ARPEGGIATO UP..MUSICAL SYMBOL ARPEGGIATO DOWN 1D18C..1D1A9 ; So # [30] MUSICAL SYMBOL RINFORZANDO..MUSICAL SYMBOL DEGREE SLASH 1D1AE..1D1EA ; So # [61] MUSICAL SYMBOL PEDAL MARK..MUSICAL SYMBOL KORON 1D200..1D241 ; So # [66] GREEK VOCAL NOTATION SYMBOL-1..GREEK INSTRUMENTAL NOTATION SYMBOL-54 
1D245 ; So # GREEK MUSICAL LEIMMA 1D300..1D356 ; So # [87] MONOGRAM FOR EARTH..TETRAGRAM FOR FOSTERING 1D800..1D9FF ; So # [512] SIGNWRITING HAND-FIST INDEX..SIGNWRITING HEAD 1DA37..1DA3A ; So # [4] SIGNWRITING AIR BLOW SMALL ROTATIONS..SIGNWRITING BREATH EXHALE 1DA6D..1DA74 ; So # [8] SIGNWRITING SHOULDER HIP SPINE..SIGNWRITING TORSO-FLOORPLANE TWISTING 1DA76..1DA83 ; So # [14] SIGNWRITING LIMB COMBINATION..SIGNWRITING LOCATION DEPTH 1DA85..1DA86 ; So # [2] SIGNWRITING LOCATION TORSO..SIGNWRITING LOCATION LIMBS DIGITS 1E14F ; So # NYIAKENG PUACHUE HMONG CIRCLED CA 1ECAC ; So # INDIC SIYAQ PLACEHOLDER 1ED2E ; So # OTTOMAN SIYAQ MARRATAN 1F000..1F02B ; So # [44] MAHJONG TILE EAST WIND..MAHJONG TILE BACK 1F030..1F093 ; So # [100] DOMINO TILE HORIZONTAL BACK..DOMINO TILE VERTICAL-06-06 1F0A0..1F0AE ; So # [15] PLAYING CARD BACK..PLAYING CARD KING OF SPADES 1F0B1..1F0BF ; So # [15] PLAYING CARD ACE OF HEARTS..PLAYING CARD RED JOKER 1F0C1..1F0CF ; So # [15] PLAYING CARD ACE OF DIAMONDS..PLAYING CARD BLACK JOKER 1F0D1..1F0F5 ; So # [37] PLAYING CARD ACE OF CLUBS..PLAYING CARD TRUMP-21 1F10D..1F1AD ; So # [161] CIRCLED ZERO WITH SLASH..MASK WORK SYMBOL 1F1E6..1F202 ; So # [29] REGIONAL INDICATOR SYMBOL LETTER A..SQUARED KATAKANA SA 1F210..1F23B ; So # [44] SQUARED CJK UNIFIED IDEOGRAPH-624B..SQUARED CJK UNIFIED IDEOGRAPH-914D 1F240..1F248 ; So # [9] TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-672C..TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-6557 1F250..1F251 ; So # [2] CIRCLED IDEOGRAPH ADVANTAGE..CIRCLED IDEOGRAPH ACCEPT 1F260..1F265 ; So # [6] ROUNDED SYMBOL FOR FU..ROUNDED SYMBOL FOR CAI 1F300..1F3FA ; So # [251] CYCLONE..AMPHORA 1F400..1F6D8 ; So # [729] RAT..LANDSLIDE 1F6DC..1F6EC ; So # [17] WIRELESS..AIRPLANE ARRIVING 1F6F0..1F6FC ; So # [13] SATELLITE..ROLLER SKATE 1F700..1F7D9 ; So # [218] ALCHEMICAL SYMBOL FOR QUINTESSENCE..NINE POINTED WHITE STAR 1F7E0..1F7EB ; So # [12] LARGE ORANGE CIRCLE..LARGE BROWN SQUARE 1F7F0 ; So # HEAVY EQUALS SIGN 1F800..1F80B 
; So # [12] LEFTWARDS ARROW WITH SMALL TRIANGLE ARROWHEAD..DOWNWARDS ARROW WITH LARGE TRIANGLE ARROWHEAD 1F810..1F847 ; So # [56] LEFTWARDS ARROW WITH SMALL EQUILATERAL ARROWHEAD..DOWNWARDS HEAVY ARROW 1F850..1F859 ; So # [10] LEFTWARDS SANS-SERIF ARROW..UP DOWN SANS-SERIF ARROW 1F860..1F887 ; So # [40] WIDE-HEADED LEFTWARDS LIGHT BARB ARROW..WIDE-HEADED SOUTH WEST VERY HEAVY BARB ARROW 1F890..1F8AD ; So # [30] LEFTWARDS TRIANGLE ARROWHEAD..WHITE ARROW SHAFT WIDTH TWO THIRDS 1F8B0..1F8BB ; So # [12] ARROW POINTING UPWARDS THEN NORTH WEST..SOUTH WEST ARROW FROM BAR 1F8C0..1F8C1 ; So # [2] LEFTWARDS ARROW FROM DOWNWARDS ARROW..RIGHTWARDS ARROW FROM DOWNWARDS ARROW 1F900..1FA57 ; So # [344] CIRCLED CROSS FORMEE WITH FOUR DOTS..BLACK CHESS ALFIL 1FA60..1FA6D ; So # [14] XIANGQI RED GENERAL..XIANGQI BLACK SOLDIER 1FA70..1FA7C ; So # [13] BALLET SHOES..CRUTCH 1FA80..1FA8A ; So # [11] YO-YO..TROMBONE 1FA8E..1FAC6 ; So # [57] TREASURE CHEST..FINGERPRINT 1FAC8 ; So # HAIRY CREATURE 1FACD..1FADC ; So # [16] ORCA..ROOT VEGETABLE 1FADF..1FAEA ; So # [12] SPLATTER..DISTORTED FACE 1FAEF..1FAF8 ; So # [10] FIGHT CLOUD..RIGHTWARDS PUSHING HAND 1FB00..1FB92 ; So # [147] BLOCK SEXTANT-1..UPPER HALF INVERSE MEDIUM SHADE AND LOWER HALF BLOCK 1FB94..1FBEF ; So # [92] LEFT HALF INVERSE MEDIUM SHADE AND RIGHT HALF BLOCK..TOP LEFT JUSTIFIED LOWER RIGHT QUARTER BLACK CIRCLE 1FBFA ; So # ALARM BELL SYMBOL # Total code points: 7468 # ================================================ # General_Category=Initial_Punctuation 00AB ; Pi # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK 2018 ; Pi # LEFT SINGLE QUOTATION MARK 201B..201C ; Pi # [2] SINGLE HIGH-REVERSED-9 QUOTATION MARK..LEFT DOUBLE QUOTATION MARK 201F ; Pi # DOUBLE HIGH-REVERSED-9 QUOTATION MARK 2039 ; Pi # SINGLE LEFT-POINTING ANGLE QUOTATION MARK 2E02 ; Pi # LEFT SUBSTITUTION BRACKET 2E04 ; Pi # LEFT DOTTED SUBSTITUTION BRACKET 2E09 ; Pi # LEFT TRANSPOSITION BRACKET 2E0C ; Pi # LEFT RAISED OMISSION BRACKET 2E1C ; Pi # LEFT LOW PARAPHRASE 
BRACKET 2E20 ; Pi # LEFT VERTICAL BAR WITH QUILL # Total code points: 12 # ================================================ # General_Category=Final_Punctuation 00BB ; Pf # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK 2019 ; Pf # RIGHT SINGLE QUOTATION MARK 201D ; Pf # RIGHT DOUBLE QUOTATION MARK 203A ; Pf # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK 2E03 ; Pf # RIGHT SUBSTITUTION BRACKET 2E05 ; Pf # RIGHT DOTTED SUBSTITUTION BRACKET 2E0A ; Pf # RIGHT TRANSPOSITION BRACKET 2E0D ; Pf # RIGHT RAISED OMISSION BRACKET 2E1D ; Pf # RIGHT LOW PARAPHRASE BRACKET 2E21 ; Pf # RIGHT VERTICAL BAR WITH QUILL # Total code points: 10 # EOF ================================================ FILE: maint/Unicode.tables/GraphemeBreakProperty.txt ================================================ # GraphemeBreakProperty-17.0.0.txt # Date: 2025-06-30, 06:20:23 GMT # © 2025 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use and license, see https://www.unicode.org/terms_of_use.html # # Unicode Character Database # For documentation, see https://www.unicode.org/reports/tr44/ # ================================================ # Property: Grapheme_Cluster_Break # All code points not explicitly listed for Grapheme_Cluster_Break # have the value Other (XX). 
# @missing: 0000..10FFFF; Other # ================================================ 0600..0605 ; Prepend # Cf [6] ARABIC NUMBER SIGN..ARABIC NUMBER MARK ABOVE 06DD ; Prepend # Cf ARABIC END OF AYAH 070F ; Prepend # Cf SYRIAC ABBREVIATION MARK 0890..0891 ; Prepend # Cf [2] ARABIC POUND MARK ABOVE..ARABIC PIASTRE MARK ABOVE 08E2 ; Prepend # Cf ARABIC DISPUTED END OF AYAH 0D4E ; Prepend # Lo MALAYALAM LETTER DOT REPH 110BD ; Prepend # Cf KAITHI NUMBER SIGN 110CD ; Prepend # Cf KAITHI NUMBER SIGN ABOVE 111C2..111C3 ; Prepend # Lo [2] SHARADA SIGN JIHVAMULIYA..SHARADA SIGN UPADHMANIYA 113D1 ; Prepend # Lo TULU-TIGALARI REPHA 1193F ; Prepend # Lo DIVES AKURU PREFIXED NASAL SIGN 11941 ; Prepend # Lo DIVES AKURU INITIAL RA 11A84..11A89 ; Prepend # Lo [6] SOYOMBO SIGN JIHVAMULIYA..SOYOMBO CLUSTER-INITIAL LETTER SA 11D46 ; Prepend # Lo MASARAM GONDI REPHA 11F02 ; Prepend # Lo KAWI SIGN REPHA # Total code points: 27 # ================================================ 000D ; CR # Cc # Total code points: 1 # ================================================ 000A ; LF # Cc # Total code points: 1 # ================================================ 0000..0009 ; Control # Cc [10] .. 000B..000C ; Control # Cc [2] .. 000E..001F ; Control # Cc [18] .. 007F..009F ; Control # Cc [33] .. 00AD ; Control # Cf SOFT HYPHEN 061C ; Control # Cf ARABIC LETTER MARK 180E ; Control # Cf MONGOLIAN VOWEL SEPARATOR 200B ; Control # Cf ZERO WIDTH SPACE 200E..200F ; Control # Cf [2] LEFT-TO-RIGHT MARK..RIGHT-TO-LEFT MARK 2028 ; Control # Zl LINE SEPARATOR 2029 ; Control # Zp PARAGRAPH SEPARATOR 202A..202E ; Control # Cf [5] LEFT-TO-RIGHT EMBEDDING..RIGHT-TO-LEFT OVERRIDE 2060..2064 ; Control # Cf [5] WORD JOINER..INVISIBLE PLUS 2065 ; Control # Cn 2066..206F ; Control # Cf [10] LEFT-TO-RIGHT ISOLATE..NOMINAL DIGIT SHAPES FEFF ; Control # Cf ZERO WIDTH NO-BREAK SPACE FFF0..FFF8 ; Control # Cn [9] .. 
FFF9..FFFB ; Control # Cf [3] INTERLINEAR ANNOTATION ANCHOR..INTERLINEAR ANNOTATION TERMINATOR 13430..1343F ; Control # Cf [16] EGYPTIAN HIEROGLYPH VERTICAL JOINER..EGYPTIAN HIEROGLYPH END WALLED ENCLOSURE 1BCA0..1BCA3 ; Control # Cf [4] SHORTHAND FORMAT LETTER OVERLAP..SHORTHAND FORMAT UP STEP 1D173..1D17A ; Control # Cf [8] MUSICAL SYMBOL BEGIN BEAM..MUSICAL SYMBOL END PHRASE E0000 ; Control # Cn E0001 ; Control # Cf LANGUAGE TAG E0002..E001F ; Control # Cn [30] .. E0080..E00FF ; Control # Cn [128] .. E01F0..E0FFF ; Control # Cn [3600] .. # Total code points: 3893 # ================================================ 0300..036F ; Extend # Mn [112] COMBINING GRAVE ACCENT..COMBINING LATIN SMALL LETTER X 0483..0487 ; Extend # Mn [5] COMBINING CYRILLIC TITLO..COMBINING CYRILLIC POKRYTIE 0488..0489 ; Extend # Me [2] COMBINING CYRILLIC HUNDRED THOUSANDS SIGN..COMBINING CYRILLIC MILLIONS SIGN 0591..05BD ; Extend # Mn [45] HEBREW ACCENT ETNAHTA..HEBREW POINT METEG 05BF ; Extend # Mn HEBREW POINT RAFE 05C1..05C2 ; Extend # Mn [2] HEBREW POINT SHIN DOT..HEBREW POINT SIN DOT 05C4..05C5 ; Extend # Mn [2] HEBREW MARK UPPER DOT..HEBREW MARK LOWER DOT 05C7 ; Extend # Mn HEBREW POINT QAMATS QATAN 0610..061A ; Extend # Mn [11] ARABIC SIGN SALLALLAHOU ALAYHE WASSALLAM..ARABIC SMALL KASRA 064B..065F ; Extend # Mn [21] ARABIC FATHATAN..ARABIC WAVY HAMZA BELOW 0670 ; Extend # Mn ARABIC LETTER SUPERSCRIPT ALEF 06D6..06DC ; Extend # Mn [7] ARABIC SMALL HIGH LIGATURE SAD WITH LAM WITH ALEF MAKSURA..ARABIC SMALL HIGH SEEN 06DF..06E4 ; Extend # Mn [6] ARABIC SMALL HIGH ROUNDED ZERO..ARABIC SMALL HIGH MADDA 06E7..06E8 ; Extend # Mn [2] ARABIC SMALL HIGH YEH..ARABIC SMALL HIGH NOON 06EA..06ED ; Extend # Mn [4] ARABIC EMPTY CENTRE LOW STOP..ARABIC SMALL LOW MEEM 0711 ; Extend # Mn SYRIAC LETTER SUPERSCRIPT ALAPH 0730..074A ; Extend # Mn [27] SYRIAC PTHAHA ABOVE..SYRIAC BARREKH 07A6..07B0 ; Extend # Mn [11] THAANA ABAFILI..THAANA SUKUN 07EB..07F3 ; Extend # Mn [9] NKO COMBINING SHORT HIGH 
TONE..NKO COMBINING DOUBLE DOT ABOVE 07FD ; Extend # Mn NKO DANTAYALAN 0816..0819 ; Extend # Mn [4] SAMARITAN MARK IN..SAMARITAN MARK DAGESH 081B..0823 ; Extend # Mn [9] SAMARITAN MARK EPENTHETIC YUT..SAMARITAN VOWEL SIGN A 0825..0827 ; Extend # Mn [3] SAMARITAN VOWEL SIGN SHORT A..SAMARITAN VOWEL SIGN U 0829..082D ; Extend # Mn [5] SAMARITAN VOWEL SIGN LONG I..SAMARITAN MARK NEQUDAA 0859..085B ; Extend # Mn [3] MANDAIC AFFRICATION MARK..MANDAIC GEMINATION MARK 0897..089F ; Extend # Mn [9] ARABIC PEPET..ARABIC HALF MADDA OVER MADDA 08CA..08E1 ; Extend # Mn [24] ARABIC SMALL HIGH FARSI YEH..ARABIC SMALL HIGH SIGN SAFHA 08E3..0902 ; Extend # Mn [32] ARABIC TURNED DAMMA BELOW..DEVANAGARI SIGN ANUSVARA 093A ; Extend # Mn DEVANAGARI VOWEL SIGN OE 093C ; Extend # Mn DEVANAGARI SIGN NUKTA 0941..0948 ; Extend # Mn [8] DEVANAGARI VOWEL SIGN U..DEVANAGARI VOWEL SIGN AI 094D ; Extend # Mn DEVANAGARI SIGN VIRAMA 0951..0957 ; Extend # Mn [7] DEVANAGARI STRESS SIGN UDATTA..DEVANAGARI VOWEL SIGN UUE 0962..0963 ; Extend # Mn [2] DEVANAGARI VOWEL SIGN VOCALIC L..DEVANAGARI VOWEL SIGN VOCALIC LL 0981 ; Extend # Mn BENGALI SIGN CANDRABINDU 09BC ; Extend # Mn BENGALI SIGN NUKTA 09BE ; Extend # Mc BENGALI VOWEL SIGN AA 09C1..09C4 ; Extend # Mn [4] BENGALI VOWEL SIGN U..BENGALI VOWEL SIGN VOCALIC RR 09CD ; Extend # Mn BENGALI SIGN VIRAMA 09D7 ; Extend # Mc BENGALI AU LENGTH MARK 09E2..09E3 ; Extend # Mn [2] BENGALI VOWEL SIGN VOCALIC L..BENGALI VOWEL SIGN VOCALIC LL 09FE ; Extend # Mn BENGALI SANDHI MARK 0A01..0A02 ; Extend # Mn [2] GURMUKHI SIGN ADAK BINDI..GURMUKHI SIGN BINDI 0A3C ; Extend # Mn GURMUKHI SIGN NUKTA 0A41..0A42 ; Extend # Mn [2] GURMUKHI VOWEL SIGN U..GURMUKHI VOWEL SIGN UU 0A47..0A48 ; Extend # Mn [2] GURMUKHI VOWEL SIGN EE..GURMUKHI VOWEL SIGN AI 0A4B..0A4D ; Extend # Mn [3] GURMUKHI VOWEL SIGN OO..GURMUKHI SIGN VIRAMA 0A51 ; Extend # Mn GURMUKHI SIGN UDAAT 0A70..0A71 ; Extend # Mn [2] GURMUKHI TIPPI..GURMUKHI ADDAK 0A75 ; Extend # Mn GURMUKHI SIGN YAKASH 0A81..0A82 ; 
Extend # Mn [2] GUJARATI SIGN CANDRABINDU..GUJARATI SIGN ANUSVARA 0ABC ; Extend # Mn GUJARATI SIGN NUKTA 0AC1..0AC5 ; Extend # Mn [5] GUJARATI VOWEL SIGN U..GUJARATI VOWEL SIGN CANDRA E 0AC7..0AC8 ; Extend # Mn [2] GUJARATI VOWEL SIGN E..GUJARATI VOWEL SIGN AI 0ACD ; Extend # Mn GUJARATI SIGN VIRAMA 0AE2..0AE3 ; Extend # Mn [2] GUJARATI VOWEL SIGN VOCALIC L..GUJARATI VOWEL SIGN VOCALIC LL 0AFA..0AFF ; Extend # Mn [6] GUJARATI SIGN SUKUN..GUJARATI SIGN TWO-CIRCLE NUKTA ABOVE 0B01 ; Extend # Mn ORIYA SIGN CANDRABINDU 0B3C ; Extend # Mn ORIYA SIGN NUKTA 0B3E ; Extend # Mc ORIYA VOWEL SIGN AA 0B3F ; Extend # Mn ORIYA VOWEL SIGN I 0B41..0B44 ; Extend # Mn [4] ORIYA VOWEL SIGN U..ORIYA VOWEL SIGN VOCALIC RR 0B4D ; Extend # Mn ORIYA SIGN VIRAMA 0B55..0B56 ; Extend # Mn [2] ORIYA SIGN OVERLINE..ORIYA AI LENGTH MARK 0B57 ; Extend # Mc ORIYA AU LENGTH MARK 0B62..0B63 ; Extend # Mn [2] ORIYA VOWEL SIGN VOCALIC L..ORIYA VOWEL SIGN VOCALIC LL 0B82 ; Extend # Mn TAMIL SIGN ANUSVARA 0BBE ; Extend # Mc TAMIL VOWEL SIGN AA 0BC0 ; Extend # Mn TAMIL VOWEL SIGN II 0BCD ; Extend # Mn TAMIL SIGN VIRAMA 0BD7 ; Extend # Mc TAMIL AU LENGTH MARK 0C00 ; Extend # Mn TELUGU SIGN COMBINING CANDRABINDU ABOVE 0C04 ; Extend # Mn TELUGU SIGN COMBINING ANUSVARA ABOVE 0C3C ; Extend # Mn TELUGU SIGN NUKTA 0C3E..0C40 ; Extend # Mn [3] TELUGU VOWEL SIGN AA..TELUGU VOWEL SIGN II 0C46..0C48 ; Extend # Mn [3] TELUGU VOWEL SIGN E..TELUGU VOWEL SIGN AI 0C4A..0C4D ; Extend # Mn [4] TELUGU VOWEL SIGN O..TELUGU SIGN VIRAMA 0C55..0C56 ; Extend # Mn [2] TELUGU LENGTH MARK..TELUGU AI LENGTH MARK 0C62..0C63 ; Extend # Mn [2] TELUGU VOWEL SIGN VOCALIC L..TELUGU VOWEL SIGN VOCALIC LL 0C81 ; Extend # Mn KANNADA SIGN CANDRABINDU 0CBC ; Extend # Mn KANNADA SIGN NUKTA 0CBF ; Extend # Mn KANNADA VOWEL SIGN I 0CC0 ; Extend # Mc KANNADA VOWEL SIGN II 0CC2 ; Extend # Mc KANNADA VOWEL SIGN UU 0CC6 ; Extend # Mn KANNADA VOWEL SIGN E 0CC7..0CC8 ; Extend # Mc [2] KANNADA VOWEL SIGN EE..KANNADA VOWEL SIGN AI 0CCA..0CCB ; Extend # 
Mc [2] KANNADA VOWEL SIGN O..KANNADA VOWEL SIGN OO 0CCC..0CCD ; Extend # Mn [2] KANNADA VOWEL SIGN AU..KANNADA SIGN VIRAMA 0CD5..0CD6 ; Extend # Mc [2] KANNADA LENGTH MARK..KANNADA AI LENGTH MARK 0CE2..0CE3 ; Extend # Mn [2] KANNADA VOWEL SIGN VOCALIC L..KANNADA VOWEL SIGN VOCALIC LL 0D00..0D01 ; Extend # Mn [2] MALAYALAM SIGN COMBINING ANUSVARA ABOVE..MALAYALAM SIGN CANDRABINDU 0D3B..0D3C ; Extend # Mn [2] MALAYALAM SIGN VERTICAL BAR VIRAMA..MALAYALAM SIGN CIRCULAR VIRAMA 0D3E ; Extend # Mc MALAYALAM VOWEL SIGN AA 0D41..0D44 ; Extend # Mn [4] MALAYALAM VOWEL SIGN U..MALAYALAM VOWEL SIGN VOCALIC RR 0D4D ; Extend # Mn MALAYALAM SIGN VIRAMA 0D57 ; Extend # Mc MALAYALAM AU LENGTH MARK 0D62..0D63 ; Extend # Mn [2] MALAYALAM VOWEL SIGN VOCALIC L..MALAYALAM VOWEL SIGN VOCALIC LL 0D81 ; Extend # Mn SINHALA SIGN CANDRABINDU 0DCA ; Extend # Mn SINHALA SIGN AL-LAKUNA 0DCF ; Extend # Mc SINHALA VOWEL SIGN AELA-PILLA 0DD2..0DD4 ; Extend # Mn [3] SINHALA VOWEL SIGN KETTI IS-PILLA..SINHALA VOWEL SIGN KETTI PAA-PILLA 0DD6 ; Extend # Mn SINHALA VOWEL SIGN DIGA PAA-PILLA 0DDF ; Extend # Mc SINHALA VOWEL SIGN GAYANUKITTA 0E31 ; Extend # Mn THAI CHARACTER MAI HAN-AKAT 0E34..0E3A ; Extend # Mn [7] THAI CHARACTER SARA I..THAI CHARACTER PHINTHU 0E47..0E4E ; Extend # Mn [8] THAI CHARACTER MAITAIKHU..THAI CHARACTER YAMAKKAN 0EB1 ; Extend # Mn LAO VOWEL SIGN MAI KAN 0EB4..0EBC ; Extend # Mn [9] LAO VOWEL SIGN I..LAO SEMIVOWEL SIGN LO 0EC8..0ECE ; Extend # Mn [7] LAO TONE MAI EK..LAO YAMAKKAN 0F18..0F19 ; Extend # Mn [2] TIBETAN ASTROLOGICAL SIGN -KHYUD PA..TIBETAN ASTROLOGICAL SIGN SDONG TSHUGS 0F35 ; Extend # Mn TIBETAN MARK NGAS BZUNG NYI ZLA 0F37 ; Extend # Mn TIBETAN MARK NGAS BZUNG SGOR RTAGS 0F39 ; Extend # Mn TIBETAN MARK TSA -PHRU 0F71..0F7E ; Extend # Mn [14] TIBETAN VOWEL SIGN AA..TIBETAN SIGN RJES SU NGA RO 0F80..0F84 ; Extend # Mn [5] TIBETAN VOWEL SIGN REVERSED I..TIBETAN MARK HALANTA 0F86..0F87 ; Extend # Mn [2] TIBETAN SIGN LCI RTAGS..TIBETAN SIGN YANG RTAGS 0F8D..0F97 ; 
Extend # Mn [11] TIBETAN SUBJOINED SIGN LCE TSA CAN..TIBETAN SUBJOINED LETTER JA 0F99..0FBC ; Extend # Mn [36] TIBETAN SUBJOINED LETTER NYA..TIBETAN SUBJOINED LETTER FIXED-FORM RA 0FC6 ; Extend # Mn TIBETAN SYMBOL PADMA GDAN 102D..1030 ; Extend # Mn [4] MYANMAR VOWEL SIGN I..MYANMAR VOWEL SIGN UU 1032..1037 ; Extend # Mn [6] MYANMAR VOWEL SIGN AI..MYANMAR SIGN DOT BELOW 1039..103A ; Extend # Mn [2] MYANMAR SIGN VIRAMA..MYANMAR SIGN ASAT 103D..103E ; Extend # Mn [2] MYANMAR CONSONANT SIGN MEDIAL WA..MYANMAR CONSONANT SIGN MEDIAL HA 1058..1059 ; Extend # Mn [2] MYANMAR VOWEL SIGN VOCALIC L..MYANMAR VOWEL SIGN VOCALIC LL 105E..1060 ; Extend # Mn [3] MYANMAR CONSONANT SIGN MON MEDIAL NA..MYANMAR CONSONANT SIGN MON MEDIAL LA 1071..1074 ; Extend # Mn [4] MYANMAR VOWEL SIGN GEBA KAREN I..MYANMAR VOWEL SIGN KAYAH EE 1082 ; Extend # Mn MYANMAR CONSONANT SIGN SHAN MEDIAL WA 1085..1086 ; Extend # Mn [2] MYANMAR VOWEL SIGN SHAN E ABOVE..MYANMAR VOWEL SIGN SHAN FINAL Y 108D ; Extend # Mn MYANMAR SIGN SHAN COUNCIL EMPHATIC TONE 109D ; Extend # Mn MYANMAR VOWEL SIGN AITON AI 135D..135F ; Extend # Mn [3] ETHIOPIC COMBINING GEMINATION AND VOWEL LENGTH MARK..ETHIOPIC COMBINING GEMINATION MARK 1712..1714 ; Extend # Mn [3] TAGALOG VOWEL SIGN I..TAGALOG SIGN VIRAMA 1715 ; Extend # Mc TAGALOG SIGN PAMUDPOD 1732..1733 ; Extend # Mn [2] HANUNOO VOWEL SIGN I..HANUNOO VOWEL SIGN U 1734 ; Extend # Mc HANUNOO SIGN PAMUDPOD 1752..1753 ; Extend # Mn [2] BUHID VOWEL SIGN I..BUHID VOWEL SIGN U 1772..1773 ; Extend # Mn [2] TAGBANWA VOWEL SIGN I..TAGBANWA VOWEL SIGN U 17B4..17B5 ; Extend # Mn [2] KHMER VOWEL INHERENT AQ..KHMER VOWEL INHERENT AA 17B7..17BD ; Extend # Mn [7] KHMER VOWEL SIGN I..KHMER VOWEL SIGN UA 17C6 ; Extend # Mn KHMER SIGN NIKAHIT 17C9..17D3 ; Extend # Mn [11] KHMER SIGN MUUSIKATOAN..KHMER SIGN BATHAMASAT 17DD ; Extend # Mn KHMER SIGN ATTHACAN 180B..180D ; Extend # Mn [3] MONGOLIAN FREE VARIATION SELECTOR ONE..MONGOLIAN FREE VARIATION SELECTOR THREE 180F ; Extend # Mn MONGOLIAN 
FREE VARIATION SELECTOR FOUR 1885..1886 ; Extend # Mn [2] MONGOLIAN LETTER ALI GALI BALUDA..MONGOLIAN LETTER ALI GALI THREE BALUDA 18A9 ; Extend # Mn MONGOLIAN LETTER ALI GALI DAGALGA 1920..1922 ; Extend # Mn [3] LIMBU VOWEL SIGN A..LIMBU VOWEL SIGN U 1927..1928 ; Extend # Mn [2] LIMBU VOWEL SIGN E..LIMBU VOWEL SIGN O 1932 ; Extend # Mn LIMBU SMALL LETTER ANUSVARA 1939..193B ; Extend # Mn [3] LIMBU SIGN MUKPHRENG..LIMBU SIGN SA-I 1A17..1A18 ; Extend # Mn [2] BUGINESE VOWEL SIGN I..BUGINESE VOWEL SIGN U 1A1B ; Extend # Mn BUGINESE VOWEL SIGN AE 1A56 ; Extend # Mn TAI THAM CONSONANT SIGN MEDIAL LA 1A58..1A5E ; Extend # Mn [7] TAI THAM SIGN MAI KANG LAI..TAI THAM CONSONANT SIGN SA 1A60 ; Extend # Mn TAI THAM SIGN SAKOT 1A62 ; Extend # Mn TAI THAM VOWEL SIGN MAI SAT 1A65..1A6C ; Extend # Mn [8] TAI THAM VOWEL SIGN I..TAI THAM VOWEL SIGN OA BELOW 1A73..1A7C ; Extend # Mn [10] TAI THAM VOWEL SIGN OA ABOVE..TAI THAM SIGN KHUEN-LUE KARAN 1A7F ; Extend # Mn TAI THAM COMBINING CRYPTOGRAMMIC DOT 1AB0..1ABD ; Extend # Mn [14] COMBINING DOUBLED CIRCUMFLEX ACCENT..COMBINING PARENTHESES BELOW 1ABE ; Extend # Me COMBINING PARENTHESES OVERLAY 1ABF..1ADD ; Extend # Mn [31] COMBINING LATIN SMALL LETTER W BELOW..COMBINING DOT-AND-RING BELOW 1AE0..1AEB ; Extend # Mn [12] COMBINING LEFT TACK ABOVE..COMBINING DOUBLE RIGHTWARDS ARROW ABOVE 1B00..1B03 ; Extend # Mn [4] BALINESE SIGN ULU RICEM..BALINESE SIGN SURANG 1B34 ; Extend # Mn BALINESE SIGN REREKAN 1B35 ; Extend # Mc BALINESE VOWEL SIGN TEDUNG 1B36..1B3A ; Extend # Mn [5] BALINESE VOWEL SIGN ULU..BALINESE VOWEL SIGN RA REPA 1B3B ; Extend # Mc BALINESE VOWEL SIGN RA REPA TEDUNG 1B3C ; Extend # Mn BALINESE VOWEL SIGN LA LENGA 1B3D ; Extend # Mc BALINESE VOWEL SIGN LA LENGA TEDUNG 1B42 ; Extend # Mn BALINESE VOWEL SIGN PEPET 1B43..1B44 ; Extend # Mc [2] BALINESE VOWEL SIGN PEPET TEDUNG..BALINESE ADEG ADEG 1B6B..1B73 ; Extend # Mn [9] BALINESE MUSICAL SYMBOL COMBINING TEGEH..BALINESE MUSICAL SYMBOL COMBINING GONG 1B80..1B81 ; Extend # Mn 
[2] SUNDANESE SIGN PANYECEK..SUNDANESE SIGN PANGLAYAR 1BA2..1BA5 ; Extend # Mn [4] SUNDANESE CONSONANT SIGN PANYAKRA..SUNDANESE VOWEL SIGN PANYUKU 1BA8..1BA9 ; Extend # Mn [2] SUNDANESE VOWEL SIGN PAMEPET..SUNDANESE VOWEL SIGN PANEULEUNG 1BAA ; Extend # Mc SUNDANESE SIGN PAMAAEH 1BAB..1BAD ; Extend # Mn [3] SUNDANESE SIGN VIRAMA..SUNDANESE CONSONANT SIGN PASANGAN WA 1BE6 ; Extend # Mn BATAK SIGN TOMPI 1BE8..1BE9 ; Extend # Mn [2] BATAK VOWEL SIGN PAKPAK E..BATAK VOWEL SIGN EE 1BED ; Extend # Mn BATAK VOWEL SIGN KARO O 1BEF..1BF1 ; Extend # Mn [3] BATAK VOWEL SIGN U FOR SIMALUNGUN SA..BATAK CONSONANT SIGN H 1BF2..1BF3 ; Extend # Mc [2] BATAK PANGOLAT..BATAK PANONGONAN 1C2C..1C33 ; Extend # Mn [8] LEPCHA VOWEL SIGN E..LEPCHA CONSONANT SIGN T 1C36..1C37 ; Extend # Mn [2] LEPCHA SIGN RAN..LEPCHA SIGN NUKTA 1CD0..1CD2 ; Extend # Mn [3] VEDIC TONE KARSHANA..VEDIC TONE PRENKHA 1CD4..1CE0 ; Extend # Mn [13] VEDIC SIGN YAJURVEDIC MIDLINE SVARITA..VEDIC TONE RIGVEDIC KASHMIRI INDEPENDENT SVARITA 1CE2..1CE8 ; Extend # Mn [7] VEDIC SIGN VISARGA SVARITA..VEDIC SIGN VISARGA ANUDATTA WITH TAIL 1CED ; Extend # Mn VEDIC SIGN TIRYAK 1CF4 ; Extend # Mn VEDIC TONE CANDRA ABOVE 1CF8..1CF9 ; Extend # Mn [2] VEDIC TONE RING ABOVE..VEDIC TONE DOUBLE RING ABOVE 1DC0..1DFF ; Extend # Mn [64] COMBINING DOTTED GRAVE ACCENT..COMBINING RIGHT ARROWHEAD AND DOWN ARROWHEAD BELOW 200C ; Extend # Cf ZERO WIDTH NON-JOINER 20D0..20DC ; Extend # Mn [13] COMBINING LEFT HARPOON ABOVE..COMBINING FOUR DOTS ABOVE 20DD..20E0 ; Extend # Me [4] COMBINING ENCLOSING CIRCLE..COMBINING ENCLOSING CIRCLE BACKSLASH 20E1 ; Extend # Mn COMBINING LEFT RIGHT ARROW ABOVE 20E2..20E4 ; Extend # Me [3] COMBINING ENCLOSING SCREEN..COMBINING ENCLOSING UPWARD POINTING TRIANGLE 20E5..20F0 ; Extend # Mn [12] COMBINING REVERSE SOLIDUS OVERLAY..COMBINING ASTERISK ABOVE 2CEF..2CF1 ; Extend # Mn [3] COPTIC COMBINING NI ABOVE..COPTIC COMBINING SPIRITUS LENIS 2D7F ; Extend # Mn TIFINAGH CONSONANT JOINER 2DE0..2DFF ; Extend # Mn [32] 
COMBINING CYRILLIC LETTER BE..COMBINING CYRILLIC LETTER IOTIFIED BIG YUS 302A..302D ; Extend # Mn [4] IDEOGRAPHIC LEVEL TONE MARK..IDEOGRAPHIC ENTERING TONE MARK 302E..302F ; Extend # Mc [2] HANGUL SINGLE DOT TONE MARK..HANGUL DOUBLE DOT TONE MARK 3099..309A ; Extend # Mn [2] COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK..COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK A66F ; Extend # Mn COMBINING CYRILLIC VZMET A670..A672 ; Extend # Me [3] COMBINING CYRILLIC TEN MILLIONS SIGN..COMBINING CYRILLIC THOUSAND MILLIONS SIGN A674..A67D ; Extend # Mn [10] COMBINING CYRILLIC LETTER UKRAINIAN IE..COMBINING CYRILLIC PAYEROK A69E..A69F ; Extend # Mn [2] COMBINING CYRILLIC LETTER EF..COMBINING CYRILLIC LETTER IOTIFIED E A6F0..A6F1 ; Extend # Mn [2] BAMUM COMBINING MARK KOQNDON..BAMUM COMBINING MARK TUKWENTIS A802 ; Extend # Mn SYLOTI NAGRI SIGN DVISVARA A806 ; Extend # Mn SYLOTI NAGRI SIGN HASANTA A80B ; Extend # Mn SYLOTI NAGRI SIGN ANUSVARA A825..A826 ; Extend # Mn [2] SYLOTI NAGRI VOWEL SIGN U..SYLOTI NAGRI VOWEL SIGN E A82C ; Extend # Mn SYLOTI NAGRI SIGN ALTERNATE HASANTA A8C4..A8C5 ; Extend # Mn [2] SAURASHTRA SIGN VIRAMA..SAURASHTRA SIGN CANDRABINDU A8E0..A8F1 ; Extend # Mn [18] COMBINING DEVANAGARI DIGIT ZERO..COMBINING DEVANAGARI SIGN AVAGRAHA A8FF ; Extend # Mn DEVANAGARI VOWEL SIGN AY A926..A92D ; Extend # Mn [8] KAYAH LI VOWEL UE..KAYAH LI TONE CALYA PLOPHU A947..A951 ; Extend # Mn [11] REJANG VOWEL SIGN I..REJANG CONSONANT SIGN R A953 ; Extend # Mc REJANG VIRAMA A980..A982 ; Extend # Mn [3] JAVANESE SIGN PANYANGGA..JAVANESE SIGN LAYAR A9B3 ; Extend # Mn JAVANESE SIGN CECAK TELU A9B6..A9B9 ; Extend # Mn [4] JAVANESE VOWEL SIGN WULU..JAVANESE VOWEL SIGN SUKU MENDUT A9BC..A9BD ; Extend # Mn [2] JAVANESE VOWEL SIGN PEPET..JAVANESE CONSONANT SIGN KERET A9C0 ; Extend # Mc JAVANESE PANGKON A9E5 ; Extend # Mn MYANMAR SIGN SHAN SAW AA29..AA2E ; Extend # Mn [6] CHAM VOWEL SIGN AA..CHAM VOWEL SIGN OE AA31..AA32 ; Extend # Mn [2] CHAM VOWEL SIGN AU..CHAM VOWEL SIGN UE 
AA35..AA36 ; Extend # Mn [2] CHAM CONSONANT SIGN LA..CHAM CONSONANT SIGN WA AA43 ; Extend # Mn CHAM CONSONANT SIGN FINAL NG AA4C ; Extend # Mn CHAM CONSONANT SIGN FINAL M AA7C ; Extend # Mn MYANMAR SIGN TAI LAING TONE-2 AAB0 ; Extend # Mn TAI VIET MAI KANG AAB2..AAB4 ; Extend # Mn [3] TAI VIET VOWEL I..TAI VIET VOWEL U AAB7..AAB8 ; Extend # Mn [2] TAI VIET MAI KHIT..TAI VIET VOWEL IA AABE..AABF ; Extend # Mn [2] TAI VIET VOWEL AM..TAI VIET TONE MAI EK AAC1 ; Extend # Mn TAI VIET TONE MAI THO AAEC..AAED ; Extend # Mn [2] MEETEI MAYEK VOWEL SIGN UU..MEETEI MAYEK VOWEL SIGN AAI AAF6 ; Extend # Mn MEETEI MAYEK VIRAMA ABE5 ; Extend # Mn MEETEI MAYEK VOWEL SIGN ANAP ABE8 ; Extend # Mn MEETEI MAYEK VOWEL SIGN UNAP ABED ; Extend # Mn MEETEI MAYEK APUN IYEK FB1E ; Extend # Mn HEBREW POINT JUDEO-SPANISH VARIKA FE00..FE0F ; Extend # Mn [16] VARIATION SELECTOR-1..VARIATION SELECTOR-16 FE20..FE2F ; Extend # Mn [16] COMBINING LIGATURE LEFT HALF..COMBINING CYRILLIC TITLO RIGHT HALF FF9E..FF9F ; Extend # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK 101FD ; Extend # Mn PHAISTOS DISC SIGN COMBINING OBLIQUE STROKE 102E0 ; Extend # Mn COPTIC EPACT THOUSANDS MARK 10376..1037A ; Extend # Mn [5] COMBINING OLD PERMIC LETTER AN..COMBINING OLD PERMIC LETTER SII 10A01..10A03 ; Extend # Mn [3] KHAROSHTHI VOWEL SIGN I..KHAROSHTHI VOWEL SIGN VOCALIC R 10A05..10A06 ; Extend # Mn [2] KHAROSHTHI VOWEL SIGN E..KHAROSHTHI VOWEL SIGN O 10A0C..10A0F ; Extend # Mn [4] KHAROSHTHI VOWEL LENGTH MARK..KHAROSHTHI SIGN VISARGA 10A38..10A3A ; Extend # Mn [3] KHAROSHTHI SIGN BAR ABOVE..KHAROSHTHI SIGN DOT BELOW 10A3F ; Extend # Mn KHAROSHTHI VIRAMA 10AE5..10AE6 ; Extend # Mn [2] MANICHAEAN ABBREVIATION MARK ABOVE..MANICHAEAN ABBREVIATION MARK BELOW 10D24..10D27 ; Extend # Mn [4] HANIFI ROHINGYA SIGN HARBAHAY..HANIFI ROHINGYA SIGN TASSI 10D69..10D6D ; Extend # Mn [5] GARAY VOWEL SIGN E..GARAY CONSONANT NASALIZATION MARK 10EAB..10EAC ; Extend # Mn [2] YEZIDI COMBINING 
HAMZA MARK..YEZIDI COMBINING MADDA MARK 10EFA..10EFF ; Extend # Mn [6] ARABIC DOUBLE VERTICAL BAR BELOW..ARABIC SMALL LOW WORD MADDA 10F46..10F50 ; Extend # Mn [11] SOGDIAN COMBINING DOT BELOW..SOGDIAN COMBINING STROKE BELOW 10F82..10F85 ; Extend # Mn [4] OLD UYGHUR COMBINING DOT ABOVE..OLD UYGHUR COMBINING TWO DOTS BELOW 11001 ; Extend # Mn BRAHMI SIGN ANUSVARA 11038..11046 ; Extend # Mn [15] BRAHMI VOWEL SIGN AA..BRAHMI VIRAMA 11070 ; Extend # Mn BRAHMI SIGN OLD TAMIL VIRAMA 11073..11074 ; Extend # Mn [2] BRAHMI VOWEL SIGN OLD TAMIL SHORT E..BRAHMI VOWEL SIGN OLD TAMIL SHORT O 1107F..11081 ; Extend # Mn [3] BRAHMI NUMBER JOINER..KAITHI SIGN ANUSVARA 110B3..110B6 ; Extend # Mn [4] KAITHI VOWEL SIGN U..KAITHI VOWEL SIGN AI 110B9..110BA ; Extend # Mn [2] KAITHI SIGN VIRAMA..KAITHI SIGN NUKTA 110C2 ; Extend # Mn KAITHI VOWEL SIGN VOCALIC R 11100..11102 ; Extend # Mn [3] CHAKMA SIGN CANDRABINDU..CHAKMA SIGN VISARGA 11127..1112B ; Extend # Mn [5] CHAKMA VOWEL SIGN A..CHAKMA VOWEL SIGN UU 1112D..11134 ; Extend # Mn [8] CHAKMA VOWEL SIGN AI..CHAKMA MAAYYAA 11173 ; Extend # Mn MAHAJANI SIGN NUKTA 11180..11181 ; Extend # Mn [2] SHARADA SIGN CANDRABINDU..SHARADA SIGN ANUSVARA 111B6..111BE ; Extend # Mn [9] SHARADA VOWEL SIGN U..SHARADA VOWEL SIGN O 111C0 ; Extend # Mc SHARADA SIGN VIRAMA 111C9..111CC ; Extend # Mn [4] SHARADA SANDHI MARK..SHARADA EXTRA SHORT VOWEL MARK 111CF ; Extend # Mn SHARADA SIGN INVERTED CANDRABINDU 1122F..11231 ; Extend # Mn [3] KHOJKI VOWEL SIGN U..KHOJKI VOWEL SIGN AI 11234 ; Extend # Mn KHOJKI SIGN ANUSVARA 11235 ; Extend # Mc KHOJKI SIGN VIRAMA 11236..11237 ; Extend # Mn [2] KHOJKI SIGN NUKTA..KHOJKI SIGN SHADDA 1123E ; Extend # Mn KHOJKI SIGN SUKUN 11241 ; Extend # Mn KHOJKI VOWEL SIGN VOCALIC R 112DF ; Extend # Mn KHUDAWADI SIGN ANUSVARA 112E3..112EA ; Extend # Mn [8] KHUDAWADI VOWEL SIGN U..KHUDAWADI SIGN VIRAMA 11300..11301 ; Extend # Mn [2] GRANTHA SIGN COMBINING ANUSVARA ABOVE..GRANTHA SIGN CANDRABINDU 1133B..1133C ; Extend # Mn [2] 
COMBINING BINDU BELOW..GRANTHA SIGN NUKTA 1133E ; Extend # Mc GRANTHA VOWEL SIGN AA 11340 ; Extend # Mn GRANTHA VOWEL SIGN II 1134D ; Extend # Mc GRANTHA SIGN VIRAMA 11357 ; Extend # Mc GRANTHA AU LENGTH MARK 11366..1136C ; Extend # Mn [7] COMBINING GRANTHA DIGIT ZERO..COMBINING GRANTHA DIGIT SIX 11370..11374 ; Extend # Mn [5] COMBINING GRANTHA LETTER A..COMBINING GRANTHA LETTER PA 113B8 ; Extend # Mc TULU-TIGALARI VOWEL SIGN AA 113BB..113C0 ; Extend # Mn [6] TULU-TIGALARI VOWEL SIGN U..TULU-TIGALARI VOWEL SIGN VOCALIC LL 113C2 ; Extend # Mc TULU-TIGALARI VOWEL SIGN EE 113C5 ; Extend # Mc TULU-TIGALARI VOWEL SIGN AI 113C7..113C9 ; Extend # Mc [3] TULU-TIGALARI VOWEL SIGN OO..TULU-TIGALARI AU LENGTH MARK 113CE ; Extend # Mn TULU-TIGALARI SIGN VIRAMA 113CF ; Extend # Mc TULU-TIGALARI SIGN LOOPED VIRAMA 113D0 ; Extend # Mn TULU-TIGALARI CONJOINER 113D2 ; Extend # Mn TULU-TIGALARI GEMINATION MARK 113E1..113E2 ; Extend # Mn [2] TULU-TIGALARI VEDIC TONE SVARITA..TULU-TIGALARI VEDIC TONE ANUDATTA 11438..1143F ; Extend # Mn [8] NEWA VOWEL SIGN U..NEWA VOWEL SIGN AI 11442..11444 ; Extend # Mn [3] NEWA SIGN VIRAMA..NEWA SIGN ANUSVARA 11446 ; Extend # Mn NEWA SIGN NUKTA 1145E ; Extend # Mn NEWA SANDHI MARK 114B0 ; Extend # Mc TIRHUTA VOWEL SIGN AA 114B3..114B8 ; Extend # Mn [6] TIRHUTA VOWEL SIGN U..TIRHUTA VOWEL SIGN VOCALIC LL 114BA ; Extend # Mn TIRHUTA VOWEL SIGN SHORT E 114BD ; Extend # Mc TIRHUTA VOWEL SIGN SHORT O 114BF..114C0 ; Extend # Mn [2] TIRHUTA SIGN CANDRABINDU..TIRHUTA SIGN ANUSVARA 114C2..114C3 ; Extend # Mn [2] TIRHUTA SIGN VIRAMA..TIRHUTA SIGN NUKTA 115AF ; Extend # Mc SIDDHAM VOWEL SIGN AA 115B2..115B5 ; Extend # Mn [4] SIDDHAM VOWEL SIGN U..SIDDHAM VOWEL SIGN VOCALIC RR 115BC..115BD ; Extend # Mn [2] SIDDHAM SIGN CANDRABINDU..SIDDHAM SIGN ANUSVARA 115BF..115C0 ; Extend # Mn [2] SIDDHAM SIGN VIRAMA..SIDDHAM SIGN NUKTA 115DC..115DD ; Extend # Mn [2] SIDDHAM VOWEL SIGN ALTERNATE U..SIDDHAM VOWEL SIGN ALTERNATE UU 11633..1163A ; Extend # Mn [8] MODI VOWEL 
SIGN U..MODI VOWEL SIGN AI 1163D ; Extend # Mn MODI SIGN ANUSVARA 1163F..11640 ; Extend # Mn [2] MODI SIGN VIRAMA..MODI SIGN ARDHACANDRA 116AB ; Extend # Mn TAKRI SIGN ANUSVARA 116AD ; Extend # Mn TAKRI VOWEL SIGN AA 116B0..116B5 ; Extend # Mn [6] TAKRI VOWEL SIGN U..TAKRI VOWEL SIGN AU 116B6 ; Extend # Mc TAKRI SIGN VIRAMA 116B7 ; Extend # Mn TAKRI SIGN NUKTA 1171D ; Extend # Mn AHOM CONSONANT SIGN MEDIAL LA 1171F ; Extend # Mn AHOM CONSONANT SIGN MEDIAL LIGATING RA 11722..11725 ; Extend # Mn [4] AHOM VOWEL SIGN I..AHOM VOWEL SIGN UU 11727..1172B ; Extend # Mn [5] AHOM VOWEL SIGN AW..AHOM SIGN KILLER 1182F..11837 ; Extend # Mn [9] DOGRA VOWEL SIGN U..DOGRA SIGN ANUSVARA 11839..1183A ; Extend # Mn [2] DOGRA SIGN VIRAMA..DOGRA SIGN NUKTA 11930 ; Extend # Mc DIVES AKURU VOWEL SIGN AA 1193B..1193C ; Extend # Mn [2] DIVES AKURU SIGN ANUSVARA..DIVES AKURU SIGN CANDRABINDU 1193D ; Extend # Mc DIVES AKURU SIGN HALANTA 1193E ; Extend # Mn DIVES AKURU VIRAMA 11943 ; Extend # Mn DIVES AKURU SIGN NUKTA 119D4..119D7 ; Extend # Mn [4] NANDINAGARI VOWEL SIGN U..NANDINAGARI VOWEL SIGN VOCALIC RR 119DA..119DB ; Extend # Mn [2] NANDINAGARI VOWEL SIGN E..NANDINAGARI VOWEL SIGN AI 119E0 ; Extend # Mn NANDINAGARI SIGN VIRAMA 11A01..11A0A ; Extend # Mn [10] ZANABAZAR SQUARE VOWEL SIGN I..ZANABAZAR SQUARE VOWEL LENGTH MARK 11A33..11A38 ; Extend # Mn [6] ZANABAZAR SQUARE FINAL CONSONANT MARK..ZANABAZAR SQUARE SIGN ANUSVARA 11A3B..11A3E ; Extend # Mn [4] ZANABAZAR SQUARE CLUSTER-FINAL LETTER YA..ZANABAZAR SQUARE CLUSTER-FINAL LETTER VA 11A47 ; Extend # Mn ZANABAZAR SQUARE SUBJOINER 11A51..11A56 ; Extend # Mn [6] SOYOMBO VOWEL SIGN I..SOYOMBO VOWEL SIGN OE 11A59..11A5B ; Extend # Mn [3] SOYOMBO VOWEL SIGN VOCALIC R..SOYOMBO VOWEL LENGTH MARK 11A8A..11A96 ; Extend # Mn [13] SOYOMBO FINAL CONSONANT SIGN G..SOYOMBO SIGN ANUSVARA 11A98..11A99 ; Extend # Mn [2] SOYOMBO GEMINATION MARK..SOYOMBO SUBJOINER 11B60 ; Extend # Mn SHARADA VOWEL SIGN OE 11B62..11B64 ; Extend # Mn [3] SHARADA VOWEL SIGN 
UE..SHARADA VOWEL SIGN SHORT E 11B66 ; Extend # Mn SHARADA VOWEL SIGN CANDRA E 11C30..11C36 ; Extend # Mn [7] BHAIKSUKI VOWEL SIGN I..BHAIKSUKI VOWEL SIGN VOCALIC L 11C38..11C3D ; Extend # Mn [6] BHAIKSUKI VOWEL SIGN E..BHAIKSUKI SIGN ANUSVARA 11C3F ; Extend # Mn BHAIKSUKI SIGN VIRAMA 11C92..11CA7 ; Extend # Mn [22] MARCHEN SUBJOINED LETTER KA..MARCHEN SUBJOINED LETTER ZA 11CAA..11CB0 ; Extend # Mn [7] MARCHEN SUBJOINED LETTER RA..MARCHEN VOWEL SIGN AA 11CB2..11CB3 ; Extend # Mn [2] MARCHEN VOWEL SIGN U..MARCHEN VOWEL SIGN E 11CB5..11CB6 ; Extend # Mn [2] MARCHEN SIGN ANUSVARA..MARCHEN SIGN CANDRABINDU 11D31..11D36 ; Extend # Mn [6] MASARAM GONDI VOWEL SIGN AA..MASARAM GONDI VOWEL SIGN VOCALIC R 11D3A ; Extend # Mn MASARAM GONDI VOWEL SIGN E 11D3C..11D3D ; Extend # Mn [2] MASARAM GONDI VOWEL SIGN AI..MASARAM GONDI VOWEL SIGN O 11D3F..11D45 ; Extend # Mn [7] MASARAM GONDI VOWEL SIGN AU..MASARAM GONDI VIRAMA 11D47 ; Extend # Mn MASARAM GONDI RA-KARA 11D90..11D91 ; Extend # Mn [2] GUNJALA GONDI VOWEL SIGN EE..GUNJALA GONDI VOWEL SIGN AI 11D95 ; Extend # Mn GUNJALA GONDI SIGN ANUSVARA 11D97 ; Extend # Mn GUNJALA GONDI VIRAMA 11EF3..11EF4 ; Extend # Mn [2] MAKASAR VOWEL SIGN I..MAKASAR VOWEL SIGN U 11F00..11F01 ; Extend # Mn [2] KAWI SIGN CANDRABINDU..KAWI SIGN ANUSVARA 11F36..11F3A ; Extend # Mn [5] KAWI VOWEL SIGN I..KAWI VOWEL SIGN VOCALIC R 11F40 ; Extend # Mn KAWI VOWEL SIGN EU 11F41 ; Extend # Mc KAWI SIGN KILLER 11F42 ; Extend # Mn KAWI CONJOINER 11F5A ; Extend # Mn KAWI SIGN NUKTA 13440 ; Extend # Mn EGYPTIAN HIEROGLYPH MIRROR HORIZONTALLY 13447..13455 ; Extend # Mn [15] EGYPTIAN HIEROGLYPH MODIFIER DAMAGED AT TOP START..EGYPTIAN HIEROGLYPH MODIFIER DAMAGED 1611E..16129 ; Extend # Mn [12] GURUNG KHEMA VOWEL SIGN AA..GURUNG KHEMA VOWEL LENGTH MARK 1612D..1612F ; Extend # Mn [3] GURUNG KHEMA SIGN ANUSVARA..GURUNG KHEMA SIGN THOLHOMA 16AF0..16AF4 ; Extend # Mn [5] BASSA VAH COMBINING HIGH TONE..BASSA VAH COMBINING HIGH-LOW TONE 16B30..16B36 ; Extend # Mn [7] PAHAWH 
HMONG MARK CIM TUB..PAHAWH HMONG MARK CIM TAUM 16F4F ; Extend # Mn MIAO SIGN CONSONANT MODIFIER BAR 16F8F..16F92 ; Extend # Mn [4] MIAO TONE RIGHT..MIAO TONE BELOW 16FE4 ; Extend # Mn KHITAN SMALL SCRIPT FILLER 16FF0..16FF1 ; Extend # Mc [2] VIETNAMESE ALTERNATE READING MARK CA..VIETNAMESE ALTERNATE READING MARK NHAY 1BC9D..1BC9E ; Extend # Mn [2] DUPLOYAN THICK LETTER SELECTOR..DUPLOYAN DOUBLE MARK 1CF00..1CF2D ; Extend # Mn [46] ZNAMENNY COMBINING MARK GORAZDO NIZKO S KRYZHEM ON LEFT..ZNAMENNY COMBINING MARK KRYZH ON LEFT 1CF30..1CF46 ; Extend # Mn [23] ZNAMENNY COMBINING TONAL RANGE MARK MRACHNO..ZNAMENNY PRIZNAK MODIFIER ROG 1D165..1D166 ; Extend # Mc [2] MUSICAL SYMBOL COMBINING STEM..MUSICAL SYMBOL COMBINING SPRECHGESANG STEM 1D167..1D169 ; Extend # Mn [3] MUSICAL SYMBOL COMBINING TREMOLO-1..MUSICAL SYMBOL COMBINING TREMOLO-3 1D16D..1D172 ; Extend # Mc [6] MUSICAL SYMBOL COMBINING AUGMENTATION DOT..MUSICAL SYMBOL COMBINING FLAG-5 1D17B..1D182 ; Extend # Mn [8] MUSICAL SYMBOL COMBINING ACCENT..MUSICAL SYMBOL COMBINING LOURE 1D185..1D18B ; Extend # Mn [7] MUSICAL SYMBOL COMBINING DOIT..MUSICAL SYMBOL COMBINING TRIPLE TONGUE 1D1AA..1D1AD ; Extend # Mn [4] MUSICAL SYMBOL COMBINING DOWN BOW..MUSICAL SYMBOL COMBINING SNAP PIZZICATO 1D242..1D244 ; Extend # Mn [3] COMBINING GREEK MUSICAL TRISEME..COMBINING GREEK MUSICAL PENTASEME 1DA00..1DA36 ; Extend # Mn [55] SIGNWRITING HEAD RIM..SIGNWRITING AIR SUCKING IN 1DA3B..1DA6C ; Extend # Mn [50] SIGNWRITING MOUTH CLOSED NEUTRAL..SIGNWRITING EXCITEMENT 1DA75 ; Extend # Mn SIGNWRITING UPPER BODY TILTING FROM HIP JOINTS 1DA84 ; Extend # Mn SIGNWRITING LOCATION HEAD NECK 1DA9B..1DA9F ; Extend # Mn [5] SIGNWRITING FILL MODIFIER-2..SIGNWRITING FILL MODIFIER-6 1DAA1..1DAAF ; Extend # Mn [15] SIGNWRITING ROTATION MODIFIER-2..SIGNWRITING ROTATION MODIFIER-16 1E000..1E006 ; Extend # Mn [7] COMBINING GLAGOLITIC LETTER AZU..COMBINING GLAGOLITIC LETTER ZHIVETE 1E008..1E018 ; Extend # Mn [17] COMBINING GLAGOLITIC LETTER 
ZEMLJA..COMBINING GLAGOLITIC LETTER HERU 1E01B..1E021 ; Extend # Mn [7] COMBINING GLAGOLITIC LETTER SHTA..COMBINING GLAGOLITIC LETTER YATI 1E023..1E024 ; Extend # Mn [2] COMBINING GLAGOLITIC LETTER YU..COMBINING GLAGOLITIC LETTER SMALL YUS 1E026..1E02A ; Extend # Mn [5] COMBINING GLAGOLITIC LETTER YO..COMBINING GLAGOLITIC LETTER FITA 1E08F ; Extend # Mn COMBINING CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I 1E130..1E136 ; Extend # Mn [7] NYIAKENG PUACHUE HMONG TONE-B..NYIAKENG PUACHUE HMONG TONE-D 1E2AE ; Extend # Mn TOTO SIGN RISING TONE 1E2EC..1E2EF ; Extend # Mn [4] WANCHO TONE TUP..WANCHO TONE KOINI 1E4EC..1E4EF ; Extend # Mn [4] NAG MUNDARI SIGN MUHOR..NAG MUNDARI SIGN SUTUH 1E5EE..1E5EF ; Extend # Mn [2] OL ONAL SIGN MU..OL ONAL SIGN IKIR 1E6E3 ; Extend # Mn TAI YO SIGN UE 1E6E6 ; Extend # Mn TAI YO SIGN AU 1E6EE..1E6EF ; Extend # Mn [2] TAI YO SIGN AY..TAI YO SIGN ANG 1E6F5 ; Extend # Mn TAI YO SIGN OM 1E8D0..1E8D6 ; Extend # Mn [7] MENDE KIKAKUI COMBINING NUMBER TEENS..MENDE KIKAKUI COMBINING NUMBER MILLIONS 1E944..1E94A ; Extend # Mn [7] ADLAM ALIF LENGTHENER..ADLAM NUKTA 1F3FB..1F3FF ; Extend # Sk [5] EMOJI MODIFIER FITZPATRICK TYPE-1-2..EMOJI MODIFIER FITZPATRICK TYPE-6 E0020..E007F ; Extend # Cf [96] TAG SPACE..CANCEL TAG E0100..E01EF ; Extend # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256 # Total code points: 2237 # ================================================ 1F1E6..1F1FF ; Regional_Indicator # So [26] REGIONAL INDICATOR SYMBOL LETTER A..REGIONAL INDICATOR SYMBOL LETTER Z # Total code points: 26 # ================================================ 0903 ; SpacingMark # Mc DEVANAGARI SIGN VISARGA 093B ; SpacingMark # Mc DEVANAGARI VOWEL SIGN OOE 093E..0940 ; SpacingMark # Mc [3] DEVANAGARI VOWEL SIGN AA..DEVANAGARI VOWEL SIGN II 0949..094C ; SpacingMark # Mc [4] DEVANAGARI VOWEL SIGN CANDRA O..DEVANAGARI VOWEL SIGN AU 094E..094F ; SpacingMark # Mc [2] DEVANAGARI VOWEL SIGN PRISHTHAMATRA E..DEVANAGARI VOWEL SIGN AW 0982..0983 ; SpacingMark 
# Mc [2] BENGALI SIGN ANUSVARA..BENGALI SIGN VISARGA 09BF..09C0 ; SpacingMark # Mc [2] BENGALI VOWEL SIGN I..BENGALI VOWEL SIGN II 09C7..09C8 ; SpacingMark # Mc [2] BENGALI VOWEL SIGN E..BENGALI VOWEL SIGN AI 09CB..09CC ; SpacingMark # Mc [2] BENGALI VOWEL SIGN O..BENGALI VOWEL SIGN AU 0A03 ; SpacingMark # Mc GURMUKHI SIGN VISARGA 0A3E..0A40 ; SpacingMark # Mc [3] GURMUKHI VOWEL SIGN AA..GURMUKHI VOWEL SIGN II 0A83 ; SpacingMark # Mc GUJARATI SIGN VISARGA 0ABE..0AC0 ; SpacingMark # Mc [3] GUJARATI VOWEL SIGN AA..GUJARATI VOWEL SIGN II 0AC9 ; SpacingMark # Mc GUJARATI VOWEL SIGN CANDRA O 0ACB..0ACC ; SpacingMark # Mc [2] GUJARATI VOWEL SIGN O..GUJARATI VOWEL SIGN AU 0B02..0B03 ; SpacingMark # Mc [2] ORIYA SIGN ANUSVARA..ORIYA SIGN VISARGA 0B40 ; SpacingMark # Mc ORIYA VOWEL SIGN II 0B47..0B48 ; SpacingMark # Mc [2] ORIYA VOWEL SIGN E..ORIYA VOWEL SIGN AI 0B4B..0B4C ; SpacingMark # Mc [2] ORIYA VOWEL SIGN O..ORIYA VOWEL SIGN AU 0BBF ; SpacingMark # Mc TAMIL VOWEL SIGN I 0BC1..0BC2 ; SpacingMark # Mc [2] TAMIL VOWEL SIGN U..TAMIL VOWEL SIGN UU 0BC6..0BC8 ; SpacingMark # Mc [3] TAMIL VOWEL SIGN E..TAMIL VOWEL SIGN AI 0BCA..0BCC ; SpacingMark # Mc [3] TAMIL VOWEL SIGN O..TAMIL VOWEL SIGN AU 0C01..0C03 ; SpacingMark # Mc [3] TELUGU SIGN CANDRABINDU..TELUGU SIGN VISARGA 0C41..0C44 ; SpacingMark # Mc [4] TELUGU VOWEL SIGN U..TELUGU VOWEL SIGN VOCALIC RR 0C82..0C83 ; SpacingMark # Mc [2] KANNADA SIGN ANUSVARA..KANNADA SIGN VISARGA 0CBE ; SpacingMark # Mc KANNADA VOWEL SIGN AA 0CC1 ; SpacingMark # Mc KANNADA VOWEL SIGN U 0CC3..0CC4 ; SpacingMark # Mc [2] KANNADA VOWEL SIGN VOCALIC R..KANNADA VOWEL SIGN VOCALIC RR 0CF3 ; SpacingMark # Mc KANNADA SIGN COMBINING ANUSVARA ABOVE RIGHT 0D02..0D03 ; SpacingMark # Mc [2] MALAYALAM SIGN ANUSVARA..MALAYALAM SIGN VISARGA 0D3F..0D40 ; SpacingMark # Mc [2] MALAYALAM VOWEL SIGN I..MALAYALAM VOWEL SIGN II 0D46..0D48 ; SpacingMark # Mc [3] MALAYALAM VOWEL SIGN E..MALAYALAM VOWEL SIGN AI 0D4A..0D4C ; SpacingMark # Mc [3] MALAYALAM VOWEL SIGN 
O..MALAYALAM VOWEL SIGN AU 0D82..0D83 ; SpacingMark # Mc [2] SINHALA SIGN ANUSVARAYA..SINHALA SIGN VISARGAYA 0DD0..0DD1 ; SpacingMark # Mc [2] SINHALA VOWEL SIGN KETTI AEDA-PILLA..SINHALA VOWEL SIGN DIGA AEDA-PILLA 0DD8..0DDE ; SpacingMark # Mc [7] SINHALA VOWEL SIGN GAETTA-PILLA..SINHALA VOWEL SIGN KOMBUVA HAA GAYANUKITTA 0DF2..0DF3 ; SpacingMark # Mc [2] SINHALA VOWEL SIGN DIGA GAETTA-PILLA..SINHALA VOWEL SIGN DIGA GAYANUKITTA 0E33 ; SpacingMark # Lo THAI CHARACTER SARA AM 0EB3 ; SpacingMark # Lo LAO VOWEL SIGN AM 0F3E..0F3F ; SpacingMark # Mc [2] TIBETAN SIGN YAR TSHES..TIBETAN SIGN MAR TSHES 0F7F ; SpacingMark # Mc TIBETAN SIGN RNAM BCAD 1031 ; SpacingMark # Mc MYANMAR VOWEL SIGN E 103B..103C ; SpacingMark # Mc [2] MYANMAR CONSONANT SIGN MEDIAL YA..MYANMAR CONSONANT SIGN MEDIAL RA 1056..1057 ; SpacingMark # Mc [2] MYANMAR VOWEL SIGN VOCALIC R..MYANMAR VOWEL SIGN VOCALIC RR 1084 ; SpacingMark # Mc MYANMAR VOWEL SIGN SHAN E 17B6 ; SpacingMark # Mc KHMER VOWEL SIGN AA 17BE..17C5 ; SpacingMark # Mc [8] KHMER VOWEL SIGN OE..KHMER VOWEL SIGN AU 17C7..17C8 ; SpacingMark # Mc [2] KHMER SIGN REAHMUK..KHMER SIGN YUUKALEAPINTU 1923..1926 ; SpacingMark # Mc [4] LIMBU VOWEL SIGN EE..LIMBU VOWEL SIGN AU 1929..192B ; SpacingMark # Mc [3] LIMBU SUBJOINED LETTER YA..LIMBU SUBJOINED LETTER WA 1930..1931 ; SpacingMark # Mc [2] LIMBU SMALL LETTER KA..LIMBU SMALL LETTER NGA 1933..1938 ; SpacingMark # Mc [6] LIMBU SMALL LETTER TA..LIMBU SMALL LETTER LA 1A19..1A1A ; SpacingMark # Mc [2] BUGINESE VOWEL SIGN E..BUGINESE VOWEL SIGN O 1A55 ; SpacingMark # Mc TAI THAM CONSONANT SIGN MEDIAL RA 1A57 ; SpacingMark # Mc TAI THAM CONSONANT SIGN LA TANG LAI 1A6D..1A72 ; SpacingMark # Mc [6] TAI THAM VOWEL SIGN OY..TAI THAM VOWEL SIGN THAM AI 1B04 ; SpacingMark # Mc BALINESE SIGN BISAH 1B3E..1B41 ; SpacingMark # Mc [4] BALINESE VOWEL SIGN TALING..BALINESE VOWEL SIGN TALING REPA TEDUNG 1B82 ; SpacingMark # Mc SUNDANESE SIGN PANGWISAD 1BA1 ; SpacingMark # Mc SUNDANESE CONSONANT SIGN PAMINGKAL 
1BA6..1BA7 ; SpacingMark # Mc [2] SUNDANESE VOWEL SIGN PANAELAENG..SUNDANESE VOWEL SIGN PANOLONG 1BE7 ; SpacingMark # Mc BATAK VOWEL SIGN E 1BEA..1BEC ; SpacingMark # Mc [3] BATAK VOWEL SIGN I..BATAK VOWEL SIGN O 1BEE ; SpacingMark # Mc BATAK VOWEL SIGN U 1C24..1C2B ; SpacingMark # Mc [8] LEPCHA SUBJOINED LETTER YA..LEPCHA VOWEL SIGN UU 1C34..1C35 ; SpacingMark # Mc [2] LEPCHA CONSONANT SIGN NYIN-DO..LEPCHA CONSONANT SIGN KANG 1CE1 ; SpacingMark # Mc VEDIC TONE ATHARVAVEDIC INDEPENDENT SVARITA 1CF7 ; SpacingMark # Mc VEDIC SIGN ATIKRAMA A823..A824 ; SpacingMark # Mc [2] SYLOTI NAGRI VOWEL SIGN A..SYLOTI NAGRI VOWEL SIGN I A827 ; SpacingMark # Mc SYLOTI NAGRI VOWEL SIGN OO A880..A881 ; SpacingMark # Mc [2] SAURASHTRA SIGN ANUSVARA..SAURASHTRA SIGN VISARGA A8B4..A8C3 ; SpacingMark # Mc [16] SAURASHTRA CONSONANT SIGN HAARU..SAURASHTRA VOWEL SIGN AU A952 ; SpacingMark # Mc REJANG CONSONANT SIGN H A983 ; SpacingMark # Mc JAVANESE SIGN WIGNYAN A9B4..A9B5 ; SpacingMark # Mc [2] JAVANESE VOWEL SIGN TARUNG..JAVANESE VOWEL SIGN TOLONG A9BA..A9BB ; SpacingMark # Mc [2] JAVANESE VOWEL SIGN TALING..JAVANESE VOWEL SIGN DIRGA MURE A9BE..A9BF ; SpacingMark # Mc [2] JAVANESE CONSONANT SIGN PENGKAL..JAVANESE CONSONANT SIGN CAKRA AA2F..AA30 ; SpacingMark # Mc [2] CHAM VOWEL SIGN O..CHAM VOWEL SIGN AI AA33..AA34 ; SpacingMark # Mc [2] CHAM CONSONANT SIGN YA..CHAM CONSONANT SIGN RA AA4D ; SpacingMark # Mc CHAM CONSONANT SIGN FINAL H AAEB ; SpacingMark # Mc MEETEI MAYEK VOWEL SIGN II AAEE..AAEF ; SpacingMark # Mc [2] MEETEI MAYEK VOWEL SIGN AU..MEETEI MAYEK VOWEL SIGN AAU AAF5 ; SpacingMark # Mc MEETEI MAYEK VOWEL SIGN VISARGA ABE3..ABE4 ; SpacingMark # Mc [2] MEETEI MAYEK VOWEL SIGN ONAP..MEETEI MAYEK VOWEL SIGN INAP ABE6..ABE7 ; SpacingMark # Mc [2] MEETEI MAYEK VOWEL SIGN YENAP..MEETEI MAYEK VOWEL SIGN SOUNAP ABE9..ABEA ; SpacingMark # Mc [2] MEETEI MAYEK VOWEL SIGN CHEINAP..MEETEI MAYEK VOWEL SIGN NUNG ABEC ; SpacingMark # Mc MEETEI MAYEK LUM IYEK 11000 ; SpacingMark # Mc BRAHMI 
SIGN CANDRABINDU 11002 ; SpacingMark # Mc BRAHMI SIGN VISARGA 11082 ; SpacingMark # Mc KAITHI SIGN VISARGA 110B0..110B2 ; SpacingMark # Mc [3] KAITHI VOWEL SIGN AA..KAITHI VOWEL SIGN II 110B7..110B8 ; SpacingMark # Mc [2] KAITHI VOWEL SIGN O..KAITHI VOWEL SIGN AU 1112C ; SpacingMark # Mc CHAKMA VOWEL SIGN E 11145..11146 ; SpacingMark # Mc [2] CHAKMA VOWEL SIGN AA..CHAKMA VOWEL SIGN EI 11182 ; SpacingMark # Mc SHARADA SIGN VISARGA 111B3..111B5 ; SpacingMark # Mc [3] SHARADA VOWEL SIGN AA..SHARADA VOWEL SIGN II 111BF ; SpacingMark # Mc SHARADA VOWEL SIGN AU 111CE ; SpacingMark # Mc SHARADA VOWEL SIGN PRISHTHAMATRA E 1122C..1122E ; SpacingMark # Mc [3] KHOJKI VOWEL SIGN AA..KHOJKI VOWEL SIGN II 11232..11233 ; SpacingMark # Mc [2] KHOJKI VOWEL SIGN O..KHOJKI VOWEL SIGN AU 112E0..112E2 ; SpacingMark # Mc [3] KHUDAWADI VOWEL SIGN AA..KHUDAWADI VOWEL SIGN II 11302..11303 ; SpacingMark # Mc [2] GRANTHA SIGN ANUSVARA..GRANTHA SIGN VISARGA 1133F ; SpacingMark # Mc GRANTHA VOWEL SIGN I 11341..11344 ; SpacingMark # Mc [4] GRANTHA VOWEL SIGN U..GRANTHA VOWEL SIGN VOCALIC RR 11347..11348 ; SpacingMark # Mc [2] GRANTHA VOWEL SIGN EE..GRANTHA VOWEL SIGN AI 1134B..1134C ; SpacingMark # Mc [2] GRANTHA VOWEL SIGN OO..GRANTHA VOWEL SIGN AU 11362..11363 ; SpacingMark # Mc [2] GRANTHA VOWEL SIGN VOCALIC L..GRANTHA VOWEL SIGN VOCALIC LL 113B9..113BA ; SpacingMark # Mc [2] TULU-TIGALARI VOWEL SIGN I..TULU-TIGALARI VOWEL SIGN II 113CA ; SpacingMark # Mc TULU-TIGALARI SIGN CANDRA ANUNASIKA 113CC..113CD ; SpacingMark # Mc [2] TULU-TIGALARI SIGN ANUSVARA..TULU-TIGALARI SIGN VISARGA 11435..11437 ; SpacingMark # Mc [3] NEWA VOWEL SIGN AA..NEWA VOWEL SIGN II 11440..11441 ; SpacingMark # Mc [2] NEWA VOWEL SIGN O..NEWA VOWEL SIGN AU 11445 ; SpacingMark # Mc NEWA SIGN VISARGA 114B1..114B2 ; SpacingMark # Mc [2] TIRHUTA VOWEL SIGN I..TIRHUTA VOWEL SIGN II 114B9 ; SpacingMark # Mc TIRHUTA VOWEL SIGN E 114BB..114BC ; SpacingMark # Mc [2] TIRHUTA VOWEL SIGN AI..TIRHUTA VOWEL SIGN O 114BE ; SpacingMark 
# Mc TIRHUTA VOWEL SIGN AU 114C1 ; SpacingMark # Mc TIRHUTA SIGN VISARGA 115B0..115B1 ; SpacingMark # Mc [2] SIDDHAM VOWEL SIGN I..SIDDHAM VOWEL SIGN II 115B8..115BB ; SpacingMark # Mc [4] SIDDHAM VOWEL SIGN E..SIDDHAM VOWEL SIGN AU 115BE ; SpacingMark # Mc SIDDHAM SIGN VISARGA 11630..11632 ; SpacingMark # Mc [3] MODI VOWEL SIGN AA..MODI VOWEL SIGN II 1163B..1163C ; SpacingMark # Mc [2] MODI VOWEL SIGN O..MODI VOWEL SIGN AU 1163E ; SpacingMark # Mc MODI SIGN VISARGA 116AC ; SpacingMark # Mc TAKRI SIGN VISARGA 116AE..116AF ; SpacingMark # Mc [2] TAKRI VOWEL SIGN I..TAKRI VOWEL SIGN II 1171E ; SpacingMark # Mc AHOM CONSONANT SIGN MEDIAL RA 11726 ; SpacingMark # Mc AHOM VOWEL SIGN E 1182C..1182E ; SpacingMark # Mc [3] DOGRA VOWEL SIGN AA..DOGRA VOWEL SIGN II 11838 ; SpacingMark # Mc DOGRA SIGN VISARGA 11931..11935 ; SpacingMark # Mc [5] DIVES AKURU VOWEL SIGN I..DIVES AKURU VOWEL SIGN E 11937..11938 ; SpacingMark # Mc [2] DIVES AKURU VOWEL SIGN AI..DIVES AKURU VOWEL SIGN O 11940 ; SpacingMark # Mc DIVES AKURU MEDIAL YA 11942 ; SpacingMark # Mc DIVES AKURU MEDIAL RA 119D1..119D3 ; SpacingMark # Mc [3] NANDINAGARI VOWEL SIGN AA..NANDINAGARI VOWEL SIGN II 119DC..119DF ; SpacingMark # Mc [4] NANDINAGARI VOWEL SIGN O..NANDINAGARI SIGN VISARGA 119E4 ; SpacingMark # Mc NANDINAGARI VOWEL SIGN PRISHTHAMATRA E 11A39 ; SpacingMark # Mc ZANABAZAR SQUARE SIGN VISARGA 11A57..11A58 ; SpacingMark # Mc [2] SOYOMBO VOWEL SIGN AI..SOYOMBO VOWEL SIGN AU 11A97 ; SpacingMark # Mc SOYOMBO SIGN VISARGA 11B61 ; SpacingMark # Mc SHARADA VOWEL SIGN OOE 11B65 ; SpacingMark # Mc SHARADA VOWEL SIGN SHORT O 11B67 ; SpacingMark # Mc SHARADA VOWEL SIGN CANDRA O 11C2F ; SpacingMark # Mc BHAIKSUKI VOWEL SIGN AA 11C3E ; SpacingMark # Mc BHAIKSUKI SIGN VISARGA 11CA9 ; SpacingMark # Mc MARCHEN SUBJOINED LETTER YA 11CB1 ; SpacingMark # Mc MARCHEN VOWEL SIGN I 11CB4 ; SpacingMark # Mc MARCHEN VOWEL SIGN O 11D8A..11D8E ; SpacingMark # Mc [5] GUNJALA GONDI VOWEL SIGN AA..GUNJALA GONDI VOWEL SIGN UU 
11D93..11D94 ; SpacingMark # Mc [2] GUNJALA GONDI VOWEL SIGN OO..GUNJALA GONDI VOWEL SIGN AU 11D96 ; SpacingMark # Mc GUNJALA GONDI SIGN VISARGA 11EF5..11EF6 ; SpacingMark # Mc [2] MAKASAR VOWEL SIGN E..MAKASAR VOWEL SIGN O 11F03 ; SpacingMark # Mc KAWI SIGN VISARGA 11F34..11F35 ; SpacingMark # Mc [2] KAWI VOWEL SIGN AA..KAWI VOWEL SIGN ALTERNATE AA 11F3E..11F3F ; SpacingMark # Mc [2] KAWI VOWEL SIGN E..KAWI VOWEL SIGN AI 1612A..1612C ; SpacingMark # Mc [3] GURUNG KHEMA CONSONANT SIGN MEDIAL YA..GURUNG KHEMA CONSONANT SIGN MEDIAL HA 16F51..16F87 ; SpacingMark # Mc [55] MIAO SIGN ASPIRATION..MIAO VOWEL SIGN UI # Total code points: 381 # ================================================ 1100..115F ; L # Lo [96] HANGUL CHOSEONG KIYEOK..HANGUL CHOSEONG FILLER A960..A97C ; L # Lo [29] HANGUL CHOSEONG TIKEUT-MIEUM..HANGUL CHOSEONG SSANGYEORINHIEUH # Total code points: 125 # ================================================ 1160..11A7 ; V # Lo [72] HANGUL JUNGSEONG FILLER..HANGUL JUNGSEONG O-YAE D7B0..D7C6 ; V # Lo [23] HANGUL JUNGSEONG O-YEO..HANGUL JUNGSEONG ARAEA-E 16D63 ; V # Lo KIRAT RAI VOWEL SIGN AA 16D67..16D6A ; V # Lo [4] KIRAT RAI VOWEL SIGN E..KIRAT RAI VOWEL SIGN AU # Total code points: 100 # ================================================ 11A8..11FF ; T # Lo [88] HANGUL JONGSEONG KIYEOK..HANGUL JONGSEONG SSANGNIEUN D7CB..D7FB ; T # Lo [49] HANGUL JONGSEONG NIEUN-RIEUL..HANGUL JONGSEONG PHIEUPH-THIEUTH # Total code points: 137 # ================================================ AC00 ; LV # Lo HANGUL SYLLABLE GA AC1C ; LV # Lo HANGUL SYLLABLE GAE AC38 ; LV # Lo HANGUL SYLLABLE GYA AC54 ; LV # Lo HANGUL SYLLABLE GYAE AC70 ; LV # Lo HANGUL SYLLABLE GEO AC8C ; LV # Lo HANGUL SYLLABLE GE ACA8 ; LV # Lo HANGUL SYLLABLE GYEO ACC4 ; LV # Lo HANGUL SYLLABLE GYE ACE0 ; LV # Lo HANGUL SYLLABLE GO ACFC ; LV # Lo HANGUL SYLLABLE GWA AD18 ; LV # Lo HANGUL SYLLABLE GWAE AD34 ; LV # Lo HANGUL SYLLABLE GOE AD50 ; LV # Lo HANGUL SYLLABLE GYO AD6C ; LV # Lo HANGUL SYLLABLE GU 
AD88 ; LV # Lo HANGUL SYLLABLE GWEO ADA4 ; LV # Lo HANGUL SYLLABLE GWE ADC0 ; LV # Lo HANGUL SYLLABLE GWI ADDC ; LV # Lo HANGUL SYLLABLE GYU ADF8 ; LV # Lo HANGUL SYLLABLE GEU AE14 ; LV # Lo HANGUL SYLLABLE GYI AE30 ; LV # Lo HANGUL SYLLABLE GI AE4C ; LV # Lo HANGUL SYLLABLE GGA AE68 ; LV # Lo HANGUL SYLLABLE GGAE AE84 ; LV # Lo HANGUL SYLLABLE GGYA AEA0 ; LV # Lo HANGUL SYLLABLE GGYAE AEBC ; LV # Lo HANGUL SYLLABLE GGEO AED8 ; LV # Lo HANGUL SYLLABLE GGE AEF4 ; LV # Lo HANGUL SYLLABLE GGYEO AF10 ; LV # Lo HANGUL SYLLABLE GGYE AF2C ; LV # Lo HANGUL SYLLABLE GGO AF48 ; LV # Lo HANGUL SYLLABLE GGWA AF64 ; LV # Lo HANGUL SYLLABLE GGWAE AF80 ; LV # Lo HANGUL SYLLABLE GGOE AF9C ; LV # Lo HANGUL SYLLABLE GGYO AFB8 ; LV # Lo HANGUL SYLLABLE GGU AFD4 ; LV # Lo HANGUL SYLLABLE GGWEO AFF0 ; LV # Lo HANGUL SYLLABLE GGWE B00C ; LV # Lo HANGUL SYLLABLE GGWI B028 ; LV # Lo HANGUL SYLLABLE GGYU B044 ; LV # Lo HANGUL SYLLABLE GGEU B060 ; LV # Lo HANGUL SYLLABLE GGYI B07C ; LV # Lo HANGUL SYLLABLE GGI B098 ; LV # Lo HANGUL SYLLABLE NA B0B4 ; LV # Lo HANGUL SYLLABLE NAE B0D0 ; LV # Lo HANGUL SYLLABLE NYA B0EC ; LV # Lo HANGUL SYLLABLE NYAE B108 ; LV # Lo HANGUL SYLLABLE NEO B124 ; LV # Lo HANGUL SYLLABLE NE B140 ; LV # Lo HANGUL SYLLABLE NYEO B15C ; LV # Lo HANGUL SYLLABLE NYE B178 ; LV # Lo HANGUL SYLLABLE NO B194 ; LV # Lo HANGUL SYLLABLE NWA B1B0 ; LV # Lo HANGUL SYLLABLE NWAE B1CC ; LV # Lo HANGUL SYLLABLE NOE B1E8 ; LV # Lo HANGUL SYLLABLE NYO B204 ; LV # Lo HANGUL SYLLABLE NU B220 ; LV # Lo HANGUL SYLLABLE NWEO B23C ; LV # Lo HANGUL SYLLABLE NWE B258 ; LV # Lo HANGUL SYLLABLE NWI B274 ; LV # Lo HANGUL SYLLABLE NYU B290 ; LV # Lo HANGUL SYLLABLE NEU B2AC ; LV # Lo HANGUL SYLLABLE NYI B2C8 ; LV # Lo HANGUL SYLLABLE NI B2E4 ; LV # Lo HANGUL SYLLABLE DA B300 ; LV # Lo HANGUL SYLLABLE DAE B31C ; LV # Lo HANGUL SYLLABLE DYA B338 ; LV # Lo HANGUL SYLLABLE DYAE B354 ; LV # Lo HANGUL SYLLABLE DEO B370 ; LV # Lo HANGUL SYLLABLE DE B38C ; LV # Lo HANGUL SYLLABLE DYEO B3A8 ; LV # Lo 
HANGUL SYLLABLE DYE B3C4 ; LV # Lo HANGUL SYLLABLE DO B3E0 ; LV # Lo HANGUL SYLLABLE DWA B3FC ; LV # Lo HANGUL SYLLABLE DWAE B418 ; LV # Lo HANGUL SYLLABLE DOE B434 ; LV # Lo HANGUL SYLLABLE DYO B450 ; LV # Lo HANGUL SYLLABLE DU B46C ; LV # Lo HANGUL SYLLABLE DWEO B488 ; LV # Lo HANGUL SYLLABLE DWE B4A4 ; LV # Lo HANGUL SYLLABLE DWI B4C0 ; LV # Lo HANGUL SYLLABLE DYU B4DC ; LV # Lo HANGUL SYLLABLE DEU B4F8 ; LV # Lo HANGUL SYLLABLE DYI B514 ; LV # Lo HANGUL SYLLABLE DI B530 ; LV # Lo HANGUL SYLLABLE DDA B54C ; LV # Lo HANGUL SYLLABLE DDAE B568 ; LV # Lo HANGUL SYLLABLE DDYA B584 ; LV # Lo HANGUL SYLLABLE DDYAE B5A0 ; LV # Lo HANGUL SYLLABLE DDEO B5BC ; LV # Lo HANGUL SYLLABLE DDE B5D8 ; LV # Lo HANGUL SYLLABLE DDYEO B5F4 ; LV # Lo HANGUL SYLLABLE DDYE B610 ; LV # Lo HANGUL SYLLABLE DDO B62C ; LV # Lo HANGUL SYLLABLE DDWA B648 ; LV # Lo HANGUL SYLLABLE DDWAE B664 ; LV # Lo HANGUL SYLLABLE DDOE B680 ; LV # Lo HANGUL SYLLABLE DDYO B69C ; LV # Lo HANGUL SYLLABLE DDU B6B8 ; LV # Lo HANGUL SYLLABLE DDWEO B6D4 ; LV # Lo HANGUL SYLLABLE DDWE B6F0 ; LV # Lo HANGUL SYLLABLE DDWI B70C ; LV # Lo HANGUL SYLLABLE DDYU B728 ; LV # Lo HANGUL SYLLABLE DDEU B744 ; LV # Lo HANGUL SYLLABLE DDYI B760 ; LV # Lo HANGUL SYLLABLE DDI B77C ; LV # Lo HANGUL SYLLABLE RA B798 ; LV # Lo HANGUL SYLLABLE RAE B7B4 ; LV # Lo HANGUL SYLLABLE RYA B7D0 ; LV # Lo HANGUL SYLLABLE RYAE B7EC ; LV # Lo HANGUL SYLLABLE REO B808 ; LV # Lo HANGUL SYLLABLE RE B824 ; LV # Lo HANGUL SYLLABLE RYEO B840 ; LV # Lo HANGUL SYLLABLE RYE B85C ; LV # Lo HANGUL SYLLABLE RO B878 ; LV # Lo HANGUL SYLLABLE RWA B894 ; LV # Lo HANGUL SYLLABLE RWAE B8B0 ; LV # Lo HANGUL SYLLABLE ROE B8CC ; LV # Lo HANGUL SYLLABLE RYO B8E8 ; LV # Lo HANGUL SYLLABLE RU B904 ; LV # Lo HANGUL SYLLABLE RWEO B920 ; LV # Lo HANGUL SYLLABLE RWE B93C ; LV # Lo HANGUL SYLLABLE RWI B958 ; LV # Lo HANGUL SYLLABLE RYU B974 ; LV # Lo HANGUL SYLLABLE REU B990 ; LV # Lo HANGUL SYLLABLE RYI B9AC ; LV # Lo HANGUL SYLLABLE RI B9C8 ; LV # Lo HANGUL SYLLABLE MA 
B9E4 ; LV # Lo HANGUL SYLLABLE MAE BA00 ; LV # Lo HANGUL SYLLABLE MYA BA1C ; LV # Lo HANGUL SYLLABLE MYAE BA38 ; LV # Lo HANGUL SYLLABLE MEO BA54 ; LV # Lo HANGUL SYLLABLE ME BA70 ; LV # Lo HANGUL SYLLABLE MYEO BA8C ; LV # Lo HANGUL SYLLABLE MYE BAA8 ; LV # Lo HANGUL SYLLABLE MO BAC4 ; LV # Lo HANGUL SYLLABLE MWA BAE0 ; LV # Lo HANGUL SYLLABLE MWAE BAFC ; LV # Lo HANGUL SYLLABLE MOE BB18 ; LV # Lo HANGUL SYLLABLE MYO BB34 ; LV # Lo HANGUL SYLLABLE MU BB50 ; LV # Lo HANGUL SYLLABLE MWEO BB6C ; LV # Lo HANGUL SYLLABLE MWE BB88 ; LV # Lo HANGUL SYLLABLE MWI BBA4 ; LV # Lo HANGUL SYLLABLE MYU BBC0 ; LV # Lo HANGUL SYLLABLE MEU BBDC ; LV # Lo HANGUL SYLLABLE MYI BBF8 ; LV # Lo HANGUL SYLLABLE MI BC14 ; LV # Lo HANGUL SYLLABLE BA BC30 ; LV # Lo HANGUL SYLLABLE BAE BC4C ; LV # Lo HANGUL SYLLABLE BYA BC68 ; LV # Lo HANGUL SYLLABLE BYAE BC84 ; LV # Lo HANGUL SYLLABLE BEO BCA0 ; LV # Lo HANGUL SYLLABLE BE BCBC ; LV # Lo HANGUL SYLLABLE BYEO BCD8 ; LV # Lo HANGUL SYLLABLE BYE BCF4 ; LV # Lo HANGUL SYLLABLE BO BD10 ; LV # Lo HANGUL SYLLABLE BWA BD2C ; LV # Lo HANGUL SYLLABLE BWAE BD48 ; LV # Lo HANGUL SYLLABLE BOE BD64 ; LV # Lo HANGUL SYLLABLE BYO BD80 ; LV # Lo HANGUL SYLLABLE BU BD9C ; LV # Lo HANGUL SYLLABLE BWEO BDB8 ; LV # Lo HANGUL SYLLABLE BWE BDD4 ; LV # Lo HANGUL SYLLABLE BWI BDF0 ; LV # Lo HANGUL SYLLABLE BYU BE0C ; LV # Lo HANGUL SYLLABLE BEU BE28 ; LV # Lo HANGUL SYLLABLE BYI BE44 ; LV # Lo HANGUL SYLLABLE BI BE60 ; LV # Lo HANGUL SYLLABLE BBA BE7C ; LV # Lo HANGUL SYLLABLE BBAE BE98 ; LV # Lo HANGUL SYLLABLE BBYA BEB4 ; LV # Lo HANGUL SYLLABLE BBYAE BED0 ; LV # Lo HANGUL SYLLABLE BBEO BEEC ; LV # Lo HANGUL SYLLABLE BBE BF08 ; LV # Lo HANGUL SYLLABLE BBYEO BF24 ; LV # Lo HANGUL SYLLABLE BBYE BF40 ; LV # Lo HANGUL SYLLABLE BBO BF5C ; LV # Lo HANGUL SYLLABLE BBWA BF78 ; LV # Lo HANGUL SYLLABLE BBWAE BF94 ; LV # Lo HANGUL SYLLABLE BBOE BFB0 ; LV # Lo HANGUL SYLLABLE BBYO BFCC ; LV # Lo HANGUL SYLLABLE BBU BFE8 ; LV # Lo HANGUL SYLLABLE BBWEO C004 ; LV # Lo HANGUL 
SYLLABLE BBWE C020 ; LV # Lo HANGUL SYLLABLE BBWI C03C ; LV # Lo HANGUL SYLLABLE BBYU C058 ; LV # Lo HANGUL SYLLABLE BBEU C074 ; LV # Lo HANGUL SYLLABLE BBYI C090 ; LV # Lo HANGUL SYLLABLE BBI C0AC ; LV # Lo HANGUL SYLLABLE SA C0C8 ; LV # Lo HANGUL SYLLABLE SAE C0E4 ; LV # Lo HANGUL SYLLABLE SYA C100 ; LV # Lo HANGUL SYLLABLE SYAE C11C ; LV # Lo HANGUL SYLLABLE SEO C138 ; LV # Lo HANGUL SYLLABLE SE C154 ; LV # Lo HANGUL SYLLABLE SYEO C170 ; LV # Lo HANGUL SYLLABLE SYE C18C ; LV # Lo HANGUL SYLLABLE SO C1A8 ; LV # Lo HANGUL SYLLABLE SWA C1C4 ; LV # Lo HANGUL SYLLABLE SWAE C1E0 ; LV # Lo HANGUL SYLLABLE SOE C1FC ; LV # Lo HANGUL SYLLABLE SYO C218 ; LV # Lo HANGUL SYLLABLE SU C234 ; LV # Lo HANGUL SYLLABLE SWEO C250 ; LV # Lo HANGUL SYLLABLE SWE C26C ; LV # Lo HANGUL SYLLABLE SWI C288 ; LV # Lo HANGUL SYLLABLE SYU C2A4 ; LV # Lo HANGUL SYLLABLE SEU C2C0 ; LV # Lo HANGUL SYLLABLE SYI C2DC ; LV # Lo HANGUL SYLLABLE SI C2F8 ; LV # Lo HANGUL SYLLABLE SSA C314 ; LV # Lo HANGUL SYLLABLE SSAE C330 ; LV # Lo HANGUL SYLLABLE SSYA C34C ; LV # Lo HANGUL SYLLABLE SSYAE C368 ; LV # Lo HANGUL SYLLABLE SSEO C384 ; LV # Lo HANGUL SYLLABLE SSE C3A0 ; LV # Lo HANGUL SYLLABLE SSYEO C3BC ; LV # Lo HANGUL SYLLABLE SSYE C3D8 ; LV # Lo HANGUL SYLLABLE SSO C3F4 ; LV # Lo HANGUL SYLLABLE SSWA C410 ; LV # Lo HANGUL SYLLABLE SSWAE C42C ; LV # Lo HANGUL SYLLABLE SSOE C448 ; LV # Lo HANGUL SYLLABLE SSYO C464 ; LV # Lo HANGUL SYLLABLE SSU C480 ; LV # Lo HANGUL SYLLABLE SSWEO C49C ; LV # Lo HANGUL SYLLABLE SSWE C4B8 ; LV # Lo HANGUL SYLLABLE SSWI C4D4 ; LV # Lo HANGUL SYLLABLE SSYU C4F0 ; LV # Lo HANGUL SYLLABLE SSEU C50C ; LV # Lo HANGUL SYLLABLE SSYI C528 ; LV # Lo HANGUL SYLLABLE SSI C544 ; LV # Lo HANGUL SYLLABLE A C560 ; LV # Lo HANGUL SYLLABLE AE C57C ; LV # Lo HANGUL SYLLABLE YA C598 ; LV # Lo HANGUL SYLLABLE YAE C5B4 ; LV # Lo HANGUL SYLLABLE EO C5D0 ; LV # Lo HANGUL SYLLABLE E C5EC ; LV # Lo HANGUL SYLLABLE YEO C608 ; LV # Lo HANGUL SYLLABLE YE C624 ; LV # Lo HANGUL SYLLABLE O C640 ; LV # 
Lo HANGUL SYLLABLE WA C65C ; LV # Lo HANGUL SYLLABLE WAE C678 ; LV # Lo HANGUL SYLLABLE OE C694 ; LV # Lo HANGUL SYLLABLE YO C6B0 ; LV # Lo HANGUL SYLLABLE U C6CC ; LV # Lo HANGUL SYLLABLE WEO C6E8 ; LV # Lo HANGUL SYLLABLE WE C704 ; LV # Lo HANGUL SYLLABLE WI C720 ; LV # Lo HANGUL SYLLABLE YU C73C ; LV # Lo HANGUL SYLLABLE EU C758 ; LV # Lo HANGUL SYLLABLE YI C774 ; LV # Lo HANGUL SYLLABLE I C790 ; LV # Lo HANGUL SYLLABLE JA C7AC ; LV # Lo HANGUL SYLLABLE JAE C7C8 ; LV # Lo HANGUL SYLLABLE JYA C7E4 ; LV # Lo HANGUL SYLLABLE JYAE C800 ; LV # Lo HANGUL SYLLABLE JEO C81C ; LV # Lo HANGUL SYLLABLE JE C838 ; LV # Lo HANGUL SYLLABLE JYEO C854 ; LV # Lo HANGUL SYLLABLE JYE C870 ; LV # Lo HANGUL SYLLABLE JO C88C ; LV # Lo HANGUL SYLLABLE JWA C8A8 ; LV # Lo HANGUL SYLLABLE JWAE C8C4 ; LV # Lo HANGUL SYLLABLE JOE C8E0 ; LV # Lo HANGUL SYLLABLE JYO C8FC ; LV # Lo HANGUL SYLLABLE JU C918 ; LV # Lo HANGUL SYLLABLE JWEO C934 ; LV # Lo HANGUL SYLLABLE JWE C950 ; LV # Lo HANGUL SYLLABLE JWI C96C ; LV # Lo HANGUL SYLLABLE JYU C988 ; LV # Lo HANGUL SYLLABLE JEU C9A4 ; LV # Lo HANGUL SYLLABLE JYI C9C0 ; LV # Lo HANGUL SYLLABLE JI C9DC ; LV # Lo HANGUL SYLLABLE JJA C9F8 ; LV # Lo HANGUL SYLLABLE JJAE CA14 ; LV # Lo HANGUL SYLLABLE JJYA CA30 ; LV # Lo HANGUL SYLLABLE JJYAE CA4C ; LV # Lo HANGUL SYLLABLE JJEO CA68 ; LV # Lo HANGUL SYLLABLE JJE CA84 ; LV # Lo HANGUL SYLLABLE JJYEO CAA0 ; LV # Lo HANGUL SYLLABLE JJYE CABC ; LV # Lo HANGUL SYLLABLE JJO CAD8 ; LV # Lo HANGUL SYLLABLE JJWA CAF4 ; LV # Lo HANGUL SYLLABLE JJWAE CB10 ; LV # Lo HANGUL SYLLABLE JJOE CB2C ; LV # Lo HANGUL SYLLABLE JJYO CB48 ; LV # Lo HANGUL SYLLABLE JJU CB64 ; LV # Lo HANGUL SYLLABLE JJWEO CB80 ; LV # Lo HANGUL SYLLABLE JJWE CB9C ; LV # Lo HANGUL SYLLABLE JJWI CBB8 ; LV # Lo HANGUL SYLLABLE JJYU CBD4 ; LV # Lo HANGUL SYLLABLE JJEU CBF0 ; LV # Lo HANGUL SYLLABLE JJYI CC0C ; LV # Lo HANGUL SYLLABLE JJI CC28 ; LV # Lo HANGUL SYLLABLE CA CC44 ; LV # Lo HANGUL SYLLABLE CAE CC60 ; LV # Lo HANGUL SYLLABLE CYA CC7C ; LV 
# Lo HANGUL SYLLABLE CYAE CC98 ; LV # Lo HANGUL SYLLABLE CEO CCB4 ; LV # Lo HANGUL SYLLABLE CE CCD0 ; LV # Lo HANGUL SYLLABLE CYEO CCEC ; LV # Lo HANGUL SYLLABLE CYE CD08 ; LV # Lo HANGUL SYLLABLE CO CD24 ; LV # Lo HANGUL SYLLABLE CWA CD40 ; LV # Lo HANGUL SYLLABLE CWAE CD5C ; LV # Lo HANGUL SYLLABLE COE CD78 ; LV # Lo HANGUL SYLLABLE CYO CD94 ; LV # Lo HANGUL SYLLABLE CU CDB0 ; LV # Lo HANGUL SYLLABLE CWEO CDCC ; LV # Lo HANGUL SYLLABLE CWE CDE8 ; LV # Lo HANGUL SYLLABLE CWI CE04 ; LV # Lo HANGUL SYLLABLE CYU CE20 ; LV # Lo HANGUL SYLLABLE CEU CE3C ; LV # Lo HANGUL SYLLABLE CYI CE58 ; LV # Lo HANGUL SYLLABLE CI CE74 ; LV # Lo HANGUL SYLLABLE KA CE90 ; LV # Lo HANGUL SYLLABLE KAE CEAC ; LV # Lo HANGUL SYLLABLE KYA CEC8 ; LV # Lo HANGUL SYLLABLE KYAE CEE4 ; LV # Lo HANGUL SYLLABLE KEO CF00 ; LV # Lo HANGUL SYLLABLE KE CF1C ; LV # Lo HANGUL SYLLABLE KYEO CF38 ; LV # Lo HANGUL SYLLABLE KYE CF54 ; LV # Lo HANGUL SYLLABLE KO CF70 ; LV # Lo HANGUL SYLLABLE KWA CF8C ; LV # Lo HANGUL SYLLABLE KWAE CFA8 ; LV # Lo HANGUL SYLLABLE KOE CFC4 ; LV # Lo HANGUL SYLLABLE KYO CFE0 ; LV # Lo HANGUL SYLLABLE KU CFFC ; LV # Lo HANGUL SYLLABLE KWEO D018 ; LV # Lo HANGUL SYLLABLE KWE D034 ; LV # Lo HANGUL SYLLABLE KWI D050 ; LV # Lo HANGUL SYLLABLE KYU D06C ; LV # Lo HANGUL SYLLABLE KEU D088 ; LV # Lo HANGUL SYLLABLE KYI D0A4 ; LV # Lo HANGUL SYLLABLE KI D0C0 ; LV # Lo HANGUL SYLLABLE TA D0DC ; LV # Lo HANGUL SYLLABLE TAE D0F8 ; LV # Lo HANGUL SYLLABLE TYA D114 ; LV # Lo HANGUL SYLLABLE TYAE D130 ; LV # Lo HANGUL SYLLABLE TEO D14C ; LV # Lo HANGUL SYLLABLE TE D168 ; LV # Lo HANGUL SYLLABLE TYEO D184 ; LV # Lo HANGUL SYLLABLE TYE D1A0 ; LV # Lo HANGUL SYLLABLE TO D1BC ; LV # Lo HANGUL SYLLABLE TWA D1D8 ; LV # Lo HANGUL SYLLABLE TWAE D1F4 ; LV # Lo HANGUL SYLLABLE TOE D210 ; LV # Lo HANGUL SYLLABLE TYO D22C ; LV # Lo HANGUL SYLLABLE TU D248 ; LV # Lo HANGUL SYLLABLE TWEO D264 ; LV # Lo HANGUL SYLLABLE TWE D280 ; LV # Lo HANGUL SYLLABLE TWI D29C ; LV # Lo HANGUL SYLLABLE TYU D2B8 ; LV # Lo 
HANGUL SYLLABLE TEU D2D4 ; LV # Lo HANGUL SYLLABLE TYI D2F0 ; LV # Lo HANGUL SYLLABLE TI D30C ; LV # Lo HANGUL SYLLABLE PA D328 ; LV # Lo HANGUL SYLLABLE PAE D344 ; LV # Lo HANGUL SYLLABLE PYA D360 ; LV # Lo HANGUL SYLLABLE PYAE D37C ; LV # Lo HANGUL SYLLABLE PEO D398 ; LV # Lo HANGUL SYLLABLE PE D3B4 ; LV # Lo HANGUL SYLLABLE PYEO D3D0 ; LV # Lo HANGUL SYLLABLE PYE D3EC ; LV # Lo HANGUL SYLLABLE PO D408 ; LV # Lo HANGUL SYLLABLE PWA D424 ; LV # Lo HANGUL SYLLABLE PWAE D440 ; LV # Lo HANGUL SYLLABLE POE D45C ; LV # Lo HANGUL SYLLABLE PYO D478 ; LV # Lo HANGUL SYLLABLE PU D494 ; LV # Lo HANGUL SYLLABLE PWEO D4B0 ; LV # Lo HANGUL SYLLABLE PWE D4CC ; LV # Lo HANGUL SYLLABLE PWI D4E8 ; LV # Lo HANGUL SYLLABLE PYU D504 ; LV # Lo HANGUL SYLLABLE PEU D520 ; LV # Lo HANGUL SYLLABLE PYI D53C ; LV # Lo HANGUL SYLLABLE PI D558 ; LV # Lo HANGUL SYLLABLE HA D574 ; LV # Lo HANGUL SYLLABLE HAE D590 ; LV # Lo HANGUL SYLLABLE HYA D5AC ; LV # Lo HANGUL SYLLABLE HYAE D5C8 ; LV # Lo HANGUL SYLLABLE HEO D5E4 ; LV # Lo HANGUL SYLLABLE HE D600 ; LV # Lo HANGUL SYLLABLE HYEO D61C ; LV # Lo HANGUL SYLLABLE HYE D638 ; LV # Lo HANGUL SYLLABLE HO D654 ; LV # Lo HANGUL SYLLABLE HWA D670 ; LV # Lo HANGUL SYLLABLE HWAE D68C ; LV # Lo HANGUL SYLLABLE HOE D6A8 ; LV # Lo HANGUL SYLLABLE HYO D6C4 ; LV # Lo HANGUL SYLLABLE HU D6E0 ; LV # Lo HANGUL SYLLABLE HWEO D6FC ; LV # Lo HANGUL SYLLABLE HWE D718 ; LV # Lo HANGUL SYLLABLE HWI D734 ; LV # Lo HANGUL SYLLABLE HYU D750 ; LV # Lo HANGUL SYLLABLE HEU D76C ; LV # Lo HANGUL SYLLABLE HYI D788 ; LV # Lo HANGUL SYLLABLE HI # Total code points: 399 # ================================================ AC01..AC1B ; LVT # Lo [27] HANGUL SYLLABLE GAG..HANGUL SYLLABLE GAH AC1D..AC37 ; LVT # Lo [27] HANGUL SYLLABLE GAEG..HANGUL SYLLABLE GAEH AC39..AC53 ; LVT # Lo [27] HANGUL SYLLABLE GYAG..HANGUL SYLLABLE GYAH AC55..AC6F ; LVT # Lo [27] HANGUL SYLLABLE GYAEG..HANGUL SYLLABLE GYAEH AC71..AC8B ; LVT # Lo [27] HANGUL SYLLABLE GEOG..HANGUL SYLLABLE GEOH AC8D..ACA7 ; LVT 
# Lo [27] HANGUL SYLLABLE GEG..HANGUL SYLLABLE GEH ACA9..ACC3 ; LVT # Lo [27] HANGUL SYLLABLE GYEOG..HANGUL SYLLABLE GYEOH ACC5..ACDF ; LVT # Lo [27] HANGUL SYLLABLE GYEG..HANGUL SYLLABLE GYEH ACE1..ACFB ; LVT # Lo [27] HANGUL SYLLABLE GOG..HANGUL SYLLABLE GOH ACFD..AD17 ; LVT # Lo [27] HANGUL SYLLABLE GWAG..HANGUL SYLLABLE GWAH AD19..AD33 ; LVT # Lo [27] HANGUL SYLLABLE GWAEG..HANGUL SYLLABLE GWAEH AD35..AD4F ; LVT # Lo [27] HANGUL SYLLABLE GOEG..HANGUL SYLLABLE GOEH AD51..AD6B ; LVT # Lo [27] HANGUL SYLLABLE GYOG..HANGUL SYLLABLE GYOH AD6D..AD87 ; LVT # Lo [27] HANGUL SYLLABLE GUG..HANGUL SYLLABLE GUH AD89..ADA3 ; LVT # Lo [27] HANGUL SYLLABLE GWEOG..HANGUL SYLLABLE GWEOH ADA5..ADBF ; LVT # Lo [27] HANGUL SYLLABLE GWEG..HANGUL SYLLABLE GWEH ADC1..ADDB ; LVT # Lo [27] HANGUL SYLLABLE GWIG..HANGUL SYLLABLE GWIH ADDD..ADF7 ; LVT # Lo [27] HANGUL SYLLABLE GYUG..HANGUL SYLLABLE GYUH ADF9..AE13 ; LVT # Lo [27] HANGUL SYLLABLE GEUG..HANGUL SYLLABLE GEUH AE15..AE2F ; LVT # Lo [27] HANGUL SYLLABLE GYIG..HANGUL SYLLABLE GYIH AE31..AE4B ; LVT # Lo [27] HANGUL SYLLABLE GIG..HANGUL SYLLABLE GIH AE4D..AE67 ; LVT # Lo [27] HANGUL SYLLABLE GGAG..HANGUL SYLLABLE GGAH AE69..AE83 ; LVT # Lo [27] HANGUL SYLLABLE GGAEG..HANGUL SYLLABLE GGAEH AE85..AE9F ; LVT # Lo [27] HANGUL SYLLABLE GGYAG..HANGUL SYLLABLE GGYAH AEA1..AEBB ; LVT # Lo [27] HANGUL SYLLABLE GGYAEG..HANGUL SYLLABLE GGYAEH AEBD..AED7 ; LVT # Lo [27] HANGUL SYLLABLE GGEOG..HANGUL SYLLABLE GGEOH AED9..AEF3 ; LVT # Lo [27] HANGUL SYLLABLE GGEG..HANGUL SYLLABLE GGEH AEF5..AF0F ; LVT # Lo [27] HANGUL SYLLABLE GGYEOG..HANGUL SYLLABLE GGYEOH AF11..AF2B ; LVT # Lo [27] HANGUL SYLLABLE GGYEG..HANGUL SYLLABLE GGYEH AF2D..AF47 ; LVT # Lo [27] HANGUL SYLLABLE GGOG..HANGUL SYLLABLE GGOH AF49..AF63 ; LVT # Lo [27] HANGUL SYLLABLE GGWAG..HANGUL SYLLABLE GGWAH AF65..AF7F ; LVT # Lo [27] HANGUL SYLLABLE GGWAEG..HANGUL SYLLABLE GGWAEH AF81..AF9B ; LVT # Lo [27] HANGUL SYLLABLE GGOEG..HANGUL SYLLABLE GGOEH AF9D..AFB7 ; LVT # Lo [27] HANGUL 
SYLLABLE GGYOG..HANGUL SYLLABLE GGYOH AFB9..AFD3 ; LVT # Lo [27] HANGUL SYLLABLE GGUG..HANGUL SYLLABLE GGUH AFD5..AFEF ; LVT # Lo [27] HANGUL SYLLABLE GGWEOG..HANGUL SYLLABLE GGWEOH AFF1..B00B ; LVT # Lo [27] HANGUL SYLLABLE GGWEG..HANGUL SYLLABLE GGWEH B00D..B027 ; LVT # Lo [27] HANGUL SYLLABLE GGWIG..HANGUL SYLLABLE GGWIH B029..B043 ; LVT # Lo [27] HANGUL SYLLABLE GGYUG..HANGUL SYLLABLE GGYUH B045..B05F ; LVT # Lo [27] HANGUL SYLLABLE GGEUG..HANGUL SYLLABLE GGEUH B061..B07B ; LVT # Lo [27] HANGUL SYLLABLE GGYIG..HANGUL SYLLABLE GGYIH B07D..B097 ; LVT # Lo [27] HANGUL SYLLABLE GGIG..HANGUL SYLLABLE GGIH B099..B0B3 ; LVT # Lo [27] HANGUL SYLLABLE NAG..HANGUL SYLLABLE NAH B0B5..B0CF ; LVT # Lo [27] HANGUL SYLLABLE NAEG..HANGUL SYLLABLE NAEH B0D1..B0EB ; LVT # Lo [27] HANGUL SYLLABLE NYAG..HANGUL SYLLABLE NYAH B0ED..B107 ; LVT # Lo [27] HANGUL SYLLABLE NYAEG..HANGUL SYLLABLE NYAEH B109..B123 ; LVT # Lo [27] HANGUL SYLLABLE NEOG..HANGUL SYLLABLE NEOH B125..B13F ; LVT # Lo [27] HANGUL SYLLABLE NEG..HANGUL SYLLABLE NEH B141..B15B ; LVT # Lo [27] HANGUL SYLLABLE NYEOG..HANGUL SYLLABLE NYEOH B15D..B177 ; LVT # Lo [27] HANGUL SYLLABLE NYEG..HANGUL SYLLABLE NYEH B179..B193 ; LVT # Lo [27] HANGUL SYLLABLE NOG..HANGUL SYLLABLE NOH B195..B1AF ; LVT # Lo [27] HANGUL SYLLABLE NWAG..HANGUL SYLLABLE NWAH B1B1..B1CB ; LVT # Lo [27] HANGUL SYLLABLE NWAEG..HANGUL SYLLABLE NWAEH B1CD..B1E7 ; LVT # Lo [27] HANGUL SYLLABLE NOEG..HANGUL SYLLABLE NOEH B1E9..B203 ; LVT # Lo [27] HANGUL SYLLABLE NYOG..HANGUL SYLLABLE NYOH B205..B21F ; LVT # Lo [27] HANGUL SYLLABLE NUG..HANGUL SYLLABLE NUH B221..B23B ; LVT # Lo [27] HANGUL SYLLABLE NWEOG..HANGUL SYLLABLE NWEOH B23D..B257 ; LVT # Lo [27] HANGUL SYLLABLE NWEG..HANGUL SYLLABLE NWEH B259..B273 ; LVT # Lo [27] HANGUL SYLLABLE NWIG..HANGUL SYLLABLE NWIH B275..B28F ; LVT # Lo [27] HANGUL SYLLABLE NYUG..HANGUL SYLLABLE NYUH B291..B2AB ; LVT # Lo [27] HANGUL SYLLABLE NEUG..HANGUL SYLLABLE NEUH B2AD..B2C7 ; LVT # Lo [27] HANGUL SYLLABLE NYIG..HANGUL 
SYLLABLE NYIH B2C9..B2E3 ; LVT # Lo [27] HANGUL SYLLABLE NIG..HANGUL SYLLABLE NIH B2E5..B2FF ; LVT # Lo [27] HANGUL SYLLABLE DAG..HANGUL SYLLABLE DAH B301..B31B ; LVT # Lo [27] HANGUL SYLLABLE DAEG..HANGUL SYLLABLE DAEH B31D..B337 ; LVT # Lo [27] HANGUL SYLLABLE DYAG..HANGUL SYLLABLE DYAH B339..B353 ; LVT # Lo [27] HANGUL SYLLABLE DYAEG..HANGUL SYLLABLE DYAEH B355..B36F ; LVT # Lo [27] HANGUL SYLLABLE DEOG..HANGUL SYLLABLE DEOH B371..B38B ; LVT # Lo [27] HANGUL SYLLABLE DEG..HANGUL SYLLABLE DEH B38D..B3A7 ; LVT # Lo [27] HANGUL SYLLABLE DYEOG..HANGUL SYLLABLE DYEOH B3A9..B3C3 ; LVT # Lo [27] HANGUL SYLLABLE DYEG..HANGUL SYLLABLE DYEH B3C5..B3DF ; LVT # Lo [27] HANGUL SYLLABLE DOG..HANGUL SYLLABLE DOH B3E1..B3FB ; LVT # Lo [27] HANGUL SYLLABLE DWAG..HANGUL SYLLABLE DWAH B3FD..B417 ; LVT # Lo [27] HANGUL SYLLABLE DWAEG..HANGUL SYLLABLE DWAEH B419..B433 ; LVT # Lo [27] HANGUL SYLLABLE DOEG..HANGUL SYLLABLE DOEH B435..B44F ; LVT # Lo [27] HANGUL SYLLABLE DYOG..HANGUL SYLLABLE DYOH B451..B46B ; LVT # Lo [27] HANGUL SYLLABLE DUG..HANGUL SYLLABLE DUH B46D..B487 ; LVT # Lo [27] HANGUL SYLLABLE DWEOG..HANGUL SYLLABLE DWEOH B489..B4A3 ; LVT # Lo [27] HANGUL SYLLABLE DWEG..HANGUL SYLLABLE DWEH B4A5..B4BF ; LVT # Lo [27] HANGUL SYLLABLE DWIG..HANGUL SYLLABLE DWIH B4C1..B4DB ; LVT # Lo [27] HANGUL SYLLABLE DYUG..HANGUL SYLLABLE DYUH B4DD..B4F7 ; LVT # Lo [27] HANGUL SYLLABLE DEUG..HANGUL SYLLABLE DEUH B4F9..B513 ; LVT # Lo [27] HANGUL SYLLABLE DYIG..HANGUL SYLLABLE DYIH B515..B52F ; LVT # Lo [27] HANGUL SYLLABLE DIG..HANGUL SYLLABLE DIH B531..B54B ; LVT # Lo [27] HANGUL SYLLABLE DDAG..HANGUL SYLLABLE DDAH B54D..B567 ; LVT # Lo [27] HANGUL SYLLABLE DDAEG..HANGUL SYLLABLE DDAEH B569..B583 ; LVT # Lo [27] HANGUL SYLLABLE DDYAG..HANGUL SYLLABLE DDYAH B585..B59F ; LVT # Lo [27] HANGUL SYLLABLE DDYAEG..HANGUL SYLLABLE DDYAEH B5A1..B5BB ; LVT # Lo [27] HANGUL SYLLABLE DDEOG..HANGUL SYLLABLE DDEOH B5BD..B5D7 ; LVT # Lo [27] HANGUL SYLLABLE DDEG..HANGUL SYLLABLE DDEH B5D9..B5F3 ; LVT # 
Lo [27] HANGUL SYLLABLE DDYEOG..HANGUL SYLLABLE DDYEOH B5F5..B60F ; LVT # Lo [27] HANGUL SYLLABLE DDYEG..HANGUL SYLLABLE DDYEH B611..B62B ; LVT # Lo [27] HANGUL SYLLABLE DDOG..HANGUL SYLLABLE DDOH B62D..B647 ; LVT # Lo [27] HANGUL SYLLABLE DDWAG..HANGUL SYLLABLE DDWAH B649..B663 ; LVT # Lo [27] HANGUL SYLLABLE DDWAEG..HANGUL SYLLABLE DDWAEH B665..B67F ; LVT # Lo [27] HANGUL SYLLABLE DDOEG..HANGUL SYLLABLE DDOEH B681..B69B ; LVT # Lo [27] HANGUL SYLLABLE DDYOG..HANGUL SYLLABLE DDYOH B69D..B6B7 ; LVT # Lo [27] HANGUL SYLLABLE DDUG..HANGUL SYLLABLE DDUH B6B9..B6D3 ; LVT # Lo [27] HANGUL SYLLABLE DDWEOG..HANGUL SYLLABLE DDWEOH B6D5..B6EF ; LVT # Lo [27] HANGUL SYLLABLE DDWEG..HANGUL SYLLABLE DDWEH B6F1..B70B ; LVT # Lo [27] HANGUL SYLLABLE DDWIG..HANGUL SYLLABLE DDWIH B70D..B727 ; LVT # Lo [27] HANGUL SYLLABLE DDYUG..HANGUL SYLLABLE DDYUH B729..B743 ; LVT # Lo [27] HANGUL SYLLABLE DDEUG..HANGUL SYLLABLE DDEUH B745..B75F ; LVT # Lo [27] HANGUL SYLLABLE DDYIG..HANGUL SYLLABLE DDYIH B761..B77B ; LVT # Lo [27] HANGUL SYLLABLE DDIG..HANGUL SYLLABLE DDIH B77D..B797 ; LVT # Lo [27] HANGUL SYLLABLE RAG..HANGUL SYLLABLE RAH B799..B7B3 ; LVT # Lo [27] HANGUL SYLLABLE RAEG..HANGUL SYLLABLE RAEH B7B5..B7CF ; LVT # Lo [27] HANGUL SYLLABLE RYAG..HANGUL SYLLABLE RYAH B7D1..B7EB ; LVT # Lo [27] HANGUL SYLLABLE RYAEG..HANGUL SYLLABLE RYAEH B7ED..B807 ; LVT # Lo [27] HANGUL SYLLABLE REOG..HANGUL SYLLABLE REOH B809..B823 ; LVT # Lo [27] HANGUL SYLLABLE REG..HANGUL SYLLABLE REH B825..B83F ; LVT # Lo [27] HANGUL SYLLABLE RYEOG..HANGUL SYLLABLE RYEOH B841..B85B ; LVT # Lo [27] HANGUL SYLLABLE RYEG..HANGUL SYLLABLE RYEH B85D..B877 ; LVT # Lo [27] HANGUL SYLLABLE ROG..HANGUL SYLLABLE ROH B879..B893 ; LVT # Lo [27] HANGUL SYLLABLE RWAG..HANGUL SYLLABLE RWAH B895..B8AF ; LVT # Lo [27] HANGUL SYLLABLE RWAEG..HANGUL SYLLABLE RWAEH B8B1..B8CB ; LVT # Lo [27] HANGUL SYLLABLE ROEG..HANGUL SYLLABLE ROEH B8CD..B8E7 ; LVT # Lo [27] HANGUL SYLLABLE RYOG..HANGUL SYLLABLE RYOH B8E9..B903 ; LVT # Lo [27] 
HANGUL SYLLABLE RUG..HANGUL SYLLABLE RUH B905..B91F ; LVT # Lo [27] HANGUL SYLLABLE RWEOG..HANGUL SYLLABLE RWEOH B921..B93B ; LVT # Lo [27] HANGUL SYLLABLE RWEG..HANGUL SYLLABLE RWEH B93D..B957 ; LVT # Lo [27] HANGUL SYLLABLE RWIG..HANGUL SYLLABLE RWIH B959..B973 ; LVT # Lo [27] HANGUL SYLLABLE RYUG..HANGUL SYLLABLE RYUH B975..B98F ; LVT # Lo [27] HANGUL SYLLABLE REUG..HANGUL SYLLABLE REUH B991..B9AB ; LVT # Lo [27] HANGUL SYLLABLE RYIG..HANGUL SYLLABLE RYIH B9AD..B9C7 ; LVT # Lo [27] HANGUL SYLLABLE RIG..HANGUL SYLLABLE RIH B9C9..B9E3 ; LVT # Lo [27] HANGUL SYLLABLE MAG..HANGUL SYLLABLE MAH B9E5..B9FF ; LVT # Lo [27] HANGUL SYLLABLE MAEG..HANGUL SYLLABLE MAEH BA01..BA1B ; LVT # Lo [27] HANGUL SYLLABLE MYAG..HANGUL SYLLABLE MYAH BA1D..BA37 ; LVT # Lo [27] HANGUL SYLLABLE MYAEG..HANGUL SYLLABLE MYAEH BA39..BA53 ; LVT # Lo [27] HANGUL SYLLABLE MEOG..HANGUL SYLLABLE MEOH BA55..BA6F ; LVT # Lo [27] HANGUL SYLLABLE MEG..HANGUL SYLLABLE MEH BA71..BA8B ; LVT # Lo [27] HANGUL SYLLABLE MYEOG..HANGUL SYLLABLE MYEOH BA8D..BAA7 ; LVT # Lo [27] HANGUL SYLLABLE MYEG..HANGUL SYLLABLE MYEH BAA9..BAC3 ; LVT # Lo [27] HANGUL SYLLABLE MOG..HANGUL SYLLABLE MOH BAC5..BADF ; LVT # Lo [27] HANGUL SYLLABLE MWAG..HANGUL SYLLABLE MWAH BAE1..BAFB ; LVT # Lo [27] HANGUL SYLLABLE MWAEG..HANGUL SYLLABLE MWAEH BAFD..BB17 ; LVT # Lo [27] HANGUL SYLLABLE MOEG..HANGUL SYLLABLE MOEH BB19..BB33 ; LVT # Lo [27] HANGUL SYLLABLE MYOG..HANGUL SYLLABLE MYOH BB35..BB4F ; LVT # Lo [27] HANGUL SYLLABLE MUG..HANGUL SYLLABLE MUH BB51..BB6B ; LVT # Lo [27] HANGUL SYLLABLE MWEOG..HANGUL SYLLABLE MWEOH BB6D..BB87 ; LVT # Lo [27] HANGUL SYLLABLE MWEG..HANGUL SYLLABLE MWEH BB89..BBA3 ; LVT # Lo [27] HANGUL SYLLABLE MWIG..HANGUL SYLLABLE MWIH BBA5..BBBF ; LVT # Lo [27] HANGUL SYLLABLE MYUG..HANGUL SYLLABLE MYUH BBC1..BBDB ; LVT # Lo [27] HANGUL SYLLABLE MEUG..HANGUL SYLLABLE MEUH BBDD..BBF7 ; LVT # Lo [27] HANGUL SYLLABLE MYIG..HANGUL SYLLABLE MYIH BBF9..BC13 ; LVT # Lo [27] HANGUL SYLLABLE MIG..HANGUL SYLLABLE MIH 
BC15..BC2F ; LVT # Lo [27] HANGUL SYLLABLE BAG..HANGUL SYLLABLE BAH BC31..BC4B ; LVT # Lo [27] HANGUL SYLLABLE BAEG..HANGUL SYLLABLE BAEH BC4D..BC67 ; LVT # Lo [27] HANGUL SYLLABLE BYAG..HANGUL SYLLABLE BYAH BC69..BC83 ; LVT # Lo [27] HANGUL SYLLABLE BYAEG..HANGUL SYLLABLE BYAEH BC85..BC9F ; LVT # Lo [27] HANGUL SYLLABLE BEOG..HANGUL SYLLABLE BEOH BCA1..BCBB ; LVT # Lo [27] HANGUL SYLLABLE BEG..HANGUL SYLLABLE BEH BCBD..BCD7 ; LVT # Lo [27] HANGUL SYLLABLE BYEOG..HANGUL SYLLABLE BYEOH BCD9..BCF3 ; LVT # Lo [27] HANGUL SYLLABLE BYEG..HANGUL SYLLABLE BYEH BCF5..BD0F ; LVT # Lo [27] HANGUL SYLLABLE BOG..HANGUL SYLLABLE BOH BD11..BD2B ; LVT # Lo [27] HANGUL SYLLABLE BWAG..HANGUL SYLLABLE BWAH BD2D..BD47 ; LVT # Lo [27] HANGUL SYLLABLE BWAEG..HANGUL SYLLABLE BWAEH BD49..BD63 ; LVT # Lo [27] HANGUL SYLLABLE BOEG..HANGUL SYLLABLE BOEH BD65..BD7F ; LVT # Lo [27] HANGUL SYLLABLE BYOG..HANGUL SYLLABLE BYOH BD81..BD9B ; LVT # Lo [27] HANGUL SYLLABLE BUG..HANGUL SYLLABLE BUH BD9D..BDB7 ; LVT # Lo [27] HANGUL SYLLABLE BWEOG..HANGUL SYLLABLE BWEOH BDB9..BDD3 ; LVT # Lo [27] HANGUL SYLLABLE BWEG..HANGUL SYLLABLE BWEH BDD5..BDEF ; LVT # Lo [27] HANGUL SYLLABLE BWIG..HANGUL SYLLABLE BWIH BDF1..BE0B ; LVT # Lo [27] HANGUL SYLLABLE BYUG..HANGUL SYLLABLE BYUH BE0D..BE27 ; LVT # Lo [27] HANGUL SYLLABLE BEUG..HANGUL SYLLABLE BEUH BE29..BE43 ; LVT # Lo [27] HANGUL SYLLABLE BYIG..HANGUL SYLLABLE BYIH BE45..BE5F ; LVT # Lo [27] HANGUL SYLLABLE BIG..HANGUL SYLLABLE BIH BE61..BE7B ; LVT # Lo [27] HANGUL SYLLABLE BBAG..HANGUL SYLLABLE BBAH BE7D..BE97 ; LVT # Lo [27] HANGUL SYLLABLE BBAEG..HANGUL SYLLABLE BBAEH BE99..BEB3 ; LVT # Lo [27] HANGUL SYLLABLE BBYAG..HANGUL SYLLABLE BBYAH BEB5..BECF ; LVT # Lo [27] HANGUL SYLLABLE BBYAEG..HANGUL SYLLABLE BBYAEH BED1..BEEB ; LVT # Lo [27] HANGUL SYLLABLE BBEOG..HANGUL SYLLABLE BBEOH BEED..BF07 ; LVT # Lo [27] HANGUL SYLLABLE BBEG..HANGUL SYLLABLE BBEH BF09..BF23 ; LVT # Lo [27] HANGUL SYLLABLE BBYEOG..HANGUL SYLLABLE BBYEOH BF25..BF3F ; LVT # Lo [27] 
HANGUL SYLLABLE BBYEG..HANGUL SYLLABLE BBYEH BF41..BF5B ; LVT # Lo [27] HANGUL SYLLABLE BBOG..HANGUL SYLLABLE BBOH BF5D..BF77 ; LVT # Lo [27] HANGUL SYLLABLE BBWAG..HANGUL SYLLABLE BBWAH BF79..BF93 ; LVT # Lo [27] HANGUL SYLLABLE BBWAEG..HANGUL SYLLABLE BBWAEH BF95..BFAF ; LVT # Lo [27] HANGUL SYLLABLE BBOEG..HANGUL SYLLABLE BBOEH BFB1..BFCB ; LVT # Lo [27] HANGUL SYLLABLE BBYOG..HANGUL SYLLABLE BBYOH BFCD..BFE7 ; LVT # Lo [27] HANGUL SYLLABLE BBUG..HANGUL SYLLABLE BBUH BFE9..C003 ; LVT # Lo [27] HANGUL SYLLABLE BBWEOG..HANGUL SYLLABLE BBWEOH C005..C01F ; LVT # Lo [27] HANGUL SYLLABLE BBWEG..HANGUL SYLLABLE BBWEH C021..C03B ; LVT # Lo [27] HANGUL SYLLABLE BBWIG..HANGUL SYLLABLE BBWIH C03D..C057 ; LVT # Lo [27] HANGUL SYLLABLE BBYUG..HANGUL SYLLABLE BBYUH C059..C073 ; LVT # Lo [27] HANGUL SYLLABLE BBEUG..HANGUL SYLLABLE BBEUH C075..C08F ; LVT # Lo [27] HANGUL SYLLABLE BBYIG..HANGUL SYLLABLE BBYIH C091..C0AB ; LVT # Lo [27] HANGUL SYLLABLE BBIG..HANGUL SYLLABLE BBIH C0AD..C0C7 ; LVT # Lo [27] HANGUL SYLLABLE SAG..HANGUL SYLLABLE SAH C0C9..C0E3 ; LVT # Lo [27] HANGUL SYLLABLE SAEG..HANGUL SYLLABLE SAEH C0E5..C0FF ; LVT # Lo [27] HANGUL SYLLABLE SYAG..HANGUL SYLLABLE SYAH C101..C11B ; LVT # Lo [27] HANGUL SYLLABLE SYAEG..HANGUL SYLLABLE SYAEH C11D..C137 ; LVT # Lo [27] HANGUL SYLLABLE SEOG..HANGUL SYLLABLE SEOH C139..C153 ; LVT # Lo [27] HANGUL SYLLABLE SEG..HANGUL SYLLABLE SEH C155..C16F ; LVT # Lo [27] HANGUL SYLLABLE SYEOG..HANGUL SYLLABLE SYEOH C171..C18B ; LVT # Lo [27] HANGUL SYLLABLE SYEG..HANGUL SYLLABLE SYEH C18D..C1A7 ; LVT # Lo [27] HANGUL SYLLABLE SOG..HANGUL SYLLABLE SOH C1A9..C1C3 ; LVT # Lo [27] HANGUL SYLLABLE SWAG..HANGUL SYLLABLE SWAH C1C5..C1DF ; LVT # Lo [27] HANGUL SYLLABLE SWAEG..HANGUL SYLLABLE SWAEH C1E1..C1FB ; LVT # Lo [27] HANGUL SYLLABLE SOEG..HANGUL SYLLABLE SOEH C1FD..C217 ; LVT # Lo [27] HANGUL SYLLABLE SYOG..HANGUL SYLLABLE SYOH C219..C233 ; LVT # Lo [27] HANGUL SYLLABLE SUG..HANGUL SYLLABLE SUH C235..C24F ; LVT # Lo [27] HANGUL SYLLABLE 
SWEOG..HANGUL SYLLABLE SWEOH C251..C26B ; LVT # Lo [27] HANGUL SYLLABLE SWEG..HANGUL SYLLABLE SWEH C26D..C287 ; LVT # Lo [27] HANGUL SYLLABLE SWIG..HANGUL SYLLABLE SWIH C289..C2A3 ; LVT # Lo [27] HANGUL SYLLABLE SYUG..HANGUL SYLLABLE SYUH C2A5..C2BF ; LVT # Lo [27] HANGUL SYLLABLE SEUG..HANGUL SYLLABLE SEUH C2C1..C2DB ; LVT # Lo [27] HANGUL SYLLABLE SYIG..HANGUL SYLLABLE SYIH C2DD..C2F7 ; LVT # Lo [27] HANGUL SYLLABLE SIG..HANGUL SYLLABLE SIH C2F9..C313 ; LVT # Lo [27] HANGUL SYLLABLE SSAG..HANGUL SYLLABLE SSAH C315..C32F ; LVT # Lo [27] HANGUL SYLLABLE SSAEG..HANGUL SYLLABLE SSAEH C331..C34B ; LVT # Lo [27] HANGUL SYLLABLE SSYAG..HANGUL SYLLABLE SSYAH C34D..C367 ; LVT # Lo [27] HANGUL SYLLABLE SSYAEG..HANGUL SYLLABLE SSYAEH C369..C383 ; LVT # Lo [27] HANGUL SYLLABLE SSEOG..HANGUL SYLLABLE SSEOH C385..C39F ; LVT # Lo [27] HANGUL SYLLABLE SSEG..HANGUL SYLLABLE SSEH C3A1..C3BB ; LVT # Lo [27] HANGUL SYLLABLE SSYEOG..HANGUL SYLLABLE SSYEOH C3BD..C3D7 ; LVT # Lo [27] HANGUL SYLLABLE SSYEG..HANGUL SYLLABLE SSYEH C3D9..C3F3 ; LVT # Lo [27] HANGUL SYLLABLE SSOG..HANGUL SYLLABLE SSOH C3F5..C40F ; LVT # Lo [27] HANGUL SYLLABLE SSWAG..HANGUL SYLLABLE SSWAH C411..C42B ; LVT # Lo [27] HANGUL SYLLABLE SSWAEG..HANGUL SYLLABLE SSWAEH C42D..C447 ; LVT # Lo [27] HANGUL SYLLABLE SSOEG..HANGUL SYLLABLE SSOEH C449..C463 ; LVT # Lo [27] HANGUL SYLLABLE SSYOG..HANGUL SYLLABLE SSYOH C465..C47F ; LVT # Lo [27] HANGUL SYLLABLE SSUG..HANGUL SYLLABLE SSUH C481..C49B ; LVT # Lo [27] HANGUL SYLLABLE SSWEOG..HANGUL SYLLABLE SSWEOH C49D..C4B7 ; LVT # Lo [27] HANGUL SYLLABLE SSWEG..HANGUL SYLLABLE SSWEH C4B9..C4D3 ; LVT # Lo [27] HANGUL SYLLABLE SSWIG..HANGUL SYLLABLE SSWIH C4D5..C4EF ; LVT # Lo [27] HANGUL SYLLABLE SSYUG..HANGUL SYLLABLE SSYUH C4F1..C50B ; LVT # Lo [27] HANGUL SYLLABLE SSEUG..HANGUL SYLLABLE SSEUH C50D..C527 ; LVT # Lo [27] HANGUL SYLLABLE SSYIG..HANGUL SYLLABLE SSYIH C529..C543 ; LVT # Lo [27] HANGUL SYLLABLE SSIG..HANGUL SYLLABLE SSIH C545..C55F ; LVT # Lo [27] HANGUL SYLLABLE 
AG..HANGUL SYLLABLE AH C561..C57B ; LVT # Lo [27] HANGUL SYLLABLE AEG..HANGUL SYLLABLE AEH C57D..C597 ; LVT # Lo [27] HANGUL SYLLABLE YAG..HANGUL SYLLABLE YAH C599..C5B3 ; LVT # Lo [27] HANGUL SYLLABLE YAEG..HANGUL SYLLABLE YAEH C5B5..C5CF ; LVT # Lo [27] HANGUL SYLLABLE EOG..HANGUL SYLLABLE EOH C5D1..C5EB ; LVT # Lo [27] HANGUL SYLLABLE EG..HANGUL SYLLABLE EH C5ED..C607 ; LVT # Lo [27] HANGUL SYLLABLE YEOG..HANGUL SYLLABLE YEOH C609..C623 ; LVT # Lo [27] HANGUL SYLLABLE YEG..HANGUL SYLLABLE YEH C625..C63F ; LVT # Lo [27] HANGUL SYLLABLE OG..HANGUL SYLLABLE OH C641..C65B ; LVT # Lo [27] HANGUL SYLLABLE WAG..HANGUL SYLLABLE WAH C65D..C677 ; LVT # Lo [27] HANGUL SYLLABLE WAEG..HANGUL SYLLABLE WAEH C679..C693 ; LVT # Lo [27] HANGUL SYLLABLE OEG..HANGUL SYLLABLE OEH C695..C6AF ; LVT # Lo [27] HANGUL SYLLABLE YOG..HANGUL SYLLABLE YOH C6B1..C6CB ; LVT # Lo [27] HANGUL SYLLABLE UG..HANGUL SYLLABLE UH C6CD..C6E7 ; LVT # Lo [27] HANGUL SYLLABLE WEOG..HANGUL SYLLABLE WEOH C6E9..C703 ; LVT # Lo [27] HANGUL SYLLABLE WEG..HANGUL SYLLABLE WEH C705..C71F ; LVT # Lo [27] HANGUL SYLLABLE WIG..HANGUL SYLLABLE WIH C721..C73B ; LVT # Lo [27] HANGUL SYLLABLE YUG..HANGUL SYLLABLE YUH C73D..C757 ; LVT # Lo [27] HANGUL SYLLABLE EUG..HANGUL SYLLABLE EUH C759..C773 ; LVT # Lo [27] HANGUL SYLLABLE YIG..HANGUL SYLLABLE YIH C775..C78F ; LVT # Lo [27] HANGUL SYLLABLE IG..HANGUL SYLLABLE IH C791..C7AB ; LVT # Lo [27] HANGUL SYLLABLE JAG..HANGUL SYLLABLE JAH C7AD..C7C7 ; LVT # Lo [27] HANGUL SYLLABLE JAEG..HANGUL SYLLABLE JAEH C7C9..C7E3 ; LVT # Lo [27] HANGUL SYLLABLE JYAG..HANGUL SYLLABLE JYAH C7E5..C7FF ; LVT # Lo [27] HANGUL SYLLABLE JYAEG..HANGUL SYLLABLE JYAEH C801..C81B ; LVT # Lo [27] HANGUL SYLLABLE JEOG..HANGUL SYLLABLE JEOH C81D..C837 ; LVT # Lo [27] HANGUL SYLLABLE JEG..HANGUL SYLLABLE JEH C839..C853 ; LVT # Lo [27] HANGUL SYLLABLE JYEOG..HANGUL SYLLABLE JYEOH C855..C86F ; LVT # Lo [27] HANGUL SYLLABLE JYEG..HANGUL SYLLABLE JYEH C871..C88B ; LVT # Lo [27] HANGUL SYLLABLE JOG..HANGUL 
SYLLABLE JOH C88D..C8A7 ; LVT # Lo [27] HANGUL SYLLABLE JWAG..HANGUL SYLLABLE JWAH C8A9..C8C3 ; LVT # Lo [27] HANGUL SYLLABLE JWAEG..HANGUL SYLLABLE JWAEH C8C5..C8DF ; LVT # Lo [27] HANGUL SYLLABLE JOEG..HANGUL SYLLABLE JOEH C8E1..C8FB ; LVT # Lo [27] HANGUL SYLLABLE JYOG..HANGUL SYLLABLE JYOH C8FD..C917 ; LVT # Lo [27] HANGUL SYLLABLE JUG..HANGUL SYLLABLE JUH C919..C933 ; LVT # Lo [27] HANGUL SYLLABLE JWEOG..HANGUL SYLLABLE JWEOH C935..C94F ; LVT # Lo [27] HANGUL SYLLABLE JWEG..HANGUL SYLLABLE JWEH C951..C96B ; LVT # Lo [27] HANGUL SYLLABLE JWIG..HANGUL SYLLABLE JWIH C96D..C987 ; LVT # Lo [27] HANGUL SYLLABLE JYUG..HANGUL SYLLABLE JYUH C989..C9A3 ; LVT # Lo [27] HANGUL SYLLABLE JEUG..HANGUL SYLLABLE JEUH C9A5..C9BF ; LVT # Lo [27] HANGUL SYLLABLE JYIG..HANGUL SYLLABLE JYIH C9C1..C9DB ; LVT # Lo [27] HANGUL SYLLABLE JIG..HANGUL SYLLABLE JIH C9DD..C9F7 ; LVT # Lo [27] HANGUL SYLLABLE JJAG..HANGUL SYLLABLE JJAH C9F9..CA13 ; LVT # Lo [27] HANGUL SYLLABLE JJAEG..HANGUL SYLLABLE JJAEH CA15..CA2F ; LVT # Lo [27] HANGUL SYLLABLE JJYAG..HANGUL SYLLABLE JJYAH CA31..CA4B ; LVT # Lo [27] HANGUL SYLLABLE JJYAEG..HANGUL SYLLABLE JJYAEH CA4D..CA67 ; LVT # Lo [27] HANGUL SYLLABLE JJEOG..HANGUL SYLLABLE JJEOH CA69..CA83 ; LVT # Lo [27] HANGUL SYLLABLE JJEG..HANGUL SYLLABLE JJEH CA85..CA9F ; LVT # Lo [27] HANGUL SYLLABLE JJYEOG..HANGUL SYLLABLE JJYEOH CAA1..CABB ; LVT # Lo [27] HANGUL SYLLABLE JJYEG..HANGUL SYLLABLE JJYEH CABD..CAD7 ; LVT # Lo [27] HANGUL SYLLABLE JJOG..HANGUL SYLLABLE JJOH CAD9..CAF3 ; LVT # Lo [27] HANGUL SYLLABLE JJWAG..HANGUL SYLLABLE JJWAH CAF5..CB0F ; LVT # Lo [27] HANGUL SYLLABLE JJWAEG..HANGUL SYLLABLE JJWAEH CB11..CB2B ; LVT # Lo [27] HANGUL SYLLABLE JJOEG..HANGUL SYLLABLE JJOEH CB2D..CB47 ; LVT # Lo [27] HANGUL SYLLABLE JJYOG..HANGUL SYLLABLE JJYOH CB49..CB63 ; LVT # Lo [27] HANGUL SYLLABLE JJUG..HANGUL SYLLABLE JJUH CB65..CB7F ; LVT # Lo [27] HANGUL SYLLABLE JJWEOG..HANGUL SYLLABLE JJWEOH CB81..CB9B ; LVT # Lo [27] HANGUL SYLLABLE JJWEG..HANGUL SYLLABLE 
JJWEH CB9D..CBB7 ; LVT # Lo [27] HANGUL SYLLABLE JJWIG..HANGUL SYLLABLE JJWIH CBB9..CBD3 ; LVT # Lo [27] HANGUL SYLLABLE JJYUG..HANGUL SYLLABLE JJYUH CBD5..CBEF ; LVT # Lo [27] HANGUL SYLLABLE JJEUG..HANGUL SYLLABLE JJEUH CBF1..CC0B ; LVT # Lo [27] HANGUL SYLLABLE JJYIG..HANGUL SYLLABLE JJYIH CC0D..CC27 ; LVT # Lo [27] HANGUL SYLLABLE JJIG..HANGUL SYLLABLE JJIH CC29..CC43 ; LVT # Lo [27] HANGUL SYLLABLE CAG..HANGUL SYLLABLE CAH CC45..CC5F ; LVT # Lo [27] HANGUL SYLLABLE CAEG..HANGUL SYLLABLE CAEH CC61..CC7B ; LVT # Lo [27] HANGUL SYLLABLE CYAG..HANGUL SYLLABLE CYAH CC7D..CC97 ; LVT # Lo [27] HANGUL SYLLABLE CYAEG..HANGUL SYLLABLE CYAEH CC99..CCB3 ; LVT # Lo [27] HANGUL SYLLABLE CEOG..HANGUL SYLLABLE CEOH CCB5..CCCF ; LVT # Lo [27] HANGUL SYLLABLE CEG..HANGUL SYLLABLE CEH CCD1..CCEB ; LVT # Lo [27] HANGUL SYLLABLE CYEOG..HANGUL SYLLABLE CYEOH CCED..CD07 ; LVT # Lo [27] HANGUL SYLLABLE CYEG..HANGUL SYLLABLE CYEH CD09..CD23 ; LVT # Lo [27] HANGUL SYLLABLE COG..HANGUL SYLLABLE COH CD25..CD3F ; LVT # Lo [27] HANGUL SYLLABLE CWAG..HANGUL SYLLABLE CWAH CD41..CD5B ; LVT # Lo [27] HANGUL SYLLABLE CWAEG..HANGUL SYLLABLE CWAEH CD5D..CD77 ; LVT # Lo [27] HANGUL SYLLABLE COEG..HANGUL SYLLABLE COEH CD79..CD93 ; LVT # Lo [27] HANGUL SYLLABLE CYOG..HANGUL SYLLABLE CYOH CD95..CDAF ; LVT # Lo [27] HANGUL SYLLABLE CUG..HANGUL SYLLABLE CUH CDB1..CDCB ; LVT # Lo [27] HANGUL SYLLABLE CWEOG..HANGUL SYLLABLE CWEOH CDCD..CDE7 ; LVT # Lo [27] HANGUL SYLLABLE CWEG..HANGUL SYLLABLE CWEH CDE9..CE03 ; LVT # Lo [27] HANGUL SYLLABLE CWIG..HANGUL SYLLABLE CWIH CE05..CE1F ; LVT # Lo [27] HANGUL SYLLABLE CYUG..HANGUL SYLLABLE CYUH CE21..CE3B ; LVT # Lo [27] HANGUL SYLLABLE CEUG..HANGUL SYLLABLE CEUH CE3D..CE57 ; LVT # Lo [27] HANGUL SYLLABLE CYIG..HANGUL SYLLABLE CYIH CE59..CE73 ; LVT # Lo [27] HANGUL SYLLABLE CIG..HANGUL SYLLABLE CIH CE75..CE8F ; LVT # Lo [27] HANGUL SYLLABLE KAG..HANGUL SYLLABLE KAH CE91..CEAB ; LVT # Lo [27] HANGUL SYLLABLE KAEG..HANGUL SYLLABLE KAEH CEAD..CEC7 ; LVT # Lo [27] 
HANGUL SYLLABLE KYAG..HANGUL SYLLABLE KYAH CEC9..CEE3 ; LVT # Lo [27] HANGUL SYLLABLE KYAEG..HANGUL SYLLABLE KYAEH CEE5..CEFF ; LVT # Lo [27] HANGUL SYLLABLE KEOG..HANGUL SYLLABLE KEOH CF01..CF1B ; LVT # Lo [27] HANGUL SYLLABLE KEG..HANGUL SYLLABLE KEH CF1D..CF37 ; LVT # Lo [27] HANGUL SYLLABLE KYEOG..HANGUL SYLLABLE KYEOH CF39..CF53 ; LVT # Lo [27] HANGUL SYLLABLE KYEG..HANGUL SYLLABLE KYEH CF55..CF6F ; LVT # Lo [27] HANGUL SYLLABLE KOG..HANGUL SYLLABLE KOH CF71..CF8B ; LVT # Lo [27] HANGUL SYLLABLE KWAG..HANGUL SYLLABLE KWAH CF8D..CFA7 ; LVT # Lo [27] HANGUL SYLLABLE KWAEG..HANGUL SYLLABLE KWAEH CFA9..CFC3 ; LVT # Lo [27] HANGUL SYLLABLE KOEG..HANGUL SYLLABLE KOEH CFC5..CFDF ; LVT # Lo [27] HANGUL SYLLABLE KYOG..HANGUL SYLLABLE KYOH CFE1..CFFB ; LVT # Lo [27] HANGUL SYLLABLE KUG..HANGUL SYLLABLE KUH CFFD..D017 ; LVT # Lo [27] HANGUL SYLLABLE KWEOG..HANGUL SYLLABLE KWEOH D019..D033 ; LVT # Lo [27] HANGUL SYLLABLE KWEG..HANGUL SYLLABLE KWEH D035..D04F ; LVT # Lo [27] HANGUL SYLLABLE KWIG..HANGUL SYLLABLE KWIH D051..D06B ; LVT # Lo [27] HANGUL SYLLABLE KYUG..HANGUL SYLLABLE KYUH D06D..D087 ; LVT # Lo [27] HANGUL SYLLABLE KEUG..HANGUL SYLLABLE KEUH D089..D0A3 ; LVT # Lo [27] HANGUL SYLLABLE KYIG..HANGUL SYLLABLE KYIH D0A5..D0BF ; LVT # Lo [27] HANGUL SYLLABLE KIG..HANGUL SYLLABLE KIH D0C1..D0DB ; LVT # Lo [27] HANGUL SYLLABLE TAG..HANGUL SYLLABLE TAH D0DD..D0F7 ; LVT # Lo [27] HANGUL SYLLABLE TAEG..HANGUL SYLLABLE TAEH D0F9..D113 ; LVT # Lo [27] HANGUL SYLLABLE TYAG..HANGUL SYLLABLE TYAH D115..D12F ; LVT # Lo [27] HANGUL SYLLABLE TYAEG..HANGUL SYLLABLE TYAEH D131..D14B ; LVT # Lo [27] HANGUL SYLLABLE TEOG..HANGUL SYLLABLE TEOH D14D..D167 ; LVT # Lo [27] HANGUL SYLLABLE TEG..HANGUL SYLLABLE TEH D169..D183 ; LVT # Lo [27] HANGUL SYLLABLE TYEOG..HANGUL SYLLABLE TYEOH D185..D19F ; LVT # Lo [27] HANGUL SYLLABLE TYEG..HANGUL SYLLABLE TYEH D1A1..D1BB ; LVT # Lo [27] HANGUL SYLLABLE TOG..HANGUL SYLLABLE TOH D1BD..D1D7 ; LVT # Lo [27] HANGUL SYLLABLE TWAG..HANGUL SYLLABLE 
TWAH D1D9..D1F3 ; LVT # Lo [27] HANGUL SYLLABLE TWAEG..HANGUL SYLLABLE TWAEH D1F5..D20F ; LVT # Lo [27] HANGUL SYLLABLE TOEG..HANGUL SYLLABLE TOEH D211..D22B ; LVT # Lo [27] HANGUL SYLLABLE TYOG..HANGUL SYLLABLE TYOH D22D..D247 ; LVT # Lo [27] HANGUL SYLLABLE TUG..HANGUL SYLLABLE TUH D249..D263 ; LVT # Lo [27] HANGUL SYLLABLE TWEOG..HANGUL SYLLABLE TWEOH D265..D27F ; LVT # Lo [27] HANGUL SYLLABLE TWEG..HANGUL SYLLABLE TWEH D281..D29B ; LVT # Lo [27] HANGUL SYLLABLE TWIG..HANGUL SYLLABLE TWIH D29D..D2B7 ; LVT # Lo [27] HANGUL SYLLABLE TYUG..HANGUL SYLLABLE TYUH D2B9..D2D3 ; LVT # Lo [27] HANGUL SYLLABLE TEUG..HANGUL SYLLABLE TEUH D2D5..D2EF ; LVT # Lo [27] HANGUL SYLLABLE TYIG..HANGUL SYLLABLE TYIH D2F1..D30B ; LVT # Lo [27] HANGUL SYLLABLE TIG..HANGUL SYLLABLE TIH D30D..D327 ; LVT # Lo [27] HANGUL SYLLABLE PAG..HANGUL SYLLABLE PAH D329..D343 ; LVT # Lo [27] HANGUL SYLLABLE PAEG..HANGUL SYLLABLE PAEH D345..D35F ; LVT # Lo [27] HANGUL SYLLABLE PYAG..HANGUL SYLLABLE PYAH D361..D37B ; LVT # Lo [27] HANGUL SYLLABLE PYAEG..HANGUL SYLLABLE PYAEH D37D..D397 ; LVT # Lo [27] HANGUL SYLLABLE PEOG..HANGUL SYLLABLE PEOH D399..D3B3 ; LVT # Lo [27] HANGUL SYLLABLE PEG..HANGUL SYLLABLE PEH D3B5..D3CF ; LVT # Lo [27] HANGUL SYLLABLE PYEOG..HANGUL SYLLABLE PYEOH D3D1..D3EB ; LVT # Lo [27] HANGUL SYLLABLE PYEG..HANGUL SYLLABLE PYEH D3ED..D407 ; LVT # Lo [27] HANGUL SYLLABLE POG..HANGUL SYLLABLE POH D409..D423 ; LVT # Lo [27] HANGUL SYLLABLE PWAG..HANGUL SYLLABLE PWAH D425..D43F ; LVT # Lo [27] HANGUL SYLLABLE PWAEG..HANGUL SYLLABLE PWAEH D441..D45B ; LVT # Lo [27] HANGUL SYLLABLE POEG..HANGUL SYLLABLE POEH D45D..D477 ; LVT # Lo [27] HANGUL SYLLABLE PYOG..HANGUL SYLLABLE PYOH D479..D493 ; LVT # Lo [27] HANGUL SYLLABLE PUG..HANGUL SYLLABLE PUH D495..D4AF ; LVT # Lo [27] HANGUL SYLLABLE PWEOG..HANGUL SYLLABLE PWEOH D4B1..D4CB ; LVT # Lo [27] HANGUL SYLLABLE PWEG..HANGUL SYLLABLE PWEH D4CD..D4E7 ; LVT # Lo [27] HANGUL SYLLABLE PWIG..HANGUL SYLLABLE PWIH D4E9..D503 ; LVT # Lo [27] HANGUL 
SYLLABLE PYUG..HANGUL SYLLABLE PYUH D505..D51F ; LVT # Lo [27] HANGUL SYLLABLE PEUG..HANGUL SYLLABLE PEUH D521..D53B ; LVT # Lo [27] HANGUL SYLLABLE PYIG..HANGUL SYLLABLE PYIH D53D..D557 ; LVT # Lo [27] HANGUL SYLLABLE PIG..HANGUL SYLLABLE PIH D559..D573 ; LVT # Lo [27] HANGUL SYLLABLE HAG..HANGUL SYLLABLE HAH D575..D58F ; LVT # Lo [27] HANGUL SYLLABLE HAEG..HANGUL SYLLABLE HAEH D591..D5AB ; LVT # Lo [27] HANGUL SYLLABLE HYAG..HANGUL SYLLABLE HYAH D5AD..D5C7 ; LVT # Lo [27] HANGUL SYLLABLE HYAEG..HANGUL SYLLABLE HYAEH D5C9..D5E3 ; LVT # Lo [27] HANGUL SYLLABLE HEOG..HANGUL SYLLABLE HEOH D5E5..D5FF ; LVT # Lo [27] HANGUL SYLLABLE HEG..HANGUL SYLLABLE HEH D601..D61B ; LVT # Lo [27] HANGUL SYLLABLE HYEOG..HANGUL SYLLABLE HYEOH D61D..D637 ; LVT # Lo [27] HANGUL SYLLABLE HYEG..HANGUL SYLLABLE HYEH D639..D653 ; LVT # Lo [27] HANGUL SYLLABLE HOG..HANGUL SYLLABLE HOH D655..D66F ; LVT # Lo [27] HANGUL SYLLABLE HWAG..HANGUL SYLLABLE HWAH D671..D68B ; LVT # Lo [27] HANGUL SYLLABLE HWAEG..HANGUL SYLLABLE HWAEH D68D..D6A7 ; LVT # Lo [27] HANGUL SYLLABLE HOEG..HANGUL SYLLABLE HOEH D6A9..D6C3 ; LVT # Lo [27] HANGUL SYLLABLE HYOG..HANGUL SYLLABLE HYOH D6C5..D6DF ; LVT # Lo [27] HANGUL SYLLABLE HUG..HANGUL SYLLABLE HUH D6E1..D6FB ; LVT # Lo [27] HANGUL SYLLABLE HWEOG..HANGUL SYLLABLE HWEOH D6FD..D717 ; LVT # Lo [27] HANGUL SYLLABLE HWEG..HANGUL SYLLABLE HWEH D719..D733 ; LVT # Lo [27] HANGUL SYLLABLE HWIG..HANGUL SYLLABLE HWIH D735..D74F ; LVT # Lo [27] HANGUL SYLLABLE HYUG..HANGUL SYLLABLE HYUH D751..D76B ; LVT # Lo [27] HANGUL SYLLABLE HEUG..HANGUL SYLLABLE HEUH D76D..D787 ; LVT # Lo [27] HANGUL SYLLABLE HYIG..HANGUL SYLLABLE HYIH D789..D7A3 ; LVT # Lo [27] HANGUL SYLLABLE HIG..HANGUL SYLLABLE HIH # Total code points: 10773 # ================================================ 200D ; ZWJ # Cf ZERO WIDTH JOINER # Total code points: 1 # EOF ================================================ FILE: maint/Unicode.tables/PropList.txt ================================================ # 
PropList-17.0.0.txt # Date: 2025-06-30, 06:19:01 GMT # © 2025 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use and license, see https://www.unicode.org/terms_of_use.html # # Unicode Character Database # For documentation, see https://www.unicode.org/reports/tr44/ # ================================================ 0009..000D ; White_Space # Cc [5] .. 0020 ; White_Space # Zs SPACE 0085 ; White_Space # Cc 00A0 ; White_Space # Zs NO-BREAK SPACE 1680 ; White_Space # Zs OGHAM SPACE MARK 2000..200A ; White_Space # Zs [11] EN QUAD..HAIR SPACE 2028 ; White_Space # Zl LINE SEPARATOR 2029 ; White_Space # Zp PARAGRAPH SEPARATOR 202F ; White_Space # Zs NARROW NO-BREAK SPACE 205F ; White_Space # Zs MEDIUM MATHEMATICAL SPACE 3000 ; White_Space # Zs IDEOGRAPHIC SPACE # Total code points: 25 # ================================================ 061C ; Bidi_Control # Cf ARABIC LETTER MARK 200E..200F ; Bidi_Control # Cf [2] LEFT-TO-RIGHT MARK..RIGHT-TO-LEFT MARK 202A..202E ; Bidi_Control # Cf [5] LEFT-TO-RIGHT EMBEDDING..RIGHT-TO-LEFT OVERRIDE 2066..2069 ; Bidi_Control # Cf [4] LEFT-TO-RIGHT ISOLATE..POP DIRECTIONAL ISOLATE # Total code points: 12 # ================================================ 200C..200D ; Join_Control # Cf [2] ZERO WIDTH NON-JOINER..ZERO WIDTH JOINER # Total code points: 2 # ================================================ 002D ; Dash # Pd HYPHEN-MINUS 058A ; Dash # Pd ARMENIAN HYPHEN 05BE ; Dash # Pd HEBREW PUNCTUATION MAQAF 1400 ; Dash # Pd CANADIAN SYLLABICS HYPHEN 1806 ; Dash # Pd MONGOLIAN TODO SOFT HYPHEN 2010..2015 ; Dash # Pd [6] HYPHEN..HORIZONTAL BAR 2053 ; Dash # Po SWUNG DASH 207B ; Dash # Sm SUPERSCRIPT MINUS 208B ; Dash # Sm SUBSCRIPT MINUS 2212 ; Dash # Sm MINUS SIGN 2E17 ; Dash # Pd DOUBLE OBLIQUE HYPHEN 2E1A ; Dash # Pd HYPHEN WITH DIAERESIS 2E3A..2E3B ; Dash # Pd [2] TWO-EM DASH..THREE-EM DASH 2E40 ; Dash # Pd DOUBLE HYPHEN 2E5D ; Dash # Pd OBLIQUE HYPHEN 
301C ; Dash # Pd WAVE DASH 3030 ; Dash # Pd WAVY DASH 30A0 ; Dash # Pd KATAKANA-HIRAGANA DOUBLE HYPHEN FE31..FE32 ; Dash # Pd [2] PRESENTATION FORM FOR VERTICAL EM DASH..PRESENTATION FORM FOR VERTICAL EN DASH FE58 ; Dash # Pd SMALL EM DASH FE63 ; Dash # Pd SMALL HYPHEN-MINUS FF0D ; Dash # Pd FULLWIDTH HYPHEN-MINUS 10D6E ; Dash # Pd GARAY HYPHEN 10EAD ; Dash # Pd YEZIDI HYPHENATION MARK # Total code points: 31 # ================================================ 002D ; Hyphen # Pd HYPHEN-MINUS 00AD ; Hyphen # Cf SOFT HYPHEN 058A ; Hyphen # Pd ARMENIAN HYPHEN 1806 ; Hyphen # Pd MONGOLIAN TODO SOFT HYPHEN 2010..2011 ; Hyphen # Pd [2] HYPHEN..NON-BREAKING HYPHEN 2E17 ; Hyphen # Pd DOUBLE OBLIQUE HYPHEN 30FB ; Hyphen # Po KATAKANA MIDDLE DOT FE63 ; Hyphen # Pd SMALL HYPHEN-MINUS FF0D ; Hyphen # Pd FULLWIDTH HYPHEN-MINUS FF65 ; Hyphen # Po HALFWIDTH KATAKANA MIDDLE DOT # Total code points: 11 # ================================================ 0022 ; Quotation_Mark # Po QUOTATION MARK 0027 ; Quotation_Mark # Po APOSTROPHE 00AB ; Quotation_Mark # Pi LEFT-POINTING DOUBLE ANGLE QUOTATION MARK 00BB ; Quotation_Mark # Pf RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK 2018 ; Quotation_Mark # Pi LEFT SINGLE QUOTATION MARK 2019 ; Quotation_Mark # Pf RIGHT SINGLE QUOTATION MARK 201A ; Quotation_Mark # Ps SINGLE LOW-9 QUOTATION MARK 201B..201C ; Quotation_Mark # Pi [2] SINGLE HIGH-REVERSED-9 QUOTATION MARK..LEFT DOUBLE QUOTATION MARK 201D ; Quotation_Mark # Pf RIGHT DOUBLE QUOTATION MARK 201E ; Quotation_Mark # Ps DOUBLE LOW-9 QUOTATION MARK 201F ; Quotation_Mark # Pi DOUBLE HIGH-REVERSED-9 QUOTATION MARK 2039 ; Quotation_Mark # Pi SINGLE LEFT-POINTING ANGLE QUOTATION MARK 203A ; Quotation_Mark # Pf SINGLE RIGHT-POINTING ANGLE QUOTATION MARK 2E42 ; Quotation_Mark # Ps DOUBLE LOW-REVERSED-9 QUOTATION MARK 300C ; Quotation_Mark # Ps LEFT CORNER BRACKET 300D ; Quotation_Mark # Pe RIGHT CORNER BRACKET 300E ; Quotation_Mark # Ps LEFT WHITE CORNER BRACKET 300F ; Quotation_Mark # Pe RIGHT WHITE 
CORNER BRACKET 301D ; Quotation_Mark # Ps REVERSED DOUBLE PRIME QUOTATION MARK 301E..301F ; Quotation_Mark # Pe [2] DOUBLE PRIME QUOTATION MARK..LOW DOUBLE PRIME QUOTATION MARK FE41 ; Quotation_Mark # Ps PRESENTATION FORM FOR VERTICAL LEFT CORNER BRACKET FE42 ; Quotation_Mark # Pe PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET FE43 ; Quotation_Mark # Ps PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET FE44 ; Quotation_Mark # Pe PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET FF02 ; Quotation_Mark # Po FULLWIDTH QUOTATION MARK FF07 ; Quotation_Mark # Po FULLWIDTH APOSTROPHE FF62 ; Quotation_Mark # Ps HALFWIDTH LEFT CORNER BRACKET FF63 ; Quotation_Mark # Pe HALFWIDTH RIGHT CORNER BRACKET # Total code points: 30 # ================================================ 0021 ; Terminal_Punctuation # Po EXCLAMATION MARK 002C ; Terminal_Punctuation # Po COMMA 002E ; Terminal_Punctuation # Po FULL STOP 003A..003B ; Terminal_Punctuation # Po [2] COLON..SEMICOLON 003F ; Terminal_Punctuation # Po QUESTION MARK 037E ; Terminal_Punctuation # Po GREEK QUESTION MARK 0387 ; Terminal_Punctuation # Po GREEK ANO TELEIA 0589 ; Terminal_Punctuation # Po ARMENIAN FULL STOP 05C3 ; Terminal_Punctuation # Po HEBREW PUNCTUATION SOF PASUQ 060C ; Terminal_Punctuation # Po ARABIC COMMA 061B ; Terminal_Punctuation # Po ARABIC SEMICOLON 061D..061F ; Terminal_Punctuation # Po [3] ARABIC END OF TEXT MARK..ARABIC QUESTION MARK 06D4 ; Terminal_Punctuation # Po ARABIC FULL STOP 0700..070A ; Terminal_Punctuation # Po [11] SYRIAC END OF PARAGRAPH..SYRIAC CONTRACTION 070C ; Terminal_Punctuation # Po SYRIAC HARKLEAN METOBELUS 07F8..07F9 ; Terminal_Punctuation # Po [2] NKO COMMA..NKO EXCLAMATION MARK 0830..0835 ; Terminal_Punctuation # Po [6] SAMARITAN PUNCTUATION NEQUDAA..SAMARITAN PUNCTUATION SHIYYAALAA 0837..083E ; Terminal_Punctuation # Po [8] SAMARITAN PUNCTUATION MELODIC QITSA..SAMARITAN PUNCTUATION ANNAAU 085E ; Terminal_Punctuation # Po MANDAIC PUNCTUATION 0964..0965 ; 
Terminal_Punctuation # Po [2] DEVANAGARI DANDA..DEVANAGARI DOUBLE DANDA 0E5A..0E5B ; Terminal_Punctuation # Po [2] THAI CHARACTER ANGKHANKHU..THAI CHARACTER KHOMUT 0F08 ; Terminal_Punctuation # Po TIBETAN MARK SBRUL SHAD 0F0D..0F12 ; Terminal_Punctuation # Po [6] TIBETAN MARK SHAD..TIBETAN MARK RGYA GRAM SHAD 104A..104B ; Terminal_Punctuation # Po [2] MYANMAR SIGN LITTLE SECTION..MYANMAR SIGN SECTION 1361..1368 ; Terminal_Punctuation # Po [8] ETHIOPIC WORDSPACE..ETHIOPIC PARAGRAPH SEPARATOR 166E ; Terminal_Punctuation # Po CANADIAN SYLLABICS FULL STOP 16EB..16ED ; Terminal_Punctuation # Po [3] RUNIC SINGLE PUNCTUATION..RUNIC CROSS PUNCTUATION 1735..1736 ; Terminal_Punctuation # Po [2] PHILIPPINE SINGLE PUNCTUATION..PHILIPPINE DOUBLE PUNCTUATION 17D4..17D6 ; Terminal_Punctuation # Po [3] KHMER SIGN KHAN..KHMER SIGN CAMNUC PII KUUH 17DA ; Terminal_Punctuation # Po KHMER SIGN KOOMUUT 1802..1805 ; Terminal_Punctuation # Po [4] MONGOLIAN COMMA..MONGOLIAN FOUR DOTS 1808..1809 ; Terminal_Punctuation # Po [2] MONGOLIAN MANCHU COMMA..MONGOLIAN MANCHU FULL STOP 1944..1945 ; Terminal_Punctuation # Po [2] LIMBU EXCLAMATION MARK..LIMBU QUESTION MARK 1AA8..1AAB ; Terminal_Punctuation # Po [4] TAI THAM SIGN KAAN..TAI THAM SIGN SATKAANKUU 1B4E..1B4F ; Terminal_Punctuation # Po [2] BALINESE INVERTED CARIK SIKI..BALINESE INVERTED CARIK PAREREN 1B5A..1B5B ; Terminal_Punctuation # Po [2] BALINESE PANTI..BALINESE PAMADA 1B5D..1B5F ; Terminal_Punctuation # Po [3] BALINESE CARIK PAMUNGKAH..BALINESE CARIK PAREREN 1B7D..1B7F ; Terminal_Punctuation # Po [3] BALINESE PANTI LANTANG..BALINESE PANTI BAWAK 1C3B..1C3F ; Terminal_Punctuation # Po [5] LEPCHA PUNCTUATION TA-ROL..LEPCHA PUNCTUATION TSHOOK 1C7E..1C7F ; Terminal_Punctuation # Po [2] OL CHIKI PUNCTUATION MUCAAD..OL CHIKI PUNCTUATION DOUBLE MUCAAD 2024 ; Terminal_Punctuation # Po ONE DOT LEADER 203C..203D ; Terminal_Punctuation # Po [2] DOUBLE EXCLAMATION MARK..INTERROBANG 2047..2049 ; Terminal_Punctuation # Po [3] DOUBLE QUESTION 
MARK..EXCLAMATION QUESTION MARK 2CF9..2CFB ; Terminal_Punctuation # Po [3] COPTIC OLD NUBIAN FULL STOP..COPTIC OLD NUBIAN INDIRECT QUESTION MARK 2E2E ; Terminal_Punctuation # Po REVERSED QUESTION MARK 2E3C ; Terminal_Punctuation # Po STENOGRAPHIC FULL STOP 2E41 ; Terminal_Punctuation # Po REVERSED COMMA 2E4C ; Terminal_Punctuation # Po MEDIEVAL COMMA 2E4E..2E4F ; Terminal_Punctuation # Po [2] PUNCTUS ELEVATUS MARK..CORNISH VERSE DIVIDER 2E53..2E54 ; Terminal_Punctuation # Po [2] MEDIEVAL EXCLAMATION MARK..MEDIEVAL QUESTION MARK 3001..3002 ; Terminal_Punctuation # Po [2] IDEOGRAPHIC COMMA..IDEOGRAPHIC FULL STOP A4FE..A4FF ; Terminal_Punctuation # Po [2] LISU PUNCTUATION COMMA..LISU PUNCTUATION FULL STOP A60D..A60F ; Terminal_Punctuation # Po [3] VAI COMMA..VAI QUESTION MARK A6F3..A6F7 ; Terminal_Punctuation # Po [5] BAMUM FULL STOP..BAMUM QUESTION MARK A876..A877 ; Terminal_Punctuation # Po [2] PHAGS-PA MARK SHAD..PHAGS-PA MARK DOUBLE SHAD A8CE..A8CF ; Terminal_Punctuation # Po [2] SAURASHTRA DANDA..SAURASHTRA DOUBLE DANDA A92F ; Terminal_Punctuation # Po KAYAH LI SIGN SHYA A9C7..A9C9 ; Terminal_Punctuation # Po [3] JAVANESE PADA PANGKAT..JAVANESE PADA LUNGSI AA5D..AA5F ; Terminal_Punctuation # Po [3] CHAM PUNCTUATION DANDA..CHAM PUNCTUATION TRIPLE DANDA AADF ; Terminal_Punctuation # Po TAI VIET SYMBOL KOI KOI AAF0..AAF1 ; Terminal_Punctuation # Po [2] MEETEI MAYEK CHEIKHAN..MEETEI MAYEK AHANG KHUDAM ABEB ; Terminal_Punctuation # Po MEETEI MAYEK CHEIKHEI FE12 ; Terminal_Punctuation # Po PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC FULL STOP FE15..FE16 ; Terminal_Punctuation # Po [2] PRESENTATION FORM FOR VERTICAL EXCLAMATION MARK..PRESENTATION FORM FOR VERTICAL QUESTION MARK FE50..FE52 ; Terminal_Punctuation # Po [3] SMALL COMMA..SMALL FULL STOP FE54..FE57 ; Terminal_Punctuation # Po [4] SMALL SEMICOLON..SMALL EXCLAMATION MARK FF01 ; Terminal_Punctuation # Po FULLWIDTH EXCLAMATION MARK FF0C ; Terminal_Punctuation # Po FULLWIDTH COMMA FF0E ; Terminal_Punctuation # Po 
FULLWIDTH FULL STOP FF1A..FF1B ; Terminal_Punctuation # Po [2] FULLWIDTH COLON..FULLWIDTH SEMICOLON FF1F ; Terminal_Punctuation # Po FULLWIDTH QUESTION MARK FF61 ; Terminal_Punctuation # Po HALFWIDTH IDEOGRAPHIC FULL STOP FF64 ; Terminal_Punctuation # Po HALFWIDTH IDEOGRAPHIC COMMA 1039F ; Terminal_Punctuation # Po UGARITIC WORD DIVIDER 103D0 ; Terminal_Punctuation # Po OLD PERSIAN WORD DIVIDER 10857 ; Terminal_Punctuation # Po IMPERIAL ARAMAIC SECTION SIGN 1091F ; Terminal_Punctuation # Po PHOENICIAN WORD SEPARATOR 10A56..10A57 ; Terminal_Punctuation # Po [2] KHAROSHTHI PUNCTUATION DANDA..KHAROSHTHI PUNCTUATION DOUBLE DANDA 10AF0..10AF5 ; Terminal_Punctuation # Po [6] MANICHAEAN PUNCTUATION STAR..MANICHAEAN PUNCTUATION TWO DOTS 10B3A..10B3F ; Terminal_Punctuation # Po [6] TINY TWO DOTS OVER ONE DOT PUNCTUATION..LARGE ONE RING OVER TWO RINGS PUNCTUATION 10B99..10B9C ; Terminal_Punctuation # Po [4] PSALTER PAHLAVI SECTION MARK..PSALTER PAHLAVI FOUR DOTS WITH DOT 10F55..10F59 ; Terminal_Punctuation # Po [5] SOGDIAN PUNCTUATION TWO VERTICAL BARS..SOGDIAN PUNCTUATION HALF CIRCLE WITH DOT 10F86..10F89 ; Terminal_Punctuation # Po [4] OLD UYGHUR PUNCTUATION BAR..OLD UYGHUR PUNCTUATION FOUR DOTS 11047..1104D ; Terminal_Punctuation # Po [7] BRAHMI DANDA..BRAHMI PUNCTUATION LOTUS 110BE..110C1 ; Terminal_Punctuation # Po [4] KAITHI SECTION MARK..KAITHI DOUBLE DANDA 11141..11143 ; Terminal_Punctuation # Po [3] CHAKMA DANDA..CHAKMA QUESTION MARK 111C5..111C6 ; Terminal_Punctuation # Po [2] SHARADA DANDA..SHARADA DOUBLE DANDA 111CD ; Terminal_Punctuation # Po SHARADA SUTRA MARK 111DE..111DF ; Terminal_Punctuation # Po [2] SHARADA SECTION MARK-1..SHARADA SECTION MARK-2 11238..1123C ; Terminal_Punctuation # Po [5] KHOJKI DANDA..KHOJKI DOUBLE SECTION MARK 112A9 ; Terminal_Punctuation # Po MULTANI SECTION MARK 113D4..113D5 ; Terminal_Punctuation # Po [2] TULU-TIGALARI DANDA..TULU-TIGALARI DOUBLE DANDA 1144B..1144D ; Terminal_Punctuation # Po [3] NEWA DANDA..NEWA COMMA 1145A..1145B ; 
Terminal_Punctuation # Po [2] NEWA DOUBLE COMMA..NEWA PLACEHOLDER MARK 115C2..115C5 ; Terminal_Punctuation # Po [4] SIDDHAM DANDA..SIDDHAM SEPARATOR BAR 115C9..115D7 ; Terminal_Punctuation # Po [15] SIDDHAM END OF TEXT MARK..SIDDHAM SECTION MARK WITH CIRCLES AND FOUR ENCLOSURES 11641..11642 ; Terminal_Punctuation # Po [2] MODI DANDA..MODI DOUBLE DANDA 1173C..1173E ; Terminal_Punctuation # Po [3] AHOM SIGN SMALL SECTION..AHOM SIGN RULAI 11944 ; Terminal_Punctuation # Po DIVES AKURU DOUBLE DANDA 11946 ; Terminal_Punctuation # Po DIVES AKURU END OF TEXT MARK 11A42..11A43 ; Terminal_Punctuation # Po [2] ZANABAZAR SQUARE MARK SHAD..ZANABAZAR SQUARE MARK DOUBLE SHAD 11A9B..11A9C ; Terminal_Punctuation # Po [2] SOYOMBO MARK SHAD..SOYOMBO MARK DOUBLE SHAD 11AA1..11AA2 ; Terminal_Punctuation # Po [2] SOYOMBO TERMINAL MARK-1..SOYOMBO TERMINAL MARK-2 11C41..11C43 ; Terminal_Punctuation # Po [3] BHAIKSUKI DANDA..BHAIKSUKI WORD SEPARATOR 11C71 ; Terminal_Punctuation # Po MARCHEN MARK SHAD 11EF7..11EF8 ; Terminal_Punctuation # Po [2] MAKASAR PASSIMBANG..MAKASAR END OF SECTION 11F43..11F44 ; Terminal_Punctuation # Po [2] KAWI DANDA..KAWI DOUBLE DANDA 12470..12474 ; Terminal_Punctuation # Po [5] CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER..CUNEIFORM PUNCTUATION SIGN DIAGONAL QUADCOLON 16A6E..16A6F ; Terminal_Punctuation # Po [2] MRO DANDA..MRO DOUBLE DANDA 16AF5 ; Terminal_Punctuation # Po BASSA VAH FULL STOP 16B37..16B39 ; Terminal_Punctuation # Po [3] PAHAWH HMONG SIGN VOS THOM..PAHAWH HMONG SIGN CIM CHEEM 16B44 ; Terminal_Punctuation # Po PAHAWH HMONG SIGN XAUS 16D6E..16D6F ; Terminal_Punctuation # Po [2] KIRAT RAI DANDA..KIRAT RAI DOUBLE DANDA 16E97..16E98 ; Terminal_Punctuation # Po [2] MEDEFAIDRIN COMMA..MEDEFAIDRIN FULL STOP 1BC9F ; Terminal_Punctuation # Po DUPLOYAN PUNCTUATION CHINOOK FULL STOP 1DA87..1DA8A ; Terminal_Punctuation # Po [4] SIGNWRITING COMMA..SIGNWRITING COLON # Total code points: 291 # ================================================ 005E ; 
Other_Math # Sk CIRCUMFLEX ACCENT 03D0..03D2 ; Other_Math # L& [3] GREEK BETA SYMBOL..GREEK UPSILON WITH HOOK SYMBOL 03D5 ; Other_Math # L& GREEK PHI SYMBOL 03F0..03F1 ; Other_Math # L& [2] GREEK KAPPA SYMBOL..GREEK RHO SYMBOL 03F4..03F5 ; Other_Math # L& [2] GREEK CAPITAL THETA SYMBOL..GREEK LUNATE EPSILON SYMBOL 2016 ; Other_Math # Po DOUBLE VERTICAL LINE 2032..2034 ; Other_Math # Po [3] PRIME..TRIPLE PRIME 2040 ; Other_Math # Pc CHARACTER TIE 2061..2064 ; Other_Math # Cf [4] FUNCTION APPLICATION..INVISIBLE PLUS 207D ; Other_Math # Ps SUPERSCRIPT LEFT PARENTHESIS 207E ; Other_Math # Pe SUPERSCRIPT RIGHT PARENTHESIS 208D ; Other_Math # Ps SUBSCRIPT LEFT PARENTHESIS 208E ; Other_Math # Pe SUBSCRIPT RIGHT PARENTHESIS 20D0..20DC ; Other_Math # Mn [13] COMBINING LEFT HARPOON ABOVE..COMBINING FOUR DOTS ABOVE 20E1 ; Other_Math # Mn COMBINING LEFT RIGHT ARROW ABOVE 20E5..20E6 ; Other_Math # Mn [2] COMBINING REVERSE SOLIDUS OVERLAY..COMBINING DOUBLE VERTICAL STROKE OVERLAY 20EB..20EF ; Other_Math # Mn [5] COMBINING LONG DOUBLE SOLIDUS OVERLAY..COMBINING RIGHT ARROW BELOW 2102 ; Other_Math # L& DOUBLE-STRUCK CAPITAL C 2107 ; Other_Math # L& EULER CONSTANT 210A..2113 ; Other_Math # L& [10] SCRIPT SMALL G..SCRIPT SMALL L 2115 ; Other_Math # L& DOUBLE-STRUCK CAPITAL N 2119..211D ; Other_Math # L& [5] DOUBLE-STRUCK CAPITAL P..DOUBLE-STRUCK CAPITAL R 2124 ; Other_Math # L& DOUBLE-STRUCK CAPITAL Z 2128 ; Other_Math # L& BLACK-LETTER CAPITAL Z 2129 ; Other_Math # So TURNED GREEK SMALL LETTER IOTA 212C..212D ; Other_Math # L& [2] SCRIPT CAPITAL B..BLACK-LETTER CAPITAL C 212F..2131 ; Other_Math # L& [3] SCRIPT SMALL E..SCRIPT CAPITAL F 2133..2134 ; Other_Math # L& [2] SCRIPT CAPITAL M..SCRIPT SMALL O 2135..2138 ; Other_Math # Lo [4] ALEF SYMBOL..DALET SYMBOL 213C..213F ; Other_Math # L& [4] DOUBLE-STRUCK SMALL PI..DOUBLE-STRUCK CAPITAL PI 2145..2149 ; Other_Math # L& [5] DOUBLE-STRUCK ITALIC CAPITAL D..DOUBLE-STRUCK ITALIC SMALL J 2195..2199 ; Other_Math # So [5] UP DOWN 
ARROW..SOUTH WEST ARROW 219C..219F ; Other_Math # So [4] LEFTWARDS WAVE ARROW..UPWARDS TWO HEADED ARROW 21A1..21A2 ; Other_Math # So [2] DOWNWARDS TWO HEADED ARROW..LEFTWARDS ARROW WITH TAIL 21A4..21A5 ; Other_Math # So [2] LEFTWARDS ARROW FROM BAR..UPWARDS ARROW FROM BAR 21A7 ; Other_Math # So DOWNWARDS ARROW FROM BAR 21A9..21AD ; Other_Math # So [5] LEFTWARDS ARROW WITH HOOK..LEFT RIGHT WAVE ARROW 21B0..21B1 ; Other_Math # So [2] UPWARDS ARROW WITH TIP LEFTWARDS..UPWARDS ARROW WITH TIP RIGHTWARDS 21B6..21B7 ; Other_Math # So [2] ANTICLOCKWISE TOP SEMICIRCLE ARROW..CLOCKWISE TOP SEMICIRCLE ARROW 21BC..21CD ; Other_Math # So [18] LEFTWARDS HARPOON WITH BARB UPWARDS..LEFTWARDS DOUBLE ARROW WITH STROKE 21D0..21D1 ; Other_Math # So [2] LEFTWARDS DOUBLE ARROW..UPWARDS DOUBLE ARROW 21D3 ; Other_Math # So DOWNWARDS DOUBLE ARROW 21D5..21DB ; Other_Math # So [7] UP DOWN DOUBLE ARROW..RIGHTWARDS TRIPLE ARROW 21DD ; Other_Math # So RIGHTWARDS SQUIGGLE ARROW 21E4..21E5 ; Other_Math # So [2] LEFTWARDS ARROW TO BAR..RIGHTWARDS ARROW TO BAR 2308 ; Other_Math # Ps LEFT CEILING 2309 ; Other_Math # Pe RIGHT CEILING 230A ; Other_Math # Ps LEFT FLOOR 230B ; Other_Math # Pe RIGHT FLOOR 23B4..23B5 ; Other_Math # So [2] TOP SQUARE BRACKET..BOTTOM SQUARE BRACKET 23B7 ; Other_Math # So RADICAL SYMBOL BOTTOM 23D0 ; Other_Math # So VERTICAL LINE EXTENSION 23E2 ; Other_Math # So WHITE TRAPEZIUM 25A0..25A1 ; Other_Math # So [2] BLACK SQUARE..WHITE SQUARE 25AE..25B6 ; Other_Math # So [9] BLACK VERTICAL RECTANGLE..BLACK RIGHT-POINTING TRIANGLE 25BC..25C0 ; Other_Math # So [5] BLACK DOWN-POINTING TRIANGLE..BLACK LEFT-POINTING TRIANGLE 25C6..25C7 ; Other_Math # So [2] BLACK DIAMOND..WHITE DIAMOND 25CA..25CB ; Other_Math # So [2] LOZENGE..WHITE CIRCLE 25CF..25D3 ; Other_Math # So [5] BLACK CIRCLE..CIRCLE WITH UPPER HALF BLACK 25E2 ; Other_Math # So BLACK LOWER RIGHT TRIANGLE 25E4 ; Other_Math # So BLACK UPPER LEFT TRIANGLE 25E7..25EC ; Other_Math # So [6] SQUARE WITH LEFT HALF BLACK..WHITE 
UP-POINTING TRIANGLE WITH DOT 2605..2606 ; Other_Math # So [2] BLACK STAR..WHITE STAR 2640 ; Other_Math # So FEMALE SIGN 2642 ; Other_Math # So MALE SIGN 2660..2663 ; Other_Math # So [4] BLACK SPADE SUIT..BLACK CLUB SUIT 266D..266E ; Other_Math # So [2] MUSIC FLAT SIGN..MUSIC NATURAL SIGN 27C5 ; Other_Math # Ps LEFT S-SHAPED BAG DELIMITER 27C6 ; Other_Math # Pe RIGHT S-SHAPED BAG DELIMITER 27E6 ; Other_Math # Ps MATHEMATICAL LEFT WHITE SQUARE BRACKET 27E7 ; Other_Math # Pe MATHEMATICAL RIGHT WHITE SQUARE BRACKET 27E8 ; Other_Math # Ps MATHEMATICAL LEFT ANGLE BRACKET 27E9 ; Other_Math # Pe MATHEMATICAL RIGHT ANGLE BRACKET 27EA ; Other_Math # Ps MATHEMATICAL LEFT DOUBLE ANGLE BRACKET 27EB ; Other_Math # Pe MATHEMATICAL RIGHT DOUBLE ANGLE BRACKET 27EC ; Other_Math # Ps MATHEMATICAL LEFT WHITE TORTOISE SHELL BRACKET 27ED ; Other_Math # Pe MATHEMATICAL RIGHT WHITE TORTOISE SHELL BRACKET 27EE ; Other_Math # Ps MATHEMATICAL LEFT FLATTENED PARENTHESIS 27EF ; Other_Math # Pe MATHEMATICAL RIGHT FLATTENED PARENTHESIS 2983 ; Other_Math # Ps LEFT WHITE CURLY BRACKET 2984 ; Other_Math # Pe RIGHT WHITE CURLY BRACKET 2985 ; Other_Math # Ps LEFT WHITE PARENTHESIS 2986 ; Other_Math # Pe RIGHT WHITE PARENTHESIS 2987 ; Other_Math # Ps Z NOTATION LEFT IMAGE BRACKET 2988 ; Other_Math # Pe Z NOTATION RIGHT IMAGE BRACKET 2989 ; Other_Math # Ps Z NOTATION LEFT BINDING BRACKET 298A ; Other_Math # Pe Z NOTATION RIGHT BINDING BRACKET 298B ; Other_Math # Ps LEFT SQUARE BRACKET WITH UNDERBAR 298C ; Other_Math # Pe RIGHT SQUARE BRACKET WITH UNDERBAR 298D ; Other_Math # Ps LEFT SQUARE BRACKET WITH TICK IN TOP CORNER 298E ; Other_Math # Pe RIGHT SQUARE BRACKET WITH TICK IN BOTTOM CORNER 298F ; Other_Math # Ps LEFT SQUARE BRACKET WITH TICK IN BOTTOM CORNER 2990 ; Other_Math # Pe RIGHT SQUARE BRACKET WITH TICK IN TOP CORNER 2991 ; Other_Math # Ps LEFT ANGLE BRACKET WITH DOT 2992 ; Other_Math # Pe RIGHT ANGLE BRACKET WITH DOT 2993 ; Other_Math # Ps LEFT ARC LESS-THAN BRACKET 2994 ; Other_Math # Pe 
RIGHT ARC GREATER-THAN BRACKET 2995 ; Other_Math # Ps DOUBLE LEFT ARC GREATER-THAN BRACKET 2996 ; Other_Math # Pe DOUBLE RIGHT ARC LESS-THAN BRACKET 2997 ; Other_Math # Ps LEFT BLACK TORTOISE SHELL BRACKET 2998 ; Other_Math # Pe RIGHT BLACK TORTOISE SHELL BRACKET 29D8 ; Other_Math # Ps LEFT WIGGLY FENCE 29D9 ; Other_Math # Pe RIGHT WIGGLY FENCE 29DA ; Other_Math # Ps LEFT DOUBLE WIGGLY FENCE 29DB ; Other_Math # Pe RIGHT DOUBLE WIGGLY FENCE 29FC ; Other_Math # Ps LEFT-POINTING CURVED ANGLE BRACKET 29FD ; Other_Math # Pe RIGHT-POINTING CURVED ANGLE BRACKET FE61 ; Other_Math # Po SMALL ASTERISK FE63 ; Other_Math # Pd SMALL HYPHEN-MINUS FE68 ; Other_Math # Po SMALL REVERSE SOLIDUS FF3C ; Other_Math # Po FULLWIDTH REVERSE SOLIDUS FF3E ; Other_Math # Sk FULLWIDTH CIRCUMFLEX ACCENT 1D400..1D454 ; Other_Math # L& [85] MATHEMATICAL BOLD CAPITAL A..MATHEMATICAL ITALIC SMALL G 1D456..1D49C ; Other_Math # L& [71] MATHEMATICAL ITALIC SMALL I..MATHEMATICAL SCRIPT CAPITAL A 1D49E..1D49F ; Other_Math # L& [2] MATHEMATICAL SCRIPT CAPITAL C..MATHEMATICAL SCRIPT CAPITAL D 1D4A2 ; Other_Math # L& MATHEMATICAL SCRIPT CAPITAL G 1D4A5..1D4A6 ; Other_Math # L& [2] MATHEMATICAL SCRIPT CAPITAL J..MATHEMATICAL SCRIPT CAPITAL K 1D4A9..1D4AC ; Other_Math # L& [4] MATHEMATICAL SCRIPT CAPITAL N..MATHEMATICAL SCRIPT CAPITAL Q 1D4AE..1D4B9 ; Other_Math # L& [12] MATHEMATICAL SCRIPT CAPITAL S..MATHEMATICAL SCRIPT SMALL D 1D4BB ; Other_Math # L& MATHEMATICAL SCRIPT SMALL F 1D4BD..1D4C3 ; Other_Math # L& [7] MATHEMATICAL SCRIPT SMALL H..MATHEMATICAL SCRIPT SMALL N 1D4C5..1D505 ; Other_Math # L& [65] MATHEMATICAL SCRIPT SMALL P..MATHEMATICAL FRAKTUR CAPITAL B 1D507..1D50A ; Other_Math # L& [4] MATHEMATICAL FRAKTUR CAPITAL D..MATHEMATICAL FRAKTUR CAPITAL G 1D50D..1D514 ; Other_Math # L& [8] MATHEMATICAL FRAKTUR CAPITAL J..MATHEMATICAL FRAKTUR CAPITAL Q 1D516..1D51C ; Other_Math # L& [7] MATHEMATICAL FRAKTUR CAPITAL S..MATHEMATICAL FRAKTUR CAPITAL Y 1D51E..1D539 ; Other_Math # L& [28] MATHEMATICAL 
FRAKTUR SMALL A..MATHEMATICAL DOUBLE-STRUCK CAPITAL B 1D53B..1D53E ; Other_Math # L& [4] MATHEMATICAL DOUBLE-STRUCK CAPITAL D..MATHEMATICAL DOUBLE-STRUCK CAPITAL G 1D540..1D544 ; Other_Math # L& [5] MATHEMATICAL DOUBLE-STRUCK CAPITAL I..MATHEMATICAL DOUBLE-STRUCK CAPITAL M 1D546 ; Other_Math # L& MATHEMATICAL DOUBLE-STRUCK CAPITAL O 1D54A..1D550 ; Other_Math # L& [7] MATHEMATICAL DOUBLE-STRUCK CAPITAL S..MATHEMATICAL DOUBLE-STRUCK CAPITAL Y 1D552..1D6A5 ; Other_Math # L& [340] MATHEMATICAL DOUBLE-STRUCK SMALL A..MATHEMATICAL ITALIC SMALL DOTLESS J 1D6A8..1D6C0 ; Other_Math # L& [25] MATHEMATICAL BOLD CAPITAL ALPHA..MATHEMATICAL BOLD CAPITAL OMEGA 1D6C2..1D6DA ; Other_Math # L& [25] MATHEMATICAL BOLD SMALL ALPHA..MATHEMATICAL BOLD SMALL OMEGA 1D6DC..1D6FA ; Other_Math # L& [31] MATHEMATICAL BOLD EPSILON SYMBOL..MATHEMATICAL ITALIC CAPITAL OMEGA 1D6FC..1D714 ; Other_Math # L& [25] MATHEMATICAL ITALIC SMALL ALPHA..MATHEMATICAL ITALIC SMALL OMEGA 1D716..1D734 ; Other_Math # L& [31] MATHEMATICAL ITALIC EPSILON SYMBOL..MATHEMATICAL BOLD ITALIC CAPITAL OMEGA 1D736..1D74E ; Other_Math # L& [25] MATHEMATICAL BOLD ITALIC SMALL ALPHA..MATHEMATICAL BOLD ITALIC SMALL OMEGA 1D750..1D76E ; Other_Math # L& [31] MATHEMATICAL BOLD ITALIC EPSILON SYMBOL..MATHEMATICAL SANS-SERIF BOLD CAPITAL OMEGA 1D770..1D788 ; Other_Math # L& [25] MATHEMATICAL SANS-SERIF BOLD SMALL ALPHA..MATHEMATICAL SANS-SERIF BOLD SMALL OMEGA 1D78A..1D7A8 ; Other_Math # L& [31] MATHEMATICAL SANS-SERIF BOLD EPSILON SYMBOL..MATHEMATICAL SANS-SERIF BOLD ITALIC CAPITAL OMEGA 1D7AA..1D7C2 ; Other_Math # L& [25] MATHEMATICAL SANS-SERIF BOLD ITALIC SMALL ALPHA..MATHEMATICAL SANS-SERIF BOLD ITALIC SMALL OMEGA 1D7C4..1D7CB ; Other_Math # L& [8] MATHEMATICAL SANS-SERIF BOLD ITALIC EPSILON SYMBOL..MATHEMATICAL BOLD SMALL DIGAMMA 1D7CE..1D7FF ; Other_Math # Nd [50] MATHEMATICAL BOLD DIGIT ZERO..MATHEMATICAL MONOSPACE DIGIT NINE 1EE00..1EE03 ; Other_Math # Lo [4] ARABIC MATHEMATICAL ALEF..ARABIC MATHEMATICAL DAL 1EE05..1EE1F 
; Other_Math # Lo [27] ARABIC MATHEMATICAL WAW..ARABIC MATHEMATICAL DOTLESS QAF 1EE21..1EE22 ; Other_Math # Lo [2] ARABIC MATHEMATICAL INITIAL BEH..ARABIC MATHEMATICAL INITIAL JEEM 1EE24 ; Other_Math # Lo ARABIC MATHEMATICAL INITIAL HEH 1EE27 ; Other_Math # Lo ARABIC MATHEMATICAL INITIAL HAH 1EE29..1EE32 ; Other_Math # Lo [10] ARABIC MATHEMATICAL INITIAL YEH..ARABIC MATHEMATICAL INITIAL QAF 1EE34..1EE37 ; Other_Math # Lo [4] ARABIC MATHEMATICAL INITIAL SHEEN..ARABIC MATHEMATICAL INITIAL KHAH 1EE39 ; Other_Math # Lo ARABIC MATHEMATICAL INITIAL DAD 1EE3B ; Other_Math # Lo ARABIC MATHEMATICAL INITIAL GHAIN 1EE42 ; Other_Math # Lo ARABIC MATHEMATICAL TAILED JEEM 1EE47 ; Other_Math # Lo ARABIC MATHEMATICAL TAILED HAH 1EE49 ; Other_Math # Lo ARABIC MATHEMATICAL TAILED YEH 1EE4B ; Other_Math # Lo ARABIC MATHEMATICAL TAILED LAM 1EE4D..1EE4F ; Other_Math # Lo [3] ARABIC MATHEMATICAL TAILED NOON..ARABIC MATHEMATICAL TAILED AIN 1EE51..1EE52 ; Other_Math # Lo [2] ARABIC MATHEMATICAL TAILED SAD..ARABIC MATHEMATICAL TAILED QAF 1EE54 ; Other_Math # Lo ARABIC MATHEMATICAL TAILED SHEEN 1EE57 ; Other_Math # Lo ARABIC MATHEMATICAL TAILED KHAH 1EE59 ; Other_Math # Lo ARABIC MATHEMATICAL TAILED DAD 1EE5B ; Other_Math # Lo ARABIC MATHEMATICAL TAILED GHAIN 1EE5D ; Other_Math # Lo ARABIC MATHEMATICAL TAILED DOTLESS NOON 1EE5F ; Other_Math # Lo ARABIC MATHEMATICAL TAILED DOTLESS QAF 1EE61..1EE62 ; Other_Math # Lo [2] ARABIC MATHEMATICAL STRETCHED BEH..ARABIC MATHEMATICAL STRETCHED JEEM 1EE64 ; Other_Math # Lo ARABIC MATHEMATICAL STRETCHED HEH 1EE67..1EE6A ; Other_Math # Lo [4] ARABIC MATHEMATICAL STRETCHED HAH..ARABIC MATHEMATICAL STRETCHED KAF 1EE6C..1EE72 ; Other_Math # Lo [7] ARABIC MATHEMATICAL STRETCHED MEEM..ARABIC MATHEMATICAL STRETCHED QAF 1EE74..1EE77 ; Other_Math # Lo [4] ARABIC MATHEMATICAL STRETCHED SHEEN..ARABIC MATHEMATICAL STRETCHED KHAH 1EE79..1EE7C ; Other_Math # Lo [4] ARABIC MATHEMATICAL STRETCHED DAD..ARABIC MATHEMATICAL STRETCHED DOTLESS BEH 1EE7E ; Other_Math # Lo 
ARABIC MATHEMATICAL STRETCHED DOTLESS FEH 1EE80..1EE89 ; Other_Math # Lo [10] ARABIC MATHEMATICAL LOOPED ALEF..ARABIC MATHEMATICAL LOOPED YEH 1EE8B..1EE9B ; Other_Math # Lo [17] ARABIC MATHEMATICAL LOOPED LAM..ARABIC MATHEMATICAL LOOPED GHAIN 1EEA1..1EEA3 ; Other_Math # Lo [3] ARABIC MATHEMATICAL DOUBLE-STRUCK BEH..ARABIC MATHEMATICAL DOUBLE-STRUCK DAL 1EEA5..1EEA9 ; Other_Math # Lo [5] ARABIC MATHEMATICAL DOUBLE-STRUCK WAW..ARABIC MATHEMATICAL DOUBLE-STRUCK YEH 1EEAB..1EEBB ; Other_Math # Lo [17] ARABIC MATHEMATICAL DOUBLE-STRUCK LAM..ARABIC MATHEMATICAL DOUBLE-STRUCK GHAIN # Total code points: 1362 # ================================================ 0030..0039 ; Hex_Digit # Nd [10] DIGIT ZERO..DIGIT NINE 0041..0046 ; Hex_Digit # L& [6] LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER F 0061..0066 ; Hex_Digit # L& [6] LATIN SMALL LETTER A..LATIN SMALL LETTER F FF10..FF19 ; Hex_Digit # Nd [10] FULLWIDTH DIGIT ZERO..FULLWIDTH DIGIT NINE FF21..FF26 ; Hex_Digit # L& [6] FULLWIDTH LATIN CAPITAL LETTER A..FULLWIDTH LATIN CAPITAL LETTER F FF41..FF46 ; Hex_Digit # L& [6] FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH LATIN SMALL LETTER F # Total code points: 44 # ================================================ 0030..0039 ; ASCII_Hex_Digit # Nd [10] DIGIT ZERO..DIGIT NINE 0041..0046 ; ASCII_Hex_Digit # L& [6] LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER F 0061..0066 ; ASCII_Hex_Digit # L& [6] LATIN SMALL LETTER A..LATIN SMALL LETTER F # Total code points: 22 # ================================================ 0345 ; Other_Alphabetic # Mn COMBINING GREEK YPOGEGRAMMENI 0363..036F ; Other_Alphabetic # Mn [13] COMBINING LATIN SMALL LETTER A..COMBINING LATIN SMALL LETTER X 05B0..05BD ; Other_Alphabetic # Mn [14] HEBREW POINT SHEVA..HEBREW POINT METEG 05BF ; Other_Alphabetic # Mn HEBREW POINT RAFE 05C1..05C2 ; Other_Alphabetic # Mn [2] HEBREW POINT SHIN DOT..HEBREW POINT SIN DOT 05C4..05C5 ; Other_Alphabetic # Mn [2] HEBREW MARK UPPER DOT..HEBREW MARK LOWER DOT 05C7 ; Other_Alphabetic # 
Mn HEBREW POINT QAMATS QATAN 0610..061A ; Other_Alphabetic # Mn [11] ARABIC SIGN SALLALLAHOU ALAYHE WASSALLAM..ARABIC SMALL KASRA 064B..0657 ; Other_Alphabetic # Mn [13] ARABIC FATHATAN..ARABIC INVERTED DAMMA 0659..065F ; Other_Alphabetic # Mn [7] ARABIC ZWARAKAY..ARABIC WAVY HAMZA BELOW 0670 ; Other_Alphabetic # Mn ARABIC LETTER SUPERSCRIPT ALEF 06D6..06DC ; Other_Alphabetic # Mn [7] ARABIC SMALL HIGH LIGATURE SAD WITH LAM WITH ALEF MAKSURA..ARABIC SMALL HIGH SEEN 06E1..06E4 ; Other_Alphabetic # Mn [4] ARABIC SMALL HIGH DOTLESS HEAD OF KHAH..ARABIC SMALL HIGH MADDA 06E7..06E8 ; Other_Alphabetic # Mn [2] ARABIC SMALL HIGH YEH..ARABIC SMALL HIGH NOON 06ED ; Other_Alphabetic # Mn ARABIC SMALL LOW MEEM 0711 ; Other_Alphabetic # Mn SYRIAC LETTER SUPERSCRIPT ALAPH 0730..073F ; Other_Alphabetic # Mn [16] SYRIAC PTHAHA ABOVE..SYRIAC RWAHA 07A6..07B0 ; Other_Alphabetic # Mn [11] THAANA ABAFILI..THAANA SUKUN 0816..0817 ; Other_Alphabetic # Mn [2] SAMARITAN MARK IN..SAMARITAN MARK IN-ALAF 081B..0823 ; Other_Alphabetic # Mn [9] SAMARITAN MARK EPENTHETIC YUT..SAMARITAN VOWEL SIGN A 0825..0827 ; Other_Alphabetic # Mn [3] SAMARITAN VOWEL SIGN SHORT A..SAMARITAN VOWEL SIGN U 0829..082C ; Other_Alphabetic # Mn [4] SAMARITAN VOWEL SIGN LONG I..SAMARITAN VOWEL SIGN SUKUN 0897 ; Other_Alphabetic # Mn ARABIC PEPET 08D4..08DF ; Other_Alphabetic # Mn [12] ARABIC SMALL HIGH WORD AR-RUB..ARABIC SMALL HIGH WORD WAQFA 08E3..08E9 ; Other_Alphabetic # Mn [7] ARABIC TURNED DAMMA BELOW..ARABIC CURLY KASRATAN 08F0..0902 ; Other_Alphabetic # Mn [19] ARABIC OPEN FATHATAN..DEVANAGARI SIGN ANUSVARA 0903 ; Other_Alphabetic # Mc DEVANAGARI SIGN VISARGA 093A ; Other_Alphabetic # Mn DEVANAGARI VOWEL SIGN OE 093B ; Other_Alphabetic # Mc DEVANAGARI VOWEL SIGN OOE 093E..0940 ; Other_Alphabetic # Mc [3] DEVANAGARI VOWEL SIGN AA..DEVANAGARI VOWEL SIGN II 0941..0948 ; Other_Alphabetic # Mn [8] DEVANAGARI VOWEL SIGN U..DEVANAGARI VOWEL SIGN AI 0949..094C ; Other_Alphabetic # Mc [4] DEVANAGARI VOWEL SIGN CANDRA 
O..DEVANAGARI VOWEL SIGN AU 094E..094F ; Other_Alphabetic # Mc [2] DEVANAGARI VOWEL SIGN PRISHTHAMATRA E..DEVANAGARI VOWEL SIGN AW 0955..0957 ; Other_Alphabetic # Mn [3] DEVANAGARI VOWEL SIGN CANDRA LONG E..DEVANAGARI VOWEL SIGN UUE 0962..0963 ; Other_Alphabetic # Mn [2] DEVANAGARI VOWEL SIGN VOCALIC L..DEVANAGARI VOWEL SIGN VOCALIC LL 0981 ; Other_Alphabetic # Mn BENGALI SIGN CANDRABINDU 0982..0983 ; Other_Alphabetic # Mc [2] BENGALI SIGN ANUSVARA..BENGALI SIGN VISARGA 09BE..09C0 ; Other_Alphabetic # Mc [3] BENGALI VOWEL SIGN AA..BENGALI VOWEL SIGN II 09C1..09C4 ; Other_Alphabetic # Mn [4] BENGALI VOWEL SIGN U..BENGALI VOWEL SIGN VOCALIC RR 09C7..09C8 ; Other_Alphabetic # Mc [2] BENGALI VOWEL SIGN E..BENGALI VOWEL SIGN AI 09CB..09CC ; Other_Alphabetic # Mc [2] BENGALI VOWEL SIGN O..BENGALI VOWEL SIGN AU 09D7 ; Other_Alphabetic # Mc BENGALI AU LENGTH MARK 09E2..09E3 ; Other_Alphabetic # Mn [2] BENGALI VOWEL SIGN VOCALIC L..BENGALI VOWEL SIGN VOCALIC LL 0A01..0A02 ; Other_Alphabetic # Mn [2] GURMUKHI SIGN ADAK BINDI..GURMUKHI SIGN BINDI 0A03 ; Other_Alphabetic # Mc GURMUKHI SIGN VISARGA 0A3E..0A40 ; Other_Alphabetic # Mc [3] GURMUKHI VOWEL SIGN AA..GURMUKHI VOWEL SIGN II 0A41..0A42 ; Other_Alphabetic # Mn [2] GURMUKHI VOWEL SIGN U..GURMUKHI VOWEL SIGN UU 0A47..0A48 ; Other_Alphabetic # Mn [2] GURMUKHI VOWEL SIGN EE..GURMUKHI VOWEL SIGN AI 0A4B..0A4C ; Other_Alphabetic # Mn [2] GURMUKHI VOWEL SIGN OO..GURMUKHI VOWEL SIGN AU 0A51 ; Other_Alphabetic # Mn GURMUKHI SIGN UDAAT 0A70..0A71 ; Other_Alphabetic # Mn [2] GURMUKHI TIPPI..GURMUKHI ADDAK 0A75 ; Other_Alphabetic # Mn GURMUKHI SIGN YAKASH 0A81..0A82 ; Other_Alphabetic # Mn [2] GUJARATI SIGN CANDRABINDU..GUJARATI SIGN ANUSVARA 0A83 ; Other_Alphabetic # Mc GUJARATI SIGN VISARGA 0ABE..0AC0 ; Other_Alphabetic # Mc [3] GUJARATI VOWEL SIGN AA..GUJARATI VOWEL SIGN II 0AC1..0AC5 ; Other_Alphabetic # Mn [5] GUJARATI VOWEL SIGN U..GUJARATI VOWEL SIGN CANDRA E 0AC7..0AC8 ; Other_Alphabetic # Mn [2] GUJARATI VOWEL SIGN 
E..GUJARATI VOWEL SIGN AI 0AC9 ; Other_Alphabetic # Mc GUJARATI VOWEL SIGN CANDRA O 0ACB..0ACC ; Other_Alphabetic # Mc [2] GUJARATI VOWEL SIGN O..GUJARATI VOWEL SIGN AU 0AE2..0AE3 ; Other_Alphabetic # Mn [2] GUJARATI VOWEL SIGN VOCALIC L..GUJARATI VOWEL SIGN VOCALIC LL 0AFA..0AFC ; Other_Alphabetic # Mn [3] GUJARATI SIGN SUKUN..GUJARATI SIGN MADDAH 0B01 ; Other_Alphabetic # Mn ORIYA SIGN CANDRABINDU 0B02..0B03 ; Other_Alphabetic # Mc [2] ORIYA SIGN ANUSVARA..ORIYA SIGN VISARGA 0B3E ; Other_Alphabetic # Mc ORIYA VOWEL SIGN AA 0B3F ; Other_Alphabetic # Mn ORIYA VOWEL SIGN I 0B40 ; Other_Alphabetic # Mc ORIYA VOWEL SIGN II 0B41..0B44 ; Other_Alphabetic # Mn [4] ORIYA VOWEL SIGN U..ORIYA VOWEL SIGN VOCALIC RR 0B47..0B48 ; Other_Alphabetic # Mc [2] ORIYA VOWEL SIGN E..ORIYA VOWEL SIGN AI 0B4B..0B4C ; Other_Alphabetic # Mc [2] ORIYA VOWEL SIGN O..ORIYA VOWEL SIGN AU 0B56 ; Other_Alphabetic # Mn ORIYA AI LENGTH MARK 0B57 ; Other_Alphabetic # Mc ORIYA AU LENGTH MARK 0B62..0B63 ; Other_Alphabetic # Mn [2] ORIYA VOWEL SIGN VOCALIC L..ORIYA VOWEL SIGN VOCALIC LL 0B82 ; Other_Alphabetic # Mn TAMIL SIGN ANUSVARA 0BBE..0BBF ; Other_Alphabetic # Mc [2] TAMIL VOWEL SIGN AA..TAMIL VOWEL SIGN I 0BC0 ; Other_Alphabetic # Mn TAMIL VOWEL SIGN II 0BC1..0BC2 ; Other_Alphabetic # Mc [2] TAMIL VOWEL SIGN U..TAMIL VOWEL SIGN UU 0BC6..0BC8 ; Other_Alphabetic # Mc [3] TAMIL VOWEL SIGN E..TAMIL VOWEL SIGN AI 0BCA..0BCC ; Other_Alphabetic # Mc [3] TAMIL VOWEL SIGN O..TAMIL VOWEL SIGN AU 0BD7 ; Other_Alphabetic # Mc TAMIL AU LENGTH MARK 0C00 ; Other_Alphabetic # Mn TELUGU SIGN COMBINING CANDRABINDU ABOVE 0C01..0C03 ; Other_Alphabetic # Mc [3] TELUGU SIGN CANDRABINDU..TELUGU SIGN VISARGA 0C04 ; Other_Alphabetic # Mn TELUGU SIGN COMBINING ANUSVARA ABOVE 0C3E..0C40 ; Other_Alphabetic # Mn [3] TELUGU VOWEL SIGN AA..TELUGU VOWEL SIGN II 0C41..0C44 ; Other_Alphabetic # Mc [4] TELUGU VOWEL SIGN U..TELUGU VOWEL SIGN VOCALIC RR 0C46..0C48 ; Other_Alphabetic # Mn [3] TELUGU VOWEL SIGN E..TELUGU VOWEL SIGN 
AI 0C4A..0C4C ; Other_Alphabetic # Mn [3] TELUGU VOWEL SIGN O..TELUGU VOWEL SIGN AU 0C55..0C56 ; Other_Alphabetic # Mn [2] TELUGU LENGTH MARK..TELUGU AI LENGTH MARK 0C62..0C63 ; Other_Alphabetic # Mn [2] TELUGU VOWEL SIGN VOCALIC L..TELUGU VOWEL SIGN VOCALIC LL 0C81 ; Other_Alphabetic # Mn KANNADA SIGN CANDRABINDU 0C82..0C83 ; Other_Alphabetic # Mc [2] KANNADA SIGN ANUSVARA..KANNADA SIGN VISARGA 0CBE ; Other_Alphabetic # Mc KANNADA VOWEL SIGN AA 0CBF ; Other_Alphabetic # Mn KANNADA VOWEL SIGN I 0CC0..0CC4 ; Other_Alphabetic # Mc [5] KANNADA VOWEL SIGN II..KANNADA VOWEL SIGN VOCALIC RR 0CC6 ; Other_Alphabetic # Mn KANNADA VOWEL SIGN E 0CC7..0CC8 ; Other_Alphabetic # Mc [2] KANNADA VOWEL SIGN EE..KANNADA VOWEL SIGN AI 0CCA..0CCB ; Other_Alphabetic # Mc [2] KANNADA VOWEL SIGN O..KANNADA VOWEL SIGN OO 0CCC ; Other_Alphabetic # Mn KANNADA VOWEL SIGN AU 0CD5..0CD6 ; Other_Alphabetic # Mc [2] KANNADA LENGTH MARK..KANNADA AI LENGTH MARK 0CE2..0CE3 ; Other_Alphabetic # Mn [2] KANNADA VOWEL SIGN VOCALIC L..KANNADA VOWEL SIGN VOCALIC LL 0CF3 ; Other_Alphabetic # Mc KANNADA SIGN COMBINING ANUSVARA ABOVE RIGHT 0D00..0D01 ; Other_Alphabetic # Mn [2] MALAYALAM SIGN COMBINING ANUSVARA ABOVE..MALAYALAM SIGN CANDRABINDU 0D02..0D03 ; Other_Alphabetic # Mc [2] MALAYALAM SIGN ANUSVARA..MALAYALAM SIGN VISARGA 0D3E..0D40 ; Other_Alphabetic # Mc [3] MALAYALAM VOWEL SIGN AA..MALAYALAM VOWEL SIGN II 0D41..0D44 ; Other_Alphabetic # Mn [4] MALAYALAM VOWEL SIGN U..MALAYALAM VOWEL SIGN VOCALIC RR 0D46..0D48 ; Other_Alphabetic # Mc [3] MALAYALAM VOWEL SIGN E..MALAYALAM VOWEL SIGN AI 0D4A..0D4C ; Other_Alphabetic # Mc [3] MALAYALAM VOWEL SIGN O..MALAYALAM VOWEL SIGN AU 0D57 ; Other_Alphabetic # Mc MALAYALAM AU LENGTH MARK 0D62..0D63 ; Other_Alphabetic # Mn [2] MALAYALAM VOWEL SIGN VOCALIC L..MALAYALAM VOWEL SIGN VOCALIC LL 0D81 ; Other_Alphabetic # Mn SINHALA SIGN CANDRABINDU 0D82..0D83 ; Other_Alphabetic # Mc [2] SINHALA SIGN ANUSVARAYA..SINHALA SIGN VISARGAYA 0DCF..0DD1 ; Other_Alphabetic # Mc 
[3] SINHALA VOWEL SIGN AELA-PILLA..SINHALA VOWEL SIGN DIGA AEDA-PILLA 0DD2..0DD4 ; Other_Alphabetic # Mn [3] SINHALA VOWEL SIGN KETTI IS-PILLA..SINHALA VOWEL SIGN KETTI PAA-PILLA 0DD6 ; Other_Alphabetic # Mn SINHALA VOWEL SIGN DIGA PAA-PILLA 0DD8..0DDF ; Other_Alphabetic # Mc [8] SINHALA VOWEL SIGN GAETTA-PILLA..SINHALA VOWEL SIGN GAYANUKITTA 0DF2..0DF3 ; Other_Alphabetic # Mc [2] SINHALA VOWEL SIGN DIGA GAETTA-PILLA..SINHALA VOWEL SIGN DIGA GAYANUKITTA 0E31 ; Other_Alphabetic # Mn THAI CHARACTER MAI HAN-AKAT 0E34..0E3A ; Other_Alphabetic # Mn [7] THAI CHARACTER SARA I..THAI CHARACTER PHINTHU 0E4D ; Other_Alphabetic # Mn THAI CHARACTER NIKHAHIT 0EB1 ; Other_Alphabetic # Mn LAO VOWEL SIGN MAI KAN 0EB4..0EB9 ; Other_Alphabetic # Mn [6] LAO VOWEL SIGN I..LAO VOWEL SIGN UU 0EBB..0EBC ; Other_Alphabetic # Mn [2] LAO VOWEL SIGN MAI KON..LAO SEMIVOWEL SIGN LO 0ECD ; Other_Alphabetic # Mn LAO NIGGAHITA 0F71..0F7E ; Other_Alphabetic # Mn [14] TIBETAN VOWEL SIGN AA..TIBETAN SIGN RJES SU NGA RO 0F7F ; Other_Alphabetic # Mc TIBETAN SIGN RNAM BCAD 0F80..0F83 ; Other_Alphabetic # Mn [4] TIBETAN VOWEL SIGN REVERSED I..TIBETAN SIGN SNA LDAN 0F8D..0F97 ; Other_Alphabetic # Mn [11] TIBETAN SUBJOINED SIGN LCE TSA CAN..TIBETAN SUBJOINED LETTER JA 0F99..0FBC ; Other_Alphabetic # Mn [36] TIBETAN SUBJOINED LETTER NYA..TIBETAN SUBJOINED LETTER FIXED-FORM RA 102B..102C ; Other_Alphabetic # Mc [2] MYANMAR VOWEL SIGN TALL AA..MYANMAR VOWEL SIGN AA 102D..1030 ; Other_Alphabetic # Mn [4] MYANMAR VOWEL SIGN I..MYANMAR VOWEL SIGN UU 1031 ; Other_Alphabetic # Mc MYANMAR VOWEL SIGN E 1032..1036 ; Other_Alphabetic # Mn [5] MYANMAR VOWEL SIGN AI..MYANMAR SIGN ANUSVARA 1038 ; Other_Alphabetic # Mc MYANMAR SIGN VISARGA 103B..103C ; Other_Alphabetic # Mc [2] MYANMAR CONSONANT SIGN MEDIAL YA..MYANMAR CONSONANT SIGN MEDIAL RA 103D..103E ; Other_Alphabetic # Mn [2] MYANMAR CONSONANT SIGN MEDIAL WA..MYANMAR CONSONANT SIGN MEDIAL HA 1056..1057 ; Other_Alphabetic # Mc [2] MYANMAR VOWEL SIGN VOCALIC 
R..MYANMAR VOWEL SIGN VOCALIC RR 1058..1059 ; Other_Alphabetic # Mn [2] MYANMAR VOWEL SIGN VOCALIC L..MYANMAR VOWEL SIGN VOCALIC LL 105E..1060 ; Other_Alphabetic # Mn [3] MYANMAR CONSONANT SIGN MON MEDIAL NA..MYANMAR CONSONANT SIGN MON MEDIAL LA 1062..1064 ; Other_Alphabetic # Mc [3] MYANMAR VOWEL SIGN SGAW KAREN EU..MYANMAR TONE MARK SGAW KAREN KE PHO 1067..106D ; Other_Alphabetic # Mc [7] MYANMAR VOWEL SIGN WESTERN PWO KAREN EU..MYANMAR SIGN WESTERN PWO KAREN TONE-5 1071..1074 ; Other_Alphabetic # Mn [4] MYANMAR VOWEL SIGN GEBA KAREN I..MYANMAR VOWEL SIGN KAYAH EE 1082 ; Other_Alphabetic # Mn MYANMAR CONSONANT SIGN SHAN MEDIAL WA 1083..1084 ; Other_Alphabetic # Mc [2] MYANMAR VOWEL SIGN SHAN AA..MYANMAR VOWEL SIGN SHAN E 1085..1086 ; Other_Alphabetic # Mn [2] MYANMAR VOWEL SIGN SHAN E ABOVE..MYANMAR VOWEL SIGN SHAN FINAL Y 1087..108C ; Other_Alphabetic # Mc [6] MYANMAR SIGN SHAN TONE-2..MYANMAR SIGN SHAN COUNCIL TONE-3 108D ; Other_Alphabetic # Mn MYANMAR SIGN SHAN COUNCIL EMPHATIC TONE 108F ; Other_Alphabetic # Mc MYANMAR SIGN RUMAI PALAUNG TONE-5 109A..109C ; Other_Alphabetic # Mc [3] MYANMAR SIGN KHAMTI TONE-1..MYANMAR VOWEL SIGN AITON A 109D ; Other_Alphabetic # Mn MYANMAR VOWEL SIGN AITON AI 1712..1713 ; Other_Alphabetic # Mn [2] TAGALOG VOWEL SIGN I..TAGALOG VOWEL SIGN U 1732..1733 ; Other_Alphabetic # Mn [2] HANUNOO VOWEL SIGN I..HANUNOO VOWEL SIGN U 1752..1753 ; Other_Alphabetic # Mn [2] BUHID VOWEL SIGN I..BUHID VOWEL SIGN U 1772..1773 ; Other_Alphabetic # Mn [2] TAGBANWA VOWEL SIGN I..TAGBANWA VOWEL SIGN U 17B6 ; Other_Alphabetic # Mc KHMER VOWEL SIGN AA 17B7..17BD ; Other_Alphabetic # Mn [7] KHMER VOWEL SIGN I..KHMER VOWEL SIGN UA 17BE..17C5 ; Other_Alphabetic # Mc [8] KHMER VOWEL SIGN OE..KHMER VOWEL SIGN AU 17C6 ; Other_Alphabetic # Mn KHMER SIGN NIKAHIT 17C7..17C8 ; Other_Alphabetic # Mc [2] KHMER SIGN REAHMUK..KHMER SIGN YUUKALEAPINTU 1885..1886 ; Other_Alphabetic # Mn [2] MONGOLIAN LETTER ALI GALI BALUDA..MONGOLIAN LETTER ALI GALI THREE BALUDA 
18A9 ; Other_Alphabetic # Mn MONGOLIAN LETTER ALI GALI DAGALGA 1920..1922 ; Other_Alphabetic # Mn [3] LIMBU VOWEL SIGN A..LIMBU VOWEL SIGN U 1923..1926 ; Other_Alphabetic # Mc [4] LIMBU VOWEL SIGN EE..LIMBU VOWEL SIGN AU 1927..1928 ; Other_Alphabetic # Mn [2] LIMBU VOWEL SIGN E..LIMBU VOWEL SIGN O 1929..192B ; Other_Alphabetic # Mc [3] LIMBU SUBJOINED LETTER YA..LIMBU SUBJOINED LETTER WA 1930..1931 ; Other_Alphabetic # Mc [2] LIMBU SMALL LETTER KA..LIMBU SMALL LETTER NGA 1932 ; Other_Alphabetic # Mn LIMBU SMALL LETTER ANUSVARA 1933..1938 ; Other_Alphabetic # Mc [6] LIMBU SMALL LETTER TA..LIMBU SMALL LETTER LA 1A17..1A18 ; Other_Alphabetic # Mn [2] BUGINESE VOWEL SIGN I..BUGINESE VOWEL SIGN U 1A19..1A1A ; Other_Alphabetic # Mc [2] BUGINESE VOWEL SIGN E..BUGINESE VOWEL SIGN O 1A1B ; Other_Alphabetic # Mn BUGINESE VOWEL SIGN AE 1A55 ; Other_Alphabetic # Mc TAI THAM CONSONANT SIGN MEDIAL RA 1A56 ; Other_Alphabetic # Mn TAI THAM CONSONANT SIGN MEDIAL LA 1A57 ; Other_Alphabetic # Mc TAI THAM CONSONANT SIGN LA TANG LAI 1A58..1A5E ; Other_Alphabetic # Mn [7] TAI THAM SIGN MAI KANG LAI..TAI THAM CONSONANT SIGN SA 1A61 ; Other_Alphabetic # Mc TAI THAM VOWEL SIGN A 1A62 ; Other_Alphabetic # Mn TAI THAM VOWEL SIGN MAI SAT 1A63..1A64 ; Other_Alphabetic # Mc [2] TAI THAM VOWEL SIGN AA..TAI THAM VOWEL SIGN TALL AA 1A65..1A6C ; Other_Alphabetic # Mn [8] TAI THAM VOWEL SIGN I..TAI THAM VOWEL SIGN OA BELOW 1A6D..1A72 ; Other_Alphabetic # Mc [6] TAI THAM VOWEL SIGN OY..TAI THAM VOWEL SIGN THAM AI 1A73..1A74 ; Other_Alphabetic # Mn [2] TAI THAM VOWEL SIGN OA ABOVE..TAI THAM SIGN MAI KANG 1ABF..1AC0 ; Other_Alphabetic # Mn [2] COMBINING LATIN SMALL LETTER W BELOW..COMBINING LATIN SMALL LETTER TURNED W BELOW 1ACC..1ACE ; Other_Alphabetic # Mn [3] COMBINING LATIN SMALL LETTER INSULAR G..COMBINING LATIN SMALL LETTER INSULAR T 1B00..1B03 ; Other_Alphabetic # Mn [4] BALINESE SIGN ULU RICEM..BALINESE SIGN SURANG 1B04 ; Other_Alphabetic # Mc BALINESE SIGN BISAH 1B35 ; Other_Alphabetic # Mc 
BALINESE VOWEL SIGN TEDUNG 1B36..1B3A ; Other_Alphabetic # Mn [5] BALINESE VOWEL SIGN ULU..BALINESE VOWEL SIGN RA REPA 1B3B ; Other_Alphabetic # Mc BALINESE VOWEL SIGN RA REPA TEDUNG 1B3C ; Other_Alphabetic # Mn BALINESE VOWEL SIGN LA LENGA 1B3D..1B41 ; Other_Alphabetic # Mc [5] BALINESE VOWEL SIGN LA LENGA TEDUNG..BALINESE VOWEL SIGN TALING REPA TEDUNG 1B42 ; Other_Alphabetic # Mn BALINESE VOWEL SIGN PEPET 1B43 ; Other_Alphabetic # Mc BALINESE VOWEL SIGN PEPET TEDUNG 1B80..1B81 ; Other_Alphabetic # Mn [2] SUNDANESE SIGN PANYECEK..SUNDANESE SIGN PANGLAYAR 1B82 ; Other_Alphabetic # Mc SUNDANESE SIGN PANGWISAD 1BA1 ; Other_Alphabetic # Mc SUNDANESE CONSONANT SIGN PAMINGKAL 1BA2..1BA5 ; Other_Alphabetic # Mn [4] SUNDANESE CONSONANT SIGN PANYAKRA..SUNDANESE VOWEL SIGN PANYUKU 1BA6..1BA7 ; Other_Alphabetic # Mc [2] SUNDANESE VOWEL SIGN PANAELAENG..SUNDANESE VOWEL SIGN PANOLONG 1BA8..1BA9 ; Other_Alphabetic # Mn [2] SUNDANESE VOWEL SIGN PAMEPET..SUNDANESE VOWEL SIGN PANEULEUNG 1BAC..1BAD ; Other_Alphabetic # Mn [2] SUNDANESE CONSONANT SIGN PASANGAN MA..SUNDANESE CONSONANT SIGN PASANGAN WA 1BE7 ; Other_Alphabetic # Mc BATAK VOWEL SIGN E 1BE8..1BE9 ; Other_Alphabetic # Mn [2] BATAK VOWEL SIGN PAKPAK E..BATAK VOWEL SIGN EE 1BEA..1BEC ; Other_Alphabetic # Mc [3] BATAK VOWEL SIGN I..BATAK VOWEL SIGN O 1BED ; Other_Alphabetic # Mn BATAK VOWEL SIGN KARO O 1BEE ; Other_Alphabetic # Mc BATAK VOWEL SIGN U 1BEF..1BF1 ; Other_Alphabetic # Mn [3] BATAK VOWEL SIGN U FOR SIMALUNGUN SA..BATAK CONSONANT SIGN H 1C24..1C2B ; Other_Alphabetic # Mc [8] LEPCHA SUBJOINED LETTER YA..LEPCHA VOWEL SIGN UU 1C2C..1C33 ; Other_Alphabetic # Mn [8] LEPCHA VOWEL SIGN E..LEPCHA CONSONANT SIGN T 1C34..1C35 ; Other_Alphabetic # Mc [2] LEPCHA CONSONANT SIGN NYIN-DO..LEPCHA CONSONANT SIGN KANG 1C36 ; Other_Alphabetic # Mn LEPCHA SIGN RAN 1DD3..1DF4 ; Other_Alphabetic # Mn [34] COMBINING LATIN SMALL LETTER FLATTENED OPEN A ABOVE..COMBINING LATIN SMALL LETTER U WITH DIAERESIS 24B6..24E9 ; Other_Alphabetic # 
So [52] CIRCLED LATIN CAPITAL LETTER A..CIRCLED LATIN SMALL LETTER Z 2DE0..2DFF ; Other_Alphabetic # Mn [32] COMBINING CYRILLIC LETTER BE..COMBINING CYRILLIC LETTER IOTIFIED BIG YUS A674..A67B ; Other_Alphabetic # Mn [8] COMBINING CYRILLIC LETTER UKRAINIAN IE..COMBINING CYRILLIC LETTER OMEGA A69E..A69F ; Other_Alphabetic # Mn [2] COMBINING CYRILLIC LETTER EF..COMBINING CYRILLIC LETTER IOTIFIED E A802 ; Other_Alphabetic # Mn SYLOTI NAGRI SIGN DVISVARA A80B ; Other_Alphabetic # Mn SYLOTI NAGRI SIGN ANUSVARA A823..A824 ; Other_Alphabetic # Mc [2] SYLOTI NAGRI VOWEL SIGN A..SYLOTI NAGRI VOWEL SIGN I A825..A826 ; Other_Alphabetic # Mn [2] SYLOTI NAGRI VOWEL SIGN U..SYLOTI NAGRI VOWEL SIGN E A827 ; Other_Alphabetic # Mc SYLOTI NAGRI VOWEL SIGN OO A880..A881 ; Other_Alphabetic # Mc [2] SAURASHTRA SIGN ANUSVARA..SAURASHTRA SIGN VISARGA A8B4..A8C3 ; Other_Alphabetic # Mc [16] SAURASHTRA CONSONANT SIGN HAARU..SAURASHTRA VOWEL SIGN AU A8C5 ; Other_Alphabetic # Mn SAURASHTRA SIGN CANDRABINDU A8FF ; Other_Alphabetic # Mn DEVANAGARI VOWEL SIGN AY A926..A92A ; Other_Alphabetic # Mn [5] KAYAH LI VOWEL UE..KAYAH LI VOWEL O A947..A951 ; Other_Alphabetic # Mn [11] REJANG VOWEL SIGN I..REJANG CONSONANT SIGN R A952 ; Other_Alphabetic # Mc REJANG CONSONANT SIGN H A980..A982 ; Other_Alphabetic # Mn [3] JAVANESE SIGN PANYANGGA..JAVANESE SIGN LAYAR A983 ; Other_Alphabetic # Mc JAVANESE SIGN WIGNYAN A9B4..A9B5 ; Other_Alphabetic # Mc [2] JAVANESE VOWEL SIGN TARUNG..JAVANESE VOWEL SIGN TOLONG A9B6..A9B9 ; Other_Alphabetic # Mn [4] JAVANESE VOWEL SIGN WULU..JAVANESE VOWEL SIGN SUKU MENDUT A9BA..A9BB ; Other_Alphabetic # Mc [2] JAVANESE VOWEL SIGN TALING..JAVANESE VOWEL SIGN DIRGA MURE A9BC..A9BD ; Other_Alphabetic # Mn [2] JAVANESE VOWEL SIGN PEPET..JAVANESE CONSONANT SIGN KERET A9BE..A9BF ; Other_Alphabetic # Mc [2] JAVANESE CONSONANT SIGN PENGKAL..JAVANESE CONSONANT SIGN CAKRA A9E5 ; Other_Alphabetic # Mn MYANMAR SIGN SHAN SAW AA29..AA2E ; Other_Alphabetic # Mn [6] CHAM VOWEL SIGN AA..CHAM 
VOWEL SIGN OE AA2F..AA30 ; Other_Alphabetic # Mc [2] CHAM VOWEL SIGN O..CHAM VOWEL SIGN AI AA31..AA32 ; Other_Alphabetic # Mn [2] CHAM VOWEL SIGN AU..CHAM VOWEL SIGN UE AA33..AA34 ; Other_Alphabetic # Mc [2] CHAM CONSONANT SIGN YA..CHAM CONSONANT SIGN RA AA35..AA36 ; Other_Alphabetic # Mn [2] CHAM CONSONANT SIGN LA..CHAM CONSONANT SIGN WA AA43 ; Other_Alphabetic # Mn CHAM CONSONANT SIGN FINAL NG AA4C ; Other_Alphabetic # Mn CHAM CONSONANT SIGN FINAL M AA4D ; Other_Alphabetic # Mc CHAM CONSONANT SIGN FINAL H AA7B ; Other_Alphabetic # Mc MYANMAR SIGN PAO KAREN TONE AA7C ; Other_Alphabetic # Mn MYANMAR SIGN TAI LAING TONE-2 AA7D ; Other_Alphabetic # Mc MYANMAR SIGN TAI LAING TONE-5 AAB0 ; Other_Alphabetic # Mn TAI VIET MAI KANG AAB2..AAB4 ; Other_Alphabetic # Mn [3] TAI VIET VOWEL I..TAI VIET VOWEL U AAB7..AAB8 ; Other_Alphabetic # Mn [2] TAI VIET MAI KHIT..TAI VIET VOWEL IA AABE ; Other_Alphabetic # Mn TAI VIET VOWEL AM AAEB ; Other_Alphabetic # Mc MEETEI MAYEK VOWEL SIGN II AAEC..AAED ; Other_Alphabetic # Mn [2] MEETEI MAYEK VOWEL SIGN UU..MEETEI MAYEK VOWEL SIGN AAI AAEE..AAEF ; Other_Alphabetic # Mc [2] MEETEI MAYEK VOWEL SIGN AU..MEETEI MAYEK VOWEL SIGN AAU AAF5 ; Other_Alphabetic # Mc MEETEI MAYEK VOWEL SIGN VISARGA ABE3..ABE4 ; Other_Alphabetic # Mc [2] MEETEI MAYEK VOWEL SIGN ONAP..MEETEI MAYEK VOWEL SIGN INAP ABE5 ; Other_Alphabetic # Mn MEETEI MAYEK VOWEL SIGN ANAP ABE6..ABE7 ; Other_Alphabetic # Mc [2] MEETEI MAYEK VOWEL SIGN YENAP..MEETEI MAYEK VOWEL SIGN SOUNAP ABE8 ; Other_Alphabetic # Mn MEETEI MAYEK VOWEL SIGN UNAP ABE9..ABEA ; Other_Alphabetic # Mc [2] MEETEI MAYEK VOWEL SIGN CHEINAP..MEETEI MAYEK VOWEL SIGN NUNG FB1E ; Other_Alphabetic # Mn HEBREW POINT JUDEO-SPANISH VARIKA 10376..1037A ; Other_Alphabetic # Mn [5] COMBINING OLD PERMIC LETTER AN..COMBINING OLD PERMIC LETTER SII 10A01..10A03 ; Other_Alphabetic # Mn [3] KHAROSHTHI VOWEL SIGN I..KHAROSHTHI VOWEL SIGN VOCALIC R 10A05..10A06 ; Other_Alphabetic # Mn [2] KHAROSHTHI VOWEL SIGN E..KHAROSHTHI 
VOWEL SIGN O 10A0C..10A0F ; Other_Alphabetic # Mn [4] KHAROSHTHI VOWEL LENGTH MARK..KHAROSHTHI SIGN VISARGA 10D24..10D27 ; Other_Alphabetic # Mn [4] HANIFI ROHINGYA SIGN HARBAHAY..HANIFI ROHINGYA SIGN TASSI 10D69 ; Other_Alphabetic # Mn GARAY VOWEL SIGN E 10EAB..10EAC ; Other_Alphabetic # Mn [2] YEZIDI COMBINING HAMZA MARK..YEZIDI COMBINING MADDA MARK 10EFA..10EFC ; Other_Alphabetic # Mn [3] ARABIC DOUBLE VERTICAL BAR BELOW..ARABIC COMBINING ALEF OVERLAY 11000 ; Other_Alphabetic # Mc BRAHMI SIGN CANDRABINDU 11001 ; Other_Alphabetic # Mn BRAHMI SIGN ANUSVARA 11002 ; Other_Alphabetic # Mc BRAHMI SIGN VISARGA 11038..11045 ; Other_Alphabetic # Mn [14] BRAHMI VOWEL SIGN AA..BRAHMI VOWEL SIGN AU 11073..11074 ; Other_Alphabetic # Mn [2] BRAHMI VOWEL SIGN OLD TAMIL SHORT E..BRAHMI VOWEL SIGN OLD TAMIL SHORT O 11080..11081 ; Other_Alphabetic # Mn [2] KAITHI SIGN CANDRABINDU..KAITHI SIGN ANUSVARA 11082 ; Other_Alphabetic # Mc KAITHI SIGN VISARGA 110B0..110B2 ; Other_Alphabetic # Mc [3] KAITHI VOWEL SIGN AA..KAITHI VOWEL SIGN II 110B3..110B6 ; Other_Alphabetic # Mn [4] KAITHI VOWEL SIGN U..KAITHI VOWEL SIGN AI 110B7..110B8 ; Other_Alphabetic # Mc [2] KAITHI VOWEL SIGN O..KAITHI VOWEL SIGN AU 110C2 ; Other_Alphabetic # Mn KAITHI VOWEL SIGN VOCALIC R 11100..11102 ; Other_Alphabetic # Mn [3] CHAKMA SIGN CANDRABINDU..CHAKMA SIGN VISARGA 11127..1112B ; Other_Alphabetic # Mn [5] CHAKMA VOWEL SIGN A..CHAKMA VOWEL SIGN UU 1112C ; Other_Alphabetic # Mc CHAKMA VOWEL SIGN E 1112D..11132 ; Other_Alphabetic # Mn [6] CHAKMA VOWEL SIGN AI..CHAKMA AU MARK 11145..11146 ; Other_Alphabetic # Mc [2] CHAKMA VOWEL SIGN AA..CHAKMA VOWEL SIGN EI 11180..11181 ; Other_Alphabetic # Mn [2] SHARADA SIGN CANDRABINDU..SHARADA SIGN ANUSVARA 11182 ; Other_Alphabetic # Mc SHARADA SIGN VISARGA 111B3..111B5 ; Other_Alphabetic # Mc [3] SHARADA VOWEL SIGN AA..SHARADA VOWEL SIGN II 111B6..111BE ; Other_Alphabetic # Mn [9] SHARADA VOWEL SIGN U..SHARADA VOWEL SIGN O 111BF ; Other_Alphabetic # Mc SHARADA VOWEL SIGN 
AU 111CE ; Other_Alphabetic # Mc SHARADA VOWEL SIGN PRISHTHAMATRA E 111CF ; Other_Alphabetic # Mn SHARADA SIGN INVERTED CANDRABINDU 1122C..1122E ; Other_Alphabetic # Mc [3] KHOJKI VOWEL SIGN AA..KHOJKI VOWEL SIGN II 1122F..11231 ; Other_Alphabetic # Mn [3] KHOJKI VOWEL SIGN U..KHOJKI VOWEL SIGN AI 11232..11233 ; Other_Alphabetic # Mc [2] KHOJKI VOWEL SIGN O..KHOJKI VOWEL SIGN AU 11234 ; Other_Alphabetic # Mn KHOJKI SIGN ANUSVARA 11237 ; Other_Alphabetic # Mn KHOJKI SIGN SHADDA 1123E ; Other_Alphabetic # Mn KHOJKI SIGN SUKUN 11241 ; Other_Alphabetic # Mn KHOJKI VOWEL SIGN VOCALIC R 112DF ; Other_Alphabetic # Mn KHUDAWADI SIGN ANUSVARA 112E0..112E2 ; Other_Alphabetic # Mc [3] KHUDAWADI VOWEL SIGN AA..KHUDAWADI VOWEL SIGN II 112E3..112E8 ; Other_Alphabetic # Mn [6] KHUDAWADI VOWEL SIGN U..KHUDAWADI VOWEL SIGN AU 11300..11301 ; Other_Alphabetic # Mn [2] GRANTHA SIGN COMBINING ANUSVARA ABOVE..GRANTHA SIGN CANDRABINDU 11302..11303 ; Other_Alphabetic # Mc [2] GRANTHA SIGN ANUSVARA..GRANTHA SIGN VISARGA 1133E..1133F ; Other_Alphabetic # Mc [2] GRANTHA VOWEL SIGN AA..GRANTHA VOWEL SIGN I 11340 ; Other_Alphabetic # Mn GRANTHA VOWEL SIGN II 11341..11344 ; Other_Alphabetic # Mc [4] GRANTHA VOWEL SIGN U..GRANTHA VOWEL SIGN VOCALIC RR 11347..11348 ; Other_Alphabetic # Mc [2] GRANTHA VOWEL SIGN EE..GRANTHA VOWEL SIGN AI 1134B..1134C ; Other_Alphabetic # Mc [2] GRANTHA VOWEL SIGN OO..GRANTHA VOWEL SIGN AU 11357 ; Other_Alphabetic # Mc GRANTHA AU LENGTH MARK 11362..11363 ; Other_Alphabetic # Mc [2] GRANTHA VOWEL SIGN VOCALIC L..GRANTHA VOWEL SIGN VOCALIC LL 113B8..113BA ; Other_Alphabetic # Mc [3] TULU-TIGALARI VOWEL SIGN AA..TULU-TIGALARI VOWEL SIGN II 113BB..113C0 ; Other_Alphabetic # Mn [6] TULU-TIGALARI VOWEL SIGN U..TULU-TIGALARI VOWEL SIGN VOCALIC LL 113C2 ; Other_Alphabetic # Mc TULU-TIGALARI VOWEL SIGN EE 113C5 ; Other_Alphabetic # Mc TULU-TIGALARI VOWEL SIGN AI 113C7..113CA ; Other_Alphabetic # Mc [4] TULU-TIGALARI VOWEL SIGN OO..TULU-TIGALARI SIGN CANDRA ANUNASIKA 
113CC..113CD ; Other_Alphabetic # Mc [2] TULU-TIGALARI SIGN ANUSVARA..TULU-TIGALARI SIGN VISARGA 11435..11437 ; Other_Alphabetic # Mc [3] NEWA VOWEL SIGN AA..NEWA VOWEL SIGN II 11438..1143F ; Other_Alphabetic # Mn [8] NEWA VOWEL SIGN U..NEWA VOWEL SIGN AI 11440..11441 ; Other_Alphabetic # Mc [2] NEWA VOWEL SIGN O..NEWA VOWEL SIGN AU 11443..11444 ; Other_Alphabetic # Mn [2] NEWA SIGN CANDRABINDU..NEWA SIGN ANUSVARA 11445 ; Other_Alphabetic # Mc NEWA SIGN VISARGA 114B0..114B2 ; Other_Alphabetic # Mc [3] TIRHUTA VOWEL SIGN AA..TIRHUTA VOWEL SIGN II 114B3..114B8 ; Other_Alphabetic # Mn [6] TIRHUTA VOWEL SIGN U..TIRHUTA VOWEL SIGN VOCALIC LL 114B9 ; Other_Alphabetic # Mc TIRHUTA VOWEL SIGN E 114BA ; Other_Alphabetic # Mn TIRHUTA VOWEL SIGN SHORT E 114BB..114BE ; Other_Alphabetic # Mc [4] TIRHUTA VOWEL SIGN AI..TIRHUTA VOWEL SIGN AU 114BF..114C0 ; Other_Alphabetic # Mn [2] TIRHUTA SIGN CANDRABINDU..TIRHUTA SIGN ANUSVARA 114C1 ; Other_Alphabetic # Mc TIRHUTA SIGN VISARGA 115AF..115B1 ; Other_Alphabetic # Mc [3] SIDDHAM VOWEL SIGN AA..SIDDHAM VOWEL SIGN II 115B2..115B5 ; Other_Alphabetic # Mn [4] SIDDHAM VOWEL SIGN U..SIDDHAM VOWEL SIGN VOCALIC RR 115B8..115BB ; Other_Alphabetic # Mc [4] SIDDHAM VOWEL SIGN E..SIDDHAM VOWEL SIGN AU 115BC..115BD ; Other_Alphabetic # Mn [2] SIDDHAM SIGN CANDRABINDU..SIDDHAM SIGN ANUSVARA 115BE ; Other_Alphabetic # Mc SIDDHAM SIGN VISARGA 115DC..115DD ; Other_Alphabetic # Mn [2] SIDDHAM VOWEL SIGN ALTERNATE U..SIDDHAM VOWEL SIGN ALTERNATE UU 11630..11632 ; Other_Alphabetic # Mc [3] MODI VOWEL SIGN AA..MODI VOWEL SIGN II 11633..1163A ; Other_Alphabetic # Mn [8] MODI VOWEL SIGN U..MODI VOWEL SIGN AI 1163B..1163C ; Other_Alphabetic # Mc [2] MODI VOWEL SIGN O..MODI VOWEL SIGN AU 1163D ; Other_Alphabetic # Mn MODI SIGN ANUSVARA 1163E ; Other_Alphabetic # Mc MODI SIGN VISARGA 11640 ; Other_Alphabetic # Mn MODI SIGN ARDHACANDRA 116AB ; Other_Alphabetic # Mn TAKRI SIGN ANUSVARA 116AC ; Other_Alphabetic # Mc TAKRI SIGN VISARGA 116AD ; Other_Alphabetic 
# Mn TAKRI VOWEL SIGN AA 116AE..116AF ; Other_Alphabetic # Mc [2] TAKRI VOWEL SIGN I..TAKRI VOWEL SIGN II 116B0..116B5 ; Other_Alphabetic # Mn [6] TAKRI VOWEL SIGN U..TAKRI VOWEL SIGN AU 1171D ; Other_Alphabetic # Mn AHOM CONSONANT SIGN MEDIAL LA 1171E ; Other_Alphabetic # Mc AHOM CONSONANT SIGN MEDIAL RA 1171F ; Other_Alphabetic # Mn AHOM CONSONANT SIGN MEDIAL LIGATING RA 11720..11721 ; Other_Alphabetic # Mc [2] AHOM VOWEL SIGN A..AHOM VOWEL SIGN AA 11722..11725 ; Other_Alphabetic # Mn [4] AHOM VOWEL SIGN I..AHOM VOWEL SIGN UU 11726 ; Other_Alphabetic # Mc AHOM VOWEL SIGN E 11727..1172A ; Other_Alphabetic # Mn [4] AHOM VOWEL SIGN AW..AHOM VOWEL SIGN AM 1182C..1182E ; Other_Alphabetic # Mc [3] DOGRA VOWEL SIGN AA..DOGRA VOWEL SIGN II 1182F..11837 ; Other_Alphabetic # Mn [9] DOGRA VOWEL SIGN U..DOGRA SIGN ANUSVARA 11838 ; Other_Alphabetic # Mc DOGRA SIGN VISARGA 11930..11935 ; Other_Alphabetic # Mc [6] DIVES AKURU VOWEL SIGN AA..DIVES AKURU VOWEL SIGN E 11937..11938 ; Other_Alphabetic # Mc [2] DIVES AKURU VOWEL SIGN AI..DIVES AKURU VOWEL SIGN O 1193B..1193C ; Other_Alphabetic # Mn [2] DIVES AKURU SIGN ANUSVARA..DIVES AKURU SIGN CANDRABINDU 11940 ; Other_Alphabetic # Mc DIVES AKURU MEDIAL YA 11942 ; Other_Alphabetic # Mc DIVES AKURU MEDIAL RA 119D1..119D3 ; Other_Alphabetic # Mc [3] NANDINAGARI VOWEL SIGN AA..NANDINAGARI VOWEL SIGN II 119D4..119D7 ; Other_Alphabetic # Mn [4] NANDINAGARI VOWEL SIGN U..NANDINAGARI VOWEL SIGN VOCALIC RR 119DA..119DB ; Other_Alphabetic # Mn [2] NANDINAGARI VOWEL SIGN E..NANDINAGARI VOWEL SIGN AI 119DC..119DF ; Other_Alphabetic # Mc [4] NANDINAGARI VOWEL SIGN O..NANDINAGARI SIGN VISARGA 119E4 ; Other_Alphabetic # Mc NANDINAGARI VOWEL SIGN PRISHTHAMATRA E 11A01..11A0A ; Other_Alphabetic # Mn [10] ZANABAZAR SQUARE VOWEL SIGN I..ZANABAZAR SQUARE VOWEL LENGTH MARK 11A35..11A38 ; Other_Alphabetic # Mn [4] ZANABAZAR SQUARE SIGN CANDRABINDU..ZANABAZAR SQUARE SIGN ANUSVARA 11A39 ; Other_Alphabetic # Mc ZANABAZAR SQUARE SIGN VISARGA 11A3B..11A3E ; 
Other_Alphabetic # Mn [4] ZANABAZAR SQUARE CLUSTER-FINAL LETTER YA..ZANABAZAR SQUARE CLUSTER-FINAL LETTER VA 11A51..11A56 ; Other_Alphabetic # Mn [6] SOYOMBO VOWEL SIGN I..SOYOMBO VOWEL SIGN OE 11A57..11A58 ; Other_Alphabetic # Mc [2] SOYOMBO VOWEL SIGN AI..SOYOMBO VOWEL SIGN AU 11A59..11A5B ; Other_Alphabetic # Mn [3] SOYOMBO VOWEL SIGN VOCALIC R..SOYOMBO VOWEL LENGTH MARK 11A8A..11A96 ; Other_Alphabetic # Mn [13] SOYOMBO FINAL CONSONANT SIGN G..SOYOMBO SIGN ANUSVARA 11A97 ; Other_Alphabetic # Mc SOYOMBO SIGN VISARGA 11B60 ; Other_Alphabetic # Mn SHARADA VOWEL SIGN OE 11B61 ; Other_Alphabetic # Mc SHARADA VOWEL SIGN OOE 11B62..11B64 ; Other_Alphabetic # Mn [3] SHARADA VOWEL SIGN UE..SHARADA VOWEL SIGN SHORT E 11B65 ; Other_Alphabetic # Mc SHARADA VOWEL SIGN SHORT O 11B66 ; Other_Alphabetic # Mn SHARADA VOWEL SIGN CANDRA E 11B67 ; Other_Alphabetic # Mc SHARADA VOWEL SIGN CANDRA O 11C2F ; Other_Alphabetic # Mc BHAIKSUKI VOWEL SIGN AA 11C30..11C36 ; Other_Alphabetic # Mn [7] BHAIKSUKI VOWEL SIGN I..BHAIKSUKI VOWEL SIGN VOCALIC L 11C38..11C3D ; Other_Alphabetic # Mn [6] BHAIKSUKI VOWEL SIGN E..BHAIKSUKI SIGN ANUSVARA 11C3E ; Other_Alphabetic # Mc BHAIKSUKI SIGN VISARGA 11C92..11CA7 ; Other_Alphabetic # Mn [22] MARCHEN SUBJOINED LETTER KA..MARCHEN SUBJOINED LETTER ZA 11CA9 ; Other_Alphabetic # Mc MARCHEN SUBJOINED LETTER YA 11CAA..11CB0 ; Other_Alphabetic # Mn [7] MARCHEN SUBJOINED LETTER RA..MARCHEN VOWEL SIGN AA 11CB1 ; Other_Alphabetic # Mc MARCHEN VOWEL SIGN I 11CB2..11CB3 ; Other_Alphabetic # Mn [2] MARCHEN VOWEL SIGN U..MARCHEN VOWEL SIGN E 11CB4 ; Other_Alphabetic # Mc MARCHEN VOWEL SIGN O 11CB5..11CB6 ; Other_Alphabetic # Mn [2] MARCHEN SIGN ANUSVARA..MARCHEN SIGN CANDRABINDU 11D31..11D36 ; Other_Alphabetic # Mn [6] MASARAM GONDI VOWEL SIGN AA..MASARAM GONDI VOWEL SIGN VOCALIC R 11D3A ; Other_Alphabetic # Mn MASARAM GONDI VOWEL SIGN E 11D3C..11D3D ; Other_Alphabetic # Mn [2] MASARAM GONDI VOWEL SIGN AI..MASARAM GONDI VOWEL SIGN O 11D3F..11D41 ; Other_Alphabetic 
# Mn [3] MASARAM GONDI VOWEL SIGN AU..MASARAM GONDI SIGN VISARGA 11D43 ; Other_Alphabetic # Mn MASARAM GONDI SIGN CANDRA 11D47 ; Other_Alphabetic # Mn MASARAM GONDI RA-KARA 11D8A..11D8E ; Other_Alphabetic # Mc [5] GUNJALA GONDI VOWEL SIGN AA..GUNJALA GONDI VOWEL SIGN UU 11D90..11D91 ; Other_Alphabetic # Mn [2] GUNJALA GONDI VOWEL SIGN EE..GUNJALA GONDI VOWEL SIGN AI 11D93..11D94 ; Other_Alphabetic # Mc [2] GUNJALA GONDI VOWEL SIGN OO..GUNJALA GONDI VOWEL SIGN AU 11D95 ; Other_Alphabetic # Mn GUNJALA GONDI SIGN ANUSVARA 11D96 ; Other_Alphabetic # Mc GUNJALA GONDI SIGN VISARGA 11EF3..11EF4 ; Other_Alphabetic # Mn [2] MAKASAR VOWEL SIGN I..MAKASAR VOWEL SIGN U 11EF5..11EF6 ; Other_Alphabetic # Mc [2] MAKASAR VOWEL SIGN E..MAKASAR VOWEL SIGN O 11F00..11F01 ; Other_Alphabetic # Mn [2] KAWI SIGN CANDRABINDU..KAWI SIGN ANUSVARA 11F03 ; Other_Alphabetic # Mc KAWI SIGN VISARGA 11F34..11F35 ; Other_Alphabetic # Mc [2] KAWI VOWEL SIGN AA..KAWI VOWEL SIGN ALTERNATE AA 11F36..11F3A ; Other_Alphabetic # Mn [5] KAWI VOWEL SIGN I..KAWI VOWEL SIGN VOCALIC R 11F3E..11F3F ; Other_Alphabetic # Mc [2] KAWI VOWEL SIGN E..KAWI VOWEL SIGN AI 11F40 ; Other_Alphabetic # Mn KAWI VOWEL SIGN EU 1611E..16129 ; Other_Alphabetic # Mn [12] GURUNG KHEMA VOWEL SIGN AA..GURUNG KHEMA VOWEL LENGTH MARK 1612A..1612C ; Other_Alphabetic # Mc [3] GURUNG KHEMA CONSONANT SIGN MEDIAL YA..GURUNG KHEMA CONSONANT SIGN MEDIAL HA 1612D..1612E ; Other_Alphabetic # Mn [2] GURUNG KHEMA SIGN ANUSVARA..GURUNG KHEMA CONSONANT SIGN MEDIAL RA 16F4F ; Other_Alphabetic # Mn MIAO SIGN CONSONANT MODIFIER BAR 16F51..16F87 ; Other_Alphabetic # Mc [55] MIAO SIGN ASPIRATION..MIAO VOWEL SIGN UI 16F8F..16F92 ; Other_Alphabetic # Mn [4] MIAO TONE RIGHT..MIAO TONE BELOW 16FF0..16FF1 ; Other_Alphabetic # Mc [2] VIETNAMESE ALTERNATE READING MARK CA..VIETNAMESE ALTERNATE READING MARK NHAY 1BC9E ; Other_Alphabetic # Mn DUPLOYAN DOUBLE MARK 1E000..1E006 ; Other_Alphabetic # Mn [7] COMBINING GLAGOLITIC LETTER AZU..COMBINING GLAGOLITIC 
LETTER ZHIVETE 1E008..1E018 ; Other_Alphabetic # Mn [17] COMBINING GLAGOLITIC LETTER ZEMLJA..COMBINING GLAGOLITIC LETTER HERU 1E01B..1E021 ; Other_Alphabetic # Mn [7] COMBINING GLAGOLITIC LETTER SHTA..COMBINING GLAGOLITIC LETTER YATI 1E023..1E024 ; Other_Alphabetic # Mn [2] COMBINING GLAGOLITIC LETTER YU..COMBINING GLAGOLITIC LETTER SMALL YUS 1E026..1E02A ; Other_Alphabetic # Mn [5] COMBINING GLAGOLITIC LETTER YO..COMBINING GLAGOLITIC LETTER FITA 1E08F ; Other_Alphabetic # Mn COMBINING CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I 1E6E3 ; Other_Alphabetic # Mn TAI YO SIGN UE 1E6E6 ; Other_Alphabetic # Mn TAI YO SIGN AU 1E6EE..1E6EF ; Other_Alphabetic # Mn [2] TAI YO SIGN AY..TAI YO SIGN ANG 1E6F5 ; Other_Alphabetic # Mn TAI YO SIGN OM 1E947 ; Other_Alphabetic # Mn ADLAM HAMZA 1F130..1F149 ; Other_Alphabetic # So [26] SQUARED LATIN CAPITAL LETTER A..SQUARED LATIN CAPITAL LETTER Z 1F150..1F169 ; Other_Alphabetic # So [26] NEGATIVE CIRCLED LATIN CAPITAL LETTER A..NEGATIVE CIRCLED LATIN CAPITAL LETTER Z 1F170..1F189 ; Other_Alphabetic # So [26] NEGATIVE SQUARED LATIN CAPITAL LETTER A..NEGATIVE SQUARED LATIN CAPITAL LETTER Z # Total code points: 1510 # ================================================ 3006 ; Ideographic # Lo IDEOGRAPHIC CLOSING MARK 3007 ; Ideographic # Nl IDEOGRAPHIC NUMBER ZERO 3021..3029 ; Ideographic # Nl [9] HANGZHOU NUMERAL ONE..HANGZHOU NUMERAL NINE 3038..303A ; Ideographic # Nl [3] HANGZHOU NUMERAL TEN..HANGZHOU NUMERAL THIRTY 3400..4DBF ; Ideographic # Lo [6592] CJK UNIFIED IDEOGRAPH-3400..CJK UNIFIED IDEOGRAPH-4DBF 4E00..9FFF ; Ideographic # Lo [20992] CJK UNIFIED IDEOGRAPH-4E00..CJK UNIFIED IDEOGRAPH-9FFF F900..FA6D ; Ideographic # Lo [366] CJK COMPATIBILITY IDEOGRAPH-F900..CJK COMPATIBILITY IDEOGRAPH-FA6D FA70..FAD9 ; Ideographic # Lo [106] CJK COMPATIBILITY IDEOGRAPH-FA70..CJK COMPATIBILITY IDEOGRAPH-FAD9 16FE4 ; Ideographic # Mn KHITAN SMALL SCRIPT FILLER 16FF2..16FF3 ; Ideographic # Lm [2] CHINESE SMALL SIMPLIFIED ER..CHINESE SMALL 
TRADITIONAL ER 16FF4..16FF6 ; Ideographic # Nl [3] YANGQIN SIGN SLOW ONE BEAT..YANGQIN SIGN SLOW TWO BEATS 17000..18CD5 ; Ideographic # Lo [7382] TANGUT IDEOGRAPH-17000..KHITAN SMALL SCRIPT CHARACTER-18CD5 18CFF..18D1E ; Ideographic # Lo [32] KHITAN SMALL SCRIPT CHARACTER-18CFF..TANGUT IDEOGRAPH-18D1E 18D80..18DF2 ; Ideographic # Lo [115] TANGUT COMPONENT-769..TANGUT COMPONENT-883 1B170..1B2FB ; Ideographic # Lo [396] NUSHU CHARACTER-1B170..NUSHU CHARACTER-1B2FB 20000..2A6DF ; Ideographic # Lo [42720] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6DF 2A700..2B81D ; Ideographic # Lo [4382] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B81D 2B820..2CEAD ; Ideographic # Lo [5774] CJK UNIFIED IDEOGRAPH-2B820..CJK UNIFIED IDEOGRAPH-2CEAD 2CEB0..2EBE0 ; Ideographic # Lo [7473] CJK UNIFIED IDEOGRAPH-2CEB0..CJK UNIFIED IDEOGRAPH-2EBE0 2EBF0..2EE5D ; Ideographic # Lo [622] CJK UNIFIED IDEOGRAPH-2EBF0..CJK UNIFIED IDEOGRAPH-2EE5D 2F800..2FA1D ; Ideographic # Lo [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D 30000..3134A ; Ideographic # Lo [4939] CJK UNIFIED IDEOGRAPH-30000..CJK UNIFIED IDEOGRAPH-3134A 31350..33479 ; Ideographic # Lo [8490] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-33479 # Total code points: 110943 # ================================================ 005E ; Diacritic # Sk CIRCUMFLEX ACCENT 0060 ; Diacritic # Sk GRAVE ACCENT 00A8 ; Diacritic # Sk DIAERESIS 00AF ; Diacritic # Sk MACRON 00B4 ; Diacritic # Sk ACUTE ACCENT 00B7 ; Diacritic # Po MIDDLE DOT 00B8 ; Diacritic # Sk CEDILLA 02B0..02C1 ; Diacritic # Lm [18] MODIFIER LETTER SMALL H..MODIFIER LETTER REVERSED GLOTTAL STOP 02C2..02C5 ; Diacritic # Sk [4] MODIFIER LETTER LEFT ARROWHEAD..MODIFIER LETTER DOWN ARROWHEAD 02C6..02D1 ; Diacritic # Lm [12] MODIFIER LETTER CIRCUMFLEX ACCENT..MODIFIER LETTER HALF TRIANGULAR COLON 02D2..02DF ; Diacritic # Sk [14] MODIFIER LETTER CENTRED RIGHT HALF RING..MODIFIER LETTER CROSS ACCENT 02E0..02E4 ; Diacritic # Lm [5] MODIFIER 
LETTER SMALL GAMMA..MODIFIER LETTER SMALL REVERSED GLOTTAL STOP 02E5..02EB ; Diacritic # Sk [7] MODIFIER LETTER EXTRA-HIGH TONE BAR..MODIFIER LETTER YANG DEPARTING TONE MARK 02EC ; Diacritic # Lm MODIFIER LETTER VOICING 02ED ; Diacritic # Sk MODIFIER LETTER UNASPIRATED 02EE ; Diacritic # Lm MODIFIER LETTER DOUBLE APOSTROPHE 02EF..02FF ; Diacritic # Sk [17] MODIFIER LETTER LOW DOWN ARROWHEAD..MODIFIER LETTER LOW LEFT ARROW 0300..034E ; Diacritic # Mn [79] COMBINING GRAVE ACCENT..COMBINING UPWARDS ARROW BELOW 0350..0357 ; Diacritic # Mn [8] COMBINING RIGHT ARROWHEAD ABOVE..COMBINING RIGHT HALF RING ABOVE 035D..0362 ; Diacritic # Mn [6] COMBINING DOUBLE BREVE..COMBINING DOUBLE RIGHTWARDS ARROW BELOW 0374 ; Diacritic # Lm GREEK NUMERAL SIGN 0375 ; Diacritic # Sk GREEK LOWER NUMERAL SIGN 037A ; Diacritic # Lm GREEK YPOGEGRAMMENI 0384..0385 ; Diacritic # Sk [2] GREEK TONOS..GREEK DIALYTIKA TONOS 0483..0487 ; Diacritic # Mn [5] COMBINING CYRILLIC TITLO..COMBINING CYRILLIC POKRYTIE 0559 ; Diacritic # Lm ARMENIAN MODIFIER LETTER LEFT HALF RING 0591..05BD ; Diacritic # Mn [45] HEBREW ACCENT ETNAHTA..HEBREW POINT METEG 05BF ; Diacritic # Mn HEBREW POINT RAFE 05C1..05C2 ; Diacritic # Mn [2] HEBREW POINT SHIN DOT..HEBREW POINT SIN DOT 05C4..05C5 ; Diacritic # Mn [2] HEBREW MARK UPPER DOT..HEBREW MARK LOWER DOT 05C7 ; Diacritic # Mn HEBREW POINT QAMATS QATAN 064B..0652 ; Diacritic # Mn [8] ARABIC FATHATAN..ARABIC SUKUN 0657..0658 ; Diacritic # Mn [2] ARABIC INVERTED DAMMA..ARABIC MARK NOON GHUNNA 06DF..06E0 ; Diacritic # Mn [2] ARABIC SMALL HIGH ROUNDED ZERO..ARABIC SMALL HIGH UPRIGHT RECTANGULAR ZERO 06E5..06E6 ; Diacritic # Lm [2] ARABIC SMALL WAW..ARABIC SMALL YEH 06EA..06EC ; Diacritic # Mn [3] ARABIC EMPTY CENTRE LOW STOP..ARABIC ROUNDED HIGH STOP WITH FILLED CENTRE 0730..074A ; Diacritic # Mn [27] SYRIAC PTHAHA ABOVE..SYRIAC BARREKH 07A6..07B0 ; Diacritic # Mn [11] THAANA ABAFILI..THAANA SUKUN 07EB..07F3 ; Diacritic # Mn [9] NKO COMBINING SHORT HIGH TONE..NKO COMBINING 
DOUBLE DOT ABOVE 07F4..07F5 ; Diacritic # Lm [2] NKO HIGH TONE APOSTROPHE..NKO LOW TONE APOSTROPHE 0818..0819 ; Diacritic # Mn [2] SAMARITAN MARK OCCLUSION..SAMARITAN MARK DAGESH 0898..089F ; Diacritic # Mn [8] ARABIC SMALL HIGH WORD AL-JUZ..ARABIC HALF MADDA OVER MADDA 08C9 ; Diacritic # Lm ARABIC SMALL FARSI YEH 08CA..08D2 ; Diacritic # Mn [9] ARABIC SMALL HIGH FARSI YEH..ARABIC LARGE ROUND DOT INSIDE CIRCLE BELOW 08E3..08FE ; Diacritic # Mn [28] ARABIC TURNED DAMMA BELOW..ARABIC DAMMA WITH DOT 093C ; Diacritic # Mn DEVANAGARI SIGN NUKTA 094D ; Diacritic # Mn DEVANAGARI SIGN VIRAMA 0951..0954 ; Diacritic # Mn [4] DEVANAGARI STRESS SIGN UDATTA..DEVANAGARI ACUTE ACCENT 0971 ; Diacritic # Lm DEVANAGARI SIGN HIGH SPACING DOT 09BC ; Diacritic # Mn BENGALI SIGN NUKTA 09CD ; Diacritic # Mn BENGALI SIGN VIRAMA 0A3C ; Diacritic # Mn GURMUKHI SIGN NUKTA 0A4D ; Diacritic # Mn GURMUKHI SIGN VIRAMA 0ABC ; Diacritic # Mn GUJARATI SIGN NUKTA 0ACD ; Diacritic # Mn GUJARATI SIGN VIRAMA 0AFD..0AFF ; Diacritic # Mn [3] GUJARATI SIGN THREE-DOT NUKTA ABOVE..GUJARATI SIGN TWO-CIRCLE NUKTA ABOVE 0B3C ; Diacritic # Mn ORIYA SIGN NUKTA 0B4D ; Diacritic # Mn ORIYA SIGN VIRAMA 0B55 ; Diacritic # Mn ORIYA SIGN OVERLINE 0BCD ; Diacritic # Mn TAMIL SIGN VIRAMA 0C3C ; Diacritic # Mn TELUGU SIGN NUKTA 0C4D ; Diacritic # Mn TELUGU SIGN VIRAMA 0CBC ; Diacritic # Mn KANNADA SIGN NUKTA 0CCD ; Diacritic # Mn KANNADA SIGN VIRAMA 0D3B..0D3C ; Diacritic # Mn [2] MALAYALAM SIGN VERTICAL BAR VIRAMA..MALAYALAM SIGN CIRCULAR VIRAMA 0D4D ; Diacritic # Mn MALAYALAM SIGN VIRAMA 0DCA ; Diacritic # Mn SINHALA SIGN AL-LAKUNA 0E3A ; Diacritic # Mn THAI CHARACTER PHINTHU 0E47..0E4C ; Diacritic # Mn [6] THAI CHARACTER MAITAIKHU..THAI CHARACTER THANTHAKHAT 0E4E ; Diacritic # Mn THAI CHARACTER YAMAKKAN 0EBA ; Diacritic # Mn LAO SIGN PALI VIRAMA 0EC8..0ECC ; Diacritic # Mn [5] LAO TONE MAI EK..LAO CANCELLATION MARK 0F18..0F19 ; Diacritic # Mn [2] TIBETAN ASTROLOGICAL SIGN -KHYUD PA..TIBETAN ASTROLOGICAL SIGN SDONG 
TSHUGS 0F35 ; Diacritic # Mn TIBETAN MARK NGAS BZUNG NYI ZLA 0F37 ; Diacritic # Mn TIBETAN MARK NGAS BZUNG SGOR RTAGS 0F39 ; Diacritic # Mn TIBETAN MARK TSA -PHRU 0F3E..0F3F ; Diacritic # Mc [2] TIBETAN SIGN YAR TSHES..TIBETAN SIGN MAR TSHES 0F82..0F84 ; Diacritic # Mn [3] TIBETAN SIGN NYI ZLA NAA DA..TIBETAN MARK HALANTA 0F86..0F87 ; Diacritic # Mn [2] TIBETAN SIGN LCI RTAGS..TIBETAN SIGN YANG RTAGS 0FC6 ; Diacritic # Mn TIBETAN SYMBOL PADMA GDAN 1037 ; Diacritic # Mn MYANMAR SIGN DOT BELOW 1039..103A ; Diacritic # Mn [2] MYANMAR SIGN VIRAMA..MYANMAR SIGN ASAT 1063..1064 ; Diacritic # Mc [2] MYANMAR TONE MARK SGAW KAREN HATHI..MYANMAR TONE MARK SGAW KAREN KE PHO 1069..106D ; Diacritic # Mc [5] MYANMAR SIGN WESTERN PWO KAREN TONE-1..MYANMAR SIGN WESTERN PWO KAREN TONE-5 1087..108C ; Diacritic # Mc [6] MYANMAR SIGN SHAN TONE-2..MYANMAR SIGN SHAN COUNCIL TONE-3 108D ; Diacritic # Mn MYANMAR SIGN SHAN COUNCIL EMPHATIC TONE 108F ; Diacritic # Mc MYANMAR SIGN RUMAI PALAUNG TONE-5 109A..109B ; Diacritic # Mc [2] MYANMAR SIGN KHAMTI TONE-1..MYANMAR SIGN KHAMTI TONE-3 135D..135F ; Diacritic # Mn [3] ETHIOPIC COMBINING GEMINATION AND VOWEL LENGTH MARK..ETHIOPIC COMBINING GEMINATION MARK 1714 ; Diacritic # Mn TAGALOG SIGN VIRAMA 1715 ; Diacritic # Mc TAGALOG SIGN PAMUDPOD 1734 ; Diacritic # Mc HANUNOO SIGN PAMUDPOD 17C9..17D3 ; Diacritic # Mn [11] KHMER SIGN MUUSIKATOAN..KHMER SIGN BATHAMASAT 17DD ; Diacritic # Mn KHMER SIGN ATTHACAN 1939..193B ; Diacritic # Mn [3] LIMBU SIGN MUKPHRENG..LIMBU SIGN SA-I 1A60 ; Diacritic # Mn TAI THAM SIGN SAKOT 1A75..1A7C ; Diacritic # Mn [8] TAI THAM SIGN TONE-1..TAI THAM SIGN KHUEN-LUE KARAN 1A7F ; Diacritic # Mn TAI THAM COMBINING CRYPTOGRAMMIC DOT 1AB0..1ABD ; Diacritic # Mn [14] COMBINING DOUBLED CIRCUMFLEX ACCENT..COMBINING PARENTHESES BELOW 1ABE ; Diacritic # Me COMBINING PARENTHESES OVERLAY 1AC1..1ACB ; Diacritic # Mn [11] COMBINING LEFT PARENTHESIS ABOVE LEFT..COMBINING TRIPLE ACUTE ACCENT 1ACF..1ADD ; Diacritic # Mn [15] COMBINING 
DOUBLE CARON..COMBINING DOT-AND-RING BELOW 1AE0..1AEB ; Diacritic # Mn [12] COMBINING LEFT TACK ABOVE..COMBINING DOUBLE RIGHTWARDS ARROW ABOVE 1B34 ; Diacritic # Mn BALINESE SIGN REREKAN 1B44 ; Diacritic # Mc BALINESE ADEG ADEG 1B6B..1B73 ; Diacritic # Mn [9] BALINESE MUSICAL SYMBOL COMBINING TEGEH..BALINESE MUSICAL SYMBOL COMBINING GONG 1BAA ; Diacritic # Mc SUNDANESE SIGN PAMAAEH 1BAB ; Diacritic # Mn SUNDANESE SIGN VIRAMA 1BE6 ; Diacritic # Mn BATAK SIGN TOMPI 1BF2..1BF3 ; Diacritic # Mc [2] BATAK PANGOLAT..BATAK PANONGONAN 1C36..1C37 ; Diacritic # Mn [2] LEPCHA SIGN RAN..LEPCHA SIGN NUKTA 1C78..1C7D ; Diacritic # Lm [6] OL CHIKI MU TTUDDAG..OL CHIKI AHAD 1CD0..1CD2 ; Diacritic # Mn [3] VEDIC TONE KARSHANA..VEDIC TONE PRENKHA 1CD3 ; Diacritic # Po VEDIC SIGN NIHSHVASA 1CD4..1CE0 ; Diacritic # Mn [13] VEDIC SIGN YAJURVEDIC MIDLINE SVARITA..VEDIC TONE RIGVEDIC KASHMIRI INDEPENDENT SVARITA 1CE1 ; Diacritic # Mc VEDIC TONE ATHARVAVEDIC INDEPENDENT SVARITA 1CE2..1CE8 ; Diacritic # Mn [7] VEDIC SIGN VISARGA SVARITA..VEDIC SIGN VISARGA ANUDATTA WITH TAIL 1CED ; Diacritic # Mn VEDIC SIGN TIRYAK 1CF4 ; Diacritic # Mn VEDIC TONE CANDRA ABOVE 1CF7 ; Diacritic # Mc VEDIC SIGN ATIKRAMA 1CF8..1CF9 ; Diacritic # Mn [2] VEDIC TONE RING ABOVE..VEDIC TONE DOUBLE RING ABOVE 1D2C..1D6A ; Diacritic # Lm [63] MODIFIER LETTER CAPITAL A..GREEK SUBSCRIPT SMALL LETTER CHI 1D9B..1DBE ; Diacritic # Lm [36] MODIFIER LETTER SMALL TURNED ALPHA..MODIFIER LETTER SMALL EZH 1DC4..1DCF ; Diacritic # Mn [12] COMBINING MACRON-ACUTE..COMBINING ZIGZAG BELOW 1DF5..1DFF ; Diacritic # Mn [11] COMBINING UP TACK ABOVE..COMBINING RIGHT ARROWHEAD AND DOWN ARROWHEAD BELOW 1FBD ; Diacritic # Sk GREEK KORONIS 1FBF..1FC1 ; Diacritic # Sk [3] GREEK PSILI..GREEK DIALYTIKA AND PERISPOMENI 1FCD..1FCF ; Diacritic # Sk [3] GREEK PSILI AND VARIA..GREEK PSILI AND PERISPOMENI 1FDD..1FDF ; Diacritic # Sk [3] GREEK DASIA AND VARIA..GREEK DASIA AND PERISPOMENI 1FED..1FEF ; Diacritic # Sk [3] GREEK DIALYTIKA AND VARIA..GREEK 
VARIA 1FFD..1FFE ; Diacritic # Sk [2] GREEK OXIA..GREEK DASIA 2CEF..2CF1 ; Diacritic # Mn [3] COPTIC COMBINING NI ABOVE..COPTIC COMBINING SPIRITUS LENIS 2E2F ; Diacritic # Lm VERTICAL TILDE 302A..302D ; Diacritic # Mn [4] IDEOGRAPHIC LEVEL TONE MARK..IDEOGRAPHIC ENTERING TONE MARK 302E..302F ; Diacritic # Mc [2] HANGUL SINGLE DOT TONE MARK..HANGUL DOUBLE DOT TONE MARK 3099..309A ; Diacritic # Mn [2] COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK..COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK 309B..309C ; Diacritic # Sk [2] KATAKANA-HIRAGANA VOICED SOUND MARK..KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK 30FC ; Diacritic # Lm KATAKANA-HIRAGANA PROLONGED SOUND MARK A66F ; Diacritic # Mn COMBINING CYRILLIC VZMET A67C..A67D ; Diacritic # Mn [2] COMBINING CYRILLIC KAVYKA..COMBINING CYRILLIC PAYEROK A67F ; Diacritic # Lm CYRILLIC PAYEROK A69C..A69D ; Diacritic # Lm [2] MODIFIER LETTER CYRILLIC HARD SIGN..MODIFIER LETTER CYRILLIC SOFT SIGN A6F0..A6F1 ; Diacritic # Mn [2] BAMUM COMBINING MARK KOQNDON..BAMUM COMBINING MARK TUKWENTIS A700..A716 ; Diacritic # Sk [23] MODIFIER LETTER CHINESE TONE YIN PING..MODIFIER LETTER EXTRA-LOW LEFT-STEM TONE BAR A717..A71F ; Diacritic # Lm [9] MODIFIER LETTER DOT VERTICAL BAR..MODIFIER LETTER LOW INVERTED EXCLAMATION MARK A720..A721 ; Diacritic # Sk [2] MODIFIER LETTER STRESS AND HIGH TONE..MODIFIER LETTER STRESS AND LOW TONE A788 ; Diacritic # Lm MODIFIER LETTER LOW CIRCUMFLEX ACCENT A789..A78A ; Diacritic # Sk [2] MODIFIER LETTER COLON..MODIFIER LETTER SHORT EQUALS SIGN A7F1 ; Diacritic # Lm MODIFIER LETTER CAPITAL S A7F8..A7F9 ; Diacritic # Lm [2] MODIFIER LETTER CAPITAL H WITH STROKE..MODIFIER LETTER SMALL LIGATURE OE A806 ; Diacritic # Mn SYLOTI NAGRI SIGN HASANTA A82C ; Diacritic # Mn SYLOTI NAGRI SIGN ALTERNATE HASANTA A8C4 ; Diacritic # Mn SAURASHTRA SIGN VIRAMA A8E0..A8F1 ; Diacritic # Mn [18] COMBINING DEVANAGARI DIGIT ZERO..COMBINING DEVANAGARI SIGN AVAGRAHA A92B..A92D ; Diacritic # Mn [3] KAYAH LI TONE PLOPHU..KAYAH LI TONE 
CALYA PLOPHU A92E ; Diacritic # Po KAYAH LI SIGN CWI A953 ; Diacritic # Mc REJANG VIRAMA A9B3 ; Diacritic # Mn JAVANESE SIGN CECAK TELU A9C0 ; Diacritic # Mc JAVANESE PANGKON A9E5 ; Diacritic # Mn MYANMAR SIGN SHAN SAW AA7B ; Diacritic # Mc MYANMAR SIGN PAO KAREN TONE AA7C ; Diacritic # Mn MYANMAR SIGN TAI LAING TONE-2 AA7D ; Diacritic # Mc MYANMAR SIGN TAI LAING TONE-5 AABF ; Diacritic # Mn TAI VIET TONE MAI EK AAC0 ; Diacritic # Lo TAI VIET TONE MAI NUENG AAC1 ; Diacritic # Mn TAI VIET TONE MAI THO AAC2 ; Diacritic # Lo TAI VIET TONE MAI SONG AAF6 ; Diacritic # Mn MEETEI MAYEK VIRAMA AB5B ; Diacritic # Sk MODIFIER BREVE WITH INVERTED BREVE AB5C..AB5F ; Diacritic # Lm [4] MODIFIER LETTER SMALL HENG..MODIFIER LETTER SMALL U WITH LEFT HOOK AB69 ; Diacritic # Lm MODIFIER LETTER SMALL TURNED W AB6A..AB6B ; Diacritic # Sk [2] MODIFIER LETTER LEFT TACK..MODIFIER LETTER RIGHT TACK ABEC ; Diacritic # Mc MEETEI MAYEK LUM IYEK ABED ; Diacritic # Mn MEETEI MAYEK APUN IYEK FB1E ; Diacritic # Mn HEBREW POINT JUDEO-SPANISH VARIKA FE20..FE2F ; Diacritic # Mn [16] COMBINING LIGATURE LEFT HALF..COMBINING CYRILLIC TITLO RIGHT HALF FF3E ; Diacritic # Sk FULLWIDTH CIRCUMFLEX ACCENT FF40 ; Diacritic # Sk FULLWIDTH GRAVE ACCENT FF70 ; Diacritic # Lm HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK FF9E..FF9F ; Diacritic # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK FFE3 ; Diacritic # Sk FULLWIDTH MACRON 102E0 ; Diacritic # Mn COPTIC EPACT THOUSANDS MARK 10780..10785 ; Diacritic # Lm [6] MODIFIER LETTER SMALL CAPITAL AA..MODIFIER LETTER SMALL B WITH HOOK 10787..107B0 ; Diacritic # Lm [42] MODIFIER LETTER SMALL DZ DIGRAPH..MODIFIER LETTER SMALL V WITH RIGHT HOOK 107B2..107BA ; Diacritic # Lm [9] MODIFIER LETTER SMALL CAPITAL Y..MODIFIER LETTER SMALL S WITH CURL 10A38..10A3A ; Diacritic # Mn [3] KHAROSHTHI SIGN BAR ABOVE..KHAROSHTHI SIGN DOT BELOW 10A3F ; Diacritic # Mn KHAROSHTHI VIRAMA 10AE5..10AE6 ; Diacritic # Mn [2] MANICHAEAN ABBREVIATION 
MARK ABOVE..MANICHAEAN ABBREVIATION MARK BELOW 10D22..10D23 ; Diacritic # Lo [2] HANIFI ROHINGYA MARK SAKIN..HANIFI ROHINGYA MARK NA KHONNA 10D24..10D27 ; Diacritic # Mn [4] HANIFI ROHINGYA SIGN HARBAHAY..HANIFI ROHINGYA SIGN TASSI 10D4E ; Diacritic # Lm GARAY VOWEL LENGTH MARK 10D69..10D6D ; Diacritic # Mn [5] GARAY VOWEL SIGN E..GARAY CONSONANT NASALIZATION MARK 10EFA ; Diacritic # Mn ARABIC DOUBLE VERTICAL BAR BELOW 10EFD..10EFF ; Diacritic # Mn [3] ARABIC SMALL LOW WORD SAKTA..ARABIC SMALL LOW WORD MADDA 10F46..10F50 ; Diacritic # Mn [11] SOGDIAN COMBINING DOT BELOW..SOGDIAN COMBINING STROKE BELOW 10F82..10F85 ; Diacritic # Mn [4] OLD UYGHUR COMBINING DOT ABOVE..OLD UYGHUR COMBINING TWO DOTS BELOW 11046 ; Diacritic # Mn BRAHMI VIRAMA 11070 ; Diacritic # Mn BRAHMI SIGN OLD TAMIL VIRAMA 110B9..110BA ; Diacritic # Mn [2] KAITHI SIGN VIRAMA..KAITHI SIGN NUKTA 11133..11134 ; Diacritic # Mn [2] CHAKMA VIRAMA..CHAKMA MAAYYAA 11173 ; Diacritic # Mn MAHAJANI SIGN NUKTA 111C0 ; Diacritic # Mc SHARADA SIGN VIRAMA 111CA..111CC ; Diacritic # Mn [3] SHARADA SIGN NUKTA..SHARADA EXTRA SHORT VOWEL MARK 11235 ; Diacritic # Mc KHOJKI SIGN VIRAMA 11236 ; Diacritic # Mn KHOJKI SIGN NUKTA 112E9..112EA ; Diacritic # Mn [2] KHUDAWADI SIGN NUKTA..KHUDAWADI SIGN VIRAMA 1133B..1133C ; Diacritic # Mn [2] COMBINING BINDU BELOW..GRANTHA SIGN NUKTA 1134D ; Diacritic # Mc GRANTHA SIGN VIRAMA 11366..1136C ; Diacritic # Mn [7] COMBINING GRANTHA DIGIT ZERO..COMBINING GRANTHA DIGIT SIX 11370..11374 ; Diacritic # Mn [5] COMBINING GRANTHA LETTER A..COMBINING GRANTHA LETTER PA 113CE ; Diacritic # Mn TULU-TIGALARI SIGN VIRAMA 113CF ; Diacritic # Mc TULU-TIGALARI SIGN LOOPED VIRAMA 113D0 ; Diacritic # Mn TULU-TIGALARI CONJOINER 113D2 ; Diacritic # Mn TULU-TIGALARI GEMINATION MARK 113D3 ; Diacritic # Lo TULU-TIGALARI SIGN PLUTA 113E1..113E2 ; Diacritic # Mn [2] TULU-TIGALARI VEDIC TONE SVARITA..TULU-TIGALARI VEDIC TONE ANUDATTA 11442 ; Diacritic # Mn NEWA SIGN VIRAMA 11446 ; Diacritic # Mn NEWA SIGN 
NUKTA 114C2..114C3 ; Diacritic # Mn [2] TIRHUTA SIGN VIRAMA..TIRHUTA SIGN NUKTA 115BF..115C0 ; Diacritic # Mn [2] SIDDHAM SIGN VIRAMA..SIDDHAM SIGN NUKTA 1163F ; Diacritic # Mn MODI SIGN VIRAMA 116B6 ; Diacritic # Mc TAKRI SIGN VIRAMA 116B7 ; Diacritic # Mn TAKRI SIGN NUKTA 1172B ; Diacritic # Mn AHOM SIGN KILLER 11839..1183A ; Diacritic # Mn [2] DOGRA SIGN VIRAMA..DOGRA SIGN NUKTA 1193D ; Diacritic # Mc DIVES AKURU SIGN HALANTA 1193E ; Diacritic # Mn DIVES AKURU VIRAMA 11943 ; Diacritic # Mn DIVES AKURU SIGN NUKTA 119E0 ; Diacritic # Mn NANDINAGARI SIGN VIRAMA 11A34 ; Diacritic # Mn ZANABAZAR SQUARE SIGN VIRAMA 11A47 ; Diacritic # Mn ZANABAZAR SQUARE SUBJOINER 11A99 ; Diacritic # Mn SOYOMBO SUBJOINER 11C3F ; Diacritic # Mn BHAIKSUKI SIGN VIRAMA 11D42 ; Diacritic # Mn MASARAM GONDI SIGN NUKTA 11D44..11D45 ; Diacritic # Mn [2] MASARAM GONDI SIGN HALANTA..MASARAM GONDI VIRAMA 11D97 ; Diacritic # Mn GUNJALA GONDI VIRAMA 11DD9 ; Diacritic # Lm TOLONG SIKI SIGN SELA 11F41 ; Diacritic # Mc KAWI SIGN KILLER 11F42 ; Diacritic # Mn KAWI CONJOINER 11F5A ; Diacritic # Mn KAWI SIGN NUKTA 13447..13455 ; Diacritic # Mn [15] EGYPTIAN HIEROGLYPH MODIFIER DAMAGED AT TOP START..EGYPTIAN HIEROGLYPH MODIFIER DAMAGED 1612F ; Diacritic # Mn GURUNG KHEMA SIGN THOLHOMA 16AF0..16AF4 ; Diacritic # Mn [5] BASSA VAH COMBINING HIGH TONE..BASSA VAH COMBINING HIGH-LOW TONE 16B30..16B36 ; Diacritic # Mn [7] PAHAWH HMONG MARK CIM TUB..PAHAWH HMONG MARK CIM TAUM 16D6B..16D6C ; Diacritic # Lm [2] KIRAT RAI SIGN VIRAMA..KIRAT RAI SIGN SAAT 16F8F..16F92 ; Diacritic # Mn [4] MIAO TONE RIGHT..MIAO TONE BELOW 16F93..16F9F ; Diacritic # Lm [13] MIAO LETTER TONE-2..MIAO LETTER REFORMED TONE-8 16FF0..16FF1 ; Diacritic # Mc [2] VIETNAMESE ALTERNATE READING MARK CA..VIETNAMESE ALTERNATE READING MARK NHAY 1AFF0..1AFF3 ; Diacritic # Lm [4] KATAKANA LETTER MINNAN TONE-2..KATAKANA LETTER MINNAN TONE-5 1AFF5..1AFFB ; Diacritic # Lm [7] KATAKANA LETTER MINNAN TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-5 
1AFFD..1AFFE ; Diacritic # Lm [2] KATAKANA LETTER MINNAN NASALIZED TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-8 1CF00..1CF2D ; Diacritic # Mn [46] ZNAMENNY COMBINING MARK GORAZDO NIZKO S KRYZHEM ON LEFT..ZNAMENNY COMBINING MARK KRYZH ON LEFT 1CF30..1CF46 ; Diacritic # Mn [23] ZNAMENNY COMBINING TONAL RANGE MARK MRACHNO..ZNAMENNY PRIZNAK MODIFIER ROG 1D167..1D169 ; Diacritic # Mn [3] MUSICAL SYMBOL COMBINING TREMOLO-1..MUSICAL SYMBOL COMBINING TREMOLO-3 1D16D..1D172 ; Diacritic # Mc [6] MUSICAL SYMBOL COMBINING AUGMENTATION DOT..MUSICAL SYMBOL COMBINING FLAG-5 1D17B..1D182 ; Diacritic # Mn [8] MUSICAL SYMBOL COMBINING ACCENT..MUSICAL SYMBOL COMBINING LOURE 1D185..1D18B ; Diacritic # Mn [7] MUSICAL SYMBOL COMBINING DOIT..MUSICAL SYMBOL COMBINING TRIPLE TONGUE 1D1AA..1D1AD ; Diacritic # Mn [4] MUSICAL SYMBOL COMBINING DOWN BOW..MUSICAL SYMBOL COMBINING SNAP PIZZICATO 1E030..1E06D ; Diacritic # Lm [62] MODIFIER LETTER CYRILLIC SMALL A..MODIFIER LETTER CYRILLIC SMALL STRAIGHT U WITH STROKE 1E130..1E136 ; Diacritic # Mn [7] NYIAKENG PUACHUE HMONG TONE-B..NYIAKENG PUACHUE HMONG TONE-D 1E2AE ; Diacritic # Mn TOTO SIGN RISING TONE 1E2EC..1E2EF ; Diacritic # Mn [4] WANCHO TONE TUP..WANCHO TONE KOINI 1E5EE..1E5EF ; Diacritic # Mn [2] OL ONAL SIGN MU..OL ONAL SIGN IKIR 1E8D0..1E8D6 ; Diacritic # Mn [7] MENDE KIKAKUI COMBINING NUMBER TEENS..MENDE KIKAKUI COMBINING NUMBER MILLIONS 1E944..1E946 ; Diacritic # Mn [3] ADLAM ALIF LENGTHENER..ADLAM GEMINATION MARK 1E948..1E94A ; Diacritic # Mn [3] ADLAM CONSONANT MODIFIER..ADLAM NUKTA # Total code points: 1247 # ================================================ 00B7 ; Extender # Po MIDDLE DOT 02D0..02D1 ; Extender # Lm [2] MODIFIER LETTER TRIANGULAR COLON..MODIFIER LETTER HALF TRIANGULAR COLON 0640 ; Extender # Lm ARABIC TATWEEL 07FA ; Extender # Lm NKO LAJANYALAN 0A71 ; Extender # Mn GURMUKHI ADDAK 0AFB ; Extender # Mn GUJARATI SIGN SHADDA 0B55 ; Extender # Mn ORIYA SIGN OVERLINE 0E46 ; Extender # Lm THAI CHARACTER MAIYAMOK 0EC6 ; 
Extender # Lm LAO KO LA 180A ; Extender # Po MONGOLIAN NIRUGU 1843 ; Extender # Lm MONGOLIAN LETTER TODO LONG VOWEL SIGN 1AA7 ; Extender # Lm TAI THAM SIGN MAI YAMOK 1C36 ; Extender # Mn LEPCHA SIGN RAN 1C7B ; Extender # Lm OL CHIKI RELAA 3005 ; Extender # Lm IDEOGRAPHIC ITERATION MARK 3031..3035 ; Extender # Lm [5] VERTICAL KANA REPEAT MARK..VERTICAL KANA REPEAT MARK LOWER HALF 309D..309E ; Extender # Lm [2] HIRAGANA ITERATION MARK..HIRAGANA VOICED ITERATION MARK 30FC..30FE ; Extender # Lm [3] KATAKANA-HIRAGANA PROLONGED SOUND MARK..KATAKANA VOICED ITERATION MARK A015 ; Extender # Lm YI SYLLABLE WU A60C ; Extender # Lm VAI SYLLABLE LENGTHENER A9CF ; Extender # Lm JAVANESE PANGRANGKEP A9E6 ; Extender # Lm MYANMAR MODIFIER LETTER SHAN REDUPLICATION AA70 ; Extender # Lm MYANMAR MODIFIER LETTER KHAMTI REDUPLICATION AADD ; Extender # Lm TAI VIET SYMBOL SAM AAF3..AAF4 ; Extender # Lm [2] MEETEI MAYEK SYLLABLE REPETITION MARK..MEETEI MAYEK WORD REPETITION MARK FF70 ; Extender # Lm HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK 10781..10782 ; Extender # Lm [2] MODIFIER LETTER SUPERSCRIPT TRIANGULAR COLON..MODIFIER LETTER SUPERSCRIPT HALF TRIANGULAR COLON 10D4E ; Extender # Lm GARAY VOWEL LENGTH MARK 10D6A ; Extender # Mn GARAY CONSONANT GEMINATION MARK 10D6F ; Extender # Lm GARAY REDUPLICATION MARK 11237 ; Extender # Mn KHOJKI SIGN SHADDA 1135D ; Extender # Lo GRANTHA SIGN PLUTA 113D2 ; Extender # Mn TULU-TIGALARI GEMINATION MARK 113D3 ; Extender # Lo TULU-TIGALARI SIGN PLUTA 115C6..115C8 ; Extender # Po [3] SIDDHAM REPETITION MARK-1..SIDDHAM REPETITION MARK-3 11A98 ; Extender # Mn SOYOMBO GEMINATION MARK 11DD9 ; Extender # Lm TOLONG SIKI SIGN SELA 16B42..16B43 ; Extender # Lm [2] PAHAWH HMONG SIGN VOS NRUA..PAHAWH HMONG SIGN IB YAM 16FE0..16FE1 ; Extender # Lm [2] TANGUT ITERATION MARK..NUSHU ITERATION MARK 16FE3 ; Extender # Lm OLD CHINESE ITERATION MARK 16FF2..16FF3 ; Extender # Lm [2] CHINESE SMALL SIMPLIFIED ER..CHINESE SMALL TRADITIONAL ER 1E13C..1E13D ; Extender 
# Lm [2] NYIAKENG PUACHUE HMONG SIGN XW XW..NYIAKENG PUACHUE HMONG SYLLABLE LENGTHENER 1E5EF ; Extender # Mn OL ONAL SIGN IKIR 1E944..1E946 ; Extender # Mn [3] ADLAM ALIF LENGTHENER..ADLAM GEMINATION MARK # Total code points: 62 # ================================================ 00AA ; Other_Lowercase # Lo FEMININE ORDINAL INDICATOR 00BA ; Other_Lowercase # Lo MASCULINE ORDINAL INDICATOR 02B0..02B8 ; Other_Lowercase # Lm [9] MODIFIER LETTER SMALL H..MODIFIER LETTER SMALL Y 02C0..02C1 ; Other_Lowercase # Lm [2] MODIFIER LETTER GLOTTAL STOP..MODIFIER LETTER REVERSED GLOTTAL STOP 02E0..02E4 ; Other_Lowercase # Lm [5] MODIFIER LETTER SMALL GAMMA..MODIFIER LETTER SMALL REVERSED GLOTTAL STOP 0345 ; Other_Lowercase # Mn COMBINING GREEK YPOGEGRAMMENI 037A ; Other_Lowercase # Lm GREEK YPOGEGRAMMENI 10FC ; Other_Lowercase # Lm MODIFIER LETTER GEORGIAN NAR 1D2C..1D6A ; Other_Lowercase # Lm [63] MODIFIER LETTER CAPITAL A..GREEK SUBSCRIPT SMALL LETTER CHI 1D78 ; Other_Lowercase # Lm MODIFIER LETTER CYRILLIC EN 1D9B..1DBF ; Other_Lowercase # Lm [37] MODIFIER LETTER SMALL TURNED ALPHA..MODIFIER LETTER SMALL THETA 2071 ; Other_Lowercase # Lm SUPERSCRIPT LATIN SMALL LETTER I 207F ; Other_Lowercase # Lm SUPERSCRIPT LATIN SMALL LETTER N 2090..209C ; Other_Lowercase # Lm [13] LATIN SUBSCRIPT SMALL LETTER A..LATIN SUBSCRIPT SMALL LETTER T 2170..217F ; Other_Lowercase # Nl [16] SMALL ROMAN NUMERAL ONE..SMALL ROMAN NUMERAL ONE THOUSAND 24D0..24E9 ; Other_Lowercase # So [26] CIRCLED LATIN SMALL LETTER A..CIRCLED LATIN SMALL LETTER Z 2C7C..2C7D ; Other_Lowercase # Lm [2] LATIN SUBSCRIPT SMALL LETTER J..MODIFIER LETTER CAPITAL V A69C..A69D ; Other_Lowercase # Lm [2] MODIFIER LETTER CYRILLIC HARD SIGN..MODIFIER LETTER CYRILLIC SOFT SIGN A770 ; Other_Lowercase # Lm MODIFIER LETTER US A7F1..A7F4 ; Other_Lowercase # Lm [4] MODIFIER LETTER CAPITAL S..MODIFIER LETTER CAPITAL Q A7F8..A7F9 ; Other_Lowercase # Lm [2] MODIFIER LETTER CAPITAL H WITH STROKE..MODIFIER LETTER SMALL LIGATURE OE AB5C..AB5F 
; Other_Lowercase # Lm [4] MODIFIER LETTER SMALL HENG..MODIFIER LETTER SMALL U WITH LEFT HOOK AB69 ; Other_Lowercase # Lm MODIFIER LETTER SMALL TURNED W 10780 ; Other_Lowercase # Lm MODIFIER LETTER SMALL CAPITAL AA 10783..10785 ; Other_Lowercase # Lm [3] MODIFIER LETTER SMALL AE..MODIFIER LETTER SMALL B WITH HOOK 10787..107B0 ; Other_Lowercase # Lm [42] MODIFIER LETTER SMALL DZ DIGRAPH..MODIFIER LETTER SMALL V WITH RIGHT HOOK 107B2..107BA ; Other_Lowercase # Lm [9] MODIFIER LETTER SMALL CAPITAL Y..MODIFIER LETTER SMALL S WITH CURL 1E030..1E06D ; Other_Lowercase # Lm [62] MODIFIER LETTER CYRILLIC SMALL A..MODIFIER LETTER CYRILLIC SMALL STRAIGHT U WITH STROKE # Total code points: 312 # ================================================ 2160..216F ; Other_Uppercase # Nl [16] ROMAN NUMERAL ONE..ROMAN NUMERAL ONE THOUSAND 24B6..24CF ; Other_Uppercase # So [26] CIRCLED LATIN CAPITAL LETTER A..CIRCLED LATIN CAPITAL LETTER Z 1F130..1F149 ; Other_Uppercase # So [26] SQUARED LATIN CAPITAL LETTER A..SQUARED LATIN CAPITAL LETTER Z 1F150..1F169 ; Other_Uppercase # So [26] NEGATIVE CIRCLED LATIN CAPITAL LETTER A..NEGATIVE CIRCLED LATIN CAPITAL LETTER Z 1F170..1F189 ; Other_Uppercase # So [26] NEGATIVE SQUARED LATIN CAPITAL LETTER A..NEGATIVE SQUARED LATIN CAPITAL LETTER Z # Total code points: 120 # ================================================ FDD0..FDEF ; Noncharacter_Code_Point # Cn [32] .. FFFE..FFFF ; Noncharacter_Code_Point # Cn [2] .. 1FFFE..1FFFF ; Noncharacter_Code_Point # Cn [2] .. 2FFFE..2FFFF ; Noncharacter_Code_Point # Cn [2] .. 3FFFE..3FFFF ; Noncharacter_Code_Point # Cn [2] .. 4FFFE..4FFFF ; Noncharacter_Code_Point # Cn [2] .. 5FFFE..5FFFF ; Noncharacter_Code_Point # Cn [2] .. 6FFFE..6FFFF ; Noncharacter_Code_Point # Cn [2] .. 7FFFE..7FFFF ; Noncharacter_Code_Point # Cn [2] .. 8FFFE..8FFFF ; Noncharacter_Code_Point # Cn [2] .. 9FFFE..9FFFF ; Noncharacter_Code_Point # Cn [2] .. AFFFE..AFFFF ; Noncharacter_Code_Point # Cn [2] .. 
BFFFE..BFFFF ; Noncharacter_Code_Point # Cn [2] .. CFFFE..CFFFF ; Noncharacter_Code_Point # Cn [2] .. DFFFE..DFFFF ; Noncharacter_Code_Point # Cn [2] .. EFFFE..EFFFF ; Noncharacter_Code_Point # Cn [2] .. FFFFE..FFFFF ; Noncharacter_Code_Point # Cn [2] .. 10FFFE..10FFFF; Noncharacter_Code_Point # Cn [2] .. # Total code points: 66 # ================================================ 09BE ; Other_Grapheme_Extend # Mc BENGALI VOWEL SIGN AA 09D7 ; Other_Grapheme_Extend # Mc BENGALI AU LENGTH MARK 0B3E ; Other_Grapheme_Extend # Mc ORIYA VOWEL SIGN AA 0B57 ; Other_Grapheme_Extend # Mc ORIYA AU LENGTH MARK 0BBE ; Other_Grapheme_Extend # Mc TAMIL VOWEL SIGN AA 0BD7 ; Other_Grapheme_Extend # Mc TAMIL AU LENGTH MARK 0CC0 ; Other_Grapheme_Extend # Mc KANNADA VOWEL SIGN II 0CC2 ; Other_Grapheme_Extend # Mc KANNADA VOWEL SIGN UU 0CC7..0CC8 ; Other_Grapheme_Extend # Mc [2] KANNADA VOWEL SIGN EE..KANNADA VOWEL SIGN AI 0CCA..0CCB ; Other_Grapheme_Extend # Mc [2] KANNADA VOWEL SIGN O..KANNADA VOWEL SIGN OO 0CD5..0CD6 ; Other_Grapheme_Extend # Mc [2] KANNADA LENGTH MARK..KANNADA AI LENGTH MARK 0D3E ; Other_Grapheme_Extend # Mc MALAYALAM VOWEL SIGN AA 0D57 ; Other_Grapheme_Extend # Mc MALAYALAM AU LENGTH MARK 0DCF ; Other_Grapheme_Extend # Mc SINHALA VOWEL SIGN AELA-PILLA 0DDF ; Other_Grapheme_Extend # Mc SINHALA VOWEL SIGN GAYANUKITTA 1715 ; Other_Grapheme_Extend # Mc TAGALOG SIGN PAMUDPOD 1734 ; Other_Grapheme_Extend # Mc HANUNOO SIGN PAMUDPOD 1B35 ; Other_Grapheme_Extend # Mc BALINESE VOWEL SIGN TEDUNG 1B3B ; Other_Grapheme_Extend # Mc BALINESE VOWEL SIGN RA REPA TEDUNG 1B3D ; Other_Grapheme_Extend # Mc BALINESE VOWEL SIGN LA LENGA TEDUNG 1B43..1B44 ; Other_Grapheme_Extend # Mc [2] BALINESE VOWEL SIGN PEPET TEDUNG..BALINESE ADEG ADEG 1BAA ; Other_Grapheme_Extend # Mc SUNDANESE SIGN PAMAAEH 1BF2..1BF3 ; Other_Grapheme_Extend # Mc [2] BATAK PANGOLAT..BATAK PANONGONAN 200C ; Other_Grapheme_Extend # Cf ZERO WIDTH NON-JOINER 302E..302F ; Other_Grapheme_Extend # Mc [2] HANGUL SINGLE DOT 
TONE MARK..HANGUL DOUBLE DOT TONE MARK A953 ; Other_Grapheme_Extend # Mc REJANG VIRAMA A9C0 ; Other_Grapheme_Extend # Mc JAVANESE PANGKON FF9E..FF9F ; Other_Grapheme_Extend # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK 111C0 ; Other_Grapheme_Extend # Mc SHARADA SIGN VIRAMA 11235 ; Other_Grapheme_Extend # Mc KHOJKI SIGN VIRAMA 1133E ; Other_Grapheme_Extend # Mc GRANTHA VOWEL SIGN AA 1134D ; Other_Grapheme_Extend # Mc GRANTHA SIGN VIRAMA 11357 ; Other_Grapheme_Extend # Mc GRANTHA AU LENGTH MARK 113B8 ; Other_Grapheme_Extend # Mc TULU-TIGALARI VOWEL SIGN AA 113C2 ; Other_Grapheme_Extend # Mc TULU-TIGALARI VOWEL SIGN EE 113C5 ; Other_Grapheme_Extend # Mc TULU-TIGALARI VOWEL SIGN AI 113C7..113C9 ; Other_Grapheme_Extend # Mc [3] TULU-TIGALARI VOWEL SIGN OO..TULU-TIGALARI AU LENGTH MARK 113CF ; Other_Grapheme_Extend # Mc TULU-TIGALARI SIGN LOOPED VIRAMA 114B0 ; Other_Grapheme_Extend # Mc TIRHUTA VOWEL SIGN AA 114BD ; Other_Grapheme_Extend # Mc TIRHUTA VOWEL SIGN SHORT O 115AF ; Other_Grapheme_Extend # Mc SIDDHAM VOWEL SIGN AA 116B6 ; Other_Grapheme_Extend # Mc TAKRI SIGN VIRAMA 11930 ; Other_Grapheme_Extend # Mc DIVES AKURU VOWEL SIGN AA 1193D ; Other_Grapheme_Extend # Mc DIVES AKURU SIGN HALANTA 11F41 ; Other_Grapheme_Extend # Mc KAWI SIGN KILLER 16FF0..16FF1 ; Other_Grapheme_Extend # Mc [2] VIETNAMESE ALTERNATE READING MARK CA..VIETNAMESE ALTERNATE READING MARK NHAY 1D165..1D166 ; Other_Grapheme_Extend # Mc [2] MUSICAL SYMBOL COMBINING STEM..MUSICAL SYMBOL COMBINING SPRECHGESANG STEM 1D16D..1D172 ; Other_Grapheme_Extend # Mc [6] MUSICAL SYMBOL COMBINING AUGMENTATION DOT..MUSICAL SYMBOL COMBINING FLAG-5 E0020..E007F ; Other_Grapheme_Extend # Cf [96] TAG SPACE..CANCEL TAG # Total code points: 160 # ================================================ 2FF0..2FF1 ; IDS_Binary_Operator # So [2] IDEOGRAPHIC DESCRIPTION CHARACTER LEFT TO RIGHT..IDEOGRAPHIC DESCRIPTION CHARACTER ABOVE TO BELOW 2FF4..2FFD ; IDS_Binary_Operator # So [10] 
IDEOGRAPHIC DESCRIPTION CHARACTER FULL SURROUND..IDEOGRAPHIC DESCRIPTION CHARACTER SURROUND FROM LOWER RIGHT 31EF ; IDS_Binary_Operator # So IDEOGRAPHIC DESCRIPTION CHARACTER SUBTRACTION # Total code points: 13 # ================================================ 2FF2..2FF3 ; IDS_Trinary_Operator # So [2] IDEOGRAPHIC DESCRIPTION CHARACTER LEFT TO MIDDLE AND RIGHT..IDEOGRAPHIC DESCRIPTION CHARACTER ABOVE TO MIDDLE AND BELOW # Total code points: 2 # ================================================ 2FFE..2FFF ; IDS_Unary_Operator # So [2] IDEOGRAPHIC DESCRIPTION CHARACTER HORIZONTAL REFLECTION..IDEOGRAPHIC DESCRIPTION CHARACTER ROTATION # Total code points: 2 # ================================================ 2E80..2E99 ; Radical # So [26] CJK RADICAL REPEAT..CJK RADICAL RAP 2E9B..2EF3 ; Radical # So [89] CJK RADICAL CHOKE..CJK RADICAL C-SIMPLIFIED TURTLE 2F00..2FD5 ; Radical # So [214] KANGXI RADICAL ONE..KANGXI RADICAL FLUTE # Total code points: 329 # ================================================ 3400..4DBF ; Unified_Ideograph # Lo [6592] CJK UNIFIED IDEOGRAPH-3400..CJK UNIFIED IDEOGRAPH-4DBF 4E00..9FFF ; Unified_Ideograph # Lo [20992] CJK UNIFIED IDEOGRAPH-4E00..CJK UNIFIED IDEOGRAPH-9FFF FA0E..FA0F ; Unified_Ideograph # Lo [2] CJK COMPATIBILITY IDEOGRAPH-FA0E..CJK COMPATIBILITY IDEOGRAPH-FA0F FA11 ; Unified_Ideograph # Lo CJK COMPATIBILITY IDEOGRAPH-FA11 FA13..FA14 ; Unified_Ideograph # Lo [2] CJK COMPATIBILITY IDEOGRAPH-FA13..CJK COMPATIBILITY IDEOGRAPH-FA14 FA1F ; Unified_Ideograph # Lo CJK COMPATIBILITY IDEOGRAPH-FA1F FA21 ; Unified_Ideograph # Lo CJK COMPATIBILITY IDEOGRAPH-FA21 FA23..FA24 ; Unified_Ideograph # Lo [2] CJK COMPATIBILITY IDEOGRAPH-FA23..CJK COMPATIBILITY IDEOGRAPH-FA24 FA27..FA29 ; Unified_Ideograph # Lo [3] CJK COMPATIBILITY IDEOGRAPH-FA27..CJK COMPATIBILITY IDEOGRAPH-FA29 20000..2A6DF ; Unified_Ideograph # Lo [42720] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6DF 2A700..2B81D ; Unified_Ideograph # Lo [4382] CJK UNIFIED 
IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B81D 2B820..2CEAD ; Unified_Ideograph # Lo [5774] CJK UNIFIED IDEOGRAPH-2B820..CJK UNIFIED IDEOGRAPH-2CEAD 2CEB0..2EBE0 ; Unified_Ideograph # Lo [7473] CJK UNIFIED IDEOGRAPH-2CEB0..CJK UNIFIED IDEOGRAPH-2EBE0 2EBF0..2EE5D ; Unified_Ideograph # Lo [622] CJK UNIFIED IDEOGRAPH-2EBF0..CJK UNIFIED IDEOGRAPH-2EE5D 30000..3134A ; Unified_Ideograph # Lo [4939] CJK UNIFIED IDEOGRAPH-30000..CJK UNIFIED IDEOGRAPH-3134A 31350..33479 ; Unified_Ideograph # Lo [8490] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-33479 # Total code points: 101996 # ================================================ 034F ; Other_Default_Ignorable_Code_Point # Mn COMBINING GRAPHEME JOINER 115F..1160 ; Other_Default_Ignorable_Code_Point # Lo [2] HANGUL CHOSEONG FILLER..HANGUL JUNGSEONG FILLER 17B4..17B5 ; Other_Default_Ignorable_Code_Point # Mn [2] KHMER VOWEL INHERENT AQ..KHMER VOWEL INHERENT AA 2065 ; Other_Default_Ignorable_Code_Point # Cn 3164 ; Other_Default_Ignorable_Code_Point # Lo HANGUL FILLER FFA0 ; Other_Default_Ignorable_Code_Point # Lo HALFWIDTH HANGUL FILLER FFF0..FFF8 ; Other_Default_Ignorable_Code_Point # Cn [9] .. E0000 ; Other_Default_Ignorable_Code_Point # Cn E0002..E001F ; Other_Default_Ignorable_Code_Point # Cn [30] .. E0080..E00FF ; Other_Default_Ignorable_Code_Point # Cn [128] .. E01F0..E0FFF ; Other_Default_Ignorable_Code_Point # Cn [3600] .. 
# Total code points: 3776 # ================================================ 0149 ; Deprecated # L& LATIN SMALL LETTER N PRECEDED BY APOSTROPHE 0673 ; Deprecated # Lo ARABIC LETTER ALEF WITH WAVY HAMZA BELOW 0F77 ; Deprecated # Mn TIBETAN VOWEL SIGN VOCALIC RR 0F79 ; Deprecated # Mn TIBETAN VOWEL SIGN VOCALIC LL 17A3..17A4 ; Deprecated # Lo [2] KHMER INDEPENDENT VOWEL QAQ..KHMER INDEPENDENT VOWEL QAA 206A..206F ; Deprecated # Cf [6] INHIBIT SYMMETRIC SWAPPING..NOMINAL DIGIT SHAPES 2329 ; Deprecated # Ps LEFT-POINTING ANGLE BRACKET 232A ; Deprecated # Pe RIGHT-POINTING ANGLE BRACKET E0001 ; Deprecated # Cf LANGUAGE TAG # Total code points: 15 # ================================================ 0069..006A ; Soft_Dotted # L& [2] LATIN SMALL LETTER I..LATIN SMALL LETTER J 012F ; Soft_Dotted # L& LATIN SMALL LETTER I WITH OGONEK 0249 ; Soft_Dotted # L& LATIN SMALL LETTER J WITH STROKE 0268 ; Soft_Dotted # L& LATIN SMALL LETTER I WITH STROKE 029D ; Soft_Dotted # L& LATIN SMALL LETTER J WITH CROSSED-TAIL 02B2 ; Soft_Dotted # Lm MODIFIER LETTER SMALL J 03F3 ; Soft_Dotted # L& GREEK LETTER YOT 0456 ; Soft_Dotted # L& CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I 0458 ; Soft_Dotted # L& CYRILLIC SMALL LETTER JE 1D62 ; Soft_Dotted # Lm LATIN SUBSCRIPT SMALL LETTER I 1D96 ; Soft_Dotted # L& LATIN SMALL LETTER I WITH RETROFLEX HOOK 1DA4 ; Soft_Dotted # Lm MODIFIER LETTER SMALL I WITH STROKE 1DA8 ; Soft_Dotted # Lm MODIFIER LETTER SMALL J WITH CROSSED-TAIL 1E2D ; Soft_Dotted # L& LATIN SMALL LETTER I WITH TILDE BELOW 1ECB ; Soft_Dotted # L& LATIN SMALL LETTER I WITH DOT BELOW 2071 ; Soft_Dotted # Lm SUPERSCRIPT LATIN SMALL LETTER I 2148..2149 ; Soft_Dotted # L& [2] DOUBLE-STRUCK ITALIC SMALL I..DOUBLE-STRUCK ITALIC SMALL J 2C7C ; Soft_Dotted # Lm LATIN SUBSCRIPT SMALL LETTER J 1D422..1D423 ; Soft_Dotted # L& [2] MATHEMATICAL BOLD SMALL I..MATHEMATICAL BOLD SMALL J 1D456..1D457 ; Soft_Dotted # L& [2] MATHEMATICAL ITALIC SMALL I..MATHEMATICAL ITALIC SMALL J 1D48A..1D48B ; 
Soft_Dotted # L& [2] MATHEMATICAL BOLD ITALIC SMALL I..MATHEMATICAL BOLD ITALIC SMALL J 1D4BE..1D4BF ; Soft_Dotted # L& [2] MATHEMATICAL SCRIPT SMALL I..MATHEMATICAL SCRIPT SMALL J 1D4F2..1D4F3 ; Soft_Dotted # L& [2] MATHEMATICAL BOLD SCRIPT SMALL I..MATHEMATICAL BOLD SCRIPT SMALL J 1D526..1D527 ; Soft_Dotted # L& [2] MATHEMATICAL FRAKTUR SMALL I..MATHEMATICAL FRAKTUR SMALL J 1D55A..1D55B ; Soft_Dotted # L& [2] MATHEMATICAL DOUBLE-STRUCK SMALL I..MATHEMATICAL DOUBLE-STRUCK SMALL J 1D58E..1D58F ; Soft_Dotted # L& [2] MATHEMATICAL BOLD FRAKTUR SMALL I..MATHEMATICAL BOLD FRAKTUR SMALL J 1D5C2..1D5C3 ; Soft_Dotted # L& [2] MATHEMATICAL SANS-SERIF SMALL I..MATHEMATICAL SANS-SERIF SMALL J 1D5F6..1D5F7 ; Soft_Dotted # L& [2] MATHEMATICAL SANS-SERIF BOLD SMALL I..MATHEMATICAL SANS-SERIF BOLD SMALL J 1D62A..1D62B ; Soft_Dotted # L& [2] MATHEMATICAL SANS-SERIF ITALIC SMALL I..MATHEMATICAL SANS-SERIF ITALIC SMALL J 1D65E..1D65F ; Soft_Dotted # L& [2] MATHEMATICAL SANS-SERIF BOLD ITALIC SMALL I..MATHEMATICAL SANS-SERIF BOLD ITALIC SMALL J 1D692..1D693 ; Soft_Dotted # L& [2] MATHEMATICAL MONOSPACE SMALL I..MATHEMATICAL MONOSPACE SMALL J 1DF1A ; Soft_Dotted # L& LATIN SMALL LETTER I WITH STROKE AND RETROFLEX HOOK 1E04C..1E04D ; Soft_Dotted # Lm [2] MODIFIER LETTER CYRILLIC SMALL BYELORUSSIAN-UKRAINIAN I..MODIFIER LETTER CYRILLIC SMALL JE 1E068 ; Soft_Dotted # Lm CYRILLIC SUBSCRIPT SMALL LETTER BYELORUSSIAN-UKRAINIAN I # Total code points: 50 # ================================================ 0E40..0E44 ; Logical_Order_Exception # Lo [5] THAI CHARACTER SARA E..THAI CHARACTER SARA AI MAIMALAI 0EC0..0EC4 ; Logical_Order_Exception # Lo [5] LAO VOWEL SIGN E..LAO VOWEL SIGN AI 19B5..19B7 ; Logical_Order_Exception # Lo [3] NEW TAI LUE VOWEL SIGN E..NEW TAI LUE VOWEL SIGN O 19BA ; Logical_Order_Exception # Lo NEW TAI LUE VOWEL SIGN AY AAB5..AAB6 ; Logical_Order_Exception # Lo [2] TAI VIET VOWEL E..TAI VIET VOWEL O AAB9 ; Logical_Order_Exception # Lo TAI VIET VOWEL UEA AABB..AABC ; 
Logical_Order_Exception # Lo [2] TAI VIET VOWEL AUE..TAI VIET VOWEL AY # Total code points: 19 # ================================================ 1885..1886 ; Other_ID_Start # Mn [2] MONGOLIAN LETTER ALI GALI BALUDA..MONGOLIAN LETTER ALI GALI THREE BALUDA 2118 ; Other_ID_Start # Sm SCRIPT CAPITAL P 212E ; Other_ID_Start # So ESTIMATED SYMBOL 309B..309C ; Other_ID_Start # Sk [2] KATAKANA-HIRAGANA VOICED SOUND MARK..KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK # Total code points: 6 # ================================================ 00B7 ; Other_ID_Continue # Po MIDDLE DOT 0387 ; Other_ID_Continue # Po GREEK ANO TELEIA 1369..1371 ; Other_ID_Continue # No [9] ETHIOPIC DIGIT ONE..ETHIOPIC DIGIT NINE 19DA ; Other_ID_Continue # No NEW TAI LUE THAM DIGIT ONE 200C..200D ; Other_ID_Continue # Cf [2] ZERO WIDTH NON-JOINER..ZERO WIDTH JOINER 30FB ; Other_ID_Continue # Po KATAKANA MIDDLE DOT FF65 ; Other_ID_Continue # Po HALFWIDTH KATAKANA MIDDLE DOT # Total code points: 16 # ================================================ 00B2..00B3 ; ID_Compat_Math_Continue # No [2] SUPERSCRIPT TWO..SUPERSCRIPT THREE 00B9 ; ID_Compat_Math_Continue # No SUPERSCRIPT ONE 2070 ; ID_Compat_Math_Continue # No SUPERSCRIPT ZERO 2074..2079 ; ID_Compat_Math_Continue # No [6] SUPERSCRIPT FOUR..SUPERSCRIPT NINE 207A..207C ; ID_Compat_Math_Continue # Sm [3] SUPERSCRIPT PLUS SIGN..SUPERSCRIPT EQUALS SIGN 207D ; ID_Compat_Math_Continue # Ps SUPERSCRIPT LEFT PARENTHESIS 207E ; ID_Compat_Math_Continue # Pe SUPERSCRIPT RIGHT PARENTHESIS 2080..2089 ; ID_Compat_Math_Continue # No [10] SUBSCRIPT ZERO..SUBSCRIPT NINE 208A..208C ; ID_Compat_Math_Continue # Sm [3] SUBSCRIPT PLUS SIGN..SUBSCRIPT EQUALS SIGN 208D ; ID_Compat_Math_Continue # Ps SUBSCRIPT LEFT PARENTHESIS 208E ; ID_Compat_Math_Continue # Pe SUBSCRIPT RIGHT PARENTHESIS 2202 ; ID_Compat_Math_Continue # Sm PARTIAL DIFFERENTIAL 2207 ; ID_Compat_Math_Continue # Sm NABLA 221E ; ID_Compat_Math_Continue # Sm INFINITY 1D6C1 ; ID_Compat_Math_Continue # Sm 
MATHEMATICAL BOLD NABLA 1D6DB ; ID_Compat_Math_Continue # Sm MATHEMATICAL BOLD PARTIAL DIFFERENTIAL 1D6FB ; ID_Compat_Math_Continue # Sm MATHEMATICAL ITALIC NABLA 1D715 ; ID_Compat_Math_Continue # Sm MATHEMATICAL ITALIC PARTIAL DIFFERENTIAL 1D735 ; ID_Compat_Math_Continue # Sm MATHEMATICAL BOLD ITALIC NABLA 1D74F ; ID_Compat_Math_Continue # Sm MATHEMATICAL BOLD ITALIC PARTIAL DIFFERENTIAL 1D76F ; ID_Compat_Math_Continue # Sm MATHEMATICAL SANS-SERIF BOLD NABLA 1D789 ; ID_Compat_Math_Continue # Sm MATHEMATICAL SANS-SERIF BOLD PARTIAL DIFFERENTIAL 1D7A9 ; ID_Compat_Math_Continue # Sm MATHEMATICAL SANS-SERIF BOLD ITALIC NABLA 1D7C3 ; ID_Compat_Math_Continue # Sm MATHEMATICAL SANS-SERIF BOLD ITALIC PARTIAL DIFFERENTIAL # Total code points: 43 # ================================================ 2202 ; ID_Compat_Math_Start # Sm PARTIAL DIFFERENTIAL 2207 ; ID_Compat_Math_Start # Sm NABLA 221E ; ID_Compat_Math_Start # Sm INFINITY 1D6C1 ; ID_Compat_Math_Start # Sm MATHEMATICAL BOLD NABLA 1D6DB ; ID_Compat_Math_Start # Sm MATHEMATICAL BOLD PARTIAL DIFFERENTIAL 1D6FB ; ID_Compat_Math_Start # Sm MATHEMATICAL ITALIC NABLA 1D715 ; ID_Compat_Math_Start # Sm MATHEMATICAL ITALIC PARTIAL DIFFERENTIAL 1D735 ; ID_Compat_Math_Start # Sm MATHEMATICAL BOLD ITALIC NABLA 1D74F ; ID_Compat_Math_Start # Sm MATHEMATICAL BOLD ITALIC PARTIAL DIFFERENTIAL 1D76F ; ID_Compat_Math_Start # Sm MATHEMATICAL SANS-SERIF BOLD NABLA 1D789 ; ID_Compat_Math_Start # Sm MATHEMATICAL SANS-SERIF BOLD PARTIAL DIFFERENTIAL 1D7A9 ; ID_Compat_Math_Start # Sm MATHEMATICAL SANS-SERIF BOLD ITALIC NABLA 1D7C3 ; ID_Compat_Math_Start # Sm MATHEMATICAL SANS-SERIF BOLD ITALIC PARTIAL DIFFERENTIAL # Total code points: 13 # ================================================ 0021 ; Sentence_Terminal # Po EXCLAMATION MARK 002E ; Sentence_Terminal # Po FULL STOP 003F ; Sentence_Terminal # Po QUESTION MARK 0589 ; Sentence_Terminal # Po ARMENIAN FULL STOP 061D..061F ; Sentence_Terminal # Po [3] ARABIC END OF TEXT MARK..ARABIC 
QUESTION MARK 06D4 ; Sentence_Terminal # Po ARABIC FULL STOP 0700..0702 ; Sentence_Terminal # Po [3] SYRIAC END OF PARAGRAPH..SYRIAC SUBLINEAR FULL STOP 07F9 ; Sentence_Terminal # Po NKO EXCLAMATION MARK 0837 ; Sentence_Terminal # Po SAMARITAN PUNCTUATION MELODIC QITSA 0839 ; Sentence_Terminal # Po SAMARITAN PUNCTUATION QITSA 083D..083E ; Sentence_Terminal # Po [2] SAMARITAN PUNCTUATION SOF MASHFAAT..SAMARITAN PUNCTUATION ANNAAU 0964..0965 ; Sentence_Terminal # Po [2] DEVANAGARI DANDA..DEVANAGARI DOUBLE DANDA 104A..104B ; Sentence_Terminal # Po [2] MYANMAR SIGN LITTLE SECTION..MYANMAR SIGN SECTION 1362 ; Sentence_Terminal # Po ETHIOPIC FULL STOP 1367..1368 ; Sentence_Terminal # Po [2] ETHIOPIC QUESTION MARK..ETHIOPIC PARAGRAPH SEPARATOR 166E ; Sentence_Terminal # Po CANADIAN SYLLABICS FULL STOP 1735..1736 ; Sentence_Terminal # Po [2] PHILIPPINE SINGLE PUNCTUATION..PHILIPPINE DOUBLE PUNCTUATION 17D4..17D5 ; Sentence_Terminal # Po [2] KHMER SIGN KHAN..KHMER SIGN BARIYOOSAN 1803 ; Sentence_Terminal # Po MONGOLIAN FULL STOP 1809 ; Sentence_Terminal # Po MONGOLIAN MANCHU FULL STOP 1944..1945 ; Sentence_Terminal # Po [2] LIMBU EXCLAMATION MARK..LIMBU QUESTION MARK 1AA8..1AAB ; Sentence_Terminal # Po [4] TAI THAM SIGN KAAN..TAI THAM SIGN SATKAANKUU 1B4E..1B4F ; Sentence_Terminal # Po [2] BALINESE INVERTED CARIK SIKI..BALINESE INVERTED CARIK PAREREN 1B5A..1B5B ; Sentence_Terminal # Po [2] BALINESE PANTI..BALINESE PAMADA 1B5E..1B5F ; Sentence_Terminal # Po [2] BALINESE CARIK SIKI..BALINESE CARIK PAREREN 1B7D..1B7F ; Sentence_Terminal # Po [3] BALINESE PANTI LANTANG..BALINESE PANTI BAWAK 1C3B..1C3C ; Sentence_Terminal # Po [2] LEPCHA PUNCTUATION TA-ROL..LEPCHA PUNCTUATION NYET THYOOM TA-ROL 1C7E..1C7F ; Sentence_Terminal # Po [2] OL CHIKI PUNCTUATION MUCAAD..OL CHIKI PUNCTUATION DOUBLE MUCAAD 2024 ; Sentence_Terminal # Po ONE DOT LEADER 203C..203D ; Sentence_Terminal # Po [2] DOUBLE EXCLAMATION MARK..INTERROBANG 2047..2049 ; Sentence_Terminal # Po [3] DOUBLE QUESTION 
MARK..EXCLAMATION QUESTION MARK 2CF9..2CFB ; Sentence_Terminal # Po [3] COPTIC OLD NUBIAN FULL STOP..COPTIC OLD NUBIAN INDIRECT QUESTION MARK 2E2E ; Sentence_Terminal # Po REVERSED QUESTION MARK 2E3C ; Sentence_Terminal # Po STENOGRAPHIC FULL STOP 2E53..2E54 ; Sentence_Terminal # Po [2] MEDIEVAL EXCLAMATION MARK..MEDIEVAL QUESTION MARK 3002 ; Sentence_Terminal # Po IDEOGRAPHIC FULL STOP A4FF ; Sentence_Terminal # Po LISU PUNCTUATION FULL STOP A60E..A60F ; Sentence_Terminal # Po [2] VAI FULL STOP..VAI QUESTION MARK A6F3 ; Sentence_Terminal # Po BAMUM FULL STOP A6F7 ; Sentence_Terminal # Po BAMUM QUESTION MARK A876..A877 ; Sentence_Terminal # Po [2] PHAGS-PA MARK SHAD..PHAGS-PA MARK DOUBLE SHAD A8CE..A8CF ; Sentence_Terminal # Po [2] SAURASHTRA DANDA..SAURASHTRA DOUBLE DANDA A92F ; Sentence_Terminal # Po KAYAH LI SIGN SHYA A9C8..A9C9 ; Sentence_Terminal # Po [2] JAVANESE PADA LINGSA..JAVANESE PADA LUNGSI AA5D..AA5F ; Sentence_Terminal # Po [3] CHAM PUNCTUATION DANDA..CHAM PUNCTUATION TRIPLE DANDA AAF0..AAF1 ; Sentence_Terminal # Po [2] MEETEI MAYEK CHEIKHAN..MEETEI MAYEK AHANG KHUDAM ABEB ; Sentence_Terminal # Po MEETEI MAYEK CHEIKHEI FE12 ; Sentence_Terminal # Po PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC FULL STOP FE15..FE16 ; Sentence_Terminal # Po [2] PRESENTATION FORM FOR VERTICAL EXCLAMATION MARK..PRESENTATION FORM FOR VERTICAL QUESTION MARK FE52 ; Sentence_Terminal # Po SMALL FULL STOP FE56..FE57 ; Sentence_Terminal # Po [2] SMALL QUESTION MARK..SMALL EXCLAMATION MARK FF01 ; Sentence_Terminal # Po FULLWIDTH EXCLAMATION MARK FF0E ; Sentence_Terminal # Po FULLWIDTH FULL STOP FF1F ; Sentence_Terminal # Po FULLWIDTH QUESTION MARK FF61 ; Sentence_Terminal # Po HALFWIDTH IDEOGRAPHIC FULL STOP 10A56..10A57 ; Sentence_Terminal # Po [2] KHAROSHTHI PUNCTUATION DANDA..KHAROSHTHI PUNCTUATION DOUBLE DANDA 10F55..10F59 ; Sentence_Terminal # Po [5] SOGDIAN PUNCTUATION TWO VERTICAL BARS..SOGDIAN PUNCTUATION HALF CIRCLE WITH DOT 10F86..10F89 ; Sentence_Terminal # Po [4] OLD 
UYGHUR PUNCTUATION BAR..OLD UYGHUR PUNCTUATION FOUR DOTS 11047..11048 ; Sentence_Terminal # Po [2] BRAHMI DANDA..BRAHMI DOUBLE DANDA 110BE..110C1 ; Sentence_Terminal # Po [4] KAITHI SECTION MARK..KAITHI DOUBLE DANDA 11141..11143 ; Sentence_Terminal # Po [3] CHAKMA DANDA..CHAKMA QUESTION MARK 111C5..111C6 ; Sentence_Terminal # Po [2] SHARADA DANDA..SHARADA DOUBLE DANDA 111CD ; Sentence_Terminal # Po SHARADA SUTRA MARK 111DE..111DF ; Sentence_Terminal # Po [2] SHARADA SECTION MARK-1..SHARADA SECTION MARK-2 11238..11239 ; Sentence_Terminal # Po [2] KHOJKI DANDA..KHOJKI DOUBLE DANDA 1123B..1123C ; Sentence_Terminal # Po [2] KHOJKI SECTION MARK..KHOJKI DOUBLE SECTION MARK 112A9 ; Sentence_Terminal # Po MULTANI SECTION MARK 113D4..113D5 ; Sentence_Terminal # Po [2] TULU-TIGALARI DANDA..TULU-TIGALARI DOUBLE DANDA 1144B..1144C ; Sentence_Terminal # Po [2] NEWA DANDA..NEWA DOUBLE DANDA 115C2..115C3 ; Sentence_Terminal # Po [2] SIDDHAM DANDA..SIDDHAM DOUBLE DANDA 115C9..115D7 ; Sentence_Terminal # Po [15] SIDDHAM END OF TEXT MARK..SIDDHAM SECTION MARK WITH CIRCLES AND FOUR ENCLOSURES 11641..11642 ; Sentence_Terminal # Po [2] MODI DANDA..MODI DOUBLE DANDA 1173C..1173E ; Sentence_Terminal # Po [3] AHOM SIGN SMALL SECTION..AHOM SIGN RULAI 11944 ; Sentence_Terminal # Po DIVES AKURU DOUBLE DANDA 11946 ; Sentence_Terminal # Po DIVES AKURU END OF TEXT MARK 11A42..11A43 ; Sentence_Terminal # Po [2] ZANABAZAR SQUARE MARK SHAD..ZANABAZAR SQUARE MARK DOUBLE SHAD 11A9B..11A9C ; Sentence_Terminal # Po [2] SOYOMBO MARK SHAD..SOYOMBO MARK DOUBLE SHAD 11C41..11C42 ; Sentence_Terminal # Po [2] BHAIKSUKI DANDA..BHAIKSUKI DOUBLE DANDA 11EF7..11EF8 ; Sentence_Terminal # Po [2] MAKASAR PASSIMBANG..MAKASAR END OF SECTION 11F43..11F44 ; Sentence_Terminal # Po [2] KAWI DANDA..KAWI DOUBLE DANDA 16A6E..16A6F ; Sentence_Terminal # Po [2] MRO DANDA..MRO DOUBLE DANDA 16AF5 ; Sentence_Terminal # Po BASSA VAH FULL STOP 16B37..16B38 ; Sentence_Terminal # Po [2] PAHAWH HMONG SIGN VOS THOM..PAHAWH HMONG SIGN 
VOS TSHAB CEEB 16B44 ; Sentence_Terminal # Po PAHAWH HMONG SIGN XAUS 16D6E..16D6F ; Sentence_Terminal # Po [2] KIRAT RAI DANDA..KIRAT RAI DOUBLE DANDA 16E98 ; Sentence_Terminal # Po MEDEFAIDRIN FULL STOP 1BC9F ; Sentence_Terminal # Po DUPLOYAN PUNCTUATION CHINOOK FULL STOP 1DA88 ; Sentence_Terminal # Po SIGNWRITING FULL STOP # Total code points: 170 # ================================================ 180B..180D ; Variation_Selector # Mn [3] MONGOLIAN FREE VARIATION SELECTOR ONE..MONGOLIAN FREE VARIATION SELECTOR THREE 180F ; Variation_Selector # Mn MONGOLIAN FREE VARIATION SELECTOR FOUR FE00..FE0F ; Variation_Selector # Mn [16] VARIATION SELECTOR-1..VARIATION SELECTOR-16 E0100..E01EF ; Variation_Selector # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256 # Total code points: 260 # ================================================ 0009..000D ; Pattern_White_Space # Cc [5] .. 0020 ; Pattern_White_Space # Zs SPACE 0085 ; Pattern_White_Space # Cc 200E..200F ; Pattern_White_Space # Cf [2] LEFT-TO-RIGHT MARK..RIGHT-TO-LEFT MARK 2028 ; Pattern_White_Space # Zl LINE SEPARATOR 2029 ; Pattern_White_Space # Zp PARAGRAPH SEPARATOR # Total code points: 11 # ================================================ 0021..0023 ; Pattern_Syntax # Po [3] EXCLAMATION MARK..NUMBER SIGN 0024 ; Pattern_Syntax # Sc DOLLAR SIGN 0025..0027 ; Pattern_Syntax # Po [3] PERCENT SIGN..APOSTROPHE 0028 ; Pattern_Syntax # Ps LEFT PARENTHESIS 0029 ; Pattern_Syntax # Pe RIGHT PARENTHESIS 002A ; Pattern_Syntax # Po ASTERISK 002B ; Pattern_Syntax # Sm PLUS SIGN 002C ; Pattern_Syntax # Po COMMA 002D ; Pattern_Syntax # Pd HYPHEN-MINUS 002E..002F ; Pattern_Syntax # Po [2] FULL STOP..SOLIDUS 003A..003B ; Pattern_Syntax # Po [2] COLON..SEMICOLON 003C..003E ; Pattern_Syntax # Sm [3] LESS-THAN SIGN..GREATER-THAN SIGN 003F..0040 ; Pattern_Syntax # Po [2] QUESTION MARK..COMMERCIAL AT 005B ; Pattern_Syntax # Ps LEFT SQUARE BRACKET 005C ; Pattern_Syntax # Po REVERSE SOLIDUS 005D ; Pattern_Syntax # Pe RIGHT SQUARE 
BRACKET 005E ; Pattern_Syntax # Sk CIRCUMFLEX ACCENT 0060 ; Pattern_Syntax # Sk GRAVE ACCENT 007B ; Pattern_Syntax # Ps LEFT CURLY BRACKET 007C ; Pattern_Syntax # Sm VERTICAL LINE 007D ; Pattern_Syntax # Pe RIGHT CURLY BRACKET 007E ; Pattern_Syntax # Sm TILDE 00A1 ; Pattern_Syntax # Po INVERTED EXCLAMATION MARK 00A2..00A5 ; Pattern_Syntax # Sc [4] CENT SIGN..YEN SIGN 00A6 ; Pattern_Syntax # So BROKEN BAR 00A7 ; Pattern_Syntax # Po SECTION SIGN 00A9 ; Pattern_Syntax # So COPYRIGHT SIGN 00AB ; Pattern_Syntax # Pi LEFT-POINTING DOUBLE ANGLE QUOTATION MARK 00AC ; Pattern_Syntax # Sm NOT SIGN 00AE ; Pattern_Syntax # So REGISTERED SIGN 00B0 ; Pattern_Syntax # So DEGREE SIGN 00B1 ; Pattern_Syntax # Sm PLUS-MINUS SIGN 00B6 ; Pattern_Syntax # Po PILCROW SIGN 00BB ; Pattern_Syntax # Pf RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK 00BF ; Pattern_Syntax # Po INVERTED QUESTION MARK 00D7 ; Pattern_Syntax # Sm MULTIPLICATION SIGN 00F7 ; Pattern_Syntax # Sm DIVISION SIGN 2010..2015 ; Pattern_Syntax # Pd [6] HYPHEN..HORIZONTAL BAR 2016..2017 ; Pattern_Syntax # Po [2] DOUBLE VERTICAL LINE..DOUBLE LOW LINE 2018 ; Pattern_Syntax # Pi LEFT SINGLE QUOTATION MARK 2019 ; Pattern_Syntax # Pf RIGHT SINGLE QUOTATION MARK 201A ; Pattern_Syntax # Ps SINGLE LOW-9 QUOTATION MARK 201B..201C ; Pattern_Syntax # Pi [2] SINGLE HIGH-REVERSED-9 QUOTATION MARK..LEFT DOUBLE QUOTATION MARK 201D ; Pattern_Syntax # Pf RIGHT DOUBLE QUOTATION MARK 201E ; Pattern_Syntax # Ps DOUBLE LOW-9 QUOTATION MARK 201F ; Pattern_Syntax # Pi DOUBLE HIGH-REVERSED-9 QUOTATION MARK 2020..2027 ; Pattern_Syntax # Po [8] DAGGER..HYPHENATION POINT 2030..2038 ; Pattern_Syntax # Po [9] PER MILLE SIGN..CARET 2039 ; Pattern_Syntax # Pi SINGLE LEFT-POINTING ANGLE QUOTATION MARK 203A ; Pattern_Syntax # Pf SINGLE RIGHT-POINTING ANGLE QUOTATION MARK 203B..203E ; Pattern_Syntax # Po [4] REFERENCE MARK..OVERLINE 2041..2043 ; Pattern_Syntax # Po [3] CARET INSERTION POINT..HYPHEN BULLET 2044 ; Pattern_Syntax # Sm FRACTION SLASH 2045 ; 
Pattern_Syntax # Ps LEFT SQUARE BRACKET WITH QUILL 2046 ; Pattern_Syntax # Pe RIGHT SQUARE BRACKET WITH QUILL 2047..2051 ; Pattern_Syntax # Po [11] DOUBLE QUESTION MARK..TWO ASTERISKS ALIGNED VERTICALLY 2052 ; Pattern_Syntax # Sm COMMERCIAL MINUS SIGN 2053 ; Pattern_Syntax # Po SWUNG DASH 2055..205E ; Pattern_Syntax # Po [10] FLOWER PUNCTUATION MARK..VERTICAL FOUR DOTS 2190..2194 ; Pattern_Syntax # Sm [5] LEFTWARDS ARROW..LEFT RIGHT ARROW 2195..2199 ; Pattern_Syntax # So [5] UP DOWN ARROW..SOUTH WEST ARROW 219A..219B ; Pattern_Syntax # Sm [2] LEFTWARDS ARROW WITH STROKE..RIGHTWARDS ARROW WITH STROKE 219C..219F ; Pattern_Syntax # So [4] LEFTWARDS WAVE ARROW..UPWARDS TWO HEADED ARROW 21A0 ; Pattern_Syntax # Sm RIGHTWARDS TWO HEADED ARROW 21A1..21A2 ; Pattern_Syntax # So [2] DOWNWARDS TWO HEADED ARROW..LEFTWARDS ARROW WITH TAIL 21A3 ; Pattern_Syntax # Sm RIGHTWARDS ARROW WITH TAIL 21A4..21A5 ; Pattern_Syntax # So [2] LEFTWARDS ARROW FROM BAR..UPWARDS ARROW FROM BAR 21A6 ; Pattern_Syntax # Sm RIGHTWARDS ARROW FROM BAR 21A7..21AD ; Pattern_Syntax # So [7] DOWNWARDS ARROW FROM BAR..LEFT RIGHT WAVE ARROW 21AE ; Pattern_Syntax # Sm LEFT RIGHT ARROW WITH STROKE 21AF..21CD ; Pattern_Syntax # So [31] DOWNWARDS ZIGZAG ARROW..LEFTWARDS DOUBLE ARROW WITH STROKE 21CE..21CF ; Pattern_Syntax # Sm [2] LEFT RIGHT DOUBLE ARROW WITH STROKE..RIGHTWARDS DOUBLE ARROW WITH STROKE 21D0..21D1 ; Pattern_Syntax # So [2] LEFTWARDS DOUBLE ARROW..UPWARDS DOUBLE ARROW 21D2 ; Pattern_Syntax # Sm RIGHTWARDS DOUBLE ARROW 21D3 ; Pattern_Syntax # So DOWNWARDS DOUBLE ARROW 21D4 ; Pattern_Syntax # Sm LEFT RIGHT DOUBLE ARROW 21D5..21F3 ; Pattern_Syntax # So [31] UP DOWN DOUBLE ARROW..UP DOWN WHITE ARROW 21F4..22FF ; Pattern_Syntax # Sm [268] RIGHT ARROW WITH SMALL CIRCLE..Z NOTATION BAG MEMBERSHIP 2300..2307 ; Pattern_Syntax # So [8] DIAMETER SIGN..WAVY LINE 2308 ; Pattern_Syntax # Ps LEFT CEILING 2309 ; Pattern_Syntax # Pe RIGHT CEILING 230A ; Pattern_Syntax # Ps LEFT FLOOR 230B ; Pattern_Syntax # Pe 
RIGHT FLOOR 230C..231F ; Pattern_Syntax # So [20] BOTTOM RIGHT CROP..BOTTOM RIGHT CORNER 2320..2321 ; Pattern_Syntax # Sm [2] TOP HALF INTEGRAL..BOTTOM HALF INTEGRAL 2322..2328 ; Pattern_Syntax # So [7] FROWN..KEYBOARD 2329 ; Pattern_Syntax # Ps LEFT-POINTING ANGLE BRACKET 232A ; Pattern_Syntax # Pe RIGHT-POINTING ANGLE BRACKET 232B..237B ; Pattern_Syntax # So [81] ERASE TO THE LEFT..NOT CHECK MARK 237C ; Pattern_Syntax # Sm RIGHT ANGLE WITH DOWNWARDS ZIGZAG ARROW 237D..239A ; Pattern_Syntax # So [30] SHOULDERED OPEN BOX..CLEAR SCREEN SYMBOL 239B..23B3 ; Pattern_Syntax # Sm [25] LEFT PARENTHESIS UPPER HOOK..SUMMATION BOTTOM 23B4..23DB ; Pattern_Syntax # So [40] TOP SQUARE BRACKET..FUSE 23DC..23E1 ; Pattern_Syntax # Sm [6] TOP PARENTHESIS..BOTTOM TORTOISE SHELL BRACKET 23E2..2429 ; Pattern_Syntax # So [72] WHITE TRAPEZIUM..SYMBOL FOR DELETE MEDIUM SHADE FORM 242A..243F ; Pattern_Syntax # Cn [22] .. 2440..244A ; Pattern_Syntax # So [11] OCR HOOK..OCR DOUBLE BACKSLASH 244B..245F ; Pattern_Syntax # Cn [21] .. 
2500..25B6 ; Pattern_Syntax # So [183] BOX DRAWINGS LIGHT HORIZONTAL..BLACK RIGHT-POINTING TRIANGLE 25B7 ; Pattern_Syntax # Sm WHITE RIGHT-POINTING TRIANGLE 25B8..25C0 ; Pattern_Syntax # So [9] BLACK RIGHT-POINTING SMALL TRIANGLE..BLACK LEFT-POINTING TRIANGLE 25C1 ; Pattern_Syntax # Sm WHITE LEFT-POINTING TRIANGLE 25C2..25F7 ; Pattern_Syntax # So [54] BLACK LEFT-POINTING SMALL TRIANGLE..WHITE CIRCLE WITH UPPER RIGHT QUADRANT 25F8..25FF ; Pattern_Syntax # Sm [8] UPPER LEFT TRIANGLE..LOWER RIGHT TRIANGLE 2600..266E ; Pattern_Syntax # So [111] BLACK SUN WITH RAYS..MUSIC NATURAL SIGN 266F ; Pattern_Syntax # Sm MUSIC SHARP SIGN 2670..2767 ; Pattern_Syntax # So [248] WEST SYRIAC CROSS..ROTATED FLORAL HEART BULLET 2768 ; Pattern_Syntax # Ps MEDIUM LEFT PARENTHESIS ORNAMENT 2769 ; Pattern_Syntax # Pe MEDIUM RIGHT PARENTHESIS ORNAMENT 276A ; Pattern_Syntax # Ps MEDIUM FLATTENED LEFT PARENTHESIS ORNAMENT 276B ; Pattern_Syntax # Pe MEDIUM FLATTENED RIGHT PARENTHESIS ORNAMENT 276C ; Pattern_Syntax # Ps MEDIUM LEFT-POINTING ANGLE BRACKET ORNAMENT 276D ; Pattern_Syntax # Pe MEDIUM RIGHT-POINTING ANGLE BRACKET ORNAMENT 276E ; Pattern_Syntax # Ps HEAVY LEFT-POINTING ANGLE QUOTATION MARK ORNAMENT 276F ; Pattern_Syntax # Pe HEAVY RIGHT-POINTING ANGLE QUOTATION MARK ORNAMENT 2770 ; Pattern_Syntax # Ps HEAVY LEFT-POINTING ANGLE BRACKET ORNAMENT 2771 ; Pattern_Syntax # Pe HEAVY RIGHT-POINTING ANGLE BRACKET ORNAMENT 2772 ; Pattern_Syntax # Ps LIGHT LEFT TORTOISE SHELL BRACKET ORNAMENT 2773 ; Pattern_Syntax # Pe LIGHT RIGHT TORTOISE SHELL BRACKET ORNAMENT 2774 ; Pattern_Syntax # Ps MEDIUM LEFT CURLY BRACKET ORNAMENT 2775 ; Pattern_Syntax # Pe MEDIUM RIGHT CURLY BRACKET ORNAMENT 2794..27BF ; Pattern_Syntax # So [44] HEAVY WIDE-HEADED RIGHTWARDS ARROW..DOUBLE CURLY LOOP 27C0..27C4 ; Pattern_Syntax # Sm [5] THREE DIMENSIONAL ANGLE..OPEN SUPERSET 27C5 ; Pattern_Syntax # Ps LEFT S-SHAPED BAG DELIMITER 27C6 ; Pattern_Syntax # Pe RIGHT S-SHAPED BAG DELIMITER 27C7..27E5 ; Pattern_Syntax # Sm 
[31] OR WITH DOT INSIDE..WHITE SQUARE WITH RIGHTWARDS TICK 27E6 ; Pattern_Syntax # Ps MATHEMATICAL LEFT WHITE SQUARE BRACKET 27E7 ; Pattern_Syntax # Pe MATHEMATICAL RIGHT WHITE SQUARE BRACKET 27E8 ; Pattern_Syntax # Ps MATHEMATICAL LEFT ANGLE BRACKET 27E9 ; Pattern_Syntax # Pe MATHEMATICAL RIGHT ANGLE BRACKET 27EA ; Pattern_Syntax # Ps MATHEMATICAL LEFT DOUBLE ANGLE BRACKET 27EB ; Pattern_Syntax # Pe MATHEMATICAL RIGHT DOUBLE ANGLE BRACKET 27EC ; Pattern_Syntax # Ps MATHEMATICAL LEFT WHITE TORTOISE SHELL BRACKET 27ED ; Pattern_Syntax # Pe MATHEMATICAL RIGHT WHITE TORTOISE SHELL BRACKET 27EE ; Pattern_Syntax # Ps MATHEMATICAL LEFT FLATTENED PARENTHESIS 27EF ; Pattern_Syntax # Pe MATHEMATICAL RIGHT FLATTENED PARENTHESIS 27F0..27FF ; Pattern_Syntax # Sm [16] UPWARDS QUADRUPLE ARROW..LONG RIGHTWARDS SQUIGGLE ARROW 2800..28FF ; Pattern_Syntax # So [256] BRAILLE PATTERN BLANK..BRAILLE PATTERN DOTS-12345678 2900..2982 ; Pattern_Syntax # Sm [131] RIGHTWARDS TWO-HEADED ARROW WITH VERTICAL STROKE..Z NOTATION TYPE COLON 2983 ; Pattern_Syntax # Ps LEFT WHITE CURLY BRACKET 2984 ; Pattern_Syntax # Pe RIGHT WHITE CURLY BRACKET 2985 ; Pattern_Syntax # Ps LEFT WHITE PARENTHESIS 2986 ; Pattern_Syntax # Pe RIGHT WHITE PARENTHESIS 2987 ; Pattern_Syntax # Ps Z NOTATION LEFT IMAGE BRACKET 2988 ; Pattern_Syntax # Pe Z NOTATION RIGHT IMAGE BRACKET 2989 ; Pattern_Syntax # Ps Z NOTATION LEFT BINDING BRACKET 298A ; Pattern_Syntax # Pe Z NOTATION RIGHT BINDING BRACKET 298B ; Pattern_Syntax # Ps LEFT SQUARE BRACKET WITH UNDERBAR 298C ; Pattern_Syntax # Pe RIGHT SQUARE BRACKET WITH UNDERBAR 298D ; Pattern_Syntax # Ps LEFT SQUARE BRACKET WITH TICK IN TOP CORNER 298E ; Pattern_Syntax # Pe RIGHT SQUARE BRACKET WITH TICK IN BOTTOM CORNER 298F ; Pattern_Syntax # Ps LEFT SQUARE BRACKET WITH TICK IN BOTTOM CORNER 2990 ; Pattern_Syntax # Pe RIGHT SQUARE BRACKET WITH TICK IN TOP CORNER 2991 ; Pattern_Syntax # Ps LEFT ANGLE BRACKET WITH DOT 2992 ; Pattern_Syntax # Pe RIGHT ANGLE BRACKET WITH DOT 2993 ; 
Pattern_Syntax # Ps LEFT ARC LESS-THAN BRACKET 2994 ; Pattern_Syntax # Pe RIGHT ARC GREATER-THAN BRACKET 2995 ; Pattern_Syntax # Ps DOUBLE LEFT ARC GREATER-THAN BRACKET 2996 ; Pattern_Syntax # Pe DOUBLE RIGHT ARC LESS-THAN BRACKET 2997 ; Pattern_Syntax # Ps LEFT BLACK TORTOISE SHELL BRACKET 2998 ; Pattern_Syntax # Pe RIGHT BLACK TORTOISE SHELL BRACKET 2999..29D7 ; Pattern_Syntax # Sm [63] DOTTED FENCE..BLACK HOURGLASS 29D8 ; Pattern_Syntax # Ps LEFT WIGGLY FENCE 29D9 ; Pattern_Syntax # Pe RIGHT WIGGLY FENCE 29DA ; Pattern_Syntax # Ps LEFT DOUBLE WIGGLY FENCE 29DB ; Pattern_Syntax # Pe RIGHT DOUBLE WIGGLY FENCE 29DC..29FB ; Pattern_Syntax # Sm [32] INCOMPLETE INFINITY..TRIPLE PLUS 29FC ; Pattern_Syntax # Ps LEFT-POINTING CURVED ANGLE BRACKET 29FD ; Pattern_Syntax # Pe RIGHT-POINTING CURVED ANGLE BRACKET 29FE..2AFF ; Pattern_Syntax # Sm [258] TINY..N-ARY WHITE VERTICAL BAR 2B00..2B2F ; Pattern_Syntax # So [48] NORTH EAST WHITE ARROW..WHITE VERTICAL ELLIPSE 2B30..2B44 ; Pattern_Syntax # Sm [21] LEFT ARROW WITH SMALL CIRCLE..RIGHTWARDS ARROW THROUGH SUPERSET 2B45..2B46 ; Pattern_Syntax # So [2] LEFTWARDS QUADRUPLE ARROW..RIGHTWARDS QUADRUPLE ARROW 2B47..2B4C ; Pattern_Syntax # Sm [6] REVERSE TILDE OPERATOR ABOVE RIGHTWARDS ARROW..RIGHTWARDS ARROW ABOVE REVERSE TILDE OPERATOR 2B4D..2B73 ; Pattern_Syntax # So [39] DOWNWARDS TRIANGLE-HEADED ZIGZAG ARROW..DOWNWARDS TRIANGLE-HEADED ARROW TO BAR 2B74..2B75 ; Pattern_Syntax # Cn [2] .. 
2B76..2BFF ; Pattern_Syntax # So [138] NORTH WEST TRIANGLE-HEADED ARROW TO BAR..HELLSCHREIBER PAUSE SYMBOL 2E00..2E01 ; Pattern_Syntax # Po [2] RIGHT ANGLE SUBSTITUTION MARKER..RIGHT ANGLE DOTTED SUBSTITUTION MARKER 2E02 ; Pattern_Syntax # Pi LEFT SUBSTITUTION BRACKET 2E03 ; Pattern_Syntax # Pf RIGHT SUBSTITUTION BRACKET 2E04 ; Pattern_Syntax # Pi LEFT DOTTED SUBSTITUTION BRACKET 2E05 ; Pattern_Syntax # Pf RIGHT DOTTED SUBSTITUTION BRACKET 2E06..2E08 ; Pattern_Syntax # Po [3] RAISED INTERPOLATION MARKER..DOTTED TRANSPOSITION MARKER 2E09 ; Pattern_Syntax # Pi LEFT TRANSPOSITION BRACKET 2E0A ; Pattern_Syntax # Pf RIGHT TRANSPOSITION BRACKET 2E0B ; Pattern_Syntax # Po RAISED SQUARE 2E0C ; Pattern_Syntax # Pi LEFT RAISED OMISSION BRACKET 2E0D ; Pattern_Syntax # Pf RIGHT RAISED OMISSION BRACKET 2E0E..2E16 ; Pattern_Syntax # Po [9] EDITORIAL CORONIS..DOTTED RIGHT-POINTING ANGLE 2E17 ; Pattern_Syntax # Pd DOUBLE OBLIQUE HYPHEN 2E18..2E19 ; Pattern_Syntax # Po [2] INVERTED INTERROBANG..PALM BRANCH 2E1A ; Pattern_Syntax # Pd HYPHEN WITH DIAERESIS 2E1B ; Pattern_Syntax # Po TILDE WITH RING ABOVE 2E1C ; Pattern_Syntax # Pi LEFT LOW PARAPHRASE BRACKET 2E1D ; Pattern_Syntax # Pf RIGHT LOW PARAPHRASE BRACKET 2E1E..2E1F ; Pattern_Syntax # Po [2] TILDE WITH DOT ABOVE..TILDE WITH DOT BELOW 2E20 ; Pattern_Syntax # Pi LEFT VERTICAL BAR WITH QUILL 2E21 ; Pattern_Syntax # Pf RIGHT VERTICAL BAR WITH QUILL 2E22 ; Pattern_Syntax # Ps TOP LEFT HALF BRACKET 2E23 ; Pattern_Syntax # Pe TOP RIGHT HALF BRACKET 2E24 ; Pattern_Syntax # Ps BOTTOM LEFT HALF BRACKET 2E25 ; Pattern_Syntax # Pe BOTTOM RIGHT HALF BRACKET 2E26 ; Pattern_Syntax # Ps LEFT SIDEWAYS U BRACKET 2E27 ; Pattern_Syntax # Pe RIGHT SIDEWAYS U BRACKET 2E28 ; Pattern_Syntax # Ps LEFT DOUBLE PARENTHESIS 2E29 ; Pattern_Syntax # Pe RIGHT DOUBLE PARENTHESIS 2E2A..2E2E ; Pattern_Syntax # Po [5] TWO DOTS OVER ONE DOT PUNCTUATION..REVERSED QUESTION MARK 2E2F ; Pattern_Syntax # Lm VERTICAL TILDE 2E30..2E39 ; Pattern_Syntax # Po [10] RING 
POINT..TOP HALF SECTION SIGN 2E3A..2E3B ; Pattern_Syntax # Pd [2] TWO-EM DASH..THREE-EM DASH 2E3C..2E3F ; Pattern_Syntax # Po [4] STENOGRAPHIC FULL STOP..CAPITULUM 2E40 ; Pattern_Syntax # Pd DOUBLE HYPHEN 2E41 ; Pattern_Syntax # Po REVERSED COMMA 2E42 ; Pattern_Syntax # Ps DOUBLE LOW-REVERSED-9 QUOTATION MARK 2E43..2E4F ; Pattern_Syntax # Po [13] DASH WITH LEFT UPTURN..CORNISH VERSE DIVIDER 2E50..2E51 ; Pattern_Syntax # So [2] CROSS PATTY WITH RIGHT CROSSBAR..CROSS PATTY WITH LEFT CROSSBAR 2E52..2E54 ; Pattern_Syntax # Po [3] TIRONIAN SIGN CAPITAL ET..MEDIEVAL QUESTION MARK 2E55 ; Pattern_Syntax # Ps LEFT SQUARE BRACKET WITH STROKE 2E56 ; Pattern_Syntax # Pe RIGHT SQUARE BRACKET WITH STROKE 2E57 ; Pattern_Syntax # Ps LEFT SQUARE BRACKET WITH DOUBLE STROKE 2E58 ; Pattern_Syntax # Pe RIGHT SQUARE BRACKET WITH DOUBLE STROKE 2E59 ; Pattern_Syntax # Ps TOP HALF LEFT PARENTHESIS 2E5A ; Pattern_Syntax # Pe TOP HALF RIGHT PARENTHESIS 2E5B ; Pattern_Syntax # Ps BOTTOM HALF LEFT PARENTHESIS 2E5C ; Pattern_Syntax # Pe BOTTOM HALF RIGHT PARENTHESIS 2E5D ; Pattern_Syntax # Pd OBLIQUE HYPHEN 2E5E..2E7F ; Pattern_Syntax # Cn [34] .. 
3001..3003 ; Pattern_Syntax # Po [3] IDEOGRAPHIC COMMA..DITTO MARK 3008 ; Pattern_Syntax # Ps LEFT ANGLE BRACKET 3009 ; Pattern_Syntax # Pe RIGHT ANGLE BRACKET 300A ; Pattern_Syntax # Ps LEFT DOUBLE ANGLE BRACKET 300B ; Pattern_Syntax # Pe RIGHT DOUBLE ANGLE BRACKET 300C ; Pattern_Syntax # Ps LEFT CORNER BRACKET 300D ; Pattern_Syntax # Pe RIGHT CORNER BRACKET 300E ; Pattern_Syntax # Ps LEFT WHITE CORNER BRACKET 300F ; Pattern_Syntax # Pe RIGHT WHITE CORNER BRACKET 3010 ; Pattern_Syntax # Ps LEFT BLACK LENTICULAR BRACKET 3011 ; Pattern_Syntax # Pe RIGHT BLACK LENTICULAR BRACKET 3012..3013 ; Pattern_Syntax # So [2] POSTAL MARK..GETA MARK 3014 ; Pattern_Syntax # Ps LEFT TORTOISE SHELL BRACKET 3015 ; Pattern_Syntax # Pe RIGHT TORTOISE SHELL BRACKET 3016 ; Pattern_Syntax # Ps LEFT WHITE LENTICULAR BRACKET 3017 ; Pattern_Syntax # Pe RIGHT WHITE LENTICULAR BRACKET 3018 ; Pattern_Syntax # Ps LEFT WHITE TORTOISE SHELL BRACKET 3019 ; Pattern_Syntax # Pe RIGHT WHITE TORTOISE SHELL BRACKET 301A ; Pattern_Syntax # Ps LEFT WHITE SQUARE BRACKET 301B ; Pattern_Syntax # Pe RIGHT WHITE SQUARE BRACKET 301C ; Pattern_Syntax # Pd WAVE DASH 301D ; Pattern_Syntax # Ps REVERSED DOUBLE PRIME QUOTATION MARK 301E..301F ; Pattern_Syntax # Pe [2] DOUBLE PRIME QUOTATION MARK..LOW DOUBLE PRIME QUOTATION MARK 3020 ; Pattern_Syntax # So POSTAL MARK FACE 3030 ; Pattern_Syntax # Pd WAVY DASH FD3E ; Pattern_Syntax # Pe ORNATE LEFT PARENTHESIS FD3F ; Pattern_Syntax # Ps ORNATE RIGHT PARENTHESIS FE45..FE46 ; Pattern_Syntax # Po [2] SESAME DOT..WHITE SESAME DOT # Total code points: 2760 # ================================================ 0600..0605 ; Prepended_Concatenation_Mark # Cf [6] ARABIC NUMBER SIGN..ARABIC NUMBER MARK ABOVE 06DD ; Prepended_Concatenation_Mark # Cf ARABIC END OF AYAH 070F ; Prepended_Concatenation_Mark # Cf SYRIAC ABBREVIATION MARK 0890..0891 ; Prepended_Concatenation_Mark # Cf [2] ARABIC POUND MARK ABOVE..ARABIC PIASTRE MARK ABOVE 08E2 ; Prepended_Concatenation_Mark # Cf ARABIC 
DISPUTED END OF AYAH 110BD ; Prepended_Concatenation_Mark # Cf KAITHI NUMBER SIGN 110CD ; Prepended_Concatenation_Mark # Cf KAITHI NUMBER SIGN ABOVE # Total code points: 13 # ================================================ 1F1E6..1F1FF ; Regional_Indicator # So [26] REGIONAL INDICATOR SYMBOL LETTER A..REGIONAL INDICATOR SYMBOL LETTER Z # Total code points: 26 # ================================================ 0654..0655 ; Modifier_Combining_Mark # Mn [2] ARABIC HAMZA ABOVE..ARABIC HAMZA BELOW 0658 ; Modifier_Combining_Mark # Mn ARABIC MARK NOON GHUNNA 06DC ; Modifier_Combining_Mark # Mn ARABIC SMALL HIGH SEEN 06E3 ; Modifier_Combining_Mark # Mn ARABIC SMALL LOW SEEN 06E7..06E8 ; Modifier_Combining_Mark # Mn [2] ARABIC SMALL HIGH YEH..ARABIC SMALL HIGH NOON 08CA..08CB ; Modifier_Combining_Mark # Mn [2] ARABIC SMALL HIGH FARSI YEH..ARABIC SMALL HIGH YEH BARREE WITH TWO DOTS BELOW 08CD..08CF ; Modifier_Combining_Mark # Mn [3] ARABIC SMALL HIGH ZAH..ARABIC LARGE ROUND DOT BELOW 08D3 ; Modifier_Combining_Mark # Mn ARABIC SMALL LOW WAW 08F3 ; Modifier_Combining_Mark # Mn ARABIC SMALL HIGH WAW # Total code points: 14 # EOF ================================================ FILE: maint/Unicode.tables/PropertyAliases.txt ================================================ # PropertyAliases-17.0.0.txt # Date: 2025-04-25, 14:00:52 GMT # © 2025 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use and license, see https://www.unicode.org/terms_of_use.html # # Unicode Character Database # For documentation, see https://www.unicode.org/reports/tr44/ # # This file contains aliases for properties used in the UCD. # These names can be used for XML formats of UCD data, for regular-expression # property tests, and other programmatic textual descriptions of Unicode data. # # The names may be translated in appropriate environments, and additional # aliases may be useful. 
# # FORMAT # # Each line has two or more fields, separated by semicolons. # # First Field: The first field is the short name for the property. # It is typically an abbreviation, but in a number of cases it is simply # a duplicate of the "long name" in the second field. # For Unihan database tags, the short name is actually a longer string than # the tag specified in the second field. # # Second Field: The second field is the long name for the property, # typically the formal name used in documentation about the property. # # The above are the preferred aliases. Other aliases may be listed in additional fields. # # Loose matching should be applied to all property names and property values, with # the exception of String Property values. With loose matching of property names and # values, the case distinctions, whitespace, and '_' are ignored. For Numeric Property # values, numeric equivalencies are applied: thus "01.00" is equivalent to "1". # # NOTE: Property value names are NOT unique across properties. For example: # # AL means Arabic Letter for the Bidi_Class property, and # AL means Above_Left for the Combining_Class property, and # AL means Alphabetic for the Line_Break property. # # In addition, some property names may be the same as some property value names. # For example: # # sc means the Script property, and # Sc means the General_Category property value Currency_Symbol (Sc) # # The combination of property value and property name is, however, unique. # # For more information, see: # - UAX #44, Unicode Character Database; # - UAX #38, Unicode Han Database (Unihan); # - UAX #57, Unicode Egyptian Hieroglyph Database (Unikemet); # - UTS #18, Unicode Regular Expressions. 
# ================================================ # ================================================ # Numeric Properties # ================================================ cjkAccountingNumeric ; kAccountingNumeric cjkOtherNumeric ; kOtherNumeric cjkPrimaryNumeric ; kPrimaryNumeric nv ; Numeric_Value # ================================================ # String Properties # ================================================ bmg ; Bidi_Mirroring_Glyph bpb ; Bidi_Paired_Bracket cf ; Case_Folding cjkCompatibilityVariant ; kCompatibilityVariant dm ; Decomposition_Mapping EqUIdeo ; Equivalent_Unified_Ideograph FC_NFKC ; FC_NFKC_Closure lc ; Lowercase_Mapping NFKC_CF ; NFKC_Casefold NFKC_SCF ; NFKC_Simple_Casefold scf ; Simple_Case_Folding ; sfc slc ; Simple_Lowercase_Mapping stc ; Simple_Titlecase_Mapping suc ; Simple_Uppercase_Mapping tc ; Titlecase_Mapping uc ; Uppercase_Mapping # ================================================ # Miscellaneous Properties # ================================================ cjkIICore ; kIICore cjkIRG_GSource ; kIRG_GSource cjkIRG_HSource ; kIRG_HSource cjkIRG_JSource ; kIRG_JSource cjkIRG_KPSource ; kIRG_KPSource cjkIRG_KSource ; kIRG_KSource cjkIRG_MSource ; kIRG_MSource cjkIRG_SSource ; kIRG_SSource cjkIRG_TSource ; kIRG_TSource cjkIRG_UKSource ; kIRG_UKSource cjkIRG_USource ; kIRG_USource cjkIRG_VSource ; kIRG_VSource cjkMandarin ; kMandarin cjkRSUnicode ; kRSUnicode ; Unicode_Radical_Stroke; URS cjkTotalStrokes ; kTotalStrokes cjkUnihanCore2020 ; kUnihanCore2020 isc ; ISO_Comment JSN ; Jamo_Short_Name kEH_Cat ; kEH_Cat kEH_Desc ; kEH_Desc kEH_HG ; kEH_HG kEH_IFAO ; kEH_IFAO kEH_JSesh ; kEH_JSesh na ; Name na1 ; Unicode_1_Name Name_Alias ; Name_Alias scx ; Script_Extensions # ================================================ # Catalog Properties # ================================================ age ; Age blk ; Block sc ; Script # ================================================ # Enumerated Properties # 
================================================ bc ; Bidi_Class bpt ; Bidi_Paired_Bracket_Type ccc ; Canonical_Combining_Class dt ; Decomposition_Type ea ; East_Asian_Width gc ; General_Category GCB ; Grapheme_Cluster_Break hst ; Hangul_Syllable_Type InCB ; Indic_Conjunct_Break InPC ; Indic_Positional_Category InSC ; Indic_Syllabic_Category jg ; Joining_Group jt ; Joining_Type lb ; Line_Break NFC_QC ; NFC_Quick_Check NFD_QC ; NFD_Quick_Check NFKC_QC ; NFKC_Quick_Check NFKD_QC ; NFKD_Quick_Check nt ; Numeric_Type SB ; Sentence_Break vo ; Vertical_Orientation WB ; Word_Break # ================================================ # Binary Properties # ================================================ AHex ; ASCII_Hex_Digit Alpha ; Alphabetic Bidi_C ; Bidi_Control Bidi_M ; Bidi_Mirrored Cased ; Cased CE ; Composition_Exclusion CI ; Case_Ignorable Comp_Ex ; Full_Composition_Exclusion CWCF ; Changes_When_Casefolded CWCM ; Changes_When_Casemapped CWKCF ; Changes_When_NFKC_Casefolded CWL ; Changes_When_Lowercased CWT ; Changes_When_Titlecased CWU ; Changes_When_Uppercased Dash ; Dash Dep ; Deprecated DI ; Default_Ignorable_Code_Point Dia ; Diacritic EBase ; Emoji_Modifier_Base EComp ; Emoji_Component EMod ; Emoji_Modifier Emoji ; Emoji EPres ; Emoji_Presentation Ext ; Extender ExtPict ; Extended_Pictographic Gr_Base ; Grapheme_Base Gr_Ext ; Grapheme_Extend Gr_Link ; Grapheme_Link Hex ; Hex_Digit Hyphen ; Hyphen ID_Compat_Math_Continue ; ID_Compat_Math_Continue ID_Compat_Math_Start ; ID_Compat_Math_Start IDC ; ID_Continue Ideo ; Ideographic IDS ; ID_Start IDSB ; IDS_Binary_Operator IDST ; IDS_Trinary_Operator IDSU ; IDS_Unary_Operator Join_C ; Join_Control kEH_NoMirror ; kEH_NoMirror kEH_NoRotate ; kEH_NoRotate LOE ; Logical_Order_Exception Lower ; Lowercase Math ; Math MCM ; Modifier_Combining_Mark NChar ; Noncharacter_Code_Point OAlpha ; Other_Alphabetic ODI ; Other_Default_Ignorable_Code_Point OGr_Ext ; Other_Grapheme_Extend OIDC ; Other_ID_Continue OIDS ; Other_ID_Start 
OLower ; Other_Lowercase OMath ; Other_Math OUpper ; Other_Uppercase Pat_Syn ; Pattern_Syntax Pat_WS ; Pattern_White_Space PCM ; Prepended_Concatenation_Mark QMark ; Quotation_Mark Radical ; Radical RI ; Regional_Indicator SD ; Soft_Dotted STerm ; Sentence_Terminal Term ; Terminal_Punctuation UIdeo ; Unified_Ideograph Upper ; Uppercase VS ; Variation_Selector WSpace ; White_Space ; space XIDC ; XID_Continue XIDS ; XID_Start XO_NFC ; Expands_On_NFC XO_NFD ; Expands_On_NFD XO_NFKC ; Expands_On_NFKC XO_NFKD ; Expands_On_NFKD # ================================================ # Total: 145 # EOF ================================================ FILE: maint/Unicode.tables/PropertyValueAliases.txt ================================================ # PropertyValueAliases-17.0.0.txt # Date: 2025-06-30, 06:16:21 GMT # © 2025 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use and license, see https://www.unicode.org/terms_of_use.html # # Unicode Character Database # For documentation, see https://www.unicode.org/reports/tr44/ # # This file contains aliases for property values used in the UCD. # These names can be used for XML formats of UCD data, for regular-expression # property tests, and other programmatic textual descriptions of Unicode data. # # The names may be translated in appropriate environments, and additional # aliases may be useful. # # FORMAT # # Each line describes a property value name. # This consists of three or more fields, separated by semicolons. # # First Field: The first field describes the property for which that # property value name is used. # # Second Field: The second field is the short name for the property value. # It is typically an abbreviation, but in a number of cases it is simply # a duplicate of the "long name" in the third field. 
# # Third Field: The third field is the long name for the property value, # typically the formal name used in documentation about the property value. # # In the case of Canonical_Combining_Class (ccc), there are 4 fields: # The second field is numeric, the third is the short name, and the fourth is the long name. # # The above are the preferred aliases. Other aliases may be listed in additional fields. # # Loose matching should be applied to all property names and property values, with # the exception of String Property values. With loose matching of property names and # values, the case distinctions, whitespace, hyphens, and '_' are ignored. # For Numeric Property values, numeric equivalence is applied: thus "01.00" # is equivalent to "1". # # NOTE: Property value names are NOT unique across properties. For example: # # AL means Arabic Letter for the Bidi_Class property, and # AL means Above_Left for the Canonical_Combining_Class property, and # AL means Alphabetic for the Line_Break property. # # In addition, some property names may be the same as some property value names. # For example: # # sc means the Script property, and # Sc means the General_Category property value Currency_Symbol (Sc) # # The combination of property value and property name is, however, unique. # # For more information, see UAX #44, Unicode Character Database, and # UTS #18, Unicode Regular Expressions. 
# ================================================ # ASCII_Hex_Digit (AHex) AHex; N ; No ; F ; False AHex; Y ; Yes ; T ; True # Age (age) age; 1.1 ; V1_1 age; 2.0 ; V2_0 age; 2.1 ; V2_1 age; 3.0 ; V3_0 age; 3.1 ; V3_1 age; 3.2 ; V3_2 age; 4.0 ; V4_0 age; 4.1 ; V4_1 age; 5.0 ; V5_0 age; 5.1 ; V5_1 age; 5.2 ; V5_2 age; 6.0 ; V6_0 age; 6.1 ; V6_1 age; 6.2 ; V6_2 age; 6.3 ; V6_3 age; 7.0 ; V7_0 age; 8.0 ; V8_0 age; 9.0 ; V9_0 age; 10.0 ; V10_0 age; 11.0 ; V11_0 age; 12.0 ; V12_0 age; 12.1 ; V12_1 age; 13.0 ; V13_0 age; 14.0 ; V14_0 age; 15.0 ; V15_0 age; 15.1 ; V15_1 age; 16.0 ; V16_0 age; 17.0 ; V17_0 age; NA ; Unassigned # Alphabetic (Alpha) Alpha; N ; No ; F ; False Alpha; Y ; Yes ; T ; True # Bidi_Class (bc) bc ; AL ; Arabic_Letter bc ; AN ; Arabic_Number bc ; B ; Paragraph_Separator bc ; BN ; Boundary_Neutral bc ; CS ; Common_Separator bc ; EN ; European_Number bc ; ES ; European_Separator bc ; ET ; European_Terminator bc ; FSI ; First_Strong_Isolate bc ; L ; Left_To_Right bc ; LRE ; Left_To_Right_Embedding bc ; LRI ; Left_To_Right_Isolate bc ; LRO ; Left_To_Right_Override bc ; NSM ; Nonspacing_Mark bc ; ON ; Other_Neutral bc ; PDF ; Pop_Directional_Format bc ; PDI ; Pop_Directional_Isolate bc ; R ; Right_To_Left bc ; RLE ; Right_To_Left_Embedding bc ; RLI ; Right_To_Left_Isolate bc ; RLO ; Right_To_Left_Override bc ; S ; Segment_Separator bc ; WS ; White_Space # Bidi_Control (Bidi_C) Bidi_C; N ; No ; F ; False Bidi_C; Y ; Yes ; T ; True # Bidi_Mirrored (Bidi_M) Bidi_M; N ; No ; F ; False Bidi_M; Y ; Yes ; T ; True # Bidi_Mirroring_Glyph (bmg) # Bidi_Paired_Bracket (bpb) # @missing: 0000..10FFFF; Bidi_Paired_Bracket; # Bidi_Paired_Bracket_Type (bpt) bpt; c ; Close bpt; n ; None bpt; o ; Open # @missing: 0000..10FFFF; Bidi_Paired_Bracket_Type; n # Block (blk) blk; Adlam ; Adlam blk; Aegean_Numbers ; Aegean_Numbers blk; Ahom ; Ahom blk; Alchemical ; Alchemical_Symbols blk; Alphabetic_PF ; Alphabetic_Presentation_Forms blk; Anatolian_Hieroglyphs ; 
Anatolian_Hieroglyphs blk; Ancient_Greek_Music ; Ancient_Greek_Musical_Notation blk; Ancient_Greek_Numbers ; Ancient_Greek_Numbers blk; Ancient_Symbols ; Ancient_Symbols blk; Arabic ; Arabic blk; Arabic_Ext_A ; Arabic_Extended_A blk; Arabic_Ext_B ; Arabic_Extended_B blk; Arabic_Ext_C ; Arabic_Extended_C blk; Arabic_Math ; Arabic_Mathematical_Alphabetic_Symbols blk; Arabic_PF_A ; Arabic_Presentation_Forms_A ; Arabic_Presentation_Forms-A blk; Arabic_PF_B ; Arabic_Presentation_Forms_B blk; Arabic_Sup ; Arabic_Supplement blk; Armenian ; Armenian blk; Arrows ; Arrows blk; ASCII ; Basic_Latin blk; Avestan ; Avestan blk; Balinese ; Balinese blk; Bamum ; Bamum blk; Bamum_Sup ; Bamum_Supplement blk; Bassa_Vah ; Bassa_Vah blk; Batak ; Batak blk; Bengali ; Bengali blk; Beria_Erfe ; Beria_Erfe blk; Bhaiksuki ; Bhaiksuki blk; Block_Elements ; Block_Elements blk; Bopomofo ; Bopomofo blk; Bopomofo_Ext ; Bopomofo_Extended blk; Box_Drawing ; Box_Drawing blk; Brahmi ; Brahmi blk; Braille ; Braille_Patterns blk; Buginese ; Buginese blk; Buhid ; Buhid blk; Byzantine_Music ; Byzantine_Musical_Symbols blk; Carian ; Carian blk; Caucasian_Albanian ; Caucasian_Albanian blk; Chakma ; Chakma blk; Cham ; Cham blk; Cherokee ; Cherokee blk; Cherokee_Sup ; Cherokee_Supplement blk; Chess_Symbols ; Chess_Symbols blk; Chorasmian ; Chorasmian blk; CJK ; CJK_Unified_Ideographs blk; CJK_Compat ; CJK_Compatibility blk; CJK_Compat_Forms ; CJK_Compatibility_Forms blk; CJK_Compat_Ideographs ; CJK_Compatibility_Ideographs blk; CJK_Compat_Ideographs_Sup ; CJK_Compatibility_Ideographs_Supplement blk; CJK_Ext_A ; CJK_Unified_Ideographs_Extension_A blk; CJK_Ext_B ; CJK_Unified_Ideographs_Extension_B blk; CJK_Ext_C ; CJK_Unified_Ideographs_Extension_C blk; CJK_Ext_D ; CJK_Unified_Ideographs_Extension_D blk; CJK_Ext_E ; CJK_Unified_Ideographs_Extension_E blk; CJK_Ext_F ; CJK_Unified_Ideographs_Extension_F blk; CJK_Ext_G ; CJK_Unified_Ideographs_Extension_G blk; CJK_Ext_H ; CJK_Unified_Ideographs_Extension_H blk; 
CJK_Ext_I ; CJK_Unified_Ideographs_Extension_I blk; CJK_Ext_J ; CJK_Unified_Ideographs_Extension_J blk; CJK_Radicals_Sup ; CJK_Radicals_Supplement blk; CJK_Strokes ; CJK_Strokes blk; CJK_Symbols ; CJK_Symbols_And_Punctuation blk; Compat_Jamo ; Hangul_Compatibility_Jamo blk; Control_Pictures ; Control_Pictures blk; Coptic ; Coptic blk; Coptic_Epact_Numbers ; Coptic_Epact_Numbers blk; Counting_Rod ; Counting_Rod_Numerals blk; Cuneiform ; Cuneiform blk; Cuneiform_Numbers ; Cuneiform_Numbers_And_Punctuation blk; Currency_Symbols ; Currency_Symbols blk; Cypriot_Syllabary ; Cypriot_Syllabary blk; Cypro_Minoan ; Cypro_Minoan blk; Cyrillic ; Cyrillic blk; Cyrillic_Ext_A ; Cyrillic_Extended_A blk; Cyrillic_Ext_B ; Cyrillic_Extended_B blk; Cyrillic_Ext_C ; Cyrillic_Extended_C blk; Cyrillic_Ext_D ; Cyrillic_Extended_D blk; Cyrillic_Sup ; Cyrillic_Supplement ; Cyrillic_Supplementary blk; Deseret ; Deseret blk; Devanagari ; Devanagari blk; Devanagari_Ext ; Devanagari_Extended blk; Devanagari_Ext_A ; Devanagari_Extended_A blk; Diacriticals ; Combining_Diacritical_Marks blk; Diacriticals_Ext ; Combining_Diacritical_Marks_Extended blk; Diacriticals_For_Symbols ; Combining_Diacritical_Marks_For_Symbols; Combining_Marks_For_Symbols blk; Diacriticals_Sup ; Combining_Diacritical_Marks_Supplement blk; Dingbats ; Dingbats blk; Dives_Akuru ; Dives_Akuru blk; Dogra ; Dogra blk; Domino ; Domino_Tiles blk; Duployan ; Duployan blk; Early_Dynastic_Cuneiform ; Early_Dynastic_Cuneiform blk; Egyptian_Hieroglyph_Format_Controls; Egyptian_Hieroglyph_Format_Controls blk; Egyptian_Hieroglyphs ; Egyptian_Hieroglyphs blk; Egyptian_Hieroglyphs_Ext_A ; Egyptian_Hieroglyphs_Extended_A blk; Elbasan ; Elbasan blk; Elymaic ; Elymaic blk; Emoticons ; Emoticons blk; Enclosed_Alphanum ; Enclosed_Alphanumerics blk; Enclosed_Alphanum_Sup ; Enclosed_Alphanumeric_Supplement blk; Enclosed_CJK ; Enclosed_CJK_Letters_And_Months blk; Enclosed_Ideographic_Sup ; Enclosed_Ideographic_Supplement blk; Ethiopic ; Ethiopic 
blk; Ethiopic_Ext ; Ethiopic_Extended blk; Ethiopic_Ext_A ; Ethiopic_Extended_A blk; Ethiopic_Ext_B ; Ethiopic_Extended_B blk; Ethiopic_Sup ; Ethiopic_Supplement blk; Garay ; Garay blk; Geometric_Shapes ; Geometric_Shapes blk; Geometric_Shapes_Ext ; Geometric_Shapes_Extended blk; Georgian ; Georgian blk; Georgian_Ext ; Georgian_Extended blk; Georgian_Sup ; Georgian_Supplement blk; Glagolitic ; Glagolitic blk; Glagolitic_Sup ; Glagolitic_Supplement blk; Gothic ; Gothic blk; Grantha ; Grantha blk; Greek ; Greek_And_Coptic blk; Greek_Ext ; Greek_Extended blk; Gujarati ; Gujarati blk; Gunjala_Gondi ; Gunjala_Gondi blk; Gurmukhi ; Gurmukhi blk; Gurung_Khema ; Gurung_Khema blk; Half_And_Full_Forms ; Halfwidth_And_Fullwidth_Forms blk; Half_Marks ; Combining_Half_Marks blk; Hangul ; Hangul_Syllables blk; Hanifi_Rohingya ; Hanifi_Rohingya blk; Hanunoo ; Hanunoo blk; Hatran ; Hatran blk; Hebrew ; Hebrew blk; High_PU_Surrogates ; High_Private_Use_Surrogates blk; High_Surrogates ; High_Surrogates blk; Hiragana ; Hiragana blk; IDC ; Ideographic_Description_Characters blk; Ideographic_Symbols ; Ideographic_Symbols_And_Punctuation blk; Imperial_Aramaic ; Imperial_Aramaic blk; Indic_Number_Forms ; Common_Indic_Number_Forms blk; Indic_Siyaq_Numbers ; Indic_Siyaq_Numbers blk; Inscriptional_Pahlavi ; Inscriptional_Pahlavi blk; Inscriptional_Parthian ; Inscriptional_Parthian blk; IPA_Ext ; IPA_Extensions blk; Jamo ; Hangul_Jamo blk; Jamo_Ext_A ; Hangul_Jamo_Extended_A blk; Jamo_Ext_B ; Hangul_Jamo_Extended_B blk; Javanese ; Javanese blk; Kaithi ; Kaithi blk; Kaktovik_Numerals ; Kaktovik_Numerals blk; Kana_Ext_A ; Kana_Extended_A blk; Kana_Ext_B ; Kana_Extended_B blk; Kana_Sup ; Kana_Supplement blk; Kanbun ; Kanbun blk; Kangxi ; Kangxi_Radicals blk; Kannada ; Kannada blk; Katakana ; Katakana blk; Katakana_Ext ; Katakana_Phonetic_Extensions blk; Kawi ; Kawi blk; Kayah_Li ; Kayah_Li blk; Kharoshthi ; Kharoshthi blk; Khitan_Small_Script ; Khitan_Small_Script blk; Khmer ; Khmer blk; 
Khmer_Symbols ; Khmer_Symbols blk; Khojki ; Khojki blk; Khudawadi ; Khudawadi blk; Kirat_Rai ; Kirat_Rai blk; Lao ; Lao blk; Latin_1_Sup ; Latin_1_Supplement ; Latin_1 blk; Latin_Ext_A ; Latin_Extended_A blk; Latin_Ext_Additional ; Latin_Extended_Additional blk; Latin_Ext_B ; Latin_Extended_B blk; Latin_Ext_C ; Latin_Extended_C blk; Latin_Ext_D ; Latin_Extended_D blk; Latin_Ext_E ; Latin_Extended_E blk; Latin_Ext_F ; Latin_Extended_F blk; Latin_Ext_G ; Latin_Extended_G blk; Lepcha ; Lepcha blk; Letterlike_Symbols ; Letterlike_Symbols blk; Limbu ; Limbu blk; Linear_A ; Linear_A blk; Linear_B_Ideograms ; Linear_B_Ideograms blk; Linear_B_Syllabary ; Linear_B_Syllabary blk; Lisu ; Lisu blk; Lisu_Sup ; Lisu_Supplement blk; Low_Surrogates ; Low_Surrogates blk; Lycian ; Lycian blk; Lydian ; Lydian blk; Mahajani ; Mahajani blk; Mahjong ; Mahjong_Tiles blk; Makasar ; Makasar blk; Malayalam ; Malayalam blk; Mandaic ; Mandaic blk; Manichaean ; Manichaean blk; Marchen ; Marchen blk; Masaram_Gondi ; Masaram_Gondi blk; Math_Alphanum ; Mathematical_Alphanumeric_Symbols blk; Math_Operators ; Mathematical_Operators blk; Mayan_Numerals ; Mayan_Numerals blk; Medefaidrin ; Medefaidrin blk; Meetei_Mayek ; Meetei_Mayek blk; Meetei_Mayek_Ext ; Meetei_Mayek_Extensions blk; Mende_Kikakui ; Mende_Kikakui blk; Meroitic_Cursive ; Meroitic_Cursive blk; Meroitic_Hieroglyphs ; Meroitic_Hieroglyphs blk; Miao ; Miao blk; Misc_Arrows ; Miscellaneous_Symbols_And_Arrows blk; Misc_Math_Symbols_A ; Miscellaneous_Mathematical_Symbols_A blk; Misc_Math_Symbols_B ; Miscellaneous_Mathematical_Symbols_B blk; Misc_Pictographs ; Miscellaneous_Symbols_And_Pictographs blk; Misc_Symbols ; Miscellaneous_Symbols blk; Misc_Symbols_Sup ; Miscellaneous_Symbols_Supplement blk; Misc_Technical ; Miscellaneous_Technical blk; Modi ; Modi blk; Modifier_Letters ; Spacing_Modifier_Letters blk; Modifier_Tone_Letters ; Modifier_Tone_Letters blk; Mongolian ; Mongolian blk; Mongolian_Sup ; Mongolian_Supplement blk; Mro ; Mro blk; 
Multani ; Multani blk; Music ; Musical_Symbols blk; Myanmar ; Myanmar blk; Myanmar_Ext_A ; Myanmar_Extended_A blk; Myanmar_Ext_B ; Myanmar_Extended_B blk; Myanmar_Ext_C ; Myanmar_Extended_C blk; Nabataean ; Nabataean blk; Nag_Mundari ; Nag_Mundari blk; Nandinagari ; Nandinagari blk; NB ; No_Block blk; New_Tai_Lue ; New_Tai_Lue blk; Newa ; Newa blk; NKo ; NKo blk; Number_Forms ; Number_Forms blk; Nushu ; Nushu blk; Nyiakeng_Puachue_Hmong ; Nyiakeng_Puachue_Hmong blk; OCR ; Optical_Character_Recognition blk; Ogham ; Ogham blk; Ol_Chiki ; Ol_Chiki blk; Ol_Onal ; Ol_Onal blk; Old_Hungarian ; Old_Hungarian blk; Old_Italic ; Old_Italic blk; Old_North_Arabian ; Old_North_Arabian blk; Old_Permic ; Old_Permic blk; Old_Persian ; Old_Persian blk; Old_Sogdian ; Old_Sogdian blk; Old_South_Arabian ; Old_South_Arabian blk; Old_Turkic ; Old_Turkic blk; Old_Uyghur ; Old_Uyghur blk; Oriya ; Oriya blk; Ornamental_Dingbats ; Ornamental_Dingbats blk; Osage ; Osage blk; Osmanya ; Osmanya blk; Ottoman_Siyaq_Numbers ; Ottoman_Siyaq_Numbers blk; Pahawh_Hmong ; Pahawh_Hmong blk; Palmyrene ; Palmyrene blk; Pau_Cin_Hau ; Pau_Cin_Hau blk; Phags_Pa ; Phags_Pa blk; Phaistos ; Phaistos_Disc blk; Phoenician ; Phoenician blk; Phonetic_Ext ; Phonetic_Extensions blk; Phonetic_Ext_Sup ; Phonetic_Extensions_Supplement blk; Playing_Cards ; Playing_Cards blk; Psalter_Pahlavi ; Psalter_Pahlavi blk; PUA ; Private_Use_Area ; Private_Use blk; Punctuation ; General_Punctuation blk; Rejang ; Rejang blk; Rumi ; Rumi_Numeral_Symbols blk; Runic ; Runic blk; Samaritan ; Samaritan blk; Saurashtra ; Saurashtra blk; Sharada ; Sharada blk; Sharada_Sup ; Sharada_Supplement blk; Shavian ; Shavian blk; Shorthand_Format_Controls ; Shorthand_Format_Controls blk; Siddham ; Siddham blk; Sidetic ; Sidetic blk; Sinhala ; Sinhala blk; Sinhala_Archaic_Numbers ; Sinhala_Archaic_Numbers blk; Small_Forms ; Small_Form_Variants blk; Small_Kana_Ext ; Small_Kana_Extension blk; Sogdian ; Sogdian blk; Sora_Sompeng ; Sora_Sompeng blk; 
Soyombo ; Soyombo blk; Specials ; Specials blk; Sundanese ; Sundanese blk; Sundanese_Sup ; Sundanese_Supplement blk; Sunuwar ; Sunuwar blk; Sup_Arrows_A ; Supplemental_Arrows_A blk; Sup_Arrows_B ; Supplemental_Arrows_B blk; Sup_Arrows_C ; Supplemental_Arrows_C blk; Sup_Math_Operators ; Supplemental_Mathematical_Operators blk; Sup_PUA_A ; Supplementary_Private_Use_Area_A blk; Sup_PUA_B ; Supplementary_Private_Use_Area_B blk; Sup_Punctuation ; Supplemental_Punctuation blk; Sup_Symbols_And_Pictographs ; Supplemental_Symbols_And_Pictographs blk; Super_And_Sub ; Superscripts_And_Subscripts blk; Sutton_SignWriting ; Sutton_SignWriting blk; Syloti_Nagri ; Syloti_Nagri blk; Symbols_And_Pictographs_Ext_A ; Symbols_And_Pictographs_Extended_A blk; Symbols_For_Legacy_Computing ; Symbols_For_Legacy_Computing blk; Symbols_For_Legacy_Computing_Sup ; Symbols_For_Legacy_Computing_Supplement blk; Syriac ; Syriac blk; Syriac_Sup ; Syriac_Supplement blk; Tagalog ; Tagalog blk; Tagbanwa ; Tagbanwa blk; Tags ; Tags blk; Tai_Le ; Tai_Le blk; Tai_Tham ; Tai_Tham blk; Tai_Viet ; Tai_Viet blk; Tai_Xuan_Jing ; Tai_Xuan_Jing_Symbols blk; Tai_Yo ; Tai_Yo blk; Takri ; Takri blk; Tamil ; Tamil blk; Tamil_Sup ; Tamil_Supplement blk; Tangsa ; Tangsa blk; Tangut ; Tangut blk; Tangut_Components ; Tangut_Components blk; Tangut_Components_Sup ; Tangut_Components_Supplement blk; Tangut_Sup ; Tangut_Supplement blk; Telugu ; Telugu blk; Thaana ; Thaana blk; Thai ; Thai blk; Tibetan ; Tibetan blk; Tifinagh ; Tifinagh blk; Tirhuta ; Tirhuta blk; Todhri ; Todhri blk; Tolong_Siki ; Tolong_Siki blk; Toto ; Toto blk; Transport_And_Map ; Transport_And_Map_Symbols blk; Tulu_Tigalari ; Tulu_Tigalari blk; UCAS ; Unified_Canadian_Aboriginal_Syllabics; Canadian_Syllabics blk; UCAS_Ext ; Unified_Canadian_Aboriginal_Syllabics_Extended blk; UCAS_Ext_A ; Unified_Canadian_Aboriginal_Syllabics_Extended_A blk; Ugaritic ; Ugaritic blk; Vai ; Vai blk; Vedic_Ext ; Vedic_Extensions blk; Vertical_Forms ; Vertical_Forms blk; 
Vithkuqi ; Vithkuqi blk; VS ; Variation_Selectors blk; VS_Sup ; Variation_Selectors_Supplement blk; Wancho ; Wancho blk; Warang_Citi ; Warang_Citi blk; Yezidi ; Yezidi blk; Yi_Radicals ; Yi_Radicals blk; Yi_Syllables ; Yi_Syllables blk; Yijing ; Yijing_Hexagram_Symbols blk; Zanabazar_Square ; Zanabazar_Square blk; Znamenny_Music ; Znamenny_Musical_Notation # Canonical_Combining_Class (ccc) ccc; 0; NR ; Not_Reordered ccc; 1; OV ; Overlay ccc; 6; HANR ; Han_Reading ccc; 7; NK ; Nukta ccc; 8; KV ; Kana_Voicing ccc; 9; VR ; Virama ccc; 10; CCC10 ; CCC10 ccc; 11; CCC11 ; CCC11 ccc; 12; CCC12 ; CCC12 ccc; 13; CCC13 ; CCC13 ccc; 14; CCC14 ; CCC14 ccc; 15; CCC15 ; CCC15 ccc; 16; CCC16 ; CCC16 ccc; 17; CCC17 ; CCC17 ccc; 18; CCC18 ; CCC18 ccc; 19; CCC19 ; CCC19 ccc; 20; CCC20 ; CCC20 ccc; 21; CCC21 ; CCC21 ccc; 22; CCC22 ; CCC22 ccc; 23; CCC23 ; CCC23 ccc; 24; CCC24 ; CCC24 ccc; 25; CCC25 ; CCC25 ccc; 26; CCC26 ; CCC26 ccc; 27; CCC27 ; CCC27 ccc; 28; CCC28 ; CCC28 ccc; 29; CCC29 ; CCC29 ccc; 30; CCC30 ; CCC30 ccc; 31; CCC31 ; CCC31 ccc; 32; CCC32 ; CCC32 ccc; 33; CCC33 ; CCC33 ccc; 34; CCC34 ; CCC34 ccc; 35; CCC35 ; CCC35 ccc; 36; CCC36 ; CCC36 ccc; 84; CCC84 ; CCC84 ccc; 91; CCC91 ; CCC91 ccc; 103; CCC103 ; CCC103 ccc; 107; CCC107 ; CCC107 ccc; 118; CCC118 ; CCC118 ccc; 122; CCC122 ; CCC122 ccc; 129; CCC129 ; CCC129 ccc; 130; CCC130 ; CCC130 ccc; 132; CCC132 ; CCC132 ccc; 133; CCC133 ; CCC133 # RESERVED ccc; 200; ATBL ; Attached_Below_Left ccc; 202; ATB ; Attached_Below ccc; 214; ATA ; Attached_Above ccc; 216; ATAR ; Attached_Above_Right ccc; 218; BL ; Below_Left ccc; 220; B ; Below ccc; 222; BR ; Below_Right ccc; 224; L ; Left ccc; 226; R ; Right ccc; 228; AL ; Above_Left ccc; 230; A ; Above ccc; 232; AR ; Above_Right ccc; 233; DB ; Double_Below ccc; 234; DA ; Double_Above ccc; 240; IS ; Iota_Subscript # Case_Folding (cf) # @missing: 0000..10FFFF; Case_Folding; # Case_Ignorable (CI) CI ; N ; No ; F ; False CI ; Y ; Yes ; T ; True # Cased (Cased) Cased; N ; No ; F ; False 
Cased; Y ; Yes ; T ; True # Changes_When_Casefolded (CWCF) CWCF; N ; No ; F ; False CWCF; Y ; Yes ; T ; True # Changes_When_Casemapped (CWCM) CWCM; N ; No ; F ; False CWCM; Y ; Yes ; T ; True # Changes_When_Lowercased (CWL) CWL; N ; No ; F ; False CWL; Y ; Yes ; T ; True # Changes_When_NFKC_Casefolded (CWKCF) CWKCF; N ; No ; F ; False CWKCF; Y ; Yes ; T ; True # Changes_When_Titlecased (CWT) CWT; N ; No ; F ; False CWT; Y ; Yes ; T ; True # Changes_When_Uppercased (CWU) CWU; N ; No ; F ; False CWU; Y ; Yes ; T ; True # Composition_Exclusion (CE) CE ; N ; No ; F ; False CE ; Y ; Yes ; T ; True # Dash (Dash) Dash; N ; No ; F ; False Dash; Y ; Yes ; T ; True # Decomposition_Mapping (dm) # @missing: 0000..10FFFF; Decomposition_Mapping; # Decomposition_Type (dt) dt ; Can ; Canonical ; can dt ; Com ; Compat ; com dt ; Enc ; Circle ; enc dt ; Fin ; Final ; fin dt ; Font ; Font ; font dt ; Fra ; Fraction ; fra dt ; Init ; Initial ; init dt ; Iso ; Isolated ; iso dt ; Med ; Medial ; med dt ; Nar ; Narrow ; nar dt ; Nb ; Nobreak ; nb dt ; None ; None ; none dt ; Sml ; Small ; sml dt ; Sqr ; Square ; sqr dt ; Sub ; Sub ; sub dt ; Sup ; Super ; sup dt ; Vert ; Vertical ; vert dt ; Wide ; Wide ; wide # Default_Ignorable_Code_Point (DI) DI ; N ; No ; F ; False DI ; Y ; Yes ; T ; True # Deprecated (Dep) Dep; N ; No ; F ; False Dep; Y ; Yes ; T ; True # Diacritic (Dia) Dia; N ; No ; F ; False Dia; Y ; Yes ; T ; True # East_Asian_Width (ea) ea ; A ; Ambiguous ea ; F ; Fullwidth ea ; H ; Halfwidth ea ; N ; Neutral ea ; Na ; Narrow ea ; W ; Wide # Emoji (Emoji) Emoji; N ; No ; F ; False Emoji; Y ; Yes ; T ; True # Emoji_Component (EComp) EComp; N ; No ; F ; False EComp; Y ; Yes ; T ; True # Emoji_Modifier (EMod) EMod; N ; No ; F ; False EMod; Y ; Yes ; T ; True # Emoji_Modifier_Base (EBase) EBase; N ; No ; F ; False EBase; Y ; Yes ; T ; True # Emoji_Presentation (EPres) EPres; N ; No ; F ; False EPres; Y ; Yes ; T ; True # Equivalent_Unified_Ideograph (EqUIdeo) # Expands_On_NFC 
(XO_NFC) XO_NFC; N ; No ; F ; False XO_NFC; Y ; Yes ; T ; True # Expands_On_NFD (XO_NFD) XO_NFD; N ; No ; F ; False XO_NFD; Y ; Yes ; T ; True # Expands_On_NFKC (XO_NFKC) XO_NFKC; N ; No ; F ; False XO_NFKC; Y ; Yes ; T ; True # Expands_On_NFKD (XO_NFKD) XO_NFKD; N ; No ; F ; False XO_NFKD; Y ; Yes ; T ; True # Extended_Pictographic (ExtPict) ExtPict; N ; No ; F ; False ExtPict; Y ; Yes ; T ; True # Extender (Ext) Ext; N ; No ; F ; False Ext; Y ; Yes ; T ; True # FC_NFKC_Closure (FC_NFKC) # @missing: 0000..10FFFF; FC_NFKC_Closure; # Full_Composition_Exclusion (Comp_Ex) Comp_Ex; N ; No ; F ; False Comp_Ex; Y ; Yes ; T ; True # General_Category (gc) gc ; C ; Other # Cc | Cf | Cn | Co | Cs gc ; Cc ; Control ; cntrl gc ; Cf ; Format gc ; Cn ; Unassigned gc ; Co ; Private_Use gc ; Cs ; Surrogate gc ; L ; Letter # Ll | Lm | Lo | Lt | Lu gc ; LC ; Cased_Letter # Ll | Lt | Lu gc ; Ll ; Lowercase_Letter gc ; Lm ; Modifier_Letter gc ; Lo ; Other_Letter gc ; Lt ; Titlecase_Letter gc ; Lu ; Uppercase_Letter gc ; M ; Mark ; Combining_Mark # Mc | Me | Mn gc ; Mc ; Spacing_Mark gc ; Me ; Enclosing_Mark gc ; Mn ; Nonspacing_Mark gc ; N ; Number # Nd | Nl | No gc ; Nd ; Decimal_Number ; digit gc ; Nl ; Letter_Number gc ; No ; Other_Number gc ; P ; Punctuation ; punct # Pc | Pd | Pe | Pf | Pi | Po | Ps gc ; Pc ; Connector_Punctuation gc ; Pd ; Dash_Punctuation gc ; Pe ; Close_Punctuation gc ; Pf ; Final_Punctuation gc ; Pi ; Initial_Punctuation gc ; Po ; Other_Punctuation gc ; Ps ; Open_Punctuation gc ; S ; Symbol # Sc | Sk | Sm | So gc ; Sc ; Currency_Symbol gc ; Sk ; Modifier_Symbol gc ; Sm ; Math_Symbol gc ; So ; Other_Symbol gc ; Z ; Separator # Zl | Zp | Zs gc ; Zl ; Line_Separator gc ; Zp ; Paragraph_Separator gc ; Zs ; Space_Separator # @missing: 0000..10FFFF; General_Category; Unassigned # Grapheme_Base (Gr_Base) Gr_Base; N ; No ; F ; False Gr_Base; Y ; Yes ; T ; True # Grapheme_Cluster_Break (GCB) GCB; CN ; Control GCB; CR ; CR GCB; EB ; E_Base GCB; EBG ; E_Base_GAZ GCB; EM 
; E_Modifier GCB; EX ; Extend GCB; GAZ ; Glue_After_Zwj GCB; L ; L GCB; LF ; LF GCB; LV ; LV GCB; LVT ; LVT GCB; PP ; Prepend GCB; RI ; Regional_Indicator GCB; SM ; SpacingMark GCB; T ; T GCB; V ; V GCB; XX ; Other GCB; ZWJ ; ZWJ # Grapheme_Extend (Gr_Ext) Gr_Ext; N ; No ; F ; False Gr_Ext; Y ; Yes ; T ; True # Grapheme_Link (Gr_Link) Gr_Link; N ; No ; F ; False Gr_Link; Y ; Yes ; T ; True # Hangul_Syllable_Type (hst) hst; L ; Leading_Jamo hst; LV ; LV_Syllable hst; LVT ; LVT_Syllable hst; NA ; Not_Applicable hst; T ; Trailing_Jamo hst; V ; Vowel_Jamo # Hex_Digit (Hex) Hex; N ; No ; F ; False Hex; Y ; Yes ; T ; True # Hyphen (Hyphen) Hyphen; N ; No ; F ; False Hyphen; Y ; Yes ; T ; True # IDS_Binary_Operator (IDSB) IDSB; N ; No ; F ; False IDSB; Y ; Yes ; T ; True # IDS_Trinary_Operator (IDST) IDST; N ; No ; F ; False IDST; Y ; Yes ; T ; True # IDS_Unary_Operator (IDSU) IDSU; N ; No ; F ; False IDSU; Y ; Yes ; T ; True # ID_Compat_Math_Continue (ID_Compat_Math_Continue) ID_Compat_Math_Continue; N ; No ; F ; False ID_Compat_Math_Continue; Y ; Yes ; T ; True # ID_Compat_Math_Start (ID_Compat_Math_Start) ID_Compat_Math_Start; N ; No ; F ; False ID_Compat_Math_Start; Y ; Yes ; T ; True # ID_Continue (IDC) IDC; N ; No ; F ; False IDC; Y ; Yes ; T ; True # ID_Start (IDS) IDS; N ; No ; F ; False IDS; Y ; Yes ; T ; True # ISO_Comment (isc) # @missing: 0000..10FFFF; ISO_Comment; # Ideographic (Ideo) Ideo; N ; No ; F ; False Ideo; Y ; Yes ; T ; True # Indic_Conjunct_Break (InCB) InCB; Consonant ; Consonant InCB; Extend ; Extend InCB; Linker ; Linker InCB; None ; None # Indic_Positional_Category (InPC) InPC; Bottom ; Bottom InPC; Bottom_And_Left ; Bottom_And_Left InPC; Bottom_And_Right ; Bottom_And_Right InPC; Left ; Left InPC; Left_And_Right ; Left_And_Right InPC; NA ; Not_Applicable InPC; Overstruck ; Overstruck InPC; Right ; Right InPC; Top ; Top InPC; Top_And_Bottom ; Top_And_Bottom InPC; Top_And_Bottom_And_Left ; Top_And_Bottom_And_Left InPC; Top_And_Bottom_And_Right ; 
Top_And_Bottom_And_Right InPC; Top_And_Left ; Top_And_Left InPC; Top_And_Left_And_Right ; Top_And_Left_And_Right InPC; Top_And_Right ; Top_And_Right InPC; Visual_Order_Left ; Visual_Order_Left # Indic_Syllabic_Category (InSC) InSC; Avagraha ; Avagraha InSC; Bindu ; Bindu InSC; Brahmi_Joining_Number ; Brahmi_Joining_Number InSC; Cantillation_Mark ; Cantillation_Mark InSC; Consonant ; Consonant InSC; Consonant_Dead ; Consonant_Dead InSC; Consonant_Final ; Consonant_Final InSC; Consonant_Head_Letter ; Consonant_Head_Letter InSC; Consonant_Initial_Postfixed ; Consonant_Initial_Postfixed InSC; Consonant_Killer ; Consonant_Killer InSC; Consonant_Medial ; Consonant_Medial InSC; Consonant_Placeholder ; Consonant_Placeholder InSC; Consonant_Preceding_Repha ; Consonant_Preceding_Repha InSC; Consonant_Prefixed ; Consonant_Prefixed InSC; Consonant_Subjoined ; Consonant_Subjoined InSC; Consonant_Succeeding_Repha ; Consonant_Succeeding_Repha InSC; Consonant_With_Stacker ; Consonant_With_Stacker InSC; Gemination_Mark ; Gemination_Mark InSC; Invisible_Stacker ; Invisible_Stacker InSC; Joiner ; Joiner InSC; Modifying_Letter ; Modifying_Letter InSC; Non_Joiner ; Non_Joiner InSC; Nukta ; Nukta InSC; Number ; Number InSC; Number_Joiner ; Number_Joiner InSC; Other ; Other InSC; Pure_Killer ; Pure_Killer InSC; Register_Shifter ; Register_Shifter InSC; Reordering_Killer ; Reordering_Killer InSC; Syllable_Modifier ; Syllable_Modifier InSC; Tone_Letter ; Tone_Letter InSC; Tone_Mark ; Tone_Mark InSC; Virama ; Virama InSC; Visarga ; Visarga InSC; Vowel ; Vowel InSC; Vowel_Dependent ; Vowel_Dependent InSC; Vowel_Independent ; Vowel_Independent # Jamo_Short_Name (JSN) JSN; A ; A JSN; AE ; AE JSN; B ; B JSN; BB ; BB JSN; BS ; BS JSN; C ; C JSN; D ; D JSN; DD ; DD JSN; E ; E JSN; EO ; EO JSN; EU ; EU JSN; G ; G JSN; GG ; GG JSN; GS ; GS JSN; H ; H JSN; I ; I JSN; J ; J JSN; JJ ; JJ JSN; K ; K JSN; L ; L JSN; LB ; LB JSN; LG ; LG JSN; LH ; LH JSN; LM ; LM JSN; LP ; LP JSN; LS ; LS JSN; LT ; LT 
JSN; M ; M JSN; N ; N JSN; NG ; NG JSN; NH ; NH JSN; NJ ; NJ JSN; O ; O JSN; OE ; OE JSN; P ; P JSN; R ; R JSN; S ; S JSN; SS ; SS JSN; T ; T JSN; U ; U JSN; WA ; WA JSN; WAE ; WAE JSN; WE ; WE JSN; WEO ; WEO JSN; WI ; WI JSN; YA ; YA JSN; YAE ; YAE JSN; YE ; YE JSN; YEO ; YEO JSN; YI ; YI JSN; YO ; YO JSN; YU ; YU # @missing: 0000..10FFFF; Jamo_Short_Name; # Join_Control (Join_C) Join_C; N ; No ; F ; False Join_C; Y ; Yes ; T ; True # Joining_Group (jg) jg ; African_Feh ; African_Feh jg ; African_Noon ; African_Noon jg ; African_Qaf ; African_Qaf jg ; Ain ; Ain jg ; Alaph ; Alaph jg ; Alef ; Alef jg ; Beh ; Beh jg ; Beth ; Beth jg ; Burushaski_Yeh_Barree ; Burushaski_Yeh_Barree jg ; Dal ; Dal jg ; Dalath_Rish ; Dalath_Rish jg ; E ; E jg ; Farsi_Yeh ; Farsi_Yeh jg ; Fe ; Fe jg ; Feh ; Feh jg ; Final_Semkath ; Final_Semkath jg ; Gaf ; Gaf jg ; Gamal ; Gamal jg ; Hah ; Hah jg ; Hanifi_Rohingya_Kinna_Ya ; Hanifi_Rohingya_Kinna_Ya jg ; Hanifi_Rohingya_Pa ; Hanifi_Rohingya_Pa jg ; He ; He jg ; Heh ; Heh jg ; Heh_Goal ; Heh_Goal jg ; Heth ; Heth jg ; Kaf ; Kaf jg ; Kaph ; Kaph jg ; Kashmiri_Yeh ; Kashmiri_Yeh jg ; Khaph ; Khaph jg ; Knotted_Heh ; Knotted_Heh jg ; Lam ; Lam jg ; Lamadh ; Lamadh jg ; Malayalam_Bha ; Malayalam_Bha jg ; Malayalam_Ja ; Malayalam_Ja jg ; Malayalam_Lla ; Malayalam_Lla jg ; Malayalam_Llla ; Malayalam_Llla jg ; Malayalam_Nga ; Malayalam_Nga jg ; Malayalam_Nna ; Malayalam_Nna jg ; Malayalam_Nnna ; Malayalam_Nnna jg ; Malayalam_Nya ; Malayalam_Nya jg ; Malayalam_Ra ; Malayalam_Ra jg ; Malayalam_Ssa ; Malayalam_Ssa jg ; Malayalam_Tta ; Malayalam_Tta jg ; Manichaean_Aleph ; Manichaean_Aleph jg ; Manichaean_Ayin ; Manichaean_Ayin jg ; Manichaean_Beth ; Manichaean_Beth jg ; Manichaean_Daleth ; Manichaean_Daleth jg ; Manichaean_Dhamedh ; Manichaean_Dhamedh jg ; Manichaean_Five ; Manichaean_Five jg ; Manichaean_Gimel ; Manichaean_Gimel jg ; Manichaean_Heth ; Manichaean_Heth jg ; Manichaean_Hundred ; Manichaean_Hundred jg ; Manichaean_Kaph ; 
Manichaean_Kaph jg ; Manichaean_Lamedh ; Manichaean_Lamedh jg ; Manichaean_Mem ; Manichaean_Mem jg ; Manichaean_Nun ; Manichaean_Nun jg ; Manichaean_One ; Manichaean_One jg ; Manichaean_Pe ; Manichaean_Pe jg ; Manichaean_Qoph ; Manichaean_Qoph jg ; Manichaean_Resh ; Manichaean_Resh jg ; Manichaean_Sadhe ; Manichaean_Sadhe jg ; Manichaean_Samekh ; Manichaean_Samekh jg ; Manichaean_Taw ; Manichaean_Taw jg ; Manichaean_Ten ; Manichaean_Ten jg ; Manichaean_Teth ; Manichaean_Teth jg ; Manichaean_Thamedh ; Manichaean_Thamedh jg ; Manichaean_Twenty ; Manichaean_Twenty jg ; Manichaean_Waw ; Manichaean_Waw jg ; Manichaean_Yodh ; Manichaean_Yodh jg ; Manichaean_Zayin ; Manichaean_Zayin jg ; Meem ; Meem jg ; Mim ; Mim jg ; No_Joining_Group ; No_Joining_Group jg ; Noon ; Noon jg ; Nun ; Nun jg ; Nya ; Nya jg ; Pe ; Pe jg ; Qaf ; Qaf jg ; Qaph ; Qaph jg ; Reh ; Reh jg ; Reversed_Pe ; Reversed_Pe jg ; Rohingya_Yeh ; Rohingya_Yeh jg ; Sad ; Sad jg ; Sadhe ; Sadhe jg ; Seen ; Seen jg ; Semkath ; Semkath jg ; Shin ; Shin jg ; Straight_Waw ; Straight_Waw jg ; Swash_Kaf ; Swash_Kaf jg ; Syriac_Waw ; Syriac_Waw jg ; Tah ; Tah jg ; Taw ; Taw jg ; Teh_Marbuta ; Teh_Marbuta jg ; Teh_Marbuta_Goal ; Teh_Marbuta_Goal ; Hamza_On_Heh_Goal jg ; Teth ; Teth jg ; Thin_Noon ; Thin_Noon jg ; Thin_Yeh ; Thin_Yeh jg ; Vertical_Tail ; Vertical_Tail jg ; Waw ; Waw jg ; Yeh ; Yeh jg ; Yeh_Barree ; Yeh_Barree jg ; Yeh_With_Tail ; Yeh_With_Tail jg ; Yudh ; Yudh jg ; Yudh_He ; Yudh_He jg ; Zain ; Zain jg ; Zhain ; Zhain # Joining_Type (jt) jt ; C ; Join_Causing jt ; D ; Dual_Joining jt ; L ; Left_Joining jt ; R ; Right_Joining jt ; T ; Transparent jt ; U ; Non_Joining # Line_Break (lb) lb ; AI ; Ambiguous lb ; AK ; Aksara lb ; AL ; Alphabetic lb ; AP ; Aksara_Prebase lb ; AS ; Aksara_Start lb ; B2 ; Break_Both lb ; BA ; Break_After lb ; BB ; Break_Before lb ; BK ; Mandatory_Break lb ; CB ; Contingent_Break lb ; CJ ; Conditional_Japanese_Starter lb ; CL ; Close_Punctuation lb ; CM ; Combining_Mark lb ; CP 
; Close_Parenthesis lb ; CR ; Carriage_Return lb ; EB ; E_Base lb ; EM ; E_Modifier lb ; EX ; Exclamation lb ; GL ; Glue lb ; H2 ; H2 lb ; H3 ; H3 lb ; HH ; Unambiguous_Hyphen lb ; HL ; Hebrew_Letter lb ; HY ; Hyphen lb ; ID ; Ideographic lb ; IN ; Inseparable ; Inseperable lb ; IS ; Infix_Numeric lb ; JL ; JL lb ; JT ; JT lb ; JV ; JV lb ; LF ; Line_Feed lb ; NL ; Next_Line lb ; NS ; Nonstarter lb ; NU ; Numeric lb ; OP ; Open_Punctuation lb ; PO ; Postfix_Numeric lb ; PR ; Prefix_Numeric lb ; QU ; Quotation lb ; RI ; Regional_Indicator lb ; SA ; Complex_Context lb ; SG ; Surrogate lb ; SP ; Space lb ; SY ; Break_Symbols lb ; VF ; Virama_Final lb ; VI ; Virama lb ; WJ ; Word_Joiner lb ; XX ; Unknown lb ; ZW ; ZWSpace lb ; ZWJ ; ZWJ # Logical_Order_Exception (LOE) LOE; N ; No ; F ; False LOE; Y ; Yes ; T ; True # Lowercase (Lower) Lower; N ; No ; F ; False Lower; Y ; Yes ; T ; True # Lowercase_Mapping (lc) # @missing: 0000..10FFFF; Lowercase_Mapping; # Math (Math) Math; N ; No ; F ; False Math; Y ; Yes ; T ; True # Modifier_Combining_Mark (MCM) MCM; N ; No ; F ; False MCM; Y ; Yes ; T ; True # NFC_Quick_Check (NFC_QC) NFC_QC; M ; Maybe NFC_QC; N ; No NFC_QC; Y ; Yes # NFD_Quick_Check (NFD_QC) NFD_QC; N ; No NFD_QC; Y ; Yes # NFKC_Casefold (NFKC_CF) # NFKC_Quick_Check (NFKC_QC) NFKC_QC; M ; Maybe NFKC_QC; N ; No NFKC_QC; Y ; Yes # NFKC_Simple_Casefold (NFKC_SCF) # NFKD_Quick_Check (NFKD_QC) NFKD_QC; N ; No NFKD_QC; Y ; Yes # Name (na) # @missing: 0000..10FFFF; Name; # Name_Alias (Name_Alias) # @missing: 0000..10FFFF; Name_Alias; # Noncharacter_Code_Point (NChar) NChar; N ; No ; F ; False NChar; Y ; Yes ; T ; True # Numeric_Type (nt) nt ; De ; Decimal nt ; Di ; Digit nt ; None ; None nt ; Nu ; Numeric # Numeric_Value (nv) # @missing: 0000..10FFFF; Numeric_Value; NaN # Other_Alphabetic (OAlpha) OAlpha; N ; No ; F ; False OAlpha; Y ; Yes ; T ; True # Other_Default_Ignorable_Code_Point (ODI) ODI; N ; No ; F ; False ODI; Y ; Yes ; T ; True # Other_Grapheme_Extend 
(OGr_Ext) OGr_Ext; N ; No ; F ; False OGr_Ext; Y ; Yes ; T ; True # Other_ID_Continue (OIDC) OIDC; N ; No ; F ; False OIDC; Y ; Yes ; T ; True # Other_ID_Start (OIDS) OIDS; N ; No ; F ; False OIDS; Y ; Yes ; T ; True # Other_Lowercase (OLower) OLower; N ; No ; F ; False OLower; Y ; Yes ; T ; True # Other_Math (OMath) OMath; N ; No ; F ; False OMath; Y ; Yes ; T ; True # Other_Uppercase (OUpper) OUpper; N ; No ; F ; False OUpper; Y ; Yes ; T ; True # Pattern_Syntax (Pat_Syn) Pat_Syn; N ; No ; F ; False Pat_Syn; Y ; Yes ; T ; True # Pattern_White_Space (Pat_WS) Pat_WS; N ; No ; F ; False Pat_WS; Y ; Yes ; T ; True # Prepended_Concatenation_Mark (PCM) PCM; N ; No ; F ; False PCM; Y ; Yes ; T ; True # Quotation_Mark (QMark) QMark; N ; No ; F ; False QMark; Y ; Yes ; T ; True # Radical (Radical) Radical; N ; No ; F ; False Radical; Y ; Yes ; T ; True # Regional_Indicator (RI) RI ; N ; No ; F ; False RI ; Y ; Yes ; T ; True # Script (sc) sc ; Adlm ; Adlam sc ; Aghb ; Caucasian_Albanian sc ; Ahom ; Ahom sc ; Arab ; Arabic sc ; Armi ; Imperial_Aramaic sc ; Armn ; Armenian sc ; Avst ; Avestan sc ; Bali ; Balinese sc ; Bamu ; Bamum sc ; Bass ; Bassa_Vah sc ; Batk ; Batak sc ; Beng ; Bengali sc ; Berf ; Beria_Erfe sc ; Bhks ; Bhaiksuki sc ; Bopo ; Bopomofo sc ; Brah ; Brahmi sc ; Brai ; Braille sc ; Bugi ; Buginese sc ; Buhd ; Buhid sc ; Cakm ; Chakma sc ; Cans ; Canadian_Aboriginal sc ; Cari ; Carian sc ; Cham ; Cham sc ; Cher ; Cherokee sc ; Chrs ; Chorasmian sc ; Copt ; Coptic ; Qaac sc ; Cpmn ; Cypro_Minoan sc ; Cprt ; Cypriot sc ; Cyrl ; Cyrillic sc ; Deva ; Devanagari sc ; Diak ; Dives_Akuru sc ; Dogr ; Dogra sc ; Dsrt ; Deseret sc ; Dupl ; Duployan sc ; Egyp ; Egyptian_Hieroglyphs sc ; Elba ; Elbasan sc ; Elym ; Elymaic sc ; Ethi ; Ethiopic sc ; Gara ; Garay sc ; Geor ; Georgian sc ; Glag ; Glagolitic sc ; Gong ; Gunjala_Gondi sc ; Gonm ; Masaram_Gondi sc ; Goth ; Gothic sc ; Gran ; Grantha sc ; Grek ; Greek sc ; Gujr ; Gujarati sc ; Gukh ; Gurung_Khema sc ; Guru ; 
Gurmukhi sc ; Hang ; Hangul sc ; Hani ; Han sc ; Hano ; Hanunoo sc ; Hatr ; Hatran sc ; Hebr ; Hebrew sc ; Hira ; Hiragana sc ; Hluw ; Anatolian_Hieroglyphs sc ; Hmng ; Pahawh_Hmong sc ; Hmnp ; Nyiakeng_Puachue_Hmong sc ; Hrkt ; Katakana_Or_Hiragana sc ; Hung ; Old_Hungarian sc ; Ital ; Old_Italic sc ; Java ; Javanese sc ; Kali ; Kayah_Li sc ; Kana ; Katakana sc ; Kawi ; Kawi sc ; Khar ; Kharoshthi sc ; Khmr ; Khmer sc ; Khoj ; Khojki sc ; Kits ; Khitan_Small_Script sc ; Knda ; Kannada sc ; Krai ; Kirat_Rai sc ; Kthi ; Kaithi sc ; Lana ; Tai_Tham sc ; Laoo ; Lao sc ; Latn ; Latin sc ; Lepc ; Lepcha sc ; Limb ; Limbu sc ; Lina ; Linear_A sc ; Linb ; Linear_B sc ; Lisu ; Lisu sc ; Lyci ; Lycian sc ; Lydi ; Lydian sc ; Mahj ; Mahajani sc ; Maka ; Makasar sc ; Mand ; Mandaic sc ; Mani ; Manichaean sc ; Marc ; Marchen sc ; Medf ; Medefaidrin sc ; Mend ; Mende_Kikakui sc ; Merc ; Meroitic_Cursive sc ; Mero ; Meroitic_Hieroglyphs sc ; Mlym ; Malayalam sc ; Modi ; Modi sc ; Mong ; Mongolian sc ; Mroo ; Mro sc ; Mtei ; Meetei_Mayek sc ; Mult ; Multani sc ; Mymr ; Myanmar sc ; Nagm ; Nag_Mundari sc ; Nand ; Nandinagari sc ; Narb ; Old_North_Arabian sc ; Nbat ; Nabataean sc ; Newa ; Newa sc ; Nkoo ; Nko sc ; Nshu ; Nushu sc ; Ogam ; Ogham sc ; Olck ; Ol_Chiki sc ; Onao ; Ol_Onal sc ; Orkh ; Old_Turkic sc ; Orya ; Oriya sc ; Osge ; Osage sc ; Osma ; Osmanya sc ; Ougr ; Old_Uyghur sc ; Palm ; Palmyrene sc ; Pauc ; Pau_Cin_Hau sc ; Perm ; Old_Permic sc ; Phag ; Phags_Pa sc ; Phli ; Inscriptional_Pahlavi sc ; Phlp ; Psalter_Pahlavi sc ; Phnx ; Phoenician sc ; Plrd ; Miao sc ; Prti ; Inscriptional_Parthian sc ; Rjng ; Rejang sc ; Rohg ; Hanifi_Rohingya sc ; Runr ; Runic sc ; Samr ; Samaritan sc ; Sarb ; Old_South_Arabian sc ; Saur ; Saurashtra sc ; Sgnw ; SignWriting sc ; Shaw ; Shavian sc ; Shrd ; Sharada sc ; Sidd ; Siddham sc ; Sidt ; Sidetic sc ; Sind ; Khudawadi sc ; Sinh ; Sinhala sc ; Sogd ; Sogdian sc ; Sogo ; Old_Sogdian sc ; Sora ; Sora_Sompeng sc ; Soyo ; Soyombo sc ; 
Sund ; Sundanese sc ; Sunu ; Sunuwar sc ; Sylo ; Syloti_Nagri sc ; Syrc ; Syriac sc ; Tagb ; Tagbanwa sc ; Takr ; Takri sc ; Tale ; Tai_Le sc ; Talu ; New_Tai_Lue sc ; Taml ; Tamil sc ; Tang ; Tangut sc ; Tavt ; Tai_Viet sc ; Tayo ; Tai_Yo sc ; Telu ; Telugu sc ; Tfng ; Tifinagh sc ; Tglg ; Tagalog sc ; Thaa ; Thaana sc ; Thai ; Thai sc ; Tibt ; Tibetan sc ; Tirh ; Tirhuta sc ; Tnsa ; Tangsa sc ; Todr ; Todhri sc ; Tols ; Tolong_Siki sc ; Toto ; Toto sc ; Tutg ; Tulu_Tigalari sc ; Ugar ; Ugaritic sc ; Vaii ; Vai sc ; Vith ; Vithkuqi sc ; Wara ; Warang_Citi sc ; Wcho ; Wancho sc ; Xpeo ; Old_Persian sc ; Xsux ; Cuneiform sc ; Yezi ; Yezidi sc ; Yiii ; Yi sc ; Zanb ; Zanabazar_Square sc ; Zinh ; Inherited ; Qaai sc ; Zyyy ; Common sc ; Zzzz ; Unknown # Script_Extensions (scx) # Sentence_Break (SB) SB ; AT ; ATerm SB ; CL ; Close SB ; CR ; CR SB ; EX ; Extend SB ; FO ; Format SB ; LE ; OLetter SB ; LF ; LF SB ; LO ; Lower SB ; NU ; Numeric SB ; SC ; SContinue SB ; SE ; Sep SB ; SP ; Sp SB ; ST ; STerm SB ; UP ; Upper SB ; XX ; Other # Sentence_Terminal (STerm) STerm; N ; No ; F ; False STerm; Y ; Yes ; T ; True # Simple_Case_Folding (scf) # @missing: 0000..10FFFF; Simple_Case_Folding; # Simple_Lowercase_Mapping (slc) # @missing: 0000..10FFFF; Simple_Lowercase_Mapping; # Simple_Titlecase_Mapping (stc) # @missing: 0000..10FFFF; Simple_Titlecase_Mapping; # Simple_Uppercase_Mapping (suc) # @missing: 0000..10FFFF; Simple_Uppercase_Mapping; # Soft_Dotted (SD) SD ; N ; No ; F ; False SD ; Y ; Yes ; T ; True # Terminal_Punctuation (Term) Term; N ; No ; F ; False Term; Y ; Yes ; T ; True # Titlecase_Mapping (tc) # @missing: 0000..10FFFF; Titlecase_Mapping; # Unicode_1_Name (na1) # @missing: 0000..10FFFF; Unicode_1_Name; # Unified_Ideograph (UIdeo) UIdeo; N ; No ; F ; False UIdeo; Y ; Yes ; T ; True # Uppercase (Upper) Upper; N ; No ; F ; False Upper; Y ; Yes ; T ; True # Uppercase_Mapping (uc) # @missing: 0000..10FFFF; Uppercase_Mapping; # Variation_Selector (VS) VS ; N ; No ; 
F ; False VS ; Y ; Yes ; T ; True # Vertical_Orientation (vo) vo ; R ; Rotated vo ; Tr ; Transformed_Rotated vo ; Tu ; Transformed_Upright vo ; U ; Upright # White_Space (WSpace) WSpace; N ; No ; F ; False WSpace; Y ; Yes ; T ; True # Word_Break (WB) WB ; CR ; CR WB ; DQ ; Double_Quote WB ; EB ; E_Base WB ; EBG ; E_Base_GAZ WB ; EM ; E_Modifier WB ; EX ; ExtendNumLet WB ; Extend ; Extend WB ; FO ; Format WB ; GAZ ; Glue_After_Zwj WB ; HL ; Hebrew_Letter WB ; KA ; Katakana WB ; LE ; ALetter WB ; LF ; LF WB ; MB ; MidNumLet WB ; ML ; MidLetter WB ; MN ; MidNum WB ; NL ; Newline WB ; NU ; Numeric WB ; RI ; Regional_Indicator WB ; SQ ; Single_Quote WB ; WSegSpace ; WSegSpace WB ; XX ; Other WB ; ZWJ ; ZWJ # XID_Continue (XIDC) XIDC; N ; No ; F ; False XIDC; Y ; Yes ; T ; True # XID_Start (XIDS) XIDS; N ; No ; F ; False XIDS; Y ; Yes ; T ; True # cjkAccountingNumeric (cjkAccountingNumeric) # @missing: 0000..10FFFF; cjkAccountingNumeric; NaN # cjkCompatibilityVariant (cjkCompatibilityVariant) # @missing: 0000..10FFFF; cjkCompatibilityVariant; # cjkIICore (cjkIICore) # @missing: 0000..10FFFF; cjkIICore; # cjkIRG_GSource (cjkIRG_GSource) # @missing: 0000..10FFFF; cjkIRG_GSource; # cjkIRG_HSource (cjkIRG_HSource) # @missing: 0000..10FFFF; cjkIRG_HSource; # cjkIRG_JSource (cjkIRG_JSource) # @missing: 0000..10FFFF; cjkIRG_JSource; # cjkIRG_KPSource (cjkIRG_KPSource) # @missing: 0000..10FFFF; cjkIRG_KPSource; # cjkIRG_KSource (cjkIRG_KSource) # @missing: 0000..10FFFF; cjkIRG_KSource; # cjkIRG_MSource (cjkIRG_MSource) # @missing: 0000..10FFFF; cjkIRG_MSource; # cjkIRG_SSource (cjkIRG_SSource) # @missing: 0000..10FFFF; cjkIRG_SSource; # cjkIRG_TSource (cjkIRG_TSource) # @missing: 0000..10FFFF; cjkIRG_TSource; # cjkIRG_UKSource (cjkIRG_UKSource) # @missing: 0000..10FFFF; cjkIRG_UKSource; # cjkIRG_USource (cjkIRG_USource) # @missing: 0000..10FFFF; cjkIRG_USource; # cjkIRG_VSource (cjkIRG_VSource) # @missing: 0000..10FFFF; cjkIRG_VSource; # cjkOtherNumeric (cjkOtherNumeric) # 
@missing: 0000..10FFFF; cjkOtherNumeric; NaN # cjkPrimaryNumeric (cjkPrimaryNumeric) # @missing: 0000..10FFFF; cjkPrimaryNumeric; NaN # cjkRSUnicode (cjkRSUnicode) # @missing: 0000..10FFFF; cjkRSUnicode; # kEH_Cat (kEH_Cat) # @missing: 0000..10FFFF; kEH_Cat; # kEH_Desc (kEH_Desc) # @missing: 0000..10FFFF; kEH_Desc; # kEH_HG (kEH_HG) # @missing: 0000..10FFFF; kEH_HG; # kEH_IFAO (kEH_IFAO) # @missing: 0000..10FFFF; kEH_IFAO; # kEH_JSesh (kEH_JSesh) # @missing: 0000..10FFFF; kEH_JSesh; # kEH_NoMirror (kEH_NoMirror) kEH_NoMirror; N ; No ; F ; False kEH_NoMirror; Y ; Yes ; T ; True # kEH_NoRotate (kEH_NoRotate) kEH_NoRotate; N ; No ; F ; False kEH_NoRotate; Y ; Yes ; T ; True # kMandarin (cjkMandarin) # @missing: 0000..10FFFF; kMandarin; # kTotalStrokes (cjkTotalStrokes) # @missing: 0000..10FFFF; kTotalStrokes; # kUnihanCore2020 (cjkUnihanCore2020) # @missing: 0000..10FFFF; kUnihanCore2020; # EOF ================================================ FILE: maint/Unicode.tables/ScriptExtensions.txt ================================================ # ScriptExtensions-17.0.0.txt # Date: 2025-08-01, 21:42:00 GMT # © 2025 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use and license, see https://www.unicode.org/terms_of_use.html # # Unicode Character Database # For documentation, see https://www.unicode.org/reports/tr44/ # # The Script_Extensions property indicates which characters are commonly used # with more than one script, but with a limited number of scripts. # For each code point, there is one or more property values. Each such value is a Script property value. 
# For more information, see: # UAX #24, Unicode Script Property: https://www.unicode.org/reports/tr24/ # Especially the sections: # https://www.unicode.org/reports/tr24/#Assignment_Script_Values # https://www.unicode.org/reports/tr24/#Assignment_ScriptX_Values # # Each Script_Extensions value in this file consists of a set # of one or more abbreviated Script property values. The ordering of the # values in that set is not material, but for stability in presentation # it is given here as alphabetical. # # All code points not explicitly listed for Script_Extensions # have as their value the corresponding Script property value. # # @missing: 0000..10FFFF;