Repository: Cysharp/Utf8StreamReader Branch: main Commit: 71d4683aef6e Files: 36 Total size: 130.3 KB Directory structure: gitextract_yfdau7jl/ ├── .editorconfig ├── .github/ │ ├── dependabot.yaml │ └── workflows/ │ ├── build-debug.yaml │ ├── build-release.yaml │ └── stale.yaml ├── .gitignore ├── Directory.Build.props ├── LICENSE ├── README.md ├── Utf8StreamReader.sln ├── opensource.snk ├── sandbox/ │ ├── Benchmark/ │ │ ├── Benchmark.csproj │ │ ├── BytesReadToEnd.cs │ │ ├── FromFile.cs │ │ ├── FromMemory.cs │ │ ├── Program.cs │ │ └── ReadToEndString.cs │ └── ConsoleApp1/ │ ├── ConsoleApp1.csproj │ ├── Program.cs │ ├── ReadMeSample.cs │ ├── RespReader.cs │ └── file1.txt ├── src/ │ └── Utf8StreamReader/ │ ├── SegmentedArrayBufferWriter.cs │ ├── Utf8StreamReader.cs │ ├── Utf8StreamReader.csproj │ └── Utf8TextReader.cs └── tests/ └── Utf8StreamReader.Tests/ ├── FakeMemoryStream.cs ├── FileReadTest.cs ├── ReadBlockTest.cs ├── ReadTest.cs ├── ReadToEndTest.cs ├── SegmentedArrayBufferWriterTest.cs ├── Tests.cs ├── TextReaderTest.cs ├── Utf8StreamReader.Tests.csproj └── file1.txt ================================================ FILE CONTENTS ================================================ ================================================ FILE: .editorconfig ================================================ # top-most EditorConfig file root = true [*] charset = utf-8 end_of_line = lf indent_style = space indent_size = 2 insert_final_newline = true trim_trailing_whitespace = true # Visual Studio Spell checker configs (https://learn.microsoft.com/en-us/visualstudio/ide/text-spell-checker?view=vs-2022#how-to-customize-the-spell-checker) spelling_exclusion_path = ./exclusion.dic [*.cs] indent_size = 4 charset = utf-8-bom end_of_line = unset # Solution files [*.{sln,slnx}] end_of_line = unset # MSBuild project files [*.{csproj,props,targets}] end_of_line = unset # Xml config files [*.{ruleset,config,nuspec,resx,runsettings,DotSettings}] end_of_line = unset 
[*{_AssemblyInfo.cs,.notsupported.cs}] generated_code = true # C# code style settings [*.{cs}] dotnet_style_coalesce_expression = true:suggestion dotnet_style_null_propagation = true:suggestion dotnet_style_prefer_is_null_check_over_reference_equality_method = true:suggestion dotnet_style_prefer_auto_properties = true:suggestion dotnet_style_object_initializer = true:suggestion dotnet_style_prefer_collection_expression = true:suggestion dotnet_style_collection_initializer = true:suggestion dotnet_style_prefer_simplified_boolean_expressions = true:suggestion dotnet_style_prefer_conditional_expression_over_assignment = true:silent dotnet_style_prefer_conditional_expression_over_return = true:silent dotnet_style_explicit_tuple_names = true:suggestion dotnet_style_prefer_inferred_tuple_names = true:suggestion dotnet_style_prefer_inferred_anonymous_type_member_names = true:suggestion dotnet_style_prefer_compound_assignment = true:suggestion dotnet_style_prefer_simplified_interpolation = true:suggestion dotnet_style_namespace_match_folder = true:suggestion dotnet_style_readonly_field = true:suggestion dotnet_style_predefined_type_for_member_access = true:suggestion dotnet_style_predefined_type_for_locals_parameters_members = true:suggestion dotnet_style_require_accessibility_modifiers = for_non_interface_members:silent dotnet_style_allow_statement_immediately_after_block_experimental = true:silent dotnet_style_allow_multiple_blank_lines_experimental = true:silent dotnet_code_quality_unused_parameters = non_public:suggestion dotnet_style_parentheses_in_relational_binary_operators = always_for_clarity:silent dotnet_style_parentheses_in_other_binary_operators = always_for_clarity:silent dotnet_style_parentheses_in_arithmetic_binary_operators = always_for_clarity:silent dotnet_style_parentheses_in_other_operators = never_if_unnecessary:silent dotnet_style_qualification_for_method = false:none dotnet_style_qualification_for_property = false:none 
dotnet_style_qualification_for_field = false:none dotnet_style_qualification_for_event = false:none # New line preferences csharp_new_line_before_open_brace = all csharp_new_line_before_else = true csharp_new_line_before_catch = true csharp_new_line_before_finally = true csharp_new_line_before_members_in_object_initializers = true csharp_new_line_before_members_in_anonymous_types = true csharp_new_line_between_query_expression_clauses = true # Indentation preferences csharp_indent_block_contents = true csharp_indent_braces = false csharp_indent_case_contents = true csharp_indent_case_contents_when_block = true csharp_indent_switch_labels = true csharp_indent_labels = one_less_than_current # Modifier preferences csharp_preferred_modifier_order = public,private,protected,internal,static,extern,new,virtual,abstract,sealed,override,readonly,unsafe,volatile,async:suggestion # avoid this. unless absolutely necessary dotnet_style_qualification_for_field = false:none dotnet_style_qualification_for_property = false:none dotnet_style_qualification_for_method = false:none dotnet_style_qualification_for_event = false:none # Types: use keywords instead of BCL types, and permit var only when the type is clear csharp_style_var_for_built_in_types = false:none csharp_style_var_when_type_is_apparent = false:none csharp_style_var_elsewhere = false:none dotnet_style_predefined_type_for_locals_parameters_members = true:suggestion dotnet_style_predefined_type_for_member_access = true:suggestion # name all constant fields using PascalCase dotnet_naming_rule.constant_fields_should_be_pascal_case.severity = suggestion dotnet_naming_rule.constant_fields_should_be_pascal_case.symbols = constant_fields dotnet_naming_rule.constant_fields_should_be_pascal_case.style = pascal_case_style dotnet_naming_symbols.constant_fields.applicable_kinds = field dotnet_naming_symbols.constant_fields.required_modifiers = const dotnet_naming_style.pascal_case_style.capitalization = pascal_case # static fields 
dotnet_naming_rule.static_fields_should_have_prefix.severity = none dotnet_naming_rule.static_fields_should_have_prefix.symbols = static_fields dotnet_naming_rule.static_fields_should_have_prefix.style = static_prefix_style dotnet_naming_symbols.static_fields.applicable_kinds = field dotnet_naming_symbols.static_fields.required_modifiers = static dotnet_naming_symbols.static_fields.applicable_accessibilities = private, internal, private_protected dotnet_naming_style.static_prefix_style.required_prefix = s_ dotnet_naming_style.static_prefix_style.capitalization = camel_case # internal and private fields dotnet_naming_rule.camel_case_for_private_internal_fields.severity = none dotnet_naming_rule.camel_case_for_private_internal_fields.symbols = private_internal_fields dotnet_naming_rule.camel_case_for_private_internal_fields.style = camel_case_underscore_style dotnet_naming_symbols.private_internal_fields.applicable_kinds = field dotnet_naming_symbols.private_internal_fields.applicable_accessibilities = private, internal dotnet_naming_style.camel_case_underscore_style.required_prefix = _ dotnet_naming_style.camel_case_underscore_style.capitalization = camel_case # Code style defaults csharp_using_directive_placement = outside_namespace:suggestion csharp_prefer_braces = true:silent csharp_preserve_single_line_blocks = true:none csharp_preserve_single_line_statements = false:none csharp_prefer_static_local_function = true:suggestion csharp_prefer_simple_using_statement = false:none csharp_style_prefer_switch_expression = true:suggestion # Code quality dotnet_style_readonly_field = true:suggestion dotnet_code_quality_unused_parameters = non_public:suggestion # Expression-level preferences dotnet_style_object_initializer = true:suggestion dotnet_style_collection_initializer = true:suggestion dotnet_style_explicit_tuple_names = true:suggestion dotnet_style_coalesce_expression = true:suggestion dotnet_style_null_propagation = true:suggestion 
dotnet_style_prefer_is_null_check_over_reference_equality_method = true:suggestion dotnet_style_prefer_inferred_tuple_names = true:suggestion dotnet_style_prefer_inferred_anonymous_type_member_names = true:suggestion dotnet_style_prefer_auto_properties = true:suggestion dotnet_style_prefer_conditional_expression_over_assignment = true:silent dotnet_style_prefer_conditional_expression_over_return = true:silent csharp_prefer_simple_default_expression = true:suggestion # Expression-bodied members csharp_style_expression_bodied_methods = true:silent csharp_style_expression_bodied_constructors = true:silent csharp_style_expression_bodied_operators = true:silent csharp_style_expression_bodied_properties = true:silent csharp_style_expression_bodied_indexers = true:silent csharp_style_expression_bodied_accessors = true:silent csharp_style_expression_bodied_lambdas = true:silent csharp_style_expression_bodied_local_functions = true:silent # Pattern matching csharp_style_pattern_matching_over_is_with_cast_check = true:suggestion csharp_style_pattern_matching_over_as_with_null_check = true:suggestion csharp_style_inlined_variable_declaration = true:suggestion # Null checking preferences csharp_style_throw_expression = true:suggestion csharp_style_conditional_delegate_call = true:suggestion # Other features csharp_style_prefer_index_operator = false:none csharp_style_prefer_range_operator = false:none csharp_style_pattern_local_over_anonymous_function = false:none # Space preferences csharp_space_after_cast = false csharp_space_after_colon_in_inheritance_clause = true csharp_space_after_comma = true csharp_space_after_dot = false csharp_space_after_keywords_in_control_flow_statements = true csharp_space_after_semicolon_in_for_statement = true csharp_space_around_binary_operators = before_and_after csharp_space_around_declaration_statements = do_not_ignore csharp_space_before_colon_in_inheritance_clause = true csharp_space_before_comma = false csharp_space_before_dot = false 
csharp_space_before_open_square_brackets = false csharp_space_before_semicolon_in_for_statement = false csharp_space_between_empty_square_brackets = false csharp_space_between_method_call_empty_parameter_list_parentheses = false csharp_space_between_method_call_name_and_opening_parenthesis = false csharp_space_between_method_call_parameter_list_parentheses = false csharp_space_between_method_declaration_empty_parameter_list_parentheses = false csharp_space_between_method_declaration_name_and_open_parenthesis = false csharp_space_between_method_declaration_parameter_list_parentheses = false csharp_space_between_parentheses = false csharp_space_between_square_brackets = false # Analyzers dotnet_code_quality.CA1052.api_surface = private, internal dotnet_code_quality.CA1802.api_surface = private, internal dotnet_code_quality.CA1822.api_surface = private, internal dotnet_code_quality.CA2208.api_surface = public # IDE0008: Use explicit type dotnet_diagnostic.IDE0008.severity = none # IDE0090: Use 'new(...)' dotnet_diagnostic.IDE0090.severity = none # IDE0040: Add accessibility modifiers dotnet_diagnostic.IDE0040.severity = none # Nullability in reference types of interface implemented by the base type doesn't match dotnet_diagnostic.CS8644.severity = none dotnet_diagnostic.CA1816.severity = none dotnet_diagnostic.IDE1006.severity = none #Remove unnecessary suppression dotnet_diagnostic.IDE0079.severity = none dotnet_diagnostic.IDE0130.severity = none dotnet_diagnostic.CA1822.severity = none csharp_style_prefer_switch_expression = false:suggestion csharp_style_pattern_matching_over_as_with_null_check = false:suggestion dotnet_naming_symbols.functional_symbols.applicable_kinds = property,method,event,delegate dotnet_naming_style.pascal_case_style.capitalization = pascal_case dotnet_naming_rule.functional_symbols_must_be_capitalized.symbols = functional_symbols dotnet_naming_rule.functional_symbols_must_be_capitalized.style = pascal_case_style 
dotnet_naming_rule.functional_symbols_must_be_capitalized.severity = warning dotnet_naming_symbols.public_symbols.applicable_kinds = property,method,field,event,delegate dotnet_naming_symbols.public_symbols.applicable_accessibilities = public dotnet_naming_symbols.public_symbols.required_modifiers = readonly dotnet_naming_style.first_word_upper_case_style.capitalization = first_word_upper dotnet_naming_rule.public_members_must_be_capitalized.symbols = public_symbols dotnet_naming_rule.public_members_must_be_capitalized.style = first_word_upper_case_style dotnet_naming_rule.public_members_must_be_capitalized.severity = warning csharp_style_expression_bodied_methods = false:silent csharp_style_expression_bodied_constructors = false:silent csharp_style_expression_bodied_operators = false:silent csharp_style_namespace_declarations = file_scoped:suggestion csharp_style_prefer_method_group_conversion = true:silent csharp_style_prefer_top_level_statements = true:silent csharp_style_prefer_primary_constructors = true:suggestion csharp_style_prefer_null_check_over_type_check = true:suggestion csharp_style_prefer_local_over_anonymous_function = true:suggestion csharp_style_implicit_object_creation_when_type_is_apparent = true:suggestion csharp_style_prefer_tuple_swap = true:suggestion csharp_style_prefer_utf8_string_literals = true:suggestion csharp_style_deconstructed_variable_declaration = true:suggestion csharp_style_unused_value_assignment_preference = discard_variable:suggestion csharp_style_unused_value_expression_statement_preference = discard_variable:silent csharp_style_prefer_readonly_struct_member = true:suggestion csharp_style_prefer_readonly_struct = true:suggestion csharp_style_allow_embedded_statements_on_same_line_experimental = true:silent csharp_style_allow_blank_line_after_token_in_arrow_expression_clause_experimental = true:silent csharp_style_allow_blank_line_after_token_in_conditional_expression_experimental = true:silent 
csharp_style_allow_blank_line_after_colon_in_constructor_initializer_experimental = true:silent csharp_style_allow_blank_lines_between_consecutive_braces_experimental = true:silent csharp_style_prefer_pattern_matching = true:silent csharp_style_prefer_extended_property_pattern = true:suggestion csharp_style_prefer_not_pattern = true:suggestion ================================================ FILE: .github/dependabot.yaml ================================================ # ref: https://docs.github.com/en/code-security/dependabot/working-with-dependabot/keeping-your-actions-up-to-date-with-dependabot version: 2 updates: - package-ecosystem: "github-actions" directory: "/" schedule: interval: "weekly" # Check for updates to GitHub Actions every week groups: dependencies: patterns: - "*" cooldown: default-days: 14 # Wait 14 days before creating another PR for the same dependency. This will prevent vulnerability on the package impact. ignore: # I just want update action when major/minor version is updated. patch updates are too noisy. - dependency-name: "*" update-types: - version-update:semver-patch ================================================ FILE: .github/workflows/build-debug.yaml ================================================ name: Build-Debug on: push: branches: - "main" pull_request: branches: - "main" jobs: build-dotnet: permissions: contents: read runs-on: ubuntu-24.04 timeout-minutes: 10 steps: - uses: Cysharp/Actions/.github/actions/checkout@main - uses: Cysharp/Actions/.github/actions/setup-dotnet@main - run: dotnet build -c Release - run: dotnet test -c Release --no-build - run: dotnet pack -c Release --no-build -p:IncludeSymbols=true -p:SymbolPackageFormat=snupkg -o $GITHUB_WORKSPACE/artifacts ================================================ FILE: .github/workflows/build-release.yaml ================================================ name: Build-Release on: workflow_dispatch: inputs: tag: description: "tag: git tag you want create. 
(sample 1.0.0)" required: true dry-run: description: "dry-run: true will never create relase/nuget." required: true default: false type: boolean jobs: build-dotnet: permissions: contents: read runs-on: ubuntu-24.04 timeout-minutes: 10 steps: - uses: Cysharp/Actions/.github/actions/checkout@main - uses: Cysharp/Actions/.github/actions/setup-dotnet@main - run: dotnet build -c Release -p:Version=${{ inputs.tag }} - run: dotnet test -c Release --no-build - run: dotnet pack -c Release --no-build -p:Version=${{ inputs.tag }} -o ./publish # Store artifacts. - uses: Cysharp/Actions/.github/actions/upload-artifact@main with: name: nuget path: ./publish/ # release create-release: needs: [build-dotnet] permissions: contents: write id-token: write # required for NuGet Trusted Publish uses: Cysharp/Actions/.github/workflows/create-release.yaml@main with: commit-id: "" dry-run: ${{ inputs.dry-run }} tag: ${{ inputs.tag }} nuget-push: true release-upload: false secrets: inherit ================================================ FILE: .github/workflows/stale.yaml ================================================ name: "Close stale issues" on: workflow_dispatch: schedule: - cron: "0 0 * * *" jobs: stale: permissions: contents: read pull-requests: write issues: write uses: Cysharp/Actions/.github/workflows/stale-issue.yaml@main ================================================ FILE: .gitignore ================================================ # Build Folders (you can keep bin if you'd like, to store dlls and pdbs) [Bb]in/ [Oo]bj/ # mstest test results TestResults ## Ignore Visual Studio temporary files, build results, and ## files generated by popular Visual Studio add-ons. 
# User-specific files *.suo *.user *.sln.docstates # Build results [Dd]ebug/ [Rr]elease/ x64/ *_i.c *_p.c *.ilk *.obj *.pch *.pdb *.pgc *.pgd *.rsp *.sbr *.tlb *.tli *.tlh *.tmp *.log *.vspscc *.vssscc .builds # Visual C++ cache files ipch/ *.aps *.ncb *.opensdf *.sdf # Visual Studio profiler *.psess *.vsp *.vspx # Guidance Automation Toolkit *.gpState # ReSharper is a .NET coding add-in _ReSharper* # NCrunch *.ncrunch* .*crunch*.local.xml # Installshield output folder [Ee]xpress # DocProject is a documentation generator add-in DocProject/buildhelp/ DocProject/Help/*.HxT DocProject/Help/*.HxC DocProject/Help/*.hhc DocProject/Help/*.hhk DocProject/Help/*.hhp DocProject/Help/Html2 DocProject/Help/html # Click-Once directory publish # Publish Web Output *.Publish.xml # NuGet Packages Directory packages # Windows Azure Build Output csx *.build.csdef # Windows Store app package directory AppPackages/ # Others [Bb]in [Oo]bj sql TestResults [Tt]est[Rr]esult* *.Cache ClientBin [Ss]tyle[Cc]op.* ~$* *.dbmdl Generated_Code #added for RIA/Silverlight projects # Backup & report files from converting an old project file to a newer # Visual Studio version. Backup files are not needed, because we have git ;-) _UpgradeReport_Files/ Backup*/ UpgradeLog*.XML .vs/config/applicationhost.config .vs/restore.dg # OTHER nuget/tools/* *.nupkg .vs **/.DS_Store .idea # publish directory out/ *.tsbuildinfo # BenchmarkDotNet Artifacts BenchmarkDotNet.Artifacts/ ================================================ FILE: Directory.Build.props ================================================ true $(MSBuildThisFileDirectory)opensource.snk false $(Version) Cysharp Cysharp © Cysharp, Inc. https://github.com/Cysharp/Utf8StreamReader README.md $(PackageProjectUrl) git MIT Icon.png ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2024 Cysharp, Inc. 
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================ # Utf8StreamReader [![GitHub Actions](https://github.com/Cysharp/Utf8StreamReader/workflows/Build-Debug/badge.svg)](https://github.com/Cysharp/Utf8StreamReader/actions) [![Releases](https://img.shields.io/github/release/Cysharp/Utf8StreamReader.svg)](https://github.com/Cysharp/Utf8StreamReader/releases) [![NuGet package](https://img.shields.io/nuget/v/Utf8StreamReader.svg)](https://nuget.org/packages/Utf8StreamReader) Utf8 based StreamReader for high performance text processing. In addition to UTF-8 based binary processing, it can also be used as a high-performance replacement for StreamReader and as a helper for fast binary reading. Avoiding unnecessary string allocation is a fundamental aspect of recent .NET performance improvements. 
Given that most file and network data is in UTF8, features like [JsonSerializer](https://learn.microsoft.com/en-us/dotnet/api/system.text.json.jsonserializer?view=net-8.0) and [IUtf8SpanParsable](https://learn.microsoft.com/en-us/dotnet/api/system.iutf8spanparsable-1?view=net-8.0), which operate on UTF8-based data, have been added. More recently, methods like [.NET8 MemoryExtensions.Split](https://learn.microsoft.com/en-us/dotnet/api/system.memoryextensions.split?view=net-8.0), which avoid allocations, have also been introduced. However, for the most common use case of parsing strings delimited by newlines, only the traditional [StreamReader](https://learn.microsoft.com/en-us/dotnet/api/system.io.streamreader) is provided, which generates a new String for each line, resulting in a large amount of allocations. ![image](https://github.com/Cysharp/Utf8StringInterpolation/assets/46207/ac8d2c7f-65fb-4dc1-b9f5-73219f036e58) > Read simple 1000000 lines text Incredibly, there is a **240,000 times** difference! While it is possible to process data in UTF8 format using standard classes like [PipeReader](https://learn.microsoft.com/en-us/dotnet/api/system.io.pipelines.pipereader?view=dotnet-plat-ext-8.0) and [SequenceReader](https://learn.microsoft.com/en-us/dotnet/api/system.buffers.sequencereader-1?view=net-8.0), they are generic libraries, so properly handling newline processing requires considerable effort (handling the BOM and multiple types of newline characters). `Utf8StreamReader` provides a familiar API similar to StreamReader, making it easy to use, while its ReadLine-specific implementation maximizes performance. By using optimized internal processing, higher performance can be achieved when reading Strings from Files compared to using the standard `StreamReader.ReadToEnd` or `File.ReadAllText` methods. 
![image](https://github.com/Cysharp/Utf8StreamReader/assets/46207/f2dc965a-768a-4069-a3e3-387f5279421a) > Read from file(1000000 lines text) ```csharp [Benchmark] public async Task<string> StreamReaderReadToEndAsync() { using var sr = new System.IO.StreamReader(filePath); return await sr.ReadToEndAsync(); } [Benchmark] public async Task<string> Utf8TextReaderReadToEndAsync() { using var sr = new Cysharp.IO.Utf8StreamReader(filePath).AsTextReader(); return await sr.ReadToEndAsync(); } [Benchmark] public async Task<string> FileReadAllTextAsync() { return await File.ReadAllTextAsync(filePath); } ``` For an explanation of the performance difference, please refer to the [ReadString Section](#readstring). ## Getting Started This library is distributed via NuGet, supporting `.NET Standard 2.1`, `.NET 6(.NET 7)` and `.NET 8` or above. For information on usage with Unity, please refer to the [Unity Section](#unity). PM> Install-Package [Utf8StreamReader](https://www.nuget.org/packages/Utf8StreamReader) The basic API involves `using var streamReader = new Utf8StreamReader(stream);` and then `ReadOnlyMemory<byte> line = await streamReader.ReadLineAsync();`. When enumerating all lines, you can choose from three styles: ```csharp using Cysharp.IO; // namespace of Utf8StreamReader public async Task Sample1(Stream stream) { using var reader = new Utf8StreamReader(stream); // Most performant style, similar to System.Threading.Channels while (await reader.LoadIntoBufferAsync()) { while (reader.TryReadLine(out var line)) { // line is ReadOnlyMemory<byte>, deserialize UTF8 directly. _ = JsonSerializer.Deserialize(line.Span); } } } public async Task Sample2(Stream stream) { using var reader = new Utf8StreamReader(stream); // Classical style, same as StreamReader ReadOnlyMemory<byte>? 
line = null; while ((line = await reader.ReadLineAsync()) != null) { _ = JsonSerializer.Deserialize(line.Value.Span); } } public async Task Sample3(Stream stream) { using var reader = new Utf8StreamReader(stream); // Easiest style, use async streams await foreach (var line in reader.ReadAllLinesAsync()) { _ = JsonSerializer.Deserialize(line.Span); } } ``` From a performance perspective, `Utf8StreamReader` only provides asynchronous APIs. Theoretically, the highest performance can be achieved by combining `LoadIntoBufferAsync` and `TryReadLine` in a double while loop. This is similar to the combination of `WaitToReadAsync` and `TryRead` in [Channels](https://learn.microsoft.com/en-us/dotnet/core/extensions/channels). `ReadLineAsync`, like StreamReader.ReadLine, returns null to indicate that the end has been reached. `ReadAllLinesAsync` returns an `IAsyncEnumerable<ReadOnlyMemory<byte>>`. Although there is a performance difference, it is minimal, so this API is ideal when you want to use it easily. All asynchronous methods accept a `CancellationToken` and support cancellation. For a real-world usage example, refer to [StreamMessageReader.cs](https://github.com/Cysharp/Claudia/blob/main/src/Claudia/StreamMessageReader.cs) in [Cysharp/Claudia](https://github.com/Cysharp/Claudia/), a C# SDK for Anthropic Claude, which parses [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events). ## Buffer Lifetimes The `ReadOnlyMemory<byte>` returned from `ReadLineAsync` or `TryReadLine` is only valid until the next call to `LoadIntoBufferAsync` or `TryReadLine` or `ReadLineAsync`. Since the data is shared with the internal buffer, it may be overwritten, moved, or returned on the next call, so the safety of the data cannot be guaranteed. The received data must be promptly parsed and converted into a separate object. If you want to keep the data as is, use `ToArray()` to convert it to a `byte[]`. 
This design is similar to [System.IO.Pipelines](https://learn.microsoft.com/en-us/dotnet/standard/io/pipelines). ## Read as `ReadOnlyMemory<char>` You can convert it to a `Utf8TextReader` that extracts `ReadOnlyMemory<char>` or `string`. Although there is a conversion cost, it is still fast and low allocation, so it can be used as an alternative to `StreamReader`. ![image](https://github.com/Cysharp/Utf8StreamReader/assets/46207/d77af0fd-76af-46ce-8261-0863e4ab7109) After converting with `AsTextReader()`, all the same methods (`TryReadLine`, `ReadLineAsync`, `LoadIntoBufferAsync`, `ReadAllLinesAsync`) can be used. ```csharp using var sr = new Cysharp.IO.Utf8StreamReader(ms).AsTextReader(); while (await sr.LoadIntoBufferAsync()) { while (sr.TryReadLine(out var line)) { // line is ReadOnlyMemory<char>, you can add to StringBuilder or other parsing method. // If you need a string, ReadOnlyMemory<char>.ToString() builds a string instance // string str = line.ToString(); } } ``` You can perform text processing without allocation, such as splitting `ReadOnlySpan<char>` using [MemoryExtensions.Split](https://learn.microsoft.com/en-us/dotnet/api/system.memoryextensions.split?view=net-8.0#system-memoryextensions-split(system-readonlyspan((system-char))-system-span((system-range))-system-char-system-stringsplitoptions)), and concatenate the results using StringBuilder's [`Append/AppendLine(ReadOnlySpan<char>)`](https://learn.microsoft.com/en-us/dotnet/api/system.text.stringbuilder.append). This way, string-based processing can be done with much lower allocation compared to `StreamReader`. When a string is needed, you can convert `ReadOnlyMemory<char>` to a string using `ToString()`. Even with the added string conversion, the performance is higher than `StreamReader`, so it can be used as a better alternative. ## Optimizing FileStream Similar to `StreamReader`, `Utf8StreamReader` has the ability to open a `FileStream` by accepting a `string path`. 
```csharp public Utf8StreamReader(string path, FileOpenMode fileOpenMode = FileOpenMode.Throughput) public Utf8StreamReader(string path, int bufferSize, FileOpenMode fileOpenMode = FileOpenMode.Throughput) public Utf8StreamReader(string path, FileStreamOptions options) public Utf8StreamReader(string path, FileStreamOptions options, int bufferSize) ``` Unfortunately, the `FileStream` used by `StreamReader` is not optimized for modern .NET. For example, when using `FileStream` with asynchronous methods, it should be opened with `useAsync: true` for optimal performance. However, since `StreamReader` has both synchronous and asynchronous methods in its API, false is specified. Additionally, although `StreamReader` itself has a buffer and `FileStream` does not require a buffer, the buffer of `FileStream` is still being utilized. It is difficult to handle `FileStream` correctly with high performance. By specifying a `string path`, the stream is opened with options optimized for `Utf8StreamReader`, so it is recommended to use this overload rather than opening `FileStream` yourself. The following is a benchmark of `FileStream`. ![image](https://github.com/Cysharp/Utf8StreamReader/assets/46207/83936827-2380-414a-9778-f53252689eb7) `Utf8StreamReader` opens `FileStream` with the following settings: ```csharp var useAsync = (fileOpenMode == FileOpenMode.Scalability); new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.Read, bufferSize: 1, useAsync: useAsync) ``` Due to historical reasons, the options for `FileStream` are odd, but by setting `bufferSize` to 1, you can avoid the use of internal buffers. `FileStream` has been significantly revamped in .NET 6, and by controlling the setting of this option and the way `Utf8StreamReader` is called as a whole, it can function as a thin wrapper around the fast [RandomAccess.ReadAsync](https://learn.microsoft.com/en-us/dotnet/api/system.io.randomaccess.readasync), allowing you to avoid most of the overhead of FileStream. 
`FileOpenMode` is a proprietary option of `Utf8StreamReader`. ```csharp public enum FileOpenMode { Scalability, Throughput } ``` In a Windows environment, the table in the [IO section of the Performance Improvements in .NET 6 blog](https://devblogs.microsoft.com/dotnet/performance-improvements-in-net-6/#io) shows that throughput decreases when `useAsync: true` is used. | Method | Runtime | IsAsync | BufferSize | Mean | | - | - | - | - | - | | ReadAsync | .NET 6.0 | True | 1 | 119.573 ms | | ReadAsync | .NET 6.0 | False | 1 | 36.018 ms | By setting `Utf8StreamReader` to `FileOpenMode.Scalability`, true async I/O is enabled and scalability is prioritized. If set to `FileOpenMode.Throughput`, it internally becomes sync-over-async and consumes the ThreadPool, but reduces the overhead of asynchronous I/O and improves throughput. For code that is executed frequently within a server application, `Scalability` will likely yield the best performance characteristics; for batch applications, `Throughput` will. The default is `Throughput`. (In the current .NET implementation, both seem to be the same (similar to Throughput on Windows) in Linux environments.) In `Utf8StreamReader`, by carefully adjusting the buffer size on the `Utf8StreamReader` side, the performance difference is minimized. Please refer to the above benchmark results image for specific values. For overloads that accept `FileStreamOptions`, the above settings are not reflected, so please adjust them manually. ## ReadString By combining the above FileStream optimization with `.AsTextReader().ReadToEndAsync()`, you can achieve higher performance when reading out a `string` compared to `StreamReader.ReadToEnd` or `File.ReadAllText`. ![image](https://github.com/Cysharp/Utf8StreamReader/assets/46207/f2dc965a-768a-4069-a3e3-387f5279421a) The implementation of `File.ReadAllText` in dotnet/runtime uses `StreamReader.ReadToEnd`, so they are almost the same. 
However, in the case of `File.ReadAllText`, it uses `useAsync: true` when opening the `FileStream`. That accounts for the performance difference in the benchmark. Another significant difference in the implementation is that `Utf8StreamReader` generates a `string` without using `StringBuilder`. `StreamReader.ReadToEnd` generates a string using the following flow: `byte[] buffer` -> `char[] decodeBuffer` -> `StringBuilder.Append(char[])` -> `StringBuilder.ToString()`, but there are removable inefficiencies. Both `char[]` and `StringBuilder` are `char[]` buffers, and copying occurs. By generating a `string` directly from `char[]`, the copy to the internal buffer of `StringBuilder` can be eliminated. In `Utf8StreamReader`'s `.AsTextReader().ReadToEndAsync()`, it receives streaming data in read buffer units from `Utf8StreamReader` (`ReadToEndChunksAsync`), converts it to `char[]` chunks using `Decoder`, and generates the string all at once using `string.Create`. ```csharp // Utf8TextReader is a helper class for ReadOnlyMemory and string generation that internally holds Utf8StreamReader public async ValueTask ReadToEndAsync(CancellationToken cancellationToken = default) { // Using a method similar to .NET 9 LINQ to Objects's ToArray improvement, returns a structure optimized for gap-free sequential expansion // StreamReader.ReadToEnd copies the buffer to a StringBuilder, but this implementation holds char[] chunks(char[][]) without copying. 
using var writer = new SegmentedArrayBufferWriter(); var decoder = Encoding.UTF8.GetDecoder(); // Utf8StreamReader.ReadToEndChunksAsync returns the internal buffer ReadOnlyMemory as an asynchronous sequence upon each read completion await foreach (var chunk in reader.ReadToEndChunksAsync(cancellationToken).ConfigureAwait(reader.ConfigureAwait)) { var input = chunk; while (input.Length != 0) { // The Decoder directly writes from the read buffer to the char[] buffer decoder.Convert(input.Span, writer.GetMemory().Span, flush: false, out var bytesUsed, out var charsUsed, out var completed); input = input.Slice(bytesUsed); writer.Advance(charsUsed); } } decoder.Convert([], writer.GetMemory().Span, flush: true, out _, out var finalCharsUsed, out _); writer.Advance(finalCharsUsed); // Directly generate a string from the char[][] buffer using String.Create return string.Create(writer.WrittenCount, writer, static (stringSpan, writer) => { foreach (var item in writer.GetSegmentsAndDispose()) { item.Span.CopyTo(stringSpan); stringSpan = stringSpan.Slice(item.Length); } }); } ``` SegmentedArrayBufferWriter borrows the idea (which I proposed) from [the performance improvement of ToArray in LINQ in .NET 9](https://github.com/dotnet/runtime/pull/96570), and internally holds an InlineArray that expands by equal multipliers. 
```csharp [StructLayout(LayoutKind.Sequential)] struct InlineArray19 { public const int InitialSize = 8192; T[] array00; // 8192 T[] array01; // 16384 T[] array02; // 32768 T[] array03; // 65536 T[] array04; // 131072 T[] array05; // 262144 T[] array06; // 524288 T[] array07; // 1048576 T[] array08; // 2097152 T[] array09; // 4194304 T[] array10; // 8388608 T[] array11; // 16777216 T[] array12; // 33554432 T[] array13; // 67108864 T[] array14; // 134217728 T[] array15; // 268435456 T[] array16; // 536870912 T[] array17; // 1073741824 T[] array18; // Array.MaxLength - total public T[] this[int i] { [MethodImpl(MethodImplOptions.AggressiveInlining)] get { if (i < 0 || i > 18) Throw(); return Unsafe.Add(ref array00, i); } [MethodImpl(MethodImplOptions.AggressiveInlining)] set { if (i < 0 || i > 18) Throw(); Unsafe.Add(ref array00, i) = value; } } void Throw() { throw new ArgumentOutOfRangeException(); } } ``` With these optimizations for both reading and writing, we achieved several times the speedup compared to the .NET standard library. ## Binary Read `TryPeek`, `PeekAsync`, `TryRead`, `ReadAsync`, `TryReadBlock`, and `ReadBlockAsync` enable reading as binary, irrespective of newline codes. For example, [Redis's protocol, RESP](https://redis.io/docs/latest/develop/reference/protocol-spec/), is a text protocol and typically newline-delimited, but after `$N`, it requires reading N bytes (BulkString). For instance, `$5\r\nhello\r\n` means reading 5 bytes. 
Here's an example of how it can be parsed: ```csharp // $5\r\nhello\r\n var line = await reader.ReadLineAsync(); // $5(+ consumed \r\n) if (line.Value.Span[0] == (byte)'$') { Utf8Parser.TryParse(line.Value.Span.Slice(1), out int size, out _); // 5 var block = await reader.ReadBlockAsync(size); // hello await reader.ReadLineAsync(); // consume \r\n Console.WriteLine(Encoding.UTF8.GetString(block.Span)); } ``` A sample that parses all RESP code is available in [RespReader.cs](https://github.com/Cysharp/Utf8StreamReader/blob/e400444/sandbox/ConsoleApp1/RespReader.cs). Additionally, when using `LoadIntoBufferAsync` and `LoadIntoBufferAtLeastAsync` to include data in the buffer, using `Try***` allows for more efficient execution. ```csharp while (await reader.LoadIntoBufferAsync()) { while (reader.TryReadLine(out var line)) { switch (line.Span[0]) { case (byte)'$': Utf8Parser.TryParse(line.Span.Slice(1), out int size, out _); if (!reader.TryReadBlock(size + 2, out var block)) // +2 is \r\n { // ReadBlockAsync is TryReadBlock + LoadIntoBufferAtLeastAsync block = await reader.ReadBlockAsync(size + 2); } yield return block.Slice(0, size); break; // and others('+', '-', ':', '*') default: break; } } } ``` When using `ReadToEndAsync`, you can obtain a `byte[]` using Utf8StreamReader's efficient binary reading/concatenation (`SegmentedArrayBufferWriter, InlineArray19`). ```csharp using var reader = new Utf8StreamReader(stream); byte[] bytes = await reader.ReadToEndAsync(); ``` `ReadToEndAsync()` has two optional overloads, `(bool disableBomCheck)` and `(long resultSizeHint)`. If `disableBomCheck` is true, it disables the BOM check/trim and always performs a complete binary-matching read. The default for `ReadToEndAsync` is true, which always expects a binary-matching read. If false, it follows Utf8StreamReader.SkipBom. 
`resultSizeHint` allows for reducing the copy cost by directly generating `new byte[resultSizeHint]` when the final binary size is known and reading directly into that buffer. When reading a file, i.e., when the `Stream` is a `FileStream` and seekable, `FileStream.Length` is used as the resultSizeHint as an optimization. Here is the performance comparison between copying a normal `Stream` to a `MemoryStream` by `CopyToAsync` and using `ToArray`, and using `ReadToEndAsync` of `Utf8StreamReader` when converting to `byte[]`. The options are adjusted so that optimization does not occur when directly passing FileStream to Utf8StreamReader, in order to intentionally avoid optimization. ![image](https://github.com/Cysharp/Utf8StreamReader/assets/46207/5d8fc9a3-8455-43de-ab8a-80a0963f2638) ```csharp [Benchmark] public async Task MemoryStreamCopyToToArray() { using var fs = new FileStream(filePath, FileMode.Open); var ms = new MemoryStream(); await fs.CopyToAsync(ms); return ms.ToArray(); } [Benchmark] public async Task Utf8StreamReaderReadToEndAsync() { using var fs = new FileStream(filePath, FileMode.Open); using var sr = new Cysharp.IO.Utf8StreamReader(fs); return await sr.ReadToEndAsync(disableBomCheck: false); // hack for disable optimize(for benchmark fairness) } ``` ## Reset `Utf8StreamReader` is a class that supports reuse. By calling `Reset()`, the Stream and internal state are released. Using `Reset(Stream)`, it can be reused with a new `Stream`. ## Options The constructor accepts `int bufferSize` and `bool leaveOpen` as parameters. `int bufferSize` defaults to 65536 and the buffer is rented from `ArrayPool`. If the data per line is large, changing the buffer size may improve performance. When the buffer size and the size per line are close, frequent buffer copy operations occur, leading to performance degradation. `bool leaveOpen` determines whether the internal Stream is also disposed when the object is disposed.
The default is `false`, which means the Stream is disposed. Additionally, there are init properties that allow changing the option values for `ConfigureAwait`, `SyncRead` and `SkipBom`. `bool ConfigureAwait { init; }` allows you to specify the value for `ConfigureAwait(bool continueOnCapturedContext)` when awaiting asynchronous methods internally. The default is `false`. `bool SyncRead { init; }` configures the Stream to use synchronous reading, meaning it will use Read instead. This causes all Async operations to complete synchronously. There is potential for slight performance improvements when a `FileStream` is opened with `useAsync:false`. Normally, leaving it as false is fine. The default is `false`. `bool SkipBom { init; }` determines whether to identify and skip the BOM (Byte Order Mark) included at the beginning of the data during the first read. The default is `true`, which means the BOM is skipped. This is currently not configurable: `Utf8StreamReader` recognizes only `CRLF(\r\n)` and `LF(\n)` as newline characters. Since environments that use `CR(\r)` are now extremely rare, the CR check is omitted for performance reasons. If you need this functionality, please let us know by creating an Issue. We will consider adding it as an option. Unity --- Unity, which supports .NET Standard 2.1, can run this library. Since the library is only provided through NuGet, it is recommended to use [NuGetForUnity](https://github.com/GlitchEnzo/NuGetForUnity) for installation. For detailed instructions on using NuGet libraries in Unity, please refer to the documentation of [Cysharp/R3](https://github.com/Cysharp/R3/) and other similar resources. License --- This library is under the MIT License. 
================================================ FILE: Utf8StreamReader.sln ================================================  Microsoft Visual Studio Solution File, Format Version 12.00 # Visual Studio Version 17 VisualStudioVersion = 17.8.34330.188 MinimumVisualStudioVersion = 10.0.40219.1 Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "src", "src", "{BD07BD08-1CB4-41AE-B2BD-3975BE13B8EC}" EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Utf8StreamReader", "src\Utf8StreamReader\Utf8StreamReader.csproj", "{983561F1-F180-4188-AE80-BFA95FD69656}" EndProject Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "tests", "tests", "{5A8808D6-63E0-48EE-A115-0380E0E57156}" EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Utf8StreamReader.Tests", "tests\Utf8StreamReader.Tests\Utf8StreamReader.Tests.csproj", "{6C953584-A04B-42C7-9CF3-267AFB010C2B}" EndProject Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "sandbox", "sandbox", "{6BA94544-B2DF-4DD2-9390-DAA8AF5CA90A}" EndProject Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "ConsoleApp1", "sandbox\ConsoleApp1\ConsoleApp1.csproj", "{27B89B32-EC1A-48B0-BFC9-6172FCCE2961}" EndProject Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Benchmark", "sandbox\Benchmark\Benchmark.csproj", "{48293CC8-A87C-4F59-A398-51CD37E6B62B}" EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU Release|Any CPU = Release|Any CPU EndGlobalSection GlobalSection(ProjectConfigurationPlatforms) = postSolution {983561F1-F180-4188-AE80-BFA95FD69656}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {983561F1-F180-4188-AE80-BFA95FD69656}.Debug|Any CPU.Build.0 = Debug|Any CPU {983561F1-F180-4188-AE80-BFA95FD69656}.Release|Any CPU.ActiveCfg = Release|Any CPU {983561F1-F180-4188-AE80-BFA95FD69656}.Release|Any CPU.Build.0 = Release|Any CPU {6C953584-A04B-42C7-9CF3-267AFB010C2B}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {6C953584-A04B-42C7-9CF3-267AFB010C2B}.Debug|Any 
CPU.Build.0 = Debug|Any CPU {6C953584-A04B-42C7-9CF3-267AFB010C2B}.Release|Any CPU.ActiveCfg = Release|Any CPU {6C953584-A04B-42C7-9CF3-267AFB010C2B}.Release|Any CPU.Build.0 = Release|Any CPU {27B89B32-EC1A-48B0-BFC9-6172FCCE2961}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {27B89B32-EC1A-48B0-BFC9-6172FCCE2961}.Debug|Any CPU.Build.0 = Debug|Any CPU {27B89B32-EC1A-48B0-BFC9-6172FCCE2961}.Release|Any CPU.ActiveCfg = Release|Any CPU {27B89B32-EC1A-48B0-BFC9-6172FCCE2961}.Release|Any CPU.Build.0 = Release|Any CPU {48293CC8-A87C-4F59-A398-51CD37E6B62B}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {48293CC8-A87C-4F59-A398-51CD37E6B62B}.Debug|Any CPU.Build.0 = Debug|Any CPU {48293CC8-A87C-4F59-A398-51CD37E6B62B}.Release|Any CPU.ActiveCfg = Release|Any CPU {48293CC8-A87C-4F59-A398-51CD37E6B62B}.Release|Any CPU.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE EndGlobalSection GlobalSection(NestedProjects) = preSolution {983561F1-F180-4188-AE80-BFA95FD69656} = {BD07BD08-1CB4-41AE-B2BD-3975BE13B8EC} {6C953584-A04B-42C7-9CF3-267AFB010C2B} = {5A8808D6-63E0-48EE-A115-0380E0E57156} {27B89B32-EC1A-48B0-BFC9-6172FCCE2961} = {6BA94544-B2DF-4DD2-9390-DAA8AF5CA90A} {48293CC8-A87C-4F59-A398-51CD37E6B62B} = {6BA94544-B2DF-4DD2-9390-DAA8AF5CA90A} EndGlobalSection GlobalSection(ExtensibilityGlobals) = postSolution SolutionGuid = {38C0CA37-B15E-4200-9F2C-AD08076E4013} EndGlobalSection EndGlobal ================================================ FILE: sandbox/Benchmark/Benchmark.csproj ================================================  Exe net8.0 enable enable ================================================ FILE: sandbox/Benchmark/BytesReadToEnd.cs ================================================ using BenchmarkDotNet.Attributes; using Cysharp.IO; using System.Text.Encodings.Web; using System.Text.Json; using System.Text.Unicode; namespace Benchmark; [SimpleJob, MemoryDiagnoser] public class BytesReadToEnd { const int C = 1000000; 
string filePath = default!; [GlobalSetup] public void GlobalSetup() { var options = new JsonSerializerOptions { Encoder = JavaScriptEncoder.Create(UnicodeRanges.All) }; var path = Path.GetTempFileName(); var newline = OperatingSystem.IsWindows() ? "\r\n"u8 : "\n"u8; using var file = File.OpenWrite(path); for (var i = 0; i < C; i++) { var json = JsonSerializer.SerializeToUtf8Bytes( new MyClass { MyProperty = i, MyProperty2 = "あいうえおかきくけこ" }, options); file.Write(json); file.Write(newline); } filePath = path; } [GlobalCleanup] public void GlobalCleanup() { File.Delete(filePath); } [Benchmark] public async Task FileReadAllBytesAsync() { // ReadAllBytes knows file-length so fastest. return await File.ReadAllBytesAsync(filePath); } [Benchmark] public async Task Utf8StreamReaderReadToEndAsync() { using var sr = new Cysharp.IO.Utf8StreamReader(filePath); return await sr.ReadToEndAsync(); } } [SimpleJob, MemoryDiagnoser] public class BytesReadToEnd2 { const int C = 1000000; string filePath = default!; [GlobalSetup] public void GlobalSetup() { var options = new JsonSerializerOptions { Encoder = JavaScriptEncoder.Create(UnicodeRanges.All) }; var path = Path.GetTempFileName(); var newline = OperatingSystem.IsWindows() ? 
"\r\n"u8 : "\n"u8; using var file = File.OpenWrite(path); for (var i = 0; i < C; i++) { var json = JsonSerializer.SerializeToUtf8Bytes( new MyClass { MyProperty = i, MyProperty2 = "あいうえおかきくけこ" }, options); file.Write(json); file.Write(newline); } filePath = path; } [GlobalCleanup] public void GlobalCleanup() { File.Delete(filePath); } [Benchmark] public async Task MemoryStreamCopyToToArray() { using var fs = new FileStream(filePath, FileMode.Open); var ms = new MemoryStream(); await fs.CopyToAsync(ms); return ms.ToArray(); } [Benchmark] public async Task Utf8StreamReaderReadToEndAsync() { using var fs = new FileStream(filePath, FileMode.Open); using var sr = new Cysharp.IO.Utf8StreamReader(fs); return await sr.ReadToEndAsync(disableBomCheck: false); // hack for ignore optimize(for benchmark fairness) } } ================================================ FILE: sandbox/Benchmark/FromFile.cs ================================================ using BenchmarkDotNet.Attributes; using Cysharp.IO; using System.Text; using System.Text.Encodings.Web; using System.Text.Json; using System.Text.Unicode; namespace Benchmark; [SimpleJob, MemoryDiagnoser] public class FromFile { const int C = 1000000; string filePath = default!; [GlobalSetup] public void GlobalSetup() { var options = new JsonSerializerOptions { Encoder = JavaScriptEncoder.Create(UnicodeRanges.All) }; var path = Path.GetTempFileName(); var newline = OperatingSystem.IsWindows() ? "\r\n"u8 : "\n"u8; using var file = File.OpenWrite(path); for (var i = 0; i < C; i++) { var json = JsonSerializer.SerializeToUtf8Bytes( new MyClass { MyProperty = i, MyProperty2 = "あいうえおかきくけこ" }, options); file.Write(json); file.Write(newline); } filePath = path; } [GlobalCleanup] public void GlobalCleanup() { File.Delete(filePath); } [Benchmark] public async Task StreamReaderFileStream() { using var sr = new System.IO.StreamReader(filePath); string? line; while ((line = await sr.ReadLineAsync()) != null) { // ... 
} } [Benchmark] public async Task FileReadLinesAsync() { await foreach (var line in File.ReadLinesAsync(filePath, Encoding.UTF8)) { } } [Benchmark] public async Task Utf8StreamReaderFileStreamScalability() { using var sr = new Cysharp.IO.Utf8StreamReader(filePath, fileOpenMode: FileOpenMode.Scalability); while (await sr.LoadIntoBufferAsync()) { while (sr.TryReadLine(out var line)) { // ... } } } [Benchmark] public async Task Utf8StreamReaderFileStreamThroughput() { using var sr = new Cysharp.IO.Utf8StreamReader(filePath, fileOpenMode: FileOpenMode.Throughput); while (await sr.LoadIntoBufferAsync()) { while (sr.TryReadLine(out var line)) { // ... } } } [Benchmark] public async ValueTask Utf8StreamReaderFileStreamThroughputSyncRead() { using var sr = new Cysharp.IO.Utf8StreamReader(filePath, fileOpenMode: FileOpenMode.Throughput) { SyncRead = true }; while (await sr.LoadIntoBufferAsync()) { while (sr.TryReadLine(out var line)) { } } } [Benchmark] public async Task Utf8TextReaderFileStreamScalability() { using var sr = new Cysharp.IO.Utf8StreamReader(filePath, fileOpenMode: FileOpenMode.Scalability).AsTextReader(); while (await sr.LoadIntoBufferAsync()) { while (sr.TryReadLine(out var line)) { // ... } } } [Benchmark] public async Task Utf8TextReaderFileStreamThroughput() { using var sr = new Cysharp.IO.Utf8StreamReader(filePath, fileOpenMode: FileOpenMode.Throughput).AsTextReader(); while (await sr.LoadIntoBufferAsync()) { while (sr.TryReadLine(out var line)) { // ... } } } [Benchmark] public async ValueTask Utf8TextReaderFileStreamThroughputSyncRead() { using var sr = new Cysharp.IO.Utf8StreamReader(filePath, fileOpenMode: FileOpenMode.Throughput) { SyncRead = true }.AsTextReader(); while (await sr.LoadIntoBufferAsync()) { while (sr.TryReadLine(out var line)) { // ... 
} } } [Benchmark] public async Task Utf8TextReaderToStringFileStreamScalability() { using var sr = new Cysharp.IO.Utf8StreamReader(filePath, fileOpenMode: FileOpenMode.Scalability).AsTextReader(); while (await sr.LoadIntoBufferAsync()) { while (sr.TryReadLine(out var line)) { _ = line.ToString(); } } } [Benchmark] public async Task Utf8TextReaderToStringFileStreamThroughput() { using var sr = new Cysharp.IO.Utf8StreamReader(filePath, fileOpenMode: FileOpenMode.Throughput).AsTextReader(); while (await sr.LoadIntoBufferAsync()) { while (sr.TryReadLine(out var line)) { _ = line.ToString(); } } } } ================================================ FILE: sandbox/Benchmark/FromMemory.cs ================================================ using System.Buffers; using System.IO.Pipelines; using System.Text; using System.Text.Encodings.Web; using System.Text.Json; using System.Text.Unicode; using BenchmarkDotNet.Attributes; using Cysharp.IO; namespace Benchmark; [SimpleJob, MemoryDiagnoser] public class FromMemory { const int C = 1000000; // const int C = 100; byte[] utf8Data = default!; MemoryStream ms = default!; [GlobalSetup] public void GlobalSetup() { var options = new JsonSerializerOptions { Encoder = JavaScriptEncoder.Create(UnicodeRanges.All) }; var jsonLines = Enumerable.Range(0, C) .Select(x => new MyClass { MyProperty = x, MyProperty2 = "あいうえおかきくけこ" }) .Select(x => JsonSerializer.Serialize(x, options)) .ToArray(); utf8Data = Encoding.UTF8.GetBytes(string.Join(Environment.NewLine, jsonLines)); } [IterationSetup] public void Setup() { ms = new MemoryStream(utf8Data); } [Benchmark] public async Task StreamReader() { using var sr = new System.IO.StreamReader(ms); string? 
line; while ((line = await sr.ReadLineAsync()) != null) { // Console.WriteLine(line); } } [Benchmark] public async Task Utf8StreamReader() { using var sr = new Cysharp.IO.Utf8StreamReader(ms); while (await sr.LoadIntoBufferAsync()) { while (sr.TryReadLine(out var line)) { // Console.WriteLine(Encoding.UTF8.GetString( line.Span)); } } } [Benchmark] public async Task Utf8TextReader() { using var sr = new Cysharp.IO.Utf8StreamReader(ms).AsTextReader(); while (await sr.LoadIntoBufferAsync()) { while (sr.TryReadLine(out var line)) { // Console.WriteLine(Encoding.UTF8.GetString( line.Span)); } } } [Benchmark] public async Task Utf8TextReaderToString() { using var sr = new Cysharp.IO.Utf8StreamReader(ms).AsTextReader(); while (await sr.LoadIntoBufferAsync()) { while (sr.TryReadLine(out var line)) { _ = line.ToString(); // Console.WriteLine(Encoding.UTF8.GetString( line.Span)); } } } //[Benchmark] //public async Task Utf8StreamReaderReadLine() //{ // using var sr = new Cysharp.IO.Utf8StreamReader(ms); // ReadOnlyMemory? 
line; // while ((line = await sr.ReadLineAsync()) != null) // { // // Console.WriteLine(Encoding.UTF8.GetString(line.Value.Span)); // } //} //[Benchmark] //public async Task Utf8StreamReaderReadAllLines() //{ // using var sr = new Cysharp.IO.Utf8StreamReader(ms); // await foreach (var line in sr.ReadAllLinesAsync()) // { // //Console.WriteLine(Encoding.UTF8.GetString(line.Span)); // } //} [Benchmark] public async Task PipeReaderSequenceReader() { using (ms) { var reader = PipeReader.Create(ms); READ_AGAIN: var readResult = await reader.ReadAsync(); if (!(readResult.IsCompleted | readResult.IsCanceled)) { var buffer = readResult.Buffer; while (TryReadData(ref buffer, out var line)) { //Console.WriteLine(Encoding.UTF8.GetString(line)); } reader.AdvanceTo(buffer.Start, buffer.End); goto READ_AGAIN; } } static bool TryReadData(ref ReadOnlySequence buffer, out ReadOnlySequence line) { var reader = new SequenceReader(buffer); if (reader.TryReadTo(out line, (byte)'\n', advancePastDelimiter: true)) { buffer = buffer.Slice(reader.Consumed); return true; } return false; } } //[Benchmark] //public async Task PipelineStreamReader2() //{ // using (ms) // { // var reader = PipeReader.Create(ms); // READ_AGAIN: // var readResult = await reader.ReadAsync(); // if (!(readResult.IsCompleted | readResult.IsCanceled)) // { // var buffer = readResult.Buffer; // ConsumeAllData(ref buffer); // reader.AdvanceTo(buffer.Start, buffer.End); // goto READ_AGAIN; // } // } // static void ConsumeAllData(ref ReadOnlySequence buffer) // { // var reader = new SequenceReader(buffer); // while (reader.TryReadTo(out ReadOnlySequence line, (byte)'\n', advancePastDelimiter: true)) // { // //Console.WriteLine(Encoding.UTF8.GetString(line)); // } // buffer = buffer.Slice(reader.Consumed); // } //} } public class MyClass { public int MyProperty { get; set; } public string? 
MyProperty2 { get; set; } } ================================================ FILE: sandbox/Benchmark/Program.cs ================================================ #if DEBUG using Benchmark; using System.Runtime.CompilerServices; global::System.Console.WriteLine("DEBUG"); //var benchmark = new BytesReadToEnd(); var benchmark = new ReadToEndString(); benchmark.GlobalSetup(); //var s1 = await benchmark.FileReadAllBytesAsync(); var s2 = await benchmark.Utf8TextReaderReadToEndAsync(); //Console.WriteLine(s1.SequenceEqual(s2)); benchmark.GlobalCleanup(); #else using BenchmarkDotNet.Running; BenchmarkSwitcher .FromAssembly(typeof(Program).Assembly) .Run(args); #endif ================================================ FILE: sandbox/Benchmark/ReadToEndString.cs ================================================ using BenchmarkDotNet.Attributes; using Cysharp.IO; using System.Text.Encodings.Web; using System.Text.Json; using System.Text.Unicode; namespace Benchmark; [SimpleJob, MemoryDiagnoser] public class ReadToEndString { const int C = 1000000; string filePath = default!; [GlobalSetup] public void GlobalSetup() { var options = new JsonSerializerOptions { Encoder = JavaScriptEncoder.Create(UnicodeRanges.All) }; var path = Path.GetTempFileName(); var newline = OperatingSystem.IsWindows() ? 
"\r\n"u8 : "\n"u8; using var file = File.OpenWrite(path); for (var i = 0; i < C; i++) { var json = JsonSerializer.SerializeToUtf8Bytes( new MyClass { MyProperty = i, MyProperty2 = "あいうえおかきくけこ" }, options); file.Write(json); file.Write(newline); } filePath = path; } [GlobalCleanup] public void GlobalCleanup() { File.Delete(filePath); } [Benchmark] public async Task StreamReaderReadToEndAsync() { using var sr = new System.IO.StreamReader(filePath); return await sr.ReadToEndAsync(); } [Benchmark] public async Task Utf8TextReaderReadToEndAsync() { using var sr = new Cysharp.IO.Utf8StreamReader(filePath).AsTextReader(); return await sr.ReadToEndAsync(); } [Benchmark] public async Task FileReadAllTextAsync() { return await File.ReadAllTextAsync(filePath); } } ================================================ FILE: sandbox/ConsoleApp1/ConsoleApp1.csproj ================================================  Exe net8.0 enable enable Always ================================================ FILE: sandbox/ConsoleApp1/Program.cs ================================================ using Cysharp.IO; using Microsoft.Win32.SafeHandles; using System.Buffers; using System.Buffers.Text; using System.IO; using System.IO.Pipelines; using System.Runtime.InteropServices; using System.Runtime.InteropServices.Marshalling; using System.Text; using System.Text.Encodings.Web; using System.Text.Json; using System.Text.Unicode; var aa = Encoding.UTF8.GetBytes("$5\r\nhello\r\n"); var stream = new MemoryStream(aa); using var reader = new Utf8StreamReader(stream) { SkipBom = false }; byte[] bytes = await reader.ReadToEndAsync(); //while (await reader.LoadIntoBufferAsync()) //{ // while (reader.TryReadLine(out var line)) // { // switch (line.Span[0]) // { // case (byte)'$': // Utf8Parser.TryParse(line.Span.Slice(1), out int size, out _); // if (!reader.TryReadBlock(size + 2, out var block)) // +2 is \r\n // { // // ReadBlockAsync is TryReadBlock + LoadIntoBufferAtLeastAsync // block = await 
reader.ReadBlockAsync(size + 2); // } // yield return block.Slice(0, size); // break; // // and others('+', '-', ':', '*') // default: // break; // } // } //} //var path = "file1.txt"; //var fs = new FileStream(path, FileMode.Open,FileAccess.Read, FileShare.Read, 0, false); //var buf = new byte[1024]; //await fs.ReadAsync(buf); //using var reader = new Utf8StreamReader(path).AsTextReader(); //var str = await reader.ReadToEndAsync(); //Console.WriteLine(str.ToString()); // new StreamReader().ReadBlock( //var options = new JsonSerializerOptions(); //options.Encoder = JavaScriptEncoder.Create(UnicodeRanges.All); //var jsonLines = Enumerable.Range(0, 100000) // .Select(x => new MyClass { MyProperty = x, MyProperty2 = "あいうえおかきくけこ" }) // .Select(x => JsonSerializer.Serialize(x, options)) // .ToArray(); //var utf8Data = Encoding.UTF8.GetBytes(string.Join(Environment.NewLine, jsonLines)); //var ms = new MemoryStream(utf8Data); ////using var sr = new System.IO.StreamReader(ms); ////string? line; ////while ((line = await sr.ReadLineAsync()) != null) ////{ //// // JsonSerializer.Deserialize(line); ////} //using var sr = new Cysharp.IO.Utf8StreamReader(ms); //ReadOnlyMemory? line; //while ((line = await sr.ReadLineAsync()) != null) //{ //} //public class MyClass //{ // public int MyProperty { get; set; } // public string? 
MyProperty2 { get; set; } //} ================================================ FILE: sandbox/ConsoleApp1/ReadMeSample.cs ================================================ using Cysharp.IO; using System; using System.Collections.Generic; using System.Linq; using System.Text; using System.Text.Json; using System.Threading.Tasks; namespace ConsoleApp1; internal class ReadMeSample { public async void Sample1(Stream stream) { using var reader = new Utf8StreamReader(stream); // Most performant style, similar as System.Threading.Channels while (await reader.LoadIntoBufferAsync()) { while (reader.TryReadLine(out var line)) { // line is ReadOnlyMemory, deserialize UTF8 directly. _ = JsonSerializer.Deserialize(line.Span); } } } public async void Sample2(Stream stream) { using var reader = new Utf8StreamReader(stream); // Classical style, same as StreamReader ReadOnlyMemory? line = null; while ((line = await reader.ReadLineAsync()) != null) { _ = JsonSerializer.Deserialize(line.Value.Span); } } public async void Sample3(Stream stream) { using var reader = new Utf8StreamReader(stream); // Most easiest style, use async streams await foreach (var line in reader.ReadAllLinesAsync()) { _ = JsonSerializer.Deserialize(line.Span); } } } public class Foo { } ================================================ FILE: sandbox/ConsoleApp1/RespReader.cs ================================================ using Cysharp.IO; using System.Buffers.Text; using System.Text; namespace ConsoleApp1; public enum RespType : byte { SimpleStrings = (byte)'+', Errors = (byte)'-', Integers = (byte)':', BulkStrings = (byte)'$', Arrays = (byte)'*' } public class RespReader : IDisposable { Utf8StreamReader reader; public RespReader(Stream stream) { this.reader = new Utf8StreamReader(stream); } // NOTE: for more fast processing, you need to use TryRead method. 
/// <summary>
/// Reads the next value from the underlying reader and casts it to <see cref="RespType"/>,
/// i.e. consumes the one-byte RESP type prefix.
/// </summary>
public async ValueTask ReadRespTypeAsync(CancellationToken cancellationToken = default)
{
    return (RespType)await reader.ReadAsync(cancellationToken);
}

// All message-reading APIs below expect ReadRespTypeAsync to have been called first
// (the type prefix has already been consumed).

/// <summary>Reads one line and decodes it as a UTF-8 string (simple string payload).</summary>
public async ValueTask ReadSimpleStringAsync(CancellationToken cancellationToken = default) // +OK\r\n
{
    return Encoding.UTF8.GetString((await reader.ReadLineAsync(cancellationToken)).Value.Span);
}

/// <summary>Reads one line and decodes it as a UTF-8 string (error message payload).</summary>
public async ValueTask ReadErrorMessageAsync(CancellationToken cancellationToken = default) // -Error message\r\n
{
    return Encoding.UTF8.GetString((await reader.ReadLineAsync(cancellationToken)).Value.Span);
}

/// <summary>Reads one line and parses it as a 64-bit integer; the parse result flag is not checked.</summary>
public async ValueTask ReadIntegerAsync(CancellationToken cancellationToken = default) // :1000\r\n
{
    var line = await reader.ReadLineAsync(cancellationToken);
    Utf8Parser.TryParse(line.Value.Span, out long value, out _);
    return value;
}

/// <summary>
/// Reads the length line, then reads that many bytes plus the trailing CRLF and returns the
/// payload without the CRLF. Returns <c>null</c> when the declared length is -1 (null bulk string).
/// </summary>
public async ValueTask?> ReadBulkStringAsync(CancellationToken cancellationToken = default) // "$5\r\nhello\r\n"
{
    var line = await reader.ReadLineAsync(cancellationToken);
    Utf8Parser.TryParse(line.Value.Span, out int count, out _);
    if (count == -1)
    {
        return null;
    }
    else
    {
        var dataWithNewLine = await reader.ReadBlockAsync(count + 2, cancellationToken);
        return dataWithNewLine[..^2]; // without newline
    }
}

// for perf improvement, ReadIntegerArray, ReadStringArray, ReadArray for bulkstrings is better approach
/// <summary>
/// Reads the element-count line, then reads that many elements, dispatching on each element's
/// type prefix. Bulk strings are copied to a new array; a null bulk string is stored as a
/// <c>null</c> element. Unknown type prefixes leave the slot unset.
/// </summary>
public async ValueTask ReadArrayAsync(CancellationToken cancellationToken = default) // "*2\r\n$5\r\nhello\r\n$5\r\nworld\r\n"
{
    // Forward the token here too; every other read in this class already does.
    var line = await reader.ReadLineAsync(cancellationToken);
    Utf8Parser.TryParse(line.Value.Span, out int count, out _);

    var result = new object[count];
    for (int i = 0; i < count; i++)
    {
        var type = await ReadRespTypeAsync(cancellationToken);
        switch (type)
        {
            case RespType.SimpleStrings:
                result[i] = await ReadSimpleStringAsync(cancellationToken);
                break;
            case RespType.Errors:
                result[i] = await ReadErrorMessageAsync(cancellationToken);
                break;
            case RespType.Integers:
                result[i] = await ReadIntegerAsync(cancellationToken);
                break;
            case RespType.BulkStrings:
                // ReadBulkStringAsync returns null for "$-1"; keep that as a null element
                // instead of throwing from Nullable.Value.
                result[i] = (await ReadBulkStringAsync(cancellationToken))?.ToArray()!; // materialize immediately
                break;
            case RespType.Arrays:
                result[i] = await ReadArrayAsync(cancellationToken);
                break;
            default:
                break;
        }
    }

    return result;
}

/// <summary>Disposes the wrapped reader.</summary>
public void Dispose()
{
    reader.Dispose();
}
}
================================================ FILE: sandbox/ConsoleApp1/file1.txt ================================================ abcde fgh ijklmnopqrs ================================================ FILE: src/Utf8StreamReader/SegmentedArrayBufferWriter.cs ================================================ using System.Buffers; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; namespace Cysharp.IO; // similar as .NET9 SegmentedArrayBuilder but for async operation and direct write internal sealed class SegmentedArrayBufferWriter : IDisposable { // NetStandard2.1 does not have Array.MaxLength so use constant. const int ArrayMaxLength = 0X7FFFFFC7; InlineArray19 segments; int currentSegmentIndex; int countInFinishedSegments; T[] currentSegment; int currentWritten; bool isDisposed = false; public int WrittenCount => countInFinishedSegments + currentWritten; public SegmentedArrayBufferWriter() { currentSegment = segments[0] = ArrayPool.Shared.Rent(InlineArray19.InitialSize); } [MethodImpl(MethodImplOptions.AggressiveInlining)] public Memory GetMemory() // no sizeHint { return currentSegment.AsMemory(currentWritten); } [MethodImpl(MethodImplOptions.AggressiveInlining)] public Span GetSpan() { return currentSegment.AsSpan(currentWritten); } [MethodImpl(MethodImplOptions.AggressiveInlining)] public void Advance(int count) { checked { currentWritten += count; }; if (currentWritten == currentSegment.Length) { AllocateNextMemory(); } } void AllocateNextMemory() { countInFinishedSegments += currentSegment.Length; var nextSize = currentSegment.Length * 2L; if (nextSize + countInFinishedSegments >
ArrayMaxLength) { nextSize = ArrayMaxLength - countInFinishedSegments; } currentSegmentIndex++; currentSegment = segments[currentSegmentIndex] = ArrayPool.Shared.Rent((int)nextSize); currentWritten = 0; } public void Write(ReadOnlySpan source) { while (source.Length != 0) { var destination = GetSpan(); var copySize = Math.Min(source.Length, destination.Length); source.Slice(0, copySize).CopyTo(destination); Advance(copySize); source = source.Slice(copySize); } } public T[] ToArrayAndDispose() { if (isDisposed) throw new ObjectDisposedException(""); isDisposed = true; var size = checked(countInFinishedSegments + currentWritten); if (size == 0) { ArrayPool.Shared.Return(currentSegment, clearArray: RuntimeHelpers.IsReferenceOrContainsReferences()); return []; } #if !NETSTANDARD var result = GC.AllocateUninitializedArray(size); #else var result = new T[size]; #endif var destination = result.AsSpan(); // without current for (int i = 0; i < currentSegmentIndex; i++) { var segment = segments[i]; segment.AsSpan().CopyTo(destination); destination = destination.Slice(segment.Length); ArrayPool.Shared.Return(segment, clearArray: RuntimeHelpers.IsReferenceOrContainsReferences()); } // write current currentSegment.AsSpan(0, currentWritten).CopyTo(destination); ArrayPool.Shared.Return(currentSegment, clearArray: RuntimeHelpers.IsReferenceOrContainsReferences()); currentSegment = null!; segments = default!; return result; } // NOTE: create struct enumerator? 
/// <summary>
/// Enumerates the written data segment by segment and hands every rented array back to the pool.
/// Finished segments are yielded in full; the current segment is yielded only up to
/// <c>currentWritten</c>. The writer is marked disposed when enumeration starts.
/// </summary>
/// <remarks>
/// A yielded segment is returned to the pool as soon as the consumer asks for the next one, so it
/// must not be used after that. If the consumer stops early (break / exception), the finally block
/// still returns every segment that has not been returned yet.
/// NOTE(review): the element type was lost in extraction; Memory&lt;T&gt; matches both yield sites - confirm.
/// </remarks>
public IEnumerable<Memory<T>> GetSegmentsAndDispose()
    {
        if (isDisposed) throw new ObjectDisposedException("");
        isDisposed = true;

        var clearArray = RuntimeHelpers.IsReferenceOrContainsReferences<T>();
        var next = 0; // index of the first segment that has not been returned to the pool yet
        try
        {
            // without current
            while (next < currentSegmentIndex)
            {
                var segment = segments[next];
                yield return segment;
                ArrayPool<T>.Shared.Return(segment, clearArray: clearArray);
                next++;
            }

            // current
            if (currentWritten != 0)
            {
                yield return currentSegment.AsMemory(0, currentWritten);
            }
        }
        finally
        {
            // Runs on normal completion and when the enumerator is disposed early:
            // segments[next..currentSegmentIndex] are still rented at this point.
            for (int i = next; i <= currentSegmentIndex; i++)
            {
                ArrayPool<T>.Shared.Return(segments[i], clearArray: clearArray);
            }

            currentSegment = null!;
            segments = default!;
        }
    }

    /// <summary>
    /// Returns every rented segment (index 0 through the current one) to the pool.
    /// Does nothing if the writer was already disposed or consumed.
    /// </summary>
    public void Dispose()
    {
        if (isDisposed) return;

        isDisposed = true;
        for (int i = 0; i <= currentSegmentIndex; i++)
        {
            ArrayPool<T>.Shared.Return(segments[i], clearArray: RuntimeHelpers.IsReferenceOrContainsReferences<T>());
        }

        currentSegment = null!;
        segments = default!;
    }
}

// Fixed-size inline storage for 19 array references, indexed 0..18.
// The size comments are the intended segment sizes: each doubles the previous one.
[StructLayout(LayoutKind.Sequential)]
struct InlineArray19<T>
{
    public const int InitialSize = 8192;

    T[] array00; // 8192
    T[] array01; // 16384
    T[] array02; // 32768
    T[] array03; // 65536
    T[] array04; // 131072
    T[] array05; // 262144
    T[] array06; // 524288
    T[] array07; // 1048576
    T[] array08; // 2097152
    T[] array09; // 4194304
    T[] array10; // 8388608
    T[] array11; // 16777216
    T[] array12; // 33554432
    T[] array13; // 67108864
    T[] array14; // 134217728
    T[] array15; // 268435456
    T[] array16; // 536870912
    T[] array17; // 1073741824
    T[] array18; // Array.MaxLength - total

    // Sequential layout places the fields back to back, so slot i is reached by
    // offsetting a reference to the first field. The index is range-checked first.
    public T[] this[int i]
    {
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        get
        {
            if (i < 0 || i > 18) Throw();
            return Unsafe.Add(ref array00, i);
        }
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        set
        {
            if (i < 0 || i > 18) Throw();
            Unsafe.Add(ref array00, i) = value;
        }
    }

    // Kept out of line so the inlined indexer stays small.
    void Throw()
    {
        throw new ArgumentOutOfRangeException();
    }
}

================================================
FILE: src/Utf8StreamReader/Utf8StreamReader.cs
================================================
using System.Buffers;
using System.Runtime.CompilerServices;
using System.Text;

namespace
Cysharp.IO; public enum FileOpenMode { Scalability, Throughput } public sealed class Utf8StreamReader : IAsyncDisposable, IDisposable { // NetStandard2.1 does not have Array.MaxLength so use constant. const int ArrayMaxLength = 0X7FFFFFC7; const int DefaultBufferSize = 65536; const int MinBufferSize = 1024; Stream stream; readonly bool leaveOpen; readonly int bufferSize; bool endOfStream; bool checkPreamble = true; bool skipBom = true; bool isDisposed; byte[] inputBuffer; int positionBegin; int positionEnd; int lastNewLinePosition = -2; // -2 is not exists new line in buffer, -1 is not yet searched. absolute path from inputBuffer begin int lastExaminedPosition; public bool SkipBom { get => skipBom; init => skipBom = checkPreamble = value; } public bool ConfigureAwait { get; init; } = false; public bool SyncRead { get; init; } = false; public Utf8StreamReader(Stream stream) : this(stream, DefaultBufferSize, false) { } public Utf8StreamReader(Stream stream, int bufferSize) : this(stream, bufferSize, false) { } public Utf8StreamReader(Stream stream, bool leaveOpen) : this(stream, DefaultBufferSize, leaveOpen) { } public Utf8StreamReader(Stream stream, int bufferSize, bool leaveOpen) { this.inputBuffer = ArrayPool.Shared.Rent(Math.Max(bufferSize, MinBufferSize)); this.stream = stream; this.bufferSize = bufferSize; this.leaveOpen = leaveOpen; } public Utf8StreamReader(string path, FileOpenMode fileOpenMode = FileOpenMode.Throughput) : this(path, DefaultBufferSize, fileOpenMode) { } public Utf8StreamReader(string path, int bufferSize, FileOpenMode fileOpenMode = FileOpenMode.Throughput) : this(OpenPath(path, fileOpenMode), bufferSize, leaveOpen: false) { } static FileStream OpenPath(string path, FileOpenMode fileOpenMode = FileOpenMode.Throughput) { #if NETSTANDARD var useSequentialScan = FileOptions.SequentialScan; #else // SequentialScan is a perf hint that requires extra sys-call on non-Windows OSes. var useSequentialScan = OperatingSystem.IsWindows() ? 
FileOptions.SequentialScan : FileOptions.None; #endif var fileOptions = (fileOpenMode == FileOpenMode.Scalability) ? (FileOptions.Asynchronous | useSequentialScan) : useSequentialScan; return new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.Read, bufferSize: 1, options: fileOptions); } #if !NETSTANDARD public Utf8StreamReader(string path, FileStreamOptions options) : this(path, options, DefaultBufferSize) { } public Utf8StreamReader(string path, FileStreamOptions options, int bufferSize) : this(OpenPath(path, options), bufferSize) { } static FileStream OpenPath(string path, FileStreamOptions options) { return new FileStream(path, options); } #endif // Peek() and EndOfStream is `Sync` method so does not provided. public Stream BaseStream => stream; public bool TryReadLine(out ReadOnlyMemory line) { ThrowIfDisposed(); if (lastNewLinePosition >= 0) { line = inputBuffer.AsMemory(positionBegin, lastNewLinePosition - positionBegin); positionBegin = lastExaminedPosition + 1; lastNewLinePosition = lastExaminedPosition = -1; return true; } // AsSpan(positionBegin..positionEnd) is more readable but don't use range notation, it is slower. 
var index = IndexOfNewline(inputBuffer.AsSpan(positionBegin, positionEnd - positionBegin), out var newLineIndex);
        if (index == -1)
        {
            if (endOfStream && positionBegin != positionEnd)
            {
                // return last line
                line = inputBuffer.AsMemory(positionBegin, positionEnd - positionBegin);
                positionBegin = positionEnd;
                return true;
            }

            lastNewLinePosition = lastExaminedPosition = -2; // not exists new line in this buffer
            line = default;
            return false;
        }

        line = inputBuffer.AsMemory(positionBegin, index); // positionBegin..(positionBegin+index)
        positionBegin = (positionBegin + newLineIndex + 1);
        lastNewLinePosition = lastExaminedPosition = -1;
        return true;
    }

    /// <summary>
    /// Reads from the stream until the buffer contains a complete line or the stream ends.
    /// </summary>
    /// <returns>
    /// true when a line (or, at end of stream, unconsumed trailing data) is available for
    /// <see cref="TryReadLine"/>; false when the stream has ended and the buffer is empty.
    /// </returns>
#if !NETSTANDARD
    [AsyncMethodBuilder(typeof(PoolingAsyncValueTaskMethodBuilder<>))]
#endif
    public async ValueTask<bool> LoadIntoBufferAsync(CancellationToken cancellationToken = default)
    {
        ThrowIfDisposed();
        cancellationToken.ThrowIfCancellationRequested();

        // pre-check
        if (endOfStream)
        {
            // true while the buffer is not yet fully consumed
            return positionBegin != positionEnd;
        }
        else
        {
            if (lastNewLinePosition >= 0) return true; // already filled line into buffer

            // lastNewLineIndex, lastExamined is relative from positionBegin
            if (lastNewLinePosition == -1)
            {
                var index = IndexOfNewline(inputBuffer.AsSpan(positionBegin, positionEnd - positionBegin), out var examinedIndex);
                if (index != -1)
                {
                    // convert to absolute
                    lastNewLinePosition = positionBegin + index;
                    lastExaminedPosition = positionBegin + examinedIndex;
                    return true;
                }
            }
            else
            {
                // change status to not searched
                lastNewLinePosition = -1;
            }
        }

        // requires load into buffer
        if (positionEnd != 0 && positionBegin == positionEnd)
        {
            // can reset buffer position
            positionBegin = positionEnd = 0;
        }

        var examined = positionEnd; // avoid to duplicate scan

    LOAD_INTO_BUFFER:
        // not reaches full, repeatedly read
        if (positionEnd != inputBuffer.Length)
        {
            var read = SyncRead
                ? stream.Read(inputBuffer.AsSpan(positionEnd))
                : await stream.ReadAsync(inputBuffer.AsMemory(positionEnd), cancellationToken).ConfigureAwait(ConfigureAwait);

            positionEnd += read;
            if (read == 0)
            {
                endOfStream = true;
                // true when there is a last line without a trailing newline
                return positionBegin != positionEnd;
            }
            else
            {
                // first Read, require to check UTF8 BOM
                if (checkPreamble)
                {
                    if (positionEnd < 3) goto LOAD_INTO_BUFFER;
                    if (inputBuffer.AsSpan(0, 3).SequenceEqual(Encoding.UTF8.Preamble))
                    {
                        positionBegin = 3;
                    }
                    checkPreamble = false;
                }

                // scan examined(already scanned) to End.
                // Back one index to check if CRLF fell on buffer boundary
                var scanFrom = examined > 0 ? examined - 1 : examined;
                var index = IndexOfNewline(inputBuffer.AsSpan(scanFrom, positionEnd - scanFrom), out var examinedIndex);
                if (index != -1)
                {
                    lastNewLinePosition = scanFrom + index;
                    lastExaminedPosition = scanFrom + examinedIndex;
                    return true;
                }

                examined = positionEnd;
                goto LOAD_INTO_BUFFER;
            }
        }

        // slide current buffer
        if (positionBegin != 0)
        {
            inputBuffer.AsSpan(positionBegin, positionEnd - positionBegin).CopyTo(inputBuffer);
            positionEnd -= positionBegin;
            positionBegin = 0;
            examined = positionEnd;
            goto LOAD_INTO_BUFFER;
        }

        // buffer is completely full, needs resize(positionBegin, positionEnd, examined are same)
        {
            var newBuffer = ArrayPool<byte>.Shared.Rent(GetNewSize(inputBuffer.Length));
            inputBuffer.AsSpan().CopyTo(newBuffer);
            ArrayPool<byte>.Shared.Return(inputBuffer);
            inputBuffer = newBuffer;
            goto LOAD_INTO_BUFFER;
        }
    }

    /// <summary>
    /// Reads from the stream until at least <paramref name="minimumBytes"/> unconsumed bytes are buffered.
    /// Returns immediately when that many bytes are already available.
    /// </summary>
    /// <exception cref="EndOfStreamException">The stream ended before enough bytes were available.</exception>
    /// <exception cref="ArgumentOutOfRangeException">The request cannot fit into a single array.</exception>
#if !NETSTANDARD
    [AsyncMethodBuilder(typeof(PoolingAsyncValueTaskMethodBuilder))]
#endif
    public async ValueTask LoadIntoBufferAtLeastAsync(int minimumBytes, CancellationToken cancellationToken = default)
    {
        ThrowIfDisposed();

        // "at least" includes having exactly the requested amount
        if (positionEnd - positionBegin >= minimumBytes)
        {
            return;
        }

        if (endOfStream)
        {
            throw new EndOfStreamException();
        }

        if (positionEnd != 0 && positionBegin == positionEnd)
        {
            // can reset buffer position
            positionBegin = positionEnd = 0;
            lastNewLinePosition = lastExaminedPosition = -1;
        }

        // While the BOM check is pending the first 3 bytes may turn out not to be payload, so reserve room for them.
        var required = (long)positionBegin + minimumBytes + (checkPreamble ? 3 : 0);
        if (required > ArrayMaxLength)
        {
            throw new ArgumentOutOfRangeException(nameof(minimumBytes));
        }

        if (inputBuffer.Length < required)
        {
            // needs resize before load loop; grow to whichever is larger, doubling or the exact requirement
            var newBuffer = ArrayPool<byte>.Shared.Rent((int)Math.Max(GetNewSize(inputBuffer.Length), required));
            inputBuffer.AsSpan(0, positionEnd).CopyTo(newBuffer);
            ArrayPool<byte>.Shared.Return(inputBuffer);
            inputBuffer = newBuffer;
        }

        while (true)
        {
            var read = SyncRead
                ? stream.Read(inputBuffer.AsSpan(positionEnd))
                : await stream.ReadAsync(inputBuffer.AsMemory(positionEnd), cancellationToken).ConfigureAwait(ConfigureAwait);

            if (read == 0)
            {
                endOfStream = true;
                checkPreamble = false; // fewer than 3 bytes in total can not be a BOM
                if (positionEnd - positionBegin >= minimumBytes)
                {
                    return;
                }
                throw new EndOfStreamException();
            }

            positionEnd += read;
            // newly loaded bytes have not been scanned for a newline yet
            lastNewLinePosition = lastExaminedPosition = -1;

            // first Read, require to check UTF8 BOM
            if (checkPreamble)
            {
                if (positionEnd < 3) continue;
                if (inputBuffer.AsSpan(0, 3).SequenceEqual(Encoding.UTF8.Preamble))
                {
                    positionBegin = 3; // the BOM does not count toward minimumBytes
                }
                checkPreamble = false;
            }

            if (positionEnd - positionBegin >= minimumBytes)
            {
                return;
            }
        }
    }

    /// <summary>
    /// Streams the rest of the data as chunks: first whatever is still buffered, then one chunk per
    /// successful read. Each chunk is a slice of the internal buffer, and that buffer is reused for the next read.
    /// </summary>
    public async IAsyncEnumerable<ReadOnlyMemory<byte>> ReadToEndChunksAsync([EnumeratorCancellation] CancellationToken cancellationToken = default)
    {
        if (endOfStream)
        {
            var result = inputBuffer.AsMemory(positionBegin, positionEnd - positionBegin);
            positionBegin = positionEnd;
            if (result.Length != 0)
            {
                yield return result;
            }
            yield break;
        }

        if (positionEnd != 0 && positionBegin != positionEnd)
        {
            yield return inputBuffer.AsMemory(positionBegin, positionEnd - positionBegin);
        }

        positionBegin = positionEnd = 0;
        lastNewLinePosition = -2;

    LOAD_INTO_BUFFER:
        var read = SyncRead
            ? stream.Read(inputBuffer.AsSpan(positionEnd))
            : await stream.ReadAsync(inputBuffer.AsMemory(positionEnd), cancellationToken).ConfigureAwait(ConfigureAwait);

        positionEnd += read;
        if (read == 0)
        {
            endOfStream = true;
            var result = inputBuffer.AsMemory(positionBegin, positionEnd - positionBegin);
            positionBegin = positionEnd;
            if (result.Length != 0)
            {
                yield return result;
            }
            yield break;
        }
        else
        {
            // first Read, require to check UTF8 BOM
            if (checkPreamble)
            {
                if (positionEnd < 3) goto LOAD_INTO_BUFFER;
                if (inputBuffer.AsSpan(0, 3).SequenceEqual(Encoding.UTF8.Preamble))
                {
                    positionBegin = 3;
                }
                checkPreamble = false;

                if (positionEnd - positionBegin == 0)
                {
                    goto LOAD_INTO_BUFFER;
                }
            }

            yield return inputBuffer.AsMemory(positionBegin, positionEnd - positionBegin);
            positionBegin = positionEnd = 0;
            goto LOAD_INTO_BUFFER;
        }
    }

    /// <summary>Reads the rest of the stream into a new array, with the BOM check disabled.</summary>
    public ValueTask<byte[]> ReadToEndAsync(CancellationToken cancellationToken = default)
    {
        return ReadToEndAsync(true, cancellationToken);
    }

    /// <summary>
    /// Reads the rest of the stream into a new array. When the BOM check is disabled and the base
    /// stream is a seekable FileStream, its length is used as the result size hint.
    /// </summary>
    public ValueTask<byte[]> ReadToEndAsync(bool disableBomCheck, CancellationToken cancellationToken = default)
    {
        if (disableBomCheck && BaseStream is FileStream fs && fs.CanSeek)
        {
            return ReadToEndAsyncCore(fs.Length, true, cancellationToken);
        }

        return ReadToEndAsyncCore(-1, disableBomCheck, cancellationToken);
    }

    /// <summary>
    /// Reads the rest of the stream into an array pre-sized to <paramref name="resultSizeHint"/>.
    /// Throws InvalidOperationException when the data is larger than the hint.
    /// </summary>
    public ValueTask<byte[]> ReadToEndAsync(long resultSizeHint, CancellationToken cancellationToken = default)
    {
        return ReadToEndAsyncCore(resultSizeHint, true, cancellationToken);
    }

    // resultSizeHint == -1 means "unknown": data is collected in pooled segments and copied once at the end.
    async ValueTask<byte[]> ReadToEndAsyncCore(long resultSizeHint, bool disableBomCheck = true, CancellationToken cancellationToken = default)
    {
        if (endOfStream)
        {
            var slice = inputBuffer.AsMemory(positionBegin, positionEnd - positionBegin);
            positionBegin = positionEnd = 0;
            lastNewLinePosition = -2;
            return (slice.Length != 0)
                ? slice.ToArray()
                : [];
        }

        if (resultSizeHint != -1)
        {
            if (resultSizeHint == 0)
            {
                return [];
            }

            var result = new byte[resultSizeHint];
            var memory = result.AsMemory();

            var totalRead = 0;
            if (positionEnd != 0 && positionBegin != positionEnd)
            {
                var slice = inputBuffer.AsMemory(positionBegin, positionEnd - positionBegin);
                if (slice.Length > memory.Length)
                {
                    // report the same error as a too-small hint discovered while reading
                    throw new InvalidOperationException("resultSizeHint is smaller than data size.");
                }
                slice.CopyTo(memory);
                memory = memory.Slice(slice.Length);
                totalRead = slice.Length;
            }

            positionBegin = positionEnd = 0;
            lastNewLinePosition = -2;

            while (true)
            {
                if (memory.Length == 0)
                {
                    // try to check stream is finished. A zero-length read can not tell, so probe one byte.
                    // result[0] is only overwritten when the probe finds data, and then we throw.
                    var finalRead = SyncRead
                        ? stream.Read(result.AsSpan(0, 1))
                        : await stream.ReadAsync(result.AsMemory(0, 1), cancellationToken).ConfigureAwait(ConfigureAwait);
                    if (finalRead != 0)
                    {
                        throw new InvalidOperationException("resultSizeHint is smaller than data size.");
                    }
                    break;
                }

                var read = SyncRead
                    ? stream.Read(memory.Span)
                    : await stream.ReadAsync(memory, cancellationToken).ConfigureAwait(ConfigureAwait);
                if (read == 0)
                {
                    break;
                }

                totalRead += read;
                memory = memory.Slice(read);
            }

            endOfStream = true;

            if (result.Length == totalRead)
            {
                return result;
            }
            else
            {
                return result.AsSpan(0, totalRead).ToArray();
            }
        }
        else
        {
            using var writer = new SegmentedArrayBufferWriter<byte>();

            if (positionEnd != 0 && positionBegin != positionEnd)
            {
                var slice = inputBuffer.AsMemory(positionBegin, positionEnd - positionBegin);
                writer.Write(slice.Span);
            }

            positionBegin = positionEnd = 0;
            lastNewLinePosition = -2;

            if (!disableBomCheck && checkPreamble && writer.WrittenCount == 0)
            {
                var memory = writer.GetMemory();
                var readCount = 0;

                // Collect at least 3 bytes for the BOM comparison; stop if the stream ends first
                // (a stream shorter than 3 bytes can not carry a BOM).
                while (readCount < 3)
                {
                    var read = SyncRead
                        ? stream.Read(memory.Span.Slice(readCount))
                        : await stream.ReadAsync(memory.Slice(readCount), cancellationToken).ConfigureAwait(ConfigureAwait);
                    if (read == 0)
                    {
                        break;
                    }
                    readCount += read;
                }

                if (readCount >= 3 && memory.Span.Slice(0, 3).SequenceEqual(Encoding.UTF8.Preamble))
                {
                    // drop the BOM by moving the payload to the head of the segment
                    memory.Span.Slice(3, readCount - 3).CopyTo(memory.Span);
                    writer.Advance(readCount - 3);
                }
                else
                {
                    writer.Advance(readCount);
                }

                checkPreamble = false;
            }

            while (true)
            {
                var read = SyncRead
                    ? stream.Read(writer.GetMemory().Span)
                    : await stream.ReadAsync(writer.GetMemory(), cancellationToken).ConfigureAwait(ConfigureAwait);

                if (read == 0)
                {
                    break;
                }
                else
                {
                    writer.Advance(read);
                }
            }

            endOfStream = true;
            return writer.ToArrayAndDispose();
        }
    }

    /// <summary>
    /// Returns the next line without its line terminator, or null when no more data is available.
    /// Completes synchronously when a line is already buffered.
    /// </summary>
    public ValueTask<ReadOnlyMemory<byte>?> ReadLineAsync(CancellationToken cancellationToken = default)
    {
        if (TryReadLine(out var line))
        {
            return new ValueTask<ReadOnlyMemory<byte>?>(line);
        }

        return Core(cancellationToken);

#if !NETSTANDARD
        [AsyncMethodBuilder(typeof(PoolingAsyncValueTaskMethodBuilder<>))]
#endif
        async ValueTask<ReadOnlyMemory<byte>?> Core(CancellationToken cancellationToken)
        {
            if (await LoadIntoBufferAsync(cancellationToken).ConfigureAwait(ConfigureAwait))
            {
                if (TryReadLine(out var line))
                {
                    return line;
                }
            }
            return null;
        }
    }

    /// <summary>Enumerates all remaining lines; each load is drained with TryReadLine before reading again.</summary>
    public async IAsyncEnumerable<ReadOnlyMemory<byte>> ReadAllLinesAsync([EnumeratorCancellation] CancellationToken cancellationToken = default)
    {
        while (await LoadIntoBufferAsync(cancellationToken).ConfigureAwait(ConfigureAwait))
        {
            while (TryReadLine(out var line))
            {
                yield return line;
            }
        }
    }

    /// <summary>Gets the next buffered byte without consuming it; false when the buffer is empty.</summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public bool TryPeek(out byte data)
    {
        ThrowIfDisposed();

        if (positionEnd - positionBegin > 0)
        {
            data = inputBuffer[positionBegin];
            return true;
        }

        data = default;
        return false;
    }

    /// <summary>Gets the next byte without consuming it, loading from the stream when the buffer is empty.</summary>
    /// <exception cref="EndOfStreamException">No more data is available.</exception>
    public ValueTask<byte> PeekAsync(CancellationToken cancellationToken = default)
    {
        if (TryPeek(out var data))
        {
            return new ValueTask<byte>(data);
        }

        return Core(cancellationToken);

#if !NETSTANDARD
        [AsyncMethodBuilder(typeof(PoolingAsyncValueTaskMethodBuilder<>))]
#endif
        async ValueTask<byte> Core(CancellationToken cancellationToken)
        {
            await LoadIntoBufferAtLeastAsync(1, cancellationToken).ConfigureAwait(ConfigureAwait);
            return inputBuffer[positionBegin];
        }
    }

    /// <summary>Consumes the next buffered byte; false when the buffer is empty.</summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public bool TryRead(out byte data)
    {
        ThrowIfDisposed();

        if (TryPeek(out data))
        {
            positionBegin += 1;
            // the consumed position invalidates any cached newline search
            lastNewLinePosition = lastExaminedPosition = -1;
            return true;
        }

        data = default;
        return false;
    }

    /// <summary>Consumes the next byte, loading from the stream when the buffer is empty.</summary>
    /// <exception cref="EndOfStreamException">No more data is available.</exception>
    public ValueTask<byte> ReadAsync(CancellationToken cancellationToken = default)
    {
        ThrowIfDisposed();

        if (TryRead(out var data))
        {
            return new ValueTask<byte>(data);
        }

        return Core(cancellationToken);

#if !NETSTANDARD
        [AsyncMethodBuilder(typeof(PoolingAsyncValueTaskMethodBuilder<>))]
#endif
        async ValueTask<byte> Core(CancellationToken cancellationToken)
        {
            await LoadIntoBufferAtLeastAsync(1, cancellationToken).ConfigureAwait(ConfigureAwait);
            TryRead(out var data);
            return data;
        }
    }

    /// <summary>
    /// Consumes exactly <paramref name="count"/> buffered bytes as a slice of the internal buffer;
    /// false when fewer than <paramref name="count"/> bytes are buffered.
    /// </summary>
    public bool TryReadBlock(int count, out ReadOnlyMemory<byte> block)
    {
        ThrowIfDisposed();

        var loaded = positionEnd - positionBegin;
        if (count <= loaded) // having exactly count bytes is enough
        {
            block = inputBuffer.AsMemory(positionBegin, count);
            positionBegin += count;
            lastNewLinePosition = lastExaminedPosition = -1;
            return true;
        }

        block = default;
        return false;
    }

    /// <summary>Consumes exactly <paramref name="count"/> bytes, loading from the stream as needed.</summary>
    /// <exception cref="EndOfStreamException">The stream ended before count bytes were available.</exception>
    public ValueTask<ReadOnlyMemory<byte>> ReadBlockAsync(int count, CancellationToken cancellationToken = default)
    {
        if (TryReadBlock(count, out var block))
        {
            return new ValueTask<ReadOnlyMemory<byte>>(block);
        }

        return Core(count, cancellationToken);

#if !NETSTANDARD
        [AsyncMethodBuilder(typeof(PoolingAsyncValueTaskMethodBuilder<>))]
#endif
        async ValueTask<ReadOnlyMemory<byte>> Core(int count, CancellationToken cancellationToken)
        {
            await LoadIntoBufferAtLeastAsync(count, cancellationToken).ConfigureAwait(ConfigureAwait);
            TryReadBlock(count, out var block);
            return block;
        }
    }

    // Doubles the capacity; the uint comparison also catches int overflow of the doubling and clamps to ArrayMaxLength.
    static int GetNewSize(int capacity)
    {
        int newCapacity = unchecked(capacity * 2);
        if ((uint)newCapacity > ArrayMaxLength) newCapacity = ArrayMaxLength;
        return newCapacity;
    }

    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    static int IndexOfNewline(ReadOnlySpan<byte> span, out int examined)
    {
        // we only supports LF(\n) or CRLF(\r\n).
var indexOfNewLine = span.IndexOf((byte)'\n'); if (indexOfNewLine == -1) { examined = span.Length - 1; return -1; } examined = indexOfNewLine; if (indexOfNewLine >= 1 && span[indexOfNewLine - 1] == '\r') { indexOfNewLine--; // case of '\r\n' } return indexOfNewLine; } // Reset API like Utf8JsonWriter public void Reset() { ThrowIfDisposed(); ClearState(); } public void Reset(Stream stream) { ThrowIfDisposed(); ClearState(); this.inputBuffer = ArrayPool.Shared.Rent(Math.Max(bufferSize, MinBufferSize)); this.stream = stream; } public void Dispose() { if (isDisposed) return; isDisposed = true; ClearState(); } public async ValueTask DisposeAsync() { if (isDisposed) return; isDisposed = true; if (!leaveOpen && stream != null) { await stream.DisposeAsync().ConfigureAwait(ConfigureAwait); stream = null!; } ClearState(); } void ClearState() { if (inputBuffer != null) { ArrayPool.Shared.Return(inputBuffer); inputBuffer = null!; } if (!leaveOpen && stream != null) { stream.Dispose(); stream = null!; } positionBegin = positionEnd = 0; endOfStream = false; checkPreamble = skipBom; lastNewLinePosition = lastExaminedPosition = -2; } void ThrowIfDisposed() { if (isDisposed) throw new ObjectDisposedException(""); } } ================================================ FILE: src/Utf8StreamReader/Utf8StreamReader.csproj ================================================  netstandard2.1;net6.0;net8.0 12 enable enable Cysharp.IO true 1701;1702;1591;1573 true string Utf8 based StreamReader for high performance text processing. all runtime; build; native; contentfiles; analyzers; buildtransitive ================================================ FILE: src/Utf8StreamReader/Utf8TextReader.cs ================================================ using System.Buffers; using System.Runtime.CompilerServices; using System.Text; namespace Cysharp.IO; public sealed class Utf8TextReader : IDisposable, IAsyncDisposable { const int DefaultCharBufferSize = 1024; // buffer per line. 
const int MinBufferSize = 128;

    readonly Utf8StreamReader reader;
    readonly int bufferSize;
    char[] outputBuffer; // pooled scratch buffer holding the most recently decoded line
    bool isDisposed;

    /// <summary>Wraps <paramref name="reader"/> using the default char buffer size.</summary>
    public Utf8TextReader(Utf8StreamReader reader)
        : this(reader, DefaultCharBufferSize)
    {
    }

    /// <summary>
    /// Wraps <paramref name="reader"/> and rents a char buffer of at least
    /// <paramref name="bufferSize"/> (never less than MinBufferSize) chars.
    /// </summary>
    /// <exception cref="ArgumentNullException"><paramref name="reader"/> is null.</exception>
    public Utf8TextReader(Utf8StreamReader reader, int bufferSize)
    {
        // validate before renting so a bad argument does not take a pooled array with it
        this.reader = reader ?? throw new ArgumentNullException(nameof(reader));
        this.outputBuffer = ArrayPool<char>.Shared.Rent(Math.Max(bufferSize, MinBufferSize));
        this.bufferSize = bufferSize;
    }

    public Stream BaseStream => reader.BaseStream;
    public Utf8StreamReader BaseReader => reader;

    /// <summary>Forwards to the underlying reader's LoadIntoBufferAsync.</summary>
    public ValueTask<bool> LoadIntoBufferAsync(CancellationToken cancellationToken = default)
    {
        return reader.LoadIntoBufferAsync(cancellationToken);
    }

    /// <summary>
    /// Takes the next UTF-8 line from the underlying reader and decodes it into the shared char buffer.
    /// The returned memory is a slice of that buffer, which the next call writes into again.
    /// </summary>
    public bool TryReadLine(out ReadOnlyMemory<char> line)
    {
        if (!reader.TryReadLine(out var utf8Line))
        {
            line = default;
            return false;
        }

        var maxCharCount = Encoding.UTF8.GetMaxCharCount(utf8Line.Length);
        if (outputBuffer.Length < maxCharCount)
        {
            // need new buffer
            ArrayPool<char>.Shared.Return(outputBuffer);
            outputBuffer = ArrayPool<char>.Shared.Rent(maxCharCount);
        }

        var size = Encoding.UTF8.GetChars(utf8Line.Span, outputBuffer);
        line = outputBuffer.AsMemory(0, size);
        return true;
    }

    /// <summary>
    /// Returns the next decoded line, or null when no more data is available.
    /// Completes synchronously when a line is already buffered.
    /// </summary>
    public ValueTask<ReadOnlyMemory<char>?> ReadLineAsync(CancellationToken cancellationToken = default)
    {
        if (TryReadLine(out var line))
        {
            return new ValueTask<ReadOnlyMemory<char>?>(line);
        }

        return Core(cancellationToken);

#if !NETSTANDARD
        [AsyncMethodBuilder(typeof(PoolingAsyncValueTaskMethodBuilder<>))]
#endif
        async ValueTask<ReadOnlyMemory<char>?> Core(CancellationToken cancellationToken)
        {
            if (await LoadIntoBufferAsync(cancellationToken).ConfigureAwait(reader.ConfigureAwait))
            {
                if (TryReadLine(out var line))
                {
                    return line;
                }
            }
            return null;
        }
    }

    /// <summary>Enumerates all remaining lines; each load is drained with TryReadLine before reading again.</summary>
    public async IAsyncEnumerable<ReadOnlyMemory<char>> ReadAllLinesAsync([EnumeratorCancellation] CancellationToken cancellationToken = default)
    {
        while (await LoadIntoBufferAsync(cancellationToken).ConfigureAwait(reader.ConfigureAwait))
        {
            while (TryReadLine(out var line))
            {
                yield return line;
            }
        }
    }

    // Utf8TextReader is a helper class for ReadOnlyMemory<char> and string generation that internally holds Utf8StreamReader
public async ValueTask ReadToEndAsync(CancellationToken cancellationToken = default) { // Using a method similar to .NET 9 LINQ to Objects's ToArray improvement, returns a structure optimized for gap-free sequential expansion // StreamReader.ReadToEnd copies the buffer to a StringBuilder, but this implementation holds char[] chunks(char[][]) without copying. using var writer = new SegmentedArrayBufferWriter(); var decoder = Encoding.UTF8.GetDecoder(); // Utf8StreamReader.ReadToEndChunksAsync returns the internal buffer ReadOnlyMemory as an asynchronous sequence upon each read completion await foreach (var chunk in reader.ReadToEndChunksAsync(cancellationToken).ConfigureAwait(reader.ConfigureAwait)) { var input = chunk; while (input.Length != 0) { // The Decoder directly writes from the read buffer to the char[] buffer decoder.Convert(input.Span, writer.GetMemory().Span, flush: false, out var bytesUsed, out var charsUsed, out var completed); input = input.Slice(bytesUsed); writer.Advance(charsUsed); } } decoder.Convert([], writer.GetMemory().Span, flush: true, out _, out var finalCharsUsed, out _); writer.Advance(finalCharsUsed); // Directly generate a string from the char[][] buffer using String.Create return string.Create(writer.WrittenCount, writer, static (stringSpan, writer) => { foreach (var item in writer.GetSegmentsAndDispose()) { item.Span.CopyTo(stringSpan); stringSpan = stringSpan.Slice(item.Length); } }); } public void Reset() { ThrowIfDisposed(); ClearState(); reader.Reset(); } public void Reset(Stream stream) { ThrowIfDisposed(); ClearState(); outputBuffer = ArrayPool.Shared.Rent(Math.Max(bufferSize, MinBufferSize)); reader.Reset(stream); } public void Dispose() { if (isDisposed) return; isDisposed = true; ClearState(); reader.Dispose(); } public ValueTask DisposeAsync() { if (isDisposed) return default; isDisposed = true; ClearState(); return reader.DisposeAsync(); } void ClearState() { if (outputBuffer != null) { 
ArrayPool.Shared.Return(outputBuffer); outputBuffer = null!; } } void ThrowIfDisposed() { if (isDisposed) throw new ObjectDisposedException(""); } } public static class Utf8StreamReaderExtensions { public static Utf8TextReader AsTextReader(this Utf8StreamReader reader) => new Utf8TextReader(reader); public static Utf8TextReader AsTextReader(this Utf8StreamReader reader, int bufferSize) => new Utf8TextReader(reader, bufferSize); } ================================================ FILE: tests/Utf8StreamReader.Tests/FakeMemoryStream.cs ================================================ #pragma warning disable CS1998 using System.Runtime.CompilerServices; using System.Runtime.InteropServices; namespace Utf8StreamReaderTests; internal class FakeMemoryStream : Stream { #region NotImplemented public override bool CanRead => true; public override bool CanSeek => throw new NotImplementedException(); public override bool CanWrite => throw new NotImplementedException(); public override long Length => throw new NotImplementedException(); public override long Position { get => throw new NotImplementedException(); set => throw new NotImplementedException(); } public override void Flush() { throw new NotImplementedException(); } public override int Read(byte[] buffer, int offset, int count) { throw new NotImplementedException(); } public override long Seek(long offset, SeekOrigin origin) { throw new NotImplementedException(); } public override void SetLength(long value) { throw new NotImplementedException(); } public override void Write(byte[] buffer, int offset, int count) { throw new NotImplementedException(); } #endregion public bool IsDisposed { get; set; } protected override void Dispose(bool disposing) { IsDisposed = true; } Memory[] lastAddedData = default!; Queue>> data = new(); public void AddMemory(params Memory[] memories) { foreach (Memory mem in memories) { if (mem.Length == 0) throw new ArgumentException("Length 0 is not allowed."); data.Enqueue(new(mem)); } 
this.lastAddedData = memories; } public void Restart() { data.Clear(); AddMemory(lastAddedData); } public override async ValueTask ReadAsync(Memory buffer, CancellationToken cancellationToken = default) { if (data.Count == 0) { return 0; } var memory = data.Peek().Value; var copySize = Math.Min(memory.Length, buffer.Length); memory.Slice(0, copySize).CopyTo(buffer); var newMemory = memory.Slice(copySize); if (newMemory.Length == 0) { data.Dequeue(); } else { data.Peek().Value = newMemory; } return copySize; } } ================================================ FILE: tests/Utf8StreamReader.Tests/FileReadTest.cs ================================================ using System; using System.Collections.Generic; using System.Linq; using System.Reflection; using System.Text; using System.Threading.Tasks; namespace Utf8StreamReaderTests; public class FileReadTest(ITestOutputHelper Console) { [Fact] public async Task ReadPath() { var path1 = Path.Combine(Path.GetDirectoryName(typeof(FileReadTest).Assembly.FullName!)!, "file1.txt"); var actual = await Utf8StreamReaderResultAsync(path1); actual.Should().Equal([ "abcde", "fgh", "ijklmnopqrs" ]); } static async Task Utf8StreamReaderResultAsync(string path) { using var reader = new Utf8StreamReader(path); var l = new List(); await foreach (var item in reader.ReadAllLinesAsync()) { l.Add(Encoding.UTF8.GetString(item.Span)); } return l.ToArray(); } } ================================================ FILE: tests/Utf8StreamReader.Tests/ReadBlockTest.cs ================================================ using System; using System.Collections.Generic; using System.Linq; using System.Text; using System.Threading.Tasks; namespace Utf8StreamReaderTests; public class ReadBlockTest { [Fact] public async Task LineAndBlock() { var ms = new FakeMemoryStream(); ms.AddMemory( GetBytes("a"), GetBytes("bc\n"), GetBytes("def\r\n"), GetBytes("ghij\n"), GetBytes("zklmno\r\n\n")); //var sr = new StreamReader(ms); //var a = await sr.ReadLineAsync(); //var 
buf = new char[1024]; //await sr.ReadBlockAsync(buf.AsMemory(0, 10)); var reader = new Utf8StreamReader(ms); ToString((await reader.ReadLineAsync()).Value).Should().Be("abc"); ToString((await reader.ReadBlockAsync(2))).Should().Be("de"); ToString((await reader.ReadLineAsync()).Value).Should().Be("f"); ToString((await reader.ReadBlockAsync(8))).Should().Be("ghij\nzkl"); ToString((await reader.ReadLineAsync()).Value).Should().Be("mno"); } static byte[] GetBytes(string x) { return Encoding.UTF8.GetBytes(x); } static string ToString(ReadOnlyMemory buffer) { return Encoding.UTF8.GetString(buffer.Span); } } ================================================ FILE: tests/Utf8StreamReader.Tests/ReadTest.cs ================================================ using System; using System.Collections.Generic; using System.Linq; using System.Text; using System.Threading.Tasks; namespace Utf8StreamReaderTests; public class ReadTest { [Fact] public async Task ReadToEndAsync() { // with bom { var bom = Encoding.UTF8.GetPreamble(); var ms = new FakeMemoryStream(); ms.AddMemory( new byte[] { bom[0] }, new byte[] { bom[1] }, new byte[] { bom[2], (byte)'Z' }, GetBytes("a"), GetBytes("bc\n"), GetBytes("def\r\n"), GetBytes("ghij\n"), GetBytes("zklmno\r\n\n")); var sr = new Utf8StreamReader(ms); var result = await sr.ReadToEndAsync(disableBomCheck: false); var expected = "Zabc\ndef\r\nghij\nzklmno\r\n\n"; var actual = ToString(result); actual.Should().Be(expected); } // no bom { var ms = new FakeMemoryStream(); ms.AddMemory( new byte[] { (byte)'Z' }, GetBytes("a"), GetBytes("bc\n"), GetBytes("def\r\n"), GetBytes("ghij\n"), GetBytes("zklmno\r\n\n")); var sr = new Utf8StreamReader(ms); var result = await sr.ReadToEndAsync(); var expected = "Zabc\ndef\r\nghij\nzklmno\r\n\n"; var actual = ToString(result); actual.Should().Be(expected); } } [Fact] public async Task ReadToEndChunks() { var bom = Encoding.UTF8.GetPreamble(); var ms = new FakeMemoryStream(); ms.AddMemory( new byte[] { bom[0] }, new 
byte[] { bom[1] }, new byte[] { bom[2], (byte)'Z' }, GetBytes("a"), GetBytes("bc\n"), GetBytes("def\r\n"), GetBytes("ghij\n"), GetBytes("zklmno\r\n\n")); var sr = new Utf8StreamReader(ms); var list = new List(); await foreach (var item in sr.ReadToEndChunksAsync()) { list.Add(item.ToArray()); } ToString(list[0]).Should().Be("Z"); ToString(list[1]).Should().Be("a"); ToString(list[2]).Should().Be("bc\n"); ToString(list[3]).Should().Be("def\r\n"); ToString(list[4]).Should().Be("ghij\n"); ToString(list[5]).Should().Be("zklmno\r\n\n"); } [Fact] public async Task TestPeek() { var ms = new FakeMemoryStream(); ms.AddMemory( GetBytes("a"), GetBytes("bc\n"), GetBytes("def\r\n"), GetBytes("ghij\n"), GetBytes("zklmno\r\n\n")); var sr = new Utf8StreamReader(ms); sr.TryPeek(out var data).Should().BeFalse(); (await sr.PeekAsync()).Should().Be((byte)'a'); sr.TryPeek(out data).Should().BeTrue(); data.Should().Be((byte)'a'); ToString(await sr.ReadLineAsync()).Should().Be("abc"); (await sr.PeekAsync()).Should().Be((byte)'d'); ToString(await sr.ReadLineAsync()).Should().Be("def"); } // LoadIntoBufferAtLeastAsync // TryRead // ReadAsync [Fact] public async Task TestRead() { var ms = new FakeMemoryStream(); ms.AddMemory( GetBytes("a"), GetBytes("bc\n"), GetBytes("def\r\n"), GetBytes("ghij\n"), GetBytes("zklmno\r\n\n")); var sr = new Utf8StreamReader(ms); await sr.LoadIntoBufferAtLeastAsync(2); sr.TryRead(out var a).Should().BeTrue(); a.Should().Be((byte)'a'); sr.TryRead(out var b).Should().BeTrue(); b.Should().Be((byte)'b'); sr.TryRead(out var c).Should().BeTrue(); c.Should().Be((byte)'c'); sr.TryRead(out var n).Should().BeTrue(); n.Should().Be((byte)'\n'); sr.TryRead(out _).Should().BeFalse(); (await sr.ReadAsync()).Should().Be((byte)'d'); } static byte[] GetBytes(string x) { return Encoding.UTF8.GetBytes(x); } static string ToString(ReadOnlyMemory? 
buffer) { if (buffer == null) return null!; return Encoding.UTF8.GetString(buffer.Value.Span); } } ================================================ FILE: tests/Utf8StreamReader.Tests/ReadToEndTest.cs ================================================ using System.Text; namespace Utf8StreamReaderTests; public class ReadToEndTest { [Fact] public async Task AfterRead() { var ms = new FakeMemoryStream(); ms.AddMemory( GetBytes("a"), GetBytes("bc\n"), GetBytes("def\r\n"), GetBytes("ghij\n"), GetBytes("zklmno\r\n\n")); var all = await new Utf8StreamReader(ms).ReadToEndAsync(); ms.Restart(); var reader = new Utf8StreamReader(ms); await reader.ReadLineAsync(); var expected = "def\r\nghij\nzklmno\r\n\n"; var actual = await reader.ReadToEndAsync(resultSizeHint: all.Length); Encoding.UTF8.GetString(actual).Should().Be(expected); } [Fact] public async Task SmallHint() { var ms = new FakeMemoryStream(); ms.AddMemory( GetBytes("a"), GetBytes("bc\n"), GetBytes("def\r\n"), GetBytes("ghij\n"), GetBytes("zklmno\r\n\n")); var reader = new Utf8StreamReader(ms); var expected = "abc\ndef\r\nghij\nzklmno\r\n\n"; await Assert.ThrowsAsync(async () => { var actual = await reader.ReadToEndAsync(resultSizeHint: Encoding.UTF8.GetByteCount(expected) - 2); }); } [Fact] public async Task Just() { var ms = new FakeMemoryStream(); ms.AddMemory( GetBytes("a"), GetBytes("bc\n"), GetBytes("def\r\n"), GetBytes("ghij\n"), GetBytes("zklmno\r\n\n")); var reader = new Utf8StreamReader(ms); var expected = "abc\ndef\r\nghij\nzklmno\r\n\n"; var actual = await reader.ReadToEndAsync(resultSizeHint: expected.Length); Encoding.UTF8.GetString(actual).Should().Be(expected); } static byte[] GetBytes(string x) { return Encoding.UTF8.GetBytes(x); } } ================================================ FILE: tests/Utf8StreamReader.Tests/SegmentedArrayBufferWriterTest.cs ================================================ namespace Utf8StreamReaderTests; public class SegmentedArrayBufferWriterTest { [Fact(Skip = "Reduce memory 
/// <summary>
/// Line-reading tests for <c>Utf8StreamReader</c>. Most scenarios read the same bytes once with
/// <see cref="StreamReader"/> and once with <c>Utf8StreamReader</c> and require identical lines.
/// </summary>
/// <param name="Console">xUnit output helper; decoded lines are written to it for diagnostics.</param>
public class Tests(ITestOutputHelper Console)
{
    /// <summary>Reads every line and checks that re-joining them reproduces the input text.</summary>
    [Fact]
    public async Task Standard()
    {
        // NOTE(review): this raw string literal appears on a single line in this view of the file;
        // if it was multi-line in the repository, restore the original line breaks.
        var originalStrings = """ foo bare baz boz too """;

        var stream = CreateStringStream(originalStrings);
        using var reader = new Utf8StreamReader(stream);

        var rejoined = await ReadAndRejoinLinesAsync(reader);

        rejoined.Should().Be(originalStrings);
    }

    /// <summary>Same as <see cref="Standard"/>, but the input is prefixed with the UTF-8 preamble.</summary>
    [Fact]
    public async Task BOM()
    {
        var bytes = Encoding.UTF8.GetPreamble().Concat(""" foo bare baz boz too """u8.ToArray()).ToArray();
        var bomStrings = Encoding.UTF8.GetString(bytes);
        var stream = CreateStringStream(bomStrings);

        // The rejoined result is compared against the text WITHOUT the preamble.
        var originalStrings = """ foo bare baz boz too """;

        using var reader = new Utf8StreamReader(stream);

        var rejoined = await ReadAndRejoinLinesAsync(reader);

        rejoined.Should().Be(originalStrings);
    }

    /// <summary>Compares against <see cref="StreamReader"/> for inputs ending with zero, one and two newlines.</summary>
    [Fact]
    public async Task NewLineCheck()
    {
        // The leading chunks are identical in every scenario; only the final chunk's ending varies.
        string[] lastChunks = ["jklmno", "jklmno\r\n", "jklmno\r\n\n"];

        foreach (var lastChunk in lastChunks)
        {
            var ms = new FakeMemoryStream();
            ms.AddMemory(
                GetBytes("a"),
                GetBytes("bc\n"),
                GetBytes("def\r\n"),
                GetBytes("ghij\n"),
                GetBytes(lastChunk));

            await AssertSameLinesAsStreamReaderAsync(ms);
        }
    }

    /// <summary>Preamble followed by short/long content, plus the <c>SkipBom = false</c> case.</summary>
    [Fact]
    public async Task BOM2()
    {
        {
            // small bom
            var ms = new FakeMemoryStream();
            ms.AddMemory(
                Encoding.UTF8.GetPreamble(),
                GetBytes("a"));

            await AssertSameLinesAsStreamReaderAsync(ms);
        }
        {
            // long bom
            var ms = new FakeMemoryStream();
            ms.AddMemory(
                Encoding.UTF8.GetPreamble(),
                GetBytes("abcdefghijklmnopqrastu"));

            await AssertSameLinesAsStreamReaderAsync(ms);
        }
        // yes bom: with SkipBom disabled the preamble bytes must be part of the returned line
        {
            var ms = new FakeMemoryStream();
            ms.AddMemory(
                Encoding.UTF8.GetPreamble(),
                GetBytes("あいうえお")); // japanese hiragana.

            var reader = new Utf8StreamReader(ms) { SkipBom = false };
            var line = await reader.ReadLineAsync();

            // Fail with an assertion instead of an InvalidOperationException when no line is returned.
            line.HasValue.Should().BeTrue();
            line.Value.Slice(0, 3).Span.SequenceEqual(Encoding.UTF8.Preamble).Should().BeTrue();
            line.Value.Slice(3).Span.SequenceEqual(GetBytes("あいうえお")).Should().BeTrue();
        }
    }

    /// <summary>Empty input, preamble-only input and newline-only inputs.</summary>
    [Fact]
    public async Task EmptyString()
    {
        // empty stream: a plain MemoryStream is used, so a fresh instance is created for each reader
        {
            var ms = new MemoryStream();
            var expected = await StreamReaderResultAsync(ms);

            ms = new MemoryStream();
            var actual = await Utf8StreamReaderResultAsync(ms);

            actual.Should().Equal(expected);
        }
        // bom only
        {
            var ms = new FakeMemoryStream();
            ms.AddMemory(Encoding.UTF8.GetPreamble());

            await AssertSameLinesAsStreamReaderAsync(ms);
        }
        // newline only
        {
            var ms = new FakeMemoryStream();
            ms.AddMemory(GetBytes("\r\n"));

            await AssertSameLinesAsStreamReaderAsync(ms);
        }
        // newline only 2
        {
            var ms = new FakeMemoryStream();
            ms.AddMemory(GetBytes("\n\r\n"));

            await AssertSameLinesAsStreamReaderAsync(ms);
        }
    }

    /// <summary>A single one-byte chunk without any newline.</summary>
    [Fact]
    public async Task SmallString()
    {
        var ms = new FakeMemoryStream();
        ms.AddMemory(GetBytes("z"));

        await AssertSameLinesAsStreamReaderAsync(ms);
    }

    // minbuffer = 1024
    /// <summary>A second line longer than the 1024-byte buffer size passed to the reader.</summary>
    [Fact]
    public async Task Resize()
    {
        var bufferSize = 1024;

        var ms = new FakeMemoryStream();
        ms.AddMemory(
            GetBytes("!!!\r\n"), // first line consume
            GetBytes(new string('a', 1018)),
            GetBytes("bcdefghijklmnopqrstuvwxyz\r\n"),
            GetBytes("あいうえおかきくけこ\n"),
            GetBytes("ABCDEFGHIJKLMN")
        );

        await AssertSameLinesAsStreamReaderAsync(ms, bufferSize);
    }

    /// <summary>First chunk holds a 1018-character line, its CRLF and the start of the next line.</summary>
    [Fact]
    public async Task OnlySlice()
    {
        var bufferSize = 1024;

        var ms = new FakeMemoryStream();
        ms.AddMemory(
            GetBytes(new string('a', 1018) + "\r\nbcdefgh"),
            GetBytes("あいうえおかきくけこ\n"),
            GetBytes("ABCDEFGHIJKLMN")
        );

        await AssertSameLinesAsStreamReaderAsync(ms, bufferSize);
    }

    /// <summary>A 30000-character first line read with a 1024-byte buffer size.</summary>
    [Fact]
    public async Task HugeBuffer()
    {
        var bufferSize = 1024;

        var ms = new FakeMemoryStream();
        ms.AddMemory(
            GetBytes(new string('a', 30000) + "\r\nb"),
            GetBytes("あいうえおかきくけこ\n"),
            GetBytes("ABCDEFGHIJKLMN")
        );

        await AssertSameLinesAsStreamReaderAsync(ms, bufferSize);
    }

    /// <summary>A CRLF pair that is split across two buffer fills must still terminate exactly one line.</summary>
    [Fact]
    public async Task NewLineTrimmedAtBufferBoundary()
    {
        // Buffer 1: aaa....\r\nasdf\r
        // Buffer 2: \nasdf
        var ms2 = CreateStringStream(
            new string('a', 1017) + "\r\nasdf" +
            "\r\nasdf");

        var actual = await Utf8StreamReaderResultAsync(ms2, 1024);

        string[] expected =
        [
            new string('a', 1017),
            "asdf",
            "asdf",
        ];

        actual.Should().Equal(expected);
    }

    /// <summary>
    /// Reads all lines from <paramref name="reader"/>, writes each decoded line to the test output,
    /// and returns the lines joined with <see cref="StringBuilder.AppendLine()"/> separators
    /// (no separator after the last line).
    /// </summary>
    async Task<string> ReadAndRejoinLinesAsync(Utf8StreamReader reader)
    {
        var sb = new StringBuilder();
        var isFirst = true;

        // ReadLineAsync yields null once there are no more lines.
        while (await reader.ReadLineAsync() is { } line)
        {
            if (isFirst)
            {
                isFirst = false;
            }
            else
            {
                sb.AppendLine();
            }

            // Decode once and reuse for both the log and the result.
            var text = Encoding.UTF8.GetString(line.Span);
            Console.WriteLine(text);
            sb.Append(text);
        }

        return sb.ToString();
    }

    /// <summary>
    /// Reads <paramref name="ms"/> with <see cref="StreamReader"/>, restarts it, reads it again with
    /// <c>Utf8StreamReader</c> (optionally with an explicit buffer size) and asserts both produced the same lines.
    /// </summary>
    static async Task AssertSameLinesAsStreamReaderAsync(FakeMemoryStream ms, int? size = null)
    {
        var expected = await StreamReaderResultAsync(ms);

        // Restart() is called so the second reader can consume the same stream again
        // (FakeMemoryStream is defined elsewhere in the test project).
        ms.Restart();

        var actual = await Utf8StreamReaderResultAsync(ms, size);

        actual.Should().Equal(expected);
    }

    /// <summary>
    /// Reads every line of <paramref name="ms"/> with <c>Utf8StreamReader</c> and returns them decoded as strings.
    /// </summary>
    /// <param name="ms">Stream to read.</param>
    /// <param name="size">Buffer size passed to the reader's constructor; <c>null</c> uses the single-argument constructor.</param>
    static async Task<string[]> Utf8StreamReaderResultAsync(Stream ms, int? size = null)
    {
        var reader = (size == null)
            ? new Utf8StreamReader(ms)
            : new Utf8StreamReader(ms, size.Value);

        var l = new List<string>();
        await foreach (var item in reader.ReadAllLinesAsync())
        {
            l.Add(GetString(item));
        }
        return l.ToArray();
    }

    /// <summary>
    /// Reads every line of <paramref name="ms"/> with <see cref="StreamReader"/>; this is the reference result.
    /// </summary>
    static async Task<string[]> StreamReaderResultAsync(Stream ms)
    {
        // NOTE(review): the reader is not disposed; callers reuse the stream afterwards and
        // disposing a StreamReader closes its underlying stream by default — presumably intentional.
        var reader = new StreamReader(ms);

        var l = new List<string>();
        string? s;
        while ((s = (await reader.ReadLineAsync())) != null)
        {
            l.Add(s);
        }
        return l.ToArray();
    }

    /// <summary>Decodes UTF-8 bytes to a string.</summary>
    static string GetString(ReadOnlyMemory<byte> x)
    {
        return Encoding.UTF8.GetString(x.Span);
    }

    /// <summary>Encodes a string as UTF-8 bytes.</summary>
    static byte[] GetBytes(string x)
    {
        return Encoding.UTF8.GetBytes(x);
    }

    /// <summary>Creates a <see cref="MemoryStream"/> over the UTF-8 encoding of <paramref name="input"/>.</summary>
    static MemoryStream CreateStringStream(string input) => new(Encoding.UTF8.GetBytes(input));
}
/// <summary>
/// Reads every line of <paramref name="ms"/> through the text-reader adapter returned by
/// <c>Utf8StreamReader.AsTextReader()</c> and returns each line converted with <c>ToString()</c>.
/// </summary>
/// <param name="ms">Stream to read; the adapter created over it is disposed when this method returns.</param>
/// <returns>The lines in the order they were read.</returns>
static async Task<string[]> Utf8TextReaderResultAsync(Stream ms)
{
    using var reader = new Utf8StreamReader(ms).AsTextReader();

    // The element type is string: each item is added via ToString().
    var lines = new List<string>();
    await foreach (var item in reader.ReadAllLinesAsync())
    {
        lines.Add(item.ToString());
    }

    return lines.ToArray();
}