From 00b963311e1e3c3dd8a9de820b3571e47a157a21 Mon Sep 17 00:00:00 2001 From: MarkPflug Date: Thu, 3 Jun 2021 11:18:24 -0700 Subject: [PATCH 1/5] add sylvan csv benchmark --- src/FileIO/FileIO.csproj | 1 + src/FileIO/WithSylvanLib.cs | 31 +++++++++++++++++++++++++++ tests/FileIO.Benchmarks/FileIOTest.cs | 18 +++++++++++++++- 3 files changed, 49 insertions(+), 1 deletion(-) create mode 100644 src/FileIO/WithSylvanLib.cs diff --git a/src/FileIO/FileIO.csproj b/src/FileIO/FileIO.csproj index a15d9bc..99f4de5 100644 --- a/src/FileIO/FileIO.csproj +++ b/src/FileIO/FileIO.csproj @@ -8,6 +8,7 @@ + diff --git a/src/FileIO/WithSylvanLib.cs b/src/FileIO/WithSylvanLib.cs new file mode 100644 index 0000000..8e771ae --- /dev/null +++ b/src/FileIO/WithSylvanLib.cs @@ -0,0 +1,31 @@ +using System.IO; +using Sylvan.Data.Csv; +using System.Buffers; + +namespace FileIO +{ + public class WithSylvanLib + { + public void ProcessFile(string filePath, Employee[] employeeRecords) + { + using var reader = new StreamReader(filePath); + + char[] buffer = ArrayPool.Shared.Rent(0x10000); + + using var csv = CsvDataReader.Create(reader, new CsvDataReaderOptions { Buffer = buffer }); + int idx = 0; + while (csv.Read()) + { + employeeRecords[idx++] = new Employee + { + Name = csv.GetString(0), + Email = csv.GetString(1), + DateOfJoining = csv.GetDateTime(2), + Salary = csv.GetDouble(3), + Age = csv.GetInt32(4), + }; + } + ArrayPool.Shared.Return(buffer); + } + } +} diff --git a/tests/FileIO.Benchmarks/FileIOTest.cs b/tests/FileIO.Benchmarks/FileIOTest.cs index 2d55254..2a0b67a 100644 --- a/tests/FileIO.Benchmarks/FileIOTest.cs +++ b/tests/FileIO.Benchmarks/FileIOTest.cs @@ -58,5 +58,21 @@ public void CsvHelper() var employeesList = csvHelper.ProcessFileAsync(_filePath); } + + [Benchmark] + public void Sylvan() + { + var directoryPath = Path.GetDirectoryName(Assembly.GetAssembly(typeof(Program))?.Location); + _filePath = Path.Combine(directoryPath ?? string.Empty, "Employees.csv"); + var sylv = new WithSylvanLib(); + var pool = ArrayPool.Shared; + var employeeRecords = pool.Rent(100000); + + try { + sylv.ProcessFile(_filePath, employeeRecords); + } finally { + pool.Return(employeeRecords, true); + } + } } -} \ No newline at end of file +} From c9d18c211ec890b7f13871f0269df202b2bba0ed Mon Sep 17 00:00:00 2001 From: MarkPflug Date: Fri, 4 Jun 2021 07:13:17 -0700 Subject: [PATCH 2/5] Increase buffer size. Avoid allocating temp string for date parsing. --- src/FileIO/WithPipeLines.cs | 40 ++++++++++++++++--------------------- 1 file changed, 17 insertions(+), 23 deletions(-) diff --git a/src/FileIO/WithPipeLines.cs b/src/FileIO/WithPipeLines.cs index 5a49eb4..e5ca061 100644 --- a/src/FileIO/WithPipeLines.cs +++ b/src/FileIO/WithPipeLines.cs @@ -2,6 +2,7 @@ using System; using System.Buffers; using System.Buffers.Text; +using System.Globalization; using System.IO; using System.IO.Pipelines; using System.Text; @@ -19,10 +20,10 @@ public class WithPipeLines /// PipeReader Sequence Position public async Task ProcessFileAsync(string filePath, Employee[] employeeRecords) { + const int BufferSize = 0x10000; var position = 0; - if (!File.Exists(filePath)) return position; - await using var fileStream = File.OpenRead(filePath); - var pipeReader = PipeReader.Create(fileStream); + await using var fileStream = new FileStream(filePath, FileMode.Open, FileAccess.Read, FileShare.Read, BufferSize); + var pipeReader = PipeReader.Create(fileStream, new StreamPipeReaderOptions(bufferSize: BufferSize)); while (true) { var fileData = await pipeReader.ReadAsync(); @@ -47,14 +48,16 @@ public async Task ProcessFileAsync(string filePath, Employee[] employeeReco private static SequencePosition ParseLines(Employee[] employeeRecords, in ReadOnlySequence buffer, ref int position) { var reader = new SequenceReader(buffer); + ReadOnlySpan line; + // skip the header row + reader.TryReadTo(out line, (byte)'\n', true); // Read the whole line till the new line is found - while (reader.TryReadTo(out ReadOnlySpan line, (byte)'\n', true)) + while (reader.TryReadTo(out line, (byte)'\n', true)) { var employee = LineParser.ParseLine(line); // we have a line to parse - if (employee is { }) // if the returned value is valid Employee object - employeeRecords[position++] = employee.Value; + employeeRecords[position++] = employee; } return reader.Position; // returning the Last position of the reader @@ -63,16 +66,9 @@ private static SequencePosition ParseLines(Employee[] employeeRecords, in ReadOn private static class LineParser { private const byte Coma = (byte)','; - private static readonly byte[] ColumnHeaders = Encoding.UTF8.GetBytes("Name,Email,DateOfJoining,Salary,Age"); - public static Employee? ParseLine(ReadOnlySpan line) + public static Employee ParseLine(ReadOnlySpan line) { - // REVIEW: There are better ways to do this - if (line.IndexOf(ColumnHeaders) >= 0) // Ignore the Header row - { - return null; - } - // Trim \r (if it exists) line = line.TrimEnd((byte)'\r'); @@ -104,19 +100,17 @@ private static class LineParser } case 3: { + // stand on our heads to avoid allocating a temp string to parse the date var buffer = line[..comaAt]; - if (DateTime.TryParse(Encoding.UTF8.GetString(line[..comaAt]), out var doj)) - + Span chars = stackalloc char[buffer.Length]; + for (int i = 0; i < buffer.Length; i++) + { + chars[i] = (char)buffer[i]; + } + if (DateTime.TryParse(chars, out var doj)) { record.DateOfJoining = doj; } - // Can't use Utf8 parser to extract datetime field because csv format doesn't have time - //https://docs.microsoft.com/en-us/dotnet/api/system.buffers.text.utf8parser.tryparse?view=net-5.0#System_Buffers_Text_Utf8Parser_TryParse_System_ReadOnlySpan_System_Byte__System_DateTime__System_Int32__System_Char_ - - // if (Utf8Parser.TryParse(buffer, out DateTime value, out var bytesConsumed)) - // { - // record.DateOfJoining = value; - // } break; } From 18c04d21480944370e029c1fe6b6f8ac83c6e0b0 Mon Sep 17 00:00:00 2001 From: MarkPflug Date: Fri, 4 Jun 2021 07:21:50 -0700 Subject: [PATCH 3/5] unroll loop --- src/FileIO/WithPipeLines.cs | 98 +++++++++++++------------------------ 1 file changed, 33 insertions(+), 65 deletions(-) diff --git a/src/FileIO/WithPipeLines.cs b/src/FileIO/WithPipeLines.cs index e5ca061..f77d8fd 100644 --- a/src/FileIO/WithPipeLines.cs +++ b/src/FileIO/WithPipeLines.cs @@ -2,7 +2,6 @@ using System; using System.Buffers; using System.Buffers.Text; -using System.Globalization; using System.IO; using System.IO.Pipelines; using System.Text; @@ -65,79 +64,48 @@ private static SequencePosition ParseLines(Employee[] employeeRecords, in ReadOn private static class LineParser { - private const byte Coma = (byte)','; + private const byte Comma = (byte)','; public static Employee ParseLine(ReadOnlySpan line) { // Trim \r (if it exists) line = line.TrimEnd((byte)'\r'); - var fieldCount = 1; - var record = new Employee(); - while (fieldCount <= 5) // we have five fields in csv file + var idx = line.IndexOf(Comma); + + record.Name = Encoding.UTF8.GetString(line[..idx]); + line = line[(idx + 1)..]; + idx = line.IndexOf(Comma); + record.Email = Encoding.UTF8.GetString(line[..idx]); + line = line[(idx + 1)..]; + idx = line.IndexOf(Comma); + + // stand on our heads to avoid allocating a temp string to parse the date + var buffer = line[..idx]; + Span chars = stackalloc char[buffer.Length]; + for (int i = 0; i < buffer.Length; i++) + { + chars[i] = (char)buffer[i]; + } + if (DateTime.TryParse(chars, out var doj)) + { + record.DateOfJoining = doj; + } + line = line[(idx + 1)..]; + idx = line.IndexOf(Comma); + + if (Utf8Parser.TryParse(line[..idx], out double salary, out _)) + { + record.Salary = salary; + } + + line = line[(idx + 1)..]; + + if (Utf8Parser.TryParse(line, out short age, out _)) { - var comaAt = line.IndexOf(Coma); - if (comaAt < 0) // No more comas are found we have reached the last field. - { - comaAt = line.Length; - } - - switch (fieldCount) - { - case 1: - { - var value = Encoding.UTF8.GetString(line[..comaAt]); - record.Name = value; - break; - } - case 2: - { - var value = Encoding.UTF8.GetString(line[..comaAt]); - record.Email = value; - break; - } - case 3: - { - // stand on our heads to avoid allocating a temp string to parse the date - var buffer = line[..comaAt]; - Span chars = stackalloc char[buffer.Length]; - for (int i = 0; i < buffer.Length; i++) - { - chars[i] = (char)buffer[i]; - } - if (DateTime.TryParse(chars, out var doj)) - { - record.DateOfJoining = doj; - } - break; - } - - case 4: - { - var buffer = line[..comaAt]; - if (Utf8Parser.TryParse(buffer, out double value, out var bytesConsumed)) - { - record.Salary = value; - } - break; - } - - case 5: - { - var buffer = line[..comaAt]; - if (Utf8Parser.TryParse(buffer, out short value, out var bytesConsumed)) - { - record.Age = value; - } - return record; - } - } - - line = line[(comaAt + 1)..]; // slice past field - - fieldCount++; + record.Age = age; } return record; From e155fc7e2438f5e6ab48a53b1afe2d34597d7835 Mon Sep 17 00:00:00 2001 From: MarkPflug Date: Fri, 4 Jun 2021 07:36:59 -0700 Subject: [PATCH 4/5] sylvan async. Increase buffer size. clean up benchmarks. --- src/FileIO/WithSylvanLib.cs | 30 +++++++++++++++-- tests/FileIO.Benchmarks/FileIOTest.cs | 47 +++++++++++++++------------ 2 files changed, 55 insertions(+), 22 deletions(-) diff --git a/src/FileIO/WithSylvanLib.cs b/src/FileIO/WithSylvanLib.cs index 8e771ae..c431300 100644 --- a/src/FileIO/WithSylvanLib.cs +++ b/src/FileIO/WithSylvanLib.cs @@ -1,6 +1,8 @@ using System.IO; using Sylvan.Data.Csv; using System.Buffers; +using System.Threading.Tasks; +using System.Text; namespace FileIO { @@ -8,9 +10,10 @@ public class WithSylvanLib { public void ProcessFile(string filePath, Employee[] employeeRecords) { - using var reader = new StreamReader(filePath); + const int BufferSize = 0x10000; + using var reader = new StreamReader(filePath, Encoding.UTF8, false, BufferSize); - char[] buffer = ArrayPool.Shared.Rent(0x10000); + char[] buffer = ArrayPool.Shared.Rent(BufferSize); using var csv = CsvDataReader.Create(reader, new CsvDataReaderOptions { Buffer = buffer }); int idx = 0; @@ -27,5 +30,28 @@ public void ProcessFile(string filePath, Employee[] employeeRecords) } ArrayPool.Shared.Return(buffer); } + + public async Task ProcessFileAsync(string filePath, Employee[] employeeRecords) + { + const int BufferSize = 0x10000; + using var reader = new StreamReader(filePath, Encoding.UTF8, false, BufferSize); + + char[] buffer = ArrayPool.Shared.Rent(BufferSize); + + await using var csv = await CsvDataReader.CreateAsync(reader, new CsvDataReaderOptions { Buffer = buffer }); + int idx = 0; + while (await csv.ReadAsync()) + { + employeeRecords[idx++] = new Employee + { + Name = csv.GetString(0), + Email = csv.GetString(1), + DateOfJoining = csv.GetDateTime(2), + Salary = csv.GetDouble(3), + Age = csv.GetInt32(4), + }; + } + ArrayPool.Shared.Return(buffer); + } } } diff --git a/tests/FileIO.Benchmarks/FileIOTest.cs b/tests/FileIO.Benchmarks/FileIOTest.cs index 2a0b67a..edcf8ae 100644 --- a/tests/FileIO.Benchmarks/FileIOTest.cs +++ b/tests/FileIO.Benchmarks/FileIOTest.cs @@ -13,18 +13,11 @@ namespace FileIO.Benchmarks [RankColumn()] public class FileIOTest { - private string _filePath; - [GlobalSetup] - public void Setup() - { - var directoryPath = Path.GetDirectoryName(Assembly.GetAssembly(typeof(Program))?.Location); - _filePath = Path.Combine(directoryPath ?? string.Empty, "Employees.csv"); - } + private string _filePath = "Employees.csv"; + [Benchmark] public async Task PipeLines() { - // var directoryPath = Path.GetDirectoryName(Assembly.GetAssembly(typeof(Program))?.Location); - //_filePath = Path.Combine(directoryPath ?? string.Empty, "Employees.csv"); var pool = ArrayPool.Shared; var employeeRecords = pool.Rent(100000); var pipeLinesTest = new WithPipeLines(); @@ -41,19 +34,15 @@ public async Task PipeLines() [Benchmark] public async Task> AsyncStream() - { - // var directoryPath = Path.GetDirectoryName(Assembly.GetAssembly(typeof(Program))?.Location); - // _filePath = Path.Combine(directoryPath ?? string.Empty, "Employees.csv"); + { var asyncStream = new WithAsyncStreams(); - var employees = await asyncStream.ProcessStreamAsync(_filePath); - return employees; + var employees = await asyncStream.ProcessStreamAsync(_filePath); + return employees; } [Benchmark] public void CsvHelper() { - // var directoryPath = Path.GetDirectoryName(Assembly.GetAssembly(typeof(Program))?.Location); - //_filePath = Path.Combine(directoryPath ?? string.Empty, "Employees.csv"); var csvHelper = new WithCsvHelperLib(); var employeesList = csvHelper.ProcessFileAsync(_filePath); @@ -62,15 +51,33 @@ public void CsvHelper() [Benchmark] public void Sylvan() { - var directoryPath = Path.GetDirectoryName(Assembly.GetAssembly(typeof(Program))?.Location); - _filePath = Path.Combine(directoryPath ?? string.Empty, "Employees.csv"); var sylv = new WithSylvanLib(); var pool = ArrayPool.Shared; var employeeRecords = pool.Rent(100000); - try { + try + { sylv.ProcessFile(_filePath, employeeRecords); - } finally { + } + finally + { + pool.Return(employeeRecords, true); + } + } + + [Benchmark] + public async Task SylvanAsync() + { + var sylv = new WithSylvanLib(); + var pool = ArrayPool.Shared; + var employeeRecords = pool.Rent(100000); + + try + { + await sylv.ProcessFileAsync(_filePath, employeeRecords); + } + finally + { pool.Return(employeeRecords, true); } } From c6b9b40adee13d9b21f819b9f22766365570f515 Mon Sep 17 00:00:00 2001 From: MarkPflug Date: Fri, 4 Jun 2021 08:11:09 -0700 Subject: [PATCH 5/5] Manually bind CsvHelper data to improve perf. --- src/FileIO/WithCsvHelperLib.cs | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/src/FileIO/WithCsvHelperLib.cs b/src/FileIO/WithCsvHelperLib.cs index 0a9e0b1..31e63e3 100644 --- a/src/FileIO/WithCsvHelperLib.cs +++ b/src/FileIO/WithCsvHelperLib.cs @@ -1,3 +1,4 @@ +using System; using System.Collections.Generic; using System.Globalization; using System.IO; @@ -9,12 +10,28 @@ namespace FileIO public class WithCsvHelperLib { - public IEnumerable ProcessFileAsync(string filePath) + public IEnumerable ProcessFileAsync(string filePath) { using var reader = new StreamReader(filePath); + + Employee[] employees = new Employee[100000]; using var csv = new CsvReader(reader, CultureInfo.InvariantCulture); - var records = csv.GetRecords(); - return records.ToList(); + int idx = 0; + csv.Read(); + while (csv.Read()) + { + var emp = new Employee + { + Name = csv[0], + Email = csv[1], + DateOfJoining = DateTime.Parse(csv[2]), + Salary = double.Parse(csv[3]), + Age = int.Parse(csv[4]), + }; + employees[idx++] = emp; + + } + return employees; } }