From 332ee73fb136e4dac6e859c4762d06d415f35cef Mon Sep 17 00:00:00 2001 From: Masahiro Nakagawa Date: Thu, 16 Feb 2017 01:43:29 +0900 Subject: [PATCH 1/3] buf_file: handle "Too many open files" error to keep buffer and metadata pair This is not incomplete but keep buffer and metadata pair phase as much as possible. --- lib/fluent/plugin/buffer/file_chunk.rb | 58 +++++++++++++++++++++----- 1 file changed, 47 insertions(+), 11 deletions(-) diff --git a/lib/fluent/plugin/buffer/file_chunk.rb b/lib/fluent/plugin/buffer/file_chunk.rb index 811e04de39..e601ece17c 100644 --- a/lib/fluent/plugin/buffer/file_chunk.rb +++ b/lib/fluent/plugin/buffer/file_chunk.rb @@ -106,10 +106,36 @@ def enqueued! write_metadata(update: false) # re-write metadata w/ finalized records - file_rename(@chunk, @path, new_chunk_path, ->(new_io){ @chunk = new_io }) - @path = new_chunk_path + begin + file_rename(@chunk, @path, new_chunk_path, ->(new_io) { @chunk = new_io }) + rescue => e + begin + file_rename(@chunk, new_chunk_path, @path, ->(new_io) { @chunk = new_io }) if File.exist?(new_chunk_path) + rescue => re + end + if re + raise "can't enqueue buffer file. This may causes inconsistent state: path = #{@path}, error = '#{e}', retry error = '#{re}'" + else + raise "can't enqueue buffer file: path = #{@path}, error = '#{e}'" + end + end + + begin + file_rename(@meta, @meta_path, new_meta_path, ->(new_io) { @meta = new_io }) + rescue => e + begin + file_rename(@chunk, new_chunk_path, @path, ->(new_io) { @chunk = new_io }) if File.exist?(new_chunk_path) + file_rename(@meta, new_meta_path, @meta_path, ->(new_io) { @meta = new_io }) if File.exist?(new_meta_path) + rescue => re + end + if re + raise "can't enqueue buffer metadata. This may causes inconsistent state: path = #{@meta_path}, error = '#{e}', retry error = '#{re}'" + else + raise "can't enqueue buffer metadata: path = #{@meta_path}, error = '#{e}'" + end + end - file_rename(@meta, @meta_path, new_meta_path, ->(new_io){ @meta = new_io }) + @path = new_chunk_path @meta_path = new_meta_path super @@ -242,14 +268,24 @@ def file_rename(file, old_path, new_path, callback=nil) def create_new_chunk(path, perm) @path = self.class.generate_stage_chunk_path(path, @unique_id) @meta_path = @path + '.meta' - @chunk = File.open(@path, 'wb+', perm) - @chunk.set_encoding(Encoding::ASCII_8BIT) - @chunk.sync = true - @chunk.binmode - @meta = File.open(@meta_path, 'wb', perm) - @meta.set_encoding(Encoding::ASCII_8BIT) - @meta.sync = true - @meta.binmode + begin + @chunk = File.open(@path, 'wb+', perm) + @chunk.set_encoding(Encoding::ASCII_8BIT) + @chunk.sync = true + @chunk.binmode + rescue => e + raise BufferOverflowError, "can't create buffer file for #{path}. Stop creating buffer files: error = #{e}" + end + begin + @meta = File.open(@meta_path, 'wb', perm) + @meta.set_encoding(Encoding::ASCII_8BIT) + @meta.sync = true + @meta.binmode + rescue => e + @chunk.close + File.unlink(@path) + raise BufferOverflowError, "can't create buffer metadata for #{path}. Stop creating buffer files: error = #{e}" + end @state = :unstaged @bytesize = 0 From c5f0d9da75f410da52551f4a1e911c1b65a26511 Mon Sep 17 00:00:00 2001 From: Masahiro Nakagawa Date: Thu, 16 Feb 2017 14:41:07 +0900 Subject: [PATCH 2/3] Add comment for implementation detail --- lib/fluent/plugin/buffer/file_chunk.rb | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/lib/fluent/plugin/buffer/file_chunk.rb b/lib/fluent/plugin/buffer/file_chunk.rb index e601ece17c..e8dec13ed8 100644 --- a/lib/fluent/plugin/buffer/file_chunk.rb +++ b/lib/fluent/plugin/buffer/file_chunk.rb @@ -112,6 +112,10 @@ def enqueued! begin file_rename(@chunk, new_chunk_path, @path, ->(new_io) { @chunk = new_io }) if File.exist?(new_chunk_path) rescue => re + # In this point, restore buffer state is hard because previous `file_rename` failed by resource problem. + # Retry is one possible approach but it may cause livelock under limited resources or high load environment. + # So we ignore such errors for now and log better message instead. + # "Too many open files" should be fixed by proper buffer configuration and system setting. end if re raise "can't enqueue buffer file. This may causes inconsistent state: path = #{@path}, error = '#{e}', retry error = '#{re}'" @@ -127,6 +131,7 @@ def enqueued! file_rename(@chunk, new_chunk_path, @path, ->(new_io) { @chunk = new_io }) if File.exist?(new_chunk_path) file_rename(@meta, new_meta_path, @meta_path, ->(new_io) { @meta = new_io }) if File.exist?(new_meta_path) rescue => re + # See above end if re raise "can't enqueue buffer metadata. This may causes inconsistent state: path = #{@meta_path}, error = '#{e}', retry error = '#{re}'" From fa29c7c3f3c2a53d39e4ca8ff8b4b969bc93c64f Mon Sep 17 00:00:00 2001 From: Masahiro Nakagawa Date: Thu, 16 Feb 2017 15:41:17 +0900 Subject: [PATCH 3/3] Apply review for using 'else' and add more comment for implementation --- lib/fluent/plugin/buffer/file_chunk.rb | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/lib/fluent/plugin/buffer/file_chunk.rb b/lib/fluent/plugin/buffer/file_chunk.rb index e8dec13ed8..004260c7a2 100644 --- a/lib/fluent/plugin/buffer/file_chunk.rb +++ b/lib/fluent/plugin/buffer/file_chunk.rb @@ -116,9 +116,7 @@ def enqueued! # Retry is one possible approach but it may cause livelock under limited resources or high load environment. # So we ignore such errors for now and log better message instead. # "Too many open files" should be fixed by proper buffer configuration and system setting. - end - if re - raise "can't enqueue buffer file. This may causes inconsistent state: path = #{@path}, error = '#{e}', retry error = '#{re}'" + raise "can't enqueue buffer file and failed to restore. This may causes inconsistent state: path = #{@path}, error = '#{e}', retry error = '#{re}'" else raise "can't enqueue buffer file: path = #{@path}, error = '#{e}'" end @@ -132,9 +130,7 @@ def enqueued! file_rename(@meta, new_meta_path, @meta_path, ->(new_io) { @meta = new_io }) if File.exist?(new_meta_path) rescue => re # See above - end - if re - raise "can't enqueue buffer metadata. This may causes inconsistent state: path = #{@meta_path}, error = '#{e}', retry error = '#{re}'" + raise "can't enqueue buffer metadata and failed to restore. This may causes inconsistent state: path = #{@meta_path}, error = '#{e}', retry error = '#{re}'" else raise "can't enqueue buffer metadata: path = #{@meta_path}, error = '#{e}'" end @@ -279,6 +275,8 @@ def create_new_chunk(path, perm) @chunk.sync = true @chunk.binmode rescue => e + # Here assumes "Too many open files" like recoverable error so raising BufferOverflowError. + # If other cases are possible, we will change erorr handling with proper classes. raise BufferOverflowError, "can't create buffer file for #{path}. Stop creating buffer files: error = #{e}" end begin @@ -287,8 +285,10 @@ def create_new_chunk(path, perm) @meta.sync = true @meta.binmode rescue => e - @chunk.close - File.unlink(@path) + # This case is easier than enqueued!. Just removing pre-create buffer file + @chunk.close rescue nil + File.unlink(@path) rescue nil + # Same as @chunk case. See above raise BufferOverflowError, "can't create buffer metadata for #{path}. Stop creating buffer files: error = #{e}" end