diff --git a/lib/html_cleaner.rb b/lib/html_cleaner.rb index a70e19517ec..d9573b13c1e 100644 --- a/lib/html_cleaner.rb +++ b/lib/html_cleaner.rb @@ -34,11 +34,6 @@ def fix_bad_characters(text) # argh, get rid of ____spacer____ inserts text.gsub! "____spacer____", "" - # trash a whole bunch of crappy non-printing format characters stuck - # in most commonly by MS Word - # \p{Cf} matches all unicode char in the "other, format" category - text.gsub!(/\p{Cf}/u, '') - return text end diff --git a/spec/lib/html_cleaner_spec.rb b/spec/lib/html_cleaner_spec.rb index 21d9ca47e85..0b5a93573d7 100644 --- a/spec/lib/html_cleaner_spec.rb +++ b/spec/lib/html_cleaner_spec.rb @@ -524,6 +524,21 @@ expect(fix_bad_characters("„‚nörmäl’—téxt‘“")).to eq("„‚nörmäl’—téxt‘“") end + it "does not touch zero-width non-joiner" do + string = ["A".ord, 0x200C, "A".ord] # "A[zwnj]A" + expect(fix_bad_characters(string.pack("U*")).unpack("U*")).to eq(string) + end + + it "does not touch zero-width joiner" do + string = ["A".ord, 0x200D, "A".ord] # "A[zwj]A" + expect(fix_bad_characters(string.pack("U*")).unpack("U*")).to eq(string) + end + + it "does not touch word joiner" do + string = ["A".ord, 0x2060, "A".ord] # "A[wj]A" + expect(fix_bad_characters(string.pack("U*")).unpack("U*")).to eq(string) + end + it "should remove invalid unicode chars" do bad_string = [65, 150, 65].pack("C*") # => "A\226A" expect(fix_bad_characters(bad_string)).to eq("AA") @@ -540,10 +555,6 @@ it "should remove the spacer" do expect(fix_bad_characters("A____spacer____A")).to eq("AA") end - - it "should remove unicode chars in the 'other, format' category" do - expect(fix_bad_characters("A\xE2\x81\xA0A")).to eq("AA") - end end describe "add_paragraphs_to_text" do