diff --git a/lib/html_cleaner.rb b/lib/html_cleaner.rb
index a70e19517ec..d9573b13c1e 100644
--- a/lib/html_cleaner.rb
+++ b/lib/html_cleaner.rb
@@ -34,11 +34,6 @@ def fix_bad_characters(text)
# argh, get rid of ____spacer____ inserts
text.gsub! "____spacer____", ""
- # trash a whole bunch of crappy non-printing format characters stuck
- # in most commonly by MS Word
- # \p{Cf} matches all unicode char in the "other, format" category
- text.gsub!(/\p{Cf}/u, '')
-
return text
end
diff --git a/spec/lib/html_cleaner_spec.rb b/spec/lib/html_cleaner_spec.rb
index 21d9ca47e85..0b5a93573d7 100644
--- a/spec/lib/html_cleaner_spec.rb
+++ b/spec/lib/html_cleaner_spec.rb
@@ -524,6 +524,21 @@
expect(fix_bad_characters("„‚nörmäl’—téxt‘“")).to eq("„‚nörmäl’—téxt‘“")
end
+ it "does not touch zero-width non-joiner" do
+ string = ["A".ord, 0x200C, "A".ord] # codepoints for "A", U+200C (zero-width non-joiner), "A"
+ expect(fix_bad_characters(string.pack("U*")).unpack("U*")).to eq(string)
+ end
+
+ it "does not touch zero-width joiner" do
+ string = ["A".ord, 0x200D, "A".ord] # codepoints for "A", U+200D (zero-width joiner), "A"
+ expect(fix_bad_characters(string.pack("U*")).unpack("U*")).to eq(string)
+ end
+
+ it "does not touch word joiner" do
+ string = ["A".ord, 0x2060, "A".ord] # codepoints for "A", U+2060 (word joiner), "A"
+ expect(fix_bad_characters(string.pack("U*")).unpack("U*")).to eq(string)
+ end
+
it "should remove invalid unicode chars" do
bad_string = [65, 150, 65].pack("C*") # => "A\226A"
expect(fix_bad_characters(bad_string)).to eq("AA")
@@ -540,10 +555,6 @@
it "should remove the spacer" do
expect(fix_bad_characters("A____spacer____A")).to eq("AA")
end
-
- it "should remove unicode chars in the 'other, format' category" do
- expect(fix_bad_characters("A\xE2\x81\xA0A")).to eq("AA")
- end
end
describe "add_paragraphs_to_text" do