Skip to content

Commit

Permalink
Revert "AO3-6087 Stop removing non-printable Unicode characters" (#4820)
Browse files: browse the repository at this point in the history
Revert "AO3-6087 Stop removing non-printable Unicode characters (#4798)"

This reverts commit 3e683fc.
  • Loading branch information
sarken authored May 31, 2024
1 parent 6ab37e1 commit 1c70bc0
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 15 deletions.
5 changes: 5 additions & 0 deletions lib/html_cleaner.rb
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,11 @@ def fix_bad_characters(text)
# argh, get rid of ____spacer____ inserts
text.gsub! "____spacer____", ""

# Remove the non-printing format characters that are most commonly
# inserted by MS Word.
# \p{Cf} matches every Unicode character in the "Other, Format" (Cf) category.
text.gsub!(/\p{Cf}/u, '')

return text
end

Expand Down
19 changes: 4 additions & 15 deletions spec/lib/html_cleaner_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -524,21 +524,6 @@
expect(fix_bad_characters("„‚nörmäl’—téxt‘“")).to eq("„‚nörmäl’—téxt‘“")
end

it "does not touch zero-width non-joiner" do
string = ["A".ord, 0x200C, "A".ord] # "A[zwnj]A"
expect(fix_bad_characters(string.pack("U*")).unpack("U*")).to eq(string)
end

it "does not touch zero-width joiner" do
string = ["A".ord, 0x200D, "A".ord] # "A[zwj]A"
expect(fix_bad_characters(string.pack("U*")).unpack("U*")).to eq(string)
end

it "does not touch word joiner" do
string = ["A".ord, 0x2060, "A".ord] # "A[wj]A"
expect(fix_bad_characters(string.pack("U*")).unpack("U*")).to eq(string)
end

it "should remove invalid unicode chars" do
bad_string = [65, 150, 65].pack("C*") # => "A\226A"
expect(fix_bad_characters(bad_string)).to eq("AA")
Expand All @@ -555,6 +540,10 @@
it "should remove the spacer" do
expect(fix_bad_characters("A____spacer____A")).to eq("AA")
end

it "should remove unicode chars in the 'other, format' category" do
expect(fix_bad_characters("A\xE2\x81\xA0A")).to eq("AA")
end
end

describe "add_paragraphs_to_text" do
Expand Down

0 comments on commit 1c70bc0

Please sign in to comment.