From 2b80dd13ed65f6a35e366762bfeadee7ec8a8c3b Mon Sep 17 00:00:00 2001 From: foglabs Date: Tue, 19 Apr 2022 13:04:05 -0400 Subject: [PATCH 1/3] 'useless html attribute scrub step from htmlscrubber' --- lib/html_scrubber.rb | 4 ---- 1 file changed, 4 deletions(-) diff --git a/lib/html_scrubber.rb b/lib/html_scrubber.rb index 0e939373fe..e70e8210d9 100644 --- a/lib/html_scrubber.rb +++ b/lib/html_scrubber.rb @@ -17,10 +17,6 @@ def self.scrub(dirty) .split("\n").map(&:strip).join("\n").strip # strip at ends of lines, then replace consecutive whitespace w/ 1 space - if dirtay =~ /\/\w+/ - # Angle-brackets stripped, so be more aggressive - dirtay = dirtay.gsub(/\w+=\S+/, ' ') - end dirtay = dirtay.gsub(/[ \t]+/, ' ') .gsub(/\n( ?\n)+/, "\n") From 8f334d409d4a1f18ddc08fe9d8a94b7a17813e71 Mon Sep 17 00:00:00 2001 From: foglabs Date: Tue, 19 Apr 2022 13:16:07 -0400 Subject: [PATCH 2/3] 'removing live chat scrub test because html elements will not be allowed to ingest into descriptions in the first place due to xsd validation' --- spec/lib/html_scrubber_spec.rb | 5 ----- 1 file changed, 5 deletions(-) diff --git a/spec/lib/html_scrubber_spec.rb b/spec/lib/html_scrubber_spec.rb index 8f4ecad7a2..bbaf134432 100644 --- a/spec/lib/html_scrubber_spec.rb +++ b/spec/lib/html_scrubber_spec.rb @@ -127,9 +127,4 @@ 'span style=font-family: \'times new roman\', times span style=font-size: 18pt a http://wgcu.org/yourvoiceshow/home.html target=_self YOUR VOICE / /span //span WGCU Public Media\'s initiative, em Your Voice/em , examines issues affecting Southwest Florida. ' )).to eq "span 'times new roman', times span a http://wgcu.org/yourvoiceshow/home.html YOUR VOICE / /span //span WGCU Public Media's initiative, em Your Voice/em , examines issues affecting Southwest Florida." end - it 'cleans up "live chat"' do - expect(HtmlScrubber.scrub( - '!-- START LIVE CHAT MODULE -- br iframe src=http://www.coveritlive.com/index2.php/option=com_altcaster/task=viewaltcast/altcast_code=4947f0148c/height=500/width=320 mce_src=http://www.coveritlive.com/index2.php/option=com_altcaster/task=viewaltcast/altcast_code=4947f0148c/height=500/width=320 scrolling=no width=320 frameborder=0 height=500 amp amp amp amp' - )).to eq '!-- START LIVE CHAT MODULE -- br iframe amp amp amp amp' - end end From cca14acdca6ebece8929c0cfece46a702d97e863 Mon Sep 17 00:00:00 2001 From: foglabs Date: Tue, 19 Apr 2022 13:31:46 -0400 Subject: [PATCH 3/3] 'removing other htmlscrubber tests where positive test case doesnt actually preserve content or are unapplicable due to blocking of html elements at xsd validation' --- spec/lib/html_scrubber_spec.rb | 57 ---------------------------------- 1 file changed, 57 deletions(-) diff --git a/spec/lib/html_scrubber_spec.rb b/spec/lib/html_scrubber_spec.rb index bbaf134432..3684aabba1 100644 --- a/spec/lib/html_scrubber_spec.rb +++ b/spec/lib/html_scrubber_spec.rb @@ -31,26 +31,6 @@ 'Waterbury rejects town, village merger' ].join("\n") end - it 'tries to fix html with angle-brackets removed' do - expect(HtmlScrubber.scrub( - <<-EOF - em img style=margin-right: 10px float: left src=images/stories/earth.jpg - alt=earth width=250 height=90 Earth Edition /em focuses on diverse and - unique natural world of Southwest Florida. ... Produced from 2003 to 2006, - the programs received Emmy nominations and won Telly awards. - a http://video.wgcu.org/program/1354335502 img src=images/stories/watchbutton.gif - alt=watchbutton2 width=75 height=26 / /??/ - EOF - )).to eq [ - # TODO: Want this to be cleaner. - 'em', - 'Earth Edition /em focuses on diverse and', - 'unique natural world of Southwest Florida. ... Produced from 2003 to 2006,', - 'the programs received Emmy nominations and won Telly awards.', - 'a http://video.wgcu.org/program/1354335502', - '/ /??/' - ].join("\n") - end it 'leaves name slashes in place' do expect(HtmlScrubber.scrub('Stunk/White slash fiction')).to eq 'Stunk/White slash fiction' end @@ -71,23 +51,6 @@ "Michigan Public Radio's Rachel Lippmann reports." ].join("\n") end - it 'tries to clean up style fragments' do - expect(HtmlScrubber.scrub( - <<-EOF - a http://www.pbs.org/wnet/need-to-know/ target=_blank;span style=font-size: 12pt;;em;Need to - Know/em/ ;img style=margin-right: 10px; float: left; src=images/stories/tv/needtoknow.png - alt=needtoknow width=250 height=90 ; Need to Know is the PBS TV- and web- newsmagazine - that gives you what you need to know along with a healthy dose of insight, perspective - and wit. - EOF - )).to eq [ - 'a http://www.pbs.org/wnet/need-to-know/ ;;em;Need to', - 'Know/em/ ;', - 'Need to Know is the PBS TV- and web- newsmagazine', - 'that gives you what you need to know along with a healthy dose of insight, perspective', - 'and wit.' - ].join("\n") - end it 'handles MS Word XML' do expect(HtmlScrubber.scrub( <<-EOF @@ -107,24 +70,4 @@ '(Host) Vermont veterans who\'ve returned from deployment to Afghanistan only to find the job market scarce would get a leg up on finding work under a bill sponsored by Congressman Peter Welch.' ].join("\n") end - it 'handles nmap weirdness' do - expect(HtmlScrubber.scrub( - 'Heres some of what we heard. ; ;{nmap}normal|250|80|images/stories/audio/news/FS2089.mp3|||a{/nmap} a mce_http://wgcu.org/blogs/news/workjanuary2011%20073.jpg http://wgcu.org/blogs/news/workjanuary2011%20073.jpg; img width=350 border=0 mce_src=http://wgcu.org/blogs/news/workjanuary2011%20073.jpg src=http://wgcu.org/blogs/news/workjanuary2011%20073.jpg ;/ ;' - )).to eq 'Heres some of what we heard. a mce_http://wgcu.org/blogs/news/workjanuary2011%20073.jpg http://wgcu.org/blogs/news/workjanuary2011%20073.jpg;' - end - it 'handles more nmap weirdness' do - expect(HtmlScrubber.scrub( - '{nmap}popup|250|40|images/stories/audio/gulfcoastlive/GL012612.mp3|1||||1|{/nmap} ; ; a http://firesigntheatre.com/media/media.php?member=all target=_blank;Nick Danger: Third Eye"/ a parody of the 1940s radio detective shows originally written and performed by a http://firesigntheatre.com/index.php target=_blank;The Firesign Theatre/ in 1969 comes to Sarasota. This bit of theatrical history is the first ever fully dramatized presentation of Nick Danger. It will be performed at the a http://www.annamariaisland-longboatkey.com/crosley-estate/ target=_blank;Powel Crosley Estate/ .' - )).to eq 'a http://firesigntheatre.com/media/media.php? Danger: Third Eye"/ a parody of the 1940s radio detective shows originally written and performed by a http://firesigntheatre.com/index.php Firesign Theatre/ in 1969 comes to Sarasota. This bit of theatrical history is the first ever fully dramatized presentation of Nick Danger. It will be performed at the a http://www.annamariaisland-longboatkey.com/crosley-estate/ Crosley Estate/ .' - end - it 'handles "??" weirdness' do - expect(HtmlScrubber.scrub( - 'enjoy an active and exciting life in Southwest Florida. ??table border=0 cellpadding=10 ??tbody ??tr ??td img src=images/stories/Connect/arts.jpg alt=arts width=205 height=115 /td' - )).to eq 'enjoy an active and exciting life in Southwest Florida. /td' - end - it 'muddles through' do - expect(HtmlScrubber.scrub( - 'span style=font-family: \'times new roman\', times span style=font-size: 18pt a http://wgcu.org/yourvoiceshow/home.html target=_self YOUR VOICE / /span //span WGCU Public Media\'s initiative, em Your Voice/em , examines issues affecting Southwest Florida. ' - )).to eq "span 'times new roman', times span a http://wgcu.org/yourvoiceshow/home.html YOUR VOICE / /span //span WGCU Public Media's initiative, em Your Voice/em , examines issues affecting Southwest Florida." - end end