Handle overflows in wrap_optimal_fit by divide-and-conquer

The `wrap_optimal_fit` algorithm computes the penalty for a gap as `gap * gap`. If a fragment has a size near `usize::max_value()` and if the line width is small, this computation can easily overflow. When this happened, we would previously abort or unwind. Now, we instead do the computations with checked arithmetic and detect the overflow. We then proceed to wrap the first half of the fragments by themselves. If this works, we then wrap the second half. This way, we might be able to wrap everything without overflow. Should there be a single fragment which causes the overflow by itself, this fragment is put on a line by itself. When wrapping part of the fragments, we might of course end up with a partial last line. To fix this, we simply pop this line and re-wrap the fragments that were put onto this line. This ensures no “seams” in the wrapping. Fixes #247.
mgeisler · Dec 27, 2020 · bba043c · bba043c
1 parent e26ef14
commit bba043c
Show file tree

Hide file tree

Showing 4 changed files with 203 additions and 20 deletions.
diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml
@@ -10,7 +10,7 @@ edition = "2018"
 cargo-fuzz = true
 
 [dependencies]
-libfuzzer-sys = "0.3"
+libfuzzer-sys = { version = "0.3", features = ["arbitrary-derive"] }
 textwrap = { path = ".." }
 
 # Prevent this from interfering with workspaces
@@ -28,3 +28,9 @@ name = "fill_first_fit"
 path = "fuzz_targets/fill_first_fit.rs"
 test = false
 doc = false
+
+[[bin]]
+name = "wrap_optimal_fit"
+path = "fuzz_targets/wrap_optimal_fit.rs"
+test = false
+doc = false
diff --git a/fuzz/fuzz_targets/wrap_optimal_fit.rs b/fuzz/fuzz_targets/wrap_optimal_fit.rs
@@ -0,0 +1,30 @@
+#![no_main]
+use libfuzzer_sys::{arbitrary, fuzz_target};
+use textwrap::core;
+use textwrap::core::Fragment;
+
+#[derive(arbitrary::Arbitrary, Debug, Eq, PartialEq, Clone)]
+struct BoxGluePenalty(usize, usize, usize);
+
+#[rustfmt::skip]
+impl core::Fragment for BoxGluePenalty {
+    fn width(&self) -> usize { self.0 }
+    fn whitespace_width(&self) -> usize { self.1 }
+    fn penalty_width(&self) -> usize { self.2 }
+}
+
+fuzz_target!(|input: (Vec<BoxGluePenalty>, u64)| {
+    let line_width = input.1 as usize;
+    let fragments = input.0.clone();
+
+    let total_width: Option<usize> = fragments.iter().fold(Some(0), |sum, f| {
+        sum.and_then(|sum| sum.checked_add(f.width()))
+            .and_then(|sum| sum.checked_add(f.whitespace_width()))
+            .and_then(|sum| sum.checked_add(f.penalty_width()))
+    });
+    if total_width.is_none() {
+        return;
+    }
+
+    let _ = core::wrap_optimal_fit(&fragments, &|_| line_width);
+});
diff --git a/src/core.rs b/src/core.rs
@@ -31,7 +31,7 @@
 //! improving it. We would love to hear from you!
 
 use crate::{Options, WordSplitter};
-use std::cell::RefCell;
+use std::cell::{Cell, RefCell};
 use unicode_width::UnicodeWidthChar;
 
 /// The CSI or “Control Sequence Introducer” introduces an ANSI escape
@@ -538,7 +538,7 @@ impl LineNumbers {
         }
     }
 
-    fn get(&self, i: usize, minima: &[(usize, i32)]) -> usize {
+    fn get<T>(&self, i: usize, minima: &[(usize, T)]) -> usize {
         while self.line_numbers.borrow_mut().len() < i + 1 {
             let pos = self.line_numbers.borrow().len();
             let line_number = 1 + self.get(minima[pos].0, &minima);
@@ -551,7 +551,7 @@ impl LineNumbers {
 
 /// Per-line penalty. This is added for every line, which makes it
 /// expensive to output more lines than the minimum required.
-const NLINE_PENALTY: i32 = 1000;
+const NLINE_PENALTY: usize = 1000;
 
 /// Per-character cost for lines that overflow the target line width.
 ///
@@ -590,16 +590,16 @@ const NLINE_PENALTY: i32 = 1000;
 /// _and_ if it happens to overflow the line by exactly one character.
 /// If it overflows by more than one character, the overflow penalty
 /// will quickly outgrow the cost of the gap, as seen above.
-const OVERFLOW_PENALTY: i32 = 50 * 50;
+const OVERFLOW_PENALTY: usize = 50 * 50;
 
 /// The last line is short if it is less than 1/4 of the target width.
 const SHORT_LINE_FRACTION: usize = 4;
 
 /// Penalize a short last line.
-const SHORT_LAST_LINE_PENALTY: i32 = 25;
+const SHORT_LAST_LINE_PENALTY: usize = 25;
 
 /// Penalty for lines ending with a hyphen.
-const HYPHEN_PENALTY: i32 = 25;
+const HYPHEN_PENALTY: usize = 25;
 
 /// Wrap abstract fragments into lines with an optimal-fit algorithm.
 ///
@@ -675,6 +675,60 @@ pub fn wrap_optimal_fit<'a, T: Fragment, F: Fn(usize) -> usize>(
     fragments: &'a [T],
     line_widths: F,
 ) -> Vec<&'a [T]> {
+    let mut min_idx = 0;
+    let mut max_idx = fragments.len();
+
+    let mut result = Vec::new();
+
+    // We call wrap_optimal_fit_checked on smaller and smaller slices
+    // until we either end up with a single fragment or we find a
+    // slice which can be wrapped without overflow. In either case, we
+    // advance min_idx which ensures that we make progress.
+    loop {
+        match wrap_optimal_fit_checked(&fragments[min_idx..max_idx], &line_widths) {
+            Some(lines) => {
+                let partial_last_line = lines.len() > 1;
+                result.extend(lines);
+                if max_idx == fragments.len() {
+                    return result; // All done!
+                }
+
+                min_idx = max_idx;
+                max_idx = fragments.len();
+
+                // We assume that the last wrapped line is incomplete
+                // and needs to be re-wrapped.
+                if partial_last_line {
+                    let last_line = result.pop().unwrap();
+                    min_idx -= last_line.len();
+                }
+            }
+            None => {
+                if max_idx - min_idx == 1 {
+                    // This single fragment is causing an overflow, so
+                    // we put it on its own line.
+                    result.push(&fragments[min_idx..max_idx]);
+                    if max_idx == fragments.len() {
+                        return result; // All done!
+                    }
+
+                    min_idx = max_idx;
+                    max_idx = fragments.len();
+                } else {
+                    max_idx = min_idx + (max_idx - min_idx) / 2;
+                }
+            }
+        }
+    }
+}
+
+/// Wrap abstract fragments into lines with an optimal-fit algorithm.
+/// Returns `None` if an overflow occurs during the penalty
+/// computations. See [`wrap_optimal_fit`].
+fn wrap_optimal_fit_checked<'a, T: Fragment, F: Fn(usize) -> usize>(
+    fragments: &'a [T],
+    line_widths: F,
+) -> Option<Vec<&'a [T]>> {
     let mut widths = Vec::with_capacity(fragments.len() + 1);
     let mut width = 0;
     widths.push(width);
@@ -683,53 +737,76 @@ pub fn wrap_optimal_fit<'a, T: Fragment, F: Fn(usize) -> usize>(
         widths.push(width);
     }
 
+    if widths.last() < Some(&line_widths(0)) {
+        return Some(vec![fragments]);
+    }
+
     let line_numbers = LineNumbers::new(fragments.len());
+    let detected_overflow = Cell::new(false);
 
-    let minima = smawk::online_column_minima(0, widths.len(), |minima, i, j| {
+    let cost_fn = |minima: &[(usize, usize)], i, j| -> Option<usize> {
         // Line number for fragment `i`.
         let line_number = line_numbers.get(i, &minima);
         let target_width = std::cmp::max(1, line_widths(line_number));
 
         // Compute the width of a line spanning fragments[i..j] in
         // constant time. We need to adjust widths[j] by subtracting
         // the whitespace of fragment[j-i] and then add the penalty.
-        let line_width = widths[j] - widths[i] - fragments[j - 1].whitespace_width()
-            + fragments[j - 1].penalty_width();
+        let last_fragment: &T = &fragments[j - 1];
+        let line_width = widths[j] - widths[i] - last_fragment.whitespace_width()
+            + last_fragment.penalty_width();
 
         // We compute cost of the line containing fragments[i..j]. We
         // start with values[i].1, which is the optimal cost for
         // breaking before fragments[i].
         //
         // First, every extra line cost NLINE_PENALTY.
-        let mut cost = minima[i].1 + NLINE_PENALTY;
+        let mut cost = minima[i].1.checked_add(NLINE_PENALTY)?;
 
         // Next, we add a penalty depending on the line length.
         if line_width > target_width {
             // Lines that overflow get a hefty penalty.
-            let overflow = (line_width - target_width) as i32;
-            cost += overflow * OVERFLOW_PENALTY;
+            let overflow: usize = line_width - target_width;
+            cost = cost.checked_add(overflow.checked_mul(OVERFLOW_PENALTY)?)?;
         } else if j < fragments.len() {
             // Other lines (except for the last line) get a milder
             // penalty which depend on the size of the gap.
-            let gap = (target_width - line_width) as i32;
-            cost += gap * gap;
+            let gap: usize = target_width - line_width;
+            cost = cost.checked_add(gap.checked_mul(gap)?)?;
         } else if i + 1 == j && line_width < target_width / SHORT_LINE_FRACTION {
             // The last line can have any size gap, but we do add a
             // penalty if the line is very short (typically because it
             // contains just a single word).
-            cost += SHORT_LAST_LINE_PENALTY;
+            cost = cost.checked_add(SHORT_LAST_LINE_PENALTY)?;
         }
 
         // Finally, we discourage hyphens.
         if fragments[j - 1].penalty_width() > 0 {
             // TODO: this should use a penalty value from the fragment
             // instead.
-            cost += HYPHEN_PENALTY;
+            cost = cost.checked_add(HYPHEN_PENALTY)?;
         }
 
-        cost
+        Some(cost)
+    };
+
+    let minima = smawk::online_column_minima(0, widths.len(), |minima: &[(usize, usize)], i, j| {
+        if detected_overflow.get() {
+            return 0;
+        }
+        match cost_fn(minima, i, j) {
+            Some(cost) => cost,
+            None => {
+                detected_overflow.set(true);
+                0
+            }
+        }
     });
 
+    if detected_overflow.into_inner() {
+        return None;
+    }
+
     let mut lines = Vec::with_capacity(line_numbers.get(fragments.len(), &minima));
     let mut pos = fragments.len();
     loop {
@@ -742,7 +819,7 @@ pub fn wrap_optimal_fit<'a, T: Fragment, F: Fn(usize) -> usize>(
     }
 
     lines.reverse();
-    lines
+    Some(lines)
 }
 
 #[cfg(test)]
@@ -927,4 +1004,69 @@ mod tests {
             ]
         );
     }
+
+    #[derive(Debug, Eq, PartialEq)]
+    struct BoxGluePenalty(usize);
+
+    #[rustfmt::skip]
+    impl Fragment for BoxGluePenalty {
+        fn width(&self) -> usize { self.0 }
+        fn whitespace_width(&self) -> usize { 1 }
+        fn penalty_width(&self) -> usize { 0 }
+    }
+
+    #[test]
+    fn optimal_fit_single_fragment_overflow() {
+        let fragments = vec![BoxGluePenalty(2 << 60)];
+        let line_widths = |_| 80;
+
+        assert_eq!(wrap_optimal_fit_checked(&fragments, &line_widths), None);
+        assert_eq!(
+            wrap_optimal_fit(&fragments, &line_widths),
+            vec![[BoxGluePenalty(2 << 60)]]
+        );
+    }
+
+    #[test]
+    fn optimal_fit_rewrapping_on_overflow() {
+        let fragments = vec![
+            BoxGluePenalty(1001),
+            BoxGluePenalty(1002),
+            BoxGluePenalty(1003),
+            BoxGluePenalty(1004),
+            BoxGluePenalty(105),     // small fragment
+            BoxGluePenalty(2 << 60), // over-sized fragment
+            BoxGluePenalty(1007),
+            BoxGluePenalty(1008),
+        ];
+        let line_widths = |_| 2500; // Room for two big fragments.
+
+        assert_eq!(wrap_optimal_fit_checked(&fragments, &line_widths), None);
+        // First five fragments fit on two lines and the small 105
+        // fragment is included on the second line:
+        assert_eq!(
+            wrap_optimal_fit_checked(&fragments[..5], &line_widths).unwrap(),
+            vec![
+                vec![BoxGluePenalty(1001), BoxGluePenalty(1002)],
+                vec![
+                    BoxGluePenalty(1003),
+                    BoxGluePenalty(1004),
+                    BoxGluePenalty(105)
+                ]
+            ]
+        );
+        assert_eq!(
+            wrap_optimal_fit(&fragments, &line_widths),
+            vec![
+                vec![BoxGluePenalty(1001), BoxGluePenalty(1002)],
+                vec![
+                    BoxGluePenalty(1003),
+                    BoxGluePenalty(1004),
+                    BoxGluePenalty(105),
+                ],
+                vec![BoxGluePenalty(2 << 60)],
+                vec![BoxGluePenalty(1007), BoxGluePenalty(1008)]
+            ]
+        );
+    }
 }
diff --git a/src/lib.rs b/src/lib.rs
@@ -850,10 +850,15 @@ mod tests {
     }
 
     #[test]
-    fn max_width() {
+    fn max_width_usize() {
         assert_eq!(wrap("foo bar", usize::max_value()), vec!["foo bar"]);
     }
 
+    #[test]
+    fn max_width_usize_issue_247() {
+        assert_eq!(wrap("x y", 515566821223), vec!["x y"]);
+    }
+
     #[test]
     fn leading_whitespace() {
         assert_eq!(wrap("  foo bar", 6), vec!["  foo", "bar"]);