-
Notifications
You must be signed in to change notification settings - Fork 13k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Auto merge of #99487 - bmacnaughton:is_whitespace_updates, r=thomcc
is_whitespace() performance improvements This is my first rust PR, so if I miss anything obvious please let me know and I'll do my best to fix it. This was a bit more of a challenge than I realized because, while I made working code locally and tested it against the native `is_whitespace()`, this PR required changing `src/tools/unicode-table-generator`, the code that generated the code. I have benchmarked this locally, using criterion, and have seen meaningful performance improvements. I can add those outputs to this if you'd like, but am guessing that the perf run that `@fmease` recommended is what's needed. I have run ` ./x.py test --stage 0 library/std` after building it locally after executing `./x.py build library`. I didn't try to build the whole compiler, but maybe I should have - any guidance would be appreciated. If this general approach makes sense, I'll take a look at some other candidate categories, e.g., `Cc`, in the future. Oh, and I wasn't sure whether the generated code should be included in this PR or not. I did include it.
- Loading branch information
Showing
4 changed files
with
113 additions
and
12 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
use crate::fmt_list; | ||
use crate::raw_emitter::RawEmitter; | ||
use std::collections::HashMap; | ||
use std::fmt::Write as _; | ||
use std::ops::Range; | ||
|
||
impl RawEmitter { | ||
pub fn emit_cascading_map(&mut self, ranges: &[Range<u32>]) -> bool { | ||
let mut map: [u8; 256] = [ | ||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
]; | ||
|
||
let points = ranges | ||
.iter() | ||
.flat_map(|r| (r.start..r.end).into_iter().collect::<Vec<u32>>()) | ||
.collect::<Vec<u32>>(); | ||
|
||
println!("there are {} points", points.len()); | ||
|
||
// how many distinct ranges need to be counted? | ||
let mut codepoints_by_high_bytes = HashMap::<usize, Vec<u32>>::new(); | ||
for point in points { | ||
// assert that there is no whitespace over the 0x3000 range. | ||
assert!(point <= 0x3000, "the highest unicode whitespace value has changed"); | ||
let high_bytes = point as usize >> 8; | ||
let codepoints = codepoints_by_high_bytes.entry(high_bytes).or_insert_with(Vec::new); | ||
codepoints.push(point); | ||
} | ||
|
||
let mut bit_for_high_byte = 1u8; | ||
let mut arms = Vec::<String>::new(); | ||
|
||
let mut high_bytes: Vec<usize> = | ||
codepoints_by_high_bytes.keys().map(|k| k.clone()).collect(); | ||
high_bytes.sort(); | ||
for high_byte in high_bytes { | ||
let codepoints = codepoints_by_high_bytes.get_mut(&high_byte).unwrap(); | ||
if codepoints.len() == 1 { | ||
let ch = codepoints.pop().unwrap(); | ||
arms.push(format!("{} => c as u32 == {:#04x}", high_byte, ch)); | ||
continue; | ||
} | ||
// more than 1 codepoint in this arm | ||
for codepoint in codepoints { | ||
map[(*codepoint & 0xff) as usize] |= bit_for_high_byte; | ||
} | ||
arms.push(format!( | ||
"{} => WHITESPACE_MAP[c as usize & 0xff] & {} != 0", | ||
high_byte, bit_for_high_byte | ||
)); | ||
bit_for_high_byte <<= 1; | ||
} | ||
|
||
writeln!(&mut self.file, "static WHITESPACE_MAP: [u8; 256] = [{}];", fmt_list(map.iter())) | ||
.unwrap(); | ||
self.bytes_used += 256; | ||
|
||
writeln!(&mut self.file, "#[inline]").unwrap(); | ||
writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap(); | ||
writeln!(&mut self.file, " match c as u32 >> 8 {{").unwrap(); | ||
for arm in arms { | ||
writeln!(&mut self.file, " {},", arm).unwrap(); | ||
} | ||
writeln!(&mut self.file, " _ => false,").unwrap(); | ||
writeln!(&mut self.file, " }}").unwrap(); | ||
writeln!(&mut self.file, "}}").unwrap(); | ||
|
||
true | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters