-
Notifications
You must be signed in to change notification settings - Fork 30.2k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
http: simplify checkIsHttpToken() #17399
Conversation
Benchmark results: improvement confidence p.value
http/check_is_http_token.js n=1000000 key=":" -67.14 % *** 4.065668e-34
http/check_is_http_token.js n=1000000 key=":alternate-protocol" -54.61 % *** 5.705110e-28
http/check_is_http_token.js n=1000000 key="((((())))" -66.34 % *** 2.636118e-33
http/check_is_http_token.js n=1000000 key="@@" -67.24 % *** 2.329542e-28
http/check_is_http_token.js n=1000000 key="Accept-Ranges" 146.79 % *** 2.298935e-45
http/check_is_http_token.js n=1000000 key="alt-svc" -3.79 % *** 3.200379e-04
http/check_is_http_token.js n=1000000 key="alternate-protocol:" 150.80 % *** 1.859951e-41
http/check_is_http_token.js n=1000000 key="alternate-protocol" 239.27 % *** 2.887460e-42
http/check_is_http_token.js n=1000000 key="Cache-Control" 148.05 % *** 1.588821e-41
http/check_is_http_token.js n=1000000 key="Connection" 38.53 % *** 9.847708e-29
http/check_is_http_token.js n=1000000 key="Content-Encoding" 205.91 % *** 1.416987e-37
http/check_is_http_token.js n=1000000 key="content-length" 157.29 % *** 1.049744e-37
http/check_is_http_token.js n=1000000 key="Content-Location" 203.87 % *** 1.005885e-38
http/check_is_http_token.js n=1000000 key="content-type" 60.62 % *** 1.051226e-40
http/check_is_http_token.js n=1000000 key="Content-Type" 61.86 % *** 1.993077e-30
http/check_is_http_token.js n=1000000 key="date" -58.40 % *** 4.130459e-34
http/check_is_http_token.js n=1000000 key="ETag" -58.21 % *** 2.484249e-43
http/check_is_http_token.js n=1000000 key="Expires" -1.77 % 8.884764e-02
http/check_is_http_token.js n=1000000 key="Keep-Alive" 38.08 % *** 3.190244e-35
http/check_is_http_token.js n=1000000 key="Last-Modified" 150.14 % *** 4.677389e-55
http/check_is_http_token.js n=1000000 key="location" 12.19 % *** 1.952609e-14
http/check_is_http_token.js n=1000000 key="server" -18.56 % *** 6.947429e-20
http/check_is_http_token.js n=1000000 key="Server" -18.91 % *** 1.729007e-19
http/check_is_http_token.js n=1000000 key="status" -19.17 % *** 4.218003e-27
http/check_is_http_token.js n=1000000 key="TCN" -61.80 % *** 1.219346e-23
http/check_is_http_token.js n=1000000 key="Transfer-Encoding" 223.55 % *** 1.359534e-40
http/check_is_http_token.js n=1000000 key="Vary" -59.13 % *** 3.830565e-45
http/check_is_http_token.js n=1000000 key="version" -6.03 % *** 2.214569e-04
http/check_is_http_token.js n=1000000 key="x-frame-options" 164.26 % *** 1.243066e-30
http/check_is_http_token.js n=1000000 key="x-xss-protection" 204.88 % *** 1.000083e-35
http/check_is_http_token.js n=1000000 key="中文呢" -31.09 % *** 6.800282e-23 |
Benchmark summary seems to be:
I'm tempted to log arguments sent to |
@nodejs/v8 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM for readability.
I think I'd prefer a hybrid + expanded loop unrolling solution. The expanded unrolling would cover the most common http headers (based on headers listed on Wikipedia for example and also the list we use in _http_incoming.js when converting headers to lowercase) and the regexp would be used for larger, less common header names. We could dynamically generate the function to avoid the lengthy function source code, it performs the same as the inline version: const validTokens = [
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0 - 15
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 16 - 31
0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, // 32 - 47
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, // 48 - 63
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 64 - 79
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, // 80 - 95
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 96 - 111
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, // 112 - 127
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 128 ...
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // ... 255
];
// Matches a non-empty string made up solely of the characters in the class
// below (ASCII letters, digits and the listed punctuation). It is used by
// checkIsHttpToken() for values longer than 19 characters.
const tokenRegExp = /^[\^_`a-zA-Z\-0-9!#$%&'*+.|~]+$/;
/**
 * Checks whether `val` consists only of characters flagged as valid in the
 * `validTokens` lookup table (indexed by UTF-16 code unit).
 *
 * Values of up to 19 characters are checked by a hand-unrolled sequence of
 * table lookups; longer values are delegated to `tokenRegExp`.
 *
 * An empty string yields `false`: `charCodeAt(0)` is NaN, so the table lookup
 * is `undefined`. Code units above 255 fall outside the table and are
 * rejected the same way.
 *
 * @param {string} val Candidate token. No type check is performed here, so
 *   the caller is expected to pass a string.
 * @returns {boolean} `true` if every character is allowed, otherwise `false`.
 */
function checkIsHttpToken(val) {
// Long values: let the regexp scan the whole string. The unrolled checks
// below only cover lengths 0 through 19.
if (val.length > 19) return tokenRegExp.test(val);
// Each pair of lines below validates one character and then returns early
// if that character was the last one in the string.
if (!validTokens[val.charCodeAt(0)]) return false;
if (val.length < 2) return true;
if (!validTokens[val.charCodeAt(1)]) return false;
if (val.length < 3) return true;
if (!validTokens[val.charCodeAt(2)]) return false;
if (val.length < 4) return true;
if (!validTokens[val.charCodeAt(3)]) return false;
if (val.length < 5) return true;
if (!validTokens[val.charCodeAt(4)]) return false;
if (val.length < 6) return true;
if (!validTokens[val.charCodeAt(5)]) return false;
if (val.length < 7) return true;
if (!validTokens[val.charCodeAt(6)]) return false;
if (val.length < 8) return true;
if (!validTokens[val.charCodeAt(7)]) return false;
if (val.length < 9) return true;
if (!validTokens[val.charCodeAt(8)]) return false;
if (val.length < 10) return true;
if (!validTokens[val.charCodeAt(9)]) return false;
if (val.length < 11) return true;
if (!validTokens[val.charCodeAt(10)]) return false;
if (val.length < 12) return true;
if (!validTokens[val.charCodeAt(11)]) return false;
if (val.length < 13) return true;
if (!validTokens[val.charCodeAt(12)]) return false;
if (val.length < 14) return true;
if (!validTokens[val.charCodeAt(13)]) return false;
if (val.length < 15) return true;
if (!validTokens[val.charCodeAt(14)]) return false;
if (val.length < 16) return true;
if (!validTokens[val.charCodeAt(15)]) return false;
if (val.length < 17) return true;
if (!validTokens[val.charCodeAt(16)]) return false;
if (val.length < 18) return true;
if (!validTokens[val.charCodeAt(17)]) return false;
if (val.length < 19) return true;
if (!validTokens[val.charCodeAt(18)]) return false;
// Only reached for a 19-character string whose characters all passed.
return true;
} Benchmarking the various solutions with
|
It would be good to get some real world benchmarking data; but lacking that, my preference would be for readability. |
Ignore my — now deleted — post re: for loop, testing on the wrong V8 version... It seems like the version @mscdex proposed is currently the best (albeit ugly) it gets. |
@mscdex or anyone else: Do you have an explanation as to why your benchmarking shows the hybrid+expanded unroll faster for a 19-character value than this PR? I'm mystified as to how that could be and TBH it's making me look at those timings with a bit of side-eye.... |
@Trott because a regexp (or just a loop for that matter) has a lot more overhead than a series of |
@mscdex But for a 19-character string, your unrolled version uses a regexp too. So how is it faster for a 19-character string? Something isn't right... |
@Trott it's |
Ah! Off-by-one error in my brain. |
@Trott It looks like the regexp code is faster at processing 8 or more characters. By 'processing' I mean how many characters it actually looks at, not how long the input string is, e.g. it bails out on the first character of ":alternate-protocol" and so is slower than the handwritten JS code. It looks like every string under 7 characters in length regresses. My guess is that that is caused by the overhead of calling into the regexp builtin or some setup/initialization/allocation that we have to do for each match. I'm not an expert on regexps in v8 though. The manual loop unrolling is concerning - this should definitely not be necessary to do by hand. I like this code much better just based on readability. Parsing time will also be lower which is nice. Inlining decisions are more complicated than just the pure length of the function now - I think we have a higher budget for extremely small functions, so this could potentially help there too. One more thing to think about - is this function even run as optimized code on a server? i.e. is it actually called enough times with reasonably stable input types? I don't have any intuition there. The microbenchmarks probably aren't stressing this code in the same way a real server would. I'd suggest the following:
|
I'm pretty in favour of landing this very soon (with the caveat we don't use it for v8.x or lower). @mscdex would you still like this to use unrolled checks? Could you make your request/objection more explicit if so, as otherwise this will end up landing eventually given the 3 approvals (incl 2 from TSC). |
@apapirovski to be honest a lot of the performance suggestions I make are too much for most people, so just take the suggestions/benchmark results I posted as some food for thought. |
lib/_http_common.js
Outdated
/** | ||
* Verifies that the given val is a valid HTTP token | ||
* per the rules defined in RFC 7230 | ||
* See https://tools.ietf.org/html/rfc7230#section-3.2.6 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would like to keep this comment. When looking through the code it is good to have a reference handy.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@BridgeAR Restored.
Replace code optimized for older versions of V8 with more straightforward code in checkIsHttpToken().
Landed in 9f55eac 🎉 |
Replace code optimized for older versions of V8 with more straightforward code in checkIsHttpToken(). PR-URL: #17399 Reviewed-By: Colin Ihrig <cjihrig@gmail.com> Reviewed-By: Anna Henningsen <anna@addaleax.net> Reviewed-By: Anatoli Papirovski <apapirovski@mac.com> Reviewed-By: Timothy Gu <timothygu99@gmail.com>
Any volunteers to open the two issues for V8 as suggested by @psmarshall? I'll do it if no one else does, but I feel like someone who better understands V8 and benchmarking would do a better job... @nodejs/v8 |
Great job! 👍 |
Replace code optimized for older versions of V8 with more straightforward code in checkIsHttpToken(). PR-URL: #17399 Reviewed-By: Colin Ihrig <cjihrig@gmail.com> Reviewed-By: Anna Henningsen <anna@addaleax.net> Reviewed-By: Anatoli Papirovski <apapirovski@mac.com> Reviewed-By: Timothy Gu <timothygu99@gmail.com>
Replace code optimized for older versions of V8 with more straightforward code in checkIsHttpToken(). PR-URL: #17399 Reviewed-By: Colin Ihrig <cjihrig@gmail.com> Reviewed-By: Anna Henningsen <anna@addaleax.net> Reviewed-By: Anatoli Papirovski <apapirovski@mac.com> Reviewed-By: Timothy Gu <timothygu99@gmail.com>
@Trott Please go ahead and file those issues. You’ve got this! 👍 |
V8 issues opened at https://bugs.chromium.org/p/v8/issues/detail?id=7200 and https://bugs.chromium.org/p/v8/issues/detail?id=7201. |
In the spirit of [17399](nodejs#17399), we can also simplify checkInvalidHeaderChar to use regex matching instead of a loop. This makes it faster on long matches and slower on short matches or non-matches. This change also includes some sample data from an AcmeAir benchmark run, as a rough proxy for real-world data.
In the spirit of [17399](nodejs#17399), we can also simplify checkInvalidHeaderChar to use regex matching instead of a loop. This makes it faster on long matches and slower on short matches or non-matches. This change also includes some sample data from an AcmeAir benchmark run, as a rough proxy for real-world data. PR-URL: nodejs#18381 Reviewed-By: Ruben Bridgewater <ruben@bridgewater.de> Reviewed-By: Matteo Collina <matteo.collina@gmail.com> Reviewed-By: Joyee Cheung <joyeec9h3@gmail.com> Reviewed-By: Benedikt Meurer <benedikt.meurer@gmail.com> Reviewed-By: Tiancheng "Timothy" Gu <timothygu99@gmail.com>
In the spirit of [17399](#17399), we can also simplify checkInvalidHeaderChar to use regex matching instead of a loop. This makes it faster on long matches and slower on short matches or non-matches. This change also includes some sample data from an AcmeAir benchmark run, as a rough proxy for real-world data. PR-URL: #18381 Reviewed-By: Ruben Bridgewater <ruben@bridgewater.de> Reviewed-By: Matteo Collina <matteo.collina@gmail.com> Reviewed-By: Joyee Cheung <joyeec9h3@gmail.com> Reviewed-By: Benedikt Meurer <benedikt.meurer@gmail.com> Reviewed-By: Tiancheng "Timothy" Gu <timothygu99@gmail.com>
In the spirit of [17399](#17399), we can also simplify checkInvalidHeaderChar to use regex matching instead of a loop. This makes it faster on long matches and slower on short matches or non-matches. This change also includes some sample data from an AcmeAir benchmark run, as a rough proxy for real-world data. PR-URL: #18381 Reviewed-By: Ruben Bridgewater <ruben@bridgewater.de> Reviewed-By: Matteo Collina <matteo.collina@gmail.com> Reviewed-By: Joyee Cheung <joyeec9h3@gmail.com> Reviewed-By: Benedikt Meurer <benedikt.meurer@gmail.com> Reviewed-By: Tiancheng "Timothy" Gu <timothygu99@gmail.com>
In the spirit of [17399](nodejs#17399), we can also simplify checkInvalidHeaderChar to use regex matching instead of a loop. This makes it faster on long matches and slower on short matches or non-matches. This change also includes some sample data from an AcmeAir benchmark run, as a rough proxy for real-world data. PR-URL: nodejs#18381 Reviewed-By: Ruben Bridgewater <ruben@bridgewater.de> Reviewed-By: Matteo Collina <matteo.collina@gmail.com> Reviewed-By: Joyee Cheung <joyeec9h3@gmail.com> Reviewed-By: Benedikt Meurer <benedikt.meurer@gmail.com> Reviewed-By: Tiancheng "Timothy" Gu <timothygu99@gmail.com>
Replace code optimized for older versions of V8 with more
straightforward code in checkIsHttpToken().
Checklist
make -j4 test
(UNIX), or vcbuild test
(Windows) passes
Affected core subsystem(s)
http