From 841bb4c61f07a5c2bf818a3b985ea0f88257210e Mon Sep 17 00:00:00 2001 From: Timothy Gu Date: Thu, 4 May 2017 23:43:47 -0700 Subject: [PATCH] url: fix C0 control and whitespace handling PR-URL: https://github.com/nodejs/node/pull/12846 Fixes: https://github.com/nodejs/node/issues/12825 Refs: https://github.com/w3c/web-platform-tests/pull/5792 Reviewed-By: James M Snell Reviewed-By: Joyee Cheung --- src/node_url.cc | 59 +++++++++++++++++++++++++------------- src/node_url.h | 17 +++++++---- test/fixtures/url-tests.js | 32 ++++++++++++++++++++- 3 files changed, 81 insertions(+), 27 deletions(-) diff --git a/src/node_url.cc b/src/node_url.cc index 2bdc080953f294..703ff4ffd61335 100644 --- a/src/node_url.cc +++ b/src/node_url.cc @@ -133,6 +133,9 @@ enum url_error_cb_args { // https://infra.spec.whatwg.org/#ascii-tab-or-newline CHAR_TEST(8, IsASCIITabOrNewline, (ch == '\t' || ch == '\n' || ch == '\r')) +// https://infra.spec.whatwg.org/#c0-control-or-space +CHAR_TEST(8, IsC0ControlOrSpace, (ch >= '\0' && ch <= ' ')) + // https://infra.spec.whatwg.org/#ascii-digit CHAR_TEST(8, IsASCIIDigit, (ch >= '0' && ch <= '9')) @@ -1134,15 +1137,45 @@ static inline void ShortenUrlPath(struct url_data* url) { } void URL::Parse(const char* input, - const size_t len, + size_t len, enum url_parse_state state_override, struct url_data* url, + bool has_url, const struct url_data* base, bool has_base) { + const char* p = input; + const char* end = input + len; + + if (!has_url) { + for (const char* ptr = p; ptr < end; ptr++) { + if (IsC0ControlOrSpace(*ptr)) + p++; + else + break; + } + for (const char* ptr = end - 1; ptr >= p; ptr--) { + if (IsC0ControlOrSpace(*ptr)) + end--; + else + break; + } + len = end - p; + } + + std::string whitespace_stripped; + whitespace_stripped.reserve(len); + for (const char* ptr = p; ptr < end; ptr++) + if (!IsASCIITabOrNewline(*ptr)) + whitespace_stripped += *ptr; + + input = whitespace_stripped.c_str(); + len = whitespace_stripped.size(); + p = input; + end = input + len; + bool atflag = false; bool sbflag = false; bool uflag = false; - int wskip = 0; std::string buffer; url->scheme.reserve(len); @@ -1159,9 +1192,6 @@ void URL::Parse(const char* input, enum url_parse_state state = has_state_override ? state_override : kSchemeStart; - const char* p = input; - const char* end = input + len; - if (state < kSchemeStart || state > kFragment) { url->flags |= URL_FLAGS_INVALID_PARSE_STATE; return; @@ -1171,18 +1201,6 @@ void URL::Parse(const char* input, const char ch = p < end ? p[0] : kEOL; const size_t remaining = end == p ? 0 : (end - p - 1); - if (IsASCIITabOrNewline(ch)) { - if (state == kAuthority) { - // It's necessary to keep track of how much whitespace - // is being ignored when in kAuthority state because of - // how the buffer is managed. TODO: See if there's a better - // way - wskip++; - } - p++; - continue; - } - bool special = (url->flags & URL_FLAGS_SPECIAL); bool cannot_be_base; const bool special_back_slash = (special && ch == '\\'); @@ -1500,7 +1518,7 @@ void URL::Parse(const char* input, url->flags |= URL_FLAGS_FAILED; return; } - p -= buffer.size() + 1 + wskip; + p -= buffer.size() + 1; buffer.clear(); state = kHost; } else { @@ -1892,16 +1910,17 @@ static void Parse(Environment* env, HandleScope handle_scope(isolate); Context::Scope context_scope(context); + const bool has_context = context_obj->IsObject(); const bool has_base = base_obj->IsObject(); struct url_data base; struct url_data url; - if (context_obj->IsObject()) + if (has_context) HarvestContext(env, &url, context_obj.As()); if (has_base) HarvestBase(env, &base, base_obj.As()); - URL::Parse(input, len, state_override, &url, &base, has_base); + URL::Parse(input, len, state_override, &url, has_context, &base, has_base); if ((url.flags & URL_FLAGS_INVALID_PARSE_STATE) || ((state_override != kUnknownState) && (url.flags & URL_FLAGS_TERMINATED))) diff --git a/src/node_url.h b/src/node_url.h index 49bfb264e8d987..72ac366ec1386b 100644 --- a/src/node_url.h +++ b/src/node_url.h @@ -81,30 +81,35 @@ struct url_data { class URL { public: static void Parse(const char* input, - const size_t len, + size_t len, enum url_parse_state state_override, struct url_data* url, + bool has_url, const struct url_data* base, bool has_base); URL(const char* input, const size_t len) { - Parse(input, len, kUnknownState, &context_, nullptr, false); + Parse(input, len, kUnknownState, &context_, false, nullptr, false); } URL(const char* input, const size_t len, const URL* base) { if (base != nullptr) - Parse(input, len, kUnknownState, &context_, &(base->context_), true); + Parse(input, len, kUnknownState, + &context_, false, + &(base->context_), true); else - Parse(input, len, kUnknownState, &context_, nullptr, false); + Parse(input, len, kUnknownState, &context_, false, nullptr, false); } URL(const char* input, const size_t len, const char* base, const size_t baselen) { if (base != nullptr && baselen > 0) { URL _base(base, baselen); - Parse(input, len, kUnknownState, &context_, &(_base.context_), true); + Parse(input, len, kUnknownState, + &context_, false, + &(_base.context_), true); } else { - Parse(input, len, kUnknownState, &context_, nullptr, false); + Parse(input, len, kUnknownState, &context_, false, nullptr, false); } } diff --git a/test/fixtures/url-tests.js b/test/fixtures/url-tests.js index befe325f931c6c..f75ea129906002 100644 --- a/test/fixtures/url-tests.js +++ b/test/fixtures/url-tests.js @@ -1,7 +1,7 @@ 'use strict'; /* WPT Refs: - https://github.com/w3c/web-platform-tests/blob/28541bb/url/urltestdata.json + https://github.com/w3c/web-platform-tests/blob/0f26c418a5/url/urltestdata.json License: http://www.w3.org/Consortium/Legal/2008/04-testsuite-copyright.html */ module.exports = @@ -3566,6 +3566,22 @@ module.exports = "search": "", "hash": "" }, + "Leading and trailing C0 control or space", + { + "input": "\u0000\u001b\u0004\u0012 http://example.com/\u001f \u000d ", + "base": "about:blank", + "href": "http://example.com/", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, "Ideographic full stop (full-width period for Chinese, etc.) should be treated as a dot. U+3002 is mapped to U+002E (dot)", { "input": "http://www.foo怂bar.com", @@ -5487,6 +5503,20 @@ module.exports = "search": "", "hash": "" }, + { + "input": "C|\n/", + "base": "file://host/dir/file", + "href": "file:///C:/", + "protocol": "file:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/C:/", + "search": "", + "hash": "" + }, { "input": "C|\\", "base": "file://host/dir/file",