From 5cea15e19f7a056c49eb9507a89b317d6882c879 Mon Sep 17 00:00:00 2001 From: David D Lowe Date: Fri, 25 Sep 2015 15:49:04 +0000 Subject: [PATCH 1/3] http: Reject paths containing non-ASCII characters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit http would previously accept paths with non-ASCII characters. This proved problematic, because multi-byte characters were encoded as 'binary', that is, the first byte was taken and the remaining bytes were dropped for that character. There is no sensible way to fix this without breaking backwards compatibility for paths containing U+0080 to U+00FF characters. We already reject paths with unescaped spaces with an exception. This commit does the same for paths with non-ASCII characters too. The alternative would have been to encode paths in UTF-8, but this would cause the behaviour to silently change for paths with single-byte non-ASCII characters (eg: the copyright character U+00A9 ©). I find it preferable to to add to the existing prohibition of bad paths with spaces. Bug report: https://github.com/nodejs/node/issues/2114 --- lib/_http_client.js | 10 ++++++---- test/parallel/test-http-client-unescaped-path.js | 11 +++++++++-- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/lib/_http_client.js b/lib/_http_client.js index aef05eb8fa2299..40ca318c2f2481 100644 --- a/lib/_http_client.js +++ b/lib/_http_client.js @@ -41,13 +41,15 @@ function ClientRequest(options, cb) { if (self.agent && self.agent.protocol) expectedProtocol = self.agent.protocol; - if (options.path && / /.test(options.path)) { + if (options.path && ! /^[\u0000-\u001f\u0021-\u007f]*$/.test(options.path)) { // The actual regex is more like /[^A-Za-z0-9\-._~!$&'()*+,;=/:@]/ // with an additional rule for ignoring percentage-escaped characters // but that's a) hard to capture in a regular expression that performs - // well, and b) possibly too restrictive for real-world usage. That's - // why it only scans for spaces because those are guaranteed to create - // an invalid request. + // well, and b) possibly too restrictive for real-world usage. + // + // This regex rejects paths with a space U+0020 in them, (as those are + // guaranteed to create an invalid request), as well as paths with + // non-ASCII characters, (to deprecate previous buggy behaviour). throw new TypeError('Request path contains unescaped characters.'); } else if (protocol !== expectedProtocol) { throw new Error('Protocol "' + protocol + '" not supported. ' + diff --git a/test/parallel/test-http-client-unescaped-path.js b/test/parallel/test-http-client-unescaped-path.js index 1536916ae9b49c..63a4c6d4e31b88 100644 --- a/test/parallel/test-http-client-unescaped-path.js +++ b/test/parallel/test-http-client-unescaped-path.js @@ -4,6 +4,13 @@ var assert = require('assert'); var http = require('http'); assert.throws(function() { - // Path with spaces in it should throw. http.get({ path: 'bad path' }, assert.fail); -}, /contains unescaped characters/); +}, /contains unescaped characters/, 'Paths with spaces in it should throw'); + +for (var path of ['caf\u00e9', '\u0649\u0644\u0649', '💩' /* U+1F4A9 */ ]) { + assert.throws( + function() { http.get({ path: 'caf\u00e9' }, assert.fail); }, + /contains unescaped characters/, + 'Path ' + path + ' with non-ASCII characters should throw' + ); +} From 5743cae0b457c91522ee2408cc6f4200a660a1e5 Mon Sep 17 00:00:00 2001 From: David D Lowe Date: Fri, 25 Sep 2015 16:19:32 +0000 Subject: [PATCH 2/3] http: Reject additional whitespace chars in paths --- lib/_http_client.js | 9 +++++---- test/parallel/test-http-client-unescaped-path.js | 14 +++++++++----- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/lib/_http_client.js b/lib/_http_client.js index 40ca318c2f2481..0b6185bc41e166 100644 --- a/lib/_http_client.js +++ b/lib/_http_client.js @@ -41,15 +41,16 @@ function ClientRequest(options, cb) { if (self.agent && self.agent.protocol) expectedProtocol = self.agent.protocol; - if (options.path && ! /^[\u0000-\u001f\u0021-\u007f]*$/.test(options.path)) { + if (options.path && ! /^[\x00-\x09\x0E-\x1F\x21-\x7F]*$/.test(options.path)) { // The actual regex is more like /[^A-Za-z0-9\-._~!$&'()*+,;=/:@]/ // with an additional rule for ignoring percentage-escaped characters // but that's a) hard to capture in a regular expression that performs // well, and b) possibly too restrictive for real-world usage. // - // This regex rejects paths with a space U+0020 in them, (as those are - // guaranteed to create an invalid request), as well as paths with - // non-ASCII characters, (to deprecate previous buggy behaviour). + // This regex rejects paths with whitespace (U+000A, U+000B, U+000C, U+000D + // and U+0020) as those can have special meaning in the HTTP protocol, as + // well as paths with non-ASCII characters, (to deprecate previous buggy + // behaviour). throw new TypeError('Request path contains unescaped characters.'); } else if (protocol !== expectedProtocol) { throw new Error('Protocol "' + protocol + '" not supported. ' + diff --git a/test/parallel/test-http-client-unescaped-path.js b/test/parallel/test-http-client-unescaped-path.js index 63a4c6d4e31b88..34984e8d79e96f 100644 --- a/test/parallel/test-http-client-unescaped-path.js +++ b/test/parallel/test-http-client-unescaped-path.js @@ -3,14 +3,18 @@ var common = require('../common'); var assert = require('assert'); var http = require('http'); -assert.throws(function() { - http.get({ path: 'bad path' }, assert.fail); -}, /contains unescaped characters/, 'Paths with spaces in it should throw'); +for (var path of ['bad path', 'line\nfeed', 'carriage\rreturn']) { + assert.throws( + function() { http.get({ path: path }, assert.fail); }, + /contains unescaped characters/, + 'Path "' + path + '" with whitespace should throw' + ); +} for (var path of ['caf\u00e9', '\u0649\u0644\u0649', '💩' /* U+1F4A9 */ ]) { assert.throws( - function() { http.get({ path: 'caf\u00e9' }, assert.fail); }, + function() { http.get({ path: path }, assert.fail); }, /contains unescaped characters/, - 'Path ' + path + ' with non-ASCII characters should throw' + 'Path "' + path + '" with non-ASCII characters should throw' ); } From b4d6da87281d5f12b7542b9a67942d980c57b2de Mon Sep 17 00:00:00 2001 From: David D Lowe Date: Fri, 25 Sep 2015 16:38:16 +0000 Subject: [PATCH 3/3] http: Reject tab character in path --- lib/_http_client.js | 10 +++++----- test/parallel/test-http-client-unescaped-path.js | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/lib/_http_client.js b/lib/_http_client.js index 0b6185bc41e166..644c0bb8c89e70 100644 --- a/lib/_http_client.js +++ b/lib/_http_client.js @@ -41,16 +41,16 @@ function ClientRequest(options, cb) { if (self.agent && self.agent.protocol) expectedProtocol = self.agent.protocol; - if (options.path && ! /^[\x00-\x09\x0E-\x1F\x21-\x7F]*$/.test(options.path)) { + if (options.path && ! /^[\x00-\x08\x0E-\x1F\x21-\x7F]*$/.test(options.path)) { // The actual regex is more like /[^A-Za-z0-9\-._~!$&'()*+,;=/:@]/ // with an additional rule for ignoring percentage-escaped characters // but that's a) hard to capture in a regular expression that performs // well, and b) possibly too restrictive for real-world usage. // - // This regex rejects paths with whitespace (U+000A, U+000B, U+000C, U+000D - // and U+0020) as those can have special meaning in the HTTP protocol, as - // well as paths with non-ASCII characters, (to deprecate previous buggy - // behaviour). + // This regex rejects paths with whitespace (U+0009, U+000A, U+000B, + // U+000C, U+000D and U+0020) as those can have special meaning in the HTTP + // protocol, as well as paths with non-ASCII characters, (to deprecate + // previous buggy behaviour). throw new TypeError('Request path contains unescaped characters.'); } else if (protocol !== expectedProtocol) { throw new Error('Protocol "' + protocol + '" not supported. ' + diff --git a/test/parallel/test-http-client-unescaped-path.js b/test/parallel/test-http-client-unescaped-path.js index 34984e8d79e96f..3fe5db402cf8f7 100644 --- a/test/parallel/test-http-client-unescaped-path.js +++ b/test/parallel/test-http-client-unescaped-path.js @@ -3,7 +3,7 @@ var common = require('../common'); var assert = require('assert'); var http = require('http'); -for (var path of ['bad path', 'line\nfeed', 'carriage\rreturn']) { +for (var path of ['bad path', 'line\nfeed', 'carriage\rreturn', 'tab\t']) { assert.throws( function() { http.get({ path: path }, assert.fail); }, /contains unescaped characters/,